summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2023-02-24 14:34:34 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2023-02-24 14:34:34 +0000
commit946b54554e13d6a97940df936123855e0a305abc (patch)
tree80a778fbd7bb3c7858cfac572df1cb08cfa4f988
parentInitial commit. (diff)
downloadmdadm-946b54554e13d6a97940df936123855e0a305abc.tar.xz
mdadm-946b54554e13d6a97940df936123855e0a305abc.zip
Adding upstream version 4.2.upstream/4.2
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
-rw-r--r--.gitignore18
-rw-r--r--ANNOUNCE-3.098
-rw-r--r--ANNOUNCE-3.0.122
-rw-r--r--ANNOUNCE-3.0.221
-rw-r--r--ANNOUNCE-3.0.329
-rw-r--r--ANNOUNCE-3.133
-rw-r--r--ANNOUNCE-3.1.139
-rw-r--r--ANNOUNCE-3.1.246
-rw-r--r--ANNOUNCE-3.1.346
-rw-r--r--ANNOUNCE-3.1.437
-rw-r--r--ANNOUNCE-3.1.542
-rw-r--r--ANNOUNCE-3.277
-rw-r--r--ANNOUNCE-3.2.175
-rw-r--r--ANNOUNCE-3.2.236
-rw-r--r--ANNOUNCE-3.2.324
-rw-r--r--ANNOUNCE-3.2.4144
-rw-r--r--ANNOUNCE-3.2.531
-rw-r--r--ANNOUNCE-3.2.657
-rw-r--r--ANNOUNCE-3.363
-rw-r--r--ANNOUNCE-3.3.123
-rw-r--r--ANNOUNCE-3.3.216
-rw-r--r--ANNOUNCE-3.3.318
-rw-r--r--ANNOUNCE-3.3.437
-rw-r--r--ANNOUNCE-3.424
-rw-r--r--ANNOUNCE-4.022
-rw-r--r--ANNOUNCE-4.116
-rw-r--r--ANNOUNCE-4.219
-rw-r--r--Assemble.c2227
-rw-r--r--Build.c227
-rw-r--r--COPYING339
-rw-r--r--ChangeLog306
-rw-r--r--Create.c1118
-rw-r--r--Detail.c879
-rw-r--r--Dump.c319
-rw-r--r--Examine.c228
-rw-r--r--Grow.c5229
-rw-r--r--INSTALL13
-rw-r--r--Incremental.c1764
-rw-r--r--Kill.c147
-rw-r--r--Makefile332
-rw-r--r--Manage.c1767
-rw-r--r--Monitor.c1275
-rw-r--r--Query.c140
-rw-r--r--README.initramfs122
-rw-r--r--ReadMe.c656
-rw-r--r--TODO213
-rw-r--r--bitmap.c534
-rw-r--r--bitmap.h291
-rw-r--r--clustermd_tests/00r10_Create50
-rw-r--r--clustermd_tests/00r1_Create50
-rw-r--r--clustermd_tests/01r10_Grow_bitmap-switch51
-rw-r--r--clustermd_tests/01r10_Grow_resize38
-rw-r--r--clustermd_tests/01r1_Grow_add68
-rw-r--r--clustermd_tests/01r1_Grow_bitmap-switch51
-rw-r--r--clustermd_tests/01r1_Grow_resize23
-rw-r--r--clustermd_tests/02r10_Manage_add33
-rw-r--r--clustermd_tests/02r10_Manage_add-spare30
-rw-r--r--clustermd_tests/02r10_Manage_re-add18
-rw-r--r--clustermd_tests/02r1_Manage_add33
-rw-r--r--clustermd_tests/02r1_Manage_add-spare30
-rw-r--r--clustermd_tests/02r1_Manage_re-add16
-rw-r--r--clustermd_tests/03r10_switch-recovery21
-rw-r--r--clustermd_tests/03r10_switch-resync18
-rw-r--r--clustermd_tests/03r1_switch-recovery21
-rw-r--r--clustermd_tests/03r1_switch-resync18
-rw-r--r--clustermd_tests/cluster_conf43
-rw-r--r--clustermd_tests/func.sh332
-rw-r--r--config.c1235
-rw-r--r--coverity-gcc-hack.h10
-rw-r--r--crc32.c360
-rw-r--r--crc32.h441
-rw-r--r--crc32c.c104
-rw-r--r--dlink.c74
-rw-r--r--dlink.h25
-rw-r--r--external-reshape-design.txt280
-rwxr-xr-xinventory284
-rw-r--r--lib.c575
-rwxr-xr-xmakedist96
-rw-r--r--managemon.c943
-rw-r--r--mapfile.c511
-rw-r--r--maps.c185
-rw-r--r--md.41317
-rw-r--r--md5.h136
-rw-r--r--md_p.h295
-rw-r--r--md_u.h115
-rw-r--r--mdadm.8.in3452
-rw-r--r--mdadm.c2078
-rw-r--r--mdadm.conf-example65
-rw-r--r--mdadm.conf.5706
-rw-r--r--mdadm.h1887
-rw-r--r--mdadm.spec47
-rw-r--r--mdmon-design.txt146
-rw-r--r--mdmon.8257
-rw-r--r--mdmon.c594
-rw-r--r--mdmon.h111
-rw-r--r--mdopen.c509
-rw-r--r--mdstat.c441
-rw-r--r--misc/mdcheck166
-rw-r--r--misc/syslog-events27
-rw-r--r--mkinitramfs55
-rw-r--r--monitor.c909
-rw-r--r--msg.c475
-rw-r--r--msg.h37
-rw-r--r--part.h79
-rw-r--r--platform-intel.c969
-rw-r--r--platform-intel.h259
-rw-r--r--policy.c931
-rw-r--r--probe_roms.c331
-rw-r--r--probe_roms.h24
-rw-r--r--pwgr.c17
-rw-r--r--raid5extend.c80
-rw-r--r--raid6check.896
-rw-r--r--raid6check.c714
-rw-r--r--restripe.c1038
-rw-r--r--sg_io.c60
-rw-r--r--sha1.c415
-rw-r--r--sha1.h136
-rw-r--r--super-ddf.c5244
-rw-r--r--super-gpt.c220
-rw-r--r--super-intel.c12894
-rw-r--r--super-mbr.c206
-rw-r--r--super0.c1350
-rw-r--r--super1.c2980
-rw-r--r--swap_super.c81
-rw-r--r--sysfs.c1167
-rw-r--r--systemd/SUSE-mdadm_env.sh48
-rw-r--r--systemd/mdadm-grow-continue@.service17
-rw-r--r--systemd/mdadm-last-resort@.service8
-rw-r--r--systemd/mdadm-last-resort@.timer7
-rw-r--r--systemd/mdadm.shutdown4
-rw-r--r--systemd/mdcheck_continue.service17
-rw-r--r--systemd/mdcheck_continue.timer15
-rw-r--r--systemd/mdcheck_start.service17
-rw-r--r--systemd/mdcheck_start.timer16
-rw-r--r--systemd/mdmon@.service28
-rw-r--r--systemd/mdmonitor-oneshot.service15
-rw-r--r--systemd/mdmonitor-oneshot.timer15
-rw-r--r--systemd/mdmonitor.service16
-rwxr-xr-xtest283
-rw-r--r--tests/00linear25
-rw-r--r--tests/00multipath29
-rw-r--r--tests/00names13
-rw-r--r--tests/00raid043
-rw-r--r--tests/00raid138
-rw-r--r--tests/00raid1018
-rw-r--r--tests/00raid416
-rw-r--r--tests/00raid533
-rw-r--r--tests/00raid616
-rw-r--r--tests/00readonly22
-rw-r--r--tests/01r1fail29
-rw-r--r--tests/01r5fail27
-rw-r--r--tests/01r5integ33
-rw-r--r--tests/01raid6integ57
-rw-r--r--tests/01replace52
-rw-r--r--tests/02lineargrow23
-rw-r--r--tests/02r1add40
-rw-r--r--tests/02r1grow36
-rw-r--r--tests/02r5grow53
-rw-r--r--tests/02r6grow36
-rw-r--r--tests/03assem-incr17
-rw-r--r--tests/03r0assem137
-rw-r--r--tests/03r5assem109
-rw-r--r--tests/03r5assem-failed12
-rw-r--r--tests/03r5assemV1128
-rw-r--r--tests/04r0update20
-rw-r--r--tests/04r1update15
-rw-r--r--tests/04r5swap18
-rw-r--r--tests/04update-metadata48
-rw-r--r--tests/04update-uuid82
-rw-r--r--tests/05r1-add-internalbitmap20
-rw-r--r--tests/05r1-add-internalbitmap-v1a20
-rw-r--r--tests/05r1-add-internalbitmap-v1b20
-rw-r--r--tests/05r1-add-internalbitmap-v1c20
-rw-r--r--tests/05r1-bitmapfile49
-rw-r--r--tests/05r1-failfast74
-rw-r--r--tests/05r1-grow-external33
-rw-r--r--tests/05r1-grow-internal31
-rw-r--r--tests/05r1-grow-internal-131
-rw-r--r--tests/05r1-internalbitmap47
-rw-r--r--tests/05r1-internalbitmap-v1a48
-rw-r--r--tests/05r1-internalbitmap-v1b49
-rw-r--r--tests/05r1-internalbitmap-v1c48
-rw-r--r--tests/05r1-n3-bitmapfile53
-rw-r--r--tests/05r1-re-add39
-rw-r--r--tests/05r1-re-add-nosuper38
-rw-r--r--tests/05r1-remove-internalbitmap18
-rw-r--r--tests/05r1-remove-internalbitmap-v1a18
-rw-r--r--tests/05r1-remove-internalbitmap-v1b18
-rw-r--r--tests/05r1-remove-internalbitmap-v1c18
-rw-r--r--tests/05r5-bitmapfile49
-rw-r--r--tests/05r5-internalbitmap47
-rw-r--r--tests/05r6-bitmapfile49
-rw-r--r--tests/05r6tor027
-rw-r--r--tests/06name12
-rw-r--r--tests/06sysfs11
-rw-r--r--tests/06wrmostly13
-rw-r--r--tests/07autoassemble24
-rw-r--r--tests/07autodetect34
-rw-r--r--tests/07changelevelintr61
-rw-r--r--tests/07changelevels114
-rw-r--r--tests/07layouts91
-rw-r--r--tests/07reshape5intr41
-rw-r--r--tests/07revert-grow52
-rw-r--r--tests/07revert-inplace44
-rw-r--r--tests/07revert-shrink56
-rw-r--r--tests/07testreshape545
-rw-r--r--tests/09imsm-assemble73
-rw-r--r--tests/09imsm-create-fail-rebuild78
-rw-r--r--tests/09imsm-overlap28
-rw-r--r--tests/10ddf-assemble-missing61
-rw-r--r--tests/10ddf-create89
-rw-r--r--tests/10ddf-create-fail-rebuild77
-rw-r--r--tests/10ddf-fail-create-race66
-rw-r--r--tests/10ddf-fail-readd55
-rw-r--r--tests/10ddf-fail-readd-readonly71
-rw-r--r--tests/10ddf-fail-spare86
-rw-r--r--tests/10ddf-fail-stop-readd66
-rw-r--r--tests/10ddf-fail-twice59
-rw-r--r--tests/10ddf-fail-two-spares86
-rw-r--r--tests/10ddf-geometry82
-rw-r--r--tests/10ddf-incremental-wrong-order131
-rw-r--r--tests/10ddf-sudden-degraded18
-rw-r--r--tests/11spare-migration454
-rw-r--r--tests/12imsm-r0_2d-grow-r0_3d20
-rw-r--r--tests/12imsm-r0_2d-grow-r0_4d20
-rw-r--r--tests/12imsm-r0_2d-grow-r0_5d20
-rw-r--r--tests/12imsm-r0_3d-grow-r0_4d20
-rw-r--r--tests/12imsm-r5_3d-grow-r5_4d20
-rw-r--r--tests/12imsm-r5_3d-grow-r5_5d20
-rw-r--r--tests/13imsm-r0_r0_2d-grow-r0_r0_4d29
-rw-r--r--tests/13imsm-r0_r0_2d-grow-r0_r0_5d29
-rw-r--r--tests/13imsm-r0_r0_3d-grow-r0_r0_4d29
-rw-r--r--tests/13imsm-r0_r5_3d-grow-r0_r5_4d29
-rw-r--r--tests/13imsm-r0_r5_3d-grow-r0_r5_5d29
-rw-r--r--tests/13imsm-r5_r0_3d-grow-r5_r0_4d29
-rw-r--r--tests/13imsm-r5_r0_3d-grow-r5_r0_5d29
-rw-r--r--tests/14imsm-r0_3d-r5_3d-migrate-r5_4d-r5_4d29
-rw-r--r--tests/14imsm-r0_3d_no_spares-migrate-r5_3d21
-rw-r--r--tests/14imsm-r0_r0_2d-takeover-r10_4d30
-rw-r--r--tests/14imsm-r10_4d-grow-r10_5d20
-rw-r--r--tests/14imsm-r10_r5_4d-takeover-r0_2d30
-rw-r--r--tests/14imsm-r1_2d-grow-r1_3d19
-rw-r--r--tests/14imsm-r1_2d-takeover-r0_2d21
-rw-r--r--tests/14imsm-r5_3d-grow-r5_5d-no-spares20
-rw-r--r--tests/14imsm-r5_3d-migrate-r4_3d21
-rw-r--r--tests/15imsm-r0_3d_64k-migrate-r0_3d_256k21
-rw-r--r--tests/15imsm-r5_3d_4k-migrate-r5_3d_256k21
-rw-r--r--tests/15imsm-r5_3d_64k-migrate-r5_3d_256k21
-rw-r--r--tests/15imsm-r5_6d_4k-migrate-r5_6d_256k21
-rw-r--r--tests/15imsm-r5_r0_3d_64k-migrate-r5_r0_3d_256k34
-rw-r--r--tests/16imsm-r0_3d-migrate-r5_4d22
-rw-r--r--tests/16imsm-r0_5d-migrate-r5_6d22
-rw-r--r--tests/16imsm-r5_3d-migrate-r0_3d21
-rw-r--r--tests/16imsm-r5_5d-migrate-r0_5d21
-rw-r--r--tests/18imsm-1d-takeover-r0_1d22
-rw-r--r--tests/18imsm-1d-takeover-r1_2d20
-rw-r--r--tests/18imsm-r0_2d-takeover-r10_4d22
-rw-r--r--tests/18imsm-r10_4d-takeover-r0_2d22
-rw-r--r--tests/18imsm-r1_2d-takeover-r0_1d21
-rw-r--r--tests/19raid6auto-repair49
-rw-r--r--tests/19raid6check27
-rw-r--r--tests/19raid6repair56
-rw-r--r--tests/19repair-does-not-destroy28
-rw-r--r--tests/20raid5journal64
-rw-r--r--tests/21raid5cache87
-rw-r--r--tests/ToTest44
-rw-r--r--tests/env-ddf-template113
-rw-r--r--tests/env-imsm-template91
-rw-r--r--tests/func.sh344
-rw-r--r--tests/imsm-grow-template119
-rw-r--r--tests/utils191
-rw-r--r--udev-md-clustered-confirm-device.rules21
-rw-r--r--udev-md-raid-arrays.rules44
-rw-r--r--udev-md-raid-assembly.rules38
-rw-r--r--udev-md-raid-creating.rules7
-rw-r--r--udev-md-raid-safe-timeouts.rules61
-rw-r--r--util.c2378
-rw-r--r--uuid.c112
-rw-r--r--xmalloc.c84
279 files changed, 77998 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..217fe76
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,18 @@
+/*.o
+/*.man
+/*-stamp
+/mdadm
+/mdadm.8
+/mdadm.udeb
+/mdassemble
+/mdmon
+/swap_super
+/test_stripe
+/TAGS
+/mdadm.O2
+/mdadm.Os
+/mdadm.static
+/mdassemble.auto
+/mdassemble.static
+/mdmon.O2
+/raid6check
diff --git a/ANNOUNCE-3.0 b/ANNOUNCE-3.0
new file mode 100644
index 0000000..f2d4f84
--- /dev/null
+++ b/ANNOUNCE-3.0
@@ -0,0 +1,98 @@
+Subject: ANNOUNCE: mdadm 3.0 - A tool for managing Soft RAID under Linux
+
+I am pleased to (finally) announce the availability of
+ mdadm version 3.0
+
+It is available at the usual places:
+ countrycode=xx.
+ http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://neil.brown.name/mdadm
+ http://neil.brown.name/git?p=mdadm
+
+
+This is a major new version and as such should be treated with some
+caution. However it has seen substantial testing and is considered
+to be ready for wide use.
+
+
+The significant change which justifies the new major version number is
+that mdadm can now handle metadata updates entirely in userspace.
+This allows mdadm to support metadata formats that the kernel knows
+nothing about.
+
+Currently two such metadata formats are supported:
+ - DDF - The SNIA standard format
+ - Intel Matrix - The metadata used by recent Intel ICH controllers.
+
+Also the approach to device names has changed significantly.
+
+If udev is installed on the system, mdadm will not create any devices
+in /dev. Rather it allows udev to manage those devices. For this to work
+as expected, the included udev rules file should be installed.
+
+If udev is not installed, mdadm will still create devices and symlinks
+as required, and will also remove them when the array is stopped.
+
+mdadm now requires all devices which do not have a standard name (mdX
+or md_dX) to live in the directory /dev/md/. Names in this directory
+will always be created as symlinks back to the standard name in /dev.
+
+The man pages contain some information about the new externally managed
+metadata. However see below for a more condensed overview.
+
+Externally managed metadata introduces the concept of a 'container'.
+A container is a collection of (normally) physical devices which have
+a common set of metadata. A container is assembled as an md array, but
+is left 'inactive'.
+
+A container can contain one or more data arrays. These are composed from
+slices (partitions?) of various devices in the container.
+
+For example, a 5 device DDF set can contain a RAID1 using the first
+half of two devices, a RAID0 using the first half of the remaining 3 devices,
+and a RAID5 over the second half of all 5 devices.
+
+A container can be created with
+
+ mdadm --create /dev/md0 -e ddf -n5 /dev/sd[abcde]
+
+or "-e imsm" to use the Intel Matrix Storage Manager.
+
+An array can be created within a container either by giving the
+container name as the only member:
+
+ mdadm -C /dev/md1 --level raid1 -n 2 /dev/md0
+
+or by listing the component devices
+
+ mdadm -C /dev/md2 --level raid0 -n 3 /dev/sd[cde]
+
+To assemble a container, it is easiest just to pass each device in turn to
+mdadm -I
+
+ for i in /dev/sd[abcde]
+ do mdadm -I $i
+ done
+
+This will assemble the container and the components.
+
+Alternately the container can be assembled explicitly
+
+ mdadm -A /dev/md0 /dev/sd[abcde]
+
+Then the components can all be assembled with
+
+ mdadm -I /dev/md0
+
+For each container, mdadm will start a program called "mdmon" which will
+monitor the array and effect any metadata updates needed. The array is
+initially assembled readonly. It is up to "mdmon" to mark the metadata
+as 'dirty' and switch the array to 'read-write'.
+
+The version 0.90 and 1.x metadata formats supported by previous
+versions for mdadm are still supported and the kernel still performs
+the same updates it used to. The new 'mdmon' approach is only used for
+newly introduced metadata types.
+
+NeilBrown 2nd June 2009
diff --git a/ANNOUNCE-3.0.1 b/ANNOUNCE-3.0.1
new file mode 100644
index 0000000..91b4428
--- /dev/null
+++ b/ANNOUNCE-3.0.1
@@ -0,0 +1,22 @@
+Subject: ANNOUNCE: mdadm 3.0.1 - A tool for managing Soft RAID under Linux
+
+I am pleased to announce the availability of
+ mdadm version 3.0.1
+
+It is available at the usual places:
+ countrycode=xx.
+ http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://neil.brown.name/mdadm
+ http://neil.brown.name/git?p=mdadm
+
+
+This contains only minor bug fixes over 3.0. If you are using
+3.0, you could consider upgrading.
+
+The brief change log is:
+ - Fix various segfaults
+ - Fixes for --examine with containers
+ - Lots of other little fixes.
+
+NeilBrown 25th September 2009
diff --git a/ANNOUNCE-3.0.2 b/ANNOUNCE-3.0.2
new file mode 100644
index 0000000..93643d1
--- /dev/null
+++ b/ANNOUNCE-3.0.2
@@ -0,0 +1,21 @@
+Subject: ANNOUNCE: mdadm 3.0.2 - A tool for managing Soft RAID under Linux
+
+I am pleased to announce the availability of
+ mdadm version 3.0.2
+
+It is available at the usual places:
+ countrycode=xx.
+ http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://neil.brown.name/mdadm
+ http://neil.brown.name/git?p=mdadm
+
+
+This just contains one bugfix over 3.0.1 - I was obviously a bit hasty
+in releasing that one.
+
+The brief change log is:
+ - Fix crash when homehost is not set, as often happens in
+ early boot.
+
+NeilBrown 25th September 2009
diff --git a/ANNOUNCE-3.0.3 b/ANNOUNCE-3.0.3
new file mode 100644
index 0000000..d6117a1
--- /dev/null
+++ b/ANNOUNCE-3.0.3
@@ -0,0 +1,29 @@
+Subject: ANNOUNCE: mdadm 3.0.3 - A tool for managing Soft RAID under Linux
+
+I am pleased to announce the availability of
+ mdadm version 3.0.3
+
+It is available at the usual places:
+ countrycode=xx.
+ http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://neil.brown.name/mdadm
+ http://neil.brown.name/git?p=mdadm
+
+
+This contains a collection of bug fixes and minor enhancements over
+3.0.1.
+
+The brief change log is:
+ - Improvements for creating arrays giving just a name, like 'foo',
+ rather than the full '/dev/md/foo'.
+ - Improvements for assembling member arrays of containers.
+ - Improvements to test suite
+ - Add option to change increment for RebuildNN messages reported
+ by "mdadm --monitor"
+ - Improvements to mdmon 'hand-over' from initrd to final root.
+ - Handle merging of devices that have left an IMSM array and are
+ being re-incorporated.
+ - Add missing space in "--detail --brief" output.
+
+NeilBrown 22nd October 2009
diff --git a/ANNOUNCE-3.1 b/ANNOUNCE-3.1
new file mode 100644
index 0000000..343b85d
--- /dev/null
+++ b/ANNOUNCE-3.1
@@ -0,0 +1,33 @@
+Subject: ANNOUNCE: mdadm 3.1 - A tool for managing Soft RAID under Linux
+
+Hot on the heels of 3.0.3 I am pleased to announce the availability of
+ mdadm version 3.1
+
+It is available at the usual places:
+ countrycode=xx.
+ http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://neil.brown.name/mdadm
+ http://neil.brown.name/git?p=mdadm
+
+
+It contains significant feature enhancements over 3.0.x
+
+The brief change log is:
+ - Support --grow to change the layout of RAID4/5/6
+ - Support --grow to change the chunksize of raid 4/5/6
+ - Support --grow to change level from RAID1 -> RAID5 -> RAID6 and
+ back.
+ - Support --grow to reduce the number of devices in RAID4/5/6.
+ - Support restart of these grow options when assembling an array
+ which is partially grown.
+ - Assorted tests of this code, and of different RAID6 layouts.
+
+Note that a 2.6.31 or later is needed to have access to these.
+Reducing devices in a RAID4/5/6 requires 2.6.32.
+Changing RAID5 to RAID1 requires 2.6.33.
+
+You should only upgrade if you need to use, or wish to test, these
+features.
+
+NeilBrown 22nd October 2009
diff --git a/ANNOUNCE-3.1.1 b/ANNOUNCE-3.1.1
new file mode 100644
index 0000000..9e480dc
--- /dev/null
+++ b/ANNOUNCE-3.1.1
@@ -0,0 +1,39 @@
+Subject: ANNOUNCE: mdadm 3.1.1 - A tool for managing Soft RAID under Linux
+
+I am pleased to announce the availability of
+ mdadm version 3.1.1
+
+It is available at the usual places:
+ countrycode=xx.
+ http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://neil.brown.name/mdadm
+ http://neil.brown.name/git?p=mdadm
+
+This is a bugfix release over 3.1, which was withdrawn due to serious
+bugs. So it might be best to ignore 3.1 and say that this is a significant
+feature release over 3.0.x
+
+Significant changes are:
+ - RAID level conversion between RAID1, RAID5, and RAID6 are
+ possible where the kernel supports it (2.6.32 at least)
+ - online chunksize and layout changing for RAID5 and RAID6
+ where the kernel supports it.
+ - reduce the number of devices in a RAID4/5/6 array.
+
+ - The default metadata is now v1.1. This metadata is stored at the
+ start of the device so is safer in many ways but could interfere with
+ boot loaders. The old default (0.90) is still available and fully
+ supported.
+
+ - The default chunksize is now 512K rather than 64K. This seems more
+ appropriate for modern devices.
+
+ - The default bitmap chunksize for internal bitmaps is now at least
+ 64Meg as fine grained bitmaps tend to impact performance more for
+ little extra gain.
+
+This release is believed to be stable and you should feel free to
+upgrade to 3.1.1.
+
+NeilBrown 19th November 2009
diff --git a/ANNOUNCE-3.1.2 b/ANNOUNCE-3.1.2
new file mode 100644
index 0000000..321b8be
--- /dev/null
+++ b/ANNOUNCE-3.1.2
@@ -0,0 +1,46 @@
+Subject: ANNOUNCE: mdadm 3.1.2 - A tool for managing Soft RAID under Linux
+
+I am pleased to announce the availability of
+ mdadm version 3.1.2
+
+It is available at the usual places:
+ countrycode=xx.
+ http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://neil.brown.name/mdadm
+ http://neil.brown.name/git?p=mdadm
+
+This is a bugfix/stability release over 3.1.1.
+
+Significant changes are:
+ - The default metadata has changed again (sorry about that).
+ It is now v1.2 and will hopefully stay that way. It turned
+ out there were boot-block issues with v1.1 which make it
+ unsuitable for a default, though in many cases it is still
+ suitable to use.
+ - Stopping a container is not permitted when members are still
+ active
+ - Add 'homehost' to the valid words for the "AUTO" config file
+ line. When followed by "-all", this causes mdadm to
+ auto-assemble any array belonging to this host, but not
+ auto-assemble anything else.
+ - Fix some bugs with "--grow --chunksize=" for changing chunksize.
+ - VAR_RUN can be easily changed at compile time just like ALT_RUN.
+ This gives distros more flexibility in how to manage the
+ pid and sock files that mdmon needs.
+ - Various mdmon fixes
+ - Always make bitmap 4K-aligned if at all possible.
+ - If mdadm.conf lists arrays which have inter-dependencies,
+ they previously had to be listed in the "right" order. Now
+ any order should work.
+ - Fix --force assembly of v1.x arrays which are in the process
+ of recovering.
+ - Add section on 'scrubbing' to 'md' man page.
+ - Various command-line-option parsing improvements.
+ - ... and lots of other bug fixes.
+
+
+This release is believed to be stable and you should feel free to
+upgrade to 3.1.2
+
+NeilBrown 10th March 2010
diff --git a/ANNOUNCE-3.1.3 b/ANNOUNCE-3.1.3
new file mode 100644
index 0000000..95b2b6c
--- /dev/null
+++ b/ANNOUNCE-3.1.3
@@ -0,0 +1,46 @@
+Subject: ANNOUNCE: mdadm 3.1.3 - A tool for managing Soft RAID under Linux
+
+I am pleased to announce the availability of
+ mdadm version 3.1.3
+
+It is available at the usual places:
+ countrycode=xx.
+ http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://neil.brown.name/mdadm
+ http://neil.brown.name/git?p=mdadm
+
+This is a bugfix/stability release over 3.1.2
+
+Significant changes are:
+ - mapfile now lives in a fixed location which defaults to
+ /dev/.mdadm/map but can be changed at compile time. This
+ location is chosen as most distros provide it during early
+ boot and preserve it through. As long as /dev exists and is
+ writable, /dev/.mdadm will be created.
+ Other files for communication with mdmon live here too.
+ This fixes a bug reported by Debian and Gentoo users where
+ udev would spin in early-boot.
+ - IMSM and DDF metadata will not be recognised on partitions
+ as they should only be used on whole-disks.
+ - Various overflows caused by 2G drives have been addressed.
+ - A subarray of an IMSM container can now be killed with
+ --kill-subarray. Also subarrays can be renamed with
+ --update-subarray
+ - -If (or --incremental --fail) can be used from udev to
+ fail and remove from all arrays a device which has been
+ unplugged from the system. i.e. hot-unplug-support.
+ - "mdadm /dev/mdX --re-add missing" will look for any device
+ that looks like it should be a member of /dev/mdX but isn't
+ and will automatically --re-add it
+ - Now compile with -Wextra to get extra warnings.
+ - Lots of minor bug fixes, documentation improvements, etc.
+
+This release is believed to be stable and you should feel free to
+upgrade to 3.1.3
+
+It is expected that the next release will be 3.2 with a number of new
+features. 3.1.4 will only happen if important bugs show up before 3.2
+is stable.
+
+NeilBrown 6th August 2010
diff --git a/ANNOUNCE-3.1.4 b/ANNOUNCE-3.1.4
new file mode 100644
index 0000000..c157a36
--- /dev/null
+++ b/ANNOUNCE-3.1.4
@@ -0,0 +1,37 @@
+Subject: ANNOUNCE: mdadm 3.1.4 - A tool for managing Soft RAID under Linux
+
+I am pleased to announce the availability of
+ mdadm version 3.1.4
+
+It is available at the usual places:
+ countrycode=xx.
+ http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://neil.brown.name/mdadm
+ http://neil.brown.name/git?p=mdadm
+
+This is a bugfix/stability release over 3.1.3.
+3.1.3 had a couple of embarrassing regressions and a couple of other
+issues surfaced which had easy fixes so I decided to make a 3.1.4
+release after all.
+
+Two fixes related to configs that aren't using udev:
+ - Don't remove md devices with 'standard' names on --stop
+ - Allow dev_open to work on read-only /dev
+And fixed regressions:
+ - Allow --incremental to add spares to an array
+ - Accept --no-degraded as a deprecated option rather than
+ throwing an error
+ - Return correct success status when --incremental assembling
+ a container which does not yet have enough devices.
+ - Don't link mdadm with pthreads, only mdmon needs it.
+ - Fix compiler warning due to bad use of snprintf
+ - Fix spare migration
+
+This release is believed to be stable and you should feel free to
+upgrade to 3.1.4
+
+It is expected that the next release will be 3.2 with a number of new
+features.
+
+NeilBrown 31st August 2010
diff --git a/ANNOUNCE-3.1.5 b/ANNOUNCE-3.1.5
new file mode 100644
index 0000000..baa1f92
--- /dev/null
+++ b/ANNOUNCE-3.1.5
@@ -0,0 +1,42 @@
+Subject: ANNOUNCE: mdadm 3.1.5 - A tool for managing Soft RAID under Linux
+
+I am pleased to announce the availability of
+ mdadm version 3.1.5
+
+It is available at the usual places:
+ countrycode=xx.
+ http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://neil.brown.name/mdadm
+ http://neil.brown.name/git?p=mdadm
+
+This is a bugfix/stability release over 3.1.4. It contains all the
+important bugfixes found while working on 3.2 and 3.2.1. It will be
+the last 3.1.x release - 3.2.1 is expected to be released in a few days.
+
+Changes include:
+ - Fixes for v1.x metadata on big-endian machines.
+ - man page improvements
+ - Improve '--detail --export' when run on partitions of an md array.
+ - Fix regression with removing 'failed' or 'detached' devices.
+ - Fixes for "--assemble --force" in various unusual cases.
+ - Allow '-Y' to mean --export. This was documented but not implemented.
+ - Various fixes for handling 'ddf' metadata. This is now more reliable
+ but could benefit from more interoperability testing.
+ - Correctly list subarrays of a container in "--detail" output.
+ - Improve checks on whether the requested number of devices is supported
+ by the metadata - both for --create and --grow.
+ - Don't remove partitions from a device that is being included in an
+ array until we are fully committed to including it.
+ - Allow "--assemble --update=no-bitmap" so an array with a corrupt
+ bitmap can still be assembled.
+ - Don't allow --add to succeed if it looks like a "--re-add" is probably
+ wanted, but cannot succeed. This avoids inadvertently turning
+ devices into spares when an array is failed.
+
+This release is believed to be stable and you should feel free to
+upgrade to 3.1.5
+
+
+NeilBrown 23rd March 2011
+
diff --git a/ANNOUNCE-3.2 b/ANNOUNCE-3.2
new file mode 100644
index 0000000..9e282bc
--- /dev/null
+++ b/ANNOUNCE-3.2
@@ -0,0 +1,77 @@
+Subject: ANNOUNCE: mdadm 3.2 - A tool for managing Soft RAID under Linux (DEVEL ONLY)
+
+I am pleased to announce the availability of
+ mdadm version 3.2
+
+It is available at the usual places:
+ countrycode=xx.
+ http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://neil.brown.name/mdadm devel-3.2
+ http://neil.brown.name/git?p=mdadm
+
+This is a "Developers only" release. Please don't consider using it
+or making it available to others without reading the following.
+
+
+By far the most significant change in this release relates to the
+management of reshaping arrays. This code has been substantially
+re-written so that it can work with 'externally managed metadata' -
+Intel's IMSM in particular. We now support level migration and
+OnLine Capacity Expansion on these arrays.
+
+However, while the code largely works it has not been tested
+exhaustively so there are likely to be problems. As the reshape code
+for native metadata arrays was changed as part of this rewrite these
+problems could also result in regressions for reshape of native
+metadata.
+
+It is partly to encourage greater testing that this release is being
+made. Any reports of problems - particularly reproducible recipes for
+triggering the problems - will be gratefully received.
+
+It is hoped that a "3.2.1" release will be available in early March
+which will be a bugfix release over this and can be considered
+suitable for general use.
+
+Other changes of note:
+
+ - Policy framework.
+ Various policy statements can be made in the mdadm.conf to guide
+ the behaviour of mdadm, particularly with regard to how new devices
+ are treated by "mdadm -I".
+ Depending on the 'action' associated with a device (identified by
+ its 'path') such new devices can be automatically re-added to an
+ existing array that they previously fell out of, or automatically
+ added as a spare if they appear to contain no data.
+
+ - mdadm now has a limited understanding of partition tables. This
+ allows the policy framework to make decisions about partitioned
+ devices as well.
+
+ - --incremental --remove can be told what --path the device was on,
+ and this info will be recorded so that another device appearing at
+ the same physical location can be preferentially added to the same
+ array (provided the spare-same-slot action policy applies to the
+ path).
+
+ - A new "--invalid-backup" flag is available in --assemble
+ mode. This can be used to re-assemble an array which was stopped
+ in the middle of a reshape, and for which the 'backup file' is no
+ longer available or is corrupted. The array may have some
+ corruption in it at the point where reshape was up to, but at least
+ the rest of the array will become available.
+
+
+ - Various internal restructuring - more is needed.
+
+
+Any feed back and bug reports are always welcomed at:
+ linux-raid@vger.kernel.org
+
+And please: don't use this in production - particularly not the
+--grow functionality.
+
+NeilBrown 1st February 2011
+
+
diff --git a/ANNOUNCE-3.2.1 b/ANNOUNCE-3.2.1
new file mode 100644
index 0000000..0e7826c
--- /dev/null
+++ b/ANNOUNCE-3.2.1
@@ -0,0 +1,75 @@
+
+
+I am pleased to announce the availability of
+ mdadm version 3.2.1
+
+It is available at the usual places:
+ countrycode=xx.
+ http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://neil.brown.name/mdadm
+ http://neil.brown.name/git/mdadm
+
+Many of the changes in this release are of internal interest only,
+restructuring and refactoring code and so forth.
+
+Most of the bugs found and fixed during development for 3.2.1 have been
+back-ported for the recently-released 3.1.5 so this release primarily
+provides a few new features over 3.1.5.
+
+They include:
+ - policy framework
+ Policy can be expressed for moving spare devices between arrays, and
+ for how to handle hot-plugged devices. This policy can be different
+ for devices plugged in to different controllers etc.
+ This, for example, allows a configuration where when a device is plugged
+ in it is immediately included in an md array as a hot spare and
+ possibly starts recovery immediately if an array is degraded.
+
+ - some understanding of mbr and gpt partition tables
+ This is primarily to support the new hot-plug support. If a
+ device is plugged in and policy suggests it should have a partition table,
+ the partition table will be copied from a suitably similar device, and
+ then the partitions will hot-plug and can then be added to md arrays.
+
+ - "--incremental --remove" can remember where a device was removed from
+ so if a device gets plugged back in the same place, special policy applies
+ to it, allowing it to be included in an array even if a general hotplug
+ will not be included.
+
+ - enhanced reshape options, including growing a RAID0 by converting to RAID4,
+ restriping, and converting back. Also conversions between RAID0 and
+ RAID10 and between RAID1 and RAID10 are possible (with a suitably recent
+ kernel).
+
+ - spare migration for IMSM arrays.
+ Spare migration can now work across 'containers' using non-native metadata
+ and specifically Intel's IMSM arrays support spare migrations.
+
+ - OLCE and level migration for Intel IMSM arrays.
+ OnLine Capacity Expansion and level migration (e.g. RAID0 -> RAID5) is
+ supported for Intel Matrix Storage Manager arrays.
+ This support is currently 'experimental' for technical reasons. It can
+ be enabled with "export MDADM_EXPERIMENTAL=1"
+
+ - avoid including wayward devices
+ If you split a RAID1, mount the two halves as two separate degraded RAID1s,
+ and then later bring the two back together, it is possible that the md
+ metadata won't properly show that one must over-ride the other.
+ mdadm now does extra checking to detect this possibility and avoid
+ potentially corrupting data.
+
+ - remove any possible confusion between similar options.
+ e.g. --brief and --bitmap were mapped to 'b' and mdadm wouldn't
+ notice if one was used where the other was expected.
+
+ - allow K,M,G suffixes on chunk sizes
+
+
+While mdadm-3.2.1 is considered to be reasonably stable, you should
+only use it if you want to try out the new features, or if you
+generally like to be on the bleeding edge. If the new features are not
+important to you, then 3.1.5 is probably the appropriate version to be using
+until 3.2.2 comes out.
+
+NeilBrown 28th March 2011
diff --git a/ANNOUNCE-3.2.2 b/ANNOUNCE-3.2.2
new file mode 100644
index 0000000..b70d18b
--- /dev/null
+++ b/ANNOUNCE-3.2.2
@@ -0,0 +1,36 @@
+Subject: ANNOUNCE: mdadm 3.2.2 - A tool for managing Soft RAID under Linux
+
+I am pleased to announce the availability of
+ mdadm version 3.2.2
+
+It is available at the usual places:
+ countrycode=xx.
+ http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://neil.brown.name/mdadm
+ http://neil.brown.name/git/mdadm
+
+This release is largely a stabilising release for the 3.2 series.
+Many of the changes just fix bugs introduced in 3.2 or 3.2.1.
+
+There are some new features. They are:
+ - reshaping IMSM (Intel metadata) arrays is no longer 'experimental',
+ it should work properly and be largely compatible with IMSM drivers in
+ other platforms.
+ - --assume-clean can be used with --grow --size to avoid resyncing the
+ new part of the array. This is only supported with very new kernels.
+ - RAID0 arrays can have chunksize which is not a power of 2. This has been
+ supported in the kernel for a while but is only now supported by
+ mdadm.
+
+ - A new tool 'raid6check' is available which can check a RAID6 array,
+ or part of it, and report which device is most inconsistent with the
+ others if any stripe is inconsistent. This is still under development
+ and does not have a man page yet. If anyone tries it out and has any
+ questions or experience to report, they would be most welcome on
+ linux-raid@vger.kernel.org.
+
+Future releases in the 3.2 series will only be made if bugfixes are needed.
+The next release to add features is expected to be 3.3.
+
+NeilBrown 17th June 2011
diff --git a/ANNOUNCE-3.2.3 b/ANNOUNCE-3.2.3
new file mode 100644
index 0000000..8a8dba4
--- /dev/null
+++ b/ANNOUNCE-3.2.3
@@ -0,0 +1,24 @@
+Subject: ANNOUNCE: mdadm 3.2.3 - A tool for managing Soft RAID under Linux
+
+I am pleased to announce the availability of
+ mdadm version 3.2.3
+
+It is available at the usual places:
+ countrycode=xx.
+ http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://neil.brown.name/mdadm
+ http://neil.brown.name/git/mdadm
+
+This release is largely a bugfix release for the 3.2 series with many
+minor fixes with little or no impact.
+
+The largest single area of change is support for reshape of Intel
+IMSM arrays (OnLine Capacity Expansion and Level Migration).
+Among other fixes, this now has a better chance of surviving if a
+device fails during reshape.
+
+Upgrading is recommended - particularly if you use mdadm for IMSM
+arrays - but not essential.
+
+NeilBrown 23rd December 2011
diff --git a/ANNOUNCE-3.2.4 b/ANNOUNCE-3.2.4
new file mode 100644
index 0000000..e321678
--- /dev/null
+++ b/ANNOUNCE-3.2.4
@@ -0,0 +1,144 @@
+Subject: ANNOUNCE: mdadm 3.2.4 - A tool for managing Soft RAID under Linux
+
+I am pleased to announce the availability of
+ mdadm version 3.2.4
+
+It is available at the usual places, now including github:
+ countrycode=xx.
+ http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://github.com/neilbrown/mdadm
+ git://neil.brown.name/mdadm
+ http://neil.brown.name/git/mdadm
+
+This release is largely a bugfix release for the 3.2 series with many
+minor fixes with little or no impact.
+
+"--oneline" log of changes is below. Some notable ones are:
+
+ - --offroot argument to improve interactions between mdmon and initrd
+ - --prefer argument to select which /dev names to display in some
+ circumstances.
+ - relax restrictions on when "--add" will be allowed
+ - Fix bug with adding write-intent-bitmap to active array
+ - Now defaults to "/run/mdadm" for storing run-time files.
+
+Upgrading is encouraged.
+
+The next mdadm release is expected to be 3.3 with a number of new
+features.
+
+NeilBrown 9th May 2012
+
+77b3ac8 monitor: make return from read_and_act more symbolic.
+68226a8 monitor: ensure we retry soon when 'remove' fails.
+8453f8d fix: Monitor sometimes crashes
+90fa1a2 Work around gcc-4.7's strict aliasing checks
+0c4304c fix: container creation with --incremental used.
+5d1c7cd FIX: External metadata sometimes is not updated
+3c20f98 FIX: mdmon check in reshape_container() can cause a problem
+59ab9f5 FIX: Typo error in fprint command
+9587c37 imsm: load_super_imsm_all function refactoring
+ec50f7b imsm: load_imsm_super_all supports loading metadata from the device list
+ca9de18 imsm: validate the number of imsm volumes per controller
+30602f5 imsm: display fd in error trace when when store_imsm_mpb failes
+eb155f6 mdmon: Use getopt_long() to parse command line options
+08ca2ad Add --offroot argument to mdadm
+da82751 Add --offroot argument to mdmon
+a0963a8 Spawn mdmon with --offroot if mdadm was launched with --offroot
+f878b24 imsm: fix, the second array need to have the whole available space on devices
+d597705 getinfo_super1: Use MaxSector in place of sb->size
+6ef8905 super1: make aread/awrite always use an aligned buffer.
+de5a472 Remove avail_disks arg from 'enough'.
+da8fe5a Assemble: fix --force assemble during reshape.
+b10c663 config: fix handing of 'homehost' in AUTO line.
+92d49ec FIX: NULL pointer to strdup() can be passed
+d2bde6d imsm: FIX: No new missing disks are allowed during general migration
+111e9fd FIX: Array is not run when expansion disks are added
+bf5cf7c imsm: FIX: imsm_get_allowed_degradation() doesn't count degradation for raid1
+50927b1 Fix: Sometimes mdmon throws core dump during reshape
+78340e2 Flush mdmon before next reshape step during container operation
+e174219 imsm: FIX: Chunk size migration problem
+f93346e FIX: use md position to reshape restart
+6a75c8c imsm: FIX: use md position to reshape restart
+51d83f5 imsm: FIX: Clear migration record when migration switches to next volume.
+e1dd332 FIX: restart reshape when reshape process is stopped just between 2 reshapes
+1ca90aa FIX: Do not try to (continue) reshape using inactive array
+9f1b0f0 config: conf_match should ignore devname when not set.
+d669228 Use posix_memalign() for memory used to write bitmaps
+178950e FIX: Changes in '0' case for reshape position verification
+9200d41 avoid double-free upon "old buggy kernel" sysfs_read failure
+4011421 Print error message if failing to write super for 1.x metadata
+0011874 Use MDMON_DIR for pid files created in Monitor.c
+56d1885 Assemble: don't use O_EXCL until we have checked device content.
+b720636 Assemble: support assembling of a RAID0 being reshaped.
+c69ffac Manage: allow --re-add to failed array.
+52f07f5 Reset bad flag on map update
+911cead super1: support superblocks up to 4K.
+ad6db3c Create: reduce the verbosity of 'default_layout'.
+b2bfdfa super1.c don't keep recalculating bitmap pointer
+4122675 Define and use SUPER1_SIZE for allocations
+1afa930 init_super1() memset full buffer allocated for superblock
+2de0b8a match_metadata_desc1(): Use calloc instead of malloc+memset
+3c0bcd4 Use 4K buffer alignment for superblock allocations
+308340a Use struct align_fd to cache fd's block size for aligned reads/writes
+65ed615 match_metadata_desc0(): Use calloc instead of malloc+memset
+de89706 Generalize ROUND_UP() macro and introduce matching ROUND_UP_PTR()
+0a2f189 super1.c: use ROUND_UP/ROUND_UP_PTR
+654a381 super-intel.c: Use ROUND_UP() instead of manually coding it
+42d5dfd __write_init_super_ddf(): Use posix_memalign() instead of static aligned buffer
+d4633e0 Examine: fix array size calculation for RAID10.
+e62b778 Assemble: improve verbose logging when including old devices.
+0073a6e Remove possible crash during RAID6 -> RAID5 reshape.
+69fe207 Incremental: fix adding devices with --incremental
+bcbb311 Manage: replace 'return 1' with 'goto abort'.
+9f58469 Manage: freeze recovery while adding multiple devices.
+ae6c05a Create: round off size for RAID1 arrays.
+5ca3a90 Grow: print useful error when converting RAID1->RAID5 will fail.
+c07d640 Fix tests/05r1-re-add-nosupper
+2d762ad Fix the new ROUND_UP macro.
+fd324b0 sysfs: fixed sysfs_freeze_array array to work properly with Manage_subdevs.
+5551b11 imsm: avoid overflows for disks over 1TB
+97f81ee clear hi bits if not used after loading metadata from disk
+e03640b simplify calculating array_blocks
+29cd082 show 2TB volumes/disks support in --detail-platform
+2cc699a check volume size in validate_geometry_imsm_orom
+9126b9a check that no disk over 2TB is used to create container when no support
+027c374 imsm: set 2tb disk attribute for spare
+3556c2f Fix typo: wan -> want
+15632a9 parse_size: distinguish between 0 and error.
+fbdef49 Bitmap_offset is a signed number
+508a7f1 super1: leave more space in front of data by default.
+40110b9 Fix two typos in fprintf messages
+342460c mdadm man page: fix typo
+0e7f69a imsm: display maximum volumes per controller and array
+36fd8cc imsm: FIX: Update function imsm_num_data_members() for Raid1/10
+7abc987 imsm: FIX: Add volume size expand support to imsm_analyze_change()
+f3871fd imsm: Add new metadata update for volume size expansion
+54397ed imsm: Execute size change for external metatdata
+016e00f FIX: Support metadata changes rollback
+fbf3d20 imsm: FIX: Support metadata changes rollback
+44f6f18 FIX: Extend size of raid0 array
+7e7e9a4 FIX: Respect metadata size limitations
+65a9798 FIX: Detect error and rollback metadata
+13bcac9 imsm: Add function imsm_get_free_size()
+b130333 imsm: Support setting max size for size change operation
+c41e00b imsm: FIX: Component size alignment check
+58d26a2 FIX: Size change is possible as standalone change only
+4aecb54 FIX: Assembled second array is in read only state during reshape
+ae2416e FIX: resolve make everything compilation error
+480f356 Raid limit of 1024 when scanning for devices.
+c2ecf5f Add --prefer option for --detail and --monitor
+0a99975 Relax restrictions on when --add is permitted.
+7ce0570 imsm: fix: rebuild does not continue after reboot
+b51702b fix: correct extending size of raid0 array
+34a1395 Fix sign extension of bitmap_offset in super1.c
+012a864 Introduce sysfs_set_num_signed() and use it to set bitmap/offset
+5d7b407 imsm: fix: thunderdome may drop 2tb attribute
+5ffdc2d Update test for "is udev active".
+96fd06e Adjust to new standard of /run
+974e039 test: don't worry too much about array size.
+b0a658f Grow: failing the set the per-device size is not an error.
+36614e9 super-intel.c: Don't try to close negative fd
+562aa10 super-intel.c: Fix resource leak from opendir()
+
diff --git a/ANNOUNCE-3.2.5 b/ANNOUNCE-3.2.5
new file mode 100644
index 0000000..396da12
--- /dev/null
+++ b/ANNOUNCE-3.2.5
@@ -0,0 +1,31 @@
+Subject: ANNOUNCE: mdadm 3.2.5 - A tool for managing Soft RAID under Linux
+
+I am somewhat disappointed to have to announce the availability of
+ mdadm version 3.2.5
+
+It is available at the usual places, now including github:
+ countrycode=xx.
+ http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://github.com/neilbrown/mdadm
+ git://neil.brown.name/mdadm
+ http://neil.brown.name/git/mdadm
+
+This release primarily fixes a serious regression in 3.2.4.
+This regression does *not* cause any risk to data. It simply
+means that adding a device with "--add" would sometimes fail
+when it should not.
+
+The fix also includes a couple of minor fixes such as making
+the "--layout=preserve" option to "--grow" work again.
+
+A reminder that the default location for runtime files is now
+"/run/mdadm". If you compile this for a distro that does not
+have "/run", you will need to compile with an alternate setting for
+MAP_DIR. e.g.
+ make MAP_DIR=/var/run/mdadm
+or
+ make MAP_DIR=/dev/.mdadm
+
+NeilBrown 18th May 2012
+
diff --git a/ANNOUNCE-3.2.6 b/ANNOUNCE-3.2.6
new file mode 100644
index 0000000..f5cfd49
--- /dev/null
+++ b/ANNOUNCE-3.2.6
@@ -0,0 +1,57 @@
+Subject: ANNOUNCE: mdadm 3.2.6 - A tool for managing Soft RAID under Linux
+
+I am pleased to announce the availability of
+ mdadm version 3.2.6
+
+It is available at the usual places, now including github:
+ countrycode=xx.
+ http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://github.com/neilbrown/mdadm
+ git://neil.brown.name/mdadm
+ http://neil.brown.name/git/mdadm
+
+This is a stability release which adds a number of bugfixes to 3.2.5.
+There are no real stand-out fixes, just lots of little bits and pieces.
+
+Below is the "git log --oneline --reverse" list of changes since
+3.2.5.
+
+NeilBrown 25th October 2012
+
+b7e05d2 udev-rules: prevent systemd from mount devices before they are ready.
+0d478e2 mdadm: Fix Segmentation fault.
+42f0ca1 imsm: fix: correct checking volume's degradation
+fcf2195 Monitor: fix inconsistencies in values for ->percent
+5f862fb Monitor: Report NewArray when an array the disappeared, reappears.
+6f51b1c Monitor: fix reporting for Fail vs FailSpare etc.
+68ad53b mdmon: fix arg parsing.
+517f135 Assemble: don't leak memory with fdlist.
+090900c udev-rules: prevent systemd from mount devices before they are ready.
+446e000 sha1.h: remove ansidecl.h header inclusion
+ec894f5 Manage: zero metadata before adding to 'external' array.
+3a84db5 ddf: allow a non-spare to be used to recovery a missing device.
+c5d61ca ddf: hack to fix container recognition.
+23084aa mdmon: fix arg processing for -a
+c4e96a3 mdmon: allow --takeover when original was started with --offroot
+80841df find_free_devnum: avoid auto-using names in /etc/mdadm.conf
+c5c56d6 mapfile: fix mapfile rebuild for containers
+aec89f6 fix segfaults in Detail()
+2117ad1 Fix 'enough' function for RAID10.
+0bc300d Use --offroot flag when assembling md arrays via --incrmental
+ac78f24 Grow: make warning about old metadata more explicit.
+14026ab Replace sha1.h with slightly older version.
+6f6809f Add zlib license to crc32.c
+5267ba0 Handles spaces in array names better.
+c51f288 imsm: allow --assume-clean to work.
+acf7076 Grow: allow --grow --continue to work for native metadata.
+335d2a6 Grow: fix a couple of typos with --assume-clean usage
+9ff1427 Fix open_container
+3713633 mdadm: super0: do not override uuid with homehost
+31bff58 Trivial bugfix and spelling fixes.
+e1e539f Detail: don't report a faulty device as 'spare' or 'rebuilding'.
+22a6461 super0: allow creation of array on 2TB+ devices.
+a5d47a2 Create new md devices consistently
+eb48676 Monitor: don't complain about non-monitorable arrays in mdadm.conf
+ecdf2d7 Query: don't be confused by partition tables.
+f7b75c1 Query: allow member of non-0.90 arrays to be better reported.
diff --git a/ANNOUNCE-3.3 b/ANNOUNCE-3.3
new file mode 100644
index 0000000..f770aa1
--- /dev/null
+++ b/ANNOUNCE-3.3
@@ -0,0 +1,63 @@
+Subject: ANNOUNCE: mdadm 3.3 - A tool for managing md Soft RAID under Linux
+
+I am pleased to announce the availability of
+ mdadm version 3.3
+
+It is available at the usual places:
+ http://www.kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://github.com/neilbrown/mdadm
+ git://neil.brown.name/mdadm
+ http://git.neil.brown.name/git/mdadm
+
+This is a major new release so don't be too surprised if there are a
+few issues. If I hear about them they will be fixed in 3.3.1.
+git log reports nearly 500 changes since 3.2.6 so I won't list them
+all.
+
+Some highlights are:
+
+- Some array reshapes can proceed without needing backup file.
+ This is done by changing the 'data_offset' so we never need to write
+ any data back over where it was before. If there is no "head space"
+ or "tail space" to allow data_offset to change, the old mechanism
+ with a backup file can still be used.
+- RAID10 arrays can be reshaped to change the number of devices,
+ change the chunk size, or change the layout between 'near'
+ and 'offset'.
+ This will always change data_offset, and will fail if there is no
+ room for data_offset to be moved.
+- "--assemble --update=metadata" can convert a 0.90 array to a 1.0 array.
+- bad-block-logs are supported (but not heavily tested yet)
+- "--assemble --update=revert-reshape" can be used to undo a reshape
+ that has just been started but isn't really wanted. This is very
+ new and while it passes basic tests it cannot be guaranteed.
+- improved locking between --incremental and --assemble
+- uses systemd to run "mdmon" if systemd is configured to do that.
+- kernel names of md devices can be non-numeric. e.g. "md_home" rather than
+ "md0". This will probably confuse lots of other tools, so you need to
+ echo CREATE names=yes >> /etc/mdadm.conf
+ or the feature will not be used. (you also need a reasonably new kernel).
+- "--stop" can be given a kernel name instead of a device name. i.e
+ mdadm --stop md4
+ will work even if /dev/md4 doesn't exist.
+- "--detail --export" has some information about the devices in the array
+- --dump and --restore can be used to backup and restore the metadata on an
+ array.
+- Hot-replace is supported with
+ mdadm /dev/mdX --replace /dev/foo
+ and
+ mdadm /dev/mdX --replace /dev/foo --with /dev/bar
+- Config file can be a directory in which case all "*.conf" files are
+ read in lexical order.
+ Default is to read /etc/mdadm.conf and then /etc/mdadm.conf.d
+ Thus
+ echo CREATE names=yes > /etc/mdadm.conf.d/names.conf
+ will also enable the use of named md devices.
+
+- Lots of improvements to DDF support including adding support for
+ RAID10 (thanks Martin Wilck).
+
+and lots of bugfixes and other little changes.
+
+NeilBrown 3rd September 2013
diff --git a/ANNOUNCE-3.3.1 b/ANNOUNCE-3.3.1
new file mode 100644
index 0000000..7d5e666
--- /dev/null
+++ b/ANNOUNCE-3.3.1
@@ -0,0 +1,23 @@
+Subject: ANNOUNCE: mdadm 3.3.1 - A tool for managing md Soft RAID under Linux
+
+I am pleased to announce the availability of
+ mdadm version 3.3.1
+
+It is available at the usual places:
+ http://www.kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://github.com/neilbrown/mdadm
+ git://neil.brown.name/mdadm
+ http://git.neil.brown.name/git/mdadm.git
+
+The main changes are:
+ - lots of work on "DDF" support. Hopefully it will be more stable
+ now. Bug reports are always welcome.
+ - improved interactions with 'systemd'. Where possible, background
+ tasks are run from systemd (if it is present) rather than forking
+ and disassociating from the session. This is important because udev
+ doesn't really let you disassociate.
+
+though there are a number of other little bug fixes too.
+
+NeilBrown 5th June 2014
diff --git a/ANNOUNCE-3.3.2 b/ANNOUNCE-3.3.2
new file mode 100644
index 0000000..6b54961
--- /dev/null
+++ b/ANNOUNCE-3.3.2
@@ -0,0 +1,16 @@
+Subject: ANNOUNCE: mdadm 3.3.2 - A tool for managing md Soft RAID under Linux
+
+I am pleased to announce the availability of
+ mdadm version 3.3.2
+
+It is available at the usual places:
+ http://www.kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://github.com/neilbrown/mdadm
+ git://neil.brown.name/mdadm
+ http://git.neil.brown.name/git/mdadm.git
+
+Changes since 3.3.1 are mostly little bugfixes and some man-page
+updates.
+
+NeilBrown 21st August 2014
diff --git a/ANNOUNCE-3.3.3 b/ANNOUNCE-3.3.3
new file mode 100644
index 0000000..ac1b217
--- /dev/null
+++ b/ANNOUNCE-3.3.3
@@ -0,0 +1,18 @@
+Subject: ANNOUNCE: mdadm 3.3.3 - A tool for managing md Soft RAID under Linux
+
+I am pleased to announce the availability of
+ mdadm version 3.3.3
+
+It is available at the usual places:
+ http://www.kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://github.com/neilbrown/mdadm
+ git://neil.brown.name/mdadm
+ http://git.neil.brown.name/git/mdadm.git
+
+The 100 changes since 3.3.2 are mostly little bugfixes and some improvements
+to the selftests.
+raid6check now handles all RAID6 layouts including DDF correctly.
+See git log for the rest.
+
+NeilBrown 24th July 2015
diff --git a/ANNOUNCE-3.3.4 b/ANNOUNCE-3.3.4
new file mode 100644
index 0000000..52b9456
--- /dev/null
+++ b/ANNOUNCE-3.3.4
@@ -0,0 +1,37 @@
+Subject: ANNOUNCE: mdadm 3.3.4 - A tool for managing md Soft RAID under Linux
+
+I am somewhat disappointed to have to announce the availability of
+ mdadm version 3.3.4
+
+It is available at the usual places:
+ http://www.kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://github.com/neilbrown/mdadm
+ git://neil.brown.name/mdadm
+ http://git.neil.brown.name/git/mdadm.git
+
+In mdadm-3.3 a change was made to how IMSM (Intel Matrix Storage
+Manager) metadata was handled. Previously an IMSM array would only
+be assembled if it was attached to an IMSM controller.
+
+In 3.3 this was relaxed as there are circumstances where the
+controller is not properly detected. Unfortunately this has negative
+consequences which have only just come to light.
+
+If you have an IMSM RAID1 configured and then disable RAID in the
+BIOS, the metadata will remain on the devices. If you then install
+some other OS on one device and then install Linux on the other, Linux
+might eventually start noticing the IMSM metadata (depending a bit on whether
+mdadm is included in the initramfs) and might start up the RAID1. This could
+copy one device over the other, thus trashing one of the installations.
+
+Not good.
+
+So with this release IMSM arrays will only be assembled if attached to
+an IMSM controller, or if "--force" is given to --assemble, or if the
+environment variable IMSM_NO_PLATFORM is set (used primarily for
+testing).
+
+I strongly recommend upgrading to 3.3.4 if you are using 3.3 or later.
+
+NeilBrown 3rd August 2015.
diff --git a/ANNOUNCE-3.4 b/ANNOUNCE-3.4
new file mode 100644
index 0000000..2689732
--- /dev/null
+++ b/ANNOUNCE-3.4
@@ -0,0 +1,24 @@
+Subject: ANNOUNCE: mdadm 3.4 - A tool for managing md Soft RAID under Linux
+
+I am pleased to announce the availability of
+ mdadm version 3.4
+
+It is available at the usual places:
+ http://www.kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://github.com/neilbrown/mdadm
+ git://neil.brown.name/mdadm
+ http://git.neil.brown.name/git/mdadm
+
+The new second-level version number reflects significant new
+functionality, particularly support for journalled RAID5/6 and clustered
+RAID1. This new support is probably still buggy. Please report bugs.
+
+There are also a number of fixes for Intel's IMSM metadata support,
+and an assortment of minor bug fixes.
+
+I plan for this to be the last release of mdadm that I provide as I am
+retiring from MD and mdadm maintenance. Jes Sorensen has volunteered
+to oversee mdadm for the next while. Thanks Jes!
+
+NeilBrown 28th January 2016
diff --git a/ANNOUNCE-4.0 b/ANNOUNCE-4.0
new file mode 100644
index 0000000..f79c540
--- /dev/null
+++ b/ANNOUNCE-4.0
@@ -0,0 +1,22 @@
+Subject: ANNOUNCE: mdadm 4.0 - A tool for managing md Soft RAID under Linux
+
+I am pleased to announce the availability of
+ mdadm version 4.0
+
+It is available at the usual places:
+ http://www.kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://git.kernel.org/pub/scm/utils/mdadm/mdadm.git
+ http://git.kernel.org/cgit/utils/mdadm/
+
+The update in major version number primarily indicates this is a
+release by its new maintainer. In addition it contains a large number
+of fixes in particular for IMSM RAID and clustered RAID support. In
+addition this release includes support for IMSM 4k sector drives,
+failfast and better documentation for journaled RAID.
+
+This is my first release of mdadm. Please thank Neil Brown for his
+previous work as maintainer and blame me for all the bugs I caused
+since taking over.
+
+Jes Sorensen, 2017-01-09
diff --git a/ANNOUNCE-4.1 b/ANNOUNCE-4.1
new file mode 100644
index 0000000..a273b9a
--- /dev/null
+++ b/ANNOUNCE-4.1
@@ -0,0 +1,16 @@
+Subject: ANNOUNCE: mdadm 4.1 - A tool for managing md Soft RAID under Linux
+
+I am pleased to announce the availability of
+ mdadm version 4.1
+
+It is available at the usual places:
+ http://www.kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://git.kernel.org/pub/scm/utils/mdadm/mdadm.git
+ http://git.kernel.org/cgit/utils/mdadm/
+
+The update constitutes more than one year of enhancements and bug fixes
+including for IMSM RAID, Partial Parity Log, clustered RAID support,
+improved testing, and gcc-8 support.
+
+Jes Sorensen, 2018-10-01
diff --git a/ANNOUNCE-4.2 b/ANNOUNCE-4.2
new file mode 100644
index 0000000..8b22d09
--- /dev/null
+++ b/ANNOUNCE-4.2
@@ -0,0 +1,19 @@
+Subject: ANNOUNCE: mdadm 4.2 - A tool for managing md Soft RAID under Linux
+
+I am pleased to finally announce the availability of mdadm-4.2.
+get 4.2 out the door soon.
+
+It is available at the usual places:
+ http://www.kernel.org/pub/linux/utils/raid/mdadm/
+and via git at
+ git://git.kernel.org/pub/scm/utils/mdadm/mdadm.git
+ http://git.kernel.org/cgit/utils/mdadm/
+
+The release includes more than two years of development and bugfixes,
+so it is difficult to remember everything. Highlights include
+enhancements and bug fixes including for IMSM RAID, Partial Parity
+Log, clustered RAID support, improved testing, and gcc-9 support.
+
+Thank you everyone who contributed to this release!
+
+Jes Sorensen, 2021-12-30
diff --git a/Assemble.c b/Assemble.c
new file mode 100644
index 0000000..704b829
--- /dev/null
+++ b/Assemble.c
@@ -0,0 +1,2227 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2001-2016 Neil Brown <neilb@suse.com>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email: <neilb@suse.de>
+ */
+
+#include "mdadm.h"
+#include <ctype.h>
+
+/*
+ * Phrases appended to the "... has been assembled with N device(s)" message
+ * printed by set_array_assembly_status(), looked up by INCR_* status via
+ * map_num().  The last entry has a NULL name; presumably it also acts as
+ * the table terminator expected by map_num() -- TODO confirm.
+ */
+mapping_t assemble_statuses[] = {
+ { "but cannot be started", INCR_NO },
+ { "but not safe to start", INCR_UNSAFE },
+ { "and started", INCR_YES },
+ { NULL, INCR_ALREADY }
+};
+
+
+/**
+ * struct assembly_array_info - General, meaningful information for assembly.
+ * @name: Array name.
+ * @new_cnt: Count of drives known to be members, recently added.
+ * @preexist_cnt: Count of member drives in pre-assembled array.
+ * @exp_cnt: Count of known expansion targets.
+ *
+ * set_array_assembly_status() reports @new_cnt + @preexist_cnt as the total
+ * number of devices and mentions @exp_cnt separately when it is non-zero.
+ *
+ * FIXME: @exp_new_cnt for recently added expansion targets.
+ */
+struct assembly_array_info {
+ char *name;
+ int new_cnt;
+ int preexist_cnt;
+ int exp_cnt;
+};
+
+/**
+ * set_array_assembly_status() - report the outcome of assembling one array.
+ * @c: Global settings; @c->export and @c->verbose select the reporting mode.
+ * @result: Status mask updated in export mode, may be NULL.
+ * @status: Status to be recorded or printed.
+ * @arr: Name and device counts of the array.
+ *
+ * In export mode @status is OR-ed into @result (when it is not NULL) and
+ * nothing is printed.  Otherwise, unless verbosity is negative, a one-line
+ * summary of the assembly is printed.
+ */
+static void set_array_assembly_status(struct context *c,
+				      int *result, int status,
+				      struct assembly_array_info *arr)
+{
+	int total = arr->new_cnt + arr->preexist_cnt;
+	char *outcome = map_num(assemble_statuses, status);
+	const char *plural = "s";
+
+	/* Export mode: only record the status, never print. */
+	if (c->export) {
+		if (result)
+			*result |= status;
+		return;
+	}
+
+	/* Quiet mode: nothing to say. */
+	if (c->verbose < 0)
+		return;
+
+	if (total == 1)
+		plural = "";
+
+	pr_err("%s has been assembled with %d device%s", arr->name,
+	       total, plural);
+	if (arr->preexist_cnt > 0)
+		fprintf(stderr, " (%d new)", arr->new_cnt);
+	if (arr->exp_cnt != 0)
+		fprintf(stderr, " ( + %d for expansion)", arr->exp_cnt);
+	if (outcome != NULL)
+		fprintf(stderr, " %s", outcome);
+	fprintf(stderr, ".\n");
+}
+
+/*
+ * Decide whether the array name 'found' satisfies the name 'required'.
+ *
+ * An exact match always succeeds.  Otherwise 'found' must have the form
+ * "<host>:<name>" where <name> equals 'required' and the host part is
+ * acceptable: it is "any", or 'homehost' is "any", or a homehost is not
+ * required, or the host part equals 'homehost'.
+ * Returns 1 on a match and 0 otherwise.
+ */
+static int name_matches(char *found, char *required, char *homehost, int require_homehost)
+{
+	char *colon;
+	unsigned int host_len;
+	int host_ok;
+
+	if (strcmp(found, required) == 0)
+		return 1;
+
+	colon = strchr(found, ':');
+	if (colon == NULL)
+		return 0;
+	host_len = colon - found;
+
+	if (strncmp(found, "any:", 4) == 0)
+		host_ok = 1;
+	else if (homehost && strcmp(homehost, "any") == 0)
+		host_ok = 1;
+	else if (!require_homehost)
+		host_ok = 1;
+	else if (homehost && strlen(homehost) == host_len &&
+		 strncmp(found, homehost, host_len) == 0)
+		host_ok = 1;
+	else
+		host_ok = 0;
+
+	if (!host_ok)
+		return 0;
+
+	/* host part accepted: the remainder must be the required name */
+	return strcmp(colon + 1, required) == 0;
+}
+
+/*
+ * Report whether the member array named by 'metadata_version' is active,
+ * i.e. whether an mdstat entry with "external:" metadata that names a
+ * subarray matches it.  The first character of both names is skipped when
+ * comparing because it can be '/' or '-'.
+ * Returns 1 if such an entry exists, 0 otherwise.
+ */
+static int is_member_busy(char *metadata_version)
+{
+	struct mdstat_ent *list = mdstat_read(0, 0);
+	struct mdstat_ent *cur;
+	int found = 0;
+
+	for (cur = list; cur != NULL; cur = cur->next) {
+		char *ver = cur->metadata_version;
+
+		if (ver != NULL &&
+		    strncmp(ver, "external:", 9) == 0 &&
+		    is_subarray(ver + 9) &&
+		    strcmp(ver + 10, metadata_version + 1) == 0) {
+			found = 1;
+			break;
+		}
+	}
+	free_mdstat(list);
+
+	return found;
+}
+
+/*
+ * Check the array described by 'content' against every constraint that is
+ * set in 'ident': uuid, name, super-minor, raid level, number of raid
+ * disks and container member.
+ *
+ * The uuid and name checks are skipped when 'update' names that same
+ * field, and a content uuid equal to uuid_zero is never rejected.
+ * Returns 1 when all set constraints are satisfied.  Otherwise returns 0,
+ * reporting the first mismatch via pr_err() when 'devname' is not NULL.
+ */
+static int ident_matches(struct mddev_ident *ident,
+			 struct mdinfo *content,
+			 struct supertype *tst,
+			 char *homehost, int require_homehost,
+			 char *update, char *devname)
+{
+	int updating_uuid = update && strcmp(update, "uuid") == 0;
+	int updating_name = update && strcmp(update, "name") == 0;
+	char *slash;
+
+	if (ident->uuid_set && !updating_uuid &&
+	    !same_uuid(content->uuid, ident->uuid, tst->ss->swapuuid) &&
+	    memcmp(content->uuid, uuid_zero, sizeof(int[4])) != 0) {
+		if (devname)
+			pr_err("%s has wrong uuid.\n", devname);
+		return 0;
+	}
+
+	if (ident->name[0] && !updating_name &&
+	    !name_matches(content->name, ident->name, homehost, require_homehost)) {
+		if (devname)
+			pr_err("%s has wrong name.\n", devname);
+		return 0;
+	}
+
+	if (ident->super_minor != UnSet &&
+	    content->array.md_minor != ident->super_minor) {
+		if (devname)
+			pr_err("%s has wrong super-minor.\n",
+			       devname);
+		return 0;
+	}
+
+	if (ident->level != UnSet &&
+	    content->array.level != ident->level) {
+		if (devname)
+			pr_err("%s has wrong raid level.\n",
+			       devname);
+		return 0;
+	}
+
+	/* raid_disks == 0: metadata doesn't know how many to expect */
+	if (ident->raid_disks != UnSet &&
+	    content->array.raid_disks != 0 &&
+	    content->array.raid_disks != ident->raid_disks) {
+		if (devname)
+			pr_err("%s requires wrong number of drives.\n",
+			       devname);
+		return 0;
+	}
+
+	/* No member constraint: everything that was set has matched. */
+	if (!ident->member || !ident->member[0])
+		return 1;
+
+	/* content->text_version must match */
+	slash = strchr(content->text_version + 1, '/');
+	if (slash == NULL) {
+		if (devname)
+			pr_err("%s is not a container and one is required.\n",
+			       devname);
+		return 0;
+	}
+	if (strcmp(ident->member, slash + 1) != 0) {
+		if (devname)
+			pr_err("skipping wrong member %s is %s\n",
+			       content->text_version, devname);
+		return 0;
+	}
+	return 1;
+}
+
+/*
+ * select_devices() - pick the devices from @devlist that form one array.
+ *
+ * Every entry of @devlist is classified through its ->used field:
+ *   0 - not selected (also: a spare whose policy domain did not match)
+ *   1 - accepted as a member of the array being assembled
+ *   2 - rejected/unusable (entries that were already in use on entry are
+ *       moved to 2 as well so that they are skipped)
+ *   3 - floating spare (all-zero UUID); resolved after the main walk
+ *
+ * @stp:      in/out; the supertype to match against (may be NULL on entry),
+ *            updated to the supertype that was settled on.
+ * @contentp: in/out; on entry points at caller-provided storage, on return
+ *            points at the mdinfo describing the array (for a container
+ *            this is the matching element returned by container_content()).
+ *            It is set to NULL when no device got far enough to be examined.
+ * @inargv:   non-zero when the devices were listed explicitly by the user.
+ * @auto_assem: non-zero when auto-assembling.
+ *
+ * Returns the number of entries that were not already in use on entry,
+ * or -1 when assembly has to be aborted.
+ */
+static int select_devices(struct mddev_dev *devlist,
+			  struct mddev_ident *ident,
+			  struct supertype **stp,
+			  struct mdinfo **contentp,
+			  struct context *c,
+			  int inargv, int auto_assem)
+{
+	struct mddev_dev *tmpdev;
+	int num_devs;
+	struct supertype *st = *stp;
+	struct mdinfo *content = NULL;
+	int report_mismatch = ((inargv && c->verbose >= 0) || c->verbose > 0);
+	struct domainlist *domains = NULL;
+	dev_t rdev;
+
+	/* Anything already in use is excluded (used = 2); count the rest. */
+	tmpdev = devlist; num_devs = 0;
+	while (tmpdev) {
+		if (tmpdev->used)
+			tmpdev->used = 2;
+		else
+			num_devs++;
+		tmpdev->disposition = 0;
+		tmpdev = tmpdev->next;
+	}
+
+	/* first walk the list of devices to find a consistent set
+	 * that match the criteria, if that is possible.
+	 * We flag the ones we like with 'used'.
+	 */
+	for (tmpdev = devlist;
+	     tmpdev;
+	     tmpdev = tmpdev ? tmpdev->next : NULL) {
+		char *devname = tmpdev->devname;
+		int dfd;
+		struct supertype *tst;
+		struct dev_policy *pol = NULL;
+		int found_container = 0;
+
+		if (tmpdev->used > 1)
+			continue;
+
+		if (ident->container) {
+			if (ident->container[0] == '/' &&
+			    !same_dev(ident->container, devname)) {
+				if (report_mismatch)
+					pr_err("%s is not the container required (%s)\n",
+					       devname, ident->container);
+				continue;
+			}
+		} else if (ident->devices &&
+			   !match_oneof(ident->devices, devname)) {
+			/* Note that we ignore the "device=" identifier if a
+			 * "container=" is given.  Checking both is unnecessarily
+			 * complicated.
+			 */
+			if (report_mismatch)
+				pr_err("%s is not one of %s\n", devname, ident->devices);
+			continue;
+		}
+
+		tst = dup_super(st);
+
+		dfd = dev_open(devname, O_RDONLY);
+		if (dfd < 0) {
+			if (report_mismatch)
+				pr_err("cannot open device %s: %s\n",
+				       devname, strerror(errno));
+			tmpdev->used = 2;
+		} else if (!fstat_is_blkdev(dfd, devname, &rdev)) {
+			tmpdev->used = 2;
+		} else if (must_be_container(dfd)) {
+			if (st) {
+				/* already found some components, this cannot
+				 * be another one.
+				 */
+				if (report_mismatch)
+					pr_err("%s is a container, but we are looking for components\n",
+					       devname);
+				tmpdev->used = 2;
+			} else if (!tst && (tst = super_by_fd(dfd, NULL)) == NULL) {
+				/* This used to be a plain 'if', so a container
+				 * rejected just above was still loaded and
+				 * could be reported a second time.
+				 */
+				if (report_mismatch)
+					pr_err("not a recognisable container: %s\n",
+					       devname);
+				tmpdev->used = 2;
+			} else if (!tst->ss->load_container ||
+				   tst->ss->load_container(tst, dfd, NULL)) {
+				if (report_mismatch)
+					pr_err("no correct container type: %s\n",
+					       devname);
+				tmpdev->used = 2;
+			} else if (auto_assem &&
+				   !conf_test_metadata(tst->ss->name,
+						       (pol = devid_policy(rdev)),
+						       tst->ss->match_home(tst, c->homehost) == 1)) {
+				if (report_mismatch)
+					pr_err("%s has metadata type %s for which auto-assembly is disabled\n",
+					       devname, tst->ss->name);
+				tmpdev->used = 2;
+			} else
+				found_container = 1;
+		} else {
+			if (!tst && (tst = guess_super(dfd)) == NULL) {
+				if (report_mismatch)
+					pr_err("no recogniseable superblock on %s\n",
+					       devname);
+				tmpdev->used = 2;
+			} else if ((tst->ignore_hw_compat = 0),
+				   tst->ss->load_super(tst, dfd,
+						       report_mismatch ? devname : NULL)) {
+				if (report_mismatch)
+					pr_err("no RAID superblock on %s\n",
+					       devname);
+				tmpdev->used = 2;
+			} else if (tst->ss->compare_super == NULL) {
+				if (report_mismatch)
+					pr_err("Cannot assemble %s metadata on %s\n",
+					       tst->ss->name, devname);
+				tmpdev->used = 2;
+			} else if (auto_assem && st == NULL &&
+				   !conf_test_metadata(tst->ss->name,
+						       (pol = devid_policy(rdev)),
+						       tst->ss->match_home(tst, c->homehost) == 1)) {
+				if (report_mismatch)
+					pr_err("%s has metadata type %s for which auto-assembly is disabled\n",
+					       devname, tst->ss->name);
+				tmpdev->used = 2;
+			}
+		}
+		if (dfd >= 0) close(dfd);
+		if (tmpdev->used == 2) {
+			if (auto_assem || !inargv)
+				/* Ignore unrecognised devices during auto-assembly */
+				goto loop;
+			if (ident->name[0] ||
+			    ident->super_minor != UnSet)
+				/* Ignore unrecognised device if looking for
+				 * specific array */
+				goto loop;
+			if (ident->uuid_set)
+				/* ignore unrecognized device if looking for
+				 * specific uuid
+				 */
+				goto loop;
+
+			pr_err("%s has no superblock - assembly aborted\n",
+			       devname);
+			if (st)
+				st->ss->free_super(st);
+			dev_policy_free(pol);
+			domain_free(domains);
+			if (tst)
+				tst->ss->free_super(tst);
+			return -1;
+		}
+
+		if (found_container) {
+			/* tmpdev is a container.  We need to be either
+			 * looking for a member, or auto-assembling
+			 */
+			/* should be safe to try an exclusive open now, we
+			 * have rejected anything that some other mdadm might
+			 * be looking at
+			 */
+			dfd = dev_open(devname, O_RDONLY | O_EXCL);
+			if (dfd < 0) {
+				if (report_mismatch)
+					pr_err("%s is busy - skipping\n", devname);
+				goto loop;
+			}
+			close(dfd);
+
+			if (ident->container && ident->container[0] != '/') {
+				/* we have a uuid */
+				int uuid[4];
+
+				content = *contentp;
+				tst->ss->getinfo_super(tst, content, NULL);
+
+				if (!parse_uuid(ident->container, uuid) ||
+				    !same_uuid(content->uuid, uuid, tst->ss->swapuuid)) {
+					if (report_mismatch)
+						pr_err("%s has wrong UUID to be required container\n",
+						       devname);
+					goto loop;
+				}
+			}
+			/* It is worth looking inside this container.
+			 */
+			if (c->verbose > 0)
+				pr_err("looking in container %s\n",
+				       devname);
+
+			/* Stop at the first member that matches, is not
+			 * already assembled and is not blocked.
+			 */
+			for (content = tst->ss->container_content(tst, NULL);
+			     content;
+			     content = content->next) {
+
+				if (!ident_matches(ident, content, tst,
+						   c->homehost, c->require_homehost,
+						   c->update,
+						   report_mismatch ? devname : NULL))
+					/* message already printed */;
+				else if (is_member_busy(content->text_version)) {
+					if (report_mismatch)
+						pr_err("member %s in %s is already assembled\n",
+						       content->text_version,
+						       devname);
+				} else if (content->array.state & (1<<MD_SB_BLOCK_VOLUME)) {
+					/* do not assemble arrays with unsupported configurations */
+					pr_err("Cannot activate member %s in %s.\n",
+					       content->text_version,
+					       devname);
+				} else
+					break;
+			}
+			if (!content) {
+				tmpdev->used = 2;
+				goto loop; /* empty container */
+			}
+
+			/* Ownership of the loaded container moves to 'st'. */
+			st = tst; tst = NULL;
+			if (!auto_assem && inargv && tmpdev->next != NULL) {
+				pr_err("%s is a container, but is not only device given: confused and aborting\n",
+				       devname);
+				st->ss->free_super(st);
+				dev_policy_free(pol);
+				domain_free(domains);
+				return -1;
+			}
+			if (c->verbose > 0)
+				pr_err("found match on member %s in %s\n",
+				       content->text_version, devname);
+
+			/* make sure we finished the loop */
+			tmpdev = NULL;
+			goto loop;
+		} else {
+			content = *contentp;
+			tst->ss->getinfo_super(tst, content, NULL);
+
+			if (!ident_matches(ident, content, tst,
+					   c->homehost, c->require_homehost,
+					   c->update,
+					   report_mismatch ? devname : NULL))
+				goto loop;
+
+			if (auto_assem) {
+				/* Never auto-assemble things that conflict
+				 * with mdadm.conf in some way
+				 */
+				struct mddev_ident *match;
+				int rv = 0;
+
+				match = conf_match(tst, content, devname,
+						   report_mismatch ? c->verbose : -1,
+						   &rv);
+				if (!match && rv == 2)
+					goto loop;
+				if (match && match->devname &&
+				    strcasecmp(match->devname, "<ignore>") == 0) {
+					if (report_mismatch)
+						pr_err("%s is a member of an explicitly ignored array\n",
+						       devname);
+					goto loop;
+				}
+				if (match && !ident_matches(match, content, tst,
+							    c->homehost, c->require_homehost,
+							    c->update,
+							    report_mismatch ? devname : NULL))
+					/* Array exists in mdadm.conf but some
+					 * details don't match, so reject it
+					 */
+					goto loop;
+			}
+
+			/* should be safe to try an exclusive open now, we
+			 * have rejected anything that some other mdadm might
+			 * be looking at
+			 */
+			dfd = dev_open(devname, O_RDONLY | O_EXCL);
+			if (dfd < 0) {
+				if (report_mismatch)
+					pr_err("%s is busy - skipping\n", devname);
+				goto loop;
+			}
+			close(dfd);
+
+			if (st == NULL)
+				st = dup_super(tst);
+			if (st->minor_version == -1)
+				st->minor_version = tst->minor_version;
+
+			if (memcmp(content->uuid, uuid_zero,
+				   sizeof(int[4])) == 0) {
+				/* this is a floating spare.  It cannot define
+				 * an array unless there are no more arrays of
+				 * this type to be found.  It can be included
+				 * in an array of this type though.
+				 */
+				tmpdev->used = 3;
+				goto loop;
+			}
+
+			if (st->ss != tst->ss ||
+			    st->minor_version != tst->minor_version ||
+			    st->ss->compare_super(st, tst, 1) != 0) {
+				/* Some mismatch. If exactly one array matches this host,
+				 * we can resolve on that one.
+				 * Or, if we are auto assembling, we just ignore the second
+				 * for now.
+				 */
+				if (auto_assem)
+					goto loop;
+				if (c->homehost) {
+					int first = st->ss->match_home(st, c->homehost);
+					int last = tst->ss->match_home(tst, c->homehost);
+					if (first != last &&
+					    (first == 1 || last == 1)) {
+						/* We can do something */
+						if (first) {/* just ignore this one */
+							if (report_mismatch)
+								pr_err("%s misses out due to wrong homehost\n",
+								       devname);
+							goto loop;
+						} else { /* reject all those sofar */
+							struct mddev_dev *td;
+							if (report_mismatch)
+								pr_err("%s overrides previous devices due to good homehost\n",
+								       devname);
+							for (td=devlist; td != tmpdev; td=td->next)
+								if (td->used == 1)
+									td->used = 0;
+							tmpdev->used = 1;
+							goto loop;
+						}
+					}
+				}
+				pr_err("superblock on %s doesn't match others - assembly aborted\n",
+				       devname);
+				tst->ss->free_super(tst);
+				st->ss->free_super(st);
+				dev_policy_free(pol);
+				domain_free(domains);
+				return -1;
+			}
+			tmpdev->used = 1;
+		}
+	loop:
+		/* Collect domain information from members only */
+		if (tmpdev && tmpdev->used == 1) {
+			if (!pol)
+				pol = devid_policy(rdev);
+			domain_merge(&domains, pol, tst?tst->ss->name:NULL);
+		}
+		dev_policy_free(pol);
+		pol = NULL;
+		if (tst)
+			tst->ss->free_super(tst);
+	}
+
+	/* Check if we found some imsm spares but no members */
+	if ((auto_assem ||
+	     (ident->uuid_set &&
+	      memcmp(uuid_zero, ident->uuid,sizeof(uuid_zero)) == 0)) &&
+	    (!st || !st->sb))
+		for (tmpdev = devlist; tmpdev; tmpdev = tmpdev->next) {
+			if (tmpdev->used != 3)
+				continue;
+			tmpdev->used = 1;
+			content = *contentp;
+
+			if (!st->sb) {
+				/* we need sb from one of the spares */
+				int dfd = dev_open(tmpdev->devname, O_RDONLY);
+				if (dfd < 0 ||
+				    st->ss->load_super(st, dfd, NULL))
+					tmpdev->used = 2;
+				close_fd(&dfd);
+			}
+		}
+
+	/* Now reject spares that don't match domains of identified members */
+	for (tmpdev = devlist; tmpdev; tmpdev = tmpdev->next) {
+		if (tmpdev->used != 3)
+			continue;
+		if (!stat_is_blkdev(tmpdev->devname, &rdev)) {
+			tmpdev->used = 2;
+		} else {
+			struct dev_policy *pol = devid_policy(rdev);
+			int dt = domain_test(domains, pol, NULL);
+			if (inargv && dt != 0)
+				/* take this spare as domains match
+				 * if there are any */
+				tmpdev->used = 1;
+			else if (!inargv && dt == 1)
+				/* device wasn't explicitly listed, so need
+				 * explicit domain match - which we have */
+				tmpdev->used = 1;
+			else
+				/* if domains don't match mark as unused */
+				tmpdev->used = 0;
+			dev_policy_free(pol);
+		}
+	}
+	domain_free(domains);
+	*stp = st;
+	/* Only refresh the info when 'content' is still the caller's own
+	 * storage; for a container it points at a container_content() entry.
+	 */
+	if (st && st->sb && content == *contentp)
+		st->ss->getinfo_super(st, content, NULL);
+	*contentp = content;
+
+	return num_devs;
+}
+
+/* Per-device record kept while the members of an array are assembled. */
+struct devs {
+	char *devname;
+
+	/* set once we decide that this device is as recent as
+	 * everything else in the array.
+	 */
+	int uptodate;
+
+	/* set if the device is already in the array due to a
+	 * previous '-I'
+	 */
+	int included;
+
+	struct mdinfo i;
+};
+
+/*
+ * load_devices() - re-read the metadata of every selected device.
+ *
+ * For each entry of @devlist with ->used == 1 the superblock is loaded
+ * again (applying @c->update first when one was requested), the result is
+ * recorded in @devices[] and the device is entered into the best[] table.
+ *
+ * best[] is indexed by slot: raid_disk * 2, plus one for a replacement;
+ * devices without a role and journal devices are placed from
+ * raid_disks * 2 upwards.  Each entry holds an index into @devices[] or -1.
+ *
+ * @stp may be switched to the supertype of the most recent in-sync device
+ * seen; the previous one is freed.  @most_recentp is only written when such
+ * a device was found.
+ *
+ * Returns the number of devices recorded, or -1 on error.  On error @mdfd
+ * has been closed and @devices and @devmap have been freed.  @stp, @bestp
+ * and @bestcntp are updated on every return path so the caller never keeps
+ * a pointer that was freed here.
+ */
+static int load_devices(struct devs *devices, char *devmap,
+			struct mddev_ident *ident, struct supertype **stp,
+			struct mddev_dev *devlist, struct context *c,
+			struct mdinfo *content,
+			int mdfd, char *mddev,
+			int *most_recentp, int *bestcntp, int **bestp,
+			int inargv)
+{
+	struct mddev_dev *tmpdev;
+	int devcnt = 0;
+	int nextspare = 0;
+	int bitmap_done = 0;
+	int most_recent = -1;
+	int bestcnt = 0;
+	int *best = *bestp;
+	struct supertype *st = *stp;
+	/* Per-device resources; kept at function scope so that the single
+	 * error exit below can release whatever is still held.
+	 */
+	struct supertype *tst = NULL;
+	int dfd = -1;
+
+	for (tmpdev = devlist; tmpdev; tmpdev=tmpdev->next) {
+		char *devname = tmpdev->devname;
+		struct stat stb;
+		int i;
+		int disk_state;
+
+		if (tmpdev->used != 1)
+			continue;
+		/* looks like a good enough match to update the super block if needed */
+		if (c->update) {
+			/* prepare useful information in info structures */
+			struct stat stb2;
+			int err;
+			fstat(mdfd, &stb2);
+
+			if (strcmp(c->update, "uuid") == 0 && !ident->uuid_set)
+				random_uuid((__u8 *)ident->uuid);
+
+			if (strcmp(c->update, "ppl") == 0 &&
+			    ident->bitmap_fd >= 0) {
+				pr_err("PPL is not compatible with bitmap\n");
+				goto fail;
+			}
+
+			dfd = dev_open(devname,
+				       tmpdev->disposition == 'I'
+				       ? O_RDWR : (O_RDWR|O_EXCL));
+
+			tst = dup_super(st);
+			if (dfd < 0 || tst->ss->load_super(tst, dfd, NULL) != 0) {
+				pr_err("cannot re-read metadata from %s - aborting\n",
+				       devname);
+				goto fail;
+			}
+			tst->ss->getinfo_super(tst, content, devmap + devcnt * content->array.raid_disks);
+
+			memcpy(content->uuid, ident->uuid, 16);
+			strcpy(content->name, ident->name);
+			content->array.md_minor = minor(stb2.st_rdev);
+
+			if (strcmp(c->update, "byteorder") == 0)
+				err = 0;
+			else if (strcmp(c->update, "home-cluster") == 0) {
+				tst->cluster_name = c->homecluster;
+				err = tst->ss->write_bitmap(tst, dfd, NameUpdate);
+			} else if (strcmp(c->update, "nodes") == 0) {
+				tst->nodes = c->nodes;
+				err = tst->ss->write_bitmap(tst, dfd, NodeNumUpdate);
+			} else if (strcmp(c->update, "revert-reshape") == 0 &&
+				   c->invalid_backup)
+				err = tst->ss->update_super(tst, content,
+							    "revert-reshape-nobackup",
+							    devname, c->verbose,
+							    ident->uuid_set,
+							    c->homehost);
+			else
+				err = tst->ss->update_super(tst, content, c->update,
+							    devname, c->verbose,
+							    ident->uuid_set,
+							    c->homehost);
+			if (err < 0) {
+				if (err == -1)
+					pr_err("--update=%s not understood for %s metadata\n",
+					       c->update, tst->ss->name);
+				goto fail;
+			}
+			if (strcmp(c->update, "uuid")==0 &&
+			    !ident->uuid_set) {
+				/* remember the new uuid so that the other
+				 * devices are given the same one
+				 */
+				ident->uuid_set = 1;
+				memcpy(ident->uuid, content->uuid, 16);
+			}
+			if (tst->ss->store_super(tst, dfd))
+				pr_err("Could not re-write superblock on %s.\n",
+				       devname);
+
+			if (strcmp(c->update, "uuid")==0 &&
+			    ident->bitmap_fd >= 0 && !bitmap_done) {
+				if (bitmap_update_uuid(ident->bitmap_fd,
+						       content->uuid,
+						       tst->ss->swapuuid) != 0)
+					pr_err("Could not update uuid on external bitmap.\n");
+				else
+					bitmap_done = 1;
+			}
+		} else {
+			dfd = dev_open(devname,
+				       tmpdev->disposition == 'I'
+				       ? O_RDWR : (O_RDWR|O_EXCL));
+			tst = dup_super(st);
+
+			if (dfd < 0 || tst->ss->load_super(tst, dfd, NULL) != 0) {
+				pr_err("cannot re-read metadata from %s - aborting\n",
+				       devname);
+				goto fail;
+			}
+			tst->ss->getinfo_super(tst, content, devmap + devcnt * content->array.raid_disks);
+		}
+
+		fstat(dfd, &stb);
+		close(dfd);
+		dfd = -1;
+
+		if (c->verbose > 0)
+			pr_err("%s is identified as a member of %s, slot %d%s.\n",
+			       devname, mddev, content->disk.raid_disk,
+			       (content->disk.state & (1<<MD_DISK_REPLACEMENT)) ? " replacement":"");
+		devices[devcnt].devname = devname;
+		devices[devcnt].uptodate = 0;
+		devices[devcnt].included = (tmpdev->disposition == 'I');
+		devices[devcnt].i = *content;
+		devices[devcnt].i.disk.major = major(stb.st_rdev);
+		devices[devcnt].i.disk.minor = minor(stb.st_rdev);
+
+		disk_state = devices[devcnt].i.disk.state & ~((1<<MD_DISK_FAILFAST) |
+							      (1<<MD_DISK_WRITEMOSTLY));
+		if (disk_state == ((1<<MD_DISK_ACTIVE) | (1<<MD_DISK_SYNC))) {
+			if (most_recent < 0 ||
+			    devices[devcnt].i.events
+			    > devices[most_recent].i.events) {
+				/* keep this superblock as 'st'; the old one
+				 * ends up in 'tst' and is freed just below
+				 */
+				struct supertype *tmp = tst;
+				tst = st;
+				st = tmp;
+				most_recent = devcnt;
+			}
+		}
+		tst->ss->free_super(tst);
+		free(tst);
+		tst = NULL;
+
+		if (content->array.level == LEVEL_MULTIPATH)
+			/* with multipath, the raid_disk from the superblock is meaningless */
+			i = devcnt;
+		else
+			i = devices[devcnt].i.disk.raid_disk;
+		if (i+1 == 0 || i == MD_DISK_ROLE_JOURNAL) {
+			if (nextspare < content->array.raid_disks*2)
+				nextspare = content->array.raid_disks*2;
+			i = nextspare++;
+		} else {
+			/* i is raid_disk - double it so there is room for
+			 * replacements */
+			i *= 2;
+			if (devices[devcnt].i.disk.state & (1<<MD_DISK_REPLACEMENT))
+				i++;
+			if (i >= content->array.raid_disks*2 &&
+			    i >= nextspare)
+				nextspare = i+1;
+		}
+		if (i < 10000) {
+			if (i >= bestcnt) {
+				/* grow best[], filling the new slots with -1 */
+				int newbestcnt = i+10;
+				int *newbest = xmalloc(sizeof(int)*newbestcnt);
+				int k;
+				for (k=0; k < newbestcnt; k++)
+					if (k < bestcnt)
+						newbest[k] = best[k];
+					else
+						newbest[k] = -1;
+				free(best);
+				best = newbest;
+				bestcnt = newbestcnt;
+			}
+			if (best[i] >=0 &&
+			    devices[best[i]].i.events ==
+			    devices[devcnt].i.events &&
+			    (devices[best[i]].i.disk.minor !=
+			     devices[devcnt].i.disk.minor) &&
+			    st->ss == &super0 &&
+			    content->array.level != LEVEL_MULTIPATH) {
+				/* two different devices with identical superblock.
+				 * Could be a mis-detection caused by overlapping
+				 * partitions.  fail-safe.
+				 */
+				pr_err("WARNING %s and %s appear to have very similar superblocks.\n"
+				       " If they are really different, please --zero the superblock on one\n"
+				       " If they are the same or overlap, please remove one from %s.\n",
+				       devices[best[i]].devname, devname,
+				       inargv ? "the list" :
+				       "the\n DEVICE list in mdadm.conf"
+					);
+				goto fail;
+			}
+			if (best[i] == -1 || (devices[best[i]].i.events
+					      < devices[devcnt].i.events))
+				best[i] = devcnt;
+			else if (st->ss == &super_imsm)
+				best[i+1] = devcnt;
+		}
+		devcnt++;
+	}
+	if (most_recent >= 0)
+		*most_recentp = most_recent;
+	*bestcntp = bestcnt;
+	*bestp = best;
+	*stp = st;
+	return devcnt;
+
+ fail:
+	/* Common error exit: drop the per-device resources still held,
+	 * release what the caller handed over, and publish the current
+	 * st/best so the caller is not left with freed pointers.
+	 */
+	if (tst) {
+		tst->ss->free_super(tst);
+		free(tst);
+	}
+	if (dfd >= 0)
+		close(dfd);
+	close(mdfd);
+	free(devices);
+	free(devmap);
+	*bestcntp = bestcnt;
+	*bestp = best;
+	*stp = st;
+	return -1;
+}
+
+/*
+ * force_array() - bring out-of-date devices up to date until the array
+ * has enough members.
+ *
+ * While enough() reports too few available devices, the not-up-to-date
+ * device with the highest event count is chosen from the even (non
+ * replacement) slots of @best[], its superblock is rewritten through
+ * update_super("force-one") with the event count of
+ * @devices[@most_recent], and it is marked up to date and available.
+ * Other devices with the same original event count are then added too.
+ *
+ * A device whose superblock cannot be opened, re-read or re-written gets
+ * its event count set to 0 and the selection is retried.
+ *
+ * Returns the number of devices that were forced.
+ */
+static int force_array(struct mdinfo *content,
+		       struct devs *devices,
+		       int *best, int bestcnt, char *avail,
+		       int most_recent,
+		       struct supertype *st,
+		       struct context *c)
+{
+	int okcnt = 0;
+	while (!enough(content->array.level, content->array.raid_disks,
+		       content->array.layout, 1,
+		       avail) ||
+	       (content->reshape_active && content->delta_disks > 0 &&
+		!enough(content->array.level, (content->array.raid_disks
+					       - content->delta_disks),
+			content->new_layout, 1, avail))) {
+		/* Choose the newest best drive which is
+		 * not up-to-date, update the superblock
+		 * and add it.
+		 */
+		int fd;
+		struct supertype *tst;
+		unsigned long long current_events;
+		int chosen_drive = -1;
+		int i;
+
+		for (i = 0;
+		     i < content->array.raid_disks * 2 && i < bestcnt;
+		     i += 2) {
+			int j = best[i];
+			if (j < 0)
+				continue;
+			if (devices[j].uptodate)
+				continue;
+			if (devices[j].i.recovery_start != MaxSector) {
+				int delta;
+				if (!devices[j].i.reshape_active ||
+				    devices[j].i.delta_disks <= 0)
+					continue;
+				/* When increasing number of devices, an
+				 * added device also appears to be
+				 * recovering.  It is safe to include it
+				 * as long as it won't be a source of
+				 * data.
+				 * For now, just allow for last data
+				 * devices in RAID4 or last devices in RAID4/5/6.
+				 */
+				delta = devices[j].i.delta_disks;
+				if (devices[j].i.array.level >= 4 &&
+				    devices[j].i.array.level <= 6 &&
+				    i/2 >= content->array.raid_disks - delta)
+					/* OK */;
+				else if (devices[j].i.array.level == 4 &&
+					 i/2 >= content->array.raid_disks - delta - 1)
+					/* OK */;
+				else
+					continue;
+			} else if (devices[j].i.reshape_active !=
+				   content->reshape_active ||
+				   (devices[j].i.reshape_active &&
+				    devices[j].i.reshape_progress !=
+				    content->reshape_progress))
+				/* Here, it may be a source of data. If two
+				 * devices claim different progresses, it
+				 * means that reshape boundaries differ for
+				 * their own devices. Kernel will only treat
+				 * the first one as reshape progress and
+				 * go on. It may cause disaster, so avoid it.
+				 */
+				continue;
+			if (chosen_drive < 0 ||
+			    devices[j].i.events
+			    > devices[chosen_drive].i.events)
+				chosen_drive = j;
+		}
+		if (chosen_drive < 0)
+			break;
+		current_events = devices[chosen_drive].i.events;
+	add_another:
+		if (c->verbose >= 0)
+			pr_err("forcing event count in %s(%d) from %d up to %d\n",
+			       devices[chosen_drive].devname,
+			       devices[chosen_drive].i.disk.raid_disk,
+			       (int)(devices[chosen_drive].i.events),
+			       (int)(devices[most_recent].i.events));
+		fd = dev_open(devices[chosen_drive].devname,
+			      devices[chosen_drive].included ? O_RDWR
+			      : (O_RDWR|O_EXCL));
+		if (fd < 0) {
+			pr_err("Couldn't open %s for write - not updating\n",
+			       devices[chosen_drive].devname);
+			devices[chosen_drive].i.events = 0;
+			continue;
+		}
+		tst = dup_super(st);
+		if (tst->ss->load_super(tst,fd, NULL)) {
+			close(fd);
+			pr_err("RAID superblock disappeared from %s - not updating.\n",
+			       devices[chosen_drive].devname);
+			devices[chosen_drive].i.events = 0;
+			/* release the duplicate; it was leaked here before */
+			tst->ss->free_super(tst);
+			free(tst);
+			continue;
+		}
+		content->events = devices[most_recent].i.events;
+		tst->ss->update_super(tst, content, "force-one",
+				      devices[chosen_drive].devname, c->verbose,
+				      0, NULL);
+
+		if (tst->ss->store_super(tst, fd)) {
+			close(fd);
+			pr_err("Could not re-write superblock on %s\n",
+			       devices[chosen_drive].devname);
+			devices[chosen_drive].i.events = 0;
+			tst->ss->free_super(tst);
+			free(tst);
+			continue;
+		}
+		close(fd);
+		devices[chosen_drive].i.events = devices[most_recent].i.events;
+		devices[chosen_drive].uptodate = 1;
+		avail[chosen_drive] = 1;
+		okcnt++;
+		/* free_super() only drops the loaded metadata; the
+		 * supertype from dup_super() must be freed as well.
+		 */
+		tst->ss->free_super(tst);
+		free(tst);
+		/* If there are any other drives of the same vintage,
+		 * add them in as well.  We can't lose and we might gain
+		 */
+		for (i = 0;
+		     i < content->array.raid_disks * 2 && i < bestcnt ;
+		     i += 2) {
+			int j = best[i];
+			if (j >= 0 &&
+			    !devices[j].uptodate &&
+			    devices[j].i.recovery_start == MaxSector &&
+			    devices[j].i.events == current_events &&
+			    ((!devices[j].i.reshape_active &&
+			      !content->reshape_active) ||
+			     (devices[j].i.reshape_active ==
+			      content->reshape_active &&
+			      devices[j].i.reshape_progress ==
+			      content->reshape_progress))) {
+				chosen_drive = j;
+				goto add_another;
+			}
+		}
+	}
+	return okcnt;
+}
+
+/*
+ * start_array() - hand the collected devices to the kernel and, when
+ * allowed, start the array.
+ *
+ * Sets the array info, attaches a bitmap file if @ident names one, adds
+ * every device listed in @best[] (the @chosen_drive last) and then either
+ * finishes a container or decides from @c->runstop, enough() and the
+ * device counts whether the array is started.
+ *
+ * Returns:
+ *   0 - array started, or container assembled
+ *   1 - failure, or not enough devices to start the array (for a container:
+ *       @err_ok was set and it is still incomplete)
+ *   2 - assembled but deliberately not started (@c->runstop == -1)
+ */
+static int start_array(int mdfd,
+		       char *mddev,
+		       struct mdinfo *content,
+		       struct supertype *st,
+		       struct mddev_ident *ident,
+		       int *best, int bestcnt,
+		       int chosen_drive,
+		       struct devs *devices,
+		       unsigned int okcnt,
+		       unsigned int sparecnt,
+		       unsigned int rebuilding_cnt,
+		       unsigned int journalcnt,
+		       struct context *c,
+		       int clean, char *avail,
+		       int start_partial_ok,
+		       int err_ok,
+		       int was_forced
+	)
+{
+	int rv;
+	int i;
+	unsigned int req_cnt;
+
+	if (content->journal_device_required && (content->journal_clean == 0)) {
+		if (!c->force) {
+			pr_err("Not safe to assemble with missing or stale journal device, consider --force.\n");
+			return 1;
+		}
+		pr_err("Journal is missing or stale, starting array read only.\n");
+		c->readonly = 1;
+	}
+
+	if (content->consistency_policy == CONSISTENCY_POLICY_PPL)
+		clean = 1;
+
+	rv = set_array_info(mdfd, st, content);
+	if (rv && !err_ok) {
+		pr_err("failed to set array info for %s: %s\n",
+		       mddev, strerror(errno));
+		return 1;
+	}
+	if (ident->bitmap_fd >= 0) {
+		if (ioctl(mdfd, SET_BITMAP_FILE, ident->bitmap_fd) != 0) {
+			pr_err("SET_BITMAP_FILE failed.\n");
+			return 1;
+		}
+	} else if (ident->bitmap_file) {
+		/* From config file */
+		int bmfd = open(ident->bitmap_file, O_RDWR);
+		if (bmfd < 0) {
+			pr_err("Could not open bitmap file %s\n",
+			       ident->bitmap_file);
+			return 1;
+		}
+		if (ioctl(mdfd, SET_BITMAP_FILE, bmfd) != 0) {
+			pr_err("Failed to set bitmapfile for %s\n", mddev);
+			close(bmfd);
+			return 1;
+		}
+		close(bmfd);
+	}
+
+	/* First, add the raid disks, but add the chosen one last */
+	for (i = 0; i <= bestcnt; i++) {
+		int j;
+		if (i < bestcnt) {
+			j = best[i];
+			if (j == chosen_drive)
+				continue;
+		} else
+			j = chosen_drive;
+
+		if (j >= 0 && !devices[j].included) {
+			int dfd;
+
+			dfd = dev_open(devices[j].devname, O_RDWR|O_EXCL);
+			if (dfd >= 0) {
+				remove_partitions(dfd);
+				close(dfd);
+			}
+			rv = add_disk(mdfd, st, content, &devices[j].i);
+
+			if (rv) {
+				/* capture errno before printing: the message
+				 * output may change it before it is tested
+				 */
+				int add_errno = errno;
+
+				pr_err("failed to add %s to %s: %s\n",
+				       devices[j].devname, mddev,
+				       strerror(add_errno));
+				if (add_errno == EINVAL && content->array.level == 0 &&
+				    content->array.layout != 0) {
+					cont_err("Possibly your kernel doesn't support RAID0 layouts.\n");
+					cont_err("Please upgrade.\n");
+				}
+				if (i < content->array.raid_disks * 2 ||
+				    i == bestcnt)
+					okcnt--;
+				else
+					sparecnt--;
+			} else if (c->verbose > 0) {
+				pr_err("added %s to %s as %d%s%s\n",
+				       devices[j].devname, mddev,
+				       devices[j].i.disk.raid_disk,
+				       devices[j].uptodate?"":
+				       " (possibly out of date)",
+				       (devices[j].i.disk.state &
+					(1<<MD_DISK_REPLACEMENT)) ?
+				       " replacement":"");
+			}
+		} else if (j >= 0) {
+			if (c->verbose > 0)
+				pr_err("%s is already in %s as %d\n",
+				       devices[j].devname, mddev,
+				       devices[j].i.disk.raid_disk);
+		} else if (c->verbose > 0 &&
+			   i < content->array.raid_disks * 2 && (i & 1) == 0)
+			pr_err("no uptodate device for slot %d of %s\n",
+			       i/2, mddev);
+	}
+
+	if (content->array.level == LEVEL_CONTAINER) {
+		sysfs_rules_apply(mddev, content);
+		if (c->verbose >= 0) {
+			pr_err("Container %s has been assembled with %d drive%s",
+			       mddev, okcnt + sparecnt + journalcnt,
+			       okcnt + sparecnt + journalcnt == 1 ? "" : "s");
+			if (okcnt < (unsigned)content->array.raid_disks)
+				fprintf(stderr, " (out of %d)\n",
+					content->array.raid_disks);
+			else
+				fprintf(stderr, "\n");
+		}
+
+		if (st->ss->validate_container) {
+			struct mdinfo *devices_list;
+			struct mdinfo *info_devices;
+			unsigned int count;
+
+			/* build a temporary linked list of the devices'
+			 * mdinfo for validate_container()
+			 */
+			devices_list = NULL;
+			info_devices = xmalloc(sizeof(struct mdinfo) *
+					       (okcnt + sparecnt));
+			for (count = 0; count < okcnt + sparecnt; count++) {
+				info_devices[count] = devices[count].i;
+				info_devices[count].next = devices_list;
+				devices_list = &info_devices[count];
+			}
+			if (st->ss->validate_container(devices_list))
+				pr_err("Mismatch detected!\n");
+			free(info_devices);
+		}
+
+		st->ss->free_super(st);
+		sysfs_uevent(content, "change");
+		if (err_ok && okcnt < (unsigned)content->array.raid_disks)
+			/* Was partial, is still partial, so signal an error
+			 * to ensure we don't retry */
+			return 1;
+		return 0;
+	}
+
+	/* Get number of in-sync devices according to the superblock.
+	 * We must have this number to start the array without -s or -R
+	 */
+	req_cnt = content->array.working_disks;
+
+	if (c->runstop == 1 ||
+	    (c->runstop <= 0 &&
+	     (enough(content->array.level, content->array.raid_disks,
+		     content->array.layout, clean, avail) &&
+	      (okcnt + rebuilding_cnt >= req_cnt || start_partial_ok)))) {
+		/* This array is good-to-go.
+		 * If a reshape is in progress then we might need to
+		 * continue monitoring it.  In that case we start
+		 * it read-only and let the grow code make it writable.
+		 */
+		int run_errno;
+
+		if (content->reshape_active &&
+		    !(content->reshape_active & RESHAPE_NO_BACKUP) &&
+		    content->delta_disks <= 0) {
+			if (!c->backup_file) {
+				pr_err("%s: Need a backup file to complete reshape of this array.\n",
+				       mddev);
+				pr_err("Please provided one with \"--backup-file=...\"\n");
+				if (c->update &&
+				    strcmp(c->update, "revert-reshape") == 0)
+					pr_err("(Don't specify --update=revert-reshape again, that part succeeded.)\n");
+				return 1;
+			}
+			rv = sysfs_set_str(content, NULL,
+					   "array_state", "readonly");
+			if (rv == 0)
+				rv = Grow_continue(mdfd, st, content,
+						   c->backup_file, 0,
+						   c->freeze_reshape);
+		} else if (c->readonly &&
+			   sysfs_attribute_available(content, NULL,
+						     "array_state")) {
+			rv = sysfs_set_str(content, NULL,
+					   "array_state", "readonly");
+		} else
+			rv = ioctl(mdfd, RUN_ARRAY, NULL);
+		/* Remember why the start failed before anything else runs:
+		 * reopen_mddev() and the message output below may change errno.
+		 */
+		run_errno = errno;
+		reopen_mddev(mdfd); /* drop O_EXCL */
+		if (rv == 0) {
+			sysfs_rules_apply(mddev, content);
+			if (c->verbose >= 0) {
+				pr_err("%s has been started with %d drive%s",
+				       mddev, okcnt, okcnt==1?"":"s");
+				if (okcnt < (unsigned)content->array.raid_disks)
+					fprintf(stderr, " (out of %d)",
+						content->array.raid_disks);
+				if (rebuilding_cnt)
+					fprintf(stderr, "%s %d rebuilding",
+						sparecnt?",":" and",
+						rebuilding_cnt);
+				if (sparecnt)
+					fprintf(stderr, " and %d spare%s",
+						sparecnt,
+						sparecnt == 1 ? "" : "s");
+				if (content->journal_clean)
+					fprintf(stderr, " and %d journal",
+						journalcnt);
+				fprintf(stderr, ".\n");
+			}
+			if (content->reshape_active &&
+			    content->array.level >= 4 &&
+			    content->array.level <= 6) {
+				/* might need to increase the size
+				 * of the stripe cache - default is 256
+				 */
+				int chunk_size = content->array.chunk_size;
+
+				if (content->reshape_active &&
+				    content->new_chunk > chunk_size)
+					chunk_size = content->new_chunk;
+				/* NOTE(review): 4065 looks like it was meant
+				 * to be 4095 (round up to 4096) - confirm
+				 * before changing.
+				 */
+				if (256 < 4 * ((chunk_size+4065)/4096)) {
+					struct mdinfo *sra;
+
+					sra = sysfs_read(mdfd, NULL, 0);
+					if (sra)
+						sysfs_set_num(sra, NULL,
+							      "stripe_cache_size",
+							      (4 * chunk_size / 4096) + 1);
+					sysfs_free(sra);
+				}
+			}
+			if (okcnt < (unsigned)content->array.raid_disks) {
+				/* If any devices did not get added
+				 * because the kernel rejected them based
+				 * on event count, try adding them
+				 * again providing the action policy is
+				 * 're-add' or greater.  The bitmap
+				 * might allow them to be included, or
+				 * they will become spares.
+				 */
+				for (i = 0; i < bestcnt; i++) {
+					int j = best[i];
+					if (j >= 0 && !devices[j].uptodate) {
+						if (!disk_action_allows(&devices[j].i, st->ss->name, act_re_add))
+							continue;
+						rv = add_disk(mdfd, st, content,
+							      &devices[j].i);
+						if (rv == 0 && c->verbose >= 0)
+							pr_err("%s has been re-added.\n",
+							       devices[j].devname);
+					}
+				}
+			}
+			if (content->array.level == 6 &&
+			    okcnt + 1 == (unsigned)content->array.raid_disks &&
+			    was_forced) {
+				struct mdinfo *sra;
+
+				sra = sysfs_read(mdfd, NULL, 0);
+				if (sra)
+					sysfs_set_str(sra, NULL,
+						      "sync_action", "repair");
+				sysfs_free(sra);
+			}
+			return 0;
+		}
+		pr_err("failed to RUN_ARRAY %s: %s\n", mddev, strerror(run_errno));
+		if (run_errno == 524 /* ENOTSUP */ &&
+		    content->array.level == 0 && content->array.layout == 0)
+			cont_err("Please use --update=layout-original or --update=layout-alternate\n");
+
+		if (!enough(content->array.level, content->array.raid_disks,
+			    content->array.layout, 1, avail))
+			pr_err("Not enough devices to start the array.\n");
+		else if (!enough(content->array.level,
+				 content->array.raid_disks,
+				 content->array.layout, clean, avail))
+			pr_err("Not enough devices to start the array while not clean - consider --force.\n");
+
+		return 1;
+	}
+	if (c->runstop == -1) {
+		pr_err("%s assembled from %d drive%s",
+		       mddev, okcnt, okcnt == 1 ? "" : "s");
+		if (okcnt != (unsigned)content->array.raid_disks)
+			fprintf(stderr, " (out of %d)",
+				content->array.raid_disks);
+		fprintf(stderr, ", but not started.\n");
+		return 2;
+	}
+	if (c->verbose >= -1) {
+		pr_err("%s assembled from %d drive%s",
+		       mddev, okcnt, okcnt == 1 ? "" : "s");
+		if (rebuilding_cnt)
+			fprintf(stderr, "%s %d rebuilding",
+				sparecnt ? "," : " and", rebuilding_cnt);
+		if (sparecnt)
+			fprintf(stderr, " and %d spare%s", sparecnt,
+				sparecnt == 1 ? "" : "s");
+		if (!enough(content->array.level, content->array.raid_disks,
+			    content->array.layout, 1, avail))
+			fprintf(stderr, " - not enough to start the array.\n");
+		else if (!enough(content->array.level,
+				 content->array.raid_disks,
+				 content->array.layout, clean, avail))
+			fprintf(stderr, " - not enough to start the array while not clean - consider --force.\n");
+		else {
+			if (req_cnt == (unsigned)content->array.raid_disks)
+				fprintf(stderr, " - need all %d to start it",
+					req_cnt);
+			else
+				fprintf(stderr, " - need %d to start", req_cnt);
+			fprintf(stderr, " (use --run to insist).\n");
+		}
+	}
+	return 1;
+}
+
+int Assemble(struct supertype *st, char *mddev,
+ struct mddev_ident *ident,
+ struct mddev_dev *devlist,
+ struct context *c)
+{
+ /*
+ * The task of Assemble is to find a collection of
+ * devices that should (according to their superblocks)
+ * form an array, and to give this collection to the MD driver.
+ * In Linux-2.4 and later, this involves submitting a
+ * SET_ARRAY_INFO ioctl with no arg - to prepare
+ * the array - and then submit a number of
+ * ADD_NEW_DISK ioctls to add disks into
+ * the array. Finally RUN_ARRAY might
+ * be submitted to start the array.
+ *
+ * Much of the work of Assemble is in finding and/or
+ * checking the disks to make sure they look right.
+ *
+ * If mddev is not set, then scan must be set and we
+ * read through the config file for dev+uuid mapping
+ * We recurse, setting mddev, for each device that
+ * - isn't running
+ * - has a valid uuid (or any uuid if !uuidset)
+ *
+ * If mddev is set, we try to determine state of md.
+ * check version - must be at least 0.90.0
+ * check kernel version. must be at least 2.4.
+ * If not, we can possibly fall back on START_ARRAY
+ * Try to GET_ARRAY_INFO.
+ * If possible, give up
+ * If not, try to STOP_ARRAY just to make sure
+ *
+ * If !uuidset and scan, look in conf-file for uuid
+ * If not found, give up
+ * If !devlist and scan and uuidset, get list of devs from conf-file
+ *
+ * For each device:
+ * Check superblock - discard if bad
+ * Check uuid (set if we don't have one) - discard if no match
+ * Check superblock similarity if we have a superblock - discard if different
+ * Record events, devicenum
+ * This should give us a list of devices for the array
+ * We should collect the most recent event number
+ *
+ * Count disks with recent enough event count
+ * While force && !enough disks
+ * Choose newest rejected disks, update event count
+ * mark clean and rewrite superblock
+ * If recent kernel:
+ * SET_ARRAY_INFO
+ * foreach device with recent events : ADD_NEW_DISK
+ * if runstop == 1 || "enough" disks and runstop==0 -> RUN_ARRAY
+ * If old kernel:
+ * Check the device numbers in superblock are right
+ * update superblock if any changes
+ * START_ARRAY
+ *
+ */
+ int rv = -1;
+ int mdfd = -1;
+ int clean;
+ int auto_assem = (mddev == NULL && !ident->uuid_set &&
+ ident->super_minor == UnSet && ident->name[0] == 0 &&
+ (ident->container == NULL || ident->member == NULL));
+ struct devs *devices = NULL;
+ char *devmap;
+ int *best = NULL; /* indexed by raid_disk */
+ int bestcnt = 0;
+ int devcnt;
+ unsigned int okcnt, sparecnt, rebuilding_cnt, replcnt, journalcnt;
+ int journal_clean = 0;
+ int i;
+ int was_forced = 0;
+ int most_recent = 0;
+ int chosen_drive;
+ int change = 0;
+ int inargv = 0;
+ int start_partial_ok = (c->runstop >= 0) &&
+ (c->force || devlist==NULL || auto_assem);
+ int num_devs;
+ struct mddev_dev *tmpdev;
+ struct mdinfo info;
+ struct mdinfo *content = NULL;
+ struct mdinfo *pre_exist = NULL;
+ char *avail;
+ char *name = NULL;
+ char chosen_name[1024];
+ struct map_ent *map = NULL;
+ struct map_ent *mp;
+
+ /*
+ * If any subdevs are listed, then any that don't
+ * match ident are discarded. Remainder must all match and
+ * become the array.
+ * If no subdevs, then we scan all devices in the config file, but
+ * there must be something in the identity
+ */
+
+ if (!devlist &&
+ ident->uuid_set == 0 &&
+ (ident->super_minor < 0 || ident->super_minor == UnSet) &&
+ ident->name[0] == 0 &&
+ (ident->container == NULL || ident->member == NULL) &&
+ ident->devices == NULL) {
+ pr_err("No identity information available for %s - cannot assemble.\n",
+ mddev ? mddev : "further assembly");
+ return 1;
+ }
+
+ if (devlist == NULL)
+ devlist = conf_get_devs();
+ else if (mddev)
+ inargv = 1;
+
+try_again:
+ /* We come back here when doing auto-assembly and attempting some
+ * set of devices failed. Those are now marked as ->used==2 and
+ * we ignore them and try again
+ */
+ if (!st && ident->st)
+ st = ident->st;
+ if (c->verbose>0)
+ pr_err("looking for devices for %s\n",
+ mddev ? mddev : "further assembly");
+
+ content = &info;
+ if (st && c->force)
+ st->ignore_hw_compat = 1;
+ num_devs = select_devices(devlist, ident, &st, &content, c,
+ inargv, auto_assem);
+ if (num_devs < 0)
+ return 1;
+
+ if (!st || !st->sb || !content)
+ return 2;
+
+ /* We have a full set of devices - we now need to find the
+ * array device.
+ * However there is a risk that we are racing with "mdadm -I"
+ * and the array is already partially assembled - we will have
+ * rejected any devices already in this address.
+ * So we take a lock on the map file - to prevent further races -
+ * and look for the uuid in there. If found and the array is
+ * active, we abort. If found and the array is not active
+ * we commit to that md device and add all the contained devices
+ * to our list. We flag them so that we don't try to re-add,
+ * but can remove if they turn out to not be wanted.
+ */
+ if (map_lock(&map))
+ pr_err("failed to get exclusive lock on mapfile - continue anyway...\n");
+ if (c->update && strcmp(c->update,"uuid") == 0)
+ mp = NULL;
+ else
+ mp = map_by_uuid(&map, content->uuid);
+ if (mp) {
+ struct mdinfo *dv;
+ /* array already exists. */
+ pre_exist = sysfs_read(-1, mp->devnm, GET_LEVEL|GET_DEVS);
+ if (pre_exist->array.level != UnSet) {
+ pr_err("Found some drive for an array that is already active: %s\n",
+ mp->path);
+ pr_err("giving up.\n");
+ goto out;
+ }
+ for (dv = pre_exist->devs; dv; dv = dv->next) {
+ /* We want to add this device to our list,
+ * but it could already be there if "mdadm -I"
+ * started *after* we checked for O_EXCL.
+ * If we add it to the top of the list
+ * it will be preferred over later copies.
+ */
+ struct mddev_dev *newdev;
+ char *devname = map_dev(dv->disk.major,
+ dv->disk.minor,
+ 0);
+ if (!devname)
+ continue;
+ newdev = xmalloc(sizeof(*newdev));
+ newdev->devname = devname;
+ newdev->disposition = 'I';
+ newdev->used = 1;
+ newdev->next = devlist;
+ devlist = newdev;
+ num_devs++;
+ }
+ strcpy(chosen_name, mp->path);
+ if (c->verbose > 0 || mddev == NULL ||
+ strcmp(mddev, chosen_name) != 0)
+ pr_err("Merging with already-assembled %s\n",
+ chosen_name);
+ mdfd = open_dev_excl(mp->devnm);
+ } else {
+ int trustworthy = FOREIGN;
+ name = content->name;
+ switch (st->ss->match_home(st, c->homehost)
+ ?: st->ss->match_home(st, "any")) {
+ case 1:
+ trustworthy = LOCAL;
+ name = strchr(content->name, ':');
+ if (name)
+ name++;
+ else
+ name = content->name;
+ break;
+ }
+ if (mddev && map_by_name(&map, mddev) != NULL) {
+ pr_err("Cannot create device with %s because is in use\n", mddev);
+ goto out;
+ }
+ if (!auto_assem)
+ /* If the array is listed in mdadm.conf or on
+ * command line, then we trust the name
+ * even if the array doesn't look local
+ */
+ trustworthy = LOCAL;
+
+ if (name[0] == 0 &&
+ content->array.level == LEVEL_CONTAINER) {
+ name = content->text_version;
+ trustworthy = METADATA;
+ }
+
+ if (name[0] && trustworthy != LOCAL &&
+ ! c->require_homehost &&
+ conf_name_is_free(name))
+ trustworthy = LOCAL;
+
+ if (trustworthy == LOCAL &&
+ strchr(name, ':'))
+ /* Ignore 'host:' prefix of name */
+ name = strchr(name, ':')+1;
+
+ mdfd = create_mddev(mddev, name, ident->autof, trustworthy,
+ chosen_name, 0);
+ }
+ if (mdfd < 0) {
+ st->ss->free_super(st);
+ if (auto_assem)
+ goto try_again;
+ goto out;
+ }
+ mddev = chosen_name;
+ if (pre_exist == NULL) {
+ if (mddev_busy(fd2devnm(mdfd))) {
+ pr_err("%s already active, cannot restart it!\n",
+ mddev);
+ for (tmpdev = devlist ;
+ tmpdev && tmpdev->used != 1;
+ tmpdev = tmpdev->next)
+ ;
+ if (tmpdev && auto_assem)
+ pr_err("%s needed for %s...\n",
+ mddev, tmpdev->devname);
+ close(mdfd);
+ mdfd = -3;
+ st->ss->free_super(st);
+ if (auto_assem)
+ goto try_again;
+ goto out;
+ }
+ /* just incase it was started but has no content */
+ ioctl(mdfd, STOP_ARRAY, NULL);
+ }
+
+ if (content != &info) {
+ /* This is a member of a container. Try starting the array. */
+ int err;
+ err = assemble_container_content(st, mdfd, content, c,
+ chosen_name, NULL);
+ close(mdfd);
+ return err;
+ }
+
+ /* Ok, no bad inconsistancy, we can try updating etc */
+ devices = xcalloc(num_devs, sizeof(*devices));
+ devmap = xcalloc(num_devs, content->array.raid_disks);
+ devcnt = load_devices(devices, devmap, ident, &st, devlist,
+ c, content, mdfd, mddev,
+ &most_recent, &bestcnt, &best, inargv);
+ if (devcnt < 0) {
+ mdfd = -3;
+ /*
+ * devices is already freed in load_devices, so set devices
+ * to NULL to avoid double free devices.
+ */
+ devices = NULL;
+ goto out;
+ }
+
+ if (devcnt == 0) {
+ pr_err("no devices found for %s\n",
+ mddev);
+ if (st)
+ st->ss->free_super(st);
+ free(devmap);
+ goto out;
+ }
+
+ if (c->update && strcmp(c->update, "byteorder")==0)
+ st->minor_version = 90;
+
+ st->ss->getinfo_super(st, content, NULL);
+ clean = content->array.state & 1;
+
+ /* now we have some devices that might be suitable.
+ * I wonder how many
+ */
+ avail = xcalloc(content->array.raid_disks, 1);
+ okcnt = 0;
+ replcnt = 0;
+ sparecnt=0;
+ journalcnt=0;
+ rebuilding_cnt=0;
+ for (i=0; i< bestcnt; i++) {
+ int j = best[i];
+ int event_margin = 1; /* always allow a difference of '1'
+ * like the kernel does
+ */
+ if (j < 0) continue;
+ /* note: we ignore error flags in multipath arrays
+ * as they don't make sense
+ */
+ if (content->array.level != LEVEL_MULTIPATH) {
+ if (devices[j].i.disk.state & (1<<MD_DISK_JOURNAL)) {
+ if (content->journal_device_required)
+ journalcnt++;
+ else /* unexpected journal, mark as faulty */
+ devices[j].i.disk.state |= (1<<MD_DISK_FAULTY);
+ } else if (!(devices[j].i.disk.state & (1<<MD_DISK_ACTIVE))) {
+ if (!(devices[j].i.disk.state
+ & (1<<MD_DISK_FAULTY))) {
+ devices[j].uptodate = 1;
+ sparecnt++;
+ }
+ continue;
+ }
+ }
+ /* If this device thinks that 'most_recent' has failed, then
+ * we must reject this device.
+ */
+ if (j != most_recent && !c->force &&
+ content->array.raid_disks > 0 &&
+ devices[most_recent].i.disk.raid_disk >= 0 &&
+ devmap[j * content->array.raid_disks + devices[most_recent].i.disk.raid_disk] == 0) {
+ if (c->verbose > -1)
+ pr_err("ignoring %s as it reports %s as failed\n",
+ devices[j].devname, devices[most_recent].devname);
+ best[i] = -1;
+ continue;
+ }
+ /* Require event counter to be same as, or just less than,
+ * most recent. If it is bigger, it must be a stray spare and
+ * should be ignored.
+ */
+ if (devices[j].i.events+event_margin >=
+ devices[most_recent].i.events &&
+ devices[j].i.events <=
+ devices[most_recent].i.events
+ ) {
+ devices[j].uptodate = 1;
+ if (devices[j].i.disk.state & (1<<MD_DISK_JOURNAL))
+ journal_clean = 1;
+ if (i < content->array.raid_disks * 2) {
+ if (devices[j].i.recovery_start == MaxSector ||
+ (content->reshape_active &&
+ i >= content->array.raid_disks - content->delta_disks)) {
+ if (!avail[i/2]) {
+ okcnt++;
+ avail[i/2]=1;
+ } else
+ replcnt++;
+ } else
+ rebuilding_cnt++;
+ } else if (devices[j].i.disk.raid_disk != MD_DISK_ROLE_JOURNAL)
+ sparecnt++;
+ }
+ }
+ free(devmap);
+ if (c->force) {
+ int force_ok = force_array(content, devices, best, bestcnt,
+ avail, most_recent, st, c);
+ okcnt += force_ok;
+ if (force_ok)
+ was_forced = 1;
+ }
+ /* Now we want to look at the superblock which the kernel will base things on
+ * and compare the devices that we think are working with the devices that the
+ * superblock thinks are working.
+ * If there are differences and --force is given, then update this chosen
+ * superblock.
+ */
+ chosen_drive = -1;
+ st->ss->free_super(st);
+ for (i=0; chosen_drive < 0 && i<bestcnt; i+=2) {
+ int j = best[i];
+ int fd;
+
+ if (j<0)
+ continue;
+ if (!devices[j].uptodate)
+ continue;
+ if (devices[j].i.events < devices[most_recent].i.events)
+ continue;
+ chosen_drive = j;
+ if ((fd=dev_open(devices[j].devname,
+ devices[j].included ? O_RDONLY
+ : (O_RDONLY|O_EXCL)))< 0) {
+ pr_err("Cannot open %s: %s\n",
+ devices[j].devname, strerror(errno));
+ goto out;
+ }
+ if (st->ss->load_super(st,fd, NULL)) {
+ close(fd);
+ pr_err("RAID superblock has disappeared from %s\n",
+ devices[j].devname);
+ goto out;
+ }
+ close(fd);
+ }
+ if (st->sb == NULL) {
+ pr_err("No suitable drives found for %s\n", mddev);
+ goto out;
+ }
+ st->ss->getinfo_super(st, content, NULL);
+ if (sysfs_init(content, mdfd, NULL)) {
+ pr_err("Unable to initialize sysfs\n");
+ goto out;
+ }
+
+ /* after reload context, store journal_clean in context */
+ content->journal_clean = journal_clean;
+ for (i=0; i<bestcnt; i++) {
+ int j = best[i];
+ unsigned int desired_state;
+
+ if (j < 0)
+ continue;
+ if (devices[j].i.disk.raid_disk == MD_DISK_ROLE_JOURNAL)
+ desired_state = (1<<MD_DISK_JOURNAL);
+ else if (i >= content->array.raid_disks * 2)
+ desired_state = 0;
+ else if (i & 1)
+ desired_state = (1<<MD_DISK_ACTIVE) | (1<<MD_DISK_REPLACEMENT);
+ else
+ desired_state = (1<<MD_DISK_ACTIVE) | (1<<MD_DISK_SYNC);
+
+ desired_state |= devices[j].i.disk.state & ((1<<MD_DISK_FAILFAST) |
+ (1<<MD_DISK_WRITEMOSTLY));
+
+ if (!devices[j].uptodate)
+ continue;
+
+ devices[j].i.disk.state = desired_state;
+ if (!(devices[j].i.array.state & 1))
+ clean = 0;
+
+ if (st->ss->update_super(st, &devices[j].i, "assemble", NULL,
+ c->verbose, 0, NULL)) {
+ if (c->force) {
+ if (c->verbose >= 0)
+ pr_err("clearing FAULTY flag for device %d in %s for %s\n",
+ j, mddev, devices[j].devname);
+ change = 1;
+ } else {
+ if (c->verbose >= -1)
+ pr_err("device %d in %s has wrong state in superblock, but %s seems ok\n",
+ i, mddev, devices[j].devname);
+ }
+ }
+#if 0
+ if (!(super.disks[i].i.disk.state & (1 << MD_DISK_FAULTY))) {
+ pr_err("devices %d of %s is not marked FAULTY in superblock, but cannot be found\n",
+ i, mddev);
+ }
+#endif
+ }
+ if (c->force && !clean &&
+ !enough(content->array.level, content->array.raid_disks,
+ content->array.layout, clean,
+ avail)) {
+ change += st->ss->update_super(st, content, "force-array",
+ devices[chosen_drive].devname, c->verbose,
+ 0, NULL);
+ was_forced = 1;
+ clean = 1;
+ }
+
+ if (change) {
+ int fd;
+ fd = dev_open(devices[chosen_drive].devname,
+ devices[chosen_drive].included ?
+ O_RDWR : (O_RDWR|O_EXCL));
+ if (fd < 0) {
+ pr_err("Could not open %s for write - cannot Assemble array.\n",
+ devices[chosen_drive].devname);
+ goto out;
+ }
+ if (st->ss->store_super(st, fd)) {
+ close(fd);
+ pr_err("Could not re-write superblock on %s\n",
+ devices[chosen_drive].devname);
+ goto out;
+ }
+ if (c->verbose >= 0)
+ pr_err("Marking array %s as 'clean'\n",
+ mddev);
+ close(fd);
+ }
+
+ /* If we are in the middle of a reshape we may need to restore saved data
+ * that was moved aside due to the reshape overwriting live data
+ * The code of doing this lives in Grow.c
+ */
+ if (content->reshape_active &&
+ !(content->reshape_active & RESHAPE_NO_BACKUP)) {
+ int err = 0;
+ int *fdlist = xmalloc(sizeof(int)* bestcnt);
+ if (c->verbose > 0)
+ pr_err("%s has an active reshape - checking if critical section needs to be restored\n",
+ chosen_name);
+ if (!c->backup_file)
+ c->backup_file = locate_backup(content->sys_name);
+ enable_fds(bestcnt/2);
+ for (i = 0; i < bestcnt/2; i++) {
+ int j = best[i*2];
+ if (j >= 0) {
+ fdlist[i] = dev_open(devices[j].devname,
+ devices[j].included
+ ? O_RDWR : (O_RDWR|O_EXCL));
+ if (fdlist[i] < 0) {
+ pr_err("Could not open %s for write - cannot Assemble array.\n",
+ devices[j].devname);
+ err = 1;
+ break;
+ }
+ } else
+ fdlist[i] = -1;
+ }
+ if (!err) {
+ if (st->ss->external && st->ss->recover_backup)
+ err = st->ss->recover_backup(st, content);
+ else
+ err = Grow_restart(st, content, fdlist, bestcnt/2,
+ c->backup_file, c->verbose > 0);
+ if (err && c->invalid_backup) {
+ if (c->verbose > 0)
+ pr_err("continuing without restoring backup\n");
+ err = 0;
+ }
+ }
+ while (i>0) {
+ i--;
+ if (fdlist[i]>=0) close(fdlist[i]);
+ }
+ free(fdlist);
+ if (err) {
+ pr_err("Failed to restore critical section for reshape, sorry.\n");
+ if (c->backup_file == NULL)
+ cont_err("Possibly you needed to specify the --backup-file\n");
+ goto out;
+ }
+ }
+
+ /* Almost ready to actually *do* something */
+ /* First, fill in the map, so that udev can find our name
+ * as soon as we become active.
+ */
+ if (c->update && strcmp(c->update, "metadata")==0) {
+ content->array.major_version = 1;
+ content->array.minor_version = 0;
+ strcpy(content->text_version, "1.0");
+ }
+
+ map_update(&map, fd2devnm(mdfd), content->text_version,
+ content->uuid, chosen_name);
+
+ rv = start_array(mdfd, mddev, content,
+ st, ident, best, bestcnt,
+ chosen_drive, devices, okcnt, sparecnt,
+ rebuilding_cnt, journalcnt,
+ c,
+ clean, avail, start_partial_ok,
+ pre_exist != NULL,
+ was_forced);
+ if (rv == 1 && !pre_exist)
+ ioctl(mdfd, STOP_ARRAY, NULL);
+ free(devices);
+out:
+ map_unlock(&map);
+ if (rv == 0) {
+ wait_for(chosen_name, mdfd);
+ close(mdfd);
+ if (auto_assem) {
+ int usecs = 1;
+ /* There is a nasty race with 'mdadm --monitor'.
+ * If it opens this device before we close it,
+ * it gets an incomplete open on which IO
+ * doesn't work and the capacity is
+ * wrong.
+ * If we reopen (to check for layered devices)
+ * before --monitor closes, we loose.
+ *
+ * So: wait upto 1 second for there to be
+ * a non-zero capacity.
+ */
+ while (usecs < 1000) {
+ mdfd = open(mddev, O_RDONLY);
+ if (mdfd >= 0) {
+ unsigned long long size;
+ if (get_dev_size(mdfd, NULL, &size) &&
+ size > 0)
+ break;
+ close(mdfd);
+ }
+ usleep(usecs);
+ usecs <<= 1;
+ }
+ }
+ } else if (mdfd >= 0)
+ close(mdfd);
+
+ /* '2' means 'OK, but not started yet' */
+ if (rv == -1) {
+ free(devices);
+ return 1;
+ }
+ return rv == 2 ? 0 : rv;
+}
+
+/*
+ * assemble_container_content - configure and start one member array of a
+ * container through sysfs.
+ *
+ * @st:          metadata handler; st->ss supplies the optional
+ *               validate_ppl / write_init_ppl / set_bitmap /
+ *               recover hooks used below.
+ * @mdfd:        open descriptor of the md device being assembled.  It is
+ *               not closed by this function.
+ * @content:     description of the member array and of its devices
+ *               (content->devs).  Several fields (text_version,
+ *               array.state, resync_start) may be modified.
+ * @c:           command context (force, runstop, readonly, verbose,
+ *               backup_file, freeze_reshape).
+ * @chosen_name: name of the md device, used in messages and recorded in
+ *               the map file.
+ * @result:      forwarded unchanged to set_array_assembly_status().
+ *
+ * Returns 0 when the array was started, non-zero otherwise.
+ */
+int assemble_container_content(struct supertype *st, int mdfd,
+			       struct mdinfo *content, struct context *c,
+			       char *chosen_name, int *result)
+{
+	struct mdinfo *dev, *sra, *dev2;
+	struct assembly_array_info array = {chosen_name, 0, 0, 0};
+	int old_raid_disks;
+	int start_reshape;
+	char *avail;
+	int err;
+	int is_raid456, is_clean, all_disks;
+
+	if (sysfs_init(content, mdfd, NULL)) {
+		pr_err("Unable to initialize sysfs\n");
+		return 1;
+	}
+
+	/* (Re)configure the array when its current sysfs state cannot be
+	 * read, or when it reports different metadata from the one we are
+	 * assembling.
+	 */
+	sra = sysfs_read(mdfd, NULL, GET_VERSION|GET_DEVS);
+	if (sra == NULL || strcmp(sra->text_version, content->text_version) != 0) {
+		if (content->array.major_version == -1 &&
+		    content->array.minor_version == -2 &&
+		    c->readonly &&
+		    content->text_version[0] == '/')
+			content->text_version[0] = '-';
+		/* NOTE(review): 9003 presumably encodes md driver version
+		 * 0.90.03 - confirm against sysfs_set_array().
+		 */
+		if (sysfs_set_array(content, 9003) != 0) {
+			sysfs_free(sra);
+			return 1;
+		}
+	}
+
+	/* There are two types of reshape: container wide or sub-array specific
+	 * Check if metadata requests blocking container wide reshapes
+	 */
+	start_reshape = (content->reshape_active &&
+			 !((content->reshape_active == CONTAINER_RESHAPE) &&
+			   (content->array.state & (1<<MD_SB_BLOCK_CONTAINER_RESHAPE))));
+
+	/* Block subarray here if it is under reshape now
+	 * Do not allow for any changes in this array
+	 */
+	if (st->ss->external && content->recovery_blocked && start_reshape)
+		block_subarray(content);
+
+	/* Detach devices that the kernel still lists for this array but
+	 * which are not in content->devs.  sra is NULL when sysfs_read()
+	 * failed above; in that case no old devices are known, so the loop
+	 * is skipped instead of dereferencing a NULL pointer.
+	 */
+	for (dev2 = sra ? sra->devs : NULL; dev2; dev2 = dev2->next) {
+		for (dev = content->devs; dev; dev = dev->next)
+			if (dev2->disk.major == dev->disk.major &&
+			    dev2->disk.minor == dev->disk.minor)
+				break;
+		if (dev)
+			continue;
+		/* Don't want this one any more */
+		if (sysfs_set_str(sra, dev2, "slot", "none") < 0 &&
+		    errno == EBUSY) {
+			pr_err("Cannot remove old device %s: not updating %s\n", dev2->sys_name, sra->sys_name);
+			sysfs_free(sra);
+			return 1;
+		}
+		sysfs_set_str(sra, dev2, "state", "remove");
+	}
+
+	/* Add every device of the array and count how each one was added:
+	 * newly added, added into a slot that only exists after the
+	 * reshape, or already present (EEXIST).
+	 */
+	old_raid_disks = content->array.raid_disks - content->delta_disks;
+	avail = xcalloc(content->array.raid_disks, 1);
+	for (dev = content->devs; dev; dev = dev->next) {
+		if (dev->disk.raid_disk >= 0)
+			avail[dev->disk.raid_disk] = 1;
+		if (sysfs_add_disk(content, dev, 1) == 0) {
+			if (dev->disk.raid_disk >= old_raid_disks &&
+			    content->reshape_active)
+				array.exp_cnt++;
+			else
+				array.new_cnt++;
+		} else if (errno == EEXIST)
+			array.preexist_cnt++;
+	}
+	sysfs_free(sra);
+
+	all_disks = array.new_cnt + array.exp_cnt + array.preexist_cnt;
+
+	map_update(NULL, fd2devnm(mdfd), content->text_version,
+		   content->uuid, chosen_name);
+
+	if (content->consistency_policy == CONSISTENCY_POLICY_PPL &&
+	    st->ss->validate_ppl) {
+		/* Assume clean; the flag is cleared again below if a PPL
+		 * turns out to be invalid and --force was not given.
+		 */
+		content->array.state |= 1;
+		err = 0;
+
+		for (dev = content->devs; dev; dev = dev->next) {
+			int dfd;
+			char *devpath;
+			int ret;
+
+			ret = st->ss->validate_ppl(st, content, dev);
+			if (ret == 0)
+				continue;
+
+			if (ret < 0) {
+				err = 1;
+				break;
+			}
+
+			if (!c->force) {
+				pr_err("%s contains invalid PPL - consider --force or --update-subarray with --update=no-ppl\n",
+				       chosen_name);
+				content->array.state &= ~1;
+				/* raid_disk may be negative (same check as
+				 * when avail[] was filled in above).
+				 */
+				if (dev->disk.raid_disk >= 0)
+					avail[dev->disk.raid_disk] = 0;
+				break;
+			}
+
+			/* have --force - overwrite the invalid ppl */
+			devpath = map_dev(dev->disk.major, dev->disk.minor, 0);
+			if (devpath == NULL) {
+				/* no device node known for this major:minor */
+				pr_err("Cannot find device node for %d:%d\n",
+				       dev->disk.major, dev->disk.minor);
+				err = 1;
+				break;
+			}
+			dfd = dev_open(devpath, O_RDWR);
+			if (dfd < 0) {
+				pr_err("Failed to open %s\n", devpath);
+				err = 1;
+				break;
+			}
+
+			err = st->ss->write_init_ppl(st, content, dfd);
+			close(dfd);
+
+			if (err)
+				break;
+		}
+
+		if (err) {
+			free(avail);
+			return err;
+		}
+	} else if (c->force) {
+		/* Set the array as 'clean' so that we can proceed with starting
+		 * it even if we don't have all devices. Mdmon doesn't care
+		 * if the dirty flag is set in metadata, it will start managing
+		 * it anyway.
+		 * This is really important for raid456 (RWH case), other levels
+		 * are started anyway.
+		 */
+		content->array.state |= 1;
+	}
+
+	is_raid456 = (content->array.level >= 4 && content->array.level <= 6);
+	is_clean = content->array.state & 1;
+
+	/* Refuse to start when too few devices are available. */
+	if (enough(content->array.level, content->array.raid_disks,
+		   content->array.layout, is_clean, avail) == 0) {
+		set_array_assembly_status(c, result, INCR_NO, &array);
+
+		if (c->verbose >= 0 && is_raid456 && !is_clean)
+			pr_err("Consider --force to start dirty degraded array\n");
+
+		free(avail);
+		return 1;
+	}
+	free(avail);
+
+	/* Enough devices to run, but not all working ones are here yet:
+	 * only start if --run was given.
+	 */
+	if (c->runstop <= 0 && all_disks < content->array.working_disks) {
+
+		set_array_assembly_status(c, result, INCR_UNSAFE, &array);
+
+		if (c->verbose >= 0 && c->force)
+			pr_err("Consider --run to start array as degraded.\n");
+		return 1;
+	}
+
+	if (is_raid456 && content->resync_start != MaxSector && c->force &&
+	    all_disks < content->array.raid_disks) {
+
+		content->resync_start = MaxSector;
+		err = sysfs_set_num(content, NULL, "resync_start", MaxSector);
+		if (err)
+			return 1;
+
+		pr_err("%s array state forced to clean. It may cause data corruption.\n",
+		       chosen_name);
+	}
+
+	/*
+	 * Before activating the array, perform extra steps required
+	 * to configure the internal write-intent bitmap.
+	 */
+	if (content->consistency_policy == CONSISTENCY_POLICY_BITMAP &&
+	    st->ss->set_bitmap)
+		st->ss->set_bitmap(st, content);
+
+	if (start_reshape) {
+		int spare = content->array.raid_disks + array.exp_cnt;
+		if (restore_backup(st, content,
+				   array.new_cnt,
+				   spare, &c->backup_file, c->verbose) == 1)
+			return 1;
+
+		if (content->reshape_progress == 0) {
+			/* If reshape progress is 0 - we are assembling the
+			 * array that was stopped, before reshape has started.
+			 * Array needs to be started as active, Grow_continue()
+			 * will start the reshape.
+			 */
+			sysfs_set_num(content, NULL, "reshape_position",
+				      MaxSector);
+			err = sysfs_set_str(content, NULL,
+					    "array_state", "active");
+			sysfs_set_num(content, NULL, "reshape_position", 0);
+		} else {
+			err = sysfs_set_str(content, NULL,
+					    "array_state", "readonly");
+		}
+
+		if (err)
+			return 1;
+
+		if (st->ss->external) {
+			if (!mdmon_running(st->container_devnm))
+				start_mdmon(st->container_devnm);
+			ping_monitor(st->container_devnm);
+			if (mdmon_running(st->container_devnm) &&
+			    st->update_tail == NULL)
+				st->update_tail = &st->updates;
+		}
+
+		err = Grow_continue(mdfd, st, content, c->backup_file,
+				    0, c->freeze_reshape);
+	} else switch(content->array.level) {
+	case LEVEL_LINEAR:
+	case LEVEL_MULTIPATH:
+	case 0:
+		err = sysfs_set_str(content, NULL, "array_state",
+				    c->readonly ? "readonly" : "active");
+		break;
+	default:
+		err = sysfs_set_str(content, NULL, "array_state",
+				    "readonly");
+		/* start mdmon if needed. */
+		if (!err) {
+			if (!mdmon_running(st->container_devnm))
+				start_mdmon(st->container_devnm);
+			ping_monitor(st->container_devnm);
+		}
+		break;
+	}
+	if (!err)
+		sysfs_set_safemode(content, content->safe_mode_delay);
+
+	/* Block subarray here if it is not reshaped now
+	 * It has be blocked a little later to allow mdmon to switch in
+	 * in to R/W state
+	 */
+	if (st->ss->external && content->recovery_blocked &&
+	    !start_reshape)
+		block_subarray(content);
+
+	if (err)
+		set_array_assembly_status(c, result, INCR_NO, &array);
+	else {
+		set_array_assembly_status(c, result, INCR_YES, &array);
+		wait_for(chosen_name, mdfd);
+		sysfs_rules_apply(chosen_name, content);
+	}
+
+	/* FIXME should have an O_EXCL and wait for read-auto */
+	return err;
+}
diff --git a/Build.c b/Build.c
new file mode 100644
index 0000000..962c2e3
--- /dev/null
+++ b/Build.c
@@ -0,0 +1,227 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email: <neilb@suse.de>
+ */
+
+#include "mdadm.h"
+
+/*
+ * Build - create and start an array that has no superblocks.
+ *
+ * Nothing can be verified against on-disk metadata, so the array is put
+ * together exactly as described by the caller.
+ *
+ * @mddev:   requested md device name; passed to create_mddev().
+ * @devlist: member devices in slot order.  An entry whose devname is
+ *           "missing" leaves that slot empty.
+ * @s:       requested shape.  s->level must be set.  s->layout, s->size,
+ *           s->chunk and s->bitmap_file may be modified here.
+ * @c:       command context (verbose, autof, delay).
+ *
+ * Returns 0 when the array was built and started, 1 on any failure.
+ */
+int Build(char *mddev, struct mddev_dev *devlist,
+	  struct shape *s, struct context *c)
+{
+	int i;
+	dev_t rdev;
+	int subdevs = 0, missing_disks = 0;
+	struct mddev_dev *dv;
+	int bitmap_fd = -1;	/* -1 while no bitmap file is open */
+	unsigned long long bitmapsize;
+	int mdfd;
+	char chosen_name[1024];
+	int uuid[4] = {0,0,0,0};
+	struct map_ent *map = NULL;
+	mdu_array_info_t array;
+	mdu_param_t param; /* not used by syscall */
+
+	if (s->level == UnSet) {
+		pr_err("a RAID level is needed to Build an array.\n");
+		return 1;
+	}
+	/* scan all devices, make sure they really are block devices */
+	for (dv = devlist; dv; dv=dv->next) {
+		subdevs++;
+		if (strcmp("missing", dv->devname) == 0) {
+			missing_disks++;
+			continue;
+		}
+		if (!stat_is_blkdev(dv->devname, NULL))
+			return 1;
+	}
+
+	if (s->raiddisks != subdevs) {
+		pr_err("requested %d devices in array but listed %d\n",
+			s->raiddisks, subdevs);
+		return 1;
+	}
+
+	/* Pick a default layout for levels that have one. */
+	if (s->layout == UnSet)
+		switch(s->level) {
+		default: /* no layout */
+			s->layout = 0;
+			break;
+		case 10:
+			s->layout = 0x102; /* near=2, far=1 */
+			if (c->verbose > 0)
+				pr_err("layout defaults to n1\n");
+			break;
+		case 5:
+		case 6:
+			s->layout = map_name(r5layout, "default");
+			if (c->verbose > 0)
+				pr_err("layout defaults to %s\n", map_num(r5layout, s->layout));
+			break;
+		case LEVEL_FAULTY:
+			s->layout = map_name(faultylayout, "default");
+
+			if (c->verbose > 0)
+				pr_err("layout defaults to %s\n", map_num(faultylayout, s->layout));
+			break;
+		}
+
+	/* We need to create the device. It can have no name. */
+	if (map_lock(&map))
+		pr_err("failed to get exclusive lock on mapfile - continue anyway...\n");
+	mdfd = create_mddev(mddev, NULL, c->autof, LOCAL,
+			    chosen_name, 0);
+	if (mdfd < 0) {
+		map_unlock(&map);
+		return 1;
+	}
+	mddev = chosen_name;
+
+	map_update(&map, fd2devnm(mdfd), "none", uuid, chosen_name);
+	map_unlock(&map);
+
+	/* Zero the whole structure first: only some of its fields are
+	 * assigned below and all of it is handed to md_set_array_info().
+	 */
+	memset(&array, 0, sizeof(array));
+	array.level = s->level;
+	if (s->size == MAX_SIZE)
+		s->size = 0;
+	array.size = s->size;
+	array.nr_disks = s->raiddisks;
+	array.raid_disks = s->raiddisks;
+	array.md_minor = 0;
+	if (fstat_is_blkdev(mdfd, mddev, &rdev))
+		array.md_minor = minor(rdev);
+	array.not_persistent = 1;
+	array.state = 0; /* not clean, but no errors */
+	if (s->assume_clean)
+		array.state |= 1;
+	array.active_disks = s->raiddisks - missing_disks;
+	array.working_disks = s->raiddisks - missing_disks;
+	array.spare_disks = 0;
+	array.failed_disks = missing_disks;
+	if (s->chunk == 0 && (s->level==0 || s->level==LEVEL_LINEAR))
+		s->chunk = 64;
+	array.chunk_size = s->chunk*1024;
+	array.layout = s->layout;
+	if (md_set_array_info(mdfd, &array)) {
+		pr_err("md_set_array_info() failed for %s: %s\n",
+		       mddev, strerror(errno));
+		goto abort;
+	}
+
+	if (s->bitmap_file && strcmp(s->bitmap_file, "none") == 0)
+		s->bitmap_file = NULL;
+	if (s->bitmap_file && s->level <= 0) {
+		pr_err("bitmaps not meaningful with level %s\n",
+			map_num(pers, s->level)?:"given");
+		goto abort;
+	}
+	/* now add the devices */
+	for ((i=0), (dv = devlist) ; dv ; i++, dv=dv->next) {
+		mdu_disk_info_t disk;
+		unsigned long long dsize;
+		int fd;
+
+		if (strcmp("missing", dv->devname) == 0)
+			continue;
+		if (!stat_is_blkdev(dv->devname, &rdev))
+			goto abort;
+		/* O_EXCL open; the descriptor is only used to read the size */
+		fd = open(dv->devname, O_RDONLY|O_EXCL);
+		if (fd < 0) {
+			pr_err("Cannot open %s: %s\n",
+				dv->devname, strerror(errno));
+			goto abort;
+		}
+		/* s->size ends up as the smallest member size seen */
+		if (get_dev_size(fd, NULL, &dsize) &&
+		    (s->size == 0 || s->size == MAX_SIZE || dsize < s->size))
+			s->size = dsize;
+		close(fd);
+		disk.number = i;
+		disk.raid_disk = i;
+		disk.state = (1<<MD_DISK_SYNC) | (1<<MD_DISK_ACTIVE);
+		if (dv->writemostly == FlagSet)
+			disk.state |= 1<<MD_DISK_WRITEMOSTLY;
+		disk.major = major(rdev);
+		disk.minor = minor(rdev);
+		if (ioctl(mdfd, ADD_NEW_DISK, &disk)) {
+			pr_err("ADD_NEW_DISK failed for %s: %s\n",
+			       dv->devname, strerror(errno));
+			goto abort;
+		}
+	}
+	/* now to start it */
+	if (s->bitmap_file) {
+		bitmap_fd = open(s->bitmap_file, O_RDWR);
+		if (bitmap_fd < 0) {
+			/* Could not open the file: create it, then retry. */
+			int major = BITMAP_MAJOR_HI;
+
+			bitmapsize = s->size >> 9; /* FIXME wrong for RAID10 */
+			if (CreateBitmap(s->bitmap_file, 1, NULL,
+					 s->bitmap_chunk, c->delay,
+					 s->write_behind, bitmapsize, major)) {
+				goto abort;
+			}
+			bitmap_fd = open(s->bitmap_file, O_RDWR);
+			if (bitmap_fd < 0) {
+				pr_err("%s cannot be opened.\n", s->bitmap_file);
+				goto abort;
+			}
+		}
+		/* bitmap_fd is a valid descriptor here */
+		if (ioctl(mdfd, SET_BITMAP_FILE, bitmap_fd) < 0) {
+			pr_err("Cannot set bitmap file for %s: %s\n",
+			       mddev, strerror(errno));
+			goto abort;
+		}
+	}
+	if (ioctl(mdfd, RUN_ARRAY, &param)) {
+		pr_err("RUN_ARRAY failed: %s\n", strerror(errno));
+		if (s->chunk & (s->chunk - 1)) {
+			cont_err("Problem may be that chunk size is not a power of 2\n");
+		}
+		goto abort;
+	}
+
+	if (c->verbose >= 0)
+		pr_err("array %s built and started.\n",
+			mddev);
+	wait_for(mddev, mdfd);
+	/* Our bitmap descriptor is no longer needed; do not leak it. */
+	if (bitmap_fd >= 0)
+		close(bitmap_fd);
+	close(mdfd);
+	return 0;
+
+ abort:
+	ioctl(mdfd, STOP_ARRAY, 0);
+	if (bitmap_fd >= 0)
+		close(bitmap_fd);
+	close(mdfd);
+	return 1;
+}
diff --git a/COPYING b/COPYING
new file mode 100644
index 0000000..d159169
--- /dev/null
+++ b/COPYING
@@ -0,0 +1,339 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The licenses for most software are designed to take away your
+freedom to share and change it. By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users. This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it. (Some other Free Software Foundation software is covered by
+the GNU Lesser General Public License instead.) You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+ To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have. You must make sure that they, too, receive or can get the
+source code. And you must show them these terms so they know their
+rights.
+
+ We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+ Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software. If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+ Finally, any free program is threatened constantly by software
+patents. We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary. To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ GNU GENERAL PUBLIC LICENSE
+ TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+ 0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License. The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language. (Hereinafter, translation is included without limitation in
+the term "modification".) Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope. The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+ 1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+ 2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+ a) You must cause the modified files to carry prominent notices
+ stating that you changed the files and the date of any change.
+
+ b) You must cause any work that you distribute or publish, that in
+ whole or in part contains or is derived from the Program or any
+ part thereof, to be licensed as a whole at no charge to all third
+ parties under the terms of this License.
+
+ c) If the modified program normally reads commands interactively
+ when run, you must cause it, when started running for such
+ interactive use in the most ordinary way, to print or display an
+ announcement including an appropriate copyright notice and a
+ notice that there is no warranty (or else, saying that you provide
+ a warranty) and that users may redistribute the program under
+ these conditions, and telling the user how to view a copy of this
+ License. (Exception: if the Program itself is interactive but
+ does not normally print such an announcement, your work based on
+ the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole. If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works. But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+ 3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+ a) Accompany it with the complete corresponding machine-readable
+ source code, which must be distributed under the terms of Sections
+ 1 and 2 above on a medium customarily used for software interchange; or,
+
+ b) Accompany it with a written offer, valid for at least three
+ years, to give any third party, for a charge no more than your
+ cost of physically performing source distribution, a complete
+ machine-readable copy of the corresponding source code, to be
+ distributed under the terms of Sections 1 and 2 above on a medium
+ customarily used for software interchange; or,
+
+ c) Accompany it with the information you received as to the offer
+ to distribute corresponding source code. (This alternative is
+ allowed only for noncommercial distribution and only if you
+ received the program in object code or executable form with such
+ an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it. For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable. However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+ 4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License. Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+ 5. You are not required to accept this License, since you have not
+signed it. However, nothing else grants you permission to modify or
+distribute the Program or its derivative works. These actions are
+prohibited by law if you do not accept this License. Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+ 6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions. You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+ 7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all. For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices. Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+ 8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded. In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+ 9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number. If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation. If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+ 10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission. For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this. Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+ NO WARRANTY
+
+ 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+ 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License along
+ with this program; if not, write to the Free Software Foundation, Inc.,
+ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+ Gnomovision version 69, Copyright (C) year name of author
+ Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary. Here is a sample; alter the names:
+
+ Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+ `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+ <signature of Ty Coon>, 1 April 1989
+ Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs. If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library. If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.
diff --git a/ChangeLog b/ChangeLog
new file mode 100644
index 0000000..a3bf700
--- /dev/null
+++ b/ChangeLog
@@ -0,0 +1,306 @@
+Please see git logs for detailed change log.
+This file just contains highlights.
+
+Changes Prior to release 3.3
+- Some array reshapes can proceed without needing backup file.
+ This is done by changing the 'data_offset' so we never need to write
+ any data back over where it was before. If there is no "head space"
+ or "tail space" to allow data_offset to change, the old mechanism
+ with a backup file can still be used.
+- RAID10 arrays can be reshaped to change the number of devices,
+ change the chunk size, or change the layout between 'near'
+ and 'offset'.
+ This will always change data_offset, and will fail if there is no
+ room for data_offset to be moved.
+- "--assemble --update=metadata" can convert a 0.90 array to a 1.0 array.
+- bad-block-logs are supported (but not heavily tested yet)
+- "--assemble --update=revert-reshape" can be used to undo a reshape
+ that has just been started but isn't really wanted. This is very
+ new and while it passes basic tests it cannot be guaranteed.
+- improved locking between --incremental and --assemble
+- uses systemd to run "mdmon" if systemd is configured to do that.
+- kernel names of md devices can be non-numeric. e.g. "md_home" rather than
+ "md0". This will probably confuse lots of other tools, so you need to
+ echo CREATE names=yes >> /etc/mdadm.conf
+ or the feature will not be used. (you also need a reasonably new kernel).
+- "--stop" can be given a kernel name instead of a device name. i.e
+ mdadm --stop md4
+ will work even if /dev/md4 doesn't exist.
+- "--detail --export" has some information about the devices in the array
+- --dump and --restore can be used to backup and restore the metadata on an
+ array.
+- Hot-replace is supported with
+ mdadm /dev/mdX --replace /dev/foo
+ and
+ mdadm /dev/mdX --replace /dev/foo --with /dev/bar
+- Config file can be a directory in which case all "*.conf" files are
+ read in lexical order.
+ Default is to read /etc/mdadm.conf and then /etc/mdadm.conf.d
+ Thus
+ echo CREATE names=yes > /etc/mdadm.conf.d/names.conf
+ will also enable the use of named md devices.
+
+- Lots of improvements to DDF support including adding support for
+ RAID10 (thanks Martin Wilck).
+
+Changes Prior to release 3.2.6
+ - There are no real stand-out fixes, just lots of little bits and pieces.
+
+Changes Prior to release 3.2.5
+ - This release primarily fixes a serious regression in 3.2.4.
+ This regression does *not* cause any risk to data. It simply
+ means that adding a device with "--add" would sometime fail
+ when it should not.
+
+ - The fix also includes a couple of minor fixes such as making
+ the "--layout=preserve" option to "--grow" work again.
+
+
+Changes Prior to release 3.2.4
+"--oneline" log of changes is below. Some notable ones are:
+
+ - --offroot argument to improve interactions between mdmon and initrd
+ - --prefer argument to select which /dev names to display in some
+ circumstances.
+ - relax restrictions on when "--add" will be allowed
+ - Fix bug with adding write-intent-bitmap to active array
+ - Now defaults to "/run/mdadm" for storing run-time files.
+
+Changes Prior to release 3.2.3
+ - The largest single area of change is support for reshape of Intel
+ IMSM arrays (OnLine Capacity Expansion and Level Migration).
+ - Among other fixes, this now has a better chance of surviving if a
+ device fails during reshape.
+
+Changes Prior to release 3.2.2
+ - reshaping IMSM (Intel metadata) arrays is no longer 'experimental',
+ it should work properly and be largely compatible with IMSM drivers in
+ other platforms.
+ - --assume-clean can be used with --grow --size to avoid resyncing the
+ new part of the array. This is only supported with very new kernels.
+ - RAID0 arrays can have chunksize which is not a power of 2. This has been
+ supported in the kernel for a while but is only now supported by
+ mdadm.
+
+ - A new tool 'raid6check' is available which can check a RAID6 array,
+ or part of it, and report which device is most inconsistent with the
+ others if any stripe is inconsistent. This is still under development
+ and does not have a man page yet. If anyone tries it out and has any
+ questions or experience to report, they would be most welcome on
+ linux-raid@vger.kernel.org.
+
+Changes Prior to release 3.2.1
+ - policy framework
+ Policy can be expressed for moving spare devices between arrays, and
+ for how to handle hot-plugged devices. This policy can be different
+ for devices plugged in to different controllers etc.
+ This, for example, allows a configuration where when a device is plugged
+ in it is immediately included in an md array as a hot spare and
+ possibly starts recovery immediately if an array is degraded.
+
+ - some understanding of mbr and gpt partition tables
+ This is primarily to support the new hot-plug support. If a
+ device is plugged in and policy suggests it should have a partition table,
+ the partition table will be copied from a suitably similar device, and
+ then the partitions will hot-plug and can then be added to md arrays.
+
+ - "--incremental --remove" can remember where a device was removed from
+ so if a device gets plugged back in the same place, special policy applies
+ to it, allowing it to be included in an array even if a general hotplug
+ will not be included.
+
+ - enhanced reshape options, including growing a RAID0 by converting to RAID4,
+ restriping, and converting back. Also conversions between RAID0 and
+ RAID10 and between RAID1 and RAID10 are possible (with a suitably recent
+ kernel).
+
+ - spare migration for IMSM arrays.
+ Spare migration can now work across 'containers' using non-native metadata
+ and specifically Intel's IMSM arrays support spare migrations.
+
+ - OLCE and level migration for Intel IMSM arrays.
+ OnLine Capacity Expansion and level migration (e.g. RAID0 -> RAID5) is
+ supported for Intel Matrix Storage Manager arrays.
+ This support is currently 'experimental' for technical reasons. It can
+ be enabled with "export MDADM_EXPERIMENTAL=1"
+
+ - avoid including wayward devices
+ If you split a RAID1, mount the two halves as two separate degraded RAID1s,
+ and then later bring the two back together, it is possible that the md
+ metadata won't properly show that one must over-ride the other.
+ mdadm now does extra checking to detect this possibility and avoid
+ potentially corrupting data.
+
+ - remove any possible confusion between similar options.
+ e.g. --brief and --bitmap were mapped to 'b' and mdadm wouldn't
+ notice if one was used where the other was expected.
+
+ - allow K,M,G suffixes on chunk sizes
+
+Changes Prior to release 3.2
+ - By far the most significant change in this release related to the
+ management of reshaping arrays. This code has been substantially
+ re-written so that it can work with 'externally managed metadata' -
+ Intel's IMSM in particular. We now support level migration and
+ OnLine Capacity Expansion on these arrays.
+ - Policy framework.
+ Various policy statements can be made in the mdadm.conf to guide
+ the behaviour of mdadm, particular with regards to how new devices
+ are treated by "mdadm -I".
+ Depending on the 'action' associated with a device (identified by
+ its 'path') such new devices can be automatically re-added to an
+ existing array that they previously fell out of, or automatically
+ added as a spare if they appear to contain no data.
+
+ - mdadm now has a limited understanding of partition tables. This
+ allows the policy framework to make decisions about partitioned
+ devices as well.
+
+ - --incremental --remove can be told what --path the device was on,
+ and this info will be recorded so that another device appearing at
+ the same physical location can be preferentially added to the same
+ array (provides the spare-same-slot action policy applied to the
+ path).
+
+ - A new "--invalid-backup" flag is available in --assemble
+ mode. This can be used to re-assemble an array which was stopped
+ in the middle of a reshape, and for which the 'backup file' is no
+ longer available or is corrupted. The array may have some
+ corruption in it at the point where reshape was up to, but at least
+ the rest of the array will become available.
+
+
+ - Various internal restructuring - more is needed.
+
+Changes Prior to release 3.1.5
+ - Fixes for v1.x metadata on big-endian machines.
+ - man page improvements
+ - Improve '--detail --export' when run on partitions of an md array.
+ - Fix regression with removing 'failed' or 'detached' devices.
+ - Fixes for "--assemble --force" in various unusual cases.
+ - Allow '-Y' to mean --export. This was documented but not implemented.
+ - Various fixes for handling 'ddf' metadata. This is now more reliable
+ but could benefit from more interoperability testing.
+ - Correctly list subarrays of a container in "--detail" output.
+ - Improve checks on whether the requested number of devices is supported
+ by the metadata - both for --create and --grow.
+ - Don't remove partitions from a device that is being included in an
+ array until we are fully committed to including it.
+ - Allow "--assemble --update=no-bitmap" so an array with a corrupt
+ bitmap can still be assembled.
+ - Don't allow --add to succeed if it looks like a "--re-add" is probably
+ wanted, but cannot succeed. This avoids inadvertently turning
+ devices into spares when an array is failed.
+
+Changes Prior to release 3.1.4
+ Two fixes related to configs that aren't using udev:
+ - Don't remove md devices with 'standard' names on --stop
+ - Allow dev_open to work on read-only /dev
+ And fixed regressions:
+ - Allow --incremental to add spares to an array
+ - Accept --no-degraded as a deprecated option rather than
+ throwing an error
+ - Return correct success status when --incremental assembling
+ a container which does not yet have enough devices.
+ - Don't link mdadm with pthreads, only mdmon needs it.
+ - Fix compiler warning due to bad use of snprintf
+
+Changes Prior to release 3.1.3
+ - mapfile now lives in a fixed location which defaults to
+ /dev/.mdadm/map but can be changed at compile time. This
+ location is chosen as most distros provide it during early
+ boot and preserve it through. As long as /dev exists and is
+ writable, /dev/.mdadm will be created.
+ Other files for communication with mdmon live here too.
+ This fixes a bug reported by Debian and Gentoo users where
+ udev would spin in early-boot.
+ - IMSM and DDF metadata will not be recognised on partitions
+ as they should only be used on whole-disks.
+ - Various overflows caused by 2G drives have been addressed.
+ - A subarray of an IMSM container can now be killed with
+ --kill-subarray. Also subarrays can be renamed with
+ --update-subarray
+ - -If (or --incremental --fail) can be used from udev to
+ fail and remove from all arrays a device which has been
+ unplugged from the system. i.e. hot-unplug-support.
+ - "mdadm /dev/mdX --re-add missing" will look for any device
+ that looks like it should be a member of /dev/mdX but isn't
+ and will automatically --re-add it
+ - Now compile with -Wextra to get extra warnings.
+ - Lots of minor bug fixes, documentation improvements, etc.
+
+Changes Prior to release 3.1.2
+ - The default metadata has changed again (sorry about that).
+ It is now v1.2 and will hopefully stay that way. It turned
+ out there were boot-block issues with v1.1 which make it
+ unsuitable for a default, though in many cases it is still
+ suitable to use.
+ - Stopping a container is not permitted when members are still
+ active
+ - Add 'homehost' to the valid words for the "AUTO" config file
+ line. When followed by "-all", this causes mdadm to
+ auto-assemble any array belonging to this host, but not
+ auto-assemble anything else.
+ - Fix some bugs with "--grow --chunksize=" for changing chunksize.
+ - VAR_RUN can be easily changed at compile time just like ALT_RUN.
+ This gives distros more flexibility in how to manage the
+ pid and sock files that mdmon needs.
+ - Various mdmon fixes
+ - Always make bitmap 4K-aligned if at all possible.
+ - If mdadm.conf lists arrays which have inter-dependencies,
+ they previously had to be listed in the "right" order. Now
+ any order should work.
+ - Fix --force assembly of v1.x arrays which are in the process
+ of recovering.
+ - Add section on 'scrubbing' to 'md' man page.
+ - Various command-line-option parsing improvements.
+ - ... and lots of other bug fixes.
+
+Changes Prior to release 3.1.1
+ - Multiple fixes for new --grow levels including fixes for
+ serious data corruption problems.
+ - Change default metadata to v1.1
+ - Change default chunk size to 512K
+ - Change default bitmap chunk size to 64Meg
+ - When --re-add is used, don't fall back to
+ --add if --re-add fails as this can destroy data.
+
+Changes Prior to release 3.1
+ - Support --grow to change the layout of RAID4/5/6
+ - Support --grow to change the chunksize of raid 4/5/6
+ - Support --grow to change level from RAID1 -> RAID5 -> RAID6 and
+ back.
+ - Support --grow to reduce the number of devices in RAID4/5/6.
+ - Support restart of these grow options when assembling an array
+ which is partially grown.
+ - Assorted tests of this code, and of different RAID6 layouts.
+
+Changes Prior to release 3.0.3
+ - Improvements for creating arrays giving just a name, like 'foo',
+ rather than the full '/dev/md/foo'.
+ - Improvements for assembling member arrays of containers.
+ - Improvements to test suite
+ - Add option to change increment for RebuildNN messages reported
+ by "mdadm --monitor"
+ - Improvements to mdmon 'hand-over' from initrd to final root.
+ - Handle merging of devices that have left an IMSM array and are
+ being re-incorporated.
+ - Add missing space in "--detail --brief" output.
+
+Changes Prior to release 3.0.2
+ - Fix crash when homehost is not set, as often happens in
+ early boot.
+
+Changes Prior to release 3.0.1
+ - Fix various segfaults
+ - Fixes for --examine with containers
+ - Lots of other little fixes.
+
+Changes Prior to release 3.0
+ - Support for externally managed metadata, specifically DDF and IMSM.
+ - Depend on udev to create entries in /dev, rather than creating them
+ ourselves.
+ - remove --auto-update-home-hosts
+ - new config file line "auto"
+ - new "<ignore>" and "any" options for "homehost"
+ - numerous bug fixes and minor enhancements.
diff --git a/Create.c b/Create.c
new file mode 100644
index 0000000..0ff1922
--- /dev/null
+++ b/Create.c
@@ -0,0 +1,1118 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2001-2013 Neil Brown <neilb@suse.de>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email: <neilb@suse.de>
+ */
+
+#include "mdadm.h"
+#include "md_u.h"
+#include "md_p.h"
+#include <ctype.h>
+
+/*
+ * Round *size down with the mask ~(chunk - 1) and check the result.
+ *
+ * The mask only yields an exact multiple of chunk when chunk is a power
+ * of two.  A *size of 0 is left untouched.
+ *
+ * Returns 1 (after printing an error) when a non-zero size is rounded
+ * down to 0, and 0 in every other case.
+ */
+static int round_size_and_verify(unsigned long long *size, int chunk)
+{
+	if (*size != 0) {
+		unsigned long long mask = ~(unsigned long long)(chunk - 1);
+
+		*size = *size & mask;
+		if (*size == 0) {
+			pr_err("Size cannot be smaller than chunk.\n");
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Pick the layout to use when none was requested.
+ *
+ * The metadata handler (if st is non-NULL and provides ->default_geometry)
+ * is consulted first; it may also adjust the local copy of level.  If it
+ * leaves the layout UnSet, a per-level default is chosen here.  When
+ * verbose > 0 the levels 10, 5, 6 and LEVEL_FAULTY report their choice.
+ *
+ * Returns the selected layout value.
+ */
+static int default_layout(struct supertype *st, int level, int verbose)
+{
+	int chosen = UnSet;
+
+	if (st && st->ss->default_geometry)
+		st->ss->default_geometry(st, &level, &chosen, NULL);
+
+	/* the handler made a decision - nothing more to do */
+	if (chosen != UnSet)
+		return chosen;
+
+	if (level == 0) {
+		chosen = RAID0_ORIG_LAYOUT;
+	} else if (level == 10) {
+		chosen = 0x102; /* near=2, far=1 */
+		if (verbose > 0)
+			pr_err("layout defaults to n2\n");
+	} else if (level == 5 || level == 6) {
+		chosen = map_name(r5layout, "default");
+		if (verbose > 0)
+			pr_err("layout defaults to %s\n", map_num(r5layout, chosen));
+	} else if (level == LEVEL_FAULTY) {
+		chosen = map_name(faultylayout, "default");
+		if (verbose > 0)
+			pr_err("layout defaults to %s\n", map_num(faultylayout, chosen));
+	} else {
+		/* no layout */
+		chosen = 0;
+	}
+
+	return chosen;
+}
+
+/*
+ * Create() - create a new md array and, when possible, start it.
+ *
+ * st:          metadata handler to use; may be NULL, in which case a
+ *              handler is taken from a container, from the config file's
+ *              create info, or from the "default" entry of superlist[].
+ * mddev:       device name handed to create_mddev(); replaced locally by
+ *              the name create_mddev() actually chose.
+ * name:        array name; when NULL or empty it is derived from mddev.
+ * uuid:        passed through to ->init_super().
+ * subdevs:     number of entries in devlist.
+ * devlist:     component devices; an entry named "missing" leaves a slot
+ *              empty.  With data_offset == VARIABLE_OFFSET a device name
+ *              may carry a ":offset" suffix, which is split off in place.
+ * s:           requested shape (level, layout, chunk, size, disk counts,
+ *              bitmap, consistency policy ...); several fields are
+ *              filled in with defaults here.
+ * c:           run-time context (verbose, runstop, force, readonly ...).
+ * data_offset: data offset for every device, or VARIABLE_OFFSET.
+ *
+ * Returns 0 on success and 1 on failure.  A few early per-device failures
+ * (cannot open, not a block device, no usable metadata) call exit(2)
+ * instead of returning.
+ */
+int Create(struct supertype *st, char *mddev,
+	   char *name, int *uuid,
+	   int subdevs, struct mddev_dev *devlist,
+	   struct shape *s,
+	   struct context *c, unsigned long long data_offset)
+{
+	/*
+	 * Create a new raid array.
+	 *
+	 * First check that necessary details are available
+	 * (i.e. level, raid-disks)
+	 *
+	 * Then check each disk to see what might be on it
+	 * and report anything interesting.
+	 *
+	 * If anything looks odd, and runstop not set,
+	 * abort.
+	 *
+	 * SET_ARRAY_INFO and ADD_NEW_DISK, and
+	 * if runstop==run, or raiddisks disks were used,
+	 * RUN_ARRAY
+	 */
+	int mdfd;
+	unsigned long long minsize = 0, maxsize = 0;
+	char *mindisc = NULL;
+	char *maxdisc = NULL;
+	int dnum, raid_disk_num;
+	struct mddev_dev *dv;
+	dev_t rdev;
+	int fail = 0, warn = 0;
+	int first_missing = subdevs * 2;
+	int second_missing = subdevs * 2;
+	int missing_disks = 0;
+	int insert_point = subdevs * 2; /* where to insert a missing drive */
+	int total_slots;
+	int pass;
+	int rv;
+	int bitmap_fd;
+	int have_container = 0;
+	int container_fd = -1;
+	int need_mdmon = 0;
+	unsigned long long bitmapsize;
+	struct mdinfo info;
+	struct mdinfo *infos = NULL; /* NULL until allocated so the abort path can free() it */
+	int did_default = 0;
+	int do_default_layout = 0;
+	int do_default_chunk = 0;
+	unsigned long safe_mode_delay = 0;
+	char chosen_name[1024];
+	struct map_ent *map = NULL;
+	unsigned long long newsize;
+	mdu_array_info_t inf;
+
+	int major_num = BITMAP_MAJOR_HI;
+	if (s->bitmap_file && strcmp(s->bitmap_file, "clustered") == 0) {
+		major_num = BITMAP_MAJOR_CLUSTERED;
+		if (c->nodes <= 1) {
+			pr_err("At least 2 nodes are needed for cluster-md\n");
+			return 1;
+		}
+	}
+
+	memset(&info, 0, sizeof(info));
+	if (s->level == UnSet && st && st->ss->default_geometry)
+		st->ss->default_geometry(st, &s->level, NULL, NULL);
+	if (s->level == UnSet) {
+		pr_err("a RAID level is needed to create an array.\n");
+		return 1;
+	}
+	if (s->raiddisks < 4 && s->level == 6) {
+		pr_err("at least 4 raid-devices needed for level 6\n");
+		return 1;
+	}
+	if (s->raiddisks > 256 && s->level == 6) {
+		pr_err("no more than 256 raid-devices supported for level 6\n");
+		return 1;
+	}
+	if (s->raiddisks < 2 && s->level >= 4) {
+		pr_err("at least 2 raid-devices needed for level %d\n", s->level);
+		return 1;
+	}
+	if (s->level <= 0 && s->sparedisks) {
+		pr_err("This level does not support spare devices\n");
+		return 1;
+	}
+
+	if (subdevs == 1 && strcmp(devlist->devname, "missing") != 0) {
+		/* If given a single device, it might be a container, and we can
+		 * extract a device list from there
+		 */
+		int fd;
+
+		memset(&inf, 0, sizeof(inf));
+		fd = open(devlist->devname, O_RDONLY);
+		if (fd >= 0 &&
+		    md_get_array_info(fd, &inf) == 0 && inf.raid_disks == 0) {
+			/* yep, looks like a container */
+			if (st) {
+				rv = st->ss->load_container(st, fd,
+							    devlist->devname);
+				if (rv == 0)
+					have_container = 1;
+			} else {
+				st = super_by_fd(fd, NULL);
+				if (st && !(rv = st->ss->
+					    load_container(st, fd,
+							   devlist->devname)))
+					have_container = 1;
+				else
+					st = NULL;
+			}
+			if (have_container) {
+				/* the container supplies every member device */
+				subdevs = s->raiddisks;
+				first_missing = subdevs * 2;
+				second_missing = subdevs * 2;
+				insert_point = subdevs * 2;
+			}
+		}
+		if (fd >= 0)
+			close(fd);
+	}
+	if (st && st->ss->external && s->sparedisks) {
+		pr_err("This metadata type does not support spare disks at create time\n");
+		return 1;
+	}
+	if (subdevs > s->raiddisks+s->sparedisks+s->journaldisks) {
+		/* report the same total that the check above compares against */
+		pr_err("You have listed more devices (%d) than are in the array(%d)!\n", subdevs, s->raiddisks+s->sparedisks+s->journaldisks);
+		return 1;
+	}
+	if (!have_container && subdevs < s->raiddisks+s->sparedisks+s->journaldisks) {
+		pr_err("You haven't given enough devices (real or missing) to create this array\n");
+		return 1;
+	}
+	if (s->bitmap_file && s->level <= 0) {
+		pr_err("bitmaps not meaningful with level %s\n",
+		       map_num(pers, s->level)?:"given");
+		return 1;
+	}
+
+	/* now set some defaults */
+
+	if (s->layout == UnSet) {
+		do_default_layout = 1;
+		s->layout = default_layout(st, s->level, c->verbose);
+	}
+
+	if (s->level == 10)
+		/* check layout fits in array*/
+		if ((s->layout&255) * ((s->layout>>8)&255) > s->raiddisks) {
+			pr_err("that layout requires at least %d devices\n",
+			       (s->layout&255) * ((s->layout>>8)&255));
+			return 1;
+		}
+
+	switch(s->level) {
+	case 4:
+	case 5:
+	case 10:
+	case 6:
+	case 0:
+		if (s->chunk == 0 || s->chunk == UnSet) {
+			s->chunk = UnSet;
+			do_default_chunk = 1;
+			/* chunk will be set later */
+		}
+		break;
+	case LEVEL_LINEAR:
+		/* a chunksize of zero is perfectly valid (and preferred) since 2.6.16 */
+		if (get_linux_version() < 2006016 && s->chunk == 0) {
+			s->chunk = 64;
+			if (c->verbose > 0)
+				pr_err("chunk size defaults to 64K\n");
+		}
+		break;
+	case 1:
+	case LEVEL_FAULTY:
+	case LEVEL_MULTIPATH:
+	case LEVEL_CONTAINER:
+		if (s->chunk) {
+			pr_err("specifying chunk size is forbidden for this level\n");
+			return 1;
+		}
+		break;
+	default:
+		pr_err("unknown level %d\n", s->level);
+		return 1;
+	}
+
+	if (s->size == MAX_SIZE)
+		/* use '0' to mean 'max' now... */
+		s->size = 0;
+	if (s->size && s->chunk && s->chunk != UnSet)
+		if (round_size_and_verify(&s->size, s->chunk))
+			return 1;
+
+	/* s->size is in K; validate_geometry() is given twice that value */
+	newsize = s->size * 2;
+	if (st && ! st->ss->validate_geometry(st, s->level, s->layout, s->raiddisks,
+					      &s->chunk, s->size*2,
+					      data_offset, NULL,
+					      &newsize, s->consistency_policy,
+					      c->verbose >= 0))
+		return 1;
+
+	if (s->chunk && s->chunk != UnSet) {
+		newsize &= ~(unsigned long long)(s->chunk*2 - 1);
+		if (do_default_chunk) {
+			/* default chunk was just set */
+			if (c->verbose > 0)
+				pr_err("chunk size defaults to %dK\n", s->chunk);
+			if (round_size_and_verify(&s->size, s->chunk))
+				return 1;
+			do_default_chunk = 0;
+		}
+	}
+
+	if (s->size == 0) {
+		s->size = newsize / 2;
+		if (s->level == 1)
+			/* If this is ever reshaped to RAID5, we will
+			 * need a chunksize. So round it off a bit
+			 * now just to be safe
+			 */
+			s->size &= ~(64ULL-1);
+
+		if (s->size && c->verbose > 0)
+			pr_err("setting size to %lluK\n", s->size);
+	}
+
+	/* now look at the subdevs */
+	info.array.active_disks = 0;
+	info.array.working_disks = 0;
+	dnum = 0;
+	for (dv = devlist; dv; dv = dv->next)
+		if (data_offset == VARIABLE_OFFSET)
+			dv->data_offset = INVALID_SECTORS;
+		else
+			dv->data_offset = data_offset;
+
+	for (dv=devlist; dv && !have_container; dv=dv->next, dnum++) {
+		char *dname = dv->devname;
+		unsigned long long freesize;
+		int dfd;
+		char *doff;
+
+		if (strcasecmp(dname, "missing") == 0) {
+			if (first_missing > dnum)
+				first_missing = dnum;
+			if (second_missing > dnum && dnum > first_missing)
+				second_missing = dnum;
+			missing_disks ++;
+			continue;
+		}
+		if (data_offset == VARIABLE_OFFSET) {
+			/* "device:offset" - cut the name at the ':' */
+			doff = strchr(dname, ':');
+			if (doff) {
+				*doff++ = 0;
+				dv->data_offset = parse_size(doff);
+			} else
+				dv->data_offset = INVALID_SECTORS;
+		} else
+			dv->data_offset = data_offset;
+
+		dfd = open(dname, O_RDONLY);
+		if (dfd < 0) {
+			pr_err("cannot open %s: %s\n",
+			       dname, strerror(errno));
+			exit(2);
+		}
+		if (!fstat_is_blkdev(dfd, dname, NULL)) {
+			close(dfd);
+			exit(2);
+		}
+		close(dfd);
+		info.array.working_disks++;
+		if (dnum < s->raiddisks && dv->disposition != 'j')
+			info.array.active_disks++;
+		if (st == NULL) {
+			struct createinfo *ci = conf_get_create_info();
+			if (ci)
+				st = ci->supertype;
+		}
+		if (st == NULL) {
+			/* Need to choose a default metadata, which is different
+			 * depending on geometry of array.
+			 */
+			int i;
+			char *name = "default";
+			for(i = 0; !st && superlist[i]; i++) {
+				st = superlist[i]->match_metadata_desc(name);
+				if (!st)
+					continue;
+				if (do_default_layout)
+					s->layout = default_layout(st, s->level, c->verbose);
+				switch (st->ss->validate_geometry(
+						st, s->level, s->layout, s->raiddisks,
+						&s->chunk, s->size*2,
+						dv->data_offset, dname,
+						&freesize, s->consistency_policy,
+						c->verbose > 0)) {
+				case -1: /* Not valid, message printed, and not
+					  * worth checking any further */
+					exit(2);
+					break;
+				case 0: /* Geometry not valid */
+					free(st);
+					st = NULL;
+					s->chunk = do_default_chunk ? UnSet : s->chunk;
+					break;
+				case 1: /* All happy */
+					break;
+				}
+			}
+
+			if (!st) {
+				int dfd = open(dname, O_RDONLY|O_EXCL);
+				if (dfd < 0) {
+					pr_err("cannot open %s: %s\n",
+					       dname, strerror(errno));
+					exit(2);
+				}
+				pr_err("device %s not suitable for any style of array\n",
+				       dname);
+				exit(2);
+			}
+			if (st->ss != &super0 ||
+			    st->minor_version != 90)
+				did_default = 1;
+		} else {
+			if (do_default_layout)
+				s->layout = default_layout(st, s->level, 0);
+			if (!st->ss->validate_geometry(st, s->level, s->layout,
+						       s->raiddisks,
+						       &s->chunk, s->size*2,
+						       dv->data_offset,
+						       dname, &freesize,
+						       s->consistency_policy,
+						       c->verbose >= 0)) {
+
+				pr_err("%s is not suitable for this array.\n",
+				       dname);
+				fail = 1;
+				continue;
+			}
+		}
+
+		if (dv->disposition == 'j')
+			goto skip_size_check; /* skip write journal for size check */
+
+		freesize /= 2; /* convert to K */
+		if (s->chunk && s->chunk != UnSet) {
+			/* round to chunk size */
+			freesize = freesize & ~(s->chunk-1);
+			if (do_default_chunk) {
+				/* default chunk was just set */
+				if (c->verbose > 0)
+					pr_err("chunk size defaults to %dK\n", s->chunk);
+				if (round_size_and_verify(&s->size, s->chunk))
+					return 1;
+				do_default_chunk = 0;
+			}
+		}
+		if (!freesize) {
+			pr_err("no free space left on %s\n", dname);
+			fail = 1;
+			continue;
+		}
+
+		if (s->size && freesize < s->size) {
+			pr_err("%s is smaller than given size. %lluK < %lluK + metadata\n",
+			       dname, freesize, s->size);
+			fail = 1;
+			continue;
+		}
+		/* track the largest and smallest usable device */
+		if (maxdisc == NULL || (maxdisc && freesize > maxsize)) {
+			maxdisc = dname;
+			maxsize = freesize;
+		}
+		if (mindisc ==NULL || (mindisc && freesize < minsize)) {
+			mindisc = dname;
+			minsize = freesize;
+		}
+ skip_size_check:
+		if (c->runstop != 1 || c->verbose >= 0) {
+			int fd = open(dname, O_RDONLY);
+			if (fd < 0) {
+				pr_err("Cannot open %s: %s\n",
+				       dname, strerror(errno));
+				fail = 1;
+				continue;
+			}
+			warn |= check_ext2(fd, dname);
+			warn |= check_reiser(fd, dname);
+			warn |= check_raid(fd, dname);
+			if (strcmp(st->ss->name, "1.x") == 0 &&
+			    st->minor_version >= 1)
+				/* metadata at front */
+				warn |= check_partitions(fd, dname, 0, 0);
+			else if (s->level == 1 || s->level == LEVEL_CONTAINER ||
+				 (s->level == 0 && s->raiddisks == 1))
+				/* partitions could be meaningful */
+				warn |= check_partitions(fd, dname, freesize*2, s->size*2);
+			else
+				/* partitions cannot be meaningful */
+				warn |= check_partitions(fd, dname, 0, 0);
+			/* bit 1024 of warn ensures this note is printed only once */
+			if (strcmp(st->ss->name, "1.x") == 0 &&
+			    st->minor_version >= 1 &&
+			    did_default &&
+			    s->level == 1 &&
+			    (warn & 1024) == 0) {
+				warn |= 1024;
+				pr_err("Note: this array has metadata at the start and\n"
+				       " may not be suitable as a boot device. If you plan to\n"
+				       " store '/boot' on this device please ensure that\n"
+				       " your boot-loader understands md/v1.x metadata, or use\n"
+				       " --metadata=0.90\n");
+			}
+			close(fd);
+		}
+	}
+	if (missing_disks == dnum && !have_container) {
+		pr_err("Subdevs can't be all missing\n");
+		return 1;
+	}
+	if (s->raiddisks + s->sparedisks > st->max_devs) {
+		pr_err("Too many devices: %s metadata only supports %d\n",
+		       st->ss->name, st->max_devs);
+		return 1;
+	}
+	if (have_container)
+		info.array.working_disks = s->raiddisks;
+	if (fail) {
+		pr_err("create aborted\n");
+		return 1;
+	}
+	if (s->size == 0) {
+		if (mindisc == NULL && !have_container) {
+			pr_err("no size and no drives given - aborting create.\n");
+			return 1;
+		}
+		if (s->level > 0 || s->level == LEVEL_MULTIPATH ||
+		    s->level == LEVEL_FAULTY || st->ss->external) {
+			/* size is meaningful */
+			if (!st->ss->validate_geometry(st, s->level, s->layout,
+						       s->raiddisks,
+						       &s->chunk, minsize*2,
+						       data_offset,
+						       NULL, NULL,
+						       s->consistency_policy, 0)) {
+				pr_err("devices too large for RAID level %d\n", s->level);
+				return 1;
+			}
+			s->size = minsize;
+			if (s->level == 1)
+				/* If this is ever reshaped to RAID5, we will
+				 * need a chunksize. So round it off a bit
+				 * now just to be safe
+				 */
+				s->size &= ~(64ULL-1);
+			if (c->verbose > 0)
+				pr_err("size set to %lluK\n", s->size);
+		}
+	}
+
+	if (!s->bitmap_file &&
+	    !st->ss->external &&
+	    s->level >= 1 &&
+	    st->ss->add_internal_bitmap &&
+	    s->journaldisks == 0 &&
+	    (s->consistency_policy != CONSISTENCY_POLICY_RESYNC &&
+	     s->consistency_policy != CONSISTENCY_POLICY_PPL) &&
+	    (s->write_behind || s->size > 100*1024*1024ULL)) {
+		if (c->verbose > 0)
+			pr_err("automatically enabling write-intent bitmap on large array\n");
+		s->bitmap_file = "internal";
+	}
+	if (s->bitmap_file && strcmp(s->bitmap_file, "none") == 0)
+		s->bitmap_file = NULL;
+
+	if (s->consistency_policy == CONSISTENCY_POLICY_PPL &&
+	    !st->ss->write_init_ppl) {
+		pr_err("%s metadata does not support PPL\n", st->ss->name);
+		return 1;
+	}
+
+	if (!have_container && s->level > 0 && ((maxsize-s->size)*100 > maxsize)) {
+		if (c->runstop != 1 || c->verbose >= 0)
+			pr_err("largest drive (%s) exceeds size (%lluK) by more than 1%%\n",
+			       maxdisc, s->size);
+		warn = 1;
+	}
+
+	if (st->ss->detail_platform && st->ss->detail_platform(0, 1, NULL) != 0) {
+		if (c->runstop != 1 || c->verbose >= 0)
+			pr_err("%s unable to enumerate platform support\n"
+			       " array may not be compatible with hardware/firmware\n",
+			       st->ss->name);
+		warn = 1;
+	}
+	st->nodes = c->nodes;
+	st->cluster_name = c->homecluster;
+
+	if (warn) {
+		if (c->runstop!= 1) {
+			if (!ask("Continue creating array? ")) {
+				pr_err("create aborted.\n");
+				return 1;
+			}
+		} else {
+			if (c->verbose > 0)
+				pr_err("creation continuing despite oddities due to --run\n");
+		}
+	}
+
+	/* If this is raid4/5, we want to configure the last active slot
+	 * as missing, so that a reconstruct happens (faster than re-parity)
+	 * FIX: Can we do this for raid6 as well?
+	 */
+	if (st->ss->external == 0 && s->assume_clean == 0 &&
+	    c->force == 0 && first_missing >= s->raiddisks) {
+		switch (s->level) {
+		case 4:
+		case 5:
+			insert_point = s->raiddisks-1;
+			s->sparedisks++;
+			info.array.active_disks--;
+			missing_disks++;
+			break;
+		default:
+			break;
+		}
+	}
+	/* For raid6, if creating with 1 missing drive, make a good drive
+	 * into a spare, else the create will fail
+	 */
+	if (s->assume_clean == 0 && c->force == 0 && first_missing < s->raiddisks &&
+	    st->ss->external == 0 &&
+	    second_missing >= s->raiddisks && s->level == 6) {
+		insert_point = s->raiddisks - 1;
+		if (insert_point == first_missing)
+			insert_point--;
+		s->sparedisks ++;
+		info.array.active_disks--;
+		missing_disks++;
+	}
+
+	if (s->level <= 0 && first_missing < subdevs * 2) {
+		pr_err("This level does not support missing devices\n");
+		return 1;
+	}
+
+	/* We need to create the device */
+	map_lock(&map);
+	mdfd = create_mddev(mddev, name, c->autof, LOCAL, chosen_name, 1);
+	if (mdfd < 0) {
+		map_unlock(&map);
+		return 1;
+	}
+	/* verify if chosen_name is not in use,
+	 * it could be in conflict with already existing device
+	 * e.g. container, array
+	 */
+	if (strncmp(chosen_name, "/dev/md/", 8) == 0 &&
+	    map_by_name(&map, chosen_name+8) != NULL) {
+		pr_err("Array name %s is in use already.\n",
+		       chosen_name);
+		close(mdfd);
+		map_unlock(&map);
+		udev_unblock();
+		return 1;
+	}
+	mddev = chosen_name;
+
+	memset(&inf, 0, sizeof(inf));
+	md_get_array_info(mdfd, &inf);
+	if (inf.working_disks != 0) {
+		pr_err("another array by this name is already running.\n");
+		goto abort_locked;
+	}
+
+	/* Ok, lets try some ioctls */
+
+	info.array.level = s->level;
+	info.array.size = s->size;
+	info.array.raid_disks = s->raiddisks;
+	/* The kernel should *know* what md_minor we are dealing
+	 * with, but it chooses to trust me instead. Sigh
+	 */
+	info.array.md_minor = 0;
+	if (fstat_is_blkdev(mdfd, mddev, &rdev))
+		info.array.md_minor = minor(rdev);
+	info.array.not_persistent = 0;
+
+	if (((s->level == 4 || s->level == 5) &&
+	     (insert_point < s->raiddisks || first_missing < s->raiddisks)) ||
+	    (s->level == 6 && (insert_point < s->raiddisks ||
+			       second_missing < s->raiddisks)) ||
+	    (s->level <= 0) || s->assume_clean) {
+		info.array.state = 1; /* clean, but one+ drive will be missing*/
+		info.resync_start = MaxSector;
+	} else {
+		info.array.state = 0; /* not clean, but no errors */
+		info.resync_start = 0;
+	}
+	if (s->level == 10) {
+		/* for raid10, the bitmap size is the capacity of the array,
+		 * which is array.size * raid_disks / ncopies;
+		 * .. but convert to sectors.
+		 */
+		int ncopies = ((s->layout>>8) & 255) * (s->layout & 255);
+		bitmapsize = s->size * s->raiddisks / ncopies * 2;
+	} else
+		bitmapsize = s->size * 2;
+
+	/* There is lots of redundancy in these disk counts,
+	 * raid_disks is the most meaningful value
+	 * it describes the geometry of the array
+	 * it is constant
+	 * nr_disks is total number of used slots.
+	 * it should be raid_disks+spare_disks
+	 * spare_disks is the number of extra disks present
+	 * see above
+	 * active_disks is the number of working disks in
+	 * active slots. (With raid_disks)
+	 * working_disks is the total number of working disks,
+	 * including spares
+	 * failed_disks is the number of disks marked failed
+	 *
+	 * Ideally, the kernel would keep these (except raid_disks)
+	 * up-to-date as we ADD_NEW_DISK, but it doesn't (yet).
+	 * So for now, we assume that all raid and spare
+	 * devices will be given.
+	 */
+	info.array.spare_disks=s->sparedisks;
+	info.array.failed_disks=missing_disks;
+	info.array.nr_disks = info.array.working_disks
+		+ info.array.failed_disks;
+	info.array.layout = s->layout;
+	info.array.chunk_size = s->chunk*1024;
+
+	if (name == NULL || *name == 0) {
+		/* base name on mddev */
+		/* /dev/md0 -> 0
+		 * /dev/md_d0 -> d0
+		 * /dev/md_foo -> foo
+		 * /dev/md/1 -> 1
+		 * /dev/md/d1 -> d1
+		 * /dev/md/home -> home
+		 * /dev/mdhome -> home
+		 */
+		/* FIXME compare this with rules in create_mddev */
+		name = strrchr(mddev, '/');
+		if (name) {
+			name++;
+			if (strncmp(name, "md_", 3) == 0 &&
+			    strlen(name) > 3 && (name-mddev) == 5 /* /dev/ */)
+				name += 3;
+			else if (strncmp(name, "md", 2) == 0 &&
+				 strlen(name) > 2 && isdigit(name[2]) &&
+				 (name-mddev) == 5 /* /dev/ */)
+				name += 2;
+		}
+	}
+	if (!st->ss->init_super(st, &info.array, s, name, c->homehost, uuid,
+				data_offset))
+		goto abort_locked;
+
+	total_slots = info.array.nr_disks;
+	st->ss->getinfo_super(st, &info, NULL);
+	if (sysfs_init(&info, mdfd, NULL)) {
+		pr_err("unable to initialize sysfs\n");
+		goto abort_locked;
+	}
+
+	if (did_default && c->verbose >= 0) {
+		if (is_subarray(info.text_version)) {
+			char devnm[32];
+			char *ep;
+			struct mdinfo *mdi;
+
+			/* copy text_version from its second character and
+			 * cut the copy at the next '/' to get the container name
+			 */
+			strncpy(devnm, info.text_version+1, 32);
+			devnm[31] = 0;
+			ep = strchr(devnm, '/');
+			if (ep)
+				*ep = 0;
+
+			mdi = sysfs_read(-1, devnm, GET_VERSION);
+
+			pr_err("Creating array inside %s container %s\n",
+			       mdi?mdi->text_version:"managed", devnm);
+			sysfs_free(mdi);
+		} else
+			pr_err("Defaulting to version %s metadata\n", info.text_version);
+	}
+
+	map_update(&map, fd2devnm(mdfd), info.text_version,
+		   info.uuid, chosen_name);
+	/* Keep map locked until devices have been added to array
+	 * to stop another mdadm from finding and using those devices.
+	 */
+
+	if (s->bitmap_file && (strcmp(s->bitmap_file, "internal") == 0 ||
+			       strcmp(s->bitmap_file, "clustered") == 0)) {
+		if (!st->ss->add_internal_bitmap) {
+			pr_err("internal bitmaps not supported with %s metadata\n",
+			       st->ss->name);
+			goto abort_locked;
+		}
+		if (st->ss->add_internal_bitmap(st, &s->bitmap_chunk,
+						c->delay, s->write_behind,
+						bitmapsize, 1, major_num)) {
+			pr_err("Given bitmap chunk size not supported.\n");
+			goto abort_locked;
+		}
+		/* handled in the metadata - no bitmap file to create below */
+		s->bitmap_file = NULL;
+	}
+
+	if (sysfs_init(&info, mdfd, NULL)) {
+		pr_err("unable to initialize sysfs\n");
+		goto abort_locked;
+	}
+
+	if (st->ss->external && st->container_devnm[0]) {
+		/* member */
+
+		/* When creating a member, we need to be careful
+		 * to negotiate with mdmon properly.
+		 * If it is already running, we cannot write to
+		 * the devices and must ask it to do that part.
+		 * If it isn't running, we write to the devices,
+		 * and then start it.
+		 * We hold an exclusive open on the container
+		 * device to make sure mdmon doesn't exit after
+		 * we checked that it is running.
+		 *
+		 * For now, fail if it is already running.
+		 */
+		container_fd = open_dev_excl(st->container_devnm);
+		if (container_fd < 0) {
+			pr_err("Cannot get exclusive open on container - weird.\n");
+			goto abort_locked;
+		}
+		if (mdmon_running(st->container_devnm)) {
+			if (c->verbose)
+				pr_err("reusing mdmon for %s.\n",
+				       st->container_devnm);
+			st->update_tail = &st->updates;
+		} else
+			need_mdmon = 1;
+	}
+	rv = set_array_info(mdfd, st, &info);
+	if (rv) {
+		pr_err("failed to set array info for %s: %s\n",
+		       mddev, strerror(errno));
+		goto abort_locked;
+	}
+
+	if (s->bitmap_file) {
+		int uuid[4];
+
+		st->ss->uuid_from_super(st, uuid);
+		if (CreateBitmap(s->bitmap_file, c->force, (char*)uuid, s->bitmap_chunk,
+				 c->delay, s->write_behind,
+				 bitmapsize,
+				 major_num)) {
+			goto abort_locked;
+		}
+		bitmap_fd = open(s->bitmap_file, O_RDWR);
+		if (bitmap_fd < 0) {
+			pr_err("weird: %s cannot be opened\n",
+			       s->bitmap_file);
+			goto abort_locked;
+		}
+		if (ioctl(mdfd, SET_BITMAP_FILE, bitmap_fd) < 0) {
+			pr_err("Cannot set bitmap file for %s: %s\n",
+			       mddev, strerror(errno));
+			goto abort_locked;
+		}
+	}
+
+	infos = xmalloc(sizeof(*infos) * total_slots);
+	enable_fds(total_slots);
+	/* pass 1 adds every device to the metadata, pass 2 adds it to the array */
+	for (pass = 1; pass <= 2; pass++) {
+		struct mddev_dev *moved_disk = NULL; /* the disk that was moved out of the insert point */
+
+		for (dnum = 0, raid_disk_num = 0, dv = devlist; dv;
+		     dv = (dv->next) ? (dv->next) : moved_disk, dnum++) {
+			int fd;
+			struct mdinfo *inf = &infos[dnum];
+
+			if (dnum >= total_slots)
+				abort();
+			if (dnum == insert_point) {
+				/* leave this slot empty; the device is handled
+				 * again once the end of the list is reached
+				 */
+				raid_disk_num += 1;
+				moved_disk = dv;
+				continue;
+			}
+			if (strcasecmp(dv->devname, "missing") == 0) {
+				raid_disk_num += 1;
+				continue;
+			}
+			if (have_container)
+				moved_disk = NULL;
+			if (have_container && dnum < info.array.raid_disks - 1)
+				/* repeatedly use the container */
+				moved_disk = dv;
+
+			switch(pass) {
+			case 1:
+				*inf = info;
+
+				inf->disk.number = dnum;
+				inf->disk.raid_disk = raid_disk_num++;
+
+				if (dv->disposition == 'j') {
+					/* a journal device does not consume a raid slot */
+					inf->disk.raid_disk = MD_DISK_ROLE_JOURNAL;
+					inf->disk.state = (1<<MD_DISK_JOURNAL);
+					raid_disk_num--;
+				} else if (inf->disk.raid_disk < s->raiddisks)
+					inf->disk.state = (1<<MD_DISK_ACTIVE) |
+						(1<<MD_DISK_SYNC);
+				else
+					inf->disk.state = 0;
+
+				if (dv->writemostly == FlagSet) {
+					if (major_num == BITMAP_MAJOR_CLUSTERED) {
+						pr_err("Can not set %s --write-mostly with a clustered bitmap\n",dv->devname);
+						goto abort_locked;
+					} else
+						inf->disk.state |= (1<<MD_DISK_WRITEMOSTLY);
+				}
+				if (dv->failfast == FlagSet)
+					inf->disk.state |= (1<<MD_DISK_FAILFAST);
+
+				if (have_container)
+					fd = -1;
+				else {
+					if (st->ss->external &&
+					    st->container_devnm[0])
+						fd = open(dv->devname, O_RDWR);
+					else
+						fd = open(dv->devname, O_RDWR|O_EXCL);
+
+					if (fd < 0) {
+						pr_err("failed to open %s after earlier success - aborting\n",
+						       dv->devname);
+						goto abort_locked;
+					}
+					if (!fstat_is_blkdev(fd, dv->devname, &rdev)) {
+						/* Leave through the common error exit:
+						 * the map is locked and already holds an
+						 * entry for this array, and mdfd, infos
+						 * and this fd must all be released.
+						 */
+						close(fd);
+						goto abort_locked;
+					}
+					inf->disk.major = major(rdev);
+					inf->disk.minor = minor(rdev);
+				}
+				if (fd >= 0)
+					remove_partitions(fd);
+				if (st->ss->add_to_super(st, &inf->disk,
+							 fd, dv->devname,
+							 dv->data_offset)) {
+					ioctl(mdfd, STOP_ARRAY, NULL);
+					goto abort_locked;
+				}
+				st->ss->getinfo_super(st, inf, NULL);
+				safe_mode_delay = inf->safe_mode_delay;
+
+				if (have_container && c->verbose > 0)
+					pr_err("Using %s for device %d\n",
+					       map_dev(inf->disk.major,
+						       inf->disk.minor,
+						       0), dnum);
+
+				if (!have_container) {
+					/* getinfo_super might have lost these ... */
+					inf->disk.major = major(rdev);
+					inf->disk.minor = minor(rdev);
+				}
+				break;
+			case 2:
+				inf->errors = 0;
+
+				rv = add_disk(mdfd, st, &info, inf);
+
+				if (rv) {
+					pr_err("ADD_NEW_DISK for %s failed: %s\n",
+					       dv->devname, strerror(errno));
+					if (errno == EINVAL &&
+					    info.array.level == 0) {
+						pr_err("Possibly your kernel doesn't support RAID0 layouts.\n");
+						pr_err("Either upgrade, or use --layout=dangerous\n");
+					}
+					goto abort_locked;
+				}
+				break;
+			}
+			/* the moved disk has now been handled - the list is done */
+			if (!have_container &&
+			    dv == moved_disk && dnum != insert_point) break;
+		}
+		if (pass == 1) {
+			struct mdinfo info_new;
+			struct map_ent *me = NULL;
+
+			/* check to see if the uuid has changed due to these
+			 * metadata changes, and if so update the member array
+			 * and container uuid. Note ->write_init_super clears
+			 * the subarray cursor such that ->getinfo_super once
+			 * again returns container info.
+			 */
+			st->ss->getinfo_super(st, &info_new, NULL);
+			if (st->ss->external && s->level != LEVEL_CONTAINER &&
+			    !same_uuid(info_new.uuid, info.uuid, 0)) {
+				map_update(&map, fd2devnm(mdfd),
+					   info_new.text_version,
+					   info_new.uuid, chosen_name);
+				me = map_by_devnm(&map, st->container_devnm);
+			}
+
+			if (st->ss->write_init_super(st)) {
+				st->ss->free_super(st);
+				goto abort_locked;
+			}
+			/*
+			 * Before activating the array, perform extra steps
+			 * required to configure the internal write-intent
+			 * bitmap.
+			 */
+			if (info_new.consistency_policy ==
+			    CONSISTENCY_POLICY_BITMAP &&
+			    st->ss->set_bitmap &&
+			    st->ss->set_bitmap(st, &info)) {
+				st->ss->free_super(st);
+				goto abort_locked;
+			}
+
+			/* update parent container uuid */
+			if (me) {
+				char *path = xstrdup(me->path);
+
+				st->ss->getinfo_super(st, &info_new, NULL);
+				map_update(&map, st->container_devnm,
+					   info_new.text_version,
+					   info_new.uuid, path);
+				free(path);
+			}
+
+			flush_metadata_updates(st);
+			st->ss->free_super(st);
+		}
+	}
+	map_unlock(&map);
+	free(infos);
+	infos = NULL; /* the abort path below frees infos again */
+
+	if (s->level == LEVEL_CONTAINER) {
+		/* No need to start. But we should signal udev to
+		 * create links */
+		sysfs_uevent(&info, "change");
+		if (c->verbose >= 0)
+			pr_err("container %s prepared.\n", mddev);
+		wait_for(chosen_name, mdfd);
+	} else if (c->runstop == 1 || subdevs >= s->raiddisks) {
+		if (st->ss->external) {
+			int err;
+			switch(s->level) {
+			case LEVEL_LINEAR:
+			case LEVEL_MULTIPATH:
+			case 0:
+				err = sysfs_set_str(&info, NULL, "array_state",
+						    c->readonly
+						    ? "readonly"
+						    : "active");
+				need_mdmon = 0;
+				break;
+			default:
+				err = sysfs_set_str(&info, NULL, "array_state",
+						    "readonly");
+				break;
+			}
+			sysfs_set_safemode(&info, safe_mode_delay);
+			if (err) {
+				pr_err("failed to activate array.\n");
+				ioctl(mdfd, STOP_ARRAY, NULL);
+				goto abort;
+			}
+		} else if (c->readonly &&
+			   sysfs_attribute_available(
+				   &info, NULL, "array_state")) {
+			if (sysfs_set_str(&info, NULL,
+					  "array_state", "readonly") < 0) {
+				pr_err("Failed to start array: %s\n",
+				       strerror(errno));
+				ioctl(mdfd, STOP_ARRAY, NULL);
+				goto abort;
+			}
+		} else {
+			/* param is not actually used */
+			mdu_param_t param;
+			if (ioctl(mdfd, RUN_ARRAY, &param)) {
+				pr_err("RUN_ARRAY failed: %s\n",
+				       strerror(errno));
+				if (errno == 524 /* ENOTSUP */ &&
+				    info.array.level == 0)
+					cont_err("Please use --layout=original or --layout=alternate\n");
+				if (info.array.chunk_size & (info.array.chunk_size-1)) {
+					cont_err("Problem may be that chunk size is not a power of 2\n");
+				}
+				ioctl(mdfd, STOP_ARRAY, NULL);
+				goto abort;
+			}
+			/* if start_ro module parameter is set, array is
+			 * auto-read-only, which is bad as the resync won't
+			 * start. So lets make it read-write now.
+			 */
+			ioctl(mdfd, RESTART_ARRAY_RW, NULL);
+		}
+		if (c->verbose >= 0)
+			pr_err("array %s started.\n", mddev);
+		if (st->ss->external && st->container_devnm[0]) {
+			if (need_mdmon)
+				start_mdmon(st->container_devnm);
+
+			ping_monitor(st->container_devnm);
+			close(container_fd);
+		}
+		wait_for(chosen_name, mdfd);
+	} else {
+		pr_err("not starting array - not enough devices.\n");
+	}
+	udev_unblock();
+	close(mdfd);
+	sysfs_uevent(&info, "change");
+	return 0;
+
+ abort:
+	/* reached after the map lock was dropped: retake it for map_remove() */
+	udev_unblock();
+	map_lock(&map);
+ abort_locked:
+	/* NOTE(review): unlike the "name is in use" exit, jumps that land
+	 * directly on abort_locked do not call udev_unblock() - confirm
+	 * whether that is intended.
+	 */
+	map_remove(&map, fd2devnm(mdfd));
+	map_unlock(&map);
+
+	free(infos);
+	/* no jump to these labels happens after the close(container_fd) above */
+	if (container_fd >= 0)
+		close(container_fd);
+	if (mdfd >= 0)
+		close(mdfd);
+	return 1;
+}
diff --git a/Detail.c b/Detail.c
new file mode 100644
index 0000000..95d4cc7
--- /dev/null
+++ b/Detail.c
@@ -0,0 +1,879 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2001-2013 Neil Brown <neilb@suse.de>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email: <neilb@suse.de>
+ */
+
+#include "mdadm.h"
+#include "md_p.h"
+#include "md_u.h"
+#include <ctype.h>
+#include <dirent.h>
+
+/*
+ * qsort() comparison callback for an array of 'char *': each argument
+ * points at one array element, and the two strings are ordered by strcmp().
+ */
+static int cmpstringp(const void *p1, const void *p2)
+{
+	const char *left = *(char * const *)p1;
+	const char *right = *(char * const *)p2;
+
+	return strcmp(left, right);
+}
+
+/*
+ * Append a copy of 'dev' to the string array *p_devices, which currently
+ * holds n_devices entries and has room for *p_max_devices.
+ * The array is enlarged by 16 slots whenever the slot after the new entry
+ * would not fit.  Returns the new entry count, or 0 (with *p_max_devices
+ * reset to 0) if the enlarged array pointer is NULL.
+ */
+static int add_device(const char *dev, char ***p_devices,
+		      int *p_max_devices, int n_devices)
+{
+	if (*p_max_devices <= n_devices + 1) {
+		int grown = *p_max_devices + 16;
+
+		*p_max_devices = grown;
+		*p_devices = xrealloc(*p_devices,
+				      grown * sizeof(**p_devices));
+		if (*p_devices == NULL) {
+			*p_max_devices = 0;
+			return 0;
+		}
+	}
+
+	(*p_devices)[n_devices] = xstrdup(dev);
+	return n_devices + 1;
+}
+
+/*
+ * Detail - print details for the md array behind 'dev'.
+ *
+ * dev: path of the md device node; it is opened read-only here.
+ * c:   run-time options.  This function reads c->test, c->export,
+ *      c->brief, c->verbose, c->prefer, c->no_devices and c->homehost.
+ *
+ * Three output modes exist: with c->export a list of KEY=value lines is
+ * printed and the function finishes early; with c->brief a one line
+ * "ARRAY ..." / "INACTIVE-ARRAY ..." summary is printed; otherwise the
+ * full human readable report is printed.
+ *
+ * Returns 0 on success.  If the device cannot be opened or queried the
+ * result is 1 (4 when c->test is set).  When c->test is set the result is
+ * also 1 if some raid slot has no live, in-sync device, and 2 if enough()
+ * reports that the array does not have enough devices.
+ */
+int Detail(char *dev, struct context *c)
+{
+	/*
+	 * Print out details for an md array
+	 */
+	int fd = open(dev, O_RDONLY);
+	mdu_array_info_t array;
+	mdu_disk_info_t *disks = NULL;
+	int next;
+	int d;
+	time_t atime;
+	char *str;
+	char **devices = NULL;
+	int max_devices = 0, n_devices = 0;
+	int spares = 0;
+	struct stat stb;
+	int failed = 0;
+	struct supertype *st = NULL;
+	char *subarray = NULL;
+	int max_disks = MD_SB_DISKS; /* just a default */
+	struct mdinfo *info = NULL;
+	struct mdinfo *sra = NULL;
+	struct mdinfo *subdev;
+	char *member = NULL;
+	char *container = NULL;
+
+	int rv = c->test ? 4 : 1;
+	int avail_disks = 0;
+	char *avail = NULL;
+	int external;
+	int inactive;
+	int is_container = 0;
+	char *arrayst;
+
+	if (fd < 0) {
+		pr_err("cannot open %s: %s\n",
+		       dev, strerror(errno));
+		return rv;
+	}
+	sra = sysfs_read(fd, NULL, GET_VERSION | GET_DEVS |
+			 GET_ARRAY_STATE | GET_STATE);
+	if (!sra) {
+		if (md_get_array_info(fd, &array)) {
+			pr_err("%s does not appear to be an md device\n", dev);
+			goto out;
+		}
+	}
+	external = (sra != NULL && sra->array.major_version == -1 &&
+		    sra->array.minor_version == -2);
+	inactive = (sra != NULL && !md_array_is_active(sra));
+	st = super_by_fd(fd, &subarray);
+	if (md_get_array_info(fd, &array)) {
+		/* sra is NULL when sysfs_read() failed above, so the sysfs
+		 * fallback is only usable when it exists.
+		 */
+		if (errno == ENODEV && sra) {
+			if (sra->array.major_version == -1 &&
+			    sra->array.minor_version == -1 &&
+			    sra->devs == NULL) {
+				pr_err("Array associated with md device %s does not exist.\n",
+				       dev);
+				goto out;
+			}
+			array = sra->array;
+		} else {
+			pr_err("cannot get array detail for %s: %s\n",
+			       dev, strerror(errno));
+			goto out;
+		}
+	}
+
+	if (array.raid_disks == 0 && external)
+		is_container = 1;
+	/* Never leave stb uninitialised: it is passed to stat2devnm() below.
+	 * Clear st_rdev when fd is not a block device.
+	 */
+	if (fstat(fd, &stb) != 0)
+		memset(&stb, 0, sizeof(stb));
+	else if (!S_ISBLK(stb.st_mode))
+		stb.st_rdev = 0;
+	rv = 0;
+
+	if (st)
+		max_disks = st->max_devs;
+
+	if (subarray) {
+		/* This is a subarray of some container.
+		 * We want the name of the container, and the member
+		 */
+		dev_t devid = devnm2devid(st->container_devnm);
+		int cfd, err;
+
+		member = subarray;
+		container = map_dev_preferred(major(devid), minor(devid),
+					      1, c->prefer);
+		cfd = open_dev(st->container_devnm);
+		if (cfd >= 0) {
+			err = st->ss->load_container(st, cfd, NULL);
+			close(cfd);
+			if (err == 0)
+				info = st->ss->container_content(st, subarray);
+		}
+	}
+
+	/* try to load a superblock. Try sra->devs first, then try ioctl */
+	if (st && !info)
+		for (d = 0, subdev = sra ? sra->devs : NULL;
+		     d < max_disks || subdev;
+		     subdev ? (void)(subdev = subdev->next) : (void)(d++)){
+			mdu_disk_info_t disk;
+			char *dv;
+			int fd2;
+			int err;
+
+			if (subdev)
+				disk = subdev->disk;
+			else {
+				disk.number = d;
+				if (md_get_disk_info(fd, &disk) < 0)
+					continue;
+				if (d >= array.raid_disks &&
+				    disk.major == 0 && disk.minor == 0)
+					continue;
+			}
+
+			if (array.raid_disks > 0 &&
+			    (disk.state & (1 << MD_DISK_ACTIVE)) == 0)
+				continue;
+
+			dv = map_dev(disk.major, disk.minor, 1);
+			if (!dv)
+				continue;
+
+			fd2 = dev_open(dv, O_RDONLY);
+			if (fd2 < 0)
+				continue;
+
+			if (st->sb)
+				st->ss->free_super(st);
+
+			err = st->ss->load_super(st, fd2, NULL);
+			close(fd2);
+			if (err)
+				continue;
+			free(info);
+			if (subarray)
+				info = st->ss->container_content(st, subarray);
+			else {
+				info = xmalloc(sizeof(*info));
+				st->ss->getinfo_super(st, info, NULL);
+			}
+			if (!info)
+				continue;
+
+			if (array.raid_disks != 0 && /* container */
+			    (info->array.ctime != array.ctime ||
+			     info->array.level != array.level)) {
+				st->ss->free_super(st);
+				continue;
+			}
+			/* some formats (imsm) have free-floating-spares
+			 * with a uuid of uuid_zero, they don't
+			 * have very good info about the rest of the
+			 * container, so keep searching when
+			 * encountering such a device. Otherwise, stop
+			 * after the first successful call to
+			 * ->load_super.
+			 */
+			if (memcmp(uuid_zero,
+				   info->uuid,
+				   sizeof(uuid_zero)) == 0) {
+				st->ss->free_super(st);
+				continue;
+			}
+			break;
+		}
+
+	/* Ok, we have some info to print... */
+	if (inactive && info)
+		str = map_num(pers, info->array.level);
+	else
+		str = map_num(pers, array.level);
+
+	if (c->export) {
+		if (array.raid_disks) {
+			if (str)
+				printf("MD_LEVEL=%s\n", str);
+			printf("MD_DEVICES=%d\n", array.raid_disks);
+		} else {
+			if (is_container)
+				printf("MD_LEVEL=container\n");
+			printf("MD_DEVICES=%d\n", array.nr_disks);
+		}
+		if (container) {
+			printf("MD_CONTAINER=%s\n", container);
+			printf("MD_MEMBER=%s\n", member);
+		} else {
+			if (sra && sra->array.major_version < 0)
+				printf("MD_METADATA=%s\n", sra->text_version);
+			else
+				printf("MD_METADATA=%d.%d\n",
+				       array.major_version,
+				       array.minor_version);
+		}
+
+		if (st && st->sb && info) {
+			char nbuf[64];
+			struct map_ent *mp, *map = NULL;
+
+			fname_from_uuid(st, info, nbuf, ':');
+			printf("MD_UUID=%s\n", nbuf + 5);
+			mp = map_by_uuid(&map, info->uuid);
+			if (mp && mp->path &&
+			    strncmp(mp->path, "/dev/md/", 8) == 0) {
+				printf("MD_DEVNAME=");
+				print_escape(mp->path + 8);
+				putchar('\n');
+			}
+
+			if (st->ss->export_detail_super)
+				st->ss->export_detail_super(st);
+			map_free(map);
+		} else {
+			struct map_ent *mp, *map = NULL;
+			char nbuf[64];
+			mp = map_by_devnm(&map, fd2devnm(fd));
+			if (mp) {
+				__fname_from_uuid(mp->uuid, 0, nbuf, ':');
+				printf("MD_UUID=%s\n", nbuf+5);
+			}
+			if (mp && mp->path &&
+			    strncmp(mp->path, "/dev/md/", 8) == 0) {
+				printf("MD_DEVNAME=");
+				print_escape(mp->path+8);
+				putchar('\n');
+			}
+			map_free(map);
+		}
+		if (!c->no_devices && sra) {
+			struct mdinfo *mdi;
+			for (mdi = sra->devs; mdi; mdi = mdi->next) {
+				char *path;
+				char *sysdev = xstrdup(mdi->sys_name);
+				char *cp;
+
+				path = map_dev(mdi->disk.major,
+					       mdi->disk.minor, 0);
+				/* Make the name usable inside a variable
+				 * name; <ctype.h> needs an unsigned char
+				 * value.
+				 */
+				for (cp = sysdev; *cp; cp++)
+					if (!isalnum((unsigned char)*cp))
+						*cp = '_';
+
+				if (mdi->disk.raid_disk >= 0)
+					printf("MD_DEVICE_%s_ROLE=%d\n",
+					       sysdev,
+					       mdi->disk.raid_disk);
+				else
+					printf("MD_DEVICE_%s_ROLE=spare\n",
+					       sysdev);
+				if (path)
+					printf("MD_DEVICE_%s_DEV=%s\n",
+					       sysdev, path);
+				/* sysdev is a private copy made per device */
+				free(sysdev);
+			}
+		}
+		goto out;
+	}
+
+	/* Two slots per raid position (device and its replacement),
+	 * pre-marked as removed; extra devices go after raid_disks * 2.
+	 */
+	disks = xmalloc(max_disks * 2 * sizeof(mdu_disk_info_t));
+	for (d = 0; d < max_disks * 2; d++) {
+		disks[d].state = (1 << MD_DISK_REMOVED);
+		disks[d].major = disks[d].minor = 0;
+		disks[d].number = -1;
+		disks[d].raid_disk = d / 2;
+	}
+
+	next = array.raid_disks * 2;
+	if (inactive) {
+		struct mdinfo *mdi;
+		/* Stay inside disks[], as the active branch below does */
+		for (mdi = sra->devs; mdi && next < max_disks * 2;
+		     mdi = mdi->next) {
+			disks[next++] = mdi->disk;
+			disks[next - 1].number = -1;
+		}
+	} else for (d = 0; d < max_disks; d++) {
+		mdu_disk_info_t disk;
+		disk.number = d;
+		if (md_get_disk_info(fd, &disk) < 0) {
+			if (d < array.raid_disks)
+				pr_err("cannot get device detail for device %d: %s\n",
+				       d, strerror(errno));
+			continue;
+		}
+		if (disk.major == 0 && disk.minor == 0)
+			continue;
+		if (disk.raid_disk >= 0 && disk.raid_disk < array.raid_disks &&
+		    disks[disk.raid_disk * 2].state == (1 << MD_DISK_REMOVED) &&
+		    ((disk.state & (1 << MD_DISK_JOURNAL)) == 0))
+			disks[disk.raid_disk * 2] = disk;
+		else if (disk.raid_disk >= 0 &&
+			 disk.raid_disk < array.raid_disks &&
+			 disks[disk.raid_disk * 2 + 1].state ==
+			 (1 << MD_DISK_REMOVED) &&
+			 !(disk.state & (1 << MD_DISK_JOURNAL)))
+			disks[disk.raid_disk * 2 + 1] = disk;
+		else if (next < max_disks * 2)
+			disks[next++] = disk;
+	}
+
+	avail = xcalloc(array.raid_disks, 1);
+
+	for (d = 0; d < array.raid_disks; d++) {
+		char dv[PATH_MAX], dv_rep[PATH_MAX];
+		snprintf(dv, PATH_MAX, "/sys/dev/block/%d:%d",
+			 disks[d*2].major, disks[d*2].minor);
+		snprintf(dv_rep, PATH_MAX, "/sys/dev/block/%d:%d",
+			 disks[d*2+1].major, disks[d*2+1].minor);
+
+		if ((is_dev_alive(dv) && (disks[d*2].state & (1<<MD_DISK_SYNC))) ||
+		    (is_dev_alive(dv_rep) && (disks[d*2+1].state & (1<<MD_DISK_SYNC)))) {
+			avail_disks ++;
+			avail[d] = 1;
+		} else
+			rv |= !! c->test;
+	}
+
+	if (c->brief) {
+		mdu_bitmap_file_t bmf;
+		if (inactive && !is_container)
+			printf("INACTIVE-ARRAY %s", dev);
+		else
+			printf("ARRAY %s", dev);
+		if (c->verbose > 0) {
+			if (array.raid_disks)
+				printf(" level=%s num-devices=%d",
+				       str ? str : "-unknown-",
+				       array.raid_disks);
+			else if (is_container)
+				printf(" level=container num-devices=%d",
+				       array.nr_disks);
+			else
+				printf(" num-devices=%d", array.nr_disks);
+		}
+		if (container) {
+			printf(" container=%s", container);
+			printf(" member=%s", member);
+		} else {
+			if (sra && sra->array.major_version < 0)
+				printf(" metadata=%s", sra->text_version);
+			else
+				printf(" metadata=%d.%d", array.major_version,
+				       array.minor_version);
+		}
+
+		/* Only try GET_BITMAP_FILE for 0.90.01 and later */
+		if (ioctl(fd, GET_BITMAP_FILE, &bmf) == 0 && bmf.pathname[0]) {
+			printf(" bitmap=%s", bmf.pathname);
+		}
+	} else {
+		mdu_bitmap_file_t bmf;
+		unsigned long long larray_size;
+		struct mdstat_ent *ms = mdstat_read(0, 0);
+		struct mdstat_ent *e;
+		char *devnm;
+
+		devnm = stat2devnm(&stb);
+		/* NOTE(review): devnm is checked because stat2devnm()
+		 * presumably returns NULL when stb is not a block device -
+		 * confirm.  Without a name no mdstat entry can match.
+		 */
+		for (e = ms; e; e = e->next)
+			if (devnm && strcmp(e->devnm, devnm) == 0)
+				break;
+		if (!get_dev_size(fd, NULL, &larray_size))
+			larray_size = 0;
+
+		printf("%s:\n", dev);
+
+		if (container)
+			printf(" Container : %s, member %s\n",
+			       container, member);
+		else {
+			if (sra && sra->array.major_version < 0)
+				printf(" Version : %s\n",
+				       sra->text_version);
+			else
+				printf(" Version : %d.%d\n",
+				       array.major_version,
+				       array.minor_version);
+		}
+
+		atime = array.ctime;
+		if (atime)
+			printf(" Creation Time : %.24s\n", ctime(&atime));
+		if (is_container)
+			str = "container";
+		if (str)
+			printf(" Raid Level : %s\n", str);
+		if (larray_size)
+			printf(" Array Size : %llu%s\n",
+			       (larray_size >> 10),
+			       human_size(larray_size));
+		if (array.level >= 1) {
+			if (sra)
+				array.major_version = sra->array.major_version;
+			if (array.major_version != 0 &&
+			    (larray_size >= 0xFFFFFFFFULL|| array.size == 0)) {
+				unsigned long long dsize;
+
+				dsize = get_component_size(fd);
+				if (dsize > 0)
+					printf(" Used Dev Size : %llu%s\n",
+					       dsize/2,
+					       human_size((long long)dsize<<9));
+				else
+					printf(" Used Dev Size : unknown\n");
+			} else
+				printf(" Used Dev Size : %lu%s\n",
+				       (unsigned long)array.size,
+				       human_size((unsigned long long)
+						  array.size << 10));
+		}
+		if (array.raid_disks)
+			printf(" Raid Devices : %d\n", array.raid_disks);
+		printf(" Total Devices : %d\n", array.nr_disks);
+		if (!container &&
+		    ((sra == NULL && array.major_version == 0) ||
+		     (sra && sra->array.major_version == 0)))
+			printf(" Preferred Minor : %d\n", array.md_minor);
+		if (sra == NULL || sra->array.major_version >= 0)
+			printf(" Persistence : Superblock is %spersistent\n",
+			       array.not_persistent ? "not " : "");
+		printf("\n");
+		/* Only try GET_BITMAP_FILE for 0.90.01 and later */
+		if (ioctl(fd, GET_BITMAP_FILE, &bmf) == 0 && bmf.pathname[0]) {
+			printf(" Intent Bitmap : %s\n", bmf.pathname);
+			printf("\n");
+		} else if (array.state & (1<<MD_SB_CLUSTERED))
+			printf(" Intent Bitmap : Internal(Clustered)\n\n");
+		else if (array.state & (1<<MD_SB_BITMAP_PRESENT))
+			printf(" Intent Bitmap : Internal\n\n");
+		atime = array.utime;
+		if (atime)
+			printf(" Update Time : %.24s\n", ctime(&atime));
+		if (array.raid_disks) {
+			static char *sync_action[] = {
+				", recovering", ", resyncing",
+				", reshaping", ", checking" };
+			/* named 'health' so it does not shadow the
+			 * function's 'struct supertype *st'
+			 */
+			char *health;
+			if (avail_disks == array.raid_disks)
+				health = "";
+			else if (!enough(array.level, array.raid_disks,
+					 array.layout, 1, avail))
+				health = ", FAILED";
+			else
+				health = ", degraded";
+
+			if (array.state & (1 << MD_SB_CLEAN)) {
+				/* sra may be NULL if sysfs could not be
+				 * read; fall back to "clean" then.
+				 */
+				if (((array.level == 0) ||
+				     (array.level == LEVEL_LINEAR)) && sra)
+					arrayst = map_num(sysfs_array_states,
+							  sra->array_state);
+				else
+					arrayst = "clean";
+			} else {
+				arrayst = "active";
+				if (array.state & (1<<MD_SB_CLUSTERED)) {
+					for (d = 0; d < max_disks * 2; d++) {
+						char *dv;
+						mdu_disk_info_t disk = disks[d];
+
+						/* only check first valid disk in cluster env */
+						/* MD_DISK_* are bit numbers,
+						 * as everywhere else in this
+						 * function, so shift them.
+						 */
+						if ((disk.state & ((1 << MD_DISK_SYNC) |
+								   (1 << MD_DISK_ACTIVE)))
+						    && (disk.major | disk.minor)) {
+							dv = map_dev_preferred(disk.major, disk.minor, 0,
+									       c->prefer);
+							if (!dv)
+								continue;
+							arrayst = IsBitmapDirty(dv) ? "active" : "clean";
+							break;
+						}
+					}
+				}
+			}
+
+			printf(" State : %s%s%s%s%s%s%s \n",
+			       arrayst, health,
+			       (!e || (e->percent < 0 &&
+				       e->percent != RESYNC_PENDING &&
+				       e->percent != RESYNC_DELAYED &&
+				       e->percent != RESYNC_REMOTE)) ?
+			       "" : sync_action[e->resync],
+			       larray_size ? "": ", Not Started",
+			       (e && e->percent == RESYNC_DELAYED) ?
+			       " (DELAYED)": "",
+			       (e && e->percent == RESYNC_PENDING) ?
+			       " (PENDING)": "",
+			       (e && e->percent == RESYNC_REMOTE) ?
+			       " (REMOTE)": "");
+		} else if (inactive && !is_container) {
+			printf(" State : inactive\n");
+		}
+		if (array.raid_disks)
+			printf(" Active Devices : %d\n", array.active_disks);
+		if (array.working_disks > 0)
+			printf(" Working Devices : %d\n",
+			       array.working_disks);
+		if (array.raid_disks) {
+			printf(" Failed Devices : %d\n", array.failed_disks);
+			if (!external)
+				printf(" Spare Devices : %d\n", array.spare_disks);
+		}
+		printf("\n");
+		if (array.level == 5) {
+			str = map_num(r5layout, array.layout);
+			printf(" Layout : %s\n",
+			       str ? str : "-unknown-");
+		}
+		if (array.level == 0 && array.layout) {
+			str = map_num(r0layout, array.layout);
+			printf(" Layout : %s\n",
+			       str ? str : "-unknown-");
+		}
+		if (array.level == 6) {
+			str = map_num(r6layout, array.layout);
+			printf(" Layout : %s\n",
+			       str ? str : "-unknown-");
+		}
+		if (array.level == 10) {
+			printf(" Layout :");
+			print_r10_layout(array.layout);
+			printf("\n");
+		}
+		switch (array.level) {
+		case 0:
+		case 4:
+		case 5:
+		case 10:
+		case 6:
+			if (array.chunk_size)
+				printf(" Chunk Size : %dK\n\n",
+				       array.chunk_size/1024);
+			break;
+		case -1:
+			printf(" Rounding : %dK\n\n",
+			       array.chunk_size/1024);
+			break;
+		default:
+			break;
+		}
+
+		if (array.raid_disks) {
+			struct mdinfo *mdi;
+
+			mdi = sysfs_read(fd, NULL, GET_CONSISTENCY_POLICY);
+			if (mdi) {
+				char *policy = map_num(consistency_policies,
+						       mdi->consistency_policy);
+				sysfs_free(mdi);
+				if (policy)
+					printf("Consistency Policy : %s\n\n",
+					       policy);
+			}
+		}
+
+		if (e && e->percent >= 0) {
+			static char *sync_action[] = {
+				"Rebuild", "Resync", "Reshape", "Check"};
+			printf(" %7s Status : %d%% complete\n",
+			       sync_action[e->resync], e->percent);
+		}
+
+		if ((st && st->sb) && (info && info->reshape_active)) {
+#if 0
+This is pretty boring
+			printf(" Reshape pos'n : %llu%s\n",
+			       (unsigned long long) info->reshape_progress << 9,
+			       human_size((unsigned long long)
+					  info->reshape_progress << 9));
+#endif
+			if (info->delta_disks != 0)
+				printf(" Delta Devices : %d, (%d->%d)\n",
+				       info->delta_disks,
+				       array.raid_disks - info->delta_disks,
+				       array.raid_disks);
+			if (info->new_level != array.level) {
+				str = map_num(pers, info->new_level);
+				printf(" New Level : %s\n",
+				       str ? str : "-unknown-");
+			}
+			if (info->new_level != array.level ||
+			    info->new_layout != array.layout) {
+				if (info->new_level == 5) {
+					str = map_num(r5layout,
+						      info->new_layout);
+					printf(" New Layout : %s\n",
+					       str ? str : "-unknown-");
+				}
+				if (info->new_level == 6) {
+					str = map_num(r6layout,
+						      info->new_layout);
+					printf(" New Layout : %s\n",
+					       str ? str : "-unknown-");
+				}
+				if (info->new_level == 10) {
+					printf(" New Layout : near=%d, %s=%d\n",
+					       info->new_layout & 255,
+					       (info->new_layout & 0x10000) ?
+					       "offset" : "far",
+					       (info->new_layout >> 8) & 255);
+				}
+			}
+			if (info->new_chunk != array.chunk_size)
+				printf(" New Chunksize : %dK\n",
+				       info->new_chunk/1024);
+			printf("\n");
+		} else if (e && e->percent >= 0)
+			printf("\n");
+		free_mdstat(ms);
+
+		if (st && st->sb)
+			st->ss->detail_super(st, c->homehost, subarray);
+
+		if (array.raid_disks == 0 && sra &&
+		    sra->array.major_version == -1 &&
+		    sra->array.minor_version == -2 &&
+		    sra->text_version[0] != '/') {
+			/* This looks like a container. Find any active arrays
+			 * That claim to be a member.
+			 */
+			DIR *dir = opendir("/sys/block");
+			struct dirent *de;
+
+			printf(" Member Arrays :");
+
+			while (dir && (de = readdir(dir)) != NULL) {
+				char path[287];
+				char vbuf[1024];
+				int nlen = strlen(sra->sys_name);
+				dev_t devid;
+				if (de->d_name[0] == '.')
+					continue;
+				sprintf(path,
+					"/sys/block/%s/md/metadata_version",
+					de->d_name);
+				if (load_sys(path, vbuf, sizeof(vbuf)) < 0)
+					continue;
+				if (strncmp(vbuf, "external:", 9) ||
+				    !is_subarray(vbuf + 9) ||
+				    strncmp(vbuf + 10, sra->sys_name, nlen) ||
+				    vbuf[10 + nlen] != '/')
+					continue;
+				devid = devnm2devid(de->d_name);
+				printf(" %s",
+				       map_dev_preferred(major(devid),
+							 minor(devid), 1,
+							 c->prefer));
+			}
+			if (dir)
+				closedir(dir);
+			printf("\n\n");
+		}
+
+		if (!c->no_devices) {
+			if (array.raid_disks)
+				printf(" Number Major Minor RaidDevice State\n");
+			else
+				printf(" Number Major Minor RaidDevice\n");
+		}
+	}
+
+	/* if --no_devices specified, not print component devices info */
+	if (c->no_devices)
+		goto skip_devices_state;
+
+	for (d = 0; d < max_disks * 2; d++) {
+		char *dv;
+		mdu_disk_info_t disk = disks[d];
+
+		if (d >= array.raid_disks * 2 &&
+		    disk.major == 0 && disk.minor == 0)
+			continue;
+		if ((d & 1) && disk.major == 0 && disk.minor == 0)
+			continue;
+		if (!c->brief) {
+			if (d == array.raid_disks*2)
+				printf("\n");
+			if (disk.number < 0 && disk.raid_disk < 0)
+				printf(" - %5d %5d - ",
+				       disk.major, disk.minor);
+			else if (disk.raid_disk < 0 ||
+				 disk.state & (1 << MD_DISK_JOURNAL))
+				printf(" %5d %5d %5d - ",
+				       disk.number, disk.major, disk.minor);
+			else if (disk.number < 0)
+				printf(" - %5d %5d %5d ",
+				       disk.major, disk.minor, disk.raid_disk);
+			else
+				printf(" %5d %5d %5d %5d ",
+				       disk.number, disk.major, disk.minor,
+				       disk.raid_disk);
+		}
+		if (!c->brief && array.raid_disks) {
+			if (disk.state & (1 << MD_DISK_FAULTY)) {
+				printf(" faulty");
+				if (disk.raid_disk < array.raid_disks &&
+				    disk.raid_disk >= 0)
+					failed++;
+			}
+			if (disk.state & (1 << MD_DISK_ACTIVE))
+				printf(" active");
+			if (disk.state & (1 << MD_DISK_SYNC)) {
+				printf(" sync");
+				if (array.level == 10 &&
+				    (array.layout & ~0x1FFFF) == 0) {
+					int nc = array.layout & 0xff;
+					int fc = (array.layout >> 8) & 0xff;
+					int copies = nc*fc;
+					if (fc == 1 &&
+					    array.raid_disks % copies == 0 &&
+					    copies <= 26) {
+						/* We can divide the devices
+						   into 'sets' */
+						int set;
+						set = disk.raid_disk % copies;
+						printf(" set-%c", set + 'A');
+					}
+				}
+			}
+			if (disk.state & (1 << MD_DISK_REMOVED))
+				printf(" removed");
+			if (disk.state & (1 << MD_DISK_WRITEMOSTLY))
+				printf(" writemostly");
+			if (disk.state & (1 << MD_DISK_FAILFAST))
+				printf(" failfast");
+			if (disk.state & (1 << MD_DISK_JOURNAL))
+				printf(" journal");
+			if ((disk.state &
+			     ((1 << MD_DISK_ACTIVE) | (1 << MD_DISK_SYNC) |
+			      (1 << MD_DISK_REMOVED) | (1 << MD_DISK_FAULTY) |
+			      (1 << MD_DISK_JOURNAL))) == 0) {
+				printf(" spare");
+				if (disk.raid_disk < array.raid_disks &&
+				    disk.raid_disk >= 0)
+					printf(" rebuilding");
+			}
+		}
+		if (disk.state == 0)
+			spares++;
+		dv = map_dev_preferred(disk.major, disk.minor, 0, c->prefer);
+		if (dv != NULL) {
+			if (c->brief)
+				n_devices = add_device(dv, &devices,
+						       &max_devices, n_devices);
+			else
+				printf(" %s", dv);
+		} else if (disk.major | disk.minor)
+			printf(" missing");
+		if (!c->brief)
+			printf("\n");
+	}
+
+skip_devices_state:
+	if (spares && c->brief && array.raid_disks)
+		printf(" spares=%d", spares);
+	if (c->brief && st && st->sb)
+		st->ss->brief_detail_super(st, subarray);
+	if (st)
+		st->ss->free_super(st);
+
+	if (c->brief && c->verbose > 0 && devices) {
+		qsort(devices, n_devices, sizeof(*devices), cmpstringp);
+		printf("\n devices=%s", devices[0]);
+		for (d = 1; d < n_devices; d++)
+			printf(",%s", devices[d]);
+	}
+	if (c->brief)
+		printf("\n");
+	if (c->test &&
+	    !enough(array.level, array.raid_disks, array.layout, 1, avail))
+		rv = 2;
+
+out:
+	/* Single exit: release everything acquired above */
+	free(info);
+	free(disks);
+	close(fd);
+	free(subarray);
+	free(avail);
+	if (devices)
+		for (d = 0; d < n_devices; d++)
+			free(devices[d]);
+	free(devices);
+	sysfs_free(sra);
+	free(st);
+	return rv;
+}
+
+/*
+ * Display platform capabilities for a metadata format.
+ *
+ * ss:              metadata handler to report on, or NULL.
+ * scan:            when non-zero, also visit every other handler in
+ *                  superlist[].
+ * verbose:         informational messages are printed only when > 0.
+ * export:          when non-zero prefer the handler's export_detail_platform.
+ * controller_path: passed through unchanged to the handler callbacks.
+ *
+ * Without 'scan' the result is the callback's return value for 'ss', or 1
+ * if no callback ran.  With 'scan' it is the OR of the results from the
+ * other handlers; the result for 'ss' itself is not included.
+ */
+int Detail_Platform(struct superswitch *ss, int scan, int verbose, int export, char *controller_path)
+{
+	int result = 1;
+	int idx;
+
+	if (ss) {
+		if (export && ss->export_detail_platform)
+			result = ss->export_detail_platform(verbose,
+							    controller_path);
+		else if (ss->detail_platform)
+			result = ss->detail_platform(verbose, 0,
+						     controller_path);
+		else if (verbose > 0)
+			pr_err("%s metadata is platform independent\n",
+			       ss->name ? ss->name : "[no name]");
+	} else if (!scan && verbose > 0) {
+		pr_err("specify a metadata type or --scan\n");
+	}
+
+	if (!scan)
+		return result;
+
+	/* Scanning: start again from "no error" and collect the others */
+	result = 0;
+	for (idx = 0; superlist[idx]; idx++) {
+		struct superswitch *meta = superlist[idx];
+
+		/* 'ss' was already handled above */
+		if (meta == ss)
+			continue;
+
+		if (verbose > 0)
+			pr_err("checking metadata %s\n",
+			       meta->name ? meta->name : "[no name]");
+
+		if (!meta->detail_platform) {
+			if (verbose > 0)
+				pr_err("%s metadata is platform independent\n",
+				       meta->name ? meta->name : "[no name]");
+			continue;
+		}
+
+		if (export && meta->export_detail_platform)
+			result |= meta->export_detail_platform(verbose,
+							       controller_path);
+		else
+			result |= meta->detail_platform(verbose, 0,
+							controller_path);
+	}
+
+	return result;
+}
diff --git a/Dump.c b/Dump.c
new file mode 100644
index 0000000..736bcb6
--- /dev/null
+++ b/Dump.c
@@ -0,0 +1,319 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2013 Neil Brown <neilb@suse.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+
+ * Author: Neil Brown
+ * Email: <neilb@suse.de>
+ */
+
+#include "mdadm.h"
+#include <sys/dir.h>
+
+/*
+ * Dump_metadata - save the RAID metadata of 'dev' into a file in 'dir'.
+ *
+ * dev: device to read the metadata from.
+ * dir: existing directory that receives the dump file.
+ * c:   only c->verbose is read; progress is printed when it is >= 0.
+ * st:  metadata handler to use, or NULL to guess it from the device.
+ *
+ * Returns 0 on success, 16 if 'dir' is not an existing directory and
+ * 1 on any other failure.  Failing to create one of the extra hardlinks
+ * is reported but still returns 0.
+ */
+int Dump_metadata(char *dev, char *dir, struct context *c,
+		  struct supertype *st)
+{
+	/* create a new file in 'dir' named for the basename of 'dev'.
+	 * Truncate to the same size as 'dev' and ask the metadata
+	 * handler to copy metadata there.
+	 * For every name in /dev/disk/by-id that points to this device,
+	 * create a hardlink in 'dir'.
+	 * Complain if any of those hardlinks cannot be created.
+	 */
+	int fd, fl;
+	int have_dstb;
+	struct stat stb, dstb;
+	char *base;
+	char *fname = NULL;
+	unsigned long long size;
+	DIR *dirp;
+	struct dirent *de;
+
+	if (stat(dir, &stb) != 0 ||
+	    (S_IFMT & stb.st_mode) != S_IFDIR) {
+		pr_err("--dump requires an existing directory, not: %s\n",
+		       dir);
+		return 16;
+	}
+
+	fd = dev_open(dev, O_RDONLY);
+	if (fd < 0) {
+		pr_err("Cannot open %s to dump metadata: %s\n",
+		       dev, strerror(errno));
+		return 1;
+	}
+	if (!get_dev_size(fd, dev, &size)) {
+		close(fd);
+		return 1;
+	}
+
+	if (st == NULL)
+		st = guess_super_type(fd, guess_array);
+	if (!st) {
+		pr_err("Cannot find RAID metadata on %s\n", dev);
+		close(fd);
+		return 1;
+	}
+
+	st->ignore_hw_compat = 1;
+	if (st->ss->load_super(st, fd, NULL) != 0) {
+		pr_err("No %s metadata found on %s\n",
+		       st->ss->name, dev);
+		close(fd);
+		return 1;
+	}
+	if (st->ss->copy_metadata == NULL) {
+		pr_err("%s metadata on %s cannot be copied\n",
+		       st->ss->name, dev);
+		close(fd);
+		return 1;
+	}
+
+	base = strrchr(dev, '/');
+	if (base)
+		base++;
+	else
+		base = dev;
+	xasprintf(&fname, "%s/%s", dir, base);
+	/* O_EXCL: never overwrite an existing dump */
+	fl = open(fname, O_RDWR|O_CREAT|O_EXCL, 0666);
+	if (fl < 0) {
+		pr_err("Cannot create dump file %s: %s\n",
+		       fname, strerror(errno));
+		close(fd);
+		free(fname);
+		return 1;
+	}
+	if (ftruncate(fl, size) < 0) {
+		pr_err("failed to set size of dump file: %s\n",
+		       strerror(errno));
+		close(fd);
+		close(fl);
+		/* We created this file just above; remove it so that a
+		 * retry is not refused by O_EXCL.
+		 */
+		unlink(fname);
+		free(fname);
+		return 1;
+	}
+
+	if (st->ss->copy_metadata(st, fd, fl) != 0) {
+		pr_err("Failed to copy metadata from %s to %s\n",
+		       dev, fname);
+		close(fd);
+		close(fl);
+		unlink(fname);
+		free(fname);
+		return 1;
+	}
+	if (c->verbose >= 0)
+		printf("%s saved as %s.\n", dev, fname);
+	/* dstb is only meaningful if fstat() succeeded */
+	have_dstb = (fstat(fd, &dstb) == 0);
+	close(fd);
+	close(fl);
+	if (!have_dstb || (dstb.st_mode & S_IFMT) != S_IFBLK) {
+		/* Not a block device, so cannot create links */
+		free(fname);
+		return 0;
+	}
+	/* mostly done: just want to find some other names */
+	dirp = opendir("/dev/disk/by-id");
+	if (!dirp) {
+		free(fname);
+		return 0;
+	}
+	while ((de = readdir(dirp)) != NULL) {
+		char *p = NULL;
+		if (de->d_name[0] == '.')
+			continue;
+		xasprintf(&p, "/dev/disk/by-id/%s", de->d_name);
+		if (stat(p, &stb) != 0 ||
+		    (stb.st_mode & S_IFMT) != S_IFBLK ||
+		    stb.st_rdev != dstb.st_rdev) {
+			/* Not this one */
+			free(p);
+			continue;
+		}
+		free(p);
+		xasprintf(&p, "%s/%s", dir, de->d_name);
+		if (link(fname, p) == 0) {
+			if (c->verbose >= 0)
+				printf("%s also saved as %s.\n",
+				       dev, p);
+		} else {
+			pr_err("Could not save %s as %s!!\n",
+			       dev, p);
+		}
+		free(p);
+	}
+	closedir(dirp);
+	free(fname);
+	return 0;
+}
+
+/*
+ * Restore_metadata - copy previously dumped metadata back onto 'dev'.
+ *
+ * dev:  device to restore; it is opened read-write.
+ * dir:  directory holding dump files, or a single dump file.
+ * c:    only c->verbose is read; progress is printed when it is >= 0.
+ * st:   metadata handler to use, or NULL to guess it from the dump file.
+ * only: non-zero when a single device was given; required for 'dir' to be
+ *       a plain file.
+ *
+ * Returns 0 on success, 16 if 'dir' does not exist or is not a directory
+ * while 'only' is unset, and 1 on any other failure.
+ */
+int Restore_metadata(char *dev, char *dir, struct context *c,
+		     struct supertype *st, int only)
+{
+	/* If 'dir' really is a directory we choose a name
+	 * from it that matches a suitable name in /dev/disk/by-id,
+	 * and copy metadata from the file to the device.
+	 * If two names from by-id match and aren't both the same
+	 * inode, we fail. If none match and basename of 'dev'
+	 * can be found in dir, use that.
+	 * If 'dir' is really a file then it is only permitted if
+	 * 'only' is set (meaning there was only one device given)
+	 * and the metadata is restored irrespective of file names.
+	 */
+	int fd, fl = -1;
+	struct stat stb, dstb;
+	char *fname = NULL;
+	unsigned long long size;
+
+	if (stat(dir, &stb) != 0) {
+		pr_err("%s does not exist: cannot restore from there.\n",
+		       dir);
+		return 16;
+	} else if ((S_IFMT & stb.st_mode) != S_IFDIR && !only) {
+		pr_err("--restore requires a directory when multiple devices given\n");
+		return 16;
+	}
+
+	fd = dev_open(dev, O_RDWR);
+	if (fd < 0) {
+		pr_err("Cannot open %s to restore metadata: %s\n",
+		       dev, strerror(errno));
+		return 1;
+	}
+	if (!get_dev_size(fd, dev, &size)) {
+		close(fd);
+		return 1;
+	}
+
+	if ((S_IFMT & stb.st_mode) == S_IFDIR) {
+		/* choose one name from the directory. */
+		DIR *d;
+		struct dirent *de;
+		char *chosen = NULL;
+		/* ino_t, so that wide inode numbers are not truncated */
+		ino_t chosen_inode = 0;
+
+		/* dstb.st_rdev is compared against every candidate below */
+		if (fstat(fd, &dstb) != 0) {
+			pr_err("Cannot stat %s: %s\n",
+			       dev, strerror(errno));
+			close(fd);
+			return 1;
+		}
+
+		d = opendir(dir);
+		while (d && (de = readdir(d)) != NULL) {
+			if (de->d_name[0] == '.')
+				continue;
+			xasprintf(&fname, "/dev/disk/by-id/%s", de->d_name);
+			if (stat(fname, &stb) != 0) {
+				free(fname);
+				continue;
+			}
+			free(fname);
+			if ((S_IFMT & stb.st_mode) != S_IFBLK)
+				continue;
+			if (stb.st_rdev != dstb.st_rdev)
+				continue;
+			/* This file is a good match for our device. */
+			xasprintf(&fname, "%s/%s", dir, de->d_name);
+			if (stat(fname, &stb) != 0) {
+				/* Weird! */
+				free(fname);
+				continue;
+			}
+			if (chosen == NULL) {
+				chosen = fname;
+				chosen_inode = stb.st_ino;
+				continue;
+			}
+			if (chosen_inode == stb.st_ino) {
+				/* same, no need to change */
+				free(fname);
+				continue;
+			}
+			/* Oh dear, two names both match. Must give up. */
+			pr_err("Both %s and %s seem suitable for %s. Please choose one.\n",
+			       chosen, fname, dev);
+			free(fname);
+			free(chosen);
+			close(fd);
+			closedir(d);
+			return 1;
+		}
+		/* opendir() may have failed, in which case d is NULL */
+		if (d)
+			closedir(d);
+		if (!chosen) {
+			/* One last chance: try basename of device */
+			char *base = strrchr(dev, '/');
+			if (base)
+				base++;
+			else
+				base = dev;
+			xasprintf(&fname, "%s/%s", dir, base);
+			if (stat(fname, &stb) == 0)
+				chosen = fname;
+			else
+				free(fname);
+		}
+		fname = chosen;
+	} else
+		fname = strdup(dir);
+
+	if (!fname) {
+		pr_err("Cannot find suitable file in %s for %s\n",
+		       dir, dev);
+		close(fd);
+		return 1;
+	}
+
+	/* open() reports failure with -1; 0 is a valid descriptor */
+	fl = open(fname, O_RDONLY);
+	if (fl < 0) {
+		pr_err("Could not open %s for --restore.\n",
+		       fname);
+		goto err;
+	}
+	/* fstat() the open descriptor so that the size check applies to
+	 * the file that is actually read.
+	 */
+	if (fstat(fl, &stb) != 0) {
+		pr_err("Could not stat %s for --restore.\n",
+		       fname);
+		goto err;
+	}
+	if (((unsigned long long)stb.st_size) != size) {
+		pr_err("%s is not the same size as %s - cannot restore.\n",
+		       fname, dev);
+		goto err;
+	}
+	if (st == NULL)
+		st = guess_super_type(fl, guess_array);
+	if (!st) {
+		pr_err("Cannot find metadata on %s\n", fname);
+		goto err;
+	}
+	st->ignore_hw_compat = 1;
+	if (st->ss->load_super(st, fl, NULL) != 0) {
+		pr_err("No %s metadata found on %s\n",
+		       st->ss->name, fname);
+		goto err;
+	}
+	if (st->ss->copy_metadata == NULL) {
+		pr_err("%s metadata on %s cannot be copied\n",
+		       st->ss->name, dev);
+		goto err;
+	}
+	if (st->ss->copy_metadata(st, fl, fd) != 0) {
+		pr_err("Failed to copy metadata from %s to %s\n",
+		       fname, dev);
+		goto err;
+	}
+	if (c->verbose >= 0)
+		printf("%s restored from %s.\n", dev, fname);
+	close(fl);
+	close(fd);
+	free(fname);
+	return 0;
+
+err:
+	close(fd);
+	/* fl is still -1 if the dump file could not be opened */
+	if (fl >= 0)
+		close(fl);
+	free(fname);
+	return 1;
+}
diff --git a/Examine.c b/Examine.c
new file mode 100644
index 0000000..9574a3c
--- /dev/null
+++ b/Examine.c
@@ -0,0 +1,228 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2001-2013 Neil Brown <neilb@suse.de>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email: <neilb@suse.de>
+ */
+
+#include "mdadm.h"
+#include "dlink.h"
+
+#if ! defined(__BIG_ENDIAN) && ! defined(__LITTLE_ENDIAN)
+#error no endian defined
+#endif
+#include "md_u.h"
+#include "md_p.h"
+int Examine(struct mddev_dev *devlist,
+	    struct context *c,
+	    struct supertype *forcest)
+{
+
+	/* Read the raid superblock from each device in devlist and
+	 * display important content.
+	 *
+	 * If it cannot be found, print reason: too small, bad magic
+	 *
+	 * Print:
+	 *   version, ctime, level, size, raid+spare+
+	 *   preferred minor
+	 *   uuid
+	 *
+	 *   utime, state etc
+	 *
+	 * If (brief) gather devices for same array and just print a mdadm.conf
+	 * line including devices=
+	 *
+	 * If forcest is given, a private copy of it (dup_super) is used for
+	 * every device; forcest itself is never modified or freed here.
+	 *
+	 * Returns 0, or 1 if a device could not be opened (and !c->scan)
+	 * or no superblock was detected on a device (and !c->brief).
+	 *
+	 * Every supertype allocated here is released before returning:
+	 * either straight after it has been printed, or - in brief mode -
+	 * once the array it represents has been listed.
+	 */
+	int fd;
+	int rv = 0;
+
+	struct array {
+		struct supertype *st;
+		struct mdinfo info;
+		void *devs;
+		struct array *next;
+		int spares;
+	} *arrays = NULL;
+
+	for (; devlist ; devlist = devlist->next) {
+		struct supertype *st;
+		int have_container = 0;
+		int err = 0;
+		int container = 0;
+
+		fd = dev_open(devlist->devname, O_RDONLY);
+		if (fd < 0) {
+			if (!c->scan) {
+				pr_err("cannot open %s: %s\n",
+				       devlist->devname, strerror(errno));
+				rv = 1;
+			}
+			continue;
+		}
+
+		if (forcest)
+			st = dup_super(forcest);
+		else if (must_be_container(fd)) {
+			/* might be a container */
+			st = super_by_fd(fd, NULL);
+			container = 1;
+		} else
+			st = guess_super(fd);
+		if (st) {
+			err = 1;
+			st->ignore_hw_compat = 1;
+			if (!container)
+				err = st->ss->load_super(st, fd,
+							 (c->brief||c->scan) ? NULL
+							 :devlist->devname);
+			/* fall back to loading the device as a container */
+			if (err && st->ss->load_container) {
+				err = st->ss->load_container(st, fd,
+							     (c->brief||c->scan) ? NULL
+							     :devlist->devname);
+				if (!err)
+					have_container = 1;
+			}
+			st->ignore_hw_compat = 0;
+		} else {
+			if (!c->brief) {
+				pr_err("No md superblock detected on %s.\n", devlist->devname);
+				rv = 1;
+			}
+			err = 1;
+		}
+		close(fd);
+
+		if (err) {
+			if (st) {
+				st->ss->free_super(st);
+				free(st);
+			}
+			continue;
+		}
+
+		if (c->SparcAdjust)
+			st->ss->update_super(st, NULL, "sparc2.2",
+					     devlist->devname, 0, 0, NULL);
+		/* Ok, its good enough to try, though the checksum could be wrong */
+
+		if (c->brief && st->ss->brief_examine_super == NULL) {
+			if (!c->scan)
+				pr_err("No brief listing for %s on %s\n",
+				       st->ss->name, devlist->devname);
+			st->ss->free_super(st);
+			free(st);
+		} else if (c->brief) {
+			struct array *ap;
+			char *d;
+			for (ap = arrays; ap; ap = ap->next) {
+				if (st->ss == ap->st->ss &&
+				    st->ss->compare_super(ap->st, st, 0) == 0)
+					break;
+			}
+			if (!ap) {
+				/* first device seen for this array: the new
+				 * record takes ownership of st
+				 */
+				ap = xmalloc(sizeof(*ap));
+				ap->devs = dl_head();
+				ap->next = arrays;
+				ap->spares = 0;
+				ap->st = st;
+				arrays = ap;
+			}
+			st->ss->getinfo_super(st, &ap->info, NULL);
+			if (!have_container &&
+			    !(ap->info.disk.state & (1<<MD_DISK_SYNC)))
+				ap->spares++;
+			d = dl_strdup(devlist->devname);
+			dl_add(ap->devs, d);
+			if (ap->st != st) {
+				/* array already known: this copy is not kept */
+				st->ss->free_super(st);
+				free(st);
+			}
+		} else if (c->export) {
+			if (st->ss->export_examine_super)
+				st->ss->export_examine_super(st);
+			st->ss->free_super(st);
+			free(st);
+		} else {
+			printf("%s:\n",devlist->devname);
+			st->ss->examine_super(st, c->homehost);
+			st->ss->free_super(st);
+			free(st);
+		}
+	}
+	if (c->brief) {
+		struct array *ap, *next;
+		for (ap = arrays; ap; ap = next) {
+			char sep='=';
+			char *d;
+			int newline = 0;
+
+			/* remember the successor now; ap is freed below */
+			next = ap->next;
+
+			ap->st->ss->brief_examine_super(ap->st, c->verbose > 0);
+			if (ap->spares && !ap->st->ss->external)
+				newline += printf("	spares=%d", ap->spares);
+			if (c->verbose > 0) {
+				newline += printf("	devices");
+				for (d = dl_next(ap->devs);
+				     d != ap->devs;
+				     d=dl_next(d)) {
+					printf("%c%s", sep, d);
+					sep=',';
+				}
+			}
+			if (ap->st->ss->brief_examine_subarrays) {
+				if (newline)
+					printf("\n");
+				ap->st->ss->brief_examine_subarrays(ap->st, c->verbose);
+			}
+			ap->st->ss->free_super(ap->st);
+			free(ap->st);
+			if (ap->spares || c->verbose > 0)
+				printf("\n");
+			/* FIXME the device-name list in ap->devs is still
+			 * not released
+			 */
+			free(ap);
+		}
+	}
+	return rv;
+}
+
+int ExamineBadblocks(char *devname, int brief, struct supertype *forcest)
+{
+	/* Load the superblock from devname and hand it to the metadata
+	 * handler's examine_badblocks method.
+	 *
+	 * If forcest is NULL the metadata type is guessed from the device.
+	 * A supertype guessed here is freed before returning; a supertype
+	 * passed in by the caller only has its loaded superblock released
+	 * and remains owned by the caller.
+	 *
+	 * Returns 1 if the device cannot be opened, no superblock is
+	 * detected, or the metadata has no examine_badblocks method;
+	 * otherwise the result of load_super / examine_badblocks.
+	 */
+	int fd = dev_open(devname, O_RDONLY);
+	struct supertype *st = forcest;
+	int err = 1;
+
+	if (fd < 0) {
+		pr_err("cannot open %s: %s\n", devname, strerror(errno));
+		return 1;
+	}
+	if (!st)
+		st = guess_super(fd);
+	if (!st) {
+		if (!brief)
+			pr_err("No md superblock detected on %s\n", devname);
+		goto out;
+	}
+	if (!st->ss->examine_badblocks) {
+		pr_err("%s metadata does not support badblocks\n", st->ss->name);
+		goto out;
+	}
+	err = st->ss->load_super(st, fd, brief ? NULL : devname);
+	if (err)
+		goto out;
+	err = st->ss->examine_badblocks(st, fd, devname);
+
+out:
+	close(fd);
+	if (st) {
+		st->ss->free_super(st);
+		/* only free what was allocated here, never the caller's */
+		if (st != forcest)
+			free(st);
+	}
+	return err;
+}
diff --git a/Grow.c b/Grow.c
new file mode 100644
index 0000000..9c6fc95
--- /dev/null
+++ b/Grow.c
@@ -0,0 +1,5229 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2001-2013 Neil Brown <neilb@suse.de>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email: <neilb@suse.de>
+ */
+#include "mdadm.h"
+#include "dlink.h"
+#include <sys/mman.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <signal.h>
+#include <sys/wait.h>
+
+#if ! defined(__BIG_ENDIAN) && ! defined(__LITTLE_ENDIAN)
+#error no endian defined
+#endif
+#include "md_u.h"
+#include "md_p.h"
+
+int restore_backup(struct supertype *st,
+		   struct mdinfo *content,
+		   int working_disks,
+		   int next_spare,
+		   char **backup_filep,
+		   int verbose)
+{
+	/* Open every device listed in content->devs and try to restore the
+	 * critical section of an interrupted reshape.
+	 *
+	 * Devices with a raid_disk slot are stored at that index of the
+	 * descriptor table; all others are appended from index next_spare
+	 * onwards.  The table holds next_spare + working_disks entries, and
+	 * a device that would fall outside it is skipped rather than
+	 * written past the end of the allocation.
+	 *
+	 * If *backup_filep is NULL it is filled in from locate_backup().
+	 *
+	 * Returns 0 on success, 1 if the restore failed.
+	 */
+	int i;
+	int *fdlist;
+	struct mdinfo *dev;
+	int err;
+	int disk_count = next_spare + working_disks;
+	char *backup_file = *backup_filep;
+
+	dprintf("Called restore_backup()\n");
+	fdlist = xmalloc(sizeof(int) * disk_count);
+
+	enable_fds(next_spare);
+	for (i = 0; i < next_spare; i++)
+		fdlist[i] = -1;
+	for (dev = content->devs; dev; dev = dev->next) {
+		/* large enough for any two ints, ':' and the terminator */
+		char buf[32];
+		int fd;
+
+		snprintf(buf, sizeof(buf), "%d:%d",
+			 dev->disk.major, dev->disk.minor);
+		fd = dev_open(buf, O_RDWR);
+
+		if (dev->disk.raid_disk >= 0) {
+			if (dev->disk.raid_disk < disk_count)
+				fdlist[dev->disk.raid_disk] = fd;
+			else if (fd >= 0)
+				close(fd);
+		} else {
+			if (next_spare < disk_count)
+				fdlist[next_spare++] = fd;
+			else if (fd >= 0)
+				close(fd);
+		}
+	}
+
+	if (!backup_file) {
+		backup_file = locate_backup(content->sys_name);
+		*backup_filep = backup_file;
+	}
+
+	if (st->ss->external && st->ss->recover_backup)
+		err = st->ss->recover_backup(st, content);
+	else
+		err = Grow_restart(st, content, fdlist, next_spare,
+				   backup_file, verbose > 0);
+
+	while (next_spare > 0) {
+		next_spare--;
+		if (fdlist[next_spare] >= 0)
+			close(fdlist[next_spare]);
+	}
+	free(fdlist);
+	if (err) {
+		pr_err("Failed to restore critical section for reshape - sorry.\n");
+		if (!backup_file)
+			pr_err("Possibly you need to specify a --backup-file\n");
+		return 1;
+	}
+
+	dprintf("restore_backup() returns status OK.\n");
+	return 0;
+}
+
+int Grow_Add_device(char *devname, int fd, char *newdev)
+{
+	/* Add a device to an active array.
+	 * Currently, just extend a linear array.
+	 * This requires writing a new superblock on the
+	 * new device, calling the kernel to add the device,
+	 * and if that succeeds, update the superblock on
+	 * all other devices.
+	 * This means that we need to *find* all other devices.
+	 *
+	 * devname is only used in messages, fd is the open array and
+	 * newdev the path of the device to add.
+	 * fd belongs to the caller and is never closed here.
+	 *
+	 * Returns 0 on success, 1 on any failure.
+	 */
+	struct mdinfo info;
+
+	dev_t rdev;
+	int nfd = -1, fd2 = -1;
+	int d, nd;
+	int rv = 1;
+	struct supertype *st = NULL;
+	char *subarray = NULL;
+
+	if (md_get_array_info(fd, &info.array) < 0) {
+		pr_err("cannot get array info for %s\n", devname);
+		return 1;
+	}
+
+	if (info.array.level != -1) {
+		pr_err("can only add devices to linear arrays\n");
+		return 1;
+	}
+
+	st = super_by_fd(fd, &subarray);
+	if (!st) {
+		pr_err("cannot handle arrays with superblock version %d\n",
+		       info.array.major_version);
+		return 1;
+	}
+
+	if (subarray) {
+		pr_err("Cannot grow linear sub-arrays yet\n");
+		free(subarray);
+		goto out;
+	}
+
+	nfd = open(newdev, O_RDWR|O_EXCL|O_DIRECT);
+	if (nfd < 0) {
+		pr_err("cannot open %s\n", newdev);
+		goto out;
+	}
+	if (!fstat_is_blkdev(nfd, newdev, &rdev))
+		goto out;
+	/* now check out all the devices and make sure we can read the
+	 * superblock */
+	for (d=0 ; d < info.array.raid_disks ; d++) {
+		mdu_disk_info_t disk;
+		char *dv;
+
+		st->ss->free_super(st);
+
+		disk.number = d;
+		if (md_get_disk_info(fd, &disk) < 0) {
+			pr_err("cannot get device detail for device %d\n", d);
+			goto out;
+		}
+		dv = map_dev(disk.major, disk.minor, 1);
+		if (!dv) {
+			pr_err("cannot find device file for device %d\n", d);
+			goto out;
+		}
+		fd2 = dev_open(dv, O_RDWR);
+		if (fd2 < 0) {
+			pr_err("cannot open device file %s\n", dv);
+			goto out;
+		}
+
+		if (st->ss->load_super(st, fd2, NULL)) {
+			pr_err("cannot find super block on %s\n", dv);
+			goto out;
+		}
+		close(fd2);
+		fd2 = -1;
+	}
+	/* Ok, looks good. Lets update the superblock and write it out to
+	 * newdev.  d now equals the old raid_disks, i.e. the slot of the
+	 * new device.
+	 */
+
+	info.disk.number = d;
+	info.disk.major = major(rdev);
+	info.disk.minor = minor(rdev);
+	info.disk.raid_disk = d;
+	info.disk.state = (1 << MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE);
+	if (st->ss->update_super(st, &info, "linear-grow-new", newdev,
+				 0, 0, NULL) != 0) {
+		pr_err("Preparing new metadata failed on %s\n", newdev);
+		goto out;
+	}
+
+	if (st->ss->store_super(st, nfd)) {
+		pr_err("Cannot store new superblock on %s\n", newdev);
+		goto out;
+	}
+	close(nfd);
+	nfd = -1;
+
+	if (ioctl(fd, ADD_NEW_DISK, &info.disk) != 0) {
+		pr_err("Cannot add new disk to this array\n");
+		goto out;
+	}
+	/* Well, that seems to have worked.
+	 * Now go through and update all superblocks
+	 */
+
+	if (md_get_array_info(fd, &info.array) < 0) {
+		pr_err("cannot get array info for %s\n", devname);
+		goto out;
+	}
+
+	nd = d;
+	for (d=0 ; d < info.array.raid_disks ; d++) {
+		mdu_disk_info_t disk;
+		char *dv;
+
+		disk.number = d;
+		if (md_get_disk_info(fd, &disk) < 0) {
+			pr_err("cannot get device detail for device %d\n", d);
+			goto out;
+		}
+		dv = map_dev(disk.major, disk.minor, 1);
+		if (!dv) {
+			pr_err("cannot find device file for device %d\n", d);
+			goto out;
+		}
+		fd2 = dev_open(dv, O_RDWR);
+		if (fd2 < 0) {
+			pr_err("cannot open device file %s\n", dv);
+			goto out;
+		}
+		if (st->ss->load_super(st, fd2, NULL)) {
+			pr_err("cannot find super block on %s\n", dv);
+			goto out;
+		}
+		info.array.raid_disks = nd+1;
+		info.array.nr_disks = nd+1;
+		info.array.active_disks = nd+1;
+		info.array.working_disks = nd+1;
+
+		if (st->ss->update_super(st, &info, "linear-grow-update", dv,
+					 0, 0, NULL) != 0) {
+			pr_err("Updating metadata failed on %s\n", dv);
+			goto out;
+		}
+
+		if (st->ss->store_super(st, fd2)) {
+			pr_err("Cannot store new superblock on %s\n", dv);
+			goto out;
+		}
+		close(fd2);
+		fd2 = -1;
+	}
+
+	rv = 0;
+out:
+	/* single exit: release whatever is still held */
+	if (fd2 >= 0)
+		close(fd2);
+	if (nfd >= 0)
+		close(nfd);
+	st->ss->free_super(st);
+	free(st);
+	return rv;
+}
+
+int Grow_addbitmap(char *devname, int fd, struct context *c, struct shape *s)
+{
+	/*
+	 * First check that array doesn't have a bitmap
+	 * Then create the bitmap
+	 * Then add it
+	 *
+	 * For internal bitmaps, we need to check the version,
+	 * find all the active devices, and write the bitmap block
+	 * to all devices
+	 *
+	 * s->bitmap_file selects the operation: "none" removes an existing
+	 * bitmap, "internal" / "clustered" add an internal bitmap, anything
+	 * else is taken as the path of an external bitmap file.
+	 *
+	 * Returns 0 on success, 1 on failure.
+	 */
+	mdu_bitmap_file_t bmf;
+	mdu_array_info_t array;
+	struct supertype *st;
+	char *subarray = NULL;
+	int major = BITMAP_MAJOR_HI;
+	unsigned long long bitmapsize, array_size;
+	struct mdinfo *mdi = NULL;
+	int ret = 1;
+
+	/*
+	 * We only ever get called if s->bitmap_file is != NULL, so this check
+	 * is just here to quiet down static code checkers.
+	 */
+	if (!s->bitmap_file)
+		return 1;
+
+	if (strcmp(s->bitmap_file, "clustered") == 0)
+		major = BITMAP_MAJOR_CLUSTERED;
+
+	if (ioctl(fd, GET_BITMAP_FILE, &bmf) != 0) {
+		if (errno == ENOMEM)
+			pr_err("Memory allocation failure.\n");
+		else
+			pr_err("bitmaps not supported by this kernel.\n");
+		return 1;
+	}
+	if (bmf.pathname[0]) {
+		if (strcmp(s->bitmap_file,"none") == 0) {
+			if (ioctl(fd, SET_BITMAP_FILE, -1) != 0) {
+				pr_err("failed to remove bitmap %s\n",
+					bmf.pathname);
+				return 1;
+			}
+			return 0;
+		}
+		pr_err("%s already has a bitmap (%s)\n", devname, bmf.pathname);
+		return 1;
+	}
+	if (md_get_array_info(fd, &array) != 0) {
+		pr_err("cannot get array status for %s\n", devname);
+		return 1;
+	}
+	if (array.state & (1 << MD_SB_BITMAP_PRESENT)) {
+		if (strcmp(s->bitmap_file, "none")==0) {
+			array.state &= ~(1 << MD_SB_BITMAP_PRESENT);
+			if (md_set_array_info(fd, &array) != 0) {
+				if (array.state & (1 << MD_SB_CLUSTERED))
+					pr_err("failed to remove clustered bitmap.\n");
+				else
+					pr_err("failed to remove internal bitmap.\n");
+				return 1;
+			}
+			return 0;
+		}
+		pr_err("bitmap already present on %s\n", devname);
+		return 1;
+	}
+
+	if (strcmp(s->bitmap_file, "none") == 0) {
+		pr_err("no bitmap found on %s\n", devname);
+		return 1;
+	}
+	if (array.level <= 0) {
+		pr_err("Bitmaps not meaningful with level %s\n",
+			map_num(pers, array.level)?:"of this array");
+		return 1;
+	}
+	bitmapsize = array.size;
+	bitmapsize <<= 1;
+	if (get_dev_size(fd, NULL, &array_size) &&
+	    array_size > (0x7fffffffULL << 9)) {
+		/* Array is big enough that we cannot trust array.size
+		 * try other approaches
+		 */
+		bitmapsize = get_component_size(fd);
+	}
+	if (bitmapsize == 0) {
+		pr_err("Cannot reliably determine size of array to create bitmap - sorry.\n");
+		return 1;
+	}
+
+	if (array.level == 10) {
+		int ncopies;
+
+		ncopies = (array.layout & 255) * ((array.layout >> 8) & 255);
+		bitmapsize = bitmapsize * array.raid_disks / ncopies;
+
+		if (strcmp(s->bitmap_file, "clustered") == 0 &&
+		    !is_near_layout_10(array.layout)) {
+			pr_err("only near layout is supported with clustered raid10\n");
+			return 1;
+		}
+	}
+
+	st = super_by_fd(fd, &subarray);
+	if (!st) {
+		pr_err("Cannot understand version %d.%d\n",
+			array.major_version, array.minor_version);
+		return 1;
+	}
+	/* From here on st (and possibly mdi) is held, so every failure
+	 * leaves through 'out' where they are released.
+	 */
+	if (subarray) {
+		pr_err("Cannot add bitmaps to sub-arrays yet\n");
+		free(subarray);
+		goto out;
+	}
+
+	mdi = sysfs_read(fd, NULL, GET_CONSISTENCY_POLICY);
+	if (mdi) {
+		if (mdi->consistency_policy == CONSISTENCY_POLICY_PPL) {
+			pr_err("Cannot add bitmap to array with PPL\n");
+			goto out;
+		}
+		free(mdi);
+		mdi = NULL;
+	}
+
+	if (strcmp(s->bitmap_file, "internal") == 0 ||
+	    strcmp(s->bitmap_file, "clustered") == 0) {
+		int rv;
+		int d;
+		int offset_setable = 0;
+		if (st->ss->add_internal_bitmap == NULL) {
+			pr_err("Internal bitmaps not supported with %s metadata\n", st->ss->name);
+			goto out;
+		}
+		st->nodes = c->nodes;
+		st->cluster_name = c->homecluster;
+		mdi = sysfs_read(fd, NULL, GET_BITMAP_LOCATION);
+		if (mdi)
+			offset_setable = 1;
+		for (d = 0; d < st->max_devs; d++) {
+			mdu_disk_info_t disk;
+			char *dv;
+			int fd2;
+
+			disk.number = d;
+			if (md_get_disk_info(fd, &disk) < 0)
+				continue;
+			if (disk.major == 0 && disk.minor == 0)
+				continue;
+			if ((disk.state & (1 << MD_DISK_SYNC)) == 0)
+				continue;
+			dv = map_dev(disk.major, disk.minor, 1);
+			if (!dv)
+				continue;
+			/* refuse only when the disk IS write-mostly */
+			if ((disk.state & (1 << MD_DISK_WRITEMOSTLY)) &&
+			    (strcmp(s->bitmap_file, "clustered") == 0)) {
+				pr_err("%s disks marked write-mostly are not supported with clustered bitmap\n",devname);
+				goto out;
+			}
+			fd2 = dev_open(dv, O_RDWR);
+			if (fd2 < 0)
+				continue;
+			rv = st->ss->load_super(st, fd2, NULL);
+			if (!rv) {
+				rv = st->ss->add_internal_bitmap(
+					st, &s->bitmap_chunk, c->delay,
+					s->write_behind, bitmapsize,
+					offset_setable, major);
+				if (!rv) {
+					st->ss->write_bitmap(st, fd2,
+							     NodeNumUpdate);
+				} else {
+					pr_err("failed to create internal bitmap - chunksize problem.\n");
+				}
+			} else {
+				pr_err("failed to load super-block.\n");
+			}
+			close(fd2);
+			if (rv)
+				goto out;
+		}
+		if (offset_setable) {
+			st->ss->getinfo_super(st, mdi, NULL);
+			if (sysfs_init(mdi, fd, NULL)) {
+				pr_err("failed to initialize sysfs.\n");
+				/* mdi is unusable: give up instead of
+				 * touching it again
+				 */
+				goto out;
+			}
+			rv = sysfs_set_num_signed(mdi, NULL, "bitmap/location",
+						  mdi->bitmap_offset);
+			/* mdi is released at 'out', after errno was checked */
+		} else {
+			if (strcmp(s->bitmap_file, "clustered") == 0)
+				array.state |= (1 << MD_SB_CLUSTERED);
+			array.state |= (1 << MD_SB_BITMAP_PRESENT);
+			rv = md_set_array_info(fd, &array);
+		}
+		if (rv < 0) {
+			if (errno == EBUSY)
+				pr_err("Cannot add bitmap while array is resyncing or reshaping etc.\n");
+			pr_err("failed to set internal bitmap.\n");
+			goto out;
+		}
+	} else {
+		int uuid[4];
+		int bitmap_fd;
+		int d;
+		int max_devs = st->max_devs;
+
+		/* try to load a superblock */
+		for (d = 0; d < max_devs; d++) {
+			mdu_disk_info_t disk;
+			char *dv;
+			int fd2;
+			disk.number = d;
+			if (md_get_disk_info(fd, &disk) < 0)
+				continue;
+			if ((disk.major==0 && disk.minor == 0) ||
+			    (disk.state & (1 << MD_DISK_REMOVED)))
+				continue;
+			dv = map_dev(disk.major, disk.minor, 1);
+			if (!dv)
+				continue;
+			fd2 = dev_open(dv, O_RDONLY);
+			if (fd2 >= 0) {
+				if (st->ss->load_super(st, fd2, NULL) == 0) {
+					close(fd2);
+					st->ss->uuid_from_super(st, uuid);
+					break;
+				}
+				close(fd2);
+			}
+		}
+		if (d == max_devs) {
+			pr_err("cannot find UUID for array!\n");
+			goto out;
+		}
+		if (CreateBitmap(s->bitmap_file, c->force, (char*)uuid,
+				 s->bitmap_chunk, c->delay, s->write_behind,
+				 bitmapsize, major)) {
+			goto out;
+		}
+		bitmap_fd = open(s->bitmap_file, O_RDWR);
+		if (bitmap_fd < 0) {
+			pr_err("weird: %s cannot be opened\n", s->bitmap_file);
+			goto out;
+		}
+		if (ioctl(fd, SET_BITMAP_FILE, bitmap_fd) < 0) {
+			int err = errno;
+			if (err == EBUSY)
+				pr_err("Cannot add bitmap while array is resyncing or reshaping etc.\n");
+			pr_err("Cannot set bitmap file for %s: %s\n",
+				devname, strerror(err));
+			close(bitmap_fd);
+			goto out;
+		}
+	}
+
+	ret = 0;
+out:
+	free(mdi);
+	st->ss->free_super(st);
+	free(st);
+	return ret;
+}
+
+int Grow_consistency_policy(char *devname, int fd, struct context *c, struct shape *s)
+{
+	/* Switch the consistency policy of the array open on fd between
+	 * "resync" and "ppl", as requested in s->consistency_policy.
+	 *
+	 * Only level 5 arrays are accepted, and PPL is refused while a
+	 * reshape is running.  When enabling PPL, the per-device
+	 * ppl_sector / ppl_size attributes are set and an initial PPL is
+	 * written to every member device before the array-wide
+	 * consistency_policy attribute is changed.
+	 *
+	 * Returns 0 on success, non-zero on failure.
+	 */
+	struct supertype *st;
+	struct mdinfo *sra;
+	struct mdinfo *sd;
+	char *subarray = NULL;
+	int ret = 0;
+	char container_dev[PATH_MAX];
+	char buf[20];
+
+	if (s->consistency_policy != CONSISTENCY_POLICY_RESYNC &&
+	    s->consistency_policy != CONSISTENCY_POLICY_PPL) {
+		pr_err("Operation not supported for consistency policy %s\n",
+		       map_num(consistency_policies, s->consistency_policy));
+		return 1;
+	}
+
+	st = super_by_fd(fd, &subarray);
+	if (!st)
+		return 1;
+
+	sra = sysfs_read(fd, NULL, GET_CONSISTENCY_POLICY|GET_LEVEL|
+				   GET_DEVS|GET_STATE);
+	if (!sra) {
+		ret = 1;
+		goto free_st;
+	}
+
+	if (s->consistency_policy == CONSISTENCY_POLICY_PPL &&
+	    !st->ss->write_init_ppl) {
+		pr_err("%s metadata does not support PPL\n", st->ss->name);
+		ret = 1;
+		goto free_info;
+	}
+
+	if (sra->array.level != 5) {
+		pr_err("Operation not supported for array level %d\n",
+				sra->array.level);
+		ret = 1;
+		goto free_info;
+	}
+
+	if (sra->consistency_policy == (unsigned)s->consistency_policy) {
+		pr_err("Consistency policy is already %s\n",
+		       map_num(consistency_policies, s->consistency_policy));
+		ret = 1;
+		goto free_info;
+	} else if (sra->consistency_policy != CONSISTENCY_POLICY_RESYNC &&
+		   sra->consistency_policy != CONSISTENCY_POLICY_PPL) {
+		pr_err("Current consistency policy is %s, cannot change to %s\n",
+		       map_num(consistency_policies, sra->consistency_policy),
+		       map_num(consistency_policies, s->consistency_policy));
+		ret = 1;
+		goto free_info;
+	}
+
+	if (s->consistency_policy == CONSISTENCY_POLICY_PPL) {
+		if (sysfs_get_str(sra, NULL, "sync_action", buf, 20) <= 0) {
+			ret = 1;
+			goto free_info;
+		} else if (strcmp(buf, "reshape\n") == 0) {
+			pr_err("PPL cannot be enabled when reshape is in progress\n");
+			ret = 1;
+			goto free_info;
+		}
+	}
+
+	if (subarray) {
+		char *update;
+
+		if (s->consistency_policy == CONSISTENCY_POLICY_PPL)
+			update = "ppl";
+		else
+			update = "no-ppl";
+
+		snprintf(container_dev, sizeof(container_dev), "/dev/%s",
+			 st->container_devnm);
+
+		ret = Update_subarray(container_dev, subarray, update, NULL,
+				      c->verbose);
+		if (ret)
+			goto free_info;
+	}
+
+	if (s->consistency_policy == CONSISTENCY_POLICY_PPL) {
+		struct mdinfo info;
+
+		if (subarray) {
+			struct mdinfo *mdi;
+			int cfd;
+
+			cfd = open(container_dev, O_RDWR|O_EXCL);
+			if (cfd < 0) {
+				pr_err("Failed to open %s\n", container_dev);
+				ret = 1;
+				goto free_info;
+			}
+
+			ret = st->ss->load_container(st, cfd, st->container_devnm);
+			close(cfd);
+
+			if (ret) {
+				pr_err("Cannot read superblock for %s\n",
+				       container_dev);
+				goto free_info;
+			}
+
+			mdi = st->ss->container_content(st, subarray);
+			if (!mdi) {
+				/* nothing to copy the PPL geometry from */
+				pr_err("Cannot read array details for %s\n",
+				       container_dev);
+				ret = 1;
+				goto free_info;
+			}
+			info = *mdi;
+			free(mdi);
+		}
+
+		for (sd = sra->devs; sd; sd = sd->next) {
+			int dfd;
+			char *devpath;
+
+			devpath = map_dev(sd->disk.major, sd->disk.minor, 0);
+			dfd = dev_open(devpath, O_RDWR);
+			if (dfd < 0) {
+				/* devpath may be NULL when no name is known */
+				pr_err("Failed to open %s\n",
+				       devpath ? devpath : "-unknown-");
+				ret = 1;
+				goto free_info;
+			}
+
+			if (!subarray) {
+				/* native metadata: PPL geometry comes from
+				 * each device's own superblock
+				 */
+				ret = st->ss->load_super(st, dfd, NULL);
+				if (ret) {
+					pr_err("Failed to load super-block.\n");
+					close(dfd);
+					goto free_info;
+				}
+
+				ret = st->ss->update_super(st, sra, "ppl",
+							   devname,
+							   c->verbose, 0, NULL);
+				if (ret) {
+					close(dfd);
+					st->ss->free_super(st);
+					goto free_info;
+				}
+				st->ss->getinfo_super(st, &info, NULL);
+			}
+
+			ret |= sysfs_set_num(sra, sd, "ppl_sector",
+					     info.ppl_sector);
+			ret |= sysfs_set_num(sra, sd, "ppl_size",
+					     info.ppl_size);
+
+			if (ret) {
+				pr_err("Failed to set PPL attributes for %s\n",
+				       sd->sys_name);
+				close(dfd);
+				st->ss->free_super(st);
+				goto free_info;
+			}
+
+			ret = st->ss->write_init_ppl(st, &info, dfd);
+			if (ret)
+				pr_err("Failed to write PPL\n");
+
+			close(dfd);
+
+			if (!subarray)
+				st->ss->free_super(st);
+
+			if (ret)
+				goto free_info;
+		}
+	}
+
+	ret = sysfs_set_str(sra, NULL, "consistency_policy",
+			    map_num(consistency_policies,
+				    s->consistency_policy));
+	if (ret)
+		pr_err("Failed to change array consistency policy\n");
+
+free_info:
+	sysfs_free(sra);
+free_st:
+	free(st);
+	free(subarray);
+
+	return ret;
+}
+
+/*
+ * When reshaping an array we might need to backup some data.
+ * This is written to all spares with a 'super_block' describing it.
+ * The superblock goes 4K from the end of the used space on the
+ * device.
+ * It is written after the backup is complete.
+ * It has the following structure.
+ */
+
+static struct mdp_backup_super {
+ char magic[16]; /* md_backup_data-1 or -2 */
+ __u8 set_uuid[16];
+ __u64 mtime;
+ /* start/sizes in 512byte sectors */
+ __u64 devstart; /* address on backup device/file of data */
+ __u64 arraystart;
+ __u64 length;
+ __u32 sb_csum; /* csum of preceding bytes. */
+ __u32 pad1;
+ __u64 devstart2; /* offset in to data of second section */
+ __u64 arraystart2;
+ __u64 length2;
+ __u32 sb_csum2; /* csum of preceding bytes. */
+ __u8 pad[512-68-32]; /* fills the record to 512 bytes: 68 bytes up to and including sb_csum, 32 bytes from pad1 through sb_csum2 */
+} __attribute__((aligned(512))) bsb, bsb2;
+
+static __u32 bsb_csum(char *buf, int len)
+{
+	/* Checksum for the backup superblock, returned via
+	 * __cpu_to_le32().
+	 *
+	 * NOTE(review): every round folds in buf[0], not buf[i], so the
+	 * result depends only on the first byte and on len.  This looks
+	 * unintended, but changing it changes the value this function
+	 * returns for the same input - presumably stored alongside the
+	 * backup data; confirm the compatibility impact before "fixing" it.
+	 */
+	int csum = 0;
+	int remaining;
+
+	for (remaining = len; remaining > 0; remaining--)
+		csum = (csum<<3) + buf[0];
+	return __cpu_to_le32(csum);
+}
+
+static int check_idle(struct supertype *st)
+{
+	/* Scan mdstat for the members of this container (or of the
+	 * container this array belongs to).
+	 * Returns 1 if all of them are idle, 0 as soon as one is not.
+	 */
+	char *container = st->container_devnm;
+	struct mdstat_ent *list, *cur;
+	int busy = 0;
+
+	if (!container[0])
+		container = st->devnm;
+
+	list = mdstat_read(0, 0);
+	for (cur = list; cur && !busy; cur = cur->next) {
+		if (!is_container_member(cur, container))
+			continue;
+		/* frozen array is not idle*/
+		if (cur->percent >= 0 || cur->metadata_version[9] == '-')
+			busy = 1;
+	}
+	free_mdstat(list);
+	return !busy;
+}
+
+static int freeze_container(struct supertype *st)
+{
+	/* Block the monitor of the container once all members are idle.
+	 * Returns 1 on success, -1 if some member is not idle and -2 if
+	 * the monitor could not be blocked.
+	 */
+	char *container;
+
+	if (!check_idle(st))
+		return -1;
+
+	container = st->container_devnm[0] ? st->container_devnm : st->devnm;
+	if (!block_monitor(container, 1))
+		return 1;
+
+	pr_err("failed to freeze container\n");
+	return -2;
+}
+
+static void unfreeze_container(struct supertype *st)
+{
+	/* Unblock the monitor of the container this array lives in, or of
+	 * the array itself when it has no container name.
+	 */
+	if (st->container_devnm[0])
+		unblock_monitor(st->container_devnm, 1);
+	else
+		unblock_monitor(st->devnm, 1);
+}
+
+static int freeze(struct supertype *st)
+{
+	/* Try to freeze resync/rebuild on this array/container.
+	 * Return -1 if the array is busy,
+	 * return -2 container cannot be frozen,
+	 * return 0 if this kernel doesn't support 'frozen'
+	 * return 1 if it worked.
+	 */
+	struct mdinfo *sra;
+	char state[20];
+	int result;
+
+	if (st->ss->external)
+		return freeze_container(st);
+
+	sra = sysfs_read(-1, st->devnm, GET_VERSION);
+	if (sra == NULL)
+		return -1;
+
+	/* Need to clear any 'read-auto' status */
+	if (sysfs_get_str(sra, NULL, "array_state", state, 20) > 0 &&
+	    strncmp(state, "read-auto", 9) == 0)
+		sysfs_set_str(sra, NULL, "array_state", "clean");
+
+	result = sysfs_freeze_array(sra);
+	sysfs_free(sra);
+	return result;
+}
+
+static void unfreeze(struct supertype *st)
+{
+	/* Undo freeze(): unblock the container monitor for external
+	 * metadata, otherwise set sync_action back to "idle" if it
+	 * currently reads "frozen".
+	 */
+	struct mdinfo *sra;
+	char buf[20];
+
+	if (st->ss->external) {
+		/* a void function must not 'return <expression>' */
+		unfreeze_container(st);
+		return;
+	}
+
+	sra = sysfs_read(-1, st->devnm, GET_VERSION);
+	if (sra &&
+	    sysfs_get_str(sra, NULL, "sync_action", buf, 20) > 0 &&
+	    strcmp(buf, "frozen\n") == 0)
+		sysfs_set_str(sra, NULL, "sync_action", "idle");
+	sysfs_free(sra);
+}
+
+static void wait_reshape(struct mdinfo *sra)
+{
+	/* Block until sync_action no longer starts with "reshape" (or can
+	 * no longer be read).  Does nothing if the attribute cannot be
+	 * opened.
+	 */
+	char state[20];
+	int fd = sysfs_get_fd(sra, NULL, "sync_action");
+
+	if (fd < 0)
+		return;
+
+	for (;;) {
+		if (sysfs_fd_get_str(fd, state, 20) <= 0)
+			break;
+		if (strncmp(state, "reshape", 7) != 0)
+			break;
+		sysfs_wait(fd, NULL);
+	}
+	close(fd);
+}
+
+static int reshape_super(struct supertype *st, unsigned long long size,
+			 int level, int layout, int chunksize, int raid_disks,
+			 int delta_disks, char *backup_file, char *dev,
+			 int direction, int verbose)
+{
+	/* Forward a reshape request to the external metadata handler.
+	 * Native metadata has nothing extra to check and yields 0.
+	 * Returns 1 if the handler lacks reshape_super or manage_reshape,
+	 * otherwise whatever the handler's reshape_super returns.
+	 */
+	if (st->ss->external) {
+		if (st->ss->reshape_super && st->ss->manage_reshape)
+			return st->ss->reshape_super(st, size, level, layout,
+						     chunksize, raid_disks,
+						     delta_disks, backup_file,
+						     dev, direction, verbose);
+
+		pr_err("%s metadata does not support reshape\n",
+			st->ss->name);
+		return 1;
+	}
+
+	return 0;
+}
+
+static void sync_metadata(struct supertype *st)
+{
+	/* For external metadata only: flush queued updates when an update
+	 * queue is in use (and re-arm it), otherwise ask the handler to
+	 * sync directly.  Native metadata needs nothing here.
+	 */
+	if (!st->ss->external)
+		return;
+
+	if (!st->update_tail) {
+		st->ss->sync_metadata(st);
+		return;
+	}
+
+	flush_metadata_updates(st);
+	st->update_tail = &st->updates;
+}
+
+static int subarray_set_num(char *container, struct mdinfo *sra, char *name, int n)
+{
+	/* when dealing with external metadata subarrays we need to be
+	 * prepared to handle EAGAIN.  The kernel may need to wait for
+	 * mdmon to mark the array active so the kernel can handle
+	 * allocations/writeback when preparing the reshape action
+	 * (md_allow_write()).  We temporarily disable safe_mode_delay
+	 * to close a race with the array_state going clean before the
+	 * next write to raid_disks / stripe_cache_size
+	 */
+	char saved_delay[50];
+	int rc;
+
+	/* without a container this is a plain attribute write */
+	if (container == NULL)
+		return sysfs_set_num(sra, NULL, name, n);
+
+	/* only 'raid_disks' and 'stripe_cache_size' trigger md_allow_write */
+	if (strcmp(name, "raid_disks") != 0 &&
+	    strcmp(name, "stripe_cache_size") != 0)
+		return sysfs_set_num(sra, NULL, name, n);
+
+	if (sysfs_get_str(sra, NULL, "safe_mode_delay",
+			  saved_delay, sizeof(saved_delay)) <= 0)
+		return -1;
+
+	sysfs_set_num(sra, NULL, "safe_mode_delay", 0);
+	rc = sysfs_set_num(sra, NULL, name, n);
+	if (rc < 0 && errno == EAGAIN) {
+		ping_monitor(container);
+		/* if we get EAGAIN here then the monitor is not active
+		 * so stop trying
+		 */
+		rc = sysfs_set_num(sra, NULL, name, n);
+	}
+	/* put the original safe_mode_delay back whatever happened */
+	sysfs_set_str(sra, NULL, "safe_mode_delay", saved_delay);
+	return rc;
+}
+
+int start_reshape(struct mdinfo *sra, int already_running,
+		  int before_data_disks, int data_disks, struct supertype *st)
+{
+	/* Set up the suspend window and sync_min/sync_max for a reshape
+	 * and, unless one is already running, write "reshape" to
+	 * sync_action.  While that write fails with EBUSY it is retried,
+	 * one second apart, for up to five further attempts.
+	 * Returns 0 on success, otherwise the first failing status.
+	 */
+	unsigned long long sync_target;
+	int err;
+
+	sysfs_set_num(sra, NULL, "suspend_lo", 0x7FFFFFFFFFFFFFFFULL);
+	err = sysfs_set_num(sra, NULL, "suspend_hi", sra->reshape_progress);
+	if (!err)
+		err = sysfs_set_num(sra, NULL, "suspend_lo",
+				    sra->reshape_progress);
+
+	if (before_data_disks > data_disks)
+		sync_target = (sra->component_size * data_disks
+			       - sra->reshape_progress) / data_disks;
+	else
+		sync_target = sra->reshape_progress / data_disks;
+
+	if (!already_running)
+		sysfs_set_num(sra, NULL, "sync_min", sync_target);
+
+	if (st->ss->external) {
+		if (!err)
+			err = sysfs_set_num(sra, NULL, "sync_max",
+					    sync_target);
+	} else {
+		if (!err)
+			err = sysfs_set_str(sra, NULL, "sync_max", "max");
+	}
+
+	if (!already_running && err == 0) {
+		int retries_left = 5;
+
+		for (;;) {
+			err = sysfs_set_str(sra, NULL, "sync_action",
+					    "reshape");
+			if (!err)
+				break;
+			sleep(1);
+			if (errno != EBUSY)
+				break;
+			if (retries_left-- <= 0)
+				break;
+		}
+	}
+	return err;
+}
+
+void abort_reshape(struct mdinfo *sra)
+{
+	/*
+	 * Stop the sync thread, then reset the suspend window and sync_min.
+	 *
+	 * Prior to kernel commit: 23ddff3792f6 ("md: allow suspend_lo and
+	 * suspend_hi to decrease as well as increase.")
+	 * you could only increase suspend_{lo,hi} unless the region they
+	 * covered was empty.  So to reset to 0, you need to push suspend_lo
+	 * up past suspend_hi first.  So to maximize the chance of mdadm
+	 * working on all kernels, we want to keep doing that.
+	 *
+	 * It isn't safe to reset sync_max as we aren't monitoring.
+	 * Array really should be stopped at this point.
+	 */
+	static const struct {
+		char *attr;
+		unsigned long long val;
+	} reset_steps[] = {
+		{ "suspend_lo", 0x7FFFFFFFFFFFFFFFULL },
+		{ "suspend_hi", 0 },
+		{ "suspend_lo", 0 },
+		{ "sync_min", 0 },
+	};
+	unsigned int i;
+
+	sysfs_set_str(sra, NULL, "sync_action", "idle");
+	/* the order of these writes matters, see above */
+	for (i = 0; i < sizeof(reset_steps) / sizeof(reset_steps[0]); i++)
+		sysfs_set_num(sra, NULL, reset_steps[i].attr,
+			      reset_steps[i].val);
+}
+
+int remove_disks_for_takeover(struct supertype *st,
+			      struct mdinfo *sra,
+			      int layout)
+{
+	/* Keep one in-sync device out of every group of mirror copies and
+	 * drop all the others from the array.
+	 *
+	 * On success (0) sra->devs holds the kept devices followed by the
+	 * removed ones, the latter flagged MD_DISK_REMOVED.
+	 * On failure (1) nothing is removed; sra->devs still holds every
+	 * device, though possibly in a different order.
+	 */
+	struct mdinfo *leftover;
+	struct mdinfo *sd, *following;
+	int copies;
+	int first;
+
+	if (st->ss->external) {
+		/*
+		 * container_content returns the list of arrays in the
+		 * container.  A second entry means there are 2 arrays in
+		 * the container and the operation must be blocked.
+		 */
+		struct mdinfo *members = st->ss->container_content(st, NULL);
+
+		if (members) {
+			int multiple = (members->next != NULL);
+
+			sysfs_free(members);
+			if (multiple) {
+				pr_err("Error. Cannot perform operation on /dev/%s\n", st->devnm);
+				pr_err("For this operation it MUST be single array in container\n");
+				return 1;
+			}
+		}
+	}
+
+	switch (sra->array.level) {
+	case 10:
+		copies = layout & 0xff;
+		break;
+	case 1:
+		copies = sra->array.raid_disks;
+		break;
+	default:
+		return 1;
+	}
+
+	leftover = sra->devs;
+	sra->devs = NULL;
+	/* for each 'copy', select one device and remove from the list. */
+	for (first = 0; first < sra->array.raid_disks; first += copies) {
+		struct mdinfo **link = &leftover;
+		struct mdinfo *keep = NULL;
+
+		/* Find a working device to keep */
+		while (*link) {
+			struct mdinfo *cand = *link;
+			int in_group = cand->disk.raid_disk >= first &&
+				       cand->disk.raid_disk < first + copies;
+			int unusable = cand->disk.state &
+				       ((1<<MD_DISK_REMOVED) |
+					(1<<MD_DISK_FAULTY));
+			int in_sync = cand->disk.state & (1<<MD_DISK_SYNC);
+
+			if (in_group && !unusable && in_sync) {
+				keep = cand;
+				break;
+			}
+			link = &cand->next;
+		}
+		if (!keep)
+			break;
+
+		/* We have found a good disk to use! */
+		*link = keep->next;
+		keep->next = sra->devs;
+		sra->devs = keep;
+	}
+
+	if (first < sra->array.raid_disks) {
+		/* didn't find all slots: glue the two lists back together */
+		struct mdinfo **tail = &leftover;
+
+		while (*tail)
+			tail = &(*tail)->next;
+		*tail = sra->devs;
+		sra->devs = leftover;
+		return 1;
+	}
+
+	/* Remove all 'leftover' devices from the array */
+	for (sd = leftover; sd; sd = following) {
+		following = sd->next;
+
+		sysfs_set_str(sra, sd, "state", "faulty");
+		sysfs_set_str(sra, sd, "slot", "none");
+		/* for external metadata disks should be removed in mdmon */
+		if (!st->ss->external)
+			sysfs_set_str(sra, sd, "state", "remove");
+		sd->disk.state |= (1<<MD_DISK_REMOVED);
+		sd->disk.state &= ~(1<<MD_DISK_SYNC);
+		sd->next = sra->devs;
+		sra->devs = sd;
+	}
+	return 0;
+}
+
+ /*
+  * reshape_free_fdlist() - close every open descriptor in 'fdlist' and
+  * release both arrays.
+  *
+  * fdlist:  array of 'size' descriptors; negative entries are skipped.
+  * offsets: companion array, only freed here.
+  * size:    number of entries of 'fdlist' to examine.
+  */
+ void reshape_free_fdlist(int *fdlist,
+                          unsigned long long *offsets,
+                          int size)
+ {
+     int idx = 0;
+
+     while (idx < size) {
+         int fd = fdlist[idx++];
+
+         if (fd < 0)
+             continue;
+         close(fd);
+     }
+
+     free(offsets);
+     free(fdlist);
+ }
+
+ /*
+  * reshape_prepare_fdlist() - open the component devices needed for a
+  * reshape and record where each one's data (or backup area) starts.
+  *
+  * devname:     array name, used in error messages only.
+  * sra:         sysfs view of the array; sra->devs is walked.
+  * raid_disks:  in-sync members with a slot below this are opened read-only
+  *              and stored at index 'raid_disk'.
+  * nrdisks:     entries 0..nrdisks (inclusive) of 'fdlist' are reset to -1.
+  * blocks:      size of the backup area, in sectors, used for spare offsets.
+  * backup_file: when NULL, other non-faulty devices are opened read-write
+  *              and stored from index 'raid_disks' upwards.
+  * fdlist:      output array of descriptors.
+  * offsets:     output array of byte offsets, parallel to 'fdlist'.
+  *
+  * Returns the index one past the last spare slot used (raid_disks when no
+  * spare was opened), or -1 if a device could not be opened.  Descriptors
+  * opened before a failure are left in 'fdlist'.
+  */
+ int reshape_prepare_fdlist(char *devname,
+                            struct mdinfo *sra,
+                            int raid_disks,
+                            int nrdisks,
+                            unsigned long blocks,
+                            char *backup_file,
+                            int *fdlist,
+                            unsigned long long *offsets)
+ {
+     struct mdinfo *dev;
+     int next_spare = raid_disks;
+     int i = 0;
+
+     enable_fds(nrdisks);
+     while (i <= nrdisks)
+         fdlist[i++] = -1;
+
+     for (dev = sra->devs; dev != NULL; dev = dev->next) {
+         char *path;
+         int is_member;
+         int idx;
+
+         if (dev->disk.state & (1<<MD_DISK_FAULTY))
+             continue;
+
+         is_member = (dev->disk.state & (1<<MD_DISK_SYNC)) &&
+                     dev->disk.raid_disk < raid_disks;
+         /* Spares are only needed when there is no backup file. */
+         if (!is_member && backup_file != NULL)
+             continue;
+
+         path = map_dev(dev->disk.major, dev->disk.minor, 1);
+         if (is_member) {
+             idx = dev->disk.raid_disk;
+             fdlist[idx] = dev_open(path, O_RDONLY);
+             offsets[idx] = dev->data_offset*512;
+         } else {
+             /* spare */
+             idx = next_spare;
+             fdlist[idx] = dev_open(path, O_RDWR);
+             offsets[idx] = (dev->data_offset + sra->component_size - blocks - 8)*512;
+         }
+
+         if (fdlist[idx] < 0) {
+             pr_err("%s: cannot open component %s\n",
+                    devname, path ? path : "-unknown-");
+             return -1;
+         }
+         if (!is_member)
+             next_spare++;
+     }
+
+     return next_spare;
+ }
+
+ /*
+  * reshape_open_backup_file() - create the backup file and fill it with
+  * zeros up to its working size.
+  *
+  * backup_file: path of the file to create.
+  * fd:          open descriptor of the array being reshaped.
+  * devname:     array name, used in error messages only.
+  * blocks:      number of 512-byte blocks to back up; 8 extra blocks are
+  *              written in front of them.
+  * fdlist:      receives the backup file descriptor in *fdlist.
+  * offsets:     receives the data start offset (8 * 512 bytes) in *offsets.
+  * sys_name:    passed to make_backup() to name the symlink that records
+  *              the backup file location.
+  * restart:     non-zero truncates an existing file; zero requires that the
+  *              file does not exist yet.
+  *
+  * Return 1 on success, 0 on any form of failure.
+  *
+  * When the file cannot be examined, or turns out to live on the array
+  * itself, it is closed here and *fdlist is reset to -1 so that a later
+  * cleanup pass over the descriptor list does not close a stale number.
+  * After a write or fsync failure the descriptor is left open in *fdlist.
+  */
+ int reshape_open_backup_file(char *backup_file,
+                              int fd,
+                              char *devname,
+                              long blocks,
+                              int *fdlist,
+                              unsigned long long *offsets,
+                              char *sys_name,
+                              int restart)
+ {
+     /* need to check backup file is large enough */
+     char buf[512];
+     struct stat stb;
+     dev_t backup_dev;   /* dev_t, not unsigned int: avoids truncation */
+     long i;             /* same width as 'blocks' */
+
+     *fdlist = open(backup_file, O_RDWR|O_CREAT|(restart ? O_TRUNC : O_EXCL),
+                    S_IRUSR | S_IWUSR);
+     *offsets = 8 * 512;
+     if (*fdlist < 0) {
+         pr_err("%s: cannot create backup file %s: %s\n",
+                devname, backup_file, strerror(errno));
+         return 0;
+     }
+     /* Guard against backup file being on array device.
+      * If array is partitioned or if LVM etc is in the
+      * way this will not notice, but it is better than
+      * nothing.
+      *
+      * Both fstat() calls are checked: comparing an unfilled
+      * 'struct stat' would make the guard meaningless.
+      */
+     if (fstat(*fdlist, &stb) != 0) {
+         pr_err("%s: cannot stat backup file %s: %s\n",
+                devname, backup_file, strerror(errno));
+         close(*fdlist);
+         *fdlist = -1;
+         return 0;
+     }
+     backup_dev = stb.st_dev;
+     if (fstat(fd, &stb) != 0) {
+         pr_err("%s: cannot stat array device: %s\n",
+                devname, strerror(errno));
+         close(*fdlist);
+         *fdlist = -1;
+         return 0;
+     }
+     if (stb.st_rdev == backup_dev) {
+         pr_err("backup file must NOT be on the array being reshaped.\n");
+         close(*fdlist);
+         *fdlist = -1;
+         return 0;
+     }
+
+     /* Pre-allocate the file: 8 header blocks plus the backup area. */
+     memset(buf, 0, 512);
+     for (i = 0; i < blocks + 8; i++) {
+         if (write(*fdlist, buf, 512) != 512) {
+             pr_err("%s: cannot create backup file %s: %s\n",
+                    devname, backup_file, strerror(errno));
+             return 0;
+         }
+     }
+     if (fsync(*fdlist) != 0) {
+         pr_err("%s: cannot create backup file %s: %s\n",
+                devname, backup_file, strerror(errno));
+         return 0;
+     }
+
+     /* Record where the backup lives, unless it is already under MAP_DIR
+      * or this is a restart.  Failure to record is reported but not fatal.
+      */
+     if (!restart && strncmp(backup_file, MAP_DIR, strlen(MAP_DIR)) != 0) {
+         char *bu = make_backup(sys_name);
+         if (symlink(backup_file, bu))
+             pr_err("Recording backup file in " MAP_DIR " failed: %s\n",
+                    strerror(errno));
+         free(bu);
+     }
+
+     return 1;
+ }
+
+ /*
+  * compute_backup_blocks() - size of the unit that must be backed up.
+  *
+  * The unit has to be a whole number of old stripes and a whole number of
+  * new stripes, i.e. the least common multiple of
+  * (chunk sectors * data disks) for the old and the new geometry.
+  *
+  * nchunk, ochunk: new and old chunk size in bytes (divided by 512 here).
+  * ndata, odata:   new and old number of data disks.
+  *
+  * Returns the LCM in 512-byte sectors.
+  */
+ unsigned long compute_backup_blocks(int nchunk, int ochunk,
+                                     unsigned int ndata, unsigned int odata)
+ {
+     unsigned long old_stripe = (ochunk/512) * odata;
+     unsigned long new_stripe = (nchunk/512) * ndata;
+     unsigned long divisor = GCD(old_stripe, new_stripe);
+
+     /* LCM == product / GCD */
+     return (unsigned long)(ochunk/512) * (unsigned long)(nchunk/512) *
+            odata * ndata / divisor;
+ }
+
+ /*
+ * analyse_change() - decide whether a requested reshape is possible and
+ * describe the work in 're'.
+ *
+ * devname: array name; only used in the chunk-size diagnostic below.
+ * info: current state (info->array, info->component_size, info->space_after)
+ * and the request (info->new_level, new_chunk, new_layout, delta_disks).
+ * This function writes back into 'info': request fields that are unset
+ * get filled in, and some paths also change info->array.layout and
+ * info->array.chunk_size.
+ * re: zeroed on entry, then filled with the intermediate level, the
+ * before/after geometry and, for a restripe, parity, backup_blocks,
+ * min_offset_change and new_size.
+ *
+ * Returns NULL when the change can be achieved, otherwise a message saying
+ * why it is rejected. The empty string "" is returned when the explanation
+ * has already been printed with pr_err().
+ */
+ char *analyse_change(char *devname, struct mdinfo *info, struct reshape *re)
+ {
+ /* Based on the current array state in info->array and
+ * the changes in info->new_* etc, determine:
+ * - whether the change is possible
+ * - Intermediate level/raid_disks/layout
+ * - whether a restriping reshape is needed
+ * - number of sectors in minimum change unit. This
+ * will cover a whole number of stripes in 'before' and
+ * 'after'.
+ *
+ * Return message if the change should be rejected
+ * NULL if the change can be achieved
+ *
+ * This can be called as part of starting a reshape, or
+ * when assembling an array that is undergoing reshape.
+ */
+ int near, far, offset, copies;
+ int new_disks;
+ int old_chunk, new_chunk;
+ /* delta_parity records change in number of devices
+ * caused by level change
+ */
+ int delta_parity = 0;
+
+ memset(re, 0, sizeof(*re));
+
+ /* If a new level not explicitly given, we assume no-change */
+ if (info->new_level == UnSet)
+ info->new_level = info->array.level;
+
+ if (info->new_chunk)
+ switch (info->new_level) {
+ case 0:
+ case 4:
+ case 5:
+ case 6:
+ case 10:
+ /* chunk size is meaningful, must divide component_size
+ * evenly
+ */
+ if (info->component_size % (info->new_chunk/512)) {
+ unsigned long long shrink = info->component_size;
+ shrink &= ~(unsigned long long)(info->new_chunk/512-1);
+ pr_err("New chunk size (%dK) does not evenly divide device size (%lluk)\n",
+ info->new_chunk/1024, info->component_size/2);
+ pr_err("After shrinking any filesystem, \"mdadm --grow %s --size %llu\"\n",
+ devname, shrink/2);
+ pr_err("will shrink the array so the given chunk size would work.\n");
+ /* reason already printed above, so return an empty message */
+ return "";
+ }
+ break;
+ default:
+ return "chunk size not meaningful for this level";
+ }
+ else
+ info->new_chunk = info->array.chunk_size;
+
+ /* First switch: keyed on the CURRENT level. Each case either returns
+ * (accept or reject) or sets re->level and re->before.* and breaks
+ * out to the restripe handling after the switch.
+ */
+ switch (info->array.level) {
+ default:
+ return "No reshape is possibly for this RAID level";
+ case LEVEL_LINEAR:
+ if (info->delta_disks != UnSet)
+ return "Only --add is supported for LINEAR, setting --raid-disks is not needed";
+ else
+ return "Only --add is supported for LINEAR, other --grow options are not meaningful";
+ case 1:
+ /* RAID1 can convert to RAID1 with different disks, or
+ * raid5 with 2 disks, or
+ * raid0 with 1 disk
+ */
+ if (info->new_level > 1 && (info->component_size & 7))
+ return "Cannot convert RAID1 of this size - reduce size to multiple of 4K first.";
+ if (info->new_level == 0) {
+ if (info->delta_disks != UnSet &&
+ info->delta_disks != 0)
+ return "Cannot change number of disks with RAID1->RAID0 conversion";
+ re->level = 0;
+ re->before.data_disks = 1;
+ re->after.data_disks = 1;
+ return NULL;
+ }
+ if (info->new_level == 1) {
+ if (info->delta_disks == UnSet)
+ /* Don't know what to do */
+ return "no change requested for Growing RAID1";
+ re->level = 1;
+ return NULL;
+ }
+ if (info->array.raid_disks != 2 && info->new_level == 5)
+ return "Can only convert a 2-device array to RAID5";
+ if (info->array.raid_disks == 2 && info->new_level == 5) {
+ re->level = 5;
+ re->before.data_disks = 1;
+ if (info->delta_disks != UnSet &&
+ info->delta_disks != 0)
+ re->after.data_disks = 1 + info->delta_disks;
+ else
+ re->after.data_disks = 1;
+ if (re->after.data_disks < 1)
+ return "Number of disks too small for RAID5";
+
+ re->before.layout = ALGORITHM_LEFT_SYMMETRIC;
+ /* NOTE(review): the current chunk size is overwritten with 64K
+ * here - presumably because RAID1 carries no chunk size of
+ * its own; confirm.
+ */
+ info->array.chunk_size = 65536;
+ break;
+ }
+ /* Could do some multi-stage conversions, but leave that to
+ * later.
+ */
+ return "Impossibly level change request for RAID1";
+
+ case 10:
+ /* RAID10 can be converted from near mode to
+ * RAID0 by removing some devices.
+ * It can also be reshaped if the kernel supports
+ * new_data_offset.
+ */
+ switch (info->new_level) {
+ case 0:
+ if ((info->array.layout & ~0xff) != 0x100)
+ return "Cannot Grow RAID10 with far/offset layout";
+ /*
+ * number of devices must be multiple of
+ * number of copies
+ */
+ if (info->array.raid_disks %
+ (info->array.layout & 0xff))
+ return "RAID10 layout too complex for Grow operation";
+
+ new_disks = (info->array.raid_disks /
+ (info->array.layout & 0xff));
+ if (info->delta_disks == UnSet)
+ info->delta_disks = (new_disks
+ - info->array.raid_disks);
+
+ if (info->delta_disks !=
+ new_disks - info->array.raid_disks)
+ return "New number of raid-devices impossible for RAID10";
+ if (info->new_chunk &&
+ info->new_chunk != info->array.chunk_size)
+ return "Cannot change chunk-size with RAID10 Grow";
+
+ /* looks good */
+ re->level = 0;
+ re->before.data_disks = new_disks;
+ re->after.data_disks = re->before.data_disks;
+ return NULL;
+
+ case 10:
+ near = info->array.layout & 0xff;
+ far = (info->array.layout >> 8) & 0xff;
+ offset = info->array.layout & 0x10000;
+ if (far > 1 && !offset)
+ return "Cannot reshape RAID10 in far-mode";
+ copies = near * far;
+
+ old_chunk = info->array.chunk_size * far;
+
+ if (info->new_layout == UnSet)
+ info->new_layout = info->array.layout;
+ else {
+ near = info->new_layout & 0xff;
+ far = (info->new_layout >> 8) & 0xff;
+ offset = info->new_layout & 0x10000;
+ if (far > 1 && !offset)
+ return "Cannot reshape RAID10 to far-mode";
+ if (near * far != copies)
+ return "Cannot change number of copies when reshaping RAID10";
+ }
+ if (info->delta_disks == UnSet)
+ info->delta_disks = 0;
+ new_disks = (info->array.raid_disks +
+ info->delta_disks);
+
+ new_chunk = info->new_chunk * far;
+
+ re->level = 10;
+ re->before.layout = info->array.layout;
+ re->before.data_disks = info->array.raid_disks;
+ re->after.layout = info->new_layout;
+ re->after.data_disks = new_disks;
+ /* For RAID10 we don't do backup but do allow reshape,
+ * so set backup_blocks to INVALID_SECTORS rather than
+ * zero.
+ * And there is no need to synchronise stripes on both
+ * 'old' and 'new'. So the important
+ * number is the minimum data_offset difference
+ * which is the larger of (offset copies * chunk).
+ */
+ re->backup_blocks = INVALID_SECTORS;
+ re->min_offset_change = max(old_chunk, new_chunk) / 512;
+ if (new_disks < re->before.data_disks &&
+ info->space_after < re->min_offset_change)
+ /* Reduce component size by one chunk */
+ re->new_size = (info->component_size -
+ re->min_offset_change);
+ else
+ re->new_size = info->component_size;
+ re->new_size = re->new_size * new_disks / copies;
+ return NULL;
+
+ default:
+ return "RAID10 can only be changed to RAID0";
+ }
+ case 0:
+ /* RAID0 can be converted to RAID10, or to RAID456 */
+ if (info->new_level == 10) {
+ if (info->new_layout == UnSet &&
+ info->delta_disks == UnSet) {
+ /* Assume near=2 layout */
+ info->new_layout = 0x102;
+ info->delta_disks = info->array.raid_disks;
+ }
+ if (info->new_layout == UnSet) {
+ int copies = 1 + (info->delta_disks
+ / info->array.raid_disks);
+ if (info->array.raid_disks * (copies-1) !=
+ info->delta_disks)
+ return "Impossible number of devices for RAID0->RAID10";
+ info->new_layout = 0x100 + copies;
+ }
+ if (info->delta_disks == UnSet) {
+ int copies = info->new_layout & 0xff;
+ if (info->new_layout != 0x100 + copies)
+ return "New layout impossible for RAID0->RAID10";;
+ info->delta_disks = (copies - 1) *
+ info->array.raid_disks;
+ }
+ if (info->new_chunk &&
+ info->new_chunk != info->array.chunk_size)
+ return "Cannot change chunk-size with RAID0->RAID10";
+ /* looks good */
+ re->level = 10;
+ re->before.data_disks = (info->array.raid_disks +
+ info->delta_disks);
+ re->after.data_disks = re->before.data_disks;
+ re->before.layout = info->new_layout;
+ return NULL;
+ }
+
+ /* RAID0 can also covert to RAID0/4/5/6 by first converting to
+ * a raid4 style layout of the final level.
+ */
+ switch (info->new_level) {
+ case 4:
+ delta_parity = 1;
+ /* fall through: RAID0->RAID4 shares the setup of RAID0->RAID0 */
+ case 0:
+ re->level = 4;
+ re->before.layout = 0;
+ break;
+ case 5:
+ delta_parity = 1;
+ re->level = 5;
+ re->before.layout = ALGORITHM_PARITY_N;
+ if (info->new_layout == UnSet)
+ info->new_layout = map_name(r5layout, "default");
+ break;
+ case 6:
+ delta_parity = 2;
+ re->level = 6;
+ re->before.layout = ALGORITHM_PARITY_N;
+ if (info->new_layout == UnSet)
+ info->new_layout = map_name(r6layout, "default");
+ break;
+ default:
+ return "Impossible level change requested";
+ }
+ re->before.data_disks = info->array.raid_disks;
+ /* determining 'after' layout happens outside this 'switch' */
+ break;
+
+ case 4:
+ info->array.layout = ALGORITHM_PARITY_N;
+ /* fall through: RAID4 is handled as RAID5 with the PARITY_N layout */
+ case 5:
+ switch (info->new_level) {
+ case 0:
+ delta_parity = -1;
+ /* fall through: same 'before' setup as conversion to RAID4 */
+ case 4:
+ re->level = info->array.level;
+ re->before.data_disks = info->array.raid_disks - 1;
+ re->before.layout = info->array.layout;
+ break;
+ case 5:
+ re->level = 5;
+ re->before.data_disks = info->array.raid_disks - 1;
+ re->before.layout = info->array.layout;
+ break;
+ case 6:
+ delta_parity = 1;
+ re->level = 6;
+ re->before.data_disks = info->array.raid_disks - 1;
+ /* map the RAID5 layout to its '_6' RAID6 counterpart */
+ switch (info->array.layout) {
+ case ALGORITHM_LEFT_ASYMMETRIC:
+ re->before.layout = ALGORITHM_LEFT_ASYMMETRIC_6;
+ break;
+ case ALGORITHM_RIGHT_ASYMMETRIC:
+ re->before.layout = ALGORITHM_RIGHT_ASYMMETRIC_6;
+ break;
+ case ALGORITHM_LEFT_SYMMETRIC:
+ re->before.layout = ALGORITHM_LEFT_SYMMETRIC_6;
+ break;
+ case ALGORITHM_RIGHT_SYMMETRIC:
+ re->before.layout = ALGORITHM_RIGHT_SYMMETRIC_6;
+ break;
+ case ALGORITHM_PARITY_0:
+ re->before.layout = ALGORITHM_PARITY_0_6;
+ break;
+ case ALGORITHM_PARITY_N:
+ re->before.layout = ALGORITHM_PARITY_N_6;
+ break;
+ default:
+ return "Cannot convert an array with this layout";
+ }
+ break;
+ case 1:
+ if (info->array.raid_disks != 2)
+ return "Can only convert a 2-device array to RAID1";
+ if (info->delta_disks != UnSet &&
+ info->delta_disks != 0)
+ return "Cannot set raid_disk when converting RAID5->RAID1";
+ re->level = 1;
+ info->new_chunk = 0;
+ return NULL;
+ default:
+ return "Impossible level change requested";
+ }
+ break;
+ case 6:
+ switch (info->new_level) {
+ case 4:
+ case 5:
+ delta_parity = -1;
+ /* fall through: same 'before' setup as RAID6->RAID6 */
+ case 6:
+ re->level = 6;
+ re->before.data_disks = info->array.raid_disks - 2;
+ re->before.layout = info->array.layout;
+ break;
+ default:
+ return "Impossible level change requested";
+ }
+ break;
+ }
+
+ /* If we reached here then it looks like a re-stripe is
+ * happening. We have determined the intermediate level
+ * and initial raid_disks/layout and stored these in 're'.
+ *
+ * We need to deduce the final layout that can be atomically
+ * converted to the end state.
+ */
+ switch (info->new_level) {
+ case 0:
+ /* We can only get to RAID0 from RAID4 or RAID5
+ * with appropriate layout and one extra device
+ */
+ if (re->level != 4 && re->level != 5)
+ return "Cannot covert to RAID0 from this level";
+
+ switch (re->level) {
+ case 4:
+ re->before.layout = 0;
+ re->after.layout = 0;
+ break;
+ case 5:
+ re->after.layout = ALGORITHM_PARITY_N;
+ break;
+ }
+ break;
+
+ case 4:
+ /* We can only get to RAID4 from RAID5 */
+ if (re->level != 4 && re->level != 5)
+ return "Cannot convert to RAID4 from this level";
+
+ switch (re->level) {
+ case 4:
+ re->after.layout = 0;
+ break;
+ case 5:
+ re->after.layout = ALGORITHM_PARITY_N;
+ break;
+ }
+ break;
+
+ case 5:
+ /* We get to RAID5 from RAID5 or RAID6 */
+ if (re->level != 5 && re->level != 6)
+ return "Cannot convert to RAID5 from this level";
+
+ switch (re->level) {
+ case 5:
+ if (info->new_layout == UnSet)
+ re->after.layout = re->before.layout;
+ else
+ re->after.layout = info->new_layout;
+ break;
+ case 6:
+ if (info->new_layout == UnSet)
+ info->new_layout = re->before.layout;
+
+ /* after.layout needs to be raid6 version of new_layout */
+ if (info->new_layout == ALGORITHM_PARITY_N)
+ re->after.layout = ALGORITHM_PARITY_N;
+ else {
+ char layout[40];
+ char *ls = map_num(r5layout, info->new_layout);
+ int l;
+ if (ls) {
+ /* Current RAID6 layout has a RAID5
+ * equivalent - good
+ */
+ strcat(strcpy(layout, ls), "-6");
+ l = map_name(r6layout, layout);
+ if (l == UnSet)
+ return "Cannot find RAID6 layout to convert to";
+ } else {
+ /* Current RAID6 has no equivalent.
+ * If it is already a '-6' layout we
+ * can leave it unchanged, else we must
+ * fail
+ */
+ ls = map_num(r6layout,
+ info->new_layout);
+ if (!ls ||
+ strcmp(ls+strlen(ls)-2, "-6") != 0)
+ return "Please specify new layout";
+ l = info->new_layout;
+ }
+ re->after.layout = l;
+ }
+ }
+ break;
+
+ case 6:
+ /* We must already be at level 6 */
+ if (re->level != 6)
+ return "Impossible level change";
+ if (info->new_layout == UnSet)
+ re->after.layout = info->array.layout;
+ else
+ re->after.layout = info->new_layout;
+ break;
+ default:
+ return "Impossible level change requested";
+ }
+ /* With no explicit device count, the only change in devices is the
+ * one implied by the level change.
+ */
+ if (info->delta_disks == UnSet)
+ info->delta_disks = delta_parity;
+
+ re->after.data_disks =
+ (re->before.data_disks + info->delta_disks - delta_parity);
+
+ switch (re->level) {
+ case 6:
+ re->parity = 2;
+ break;
+ case 4:
+ case 5:
+ re->parity = 1;
+ break;
+ default:
+ re->parity = 0;
+ break;
+ }
+ /* So we have a restripe operation, we need to calculate the number
+ * of blocks per reshape operation.
+ */
+ re->new_size = info->component_size * re->before.data_disks;
+ if (info->new_chunk == 0)
+ info->new_chunk = info->array.chunk_size;
+ if (re->after.data_disks == re->before.data_disks &&
+ re->after.layout == re->before.layout &&
+ info->new_chunk == info->array.chunk_size) {
+ /* Nothing to change, can change level immediately. */
+ re->level = info->new_level;
+ re->backup_blocks = 0;
+ return NULL;
+ }
+ if (re->after.data_disks == 1 && re->before.data_disks == 1) {
+ /* chunk and layout changes make no difference */
+ re->level = info->new_level;
+ re->backup_blocks = 0;
+ return NULL;
+ }
+
+ if (re->after.data_disks == re->before.data_disks &&
+ get_linux_version() < 2006032)
+ return "in-place reshape is not safe before 2.6.32 - sorry.";
+
+ if (re->after.data_disks < re->before.data_disks &&
+ get_linux_version() < 2006030)
+ return "reshape to fewer devices is not supported before 2.6.30 - sorry.";
+
+ re->backup_blocks = compute_backup_blocks(
+ info->new_chunk, info->array.chunk_size,
+ re->after.data_disks, re->before.data_disks);
+ re->min_offset_change = re->backup_blocks / re->before.data_disks;
+
+ re->new_size = info->component_size * re->after.data_disks;
+ return NULL;
+ }
+
+ /*
+  * set_array_size() - grow the sysfs "array_size" of an externally managed
+  * array to the size recorded in its metadata.
+  *
+  * st:           metadata handler; st->ss->container_content() supplies the
+  *               metadata view of the subarray.
+  * sra:          sysfs view of the array.
+  * text_version: metadata version string; when NULL, sra->text_version is
+  *               used.  The subarray name is taken to be the text after the
+  *               first '/' found from the second character onwards
+  *               (presumably the string has the form
+  *               "/<container>/<subarray>" - confirm against callers).
+  *
+  * Returns 0 only when the current array_size could be read, the metadata
+  * size is larger, and the new value was written.  Returns -1 in every
+  * other case, including a NULL argument, a text_version without a
+  * subarray part, and a size that does not need to grow.
+  */
+ static int set_array_size(struct supertype *st, struct mdinfo *sra,
+                           char *text_version)
+ {
+     struct mdinfo *info;
+     char *sep;
+     char *subarray;
+     int ret_val = -1;
+
+     if ((st == NULL) || (sra == NULL))
+         return ret_val;
+
+     if (text_version == NULL)
+         text_version = sra->text_version;
+
+     /* Do not look past the terminator of an empty string, and do not
+      * build a pointer from a failed strchr().
+      */
+     if (text_version[0] == '\0')
+         sep = NULL;
+     else
+         sep = strchr(text_version + 1, '/');
+     if (sep == NULL) {
+         dprintf("Error: set_array_size(): no subarray in text_version\n");
+         return ret_val;
+     }
+     subarray = sep + 1;
+
+     info = st->ss->container_content(st, subarray);
+     if (info) {
+         unsigned long long current_size = 0;
+         /* custom_array_size is halved before being written to
+          * array_size (presumably sectors -> KiB - confirm)
+          */
+         unsigned long long new_size = info->custom_array_size/2;
+
+         /* Only ever grow the exported size here. */
+         if (sysfs_get_ll(sra, NULL, "array_size", &current_size) == 0 &&
+             new_size > current_size) {
+             if (sysfs_set_num(sra, NULL, "array_size", new_size)
+                 < 0)
+                 dprintf("Error: Cannot set array size");
+             else {
+                 ret_val = 0;
+                 dprintf("Array size changed");
+             }
+             dprintf_cont(" from %llu to %llu.\n",
+                          current_size, new_size);
+         }
+         sysfs_free(info);
+     } else
+         dprintf("Error: set_array_size(): info pointer in NULL\n");
+
+     return ret_val;
+ }
+
+ /*
+  * Forward declarations of two file-local helpers, so that they can be
+  * referenced before their definitions.
+  */
+ static int reshape_array(char *container,
+                          int fd,
+                          char *devname,
+                          struct supertype *st,
+                          struct mdinfo *info,
+                          int force,
+                          struct mddev_dev *devlist,
+                          unsigned long long data_offset,
+                          char *backup_file,
+                          int verbose,
+                          int forked,
+                          int restart,
+                          int freeze_reshape);
+
+ static int reshape_container(char *container,
+                              char *devname,
+                              int mdfd,
+                              struct supertype *st,
+                              struct mdinfo *info,
+                              int force,
+                              char *backup_file,
+                              int verbose,
+                              int forked,
+                              int restart,
+                              int freeze_reshape);
+
+int Grow_reshape(char *devname, int fd,
+ struct mddev_dev *devlist,
+ unsigned long long data_offset,
+ struct context *c, struct shape *s)
+{
+ /* Make some changes in the shape of an array.
+ * The kernel must support the change.
+ *
+ * There are three different changes. Each can trigger
+ * a resync or recovery so we freeze that until we have
+ * requested everything (if kernel supports freezing - 2.6.30).
+ * The steps are:
+ * - change size (i.e. component_size)
+ * - change level
+ * - change layout/chunksize/ndisks
+ *
+ * The last can require a reshape. It is different on different
+ * levels so we need to check the level before actioning it.
+ * Some times the level change needs to be requested after the
+ * reshape (e.g. raid6->raid5, raid5->raid0)
+ *
+ */
+ struct mdu_array_info_s array;
+ int rv = 0;
+ struct supertype *st;
+ char *subarray = NULL;
+
+ int frozen;
+ int changed = 0;
+ char *container = NULL;
+ int cfd = -1;
+
+ struct mddev_dev *dv;
+ int added_disks;
+
+ struct mdinfo info;
+ struct mdinfo *sra;
+
+ if (md_get_array_info(fd, &array) < 0) {
+ pr_err("%s is not an active md array - aborting\n",
+ devname);
+ return 1;
+ }
+ if (s->level != UnSet && s->chunk) {
+ pr_err("Cannot change array level in the same operation as changing chunk size.\n");
+ return 1;
+ }
+
+ if (data_offset != INVALID_SECTORS && array.level != 10 &&
+ (array.level < 4 || array.level > 6)) {
+ pr_err("--grow --data-offset not yet supported\n");
+ return 1;
+ }
+
+ if (s->size > 0 &&
+ (s->chunk || s->level!= UnSet || s->layout_str || s->raiddisks)) {
+ pr_err("cannot change component size at the same time as other changes.\n"
+ " Change size first, then check data is intact before making other changes.\n");
+ return 1;
+ }
+
+ if (s->raiddisks && s->raiddisks < array.raid_disks &&
+ array.level > 1 && get_linux_version() < 2006032 &&
+ !check_env("MDADM_FORCE_FEWER")) {
+ pr_err("reducing the number of devices is not safe before Linux 2.6.32\n"
+ " Please use a newer kernel\n");
+ return 1;
+ }
+
+ if (array.level > 1 && s->size > 1 &&
+ (unsigned long long) (array.chunk_size / 1024) > s->size) {
+ pr_err("component size must be larger than chunk size.\n");
+ return 1;
+ }
+
+ st = super_by_fd(fd, &subarray);
+ if (!st) {
+ pr_err("Unable to determine metadata format for %s\n", devname);
+ return 1;
+ }
+ if (s->raiddisks > st->max_devs) {
+ pr_err("Cannot increase raid-disks on this array beyond %d\n", st->max_devs);
+ return 1;
+ }
+ if (s->level == 0 && (array.state & (1 << MD_SB_BITMAP_PRESENT)) &&
+ !(array.state & (1 << MD_SB_CLUSTERED)) && !st->ss->external) {
+ array.state &= ~(1 << MD_SB_BITMAP_PRESENT);
+ if (md_set_array_info(fd, &array) != 0) {
+ pr_err("failed to remove internal bitmap.\n");
+ return 1;
+ }
+ }
+
+ /* in the external case we need to check that the requested reshape is
+ * supported, and perform an initial check that the container holds the
+ * pre-requisite spare devices (mdmon owns final validation)
+ */
+ if (st->ss->external) {
+ int retval;
+
+ if (subarray) {
+ container = st->container_devnm;
+ cfd = open_dev_excl(st->container_devnm);
+ } else {
+ container = st->devnm;
+ close(fd);
+ cfd = open_dev_excl(st->devnm);
+ fd = cfd;
+ }
+ if (cfd < 0) {
+ pr_err("Unable to open container for %s\n", devname);
+ free(subarray);
+ return 1;
+ }
+
+ retval = st->ss->load_container(st, cfd, NULL);
+
+ if (retval) {
+ pr_err("Cannot read superblock for %s\n", devname);
+ free(subarray);
+ return 1;
+ }
+
+ /* check if operation is supported for metadata handler */
+ if (st->ss->container_content) {
+ struct mdinfo *cc = NULL;
+ struct mdinfo *content = NULL;
+
+ cc = st->ss->container_content(st, subarray);
+ for (content = cc; content ; content = content->next) {
+ int allow_reshape = 1;
+
+ /* check if reshape is allowed based on metadata
+ * indications stored in content->array.state
+ */
+ if (content->array.state &
+ (1 << MD_SB_BLOCK_VOLUME))
+ allow_reshape = 0;
+ if (content->array.state &
+ (1 << MD_SB_BLOCK_CONTAINER_RESHAPE))
+ allow_reshape = 0;
+ if (!allow_reshape) {
+ pr_err("cannot reshape arrays in container with unsupported metadata: %s(%s)\n",
+ devname, container);
+ sysfs_free(cc);
+ free(subarray);
+ return 1;
+ }
+ if (content->consistency_policy ==
+ CONSISTENCY_POLICY_PPL) {
+ pr_err("Operation not supported when ppl consistency policy is enabled\n");
+ sysfs_free(cc);
+ free(subarray);
+ return 1;
+ }
+ if (content->consistency_policy ==
+ CONSISTENCY_POLICY_BITMAP) {
+ pr_err("Operation not supported when write-intent bitmap is enabled\n");
+ sysfs_free(cc);
+ free(subarray);
+ return 1;
+ }
+ }
+ sysfs_free(cc);
+ }
+ if (mdmon_running(container))
+ st->update_tail = &st->updates;
+ }
+
+ added_disks = 0;
+ for (dv = devlist; dv; dv = dv->next)
+ added_disks++;
+ if (s->raiddisks > array.raid_disks &&
+ array.spare_disks + added_disks <
+ (s->raiddisks - array.raid_disks) &&
+ !c->force) {
+ pr_err("Need %d spare%s to avoid degraded array, and only have %d.\n"
+ " Use --force to over-ride this check.\n",
+ s->raiddisks - array.raid_disks,
+ s->raiddisks - array.raid_disks == 1 ? "" : "s",
+ array.spare_disks + added_disks);
+ return 1;
+ }
+
+ sra = sysfs_read(fd, NULL, GET_LEVEL | GET_DISKS | GET_DEVS |
+ GET_STATE | GET_VERSION);
+ if (sra) {
+ if (st->ss->external && subarray == NULL) {
+ array.level = LEVEL_CONTAINER;
+ sra->array.level = LEVEL_CONTAINER;
+ }
+ } else {
+ pr_err("failed to read sysfs parameters for %s\n",
+ devname);
+ return 1;
+ }
+ frozen = freeze(st);
+ if (frozen < -1) {
+ /* freeze() already spewed the reason */
+ sysfs_free(sra);
+ return 1;
+ } else if (frozen < 0) {
+ pr_err("%s is performing resync/recovery and cannot be reshaped\n", devname);
+ sysfs_free(sra);
+ return 1;
+ }
+
+ /* ========= set size =============== */
+ if (s->size > 0 &&
+ (s->size == MAX_SIZE || s->size != (unsigned)array.size)) {
+ unsigned long long orig_size = get_component_size(fd)/2;
+ unsigned long long min_csize;
+ struct mdinfo *mdi;
+ int raid0_takeover = 0;
+
+ if (orig_size == 0)
+ orig_size = (unsigned) array.size;
+
+ if (orig_size == 0) {
+ pr_err("Cannot set device size in this type of array.\n");
+ rv = 1;
+ goto release;
+ }
+
+ if (reshape_super(st, s->size, UnSet, UnSet, 0, 0, UnSet, NULL,
+ devname, APPLY_METADATA_CHANGES,
+ c->verbose > 0)) {
+ rv = 1;
+ goto release;
+ }
+ sync_metadata(st);
+ if (st->ss->external) {
+ /* metadata can have size limitation
+ * update size value according to metadata information
+ */
+ struct mdinfo *sizeinfo =
+ st->ss->container_content(st, subarray);
+ if (sizeinfo) {
+ unsigned long long new_size =
+ sizeinfo->custom_array_size/2;
+ int data_disks = get_data_disks(
+ sizeinfo->array.level,
+ sizeinfo->array.layout,
+ sizeinfo->array.raid_disks);
+ new_size /= data_disks;
+ dprintf("Metadata size correction from %llu to %llu (%llu)\n",
+ orig_size, new_size,
+ new_size * data_disks);
+ s->size = new_size;
+ sysfs_free(sizeinfo);
+ }
+ }
+
+ /* Update the size of each member device in case
+ * they have been resized. This will never reduce
+ * below the current used-size. The "size" attribute
+ * understands '0' to mean 'max'.
+ */
+ min_csize = 0;
+ for (mdi = sra->devs; mdi; mdi = mdi->next) {
+ sysfs_set_num(sra, mdi, "size",
+ s->size == MAX_SIZE ? 0 : s->size);
+ if (array.not_persistent == 0 &&
+ array.major_version == 0 &&
+ get_linux_version() < 3001000) {
+ /* Dangerous to allow size to exceed 2TB */
+ unsigned long long csize;
+ if (sysfs_get_ll(sra, mdi, "size",
+ &csize) == 0) {
+ if (csize >= 2ULL*1024*1024*1024)
+ csize = 2ULL*1024*1024*1024;
+ if ((min_csize == 0 ||
+ (min_csize > csize)))
+ min_csize = csize;
+ }
+ }
+ }
+ if (min_csize && s->size > min_csize) {
+ pr_err("Cannot safely make this array use more than 2TB per device on this kernel.\n");
+ rv = 1;
+ goto size_change_error;
+ }
+ if (min_csize && s->size == MAX_SIZE) {
+ /* Don't let the kernel choose a size - it will get
+ * it wrong
+ */
+ pr_err("Limited v0.90 array to 2TB per device\n");
+ s->size = min_csize;
+ }
+ if (st->ss->external) {
+ if (sra->array.level == 0) {
+ rv = sysfs_set_str(sra, NULL, "level", "raid5");
+ if (!rv) {
+ raid0_takeover = 1;
+ /* get array parameters after takeover
+ * to change one parameter at time only
+ */
+ rv = md_get_array_info(fd, &array);
+ }
+ }
+ /* make sure mdmon is
+ * aware of the new level */
+ if (!mdmon_running(st->container_devnm))
+ start_mdmon(st->container_devnm);
+ ping_monitor(container);
+ if (mdmon_running(st->container_devnm) &&
+ st->update_tail == NULL)
+ st->update_tail = &st->updates;
+ }
+
+ if (s->size == MAX_SIZE)
+ s->size = 0;
+ array.size = s->size;
+ if (s->size & ~INT32_MAX) {
+ /* got truncated to 32bit, write to
+ * component_size instead
+ */
+ if (sra)
+ rv = sysfs_set_num(sra, NULL,
+ "component_size", s->size);
+ else
+ rv = -1;
+ } else {
+ rv = md_set_array_info(fd, &array);
+
+ /* manage array size when it is managed externally
+ */
+ if ((rv == 0) && st->ss->external)
+ rv = set_array_size(st, sra, sra->text_version);
+ }
+
+ if (raid0_takeover) {
+ /* do not resync non-existing parity,
+ * we will drop it anyway
+ */
+ sysfs_set_str(sra, NULL, "sync_action", "frozen");
+ /* go back to raid0, drop parity disk
+ */
+ sysfs_set_str(sra, NULL, "level", "raid0");
+ md_get_array_info(fd, &array);
+ }
+
+size_change_error:
+ if (rv != 0) {
+ int err = errno;
+
+ /* restore metadata */
+ if (reshape_super(st, orig_size, UnSet, UnSet, 0, 0,
+ UnSet, NULL, devname,
+ ROLLBACK_METADATA_CHANGES,
+ c->verbose) == 0)
+ sync_metadata(st);
+ pr_err("Cannot set device size for %s: %s\n",
+ devname, strerror(err));
+ if (err == EBUSY &&
+ (array.state & (1<<MD_SB_BITMAP_PRESENT)))
+ cont_err("Bitmap must be removed before size can be changed\n");
+ rv = 1;
+ goto release;
+ }
+ if (s->assume_clean) {
+ /* This will fail on kernels older than 3.0 unless
+ * a backport has been arranged.
+ */
+ if (sra == NULL ||
+ sysfs_set_str(sra, NULL, "resync_start",
+ "none") < 0)
+ pr_err("--assume-clean not supported with --grow on this kernel\n");
+ }
+ md_get_array_info(fd, &array);
+ s->size = get_component_size(fd)/2;
+ if (s->size == 0)
+ s->size = array.size;
+ if (c->verbose >= 0) {
+ if (s->size == orig_size)
+ pr_err("component size of %s unchanged at %lluK\n",
+ devname, s->size);
+ else
+ pr_err("component size of %s has been set to %lluK\n",
+ devname, s->size);
+ }
+ changed = 1;
+ } else if (array.level != LEVEL_CONTAINER) {
+ s->size = get_component_size(fd)/2;
+ if (s->size == 0)
+ s->size = array.size;
+ }
+
+ /* See if there is anything else to do */
+ if ((s->level == UnSet || s->level == array.level) &&
+ (s->layout_str == NULL) &&
+ (s->chunk == 0 || s->chunk == array.chunk_size) &&
+ data_offset == INVALID_SECTORS &&
+ (s->raiddisks == 0 || s->raiddisks == array.raid_disks)) {
+ /* Nothing more to do */
+ if (!changed && c->verbose >= 0)
+ pr_err("%s: no change requested\n", devname);
+ goto release;
+ }
+
+ /* ========= check for Raid10/Raid1 -> Raid0 conversion ===============
+ * current implementation assumes that following conditions must be met:
+ * - RAID10:
+ * - far_copies == 1
+ * - near_copies == 2
+ */
+ if ((s->level == 0 && array.level == 10 && sra &&
+ array.layout == ((1 << 8) + 2) && !(array.raid_disks & 1)) ||
+ (s->level == 0 && array.level == 1 && sra)) {
+ int err;
+
+ err = remove_disks_for_takeover(st, sra, array.layout);
+ if (err) {
+ dprintf("Array cannot be reshaped\n");
+ if (cfd > -1)
+ close(cfd);
+ rv = 1;
+ goto release;
+ }
+ /* Make sure mdmon has seen the device removal
+ * and updated metadata before we continue with
+ * level change
+ */
+ if (container)
+ ping_monitor(container);
+ }
+
+ memset(&info, 0, sizeof(info));
+ info.array = array;
+ if (sysfs_init(&info, fd, NULL)) {
+ pr_err("failed to initialize sysfs.\n");
+ rv = 1;
+ goto release;
+ }
+ strcpy(info.text_version, sra->text_version);
+ info.component_size = s->size*2;
+ info.new_level = s->level;
+ info.new_chunk = s->chunk * 1024;
+ if (info.array.level == LEVEL_CONTAINER) {
+ info.delta_disks = UnSet;
+ info.array.raid_disks = s->raiddisks;
+ } else if (s->raiddisks)
+ info.delta_disks = s->raiddisks - info.array.raid_disks;
+ else
+ info.delta_disks = UnSet;
+ if (s->layout_str == NULL) {
+ info.new_layout = UnSet;
+ if (info.array.level == 6 &&
+ (info.new_level == 6 || info.new_level == UnSet) &&
+ info.array.layout >= 16) {
+ pr_err("%s has a non-standard layout. If you wish to preserve this\n", devname);
+ cont_err("during the reshape, please specify --layout=preserve\n");
+ cont_err("If you want to change it, specify a layout or use --layout=normalise\n");
+ rv = 1;
+ goto release;
+ }
+ } else if (strcmp(s->layout_str, "normalise") == 0 ||
+ strcmp(s->layout_str, "normalize") == 0) {
+ /* If we have a -6 RAID6 layout, remove the '-6'. */
+ info.new_layout = UnSet;
+ if (info.array.level == 6 && info.new_level == UnSet) {
+ char l[40], *h;
+ strcpy(l, map_num(r6layout, info.array.layout));
+ h = strrchr(l, '-');
+ if (h && strcmp(h, "-6") == 0) {
+ *h = 0;
+ info.new_layout = map_name(r6layout, l);
+ }
+ } else {
+ pr_err("%s is only meaningful when reshaping a RAID6 array.\n", s->layout_str);
+ rv = 1;
+ goto release;
+ }
+ } else if (strcmp(s->layout_str, "preserve") == 0) {
+ /* This means that a non-standard RAID6 layout
+ * is OK.
+ * In particular:
+ * - When reshape a RAID6 (e.g. adding a device)
+ * which is in a non-standard layout, it is OK
+ * to preserve that layout.
+ * - When converting a RAID5 to RAID6, leave it in
+ * the XXX-6 layout, don't re-layout.
+ */
+ if (info.array.level == 6 && info.new_level == UnSet)
+ info.new_layout = info.array.layout;
+ else if (info.array.level == 5 && info.new_level == 6) {
+ char l[40];
+ strcpy(l, map_num(r5layout, info.array.layout));
+ strcat(l, "-6");
+ info.new_layout = map_name(r6layout, l);
+ } else {
+ pr_err("%s in only meaningful when reshaping to RAID6\n", s->layout_str);
+ rv = 1;
+ goto release;
+ }
+ } else {
+ int l = info.new_level;
+ if (l == UnSet)
+ l = info.array.level;
+ switch (l) {
+ case 5:
+ info.new_layout = map_name(r5layout, s->layout_str);
+ break;
+ case 6:
+ info.new_layout = map_name(r6layout, s->layout_str);
+ break;
+ case 10:
+ info.new_layout = parse_layout_10(s->layout_str);
+ break;
+ case LEVEL_FAULTY:
+ info.new_layout = parse_layout_faulty(s->layout_str);
+ break;
+ default:
+ pr_err("layout not meaningful with this level\n");
+ rv = 1;
+ goto release;
+ }
+ if (info.new_layout == UnSet) {
+ pr_err("layout %s not understood for this level\n",
+ s->layout_str);
+ rv = 1;
+ goto release;
+ }
+ }
+
+ if (array.level == LEVEL_FAULTY) {
+ if (s->level != UnSet && s->level != array.level) {
+ pr_err("cannot change level of Faulty device\n");
+ rv =1 ;
+ }
+ if (s->chunk) {
+ pr_err("cannot set chunksize of Faulty device\n");
+ rv =1 ;
+ }
+ if (s->raiddisks && s->raiddisks != 1) {
+ pr_err("cannot set raid_disks of Faulty device\n");
+ rv =1 ;
+ }
+ if (s->layout_str) {
+ if (md_get_array_info(fd, &array) != 0) {
+ dprintf("Cannot get array information.\n");
+ goto release;
+ }
+ array.layout = info.new_layout;
+ if (md_set_array_info(fd, &array) != 0) {
+ pr_err("failed to set new layout\n");
+ rv = 1;
+ } else if (c->verbose >= 0)
+ printf("layout for %s set to %d\n",
+ devname, array.layout);
+ }
+ } else if (array.level == LEVEL_CONTAINER) {
+ /* This change is to be applied to every array in the
+ * container. This is only needed when the metadata imposes
+ * restraints of the various arrays in the container.
+ * Currently we only know that IMSM requires all arrays
+ * to have the same number of devices so changing the
+ * number of devices (On-Line Capacity Expansion) must be
+ * performed at the level of the container
+ */
+ close_fd(&fd);
+ rv = reshape_container(container, devname, -1, st, &info,
+ c->force, c->backup_file, c->verbose,
+ 0, 0, 0);
+ frozen = 0;
+ } else {
+ /* get spare devices from external metadata
+ */
+ if (st->ss->external) {
+ struct mdinfo *info2;
+
+ info2 = st->ss->container_content(st, subarray);
+ if (info2) {
+ info.array.spare_disks =
+ info2->array.spare_disks;
+ sysfs_free(info2);
+ }
+ }
+
+ /* Impose these changes on a single array. First
+ * check that the metadata is OK with the change. */
+
+ if (reshape_super(st, 0, info.new_level,
+ info.new_layout, info.new_chunk,
+ info.array.raid_disks, info.delta_disks,
+ c->backup_file, devname,
+ APPLY_METADATA_CHANGES, c->verbose)) {
+ rv = 1;
+ goto release;
+ }
+ sync_metadata(st);
+ rv = reshape_array(container, fd, devname, st, &info, c->force,
+ devlist, data_offset, c->backup_file,
+ c->verbose, 0, 0, 0);
+ frozen = 0;
+ }
+release:
+ sysfs_free(sra);
+ if (frozen > 0)
+ unfreeze(st);
+ return rv;
+}
+
+/* verify_reshape_position()
+ *	Compare the reshape position recorded in the metadata
+ *	(info->reshape_progress) with the value md exposes in the "sync_max"
+ *	sysfs attribute, scaled by the number of data disks.
+ *	If the metadata is behind md, info->reshape_progress is advanced to
+ *	md's position.
+ * Parameters:
+ *	info  : array description; ->reshape_progress may be updated
+ *	level : RAID level handed to get_data_disks() for the scaling
+ * Return value:
+ *	 0 : not valid sysfs entry
+ *	     it can be caused by not started reshape, it should be started
+ *	     by reshape array or raid0 array is before takeover
+ *	-1 : error, reshape position is obviously wrong
+ *	 1 : success, reshape progress correct or updated
+ */
+static int verify_reshape_position(struct mdinfo *info, int level)
+{
+	int ret_val = 0;
+	char buf[40];
+	int rv;
+
+	/* read sync_max, failure can mean raid0 array */
+	rv = sysfs_get_str(info, NULL, "sync_max", buf, sizeof(buf));
+
+	if (rv > 0) {
+		char *ep;
+		unsigned long long position = strtoull(buf, &ep, 0);
+
+		dprintf("Read sync_max sysfs entry is: %s\n", buf);
+		/* Only trust the value if at least one digit was parsed and
+		 * the number is followed by end-of-string, newline or space.
+		 * Anything else leaves ret_val at 0.
+		 */
+		if (!(ep == buf || (*ep != 0 && *ep != '\n' && *ep != ' '))) {
+			position *= get_data_disks(level,
+						   info->new_layout,
+						   info->array.raid_disks);
+			if (info->reshape_progress < position) {
+				dprintf("Corrected reshape progress (%llu) to md position (%llu)\n",
+					info->reshape_progress, position);
+				info->reshape_progress = position;
+				ret_val = 1;
+			} else if (info->reshape_progress > position) {
+				pr_err("Fatal error: array reshape was not properly frozen (expected reshape position is %llu, but reshape progress is %llu).\n",
+				       position, info->reshape_progress);
+				ret_val = -1;
+			} else {
+				dprintf("Reshape position in md and metadata are the same\n");
+				ret_val = 1;
+			}
+		}
+	} else if (rv == 0) {
+		/* for valid sysfs entry, 0-length content
+		 * should be indicated as error
+		 */
+		ret_val = -1;
+	}
+
+	return ret_val;
+}
+
+static unsigned long long choose_offset(unsigned long long lo,
+					unsigned long long hi,
+					unsigned long long min,
+					unsigned long long max)
+{
+	/* Pick a new offset for the window lo..hi.
+	 * The result has to stay within min..max; beyond that we would
+	 * like it close to the midpoint of lo/hi and aligned to a large
+	 * power of 2.
+	 *
+	 * Start at the midpoint.  Then walk the low-order bits upwards
+	 * from '1': whenever a bit is set in the current pick, clear it
+	 * by either adding or subtracting that bit, whichever is allowed
+	 * and leaves the pick furthest from the lo/hi boundary.  If
+	 * neither direction stays inside min..max, give up and keep
+	 * what we have.
+	 *
+	 * Units are sectors, so 1MB alignment is reached after the
+	 * bits below 2*1024 have been handled.
+	 */
+	const unsigned long long one_mb = 2*1024;
+	unsigned long long pick = (lo + hi) / 2;
+	unsigned long long step;
+
+	for (step = 1; step < one_mb; step <<= 1) {
+		unsigned long long up, down;
+		int up_ok, down_ok;
+
+		if ((pick & step) == 0)
+			continue;
+
+		up = pick + step;
+		down = pick - step;
+		up_ok = (up <= max);
+		down_ok = (down >= min);
+
+		if (!up_ok && !down_ok)
+			break;
+
+		if (!up_ok)
+			pick = down;
+		else if (!down_ok)
+			pick = up;
+		else
+			pick = (hi - up > down - lo) ? up : down;
+	}
+	return pick;
+}
+
+static int set_new_data_offset(struct mdinfo *sra, struct supertype *st,
+ char *devname, int delta_disks,
+ unsigned long long data_offset,
+ unsigned long long min,
+ int can_fallback)
+{ /* Choose a new data offset for every non-faulty device listed in
+ * sra->devs and write it to that device's "new_offset" sysfs attribute.
+ *
+ * Pass 1 loads each device's superblock (through a duplicate of 'st')
+ * to find the smallest space_before/space_after, and, when an explicit
+ * 'data_offset' was given, checks that it moves every device in the
+ * same direction.
+ * Pass 2 computes the per-device offset: moved up when delta_disks < 0,
+ * moved down when delta_disks > 0, and otherwise in the direction
+ * recorded in 'dir'. 'min' is the smallest acceptable move.
+ *
+ * 'data_offset' == INVALID_SECTORS means "no explicit request".
+ * 'can_fallback' selects a silent fallback (return 1) instead of an
+ * error when there is not enough room.
+ *
+ * Returns:
+ * -1 : error, a message has been printed
+ * 1 : offsets were not changed; the caller should use another method
+ * otherwise the result of the last "new_offset" write
+ */
+ struct mdinfo *sd;
+ int dir = 0; /* 0: undecided, 1: data_offset increases, -1: decreases */
+ int err = 0;
+ unsigned long long before, after;
+
+ /* Need to find min space before and after so same is used
+ * on all devices
+ * UINT64_MAX doubles as "no device examined yet"; this is
+ * tested once the loop has finished.
+ */
+ before = UINT64_MAX;
+ after = UINT64_MAX;
+ for (sd = sra->devs; sd; sd = sd->next) {
+ char *dn;
+ int dfd;
+ int rv;
+ struct supertype *st2;
+ struct mdinfo info2;
+
+ if (sd->disk.state & (1<<MD_DISK_FAULTY))
+ continue;
+ dn = map_dev(sd->disk.major, sd->disk.minor, 0);
+ dfd = dev_open(dn, O_RDONLY);
+ if (dfd < 0) {
+ pr_err("%s: cannot open component %s\n",
+ devname, dn ? dn : "-unknown-");
+ goto release;
+ }
+ st2 = dup_super(st);
+ rv = st2->ss->load_super(st2,dfd, NULL);
+ close(dfd);
+ if (rv) {
+ free(st2);
+ pr_err("%s: cannot get superblock from %s\n",
+ devname, dn);
+ goto release;
+ }
+ st2->ss->getinfo_super(st2, &info2, NULL);
+ st2->ss->free_super(st2);
+ free(st2);
+ if (info2.space_before == 0 &&
+ info2.space_after == 0) {
+ /* Metadata doesn't support data_offset changes */
+ if (!can_fallback)
+ pr_err("%s: Metadata version doesn't support data_offset changes\n",
+ devname);
+ goto fallback;
+ }
+ if (before > info2.space_before)
+ before = info2.space_before;
+ if (after > info2.space_after)
+ after = info2.space_after;
+
+ if (data_offset != INVALID_SECTORS) {
+ if (dir == 0) { /* first device examined fixes the direction */
+ if (info2.data_offset == data_offset) {
+ pr_err("%s: already has that data_offset\n",
+ dn);
+ goto release;
+ }
+ if (data_offset < info2.data_offset)
+ dir = -1;
+ else
+ dir = 1;
+ } else if ((data_offset <= info2.data_offset &&
+ dir == 1) ||
+ (data_offset >= info2.data_offset &&
+ dir == -1)) {
+ pr_err("%s: differing data offsets on devices make this --data-offset setting impossible\n",
+ dn);
+ goto release;
+ }
+ }
+ }
+ if (before == UINT64_MAX)
+ /* impossible really, there must be no devices */
+ return 1;
+
+ for (sd = sra->devs; sd; sd = sd->next) {
+ char *dn = map_dev(sd->disk.major, sd->disk.minor, 0);
+ unsigned long long new_data_offset;
+
+ if (sd->disk.state & (1<<MD_DISK_FAULTY))
+ continue;
+ if (delta_disks < 0) {
+ /* Don't need any space as array is shrinking
+ * just move data_offset up by min
+ */
+ if (data_offset == INVALID_SECTORS)
+ new_data_offset = sd->data_offset + min;
+ else {
+ if (data_offset < sd->data_offset + min) {
+ pr_err("--data-offset too small for %s\n",
+ dn);
+ goto release;
+ }
+ new_data_offset = data_offset;
+ }
+ } else if (delta_disks > 0) {
+ /* need space before */
+ if (before < min) {
+ if (can_fallback)
+ goto fallback;
+ pr_err("Insufficient head-space for reshape on %s\n",
+ dn);
+ goto release;
+ }
+ if (data_offset == INVALID_SECTORS)
+ new_data_offset = sd->data_offset - min;
+ else {
+ if (data_offset > sd->data_offset - min) {
+ pr_err("--data-offset too large for %s\n",
+ dn);
+ goto release;
+ }
+ new_data_offset = data_offset;
+ }
+ } else {
+ if (dir == 0) {
+ /* can move up or down. If 'data_offset'
+ * was set we would have already decided,
+ * so just choose direction with most space.
+ */
+ if (before > after)
+ dir = -1;
+ else
+ dir = 1;
+ }
+ sysfs_set_str(sra, NULL, "reshape_direction",
+ dir == 1 ? "backwards" : "forwards"); /* raising data_offset is written as "backwards" */
+ if (dir > 0) {
+ /* Increase data offset */
+ if (after < min) {
+ if (can_fallback)
+ goto fallback;
+ pr_err("Insufficient tail-space for reshape on %s\n",
+ dn);
+ goto release;
+ }
+ if (data_offset != INVALID_SECTORS &&
+ data_offset < sd->data_offset + min) {
+ pr_err("--data-offset too small on %s\n",
+ dn);
+ goto release;
+ }
+ if (data_offset != INVALID_SECTORS)
+ new_data_offset = data_offset;
+ else
+ new_data_offset = choose_offset(sd->data_offset,
+ sd->data_offset + after,
+ sd->data_offset + min,
+ sd->data_offset + after);
+ } else {
+ /* Decrease data offset */
+ if (before < min) {
+ if (can_fallback)
+ goto fallback;
+ pr_err("insufficient head-room on %s\n",
+ dn);
+ goto release;
+ }
+ if (data_offset != INVALID_SECTORS &&
+ data_offset > sd->data_offset - min) {
+ pr_err("--data-offset too large on %s\n",
+ dn);
+ goto release;
+ }
+ if (data_offset != INVALID_SECTORS)
+ new_data_offset = data_offset;
+ else
+ new_data_offset = choose_offset(sd->data_offset - before,
+ sd->data_offset,
+ sd->data_offset - before,
+ sd->data_offset - min);
+ }
+ }
+ err = sysfs_set_num(sra, sd, "new_offset", new_data_offset);
+ if (err < 0 && errno == E2BIG) {
+ /* try again after increasing data size to max */
+ err = sysfs_set_num(sra, sd, "size", 0);
+ if (err < 0 && errno == EINVAL &&
+ !(sd->disk.state & (1<<MD_DISK_SYNC))) {
+ /* some kernels have a bug where you cannot
+ * use '0' on spare devices. */
+ sysfs_set_num(sra, sd, "size",
+ (sra->component_size + after)/2);
+ }
+ err = sysfs_set_num(sra, sd, "new_offset",
+ new_data_offset);
+ }
+ if (err < 0) {
+ if (errno == E2BIG && data_offset != INVALID_SECTORS) {
+ pr_err("data-offset is too big for %s\n", dn);
+ goto release;
+ }
+ if (sd == sra->devs &&
+ (errno == ENOENT || errno == E2BIG))
+ /* Early kernel, no 'new_offset' file,
+ * or kernel doesn't like us.
+ * For RAID5/6 this is not fatal
+ * Only the first device in the list gets
+ * this treatment; the value returned is the
+ * same as on the fallback path.
+ */
+ return 1;
+ pr_err("Cannot set new_offset for %s\n", dn);
+ break;
+ }
+ }
+ return err; /* negative only when the loop stopped on a failed write */
+release:
+ return -1;
+fallback:
+ /* Just use a backup file */
+ return 1;
+}
+
+static int raid10_reshape(char *container, int fd, char *devname,
+			  struct supertype *st, struct mdinfo *info,
+			  struct reshape *reshape,
+			  unsigned long long data_offset,
+			  int force, int verbose)
+{
+	/* Reshape a RAID10: raid_disks, layout, chunksize, or possibly
+	 * nothing but data_offset, is changing.
+	 * data_offset always has to move, by no less than
+	 * ->min_offset_change (the larger of the old and new chunk sizes).
+	 *  - raid_disks growing:   data_offset must go down by at least
+	 *    that much.
+	 *  - raid_disks unchanged: data_offset may go either way by at
+	 *    least min_offset_change, preferably far more; half of the
+	 *    available space is chosen.
+	 *  - raid_disks shrinking: data_offset must go up by at least
+	 *    min_offset_change, and component_size has to shrink by the
+	 *    same amount to make room.
+	 *
+	 * Sequence: work out the minimum and the direction, reduce
+	 * component_size if required, set new_data_offset on each device,
+	 * then write chunk_size, layout and raid_disks and kick off
+	 * 'reshape'.
+	 *
+	 * Returns 0 on success and 1 on any failure.
+	 */
+	struct mdinfo *sra;
+	unsigned long long min_change;
+	int status = 1;
+	int err;
+
+	sra = sysfs_read(fd, NULL,
+			 GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE|GET_CHUNK
+		);
+	if (!sra) {
+		pr_err("%s: Cannot get array details from sysfs\n", devname);
+		goto out;
+	}
+	min_change = reshape->min_offset_change;
+
+	if (info->delta_disks) {
+		char *direction;
+
+		if (info->delta_disks < 0)
+			direction = "backwards";
+		else
+			direction = "forwards";
+		sysfs_set_str(sra, NULL, "reshape_direction", direction);
+	}
+	if (info->delta_disks < 0 && info->space_after < min_change) {
+		if (sysfs_set_num(sra, NULL, "component_size",
+				  (sra->component_size - min_change)/2)) {
+			pr_err("cannot reduce component size\n");
+			goto out;
+		}
+	}
+
+	err = set_new_data_offset(sra, st, devname, info->delta_disks,
+				  data_offset, min_change, 0);
+	if (err == 1) {
+		/* "fall back" is not an option for RAID10 */
+		pr_err("Cannot set new_data_offset: RAID10 reshape not\n");
+		cont_err("supported on this kernel\n");
+		goto out;
+	}
+	if (err < 0)
+		goto out;
+
+	/* Each step is attempted only while no error has been recorded */
+	if (!err) {
+		if (sysfs_set_num(sra, NULL, "chunk_size",
+				  info->new_chunk) < 0)
+			err = errno;
+	}
+	if (!err) {
+		if (sysfs_set_num(sra, NULL, "layout",
+				  reshape->after.layout) < 0)
+			err = errno;
+	}
+	if (!err) {
+		if (sysfs_set_num(sra, NULL, "raid_disks",
+				  info->array.raid_disks +
+				  info->delta_disks) < 0)
+			err = errno;
+	}
+	if (!err) {
+		if (sysfs_set_str(sra, NULL, "sync_action", "reshape") < 0)
+			err = errno;
+	}
+
+	if (err) {
+		pr_err("Cannot set array shape for %s\n",
+		       devname);
+		if (err == EBUSY &&
+		    (info->array.state & (1<<MD_SB_BITMAP_PRESENT)))
+			cont_err("       Bitmap must be removed before shape can be changed\n");
+		goto out;
+	}
+	status = 0;
+out:
+	sysfs_free(sra);
+	return status;
+}
+
+static void get_space_after(int fd, struct supertype *st, struct mdinfo *info)
+{
+	/* Store in info->space_before / info->space_after the smallest
+	 * value reported by the superblock of any non-faulty member
+	 * device.  'info' is left alone if the device list cannot be
+	 * read, if any device cannot be opened or loaded, or if no
+	 * device was examined at all.
+	 */
+	struct mdinfo *sra, *sd;
+	/* Initialisation to silence compiler warning */
+	unsigned long long least_before = 0, least_after = 0;
+	int have_min = 0;
+
+	sra = sysfs_read(fd, NULL, GET_DEVS);
+	if (!sra)
+		return;
+
+	sd = sra->devs;
+	while (sd) {
+		if (!(sd->disk.state & (1<<MD_DISK_FAULTY))) {
+			struct supertype *st2;
+			struct mdinfo info2;
+			char *dn;
+			int dfd;
+			int rv;
+
+			dn = map_dev(sd->disk.major, sd->disk.minor, 0);
+			dfd = dev_open(dn, O_RDONLY);
+			if (dfd < 0)
+				break;
+			st2 = dup_super(st);
+			rv = st2->ss->load_super(st2,dfd, NULL);
+			close(dfd);
+			if (rv) {
+				free(st2);
+				break;
+			}
+			st2->ss->getinfo_super(st2, &info2, NULL);
+			st2->ss->free_super(st2);
+			free(st2);
+
+			if (!have_min || info2.space_before < least_before)
+				least_before = info2.space_before;
+			if (!have_min || info2.space_after < least_after)
+				least_after = info2.space_after;
+			have_min = 1;
+		}
+		sd = sd->next;
+	}
+
+	/* sd is non-NULL here only if the walk was cut short */
+	if (!sd && have_min) {
+		info->space_after = least_after;
+		info->space_before = least_before;
+	}
+	sysfs_free(sra);
+}
+
+static void update_cache_size(char *container, struct mdinfo *sra,
+			      struct mdinfo *info,
+			      int disks, unsigned long long blocks)
+{
+	/* The internal stripe cache has to be big enough for the reshape
+	 * to work: at least 4 stripes of the larger of the old and new
+	 * chunk sizes, and enough for 'blocks' spread over 'disks' with
+	 * a little to spare.  Raise "stripe_cache_size" if the current
+	 * value (sra->cache_size) is below that.
+	 */
+	unsigned long long per_disk = 16 + blocks / disks;
+	unsigned long pages;
+
+	pages = max(info->array.chunk_size, info->new_chunk);
+	pages = pages * 4 / 512;	/* 4 stripes minimum, in sectors */
+	if (pages < per_disk)
+		pages = per_disk;
+	pages /= (4096/512);		/* sectors -> pages */
+
+	if (sra->cache_size < pages)
+		subarray_set_num(container, sra, "stripe_cache_size",
+				 pages + 1);
+}
+
+/* impose_reshape()
+ *	Record the starting point of the reshape in 'sra' and, unless the
+ *	reshape is being restarted, push the new shape to the kernel.
+ *
+ *	When neither chunk size nor layout changes and the metadata is not
+ *	external, only raid_disks is changed, through md_set_array_info().
+ *	Otherwise chunk_size, layout and raid_disks are all written via
+ *	sysfs.
+ * Parameters:
+ *	sra       : sysfs view of the array; ->new_chunk and
+ *		    ->reshape_progress are filled in here
+ *	info      : requested change (new_chunk, reshape_progress, ...)
+ *	st        : metadata handler; st->ss->external selects the path
+ *	fd        : open descriptor for the array
+ *	restart   : non-zero when continuing an already started reshape
+ *	devname   : array name, used in messages only
+ *	container : passed to subarray_set_num() for the raid_disks update
+ *	reshape   : before/after geometry
+ * Return value:
+ *	 0 : success
+ *	-1 : array information unavailable or the new shape was rejected
+ */
+static int impose_reshape(struct mdinfo *sra,
+			  struct mdinfo *info,
+			  struct supertype *st,
+			  int fd,
+			  int restart,
+			  char *devname, char *container,
+			  struct reshape *reshape)
+{
+	struct mdu_array_info_s array;
+
+	sra->new_chunk = info->new_chunk;
+
+	if (restart) {
+		/* for external metadata checkpoint saved by mdmon can be lost
+		 * or missed /due to e.g. crash/. Check if md is not during
+		 * restart farther than metadata points to.
+		 * If so, this means metadata information is obsolete.
+		 */
+		if (st->ss->external)
+			verify_reshape_position(info, reshape->level);
+		sra->reshape_progress = info->reshape_progress;
+	} else {
+		sra->reshape_progress = 0;
+		if (reshape->after.data_disks < reshape->before.data_disks)
+			/* start from the end of the new array */
+			sra->reshape_progress = (sra->component_size
+						 * reshape->after.data_disks);
+	}
+
+	/* 'array' is modified and handed back to the kernel below, and its
+	 * state bits are tested in the error paths, so it must not be used
+	 * if it could not be read.
+	 */
+	if (md_get_array_info(fd, &array) != 0) {
+		dprintf("Cannot get array information.\n");
+		goto release;
+	}
+	if (info->array.chunk_size == info->new_chunk &&
+	    reshape->before.layout == reshape->after.layout &&
+	    st->ss->external == 0) {
+		/* use SET_ARRAY_INFO but only if reshape hasn't started */
+		array.raid_disks = reshape->after.data_disks + reshape->parity;
+		if (!restart && md_set_array_info(fd, &array) != 0) {
+			/* save errno before anything can overwrite it */
+			int err = errno;
+
+			pr_err("Cannot set device shape for %s: %s\n",
+			       devname, strerror(err));
+
+			if (err == EBUSY &&
+			    (array.state & (1<<MD_SB_BITMAP_PRESENT)))
+				cont_err("Bitmap must be removed before shape can be changed\n");
+
+			goto release;
+		}
+	} else if (!restart) {
+		/* set them all just in case some old 'new_*' value
+		 * persists from some earlier problem.
+		 */
+		int err = 0;
+		if (sysfs_set_num(sra, NULL, "chunk_size", info->new_chunk) < 0)
+			err = errno;
+		if (!err && sysfs_set_num(sra, NULL, "layout",
+					  reshape->after.layout) < 0)
+			err = errno;
+		if (!err && subarray_set_num(container, sra, "raid_disks",
+					     reshape->after.data_disks +
+					     reshape->parity) < 0)
+			err = errno;
+		if (err) {
+			pr_err("Cannot set device shape for %s\n", devname);
+
+			if (err == EBUSY &&
+			    (array.state & (1<<MD_SB_BITMAP_PRESENT)))
+				cont_err("Bitmap must be removed before shape can be changed\n");
+			goto release;
+		}
+	}
+	return 0;
+release:
+	return -1;
+}
+
+/* impose_level()
+ *	Change the array open on 'fd' to RAID level 'level' by writing the
+ *	personality name to the "level" sysfs attribute.
+ *	When going from RAID4/5/6 to RAID0, every device that is not an
+ *	active data device is removed first (hot-removed, and failed then
+ *	removed if it is still present afterwards).
+ * Parameters:
+ *	fd      : open descriptor for the array
+ *	level   : target level, looked up in the 'pers' table
+ *	devname : array name, used in messages only
+ *	verbose : when >= 0 the successful change is reported
+ * Return value:
+ *	 0 : level changed, or 'level' has no name in 'pers'
+ *	 1 : sysfs could not be initialised
+ *	-1 : array information unavailable, or the RAID5/6 layout is not
+ *	     ALGORITHM_PARITY_N / ALGORITHM_PARITY_N_6
+ *	otherwise the errno left by the failed write to "level"
+ */
+static int impose_level(int fd, int level, char *devname, int verbose)
+{
+	char *c;
+	struct mdu_array_info_s array;
+	struct mdinfo info;
+
+	if (sysfs_init(&info, fd, NULL)) {
+		pr_err("failed to initialize sysfs.\n");
+		return 1;
+	}
+
+	/* Everything below is driven by array.level/layout/raid_disks, so
+	 * do not continue with an unfilled structure.
+	 */
+	if (md_get_array_info(fd, &array) != 0) {
+		dprintf("Cannot get array information.\n");
+		return -1;
+	}
+	if (level == 0 && (array.level >= 4 && array.level <= 6)) {
+		/* To convert to RAID0 we need to fail and
+		 * remove any non-data devices. */
+		int found = 0;
+		int d;
+		int data_disks = array.raid_disks - 1;
+		if (array.level == 6)
+			data_disks -= 1;
+		if (array.level == 5 && array.layout != ALGORITHM_PARITY_N)
+			return -1;
+		if (array.level == 6 && array.layout != ALGORITHM_PARITY_N_6)
+			return -1;
+		sysfs_set_str(&info, NULL,"sync_action", "idle");
+		/* First remove any spares so no recovery starts */
+		for (d = 0, found = 0;
+		     d < MAX_DISKS && found < array.nr_disks; d++) {
+			mdu_disk_info_t disk;
+			disk.number = d;
+			if (md_get_disk_info(fd, &disk) < 0)
+				continue;
+			if (disk.major == 0 && disk.minor == 0)
+				continue;
+			found++;
+			if ((disk.state & (1 << MD_DISK_ACTIVE)) &&
+			    disk.raid_disk < data_disks)
+				/* keep this */
+				continue;
+			/* best effort: a device that cannot be removed here
+			 * is dealt with by the second pass
+			 */
+			ioctl(fd, HOT_REMOVE_DISK,
+			      makedev(disk.major, disk.minor));
+		}
+		/* Now fail anything left */
+		/* NOTE(review): the result of this refresh is not checked;
+		 * presumably 'array' keeps the earlier snapshot if it
+		 * fails - confirm.
+		 */
+		md_get_array_info(fd, &array);
+		for (d = 0, found = 0;
+		     d < MAX_DISKS && found < array.nr_disks; d++) {
+			mdu_disk_info_t disk;
+			disk.number = d;
+			if (md_get_disk_info(fd, &disk) < 0)
+				continue;
+			if (disk.major == 0 && disk.minor == 0)
+				continue;
+			found++;
+			if ((disk.state & (1 << MD_DISK_ACTIVE)) &&
+			    disk.raid_disk < data_disks)
+				/* keep this */
+				continue;
+			ioctl(fd, SET_DISK_FAULTY,
+			      makedev(disk.major, disk.minor));
+			hot_remove_disk(fd, makedev(disk.major, disk.minor), 1);
+		}
+	}
+	c = map_num(pers, level);
+	if (c) {
+		int err = sysfs_set_str(&info, NULL, "level", c);
+		if (err) {
+			/* save errno before pr_err() can disturb it */
+			err = errno;
+			pr_err("%s: could not set level to %s\n",
+				devname, c);
+			if (err == EBUSY &&
+			    (array.state & (1<<MD_SB_BITMAP_PRESENT)))
+				cont_err("Bitmap must be removed before level can be changed\n");
+			return err;
+		}
+		if (verbose >= 0)
+			pr_err("level of %s changed to %s\n", devname, c);
+	}
+	return 0;
+}
+
+/* Set to 1 by catch_term(); never reset in this block.
+ * NOTE(review): a flag written from a signal handler is normally
+ * 'volatile sig_atomic_t'; the type is left alone here because other
+ * files may declare this symbol - confirm before changing.
+ */
+int sigterm = 0;
+static void catch_term(int sig)
+{
+	/* the signal number is not needed, only the fact that it arrived */
+	(void)sig;
+	sigterm = 1;
+}
+
+static int reshape_array(char *container, int fd, char *devname,
+ struct supertype *st, struct mdinfo *info,
+ int force, struct mddev_dev *devlist,
+ unsigned long long data_offset,
+ char *backup_file, int verbose, int forked,
+ int restart, int freeze_reshape)
+{
+ struct reshape reshape;
+ int spares_needed;
+ char *msg;
+ int orig_level = UnSet;
+ int odisks;
+ int delayed;
+
+ struct mdu_array_info_s array;
+ char *c;
+
+ struct mddev_dev *dv;
+ int added_disks;
+
+ int *fdlist = NULL;
+ unsigned long long *offsets = NULL;
+ int d;
+ int nrdisks;
+ int err;
+ unsigned long blocks;
+ unsigned long long array_size;
+ int done;
+ struct mdinfo *sra = NULL;
+ char buf[20];
+
+ /* when reshaping a RAID0, the component_size might be zero.
+ * So try to fix that up.
+ */
+ if (md_get_array_info(fd, &array) != 0) {
+ dprintf("Cannot get array information.\n");
+ goto release;
+ }
+ if (array.level == 0 && info->component_size == 0) {
+ get_dev_size(fd, NULL, &array_size);
+ info->component_size = array_size / array.raid_disks;
+ }
+
+ if (array.level == 10)
+ /* Need space_after info */
+ get_space_after(fd, st, info);
+
+ if (info->reshape_active) {
+ int new_level = info->new_level;
+ info->new_level = UnSet;
+ if (info->delta_disks > 0)
+ info->array.raid_disks -= info->delta_disks;
+ msg = analyse_change(devname, info, &reshape);
+ info->new_level = new_level;
+ if (info->delta_disks > 0)
+ info->array.raid_disks += info->delta_disks;
+ if (!restart)
+ /* Make sure the array isn't read-only */
+ ioctl(fd, RESTART_ARRAY_RW, 0);
+ } else
+ msg = analyse_change(devname, info, &reshape);
+ if (msg) {
+ /* if msg == "", error has already been printed */
+ if (msg[0])
+ pr_err("%s\n", msg);
+ goto release;
+ }
+ if (restart && (reshape.level != info->array.level ||
+ reshape.before.layout != info->array.layout ||
+ reshape.before.data_disks + reshape.parity !=
+ info->array.raid_disks - max(0, info->delta_disks))) {
+ pr_err("reshape info is not in native format - cannot continue.\n");
+ goto release;
+ }
+
+ if (st->ss->external && restart && (info->reshape_progress == 0) &&
+ !((sysfs_get_str(info, NULL, "sync_action",
+ buf, sizeof(buf)) > 0) &&
+ (strncmp(buf, "reshape", 7) == 0))) {
+ /* When reshape is restarted from '0', very begin of array
+ * it is possible that for external metadata reshape and array
+ * configuration doesn't happen.
+ * Check if md has the same opinion, and reshape is restarted
+ * from 0. If so, this is regular reshape start after reshape
+ * switch in metadata to next array only.
+ */
+ if ((verify_reshape_position(info, reshape.level) >= 0) &&
+ (info->reshape_progress == 0))
+ restart = 0;
+ }
+ if (restart) {
+ /*
+ * reshape already started. just skip to monitoring
+ * the reshape
+ */
+ if (reshape.backup_blocks == 0)
+ return 0;
+ if (restart & RESHAPE_NO_BACKUP)
+ return 0;
+
+ /* Need 'sra' down at 'started:' */
+ sra = sysfs_read(fd, NULL,
+ GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE|
+ GET_CHUNK|GET_CACHE);
+ if (!sra) {
+ pr_err("%s: Cannot get array details from sysfs\n",
+ devname);
+ goto release;
+ }
+
+ if (!backup_file)
+ backup_file = locate_backup(sra->sys_name);
+
+ goto started;
+ }
+ /* The container is frozen but the array may not be.
+ * So freeze the array so spares don't get put to the wrong use
+ * FIXME there should probably be a cleaner separation between
+ * freeze_array and freeze_container.
+ */
+ sysfs_freeze_array(info);
+ /* Check we have enough spares to not be degraded */
+ added_disks = 0;
+ for (dv = devlist; dv ; dv=dv->next)
+ added_disks++;
+ spares_needed = max(reshape.before.data_disks,
+ reshape.after.data_disks) +
+ reshape.parity - array.raid_disks;
+
+ if (!force && info->new_level > 1 && info->array.level > 1 &&
+ spares_needed > info->array.spare_disks + added_disks) {
+ pr_err("Need %d spare%s to avoid degraded array, and only have %d.\n"
+ " Use --force to over-ride this check.\n",
+ spares_needed,
+ spares_needed == 1 ? "" : "s",
+ info->array.spare_disks + added_disks);
+ goto release;
+ }
+ /* Check we have enough spares to not fail */
+ spares_needed = max(reshape.before.data_disks,
+ reshape.after.data_disks)
+ - array.raid_disks;
+ if ((info->new_level > 1 || info->new_level == 0) &&
+ spares_needed > info->array.spare_disks +added_disks) {
+ pr_err("Need %d spare%s to create working array, and only have %d.\n",
+ spares_needed, spares_needed == 1 ? "" : "s",
+ info->array.spare_disks + added_disks);
+ goto release;
+ }
+
+ if (reshape.level != array.level) {
+ int err = impose_level(fd, reshape.level, devname, verbose);
+ if (err)
+ goto release;
+ info->new_layout = UnSet; /* after level change,
+ * layout is meaningless */
+ orig_level = array.level;
+ sysfs_freeze_array(info);
+
+ if (reshape.level > 0 && st->ss->external) {
+ /* make sure mdmon is aware of the new level */
+ if (mdmon_running(container))
+ flush_mdmon(container);
+
+ if (!mdmon_running(container))
+ start_mdmon(container);
+ ping_monitor(container);
+ if (mdmon_running(container) && st->update_tail == NULL)
+ st->update_tail = &st->updates;
+ }
+ }
+ /* ->reshape_super might have chosen some spares from the
+ * container that it wants to be part of the new array.
+ * We can collect them with ->container_content and give
+ * them to the kernel.
+ */
+ if (st->ss->reshape_super && st->ss->container_content) {
+ char *subarray = strchr(info->text_version+1, '/')+1;
+ struct mdinfo *info2 =
+ st->ss->container_content(st, subarray);
+ struct mdinfo *d;
+
+ if (info2) {
+ if (sysfs_init(info2, fd, st->devnm)) {
+ pr_err("unable to initialize sysfs for %s\n",
+ st->devnm);
+ free(info2);
+ goto release;
+ }
+ /* When increasing number of devices, we need to set
+ * new raid_disks before adding these, or they might
+ * be rejected.
+ */
+ if (reshape.backup_blocks &&
+ reshape.after.data_disks >
+ reshape.before.data_disks)
+ subarray_set_num(container, info2, "raid_disks",
+ reshape.after.data_disks +
+ reshape.parity);
+ for (d = info2->devs; d; d = d->next) {
+ if (d->disk.state == 0 &&
+ d->disk.raid_disk >= 0) {
+ /* This is a spare that wants to
+ * be part of the array.
+ */
+ add_disk(fd, st, info2, d);
+ }
+ }
+ sysfs_free(info2);
+ }
+ }
+ /* We might have been given some devices to add to the
+ * array. Now that the array has been changed to the right
+ * level and frozen, we can safely add them.
+ */
+ if (devlist) {
+ if (Manage_subdevs(devname, fd, devlist, verbose, 0, NULL, 0))
+ goto release;
+ }
+
+ if (reshape.backup_blocks == 0 && data_offset != INVALID_SECTORS)
+ reshape.backup_blocks = reshape.before.data_disks * info->array.chunk_size/512;
+ if (reshape.backup_blocks == 0) {
+ /* No restriping needed, but we might need to impose
+ * some more changes: layout, raid_disks, chunk_size
+ */
+ /* read current array info */
+ if (md_get_array_info(fd, &array) != 0) {
+ dprintf("Cannot get array information.\n");
+ goto release;
+ }
+ /* compare current array info with new values and if
+ * it is different update them to new */
+ if (info->new_layout != UnSet &&
+ info->new_layout != array.layout) {
+ array.layout = info->new_layout;
+ if (md_set_array_info(fd, &array) != 0) {
+ pr_err("failed to set new layout\n");
+ goto release;
+ } else if (verbose >= 0)
+ printf("layout for %s set to %d\n",
+ devname, array.layout);
+ }
+ if (info->delta_disks != UnSet && info->delta_disks != 0 &&
+ array.raid_disks !=
+ (info->array.raid_disks + info->delta_disks)) {
+ array.raid_disks += info->delta_disks;
+ if (md_set_array_info(fd, &array) != 0) {
+ pr_err("failed to set raid disks\n");
+ goto release;
+ } else if (verbose >= 0) {
+ printf("raid_disks for %s set to %d\n",
+ devname, array.raid_disks);
+ }
+ }
+ if (info->new_chunk != 0 &&
+ info->new_chunk != array.chunk_size) {
+ if (sysfs_set_num(info, NULL,
+ "chunk_size", info->new_chunk) != 0) {
+ pr_err("failed to set chunk size\n");
+ goto release;
+ } else if (verbose >= 0)
+ printf("chunk size for %s set to %d\n",
+ devname, info->new_chunk);
+ }
+ unfreeze(st);
+ return 0;
+ }
+
+ /*
+ * There are three possibilities.
+ * 1/ The array will shrink.
+ * We need to ensure the reshape will pause before reaching
+ * the 'critical section'. We also need to fork and wait for
+ * that to happen. When it does we
+ * suspend/backup/complete/unfreeze
+ *
+ * 2/ The array will not change size.
+ * This requires that we keep a backup of a sliding window
+ * so that we can restore data after a crash. So we need
+ * to fork and monitor progress.
+ * In future we will allow the data_offset to change, so
+ * a sliding backup becomes unnecessary.
+ *
+ * 3/ The array will grow. This is relatively easy.
+ * However the kernel's restripe routines will cheerfully
+ * overwrite some early data before it is safe. So we
+ * need to make a backup of the early parts of the array
+ * and be ready to restore it if rebuild aborts very early.
+ * For externally managed metadata, we still need a forked
+ * child to monitor the reshape and suspend IO over the region
+ * that is being reshaped.
+ *
+ * We backup data by writing it to one spare, or to a
+ * file which was given on command line.
+ *
+ * In each case, we first make sure that storage is available
+ * for the required backup.
+ * Then we:
+ * - request the shape change.
+ * - fork to handle backup etc.
+ */
+ /* Check that we can hold all the data */
+ get_dev_size(fd, NULL, &array_size);
+ if (reshape.new_size < (array_size/512)) {
+ pr_err("this change will reduce the size of the array.\n"
+ " use --grow --array-size first to truncate array.\n"
+ " e.g. mdadm --grow %s --array-size %llu\n",
+ devname, reshape.new_size/2);
+ goto release;
+ }
+
+ if (array.level == 10) {
+ /* Reshaping RAID10 does not require any data backup by
+ * user-space. Instead it requires that the data_offset
+ * is changed to avoid the need for backup.
+ * So this is handled very separately
+ */
+ if (restart)
+ /* Nothing to do. */
+ return 0;
+ return raid10_reshape(container, fd, devname, st, info,
+ &reshape, data_offset, force, verbose);
+ }
+ sra = sysfs_read(fd, NULL,
+ GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE|GET_CHUNK|
+ GET_CACHE);
+ if (!sra) {
+ pr_err("%s: Cannot get array details from sysfs\n",
+ devname);
+ goto release;
+ }
+
+ if (!backup_file)
+ switch(set_new_data_offset(sra, st, devname,
+ reshape.after.data_disks - reshape.before.data_disks,
+ data_offset,
+ reshape.min_offset_change, 1)) {
+ case -1:
+ goto release;
+ case 0:
+ /* Updated data_offset, so it's easy now */
+ update_cache_size(container, sra, info,
+ min(reshape.before.data_disks,
+ reshape.after.data_disks),
+ reshape.backup_blocks);
+
+ /* Right, everything seems fine. Let's kick things off.
+ */
+ sync_metadata(st);
+
+ if (impose_reshape(sra, info, st, fd, restart,
+ devname, container, &reshape) < 0)
+ goto release;
+ if (sysfs_set_str(sra, NULL, "sync_action", "reshape") < 0) {
+ struct mdinfo *sd;
+ if (errno != EINVAL) {
+ pr_err("Failed to initiate reshape!\n");
+ goto release;
+ }
+ /* revert data_offset and try the old way */
+ for (sd = sra->devs; sd; sd = sd->next) {
+ sysfs_set_num(sra, sd, "new_offset",
+ sd->data_offset);
+ sysfs_set_str(sra, NULL, "reshape_direction",
+ "forwards");
+ }
+ break;
+ }
+ if (info->new_level == reshape.level)
+ return 0;
+ /* need to adjust level when reshape completes */
+ switch(fork()) {
+ case -1: /* ignore error, but don't wait */
+ return 0;
+ default: /* parent */
+ return 0;
+ case 0:
+ manage_fork_fds(0);
+ map_fork();
+ break;
+ }
+ close(fd);
+ wait_reshape(sra);
+ fd = open_dev(sra->sys_name);
+ if (fd >= 0)
+ impose_level(fd, info->new_level, devname, verbose);
+ return 0;
+ case 1: /* Couldn't set data_offset, try the old way */
+ if (data_offset != INVALID_SECTORS) {
+ pr_err("Cannot update data_offset on this array\n");
+ goto release;
+ }
+ break;
+ }
+
+started:
+ /* Decide how many blocks (sectors) for a reshape
+ * unit. The number we have so far is just a minimum
+ */
+ blocks = reshape.backup_blocks;
+ if (reshape.before.data_disks ==
+ reshape.after.data_disks) {
+ /* Make 'blocks' bigger for better throughput, but
+ * not so big that we reject it below.
+ * Try for 16 megabytes
+ */
+ while (blocks * 32 < sra->component_size && blocks < 16*1024*2)
+ blocks *= 2;
+ } else
+ pr_err("Need to backup %luK of critical section..\n", blocks/2);
+
+ if (blocks >= sra->component_size/2) {
+ pr_err("%s: Something wrong - reshape aborted\n", devname);
+ goto release;
+ }
+
+ /* Now we need to open all these devices so we can read/write.
+ */
+ nrdisks = max(reshape.before.data_disks,
+ reshape.after.data_disks) + reshape.parity
+ + sra->array.spare_disks;
+ fdlist = xcalloc((1+nrdisks), sizeof(int));
+ offsets = xcalloc((1+nrdisks), sizeof(offsets[0]));
+
+ odisks = reshape.before.data_disks + reshape.parity;
+ d = reshape_prepare_fdlist(devname, sra, odisks, nrdisks, blocks,
+ backup_file, fdlist, offsets);
+ if (d < odisks) {
+ goto release;
+ }
+ if ((st->ss->manage_reshape == NULL) ||
+ (st->ss->recover_backup == NULL)) {
+ if (backup_file == NULL) {
+ if (reshape.after.data_disks <=
+ reshape.before.data_disks) {
+ pr_err("%s: Cannot grow - need backup-file\n",
+ devname);
+ pr_err(" Please provide one with \"--backup=...\"\n");
+ goto release;
+ } else if (d == odisks) {
+ pr_err("%s: Cannot grow - need a spare or backup-file to backup critical section\n", devname);
+ goto release;
+ }
+ } else {
+ if (!reshape_open_backup_file(backup_file, fd, devname,
+ (signed)blocks,
+ fdlist+d, offsets+d,
+ sra->sys_name, restart)) {
+ goto release;
+ }
+ d++;
+ }
+ }
+
+ update_cache_size(container, sra, info,
+ min(reshape.before.data_disks,
+ reshape.after.data_disks), blocks);
+
+ /* Right, everything seems fine. Let's kick things off.
+ * If only changing raid_disks, use ioctl, else use
+ * sysfs.
+ */
+ sync_metadata(st);
+
+ if (impose_reshape(sra, info, st, fd, restart,
+ devname, container, &reshape) < 0)
+ goto release;
+
+ err = start_reshape(sra, restart, reshape.before.data_disks,
+ reshape.after.data_disks, st);
+ if (err) {
+ pr_err("Cannot %s reshape for %s\n",
+ restart ? "continue" : "start", devname);
+ goto release;
+ }
+ if (restart)
+ sysfs_set_str(sra, NULL, "array_state", "active");
+ if (freeze_reshape) {
+ free(fdlist);
+ free(offsets);
+ sysfs_free(sra);
+ pr_err("Reshape has to be continued from location %llu when root filesystem has been mounted.\n",
+ sra->reshape_progress);
+ return 1;
+ }
+
+ if (!forked)
+ if (continue_via_systemd(container ?: sra->sys_name,
+ GROW_SERVICE)) {
+ free(fdlist);
+ free(offsets);
+ sysfs_free(sra);
+ return 0;
+ }
+
+ close(fd);
+ /* Now we just need to kick off the reshape and watch, while
+ * handling backups of the data...
+ * This is all done by a forked background process.
+ */
+ switch(forked ? 0 : fork()) {
+ case -1:
+ pr_err("Cannot run child to monitor reshape: %s\n",
+ strerror(errno));
+ abort_reshape(sra);
+ goto release;
+ default:
+ free(fdlist);
+ free(offsets);
+ sysfs_free(sra);
+ return 0;
+ case 0:
+ map_fork();
+ break;
+ }
+
+ /* If another array on the same devices is busy, the
+ * reshape will wait for them. This would mean that
+ * the first section that we suspend will stay suspended
+ * for a long time. So check on that possibility
+ * by looking for "DELAYED" in /proc/mdstat, and if found,
+ * wait a while
+ */
+ do {
+ struct mdstat_ent *mds, *m;
+ delayed = 0;
+ mds = mdstat_read(1, 0);
+ for (m = mds; m; m = m->next)
+ if (strcmp(m->devnm, sra->sys_name) == 0) {
+ if (m->resync && m->percent == RESYNC_DELAYED)
+ delayed = 1;
+ if (m->resync == 0)
+ /* Haven't started the reshape thread
+ * yet, wait a bit
+ */
+ delayed = 2;
+ break;
+ }
+ free_mdstat(mds);
+ if (delayed == 1 && get_linux_version() < 3007000) {
+ pr_err("Reshape is delayed, but cannot wait carefully with this kernel.\n"
+ " You might experience problems until other reshapes complete.\n");
+ delayed = 0;
+ }
+ if (delayed)
+ mdstat_wait(30 - (delayed-1) * 25);
+ } while (delayed);
+ mdstat_close();
+ if (check_env("MDADM_GROW_VERIFY"))
+ fd = open(devname, O_RDONLY | O_DIRECT);
+ else
+ fd = -1;
+ mlockall(MCL_FUTURE);
+
+ signal(SIGTERM, catch_term);
+
+ if (st->ss->external) {
+ /* metadata handler takes it from here */
+ done = st->ss->manage_reshape(
+ fd, sra, &reshape, st, blocks,
+ fdlist, offsets, d - odisks, fdlist + odisks,
+ offsets + odisks);
+ } else
+ done = child_monitor(
+ fd, sra, &reshape, st, blocks, fdlist, offsets,
+ d - odisks, fdlist + odisks, offsets + odisks);
+
+ free(fdlist);
+ free(offsets);
+
+ if (backup_file && done) {
+ char *bul;
+ bul = make_backup(sra->sys_name);
+ if (bul) {
+ char buf[1024];
+ int l = readlink(bul, buf, sizeof(buf) - 1);
+ if (l > 0) {
+ buf[l]=0;
+ unlink(buf);
+ }
+ unlink(bul);
+ free(bul);
+ }
+ unlink(backup_file);
+ }
+ if (!done) {
+ abort_reshape(sra);
+ goto out;
+ }
+
+ if (!st->ss->external &&
+ !(reshape.before.data_disks != reshape.after.data_disks &&
+ info->custom_array_size) && info->new_level == reshape.level &&
+ !forked) {
+ /* no need to wait for the reshape to finish as
+ * there is nothing more to do.
+ */
+ sysfs_free(sra);
+ exit(0);
+ }
+ wait_reshape(sra);
+
+ if (st->ss->external) {
+ /* Re-load the metadata as much could have changed */
+ int cfd = open_dev(st->container_devnm);
+ if (cfd >= 0) {
+ flush_mdmon(container);
+ st->ss->free_super(st);
+ st->ss->load_container(st, cfd, container);
+ close(cfd);
+ }
+ }
+
+ /* set new array size if required customer_array_size is used
+ * by this metadata.
+ */
+ if (reshape.before.data_disks != reshape.after.data_disks &&
+ info->custom_array_size)
+ set_array_size(st, info, info->text_version);
+
+ if (info->new_level != reshape.level) {
+ if (fd < 0)
+ fd = open(devname, O_RDONLY);
+ impose_level(fd, info->new_level, devname, verbose);
+ close(fd);
+ if (info->new_level == 0)
+ st->update_tail = NULL;
+ }
+out:
+ sysfs_free(sra);
+ if (forked)
+ return 0;
+ unfreeze(st);
+ exit(0);
+
+release:
+ free(fdlist);
+ free(offsets);
+ if (orig_level != UnSet && sra) {
+ c = map_num(pers, orig_level);
+ if (c && sysfs_set_str(sra, NULL, "level", c) == 0)
+ pr_err("aborting level change\n");
+ }
+ sysfs_free(sra);
+ if (!forked)
+ unfreeze(st);
+ return 1;
+}
+
+/* Reshape, one at a time, every member array of 'container' that the
+ * metadata reports as reshape_active.
+ *
+ * Unless 'restart' is set, the requested change is first applied to the
+ * container metadata with reshape_super().  The per-array work is then
+ * done in a child process (or in this process when 'forked' is set) by
+ * calling reshape_array() on each active member in turn.
+ *
+ * mdfd handle is passed to be closed in child process (after fork).
+ *
+ * Returns 1 if reshape_super() or fork() fails, 0 in the parent after a
+ * successful fork or when continue_via_systemd() took the job over.
+ * The process that runs the per-array loop never returns: it calls
+ * exit(0).
+ */
+int reshape_container(char *container, char *devname,
+		      int mdfd,
+		      struct supertype *st,
+		      struct mdinfo *info,
+		      int force,
+		      char *backup_file, int verbose,
+		      int forked, int restart, int freeze_reshape)
+{
+	struct mdinfo *cc = NULL;
+	int rv = restart;
+	char last_devnm[32] = "";
+
+	/* component_size is not meaningful for a container,
+	 * so pass '0' meaning 'no change'
+	 */
+	if (!restart &&
+	    reshape_super(st, 0, info->new_level,
+			  info->new_layout, info->new_chunk,
+			  info->array.raid_disks, info->delta_disks,
+			  backup_file, devname, APPLY_METADATA_CHANGES,
+			  verbose)) {
+		unfreeze(st);
+		return 1;
+	}
+
+	sync_metadata(st);
+
+	/* ping monitor to be sure that update is on disk
+	 */
+	ping_monitor(container);
+
+	if (!forked && !freeze_reshape)
+		if (continue_via_systemd(container, GROW_SERVICE))
+			return 0;
+
+	switch (forked ? 0 : fork()) {
+	case -1: /* error */
+		/* no trailing newline: perror() appends ": <error>\n" itself */
+		perror("Cannot fork to complete reshape");
+		unfreeze(st);
+		return 1;
+	default: /* parent */
+		if (!freeze_reshape)
+			printf("%s: multi-array reshape continues in background\n", Name);
+		return 0;
+	case 0: /* child */
+		manage_fork_fds(0);
+		map_fork();
+		break;
+	}
+
+	/* close unused handle in child process
+	 */
+	if (mdfd > -1)
+		close(mdfd);
+
+	while(1) {
+		/* For each member array with reshape_active,
+		 * we need to perform the reshape.
+		 * We pick the first array that needs reshaping and
+		 * reshape it.  reshape_array() will re-read the metadata
+		 * so the next time through a different array should be
+		 * ready for reshape.
+		 * It is possible that the 'different' array will not
+		 * be assembled yet.  In that case we simply exit.
+		 * When it is assembled, the mdadm which assembles it
+		 * will take over the reshape.
+		 */
+		struct mdinfo *content;
+		int fd;
+		struct mdstat_ent *mdstat = NULL;
+		char *adev;
+		dev_t devid;
+
+		sysfs_free(cc);
+
+		cc = st->ss->container_content(st, NULL);
+
+		for (content = cc; content ; content = content->next) {
+			char *slash;
+
+			if (!content->reshape_active)
+				continue;
+
+			/* The subarray name follows the second '/' of
+			 * text_version.  Skip entries that have none
+			 * rather than dereferencing NULL+1.
+			 */
+			slash = strchr(content->text_version+1, '/');
+			if (!slash)
+				continue;
+			mdstat = mdstat_by_subdev(slash + 1, container);
+			if (!mdstat)
+				continue;
+			if (mdstat->active == 0) {
+				pr_err("Skipping inactive array %s.\n",
+				       mdstat->devnm);
+				free_mdstat(mdstat);
+				mdstat = NULL;
+				continue;
+			}
+			break;
+		}
+		if (!content)
+			break;
+
+		devid = devnm2devid(mdstat->devnm);
+		adev = map_dev(major(devid), minor(devid), 0);
+		if (!adev)
+			adev = content->text_version;
+
+		fd = open_dev(mdstat->devnm);
+		if (fd < 0) {
+			pr_err("Device %s cannot be opened for reshape.\n",
+			       adev);
+			free_mdstat(mdstat);
+			break;
+		}
+
+		if (strcmp(last_devnm, mdstat->devnm) == 0) {
+			/* Do not allow for multiple reshape_array() calls for
+			 * the same array.
+			 * It can happen when reshape_array() returns without
+			 * error, when reshape is not finished (wrong reshape
+			 * starting/continuation conditions).  Mdmon doesn't
+			 * switch to next array in container and reentry
+			 * conditions for the same array occur.
+			 * This is possibly interim until the behaviour of
+			 * reshape_array is resolved().
+			 */
+			printf("%s: Multiple reshape execution detected for device %s.\n", Name, adev);
+			close(fd);
+			free_mdstat(mdstat);
+			break;
+		}
+		strcpy(last_devnm, mdstat->devnm);
+
+		if (sysfs_init(content, fd, mdstat->devnm)) {
+			pr_err("Unable to initialize sysfs for %s\n",
+			       mdstat->devnm);
+			rv = 1;
+			/* release what this iteration acquired, as the
+			 * other error paths do
+			 */
+			close(fd);
+			free_mdstat(mdstat);
+			break;
+		}
+
+		/* The mdstat entry is not used past this point; free it
+		 * so that it is not leaked on every pass of the loop.
+		 */
+		free_mdstat(mdstat);
+		mdstat = NULL;
+
+		if (mdmon_running(container))
+			flush_mdmon(container);
+
+		rv = reshape_array(container, fd, adev, st,
+				   content, force, NULL, INVALID_SECTORS,
+				   backup_file, verbose, 1, restart,
+				   freeze_reshape);
+		close(fd);
+
+		if (freeze_reshape) {
+			sysfs_free(cc);
+			exit(0);
+		}
+
+		/* 'restart' only applies to the first array handled */
+		restart = 0;
+		if (rv)
+			break;
+
+		if (mdmon_running(container))
+			flush_mdmon(container);
+	}
+	if (!rv)
+		unfreeze(st);
+	sysfs_free(cc);
+	exit(0);
+}
+
+/*
+ * We run a child process in the background which performs the following
+ * steps:
+ * - wait for resync to reach a certain point
+ * - suspend io to the following section
+ * - backup that section
+ * - allow resync to proceed further
+ * - resume io
+ * - discard the backup.
+ *
+ * These are combined in slightly different ways in the three cases.
+ * Grow:
+ * - suspend/backup/allow/wait/resume/discard
+ * Shrink:
+ * - allow/wait/suspend/backup/allow/wait/resume/discard
+ * same-size:
+ * - wait/resume/discard/suspend/backup/allow
+ *
+ * suspend/backup/allow always come together
+ * wait/resume/discard do too.
+ * For the same-size case we have two backups to improve flow.
+ *
+ */
+
+int progress_reshape(struct mdinfo *info, struct reshape *reshape,
+ unsigned long long backup_point,
+ unsigned long long wait_point,
+ unsigned long long *suspend_point,
+ unsigned long long *reshape_completed, int *frozen)
+{
+ /* This function is called repeatedly by the reshape manager.
+ * It determines how much progress can safely be made and allows
+ * that progress.
+ * - 'info' identifies the array and particularly records in
+ * ->reshape_progress the metadata's knowledge of progress
+ * This is a sector offset from the start of the array
+ * of the next array block to be relocated. This number
+ * may increase from 0 or decrease from array_size, depending
+ * on the type of reshape that is happening.
+ * Note that in contrast, 'sync_completed' is a block count of the
+ * reshape so far. It gives the distance between the start point
+ * (head or tail of device) and the next place that data will be
+ * written. It always increases.
+ * - 'reshape' is the structure created by analyse_change
+ * - 'backup_point' shows how much the metadata manager has backed-up
+ * data. For reshapes with increasing progress, it is the next address
+ * to be backed up, previous addresses have been backed-up. For
+ * decreasing progress, it is the earliest address that has been
+ * backed up - later addresses are also backed up.
+ * So addresses between reshape_progress and backup_point are
+ * backed up providing those are in the 'correct' order.
+ * - 'wait_point' is an array address. When reshape_completed
+ * passes this point, progress_reshape should return. It might
+ * return earlier if it determines that ->reshape_progress needs
+ * to be updated or further backup is needed.
+ * - suspend_point is maintained by progress_reshape and the caller
+ * should not touch it except to initialise to zero.
+ * It is an array address and it only increases in 2.6.37 and earlier.
+ * This makes it difficult to handle reducing reshapes with
+ * external metadata.
+ * However: it is similar to backup_point in that it records the
+ * other end of a suspended region from reshape_progress.
+ * it is moved to extend the region that is safe to backup and/or
+ * reshape
+ * - reshape_completed is read from sysfs and returned. The caller
+ * should copy this into ->reshape_progress when it has reason to
+ * believe that the metadata knows this, and any backup outside this
+ * has been erased.
+ * - 'frozen' is a flag kept by the caller. While it is non-zero,
+ * "sync_max" is not written by this function. It is updated here
+ * only on the recovery path at the end: when "sync_completed"
+ * becomes readable again it is set to whether "sync_max" now
+ * differs from the value computed by this call.
+ *
+ * Return value is:
+ * 1 if more data from backup_point - but only as far as suspend_point,
+ * should be backed up
+ * (1 is also returned when "sync_completed" could not be read
+ * at first but became readable again within the wait below)
+ * 0 if things are progressing smoothly
+ * -1 if the reshape is finished because it is all done,
+ * -2 if the reshape is finished due to an error.
+ */
+
+ int advancing = (reshape->after.data_disks
+ >= reshape->before.data_disks);
+ unsigned long long need_backup; /* All data between start of array and
+ * here will at some point need to
+ * be backed up.
+ */
+ unsigned long long read_offset, write_offset;
+ unsigned long long write_range;
+ unsigned long long max_progress, target, completed;
+ unsigned long long array_size = (info->component_size
+ * reshape->before.data_disks);
+ int fd;
+ char buf[20];
+
+ /* First, we unsuspend any region that is now known to be safe.
+ * If suspend_point is on the 'wrong' side of reshape_progress, then
+ * we don't have or need suspension at the moment. This is true for
+ * native metadata when we don't need to back-up.
+ */
+ if (advancing) {
+ if (info->reshape_progress <= *suspend_point)
+ sysfs_set_num(info, NULL, "suspend_lo",
+ info->reshape_progress);
+ } else {
+ /* Note: this won't work in 2.6.37 and before.
+ * Something somewhere should make sure we don't need it!
+ */
+ if (info->reshape_progress >= *suspend_point)
+ sysfs_set_num(info, NULL, "suspend_hi",
+ info->reshape_progress);
+ }
+
+ /* Now work out how far it is safe to progress.
+ * If the read_offset for ->reshape_progress is less than
+ * 'blocks' beyond the write_offset, we can only progress as far
+ * as a backup.
+ * Otherwise we can progress until the write_offset for the new location
+ * reaches (within 'blocks' of) the read_offset at the current location.
+ * However that region must be suspended unless we are using native
+ * metadata.
+ * If we need to suspend more, we limit it to 128M per device, which is
+ * rather arbitrary and should be some time-based calculation.
+ */
+ read_offset = info->reshape_progress / reshape->before.data_disks;
+ write_offset = info->reshape_progress / reshape->after.data_disks;
+ write_range = info->new_chunk/512;
+ if (reshape->before.data_disks == reshape->after.data_disks)
+ need_backup = array_size;
+ else
+ need_backup = reshape->backup_blocks;
+ if (advancing) {
+ if (read_offset < write_offset + write_range)
+ max_progress = backup_point;
+ else
+ max_progress =
+ read_offset * reshape->after.data_disks;
+ } else {
+ if (read_offset > write_offset - write_range)
+ /* Can only progress as far as has been backed up,
+ * which must be suspended */
+ max_progress = backup_point;
+ else if (info->reshape_progress <= need_backup)
+ max_progress = backup_point;
+ else {
+ if (info->array.major_version >= 0)
+ /* Can progress until backup is needed */
+ max_progress = need_backup;
+ else {
+ /* Can progress until metadata update is required */
+ max_progress =
+ read_offset * reshape->after.data_disks;
+ /* but data must be suspended */
+ if (max_progress < *suspend_point)
+ max_progress = *suspend_point;
+ }
+ }
+ }
+
+ /* We know it is safe to progress to 'max_progress' providing
+ * it is suspended or we are using native metadata.
+ * Consider extending suspend_point 128M per device if it
+ * is less than 64M per device beyond reshape_progress.
+ * But always do a multiple of 'blocks'
+ * FIXME this is too big - it takes too long to complete
+ * this much.
+ * 'target' below is 64M (in sectors) per data disk, rounded
+ * down to a multiple of ->backup_blocks but never less than
+ * two of them.
+ */
+ target = 64*1024*2 * min(reshape->before.data_disks,
+ reshape->after.data_disks);
+ target /= reshape->backup_blocks;
+ if (target < 2)
+ target = 2;
+ target *= reshape->backup_blocks;
+
+ /* For externally managed metadata we always need to suspend IO to
+ * the area being reshaped so we regularly push suspend_point forward.
+ * For native metadata we only need the suspend if we are going to do
+ * a backup.
+ */
+ if (advancing) {
+ if ((need_backup > info->reshape_progress ||
+ info->array.major_version < 0) &&
+ *suspend_point < info->reshape_progress + target) {
+ if (need_backup < *suspend_point + 2 * target)
+ *suspend_point = need_backup;
+ else if (*suspend_point + 2 * target < array_size)
+ *suspend_point += 2 * target;
+ else
+ *suspend_point = array_size;
+ sysfs_set_num(info, NULL, "suspend_hi", *suspend_point);
+ if (max_progress > *suspend_point)
+ max_progress = *suspend_point;
+ }
+ } else {
+ if (info->array.major_version >= 0) {
+ /* Only need to suspend when about to backup */
+ if (info->reshape_progress < need_backup * 2 &&
+ *suspend_point > 0) {
+ *suspend_point = 0;
+ sysfs_set_num(info, NULL, "suspend_lo", 0);
+ sysfs_set_num(info, NULL, "suspend_hi",
+ need_backup);
+ }
+ } else {
+ /* Need to suspend continually */
+ if (info->reshape_progress < *suspend_point)
+ *suspend_point = info->reshape_progress;
+ if (*suspend_point + target < info->reshape_progress)
+ /* No need to move suspend region yet */;
+ else {
+ if (*suspend_point >= 2 * target)
+ *suspend_point -= 2 * target;
+ else
+ *suspend_point = 0;
+ sysfs_set_num(info, NULL, "suspend_lo",
+ *suspend_point);
+ }
+ if (max_progress < *suspend_point)
+ max_progress = *suspend_point;
+ }
+ }
+
+ /* now set sync_max to allow that progress. sync_max, like
+ * sync_completed is a count of sectors written per device, so
+ * we find the difference between max_progress and the start point,
+ * and divide that by after.data_disks to get a sync_max
+ * number.
+ * At the same time we convert wait_point to a similar number
+ * for comparing against sync_completed.
+ */
+ /* scale down max_progress to per_disk */
+ max_progress /= reshape->after.data_disks;
+ /*
+ * Round to chunk size as some kernels give an erroneously
+ * high number
+ */
+ max_progress /= info->new_chunk/512;
+ max_progress *= info->new_chunk/512;
+ /* And round to old chunk size as the kernel wants that */
+ max_progress /= info->array.chunk_size/512;
+ max_progress *= info->array.chunk_size/512;
+ /* Limit progress to the whole device */
+ if (max_progress > info->component_size)
+ max_progress = info->component_size;
+ wait_point /= reshape->after.data_disks;
+ if (!advancing) {
+ /* switch from 'device offset' to 'processed block count' */
+ max_progress = info->component_size - max_progress;
+ wait_point = info->component_size - wait_point;
+ }
+
+ if (!*frozen)
+ sysfs_set_num(info, NULL, "sync_max", max_progress);
+
+ /* Now wait. If we have already reached the point that we were
+ * asked to wait to, don't wait at all, else wait for any change.
+ * We need to select on 'sync_completed' as that is the place that
+ * notifications happen, but we are really interested in
+ * 'reshape_position'
+ */
+ fd = sysfs_get_fd(info, NULL, "sync_completed");
+ if (fd < 0)
+ goto check_progress;
+
+ if (sysfs_fd_get_ll(fd, &completed) < 0)
+ goto check_progress;
+
+ while (completed < max_progress && completed < wait_point) {
+ /* Check that sync_action is still 'reshape' to avoid
+ * waiting forever on a dead array
+ */
+ char action[20];
+ if (sysfs_get_str(info, NULL, "sync_action", action, 20) <= 0 ||
+ strncmp(action, "reshape", 7) != 0)
+ break;
+ /* Some kernels reset 'sync_completed' to zero
+ * before setting 'sync_action' to 'idle'.
+ * So we need these extra tests.
+ * NOTE(review): the test above already leaves the loop
+ * unless sync_action starts with "reshape", so the
+ * "idle" comparisons below look unreachable - confirm.
+ */
+ if (completed == 0 && advancing &&
+ strncmp(action, "idle", 4) == 0 &&
+ info->reshape_progress > 0)
+ break;
+ if (completed == 0 && !advancing &&
+ strncmp(action, "idle", 4) == 0 &&
+ info->reshape_progress <
+ (info->component_size * reshape->after.data_disks))
+ break;
+ sysfs_wait(fd, NULL);
+ if (sysfs_fd_get_ll(fd, &completed) < 0)
+ goto check_progress;
+ }
+ /* Some kernels reset 'sync_completed' to zero,
+ * we need to have real point we are in md.
+ * So in that case, read 'reshape_position' from sysfs.
+ * Note that *reshape_completed is left untouched if the
+ * array is not idle or 'reshape_position' cannot be read.
+ */
+ if (completed == 0) {
+ unsigned long long reshapep;
+ char action[20];
+ if (sysfs_get_str(info, NULL, "sync_action", action, 20) > 0 &&
+ strncmp(action, "idle", 4) == 0 &&
+ sysfs_get_ll(info, NULL,
+ "reshape_position", &reshapep) == 0)
+ *reshape_completed = reshapep;
+ } else {
+ /* some kernels can give an incorrectly high
+ * 'completed' number, so round down */
+ completed /= (info->new_chunk/512);
+ completed *= (info->new_chunk/512);
+ /* Convert 'completed' back in to a 'progress' number */
+ completed *= reshape->after.data_disks;
+ if (!advancing)
+ completed = (info->component_size
+ * reshape->after.data_disks
+ - completed);
+ *reshape_completed = completed;
+ }
+
+ close(fd);
+
+ /* We return the need_backup flag. Caller will decide
+ * how much - a multiple of ->backup_blocks up to *suspend_point
+ */
+ if (advancing)
+ return need_backup > info->reshape_progress;
+ else
+ return need_backup >= info->reshape_progress;
+
+check_progress:
+ /* if we couldn't read a number from sync_completed, then
+ * either the reshape did complete, or it aborted.
+ * We can tell which by checking for 'none' in reshape_position.
+ * If it did abort, then it might immediately restart if
+ * it was just a device failure that leaves us degraded but
+ * functioning.
+ * 'fd' may be negative here (sysfs_get_fd failed), which is
+ * why every use below is guarded.
+ */
+ if (sysfs_get_str(info, NULL, "reshape_position", buf,
+ sizeof(buf)) < 0 || strncmp(buf, "none", 4) != 0) {
+ /* The abort might only be temporary. Wait up to 10
+ * seconds for fd to contain a valid number again.
+ */
+ int wait = 10000;
+ int rv = -2;
+ unsigned long long new_sync_max;
+ while (fd >= 0 && rv < 0 && wait > 0) {
+ if (sysfs_wait(fd, &wait) != 1)
+ break;
+ switch (sysfs_fd_get_ll(fd, &completed)) {
+ case 0:
+ /* all good again */
+ rv = 1;
+ /* If "sync_max" is no longer max_progress
+ * we need to freeze things
+ */
+ sysfs_get_ll(info, NULL, "sync_max",
+ &new_sync_max);
+ *frozen = (new_sync_max != max_progress);
+ break;
+ case -2: /* read error - abort */
+ wait = 0;
+ break;
+ }
+ }
+ if (fd >= 0)
+ close(fd);
+ return rv; /* -2: abort, 1: sync_completed is readable again */
+ } else {
+ /* Maybe racing with array shutdown - check state */
+ if (fd >= 0)
+ close(fd);
+ if (sysfs_get_str(info, NULL, "array_state", buf,
+ sizeof(buf)) < 0 ||
+ strncmp(buf, "inactive", 8) == 0 ||
+ strncmp(buf, "clear",5) == 0)
+ return -2; /* abort */
+ return -1; /* complete */
+ }
+}
+
+/* FIXME return status is never checked */
+static int grow_backup(struct mdinfo *sra,
+ unsigned long long offset, /* per device */
+ unsigned long stripes, /* per device, in old chunks */
+ int *sources, unsigned long long *offsets,
+ int disks, int chunk, int level, int layout,
+ int dests, int *destfd, unsigned long long *destoffsets,
+ int part, int *degraded,
+ char *buf)
+{
+ /* Backup 'blocks' sectors at 'offset' on each device of the array,
+ * to storage 'destfd' (offset 'destoffsets'), after first
+ * suspending IO. Then allow resync to continue
+ * over the suspended section.
+ * Use part 'part' of the backup-super-block.
+ *
+ * NOTE(review): there is no 'blocks' parameter; the amount copied
+ * is 'stripes' chunks per device. Nothing in this function
+ * suspends IO or touches resync - presumably the caller does
+ * that; confirm.
+ *
+ * What the code below does, in order:
+ * - re-reads "degraded" from sysfs; if it differs from *degraded,
+ * every device still flagged MD_DISK_SYNC whose sysfs "state"
+ * is unreadable, contains "faulty" or lacks "in_sync" is marked
+ * faulty and its entry in 'sources' is closed and set to -1.
+ * - records the range in the global 'bsb': arraystart/length for
+ * part 0, arraystart2/length2 (and magic[15] = '2') otherwise.
+ * - seeks each destination to the data area of the chosen part
+ * and copies the data with save_stripes().
+ * - writes the 512-byte bsb 4096 bytes before destoffsets[i] and,
+ * when destoffsets[i] > 4096, a second copy just after the data.
+ *
+ * Returns 0 on success, -1 if "degraded" cannot be read or a
+ * seek/write of the bsb fails, or the non-zero result of
+ * save_stripes().
+ */
+ int odata = disks;
+ int rv = 0;
+ int i;
+ unsigned long long ll;
+ int new_degraded;
+ //printf("offset %llu\n", offset);
+ if (level >= 4)
+ odata--;
+ if (level == 6)
+ odata--;
+
+ /* Check that array hasn't become degraded, else we might backup the wrong data */
+ if (sysfs_get_ll(sra, NULL, "degraded", &ll) < 0)
+ return -1; /* FIXME this error is ignored */
+ new_degraded = (int)ll;
+ if (new_degraded != *degraded) {
+ /* check each device to ensure it is still working */
+ struct mdinfo *sd;
+ for (sd = sra->devs ; sd ; sd = sd->next) {
+ if (sd->disk.state & (1<<MD_DISK_FAULTY))
+ continue;
+ if (sd->disk.state & (1<<MD_DISK_SYNC)) {
+ char sbuf[100];
+
+ if (sysfs_get_str(sra, sd, "state",
+ sbuf, sizeof(sbuf)) < 0 ||
+ strstr(sbuf, "faulty") ||
+ strstr(sbuf, "in_sync") == NULL) {
+ /* this device is dead */
+ sd->disk.state = (1<<MD_DISK_FAULTY);
+ if (sd->disk.raid_disk >= 0 &&
+ sources[sd->disk.raid_disk] >= 0) {
+ close(sources[sd->disk.raid_disk]);
+ sources[sd->disk.raid_disk] = -1;
+ }
+ }
+ }
+ }
+ *degraded = new_degraded;
+ }
+ if (part) {
+ bsb.arraystart2 = __cpu_to_le64(offset * odata);
+ bsb.length2 = __cpu_to_le64(stripes * (chunk/512) * odata);
+ } else {
+ bsb.arraystart = __cpu_to_le64(offset * odata);
+ bsb.length = __cpu_to_le64(stripes * (chunk/512) * odata);
+ }
+ if (part)
+ bsb.magic[15] = '2';
+ for (i = 0; i < dests; i++)
+ if (part)
+ lseek64(destfd[i], destoffsets[i] +
+ __le64_to_cpu(bsb.devstart2)*512, 0);
+ else
+ lseek64(destfd[i], destoffsets[i], 0);
+
+ rv = save_stripes(sources, offsets, disks, chunk, level, layout,
+ dests, destfd, offset * 512 * odata,
+ stripes * chunk * odata, buf);
+
+ if (rv)
+ return rv;
+ bsb.mtime = __cpu_to_le64(time(0));
+ for (i = 0; i < dests; i++) {
+ bsb.devstart = __cpu_to_le64(destoffsets[i]/512);
+
+ bsb.sb_csum = bsb_csum((char*)&bsb,
+ ((char*)&bsb.sb_csum)-((char*)&bsb));
+ if (memcmp(bsb.magic, "md_backup_data-2", 16) == 0)
+ bsb.sb_csum2 = bsb_csum((char*)&bsb,
+ ((char*)&bsb.sb_csum2)-((char*)&bsb));
+
+ rv = -1;
+ if ((unsigned long long)lseek64(destfd[i],
+ destoffsets[i] - 4096, 0) !=
+ destoffsets[i] - 4096)
+ break;
+ if (write(destfd[i], &bsb, 512) != 512)
+ break;
+ if (destoffsets[i] > 4096) {
+ if ((unsigned long long)lseek64(destfd[i], destoffsets[i]+stripes*chunk*odata, 0) !=
+ destoffsets[i]+stripes*chunk*odata)
+ break;
+ if (write(destfd[i], &bsb, 512) != 512)
+ break;
+ }
+ fsync(destfd[i]);
+ rv = 0;
+ }
+
+ return rv;
+}
+
+/* in 2.6.30, the value reported by sync_completed can be
+ * less than it should be by one stripe.
+ * This only happens when reshape hits sync_max and pauses.
+ * So allow wait_backup to either extend sync_max further
+ * than strictly necessary, or return before the
+ * sync has got quite as far as we would really like.
+ * This is what 'blocks2' is for.
+ * The various callers give appropriate values so that
+ * everything works.
+ */
+/* FIXME return value is often ignored */
+static int forget_backup(int dests, int *destfd,
+			 unsigned long long *destoffsets,
+			 int part)
+{
+	/*
+	 * Mark backup 'part' (0 or 1) as empty: zero its start and
+	 * length in the global backup super-block, refresh mtime and
+	 * the checksums, and rewrite the super-block 4096 bytes before
+	 * each destination offset.
+	 *
+	 * Returns 0 on success, -1 if any seek or write failed.  After
+	 * the first failure no further writes are attempted, but every
+	 * destination is still seeked and fsync()ed.
+	 */
+	int status = 0;
+	int n;
+
+	if (part == 0) {
+		bsb.arraystart = __cpu_to_le64(0);
+		bsb.length = __cpu_to_le64(0);
+	} else {
+		bsb.arraystart2 = __cpu_to_le64(0);
+		bsb.length2 = __cpu_to_le64(0);
+	}
+	bsb.mtime = __cpu_to_le64(time(0));
+
+	for (n = 0; n < dests; n++) {
+		/* the super-block sits 4096 bytes ahead of the data area */
+		unsigned long long sb_pos = destoffsets[n] - 4096;
+
+		bsb.devstart = __cpu_to_le64(destoffsets[n]/512);
+		bsb.sb_csum = bsb_csum((char*)&bsb,
+				       ((char*)&bsb.sb_csum)-((char*)&bsb));
+		if (memcmp(bsb.magic, "md_backup_data-2", 16) == 0)
+			bsb.sb_csum2 = bsb_csum((char*)&bsb,
+						((char*)&bsb.sb_csum2)-((char*)&bsb));
+
+		if ((unsigned long long)lseek64(destfd[n], sb_pos, 0) != sb_pos)
+			status = -1;
+		if (status == 0 && write(destfd[n], &bsb, 512) != 512)
+			status = -1;
+		fsync(destfd[n]);
+	}
+	return status;
+}
+
+static void fail(char *msg)
+{
+	/* Report 'msg' followed by a newline on file descriptor 2 and
+	 * terminate.  Exit status is 1 if either write came up short
+	 * or failed, otherwise 2.  Both writes are always attempted.
+	 */
+	size_t len = strlen(msg);
+	int msg_bad = (write(2, msg, len) != (int)len);
+	int nl_bad = (write(2, "\n", 1) != 1);
+
+	if (msg_bad || nl_bad)
+		exit(1);
+	exit(2);
+}
+
+/* Scratch buffers shared by successive validate() calls; both are
+ * 'abuflen' bytes long and 4096-byte aligned, or NULL when abuflen is 0.
+ */
+static char *abuf, *bbuf;
+static unsigned long long abuflen;
+
+/* Make sure abuf and bbuf can each hold 'len' bytes.
+ * Returns 1 when the buffers are usable, 0 on allocation failure, in
+ * which case both pointers are NULL and abuflen is 0 so that a later
+ * call can safely try again.
+ */
+static int validate_bufs(unsigned long long len)
+{
+	void *a = NULL, *b = NULL;
+
+	if (abuflen >= len)
+		return 1;
+
+	/* drop the old pair first and leave nothing dangling behind */
+	free(abuf);
+	free(bbuf);
+	abuf = NULL;
+	bbuf = NULL;
+	abuflen = 0;
+
+	/* Aligned buffers are used for both parts so that reads from a
+	 * descriptor opened with O_DIRECT are acceptable to the kernel.
+	 */
+	if (posix_memalign(&a, 4096, len) != 0)
+		return 0;
+	if (posix_memalign(&b, 4096, len) != 0) {
+		free(a);
+		return 0;
+	}
+	abuf = a;
+	bbuf = b;
+	abuflen = len;
+	return 1;
+}
+
+static void validate(int afd, int bfd, unsigned long long offset)
+{
+	/* Check the data in the backup against the array.
+	 * This is only used for regression testing and should not
+	 * be used while the array is active
+	 *
+	 * 'afd' is the array (nothing is done when it is negative),
+	 * 'bfd' is the backup device or file and 'offset' is the byte
+	 * offset of the backup data in 'bfd'; the backup super-block is
+	 * read from 4096 bytes before it.
+	 * Any inconsistency ends the process through fail().  Failure
+	 * to allocate the compare buffers just stops the validation.
+	 */
+	if (afd < 0)
+		return;
+	lseek64(bfd, offset - 4096, 0);
+	if (read(bfd, &bsb2, 512) != 512)
+		fail("cannot read bsb");
+	if (bsb2.sb_csum != bsb_csum((char*)&bsb2,
+				     ((char*)&bsb2.sb_csum)-((char*)&bsb2)))
+		fail("first csum bad");
+	if (memcmp(bsb2.magic, "md_backup_data", 14) != 0)
+		fail("magic is bad");
+	if (memcmp(bsb2.magic, "md_backup_data-2", 16) == 0 &&
+	    bsb2.sb_csum2 != bsb_csum((char*)&bsb2,
+				      ((char*)&bsb2.sb_csum2)-((char*)&bsb2)))
+		fail("second csum bad");
+
+	if (__le64_to_cpu(bsb2.devstart)*512 != offset)
+		fail("devstart is wrong");
+
+	if (bsb2.length) {
+		unsigned long long len = __le64_to_cpu(bsb2.length)*512;
+
+		if (!validate_bufs(len))
+			/* just stop validating on mem-alloc failure */
+			return;
+
+		lseek64(bfd, offset, 0);
+		if ((unsigned long long)read(bfd, bbuf, len) != len)
+			fail("read first backup failed");
+		lseek64(afd, __le64_to_cpu(bsb2.arraystart)*512, 0);
+		if ((unsigned long long)read(afd, abuf, len) != len)
+			fail("read first from array failed");
+		if (memcmp(bbuf, abuf, len) != 0)
+			fail("data1 compare failed");
+	}
+	if (bsb2.length2) {
+		unsigned long long len = __le64_to_cpu(bsb2.length2)*512;
+
+		if (!validate_bufs(len))
+			/* just stop validating on mem-alloc failure */
+			return;
+
+		lseek64(bfd, offset+__le64_to_cpu(bsb2.devstart2)*512, 0);
+		if ((unsigned long long)read(bfd, bbuf, len) != len)
+			fail("read second backup failed");
+		lseek64(afd, __le64_to_cpu(bsb2.arraystart2)*512, 0);
+		if ((unsigned long long)read(afd, abuf, len) != len)
+			fail("read second from array failed");
+		if (memcmp(bbuf, abuf, len) != 0)
+			fail("data2 compare failed");
+	}
+}
+
+int child_monitor(int afd, struct mdinfo *sra, struct reshape *reshape,
+		  struct supertype *st, unsigned long blocks,
+		  int *fds, unsigned long long *offsets,
+		  int dests, int *destfd, unsigned long long *destoffsets)
+{
+	/* Monitor a reshape where backup is being performed using
+	 * 'native' mechanism - either to a backup file, or
+	 * to some space in a spare.
+	 *
+	 * afd:         fd of the array; only handed to validate().
+	 * sra:         sysfs view of the array.  Its device list is searched
+	 *              for a loadable superblock and its reshape_progress
+	 *              is updated as the reshape advances.
+	 * reshape:     geometry before/after the reshape.
+	 * st:          metadata handler, used to load a superblock and
+	 *              extract the array uuid for the backup superblock.
+	 * blocks:      size of one backup part in 512-byte sectors; it is
+	 *              also recorded as the device offset of the second part.
+	 * fds/offsets: passed through unchanged to grow_backup().
+	 * dests, destfd, destoffsets:
+	 *              backup destinations, passed to grow_backup() and
+	 *              forget_backup(); destfd[0]/destoffsets[0] are
+	 *              additionally handed to validate().
+	 *
+	 * Returns 1 if the monitoring loop finished because
+	 * progress_reshape() returned -1, or returned 0 for a growing
+	 * array with native metadata.  Returns 0 in every other case: no
+	 * superblock found, buffer allocation failed, sigterm seen, or any
+	 * other negative result from progress_reshape().
+	 */
+	char *buf;
+	int degraded = -1;
+	/* Initialised so that a failed sysfs_get_ll() below can never leave
+	 * an indeterminate value to be written back to sync_speed_min.
+	 * NOTE(review): presumably writing 0 asks md for its default
+	 * minimum - confirm against the kernel md documentation.
+	 */
+	unsigned long long speed = 0;
+	unsigned long long suspend_point, array_size;
+	unsigned long long backup_point, wait_point;
+	unsigned long long reshape_completed;
+	int done = 0;
+	int increasing = reshape->after.data_disks >=
+		reshape->before.data_disks;
+	int part = 0; /* The next part of the backup area to fill.  It
+		       * may already be full, so we need to check */
+	int level = reshape->level;
+	int layout = reshape->before.layout;
+	int data = reshape->before.data_disks;
+	int disks = reshape->before.data_disks + reshape->parity;
+	int chunk = sra->array.chunk_size;
+	struct mdinfo *sd;
+	unsigned long stripes;
+	int uuid[4];
+	int frozen = 0;
+
+	/* set up the backup-super-block.  This requires the
+	 * uuid from the array.
+	 */
+	/* Find a superblock */
+	for (sd = sra->devs; sd; sd = sd->next) {
+		char *dn;
+		int devfd;
+		int ok;
+		if (sd->disk.state & (1<<MD_DISK_FAULTY))
+			continue;
+		dn = map_dev(sd->disk.major, sd->disk.minor, 1);
+		devfd = dev_open(dn, O_RDONLY);
+		if (devfd < 0)
+			continue;
+		ok = st->ss->load_super(st, devfd, NULL);
+		close(devfd);
+		if (ok == 0)
+			break;
+	}
+	if (!sd) {
+		pr_err("Cannot find a superblock\n");
+		return 0;
+	}
+
+	memset(&bsb, 0, 512);
+	memcpy(bsb.magic, "md_backup_data-1", 16);
+	st->ss->uuid_from_super(st, uuid);
+	memcpy(bsb.set_uuid, uuid, 16);
+	bsb.mtime = __cpu_to_le64(time(0));
+	/* Every reader decodes devstart2 with __le64_to_cpu(), so it must be
+	 * stored little-endian like the other backup-superblock fields.
+	 */
+	bsb.devstart2 = __cpu_to_le64(blocks);
+
+	/* Number of whole stripes that fit in one backup part */
+	stripes = blocks / (sra->array.chunk_size/512) /
+		reshape->before.data_disks;
+
+	if (posix_memalign((void**)&buf, 4096, disks * chunk))
+		/* Don't start the 'reshape' */
+		return 0;
+	if (reshape->before.data_disks == reshape->after.data_disks) {
+		/* Remember the current minimum sync speed, then raise it for
+		 * the duration of the reshape; it is put back before return.
+		 */
+		sysfs_get_ll(sra, NULL, "sync_speed_min", &speed);
+		sysfs_set_num(sra, NULL, "sync_speed_min", 200000);
+	}
+
+	if (increasing) {
+		array_size = sra->component_size * reshape->after.data_disks;
+		backup_point = sra->reshape_progress;
+		suspend_point = 0;
+	} else {
+		array_size = sra->component_size * reshape->before.data_disks;
+		backup_point = reshape->backup_blocks;
+		suspend_point = array_size;
+	}
+
+	while (!done) {
+		int rv;
+
+		/* Want to return as soon the oldest backup slot can
+		 * be released as that allows us to start backing up
+		 * some more, providing suspend_point has been
+		 * advanced, which it should have.
+		 */
+		if (increasing) {
+			wait_point = array_size;
+			if (part == 0 && __le64_to_cpu(bsb.length) > 0)
+				wait_point = (__le64_to_cpu(bsb.arraystart) +
+					      __le64_to_cpu(bsb.length));
+			if (part == 1 && __le64_to_cpu(bsb.length2) > 0)
+				wait_point = (__le64_to_cpu(bsb.arraystart2) +
+					      __le64_to_cpu(bsb.length2));
+		} else {
+			wait_point = 0;
+			if (part == 0 && __le64_to_cpu(bsb.length) > 0)
+				wait_point = __le64_to_cpu(bsb.arraystart);
+			if (part == 1 && __le64_to_cpu(bsb.length2) > 0)
+				wait_point = __le64_to_cpu(bsb.arraystart2);
+		}
+
+		reshape_completed = sra->reshape_progress;
+		rv = progress_reshape(sra, reshape,
+				      backup_point, wait_point,
+				      &suspend_point, &reshape_completed,
+				      &frozen);
+		/* external metadata would need to ping_monitor here */
+		sra->reshape_progress = reshape_completed;
+
+		/* Clear any backup region that is before 'here' */
+		if (increasing) {
+			if (__le64_to_cpu(bsb.length) > 0 &&
+			    reshape_completed >= (__le64_to_cpu(bsb.arraystart) +
+						  __le64_to_cpu(bsb.length)))
+				forget_backup(dests, destfd,
+					      destoffsets, 0);
+			if (__le64_to_cpu(bsb.length2) > 0 &&
+			    reshape_completed >= (__le64_to_cpu(bsb.arraystart2) +
+						  __le64_to_cpu(bsb.length2)))
+				forget_backup(dests, destfd,
+					      destoffsets, 1);
+		} else {
+			if (__le64_to_cpu(bsb.length) > 0 &&
+			    reshape_completed <= (__le64_to_cpu(bsb.arraystart)))
+				forget_backup(dests, destfd,
+					      destoffsets, 0);
+			if (__le64_to_cpu(bsb.length2) > 0 &&
+			    reshape_completed <= (__le64_to_cpu(bsb.arraystart2)))
+				forget_backup(dests, destfd,
+					      destoffsets, 1);
+		}
+		/* A pending SIGTERM overrides whatever progress_reshape said */
+		if (sigterm)
+			rv = -2;
+		if (rv < 0) {
+			/* Only -1 is reported to the caller as 'done' */
+			if (rv == -1)
+				done = 1;
+			break;
+		}
+		if (rv == 0 && increasing && !st->ss->external) {
+			/* No longer need to monitor this reshape */
+			sysfs_set_str(sra, NULL, "sync_max", "max");
+			done = 1;
+			break;
+		}
+
+		while (rv) {
+			unsigned long long offset;
+			unsigned long actual_stripes;
+			/* Need to backup some data.
+			 * If 'part' is not used and the desired
+			 * backup size is suspended, do a backup,
+			 * then consider the next part.
+			 */
+			/* Check that 'part' is unused */
+			if (part == 0 && __le64_to_cpu(bsb.length) != 0)
+				break;
+			if (part == 1 && __le64_to_cpu(bsb.length2) != 0)
+				break;
+
+			/* Convert the array position to a per-device offset */
+			offset = backup_point / data;
+			actual_stripes = stripes;
+			if (increasing) {
+				/* Clip at the end of the component device */
+				if (offset + actual_stripes * (chunk/512) >
+				    sra->component_size)
+					actual_stripes = ((sra->component_size - offset)
+							  / (chunk/512));
+				/* Region not yet fully suspended: wait */
+				if (offset + actual_stripes * (chunk/512) >
+				    suspend_point/data)
+					break;
+			} else {
+				/* Clip at the start of the component device */
+				if (offset < actual_stripes * (chunk/512))
+					actual_stripes = offset / (chunk/512);
+				offset -= actual_stripes * (chunk/512);
+				/* Region not yet fully suspended: wait */
+				if (offset < suspend_point/data)
+					break;
+			}
+			if (actual_stripes == 0)
+				break;
+			grow_backup(sra, offset, actual_stripes, fds, offsets,
+				    disks, chunk, level, layout, dests, destfd,
+				    destoffsets, part, &degraded, buf);
+			validate(afd, destfd[0], destoffsets[0]);
+			/* record where 'part' is up to */
+			part = !part;
+			if (increasing)
+				backup_point += actual_stripes * (chunk/512) * data;
+			else
+				backup_point -= actual_stripes * (chunk/512) * data;
+		}
+	}
+
+	/* FIXME maybe call progress_reshape one more time instead */
+	/* remove any remaining suspension */
+	sysfs_set_num(sra, NULL, "suspend_lo", 0x7FFFFFFFFFFFFFFFULL);
+	sysfs_set_num(sra, NULL, "suspend_hi", 0);
+	sysfs_set_num(sra, NULL, "suspend_lo", 0);
+	sysfs_set_num(sra, NULL, "sync_min", 0);
+
+	/* Put back the minimum sync speed saved before the loop */
+	if (reshape->before.data_disks == reshape->after.data_disks)
+		sysfs_set_num(sra, NULL, "sync_speed_min", speed);
+	free(buf);
+	return done;
+}
+
+/*
+ * If any spare contains md_backup_data-1 which is recent wrt mtime,
+ * write that data into the array and update the super blocks with
+ * the new reshape_progress
+ */
+int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist,
+ int cnt, char *backup_file, int verbose)
+{
+ /* Look for a backup of the reshape critical section and restore it.
+ *
+ * Candidates are examined in order: 'backup_file' first (when it is
+ * not NULL), then the devices in fdlist[] from index old_disks up to
+ * cnt-1. A candidate is used only if its backup superblock
+ * (struct mdp_backup_super) has a known magic, valid checksum(s),
+ * the array's uuid, an acceptable timestamp, covers data that
+ * info->reshape_progress has not yet passed, and is matched by the
+ * duplicate copy stored 4k before the backed-up data.
+ *
+ * st: metadata handler used to load, update and store superblocks
+ * info: array description; info->reshape_progress is updated when a
+ * backup is restored
+ * fdlist: open fds of the member devices; negative entries are skipped
+ * cnt: number of entries in fdlist to consider
+ * backup_file: path of an external backup file, or NULL
+ * verbose: non-zero to report why candidates are rejected
+ *
+ * Returns 0 if a backup was restored, or if none was found but
+ * reshape_progress shows the critical section is already passed.
+ * Returns 1 if restoring failed or a needed backup was not found.
+ *
+ * NOTE(review): the fd opened for 'backup_file' is not closed on any
+ * path visible in this function - confirm whether that is intended.
+ */
+ int i, j;
+ int old_disks;
+ unsigned long long *offsets;
+ unsigned long long nstripe, ostripe;
+ int ndata, odata;
+
+ /* Data-disk counts before (odata) and after (ndata) the reshape:
+ * one device is subtracted for parity, two for level 6.
+ */
+ odata = info->array.raid_disks - info->delta_disks - 1;
+ if (info->array.level == 6)
+ odata--; /* number of data disks */
+ ndata = info->array.raid_disks - 1;
+ if (info->new_level == 6)
+ ndata--;
+
+ old_disks = info->array.raid_disks - info->delta_disks;
+
+ if (info->delta_disks <= 0)
+ /* Didn't grow, so the backup file must have
+ * been used
+ */
+ old_disks = cnt;
+ /* When a backup file is given the loop starts one slot early, and
+ * that extra iteration (i == old_disks-1) stands for the file.
+ */
+ for (i=old_disks-(backup_file?1:0); i<cnt; i++) {
+ struct mdinfo dinfo;
+ int fd;
+ int bsbsize;
+ char *devname, namebuf[20];
+ unsigned long long lo, hi;
+
+ /* This was a spare and may have some saved data on it.
+ * Load the superblock, find and load the
+ * backup_super_block.
+ * If either fail, go on to next device.
+ * If the backup contains no new info, just return
+ * else restore data and update all superblocks
+ */
+ if (i == old_disks-1) {
+ fd = open(backup_file, O_RDONLY);
+ if (fd<0) {
+ pr_err("backup file %s inaccessible: %s\n",
+ backup_file, strerror(errno));
+ continue;
+ }
+ devname = backup_file;
+ } else {
+ fd = fdlist[i];
+ if (fd < 0)
+ continue;
+ if (st->ss->load_super(st, fd, NULL))
+ continue;
+
+ st->ss->getinfo_super(st, &dinfo, NULL);
+ st->ss->free_super(st);
+
+ /* On a device the backup superblock is read from 8
+ * sectors before the end of the data area.
+ */
+ if (lseek64(fd,
+ (dinfo.data_offset + dinfo.component_size - 8) <<9,
+ 0) < 0) {
+ pr_err("Cannot seek on device %d\n", i);
+ continue; /* Cannot seek */
+ }
+ sprintf(namebuf, "device-%d", i);
+ devname = namebuf;
+ }
+ if (read(fd, &bsb, sizeof(bsb)) != sizeof(bsb)) {
+ if (verbose)
+ pr_err("Cannot read from %s\n", devname);
+ continue; /* Cannot read */
+ }
+ /* Magic ending in '1' describes one backup region, '2' two */
+ if (memcmp(bsb.magic, "md_backup_data-1", 16) != 0 &&
+ memcmp(bsb.magic, "md_backup_data-2", 16) != 0) {
+ if (verbose)
+ pr_err("No backup metadata on %s\n", devname);
+ continue;
+ }
+ /* First checksum covers everything before sb_csum */
+ if (bsb.sb_csum != bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum)-((char*)&bsb))) {
+ if (verbose)
+ pr_err("Bad backup-metadata checksum on %s\n",
+ devname);
+ continue; /* bad checksum */
+ }
+ /* Version 2 adds a second checksum over everything before sb_csum2 */
+ if (memcmp(bsb.magic, "md_backup_data-2", 16) == 0 &&
+ bsb.sb_csum2 != bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum2)-((char*)&bsb))) {
+ if (verbose)
+ pr_err("Bad backup-metadata checksum2 on %s\n",
+ devname);
+ continue; /* Bad second checksum */
+ }
+ if (memcmp(bsb.set_uuid,info->uuid, 16) != 0) {
+ if (verbose)
+ pr_err("Wrong uuid on backup-metadata on %s\n",
+ devname);
+ continue; /* Wrong uuid */
+ }
+
+ /*
+ * array utime and backup-mtime should be updated at
+ * much the same time, but it seems that sometimes
+ * they aren't... So allow considerable flexibility in
+ * matching, and allow this test to be overridden by
+ * an environment variable.
+ * The accepted window is from 10 minutes before the backup
+ * mtime to 2 hours after it.
+ */
+ if(time_after(info->array.utime, (unsigned int)__le64_to_cpu(bsb.mtime) + 2*60*60) ||
+ time_before(info->array.utime, (unsigned int)__le64_to_cpu(bsb.mtime) - 10*60)) {
+ if (check_env("MDADM_GROW_ALLOW_OLD")) {
+ pr_err("accepting backup with timestamp %lu for array with timestamp %lu\n",
+ (unsigned long)__le64_to_cpu(bsb.mtime),
+ (unsigned long)info->array.utime);
+ } else {
+ pr_err("too-old timestamp on backup-metadata on %s\n", devname);
+ pr_err("If you think it is should be safe, try 'export MDADM_GROW_ALLOW_OLD=1'\n");
+ continue; /* time stamp is too bad */
+ }
+ }
+
+ /* Skip backups that are empty or that cover only data the
+ * reshape has already moved past. The 'nonew' label sits
+ * inside the first branch and is the shared exit for every
+ * "not needed" case below.
+ */
+ if (bsb.magic[15] == '1') {
+ if (bsb.length == 0)
+ continue;
+ if (info->delta_disks >= 0) {
+ /* reshape_progress is increasing */
+ if (__le64_to_cpu(bsb.arraystart)
+ + __le64_to_cpu(bsb.length)
+ < info->reshape_progress) {
+ nonew:
+ if (verbose)
+ pr_err("backup-metadata found on %s but is not needed\n", devname);
+ continue; /* No new data here */
+ }
+ } else {
+ /* reshape_progress is decreasing */
+ if (__le64_to_cpu(bsb.arraystart) >=
+ info->reshape_progress)
+ goto nonew; /* No new data here */
+ }
+ } else {
+ if (bsb.length == 0 && bsb.length2 == 0)
+ continue;
+ if (info->delta_disks >= 0) {
+ /* reshape_progress is increasing */
+ if ((__le64_to_cpu(bsb.arraystart)
+ + __le64_to_cpu(bsb.length)
+ < info->reshape_progress) &&
+ (__le64_to_cpu(bsb.arraystart2)
+ + __le64_to_cpu(bsb.length2)
+ < info->reshape_progress))
+ goto nonew; /* No new data here */
+ } else {
+ /* reshape_progress is decreasing */
+ if (__le64_to_cpu(bsb.arraystart) >=
+ info->reshape_progress &&
+ __le64_to_cpu(bsb.arraystart2) >=
+ info->reshape_progress)
+ goto nonew; /* No new data here */
+ }
+ }
+ /* 'second_fail' is the shared exit for every failure to locate
+ * or match the duplicate backup superblock.
+ */
+ if (lseek64(fd, __le64_to_cpu(bsb.devstart)*512, 0)< 0) {
+ second_fail:
+ if (verbose)
+ pr_err("Failed to verify secondary backup-metadata block on %s\n",
+ devname);
+ continue; /* Cannot seek */
+ }
+ /* There should be a duplicate backup superblock 4k before here */
+ if (lseek64(fd, -4096, 1) < 0 ||
+ read(fd, &bsb2, sizeof(bsb2)) != sizeof(bsb2))
+ goto second_fail; /* Cannot find leading superblock */
+ /* Compare only the fields that the given version defines */
+ if (bsb.magic[15] == '1')
+ bsbsize = offsetof(struct mdp_backup_super, pad1);
+ else
+ bsbsize = offsetof(struct mdp_backup_super, pad);
+ if (memcmp(&bsb2, &bsb, bsbsize) != 0)
+ goto second_fail; /* Cannot find leading superblock */
+
+ /* Now need the data offsets for all devices. */
+ /* NOTE(review): entries for devices skipped below are never
+ * written; whether xmalloc zeroes memory and whether
+ * restore_stripes reads those entries is not visible here.
+ */
+ offsets = xmalloc(sizeof(*offsets)*info->array.raid_disks);
+ for(j=0; j<info->array.raid_disks; j++) {
+ if (fdlist[j] < 0)
+ continue;
+ if (st->ss->load_super(st, fdlist[j], NULL))
+ /* FIXME should this be an error */
+ continue;
+ st->ss->getinfo_super(st, &dinfo, NULL);
+ st->ss->free_super(st);
+ offsets[j] = dinfo.data_offset * 512;
+ }
+ printf("%s: restoring critical section\n", Name);
+
+ /* Restore the first region; positions are converted from
+ * sectors to bytes.
+ */
+ if (restore_stripes(fdlist, offsets, info->array.raid_disks,
+ info->new_chunk, info->new_level,
+ info->new_layout, fd,
+ __le64_to_cpu(bsb.devstart)*512,
+ __le64_to_cpu(bsb.arraystart)*512,
+ __le64_to_cpu(bsb.length)*512, NULL)) {
+ /* didn't succeed, so giveup */
+ if (verbose)
+ pr_err("Error restoring backup from %s\n",
+ devname);
+ free(offsets);
+ return 1;
+ }
+
+ /* Version 2 only: the second region is stored devstart2
+ * sectors after the first one.
+ */
+ if (bsb.magic[15] == '2' &&
+ restore_stripes(fdlist, offsets, info->array.raid_disks,
+ info->new_chunk, info->new_level,
+ info->new_layout, fd,
+ __le64_to_cpu(bsb.devstart)*512 +
+ __le64_to_cpu(bsb.devstart2)*512,
+ __le64_to_cpu(bsb.arraystart2)*512,
+ __le64_to_cpu(bsb.length2)*512, NULL)) {
+ /* didn't succeed, so giveup */
+ if (verbose)
+ pr_err("Error restoring second backup from %s\n",
+ devname);
+ free(offsets);
+ return 1;
+ }
+
+ free(offsets);
+
+ /* Ok, so the data is restored. Let's update those superblocks. */
+
+ /* [lo, hi) becomes the array range covered by the backup:
+ * the first region, widened by the second one if present.
+ */
+ lo = hi = 0;
+ if (bsb.length) {
+ lo = __le64_to_cpu(bsb.arraystart);
+ hi = lo + __le64_to_cpu(bsb.length);
+ }
+ if (bsb.magic[15] == '2' && bsb.length2) {
+ unsigned long long lo1, hi1;
+ lo1 = __le64_to_cpu(bsb.arraystart2);
+ hi1 = lo1 + __le64_to_cpu(bsb.length2);
+ if (lo == hi) {
+ lo = lo1;
+ hi = hi1;
+ } else if (lo < lo1)
+ hi = hi1;
+ else
+ lo = lo1;
+ }
+ /* Move reshape_progress to the far edge of the restored data
+ * (highest end when growing, lowest start when shrinking),
+ * unless it lies outside the covered range.
+ */
+ if (lo < hi && (info->reshape_progress < lo ||
+ info->reshape_progress > hi))
+ /* backup does not affect reshape_progress*/ ;
+ else if (info->delta_disks >= 0) {
+ info->reshape_progress = __le64_to_cpu(bsb.arraystart) +
+ __le64_to_cpu(bsb.length);
+ if (bsb.magic[15] == '2') {
+ unsigned long long p2;
+
+ p2 = __le64_to_cpu(bsb.arraystart2) +
+ __le64_to_cpu(bsb.length2);
+ if (p2 > info->reshape_progress)
+ info->reshape_progress = p2;
+ }
+ } else {
+ info->reshape_progress = __le64_to_cpu(bsb.arraystart);
+ if (bsb.magic[15] == '2') {
+ unsigned long long p2;
+
+ p2 = __le64_to_cpu(bsb.arraystart2);
+ if (p2 < info->reshape_progress)
+ info->reshape_progress = p2;
+ }
+ }
+ /* Write the new reshape_progress into every readable superblock */
+ for (j=0; j<info->array.raid_disks; j++) {
+ if (fdlist[j] < 0)
+ continue;
+ if (st->ss->load_super(st, fdlist[j], NULL))
+ continue;
+ st->ss->getinfo_super(st, &dinfo, NULL);
+ dinfo.reshape_progress = info->reshape_progress;
+ st->ss->update_super(st, &dinfo, "_reshape_progress",
+ NULL,0, 0, NULL);
+ st->ss->store_super(st, fdlist[j]);
+ st->ss->free_super(st);
+ }
+ /* The first usable backup ends the search */
+ return 0;
+ }
+ /* Didn't find any backup data, try to see if any
+ * was needed.
+ */
+ if (info->delta_disks < 0) {
+ /* When shrinking, the critical section is at the end.
+ * So see if we are before the critical section.
+ */
+ unsigned long long first_block;
+ nstripe = ostripe = 0;
+ first_block = 0;
+ /* Step through old-layout stripes until the old per-device
+ * position falls behind the new-layout one.
+ */
+ while (ostripe >= nstripe) {
+ ostripe += info->array.chunk_size / 512;
+ first_block = ostripe * odata;
+ nstripe = first_block / ndata / (info->new_chunk/512) *
+ (info->new_chunk/512);
+ }
+
+ if (info->reshape_progress >= first_block)
+ return 0;
+ }
+ if (info->delta_disks > 0) {
+ /* See if we are beyond the critical section. */
+ unsigned long long last_block;
+ nstripe = ostripe = 0;
+ last_block = 0;
+ /* Step through new-layout stripes until the new per-device
+ * position falls behind the old-layout one.
+ */
+ while (nstripe >= ostripe) {
+ nstripe += info->new_chunk / 512;
+ last_block = nstripe * ndata;
+ ostripe = last_block / odata / (info->array.chunk_size/512) *
+ (info->array.chunk_size/512);
+ }
+
+ if (info->reshape_progress >= last_block)
+ return 0;
+ }
+ /* needed to recover critical section! */
+ if (verbose)
+ pr_err("Failed to find backup of critical section\n");
+ return 1;
+}
+
+/*
+ * Grow_continue_command - resume an interrupted reshape on request from
+ * the command line.
+ *
+ * devname:     name of the array or container, used in messages
+ * fd:          open fd of that device.  For a container given without a
+ *              subarray this fd is closed and replaced by an exclusive
+ *              open of the container.
+ * backup_file: backup file to hand to Grow_continue(), may be NULL
+ * verbose:     not referenced in this function
+ *
+ * For native metadata a superblock is loaded from an active member
+ * device, retrying a few times with a 3 second sleep until it reports an
+ * active reshape.  For external metadata the container is loaded, the
+ * member array under reshape is located, and mdmon is started if needed.
+ *
+ * Returns 1 on any setup failure, otherwise whatever Grow_continue()
+ * returns.
+ */
+int Grow_continue_command(char *devname, int fd,
+			  char *backup_file, int verbose)
+{
+	int ret_val = 0;
+	struct supertype *st = NULL;
+	struct mdinfo *content = NULL;
+	struct mdinfo array;
+	char *subarray = NULL;
+	struct mdinfo *cc = NULL;
+	struct mdstat_ent *mdstat = NULL;
+	int cfd = -1;
+	int fd2;
+
+	dprintf("Grow continue from command line called for %s\n", devname);
+
+	st = super_by_fd(fd, &subarray);
+	if (!st || !st->ss) {
+		pr_err("Unable to determine metadata format for %s\n", devname);
+		return 1;
+	}
+	dprintf("Grow continue is run for ");
+	if (st->ss->external == 0) {
+		int d;
+		int cnt = 5;
+		dprintf_cont("native array (%s)\n", devname);
+		if (md_get_array_info(fd, &array.array) < 0) {
+			pr_err("%s is not an active md array - aborting\n",
+			       devname);
+			ret_val = 1;
+			goto Grow_continue_command_exit;
+		}
+		content = &array;
+		sysfs_init(content, fd, NULL);
+		/* Need to load a superblock.
+		 * FIXME we should really get what we need from
+		 * sysfs
+		 */
+		do {
+			/* Use the first active member whose superblock loads */
+			for (d = 0; d < MAX_DISKS; d++) {
+				mdu_disk_info_t disk;
+				char *dv;
+				int err;
+				disk.number = d;
+				if (md_get_disk_info(fd, &disk) < 0)
+					continue;
+				if (disk.major == 0 && disk.minor == 0)
+					continue;
+				if ((disk.state & (1 << MD_DISK_ACTIVE)) == 0)
+					continue;
+				dv = map_dev(disk.major, disk.minor, 1);
+				if (!dv)
+					continue;
+				fd2 = dev_open(dv, O_RDONLY);
+				if (fd2 < 0)
+					continue;
+				err = st->ss->load_super(st, fd2, NULL);
+				close(fd2);
+				if (err)
+					continue;
+				break;
+			}
+			if (d == MAX_DISKS) {
+				pr_err("Unable to load metadata for %s\n",
+				       devname);
+				ret_val = 1;
+				goto Grow_continue_command_exit;
+			}
+			st->ss->getinfo_super(st, content, NULL);
+			/* Metadata does not show the reshape yet: wait
+			 * and look again
+			 */
+			if (!content->reshape_active)
+				sleep(3);
+			else
+				break;
+		} while (cnt-- > 0);
+	} else {
+		char *container;
+
+		if (subarray) {
+			dprintf_cont("subarray (%s)\n", subarray);
+			container = st->container_devnm;
+			cfd = open_dev_excl(st->container_devnm);
+		} else {
+			container = st->devnm;
+			close(fd);
+			cfd = open_dev_excl(st->devnm);
+			dprintf_cont("container (%s)\n", container);
+			fd = cfd;
+		}
+		if (cfd < 0) {
+			pr_err("Unable to open container for %s\n", devname);
+			ret_val = 1;
+			goto Grow_continue_command_exit;
+		}
+
+		/* find in container array under reshape
+		 */
+		ret_val = st->ss->load_container(st, cfd, NULL);
+		if (ret_val) {
+			pr_err("Cannot read superblock for %s\n", devname);
+			ret_val = 1;
+			goto Grow_continue_command_exit;
+		}
+
+		cc = st->ss->container_content(st, subarray);
+		for (content = cc; content ; content = content->next) {
+			char *array_name;
+			int allow_reshape = 1;
+
+			if (content->reshape_active == 0)
+				continue;
+			/* The decision about array or container wide
+			 * reshape is taken in Grow_continue based
+			 * content->reshape_active state, therefore we
+			 * need to check_reshape based on
+			 * reshape_active and subarray name
+			 */
+			if (content->array.state & (1<<MD_SB_BLOCK_VOLUME))
+				allow_reshape = 0;
+			if (content->reshape_active == CONTAINER_RESHAPE &&
+			    (content->array.state
+			     & (1<<MD_SB_BLOCK_CONTAINER_RESHAPE)))
+				allow_reshape = 0;
+
+			if (!allow_reshape) {
+				pr_err("cannot continue reshape of an array in container with unsupported metadata: %s(%s)\n",
+				       devname, container);
+				ret_val = 1;
+				goto Grow_continue_command_exit;
+			}
+
+			/* The subarray name is whatever follows the second
+			 * '/' of text_version.  strchr() returns NULL when
+			 * there is no such separator, so check before
+			 * stepping past it.
+			 */
+			array_name = strchr(content->text_version+1, '/');
+			if (!array_name)
+				continue;
+			array_name++;
+			mdstat = mdstat_by_subdev(array_name, container);
+			if (!mdstat)
+				continue;
+			if (mdstat->active == 0) {
+				pr_err("Skipping inactive array %s.\n",
+				       mdstat->devnm);
+				free_mdstat(mdstat);
+				mdstat = NULL;
+				continue;
+			}
+			break;
+		}
+		if (!content) {
+			pr_err("Unable to determine reshaped array for %s\n", devname);
+			ret_val = 1;
+			goto Grow_continue_command_exit;
+		}
+		fd2 = open_dev(mdstat->devnm);
+		if (fd2 < 0) {
+			pr_err("cannot open (%s)\n", mdstat->devnm);
+			ret_val = 1;
+			goto Grow_continue_command_exit;
+		}
+
+		if (sysfs_init(content, fd2, mdstat->devnm)) {
+			pr_err("Unable to initialize sysfs for %s, Grow cannot continue.\n",
+			       mdstat->devnm);
+			ret_val = 1;
+			close(fd2);
+			goto Grow_continue_command_exit;
+		}
+
+		close(fd2);
+
+		/* start mdmon in case it is not running
+		 */
+		if (!mdmon_running(container))
+			start_mdmon(container);
+		ping_monitor(container);
+
+		if (mdmon_running(container))
+			st->update_tail = &st->updates;
+		else {
+			pr_err("No mdmon found. Grow cannot continue.\n");
+			ret_val = 1;
+			goto Grow_continue_command_exit;
+		}
+	}
+
+	/* verify that array under reshape is started from
+	 * correct position
+	 */
+	if (verify_reshape_position(content, content->array.level) < 0) {
+		ret_val = 1;
+		goto Grow_continue_command_exit;
+	}
+
+	/* continue reshape
+	 */
+	ret_val = Grow_continue(fd, st, content, backup_file, 1, 0);
+
+Grow_continue_command_exit:
+	/* Common cleanup for the success path and every failure path */
+	if (cfd > -1)
+		close(cfd);
+	st->ss->free_super(st);
+	free_mdstat(mdstat);
+	sysfs_free(cc);
+	free(subarray);
+
+	return ret_val;
+}
+
+/*
+ * Grow_continue - resume a reshape that the metadata reports as active.
+ *
+ * Returns 2 without doing anything when info->reshape_active is not set,
+ * and 1 when the container of an externally-managed array cannot be
+ * opened.  Otherwise returns the result of reshape_array() (native
+ * metadata) or reshape_container() (external metadata).
+ */
+int Grow_continue(int mdfd, struct supertype *st, struct mdinfo *info,
+		  char *backup_file, int forked, int freeze_reshape)
+{
+	int container_fd;
+
+	/* Nothing to resume */
+	if (!info->reshape_active)
+		return 2;
+
+	/* Native metadata: restart the reshape of this array directly */
+	if (!st->ss->external)
+		return reshape_array(NULL, mdfd, "array", st, info, 1,
+				     NULL, INVALID_SECTORS, backup_file,
+				     0, forked, 1 | info->reshape_active,
+				     freeze_reshape);
+
+	/* External metadata: reload the container, then go through it */
+	container_fd = open_dev(st->container_devnm);
+	if (container_fd < 0)
+		return 1;
+
+	st->ss->load_container(st, container_fd, st->container_devnm);
+	close(container_fd);
+
+	return reshape_container(st->container_devnm, NULL, mdfd,
+				 st, info, 0, backup_file, 0,
+				 forked, 1 | info->reshape_active,
+				 freeze_reshape);
+}
+
+/*
+ * make_backup - build the path "<MAP_DIR>/backup_file-<name>".
+ *
+ * Returns a buffer obtained from xmalloc(); the caller owns it and is
+ * expected to free() it.
+ */
+char *make_backup(char *name)
+{
+	static const char prefix[] = "backup_file-";
+	/* directory + '/' + prefix + name + terminating NUL */
+	size_t need = strlen(MAP_DIR) + 1 + strlen(prefix) + strlen(name) + 1;
+	char *path = xmalloc(need);
+
+	snprintf(path, need, "%s/%s%s", MAP_DIR, prefix, name);
+	return path;
+}
+
+/*
+ * locate_backup - return the backup-file path for 'name' if it exists.
+ *
+ * The path comes from make_backup().  It is returned (caller frees) only
+ * when stat() succeeds and reports a regular file; otherwise the path is
+ * freed here and NULL is returned.
+ */
+char *locate_backup(char *name)
+{
+	struct stat sbuf;
+	char *path = make_backup(name);
+
+	if (stat(path, &sbuf) != 0 || !S_ISREG(sbuf.st_mode)) {
+		free(path);
+		return NULL;
+	}
+
+	return path;
+}
diff --git a/INSTALL b/INSTALL
new file mode 100644
index 0000000..f7bcc3e
--- /dev/null
+++ b/INSTALL
@@ -0,0 +1,13 @@
+
+To build mdadm, simply run:
+
+ make
+
+to install, run
+
+ make install
+
+as root.
+
+
+No configuration is necessary.
diff --git a/Incremental.c b/Incremental.c
new file mode 100644
index 0000000..a57fc32
--- /dev/null
+++ b/Incremental.c
@@ -0,0 +1,1764 @@
+/*
+ * Incremental.c - support --incremental. Part of:
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2006-2013 Neil Brown <neilb@suse.de>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email: <neilb@suse.de>
+ * Paper: Neil Brown
+ * Novell Inc
+ * GPO Box Q1283
+ * QVB Post Office, NSW 1230
+ * Australia
+ */
+
+#include "mdadm.h"
+#include <sys/wait.h>
+#include <dirent.h>
+#include <ctype.h>
+
+static int count_active(struct supertype *st, struct mdinfo *sra,
+ int mdfd, char **availp,
+ struct mdinfo *info);
+static void find_reject(int mdfd, struct supertype *st, struct mdinfo *sra,
+ int number, __u64 events, int verbose,
+ char *array_name);
+static int try_spare(char *devname, int *dfdp, struct dev_policy *pol,
+ struct map_ent *target,
+ struct supertype *st, int verbose);
+
+static int Incremental_container(struct supertype *st, char *devname,
+ struct context *c, char *only);
+
+int Incremental(struct mddev_dev *devlist, struct context *c,
+ struct supertype *st)
+{
+ /* Add this device to an array, creating the array if necessary
+ * and starting the array if sensible or - if runstop>0 - if possible.
+ *
+ * This has several steps:
+ *
+ * 1/ Check if device is permitted by mdadm.conf, reject if not.
+ * 2/ Find metadata, reject if none appropriate (check
+ * version/name from args)
+ * 3/ Check if there is a match in mdadm.conf
+ * 3a/ if not, check for homehost match. If no match, assemble as
+ * a 'foreign' array.
+ * 4/ Determine device number.
+ * - If in mdadm.conf with std name, use that
+ * - UUID in /var/run/mdadm.map use that
+ * - If name is suggestive, use that. unless in use with different uuid.
+ * - Choose a free, high number.
+ * - Use a partitioned device unless strong suggestion not to.
+ * e.g. auto=md
+ * Don't choose partitioned for containers.
+ * 5/ Find out if array already exists
+ * 5a/ if it does not
+ * - choose a name, from mdadm.conf or 'name' field in array.
+ * - create the array
+ * - add the device
+ * 5b/ if it does
+ * - check one drive in array to make sure metadata is a reasonably
+ * close match. Reject if not (e.g. different type)
+ * - add the device
+ * 6/ Make sure /var/run/mdadm.map contains this array.
+ * 7/ Is there enough devices to possibly start the array?
+ * For a container, this means running Incremental_container.
+ * 7a/ if not, finish with success.
+ * 7b/ if yes,
+ * - read all metadata and arrange devices like -A does
+ * - if number of OK devices match expected, or -R and there are enough,
+ * start the array (auto-readonly).
+ */
+ dev_t rdev, rdev2;
+ struct mdinfo info, dinfo;
+ struct mdinfo *sra = NULL, *d;
+ struct mddev_ident *match;
+ char chosen_name[1024];
+ char *md_devname;
+ int rv = 1;
+ struct map_ent *mp, *map = NULL;
+ int dfd = -1, mdfd = -1;
+ char *avail = NULL;
+ int active_disks;
+ int trustworthy;
+ char *name_to_use;
+ struct dev_policy *policy = NULL;
+ struct map_ent target_array;
+ int have_target;
+ char *devname = devlist->devname;
+ int journal_device_missing = 0;
+
+ struct createinfo *ci = conf_get_create_info();
+
+ if (!stat_is_blkdev(devname, &rdev))
+ return rv;
+ dfd = dev_open(devname, O_RDONLY);
+ if (dfd < 0) {
+ if (c->verbose >= 0)
+ pr_err("cannot open %s: %s.\n",
+ devname, strerror(errno));
+ return rv;
+ }
+ /* If the device is a container, we do something very different */
+ if (must_be_container(dfd)) {
+ if (!st)
+ st = super_by_fd(dfd, NULL);
+ if (st && st->ss->load_container)
+ rv = st->ss->load_container(st, dfd, NULL);
+
+ close(dfd);
+ if (!rv && st->ss->container_content) {
+ if (map_lock(&map))
+ pr_err("failed to get exclusive lock on mapfile\n");
+ if (c->export)
+ printf("MD_DEVNAME=%s\n", devname);
+ rv = Incremental_container(st, devname, c, NULL);
+ map_unlock(&map);
+ return rv;
+ }
+
+ pr_err("%s is not part of an md array.\n",
+ devname);
+ return rv;
+ }
+
+ /* 1/ Check if device is permitted by mdadm.conf */
+
+ for (;devlist; devlist = devlist->next)
+ if (conf_test_dev(devlist->devname))
+ break;
+ if (!devlist) {
+ devlist = conf_get_devs();
+ for (;devlist; devlist = devlist->next) {
+ if (stat_is_blkdev(devlist->devname, &rdev2) &&
+ rdev2 == rdev)
+ break;
+ }
+ }
+ if (!devlist) {
+ if (c->verbose >= 0)
+ pr_err("%s not permitted by mdadm.conf.\n",
+ devname);
+ goto out;
+ }
+
+ /* 2/ Find metadata, reject if none appropriate (check
+ * version/name from args) */
+
+ if (!fstat_is_blkdev(dfd, devname, &rdev))
+ goto out;
+
+ dinfo.disk.major = major(rdev);
+ dinfo.disk.minor = minor(rdev);
+
+ policy = disk_policy(&dinfo);
+ have_target = policy_check_path(&dinfo, &target_array);
+
+ if (st == NULL && (st = guess_super_type(dfd, guess_array)) == NULL) {
+ if (c->verbose >= 0)
+ pr_err("no recognisable superblock on %s.\n",
+ devname);
+ rv = try_spare(devname, &dfd, policy,
+ have_target ? &target_array : NULL,
+ NULL, c->verbose);
+ goto out;
+ }
+ st->ignore_hw_compat = 0;
+
+ if (st->ss->compare_super == NULL ||
+ st->ss->load_super(st, dfd, c->verbose >= 0 ? devname : NULL)) {
+ if (c->verbose >= 0)
+ pr_err("no RAID superblock on %s.\n",
+ devname);
+ rv = try_spare(devname, &dfd, policy,
+ have_target ? &target_array : NULL,
+ st, c->verbose);
+ free(st);
+ goto out;
+ }
+ close (dfd); dfd = -1;
+
+ st->ss->getinfo_super(st, &info, NULL);
+
+ /* 3/ Check if there is a match in mdadm.conf */
+ match = conf_match(st, &info, devname, c->verbose, &rv);
+ if (!match && rv == 2)
+ goto out;
+
+ if (match && match->devname &&
+ strcasecmp(match->devname, "<ignore>") == 0) {
+ if (c->verbose >= 0)
+ pr_err("array containing %s is explicitly ignored by mdadm.conf\n",
+ devname);
+ goto out;
+ }
+
+ /* 3a/ if not, check for homehost match. If no match, continue
+ * but don't trust the 'name' in the array. Thus a 'random' minor
+ * number will be assigned, and the device name will be based
+ * on that. */
+ if (match)
+ trustworthy = LOCAL;
+ else if (st->ss->match_home(st, c->homehost) == 1)
+ trustworthy = LOCAL;
+ else if (st->ss->match_home(st, "any") == 1)
+ trustworthy = LOCAL_ANY;
+ else
+ trustworthy = FOREIGN;
+
+ if (!match && !conf_test_metadata(st->ss->name, policy,
+ (trustworthy == LOCAL))) {
+ if (c->verbose >= 1)
+ pr_err("%s has metadata type %s for which auto-assembly is disabled\n",
+ devname, st->ss->name);
+ goto out;
+ }
+ if (trustworthy == LOCAL_ANY)
+ trustworthy = LOCAL;
+
+ /* There are three possible sources for 'autof': command line,
+ * ARRAY line in mdadm.conf, or CREATE line in mdadm.conf.
+ * ARRAY takes precedence, then command line, then
+ * CREATE.
+ */
+ if (match && match->autof)
+ c->autof = match->autof;
+ if (c->autof == 0)
+ c->autof = ci->autof;
+
+ name_to_use = info.name;
+ if (name_to_use[0] == 0 && info.array.level == LEVEL_CONTAINER) {
+ name_to_use = info.text_version;
+ trustworthy = METADATA;
+ }
+ if (name_to_use[0] && trustworthy != LOCAL &&
+ ! c->require_homehost &&
+ conf_name_is_free(name_to_use))
+ trustworthy = LOCAL;
+
+ /* strip "hostname:" prefix from name if we have decided
+ * to treat it as LOCAL
+ */
+ if (trustworthy == LOCAL && strchr(name_to_use, ':') != NULL)
+ name_to_use = strchr(name_to_use, ':')+1;
+
+ /* 4/ Check if array exists.
+ */
+ if (map_lock(&map))
+ pr_err("failed to get exclusive lock on mapfile\n");
+ /* Now check we can get O_EXCL. If not, probably "mdadm -A" has
+ * taken over
+ */
+ dfd = dev_open(devname, O_RDONLY|O_EXCL);
+ if (dfd < 0) {
+ if (c->verbose >= 0)
+ pr_err("cannot reopen %s: %s.\n",
+ devname, strerror(errno));
+ goto out_unlock;
+ }
+ /* Cannot hold it open while we add the device to the array,
+ * so we must release the O_EXCL and depend on the map_lock()
+ * So now is the best time to remove any partitions.
+ */
+ remove_partitions(dfd);
+ close(dfd);
+ dfd = -1;
+
+ mp = map_by_uuid(&map, info.uuid);
+ if (mp)
+ mdfd = open_dev(mp->devnm);
+ else
+ mdfd = -1;
+
+ if (mdfd < 0) {
+
+ /* Skip the clustered ones. This should be started by
+ * clustering resource agents
+ */
+ if (info.array.state & (1 << MD_SB_CLUSTERED))
+ goto out;
+
+ /* Couldn't find an existing array, maybe make a new one */
+ mdfd = create_mddev(match ? match->devname : NULL,
+ name_to_use, c->autof, trustworthy, chosen_name, 0);
+
+ if (mdfd < 0)
+ goto out_unlock;
+
+ if (sysfs_init(&info, mdfd, NULL)) {
+ pr_err("unable to initialize sysfs for %s\n",
+ chosen_name);
+ rv = 2;
+ goto out_unlock;
+ }
+
+ if (set_array_info(mdfd, st, &info) != 0) {
+ pr_err("failed to set array info for %s: %s\n",
+ chosen_name, strerror(errno));
+ rv = 2;
+ goto out_unlock;
+ }
+
+ dinfo = info;
+ dinfo.disk.major = major(rdev);
+ dinfo.disk.minor = minor(rdev);
+ if (add_disk(mdfd, st, &info, &dinfo) != 0) {
+ pr_err("failed to add %s to new array %s: %s.\n",
+ devname, chosen_name, strerror(errno));
+ ioctl(mdfd, STOP_ARRAY, 0);
+ rv = 2;
+ goto out_unlock;
+ }
+ sra = sysfs_read(mdfd, NULL, (GET_DEVS | GET_STATE |
+ GET_OFFSET | GET_SIZE));
+
+ if (!sra || !sra->devs || sra->devs->disk.raid_disk >= 0) {
+ /* It really should be 'none' - must be old buggy
+ * kernel, and mdadm -I may not be able to complete.
+ * So reject it.
+ */
+ ioctl(mdfd, STOP_ARRAY, NULL);
+ pr_err("You have an old buggy kernel which cannot support\n --incremental reliably. Aborting.\n");
+ rv = 2;
+ goto out_unlock;
+ }
+ info.array.working_disks = 1;
+ /* 6/ Make sure /var/run/mdadm.map contains this array. */
+ map_update(&map, fd2devnm(mdfd),
+ info.text_version,
+ info.uuid, chosen_name);
+ } else {
+ /* 5b/ if it does */
+ /* - check one drive in array to make sure metadata is a reasonably */
+ /* close match. Reject if not (e.g. different type) */
+ /* - add the device */
+ char dn[20];
+ int dfd2;
+ int err;
+ struct supertype *st2;
+ struct mdinfo info2, *d;
+
+ sra = sysfs_read(mdfd, NULL, (GET_DEVS | GET_STATE |
+ GET_OFFSET | GET_SIZE));
+
+ if (mp->path)
+ strcpy(chosen_name, mp->path);
+ else
+ strcpy(chosen_name, mp->devnm);
+
+ /* It is generally not OK to add non-spare drives to a
+ * running array as they are probably missing because
+ * they failed. However if runstop is 1, then the
+ * array was possibly started early and our best bet is
+ * to add this anyway.
+ * Also if action policy is re-add or better we allow
+ * re-add.
+ * This doesn't apply to containers as the 'non-spare'
+ * flag has a different meaning. The test has to happen
+ * at the device level there
+ */
+ if (!st->ss->external &&
+ (info.disk.state & (1 << MD_DISK_SYNC)) != 0 &&
+ !policy_action_allows(policy, st->ss->name, act_re_add) &&
+ c->runstop < 1) {
+ if (md_array_active(mdfd)) {
+ pr_err("not adding %s to active array (without --run) %s\n",
+ devname, chosen_name);
+ rv = 2;
+ goto out_unlock;
+ }
+ }
+ if (!sra) {
+ rv = 2;
+ goto out_unlock;
+ }
+ if (sra->devs) {
+ sprintf(dn, "%d:%d", sra->devs->disk.major,
+ sra->devs->disk.minor);
+ dfd2 = dev_open(dn, O_RDONLY);
+ if (dfd2 < 0) {
+ pr_err("unable to open %s\n", devname);
+ rv = 2;
+ goto out_unlock;
+ }
+ st2 = dup_super(st);
+ if (st2->ss->load_super(st2, dfd2, NULL) ||
+ st->ss->compare_super(st, st2, 1) != 0) {
+ pr_err("metadata mismatch between %s and chosen array %s\n",
+ devname, chosen_name);
+ close(dfd2);
+ rv = 2;
+ goto out_unlock;
+ }
+ close(dfd2);
+ st2->ss->getinfo_super(st2, &info2, NULL);
+ st2->ss->free_super(st2);
+ if (info.array.level != info2.array.level ||
+ memcmp(info.uuid, info2.uuid, 16) != 0 ||
+ info.array.raid_disks != info2.array.raid_disks) {
+ pr_err("unexpected difference between %s and %s.\n",
+ chosen_name, devname);
+ rv = 2;
+ goto out_unlock;
+ }
+ }
+ info.disk.major = major(rdev);
+ info.disk.minor = minor(rdev);
+ /* add disk needs to know about containers */
+ if (st->ss->external)
+ sra->array.level = LEVEL_CONTAINER;
+
+ if (info.array.state & (1 << MD_SB_CLUSTERED))
+ info.disk.state |= (1 << MD_DISK_CLUSTER_ADD);
+
+ err = add_disk(mdfd, st, sra, &info);
+ if (err < 0 && errno == EBUSY) {
+ /* could be another device present with the same
+ * disk.number. Find and reject any such
+ */
+ find_reject(mdfd, st, sra, info.disk.number,
+ info.events, c->verbose, chosen_name);
+ err = add_disk(mdfd, st, sra, &info);
+ }
+ if (err < 0 && errno == EINVAL &&
+ info.disk.state & (1<<MD_DISK_SYNC)) {
+ /* Maybe it needs to be added as a spare */
+ if (policy_action_allows(policy, st->ss->name,
+ act_force_spare)) {
+ info.disk.state &= ~(1<<MD_DISK_SYNC);
+ err = add_disk(mdfd, st, sra, &info);
+ } else
+ if (c->verbose >= 0)
+ pr_err("can only add %s to %s as a spare, and force-spare is not set.\n",
+ devname, chosen_name);
+ }
+ if (err < 0) {
+ pr_err("failed to add %s to existing array %s: %s.\n",
+ devname, chosen_name, strerror(errno));
+ rv = 2;
+ goto out_unlock;
+ }
+ info.array.working_disks = 0;
+ for (d = sra->devs; d; d=d->next)
+ info.array.working_disks ++;
+
+ }
+ if (strncmp(chosen_name, "/dev/md/", 8) == 0)
+ md_devname = chosen_name+8;
+ else
+ md_devname = chosen_name;
+ if (c->export) {
+ printf("MD_DEVICE=%s\n", fd2devnm(mdfd));
+ printf("MD_DEVNAME=%s\n", md_devname);
+ printf("MD_FOREIGN=%s\n", trustworthy == FOREIGN ? "yes" : "no");
+ }
+
+ /* 7/ Is there enough devices to possibly start the array? */
+ /* 7a/ if not, finish with success. */
+ if (info.array.level == LEVEL_CONTAINER) {
+ char devnm[32];
+ /* Try to assemble within the container */
+ sysfs_uevent(sra, "change");
+ if (!c->export && c->verbose >= 0)
+ pr_err("container %s now has %d device%s\n",
+ chosen_name, info.array.working_disks,
+ info.array.working_disks == 1?"":"s");
+ sysfs_rules_apply(chosen_name, &info);
+ wait_for(chosen_name, mdfd);
+ if (st->ss->external)
+ strcpy(devnm, fd2devnm(mdfd));
+ if (st->ss->load_container)
+ rv = st->ss->load_container(st, mdfd, NULL);
+ close(mdfd);
+ sysfs_free(sra);
+ if (!rv)
+ rv = Incremental_container(st, chosen_name, c, NULL);
+ map_unlock(&map);
+ /* after spare is added, ping monitor for external metadata
+ * so that it can eg. try to rebuild degraded array */
+ if (st->ss->external)
+ ping_monitor(devnm);
+ return rv;
+ }
+
+ /* We have added something to the array, so need to re-read the
+ * state. Eventually this state should be kept up-to-date as
+ * things change.
+ */
+ sysfs_free(sra);
+ sra = sysfs_read(mdfd, NULL, (GET_DEVS | GET_STATE |
+ GET_OFFSET | GET_SIZE));
+ active_disks = count_active(st, sra, mdfd, &avail, &info);
+
+ journal_device_missing = (info.journal_device_required) && (info.journal_clean == 0);
+
+ if (info.consistency_policy == CONSISTENCY_POLICY_PPL)
+ info.array.state |= 1;
+
+ if (enough(info.array.level, info.array.raid_disks,
+ info.array.layout, info.array.state & 1, avail) == 0) {
+ if (c->export) {
+ printf("MD_STARTED=no\n");
+ } else if (c->verbose >= 0)
+ pr_err("%s attached to %s, not enough to start (%d).\n",
+ devname, chosen_name, active_disks);
+ rv = 0;
+ goto out_unlock;
+ }
+
+ /* 7b/ if yes, */
+ /* - if number of OK devices match expected, or -R and there */
+ /* are enough, */
+ /* + add any bitmap file */
+ /* + start the array (auto-readonly). */
+
+ if (md_array_active(mdfd)) {
+ if (c->export) {
+ printf("MD_STARTED=already\n");
+ } else if (c->verbose >= 0)
+ pr_err("%s attached to %s which is already active.\n",
+ devname, chosen_name);
+ rv = 0;
+ goto out_unlock;
+ }
+
+ map_unlock(&map);
+ if (c->runstop > 0 || (!journal_device_missing && active_disks >= info.array.working_disks)) {
+ struct mdinfo *dsk;
+ /* Let's try to start it */
+
+ if (journal_device_missing)
+ pr_err("Trying to run with missing journal device\n");
+ if (info.reshape_active && !(info.reshape_active & RESHAPE_NO_BACKUP)) {
+ pr_err("%s: This array is being reshaped and cannot be started\n",
+ chosen_name);
+ cont_err("by --incremental. Please use --assemble\n");
+ goto out;
+ }
+ if (match && match->bitmap_file) {
+ int bmfd = open(match->bitmap_file, O_RDWR);
+ if (bmfd < 0) {
+ pr_err("Could not open bitmap file %s.\n",
+ match->bitmap_file);
+ goto out;
+ }
+ if (ioctl(mdfd, SET_BITMAP_FILE, bmfd) != 0) {
+ close(bmfd);
+ pr_err("Failed to set bitmapfile for %s.\n",
+ chosen_name);
+ goto out;
+ }
+ close(bmfd);
+ }
+ /* Need to remove from the array any devices which
+ * 'count_active' discerned were too old or inappropriate
+ */
+ for (d = sra ? sra->devs : NULL ; d ; d = d->next)
+ if (d->disk.state & (1<<MD_DISK_REMOVED))
+ remove_disk(mdfd, st, sra, d);
+
+ if ((sra == NULL || active_disks >= info.array.working_disks) &&
+ trustworthy != FOREIGN)
+ rv = ioctl(mdfd, RUN_ARRAY, NULL);
+ else
+ rv = sysfs_set_str(sra, NULL,
+ "array_state", "read-auto");
+ /* Array might be O_EXCL which will interfere with
+ * fsck and mount. So re-open without O_EXCL.
+ */
+ reopen_mddev(mdfd);
+ if (rv == 0) {
+ if (c->export) {
+ printf("MD_STARTED=yes\n");
+ } else if (c->verbose >= 0)
+ pr_err("%s attached to %s, which has been started.\n",
+ devname, chosen_name);
+ rv = 0;
+ wait_for(chosen_name, mdfd);
+ /* We just started the array, so some devices
+ * might have been evicted from the array
+ * because their event counts were too old.
+ * If the action=re-add policy is in-force for
+ * those devices we should re-add them now.
+ */
+ for (dsk = sra->devs; dsk ; dsk = dsk->next) {
+ if (disk_action_allows(dsk, st->ss->name,
+ act_re_add) &&
+ add_disk(mdfd, st, sra, dsk) == 0)
+ pr_err("%s re-added to %s\n",
+ dsk->sys_name, chosen_name);
+ }
+ } else {
+ pr_err("%s attached to %s, but failed to start: %s.\n",
+ devname, chosen_name, strerror(errno));
+ rv = 1;
+ }
+ } else {
+ if (c->export) {
+ printf("MD_STARTED=unsafe\n");
+ } else if (journal_device_missing) {
+ pr_err("Journal device is missing, not safe to start yet.\n");
+ } else if (c->verbose >= 0)
+ pr_err("%s attached to %s, not enough to start safely.\n",
+ devname, chosen_name);
+ rv = 0;
+ }
+out:
+ free(avail);
+ if (dfd >= 0)
+ close(dfd);
+ if (mdfd >= 0)
+ close(mdfd);
+ if (policy)
+ dev_policy_free(policy);
+ sysfs_free(sra);
+ return rv;
+out_unlock:
+ map_unlock(&map);
+ goto out;
+}
+
+static void find_reject(int mdfd, struct supertype *st, struct mdinfo *sra,
+			int number, __u64 events, int verbose,
+			char *array_name)
+{
+	/* Walk the members listed in 'sra' and evict every one whose
+	 * superblock reports disk.number == 'number' together with an
+	 * event count lower than 'events'.
+	 * Nothing is touched when the array is already active.
+	 * Note that 'st' is used to load (and then free) each member's
+	 * superblock in turn.
+	 */
+	struct mdinfo *member;
+
+	if (md_array_active(mdfd))
+		/* not safe to remove from active arrays without thinking more */
+		return;
+
+	for (member = sra->devs; member; member = member->next) {
+		char devid[24]; /* "major:minor": 2*11 for the ints, colon, NUL */
+		struct mdinfo sb;
+		int fd;
+		int load_failed;
+
+		sprintf(devid, "%d:%d", member->disk.major, member->disk.minor);
+		fd = dev_open(devid, O_RDONLY);
+		if (fd < 0)
+			continue;
+
+		load_failed = st->ss->load_super(st, fd, NULL);
+		if (!load_failed) {
+			st->ss->getinfo_super(st, &sb, NULL);
+			st->ss->free_super(st);
+		}
+		close(fd);
+		if (load_failed)
+			continue;
+
+		if (sb.disk.number == number && sb.events < events) {
+			/* Older device claiming the same disk.number:
+			 * take it out of its slot first, then remove it.
+			 */
+			if (member->disk.raid_disk >= 0)
+				sysfs_set_str(sra, member, "slot", "none");
+			if (sysfs_set_str(sra, member, "state", "remove") == 0 &&
+			    verbose >= 0)
+				pr_err("removing old device %s from %s\n",
+				       member->sys_name+4, array_name);
+		}
+	}
+}
+
+static int count_active(struct supertype *st, struct mdinfo *sra,
+			int mdfd, char **availp,
+			struct mdinfo *bestinfo)
+{
+	/* count how many devices in sra think they are active
+	 *
+	 * Every member in sra->devs is opened by "major:minor" and its
+	 * superblock is loaded through 'st'; members that cannot be opened
+	 * or loaded are skipped.
+	 *
+	 * On return:
+	 *  - *availp points at a freshly allocated array of raid_disks
+	 *    bytes (only if at least one superblock was loaded).  An entry
+	 *    is 2 when the slot is held by a device with the newest event
+	 *    count, 1 when the holder is one event behind, 0 otherwise.
+	 *    This function never frees it.
+	 *  - 'bestinfo' holds the info of the device with the newest event
+	 *    count, and bestinfo->journal_clean is set when a journal
+	 *    device is at most one event behind.
+	 *  - members that consider the best device failed, and non-sync
+	 *    members whose d->events exceed the newest event count, get
+	 *    MD_DISK_REMOVED set in their sra entry.
+	 *
+	 * Returns the number of usable slots plus the number of replacement
+	 * devices seen, or 0 when 'sra' is NULL or nothing could be loaded.
+	 * 'mdfd' is not used here.
+	 */
+	struct mdinfo *d;
+	int cnt = 0;
+	int replcnt = 0;
+	__u64 max_events = 0;
+	__u64 max_journal_events = 0;
+	char *avail = NULL;
+	int *best = NULL;	/* best[slot]: index in sra->devs of the chosen device */
+	char *devmap = NULL;	/* one row of raid_disks bytes per member of sra->devs */
+	int numdevs = 0;
+	int devnum;
+	int b, i;
+	int raid_disks = 0;
+
+	if (!sra)
+		return 0;
+
+	for (d = sra->devs ; d ; d = d->next)
+		numdevs++;
+	for (d = sra->devs, devnum = 0 ; d ; d = d->next, devnum++) {
+		char dn[30];
+		int dfd;
+		int ok;
+		struct mdinfo info;
+
+		sprintf(dn, "%d:%d", d->disk.major, d->disk.minor);
+		dfd = dev_open(dn, O_RDONLY);
+		if (dfd < 0)
+			continue;
+		ok = st->ss->load_super(st, dfd, NULL);
+		close(dfd);
+		if (ok != 0)
+			continue;
+
+		info.array.raid_disks = raid_disks;
+		/* devmap does not exist until the first superblock has told
+		 * us raid_disks; pass NULL rather than offsetting NULL.
+		 */
+		st->ss->getinfo_super(st, &info,
+				      devmap ? devmap + raid_disks * devnum : NULL);
+		if (info.disk.raid_disk == MD_DISK_ROLE_JOURNAL &&
+		    info.events > max_journal_events)
+			max_journal_events = info.events;
+		if (!avail) {
+			raid_disks = info.array.raid_disks;
+			avail = xcalloc(raid_disks, 1);
+			*availp = avail;
+
+			best = xcalloc(raid_disks, sizeof(int));
+			devmap = xcalloc(raid_disks, numdevs);
+
+			/* Repeat the query now that the map exists.  The row
+			 * must be the one belonging to *this* device: it is
+			 * not necessarily device 0, because earlier members
+			 * may have been skipped above.
+			 */
+			st->ss->getinfo_super(st, &info,
+					      devmap + raid_disks * devnum);
+		}
+
+		if (info.disk.state & (1<<MD_DISK_SYNC))
+		{
+			if (cnt == 0) {
+				cnt++;
+				max_events = info.events;
+				avail[info.disk.raid_disk] = 2;
+				best[info.disk.raid_disk] = devnum;
+				st->ss->getinfo_super(st, bestinfo, NULL);
+			} else if (info.events == max_events) {
+				avail[info.disk.raid_disk] = 2;
+				best[info.disk.raid_disk] = devnum;
+			} else if (info.events == max_events-1) {
+				if (avail[info.disk.raid_disk] == 0) {
+					avail[info.disk.raid_disk] = 1;
+					best[info.disk.raid_disk] = devnum;
+				}
+			} else if (info.events < max_events - 1)
+				;
+			else if (info.events == max_events+1) {
+				/* New maximum, one ahead: everybody seen so
+				 * far drops one freshness level.
+				 */
+				max_events = info.events;
+				for (i = 0; i < raid_disks; i++)
+					if (avail[i])
+						avail[i]--;
+				avail[info.disk.raid_disk] = 2;
+				best[info.disk.raid_disk] = devnum;
+				st->ss->getinfo_super(st, bestinfo, NULL);
+			} else { /* info.events much bigger */
+				memset(avail, 0, raid_disks);
+				max_events = info.events;
+				avail[info.disk.raid_disk] = 2;
+				best[info.disk.raid_disk] = devnum;
+				st->ss->getinfo_super(st, bestinfo, NULL);
+			}
+		} else if (info.disk.state & (1<<MD_DISK_REPLACEMENT))
+			replcnt++;
+		st->ss->free_super(st);
+	}
+	if (max_journal_events >= max_events - 1)
+		bestinfo->journal_clean = 1;
+
+	if (!avail)
+		return 0;
+	/* We need to reject any device that thinks the best device is
+	 * failed or missing */
+	for (b = 0; b < raid_disks; b++)
+		if (avail[b] == 2)
+			break;
+	cnt = 0;
+	for (i = 0 ; i < raid_disks ; i++) {
+		if (i != b && avail[i])
+			if (devmap[raid_disks * best[i] + b] == 0) {
+				/* This device thinks 'b' is failed -
+				 * don't use it */
+				devnum = best[i];
+				for (d=sra->devs ; devnum; d = d->next)
+					devnum--;
+				d->disk.state |= (1 << MD_DISK_REMOVED);
+				avail[i] = 0;
+			}
+		if (avail[i])
+			cnt++;
+	}
+	/* Also need to reject any spare device with an event count that
+	 * is too high
+	 */
+	for (d = sra->devs; d; d = d->next) {
+		if (!(d->disk.state & (1<<MD_DISK_SYNC)) &&
+		    d->events > max_events)
+			d->disk.state |= (1 << MD_DISK_REMOVED);
+	}
+	free(best);
+	free(devmap);
+	return cnt + replcnt;
+}
+
+/* Report how degraded the worst member array of a container is.
+ * Every entry of 'map' whose metadata names container 'me' is looked up in
+ * sysfs; the result is the largest value of
+ * raid_disks - active_disks - spare_disks found, never less than 0.
+ */
+static int
+container_members_max_degradation(struct map_ent *map, struct map_ent *me)
+{
+	struct map_ent *ent;
+	int worst = 0;
+
+	for (ent = map; ent; ent = ent->next) {
+		struct mdinfo *member;
+		int missing;
+
+		if (!metadata_container_matches(ent->metadata, me->devnm))
+			continue;
+
+		/* most accurate information regarding array degradation */
+		member = sysfs_read(-1, ent->devnm,
+				    GET_DISKS | GET_DEVS | GET_STATE);
+		if (member == NULL)
+			continue;
+
+		missing = member->array.raid_disks -
+			  member->array.active_disks -
+			  member->array.spare_disks;
+		sysfs_free(member);
+
+		if (worst < missing)
+			worst = missing;
+	}
+
+	return worst;
+}
+
+static int array_try_spare(char *devname, int *dfdp, struct dev_policy *pol,
+			   struct map_ent *target, int bare,
+			   struct supertype *st, int verbose)
+{
+	/* This device doesn't have any md metadata
+	 * The device policy allows 'spare' and if !bare, it allows spare-same-slot.
+	 * If 'st' is not set, then we only know that some metadata allows this,
+	 * others possibly don't.
+	 * So look for a container or array to attach the device to.
+	 * Prefer 'target' if that is set and the array is found.
+	 *
+	 * If st is set, then only arrays of that type are considered
+	 * Return 0 on success, or some exit code on failure, probably 1.
+	 *
+	 * When an array is chosen the descriptor in *dfdp is closed and
+	 * *dfdp is set to -1 before the device is handed to Manage_subdevs().
+	 */
+	int rv = 1;
+	dev_t rdev;
+	struct map_ent *mp, *map = NULL;
+	struct mdinfo *chosen = NULL;
+	int dfd = *dfdp;
+
+	if (!fstat_is_blkdev(dfd, devname, &rdev))
+		return 1;
+
+	/*
+	 * Now we need to find a suitable array to add this to.
+	 * We only accept arrays that:
+	 *  - match 'st'
+	 *  - are in the same domains as the device
+	 *  - are of a size for which the device will be useful
+	 * and we choose the one that is the most degraded
+	 */
+
+	if (map_lock(&map)) {
+		pr_err("failed to get exclusive lock on mapfile\n");
+		return 1;
+	}
+	for (mp = map ; mp ; mp = mp->next) {
+		struct supertype *st2;
+		struct domainlist *dl = NULL;
+		struct mdinfo *sra;
+		unsigned long long devsize = 0, freesize = 0;
+		struct spare_criteria sc = {0, 0};
+		/* map entries may lack a path; never hand NULL to "%s" */
+		char *array_name = mp->path ?: mp->devnm;
+
+		if (is_subarray(mp->metadata))
+			continue;
+		if (st) {
+			st2 = st->ss->match_metadata_desc(mp->metadata);
+			if (!st2 ||
+			    (st->minor_version >= 0 &&
+			     st->minor_version != st2->minor_version)) {
+				if (verbose > 1)
+					pr_err("not adding %s to %s as metadata type doesn't match\n",
+					       devname, array_name);
+				free(st2);
+				continue;
+			}
+			free(st2);
+		}
+		sra = sysfs_read(-1, mp->devnm,
+				 GET_DEVS|GET_OFFSET|GET_SIZE|GET_STATE|
+				 GET_COMPONENT|GET_VERSION);
+		if (sra)
+			sra->array.failed_disks = -1;
+		else
+			continue;
+		if (st == NULL) {
+			int i;
+			st2 = NULL;
+			for(i = 0; !st2 && superlist[i]; i++)
+				st2 = superlist[i]->match_metadata_desc(
+					sra->text_version);
+			if (!st2) {
+				if (verbose > 1)
+					pr_err("not adding %s to %s as metadata not recognised.\n",
+					       devname, array_name);
+				goto next;
+			}
+			/* Need to double check the 'act_spare' permissions applies
+			 * to this metadata.
+			 */
+			if (!policy_action_allows(pol, st2->ss->name, act_spare))
+				goto next;
+			if (!bare && !policy_action_allows(pol, st2->ss->name,
+							   act_spare_same_slot))
+				goto next;
+		} else
+			st2 = st;
+		/* update number of failed disks for mostly degraded
+		 * container member */
+		if (sra->array.failed_disks == -1)
+			sra->array.failed_disks = container_members_max_degradation(map, mp);
+
+		/* If the size cannot be determined treat the device as
+		 * having size 0 instead of comparing an indeterminate value.
+		 */
+		if (!get_dev_size(dfd, NULL, &devsize))
+			devsize = 0;
+		if (sra->component_size == 0) {
+			/* true for containers, here we must read superblock
+			 * to obtain minimum spare size */
+			struct supertype *st3 = dup_super(st2);
+			int mdfd = open_dev(mp->devnm);
+			if (mdfd < 0) {
+				free(st3);
+				goto next;
+			}
+			if (st3->ss->load_container &&
+			    !st3->ss->load_container(st3, mdfd, mp->path)) {
+				if (st3->ss->get_spare_criteria)
+					st3->ss->get_spare_criteria(st3, &sc);
+				st3->ss->free_super(st3);
+			}
+			free(st3);
+			close(mdfd);
+		}
+		if ((sra->component_size > 0 &&
+		     st2->ss->validate_geometry(st2, sra->array.level, sra->array.layout,
+						sra->array.raid_disks, &sra->array.chunk_size,
+						sra->component_size,
+						sra->devs ? sra->devs->data_offset : INVALID_SECTORS,
+						devname, &freesize, sra->consistency_policy,
+						0) &&
+		     freesize < sra->component_size) ||
+		    (sra->component_size == 0 && devsize < sc.min_size)) {
+			if (verbose > 1)
+				pr_err("not adding %s to %s as it is too small\n",
+				       devname, array_name);
+			goto next;
+		}
+		/* test against target.
+		 * If 'target' is set and 'bare' is false, we only accept
+		 * arrays/containers that match 'target'.
+		 * If 'target' is set and 'bare' is true, we prefer the
+		 * array which matches 'target'.
+		 * target is considered only if we deal with degraded array
+		 */
+		if (target && policy_action_allows(pol, st2->ss->name,
+						   act_spare_same_slot)) {
+			if (strcmp(target->metadata, mp->metadata) == 0 &&
+			    memcmp(target->uuid, mp->uuid,
+				   sizeof(target->uuid)) == 0 &&
+			    sra->array.failed_disks > 0) {
+				/* This is our target!! */
+				sysfs_free(chosen);
+				chosen = sra;
+				sra = NULL;
+				/* skip to end so we don't check any more */
+				while (mp->next)
+					mp = mp->next;
+				goto next;
+			}
+			/* not our target */
+			if (!bare)
+				goto next;
+		}
+
+		dl = domain_from_array(sra, st2->ss->name);
+		if (domain_test(dl, pol, st2->ss->name) != 1) {
+			/* domain test fails */
+			if (verbose > 1)
+				pr_err("not adding %s to %s as it is not in a compatible domain\n",
+				       devname, array_name);
+
+			goto next;
+		}
+		/* all tests passed, OK to add to this array */
+		if (!chosen) {
+			chosen = sra;
+			sra = NULL;
+		} else if (chosen->array.failed_disks < sra->array.failed_disks) {
+			sysfs_free(chosen);
+			chosen = sra;
+			sra = NULL;
+		}
+	next:
+		/* sra is NULL here when ownership moved to 'chosen' */
+		sysfs_free(sra);
+		if (st != st2)
+			free(st2);
+		if (dl)
+			domain_free(dl);
+	}
+	if (chosen) {
+		/* add current device to chosen array as a spare */
+		int mdfd = open_dev(chosen->sys_name);
+		if (mdfd >= 0) {
+			struct mddev_dev devlist;
+			char chosen_devname[24]; // 2*11 for int (including signs) + colon + null
+
+			/* start from all-zero so that no member is handed
+			 * to Manage_subdevs() uninitialised
+			 */
+			memset(&devlist, 0, sizeof(devlist));
+			devlist.next = NULL;
+			devlist.used = 0;
+			devlist.writemostly = FlagDefault;
+			devlist.failfast = FlagDefault;
+			devlist.devname = chosen_devname;
+			sprintf(chosen_devname, "%d:%d", major(rdev),
+				minor(rdev));
+			devlist.disposition = 'a';
+			close(dfd);
+			*dfdp = -1;
+			rv = Manage_subdevs(chosen->sys_name, mdfd, &devlist,
+					    -1, 0, NULL, 0);
+			close(mdfd);
+		}
+		if (verbose > 0) {
+			if (rv == 0)
+				pr_err("added %s as spare for %s\n",
+				       devname, chosen->sys_name);
+			else
+				pr_err("failed to add %s as spare for %s\n",
+				       devname, chosen->sys_name);
+		}
+		sysfs_free(chosen);
+	}
+	map_unlock(&map);
+	return rv;
+}
+
+static int partition_try_spare(char *devname, int *dfdp, struct dev_policy *pol,
+			       struct supertype *st, int verbose)
+{
+	/* we know that at least one partition virtual-metadata is
+	 * allowed to incorporate spares like this device.  We need to
+	 * find a suitable device to copy partition information from.
+	 *
+	 * Getting a list of all disk (not partition) devices is
+	 * slightly non-trivial.  We could look at /sys/block, but
+	 * that is theoretically due to be removed.  Maybe best to use
+	 * /dev/disk/by-path/?* and ignore names ending '-partNN' as
+	 * we depend on this directory of 'path' info.  But that fails
+	 * to find loop devices and probably others.  Maybe don't
+	 * worry about that, they aren't the real target.
+	 *
+	 * So: check things in /dev/disk/by-path to see if they are in
+	 * a compatible domain, then load the partition table and see
+	 * if it is OK for the new device, and choose the largest
+	 * partition table that fits.
+	 *
+	 * Returns 0 once a donor was found and 'devname' could be opened
+	 * for writing (the result of store_super is not checked), and 1
+	 * otherwise.  'dfdp' and 'verbose' are not used here.
+	 */
+	DIR *dir;
+	struct dirent *de;
+	char *chosen = NULL;
+	unsigned long long chosen_size = 0;
+	struct supertype *chosen_st = NULL;
+	int fd;
+	int rv = 1;
+
+	dir = opendir("/dev/disk/by-path");
+	if (!dir)
+		return 1;
+	while ((de = readdir(dir)) != NULL) {
+		char *ep;
+		struct dev_policy *pol2 = NULL;
+		struct domainlist *domlist = NULL;
+		int src_fd = -1;
+		struct mdinfo info;
+		struct supertype *st2 = NULL;
+		char *src_name = NULL;
+		unsigned long long devsectors;
+		char *pathlist[2];
+
+		if (de->d_ino == 0 || de->d_name[0] == '.' ||
+		    (de->d_type != DT_LNK && de->d_type != DT_UNKNOWN))
+			goto next;
+
+		/* strip trailing digits, then look for a "-part" suffix */
+		ep = de->d_name + strlen(de->d_name);
+		while (ep > de->d_name &&
+		       isdigit((unsigned char)ep[-1]))
+			ep--;
+		if (ep > de->d_name + 5 &&
+		    strncmp(ep-5, "-part", 5) == 0)
+			/* This is a partition - skip it */
+			goto next;
+
+		pathlist[0] = de->d_name;
+		pathlist[1] = NULL;
+		pol2 = path_policy(pathlist, type_disk);
+
+		domain_merge(&domlist, pol2, st ? st->ss->name : NULL);
+		if (domain_test(domlist, pol, st ? st->ss->name : NULL) != 1)
+			/* new device is incompatible with this device. */
+			goto next;
+
+		domain_free(domlist);
+		domlist = NULL;
+
+		/* asprintf() returns the number of bytes written, or -1 on
+		 * failure (in which case the pointer is undefined).
+		 */
+		if (asprintf(&src_name, "/dev/disk/by-path/%s", de->d_name) < 0) {
+			src_name = NULL;
+			goto next;
+		}
+		src_fd = open(src_name, O_RDONLY);
+		if (src_fd < 0)
+			goto next;
+		if (get_dev_size(src_fd, src_name, &devsectors) == 0)
+			goto next;
+		devsectors >>= 9;
+
+		if (st)
+			st2 = dup_super(st);
+		else
+			st2 = guess_super_type(src_fd, guess_partitions);
+		/* any non-zero result from load_super means "not loaded",
+		 * as everywhere else in this file
+		 */
+		if (st2 == NULL || st2->ss->load_super(st2, src_fd, NULL) != 0)
+			goto next;
+		st2->ignore_hw_compat = 0;
+
+		if (!st) {
+			/* Check domain policy again, this time referring to metadata */
+			domain_merge(&domlist, pol2, st2->ss->name);
+			if (domain_test(domlist, pol, st2->ss->name) != 1)
+				/* Incompatible devices for this metadata type */
+				goto next;
+			if (!policy_action_allows(pol, st2->ss->name, act_spare))
+				/* Some partition types allow sparing, but not
+				 * this one.
+				 */
+				goto next;
+		}
+
+		st2->ss->getinfo_super(st2, &info, NULL);
+		if (info.component_size > devsectors)
+			/* This partitioning doesn't fit in the device */
+			goto next;
+
+		/* This is an acceptable device to copy partition
+		 * metadata from.  We could just stop here, but I
+		 * think I want to keep looking incase a larger
+		 * metadata which makes better use of the device can
+		 * be found.
+		 */
+		if (chosen == NULL || chosen_size < info.component_size) {
+			chosen_size = info.component_size;
+			free(chosen);
+			chosen = src_name;
+			src_name = NULL;
+			if (chosen_st) {
+				chosen_st->ss->free_super(chosen_st);
+				free(chosen_st);
+			}
+			chosen_st = st2;
+			st2 = NULL;
+		}
+
+	next:
+		/* everything still owned by this iteration is released here */
+		free(src_name);
+		domain_free(domlist);
+		dev_policy_free(pol2);
+		if (st2)
+			st2->ss->free_super(st2);
+		free(st2);
+
+		if (src_fd >= 0)
+			close(src_fd);
+	}
+
+	closedir(dir);
+
+	if (!chosen)
+		return 1;
+
+	/* 'chosen' is the best device we can find.  Let's write its
+	 * metadata to devname.  dfd is read-only so don't use that.
+	 */
+	fd = open(devname, O_RDWR);
+	if (fd >= 0) {
+		/* NOTE(review): the result of store_super is ignored, as
+		 * before - confirm its return convention before relying
+		 * on it.
+		 */
+		chosen_st->ss->store_super(chosen_st, fd);
+		close(fd);
+		rv = 0;
+	}
+	free(chosen);
+	chosen_st->ss->free_super(chosen_st);
+	free(chosen_st);
+	return rv;
+}
+
+/* Read 4096 bytes at byte offset 'pos' of 'fd' into 'buf' and report whether
+ * they look blank: all bytes identical and equal to 0x00, 0x5a or 0xff.
+ * Returns 1 when blank, 0 when not blank or when the seek/read fails.
+ */
+static int blank_4k_at(int fd, off_t pos, char *buf)
+{
+	if (lseek(fd, pos, SEEK_SET) != pos ||
+	    read(fd, buf, 4096) != 4096)
+		return 0;
+
+	if (buf[0] != '\0' && buf[0] != '\x5a' && buf[0] != '\xff')
+		return 0;
+	/* comparing the block with itself shifted by one byte shows that
+	 * every byte equals the first one
+	 */
+	return memcmp(buf, buf+1, 4095) == 0;
+}
+
+/* A device is considered bare when both its first and its last 4K are
+ * blank.  Returns 1 for a bare device, 0 otherwise (including when the
+ * device size is unknown or smaller than 4K).
+ */
+static int is_bare(int dfd)
+{
+	unsigned long long size = 0;
+	char bufpad[4096 + 4096];
+	/* a 4K-aligned, 4K-long window inside bufpad */
+	char *buf = (char*)(((long)bufpad + 4096) & ~4095);
+
+	if (!blank_4k_at(dfd, 0, buf))
+		return 0;
+
+	/* OK, first 4K appear blank, try the end.
+	 * Without a usable size 'size - 4096' would wrap around, so give
+	 * up explicitly instead of relying on lseek() to reject the offset.
+	 */
+	if (!get_dev_size(dfd, NULL, &size) || size < 4096)
+		return 0;
+
+	return blank_4k_at(dfd, size - 4096, buf);
+}
+
+/* Adding a spare to a regular array and adding one to a set-of-partitions
+ * virtual array are two very different operations.
+ * This function works out which of the two is worth attempting and then
+ * attempts it; arrays are tried before partitions.
+ * Returns 0 when the device was taken as a spare, non-zero otherwise.
+ */
+static int try_spare(char *devname, int *dfdp, struct dev_policy *pol,
+		     struct map_ent *target,
+		     struct supertype *st, int verbose)
+{
+	char *metadata = st ? st->ss->name : NULL;
+	int arrays_ok = 0;
+	int partitions_ok = 0;
+	int bare;
+	int idx;
+	int rv = 1;
+
+	/* Can only add a spare if device has at least one domain */
+	if (pol_find(pol, pol_domain) == NULL)
+		return 1;
+	/* And only if some action allows spares */
+	if (!policy_action_allows(pol, metadata, act_spare))
+		return 1;
+
+	/* A bare device can always become a spare.  A non-bare device
+	 * qualifies only when spare-same-slot is permitted and it replaces
+	 * a previous device, which is the case when 'target' is set.
+	 * Later - may allow force_spare without target
+	 */
+	bare = is_bare(*dfdp) ? 1 : 0;
+	if (!bare &&
+	    (!target ||
+	     !policy_action_allows(pol, metadata, act_spare_same_slot))) {
+		if (verbose > 1)
+			pr_err("%s is not bare, so not considering as a spare\n",
+			       devname);
+		return 1;
+	}
+
+	/* With known metadata the choice between 'array' and 'partition'
+	 * handling follows from whether that metadata can add to a super.
+	 */
+	if (st) {
+		if (st->ss->add_to_super)
+			return array_try_spare(devname, dfdp, pol, target,
+					       bare, st, verbose);
+		return partition_try_spare(devname, dfdp, pol, st, verbose);
+	}
+
+	/* No metadata was specified or found so options are open.
+	 * Find out cheaply whether any array metadata and any partition
+	 * metadata permits sparing, so that the costly scans below are
+	 * only run when they have a chance of succeeding.
+	 */
+	for (idx = 0; superlist[idx] && !(arrays_ok && partitions_ok); idx++) {
+		int *allowed = superlist[idx]->add_to_super ?
+			       &arrays_ok : &partitions_ok;
+
+		if (!*allowed &&
+		    policy_action_allows(pol, superlist[idx]->name, act_spare))
+			*allowed = 1;
+	}
+
+	if (arrays_ok)
+		rv = array_try_spare(devname, dfdp, pol, target, bare,
+				     st, verbose);
+	if (rv != 0 && partitions_ok)
+		rv = partition_try_spare(devname, dfdp, pol, st, verbose);
+	return rv;
+}
+
+int IncrementalScan(struct context *c, char *devnm)
+{
+ /* look at every device listed in the 'map' file.
+ * If one is found that is not running then:
+ * look in mdadm.conf for bitmap file.
+ * if one exists, but array has none, add it.
+ * try to start array in auto-readonly mode
+ *
+ * When 'devnm' is given only the map entry with that devnm is
+ * handled. If that entry is a member array (its metadata starts
+ * with '/'), the scan is restarted on its container instead and
+ * only that member is passed on to Incremental_container().
+ * Entries whose metadata does not start with a digit are treated
+ * as containers and handed to Incremental_container().
+ *
+ * Returns 0, or 1 if handling a container reported a failure or an
+ * array could not be switched to "read-auto".
+ */
+ struct map_ent *mapl = NULL;
+ struct map_ent *me;
+ struct mddev_ident *devs, *mddev;
+ int rv = 0;
+ char container[32]; /* container name cut out of a member's metadata */
+ char *only = NULL; /* set to the originally requested member, if any */
+
+ map_read(&mapl);
+ devs = conf_get_ident(NULL);
+
+restart: /* re-entered once with devnm pointing at 'container' */
+ for (me = mapl ; me ; me = me->next) {
+ struct mdinfo *sra;
+ int mdfd;
+
+ if (devnm && strcmp(devnm, me->devnm) != 0)
+ continue;
+ if (me->metadata[0] == '/') {
+ char *sl;
+
+ if (!devnm)
+ continue;
+
+ /* member array, need to work on container */
+ strncpy(container, me->metadata+1, 32);
+ container[31] = 0; /* strncpy does not terminate on truncation */
+ sl = strchr(container, '/');
+ if (sl)
+ *sl = 0;
+ only = devnm;
+ devnm = container;
+ goto restart;
+ }
+ mdfd = open_dev(me->devnm);
+
+ if (!is_fd_valid(mdfd))
+ continue;
+ if (!isdigit(me->metadata[0])) {
+ /* must be a container */
+ struct supertype *st = super_by_fd(mdfd, NULL);
+ int ret = 0;
+ struct map_ent *map = NULL;
+
+ if (st && st->ss->load_container)
+ ret = st->ss->load_container(st, mdfd, NULL);
+ close_fd(&mdfd);
+ if (!ret && st && st->ss->container_content) {
+ /* a failed lock is reported but the container is
+ * processed regardless
+ */
+ if (map_lock(&map))
+ pr_err("failed to get exclusive lock on mapfile\n");
+ ret = Incremental_container(st, me->path, c, only);
+ map_unlock(&map);
+ }
+ if (ret)
+ rv = 1;
+ continue;
+ }
+ if (md_array_active(mdfd)) {
+ close_fd(&mdfd);
+ continue;
+ }
+ /* Ok, we can try this one. Maybe it needs a bitmap */
+ for (mddev = devs ; mddev ; mddev = mddev->next)
+ if (mddev->devname && me->path &&
+ devname_matches(mddev->devname, me->path))
+ break;
+ if (mddev && mddev->bitmap_file) {
+ /*
+ * Note: early kernels will wrongly fail this, so it
+ * is a hint only
+ */
+ int added = -1;
+ int bmfd;
+
+ bmfd = open(mddev->bitmap_file, O_RDWR);
+ if (is_fd_valid(bmfd)) {
+ added = ioctl(mdfd, SET_BITMAP_FILE, bmfd);
+ close_fd(&bmfd);
+ }
+ /* NOTE(review): errno is read below after close_fd() has
+ * run - confirm close_fd() cannot disturb it
+ */
+ if (c->verbose >= 0) {
+ if (added == 0)
+ pr_err("Added bitmap %s to %s\n",
+ mddev->bitmap_file, me->path);
+ else if (errno != EEXIST)
+ pr_err("Failed to add bitmap to %s: %s\n",
+ me->path, strerror(errno));
+ }
+ }
+ /* FIXME check for reshape_active and consider not
+ * starting array.
+ */
+ sra = sysfs_read(mdfd, NULL, 0);
+ if (sra) {
+ if (sysfs_set_str(sra, NULL,
+ "array_state", "read-auto") == 0) {
+ if (c->verbose >= 0)
+ pr_err("started array %s\n",
+ me->path ?: me->devnm);
+ } else {
+ pr_err("failed to start array %s: %s\n",
+ me->path ?: me->devnm,
+ strerror(errno));
+ rv = 1;
+ }
+ sysfs_free(sra);
+ }
+ close_fd(&mdfd);
+ }
+ map_free(mapl);
+ return rv;
+}
+
+/* Translate a container reference into a freshly allocated md device name.
+ * 'devname' is either a path (leading '/'), which is opened and resolved
+ * through fd2devnm(), or a UUID string, which is looked up in the map file.
+ * Returns NULL when the path cannot be opened, the UUID cannot be parsed or
+ * no map entry matches; otherwise the caller owns the returned string.
+ */
+static char *container2devname(char *devname)
+{
+	struct map_ent *map = NULL;
+	struct map_ent *hit;
+	char *result = NULL;
+	int uuid[4];
+	int fd;
+
+	if (devname[0] != '/') {
+		/* not a path, so it has to be a UUID */
+		if (!parse_uuid(devname, uuid))
+			return NULL;
+		hit = map_by_uuid(&map, uuid);
+		if (hit)
+			result = xstrdup(hit->devnm);
+		map_free(map);
+		return result;
+	}
+
+	fd = open(devname, O_RDONLY);
+	if (fd < 0)
+		return NULL;
+	result = xstrdup(fd2devnm(fd));
+	close(fd);
+	return result;
+}
+
+static int Incremental_container(struct supertype *st, char *devname,
+				 struct context *c, char *only)
+{
+	/* Collect the contents of this container and for each
+	 * array, choose a device name and assemble the array.
+	 *
+	 * When 'only' is set, just the member array whose map entry has
+	 * that devnm is assembled; everything else is skipped.
+	 *
+	 * Returns 0 normally (including "not enough devices" and "nothing
+	 * to activate"), and 2 when mdadm.conf matching reports a conflict
+	 * or a member's md device cannot be opened or created.
+	 */
+
+	struct mdinfo *list;
+	struct mdinfo *ra;
+	struct map_ent *map = NULL;
+	struct mdinfo info;
+	int trustworthy;
+	struct mddev_ident *match;
+	int rv = 0;
+	int result = 0;
+
+	st->ss->getinfo_super(st, &info, NULL);
+
+	if ((c->runstop > 0 && info.container_enough >= 0) ||
+	    info.container_enough > 0)
+		/* pass */;
+	else {
+		if (c->export) {
+			printf("MD_STARTED=no\n");
+		} else if (c->verbose)
+			pr_err("not enough devices to start the container\n");
+		return 0;
+	}
+
+	match = conf_match(st, &info, devname, c->verbose, &rv);
+	if (match == NULL && rv == 2)
+		return rv;
+
+	/* Need to compute 'trustworthy' */
+	if (match)
+		trustworthy = LOCAL;
+	else if (st->ss->match_home(st, c->homehost) == 1)
+		trustworthy = LOCAL;
+	else if (st->ss->match_home(st, "any") == 1)
+		trustworthy = LOCAL;
+	else
+		trustworthy = FOREIGN;
+
+	list = st->ss->container_content(st, NULL);
+	/* when nothing to activate - quit */
+	if (list == NULL) {
+		if (c->export) {
+			printf("MD_STARTED=nothing\n");
+		}
+		return 0;
+	}
+	for (ra = list ; ra ; ra = ra->next) {
+		int mdfd = -1;
+		char chosen_name[1024] = "";
+		struct map_ent *mp;
+		struct mddev_ident *member_match = NULL;
+
+		/* do not activate arrays blocked by metadata handler */
+		if (ra->array.state & (1 << MD_SB_BLOCK_VOLUME)) {
+			pr_err("Cannot activate array %s in %s.\n",
+			       ra->text_version, devname);
+			continue;
+		}
+		mp = map_by_uuid(&map, ra->uuid);
+
+		/* Apply the 'only' filter before any device is opened so
+		 * that a skipped array does not leave a descriptor behind.
+		 */
+		if (only && (!mp || strcmp(mp->devnm, only) != 0))
+			continue;
+
+		if (mp) {
+			mdfd = open_dev(mp->devnm);
+			snprintf(chosen_name, sizeof(chosen_name), "%s",
+				 mp->path ? mp->path : mp->devnm);
+		} else {
+			/* No map entry, and 'only' is not set (see the
+			 * filter above).
+			 * Check in mdadm.conf for container == devname and
+			 * member == ra->text_version after second slash.
+			 */
+			char *sub = strchr(ra->text_version+1, '/');
+			struct mddev_ident *array_list;
+			if (sub) {
+				sub++;
+				array_list = conf_get_ident(NULL);
+			} else
+				array_list = NULL;
+			for(; array_list ; array_list = array_list->next) {
+				char *dn;
+				if (array_list->member == NULL ||
+				    array_list->container == NULL)
+					continue;
+				if (strcmp(array_list->member, sub) != 0)
+					continue;
+				if (array_list->uuid_set &&
+				    !same_uuid(ra->uuid, array_list->uuid, st->ss->swapuuid))
+					continue;
+				dn = container2devname(array_list->container);
+				if (dn == NULL)
+					continue;
+				if (strncmp(dn, ra->text_version+1,
+					    strlen(dn)) != 0 ||
+				    ra->text_version[strlen(dn)+1] != '/') {
+					free(dn);
+					continue;
+				}
+				free(dn);
+				/* we have a match */
+				member_match = array_list;
+				if (c->verbose>0)
+					pr_err("match found for member %s\n",
+					       array_list->member);
+				break;
+			}
+
+			if (member_match && member_match->devname &&
+			    strcasecmp(member_match->devname, "<ignore>") == 0) {
+				if (c->verbose > 0)
+					pr_err("array %s/%s is explicitly ignored by mdadm.conf\n",
+					       member_match->container,
+					       member_match->member);
+				continue;
+			}
+			/* NOTE(review): 'trustworthy' is function-wide, so a
+			 * match here also marks all later members LOCAL -
+			 * kept as is, confirm this is intended.
+			 */
+			if (member_match)
+				trustworthy = LOCAL;
+
+			mdfd = create_mddev(member_match ? member_match->devname : NULL,
+					    ra->name,
+					    c->autof,
+					    trustworthy,
+					    chosen_name, 0);
+		}
+
+		if (mdfd < 0) {
+			pr_err("failed to open %s: %s.\n",
+			       chosen_name, strerror(errno));
+			rv = 2;
+			goto release;
+		}
+
+		assemble_container_content(st, mdfd, ra, c,
+					   chosen_name, &result);
+		/* drop the cached map so the next lookup re-reads it */
+		map_free(map);
+		map = NULL;
+		close(mdfd);
+	}
+	if (c->export && result) {
+		char sep = '=';
+		printf("MD_STARTED");
+		if (result & INCR_NO) {
+			printf("%cno", sep);
+			sep = ',';
+		}
+		if (result & INCR_UNSAFE) {
+			printf("%cunsafe", sep);
+			sep = ',';
+		}
+		if (result & INCR_ALREADY) {
+			printf("%calready", sep);
+			sep = ',';
+		}
+		if (result & INCR_YES) {
+			printf("%cyes", sep);
+			sep = ',';
+		}
+		printf("\n");
+	}
+	rv = 0;
+release:
+	/* 'map' is still set when the last iteration ended early */
+	if (map)
+		map_free(map);
+	/* the list returned by container_content() belongs to us */
+	sysfs_free(list);
+	return rv;
+}
+
+/*
+ * run_udisks - run the external "udisks" helper with two arguments and
+ * wait for it to finish.
+ *
+ * @arg1, @arg2: passed to udisks as its first and second argument.
+ *
+ * The helper is looked for at /usr/bin/udisks and then /bin/udisks; if
+ * neither can be executed the child exits with status 1.  The exit status
+ * of the helper is not reported to the caller.
+ */
+static void run_udisks(char *arg1, char *arg2)
+{
+	int pid = fork();
+	int status;
+
+	if (pid == 0) {
+		manage_fork_fds(1);
+		execl("/usr/bin/udisks", "udisks", arg1, arg2, NULL);
+		execl("/bin/udisks", "udisks", arg1, arg2, NULL);
+		exit(1);
+	}
+	if (pid < 0)
+		return;
+
+	/* Wait for exactly this child.  Retry only when interrupted by a
+	 * signal: any other failure (e.g. ECHILD) can never succeed later,
+	 * so looping on it would spin forever.
+	 */
+	while (waitpid(pid, &status, 0) < 0 && errno == EINTR)
+		;
+}
+
+/*
+ * force_remove - last-resort handling when a device cannot be removed:
+ * ask udisks to unmount the array, then try to stop it.
+ *
+ * @devnm:   kernel name of the array
+ * @fd:      open descriptor for the array, handed to Manage_stop()
+ * @mdi:     sysfs handle used to emit a "remove" uevent if stopping fails
+ * @verbose: non-zero to report a failed stop
+ *
+ * Returns the result of Manage_stop().
+ */
+static int force_remove(char *devnm, int fd, struct mdinfo *mdi, int verbose)
+{
+	int devid = devnm2devid(devnm);
+	int stop_result;
+
+	run_udisks("--unmount", map_dev(major(devid), minor(devid), 0));
+
+	stop_result = Manage_stop(devnm, fd, verbose, 1);
+	if (stop_result == 0)
+		return 0;
+
+	/* At least we can try to trigger a 'remove' */
+	sysfs_uevent(mdi, "remove");
+	if (verbose)
+		pr_err("Fail to stop %s too.\n", devnm);
+
+	return stop_result;
+}
+
+/*
+ * remove_from_member_array - apply @devlist to one member array of a
+ * container.
+ *
+ * @memb:    mdstat entry of the member array
+ * @devlist: device operations passed to Manage_subdevs()
+ * @verbose: verbosity passed through to the helpers
+ *
+ * If Manage_subdevs() reports bit 2 in its result, the member array is
+ * forcibly stopped via force_remove().  Nothing is done when the member
+ * array cannot be opened.
+ */
+static void remove_from_member_array(struct mdstat_ent *memb,
+				    struct mddev_dev *devlist, int verbose)
+{
+	struct mdinfo member_info;
+	int member_fd = open_dev(memb->devnm);
+	int status;
+
+	if (member_fd < 0)
+		return;
+
+	status = Manage_subdevs(memb->devnm, member_fd, devlist, verbose,
+				0, NULL, 0);
+	if (status & 2) {
+		if (sysfs_init(&member_info, -1, memb->devnm) == 0)
+			force_remove(memb->devnm, member_fd, &member_info,
+				     verbose);
+		else
+			pr_err("unable to initialize sysfs for: %s\n",
+			       memb->devnm);
+	}
+	close(member_fd);
+}
+
+/*
+ * IncrementalRemove - Attempt to see if the passed in device belongs to any
+ * raid arrays, and if so first fail (if needed) and then remove the device.
+ *
+ * @devname - The device we want to remove
+ * @id_path - name as found in /dev/disk/by-path for this device; may be
+ *            NULL, in which case no path is saved for the array
+ * @verbose - negative values silence the "not a component" and
+ *            "cannot open" messages
+ *
+ * Note: the device name must be a kernel name like "sda", so
+ * that we can find it in /proc/mdstat
+ *
+ * Returns 1 if the device or its array cannot be found, initialised or
+ * opened; otherwise the result of the final remove request, or of
+ * force_remove() when failing the device reported bit 2.
+ */
+int IncrementalRemove(char *devname, char *id_path, int verbose)
+{
+	int mdfd;
+	int rv = 0;
+	struct mdstat_ent *ent;
+	struct mddev_dev devlist;
+	struct mdinfo mdi;
+	char buf[32];
+
+	if (!id_path)
+		dprintf("incremental removal without --path <id_path> lacks the possibility to re-add new device in this port\n");
+
+	if (strchr(devname, '/')) {
+		pr_err("incremental removal requires a kernel device name, not a file: %s\n", devname);
+		return 1;
+	}
+	ent = mdstat_by_component(devname);
+	if (!ent) {
+		if (verbose >= 0)
+			pr_err("%s does not appear to be a component of any array\n", devname);
+		return 1;
+	}
+	if (sysfs_init(&mdi, -1, ent->devnm)) {
+		pr_err("unable to initialize sysfs for: %s\n", devname);
+		/* 'ent' is owned by us from here on; do not leak it */
+		free_mdstat(ent);
+		return 1;
+	}
+
+	/* If exclusive access is possible and the array state reads
+	 * "active" or "clean", switch it to "read-auto" first.
+	 */
+	mdfd = open_dev_excl(ent->devnm);
+	if (is_fd_valid(mdfd)) {
+		close_fd(&mdfd);
+		if (sysfs_get_str(&mdi, NULL, "array_state",
+				  buf, sizeof(buf)) > 0) {
+			if (strncmp(buf, "active", 6) == 0 ||
+			    strncmp(buf, "clean", 5) == 0)
+				sysfs_set_str(&mdi, NULL,
+					      "array_state", "read-auto");
+		}
+	}
+	mdfd = open_dev(ent->devnm);
+	if (mdfd < 0) {
+		if (verbose >= 0)
+			pr_err("Cannot open array %s!!\n", ent->devnm);
+		free_mdstat(ent);
+		return 1;
+	}
+
+	if (id_path) {
+		struct map_ent *map = NULL, *me;
+
+		me = map_by_devnm(&map, ent->devnm);
+		if (me)
+			policy_save_path(id_path, me);
+		map_free(map);
+	}
+
+	/* First pass: mark the device faulty ('f') */
+	memset(&devlist, 0, sizeof(devlist));
+	devlist.devname = devname;
+	devlist.disposition = 'f';
+	/* for a container, we must fail each member array */
+	if (ent->metadata_version &&
+	    strncmp(ent->metadata_version, "external:", 9) == 0) {
+		struct mdstat_ent *mdstat = mdstat_read(0, 0);
+		struct mdstat_ent *memb;
+
+		for (memb = mdstat ; memb ; memb = memb->next) {
+			if (is_container_member(memb, ent->devnm))
+				remove_from_member_array(memb,
+							 &devlist, verbose);
+		}
+		free_mdstat(mdstat);
+	} else {
+		rv |= Manage_subdevs(ent->devnm, mdfd, &devlist,
+				     verbose, 0, NULL, 0);
+		if (rv & 2) {
+			/* Failed due to EBUSY, try to stop the array.
+			 * Give udisks a chance to unmount it first.
+			 */
+			rv = force_remove(ent->devnm, mdfd, &mdi, verbose);
+			goto end;
+		}
+	}
+
+	/* Second pass: remove ('r') the now-failed device */
+	devlist.disposition = 'r';
+	rv = Manage_subdevs(ent->devnm, mdfd, &devlist,
+			    verbose, 0, NULL, 0);
+end:
+	close(mdfd);
+	free_mdstat(ent);
+	return rv;
+}
diff --git a/Kill.c b/Kill.c
new file mode 100644
index 0000000..bfd0efd
--- /dev/null
+++ b/Kill.c
@@ -0,0 +1,147 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email: <neilb@suse.de>
+ *
+ * Added by Dale Stephenson
+ * steph@snapserver.com
+ */
+
+#include "mdadm.h"
+#include "md_u.h"
+#include "md_p.h"
+
+int Kill(char *dev, struct supertype *st, int force, int verbose, int noexcl)
+{
+	/*
+	 * Overwrite the md superblock on 'dev' with a freshly initialised
+	 * (empty) one.  Definitely not safe.
+	 *
+	 * The device is opened O_EXCL unless 'noexcl' or 'force' is set.
+	 * When 'st' is NULL the metadata type is guessed from the device.
+	 * With 'force', a load_super() result of 2 or more does not
+	 * prevent the superblock from being rewritten.
+	 *
+	 * Returns:
+	 *  0 - a zero superblock was successfully written out
+	 *  1 - failed to write the zero superblock
+	 *  2 - failed to open the device.
+	 *  4 - metadata type not recognised / no init_super handler.
+	 * Any other load_super() failure is returned unchanged.
+	 */
+	int open_flags = O_RDWR;
+	int fd;
+	int rv;
+
+	if (!force && !noexcl)
+		open_flags |= O_EXCL;
+
+	fd = open(dev, open_flags);
+	if (fd < 0) {
+		if (verbose >= 0)
+			pr_err("Couldn't open %s for write - not zeroing\n",
+			       dev);
+		return 2;
+	}
+
+	if (!st)
+		st = guess_super(fd);
+	if (!st || !st->ss->init_super) {
+		if (verbose >= 0)
+			pr_err("Unrecognised md component device - %s\n", dev);
+		close(fd);
+		return 4;
+	}
+
+	st->ignore_hw_compat = 1;
+	rv = st->ss->load_super(st, fd, dev);
+	if (rv != 0 && !(force && rv >= 2)) {
+		/* Nothing loadable and not allowed to overwrite blindly */
+		close(fd);
+		return rv;
+	}
+
+	st->ss->free_super(st);
+	st->ss->init_super(st, NULL, NULL, "", NULL, NULL,
+			   INVALID_SECTORS);
+	if (st->ss->store_super(st, fd) != 0) {
+		if (verbose >= 0)
+			pr_err("Could not zero superblock on %s\n",
+			       dev);
+		rv = 1;
+	} else if (rv != 0) {
+		/* load had failed, but 'force' let us write anyway */
+		if (verbose >= 0)
+			pr_err("superblock zeroed anyway\n");
+		rv = 0;
+	}
+
+	close(fd);
+	return rv;
+}
+
+int Kill_subarray(char *dev, char *subarray, int verbose)
+{
+	/* Delete a subarray out of a container; the subarray must be
+	 * inactive.  The subarray string must be a subarray index
+	 * number.
+	 *
+	 * Returns 0 when the metadata handler deleted the subarray,
+	 * 2 when the container/subarray could not be opened, the
+	 * metadata type has no kill_subarray handler, or the subarray
+	 * is still active; otherwise the non-zero result of the
+	 * handler is returned.
+	 */
+	struct supertype container;
+	struct supertype *st = &container;
+	int quiet = verbose < 0;
+	int result = 2;
+	int fd;
+
+	memset(&container, 0, sizeof(container));
+
+	fd = open_subarray(dev, subarray, st, quiet);
+	if (fd < 0)
+		return 2;
+
+	if (st->ss->kill_subarray == NULL) {
+		if (!quiet)
+			pr_err("Operation not supported for %s metadata\n",
+			       st->ss->name);
+		goto free_super;
+	}
+
+	if (is_subarray_active(subarray, st->devnm)) {
+		if (!quiet)
+			pr_err("Subarray-%s still active, aborting\n",
+			       subarray);
+		goto free_super;
+	}
+
+	/* With mdmon running, queue the updates instead of syncing here */
+	if (mdmon_running(st->devnm))
+		st->update_tail = &st->updates;
+
+	/* ok we've found our victim, drop the axe */
+	result = st->ss->kill_subarray(st, subarray);
+	if (result != 0) {
+		if (!quiet)
+			pr_err("Failed to delete subarray-%s from %s\n",
+			       subarray, dev);
+		goto free_super;
+	}
+
+	/* FIXME these routines do not report success/failure */
+	if (st->update_tail == NULL)
+		st->ss->sync_metadata(st);
+	else
+		flush_metadata_updates(st);
+
+	if (!quiet)
+		pr_err("Deleted subarray-%s from %s, UUIDs may have changed\n",
+		       subarray, dev);
+
+	result = 0;
+
+ free_super:
+	st->ss->free_super(st);
+	close(fd);
+
+	return result;
+}
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..2a51d81
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,332 @@
+#
+# mdadm - manage Linux "md" devices aka RAID arrays.
+#
+# Copyright (C) 2001-2002 Neil Brown <neilb@cse.unsw.edu.au>
+# Copyright (C) 2013 Neil Brown <neilb@suse.de>
+#
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+#
+# Author: Neil Brown
+# Email: <neilb@cse.unsw.edu.au>
+# Paper: Neil Brown
+# School of Computer Science and Engineering
+# The University of New South Wales
+# Sydney, 2052
+# Australia
+#
+
+# define "CXFLAGS" to give extra flags to CC.
+# e.g. make CXFLAGS=-O to optimise
+CXFLAGS ?=-O2
+TCC = tcc
+UCLIBC_GCC = $(shell for nm in i386-uclibc-linux-gcc i386-uclibc-gcc; do which $$nm > /dev/null && { echo $$nm ; exit; } ; done; echo false No uclibc found )
+#DIET_GCC = diet gcc
+# sorry, but diet-libc doesn't know about posix_memalign,
+# so we cannot use it any more.
+DIET_GCC = gcc -DHAVE_STDINT_H
+
+KLIBC=/home/src/klibc/klibc-0.77
+
+KLIBC_GCC = gcc -nostdinc -iwithprefix include -I$(KLIBC)/klibc/include -I$(KLIBC)/linux/include -I$(KLIBC)/klibc/arch/i386/include -I$(KLIBC)/klibc/include/bits32
+
+ifdef COVERITY
+COVERITY_FLAGS=-include coverity-gcc-hack.h
+endif
+
+# Use $(CROSS_COMPILE)gcc unless CC was given explicitly: $(origin CC) is
+# "default" only when CC still has make's built-in value.
+ifeq ($(origin CC),default)
+CC := $(CROSS_COMPILE)gcc
+endif
+# NOTE(review): CXFLAGS already receives a default (-O2) via ?= earlier in
+# this file, so this second ?= looks like it can never take effect - confirm.
+CXFLAGS ?= -ggdb
+CWFLAGS = -Wall -Werror -Wstrict-prototypes -Wextra -Wno-unused-parameter
+ifdef WARN_UNUSED
+CWFLAGS += -Wp,-D_FORTIFY_SOURCE=2 -O3
+endif
+
+# Count the lines of gcc's help output mentioning "implicit-fallthrough";
+# if there are any, turn that warning off (level 0).
+# NOTE(review): the probe runs plain "gcc", not $(CC) - confirm this is
+# intended when cross-compiling or building with another compiler.
+FALLTHROUGH := $(shell gcc -v --help 2>&1 | grep "implicit-fallthrough" | wc -l)
+ifneq "$(FALLTHROUGH)" "0"
+CWFLAGS += -Wimplicit-fallthrough=0
+endif
+
+ifdef DEBIAN
+CPPFLAGS += -DDEBIAN
+endif
+ifdef DEFAULT_OLD_METADATA
+ CPPFLAGS += -DDEFAULT_OLD_METADATA
+ DEFAULT_METADATA=0.90
+else
+ DEFAULT_METADATA=1.2
+endif
+CPPFLAGS += -DBINDIR=\"$(BINDIR)\"
+
+PKG_CONFIG ?= pkg-config
+
+SYSCONFDIR = /etc
+CONFFILE = $(SYSCONFDIR)/mdadm.conf
+CONFFILE2 = $(SYSCONFDIR)/mdadm/mdadm.conf
+MAILCMD =/usr/sbin/sendmail -t
+CONFFILEFLAGS = -DCONFFILE=\"$(CONFFILE)\" -DCONFFILE2=\"$(CONFFILE2)\"
+# Both MAP_DIR and MDMON_DIR should be somewhere that persists across the
+# pivot_root from early boot to late boot.
+# /run is best, but for distros that don't support that,
+# /dev can work, in which case you probably want /dev/.mdadm
+RUN_DIR=/run/mdadm
+CHECK_RUN_DIR=1
+MAP_DIR=$(RUN_DIR)
+MAP_FILE = map
+MAP_PATH = $(MAP_DIR)/$(MAP_FILE)
+MDMON_DIR = $(RUN_DIR)
+# place for autoreplace cookies
+FAILED_SLOTS_DIR = $(RUN_DIR)/failed-slots
+SYSTEMD_DIR=/lib/systemd/system
+LIB_DIR=/usr/libexec/mdadm
+
+COROSYNC:=$(shell [ -d /usr/include/corosync ] || echo -DNO_COROSYNC)
+DLM:=$(shell [ -f /usr/include/libdlm.h ] || echo -DNO_DLM)
+
+DIRFLAGS = -DMAP_DIR=\"$(MAP_DIR)\" -DMAP_FILE=\"$(MAP_FILE)\"
+DIRFLAGS += -DMDMON_DIR=\"$(MDMON_DIR)\"
+DIRFLAGS += -DFAILED_SLOTS_DIR=\"$(FAILED_SLOTS_DIR)\"
+CFLAGS = $(CWFLAGS) $(CXFLAGS) -DSendmail=\""$(MAILCMD)"\" $(CONFFILEFLAGS) $(DIRFLAGS) $(COROSYNC) $(DLM)
+
+VERSION = $(shell [ -d .git ] && git describe HEAD | sed 's/mdadm-//')
+VERS_DATE = $(shell [ -d .git ] && date --iso-8601 --date="`git log -n1 --format=format:%cd --date=iso --date=short`")
+DVERS = $(if $(VERSION),-DVERSION=\"$(VERSION)\",)
+DDATE = $(if $(VERS_DATE),-DVERS_DATE="\"$(VERS_DATE)\"",)
+DEXTRAVERSION = $(if $(EXTRAVERSION),-DEXTRAVERSION="\" - $(EXTRAVERSION)\"",)
+CFLAGS += $(DVERS) $(DDATE) $(DEXTRAVERSION)
+
+# The glibc TLS ABI requires applications that call clone(2) to set up
+# TLS data structures, use pthreads until mdmon implements this support
+USE_PTHREADS = 1
+ifdef USE_PTHREADS
+CFLAGS += -DUSE_PTHREADS
+MON_LDFLAGS += -pthread
+endif
+
+# If you want a static binary, you might uncomment these
+# LDFLAGS = -static
+# STRIP = -s
+LDLIBS = -ldl
+
+# To explicitly disable libudev, set -DNO_LIBUDEV in CXFLAGS
+ifeq (, $(findstring -DNO_LIBUDEV, $(CXFLAGS)))
+ LDLIBS += -ludev
+endif
+
+INSTALL = /usr/bin/install
+DESTDIR =
+BINDIR = /sbin
+MANDIR = /usr/share/man
+MAN4DIR = $(MANDIR)/man4
+MAN5DIR = $(MANDIR)/man5
+MAN8DIR = $(MANDIR)/man8
+
+UDEVDIR := $(shell $(PKG_CONFIG) --variable=udevdir udev 2>/dev/null)
+ifndef UDEVDIR
+ UDEVDIR = /lib/udev
+endif
+
+ifeq (,$(findstring s,$(MAKEFLAGS)))
+ ECHO=echo
+else
+ ECHO=:
+endif
+
+OBJS = mdadm.o config.o policy.o mdstat.o ReadMe.o uuid.o util.o maps.o lib.o \
+ Manage.o Assemble.o Build.o \
+ Create.o Detail.o Examine.o Grow.o Monitor.o dlink.o Kill.o Query.o \
+ Incremental.o Dump.o \
+ mdopen.o super0.o super1.o super-ddf.o super-intel.o bitmap.o \
+ super-mbr.o super-gpt.o \
+ restripe.o sysfs.o sha1.o mapfile.o crc32.o sg_io.o msg.o xmalloc.o \
+ platform-intel.o probe_roms.o crc32c.o
+
+CHECK_OBJS = restripe.o uuid.o sysfs.o maps.o lib.o xmalloc.o dlink.o
+
+SRCS = $(patsubst %.o,%.c,$(OBJS))
+
+INCL = mdadm.h part.h bitmap.h
+
+MON_OBJS = mdmon.o monitor.o managemon.o uuid.o util.o maps.o mdstat.o sysfs.o \
+ policy.o lib.o \
+ Kill.o sg_io.o dlink.o ReadMe.o super-intel.o \
+ super-mbr.o super-gpt.o \
+ super-ddf.o sha1.o crc32.o msg.o bitmap.o xmalloc.o \
+ platform-intel.o probe_roms.o crc32c.o
+
+MON_SRCS = $(patsubst %.o,%.c,$(MON_OBJS))
+
+STATICSRC = pwgr.c
+STATICOBJS = pwgr.o
+
+all : mdadm mdmon
+man : mdadm.man md.man mdadm.conf.man mdmon.man raid6check.man
+
+# Sanity check used as an order-only prerequisite of the mdadm and mdmon
+# targets: fail the build (exit 1) when the parent directory of $(RUN_DIR)
+# does not exist, unless CHECK_RUN_DIR is set to something other than 1.
+check_rundir:
+ @if [ ! -d "$(dir $(RUN_DIR))" -a "$(CHECK_RUN_DIR)" = 1 ]; then \
+ echo "***** Parent of $(RUN_DIR) does not exist. Maybe set different RUN_DIR="; \
+ echo "***** e.g. make RUN_DIR=/dev/.mdadm" ; \
+ echo "***** or set CHECK_RUN_DIR=0"; exit 1; \
+ fi
+
+everything: all mdadm.static swap_super test_stripe raid6check \
+ mdadm.Os mdadm.O2 man
+everything-test: all mdadm.static swap_super test_stripe \
+ mdadm.Os mdadm.O2 man
+# mdadm.uclibc doesn't work on x86-64
+# mdadm.tcc doesn't work..
+
+%.o: %.c
+ $(CC) $(CFLAGS) $(CPPFLAGS) $(COVERITY_FLAGS) -o $@ -c $<
+
+mdadm : $(OBJS) | check_rundir
+ $(CC) $(CFLAGS) $(LDFLAGS) -o mdadm $(OBJS) $(LDLIBS)
+
+mdadm.static : $(OBJS) $(STATICOBJS)
+ $(CC) $(CFLAGS) $(LDFLAGS) -static -o mdadm.static $(OBJS) $(STATICOBJS) $(LDLIBS)
+
+mdadm.tcc : $(SRCS) $(INCL)
+ $(TCC) -o mdadm.tcc $(SRCS)
+
+mdadm.klibc : $(SRCS) $(INCL)
+ rm -f $(OBJS)
+ $(CC) -nostdinc -iwithprefix include -I$(KLIBC)/klibc/include -I$(KLIBC)/linux/include -I$(KLIBC)/klibc/arch/i386/include -I$(KLIBC)/klibc/include/bits32 $(CFLAGS) $(SRCS)
+
+mdadm.Os : $(SRCS) $(INCL)
+ $(CC) -o mdadm.Os $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) -DHAVE_STDINT_H -Os $(SRCS) $(LDLIBS)
+
+mdadm.O2 : $(SRCS) $(INCL) mdmon.O2
+ $(CC) -o mdadm.O2 $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) -DHAVE_STDINT_H -O2 -D_FORTIFY_SOURCE=2 $(SRCS) $(LDLIBS)
+
+mdmon.O2 : $(MON_SRCS) $(INCL) mdmon.h
+ $(CC) -o mdmon.O2 $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $(MON_LDFLAGS) -DHAVE_STDINT_H -O2 -D_FORTIFY_SOURCE=2 $(MON_SRCS) $(LDLIBS)
+
+# use '-z now' to guarantee no dynamic linker interactions with the monitor thread
+mdmon : $(MON_OBJS) | check_rundir
+ $(CC) $(CFLAGS) $(LDFLAGS) $(MON_LDFLAGS) -Wl,-z,now -o mdmon $(MON_OBJS) $(LDLIBS)
+msg.o: msg.c msg.h
+
+test_stripe : restripe.c xmalloc.o mdadm.h
+ $(CC) $(CFLAGS) $(CXFLAGS) $(LDFLAGS) -o test_stripe xmalloc.o -DMAIN restripe.c
+
+raid6check : raid6check.o mdadm.h $(CHECK_OBJS)
+ $(CC) $(CXFLAGS) $(LDFLAGS) -o raid6check raid6check.o $(CHECK_OBJS)
+
+mdadm.8 : mdadm.8.in
+ sed -e 's/{DEFAULT_METADATA}/$(DEFAULT_METADATA)/g' \
+ -e 's,{MAP_PATH},$(MAP_PATH),g' mdadm.8.in > mdadm.8
+
+mdadm.man : mdadm.8
+ man -l mdadm.8 > mdadm.man
+
+mdmon.man : mdmon.8
+ man -l mdmon.8 > mdmon.man
+
+md.man : md.4
+ man -l md.4 > md.man
+
+mdadm.conf.man : mdadm.conf.5
+ man -l mdadm.conf.5 > mdadm.conf.man
+
+raid6check.man : raid6check.8
+ man -l raid6check.8 > raid6check.man
+
+$(OBJS) : $(INCL) mdmon.h
+$(MON_OBJS) : $(INCL) mdmon.h
+
+sha1.o : sha1.c sha1.h md5.h
+ $(CC) $(CFLAGS) -DHAVE_STDINT_H -o sha1.o -c sha1.c
+
+install : install-bin install-man install-udev
+
+install-static : mdadm.static install-man
+ $(INSTALL) -D $(STRIP) -m 755 mdadm.static $(DESTDIR)$(BINDIR)/mdadm
+
+install-tcc : mdadm.tcc install-man
+ $(INSTALL) -D $(STRIP) -m 755 mdadm.tcc $(DESTDIR)$(BINDIR)/mdadm
+
+install-uclibc : mdadm.uclibc install-man
+ $(INSTALL) -D $(STRIP) -m 755 mdadm.uclibc $(DESTDIR)$(BINDIR)/mdadm
+
+install-klibc : mdadm.klibc install-man
+ $(INSTALL) -D $(STRIP) -m 755 mdadm.klibc $(DESTDIR)$(BINDIR)/mdadm
+
+install-man: mdadm.8 md.4 mdadm.conf.5 mdmon.8
+ $(INSTALL) -D -m 644 mdadm.8 $(DESTDIR)$(MAN8DIR)/mdadm.8
+ $(INSTALL) -D -m 644 mdmon.8 $(DESTDIR)$(MAN8DIR)/mdmon.8
+ $(INSTALL) -D -m 644 md.4 $(DESTDIR)$(MAN4DIR)/md.4
+ $(INSTALL) -D -m 644 mdadm.conf.5 $(DESTDIR)$(MAN5DIR)/mdadm.conf.5
+
+# Install the udev rules under $(DESTDIR)$(UDEVDIR)/rules.d.
+# Each installed name is "NN-<name>.rules"; $${file#??-} strips the two
+# character prefix and the dash to find the source file "udev-<name>.rules".
+# The text BINDIR in each source file is replaced by $(BINDIR) into a
+# temporary file, which is what actually gets installed.  $(ECHO) prints the
+# equivalent install command (it is ":" when make runs with -s).
+install-udev: udev-md-raid-arrays.rules udev-md-raid-assembly.rules udev-md-raid-creating.rules \
+ udev-md-clustered-confirm-device.rules
+ @for file in 01-md-raid-creating.rules 63-md-raid-arrays.rules 64-md-raid-assembly.rules \
+ 69-md-clustered-confirm-device.rules ; \
+ do sed -e 's,BINDIR,$(BINDIR),g' udev-$${file#??-} > .install.tmp.1 && \
+ $(ECHO) $(INSTALL) -D -m 644 udev-$${file#??-} $(DESTDIR)$(UDEVDIR)/rules.d/$$file ; \
+ $(INSTALL) -D -m 644 .install.tmp.1 $(DESTDIR)$(UDEVDIR)/rules.d/$$file ; \
+ rm -f .install.tmp.1; \
+ done
+
+# Install the systemd units from systemd/ into $(DESTDIR)$(SYSTEMD_DIR)
+# (mode 644) and mdadm.shutdown into $(DESTDIR)$(SYSTEMD_DIR)-shutdown
+# (mode 755).  In every file the text BINDIR is replaced by $(BINDIR) via a
+# temporary copy, and that copy is what gets installed.
+# The SUSE environment script is installed only when /etc/SuSE-release
+# exists or the SUSE make variable is non-empty.
+# NOTE(review): only systemd/mdmon@.service is listed as a prerequisite,
+# although many more files are read by the recipe.
+install-systemd: systemd/mdmon@.service
+ @for file in mdmon@.service mdmonitor.service mdadm-last-resort@.timer \
+ mdadm-last-resort@.service mdadm-grow-continue@.service \
+ mdcheck_start.timer mdcheck_start.service \
+ mdcheck_continue.timer mdcheck_continue.service \
+ mdmonitor-oneshot.timer mdmonitor-oneshot.service \
+ ; \
+ do sed -e 's,BINDIR,$(BINDIR),g' systemd/$$file > .install.tmp.2 && \
+ $(ECHO) $(INSTALL) -D -m 644 systemd/$$file $(DESTDIR)$(SYSTEMD_DIR)/$$file ; \
+ $(INSTALL) -D -m 644 .install.tmp.2 $(DESTDIR)$(SYSTEMD_DIR)/$$file ; \
+ rm -f .install.tmp.2; \
+ done
+ @for file in mdadm.shutdown ; \
+ do sed -e 's,BINDIR,$(BINDIR),g' systemd/$$file > .install.tmp.3 && \
+ $(ECHO) $(INSTALL) -D -m 755 systemd/$$file $(DESTDIR)$(SYSTEMD_DIR)-shutdown/$$file ; \
+ $(INSTALL) -D -m 755 .install.tmp.3 $(DESTDIR)$(SYSTEMD_DIR)-shutdown/$$file ; \
+ rm -f .install.tmp.3; \
+ done
+ if [ -f /etc/SuSE-release -o -n "$(SUSE)" ] ;then $(INSTALL) -D -m 755 systemd/SUSE-mdadm_env.sh $(DESTDIR)$(LIB_DIR)/mdadm_env.sh ;fi
+
+install-bin: mdadm mdmon
+ $(INSTALL) -D $(STRIP) -m 755 mdadm $(DESTDIR)$(BINDIR)/mdadm
+ $(INSTALL) -D $(STRIP) -m 755 mdmon $(DESTDIR)$(BINDIR)/mdmon
+
+# Remove what install-bin and install-man put in place: both binaries
+# (mdadm and mdmon) and the four manual pages.  Files installed by
+# install-udev and install-systemd are not removed here.
+uninstall:
+	rm -f $(DESTDIR)$(MAN8DIR)/mdadm.8 $(DESTDIR)$(MAN8DIR)/mdmon.8 $(DESTDIR)$(MAN4DIR)/md.4 $(DESTDIR)$(MAN5DIR)/mdadm.conf.5 $(DESTDIR)$(BINDIR)/mdadm $(DESTDIR)$(BINDIR)/mdmon
+
+test: mdadm mdmon test_stripe swap_super raid6check
+ @echo "Please run './test' as root"
+
+clean :
+ rm -f mdadm mdmon $(OBJS) $(MON_OBJS) $(STATICOBJS) core *.man \
+ mdadm.tcc mdadm.uclibc mdadm.static *.orig *.porig *.rej *.alt \
+ .merge_file_* mdadm.Os mdadm.O2 mdmon.O2 swap_super init.cpio.gz \
+ mdadm.uclibc.static test_stripe raid6check raid6check.o mdmon mdadm.8
+ rm -rf cov-int
+
+dist : clean
+ ./makedist
+
+testdist : everything-test clean
+ ./makedist test
+
+TAGS :
+ etags *.h *.c
+
+DISTRO_MAKEFILE := $(wildcard distropkg/Makefile)
+ifdef DISTRO_MAKEFILE
+include $(DISTRO_MAKEFILE)
+endif
diff --git a/Manage.c b/Manage.c
new file mode 100644
index 0000000..f789e0c
--- /dev/null
+++ b/Manage.c
@@ -0,0 +1,1767 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2001-2013 Neil Brown <neilb@suse.de>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email: <neilb@suse.de>
+ */
+
+#include "mdadm.h"
+#include "md_u.h"
+#include "md_p.h"
+#include <ctype.h>
+
+int Manage_ro(char *devname, int fd, int readonly)
+{
+	/* Switch an array to read-only or read-write.
+	 *
+	 * devname:  name used in error messages
+	 * fd:       open descriptor of the array
+	 * readonly: > 0 selects read-only.  For externally-managed
+	 *           member arrays any other value selects read-write;
+	 *           for other arrays < 0 selects read-write and 0
+	 *           changes nothing.
+	 *
+	 * Externally-managed member arrays are handled through sysfs;
+	 * note that 'fd' is closed on the read-only path for them.
+	 * All other arrays must be running and are switched with the
+	 * STOP_ARRAY_RO / RESTART_ARRAY_RW ioctls.
+	 *
+	 * Returns 0 on success, 1 on failure.
+	 */
+	struct mdinfo *mdi;
+	int rv = 0;
+
+	/* If this is an externally-managed array, we need to modify the
+	 * metadata_version so that mdmon doesn't undo our change.
+	 */
+	mdi = sysfs_read(fd, NULL, GET_LEVEL|GET_VERSION);
+	if (mdi &&
+	    mdi->array.major_version == -1 &&
+	    is_subarray(mdi->text_version)) {
+		char vers[64];
+
+		/* vers[9] is the first character of text_version; it is
+		 * rewritten below to '-' (read-only) or '/' (read-write).
+		 */
+		snprintf(vers, sizeof(vers), "external:%s", mdi->text_version);
+		if (readonly > 0) {
+			int err;
+
+			/* We set readonly ourselves. */
+			vers[9] = '-';
+			sysfs_set_str(mdi, NULL, "metadata_version", vers);
+
+			close(fd);
+			err = sysfs_set_str(mdi, NULL, "array_state", "readonly");
+
+			if (err < 0) {
+				pr_err("failed to set readonly for %s: %s\n",
+				       devname, strerror(errno));
+
+				/* put the original metadata_version back */
+				vers[9] = mdi->text_version[0];
+				sysfs_set_str(mdi, NULL, "metadata_version", vers);
+				/* Report the failure through the function's
+				 * own 'rv'; a shadowing local here used to
+				 * make this path return 0.
+				 */
+				rv = 1;
+				goto out;
+			}
+		} else {
+			char *cp;
+
+			/* We cannot set read/write - must signal mdmon */
+			vers[9] = '/';
+			sysfs_set_str(mdi, NULL, "metadata_version", vers);
+
+			/* vers+10 starts just after that '/'; cut at the
+			 * next '/' so only the first path component is
+			 * handed to ping_monitor().
+			 */
+			cp = strchr(vers+10, '/');
+			if (cp)
+				*cp = 0;
+			ping_monitor(vers+10);
+			if (mdi->array.level <= 0)
+				sysfs_set_str(mdi, NULL, "array_state", "active");
+		}
+		goto out;
+	}
+
+	if (!md_array_active(fd)) {
+		pr_err("%s does not appear to be active.\n", devname);
+		rv = 1;
+		goto out;
+	}
+
+	if (readonly > 0) {
+		if (ioctl(fd, STOP_ARRAY_RO, NULL)) {
+			pr_err("failed to set readonly for %s: %s\n",
+			       devname, strerror(errno));
+			rv = 1;
+			goto out;
+		}
+	} else if (readonly < 0) {
+		if (ioctl(fd, RESTART_ARRAY_RW, NULL)) {
+			pr_err("failed to set writable for %s: %s\n",
+			       devname, strerror(errno));
+			rv = 1;
+			goto out;
+		}
+	}
+out:
+	sysfs_free(mdi);
+	return rv;
+}
+
+static void remove_devices(char *devnm, char *path)
+{
+	/*
+	 * Remove names at 'path' - possibly with
+	 * partition suffixes - which link to the 'standard'
+	 * name for devnm.  These were probably created
+	 * by mdadm when the array was assembled.
+	 *
+	 * 'path' itself and the names for partitions 1..15 are checked.
+	 * A name is unlinked only if it is a symlink whose target is
+	 * exactly "/dev/<devnm>" (or "/dev/<devnm>pN" for partition N).
+	 * The partition name is 'path' + "pN" when 'path' ends in a
+	 * digit, otherwise 'path' + "N".
+	 * A NULL or empty 'path' is ignored.
+	 */
+	char base[40];
+	char *path2;
+	char link[1024];
+	size_t base_room;
+	int n;
+	int part;
+	int ends_in_digit;
+	char *be;
+	char *pe;
+
+	/* An empty path has no last character to inspect */
+	if (!path || !*path)
+		return;
+
+	snprintf(base, sizeof(base), "/dev/%s", devnm);
+	be = base + strlen(base);
+	base_room = sizeof(base) - (size_t)(be - base);
+
+	/* 20 spare bytes leave room for the "pNN" suffix */
+	path2 = xmalloc(strlen(path)+20);
+	strcpy(path2, path);
+	pe = path2 + strlen(path2);
+	/* isdigit() needs a value representable as unsigned char */
+	ends_in_digit = isdigit((unsigned char)pe[-1]);
+
+	for (part = 0; part < 16; part++) {
+		if (part) {
+			snprintf(be, base_room, "p%d", part);
+
+			if (ends_in_digit)
+				snprintf(pe, 20, "p%d", part);
+			else
+				snprintf(pe, 20, "%d", part);
+		}
+		/* readlink() does not NUL-terminate, so compare by length */
+		n = readlink(path2, link, sizeof(link));
+		if (n > 0 && (int)strlen(base) == n &&
+		    strncmp(link, base, n) == 0)
+			unlink(path2);
+	}
+	free(path2);
+}
+
+int Manage_run(char *devname, int fd, struct context *c)
+{
+	/* Start the array open on 'fd': look up its kernel device name
+	 * and hand that to IncrementalScan().  'devname' is only used
+	 * in the error message.
+	 *
+	 * Returns 1 if the device name cannot be determined, otherwise
+	 * whatever IncrementalScan() returns.
+	 */
+	char devnm_copy[32];
+	char *found = fd2devnm(fd);
+
+	if (found == NULL) {
+		pr_err("Cannot find %s in sysfs!!\n", devname);
+		return 1;
+	}
+
+	/* work on a private copy of the name */
+	strcpy(devnm_copy, found);
+	return IncrementalScan(c, devnm_copy);
+}
+
+int Manage_stop(char *devname, int fd, int verbose, int will_retry)
+{
+ /* Stop the array. Array must already be configured
+ * 'will_retry' means that error messages are not wanted.
+ */
+ int rv = 0;
+ struct map_ent *map = NULL;
+ struct mdinfo *mdi;
+ char devnm[32];
+ char container[32];
+ int err;
+ int count;
+ char buf[32];
+ unsigned long long rd1, rd2;
+
+ if (will_retry && verbose == 0)
+ verbose = -1;
+
+ strcpy(devnm, fd2devnm(fd));
+ /* Get EXCL access first. If this fails, then attempting
+ * to stop is probably a bad idea.
+ */
+ mdi = sysfs_read(fd, NULL, GET_LEVEL|GET_COMPONENT|GET_VERSION);
+ if (mdi && is_subarray(mdi->text_version)) {
+ char *sl;
+ strncpy(container, mdi->text_version+1, sizeof(container));
+ container[sizeof(container)-1] = 0;
+ sl = strchr(container, '/');
+ if (sl)
+ *sl = 0;
+ } else
+ container[0] = 0;
+ close(fd);
+ count = 5;
+ while (((fd = ((devname[0] == '/')
+ ?open(devname, O_RDONLY|O_EXCL)
+ :open_dev_flags(devnm, O_RDONLY|O_EXCL))) < 0 ||
+ strcmp(fd2devnm(fd), devnm) != 0) && container[0] &&
+ mdmon_running(container) && count) {
+ /* Can't open, so something might be wrong. However it
+ * is a container, so we might be racing with mdmon, so
+ * retry for a bit.
+ */
+ if (fd >= 0)
+ close(fd);
+ flush_mdmon(container);
+ count--;
+ }
+ if (fd < 0 || strcmp(fd2devnm(fd), devnm) != 0) {
+ if (fd >= 0)
+ close(fd);
+ if (verbose >= 0)
+ pr_err("Cannot get exclusive access to %s:Perhaps a running process, mounted filesystem or active volume group?\n",
+ devname);
+ return 1;
+ }
+ /* If this is an mdmon managed array, just write 'inactive'
+ * to the array state and let mdmon clear up.
+ */
+ if (mdi &&
+ mdi->array.level > 0 &&
+ is_subarray(mdi->text_version)) {
+ int err;
+ /* This is mdmon managed. */
+ close(fd);
+
+ /* As we had an O_EXCL open, any use of the device
+ * which blocks STOP_ARRAY is probably a transient use,
+ * so it is reasonable to retry for a while - 5 seconds.
+ */
+ count = 25;
+ while (count &&
+ (err = sysfs_set_str(mdi, NULL,
+ "array_state",
+ "inactive")) < 0 &&
+ errno == EBUSY) {
+ usleep(200000);
+ count--;
+ }
+ if (err) {
+ if (verbose >= 0)
+ pr_err("failed to stop array %s: %s\n",
+ devname, strerror(errno));
+ rv = 1;
+ goto out;
+ }
+
+ /* Give monitor a chance to act */
+ ping_monitor(mdi->text_version);
+
+ fd = open_dev_excl(devnm);
+ if (fd < 0) {
+ if (verbose >= 0)
+ pr_err("failed to completely stop %s: Device is busy\n",
+ devname);
+ rv = 1;
+ goto out;
+ }
+ } else if (mdi &&
+ mdi->array.major_version == -1 &&
+ mdi->array.minor_version == -2 &&
+ !is_subarray(mdi->text_version)) {
+ struct mdstat_ent *mds, *m;
+ /* container, possibly mdmon-managed.
+ * Make sure mdmon isn't opening it, which
+ * would interfere with the 'stop'
+ */
+ ping_monitor(mdi->sys_name);
+
+ /* now check that there are no existing arrays
+ * which are members of this array
+ */
+ mds = mdstat_read(0, 0);
+ for (m = mds; m; m = m->next)
+ if (m->metadata_version &&
+ strncmp(m->metadata_version, "external:", 9)==0 &&
+ metadata_container_matches(m->metadata_version+9,
+ devnm)) {
+ if (verbose >= 0)
+ pr_err("Cannot stop container %s: member %s still active\n",
+ devname, m->devnm);
+ free_mdstat(mds);
+ rv = 1;
+ goto out;
+ }
+ }
+
+ /* If the array is undergoing a reshape which changes the number
+ * of devices, then it would be nice to stop it at a point where
+ * it has completed a full number of stripes in both old and
+ * new layouts as this will allow the reshape to be reverted.
+ * So if 'sync_action' is "reshape" and 'raid_disks' shows two
+ * different numbers, then
+ * - freeze reshape
+ * - set sync_max to next multiple of both data_disks and
+ * chunk sizes (or next but one)
+ * - unfreeze reshape
+ * - wait on 'sync_completed' for that point to be reached.
+ */
+ if (mdi && (mdi->array.level >= 4 && mdi->array.level <= 6) &&
+ sysfs_attribute_available(mdi, NULL, "sync_action") &&
+ sysfs_attribute_available(mdi, NULL, "reshape_direction") &&
+ sysfs_get_str(mdi, NULL, "sync_action", buf, 20) > 0 &&
+ strcmp(buf, "reshape\n") == 0 &&
+ sysfs_get_two(mdi, NULL, "raid_disks", &rd1, &rd2) == 2) {
+ unsigned long long position, curr;
+ unsigned long long chunk1, chunk2;
+ unsigned long long rddiv, chunkdiv;
+ unsigned long long sectors;
+ unsigned long long sync_max, old_sync_max;
+ unsigned long long completed;
+ int backwards = 0;
+ int delay;
+ int scfd;
+
+ delay = 40;
+ while (rd1 > rd2 && delay > 0 &&
+ sysfs_get_ll(mdi, NULL, "sync_max", &old_sync_max) == 0) {
+ /* must be in the critical section - wait a bit */
+ delay -= 1;
+ usleep(100000);
+ }
+
+ if (sysfs_set_str(mdi, NULL, "sync_action", "frozen") != 0)
+ goto done;
+ /* Array is frozen */
+
+ rd1 -= mdi->array.level == 6 ? 2 : 1;
+ rd2 -= mdi->array.level == 6 ? 2 : 1;
+ sysfs_get_str(mdi, NULL, "reshape_direction", buf, sizeof(buf));
+ if (strncmp(buf, "back", 4) == 0)
+ backwards = 1;
+ if (sysfs_get_ll(mdi, NULL, "reshape_position", &position) != 0) {
+ /* reshape must have finished now */
+ sysfs_set_str(mdi, NULL, "sync_action", "idle");
+ goto done;
+ }
+ sysfs_get_two(mdi, NULL, "chunk_size", &chunk1, &chunk2);
+ chunk1 /= 512;
+ chunk2 /= 512;
+ rddiv = GCD(rd1, rd2);
+ chunkdiv = GCD(chunk1, chunk2);
+ sectors = (chunk1/chunkdiv) * chunk2 * (rd1/rddiv) * rd2;
+
+ if (backwards) {
+ /* Need to subtract 'reshape_position' from
+ * array size to get equivalent of sync_max.
+ * Size calculation based on raid5_size in kernel.
+ */
+ unsigned long long size = mdi->component_size;
+ size &= ~(chunk1-1);
+ size &= ~(chunk2-1);
+ /* rd1 must be smaller */
+ /* Reshape may have progressed further backwards than
+ * recorded, so target even further back (hence "-1")
+ */
+ position = (position / sectors - 1) * sectors;
+ /* rd1 is always the conversion factor between 'sync'
+ * position and 'reshape' position.
+ * We read 1 "new" stripe worth of data from wherever,
+ * and then write out that full stripe.
+ */
+ sync_max = size - position/rd1;
+ } else {
+ /* Reshape will very likely be beyond position, and it may
+ * be too late to stop at '+1', so aim for '+2'
+ */
+ position = (position / sectors + 2) * sectors;
+ sync_max = position/rd1;
+ }
+ if (sysfs_get_ll(mdi, NULL, "sync_max", &old_sync_max) < 0)
+ old_sync_max = mdi->component_size;
+ /* Must not advance sync_max as that could confuse
+ * the reshape monitor */
+ if (sync_max < old_sync_max)
+ sysfs_set_num(mdi, NULL, "sync_max", sync_max);
+ sysfs_set_str(mdi, NULL, "sync_action", "idle");
+
+ /* That should have set things going again. Now we
+ * wait a little while (3 second max) for sync_completed
+ * to reach the target.
+ * The reshape process can block for 500msec if
+ * the sync speed limit is hit, so we need to wait
+ * a lot longer than that. 1 second is usually
+ * enough. 3 is safe.
+ */
+ delay = 3000;
+ scfd = sysfs_open(mdi->sys_name, NULL, "sync_completed");
+ while (scfd >= 0 && delay > 0 && old_sync_max > 0) {
+ unsigned long long max_completed;
+ sysfs_get_ll(mdi, NULL, "reshape_position", &curr);
+ sysfs_fd_get_str(scfd, buf, sizeof(buf));
+ if (strncmp(buf, "none", 4) == 0) {
+ /* Either reshape has aborted, or hasn't
+ * quite started yet. Wait a bit and
+ * check 'sync_action' to see.
+ */
+ usleep(10000);
+ sysfs_get_str(mdi, NULL, "sync_action", buf, sizeof(buf));
+ if (strncmp(buf, "reshape", 7) != 0)
+ break;
+ }
+
+ if (sysfs_fd_get_two(scfd, &completed,
+ &max_completed) == 2 &&
+ /* 'completed' sometimes reads as max-uulong */
+ completed < max_completed &&
+ (completed > sync_max ||
+ (completed == sync_max && curr != position))) {
+ while (completed > sync_max) {
+ sync_max += sectors / rd1;
+ if (backwards)
+ position -= sectors;
+ else
+ position += sectors;
+ }
+ if (sync_max < old_sync_max)
+ sysfs_set_num(mdi, NULL, "sync_max", sync_max);
+ }
+
+ if (!backwards && curr >= position)
+ break;
+ if (backwards && curr <= position)
+ break;
+ sysfs_wait(scfd, &delay);
+ }
+ if (scfd >= 0)
+ close(scfd);
+
+ }
+done:
+
+ /* As we have an O_EXCL open, any use of the device
+ * which blocks STOP_ARRAY is probably a transient use,
+ * so it is reasonable to retry for a while - 5 seconds.
+ */
+ count = 25; err = 0;
+ while (count && fd >= 0 &&
+ (err = ioctl(fd, STOP_ARRAY, NULL)) < 0 && errno == EBUSY) {
+ usleep(200000);
+ count --;
+ }
+ if (fd >= 0 && err) {
+ if (verbose >= 0) {
+ pr_err("failed to stop array %s: %s\n",
+ devname, strerror(errno));
+ if (errno == EBUSY)
+ cont_err("Perhaps a running process, mounted filesystem or active volume group?\n");
+ }
+ rv = 1;
+ goto out;
+ }
+
+ if (get_linux_version() < 2006028) {
+ /* prior to 2.6.28, KOBJ_CHANGE was not sent when an md array
+ * was stopped, so We'll do it here just to be sure. Drop any
+ * partitions as well...
+ */
+ if (fd >= 0)
+ ioctl(fd, BLKRRPART, 0);
+ if (mdi)
+ sysfs_uevent(mdi, "change");
+ }
+
+ if (devnm[0] && use_udev()) {
+ struct map_ent *mp = map_by_devnm(&map, devnm);
+ remove_devices(devnm, mp ? mp->path : NULL);
+ }
+
+ if (verbose >= 0)
+ pr_err("stopped %s\n", devname);
+ map_lock(&map);
+ map_remove(&map, devnm);
+ map_unlock(&map);
+out:
+ sysfs_free(mdi);
+
+ return rv;
+}
+
+/* Create a zero-filled list entry naming 'name' with disposition 'disp',
+ * link it into the list directly behind 'dv' and hand back the new entry.
+ */
+static struct mddev_dev *add_one(struct mddev_dev *dv, char *name, char disp)
+{
+	struct mddev_dev *entry = xmalloc(sizeof(*entry));
+
+	memset(entry, 0, sizeof(*entry));
+	entry->disposition = disp;
+	entry->devname = xstrdup(name);
+
+	/* splice in immediately after dv */
+	entry->next = dv->next;
+	dv->next = entry;
+
+	return entry;
+}
+
+/* Scan the device slots of the array open on 'fd'.  For each present
+ * device whose state has bit 0 (faulty) set, insert a "major:minor"
+ * entry with disposition 'disp' into the list after 'dv'.
+ * Nothing is added if the array info cannot be read.
+ */
+static void add_faulty(struct mddev_dev *dv, int fd, char disp)
+{
+	mdu_array_info_t array;
+	mdu_disk_info_t disk;
+	int todo;
+	int slot;
+
+	if (md_get_array_info(fd, &array) != 0)
+		return;
+
+	/* stop early once every disk the array reports has been seen */
+	todo = array.nr_disks;
+	for (slot = 0; slot < MAX_DISKS && todo > 0; slot++) {
+		char name[40];
+
+		disk.number = slot;
+		if (md_get_disk_info(fd, &disk) != 0)
+			continue;
+		if (disk.major == 0 && disk.minor == 0)
+			continue;
+		todo--;
+		if (disk.state & 1) {
+			/* faulty */
+			sprintf(name, "%d:%d", disk.major, disk.minor);
+			dv = add_one(dv, name, disp);
+		}
+	}
+}
+
+/* Scan the device slots of the array open on 'fd' and insert, after
+ * 'dv', a "major:minor" entry with disposition 'disp' for every present
+ * device that can no longer be opened because open fails with ENXIO.
+ * When 'disp' is 'f', devices already marked faulty (state bit 0) are
+ * left out.
+ */
+static void add_detached(struct mddev_dev *dv, int fd, char disp)
+{
+	mdu_array_info_t array;
+	mdu_disk_info_t disk;
+	int todo;
+	int slot;
+
+	if (md_get_array_info(fd, &array) != 0)
+		return;
+
+	todo = array.nr_disks;
+	for (slot = 0; slot < MAX_DISKS && todo > 0; slot++) {
+		char name[40];
+		int probe;
+
+		disk.number = slot;
+		if (md_get_disk_info(fd, &disk) != 0)
+			continue;
+		if (disk.major == 0 && disk.minor == 0)
+			continue;
+		todo--;
+		if (disp == 'f' && (disk.state & 1) != 0)
+			/* already faulty */
+			continue;
+
+		sprintf(name, "%d:%d", disk.major, disk.minor);
+		probe = dev_open(name, O_RDONLY);
+		if (probe >= 0) {
+			/* Not detached */
+			close(probe);
+			continue;
+		}
+		/* Any errno other than ENXIO: probably not detached */
+		if (errno == ENXIO)
+			dv = add_one(dv, name, disp);
+	}
+}
+
+/* For the RAID10 array open on 'fd', insert after 'dv' a "major:minor"
+ * entry for every present device belonging to the copy-set named by
+ * 'set_char' ('A' for set 0, 'B' for set 1, ...).  New entries inherit
+ * the disposition of 'dv'.
+ *
+ * The number of copies is the product of the two low bytes of the
+ * layout word.  Nothing is added when the array info cannot be read,
+ * the array is not level 10, the copy count is zero, or raid_disks is
+ * not a multiple of the copy count.
+ */
+static void add_set(struct mddev_dev *dv, int fd, char set_char)
+{
+	mdu_array_info_t array;
+	mdu_disk_info_t disk;
+	int remaining_disks;
+	int copies, set;
+	int i;
+
+	if (md_get_array_info(fd, &array) != 0)
+		return;
+	if (array.level != 10)
+		return;
+	copies = ((array.layout & 0xff) *
+		  ((array.layout >> 8) & 0xff));
+	/* A layout with a zero byte would make the '%' below divide by zero */
+	if (copies <= 0)
+		return;
+	if (array.raid_disks % copies)
+		return;
+
+	remaining_disks = array.nr_disks;
+	for (i = 0; i < MAX_DISKS && remaining_disks > 0; i++) {
+		char buf[40];
+
+		disk.number = i;
+		if (md_get_disk_info(fd, &disk) != 0)
+			continue;
+		if (disk.major == 0 && disk.minor == 0)
+			continue;
+		remaining_disks--;
+		set = disk.raid_disk % copies;
+		if (set_char != set + 'A')
+			continue;
+		sprintf(buf, "%d:%d", disk.major, disk.minor);
+		dv = add_one(dv, buf, dv->disposition);
+	}
+}
+
+int attempt_re_add(int fd, int tfd, struct mddev_dev *dv,
+ struct supertype *dev_st, struct supertype *tst,
+ unsigned long rdev,
+ char *update, char *devname, int verbose,
+ mdu_array_info_t *array) /* returns 1 if the device was re-added, -1 on a reported error, 0 if no re-add happened */
+{
+ struct mdinfo mdi;
+ int duuid[4];
+ int ouuid[4];
+
+ dev_st->ss->getinfo_super(dev_st, &mdi, NULL); /* mdi: what the candidate device's own superblock says */
+ dev_st->ss->uuid_from_super(dev_st, ouuid); /* ouuid: uuid stored on the candidate device */
+ if (tst->sb)
+ tst->ss->uuid_from_super(tst, duuid); /* duuid: uuid of the array metadata loaded in tst */
+ else
+ /* Assume uuid matches: kernel will check */
+ memcpy(duuid, ouuid, sizeof(ouuid));
+ if ((mdi.disk.state & (1<<MD_DISK_ACTIVE)) &&
+ !(mdi.disk.state & (1<<MD_DISK_FAULTY)) &&
+ memcmp(duuid, ouuid, sizeof(ouuid))==0) {
+ /* Looks like it is worth a
+ * try. Need to make sure
+ * kernel will accept it
+ * though.
+ */
+ mdu_disk_info_t disc;
+ /* re-add doesn't work for version-1 superblocks
+ * before 2.6.18 :-(
+ */
+ if (array->major_version == 1 &&
+ get_linux_version() <= 2006018)
+ goto skip_re_add;
+ disc.number = mdi.disk.number;
+ if (md_get_disk_info(fd, &disc) != 0 ||
+ disc.major != 0 || disc.minor != 0) /* the slot number the device claims must be readable and currently 0:0 */
+ goto skip_re_add;
+ disc.major = major(rdev);
+ disc.minor = minor(rdev);
+ disc.number = mdi.disk.number;
+ disc.raid_disk = mdi.disk.raid_disk;
+ disc.state = mdi.disk.state;
+ if (array->state & (1 << MD_SB_CLUSTERED)) {
+ /* extra flags are needed when adding to a cluster as
+ * there are two cases to distinguish
+ */
+ if (dv->disposition == 'c')
+ disc.state |= (1 << MD_DISK_CANDIDATE);
+ else
+ disc.state |= (1 << MD_DISK_CLUSTER_ADD);
+ }
+ if (dv->writemostly == FlagSet)
+ disc.state |= 1 << MD_DISK_WRITEMOSTLY;
+ if (dv->writemostly == FlagClear)
+ disc.state &= ~(1 << MD_DISK_WRITEMOSTLY);
+ if (dv->failfast == FlagSet)
+ disc.state |= 1 << MD_DISK_FAILFAST;
+ if (dv->failfast == FlagClear)
+ disc.state &= ~(1 << MD_DISK_FAILFAST);
+ remove_partitions(tfd);
+ if (update || dv->writemostly != FlagDefault ||
+ dv->failfast != FlagDefault) { /* the on-disk superblock must be rewritten before the re-add */
+ int rv = -1;
+ tfd = dev_open(dv->devname, O_RDWR); /* only the local copy of tfd changes; this new descriptor is closed below */
+ if (tfd < 0) {
+ pr_err("failed to open %s for superblock update during re-add\n", dv->devname);
+ return -1;
+ }
+
+ if (dv->writemostly == FlagSet)
+ rv = dev_st->ss->update_super(
+ dev_st, NULL, "writemostly",
+ devname, verbose, 0, NULL);
+ if (dv->writemostly == FlagClear)
+ rv = dev_st->ss->update_super(
+ dev_st, NULL, "readwrite",
+ devname, verbose, 0, NULL);
+ if (dv->failfast == FlagSet)
+ rv = dev_st->ss->update_super(
+ dev_st, NULL, "failfast",
+ devname, verbose, 0, NULL);
+ if (dv->failfast == FlagClear)
+ rv = dev_st->ss->update_super(
+ dev_st, NULL, "nofailfast",
+ devname, verbose, 0, NULL);
+ if (update) /* NOTE(review): each update_super call overwrites rv, so only the last result is checked */
+ rv = dev_st->ss->update_super(
+ dev_st, NULL, update,
+ devname, verbose, 0, NULL);
+ if (rv == 0)
+ rv = dev_st->ss->store_super(dev_st, tfd);
+ close(tfd);
+ if (rv != 0) {
+ pr_err("failed to update superblock during re-add\n");
+ return -1;
+ }
+ }
+ /* don't even try if disk is marked as faulty */
+ errno = 0;
+ if (ioctl(fd, ADD_NEW_DISK, &disc) == 0) {
+ if (verbose >= 0)
+ pr_err("re-added %s\n", dv->devname);
+ return 1;
+ }
+ if (errno == ENOMEM || errno == EROFS) { /* only these two errnos are reported; any other failure falls through to return 0 */
+ pr_err("add new device failed for %s: %s\n",
+ dv->devname, strerror(errno));
+ if (dv->disposition == 'M') /* 'M' entries report the failure but still return 0 */
+ return 0;
+ return -1;
+ }
+ }
+skip_re_add:
+ return 0;
+}
+
+/* Add (or re-add) the device described by 'dv' to the array open on 'fd'.
+ *
+ *  fd         - open descriptor on the array (or container)
+ *  tfd        - open descriptor on the device being added
+ *  dv         - list entry giving the device name, disposition and flags
+ *  tst        - metadata handler for the array; its superblock is loaded
+ *               here when needed and freed before the device is added
+ *  array      - array info as already fetched by the caller
+ *  force      - allow adding a >4TB device to a v0.90 array
+ *  verbose    - messages about success are printed when >= 0
+ *  devname    - name of the array, used in messages
+ *  update     - optional superblock update name passed on to the re-add path
+ *  rdev       - device number of the device being added
+ *  array_size - required size in 512-byte sectors; compared against the
+ *               usable size of the new device
+ *  raid_slot  - when >= 0, used as the disk number instead of the first
+ *               free number found
+ *
+ * Returns 1 when the device was added or re-added, 0 when a 'M'
+ * (missing) candidate was silently skipped, and -1 on error.
+ */
+int Manage_add(int fd, int tfd, struct mddev_dev *dv,
+	       struct supertype *tst, mdu_array_info_t *array,
+	       int force, int verbose, char *devname,
+	       char *update, unsigned long rdev, unsigned long long array_size,
+	       int raid_slot)
+{
+	unsigned long long ldsize;
+	struct supertype *dev_st;
+	int j;
+	mdu_disk_info_t disc;
+
+	if (!get_dev_size(tfd, dv->devname, &ldsize)) {
+		if (dv->disposition == 'M')
+			return 0;
+		else
+			return -1;
+	}
+
+	if (tst->ss == &super0 && ldsize > 4ULL*1024*1024*1024*1024) {
+		/* More than 4TB is wasted on v0.90 */
+		if (!force) {
+			pr_err("%s is larger than %s can effectively use.\n"
+			       " Add --force is you really want to add this device.\n",
+			       dv->devname, devname);
+			return -1;
+		}
+		pr_err("%s is larger than %s can effectively use.\n"
+		       " Adding anyway as --force was given.\n",
+		       dv->devname, devname);
+	}
+
+	if (array->not_persistent == 0 || tst->ss->external) {
+
+		/* need to find a sample superblock to copy, and
+		 * a spare slot to use.
+		 * For 'external' array (well, container based),
+		 * We can just load the metadata for the array->
+		 */
+		int array_failed;
+		if (tst->sb)
+			/* already loaded */;
+		else if (tst->ss->external) {
+			tst->ss->load_container(tst, fd, NULL);
+		} else for (j = 0; j < tst->max_devs; j++) {
+			char *dev;
+			int dfd;
+			disc.number = j;
+			if (md_get_disk_info(fd, &disc))
+				continue;
+			if (disc.major==0 && disc.minor==0)
+				continue;
+			if ((disc.state & 4)==0) /* sync */
+				continue;
+			/* Looks like a good device to try */
+			dev = map_dev(disc.major, disc.minor, 1);
+			if (!dev)
+				continue;
+			dfd = dev_open(dev, O_RDONLY);
+			if (dfd < 0)
+				continue;
+			if (tst->ss->load_super(tst, dfd,
+						NULL)) {
+				close(dfd);
+				continue;
+			}
+			close(dfd);
+			break;
+		}
+		/* FIXME this is a bad test to be using */
+		if (!tst->sb && (dv->disposition != 'a' &&
+				 dv->disposition != 'S')) {
+			/* we are re-adding a device to a
+			 * completely dead array - have to depend
+			 * on kernel to check
+			 */
+		} else if (!tst->sb) {
+			pr_err("cannot load array metadata from %s\n", devname);
+			return -1;
+		}
+
+		/* Make sure device is large enough */
+		if (dv->disposition != 'j' && /* skip size check for Journal */
+		    tst->sb &&
+		    tst->ss->avail_size(tst, ldsize/512, INVALID_SECTORS) <
+		    array_size) {
+			if (dv->disposition == 'M')
+				return 0;
+			pr_err("%s not large enough to join array\n",
+			       dv->devname);
+			return -1;
+		}
+
+		/* Possibly this device was recently part of
+		 * the array and was temporarily removed, and
+		 * is now being re-added. If so, we can
+		 * simply re-add it.
+		 */
+
+		if (array->not_persistent == 0) {
+			dev_st = dup_super(tst);
+			dev_st->ss->load_super(dev_st, tfd, NULL);
+			if (dev_st->sb && dv->disposition != 'S') {
+				int rv;
+
+				rv = attempt_re_add(fd, tfd, dv, dev_st, tst,
+						    rdev, update, devname,
+						    verbose, array);
+				dev_st->ss->free_super(dev_st);
+				if (rv)
+					return rv;
+			}
+		}
+		if (dv->disposition == 'M') {
+			if (verbose > 0)
+				pr_err("--re-add for %s to %s is not possible\n",
+				       dv->devname, devname);
+			return 0;
+		}
+		if (dv->disposition == 'A') {
+			pr_err("--re-add for %s to %s is not possible\n",
+			       dv->devname, devname);
+			return -1;
+		}
+		if (array->active_disks < array->raid_disks) {
+			/* Degraded: work out whether the array has in fact
+			 * failed, in which case a plain add must be refused.
+			 */
+			char *avail = xcalloc(array->raid_disks, 1);
+			int d;
+			int found = 0;
+
+			for (d = 0; d < MAX_DISKS && found < array->nr_disks; d++) {
+				disc.number = d;
+				if (md_get_disk_info(fd, &disc))
+					continue;
+				if (disc.major == 0 && disc.minor == 0)
+					continue;
+				if (!(disc.state & (1<<MD_DISK_SYNC)))
+					continue;
+				avail[disc.raid_disk] = 1;
+				found++;
+			}
+			array_failed = !enough(array->level, array->raid_disks,
+					       array->layout, 1, avail);
+			free(avail);
+		} else
+			array_failed = 0;
+		if (array_failed) {
+			pr_err("%s has failed so using --add cannot work and might destroy\n",
+			       devname);
+			pr_err("data on %s. You should stop the array and re-assemble it.\n",
+			       dv->devname);
+			return -1;
+		}
+	} else {
+		/* non-persistent. Must ensure that new drive
+		 * is at least array->size big.
+		 */
+		if (ldsize/512 < array_size) {
+			pr_err("%s not large enough to join array\n",
+			       dv->devname);
+			return -1;
+		}
+	}
+	/* committed to really trying this device now*/
+	remove_partitions(tfd);
+
+	/* in 2.6.17 and earlier, version-1 superblocks won't
+	 * use the number we write, but will choose a free number.
+	 * we must choose the same free number, which requires
+	 * starting at 'raid_disks' and counting up
+	 */
+	for (j = array->raid_disks; j < tst->max_devs; j++) {
+		disc.number = j;
+		if (md_get_disk_info(fd, &disc))
+			break;
+		if (disc.major==0 && disc.minor==0)
+			break;
+		if (disc.state & 8) /* removed */
+			break;
+	}
+	disc.major = major(rdev);
+	disc.minor = minor(rdev);
+	if (raid_slot < 0)
+		disc.number = j;
+	else
+		disc.number = raid_slot;
+	disc.state = 0;
+
+	/* only add journal to array that supports journaling */
+	if (dv->disposition == 'j') {
+		struct mdinfo *mdp;
+
+		mdp = sysfs_read(fd, NULL, GET_ARRAY_STATE);
+		if (!mdp) {
+			pr_err("%s unable to read array state.\n", devname);
+			return -1;
+		}
+
+		if (mdp->array_state != ARRAY_READONLY) {
+			sysfs_free(mdp);
+			pr_err("%s is not readonly, cannot add journal.\n", devname);
+			return -1;
+		}
+
+		sysfs_free(mdp);
+
+		disc.raid_disk = 0;
+	}
+
+	if (array->not_persistent==0) {
+		int dfd;
+		if (dv->disposition == 'j')
+			disc.state |= (1 << MD_DISK_JOURNAL) | (1 << MD_DISK_SYNC);
+		if (dv->writemostly == FlagSet)
+			disc.state |= 1 << MD_DISK_WRITEMOSTLY;
+		if (dv->failfast == FlagSet)
+			disc.state |= 1 << MD_DISK_FAILFAST;
+		dfd = dev_open(dv->devname, O_RDWR | O_EXCL|O_DIRECT);
+		if (tst->ss->add_to_super(tst, &disc, dfd,
+					  dv->devname, INVALID_SECTORS))
+			return -1;
+		if (tst->ss->write_init_super(tst))
+			return -1;
+	} else if (dv->disposition == 'A') {
+		/* this had better be raid1.
+		 * As we are "--re-add"ing we must find a spare slot
+		 * to fill.
+		 */
+		char *used = xcalloc(array->raid_disks, 1);
+		for (j = 0; j < tst->max_devs; j++) {
+			mdu_disk_info_t disc2;
+			disc2.number = j;
+			if (md_get_disk_info(fd, &disc2))
+				continue;
+			if (disc2.major==0 && disc2.minor==0)
+				continue;
+			if (disc2.state & 8) /* removed */
+				continue;
+			if (disc2.raid_disk < 0)
+				continue;
+			/* used[] has exactly raid_disks entries, so
+			 * raid_disk == raid_disks is already out of range
+			 */
+			if (disc2.raid_disk >= array->raid_disks)
+				continue;
+			used[disc2.raid_disk] = 1;
+		}
+		for (j = 0 ; j < array->raid_disks; j++)
+			if (!used[j]) {
+				disc.raid_disk = j;
+				disc.state |= (1<<MD_DISK_SYNC);
+				break;
+			}
+		free(used);
+	}
+
+	if (array->state & (1 << MD_SB_CLUSTERED)) {
+		if (dv->disposition == 'c')
+			disc.state |= (1 << MD_DISK_CANDIDATE);
+		else
+			disc.state |= (1 << MD_DISK_CLUSTER_ADD);
+	}
+
+	if (dv->writemostly == FlagSet)
+		disc.state |= (1 << MD_DISK_WRITEMOSTLY);
+	if (dv->failfast == FlagSet)
+		disc.state |= (1 << MD_DISK_FAILFAST);
+	if (tst->ss->external) {
+		/* add a disk
+		 * to an external metadata container */
+		struct mdinfo new_mdi;
+		struct mdinfo *sra;
+		int container_fd;
+		char devnm[32];
+		int dfd;
+
+		strcpy(devnm, fd2devnm(fd));
+
+		container_fd = open_dev_excl(devnm);
+		if (container_fd < 0) {
+			pr_err("add failed for %s: could not get exclusive access to container\n",
+			       dv->devname);
+			tst->ss->free_super(tst);
+			return -1;
+		}
+
+		/* Check if metadata handler is able to accept the drive */
+		if (!tst->ss->validate_geometry(tst, LEVEL_CONTAINER, 0, 1, NULL,
+						0, 0, dv->devname, NULL, 0, 1)) {
+			close(container_fd);
+			return -1;
+		}
+
+		Kill(dv->devname, NULL, 0, -1, 0);
+		dfd = dev_open(dv->devname, O_RDWR | O_EXCL|O_DIRECT);
+		if (tst->ss->add_to_super(tst, &disc, dfd,
+					  dv->devname, INVALID_SECTORS)) {
+			close(dfd);
+			close(container_fd);
+			return -1;
+		}
+		if (!mdmon_running(tst->container_devnm))
+			tst->ss->sync_metadata(tst);
+
+		sra = sysfs_read(container_fd, NULL, 0);
+		if (!sra) {
+			pr_err("add failed for %s: sysfs_read failed\n",
+			       dv->devname);
+			close(container_fd);
+			tst->ss->free_super(tst);
+			return -1;
+		}
+		sra->array.level = LEVEL_CONTAINER;
+		/* Need to set data_offset and component_size */
+		tst->ss->getinfo_super(tst, &new_mdi, NULL);
+		new_mdi.disk.major = disc.major;
+		new_mdi.disk.minor = disc.minor;
+		new_mdi.recovery_start = 0;
+		/* Make sure fds are closed as they are O_EXCL which
+		 * would block add_disk */
+		tst->ss->free_super(tst);
+		if (sysfs_add_disk(sra, &new_mdi, 0) != 0) {
+			pr_err("add new device to external metadata failed for %s\n", dv->devname);
+			close(container_fd);
+			sysfs_free(sra);
+			return -1;
+		}
+		ping_monitor(devnm);
+		sysfs_free(sra);
+		close(container_fd);
+	} else {
+		tst->ss->free_super(tst);
+		if (ioctl(fd, ADD_NEW_DISK, &disc)) {
+			if (dv->disposition == 'j')
+				pr_err("Failed to hot add %s as journal, "
+				       "please try restart %s.\n", dv->devname, devname);
+			else
+				pr_err("add new device failed for %s as %d: %s\n",
+				       dv->devname, j, strerror(errno));
+			return -1;
+		}
+		if (dv->disposition == 'j') {
+			pr_err("Journal added successfully, making %s read-write\n", devname);
+			if (Manage_ro(devname, fd, -1))
+				pr_err("Failed to make %s read-write\n", devname);
+		}
+
+	}
+	if (verbose >= 0)
+		pr_err("added %s\n", dv->devname);
+	return 1;
+}
+
+/* Hot-remove one device from the array (or container) open on 'fd'.
+ *
+ *  tst     - metadata handler; selects the external-metadata path
+ *  dv      - list entry for the device; only its name is used, in messages
+ *  sysfd   - when >= 0, an open sysfs 'state' file for the device, used
+ *            instead of 'rdev' because the device number is not known
+ *  rdev    - device number of the device to remove (0 if unknown)
+ *  force   - passed through to the hot-remove helpers
+ *  verbose - the success message is printed when >= 0
+ *  devname - name of the array, used in messages
+ *
+ * Returns 1 on success and -1 on failure.  The exclusive open taken on
+ * a container is released on every return path.
+ */
+int Manage_remove(struct supertype *tst, int fd, struct mddev_dev *dv,
+		  int sysfd, unsigned long rdev, int force, int verbose, char *devname)
+{
+	int lfd = -1;
+	int err;
+
+	if (tst->ss->external) {
+		/* To remove a device from a container, we must
+		 * check that it isn't in use in an array.
+		 * This involves looking in the 'holders'
+		 * directory - there must be just one entry,
+		 * the container.
+		 * To ensure that it doesn't get used as a
+		 * hot spare while we are checking, we
+		 * get an O_EXCL open on the container
+		 */
+		int ret;
+		char devnm[32];
+		char *name = fd2devnm(fd);
+
+		/* fd2devnm() can return NULL; never hand that to strcpy() */
+		if (!name) {
+			pr_err("unable to get container name\n");
+			return -1;
+		}
+		strcpy(devnm, name);
+		lfd = open_dev_excl(devnm);
+		if (lfd < 0) {
+			pr_err("Cannot get exclusive access to container - odd\n");
+			return -1;
+		}
+		/* We may not be able to check on holders in
+		 * sysfs, either because we don't have the dev num
+		 * (rdev == 0) or because the device has been detached
+		 * and the 'holders' directory no longer exists
+		 * (ret == -1). In that case, assume it is OK to
+		 * remove.
+		 */
+		if (rdev == 0)
+			ret = -1;
+		else {
+			/*
+			 * The drive has already been set to 'faulty', however
+			 * monitor might not have had time to process it and the
+			 * drive might still have an entry in the 'holders'
+			 * directory. Try a few times to avoid a false error
+			 */
+			int count = 20;
+
+			do {
+				ret = sysfs_unique_holder(devnm, rdev);
+				if (ret < 2)
+					break;
+				usleep(100 * 1000); /* 100ms */
+			} while (--count > 0);
+
+			if (ret == 0) {
+				pr_err("%s is not a member, cannot remove.\n",
+				       dv->devname);
+				close(lfd);
+				return -1;
+			}
+			if (ret >= 2) {
+				pr_err("%s is still in use, cannot remove.\n",
+				       dv->devname);
+				close(lfd);
+				return -1;
+			}
+		}
+	}
+	/* FIXME check that it is a current member */
+	if (sysfd >= 0) {
+		/* device has been removed and we don't know
+		 * the major:minor number
+		 */
+		err = sys_hot_remove_disk(sysfd, force);
+	} else {
+		err = hot_remove_disk(fd, rdev, force);
+		if (err && errno == ENODEV) {
+			/* Old kernels rejected this if no personality
+			 * is registered */
+			struct mdinfo *sra = sysfs_read(fd, NULL, GET_DEVS);
+			struct mdinfo *sdev = NULL;
+
+			if (sra)
+				sdev = sra->devs;
+			for ( ; sdev ; sdev = sdev->next)
+				if (sdev->disk.major == (int)major(rdev) &&
+				    sdev->disk.minor == (int)minor(rdev))
+					break;
+			if (sdev)
+				err = sysfs_set_str(sra, sdev,
+						    "state", "remove");
+			else
+				err = -1;
+			sysfs_free(sra);
+		}
+	}
+	if (err) {
+		pr_err("hot remove failed for %s: %s\n", dv->devname,
+		       strerror(errno));
+		if (lfd >= 0)
+			close(lfd);
+		return -1;
+	}
+	if (tst->ss->external) {
+		/*
+		 * Before dropping our exclusive open we make an
+		 * attempt at preventing mdmon from seeing an
+		 * 'add' event before reconciling this 'remove'
+		 * event.
+		 */
+		char *devnm = fd2devnm(fd);
+
+		if (!devnm) {
+			pr_err("unable to get container name\n");
+			/* do not leak the exclusive container open */
+			if (lfd >= 0)
+				close(lfd);
+			return -1;
+		}
+
+		ping_manager(devnm);
+	}
+	if (lfd >= 0)
+		close(lfd);
+	if (verbose >= 0)
+		pr_err("hot removed %s from %s\n",
+		       dv->devname, devname);
+	return 1;
+}
+
+/* Mark the device 'rdev' in the array open on 'fd' as wanting
+ * replacement by writing "want_replacement" to its sysfs state.
+ *
+ * Only native metadata is supported; external metadata is rejected.
+ * On success the first later entry in the 'dv' list with disposition
+ * 'W' is switched to 'w' and its 'used' field is set to the raid slot
+ * of the device being replaced.
+ *
+ * Returns 1 on success and -1 on failure.  The sysfs snapshot read
+ * here is freed on every return path.
+ */
+int Manage_replace(struct supertype *tst, int fd, struct mddev_dev *dv,
+		   unsigned long rdev, int verbose, char *devname)
+{
+	struct mdinfo *mdi, *di;
+	int slot;
+
+	if (tst->ss->external) {
+		pr_err("--replace only supported for native metadata (0.90 or 1.x)\n");
+		return -1;
+	}
+	/* Need to find the device in sysfs and add 'want_replacement' to the
+	 * status.
+	 */
+	mdi = sysfs_read(fd, NULL, GET_DEVS);
+	if (!mdi || !mdi->devs) {
+		pr_err("Cannot find status of %s to enable replacement - strange\n",
+		       devname);
+		sysfs_free(mdi);
+		return -1;
+	}
+	for (di = mdi->devs; di; di = di->next)
+		if (di->disk.major == (int)major(rdev) &&
+		    di->disk.minor == (int)minor(rdev))
+			break;
+	if (!di) {
+		sysfs_free(mdi);
+		pr_err("%s not found in %s so cannot --replace it\n",
+		       dv->devname, devname);
+		return -1;
+	}
+
+	slot = di->disk.raid_disk;
+	if (slot < 0) {
+		pr_err("%s is not active and so cannot be replaced.\n",
+		       dv->devname);
+		sysfs_free(mdi);
+		return -1;
+	}
+	if (sysfs_set_str(mdi, di, "state", "want_replacement")) {
+		sysfs_free(mdi);
+		pr_err("Failed to request replacement for %s\n",
+		       dv->devname);
+		return -1;
+	}
+	/* 'slot' was saved above, so the snapshot is no longer needed */
+	sysfs_free(mdi);
+
+	if (verbose >= 0)
+		pr_err("Marked %s (device %d in %s) for replacement\n",
+		       dv->devname, slot, devname);
+	/* If there is a matching 'with', we need to tell it which
+	 * raid disk
+	 */
+	while (dv && dv->disposition != 'W')
+		dv = dv->next;
+	if (dv) {
+		dv->disposition = 'w';
+		dv->used = slot;
+	}
+	return 1;
+}
+
+/* Make the device 'rdev' the preferred replacement for raid slot
+ * 'dv->used' in the array open on 'fd', by writing that slot number to
+ * the device's sysfs 'slot' attribute.
+ *
+ * The device must be present in the array, must not be faulty and must
+ * not already occupy a raid slot.
+ *
+ * Returns 1 on success and -1 on failure.  The sysfs snapshot read
+ * here is freed on every return path.
+ */
+int Manage_with(struct supertype *tst, int fd, struct mddev_dev *dv,
+		unsigned long rdev, int verbose, char *devname)
+{
+	struct mdinfo *mdi, *di;
+	int rv;
+
+	/* try to set 'slot' for 'rdev' in 'fd' to 'dv->used' */
+	mdi = sysfs_read(fd, NULL, GET_DEVS|GET_STATE);
+	if (!mdi || !mdi->devs) {
+		pr_err("Cannot find status of %s to enable replacement - strange\n",
+		       devname);
+		sysfs_free(mdi);
+		return -1;
+	}
+	for (di = mdi->devs; di; di = di->next)
+		if (di->disk.major == (int)major(rdev) &&
+		    di->disk.minor == (int)minor(rdev))
+			break;
+	if (!di) {
+		sysfs_free(mdi);
+		pr_err("%s not found in %s so cannot make it preferred replacement\n",
+		       dv->devname, devname);
+		return -1;
+	}
+
+	if (di->disk.state & (1<<MD_DISK_FAULTY)) {
+		pr_err("%s is faulty and cannot be a replacement\n",
+		       dv->devname);
+		sysfs_free(mdi);
+		return -1;
+	}
+	if (di->disk.raid_disk >= 0) {
+		pr_err("%s is active and cannot be a replacement\n",
+		       dv->devname);
+		sysfs_free(mdi);
+		return -1;
+	}
+	rv = sysfs_set_num(mdi, di, "slot", dv->used);
+	/* nothing below needs the snapshot any more */
+	sysfs_free(mdi);
+	if (rv) {
+		pr_err("Failed to set %s as preferred replacement.\n",
+		       dv->devname);
+		return -1;
+	}
+	if (verbose >= 0)
+		pr_err("Marked %s in %s as replacement for device %d\n",
+		       dv->devname, devname, dv->used);
+	return 1;
+}
+
+/* Apply the action recorded in each entry of 'devlist' to the array
+ * 'devname' open on 'fd'.
+ *
+ *  verbose - passed on to the per-action helpers
+ *  test    - changes the return value: with 'test' set, doing nothing
+ *            at all is reported as 2
+ *  update  - optional superblock update name used when re-adding
+ *  force   - passed on to the add and remove helpers
+ *
+ * Returns 0 on success, or 2 when 'test' is set and no action was
+ * taken.  On failure returns 1, except that a failed "set faulty" due
+ * to EBUSY returns 2 when 'test' is not set.
+ */
+int Manage_subdevs(char *devname, int fd,
+		   struct mddev_dev *devlist, int verbose, int test,
+		   char *update, int force)
+{
+	/* Do something to each dev.
+	 * devmode can be
+	 *  'a' - add the device
+	 *  'S' - add the device as a spare - don't try re-add
+	 *  'j' - add the device as a journal device
+	 *  'A' - re-add the device
+	 *  'r' - remove the device: HOT_REMOVE_DISK
+	 *        device can be 'faulty' or 'detached' in which case all
+	 *        matching devices are removed.
+	 *  'f' - set the device faulty SET_DISK_FAULTY
+	 *        device can be 'detached' in which case any device that
+	 *        is inaccessible will be marked faulty.
+	 *  'R' - mark this device as wanting replacement.
+	 *  'W' - this device is added if necessary and activated as
+	 *        a replacement for a previous 'R' device.
+	 * -----
+	 *  'w' - 'W' will be changed to 'w' when it is paired with
+	 *        a 'R' device.  If a 'W' is found while walking the list
+	 *        it must be unpaired, and is an error.
+	 *  'M' - this is created by a 'missing' target.  It is a slight
+	 *        variant on 'A'
+	 *  'F' - Another variant of 'A', where the device was faulty
+	 *        so must be removed from the array first.
+	 *  'c' - confirm the device as found (for clustered environments)
+	 *
+	 * For 'f' and 'r', the device can also be a kernel-internal
+	 * name such as 'sdb'.
+	 */
+	mdu_array_info_t array;
+	unsigned long long array_size;
+	struct mddev_dev *dv;
+	int tfd = -1;
+	struct supertype *tst;
+	char *subarray = NULL;
+	int sysfd = -1;
+	int count = 0; /* number of actions taken */
+	struct mdinfo info;
+	struct mdinfo devinfo;
+	int frozen = 0;
+	int busy = 0;
+	int raid_slot = -1;
+
+	if (sysfs_init(&info, fd, NULL)) {
+		pr_err("sysfs not availabile for %s\n", devname);
+		goto abort;
+	}
+
+	if (md_get_array_info(fd, &array)) {
+		pr_err("Cannot get array info for %s\n", devname);
+		goto abort;
+	}
+	/* array.size is only 32 bits and may be truncated.
+	 * So read from sysfs if possible, and record number of sectors
+	 */
+
+	array_size = get_component_size(fd);
+	if (array_size <= 0)
+		array_size = array.size * 2;
+
+	tst = super_by_fd(fd, &subarray);
+	if (!tst) {
+		pr_err("unsupport array - version %d.%d\n",
+		       array.major_version, array.minor_version);
+		goto abort;
+	}
+
+	for (dv = devlist; dv; dv = dv->next) {
+		dev_t rdev = 0; /* device to add/remove etc */
+		int rv;
+		int mj,mn;
+
+		raid_slot = -1;
+		if (dv->disposition == 'c') {
+			rv = parse_cluster_confirm_arg(dv->devname,
+						       &dv->devname,
+						       &raid_slot);
+			if (rv) {
+				pr_err("Could not get the devname of cluster\n");
+				goto abort;
+			}
+		}
+
+		if (strcmp(dv->devname, "failed") == 0 ||
+		    strcmp(dv->devname, "faulty") == 0) {
+			if (dv->disposition != 'A' && dv->disposition != 'r') {
+				pr_err("%s only meaningful with -r or --re-add, not -%c\n",
+				       dv->devname, dv->disposition);
+				goto abort;
+			}
+			/* expand the keyword into entries that follow dv */
+			add_faulty(dv, fd, (dv->disposition == 'A'
+					    ? 'F' : 'r'));
+			continue;
+		}
+		if (strcmp(dv->devname, "detached") == 0) {
+			if (dv->disposition != 'r' && dv->disposition != 'f') {
+				pr_err("%s only meaningful with -r of -f, not -%c\n",
+				       dv->devname, dv->disposition);
+				goto abort;
+			}
+			add_detached(dv, fd, dv->disposition);
+			continue;
+		}
+
+		if (strcmp(dv->devname, "missing") == 0) {
+			struct mddev_dev *add_devlist;
+			struct mddev_dev **dp;
+			if (dv->disposition == 'c') {
+				rv = ioctl(fd, CLUSTERED_DISK_NACK, NULL);
+				break;
+			}
+
+			if (dv->disposition != 'A') {
+				pr_err("'missing' only meaningful with --re-add\n");
+				goto abort;
+			}
+			add_devlist = conf_get_devs();
+			if (add_devlist == NULL) {
+				pr_err("no devices to scan for missing members.\n");
+				continue;
+			}
+			for (dp = &add_devlist; *dp; dp = & (*dp)->next)
+				/* 'M' (for 'missing') is like 'A' without errors */
+				(*dp)->disposition = 'M';
+			*dp = dv->next;
+			dv->next = add_devlist;
+			continue;
+		}
+
+		if (strncmp(dv->devname, "set-", 4) == 0 &&
+		    strlen(dv->devname) == 5) {
+			int copies;
+
+			if (dv->disposition != 'r' &&
+			    dv->disposition != 'f') {
+				pr_err("'%s' only meaningful with -r or -f\n",
+				       dv->devname);
+				goto abort;
+			}
+			if (array.level != 10) {
+				pr_err("'%s' only meaningful with RAID10 arrays\n",
+				       dv->devname);
+				goto abort;
+			}
+			copies = ((array.layout & 0xff) *
+				  ((array.layout >> 8) & 0xff));
+			if (array.raid_disks % copies != 0 ||
+			    dv->devname[4] < 'A' ||
+			    dv->devname[4] >= 'A' + copies ||
+			    copies > 26) {
+				pr_err("'%s' not meaningful with this array\n",
+				       dv->devname);
+				goto abort;
+			}
+			add_set(dv, fd, dv->devname[4]);
+			continue;
+		}
+
+		if (strchr(dv->devname, '/') == NULL &&
+		    strchr(dv->devname, ':') == NULL &&
+		    strlen(dv->devname) < 50) {
+			/* Assume this is a kernel-internal name like 'sda1' */
+			int found = 0;
+			char dname[55];
+			if (dv->disposition != 'r' && dv->disposition != 'f') {
+				pr_err("%s only meaningful with -r or -f, not -%c\n",
+				       dv->devname, dv->disposition);
+				goto abort;
+			}
+
+			sprintf(dname, "dev-%s", dv->devname);
+			sysfd = sysfs_open(fd2devnm(fd), dname, "block/dev");
+			if (sysfd >= 0) {
+				char dn[20];
+				if (sysfs_fd_get_str(sysfd, dn, 20) > 0 &&
+				    sscanf(dn, "%d:%d", &mj,&mn) == 2) {
+					rdev = makedev(mj,mn);
+					found = 1;
+				}
+				close(sysfd);
+				sysfd = -1;
+			}
+			if (!found) {
+				/* No device number available: keep the 'state'
+				 * file open so the action can go through sysfs
+				 */
+				sysfd = sysfs_open(fd2devnm(fd), dname, "state");
+				if (sysfd < 0) {
+					pr_err("%s does not appear to be a component of %s\n",
+					       dv->devname, devname);
+					goto abort;
+				}
+			}
+		} else if ((dv->disposition == 'r' ||
+			    dv->disposition == 'f') &&
+			   get_maj_min(dv->devname, &mj, &mn)) {
+			/* for 'fail' and 'remove', the device might
+			 * not exist.
+			 */
+			rdev = makedev(mj, mn);
+		} else {
+			tfd = dev_open(dv->devname, O_RDONLY);
+			if (tfd >= 0) {
+				fstat_is_blkdev(tfd, dv->devname, &rdev);
+				close(tfd);
+			} else {
+				int open_err = errno;
+				if (!stat_is_blkdev(dv->devname, &rdev)) {
+					if (dv->disposition == 'M')
+						/* non-fatal. Also improbable */
+						continue;
+					goto abort;
+				}
+				if (dv->disposition == 'r')
+					/* Be happy, the stat worked, that is
+					 * enough for --remove
+					 */
+					;
+				else {
+					if (dv->disposition == 'M')
+						/* non-fatal */
+						continue;
+					pr_err("Cannot open %s: %s\n",
+					       dv->devname, strerror(open_err));
+					goto abort;
+				}
+			}
+		}
+		switch(dv->disposition){
+		default:
+			pr_err("internal error - devmode[%s]=%d\n",
+			       dv->devname, dv->disposition);
+			goto abort;
+		case 'a':
+		case 'S': /* --add-spare */
+		case 'j': /* --add-journal */
+		case 'A':
+		case 'M': /* --re-add missing */
+		case 'F': /* --re-add faulty */
+		case 'c': /* --cluster-confirm */
+			/* add the device */
+			if (subarray) {
+				pr_err("Cannot add disks to a \'member\' array, perform this operation on the parent container\n");
+				goto abort;
+			}
+
+			/* Let's first try to write re-add to sysfs */
+			if (rdev != 0 &&
+			    (dv->disposition == 'A' || dv->disposition == 'F')) {
+				sysfs_init_dev(&devinfo, rdev);
+				if (sysfs_set_str(&info, &devinfo, "state", "re-add") == 0) {
+					pr_err("re-add %s to %s succeed\n",
+					       dv->devname, info.sys_name);
+					break;
+				}
+			}
+
+			if (dv->disposition == 'F')
+				/* Need to remove first */
+				hot_remove_disk(fd, rdev, force);
+			/* Make sure it isn't in use (in 2.6 or later) */
+			tfd = dev_open(dv->devname, O_RDONLY|O_EXCL);
+			if (tfd >= 0) {
+				/* We know no-one else is using it. We'll
+				 * need non-exclusive access to add it, so
+				 * do that now.
+				 */
+				close(tfd);
+				tfd = dev_open(dv->devname, O_RDONLY);
+			}
+			if (tfd < 0) {
+				if (dv->disposition == 'M')
+					continue;
+				pr_err("Cannot open %s: %s\n",
+				       dv->devname, strerror(errno));
+				goto abort;
+			}
+			if (!frozen) {
+				if (sysfs_freeze_array(&info) == 1)
+					frozen = 1;
+				else
+					frozen = -1;
+			}
+			rv = Manage_add(fd, tfd, dv, tst, &array,
+					force, verbose, devname, update,
+					rdev, array_size, raid_slot);
+			close(tfd);
+			tfd = -1;
+			if (rv < 0)
+				goto abort;
+			if (rv > 0)
+				count++;
+			break;
+
+		case 'r':
+			/* hot remove */
+			if (subarray) {
+				pr_err("Cannot remove disks from a \'member\' array, perform this operation on the parent container\n");
+				rv = -1;
+			} else
+				/* Manage_remove() takes 'force' before
+				 * 'verbose'; keep the arguments in that order
+				 */
+				rv = Manage_remove(tst, fd, dv, sysfd,
+						   rdev, force, verbose,
+						   devname);
+			if (sysfd >= 0)
+				close(sysfd);
+			sysfd = -1;
+			if (rv < 0)
+				goto abort;
+			if (rv > 0)
+				count++;
+			break;
+
+		case 'f': /* set faulty */
+			/* FIXME check current member */
+			if ((sysfd >= 0 && write(sysfd, "faulty", 6) != 6) ||
+			    (sysfd < 0 && ioctl(fd, SET_DISK_FAULTY,
+						rdev))) {
+				if (errno == EBUSY)
+					busy = 1;
+				pr_err("set device faulty failed for %s: %s\n",
+				       dv->devname, strerror(errno));
+				if (sysfd >= 0)
+					close(sysfd);
+				goto abort;
+			}
+			if (sysfd >= 0)
+				close(sysfd);
+			sysfd = -1;
+			count++;
+			if (verbose >= 0)
+				pr_err("set %s faulty in %s\n",
+				       dv->devname, devname);
+			break;
+		case 'R': /* Mark as replaceable */
+			if (subarray) {
+				pr_err("Cannot replace disks in a \'member\' array, perform this operation on the parent container\n");
+				rv = -1;
+			} else {
+				if (!frozen) {
+					if (sysfs_freeze_array(&info) == 1)
+						frozen = 1;
+					else
+						frozen = -1;
+				}
+				rv = Manage_replace(tst, fd, dv,
+						    rdev, verbose,
+						    devname);
+			}
+			if (rv < 0)
+				goto abort;
+			if (rv > 0)
+				count++;
+			break;
+		case 'W': /* --with device that doesn't match */
+			pr_err("No matching --replace device for --with %s\n",
+			       dv->devname);
+			goto abort;
+		case 'w': /* --with device which was matched */
+			rv = Manage_with(tst, fd, dv,
+					 rdev, verbose, devname);
+			if (rv < 0)
+				goto abort;
+			break;
+		}
+	}
+	/* only thaw the array if this call was the one that froze it */
+	if (frozen > 0)
+		sysfs_set_str(&info, NULL, "sync_action","idle");
+	if (test && count == 0)
+		return 2;
+	return 0;
+
+abort:
+	if (frozen > 0)
+		sysfs_set_str(&info, NULL, "sync_action","idle");
+	return !test && busy ? 2 : 1;
+}
+
+int autodetect(void)
+{
+	/* Open md device 9:0 and issue the RAID_AUTORUN ioctl on it.
+	 * Returns 0 if the ioctl succeeded, 1 if the open or ioctl failed.
+	 */
+	int failed;
+	int fd = dev_open("9:0", O_RDONLY);
+
+	if (fd < 0)
+		return 1;
+
+	failed = (ioctl(fd, RAID_AUTORUN, 0) != 0);
+	close(fd);
+	return failed;
+}
+
+/* Apply the metadata update named 'update' to 'subarray' of the
+ * container 'dev', via the metadata handler's update_subarray method.
+ * Messages are printed only when 'verbose' is >= 0.
+ *
+ * Returns 2 if the subarray cannot be opened or the handler has no
+ * update_subarray method; otherwise returns what update_subarray
+ * returned (0 meaning success).
+ */
+int Update_subarray(char *dev, char *subarray, char *update, struct mddev_ident *ident, int verbose)
+{
+	struct supertype super;
+	struct supertype *st = &super;
+	int quiet = verbose < 0;
+	int rv = 2;
+	int fd;
+
+	memset(&super, 0, sizeof(super));
+
+	fd = open_subarray(dev, subarray, st, quiet);
+	if (fd < 0)
+		return rv;
+
+	if (st->ss->update_subarray) {
+		/* queue updates for mdmon when it is managing the container */
+		if (mdmon_running(st->devnm))
+			st->update_tail = &st->updates;
+
+		rv = st->ss->update_subarray(st, subarray, update, ident);
+
+		if (rv != 0) {
+			if (!quiet)
+				pr_err("Failed to update %s of subarray-%s in %s\n",
+				       update, subarray, dev);
+		} else {
+			if (st->update_tail)
+				flush_metadata_updates(st);
+			else
+				st->ss->sync_metadata(st);
+
+			if (strcmp(update, "name") == 0 && !quiet)
+				pr_err("Updated subarray-%s name from %s, UUIDs may have changed\n",
+				       subarray, dev);
+		}
+	} else if (!quiet) {
+		pr_err("Operation not supported for %s metadata\n",
+		       st->ss->name);
+	}
+
+	st->ss->free_super(st);
+	close(fd);
+
+	return rv;
+}
+
+/* Move a spare, identified by 'devid', out of array 'from_devname' and
+ * into array 'to_devname'.  If it was removed but cannot be added to
+ * the destination, an attempt is made to add it back to the source.
+ * Returns 1 when the move succeeded, 0 otherwise.
+ */
+int move_spare(char *from_devname, char *to_devname, dev_t devid)
+{
+	struct mddev_dev devlist;
+	char devname[20];
+	int moved = 0;
+
+	/* try to remove and add */
+	int to_fd = open(to_devname, O_RDONLY);
+	int from_fd = open(from_devname, O_RDONLY);
+
+	if (to_fd < 0 || from_fd < 0)
+		goto out;
+
+	/* a single-entry device list naming the spare as "major:minor" */
+	devlist.next = NULL;
+	devlist.used = 0;
+	devlist.writemostly = FlagDefault;
+	devlist.failfast = FlagDefault;
+	devlist.devname = devname;
+	sprintf(devname, "%d:%d", major(devid), minor(devid));
+
+	devlist.disposition = 'r';
+	if (Manage_subdevs(from_devname, from_fd, &devlist, -1, 0, NULL, 0) != 0)
+		goto out;
+
+	devlist.disposition = 'a';
+	if (Manage_subdevs(to_devname, to_fd, &devlist, -1, 0,
+			   NULL, 0) == 0) {
+		/* make sure manager is aware of changes */
+		ping_manager(to_devname);
+		ping_manager(from_devname);
+		moved = 1;
+	} else {
+		/* destination refused it: give it back to the source */
+		Manage_subdevs(from_devname, from_fd, &devlist,
+			       -1, 0, NULL, 0);
+	}
+
+out:
+	if (to_fd >= 0)
+		close(to_fd);
+	if (from_fd >= 0)
+		close(from_fd);
+	return moved;
+}
diff --git a/Monitor.c b/Monitor.c
new file mode 100644
index 0000000..30c031a
--- /dev/null
+++ b/Monitor.c
@@ -0,0 +1,1275 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email: <neilb@suse.de>
+ */
+
+#include "mdadm.h"
+#include "md_p.h"
+#include "md_u.h"
+#include <sys/wait.h>
+#include <signal.h>
+#include <limits.h>
+#include <syslog.h>
+#ifndef NO_LIBUDEV
+#include <libudev.h>
+#endif
+
+struct state { /* Monitor's record of one watched array, kept on a singly linked list */
+ char *devname; /* device path that check_array() opens */
+ char devnm[32]; /* to sync with mdstat info */
+ unsigned int utime; /* update time seen at the last full check; 0 until first examined */
+ int err; /* incremented on every failed check, reset to 0 by a successful one */
+ char *spare_group; /* spare-group name copied from the config entry, or NULL */
+ int active, working, failed, spare, raid; /* disk counts copied from sysfs by check_array() */
+ int from_config; /* set for entries built from the config file list */
+ int from_auto; /* set for entries that add_new_arrays() found in mdstat */
+ int expected_spares; /* spare_disks from the config entry, -1 when there is none */
+ int devstate[MAX_DISKS]; /* per-slot disk state recorded by the last full check */
+ dev_t devid[MAX_DISKS]; /* per-slot device number recorded by the last full check */
+ int percent; /* resync percentage last taken from mdstat, or a RESYNC_* value */
+ char parent_devnm[32]; /* For subarray, devnm of parent.
+ * For others, ""
+ */
+ struct supertype *metadata; /* filled by check_array() only when parent_devnm is empty */
+ struct state *subarray;/* for a container it is a link to first subarray
+ * for a subarray it is a link to next subarray
+ * in the same container */
+ struct state *parent; /* for a subarray it is a link to its container
+ */
+ struct state *next; /* next entry in the monitor's state list */
+};
+
+struct alert_info { /* delivery settings consulted by alert() */
+ char *mailaddr; /* recipient for mailed events, or NULL for no mail */
+ char *mailfrom; /* text for the From: header, or NULL to use the built-in one */
+ char *alert_cmd; /* program executed for each event, or NULL */
+ int dosyslog; /* non-zero to also report events through syslog() */
+};
+static int make_daemon(char *pidfile); /* -1 in the daemon, 0 in the parent, 1 on fork error */
+static int check_one_sharer(int scan); /* 1 when monitoring must be refused, else 0 */
+static void write_autorebuild_pid(void); /* records our pid in MDMON_DIR/autorebuild.pid */
+static void alert(char *event, char *dev, char *disc, struct alert_info *info);
+static int check_array(struct state *st, struct mdstat_ent *mdstat,
+ int test, struct alert_info *info,
+ int increments, char *prefer); /* 1 when the array is degraded, else 0 */
+static int add_new_arrays(struct mdstat_ent *mdstat, struct state **statelist,
+ int test, struct alert_info *info); /* 1 when at least one array was added */
+static void try_spare_migration(struct state *statelist, struct alert_info *info);
+static void link_containers_with_subarrays(struct state *list);
+#ifndef NO_LIBUDEV
+static int check_udev_activity(void); /* 0 ok, 1 udev could not be opened, 2 timeout */
+#endif
+
+/*
+ * Monitor - watch md arrays and report changes through alert().
+ *
+ * devlist:    arrays to watch; NULL means take the list from the config file
+ * mailaddr:   mail recipient; NULL means use the one from the config file
+ * alert_cmd:  program to run per event; NULL means use the config file's
+ * c:          run-time context (delay, scan, test and prefer are used here)
+ * daemonise:  fork into the background via make_daemon()
+ * oneshot:    make a single pass instead of looping
+ * dosyslog:   also report events through syslog
+ * pidfile:    file that receives the daemon pid; removed again on exit
+ * increments: percentage step between "RebuildNN" events
+ * share:      enable spare migration between arrays
+ *
+ * Returns 1 when scan mode has nowhere to report to or another sharer is
+ * already running, the value of make_daemon() in the parent (or on fork
+ * failure) when daemonising, and 0 once monitoring has finished.
+ */
+int Monitor(struct mddev_dev *devlist,
+	    char *mailaddr, char *alert_cmd,
+	    struct context *c,
+	    int daemonise, int oneshot,
+	    int dosyslog, char *pidfile, int increments,
+	    int share)
+{
+	/*
+	 * Every few seconds, scan every md device looking for changes
+	 * When a change is found, log it, possibly run the alert command,
+	 * and possibly send Email
+	 *
+	 * For each array, we record:
+	 *   Update time
+	 *   active/working/failed/spare drives
+	 *   State of each device.
+	 *   %rebuilt if rebuilding
+	 *
+	 * If the update time changes, check out all the data again
+	 * It is possible that we cannot get the state of each device
+	 * due to bugs in the md kernel module.
+	 * We also read /proc/mdstat to get rebuild percent,
+	 * and to get state on all active devices in case of kernel bug.
+	 *
+	 * Events are:
+	 *    Fail
+	 *	An active device had Faulty set or Active/Sync removed
+	 *    FailSpare
+	 *      A spare device had Faulty set
+	 *    SpareActive
+	 *      An active device had a reverse transition
+	 *    RebuildStarted
+	 *      percent went from -1 to +ve
+	 *    RebuildNN
+	 *      percent went from below to not-below NN%
+	 *    DeviceDisappeared
+	 *      Couldn't access a device which was previously visible
+	 *
+	 * if we detect an array with active<raid and spare==0
+	 * we look at other arrays that have same spare-group
+	 * If we find one with active==raid and spare>0,
+	 *  and if we can get_disk_info and find a name
+	 *  Then we hot-remove and hot-add to the other array
+	 *
+	 * If devlist is NULL, then we can monitor everything because --scan
+	 * was given.  We get an initial list from config file and add anything
+	 * that appears in /proc/mdstat
+	 */
+
+	struct state *statelist = NULL;
+	struct state *st2;
+	int finished = 0;
+	struct mdstat_ent *mdstat = NULL;
+	char *mailfrom;
+	struct alert_info info;
+	struct mddev_ident *mdlist;
+	int delay_for_event = c->delay;
+
+	if (!mailaddr) {
+		mailaddr = conf_get_mailaddr();
+		if (mailaddr && ! c->scan)
+			pr_err("Monitor using email address \"%s\" from config file\n",
+			       mailaddr);
+	}
+	mailfrom = conf_get_mailfrom();
+
+	if (!alert_cmd) {
+		alert_cmd = conf_get_program();
+		if (alert_cmd && !c->scan)
+			pr_err("Monitor using program \"%s\" from config file\n",
+			       alert_cmd);
+	}
+	if (c->scan && !mailaddr && !alert_cmd && !dosyslog) {
+		pr_err("No mail address or alert command - not monitoring.\n");
+		return 1;
+	}
+	info.alert_cmd = alert_cmd;
+	info.mailaddr = mailaddr;
+	info.mailfrom = mailfrom;
+	info.dosyslog = dosyslog;
+
+	if (share){
+		if (check_one_sharer(c->scan))
+			return 1;
+	}
+
+	if (daemonise) {
+		/* only the forked daemon (rv == -1) carries on from here */
+		int rv = make_daemon(pidfile);
+		if (rv >= 0)
+			return rv;
+	}
+
+	if (share)
+		write_autorebuild_pid();
+
+	if (devlist == NULL) {
+		/* no devices given: start from every array in the config file */
+		mdlist = conf_get_ident(NULL);
+		for (; mdlist; mdlist = mdlist->next) {
+			struct state *st;
+
+			if (mdlist->devname == NULL)
+				continue;
+			if (strcasecmp(mdlist->devname, "<ignore>") == 0)
+				continue;
+			st = xcalloc(1, sizeof *st);
+			if (mdlist->devname[0] == '/')
+				st->devname = xstrdup(mdlist->devname);
+			else {
+				/* relative names live under /dev/md/ (8 chars) */
+				st->devname = xmalloc(8+strlen(mdlist->devname)+1);
+				strcpy(strcpy(st->devname, "/dev/md/"),
+				       mdlist->devname);
+			}
+			st->next = statelist;
+			st->devnm[0] = 0;
+			st->percent = RESYNC_UNKNOWN;
+			st->from_config = 1;
+			st->expected_spares = mdlist->spare_disks;
+			if (mdlist->spare_group)
+				st->spare_group = xstrdup(mdlist->spare_group);
+			statelist = st;
+		}
+	} else {
+		struct mddev_dev *dv;
+
+		for (dv = devlist; dv; dv = dv->next) {
+			struct state *st = xcalloc(1, sizeof *st);
+			mdlist = conf_get_ident(dv->devname);
+			st->devname = xstrdup(dv->devname);
+			st->next = statelist;
+			st->devnm[0] = 0;
+			st->percent = RESYNC_UNKNOWN;
+			st->expected_spares = -1;
+			if (mdlist) {
+				st->expected_spares = mdlist->spare_disks;
+				if (mdlist->spare_group)
+					st->spare_group = xstrdup(mdlist->spare_group);
+			}
+			statelist = st;
+		}
+	}
+
+	while (!finished) {
+		int new_found = 0;
+		struct state *st, **stp;
+		int anydegraded = 0;
+		int anyredundant = 0;
+
+		if (mdstat)
+			free_mdstat(mdstat);
+		mdstat = mdstat_read(oneshot ? 0 : 1, 0);
+
+		for (st = statelist; st; st = st->next) {
+			if (check_array(st, mdstat, c->test, &info,
+					increments, c->prefer))
+				anydegraded = 1;
+			/* for external arrays, metadata is filled for
+			 * containers only
+			 */
+			if (st->metadata && st->metadata->ss->external)
+				continue;
+			if (st->err == 0 && !anyredundant)
+				anyredundant = 1;
+		}
+
+		/* now check if there are any new devices found in mdstat */
+		if (c->scan)
+			new_found = add_new_arrays(mdstat, &statelist, c->test,
+						   &info);
+
+		/* If an array has active < raid && spare == 0 && spare_group != NULL
+		 * Look for another array with spare > 0 and active == raid and same spare_group
+		 * if found, choose a device and hotremove/hotadd
+		 */
+		if (share && anydegraded)
+			try_spare_migration(statelist, &info);
+		if (!new_found) {
+			if (oneshot)
+				break;
+			else if (!anyredundant) {
+				pr_err("No array with redundancy detected, stopping\n");
+				break;
+			}
+			else {
+#ifndef NO_LIBUDEV
+				/*
+				 * Wait for udevd to finish new devices
+				 * processing.
+				 */
+				if (mdstat_wait(delay_for_event) &&
+				    check_udev_activity())
+					pr_err("Error while waiting for UDEV to complete new devices processing\n");
+#else
+				int wait_result = mdstat_wait(delay_for_event);
+				/*
+				 * Give chance to process new device
+				 */
+				if (wait_result != 0) {
+					if (c->delay > 5)
+						delay_for_event = 5;
+				} else
+					delay_for_event = c->delay;
+#endif
+				mdstat_close();
+			}
+		}
+		/* test alerts are only sent on the first pass */
+		c->test = 0;
+
+		/* drop auto-discovered arrays that keep failing their check */
+		for (stp = &statelist; (st = *stp) != NULL; ) {
+			if (st->from_auto && st->err > 5) {
+				*stp = st->next;
+				free(st->devname);
+				free(st->spare_group);
+				free(st);
+			} else
+				stp = &st->next;
+		}
+	}
+
+	/* Release the whole state list, including the strings each entry
+	 * owns, in the same way as the in-loop removal above.
+	 */
+	for (st2 = statelist; st2; st2 = statelist) {
+		statelist = st2->next;
+		free(st2->devname);
+		free(st2->spare_group);
+		free(st2);
+	}
+	/* the snapshot read by the last pass is still allocated */
+	if (mdstat)
+		free_mdstat(mdstat);
+
+	if (pidfile)
+		unlink(pidfile);
+	return 0;
+}
+
+static int make_daemon(char *pidfile)
+{
+	/* Fork into the background and report the child's pid, either on
+	 * stdout or in 'pidfile' when one is given.
+	 * Return:
+	 *   -1 in the forked daemon
+	 *    0 in the parent
+	 *    1 on error
+	 * so a non-negative value becomes the exit code.
+	 */
+	FILE *pid_file = NULL;
+	int fd;
+	int pid = fork();
+
+	if (pid < 0) {
+		perror("daemonise");
+		return 1;
+	}
+
+	if (pid == 0) {
+		/* the daemon itself: detach from the caller's session */
+		manage_fork_fds(0);
+		setsid();
+		return -1;
+	}
+
+	/* parent: publish the daemon's pid */
+	if (!pidfile) {
+		printf("%d\n", pid);
+		return 0;
+	}
+
+	fd = open(pidfile, O_WRONLY | O_CREAT | O_TRUNC,
+		  0644);
+	if (fd >= 0)
+		pid_file = fdopen(fd, "w");
+	if (!pid_file) {
+		perror("cannot create pid file");
+		return 0;
+	}
+	fprintf(pid_file,"%d\n", pid);
+	fclose(pid_file);
+	return 0;
+}
+
+/* Check whether another spare-sharing monitor is already running.
+ *
+ * The pid stored in MDMON_DIR/autorebuild.pid is looked up in
+ * /proc/<pid>/comm; if that process name starts with Name, a sharer is
+ * taken to be alive.
+ *
+ * scan: non-zero when running in scan mode, where a second sharer is
+ *       refused; otherwise only a warning is printed.
+ * Returns 1 when monitoring must be aborted, 0 otherwise.
+ */
+static int check_one_sharer(int scan)
+{
+	int pid;
+	int rv = 0;
+	FILE *comm_fp;
+	FILE *fp;
+	char comm_path[PATH_MAX];
+	char path[PATH_MAX];
+	char comm[20];
+
+	snprintf(path, sizeof(path), "%s/autorebuild.pid", MDMON_DIR);
+	fp = fopen(path, "r");
+	if (!fp)
+		return 0;
+
+	if (fscanf(fp, "%d", &pid) != 1)
+		pid = -1;
+	fclose(fp);
+
+	snprintf(comm_path, sizeof(comm_path),
+		 "/proc/%d/comm", pid);
+	comm_fp = fopen(comm_path, "r");
+	if (!comm_fp)
+		return 0;
+
+	/* fscanf() returns EOF (-1) on failure, which is non-zero, so the
+	 * result has to be compared with the number of expected
+	 * conversions before 'comm' may be read.
+	 */
+	if (fscanf(comm_fp, "%19s", comm) == 1 &&
+	    strncmp(basename(comm), Name, strlen(Name)) == 0) {
+		if (scan) {
+			pr_err("Only one autorebuild process allowed in scan mode, aborting\n");
+			rv = 1;
+		} else {
+			pr_err("Warning: One autorebuild process already running.\n");
+		}
+	}
+	fclose(comm_fp);
+	return rv;
+}
+
+/* Store our pid in MDMON_DIR/autorebuild.pid, creating MDMON_DIR when it
+ * does not exist yet.  Failures are reported with pr_err() only.
+ */
+static void write_autorebuild_pid()
+{
+	char path[PATH_MAX];
+	FILE *fp;
+	int fd;
+	int pid;
+
+	sprintf(path, "%s/autorebuild.pid", MDMON_DIR);
+
+	/* an already existing directory is fine */
+	if (mkdir(MDMON_DIR, 0700) < 0 && errno != EEXIST) {
+		pr_err("Can't create autorebuild.pid file\n");
+		return;
+	}
+
+	fd = open(path, O_WRONLY | O_CREAT | O_TRUNC, 0700);
+	fp = (fd >= 0) ? fdopen(fd, "w") : NULL;
+	if (!fp) {
+		pr_err("Can't create autorebuild.pid file\n");
+		return;
+	}
+
+	pid = getpid();
+	fprintf(fp, "%d\n", pid);
+	fclose(fp);
+}
+
+/* Report one event through every channel configured in 'info'.
+ *
+ * event: event name, e.g. "Fail" or "RebuildStarted"
+ * dev:   md device the event belongs to
+ * disc:  component device name, or extra information when the string
+ *        starts with a space, or NULL
+ * info:  alert program, mail addresses and syslog switch
+ *
+ * With no channel configured the event is printed on stdout.  Mail is
+ * sent only for Fail*, Test*, Spares* and Degrade* events.
+ */
+static void alert(char *event, char *dev, char *disc, struct alert_info *info)
+{
+	int priority;
+
+	if (!info->alert_cmd && !info->mailaddr && !info->dosyslog) {
+		time_t now = time(0);
+
+		/* skip the weekday of ctime() and keep "Mon DD HH:MM:SS" */
+		printf("%1.15s: %s on %s %s\n", ctime(&now) + 4,
+		       event, dev, disc?disc:"unknown device");
+	}
+	if (info->alert_cmd) {
+		int pid = fork();
+		switch(pid) {
+		default:
+			waitpid(pid, NULL, 0);
+			break;
+		case -1:
+			break;
+		case 0:
+			/* a NULL disc simply ends the argument list early */
+			execl(info->alert_cmd, info->alert_cmd,
+			      event, dev, disc, NULL);
+			exit(2);
+		}
+	}
+	if (info->mailaddr && (strncmp(event, "Fail", 4) == 0 ||
+			       strncmp(event, "Test", 4) == 0 ||
+			       strncmp(event, "Spares", 6) == 0 ||
+			       strncmp(event, "Degrade", 7) == 0)) {
+		FILE *mp = popen(Sendmail, "w");
+		if (mp) {
+			FILE *mdstat;
+			char hname[256];
+			gethostname(hname, sizeof(hname));
+			signal(SIGPIPE, SIG_IGN);
+			if (info->mailfrom)
+				fprintf(mp, "From: %s\n", info->mailfrom);
+			else
+				fprintf(mp, "From: %s monitoring <root>\n",
+					Name);
+			fprintf(mp, "To: %s\n", info->mailaddr);
+			fprintf(mp, "Subject: %s event on %s:%s\n\n",
+				event, dev, hname);
+
+			fprintf(mp,
+				"This is an automatically generated mail message from %s\n", Name);
+			fprintf(mp, "running on %s\n\n", hname);
+
+			fprintf(mp,
+				"A %s event had been detected on md device %s.\n\n", event, dev);
+
+			if (disc && disc[0] != ' ')
+				fprintf(mp,
+					"It could be related to component device %s.\n\n", disc);
+			if (disc && disc[0] == ' ')
+				fprintf(mp, "Extra information:%s.\n\n", disc);
+
+			fprintf(mp, "Faithfully yours, etc.\n");
+
+			mdstat = fopen("/proc/mdstat", "r");
+			if (mdstat) {
+				char buf[8192];
+				int n;
+				fprintf(mp,
+					"\nP.S. The /proc/mdstat file currently contains the following:\n\n");
+				while ((n = fread(buf, 1, sizeof(buf),
+						  mdstat)) > 0)
+					n = fwrite(buf, 1, n, mp);
+				fclose(mdstat);
+			}
+			pclose(mp);
+		}
+	}
+
+	/* log the event to syslog maybe */
+	if (info->dosyslog) {
+		/* Log at a different severity depending on the event.
+		 *
+		 * These are the critical events:  */
+		if (strncmp(event, "Fail", 4) == 0 ||
+		    strncmp(event, "Degrade", 7) == 0 ||
+		    strncmp(event, "DeviceDisappeared", 17) == 0)
+			priority = LOG_CRIT;
+		/* Good to know about, but are not failures.
+		 * The "Spares" test must be '== 0': with '!= 0' every
+		 * remaining event would land here and the final branch
+		 * would be reached by Spares events only. */
+		else if (strncmp(event, "Rebuild", 7) == 0 ||
+			 strncmp(event, "MoveSpare", 9) == 0 ||
+			 strncmp(event, "Spares", 6) == 0)
+			priority = LOG_WARNING;
+		/* Everything else: */
+		else
+			priority = LOG_INFO;
+
+		if (disc && disc[0] != ' ')
+			syslog(priority,
+			       "%s event detected on md device %s, component device %s", event, dev, disc);
+		else if (disc)
+			syslog(priority,
+			       "%s event detected on md device %s: %s",
+			       event, dev, disc);
+		else
+			syslog(priority,
+			       "%s event detected on md device %s",
+			       event, dev);
+	}
+}
+
+/* Re-examine one monitored array and raise alerts for what changed.
+ *
+ * st:         state record of the array; updated in place
+ * mdstat:     current /proc/mdstat snapshot; the matching entry gets its
+ *             devnm cleared to mark it as used
+ * test:       non-zero to send a "TestMessage" alert first
+ * ainfo:      alert delivery settings
+ * increments: percentage step between "RebuildNN" alerts
+ * prefer:     passed on to map_dev_preferred() when naming devices
+ *
+ * Returns 1 if the array is degraded with no spare, otherwise 0.
+ */
+static int check_array(struct state *st, struct mdstat_ent *mdstat,
+		       int test, struct alert_info *ainfo,
+		       int increments, char *prefer)
+{
+	/* Update the state 'st' to reflect any changes shown in mdstat,
+	 * or found by directly examining the array, and return
+	 * '1' if the array is degraded, or '0' if it is optimal (or dead).
+	 */
+	struct { int state, major, minor; } info[MAX_DISKS];
+	struct mdinfo *sra = NULL;
+	mdu_array_info_t array;
+	struct mdstat_ent *mse = NULL, *mse2;
+	char *dev = st->devname;
+	int fd;
+	int i;
+	int remaining_disks;
+	int last_disk;
+	int new_array = 0;
+	int retval;
+	int is_container = 0;
+	unsigned long redundancy_only_flags = 0;
+
+	if (test)
+		alert("TestMessage", dev, NULL, ainfo);
+
+	retval = 0;
+
+	fd = open(dev, O_RDONLY);
+	if (fd < 0)
+		goto disappeared;
+
+	if (st->devnm[0] == 0) {
+		char *devnm = fd2devnm(fd);
+
+		/* no md name for this fd: treat the array as gone rather
+		 * than copying from a NULL pointer
+		 */
+		if (!devnm)
+			goto disappeared;
+		snprintf(st->devnm, sizeof(st->devnm), "%s", devnm);
+	}
+
+	for (mse2 = mdstat; mse2; mse2 = mse2->next)
+		if (strcmp(mse2->devnm, st->devnm) == 0) {
+			mse2->devnm[0] = 0; /* flag it as "used" */
+			mse = mse2;
+		}
+
+	if (!mse) {
+		/* duplicated array in statelist
+		 * or re-created after reading mdstat
+		 */
+		st->err++;
+		goto out;
+	}
+
+	if (mse->level == NULL)
+		is_container = 1;
+
+	if (!is_container && !md_array_active(fd))
+		goto disappeared;
+
+	fcntl(fd, F_SETFD, FD_CLOEXEC);
+	if (md_get_array_info(fd, &array) < 0)
+		goto disappeared;
+
+	if (!is_container && map_name(pers, mse->level) > 0)
+		redundancy_only_flags |= GET_MISMATCH;
+
+	sra = sysfs_read(-1, st->devnm, GET_LEVEL | GET_DISKS | GET_DEVS |
+			 GET_STATE | redundancy_only_flags);
+
+	if (!sra)
+		goto disappeared;
+
+	/* It's much easier to list what array levels can't
+	 * have a device disappear than all of them that can
+	 */
+	if (sra->array.level == 0 || sra->array.level == -1) {
+		if (!st->err && !st->from_config)
+			alert("DeviceDisappeared", dev, " Wrong-Level", ainfo);
+		st->err++;
+		goto out;
+	}
+
+	/* this array is in /proc/mdstat */
+	if (array.utime == 0)
+		/* external arrays don't update utime, so
+		 * just make sure it is always different. */
+		array.utime = st->utime + 1;
+
+	if (st->err) {
+		/* New array appeared where previously had an error */
+		st->err = 0;
+		st->percent = RESYNC_NONE;
+		new_array = 1;
+		if (!is_container)
+			alert("NewArray", st->devname, NULL, ainfo);
+	}
+
+	/* nothing changed since the last check: only report degradation */
+	if (st->utime == array.utime && st->failed == sra->array.failed_disks &&
+	    st->working == sra->array.working_disks &&
+	    st->spare == sra->array.spare_disks &&
+	    (mse == NULL  || (mse->percent == st->percent))) {
+		if ((st->active < st->raid) && st->spare == 0)
+			retval = 1;
+		goto out;
+	}
+	if (st->utime == 0 && /* new array */
+	    mse->pattern && strchr(mse->pattern, '_') /* degraded */)
+		alert("DegradedArray", dev, NULL, ainfo);
+
+	if (st->utime == 0 && /* new array */ st->expected_spares > 0 &&
+	    sra->array.spare_disks < st->expected_spares)
+		alert("SparesMissing", dev, NULL, ainfo);
+	if (st->percent < 0 && st->percent != RESYNC_UNKNOWN &&
+	    mse->percent >= 0)
+		alert("RebuildStarted", dev, NULL, ainfo);
+	if (st->percent >= 0 && mse->percent >= 0 &&
+	    (mse->percent / increments) > (st->percent / increments)) {
+		char percentalert[18];
+		/*
+		 * "RebuildNN" (10 chars) or "RebuildStarted" (15 chars)
+		 */
+
+		if((mse->percent / increments) == 0)
+			snprintf(percentalert, sizeof(percentalert),
+				 "RebuildStarted");
+		else
+			snprintf(percentalert, sizeof(percentalert),
+				 "Rebuild%02d", mse->percent);
+
+		alert(percentalert, dev, NULL, ainfo);
+	}
+
+	if (mse->percent == RESYNC_NONE && st->percent >= 0) {
+		/* Rebuild/sync/whatever just finished.
+		 * If there is a number in /mismatch_cnt,
+		 * we should report that.
+		 */
+		if (sra && sra->mismatch_cnt > 0) {
+			char cnt[80];
+			snprintf(cnt, sizeof(cnt),
+				 " mismatches found: %d (on raid level %d)",
+				 sra->mismatch_cnt, sra->array.level);
+			alert("RebuildFinished", dev, cnt, ainfo);
+		} else
+			alert("RebuildFinished", dev, NULL, ainfo);
+	}
+	st->percent = mse->percent;
+
+	/* collect the disk info of every slot until all of the array's
+	 * disks have been seen
+	 */
+	remaining_disks = sra->array.nr_disks;
+	for (i = 0; i < MAX_DISKS && remaining_disks > 0; i++) {
+		mdu_disk_info_t disc;
+		disc.number = i;
+		if (md_get_disk_info(fd, &disc) >= 0) {
+			info[i].state = disc.state;
+			info[i].major = disc.major;
+			info[i].minor = disc.minor;
+			if (disc.major || disc.minor)
+				remaining_disks --;
+		} else
+			info[i].major = info[i].minor = 0;
+	}
+	last_disk = i;
+
+	if (mse->metadata_version &&
+	    strncmp(mse->metadata_version, "external:", 9) == 0 &&
+	    is_subarray(mse->metadata_version+9)) {
+		char *sl;
+		/* text after "external:" plus one marker character is
+		 * "<container>/<member>"; keep the container part only,
+		 * bounded to the size of parent_devnm
+		 */
+		snprintf(st->parent_devnm, sizeof(st->parent_devnm), "%s",
+			 mse->metadata_version + 10);
+		sl = strchr(st->parent_devnm, '/');
+		if (sl)
+			*sl = 0;
+	} else
+		st->parent_devnm[0] = 0;
+	if (st->metadata == NULL && st->parent_devnm[0] == 0)
+		st->metadata = super_by_fd(fd, NULL);
+
+	/* compare every slot with the state recorded last time */
+	for (i = 0; i < MAX_DISKS; i++) {
+		mdu_disk_info_t disc = {0, 0, 0, 0, 0};
+		int newstate = 0;
+		int change;
+		char *dv = NULL;
+		disc.number = i;
+		if (i < last_disk && (info[i].major || info[i].minor)) {
+			newstate = info[i].state;
+			dv = map_dev_preferred(info[i].major, info[i].minor, 1,
+					       prefer);
+			disc.state = newstate;
+			disc.major = info[i].major;
+			disc.minor = info[i].minor;
+		} else
+			newstate = (1 << MD_DISK_REMOVED);
+
+		if (dv == NULL && st->devid[i])
+			dv = map_dev_preferred(major(st->devid[i]),
+					       minor(st->devid[i]), 1, prefer);
+		change = newstate ^ st->devstate[i];
+		if (st->utime && change && !st->err && !new_array) {
+			if ((st->devstate[i]&change) & (1 << MD_DISK_SYNC))
+				alert("Fail", dev, dv, ainfo);
+			else if ((newstate & (1 << MD_DISK_FAULTY)) &&
+				 (disc.major || disc.minor) &&
+				 st->devid[i] == makedev(disc.major,
+							 disc.minor))
+				alert("FailSpare", dev, dv, ainfo);
+			else if ((newstate&change) & (1 << MD_DISK_SYNC))
+				alert("SpareActive", dev, dv, ainfo);
+		}
+		st->devstate[i] = newstate;
+		st->devid[i] = makedev(disc.major, disc.minor);
+	}
+	st->active = sra->array.active_disks;
+	st->working = sra->array.working_disks;
+	st->spare = sra->array.spare_disks;
+	st->failed = sra->array.failed_disks;
+	st->utime = array.utime;
+	st->raid = sra->array.raid_disks;
+	st->err = 0;
+	if ((st->active < st->raid) && st->spare == 0)
+		retval = 1;
+
+ out:
+	if (sra)
+		sysfs_free(sra);
+	if (fd >= 0)
+		close(fd);
+	return retval;
+
+ disappeared:
+	if (!st->err && !is_container)
+		alert("DeviceDisappeared", dev, NULL, ainfo);
+	st->err++;
+	goto out;
+}
+
+/* Add every array from 'mdstat' that is not monitored yet to *statelist.
+ *
+ * Entries whose devnm was cleared by check_array() are already known and
+ * are skipped, as are raid0 and linear arrays.  Containers (entries
+ * without a level) are added.
+ *
+ * test: non-zero to send a "TestMessage" alert for each added array
+ * info: alert delivery settings
+ * Returns 1 when at least one array was added, 0 otherwise.
+ */
+static int add_new_arrays(struct mdstat_ent *mdstat, struct state **statelist,
+			  int test, struct alert_info *info)
+{
+	struct mdstat_ent *mse;
+	int new_found = 0;
+	char *name;
+
+	for (mse = mdstat; mse; mse = mse->next) {
+		struct state *st;
+		mdu_array_info_t array;
+		int fd;
+
+		if (!mse->devnm[0])
+			continue;
+		/* a NULL level marks a container, which we do retrieve */
+		if (mse->level &&
+		    (strcmp(mse->level, "raid0") == 0 ||
+		     strcmp(mse->level, "linear") == 0))
+			continue;
+
+		name = get_md_name(mse->devnm);
+		if (!name)
+			continue;
+
+		st = xcalloc(1, sizeof *st);
+		st->devname = xstrdup(name);
+		if ((fd = open(st->devname, O_RDONLY)) < 0 ||
+		    md_get_array_info(fd, &array) < 0) {
+			/* no such array */
+			if (fd >= 0)
+				close(fd);
+			put_md_name(st->devname);
+			free(st->devname);
+			/* st was just zero-allocated, so it owns nothing else */
+			free(st);
+			continue;
+		}
+		close(fd);
+		st->next = *statelist;
+		/* start in the error state so that the first successful
+		 * check_array() reports the array as new */
+		st->err = 1;
+		st->from_auto = 1;
+		snprintf(st->devnm, sizeof(st->devnm), "%s", mse->devnm);
+		st->percent = RESYNC_UNKNOWN;
+		st->expected_spares = -1;
+		if (mse->metadata_version &&
+		    strncmp(mse->metadata_version,
+			    "external:", 9) == 0 &&
+		    is_subarray(mse->metadata_version+9)) {
+			char *sl;
+			snprintf(st->parent_devnm, sizeof(st->parent_devnm),
+				 "%s", mse->metadata_version+10);
+			sl = strchr(st->parent_devnm, '/');
+			/* same guard as in check_array(): a version string
+			 * without a second '/' must not be dereferenced */
+			if (sl)
+				*sl = 0;
+		} else
+			st->parent_devnm[0] = 0;
+		*statelist = st;
+		if (test)
+			alert("TestMessage", st->devname, NULL, info);
+		new_found = 1;
+	}
+	return new_found;
+}
+
+/* Fill 'sc' with the criteria a spare must meet to be usable by 'st'.
+ *
+ * When the array has no metadata handler, or the handler has no
+ * get_spare_criteria method, both criteria are set to 0.  Otherwise the
+ * metadata is loaded from the array device, queried and released again.
+ * Returns 0 on success, 1 when the device cannot be opened or no
+ * superblock was loaded.
+ */
+static int get_required_spare_criteria(struct state *st,
+				       struct spare_criteria *sc)
+{
+	struct supertype *super = st->metadata;
+	int fd;
+
+	if (!super || !super->ss->get_spare_criteria) {
+		sc->min_size = 0;
+		sc->sector_size = 0;
+		return 0;
+	}
+
+	fd = open(st->devname, O_RDONLY);
+	if (fd < 0)
+		return 1;
+
+	/* external metadata is loaded per container */
+	if (super->ss->external)
+		super->ss->load_container(super, fd, st->devname);
+	else
+		super->ss->load_super(super, fd, st->devname);
+	close(fd);
+
+	if (!super->sb)
+		return 1;
+
+	super->ss->get_spare_criteria(super, sc);
+	super->ss->free_super(super);
+
+	return 0;
+}
+
+/* Decide whether array 'from' may give one of its spares to 'to'.
+ * Returns 1 when it may, 0 when it may not.
+ */
+static int check_donor(struct state *from, struct state *to)
+{
+	struct state *sub;
+
+	if (from == to)
+		return 0;
+	if (from->parent)
+		/* Cannot move from a member */
+		return 0;
+	if (from->err)
+		return 0;
+	if (!from->metadata)
+		/* metadata is only filled in for entries without a parent
+		 * devnm and may still be NULL; such an entry cannot be
+		 * examined below, so it is no donor
+		 */
+		return 0;
+	for (sub = from->subarray; sub; sub = sub->subarray)
+		/* If source array has degraded subarrays, don't
+		 * remove anything
+		 */
+		if (sub->active < sub->raid)
+			return 0;
+	if (from->metadata->ss->external == 0)
+		if (from->active < from->raid)
+			return 0;
+	if (from->spare <= 0)
+		return 0;
+	return 1;
+}
+
+/* Pick a spare of array 'from' that may be moved to array 'to'.
+ *
+ * Slots from from->raid upwards are searched for a device whose recorded
+ * state is 0 and which passes the partition, size, sector-size and domain
+ * checks.
+ * Returns the device number of the first match, or 0 when there is none.
+ */
+static dev_t choose_spare(struct state *from, struct state *to,
+			  struct domainlist *domlist, struct spare_criteria *sc)
+{
+	dev_t chosen = 0;
+	int slot;
+
+	for (slot = from->raid; !chosen && slot < MAX_DISKS; slot++) {
+		struct dev_policy *pol;
+		unsigned long long dev_size;
+		unsigned int dev_sector_size;
+
+		/* only present devices with no state bit set qualify */
+		if (!(from->devid[slot] > 0) || from->devstate[slot] != 0)
+			continue;
+
+		/* no partitions for external metadata */
+		if (to->metadata->ss->external &&
+		    test_partition_from_id(from->devid[slot]))
+			continue;
+
+		if (sc->min_size &&
+		    dev_size_from_id(from->devid[slot], &dev_size) &&
+		    dev_size < sc->min_size)
+			continue;
+
+		if (sc->sector_size &&
+		    dev_sector_size_from_id(from->devid[slot],
+					    &dev_sector_size) &&
+		    sc->sector_size != dev_sector_size)
+			continue;
+
+		/* the device must belong to a domain of the destination */
+		pol = devid_policy(from->devid[slot]);
+		if (from->spare_group)
+			pol_add(&pol, pol_domain,
+				from->spare_group, NULL);
+		if (domain_test(domlist, pol,
+				to->metadata->ss->name) == 1)
+			chosen = from->devid[slot];
+		dev_policy_free(pol);
+	}
+	return chosen;
+}
+
+/* Container counterpart of choose_spare(): devstate cannot be trusted
+ * for containers, so the metadata is read instead.
+ *
+ * When 'from' and 'to' are the same container, 'active' is the number of
+ * active disks seen earlier; if the metadata now shows more, a spare was
+ * activated in the meantime and a non-zero value is returned to stop the
+ * search.
+ * Returns a device number (or 1 in the cases above), or 0 when no spare
+ * is available.
+ */
+static dev_t container_choose_spare(struct state *from, struct state *to,
+				    struct domainlist *domlist,
+				    struct spare_criteria *sc, int active)
+{
+	struct supertype *super = from->metadata;
+	struct mdinfo *disks;
+	dev_t result = 0;
+	int load_err;
+	int fd;
+
+	fd = open(from->devname, O_RDONLY);
+	if (fd < 0)
+		return 0;
+	if (!super->ss->getinfo_super_disks) {
+		close(fd);
+		return 0;
+	}
+
+	load_err = super->ss->load_container(super, fd, NULL);
+	close(fd);
+	if (load_err)
+		return 0;
+
+	if (from == to) {
+		/* We must check if number of active disks has not increased
+		 * since ioctl in main loop. mdmon may have added spare
+		 * to subarray. If so we do not need to look for more spares
+		 * so return non zero value */
+		struct mdinfo *dp;
+		int active_now = 0;
+
+		disks = super->ss->getinfo_super_disks(super);
+		if (!disks) {
+			result = 1;
+			goto release;
+		}
+		for (dp = disks->devs; dp; dp = dp->next)
+			if (dp->disk.state & (1 << MD_DISK_SYNC) &&
+			    !(dp->disk.state & (1 << MD_DISK_FAULTY)))
+				active_now++;
+		sysfs_free(disks);
+		if (active < active_now) {
+			/* Spare just activated.*/
+			result = 1;
+			goto release;
+		}
+	}
+
+	/* We only need one spare so full list not needed */
+	disks = container_choose_spares(super, sc, domlist, from->spare_group,
+					to->metadata->ss->name, 1);
+	if (disks) {
+		struct mdinfo *first = disks->devs;
+
+		if (first)
+			result = makedev(first->disk.major, first->disk.minor);
+		sysfs_free(disks);
+	}
+
+release:
+	super->ss->free_super(super);
+	return result;
+}
+
+/* For every degraded array without a spare, try to move one spare over
+ * from another array whose domains match.
+ *
+ * statelist: all monitored arrays
+ * info:      alert delivery settings for the "MoveSpare" event
+ */
+static void try_spare_migration(struct state *statelist, struct alert_info *info)
+{
+	struct state *from;
+	struct state *st;
+	struct spare_criteria sc;
+
+	link_containers_with_subarrays(statelist);
+	for (st = statelist; st; st = st->next) {
+		struct domainlist *domlist = NULL;
+		int d;
+		struct state *to = st;
+
+		if (!(st->active < st->raid && st->spare == 0 && !st->err))
+			continue;
+
+		if (to->parent_devnm[0] && !to->parent)
+			/* subarray monitored without parent container
+			 * we can't move spares here */
+			continue;
+
+		if (to->parent)
+			/* member of a container */
+			to = to->parent;
+
+		if (!to->metadata)
+			/* the metadata handler is dereferenced below; an
+			 * array without one cannot receive a spare */
+			continue;
+
+		if (get_required_spare_criteria(to, &sc))
+			continue;
+		if (to->metadata->ss->external) {
+			/* We must make sure there is
+			 * no suitable spare in container already.
+			 * If there is we don't add more */
+			dev_t devid = container_choose_spare(
+				to, to, NULL, &sc, st->active);
+			if (devid > 0)
+				continue;
+		}
+		for (d = 0; d < MAX_DISKS; d++)
+			if (to->devid[d])
+				domainlist_add_dev(&domlist,
+						   to->devid[d],
+						   to->metadata->ss->name);
+		if (to->spare_group)
+			domain_add(&domlist, to->spare_group);
+		/*
+		 * No spare migration if the destination
+		 * has no domain. Skip this array.
+		 */
+		if (!domlist)
+			continue;
+		for (from=statelist ; from ; from=from->next) {
+			dev_t devid;
+			if (!check_donor(from, to))
+				continue;
+			if (from->metadata->ss->external)
+				devid = container_choose_spare(
+					from, to, domlist, &sc, 0);
+			else
+				devid = choose_spare(from, to, domlist,
+						     &sc);
+			if (devid > 0 &&
+			    move_spare(from->devname, to->devname,
+				       devid)) {
+				alert("MoveSpare", to->devname,
+				      from->devname, info);
+				break;
+			}
+		}
+		domain_free(domlist);
+	}
+}
+
+/* Connect the external-metadata subarrays in 'list' with their containers.
+ * All parent/subarray links are cleared first and then rebuilt from
+ * scratch, which is safest given that entries may disappear or change.
+ * A subarray is pushed onto the front of its container's subarray chain.
+ */
+static void link_containers_with_subarrays(struct state *list)
+{
+	struct state *member;
+	struct state *cont;
+
+	for (member = list; member; member = member->next) {
+		member->parent = NULL;
+		member->subarray = NULL;
+	}
+
+	for (member = list; member; member = member->next) {
+		if (!member->parent_devnm[0])
+			continue;
+
+		for (cont = list; cont; cont = cont->next) {
+			/* a container is healthy and has no parent itself */
+			if (cont->err || cont->parent_devnm[0] != 0)
+				continue;
+			if (strcmp(cont->devnm, member->parent_devnm) != 0)
+				continue;
+
+			member->parent = cont;
+			member->subarray = cont->subarray;
+			cont->subarray = member;
+			break;
+		}
+	}
+}
+
+#ifndef NO_LIBUDEV
+/* function: check_udev_activity
+ * Description: Waits, polling once per second, until the udev event queue
+ *	is empty.
+ * Returns:
+ *		1 - error while opening udev
+ *		2 - timeout
+ *		0 - successful completion
+ */
+static int check_udev_activity(void)
+{
+	struct udev *ctx;
+	struct udev_queue *queue;
+	int retries_left = 30;
+	int rc = 0;
+
+	/*
+	 * In rare cases the system may run without udev,
+	 * in such cases just exit with rc 0
+	 */
+	if (!use_udev())
+		return 0;
+
+	ctx = udev_new();
+	if (!ctx)
+		return 1;
+
+	queue = udev_queue_new(ctx);
+	if (!queue) {
+		udev_unref(ctx);
+		return 1;
+	}
+
+	if (!udev_queue_get_queue_is_empty(queue)) {
+		while (!udev_queue_get_queue_is_empty(queue)) {
+			sleep(1);
+
+			if (retries_left == 0) {
+				rc = 2;
+				break;
+			}
+			retries_left--;
+		}
+	}
+
+	udev_queue_unref(queue);
+	udev_unref(ctx);
+	return rc;
+}
+#endif
+
+/* Not really Monitor but ...
+ *
+ * Block until the md array 'dev' has no resync/recovery activity left.
+ * Returns 2 when 'dev' is not a usable md block device or sysfs cannot be
+ * set up, 1 when the array was already idle (or absent from mdstat) on the
+ * first look, and 0 when activity was seen and has since finished.
+ */
+int Wait(char *dev)
+{
+	char devnm[32];
+	dev_t rdev;
+	char *tmp;
+	int rv = 1;
+	int frozen_remaining = 3;
+
+	if (!stat_is_blkdev(dev, &rdev))
+		return 2;
+
+	tmp = devid2devnm(rdev);
+	if (!tmp) {
+		pr_err("Cannot get md device name.\n");
+		return 2;
+	}
+
+	snprintf(devnm, sizeof(devnm), "%s", tmp);
+
+	while(1) {
+		struct mdstat_ent *ms = mdstat_read(1, 0);
+		struct mdstat_ent *e;
+
+		for (e = ms; e; e = e->next)
+			if (strcmp(e->devnm, devnm) == 0)
+				break;
+
+		if (e && e->percent == RESYNC_NONE) {
+			/* We could be in the brief pause before something
+			 * starts. /proc/mdstat doesn't show that, but
+			 * sync_action does.
+			 */
+			struct mdinfo mdi;
+			char buf[21];
+
+			if (sysfs_init(&mdi, -1, devnm)) {
+				/* do not leak the mdstat list on this path */
+				free_mdstat(ms);
+				return 2;
+			}
+			if (sysfs_get_str(&mdi, NULL, "sync_action",
+					  buf, 20) > 0 &&
+			    strcmp(buf,"idle\n") != 0) {
+				e->percent = RESYNC_UNKNOWN;
+				if (strcmp(buf, "frozen\n") == 0) {
+					/* tolerate "frozen" for a few rounds
+					 * only, then treat it as idle */
+					if (frozen_remaining == 0)
+						e->percent = RESYNC_NONE;
+					else
+						frozen_remaining -= 1;
+				}
+			}
+		}
+		if (!e || e->percent == RESYNC_NONE) {
+			if (e && e->metadata_version &&
+			    strncmp(e->metadata_version, "external:", 9) == 0) {
+				if (is_subarray(&e->metadata_version[9]))
+					ping_monitor(&e->metadata_version[9]);
+				else
+					ping_monitor(devnm);
+			}
+			free_mdstat(ms);
+			return rv;
+		}
+		free_mdstat(ms);
+		rv = 0;
+		mdstat_wait(5);
+	}
+}
+
+/* Values of the sysfs "array_state" attribute that WaitClean() accepts as
+ * clean.  The list is NULL-terminated because it is handed to
+ * sysfs_match_word().
+ * The state "broken" is used only for RAID0/LINEAR - it's the same as
+ * "clean", but used in case the array has one or more members missing.
+ */
+static char *clean_states[] = {
+ "clear", "inactive", "readonly", "read-auto", "clean", "broken", NULL };
+
+/* Wait for the array 'dev' to be marked clean.
+ *
+ * Only subarrays of external-metadata containers with a non-zero safe
+ * mode delay need waiting; for every other array 0 is returned right
+ * away.  While waiting, the safe mode delay is reduced and restored
+ * afterwards.
+ *
+ * verbose: non-zero to report failures with pr_err()
+ * Returns 0 on success (and when the sysfs attributes cannot be read),
+ * 1 when the device cannot be opened or the wait failed, 2 when 'dev' is
+ * not a block device or has no md device name.
+ */
+int WaitClean(char *dev, int verbose)
+{
+	int fd;
+	struct mdinfo *mdi;
+	int rv = 1;
+	char devnm[32];
+	char *nm;
+
+	if (!stat_is_blkdev(dev, NULL))
+		return 2;
+	fd = open(dev, O_RDONLY);
+	if (fd < 0) {
+		if (verbose)
+			pr_err("Couldn't open %s: %s\n", dev, strerror(errno));
+		return 1;
+	}
+
+	/* fd2devnm() may return NULL, which must not reach a string copy */
+	nm = fd2devnm(fd);
+	if (!nm) {
+		if (verbose)
+			pr_err("Cannot get md device name.\n");
+		close(fd);
+		return 2;
+	}
+	snprintf(devnm, sizeof(devnm), "%s", nm);
+
+	mdi = sysfs_read(fd, devnm, GET_VERSION|GET_LEVEL|GET_SAFEMODE);
+	if (!mdi) {
+		if (verbose)
+			pr_err("Failed to read sysfs attributes for %s\n", dev);
+		close(fd);
+		return 0;
+	}
+
+	switch(mdi->array.level) {
+	case LEVEL_LINEAR:
+	case LEVEL_MULTIPATH:
+	case 0:
+		/* safemode delay is irrelevant for these levels */
+		rv = 0;
+	}
+
+	/* for internal metadata the kernel handles the final clean
+	 * transition, containers can never be dirty
+	 */
+	if (!is_subarray(mdi->text_version))
+		rv = 0;
+
+	/* safemode disabled ? */
+	if (mdi->safe_mode_delay == 0)
+		rv = 0;
+
+	if (rv) {
+		int state_fd = sysfs_open(devnm, NULL, "array_state");
+		char buf[20];
+		int delay = 5000;
+
+		/* minimize the safe_mode_delay and prepare to wait up to 5s
+		 * for writes to quiesce
+		 */
+		sysfs_set_safemode(mdi, 1);
+
+		/* wait for array_state to be clean */
+		while (1) {
+			/* leave room for a terminator so that the word
+			 * match never looks at stale bytes; a failed
+			 * sysfs_open() shows up here as a read error */
+			rv = read(state_fd, buf, sizeof(buf) - 1);
+			if (rv < 0)
+				break;
+			buf[rv] = 0;
+			if (sysfs_match_word(buf, clean_states) <
+			    (int)ARRAY_SIZE(clean_states) - 1)
+				break;
+			rv = sysfs_wait(state_fd, &delay);
+			if (rv < 0 && errno != EINTR)
+				break;
+			lseek(state_fd, 0, SEEK_SET);
+		}
+		if (rv < 0)
+			rv = 1;
+		else if (ping_monitor(mdi->text_version) == 0) {
+			/* we need to ping to close the window between array
+			 * state transitioning to clean and the metadata being
+			 * marked clean
+			 */
+			rv = 0;
+		} else {
+			rv = 1;
+			pr_err("Error connecting monitor with %s\n", dev);
+		}
+		if (rv && verbose)
+			pr_err("Error waiting for %s to be clean\n", dev);
+
+		/* restore the original safe_mode_delay */
+		sysfs_set_safemode(mdi, mdi->safe_mode_delay);
+		if (state_fd >= 0)
+			close(state_fd);
+	}
+
+	sysfs_free(mdi);
+	close(fd);
+
+	return rv;
+}
diff --git a/Query.c b/Query.c
new file mode 100644
index 0000000..23fbf8a
--- /dev/null
+++ b/Query.c
@@ -0,0 +1,140 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2002-2009 Neil Brown <neilb@suse.de>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email: <neilb@suse.de>
+ */
+
+#include "mdadm.h"
+#include "md_p.h"
+#include "md_u.h"
+
+int Query(char *dev)
+{
+	/* Give a brief description of the device,
+	 * whether it is an md device and whether it has
+	 * a superblock.
+	 *
+	 * Returns 1 if the device cannot be opened, otherwise 0.
+	 */
+	int fd;
+	int ioctlerr, staterr;
+	int superror;
+	int level, raid_disks, spare_disks;
+	struct mdinfo info;
+	struct mdinfo *sra;
+	struct supertype *st = NULL;
+	/* stays 0 when the size is never queried because fstat() failed */
+	unsigned long long larray_size = 0;
+	struct stat stb;
+	char *mddev;
+	mdu_disk_info_t disc;
+	char *activity;
+
+	fd = open(dev, O_RDONLY);
+	if (fd < 0){
+		pr_err("cannot open %s: %s\n", dev, strerror(errno));
+		return 1;
+	}
+
+	if (fstat(fd, &stb) < 0) {
+		staterr = errno;
+		/* st_rdev is still consulted below on this path, so give
+		 * it a defined value rather than leaving stack garbage
+		 */
+		memset(&stb, 0, sizeof(stb));
+	} else
+		staterr = 0;
+
+	ioctlerr = 0;
+
+	/* prefer sysfs; fall back to md_get_array_info() */
+	sra = sysfs_read(fd, dev, GET_DISKS | GET_LEVEL | GET_DEVS | GET_STATE);
+	if (sra) {
+		level = sra->array.level;
+		raid_disks = sra->array.raid_disks;
+		spare_disks = sra->array.spare_disks;
+		/* only these three values are needed: release the sysfs
+		 * data now rather than leaking it
+		 */
+		sysfs_free(sra);
+	} else {
+		mdu_array_info_t array;
+
+		if (md_get_array_info(fd, &array) < 0) {
+			ioctlerr = errno;
+			level = -1;
+			raid_disks = -1;
+			spare_disks = -1;
+		} else {
+			level = array.level;
+			raid_disks = array.raid_disks;
+			spare_disks = array.spare_disks;
+		}
+	}
+
+	if (!ioctlerr && !staterr) {
+		if (!get_dev_size(fd, NULL, &larray_size))
+			larray_size = 0;
+	}
+
+	/* first line of output: is this an md array at all? */
+	if (ioctlerr == ENODEV)
+		printf("%s: is an md device which is not active\n", dev);
+	else if (ioctlerr && major(stb.st_rdev) != MD_MAJOR)
+		printf("%s: is not an md array\n", dev);
+	else if (ioctlerr)
+		printf("%s: is an md device, but gives \"%s\" when queried\n",
+		       dev, strerror(ioctlerr));
+	else {
+		printf("%s: %s %s %d devices, %d spare%s. Use mdadm --detail for more detail.\n",
+		       dev, human_size_brief(larray_size,IEC),
+		       map_num(pers, level), raid_disks,
+		       spare_disks, spare_disks == 1 ? "" : "s");
+	}
+
+	/* second line of output: is this a component of some array? */
+	st = guess_super(fd);
+	if (st && st->ss->compare_super != NULL)
+		superror = st->ss->load_super(st, fd, dev);
+	else
+		superror = -1;
+	close(fd);
+	if (superror == 0) {
+		/* array might be active... */
+		int uuid[4];
+		struct map_ent *me, *map = NULL;
+		st->ss->getinfo_super(st, &info, NULL);
+		st->ss->uuid_from_super(st, uuid);
+		me = map_by_uuid(&map, uuid);
+		if (me) {
+			mddev = me->path;
+			disc.number = info.disk.number;
+			activity = "undetected";
+			if (mddev && (fd = open(mddev, O_RDONLY))>=0) {
+				if (md_array_active(fd)) {
+					if (md_get_disk_info(fd, &disc) >= 0 &&
+					    makedev((unsigned)disc.major,(unsigned)disc.minor) == stb.st_rdev)
+						activity = "active";
+					else
+						activity = "mismatch";
+				}
+				close(fd);
+			}
+		} else {
+			activity = "inactive";
+			mddev = "array";
+		}
+		/* NOTE(review): the open() above tolerates a NULL me->path
+		 * but the printf/put_md_name below do not - confirm whether
+		 * a map entry can really have no path.
+		 */
+		printf("%s: device %d in %d device %s %s %s. Use mdadm --examine for more detail.\n",
+		       dev,
+		       info.disk.number, info.array.raid_disks,
+		       activity,
+		       map_num(pers, info.array.level),
+		       mddev);
+		if (st->ss == &super0)
+			put_md_name(mddev);
+	}
+	return 0;
+}
diff --git a/README.initramfs b/README.initramfs
new file mode 100644
index 0000000..c5fa668
--- /dev/null
+++ b/README.initramfs
@@ -0,0 +1,122 @@
+Assembling md arrays at boot time.
+---------------------------------
+December 2005
+
+These notes apply to 2.6 kernels only and, in some cases,
+to 2.6.15 or later.
+
+Md arrays can be assembled at boot time using the 'autodetect' functionality
+which is triggered by storing components of an array in partitions of type
+'fd' - Linux Raid Autodetect.
+They can also be assembled by specifying the component devices in a
+kernel parameter such as
+ md=0,/dev/sda,/dev/sdb
+In this case, /dev/md0 will be assembled (because of the 0) from the listed
+devices.
+
+These mechanisms, while useful, do not provide complete functionality
+and are unlikely to be extended. The preferred way to assemble md
+arrays at boot time is using 'mdadm'. To assemble an array which
+contains the root filesystem, mdadm needs to be run before that
+filesystem is mounted, and so needs to be run from an initial-ram-fs.
+It is how this can work that is the primary focus of this document.
+
+It should be noted up front that only the array containing the root
+filesystem should be assembled from the initramfs. Any other arrays
+should be assembled under the control of files on the main filesystem
+as this enhances flexibility and maintainability.
+
+A minimal initramfs for assembling md arrays can be created using 3
+files and one directory. These are:
+
+/bin Directory
+/bin/mdadm statically linked mdadm binary
+/bin/busybox statically linked busybox binary
+/bin/sh hard link to /bin/busybox
+/init a shell script which calls mdadm appropriately.
+
+An example init script is:
+
+==============================================
+#!/bin/sh
+
+echo 'Auto-assembling boot md array'
+mkdir /proc
+mount -t proc proc /proc
+if [ -n "$rootuuid" ]
+then arg=--uuid=$rootuuid
+elif [ -n "$mdminor" ]
+then arg=--super-minor=$mdminor
+else arg=--super-minor=0
+fi
+echo "Using $arg"
+mdadm -Acpartitions $arg --auto=part /dev/mda
+cd /
+mount /dev/mda1 /root || mount /dev/mda /root
+umount /proc
+cd /root
+exec chroot . /sbin/init < /dev/console > /dev/console 2>&1
+=============================================
+
+This could certainly be extended, or merged into a larger init script.
+Though tested and in production use, it is not presented here as
+"The Right Way" to do it, but as a useful example.
+Some key points are:
+
+ /proc needs to be mounted so that /proc/partitions can be accessed
+ by mdadm, and so that /proc/filesystems can be accessed by mount.
+
+ The uuid of the array can be passed in as a kernel parameter
+ (rootuuid). As the kernel doesn't use this value, it is made available
+ in the environment for /init
+
+ If no uuid is given, we default to md0 (--super-minor=0), which is
+ commonly used to store the root filesystem. This may not work in
+ all situations.
+
+ We assemble the array as a partitionable array (/dev/mda) even if we
+ end up using the whole array. There is no cost in using the partitionable
+ interface, and in this context it is simpler.
+
+ We try mounting both /dev/mda1 and /dev/mda as they are the most likely
+ parts of the array to contain the root filesystem.
+
+ The --auto flag is given to mdadm so that it will create /dev/md*
+ files automatically. This is needed as /dev will not contain
+ any md files, and udev will not create them (as udev only creates device
+ files after the device exists, and mdadm needs the device file to create
+ the device). Note that the created md files may not exist in /dev
+ of the mounted root filesystem. This needs to be dealt with separately
+ from mdadm - possibly using udev.
+
+ We do not need to create device files for the components which will
+ be assembled into /dev/mda. mdadm finds the major/minor numbers from
+ /proc/partitions and creates a temporary /dev file if one doesn't already
+ exist.
+
+The script "mkinitramfs" which is included with the mdadm distribution
+can be used to create a minimal initramfs. It creates a file called
+'init.cpio.gz' which can be specified as an 'initrd' to lilo or grub
+(or whatever boot loader is being used).
+
+
+
+
+Resume from an md array
+-----------------------
+
+If you want to make use of the suspend-to-disk/resume functionality in Linux,
+and want to have swap on an md array, you will need to assemble the array
+before resume is possible.
+However, because the array is active in the resumed image, you do not want
+anything written to any drives during the resume process, such as superblock
+updates or array resync.
+
+This can be achieved in 2.6.15-rc1 and later kernels using the
+'start_ro' module parameter.
+Simply include the command
+ echo 1 > /sys/module/md_mod/parameters/start_ro
+before assembling the array with 'mdadm'.
+You can then echo
+ 9:0
+or whatever is appropriate to /sys/power/resume to trigger the resume.
diff --git a/ReadMe.c b/ReadMe.c
new file mode 100644
index 0000000..8139976
--- /dev/null
+++ b/ReadMe.c
@@ -0,0 +1,656 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2001-2016 Neil Brown <neilb@suse.com>
+ * Copyright (C) 2016-2017 Jes Sorensen <Jes.Sorensen@gmail.com>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email: <neilb@suse.de>
+ * Maintainer: Jes Sorensen
+ * Email: <Jes.Sorensen@gmail.com>
+ */
+
+#include "mdadm.h"
+
+/* Fallback version details, used only when the build has not already
+ * defined these macros.
+ */
+#ifndef VERSION
+#define VERSION "4.2"
+#endif
+#ifndef VERS_DATE
+#define VERS_DATE "2021-12-30"
+#endif
+#ifndef EXTRAVERSION
+#define EXTRAVERSION ""
+#endif
+/* Version banner, assembled at compile time from the macros above */
+char Version[] = "mdadm - v" VERSION " - " VERS_DATE EXTRAVERSION "\n";
+
+/*
+ * File: ReadMe.c
+ *
+ * This file contains general comments about the implementation
+ * and the various usage messages that can be displayed by mdadm
+ *
+ */
+
+/*
+ * mdadm has 8 major modes of operation:
+ * 1/ Create
+ * This mode is used to create a new array with a superblock
+ * 2/ Assemble
+ * This mode is used to assemble the parts of a previously created
+ * array into an active array. Components can be explicitly given
+ * or can be searched for. mdadm (optionally) checks that the components
+ * do form a bona-fide array, and can, on request, fiddle superblock
+ * version numbers so as to assemble a faulty array.
+ * 3/ Build
+ * This is for building legacy arrays without superblocks
+ * 4/ Manage
+ * This is for doing something to one or more devices
+ * in an array, such as add,remove,fail.
+ * run/stop/readonly/readwrite are also available
+ * 5/ Misc
+ * This is for doing things to individual devices.
+ * They might be parts of an array so
+ * zero-superblock, examine might be appropriate
+ * They might be md arrays so
+ * run,stop,rw,ro,detail might be appropriate
+ * Also query will treat it as either
+ * 6/ Monitor
+ * This mode never exits but just monitors arrays and reports changes.
+ * 7/ Grow
+ * This mode allows for changing of key attributes of a raid array, such
+ * as size, number of devices, and possibly even layout.
+ * 8/ Incremental
+ * It assembles an array incrementally instead of all at once.
+ * As devices are discovered they can be passed to "mdadm --incremental"
+ * which will collect them. When enough devices to form an array are
+ * found, it is started.
+ */
+
+/*
+ * Short-option strings, presumably in getopt(3) syntax (a letter followed
+ * by ':' takes an argument).  The three variants differ in whether 'b'
+ * and 'a' take an argument:
+ *   short_options             - neither does
+ *   short_bitmap_options      - 'b' does
+ *   short_bitmap_auto_options - both 'b' and 'a' do
+ * NOTE(review): short_options also differs from the other two in that it
+ * has no bare 'r' and its final 'k' is not followed by ':' - confirm
+ * against the option parser whether that is intentional.
+ */
+char short_options[]="-ABCDEFGIQhVXYWZ:vqbc:i:l:p:m:r:n:x:u:c:d:z:U:N:safRSow1tye:k";
+char short_bitmap_options[]=
+ "-ABCDEFGIQhVXYWZ:vqb:c:i:l:p:m:r:n:x:u:c:d:z:U:N:sarfRSow1tye:k:";
+char short_bitmap_auto_options[]=
+ "-ABCDEFGIQhVXYWZ:vqb:c:i:l:p:m:r:n:x:u:c:d:z:U:N:sa:rfRSow1tye:k:";
+
+/*
+ * Long-option table.  Each entry is {name, has_arg, flag, val}.  In this
+ * table has_arg is 0 (no argument), 1 (argument required) or 2 (argument
+ * optional), flag is always 0, and val is either a character literal or
+ * a named constant defined outside this file.  Several names are
+ * synonyms that share one val.  The all-zero entry terminates the table.
+ */
+struct option long_options[] = {
+ {"manage", 0, 0, ManageOpt},
+ {"misc", 0, 0, MiscOpt},
+ {"assemble", 0, 0, 'A'},
+ {"build", 0, 0, 'B'},
+ {"create", 0, 0, 'C'},
+ {"detail", 0, 0, 'D'},
+ {"examine", 0, 0, 'E'},
+ {"follow", 0, 0, 'F'},
+ {"grow", 0, 0, 'G'},
+ {"incremental",0,0, 'I'},
+ {"zero-superblock", 0, 0, KillOpt}, /* deliberately not a short_option */
+ {"query", 0, 0, 'Q'},
+ {"examine-bitmap", 0, 0, 'X'},
+ {"auto-detect", 0, 0, AutoDetect},
+ {"detail-platform", 0, 0, DetailPlatform},
+ {"kill-subarray", 1, 0, KillSubarray},
+ {"update-subarray", 1, 0, UpdateSubarray},
+ {"udev-rules", 2, 0, UdevRules},
+ {"offroot", 0, 0, OffRootOpt},
+ {"examine-badblocks", 0, 0, ExamineBB},
+
+ {"dump", 1, 0, Dump},
+ {"restore", 1, 0, Restore},
+
+ /* synonyms */
+ {"monitor", 0, 0, 'F'},
+
+ /* after those will normally come the name of the md device */
+
+ {"help", 0, 0, 'h'},
+ {"help-options",0,0, HelpOptions},
+ {"version", 0, 0, 'V'},
+ {"verbose", 0, 0, 'v'},
+ {"quiet", 0, 0, 'q'},
+
+ /* For create or build: */
+ {"chunk", 1, 0, ChunkSize},
+ {"rounding", 1, 0, ChunkSize}, /* for linear, chunk is really a
+ * rounding number */
+ {"level", 1, 0, 'l'}, /* 0,1,4,5,6,linear */
+ {"parity", 1, 0, Layout}, /* {left,right}-{a,}symmetric */
+ {"layout", 1, 0, Layout},
+ {"raid-disks",1, 0, 'n'},
+ {"raid-devices",1, 0, 'n'},
+ {"spare-disks",1,0, 'x'},
+ {"spare-devices",1,0, 'x'},
+ {"size", 1, 0, 'z'},
+ {"auto", 1, 0, Auto}, /* also for --assemble */
+ {"assume-clean",0,0, AssumeClean },
+ {"metadata", 1, 0, 'e'}, /* superblock format */
+ {"bitmap", 1, 0, Bitmap},
+ {"bitmap-chunk", 1, 0, BitmapChunk},
+ {"write-behind", 2, 0, WriteBehind},
+ {"write-mostly",0, 0, WriteMostly},
+ {"failfast", 0, 0, FailFast},
+ {"nofailfast",0, 0, NoFailFast},
+ {"re-add", 0, 0, ReAdd},
+ {"homehost", 1, 0, HomeHost},
+ {"symlinks", 1, 0, Symlinks},
+ {"data-offset",1, 0, DataOffset},
+ {"nodes",1, 0, Nodes}, /* also for --assemble */
+ {"home-cluster",1, 0, ClusterName},
+ {"write-journal",1, 0, WriteJournal},
+ {"consistency-policy", 1, 0, 'k'},
+
+ /* For assemble */
+ {"uuid", 1, 0, 'u'},
+ {"super-minor",1,0, SuperMinor},
+ {"name", 1, 0, 'N'},
+ {"config", 1, 0, ConfigFile},
+ {"scan", 0, 0, 's'},
+ {"force", 0, 0, Force},
+ {"update", 1, 0, 'U'},
+ {"freeze-reshape", 0, 0, FreezeReshape},
+
+ /* Management */
+ {"add", 0, 0, Add},
+ {"add-spare", 0, 0, AddSpare},
+ {"add-journal", 0, 0, AddJournal},
+ {"remove", 0, 0, Remove},
+ {"fail", 0, 0, Fail},
+ {"set-faulty",0, 0, Fail},
+ {"replace", 0, 0, Replace},
+ {"with", 0, 0, With},
+ {"run", 0, 0, 'R'},
+ {"stop", 0, 0, 'S'},
+ {"readonly", 0, 0, 'o'},
+ {"readwrite", 0, 0, 'w'},
+ {"no-degraded",0,0, NoDegraded },
+ {"wait", 0, 0, WaitOpt},
+ {"wait-clean", 0, 0, Waitclean },
+ {"action", 1, 0, Action },
+ {"cluster-confirm", 0, 0, ClusterConfirm},
+
+ /* For Detail/Examine */
+ {"brief", 0, 0, Brief},
+ {"no-devices",0, 0, NoDevices},
+ {"export", 0, 0, 'Y'},
+ {"sparc2.2", 0, 0, Sparc22},
+ {"test", 0, 0, 't'},
+ {"prefer", 1, 0, Prefer},
+
+ /* For Follow/monitor */
+ {"mail", 1, 0, EMail},
+ {"program", 1, 0, ProgramOpt},
+ {"alert", 1, 0, ProgramOpt},
+ {"increment", 1, 0, Increment},
+ {"delay", 1, 0, 'd'},
+ {"daemonise", 0, 0, Fork},
+ {"daemonize", 0, 0, Fork},
+ {"oneshot", 0, 0, '1'},
+ {"pid-file", 1, 0, 'i'},
+ {"syslog", 0, 0, 'y'},
+ {"no-sharing", 0, 0, NoSharing},
+
+ /* For Grow */
+ {"backup-file", 1,0, BackupFile},
+ {"invalid-backup",0,0,InvalidBackup},
+ {"array-size", 1, 0, 'Z'},
+ {"continue", 0, 0, Continue},
+
+ /* For Incremental */
+ {"rebuild-map", 0, 0, RebuildMapOpt},
+ {"path", 1, 0, IncrementalPath},
+
+ {0, 0, 0, 0}
+};
+
+/* Minimal usage message: just points the user at --help */
+char Usage[] =
+"Usage: mdadm --help\n"
+" for help\n"
+;
+
+/* General help text: one usage line per major mode, followed by
+ * pointers to the mode-specific and option-specific help.
+ */
+char Help[] =
+"mdadm is used for building, managing, and monitoring\n"
+"Linux md devices (aka RAID arrays)\n"
+"Usage: mdadm --create device options...\n"
+" Create a new array from unused devices.\n"
+" mdadm --assemble device options...\n"
+" Assemble a previously created array.\n"
+" mdadm --build device options...\n"
+" Create or assemble an array without metadata.\n"
+" mdadm --manage device options...\n"
+" make changes to an existing array.\n"
+" mdadm --misc options... devices\n"
+" report on or modify various md related devices.\n"
+" mdadm --grow options device\n"
+" resize/reshape an active array\n"
+" mdadm --incremental device\n"
+" add/remove a device to/from an array as appropriate\n"
+" mdadm --monitor options...\n"
+" Monitor one or more arrays for significant changes.\n"
+" mdadm device options...\n"
+" Shorthand for --manage.\n"
+"Any parameter that does not start with '-' is treated as a device name\n"
+"or, for --examine-bitmap, a file name.\n"
+"The first such name is often the name of an md device. Subsequent\n"
+"names are often names of component devices.\n"
+"\n"
+" For detailed help on the above major modes use --help after the mode\n"
+" e.g.\n"
+" mdadm --assemble --help\n"
+" For general help on options use\n"
+" mdadm --help-options\n"
+;
+
+/* Help text listing the common options and the mode-selection options
+ * (the text describes itself as the --help-options message).
+ */
+char OptionHelp[] =
+"Any parameter that does not start with '-' is treated as a device name\n"
+"or, for --examine-bitmap, a file name.\n"
+"The first such name is often the name of an md device. Subsequent\n"
+"names are often names of component devices.\n"
+"\n"
+"Some common options are:\n"
+" --help -h : General help message or, after above option,\n"
+" mode specific help message\n"
+" --help-options : This help message\n"
+" --version -V : Print version information for mdadm\n"
+" --verbose -v : Be more verbose about what is happening\n"
+" --quiet -q : Don't print un-necessary messages\n"
+" --brief -b : Be less verbose, more brief\n"
+" --export -Y : With --detail, --detail-platform or --examine use\n"
+" key=value format for easy import into environment\n"
+" --force -f : Override normal checks and be more forceful\n"
+"\n"
+" --assemble -A : Assemble an array\n"
+" --build -B : Build an array without metadata\n"
+" --create -C : Create a new array\n"
+" --detail -D : Display details of an array\n"
+" --examine -E : Examine superblock on an array component\n"
+" --examine-bitmap -X: Display the detail of a bitmap file\n"
+" --examine-badblocks: Display list of known bad blocks on device\n"
+" --monitor -F : monitor (follow) some arrays\n"
+" --grow -G : resize/reshape an array\n"
+" --incremental -I : add/remove a single device to/from an array as appropriate\n"
+" --query -Q : Display general information about how a\n"
+" device relates to the md driver\n"
+" --auto-detect : Start arrays auto-detected by the kernel\n"
+;
+/*
+"\n"
+" For create or build:\n"
+" --bitmap= -b : File to store bitmap in - may pre-exist for --build\n"
+" --chunk= -c : chunk size of kibibytes\n"
+" --rounding= : rounding factor for linear array (==chunk size)\n"
+" --level= -l : raid level: 0,1,4,5,6,10,linear, or mp for create.\n"
+" : 0,1,10,mp,faulty or linear for build.\n"
+" --parity= -p : raid5/6 parity algorithm: {left,right}-{,a}symmetric\n"
+" --layout= : same as --parity, for RAID10: [fno]NN \n"
+" --raid-devices= -n : number of active devices in array\n"
+" --spare-devices= -x: number of spare (eXtra) devices in initial array\n"
+" --size= -z : Size (in K) of each drive in RAID1/4/5/6/10 - optional\n"
+" --force -f : Honour devices as listed on command line. Don't\n"
+" : insert a missing drive for RAID5.\n"
+" --assume-clean : Assume the array is already in-sync. This is dangerous for RAID5.\n"
+" --bitmap-chunk= : chunksize of bitmap in bitmap file (Kilobytes)\n"
+" --delay= -d : seconds between bitmap updates\n"
+" --write-behind= : number of simultaneous write-behind requests to allow (requires bitmap)\n"
+" --name= -N : Textual name for array - max 32 characters\n"
+"\n"
+" For assemble:\n"
+" --bitmap= -b : File to find bitmap information in\n"
+" --uuid= -u : uuid of array to assemble. Devices which don't\n"
+" have this uuid are excluded\n"
+" --super-minor= -m : minor number to look for in super-block when\n"
+" choosing devices to use.\n"
+" --name= -N : Array name to look for in super-block.\n"
+" --config= -c : config file\n"
+" --scan -s : scan config file for missing information\n"
+" --force -f : Assemble the array even if some superblocks appear out-of-date\n"
+" --update= -U : Update superblock: try '-A --update=?' for list of options.\n"
+" --no-degraded : Do not start any degraded arrays - default unless --scan.\n"
+"\n"
+" For detail or examine:\n"
+" --brief -b : Just print device name and UUID\n"
+"\n"
+" For follow/monitor:\n"
+" --mail= -m : Address to mail alerts of failure to\n"
+" --program= -p : Program to run when an event is detected\n"
+" --alert= : same as --program\n"
+" --delay= -d : seconds of delay between polling state. default=60\n"
+"\n"
+" General management:\n"
+" --add -a : add, or hotadd subsequent devices\n"
+" --re-add : re-add a recently removed device\n"
+" --remove -r : remove subsequent devices\n"
+" --fail -f : mark subsequent devices as faulty\n"
+" --set-faulty : same as --fail\n"
+" --replace : mark a device for replacement\n"
+" --run -R : start a partially built array\n"
+" --stop -S : deactivate array, releasing all resources\n"
+" --readonly -o : mark array as readonly\n"
+" --readwrite -w : mark array as readwrite\n"
+" --zero-superblock : erase the MD superblock from a device.\n"
+" --wait -W : wait for recovery/resync/reshape to finish.\n"
+;
+*/
+
+/* Help text for --create mode: usage, behaviour notes and valid options */
+char Help_create[] =
+"Usage: mdadm --create device --chunk=X --level=Y --raid-devices=Z devices\n"
+"\n"
+" This usage will initialise a new md array, associate some\n"
+" devices with it, and activate the array. In order to create an\n"
+" array with some devices missing, use the special word 'missing' in\n"
+" place of the relevant device name.\n"
+"\n"
+" Before devices are added, they are checked to see if they already contain\n"
+" raid superblocks or filesystems. They are also checked to see if\n"
+" the variance in device size exceeds 1%.\n"
+" If any discrepancy is found, the user will be prompted for confirmation\n"
+" before the array is created. The presence of a '--run' can override this\n"
+" caution.\n"
+"\n"
+" If the --size option is given then only that many kilobytes of each\n"
+" device is used, no matter how big each device is.\n"
+" If no --size is given, the apparent size of the smallest drive given\n"
+" is used for raid level 1 and greater, and the full device is used for\n"
+" other levels.\n"
+"\n"
+" Options that are valid with --create (-C) are:\n"
+" --bitmap= -b : Create a bitmap for the array with the given filename\n"
+" : or an internal bitmap if 'internal' is given\n"
+" --chunk= -c : chunk size in kibibytes\n"
+" --rounding= : rounding factor for linear array (==chunk size)\n"
+" --level= -l : raid level: 0,1,4,5,6,10,linear,multipath and synonyms\n"
+" --parity= -p : raid5/6 parity algorithm: {left,right}-{,a}symmetric\n"
+" --layout= : same as --parity, for RAID10: [fno]NN \n"
+" --raid-devices= -n : number of active devices in array\n"
+" --spare-devices= -x : number of spare (eXtra) devices in initial array\n"
+" --size= -z : Size (in K) of each drive in RAID1/4/5/6/10 - optional\n"
+" --data-offset= : Space to leave between start of device and start\n"
+" : of array data.\n"
+" --force -f : Honour devices as listed on command line. Don't\n"
+" : insert a missing drive for RAID5.\n"
+" --run -R : insist on running the array even if not all\n"
+" : devices are present or some look odd.\n"
+" --readonly -o : start the array readonly - not supported yet.\n"
+" --name= -N : Textual name for array - max 32 characters\n"
+" --bitmap-chunk= : bitmap chunksize in Kilobytes.\n"
+" --delay= -d : bitmap update delay in seconds.\n"
+" --write-journal= : Specify journal device for RAID-4/5/6 array\n"
+" --consistency-policy= : Specify the policy that determines how the array\n"
+" -k : maintains consistency in case of unexpected shutdown.\n"
+"\n"
+;
+
+/* Help text for --build mode: usage, behaviour notes and valid options */
+char Help_build[] =
+"Usage: mdadm --build device --chunk=X --level=Y --raid-devices=Z devices\n"
+"\n"
+" This usage is similar to --create. The difference is that it creates\n"
+" a legacy array without a superblock. With these arrays there is no\n"
+" difference between initially creating the array and subsequently\n"
+" assembling the array, except that hopefully there is useful data\n"
+" there in the second case.\n"
+"\n"
+" The level may only be 0, 1, 10, linear, multipath, or faulty.\n"
+" All devices must be listed and the array will be started once complete.\n"
+" Options that are valid with --build (-B) are:\n"
+" --bitmap= : file to store/find bitmap information in.\n"
+" --chunk= -c : chunk size in kibibytes\n"
+" --rounding= : rounding factor for linear array (==chunk size)\n"
+" --level= -l : 0, 1, 10, linear, multipath, faulty\n"
+" --raid-devices= -n : number of active devices in array\n"
+" --bitmap-chunk= : bitmap chunksize in Kilobytes.\n"
+" --delay= -d : bitmap update delay in seconds.\n"
+;
+
+/* Help text for --assemble mode: usage, how the array device, identity
+ * and components are determined, and the valid options.
+ */
+char Help_assemble[] =
+"Usage: mdadm --assemble device options...\n"
+" mdadm --assemble --scan options...\n"
+"\n"
+"This usage assembles one or more raid arrays from pre-existing\n"
+"components.\n"
+"For each array, mdadm needs to know the md device, the identity of\n"
+"the array, and a number of sub devices. These can be found in a number\n"
+"of ways.\n"
+"\n"
+"The md device is given on the command line, is found listed in the\n"
+"config file, or can be deduced from the array identity.\n"
+"The array identity is determined either from the --uuid, --name, or\n"
+"--super-minor commandline arguments, from the config file,\n"
+"or from the first component device on the command line.\n"
+"\n"
+"The different combinations of these are as follows:\n"
+" If the --scan option is not given, then only devices and identities\n"
+" listed on the command line are considered.\n"
+" The first device will be the array device, and the remainder will be\n"
+" examined when looking for components.\n"
+" If an explicit identity is given with --uuid or --super-minor, then\n"
+" only devices with a superblock which matches that identity are considered,\n"
+" otherwise every device listed is considered.\n"
+"\n"
+" If the --scan option is given, and no devices are listed, then\n"
+" every array listed in the config file is considered for assembly.\n"
+" The identity of candidate devices is determined from the config file.\n"
+" After these arrays are assembled, mdadm will look for other devices\n"
+" that could form further arrays and tries to assemble them. This can\n"
+" be disabled using the 'AUTO' option in the config file.\n"
+"\n"
+" If the --scan option is given as well as one or more devices, then\n"
+" those devices are md devices that are to be assembled. Their identity\n"
+" and components are determined from the config file.\n"
+"\n"
+" If mdadm can not find all of the components for an array, it will assemble\n"
+" it but not activate it unless --run or --scan is given. To preserve this\n"
+" behaviour even with --scan, add --no-degraded. Note that \"all of the\n"
+" components\" means as many as were present the last time the array was running\n"
+" as recorded in the superblock. If the array was already degraded, and\n"
+" the missing device is not a new problem, it will still be assembled. It\n"
+" is only newly missing devices that cause the array not to be started.\n"
+"\n"
+"Options that are valid with --assemble (-A) are:\n"
+" --bitmap= : bitmap file to use with the array\n"
+" --uuid= -u : uuid of array to assemble. Devices which don't\n"
+" have this uuid are excluded\n"
+" --super-minor= -m : minor number to look for in super-block when\n"
+" choosing devices to use.\n"
+" --name= -N : Array name to look for in super-block.\n"
+" --config= -c : config file\n"
+" --scan -s : scan config file for missing information\n"
+" --run -R : Try to start the array even if not enough devices\n"
+" for a full array are present\n"
+" --force -f : Assemble the array even if some superblocks appear\n"
+" : out-of-date. This involves modifying the superblocks.\n"
+" --update= -U : Update superblock: try '-A --update=?' for option list.\n"
+" --no-degraded : Assemble but do not start degraded arrays.\n"
+" --readonly -o : Mark the array as read-only. No resync will start.\n"
+;
+
+/* Help text for management mode: usage and valid options */
+char Help_manage[] =
+"Usage: mdadm arraydevice options component devices...\n"
+"\n"
+"This usage is for managing the component devices within an array.\n"
+"The --manage option is not needed and is assumed if the first argument\n"
+"is a device name or a management option.\n"
+"The first device listed will be taken to be an md array device, any\n"
+"subsequent devices are (potential) components of that array.\n"
+"\n"
+"Options that are valid with management mode are:\n"
+" --add -a : hotadd subsequent devices to the array\n"
+" --re-add : subsequent devices are re-added if they were\n"
+" : recent members of the array\n"
+" --remove -r : remove subsequent devices, which must not be active\n"
+" --fail -f : mark subsequent devices as faulty\n"
+" --set-faulty : same as --fail\n"
+" --replace : mark device(s) to be replaced by spares. Once\n"
+" : replacement completes, device will be marked faulty\n"
+" --with : Indicate which spare a previous '--replace' should\n"
+" : prefer to use\n"
+" --run -R : start a partially built array\n"
+" --stop -S : deactivate array, releasing all resources\n"
+" --readonly -o : mark array as readonly\n"
+" --readwrite -w : mark array as readwrite\n"
+;
+
+/* Help text for miscellaneous mode: usage and valid options */
+char Help_misc[] =
+"Usage: mdadm misc_option devices...\n"
+"\n"
+"This usage is for performing some task on one or more devices, which\n"
+"may be arrays or components, depending on the task.\n"
+"The --misc option is not needed (though it is allowed) and is assumed\n"
+"if the first argument is a misc option.\n"
+"\n"
+"Options that are valid with the miscellaneous mode are:\n"
+" --query -Q : Display general information about how a\n"
+" device relates to the md driver\n"
+" --detail -D : Display details of an array\n"
+" --detail-platform : Display hardware/firmware details\n"
+" --examine -E : Examine superblock on an array component\n"
+" --examine-bitmap -X: Display contents of a bitmap file\n"
+" --examine-badblocks: Display list of known bad blocks on device\n"
+" --zero-superblock : erase the MD superblock from a device.\n"
+" --run -R : start a partially built array\n"
+" --stop -S : deactivate array, releasing all resources\n"
+" --readonly -o : mark array as readonly\n"
+" --readwrite -w : mark array as readwrite\n"
+" --test -t : exit status 0 if ok, 1 if degraded, 2 if dead, 4 if missing\n"
+" --wait -W : wait for resync/rebuild/recovery to finish\n"
+" --action= : initiate or abort ('idle' or 'frozen') a 'check' or 'repair'.\n"
+;
+
+/* Help text for monitor (follow) mode: usage and valid options */
+char Help_monitor[] =
+"Usage: mdadm --monitor options devices\n"
+"\n"
+"This usage causes mdadm to monitor a number of md arrays by periodically\n"
+"polling their status and acting on any changes.\n"
+"If any devices are listed then those devices are monitored, otherwise\n"
+"all devices listed in the config file are monitored.\n"
+"The address for mailing advisories to, and the program to handle\n"
+"each change can be specified in the config file or on the command line.\n"
+"There must be at least one destination for advisories, whether\n"
+"an email address, a program, or --syslog\n"
+"\n"
+"Options that are valid with the monitor (-F --follow) mode are:\n"
+" --mail= -m : Address to mail alerts of failure to\n"
+" --program= -p : Program to run when an event is detected\n"
+" --alert= : same as --program\n"
+" --syslog -y : Report alerts via syslog\n"
+" --increment= -r : Report RebuildNN events in the given increment. default=20\n"
+" --delay= -d : seconds of delay between polling state. default=60\n"
+" --config= -c : specify a different config file\n"
+" --scan -s : find mail-address/program in config file\n"
+" --daemonise -f : Fork and continue in child, parent exits\n"
+" --pid-file= -i : In daemon mode write pid to specified file instead of stdout\n"
+" --oneshot -1 : Check for degraded arrays, then exit\n"
+" --test -t : Generate a TestMessage event against each array at startup\n"
+;
+
+/* Usage text for grow mode; referenced from mode_help[GROW]. */
+char Help_grow[] =
+"Usage: mdadm --grow device options\n"
+"\n"
+"This usage causes mdadm to attempt to reconfigure a running array.\n"
+"This is only possible if the kernel being used supports a particular\n"
+"reconfiguration.\n"
+"\n"
+"Options that are valid with the grow (-G --grow) mode are:\n"
+" --level= -l : Tell mdadm what level to convert the array to.\n"
+" --layout= -p : For a FAULTY array, set/change the error mode.\n"
+" : for other arrays, update the layout\n"
+" --size= -z : Change the active size of devices in an array.\n"
+" : This is useful if all devices have been replaced\n"
+" : with larger devices. Value is in Kilobytes, or\n"
+" : the special word 'max' meaning 'as large as possible'.\n"
+" --assume-clean : When increasing the --size, this flag will avoid\n"
+" : a resync of the new space\n"
+" --chunk= -c : Change the chunksize of the array\n"
+" --raid-devices= -n : Change the number of active devices in an array.\n"
+" --add= -a : Add listed devices as part of reshape. This is\n"
+" : needed for resizing a RAID0 which cannot have\n"
+" : spares already present.\n"
+" --bitmap= -b : Add or remove a write-intent bitmap.\n"
+" --backup-file= file : A file on a different device to store data for a\n"
+" : short time while increasing raid-devices on a\n"
+" : RAID4/5/6 array. Also needed throughout a reshape\n"
+" : when changing parameters other than raid-devices\n"
+" --array-size= -Z : Change visible size of array. This does not change any\n"
+" : data on the device, and is not stable across restarts.\n"
+" --data-offset= : Location on device to move start of data to.\n"
+" --consistency-policy= : Change the consistency policy of an active array.\n"
+" -k : Currently works only for PPL with RAID5.\n"
+;
+
+/* Usage text for incremental assembly; referenced from mode_help[INCREMENTAL]. */
+char Help_incr[] =
+"Usage: mdadm --incremental [-Rqrsf] device\n"
+"\n"
+"This usage allows for incremental assembly of md arrays. Devices can be\n"
+"added one at a time as they are discovered. Once an array has all expected\n"
+"devices, it will be started.\n"
+"\n"
+"Optionally, the process can be reversed by using the fail option.\n"
+"When fail mode is invoked, mdadm will see if the device belongs to an array\n"
+"and then both fail (if needed) and remove the device from that array.\n"
+"\n"
+"Options that are valid with incremental assembly (-I --incremental) are:\n"
+" --run -R : Run arrays as soon as a minimal number of devices are\n"
+" : present rather than waiting for all expected.\n"
+" --quiet -q : Don't print any information messages, just errors.\n"
+" --rebuild-map -r : Rebuild the 'map' file that mdadm uses for tracking\n"
+" : partial arrays.\n"
+" --scan -s : Use with -R to start any arrays that have the minimal\n"
+" : required number of devices, but are not yet started.\n"
+" --fail -f : First fail (if needed) and then remove device from\n"
+" : any array that it is a member of.\n"
+;
+
+/* Help text describing the layout of the /etc/mdadm.conf config file. */
+char Help_config[] =
+"The /etc/mdadm.conf config file:\n\n"
+" The config file contains, apart from blank lines and comment lines that\n"
+" start with a hash(#), array lines, device lines, and various\n"
+" configuration lines.\n"
+" Each line is constructed of a number of space separated words, and can\n"
+" be continued on subsequent physical lines by indenting those lines.\n"
+"\n"
+" A device line starts with the word 'device' and then has a number of words\n"
+" which identify devices. These words should be names of devices in the\n"
+" filesystem, and can contain wildcards. There can be multiple words on each\n"
+" device line, and multiple device lines. All devices so listed are checked\n"
+" for relevant super blocks when assembling arrays.\n"
+"\n"
+" An array line starts with the word 'array'. This is followed by the name of\n"
+" the array device in the filesystem, e.g. '/dev/md2'. Subsequent words\n"
+" describe the identity of the array, used to recognise devices to include in the\n"
+" array. The identity can be given as a UUID with a word starting 'uuid=', or\n"
+" as a minor-number stored in the superblock using 'super-minor=', or as a list\n"
+" of devices. This is given as a comma separated list of names, possibly\n"
+" containing wildcards, preceded by 'devices='. If multiple criteria are given,\n"
+" then a device must match all of them to be considered.\n"
+"\n"
+" Other configuration lines include:\n"
+" mailaddr, mailfrom, program used for --monitor mode\n"
+" create, auto used when creating device names in /dev\n"
+" homehost, policy, part-policy used to guide policy in various\n"
+" situations\n"
+"\n"
+;
+
+/*
+ * Help text for each mode, indexed by the mode constant.
+ * Slot 0 holds the general Help text; slots without an initialiser
+ * are left NULL by the designated-initialiser rules.
+ */
+char *mode_help[mode_count] = {
+ [0] = Help,
+ [ASSEMBLE] = Help_assemble,
+ [BUILD] = Help_build,
+ [CREATE] = Help_create,
+ [MANAGE] = Help_manage,
+ [MISC] = Help_misc,
+ [MONITOR] = Help_monitor,
+ [GROW] = Help_grow,
+ [INCREMENTAL] = Help_incr,
+};
diff --git a/TODO b/TODO
new file mode 100644
index 0000000..279d20d
--- /dev/null
+++ b/TODO
@@ -0,0 +1,213 @@
+ - add 'name' field to metadata type and use it.
+ - use validate_geometry more
+ - metadata should be able to check/reject bitmap stuff.
+
+DDF:
+ Three new metadata types:
+ ddf - used only to create a container.
+ ddf-bvd - used to create an array in a container
+ ddf-svd - used to create a secondary array from bvds.
+
+ Usage:
+ mdadm -C /dev/ddf1 /dev/sd[abcdef]
+ mdadm -C /dev/md1 -e ddf /dev/sd[a-f]
+ mdadm -C /dev/md1 -l container /dev/sd[a-f]
+
+ Each of these creates a new ddf container using all those
+ devices. The name 'ddf*' signals that ddf metadata should be used.
+ '-e ddf' only supports one level - 'container'. 'container' is only
+ supported by ddf.
+
+ mdadm -C /dev/md1 -l0 -n4 /dev/ddf1 # or maybe not ???
+ mdadm -C /dev/md1 -l1 -n2 /dev/sda /dev/sdb
+ If exactly one device is given, and it is a container, we select
+ devices from that container.
+ If devices are given that are already in use, they must be in use by
+ a container, and the array is created in the container.
+ If devices given are bvds, we slip under the hood to make
+ the svd arrays.
+
+ mdadm -A /dev/ddf ......
+ base drives make a container. Anything in that container is started
+ auto-read-only.
+ if /dev/ddf is already assembled, we assemble bvds and svds inside it.
+
+
+2005-dec-20
+ Want an incremental assembly mode to work nicely with udev.
+ Core usage would be something like
+ mdadm --incr-assemble /dev/newdevice
+ This would
+ - examine the device to determine uuid etc.
+ - look for a match in /etc/mdadm.conf, abort if not found
+ - find that device and collect current contents
+ - perform an 'assemble' analysis to make sure we have the best set of devices.
+ - remove or add devices as appropriate
+ - possibly start the array if it was complete
+
+ Other usages could involve
+ - specify which array to auto-add to.
+ This requires an existing array for uuid matching... is there any point?
+
+ -
+
+
+2004-june-02
+ * Don't print 'errors' flag, it is meaningless. DONE
+ * Handle new superblock format
+ * create device file on demand, particularly partitionable devices. DONE
+ BUT figure a way to create the partition devices.
+ auto=partN
+ * Use Event: interface to listen for events. DONE, untested
+ * Make sure mdadm -As can assemble multi-level RAIDs ok.
+ * --build to build raid1 or multipath arrays
+ clean or not ???
+
+----------------------------------------------------------------------------
+* mdadm --monitor to monitor failed multipath paths and re-instate them.
+
+* Maybe make "--help" fit in 80x24 and have a --long-help with more info. DONE
+
+
+* maybe "missing" instead of <bold>missing</> in doco DONE
+* possibly wait for resync to start, or even finish while assembling.- NO
+
+* -Db should have a devices= entry if possible. - DONE
+* when assembling multipath arrays, ignore any error indicators. - DONE
+* rationalise --monitor usage:
+ mdadm --monitor
+ doesn't do as expected. DONE
+
+* --assemble could have a --update option. - DONE
+ following word can be:
+ sparc2.2
+ super-minor
+
+* mdadm /dev/md11, where md11 is raid0 can segfault, particularly when looking in the
+ [UU_UUU] string ... which doesn't exist !
+It should be more sensible. DONE
+
+Example:
+
+from Raimund Sacherer <raimund.sacherer@ngit.at>
+
+mke2fs -m0 -q /dev/ram1 300
+mount -n -t ext2 /dev/ram1 /tmp
+echo DEVICE /dev/[sh]* >> /tmp/mdadm.conf
+mdadm -Esb /dev/[sh]* 2>/dev/null >> /tmp/mdadm.conf
+mdadm -ARsc /tmp/mdadm.conf
+umount /tmp
+
+
+?? Allow -S /dev/md? - current complains subsequent not a/d/r - DONE
+
+* new "Query" mode to subsume --detail and --examine.
+ --query or -Q, takes a device and tells if it is an MD device,
+ and also tells if a raid superblock is found.
+ DONE
+
+* write mdstat.c to parse /proc/mdstat file
+ Build list of arrays: name, rebuild-percent
+ DONE
+
+* parse /proc/partitions and map major/minor into /dev/* names,
+ and use that for default DEVICE list ????
+
+* --detail --scan to read /proc/mdstat, and then iterate over these,
+ but assume --brief. --verbose can override
+ check each subdevice to see if it is in conf_get_devs.
+ Warn if not.
+ DONE, but don't warn yet...
+
+* Support multipath ... maybe...
+ maybe DONE
+
+* --follow to syslog
+
+* --follow to move spares around DONE
+
+* --follow to notice other events: DONE
+ rebuild started
+ spare activated
+ spare removed
+ spare added
+
+------------------------------------
+- --examine --scan scans all drives and build an mdadm.conf file DONE
+
+- check superblock checksum in examine DONE
+- report "chunk" or "rounding" depending on raid level DONE
+- report "linear" instead of "-1" for raid level DONE
+- decode layout depending on raid level DONE
+- --verbose and --force flags. DONE
+
+- set md_minor, *_disks for Create - DONE
+- for create raid5, how to choose between
+ all working, but not insync
+ one missing, one spare, insync DONE (--force)
+- and for raid1 - some failed drives... (missing)
+
+- when RUN_ARRAY, make sure *_disks counts are right
+
+- get --detail to extract extra stuff from superblock,
+ like uuid DONE
+- --detail --brief to give a config file line DONE
+- parse config file. DONE
+- test...
+
+- when --assemble --scan, if an underlying device is an md device,
+ then try to assemble that device first.
+
+
+- mdadm -S /dev/md0 /dev/md1 gives internal error FIXED
+
+- mdadm --detail --scan print summary of what it can find? DONE
+
+
+---------
+Assemble doesn't add spares. - DONE
+Create to allow "missing" name for devices.
+Create to accept "--force" to do exactly what is requested
+- get Assemble to upgrade devices if force flag.
+ARRAY lines in config file to have super_minor=n
+ARRAY lines in config file to have device=pattern, and only accept
+ those devices
+ If UUID given, insist on that
+ If not, but super_minor given, require all found with that minor
+ to have same uuid
+ If only device given, all valid supers on those devices must have
+ same uuid
+allow /dev/mdX as first argument before any options
+Possible --dry-run option for create and assemble--force
+
+Assemble to check that all devices mentioned in superblock
+ are present.
+
+New mode: --Monitor (or --Follow)
+ Periodically check status of all arrays (listed in config file).
+ Log every event and apparent cause - or differences
+ Email and alert - or run a program - for important events
+ Move spares around if necessary.
+
+ An Array line can have a spare-group= field that indicates that
+ the array shares spares with other arrays with the same
+ spare-group name.
+ If an array has a failed device and no spares, then check all other
+ arrays in the spare group. If one has no failures and a spare,
+ then consider that spare.
+ Choose the smallest considered spare that is large enough.
+ If there is one, then hot-remove it from its home, and
+ hot-add it to the array in question.
+
+ --mail-to address
+ --alert-handler program
+
+ Will also extract information from /proc/mdstat if present,
+ and consider 20% marks in rebuild as events.
+
+ Events are:
+ drive fails - causes mail to be sent
+ rebuild started
+ spare activated
+ spare removed
+ spare added
diff --git a/bitmap.c b/bitmap.c
new file mode 100644
index 0000000..9a7ffe3
--- /dev/null
+++ b/bitmap.c
@@ -0,0 +1,534 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2004 Paul Clements, SteelEye Technology, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#include "mdadm.h"
+
+/*
+ * Convert every multi-byte field of a bitmap superblock from
+ * little-endian to CPU byte order, in place.
+ * The uuid is a plain byte array and is deliberately left untouched.
+ */
+static inline void sb_le_to_cpu(bitmap_super_t *sb)
+{
+	/* 64-bit fields */
+	sb->events = __le64_to_cpu(sb->events);
+	sb->events_cleared = __le64_to_cpu(sb->events_cleared);
+	sb->sync_size = __le64_to_cpu(sb->sync_size);
+
+	/* 32-bit fields */
+	sb->magic = __le32_to_cpu(sb->magic);
+	sb->version = __le32_to_cpu(sb->version);
+	sb->state = __le32_to_cpu(sb->state);
+	sb->chunksize = __le32_to_cpu(sb->chunksize);
+	sb->daemon_sleep = __le32_to_cpu(sb->daemon_sleep);
+	sb->write_behind = __le32_to_cpu(sb->write_behind);
+	sb->sectors_reserved = __le32_to_cpu(sb->sectors_reserved);
+	sb->nodes = __le32_to_cpu(sb->nodes);
+}
+
+/*
+ * Convert a bitmap superblock from CPU byte order to little-endian,
+ * in place. Implemented by calling sb_le_to_cpu(), as the same
+ * per-field conversion serves both directions.
+ */
+static inline void sb_cpu_to_le(bitmap_super_t *sb)
+{
+ sb_le_to_cpu(sb); /* these are really the same thing */
+}
+
+/*
+ * Printable names for the bitmap state numbers, looked up by
+ * bitmap_state() via map_num(). The { NULL, -1 } entry ends the table.
+ */
+mapping_t bitmap_states[] = {
+ { "OK", 0 },
+ { "Out of date", 2 },
+ { NULL, -1 }
+};
+
+/*
+ * Return the printable name of a bitmap state number, or "Unknown"
+ * when map_num() finds no entry for it in bitmap_states.
+ */
+static const char *bitmap_state(int state_num)
+{
+	char *name = map_num(bitmap_states, state_num);
+
+	if (name == NULL)
+		return "Unknown";
+	return name;
+}
+
+/*
+ * Format a byte count as a short human-readable string such as "64 KB".
+ * The value is divided by 1024 (rounding down) until it drops below 1024
+ * or the largest known suffix (TB) is reached.
+ *
+ * Returns a pointer to a static buffer which is overwritten by the next
+ * call, so the result must be used or copied before calling again.
+ */
+static const char *human_chunksize(unsigned long bytes)
+{
+	static char buf[16];
+	static const char *const suffixes[] = { "B", "KB", "MB", "GB", "TB" };
+	const unsigned int last = sizeof(suffixes) / sizeof(suffixes[0]) - 1;
+	unsigned int i = 0;
+
+	/* Stop at the last suffix: with a 64-bit unsigned long an
+	 * unbounded loop could otherwise index past the end of the table.
+	 */
+	while ((bytes >> 10) && i < last) {
+		bytes >>= 10;
+		i++;
+	}
+
+	snprintf(buf, sizeof(buf), "%lu %s", bytes, suffixes[i]);
+
+	return buf;
+}
+
+/* What bitmap_fd_read() returns: the superblock plus the bit counts. */
+typedef struct bitmap_info_s {
+ bitmap_super_t sb; /* superblock, converted to CPU byte order by bitmap_fd_read() */
+ unsigned long long total_bits; /* bits counted; 0 when the bitmap body was not read */
+ unsigned long long dirty_bits; /* how many of the counted bits were set */
+} bitmap_info_t;
+
+/*
+ * Count how many of the low num_bits bits of byte are set.
+ * Any num_bits outside 1..8 yields 0.
+ */
+static inline int count_dirty_bits_byte(char byte, int num_bits)
+{
+	int bit, dirty = 0;
+
+	if (num_bits < 1 || num_bits > 8)
+		return 0;
+
+	for (bit = 0; bit < num_bits; bit++)
+		if (byte & (1 << bit))
+			dirty++;
+
+	return dirty;
+}
+
+/*
+ * Count the set bits in the first num_bits bits of buf.
+ * Whole bytes are counted in full; when num_bits is not a multiple
+ * of 8, only the low (num_bits % 8) bits of the following byte count.
+ */
+static int count_dirty_bits(char *buf, int num_bits)
+{
+	int whole_bytes = num_bits / 8;
+	int tail_bits = num_bits % 8;
+	int idx = 0, dirty = 0;
+
+	while (idx < whole_bytes) {
+		dirty += count_dirty_bits_byte(buf[idx], 8);
+		idx++;
+	}
+
+	/* not an even byte boundary: look at part of one more byte */
+	if (tail_bits != 0)
+		dirty += count_dirty_bits_byte(buf[idx], tail_bits);
+
+	return dirty;
+}
+
+/*
+ * Read a bitmap superblock, and optionally the bitmap body, from fd.
+ *
+ * fd:    descriptor to read from, starting at its current offset.
+ * brief: when non-zero only the superblock is read and both bit
+ *        counts in the result are left at 0.
+ *
+ * Returns a newly allocated bitmap_info_t which the caller must free(),
+ * or NULL if the buffer could not be allocated or a complete superblock
+ * could not be read.
+ */
+static bitmap_info_t *bitmap_fd_read(int fd, int brief)
+{
+	/* Note: fd might be open O_DIRECT, so we must be
+	 * careful to align reads properly
+	 */
+	unsigned long long total_bits = 0, read_bits = 0, dirty_bits = 0;
+	bitmap_info_t *info;
+	void *buf;
+	ssize_t n;		/* signed, so a read() error (-1) is visible */
+	size_t avail, skip;
+
+	if (posix_memalign(&buf, 4096, 8192) != 0) {
+		pr_err("failed to allocate 8192 bytes\n");
+		return NULL;
+	}
+
+	/* Check the result before anything else can disturb errno. */
+	n = read(fd, buf, 8192);
+	if (n < 0 || (size_t)n < sizeof(info->sb)) {
+		pr_err("failed to read superblock of bitmap file: %s\n",
+		       n < 0 ? strerror(errno) : "short read");
+		free(buf);
+		return NULL;
+	}
+	avail = n;
+
+	info = xmalloc(sizeof(*info));
+	memcpy(&info->sb, buf, sizeof(info->sb));
+	skip = sizeof(info->sb);
+
+	sb_le_to_cpu(&info->sb); /* convert superblock to CPU byte ordering */
+
+	if (brief || info->sb.sync_size == 0 || info->sb.chunksize == 0)
+		goto out;
+
+	/* read the rest of the file counting total bits and dirty bits --
+	 * we stop when either:
+	 * 1) we hit EOF (or a read error), in which case the file is
+	 *    reported as truncated and only the bits read are counted
+	 * 2) we've read the full bitmap, in which case we ignore any trailing
+	 *    data in the file
+	 */
+	total_bits = bitmap_bits(info->sb.sync_size, info->sb.chunksize);
+
+	while (read_bits < total_bits) {
+		unsigned long long remaining = total_bits - read_bits;
+
+		if (avail == 0) {
+			n = read(fd, buf, 8192);
+			skip = 0;
+			if (n <= 0)
+				break;
+			avail = n;
+		}
+		if (remaining > (avail - skip) * 8) /* we want the full buffer */
+			remaining = (avail - skip) * 8;
+
+		dirty_bits += count_dirty_bits((char *)buf + skip, remaining);
+
+		read_bits += remaining;
+		avail = 0;	/* buffer consumed; refill on the next pass */
+	}
+
+	if (read_bits < total_bits) { /* file truncated... */
+		pr_err("WARNING: bitmap file is not large enough for array size %llu!\n\n",
+		       (unsigned long long)info->sb.sync_size);
+		total_bits = read_bits;
+	}
+out:
+	free(buf);
+	info->total_bits = total_bits;
+	info->dirty_bits = dirty_bits;
+	return info;
+}
+
+/*
+ * Open a bitmap file, or position a block device at its internal bitmap.
+ *
+ * filename: path to open when fd is negative; also used in messages.
+ * stp:      in/out metadata handler. For a block device with *stp NULL
+ *           the result of guess_super() is stored back through it.
+ * node_num: passed through to the metadata handler's locate_bitmap().
+ * fd:       an already open descriptor to reuse, or negative to open
+ *           filename read-only with O_DIRECT.
+ *
+ * Returns the descriptor on success, or -1 on failure. On every failure
+ * path the descriptor is closed here, including one supplied by the caller.
+ */
+static int
+bitmap_file_open(char *filename, struct supertype **stp, int node_num, int fd)
+{
+ struct stat stb;
+ struct supertype *st = *stp;
+
+ /* won't re-open filename when (fd >= 0) */
+ if (fd < 0)
+ fd = open(filename, O_RDONLY|O_DIRECT);
+ if (fd < 0) {
+ pr_err("failed to open bitmap file %s: %s\n",
+ filename, strerror(errno));
+ return -1;
+ }
+
+ if (fstat(fd, &stb) < 0) {
+ pr_err("fstat failed for %s: %s\n", filename, strerror(errno));
+ close(fd);
+ return -1;
+ }
+ if ((stb.st_mode & S_IFMT) == S_IFBLK) {
+ /* block device, so we are probably after an internal bitmap */
+ if (!st)
+ st = guess_super(fd);
+ if (!st) {
+ /* just look at device... */
+ lseek(fd, 0, 0);
+ } else if (!st->ss->locate_bitmap) {
+ pr_err("No bitmap possible with %s metadata\n",
+ st->ss->name);
+ close(fd);
+ return -1;
+ } else {
+ /* a non-zero return from locate_bitmap() is treated as "no bitmap" */
+ if (st->ss->locate_bitmap(st, fd, node_num)) {
+ pr_err("%s doesn't have bitmap\n", filename);
+ close(fd);
+ fd = -1;
+ }
+ }
+ /* hand back the handler, even when locate_bitmap() failed above */
+ *stp = st;
+ }
+
+ return fd;
+}
+
+/* Return l with its four bytes in reverse order. */
+static __u32 swapl(__u32 l)
+{
+	return ((l & 0x000000ffU) << 24) |
+	       ((l & 0x0000ff00U) << 8) |
+	       ((l & 0x00ff0000U) >> 8) |
+	       ((l & 0xff000000U) >> 24);
+}
+/*
+ * Read the bitmap file (or a block device's internal bitmap) named by
+ * filename and print its contents to stdout.
+ *
+ * filename: bitmap file or block device to examine.
+ * brief:    when non-zero the bitmap body is not read and the
+ *           "Bitmap :" summary lines are not printed.
+ * st:       metadata handler, or NULL (bitmap_file_open() then tries
+ *           guess_super() for block devices).
+ *
+ * For a clustered bitmap (sb->nodes != 0) one section is printed per
+ * node slot; a slot that cannot be opened or read is reported and skipped.
+ *
+ * Returns 0 once a superblock with a known version has been read,
+ * 1 if the file could not be opened or read or the version is unknown.
+ */
+int ExamineBitmap(char *filename, int brief, struct supertype *st)
+{
+	bitmap_super_t *sb;
+	bitmap_info_t *info;
+	int rv = 1;
+	char buf[64];
+	int swap;
+	int fd, i;
+	int nodes;
+	__u32 uuid32[4];
+
+	fd = bitmap_file_open(filename, &st, 0, -1);
+	if (fd < 0)
+		return rv;
+
+	info = bitmap_fd_read(fd, brief);
+	if (!info) {
+		close(fd);	/* don't leak the descriptor */
+		return rv;
+	}
+	sb = &info->sb;
+	/* NOTE(review): this message and the "invalid bitmap magic" one
+	 * below are triggered by the same condition - confirm intended.
+	 */
+	if (sb->magic != BITMAP_MAGIC) {
+		pr_err("This is an md array. To view a bitmap you need to examine\n");
+		pr_err("a member device, not the array.\n");
+		pr_err("Reporting bitmap that would be used if this array were used\n");
+		pr_err("as a member of some other array\n");
+	}
+	printf(" Filename : %s\n", filename);
+	printf(" Magic : %08x\n", sb->magic);
+	if (sb->magic != BITMAP_MAGIC) {
+		pr_err("invalid bitmap magic 0x%x, the bitmap file appears\n",
+		       sb->magic);
+		pr_err("to be corrupted or missing.\n");
+	}
+	printf(" Version : %d\n", sb->version);
+	if (sb->version < BITMAP_MAJOR_LO ||
+	    sb->version > BITMAP_MAJOR_CLUSTERED) {
+		pr_err("unknown bitmap version %d, either the bitmap file\n",
+		       sb->version);
+		pr_err("is corrupted or you need to upgrade your tools\n");
+		goto free_info;
+	}
+
+	rv = 0;
+	if (st)
+		swap = st->ss->swapuuid;
+	else
+#if __BYTE_ORDER == BIG_ENDIAN
+		swap = 0;
+#else
+		swap = 1;
+#endif
+	memcpy(uuid32, sb->uuid, 16);
+	if (swap)
+		printf(" UUID : %08x:%08x:%08x:%08x\n",
+		       swapl(uuid32[0]),
+		       swapl(uuid32[1]),
+		       swapl(uuid32[2]),
+		       swapl(uuid32[3]));
+	else
+		printf(" UUID : %08x:%08x:%08x:%08x\n",
+		       uuid32[0],
+		       uuid32[1],
+		       uuid32[2],
+		       uuid32[3]);
+
+	if (sb->nodes == 0) {
+		printf(" Events : %llu\n", (unsigned long long)sb->events);
+		printf(" Events Cleared : %llu\n", (unsigned long long)sb->events_cleared);
+		printf(" State : %s\n", bitmap_state(sb->state));
+
+	}
+
+	printf(" Chunksize : %s\n", human_chunksize(sb->chunksize));
+	printf(" Daemon : %ds flush period\n", sb->daemon_sleep);
+	if (sb->write_behind)
+		sprintf(buf, "Allow write behind, max %d", sb->write_behind);
+	else
+		sprintf(buf, "Normal");
+	printf(" Write Mode : %s\n", buf);
+	printf(" Sync Size : %llu%s\n", (unsigned long long)sb->sync_size/2,
+	       human_size(sb->sync_size * 512));
+
+	if (sb->nodes == 0) {
+		if (brief)
+			goto free_info;
+		printf(" Bitmap : %llu bits (chunks), %llu dirty (%2.1f%%)\n",
+		       info->total_bits, info->dirty_bits,
+		       100.0 * info->dirty_bits / (info->total_bits?:1));
+	} else {
+		printf(" Cluster nodes : %d\n", sb->nodes);
+		printf(" Cluster name : %-64s\n", sb->cluster_name);
+		/* Remember the node count: info (and so sb) is freed at the
+		 * top of every pass, so sb must not be read in the loop test.
+		 */
+		nodes = (int)sb->nodes;
+		for (i = 0; i < nodes; i++) {
+			st = NULL;
+			free(info);
+			info = NULL;	/* no double free if this slot is skipped */
+			fd = bitmap_file_open(filename, &st, i, fd);
+			if (fd < 0) {
+				printf(" Unable to open bitmap file on node: %i\n", i);
+				continue;
+			}
+			info = bitmap_fd_read(fd, brief);
+			if (!info) {
+				printf(" Unable to read bitmap on node: %i\n", i);
+				continue;
+			}
+			sb = &info->sb;
+			if (sb->magic != BITMAP_MAGIC)
+				pr_err("invalid bitmap magic 0x%x, the bitmap file appears to be corrupted\n", sb->magic);
+
+			printf(" Node Slot : %d\n", i);
+			printf(" Events : %llu\n",
+			       (unsigned long long)sb->events);
+			printf(" Events Cleared : %llu\n",
+			       (unsigned long long)sb->events_cleared);
+			printf(" State : %s\n", bitmap_state(sb->state));
+			if (brief)
+				continue;
+			printf(" Bitmap : %llu bits (chunks), %llu dirty (%2.1f%%)\n",
+			       info->total_bits, info->dirty_bits,
+			       100.0 * info->dirty_bits / (info->total_bits?:1));
+		}
+	}
+
+free_info:
+	/* fd is -1 here if the last bitmap_file_open() in the loop failed */
+	if (fd >= 0)
+		close(fd);
+	free(info);
+	return rv;
+}
+
+/*
+ * Report whether any per-node bitmap found via filename has dirty bits.
+ *
+ * The first superblock is read only to learn sb->nodes; each node slot
+ * is then opened and read in turn. When sb->nodes is 0 the loop body
+ * never runs and 0 is returned.
+ */
+int IsBitmapDirty(char *filename)
+{
+ /*
+ * Read the bitmap file
+ * It will break reading bitmap action immediately when meeting any error.
+ *
+ * Return: 1(dirty), 0 (clean), -1(error)
+ */
+
+ int fd = -1, rv = 0, i;
+ struct supertype *st = NULL;
+ bitmap_info_t *info = NULL;
+ bitmap_super_t *sb = NULL;
+
+ fd = bitmap_file_open(filename, &st, 0, fd);
+ /* st may have been set by bitmap_file_open(); it is not needed here */
+ free(st);
+ if (fd < 0)
+ goto out;
+
+ info = bitmap_fd_read(fd, 0);
+ if (!info) {
+ close(fd);
+ goto out;
+ }
+
+ sb = &info->sb;
+ for (i = 0; i < (int)sb->nodes; i++) {
+ st = NULL;
+ /* drop the previous result; sb is re-pointed below before it is read again */
+ free(info);
+ info = NULL;
+
+ /* fd is still open here, so it is reused rather than re-opened */
+ fd = bitmap_file_open(filename, &st, i, fd);
+ free(st);
+ if (fd < 0)
+ goto out;
+
+ info = bitmap_fd_read(fd, 0);
+ if (!info) {
+ close(fd);
+ goto out;
+ }
+
+ sb = &info->sb;
+ if (sb->magic != BITMAP_MAGIC) { /* invalid bitmap magic */
+ free(info);
+ close(fd);
+ goto out;
+ }
+
+ /* keep scanning the remaining nodes even once one is found dirty */
+ if (info->dirty_bits)
+ rv = 1;
+ }
+ close(fd);
+ free(info);
+ return rv;
+out:
+ /* every path that jumps here has already released fd and info */
+ return -1;
+}
+
+/*
+ * Create a bitmap file with a superblock and (optionally) a full bitmap.
+ *
+ * filename:     file to create; an existing file is refused unless force.
+ * force:        non-zero allows an existing file to be overwritten.
+ * uuid:         16 bytes copied into the superblock, or NULL to leave zero.
+ * chunksize:    bitmap chunk size, or UnSet to pick one: start at
+ *               DEFAULT_BITMAP_CHUNK and double while array_size exceeds
+ *               chunksize << (20-9).
+ * daemon_sleep, write_behind: stored in the superblock as given.
+ * array_size:   size in sectors, stored as sync_size.
+ * major:        stored as the superblock version.
+ *
+ * Every bit of the bitmap body is written as 1.
+ *
+ * Returns 0 on success and 1 on failure. If a failure happens after the
+ * file has been opened, the file is removed again.
+ */
+int CreateBitmap(char *filename, int force, char uuid[16],
+		 unsigned long chunksize, unsigned long daemon_sleep,
+		 unsigned long write_behind,
+		 unsigned long long array_size /* sectors */,
+		 int major)
+{
+	FILE *fp;
+	int rv = 1;
+	char block[512];
+	bitmap_super_t sb;
+	long long bytes, filesize;
+
+	if (!force && access(filename, F_OK) == 0) {
+		pr_err("bitmap file %s already exists, use --force to overwrite\n", filename);
+		return rv;
+	}
+
+	fp = fopen(filename, "w");
+	if (fp == NULL) {
+		pr_err("failed to open bitmap file %s: %s\n",
+		       filename, strerror(errno));
+		return rv;
+	}
+
+	if (chunksize == UnSet) {
+		/* We don't want more than 2^21 chunks, as 2^11 fill up one
+		 * 4K page (2 bytes per chunk), and 2^10 address of those
+		 * fill up a 4K indexing page. 2^20 might be safer, especially
+		 * on 64bit hosts, so use that.
+		 */
+		chunksize = DEFAULT_BITMAP_CHUNK;
+		/* <<20 for 2^20 chunks, >>9 to convert bytes to sectors */
+		while (array_size > ((unsigned long long)chunksize << (20-9)))
+			chunksize <<= 1;
+	}
+
+	memset(&sb, 0, sizeof(sb));
+	sb.magic = BITMAP_MAGIC;
+	sb.version = major;
+	if (uuid != NULL)
+		memcpy(sb.uuid, uuid, 16);
+	sb.chunksize = chunksize;
+	sb.daemon_sleep = daemon_sleep;
+	sb.write_behind = write_behind;
+	sb.sync_size = array_size;
+
+	sb_cpu_to_le(&sb); /* convert to on-disk byte ordering */
+
+	if (fwrite(&sb, sizeof(sb), 1, fp) != 1) {
+		pr_err("failed to write superblock to bitmap file %s: %s\n", filename, strerror(errno));
+		goto out;
+	}
+
+	/* calculate the size of the bitmap and write it to disk */
+	bytes = (bitmap_bits(array_size, chunksize) + 7) / 8;
+	if (!bytes) {
+		rv = 0;
+		goto out;
+	}
+
+	filesize = bytes + sizeof(sb);
+
+	memset(block, 0xff, sizeof(block));
+
+	/* Written in whole 512-byte blocks; the excess is trimmed below. */
+	while (bytes > 0) {
+		if (fwrite(block, sizeof(block), 1, fp) != 1) {
+			pr_err("failed to write bitmap file %s: %s\n", filename, strerror(errno));
+			goto out;
+		}
+		bytes -= sizeof(block);
+	}
+
+	rv = 0;
+	fflush(fp);
+	/* make the file be the right size (well, to the nearest byte) */
+	if (ftruncate(fileno(fp), filesize))
+		perror("ftruncate");
+out:
+	/* fclose() writes out anything still buffered, so a failure here
+	 * on an otherwise successful run means the file may be incomplete.
+	 */
+	if (fclose(fp) != 0 && rv == 0) {
+		pr_err("failed to write bitmap file %s: %s\n", filename, strerror(errno));
+		rv = 1;
+	}
+	if (rv)
+		unlink(filename); /* possibly corrupted, better get rid of it */
+	return rv;
+}
+
+/*
+ * Replace the uuid in the bitmap superblock found at offset 0 of fd.
+ *
+ * Returns 0 on success,
+ *         1 if the superblock could not be read or has the wrong magic,
+ *         2 if seeking back or writing the updated superblock failed.
+ * After a write attempt the file offset is moved back to 0.
+ */
+int bitmap_update_uuid(int fd, int *uuid, int swap)
+{
+	struct bitmap_super_s bm;
+	int rv;
+
+	if (lseek(fd, 0, 0) != 0 ||
+	    read(fd, &bm, sizeof(bm)) != sizeof(bm) ||
+	    bm.magic != __cpu_to_le32(BITMAP_MAGIC))
+		return 1;
+
+	copy_uuid(bm.uuid, uuid, swap);
+
+	if (lseek(fd, 0, 0) != 0)
+		return 2;
+
+	rv = (write(fd, &bm, sizeof(bm)) != sizeof(bm)) ? 2 : 0;
+	lseek(fd, 0, 0);
+	return rv;
+}
diff --git a/bitmap.h b/bitmap.h
new file mode 100644
index 0000000..7b1f80f
--- /dev/null
+++ b/bitmap.h
@@ -0,0 +1,291 @@
+/*
+ * bitmap.h: Copyright (C) Peter T. Breuer (ptb@ot.uc3m.es) 2003
+ *
+ * additions: Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc.
+ */
+#ifndef BITMAP_H
+#define BITMAP_H 1
+
+/*
+ * Bitmap superblock version numbers. ExamineBitmap() in bitmap.c accepts
+ * versions from BITMAP_MAJOR_LO to BITMAP_MAJOR_CLUSTERED inclusive.
+ */
+#define BITMAP_MAJOR_LO 3
+/* version 4 insists the bitmap is in little-endian order
+ * with version 3, it is host-endian which is non-portable
+ */
+#define BITMAP_MAJOR_HI 4
+#define BITMAP_MAJOR_HOSTENDIAN 3
+#define BITMAP_MAJOR_CLUSTERED 5
+
+#define BITMAP_MINOR 39
+
+/*
+ * in-memory bitmap:
+ *
+ * Use 16 bit block counters to track pending writes to each "chunk".
+ * The 2 high order bits are special-purpose, the first is a flag indicating
+ * whether a resync is needed. The second is a flag indicating whether a
+ * resync is active.
+ * This means that the counter is actually 14 bits:
+ *
+ * +--------+--------+------------------------------------------------+
+ * | resync | resync | counter |
+ * | needed | active | |
+ * | (0-1) | (0-1) | (0-16383) |
+ * +--------+--------+------------------------------------------------+
+ *
+ * The "resync needed" bit is set when:
+ * a '1' bit is read from storage at startup.
+ * a write request fails on some drives
+ * a resync is aborted on a chunk with 'resync active' set
+ * It is cleared (and resync-active set) when a resync starts across all drives
+ * of the chunk.
+ *
+ *
+ * The "resync active" bit is set when:
+ * a resync is started on all drives, and resync_needed is set.
+ * resync_needed will be cleared (as long as resync_active wasn't already set).
+ * It is cleared when a resync completes.
+ *
+ * The counter counts pending write requests, plus the on-disk bit.
+ * When the counter is '1' and the resync bits are clear, the on-disk
+ * bit can be cleared as well, thus setting the counter to 0.
+ * When we set a bit, or increment the counter (to start a write), if the field is
+ * 0, we first set the disk bit and set the counter to 1.
+ *
+ * If the counter is 0, the on-disk bit is clear and the stripe is clean.
+ * Anything that dirties the stripe pushes the counter to 2 (at least)
+ * and sets the on-disk bit (lazily).
+ * If a periodic sweep finds the counter at 2, it is decremented to 1.
+ * If the sweep finds the counter at 1, the on-disk bit is cleared and the
+ * counter goes to zero.
+ *
+ * Also, we'll hijack the "map" pointer itself and use it as two 16 bit block
+ * counters as a fallback when "page" memory cannot be allocated:
+ *
+ * Normal case (page memory allocated):
+ *
+ * page pointer (32-bit)
+ *
+ * [ ] ------+
+ * |
+ * +-------> [ ][ ]..[ ] (4096 byte page == 2048 counters)
+ * c1 c2 c2048
+ *
+ * Hijacked case (page memory allocation failed):
+ *
+ * hijacked page pointer (32-bit)
+ *
+ * [ ][ ] (no page memory allocated)
+ * counter #1 (16-bit) counter #2 (16-bit)
+ *
+ */
+
+#ifdef __KERNEL__
+
+/* bits per page, and the matching shift (bytes to bits is << 3) */
+#define PAGE_BITS (PAGE_SIZE << 3)
+#define PAGE_BIT_SHIFT (PAGE_SHIFT + 3)
+
+/* one 16-bit counter per chunk; COUNTER_BIT_SHIFT is log2(COUNTER_BITS) */
+typedef __u16 bitmap_counter_t;
+#define COUNTER_BITS 16
+#define COUNTER_BIT_SHIFT 4
+#define COUNTER_BYTE_RATIO (COUNTER_BITS / 8)
+#define COUNTER_BYTE_SHIFT (COUNTER_BIT_SHIFT - 3)
+
+/*
+ * NEEDED_MASK is the top bit and RESYNC_MASK the next one down;
+ * COUNTER_MAX (RESYNC_MASK - 1) covers the remaining 14 bits.
+ * NOTE(review): the argument x is not parenthesised in NEEDED(),
+ * RESYNC() and COUNTER(), so the cast binds only to the first operand
+ * of a compound argument.
+ */
+#define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1)))
+#define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2)))
+#define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1)
+#define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK)
+#define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK)
+#define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX)
+
+/* how many counters per page? */
+#define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS)
+/* same, except a shift value for more efficient bitops */
+#define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT)
+/* same, except a mask value for more efficient bitops */
+#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1)
+
+/* a bitmap block is 512 bytes; BITMAP_BLOCK_SHIFT is log2 of that */
+#define BITMAP_BLOCK_SIZE 512
+#define BITMAP_BLOCK_SHIFT 9
+
+/* how many blocks per chunk? (this is variable) */
+#define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->chunksize >> BITMAP_BLOCK_SHIFT)
+#define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT)
+#define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1)
+
+/* when hijacked, the counters and bits represent even larger "chunks" */
+/* there will be 1024 chunks represented by each counter in the page pointers */
+#define PAGEPTR_BLOCK_RATIO(bitmap) \
+ (CHUNK_BLOCK_RATIO(bitmap) << PAGE_COUNTER_SHIFT >> 1)
+#define PAGEPTR_BLOCK_SHIFT(bitmap) \
+ (CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1)
+#define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1)
+
+/*
+ * on-disk bitmap:
+ *
+ * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap
+ * file a page at a time. There's a superblock at the start of the file.
+ */
+
+/* map chunks (bits) to file pages - offset by the size of the superblock
+ * (sizeof() is in bytes, so << 3 converts it to bits)
+ */
+#define CHUNK_BIT_OFFSET(chunk) ((chunk) + (sizeof(bitmap_super_t) << 3))
+
+#endif
+
+/*
+ * bitmap structures:
+ */
+
+/* the bytes 0x62 0x69 0x74 0x6d, i.e. ASCII "bitm" when stored little-endian */
+#define BITMAP_MAGIC 0x6d746962
+
+/* use these for bitmap->flags and bitmap->sb->state bit-fields */
+enum bitmap_state {
+ BITMAP_ACTIVE = 0x001, /* the bitmap is in use */
+ BITMAP_STALE = 0x002 /* the bitmap file is out of date or had -EIO */
+};
+
+/* the superblock at the front of the bitmap file -- little endian
+ *
+ * The number at the start of each field comment is its byte offset.
+ * The named fields end at offset 136 and pad[] is sized 256 - 136,
+ * which brings the structure to 256 bytes.
+ * The (1), (2), (3) markers refer to the notes after the structure.
+ */
+typedef struct bitmap_super_s {
+ __u32 magic; /* 0 BITMAP_MAGIC */
+ __u32 version; /* 4 the bitmap major for now, could change... */
+ __u8 uuid[16]; /* 8 128 bit uuid - must match md device uuid */
+ __u64 events; /* 24 event counter for the bitmap (1)*/
+ __u64 events_cleared;/*32 event counter when last bit cleared (2) */
+ __u64 sync_size; /* 40 the size of the md device's sync range(3) */
+ __u32 state; /* 48 bitmap state information */
+ __u32 chunksize; /* 52 the bitmap chunk size in bytes */
+ __u32 daemon_sleep; /* 56 seconds between disk flushes */
+ __u32 write_behind; /* 60 number of outstanding write-behind writes */
+ __u32 sectors_reserved; /* 64 number of 512-byte sectors that are
+ * reserved for the bitmap. */
+ __u32 nodes; /* 68 the maximum number of nodes in cluster. */
+ __u8 cluster_name[64]; /* 72 cluster name to which this md belongs */
+ __u8 pad[256 - 136]; /* set to zero */
+} bitmap_super_t;
+
+/* notes:
+ * (1) This event counter is updated before the eventcounter in the md superblock
+ * When a bitmap is loaded, it is only accepted if this event counter is equal
+ * to, or one greater than, the event counter in the superblock.
+ * (2) This event counter is updated when the other one is *if*and*only*if* the
+ * array is not degraded. As bits are not cleared when the array is degraded,
+ * this represents the last time that any bits were cleared.
+ * If a device is being added that has an event count with this value or
+ * higher, it is accepted as conforming to the bitmap.
+ * (3) This is the number of sectors represented by the bitmap, and is the range that
+ * resync happens across. For raid1 and raid5/6 it is the size of individual
+ * devices. For raid10 it is the size of the array.
+ */
+
+#ifdef __KERNEL__
+
+/* the in-memory bitmap is represented by bitmap_pages */
+struct bitmap_page {
+ /*
+ * map points to the actual memory page
+ */
+ char *map;
+ /*
+ * in emergencies (when map cannot be allocated), hijack the map
+ * pointer and use it as two counters itself
+ */
+ unsigned int hijacked;
+ /*
+ * count of dirty bits on the page
+ */
+ int count;
+};
+
+/* keep track of bitmap file pages that have pending writes on them */
+struct page_list {
+ struct list_head list;
+ struct page *page;
+};
+
+/* the main bitmap structure - one per mddev */
+struct bitmap {
+ struct bitmap_page *bp;
+ unsigned long pages; /* total number of pages in the bitmap */
+ unsigned long missing_pages; /* number of pages not yet allocated */
+
+ mddev_t *mddev; /* the md device that the bitmap is for */
+
+ int counter_bits; /* how many bits per block counter */
+
+ /* bitmap chunksize -- how much data does each bit represent? */
+ unsigned long chunksize;
+ unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */
+ unsigned long chunks; /* total number of data chunks for the array */
+
+ /* We hold a count on the chunk currently being synced, and drop
+ * it when the last block is started. If the resync is aborted
+ * midway, we need to be able to drop that count, so we remember
+ * the counted chunk..
+ */
+ unsigned long syncchunk;
+
+ __u64 events_cleared;
+
+ /* bitmap spinlock */
+ spinlock_t lock;
+
+ struct file *file; /* backing disk file */
+ struct page *sb_page; /* cached copy of the bitmap file superblock */
+ struct page **filemap; /* list of cache pages for the file */
+ unsigned long *filemap_attr; /* attributes associated w/ filemap pages */
+ unsigned long file_pages; /* number of pages in the file */
+
+ unsigned long flags;
+
+ /*
+ * the bitmap daemon - periodically wakes up and sweeps the bitmap
+ * file, cleaning up bits and flushing out pages to disk as necessary
+ */
+ mdk_thread_t *daemon;
+ unsigned long daemon_sleep; /* how many seconds between updates? */
+
+ /*
+ * bitmap write daemon - this daemon performs writes to the bitmap file
+ * this thread is only needed because of a limitation in ext3 (jbd)
+ * that does not allow a task to have two journal transactions ongoing
+ * simultaneously (even if the transactions are for two different
+ * filesystems) -- in the case of bitmap, that would be the filesystem
+ * that the bitmap file resides on and the filesystem that is mounted
+ * on the md device -- see current->journal_info in jbd/transaction.c
+ */
+ mdk_thread_t *write_daemon;
+ mdk_thread_t *writeback_daemon;
+ spinlock_t write_lock;
+ struct semaphore write_ready;
+ struct semaphore write_done;
+ unsigned long writes_pending;
+ wait_queue_head_t write_wait;
+ struct list_head write_pages;
+ struct list_head complete_pages;
+ mempool_t *write_pool;
+};
+
+/* the bitmap API */
+
+/* these are used only by md/bitmap */
+int bitmap_create(mddev_t *mddev);
+void bitmap_destroy(mddev_t *mddev);
+int bitmap_active(struct bitmap *bitmap);
+
+char *file_path(struct file *file, char *buf, int count);
+void bitmap_print_sb(struct bitmap *bitmap);
+int bitmap_update_sb(struct bitmap *bitmap);
+
+int bitmap_setallbits(struct bitmap *bitmap);
+
+/* these are exported */
+void bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors);
+void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors,
+ int success);
+int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks);
+void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted);
+void bitmap_close_sync(struct bitmap *bitmap);
+
+int bitmap_unplug(struct bitmap *bitmap);
+#endif
+
+#endif
diff --git a/clustermd_tests/00r10_Create b/clustermd_tests/00r10_Create
new file mode 100644
index 0000000..8aa5a70
--- /dev/null
+++ b/clustermd_tests/00r10_Create
@@ -0,0 +1,50 @@
+#!/bin/bash
+
+mdadm -CR $md0 -l10 -b clustered --layout n2 -n2 $dev0 $dev1
+ssh $NODE2 mdadm -A $md0 $dev0 $dev1
+check $NODE1 resync
+check $NODE2 PENDING
+check all wait
+check all raid10
+check all bitmap
+check all nosync
+check all state UU
+check all dmesg
+stop_md all $md0
+
+mdadm -CR $md0 -l10 -b clustered -n3 --layout n3 $dev0 $dev1 $dev2 --assume-clean
+ssh $NODE2 mdadm -A $md0 $dev0 $dev1 $dev2
+check all nosync
+check all raid10
+check all bitmap
+check all state UUU
+check all dmesg
+stop_md all $md0
+
+mdadm -CR $md0 -l10 -b clustered -n2 -x1 --layout n2 $dev0 $dev1 $dev2 --assume-clean
+ssh $NODE2 mdadm -A $md0 $dev0 $dev1 $dev2
+check all nosync
+check all raid10
+check all bitmap
+check all spares 1
+check all state UU
+check all dmesg
+stop_md all $md0
+
+name=tstmd
+mdadm -CR $md0 -l10 -b clustered -n2 $dev0 $dev1 --layout n2 --name=$name --assume-clean
+ssh $NODE2 mdadm -A $md0 $dev0 $dev1
+check all nosync
+check all raid10
+check all bitmap
+check all state UU
+for ip in $NODE1 $NODE2
+do
+ ssh $ip "mdadm -D $md0 | grep 'Name' | grep -q $name"
+ [ $? -ne '0' ] &&
+ die "$ip: check --name=$name failed."
+done
+check all dmesg
+stop_md all $md0
+
+exit 0
diff --git a/clustermd_tests/00r1_Create b/clustermd_tests/00r1_Create
new file mode 100644
index 0000000..709bb7b
--- /dev/null
+++ b/clustermd_tests/00r1_Create
@@ -0,0 +1,50 @@
+#!/bin/bash
+
+mdadm -CR $md0 -l1 -b clustered -n2 $dev0 $dev1
+ssh $NODE2 mdadm -A $md0 $dev0 $dev1
+check $NODE1 resync
+check $NODE2 PENDING
+check all wait
+check all raid1
+check all bitmap
+check all nosync
+check all state UU
+check all dmesg
+stop_md all $md0
+
+mdadm -CR $md0 -l1 -b clustered -n2 $dev0 $dev1 --assume-clean
+ssh $NODE2 mdadm -A $md0 $dev0 $dev1
+check all nosync
+check all raid1
+check all bitmap
+check all state UU
+check all dmesg
+stop_md all $md0
+
+mdadm -CR $md0 -l1 -b clustered -n2 -x1 $dev0 $dev1 $dev2 --assume-clean
+ssh $NODE2 mdadm -A $md0 $dev0 $dev1 $dev2
+check all nosync
+check all raid1
+check all bitmap
+check all spares 1
+check all state UU
+check all dmesg
+stop_md all $md0
+
+name=tstmd
+mdadm -CR $md0 -l1 -b clustered -n2 $dev0 $dev1 --name=$name --assume-clean
+ssh $NODE2 mdadm -A $md0 $dev0 $dev1
+check all nosync
+check all raid1
+check all bitmap
+check all state UU
+for ip in $NODE1 $NODE2
+do
+ ssh $ip "mdadm -D $md0 | grep 'Name' | grep -q $name"
+ [ $? -ne '0' ] &&
+ die "$ip: check --name=$name failed."
+done
+check all dmesg
+stop_md all $md0
+
+exit 0
diff --git a/clustermd_tests/01r10_Grow_bitmap-switch b/clustermd_tests/01r10_Grow_bitmap-switch
new file mode 100644
index 0000000..1794719
--- /dev/null
+++ b/clustermd_tests/01r10_Grow_bitmap-switch
@@ -0,0 +1,51 @@
+#!/bin/bash
+
+mdadm -CR $md0 -l10 -b clustered --layout n2 -n2 $dev0 $dev1 --assume-clean
+ssh $NODE2 mdadm -A $md0 $dev0 $dev1
+check all nosync
+check all raid10
+check all bitmap
+check all state UU
+
+# switch 'clustered' bitmap to 'none', and then 'none' to 'internal'
+stop_md $NODE2 $md0
+mdadm --grow $md0 --bitmap=none
+[ $? -eq '0' ] ||
+ die "$NODE1: change bitmap 'clustered' to 'none' failed."
+mdadm -X $dev0 $dev1 &> /dev/null
+[ $? -eq '0' ] &&
+ die "$NODE1: bitmap still exists in member_disks."
+check all nobitmap
+mdadm --grow $md0 --bitmap=internal
+[ $? -eq '0' ] ||
+ die "$NODE1: change bitmap 'none' to 'internal' failed."
+sleep 1
+mdadm -X $dev0 $dev1 &> /dev/null
+[ $? -eq '0' ] ||
+ die "$NODE1: create 'internal' bitmap failed."
+check $NODE1 bitmap
+
+# switch 'internal' bitmap to 'none', and then 'none' to 'clustered'
+mdadm --grow $md0 --bitmap=none
+[ $? -eq '0' ] ||
+ die "$NODE1: change bitmap 'internal' to 'none' failed."
+mdadm -X $dev0 $dev1 &> /dev/null
+[ $? -eq '0' ] &&
+ die "$NODE1: bitmap still exists in member_disks."
+check $NODE1 nobitmap
+mdadm --grow $md0 --bitmap=clustered
+[ $? -eq '0' ] ||
+ die "$NODE1: change bitmap 'none' to 'clustered' failed."
+ssh $NODE2 mdadm -A $md0 $dev0 $dev1
+sleep 1
+for ip in $NODES
+do
+ ssh $ip "mdadm -X $dev0 $dev1 | grep -q 'Cluster name'" ||
+ die "$ip: create 'clustered' bitmap failed."
+done
+check all bitmap
+check all state UU
+check all dmesg
+stop_md all $md0
+
+exit 0
diff --git a/clustermd_tests/01r10_Grow_resize b/clustermd_tests/01r10_Grow_resize
new file mode 100644
index 0000000..c69b785
--- /dev/null
+++ b/clustermd_tests/01r10_Grow_resize
@@ -0,0 +1,38 @@
+#!/bin/bash
+
+size=20000
+
+mdadm -CR $md0 -l10 -b clustered --layout n2 --size $size --chunk=64 -n2 $dev0 $dev1 --assume-clean
+ssh $NODE2 mdadm -A $md0 $dev0 $dev1
+check all nosync
+check all raid10
+check all bitmap
+check all state UU
+
+mdadm --grow $md0 --size max
+check $NODE1 resync
+check $NODE1 wait
+check all state UU
+
+mdadm --grow $md0 --size $size
+check all nosync
+check all state UU
+check all dmesg
+stop_md all $md0
+
+mdadm -CR $md0 -l10 -b clustered --layout n2 --chunk=64 -n2 $dev0 $dev1 --assume-clean
+ssh $NODE2 mdadm -A $md0 $dev0 $dev1
+check all nosync
+check all raid10
+check all bitmap
+check all state UU
+
+mdadm --grow $md0 --chunk=128
+check $NODE1 reshape
+check $NODE1 wait
+check all chunk 128
+check all state UU
+check all dmesg
+stop_md all $md0
+
+exit 0
diff --git a/clustermd_tests/01r1_Grow_add b/clustermd_tests/01r1_Grow_add
new file mode 100644
index 0000000..5706114
--- /dev/null
+++ b/clustermd_tests/01r1_Grow_add
@@ -0,0 +1,68 @@
+#!/bin/bash
+
+mdadm -CR $md0 -l1 -b clustered -n2 $dev0 $dev1 --assume-clean
+ssh $NODE2 mdadm -A $md0 $dev0 $dev1
+check all nosync
+check all raid1
+check all bitmap
+check all state UU
+check all dmesg
+mdadm --grow $md0 --raid-devices=3 --add $dev2
+sleep 0.3
+grep recovery /proc/mdstat
+if [ $? -eq '0' ]
+then
+ check $NODE1 wait
+else
+ check $NODE2 recovery
+ check $NODE2 wait
+fi
+check all state UUU
+check all dmesg
+stop_md all $md0
+
+mdadm -CR $md0 -l1 -b clustered -n2 -x1 $dev0 $dev1 $dev2 --assume-clean
+ssh $NODE2 mdadm -A $md0 $dev0 $dev1 $dev2
+check all nosync
+check all raid1
+check all bitmap
+check all spares 1
+check all state UU
+check all dmesg
+mdadm --grow $md0 --raid-devices=3 --add $dev3
+sleep 0.3
+grep recovery /proc/mdstat
+if [ $? -eq '0' ]
+then
+ check $NODE1 wait
+else
+ check $NODE2 recovery
+ check $NODE2 wait
+fi
+check all state UUU
+check all dmesg
+stop_md all $md0
+
+mdadm -CR $md0 -l1 -b clustered -n2 -x1 $dev0 $dev1 $dev2 --assume-clean
+ssh $NODE2 mdadm -A $md0 $dev0 $dev1 $dev2
+check all nosync
+check all raid1
+check all bitmap
+check all spares 1
+check all state UU
+check all dmesg
+mdadm --grow $md0 --raid-devices=3
+sleep 0.3
+grep recovery /proc/mdstat
+if [ $? -eq '0' ]
+then
+ check $NODE1 wait
+else
+ check $NODE2 recovery
+ check $NODE2 wait
+fi
+check all state UUU
+check all dmesg
+stop_md all $md0
+
+exit 0
diff --git a/clustermd_tests/01r1_Grow_bitmap-switch b/clustermd_tests/01r1_Grow_bitmap-switch
new file mode 100644
index 0000000..3b363d9
--- /dev/null
+++ b/clustermd_tests/01r1_Grow_bitmap-switch
@@ -0,0 +1,51 @@
+#!/bin/bash
+
+mdadm -CR $md0 -l1 -b clustered -n2 $dev0 $dev1 --assume-clean
+ssh $NODE2 mdadm -A $md0 $dev0 $dev1
+check all nosync
+check all raid1
+check all bitmap
+check all state UU
+
+# switch 'clustered' bitmap to 'none', and then 'none' to 'internal'
+stop_md $NODE2 $md0
+mdadm --grow $md0 --bitmap=none
+[ $? -eq '0' ] ||
+ die "$NODE1: change bitmap 'clustered' to 'none' failed."
+mdadm -X $dev0 $dev1 &> /dev/null
+[ $? -eq '0' ] &&
+ die "$NODE1: bitmap still exists in member_disks."
+check all nobitmap
+mdadm --grow $md0 --bitmap=internal
+[ $? -eq '0' ] ||
+ die "$NODE1: change bitmap 'none' to 'internal' failed."
+sleep 2
+mdadm -X $dev0 $dev1 &> /dev/null
+[ $? -eq '0' ] ||
+ die "$NODE1: create 'internal' bitmap failed."
+check $NODE1 bitmap
+
+# switch 'internal' bitmap to 'none', and then 'none' to 'clustered'
+mdadm --grow $md0 --bitmap=none
+[ $? -eq '0' ] ||
+ die "$NODE1: change bitmap 'internal' to 'none' failed."
+mdadm -X $dev0 $dev1 &> /dev/null
+[ $? -eq '0' ] &&
+ die "$NODE1: bitmap still exists in member_disks."
+check $NODE1 nobitmap
+mdadm --grow $md0 --bitmap=clustered
+[ $? -eq '0' ] ||
+ die "$NODE1: change bitmap 'none' to 'clustered' failed."
+ssh $NODE2 mdadm -A $md0 $dev0 $dev1
+sleep 2
+for ip in $NODES
+do
+ ssh $ip "mdadm -X $dev0 $dev1 | grep -q 'Cluster name'" ||
+ die "$ip: create 'clustered' bitmap failed."
+done
+check all bitmap
+check all state UU
+check all dmesg
+stop_md all $md0
+
+exit 0
diff --git a/clustermd_tests/01r1_Grow_resize b/clustermd_tests/01r1_Grow_resize
new file mode 100644
index 0000000..6d6e22a
--- /dev/null
+++ b/clustermd_tests/01r1_Grow_resize
@@ -0,0 +1,23 @@
+#!/bin/bash
+
+size=10000
+
+mdadm -CR $md0 -l1 -b clustered --size $size -n2 $dev0 $dev1 --assume-clean
+ssh $NODE2 mdadm -A $md0 $dev0 $dev1
+check all nosync
+check all raid1
+check all bitmap
+check all state UU
+
+mdadm --grow $md0 --size max
+check $NODE1 resync
+check $NODE1 wait
+check all state UU
+
+mdadm --grow $md0 --size $size
+check all nosync
+check all state UU
+check all dmesg
+stop_md all $md0
+
+exit 0
diff --git a/clustermd_tests/02r10_Manage_add b/clustermd_tests/02r10_Manage_add
new file mode 100644
index 0000000..8e878ab
--- /dev/null
+++ b/clustermd_tests/02r10_Manage_add
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+mdadm -CR $md0 -l10 -b clustered --layout n2 -n2 $dev0 $dev1 --assume-clean
+ssh $NODE2 mdadm -A $md0 $dev0 $dev1
+check all nosync
+check all raid10
+check all bitmap
+check all state UU
+check all dmesg
+mdadm --manage $md0 --fail $dev0 --remove $dev0
+mdadm --zero $dev2
+mdadm --manage $md0 --add $dev2
+sleep 0.3
+check $NODE1 recovery
+check $NODE1 wait
+check all state UU
+check all dmesg
+stop_md all $md0
+
+mdadm -CR $md0 -l10 -b clustered --layout n2 -n2 $dev0 $dev1 --assume-clean
+ssh $NODE2 mdadm -A $md0 $dev0 $dev1
+check all nosync
+check all raid10
+check all bitmap
+check all state UU
+check all dmesg
+mdadm --manage $md0 --add $dev2
+check all spares 1
+check all state UU
+check all dmesg
+stop_md all $md0
+
+exit 0
diff --git a/clustermd_tests/02r10_Manage_add-spare b/clustermd_tests/02r10_Manage_add-spare
new file mode 100644
index 0000000..9924aa8
--- /dev/null
+++ b/clustermd_tests/02r10_Manage_add-spare
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+mdadm -CR $md0 -l10 -b clustered --layout n2 -n2 $dev0 $dev1 --assume-clean
+ssh $NODE2 mdadm -A $md0 $dev0 $dev1
+check all nosync
+check all raid10
+check all bitmap
+check all state UU
+check all dmesg
+mdadm --manage $md0 --add-spare $dev2
+check all spares 1
+check all state UU
+check all dmesg
+stop_md all $md0
+
+mdadm -CR $md0 -l10 -b clustered --layout n2 -n2 -x1 $dev0 $dev1 $dev2 --assume-clean
+ssh $NODE2 mdadm -A $md0 $dev0 $dev1 $dev2
+check all nosync
+check all raid10
+check all bitmap
+check all spares 1
+check all state UU
+check all dmesg
+mdadm --manage $md0 --add-spare $dev3
+check all spares 2
+check all state UU
+check all dmesg
+stop_md all $md0
+
+exit 0
diff --git a/clustermd_tests/02r10_Manage_re-add b/clustermd_tests/02r10_Manage_re-add
new file mode 100644
index 0000000..2288a00
--- /dev/null
+++ b/clustermd_tests/02r10_Manage_re-add
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+mdadm -CR $md0 -l10 -b clustered --layout n2 -n2 $dev0 $dev1 --assume-clean
+ssh $NODE2 mdadm -A $md0 $dev0 $dev1
+check all nosync
+check all raid10
+check all bitmap
+check all state UU
+check all dmesg
+mdadm --manage $md0 --fail $dev0 --remove $dev0
+mdadm --manage $md0 --re-add $dev0
+check $NODE1 recovery
+check all wait
+check all state UU
+check all dmesg
+stop_md all $md0
+
+exit 0
diff --git a/clustermd_tests/02r1_Manage_add b/clustermd_tests/02r1_Manage_add
new file mode 100644
index 0000000..ab2751c
--- /dev/null
+++ b/clustermd_tests/02r1_Manage_add
@@ -0,0 +1,33 @@
+#!/bin/bash
+
+mdadm -CR $md0 -l1 -b clustered -n2 $dev0 $dev1 --assume-clean
+ssh $NODE2 mdadm -A $md0 $dev0 $dev1
+check all nosync
+check all raid1
+check all bitmap
+check all state UU
+check all dmesg
+mdadm --manage $md0 --fail $dev0 --remove $dev0
+mdadm --zero $dev2
+mdadm --manage $md0 --add $dev2
+sleep 0.3
+check $NODE1 recovery
+check $NODE1 wait
+check all state UU
+check all dmesg
+stop_md all $md0
+
+mdadm -CR $md0 -l1 -b clustered -n2 $dev0 $dev1 --assume-clean
+ssh $NODE2 mdadm -A $md0 $dev0 $dev1
+check all nosync
+check all raid1
+check all bitmap
+check all state UU
+check all dmesg
+mdadm --manage $md0 --add $dev2
+check all spares 1
+check all state UU
+check all dmesg
+stop_md all $md0
+
+exit 0
diff --git a/clustermd_tests/02r1_Manage_add-spare b/clustermd_tests/02r1_Manage_add-spare
new file mode 100644
index 0000000..eab8111
--- /dev/null
+++ b/clustermd_tests/02r1_Manage_add-spare
@@ -0,0 +1,30 @@
+#!/bin/bash
+
+mdadm -CR $md0 -l1 -b clustered -n2 $dev0 $dev1 --assume-clean
+ssh $NODE2 mdadm -A $md0 $dev0 $dev1
+check all nosync
+check all raid1
+check all bitmap
+check all state UU
+check all dmesg
+mdadm --manage $md0 --add-spare $dev2
+check all spares 1
+check all state UU
+check all dmesg
+stop_md all $md0
+
+mdadm -CR $md0 -l1 -b clustered -n2 -x1 $dev0 $dev1 $dev2 --assume-clean
+ssh $NODE2 mdadm -A $md0 $dev0 $dev1 $dev2
+check all nosync
+check all raid1
+check all bitmap
+check all spares 1
+check all state UU
+check all dmesg
+mdadm --manage $md0 --add-spare $dev3
+check all spares 2
+check all state UU
+check all dmesg
+stop_md all $md0
+
+exit 0
diff --git a/clustermd_tests/02r1_Manage_re-add b/clustermd_tests/02r1_Manage_re-add
new file mode 100644
index 0000000..d0d13e5
--- /dev/null
+++ b/clustermd_tests/02r1_Manage_re-add
@@ -0,0 +1,16 @@
+#!/bin/bash
+
+mdadm -CR $md0 -l1 -b clustered -n2 $dev0 $dev1 --assume-clean
+ssh $NODE2 mdadm -A $md0 $dev0 $dev1
+check all nosync
+check all raid1
+check all bitmap
+check all state UU
+check all dmesg
+mdadm --manage $md0 --fail $dev0 --remove $dev0
+mdadm --manage $md0 --re-add $dev0
+check all state UU
+check all dmesg
+stop_md all $md0
+
+exit 0
diff --git a/clustermd_tests/03r10_switch-recovery b/clustermd_tests/03r10_switch-recovery
new file mode 100644
index 0000000..867388d
--- /dev/null
+++ b/clustermd_tests/03r10_switch-recovery
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+mdadm -CR $md0 -l10 -b clustered --layout n2 -n2 -x1 $dev0 $dev1 $dev2 --assume-clean
+ssh $NODE2 mdadm -A $md0 $dev0 $dev1 $dev2
+check all nosync
+check all raid10
+check all bitmap
+check all spares 1
+check all state UU
+check all dmesg
+mdadm --manage $md0 --fail $dev0
+sleep 0.2
+check $NODE1 recovery
+stop_md $NODE1 $md0
+check $NODE2 recovery
+check $NODE2 wait
+check $NODE2 state UU
+check all dmesg
+stop_md $NODE2 $md0
+
+exit 0
diff --git a/clustermd_tests/03r10_switch-resync b/clustermd_tests/03r10_switch-resync
new file mode 100644
index 0000000..127c569
--- /dev/null
+++ b/clustermd_tests/03r10_switch-resync
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+mdadm -CR $md0 -l10 -b clustered --layout n2 -n2 $dev0 $dev1
+ssh $NODE2 mdadm -A $md0 $dev0 $dev1
+check $NODE1 resync
+check $NODE2 PENDING
+stop_md $NODE1 $md0
+check $NODE2 resync
+check $NODE2 wait
+mdadm -A $md0 $dev0 $dev1
+check all raid10
+check all bitmap
+check all nosync
+check all state UU
+check all dmesg
+stop_md all $md0
+
+exit 0
diff --git a/clustermd_tests/03r1_switch-recovery b/clustermd_tests/03r1_switch-recovery
new file mode 100644
index 0000000..a1a7cbe
--- /dev/null
+++ b/clustermd_tests/03r1_switch-recovery
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+mdadm -CR $md0 -l1 -b clustered -n2 -x1 $dev0 $dev1 $dev2 --assume-clean
+ssh $NODE2 mdadm -A $md0 $dev0 $dev1 $dev2
+check all nosync
+check all raid1
+check all bitmap
+check all spares 1
+check all state UU
+check all dmesg
+mdadm --manage $md0 --fail $dev0
+sleep 0.3
+check $NODE1 recovery
+stop_md $NODE1 $md0
+check $NODE2 recovery
+check $NODE2 wait
+check $NODE2 state UU
+check all dmesg
+stop_md $NODE2 $md0
+
+exit 0
diff --git a/clustermd_tests/03r1_switch-resync b/clustermd_tests/03r1_switch-resync
new file mode 100644
index 0000000..d99e1c5
--- /dev/null
+++ b/clustermd_tests/03r1_switch-resync
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+mdadm -CR $md0 -l1 -b clustered -n2 $dev0 $dev1
+ssh $NODE2 mdadm -A $md0 $dev0 $dev1
+check $NODE1 resync
+check $NODE2 PENDING
+stop_md $NODE1 $md0
+check $NODE2 resync
+check $NODE2 wait
+mdadm -A $md0 $dev0 $dev1
+check all raid1
+check all bitmap
+check all nosync
+check all state UU
+check all dmesg
+stop_md all $md0
+
+exit 0
diff --git a/clustermd_tests/cluster_conf b/clustermd_tests/cluster_conf
new file mode 100644
index 0000000..4f0c9fb
--- /dev/null
+++ b/clustermd_tests/cluster_conf
@@ -0,0 +1,43 @@
+# Prerequisite:
+# 1. The clustermd_tests/ cases only support testing a 2-node cluster. The cluster
+# requires the packages 'pacemaker+corosync+sbd+crmsh' (all available from
+# "https://github.com/ClusterLabs/") and also requires a dlm resource running
+# on each node of the cluster.
+# For quick start HA-cluster with SUSE distributions, refer to the chapter 6-8:
+# https://www.suse.com/documentation/sle-ha-12/install-quick/data/install-quick.html
+# For Redhat distributions, please refer to:
+# https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html-single/high_availability_add-on_administration/index
+# 2. Set up passwordless ssh access for root between the nodes, so that:
+# # 'ssh $node1 -l root ls' and 'ssh $node2 -l root ls' succeed on any node.
+# 3. Fill-up node-ip part and disks part as following.
+
+# Set node1 as the master node, the cluster-md cases should run on this node,
+# and node2 is the slave node.
+# For example:
+# NODE1=192.168.1.100 (testing run here)
+# NODE2=192.168.1.101
+NODE1=
+NODE2=
+
+# Provide the devlist for clustermd-testing. The two options below are
+# alternatives: if you use option 1, leave option 2 unset, and vice versa.
+# 1. Use an iSCSI service to provide shared storage, then log in to the iSCSI
+# target given by ISCSI_TARGET_ID and ISCSI_TARGET_IP on the iscsi clients, with commands like:
+# Execute on iscsi clients:
+# 1) discover the iscsi server.
+# # iscsiadm -m discovery -t st -p $ISCSI_TARGET_IP
+# 2) login and establish connection.
+# # iscsiadm -m node -T $ISCSI_TARGET_ID -p $ISCSI_TARGET_IP -l
+# Note:
+# On ISCSI server, must create all iscsi-luns in one target_id, recommend more
+# than 6 luns/disks for testing, and each disk should be: 100M < disk < 800M.
+# 2. If all cluster-nodes mounted the same disks directly, and the devname are
+# the same on all nodes, then put them to 'devlist'.
+
+# For example: (setting only ISCSI_TARGET_ID is enough if the iSCSI target is already connected)
+# ISCSI_TARGET_ID=iqn.2018-01.example.com:clustermd-testing
+# ISCSI_TARGET_IP=192.168.1.102
+ISCSI_TARGET_ID=
+
+#devlist=/dev/sda /dev/sdb /dev/sdc /dev/sdd
+devlist=
diff --git a/clustermd_tests/func.sh b/clustermd_tests/func.sh
new file mode 100644
index 0000000..801d604
--- /dev/null
+++ b/clustermd_tests/func.sh
@@ -0,0 +1,332 @@
+#!/bin/bash
+
+check_ssh()
+{
+	# Load NODE1/NODE2 from $CLUSTER_CONF and require prompt-free root ssh to both.
+	NODE1="$(grep '^NODE1' "$CLUSTER_CONF" | cut -d'=' -f2)"
+	NODE2="$(grep '^NODE2' "$CLUSTER_CONF" | cut -d'=' -f2)"
+	if [ -z "$NODE1" ] || [ -z "$NODE2" ]; then
+		echo "Please provide node-ip in $CLUSTER_CONF."
+		exit 1
+	fi
+	for ip in $NODE1 $NODE2
+	do
+		ssh -o NumberOfPasswordPrompts=0 $ip -l root "pwd" > /dev/null || {
+			echo "Please setup ssh-access with no-authorized mode."
+			exit 1
+		}
+	done
+}
+
+fetch_devlist()
+{
+	# Fill devlist (array) and dev0..dev(N-1) from ISCSI_TARGET_ID or devlist in $CLUSTER_CONF.
+	ISCSI_ID="$(grep '^ISCSI_TARGET_ID' $CLUSTER_CONF | cut -d'=' -f2)"
+	devlist="$(grep '^devlist' $CLUSTER_CONF | cut -d'=' -f2)"
+	if [ ! -z "$ISCSI_ID" -a ! -z "$devlist" ]
+	then
+		echo "Config ISCSI_TARGET_ID or devlist in $CLUSTER_CONF."
+		exit 1
+	elif [ ! -z "$ISCSI_ID" -a -z "$devlist" ]
+	then
+		for ip in $NODE1 $NODE2
+		do
+			ssh $ip "ls /dev/disk/by-path/*$ISCSI_ID*" > /dev/null || {
+				echo "$ip: No disks found in '$ISCSI_ID' connection."
+				exit 1
+			}
+		done
+		devlist=($(ls /dev/disk/by-path/*$ISCSI_ID*))
+	fi
+	# Word-split into a fresh array, dropping sbd disks (not usable for testing);
+	# whole entries are skipped, so names sharing a prefix stay intact.
+	local all=(${devlist[@]}) d
+	devlist=()
+	for d in ${all[@]}
+	do
+		sbd -d $d dump &> /dev/null || devlist+=($d)
+	done
+	# export dev0..dev(N-1); the old 'seq 0 N' also defined an empty devN
+	for ((i = 0; i < ${#devlist[@]}; i++))
+	do
+		eval "dev$i=${devlist[$i]}"
+	done
+	[ "${#devlist[@]}" -ge 6 ] || {
+		echo "Cluster-md testing requires 6 disks at least."
+		exit 1
+	}
+}
+
+check_dlm()
+{
+	# Configure a cloned dlm resource when 'crm configure show' lists none, then verify it runs.
+	crm configure show | grep -q dlm || {
+		crm configure primitive dlm ocf:pacemaker:controld \
+			op monitor interval=60 timeout=60 \
+			meta target-role=Started &> /dev/null
+		crm configure group base-group dlm
+		crm configure clone base-clone base-group \
+			meta interleave=true
+	}
+	sleep 1
+	for ip in $NODE1 $NODE2
+	do
+		if ! ssh $ip "pgrep dlm_controld > /dev/null"; then
+			echo "$ip: dlm_controld daemon doesn't exist."
+			exit 1
+		fi
+	done
+	crm_mon -r -n1 | grep -iq "fail\|not" && {
+		echo "Please clear cluster-resource errors."
+		exit 1
+	}
+}
+
+check_env()
+{
+	# Check user, mdadm binary, per-node tools/modules and absence of RAIDs; then set up disks, dlm, logdir.
+	user=$(id -un)
+	if [ "X$user" != "Xroot" ]; then
+		echo "testing can only be done as 'root'."
+		exit 1
+	fi
+	if [ ! -x $mdadm ]; then
+		echo "test: please run make everything before perform testing."
+		exit 1
+	fi
+	check_ssh
+	commands=(mdadm iscsiadm bc modinfo dlm_controld
+		udevadm crm crm_mon lsblk pgrep sbd)
+	mods=(raid1 raid10 md_mod dlm md-cluster)
+	for ip in $NODE1 $NODE2
+	do
+		for cmd in ${commands[@]}
+		do
+			if ! ssh $ip "which $cmd &> /dev/null"; then
+				echo "$ip: $cmd, command not found!"
+				exit 1
+			fi
+		done
+		for mod in ${mods[@]}
+		do
+			if ! ssh $ip "modinfo $mod > /dev/null"; then
+				echo "$ip: $mod, module doesn't exist."
+				exit 1
+			fi
+		done
+		if ssh $ip "lsblk -a | grep -iq raid"; then
+			echo "$ip: Please run testing without running RAIDs environment."
+			exit 1
+		fi
+		ssh $ip "modprobe md_mod"
+	done
+	fetch_devlist
+	check_dlm
+	[ -d $logdir ] || mkdir -p $logdir
+}
+
+# $1/node, $2/optional
+stop_md()
+{
+	# Rebuild NODES from scratch: a plain 'NODES=$1' would only overwrite
+	# element 0 and keep a stale second node left over from an earlier call.
+	if [ "$1" == "all" ]
+	then
+		NODES=($NODE1 $NODE2)
+	elif [ "$1" == "$NODE1" -o "$1" == "$NODE2" ]
+	then
+		NODES=($1)
+	else
+		die "$1: unknown parameter."
+	fi
+	for ip in ${NODES[@]}
+	do
+		if [ -z "$2" ]
+		then
+			# no array given: stop every array on the node, quietly
+			ssh $ip mdadm -Ssq
+		else
+			ssh $ip mdadm -S $2
+		fi
+	done
+}
+
+# $1/optional, it shows why to save log
+save_log()
+{
+	# Save the test log plus each node's dmesg, mdstat and mdadm reports under $logdir.
+	status=$1
+	logfile="$status""$_basename".log
+
+	cat $targetdir/stderr >> $targetdir/log
+	cp $targetdir/log $logdir/$_basename.log
+
+	for ip in $NODE1 $NODE2
+	do
+		echo "##$ip: saving dmesg." >> $logdir/$logfile
+		ssh $ip "dmesg -c" >> $logdir/$logfile
+		echo "##$ip: saving proc mdstat." >> $logdir/$logfile
+		ssh $ip "cat /proc/mdstat" >> $logdir/$logfile
+		array=($(ssh $ip "mdadm -Ds | cut -d' ' -f2"))
+		if [ ! -z "$array" -a ${#array[@]} -ge 1 ]
+		then
+			echo "##$ip: mdadm -D ${array[@]}" >> $logdir/$logfile
+			ssh $ip "mdadm -D ${array[@]}" >> $logdir/$logfile
+			md_disks=($(ssh $ip "mdadm -DY ${array[@]} | grep '/dev/' | cut -d'=' -f2"))
+			# inspect this node's mdstat; the local file says nothing about $ip
+			if ssh $ip "grep -q bitmap /proc/mdstat"
+			then
+				echo "##$ip: mdadm -X ${md_disks[@]}" >> $logdir/$logfile
+				ssh $ip "mdadm -X ${md_disks[@]}" >> $logdir/$logfile
+				echo "##$ip: mdadm -E ${md_disks[@]}" >> $logdir/$logfile
+				ssh $ip "mdadm -E ${md_disks[@]}" >> $logdir/$logfile
+			fi
+		else
+			echo "##$ip: no array assembled!" >> $logdir/$logfile
+		fi
+	done
+	[ "$1" == "fail" ] &&
+		echo "See $logdir/$_basename.log and $logdir/$logfile for details"
+	stop_md all
+}
+
+do_setup()
+{
+ check_env	# validate user, tools, nodes, disks and dlm before any test runs
+ ulimit -c unlimited	# lift the core-file size limit
+}
+
+do_clean()
+{
+ for ip in $NODE1 $NODE2
+ do
+ ssh $ip "mdadm -Ssq; dmesg -c > /dev/null"	# stop all arrays, then clear the kernel log
+ done
+ mdadm --zero ${devlist[@]} &> /dev/null	# wipe md superblocks from the test disks; output and errors discarded
+}
+
+cleanup()
+{
+ check_ssh	# reloads NODE1/NODE2 from the config and verifies ssh access
+ do_clean
+}
+
+# check: $1/cluster_node $2/feature $3/optional
+check()
+{
+ NODES=()
+ if [ "$1" == "all" ]
+ then
+ NODES=($NODE1 $NODE2)
+ elif [ "$1" == "$NODE1" -o "$1" == "$NODE2" ]
+ then
+ NODES=$1
+ else
+ die "$1: unknown parameter."
+ fi
+ case $2 in
+ spares )
+ for ip in ${NODES[@]}
+ do
+ spares=$(ssh $ip "tr '] ' '\012\012' < /proc/mdstat | grep -c '(S)'")
+ [ "$spares" -ne "$3" ] &&
+ die "$ip: expected $3 spares, but found $spares"
+ done
+ ;;
+ raid* )
+ for ip in ${NODES[@]}
+ do
+ ssh $ip "grep -sq "$2" /proc/mdstat" ||
+ die "$ip: check '$2' failed."
+ done
+ ;;
+ PENDING | recovery | resync | reshape )
+ cnt=5
+ for ip in ${NODES[@]}
+ do
+ while ! ssh $ip "grep -sq '$2' /proc/mdstat"
+ do
+ if [ "$cnt" -gt '0' ]
+ then
+ sleep 0.2
+ cnt=$[cnt-1]
+ else
+ die "$ip: no '$2' happening!"
+ fi
+ done
+ done
+ ;;
+ wait )
+ local cnt=60
+ for ip in ${NODES[@]}
+ do
+ p=$(ssh $ip "cat /proc/sys/dev/raid/speed_limit_max")
+ ssh $ip "echo 200000 > /proc/sys/dev/raid/speed_limit_max"
+ while ssh $ip "grep -Esq '(resync|recovery|reshape|check|repair)' /proc/mdstat"
+ do
+ if [ "$cnt" -gt '0' ]
+ then
+ sleep 5
+ cnt=$[cnt-1]
+ else
+ die "$ip: Check '$2' timeout over 300 seconds."
+ fi
+ done
+ ssh $ip "echo $p > /proc/sys/dev/raid/speed_limit_max"
+ done
+ ;;
+ bitmap )
+ for ip in ${NODES[@]}
+ do
+ ssh $ip "grep -sq '$2' /proc/mdstat" ||
+ die "$ip: no '$2' found in /proc/mdstat."
+ done
+ ;;
+ nobitmap )
+ for ip in ${NODES[@]}
+ do
+ ssh $ip "grep -sq 'bitmap' /proc/mdstat" &&
+ die "$ip: 'bitmap' found in /proc/mdstat."
+ done
+ ;;
+ chunk )
+ for ip in ${NODES[@]}
+ do
+ chunk_size=`awk -F',' '/chunk/{print $2}' /proc/mdstat | awk -F'[a-z]' '{print $1}'`
+ [ "$chunk_size" -ne "$3" ] &&
+ die "$ip: chunksize should be $3, but it's $chunk_size"
+ done
+ ;;
+ state )
+ for ip in ${NODES[@]}
+ do
+ ssh $ip "grep -Esq 'blocks.*\[$3\]\$' /proc/mdstat" ||
+ die "$ip: no '$3' found in /proc/mdstat."
+ done
+ ;;
+ nosync )
+ for ip in ${NODES[@]}
+ do
+ ssh $ip "grep -Eq '(resync|recovery)' /proc/mdstat" &&
+ die "$ip: resync or recovery is happening!"
+ done
+ ;;
+ readonly )
+ for ip in ${NODES[@]}
+ do
+ ssh $ip "grep -sq "read-only" /proc/mdstat" ||
+ die "$ip: check '$2' failed!"
+ done
+ ;;
+ dmesg )
+ for ip in ${NODES[@]}
+ do
+ ssh $ip "dmesg | grep -iq 'error\|call trace\|segfault'" &&
+ die "$ip: check '$2' prints errors!"
+ done
+ ;;
+ * )
+ die "unknown parameter $2"
+ ;;
+ esac
+}
diff --git a/config.c b/config.c
new file mode 100644
index 0000000..9c72545
--- /dev/null
+++ b/config.c
@@ -0,0 +1,1235 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email: <neilb@suse.de>
+ */
+
+#include "mdadm.h"
+#include "dlink.h"
+#include <dirent.h>
+#include <glob.h>
+#include <fnmatch.h>
+#include <ctype.h>
+#include <pwd.h>
+#include <grp.h>
+
+/*
+ * Read the config file
+ *
+ * conf_get_uuids gets a list of devicename+uuid pairs
+ * conf_get_devs gets device names after expanding wildcards
+ *
+ * Each keeps the returned list and frees it when asked to make
+ * a new list.
+ *
+ * The format of the config file needs to be fairly extensible.
+ * Now, arrays only have names and uuids and devices merely are.
+ * But later arrays might want names, and devices might want superblock
+ * versions, and who knows what else.
+ * I like free format, abhor backslash line continuation, adore
+ * indentation for structure and am ok about # comments.
+ *
+ * So, each line that isn't blank or a #comment must either start
+ * with a key word, and not be indented, or must start with a
+ * non-key-word and must be indented.
+ *
+ * Keywords are DEVICE and ARRAY ... and several others.
+ * DEV{ICE} introduces some devices that might contain raid components.
+ * e.g.
+ * DEV style=0 /dev/sda* /dev/hd*
+ * DEV style=1 /dev/sd[b-f]*
+ * ARR{AY} describes an array giving md device and attributes like uuid=whatever
+ * e.g.
+ * ARRAY /dev/md0 uuid=whatever name=something
+ * Spaces separate words on each line. Quoting, with "" or '' protects them,
+ * but may not wrap over lines
+ *
+ */
+#ifndef _POSIX_C_SOURCE
+#define _POSIX_C_SOURCE 200809L
+#endif
+
+#ifndef CONFFILE
+#define CONFFILE "/etc/mdadm.conf"
+#endif
+#ifndef CONFFILE2
+/* for Debian compatibility .... */
+#define CONFFILE2 "/etc/mdadm/mdadm.conf"
+#endif
+char DefaultConfFile[] = CONFFILE;
+char DefaultConfDir[] = CONFFILE ".d";
+char DefaultAltConfFile[] = CONFFILE2;
+char DefaultAltConfDir[] = CONFFILE2 ".d";
+
+/*
+ * Keyword classes recognised at the start of a config line.  Each enum
+ * value indexes its spelling in keywords[]; LTEnd marks the NULL entry
+ * that terminates the table.
+ */
+enum linetype { Devices, Array, Mailaddr, Mailfrom, Program, CreateDev,
+ Homehost, HomeCluster, AutoMode, Policy, PartPolicy, Sysfs,
+ MonitorDelay, LTEnd };
+/* Lower-case spelling of every keyword, indexed by enum linetype. */
+char *keywords[] = {
+ [Devices] = "devices",
+ [Array] = "array",
+ [Mailaddr] = "mailaddr",
+ [Mailfrom] = "mailfrom",
+ [Program] = "program",
+ [CreateDev]= "create",
+ [Homehost] = "homehost",
+ [HomeCluster] = "homecluster",
+ [AutoMode] = "auto",
+ [Policy] = "policy",
+ [PartPolicy]="part-policy",
+ [Sysfs] = "sysfs",
+ [MonitorDelay] = "monitordelay",
+ [LTEnd] = NULL
+};
+
+/*
+ * match_keyword - classify the first word of a config line.
+ *
+ * The comparison ignores case and accepts any prefix of a keyword that is
+ * at least three characters long; the first table entry that matches wins.
+ * Returns the index into keywords[], or -1 when nothing matches.
+ */
+
+int match_keyword(char *word)
+{
+	int wlen = strlen(word);
+	int idx = 0;
+
+	if (wlen < 3)
+		return -1;
+
+	while (keywords[idx] != NULL) {
+		if (!strncasecmp(word, keywords[idx], wlen))
+			return idx;
+		idx++;
+	}
+	return -1;
+}
+
+/*
+ * One word taken from a DEVICE line.  devline() pushes a new entry onto
+ * the front of cdevlist for every accepted word.
+ */
+struct conf_dev {
+ struct conf_dev *next;
+ char *name; /* private copy of the word as written in the config */
+} *cdevlist = NULL;
+
+/*
+ * load_partitions - build a device list from /proc/partitions.
+ *
+ * Only lines that begin with a space and carry a decimal major number
+ * followed by a space are considered.  Each major/minor pair that
+ * map_dev() can name becomes one list entry; entries are pushed on the
+ * front of the list.  Returns NULL if the file cannot be opened or no
+ * entry was produced.
+ */
+struct mddev_dev *load_partitions(void)
+{
+	char line[1024];
+	struct mddev_dev *head = NULL;
+	FILE *fp = fopen("/proc/partitions", "r");
+
+	if (!fp) {
+		pr_err("cannot open /proc/partitions\n");
+		return NULL;
+	}
+	while (fgets(line, sizeof(line), fp) != NULL) {
+		struct mddev_dev *node;
+		char *after_major;
+		char *path;
+		int maj_nr, min_nr;
+
+		line[sizeof(line) - 1] = '\0';
+		if (line[0] != ' ')
+			continue;
+		maj_nr = strtoul(line, &after_major, 10);
+		if (after_major == line || *after_major != ' ')
+			continue;
+		min_nr = strtoul(after_major, NULL, 10);
+
+		path = map_dev(maj_nr, min_nr, 1);
+		if (path == NULL)
+			continue;
+		node = xcalloc(1, sizeof(*node));
+		node->devname = xstrdup(path);
+		node->next = head;
+		head = node;
+	}
+	fclose(fp);
+	return head;
+}
+
+/*
+ * load_containers - list the externally managed containers in /proc/mdstat.
+ *
+ * An mdstat entry qualifies when its metadata version starts with
+ * "external:" and the remainder is not a subarray reference.  The device
+ * name comes from the map file when the array is listed there, otherwise
+ * it is "/dev/<devnm>".  Entries are pushed on the front of the result.
+ * Returns NULL when mdstat cannot be read or nothing qualifies.
+ */
+struct mddev_dev *load_containers(void)
+{
+	struct mdstat_ent *mdstat = mdstat_read(0, 0);
+	struct mdstat_ent *ent;
+	struct mddev_dev *d;
+	struct mddev_dev *rv = NULL;
+	struct map_ent *map = NULL, *me;
+
+	if (!mdstat)
+		return NULL;
+
+	for (ent = mdstat; ent; ent = ent->next) {
+		if (!ent->metadata_version ||
+		    strncmp(ent->metadata_version, "external:", 9) != 0 ||
+		    is_subarray(&ent->metadata_version[9]))
+			continue;
+
+		d = xcalloc(1, sizeof(*d));
+		me = map_by_devnm(&map, ent->devnm);
+		if (me)
+			d->devname = xstrdup(me->path);
+		else if (asprintf(&d->devname, "/dev/%s", ent->devnm) < 0) {
+			free(d);
+			d = NULL;
+		}
+		/*
+		 * Release the map on every path, including the asprintf
+		 * failure above, which previously skipped this step.
+		 * me->path has already been copied at this point.
+		 */
+		map_free(map);
+		map = NULL;
+		if (!d)
+			continue;
+		d->next = rv;
+		rv = d;
+	}
+	free_mdstat(mdstat);
+
+	return rv;
+}
+
+/*
+ * Built-in defaults for device creation.  createline() overwrites
+ * individual members from CREATE lines, and conf_get_create_info()
+ * hands out a pointer to this object.
+ */
+struct createinfo createinfo = {
+ .autof = 2, /* by default, create devices with standard names */
+ .symlinks = 1,
+ .names = 0, /* By default, stick with numbered md devices. */
+ .bblist = 1, /* Use a bad block list by default */
+#ifdef DEBIAN
+ .gid = 6, /* disk */
+ .mode = 0660,
+#else
+ .mode = 0600,
+#endif
+};
+
+/*
+ * parse_auto - translate an "auto=" argument into the internal autof code.
+ *
+ * @str:    text after "auto=" (NULL or empty selects the default)
+ * @msg:    name of the option, used only in the error message
+ * @config: non-zero when called for a CREATE line (see createline());
+ *          arrayline() passes 0
+ *
+ * Returns 2 for NULL, "" and "yes", 1 for "no", and 5 (config) or 3 for
+ * a bare "md".  Any other word may end in digits, optionally preceded by
+ * a hyphen.  The word part selects md -> 5/3, yes -> 2, mdp -> 6/4 and
+ * p or part... -> 6.  The number (4 when absent, 1 when not positive) is
+ * shifted left by three bits and OR-ed in.  An unrecognised word prints
+ * an error and terminates the process with exit(2).
+ */
+int parse_auto(char *str, char *msg, int config)
+{
+	int autof;
+	if (str == NULL || *str == 0)
+		autof = 2;
+	else if (strcasecmp(str, "no") == 0)
+		autof = 1;
+	else if (strcasecmp(str, "yes") == 0)
+		autof = 2;
+	else if (strcasecmp(str, "md") == 0)
+		autof = config ? 5:3;
+	else {
+		/* There might be digits, and maybe a hyphen, at the end */
+		char *e = str + strlen(str);
+		int num = 4;
+		int len;
+		/*
+		 * <ctype.h> functions need an unsigned char value; a plain
+		 * char holding a byte >= 0x80 may be negative.
+		 */
+		while (e > str && isdigit((unsigned char)e[-1]))
+			e--;
+		if (*e) {
+			num = atoi(e);
+			if (num <= 0)
+				num = 1;
+		}
+		if (e > str && e[-1] == '-')
+			e--;
+		len = e - str;
+		if ((len == 2 && strncasecmp(str, "md", 2) == 0)) {
+			autof = config ? 5 : 3;
+		} else if ((len == 3 && strncasecmp(str, "yes", 3) == 0)) {
+			autof = 2;
+		} else if ((len == 3 && strncasecmp(str, "mdp", 3) == 0)) {
+			autof = config ? 6 : 4;
+		} else if ((len == 1 && strncasecmp(str, "p", 1) == 0) ||
+			   (len >= 4 && strncasecmp(str, "part", 4) == 0)) {
+			autof = 6;
+		} else {
+			pr_err("%s arg of \"%s\" unrecognised: use no,yes,md,mdp,part\n"
+			       " optionally followed by a number.\n",
+			       msg, str);
+			exit(2);
+		}
+		autof |= num << 3;
+	}
+	return autof;
+}
+
+/*
+ * createline - apply the words of a CREATE line to the global createinfo.
+ *
+ * Recognised words: auto=, owner=, group=, mode= (octal), metadata=,
+ * symlinks=yes|no, names=yes|no and bbl=yes|no.  Anything else is
+ * reported and skipped.
+ *
+ * owner= and group= accept a decimal id or a name.  The stored id is
+ * only replaced once the value has been resolved, so a name that cannot
+ * be looked up leaves the previous setting untouched instead of storing
+ * whatever digits strtoul() managed to consume.
+ */
+static void createline(char *line)
+{
+	char *w;
+	char *ep;
+
+	for (w = dl_next(line); w != line; w = dl_next(w)) {
+		if (strncasecmp(w, "auto=", 5) == 0)
+			createinfo.autof = parse_auto(w + 5, "auto=", 1);
+		else if (strncasecmp(w, "owner=", 6) == 0) {
+			unsigned long id;
+
+			if (w[6] == 0) {
+				pr_err("missing owner name\n");
+				continue;
+			}
+			id = strtoul(w + 6, &ep, 10);
+			if (*ep == 0)
+				createinfo.uid = id;
+			else {
+				struct passwd *pw;
+				/* must be a name */
+				pw = getpwnam(w + 6);
+				if (pw)
+					createinfo.uid = pw->pw_uid;
+				else
+					pr_err("CREATE user %s not found\n",
+					       w + 6);
+			}
+		} else if (strncasecmp(w, "group=", 6) == 0) {
+			unsigned long id;
+
+			if (w[6] == 0) {
+				pr_err("missing group name\n");
+				continue;
+			}
+			id = strtoul(w + 6, &ep, 10);
+			if (*ep == 0)
+				createinfo.gid = id;
+			else {
+				struct group *gr;
+				/* must be a name */
+				gr = getgrnam(w + 6);
+				if (gr)
+					createinfo.gid = gr->gr_gid;
+				else
+					pr_err("CREATE group %s not found\n",
+					       w + 6);
+			}
+		} else if (strncasecmp(w, "mode=", 5) == 0) {
+			if (w[5] == 0) {
+				pr_err("missing CREATE mode\n");
+				continue;
+			}
+			createinfo.mode = strtoul(w + 5, &ep, 8);
+			if (*ep != 0) {
+				createinfo.mode = 0600;
+				pr_err("unrecognised CREATE mode %s\n",
+				       w + 5);
+			}
+		} else if (strncasecmp(w, "metadata=", 9) == 0) {
+			/* style of metadata to use by default */
+			int i;
+			for (i = 0; superlist[i] && !createinfo.supertype; i++)
+				createinfo.supertype = superlist[i]->match_metadata_desc(w + 9);
+			if (!createinfo.supertype)
+				pr_err("metadata format %s unknown, ignoring\n",
+				       w+9);
+		} else if (strncasecmp(w, "symlinks=yes", 12) == 0)
+			createinfo.symlinks = 1;
+		else if (strncasecmp(w, "symlinks=no", 11) == 0)
+			createinfo.symlinks = 0;
+		/*
+		 * The names= and bbl= words were compared with lengths
+		 * longer than the literals, which made them whole-word
+		 * matches; strcasecmp() states that directly.
+		 */
+		else if (strcasecmp(w, "names=yes") == 0)
+			createinfo.names = 1;
+		else if (strcasecmp(w, "names=no") == 0)
+			createinfo.names = 0;
+		else if (strcasecmp(w, "bbl=no") == 0)
+			createinfo.bblist = 0;
+		else if (strcasecmp(w, "bbl=yes") == 0)
+			createinfo.bblist = 1;
+		else {
+			pr_err("unrecognised word on CREATE line: %s\n",
+			       w);
+		}
+	}
+}
+
+/*
+ * devline - record the words of a DEVICE line on cdevlist.
+ *
+ * A word is accepted when it starts with '/' or is "partitions" or
+ * "containers" (case-insensitive).  Accepted words are copied and pushed
+ * on the front of cdevlist; other words are reported and skipped.
+ */
+void devline(char *line)
+{
+	char *word = dl_next(line);
+
+	while (word != line) {
+		int accepted = word[0] == '/' ||
+			       !strcasecmp(word, "partitions") ||
+			       !strcasecmp(word, "containers");
+
+		if (!accepted) {
+			pr_err("unreconised word on DEVICE line: %s\n", word);
+		} else {
+			struct conf_dev *entry = xmalloc(sizeof(*entry));
+
+			entry->name = xstrdup(word);
+			entry->next = cdevlist;
+			cdevlist = entry;
+		}
+		word = dl_next(word);
+	}
+}
+
+/* ARRAY entries in the order they were read; arrayline() appends here. */
+struct mddev_ident *mddevlist = NULL;
+/* Address of the pointer that the next appended entry is stored through. */
+struct mddev_ident **mddevlp = &mddevlist;
+
+/*
+ * is_number - return non-zero when @w consists of one or more decimal
+ * digits and nothing else; an empty string yields 0.
+ */
+static int is_number(char *w)
+{
+	int digits = 0;
+
+	/*
+	 * isdigit() needs an unsigned char value; the terminating NUL is
+	 * not a digit, so it ends the loop too.
+	 */
+	while (isdigit((unsigned char)*w)) {
+		digits++;
+		w++;
+	}
+	return (digits && ! *w);
+}
+
+/*
+ * arrayline - parse one ARRAY line and append it to the mddevlist.
+ *
+ * Words without '=' (or starting with '/') name the md device; every
+ * other word is a key=value attribute.  Problems with individual words
+ * are reported with pr_err() and the word is skipped.
+ *
+ * A line is only kept when it carries identity information: a uuid, a
+ * devices list, a super-minor, a name, or both container and member.
+ * Otherwise it is reported and the strings duplicated for it are freed.
+ */
+void arrayline(char *line)
+{
+	char *w;
+
+	struct mddev_ident mis;
+	struct mddev_ident *mi;
+
+	/*
+	 * Start from all-zero so that members which are not set below
+	 * (e.g. uuid when no uuid= word is given) are not copied into the
+	 * list entry with indeterminate contents.
+	 */
+	memset(&mis, 0, sizeof(mis));
+	mis.uuid_set = 0;
+	mis.super_minor = UnSet;
+	mis.level = UnSet;
+	mis.raid_disks = UnSet;
+	mis.spare_disks = 0;
+	mis.devices = NULL;
+	mis.devname = NULL;
+	mis.spare_group = NULL;
+	mis.autof = 0;
+	mis.next = NULL;
+	mis.st = NULL;
+	mis.bitmap_fd = -1;
+	mis.bitmap_file = NULL;
+	mis.name[0] = 0;
+	mis.container = NULL;
+	mis.member = NULL;
+
+	for (w = dl_next(line); w != line; w = dl_next(w)) {
+		if (w[0] == '/' || strchr(w, '=') == NULL) {
+			/* This names the device, or is '<ignore>'.
+			 * The rules match those in create_mddev.
+			 * 'w' must be:
+			 *  /dev/md/{anything}
+			 *  /dev/mdNN
+			 *  /dev/md_dNN
+			 *  <ignore>
+			 *  or anything that doesn't start '/' or '<'
+			 */
+			if (strcasecmp(w, "<ignore>") == 0 ||
+			    strncmp(w, "/dev/md/", 8) == 0 ||
+			    (w[0] != '/' && w[0] != '<') ||
+			    (strncmp(w, "/dev/md", 7) == 0 &&
+			     is_number(w + 7)) ||
+			    (strncmp(w, "/dev/md_d", 9) == 0 &&
+			     is_number(w + 9))) {
+				/* This is acceptable */;
+				if (mis.devname)
+					pr_err("only give one device per ARRAY line: %s and %s\n",
+					       mis.devname, w);
+				else
+					mis.devname = w;
+			}else {
+				pr_err("%s is an invalid name for an md device - ignored.\n", w);
+			}
+		} else if (strncasecmp(w, "uuid=", 5) == 0) {
+			if (mis.uuid_set)
+				pr_err("only specify uuid once, %s ignored.\n",
+				       w);
+			else {
+				if (parse_uuid(w + 5, mis.uuid))
+					mis.uuid_set = 1;
+				else
+					pr_err("bad uuid: %s\n", w);
+			}
+		} else if (strncasecmp(w, "super-minor=", 12) == 0) {
+			if (mis.super_minor != UnSet)
+				pr_err("only specify super-minor once, %s ignored.\n",
+				       w);
+			else {
+				char *endptr;
+				int minor = strtol(w + 12, &endptr, 10);
+
+				if (w[12] == 0 || endptr[0] != 0 || minor < 0)
+					pr_err("invalid super-minor number: %s\n",
+					       w);
+				else
+					mis.super_minor = minor;
+			}
+		} else if (strncasecmp(w, "name=", 5) == 0) {
+			if (mis.name[0])
+				pr_err("only specify name once, %s ignored.\n",
+				       w);
+			else if (strlen(w + 5) > 32)
+				pr_err("name too long, ignoring %s\n", w);
+			else
+				strcpy(mis.name, w + 5);
+
+		} else if (strncasecmp(w, "bitmap=", 7) == 0) {
+			if (mis.bitmap_file)
+				pr_err("only specify bitmap file once. %s ignored\n",
+				       w);
+			else
+				mis.bitmap_file = xstrdup(w + 7);
+
+		} else if (strncasecmp(w, "devices=", 8 ) == 0) {
+			if (mis.devices)
+				pr_err("only specify devices once (use a comma separated list). %s ignored\n",
+				       w);
+			else
+				mis.devices = xstrdup(w + 8);
+		} else if (strncasecmp(w, "spare-group=", 12) == 0) {
+			if (mis.spare_group)
+				pr_err("only specify one spare group per array. %s ignored.\n",
+				       w);
+			else
+				mis.spare_group = xstrdup(w + 12);
+		} else if (strncasecmp(w, "level=", 6) == 0 ) {
+			/* this is mainly for compatibility with --brief output */
+			mis.level = map_name(pers, w + 6);
+		} else if (strncasecmp(w, "disks=", 6) == 0) {
+			/* again, for compat */
+			mis.raid_disks = atoi(w + 6);
+		} else if (strncasecmp(w, "num-devices=", 12) == 0) {
+			/* again, for compat */
+			mis.raid_disks = atoi(w + 12);
+		} else if (strncasecmp(w, "spares=", 7) == 0) {
+			/* for warning if not all spares present */
+			mis.spare_disks = atoi(w + 7);
+		} else if (strncasecmp(w, "metadata=", 9) == 0) {
+			/* style of metadata on the devices. */
+			int i;
+
+			for(i=0; superlist[i] && !mis.st; i++)
+				mis.st = superlist[i]->
+					match_metadata_desc(w + 9);
+
+			if (!mis.st)
+				pr_err("metadata format %s unknown, ignored.\n",
+				       w + 9);
+		} else if (strncasecmp(w, "auto=", 5) == 0 ) {
+			/* whether to create device special files as needed */
+			mis.autof = parse_auto(w + 5, "auto type", 0);
+		} else if (strncasecmp(w, "member=", 7) == 0) {
+			/* subarray within a container; the last one wins */
+			free(mis.member);
+			mis.member = xstrdup(w + 7);
+		} else if (strncasecmp(w, "container=", 10) == 0) {
+			/* The container holding this subarray.
+			 * Either a device name or a uuid; the last one wins */
+			free(mis.container);
+			mis.container = xstrdup(w + 10);
+		} else {
+			pr_err("unrecognised word on ARRAY line: %s\n",
+			       w);
+		}
+	}
+	if (mis.uuid_set == 0 && mis.devices == NULL &&
+	    mis.super_minor == UnSet && mis.name[0] == 0 &&
+	    (mis.container == NULL || mis.member == NULL)) {
+		/* devname is NULL when the line named no device */
+		pr_err("ARRAY line %s has no identity information.\n",
+		       mis.devname ? mis.devname : "(null)");
+		/* the entry is dropped, so release what was duplicated for it */
+		free(mis.bitmap_file);
+		free(mis.spare_group);
+		free(mis.member);
+		free(mis.container);
+	} else {
+		mi = xmalloc(sizeof(*mi));
+		*mi = mis;
+		/* 'w' storage belongs to the line, so keep a private copy */
+		mi->devname = mis.devname ? xstrdup(mis.devname) : NULL;
+		mi->next = NULL;
+		*mddevlp = mi;
+		mddevlp = &mi->next;
+	}
+}
+
+/* Alert address from the first word of the first MAILADDR line seen. */
+static char *alert_email = NULL;
+
+/* mailline - remember the first word as the alert address; later words are ignored. */
+void mailline(char *line)
+{
+	char *word = dl_next(line);
+
+	while (word != line) {
+		if (!alert_email)
+			alert_email = xstrdup(word);
+		word = dl_next(word);
+	}
+}
+
+/* Sender for alert mail: all MAILFROM words joined by single spaces. */
+static char *alert_mail_from = NULL;
+
+/*
+ * mailfromline - append every word of the line to alert_mail_from.
+ * The first word starts the string; each later word is added after a
+ * single space.
+ */
+void mailfromline(char *line)
+{
+	char *word;
+
+	for (word = dl_next(line); word != line; word = dl_next(word)) {
+		char *joined = NULL;
+
+		if (!alert_mail_from) {
+			alert_mail_from = xstrdup(word);
+			continue;
+		}
+		if (xasprintf(&joined, "%s %s", alert_mail_from, word) > 0) {
+			free(alert_mail_from);
+			alert_mail_from = joined;
+		}
+	}
+}
+
+/* Alert program from the first word of the first PROGRAM line seen. */
+static char *alert_program = NULL;
+
+/* programline - remember the first word as the alert program; later words are ignored. */
+void programline(char *line)
+{
+	char *word = dl_next(line);
+
+	while (word != line) {
+		if (!alert_program)
+			alert_program = xstrdup(word);
+		word = dl_next(word);
+	}
+}
+
+/* Home host name from HOMEHOST; "" when the config says <none>. */
+static char *home_host = NULL;
+/* Cleared when a HOMEHOST line contains <ignore>. */
+static int require_homehost = 1;
+
+/*
+ * homehostline - process a HOMEHOST line.
+ * "<ignore>" clears require_homehost wherever it appears.  The first
+ * other word becomes the home host, with "<none>" stored as an empty
+ * string.
+ */
+void homehostline(char *line)
+{
+	char *word;
+
+	for (word = dl_next(line); word != line; word = dl_next(word)) {
+		if (!strcasecmp(word, "<ignore>")) {
+			require_homehost = 0;
+			continue;
+		}
+		if (home_host != NULL)
+			continue;
+		home_host = xstrdup(strcasecmp(word, "<none>") == 0 ? "" : word);
+	}
+}
+
+/* Home cluster name from HOMECLUSTER; "" when the config says <none>. */
+static char *home_cluster = NULL;
+
+/*
+ * homeclusterline - take the first word as the home cluster name, with
+ * "<none>" stored as an empty string.  Later words are ignored.
+ */
+void homeclusterline(char *line)
+{
+	char *word = dl_next(line);
+
+	while (word != line) {
+		if (!home_cluster)
+			home_cluster = xstrdup(strcasecmp(word, "<none>") == 0 ?
+					       "" : word);
+		word = dl_next(word);
+	}
+}
+
+/* Delay from MONITORDELAY; 0 while unset. */
+static int monitor_delay;
+
+/*
+ * monitordelayline - store the first word that parses to a non-zero
+ * decimal value.  Words are only examined while the delay is still 0.
+ */
+void monitordelayline(char *line)
+{
+	char *word = dl_next(line);
+
+	while (word != line) {
+		if (!monitor_delay)
+			monitor_delay = strtol(word, NULL, 10);
+		word = dl_next(word);
+	}
+}
+
+/* Values that autoline() attaches to the 'auto' policy via policy_add(). */
+char auto_yes[] = "yes";
+char auto_no[] = "no";
+char auto_homehost[] = "homehost";
+
+/* Set by the first autoline() call; later calls return immediately. */
+static int auto_seen = 0;
+/*
+ * autoline - turn an AUTO line into per-metadata 'auto' policy entries.
+ * Only the first call does anything.  Words from the MDADM_CONF_AUTO
+ * environment variable are inserted ahead of the words on the line.
+ */
+void autoline(char *line)
+{
+ char *w;
+ char *seen;
+ int super_cnt;
+ char *dflt = auto_yes;
+ int homehost = 0;
+ int i;
+
+ if (auto_seen)
+ return;
+ auto_seen = 1;
+
+ /*
+ * Parse the 'auto' line creating policy statements for the 'auto'
+ * policy.
+ *
+ * The default is 'yes' but the 'auto' line might over-ride that.
+ * Words in the line are processed in order with the first
+ * match winning.
+ * word can be:
+ * +version - that version can be assembled
+ * -version - that version cannot be auto-assembled
+ * yes or +all - any other version can be assembled
+ * no or -all - no other version can be assembled.
+ * homehost - any array associated by 'homehost' to this
+ * host can be assembled.
+ *
+ * Thus:
+ * +ddf -0.90 homehost -all
+ * will auto-assemble any ddf array, no 0.90 array, and
+ * any other array (imsm, 1.x) if and only if it is identified
+ * as belonging to this host.
+ *
+ * We translate that to policy by creating 'auto=yes' when we see
+ * a '+version' line, 'auto=no' if we see '-version' before 'homehost',
+ * or 'auto=homehost' if we see '-version' after 'homehost'.
+ * When we see yes, no, +all or -all we stop and any version that hasn't
+ * been seen gets an appropriate auto= entry.
+ */
+
+ /*
+ * If environment variable MDADM_CONF_AUTO is defined, then
+ * it is prepended to the auto line. This allow a script
+ * to easily disable some metadata types.
+ */
+ w = getenv("MDADM_CONF_AUTO");
+ if (w && *w) {
+ char *l = xstrdup(w);
+ char *head = line;
+ w = strtok(l, " \t");
+ while (w) {
+ char *nw = dl_strdup(w);
+ /* insert after the previously inserted word to keep the order */
+ dl_insert(head, nw);
+ head = nw;
+ w = strtok(NULL, " \t");
+ }
+ free(l);
+ }
+
+ /* one 'seen' flag per entry of superlist[] */
+ for (super_cnt = 0; superlist[super_cnt]; super_cnt++)
+ ;
+ seen = xcalloc(super_cnt, 1);
+
+ for (w = dl_next(line); w != line; w = dl_next(w)) {
+ char *val;
+
+ if (strcasecmp(w, "yes") == 0) {
+ dflt = auto_yes;
+ break;
+ }
+ if (strcasecmp(w, "no") == 0) {
+ if (homehost)
+ dflt = auto_homehost;
+ else
+ dflt = auto_no;
+ break;
+ }
+ if (strcasecmp(w, "homehost") == 0) {
+ homehost = 1;
+ continue;
+ }
+ if (w[0] == '+')
+ val = auto_yes;
+ else if (w[0] == '-') {
+ if (homehost)
+ val = auto_homehost;
+ else
+ val = auto_no;
+ } else
+ /* words without a leading '+' or '-' are skipped */
+ continue;
+
+ if (strcasecmp(w + 1, "all") == 0) {
+ dflt = val;
+ break;
+ }
+ for (i = 0; superlist[i]; i++) {
+ const char *version = superlist[i]->name;
+ if (strcasecmp(w + 1, version) == 0)
+ break;
+ /* 1 matches 1.x, 0 matches 0.90 */
+ if (version[1] == '.' && strlen(w + 1) == 1 &&
+ w[1] == version[0])
+ break;
+ /* 1.anything matches 1.x */
+ if (strcmp(version, "1.x") == 0 &&
+ strncmp(w + 1, "1.", 2) == 0)
+ break;
+ }
+ if (superlist[i] == NULL)
+ /* ignore this word */
+ continue;
+ if (seen[i])
+ /* already know about this metadata */
+ continue;
+ policy_add(rule_policy, pol_auto, val, pol_metadata,
+ superlist[i]->name, NULL);
+ seen[i] = 1;
+ }
+ /* every metadata type not named explicitly gets the default */
+ for (i = 0; i < super_cnt; i++)
+ if (!seen[i])
+ policy_add(rule_policy, pol_auto, dflt, pol_metadata,
+ superlist[i]->name, NULL);
+
+ free(seen);
+}
+
+/* Non-zero once load_conffile() has run to completion. */
+int loaded = 0;
+
+/* Config file to read; NULL makes load_conffile() use the built-in default. */
+static char *conffile = NULL;
+/* set_conffile - select the config file; the pointer is stored, not copied. */
+void set_conffile(char *file)
+{
+ conffile = file;
+}
+
+/*
+ * conf_file - read every logical line from @f and hand it to the parser
+ * that belongs to its leading keyword.  Lines whose keyword is not
+ * recognised are reported.  Each line is released after processing.
+ */
+void conf_file(FILE *f)
+{
+	char *line;
+
+	for (line = conf_line(f); line != NULL; line = conf_line(f)) {
+		int kw = match_keyword(line);
+
+		switch (kw) {
+		case Devices:
+			devline(line);
+			break;
+		case Array:
+			arrayline(line);
+			break;
+		case Mailaddr:
+			mailline(line);
+			break;
+		case Mailfrom:
+			mailfromline(line);
+			break;
+		case Program:
+			programline(line);
+			break;
+		case CreateDev:
+			createline(line);
+			break;
+		case Homehost:
+			homehostline(line);
+			break;
+		case HomeCluster:
+			homeclusterline(line);
+			break;
+		case AutoMode:
+			autoline(line);
+			break;
+		case Policy:
+			policyline(line, rule_policy);
+			break;
+		case PartPolicy:
+			policyline(line, rule_part);
+			break;
+		case Sysfs:
+			sysfsline(line);
+			break;
+		case MonitorDelay:
+			monitordelayline(line);
+			break;
+		default:
+			pr_err("Unknown keyword %s\n", line);
+			break;
+		}
+		free_line(line);
+	}
+}
+
+/* List node holding one directory entry name; used by conf_file_or_dir(). */
+struct fname {
+ struct fname *next;
+ char name[]; /* NUL-terminated name, allocated together with the node */
+};
+
+/*
+ * conf_file_or_dir - parse @f, which may refer to a file or a directory.
+ *
+ * A regular file is parsed directly.  For a directory, every entry whose
+ * name does not start with '.' and ends in ".conf" (with at least one
+ * character before the suffix) is parsed, in strcmp() order of the
+ * names.  Anything else is ignored, as are entries that cannot be opened.
+ *
+ * The stream stays owned by the caller: the directory is scanned through
+ * a duplicate descriptor, because closedir() closes the descriptor given
+ * to fdopendir() and the caller still calls fclose() on @f afterwards.
+ */
+void conf_file_or_dir(FILE *f)
+{
+	struct stat st;
+	DIR *dir;
+	struct dirent *dp;
+	struct fname *list = NULL;
+	int dfd;
+
+	/* without a successful fstat() the mode bits are meaningless */
+	if (fstat(fileno(f), &st) != 0)
+		return;
+	if (S_ISREG(st.st_mode)) {
+		conf_file(f);
+		return;
+	}
+	if (!S_ISDIR(st.st_mode))
+		return;
+#if _XOPEN_SOURCE >= 700 || _POSIX_C_SOURCE >= 200809L
+	dfd = dup(fileno(f));
+	if (dfd < 0)
+		return;
+	dir = fdopendir(dfd);
+	if (!dir) {
+		close(dfd);
+		return;
+	}
+	while ((dp = readdir(dir)) != NULL) {
+		int l;
+		struct fname *fn, **p;
+		if (dp->d_ino == 0)
+			continue;
+		if (dp->d_name[0] == '.')
+			continue;
+		l = strlen(dp->d_name);
+		if (l < 6 || strcmp(dp->d_name + l - 5, ".conf") != 0)
+			continue;
+		fn = xmalloc(sizeof(*fn) + l + 1);
+		strcpy(fn->name, dp->d_name);
+		/* sorted insert so the files are read in name order */
+		for (p = &list;
+		     *p && strcmp((*p)->name, fn->name) < 0;
+		     p = & (*p)->next)
+			;
+		fn->next = *p;
+		*p = fn;
+	}
+	while (list) {
+		int fd;
+		FILE *f2;
+		struct fname *fn = list;
+		list = list->next;
+		/* names are resolved relative to the directory itself */
+		fd = openat(fileno(f), fn->name, O_RDONLY);
+		free(fn);
+		if (fd < 0)
+			continue;
+		f2 = fdopen(fd, "r");
+		if (!f2) {
+			close(fd);
+			continue;
+		}
+		conf_file(f2);
+		fclose(f2);
+	}
+	/* closes dfd only; the descriptor behind 'f' is left alone */
+	closedir(dir);
+#endif
+}
+
+/*
+ * load_conffile - read the configuration once; later calls are no-ops.
+ *
+ * With no file selected the built-in default file and its ".d" directory
+ * are used.  The special name "partitions" synthesises a
+ * "DEV partitions" line and "none" reads nothing.  An AUTO line with no
+ * words is always processed at the end.
+ */
+void load_conffile(void)
+{
+ FILE *f;
+ char *confdir = NULL;
+ char *head;
+
+ if (loaded)
+ return;
+ if (conffile == NULL) {
+ conffile = DefaultConfFile;
+ confdir = DefaultConfDir;
+ }
+
+ if (strcmp(conffile, "partitions") == 0) {
+ char *list = dl_strdup("DEV");
+ dl_init(list);
+ dl_add(list, dl_strdup("partitions"));
+ devline(list);
+ free_line(list);
+ } else if (strcmp(conffile, "none") != 0) {
+ f = fopen(conffile, "r");
+ /* Debian chose to relocate mdadm.conf into /etc/mdadm/.
+ * To allow Debian users to compile from clean source and still
+ * have a working mdadm, we read /etc/mdadm/mdadm.conf
+ * if /etc/mdadm.conf doesn't exist
+ */
+ /* pointer comparison: only true when the default was chosen above */
+ if (f == NULL && conffile == DefaultConfFile) {
+ f = fopen(DefaultAltConfFile, "r");
+ if (f) {
+ conffile = DefaultAltConfFile;
+ confdir = DefaultAltConfDir;
+ }
+ }
+ if (f) {
+ conf_file_or_dir(f);
+ fclose(f);
+ }
+ /* confdir is only set for the built-in defaults */
+ if (confdir) {
+ f = fopen(confdir, "r");
+ if (f) {
+ conf_file_or_dir(f);
+ fclose(f);
+ }
+ }
+ }
+ /* If there was no AUTO line, process an empty line
+ * now so that the MDADM_CONF_AUTO env var gets processed.
+ */
+ head = dl_strdup("AUTO");
+ dl_init(head);
+ autoline(head);
+ free_line(head);
+
+ loaded = 1;
+}
+
+/* Return the configured alert address (NULL if none); loads the config on first use. */
+char *conf_get_mailaddr(void)
+{
+ load_conffile();
+ return alert_email;
+}
+
+/* Return the configured alert sender (NULL if none); loads the config on first use. */
+char *conf_get_mailfrom(void)
+{
+ load_conffile();
+ return alert_mail_from;
+}
+
+/* Return the configured alert program (NULL if none); loads the config on first use. */
+char *conf_get_program(void)
+{
+ load_conffile();
+ return alert_program;
+}
+
+/*
+ * Return the configured home host (NULL if none, "" for <none>).
+ * When require_homehostp is non-NULL it receives the require_homehost flag.
+ */
+char *conf_get_homehost(int *require_homehostp)
+{
+ load_conffile();
+ if (require_homehostp)
+ *require_homehostp = require_homehost;
+ return home_host;
+}
+
+/* Return the configured home cluster (NULL if none, "" for <none>). */
+char *conf_get_homecluster(void)
+{
+ load_conffile();
+ return home_cluster;
+}
+
+/* Return the configured monitor delay; 0 when no MONITORDELAY value was stored. */
+int conf_get_monitor_delay(void)
+{
+ load_conffile();
+ return monitor_delay;
+}
+
+/* Return a pointer to the global createinfo after the config has been loaded. */
+struct createinfo *conf_get_create_info(void)
+{
+ load_conffile();
+ return &createinfo;
+}
+
+/*
+ * conf_get_ident - look up an ARRAY entry by device name.
+ *
+ * With @dev == NULL the head of the whole list is returned.  Otherwise
+ * the first entry whose devname matches @dev (per devname_matches()) is
+ * returned, or NULL when there is none.
+ */
+struct mddev_ident *conf_get_ident(char *dev)
+{
+	struct mddev_ident *ent;
+
+	load_conffile();
+	for (ent = mddevlist; ent != NULL; ent = ent->next) {
+		if (dev == NULL)
+			break;
+		if (ent->devname != NULL && devname_matches(dev, ent->devname))
+			break;
+	}
+	return ent;
+}
+
+/* append_dlist - hook @list onto the end of the list that *dlp heads. */
+static void append_dlist(struct mddev_dev **dlp, struct mddev_dev *list)
+{
+	struct mddev_dev **tail;
+
+	for (tail = dlp; *tail != NULL; tail = &(*tail)->next)
+		;
+	*tail = list;
+}
+
+/*
+ * conf_get_devs - expand the DEVICE words into a list of device names.
+ *
+ * The list lives in a function-static pointer and is freed at the start
+ * of every call, so a returned list is only valid until the next call.
+ * With no DEVICE words the result is the partitions followed by the
+ * containers.
+ */
+struct mddev_dev *conf_get_devs()
+{
+ glob_t globbuf;
+ struct conf_dev *cd;
+ int flags = 0;
+ static struct mddev_dev *dlist = NULL;
+ unsigned int i;
+
+ /* drop the list handed out by the previous call */
+ while (dlist) {
+ struct mddev_dev *t = dlist;
+ dlist = dlist->next;
+ free(t->devname);
+ free(t);
+ }
+
+ load_conffile();
+
+ if (cdevlist == NULL) {
+ /* default to 'partitions' and 'containers' */
+ dlist = load_partitions();
+ append_dlist(&dlist, load_containers());
+ }
+
+ for (cd = cdevlist; cd; cd = cd->next) {
+ if (strcasecmp(cd->name, "partitions") == 0)
+ append_dlist(&dlist, load_partitions());
+ else if (strcasecmp(cd->name, "containers") == 0)
+ append_dlist(&dlist, load_containers());
+ else {
+ /* all patterns accumulate in one globbuf via GLOB_APPEND */
+ glob(cd->name, flags, NULL, &globbuf);
+ flags |= GLOB_APPEND;
+ }
+ }
+ /* GLOB_APPEND is only set once glob() has been called at least once */
+ if (flags & GLOB_APPEND) {
+ for (i = 0; i < globbuf.gl_pathc; i++) {
+ struct mddev_dev *t;
+ t = xcalloc(1, sizeof(*t));
+ t->devname = xstrdup(globbuf.gl_pathv[i]);
+ t->next = dlist;
+ dlist = t;
+/* printf("one dev is %s\n", t->devname);*/
+ }
+ globfree(&globbuf);
+ }
+
+ return dlist;
+}
+
+/*
+ * conf_test_dev - check whether @devname is covered by the DEVICE words.
+ *
+ * Returns 1 when no DEVICE words were given, when one of them is
+ * "partitions", or when one of them matches @devname as an
+ * fnmatch(FNM_PATHNAME) pattern; otherwise 0.
+ */
+int conf_test_dev(char *devname)
+{
+	struct conf_dev *cd = cdevlist;
+
+	if (!cd)
+		/* allow anything by default */
+		return 1;
+	while (cd) {
+		if (!strcasecmp(cd->name, "partitions") ||
+		    !fnmatch(cd->name, devname, FNM_PATHNAME))
+			return 1;
+		cd = cd->next;
+	}
+	return 0;
+}
+
+/*
+ * conf_test_metadata - evaluate the 'auto' policy for a metadata version.
+ *
+ * Any applicable "yes" wins at once.  Failing that, "homehost" allows
+ * the array when @is_homehost is set.  Failing that, a "no" denies it.
+ * With none of these the answer is yes.  Returns 1 for yes, 0 for no.
+ */
+int conf_test_metadata(const char *version, struct dev_policy *pol, int is_homehost)
+{
+	struct dev_policy *rule;
+	int saw_no = 0;
+	int saw_homehost = 0;
+
+	load_conffile();
+
+	pol = pol_find(pol, pol_auto);
+	pol_for_each(rule, pol, version) {
+		if (!strcmp(rule->value, "yes"))
+			return 1;
+		else if (!strcmp(rule->value, "homehost"))
+			saw_homehost = 1;
+		else if (!strcmp(rule->value, "no"))
+			saw_no = 1;
+	}
+	if (saw_no && !(is_homehost && saw_homehost))
+		return 0;
+	return 1;
+}
+
+/*
+ * match_oneof - test @devname against a comma separated pattern list.
+ *
+ * Each pattern is applied with fnmatch(FNM_PATHNAME).  Patterns of 1024
+ * characters or more are skipped.  Returns 1 on the first match and 0
+ * when nothing matches or @devices is NULL or empty.
+ */
+int match_oneof(char *devices, char *devname)
+{
+	char patn[1024];
+
+	if (!devices)
+		return 0;
+
+	while (*devices) {
+		char *start = devices;
+		char *end = strchr(start, ',');
+		size_t plen;
+
+		if (end == NULL)
+			end = start + strlen(start);
+		plen = end - start;
+		if (plen < sizeof(patn)) {
+			memcpy(patn, start, plen);
+			patn[plen] = '\0';
+			if (fnmatch(patn, devname, FNM_PATHNAME) == 0)
+				return 1;
+		}
+		/* step over the separator, or stop on the terminator */
+		devices = (*end == ',') ? end + 1 : end;
+	}
+	return 0;
+}
+
+/*
+ * devname_matches - compare an array name with a name from the config.
+ *
+ * Both strings are normalised the same way before an exact strcmp():
+ * a leading "/dev/md/" (or else "/dev/") is stripped, then a leading
+ * "md" is dropped when a digit follows, so "/dev/md0", "md0" and "0"
+ * all compare equal.  Returns 1 on a match and 0 otherwise.
+ */
+int devname_matches(char *name, char *match)
+{
+	if (strncmp(name, "/dev/md/", 8) == 0)
+		name += 8;
+	else if (strncmp(name, "/dev/", 5) == 0)
+		name += 5;
+
+	if (strncmp(match, "/dev/md/", 8) == 0)
+		match += 8;
+	else if (strncmp(match, "/dev/", 5) == 0)
+		match += 5;
+
+	/* isdigit() needs an unsigned char value, not a plain char */
+	if (strncmp(name, "md", 2) == 0 && isdigit((unsigned char)name[2]))
+		name += 2;
+	if (strncmp(match, "md", 2) == 0 && isdigit((unsigned char)match[2]))
+		match += 2;
+
+	return (strcmp(name, match) == 0);
+}
+
+/*
+ * conf_name_is_free - report whether @name is unused by any ARRAY entry.
+ *
+ * A name counts as taken when it matches an entry's devname, its name=
+ * value, or its super-minor written as a decimal number.  Returns 0 when
+ * taken and 1 when free.
+ */
+int conf_name_is_free(char *name)
+{
+	struct mddev_ident *ent;
+
+	load_conffile();
+	ent = mddevlist;
+	while (ent != NULL) {
+		if (ent->devname != NULL && devname_matches(name, ent->devname))
+			return 0;
+		if (ent->name[0] != '\0' && devname_matches(name, ent->name))
+			return 0;
+		if (ent->super_minor != UnSet) {
+			char minor_str[100];
+
+			sprintf(minor_str, "%d", ent->super_minor);
+			if (devname_matches(name, minor_str))
+				return 0;
+		}
+		ent = ent->next;
+	}
+	return 1;
+}
+
+/*
+ * conf_match - find the single ARRAY entry that is consistent with @info.
+ *
+ * An entry is rejected as soon as one criterion it specifies (uuid, name,
+ * devices list, super-minor) disagrees, or when it specifies none of
+ * them.  With verbose >= 2 the reason is reported for entries that have
+ * a devname.
+ *
+ * Returns the matching entry, or NULL when none matches.  When more than
+ * one entry matches, NULL is returned and *rvp is set to 2 if rvp is
+ * non-NULL; rvp is not written otherwise.
+ */
+struct mddev_ident *conf_match(struct supertype *st,
+ struct mdinfo *info,
+ char *devname,
+ int verbose, int *rvp)
+{
+ struct mddev_ident *array_list, *match;
+ array_list = conf_get_ident(NULL);
+ match = NULL;
+ for (; array_list; array_list = array_list->next) {
+ if (array_list->uuid_set &&
+ same_uuid(array_list->uuid, info->uuid,
+ st->ss->swapuuid) == 0) {
+ if (verbose >= 2 && array_list->devname)
+ pr_err("UUID differs from %s.\n",
+ array_list->devname);
+ continue;
+ }
+ if (array_list->name[0] &&
+ strcasecmp(array_list->name, info->name) != 0) {
+ if (verbose >= 2 && array_list->devname)
+ pr_err("Name differs from %s.\n",
+ array_list->devname);
+ continue;
+ }
+ /* the devices list is only checked when a devname was supplied */
+ if (array_list->devices && devname &&
+ !match_oneof(array_list->devices, devname)) {
+ if (verbose >= 2 && array_list->devname)
+ pr_err("Not a listed device for %s.\n",
+ array_list->devname);
+ continue;
+ }
+ if (array_list->super_minor != UnSet &&
+ array_list->super_minor != info->array.md_minor) {
+ if (verbose >= 2 && array_list->devname)
+ pr_err("Different super-minor to %s.\n",
+ array_list->devname);
+ continue;
+ }
+ if (!array_list->uuid_set && !array_list->name[0] &&
+ !array_list->devices && array_list->super_minor == UnSet) {
+ if (verbose >= 2 && array_list->devname)
+ pr_err("%s doesn't have any identifying information.\n",
+ array_list->devname);
+ continue;
+ }
+ /* FIXME, should I check raid_disks and level too?? */
+
+ /* a second surviving entry makes the result ambiguous */
+ if (match) {
+ if (verbose >= 0) {
+ if (match->devname && array_list->devname)
+ pr_err("we match both %s and %s - cannot decide which to use.\n",
+ match->devname,
+ array_list->devname);
+ else
+ pr_err("multiple lines in mdadm.conf match\n");
+ }
+ if (rvp)
+ *rvp = 2;
+ match = NULL;
+ break;
+ }
+ match = array_list;
+ }
+ return match;
+}
+
+/*
+ * conf_verify_devnames - look for a device name used by two ARRAY entries.
+ *
+ * Entries without a devname and entries named "<ignore>" are not used as
+ * the first half of a pair.  The first duplicate found is reported (with
+ * both UUIDs when both entries have one) and 1 is returned.  Returns 0
+ * when every name is unique.
+ */
+int conf_verify_devnames(struct mddev_ident *array_list)
+{
+	struct mddev_ident *first, *second;
+
+	for (first = array_list; first; first = first->next) {
+		if (first->devname == NULL ||
+		    !strcmp(first->devname, "<ignore>"))
+			continue;
+
+		for (second = first->next; second; second = second->next) {
+			char nbuf[64];
+
+			if (second->devname == NULL ||
+			    strcmp(first->devname, second->devname) != 0)
+				continue;
+
+			if (!(first->uuid_set && second->uuid_set)) {
+				pr_err("Device %s given twice in config file\n", first->devname);
+				return 1;
+			}
+			__fname_from_uuid(first->uuid, 0, nbuf, ':');
+			pr_err("Devices %s and ",
+			       nbuf);
+			__fname_from_uuid(second->uuid, 0, nbuf, ':');
+			fprintf(stderr,
+				"%s have the same name: %s\n",
+				nbuf, first->devname);
+			return 1;
+		}
+	}
+
+	return 0;
+}
diff --git a/coverity-gcc-hack.h b/coverity-gcc-hack.h
new file mode 100644
index 0000000..2d94a8b
--- /dev/null
+++ b/coverity-gcc-hack.h
@@ -0,0 +1,10 @@
+/*
+ * Active only outside kernel builds, on x86_64, and when
+ * __COVERITY_GCC_VERSION_AT_LEAST is defined and reports version 7.0 or
+ * later: supply typedefs for the _FloatN and _FloatNx names.
+ * NOTE(review): presumably these are placeholders for types the Coverity
+ * front end does not provide itself - confirm before relying on them.
+ */
+#if !defined(__KERNEL__) && defined(__x86_64__) && defined(__COVERITY_GCC_VERSION_AT_LEAST)
+#if __COVERITY_GCC_VERSION_AT_LEAST(7, 0)
+typedef float _Float128 __attribute__((__vector_size__(128)));
+typedef float _Float64 __attribute__((__vector_size__(64)));
+typedef float _Float32 __attribute__((__vector_size__(32)));
+typedef float _Float128x __attribute__((__vector_size__(128)));
+typedef float _Float64x __attribute__((__vector_size__(64)));
+typedef float _Float32x __attribute__((__vector_size__(32)));
+#endif
+#endif
diff --git a/crc32.c b/crc32.c
new file mode 100644
index 0000000..94fda06
--- /dev/null
+++ b/crc32.c
@@ -0,0 +1,360 @@
+/* crc32.c -- compute the CRC-32 of a data stream
+ * Copyright (C) 1995-2003 Mark Adler
+ * For conditions of distribution and use, see copyright notice in zlib.h
+ *
+ * Note: zlib license from zlib.h added explicitly as mdadm does
+ * not include zlib.h. License from v1.2.2 of zlib:
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ *
+ *
+ * Thanks to Rodney Brown <rbrown64@csc.com.au> for his contribution of faster
+ * CRC methods: exclusive-oring 32 bits of data at a time, and pre-computing
+ * tables for updating the shift register in one step with three exclusive-ors
+ * instead of four steps with four exclusive-ors. This results in about a factor
+ * of two increase in speed on a Power PC G4 (PPC7455) using gcc -O3.
+ */
+
+/* @(#) $Id$ */
+
+/*
+ Note on the use of DYNAMIC_CRC_TABLE: there is no mutex or semaphore
+ protection on the static variables used to control the first-use generation
+ of the crc tables. Therefore, if you #define DYNAMIC_CRC_TABLE, you should
+ first call get_crc_table() to initialize the tables before allowing more than
+ one thread to use crc32().
+ */
+
+#ifdef MAKECRCH
+#  include <stdio.h>
+#  ifndef DYNAMIC_CRC_TABLE
+#    define DYNAMIC_CRC_TABLE
+#  endif /* !DYNAMIC_CRC_TABLE */
+#endif /* MAKECRCH */
+
+/* Local stand-ins for what "zutil.h" would provide (STDC and FAR definitions, etc.) */
+#define STDC
+#define FAR
+#define Z_NULL ((void*)0)
+#define OF(X) X
+#define ZEXPORT
+#include <stddef.h>     /* ptrdiff_t: use the platform's definition, not a guessed width */
+#define NOBYFOUR        /* keeps BYFOUR undefined, so only the byte-at-a-time code is built */
+
+#define local static
+
+/*
+ * Find a four-byte integer type, u4, for crc32_little() and crc32_big().
+ * BYFOUR stays defined only if <limits.h> shows that unsigned int, unsigned
+ * long or unsigned short has a maximum value of exactly 0xffffffff.
+ */
+#ifndef NOBYFOUR
+#  ifdef STDC           /* need ANSI C limits.h to determine sizes */
+#    include <limits.h>
+#    define BYFOUR
+#    if (UINT_MAX == 0xffffffffUL)
+       typedef unsigned int u4;
+#    elif (ULONG_MAX == 0xffffffffUL)
+       typedef unsigned long u4;
+#    elif (USHRT_MAX == 0xffffffffUL)
+       typedef unsigned short u4;
+#    else
+#      undef BYFOUR     /* can't find a four-byte integer type! */
+#    endif
+#  endif /* STDC */
+#endif /* !NOBYFOUR */
+
+/* Definitions for doing the crc four data bytes at a time; REV(w) byte-swaps w (for a 32-bit value). */
+#ifdef BYFOUR
+# define REV(w) (((w)>>24)+(((w)>>8)&0xff00)+ \
+ (((w)&0xff00)<<8)+(((w)&0xff)<<24))
+ local unsigned long crc32_little OF((unsigned long,
+ const unsigned char FAR *, unsigned));
+ local unsigned long crc32_big OF((unsigned long,
+ const unsigned char FAR *, unsigned));
+# define TBLS 8 /* crc_table rows: make_crc_table() fills 0-3 plus byte-reversed copies in 4-7 */
+#else
+# define TBLS 1 /* crc_table has just the one byte-wise row */
+#endif /* BYFOUR */
+
+#ifdef DYNAMIC_CRC_TABLE
+
+local volatile int crc_table_empty = 1; /* set to 0 by make_crc_table() once crc_table is filled */
+local unsigned long FAR crc_table[TBLS][256]; /* filled at run time here; otherwise the table comes from crc32.h */
+local void make_crc_table OF((void));
+#ifdef MAKECRCH
+ local void write_table OF((FILE *, const unsigned long FAR *)); /* prints one 256-entry table as C source */
+#endif /* MAKECRCH */
+
+/*
+ Generate tables for a byte-wise 32-bit CRC calculation on the polynomial:
+ x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1.
+
+ Polynomials over GF(2) are represented in binary, one bit per coefficient,
+ with the lowest powers in the most significant bit. Then adding polynomials
+ is just exclusive-or, and multiplying a polynomial by x is a right shift by
+ one. If we call the above polynomial p, and represent a byte as the
+ polynomial q, also with the lowest power in the most significant bit (so the
+ byte 0xb1 is the polynomial x^7+x^3+x+1), then the CRC is (q*x^32) mod p,
+ where a mod b means the remainder after dividing a by b.
+
+ This calculation is done using the shift-register method of multiplying and
+ taking the remainder. The register is initialized to zero, and for each
+ incoming bit, x^32 is added mod p to the register if the bit is a one (where
+ x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by
+ x (which is shifting right by one and adding x^32 mod p if the bit shifted
+ out is a one). We start with the highest power (least significant bit) of
+ q and repeat for all eight bits of q.
+
+ The first table is simply the CRC of all possible eight bit values. This is
+ all the information needed to generate CRCs on data a byte at a time for all
+ combinations of CRC register values and incoming bytes. The remaining tables
+ allow for word-at-a-time CRC calculation for both big-endian and little-
+ endian machines, where a word is four bytes.
+*/
+local void make_crc_table()
+{
+    /* exponents of the terms of the polynomial defining this crc (except x^32) */
+    static const unsigned char terms[] = {0,1,2,4,5,7,8,10,11,12,16,22,23,26};
+    static volatile int first = 1;  /* cleared by the first caller to arrive */
+    unsigned long xor_pattern;      /* the polynomial as an exclusive-or mask */
+    unsigned long reg;
+    int idx, step;
+
+    /* Only the first caller builds the tables; later callers wait for it.  Not
+       thread-safe, but it narrows the window if the DYNAMIC_CRC_TABLE advice is ignored. */
+    if (!first) {
+        /* someone got here earlier: spin until the tables are marked ready */
+        while (crc_table_empty)
+            ;
+    }
+    else {
+        first = 0;
+
+        /* set one bit per polynomial term, giving 0xedb88320UL */
+        xor_pattern = 0UL;
+        for (idx = 0; idx < sizeof(terms)/sizeof(unsigned char); idx++)
+            xor_pattern |= 1UL << (31 - terms[idx]);
+
+        /* table 0: the crc of every 8-bit value */
+        for (idx = 0; idx < 256; idx++) {
+            reg = (unsigned long)idx;
+            for (step = 0; step < 8; step++) {
+                if (reg & 1)
+                    reg = xor_pattern ^ (reg >> 1);
+                else
+                    reg = reg >> 1;
+            }
+            crc_table[0][idx] = reg;
+        }
+
+#ifdef BYFOUR
+        /* tables 1-3: each value followed by 1-3 zeros; tables 4-7: tables 0-3 byte-reversed */
+        for (idx = 0; idx < 256; idx++) {
+            reg = crc_table[0][idx];
+            crc_table[4][idx] = REV(reg);
+            for (step = 1; step < 4; step++) {
+                reg = crc_table[0][reg & 0xff] ^ (reg >> 8);
+                crc_table[step][idx] = reg;
+                crc_table[step + 4][idx] = REV(reg);
+            }
+        }
+#endif /* BYFOUR */
+
+        crc_table_empty = 0;
+    }
+
+#ifdef MAKECRCH
+    /* write out CRC tables to crc32.h */
+    {
+        FILE *out = fopen("crc32.h", "w");
+        if (!out) return;
+        fprintf(out, "/* crc32.h -- tables for rapid CRC calculation\n");
+        fprintf(out, " * Generated automatically by crc32.c\n */\n\n");
+        fprintf(out, "local const unsigned long FAR ");
+        fprintf(out, "crc_table[TBLS][256] =\n{\n {\n");
+        write_table(out, crc_table[0]);
+#  ifdef BYFOUR
+        fprintf(out, "#ifdef BYFOUR\n");
+        for (step = 1; step < 8; step++) {
+            fprintf(out, " },\n {\n");
+            write_table(out, crc_table[step]);
+        }
+        fprintf(out, "#endif\n");
+#  endif /* BYFOUR */
+        fprintf(out, " }\n};\n");
+        fclose(out);
+    }
+#endif /* MAKECRCH */
+}
+
+#ifdef MAKECRCH
+/* Print the 256 entries of table to out as unsigned long constants, five to
+   a line, with a separator after every entry except the last. */
+local void write_table(FILE *out, const unsigned long FAR *table)
+{
+    int n;
+
+    for (n = 0; n < 256; n++)
+        fprintf(out, "%s0x%08lxUL%s", n % 5 ? "" : " ", table[n],
+                n == 255 ? "\n" : (n % 5 == 4 ? ",\n" : ", "));
+}
+#endif /* MAKECRCH */
+
+#else /* !DYNAMIC_CRC_TABLE */
+/* ========================================================================
+ * Tables of CRC-32s of all single-byte values, made by make_crc_table().
+ */
+#include "crc32.h"
+#endif /* DYNAMIC_CRC_TABLE */
+
+/* =========================================================================
+ * Hand out the CRC tables as a flat pointer; asm versions of crc32() can use it
+ */
+const unsigned long FAR * ZEXPORT get_crc_table(void)
+{
+#ifdef DYNAMIC_CRC_TABLE
+    if (crc_table_empty != 0)
+        make_crc_table();       /* build the tables on first use */
+#endif /* DYNAMIC_CRC_TABLE */
+    return (const unsigned long FAR *)&crc_table[0][0];
+}
+
+/* ========================================================================= */
+#define DO1 crc = crc_table[0][((int)crc ^ (*buf++)) & 0xff] ^ (crc >> 8) /* fold the byte at *buf into crc, then advance buf */
+#define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1 /* eight DO1 steps in a row */
+
+/* =========================================================================
+ * Feed len bytes at buf through the CRC register crc, returning the result.
+ * A null buf returns 0.  The xor with 0xffffffffUL on entry and exit is not
+ * applied on the byte-at-a-time path here, so it returns the raw register.
+ */
+unsigned long ZEXPORT crc32(
+    unsigned long crc,
+    const unsigned char FAR *buf,
+    unsigned len)
+{
+    if (buf == Z_NULL)
+        return 0UL;
+
+#ifdef DYNAMIC_CRC_TABLE
+    if (crc_table_empty != 0)
+        make_crc_table();
+#endif /* DYNAMIC_CRC_TABLE */
+
+#ifdef BYFOUR
+    if (sizeof(void *) == sizeof(ptrdiff_t)) {
+        u4 probe = 1;   /* its first byte is non-zero on a little-endian host */
+
+        return *((unsigned char *)(&probe)) ? crc32_little(crc, buf, len)
+                                            : crc32_big(crc, buf, len);
+    }
+#endif /* BYFOUR */
+    for (; len >= 8; len -= 8) {    /* eight bytes per pass */
+        DO8;
+    }
+    for (; len != 0; len--) {       /* then the remaining bytes one by one */
+        DO1;
+    }
+    return crc;
+}
+
+#ifdef BYFOUR
+
+/* DOLIT4 folds the 32-bit word at buf4 into c; DOLIT32 does eight of them. */
+#define DOLIT4 c ^= *buf4++; \
+        c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \
+            crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24]
+#define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4
+
+/* =========================================================================
+ * Little-endian, word-at-a-time crc32(): bytes until buf is 4-byte aligned,
+ * whole words, then leftover bytes.  NOTE(review): c is complemented on entry
+ * and exit here but not in crc32()'s byte loop -- results differ under BYFOUR.
+ */
+local unsigned long crc32_little(unsigned long crc,
+        const unsigned char FAR *buf, unsigned len)
+{
+    register u4 c;
+    register const u4 FAR *buf4;
+
+    c = (u4)crc;
+    c = ~c;
+    while (len && ((ptrdiff_t)buf & 3)) {
+        c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8);
+        len--;
+    }
+    buf4 = (const u4 FAR *)buf;
+    while (len >= 32) {
+        DOLIT32;
+        len -= 32;
+    }
+    while (len >= 4) {
+        DOLIT4;
+        len -= 4;
+    }
+    buf = (const unsigned char FAR *)buf4;
+    if (len) do {
+        c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8);
+    } while (--len);
+    c = ~c;
+    return (unsigned long)c;
+}
+
+/* DOBIG4 folds the 32-bit word at buf4 into c; DOBIG32 does eight of them. */
+#define DOBIG4 c ^= *buf4++; \
+        c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \
+            crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24]
+#define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4
+
+/* =========================================================================
+ * Big-endian, word-at-a-time crc32(), working on a byte-reversed register
+ * with tables 4-7: bytes until buf is 4-byte aligned, whole words, leftover
+ * bytes.  NOTE(review): c is complemented on entry and exit here but not in
+ * crc32()'s byte loop -- results differ under BYFOUR.
+ */
+local unsigned long crc32_big(unsigned long crc,
+        const unsigned char FAR *buf, unsigned len)
+{
+    register u4 c;
+    register const u4 FAR *buf4;
+
+    c = REV((u4)crc);
+    c = ~c;
+    while (len && ((ptrdiff_t)buf & 3)) {
+        c = crc_table[4][(c >> 24) ^ *buf++] ^ (c << 8);
+        len--;
+    }
+    /* DOBIG4 post-increments, so no pointer before the buffer is ever formed */
+    buf4 = (const u4 FAR *)buf;
+    while (len >= 32) {
+        DOBIG32;
+        len -= 32;
+    }
+    while (len >= 4) {
+        DOBIG4;
+        len -= 4;
+    }
+    buf = (const unsigned char FAR *)buf4;
+    if (len) do {
+        c = crc_table[4][(c >> 24) ^ *buf++] ^ (c << 8);
+    } while (--len);
+    c = ~c;
+    return (unsigned long)(REV(c));
+}
+
+#endif /* BYFOUR */
diff --git a/crc32.h b/crc32.h
new file mode 100644
index 0000000..8053b61
--- /dev/null
+++ b/crc32.h
@@ -0,0 +1,441 @@
+/* crc32.h -- tables for rapid CRC calculation
+ * Generated automatically by crc32.c
+ */
+
+local const unsigned long FAR crc_table[TBLS][256] =
+{
+ {
+ 0x00000000UL, 0x77073096UL, 0xee0e612cUL, 0x990951baUL, 0x076dc419UL,
+ 0x706af48fUL, 0xe963a535UL, 0x9e6495a3UL, 0x0edb8832UL, 0x79dcb8a4UL,
+ 0xe0d5e91eUL, 0x97d2d988UL, 0x09b64c2bUL, 0x7eb17cbdUL, 0xe7b82d07UL,
+ 0x90bf1d91UL, 0x1db71064UL, 0x6ab020f2UL, 0xf3b97148UL, 0x84be41deUL,
+ 0x1adad47dUL, 0x6ddde4ebUL, 0xf4d4b551UL, 0x83d385c7UL, 0x136c9856UL,
+ 0x646ba8c0UL, 0xfd62f97aUL, 0x8a65c9ecUL, 0x14015c4fUL, 0x63066cd9UL,
+ 0xfa0f3d63UL, 0x8d080df5UL, 0x3b6e20c8UL, 0x4c69105eUL, 0xd56041e4UL,
+ 0xa2677172UL, 0x3c03e4d1UL, 0x4b04d447UL, 0xd20d85fdUL, 0xa50ab56bUL,
+ 0x35b5a8faUL, 0x42b2986cUL, 0xdbbbc9d6UL, 0xacbcf940UL, 0x32d86ce3UL,
+ 0x45df5c75UL, 0xdcd60dcfUL, 0xabd13d59UL, 0x26d930acUL, 0x51de003aUL,
+ 0xc8d75180UL, 0xbfd06116UL, 0x21b4f4b5UL, 0x56b3c423UL, 0xcfba9599UL,
+ 0xb8bda50fUL, 0x2802b89eUL, 0x5f058808UL, 0xc60cd9b2UL, 0xb10be924UL,
+ 0x2f6f7c87UL, 0x58684c11UL, 0xc1611dabUL, 0xb6662d3dUL, 0x76dc4190UL,
+ 0x01db7106UL, 0x98d220bcUL, 0xefd5102aUL, 0x71b18589UL, 0x06b6b51fUL,
+ 0x9fbfe4a5UL, 0xe8b8d433UL, 0x7807c9a2UL, 0x0f00f934UL, 0x9609a88eUL,
+ 0xe10e9818UL, 0x7f6a0dbbUL, 0x086d3d2dUL, 0x91646c97UL, 0xe6635c01UL,
+ 0x6b6b51f4UL, 0x1c6c6162UL, 0x856530d8UL, 0xf262004eUL, 0x6c0695edUL,
+ 0x1b01a57bUL, 0x8208f4c1UL, 0xf50fc457UL, 0x65b0d9c6UL, 0x12b7e950UL,
+ 0x8bbeb8eaUL, 0xfcb9887cUL, 0x62dd1ddfUL, 0x15da2d49UL, 0x8cd37cf3UL,
+ 0xfbd44c65UL, 0x4db26158UL, 0x3ab551ceUL, 0xa3bc0074UL, 0xd4bb30e2UL,
+ 0x4adfa541UL, 0x3dd895d7UL, 0xa4d1c46dUL, 0xd3d6f4fbUL, 0x4369e96aUL,
+ 0x346ed9fcUL, 0xad678846UL, 0xda60b8d0UL, 0x44042d73UL, 0x33031de5UL,
+ 0xaa0a4c5fUL, 0xdd0d7cc9UL, 0x5005713cUL, 0x270241aaUL, 0xbe0b1010UL,
+ 0xc90c2086UL, 0x5768b525UL, 0x206f85b3UL, 0xb966d409UL, 0xce61e49fUL,
+ 0x5edef90eUL, 0x29d9c998UL, 0xb0d09822UL, 0xc7d7a8b4UL, 0x59b33d17UL,
+ 0x2eb40d81UL, 0xb7bd5c3bUL, 0xc0ba6cadUL, 0xedb88320UL, 0x9abfb3b6UL,
+ 0x03b6e20cUL, 0x74b1d29aUL, 0xead54739UL, 0x9dd277afUL, 0x04db2615UL,
+ 0x73dc1683UL, 0xe3630b12UL, 0x94643b84UL, 0x0d6d6a3eUL, 0x7a6a5aa8UL,
+ 0xe40ecf0bUL, 0x9309ff9dUL, 0x0a00ae27UL, 0x7d079eb1UL, 0xf00f9344UL,
+ 0x8708a3d2UL, 0x1e01f268UL, 0x6906c2feUL, 0xf762575dUL, 0x806567cbUL,
+ 0x196c3671UL, 0x6e6b06e7UL, 0xfed41b76UL, 0x89d32be0UL, 0x10da7a5aUL,
+ 0x67dd4accUL, 0xf9b9df6fUL, 0x8ebeeff9UL, 0x17b7be43UL, 0x60b08ed5UL,
+ 0xd6d6a3e8UL, 0xa1d1937eUL, 0x38d8c2c4UL, 0x4fdff252UL, 0xd1bb67f1UL,
+ 0xa6bc5767UL, 0x3fb506ddUL, 0x48b2364bUL, 0xd80d2bdaUL, 0xaf0a1b4cUL,
+ 0x36034af6UL, 0x41047a60UL, 0xdf60efc3UL, 0xa867df55UL, 0x316e8eefUL,
+ 0x4669be79UL, 0xcb61b38cUL, 0xbc66831aUL, 0x256fd2a0UL, 0x5268e236UL,
+ 0xcc0c7795UL, 0xbb0b4703UL, 0x220216b9UL, 0x5505262fUL, 0xc5ba3bbeUL,
+ 0xb2bd0b28UL, 0x2bb45a92UL, 0x5cb36a04UL, 0xc2d7ffa7UL, 0xb5d0cf31UL,
+ 0x2cd99e8bUL, 0x5bdeae1dUL, 0x9b64c2b0UL, 0xec63f226UL, 0x756aa39cUL,
+ 0x026d930aUL, 0x9c0906a9UL, 0xeb0e363fUL, 0x72076785UL, 0x05005713UL,
+ 0x95bf4a82UL, 0xe2b87a14UL, 0x7bb12baeUL, 0x0cb61b38UL, 0x92d28e9bUL,
+ 0xe5d5be0dUL, 0x7cdcefb7UL, 0x0bdbdf21UL, 0x86d3d2d4UL, 0xf1d4e242UL,
+ 0x68ddb3f8UL, 0x1fda836eUL, 0x81be16cdUL, 0xf6b9265bUL, 0x6fb077e1UL,
+ 0x18b74777UL, 0x88085ae6UL, 0xff0f6a70UL, 0x66063bcaUL, 0x11010b5cUL,
+ 0x8f659effUL, 0xf862ae69UL, 0x616bffd3UL, 0x166ccf45UL, 0xa00ae278UL,
+ 0xd70dd2eeUL, 0x4e048354UL, 0x3903b3c2UL, 0xa7672661UL, 0xd06016f7UL,
+ 0x4969474dUL, 0x3e6e77dbUL, 0xaed16a4aUL, 0xd9d65adcUL, 0x40df0b66UL,
+ 0x37d83bf0UL, 0xa9bcae53UL, 0xdebb9ec5UL, 0x47b2cf7fUL, 0x30b5ffe9UL,
+ 0xbdbdf21cUL, 0xcabac28aUL, 0x53b39330UL, 0x24b4a3a6UL, 0xbad03605UL,
+ 0xcdd70693UL, 0x54de5729UL, 0x23d967bfUL, 0xb3667a2eUL, 0xc4614ab8UL,
+ 0x5d681b02UL, 0x2a6f2b94UL, 0xb40bbe37UL, 0xc30c8ea1UL, 0x5a05df1bUL,
+ 0x2d02ef8dUL
+#ifdef BYFOUR
+ },
+ {
+ 0x00000000UL, 0x191b3141UL, 0x32366282UL, 0x2b2d53c3UL, 0x646cc504UL,
+ 0x7d77f445UL, 0x565aa786UL, 0x4f4196c7UL, 0xc8d98a08UL, 0xd1c2bb49UL,
+ 0xfaefe88aUL, 0xe3f4d9cbUL, 0xacb54f0cUL, 0xb5ae7e4dUL, 0x9e832d8eUL,
+ 0x87981ccfUL, 0x4ac21251UL, 0x53d92310UL, 0x78f470d3UL, 0x61ef4192UL,
+ 0x2eaed755UL, 0x37b5e614UL, 0x1c98b5d7UL, 0x05838496UL, 0x821b9859UL,
+ 0x9b00a918UL, 0xb02dfadbUL, 0xa936cb9aUL, 0xe6775d5dUL, 0xff6c6c1cUL,
+ 0xd4413fdfUL, 0xcd5a0e9eUL, 0x958424a2UL, 0x8c9f15e3UL, 0xa7b24620UL,
+ 0xbea97761UL, 0xf1e8e1a6UL, 0xe8f3d0e7UL, 0xc3de8324UL, 0xdac5b265UL,
+ 0x5d5daeaaUL, 0x44469febUL, 0x6f6bcc28UL, 0x7670fd69UL, 0x39316baeUL,
+ 0x202a5aefUL, 0x0b07092cUL, 0x121c386dUL, 0xdf4636f3UL, 0xc65d07b2UL,
+ 0xed705471UL, 0xf46b6530UL, 0xbb2af3f7UL, 0xa231c2b6UL, 0x891c9175UL,
+ 0x9007a034UL, 0x179fbcfbUL, 0x0e848dbaUL, 0x25a9de79UL, 0x3cb2ef38UL,
+ 0x73f379ffUL, 0x6ae848beUL, 0x41c51b7dUL, 0x58de2a3cUL, 0xf0794f05UL,
+ 0xe9627e44UL, 0xc24f2d87UL, 0xdb541cc6UL, 0x94158a01UL, 0x8d0ebb40UL,
+ 0xa623e883UL, 0xbf38d9c2UL, 0x38a0c50dUL, 0x21bbf44cUL, 0x0a96a78fUL,
+ 0x138d96ceUL, 0x5ccc0009UL, 0x45d73148UL, 0x6efa628bUL, 0x77e153caUL,
+ 0xbabb5d54UL, 0xa3a06c15UL, 0x888d3fd6UL, 0x91960e97UL, 0xded79850UL,
+ 0xc7cca911UL, 0xece1fad2UL, 0xf5facb93UL, 0x7262d75cUL, 0x6b79e61dUL,
+ 0x4054b5deUL, 0x594f849fUL, 0x160e1258UL, 0x0f152319UL, 0x243870daUL,
+ 0x3d23419bUL, 0x65fd6ba7UL, 0x7ce65ae6UL, 0x57cb0925UL, 0x4ed03864UL,
+ 0x0191aea3UL, 0x188a9fe2UL, 0x33a7cc21UL, 0x2abcfd60UL, 0xad24e1afUL,
+ 0xb43fd0eeUL, 0x9f12832dUL, 0x8609b26cUL, 0xc94824abUL, 0xd05315eaUL,
+ 0xfb7e4629UL, 0xe2657768UL, 0x2f3f79f6UL, 0x362448b7UL, 0x1d091b74UL,
+ 0x04122a35UL, 0x4b53bcf2UL, 0x52488db3UL, 0x7965de70UL, 0x607eef31UL,
+ 0xe7e6f3feUL, 0xfefdc2bfUL, 0xd5d0917cUL, 0xcccba03dUL, 0x838a36faUL,
+ 0x9a9107bbUL, 0xb1bc5478UL, 0xa8a76539UL, 0x3b83984bUL, 0x2298a90aUL,
+ 0x09b5fac9UL, 0x10aecb88UL, 0x5fef5d4fUL, 0x46f46c0eUL, 0x6dd93fcdUL,
+ 0x74c20e8cUL, 0xf35a1243UL, 0xea412302UL, 0xc16c70c1UL, 0xd8774180UL,
+ 0x9736d747UL, 0x8e2de606UL, 0xa500b5c5UL, 0xbc1b8484UL, 0x71418a1aUL,
+ 0x685abb5bUL, 0x4377e898UL, 0x5a6cd9d9UL, 0x152d4f1eUL, 0x0c367e5fUL,
+ 0x271b2d9cUL, 0x3e001cddUL, 0xb9980012UL, 0xa0833153UL, 0x8bae6290UL,
+ 0x92b553d1UL, 0xddf4c516UL, 0xc4eff457UL, 0xefc2a794UL, 0xf6d996d5UL,
+ 0xae07bce9UL, 0xb71c8da8UL, 0x9c31de6bUL, 0x852aef2aUL, 0xca6b79edUL,
+ 0xd37048acUL, 0xf85d1b6fUL, 0xe1462a2eUL, 0x66de36e1UL, 0x7fc507a0UL,
+ 0x54e85463UL, 0x4df36522UL, 0x02b2f3e5UL, 0x1ba9c2a4UL, 0x30849167UL,
+ 0x299fa026UL, 0xe4c5aeb8UL, 0xfdde9ff9UL, 0xd6f3cc3aUL, 0xcfe8fd7bUL,
+ 0x80a96bbcUL, 0x99b25afdUL, 0xb29f093eUL, 0xab84387fUL, 0x2c1c24b0UL,
+ 0x350715f1UL, 0x1e2a4632UL, 0x07317773UL, 0x4870e1b4UL, 0x516bd0f5UL,
+ 0x7a468336UL, 0x635db277UL, 0xcbfad74eUL, 0xd2e1e60fUL, 0xf9ccb5ccUL,
+ 0xe0d7848dUL, 0xaf96124aUL, 0xb68d230bUL, 0x9da070c8UL, 0x84bb4189UL,
+ 0x03235d46UL, 0x1a386c07UL, 0x31153fc4UL, 0x280e0e85UL, 0x674f9842UL,
+ 0x7e54a903UL, 0x5579fac0UL, 0x4c62cb81UL, 0x8138c51fUL, 0x9823f45eUL,
+ 0xb30ea79dUL, 0xaa1596dcUL, 0xe554001bUL, 0xfc4f315aUL, 0xd7626299UL,
+ 0xce7953d8UL, 0x49e14f17UL, 0x50fa7e56UL, 0x7bd72d95UL, 0x62cc1cd4UL,
+ 0x2d8d8a13UL, 0x3496bb52UL, 0x1fbbe891UL, 0x06a0d9d0UL, 0x5e7ef3ecUL,
+ 0x4765c2adUL, 0x6c48916eUL, 0x7553a02fUL, 0x3a1236e8UL, 0x230907a9UL,
+ 0x0824546aUL, 0x113f652bUL, 0x96a779e4UL, 0x8fbc48a5UL, 0xa4911b66UL,
+ 0xbd8a2a27UL, 0xf2cbbce0UL, 0xebd08da1UL, 0xc0fdde62UL, 0xd9e6ef23UL,
+ 0x14bce1bdUL, 0x0da7d0fcUL, 0x268a833fUL, 0x3f91b27eUL, 0x70d024b9UL,
+ 0x69cb15f8UL, 0x42e6463bUL, 0x5bfd777aUL, 0xdc656bb5UL, 0xc57e5af4UL,
+ 0xee530937UL, 0xf7483876UL, 0xb809aeb1UL, 0xa1129ff0UL, 0x8a3fcc33UL,
+ 0x9324fd72UL
+ },
+ {
+ 0x00000000UL, 0x01c26a37UL, 0x0384d46eUL, 0x0246be59UL, 0x0709a8dcUL,
+ 0x06cbc2ebUL, 0x048d7cb2UL, 0x054f1685UL, 0x0e1351b8UL, 0x0fd13b8fUL,
+ 0x0d9785d6UL, 0x0c55efe1UL, 0x091af964UL, 0x08d89353UL, 0x0a9e2d0aUL,
+ 0x0b5c473dUL, 0x1c26a370UL, 0x1de4c947UL, 0x1fa2771eUL, 0x1e601d29UL,
+ 0x1b2f0bacUL, 0x1aed619bUL, 0x18abdfc2UL, 0x1969b5f5UL, 0x1235f2c8UL,
+ 0x13f798ffUL, 0x11b126a6UL, 0x10734c91UL, 0x153c5a14UL, 0x14fe3023UL,
+ 0x16b88e7aUL, 0x177ae44dUL, 0x384d46e0UL, 0x398f2cd7UL, 0x3bc9928eUL,
+ 0x3a0bf8b9UL, 0x3f44ee3cUL, 0x3e86840bUL, 0x3cc03a52UL, 0x3d025065UL,
+ 0x365e1758UL, 0x379c7d6fUL, 0x35dac336UL, 0x3418a901UL, 0x3157bf84UL,
+ 0x3095d5b3UL, 0x32d36beaUL, 0x331101ddUL, 0x246be590UL, 0x25a98fa7UL,
+ 0x27ef31feUL, 0x262d5bc9UL, 0x23624d4cUL, 0x22a0277bUL, 0x20e69922UL,
+ 0x2124f315UL, 0x2a78b428UL, 0x2bbade1fUL, 0x29fc6046UL, 0x283e0a71UL,
+ 0x2d711cf4UL, 0x2cb376c3UL, 0x2ef5c89aUL, 0x2f37a2adUL, 0x709a8dc0UL,
+ 0x7158e7f7UL, 0x731e59aeUL, 0x72dc3399UL, 0x7793251cUL, 0x76514f2bUL,
+ 0x7417f172UL, 0x75d59b45UL, 0x7e89dc78UL, 0x7f4bb64fUL, 0x7d0d0816UL,
+ 0x7ccf6221UL, 0x798074a4UL, 0x78421e93UL, 0x7a04a0caUL, 0x7bc6cafdUL,
+ 0x6cbc2eb0UL, 0x6d7e4487UL, 0x6f38fadeUL, 0x6efa90e9UL, 0x6bb5866cUL,
+ 0x6a77ec5bUL, 0x68315202UL, 0x69f33835UL, 0x62af7f08UL, 0x636d153fUL,
+ 0x612bab66UL, 0x60e9c151UL, 0x65a6d7d4UL, 0x6464bde3UL, 0x662203baUL,
+ 0x67e0698dUL, 0x48d7cb20UL, 0x4915a117UL, 0x4b531f4eUL, 0x4a917579UL,
+ 0x4fde63fcUL, 0x4e1c09cbUL, 0x4c5ab792UL, 0x4d98dda5UL, 0x46c49a98UL,
+ 0x4706f0afUL, 0x45404ef6UL, 0x448224c1UL, 0x41cd3244UL, 0x400f5873UL,
+ 0x4249e62aUL, 0x438b8c1dUL, 0x54f16850UL, 0x55330267UL, 0x5775bc3eUL,
+ 0x56b7d609UL, 0x53f8c08cUL, 0x523aaabbUL, 0x507c14e2UL, 0x51be7ed5UL,
+ 0x5ae239e8UL, 0x5b2053dfUL, 0x5966ed86UL, 0x58a487b1UL, 0x5deb9134UL,
+ 0x5c29fb03UL, 0x5e6f455aUL, 0x5fad2f6dUL, 0xe1351b80UL, 0xe0f771b7UL,
+ 0xe2b1cfeeUL, 0xe373a5d9UL, 0xe63cb35cUL, 0xe7fed96bUL, 0xe5b86732UL,
+ 0xe47a0d05UL, 0xef264a38UL, 0xeee4200fUL, 0xeca29e56UL, 0xed60f461UL,
+ 0xe82fe2e4UL, 0xe9ed88d3UL, 0xebab368aUL, 0xea695cbdUL, 0xfd13b8f0UL,
+ 0xfcd1d2c7UL, 0xfe976c9eUL, 0xff5506a9UL, 0xfa1a102cUL, 0xfbd87a1bUL,
+ 0xf99ec442UL, 0xf85cae75UL, 0xf300e948UL, 0xf2c2837fUL, 0xf0843d26UL,
+ 0xf1465711UL, 0xf4094194UL, 0xf5cb2ba3UL, 0xf78d95faUL, 0xf64fffcdUL,
+ 0xd9785d60UL, 0xd8ba3757UL, 0xdafc890eUL, 0xdb3ee339UL, 0xde71f5bcUL,
+ 0xdfb39f8bUL, 0xddf521d2UL, 0xdc374be5UL, 0xd76b0cd8UL, 0xd6a966efUL,
+ 0xd4efd8b6UL, 0xd52db281UL, 0xd062a404UL, 0xd1a0ce33UL, 0xd3e6706aUL,
+ 0xd2241a5dUL, 0xc55efe10UL, 0xc49c9427UL, 0xc6da2a7eUL, 0xc7184049UL,
+ 0xc25756ccUL, 0xc3953cfbUL, 0xc1d382a2UL, 0xc011e895UL, 0xcb4dafa8UL,
+ 0xca8fc59fUL, 0xc8c97bc6UL, 0xc90b11f1UL, 0xcc440774UL, 0xcd866d43UL,
+ 0xcfc0d31aUL, 0xce02b92dUL, 0x91af9640UL, 0x906dfc77UL, 0x922b422eUL,
+ 0x93e92819UL, 0x96a63e9cUL, 0x976454abUL, 0x9522eaf2UL, 0x94e080c5UL,
+ 0x9fbcc7f8UL, 0x9e7eadcfUL, 0x9c381396UL, 0x9dfa79a1UL, 0x98b56f24UL,
+ 0x99770513UL, 0x9b31bb4aUL, 0x9af3d17dUL, 0x8d893530UL, 0x8c4b5f07UL,
+ 0x8e0de15eUL, 0x8fcf8b69UL, 0x8a809decUL, 0x8b42f7dbUL, 0x89044982UL,
+ 0x88c623b5UL, 0x839a6488UL, 0x82580ebfUL, 0x801eb0e6UL, 0x81dcdad1UL,
+ 0x8493cc54UL, 0x8551a663UL, 0x8717183aUL, 0x86d5720dUL, 0xa9e2d0a0UL,
+ 0xa820ba97UL, 0xaa6604ceUL, 0xaba46ef9UL, 0xaeeb787cUL, 0xaf29124bUL,
+ 0xad6fac12UL, 0xacadc625UL, 0xa7f18118UL, 0xa633eb2fUL, 0xa4755576UL,
+ 0xa5b73f41UL, 0xa0f829c4UL, 0xa13a43f3UL, 0xa37cfdaaUL, 0xa2be979dUL,
+ 0xb5c473d0UL, 0xb40619e7UL, 0xb640a7beUL, 0xb782cd89UL, 0xb2cddb0cUL,
+ 0xb30fb13bUL, 0xb1490f62UL, 0xb08b6555UL, 0xbbd72268UL, 0xba15485fUL,
+ 0xb853f606UL, 0xb9919c31UL, 0xbcde8ab4UL, 0xbd1ce083UL, 0xbf5a5edaUL,
+ 0xbe9834edUL
+ },
+ {
+ 0x00000000UL, 0xb8bc6765UL, 0xaa09c88bUL, 0x12b5afeeUL, 0x8f629757UL,
+ 0x37def032UL, 0x256b5fdcUL, 0x9dd738b9UL, 0xc5b428efUL, 0x7d084f8aUL,
+ 0x6fbde064UL, 0xd7018701UL, 0x4ad6bfb8UL, 0xf26ad8ddUL, 0xe0df7733UL,
+ 0x58631056UL, 0x5019579fUL, 0xe8a530faUL, 0xfa109f14UL, 0x42acf871UL,
+ 0xdf7bc0c8UL, 0x67c7a7adUL, 0x75720843UL, 0xcdce6f26UL, 0x95ad7f70UL,
+ 0x2d111815UL, 0x3fa4b7fbUL, 0x8718d09eUL, 0x1acfe827UL, 0xa2738f42UL,
+ 0xb0c620acUL, 0x087a47c9UL, 0xa032af3eUL, 0x188ec85bUL, 0x0a3b67b5UL,
+ 0xb28700d0UL, 0x2f503869UL, 0x97ec5f0cUL, 0x8559f0e2UL, 0x3de59787UL,
+ 0x658687d1UL, 0xdd3ae0b4UL, 0xcf8f4f5aUL, 0x7733283fUL, 0xeae41086UL,
+ 0x525877e3UL, 0x40edd80dUL, 0xf851bf68UL, 0xf02bf8a1UL, 0x48979fc4UL,
+ 0x5a22302aUL, 0xe29e574fUL, 0x7f496ff6UL, 0xc7f50893UL, 0xd540a77dUL,
+ 0x6dfcc018UL, 0x359fd04eUL, 0x8d23b72bUL, 0x9f9618c5UL, 0x272a7fa0UL,
+ 0xbafd4719UL, 0x0241207cUL, 0x10f48f92UL, 0xa848e8f7UL, 0x9b14583dUL,
+ 0x23a83f58UL, 0x311d90b6UL, 0x89a1f7d3UL, 0x1476cf6aUL, 0xaccaa80fUL,
+ 0xbe7f07e1UL, 0x06c36084UL, 0x5ea070d2UL, 0xe61c17b7UL, 0xf4a9b859UL,
+ 0x4c15df3cUL, 0xd1c2e785UL, 0x697e80e0UL, 0x7bcb2f0eUL, 0xc377486bUL,
+ 0xcb0d0fa2UL, 0x73b168c7UL, 0x6104c729UL, 0xd9b8a04cUL, 0x446f98f5UL,
+ 0xfcd3ff90UL, 0xee66507eUL, 0x56da371bUL, 0x0eb9274dUL, 0xb6054028UL,
+ 0xa4b0efc6UL, 0x1c0c88a3UL, 0x81dbb01aUL, 0x3967d77fUL, 0x2bd27891UL,
+ 0x936e1ff4UL, 0x3b26f703UL, 0x839a9066UL, 0x912f3f88UL, 0x299358edUL,
+ 0xb4446054UL, 0x0cf80731UL, 0x1e4da8dfUL, 0xa6f1cfbaUL, 0xfe92dfecUL,
+ 0x462eb889UL, 0x549b1767UL, 0xec277002UL, 0x71f048bbUL, 0xc94c2fdeUL,
+ 0xdbf98030UL, 0x6345e755UL, 0x6b3fa09cUL, 0xd383c7f9UL, 0xc1366817UL,
+ 0x798a0f72UL, 0xe45d37cbUL, 0x5ce150aeUL, 0x4e54ff40UL, 0xf6e89825UL,
+ 0xae8b8873UL, 0x1637ef16UL, 0x048240f8UL, 0xbc3e279dUL, 0x21e91f24UL,
+ 0x99557841UL, 0x8be0d7afUL, 0x335cb0caUL, 0xed59b63bUL, 0x55e5d15eUL,
+ 0x47507eb0UL, 0xffec19d5UL, 0x623b216cUL, 0xda874609UL, 0xc832e9e7UL,
+ 0x708e8e82UL, 0x28ed9ed4UL, 0x9051f9b1UL, 0x82e4565fUL, 0x3a58313aUL,
+ 0xa78f0983UL, 0x1f336ee6UL, 0x0d86c108UL, 0xb53aa66dUL, 0xbd40e1a4UL,
+ 0x05fc86c1UL, 0x1749292fUL, 0xaff54e4aUL, 0x322276f3UL, 0x8a9e1196UL,
+ 0x982bbe78UL, 0x2097d91dUL, 0x78f4c94bUL, 0xc048ae2eUL, 0xd2fd01c0UL,
+ 0x6a4166a5UL, 0xf7965e1cUL, 0x4f2a3979UL, 0x5d9f9697UL, 0xe523f1f2UL,
+ 0x4d6b1905UL, 0xf5d77e60UL, 0xe762d18eUL, 0x5fdeb6ebUL, 0xc2098e52UL,
+ 0x7ab5e937UL, 0x680046d9UL, 0xd0bc21bcUL, 0x88df31eaUL, 0x3063568fUL,
+ 0x22d6f961UL, 0x9a6a9e04UL, 0x07bda6bdUL, 0xbf01c1d8UL, 0xadb46e36UL,
+ 0x15080953UL, 0x1d724e9aUL, 0xa5ce29ffUL, 0xb77b8611UL, 0x0fc7e174UL,
+ 0x9210d9cdUL, 0x2aacbea8UL, 0x38191146UL, 0x80a57623UL, 0xd8c66675UL,
+ 0x607a0110UL, 0x72cfaefeUL, 0xca73c99bUL, 0x57a4f122UL, 0xef189647UL,
+ 0xfdad39a9UL, 0x45115eccUL, 0x764dee06UL, 0xcef18963UL, 0xdc44268dUL,
+ 0x64f841e8UL, 0xf92f7951UL, 0x41931e34UL, 0x5326b1daUL, 0xeb9ad6bfUL,
+ 0xb3f9c6e9UL, 0x0b45a18cUL, 0x19f00e62UL, 0xa14c6907UL, 0x3c9b51beUL,
+ 0x842736dbUL, 0x96929935UL, 0x2e2efe50UL, 0x2654b999UL, 0x9ee8defcUL,
+ 0x8c5d7112UL, 0x34e11677UL, 0xa9362eceUL, 0x118a49abUL, 0x033fe645UL,
+ 0xbb838120UL, 0xe3e09176UL, 0x5b5cf613UL, 0x49e959fdUL, 0xf1553e98UL,
+ 0x6c820621UL, 0xd43e6144UL, 0xc68bceaaUL, 0x7e37a9cfUL, 0xd67f4138UL,
+ 0x6ec3265dUL, 0x7c7689b3UL, 0xc4caeed6UL, 0x591dd66fUL, 0xe1a1b10aUL,
+ 0xf3141ee4UL, 0x4ba87981UL, 0x13cb69d7UL, 0xab770eb2UL, 0xb9c2a15cUL,
+ 0x017ec639UL, 0x9ca9fe80UL, 0x241599e5UL, 0x36a0360bUL, 0x8e1c516eUL,
+ 0x866616a7UL, 0x3eda71c2UL, 0x2c6fde2cUL, 0x94d3b949UL, 0x090481f0UL,
+ 0xb1b8e695UL, 0xa30d497bUL, 0x1bb12e1eUL, 0x43d23e48UL, 0xfb6e592dUL,
+ 0xe9dbf6c3UL, 0x516791a6UL, 0xccb0a91fUL, 0x740cce7aUL, 0x66b96194UL,
+ 0xde0506f1UL
+ },
+ {
+ 0x00000000UL, 0x96300777UL, 0x2c610eeeUL, 0xba510999UL, 0x19c46d07UL,
+ 0x8ff46a70UL, 0x35a563e9UL, 0xa395649eUL, 0x3288db0eUL, 0xa4b8dc79UL,
+ 0x1ee9d5e0UL, 0x88d9d297UL, 0x2b4cb609UL, 0xbd7cb17eUL, 0x072db8e7UL,
+ 0x911dbf90UL, 0x6410b71dUL, 0xf220b06aUL, 0x4871b9f3UL, 0xde41be84UL,
+ 0x7dd4da1aUL, 0xebe4dd6dUL, 0x51b5d4f4UL, 0xc785d383UL, 0x56986c13UL,
+ 0xc0a86b64UL, 0x7af962fdUL, 0xecc9658aUL, 0x4f5c0114UL, 0xd96c0663UL,
+ 0x633d0ffaUL, 0xf50d088dUL, 0xc8206e3bUL, 0x5e10694cUL, 0xe44160d5UL,
+ 0x727167a2UL, 0xd1e4033cUL, 0x47d4044bUL, 0xfd850dd2UL, 0x6bb50aa5UL,
+ 0xfaa8b535UL, 0x6c98b242UL, 0xd6c9bbdbUL, 0x40f9bcacUL, 0xe36cd832UL,
+ 0x755cdf45UL, 0xcf0dd6dcUL, 0x593dd1abUL, 0xac30d926UL, 0x3a00de51UL,
+ 0x8051d7c8UL, 0x1661d0bfUL, 0xb5f4b421UL, 0x23c4b356UL, 0x9995bacfUL,
+ 0x0fa5bdb8UL, 0x9eb80228UL, 0x0888055fUL, 0xb2d90cc6UL, 0x24e90bb1UL,
+ 0x877c6f2fUL, 0x114c6858UL, 0xab1d61c1UL, 0x3d2d66b6UL, 0x9041dc76UL,
+ 0x0671db01UL, 0xbc20d298UL, 0x2a10d5efUL, 0x8985b171UL, 0x1fb5b606UL,
+ 0xa5e4bf9fUL, 0x33d4b8e8UL, 0xa2c90778UL, 0x34f9000fUL, 0x8ea80996UL,
+ 0x18980ee1UL, 0xbb0d6a7fUL, 0x2d3d6d08UL, 0x976c6491UL, 0x015c63e6UL,
+ 0xf4516b6bUL, 0x62616c1cUL, 0xd8306585UL, 0x4e0062f2UL, 0xed95066cUL,
+ 0x7ba5011bUL, 0xc1f40882UL, 0x57c40ff5UL, 0xc6d9b065UL, 0x50e9b712UL,
+ 0xeab8be8bUL, 0x7c88b9fcUL, 0xdf1ddd62UL, 0x492dda15UL, 0xf37cd38cUL,
+ 0x654cd4fbUL, 0x5861b24dUL, 0xce51b53aUL, 0x7400bca3UL, 0xe230bbd4UL,
+ 0x41a5df4aUL, 0xd795d83dUL, 0x6dc4d1a4UL, 0xfbf4d6d3UL, 0x6ae96943UL,
+ 0xfcd96e34UL, 0x468867adUL, 0xd0b860daUL, 0x732d0444UL, 0xe51d0333UL,
+ 0x5f4c0aaaUL, 0xc97c0dddUL, 0x3c710550UL, 0xaa410227UL, 0x10100bbeUL,
+ 0x86200cc9UL, 0x25b56857UL, 0xb3856f20UL, 0x09d466b9UL, 0x9fe461ceUL,
+ 0x0ef9de5eUL, 0x98c9d929UL, 0x2298d0b0UL, 0xb4a8d7c7UL, 0x173db359UL,
+ 0x810db42eUL, 0x3b5cbdb7UL, 0xad6cbac0UL, 0x2083b8edUL, 0xb6b3bf9aUL,
+ 0x0ce2b603UL, 0x9ad2b174UL, 0x3947d5eaUL, 0xaf77d29dUL, 0x1526db04UL,
+ 0x8316dc73UL, 0x120b63e3UL, 0x843b6494UL, 0x3e6a6d0dUL, 0xa85a6a7aUL,
+ 0x0bcf0ee4UL, 0x9dff0993UL, 0x27ae000aUL, 0xb19e077dUL, 0x44930ff0UL,
+ 0xd2a30887UL, 0x68f2011eUL, 0xfec20669UL, 0x5d5762f7UL, 0xcb676580UL,
+ 0x71366c19UL, 0xe7066b6eUL, 0x761bd4feUL, 0xe02bd389UL, 0x5a7ada10UL,
+ 0xcc4add67UL, 0x6fdfb9f9UL, 0xf9efbe8eUL, 0x43beb717UL, 0xd58eb060UL,
+ 0xe8a3d6d6UL, 0x7e93d1a1UL, 0xc4c2d838UL, 0x52f2df4fUL, 0xf167bbd1UL,
+ 0x6757bca6UL, 0xdd06b53fUL, 0x4b36b248UL, 0xda2b0dd8UL, 0x4c1b0aafUL,
+ 0xf64a0336UL, 0x607a0441UL, 0xc3ef60dfUL, 0x55df67a8UL, 0xef8e6e31UL,
+ 0x79be6946UL, 0x8cb361cbUL, 0x1a8366bcUL, 0xa0d26f25UL, 0x36e26852UL,
+ 0x95770cccUL, 0x03470bbbUL, 0xb9160222UL, 0x2f260555UL, 0xbe3bbac5UL,
+ 0x280bbdb2UL, 0x925ab42bUL, 0x046ab35cUL, 0xa7ffd7c2UL, 0x31cfd0b5UL,
+ 0x8b9ed92cUL, 0x1daede5bUL, 0xb0c2649bUL, 0x26f263ecUL, 0x9ca36a75UL,
+ 0x0a936d02UL, 0xa906099cUL, 0x3f360eebUL, 0x85670772UL, 0x13570005UL,
+ 0x824abf95UL, 0x147ab8e2UL, 0xae2bb17bUL, 0x381bb60cUL, 0x9b8ed292UL,
+ 0x0dbed5e5UL, 0xb7efdc7cUL, 0x21dfdb0bUL, 0xd4d2d386UL, 0x42e2d4f1UL,
+ 0xf8b3dd68UL, 0x6e83da1fUL, 0xcd16be81UL, 0x5b26b9f6UL, 0xe177b06fUL,
+ 0x7747b718UL, 0xe65a0888UL, 0x706a0fffUL, 0xca3b0666UL, 0x5c0b0111UL,
+ 0xff9e658fUL, 0x69ae62f8UL, 0xd3ff6b61UL, 0x45cf6c16UL, 0x78e20aa0UL,
+ 0xeed20dd7UL, 0x5483044eUL, 0xc2b30339UL, 0x612667a7UL, 0xf71660d0UL,
+ 0x4d476949UL, 0xdb776e3eUL, 0x4a6ad1aeUL, 0xdc5ad6d9UL, 0x660bdf40UL,
+ 0xf03bd837UL, 0x53aebca9UL, 0xc59ebbdeUL, 0x7fcfb247UL, 0xe9ffb530UL,
+ 0x1cf2bdbdUL, 0x8ac2bacaUL, 0x3093b353UL, 0xa6a3b424UL, 0x0536d0baUL,
+ 0x9306d7cdUL, 0x2957de54UL, 0xbf67d923UL, 0x2e7a66b3UL, 0xb84a61c4UL,
+ 0x021b685dUL, 0x942b6f2aUL, 0x37be0bb4UL, 0xa18e0cc3UL, 0x1bdf055aUL,
+ 0x8def022dUL
+ },
+ {
+ 0x00000000UL, 0x41311b19UL, 0x82623632UL, 0xc3532d2bUL, 0x04c56c64UL,
+ 0x45f4777dUL, 0x86a75a56UL, 0xc796414fUL, 0x088ad9c8UL, 0x49bbc2d1UL,
+ 0x8ae8effaUL, 0xcbd9f4e3UL, 0x0c4fb5acUL, 0x4d7eaeb5UL, 0x8e2d839eUL,
+ 0xcf1c9887UL, 0x5112c24aUL, 0x1023d953UL, 0xd370f478UL, 0x9241ef61UL,
+ 0x55d7ae2eUL, 0x14e6b537UL, 0xd7b5981cUL, 0x96848305UL, 0x59981b82UL,
+ 0x18a9009bUL, 0xdbfa2db0UL, 0x9acb36a9UL, 0x5d5d77e6UL, 0x1c6c6cffUL,
+ 0xdf3f41d4UL, 0x9e0e5acdUL, 0xa2248495UL, 0xe3159f8cUL, 0x2046b2a7UL,
+ 0x6177a9beUL, 0xa6e1e8f1UL, 0xe7d0f3e8UL, 0x2483dec3UL, 0x65b2c5daUL,
+ 0xaaae5d5dUL, 0xeb9f4644UL, 0x28cc6b6fUL, 0x69fd7076UL, 0xae6b3139UL,
+ 0xef5a2a20UL, 0x2c09070bUL, 0x6d381c12UL, 0xf33646dfUL, 0xb2075dc6UL,
+ 0x715470edUL, 0x30656bf4UL, 0xf7f32abbUL, 0xb6c231a2UL, 0x75911c89UL,
+ 0x34a00790UL, 0xfbbc9f17UL, 0xba8d840eUL, 0x79dea925UL, 0x38efb23cUL,
+ 0xff79f373UL, 0xbe48e86aUL, 0x7d1bc541UL, 0x3c2ade58UL, 0x054f79f0UL,
+ 0x447e62e9UL, 0x872d4fc2UL, 0xc61c54dbUL, 0x018a1594UL, 0x40bb0e8dUL,
+ 0x83e823a6UL, 0xc2d938bfUL, 0x0dc5a038UL, 0x4cf4bb21UL, 0x8fa7960aUL,
+ 0xce968d13UL, 0x0900cc5cUL, 0x4831d745UL, 0x8b62fa6eUL, 0xca53e177UL,
+ 0x545dbbbaUL, 0x156ca0a3UL, 0xd63f8d88UL, 0x970e9691UL, 0x5098d7deUL,
+ 0x11a9ccc7UL, 0xd2fae1ecUL, 0x93cbfaf5UL, 0x5cd76272UL, 0x1de6796bUL,
+ 0xdeb55440UL, 0x9f844f59UL, 0x58120e16UL, 0x1923150fUL, 0xda703824UL,
+ 0x9b41233dUL, 0xa76bfd65UL, 0xe65ae67cUL, 0x2509cb57UL, 0x6438d04eUL,
+ 0xa3ae9101UL, 0xe29f8a18UL, 0x21cca733UL, 0x60fdbc2aUL, 0xafe124adUL,
+ 0xeed03fb4UL, 0x2d83129fUL, 0x6cb20986UL, 0xab2448c9UL, 0xea1553d0UL,
+ 0x29467efbUL, 0x687765e2UL, 0xf6793f2fUL, 0xb7482436UL, 0x741b091dUL,
+ 0x352a1204UL, 0xf2bc534bUL, 0xb38d4852UL, 0x70de6579UL, 0x31ef7e60UL,
+ 0xfef3e6e7UL, 0xbfc2fdfeUL, 0x7c91d0d5UL, 0x3da0cbccUL, 0xfa368a83UL,
+ 0xbb07919aUL, 0x7854bcb1UL, 0x3965a7a8UL, 0x4b98833bUL, 0x0aa99822UL,
+ 0xc9fab509UL, 0x88cbae10UL, 0x4f5def5fUL, 0x0e6cf446UL, 0xcd3fd96dUL,
+ 0x8c0ec274UL, 0x43125af3UL, 0x022341eaUL, 0xc1706cc1UL, 0x804177d8UL,
+ 0x47d73697UL, 0x06e62d8eUL, 0xc5b500a5UL, 0x84841bbcUL, 0x1a8a4171UL,
+ 0x5bbb5a68UL, 0x98e87743UL, 0xd9d96c5aUL, 0x1e4f2d15UL, 0x5f7e360cUL,
+ 0x9c2d1b27UL, 0xdd1c003eUL, 0x120098b9UL, 0x533183a0UL, 0x9062ae8bUL,
+ 0xd153b592UL, 0x16c5f4ddUL, 0x57f4efc4UL, 0x94a7c2efUL, 0xd596d9f6UL,
+ 0xe9bc07aeUL, 0xa88d1cb7UL, 0x6bde319cUL, 0x2aef2a85UL, 0xed796bcaUL,
+ 0xac4870d3UL, 0x6f1b5df8UL, 0x2e2a46e1UL, 0xe136de66UL, 0xa007c57fUL,
+ 0x6354e854UL, 0x2265f34dUL, 0xe5f3b202UL, 0xa4c2a91bUL, 0x67918430UL,
+ 0x26a09f29UL, 0xb8aec5e4UL, 0xf99fdefdUL, 0x3accf3d6UL, 0x7bfde8cfUL,
+ 0xbc6ba980UL, 0xfd5ab299UL, 0x3e099fb2UL, 0x7f3884abUL, 0xb0241c2cUL,
+ 0xf1150735UL, 0x32462a1eUL, 0x73773107UL, 0xb4e17048UL, 0xf5d06b51UL,
+ 0x3683467aUL, 0x77b25d63UL, 0x4ed7facbUL, 0x0fe6e1d2UL, 0xccb5ccf9UL,
+ 0x8d84d7e0UL, 0x4a1296afUL, 0x0b238db6UL, 0xc870a09dUL, 0x8941bb84UL,
+ 0x465d2303UL, 0x076c381aUL, 0xc43f1531UL, 0x850e0e28UL, 0x42984f67UL,
+ 0x03a9547eUL, 0xc0fa7955UL, 0x81cb624cUL, 0x1fc53881UL, 0x5ef42398UL,
+ 0x9da70eb3UL, 0xdc9615aaUL, 0x1b0054e5UL, 0x5a314ffcUL, 0x996262d7UL,
+ 0xd85379ceUL, 0x174fe149UL, 0x567efa50UL, 0x952dd77bUL, 0xd41ccc62UL,
+ 0x138a8d2dUL, 0x52bb9634UL, 0x91e8bb1fUL, 0xd0d9a006UL, 0xecf37e5eUL,
+ 0xadc26547UL, 0x6e91486cUL, 0x2fa05375UL, 0xe836123aUL, 0xa9070923UL,
+ 0x6a542408UL, 0x2b653f11UL, 0xe479a796UL, 0xa548bc8fUL, 0x661b91a4UL,
+ 0x272a8abdUL, 0xe0bccbf2UL, 0xa18dd0ebUL, 0x62defdc0UL, 0x23efe6d9UL,
+ 0xbde1bc14UL, 0xfcd0a70dUL, 0x3f838a26UL, 0x7eb2913fUL, 0xb924d070UL,
+ 0xf815cb69UL, 0x3b46e642UL, 0x7a77fd5bUL, 0xb56b65dcUL, 0xf45a7ec5UL,
+ 0x370953eeUL, 0x763848f7UL, 0xb1ae09b8UL, 0xf09f12a1UL, 0x33cc3f8aUL,
+ 0x72fd2493UL
+ },
+ {
+ 0x00000000UL, 0x376ac201UL, 0x6ed48403UL, 0x59be4602UL, 0xdca80907UL,
+ 0xebc2cb06UL, 0xb27c8d04UL, 0x85164f05UL, 0xb851130eUL, 0x8f3bd10fUL,
+ 0xd685970dUL, 0xe1ef550cUL, 0x64f91a09UL, 0x5393d808UL, 0x0a2d9e0aUL,
+ 0x3d475c0bUL, 0x70a3261cUL, 0x47c9e41dUL, 0x1e77a21fUL, 0x291d601eUL,
+ 0xac0b2f1bUL, 0x9b61ed1aUL, 0xc2dfab18UL, 0xf5b56919UL, 0xc8f23512UL,
+ 0xff98f713UL, 0xa626b111UL, 0x914c7310UL, 0x145a3c15UL, 0x2330fe14UL,
+ 0x7a8eb816UL, 0x4de47a17UL, 0xe0464d38UL, 0xd72c8f39UL, 0x8e92c93bUL,
+ 0xb9f80b3aUL, 0x3cee443fUL, 0x0b84863eUL, 0x523ac03cUL, 0x6550023dUL,
+ 0x58175e36UL, 0x6f7d9c37UL, 0x36c3da35UL, 0x01a91834UL, 0x84bf5731UL,
+ 0xb3d59530UL, 0xea6bd332UL, 0xdd011133UL, 0x90e56b24UL, 0xa78fa925UL,
+ 0xfe31ef27UL, 0xc95b2d26UL, 0x4c4d6223UL, 0x7b27a022UL, 0x2299e620UL,
+ 0x15f32421UL, 0x28b4782aUL, 0x1fdeba2bUL, 0x4660fc29UL, 0x710a3e28UL,
+ 0xf41c712dUL, 0xc376b32cUL, 0x9ac8f52eUL, 0xada2372fUL, 0xc08d9a70UL,
+ 0xf7e75871UL, 0xae591e73UL, 0x9933dc72UL, 0x1c259377UL, 0x2b4f5176UL,
+ 0x72f11774UL, 0x459bd575UL, 0x78dc897eUL, 0x4fb64b7fUL, 0x16080d7dUL,
+ 0x2162cf7cUL, 0xa4748079UL, 0x931e4278UL, 0xcaa0047aUL, 0xfdcac67bUL,
+ 0xb02ebc6cUL, 0x87447e6dUL, 0xdefa386fUL, 0xe990fa6eUL, 0x6c86b56bUL,
+ 0x5bec776aUL, 0x02523168UL, 0x3538f369UL, 0x087faf62UL, 0x3f156d63UL,
+ 0x66ab2b61UL, 0x51c1e960UL, 0xd4d7a665UL, 0xe3bd6464UL, 0xba032266UL,
+ 0x8d69e067UL, 0x20cbd748UL, 0x17a11549UL, 0x4e1f534bUL, 0x7975914aUL,
+ 0xfc63de4fUL, 0xcb091c4eUL, 0x92b75a4cUL, 0xa5dd984dUL, 0x989ac446UL,
+ 0xaff00647UL, 0xf64e4045UL, 0xc1248244UL, 0x4432cd41UL, 0x73580f40UL,
+ 0x2ae64942UL, 0x1d8c8b43UL, 0x5068f154UL, 0x67023355UL, 0x3ebc7557UL,
+ 0x09d6b756UL, 0x8cc0f853UL, 0xbbaa3a52UL, 0xe2147c50UL, 0xd57ebe51UL,
+ 0xe839e25aUL, 0xdf53205bUL, 0x86ed6659UL, 0xb187a458UL, 0x3491eb5dUL,
+ 0x03fb295cUL, 0x5a456f5eUL, 0x6d2fad5fUL, 0x801b35e1UL, 0xb771f7e0UL,
+ 0xeecfb1e2UL, 0xd9a573e3UL, 0x5cb33ce6UL, 0x6bd9fee7UL, 0x3267b8e5UL,
+ 0x050d7ae4UL, 0x384a26efUL, 0x0f20e4eeUL, 0x569ea2ecUL, 0x61f460edUL,
+ 0xe4e22fe8UL, 0xd388ede9UL, 0x8a36abebUL, 0xbd5c69eaUL, 0xf0b813fdUL,
+ 0xc7d2d1fcUL, 0x9e6c97feUL, 0xa90655ffUL, 0x2c101afaUL, 0x1b7ad8fbUL,
+ 0x42c49ef9UL, 0x75ae5cf8UL, 0x48e900f3UL, 0x7f83c2f2UL, 0x263d84f0UL,
+ 0x115746f1UL, 0x944109f4UL, 0xa32bcbf5UL, 0xfa958df7UL, 0xcdff4ff6UL,
+ 0x605d78d9UL, 0x5737bad8UL, 0x0e89fcdaUL, 0x39e33edbUL, 0xbcf571deUL,
+ 0x8b9fb3dfUL, 0xd221f5ddUL, 0xe54b37dcUL, 0xd80c6bd7UL, 0xef66a9d6UL,
+ 0xb6d8efd4UL, 0x81b22dd5UL, 0x04a462d0UL, 0x33cea0d1UL, 0x6a70e6d3UL,
+ 0x5d1a24d2UL, 0x10fe5ec5UL, 0x27949cc4UL, 0x7e2adac6UL, 0x494018c7UL,
+ 0xcc5657c2UL, 0xfb3c95c3UL, 0xa282d3c1UL, 0x95e811c0UL, 0xa8af4dcbUL,
+ 0x9fc58fcaUL, 0xc67bc9c8UL, 0xf1110bc9UL, 0x740744ccUL, 0x436d86cdUL,
+ 0x1ad3c0cfUL, 0x2db902ceUL, 0x4096af91UL, 0x77fc6d90UL, 0x2e422b92UL,
+ 0x1928e993UL, 0x9c3ea696UL, 0xab546497UL, 0xf2ea2295UL, 0xc580e094UL,
+ 0xf8c7bc9fUL, 0xcfad7e9eUL, 0x9613389cUL, 0xa179fa9dUL, 0x246fb598UL,
+ 0x13057799UL, 0x4abb319bUL, 0x7dd1f39aUL, 0x3035898dUL, 0x075f4b8cUL,
+ 0x5ee10d8eUL, 0x698bcf8fUL, 0xec9d808aUL, 0xdbf7428bUL, 0x82490489UL,
+ 0xb523c688UL, 0x88649a83UL, 0xbf0e5882UL, 0xe6b01e80UL, 0xd1dadc81UL,
+ 0x54cc9384UL, 0x63a65185UL, 0x3a181787UL, 0x0d72d586UL, 0xa0d0e2a9UL,
+ 0x97ba20a8UL, 0xce0466aaUL, 0xf96ea4abUL, 0x7c78ebaeUL, 0x4b1229afUL,
+ 0x12ac6fadUL, 0x25c6adacUL, 0x1881f1a7UL, 0x2feb33a6UL, 0x765575a4UL,
+ 0x413fb7a5UL, 0xc429f8a0UL, 0xf3433aa1UL, 0xaafd7ca3UL, 0x9d97bea2UL,
+ 0xd073c4b5UL, 0xe71906b4UL, 0xbea740b6UL, 0x89cd82b7UL, 0x0cdbcdb2UL,
+ 0x3bb10fb3UL, 0x620f49b1UL, 0x55658bb0UL, 0x6822d7bbUL, 0x5f4815baUL,
+ 0x06f653b8UL, 0x319c91b9UL, 0xb48adebcUL, 0x83e01cbdUL, 0xda5e5abfUL,
+ 0xed3498beUL
+ },
+ {
+ 0x00000000UL, 0x6567bcb8UL, 0x8bc809aaUL, 0xeeafb512UL, 0x5797628fUL,
+ 0x32f0de37UL, 0xdc5f6b25UL, 0xb938d79dUL, 0xef28b4c5UL, 0x8a4f087dUL,
+ 0x64e0bd6fUL, 0x018701d7UL, 0xb8bfd64aUL, 0xddd86af2UL, 0x3377dfe0UL,
+ 0x56106358UL, 0x9f571950UL, 0xfa30a5e8UL, 0x149f10faUL, 0x71f8ac42UL,
+ 0xc8c07bdfUL, 0xada7c767UL, 0x43087275UL, 0x266fcecdUL, 0x707fad95UL,
+ 0x1518112dUL, 0xfbb7a43fUL, 0x9ed01887UL, 0x27e8cf1aUL, 0x428f73a2UL,
+ 0xac20c6b0UL, 0xc9477a08UL, 0x3eaf32a0UL, 0x5bc88e18UL, 0xb5673b0aUL,
+ 0xd00087b2UL, 0x6938502fUL, 0x0c5fec97UL, 0xe2f05985UL, 0x8797e53dUL,
+ 0xd1878665UL, 0xb4e03addUL, 0x5a4f8fcfUL, 0x3f283377UL, 0x8610e4eaUL,
+ 0xe3775852UL, 0x0dd8ed40UL, 0x68bf51f8UL, 0xa1f82bf0UL, 0xc49f9748UL,
+ 0x2a30225aUL, 0x4f579ee2UL, 0xf66f497fUL, 0x9308f5c7UL, 0x7da740d5UL,
+ 0x18c0fc6dUL, 0x4ed09f35UL, 0x2bb7238dUL, 0xc518969fUL, 0xa07f2a27UL,
+ 0x1947fdbaUL, 0x7c204102UL, 0x928ff410UL, 0xf7e848a8UL, 0x3d58149bUL,
+ 0x583fa823UL, 0xb6901d31UL, 0xd3f7a189UL, 0x6acf7614UL, 0x0fa8caacUL,
+ 0xe1077fbeUL, 0x8460c306UL, 0xd270a05eUL, 0xb7171ce6UL, 0x59b8a9f4UL,
+ 0x3cdf154cUL, 0x85e7c2d1UL, 0xe0807e69UL, 0x0e2fcb7bUL, 0x6b4877c3UL,
+ 0xa20f0dcbUL, 0xc768b173UL, 0x29c70461UL, 0x4ca0b8d9UL, 0xf5986f44UL,
+ 0x90ffd3fcUL, 0x7e5066eeUL, 0x1b37da56UL, 0x4d27b90eUL, 0x284005b6UL,
+ 0xc6efb0a4UL, 0xa3880c1cUL, 0x1ab0db81UL, 0x7fd76739UL, 0x9178d22bUL,
+ 0xf41f6e93UL, 0x03f7263bUL, 0x66909a83UL, 0x883f2f91UL, 0xed589329UL,
+ 0x546044b4UL, 0x3107f80cUL, 0xdfa84d1eUL, 0xbacff1a6UL, 0xecdf92feUL,
+ 0x89b82e46UL, 0x67179b54UL, 0x027027ecUL, 0xbb48f071UL, 0xde2f4cc9UL,
+ 0x3080f9dbUL, 0x55e74563UL, 0x9ca03f6bUL, 0xf9c783d3UL, 0x176836c1UL,
+ 0x720f8a79UL, 0xcb375de4UL, 0xae50e15cUL, 0x40ff544eUL, 0x2598e8f6UL,
+ 0x73888baeUL, 0x16ef3716UL, 0xf8408204UL, 0x9d273ebcUL, 0x241fe921UL,
+ 0x41785599UL, 0xafd7e08bUL, 0xcab05c33UL, 0x3bb659edUL, 0x5ed1e555UL,
+ 0xb07e5047UL, 0xd519ecffUL, 0x6c213b62UL, 0x094687daUL, 0xe7e932c8UL,
+ 0x828e8e70UL, 0xd49eed28UL, 0xb1f95190UL, 0x5f56e482UL, 0x3a31583aUL,
+ 0x83098fa7UL, 0xe66e331fUL, 0x08c1860dUL, 0x6da63ab5UL, 0xa4e140bdUL,
+ 0xc186fc05UL, 0x2f294917UL, 0x4a4ef5afUL, 0xf3762232UL, 0x96119e8aUL,
+ 0x78be2b98UL, 0x1dd99720UL, 0x4bc9f478UL, 0x2eae48c0UL, 0xc001fdd2UL,
+ 0xa566416aUL, 0x1c5e96f7UL, 0x79392a4fUL, 0x97969f5dUL, 0xf2f123e5UL,
+ 0x05196b4dUL, 0x607ed7f5UL, 0x8ed162e7UL, 0xebb6de5fUL, 0x528e09c2UL,
+ 0x37e9b57aUL, 0xd9460068UL, 0xbc21bcd0UL, 0xea31df88UL, 0x8f566330UL,
+ 0x61f9d622UL, 0x049e6a9aUL, 0xbda6bd07UL, 0xd8c101bfUL, 0x366eb4adUL,
+ 0x53090815UL, 0x9a4e721dUL, 0xff29cea5UL, 0x11867bb7UL, 0x74e1c70fUL,
+ 0xcdd91092UL, 0xa8beac2aUL, 0x46111938UL, 0x2376a580UL, 0x7566c6d8UL,
+ 0x10017a60UL, 0xfeaecf72UL, 0x9bc973caUL, 0x22f1a457UL, 0x479618efUL,
+ 0xa939adfdUL, 0xcc5e1145UL, 0x06ee4d76UL, 0x6389f1ceUL, 0x8d2644dcUL,
+ 0xe841f864UL, 0x51792ff9UL, 0x341e9341UL, 0xdab12653UL, 0xbfd69aebUL,
+ 0xe9c6f9b3UL, 0x8ca1450bUL, 0x620ef019UL, 0x07694ca1UL, 0xbe519b3cUL,
+ 0xdb362784UL, 0x35999296UL, 0x50fe2e2eUL, 0x99b95426UL, 0xfcdee89eUL,
+ 0x12715d8cUL, 0x7716e134UL, 0xce2e36a9UL, 0xab498a11UL, 0x45e63f03UL,
+ 0x208183bbUL, 0x7691e0e3UL, 0x13f65c5bUL, 0xfd59e949UL, 0x983e55f1UL,
+ 0x2106826cUL, 0x44613ed4UL, 0xaace8bc6UL, 0xcfa9377eUL, 0x38417fd6UL,
+ 0x5d26c36eUL, 0xb389767cUL, 0xd6eecac4UL, 0x6fd61d59UL, 0x0ab1a1e1UL,
+ 0xe41e14f3UL, 0x8179a84bUL, 0xd769cb13UL, 0xb20e77abUL, 0x5ca1c2b9UL,
+ 0x39c67e01UL, 0x80fea99cUL, 0xe5991524UL, 0x0b36a036UL, 0x6e511c8eUL,
+ 0xa7166686UL, 0xc271da3eUL, 0x2cde6f2cUL, 0x49b9d394UL, 0xf0810409UL,
+ 0x95e6b8b1UL, 0x7b490da3UL, 0x1e2eb11bUL, 0x483ed243UL, 0x2d596efbUL,
+ 0xc3f6dbe9UL, 0xa6916751UL, 0x1fa9b0ccUL, 0x7ace0c74UL, 0x9461b966UL,
+ 0xf10605deUL
+#endif
+ }
+};
diff --git a/crc32c.c b/crc32c.c
new file mode 100644
index 0000000..156cba1
--- /dev/null
+++ b/crc32c.c
@@ -0,0 +1,104 @@
+/*
+ * Oct 28, 2015 Song Liu simplified the code and port it to mdadm
+ *
+ * Aug 8, 2011 Bob Pearson with help from Joakim Tjernlund and George Spelvin
+ * cleaned up code to current version of sparse and added the slicing-by-8
+ * algorithm to the closely similar existing slicing-by-4 algorithm.
+ *
+ * Oct 15, 2000 Matt Domsch <Matt_Domsch@dell.com>
+ * Nicer crc32 functions/docs submitted by linux@horizon.com. Thanks!
+ * Code was from the public domain, copyright abandoned. Code was
+ * subsequently included in the kernel, thus was re-licensed under the
+ * GNU GPL v2.
+ *
+ * Oct 12, 2000 Matt Domsch <Matt_Domsch@dell.com>
+ * Same crc32 function was used in 5 other places in the kernel.
+ * I made one version, and deleted the others.
+ * There are various incantations of crc32(). Some use a seed of 0 or ~0.
+ * Some xor at the end with ~0. The generic crc32() function takes
+ * seed as an argument, and doesn't xor at the end. Then individual
+ * users can do whatever they need.
+ * drivers/net/smc9194.c uses seed ~0, doesn't xor with ~0.
+ * fs/jffs2 uses seed 0, doesn't xor with ~0.
+ * fs/partitions/efi.c uses seed ~0, xor's with ~0.
+ *
+ * This source code is licensed under the GNU General Public License,
+ * Version 2. See the file COPYING for more details.
+ */
+
+#include <sys/types.h>
+#include <asm/types.h>
+#include <stdlib.h>
+
+/*
+ * There are multiple 16-bit CRC polynomials in common use, but this is
+ * *the* standard CRC-32 polynomial, first popularized by Ethernet.
+ * x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x^1+x^0
+ */
+#define CRCPOLY_LE 0xedb88320
+#define CRCPOLY_BE 0x04c11db7
+
+/*
+ * This is the CRC32c polynomial, as outlined by Castagnoli.
+ * x^32+x^28+x^27+x^26+x^25+x^23+x^22+x^20+x^19+x^18+x^14+x^13+x^11+x^10+x^9+
+ * x^8+x^6+x^0
+ */
+#define CRC32C_POLY_LE 0x82F63B78
+
+/**
+ * crc32_le_generic() - Calculate bitwise little-endian Ethernet AUTODIN II
+ * CRC32/CRC32C
+ * @crc: seed value for computation. ~0 for Ethernet, sometimes 0 for other
+ * uses, or the previous crc32/crc32c value if computing incrementally.
+ * @p: pointer to buffer over which CRC32/CRC32C is run
+ * @len: length of buffer @p
+ * @polynomial: CRC32/CRC32c LE polynomial
+ */
+static inline __u32 crc32_le_generic(__u32 crc, unsigned char const *p,
+ size_t len, __u32 polynomial)
+{
+ int i;
+ while (len--) {
+ crc ^= *p++;
+ for (i = 0; i < 8; i++)
+ crc = (crc >> 1) ^ ((crc & 1) ? polynomial : 0);
+ }
+ return crc;
+}
+
+ /*
+  * Bitwise little-endian CRC32 using the standard (Ethernet) polynomial
+  * CRCPOLY_LE.  @crc is the seed, the return value is the updated CRC.
+  */
+ __u32 crc32_le(__u32 crc, unsigned char const *p, size_t len)
+ {
+ 	__u32 const poly = CRCPOLY_LE;
+
+ 	return crc32_le_generic(crc, p, len, poly);
+ }
+
+ /*
+  * Bitwise little-endian CRC32C using the Castagnoli polynomial
+  * CRC32C_POLY_LE.  @crc is the seed, the return value is the updated CRC.
+  */
+ __u32 crc32c_le(__u32 crc, unsigned char const *p, size_t len)
+ {
+ 	__u32 const poly = CRC32C_POLY_LE;
+
+ 	return crc32_le_generic(crc, p, len, poly);
+ }
+
+/**
+ * crc32_be_generic() - Calculate bitwise big-endian Ethernet AUTODIN II CRC32
+ * @crc: seed value for computation. ~0 for Ethernet, sometimes 0 for
+ * other uses, or the previous crc32 value if computing incrementally.
+ * @p: pointer to buffer over which CRC32 is run
+ * @len: length of buffer @p
+ * @polynomial: CRC32 BE polynomial
+ */
+static inline __u32 crc32_be_generic(__u32 crc, unsigned char const *p,
+ size_t len, __u32 polynomial)
+{
+ int i;
+ while (len--) {
+ crc ^= *p++ << 24;
+ for (i = 0; i < 8; i++)
+ crc =
+ (crc << 1) ^ ((crc & 0x80000000) ? polynomial :
+ 0);
+ }
+ return crc;
+}
+
+ /*
+  * Bitwise big-endian CRC32 using the standard polynomial CRCPOLY_BE.
+  * @crc is the seed, the return value is the updated CRC.
+  */
+ __u32 crc32_be(__u32 crc, unsigned char const *p, size_t len)
+ {
+ 	__u32 const poly = CRCPOLY_BE;
+
+ 	return crc32_be_generic(crc, p, len, poly);
+ }
diff --git a/dlink.c b/dlink.c
new file mode 100644
index 0000000..69aa7aa
--- /dev/null
+++ b/dlink.c
@@ -0,0 +1,74 @@
+
+/* doubly linked lists */
+/* This is free software. No strings attached. No copyright claimed */
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <string.h>
+#ifdef __dietlibc__
+char *strncpy(char *dest, const char *src, size_t n) __THROW;
+#endif
+void *xcalloc(size_t num, size_t size);
+#include "dlink.h"
+
+ /*
+  * Allocate a list head: a zero-payload node whose prev and next links
+  * both point back at itself, i.e. an empty circular list.
+  */
+ void *dl_head()
+ {
+ 	void *head = dl_alloc(0);
+
+ 	dl_prev(head) = head;
+ 	dl_next(head) = head;
+ 	return head;
+ }
+
+ /*
+  * Release a node.  v points just past the struct __dl_head that precedes
+  * the payload, so step back one header to reach the address to free().
+  */
+ void dl_free(void *v)
+ {
+ 	struct __dl_head *hdr = (struct __dl_head *)v - 1;
+
+ 	free(hdr);
+ }
+
+ /* Make v a one-element circular list: both of its links point at itself. */
+ void dl_init(void *v)
+ {
+ 	dl_prev(v) = v;
+ 	dl_next(v) = v;
+ }
+
+ /*
+  * Link val into the list immediately after head.
+  * val's own links are filled in first; the two neighbours are then reached
+  * through val (dl_prev(val) is head, dl_next(val) is head's old successor),
+  * so the statement order matters.
+  */
+ void dl_insert(void *head, void *val)
+ {
+ dl_next(val) = dl_next(head);
+ dl_prev(val) = head;
+ dl_next(dl_prev(val)) = val;
+ dl_prev(dl_next(val)) = val;
+ }
+
+ /*
+  * Link val into the list immediately before head.
+  * val's own links are filled in first; the two neighbours are then reached
+  * through val (dl_next(val) is head, dl_prev(val) is head's old
+  * predecessor), so the statement order matters.
+  */
+ void dl_add(void *head, void *val)
+ {
+ dl_prev(val) = dl_prev(head);
+ dl_next(val) = head;
+ dl_next(dl_prev(val)) = val;
+ dl_prev(dl_next(val)) = val;
+ }
+
+ /*
+  * Unlink val from whatever list it is on.
+  * A node with either link set to 0 is treated as not being on a list and
+  * is left untouched.  After unlinking, both links of val are set to 0, so
+  * calling dl_del() again on the same node is a no-op.
+  */
+ void dl_del(void *val)
+ {
+ if (dl_prev(val) == 0 || dl_next(val) == 0)
+ return;
+ /* Make the neighbours point at each other, bypassing val. */
+ dl_prev(dl_next(val)) = dl_prev(val);
+ dl_next(dl_prev(val)) = dl_next(val);
+ dl_prev(val) = dl_next(val) = 0;
+ }
+
+ /*
+  * Return a dl_alloc()ed, NUL-terminated copy of at most l characters of s.
+  * Returns NULL if s is NULL or l is negative.
+  * The result sits behind a struct __dl_head, so release it with dl_free(),
+  * not free().
+  */
+ char *dl_strndup(char *s, int l)
+ {
+ 	char *n;
+
+ 	/*
+ 	 * A negative length would under-size the allocation and make the
+ 	 * n[l] store below land in front of the buffer.
+ 	 */
+ 	if (s == NULL || l < 0)
+ 		return NULL;
+ 	n = dl_newv(char, l+1);
+ 	strncpy(n, s, l+1);
+ 	/* strncpy() does not terminate when s has l+1 or more characters. */
+ 	n[l] = 0;
+ 	return n;
+ }
+
+ /*
+  * Return a dl_alloc()ed copy of the whole string s, or NULL if s is NULL.
+  * Release the result with dl_free().
+  */
+ char *dl_strdup(char *s)
+ {
+ 	/* strlen(NULL) is undefined; mirror dl_strndup()'s NULL handling. */
+ 	if (s == NULL)
+ 		return NULL;
+ 	return dl_strndup(s, (int)strlen(s));
+ }
diff --git a/dlink.h b/dlink.h
new file mode 100644
index 0000000..ab2a945
--- /dev/null
+++ b/dlink.h
@@ -0,0 +1,25 @@
+
+ /* doubly linked lists */
+/* This is free software. No strings attached. No copyright claimed */
+
+ /*
+  * Link header stored immediately in front of every list node's payload.
+  * List users hold a pointer to the payload; the dl_next()/dl_prev()
+  * macros step back one header to reach these fields.
+  */
+ struct __dl_head {
+ 	void *dh_prev;	/* previous node's payload */
+ 	void *dh_next;	/* next node's payload */
+ };
+
+ /*
+  * dl_alloc(size): allocate size payload bytes preceded by a struct __dl_head
+  * and return a pointer to the payload (just past the header).
+  * dl_new(t): allocate one node whose payload is a t.
+  * dl_newv(t,n): allocate one node whose payload is an array of n t's.
+  * n is parenthesised so that an expression such as "a+b" is sized correctly.
+  */
+ #define dl_alloc(size) ((void*)(((char*)xcalloc(1,(size)+sizeof(struct __dl_head)))+sizeof(struct __dl_head)))
+ #define dl_new(t) ((t*)dl_alloc(sizeof(t)))
+ #define dl_newv(t,n) ((t*)dl_alloc(sizeof(t)*(n)))
+
+ /*
+  * dl_next(p) / dl_prev(p): the link fields of the header in front of
+  * payload p.  Both expand to lvalues, so they can be assigned to.
+  */
+ #define dl_next(p) (((struct __dl_head*)(p))[-1].dh_next)
+ #define dl_prev(p) (((struct __dl_head*)(p))[-1].dh_prev)
+
+ /* Allocate an empty list head (a node linked to itself). */
+ void *dl_head(void);
+ /* Duplicate a string (or its first l characters) into a dl_alloc()ed node. */
+ char *dl_strdup(char *s);
+ char *dl_strndup(char *s, int l);
+ /* Link val right after head (dl_insert) or right before head (dl_add). */
+ void dl_insert(void *head, void *val);
+ void dl_add(void *head, void *val);
+ /* Unlink val from its list. */
+ void dl_del(void *val);
+ /* Free a node obtained from dl_alloc()/dl_new()/dl_newv(). */
+ void dl_free(void *v);
+ /* Point both links of v at itself. */
+ void dl_init(void *v);
diff --git a/external-reshape-design.txt b/external-reshape-design.txt
new file mode 100644
index 0000000..e4cf4e1
--- /dev/null
+++ b/external-reshape-design.txt
@@ -0,0 +1,280 @@
+External Reshape
+
+1 Problem statement
+
+External (third-party metadata) reshape differs from native-metadata
+reshape in three key ways:
+
+1.1 Format specific constraints
+
+In the native case reshape is limited by what is implemented in the
+generic reshape routine (Grow_reshape()) and what is supported by the
+kernel. There are exceptional cases where Grow_reshape() may block
+operations when it knows that the kernel implementation is broken, but
+otherwise the kernel is relied upon to be the final arbiter of what
+reshape operations are supported.
+
+In the external case the kernel, and the generic checks in
+Grow_reshape(), become the super-set of what reshapes are possible. The
+metadata format may not support, or have yet to implement a given
+reshape type. The implication for Grow_reshape() is that it must query
+the metadata handler and effect changes in the metadata before the new
+geometry is posted to the kernel. The ->reshape_super method allows
+Grow_reshape() to validate the requested operation and post the metadata
+update.
+
+1.2 Scope of reshape
+
+Native metadata reshape is always performed at the array scope (no
+metadata relationship with sibling arrays on the same disks). External
+reshape, depending on the format, may not allow the number of member
+disks to be changed in a subarray unless the change is simultaneously
+applied to all subarrays in the container. For example the imsm format
+requires all member disks to be a member of all subarrays, so a 4-disk
+raid5 in a container that also houses a 4-disk raid10 array could not be
+reshaped to 5 disks as the imsm format does not support a 5-disk raid10
+representation. This requires the ->reshape_super method to check the
+contents of the array and ask the user to run the reshape at container
+scope (if all subarrays are agreeable to the change), or report an
+error in the case where one subarray cannot support the change.
+
+1.3 Monitoring / checkpointing
+
+Reshape, unlike rebuild/resync, requires strict checkpointing to survive
+interrupted reshape operations. For example when expanding a raid5
+array the first few stripes of the array will be overwritten in a
+destructive manner. When restarting the reshape process we need to know
+the exact location of the last successfully written stripe, and we need
+to restore the data in any partially overwritten stripe. Native
+metadata stores this backup data in the unused portion of spares that
+are being promoted to array members, or in an external backup file
+(located on a non-involved block device).
+
+The kernel is in charge of recording checkpoints of reshape progress,
+but mdadm is delegated the task of managing the backup space which
+involves:
+1/ Identifying what data will be overwritten in the next unit of reshape
+ operation
+2/ Suspending access to that region so that a snapshot of the data can
+ be transferred to the backup space.
+3/ Allowing the kernel to reshape the saved region and setting the
+ boundary for the next backup.
+
+In the external reshape case we want to preserve this mdadm
+'reshape-manager' arrangement, but have a third actor, mdmon, to
+consider. It is tempting to give the role of managing reshape to mdmon,
+but that is counter to its role as a monitor, and conflicts with the
+existing capabilities and role of mdadm to manage the progress of
+reshape. For clarity the external reshape implementation maintains the
+role of mdmon as a (mostly) passive recorder of raid events, and mdadm
+treats it as it would the kernel in the native reshape case (modulo
+needing to send explicit metadata update messages and checking that
+mdmon took the expected action).
+
+External reshape can use the generic md backup file as a fallback, but in the
+optimal/firmware-compatible case the reshape-manager will use the metadata
+specific areas for managing reshape. The implementation also needs to spawn a
+reshape-manager per subarray when the reshape is being carried out at the
+container level. For these two reasons the ->manage_reshape() method is
+introduced. This method in addition to base tasks mentioned above:
+ 1/ Processes each subarray one at a time in series - where appropriate.
+2/ Uses either generic routines in Grow.c for md-style backup file
+ support, or uses the metadata-format specific location for storing
+ recovery data.
+This aims to avoid a "midlayer mistake"[1] and lets the metadata handler
+optionally take advantage of generic infrastructure in Grow.c
+
+2 Details for specific reshape requests
+
+There are quite a few moving pieces spread out across md, mdadm, and mdmon for
+the support of external reshape, and there are several different types of
+reshape that need to be comprehended by the implementation. A rundown of
+these details follows.
+
+2.0 General provisions:
+
+Obtain an exclusive open on the container to make sure we are not
+running concurrently with a Create() event.
+
+2.1 Freezing sync_action
+
+ Before making any attempt at a reshape we 'freeze' every array in
+ the container to ensure no spare assignment or recovery happens.
+ This involves writing 'frozen' to sync_action and changing the '/'
+ after 'external:' in metadata_version to a '-'. mdmon knows that
+ this means not to perform any management.
+
+ Before doing this we check that all sync_actions are 'idle', which
+ is racy but still useful.
+ Afterwards we check that all member arrays have no spares
+ or partial spares (recovery_start != 'none') which would indicate a
+ race. If they do, we unfreeze again.
+
+ Once this completes we know all the arrays are stable. They may
+ still have failed devices as devices can fail at any time. However
+ we treat those like failures that happen during the reshape.
+
+2.2 Reshape size
+
+ 1/ mdadm::Grow_reshape(): checks if mdmon is running and optionally
+ initializes st->update_tail
+ 2/ mdadm::Grow_reshape() calls ->reshape_super() to check that the size change
+ is allowed (being performed at subarray scope / enough room) and prepares a
+ metadata update
+ 3/ mdadm::Grow_reshape(): flushes the metadata update (via
+ flush_metadata_update(), or ->sync_metadata())
+ 4/ mdadm::Grow_reshape(): post the new size to the kernel
+
+
+2.3 Reshape level (simple-takeover)
+
+"simple-takeover" implies the level change can be satisfied without touching
+sync_action
+
+ 1/ mdadm::Grow_reshape(): checks if mdmon is running and optionally
+ initializes st->update_tail
+ 2/ mdadm::Grow_reshape() calls ->reshape_super() to check that the level change
+ is allowed (being performed at subarray scope) and prepares a
+ metadata update
+ 2a/ raid10 --> raid0: degrade all mirror legs prior to calling
+ ->reshape_super
+ 3/ mdadm::Grow_reshape(): flushes the metadata update (via
+ flush_metadata_update(), or ->sync_metadata())
+ 4/ mdadm::Grow_reshape(): post the new level to the kernel
+
+2.4 Reshape chunk, layout
+
+2.5 Reshape raid disks (grow)
+
+ 1/ mdadm::Grow_reshape(): unconditionally initializes st->update_tail
+ because only redundant raid levels can modify the number of raid disks
+ 2/ mdadm::Grow_reshape(): calls ->reshape_super() to check that the level
+ change is allowed (being performed at proper scope / permissible
+ geometry / proper spares available in the container), chooses
+ the spares to use, and prepares a metadata update.
+ 3/ mdadm::Grow_reshape(): Converts each subarray in the container to the
+ raid level that can perform the reshape and starts mdmon.
+ 4/ mdadm::Grow_reshape(): Pushes the update to mdmon.
+ 5/ mdadm::Grow_reshape(): uses container_content to find details of
+ the spares and passes them to the kernel.
+ 6/ mdadm::Grow_reshape(): gives raid_disks update to the kernel,
+ sets sync_max, sync_min, suspend_lo, suspend_hi all to zero,
+ and starts the reshape by writing 'reshape' to sync_action.
+ 7/ mdmon::monitor notices the sync_action change and tells
+ managemon to check for new devices. managemon notices the new
+ devices, opens relevant sysfs file, and passes them all to
+ monitor.
+ 8/ mdadm::Grow_reshape() calls ->manage_reshape to oversee the
+ rest of the reshape.
+
+ 9/ mdadm::<format>->manage_reshape(): saves data that will be overwritten by
+ the kernel to either the backup file or the metadata specific location,
+ advances sync_max, waits for reshape, ping mdmon, repeat.
+ Meanwhile mdmon::read_and_act(): records checkpoints.
+ Specifically.
+
+ 9a/ if the 'next' stripe to be reshaped will over-write
+ itself during reshape then:
+ 9a.1/ increase suspend_hi to cover a suitable number of
+ stripes.
+ 9a.2/ backup those stripes safely.
+ 9a.3/ advance sync_max to allow those stripes to be reshaped
+ 9a.4/ when sync_completed indicates that those stripes have
+ been reshaped, manage_reshape must ping_manager
+ 9a.5/ when mdmon notices that sync_completed has been updated,
+ it records the new checkpoint in the metadata
+ 9a.6/ after the ping_manager, manage_reshape will increase
+ suspend_lo to allow access to those stripes again
+
+ 9b/ if the 'next' stripe to be reshaped will over-write unused
+ space during reshape then we apply same process as above,
+ except that there is no need to back anything up.
+ Note that we *do* need to keep suspend_hi progressing as
+ it is not safe to write to the area-under-reshape. For
+ kernel-managed-metadata this protection is provided by
+ ->reshape_safe, but that does not protect us in the case
+ of user-space-managed-metadata.
+
+ 10/ mdadm::<format>->manage_reshape(): Once reshape completes changes the raid
+ level back to the nominal raid level (if necessary)
+
+ FIXME: native metadata does not have the capability to record the original
+ raid level in reshape-restart case because the kernel always records current
+ raid level to the metadata, whereas external metadata can masquerade at an
+ alternate level based on the reshape state.
+
+2.6 Reshape raid disks (shrink)
+
+ 3 Interaction with metadata handler.
+
+ The following calls are made into the metadata handler to assist
+ with initiating and monitoring a 'reshape'.
+
+ 1/ ->reshape_super is called quite early (after only minimal
+ checks) to make sure that the metadata can record the new shape
+ and any necessary transitions. It may be passed a 'container'
+ or an individual array within a container, and it should notice
+ the difference and act accordingly.
+ When a reshape is requested against a container it is expected
+ that it should be applied to every array in the container,
+ however it is up to the metadata handler to determine final
+ policy.
+
+ If the reshape is supportable, the internal copy of the metadata
+ should be updated, and a metadata update suitable for sending
+ to mdmon should be queued.
+
+ If the reshape will involve converting spares into array members,
+ this must be recorded in the metadata too.
+
+ 2/ ->container_content will be called to find out the new state
+ of the array, or of all arrays in the container. Any newly
+ added devices (with state==0 and raid_disk >= 0) will be added
+ to the array as spares with the relevant slot number.
+
+ It is likely that the info returned by ->container_content will
+ have ->reshape_active set, ->reshape_progress set to e.g. 0, and
+ new_* set appropriately. mdadm will use this information to
+ cause the correct reshape to start at an appropriate time.
+
+ 3/ ->set_array_state will be called by mdmon when reshape has
+ started and again periodically as it progresses. This should
+ record the ->last_checkpoint as the point where reshape has
+ progressed to. When the reshape finished this will be called
+ again and it should notice that ->curr_action is no longer
+ 'reshape' and so should record that the reshape has finished
+ providing 'last_checkpoint' has progressed suitably.
+
+ 4/ ->manage_reshape will be called once the reshape has been set
+ up in the kernel but before sync_max has been moved from 0, so
+ no actual reshape will have happened.
+
+ ->manage_reshape should call progress_reshape() to allow the
+ reshape to progress, and should back-up any data as indicated
+ by the return value. See the documentation of that function
+ for more details.
+ ->manage_reshape will be called multiple times when a
+ container is being reshaped, once for each member array in
+ the container.
+
+
+ The progress of the metadata is as follows:
+ 1/ mdadm sends a metadata update to mdmon which marks the array
+ as undergoing a reshape. This is set up by
+ ->reshape_super and applied by ->process_update
+ For container-wide reshape, this happens once for the whole
+ container.
+ 2/ mdmon notices progress via the sysfs files and calls
+ ->set_array_state to update the state periodically
+ For container-wide reshape, this happens repeatedly for
+ one array, then repeatedly for the next, etc.
+ 3/ mdmon notices when reshape has finished and calls
+ ->set_array_state to record that the reshape is complete.
+ For container-wide reshape, this happens once for each
+ member array.
+
+
+
+...
+
+[1]: Linux kernel design patterns - part 3, Neil Brown https://lwn.net/Articles/336262/
diff --git a/inventory b/inventory
new file mode 100755
index 0000000..c4801b4
--- /dev/null
+++ b/inventory
@@ -0,0 +1,284 @@
+
+.gitignore
+ANNOUNCE-3.0
+ANNOUNCE-3.0.1
+ANNOUNCE-3.0.2
+ANNOUNCE-3.0.3
+ANNOUNCE-3.1
+ANNOUNCE-3.1.1
+ANNOUNCE-3.1.2
+ANNOUNCE-3.1.3
+ANNOUNCE-3.1.4
+ANNOUNCE-3.1.5
+ANNOUNCE-3.2
+ANNOUNCE-3.2.1
+ANNOUNCE-3.2.2
+ANNOUNCE-3.2.3
+ANNOUNCE-3.2.4
+ANNOUNCE-3.2.5
+ANNOUNCE-3.2.6
+ANNOUNCE-3.3
+ANNOUNCE-3.3.1
+ANNOUNCE-3.3.2
+ANNOUNCE-3.3.3
+ANNOUNCE-3.3.4
+ANNOUNCE-3.4
+ANNOUNCE-4.0
+ANNOUNCE-4.1
+ANNOUNCE-4.2
+Assemble.c
+Build.c
+COPYING
+ChangeLog
+Create.c
+Detail.c
+Dump.c
+Examine.c
+Grow.c
+INSTALL
+Incremental.c
+Kill.c
+Makefile
+Manage.c
+Monitor.c
+Query.c
+README.initramfs
+ReadMe.c
+TODO
+bitmap.c
+bitmap.h
+clustermd_tests/
+clustermd_tests/00r10_Create
+clustermd_tests/00r1_Create
+clustermd_tests/01r10_Grow_bitmap-switch
+clustermd_tests/01r10_Grow_resize
+clustermd_tests/01r1_Grow_add
+clustermd_tests/01r1_Grow_bitmap-switch
+clustermd_tests/01r1_Grow_resize
+clustermd_tests/02r10_Manage_add
+clustermd_tests/02r10_Manage_add-spare
+clustermd_tests/02r10_Manage_re-add
+clustermd_tests/02r1_Manage_add
+clustermd_tests/02r1_Manage_add-spare
+clustermd_tests/02r1_Manage_re-add
+clustermd_tests/03r10_switch-recovery
+clustermd_tests/03r10_switch-resync
+clustermd_tests/03r1_switch-recovery
+clustermd_tests/03r1_switch-resync
+clustermd_tests/cluster_conf
+clustermd_tests/func.sh
+config.c
+coverity-gcc-hack.h
+crc32.c
+crc32.h
+crc32c.c
+dlink.c
+dlink.h
+external-reshape-design.txt
+inventory
+lib.c
+makedist
+managemon.c
+mapfile.c
+maps.c
+md.4
+md5.h
+md_p.h
+md_u.h
+mdadm.8.in
+mdadm.c
+mdadm.conf-example
+mdadm.conf.5
+mdadm.h
+mdadm.spec
+mdmon-design.txt
+mdmon.8
+mdmon.c
+mdmon.h
+mdopen.c
+mdstat.c
+misc/
+misc/mdcheck
+misc/syslog-events
+mkinitramfs
+monitor.c
+msg.c
+msg.h
+part.h
+platform-intel.c
+platform-intel.h
+policy.c
+probe_roms.c
+probe_roms.h
+pwgr.c
+raid5extend.c
+raid6check.8
+raid6check.c
+restripe.c
+sg_io.c
+sha1.c
+sha1.h
+super-ddf.c
+super-gpt.c
+super-intel.c
+super-mbr.c
+super0.c
+super1.c
+swap_super.c
+sysfs.c
+systemd/
+systemd/SUSE-mdadm_env.sh
+systemd/mdadm-grow-continue@.service
+systemd/mdadm-last-resort@.service
+systemd/mdadm-last-resort@.timer
+systemd/mdadm.shutdown
+systemd/mdcheck_continue.service
+systemd/mdcheck_continue.timer
+systemd/mdcheck_start.service
+systemd/mdcheck_start.timer
+systemd/mdmon@.service
+systemd/mdmonitor-oneshot.service
+systemd/mdmonitor-oneshot.timer
+systemd/mdmonitor.service
+test
+tests/
+tests/00linear
+tests/00multipath
+tests/00names
+tests/00raid0
+tests/00raid1
+tests/00raid10
+tests/00raid4
+tests/00raid5
+tests/00raid6
+tests/00readonly
+tests/01r1fail
+tests/01r5fail
+tests/01r5integ
+tests/01raid6integ
+tests/01replace
+tests/02lineargrow
+tests/02r1add
+tests/02r1grow
+tests/02r5grow
+tests/02r6grow
+tests/03assem-incr
+tests/03r0assem
+tests/03r5assem
+tests/03r5assem-failed
+tests/03r5assemV1
+tests/04r0update
+tests/04r1update
+tests/04r5swap
+tests/04update-metadata
+tests/04update-uuid
+tests/05r1-add-internalbitmap
+tests/05r1-add-internalbitmap-v1a
+tests/05r1-add-internalbitmap-v1b
+tests/05r1-add-internalbitmap-v1c
+tests/05r1-bitmapfile
+tests/05r1-failfast
+tests/05r1-grow-external
+tests/05r1-grow-internal
+tests/05r1-grow-internal-1
+tests/05r1-internalbitmap
+tests/05r1-internalbitmap-v1a
+tests/05r1-internalbitmap-v1b
+tests/05r1-internalbitmap-v1c
+tests/05r1-n3-bitmapfile
+tests/05r1-re-add
+tests/05r1-re-add-nosuper
+tests/05r1-remove-internalbitmap
+tests/05r1-remove-internalbitmap-v1a
+tests/05r1-remove-internalbitmap-v1b
+tests/05r1-remove-internalbitmap-v1c
+tests/05r5-bitmapfile
+tests/05r5-internalbitmap
+tests/05r6-bitmapfile
+tests/05r6tor0
+tests/06name
+tests/06sysfs
+tests/06wrmostly
+tests/07autoassemble
+tests/07autodetect
+tests/07changelevelintr
+tests/07changelevels
+tests/07layouts
+tests/07reshape5intr
+tests/07revert-grow
+tests/07revert-inplace
+tests/07revert-shrink
+tests/07testreshape5
+tests/09imsm-assemble
+tests/09imsm-create-fail-rebuild
+tests/09imsm-overlap
+tests/10ddf-assemble-missing
+tests/10ddf-create
+tests/10ddf-create-fail-rebuild
+tests/10ddf-fail-create-race
+tests/10ddf-fail-readd
+tests/10ddf-fail-readd-readonly
+tests/10ddf-fail-spare
+tests/10ddf-fail-stop-readd
+tests/10ddf-fail-twice
+tests/10ddf-fail-two-spares
+tests/10ddf-geometry
+tests/10ddf-incremental-wrong-order
+tests/10ddf-sudden-degraded
+tests/11spare-migration
+tests/12imsm-r0_2d-grow-r0_3d
+tests/12imsm-r0_2d-grow-r0_4d
+tests/12imsm-r0_2d-grow-r0_5d
+tests/12imsm-r0_3d-grow-r0_4d
+tests/12imsm-r5_3d-grow-r5_4d
+tests/12imsm-r5_3d-grow-r5_5d
+tests/13imsm-r0_r0_2d-grow-r0_r0_4d
+tests/13imsm-r0_r0_2d-grow-r0_r0_5d
+tests/13imsm-r0_r0_3d-grow-r0_r0_4d
+tests/13imsm-r0_r5_3d-grow-r0_r5_4d
+tests/13imsm-r0_r5_3d-grow-r0_r5_5d
+tests/13imsm-r5_r0_3d-grow-r5_r0_4d
+tests/13imsm-r5_r0_3d-grow-r5_r0_5d
+tests/14imsm-r0_3d-r5_3d-migrate-r5_4d-r5_4d
+tests/14imsm-r0_3d_no_spares-migrate-r5_3d
+tests/14imsm-r0_r0_2d-takeover-r10_4d
+tests/14imsm-r10_4d-grow-r10_5d
+tests/14imsm-r10_r5_4d-takeover-r0_2d
+tests/14imsm-r1_2d-grow-r1_3d
+tests/14imsm-r1_2d-takeover-r0_2d
+tests/14imsm-r5_3d-grow-r5_5d-no-spares
+tests/14imsm-r5_3d-migrate-r4_3d
+tests/15imsm-r0_3d_64k-migrate-r0_3d_256k
+tests/15imsm-r5_3d_4k-migrate-r5_3d_256k
+tests/15imsm-r5_3d_64k-migrate-r5_3d_256k
+tests/15imsm-r5_6d_4k-migrate-r5_6d_256k
+tests/15imsm-r5_r0_3d_64k-migrate-r5_r0_3d_256k
+tests/16imsm-r0_3d-migrate-r5_4d
+tests/16imsm-r0_5d-migrate-r5_6d
+tests/16imsm-r5_3d-migrate-r0_3d
+tests/16imsm-r5_5d-migrate-r0_5d
+tests/18imsm-1d-takeover-r0_1d
+tests/18imsm-1d-takeover-r1_2d
+tests/18imsm-r0_2d-takeover-r10_4d
+tests/18imsm-r10_4d-takeover-r0_2d
+tests/18imsm-r1_2d-takeover-r0_1d
+tests/19raid6auto-repair
+tests/19raid6check
+tests/19raid6repair
+tests/19repair-does-not-destroy
+tests/20raid5journal
+tests/21raid5cache
+tests/ToTest
+tests/env-ddf-template
+tests/env-imsm-template
+tests/func.sh
+tests/imsm-grow-template
+tests/utils
+udev-md-clustered-confirm-device.rules
+udev-md-raid-arrays.rules
+udev-md-raid-assembly.rules
+udev-md-raid-creating.rules
+udev-md-raid-safe-timeouts.rules
+util.c
+uuid.c
+xmalloc.c
diff --git a/lib.c b/lib.c
new file mode 100644
index 0000000..7e3e3d4
--- /dev/null
+++ b/lib.c
@@ -0,0 +1,575 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2011 Neil Brown <neilb@suse.de>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email: <neilb@suse.de>
+ */
+
+#include "mdadm.h"
+#include "dlink.h"
+#include <ctype.h>
+#include <limits.h>
+
+/**
+ * is_dev_alive() - Check whether a device node can currently be read.
+ * @path: Path to test; may be NULL.
+ *
+ * Return: true when @path is non-NULL and access(2) grants R_OK on it,
+ * false otherwise.
+ */
+bool is_dev_alive(char *path)
+{
+	return path != NULL && access(path, R_OK) == 0;
+}
+
+/* This file contains various 'library' style functions. They
+ * have no dependency on anything outside this file.
+ */
+
+/**
+ * get_mdp_major() - Look up the block major number registered as "mdp".
+ *
+ * Scans /proc/devices word by word. Once the words "Block" "devices:" have
+ * been seen back to back, the most recent number read before the word "mdp"
+ * is taken as the major. A successful lookup is kept in a static, so later
+ * calls return it without re-reading the file.
+ *
+ * Return: the mdp major, or -1 if /proc/devices cannot be opened or no
+ * "mdp" entry was found.
+ */
+int get_mdp_major(void)
+{
+	static int mdp_major = -1;
+	FILE *fl;
+	char *w;
+	int have_block = 0;
+	int have_devices = 0;
+	int last_num = -1;
+
+	if (mdp_major != -1)
+		return mdp_major;
+
+	fl = fopen("/proc/devices", "r");
+	if (!fl)
+		return -1;
+
+	while ((w = conf_word(fl, 1))) {
+		/* "devices:" only counts when it directly follows "Block" */
+		if (have_block && strcmp(w, "devices:") == 0)
+			have_devices = 1;
+		have_block = (strcmp(w, "Block") == 0);
+		/* <ctype.h> needs an unsigned char value; plain char may be negative */
+		if (isdigit((unsigned char)w[0]))
+			last_num = atoi(w);
+		if (have_devices && strcmp(w, "mdp") == 0)
+			mdp_major = last_num;
+		free(w);
+	}
+	fclose(fl);
+
+	return mdp_major;
+}
+
+/**
+ * devid2kname() - Map a block device number to its kernel name.
+ * @devid: Device number to look up.
+ *
+ * Reads the /sys/dev/block/<major>:<minor> symlink and takes the last
+ * path component of its target as the name.
+ *
+ * Return: pointer to a static buffer (overwritten by the next call) holding
+ * the name, or NULL if the link cannot be read, contains no '/', or the
+ * name does not fit the buffer.
+ */
+char *devid2kname(dev_t devid)
+{
+	char path[40];
+	char link[PATH_MAX];
+	static char devnm[32];
+	char *cp;
+	int n;
+
+	n = snprintf(path, sizeof(path), "/sys/dev/block/%d:%d",
+		     (int)major(devid), (int)minor(devid));
+	if (n < 0 || (size_t)n >= sizeof(path))
+		return NULL;
+
+	n = readlink(path, link, sizeof(link) - 1);
+	if (n <= 0)
+		return NULL;
+	/* readlink() does not NUL-terminate */
+	link[n] = 0;
+
+	cp = strrchr(link, '/');
+	if (!cp)
+		return NULL;
+
+	/* refuse a name that would overflow devnm rather than truncate it */
+	n = snprintf(devnm, sizeof(devnm), "%s", cp + 1);
+	if (n < 0 || (size_t)n >= sizeof(devnm))
+		return NULL;
+
+	return devnm;
+}
+
+/*
+ * Kernel name for the block device described by 'st'.
+ * Returns NULL when 'st' is not a block device; otherwise whatever
+ * devid2kname() returns for st->st_rdev.
+ */
+char *stat2kname(struct stat *st)
+{
+	if (S_ISBLK(st->st_mode))
+		return devid2kname(st->st_rdev);
+
+	return NULL;
+}
+
+/*
+ * Kernel name for the block device open on 'fd'.
+ * Returns NULL if fstat() fails; otherwise the result of stat2kname().
+ */
+char *fd2kname(int fd)
+{
+	struct stat sb;
+
+	if (fstat(fd, &sb) != 0)
+		return NULL;
+
+	return stat2kname(&sb);
+}
+
+/**
+ * devid2devnm() - Map a block device number to the name of its whole device.
+ * @devid: Device number to look up.
+ *
+ * Return: pointer to a static buffer (overwritten by the next call), or
+ * NULL when no name can be derived or the name does not fit the buffer.
+ */
+char *devid2devnm(dev_t devid)
+{
+	char path[40];
+	char link[200];
+	static char devnm[32];
+	char *cp, *ep;
+	int n;
+
+	/* Might be an extended-minor partition or a
+	 * named md device.  Look at the
+	 * /sys/dev/block/%d:%d link which must look like
+	 *    ../../block/mdXXX/mdXXXpYY
+	 * or
+	 *    ...../block/md_FOO
+	 */
+	n = snprintf(path, sizeof(path), "/sys/dev/block/%d:%d",
+		     (int)major(devid), (int)minor(devid));
+	if (n > 0 && (size_t)n < sizeof(path))
+		n = readlink(path, link, sizeof(link) - 1);
+	else
+		n = -1;
+	if (n > 0) {
+		/* readlink() does not NUL-terminate */
+		link[n] = 0;
+		cp = strstr(link, "/block/");
+		if (cp) {
+			/* step over "/block/" and keep only the next component */
+			cp += 7;
+			ep = strchr(cp, '/');
+			if (ep)
+				*ep = 0;
+			/* refuse a name that would overflow devnm */
+			n = snprintf(devnm, sizeof(devnm), "%s", cp);
+			if (n < 0 || (size_t)n >= sizeof(devnm))
+				return NULL;
+			return devnm;
+		}
+	}
+
+	/* no usable sysfs link: build the name from the device number */
+	if (major(devid) == MD_MAJOR)
+		snprintf(devnm, sizeof(devnm), "md%d", (int)minor(devid));
+	else if (major(devid) == (unsigned)get_mdp_major())
+		snprintf(devnm, sizeof(devnm), "md_d%d",
+			 (int)(minor(devid) >> MdpMinorShift));
+	else
+		return NULL;
+
+	return devnm;
+}
+
+/*
+ * Device name for the block device described by 'st'.
+ * Returns NULL when 'st' is not a block device; otherwise whatever
+ * devid2devnm() returns for st->st_rdev.
+ */
+char *stat2devnm(struct stat *st)
+{
+	if (S_ISBLK(st->st_mode))
+		return devid2devnm(st->st_rdev);
+
+	return NULL;
+}
+
+/*
+ * Device name for the block device open on 'fd'.
+ * Returns NULL if fstat() fails; otherwise the result of stat2devnm().
+ */
+char *fd2devnm(int fd)
+{
+	struct stat sb;
+
+	if (fstat(fd, &sb) != 0)
+		return NULL;
+
+	return stat2devnm(&sb);
+}
+
+/* When we create a new array, we don't want the content to
+ * be immediately examined by udev - it is probably meaningless.
+ * So create /run/mdadm/creating-mdXXX and expect that a udev
+ * rule will notice this and act accordingly.
+ */
+static char block_path[] = "/run/mdadm/creating-%s";
+static char *unblock_path = NULL;
+
+/*
+ * Create the marker file for 'devnm' and remember its path in
+ * unblock_path so that udev_unblock() can remove it later.
+ * If the file cannot be created nothing is remembered.
+ */
+void udev_block(char *devnm)
+{
+	char *marker = NULL;
+	int fd;
+
+	xasprintf(&marker, block_path, devnm);
+	fd = open(marker, O_CREAT|O_RDWR, 0600);
+	if (fd < 0) {
+		free(marker);
+		return;
+	}
+
+	/* only the existence of the file matters, not its content */
+	close(fd);
+	unblock_path = marker;
+}
+
+/*
+ * Remove the marker file recorded by udev_block(), if any, and forget
+ * its path.  Does nothing when no marker is recorded.
+ */
+void udev_unblock(void)
+{
+	if (unblock_path == NULL)
+		return;
+
+	unlink(unblock_path);
+	free(unblock_path);
+	unblock_path = NULL;
+}
+
+/*
+ * convert a major/minor pair for a block device into a name in /dev, if possible.
+ * On the first call, walk /dev collecting names.
+ * Put them in a simple linked list for now.
+ */
+/* One /dev entry: a block device number and a path that names it. */
+struct devmap {
+	int major;
+	int minor;
+	char *name;		/* heap-allocated path, freed with the entry */
+	struct devmap *next;
+};
+
+/* Head of the list filled in by add_dev(); new entries go at the front. */
+struct devmap *devlist = NULL;
+
+/* Set to 1 by map_dev_preferred() once /dev has been walked into devlist. */
+int devlist_ready = 0;
+
+/*
+ * nftw() callback: record 'name' at the head of devlist if it is a
+ * block device, or a symlink whose target is one.  'flag' and 's' are
+ * unused.  Always returns 0 so that the tree walk continues.
+ */
+int add_dev(const char *name, const struct stat *stb, int flag, struct FTW *s)
+{
+	struct stat target;
+	const struct stat *info = stb;
+	struct devmap *entry;
+	char *copy;
+
+	/* for a symlink, look at what it points to instead */
+	if (S_ISLNK(info->st_mode)) {
+		if (stat(name, &target) != 0)
+			return 0;
+		info = &target;
+	}
+
+	if (!S_ISBLK(info->st_mode))
+		return 0;
+
+	copy = xstrdup(name);
+	entry = xmalloc(sizeof(*entry));
+	/* store "/dev/./foo" as "/dev/foo" */
+	if (strncmp(copy, "/dev/./", 7) == 0)
+		strcpy(copy + 4, name + 6);
+	if (entry) {
+		entry->major = major(info->st_rdev);
+		entry->minor = minor(info->st_rdev);
+		entry->name = copy;
+		entry->next = devlist;
+		devlist = entry;
+	}
+
+	return 0;
+}
+
+/*
+ * Replacement nftw() for builds where HAVE_NFTW is not defined.
+ */
+#ifndef HAVE_NFTW
+#ifdef HAVE_FTW
+/* Adapter giving add_dev() the three-argument callback shape ftw() expects. */
+int add_dev_1(const char *name, const struct stat *stb, int flag)
+{
+ return add_dev(name, stb, flag, NULL);
+}
+/*
+ * Walk 'path' with ftw().  NOTE: 'han' and 'flags' are ignored; the
+ * callback is always add_dev_1(), i.e. add_dev().
+ */
+int nftw(const char *path,
+ int (*han)(const char *name, const struct stat *stb,
+ int flag, struct FTW *s), int nopenfd, int flags)
+{
+ return ftw(path, add_dev_1, nopenfd);
+}
+#else
+/* Neither nftw() nor ftw() is available: visit nothing and report success. */
+int nftw(const char *path,
+ int (*han)(const char *name, const struct stat *stb,
+ int flag, struct FTW *s), int nopenfd, int flags)
+{
+ return 0;
+}
+#endif /* HAVE_FTW */
+#endif /* HAVE_NFTW */
+
+/*
+ * Find a block device with the right major/minor number.
+ * If we find multiple names, choose the shortest.
+ * If we find a name in /dev/md/, we prefer that.
+ * This applies only to names for MD devices.
+ * If 'prefer' is set (normally to e.g. /by-path/)
+ * then we prefer a name which contains that string.
+ */
+/*
+ * major, minor: device number to look up; 0:0 always yields NULL.
+ * create: if non-zero and no name is found, return "major:minor"
+ * formatted into a static buffer instead of NULL.
+ * prefer: optional substring; names containing it count as preferred.
+ *
+ * The returned pointer refers either to a name owned by devlist or to
+ * the static buffer, so the caller must not free it.
+ */
+char *map_dev_preferred(int major, int minor, int create,
+ char *prefer)
+{
+ struct devmap *p;
+ char *regular = NULL, *preferred=NULL;
+ int did_check = 0;
+
+ if (major == 0 && minor == 0)
+ return NULL;
+
+ retry:
+ if (!devlist_ready) {
+ char *dev = "/dev";
+ struct stat stb;
+ /* throw away any previous list before rebuilding it */
+ while(devlist) {
+ struct devmap *d = devlist;
+ devlist = d->next;
+ free(d->name);
+ free(d);
+ }
+ /* FTW_PHYS does not follow links, so walk "/dev/." if /dev is a symlink */
+ if (lstat(dev, &stb) == 0 && S_ISLNK(stb.st_mode))
+ dev = "/dev/.";
+ nftw(dev, add_dev, 10, FTW_PHYS);
+ devlist_ready=1;
+ did_check = 1;
+ }
+
+ /* keep the shortest matching name in each class */
+ for (p = devlist; p; p = p->next)
+ if (p->major == major && p->minor == minor) {
+ if (strncmp(p->name, "/dev/md/",8) == 0 ||
+ (prefer && strstr(p->name, prefer))) {
+ if (preferred == NULL ||
+ strlen(p->name) < strlen(preferred))
+ preferred = p->name;
+ } else {
+ if (regular == NULL ||
+ strlen(p->name) < strlen(regular))
+ regular = p->name;
+ }
+ }
+ /* nothing found in a list built by an earlier call: rebuild once and retry */
+ if (!regular && !preferred && !did_check) {
+ devlist_ready = 0;
+ goto retry;
+ }
+ if (create && !regular && !preferred) {
+ /* static: overwritten by the next call that takes this path */
+ static char buf[30];
+ snprintf(buf, sizeof(buf), "%d:%d", major, minor);
+ regular = buf;
+ }
+
+ return preferred ? preferred : regular;
+}
+
+/* conf_word gets one word from the conf file.
+ * if "allow_key", then accept words at the start of a line,
+ * otherwise stop when such a word is found.
+ * We assume that the file pointer is at the end of a word, so the
+ * next character is a space, or a newline. If not, it is the start of a line.
+ */
+
+/*
+ * file: stream to read from.
+ * allow_key: non-zero to accept a word that starts a line.
+ *
+ * Returns a heap-allocated, NUL-terminated word with quote characters
+ * removed, or NULL if no word was found.  The buffer comes from
+ * xmalloc()/xrealloc(); callers in this file release it with free().
+ */
+char *conf_word(FILE *file, int allow_key)
+{
+ int wsize = 100;
+ int len = 0;
+ int c;
+ int quote;
+ int wordfound = 0;
+ char *word = xmalloc(wsize);
+
+ while (wordfound == 0) {
+ /* at the end of a word.. */
+ c = getc(file);
+ /* a '#' starts a comment that runs to the end of the line */
+ if (c == '#')
+ while (c != EOF && c != '\n')
+ c = getc(file);
+ if (c == EOF)
+ break;
+ if (c == '\n')
+ continue;
+
+ /* a non-blank here is a word starting a line; leave it for the next call */
+ if (c != ' ' && c != '\t' && ! allow_key) {
+ ungetc(c, file);
+ break;
+ }
+ /* looks like it is safe to get a word here, if there is one */
+ quote = 0;
+ /* first, skip any spaces */
+ while (c == ' ' || c == '\t')
+ c = getc(file);
+ if (c != EOF && c != '\n' && c != '#') {
+ /* we really have a character of a word, so start saving it */
+ while (c != EOF && c != '\n' &&
+ (quote || (c != ' ' && c != '\t'))) {
+ wordfound = 1;
+ /* quote characters toggle quoting and are not stored */
+ if (quote && c == quote)
+ quote = 0;
+ else if (quote == 0 && (c == '\'' || c == '"'))
+ quote = c;
+ else {
+ /* grow in steps of 100, keeping one byte for the terminator */
+ if (len == wsize-1) {
+ wsize += 100;
+ word = xrealloc(word, wsize);
+ }
+ word[len++] = c;
+ }
+ c = getc(file);
+ /* Hack for broken kernels (2.6.14-.24) that put
+ * "active(auto-read-only)"
+ * in /proc/mdstat instead of
+ * "active (auto-read-only)"
+ */
+ if (c == '(' && len >= 6 &&
+ strncmp(word + len - 6, "active", 6) == 0)
+ c = ' ';
+ }
+ }
+ /* push the terminating character back for the next call */
+ if (c != EOF)
+ ungetc(c, file);
+ }
+ word[len] = 0;
+
+ /* Further HACK for broken kernels.. 2.6.14-2.6.24 */
+ /* restores the '(' that the hack above replaced with a space */
+ if (strcmp(word, "auto-read-only)") == 0)
+ strcpy(word, "(auto-read-only)");
+
+/* printf("word is <%s>\n", word); */
+ if (!wordfound) {
+ free(word);
+ word = NULL;
+ }
+ return word;
+}
+
+/*
+ * Print 'str' on stdout, quoted only if it needs it.
+ *
+ * A string without space, tab or quote characters is printed as is.
+ * Otherwise it is wrapped in " or '; whenever the quote character
+ * currently in use appears in the text, the quoted section is closed
+ * and a new one is opened with the other quote character.
+ */
+void print_quoted(char *str)
+{
+	char trigger = 0;	/* character that makes quoting necessary */
+	char q;
+	char *p;
+
+	/* The first quote character decides; blanks only count until then. */
+	for (p = str; *p; p++) {
+		if (*p == '\'' || *p == '"') {
+			trigger = *p;
+			break;
+		}
+		if (*p == ' ' || *p == '\t')
+			trigger = *p;
+	}
+
+	if (!trigger) {
+		printf("%s", str);
+		return;
+	}
+
+	/* open with the quote that differs from the first one in the text */
+	q = (trigger == '"') ? '\'' : '"';
+	putchar(q);
+	for (p = str; *p; p++) {
+		if (*p == q) {
+			/* close, switch to the other quote, reopen */
+			putchar(q);
+			q = (q == '"') ? '\'' : '"';
+			putchar(q);
+		}
+		putchar(*p);
+	}
+	putchar(q);
+}
+
+/*
+ * Print 'str' on stdout in a form suitable for device names:
+ * space and tab become '_', '/' becomes '-', everything else is
+ * printed unchanged.
+ */
+void print_escape(char *str)
+{
+	char *p;
+
+	for (p = str; *p; p++) {
+		if (*p == ' ' || *p == '\t')
+			putchar('_');
+		else if (*p == '/')
+			putchar('-');
+		else
+			putchar(*p);
+	}
+}
+
+/*
+ * Return 1 if the environment variable 'name' is set and atoi() of its
+ * value is 1; return 0 otherwise.
+ */
+int check_env(char *name)
+{
+	const char *value = getenv(name);
+
+	return (value != NULL && atoi(value) == 1) ? 1 : 0;
+}
+
+/*
+ * Return non-zero if udev should be used: /dev/.udev or /run/udev
+ * exists and MDADM_NO_UDEV is not set to 1.  The answer is worked out
+ * on the first call and reused afterwards.
+ */
+int use_udev(void)
+{
+	static int use = -1;
+	struct stat stb;
+
+	if (use >= 0)
+		return use;
+
+	use = 0;
+	if (stat("/dev/.udev", &stb) == 0 || stat("/run/udev", &stb) == 0)
+		use = (check_env("MDADM_NO_UDEV") == 0);
+
+	return use;
+}
+
+/**
+ * GCD() - Greatest common divisor of two unsigned values.
+ * @a: First value.
+ * @b: Second value.
+ *
+ * Uses the remainder form of Euclid's algorithm, so it terminates for
+ * every input, including a zero argument (the subtractive form never
+ * terminates when exactly one argument is zero).
+ *
+ * Return: the greatest common divisor; GCD(x, 0) == GCD(0, x) == x.
+ */
+unsigned long GCD(unsigned long a, unsigned long b)
+{
+	while (b != 0) {
+		unsigned long r = a % b;
+
+		a = b;
+		b = r;
+	}
+	return a;
+}
+
+/*
+ * conf_line reads one logical line from the conffile or mdstat.
+ * It skips comments and continues until it finds a line that starts
+ * with a non blank/comment. This character is pushed back for the next call
+ * A doubly linked list of words is returned.
+ * the first word will be a keyword. Other words will have had quotes removed.
+ */
+
+/*
+ * Read one logical line from 'file' and return it as a dlink list of
+ * words whose head is the first word.  Returns NULL when no further
+ * word can be read.
+ */
+char *conf_line(FILE *file)
+{
+	char *head;
+	char *word = conf_word(file, 1);
+
+	if (!word)
+		return NULL;
+
+	/* the first word becomes the list head */
+	head = dl_strdup(word);
+	free(word);
+	dl_init(head);
+
+	/* the remaining words of the line are added to the list */
+	for (word = conf_word(file, 0); word; word = conf_word(file, 0)) {
+		char *node = dl_strdup(word);
+
+		free(word);
+		dl_add(head, node);
+	}
+
+	return head;
+}
+
+/*
+ * Release a word list returned by conf_line(): unlink and free every
+ * word after the head, then free the head itself.
+ */
+void free_line(char *line)
+{
+	char *w;
+
+	while ((w = dl_next(line)) != line) {
+		dl_del(w);
+		dl_free(w);
+	}
+	dl_free(line);
+}
+
+/**
+ * parse_num() - Parse int from string.
+ * @dest: Pointer to destination.
+ * @num: Pointer to string that is going to be parsed.
+ *
+ * If string contains anything after a number, error code is returned.
+ * The same happens when number is bigger than INT_MAX or smaller than 0.
+ * Writes to destination only if successfully read the number.
+ *
+ * Return: 0 on success, 1 otherwise.
+ */
+int parse_num(int *dest, char *num)
+{
+	char *end = NULL;
+	long value;
+
+	if (num == NULL)
+		return 1;
+
+	errno = 0;
+	value = strtol(num, &end, 10);
+
+	/* conversion error, nothing consumed, or trailing characters */
+	if (errno != 0 || end == num || *end != '\0')
+		return 1;
+	/* must be representable as a non-negative int */
+	if (value < 0 || value > INT_MAX)
+		return 1;
+
+	*dest = value;
+	return 0;
+}
diff --git a/makedist b/makedist
new file mode 100755
index 0000000..0c4b39e
--- /dev/null
+++ b/makedist
@@ -0,0 +1,96 @@
+#!/bin/sh
+# Build a release tarball of mdadm from git HEAD.
+# Usage: makedist [test|diff]
+#   (none)  build the tarball and RPMs under $target, then sign and upload
+#   test    the same, but into /tmp/mdadm-test and without signing/upload
+#   diff    compare an existing tarball in $target with the working tree
+# avoid silly sorting
+export LANG=C
+arg=$1
+target=~/public_html/source/mdadm
+if [ " $arg" = " test" ]
+then
+ target=/tmp/mdadm-test
+ rm -rf $target
+ mkdir -p $target
+fi
+if [ -d $target ]
+then :
+else echo $target is not a directory
+ exit 2
+fi
+# Take the version from the '#define VERSION' line in ReadMe.c ($3 is its
+# third word) and strip the double quotes.
+set `grep '^#define VERSION' ReadMe.c `
+version=`echo $3 | sed -e 's/"//g'`
+# The man pages and the spec file must all mention this version.
+grep "^.TH MDADM 8 .. v$version" mdadm.8.in > /dev/null 2>&1 ||
+ {
+ echo mdadm.8.in does not mention version $version.
+ exit 1
+ }
+grep "^.TH MDMON 8 .. v$version" mdmon.8 > /dev/null 2>&1 ||
+ {
+ echo mdmon.8 does not mention version $version.
+ exit 1
+ }
+# the spec file is checked against the version with '-' replaced by '_'
+rpmv=`echo $version | tr - _`
+grep "^Version: *$rpmv$" mdadm.spec > /dev/null 2>&1 ||
+ {
+ echo mdadm.spec does not mention version $version.
+ exit 1
+ }
+if [ -f ANNOUNCE-$version ]
+then :
+else
+ echo ANNOUNCE-$version does not exist
+ exit 1
+fi
+# Add the ANNOUNCE file to the inventory if it is not listed yet.
+if grep "^ANNOUNCE-$version\$" inventory
+then :
+else { cat inventory ; echo ANNOUNCE-$version ; } | sort -o inventory
+fi
+
+echo version = $version
+base=mdadm-$rpmv.tar.gz
+if [ " $arg" != " diff" ]
+then
+ # never overwrite an existing tarball
+ if [ -f $target/$base ]
+ then
+ echo $target/$base exists.
+ exit 1
+ fi
+ # remove a partial tarball if interrupted (HUP, INT, QUIT)
+ trap "rm $target/$base; exit" 1 2 3
+ git archive --prefix=mdadm-$rpmv/ HEAD | gzip --best > $target/$base
+ chmod a+r $target/$base
+ ls -l $target/$base
+ # the tarball's file list must match 'inventory' exactly
+ if tar tzf $target/$base | sed 's,[^/]*/,,' | sort | diff -u inventory -
+ then : correct files found
+ else echo "Extra files, or inventory is out-of-date"
+ rm $target/$base
+ exit 1
+ fi
+ rpmbuild -ta $target/$base || exit 1
+ find ~/rpmbuild/RPMS -name "*mdadm-$version-*" \
+ -exec cp {} $target/RPM \;
+ cp ANNOUNCE-$version $target/ANNOUNCE
+ cp ChangeLog $target/ChangeLog
+ if [ " $arg" != " test" ]
+ then
+ echo -n "Confirm signing this release? "
+ read a
+ if [ " $a" != " y" ]; then echo OK - bye. ; exit 1; fi
+ # the signature is made over the uncompressed tar stream
+ if zcat $target/$base | gpg -ba > $target/$base.sign && gpg -ba $target/ANNOUNCE
+ then
+ kup put $target/$base $target/$base.sign \
+ /pub/linux/utils/raid/mdadm/mdadm-$version.tar.gz
+ kup put $target/ANNOUNCE $target/ANNOUNCE.asc /pub/linux/utils/raid/mdadm/ANNOUNCE
+ else
+ echo signing failed
+ exit 1
+ fi
+ fi
+else
+ # 'diff' mode: needs a previously built tarball to compare against
+ if [ ! -f $target/$base ]
+ then
+ echo $target/$base does not exist.
+ exit 1
+ fi
+ # pack the working tree (through a temporary ../mdadm-$version symlink to mdadm.v2)
+ ( cd .. ; ln -s mdadm.v2 mdadm-$version ; tar chf - --exclude=.git --exclude="TAGS" --exclude='*,v' --exclude='*~' --exclude='*.o' --exclude mdadm --exclude=mdadm'.[^ch0-9]' --exclude=RCS mdadm-$version ; rm mdadm-$version ) | gzip --best > /var/tmp/mdadm-new.tgz
+ mkdir /var/tmp/mdadm-old ; zcat $target/$base | ( cd /var/tmp/mdadm-old ; tar xf - )
+ mkdir /var/tmp/mdadm-new ; zcat /var/tmp/mdadm-new.tgz | ( cd /var/tmp/mdadm-new ; tar xf - )
+ diff -ru /var/tmp/mdadm-old /var/tmp/mdadm-new
+ rm -rf /var/tmp/mdadm-old /var/tmp/mdadm-new /var/tmp/mdadm-new.tgz
+fi
diff --git a/managemon.c b/managemon.c
new file mode 100644
index 0000000..bb7334c
--- /dev/null
+++ b/managemon.c
@@ -0,0 +1,943 @@
+/*
+ * mdmon - monitor external metadata arrays
+ *
+ * Copyright (C) 2007-2009 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2007-2009 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+/*
+ * The management thread for monitoring active md arrays.
+ * This thread does things which might block such as memory
+ * allocation.
+ * In particular:
+ *
+ * - Find out about new arrays in this container.
+ * Allocate the data structures and open the files.
+ *
+ * For this we watch /proc/mdstat and find new arrays with
+ * metadata type that confirms sharing. e.g. "md4"
+ * When we find a new array we slip it into the list of
+ * arrays and signal 'monitor' by writing to a pipe.
+ *
+ * - Respond to reshape requests by allocating new data structures
+ * and opening new files.
+ *
+ * These come as a change to raid_disks. We allocate a new
+ * version of the data structures and slip it into the list.
+ * 'monitor' will notice and release the old version.
+ * Changes to level, chunksize, layout.. do not need re-allocation.
+ * Reductions in raid_disks don't really either, but we handle
+ * them the same way for consistency.
+ *
+ * - When a device is added to the container, we add it to the metadata
+ * as a spare.
+ *
+ * - Deal with degraded array
+ * We only do this when first noticing the array is degraded.
+ * This can be when we first see the array, when sync completes or
+ * when recovery completes.
+ *
+ * Check if number of failed devices suggests recovery is needed, and
+ * skip if not.
+ * Ask metadata to allocate a spare device
+ * Add device as not in_sync and give a role
+ * Update metadata.
+ * Open sysfs files and pass to monitor.
+ * Make sure that monitor Starts recovery....
+ *
+ * - Pass on metadata updates from external programs such as
+ * mdadm creating a new array.
+ *
+ * This is most-messy.
+ * It might involve adding a new array or changing the status of
+ * a spare, or any reconfig that the kernel doesn't get involved in.
+ *
+ * The required updates are received via a named pipe. There will
+ * be one named pipe for each container. Each message contains a
+ * sync marker: 0x5a5aa5a5, A byte count, and the message. This is
+ * passed to the metadata handler which will interpret and process it.
+ * For 'DDF' messages are internal data blocks with the leading
+ * 'magic number' signifying what sort of data it is.
+ *
+ */
+
+/*
+ * We select on /proc/mdstat and the named pipe.
+ * We create new arrays or updated version of arrays and slip
+ * them into the head of the list, then signal 'monitor' via a pipe write.
+ * 'monitor' will notice and place the old array on a return list.
+ * Metadata updates are placed on a queue just like they arrive
+ * from the named pipe.
+ *
+ * When new arrays are found based on correct metadata string, we
+ * need to identify them with an entry in the metadata. Maybe we require
+ * the metadata to be mdX/NN when NN is the index into an appropriate table.
+ *
+ */
+
+/*
+ * List of tasks:
+ * - Watch for spares to be added to the container, and write updated
+ * metadata to them.
+ * - Watch for new arrays using this container, confirm they match metadata
+ * and if so, start monitoring them
+ * - Watch for spares being added to monitored arrays. This shouldn't
+ * happen, as we should do all the adding. Just remove them.
+ * - Watch for change in raid-disks, chunk-size, etc. Update metadata and
+ * start a reshape.
+ */
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include "mdadm.h"
+#include "mdmon.h"
+#include <sys/syscall.h>
+#include <sys/socket.h>
+#include <signal.h>
+
+/*
+ * Close every file descriptor held by 'aa': the four per-device
+ * descriptors of each member (closed unconditionally) and the
+ * array-level descriptors (closed only when >= 0).
+ */
+static void close_aa(struct active_array *aa)
+{
+	const int array_fds[] = {
+		aa->action_fd,
+		aa->info.state_fd,
+		aa->resync_start_fd,
+		aa->metadata_fd,
+		aa->sync_completed_fd,
+		aa->safe_mode_delay_fd,
+	};
+	struct mdinfo *dev;
+	unsigned int i;
+
+	for (dev = aa->info.devs; dev; dev = dev->next) {
+		close(dev->recovery_fd);
+		close(dev->state_fd);
+		close(dev->bb_fd);
+		close(dev->ubb_fd);
+	}
+
+	for (i = 0; i < sizeof(array_fds) / sizeof(array_fds[0]); i++)
+		if (array_fds[i] >= 0)
+			close(array_fds[i]);
+}
+
+/*
+ * Free 'aa' and its device list.
+ * The file descriptors are closed only when ->container is NULL;
+ * otherwise they may still be in use by a clone and are left open.
+ */
+static void free_aa(struct active_array *aa)
+{
+	struct mdinfo *dev, *following;
+
+	dprintf("sys_name: %s\n", aa->info.sys_name);
+	if (aa->container == NULL)
+		close_aa(aa);
+
+	for (dev = aa->info.devs; dev; dev = following) {
+		following = dev->next;
+		free(dev);
+	}
+	free(aa);
+}
+
+/*
+ * Return a newly allocated copy of 'aa' that is not linked into any
+ * list (->next, ->replaces and ->info.next are NULL).  The device list
+ * is copied in the same order, leaving out devices whose state_fd is
+ * negative.  File descriptors are shared with the original.
+ */
+static struct active_array *duplicate_aa(struct active_array *aa)
+{
+	struct active_array *copy = xmalloc(sizeof(*copy));
+	struct mdinfo **tail;
+	struct mdinfo *src;
+
+	*copy = *aa;
+	copy->next = NULL;
+	copy->replaces = NULL;
+	copy->info.next = NULL;
+
+	tail = &copy->info.devs;
+	for (src = aa->info.devs; src; src = src->next) {
+		struct mdinfo *dup;
+
+		if (src->state_fd < 0)
+			continue;
+
+		dup = xmalloc(sizeof(*dup));
+		*dup = *src;
+		*tail = dup;
+		tail = &dup->next;
+	}
+	*tail = NULL;
+
+	return copy;
+}
+
+/*
+ * Send SIGUSR1 to the thread whose id is mon_tid within this process.
+ * Uses the raw system call, equivalent to
+ * tgkill(getpid(), mon_tid, SIGUSR1).
+ */
+static void wakeup_monitor(void)
+{
+	syscall(SYS_tgkill, getpid(), mon_tid, SIGUSR1);
+}
+
+/*
+ * If an array is waiting in 'discard_this', free it, clear
+ * 'pending_discard' if it referred to the same array, and wake the
+ * monitor.  Does nothing when 'discard_this' is NULL.
+ */
+static void remove_old(void)
+{
+	if (discard_this == NULL)
+		return;
+
+	discard_this->next = NULL;
+	free_aa(discard_this);
+	if (pending_discard == discard_this)
+		pending_discard = NULL;
+	discard_this = NULL;
+	wakeup_monitor();
+}
+
+/*
+ * Put 'new' at the head of container->arrays as the replacement for
+ * 'old' and wake the monitor.  If an earlier replacement is still
+ * pending, wait for it to be discarded first.
+ */
+static void replace_array(struct supertype *container,
+ struct active_array *old,
+ struct active_array *new)
+{
+ /* To replace an array, we add it to the top of the list
+ * marked with ->replaces to point to the original.
+ * 'monitor' will take the original out of the list
+ * and put it on 'discard_this'. We take it from there
+ * and discard it.
+ */
+ remove_old();
+ /* poll once a second until discard_this is set, then free that array */
+ while (pending_discard) {
+ while (discard_this == NULL)
+ sleep(1);
+ remove_old();
+ }
+ pending_discard = old;
+ new->replaces = old;
+ new->next = container->arrays;
+ container->arrays = new;
+ wakeup_monitor();
+}
+
+/* Updates currently handed over for processing; refilled from
+ * update_queue_pending by check_update_queue() once it is NULL. */
+struct metadata_update *update_queue = NULL;
+/* Updates that are finished with; freed by check_update_queue(). */
+struct metadata_update *update_queue_handled = NULL;
+/* Updates appended by queue_metadata_update(), not yet handed over. */
+struct metadata_update *update_queue_pending = NULL;
+
+/*
+ * Free every update on the list '*update', including each one's buf,
+ * space and chained space_list blocks.  '*update' is NULL on return.
+ */
+static void free_updates(struct metadata_update **update)
+{
+	struct metadata_update *mu;
+
+	while ((mu = *update) != NULL) {
+		void **chunk = mu->space_list;
+
+		*update = mu->next;
+		free(mu->buf);
+		free(mu->space);
+		/* each space_list block starts with a pointer to the next */
+		while (chunk) {
+			void **following = *chunk;
+
+			free(chunk);
+			chunk = following;
+		}
+		free(mu);
+	}
+}
+
+/*
+ * Free the handled updates, then, if update_queue is empty and
+ * updates are pending, move the pending list to update_queue and wake
+ * the monitor.  'container' is not used.
+ */
+void check_update_queue(struct supertype *container)
+{
+	free_updates(&update_queue_handled);
+
+	if (update_queue != NULL || update_queue_pending == NULL)
+		return;
+
+	update_queue = update_queue_pending;
+	update_queue_pending = NULL;
+	wakeup_monitor();
+}
+
+/* Append 'mu' to the end of the update_queue_pending list. */
+static void queue_metadata_update(struct metadata_update *mu)
+{
+	struct metadata_update *last = update_queue_pending;
+
+	if (last == NULL) {
+		update_queue_pending = mu;
+		return;
+	}
+
+	while (last->next)
+		last = last->next;
+	last->next = mu;
+}
+
+/*
+ * Link 'sd' into the container's device list, then open the device by
+ * its "major:minor" name and have the metadata handler add it to the
+ * superblock.  The resulting update is queued for the monitor.
+ * If the device cannot be opened, 'sd' stays on the list but no
+ * metadata update is made.
+ */
+static void add_disk_to_container(struct supertype *st, struct mdinfo *sd)
+{
+	int dfd;
+	/* large enough for two full-width ints, the ':' and the NUL */
+	char nm[32];
+	struct metadata_update *update = NULL;
+	mdu_disk_info_t dk = {
+		.number = -1,
+		.major = sd->disk.major,
+		.minor = sd->disk.minor,
+		.raid_disk = -1,
+		.state = 0,
+	};
+
+	dprintf("add %d:%d to container\n", sd->disk.major, sd->disk.minor);
+
+	sd->next = st->devs;
+	st->devs = sd;
+
+	snprintf(nm, sizeof(nm), "%d:%d", sd->disk.major, sd->disk.minor);
+	dfd = dev_open(nm, O_RDWR);
+	if (dfd < 0)
+		return;
+
+	/* collect the handler's updates on 'update' while update_tail is set */
+	st->update_tail = &update;
+	st->ss->add_to_super(st, &dk, dfd, NULL, INVALID_SECTORS);
+	st->ss->write_init_super(st);
+	queue_metadata_update(update);
+	st->update_tail = NULL;
+}
+
+/*
+ * Create and queue update structure about the removed disks.
+ * The update is prepared by super type handler and passed to the monitor
+ * thread.
+ */
+static void remove_disk_from_container(struct supertype *st, struct mdinfo *sd)
+{
+ struct metadata_update *update = NULL;
+ /* identify the disk by device number only; no slot number or role */
+ mdu_disk_info_t dk = {
+ .number = -1,
+ .major = sd->disk.major,
+ .minor = sd->disk.minor,
+ .raid_disk = -1,
+ .state = 0,
+ };
+ dprintf("remove %d:%d from container\n",
+ sd->disk.major, sd->disk.minor);
+
+ /* collect the handler's updates on 'update' while update_tail is set */
+ st->update_tail = &update;
+ st->ss->remove_from_super(st, &dk);
+ /* FIXME this write_init_super shouldn't be here.
+ * We have it after add_to_super to write to new device,
+ * but with 'remove' we don't want to write to that device!
+ */
+ st->ss->write_init_super(st);
+ queue_metadata_update(update);
+ st->update_tail = NULL;
+}
+
+/*
+ * Bring container->devs in line with the devices sysfs reports for the
+ * container.  Only does anything when mdstat's device count differs
+ * from the cached container->devcnt.
+ */
+static void manage_container(struct mdstat_ent *mdstat,
+ struct supertype *container)
+{
+ /* Of interest here are:
+ * - if a new device has been added to the container, we
+ * add it to the array ignoring any metadata on it.
+ * - if a device has been removed from the container, we
+ * remove it from the device list and update the metadata.
+ * FIXME should we look for compatible metadata and take hints
+ * about spare assignment.... probably not.
+ */
+ if (mdstat->devcnt != container->devcnt) {
+ struct mdinfo **cdp, *cd, *di, *mdi;
+ int found;
+
+ /* read /sys/block/NAME/md/dev-??/block/dev to find out
+ * what is there, and compare with container->info.devs
+ * To see what is removed and what is added.
+ * These need to be remove from, or added to, the array
+ */
+ mdi = sysfs_read(-1, mdstat->devnm, GET_DEVS);
+ if (!mdi) {
+ /* invalidate the current count so we can try again */
+ container->devcnt = -1;
+ return;
+ }
+
+ /* check for removals */
+ /* cdp points at the link to the current entry, so it can be unlinked in place */
+ for (cdp = &container->devs; *cdp; ) {
+ found = 0;
+ for (di = mdi->devs; di; di = di->next)
+ if (di->disk.major == (*cdp)->disk.major &&
+ di->disk.minor == (*cdp)->disk.minor) {
+ found = 1;
+ break;
+ }
+ if (!found) {
+ cd = *cdp;
+ *cdp = (*cdp)->next;
+ remove_disk_from_container(container, cd);
+ free(cd);
+ } else
+ cdp = &(*cdp)->next;
+ }
+
+ /* check for additions */
+ for (di = mdi->devs; di; di = di->next) {
+ for (cd = container->devs; cd; cd = cd->next)
+ if (di->disk.major == cd->disk.major &&
+ di->disk.minor == cd->disk.minor)
+ break;
+ if (!cd) {
+ /* add a private copy; 'di' is freed with mdi below */
+ struct mdinfo *newd = xmalloc(sizeof(*newd));
+
+ *newd = *di;
+ add_disk_to_container(container, newd);
+ }
+ }
+ sysfs_free(mdi);
+ container->devcnt = mdstat->devcnt;
+ }
+}
+
+/*
+ * Open a sysfs attribute with sysfs_open() and do one read on it.
+ * Returns the descriptor from sysfs_open() (negative on failure),
+ * whatever the outcome of the read.
+ */
+static int sysfs_open2(char *devnum, char *name, char *attr)
+{
+	char scratch[200];
+	int fd = sysfs_open(devnum, name, attr);
+
+	if (fd < 0)
+		return fd;
+
+	/* seq_file in the kernel allocates buffer space
+	 * on the first read.  Do that now so 'monitor'
+	 * never needs to.
+	 */
+	if (read(fd, scratch, sizeof(scratch)) < 0)
+		/* pretend not to ignore return value */
+		return fd;
+
+	return fd;
+}
+
+/*
+ * Initialise 'disk' as a copy of 'clone', open its four sysfs
+ * attribute files and link it at the head of aa->info.devs.
+ *
+ * Returns 0 on success.  Returns -1 if 'disk' or 'clone' is NULL or an
+ * attribute cannot be opened; in that case the descriptors opened so
+ * far are closed again and 'disk' is NOT linked into the array.
+ */
+static int disk_init_and_add(struct mdinfo *disk, struct mdinfo *clone,
+			     struct active_array *aa)
+{
+	char *attrs[] = {
+		"recovery_start",
+		"state",
+		"bad_blocks",
+		"unacknowledged_bad_blocks",
+	};
+	int *slots[4];
+	int i, j;
+
+	if (!disk || !clone)
+		return -1;
+
+	*disk = *clone;
+
+	/* descriptor fields, in the same order as attrs[] */
+	slots[0] = &disk->recovery_fd;
+	slots[1] = &disk->state_fd;
+	slots[2] = &disk->bb_fd;
+	slots[3] = &disk->ubb_fd;
+
+	for (i = 0; i < 4; i++) {
+		*slots[i] = sysfs_open2(aa->info.sys_name, disk->sys_name,
+					attrs[i]);
+		if (*slots[i] < 0) {
+			/* undo the opens that already succeeded */
+			for (j = 0; j < i; j++)
+				close(*slots[j]);
+			return -1;
+		}
+	}
+
+	disk->prev_state = read_dev_state(disk->state_fd);
+	disk->curr_state = disk->prev_state;
+	disk->next = aa->info.devs;
+	aa->info.devs = disk;
+
+	return 0;
+}
+
+/*
+ * Bring the manager's view of member array 'a' up to date with
+ * 'mdstat' and act on requests flagged by the monitor:
+ *  - refresh raid_disks, component size and consistency policy;
+ *  - mark the array for removal, or swap in a copy with the new level,
+ *    when the level reported by mdstat differs;
+ *  - on SIGTERM, write a safe_mode_delay of 0.001;
+ *  - when check_degraded is set, try to activate spares;
+ *  - when check_reshape is set, pick up devices added to the array.
+ */
+static void manage_member(struct mdstat_ent *mdstat,
+			  struct active_array *a)
+{
+	/* Compare mdstat info with known state of member array.
+	 * We do not need to look for device state changes here, that
+	 * is dealt with by the monitor.
+	 *
+	 * If a reshape is being requested, monitor will have noticed
+	 * that sync_action changed and will have set check_reshape.
+	 * We just need to see if new devices have appeared.  All metadata
+	 * updates will already have been processed.
+	 *
+	 * We also want to handle degraded arrays here by
+	 * trying to find and assign a spare.
+	 * We do that whenever the monitor tells us too.
+	 */
+	char buf[64];
+	int frozen;
+	struct supertype *container = a->container;
+	struct mdinfo *mdi;
+
+	if (container == NULL)
+		/* Raced with something */
+		return;
+
+	if (mdstat->active) {
+		// FIXME
+		a->info.array.raid_disks = mdstat->raid_disks;
+		// MORE
+	}
+
+	mdi = sysfs_read(-1, mdstat->devnm,
+			 GET_COMPONENT|GET_CONSISTENCY_POLICY);
+	if (mdi) {
+		a->info.component_size = mdi->component_size;
+		a->info.consistency_policy = mdi->consistency_policy;
+		sysfs_free(mdi);
+	}
+
+	/* honor 'frozen' */
+	/* NOTE(review): assumes the string read is at least 10 characters
+	 * long, so that buf[9] is meaningful - TODO confirm */
+	if (sysfs_get_str(&a->info, NULL, "metadata_version", buf, sizeof(buf)) > 0)
+		frozen = buf[9] == '-';
+	else
+		frozen = 1; /* can't read metadata_version assume the worst */
+
+	/* If sync_action is not 'idle' then don't try recovery now */
+	if (!frozen &&
+	    sysfs_get_str(&a->info, NULL, "sync_action",
+			  buf, sizeof(buf)) > 0 && strncmp(buf, "idle", 4) != 0)
+		frozen = 1;
+
+	if (mdstat->level) {
+		int level = map_name(pers, mdstat->level);
+		if (level == 0 || level == LEVEL_LINEAR) {
+			a->to_remove = 1;
+			wakeup_monitor();
+			return;
+		}
+		else if (a->info.array.level != level && level > 0) {
+			struct active_array *newa = duplicate_aa(a);
+			if (newa) {
+				newa->info.array.level = level;
+				replace_array(container, a, newa);
+				a = newa;
+			}
+		}
+	}
+
+	/* we are after monitor kick,
+	 * so container field can be cleared - check it again
+	 */
+	if (a->container == NULL)
+		return;
+
+	if (sigterm && a->info.safe_mode_delay != 1 &&
+	    a->safe_mode_delay_fd >= 0) {
+		long int new_delay = 1;
+		char delay[10];
+		ssize_t len;
+
+		len = snprintf(delay, sizeof(delay), "0.%03ld\n", new_delay);
+		if (write(a->safe_mode_delay_fd, delay, len) == len)
+			a->info.safe_mode_delay = new_delay;
+	}
+
+	/* We don't check the array while any update is pending, as it
+	 * might container a change (such as a spare assignment) which
+	 * could affect our decisions.
+	 */
+	if (a->check_degraded && !frozen &&
+	    update_queue == NULL && update_queue_pending == NULL) {
+		struct metadata_update *updates = NULL;
+		struct mdinfo *newdev = NULL;
+		struct active_array *newa;
+		struct mdinfo *d;
+
+		a->check_degraded = 0;
+
+		/* The array may not be degraded, this is just a good time
+		 * to check.
+		 */
+		newdev = container->ss->activate_spare(a, &updates);
+		if (!newdev)
+			return;
+
+		newa = duplicate_aa(a);
+		if (!newa)
+			goto out;
+		/* prevent the kernel from activating the disk(s) before we
+		 * finish adding them
+		 */
+		dprintf("freezing %s\n", a->info.sys_name);
+		sysfs_set_str(&a->info, NULL, "sync_action", "frozen");
+
+		/* Add device to array and set offset/size/slot.
+		 * and open files for each newdev */
+		for (d = newdev; d ; d = d->next) {
+			struct mdinfo *newd;
+
+			newd = xmalloc(sizeof(*newd));
+			if (sysfs_add_disk(&newa->info, d, 0) < 0) {
+				free(newd);
+				continue;
+			}
+			/* on failure newd was not linked into newa, so
+			 * it is still ours to free */
+			if (disk_init_and_add(newd, d, newa) < 0)
+				free(newd);
+		}
+		queue_metadata_update(updates);
+		updates = NULL;
+		/* wait until the queued updates have been consumed */
+		while (update_queue_pending || update_queue) {
+			check_update_queue(container);
+			usleep(15*1000);
+		}
+		replace_array(container, a, newa);
+		if (sysfs_set_str(&a->info, NULL,
+				  "sync_action", "recover") == 0)
+			newa->prev_action = recover;
+		dprintf("recovery started on %s\n", a->info.sys_name);
+ out:
+		while (newdev) {
+			d = newdev->next;
+			free(newdev);
+			newdev = d;
+		}
+		free_updates(&updates);
+	}
+
+	if (a->check_reshape) {
+		/* mdadm might have added some devices to the array.
+		 * We want to disk_init_and_add any such device to a
+		 * duplicate_aa and replace a with that.
+		 * mdstat doesn't have enough info so we sysfs_read
+		 * and look for new stuff.
+		 */
+		struct mdinfo *info, *d, *d2, *newd;
+		unsigned long long array_size;
+		struct active_array *newa = NULL;
+		a->check_reshape = 0;
+		info = sysfs_read(-1, mdstat->devnm,
+				  GET_DEVS|GET_OFFSET|GET_SIZE|GET_STATE);
+		if (!info)
+			goto out2;
+		for (d = info->devs; d; d = d->next) {
+			if (d->disk.raid_disk < 0)
+				continue;
+			for (d2 = a->info.devs; d2; d2 = d2->next)
+				if (d2->disk.raid_disk ==
+				    d->disk.raid_disk)
+					break;
+			if (d2)
+				/* already have this one */
+				continue;
+			if (!newa) {
+				newa = duplicate_aa(a);
+				if (!newa)
+					break;
+			}
+			newd = xmalloc(sizeof(*newd));
+			/* on failure newd was not linked into newa, so
+			 * it is still ours to free */
+			if (disk_init_and_add(newd, d, newa) < 0)
+				free(newd);
+		}
+		if (sysfs_get_ll(info, NULL, "array_size", &array_size) == 0 &&
+		    a->info.custom_array_size > array_size*2) {
+			sysfs_set_num(info, NULL, "array_size",
+				      a->info.custom_array_size/2);
+		}
+ out2:
+		sysfs_free(info);
+		if (newa)
+			replace_array(container, a, newa);
+	}
+}
+
+/* Report whether @aa has every file descriptor open that this file
+ * checks before handing an array to the metadata handler.
+ *
+ * Returns 1 only when each member device has a state_fd, the array has
+ * a state_fd and a container, and - for levels above 0 - both action_fd
+ * and resync_start_fd are open.  Returns 0 otherwise.
+ */
+static int aa_ready(struct active_array *aa)
+{
+	const struct mdinfo *dev = aa->info.devs;
+
+	while (dev) {
+		if (dev->state_fd < 0)
+			return 0;
+		dev = dev->next;
+	}
+
+	if (aa->info.state_fd < 0 || !aa->container)
+		return 0;
+
+	if (aa->info.array.level <= 0)
+		return 1;
+
+	return aa->action_fd >= 0 && aa->resync_start_fd >= 0;
+}
+
+/* Start monitoring an array that showed up in @container.
+ *
+ * @mdstat:    mdstat entry describing the array
+ * @container: container the array belongs to
+ * @victim:    existing active_array to replace, or NULL for a brand new
+ *             array (only then is reshape_position read from sysfs)
+ *
+ * Builds a fresh struct active_array from sysfs, opens the sysfs
+ * attribute files used for monitoring and, if aa_ready() and the
+ * metadata handler's open_new() both accept it, installs it with
+ * replace_array().  On any failure the new structure is released and
+ * an error is printed.
+ */
+static void manage_new(struct mdstat_ent *mdstat,
+ struct supertype *container,
+ struct active_array *victim)
+{
+ /* A new array has appeared in this container.
+ * Hopefully it is already recorded in the metadata.
+ * Check, then create the new array to report it to
+ * the monitor.
+ */
+
+ struct active_array *new = NULL;
+ struct mdinfo *mdi = NULL, *di;
+ int i, inst;
+ int failed = 0;
+ char buf[40];
+
+ /* check if array is ready to be monitored */
+ if (!mdstat->active || !mdstat->level)
+ return;
+ /* levels whose name starts with "raid0" or "linear" are not monitored */
+ if (strncmp(mdstat->level, "raid0", strlen("raid0")) == 0 ||
+ strncmp(mdstat->level, "linear", strlen("linear")) == 0)
+ return;
+
+ mdi = sysfs_read(-1, mdstat->devnm,
+ GET_LEVEL|GET_CHUNK|GET_DISKS|GET_COMPONENT|
+ GET_SAFEMODE|GET_DEVS|GET_OFFSET|GET_SIZE|GET_STATE|
+ GET_LAYOUT|GET_DEVS_ALL);
+
+ if (!mdi)
+ return;
+ new = xcalloc(1, sizeof(*new));
+
+ strcpy(new->info.sys_name, mdstat->devnm);
+
+ new->prev_state = new->curr_state = new->next_state = inactive;
+ new->prev_action= new->curr_action= new->next_action= idle;
+
+ new->container = container;
+
+ /* the subarray part of the metadata version must parse as a number */
+ if (parse_num(&inst, to_subarray(mdstat, container->devnm)) != 0)
+ goto error;
+
+ new->info.array = mdi->array;
+ new->info.component_size = mdi->component_size;
+
+ /* Walk the raid slots in order; di is NULL when no device reported
+ * by sysfs occupies slot i.
+ */
+ for (i = 0; i < new->info.array.raid_disks; i++) {
+ struct mdinfo *newd = xmalloc(sizeof(*newd));
+
+ for (di = mdi->devs; di; di = di->next)
+ if (i == di->disk.raid_disk)
+ break;
+
+ if (disk_init_and_add(newd, di, new) != 0) {
+ if (newd)
+ free(newd);
+
+ failed++;
+ if (failed > new->info.array.failed_disks) {
+ /* we cannot properly monitor without all working disks */
+ /* clearing ->container makes aa_ready() reject the array below */
+ new->container = NULL;
+ break;
+ }
+ }
+ }
+
+ new->action_fd = sysfs_open2(new->info.sys_name, NULL, "sync_action");
+ new->info.state_fd = sysfs_open2(new->info.sys_name, NULL, "array_state");
+ new->resync_start_fd = sysfs_open2(new->info.sys_name, NULL, "resync_start");
+ new->metadata_fd = sysfs_open2(new->info.sys_name, NULL, "metadata_version");
+ new->sync_completed_fd = sysfs_open2(new->info.sys_name, NULL, "sync_completed");
+ new->safe_mode_delay_fd = sysfs_open2(new->info.sys_name, NULL,
+ "safe_mode_delay");
+
+ dprintf("inst: %d action: %d state: %d\n", inst,
+ new->action_fd, new->info.state_fd);
+
+ if (mdi->safe_mode_delay >= 50)
+ /* Normal start, mdadm set this. */
+ new->info.safe_mode_delay = mdi->safe_mode_delay;
+ else
+ /* Restart, just pick a number */
+ new->info.safe_mode_delay = 5000;
+ sysfs_set_safemode(&new->info, new->info.safe_mode_delay);
+
+ /* reshape_position is set by mdadm in sysfs
+ * read this information for new arrays only (empty victim)
+ */
+ if ((victim == NULL) &&
+ (sysfs_get_str(mdi, NULL, "sync_action", buf, 40) > 0) &&
+ (strncmp(buf, "reshape", 7) == 0)) {
+ if (sysfs_get_ll(mdi, NULL, "reshape_position",
+ &new->last_checkpoint) != 0)
+ new->last_checkpoint = 0;
+ else {
+ /* scale the position by the number of data disks:
+ * one less for levels 4/5, two less for level 6
+ */
+ int data_disks = mdi->array.raid_disks;
+ if (mdi->array.level == 4 || mdi->array.level == 5)
+ data_disks--;
+ if (mdi->array.level == 6)
+ data_disks -= 2;
+
+ /* NOTE(review): data_disks is not checked for zero here */
+ new->last_checkpoint /= data_disks;
+ }
+ dprintf("mdmon: New monitored array is under reshape.\n"
+ " Last checkpoint is: %llu\n",
+ new->last_checkpoint);
+ }
+
+ sysfs_free(mdi);
+ mdi = NULL;
+
+ /* if everything checks out tell the metadata handler we want to
+ * manage this instance
+ */
+ if (!aa_ready(new) || container->ss->open_new(container, new, inst) < 0) {
+ goto error;
+ } else {
+ replace_array(container, victim, new);
+ if (failed) {
+ /* some slots had no usable device: run the degraded check now */
+ new->check_degraded = 1;
+ manage_member(mdstat, new);
+ }
+ }
+ return;
+
+error:
+ pr_err("failed to monitor %s\n", mdstat->metadata_version);
+ if (new) {
+ new->container = NULL;
+ free_aa(new);
+ }
+ if (mdi)
+ sysfs_free(mdi);
+}
+
+/* Compare a freshly read mdstat list with the arrays known to
+ * @container and dispatch each entry:
+ *  - the container itself goes to manage_container()
+ *  - entries that are not members of this container are ignored
+ *  - a known member that still has a container and is not marked
+ *    to_remove goes to manage_member()
+ *  - an unknown member, or a known one without a container, goes to
+ *    manage_new() unless sigterm is set
+ */
+void manage(struct mdstat_ent *mdstat, struct supertype *container)
+{
+	struct mdstat_ent *ent;
+
+	for (ent = mdstat; ent; ent = ent->next) {
+		struct active_array *known;
+
+		if (strcmp(ent->devnm, container->devnm) == 0) {
+			manage_container(ent, container);
+			continue;
+		}
+
+		if (!is_container_member(ent, container->devnm))
+			continue;
+
+		/* find the first active array with this device name */
+		known = container->arrays;
+		while (known && strcmp(ent->devnm, known->info.sys_name) != 0)
+			known = known->next;
+
+		if (known && known->container && known->to_remove == 0)
+			manage_member(ent, known);
+
+		if ((!known || !known->container) && !sigterm)
+			manage_new(ent, container, known);
+	}
+}
+
+/* Act on one message received over the management socket.
+ *
+ *   msg->len == 0    ping_monitor: wake the monitor and wait until
+ *                    monitor_loop_cnt has advanced far enough
+ *   msg->len == -1   ping_manager: re-read mdstat and run manage()
+ *   otherwise        a metadata update; unless sigterm is set it is
+ *                    wrapped in a new metadata_update (taking over
+ *                    msg->buf, which is cleared in @msg) and queued
+ *
+ * For any len <= 0 the update queues are drained first.
+ */
+static void handle_message(struct supertype *container, struct metadata_update *msg)
+{
+	struct metadata_update *update;
+
+	if (msg->len <= 0) {
+		while (update_queue_pending || update_queue) {
+			check_update_queue(container);
+			usleep(15*1000);
+		}
+	}
+
+	if (msg->len == 0) {
+		/* ping_monitor */
+		int target = monitor_loop_cnt;
+
+		/* odd: wait until next pselect, even: wait for 2 pselects */
+		target += (target & 1) ? 2 : 3;
+		wakeup_monitor();
+
+		while (monitor_loop_cnt - target < 0)
+			usleep(10 * 1000);
+		return;
+	}
+
+	if (msg->len == -1) {
+		/* ping_manager */
+		struct mdstat_ent *ents = mdstat_read(1, 0);
+
+		manage(ents, container);
+		free_mdstat(ents);
+		return;
+	}
+
+	if (sigterm)
+		return;
+
+	/* queue this metadata update through to the monitor */
+	update = xmalloc(sizeof(*update));
+	update->len = msg->len;
+	update->buf = msg->buf;
+	msg->buf = NULL;
+	update->space = NULL;
+	update->space_list = NULL;
+	update->next = NULL;
+	if (container->ss->prepare_update &&
+	    !container->ss->prepare_update(container, update))
+		free_updates(&update);
+	queue_metadata_update(update);
+}
+
+/* Accept one connection on the container socket and serve messages
+ * from it until receiving, replying or acknowledging fails.
+ *
+ * Each message is passed to handle_message().  A message whose len is 0
+ * afterwards is answered with the Version string (including its NUL);
+ * every other message is answered with ack().
+ */
+void read_sock(struct supertype *container)
+{
+	struct metadata_update msg;
+	int tmo = 3; /* 3 second timeout before hanging up the socket */
+	long flags;
+	int conn;
+
+	conn = accept(container->sock, NULL, NULL);
+	if (conn < 0)
+		return;
+
+	flags = fcntl(conn, F_GETFL, 0);
+	fcntl(conn, F_SETFL, flags | O_NONBLOCK);
+
+	for (;;) {
+		msg.buf = NULL;
+
+		/* read and validate the message */
+		if (receive_message(conn, &msg, tmo) != 0)
+			break;
+
+		handle_message(container, &msg);
+
+		if (msg.len != 0) {
+			if (ack(conn, tmo) < 0)
+				break;
+			continue;
+		}
+
+		/* ping reply with version */
+		msg.buf = Version;
+		msg.len = strlen(Version) + 1;
+		if (send_message(conn, &msg, tmo) < 0)
+			break;
+	}
+
+	close(conn);
+}
+
+/* When set, do_manager() calls exit(0) at the top of its next pass. */
+int exit_now = 0;
+/* Set to 1 by do_manager() once per pass, after the update queue check. */
+int manager_ready = 0;
+
+/* Manager main loop; never returns.
+ *
+ * Each pass: exit if requested; when no update is queued, read mdstat,
+ * run manage() and service the socket; discard old arrays; check the
+ * update queue; then sleep until there is something to do.  While
+ * waiting, SIGUSR1 and SIGTERM are removed from the signal mask.
+ */
+void do_manager(struct supertype *container)
+{
+	sigset_t waitmask;
+
+	sigprocmask(SIG_UNBLOCK, NULL, &waitmask);
+	sigdelset(&waitmask, SIGUSR1);
+	sigdelset(&waitmask, SIGTERM);
+
+	for (;;) {
+		if (exit_now)
+			exit(0);
+
+		/* Can only 'manage' things if 'monitor' is not making
+		 * structural changes to metadata, so need to check
+		 * update_queue
+		 */
+		if (!update_queue) {
+			struct mdstat_ent *ents = mdstat_read(1, 0);
+
+			manage(ents, container);
+
+			read_sock(container);
+
+			free_mdstat(ents);
+		}
+		remove_old();
+
+		check_update_queue(container);
+
+		manager_ready = 1;
+
+		if (sigterm)
+			wakeup_monitor();
+
+		if (update_queue)
+			/* If an update is happening, just wait for signal */
+			pselect(0, NULL, NULL, NULL, NULL, &waitmask);
+		else
+			mdstat_wait_fd(container->sock, &waitmask);
+	}
+}
diff --git a/mapfile.c b/mapfile.c
new file mode 100644
index 0000000..6b2207d
--- /dev/null
+++ b/mapfile.c
@@ -0,0 +1,511 @@
+/*
+ * mapfile - keep track of uuid <-> array mapping. Part of:
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2006-2010 Neil Brown <neilb@suse.de>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email: <neilb@suse.de>
+ * Paper: Neil Brown
+ * Novell Inc
+ * GPO Box Q1283
+ * QVB Post Office, NSW 1230
+ * Australia
+ */
+
+/* The mapfile is used to track arrays being created in --incremental
+ * mode. It particularly allows lookup from UUID to array device, but
+ * also allows the array device name to be easily found.
+ *
+ * The map file is line based with space separated fields. The fields are:
+ * Device id - mdX or mdpX where X is a number.
+ * metadata - 0.90 1.0 1.1 1.2 ddf ...
+ * UUID - uuid of the array
+ * path - path where device created: /dev/md/home
+ *
+ * The best place for the mapfile is /run/mdadm/map. Distros and users
+ * which have not switched to /run yet can choose a different location
+ * at compile time via MAP_DIR and MAP_FILE.
+ */
+#include "mdadm.h"
+#include <sys/file.h>
+#include <ctype.h>
+
+/* Indexes into mapname[]; the first three also index mapmode[] and
+ * mapsmode[].
+ */
+#define MAP_READ 0
+#define MAP_NEW 1
+#define MAP_LOCK 2
+#define MAP_DIRNAME 3
+
+/* File names: live map, temporary map written before rename, lock file,
+ * and the directory holding them.
+ */
+char *mapname[4] = {
+	MAP_DIR "/" MAP_FILE,
+	MAP_DIR "/" MAP_FILE ".new",
+	MAP_DIR "/" MAP_FILE ".lock",
+	MAP_DIR
+};
+
+/* open(2) flags per mode.  MAP_NEW carries O_TRUNC because open_map()
+ * wraps the descriptor with fdopen(..., "w"), which - unlike fopen() -
+ * never truncates; without it a longer stale ".new" file left behind by
+ * an interrupted writer would leave trailing garbage in the new map.
+ */
+int mapmode[3] = { O_RDONLY, O_RDWR|O_CREAT|O_TRUNC, O_RDWR|O_CREAT|O_TRUNC };
+/* stdio mode strings handed to fdopen() for each mode */
+char *mapsmode[3] = { "r", "w", "w"};
+
+/* Open one of the map files as a stdio stream.
+ *
+ * @modenum: MAP_READ, MAP_NEW or MAP_LOCK (index into mapname[],
+ *           mapmode[] and mapsmode[])
+ *
+ * For modes that create the file, the map directory is created first
+ * on a best-effort basis.  Returns the stream, or NULL if the file
+ * could not be opened or wrapped in a stream.
+ */
+FILE *open_map(int modenum)
+{
+	FILE *f;
+	int fd;
+
+	if ((mapmode[modenum] & O_CREAT))
+		/* Attempt to create directory, don't worry about
+		 * failure.
+		 */
+		(void)mkdir(mapname[MAP_DIRNAME], 0755);
+
+	fd = open(mapname[modenum], mapmode[modenum], 0600);
+	if (fd < 0)
+		return NULL;
+
+	f = fdopen(fd, mapsmode[modenum]);
+	if (!f)
+		/* fdopen() failed, so the descriptor is still ours */
+		close(fd);
+	return f;
+}
+
+/* Write the list @mel to the ".new" map file and rename it over the
+ * live map.  Entries flagged ->bad are skipped.
+ *
+ * Returns 1 on success; 0 if the file could not be opened, written,
+ * closed or renamed.  On a write or close error the ".new" file is
+ * removed.
+ */
+int map_write(struct map_ent *mel)
+{
+	FILE *f;
+	int err;
+
+	f = open_map(MAP_NEW);
+
+	if (!f)
+		return 0;
+	for (; mel; mel = mel->next) {
+		if (mel->bad)
+			continue;
+		fprintf(f, "%s ", mel->devnm);
+		fprintf(f, "%s ", mel->metadata);
+		fprintf(f, "%08x:%08x:%08x:%08x ", mel->uuid[0],
+			mel->uuid[1], mel->uuid[2], mel->uuid[3]);
+		fprintf(f, "%s\n", mel->path?:"");
+	}
+	fflush(f);
+	err = ferror(f);
+	/* fclose() may report a write error that was deferred; do not
+	 * install a map that might be incomplete.
+	 */
+	if (fclose(f) != 0)
+		err = 1;
+	if (err) {
+		unlink(mapname[MAP_NEW]);
+		return 0;
+	}
+	return rename(mapname[MAP_NEW],
+		      mapname[MAP_READ]) == 0;
+}
+
+/* Stream on the lock file; non-NULL while map_lock() has succeeded and
+ * neither map_unlock() nor map_fork() has been called since.
+ */
+static FILE *lf = NULL;
+
+/* Take an exclusive flock() on the lock file unless already held, then
+ * free any list in *@melp and reload it with map_read().
+ *
+ * Returns 0 on success, -1 if the lock file could not be opened or
+ * locked (in which case *@melp is left untouched).
+ */
+int map_lock(struct map_ent **melp)
+{
+	while (!lf) {
+		struct stat st;
+
+		lf = open_map(MAP_LOCK);
+		if (!lf)
+			return -1;
+
+		if (flock(fileno(lf), LOCK_EX) == 0) {
+			if (fstat(fileno(lf), &st) == 0 && st.st_nlink != 0)
+				break;
+			/* The owner of the lock unlinked it,
+			 * so we have a lock on a stale file,
+			 * try again
+			 */
+			fclose(lf);
+			lf = NULL;
+			continue;
+		}
+
+		fclose(lf);
+		lf = NULL;
+		return -1;
+	}
+
+	if (*melp)
+		map_free(*melp);
+	map_read(melp);
+	return 0;
+}
+
+/* Release the map lock (if held) and free the list in *@melp.
+ *
+ * *@melp is reset to NULL after it is freed, matching map_update() and
+ * map_remove(), so the caller is not left with a dangling pointer and
+ * a second map_unlock() on the same variable does not free it twice.
+ */
+void map_unlock(struct map_ent **melp)
+{
+	if (lf) {
+		/* must unlink before closing the file,
+		 * as only the owner of the lock may
+		 * unlink the file
+		 */
+		unlink(mapname[MAP_LOCK]);
+		fclose(lf);
+		lf = NULL;
+	}
+	if (*melp) {
+		map_free(*melp);
+		*melp = NULL;
+	}
+}
+
+/* Drop the lock-file stream because the process is forking.
+ *
+ * The descriptor is closed directly before fclose() so that, per the
+ * original author's note, nothing risks being flushed through it.
+ */
+void map_fork(void)
+{
+	if (!lf)
+		return;
+
+	close(fileno(lf));
+	fclose(lf);
+	lf = NULL;
+}
+
+/* Allocate a new entry and push it on the front of the list *@melp.
+ *
+ * @devnm and @metadata are copied into the entry's fixed-size fields and
+ * truncated if they do not fit (the previous strcpy() would overflow the
+ * entry instead).  @uuid is copied as 16 bytes; @path is duplicated, or
+ * stored as NULL when NULL.
+ */
+void map_add(struct map_ent **melp,
+	     char * devnm, char *metadata, int uuid[4], char *path)
+{
+	struct map_ent *me = xmalloc(sizeof(*me));
+	size_t len;
+
+	len = strnlen(devnm, sizeof(me->devnm) - 1);
+	memcpy(me->devnm, devnm, len);
+	me->devnm[len] = 0;
+
+	len = strnlen(metadata, sizeof(me->metadata) - 1);
+	memcpy(me->metadata, metadata, len);
+	me->metadata[len] = 0;
+
+	memcpy(me->uuid, uuid, 16);
+	me->path = path ? xstrdup(path) : NULL;
+	me->next = *melp;
+	me->bad = 0;
+	*melp = me;
+}
+
+/* Load the map file into a freshly built list stored in *@melp.
+ *
+ * *@melp is set to NULL first.  If the map cannot be opened,
+ * RebuildMap() is run and the open is retried once; if that also fails
+ * the list stays empty.  Only lines on which all seven fields convert
+ * (device, metadata, four uuid words, path) are added.
+ */
+void map_read(struct map_ent **melp)
+{
+	FILE *f;
+	char buf[8192];
+	char path[201];
+	int uuid[4];
+	char devnm[32];
+	char metadata[30];
+
+	*melp = NULL;
+
+	f = open_map(MAP_READ);
+	if (!f) {
+		RebuildMap();
+		f = open_map(MAP_READ);
+	}
+	if (!f)
+		return;
+
+	while (fgets(buf, sizeof(buf), f)) {
+		path[0] = 0;
+		/* Field widths are one less than devnm[], metadata[] and
+		 * path[] so an over-long token cannot overrun the stack
+		 * buffers; such a line then fails to convert and is skipped.
+		 */
+		if (sscanf(buf, " %31s %29s %x:%x:%x:%x %200s",
+			   devnm, metadata, uuid, uuid+1,
+			   uuid+2, uuid+3, path) >= 7) {
+			map_add(melp, devnm, metadata, uuid, path);
+		}
+	}
+	fclose(f);
+}
+
+/* Free every entry of the list @map together with its path string. */
+void map_free(struct map_ent *map)
+{
+	struct map_ent *next;
+
+	for (; map; map = next) {
+		next = map->next;
+		free(map->path);
+		free(map);
+	}
+}
+
+/* Set the map entry for @devnm to the given metadata, uuid and path,
+ * adding an entry if none exists, then write the map out.
+ *
+ * If @mpp points at a non-NULL list that list is used, otherwise the
+ * map is read from disk.  The list is always freed before returning and
+ * *@mpp (when @mpp is non-NULL) is reset to NULL.
+ *
+ * @metadata is truncated if it does not fit the entry's fixed-size
+ * field (the previous strcpy() would overflow it instead).
+ *
+ * Returns the result of map_write().
+ */
+int map_update(struct map_ent **mpp, char *devnm, char *metadata,
+	       int uuid[4], char *path)
+{
+	struct map_ent *map, *mp;
+	int rv;
+
+	if (mpp && *mpp)
+		map = *mpp;
+	else
+		map_read(&map);
+
+	for (mp = map ; mp ; mp=mp->next)
+		if (strcmp(mp->devnm, devnm) == 0) {
+			size_t len = strnlen(metadata,
+					     sizeof(mp->metadata) - 1);
+
+			memcpy(mp->metadata, metadata, len);
+			mp->metadata[len] = 0;
+			memcpy(mp->uuid, uuid, 16);
+			free(mp->path);
+			mp->path = path ? xstrdup(path) : NULL;
+			mp->bad = 0;
+			break;
+		}
+	if (!mp)
+		map_add(&map, devnm, metadata, uuid, path);
+	if (mpp)
+		*mpp = NULL;
+	rv = map_write(map);
+	map_free(map);
+	return rv;
+}
+
+/* Unlink and free every entry named @devnm from the list *@mapp.
+ * If *@mapp is NULL the map is read from disk first.
+ */
+void map_delete(struct map_ent **mapp, char *devnm)
+{
+	struct map_ent **link = mapp;
+
+	if (*mapp == NULL)
+		map_read(mapp);
+
+	while (*link) {
+		struct map_ent *cur = *link;
+
+		if (strcmp(cur->devnm, devnm) != 0) {
+			link = &cur->next;
+			continue;
+		}
+		/* splice the match out; *link now names its successor */
+		*link = cur->next;
+		free(cur->path);
+		free(cur);
+	}
+}
+
+/* Delete all entries for @devnm, write the map back out, then free the
+ * list and reset *@mapp to NULL.  Does nothing for an empty @devnm.
+ */
+void map_remove(struct map_ent **mapp, char *devnm)
+{
+	if (*devnm == '\0')
+		return;
+
+	map_delete(mapp, devnm);
+	map_write(*mapp);
+
+	map_free(*mapp);
+	*mapp = NULL;
+}
+
+/* Find the first entry whose uuid equals @uuid and for which
+ * mddev_busy() is true.  Entries that match the uuid but are not busy
+ * are flagged ->bad and skipped.  Reads the map first if *@map is NULL.
+ * Returns the entry or NULL.
+ */
+struct map_ent *map_by_uuid(struct map_ent **map, int uuid[4])
+{
+	struct map_ent *ent;
+
+	if (*map == NULL)
+		map_read(map);
+
+	for (ent = *map; ent; ent = ent->next) {
+		if (memcmp(uuid, ent->uuid, 16) == 0) {
+			if (mddev_busy(ent->devnm))
+				return ent;
+			ent->bad = 1;
+		}
+	}
+	return NULL;
+}
+
+/* Find the first entry whose device name equals @devnm and for which
+ * mddev_busy() is true.  Entries that match the name but are not busy
+ * are flagged ->bad and skipped.  Reads the map first if *@map is NULL.
+ * Returns the entry or NULL.
+ */
+struct map_ent *map_by_devnm(struct map_ent **map, char *devnm)
+{
+	struct map_ent *ent;
+
+	if (*map == NULL)
+		map_read(map);
+
+	for (ent = *map; ent; ent = ent->next) {
+		if (strcmp(ent->devnm, devnm) == 0) {
+			if (mddev_busy(ent->devnm))
+				return ent;
+			ent->bad = 1;
+		}
+	}
+	return NULL;
+}
+
+/* Find the first entry whose path is "/dev/md/" followed by exactly
+ * @name and for which mddev_busy() is true.  Entries whose path matches
+ * but which are not busy are flagged ->bad and skipped.  Reads the map
+ * first if *@map is NULL.  Returns the entry or NULL.
+ */
+struct map_ent *map_by_name(struct map_ent **map, char *name)
+{
+	struct map_ent *ent;
+
+	if (*map == NULL)
+		map_read(map);
+
+	for (ent = *map; ent; ent = ent->next) {
+		if (ent->path == NULL ||
+		    strncmp(ent->path, "/dev/md/", 8) != 0 ||
+		    strcmp(ent->path+8, name) != 0)
+			continue;
+
+		if (mddev_busy(ent->devnm))
+			return ent;
+		ent->bad = 1;
+	}
+	return NULL;
+}
+
+/* Return the subarray name of a container member, for use as a
+ * supplement to guess_super() (super_by_fd does this automatically).
+ *
+ * For an "external:" metadata version that is_subarray() accepts, the
+ * result points just past the last '/' inside ent->metadata_version
+ * (no copy is made).  Returns NULL for any other version string, or if
+ * the string unexpectedly contains no '/'.
+ */
+static char *get_member_info(struct mdstat_ent *ent)
+{
+	char *slash;
+
+	if (ent->metadata_version == NULL ||
+	    strncmp(ent->metadata_version, "external:", 9) != 0)
+		return NULL;
+
+	if (!is_subarray(&ent->metadata_version[9]))
+		return NULL;
+
+	slash = strrchr(ent->metadata_version, '/');
+	if (!slash)
+		/* never hand back NULL + 1 */
+		return NULL;
+	return slash + 1;
+}
+
+/* Rebuild the map file from the arrays listed in mdstat.
+ *
+ * For every array, the member devices reported by sysfs are tried in
+ * turn until one yields loadable metadata; that metadata supplies the
+ * text version and uuid for the map entry.  The entry's path is the
+ * name map_dev() returns when it lies under /dev/md/, otherwise a name
+ * taken from a matching config entry or synthesised from the metadata
+ * name so that it clashes neither with an existing file nor with an
+ * entry already in the new map.
+ *
+ * The map is then written out and, only if that succeeded, a "change"
+ * uevent is sent for every array in mdstat.
+ */
+void RebuildMap(void)
+{
+ struct mdstat_ent *mdstat = mdstat_read(0, 0);
+ struct mdstat_ent *md;
+ struct map_ent *map = NULL;
+ int require_homehost;
+ char sys_hostname[256];
+ char *homehost = conf_get_homehost(&require_homehost);
+
+ /* no configured homehost, or "<system>": fall back to gethostname() */
+ if (homehost == NULL || strcmp(homehost, "<system>")==0) {
+ if (gethostname(sys_hostname, sizeof(sys_hostname)) == 0) {
+ sys_hostname[sizeof(sys_hostname)-1] = 0;
+ homehost = sys_hostname;
+ }
+ }
+
+ for (md = mdstat ; md ; md = md->next) {
+ struct mdinfo *sra = sysfs_read(-1, md->devnm, GET_DEVS);
+ struct mdinfo *sd;
+
+ if (!sra)
+ continue;
+
+ for (sd = sra->devs ; sd ; sd = sd->next) {
+ char namebuf[100];
+ char dn[30];
+ int dfd;
+ int ok;
+ dev_t devid;
+ struct supertype *st;
+ char *subarray = NULL;
+ char *path;
+ struct mdinfo *info;
+
+ sprintf(dn, "%d:%d", sd->disk.major, sd->disk.minor);
+ dfd = dev_open(dn, O_RDONLY);
+ if (dfd < 0)
+ continue;
+ st = guess_super(dfd);
+ if ( st == NULL)
+ ok = -1;
+ else {
+ subarray = get_member_info(md);
+ ok = st->ss->load_super(st, dfd, NULL);
+ }
+ close(dfd);
+ /* NOTE(review): st is not released on this path or on the
+ * !info path below - confirm whether it leaks.
+ */
+ if (ok != 0)
+ continue;
+ if (subarray)
+ info = st->ss->container_content(st, subarray);
+ else {
+ info = xmalloc(sizeof(*info));
+ st->ss->getinfo_super(st, info, NULL);
+ }
+ if (!info)
+ continue;
+
+ devid = devnm2devid(md->devnm);
+ path = map_dev(major(devid), minor(devid), 0);
+ if (path == NULL ||
+ strncmp(path, "/dev/md/", 8) != 0) {
+ /* We would really like a name that provides
+ * an MD_DEVNAME for udev.
+ * The name needs to be unique both in /dev/md/
+ * and in this mapfile.
+ * It needs to match what -I or -As would come
+ * up with.
+ * That means:
+ * Check if array is in mdadm.conf
+ * - if so use that.
+ * determine trustworthy from homehost etc
+ * find a unique name based on metadata name.
+ *
+ */
+ struct mddev_ident *match = conf_match(st, info,
+ NULL, 0,
+ NULL);
+ struct stat stb;
+ if (match && match->devname && match->devname[0] == '/') {
+ path = match->devname;
+ /* NOTE(review): the enclosing condition already requires
+ * devname[0] == '/', so this branch cannot be taken.
+ */
+ if (path[0] != '/') {
+ strcpy(namebuf, "/dev/md/");
+ strcat(namebuf, path);
+ path = namebuf;
+ }
+ } else {
+ /* unum >= 0: append a numeric suffix starting at unum;
+ * unum == -1: first try the bare name
+ */
+ int unum = 0;
+ char *sep = "_";
+ const char *name;
+ int conflict = 1;
+ if ((homehost == NULL ||
+ st->ss->match_home(st, homehost) != 1) &&
+ st->ss->match_home(st, "any") != 1 &&
+ (require_homehost ||
+ !conf_name_is_free(info->name)))
+ /* require a numeric suffix */
+ unum = 0;
+ else
+ /* allow name to be used as-is if no conflict */
+ unum = -1;
+ name = info->name;
+ /* empty metadata name: fall back to the metadata type name */
+ if (!*name) {
+ name = st->ss->name;
+ if (!isdigit(name[strlen(name)-1]) &&
+ unum == -1) {
+ unum = 0;
+ sep = "";
+ }
+ }
+ if (strchr(name, ':')) {
+ /* Probably a uniquifying
+ * hostname prefix. Allow
+ * without a suffix, and strip
+ * hostname if it is us.
+ */
+ if (homehost && unum == -1 &&
+ strncmp(name, homehost,
+ strlen(homehost)) == 0 &&
+ name[strlen(homehost)] == ':')
+ name += strlen(homehost)+1;
+ unum = -1;
+ }
+
+ /* Try candidates until one neither exists on disk
+ * (lstat fails) nor is already used in the new map.
+ */
+ while (conflict) {
+ if (unum >= 0)
+ sprintf(namebuf, "/dev/md/%s%s%d",
+ name, sep, unum);
+ else
+ sprintf(namebuf, "/dev/md/%s",
+ name);
+ unum++;
+ if (lstat(namebuf, &stb) != 0 &&
+ (map == NULL ||
+ !map_by_name(&map, namebuf+8)))
+ conflict = 0;
+ }
+ path = namebuf;
+ }
+ }
+ map_add(&map, md->devnm,
+ info->text_version,
+ info->uuid, path);
+ st->ss->free_super(st);
+ free(info);
+ /* one member with usable metadata is enough for this array */
+ break;
+ }
+ sysfs_free(sra);
+ }
+ /* Only trigger a change if we wrote a new map file */
+ if (map_write(map))
+ for (md = mdstat ; md ; md = md->next) {
+ struct mdinfo *sra = sysfs_read(-1, md->devnm,
+ GET_VERSION);
+ if (sra)
+ sysfs_uevent(sra, "change");
+ sysfs_free(sra);
+ }
+ map_free(map);
+ free_mdstat(mdstat);
+}
diff --git a/maps.c b/maps.c
new file mode 100644
index 0000000..a4fd279
--- /dev/null
+++ b/maps.c
@@ -0,0 +1,185 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2011 Neil Brown <neilb@suse.de>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email: <neilb@suse.de>
+ */
+
+#include "mdadm.h"
+
+/* name/number mappings */
+
+/* RAID5 layout names.  Long names come first, then "default" and the
+ * two-letter abbreviations, then the parity-first/last and ddf-* names.
+ * Several names share a value; map_num() returns the first name listed
+ * for a value.  The { NULL, UnSet } entry terminates the table.
+ */
+mapping_t r5layout[] = {
+ { "left-asymmetric", ALGORITHM_LEFT_ASYMMETRIC},
+ { "right-asymmetric", ALGORITHM_RIGHT_ASYMMETRIC},
+ { "left-symmetric", ALGORITHM_LEFT_SYMMETRIC},
+ { "right-symmetric", ALGORITHM_RIGHT_SYMMETRIC},
+
+ { "default", ALGORITHM_LEFT_SYMMETRIC},
+ { "la", ALGORITHM_LEFT_ASYMMETRIC},
+ { "ra", ALGORITHM_RIGHT_ASYMMETRIC},
+ { "ls", ALGORITHM_LEFT_SYMMETRIC},
+ { "rs", ALGORITHM_RIGHT_SYMMETRIC},
+
+ { "parity-first", ALGORITHM_PARITY_0},
+ { "parity-last", ALGORITHM_PARITY_N},
+ { "ddf-zero-restart", ALGORITHM_RIGHT_ASYMMETRIC},
+ { "ddf-N-restart", ALGORITHM_LEFT_ASYMMETRIC},
+ { "ddf-N-continue", ALGORITHM_LEFT_SYMMETRIC},
+
+ { NULL, UnSet }
+};
+/* RAID6 layout names.  Same names as r5layout, except that the ddf-*
+ * names map to the ALGORITHM_ROTATING_* values here, plus the "-6"
+ * variants.  The { NULL, UnSet } entry terminates the table.
+ */
+mapping_t r6layout[] = {
+ { "left-asymmetric", ALGORITHM_LEFT_ASYMMETRIC},
+ { "right-asymmetric", ALGORITHM_RIGHT_ASYMMETRIC},
+ { "left-symmetric", ALGORITHM_LEFT_SYMMETRIC},
+ { "right-symmetric", ALGORITHM_RIGHT_SYMMETRIC},
+
+ { "default", ALGORITHM_LEFT_SYMMETRIC},
+ { "la", ALGORITHM_LEFT_ASYMMETRIC},
+ { "ra", ALGORITHM_RIGHT_ASYMMETRIC},
+ { "ls", ALGORITHM_LEFT_SYMMETRIC},
+ { "rs", ALGORITHM_RIGHT_SYMMETRIC},
+
+ { "parity-first", ALGORITHM_PARITY_0},
+ { "parity-last", ALGORITHM_PARITY_N},
+ { "ddf-zero-restart", ALGORITHM_ROTATING_ZERO_RESTART},
+ { "ddf-N-restart", ALGORITHM_ROTATING_N_RESTART},
+ { "ddf-N-continue", ALGORITHM_ROTATING_N_CONTINUE},
+
+ { "left-asymmetric-6", ALGORITHM_LEFT_ASYMMETRIC_6},
+ { "right-asymmetric-6", ALGORITHM_RIGHT_ASYMMETRIC_6},
+ { "left-symmetric-6", ALGORITHM_LEFT_SYMMETRIC_6},
+ { "right-symmetric-6", ALGORITHM_RIGHT_SYMMETRIC_6},
+ { "parity-first-6", ALGORITHM_PARITY_0_6},
+
+ { NULL, UnSet }
+};
+
+/* raid0 layout is only needed because of a bug in 3.14 which changed
+ * the effective layout of raid0 arrays with varying device sizes.
+ *
+ * "1" and "2" are numeric spellings (marked below as aliases of the
+ * original and alternate layouts); "dangerous" maps to 0.
+ * The { NULL, UnSet } entry terminates the table.
+ */
+mapping_t r0layout[] = {
+ { "original", RAID0_ORIG_LAYOUT},
+ { "alternate", RAID0_ALT_MULTIZONE_LAYOUT},
+ { "1", 1}, /* aka ORIG */
+ { "2", 2}, /* aka ALT */
+ { "dangerous", 0},
+ { NULL, UnSet},
+};
+
+/* Array level ("personality") names.  Each level has several accepted
+ * spellings; map_num() returns the first one listed for a value.
+ * The { NULL, UnSet } entry terminates the table.
+ */
+mapping_t pers[] = {
+ { "linear", LEVEL_LINEAR},
+ { "raid0", 0},
+ { "0", 0},
+ { "stripe", 0},
+ { "raid1", 1},
+ { "1", 1},
+ { "mirror", 1},
+ { "raid4", 4},
+ { "4", 4},
+ { "raid5", 5},
+ { "5", 5},
+ { "multipath", LEVEL_MULTIPATH},
+ { "mp", LEVEL_MULTIPATH},
+ { "raid6", 6},
+ { "6", 6},
+ { "raid10", 10},
+ { "10", 10},
+ { "faulty", LEVEL_FAULTY},
+ { "container", LEVEL_CONTAINER},
+ { NULL, UnSet }
+};
+
+/* Names of the operating modes, one per mode constant.
+ * The { NULL, UnSet } entry terminates the table.
+ */
+mapping_t modes[] = {
+ { "assemble", ASSEMBLE},
+ { "build", BUILD},
+ { "create", CREATE},
+ { "manage", MANAGE},
+ { "misc", MISC},
+ { "monitor", MONITOR},
+ { "grow", GROW},
+ { "incremental", INCREMENTAL},
+ { "auto-detect", AUTODETECT},
+ { NULL, UnSet }
+};
+
+/* Layout names for the "faulty" level: each fault mode has a long name
+ * and a two-letter abbreviation.  "clear", "none" and "default" all map
+ * to ClearErrors; "flush" maps to ClearFaults.
+ * The { NULL, UnSet } entry terminates the table.
+ */
+mapping_t faultylayout[] = {
+ { "write-transient", WriteTransient },
+ { "wt", WriteTransient },
+ { "read-transient", ReadTransient },
+ { "rt", ReadTransient },
+ { "write-persistent", WritePersistent },
+ { "wp", WritePersistent },
+ { "read-persistent", ReadPersistent },
+ { "rp", ReadPersistent },
+ { "write-all", WriteAll },
+ { "wa", WriteAll },
+ { "read-fixable", ReadFixable },
+ { "rf", ReadFixable },
+
+ { "clear", ClearErrors},
+ { "flush", ClearFaults},
+ { "none", ClearErrors},
+ { "default", ClearErrors},
+ { NULL, UnSet }
+};
+
+/* Consistency policy names.  Unlike the other tables the terminating
+ * entry carries CONSISTENCY_POLICY_UNKNOWN rather than UnSet, so
+ * map_name() yields that value for a name that is not listed.
+ */
+mapping_t consistency_policies[] = {
+ { "unknown", CONSISTENCY_POLICY_UNKNOWN},
+ { "none", CONSISTENCY_POLICY_NONE},
+ { "resync", CONSISTENCY_POLICY_RESYNC},
+ { "bitmap", CONSISTENCY_POLICY_BITMAP},
+ { "journal", CONSISTENCY_POLICY_JOURNAL},
+ { "ppl", CONSISTENCY_POLICY_PPL},
+ { NULL, CONSISTENCY_POLICY_UNKNOWN }
+};
+
+/* Array state names paired with the ARRAY_* state constants.  The
+ * terminating entry carries ARRAY_UNKNOWN_STATE, so map_name() yields
+ * that value for a name that is not listed.
+ */
+mapping_t sysfs_array_states[] = {
+ { "active-idle", ARRAY_ACTIVE_IDLE },
+ { "active", ARRAY_ACTIVE },
+ { "clear", ARRAY_CLEAR },
+ { "inactive", ARRAY_INACTIVE },
+ { "suspended", ARRAY_SUSPENDED },
+ { "readonly", ARRAY_READONLY },
+ { "read-auto", ARRAY_READ_AUTO },
+ { "clean", ARRAY_CLEAN },
+ { "write-pending", ARRAY_WRITE_PENDING },
+ { "broken", ARRAY_BROKEN },
+ { NULL, ARRAY_UNKNOWN_STATE }
+};
+
+/* Return the name of the first entry in @map whose number is @num, or
+ * NULL if the NULL-named terminator is reached without a match.
+ */
+char *map_num(mapping_t *map, int num)
+{
+	mapping_t *entry;
+
+	for (entry = map; entry->name; entry++)
+		if (entry->num == num)
+			return entry->name;
+
+	return NULL;
+}
+
+/* Return the number of the first entry in @map named exactly @name.
+ * If none matches, the number stored in the table's NULL-named
+ * terminator is returned.
+ */
+int map_name(mapping_t *map, char *name)
+{
+	for (; map->name; map++)
+		if (strcmp(map->name, name) == 0)
+			break;
+
+	return map->num;
+}
diff --git a/md.4 b/md.4
new file mode 100644
index 0000000..7a0bc7e
--- /dev/null
+++ b/md.4
@@ -0,0 +1,1317 @@
+.\" Copyright Neil Brown and others.
+.\" This program is free software; you can redistribute it and/or modify
+.\" it under the terms of the GNU General Public License as published by
+.\" the Free Software Foundation; either version 2 of the License, or
+.\" (at your option) any later version.
+.\" See file COPYING in distribution for details.
+.if n .pl 1000v
+.TH MD 4
+.SH NAME
+md \- Multiple Device driver aka Linux Software RAID
+.SH SYNOPSIS
+.BI /dev/md n
+.br
+.BI /dev/md/ n
+.br
+.BR /dev/md/ name
+.SH DESCRIPTION
+The
+.B md
+driver provides virtual devices that are created from one or more
+independent underlying devices. This array of devices often contains
+redundancy and the devices are often disk drives, hence the acronym RAID
+which stands for a Redundant Array of Independent Disks.
+.PP
+.B md
+supports RAID levels
+1 (mirroring),
+4 (striped array with parity device),
+5 (striped array with distributed parity information),
+6 (striped array with distributed dual redundancy information), and
+10 (striped and mirrored).
+If some number of underlying devices fails while using one of these
+levels, the array will continue to function; this number is one for
+RAID levels 4 and 5, two for RAID level 6, and all but one (N-1) for
+RAID level 1, and dependent on configuration for level 10.
+.PP
+.B md
+also supports a number of pseudo RAID (non-redundant) configurations
+including RAID0 (striped array), LINEAR (catenated array),
+MULTIPATH (a set of different interfaces to the same device),
+and FAULTY (a layer over a single device into which errors can be injected).
+
+.SS MD METADATA
+Each device in an array may have some
+.I metadata
+stored in the device. This metadata is sometimes called a
+.BR superblock .
+The metadata records information about the structure and state of the array.
+This allows the array to be reliably re-assembled after a shutdown.
+
+From Linux kernel version 2.6.10,
+.B md
+provides support for two different formats of metadata, and
+other formats can be added. Prior to this release, only one format was
+supported.
+
+The common format \(em known as version 0.90 \(em has
+a superblock that is 4K long and is written into a 64K aligned block that
+starts at least 64K and less than 128K from the end of the device
+(i.e. to get the address of the superblock round the size of the
+device down to a multiple of 64K and then subtract 64K).
+The available size of each device is the amount of space before the
+super block, so between 64K and 128K is lost when a device is
+incorporated into an MD array.
+This superblock stores multi-byte fields in a processor-dependent
+manner, so arrays cannot easily be moved between computers with
+different processors.
+
+The new format \(em known as version 1 \(em has a superblock that is
+normally 1K long, but can be longer. It is normally stored between 8K
+and 12K from the end of the device, on a 4K boundary, though
+variations can be stored at the start of the device (version 1.1) or 4K from
+the start of the device (version 1.2).
+This metadata format stores multibyte data in a
+processor-independent format and supports up to hundreds of
+component devices (version 0.90 only supports 28).
+
+The metadata contains, among other things:
+.TP
+LEVEL
+The manner in which the devices are arranged into the array
+(LINEAR, RAID0, RAID1, RAID4, RAID5, RAID6, RAID10, MULTIPATH).
+.TP
+UUID
+a 128 bit Universally Unique Identifier that identifies the array that
+contains this device.
+
+.PP
+When a version 0.90 array is being reshaped (e.g. adding extra devices
+to a RAID5), the version number is temporarily set to 0.91. This
+ensures that if the reshape process is stopped in the middle (e.g. by
+a system crash) and the machine boots into an older kernel that does
+not support reshaping, then the array will not be assembled (which
+would cause data corruption) but will be left untouched until a kernel
+that can complete the reshape process is used.
+
+.SS ARRAYS WITHOUT METADATA
+While it is usually best to create arrays with superblocks so that
+they can be assembled reliably, there are some circumstances when an
+array without superblocks is preferred. These include:
+.TP
+LEGACY ARRAYS
+Early versions of the
+.B md
+driver only supported LINEAR and RAID0 configurations and did not use
+a superblock (which is less critical with these configurations).
+While such arrays should be rebuilt with superblocks if possible,
+.B md
+continues to support them.
+.TP
+FAULTY
+Being a largely transparent layer over a different device, the FAULTY
+personality doesn't gain anything from having a superblock.
+.TP
+MULTIPATH
+It is often possible to detect devices which are different paths to
+the same storage directly rather than having a distinctive superblock
+written to the device and searched for on all paths. In this case,
+a MULTIPATH array with no superblock makes sense.
+.TP
+RAID1
+In some configurations it might be desired to create a RAID1
+configuration that does not use a superblock, and to maintain the state of
+the array elsewhere. While not encouraged for general use, it does
+have special-purpose uses and is supported.
+
+.SS ARRAYS WITH EXTERNAL METADATA
+
+From release 2.6.28, the
+.I md
+driver supports arrays with externally managed metadata. That is,
+the metadata is not managed by the kernel but rather by a user-space
+program which is external to the kernel. This allows support for a
+variety of metadata formats without cluttering the kernel with lots of
+details.
+.PP
+.I md
+is able to communicate with the user-space program through various
+sysfs attributes so that it can make appropriate changes to the
+metadata \- for example to mark a device as faulty. When necessary,
+.I md
+will wait for the program to acknowledge the event by writing to a
+sysfs attribute.
+The manual page for
+.IR mdmon (8)
+contains more detail about this interaction.
+
+.SS CONTAINERS
+Many metadata formats use a single block of metadata to describe a
+number of different arrays which all use the same set of devices.
+In this case it is helpful for the kernel to know about the full set
+of devices as a whole. This set is known to md as a
+.IR container .
+A container is an
+.I md
+array with externally managed metadata and with device offset and size
+so that it just covers the metadata part of the devices. The
+remainder of each device is available to be incorporated into various
+arrays.
+
+.SS LINEAR
+
+A LINEAR array simply catenates the available space on each
+drive to form one large virtual drive.
+
+One advantage of this arrangement over the more common RAID0
+arrangement is that the array may be reconfigured at a later time with
+an extra drive, so the array is made bigger without disturbing the
+data that is on the array. This can even be done on a live
+array.
+
+If a chunksize is given with a LINEAR array, the usable space on each
+device is rounded down to a multiple of this chunksize.
+
+.SS RAID0
+
+A RAID0 array (which has zero redundancy) is also known as a
+striped array.
+A RAID0 array is configured at creation with a
+.B "Chunk Size"
+which must be a power of two (prior to Linux 2.6.31), and at least 4
+kibibytes.
+
+The RAID0 driver assigns the first chunk of the array to the first
+device, the second chunk to the second device, and so on until all
+drives have been assigned one chunk. This collection of chunks forms a
+.BR stripe .
+Further chunks are gathered into stripes in the same way, and are
+assigned to the remaining space in the drives.
+
+If devices in the array are not all the same size, then once the
+smallest device has been exhausted, the RAID0 driver starts
+collecting chunks into smaller stripes that only span the drives which
+still have remaining space.
+
+A bug was introduced in Linux 3.14 which changed the layout of blocks in
+a RAID0 beyond the region that is striped over all devices. This bug
+does not affect an array with all devices the same size, but can affect
+other RAID0 arrays.
+
+Linux 5.4 (and some stable kernels to which the change was backported)
+will not normally assemble such an array as it cannot know which layout
+to use. There is a module parameter "raid0.default_layout" which can be
+set to "1" to force the kernel to use the pre-3.14 layout or to "2" to
+force it to use the 3.14-and-later layout. When creating a new RAID0
+array,
+.I mdadm
+will record the chosen layout in the metadata in a way that allows newer
+kernels to assemble the array without needing a module parameter.
+
+To assemble an old array on a new kernel without using the module parameter,
+use either the
+.B "--update=layout-original"
+option or the
+.B "--update=layout-alternate"
+option.
+
+Once you have updated the layout you will not be able to mount the array
+on an older kernel. If you need to revert to an older kernel, the
+layout information can be erased with the
+.B "--update=layout-unspecified"
+option. If you use this option to
+.B --assemble
+while running a newer kernel, the array will NOT assemble, but the
+metadata will be updated so that it can be assembled on an older kernel.
+
+Note that setting the layout to "unspecified" removes protections against
+this bug, and you must be sure that the kernel you use matches the
+layout of the array.
+
+.SS RAID1
+
+A RAID1 array is also known as a mirrored set (though mirrors tend to
+provide reflected images, which RAID1 does not) or a plex.
+
+Once initialised, each device in a RAID1 array contains exactly the
+same data. Changes are written to all devices in parallel. Data is
+read from any one device. The driver attempts to distribute read
+requests across all devices to maximise performance.
+
+All devices in a RAID1 array should be the same size. If they are
+not, then only the amount of space available on the smallest device is
+used (any extra space on other devices is wasted).
+
+Note that the read balancing done by the driver does not make the RAID1
+performance profile be the same as for RAID0; a single stream of
+sequential input will not be accelerated (e.g. a single dd), but
+multiple sequential streams or a random workload will use more than one
+spindle. In theory, having an N-disk RAID1 will allow N sequential
+threads to read from all disks.
+
+Individual devices in a RAID1 can be marked as "write-mostly".
+These drives are excluded from the normal read balancing and will only
+be read from when there is no other option. This can be useful for
+devices connected over a slow link.
+
+.SS RAID4
+
+A RAID4 array is like a RAID0 array with an extra device for storing
+parity. This device is the last of the active devices in the
+array. Unlike RAID0, RAID4 also requires that all stripes span all
+drives, so extra space on devices that are larger than the smallest is
+wasted.
+
+When any block in a RAID4 array is modified, the parity block for that
+stripe (i.e. the block in the parity device at the same device offset
+as the stripe) is also modified so that the parity block always
+contains the "parity" for the whole stripe. I.e. its content is
+equivalent to the result of performing an exclusive-or operation
+between all the data blocks in the stripe.
+
+This allows the array to continue to function if one device fails.
+The data that was on that device can be calculated as needed from the
+parity block and the other data blocks.
+
+.SS RAID5
+
+RAID5 is very similar to RAID4. The difference is that the parity
+blocks for each stripe, instead of being on a single device, are
+distributed across all devices. This allows more parallelism when
+writing, as two different block updates will quite possibly affect
+parity blocks on different devices so there is less contention.
+
+This also allows more parallelism when reading, as read requests are
+distributed over all the devices in the array instead of all but one.
+
+.SS RAID6
+
+RAID6 is similar to RAID5, but can handle the loss of any \fItwo\fP
+devices without data loss. Accordingly, it requires N+2 drives to
+store N drives worth of data.
+
+The performance for RAID6 is slightly lower but comparable to RAID5 in
+normal mode and single disk failure mode. It is very slow in dual
+disk failure mode, however.
+
+.SS RAID10
+
+RAID10 provides a combination of RAID1 and RAID0, and is sometimes known
+as RAID1+0. Every datablock is duplicated some number of times, and
+the resulting collection of datablocks are distributed over multiple
+drives.
+
+When configuring a RAID10 array, it is necessary to specify the number
+of replicas of each data block that are required (this will usually
+be\ 2) and whether their layout should be "near", "far" or "offset"
+(with "offset" being available since Linux\ 2.6.18).
+
+.B About the RAID10 Layout Examples:
+.br
+The examples below visualise the chunk distribution on the underlying
+devices for the respective layout.
+
+For simplicity it is assumed that the size of the chunks equals the
+size of the blocks of the underlying devices as well as those of the
+RAID10 device exported by the kernel (for example \fB/dev/md/\fPname).
+.br
+Therefore the chunks\ /\ chunk numbers map directly to the blocks\ /\
+block addresses of the exported RAID10 device.
+
+Decimal numbers (0,\ 1, 2,\ ...) are the chunks of the RAID10 and due
+to the above assumption also the blocks and block addresses of the
+exported RAID10 device.
+.br
+Repeated numbers mean copies of a chunk\ /\ block (obviously on
+different underlying devices).
+.br
+Hexadecimal numbers (0x00,\ 0x01, 0x02,\ ...) are the block addresses
+of the underlying devices.
+
+.TP
+\fB "near" Layout\fP
+When "near" replicas are chosen, the multiple copies of a given chunk are laid
+out consecutively ("as close to each other as possible") across the stripes of
+the array.
+
+With an even number of devices, they will likely (unless some misalignment is
+present) lie at the very same offset on the different devices.
+.br
+This is as the "classic" RAID1+0; that is two groups of mirrored devices (in the
+example below the groups Device\ #1\ /\ #2 and Device\ #3\ /\ #4 are each a
+RAID1) both in turn forming a striped RAID0.
+
+.ne 10
+.B Example with 2\ copies per chunk and an even number\ (4) of devices:
+.TS
+tab(;);
+ C - - - -
+ C | C | C | C | C |
+| - | - | - | - | - |
+| C | C | C | C | C |
+| C | C | C | C | C |
+| C | C | C | C | C |
+| C | C | C | C | C |
+| C | C | C | C | C |
+| C | C | C | C | C |
+| - | - | - | - | - |
+ C C S C S
+ C C S C S
+ C C S S S
+ C C S S S.
+;
+;Device #1;Device #2;Device #3;Device #4
+0x00;0;0;1;1
+0x01;2;2;3;3
+\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.
+:;:;:;:;:
+\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.
+0x80;254;254;255;255
+;\\---------v---------/;\\---------v---------/
+;RAID1;RAID1
+;\\---------------------v---------------------/
+;RAID0
+.TE
+
+.ne 10
+.B Example with 2\ copies per chunk and an odd number\ (5) of devices:
+.TS
+tab(;);
+ C - - - - -
+ C | C | C | C | C | C |
+| - | - | - | - | - | - |
+| C | C | C | C | C | C |
+| C | C | C | C | C | C |
+| C | C | C | C | C | C |
+| C | C | C | C | C | C |
+| C | C | C | C | C | C |
+| C | C | C | C | C | C |
+| - | - | - | - | - | - |
+C.
+;
+;Dev #1;Dev #2;Dev #3;Dev #4;Dev #5
+0x00;0;0;1;1;2
+0x01;2;3;3;4;4
+\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.
+:;:;:;:;:;:
+\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.
+0x80;317;318;318;319;319
+;
+.TE
+
+.TP
+\fB "far" Layout\fP
+When "far" replicas are chosen, the multiple copies of a given chunk
+are laid out quite distant ("as far as reasonably possible") from each
+other.
+
+First a complete sequence of all data blocks (that is all the data one
+sees on the exported RAID10 block device) is striped over the
+devices. Then another (though "shifted") complete sequence of all data
+blocks; and so on (in the case of more than 2\ copies per chunk).
+
+The "shift" needed to prevent placing copies of the same chunks on the
+same devices is actually a cyclic permutation with offset\ 1 of each
+of the stripes within a complete sequence of chunks.
+.br
+The offset\ 1 is relative to the previous complete sequence of chunks,
+so in case of more than 2\ copies per chunk one gets the following
+offsets:
+.br
+1.\ complete sequence of chunks: offset\ =\ \ 0
+.br
+2.\ complete sequence of chunks: offset\ =\ \ 1
+.br
+3.\ complete sequence of chunks: offset\ =\ \ 2
+.br
+ :
+.br
+n.\ complete sequence of chunks: offset\ =\ n-1
+
+.ne 10
+.B Example with 2\ copies per chunk and an even number\ (4) of devices:
+.TS
+tab(;);
+ C - - - -
+ C | C | C | C | C |
+| - | - | - | - | - |
+| C | C | C | C | C | L
+| C | C | C | C | C | L
+| C | C | C | C | C | L
+| C | C | C | C | C | L
+| C | C | C | C | C | L
+| C | C | C | C | C | L
+| C | C | C | C | C | L
+| C | C | C | C | C | L
+| C | C | C | C | C | L
+| C | C | C | C | C | L
+| C | C | C | C | C | L
+| C | C | C | C | C | L
+| - | - | - | - | - |
+C.
+;
+;Device #1;Device #2;Device #3;Device #4
+;
+0x00;0;1;2;3;\\
+0x01;4;5;6;7;> [#]
+\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;:
+:;:;:;:;:;:
+\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;:
+0x40;252;253;254;255;/
+0x41;3;0;1;2;\\
+0x42;7;4;5;6;> [#]~
+\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;:
+:;:;:;:;:;:
+\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;:
+0x80;255;252;253;254;/
+;
+.TE
+
+.ne 10
+.B Example with 2\ copies per chunk and an odd number\ (5) of devices:
+.TS
+tab(;);
+ C - - - - -
+ C | C | C | C | C | C |
+| - | - | - | - | - | - |
+| C | C | C | C | C | C | L
+| C | C | C | C | C | C | L
+| C | C | C | C | C | C | L
+| C | C | C | C | C | C | L
+| C | C | C | C | C | C | L
+| C | C | C | C | C | C | L
+| C | C | C | C | C | C | L
+| C | C | C | C | C | C | L
+| C | C | C | C | C | C | L
+| C | C | C | C | C | C | L
+| C | C | C | C | C | C | L
+| C | C | C | C | C | C | L
+| - | - | - | - | - | - |
+C.
+;
+;Dev #1;Dev #2;Dev #3;Dev #4;Dev #5
+;
+0x00;0;1;2;3;4;\\
+0x01;5;6;7;8;9;> [#]
+\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;:
+:;:;:;:;:;:;:
+\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;:
+0x40;315;316;317;318;319;/
+0x41;4;0;1;2;3;\\
+0x42;9;5;6;7;8;> [#]~
+\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;:
+:;:;:;:;:;:;:
+\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;:
+0x80;319;315;316;317;318;/
+;
+.TE
+
+With [#]\ being the complete sequence of chunks and [#]~\ the cyclic permutation
+with offset\ 1 thereof (in the case of more than 2 copies per chunk there would
+be ([#]~)~,\ (([#]~)~)~,\ ...).
+
+The advantage of this layout is that MD can easily spread sequential reads over
+the devices, making them similar to RAID0 in terms of speed.
+.br
+The cost is more seeking for writes, making them substantially slower.
+
+.TP
+\fB"offset" Layout\fP
+When "offset" replicas are chosen, all the copies of a given chunk are
+striped consecutively ("offset by the stripe length after each other")
+over the devices.
+
+Explained in detail, <number of devices> consecutive chunks are
+striped over the devices, immediately followed by a "shifted" copy of
+these chunks (and by further such "shifted" copies in the case of more
+than 2\ copies per chunk).
+.br
+This pattern repeats for all further consecutive chunks of the
+exported RAID10 device (in other words: all further data blocks).
+
+The "shift" needed to prevent placing copies of the same chunks on the
+same devices is actually a cyclic permutation with offset\ 1 of each
+of the striped copies of <number of devices> consecutive chunks.
+.br
+The offset\ 1 is relative to the previous striped copy of <number of
+devices> consecutive chunks, so in case of more than 2\ copies per
+chunk one gets the following offsets:
+.br
+1.\ <number of devices> consecutive chunks: offset\ =\ \ 0
+.br
+2.\ <number of devices> consecutive chunks: offset\ =\ \ 1
+.br
+3.\ <number of devices> consecutive chunks: offset\ =\ \ 2
+.br
+ :
+.br
+n.\ <number of devices> consecutive chunks: offset\ =\ n-1
+
+.ne 10
+.B Example with 2\ copies per chunk and an even number\ (4) of devices:
+.TS
+tab(;);
+ C - - - -
+ C | C | C | C | C |
+| - | - | - | - | - |
+| C | C | C | C | C | L
+| C | C | C | C | C | L
+| C | C | C | C | C | L
+| C | C | C | C | C | L
+| C | C | C | C | C | L
+| C | C | C | C | C | L
+| C | C | C | C | C | L
+| C | C | C | C | C | L
+| C | C | C | C | C | L
+| - | - | - | - | - |
+C.
+;
+;Device #1;Device #2;Device #3;Device #4
+;
+0x00;0;1;2;3;) AA
+0x01;3;0;1;2;) AA~
+0x02;4;5;6;7;) AB
+0x03;7;4;5;6;) AB~
+\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;) \.\.\.
+:;:;:;:;:; :
+\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;) \.\.\.
+0x79;251;252;253;254;) EX
+0x80;254;251;252;253;) EX~
+;
+.TE
+
+.ne 10
+.B Example with 2\ copies per chunk and an odd number\ (5) of devices:
+.TS
+tab(;);
+ C - - - - -
+ C | C | C | C | C | C |
+| - | - | - | - | - | - |
+| C | C | C | C | C | C | L
+| C | C | C | C | C | C | L
+| C | C | C | C | C | C | L
+| C | C | C | C | C | C | L
+| C | C | C | C | C | C | L
+| C | C | C | C | C | C | L
+| C | C | C | C | C | C | L
+| C | C | C | C | C | C | L
+| C | C | C | C | C | C | L
+| - | - | - | - | - | - |
+C.
+;
+;Dev #1;Dev #2;Dev #3;Dev #4;Dev #5
+;
+0x00;0;1;2;3;4;) AA
+0x01;4;0;1;2;3;) AA~
+0x02;5;6;7;8;9;) AB
+0x03;9;5;6;7;8;) AB~
+\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;) \.\.\.
+:;:;:;:;:;:; :
+\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;) \.\.\.
+0x79;314;315;316;317;318;) EX
+0x80;318;314;315;316;317;) EX~
+;
+.TE
+
+With AA,\ AB,\ ..., AZ,\ BA,\ ... being the sets of <number of devices> consecutive
+chunks and AA~,\ AB~,\ ..., AZ~,\ BA~,\ ... the cyclic permutations with offset\ 1
+thereof (in the case of more than 2 copies per chunk there would be (AA~)~,\ ...
+as well as ((AA~)~)~,\ ... and so on).
+
+This should give similar read characteristics to "far" if a suitably large chunk
+size is used, but without as much seeking for writes.
+.PP
+
+
+It should be noted that the number of devices in a RAID10 array need
+not be a multiple of the number of replicas of each data block; however,
+there must be at least as many devices as replicas.
+
+If, for example, an array is created with 5 devices and 2 replicas,
+then space equivalent to 2.5 of the devices will be available, and
+every block will be stored on two different devices.
+
+Finally, it is possible to have an array with both "near" and "far"
+copies. If an array is configured with 2 near copies and 2 far
+copies, then there will be a total of 4 copies of each block, each on
+a different drive. This is an artifact of the implementation and is
+unlikely to be of real value.
+
+.SS MULTIPATH
+
+MULTIPATH is not really a RAID at all as there is only one real device
+in a MULTIPATH md array. However there are multiple access points
+(paths) to this device, and one of these paths might fail, so there
+are some similarities.
+
+A MULTIPATH array is composed of a number of logically different
+devices, often fibre channel interfaces, that all refer to the same
+real device. If one of these interfaces fails (e.g. due to cable
+problems), the MULTIPATH driver will attempt to redirect requests to
+another interface.
+
+The MULTIPATH driver is not receiving any ongoing development and
+should be considered a legacy driver. The device-mapper based
+multipath drivers should be preferred for new installations.
+
+.SS FAULTY
+The FAULTY md module is provided for testing purposes. A FAULTY array
+has exactly one component device and is normally assembled without a
+superblock, so the md array created provides direct access to all of
+the data in the component device.
+
+The FAULTY module may be requested to simulate faults to allow testing
+of other md levels or of filesystems. Faults can be chosen to trigger
+on read requests or write requests, and can be transient (a subsequent
+read/write at the address will probably succeed) or persistent
+(subsequent read/write of the same address will fail). Further, read
+faults can be "fixable" meaning that they persist until a write
+request at the same address.
+
+Fault types can be requested with a period. In this case, the fault
+will recur repeatedly after the given number of requests of the
+relevant type. For example if persistent read faults have a period of
+100, then every 100th read request would generate a fault, and the
+faulty sector would be recorded so that subsequent reads on that
+sector would also fail.
+
+There is a limit to the number of faulty sectors that are remembered.
+Faults generated after this limit is exhausted are treated as
+transient.
+
+The list of faulty sectors can be flushed, and the active list of
+failure modes can be cleared.
+
+.SS UNCLEAN SHUTDOWN
+
+When changes are made to a RAID1, RAID4, RAID5, RAID6, or RAID10 array
+there is a possibility of inconsistency for short periods of time as
+each update requires at least two blocks to be written to different
+devices, and these writes probably won't happen at exactly the same
+time. Thus if a system with one of these arrays is shut down in the
+middle of a write operation (e.g. due to power failure), the array may
+not be consistent.
+
+To handle this situation, the md driver marks an array as "dirty"
+before writing any data to it, and marks it as "clean" when the array
+is being disabled, e.g. at shutdown. If the md driver finds an array
+to be dirty at startup, it proceeds to correct any possible
+inconsistency. For RAID1, this involves copying the contents of the
+first drive onto all other drives. For RAID4, RAID5 and RAID6 this
+involves recalculating the parity for each stripe and making sure that
+the parity block has the correct data. For RAID10 it involves copying
+one of the replicas of each block onto all the others. This process,
+known as "resynchronising" or "resync" is performed in the background.
+The array can still be used, though possibly with reduced performance.
+
+If a RAID4, RAID5 or RAID6 array is degraded (missing at least one
+drive, two for RAID6) when it is restarted after an unclean shutdown, it cannot
+recalculate parity, and so it is possible that data might be
+undetectably corrupted. The 2.4 md driver
+.B does not
+alert the operator to this condition. The 2.6 md driver will fail to
+start an array in this condition without manual intervention, though
+this behaviour can be overridden by a kernel parameter.
+
+.SS RECOVERY
+
+If the md driver detects a write error on a device in a RAID1, RAID4,
+RAID5, RAID6, or RAID10 array, it immediately disables that device
+(marking it as faulty) and continues operation on the remaining
+devices. If there are spare drives, the driver will start recreating
+on one of the spare drives the data which was on that failed drive,
+either by copying a working drive in a RAID1 configuration, or by
+doing calculations with the parity block on RAID4, RAID5 or RAID6, or
+by finding and copying originals for RAID10.
+
+In kernels prior to about 2.6.15, a read error would cause the same
+effect as a write error. In later kernels, a read-error will instead
+cause md to attempt a recovery by overwriting the bad block, i.e. it
+will find the correct data from elsewhere, write it over the block
+that failed, and then try to read it back again. If either the write
+or the re-read fail, md will treat the error the same way that a write
+error is treated, and will fail the whole device.
+
+While this recovery process is happening, the md driver will monitor
+accesses to the array and will slow down the rate of recovery if other
+activity is happening, so that normal access to the array will not be
+unduly affected. When no other activity is happening, the recovery
+process proceeds at full speed. The actual speed targets for the two
+different situations can be controlled by the
+.B speed_limit_min
+and
+.B speed_limit_max
+control files mentioned below.
+
+.SS SCRUBBING AND MISMATCHES
+
+As storage devices can develop bad blocks at any time it is valuable
+to regularly read all blocks on all devices in an array so as to catch
+such bad blocks early. This process is called
+.IR scrubbing .
+
+md arrays can be scrubbed by writing either
+.I check
+or
+.I repair
+to the file
+.I md/sync_action
+in the
+.I sysfs
+directory for the device.
+
+Requesting a scrub will cause
+.I md
+to read every block on every device in the array, and check that the
+data is consistent. For RAID1 and RAID10, this means checking that the copies
+are identical. For RAID4, RAID5, RAID6 this means checking that the
+parity block is (or blocks are) correct.
+
+If a read error is detected during this process, the normal read-error
+handling causes correct data to be found from other devices and to be
+written back to the faulty device. In many cases this will
+effectively
+.I fix
+the bad block.
+
+If all blocks read successfully but are found to not be consistent,
+then this is regarded as a
+.IR mismatch .
+
+If
+.I check
+was used, then no action is taken to handle the mismatch; it is simply
+recorded.
+If
+.I repair
+was used, then a mismatch will be repaired in the same way that
+.I resync
+repairs arrays. For RAID5/RAID6 new parity blocks are written. For RAID1/RAID10,
+all but one block are overwritten with the content of that one block.
+
+A count of mismatches is recorded in the
+.I sysfs
+file
+.IR md/mismatch_cnt .
+This is set to zero when a
+scrub starts and is incremented whenever a sector is
+found that is a mismatch.
+.I md
+normally works in units much larger than a single sector and when it
+finds a mismatch, it does not determine exactly how many actual sectors were
+affected but simply adds the number of sectors in the IO unit that was
+used. So a value of 128 could simply mean that a single 64KB check
+found an error (128 x 512bytes = 64KB).
+
+If an array is created by
+.I mdadm
+with
+.I \-\-assume\-clean
+then a subsequent check could be expected to find some mismatches.
+
+On a truly clean RAID5 or RAID6 array, any mismatches should indicate
+a hardware problem at some level - software issues should never cause
+such a mismatch.
+
+However on RAID1 and RAID10 it is possible for software issues to
+cause a mismatch to be reported. This does not necessarily mean that
+the data on the array is corrupted. It could simply be that the
+system does not care what is stored on that part of the array - it is
+unused space.
+
+The most likely cause for an unexpected mismatch on RAID1 or RAID10
+occurs if a swap partition or swap file is stored on the array.
+
+When the swap subsystem wants to write a page of memory out, it flags
+the page as 'clean' in the memory manager and requests the swap device
+to write it out. It is quite possible that the memory will be
+changed while the write-out is happening. In that case the 'clean'
+flag will be found to be clear when the write completes and so the
+swap subsystem will simply forget that the swapout had been attempted,
+and will possibly choose a different page to write out.
+
+If the swap device was on RAID1 (or RAID10), then the data is sent
+from memory to a device twice (or more depending on the number of
+devices in the array). Thus it is possible that the memory gets changed
+between the times it is sent, so different data can be written to
+the different devices in the array. This will be detected by
+.I check
+as a mismatch. However it does not reflect any corruption as the
+block where this mismatch occurs is being treated by the swap system as
+being empty, and the data will never be read from that block.
+
+It is conceivable for a similar situation to occur on non-swap files,
+though it is less likely.
+
+Thus the
+.I mismatch_cnt
+value can not be interpreted very reliably on RAID1 or RAID10,
+especially when the device is used for swap.
+
+
+.SS BITMAP WRITE-INTENT LOGGING
+
+From Linux 2.6.13,
+.I md
+supports a bitmap based write-intent log. If configured, the bitmap
+is used to record which blocks of the array may be out of sync.
+Before any write request is honoured, md will make sure that the
+corresponding bit in the log is set. After a period of time with no
+writes to an area of the array, the corresponding bit will be cleared.
+
+This bitmap is used for two optimisations.
+
+Firstly, after an unclean shutdown, the resync process will consult
+the bitmap and only resync those blocks that correspond to bits in the
+bitmap that are set. This can dramatically reduce resync time.
+
+Secondly, when a drive fails and is removed from the array, md stops
+clearing bits in the intent log. If that same drive is re-added to
+the array, md will notice and will only recover the sections of the
+drive that are covered by bits in the intent log that are set. This
+can allow a device to be temporarily removed and reinserted without
+causing an enormous recovery cost.
+
+The intent log can be stored in a file on a separate device, or it can
+be stored near the superblocks of an array which has superblocks.
+
+It is possible to add an intent log to an active array, or remove an
+intent log if one is present.
+
+In 2.6.13, intent bitmaps are only supported with RAID1. Other levels
+with redundancy are supported from 2.6.15.
+
+.SS BAD BLOCK LIST
+
+From Linux 3.5 each device in an
+.I md
+array can store a list of known-bad-blocks. This list is 4K in size
+and usually positioned at the end of the space between the superblock
+and the data.
+
+When a block cannot be read and cannot be repaired by writing data
+recovered from other devices, the address of the block is stored in
+the bad block list. Similarly if an attempt to write a block fails,
+the address will be recorded as a bad block. If attempting to record
+the bad block fails, the whole device will be marked faulty.
+
+Attempting to read from a known bad block will cause a read error.
+Attempting to write to a known bad block will be ignored if any write
+errors have been reported by the device. If there have been no write
+errors then the data will be written to the known bad block and if
+that succeeds, the address will be removed from the list.
+
+This allows an array to fail more gracefully - a few blocks on
+different devices can be faulty without taking the whole array out of
+action.
+
+The list is particularly useful when recovering to a spare. If a few blocks
+cannot be read from the other devices, the bulk of the recovery can
+complete and those few bad blocks will be recorded in the bad block list.
+
+.SS RAID WRITE HOLE
+
+Due to the non-atomic nature of RAID write operations,
+interruption of write operations (system crash, etc.) to RAID456
+array can lead to inconsistent parity and data loss (so called
+RAID-5 write hole).
+To plug the write hole md supports two mechanisms described below.
+
+.TP
+DIRTY STRIPE JOURNAL
+From Linux 4.4, md supports a write-ahead journal for RAID456.
+When the array is created, an additional journal device can be added to
+the array through the write-journal option. The RAID write journal works
+similarly to file system journals. Before writing to the data
+disks, md persists data AND parity of the stripe to the journal
+device. After crashes, md searches the journal device for
+incomplete write operations, and replays them to the data disks.
+
+When the journal device fails, the RAID array is forced to run in
+read-only mode.
+
+.TP
+PARTIAL PARITY LOG
+From Linux 4.12 md supports Partial Parity Log (PPL) for RAID5 arrays only.
+Partial parity for a write operation is the XOR of stripe data chunks not
+modified by the write. PPL is stored in the metadata region of RAID member drives,
+no additional journal drive is needed.
+After crashes, if one of the not modified data disks of
+the stripe is missing, this updated parity can be used to recover its
+data.
+
+This mechanism is documented more fully in the file
+Documentation/md/raid5-ppl.rst
+
+.SS WRITE-BEHIND
+
+From Linux 2.6.14,
+.I md
+supports WRITE-BEHIND on RAID1 arrays.
+
+This allows certain devices in the array to be flagged as
+.IR write-mostly .
+MD will only read from such devices if there is no
+other option.
+
+If a write-intent bitmap is also provided, write requests to
+write-mostly devices will be treated as write-behind requests and md
+will not wait for writes to those requests to complete before
+reporting the write as complete to the filesystem.
+
+This allows for a RAID1 with WRITE-BEHIND to be used to mirror data
+over a slow link to a remote computer (providing the link isn't too
+slow). The extra latency of the remote link will not slow down normal
+operations, but the remote system will still have a reasonably
+up-to-date copy of all data.
+
+.SS FAILFAST
+
+From Linux 4.10,
+.I
+md
+supports FAILFAST for RAID1 and RAID10 arrays. This is a flag that
+can be set on individual drives, though it is usually set on all
+drives, or no drives.
+
+When
+.I md
+sends an I/O request to a drive that is marked as FAILFAST, and when
+the array could survive the loss of that drive without losing data,
+.I md
+will request that the underlying device does not perform any retries.
+This means that a failure will be reported to
+.I md
+promptly, and it can mark the device as faulty and continue using the
+other device(s).
+.I md
+cannot control the timeout that the underlying devices use to
+determine failure. Any changes desired to that timeout must be set
+explicitly on the underlying device, separately from using
+.IR mdadm .
+
+If a FAILFAST request does fail, and if it is still safe to mark the
+device as faulty without data loss, that will be done and the array
+will continue functioning on a reduced number of devices. If it is not
+possible to safely mark the device as faulty,
+.I md
+will retry the request without disabling retries in the underlying
+device. In any case,
+.I md
+will not attempt to repair read errors on a device marked as FAILFAST
+by writing out the correct data. It will just mark the device as faulty.
+
+FAILFAST is appropriate for storage arrays that have a low probability
+of true failure, but will sometimes introduce unacceptable delays to
+I/O requests while performing internal maintenance. The value of
+setting FAILFAST involves a trade-off. The gain is that the chance of
+unacceptable delays is substantially reduced. The cost is that the
+unlikely event of data-loss on one device is slightly more likely to
+result in data-loss for the array.
+
+When a device in an array using FAILFAST is marked as faulty, it will
+usually become usable again in a short while.
+.I mdadm
+makes no attempt to detect that possibility. Some separate
+mechanism, tuned to the specific details of the expected failure modes,
+needs to be created to monitor devices to see when they return to full
+functionality, and to then re-add them to the array. In order for
+this "re-add" functionality to be effective, an array using FAILFAST
+should always have a write-intent bitmap.
+
+.SS RESTRIPING
+
+.IR Restriping ,
+also known as
+.IR Reshaping ,
+is the process of re-arranging the data stored in each stripe into a
+new layout. This might involve changing the number of devices in the
+array (so the stripes are wider), changing the chunk size (so stripes
+are deeper or shallower), or changing the arrangement of data and
+parity (possibly changing the RAID level, e.g. 1 to 5 or 5 to 6).
+
+As of Linux 2.6.35, md can reshape a RAID4, RAID5, or RAID6 array to
+have a different number of devices (more or fewer) and to have a
+different layout or chunk size. It can also convert between these
+different RAID levels. It can also convert between RAID0 and RAID10,
+and between RAID0 and RAID4 or RAID5.
+Other possibilities may follow in future kernels.
+
+During any restripe process there is a 'critical section' during which
+live data is being overwritten on disk. For the operation of
+increasing the number of drives in a RAID5, this critical section
+covers the first few stripes (the number being the product of the old
+and new number of devices). After this critical section is passed,
+data is only written to areas of the array which no longer hold live
+data \(em the live data has already been relocated away.
+
+For a reshape which reduces the number of devices, the 'critical
+section' is at the end of the reshape process.
+
+md is not able to ensure data preservation if there is a crash
+(e.g. power failure) during the critical section. If md is asked to
+start an array which failed during a critical section of restriping,
+it will fail to start the array.
+
+To deal with this possibility, a user-space program must
+.IP \(bu 4
+Disable writes to that section of the array (using the
+.B sysfs
+interface),
+.IP \(bu 4
+take a copy of the data somewhere (i.e. make a backup),
+.IP \(bu 4
+allow the process to continue and invalidate the backup and restore
+write access once the critical section is passed, and
+.IP \(bu 4
+provide for restoring the critical data before restarting the array
+after a system crash.
+.PP
+
+.B mdadm
+versions from 2.4 do this for growing a RAID5 array.
+
+For operations that do not change the size of the array, like simply
+increasing chunk size, or converting RAID5 to RAID6 with one extra
+device, the entire process is the critical section. In this case, the
+restripe will need to progress in stages, as a section is suspended,
+backed up, restriped, and released.
+
+.SS SYSFS INTERFACE
+Each block device appears as a directory in
+.I sysfs
+(which is usually mounted at
+.BR /sys ).
+For MD devices, this directory will contain a subdirectory called
+.B md
+which contains various files for providing access to information about
+the array.
+
+This interface is documented more fully in the file
+.B Documentation/admin-guide/md.rst
+which is distributed with the kernel sources. That file should be
+consulted for full documentation. The following are just a selection
+of attribute files that are available.
+
+.TP
+.B md/sync_speed_min
+This value, if set, overrides the system-wide setting in
+.B /proc/sys/dev/raid/speed_limit_min
+for this array only.
+Writing the value
+.B "system"
+to this file will cause the system-wide setting to have effect.
+
+.TP
+.B md/sync_speed_max
+This is the partner of
+.B md/sync_speed_min
+and overrides
+.B /proc/sys/dev/raid/speed_limit_max
+described below.
+
+.TP
+.B md/sync_action
+This can be used to monitor and control the resync/recovery process of
+MD.
+In particular, writing "check" here will cause the array to read all
+data blocks and check that they are consistent (e.g. parity is correct,
+or all mirror replicas are the same). Any discrepancies found are
+.B NOT
+corrected.
+
+A count of problems found will be stored in
+.BR md/mismatch_cnt .
+
+Alternately, "repair" can be written which will cause the same check
+to be performed, but any errors will be corrected.
+
+Finally, "idle" can be written to stop the check/repair process.
+
+.TP
+.B md/stripe_cache_size
+This is only available on RAID5 and RAID6. It records the size (in
+pages per device) of the stripe cache which is used for synchronising
+all write operations to the array and all read operations if the array
+is degraded. The default is 256. Valid values are 17 to 32768.
+Increasing this number can increase performance in some situations, at
+some cost in system memory. Note, setting this value too high can
+result in an "out of memory" condition for the system.
+
+memory_consumed = system_page_size * nr_disks * stripe_cache_size
+
+.TP
+.B md/preread_bypass_threshold
+This is only available on RAID5 and RAID6. This variable sets the
+number of times MD will service a full-stripe-write before servicing a
+stripe that requires some "prereading". For fairness this defaults to
+1. Valid values are 0 to stripe_cache_size. Setting this to 0
+maximizes sequential-write throughput at the cost of fairness to threads
+doing small or random writes.
+
+.TP
+.B md/bitmap/backlog
+The value stored in the file only has any effect on RAID1 when write-mostly
+devices are active, and write requests to those devices are processed in the
+background.
+
+This variable sets a limit on the number of concurrent background writes.
+The valid values are 0 to 16383; 0 means that write-behind is not allowed,
+while any other number means it can happen. If there are more write requests
+than the number, new writes will be synchronous.
+
+.TP
+.B md/bitmap/can_clear
+This is for externally managed bitmaps, where the kernel writes the bitmap
+itself, but metadata describing the bitmap is managed by mdmon or similar.
+
+When the array is degraded, bits mustn't be cleared. When the array becomes
+optimal again, bits can be cleared, but first the metadata needs to record
+the current event count. So md sets this to 'false' and notifies mdmon,
+then mdmon updates the metadata and writes 'true'.
+
+There is no code in mdmon to actually do this, so maybe it doesn't even
+work.
+
+.TP
+.B md/bitmap/chunksize
+The bitmap chunksize can only be changed when no bitmap is active, and
+the value should be a power of 2 and at least 512.
+
+.TP
+.B md/bitmap/location
+This indicates where the write-intent bitmap for the array is stored.
+It can be "none" or "file" or a signed offset from the array metadata
+- measured in sectors. You cannot set a file by writing here - that can
+only be done with the SET_BITMAP_FILE ioctl.
+
+Writing 'none' to 'bitmap/location' will clear the bitmap, and the previous
+location value must be written to it to restore the bitmap.
+
+.TP
+.B md/bitmap/max_backlog_used
+This keeps track of the maximum number of concurrent write-behind requests
+for an md array, writing any value to this file will clear it.
+
+.TP
+.B md/bitmap/metadata
+This can be 'internal' or 'clustered' or 'external'. 'internal' is set
+by default, which means the metadata for bitmap is stored in the first 256
+bytes of the bitmap space. 'clustered' means separate bitmap metadata are
+used for each cluster node. 'external' means that bitmap metadata is managed
+externally to the kernel.
+
+.TP
+.B md/bitmap/space
+This shows the space (in sectors) which is available at md/bitmap/location,
+and allows the kernel to know when it is safe to resize the bitmap to match
+a resized array. It should be big enough to contain the total bytes in the bitmap.
+
+For 1.0 metadata, assume we can use up to the superblock if before, else
+to 4K beyond superblock. For other metadata versions, assume no change is
+possible.
+
+.TP
+.B md/bitmap/time_base
+This shows the time (in seconds) between disk flushes, and is used when looking
+for bits in the bitmap to be cleared.
+
+The default value is 5 seconds, and it should be an unsigned long value.
+
+.SS KERNEL PARAMETERS
+
+The md driver recognises several different kernel parameters.
+.TP
+.B raid=noautodetect
+This will disable the normal detection of md arrays that happens at
+boot time. If a drive is partitioned with MS-DOS style partitions,
+then if any of the 4 main partitions has a partition type of 0xFD,
+then that partition will normally be inspected to see if it is part of
+an MD array, and if any full arrays are found, they are started. This
+kernel parameter disables this behaviour.
+
+.TP
+.B raid=partitionable
+.TP
+.B raid=part
+These are available in 2.6 and later kernels only. They indicate that
+autodetected MD arrays should be created as partitionable arrays, with
+a different major device number to the original non-partitionable md
+arrays. The device number is listed as
+.I mdp
+in
+.IR /proc/devices .
+
+.TP
+.B md_mod.start_ro=1
+.TP
+.B /sys/module/md_mod/parameters/start_ro
+This tells md to start all arrays in read-only mode. This is a soft
+read-only that will automatically switch to read-write on the first
+write request. However until that write request, nothing is written
+to any device by md, and in particular, no resync or recovery
+operation is started.
+
+.TP
+.B md_mod.start_dirty_degraded=1
+.TP
+.B /sys/module/md_mod/parameters/start_dirty_degraded
+As mentioned above, md will not normally start a RAID4, RAID5, or
+RAID6 that is both dirty and degraded as this situation can imply
+hidden data loss. This can be awkward if the root filesystem is
+affected. Using this module parameter allows such arrays to be started
+at boot time. It should be understood that there is a real (though
+small) risk of data corruption in this situation.
+
+.TP
+.BI md= n , dev , dev ,...
+.TP
+.BI md=d n , dev , dev ,...
+This tells the md driver to assemble
+.BI /dev/md n
+from the listed devices. It is only necessary to start the device
+holding the root filesystem this way. Other arrays are best started
+once the system is booted.
+
+In 2.6 kernels, the
+.B d
+immediately after the
+.B =
+indicates that a partitionable device (e.g.
+.BR /dev/md/d0 )
+should be created rather than the original non-partitionable device.
+
+.TP
+.BI md= n , l , c , i , dev...
+This tells the md driver to assemble a legacy RAID0 or LINEAR array
+without a superblock.
+.I n
+gives the md device number,
+.I l
+gives the level, 0 for RAID0 or \-1 for LINEAR,
+.I c
+gives the chunk size as a base-2 logarithm offset by twelve, so 0
+means 4K, 1 means 8K.
+.I i
+is ignored (legacy support).
+
+.SH FILES
+.TP
+.B /proc/mdstat
+Contains information about the status of currently running arrays.
+.TP
+.B /proc/sys/dev/raid/speed_limit_min
+A readable and writable file that reflects the current "goal" rebuild
+speed for times when non-rebuild activity is current on an array.
+The speed is in Kibibytes per second, and is a per-device rate, not a
+per-array rate (which means that an array with more disks will shuffle
+more data for a given speed). The default is 1000.
+
+.TP
+.B /proc/sys/dev/raid/speed_limit_max
+A readable and writable file that reflects the current "goal" rebuild
+speed for times when no non-rebuild activity is current on an array.
+The default is 200,000.
+
+.SH SEE ALSO
+.BR mdadm (8)
diff --git a/md5.h b/md5.h
new file mode 100644
index 0000000..145970d
--- /dev/null
+++ b/md5.h
@@ -0,0 +1,136 @@
+/* Declaration of functions and data types used for MD5 sum computing
+ library functions.
+ Copyright (C) 1995-1997,1999-2005 Free Software Foundation, Inc.
+
+ NOTE: The canonical source of this file is maintained with the GNU C
+ Library. Bugs can be reported to bug-glibc@prep.ai.mit.edu.
+
+ This program is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by the
+ Free Software Foundation; either version 2, or (at your option) any
+ later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software Foundation,
+ Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
+
+#ifndef _MD5_H
+#define _MD5_H 1
+
+#include <stdio.h>
+
+#if HAVE_INTTYPES_H
+# include <inttypes.h>
+#endif
+#if HAVE_STDINT_H || _LIBC || defined __UCLIBC__
+# include <stdint.h>
+#endif
+
+#ifndef __GNUC_PREREQ
+# if defined __GNUC__ && defined __GNUC_MINOR__
+# define __GNUC_PREREQ(maj, min) \
+ ((__GNUC__ << 16) + __GNUC_MINOR__ >= ((maj) << 16) + (min))
+# else
+# define __GNUC_PREREQ(maj, min) 0
+# endif
+#endif
+
+#ifndef __THROW
+# if defined __cplusplus && __GNUC_PREREQ (2,8)
+# define __THROW throw ()
+# else
+# define __THROW
+# endif
+#endif
+
+#ifndef __attribute__
+# if ! __GNUC_PREREQ (2,8) || __STRICT_ANSI__
+# define __attribute__(x)
+# endif
+#endif
+
+#ifndef _LIBC
+# define __md5_buffer md5_buffer
+# define __md5_finish_ctx md5_finish_ctx
+# define __md5_init_ctx md5_init_ctx
+# define __md5_process_block md5_process_block
+# define __md5_process_bytes md5_process_bytes
+# define __md5_read_ctx md5_read_ctx
+# define __md5_stream md5_stream
+#endif
+
+typedef uint32_t md5_uint32;
+
+/* Structure to save state of computation between the single steps.
+ A context is set up by md5_init_ctx, updated by md5_process_block /
+ md5_process_bytes, and read out by md5_finish_ctx / md5_read_ctx. */
+struct md5_ctx
+{
+ /* NOTE(review): presumably the four 32-bit MD5 state words A, B, C, D
+ of RFC 1321 -- confirm against md5.c. */
+ md5_uint32 A;
+ md5_uint32 B;
+ md5_uint32 C;
+ md5_uint32 D;
+
+ /* NOTE(review): presumably a running byte count split across two
+ 32-bit words -- confirm against md5.c. */
+ md5_uint32 total[2];
+ /* NOTE(review): presumably the number of bytes pending in buffer --
+ confirm against md5.c. */
+ md5_uint32 buflen;
+ /* 128-byte staging area, aligned like an md5_uint32. */
+ char buffer[128] __attribute__ ((__aligned__ (__alignof__ (md5_uint32))));
+};
+
+/*
+ * The following three functions build up the low-level interface used in
+ * the functions `md5_stream' and `md5_buffer'.
+ */
+
+/* Initialize structure containing state of computation.
+ (RFC 1321, 3.3: Step 3) */
+extern void __md5_init_ctx (struct md5_ctx *ctx) __THROW;
+
+/* Starting with the result of former calls of this function (or the
+ initialization function), update the context for the next LEN bytes
+ starting at BUFFER.
+ It is necessary that LEN is a multiple of 64!!! */
+extern void __md5_process_block (const void *buffer, size_t len,
+ struct md5_ctx *ctx) __THROW;
+
+/* Starting with the result of former calls of this function (or the
+ initialization function), update the context for the next LEN bytes
+ starting at BUFFER.
+ It is NOT required that LEN is a multiple of 64. */
+extern void __md5_process_bytes (const void *buffer, size_t len,
+ struct md5_ctx *ctx) __THROW;
+
+/* Process the remaining bytes in the buffer and put result from CTX
+ in first 16 bytes following RESBUF. The result is always in little
+ endian byte order, so that a byte-wise output yields to the wanted
+ ASCII representation of the message digest.
+
+ IMPORTANT: On some systems it is required that RESBUF be correctly
+ aligned for a 32 bits value. */
+extern void *__md5_finish_ctx (struct md5_ctx *ctx, void *resbuf) __THROW;
+
+
+/* Put result from CTX in first 16 bytes following RESBUF. The result is
+ always in little endian byte order, so that a byte-wise output yields
+ to the wanted ASCII representation of the message digest.
+
+ IMPORTANT: On some systems it is required that RESBUF is correctly
+ aligned for a 32 bits value. */
+extern void *__md5_read_ctx (const struct md5_ctx *ctx, void *resbuf) __THROW;
+
+
+/* Compute MD5 message digest for bytes read from STREAM. The
+ resulting message digest number will be written into the 16 bytes
+ beginning at RESBLOCK. */
+extern int __md5_stream (FILE *stream, void *resblock) __THROW;
+
+/* Compute MD5 message digest for LEN bytes beginning at BUFFER. The
+ result is always in little endian byte order, so that a byte-wise
+ output yields to the wanted ASCII representation of the message
+ digest. */
+extern void *__md5_buffer (const char *buffer, size_t len,
+ void *resblock) __THROW;
+
+#endif /* md5.h */
diff --git a/md_p.h b/md_p.h
new file mode 100644
index 0000000..358a28c
--- /dev/null
+++ b/md_p.h
@@ -0,0 +1,295 @@
+/*
+ md_p.h : physical layout of Linux RAID devices
+ Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ You should have received a copy of the GNU General Public License
+ (for example /usr/src/linux/COPYING); if not, write to the Free
+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/
+
+#ifndef _MD_P_H
+#define _MD_P_H
+
+/*
+ * RAID superblock.
+ *
+ * The RAID superblock maintains some statistics on each RAID configuration.
+ * Each real device in the RAID set contains it near the end of the device.
+ * Some of the ideas are copied from the ext2fs implementation.
+ *
+ * We currently use 4096 bytes as follows:
+ *
+ * word offset function
+ *
+ * 0 - 31 Constant generic RAID device information.
+ * 32 - 63 Generic state information.
+ * 64 - 127 Personality specific information.
+ * 128 - 991 27 32-word descriptors of the disks in the raid set (MD_SB_DISKS).
+ * (none) Reserved (MD_SB_RESERVED_WORDS currently evaluates to 0).
+ * 992 - 1023 Disk specific descriptor (MD_SB_DESCRIPTOR_OFFSET).
+ */
+
+/*
+ * If x is the real device size in bytes, we return an apparent size of:
+ *
+ * y = (x & ~(MD_RESERVED_BYTES - 1)) - MD_RESERVED_BYTES
+ *
+ * and place the 4kB superblock at offset y.
+ */
+/* Size of the area reserved at the end of each device, in bytes, 512-byte
+ * sectors and BLOCK_SIZE blocks respectively. */
+#define MD_RESERVED_BYTES (64 * 1024)
+#define MD_RESERVED_SECTORS (MD_RESERVED_BYTES / 512)
+#define MD_RESERVED_BLOCKS (MD_RESERVED_BYTES / BLOCK_SIZE)
+
+/*
+ * Apparent device size for a real size x: round x down to a multiple of the
+ * reserved area, then subtract one reserved area.  The argument is
+ * parenthesized so that expressions such as (a | b) or (c ? d : e) are
+ * masked as a whole rather than only their last operand.
+ */
+#define MD_NEW_SIZE_SECTORS(x) (((x) & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS)
+#define MD_NEW_SIZE_BLOCKS(x) (((x) & ~(MD_RESERVED_BLOCKS - 1)) - MD_RESERVED_BLOCKS)
+
+#define MD_SB_BYTES 4096
+#define MD_SB_WORDS (MD_SB_BYTES / 4)
+#define MD_SB_BLOCKS (MD_SB_BYTES / BLOCK_SIZE)
+#define MD_SB_SECTORS (MD_SB_BYTES / 512)
+
+/*
+ * The following are counted in 32-bit words
+ */
+#define MD_SB_GENERIC_OFFSET 0
+#define MD_SB_PERSONALITY_OFFSET 64
+#define MD_SB_DISKS_OFFSET 128
+#define MD_SB_DESCRIPTOR_OFFSET 992
+
+#define MD_SB_GENERIC_CONSTANT_WORDS 32
+#define MD_SB_GENERIC_STATE_WORDS 32
+#define MD_SB_GENERIC_WORDS (MD_SB_GENERIC_CONSTANT_WORDS + MD_SB_GENERIC_STATE_WORDS)
+#define MD_SB_PERSONALITY_WORDS 64
+#define MD_SB_DESCRIPTOR_WORDS 32
+#define MD_SB_DISKS 27
+#define MD_SB_DISKS_WORDS (MD_SB_DISKS*MD_SB_DESCRIPTOR_WORDS)
+#define MD_SB_RESERVED_WORDS (1024 - MD_SB_GENERIC_WORDS - MD_SB_PERSONALITY_WORDS - MD_SB_DISKS_WORDS - MD_SB_DESCRIPTOR_WORDS)
+#define MD_SB_EQUAL_WORDS (MD_SB_GENERIC_WORDS + MD_SB_PERSONALITY_WORDS + MD_SB_DISKS_WORDS)
+
+/*
+ * Device "operational" state bits
+ */
+#define MD_DISK_FAULTY 0 /* disk is faulty / operational */
+#define MD_DISK_ACTIVE 1 /* disk is running but may not be in sync */
+#define MD_DISK_SYNC 2 /* disk is in sync with the raid set */
+#define MD_DISK_REMOVED 3 /* disk has been removed from the raid set */
+#define MD_DISK_CLUSTER_ADD 4 /* Initiate a disk add across the cluster
+ * For clustered environments only.
+ */
+#define MD_DISK_CANDIDATE 5 /* disk is added as spare (local) until confirmed
+ * For clustered environments only.
+ */
+
+#define MD_DISK_WRITEMOSTLY 9 /* disk is "write-mostly" in RAID1 config.
+ * read requests will only be sent here in
+ * dire need
+ */
+#define MD_DISK_FAILFAST 10 /* Fewer retries, more failures */
+
+#define MD_DISK_REPLACEMENT 17
+#define MD_DISK_JOURNAL 18 /* disk is used as the write journal in RAID-5/6 */
+
+#define MD_DISK_ROLE_SPARE 0xffff
+#define MD_DISK_ROLE_FAULTY 0xfffe
+#define MD_DISK_ROLE_JOURNAL 0xfffd
+#define MD_DISK_ROLE_MAX 0xff00 /* max value of regular disk role */
+
+/*
+ * Per-device descriptor.  The five named words plus the reserved padding
+ * occupy exactly MD_SB_DESCRIPTOR_WORDS 32-bit words.
+ * NOTE(review): 'state' presumably holds the MD_DISK_* state bits -- confirm.
+ */
+typedef struct mdp_device_descriptor_s {
+ __u32 number; /* 0 Device number in the entire set */
+ __u32 major; /* 1 Device major number */
+ __u32 minor; /* 2 Device minor number */
+ __u32 raid_disk; /* 3 The role of the device in the raid set */
+ __u32 state; /* 4 Operational state */
+ __u32 reserved[MD_SB_DESCRIPTOR_WORDS - 5];
+} mdp_disk_t;
+
+#define MD_SB_MAGIC 0xa92b4efc
+
+/*
+ * Superblock state bits
+ */
+#define MD_SB_CLEAN 0
+#define MD_SB_ERRORS 1
+#define MD_SB_BBM_ERRORS 2
+#define MD_SB_BLOCK_CONTAINER_RESHAPE 3 /* block container wide reshapes */
+#define MD_SB_BLOCK_VOLUME 4 /* block activation of array, other arrays
+ * in container can be activated */
+#define MD_SB_CLUSTERED 5 /* MD is clustered */
+#define MD_SB_BITMAP_PRESENT 8 /* bitmap may be present nearby */
+
+typedef struct mdp_superblock_s {
+ /*
+ * Constant generic information
+ */
+ __u32 md_magic; /* 0 MD identifier */
+ __u32 major_version; /* 1 major version to which the set conforms */
+ __u32 minor_version; /* 2 minor version ... */
+ __u32 patch_version; /* 3 patchlevel version ... */
+ __u32 gvalid_words; /* 4 Number of used words in this section */
+ __u32 set_uuid0; /* 5 Raid set identifier */
+ __u32 ctime; /* 6 Creation time */
+ __u32 level; /* 7 Raid personality */
+ __u32 size; /* 8 Apparent size of each individual disk */
+ __u32 nr_disks; /* 9 total disks in the raid set */
+ __u32 raid_disks; /* 10 disks in a fully functional raid set */
+ __u32 md_minor; /* 11 preferred MD minor device number */
+ __u32 not_persistent; /* 12 does it have a persistent superblock */
+ __u32 set_uuid1; /* 13 Raid set identifier #2 */
+ __u32 set_uuid2; /* 14 Raid set identifier #3 */
+ __u32 set_uuid3; /* 15 Raid set identifier #4 */
+ __u32 gstate_creserved[MD_SB_GENERIC_CONSTANT_WORDS - 16];
+
+ /*
+ * Generic state information
+ */
+ __u32 utime; /* 0 Superblock update time */
+ __u32 state; /* 1 State bits (clean, ...) */
+ __u32 active_disks; /* 2 Number of currently active disks */
+ __u32 working_disks; /* 3 Number of working disks */
+ __u32 failed_disks; /* 4 Number of failed disks */
+ __u32 spare_disks; /* 5 Number of spare disks */
+ __u32 sb_csum; /* 6 checksum of the whole superblock */
+#if __BYTE_ORDER == __BIG_ENDIAN
+ __u32 events_hi; /* 7 high-order of superblock update count */
+ __u32 events_lo; /* 8 low-order of superblock update count */
+ __u32 cp_events_hi; /* 9 high-order of checkpoint update count */
+ __u32 cp_events_lo; /* 10 low-order of checkpoint update count */
+#else
+ __u32 events_lo; /* 7 low-order of superblock update count */
+ __u32 events_hi; /* 8 high-order of superblock update count */
+ __u32 cp_events_lo; /* 9 low-order of checkpoint update count */
+ __u32 cp_events_hi; /* 10 high-order of checkpoint update count */
+#endif
+ __u32 recovery_cp; /* 11 recovery checkpoint sector count */
+ /* These are only valid for minor_version > 90 */
+ __u64 reshape_position; /* 12,13 next address in array-space for reshape */
+ __u32 new_level; /* 14 new level we are reshaping to */
+ __u32 delta_disks; /* 15 change in number of raid_disks */
+ __u32 new_layout; /* 16 new layout */
+ __u32 new_chunk; /* 17 new chunk size (bytes) */
+ __u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 18];
+
+ /*
+ * Personality information
+ */
+ __u32 layout; /* 0 the array's physical layout */
+ __u32 chunk_size; /* 1 chunk size in bytes */
+ __u32 root_pv; /* 2 LV root PV */
+ __u32 root_block; /* 3 LV root block */
+ __u32 pstate_reserved[MD_SB_PERSONALITY_WORDS - 4];
+
+ /*
+ * Disks information
+ */
+ mdp_disk_t disks[MD_SB_DISKS];
+
+ /*
+ * Reserved
+ */
+ __u32 reserved[MD_SB_RESERVED_WORDS];
+
+ /*
+ * Active descriptor
+ */
+ mdp_disk_t this_disk;
+
+} mdp_super_t;
+
+#ifdef __TINYC__
+typedef unsigned long long __u64;
+#endif
+
+/*
+ * Return the 64-bit superblock update count of sb, assembled from its
+ * separately stored high (events_hi) and low (events_lo) 32-bit halves.
+ */
+static inline __u64 md_event(mdp_super_t *sb)
+{
+	const __u64 high = sb->events_hi;
+	const __u64 low = sb->events_lo;
+
+	return (high << 32) | low;
+}
+
+struct r5l_payload_header {
+ __u16 type;
+ __u16 flags;
+} __attribute__ ((__packed__));
+
+enum r5l_payload_type {
+ R5LOG_PAYLOAD_DATA = 0,
+ R5LOG_PAYLOAD_PARITY = 1,
+ R5LOG_PAYLOAD_FLUSH = 2,
+};
+
+struct r5l_payload_data_parity {
+ struct r5l_payload_header header;
+ __u32 size; /* sector. data/parity size. each 4k has a checksum */
+ __u64 location; /* sector. For data, it's raid sector. For
+ parity, it's stripe sector */
+ __u32 checksum[];
+} __attribute__ ((__packed__));
+
+enum r5l_payload_data_parity_flag {
+ R5LOG_PAYLOAD_FLAG_DISCARD = 1, /* payload is discard */
+ /*
+ * RESHAPED/RESHAPING is only set when there is reshape activity. Note,
+ * both data/parity of a stripe should have the same flag set
+ *
+ * RESHAPED: reshape is running, and this stripe finished reshape
+ * RESHAPING: reshape is running, and this stripe isn't reshaped
+ * */
+ R5LOG_PAYLOAD_FLAG_RESHAPED = 2,
+ R5LOG_PAYLOAD_FLAG_RESHAPING = 3,
+};
+
+struct r5l_payload_flush {
+ struct r5l_payload_header header;
+ __u32 size; /* flush_stripes size, bytes */
+ __u64 flush_stripes[];
+} __attribute__ ((__packed__));
+
+enum r5l_payload_flush_flag {
+ R5LOG_PAYLOAD_FLAG_FLUSH_STRIPE = 1, /* data represents whole stripe */
+};
+
+struct r5l_meta_block {
+ __u32 magic;
+ __u32 checksum;
+ __u8 version;
+ __u8 __zero_pading_1;
+ __u16 __zero_pading_2;
+ __u32 meta_size; /* whole size of the block */
+
+ __u64 seq;
+ __u64 position; /* sector, start from rdev->data_offset, current position */
+ struct r5l_payload_header payloads[];
+} __attribute__ ((__packed__));
+
+#define R5LOG_VERSION 0x1
+#define R5LOG_MAGIC 0x6433c509
+
+struct ppl_header_entry {
+ __u64 data_sector; /* raid sector of the new data */
+ __u32 pp_size; /* length of partial parity */
+ __u32 data_size; /* length of data */
+ __u32 parity_disk; /* member disk containing parity */
+ __u32 checksum; /* checksum of this entry's partial parity */
+} __attribute__ ((__packed__));
+
+#define PPL_HEADER_SIZE 4096
+#define PPL_HDR_RESERVED 512
+#define PPL_HDR_ENTRY_SPACE \
+ (PPL_HEADER_SIZE - PPL_HDR_RESERVED - 4 * sizeof(__u32) - sizeof(__u64))
+#define PPL_HDR_MAX_ENTRIES \
+ (PPL_HDR_ENTRY_SPACE / sizeof(struct ppl_header_entry))
+
+struct ppl_header {
+ __u8 reserved[PPL_HDR_RESERVED];/* reserved space, fill with 0xff */
+ __u32 signature; /* signature (family number of volume) */
+ __u32 padding; /* zero pad */
+ __u64 generation; /* generation number of the header */
+ __u32 entries_count; /* number of entries in entry array */
+ __u32 checksum; /* checksum of the header */
+ struct ppl_header_entry entries[PPL_HDR_MAX_ENTRIES];
+} __attribute__ ((__packed__));
+
+#endif
diff --git a/md_u.h b/md_u.h
new file mode 100644
index 0000000..b30893c
--- /dev/null
+++ b/md_u.h
@@ -0,0 +1,115 @@
+/*
+ md_u.h : user <=> kernel API between Linux raidtools and RAID drivers
+ Copyright (C) 1998 Ingo Molnar
+
+ This program is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ You should have received a copy of the GNU General Public License
+ (for example /usr/src/linux/COPYING); if not, write to the Free
+ Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*/
+
+#ifndef _MD_U_H
+#define _MD_U_H
+
+/* ioctls
+ *
+ * Every request number below is built with MD_MAJOR as the ioctl type.
+ * This header includes nothing itself, so MD_MAJOR and the
+ * _IO/_IOR/_IOW macros must be provided by whoever includes it.
+ * The mdu_*_t argument types are declared further down in this header.
+ */
+
+/* status */
+#define RAID_VERSION _IOR (MD_MAJOR, 0x10, mdu_version_t)
+#define GET_ARRAY_INFO _IOR (MD_MAJOR, 0x11, mdu_array_info_t)
+#define GET_DISK_INFO _IOR (MD_MAJOR, 0x12, mdu_disk_info_t)
+#define RAID_AUTORUN _IO (MD_MAJOR, 0x14)
+#define GET_BITMAP_FILE _IOR (MD_MAJOR, 0x15, mdu_bitmap_file_t)
+
+/* configuration */
+#define ADD_NEW_DISK _IOW (MD_MAJOR, 0x21, mdu_disk_info_t)
+#define HOT_REMOVE_DISK _IO (MD_MAJOR, 0x22)
+#define SET_ARRAY_INFO _IOW (MD_MAJOR, 0x23, mdu_array_info_t)
+#define SET_DISK_FAULTY _IO (MD_MAJOR, 0x29)
+#define SET_BITMAP_FILE _IOW (MD_MAJOR, 0x2b, int)
+
+/* usage */
+#define RUN_ARRAY _IOW (MD_MAJOR, 0x30, mdu_param_t)
+#define STOP_ARRAY _IO (MD_MAJOR, 0x32)
+#define STOP_ARRAY_RO _IO (MD_MAJOR, 0x33)
+#define RESTART_ARRAY_RW _IO (MD_MAJOR, 0x34)
+#define CLUSTERED_DISK_NACK _IO (MD_MAJOR, 0x35)
+
+typedef struct mdu_version_s {
+ int major;
+ int minor;
+ int patchlevel;
+} mdu_version_t; /* argument type of the RAID_VERSION ioctl */
+
+typedef struct mdu_array_info_s {
+ /*
+ * Generic constant information
+ */
+ int major_version;
+ int minor_version;
+ int patch_version;
+ unsigned int ctime;
+ int level;
+ int size;
+ int nr_disks;
+ int raid_disks;
+ int md_minor;
+ int not_persistent;
+
+ /*
+ * Generic state information
+ */
+ unsigned int utime; /* 0 Superblock update time */
+ int state; /* 1 State bits (clean, ...) */
+ int active_disks; /* 2 Number of currently active disks */
+ int working_disks; /* 3 Number of working disks */
+ int failed_disks; /* 4 Number of failed disks */
+ int spare_disks; /* 5 Number of spare disks */
+
+ /*
+ * Personality information
+ */
+ int layout; /* 0 the array's physical layout */
+ int chunk_size; /* 1 chunk size in bytes */
+
+} mdu_array_info_t; /* argument type of the GET_ARRAY_INFO and SET_ARRAY_INFO ioctls */
+
+typedef struct mdu_disk_info_s {
+ /*
+ * configuration/status of one particular disk
+ */
+ int number;
+ int major;
+ int minor;
+ int raid_disk;
+ int state;
+
+} mdu_disk_info_t; /* argument type of the GET_DISK_INFO and ADD_NEW_DISK ioctls */
+
+typedef struct mdu_start_info_s {
+ /*
+ * configuration/status of one particular disk
+ *
+ * NOTE(review): this description is identical to the one on
+ * mdu_disk_info_s, and no ioctl defined in this header takes this
+ * type - confirm whether it is still used.
+ */
+ int major;
+ int minor;
+ int raid_disk;
+ int state;
+
+} mdu_start_info_t;
+
+typedef struct mdu_bitmap_file_s
+{
+ char pathname[4096]; /* fixed-size buffer for the bitmap file path */
+} mdu_bitmap_file_t; /* argument type of the GET_BITMAP_FILE ioctl (SET_BITMAP_FILE takes an int instead) */
+
+typedef struct mdu_param_s
+{
+ int personality; /* 1,2,3,4 */
+ int chunk_size; /* in bytes */
+ int max_fault; /* unused for now */
+} mdu_param_t; /* argument type of the RUN_ARRAY ioctl */
+
+#endif
diff --git a/mdadm.8.in b/mdadm.8.in
new file mode 100644
index 0000000..be902db
--- /dev/null
+++ b/mdadm.8.in
@@ -0,0 +1,3452 @@
+.\" -*- nroff -*-
+.\" Copyright Neil Brown and others.
+.\" This program is free software; you can redistribute it and/or modify
+.\" it under the terms of the GNU General Public License as published by
+.\" the Free Software Foundation; either version 2 of the License, or
+.\" (at your option) any later version.
+.\" See file COPYING in distribution for details.
+.TH MDADM 8 "" v4.2
+.SH NAME
+mdadm \- manage MD devices
+.I aka
+Linux Software RAID
+
+.SH SYNOPSIS
+
+.BI mdadm " [mode] <raiddevice> [options] <component-devices>"
+
+.SH DESCRIPTION
+RAID devices are virtual devices created from two or more
+real block devices. This allows multiple devices (typically disk
+drives or partitions thereof) to be combined into a single device to
+hold (for example) a single filesystem.
+Some RAID levels include redundancy and so can survive some degree of
+device failure.
+
+Linux Software RAID devices are implemented through the md (Multiple
+Devices) device driver.
+
+Currently, Linux supports
+.B LINEAR
+md devices,
+.B RAID0
+(striping),
+.B RAID1
+(mirroring),
+.BR RAID4 ,
+.BR RAID5 ,
+.BR RAID6 ,
+.BR RAID10 ,
+.BR MULTIPATH ,
+.BR FAULTY ,
+and
+.BR CONTAINER .
+
+.B MULTIPATH
+is not a Software RAID mechanism, but does involve
+multiple devices:
+each device is a path to one common physical storage device.
+New installations should not use md/multipath as it is not well
+supported and has no ongoing development. Use the Device Mapper based
+multipath-tools instead.
+
+.B FAULTY
+is also not true RAID, and it only involves one device. It
+provides a layer over a true device that can be used to inject faults.
+
+.B CONTAINER
+is different again. A
+.B CONTAINER
+is a collection of devices that are
+managed as a set. This is similar to the set of devices connected to
+a hardware RAID controller. The set of devices may contain a number
+of different RAID arrays each utilising some (or all) of the blocks from a
+number of the devices in the set. For example, two devices in a 5-device set
+might form a RAID1 using the whole devices. The remaining three might
+have a RAID5 over the first half of each device, and a RAID0 over the
+second half.
+
+With a
+.BR CONTAINER ,
+there is one set of metadata that describes all of
+the arrays in the container. So when
+.I mdadm
+creates a
+.B CONTAINER
+device, the device just represents the metadata. Other normal arrays (RAID1
+etc) can be created inside the container.
+
+.SH MODES
+mdadm has several major modes of operation:
+.TP
+.B Assemble
+Assemble the components of a previously created
+array into an active array. Components can be explicitly given
+or can be searched for.
+.I mdadm
+checks that the components
+do form a bona fide array, and can, on request, fiddle superblock
+information so as to assemble a faulty array.
+
+.TP
+.B Build
+Build an array that doesn't have per-device metadata (superblocks). For these
+sorts of arrays,
+.I mdadm
+cannot differentiate between initial creation and subsequent assembly
+of an array. It also cannot perform any checks that appropriate
+components have been requested. Because of this, the
+.B Build
+mode should only be used together with a complete understanding of
+what you are doing.
+
+.TP
+.B Create
+Create a new array with per-device metadata (superblocks).
+Appropriate metadata is written to each device, and then the array
+comprising those devices is activated. A 'resync' process is started
+to make sure that the array is consistent (e.g. both sides of a mirror
+contain the same data) but the content of the device is left otherwise
+untouched.
+The array can be used as soon as it has been created. There is no
+need to wait for the initial resync to finish.
+
+.TP
+.B "Follow or Monitor"
+Monitor one or more md devices and act on any state changes. This is
+only meaningful for RAID1, 4, 5, 6, 10 or multipath arrays, as
+only these have interesting state. RAID0 or Linear never have
+missing, spare, or failed drives, so there is nothing to monitor.
+
+.TP
+.B "Grow"
+Grow (or shrink) an array, or otherwise reshape it in some way.
+Currently supported growth options include changing the active size
+of component devices and changing the number of active devices in
+Linear and RAID levels 0/1/4/5/6,
+changing the RAID level between 0, 1, 5, and 6, and between 0 and 10,
+changing the chunk size and layout for RAID 0,4,5,6,10 as well as adding or
+removing a write-intent bitmap and changing the array's consistency policy.
+
+.TP
+.B "Incremental Assembly"
+Add a single device to an appropriate array. If the addition of the
+device makes the array runnable, the array will be started.
+This provides a convenient interface to a
+.I hot-plug
+system. As each device is detected,
+.I mdadm
+has a chance to include it in some array as appropriate.
+Optionally, when the
+.I \-\-fail
+flag is passed in we will remove the device from any active array
+instead of adding it.
+
+If a
+.B CONTAINER
+is passed to
+.I mdadm
+in this mode, then any arrays within that container will be assembled
+and started.
+
+.TP
+.B Manage
+This is for doing things to specific components of an array such as
+adding new spares and removing faulty devices.
+
+.TP
+.B Misc
+This is an 'everything else' mode that supports operations on active
+arrays, operations on component devices such as erasing old superblocks, and
+information gathering operations.
+.\"This mode allows operations on independent devices such as examine MD
+.\"superblocks, erasing old superblocks and stopping active arrays.
+
+.TP
+.B Auto-detect
+This mode does not act on a specific device or array, but rather it
+requests the Linux Kernel to activate any auto-detected arrays.
+.SH OPTIONS
+
+.SH Options for selecting a mode are:
+
+.TP
+.BR \-A ", " \-\-assemble
+Assemble a pre-existing array.
+
+.TP
+.BR \-B ", " \-\-build
+Build a legacy array without superblocks.
+
+.TP
+.BR \-C ", " \-\-create
+Create a new array.
+
+.TP
+.BR \-F ", " \-\-follow ", " \-\-monitor
+Select
+.B Monitor
+mode.
+
+.TP
+.BR \-G ", " \-\-grow
+Change the size or shape of an active array.
+
+.TP
+.BR \-I ", " \-\-incremental
+Add/remove a single device to/from an appropriate array, and possibly start the array.
+
+.TP
+.B \-\-auto-detect
+Request that the kernel starts any auto-detected arrays. This can only
+work if
+.I md
+is compiled into the kernel \(em not if it is a module.
+Arrays can be auto-detected by the kernel if all the components are in
+primary MS-DOS partitions with partition type
+.BR FD ,
+and all use v0.90 metadata.
+In-kernel autodetect is not recommended for new installations. Using
+.I mdadm
+to detect and assemble arrays \(em possibly in an
+.I initrd
+\(em is substantially more flexible and should be preferred.
+
+.P
+If a device is given before any options, or if the first option is
+one of
+.BR \-\-add ,
+.BR \-\-re\-add ,
+.BR \-\-add\-spare ,
+.BR \-\-fail ,
+.BR \-\-remove ,
+or
+.BR \-\-replace ,
+then the MANAGE mode is assumed.
+Anything other than these will cause the
+.B Misc
+mode to be assumed.
+
+.SH Options that are not mode-specific are:
+
+.TP
+.BR \-h ", " \-\-help
+Display general help message or, after one of the above options, a
+mode-specific help message.
+
+.TP
+.B \-\-help\-options
+Display more detailed help about command line parsing and some commonly
+used options.
+
+.TP
+.BR \-V ", " \-\-version
+Print version information for mdadm.
+
+.TP
+.BR \-v ", " \-\-verbose
+Be more verbose about what is happening. This can be used twice to be
+extra-verbose.
+The extra verbosity currently only affects
+.B \-\-detail \-\-scan
+and
+.BR "\-\-examine \-\-scan" .
+
+.TP
+.BR \-q ", " \-\-quiet
+Avoid printing purely informative messages. With this,
+.I mdadm
+will be silent unless there is something really important to report.
+
+
+.TP
+.BR \-f ", " \-\-force
+Be more forceful about certain operations. See the various modes for
+the exact meaning of this option in different contexts.
+
+.TP
+.BR \-c ", " \-\-config=
+Specify the config file or directory. Default is to use
+.B /etc/mdadm.conf
+and
+.BR /etc/mdadm.conf.d ,
+or if those are missing then
+.B /etc/mdadm/mdadm.conf
+and
+.BR /etc/mdadm/mdadm.conf.d .
+If the config file given is
+.B "partitions"
+then nothing will be read, but
+.I mdadm
+will act as though the config file contained exactly
+.br
+.B " DEVICE partitions containers"
+.br
+and will read
+.B /proc/partitions
+to find a list of devices to scan, and
+.B /proc/mdstat
+to find a list of containers to examine.
+If the word
+.B "none"
+is given for the config file, then
+.I mdadm
+will act as though the config file were empty.
+
+If the name given is of a directory, then
+.I mdadm
+will collect all the files contained in the directory with a name ending
+in
+.BR .conf ,
+sort them lexically, and process all of those files as config files.
+
+.TP
+.BR \-s ", " \-\-scan
+Scan config file or
+.B /proc/mdstat
+for missing information.
+In general, this option gives
+.I mdadm
+permission to get any missing information (like component devices,
+array devices, array identities, and alert destination) from the
+configuration file (see previous option);
+one exception is MISC mode when using
+.B \-\-detail
+or
+.BR \-\-stop ,
+in which case
+.B \-\-scan
+says to get a list of array devices from
+.BR /proc/mdstat .
+
+.TP
+.BR \-e ", " \-\-metadata=
+Declare the style of RAID metadata (superblock) to be used. The
+default is {DEFAULT_METADATA} for
+.BR \-\-create ,
+and to guess for other operations.
+The default can be overridden by setting the
+.B metadata
+value for the
+.B CREATE
+keyword in
+.BR mdadm.conf .
+
+Options are:
+.RS
+.ie '{DEFAULT_METADATA}'0.90'
+.IP "0, 0.90, default"
+.el
+.IP "0, 0.90"
+Use the original 0.90 format superblock. This format limits arrays to
+28 component devices and limits component devices of levels 1 and
+greater to 2 terabytes. It is also possible for there to be confusion
+about whether the superblock applies to a whole device or just the
+last partition, if that partition starts on a 64K boundary.
+.ie '{DEFAULT_METADATA}'0.90'
+.IP "1, 1.0, 1.1, 1.2"
+.el
+.IP "1, 1.0, 1.1, 1.2 default"
+Use the new version-1 format superblock. This has fewer restrictions.
+It can easily be moved between hosts with different endian-ness, and a
+recovery operation can be checkpointed and restarted. The different
+sub-versions store the superblock at different locations on the
+device, either at the end (for 1.0), at the start (for 1.1) or 4K from
+the start (for 1.2). "1" is equivalent to "1.2" (the commonly
+preferred 1.x format).
+'if '{DEFAULT_METADATA}'1.2' "default" is equivalent to "1.2".
+.IP ddf
+Use the "Industry Standard" DDF (Disk Data Format) format defined by
+SNIA.
+When creating a DDF array a
+.B CONTAINER
+will be created, and normal arrays can be created in that container.
+.IP imsm
+Use the Intel(R) Matrix Storage Manager metadata format. This creates a
+.B CONTAINER
+which is managed in a similar manner to DDF, and is supported by an
+option-rom on some platforms:
+.IP
+.B https://www.intel.com/content/www/us/en/support/products/122484/memory-and-storage/ssd-software/intel-virtual-raid-on-cpu-intel-vroc.html
+.PP
+.RE
+
+.TP
+.B \-\-homehost=
+This will override any
+.B HOMEHOST
+setting in the config file and provides the identity of the host which
+should be considered the home for any arrays.
+
+When creating an array, the
+.B homehost
+will be recorded in the metadata. For version-1 superblocks, it will
+be prefixed to the array name. For version-0.90 superblocks, part of
+the SHA1 hash of the hostname will be stored in the latter half of the
+UUID.
+
+When reporting information about an array, any array which is tagged
+for the given homehost will be reported as such.
+
+When using Auto-Assemble, only arrays tagged for the given homehost
+will be allowed to use 'local' names (i.e. not ending in '_' followed
+by a digit string). See below under
+.BR "Auto Assembly" .
+
+The special name "\fBany\fP" can be used as a wild card. If an array
+is created with
+.B \-\-homehost=any
+then the name "\fBany\fP" will be stored in the array and it can be
+assembled in the same way on any host. If an array is assembled with
+this option, then the homehost recorded on the array will be ignored.
+
+.TP
+.B \-\-prefer=
+When
+.I mdadm
+needs to print the name for a device it normally finds the name in
+.B /dev
+which refers to the device and is shortest. When a path component is
+given with
+.B \-\-prefer
+.I mdadm
+will prefer a longer name if it contains that component. For example
+.B \-\-prefer=by-uuid
+will prefer a name in a subdirectory of
+.B /dev
+called
+.BR by-uuid .
+
+This functionality is currently only provided by
+.B \-\-detail
+and
+.BR \-\-monitor .
+
+.TP
+.B \-\-home\-cluster=
+specifies the cluster name for the md device. The md device can be assembled
+only on the cluster which matches the name specified. If this option is not
+provided, mdadm tries to detect the cluster name automatically.
+
+.SH For create, build, or grow:
+
+.TP
+.BR \-n ", " \-\-raid\-devices=
+Specify the number of active devices in the array. This, plus the
+number of spare devices (see below) must equal the number of
+.I component-devices
+(including "\fBmissing\fP" devices)
+that are listed on the command line for
+.BR \-\-create .
+Setting a value of 1 is probably
+a mistake and so requires that
+.B \-\-force
+be specified first. A value of 1 will then be allowed for linear,
+multipath, RAID0 and RAID1. It is never allowed for RAID4, RAID5 or RAID6.
+.br
+This number can only be changed using
+.B \-\-grow
+for RAID1, RAID4, RAID5 and RAID6 arrays, and only on kernels which provide
+the necessary support.
+
+.TP
+.BR \-x ", " \-\-spare\-devices=
+Specify the number of spare (eXtra) devices in the initial array.
+Spares can also be added
+and removed later. The number of component devices listed
+on the command line must equal the number of RAID devices plus the
+number of spare devices.
+
+.TP
+.BR \-z ", " \-\-size=
+Amount (in Kilobytes) of space to use from each drive in RAID levels 1/4/5/6.
+This must be a multiple of the chunk size, and must leave about 128Kb
+of space at the end of the drive for the RAID superblock.
+If this is not specified
+(as it normally is not) the smallest drive (or partition) sets the
+size, though if there is a variance among the drives of greater than 1%, a warning is
+issued.
+
+A suffix of 'K', 'M', 'G' or 'T' can be given to indicate Kilobytes,
+Megabytes, Gigabytes or Terabytes respectively.
+
+Sometimes a replacement drive can be a little smaller than the
+original drives though this should be minimised by IDEMA standards.
+Such a replacement drive will be rejected by
+.IR md .
+To guard against this it can be useful to set the initial size
+slightly smaller than the smaller device with the aim that it will
+still be larger than any replacement.
+
+This value can be set with
+.B \-\-grow
+for RAID level 1/4/5/6 though
+DDF arrays may not be able to support this.
+If the array was created with a size smaller than the currently
+active drives, the extra space can be accessed using
+.BR \-\-grow .
+The size can be given as
+.B max
+which means to choose the largest size that fits on all current drives.
+
+Before reducing the size of the array (with
+.BR "\-\-grow \-\-size=" )
+you should make sure that space isn't needed. If the device holds a
+filesystem, you would need to resize the filesystem to use less space.
+
+After reducing the array size you should check that the data stored in
+the device is still available. If the device holds a filesystem, then
+an 'fsck' of the filesystem is a minimum requirement. If there are
+problems the array can be made bigger again with no loss with another
+.B "\-\-grow \-\-size="
+command.
+
+This value cannot be used when creating a
+.B CONTAINER
+such as with DDF and IMSM metadata, though it is perfectly valid when
+creating an array inside a container.
+
+.TP
+.BR \-Z ", " \-\-array\-size=
+This is only meaningful with
+.B \-\-grow
+and its effect is not persistent: when the array is stopped and
+restarted the default array size will be restored.
+
+Setting the array-size causes the array to appear smaller to programs
+that access the data. This is particularly needed before reshaping an
+array so that it will be smaller. As the reshape is not reversible,
+but setting the size with
+.B \-\-array-size
+is, it is required that the array size is reduced as appropriate
+before the number of devices in the array is reduced.
+
+Before reducing the size of the array you should make sure that space
+isn't needed. If the device holds a filesystem, you would need to
+resize the filesystem to use less space.
+
+After reducing the array size you should check that the data stored in
+the device is still available. If the device holds a filesystem, then
+an 'fsck' of the filesystem is a minimum requirement. If there are
+problems the array can be made bigger again with no loss with another
+.B "\-\-grow \-\-array\-size="
+command.
+
+A suffix of 'K', 'M', 'G' or 'T' can be given to indicate Kilobytes,
+Megabytes, Gigabytes or Terabytes respectively.
+A value of
+.B max
+restores the apparent size of the array to be whatever the real
+amount of available space is.
+
+Clustered arrays do not support this parameter yet.
+
+.TP
+.BR \-c ", " \-\-chunk=
+Specify chunk size in kilobytes. The default when creating an
+array is 512KB. To ensure compatibility with earlier versions, the
+default when building an array with no persistent metadata is 64KB.
+This is only meaningful for RAID0, RAID4, RAID5, RAID6, and RAID10.
+
+RAID4, RAID5, RAID6, and RAID10 require the chunk size to be a power
+of 2. In any case it must be a multiple of 4KB.
+
+A suffix of 'K', 'M', 'G' or 'T' can be given to indicate Kilobytes,
+Megabytes, Gigabytes or Terabytes respectively.
+
+.TP
+.BR \-\-rounding=
+Specify rounding factor for a Linear array. The size of each
+component will be rounded down to a multiple of this size.
+This is a synonym for
+.B \-\-chunk
+but highlights the different meaning for Linear as compared to other
+RAID levels. The default is 64K if a kernel earlier than 2.6.16 is in
+use, and is 0K (i.e. no rounding) in later kernels.
+
+.TP
+.BR \-l ", " \-\-level=
+Set RAID level. When used with
+.BR \-\-create ,
+options are: linear, raid0, 0, stripe, raid1, 1, mirror, raid4, 4,
+raid5, 5, raid6, 6, raid10, 10, multipath, mp, faulty, container.
+Obviously some of these are synonymous.
+
+When a
+.B CONTAINER
+metadata type is requested, only the
+.B container
+level is permitted, and it does not need to be explicitly given.
+
+When used with
+.BR \-\-build ,
+only linear, stripe, raid0, 0, raid1, multipath, mp, and faulty are valid.
+
+Can be used with
+.B \-\-grow
+to change the RAID level in some cases. See LEVEL CHANGES below.
+
+.TP
+.BR \-p ", " \-\-layout=
+This option configures the fine details of data layout for RAID5, RAID6,
+and RAID10 arrays, and controls the failure modes for
+.IR faulty .
+It can also be used for working around a kernel bug with RAID0, but generally
+doesn't need to be used explicitly.
+
+The layout of the RAID5 parity block can be one of
+.BR left\-asymmetric ,
+.BR left\-symmetric ,
+.BR right\-asymmetric ,
+.BR right\-symmetric ,
+.BR la ", " ra ", " ls ", " rs .
+The default is
+.BR left\-symmetric .
+
+It is also possible to cause RAID5 to use a RAID4-like layout by
+choosing
+.BR parity\-first ,
+or
+.BR parity\-last .
+
+Finally for RAID5 there are DDF\-compatible layouts,
+.BR ddf\-zero\-restart ,
+.BR ddf\-N\-restart ,
+and
+.BR ddf\-N\-continue .
+
+These same layouts are available for RAID6. There are also 4 layouts
+that will provide an intermediate stage for converting between RAID5
+and RAID6. These provide a layout which is identical to the
+corresponding RAID5 layout on the first N\-1 devices, and has the 'Q'
+syndrome (the second 'parity' block used by RAID6) on the last device.
+These layouts are:
+.BR left\-symmetric\-6 ,
+.BR right\-symmetric\-6 ,
+.BR left\-asymmetric\-6 ,
+.BR right\-asymmetric\-6 ,
+and
+.BR parity\-first\-6 .
+
+When setting the failure mode for level
+.IR faulty ,
+the options are:
+.BR write\-transient ", " wt ,
+.BR read\-transient ", " rt ,
+.BR write\-persistent ", " wp ,
+.BR read\-persistent ", " rp ,
+.BR write\-all ,
+.BR read\-fixable ", " rf ,
+.BR clear ", " flush ", " none .
+
+Each failure mode can be followed by a number, which is used as a period
+between fault generation. Without a number, the fault is generated
+once on the first relevant request. With a number, the fault will be
+generated after that many requests, and will continue to be generated
+every time the period elapses.
+
+Multiple failure modes can be current simultaneously by using the
+.B \-\-grow
+option to set subsequent failure modes.
+
+"clear" or "none" will remove any pending or periodic failure modes,
+and "flush" will clear any persistent faults.
+
+The layout options for RAID10 are one of 'n', 'o' or 'f' followed
+by a small number. The default is 'n2'. The supported options are:
+
+.I 'n'
+signals 'near' copies. Multiple copies of one data block are at
+similar offsets in different devices.
+
+.I 'o'
+signals 'offset' copies. Rather than the chunks being duplicated
+within a stripe, whole stripes are duplicated but are rotated by one
+device so duplicate blocks are on different devices. Thus subsequent
+copies of a block are in the next drive, and are one chunk further
+down.
+
+.I 'f'
+signals 'far' copies
+(multiple copies have very different offsets).
+See md(4) for more detail about 'near', 'offset', and 'far'.
+
+The number is the number of copies of each datablock. 2 is normal, 3
+can be useful. This number can be at most equal to the number of
+devices in the array. It does not need to divide evenly into that
+number (e.g. it is perfectly legal to have an 'n2' layout for an array
+with an odd number of devices).
+
+A bug introduced in Linux 3.14 means that RAID0 arrays
+.B "with devices of differing sizes"
+started using a different layout. This could lead to
+data corruption. Since Linux 5.4 (and various stable releases that received
+backports), the kernel will not accept such an array unless
+a layout is explicitly set. It can be set to
+.RB ' original '
+or
+.RB ' alternate '.
+When creating a new array,
+.I mdadm
+will select
+.RB ' original '
+by default, so the layout does not normally need to be set.
+An array created for either
+.RB ' original '
+or
+.RB ' alternate '
+will not be recognized by an (unpatched) kernel prior to 5.4. To create
+a RAID0 array with devices of differing sizes that can be used on an
+older kernel, you can set the layout to
+.RB ' dangerous '.
+This will use whichever layout the running kernel supports, so the data
+on the array may become corrupt when changing kernel from pre-3.14 to a
+later kernel.
+
+When an array is converted between RAID5 and RAID6 an intermediate
+RAID6 layout is used in which the second parity block (Q) is always on
+the last device. To convert a RAID5 to RAID6 and leave it in this new
+layout (which does not require re-striping) use
+.BR \-\-layout=preserve .
+This will try to avoid any restriping.
+
+The converse of this is
+.B \-\-layout=normalise
+which will change a non-standard RAID6 layout into a more standard
+arrangement.
+
+.TP
+.BR \-\-parity=
+same as
+.B \-\-layout
+(thus explaining the p of
+.BR \-p ).
+
+.TP
+.BR \-b ", " \-\-bitmap=
+Specify a file to store a write-intent bitmap in. The file should not
+exist unless
+.B \-\-force
+is also given. The same file should be provided
+when assembling the array. If the word
+.B "internal"
+is given, then the bitmap is stored with the metadata on the array,
+and so is replicated on all devices. If the word
+.B "none"
+is given with
+.B \-\-grow
+mode, then any bitmap that is present is removed. If the word
+.B "clustered"
+is given, the array is created for a clustered environment. One bitmap
+is created for each node as defined by the
+.B \-\-nodes
+parameter, and these bitmaps are stored internally.
+
+To help catch typing errors, the filename must contain at least one
+slash ('/') if it is a real file (not 'internal' or 'none').
+
+Note: external bitmaps are only known to work on ext2 and ext3.
+Storing bitmap files on other filesystems may result in serious problems.
+
+When creating an array on devices which are 100G or larger,
+.I mdadm
+automatically adds an internal bitmap as it will usually be
+beneficial. This can be suppressed with
+.B "\-\-bitmap=none"
+or by selecting a different consistency policy with
+.BR \-\-consistency\-policy .
+
+.TP
+.BR \-\-bitmap\-chunk=
+Set the chunksize of the bitmap. Each bit corresponds to that many
+Kilobytes of storage.
+When using a file based bitmap, the default is to use the smallest
+size that is at least 4 and requires no more than 2^21 chunks.
+When using an
+.B internal
+bitmap, the chunksize defaults to 64Meg, or larger if necessary to
+fit the bitmap into the available space.
+
+A suffix of 'K', 'M', 'G' or 'T' can be given to indicate Kilobytes,
+Megabytes, Gigabytes or Terabytes respectively.
+
+.TP
+.BR \-W ", " \-\-write\-mostly
+subsequent devices listed in a
+.BR \-\-build ,
+.BR \-\-create ,
+or
+.B \-\-add
+command will be flagged as 'write\-mostly'. This is valid for RAID1
+only and means that the 'md' driver will avoid reading from these
+devices if at all possible. This can be useful if mirroring over a
+slow link.
+
+.TP
+.BR \-\-write\-behind=
+Specify that write-behind mode should be enabled (valid for RAID1
+only). If an argument is specified, it will set the maximum number
+of outstanding writes allowed. The default value is 256.
+A write-intent bitmap is required in order to use write-behind
+mode, and write-behind is only attempted on drives marked as
+.IR write-mostly .
+
+.TP
+.BR \-\-failfast
+subsequent devices listed in a
+.B \-\-create
+or
+.B \-\-add
+command will be flagged as 'failfast'. This is valid for RAID1 and
+RAID10 only. IO requests to these devices will be encouraged to fail
+quickly rather than cause long delays due to error handling. Also no
+attempt is made to repair a read error on these devices.
+
+If an array becomes degraded so that the 'failfast' device is the only
+usable device, the 'failfast' flag will then be ignored and extended
+delays will be preferred to complete failure.
+
+The 'failfast' flag is appropriate for storage arrays which have a
+low probability of true failure, but which may sometimes
+cause unacceptable delays due to internal maintenance functions.
+
+.TP
+.BR \-\-assume\-clean
+Tell
+.I mdadm
+that the array pre-existed and is known to be clean. It can be useful
+when trying to recover from a major failure as you can be sure that no
+data will be affected unless you actually write to the array. It can
+also be used when creating a RAID1 or RAID10 if you want to avoid the
+initial resync, however this practice \(em while normally safe \(em is not
+recommended. Use this only if you really know what you are doing.
+.IP
+When the devices that will be part of a new array were filled
+with zeros before creation the operator knows the array is
+actually clean. If that is the case, such as after running
+badblocks, this argument can be used to tell mdadm the
+facts the operator knows.
+.IP
+When an array is resized to a larger size with
+.B "\-\-grow \-\-size="
+the new space is normally resynced in the same way that the whole
+array is resynced at creation. From Linux version 3.0,
+.B \-\-assume\-clean
+can be used with that command to avoid the automatic resync.
+
+.TP
+.BR \-\-backup\-file=
+This is needed when
+.B \-\-grow
+is used to increase the number of raid-devices in a RAID5 or RAID6 if
+there are no spare devices available, or to shrink, change RAID level
+or layout. See the GROW MODE section below on RAID\-DEVICES CHANGES.
+The file must be stored on a separate device, not on the RAID array
+being reshaped.
+
+.TP
+.B \-\-data\-offset=
+Arrays with 1.x metadata can leave a gap between the start of the
+device and the start of array data. This gap can be used for various
+metadata. The start of data is known as the
+.IR data\-offset .
+Normally an appropriate data offset is computed automatically.
+However it can be useful to set it explicitly such as when re-creating
+an array which was originally created using a different version of
+.I mdadm
+which computed a different offset.
+
+Setting the offset explicitly over-rides the default. The value given
+is in Kilobytes unless a suffix of 'K', 'M', 'G' or 'T' is used to explicitly
+indicate Kilobytes, Megabytes, Gigabytes or Terabytes respectively.
+
+Since Linux 3.4,
+.B \-\-data\-offset
+can also be used with
+.B \-\-grow
+for some RAID levels (initially on RAID10). This allows the
+data\-offset to be changed as part of the reshape process. When the
+data offset is changed, no backup file is required as the difference
+in offsets is used to provide the same functionality.
+
+When the new offset is earlier than the old offset, the number of
+devices in the array cannot shrink. When it is after the old offset,
+the number of devices in the array cannot increase.
+
+When creating an array,
+.B \-\-data\-offset
+can be specified as
+.BR variable .
+In this case each member device is expected to have an offset appended
+to the name, separated by a colon. This makes it possible to recreate
+exactly an array which has varying data offsets (as can happen when
+different versions of
+.I mdadm
+are used to add different devices).
+
+.TP
+.BR \-\-continue
+This option is complementary to the
+.B \-\-freeze-reshape
+option for assembly. It is needed when
+.B \-\-grow
+operation is interrupted and it is not restarted automatically due to
+.B \-\-freeze-reshape
+usage during array assembly. This option is used together with
+.BR \-G " (" \-\-grow )
+command and device for a pending reshape to be continued.
+All parameters required for reshape continuation will be read from array metadata.
+If initial
+.BR \-\-grow
+command had required
+.BR \-\-backup\-file=
+option to be set, continuation option will require to have exactly the same
+backup file given as well.
+.IP
+Any other parameter passed together with
+.BR \-\-continue
+option will be ignored.
+
+.TP
+.BR \-N ", " \-\-name=
+Set a
+.B name
+for the array. This is currently only effective when creating an
+array with a version-1 superblock, or an array in a DDF container.
+The name is a simple textual string that can be used to identify array
+components when assembling. If name is needed but not specified, it
+is taken from the basename of the device that is being created.
+e.g. when creating
+.I /dev/md/home
+the
+.B name
+will default to
+.IR home .
+
+.TP
+.BR \-R ", " \-\-run
+Insist that
+.I mdadm
+run the array, even if some of the components
+appear to be active in another array or filesystem. Normally
+.I mdadm
+will ask for confirmation before including such components in an
+array. This option causes that question to be suppressed.
+
+.TP
+.BR \-f ", " \-\-force
+Insist that
+.I mdadm
+accept the geometry and layout specified without question. Normally
+.I mdadm
+will not allow creation of an array with only one device, and will try
+to create a RAID5 array with one missing drive (as this makes the
+initial resync work faster). With
+.BR \-\-force ,
+.I mdadm
+will not try to be so clever.
+
+.TP
+.BR \-o ", " \-\-readonly
+Start the array
+.B read only
+rather than read-write as normal. No writes will be allowed to the
+array, and no resync, recovery, or reshape will be started. It works with
+Create, Assemble, Manage and Misc mode.
+
+.TP
+.BR \-a ", " "\-\-auto{=yes,md,mdp,part,p}{NN}"
+Instruct mdadm how to create the device file if needed, possibly allocating
+an unused minor number. "md" causes a non-partitionable array
+to be used (though since Linux 2.6.28, these array devices are in fact
+partitionable). "mdp", "part" or "p" causes a partitionable array (2.6 and
+later) to be used. "yes" requires the named md device to have
+a 'standard' format, and the type and minor number will be determined
+from this. With mdadm 3.0, device creation is normally left up to
+.I udev
+so this option is unlikely to be needed.
+See DEVICE NAMES below.
+
+The argument can also come immediately after
+"\-a". e.g. "\-ap".
+
+If
+.B \-\-auto
+is not given on the command line or in the config file, then
+the default will be
+.BR \-\-auto=yes .
+
+If
+.B \-\-scan
+is also given, then any
+.I auto=
+entries in the config file will override the
+.B \-\-auto
+instruction given on the command line.
+
+For partitionable arrays,
+.I mdadm
+will create the device file for the whole array and for the first 4
+partitions. A different number of partitions can be specified at the
+end of this option (e.g.
+.BR \-\-auto=p7 ).
+If the device name ends with a digit, the partition names add a 'p',
+and a number, e.g.
+.IR /dev/md/home1p3 .
+If there is no trailing digit, then the partition names just have a
+number added, e.g.
+.IR /dev/md/scratch3 .
+
+If the md device name is in a 'standard' format as described in DEVICE
+NAMES, then it will be created, if necessary, with the appropriate
+device number based on that name. If the device name is not in one of these
+formats, then an unused device number will be allocated. The device
+number will be considered unused if there is no active array for that
+number, and there is no entry in /dev for that number and with a
+non-standard name. Names that are not in 'standard' format are only
+allowed in "/dev/md/".
+
+This is meaningful with
+.B \-\-create
+or
+.BR \-\-build .
+
+.TP
+.BR \-a ", " "\-\-add"
+This option can be used in Grow mode in two cases.
+
+If the target array is a Linear array, then
+.B \-\-add
+can be used to add one or more devices to the array. They
+are simply catenated on to the end of the array. Once added, the
+devices cannot be removed.
+
+If the
+.B \-\-raid\-disks
+option is being used to increase the number of devices in an array,
+then
+.B \-\-add
+can be used to add some extra devices to be included in the array.
+In most cases this is not needed as the extra devices can be added as
+spares first, and then the number of raid-disks can be changed.
+However for RAID0, it is not possible to add spares. So to increase
+the number of devices in a RAID0, it is necessary to set the new
+number of devices, and to add the new devices, in the same command.
+
+.TP
+.BR \-\-nodes
+Only works when the array is for a clustered environment. It specifies
+the maximum number of nodes in the cluster that will use this device
+simultaneously. If not specified, this defaults to 4.
+
+.TP
+.BR \-\-write-journal
+Specify journal device for the RAID-4/5/6 array. The journal device
+should be an SSD with reasonable lifetime.
+
+.TP
+.BR \-\-symlinks
+Auto creation of symlinks in /dev to /dev/md, option \-\-symlinks must
+be 'no' or 'yes' and works with \-\-create and \-\-build.
+
+.TP
+.BR \-k ", " \-\-consistency\-policy=
+Specify how the array maintains consistency in case of unexpected shutdown.
+Only relevant for RAID levels with redundancy.
+Currently supported options are:
+.RS
+
+.TP
+.B resync
+Full resync is performed and all redundancy is regenerated when the array is
+started after unclean shutdown.
+
+.TP
+.B bitmap
+Resync assisted by a write-intent bitmap. Implicitly selected when using
+.BR \-\-bitmap .
+
+.TP
+.B journal
+For RAID levels 4/5/6, journal device is used to log transactions and replay
+after unclean shutdown. Implicitly selected when using
+.BR \-\-write\-journal .
+
+.TP
+.B ppl
+For RAID5 only, Partial Parity Log is used to close the write hole and
+eliminate resync. PPL is stored in the metadata region of RAID member drives,
+no additional journal drive is needed.
+
+.PP
+Can be used with \-\-grow to change the consistency policy of an active array
+in some cases. See CONSISTENCY POLICY CHANGES below.
+.RE
+
+
+.SH For assemble:
+
+.TP
+.BR \-u ", " \-\-uuid=
+uuid of array to assemble. Devices which don't have this uuid are
+excluded.
+
+.TP
+.BR \-m ", " \-\-super\-minor=
+Minor number of device that array was created for. Devices which
+don't have this minor number are excluded. If you create an array as
+/dev/md1, then all superblocks will contain the minor number 1, even if
+the array is later assembled as /dev/md2.
+
+Giving the literal word "dev" for
+.B \-\-super\-minor
+will cause
+.I mdadm
+to use the minor number of the md device that is being assembled.
+e.g. when assembling
+.BR /dev/md0 ,
+.B \-\-super\-minor=dev
+will look for super blocks with a minor number of 0.
+
+.B \-\-super\-minor
+is only relevant for v0.90 metadata, and should not normally be used.
+Using
+.B \-\-uuid
+is much safer.
+
+.TP
+.BR \-N ", " \-\-name=
+Specify the name of the array to assemble. This must be the name
+that was specified when creating the array. It must either match
+the name stored in the superblock exactly, or it must match
+with the current
+.I homehost
+prefixed to the start of the given name.
+
+.TP
+.BR \-f ", " \-\-force
+Assemble the array even if the metadata on some devices appears to be
+out-of-date. If
+.I mdadm
+cannot find enough working devices to start the array, but can find
+some devices that are recorded as having failed, then it will mark
+those devices as working so that the array can be started. This works only for
+native metadata. For external metadata it allows starting dirty degraded RAID 4, 5, 6.
+An array which requires
+.B \-\-force
+to be started may contain data corruption. Use it carefully.
+
+.TP
+.BR \-R ", " \-\-run
+Attempt to start the array even if fewer drives were given than were
+present last time the array was active. Normally if not all the
+expected drives are found and
+.B \-\-scan
+is not used, then the array will be assembled but not started.
+With
+.B \-\-run
+an attempt will be made to start it anyway.
+
+.TP
+.B \-\-no\-degraded
+This is the reverse of
+.B \-\-run
+in that it inhibits the startup of array unless all expected drives
+are present. This is only needed with
+.BR \-\-scan ,
+and can be used if the physical connections to devices are
+not as reliable as you would like.
+
+.TP
+.BR \-a ", " "\-\-auto{=no,yes,md,mdp,part}"
+See this option under Create and Build options.
+
+.TP
+.BR \-b ", " \-\-bitmap=
+Specify the bitmap file that was given when the array was created. If
+an array has an
+.B internal
+bitmap, there is no need to specify this when assembling the array.
+
+.TP
+.BR \-\-backup\-file=
+If
+.B \-\-backup\-file
+was used while reshaping an array (e.g. changing number of devices or
+chunk size) and the system crashed during the critical section, then the same
+.B \-\-backup\-file
+must be presented to
+.B \-\-assemble
+to allow possibly corrupted data to be restored, and the reshape
+to be completed.
+
+.TP
+.BR \-\-invalid\-backup
+If the file needed for the above option is not available for any
+reason an empty file can be given together with this option to
+indicate that the backup file is invalid. In this case the data that
+was being rearranged at the time of the crash could be irrecoverably
+lost, but the rest of the array may still be recoverable. This option
+should only be used as a last resort if there is no way to recover the
+backup file.
+
+
+.TP
+.BR \-U ", " \-\-update=
+Update the superblock on each device while assembling the array. The
+argument given to this flag can be one of
+.BR sparc2.2 ,
+.BR summaries ,
+.BR uuid ,
+.BR name ,
+.BR nodes ,
+.BR homehost ,
+.BR home-cluster ,
+.BR resync ,
+.BR byteorder ,
+.BR devicesize ,
+.BR no\-bitmap ,
+.BR bbl ,
+.BR no\-bbl ,
+.BR ppl ,
+.BR no\-ppl ,
+.BR layout\-original ,
+.BR layout\-alternate ,
+.BR layout\-unspecified ,
+.BR metadata ,
+or
+.BR super\-minor .
+
+The
+.B sparc2.2
+option will adjust the superblock of an array that was created on a Sparc
+machine running a patched 2.2 Linux kernel. This kernel got the
+alignment of part of the superblock wrong. You can use the
+.B "\-\-examine \-\-sparc2.2"
+option to
+.I mdadm
+to see what effect this would have.
+
+The
+.B super\-minor
+option will update the
+.B "preferred minor"
+field on each superblock to match the minor number of the array being
+assembled.
+This can be useful if
+.B \-\-examine
+reports a different "Preferred Minor" to
+.BR \-\-detail .
+In some cases this update will be performed automatically
+by the kernel driver. In particular the update happens automatically
+at the first write to an array with redundancy (RAID level 1 or
+greater) on a 2.6 (or later) kernel.
+
+The
+.B uuid
+option will change the uuid of the array. If a UUID is given with the
+.B \-\-uuid
+option that UUID will be used as a new UUID and will
+.B NOT
+be used to help identify the devices in the array.
+If no
+.B \-\-uuid
+is given, a random UUID is chosen.
+
+The
+.B name
+option will change the
+.I name
+of the array as stored in the superblock. This is only supported for
+version-1 superblocks.
+
+The
+.B nodes
+option will change the
+.I nodes
+of the array as stored in the bitmap superblock. This option only
+works for a clustered environment.
+
+The
+.B homehost
+option will change the
+.I homehost
+as recorded in the superblock. For version-0 superblocks, this is the
+same as updating the UUID.
+For version-1 superblocks, this involves updating the name.
+
+The
+.B home\-cluster
+option will change the cluster name as recorded in the superblock and
+bitmap. This option only works for a clustered environment.
+
+The
+.B resync
+option will cause the array to be marked
+.I dirty
+meaning that any redundancy in the array (e.g. parity for RAID5,
+copies for RAID1) may be incorrect. This will cause the RAID system
+to perform a "resync" pass to make sure that all redundant information
+is correct.
+
+The
+.B byteorder
+option allows arrays to be moved between machines with different
+byte-order, such as from a big-endian machine like a Sparc or some
+MIPS machines, to a little-endian x86_64 machine.
+When assembling such an array for the first time after a move, giving
+.B "\-\-update=byteorder"
+will cause
+.I mdadm
+to expect superblocks to have their byteorder reversed, and will
+correct that order before assembling the array. This is only valid
+with original (Version 0.90) superblocks.
+
+The
+.B summaries
+option will correct the summaries in the superblock. That is the
+counts of total, working, active, failed, and spare devices.
+
+The
+.B devicesize
+option will rarely be of use. It applies to version 1.1 and 1.2 metadata
+only (where the metadata is at the start of the device) and is only
+useful when the component device has changed size (typically become
+larger). The version 1 metadata records the amount of the device that
+can be used to store data, so if a device in a version 1.1 or 1.2
+array becomes larger, the metadata will still be visible, but the
+extra space will not. In this case it might be useful to assemble the
+array with
+.BR \-\-update=devicesize .
+This will cause
+.I mdadm
+to determine the maximum usable amount of space on each device and
+update the relevant field in the metadata.
+
+The
+.B metadata
+option only works on v0.90 metadata arrays and will convert them to
+v1.0 metadata. The array must not be dirty (i.e. it must not need a
+sync) and it must not have a write-intent bitmap.
+
+The old metadata will remain on the devices, but will appear older
+than the new metadata and so will usually be ignored. The old metadata
+(or indeed the new metadata) can be removed by giving the appropriate
+.B \-\-metadata=
+option to
+.BR \-\-zero\-superblock .
+
+The
+.B no\-bitmap
+option can be used when an array has an internal bitmap which is
+corrupt in some way so that assembling the array normally fails. It
+will cause any internal bitmap to be ignored.
+
+The
+.B bbl
+option will reserve space in each device for a bad block list. This
+will be 4K in size and positioned near the end of any free space
+between the superblock and the data.
+
+The
+.B no\-bbl
+option will cause any reservation of space for a bad block list to be
+removed. If the bad block list contains entries, this will fail, as
+removing the list could cause data corruption.
+
+The
+.B ppl
+option will enable PPL for a RAID5 array and reserve space for PPL on each
+device. There must be enough free space between the data and superblock and a
+write-intent bitmap or journal must not be used.
+
+The
+.B no\-ppl
+option will disable PPL in the superblock.
+
+The
+.B layout\-original
+and
+.B layout\-alternate
+options are for RAID0 arrays with non-uniform device sizes that were in
+use before Linux 5.4. If the array was being used with Linux 3.13 or
+earlier, then to assemble the array on a new kernel,
+.B \-\-update=layout\-original
+must be given. If the array was created and used with a kernel from Linux 3.14 to
+Linux 5.3, then
+.B \-\-update=layout\-alternate
+must be given. This only needs to be given once. Subsequent assembly of the array
+will happen normally.
+For more information, see
+.IR md (4).
+
+The
+.B layout\-unspecified
+option reverts the effect of
+.B layout\-original
+or
+.B layout\-alternate
+and allows the array to be again used on a kernel prior to Linux 5.3.
+This option should be used with great caution.
+
+.TP
+.BR \-\-freeze\-reshape
+Option is intended to be used in start-up scripts during initrd boot phase.
+When array under reshape is assembled during initrd phase, this option
+stops reshape after reshape critical section has been restored. This happens
+before file system pivot operation and avoids loss of file system context.
+Losing file system context would cause reshape to be broken.
+
+Reshape can be continued later using the
+.B \-\-continue
+option for the grow command.
+
+.TP
+.BR \-\-symlinks
+See this option under Create and Build options.
+
+.SH For Manage mode:
+
+.TP
+.BR \-t ", " \-\-test
+Unless a more serious error occurred,
+.I mdadm
+will exit with a status of 2 if no changes were made to the array and
+0 if at least one change was made.
+This can be useful when an indirect specifier such as
+.BR missing ,
+.B detached
+or
+.B faulty
+is used in requesting an operation on the array.
+.B \-\-test
+will report failure if these specifiers didn't find any match.
+
+.TP
+.BR \-a ", " \-\-add
+hot-add listed devices.
+If a device appears to have recently been part of the array
+(possibly it failed or was removed) the device is re\-added as described
+in the next point.
+If that fails or the device was never part of the array, the device is
+added as a hot-spare.
+If the array is degraded, it will immediately start to rebuild data
+onto that spare.
+
+Note that this and the following options are only meaningful on arrays
+with redundancy. They don't apply to RAID0 or Linear.
+
+.TP
+.BR \-\-re\-add
+re\-add a device that was previously removed from an array.
+If the metadata on the device reports that it is a member of the
+array, and the slot that it used is still vacant, then the device will
+be added back to the array in the same position. This will normally
+cause the data for that device to be recovered. However based on the
+event count on the device, the recovery may only require sections that
+are flagged in a write-intent bitmap to be recovered or may not require
+any recovery at all.
+
+When used on an array that has no metadata (i.e. it was built with
+.BR \-\-build )
+it will be assumed that bitmap-based recovery is enough to make the
+device fully consistent with the array.
+
+When used with v1.x metadata,
+.B \-\-re\-add
+can be accompanied by
+.BR \-\-update=devicesize ,
+.BR \-\-update=bbl ", or"
+.BR \-\-update=no\-bbl .
+See the description of these options when used in Assemble mode for an
+explanation of their use.
+
+If the device name given is
+.B missing
+then
+.I mdadm
+will try to find any device that looks like it should be
+part of the array but isn't and will try to re\-add all such devices.
+
+If the device name given is
+.B faulty
+then
+.I mdadm
+will find all devices in the array that are marked
+.BR faulty ,
+remove them and attempt to immediately re\-add them. This can be
+useful if you are certain that the reason for failure has been
+resolved.
+
+.TP
+.B \-\-add\-spare
+Add a device as a spare. This is similar to
+.B \-\-add
+except that it does not attempt
+.B \-\-re\-add
+first. The device will be added as a spare even if it looks like it
+could be a recent member of the array.
+
+.TP
+.BR \-r ", " \-\-remove
+remove listed devices. They must not be active. i.e. they should
+be failed or spare devices.
+
+As well as the name of a device file
+(e.g.
+.BR /dev/sda1 )
+the words
+.BR failed ,
+.B detached
+and names like
+.B set-A
+can be given to
+.BR \-\-remove .
+The first causes all failed devices to be removed. The second causes
+any device which is no longer connected to the system (i.e. an 'open'
+returns
+.BR ENXIO )
+to be removed.
+The third will remove a set as described below under
+.BR \-\-fail .
+
+.TP
+.BR \-f ", " \-\-fail
+Mark listed devices as faulty.
+As well as the name of a device file, the word
+.B detached
+or a set name like
+.B set\-A
+can be given. The former will cause any device that has been detached from
+the system to be marked as failed. It can then be removed.
+
+For RAID10 arrays where the number of copies evenly divides the number
+of devices, the devices can be conceptually divided into sets where
+each set contains a single complete copy of the data on the array.
+Sometimes a RAID10 array will be configured so that these sets are on
+separate controllers. In this case all the devices in one set can be
+failed by giving a name like
+.B set\-A
+or
+.B set\-B
+to
+.BR \-\-fail .
+The appropriate set names are reported by
+.BR \-\-detail .
+
+.TP
+.BR \-\-set\-faulty
+same as
+.BR \-\-fail .
+
+.TP
+.B \-\-replace
+Mark listed devices as requiring replacement. As soon as a spare is
+available, it will be rebuilt and will replace the marked device.
+This is similar to marking a device as faulty, but the device remains
+in service during the recovery process to increase resilience against
+multiple failures. When the replacement process finishes, the
+replaced device will be marked as faulty.
+
+.TP
+.B \-\-with
+This can follow a list of
+.B \-\-replace
+devices. The devices listed after
+.B \-\-with
+will be preferentially used to replace the devices listed after
+.BR \-\-replace .
+These devices must already be spare devices in the array.
+
+.TP
+.BR \-\-write\-mostly
+Subsequent devices that are added or re\-added will have the 'write-mostly'
+flag set. This is only valid for RAID1 and means that the 'md' driver
+will avoid reading from these devices if possible.
+.TP
+.BR \-\-readwrite
+Subsequent devices that are added or re\-added will have the 'write-mostly'
+flag cleared.
+.TP
+.BR \-\-cluster\-confirm
+Confirm the existence of the device. This is issued in response to an \-\-add
+request by a node in a cluster. When a node adds a device it sends a message
+to all nodes in the cluster to look for a device with a UUID. This translates
+to a udev notification with the UUID of the device to be added and the slot
+number. The receiving node must acknowledge this message
+with \-\-cluster\-confirm. Valid arguments are <slot>:<devicename> in case
+the device is found or <slot>:missing in case the device is not found.
+
+.TP
+.BR \-\-add-journal
+Add journal to an existing array, or recreate journal for RAID-4/5/6 array
+that lost a journal device. To avoid interrupting on-going write operations,
+.B \-\-add-journal
+only works for an array in Read-Only state.
+
+.TP
+.BR \-\-failfast
+Subsequent devices that are added or re\-added will have
+the 'failfast' flag set. This is only valid for RAID1 and RAID10 and
+means that the 'md' driver will avoid long timeouts on error handling
+where possible.
+.TP
+.BR \-\-nofailfast
+Subsequent devices that are re\-added will be re\-added without
+the 'failfast' flag set.
+
+.P
+Each of these options requires that the first device listed is the array
+to be acted upon, and the remainder are component devices to be added,
+removed, marked as faulty, etc. Several different operations can be
+specified for different devices, e.g.
+.in +5
+mdadm /dev/md0 \-\-add /dev/sda1 \-\-fail /dev/sdb1 \-\-remove /dev/sdb1
+.in -5
+Each operation applies to all devices listed until the next
+operation.
+
+If an array is using a write-intent bitmap, then devices which have
+been removed can be re\-added in a way that avoids a full
+reconstruction but instead just updates the blocks that have changed
+since the device was removed. For arrays with persistent metadata
+(superblocks) this is done automatically. For arrays created with
+.B \-\-build
+mdadm needs to be told that this device was removed recently with
+.BR \-\-re\-add .
+
+Devices can only be removed from an array if they are not in active
+use, i.e. they must be spares or failed devices. To remove an active
+device, it must first be marked as
+.BR faulty .
+
+.SH For Misc mode:
+
+.TP
+.BR \-Q ", " \-\-query
+Examine a device to see
+(1) if it is an md device and (2) if it is a component of an md
+array.
+Information about what is discovered is presented.
+
+.TP
+.BR \-D ", " \-\-detail
+Print details of one or more md devices.
+
+.TP
+.BR \-\-detail\-platform
+Print details of the platform's RAID capabilities (firmware / hardware
+topology) for a given metadata format. If used without argument, mdadm
+will scan all controllers looking for their capabilities. Otherwise, mdadm
+will only look at the controller specified by the argument in form of an
+absolute filepath or a link, e.g.
+.IR /sys/devices/pci0000:00/0000:00:1f.2 .
+
+.TP
+.BR \-Y ", " \-\-export
+When used with
+.BR \-\-detail ,
+.BR \-\-detail-platform ,
+.BR \-\-examine ,
+or
+.B \-\-incremental
+output will be formatted as
+.B key=value
+pairs for easy import into the environment.
+
+With
+.B \-\-incremental
+the value
+.B MD_STARTED
+indicates whether an array was started
+.RB ( yes )
+or not, which may include a reason
+.RB ( unsafe ", " nothing ", " no ).
+Also the value
+.B MD_FOREIGN
+indicates if the array is expected on this host
+.RB ( no ),
+or seems to be from elsewhere
+.RB ( yes ).
+
+.TP
+.BR \-E ", " \-\-examine
+Print contents of the metadata stored on the named device(s).
+Note the contrast between
+.B \-\-examine
+and
+.BR \-\-detail .
+.B \-\-examine
+applies to devices which are components of an array, while
+.B \-\-detail
+applies to a whole array which is currently active.
+.TP
+.B \-\-sparc2.2
+If an array was created on a SPARC machine with a 2.2 Linux kernel
+patched with RAID support, the superblock will have been created
+incorrectly, or at least incompatibly with 2.4 and later kernels.
+Using the
+.B \-\-sparc2.2
+flag with
+.B \-\-examine
+will fix the superblock before displaying it. If this appears to do
+the right thing, then the array can be successfully assembled using
+.BR "\-\-assemble \-\-update=sparc2.2" .
+
+.TP
+.BR \-X ", " \-\-examine\-bitmap
+Report information about a bitmap file.
+The argument is either an external bitmap file or an array component
+in case of an internal bitmap. Note that running this on an array
+device (e.g.
+.BR /dev/md0 )
+does not report the bitmap for that array.
+
+.TP
+.B \-\-examine\-badblocks
+List the bad-blocks recorded for the device, if a bad-blocks list has
+been configured. Currently only
+.B 1.x
+and
+.B IMSM
+metadata support bad-blocks lists.
+
+.TP
+.BI \-\-dump= directory
+.TP
+.BI \-\-restore= directory
+Save metadata from listed devices, or restore metadata to listed devices.
+
+.TP
+.BR \-R ", " \-\-run
+start a partially assembled array. If
+.B \-\-assemble
+did not find enough devices to fully start the array, it might leave
+it partially assembled. If you wish, you can then use
+.B \-\-run
+to start the array in degraded mode.
+
+.TP
+.BR \-S ", " \-\-stop
+deactivate array, releasing all resources.
+
+.TP
+.BR \-o ", " \-\-readonly
+mark array as readonly.
+
+.TP
+.BR \-w ", " \-\-readwrite
+mark array as readwrite.
+
+.TP
+.B \-\-zero\-superblock
+If the device contains a valid md superblock, the block is
+overwritten with zeros. With
+.B \-\-force
+the block where the superblock would be is overwritten even if it
+doesn't appear to be valid.
+
+.B Note:
+Be careful when calling \-\-zero\-superblock with clustered raid. Make sure the
+array isn't used or assembled on another cluster node before executing it.
+
+.TP
+.B \-\-kill\-subarray=
+If the device is a container and the argument to \-\-kill\-subarray
+specifies an inactive subarray in the container, then the subarray is
+deleted. Deleting all subarrays will leave an 'empty-container' or
+spare superblock on the drives. See
+.B \-\-zero\-superblock
+for completely
+removing a superblock. Note that some formats depend on the subarray
+index for generating a UUID, this command will fail if it would change
+the UUID of an active subarray.
+
+.TP
+.B \-\-update\-subarray=
+If the device is a container and the argument to \-\-update\-subarray
+specifies a subarray in the container, then attempt to update the given
+superblock field in the subarray. See below in
+.B MISC MODE
+for details.
+
+.TP
+.BR \-t ", " \-\-test
+When used with
+.BR \-\-detail ,
+the exit status of
+.I mdadm
+is set to reflect the status of the device. See below in
+.B MISC MODE
+for details.
+
+.TP
+.BR \-W ", " \-\-wait
+For each md device given, wait for any resync, recovery, or reshape
+activity to finish before returning.
+.I mdadm
+will return with success if it actually waited for every device
+listed, otherwise it will return failure.
+
+.TP
+.BR \-\-wait\-clean
+For each md device given, or each device in /proc/mdstat if
+.B \-\-scan
+is given, arrange for the array to be marked clean as soon as possible.
+.I mdadm
+will return with success if the array uses external metadata and we
+successfully waited. For native arrays this returns immediately as the
+kernel handles dirty-clean transitions at shutdown. No action is taken
+if safe-mode handling is disabled.
+
+.TP
+.B \-\-action=
+Set the "sync_action" for all md devices given to one of
+.BR idle ,
+.BR frozen ,
+.BR check ,
+.BR repair .
+Setting to
+.B idle
+will abort any currently running action though some actions will
+automatically restart.
+Setting to
+.B frozen
+will abort any current action and ensure no other action starts
+automatically.
+
+Details of
+.B check
+and
+.B repair
+can be found in
+.IR md (4)
+under
+.BR "SCRUBBING AND MISMATCHES" .
+
+.SH For Incremental Assembly mode:
+.TP
+.BR \-\-rebuild\-map ", " \-r
+Rebuild the map file
+.RB ( {MAP_PATH} )
+that
+.I mdadm
+uses to help track which arrays are currently being assembled.
+
+.TP
+.BR \-\-run ", " \-R
+Run any array assembled as soon as a minimal number of devices are
+available, rather than waiting until all expected devices are present.
+
+.TP
+.BR \-\-scan ", " \-s
+Only meaningful with
+.B \-R
+this will scan the
+.B map
+file for arrays that are being incrementally assembled and will try to
+start any that are not already started. If any such array is listed
+in
+.B mdadm.conf
+as requiring an external bitmap, that bitmap will be attached first.
+
+.TP
+.BR \-\-fail ", " \-f
+This allows the hot-plug system to remove devices that have fully disappeared
+from the kernel. It will first fail and then remove the device from any
+array it belongs to.
+The device name given should be a kernel device name such as "sda",
+not a name in
+.IR /dev .
+
+.TP
+.BR \-\-path=
+Only used with \-\-fail. The 'path' given will be recorded so that if
+a new device appears at the same location it can be automatically
+added to the same array. This allows the failed device to be
+automatically replaced by a new device without metadata if it appears
+at specified path. This option is normally only set by a
+.I udev
+script.
+
+.SH For Monitor mode:
+.TP
+.BR \-m ", " \-\-mail
+Give a mail address to send alerts to.
+
+.TP
+.BR \-p ", " \-\-program ", " \-\-alert
+Give a program to be run whenever an event is detected.
+
+.TP
+.BR \-y ", " \-\-syslog
+Cause all events to be reported through 'syslog'. The messages have
+facility of 'daemon' and varying priorities.
+
+.TP
+.BR \-d ", " \-\-delay
+Give a delay in seconds.
+.I mdadm
+polls the md arrays and then waits this many seconds before polling
+again. The default is 60 seconds. Since 2.6.16, there is no need to
+reduce this as the kernel alerts
+.I mdadm
+immediately when there is any change.
+
+.TP
+.BR \-r ", " \-\-increment
+Give a percentage increment.
+.I mdadm
+will generate RebuildNN events with the given percentage increment.
+
+.TP
+.BR \-f ", " \-\-daemonise
+Tell
+.I mdadm
+to run as a background daemon if it decides to monitor anything. This
+causes it to fork and run in the child, and to disconnect from the
+terminal. The process id of the child is written to stdout.
+This is useful with
+.B \-\-scan
+which will only continue monitoring if a mail address or alert program
+is found in the config file.
+
+.TP
+.BR \-i ", " \-\-pid\-file
+When
+.I mdadm
+is running in daemon mode, write the pid of the daemon process to
+the specified file, instead of printing it on standard output.
+
+.TP
+.BR \-1 ", " \-\-oneshot
+Check arrays only once. This will generate
+.B NewArray
+events and more significantly
+.B DegradedArray
+and
+.B SparesMissing
+events. Running
+.in +5
+.B " mdadm \-\-monitor \-\-scan \-1"
+.in -5
+from a cron script will ensure regular notification of any degraded arrays.
+
+.TP
+.BR \-t ", " \-\-test
+Generate a
+.B TestMessage
+alert for every array found at startup. This alert gets mailed and
+passed to the alert program. This can be used for testing that alert
+messages do get through successfully.
+
+.TP
+.BR \-\-no\-sharing
+This inhibits the functionality for moving spares between arrays.
+Only one monitoring process started with
+.B \-\-scan
+but without this flag is allowed, otherwise the two could interfere
+with each other.
+
+.SH ASSEMBLE MODE
+
+.HP 12
+Usage:
+.B mdadm \-\-assemble
+.I md-device options-and-component-devices...
+.HP 12
+Usage:
+.B mdadm \-\-assemble \-\-scan
+.I md-devices-and-options...
+.HP 12
+Usage:
+.B mdadm \-\-assemble \-\-scan
+.I options...
+
+.PP
+This usage assembles one or more RAID arrays from pre-existing components.
+For each array, mdadm needs to know the md device, the identity of the
+array, and a number of component-devices. These can be found in a number of ways.
+
+In the first usage example (without the
+.BR \-\-scan )
+the first device given is the md device.
+In the second usage example, all devices listed are treated as md
+devices and assembly is attempted.
+In the third (where no devices are listed) all md devices that are
+listed in the configuration file are assembled. If no arrays are
+described by the configuration file, then any arrays that
+can be found on unused devices will be assembled.
+
+If precisely one device is listed, but
+.B \-\-scan
+is not given, then
+.I mdadm
+acts as though
+.B \-\-scan
+was given and identity information is extracted from the configuration file.
+
+The identity can be given with the
+.B \-\-uuid
+option, the
+.B \-\-name
+option, or the
+.B \-\-super\-minor
+option, will be taken from the md-device record in the config file, or
+will be taken from the super block of the first component-device
+listed on the command line.
+
+Devices can be given on the
+.B \-\-assemble
+command line or in the config file. Only devices which have an md
+superblock which contains the right identity will be considered for
+any array.
+
+The config file is only used if explicitly named with
+.B \-\-config
+or requested with (a possibly implicit)
+.BR \-\-scan .
+In the latter case,
+.B /etc/mdadm.conf
+or
+.B /etc/mdadm/mdadm.conf
+is used.
+
+If
+.B \-\-scan
+is not given, then the config file will only be used to find the
+identity of md arrays.
+
+Normally the array will be started after it is assembled. However if
+.B \-\-scan
+is not given and not all expected drives were listed, then the array
+is not started (to guard against usage errors). To insist that the
+array be started in this case (as may work for RAID1, 4, 5, 6, or 10),
+give the
+.B \-\-run
+flag.
+
+If
+.I udev
+is active,
+.I mdadm
+does not create any entries in
+.B /dev
+but leaves that to
+.IR udev .
+It does record information in
+.B {MAP_PATH}
+which will allow
+.I udev
+to choose the correct name.
+
+If
+.I mdadm
+detects that udev is not configured, it will create the devices in
+.B /dev
+itself.
+
+In Linux kernels prior to version 2.6.28 there were two distinctly
+different types of md devices that could be created: one that could be
+partitioned using standard partitioning tools and one that could not.
+Since 2.6.28 that distinction is no longer relevant as both types of
+devices can be partitioned.
+.I mdadm
+will normally create the type that originally could not be partitioned
+as it has a well defined major number (9).
+
+Prior to 2.6.28, it is important that mdadm chooses the correct type
+of array device to use. This can be controlled with the
+.B \-\-auto
+option. In particular, a value of "mdp" or "part" or "p" tells mdadm
+to use a partitionable device rather than the default.
+
+In the no-udev case, the value given to
+.B \-\-auto
+can be suffixed by a number. This tells
+.I mdadm
+to create that number of partition devices rather than the default of 4.
+
+The value given to
+.B \-\-auto
+can also be given in the configuration file as a word starting
+.B auto=
+on the ARRAY line for the relevant array.
+
+.SS Auto Assembly
+When
+.B \-\-assemble
+is used with
+.B \-\-scan
+and no devices are listed,
+.I mdadm
+will first attempt to assemble all the arrays listed in the config
+file.
+
+If no arrays are listed in the config (other than those marked
+.BR <ignore> )
+it will look through the available devices for possible arrays and
+will try to assemble anything that it finds. Arrays which are tagged
+as belonging to the given homehost will be assembled and started
+normally. Arrays which do not obviously belong to this host are given
+names that are expected not to conflict with anything local, and are
+started "read-auto" so that nothing is written to any device until the
+array is written to. i.e. automatic resync etc is delayed.
+
+If
+.I mdadm
+finds a consistent set of devices that look like they should comprise
+an array, and if the superblock is tagged as belonging to the given
+home host, it will automatically choose a device name and try to
+assemble the array. If the array uses version-0.90 metadata, then the
+.B minor
+number as recorded in the superblock is used to create a name in
+.B /dev/md/
+so for example
+.BR /dev/md/3 .
+If the array uses version-1 metadata, then the
+.B name
+from the superblock is used to similarly create a name in
+.B /dev/md/
+(the name will have any 'host' prefix stripped first).
+
+This behaviour can be modified by the
+.I AUTO
+line in the
+.I mdadm.conf
+configuration file. This line can indicate that specific metadata
+types should, or should not, be automatically assembled. If an array
+is found which is not listed in
+.I mdadm.conf
+and has a metadata format that is denied by the
+.I AUTO
+line, then it will not be assembled.
+The
+.I AUTO
+line can also request that all arrays identified as being for this
+homehost should be assembled regardless of their metadata type.
+See
+.IR mdadm.conf (5)
+for further details.
+
+Note: Auto assembly cannot be used for assembling and activating some
+arrays which are undergoing reshape. In particular as the
+.B backup\-file
+cannot be given, any reshape which requires a backup-file to continue
+cannot be started by auto assembly. An array which is growing to more
+devices and has passed the critical section can be assembled using
+auto-assembly.
+
+.SH BUILD MODE
+
+.HP 12
+Usage:
+.B mdadm \-\-build
+.I md-device
+.BI \-\-chunk= X
+.BI \-\-level= Y
+.BI \-\-raid\-devices= Z
+.I devices
+
+.PP
+This usage is similar to
+.BR \-\-create .
+The difference is that it creates an array without a superblock. With
+these arrays there is no difference between initially creating the array and
+subsequently assembling the array, except that hopefully there is useful
+data there in the second case.
+
+The level may be raid0, linear, raid1, raid10, multipath, or faulty, or
+one of their synonyms. All devices must be listed and the array will
+be started once complete. It will often be appropriate to use
+.B \-\-assume\-clean
+with levels raid1 or raid10.
+
+.SH CREATE MODE
+
+.HP 12
+Usage:
+.B mdadm \-\-create
+.I md-device
+.BI \-\-chunk= X
+.BI \-\-level= Y
+.br
+.BI \-\-raid\-devices= Z
+.I devices
+
+.PP
+This usage will initialise a new md array, associate some devices with
+it, and activate the array.
+
+The named device will normally not exist when
+.I "mdadm \-\-create"
+is run, but will be created by
+.I udev
+once the array becomes active.
+
+The maximum length of the md-device name is 32 characters.
+Some metadata types impose stricter limits
+(for example IMSM, where only 16 characters are allowed).
+For that reason, a long name may be truncated or rejected, depending on the metadata policy.
+
+As devices are added, they are checked to see if they contain RAID
+superblocks or filesystems. They are also checked to see if the variance in
+device size exceeds 1%.
+
+If any discrepancy is found, the array will not automatically be run, though
+the presence of a
+.B \-\-run
+can override this caution.
+
+To create a "degraded" array in which some devices are missing, simply
+give the word "\fBmissing\fP"
+in place of a device name. This will cause
+.I mdadm
+to leave the corresponding slot in the array empty.
+For a RAID4 or RAID5 array at most one slot can be
+"\fBmissing\fP"; for a RAID6 array at most two slots.
+For a RAID1 array, only one real device needs to be given. All of the
+others can be
+"\fBmissing\fP".
+
+When creating a RAID5 array,
+.I mdadm
+will automatically create a degraded array with an extra spare drive.
+This is because building the spare into a degraded array is in general
+faster than resyncing the parity on a non-degraded, but not clean,
+array. This feature can be overridden with the
+.B \-\-force
+option.
+
+When creating an array with version-1 metadata a name for the array is
+required.
+If this is not given with the
+.B \-\-name
+option,
+.I mdadm
+will choose a name based on the last component of the name of the
+device being created. So if
+.B /dev/md3
+is being created, then the name
+.B 3
+will be chosen.
+If
+.B /dev/md/home
+is being created, then the name
+.B home
+will be used.
+
+When creating a partition based array, using
+.I mdadm
+with version-1.x metadata, the partition type should be set to
+.B 0xDA
+(non fs-data). This type selection allows for greater precision since
+using any other [RAID auto-detect (0xFD) or a GNU/Linux partition (0x83)],
+might create problems in the event of array recovery through a live cdrom.
+
+A new array will normally get a randomly assigned 128bit UUID which is
+very likely to be unique. If you have a specific need, you can choose
+a UUID for the array by giving the
+.B \-\-uuid=
+option. Be warned that creating two arrays with the same UUID is a
+recipe for disaster. Also, using
+.B \-\-uuid=
+when creating a v0.90 array will silently override any
+.B \-\-homehost=
+setting.
+.\"If the
+.\".B \-\-size
+.\"option is given, it is not necessary to list any component-devices in this command.
+.\"They can be added later, before a
+.\".B \-\-run.
+.\"If no
+.\".B \-\-size
+.\"is given, the apparent size of the smallest drive given is used.
+
+If the array type supports a write-intent bitmap, and if the devices
+in the array exceed 100G in size, an internal write-intent bitmap
+will automatically be added unless some other option is explicitly
+requested with the
+.B \-\-bitmap
+option or a different consistency policy is selected with the
+.B \-\-consistency\-policy
+option. In any case space for a bitmap will be reserved so that one
+can be added later with
+.BR "\-\-grow \-\-bitmap=internal" .
+
+If the metadata type supports it (currently only 1.x and IMSM metadata),
+space will be allocated to store a bad block list. This allows a modest
+number of bad blocks to be recorded, allowing the drive to remain in
+service while only partially functional.
+
+When creating an array within a
+.B CONTAINER
+.I mdadm
+can be given either the list of devices to use, or simply the name of
+the container. The former case gives control over which devices in
+the container will be used for the array. The latter case allows
+.I mdadm
+to automatically choose which devices to use based on how much spare
+space is available.
+
+The General Management options that are valid with
+.B \-\-create
+are:
+.TP
+.B \-\-run
+insist on running the array even if some devices look like they might
+be in use.
+
+.TP
+.B \-\-readonly
+start the array in readonly mode.
+
+.SH MANAGE MODE
+.HP 12
+Usage:
+.B mdadm
+.I device
+.I options... devices...
+.PP
+
+This usage will allow individual devices in an array to be failed,
+removed or added. It is possible to perform multiple operations with
+one command. For example:
+.br
+.B " mdadm /dev/md0 \-f /dev/hda1 \-r /dev/hda1 \-a /dev/hda1"
+.br
+will firstly mark
+.B /dev/hda1
+as faulty in
+.B /dev/md0
+and will then remove it from the array and finally add it back
+in as a spare. However only one md array can be affected by a single
+command.
+
+When a device is added to an active array, mdadm checks to see if it
+has metadata on it which suggests that it was recently a member of the
+array. If it does, it tries to "re\-add" the device. If there have
+been no changes since the device was removed, or if the array has a
+write-intent bitmap which has recorded whatever changes there were,
+then the device will immediately become a full member of the array and
+those differences recorded in the bitmap will be resolved.
+
+.SH MISC MODE
+.HP 12
+Usage:
+.B mdadm
+.I options ...
+.I devices ...
+.PP
+
+MISC mode includes a number of distinct operations that
+operate on distinct devices. The operations are:
+.TP
+.B \-\-query
+The device is examined to see if it is
+(1) an active md array, or
+(2) a component of an md array.
+The information discovered is reported.
+
+.TP
+.B \-\-detail
+The device should be an active md device.
+.B mdadm
+will display a detailed description of the array.
+.B \-\-brief
+or
+.B \-\-scan
+will cause the output to be less detailed and the format to be
+suitable for inclusion in
+.BR mdadm.conf .
+The exit status of
+.I mdadm
+will normally be 0 unless
+.I mdadm
+failed to get useful information about the device(s); however, if the
+.B \-\-test
+option is given, then the exit status will be:
+.RS
+.TP
+0
+The array is functioning normally.
+.TP
+1
+The array has at least one failed device.
+.TP
+2
+The array has multiple failed devices such that it is unusable.
+.TP
+4
+There was an error while trying to get information about the device.
+.RE
+
+.TP
+.B \-\-detail\-platform
+Print detail of the platform's RAID capabilities (firmware / hardware
+topology). If the metadata is specified with
+.B \-e
+or
+.B \-\-metadata=
+then the return status will be:
+.RS
+.TP
+0
+metadata successfully enumerated its platform components on this system
+.TP
+1
+metadata is platform independent
+.TP
+2
+metadata failed to find its platform components on this system
+.RE
+
+.TP
+.B \-\-update\-subarray=
+If the device is a container and the argument to \-\-update\-subarray
+specifies a subarray in the container, then attempt to update the given
+superblock field in the subarray. Similar to updating an array in
+"assemble" mode, the field to update is selected by
+.B \-U
+or
+.B \-\-update=
+option. The supported options are
+.BR name ,
+.BR ppl ,
+.BR no\-ppl ,
+.BR bitmap
+and
+.BR no\-bitmap .
+
+The
+.B name
+option updates the subarray name in the metadata; it may not affect the
+device node name or the device node symlink until the subarray is
+re\-assembled. If updating
+.B name
+would change the UUID of an active subarray this operation is blocked,
+and the command will end in an error.
+
+The
+.B ppl
+and
+.B no\-ppl
+options enable and disable PPL in the metadata. Currently supported only for
+IMSM subarrays.
+
+The
+.B bitmap
+and
+.B no\-bitmap
+options enable and disable write-intent bitmap in the metadata. Currently supported only for
+IMSM subarrays.
+
+.TP
+.B \-\-examine
+The device should be a component of an md array.
+.I mdadm
+will read the md superblock of the device and display the contents.
+If
+.B \-\-brief
+or
+.B \-\-scan
+is given, then multiple devices that are components of the one array
+are grouped together and reported in a single entry suitable
+for inclusion in
+.BR mdadm.conf .
+
+Having
+.B \-\-scan
+without listing any devices will cause all devices listed in the
+config file to be examined.
+
+.TP
+.BI \-\-dump= directory
+If the device contains RAID metadata, a file will be created in the
+.I directory
+and the metadata will be written to it. The file will be the same
+size as the device and have the metadata written in the file at the
+same location that it exists in the device. However the file will be "sparse" so
+that only those blocks containing metadata will be allocated. The
+total space used will be small.
+
+The file name used in the
+.I directory
+will be the base name of the device. Further if any links appear in
+.I /dev/disk/by-id
+which point to the device, then hard links to the file will be created
+in
+.I directory
+based on these
+.I by-id
+names.
+
+Multiple devices can be listed and their metadata will all be stored
+in the one directory.
+
+.TP
+.BI \-\-restore= directory
+This is the reverse of
+.BR \-\-dump .
+.I mdadm
+will locate a file in the directory that has a name appropriate for
+the given device and will restore metadata from it. Names that match
+.I /dev/disk/by-id
+names are preferred, however if two of those refer to different files,
+.I mdadm
+will not choose between them but will abort the operation.
+
+If a file name is given instead of a
+.I directory
+then
+.I mdadm
+will restore from that file to a single device, always provided the
+size of the file matches that of the device, and the file contains
+valid metadata.
+.TP
+.B \-\-stop
+The devices should be active md arrays which will be deactivated, as
+long as they are not currently in use.
+
+.TP
+.B \-\-run
+This will fully activate a partially assembled md array.
+
+.TP
+.B \-\-readonly
+This will mark an active array as read-only, providing that it is
+not currently being used.
+
+.TP
+.B \-\-readwrite
+This will change a
+.B readonly
+array back to being read/write.
+
+.TP
+.B \-\-scan
+For all operations except
+.BR \-\-examine ,
+.B \-\-scan
+will cause the operation to be applied to all arrays listed in
+.BR /proc/mdstat .
+For
+.BR \-\-examine ,
+.B \-\-scan
+causes all devices listed in the config file to be examined.
+
+.TP
+.BR \-b ", " \-\-brief
+Be less verbose. This is used with
+.B \-\-detail
+and
+.BR \-\-examine .
+Using
+.B \-\-brief
+with
+.B \-\-verbose
+gives an intermediate level of verbosity.
+
+.SH MONITOR MODE
+
+.HP 12
+Usage:
+.B mdadm \-\-monitor
+.I options... devices...
+
+.PP
+This usage causes
+.I mdadm
+to periodically poll a number of md arrays and to report on any events
+noticed.
+.I mdadm
+will never exit once it decides that there are arrays to be checked,
+so it should normally be run in the background.
+
+As well as reporting events,
+.I mdadm
+may move a spare drive from one array to another if they are in the
+same
+.B spare-group
+or
+.B domain
+and if the destination array has a failed drive but no spares.
+
+If any devices are listed on the command line,
+.I mdadm
+will only monitor those devices. Otherwise all arrays listed in the
+configuration file will be monitored. Further, if
+.B \-\-scan
+is given, then any other md devices that appear in
+.B /proc/mdstat
+will also be monitored.
+
+The result of monitoring the arrays is the generation of events.
+These events are passed to a separate program (if specified) and may
+be mailed to a given E-mail address.
+
+When passing events to a program, the program is run once for each event,
+and is given 2 or 3 command-line arguments: the first is the
+name of the event (see below), the second is the name of the
+md device which is affected, and the third is the name of a related
+device if relevant (such as a component device that has failed).
+
+If
+.B \-\-scan
+is given, then a program or an E-mail address must be specified on the
+command line or in the config file. If neither are available, then
+.I mdadm
+will not monitor anything.
+Without
+.BR \-\-scan ,
+.I mdadm
+will continue monitoring as long as something was found to monitor. If
+no program or email is given, then each event is reported to
+.BR stdout .
+
+The different events are:
+
+.RS 4
+.TP
+.B DeviceDisappeared
+An md array which previously was configured appears to no longer be
+configured. (syslog priority: Critical)
+
+If
+.I mdadm
+was told to monitor an array which is RAID0 or Linear, then it will
+report
+.B DeviceDisappeared
+with the extra information
+.BR Wrong-Level .
+This is because RAID0 and Linear do not support the device-failed,
+hot-spare and resync operations which are monitored.
+
+.TP
+.B RebuildStarted
+An md array started reconstruction (e.g. recovery, resync, reshape,
+check, repair). (syslog priority: Warning)
+
+.TP
+.BI Rebuild NN
+Where
+.I NN
+is a two-digit number (e.g. 05, 48). This indicates that rebuild
+has passed that many percent of the total. The events are generated
+with fixed increment since 0. Increment size may be specified with
+a commandline option (default is 20). (syslog priority: Warning)
+
+.TP
+.B RebuildFinished
+An md array that was rebuilding, isn't any more, either because it
+finished normally or was aborted. (syslog priority: Warning)
+
+.TP
+.B Fail
+An active component device of an array has been marked as
+faulty. (syslog priority: Critical)
+
+.TP
+.B FailSpare
+A spare component device which was being rebuilt to replace a faulty
+device has failed. (syslog priority: Critical)
+
+.TP
+.B SpareActive
+A spare component device which was being rebuilt to replace a faulty
+device has been successfully rebuilt and has been made active.
+(syslog priority: Info)
+
+.TP
+.B NewArray
+A new md array has been detected in the
+.B /proc/mdstat
+file. (syslog priority: Info)
+
+.TP
+.B DegradedArray
+A newly noticed array appears to be degraded. This message is not
+generated when
+.I mdadm
+notices a drive failure which causes degradation, but only when
+.I mdadm
+notices that an array is degraded when it first sees the array.
+(syslog priority: Critical)
+
+.TP
+.B MoveSpare
+A spare drive has been moved from one array in a
+.B spare-group
+or
+.B domain
+to another to allow a failed drive to be replaced.
+(syslog priority: Info)
+
+.TP
+.B SparesMissing
+If
+.I mdadm
+has been told, via the config file, that an array should have a certain
+number of spare devices, and
+.I mdadm
+detects that it has fewer than this number when it first sees the
+array, it will report a
+.B SparesMissing
+message.
+(syslog priority: Warning)
+
+.TP
+.B TestMessage
+An array was found at startup, and the
+.B \-\-test
+flag was given.
+(syslog priority: Info)
+.RE
+
+Only
+.BR Fail ,
+.BR FailSpare ,
+.BR DegradedArray ,
+.B SparesMissing
+and
+.B TestMessage
+cause Email to be sent. All events cause the program to be run.
+The program is run with two or three arguments: the event
+name, the array device and possibly a second device.
+
+Each event has an associated array device (e.g.
+.BR /dev/md1 )
+and possibly a second device. For
+.BR Fail ,
+.BR FailSpare ,
+and
+.B SpareActive
+the second device is the relevant component device.
+For
+.B MoveSpare
+the second device is the array that the spare was moved from.
+
+For
+.I mdadm
+to move spares from one array to another, the different arrays need to
+be labeled with the same
+.B spare-group
+or the spares must be allowed to migrate through matching POLICY domains
+in the configuration file. The
+.B spare-group
+name can be any string; it is only necessary that different spare
+groups use different names.
+
+When
+.I mdadm
+detects that an array in a spare group has fewer active
+devices than necessary for the complete array, and has no spare
+devices, it will look for another array in the same spare group that
+has a full complement of working drives and a spare. It will then
+attempt to remove the spare from the second array and add it to the
+first.
+If the removal succeeds but the adding fails, then it is added back to
+the original array.
+
+If the spare group for a degraded array is not defined,
+.I mdadm
+will look at the rules of spare migration specified by POLICY lines in
+.B mdadm.conf
+and then follow similar steps as above if a matching spare is found.
+
+.SH GROW MODE
+The GROW mode is used for changing the size or shape of an active
+array.
+For this to work, the kernel must support the necessary change.
+Various types of growth are being added during 2.6 development.
+
+Currently the supported changes include
+.IP \(bu 4
+change the "size" attribute for RAID1, RAID4, RAID5 and RAID6.
+.IP \(bu 4
+increase or decrease the "raid\-devices" attribute of RAID0, RAID1, RAID4,
+RAID5, and RAID6.
+.IP \(bu 4
+change the chunk-size and layout of RAID0, RAID4, RAID5, RAID6 and RAID10.
+.IP \(bu 4
+convert between RAID1 and RAID5, between RAID5 and RAID6, between
+RAID0, RAID4, and RAID5, and between RAID0 and RAID10 (in the near-2 mode).
+.IP \(bu 4
+add a write-intent bitmap to any array which supports these bitmaps, or
+remove a write-intent bitmap from such an array.
+.IP \(bu 4
+change the array's consistency policy.
+.PP
+
+Using GROW on containers is currently supported only for Intel's IMSM
+container format. The number of devices in a container can be
+increased - which affects all arrays in the container - or an array
+in a container can be converted between levels where those levels are
+supported by the container, and the conversion is one of those listed
+above.
+
+.PP
+Notes:
+.IP \(bu 4
+Intel's native checkpointing doesn't use
+.B \-\-backup\-file
+option and it is transparent for assembly feature.
+.IP \(bu 4
+Roaming between Windows(R) and Linux systems for IMSM metadata is not
+supported during grow process.
+.IP \(bu 4
+When growing a raid0 device, the new component disk size (or external
+backup size) should be larger than LCM(old, new) * chunk-size * 2,
+where LCM() is the least common multiple of the old and new count of
+component disks, and "* 2" comes from the fact that mdadm refuses to
+use more than half of a spare device for backup space.
+
+.SS SIZE CHANGES
+Normally when an array is built the "size" is taken from the smallest
+of the drives. If all the small drives in an array are, one at a
+time, removed and replaced with larger drives, then you could have an
+array of large drives with only a small amount used. In this
+situation, changing the "size" with "GROW" mode will allow the extra
+space to start being used. If the size is increased in this way, a
+"resync" process will start to make sure the new parts of the array
+are synchronised.
+
+Note that when an array changes size, any filesystem that may be
+stored in the array will not automatically grow or shrink to use or
+vacate the space. The
+filesystem will need to be explicitly told to use the extra space
+after growing, or to reduce its size
+.B prior
+to shrinking the array.
+
+Also the size of an array cannot be changed while it has an active
+bitmap. If an array has a bitmap, it must be removed before the size
+can be changed. Once the change is complete a new bitmap can be created.
+
+.PP
+Note:
+.B "\-\-grow \-\-size"
+is not yet supported for external file bitmap.
+
+.SS RAID\-DEVICES CHANGES
+
+A RAID1 array can work with any number of devices from 1 upwards
+(though 1 is not very useful). There may be times when you want to
+increase or decrease the number of active devices. Note that this is
+different to hot-add or hot-remove which changes the number of
+inactive devices.
+
+When reducing the number of devices in a RAID1 array, the slots which
+are to be removed from the array must already be vacant. That is, the
+devices which were in those slots must be failed and removed.
+
+When the number of devices is increased, any hot spares that are
+present will be activated immediately.
+
+Changing the number of active devices in a RAID5 or RAID6 is much more
+effort. Every block in the array will need to be read and written
+back to a new location. From 2.6.17, the Linux Kernel is able to
+increase the number of devices in a RAID5 safely, including restarting
+an interrupted "reshape". From 2.6.31, the Linux Kernel is able to
+increase or decrease the number of devices in a RAID5 or RAID6.
+
+From 2.6.35, the Linux Kernel is able to convert a RAID0 in to a RAID4
+or RAID5.
+.I mdadm
+uses this functionality and the ability to add
+devices to a RAID4 to allow devices to be added to a RAID0. When
+requested to do this,
+.I mdadm
+will convert the RAID0 to a RAID4, add the necessary disks and make
+the reshape happen, and then convert the RAID4 back to RAID0.
+
+When decreasing the number of devices, the size of the array will also
+decrease. If there was data in the array, it could get destroyed and
+this is not reversible, so you should firstly shrink the filesystem on
+the array to fit within the new size. To help prevent accidents,
+.I mdadm
+requires that the size of the array be decreased first with
+.BR "mdadm \-\-grow \-\-array\-size" .
+This is a reversible change which simply makes the end of the array
+inaccessible. The integrity of any data can then be checked before
+the non-reversible reduction in the number of devices is requested.
+
+When relocating the first few stripes on a RAID5 or RAID6, it is not
+possible to keep the data on disk completely consistent and
+crash-proof. To provide the required safety, mdadm disables writes to
+the array while this "critical section" is reshaped, and takes a
+backup of the data that is in that section. For grows, this backup may be
+stored in any spare devices that the array has, however it can also be
+stored in a separate file specified with the
+.B \-\-backup\-file
+option, and is required to be specified for shrinks, RAID level
+changes and layout changes. If this option is used, and the system
+does crash during the critical period, the same file must be passed to
+.B \-\-assemble
+to restore the backup and reassemble the array. When shrinking rather
+than growing the array, the reshape is done from the end towards the
+beginning, so the "critical section" is at the end of the reshape.
+
+.SS LEVEL CHANGES
+
+Changing the RAID level of any array happens instantaneously. However
+in the RAID5 to RAID6 case this requires a non-standard layout of the
+RAID6 data, and in the RAID6 to RAID5 case that non-standard layout is
+required before the change can be accomplished. So while the level
+change is instant, the accompanying layout change can take quite a
+long time. A
+.B \-\-backup\-file
+is required. If the array is not simultaneously being grown or
+shrunk, so that the array size will remain the same - for example,
+reshaping a 3-drive RAID5 into a 4-drive RAID6 - the backup file will
+be used not just for a "critical section" but throughout the reshape
+operation, as described below under CHUNK-SIZE AND LAYOUT CHANGES.
+
+.SS CHUNK-SIZE AND LAYOUT CHANGES
+
+Changing the chunk-size or layout without also changing the number of
+devices at the same time will involve re-writing all blocks in-place.
+To ensure against data loss in the case of a crash, a
+.B \-\-backup\-file
+must be provided for these changes. Small sections of the array will
+be copied to the backup file while they are being rearranged. This
+means that all the data is copied twice, once to the backup and once
+to the new layout on the array, so this type of reshape will go very
+slowly.
+
+If the reshape is interrupted for any reason, this backup file must be
+made available to
+.B "mdadm \-\-assemble"
+so the array can be reassembled. Consequently the file cannot be
+stored on the device being reshaped.
+
+
+.SS BITMAP CHANGES
+
+A write-intent bitmap can be added to, or removed from, an active
+array. Either internal bitmaps, or bitmaps stored in a separate file,
+can be added. Note that if you add a bitmap stored in a file which is
+in a filesystem that is on the RAID array being affected, the system
+will deadlock. The bitmap must be on a separate filesystem.
+
+.SS CONSISTENCY POLICY CHANGES
+
+The consistency policy of an active array can be changed by using the
+.B \-\-consistency\-policy
+option in Grow mode. Currently this works only for the
+.B ppl
+and
+.B resync
+policies and allows the RAID5 Partial Parity Log (PPL) to be enabled or disabled.
+
+.SH INCREMENTAL MODE
+
+.HP 12
+Usage:
+.B mdadm \-\-incremental
+.RB [ \-\-run ]
+.RB [ \-\-quiet ]
+.I component-device
+.RI [ optional-aliases-for-device ]
+.HP 12
+Usage:
+.B mdadm \-\-incremental \-\-fail
+.I component-device
+.HP 12
+Usage:
+.B mdadm \-\-incremental \-\-rebuild\-map
+.HP 12
+Usage:
+.B mdadm \-\-incremental \-\-run \-\-scan
+
+.PP
+This mode is designed to be used in conjunction with a device
+discovery system. As devices are found in a system, they can be
+passed to
+.B "mdadm \-\-incremental"
+to be conditionally added to an appropriate array.
+
+Conversely, it can also be used with the
+.B \-\-fail
+flag to do just the opposite and find whatever array a particular device
+is part of and remove the device from that array.
+
+If the device passed is a
+.B CONTAINER
+device created by a previous call to
+.IR mdadm ,
+then rather than trying to add that device to an array, all the arrays
+described by the metadata of the container will be started.
+
+.I mdadm
+performs a number of tests to determine if the device is part of an
+array, and which array it should be part of. If an appropriate array
+is found, or can be created,
+.I mdadm
+adds the device to the array and conditionally starts the array.
+
+Note that
+.I mdadm
+will normally only add devices to an array which were previously working
+(active or spare) parts of that array. The support for automatic
+inclusion of a new drive as a spare in some array requires
+a configuration through POLICY in config file.
+
+The tests that
+.I mdadm
+makes are as follows:
+.IP +
+Is the device permitted by
+.BR mdadm.conf ?
+That is, is it listed in a
+.B DEVICES
+line in that file. If
+.B DEVICES
+is absent then the default is to allow any device. Similarly if
+.B DEVICES
+contains the special word
+.B partitions
+then any device is allowed. Otherwise the device name given to
+.IR mdadm ,
+or one of the aliases given, or an alias found in the filesystem,
+must match one of the names or patterns in a
+.B DEVICES
+line.
+
+This is the only context where the aliases are used. They are
+usually provided by
+.I udev
+rules mentioning
+.BR $env{DEVLINKS} .
+
+.IP +
+Does the device have a valid md superblock? If a specific metadata
+version is requested with
+.B \-\-metadata
+or
+.B \-e
+then only that style of metadata is accepted, otherwise
+.I mdadm
+finds any known version of metadata. If no
+.I md
+metadata is found, the device may be still added to an array
+as a spare if POLICY allows.
+
+.ig
+.IP +
+Does the metadata match an expected array?
+The metadata can match in two ways. Either there is an array listed
+in
+.B mdadm.conf
+which identifies the array (either by UUID, by name, by device list,
+or by minor-number), or the array was created with a
+.B homehost
+specified and that
+.B homehost
+matches the one in
+.B mdadm.conf
+or on the command line.
+If
+.I mdadm
+is not able to positively identify the array as belonging to the
+current host, the device will be rejected.
+..
+
+.PP
+.I mdadm
+keeps a list of arrays that it has partially assembled in
+.BR {MAP_PATH} .
+If no array exists which matches
+the metadata on the new device,
+.I mdadm
+must choose a device name and unit number. It does this based on any
+name given in
+.B mdadm.conf
+or any name information stored in the metadata. If this name
+suggests a unit number, that number will be used, otherwise a free
+unit number will be chosen. Normally
+.I mdadm
+will prefer to create a partitionable array, however if the
+.B CREATE
+line in
+.B mdadm.conf
+suggests that a non-partitionable array is preferred, that will be
+honoured.
+
+If the array is not found in the config file and its metadata does not
+identify it as belonging to the "homehost", then
+.I mdadm
+will choose a name for the array which is certain not to conflict with
+any array which does belong to this host. It does this by adding an
+underscore and a small number to the name preferred by the metadata.
+
+Once an appropriate array is found or created and the device is added,
+.I mdadm
+must decide if the array is ready to be started. It will
+normally compare the number of available (non-spare) devices to the
+number of devices that the metadata suggests need to be active. If
+there are at least that many, the array will be started. This means
+that if any devices are missing the array will not be restarted.
+
+As an alternative,
+.B \-\-run
+may be passed to
+.I mdadm
+in which case the array will be run as soon as there are enough
+devices present for the data to be accessible. For a RAID1, that
+means one device will start the array. For a clean RAID5, the array
+will be started as soon as all but one drive is present.
+
+Note that neither of these approaches is really ideal. If it can
+be known that all device discovery has completed, then
+.br
+.B " mdadm \-IRs"
+.br
+can be run which will try to start all arrays that are being
+incrementally assembled. They are started in "read-auto" mode in
+which they are read-only until the first write request. This means
+that no metadata updates are made and no attempt at resync or recovery
+happens. Further devices that are found before the first write can
+still be added safely.
+
+.SH ENVIRONMENT
+This section describes environment variables that affect how mdadm
+operates.
+
+.TP
+.B MDADM_NO_MDMON
+Setting this value to 1 will prevent mdadm from automatically launching
+mdmon. This variable is intended primarily for debugging mdadm/mdmon.
+
+.TP
+.B MDADM_NO_UDEV
+Normally,
+.I mdadm
+does not create any device nodes in /dev, but leaves that task to
+.IR udev .
+If
+.I udev
+appears not to be configured, or if this environment variable is set
+to '1', then
+.I mdadm
+will create any devices that are needed.
+
+.TP
+.B MDADM_NO_SYSTEMCTL
+If
+.I mdadm
+detects that
+.I systemd
+is in use it will normally request
+.I systemd
+to start various background tasks (particularly
+.IR mdmon )
+rather than forking and running them in the background. This can be
+suppressed by setting
+.BR MDADM_NO_SYSTEMCTL=1 .
+
+.TP
+.B IMSM_NO_PLATFORM
+A key value of IMSM metadata is that it allows interoperability with
+boot ROMs on Intel platforms, and with other major operating systems.
+Consequently,
+.I mdadm
+will only allow an IMSM array to be created or modified if it detects
+that it is running on an Intel platform which supports IMSM, and
+supports the particular configuration of IMSM that is being requested
+(some functionality requires newer OROM support).
+
+These checks can be suppressed by setting IMSM_NO_PLATFORM=1 in the
+environment. This can be useful for testing or for disaster
+recovery. You should be aware that interoperability may be
+compromised by setting this value.
+
+.TP
+.B MDADM_GROW_ALLOW_OLD
+If an array is stopped while it is performing a reshape and that
+reshape was making use of a backup file, then when the array is
+re-assembled
+.I mdadm
+will sometimes complain that the backup file is too old. If this
+happens and you are certain it is the right backup file, you can
+over-ride this check by setting
+.B MDADM_GROW_ALLOW_OLD=1
+in the environment.
+
+.TP
+.B MDADM_CONF_AUTO
+Any string given in this variable is added to the start of the
+.B AUTO
+line in the config file, or treated as the whole
+.B AUTO
+line if none is given. It can be used to disable certain metadata
+types when
+.I mdadm
+is called from a boot script. For example
+.br
+.B " export MDADM_CONF_AUTO='-ddf -imsm'"
+.br
+will make sure that
+.I mdadm
+does not automatically assemble any DDF or
+IMSM arrays that are found. This can be useful on systems configured
+to manage such arrays with
+.BR dmraid .
+
+
+.SH EXAMPLES
+
+.B " mdadm \-\-query /dev/name-of-device"
+.br
+This will find out if a given device is a RAID array, or is part of
+one, and will provide brief information about the device.
+
+.B " mdadm \-\-assemble \-\-scan"
+.br
+This will assemble and start all arrays listed in the standard config
+file. This command will typically go in a system startup file.
+
+.B " mdadm \-\-stop \-\-scan"
+.br
+This will shut down all arrays that can be shut down (i.e. are not
+currently in use). This will typically go in a system shutdown script.
+
+.B " mdadm \-\-follow \-\-scan \-\-delay=120"
+.br
+If (and only if) there is an Email address or program given in the
+standard config file, then
+monitor the status of all arrays listed in that file by
+polling them every 2 minutes.
+
+.B " mdadm \-\-create /dev/md0 \-\-level=1 \-\-raid\-devices=2 /dev/hd[ac]1"
+.br
+Create /dev/md0 as a RAID1 array consisting of /dev/hda1 and /dev/hdc1.
+
+.br
+.B " echo 'DEVICE /dev/hd*[0\-9] /dev/sd*[0\-9]' > mdadm.conf"
+.br
+.B " mdadm \-\-detail \-\-scan >> mdadm.conf"
+.br
+This will create a prototype config file that describes currently
+active arrays that are known to be made from partitions of IDE or SCSI drives.
+This file should be reviewed before being used as it may
+contain unwanted detail.
+
+.B " echo 'DEVICE /dev/hd[a\-z] /dev/sd*[a\-z]' > mdadm.conf"
+.br
+.B " mdadm \-\-examine \-\-scan \-\-config=mdadm.conf >> mdadm.conf"
+.br
+This will find arrays which could be assembled from existing IDE and
+SCSI whole drives (not partitions), and store the information in the
+format of a config file.
+This file is very likely to contain unwanted detail, particularly
+the
+.B devices=
+entries. It should be reviewed and edited before being used as an
+actual config file.
+
+.B " mdadm \-\-examine \-\-brief \-\-scan \-\-config=partitions"
+.br
+.B " mdadm \-Ebsc partitions"
+.br
+Create a list of devices by reading
+.BR /proc/partitions ,
+scan these for RAID superblocks, and print out a brief listing of all
+that were found.
+
+.B " mdadm \-Ac partitions \-m 0 /dev/md0"
+.br
+Scan all partitions and devices listed in
+.BR /proc/partitions
+and assemble
+.B /dev/md0
+out of all such devices with a RAID superblock with a minor number of 0.
+
+.B " mdadm \-\-monitor \-\-scan \-\-daemonise > /run/mdadm/mon.pid"
+.br
+If config file contains a mail address or alert program, run mdadm in
+the background in monitor mode monitoring all md devices. Also write
+pid of mdadm daemon to
+.BR /run/mdadm/mon.pid .
+
+.B " mdadm \-Iq /dev/somedevice"
+.br
+Try to incorporate newly discovered device into some array as
+appropriate.
+
+.B " mdadm \-\-incremental \-\-rebuild\-map \-\-run \-\-scan"
+.br
+Rebuild the array map from any current arrays, and then start any that
+can be started.
+
+.B " mdadm /dev/md4 --fail detached --remove detached"
+.br
+Any devices which are components of /dev/md4 will be marked as faulty
+and then removed from the array.
+
+.B " mdadm --grow /dev/md4 --level=6 --backup-file=/root/backup-md4"
+.br
+The array
+.B /dev/md4
+which is currently a RAID5 array will be converted to RAID6. There
+should normally already be a spare drive attached to the array as a
+RAID6 needs one more drive than a matching RAID5.
+
+.B " mdadm --create /dev/md/ddf --metadata=ddf --raid-disks 6 /dev/sd[a-f]"
+.br
+Create a DDF array over 6 devices.
+
+.B " mdadm --create /dev/md/home -n3 -l5 -z 30000000 /dev/md/ddf"
+.br
+Create a RAID5 array over any 3 devices in the given DDF set. Use
+only 30 gigabytes of each device.
+
+.B " mdadm -A /dev/md/ddf1 /dev/sd[a-f]"
+.br
+Assemble a pre-existing DDF array.
+
+.B " mdadm -I /dev/md/ddf1"
+.br
+Assemble all arrays contained in the ddf array, assigning names as
+appropriate.
+
+.B " mdadm \-\-create \-\-help"
+.br
+Provide help about the Create mode.
+
+.B " mdadm \-\-config \-\-help"
+.br
+Provide help about the format of the config file.
+
+.B " mdadm \-\-help"
+.br
+Provide general help.
+
+.SH FILES
+
+.SS /proc/mdstat
+
+If you're using the
+.B /proc
+filesystem,
+.B /proc/mdstat
+lists all active md devices with information about them.
+.I mdadm
+uses this to find arrays when
+.B \-\-scan
+is given in Misc mode, and to monitor array reconstruction
+in Monitor mode.
+
+.SS /etc/mdadm.conf
+
+The config file lists which devices may be scanned to see if
+they contain an MD superblock, and gives identifying information
+(e.g. UUID) about known MD arrays. See
+.BR mdadm.conf (5)
+for more details.
+
+.SS /etc/mdadm.conf.d
+
+A directory containing configuration files which are read in lexical
+order.
+
+.SS {MAP_PATH}
+When
+.B \-\-incremental
+mode is used, this file gets a list of arrays currently being created.
+
+.SH DEVICE NAMES
+
+.I mdadm
+understands two sorts of names for array devices.
+
+The first is the so-called 'standard' format name, which matches the
+names used by the kernel and which appear in
+.IR /proc/mdstat .
+
+The second sort can be freely chosen, but must reside in
+.IR /dev/md/ .
+When giving a device name to
+.I mdadm
+to create or assemble an array, either a full path name such as
+.I /dev/md0
+or
+.I /dev/md/home
+can be given, or just the suffix of the second sort of name, such as
+.I home
+can be given.
+
+When
+.I mdadm
+chooses device names during auto-assembly or incremental assembly, it
+will sometimes add a small sequence number to the end of the name to
+avoid conflicts between multiple arrays that have the same name. If
+.I mdadm
+can reasonably determine that the array really is meant for this host,
+either by a hostname in the metadata, or by the presence of the array
+in
+.BR mdadm.conf ,
+then it will leave off the suffix if possible.
+Also if the homehost is specified as
+.B <ignore>
+.I mdadm
+will only use a suffix if a different array of the same name already
+exists or is listed in the config file.
+
+The standard names for non-partitioned arrays (the only sort of md
+array available in 2.4 and earlier) are of the form
+.IP
+.RB /dev/md NN
+.PP
+where NN is a number.
+The standard names for partitionable arrays (as available from 2.6
+onwards) are of the form:
+.IP
+.RB /dev/md_d NN
+.PP
+Partition numbers should be indicated by adding "pMM" to these, thus "/dev/md/d1p2".
+.PP
+From kernel version 2.6.28 the "non-partitioned array" can actually
+be partitioned. So the "md_d\fBNN\fP"
+names are no longer needed, and
+partitions such as "/dev/md\fBNN\fPp\fBXX\fP"
+are possible.
+.PP
+From kernel version 2.6.29 standard names can be non-numeric following
+the form:
+.IP
+.RB /dev/md_ XXX
+.PP
+where
+.B XXX
+is any string. These names are supported by
+.I mdadm
+since version 3.3 provided they are enabled in
+.IR mdadm.conf .
+
+.SH NOTE
+.I mdadm
+was previously known as
+.IR mdctl .
+
+.SH SEE ALSO
+For further information on mdadm usage, MD and the various levels of
+RAID, see:
+.IP
+.B https://raid.wiki.kernel.org/
+.PP
+(based upon Jakob \(/Ostergaard's Software\-RAID.HOWTO)
+.PP
+The latest version of
+.I mdadm
+should always be available from
+.IP
+.B https://www.kernel.org/pub/linux/utils/raid/mdadm/
+.PP
+Related man pages:
+.PP
+.IR mdmon (8),
+.IR mdadm.conf (5),
+.IR md (4).
diff --git a/mdadm.c b/mdadm.c
new file mode 100644
index 0000000..26299b2
--- /dev/null
+++ b/mdadm.c
@@ -0,0 +1,2078 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2001-2013 Neil Brown <neilb@suse.de>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email: <neilb@suse.de>
+ *
+ * Additions for bitmap and write-behind RAID options, Copyright (C) 2003-2004,
+ * Paul Clements, SteelEye Technology, Inc.
+ */
+
+#include "mdadm.h"
+#include "md_p.h"
+#include <ctype.h>
+
+static int scan_assemble(struct supertype *ss,
+ struct context *c,
+ struct mddev_ident *ident);
+static int misc_scan(char devmode, struct context *c);
+static int stop_scan(int verbose);
+static int misc_list(struct mddev_dev *devlist,
+ struct mddev_ident *ident,
+ char *dump_directory,
+ struct supertype *ss, struct context *c);
+const char Name[] = "mdadm";
+
+int main(int argc, char *argv[])
+{
+ int mode = 0;
+ int opt;
+ int option_index;
+ int rv;
+ int i;
+
+ unsigned long long array_size = 0;
+ unsigned long long data_offset = INVALID_SECTORS;
+ struct mddev_ident ident;
+ char *configfile = NULL;
+ int devmode = 0;
+ int bitmap_fd = -1;
+ struct mddev_dev *devlist = NULL;
+ struct mddev_dev **devlistend = & devlist;
+ struct mddev_dev *dv;
+ mdu_array_info_t array;
+ int devs_found = 0;
+ char *symlinks = NULL;
+ int grow_continue = 0;
+ /* autof indicates whether and how to create device node.
+ * bottom 3 bits are style. Rest (when shifted) are number of parts
+ * 0 - unset
+ * 1 - don't create (no)
+ * 2 - if is_standard, then create (yes)
+ * 3 - create as 'md' - reject is_standard mdp (md)
+ * 4 - create as 'mdp' - reject is_standard md (mdp)
+ * 5 - default to md if not is_standard (md in config file)
+ * 6 - default to mdp if not is_standard (part, or mdp in config file)
+ */
+ struct context c = {
+ .require_homehost = 1,
+ };
+ struct shape s = {
+ .journaldisks = 0,
+ .level = UnSet,
+ .layout = UnSet,
+ .bitmap_chunk = UnSet,
+ .consistency_policy = CONSISTENCY_POLICY_UNKNOWN,
+ };
+
+ char sys_hostname[256];
+ char *mailaddr = NULL;
+ char *program = NULL;
+ int increments = 20;
+ int daemonise = 0;
+ char *pidfile = NULL;
+ int oneshot = 0;
+ int spare_sharing = 1;
+ struct supertype *ss = NULL;
+ enum flag_mode writemostly = FlagDefault;
+ enum flag_mode failfast = FlagDefault;
+ char *shortopt = short_options;
+ int dosyslog = 0;
+ int rebuild_map = 0;
+ char *remove_path = NULL;
+ char *udev_filename = NULL;
+ char *dump_directory = NULL;
+
+ int print_help = 0;
+ FILE *outf;
+
+ int mdfd = -1;
+ int locked = 0;
+
+ srandom(time(0) ^ getpid());
+
+ ident.uuid_set = 0;
+ ident.level = UnSet;
+ ident.raid_disks = UnSet;
+ ident.super_minor = UnSet;
+ ident.devices = 0;
+ ident.spare_group = NULL;
+ ident.autof = 0;
+ ident.st = NULL;
+ ident.bitmap_fd = -1;
+ ident.bitmap_file = NULL;
+ ident.name[0] = 0;
+ ident.container = NULL;
+ ident.member = NULL;
+
+ if (get_linux_version() < 2006015) {
+ pr_err("This version of mdadm does not support kernels older than 2.6.15\n");
+ exit(1);
+ }
+
+ while ((option_index = -1),
+ (opt = getopt_long(argc, argv, shortopt, long_options,
+ &option_index)) != -1) {
+ int newmode = mode;
+ /* firstly, some mode-independent options */
+ switch(opt) {
+ case HelpOptions:
+ print_help = 2;
+ continue;
+ case 'h':
+ print_help = 1;
+ continue;
+
+ case 'V':
+ fputs(Version, stderr);
+ exit(0);
+
+ case 'v': c.verbose++;
+ continue;
+
+ case 'q': c.verbose--;
+ continue;
+
+ case 'b':
+ if (mode == ASSEMBLE || mode == BUILD ||
+ mode == CREATE || mode == GROW ||
+ mode == INCREMENTAL || mode == MANAGE)
+ break; /* b means bitmap */
+ case Brief:
+ c.brief = 1;
+ continue;
+
+ case NoDevices:
+ c.no_devices = 1;
+ continue;
+
+ case 'Y': c.export++;
+ continue;
+
+ case HomeHost:
+ if (strcasecmp(optarg, "<ignore>") == 0)
+ c.require_homehost = 0;
+ else
+ c.homehost = optarg;
+ continue;
+
+ case OffRootOpt:
+ /* Silently ignore old option */
+ continue;
+
+ case Prefer:
+ if (c.prefer)
+ free(c.prefer);
+ if (asprintf(&c.prefer, "/%s/", optarg) <= 0)
+ c.prefer = NULL;
+ continue;
+
+ case ':':
+ case '?':
+ fputs(Usage, stderr);
+ exit(2);
+ }
+ /* second, figure out the mode.
+ * Some options force the mode. Others
+ * set the mode if it isn't already
+ */
+
+ switch(opt) {
+ case ManageOpt:
+ newmode = MANAGE;
+ shortopt = short_bitmap_options;
+ break;
+ case 'a':
+ case Add:
+ case AddSpare:
+ case AddJournal:
+ case 'r':
+ case Remove:
+ case Replace:
+ case With:
+ case 'f':
+ case Fail:
+ case ReAdd: /* re-add */
+ case ClusterConfirm:
+ if (!mode) {
+ newmode = MANAGE;
+ shortopt = short_bitmap_options;
+ }
+ break;
+
+ case 'A': newmode = ASSEMBLE;
+ shortopt = short_bitmap_auto_options;
+ break;
+ case 'B': newmode = BUILD;
+ shortopt = short_bitmap_auto_options;
+ break;
+ case 'C': newmode = CREATE;
+ shortopt = short_bitmap_auto_options;
+ break;
+ case 'F': newmode = MONITOR;
+ break;
+ case 'G': newmode = GROW;
+ shortopt = short_bitmap_options;
+ break;
+ case 'I': newmode = INCREMENTAL;
+ shortopt = short_bitmap_auto_options;
+ break;
+ case AutoDetect:
+ newmode = AUTODETECT;
+ break;
+
+ case MiscOpt:
+ case 'D':
+ case 'E':
+ case 'X':
+ case 'Q':
+ case ExamineBB:
+ case Dump:
+ case Restore:
+ case Action:
+ newmode = MISC;
+ break;
+
+ case 'R':
+ case 'S':
+ case 'o':
+ case 'w':
+ case 'W':
+ case WaitOpt:
+ case Waitclean:
+ case DetailPlatform:
+ case KillSubarray:
+ case UpdateSubarray:
+ case UdevRules:
+ case KillOpt:
+ if (!mode)
+ newmode = MISC;
+ break;
+
+ case NoSharing:
+ newmode = MONITOR;
+ break;
+ }
+ if (mode && newmode == mode) {
+ /* everybody happy ! */
+ } else if (mode && newmode != mode) {
+ /* not allowed.. */
+ pr_err("");
+ if (option_index >= 0)
+ fprintf(stderr, "--%s", long_options[option_index].name);
+ else
+ fprintf(stderr, "-%c", opt);
+ fprintf(stderr, " would set mdadm mode to \"%s\", but it is already set to \"%s\".\n",
+ map_num(modes, newmode),
+ map_num(modes, mode));
+ exit(2);
+ } else if (!mode && newmode) {
+ mode = newmode;
+ if (mode == MISC && devs_found) {
+ pr_err("No action given for %s in --misc mode\n",
+ devlist->devname);
+ cont_err("Action options must come before device names\n");
+ exit(2);
+ }
+ } else {
+ /* special case of -c --help */
+ if ((opt == 'c' || opt == ConfigFile) &&
+ (strncmp(optarg, "--h", 3) == 0 ||
+ strncmp(optarg, "-h", 2) == 0)) {
+ fputs(Help_config, stdout);
+ exit(0);
+ }
+
+ /* If first option is a device, don't force the mode yet */
+ if (opt == 1) {
+ if (devs_found == 0) {
+ dv = xmalloc(sizeof(*dv));
+ dv->devname = optarg;
+ dv->disposition = devmode;
+ dv->writemostly = writemostly;
+ dv->failfast = failfast;
+ dv->used = 0;
+ dv->next = NULL;
+ *devlistend = dv;
+ devlistend = &dv->next;
+
+ devs_found++;
+ continue;
+ }
+ /* No mode yet, and this is the second device ... */
+ pr_err("An option must be given to set the mode before a second device\n"
+ " (%s) is listed\n", optarg);
+ exit(2);
+ }
+ if (option_index >= 0)
+ pr_err("--%s", long_options[option_index].name);
+ else
+ pr_err("-%c", opt);
+ fprintf(stderr, " does not set the mode, and so cannot be the first option.\n");
+ exit(2);
+ }
+
+ /* if we just set the mode, then done */
+ switch(opt) {
+ case ManageOpt:
+ case MiscOpt:
+ case 'A':
+ case 'B':
+ case 'C':
+ case 'F':
+ case 'G':
+ case 'I':
+ case AutoDetect:
+ continue;
+ }
+ if (opt == 1) {
+ /* an undecorated option - must be a device name.
+ */
+
+ if (devs_found > 0 && devmode == DetailPlatform) {
+ pr_err("controller may only be specified once. %s ignored\n",
+ optarg);
+ continue;
+ }
+
+ if (devs_found > 0 && mode == MANAGE && !devmode) {
+ pr_err("Must give one of -a/-r/-f for subsequent devices at %s\n", optarg);
+ exit(2);
+ }
+ if (devs_found > 0 && mode == GROW && !devmode) {
+ pr_err("Must give -a/--add for devices to add: %s\n", optarg);
+ exit(2);
+ }
+ dv = xmalloc(sizeof(*dv));
+ dv->devname = optarg;
+ dv->disposition = devmode;
+ dv->writemostly = writemostly;
+ dv->failfast = failfast;
+ dv->used = 0;
+ dv->next = NULL;
+ *devlistend = dv;
+ devlistend = &dv->next;
+
+ devs_found++;
+ continue;
+ }
+
+ /* We've got a mode, and opt is now something else which
+ * could depend on the mode */
+#define O(a,b) ((a<<16)|b)
+ switch (O(mode,opt)) {
+ case O(GROW,'c'):
+ case O(GROW,ChunkSize):
+ case O(CREATE,'c'):
+ case O(CREATE,ChunkSize):
+ case O(BUILD,'c'): /* chunk or rounding */
+ case O(BUILD,ChunkSize): /* chunk or rounding */
+ if (s.chunk) {
+ pr_err("chunk/rounding may only be specified once. Second value is %s.\n", optarg);
+ exit(2);
+ }
+ s.chunk = parse_size(optarg);
+ if (s.chunk == INVALID_SECTORS ||
+ s.chunk < 8 || (s.chunk&1)) {
+ pr_err("invalid chunk/rounding value: %s\n",
+ optarg);
+ exit(2);
+ }
+ /* Convert sectors to K */
+ s.chunk /= 2;
+ continue;
+
+ case O(INCREMENTAL, 'e'):
+ case O(CREATE,'e'):
+ case O(ASSEMBLE,'e'):
+ case O(MISC,'e'): /* set metadata (superblock) information */
+ if (ss) {
+ pr_err("metadata information already given\n");
+ exit(2);
+ }
+ for(i = 0; !ss && superlist[i]; i++)
+ ss = superlist[i]->match_metadata_desc(optarg);
+
+ if (!ss) {
+ pr_err("unrecognised metadata identifier: %s\n", optarg);
+ exit(2);
+ }
+ continue;
+
+ case O(MANAGE,'W'):
+ case O(MANAGE,WriteMostly):
+ case O(BUILD,'W'):
+ case O(BUILD,WriteMostly):
+ case O(CREATE,'W'):
+ case O(CREATE,WriteMostly):
+ /* set write-mostly for following devices */
+ writemostly = FlagSet;
+ continue;
+
+ case O(MANAGE,'w'):
+ /* clear write-mostly for following devices */
+ writemostly = FlagClear;
+ continue;
+
+ case O(MANAGE,FailFast):
+ case O(CREATE,FailFast):
+ failfast = FlagSet;
+ continue;
+ case O(MANAGE,NoFailFast):
+ failfast = FlagClear;
+ continue;
+
+ case O(GROW,'z'):
+ case O(CREATE,'z'):
+ case O(BUILD,'z'): /* size */
+ if (s.size > 0) {
+ pr_err("size may only be specified once. Second value is %s.\n", optarg);
+ exit(2);
+ }
+ if (strcmp(optarg, "max") == 0)
+ s.size = MAX_SIZE;
+ else {
+ s.size = parse_size(optarg);
+ if (s.size == INVALID_SECTORS || s.size < 8) {
+ pr_err("invalid size: %s\n", optarg);
+ exit(2);
+ }
+ /* convert sectors to K */
+ s.size /= 2;
+ }
+ continue;
+
+ case O(GROW,'Z'): /* array size */
+ if (array_size > 0) {
+ pr_err("array-size may only be specified once. Second value is %s.\n", optarg);
+ exit(2);
+ }
+ if (strcmp(optarg, "max") == 0)
+ array_size = MAX_SIZE;
+ else {
+ array_size = parse_size(optarg);
+ if (array_size == 0 ||
+ array_size == INVALID_SECTORS) {
+ pr_err("invalid array size: %s\n",
+ optarg);
+ exit(2);
+ }
+ }
+ continue;
+
+ case O(CREATE,DataOffset):
+ case O(GROW,DataOffset):
+ if (data_offset != INVALID_SECTORS) {
+ pr_err("data-offset may only be specified one. Second value is %s.\n", optarg);
+ exit(2);
+ }
+ if (mode == CREATE && strcmp(optarg, "variable") == 0)
+ data_offset = VARIABLE_OFFSET;
+ else
+ data_offset = parse_size(optarg);
+ if (data_offset == INVALID_SECTORS) {
+ pr_err("invalid data-offset: %s\n",
+ optarg);
+ exit(2);
+ }
+ continue;
+
+ case O(GROW,'l'):
+ case O(CREATE,'l'):
+ case O(BUILD,'l'): /* set raid level*/
+ if (s.level != UnSet) {
+ pr_err("raid level may only be set once. Second value is %s.\n", optarg);
+ exit(2);
+ }
+ s.level = map_name(pers, optarg);
+ if (s.level == UnSet) {
+ pr_err("invalid raid level: %s\n",
+ optarg);
+ exit(2);
+ }
+ if (s.level != 0 && s.level != LEVEL_LINEAR &&
+ s.level != 1 && s.level != LEVEL_MULTIPATH &&
+ s.level != LEVEL_FAULTY && s.level != 10 &&
+ mode == BUILD) {
+ pr_err("Raid level %s not permitted with --build.\n",
+ optarg);
+ exit(2);
+ }
+ if (s.sparedisks > 0 && s.level < 1 && s.level >= -1) {
+ pr_err("raid level %s is incompatible with spare-devices setting.\n",
+ optarg);
+ exit(2);
+ }
+ ident.level = s.level;
+ continue;
+
+ case O(GROW, 'p'): /* new layout */
+ case O(GROW, Layout):
+ if (s.layout_str) {
+ pr_err("layout may only be sent once. Second value was %s\n", optarg);
+ exit(2);
+ }
+ s.layout_str = optarg;
+ /* 'Grow' will parse the value */
+ continue;
+
+ case O(CREATE,'p'): /* raid5 layout */
+ case O(CREATE,Layout):
+ case O(BUILD,'p'): /* faulty layout */
+ case O(BUILD,Layout):
+ if (s.layout != UnSet) {
+ pr_err("layout may only be sent once. Second value was %s\n", optarg);
+ exit(2);
+ }
+ switch(s.level) {
+ default:
+ pr_err("layout not meaningful for %s arrays.\n",
+ map_num(pers, s.level));
+ exit(2);
+ case UnSet:
+ pr_err("raid level must be given before layout.\n");
+ exit(2);
+
+ case 0:
+ s.layout = map_name(r0layout, optarg);
+ if (s.layout == UnSet) {
+ pr_err("layout %s not understood for raid0.\n",
+ optarg);
+ exit(2);
+ }
+ break;
+ case 5:
+ s.layout = map_name(r5layout, optarg);
+ if (s.layout == UnSet) {
+ pr_err("layout %s not understood for raid5.\n",
+ optarg);
+ exit(2);
+ }
+ break;
+ case 6:
+ s.layout = map_name(r6layout, optarg);
+ if (s.layout == UnSet) {
+ pr_err("layout %s not understood for raid6.\n",
+ optarg);
+ exit(2);
+ }
+ break;
+
+ case 10:
+ s.layout = parse_layout_10(optarg);
+ if (s.layout < 0) {
+ pr_err("layout for raid10 must be 'nNN', 'oNN' or 'fNN' where NN is a number, not %s\n", optarg);
+ exit(2);
+ }
+ break;
+ case LEVEL_FAULTY:
+ /* Faulty
+ * modeNNN
+ */
+ s.layout = parse_layout_faulty(optarg);
+ if (s.layout == -1) {
+ pr_err("layout %s not understood for faulty.\n",
+ optarg);
+ exit(2);
+ }
+ break;
+ }
+ continue;
+
+ case O(CREATE,AssumeClean):
+ case O(BUILD,AssumeClean): /* assume clean */
+ case O(GROW,AssumeClean):
+ s.assume_clean = 1;
+ continue;
+
+ case O(GROW,'n'):
+ case O(CREATE,'n'):
+ case O(BUILD,'n'): /* number of raid disks */
+ if (s.raiddisks) {
+ pr_err("raid-devices set twice: %d and %s\n",
+ s.raiddisks, optarg);
+ exit(2);
+ }
+ if (parse_num(&s.raiddisks, optarg) != 0 || s.raiddisks <= 0) {
+ pr_err("invalid number of raid devices: %s\n",
+ optarg);
+ exit(2);
+ }
+ ident.raid_disks = s.raiddisks;
+ continue;
+ case O(ASSEMBLE, Nodes):
+ case O(GROW, Nodes):
+ case O(CREATE, Nodes):
+ if (parse_num(&c.nodes, optarg) != 0 || c.nodes < 2) {
+ pr_err("clustered array needs two nodes at least: %s\n",
+ optarg);
+ exit(2);
+ }
+ continue;
+ case O(CREATE, ClusterName):
+ case O(ASSEMBLE, ClusterName):
+ c.homecluster = optarg;
+ if (strlen(c.homecluster) > 64) {
+ pr_err("Cluster name too big.\n");
+ exit(2);
+ }
+ continue;
+ case O(CREATE,'x'): /* number of spare (eXtra) disks */
+ if (s.sparedisks) {
+ pr_err("spare-devices set twice: %d and %s\n",
+ s.sparedisks, optarg);
+ exit(2);
+ }
+ if (s.level != UnSet && s.level <= 0 && s.level >= -1) {
+ pr_err("spare-devices setting is incompatible with raid level %d\n",
+ s.level);
+ exit(2);
+ }
+ if (parse_num(&s.sparedisks, optarg) != 0 || s.sparedisks < 0) {
+ pr_err("invalid number of spare-devices: %s\n",
+ optarg);
+ exit(2);
+ }
+ continue;
+
+ case O(CREATE,'a'):
+ case O(CREATE,Auto):
+ case O(BUILD,'a'):
+ case O(BUILD,Auto):
+ case O(INCREMENTAL,'a'):
+ case O(INCREMENTAL,Auto):
+ case O(ASSEMBLE,'a'):
+ case O(ASSEMBLE,Auto): /* auto-creation of device node */
+ c.autof = parse_auto(optarg, "--auto flag", 0);
+ continue;
+
+ case O(CREATE,Symlinks):
+ case O(BUILD,Symlinks):
+ case O(ASSEMBLE,Symlinks): /* auto creation of symlinks in /dev to /dev/md */
+ symlinks = optarg;
+ continue;
+
+ case O(BUILD,'f'): /* force honouring '-n 1' */
+ case O(BUILD,Force): /* force honouring '-n 1' */
+ case O(GROW,'f'): /* ditto */
+ case O(GROW,Force): /* ditto */
+ case O(CREATE,'f'): /* force honouring of device list */
+ case O(CREATE,Force): /* force honouring of device list */
+ case O(ASSEMBLE,'f'): /* force assembly */
+ case O(ASSEMBLE,Force): /* force assembly */
+ case O(MISC,'f'): /* force zero */
+ case O(MISC,Force): /* force zero */
+ case O(MANAGE,Force): /* add device which is too large */
+ c.force = 1;
+ continue;
+ /* now for the Assemble options */
+ case O(ASSEMBLE, FreezeReshape): /* Freeze reshape during
+ * initrd phase */
+ case O(INCREMENTAL, FreezeReshape):
+ c.freeze_reshape = 1;
+ continue;
+ case O(CREATE,'u'): /* uuid of array */
+ case O(ASSEMBLE,'u'): /* uuid of array */
+ if (ident.uuid_set) {
+ pr_err("uuid cannot be set twice. Second value %s.\n", optarg);
+ exit(2);
+ }
+ if (parse_uuid(optarg, ident.uuid))
+ ident.uuid_set = 1;
+ else {
+ pr_err("Bad uuid: %s\n", optarg);
+ exit(2);
+ }
+ continue;
+
+ case O(CREATE,'N'):
+ case O(ASSEMBLE,'N'):
+ case O(MISC,'N'):
+ if (ident.name[0]) {
+ pr_err("name cannot be set twice. Second value %s.\n", optarg);
+ exit(2);
+ }
+ if (mode == MISC && !c.subarray) {
+ pr_err("-N/--name only valid with --update-subarray in misc mode\n");
+ exit(2);
+ }
+ if (strlen(optarg) > 32) {
+ pr_err("name '%s' is too long, 32 chars max.\n",
+ optarg);
+ exit(2);
+ }
+ strcpy(ident.name, optarg);
+ continue;
+
+ case O(ASSEMBLE,'m'): /* super-minor for array */
+ case O(ASSEMBLE,SuperMinor):
+ if (ident.super_minor != UnSet) {
+ pr_err("super-minor cannot be set twice. Second value: %s.\n", optarg);
+ exit(2);
+ }
+ if (strcmp(optarg, "dev") == 0)
+ ident.super_minor = -2;
+ else if (parse_num(&ident.super_minor, optarg) != 0 || ident.super_minor < 0) {
+ pr_err("Bad super-minor number: %s.\n", optarg);
+ exit(2);
+ }
+ continue;
+
+ case O(ASSEMBLE,'o'):
+ case O(MANAGE,'o'):
+ case O(CREATE,'o'):
+ c.readonly = 1;
+ continue;
+
+ case O(ASSEMBLE,'U'): /* update the superblock */
+ case O(MISC,'U'):
+ if (c.update) {
+ pr_err("Can only update one aspect of superblock, both %s and %s given.\n",
+ c.update, optarg);
+ exit(2);
+ }
+ if (mode == MISC && !c.subarray) {
+ pr_err("Only subarrays can be updated in misc mode\n");
+ exit(2);
+ }
+ c.update = optarg;
+ if (strcmp(c.update, "sparc2.2") == 0)
+ continue;
+ if (strcmp(c.update, "super-minor") == 0)
+ continue;
+ if (strcmp(c.update, "summaries") == 0)
+ continue;
+ if (strcmp(c.update, "resync") == 0)
+ continue;
+ if (strcmp(c.update, "uuid") == 0)
+ continue;
+ if (strcmp(c.update, "name") == 0)
+ continue;
+ if (strcmp(c.update, "homehost") == 0)
+ continue;
+ if (strcmp(c.update, "home-cluster") == 0)
+ continue;
+ if (strcmp(c.update, "nodes") == 0)
+ continue;
+ if (strcmp(c.update, "devicesize") == 0)
+ continue;
+ if (strcmp(c.update, "bitmap") == 0)
+ continue;
+ if (strcmp(c.update, "no-bitmap") == 0)
+ continue;
+ if (strcmp(c.update, "bbl") == 0)
+ continue;
+ if (strcmp(c.update, "no-bbl") == 0)
+ continue;
+ if (strcmp(c.update, "force-no-bbl") == 0)
+ continue;
+ if (strcmp(c.update, "ppl") == 0)
+ continue;
+ if (strcmp(c.update, "no-ppl") == 0)
+ continue;
+ if (strcmp(c.update, "metadata") == 0)
+ continue;
+ if (strcmp(c.update, "revert-reshape") == 0)
+ continue;
+ if (strcmp(c.update, "layout-original") == 0 ||
+ strcmp(c.update, "layout-alternate") == 0 ||
+ strcmp(c.update, "layout-unspecified") == 0)
+ continue;
+ if (strcmp(c.update, "byteorder") == 0) {
+ if (ss) {
+ pr_err("must not set metadata type with --update=byteorder.\n");
+ exit(2);
+ }
+ for(i = 0; !ss && superlist[i]; i++)
+ ss = superlist[i]->match_metadata_desc(
+ "0.swap");
+ if (!ss) {
+ pr_err("INTERNAL ERROR cannot find 0.swap\n");
+ exit(2);
+ }
+
+ continue;
+ }
+ if (strcmp(c.update,"?") == 0 ||
+ strcmp(c.update, "help") == 0) {
+ outf = stdout;
+ fprintf(outf, "%s: ", Name);
+ } else {
+ outf = stderr;
+ fprintf(outf,
+ "%s: '--update=%s' is invalid. ",
+ Name, c.update);
+ }
+ fprintf(outf, "Valid --update options are:\n"
+ " 'sparc2.2', 'super-minor', 'uuid', 'name', 'nodes', 'resync',\n"
+ " 'summaries', 'homehost', 'home-cluster', 'byteorder', 'devicesize',\n"
+ " 'bitmap', 'no-bitmap', 'metadata', 'revert-reshape'\n"
+ " 'bbl', 'no-bbl', 'force-no-bbl', 'ppl', 'no-ppl'\n"
+ " 'layout-original', 'layout-alternate', 'layout-unspecified'\n"
+ );
+ exit(outf == stdout ? 0 : 2);
+
+ case O(MANAGE,'U'):
+ /* update=devicesize is allowed with --re-add */
+ if (devmode != 'A') {
+ pr_err("--update in Manage mode only allowed with --re-add.\n");
+ exit(1);
+ }
+ if (c.update) {
+ pr_err("Can only update one aspect of superblock, both %s and %s given.\n",
+ c.update, optarg);
+ exit(2);
+ }
+ c.update = optarg;
+ if (strcmp(c.update, "devicesize") != 0 &&
+ strcmp(c.update, "bbl") != 0 &&
+ strcmp(c.update, "force-no-bbl") != 0 &&
+ strcmp(c.update, "no-bbl") != 0) {
+ pr_err("only 'devicesize', 'bbl', 'no-bbl', and 'force-no-bbl' can be updated with --re-add\n");
+ exit(2);
+ }
+ continue;
+
+ case O(INCREMENTAL,NoDegraded):
+ pr_err("--no-degraded is deprecated in Incremental mode\n");
+ case O(ASSEMBLE,NoDegraded): /* --no-degraded */
+ c.runstop = -1; /* --stop isn't allowed for --assemble,
+ * so we overload slightly */
+ continue;
+
+ case O(ASSEMBLE,'c'):
+ case O(ASSEMBLE,ConfigFile):
+ case O(INCREMENTAL, 'c'):
+ case O(INCREMENTAL, ConfigFile):
+ case O(MISC, 'c'):
+ case O(MISC, ConfigFile):
+ case O(MONITOR,'c'):
+ case O(MONITOR,ConfigFile):
+ case O(CREATE,ConfigFile):
+ if (configfile) {
+ pr_err("configfile cannot be set twice. Second value is %s.\n", optarg);
+ exit(2);
+ }
+ configfile = optarg;
+ set_conffile(configfile);
+ /* FIXME possibly check that config file exists. Even parse it */
+ continue;
+ case O(ASSEMBLE,'s'): /* scan */
+ case O(MISC,'s'):
+ case O(MONITOR,'s'):
+ case O(INCREMENTAL,'s'):
+ c.scan = 1;
+ continue;
+
+ case O(MONITOR,'m'): /* mail address */
+ case O(MONITOR,EMail):
+ if (mailaddr)
+ pr_err("only specify one mailaddress. %s ignored.\n",
+ optarg);
+ else
+ mailaddr = optarg;
+ continue;
+
+ case O(MONITOR,'p'): /* alert program */
+ case O(MONITOR,ProgramOpt): /* alert program */
+ if (program)
+ pr_err("only specify one alter program. %s ignored.\n",
+ optarg);
+ else
+ program = optarg;
+ continue;
+
+ case O(MONITOR,'r'): /* rebuild increments */
+ case O(MONITOR,Increment):
+ if (parse_num(&increments, optarg) != 0
+ || increments > 99 || increments < 1) {
+ pr_err("please specify positive integer between 1 and 99 as rebuild increments.\n");
+ exit(2);
+ }
+ continue;
+
+ case O(MONITOR,'d'): /* delay in seconds */
+ case O(GROW, 'd'):
+ case O(BUILD,'d'): /* delay for bitmap updates */
+ case O(CREATE,'d'):
+ if (c.delay)
+ pr_err("only specify delay once. %s ignored.\n", optarg);
+ else if (parse_num(&c.delay, optarg) != 0 || c.delay < 1) {
+ pr_err("invalid delay: %s\n", optarg);
+ exit(2);
+ }
+ continue;
+ case O(MONITOR,'f'): /* daemonise */
+ case O(MONITOR,Fork):
+ daemonise = 1;
+ continue;
+ case O(MONITOR,'i'): /* pid */
+ if (pidfile)
+ pr_err("only specify one pid file. %s ignored.\n",
+ optarg);
+ else
+ pidfile = optarg;
+ continue;
+ case O(MONITOR,'1'): /* oneshot */
+ oneshot = 1;
+ spare_sharing = 0;
+ continue;
+ case O(MONITOR,'t'): /* test */
+ c.test = 1;
+ continue;
+ case O(MONITOR,'y'): /* log messages to syslog */
+ openlog("mdadm", LOG_PID, SYSLOG_FACILITY);
+ dosyslog = 1;
+ continue;
+ case O(MONITOR, NoSharing):
+ spare_sharing = 0;
+ continue;
+
+ /* now the general management options. Some are applicable
+ * to other modes. None have arguments.
+ */
+ case O(GROW,'a'):
+ case O(GROW,Add):
+ case O(MANAGE,'a'):
+ case O(MANAGE,Add): /* add a drive */
+ devmode = 'a';
+ continue;
+ case O(MANAGE,AddSpare): /* add drive - never re-add */
+ devmode = 'S';
+ continue;
+ case O(MANAGE,AddJournal): /* add journal */
+ if (s.journaldisks && (s.level < 4 || s.level > 6)) {
+ pr_err("--add-journal is only supported for RAID level 4/5/6.\n");
+ exit(2);
+ }
+ devmode = 'j';
+ continue;
+ case O(MANAGE,ReAdd):
+ devmode = 'A';
+ continue;
+ case O(MANAGE,'r'): /* remove a drive */
+ case O(MANAGE,Remove):
+ devmode = 'r';
+ continue;
+ case O(MANAGE,'f'): /* set faulty */
+ case O(MANAGE,Fail):
+ case O(INCREMENTAL,'f'):
+ case O(INCREMENTAL,Remove):
+ case O(INCREMENTAL,Fail): /* r for incremental is taken, use f
+ * even though we will both fail and
+ * remove the device */
+ devmode = 'f';
+ continue;
+ case O(MANAGE, ClusterConfirm):
+ devmode = 'c';
+ continue;
+ case O(MANAGE,Replace):
+ /* Mark these devices for replacement */
+ devmode = 'R';
+ continue;
+ case O(MANAGE,With):
+ /* These are the replacements to use */
+ if (devmode != 'R') {
+ pr_err("--with must follow --replace\n");
+ exit(2);
+ }
+ devmode = 'W';
+ continue;
+ case O(INCREMENTAL,'R'):
+ case O(MANAGE,'R'):
+ case O(ASSEMBLE,'R'):
+ case O(BUILD,'R'):
+ case O(CREATE,'R'): /* Run the array */
+ if (c.runstop < 0) {
+ pr_err("Cannot both Stop and Run an array\n");
+ exit(2);
+ }
+ c.runstop = 1;
+ continue;
+ case O(MANAGE,'S'):
+ if (c.runstop > 0) {
+ pr_err("Cannot both Run and Stop an array\n");
+ exit(2);
+ }
+ c.runstop = -1;
+ continue;
+ case O(MANAGE,'t'):
+ c.test = 1;
+ continue;
+
+ case O(MISC,'Q'):
+ case O(MISC,'D'):
+ case O(MISC,'E'):
+ case O(MISC,KillOpt):
+ case O(MISC,'R'):
+ case O(MISC,'S'):
+ case O(MISC,'X'):
+ case O(MISC, ExamineBB):
+ case O(MISC,'o'):
+ case O(MISC,'w'):
+ case O(MISC,'W'):
+ case O(MISC, WaitOpt):
+ case O(MISC, Waitclean):
+ case O(MISC, DetailPlatform):
+ case O(MISC, KillSubarray):
+ case O(MISC, UpdateSubarray):
+ case O(MISC, Dump):
+ case O(MISC, Restore):
+ case O(MISC ,Action):
+ if (opt == KillSubarray || opt == UpdateSubarray) {
+ if (c.subarray) {
+ pr_err("subarray can only be specified once\n");
+ exit(2);
+ }
+ c.subarray = optarg;
+ }
+ if (opt == Action) {
+ if (c.action) {
+ pr_err("Only one --action can be specified\n");
+ exit(2);
+ }
+ if (strcmp(optarg, "idle") == 0 ||
+ strcmp(optarg, "frozen") == 0 ||
+ strcmp(optarg, "check") == 0 ||
+ strcmp(optarg, "repair") == 0)
+ c.action = optarg;
+ else {
+ pr_err("action must be one of idle, frozen, check, repair\n");
+ exit(2);
+ }
+ }
+ if (devmode && devmode != opt &&
+ (devmode == 'E' ||
+ (opt == 'E' && devmode != 'Q'))) {
+ pr_err("--examine/-E cannot be given with ");
+ if (devmode == 'E') {
+ if (option_index >= 0)
+ fprintf(stderr, "--%s\n",
+ long_options[option_index].name);
+ else
+ fprintf(stderr, "-%c\n", opt);
+ } else if (isalpha(devmode))
+ fprintf(stderr, "-%c\n", devmode);
+ else
+ fprintf(stderr, "previous option\n");
+ exit(2);
+ }
+ devmode = opt;
+ if (opt == Dump || opt == Restore) {
+ if (dump_directory != NULL) {
+ pr_err("dump/restore directory specified twice: %s and %s\n",
+ dump_directory, optarg);
+ exit(2);
+ }
+ dump_directory = optarg;
+ }
+ continue;
+ case O(MISC, UdevRules):
+ if (devmode && devmode != opt) {
+ pr_err("--udev-rules must be the only option.\n");
+ } else {
+ if (udev_filename)
+ pr_err("only specify one udev rule filename. %s ignored.\n",
+ optarg);
+ else
+ udev_filename = optarg;
+ }
+ devmode = opt;
+ continue;
+ case O(MISC,'t'):
+ c.test = 1;
+ continue;
+
+ case O(MISC, Sparc22):
+ if (devmode != 'E') {
+ pr_err("--sparc2.2 only allowed with --examine\n");
+ exit(2);
+ }
+ c.SparcAdjust = 1;
+ continue;
+
+ case O(ASSEMBLE,'b'): /* here we simply set the bitmap file */
+ case O(ASSEMBLE,Bitmap):
+ if (!optarg) {
+ pr_err("bitmap file needed with -b in --assemble mode\n");
+ exit(2);
+ }
+ if (strcmp(optarg, "internal") == 0 ||
+ strcmp(optarg, "clustered") == 0) {
+ pr_err("no need to specify --bitmap when assembling"
+ " arrays with internal or clustered bitmap\n");
+ continue;
+ }
+ bitmap_fd = open(optarg, O_RDWR);
+ if (!*optarg || bitmap_fd < 0) {
+ pr_err("cannot open bitmap file %s: %s\n", optarg, strerror(errno));
+ exit(2);
+ }
+ ident.bitmap_fd = bitmap_fd; /* for Assemble */
+ continue;
+
+ case O(ASSEMBLE, BackupFile):
+ case O(GROW, BackupFile):
+ /* Specify a file into which grow might place a backup,
+ * or from which assemble might recover a backup
+ */
+ if (c.backup_file) {
+ pr_err("backup file already specified, rejecting %s\n", optarg);
+ exit(2);
+ }
+ c.backup_file = optarg;
+ continue;
+
+ case O(GROW, Continue):
+ /* Continue interrupted grow
+ */
+ grow_continue = 1;
+ continue;
+ case O(ASSEMBLE, InvalidBackup):
+ /* Acknowledge that the backupfile is invalid, but ask
+ * to continue anyway
+ */
+ c.invalid_backup = 1;
+ continue;
+
+ case O(BUILD,'b'):
+ case O(BUILD,Bitmap):
+ case O(CREATE,'b'):
+ case O(CREATE,Bitmap): /* here we create the bitmap */
+ case O(GROW,'b'):
+ case O(GROW,Bitmap):
+ if (s.bitmap_file) {
+ pr_err("bitmap cannot be set twice. Second value: %s.\n", optarg);
+ exit(2);
+ }
+ if (strcmp(optarg, "internal") == 0 ||
+ strcmp(optarg, "none") == 0 ||
+ strchr(optarg, '/') != NULL) {
+ s.bitmap_file = optarg;
+ continue;
+ }
+ if (strcmp(optarg, "clustered") == 0) {
+ s.bitmap_file = optarg;
+ /* Set the default number of cluster nodes
+ * to 4 if not already set by user
+ */
+ if (c.nodes < 1)
+ c.nodes = 4;
+ continue;
+ }
+ /* probable typo */
+ pr_err("bitmap file must contain a '/', or be 'internal', or be 'clustered', or 'none'\n"
+ " not '%s'\n", optarg);
+ exit(2);
+
+ case O(GROW,BitmapChunk):
+ case O(BUILD,BitmapChunk):
+ case O(CREATE,BitmapChunk): /* bitmap chunksize */
+ s.bitmap_chunk = parse_size(optarg);
+ if (s.bitmap_chunk == 0 ||
+ s.bitmap_chunk == INVALID_SECTORS ||
+ s.bitmap_chunk & (s.bitmap_chunk - 1)) {
+ pr_err("invalid bitmap chunksize: %s\n",
+ optarg);
+ exit(2);
+ }
+ s.bitmap_chunk = s.bitmap_chunk * 512;
+ continue;
+
+ case O(GROW, WriteBehind):
+ case O(BUILD, WriteBehind):
+ case O(CREATE, WriteBehind):
+ s.write_behind = DEFAULT_MAX_WRITE_BEHIND;
+ if (parse_num(&s.write_behind, optarg) != 0 ||
+ s.write_behind < 0 || s.write_behind > 16383) {
+ pr_err("Invalid value for maximum outstanding write-behind writes: %s.\n\tMust be between 0 and 16383.\n",
+ optarg);
+ exit(2);
+ }
+ continue;
+ case O(INCREMENTAL, 'r'):
+ case O(INCREMENTAL, RebuildMapOpt):
+ rebuild_map = 1;
+ continue;
+ case O(INCREMENTAL, IncrementalPath):
+ remove_path = optarg;
+ continue;
+ case O(CREATE, WriteJournal):
+ if (s.journaldisks) {
+ pr_err("Please specify only one journal device for the array.\n");
+ pr_err("Ignoring --write-journal %s...\n", optarg);
+ continue;
+ }
+ dv = xmalloc(sizeof(*dv));
+ dv->devname = optarg;
+ dv->disposition = 'j'; /* WriteJournal */
+ dv->used = 0;
+ dv->next = NULL;
+ *devlistend = dv;
+ devlistend = &dv->next;
+ devs_found++;
+
+ s.journaldisks = 1;
+ continue;
+ case O(CREATE, 'k'):
+ case O(GROW, 'k'):
+ s.consistency_policy = map_name(consistency_policies,
+ optarg);
+ if (s.consistency_policy < CONSISTENCY_POLICY_RESYNC) {
+ pr_err("Invalid consistency policy: %s\n",
+ optarg);
+ exit(2);
+ }
+ continue;
+ }
+ /* We have now processed all the valid options. Anything else is
+ * an error
+ */
+ if (option_index > 0)
+ pr_err(":option --%s not valid in %s mode\n",
+ long_options[option_index].name,
+ map_num(modes, mode));
+ else
+ pr_err("option -%c not valid in %s mode\n",
+ opt, map_num(modes, mode));
+ exit(2);
+
+ }
+
+ if (print_help) {
+ char *help_text;
+ if (print_help == 2)
+ help_text = OptionHelp;
+ else
+ help_text = mode_help[mode];
+ if (help_text == NULL)
+ help_text = Help;
+ fputs(help_text,stdout);
+ exit(0);
+ }
+
+ if (s.journaldisks) {
+ if (s.level < 4 || s.level > 6) {
+ pr_err("--write-journal is only supported for RAID level 4/5/6.\n");
+ exit(2);
+ }
+ if (s.consistency_policy != CONSISTENCY_POLICY_UNKNOWN &&
+ s.consistency_policy != CONSISTENCY_POLICY_JOURNAL) {
+ pr_err("--write-journal is not supported with consistency policy: %s\n",
+ map_num(consistency_policies, s.consistency_policy));
+ exit(2);
+ }
+ }
+
+ if (mode == CREATE &&
+ s.consistency_policy != CONSISTENCY_POLICY_UNKNOWN) {
+ if (s.level <= 0) {
+ pr_err("--consistency-policy not meaningful with level %s.\n",
+ map_num(pers, s.level));
+ exit(2);
+ } else if (s.consistency_policy == CONSISTENCY_POLICY_JOURNAL &&
+ !s.journaldisks) {
+ pr_err("--write-journal is required for consistency policy: %s\n",
+ map_num(consistency_policies, s.consistency_policy));
+ exit(2);
+ } else if (s.consistency_policy == CONSISTENCY_POLICY_PPL &&
+ s.level != 5) {
+ pr_err("PPL consistency policy is only supported for RAID level 5.\n");
+ exit(2);
+ } else if (s.consistency_policy == CONSISTENCY_POLICY_BITMAP &&
+ (!s.bitmap_file ||
+ strcmp(s.bitmap_file, "none") == 0)) {
+ pr_err("--bitmap is required for consistency policy: %s\n",
+ map_num(consistency_policies, s.consistency_policy));
+ exit(2);
+ } else if (s.bitmap_file &&
+ strcmp(s.bitmap_file, "none") != 0 &&
+ s.consistency_policy != CONSISTENCY_POLICY_BITMAP &&
+ s.consistency_policy != CONSISTENCY_POLICY_JOURNAL) {
+ pr_err("--bitmap is not compatible with consistency policy: %s\n",
+ map_num(consistency_policies, s.consistency_policy));
+ exit(2);
+ }
+ }
+
+ if (!mode && devs_found) {
+ mode = MISC;
+ devmode = 'Q';
+ if (devlist->disposition == 0)
+ devlist->disposition = devmode;
+ }
+ if (!mode) {
+ fputs(Usage, stderr);
+ exit(2);
+ }
+
+ if (symlinks) {
+ struct createinfo *ci = conf_get_create_info();
+
+ if (strcasecmp(symlinks, "yes") == 0)
+ ci->symlinks = 1;
+ else if (strcasecmp(symlinks, "no") == 0)
+ ci->symlinks = 0;
+ else {
+ pr_err("option --symlinks must be 'no' or 'yes'\n");
+ exit(2);
+ }
+ }
+ /* Ok, got the option parsing out of the way
+ * hopefully it's mostly right but there might be some stuff
+ * missing
+ *
+ * That is mostly checked in the per-mode stuff but...
+ *
+ * For @,B,C and A without -s, the first device listed must be
+ * an md device. We check that here and open it.
+ */
+
+ if (mode == MANAGE || mode == BUILD || mode == CREATE ||
+ mode == GROW || (mode == ASSEMBLE && ! c.scan)) {
+ if (devs_found < 1) {
+ pr_err("an md device must be given in this mode\n");
+ exit(2);
+ }
+ if ((int)ident.super_minor == -2 && c.autof) {
+ pr_err("--super-minor=dev is incompatible with --auto\n");
+ exit(2);
+ }
+ if (mode == MANAGE || mode == GROW) {
+ mdfd = open_mddev(devlist->devname, 1);
+ if (mdfd < 0)
+ exit(1);
+ } else {
+ char *bname = basename(devlist->devname);
+
+ if (strlen(bname) > MD_NAME_MAX) {
+ pr_err("Name %s is too long.\n", devlist->devname);
+ exit(1);
+ }
+ /* non-existent device is OK */
+ mdfd = open_mddev(devlist->devname, 0);
+ }
+ if (mdfd == -2) {
+ pr_err("device %s exists but is not an md array.\n", devlist->devname);
+ exit(1);
+ }
+ if ((int)ident.super_minor == -2) {
+ struct stat stb;
+ if (mdfd < 0) {
+ pr_err("--super-minor=dev given, and listed device %s doesn't exist.\n",
+ devlist->devname);
+ exit(1);
+ }
+ fstat(mdfd, &stb);
+ ident.super_minor = minor(stb.st_rdev);
+ }
+ if (mdfd >= 0 && mode != MANAGE && mode != GROW) {
+ /* We don't really want this open yet, we just might
+ * have wanted to check some things
+ */
+ close(mdfd);
+ mdfd = -1;
+ }
+ }
+
+ if (s.raiddisks) {
+ if (s.raiddisks == 1 && !c.force && s.level != LEVEL_FAULTY) {
+ pr_err("'1' is an unusual number of drives for an array, so it is probably\n"
+ " a mistake. If you really mean it you will need to specify --force before\n"
+ " setting the number of drives.\n");
+ exit(2);
+ }
+ }
+
+ if (c.homehost == NULL && c.require_homehost)
+ c.homehost = conf_get_homehost(&c.require_homehost);
+ if (c.homehost == NULL || strcasecmp(c.homehost, "<system>") == 0) {
+ if (gethostname(sys_hostname, sizeof(sys_hostname)) == 0) {
+ sys_hostname[sizeof(sys_hostname)-1] = 0;
+ c.homehost = sys_hostname;
+ }
+ }
+ if (c.homehost &&
+ (!c.homehost[0] || strcasecmp(c.homehost, "<none>") == 0)) {
+ c.homehost = NULL;
+ c.require_homehost = 0;
+ }
+
+ rv = 0;
+
+ set_hooks(); /* set hooks from libs */
+
+ if (c.homecluster == NULL && (c.nodes > 0)) {
+ c.homecluster = conf_get_homecluster();
+ if (c.homecluster == NULL)
+ rv = get_cluster_name(&c.homecluster);
+ if (rv) {
+ pr_err("The md can't get cluster name\n");
+ exit(1);
+ }
+ }
+
+ if (c.update && strcmp(c.update, "nodes") == 0 && c.nodes == 0) {
+ pr_err("Please specify nodes number with --nodes\n");
+ exit(1);
+ }
+
+ if (c.backup_file && data_offset != INVALID_SECTORS) {
+ pr_err("--backup-file and --data-offset are incompatible\n");
+ exit(2);
+ }
+
+ if ((mode == MISC && devmode == 'E') ||
+ (mode == MONITOR && spare_sharing == 0))
+ /* Anyone may try this */;
+ else if (geteuid() != 0) {
+ pr_err("must be super-user to perform this action\n");
+ exit(1);
+ }
+
+ ident.autof = c.autof;
+
+ if (c.scan && c.verbose < 2)
+ /* --scan implied --brief unless -vv */
+ c.brief = 1;
+
+ if (mode == CREATE) {
+ if (s.bitmap_file && strcmp(s.bitmap_file, "clustered") == 0) {
+ locked = cluster_get_dlmlock();
+ if (locked != 1)
+ exit(1);
+ }
+ } else if (mode == MANAGE || mode == GROW || mode == INCREMENTAL) {
+ if (!md_get_array_info(mdfd, &array) && (devmode != 'c')) {
+ if (array.state & (1 << MD_SB_CLUSTERED)) {
+ locked = cluster_get_dlmlock();
+ if (locked != 1)
+ exit(1);
+ }
+ }
+ }
+
+ switch(mode) {
+ case MANAGE:
+ /* readonly, add/remove, readwrite, runstop */
+ if (c.readonly > 0)
+ rv = Manage_ro(devlist->devname, mdfd, c.readonly);
+ if (!rv && devs_found>1)
+ rv = Manage_subdevs(devlist->devname, mdfd,
+ devlist->next, c.verbose, c.test,
+ c.update, c.force);
+ if (!rv && c.readonly < 0)
+ rv = Manage_ro(devlist->devname, mdfd, c.readonly);
+ if (!rv && c.runstop > 0)
+ rv = Manage_run(devlist->devname, mdfd, &c);
+ if (!rv && c.runstop < 0)
+ rv = Manage_stop(devlist->devname, mdfd, c.verbose, 0);
+ break;
+ case ASSEMBLE:
+ if (!c.scan && c.runstop == -1) {
+ pr_err("--no-degraded not meaningful without a --scan assembly.\n");
+ exit(1);
+ } else if (devs_found == 1 && ident.uuid_set == 0 &&
+ ident.super_minor == UnSet && ident.name[0] == 0 &&
+ !c.scan) {
+ /* Only a device has been given, so get details from config file */
+ struct mddev_ident *array_ident = conf_get_ident(devlist->devname);
+ if (array_ident == NULL) {
+ pr_err("%s not identified in config file.\n",
+ devlist->devname);
+ rv |= 1;
+ if (mdfd >= 0)
+ close(mdfd);
+ } else {
+ if (array_ident->autof == 0)
+ array_ident->autof = c.autof;
+ rv |= Assemble(ss, devlist->devname, array_ident,
+ NULL, &c);
+ }
+ } else if (!c.scan)
+ rv = Assemble(ss, devlist->devname, &ident,
+ devlist->next, &c);
+ else if (devs_found > 0) {
+ if (c.update && devs_found > 1) {
+ pr_err("can only update a single array at a time\n");
+ exit(1);
+ }
+ if (c.backup_file && devs_found > 1) {
+ pr_err("can only assemble a single array when providing a backup file.\n");
+ exit(1);
+ }
+ for (dv = devlist; dv; dv = dv->next) {
+ struct mddev_ident *array_ident = conf_get_ident(dv->devname);
+ if (array_ident == NULL) {
+ pr_err("%s not identified in config file.\n",
+ dv->devname);
+ rv |= 1;
+ continue;
+ }
+ if (array_ident->autof == 0)
+ array_ident->autof = c.autof;
+ rv |= Assemble(ss, dv->devname, array_ident,
+ NULL, &c);
+ }
+ } else {
+ if (c.update) {
+ pr_err("--update not meaningful with a --scan assembly.\n");
+ exit(1);
+ }
+ if (c.backup_file) {
+ pr_err("--backup_file not meaningful with a --scan assembly.\n");
+ exit(1);
+ }
+ rv = scan_assemble(ss, &c, &ident);
+ }
+
+ break;
+ case BUILD:
+ if (c.delay == 0)
+ c.delay = DEFAULT_BITMAP_DELAY;
+ if (s.write_behind && !s.bitmap_file) {
+ pr_err("write-behind mode requires a bitmap.\n");
+ rv = 1;
+ break;
+ }
+ if (s.raiddisks == 0) {
+ pr_err("no raid-devices specified.\n");
+ rv = 1;
+ break;
+ }
+
+ if (s.bitmap_file) {
+ if (strcmp(s.bitmap_file, "internal") == 0 ||
+ strcmp(s.bitmap_file, "clustered") == 0) {
+ pr_err("'internal' and 'clustered' bitmaps not supported with --build\n");
+ rv |= 1;
+ break;
+ }
+ }
+ rv = Build(devlist->devname, devlist->next, &s, &c);
+ break;
+ case CREATE:
+ if (c.delay == 0)
+ c.delay = DEFAULT_BITMAP_DELAY;
+
+ if (c.nodes) {
+ if (!s.bitmap_file ||
+ strcmp(s.bitmap_file, "clustered") != 0) {
+ pr_err("--nodes argument only compatible with --bitmap=clustered\n");
+ rv = 1;
+ break;
+ }
+
+ if (s.level != 1 && s.level != 10) {
+ pr_err("--bitmap=clustered is currently supported with raid1/10 only\n");
+ rv = 1;
+ break;
+ }
+ if (s.level == 10 && !(is_near_layout_10(s.layout) || s.layout == UnSet)) {
+ pr_err("only near layout is supported with clustered raid10\n");
+ rv = 1;
+ break;
+ }
+ }
+
+ if (s.write_behind && !s.bitmap_file) {
+ pr_err("write-behind mode requires a bitmap.\n");
+ rv = 1;
+ break;
+ }
+ if (s.raiddisks == 0) {
+ pr_err("no raid-devices specified.\n");
+ rv = 1;
+ break;
+ }
+
+ rv = Create(ss, devlist->devname,
+ ident.name, ident.uuid_set ? ident.uuid : NULL,
+ devs_found-1, devlist->next,
+ &s, &c, data_offset);
+ break;
+ case MISC:
+ if (devmode == 'E') {
+ if (devlist == NULL && !c.scan) {
+ pr_err("No devices to examine\n");
+ exit(2);
+ }
+ if (devlist == NULL)
+ devlist = conf_get_devs();
+ if (devlist == NULL) {
+ pr_err("No devices listed in %s\n", configfile?configfile:DefaultConfFile);
+ exit(1);
+ }
+ rv = Examine(devlist, &c, ss);
+ } else if (devmode == DetailPlatform) {
+ rv = Detail_Platform(ss ? ss->ss : NULL, ss ? c.scan : 1,
+ c.verbose, c.export,
+ devlist ? devlist->devname : NULL);
+ } else if (devlist == NULL) {
+ if (devmode == 'S' && c.scan)
+ rv = stop_scan(c.verbose);
+ else if ((devmode == 'D' || devmode == Waitclean) &&
+ c.scan)
+ rv = misc_scan(devmode, &c);
+ else if (devmode == UdevRules)
+ rv = Write_rules(udev_filename);
+ else {
+ pr_err("No devices given.\n");
+ exit(2);
+ }
+ } else
+ rv = misc_list(devlist, &ident, dump_directory, ss, &c);
+ break;
+ case MONITOR:
+ if (!devlist && !c.scan) {
+ pr_err("Cannot monitor: need --scan or at least one device\n");
+ rv = 1;
+ break;
+ }
+ if (pidfile && !daemonise) {
+ pr_err("Cannot write a pid file when not in daemon mode\n");
+ rv = 1;
+ break;
+ }
+ if (c.delay == 0) {
+ c.delay = conf_get_monitor_delay();
+ if (!c.delay)
+ c.delay = 60;
+ }
+ rv = Monitor(devlist, mailaddr, program,
+ &c, daemonise, oneshot,
+ dosyslog, pidfile, increments,
+ spare_sharing);
+ break;
+
+ case GROW:
+ if (array_size > 0) {
+ /* alway impose array size first, independent of
+ * anything else
+ * Do not allow level or raid_disks changes at the
+ * same time as that can be irreversibly destructive.
+ */
+ struct mdinfo sra;
+ int err;
+ if (s.raiddisks || s.level != UnSet) {
+ pr_err("cannot change array size in same operation as changing raiddisks or level.\n"
+ " Change size first, then check that data is still intact.\n");
+ rv = 1;
+ break;
+ }
+ if (sysfs_init(&sra, mdfd, NULL)) {
+ rv = 1;
+ break;
+ }
+ if (array_size == MAX_SIZE)
+ err = sysfs_set_str(&sra, NULL, "array_size", "default");
+ else
+ err = sysfs_set_num(&sra, NULL, "array_size", array_size / 2);
+ if (err < 0) {
+ if (errno == E2BIG)
+ pr_err("--array-size setting is too large.\n");
+ else
+ pr_err("current kernel does not support setting --array-size\n");
+ rv = 1;
+ break;
+ }
+ }
+ if (devs_found > 1 && s.raiddisks == 0 && s.level == UnSet) {
+ /* must be '-a'. */
+ if (s.size > 0 || s.chunk ||
+ s.layout_str || s.bitmap_file) {
+ pr_err("--add cannot be used with other geometry changes in --grow mode\n");
+ rv = 1;
+ break;
+ }
+ for (dv = devlist->next; dv; dv = dv->next) {
+ rv = Grow_Add_device(devlist->devname, mdfd,
+ dv->devname);
+ if (rv)
+ break;
+ }
+ } else if (s.bitmap_file) {
+ if (s.size > 0 || s.raiddisks || s.chunk ||
+ s.layout_str || devs_found > 1) {
+ pr_err("--bitmap changes cannot be used with other geometry changes in --grow mode\n");
+ rv = 1;
+ break;
+ }
+ if (c.delay == 0)
+ c.delay = DEFAULT_BITMAP_DELAY;
+ rv = Grow_addbitmap(devlist->devname, mdfd, &c, &s);
+ } else if (grow_continue)
+ rv = Grow_continue_command(devlist->devname,
+ mdfd, c.backup_file,
+ c.verbose);
+ else if (s.size > 0 || s.raiddisks || s.layout_str ||
+ s.chunk != 0 || s.level != UnSet ||
+ data_offset != INVALID_SECTORS) {
+ rv = Grow_reshape(devlist->devname, mdfd,
+ devlist->next,
+ data_offset, &c, &s);
+ } else if (s.consistency_policy != CONSISTENCY_POLICY_UNKNOWN) {
+ rv = Grow_consistency_policy(devlist->devname, mdfd, &c, &s);
+ } else if (array_size == 0)
+ pr_err("no changes to --grow\n");
+ break;
+ case INCREMENTAL:
+ if (rebuild_map) {
+ RebuildMap();
+ }
+ if (c.scan) {
+ rv = 1;
+ if (devlist) {
+ pr_err("In --incremental mode, a device cannot be given with --scan.\n");
+ break;
+ }
+ if (c.runstop <= 0) {
+ pr_err("--incremental --scan meaningless without --run.\n");
+ break;
+ }
+ if (devmode == 'f') {
+ pr_err("--incremental --scan --fail not supported.\n");
+ break;
+ }
+ rv = IncrementalScan(&c, NULL);
+ }
+ if (!devlist) {
+ if (!rebuild_map && !c.scan) {
+ pr_err("--incremental requires a device.\n");
+ rv = 1;
+ }
+ break;
+ }
+ if (devmode == 'f') {
+ if (devlist->next) {
+ pr_err("'--incremental --fail' can only handle one device.\n");
+ rv = 1;
+ break;
+ }
+ rv = IncrementalRemove(devlist->devname, remove_path,
+ c.verbose);
+ } else
+ rv = Incremental(devlist, &c, ss);
+ break;
+ case AUTODETECT:
+ autodetect();
+ break;
+ }
+ if (locked)
+ cluster_release_dlmlock();
+ close_fd(&mdfd);
+ exit(rv);
+}
+
+/*
+ * scan_assemble() - assemble arrays for a --scan assembly when no
+ * devices were named on the command line.
+ *
+ * Every entry returned by conf_get_ident(NULL) is handed to Assemble(),
+ * except entries already marked assembled and entries whose devname is
+ * "<ignore>".  The list is walked again for as long as a pass produced
+ * both a failure and a success (presumably so that arrays stacked on
+ * other arrays get another chance once their components exist).
+ *
+ * If no entry was attempted at all and c->homehost is set, fall back to
+ * auto-assembly: Assemble() is called with a NULL device name until it
+ * returns 2, and that is repeated until a round assembles nothing.
+ *
+ * @ss:    metadata handler, passed straight through to Assemble()
+ * @c:     command context; c->autof and c->homehost are read here
+ * @ident: identity used for auto-assembly; ident->autof is overwritten
+ *         with c->autof in that case
+ *
+ * Returns 0 on success, non-zero otherwise: the OR of the Assemble()
+ * results of the last pass, or 1 when the config is unusable or no
+ * array was found.
+ */
+static int scan_assemble(struct supertype *ss,
+			 struct context *c,
+			 struct mddev_ident *ident)
+{
+	struct mddev_ident *array_list = conf_get_ident(NULL);
+	struct mddev_dev *conf_devs = conf_get_devs();
+	struct mddev_ident *arr;
+	struct map_ent *map = NULL;
+	int attempted = 0;
+	int rv = 0;
+	int nfail, nok;
+
+	if (conf_verify_devnames(array_list)) {
+		pr_err("Duplicate MD device names in conf file were found.\n");
+		return 1;
+	}
+	if (!conf_devs) {
+		pr_err("No devices listed in conf file were found.\n");
+		return 1;
+	}
+
+	/* Reset per-array state; inherit --auto where the config gave none. */
+	for (arr = array_list; arr; arr = arr->next) {
+		arr->assembled = 0;
+		if (!arr->autof)
+			arr->autof = c->autof;
+	}
+
+	/* A failed lock is reported but does not stop the assembly. */
+	if (map_lock(&map))
+		pr_err("failed to get exclusive lock on mapfile\n");
+
+	do {
+		nfail = 0;
+		nok = 0;
+		rv = 0;
+		for (arr = array_list; arr; arr = arr->next) {
+			int r;
+
+			if (arr->assembled)
+				continue;
+			if (arr->devname &&
+			    !strcasecmp(arr->devname, "<ignore>"))
+				continue;
+
+			r = Assemble(ss, arr->devname, arr, NULL, c);
+			if (r) {
+				nfail++;
+			} else {
+				arr->assembled = 1;
+				nok++;
+			}
+			rv |= r;
+			attempted++;
+		}
+	} while (nfail && nok);
+
+	if (attempted == 0 && c->homehost) {
+		/* Maybe we can auto-assemble something.
+		 * Keep calling Assemble in auto-assemble mode
+		 * until it reports that nothing is left (2).
+		 */
+		int found;
+
+		ident->autof = c->autof;
+		do {
+			struct mddev_dev *auto_devs = conf_get_devs();
+
+			found = 0;
+			for (;;) {
+				int r = Assemble(ss, NULL, ident,
+						 auto_devs, c);
+
+				if (r == 2)
+					break;
+				if (r == 0) {
+					attempted++;
+					found++;
+				}
+			}
+			/* In case there are stacked devices, go around again */
+		} while (found);
+
+		if (attempted)
+			rv = 0;
+		else if (rv == 0) {
+			pr_err("No arrays found in config file or automatically\n");
+			rv = 1;
+		}
+	} else if (attempted == 0 && rv == 0) {
+		pr_err("No arrays found in config file\n");
+		rv = 1;
+	}
+	map_unlock(&map);
+	return rv;
+}
+
+/*
+ * misc_scan() - apply --detail or --wait-clean to all devices in
+ * /proc/mdstat.
+ *
+ * Two passes are made over the list returned by mdstat_read(): the
+ * first handles entries whose metadata_version does not start with
+ * "external:/", the second handles those that do.
+ *
+ * For each entry the device path is taken from the map file when that
+ * path is not "/unknown" and can be stat()ed; otherwise get_md_name()
+ * is asked for a name.  Entries with no usable name are reported and
+ * skipped.
+ *
+ * @devmode: 'D' runs Detail(); any other value runs WaitClean()
+ * @c:       command context, passed to Detail(); c->verbose is passed
+ *           to WaitClean()
+ *
+ * Returns the OR of all Detail()/WaitClean() results.
+ */
+static int misc_scan(char devmode, struct context *c)
+{
+	struct mdstat_ent *ms = mdstat_read(0, 1);
+	struct map_ent *map = NULL;
+	int rv = 0;
+	int pass;
+
+	for (pass = 0; pass < 2; pass++) {
+		struct mdstat_ent *ent;
+
+		for (ent = ms; ent; ent = ent->next) {
+			struct map_ent *me;
+			struct stat stb;
+			char *name = NULL;
+			int external = 0;
+
+			if (ent->metadata_version &&
+			    !strncmp(ent->metadata_version, "external:/", 10))
+				external = 1;
+			if (external != pass)
+				continue;
+
+			me = map_by_devnm(&map, ent->devnm);
+			if (me && me->path && strcmp(me->path, "/unknown") != 0)
+				name = me->path;
+			if (!name || stat(name, &stb) != 0)
+				name = get_md_name(ent->devnm);
+			if (!name) {
+				pr_err("cannot find device file for %s\n",
+				       ent->devnm);
+				continue;
+			}
+
+			if (devmode == 'D')
+				rv |= Detail(name, c);
+			else
+				rv |= WaitClean(name, c->verbose);
+			put_md_name(name);
+
+			/* Drop the map after every handled entry; the next
+			 * map_by_devnm() call starts from a NULL map again.
+			 */
+			map_free(map);
+			map = NULL;
+		}
+	}
+	free_mdstat(ms);
+	return rv;
+}
+
+/*
+ * stop_scan() - apply --stop to all devices in /proc/mdstat.
+ *
+ * Because devices may be stacked, the list is re-read and walked
+ * repeatedly.  Another round is made while the previous one hit an
+ * error; once a round stops nothing at all, the following round is the
+ * final one.  Manage_stop() is told whether this is the final round
+ * via its last argument (!last) - presumably so it can stay quiet
+ * while a retry is still coming; confirm against Manage_stop().
+ *
+ * @verbose: verbosity level passed to Manage_stop()
+ *
+ * Returns 1 if the last round had a Manage_stop() failure, else 0.
+ */
+static int stop_scan(int verbose)
+{
+	int progress = 1;
+	int last = 0;
+	int err;
+
+	do {
+		struct mdstat_ent *ms = mdstat_read(0, 0);
+		struct mdstat_ent *ent;
+
+		/* The previous round stopped nothing: this one is final. */
+		if (!progress)
+			last = 1;
+		progress = 0;
+		err = 0;
+
+		for (ent = ms; ent; ent = ent->next) {
+			char *name = get_md_name(ent->devnm);
+			int fd;
+
+			if (!name) {
+				pr_err("cannot find device file for %s\n",
+				       ent->devnm);
+				continue;
+			}
+
+			fd = open_mddev(name, 1);
+			if (fd >= 0) {
+				if (Manage_stop(name, fd, verbose, !last) == 0)
+					progress = 1;
+				else
+					err = 1;
+				close(fd);
+			}
+
+			put_md_name(name);
+		}
+		free_mdstat(ms);
+	} while (err && !last);
+
+	return err ? 1 : 0;
+}
+
+/*
+ * misc_list() - perform the per-device action recorded in each list
+ * entry's ->disposition.
+ *
+ * @devlist:        devices to act on, each tagged with a disposition
+ * @ident:          passed to Update_subarray()
+ * @dump_directory: passed to Dump_metadata() / Restore_metadata()
+ * @ss:             metadata handler passed to the Kill/Examine/Dump/
+ *                  Restore helpers; may be NULL
+ * @c:              command context (force, verbose, brief, subarray,
+ *                  update, action, scan and runstop are used here)
+ *
+ * Returns the OR of the results of every action performed.  Iteration
+ * stops early once bit 16 is set in the accumulated result.
+ */
+static int misc_list(struct mddev_dev *devlist,
+ struct mddev_ident *ident,
+ char *dump_directory,
+ struct supertype *ss, struct context *c)
+{
+ struct mddev_dev *dv;
+ int rv = 0;
+
+ /* The step expression ends the walk as soon as (rv & 16) is set. */
+ for (dv = devlist; dv; dv = (rv & 16) ? NULL : dv->next) {
+ int mdfd = -1;
+
+ /* First group: actions that take the device name directly.
+ * Each case ends with 'continue', moving on to the next device.
+ */
+ switch(dv->disposition) {
+ case 'D':
+ rv |= Detail(dv->devname, c);
+ continue;
+ case KillOpt: /* Zero superblock */
+ if (ss)
+ rv |= Kill(dv->devname, ss, c->force, c->verbose,0);
+ else {
+ /* No metadata type given: call Kill() repeatedly until
+ * the accumulated rv is non-zero, passing verbose -1
+ * after the first call, then clear bit 4 of rv.
+ * NOTE(review): the loop tests the accumulated rv, so
+ * if an earlier device already left rv non-zero, Kill()
+ * runs only once here - confirm this is intended.
+ */
+ int v = c->verbose;
+ do {
+ rv |= Kill(dv->devname, NULL, c->force, v, 0);
+ v = -1;
+ } while (rv == 0);
+ rv &= ~4;
+ }
+ continue;
+ case 'Q':
+ rv |= Query(dv->devname);
+ continue;
+ case 'X':
+ rv |= ExamineBitmap(dv->devname, c->brief, ss);
+ continue;
+ case ExamineBB:
+ rv |= ExamineBadblocks(dv->devname, c->brief, ss);
+ continue;
+ case 'W':
+ case WaitOpt:
+ rv |= Wait(dv->devname);
+ continue;
+ case Waitclean:
+ rv |= WaitClean(dv->devname, c->verbose);
+ continue;
+ case KillSubarray:
+ rv |= Kill_subarray(dv->devname, c->subarray, c->verbose);
+ continue;
+ case UpdateSubarray:
+ if (c->update == NULL) {
+ pr_err("-U/--update must be specified with --update-subarray\n");
+ rv |= 1;
+ continue;
+ }
+ rv |= Update_subarray(dv->devname, c->subarray,
+ c->update, ident, c->verbose);
+ continue;
+ case Dump:
+ rv |= Dump_metadata(dv->devname, dump_directory, c, ss);
+ continue;
+ case Restore:
+ /* The last argument is true only when exactly one device
+ * was listed.
+ */
+ rv |= Restore_metadata(dv->devname, dump_directory, c, ss,
+ (dv == devlist && dv->next == NULL));
+ continue;
+ case Action:
+ rv |= SetAction(dv->devname, c->action);
+ continue;
+ }
+
+ /* Second group: actions that need an open descriptor.  A name
+ * not starting with '/' is first tried with open_dev(); if that
+ * fails, or for names starting with '/', open_mddev() is used.
+ */
+ if (dv->devname[0] != '/')
+ mdfd = open_dev(dv->devname);
+ if (dv->devname[0] == '/' || mdfd < 0)
+ mdfd = open_mddev(dv->devname, 1);
+
+ if (mdfd >= 0) {
+ switch(dv->disposition) {
+ case 'R':
+ c->runstop = 1;
+ rv |= Manage_run(dv->devname, mdfd, c);
+ break;
+ case 'S':
+ if (c->scan) {
+ pr_err("--stop not meaningful with both a --scan assembly and a device name.\n");
+ rv |= 1;
+ break;
+ }
+ rv |= Manage_stop(dv->devname, mdfd, c->verbose, 0);
+ break;
+ case 'o':
+ /* 'o' passes 1 to Manage_ro(), 'w' below passes -1. */
+ rv |= Manage_ro(dv->devname, mdfd, 1);
+ break;
+ case 'w':
+ rv |= Manage_ro(dv->devname, mdfd, -1);
+ break;
+ }
+ close(mdfd);
+ } else
+ /* The device could not be opened: record a failure. */
+ rv |= 1;
+ }
+ return rv;
+}
+
+/*
+ * SetAction() - write @action to the "sync_action" sysfs attribute of
+ * the md array behind @dev.
+ *
+ * @dev:    path of the array's device node; opened read-only
+ * @action: string written to "sync_action"
+ *
+ * The descriptor is only needed to initialise the sysfs handle and is
+ * closed again before the attribute is written.
+ *
+ * Returns 0 on success, 1 on failure (after printing an error message).
+ */
+int SetAction(char *dev, char *action)
+{
+	struct mdinfo mdi;
+	int retval;
+	int fd = open(dev, O_RDONLY);
+
+	if (fd < 0) {
+		pr_err("Couldn't open %s: %s\n", dev, strerror(errno));
+		return 1;
+	}
+	retval = sysfs_init(&mdi, fd, NULL);
+	close(fd);
+	if (retval) {
+		/* Message fixed: previously read "is no an md array". */
+		pr_err("%s is not an md array\n", dev);
+		return 1;
+	}
+
+	if (sysfs_set_str(&mdi, NULL, "sync_action", action) < 0) {
+		/* Message fixed: previously read "Count not set action". */
+		pr_err("Could not set action for %s to %s: %s\n",
+		       dev, action, strerror(errno));
+		return 1;
+	}
+	return 0;
+}
diff --git a/mdadm.conf-example b/mdadm.conf-example
new file mode 100644
index 0000000..35a75d1
--- /dev/null
+++ b/mdadm.conf-example
@@ -0,0 +1,65 @@
+# mdadm configuration file
+#
+# mdadm will function properly without the use of a configuration file,
+# but this file is useful for keeping track of arrays and member disks.
+# In general, a mdadm.conf file is created, and updated, after arrays
+# are created. This is the opposite behavior of /etc/raidtab which is
+# created prior to array construction.
+#
+#
+# the config file takes two types of lines:
+#
+# DEVICE lines specify a list of devices in which to look for
+# potential member disks
+#
+# ARRAY lines specify information about how to identify arrays
+# so that they can be activated
+#
+# You can have more than one device line and use wild cards. The first
+# example includes the first partition of SCSI disks /dev/sdb,
+# /dev/sdc, /dev/sdd, /dev/sdj, /dev/sdk, and /dev/sdl. The second
+# line looks for array slices on IDE disks.
+#
+#DEVICE /dev/sd[bcdjkl]1
+#DEVICE /dev/hda1 /dev/hdb1
+#
+# If you mount devfs on /dev, then a suitable way to list all devices is:
+#DEVICE /dev/discs/*/*
+#
+#
+# The AUTO line can control which arrays get assembled by auto-assembly,
+# meaning either "mdadm -As" when there are no 'ARRAY' lines in this file,
+# or "mdadm --incremental" when the array found is not listed in this file.
+# By default, all arrays that are found are assembled.
+# If you want to ignore all DDF arrays (maybe they are managed by dmraid),
+# and only assemble 1.x arrays which are marked for 'this' homehost,
+# but assemble all others, then use
+#AUTO -ddf homehost -1.x +all
+#
+# ARRAY lines specify an array to assemble and a method of identification.
+# Arrays can currently be identified by using a UUID, superblock minor number,
+# or a listing of devices.
+#
+# super-minor is usually the minor number of the metadevice
+# UUID is the Universally Unique Identifier for the array
+# Each can be obtained using
+#
+# mdadm -D <md>
+#
+#ARRAY /dev/md0 UUID=3aaa0122:29827cfa:5331ad66:ca767371
+#ARRAY /dev/md1 super-minor=1
+#ARRAY /dev/md2 devices=/dev/hda1,/dev/hdb1
+#
+# ARRAY lines can also specify a "spare-group" for each array. mdadm --monitor
+# will then move a spare between arrays in a spare-group if one array has a failed
+# drive but no spare
+#ARRAY /dev/md4 uuid=b23f3c6d:aec43a9f:fd65db85:369432df spare-group=group1
+#ARRAY /dev/md5 uuid=19464854:03f71b1b:e0df2edd:246cc977 spare-group=group1
+#
+# When used in --follow (aka --monitor) mode, mdadm needs a
+# mail address and/or a program. This can be given with "mailaddr"
+# and "program" lines so that monitoring can be started using
+# mdadm --follow --scan & echo $! > /run/mdadm/mon.pid
+# If the lines are not found, mdadm will exit quietly
+#MAILADDR root@mydomain.tld
+#PROGRAM /usr/sbin/handle-mdadm-events
diff --git a/mdadm.conf.5 b/mdadm.conf.5
new file mode 100644
index 0000000..74a21c5
--- /dev/null
+++ b/mdadm.conf.5
@@ -0,0 +1,706 @@
+.\" Copyright Neil Brown and others.
+.\" This program is free software; you can redistribute it and/or modify
+.\" it under the terms of the GNU General Public License as published by
+.\" the Free Software Foundation; either version 2 of the License, or
+.\" (at your option) any later version.
+.\" See file COPYING in distribution for details.
+.TH MDADM.CONF 5
+.SH NAME
+mdadm.conf \- configuration for management of Software RAID with mdadm
+.SH SYNOPSIS
+/etc/mdadm.conf
+.SH DESCRIPTION
+.PP
+.I mdadm
+is a tool for creating, managing, and monitoring RAID devices using the
+.B md
+driver in Linux.
+.PP
+Some common tasks, such as assembling all arrays, can be simplified
+by describing the devices and arrays in this configuration file.
+
+.SS SYNTAX
+The file should be seen as a collection of words separated by white
+space (space, tab, or newline).
+Any word that begins with a hash sign (#) starts a comment and that
+word together with the remainder of the line is ignored.
+
+Spaces can be included in a word using quotation characters. Either
+single quotes
+.RB ( ' )
+or double quotes (\fB"\fP)
+may be used. All the characters from one quotation character to the
+next identical character are protected and will not be used to
+separate words or to start new quoted strings. To include a single quote
+it must be between double quotes. To include a double quote it must
+be between single quotes.
+
+Any line that starts with white space (space or tab) is treated as
+though it were a continuation of the previous line.
+
+Empty lines are ignored, but otherwise each (non continuation) line
+must start with a keyword as listed below. The keywords are case
+insensitive and can be abbreviated to 3 characters.
+
+The keywords are:
+.TP
+.B DEVICE
+A
+.B device
+line lists the devices (whole devices or partitions) that might contain
+a component of an MD array. When looking for the components of an
+array,
+.I mdadm
+will scan these devices (or any devices listed on the command line).
+
+The
+.B device
+line may contain a number of different devices (separated by spaces)
+and each device name can contain wild cards as defined by
+.BR glob (7).
+
+Also, there may be several device lines present in the file.
+
+Alternatively, a
+.B device
+line can contain either or both of the words
+.B containers
+and
+.BR partitions .
+The word
+.B containers
+will cause
+.I mdadm
+to look for assembled CONTAINER arrays and include them as a source
+for assembling further arrays.
+
+The word
+.I partitions
+will cause
+.I mdadm
+to read
+.I /proc/partitions
+and include all devices and partitions found therein.
+.I mdadm
+does not use the names from
+.I /proc/partitions
+but only the major and minor device numbers. It scans
+.I /dev
+to find the name that matches the numbers.
+
+If no DEVICE line is present, then "DEVICE partitions containers" is assumed.
+
+For example:
+.IP
+DEVICE /dev/hda* /dev/hdc*
+.br
+DEV /dev/sd*
+.br
+DEVICE /dev/disk/by-path/pci*
+.br
+DEVICE partitions
+
+.TP
+.B ARRAY
+The ARRAY lines identify actual arrays. The second word on the line
+may be the name of the device where the array is normally
+assembled, such as
+.B /dev/md1
+or
+.BR /dev/md/backup .
+If the name does not start with a slash
+.RB (' / '),
+it is treated as being in
+.BR /dev/md/ .
+Alternately the word
+.B <ignore>
+(complete with angle brackets) can be given in which case any array
+which matches the rest of the line will never be automatically assembled.
+If no device name is given,
+.I mdadm
+will use various heuristics to determine an appropriate name.
+
+Subsequent words identify the array, or identify the array as a member
+of a group. If multiple identities are given,
+then a component device must match ALL identities to be considered a
+match. Each identity word has a tag, an equals sign, and some value.
+The tags are:
+.RS 4
+.TP
+.B uuid=
+The value should be a 128 bit uuid in hexadecimal, with punctuation
+interspersed if desired. This must match the uuid stored in the
+superblock.
+.TP
+.B name=
+The value should be a simple textual name as was given to
+.I mdadm
+when the array was created. This must match the name stored in the
+superblock on a device for that device to be included in the array.
+Not all superblock formats support names.
+.TP
+.B super\-minor=
+The value is an integer which indicates the minor number that was
+stored in the superblock when the array was created. When an array is
+created as /dev/mdX, then the minor number X is stored.
+.TP
+.B devices=
+The value is a comma separated list of device names or device name
+patterns.
+Only devices with names which match one entry in the list will be used
+to assemble the array. Note that the devices
+listed there must also be listed on a DEVICE line.
+.TP
+.B level=
+The value is a RAID level. This is not normally used to
+identify an array, but is supported so that the output of
+
+.B "mdadm \-\-examine \-\-scan"
+
+can be used directly in the configuration file.
+.TP
+.B num\-devices=
+The value is the number of devices in a complete active array. As with
+.B level=
+this is mainly for compatibility with the output of
+
+.BR "mdadm \-\-examine \-\-scan" .
+
+.TP
+.B spares=
+The value is a number of spare devices to expect the array to have.
+The sole use of this keyword and value is as follows:
+.B mdadm \-\-monitor
+will report an array if it is found to have fewer than this number of
+spares when
+.B \-\-monitor
+starts or when
+.B \-\-oneshot
+is used.
+
+.TP
+.B spare\-group=
+The value is a textual name for a group of arrays. All arrays with
+the same
+.B spare\-group
+name are considered to be part of the same group. The significance of
+a group of arrays is that
+.I mdadm
+will, when monitoring the arrays, move a spare drive from one array in
+a group to another array in that group if the first array had a failed
+or missing drive but no spare.
+
+.TP
+.B auto=
+This option is rarely needed with mdadm-3.0, particularly if used with
+the Linux kernel v2.6.28 or later.
+It tells
+.I mdadm
+whether to use partitionable array or non-partitionable arrays and,
+in the absence of
+.IR udev ,
+how many partition devices to create. From 2.6.28 all md array
+devices are partitionable, hence this option is not needed.
+
+The value of this option can be "yes" or "md" to indicate that a
+traditional, non-partitionable md array should be created, or "mdp",
+"part" or "partition" to indicate that a partitionable md array (only
+available in linux 2.6 and later) should be used. This latter set can
+also have a number appended to indicate how many partitions to create
+device files for, e.g.
+.BR auto=mdp5 .
+The default is 4.
+
+.TP
+.B bitmap=
+The option specifies a file in which a write-intent bitmap should be
+found. When assembling the array,
+.I mdadm
+will provide this file to the
+.B md
+driver as the bitmap file. This has the same function as the
+.B \-\-bitmap\-file
+option to
+.BR \-\-assemble .
+
+.TP
+.B metadata=
+Specify the metadata format that the array has. This is mainly
+recognised for compatibility with the output of
+.BR "mdadm \-Es" .
+
+.TP
+.B container=
+Specify that this array is a member array of some container. The
+value given can be either a path name in /dev, or a UUID of the
+container array.
+
+.TP
+.B member=
+Specify that this array is a member array of some container. Each
+type of container has some way to enumerate member arrays, often a
+simple sequence number. The value identifies which member of a
+container the array is. It will usually accompany a "container=" word.
+.RE
+
+.TP
+.B MAILADDR
+The
+.B mailaddr
+line gives an E-mail address that alerts should be
+sent to when
+.I mdadm
+is running in
+.B \-\-monitor
+mode (and was given the
+.B \-\-scan
+option). There should only be one
+.B MAILADDR
+line and it should have only one address. Any subsequent addresses
+are silently ignored.
+
+.TP
+.B MAILFROM
+The
+.B mailfrom
+line (which can only be abbreviated to at least 5 characters) gives an
+address to appear in the "From" address for alert mails. This can be
+useful if you want to explicitly set a domain, as the default from
+address is "root" with no domain. All words on this line are
+catenated with spaces to form the address.
+
+Note that this value cannot be set via the
+.I mdadm
+commandline. It is only settable via the config file.
+
+.TP
+.B PROGRAM
+The
+.B program
+line gives the name of a program to be run when
+.B "mdadm \-\-monitor"
+detects potentially interesting events on any of the arrays that it
+is monitoring. This program gets run with two or three arguments, they
+being the Event, the md device, and possibly the related component
+device.
+
+There should only be one
+.B program
+line and it should give only one program.
+
+
+.TP
+.B CREATE
+The
+.B create
+line gives default values to be used when creating arrays, new members
+of arrays, and device entries for arrays.
+These include:
+
+.RS 4
+.TP
+.B owner=
+.TP
+.B group=
+These can give user/group ids or names to use instead of system
+defaults (root/wheel or root/disk).
+.TP
+.B mode=
+An octal file mode such as 0660 can be given to override the default
+of 0600.
+.TP
+.B auto=
+This corresponds to the
+.B \-\-auto
+flag to mdadm. Give
+.BR yes ,
+.BR md ,
+.BR mdp ,
+.B part
+\(em possibly followed by a number of partitions \(em to indicate how
+missing device entries should be created.
+
+.TP
+.B metadata=
+The name of the metadata format to use if none is explicitly given.
+This can be useful to impose a system-wide default of version-1 superblocks.
+
+.TP
+.B symlinks=no
+Normally when creating devices in
+.B /dev/md/
+.I mdadm
+will create a matching symlink from
+.B /dev/
+with a name starting
+.B md
+or
+.BR md_ .
+Give
+.B symlinks=no
+to suppress this symlink creation.
+
+.TP
+.B names=yes
+Since Linux 2.6.29 it has been possible to create
+.B md
+devices with a name like
+.B md_home
+rather than just a number, like
+.BR md3 .
+.I mdadm
+will use the numeric alternative by default as other tools that interact
+with md arrays may expect only numbers.
+If
+.B names=yes
+is given in
+.I mdadm.conf
+then
+.I mdadm
+will use a name when appropriate.
+If
+.B names=no
+is given, then non-numeric
+.I md
+device names will not be used even if the default changes in a future
+release of
+.IR mdadm .
+
+.TP
+.B bbl=no
+By default,
+.I mdadm
+will reserve space for a bad block list (bbl) on all devices
+included in or added to any array that supports them. Setting
+.B bbl=no
+will prevent this, so newly added devices will not have a bad
+block log.
+.RE
+
+.TP
+.B HOMEHOST
+The
+.B homehost
+line gives a default value for the
+.B \-\-homehost=
+option to mdadm. There should normally be only one other word on the line.
+It should either be a host name, or one of the special words
+.BR <system> ,
+.B <none>
+and
+.BR <ignore> .
+If
+.B <system>
+is given, then the
+.BR gethostname ( 2 )
+system call is used to get the host name. This is the default.
+
+If
+.B <ignore>
+is given, then a flag is set so that when arrays are being
+auto-assembled the checking of the recorded
+.I homehost
+is disabled.
+If
+.B <ignore>
+is given it is also possible to give an explicit name which will be
+used when creating arrays. This is the only case when there can be
+more than one other word on the
+.B HOMEHOST
+line. If there are other words, or other
+.B HOMEHOST
+lines, they are silently ignored.
+
+If
+.B <none>
+is given, then the default of using
+.BR gethostname ( 2 )
+is over-ridden and no homehost name is assumed.
+
+When arrays are created, this host name will be stored in the
+metadata. When arrays are assembled using auto-assembly, arrays which
+do not record the correct homehost name in their metadata will be
+assembled using a "foreign" name. A "foreign" name always ends with a
+digit string preceded by an underscore to differentiate it
+from any possible local name. e.g.
+.B /dev/md/1_1
+or
+.BR /dev/md/home_0 .
+.TP
+.B AUTO
+A list of names of metadata format can be given, each preceded by a
+plus or minus sign. Also the word
+.I homehost
+is allowed as is
+.I all
+preceded by plus or minus sign.
+.I all
+is usually last.
+
+When
+.I mdadm
+is auto-assembling an array, either via
+.I \-\-assemble
+or
+.I \-\-incremental
+and it finds metadata of a given type, it checks that metadata type
+against those listed in this line. The first match wins, where
+.I all
+matches anything.
+If a match is found that was preceded by a plus sign, the auto
+assembly is allowed. If the match was preceded by a minus sign, the
+auto assembly is disallowed. If no match is found, the auto assembly
+is allowed.
+
+If the metadata indicates that the array was created for
+.I this
+host, and the word
+.I homehost
+appears before any other match, then the array is treated as a valid
+candidate for auto-assembly.
+
+This can be used to disable all auto-assembly (so that only arrays
+explicitly listed in mdadm.conf or on the command line are assembled),
+or to disable assembly of certain metadata types which might be
+handled by other software. It can also be used to disable assembly of
+all foreign arrays - normally such arrays are assembled but given a
+non-deterministic name in
+.BR /dev/md/ .
+
+The known metadata types are
+.BR 0.90 ,
+.BR 1.x ,
+.BR ddf ,
+.BR imsm .
+
+.B AUTO
+should be given at most once. Subsequent lines are silently ignored.
+Thus an earlier config file in a config directory will over-ride
+the setting in a later config file.
+
+.TP
+.B POLICY
+This is used to specify what automatic behavior is allowed on devices
+newly appearing in the system and provides a way of marking spares that can
+be moved to other arrays as well as the migration domains.
+.I Domain
+can be defined through
+.I policy
+line by specifying a domain name for a number of paths from
+.BR /dev/disk/by-path/ .
+A device may belong to several domains. The domain of an array is a union
+of domains of all devices in that array. A spare can be automatically
+moved from one array to another if the set of the destination array's
+.I domains
+contains all the
+.I domains
+of the new disk or if both arrays have the same
+.IR spare-group .
+
+To update hot plug configuration it is necessary to execute
+.B mdadm \-\-udev\-rules
+command after changing the config file.
+
+Keywords used in the
+.I POLICY
+line and supported values are:
+
+.RS 4
+.TP
+.B domain=
+any arbitrary string
+.TP
+.B metadata=
+0.9 1.x ddf or imsm
+.TP
+.B path=
+file glob matching anything from
+.B /dev/disk/by-path
+.TP
+.B type=
+either
+.B disk
+or
+.BR part .
+.TP
+.B action=
+include, re-add, spare, spare-same-slot, or force-spare
+.TP
+.B auto=
+yes, no, or homehost.
+
+.P
+The
+.I action
+item determines the automatic behavior allowed for devices matching the
+.I path
+and
+.I type
+in the same line. If a device matches several lines with different
+.I actions
+then the most permissive will apply. The ordering of policy lines
+is irrelevant to the end result.
+.TP
+.B include
+allows adding a disk to an array if metadata on that disk matches that array
+.TP
+.B re\-add
+will include the device in the array if it appears to be a current member
+or a member that was recently removed and the array has a
+write-intent-bitmap to allow the
+.B re\-add
+functionality.
+.TP
+.B spare
+as above and additionally: if the device is bare it can
+become a spare if there is any array that it is a candidate for based
+on domains and metadata.
+.TP
+.B spare\-same\-slot
+as above and additionally if given slot was used by an array that went
+degraded recently and the device plugged in has no metadata then it will
+be automatically added to that array (or its container)
+.TP
+.B force\-spare
+as above and the disk will become a spare in remaining cases
+.RE
+
+.TP
+.B PART-POLICY
+This is similar to
+.B POLICY
+and accepts the same keyword assignments. It allows a consistent set
+of policies to be applied to each of the partitions of a device.
+
+A
+.B PART-POLICY
+line should set
+.I type=disk
+and identify the path to one or more disk devices. Each partition on
+these disks will be treated according to the
+.I action=
+setting from this line. If a
+.I domain
+is set in the line, then the domain associated with each partition will
+be based on the domain, but with
+.RB \(dq -part N\(dq
+appended, where N is the partition number for the partition that was
+found.
+
+.TP
+.B SYSFS
+The
+.B SYSFS
+line lists custom values of MD device's sysfs attributes which will be
+stored in sysfs after the array is assembled. Multiple lines are allowed and each
+line has to contain the uuid or the name of the device to which it relates.
+.RS 4
+.TP
+.B uuid=
+hexadecimal identifier of MD device. This has to match the uuid stored in the
+superblock.
+.TP
+.B name=
+name of the MD device as was given to
+.I mdadm
+when the array was created. It will be ignored if
+.B uuid
+is not empty.
+.RE
+
+.TP
+.B MONITORDELAY
+The
+.B monitordelay
+line gives a delay in seconds
+.I mdadm
+shall wait before polling md arrays
+when
+.I mdadm
+is running in
+.B \-\-monitor
+mode.
+.B \-d/\-\-delay
+command line argument takes precedence over the config file.
+
+.SH EXAMPLE
+DEVICE /dev/sd[bcdjkl]1
+.br
+DEVICE /dev/hda1 /dev/hdb1
+
+# /dev/md0 is known by its UUID.
+.br
+ARRAY /dev/md0 UUID=3aaa0122:29827cfa:5331ad66:ca767371
+.br
+# /dev/md1 contains all devices with a minor number of
+.br
+# 1 in the superblock.
+.br
+ARRAY /dev/md1 super\-minor=1
+.br
+# /dev/md2 is made from precisely these two devices
+.br
+ARRAY /dev/md2 devices=/dev/hda1,/dev/hdb1
+
+# /dev/md4 and /dev/md5 are a spare-group and spares
+.br
+# can be moved between them
+.br
+ARRAY /dev/md4 uuid=b23f3c6d:aec43a9f:fd65db85:369432df
+.br
+ spare\-group=group1
+.br
+ARRAY /dev/md5 uuid=19464854:03f71b1b:e0df2edd:246cc977
+.br
+ spare\-group=group1
+.br
+# /dev/md/home is created, if needed, as a partitionable md array
+.br
+# any spare device number is allocated.
+.br
+ARRAY /dev/md/home UUID=9187a482:5dde19d9:eea3cc4a:d646ab8b
+.br
+ auto=part
+.br
+# The name of this array contains a space.
+.br
+ARRAY /dev/md9 name='Data Storage'
+.sp
+POLICY domain=domain1 metadata=imsm path=pci-0000:00:1f.2-scsi-*
+.br
+ action=spare
+.br
+POLICY domain=domain1 metadata=imsm path=pci-0000:04:00.0-scsi-[01]*
+.br
+ action=include
+.br
+# One domain consisting of devices attached to specified paths is defined.
+.br
+# Bare device matching first path will be made an imsm spare on hot plug.
+.br
+# If more than one array is created on devices belonging to domain1 and
+.br
+# one of them becomes degraded, then any imsm spare matching any path for
+.br
+# given domain name can be migrated.
+.br
+MAILADDR root@mydomain.tld
+.br
+PROGRAM /usr/sbin/handle\-mdadm\-events
+.br
+CREATE group=system mode=0640 auto=part\-8
+.br
+HOMEHOST <system>
+.br
+AUTO +1.x homehost \-all
+.br
+SYSFS name=/dev/md/raid5 group_thread_cnt=4 sync_speed_max=1000000
+.br
+SYSFS uuid=bead5eb6:31c17a27:da120ba2:7dfda40d group_thread_cnt=4
+sync_speed_max=1000000
+.br
+MONITORDELAY 60
+
+.SH SEE ALSO
+.BR mdadm (8),
+.BR md (4).
diff --git a/mdadm.h b/mdadm.h
new file mode 100644
index 0000000..c7268a7
--- /dev/null
+++ b/mdadm.h
@@ -0,0 +1,1887 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email: <neilb@suse.de>
+ */
+
+#define _GNU_SOURCE
+#define _FILE_OFFSET_BITS 64
+#include <unistd.h>
+#ifdef __GLIBC__
+extern __off64_t lseek64 __P ((int __fd, __off64_t __offset, int __whence));
+#elif !defined(lseek64)
+# if defined(__NO_STAT64) || __WORDSIZE != 32
+# define lseek64 lseek
+# endif
+#endif
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <time.h>
+#include <sys/time.h>
+#include <getopt.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <syslog.h>
+#include <stdbool.h>
+/* Newer glibc requires sys/sysmacros.h directly for makedev() */
+#include <sys/sysmacros.h>
+#ifdef __dietlibc__
+#include <strings.h>
+/* dietlibc has deprecated random and srandom!! */
+#define random rand
+#define srandom srand
+#endif
+
+#ifdef NO_COROSYNC /* corosync headers not used: define CS_OK and cmap_handle_t locally */
+#define CS_OK 1
+typedef uint64_t cmap_handle_t;
+#else /* !NO_COROSYNC */
+#include <corosync/cmap.h>
+#endif /* NO_COROSYNC */
+
+#ifndef NO_DLM
+#include <libdlm.h>
+#include <errno.h>
+#else /* NO_DLM: local stand-ins replace <libdlm.h> */
+#define LKF_NOQUEUE 0x00000001
+#define LKM_PWMODE 4
+#define EUNLOCK 0x10002
+
+typedef void *dlm_lshandle_t;
+
+struct dlm_lksb { /* NOTE(review): presumably mirrors libdlm's struct dlm_lksb -- confirm layout */
+ int sb_status;
+ uint32_t sb_lkid;
+ char sb_flags;
+ char *sb_lvbptr;
+};
+#endif /* NO_DLM */
+
+#include <linux/kdev_t.h>
+/*#include <linux/fs.h> */
+#include <sys/mount.h>
+#include <asm/types.h>
+#include <sys/ioctl.h>
+#define MD_MAJOR 9 /* NOTE(review): presumably the block-device major number of md arrays -- confirm */
+#define MdpMinorShift 6 /* NOTE(review): presumably minor-number bits reserved per partitionable array -- confirm */
+
+#ifndef BLKGETSIZE64 /* fallback, used only when the included headers do not define it */
+#define BLKGETSIZE64 _IOR(0x12,114,size_t) /* return device size in bytes (u64 *arg) */
+#endif
+
+#define DEFAULT_CHUNK 512 /* unit is not stated here -- TODO confirm at the point of use */
+#define DEFAULT_BITMAP_CHUNK 4096 /* unit is not stated here -- TODO confirm at the point of use */
+#define DEFAULT_BITMAP_DELAY 5 /* unit is not stated here -- TODO confirm at the point of use */
+#define DEFAULT_MAX_WRITE_BEHIND 256
+
+/* MAP_DIR should be somewhere that persists across the pivotroot
+ * from early boot to late boot.
+ * /run seems to have emerged as the best standard.
+ */
+#ifndef MAP_DIR
+#define MAP_DIR "/run/mdadm"
+#endif /* MAP_DIR */
+/* MAP_FILE is what we name the map file we put in MAP_DIR, in case you
+ * want something other than the default of "map"
+ */
+#ifndef MAP_FILE
+#define MAP_FILE "map"
+#endif /* MAP_FILE */
+/* MDMON_DIR is where pid and socket files used for communicating
+ * with mdmon normally live. /var/run/mdadm would be the natural place,
+ * but mdmon is needed at early boot, so it has to write there before
+ * /var/run is mounted read/write, and the files also need to
+ * persist beyond when /var/run is mounted read-only. So, to be
+ * safe, the default is somewhere that is read/write early in the
+ * boot process and stays up as long as possible during shutdown.
+ */
+#ifndef MDMON_DIR
+#define MDMON_DIR "/run/mdadm"
+#endif /* MDMON_DIR */
+
+/* FAILED_SLOTS_DIR is where to save files recording the recent removal
+ * of an array member, in order to allow future reuse of a disk inserted
+ * in the same slot for array recovery
+ */
+#ifndef FAILED_SLOTS_DIR
+#define FAILED_SLOTS_DIR "/run/mdadm/failed-slots"
+#endif /* FAILED_SLOTS_DIR */
+
+#ifndef MDMON_SERVICE
+#define MDMON_SERVICE "mdmon"
+#endif /* MDMON_SERVICE */
+
+#ifndef GROW_SERVICE
+#define GROW_SERVICE "mdadm-grow-continue"
+#endif /* GROW_SERVICE */
+
+#include "md_u.h"
+#include "md_p.h"
+#include "bitmap.h"
+#include "msg.h"
+
+#include <endian.h>
+/* Red Hat doesn't like us to #include <asm/byteorder.h>, and
+ * sometimes including <linux/byteorder/xxx_endian.h> isn't enough,
+ * and there is no standard conversion function, so... */
+/* ...and dietlibc doesn't think byteswap is ok, so <byteswap.h> is out too. */
+/* #include <byteswap.h> -- deliberately not used; the byte swaps are
+ * open-coded below for 16-, 32- and 64-bit values.  Each macro names its
+ * argument several times, so the argument is evaluated more than once:
+ * do not pass an expression with side effects. */
+#define __mdadm_bswap_16(x) (((x) & 0x00ffU) << 8 | \
+ ((x) & 0xff00U) >> 8)
+#define __mdadm_bswap_32(x) (((x) & 0x000000ffU) << 24 | \
+ ((x) & 0xff000000U) >> 24 | \
+ ((x) & 0x0000ff00U) << 8 | \
+ ((x) & 0x00ff0000U) >> 8)
+#define __mdadm_bswap_64(x) (((x) & 0x00000000000000ffULL) << 56 | \
+ ((x) & 0xff00000000000000ULL) >> 56 | \
+ ((x) & 0x000000000000ff00ULL) << 40 | \
+ ((x) & 0x00ff000000000000ULL) >> 40 | \
+ ((x) & 0x0000000000ff0000ULL) << 24 | \
+ ((x) & 0x0000ff0000000000ULL) >> 24 | \
+ ((x) & 0x00000000ff000000ULL) << 8 | \
+ ((x) & 0x000000ff00000000ULL) >> 8)
+
+#if !defined(__KLIBC__) /* NOTE(review): presumably klibc supplies these conversions itself -- confirm */
+#if BYTE_ORDER == LITTLE_ENDIAN /* little-endian host: le* conversions are plain casts, be* conversions swap bytes */
+#define __cpu_to_le16(_x) (unsigned int)(_x) /* note: the 16-bit forms also yield unsigned int, not a 16-bit type */
+#define __cpu_to_le32(_x) (unsigned int)(_x)
+#define __cpu_to_le64(_x) (unsigned long long)(_x)
+#define __le16_to_cpu(_x) (unsigned int)(_x)
+#define __le32_to_cpu(_x) (unsigned int)(_x)
+#define __le64_to_cpu(_x) (unsigned long long)(_x)
+
+#define __cpu_to_be16(_x) __mdadm_bswap_16(_x)
+#define __cpu_to_be32(_x) __mdadm_bswap_32(_x)
+#define __cpu_to_be64(_x) __mdadm_bswap_64(_x)
+#define __be16_to_cpu(_x) __mdadm_bswap_16(_x)
+#define __be32_to_cpu(_x) __mdadm_bswap_32(_x)
+#define __be64_to_cpu(_x) __mdadm_bswap_64(_x)
+#elif BYTE_ORDER == BIG_ENDIAN /* big-endian host: le* conversions swap bytes, be* conversions are plain casts */
+#define __cpu_to_le16(_x) __mdadm_bswap_16(_x)
+#define __cpu_to_le32(_x) __mdadm_bswap_32(_x)
+#define __cpu_to_le64(_x) __mdadm_bswap_64(_x)
+#define __le16_to_cpu(_x) __mdadm_bswap_16(_x)
+#define __le32_to_cpu(_x) __mdadm_bswap_32(_x)
+#define __le64_to_cpu(_x) __mdadm_bswap_64(_x)
+
+#define __cpu_to_be16(_x) (unsigned int)(_x)
+#define __cpu_to_be32(_x) (unsigned int)(_x)
+#define __cpu_to_be64(_x) (unsigned long long)(_x)
+#define __be16_to_cpu(_x) (unsigned int)(_x)
+#define __be32_to_cpu(_x) (unsigned int)(_x)
+#define __be64_to_cpu(_x) (unsigned long long)(_x)
+#else /* neither little- nor big-endian: refuse to build */
+# error "unknown endianness."
+#endif /* BYTE_ORDER */
+#endif /* !__KLIBC__ */
+
+/*
+ * Accessors for 16- and 32-bit values at addresses that need not be
+ * naturally aligned.  Partially taken from the Linux kernel's
+ * include/linux/unaligned/packed_struct.h: the value is wrapped in a
+ * packed one-member struct and read or written through that struct.
+ */
+struct __una_u16 {
+	__u16 x;
+} __attribute__ ((packed));
+
+struct __una_u32 {
+	__u32 x;
+} __attribute__ ((packed));
+
+/* Return the __u16 stored at p. */
+static inline __u16 __get_unaligned16(const void *p)
+{
+	return ((const struct __una_u16 *)p)->x;
+}
+
+/* Return the __u32 stored at p. */
+static inline __u32 __get_unaligned32(const void *p)
+{
+	return ((const struct __una_u32 *)p)->x;
+}
+
+/* Store val as a __u16 at p. */
+static inline void __put_unaligned16(__u16 val, void *p)
+{
+	((struct __una_u16 *)p)->x = val;
+}
+
+/* Store val as a __u32 at p. */
+static inline void __put_unaligned32(__u32 val, void *p)
+{
+	((struct __una_u32 *)p)->x = val;
+}
+
+/*
+ * Check at compile time that something is of a particular type.
+ * Always evaluates to 1 so you may use it easily in comparisons.
+ *
+ * The macro declares one dummy of the requested type and one of
+ * typeof(x), compares their addresses and discards the result.
+ * NOTE(review): the check relies on the compiler diagnosing a
+ * comparison between pointers to different types.  x is only used
+ * inside typeof().
+*/
+
+#define typecheck(type,x) \
+({ type __dummy; \
+ typeof(x) __dummy2; \
+ (void)(&__dummy == &__dummy2); \
+ 1; \
+})
+
+/*
+ * These macros deal with timer wrapping correctly.
+ *
+ * time_after(a,b) returns true if the time a is after time b.
+ * time_before(a,b) is time_after() with the arguments swapped.
+ *
+ * Both arguments must be unsigned int (enforced with typecheck()).
+ * The unsigned difference b - a is converted to int and tested for
+ * being negative, which is what lets the comparison survive the
+ * counter wrapping around.  The arguments appear more than once in
+ * the expansion.
+*/
+
+#define time_after(a,b) \
+ (typecheck(unsigned int, a) && \
+ typecheck(unsigned int, b) && \
+ ((int)((b) - (a)) < 0))
+
+#define time_before(a,b) time_after(b,a)
+
+/*
+ * min()/max() macros that also do
+ * strict type-checking.. See the
+ * "unnecessary" pointer comparison.
+ *
+ * Each argument is copied into a local of its own type first, so it
+ * is evaluated exactly once; the result is one of those copies.
+ */
+#define min(x, y) ({ \
+ typeof(x) _min1 = (x); \
+ typeof(y) _min2 = (y); \
+ (void) (&_min1 == &_min2); \
+ _min1 < _min2 ? _min1 : _min2; })
+
+#define max(x, y) ({ \
+ typeof(x) _max1 = (x); \
+ typeof(y) _max2 = (y); \
+ (void) (&_max1 == &_max2); \
+ _max1 > _max2 ? _max1 : _max2; })
+
+#define ARRAY_SIZE(x) (sizeof(x)/sizeof((x)[0])) /* element count of x; x must be a true array, not a pointer */
+
+extern const char Name[]; /* only declared here; the definition lives in another file */
+
+struct md_bb_entry { /* NOTE(review): presumably one bad-block range (start sector + length) -- confirm units of length */
+ unsigned long long sector;
+ int length;
+};
+
+struct md_bb { /* list of md_bb_entry; NOTE(review): presumably count is the number of entries -- confirm */
+ int supported;
+ int count;
+ struct md_bb_entry *entries;
+};
+
+/* general information that might be extracted from a superblock */
+struct mdinfo {
+ mdu_array_info_t array;
+ mdu_disk_info_t disk;
+ __u64 events;
+ int uuid[4];
+ char name[33];
+ unsigned long long data_offset;
+ unsigned long long new_data_offset;
+ unsigned long long component_size; /* same as array.size, except in
+ * sectors and up to 64bits.
+ */
+ unsigned long long custom_array_size; /* size for non-default sized
+ * arrays (in sectors)
+ */
+#define NO_RESHAPE 0
+#define VOLUME_RESHAPE 1
+#define CONTAINER_RESHAPE 2
+#define RESHAPE_NO_BACKUP 16 /* Mask 'or'ed in */
+ int reshape_active;
+ unsigned long long reshape_progress;
+ int recovery_blocked; /* for external metadata it
+ * indicates that there is
+ * reshape in progress in
+ * container,
+ * for native metadata it is
+ * reshape_active field mirror
+ */
+ int journal_device_required;
+ int journal_clean;
+
+ enum {
+ CONSISTENCY_POLICY_UNKNOWN,
+ CONSISTENCY_POLICY_NONE,
+ CONSISTENCY_POLICY_RESYNC,
+ CONSISTENCY_POLICY_BITMAP,
+ CONSISTENCY_POLICY_JOURNAL,
+ CONSISTENCY_POLICY_PPL,
+ } consistency_policy;
+
+ /* During reshape we can sometimes change the data_offset to avoid
+ * over-writing still-valid data. We need to know if there is space.
+ * So getinfo_super will fill in space_before and space_after in sectors.
+ * data_offset can be increased or decreased by this amount.
+ */
+ unsigned long long space_before, space_after;
+ union { /* anonymous union: both members name the same storage */
+ unsigned long long resync_start; /* per-array resync position */
+ unsigned long long recovery_start; /* per-device rebuild position */
+ #define MaxSector (~0ULL) /* resync/recovery complete position */
+ };
+ long bitmap_offset; /* 0 == none, 1 == a file */
+ unsigned int ppl_size;
+ int ppl_offset;
+ unsigned long long ppl_sector;
+ unsigned long safe_mode_delay; /* ms delay to mark clean */
+ int new_level, delta_disks, new_layout, new_chunk;
+ int errors;
+ unsigned long cache_size; /* size of raid456 stripe cache */
+ int mismatch_cnt;
+ char text_version[50];
+
+ int container_member; /* for assembling external-metadata arrays
+ * This is to be used internally by metadata
+ * handler only */
+ int container_enough; /* flag external handlers can set to
+ * indicate that subarrays have not enough (-1),
+ * enough to start (0), or all expected disks (1) */
+ char sys_name[32];
+ struct mdinfo *devs;
+ struct mdinfo *next;
+
+ /* Device info for mdmon: */
+ int recovery_fd;
+ int state_fd;
+ int bb_fd;
+ int ubb_fd;
+ #define DS_FAULTY 1 /* DS_* values are distinct bits; NOTE(review): presumably OR-ed into the *_state fields below -- confirm */
+ #define DS_INSYNC 2
+ #define DS_WRITE_MOSTLY 4
+ #define DS_SPARE 8
+ #define DS_BLOCKED 16
+ #define DS_REMOVE 1024
+ #define DS_UNBLOCK 2048
+ int prev_state, curr_state, next_state;
+
+ /* info read from sysfs */
+ enum {
+ ARRAY_CLEAR,
+ ARRAY_INACTIVE,
+ ARRAY_SUSPENDED,
+ ARRAY_READONLY,
+ ARRAY_READ_AUTO,
+ ARRAY_CLEAN,
+ ARRAY_ACTIVE,
+ ARRAY_WRITE_PENDING,
+ ARRAY_ACTIVE_IDLE,
+ ARRAY_BROKEN,
+ ARRAY_UNKNOWN_STATE,
+ } array_state;
+ struct md_bb bb;
+};
+
+struct createinfo {
+ int uid;
+ int gid;
+ int autof;
+ int mode;
+ int symlinks;
+ int names;
+ int bblist;
+ struct supertype *supertype;
+};
+
+struct spare_criteria {
+ unsigned long long min_size;
+ unsigned int sector_size;
+};
+
+enum mode {
+ ASSEMBLE=1,
+ BUILD,
+ CREATE,
+ MANAGE,
+ MISC,
+ MONITOR,
+ GROW,
+ INCREMENTAL,
+ AUTODETECT,
+ mode_count
+};
+
+extern char short_options[];
+extern char short_bitmap_options[];
+extern char short_bitmap_auto_options[];
+extern struct option long_options[];
+extern char Version[], Usage[], Help[], OptionHelp[],
+ *mode_help[],
+ Help_create[], Help_build[], Help_assemble[], Help_grow[],
+ Help_incr[],
+ Help_manage[], Help_misc[], Help_monitor[], Help_config[];
+
+/* for options that don't have short equivalents, we assign arbitrary
+ * numbers later than any 'short' character option.
+ */
+enum special_options {
+ AssumeClean = 300,
+ BitmapChunk,
+ WriteBehind,
+ ReAdd,
+ NoDegraded,
+ Sparc22,
+ BackupFile,
+ HomeHost,
+ AutoHomeHost,
+ Symlinks,
+ AutoDetect,
+ Waitclean,
+ DetailPlatform,
+ KillSubarray,
+ UpdateSubarray,
+ IncrementalPath,
+ NoSharing,
+ HelpOptions,
+ Brief,
+ NoDevices,
+ ManageOpt,
+ Add,
+ AddSpare,
+ AddJournal,
+ Remove,
+ Fail,
+ Replace,
+ With,
+ MiscOpt,
+ WaitOpt,
+ ConfigFile,
+ ChunkSize,
+ WriteMostly,
+ FailFast,
+ NoFailFast,
+ Layout,
+ Auto,
+ Force,
+ SuperMinor,
+ EMail,
+ ProgramOpt,
+ Increment,
+ Fork,
+ Bitmap,
+ RebuildMapOpt,
+ InvalidBackup,
+ UdevRules,
+ FreezeReshape,
+ Continue,
+ OffRootOpt,
+ Prefer,
+ KillOpt,
+ DataOffset,
+ ExamineBB,
+ Dump,
+ Restore,
+ Action,
+ Nodes,
+ ClusterName,
+ ClusterConfirm,
+ WriteJournal,
+ ConsistencyPolicy,
+};
+
+enum prefix_standard {
+ JEDEC,
+ IEC
+};
+
+enum bitmap_update {
+ NoUpdate,
+ NameUpdate,
+ NodeNumUpdate,
+};
+
+enum flag_mode {
+ FlagDefault, FlagSet, FlagClear,
+};
+
+/* structures read from config file */
+/* List of mddevice names and identifiers
+ * Identifiers can be:
+ * uuid=128-hex-uuid
+ * super-minor=decimal-minor-number-from-superblock
+ * devices=comma,separated,list,of,device,names,with,wildcards
+ *
+ * If multiple fields are present, the intersection of all matching
+ * devices is considered
+ */
+#define UnSet (0xfffe)
+struct mddev_ident {
+ char *devname;
+
+ int uuid_set;
+ int uuid[4];
+ char name[33];
+
+ int super_minor;
+
+ char *devices; /* comma separated list of device
+ * names with wild cards
+ */
+ int level;
+ int raid_disks;
+ int spare_disks;
+ struct supertype *st;
+ int autof; /* 1 for normal, 2 for partitioned */
+ char *spare_group;
+ char *bitmap_file;
+ int bitmap_fd;
+
+ char *container; /* /dev/whatever name of container, or
+ * uuid of container. You would expect
+ * this to be the 'devname' or UUID
+ * of some other entry.
+ */
+ char *member; /* subarray within a container */
+
+ struct mddev_ident *next;
+ union {
+ /* fields needed by different users of this structure */
+ int assembled; /* set when assembly succeeds */
+ };
+};
+
+struct context {
+ int readonly;
+ int runstop;
+ int verbose;
+ int brief;
+ int no_devices;
+ int force;
+ char *homehost;
+ int require_homehost;
+ char *prefer;
+ int export;
+ int test;
+ char *subarray;
+ char *update;
+ int scan;
+ int SparcAdjust;
+ int autof;
+ int delay;
+ int freeze_reshape;
+ char *backup_file;
+ int invalid_backup;
+ char *action;
+ int nodes;
+ char *homecluster;
+};
+
+struct shape {
+ int raiddisks;
+ int sparedisks;
+ int journaldisks;
+ int level;
+ int layout;
+ char *layout_str;
+ int chunk;
+ int bitmap_chunk;
+ char *bitmap_file;
+ int assume_clean;
+ int write_behind;
+ unsigned long long size;
+ int consistency_policy;
+};
+
+/* List of device names - wildcards expanded */
+struct mddev_dev {
+ char *devname;
+ int disposition; /* 'a' for add, 'r' for remove, 'f' for fail,
+ * 'A' for re_add.
+ * Not set for names read from .config
+ */
+ enum flag_mode writemostly;
+ enum flag_mode failfast;
+ int used; /* set when used */
+ long long data_offset;
+ struct mddev_dev *next;
+};
+
+typedef struct mapping {
+ char *name;
+ int num;
+} mapping_t;
+
+struct mdstat_ent {
+ char devnm[32];
+ int active;
+ char *level;
+ char *pattern; /* U for up, _ for down */
+ int percent; /* -1 if no resync */
+ int resync; /* 3 if check, 2 if reshape, 1 if resync, 0 if recovery */
+ int devcnt;
+ int raid_disks;
+ char * metadata_version;
+ struct dev_member {
+ char *name;
+ struct dev_member *next;
+ } *members;
+ struct mdstat_ent *next;
+};
+
+extern struct mdstat_ent *mdstat_read(int hold, int start);
+extern void mdstat_close(void);
+extern void free_mdstat(struct mdstat_ent *ms);
+extern int mdstat_wait(int seconds);
+extern void mdstat_wait_fd(int fd, const sigset_t *sigmask);
+extern int mddev_busy(char *devnm);
+extern struct mdstat_ent *mdstat_by_component(char *name);
+extern struct mdstat_ent *mdstat_by_subdev(char *subdev, char *container);
+
+struct map_ent {
+ struct map_ent *next;
+ char devnm[32];
+ char metadata[20];
+ int uuid[4];
+ int bad;
+ char *path;
+};
+extern int map_update(struct map_ent **mpp, char *devnm, char *metadata,
+ int uuid[4], char *path);
+extern void map_remove(struct map_ent **map, char *devnm);
+extern struct map_ent *map_by_uuid(struct map_ent **map, int uuid[4]);
+extern struct map_ent *map_by_devnm(struct map_ent **map, char *devnm);
+extern void map_free(struct map_ent *map);
+extern struct map_ent *map_by_name(struct map_ent **map, char *name);
+extern void map_read(struct map_ent **melp);
+extern int map_write(struct map_ent *mel);
+extern void map_delete(struct map_ent **mapp, char *devnm);
+extern void map_add(struct map_ent **melp,
+ char *devnm, char *metadata, int uuid[4], char *path);
+extern int map_lock(struct map_ent **melp);
+extern void map_unlock(struct map_ent **melp);
+extern void map_fork(void);
+
+/* various details can be requested; each flag is a distinct single bit */
+enum sysfs_read_flags {
+ GET_LEVEL = (1 << 0),
+ GET_LAYOUT = (1 << 1),
+ GET_COMPONENT = (1 << 2),
+ GET_CHUNK = (1 << 3),
+ GET_CACHE = (1 << 4),
+ GET_MISMATCH = (1 << 5),
+ GET_VERSION = (1 << 6),
+ GET_DISKS = (1 << 7),
+ GET_SAFEMODE = (1 << 9), /* note: bit 8 is not assigned */
+ GET_BITMAP_LOCATION = (1 << 10),
+
+ GET_DEVS = (1 << 20), /* gets role, major, minor */
+ GET_OFFSET = (1 << 21),
+ GET_SIZE = (1 << 22),
+ GET_STATE = (1 << 23),
+ GET_ERROR = (1 << 24),
+ GET_ARRAY_STATE = (1 << 25),
+ GET_CONSISTENCY_POLICY = (1 << 26),
+ GET_DEVS_ALL = (1 << 27),
+};
+
+/* If fd >= 0, get the array it is open on,
+ * else use devnm.
+ */
+extern int sysfs_open(char *devnm, char *devname, char *attr);
+extern int sysfs_init(struct mdinfo *mdi, int fd, char *devnm);
+extern void sysfs_init_dev(struct mdinfo *mdi, dev_t devid);
+extern void sysfs_free(struct mdinfo *sra);
+extern struct mdinfo *sysfs_read(int fd, char *devnm, unsigned long options);
+extern int sysfs_attr_match(const char *attr, const char *str);
+extern int sysfs_match_word(const char *word, char **list);
+extern int sysfs_set_str(struct mdinfo *sra, struct mdinfo *dev,
+ char *name, char *val);
+extern int sysfs_set_num(struct mdinfo *sra, struct mdinfo *dev,
+ char *name, unsigned long long val);
+extern int sysfs_set_num_signed(struct mdinfo *sra, struct mdinfo *dev,
+ char *name, long long val);
+extern int sysfs_uevent(struct mdinfo *sra, char *event);
+extern int sysfs_get_fd(struct mdinfo *sra, struct mdinfo *dev,
+ char *name);
+extern int sysfs_fd_get_ll(int fd, unsigned long long *val);
+extern int sysfs_get_ll(struct mdinfo *sra, struct mdinfo *dev,
+ char *name, unsigned long long *val);
+extern int sysfs_fd_get_two(int fd, unsigned long long *v1, unsigned long long *v2);
+extern int sysfs_get_two(struct mdinfo *sra, struct mdinfo *dev,
+ char *name, unsigned long long *v1, unsigned long long *v2);
+extern int sysfs_fd_get_str(int fd, char *val, int size);
+extern int sysfs_attribute_available(struct mdinfo *sra, struct mdinfo *dev,
+ char *name);
+extern int sysfs_get_str(struct mdinfo *sra, struct mdinfo *dev,
+ char *name, char *val, int size);
+extern int sysfs_set_safemode(struct mdinfo *sra, unsigned long ms);
+extern int sysfs_set_array(struct mdinfo *info, int vers);
+extern int sysfs_add_disk(struct mdinfo *sra, struct mdinfo *sd, int resume);
+extern int sysfs_disk_to_scsi_id(int fd, __u32 *id);
+extern int sysfs_unique_holder(char *devnm, long rdev);
+extern int sysfs_freeze_array(struct mdinfo *sra);
+extern int sysfs_wait(int fd, int *msec);
+extern int load_sys(char *path, char *buf, int len);
+extern int zero_disk_range(int fd, unsigned long long sector, size_t count);
+extern int reshape_prepare_fdlist(char *devname,
+ struct mdinfo *sra,
+ int raid_disks,
+ int nrdisks,
+ unsigned long blocks,
+ char *backup_file,
+ int *fdlist,
+ unsigned long long *offsets);
+extern void reshape_free_fdlist(int *fdlist,
+ unsigned long long *offsets,
+ int size);
+extern int reshape_open_backup_file(char *backup,
+ int fd,
+ char *devname,
+ long blocks,
+ int *fdlist,
+ unsigned long long *offsets,
+ char *sysfs_name,
+ int restart);
+extern unsigned long compute_backup_blocks(int nchunk, int ochunk,
+ unsigned int ndata, unsigned int odata);
+extern char *locate_backup(char *name);
+extern char *make_backup(char *name);
+
+extern int save_stripes(int *source, unsigned long long *offsets,
+ int raid_disks, int chunk_size, int level, int layout,
+ int nwrites, int *dest,
+ unsigned long long start, unsigned long long length,
+ char *buf);
+extern int restore_stripes(int *dest, unsigned long long *offsets,
+ int raid_disks, int chunk_size, int level, int layout,
+ int source, unsigned long long read_offset,
+ unsigned long long start, unsigned long long length,
+ char *src_buf);
+
+#ifndef Sendmail
+#define Sendmail "/usr/lib/sendmail -t"
+#endif
+
+#define SYSLOG_FACILITY LOG_DAEMON
+
+extern char *map_num(mapping_t *map, int num);
+extern int map_name(mapping_t *map, char *name);
+extern mapping_t r0layout[], r5layout[], r6layout[],
+ pers[], modes[], faultylayout[];
+extern mapping_t consistency_policies[], sysfs_array_states[];
+
+extern char *map_dev_preferred(int major, int minor, int create,
+ char *prefer);
+static inline char *map_dev(int major, int minor, int create) /* map_dev_preferred() with a NULL 'prefer' argument */
+{
+ return map_dev_preferred(major, minor, create, NULL);
+}
+
+/**
+ * is_fd_valid() - check file descriptor.
+ * @fd: file descriptor.
+ *
+ * Returns non-zero if @fd is a non-negative integer, zero otherwise. It shall
+ * be used only to verify the result of open().
+ */
+static inline int is_fd_valid(int fd)
+{
+ return (fd > -1);
+}
+
+/**
+ * close_fd() - verify, close and unset file descriptor.
+ * @fd: pointer to file descriptor.
+ *
+ * Closes *@fd if it is valid and sets it to -1 when close() succeeds. An
+ * invalid file descriptor is ignored quietly to simplify error handling.
+ */
+static inline void close_fd(int *fd)
+{
+ if (is_fd_valid(*fd) && close(*fd) == 0)
+ *fd = -1; /* *fd is left unchanged if close() reports an error */
+}
+
+struct active_array;
+struct metadata_update;
+
+/* 'struct reshape' records the intermediate states of
+ * a general reshape.
+ * The starting geometry is converted to the 'before' geometry
+ * by at most an atomic level change. They could be the same.
+ * Similarly the 'after' geometry is converted to the final
+ * geometry by at most a level change.
+ * Note that 'before' and 'after' must have the same level.
+ * 'blocks' is the minimum number of sectors for a reshape unit.
+ * This will be a multiple of the stripe size in each of the
+ * 'before' and 'after' geometries.
+ * If 'blocks' is 0, no restriping is necessary.
+ * 'min_offset_change' is the minimum change to data_offset to
+ * allow the reshape to happen. It is at least the larger of
+ * the old and new chunk sizes, and typically the same as 'blocks'
+ * divided by number of data disks.
+ */
+struct reshape {
+ int level;
+ int parity; /* number of parity blocks/devices */
+ struct {
+ int layout;
+ int data_disks;
+ } before, after;
+ unsigned long long backup_blocks;
+ unsigned long long min_offset_change;
+ unsigned long long stripes; /* number of old stripes that comprise 'blocks'*/
+ unsigned long long new_size; /* New size of array in sectors */
+};
+
+/* A superswitch provides entry point to a metadata handler.
+ *
+ * The superswitch primarily operates on some "metadata" that
+ * is accessed via the 'supertype'.
+ * This metadata has one of three possible sources.
+ * 1/ It is read from a single device. In this case it may not completely
+ * describe the array or arrays as some information might be on other
+ * devices.
+ * 2/ It is read from all devices in a container. In this case all
+ * information is present.
+ * 3/ It is created by ->init_super / ->add_to_super. In this case it will
+ * be complete once enough ->add_to_super calls have completed.
+ *
+ * When creating an array inside a container, the metadata will be
+ * formed by a combination of 2 and 3. The metadata or the array is read,
+ * then new information is added.
+ *
+ * The metadata must sometimes have a concept of a 'current' array
+ * and a 'current' device.
+ * The 'current' array is set by init_super to be the newly created array,
+ * or is set by super_by_fd when it finds it is looking at an array inside
+ * a container.
+ *
+ * The 'current' device is either the device that the metadata was read from
+ * in case 1, or the last device added by add_to_super in case 3.
+ * Case 2 does not identify a 'current' device.
+ */
+extern struct superswitch {
+
+ /* Used to report details of metadata read from a component
+ * device. ->load_super has been called.
+ */
+ void (*examine_super)(struct supertype *st, char *homehost);
+ void (*brief_examine_super)(struct supertype *st, int verbose);
+ void (*brief_examine_subarrays)(struct supertype *st, int verbose);
+ void (*export_examine_super)(struct supertype *st);
+ int (*examine_badblocks)(struct supertype *st, int fd, char *devname);
+ int (*copy_metadata)(struct supertype *st, int from, int to);
+
+ /* Used to report details of an active array.
+ * ->load_super was possibly given a 'component' string.
+ */
+ void (*detail_super)(struct supertype *st, char *homehost,
+ char *subarray);
+ void (*brief_detail_super)(struct supertype *st, char *subarray);
+ void (*export_detail_super)(struct supertype *st);
+
+ /* Optional: platform hardware / firmware details */
+ int (*detail_platform)(int verbose, int enumerate_only, char *controller_path);
+ int (*export_detail_platform)(int verbose, char *controller_path);
+
+ /* Used:
+ * to get the uuid to store in bitmap metadata
+ * and 'reshape' backup-data metadata
+ * To see if a device is being re-added to an array it was part of.
+ */
+ void (*uuid_from_super)(struct supertype *st, int uuid[4]);
+
+ /* Extract generic details from metadata. This could be details about
+ * the container, or about an individual array within the container.
+ * The determination is made either by:
+ * load_super being given a 'component' string.
+ * validate_geometry determining what to create.
+ * The info includes both array information and device information.
+ * The particular device should be:
+ * The last device added by add_to_super
+ * The device the metadata was loaded from by load_super
+ * If 'map' is present, then it is an array raid_disks long
+ * (raid_disk must already be set and correct) and it is filled
+ * with 1 for slots that are thought to be active and 0 for slots which
+ * appear to be failed/missing.
+ * *info is zeroed out before data is added.
+ */
+ void (*getinfo_super)(struct supertype *st, struct mdinfo *info, char *map);
+ struct mdinfo *(*getinfo_super_disks)(struct supertype *st);
+ /* Check if the given metadata is flagged as belonging to "this"
+ * host. 0 for 'no', 1 for 'yes', -1 for "Don't record homehost"
+ */
+ int (*match_home)(struct supertype *st, char *homehost);
+
+ /* Make one of several generic modifications to metadata
+ * prior to assembly (or other times).
+ * sparc2.2 - first bug in early 0.90 metadata
+ * super-minor - change name of 0.90 metadata
+ * summaries - 'correct' any redundant data
+ * resync - mark array as dirty to trigger a resync.
+ * uuid - set new uuid - only 0.90 or 1.x
+ * name - change the name of the array (where supported)
+ * homehost - change which host this array is tied to.
+ * devicesize - If metadata is at start of device, change recorded
+ * device size to match actual device size
+ * byteorder - swap bytes for 0.90 metadata
+ *
+ * force-one - mark that device as uptodate, not old or failed.
+ * force-array - mark array as clean if it would not otherwise
+ * assemble
+ * assemble - not sure how this is different from force-one...
+ * linear-grow-new - add a new device to a linear array, but don't
+ * change the size: so superblock still matches
+ * linear-grow-update - now change the size of the array.
+ * writemostly - set the WriteMostly1 bit in the superblock devflags
+ * readwrite - clear the WriteMostly1 bit in the superblock devflags
+ * failfast - set the FailFast1 bit in the superblock
+ * nofailfast - clear the FailFast1 bit
+ * no-bitmap - clear any record that a bitmap is present.
+ * bbl - add a bad-block-log if possible
+ * no-bbl - remove any bad-block-log if it is empty.
+ * force-no-bbl - remove any bad-block-log even if not empty.
+ * revert-reshape - If a reshape is in progress, modify metadata so
+ * it will resume going in the opposite direction.
+ */
+ int (*update_super)(struct supertype *st, struct mdinfo *info,
+ char *update,
+ char *devname, int verbose,
+ int uuid_set, char *homehost);
+
+ /* Create new metadata for new array as described. This could
+ * be a new container, or an array in a pre-existing container.
+ * Also used to zero metadata prior to writing it to invalidate old
+ * metadata.
+ */
+ int (*init_super)(struct supertype *st, mdu_array_info_t *info,
+ struct shape *s, char *name,
+ char *homehost, int *uuid,
+ unsigned long long data_offset);
+
+ /* update the metadata to include new device, either at create or
+ * when hot-adding a spare.
+ */
+ int (*add_to_super)(struct supertype *st, mdu_disk_info_t *dinfo,
+ int fd, char *devname,
+ unsigned long long data_offset);
+ /* update the metadata to delete a device,
+ * when hot-removing.
+ */
+ int (*remove_from_super)(struct supertype *st, mdu_disk_info_t *dinfo);
+
+ /* Write metadata to one device when fixing problems or adding
+ * a new device.
+ */
+ int (*store_super)(struct supertype *st, int fd);
+
+ /* Write all metadata for this array.
+ */
+ int (*write_init_super)(struct supertype *st);
+ /* Check if metadata read from one device is compatible with an array,
+ * used when assembling an array, or pseudo-assembling as with
+ * "--examine --brief"
+ * If no superblock has yet been loaded into "st", "tst" is
+ * moved in, otherwise the superblock in 'st' is compared with
+ * 'tst'.
+ */
+ int (*compare_super)(struct supertype *st, struct supertype *tst,
+ int verbose);
+ /* Load metadata from a single device. If 'devname' is not NULL
+ * print error messages as appropriate */
+ int (*load_super)(struct supertype *st, int fd, char *devname);
+ /* 'fd' is a 'container' md array - load array metadata from the
+ * whole container.
+ */
+ int (*load_container)(struct supertype *st, int fd, char *devname);
+ /* If 'arg' is a valid name of this metadata type, allocate and
+ * return a 'supertype' for the particular minor version */
+ struct supertype * (*match_metadata_desc)(char *arg);
+ /* If a device has the given size, and the data_offset has been
+ * requested - work out how much space is available for data.
+ * This involves adjusting for reserved space (e.g. bitmaps)
+ * and for any rounding.
+ * 'mdadm' only calls this for existing arrays where a possible
+ * spare is being added. However some super-handlers call it
+ * internally from validate_geometry when creating an array.
+ */
+ __u64 (*avail_size)(struct supertype *st, __u64 size,
+ unsigned long long data_offset);
+ /*
+ * Return spare criteria for array:
+ * - minimum disk size that can be used in the array;
+ * - sector size that can be used in the array.
+ * Return values: 0 - for success and -EINVAL on error.
+ */
+ int (*get_spare_criteria)(struct supertype *st,
+ struct spare_criteria *sc);
+ /* Find somewhere to put a bitmap - possibly auto-size it - and
+ * update the metadata to record this. The array may be newly
+ * created, in which case data_size may be updated, or it might
+ * already exist. Metadata handler can know if init_super
+ * has been called, but not write_init_super.
+ * 0: Success
+ * -Exxxx: On error
+ */
+ int (*add_internal_bitmap)(struct supertype *st, int *chunkp,
+ int delay, int write_behind,
+ unsigned long long size, int may_change, int major);
+ /* Perform additional setup required to activate a bitmap.
+ */
+ int (*set_bitmap)(struct supertype *st, struct mdinfo *info);
+ /* Seek 'fd' to start of write-intent-bitmap. Must be an
+ * md-native format bitmap
+ */
+ int (*locate_bitmap)(struct supertype *st, int fd, int node_num);
+ /* if add_internal_bitmap succeeded for existing array, this
+ * writes it out.
+ */
+ int (*write_bitmap)(struct supertype *st, int fd, enum bitmap_update update);
+ /* Free the superblock and any other allocated data */
+ void (*free_super)(struct supertype *st);
+
+ /* validate_geometry is called with an st returned by
+ * match_metadata_desc.
+ * It should check that the geometry described is compatible with
+ * the metadata type. It will be called repeatedly as devices
+ * added to validate changing size and new devices. If there are
+ * inter-device dependencies, it should record sufficient details
+ * so these can be validated.
+ * Both 'size' and '*freesize' are in sectors. chunk is KiB.
+ * Return value is:
+ * 1: everything is OK
+ * 0: not OK for some reason - if 'verbose', then error was reported.
+ * -1: st->sb was NULL, 'subdev' is a member of a container of this
+ * type, but array is not acceptable for some reason
+ * message was reported even if verbose is 0.
+ */
+ int (*validate_geometry)(struct supertype *st, int level, int layout,
+ int raiddisks,
+ int *chunk, unsigned long long size,
+ unsigned long long data_offset,
+ char *subdev, unsigned long long *freesize,
+ int consistency_policy, int verbose);
+
+ /* Return a linked list of 'mdinfo' structures for all arrays
+ * in the container. For non-containers, it is like
+ * getinfo_super with an allocated mdinfo.*/
+ struct mdinfo *(*container_content)(struct supertype *st, char *subarray);
+ /* query the supertype for default geometry */
+ void (*default_geometry)(struct supertype *st, int *level, int *layout, int *chunk); /* optional */
+ /* Permit subarrays to be deleted from inactive containers */
+ int (*kill_subarray)(struct supertype *st,
+ char *subarray_id); /* optional */
+ /* Permit subarrays to be modified */
+ int (*update_subarray)(struct supertype *st, char *subarray,
+ char *update, struct mddev_ident *ident); /* optional */
+ /* Check if reshape is supported for this external format.
+ * st is obtained from super_by_fd() where st->subarray[0] is
+ * initialized to indicate if reshape is being performed at the
+ * container or subarray level
+ */
+#define APPLY_METADATA_CHANGES 1
+#define ROLLBACK_METADATA_CHANGES 0
+
+ int (*reshape_super)(struct supertype *st,
+ unsigned long long size, int level,
+ int layout, int chunksize, int raid_disks,
+ int delta_disks, char *backup, char *dev,
+ int direction,
+ int verbose); /* optional */
+ int (*manage_reshape)( /* optional */
+ int afd, struct mdinfo *sra, struct reshape *reshape,
+ struct supertype *st, unsigned long blocks,
+ int *fds, unsigned long long *offsets,
+ int dests, int *destfd, unsigned long long *destoffsets);
+
+/* for mdmon */
+ int (*open_new)(struct supertype *c, struct active_array *a,
+ int inst);
+
+ /* Tell the metadata handler the current state of the array.
+ * This covers whether it is known to be consistent (no pending writes)
+ * and how far along a resync is known to have progressed
+ * (in a->resync_start).
+ * resync status is really irrelevant if the array is not consistent,
+ * but some metadata (DDF!) have a place to record the distinction.
+ * If 'consistent' is '2', then the array can mark it dirty if a
+ * resync/recovery/whatever is required, or leave it clean if not.
+ * Return value is 0 if dirty (not consistent) and 1 if clean.
+ * It is only really important if consistent is passed in as '2'.
+ */
+ int (*set_array_state)(struct active_array *a, int consistent);
+
+ /* When the state of a device might have changed, we call set_disk to
+ * tell the metadata what the current state is.
+ * Typically this happens on spare->in_sync and (spare|in_sync)->faulty
+ * transitions.
+ * set_disk might be called when the state of the particular disk has
+ * not in fact changed.
+ */
+ void (*set_disk)(struct active_array *a, int n, int state);
+ void (*sync_metadata)(struct supertype *st);
+ void (*process_update)(struct supertype *st,
+ struct metadata_update *update);
+ /* prepare_update allocates extra memory that might be
+ * needed. If the update cannot be understood, return 0.
+ */
+ int (*prepare_update)(struct supertype *st,
+ struct metadata_update *update);
+
+ /* activate_spare will check if the array is degraded and, if it
+ * is, try to find some spare space in the container.
+ * On success, it adds appropriate updates (for process_update)
+ * to the 'updates' list and returns a list of 'mdinfo' identifying
+ * the device, or devices as there might be multiple missing
+ * devices and multiple spares available.
+ */
+ struct mdinfo *(*activate_spare)(struct active_array *a,
+ struct metadata_update **updates);
+ /*
+ * Return statically allocated string that represents metadata specific
+ * controller domain of the disk. The domain is used in disk domain
+ * matching functions. Disks belong to the same domain if they have
+ * the same domain from mdadm.conf and belong to the same metadata domain.
+ * Returning NULL or not providing this handler means that metadata
+ * does not distinguish the differences between disks that belong to
+ * different controllers. They are in the domain specified by
+ * configuration file (mdadm.conf).
+ * In case when the metadata has the notion of domains based on disk
+ * it shall return NULL for disks that do not belong to a controller
+ * of the supported domains. Such disks will form another domain and won't
+ * be mixed with supported ones.
+ */
+ const char *(*get_disk_controller_domain)(const char *path);
+
+ /* for external backup area */
+ int (*recover_backup)(struct supertype *st, struct mdinfo *info);
+
+ /* validate container after assemble */
+ int (*validate_container)(struct mdinfo *info);
+
+ /* write initial empty PPL on device */
+ int (*write_init_ppl)(struct supertype *st, struct mdinfo *info, int fd);
+
+ /* validate ppl before assemble */
+ int (*validate_ppl)(struct supertype *st, struct mdinfo *info,
+ struct mdinfo *disk);
+
+ /* records new bad block in metadata */
+ int (*record_bad_block)(struct active_array *a, int n,
+ unsigned long long sector, int length);
+
+ /* clears bad block from metadata */
+ int (*clear_bad_block)(struct active_array *a, int n,
+ unsigned long long sector, int length);
+
+ /* get list of bad blocks from metadata */
+ struct md_bb *(*get_bad_blocks)(struct active_array *a, int n);
+
+ int swapuuid; /* true if uuid is big-endian rather than host-endian */
+ int external;
+ const char *name; /* canonical metadata name */
+} *superlist[];
+
+extern struct superswitch super0, super1;
+extern struct superswitch super_imsm, super_ddf;
+extern struct superswitch mbr, gpt;
+
+struct metadata_update {
+ int len;
+ char *buf;
+ void *space; /* allocated space that monitor will use */
+ void **space_list; /* list of allocated spaces that monitor can
+ * use or that it returned.
+ */
+ struct metadata_update *next;
+};
+
+/* A supertype holds a particular collection of metadata.
+ * It identifies the metadata type by the superswitch, and the particular
+ * sub-version of that metadata type.
+ * metadata read in or created is stored in 'sb' and 'info'.
+ * There are also fields used by mdmon to track containers.
+ *
+ * A supertype may refer to:
+ * Just an array, possibly in a container
+ * A container, not identifying any particular array
+ * Info read from just one device, not yet fully describing the array/container.
+ *
+ *
+ * A supertype is created by:
+ * super_by_fd
+ * guess_super
+ * dup_super
+ */
+struct supertype {
+ struct superswitch *ss;
+ int minor_version;
+ int max_devs;
+ char container_devnm[32]; /* devnm of container */
+ void *sb;
+ void *info;
+ void *other; /* Hack used to convert v0.90 to v1.0 */
+ unsigned long long devsize;
+ unsigned long long data_offset; /* used by v1.x only */
+ int ignore_hw_compat; /* used to inform metadata handlers that they should ignore
+ HW/firmware related incompatibility when loading metadata.
+ Used when examining metadata to display content of disk
+ when user has no hw/firmware compatible system.
+ */
+ struct metadata_update *updates;
+ struct metadata_update **update_tail;
+
+ /* extra stuff used by mdmon */
+ struct active_array *arrays;
+ int sock; /* listen to external programs */
+ char devnm[32]; /* e.g. md0. This appears in metadata_version:
+ * external:/md0/12
+ */
+ int devcnt;
+ int retry_soon;
+ int nodes;
+ char *cluster_name;
+
+ struct mdinfo *devs;
+
+};
+
+extern struct supertype *super_by_fd(int fd, char **subarray);
+enum guess_types { guess_any, guess_array, guess_partitions };
+extern struct supertype *guess_super_type(int fd, enum guess_types guess_type);
+static inline struct supertype *guess_super(int fd) { /* guess_super_type() with guess_any */
+ return guess_super_type(fd, guess_any);
+}
+extern struct supertype *dup_super(struct supertype *st);
+extern int get_dev_size(int fd, char *dname, unsigned long long *sizep);
+extern int get_dev_sector_size(int fd, char *dname, unsigned int *sectsizep);
+extern int must_be_container(int fd);
+extern int dev_size_from_id(dev_t id, unsigned long long *size);
+extern int dev_sector_size_from_id(dev_t id, unsigned int *size);
+void wait_for(char *dev, int fd);
+
+/*
+ * Data structures for policy management.
+ * Each device can have a policy structure that lists
+ * various name/value pairs each possibly with a metadata associated.
+ * The policy list is sorted by name/value/metadata
+ */
+struct dev_policy {
+ struct dev_policy *next;
+ char *name; /* None of these strings are allocated. They are
+ * all just references to strings which are known
+ * to exist elsewhere.
+ * name and metadata can be compared by address equality.
+ */
+ const char *metadata;
+ const char *value;
+};
+
+extern char pol_act[], pol_domain[], pol_metadata[], pol_auto[];
+
+/* Iterate over the sublist starting at 'list' whose entries have the
+ * same 'name' as 'list' and match the given metadata (a NULL metadata,
+ * on either side, matches anything). Arguments are evaluated repeatedly.
+ */
+#define pol_for_each(item, list, _metadata) \
+ for ((item) = (list); \
+ (item) && (item)->name == (list)->name; \
+ (item) = (item)->next) \
+ if (!(!(_metadata) || !(item)->metadata || (_metadata) == (item)->metadata)) \
+ ; else
+
+/*
+ * policy records read from mdadm are largely just name-value pairs.
+ * The names are constants, not strdupped
+ */
+struct pol_rule {
+ struct pol_rule *next;
+ char *type; /* rule_policy or rule_part */
+ struct rule {
+ struct rule *next;
+ char *name;
+ char *value;
+ char *dups; /* duplicates of 'value' with a partNN appended */
+ } *rule;
+};
+
+extern char rule_policy[], rule_part[];
+extern char rule_path[], rule_type[];
+extern char type_part[], type_disk[];
+
+extern void policyline(char *line, char *type);
+extern void policy_add(char *type, ...);
+extern void policy_free(void);
+
+extern struct dev_policy *path_policy(char **paths, char *type);
+extern struct dev_policy *disk_policy(struct mdinfo *disk);
+extern struct dev_policy *devid_policy(int devid);
+extern void dev_policy_free(struct dev_policy *p);
+
+//extern void pol_new(struct dev_policy **pol, char *name, char *val, char *metadata);
+extern void pol_add(struct dev_policy **pol, char *name, char *val, char *metadata);
+extern struct dev_policy *pol_find(struct dev_policy *pol, char *name);
+
+enum policy_action {
+ act_default,
+ act_include,
+ act_re_add,
+ act_spare, /* This only applies to bare devices */
+ act_spare_same_slot, /* This allows non-bare devices,
+ * but only after a recent removal */
+ act_force_spare, /* This allows non-bare devices in any case */
+ act_err
+};
+
+extern int policy_action_allows(struct dev_policy *plist, const char *metadata,
+ enum policy_action want);
+extern int disk_action_allows(struct mdinfo *disk, const char *metadata,
+ enum policy_action want);
+
+struct domainlist {
+ struct domainlist *next;
+ const char *dom;
+};
+
+extern int domain_test(struct domainlist *dom, struct dev_policy *pol,
+ const char *metadata);
+extern struct domainlist *domain_from_array(struct mdinfo *mdi,
+ const char *metadata);
+extern void domainlist_add_dev(struct domainlist **dom, int devid,
+ const char *metadata);
+extern void domain_free(struct domainlist *dl);
+extern void domain_merge(struct domainlist **domp, struct dev_policy *pol,
+ const char *metadata);
+void domain_add(struct domainlist **domp, char *domain);
+
+extern void policy_save_path(char *id_path, struct map_ent *array);
+extern int policy_check_path(struct mdinfo *disk, struct map_ent *array);
+
+extern void sysfs_rules_apply(char *devnm, struct mdinfo *dev);
+extern void sysfsline(char *line);
+
+#if __GNUC__ < 3
+struct stat64;
+#endif
+
+#define HAVE_NFTW we assume
+#define HAVE_FTW
+
+#ifdef __UCLIBC__
+# include <features.h>
+# ifndef __UCLIBC_HAS_LFS__
+# define lseek64 lseek
+# endif
+# ifndef __UCLIBC_HAS_FTW__
+# undef HAVE_FTW
+# undef HAVE_NFTW
+# endif
+#endif
+
+#ifdef __dietlibc__
+# undef HAVE_NFTW
+#endif
+
+#if defined(__KLIBC__)
+# undef HAVE_NFTW
+# undef HAVE_FTW
+#endif
+
+#ifndef HAVE_NFTW
+# define FTW_PHYS 1
+# ifndef HAVE_FTW
+ struct FTW {};
+# endif
+#endif
+
+#ifdef HAVE_FTW
+# include <ftw.h>
+#endif
+
+extern int add_dev(const char *name, const struct stat *stb, int flag, struct FTW *s);
+
+extern int Manage_ro(char *devname, int fd, int readonly);
+extern int Manage_run(char *devname, int fd, struct context *c);
+extern int Manage_stop(char *devname, int fd, int quiet,
+ int will_retry);
+extern int Manage_subdevs(char *devname, int fd,
+ struct mddev_dev *devlist, int verbose, int test,
+ char *update, int force);
+extern int autodetect(void);
+extern int Grow_Add_device(char *devname, int fd, char *newdev);
+extern int Grow_addbitmap(char *devname, int fd,
+ struct context *c, struct shape *s);
+extern int Grow_reshape(char *devname, int fd,
+ struct mddev_dev *devlist,
+ unsigned long long data_offset,
+ struct context *c, struct shape *s);
+extern int Grow_restart(struct supertype *st, struct mdinfo *info,
+ int *fdlist, int cnt, char *backup_file, int verbose);
+extern int Grow_continue(int mdfd, struct supertype *st,
+ struct mdinfo *info, char *backup_file,
+ int forked, int freeze_reshape);
+extern int Grow_consistency_policy(char *devname, int fd,
+ struct context *c, struct shape *s);
+
+extern int restore_backup(struct supertype *st,
+ struct mdinfo *content,
+ int working_disks,
+ int spares,
+ char **backup_filep,
+ int verbose);
+extern int Grow_continue_command(char *devname, int fd,
+ char *backup_file, int verbose);
+
+extern int Assemble(struct supertype *st, char *mddev,
+ struct mddev_ident *ident,
+ struct mddev_dev *devlist,
+ struct context *c);
+
+extern int Build(char *mddev, struct mddev_dev *devlist,
+ struct shape *s, struct context *c);
+
+extern int Create(struct supertype *st, char *mddev,
+ char *name, int *uuid,
+ int subdevs, struct mddev_dev *devlist,
+ struct shape *s,
+ struct context *c,
+ unsigned long long data_offset);
+
+extern int Detail(char *dev, struct context *c);
+extern int Detail_Platform(struct superswitch *ss, int scan, int verbose, int export, char *controller_path);
+extern int Query(char *dev);
+extern int ExamineBadblocks(char *devname, int brief, struct supertype *forcest);
+extern int Examine(struct mddev_dev *devlist, struct context *c,
+ struct supertype *forcest);
+extern int Monitor(struct mddev_dev *devlist,
+ char *mailaddr, char *alert_cmd,
+ struct context *c,
+ int daemonise, int oneshot,
+ int dosyslog, char *pidfile, int increments,
+ int share);
+
+extern int Kill(char *dev, struct supertype *st, int force, int verbose, int noexcl);
+extern int Kill_subarray(char *dev, char *subarray, int verbose);
+extern int Update_subarray(char *dev, char *subarray, char *update, struct mddev_ident *ident, int quiet);
+extern int Wait(char *dev);
+extern int WaitClean(char *dev, int verbose);
+extern int SetAction(char *dev, char *action);
+
+extern int Incremental(struct mddev_dev *devlist, struct context *c,
+ struct supertype *st);
+extern void RebuildMap(void);
+extern int IncrementalScan(struct context *c, char *devnm);
+extern int IncrementalRemove(char *devname, char *path, int verbose);
+extern int CreateBitmap(char *filename, int force, char uuid[16],
+ unsigned long chunksize, unsigned long daemon_sleep,
+ unsigned long write_behind,
+ unsigned long long array_size,
+ int major);
+extern int ExamineBitmap(char *filename, int brief, struct supertype *st);
+extern int IsBitmapDirty(char *filename);
+extern int Write_rules(char *rule_name);
+extern int bitmap_update_uuid(int fd, int *uuid, int swap);
+
+/* calculate the number of bitmap bits: array_size * 512 divided by the bitmap chunksize, rounded up */
+static inline unsigned long long
+bitmap_bits(unsigned long long array_size, unsigned long chunksize)
+{
+ return (array_size * 512 + chunksize - 1) / chunksize; /* NOTE(review): presumably array_size is in 512-byte sectors and chunksize in bytes - confirm */
+}
+
+extern int Dump_metadata(char *dev, char *dir, struct context *c,
+ struct supertype *st);
+extern int Restore_metadata(char *dev, char *dir, struct context *c,
+ struct supertype *st, int only);
+
+int md_array_valid(int fd);
+int md_array_active(int fd);
+int md_array_is_active(struct mdinfo *info);
+int md_get_array_info(int fd, struct mdu_array_info_s *array);
+int md_set_array_info(int fd, struct mdu_array_info_s *array);
+int md_get_disk_info(int fd, struct mdu_disk_info_s *disk);
+extern int get_linux_version(void);
+extern int mdadm_version(char *version);
+extern unsigned long long parse_size(char *size);
+extern int parse_uuid(char *str, int uuid[4]);
+extern int is_near_layout_10(int layout);
+extern int parse_layout_10(char *layout);
+extern int parse_layout_faulty(char *layout);
+extern int parse_num(int *dest, char *num);
+extern int parse_cluster_confirm_arg(char *inp, char **devname, int *slot);
+extern int check_ext2(int fd, char *name);
+extern int check_reiser(int fd, char *name);
+extern int check_raid(int fd, char *name);
+extern int check_partitions(int fd, char *dname,
+ unsigned long long freesize,
+ unsigned long long size);
+extern int fstat_is_blkdev(int fd, char *devname, dev_t *rdev);
+extern int stat_is_blkdev(char *devname, dev_t *rdev);
+
+extern bool is_dev_alive(char *path);
+extern int get_mdp_major(void);
+extern int get_maj_min(char *dev, int *major, int *minor);
+extern int dev_open(char *dev, int flags);
+extern int open_dev(char *devnm);
+extern void reopen_mddev(int mdfd);
+extern int open_dev_flags(char *devnm, int flags);
+extern int open_dev_excl(char *devnm);
+extern int is_standard(char *dev, int *nump);
+extern int same_dev(char *one, char *two);
+extern int compare_paths (char* path1,char* path2);
+extern void enable_fds(int devices);
+extern void manage_fork_fds(int close_all);
+extern int continue_via_systemd(char *devnm, char *service_name);
+
+extern int parse_auto(char *str, char *msg, int config);
+extern struct mddev_ident *conf_get_ident(char *dev);
+extern struct mddev_dev *conf_get_devs(void);
+extern int conf_test_dev(char *devname);
+extern int conf_test_metadata(const char *version, struct dev_policy *pol, int is_homehost);
+extern struct createinfo *conf_get_create_info(void);
+extern void set_conffile(char *file);
+extern char *conf_get_mailaddr(void);
+extern char *conf_get_mailfrom(void);
+extern char *conf_get_program(void);
+extern char *conf_get_homehost(int *require_homehostp);
+extern char *conf_get_homecluster(void);
+extern int conf_get_monitor_delay(void);
+extern char *conf_line(FILE *file);
+extern char *conf_word(FILE *file, int allow_key);
+extern void print_quoted(char *str);
+extern void print_escape(char *str);
+extern int use_udev(void);
+extern unsigned long GCD(unsigned long a, unsigned long b);
+extern int conf_name_is_free(char *name);
+extern int conf_verify_devnames(struct mddev_ident *array_list);
+extern int devname_matches(char *name, char *match);
+extern struct mddev_ident *conf_match(struct supertype *st,
+ struct mdinfo *info,
+ char *devname,
+ int verbose, int *rvp);
+
+extern void free_line(char *line);
+extern int match_oneof(char *devices, char *devname);
+extern void uuid_from_super(int uuid[4], mdp_super_t *super);
+extern const int uuid_zero[4];
+extern int same_uuid(int a[4], int b[4], int swapuuid);
+extern void copy_uuid(void *a, int b[4], int swapuuid);
+extern char *__fname_from_uuid(int id[4], int swap, char *buf, char sep);
+extern char *fname_from_uuid(struct supertype *st,
+ struct mdinfo *info, char *buf, char sep);
+extern unsigned long calc_csum(void *super, int bytes);
+extern int enough(int level, int raid_disks, int layout, int clean,
+ char *avail);
+extern int ask(char *mesg);
+extern unsigned long long get_component_size(int fd);
+extern void remove_partitions(int fd);
+extern int test_partition(int fd);
+extern int test_partition_from_id(dev_t id);
+extern int get_data_disks(int level, int layout, int raid_disks);
+extern unsigned long long calc_array_size(int level, int raid_disks, int layout,
+ int chunksize, unsigned long long devsize);
+extern int flush_metadata_updates(struct supertype *st);
+extern void append_metadata_update(struct supertype *st, void *buf, int len);
+extern int assemble_container_content(struct supertype *st, int mdfd,
+ struct mdinfo *content,
+ struct context *c,
+ char *chosen_name, int *result);
+#define INCR_NO 1
+#define INCR_UNSAFE 2
+#define INCR_ALREADY 4
+#define INCR_YES 8
+extern struct mdinfo *container_choose_spares(struct supertype *st,
+ struct spare_criteria *criteria,
+ struct domainlist *domlist,
+ char *spare_group,
+ const char *metadata, int get_one);
+extern int move_spare(char *from_devname, char *to_devname, dev_t devid);
+extern int add_disk(int mdfd, struct supertype *st,
+ struct mdinfo *sra, struct mdinfo *info);
+extern int remove_disk(int mdfd, struct supertype *st,
+ struct mdinfo *sra, struct mdinfo *info);
+extern int hot_remove_disk(int mdfd, unsigned long dev, int force);
+extern int sys_hot_remove_disk(int statefd, int force);
+extern int set_array_info(int mdfd, struct supertype *st, struct mdinfo *info);
+unsigned long long min_recovery_start(struct mdinfo *array);
+
+extern char *human_size(long long bytes);
+extern char *human_size_brief(long long bytes, int prefix);
+extern void print_r10_layout(int layout);
+
+extern char *find_free_devnm(int use_partitions);
+
+extern void put_md_name(char *name);
+extern char *devid2kname(dev_t devid);
+extern char *devid2devnm(dev_t devid);
+extern dev_t devnm2devid(char *devnm);
+extern char *get_md_name(char *devnm);
+
+extern char DefaultConfFile[];
+
+extern int create_mddev(char *dev, char *name, int autof, int trustworthy,
+ char *chosen, int block_udev);
+/* values for 'trustworthy' */
+#define LOCAL 1
+#define LOCAL_ANY 10
+#define FOREIGN 2
+#define METADATA 3
+extern int open_mddev(char *dev, int report_errors);
+extern int open_container(int fd);
+extern int metadata_container_matches(char *metadata, char *devnm);
+extern int metadata_subdev_matches(char *metadata, char *devnm);
+extern int is_container_member(struct mdstat_ent *ent, char *devname);
+extern int is_subarray_active(char *subarray, char *devname);
+extern int open_subarray(char *dev, char *subarray, struct supertype *st, int quiet);
+extern struct superswitch *version_to_superswitch(char *vers);
+
+extern int mdmon_running(char *devnm);
+extern int mdmon_pid(char *devnm);
+extern int check_env(char *name);
+extern __u32 random32(void);
+extern void random_uuid(__u8 *buf);
+extern int start_mdmon(char *devnm);
+
+extern int child_monitor(int afd, struct mdinfo *sra, struct reshape *reshape,
+ struct supertype *st, unsigned long stripes,
+ int *fds, unsigned long long *offsets,
+ int dests, int *destfd, unsigned long long *destoffsets);
+void abort_reshape(struct mdinfo *sra);
+
+void *super1_make_v0(struct supertype *st, struct mdinfo *info, mdp_super_t *sb0);
+
+extern char *stat2kname(struct stat *st);
+extern char *fd2kname(int fd);
+extern char *stat2devnm(struct stat *st);
+extern char *fd2devnm(int fd);
+extern void udev_block(char *devnm);
+extern void udev_unblock(void);
+
+extern int in_initrd(void);
+
+struct cmap_hooks {
+ void *cmap_handle; /* corosync lib related */
+
+ int (*initialize)(cmap_handle_t *handle);
+ int (*get_string)(cmap_handle_t handle,
+ const char *string,
+ char **name);
+ int (*finalize)(cmap_handle_t handle);
+};
+
+extern void set_cmap_hooks(void);
+extern void set_hooks(void);
+
+struct dlm_hooks {
+ void *dlm_handle; /* dlm lib related */
+
+ dlm_lshandle_t (*create_lockspace)(const char *name,
+ unsigned int mode);
+ dlm_lshandle_t (*open_lockspace)(const char *name);
+ int (*release_lockspace)(const char *name, dlm_lshandle_t ls,
+ int force);
+ int (*ls_lock)(dlm_lshandle_t lockspace, uint32_t mode,
+ struct dlm_lksb *lksb, uint32_t flags,
+ const void *name, unsigned int namelen,
+ uint32_t parent, void (*astaddr) (void *astarg),
+ void *astarg, void (*bastaddr) (void *astarg),
+ void *range);
+ int (*ls_unlock_wait)(dlm_lshandle_t lockspace, uint32_t lkid,
+ uint32_t flags, struct dlm_lksb *lksb);
+ int (*ls_get_fd)(dlm_lshandle_t ls);
+ int (*dispatch)(int fd);
+};
+
+extern int get_cluster_name(char **name);
+extern int dlm_funs_ready(void);
+extern int cluster_get_dlmlock(void);
+extern int cluster_release_dlmlock(void);
+extern void set_dlm_hooks(void);
+
+#define _ROUND_UP(val, base) (((val) + (base) - 1) & ~((base) - 1)) /* base must be a power of two; arguments are evaluated more than once */
+#define ROUND_UP(val, base) _ROUND_UP(val, (typeof(val))(base)) /* casts base to val's type so the mask is computed in that type */
+#define ROUND_UP_PTR(ptr, base) ((typeof(ptr)) \
+ (ROUND_UP((unsigned long)(ptr), base)))
+
+static inline int is_subarray(char *vers)
+{
+ /* The version string for a 'subarray' (an array in a container)
+ * is
+ * /containername/componentname for normal read-write arrays
+ * -containername/componentname for arrays which mdmon must not
+ * reconfigure. They might be read-only
+ * or might be undergoing reshape etc.
+ * containername is e.g. md0, md_d1
+ * componentname is dependent on the metadata. e.g. '1' 'S1' ...
+ */
+ return (*vers == '/' || *vers == '-');
+}
+
+static inline char *to_subarray(struct mdstat_ent *ent, char *container)
+{
+ return &ent->metadata_version[10+strlen(container)+1]; /* skips 10 characters (presumably "external:" plus the leading '/' or '-'), the container name and one separator - TODO confirm against callers */
+}
+
+#ifdef DEBUG
+#define dprintf(fmt, arg...) \
+ fprintf(stderr, "%s: %s: "fmt, Name, __func__, ##arg)
+#define dprintf_cont(fmt, arg...) \
+ fprintf(stderr, fmt, ##arg)
+#else
+#define dprintf(fmt, arg...) \
+ ({ if (0) fprintf(stderr, "%s: %s: " fmt, Name, __func__, ##arg); 0; })
+#define dprintf_cont(fmt, arg...) \
+ ({ if (0) fprintf(stderr, fmt, ##arg); 0; })
+#endif
+#include <assert.h>
+#include <stdarg.h>
+static inline int xasprintf(char **strp, const char *fmt, ...) { /* vasprintf() wrapper that asserts the call succeeded */
+ va_list ap;
+ int ret;
+ va_start(ap, fmt);
+ ret = vasprintf(strp, fmt, ap);
+ va_end(ap);
+ assert(ret >= 0); /* a failed vasprintf() aborts here; the check is compiled out if NDEBUG is defined */
+ return ret;
+}
+
+#ifdef DEBUG
+#define pr_err(fmt, args...) fprintf(stderr, "%s: %s: "fmt, Name, __func__, ##args)
+#else
+#define pr_err(fmt, args...) fprintf(stderr, "%s: "fmt, Name, ##args)
+#endif
+#define cont_err(fmt ...) fprintf(stderr, " " fmt)
+
+void *xmalloc(size_t len);
+void *xrealloc(void *ptr, size_t len);
+void *xcalloc(size_t num, size_t size);
+char *xstrdup(const char *str);
+
+#define LEVEL_MULTIPATH (-4)
+#define LEVEL_LINEAR (-1)
+#define LEVEL_FAULTY (-5)
+
+/* kernel module doesn't know about these */
+#define LEVEL_CONTAINER (-100)
+#define LEVEL_UNSUPPORTED (-200)
+
+/* the kernel does know about this one ... */
+#define LEVEL_NONE (-1000000)
+
+/* faulty stuff */
+
+#define WriteTransient 0
+#define ReadTransient 1
+#define WritePersistent 2
+#define ReadPersistent 3
+#define WriteAll 4 /* doesn't go to device */
+#define ReadFixable 5
+#define Modes 6
+
+#define ClearErrors 31
+#define ClearFaults 30
+
+#define AllPersist 100 /* internal use only */
+#define NoPersist 101
+
+#define ModeMask 0x1f
+#define ModeShift 5
+
+#ifdef __TINYC__
+#undef minor
+#undef major
+#undef makedev
+#define minor(x) ((x)&0xff)
+#define major(x) (((x)>>8)&0xff)
+#define makedev(M,m) (((M)<<8) | (m))
+#endif
+
+enum r0layout {
+ RAID0_ORIG_LAYOUT = 1,
+ RAID0_ALT_MULTIZONE_LAYOUT = 2,
+};
+
+/* for raid4/5/6 */
+#define ALGORITHM_LEFT_ASYMMETRIC 0
+#define ALGORITHM_RIGHT_ASYMMETRIC 1
+#define ALGORITHM_LEFT_SYMMETRIC 2
+#define ALGORITHM_RIGHT_SYMMETRIC 3
+
+/* Define non-rotating (raid4) algorithms. These allow
+ * conversion of raid4 to raid5.
+ */
+#define ALGORITHM_PARITY_0 4 /* P or P,Q are initial devices */
+#define ALGORITHM_PARITY_N 5 /* P or P,Q are final devices. */
+
+/* DDF RAID6 layouts differ from md/raid6 layouts in two ways.
+ * Firstly, the exact positioning of the parity block is slightly
+ * different between the 'LEFT_*' modes of md and the "_N_*" modes
+ * of DDF.
+ * Secondly, the order of datablocks over which the Q syndrome is computed
+ * is different.
+ * Consequently we have different layouts for DDF/raid6 than md/raid6.
+ * These layouts are from the DDFv1.2 spec.
+ * Interestingly DDFv1.2-Errata-A does not specify N_CONTINUE but
+ * leaves RLQ=3 as 'Vendor Specific'
+ */
+
+#define ALGORITHM_ROTATING_ZERO_RESTART 8 /* DDF PRL=6 RLQ=1 */
+#define ALGORITHM_ROTATING_N_RESTART 9 /* DDF PRL=6 RLQ=2 */
+#define ALGORITHM_ROTATING_N_CONTINUE 10 /*DDF PRL=6 RLQ=3 */
+
+/* For every RAID5 algorithm we define a RAID6 algorithm
+ * with exactly the same layout for data and parity, and
+ * with the Q block always on the last device (N-1).
+ * This allows trivial conversion from RAID5 to RAID6
+ */
+#define ALGORITHM_LEFT_ASYMMETRIC_6 16
+#define ALGORITHM_RIGHT_ASYMMETRIC_6 17
+#define ALGORITHM_LEFT_SYMMETRIC_6 18
+#define ALGORITHM_RIGHT_SYMMETRIC_6 19
+#define ALGORITHM_PARITY_0_6 20
+#define ALGORITHM_PARITY_N_6 ALGORITHM_PARITY_N
+
+/* Define PATH_MAX in case we don't use glibc or standard library does
+ * not have PATH_MAX defined. Assume max path length is 4K characters.
+ */
+#ifndef PATH_MAX
+#define PATH_MAX 4096
+#endif
+
+#define RESYNC_NONE (-1) /* parenthesized like the LEVEL_* values so the sign cannot bind to neighbouring tokens */
+#define RESYNC_DELAYED (-2)
+#define RESYNC_PENDING (-3)
+#define RESYNC_REMOTE (-4)
+#define RESYNC_UNKNOWN (-5)
+
+/* When using "GET_DISK_INFO" it isn't certain how high
+ * we need to check. So we impose an absolute limit of
+ * MAX_DISKS. This needs to be much more than the largest
+ * number of devices any metadata can support. Currently
+ * v1.x can support 1920
+ */
+#define MAX_DISKS 4096
+
+/* Sometimes the 'size' value passed needs to mean "Maximum".
+ * In those cases we use MAX_SIZE
+ */
+#define MAX_SIZE 1
+
+/* We want to use unsigned numbers for sector counts, but need
+ * a value for 'invalid'. Use '1'.
+ */
+#define INVALID_SECTORS 1
+/* And another special number needed for --data_offset=variable */
+#define VARIABLE_OFFSET 3
+
+/**
+ * This is true for native and DDF, IMSM allows 16.
+ */
+#define MD_NAME_MAX 32
diff --git a/mdadm.spec b/mdadm.spec
new file mode 100644
index 0000000..1b7c6bd
--- /dev/null
+++ b/mdadm.spec
@@ -0,0 +1,47 @@
+Summary: mdadm is used for controlling Linux md devices (aka RAID arrays)
+Name: mdadm
+Version: 4.2
+Release: 1
+Source: https://www.kernel.org/pub/linux/utils/raid/mdadm/mdadm-%{version}.tar.gz
+URL: https://neil.brown.name/blog/mdadm
+License: GPL
+Group: Utilities/System
+BuildRoot: %{_tmppath}/%{name}-root
+Obsoletes: mdctl
+
+%description
+mdadm is a program that can be used to create, manage, and monitor
+Linux MD (Software RAID) devices.
+
+%prep
+%setup -q
+# we want to install in /sbin, not /usr/sbin...
+%define _exec_prefix %{nil}
+
+%build
+# This is a debatable issue. The author of this RPM spec file feels that
+# people who install RPMs (especially given that the default RPM options
+# will strip the binary) are not going to be running gdb against the
+# program.
+make CXFLAGS="$RPM_OPT_FLAGS" SYSCONFDIR="%{_sysconfdir}"
+
+%install
+make DESTDIR=$RPM_BUILD_ROOT MANDIR=%{_mandir} BINDIR=%{_sbindir} install
+install -D -m644 mdadm.conf-example $RPM_BUILD_ROOT/%{_sysconfdir}/mdadm.conf
+
+%clean
+rm -rf $RPM_BUILD_ROOT
+
+%files
+%defattr(-,root,root)
+%doc TODO ChangeLog mdadm.conf-example COPYING
+%{_sbindir}/mdadm
+%{_sbindir}/mdmon
+/usr/lib/udev/rules.d/01-md-raid-creating.rules
+/usr/lib/udev/rules.d/63-md-raid-arrays.rules
+/usr/lib/udev/rules.d/64-md-raid-assembly.rules
+/usr/lib/udev/rules.d/69-md-clustered-confirm-device.rules
+%config(noreplace,missingok)/%{_sysconfdir}/mdadm.conf
+%{_mandir}/man*/md*
+
+%changelog
diff --git a/mdmon-design.txt b/mdmon-design.txt
new file mode 100644
index 0000000..f09184a
--- /dev/null
+++ b/mdmon-design.txt
@@ -0,0 +1,146 @@
+
+When managing a RAID array which uses metadata other than the
+"native" metadata understood by the kernel, mdadm makes use of a
+partner program named 'mdmon' to manage some aspects of updating
+that metadata and synchronising the metadata with the array state.
+
+This document provides some details on how mdmon works.
+
+Containers
+----------
+
+As background: mdadm makes a distinction between an 'array' and a
+'container'. Other sources sometimes use the term 'volume' or
+'device' for an 'array', and may use the term 'array' for a
+'container'.
+
+For our purposes:
+ - a 'container' is a collection of devices which are described by a
+ single set of metadata. The metadata may be stored equally
+ on all devices, or different devices may have quite different
+ subsets of the total metadata. But there is conceptually one set
+ of metadata that unifies the devices.
+
+ - an 'array' is a set of data blocks from various devices which
+ together are used to present the abstraction of a single linear
+ sequence of blocks, which may provide data redundancy or enhanced
+ performance.
+
+So a container has some metadata and provides a number of arrays which
+are described by that metadata.
+
+Sometimes this model doesn't work perfectly. For example, global
+spares may have their own metadata which is quite different from the
+metadata from any device that participates in one or more arrays.
+Such a global spare might still need to belong to some container so
+that it is available to be used should a failure arise. In that case
+we consider the 'metadata' to be the union of the metadata on the
+active devices which describes the arrays, and the metadata on the
+global spares which only describes the spares. In this case different
+devices in the one container will have quite different metadata.
+
+
+Purpose
+-------
+
+The main purpose of mdmon is to update the metadata in response to
+changes to the array which need to be reflected in the metadata before
+future writes to the array can safely be performed.
+These include:
+ - transitions from 'clean' to 'dirty'.
+ - recording that devices have failed.
+ - recording the progress of a 'reshape'
+
+This requires mdmon to be running at any time that the array is
+writable (a read-only array does not require mdmon to be running).
+
+Because mdmon must be able to process these metadata updates at any
+time, it must (when running) have exclusive write access to the
+metadata. Any other changes (e.g. reconfiguration of the array) must
+go through mdmon.
+
+A secondary role for mdmon is to activate spares when a device fails.
+This role is much less time-critical than the other metadata updates,
+so it could be performed by a separate process, possibly
+"mdadm --monitor" which has a related role of moving devices between
+arrays. A main reason for including this functionality in mdmon is
+that in the native-metadata case this function is handled in the
+kernel, and mdmon's reason for existence is to provide functionality
+which is otherwise handled by the kernel.
+
+
+Design overview
+---------------
+
+mdmon is structured as two threads with a common address space and
+common data structures. These threads are known as the 'monitor' and
+the 'manager'.
+
+The 'monitor' has the primary role of monitoring the array for
+important state changes and updating the metadata accordingly. As
+writes to the array can be blocked until 'monitor' completes and
+acknowledges the update, it must be very careful not to block itself.
+In particular it must not block waiting for any write to complete else
+it could deadlock. This means that it must not allocate memory as
+doing this can require dirty memory to be written out and if the
+system chooses to write to the array that mdmon is monitoring, the
+memory allocation could deadlock.
+
+So 'monitor' must never allocate memory and must limit the number of
+other system calls it performs. It may:
+ - use select (or poll) to wait for activity on a file descriptor
+ - read from a sysfs file descriptor
+ - write to a sysfs file descriptor
+ - write the metadata out to the block devices using O_DIRECT
+ - send a signal (kill) to the manager thread
+
+It must not e.g. open files or do anything similar that might allocate
+resources.
+
+The 'manager' thread does everything else that is needed. If any
+files are to be opened (e.g. because a device has been added to the
+array), the manager does that. If any memory needs to be allocated
+(e.g. to hold data about a new array as can happen when one set of
+metadata describes several arrays), the manager performs that
+allocation.
+
+The 'manager' is also responsible for communicating with mdadm and
+assigning spares to replace failed devices.
+
+
+Handling metadata updates
+-------------------------
+
+There are a number of cases in which mdadm needs to update the
+metadata which mdmon is managing. These include:
+ - creating a new array in an active container
+ - adding a device to a container
+ - reconfiguring an array
+etc.
+
+To complete these updates, mdadm must send a message to mdmon which
+will merge the update into the metadata as it is at that moment.
+
+To achieve this, mdmon creates a Unix Domain Socket which the manager
+thread listens on. mdadm sends a message over this socket. The
+manager thread examines the message to see if it will require
+allocating any memory and allocates it. This is done in the
+'prepare_update' metadata method.
+
+The update message is then queued for handling by the monitor thread
+which it will do when convenient. The monitor thread calls
+->process_update which should atomically make the required changes to
+the metadata, making use of the pre-allocated memory as required. Any
+memory that is no longer needed can be placed back in the request and
+the manager thread will free it.
+
+The exact format of a metadata update is up to the implementer of the
+metadata handlers. It will simply describe a change that needs to be
+made. It will sometimes contain fragments of the metadata to be
+copied in to place. However the ->process_update routine must make
+sure not to over-write any field that the monitor thread might have
+updated, such as a 'device failed' or 'array is dirty' state.
+
+When the monitor thread has completed the update and written it to the
+devices, an acknowledgement message is sent back over the socket so
+that mdadm knows it is complete.
diff --git a/mdmon.8 b/mdmon.8
new file mode 100644
index 0000000..c0dc6ce
--- /dev/null
+++ b/mdmon.8
@@ -0,0 +1,257 @@
+.\" See file COPYING in distribution for details.
+.TH MDMON 8 "" v4.2
+.SH NAME
+mdmon \- monitor MD external metadata arrays
+
+.SH SYNOPSIS
+
+.BI mdmon " [--all] [--takeover] [--foreground] CONTAINER"
+
+.SH OVERVIEW
+The 2.6.27 kernel brings the ability to support external metadata arrays.
+External metadata implies that user space handles all updates to the metadata.
+The kernel's responsibility is to notify user space when a "metadata event"
+occurs, like disk failures and clean-to-dirty transitions. The kernel, in
+important cases, waits for user space to take action on these notifications.
+
+.SH DESCRIPTION
+.SS Metadata updates:
+To service metadata update requests a daemon,
+.IR mdmon ,
+is introduced.
+.I Mdmon
+is tasked with polling the sysfs namespace looking for changes in
+.BR array_state ,
+.BR sync_action ,
+and per disk
+.BR state
+attributes. When a change is detected it calls a per metadata type
+handler to make modifications to the metadata. The following actions
+are taken:
+.RS
+.TP
+.B array_state \- inactive
+Clear the dirty bit for the volume and let the array be stopped
+.TP
+.B array_state \- write pending
+Set the dirty bit for the array and then set
+.B array_state
+to
+.BR active .
+Writes
+are blocked until userspace writes
+.BR active .
+.TP
+.B array_state \- active-idle
+The safe mode timer has expired so set array state to clean to block writes to the array
+.TP
+.B array_state \- clean
+Clear the dirty bit for the volume
+.TP
+.B array_state \- read-only
+This is the initial state that all arrays start at.
+.I mdmon
+takes one of the three actions:
+.RS
+.TP
+1/
+Transition the array to read-auto keeping the dirty bit clear if the metadata
+handler determines that the array does not need resyncing or other modification
+.TP
+2/
+Transition the array to active if the metadata handler determines a resync or
+some other manipulation is necessary
+.TP
+3/
+Leave the array read\-only if the volume is marked to not be monitored; for
+example, the metadata version has been set to "external:\-dev/md127" instead of
+"external:/dev/md127"
+.RE
+.TP
+.B sync_action \- resync\-to\-idle
+Notify the metadata handler that a resync may have completed. If a resync
+process is idled before it completes this event allows the metadata handler to
+checkpoint resync.
+.TP
+.B sync_action \- recover\-to\-idle
+A spare may have completed rebuilding so tell the metadata handler about the
+state of each disk. This is the metadata handler's opportunity to clear
+any "out-of-sync" bits and clear the volume's degraded status. If a recovery
+process is idled before it completes this event allows the metadata handler to
+checkpoint recovery.
+.TP
+.B <disk>/state \- faulty
+A disk failure kicks off a series of events. First, notify the metadata
+handler that a disk has failed, and then notify the kernel that it can unblock
+writes that were dependent on this disk. After unblocking the kernel this disk
+is set to be removed+ from the member array. Finally the disk is marked failed
+in all other member arrays in the container.
+.IP
++ Note This behavior differs slightly from native MD arrays where
+removal is reserved for a
+.B mdadm --remove
+event. In the external metadata case the container holds the final
+reference on a block device and a
+.B mdadm --remove <container> <victim>
+call is still required.
+.RE
+
+.SS Containers:
+.P
+External metadata formats, like DDF, differ from the native MD metadata
+formats in that they define a set of disks and a series of sub-arrays
+within those disks. MD metadata in comparison defines a 1:1
+relationship between a set of block devices and a RAID array. For
+example to create 2 arrays at different RAID levels on a single
+set of disks, MD metadata requires the disks be partitioned and then
+each array can be created with a subset of those partitions. The
+supported external formats perform this disk carving internally.
+.P
+Container devices simply hold references to all member disks and allow
+tools like
+.I mdmon
+to determine which active arrays belong to which
+container. Some array management commands like disk removal and disk
+add are now only valid at the container level. Attempts to perform
+these actions on member arrays are blocked with error messages like:
+.IP
+"mdadm: Cannot remove disks from a \'member\' array, perform this
+operation on the parent container"
+.P
+Containers are identified in /proc/mdstat with a metadata version string
+"external:<metadata name>". Member devices are identified by
+"external:/<container device>/<member index>", or "external:-<container
+device>/<member index>" if the array is to remain readonly.
+
+.SH OPTIONS
+.TP
+CONTAINER
+The
+.B container
+device to monitor. It can be a full path like /dev/md/container, or a
+simple md device name like md127.
+.TP
+.B \-\-foreground
+Normally,
+.I mdmon
+will fork and continue in the background. Adding this option will
+skip that step and run
+.I mdmon
+in the foreground.
+.TP
+.B \-\-takeover
+This instructs
+.I mdmon
+to replace any active
+.I mdmon
+which is currently monitoring the array. This is primarily used late
+in the boot process to replace any
+.I mdmon
+which was started from an
+.B initramfs
+before the root filesystem was mounted. This avoids holding a
+reference on that
+.B initramfs
+indefinitely and ensures that the
+.I pid
+and
+.I sock
+files used to communicate with
+.I mdmon
+are in a standard place.
+.TP
+.B \-\-all
+This tells mdmon to find any active containers and start monitoring
+each of them if appropriate. This is normally used with
+.B \-\-takeover
+late in the boot sequence.
+A separate
+.I mdmon
+process is started for each container as the
+.B \-\-all
+argument is over-written with the name of the container. To allow for
+containers with names longer than 5 characters, this argument can be
+arbitrarily extended, e.g. to
+.BR \-\-all-active-arrays .
+.TP
+
+.PP
+Note that
+.I mdmon
+is automatically started by
+.I mdadm
+when needed and so does not need to be considered when working with
+RAID arrays. The only times it is run other than by
+.I mdadm
+is when the boot scripts need to restart it after mounting the new
+root filesystem.
+
+.SH START UP AND SHUTDOWN
+
+As
+.I mdmon
+needs to be running whenever any filesystem on the monitored device is
+mounted there are special considerations when the root filesystem is
+mounted from an
+.I mdmon
+monitored device.
+Note that in general
+.I mdmon
+is needed even if the filesystem is mounted read-only as some
+filesystems can still write to the device in those circumstances, for
+example to replay a journal after an unclean shutdown.
+
+When the array is assembled by the
+.B initramfs
+code, mdadm will automatically start
+.I mdmon
+as required. This means that
+.I mdmon
+must be installed on the
+.B initramfs
+and there must be a writable filesystem (typically tmpfs) in which
+.B mdmon
+can create a
+.B .pid
+and
+.B .sock
+file. The particular filesystem to use is given to mdmon at compile
+time and defaults to
+.BR /run/mdadm .
+
+This filesystem must persist through to shutdown time.
+
+After the final root filesystem has been instantiated (usually with
+.BR pivot_root )
+.I mdmon
+should be run with
+.I "\-\-all \-\-takeover"
+so that the
+.I mdmon
+running from the
+.B initramfs
+can be replaced with one running in the main root, and so the
+memory used by the initramfs can be released.
+
+At shutdown time,
+.I mdmon
+should not be killed along with other processes. Also as it holds a
+file (socket actually) open in
+.B /dev
+(by default) it will not be possible to unmount
+.B /dev
+if it is a separate filesystem.
+
+.SH EXAMPLES
+
+.B " mdmon \-\-all-active-arrays \-\-takeover"
+.br
+Any
+.I mdmon
+which is currently running is killed and a new instance is started.
+This should be run during the boot sequence if an initramfs was
+used, so that any mdmon running from the initramfs will not hold
+the initramfs active.
+.SH SEE ALSO
+.IR mdadm (8),
+.IR md (4).
diff --git a/mdmon.c b/mdmon.c
new file mode 100644
index 0000000..c71e62c
--- /dev/null
+++ b/mdmon.c
@@ -0,0 +1,594 @@
+/*
+ * mdmon - monitor external metadata arrays
+ *
+ * Copyright (C) 2007-2009 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2007-2009 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+/*
+ * md array manager.
+ * When md arrays have user-space managed metadata, this is the program
+ * that does the managing.
+ *
+ * Given one argument: the name of the array (e.g. /dev/md0) that is
+ * the container.
+ * We fork off a helper that runs high priority and mlocked. It responds to
+ * device failures and other events that might stop writeout, or that are
+ * trivial to deal with.
+ * The main thread then watches for new arrays being created in the container
+ * and starts monitoring them too ... along with a few other tasks.
+ *
+ * The main thread communicates with the priority thread by writing over
+ * a pipe.
+ * Separate programs can communicate with the main thread via Unix-domain
+ * socket.
+ * The two threads share address space and open file table.
+ *
+ */
+
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#include <sys/wait.h>
+#include <stdio.h>
+#include <errno.h>
+#include <string.h>
+#include <fcntl.h>
+#include <signal.h>
+#include <dirent.h>
+#ifdef USE_PTHREADS
+#include <pthread.h>
+#else
+#include <sched.h>
+#endif
+
+#include "mdadm.h"
+#include "mdmon.h"
+
+char const Name[] = "mdmon";
+
+struct active_array *discard_this;
+struct active_array *pending_discard;
+
+int mon_tid, mgr_tid;
+
+int sigterm;
+
+#ifdef USE_PTHREADS
+/*
+ * Thread entry point for the monitor when built with USE_PTHREADS.
+ * Records this thread's kernel tid in mon_tid and then runs the
+ * monitor loop on the container passed in 'v'.
+ */
+static void *run_child(void *v)
+{
+	mon_tid = syscall(SYS_gettid);
+	do_monitor((struct supertype *)v);
+	return NULL;
+}
+
+/*
+ * Start the monitor as a detached pthread running run_child() on
+ * 'container', and wait until that thread has stored its tid in
+ * mon_tid.  The caller's own tid is recorded in mgr_tid.
+ *
+ * Returns the monitor's tid on success.  On failure returns -1 with
+ * errno set to the pthread error code: the pthread functions return a
+ * positive error number and leave errno alone, whereas our caller
+ * tests for a negative result and reports strerror(errno).
+ */
+static int clone_monitor(struct supertype *container)
+{
+	pthread_attr_t attr;
+	pthread_t thread;
+	int rc;
+
+	mon_tid = -1;
+	rc = pthread_attr_init(&attr);
+	if (rc) {
+		errno = rc;
+		return -1;
+	}
+	/* NOTE(review): 4096 is likely below PTHREAD_STACK_MIN, in which
+	 * case this request is rejected and the default stack size is
+	 * kept - confirm that is the intent.
+	 */
+	pthread_attr_setstacksize(&attr, 4096);
+	pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
+	rc = pthread_create(&thread, &attr, run_child, container);
+	/* the attribute object is not needed once creation was attempted */
+	pthread_attr_destroy(&attr);
+	if (rc) {
+		errno = rc;
+		return -1;
+	}
+
+	/* run_child() replaces the -1 with the real tid */
+	while (mon_tid == -1)
+		usleep(10);
+
+	mgr_tid = syscall(SYS_gettid);
+
+	return mon_tid;
+}
+#else /* USE_PTHREADS */
+/*
+ * Entry point handed to clone()/__clone2(): run the monitor loop on
+ * the container passed in 'v'.  Always returns 0.
+ */
+static int run_child(void *v)
+{
+	do_monitor((struct supertype *)v);
+	return 0;
+}
+
+#ifdef __ia64__
+/* Prototype for __clone2(), which is used instead of clone() below
+ * when building for ia64.
+ */
+int __clone2(int (*fn)(void *),
+	     void *child_stack_base, size_t stack_size,
+	     int flags, void *arg, ...
+	     /* pid_t *pid, struct user_desc *tls, pid_t *ctid */ );
+#endif
+/*
+ * Start the monitor with clone() on a small static stack, sharing
+ * filesystem information, open files, address space and signal
+ * handlers with the caller and placing it in the same thread group.
+ *
+ * Stores the result of clone() in mon_tid and the caller's tid in
+ * mgr_tid, and returns mon_tid.
+ */
+static int clone_monitor(struct supertype *container)
+{
+	static char stack[4096];
+	const int share = CLONE_FS|CLONE_FILES|CLONE_VM|CLONE_SIGHAND|CLONE_THREAD;
+
+#ifdef __ia64__
+	/* __clone2() is given the base of the stack area and its size */
+	mon_tid = __clone2(run_child, stack, sizeof(stack), share, container);
+#else
+	/* clone() is given the initial stack pointer: 64 bytes below
+	 * the top of the area
+	 */
+	mon_tid = clone(run_child, stack + sizeof(stack) - 64, share,
+			container);
+#endif
+
+	mgr_tid = syscall(SYS_gettid);
+	return mon_tid;
+}
+#endif /* USE_PTHREADS */
+
+/*
+ * Create MDMON_DIR/<devname>.pid containing our pid followed by a
+ * newline.
+ *
+ * MDMON_DIR is created first if it does not exist.  The file is
+ * opened with O_CREAT|O_EXCL, so an already existing pid file makes
+ * this fail.
+ *
+ * Returns 0 on success or a negative errno value on failure
+ * (-ENAMETOOLONG if the path does not fit the local buffer).
+ */
+static int make_pidfile(char *devname)
+{
+	char path[100];
+	char pid[16];
+	int fd;
+	int len;
+	int n;
+	int err = 0;
+
+	if (mkdir(MDMON_DIR, 0755) < 0 &&
+	    errno != EEXIST)
+		return -errno;
+	len = snprintf(path, sizeof(path), "%s/%s.pid", MDMON_DIR, devname);
+	if (len < 0 || len >= (int)sizeof(path))
+		return -ENAMETOOLONG;
+
+	fd = open(path, O_RDWR|O_CREAT|O_EXCL, 0600);
+	if (fd < 0)
+		return -errno;
+	/* 16 bytes hold any int, the newline and the terminator */
+	len = snprintf(pid, sizeof(pid), "%d\n", getpid());
+	n = write(fd, pid, len);
+	/* pick up errno before close() has a chance to overwrite it */
+	if (n < 0)
+		err = -errno;
+	close(fd);
+	return err;
+}
+
+/*
+ * Ask the mdmon running as process 'pid' to exit and wait for it to
+ * go away.
+ *
+ * The process is only signalled when /proc/<pid>/cmdline contains
+ * "mdmon" or "@dmon" (main() replaces the first character of argv[0]
+ * with '@' in some cases).  We never signal ourselves.
+ *
+ * If 'sock' is a valid descriptor it is switched to blocking mode and
+ * read from once, then the process is poked with SIGUSR1 every 200ms,
+ * at most 25 times, until kill() fails.
+ *
+ * 'devname' is not used.
+ */
+static void try_kill_monitor(pid_t pid, char *devname, int sock)
+{
+	char buf[100];
+	int fd;
+	int n;
+	long fl;
+	int rv;
+
+	/* first rule of survival... don't off yourself */
+	if (pid == getpid())
+		return;
+
+	/* kill this process if it is mdmon */
+	snprintf(buf, sizeof(buf), "/proc/%lu/cmdline", (unsigned long) pid);
+	fd = open(buf, O_RDONLY);
+	if (fd < 0)
+		return;
+
+	n = read(fd, buf, sizeof(buf)-1);
+	close(fd);
+	if (n < 0)
+		return;
+	/* Terminate exactly what was read, so the searches below never
+	 * look at left-over bytes of the path or of the stack.
+	 */
+	buf[n] = 0;
+
+	if (!(strstr(buf, "mdmon") ||
+	      strstr(buf, "@dmon")))
+		return;
+
+	kill(pid, SIGTERM);
+
+	if (sock < 0)
+		return;
+
+	/* Wait for monitor to exit by reading from the socket, after
+	 * clearing the non-blocking flag */
+	fl = fcntl(sock, F_GETFL, 0);
+	if (fl >= 0) {
+		fl &= ~O_NONBLOCK;
+		fcntl(sock, F_SETFL, fl);
+	}
+	/* only the fact that read() returned matters, not what it read */
+	n = read(sock, buf, sizeof(buf));
+
+	/* If there is I/O going on it might take some time to get to
+	 * clean state. Wait for monitor to exit fully to avoid races.
+	 * Ping it with SIGUSR1 in case that it is sleeping */
+	for (n = 0; n < 25; n++) {
+		rv = kill(pid, SIGUSR1);
+		if (rv < 0)
+			break;
+		usleep(200000);
+	}
+}
+
+/*
+ * Remove MDMON_DIR/<devname>.pid and MDMON_DIR/<devname>.sock.
+ * The results of unlink() are not checked.
+ */
+void remove_pidfile(char *devname)
+{
+	char path[100];
+
+	/* the pid file first ... */
+	sprintf(path, "%s/%s.pid", MDMON_DIR, devname);
+	unlink(path);
+
+	/* ... then the control socket, reusing the same buffer */
+	sprintf(path, "%s/%s.sock", MDMON_DIR, devname);
+	unlink(path);
+}
+
+/*
+ * Create the listening Unix-domain socket MDMON_DIR/<devname>.sock.
+ *
+ * Any existing file of that name is unlinked first.  The process
+ * umask is set to 077 before bind() and is not restored.  The socket
+ * is put into non-blocking mode before it is returned.
+ *
+ * Returns the socket descriptor, or -1 if 'sigterm' is already set or
+ * if socket() or bind() fails.
+ */
+static int make_control_sock(char *devname)
+{
+	struct sockaddr_un addr;
+	char path[100];
+	long flags;
+	int sock;
+
+	/* no point in setting up if we have been told to stop */
+	if (sigterm)
+		return -1;
+
+	sprintf(path, "%s/%s.sock", MDMON_DIR, devname);
+	unlink(path);
+
+	sock = socket(PF_LOCAL, SOCK_STREAM, 0);
+	if (sock < 0)
+		return -1;
+
+	addr.sun_family = PF_LOCAL;
+	strcpy(addr.sun_path, path);
+	umask(077); /* ensure no world write access */
+	if (bind(sock, (struct sockaddr*)&addr, sizeof(addr)) < 0) {
+		close(sock);
+		return -1;
+	}
+	listen(sock, 10);
+
+	flags = fcntl(sock, F_GETFL, 0);
+	fcntl(sock, F_SETFL, flags | O_NONBLOCK);
+	return sock;
+}
+
+/* Signal handler (installed for SIGTERM by mdmon()): just note in
+ * 'sigterm' that termination was requested.
+ */
+static void term(int sig)
+{
+	(void)sig;
+	sigterm = 1;
+}
+
+static void wake_me(int sig)
+{
+ /* Deliberately empty.  mdmon() installs this for SIGUSR1; NOTE(review): presumably the signal only has to interrupt a blocking call - confirm. */
+}
+
+/*
+ * Decide whether mdmon should fork into the background.
+ * Only a DEBUG build can say no: there, check_env("MDADM_NO_MDMON")
+ * being true (mdmon started by hand while debugging) gives 0.
+ * In every other case the answer is 1.
+ */
+static int do_fork(void)
+{
+	int want_fork = 1;
+
+#ifdef DEBUG
+	if (check_env("MDADM_NO_MDMON"))
+		want_fork = 0;
+#endif
+
+	return want_fork;
+}
+
+/* Print the command line summary on stderr and exit with status 2. */
+void usage(void)
+{
+	static const char help[] =
+"Usage: mdmon [options] CONTAINER\n"
+"\n"
+"Options are:\n"
+" --help -h : This message\n"
+" --all -a : All devices\n"
+" --foreground -F : Run in foreground (do not fork)\n"
+" --takeover -t : Takeover container\n"
+	;
+
+	fputs(help, stderr);
+	exit(2);
+}
+
+static int mdmon(char *devnm, int must_fork, int takeover);
+
+int main(int argc, char *argv[])
+{ /* Entry point: parse the options, then start mdmon either for the
+   * single container named on the command line or, with --all (or when
+   * the container name is "/proc/mdstat"), once for every container
+   * reported by mdstat_read().  Returns the status from mdmon(), the
+   * OR of all statuses in the --all case.
+   */
+ char *container_name = NULL;
+ char *devnm = NULL;
+ int status = 0;
+ int opt;
+ int all = 0;
+ int takeover = 0;
+ int dofork = 1;
+ static struct option options[] = {
+ {"all", 0, NULL, 'a'},
+ {"takeover", 0, NULL, 't'},
+ {"help", 0, NULL, 'h'},
+ {"offroot", 0, NULL, OffRootOpt},
+ {"foreground", 0, NULL, 'F'},
+ {NULL, 0, NULL, 0}
+ };
+
+ if (in_initrd()) {
+ /*
+ * set first char of argv[0] to @. This is used by
+ * systemd to signal that the task was launched from
+ * initrd/initramfs and should be preserved during shutdown
+ */
+ argv[0][0] = '@';
+ }
+
+ while ((opt = getopt_long(argc, argv, "thaF", options, NULL)) != -1) {
+ switch (opt) {
+ case 'a':
+ container_name = argv[optind-1]; /* keep the option word itself: it is overwritten with each device name below */
+ all = 1;
+ break;
+ case 't':
+ takeover = 1;
+ break;
+ case 'F':
+ dofork = 0;
+ break;
+ case OffRootOpt:
+ argv[0][0] = '@'; /* same marking as in the in_initrd() case above */
+ break;
+ case 'h':
+ default:
+ usage();
+ break;
+ }
+ }
+
+ if (all == 0 && container_name == NULL) { /* without --all the container is the first non-option argument */
+ if (argv[optind])
+ container_name = argv[optind];
+ }
+
+ if (container_name == NULL)
+ usage();
+
+ if (argc - optind > 1) /* more than one non-option argument */
+ usage();
+
+ if (strcmp(container_name, "/proc/mdstat") == 0) /* treated like --all */
+ all = 1;
+
+ if (all) {
+ struct mdstat_ent *mdstat, *e;
+ int container_len = strlen(container_name);
+
+ /* launch an mdmon instance for each container found */
+ mdstat = mdstat_read(0, 0);
+ for (e = mdstat; e; e = e->next) {
+ if (e->metadata_version &&
+ strncmp(e->metadata_version, "external:", 9) == 0 &&
+ !is_subarray(&e->metadata_version[9])) {
+ /* update cmdline so this mdmon instance can be
+ * distinguished from others in a call to ps(1)
+ */
+ if (strlen(e->devnm) <= (unsigned)container_len) { /* only when the name fits in the original argument */
+ memset(container_name, 0, container_len);
+ sprintf(container_name, "%s", e->devnm);
+ }
+ status |= mdmon(e->devnm, 1, takeover); /* must_fork is always 1 here, whatever --foreground said */
+ }
+ }
+ free_mdstat(mdstat);
+
+ return status;
+ } else if (strncmp(container_name, "md", 2) == 0) { /* a bare name such as md127 */
+ int id = devnm2devid(container_name);
+ if (id)
+ devnm = container_name;
+ } else { /* anything else is taken to be a path and resolved through stat() */
+ struct stat st;
+
+ if (stat(container_name, &st) == 0)
+ devnm = xstrdup(stat2devnm(&st));
+ }
+
+ if (!devnm) {
+ pr_err("%s is not a valid md device name\n",
+ container_name);
+ exit(1);
+ }
+ return mdmon(devnm, dofork && do_fork(), takeover); /* fork only if neither --foreground nor do_fork() objects */
+}
+
+/*
+ * Start monitoring the container 'devnm'.
+ *
+ * When 'must_fork' is set the work is done in a forked child.  The
+ * parent waits for a status word from the child on a pipe (falling
+ * back to the child's exit status if none arrives) and returns it.
+ * When 'must_fork' is clear everything happens in the calling process.
+ *
+ * The monitoring process checks that the device is a container using
+ * supported external metadata, loads that metadata, creates the pid
+ * file and control socket, starts the monitor thread and finally runs
+ * do_manager().  If mdmon_pid() reports an existing monitor for the
+ * container and it answers a ping, we give up unless 'takeover' is
+ * set; otherwise the old monitor is killed once the new monitor
+ * thread is running.
+ *
+ * Returns 1 if the device cannot be opened or the pipe/fork fails,
+ * otherwise (in the parent only) the status described above.  The
+ * monitoring process never returns from here: it leaves via exit().
+ */
+static int mdmon(char *devnm, int must_fork, int takeover)
+{
+	int mdfd;
+	struct mdinfo *mdi, *di;
+	struct supertype *container;
+	sigset_t set;
+	struct sigaction act;
+	int pfd[2];
+	int status;
+	int ignore;
+	pid_t victim = -1;
+	int victim_sock = -1;
+
+	dprintf("starting mdmon for %s\n", devnm);
+
+	mdfd = open_dev(devnm);
+	if (mdfd < 0) {
+		pr_err("%s: %s\n", devnm, strerror(errno));
+		return 1;
+	}
+
+	/* Fork, and have the child tell us when they are ready */
+	if (must_fork) {
+		if (pipe(pfd) != 0) {
+			pr_err("failed to create pipe\n");
+			close(mdfd);
+			return 1;
+		}
+		switch(fork()) {
+		case -1:
+			pr_err("failed to fork: %s\n", strerror(errno));
+			close(pfd[0]);
+			close(pfd[1]);
+			close(mdfd);
+			return 1;
+		case 0: /* child */
+			close(pfd[0]);
+			break;
+		default: /* parent */
+			/* The child works on its own copy of mdfd.  Ours
+			 * must not stay open: with --all this function is
+			 * called once per container from the same process.
+			 */
+			close(mdfd);
+			close(pfd[1]);
+			if (read(pfd[0], &status, sizeof(status)) != sizeof(status)) {
+				wait(&status);
+				status = WEXITSTATUS(status);
+			}
+			close(pfd[0]);
+			return status;
+		}
+	} else
+		pfd[0] = pfd[1] = -1;
+
+	container = xcalloc(1, sizeof(*container));
+	strcpy(container->devnm, devnm);
+	container->arrays = NULL;
+	container->sock = -1;
+
+	mdi = sysfs_read(mdfd, container->devnm, GET_VERSION|GET_LEVEL|GET_DEVS);
+
+	if (!mdi) {
+		pr_err("failed to load sysfs info for %s\n", container->devnm);
+		exit(3);
+	}
+	if (mdi->array.level != UnSet) {
+		pr_err("%s is not a container - cannot monitor\n", devnm);
+		exit(3);
+	}
+	if (mdi->array.major_version != -1 ||
+	    mdi->array.minor_version != -2) {
+		pr_err("%s does not use external metadata - cannot monitor\n",
+			devnm);
+		exit(3);
+	}
+
+	container->ss = version_to_superswitch(mdi->text_version);
+	if (container->ss == NULL) {
+		pr_err("%s uses unsupported metadata: %s\n",
+			devnm, mdi->text_version);
+		exit(3);
+	}
+
+	/* keep private copies of the member device entries; mdi is
+	 * freed just below
+	 */
+	container->devs = NULL;
+	for (di = mdi->devs; di; di = di->next) {
+		struct mdinfo *cd = xmalloc(sizeof(*cd));
+		*cd = *di;
+		cd->next = container->devs;
+		container->devs = cd;
+	}
+	sysfs_free(mdi);
+
+	/* SIGUSR is sent between parent and child.  So both block it
+	 * and enable it only with pselect.
+	 */
+	sigemptyset(&set);
+	sigaddset(&set, SIGUSR1);
+	sigaddset(&set, SIGTERM);
+	sigprocmask(SIG_BLOCK, &set, NULL);
+	/* sa_mask must be initialised too, not just handler and flags */
+	sigemptyset(&act.sa_mask);
+	act.sa_handler = wake_me;
+	act.sa_flags = 0;
+	sigaction(SIGUSR1, &act, NULL);
+	act.sa_handler = term;
+	sigaction(SIGTERM, &act, NULL);
+	act.sa_handler = SIG_IGN;
+	sigaction(SIGPIPE, &act, NULL);
+
+	victim = mdmon_pid(container->devnm);
+	if (victim >= 0)
+		victim_sock = connect_monitor(container->devnm);
+
+	ignore = chdir("/");
+	if (!takeover && victim > 0 && victim_sock >= 0) {
+		if (fping_monitor(victim_sock) == 0) {
+			pr_err("%s already managed\n", container->devnm);
+			exit(3);
+		}
+		close(victim_sock);
+		victim_sock = -1;
+	}
+	if (container->ss->load_container(container, mdfd, devnm)) {
+		pr_err("Cannot load metadata for %s\n", devnm);
+		exit(3);
+	}
+	close(mdfd);
+
+	/* Ok, this is close enough.  We can say goodbye to our parent now.
+	 */
+	if (victim > 0)
+		remove_pidfile(devnm);
+	if (make_pidfile(devnm) < 0) {
+		exit(3);
+	}
+	container->sock = make_control_sock(devnm);
+
+	status = 0;
+	if (pfd[1] >= 0) {
+		if (write(pfd[1], &status, sizeof(status)) < 0)
+			pr_err("failed to notify our parent: %d\n",
+			       getppid());
+		close(pfd[1]);
+	}
+
+	mlockall(MCL_CURRENT | MCL_FUTURE);
+
+	if (clone_monitor(container) < 0) {
+		pr_err("failed to start monitor process: %s\n",
+			strerror(errno));
+		exit(2);
+	}
+
+	/* the new monitor is running, so the old one can go now */
+	if (victim > 0) {
+		try_kill_monitor(victim, container->devnm, victim_sock);
+		if (victim_sock >= 0)
+			close(victim_sock);
+	}
+
+	setsid();
+	manage_fork_fds(0);
+
+	/* This silliness is to stop the compiler complaining
+	 * that we ignore 'ignore'
+	 */
+	if (ignore)
+		ignore++;
+
+	do_manager(container);
+
+	exit(0);
+}
+
+/* Some stub functions so super-* can link with us.
+ * This one ignores all of its arguments and always returns 0.
+ */
+int child_monitor(int afd, struct mdinfo *sra, struct reshape *reshape,
+ struct supertype *st, unsigned long blocks,
+ int *fds, unsigned long long *offsets,
+ int dests, int *destfd, unsigned long long *destoffsets)
+{
+ return 0;
+}
+
+int restore_stripes(int *dest, unsigned long long *offsets,
+ int raid_disks, int chunk_size, int level, int layout,
+ int source, unsigned long long read_offset,
+ unsigned long long start, unsigned long long length,
+ char *src_buf)
+{
+ return 1; /* stub: ignores its arguments and always returns 1 (NOTE(review): presumably meaning "failed" - confirm against the real implementation) */
+}
+
+int save_stripes(int *source, unsigned long long *offsets,
+ int raid_disks, int chunk_size, int level, int layout,
+ int nwrites, int *dest,
+ unsigned long long start, unsigned long long length,
+ char *buf)
+{
+ return 0; /* stub: ignores its arguments and always returns 0 */
+}
+
+struct superswitch super0 = { /* placeholder: only the name is set, every other member stays zero */
+ .name = "0.90",
+};
+struct superswitch super1 = { /* placeholder: only the name is set, every other member stays zero */
+ .name = "1.x",
+};
diff --git a/mdmon.h b/mdmon.h
new file mode 100644
index 0000000..b3d72ac
--- /dev/null
+++ b/mdmon.h
@@ -0,0 +1,111 @@
+/*
+ * mdmon - monitor external metadata arrays
+ *
+ * Copyright (C) 2007-2009 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2007-2009 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+extern const char Name[];
+
+enum array_state { clear, inactive, suspended, readonly, read_auto,
+ clean, active, write_pending, active_idle, broken, bad_word}; /* NOTE(review): presumably one value per word of the sysfs array_state attribute, with bad_word for anything unrecognised - confirm */
+
+enum sync_action { idle, reshape, resync, recover, check, repair, bad_action }; /* NOTE(review): presumably one value per word of the sysfs sync_action attribute, with bad_action for anything unrecognised - confirm */
+
+struct active_array { /* NOTE(review): appears to be the state mdmon keeps for one array it is watching - inferred from the field names */
+ struct mdinfo info;
+ struct supertype *container; /* presumably the container this array is a member of - confirm */
+ struct active_array *next, *replaces;
+ int to_remove;
+
+ int action_fd; /* the *_fd members are presumably open sysfs attribute files - confirm */
+ int resync_start_fd;
+ int metadata_fd; /* for monitoring rw/ro status */
+ int sync_completed_fd; /* for checkpoint notification events */
+ int safe_mode_delay_fd;
+ unsigned long long last_checkpoint; /* sync_completed fires for many
+ * reasons; this field makes sure the
+ * kernel has made progress before
+ * moving the checkpoint. It is
+ * cleared by the metadata handler
+ * when it determines recovery is
+ * terminated.
+ */
+
+ enum array_state prev_state, curr_state, next_state;
+ enum sync_action prev_action, curr_action, next_action;
+
+ int check_degraded; /* flag set by mon, read by manage */
+ int check_reshape; /* flag set by mon, read by manage */
+};
+
+/*
+ * Metadata updates are handled by the monitor thread,
+ * as it has exclusive access to the metadata.
+ * When the manager wants to update metadata, either
+ * for its own reasons (e.g. committing a spare) or
+ * on behalf of mdadm, it creates a metadata_update
+ * structure and queues it to the monitor.
+ * Updates are created and processed by code under the
+ * superswitch. All common code sees them as opaque
+ * blobs.
+ */
+extern struct metadata_update *update_queue, *update_queue_handled;
+
+#define MD_MAJOR 9
+
+extern struct active_array *container;
+extern struct active_array *discard_this;
+extern struct active_array *pending_discard;
+extern struct md_generic_cmd *active_cmd;
+
+void remove_pidfile(char *devname);
+void do_monitor(struct supertype *container);
+void do_manager(struct supertype *container);
+extern int sigterm;
+
+int read_dev_state(int fd);
+int is_container_member(struct mdstat_ent *mdstat, char *container);
+
+struct mdstat_ent *mdstat_read(int hold, int start);
+
+extern int exit_now, manager_ready;
+extern int mon_tid, mgr_tid;
+extern int monitor_loop_cnt;
+
+/*
+ * Report whether 'array' has finished resyncing, i.e. whether its
+ * resync_start has reached the amount that needs syncing.  That
+ * amount is worked out here because MaxSector is a moving target:
+ *  - levels 1, 4, 5 and 6: the component size
+ *  - level 10: component size times raid_disks, divided by the
+ *    product of the two low bytes of the layout (NOTE(review):
+ *    presumably the near and far copy counts - confirm)
+ *  - any other level: zero, so the answer is always "complete"
+ */
+static inline int is_resync_complete(struct mdinfo *array)
+{
+	unsigned long long target = 0;
+	int level = array->array.level;
+
+	if (level == 1 || level == 4 || level == 5 || level == 6) {
+		target = array->component_size;
+	} else if (level == 10) {
+		int layout = array->array.layout;
+		int copies = (layout & 0xff) * ((layout >> 8) & 0xff);
+
+		target = array->component_size * array->array.raid_disks;
+		target /= copies;
+	}
+
+	return array->resync_start >= target;
+}
diff --git a/mdopen.c b/mdopen.c
new file mode 100644
index 0000000..245be53
--- /dev/null
+++ b/mdopen.c
@@ -0,0 +1,509 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2001-2013 Neil Brown <neilb@suse.de>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email: <neilb@suse.de>
+ */
+
+#include "mdadm.h"
+#include "md_p.h"
+#include <ctype.h>
+
+/*
+ * Create 'cnt' partition nodes (4 if cnt is 0) next to 'dev'.
+ * Nothing is done if 'dev' is NULL or empty, cannot be lstat()ed, or
+ * is neither a block device nor a symlink.  Every name that ends up
+ * existing after being created here is registered with add_dev().
+ */
+void make_parts(char *dev, int cnt)
+{
+	/* make 'cnt' partition devices for 'dev'
+	 * If dev is a device name we use the
+	 *  major/minor from dev and add 1..cnt
+	 * If it is a symlink, we make similar symlinks.
+	 * If dev ends with a digit, we add "p%d" else "%d"
+	 * If the name exists, we use its owner/mode,
+	 * else that of dev
+	 */
+	struct stat stb;
+	int major_num;
+	int minor_num;
+	int odig;
+	int i;
+	int nlen;
+	char *name;
+	int dig;
+	char orig[1001];
+	char sym[1024];
+	int err;
+
+	/* an empty name has no last character to look at */
+	if (dev == NULL || dev[0] == 0)
+		return;
+	nlen = strlen(dev) + 20;
+	/* isdigit() needs a value in the unsigned char range */
+	dig = isdigit((unsigned char)dev[strlen(dev)-1]);
+
+	if (cnt == 0)
+		cnt = 4;
+	if (lstat(dev, &stb)!= 0)
+		return;
+
+	if (S_ISBLK(stb.st_mode)) {
+		major_num = major(stb.st_rdev);
+		minor_num = minor(stb.st_rdev);
+		odig = -1;
+	} else if (S_ISLNK(stb.st_mode)) {
+		int len;
+
+		len = readlink(dev, orig, sizeof(orig));
+		/* len == 0 would make orig[len-1] below read before the
+		 * start of the buffer
+		 */
+		if (len <= 0 || len >= (int)sizeof(orig))
+			return;
+		orig[len] = 0;
+		odig = isdigit((unsigned char)orig[len-1]);
+		major_num = -1;
+		minor_num = -1;
+	} else
+		return;
+	name = xmalloc(nlen);
+	for (i = 1; i <= cnt ; i++) {
+		struct stat stb2;
+		snprintf(name, nlen, "%s%s%d", dev, dig?"p":"", i);
+		if (stat(name, &stb2) == 0) {
+			/* Something is already there.  Leave it alone
+			 * unless both it and dev are block devices and
+			 * its device number is wrong.
+			 */
+			if (!S_ISBLK(stb2.st_mode) || !S_ISBLK(stb.st_mode))
+				continue;
+			if (stb2.st_rdev == makedev(major_num, minor_num+i))
+				continue;
+			unlink(name);
+		} else {
+			/* nothing there: take owner/mode from dev */
+			stb2 = stb;
+		}
+		if (S_ISBLK(stb.st_mode)) {
+			if (mknod(name, S_IFBLK | 0600,
+				  makedev(major_num, minor_num+i)))
+				perror("mknod");
+			if (chown(name, stb2.st_uid, stb2.st_gid))
+				perror("chown");
+			if (chmod(name, stb2.st_mode & 07777))
+				perror("chmod");
+			err = 0;
+		} else {
+			snprintf(sym, sizeof(sym), "%s%s%d", orig, odig?"p":"", i);
+			err = symlink(sym, name);
+		}
+
+		if (err == 0 && stat(name, &stb2) == 0)
+			add_dev(name, &stb2, 0, NULL);
+	}
+	free(name);
+}
+
+/*
+ * Ask the kernel to create the array 'devnm' by writing that name to
+ * the md_mod "new_array" module parameter.  If the parameter file does
+ * not exist, "modprobe md_mod" is run and the open is retried once.
+ *
+ * Returns 1 if the whole name was written, otherwise prints a message
+ * and returns 0.
+ */
+int create_named_array(char *devnm)
+{
+	static const char new_array_file[] = {
+		"/sys/module/md_mod/parameters/new_array"
+	};
+	int written = -1;
+	int fd = open(new_array_file, O_WRONLY);
+
+	/* the file is missing: try loading the module, then look again */
+	if (fd < 0 && errno == ENOENT &&
+	    system("modprobe md_mod") == 0)
+		fd = open(new_array_file, O_WRONLY);
+
+	if (fd >= 0) {
+		written = write(fd, devnm, strlen(devnm));
+		close(fd);
+	}
+
+	if (fd >= 0 && written == (int)strlen(devnm))
+		return 1;
+
+	pr_err("Fail to create %s when using %s, fallback to creation via node\n",
+	       devnm, new_array_file);
+	return 0;
+}
+
+/*
+ * We need a new md device to assemble/build/create an array.
+ * 'dev' is a name given us by the user (command line or mdadm.conf)
+ * It might start with /dev or /dev/md and might end with a digit
+ * string.
+ * If it starts with just /dev, it must be /dev/mdX or /dev/md_dX
+ * If it ends with a digit string, then it must be as above, or
+ * 'trustworthy' must be 'METADATA' and the 'dev' must be
+ * /dev/md/'name'NN or 'name'NN
+ * If it doesn't end with a digit string, it must be /dev/md/'name'
+ * or 'name' or must be NULL.
+ * If the digit string is present, it gives the minor number to use
+ * If not, we choose a high, unused minor number.
+ * If the 'dev' is a standard name, it decides whether 'md' or 'mdp'.
+ * else if the name is 'd[0-9]+' then we use mdp
+ * else if trustworthy is 'METADATA' we use md
+ * else the choice depends on 'autof'.
+ * If name is NULL it is assumed to match whatever dev provides.
+ * If both name and dev are NULL, we choose a name 'mdXX' or 'mdpXX'
+ *
+ * If 'name' is given, and 'trustworthy' is 'foreign' and name is not
+ * supported by 'dev', we add a "_%d" suffix based on the minor number
+ * and use that.
+ *
+ * If udev is configured, we create a temporary device, open it, and
+ * unlink it.
+ * If not, we create the /dev/mdXX device, and if name is usable,
+ * /dev/md/name
+ * In any case we return /dev/md/name or (if that isn't available)
+ * /dev/mdXX in 'chosen'.
+ *
+ * When we create devices, we use uid/gid/umask from config file.
+ */
+
+/*
+ * create_mddev() - pick a device name/number for an array, make sure the
+ * device exists and return it opened exclusively.
+ *
+ * dev:         requested device path or name, or NULL to have one chosen.
+ * name:        array name used to derive a /dev/md/ name when 'dev' does not
+ *              provide one; NULL or "" means no name.
+ * autof:       auto-creation setting; 0 selects the configured default. The
+ *              low three bits are the mode (4 and 6 select a partitionable
+ *              device), the remaining bits are handed to make_parts().
+ * trustworthy: one of LOCAL, FOREIGN or METADATA; decides whether 'name' is
+ *              used unmodified or gets a numeric suffix.
+ * chosen:      buffer receiving the resulting path, or NULL if the caller
+ *              does not need it.
+ * block_udev:  if non-zero (and udev is in use) udev_block() is called for
+ *              the selected device name.
+ *
+ * Returns the descriptor from open_dev_excl(), or -1 on failure.
+ */
+int create_mddev(char *dev, char *name, int autof, int trustworthy,
+		 char *chosen, int block_udev)
+{
+	int mdfd;
+	struct stat stb;
+	int num = -1;
+	int use_mdp = -1;
+	struct createinfo *ci = conf_get_create_info();
+	int parts;
+	char *cname;
+	char devname[37];
+	char devnm[32];
+	char cbuf[400];
+
+	if (!use_udev())
+		block_udev = 0;
+
+	if (chosen == NULL)
+		chosen = cbuf;
+
+	if (autof == 0)
+		autof = ci->autof;
+
+	parts = autof >> 3;
+	autof &= 7;
+
+	strcpy(chosen, "/dev/md/");
+	cname = chosen + strlen(chosen);
+
+	if (dev) {
+		if (strncmp(dev, "/dev/md/", 8) == 0) {
+			strcpy(cname, dev+8);
+		} else if (strncmp(dev, "/dev/", 5) == 0) {
+			char *e = dev + strlen(dev);
+			while (e > dev && isdigit((unsigned char)e[-1]))
+				e--;
+			if (e[0])
+				num = strtoul(e, NULL, 10);
+			strcpy(cname, dev+5);
+			cname[e-(dev+5)] = 0;
+			/* name *must* be mdXX or md_dXX in this context */
+			if (num < 0 ||
+			    (strcmp(cname, "md") != 0 && strcmp(cname, "md_d") != 0)) {
+				pr_err("%s is an invalid name for an md device.  Try /dev/md/%s\n",
+					dev, dev+5);
+				return -1;
+			}
+			if (strcmp(cname, "md") == 0)
+				use_mdp = 0;
+			else
+				use_mdp = 1;
+			/* recreate name: /dev/md/0 or /dev/md/d0 */
+			sprintf(cname, "%s%d", use_mdp?"d":"", num);
+		} else
+			strcpy(cname, dev);
+
+		/* 'cname' must not contain a slash, and may not be
+		 * empty.
+		 */
+		if (strchr(cname, '/') != NULL) {
+			pr_err("%s is an invalid name for an md device.\n", dev);
+			return -1;
+		}
+		if (cname[0] == 0) {
+			pr_err("%s is an invalid name for an md device (empty!).\n", dev);
+			return -1;
+		}
+		if (num < 0) {
+			/* If cname is 'N' or 'dN', we get dev number
+			 * from there.
+			 */
+			char *sp = cname;
+			char *ep;
+			if (cname[0] == 'd')
+				sp++;
+			if (isdigit((unsigned char)sp[0]))
+				num = strtoul(sp, &ep, 10);
+			else
+				ep = sp;
+			if (ep == sp || *ep || num < 0)
+				num = -1;
+			else if (cname[0] == 'd')
+				use_mdp = 1;
+			else
+				use_mdp = 0;
+		}
+	}
+
+	/* Now determine device number */
+	/* named 'METADATA' cannot use 'mdp'. */
+	if (name && name[0] == 0)
+		name = NULL;
+	if (name && trustworthy == METADATA && use_mdp == 1) {
+		pr_err("%s is not allowed for a %s container. Consider /dev/md%d.\n", dev, name, num);
+		return -1;
+	}
+	if (name && trustworthy == METADATA)
+		use_mdp = 0;
+	if (use_mdp == -1) {
+		if (autof == 4 || autof == 6)
+			use_mdp = 1;
+		else
+			use_mdp = 0;
+	}
+	if (num < 0 && trustworthy == LOCAL && name) {
+		/* if name is numeric, possibly prefixed by
+		 * 'md' or '/dev/md', use that for num
+		 * if it is not already in use */
+		char *ep;
+		char *n2 = name;
+		if (strncmp(n2, "/dev/", 5) == 0)
+			n2 += 5;
+		if (strncmp(n2, "md", 2) == 0)
+			n2 += 2;
+		if (*n2 == '/')
+			n2++;
+		num = strtoul(n2, &ep, 10);
+		if (ep == n2 || *ep)
+			num = -1;
+		else {
+			sprintf(devnm, "md%s%d", use_mdp ? "_d":"", num);
+			if (mddev_busy(devnm))
+				num = -1;
+		}
+	}
+
+	if (cname[0] == 0 && name) {
+		/* Need to find a name if we can
+		 * We don't completely trust 'name'.  Truncate to
+		 * reasonable length and remove '/'
+		 */
+		char *cp;
+		struct map_ent *map = NULL;
+		int conflict = 1;
+		int unum = 0;
+		int cnlen;
+		strncpy(cname, name, 200);
+		cname[200] = 0;
+		for (cp = cname; *cp ; cp++)
+			switch (*cp) {
+			case '/':
+				*cp = '-';
+				break;
+			case ' ':
+			case '\t':
+				*cp = '_';
+				break;
+			}
+
+		if (trustworthy == LOCAL ||
+		    (trustworthy == FOREIGN && strchr(cname, ':') != NULL)) {
+			/* Only need suffix if there is a conflict */
+			if (map_by_name(&map, cname) == NULL)
+				conflict = 0;
+		}
+		cnlen = strlen(cname);
+		while (conflict) {
+			if (trustworthy == METADATA &&
+			    !isdigit((unsigned char)cname[cnlen-1]))
+				sprintf(cname+cnlen, "%d", unum);
+			else
+				/* add _%d to FOREIGN arrays that don't
+				 * have a 'host:' prefix
+				 */
+				sprintf(cname+cnlen, "_%d", unum);
+			unum++;
+			if (map_by_name(&map, cname) == NULL)
+				conflict = 0;
+		}
+	}
+
+	devnm[0] = 0;
+	/*
+	 * Try a named array ("md_NAME") only when "md_" + cname fits in
+	 * devnm[]; cname may be up to 200 characters long, which would
+	 * otherwise overrun the buffer.  A name that is too long falls
+	 * through to the numbered-device path below, exactly as when
+	 * create_named_array() fails.
+	 */
+	if (num < 0 && ci->names &&
+	    strlen(cname) + sizeof("md_") <= sizeof(devnm)) {
+		sprintf(devnm, "md_%s", cname);
+		if (block_udev)
+			udev_block(devnm);
+		if (!create_named_array(devnm)) {
+			devnm[0] = 0;
+			udev_unblock();
+		}
+	}
+	if (num >= 0) {
+		sprintf(devnm, "md%d", num);
+		if (block_udev)
+			udev_block(devnm);
+		if (!create_named_array(devnm)) {
+			devnm[0] = 0;
+			udev_unblock();
+		}
+	}
+	if (devnm[0] == 0) {
+		if (num < 0) {
+			/* need to choose a free number. */
+			char *_devnm = find_free_devnm(use_mdp);
+			if (_devnm == NULL) {
+				pr_err("No avail md devices - aborting\n");
+				return -1;
+			}
+			strcpy(devnm, _devnm);
+		} else {
+			sprintf(devnm, "%s%d", use_mdp?"md_d":"md", num);
+			if (mddev_busy(devnm)) {
+				pr_err("%s is already in use.\n",
+				       dev);
+				return -1;
+			}
+		}
+		if (block_udev)
+			udev_block(devnm);
+	}
+
+	sprintf(devname, "/dev/%s", devnm);
+
+	if (dev && dev[0] == '/')
+		strcpy(chosen, dev);
+	else if (cname[0] == 0)
+		strcpy(chosen, devname);
+
+	/* We have a device number and name.
+	 * If we cannot detect udev, we need to make
+	 * devices and links ourselves.
+	 */
+	if (!use_udev()) {
+		/* Make sure 'devname' exists and 'chosen' is a symlink to it */
+		if (lstat(devname, &stb) == 0) {
+			/* Must be the correct device, else error */
+			if ((stb.st_mode&S_IFMT) != S_IFBLK ||
+			    stb.st_rdev != devnm2devid(devnm)) {
+				pr_err("%s exists but looks wrong, please fix\n",
+					devname);
+				return -1;
+			}
+		} else {
+			if (mknod(devname, S_IFBLK|0600,
+				  devnm2devid(devnm)) != 0) {
+				pr_err("failed to create %s\n",
+					devname);
+				return -1;
+			}
+			if (chown(devname, ci->uid, ci->gid))
+				perror("chown");
+			if (chmod(devname, ci->mode))
+				perror("chmod");
+			/* only register the node if 'stb' really got filled in */
+			if (stat(devname, &stb) == 0)
+				add_dev(devname, &stb, 0, NULL);
+		}
+		if (use_mdp == 1)
+			make_parts(devname, parts);
+
+		if (strcmp(chosen, devname) != 0) {
+			if (mkdir("/dev/md",0700) == 0) {
+				if (chown("/dev/md", ci->uid, ci->gid))
+					perror("chown /dev/md");
+				if (chmod("/dev/md", ci->mode| ((ci->mode>>2) & 0111)))
+					perror("chmod /dev/md");
+			}
+
+			if (dev && strcmp(chosen, dev) == 0)
+				/* We know we are allowed to use this name */
+				unlink(chosen);
+
+			if (lstat(chosen, &stb) == 0) {
+				char buf[300];
+				ssize_t link_len = readlink(chosen, buf, sizeof(buf)-1);
+				if (link_len >= 0)
+					buf[link_len] = '\0';
+
+				if ((stb.st_mode & S_IFMT) != S_IFLNK ||
+				    link_len < 0 ||
+				    strcmp(buf, devname) != 0) {
+					pr_err("%s exists - ignoring\n",
+						chosen);
+					strcpy(chosen, devname);
+				}
+			} else if (symlink(devname, chosen) != 0)
+				pr_err("failed to create %s: %s\n",
+					chosen, strerror(errno));
+			if (use_mdp && strcmp(chosen, devname) != 0)
+				make_parts(chosen, parts);
+		}
+	}
+	mdfd = open_dev_excl(devnm);
+	if (mdfd < 0)
+		pr_err("unexpected failure opening %s\n",
+			devname);
+	return mdfd;
+}
+
+/*
+ * Open 'dev' read-only and verify that it is an md device.
+ *
+ * Returns the open file descriptor on success, -1 when open() fails, or
+ * -2 when the node opens but md_array_valid() rejects it (the descriptor
+ * is closed again in that case).  When 'report_errors' is non-zero the
+ * failure is reported through pr_err().
+ */
+int open_mddev(char *dev, int report_errors)
+{
+	int fd;
+
+	fd = open(dev, O_RDONLY);
+	if (fd < 0) {
+		if (report_errors)
+			pr_err("error opening %s: %s\n",
+			       dev, strerror(errno));
+		return -1;
+	}
+
+	if (md_array_valid(fd) != 0)
+		return fd;
+
+	/* opened fine, but it is not an md array */
+	close(fd);
+	if (report_errors)
+		pr_err("%s does not appear to be an md device\n", dev);
+	return -2;
+}
+
+/*
+ * Find an md device name that is not in use.
+ *
+ * Candidate numbers are tried from 127 down to 0 and then from 511 down
+ * to 129; 128 is the end-of-search marker and is never tried itself.
+ * A candidate is rejected when mddev_busy() reports it, when
+ * conf_name_is_free() says it is taken, or - without udev - when its
+ * device number already maps to a non-standard name.
+ *
+ * Returns a pointer to a static buffer holding "mdN" (or "md_dN" when
+ * 'use_partitions' is set), or NULL when every candidate is taken.
+ */
+char *find_free_devnm(int use_partitions)
+{
+	static char devnm[32];
+	int devnum = 127;
+
+	while (devnum != 128) {
+		int taken;
+
+		if (use_partitions)
+			sprintf(devnm, "md_d%d", devnum);
+		else
+			sprintf(devnm, "md%d", devnum);
+
+		taken = mddev_busy(devnm) || !conf_name_is_free(devnm);
+		if (!taken && !use_udev()) {
+			/* make sure it is new to /dev too, at least as a
+			 * non-standard */
+			dev_t devid = devnm2devid(devnm);
+
+			if (devid) {
+				char *dn = map_dev(major(devid),
+						   minor(devid), 0);
+
+				if (dn && !is_standard(dn, NULL))
+					taken = 1;
+			}
+		}
+		if (!taken)
+			return devnm;
+
+		/* step down; after 0 wrap to the top of the 9-bit range */
+		devnum = devnum ? devnum - 1 : (1 << 9) - 1;
+	}
+	return NULL;
+}
diff --git a/mdstat.c b/mdstat.c
new file mode 100644
index 0000000..2fd792c
--- /dev/null
+++ b/mdstat.c
@@ -0,0 +1,441 @@
+/*
+ * mdstat - parse /proc/mdstat file. Part of:
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2002-2009 Neil Brown <neilb@suse.de>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email: <neilb@suse.de>
+ */
+
+/*
+ * The /proc/mdstat file comes in at least 3 flavours:
+ * In an unpatched 2.2 kernel (md 0.36.6):
+ * Personalities : [n raidx] ...
+ * read_ahead {not set|%d sectors}
+ * md0 : {in}active{ raidX /dev/hda... %d blocks{ maxfault=%d}}
+ * md1 : .....
+ *
+ * Normally only 4 md lines, but all are listed.
+ *
+ * In a patched 2.2 kernel (md 0.90.0)
+ * Personalities : [raidx] ...
+ * read_ahead {not set|%d sectors}
+ * mdN : {in}active {(readonly)} raidX dev[%d]{(F)} ... %d blocks STATUS RESYNC
+ * ... Only initialised arrays listed
+ * unused devices: {dev dev ... | <none>}
+ *
+ * STATUS is personality dependent:
+ * linear: %dk rounding
+ * raid0: %dk chunks
+ * raid1: [%d/%d] [U_U] ( raid/working. operational or not)
+ * raid5: level 4/5, %dk chunk, algorithm %d [%d/%d] [U_U]
+ *
+ * RESYNC is empty or:
+ * {resync|recovery}=%u%% finish=%u.%umin
+ * or
+ * resync=DELAYED
+ *
+ * In a 2.4 kernel (md 0.90.0/2.4)
+ * Personalities : [raidX] ...
+ * read_ahead {not set|%d sectors}
+ * mdN : {in}active {(read-only)} raidX dev[%d]{(F)} ...
+ * %d blocks STATUS
+ * RESYNC
+ * unused devices: {dev dev .. | <none>}
+ *
+ * STATUS matches 0.90.0/2.2
+ * RESYNC includes [===>....],
+ * adds a space after {resync|recovery} and before and after '='
+ * adds a decimal to the recovery percent.
+ * adds (%d/%d) resync amount and max_blocks, before finish.
+ * adds speed=%dK/sec after finish
+ *
+ *
+ *
+ * Out of this we want to extract:
+ * list of devices, active or not
+ * pattern of failed drives (so need number of drives)
+ * percent resync complete
+ *
+ * As continuation is indicated by leading space, we use
+ * conf_line from config.c to read logical lines
+ *
+ */
+
+#include "mdadm.h"
+#include "dlink.h"
+#include <sys/select.h>
+#include <ctype.h>
+
+/* Release every node of a dev_member list together with its name string. */
+static void free_member_devnames(struct dev_member *m)
+{
+	struct dev_member *next;
+
+	for (; m; m = next) {
+		next = m->next;
+		free(m->name);
+		free(m);
+	}
+}
+
+/*
+ * Prepend a member device to the list '*m'.
+ *
+ * 'name' is one word from an mdstat line; a member device looks like
+ * "sda1[0]", and only the part before '[' is stored.  Words without a
+ * '[' are not devices and are ignored.
+ *
+ * Returns 1 if an entry was added, 0 otherwise.
+ */
+static int add_member_devname(struct dev_member **m, char *name)
+{
+	struct dev_member *new;
+	size_t len;
+	char *t;
+
+	t = strchr(name, '[');
+	if (t == NULL)
+		/* not a device */
+		return 0;
+
+	len = t - name;
+	new = xmalloc(sizeof(*new));
+	/*
+	 * Copy via xmalloc() rather than strndup(): the strndup() result
+	 * was never checked, and a NULL name would later be handed to
+	 * strcmp().  xmalloc() is used unchecked throughout this file, so
+	 * it is presumably the project's abort-on-failure allocator.
+	 */
+	new->name = xmalloc(len + 1);
+	memcpy(new->name, name, len);
+	new->name[len] = '\0';
+	new->next = *m;
+	*m = new;
+	return 1;
+}
+
+/*
+ * Free a list of mdstat entries: for every entry the level, pattern and
+ * metadata_version strings, the member list and the entry itself.
+ * Passing NULL is a no-op.
+ */
+void free_mdstat(struct mdstat_ent *ms)
+{
+	struct mdstat_ent *next;
+
+	for (; ms; ms = next) {
+		next = ms->next;
+		free(ms->level);
+		free(ms->pattern);
+		free(ms->metadata_version);
+		free_member_devnames(ms->members);
+		free(ms);
+	}
+}
+
+static int mdstat_fd = -1;
+
+/*
+ * Parse /proc/mdstat into a list of mdstat_ent.
+ *
+ * hold:  if non-zero, a private descriptor for /proc/mdstat is kept in
+ *        mdstat_fd after the first call and re-used (rewound and
+ *        dup()ed) by later calls, so that mdstat_wait() can select on it.
+ * start: if non-zero the resulting list is reversed before it is
+ *        returned, so that components come before composites.
+ *
+ * An array that has another md device as a member is inserted in front
+ * of that member if the member is already on the list.
+ *
+ * Returns the list (to be released with free_mdstat()), or NULL if the
+ * file cannot be opened or contains no md lines.
+ */
+struct mdstat_ent *mdstat_read(int hold, int start)
+{
+	FILE *f;
+	struct mdstat_ent *all, *rv, **end, **insert_here;
+	char *line;
+	int fd;
+
+	if (hold && mdstat_fd != -1) {
+		off_t offset = lseek(mdstat_fd, 0L, 0);
+		if (offset == (off_t)-1) {
+			return NULL;
+		}
+		fd = dup(mdstat_fd);
+		if (fd < 0)
+			return NULL;
+		f = fdopen(fd, "r");
+		if (f == NULL) {
+			/* the stream never took ownership of the copy */
+			close(fd);
+			return NULL;
+		}
+	} else
+		f = fopen("/proc/mdstat", "r");
+	if (f == NULL)
+		return NULL;
+	fcntl(fileno(f), F_SETFD, FD_CLOEXEC);
+
+	all = NULL;
+	end = &all;
+	for (; (line = conf_line(f)) ; free_line(line)) {
+		struct mdstat_ent *ent;
+		char *w;
+		char devnm[32];
+		int in_devs = 0;
+
+		if (strcmp(line, "Personalities") == 0)
+			continue;
+		if (strcmp(line, "read_ahead") == 0)
+			continue;
+		if (strcmp(line, "unused") == 0)
+			continue;
+		insert_here = NULL;
+		/* Better be an md line.. */
+		if (strncmp(line, "md", 2)!= 0 || strlen(line) >= 32 ||
+		    (line[2] != '_' && !isdigit((unsigned char)line[2])))
+			continue;
+		strcpy(devnm, line);
+
+		ent = xmalloc(sizeof(*ent));
+		ent->level = ent->pattern= NULL;
+		ent->next = NULL;
+		ent->percent = RESYNC_NONE;
+		ent->active = -1;
+		ent->resync = 0;
+		ent->metadata_version = NULL;
+		ent->raid_disks = 0;
+		ent->devcnt = 0;
+		ent->members = NULL;
+
+		strcpy(ent->devnm, devnm);
+
+		for (w=dl_next(line); w!= line ; w=dl_next(w)) {
+			int l = strlen(w);
+			char *eq;
+			if (strcmp(w, "active") == 0)
+				ent->active = 1;
+			else if (strcmp(w, "inactive") == 0) {
+				ent->active = 0;
+				in_devs = 1;
+			} else if (strcmp(w, "bitmap:") == 0) {
+				/* We need to stop parsing here;
+				 * otherwise, ent->raid_disks will be
+				 * overwritten by the wrong value.
+				 */
+				break;
+			} else if (ent->active > 0 &&
+				 ent->level == NULL &&
+				 w[0] != '(' /*readonly*/) {
+				ent->level = xstrdup(w);
+				in_devs = 1;
+			} else if (in_devs && strcmp(w, "blocks") == 0)
+				in_devs = 0;
+			else if (in_devs) {
+				char *ep = strchr(w, '[');
+				ent->devcnt +=
+					add_member_devname(&ent->members, w);
+				if (ep && strncmp(w, "md", 2) == 0) {
+					/* This has an md device as a component.
+					 * If that device is already in the
+					 * list, make sure we insert before
+					 * there.
+					 */
+					struct mdstat_ent **ih;
+					ih = &all;
+					while (ih != insert_here && *ih &&
+					       ((int)strlen((*ih)->devnm) !=
+						ep-w ||
+						strncmp((*ih)->devnm, w,
+							ep-w) != 0))
+						ih = & (*ih)->next;
+					insert_here = ih;
+				}
+			} else if (strcmp(w, "super") == 0 &&
+				   dl_next(w) != line) {
+				w = dl_next(w);
+				ent->metadata_version = xstrdup(w);
+			} else if (w[0] == '[' && isdigit((unsigned char)w[1])) {
+				ent->raid_disks = atoi(w+1);
+			} else if (!ent->pattern &&
+				   w[0] == '[' &&
+				   (w[1] == 'U' || w[1] == '_')) {
+				ent->pattern = xstrdup(w+1);
+				if (ent->pattern[l-2] == ']')
+					ent->pattern[l-2] = '\0';
+			} else if (ent->percent == RESYNC_NONE &&
+				   strncmp(w, "re", 2) == 0 &&
+				   w[l-1] == '%' &&
+				   (eq = strchr(w, '=')) != NULL ) {
+				ent->percent = atoi(eq+1);
+				if (strncmp(w,"resync", 6) == 0)
+					ent->resync = 1;
+				else if (strncmp(w, "reshape", 7) == 0)
+					ent->resync = 2;
+				else
+					ent->resync = 0;
+			} else if (ent->percent == RESYNC_NONE &&
+				   (w[0] == 'r' || w[0] == 'c')) {
+				if (strncmp(w, "resync", 6) == 0)
+					ent->resync = 1;
+				if (strncmp(w, "reshape", 7) == 0)
+					ent->resync = 2;
+				if (strncmp(w, "recovery", 8) == 0)
+					ent->resync = 0;
+				if (strncmp(w, "check", 5) == 0)
+					ent->resync = 3;
+
+				if (l > 8 && strcmp(w+l-8, "=DELAYED") == 0)
+					ent->percent = RESYNC_DELAYED;
+				if (l > 8 && strcmp(w+l-8, "=PENDING") == 0)
+					ent->percent = RESYNC_PENDING;
+				if (l > 7 && strcmp(w+l-7, "=REMOTE") == 0)
+					ent->percent = RESYNC_REMOTE;
+			} else if (ent->percent == RESYNC_NONE &&
+				   w[0] >= '0' &&
+				   w[0] <= '9' &&
+				   w[l-1] == '%') {
+				ent->percent = atoi(w);
+			}
+		}
+		if (insert_here && (*insert_here)) {
+			ent->next = *insert_here;
+			*insert_here = ent;
+		} else {
+			*end = ent;
+			end = &ent->next;
+		}
+	}
+	if (hold && mdstat_fd == -1) {
+		mdstat_fd = dup(fileno(f));
+		if (mdstat_fd >= 0)
+			fcntl(mdstat_fd, F_SETFD, FD_CLOEXEC);
+	}
+	fclose(f);
+
+	/* If we might want to start array,
+	 * reverse the order, so that components comes before composites
+	 */
+	if (start) {
+		rv = NULL;
+		while (all) {
+			struct mdstat_ent *e = all;
+			all = all->next;
+			e->next = rv;
+			rv = e;
+		}
+	} else
+		rv = all;
+	return rv;
+}
+
+/* Drop the /proc/mdstat descriptor kept by mdstat_read(hold), if any. */
+void mdstat_close(void)
+{
+	int fd = mdstat_fd;
+
+	mdstat_fd = -1;
+	if (fd >= 0)
+		close(fd);
+}
+
+/*
+ * mdstat_wait() - wait for an exceptional condition on the held
+ * /proc/mdstat descriptor.
+ *
+ * seconds: timeout handed to select().
+ *
+ * Returns the select() result: > 0 when an event was detected, 0 on
+ * timeout, < 0 on error.  -1 is also returned when no descriptor is
+ * currently held.
+ */
+int mdstat_wait(int seconds)
+{
+	struct timeval timeout;
+	fd_set exceptfds;
+
+	if (mdstat_fd < 0)
+		return -1;
+
+	FD_ZERO(&exceptfds);
+	FD_SET(mdstat_fd, &exceptfds);
+
+	timeout.tv_sec = seconds;
+	timeout.tv_usec = 0;
+
+	return select(mdstat_fd + 1, NULL, NULL, &exceptfds, &timeout);
+}
+
+/*
+ * Block in pselect() until something happens on the held /proc/mdstat
+ * descriptor or on 'fd' (if fd >= 0), or until a signal allowed by
+ * 'sigmask' arrives.
+ *
+ * A regular file is assumed to be a /proc or /sys attribute and is
+ * watched for exceptional conditions; anything else is watched for
+ * readability.
+ */
+void mdstat_wait_fd(int fd, const sigset_t *sigmask)
+{
+	fd_set fds, rfds;
+	int maxfd = 0;
+
+	FD_ZERO(&fds);
+	FD_ZERO(&rfds);
+	if (mdstat_fd >= 0)
+		FD_SET(mdstat_fd, &fds);
+
+	if (fd >= 0) {
+		struct stat stb;
+
+		/*
+		 * Do not inspect 'stb' if fstat() failed.  For an invalid
+		 * descriptor pselect() would fail straight away, so
+		 * returning here keeps the "no wait" outcome without
+		 * reading uninitialised memory.
+		 */
+		if (fstat(fd, &stb) != 0)
+			return;
+		if ((stb.st_mode & S_IFMT) == S_IFREG)
+			/* Must be a /proc or /sys fd, so expect
+			 * POLLPRI
+			 * i.e. an 'exceptional' event.
+			 */
+			FD_SET(fd, &fds);
+		else
+			FD_SET(fd, &rfds);
+
+		if (fd > maxfd)
+			maxfd = fd;
+
+	}
+	if (mdstat_fd > maxfd)
+		maxfd = mdstat_fd;
+
+	pselect(maxfd + 1, &rfds, NULL, &fds,
+		NULL, sigmask);
+}
+
+/*
+ * Report whether 'devnm' (e.g. "md0") is currently listed in
+ * /proc/mdstat.  Returns 1 if it is, 0 otherwise.
+ */
+int mddev_busy(char *devnm)
+{
+	struct mdstat_ent *mdstat = mdstat_read(0, 0);
+	struct mdstat_ent *me;
+	int busy = 0;
+
+	for (me = mdstat; me; me = me->next)
+		if (strcmp(me->devnm, devnm) == 0) {
+			busy = 1;
+			break;
+		}
+	/* record the answer before the list (and 'me') is freed */
+	free_mdstat(mdstat);
+	return busy;
+}
+
+/*
+ * Find the array in /proc/mdstat that has 'name' among its member
+ * devices.  Arrays whose metadata version is "external:" followed by
+ * something is_subarray() accepts are skipped, so that containers are
+ * returned rather than their subarrays.
+ *
+ * Returns a single detached entry (to be released with free_mdstat()),
+ * or NULL when no array matches.
+ */
+struct mdstat_ent *mdstat_by_component(char *name)
+{
+	struct mdstat_ent *cur = mdstat_read(0, 0);
+
+	while (cur) {
+		struct mdstat_ent *rest = cur->next;
+		char *mv = cur->metadata_version;
+		int subarray;
+
+		subarray = mv &&
+			   strncmp(mv, "external:", 9) == 0 &&
+			   is_subarray(mv + 9);
+		if (!subarray) {
+			struct dev_member *m;
+
+			for (m = cur->members; m; m = m->next) {
+				if (strcmp(m->name, name) != 0)
+					continue;
+				/* found it: drop the tail, hand back this one */
+				free_mdstat(rest);
+				cur->next = NULL;
+				return cur;
+			}
+		}
+		/* no match: release this entry alone and move on */
+		cur->next = NULL;
+		free_mdstat(cur);
+		cur = rest;
+	}
+	return NULL;
+}
+
+/*
+ * Find the array in /proc/mdstat whose metadata version has the form
+ *    external:[/-]%s/%s
+ * where the first %s is 'container' and the second %s is 'subdev'.
+ *
+ * Returns a single detached entry (to be released with free_mdstat()),
+ * or NULL when there is no such array.
+ */
+struct mdstat_ent *mdstat_by_subdev(char *subdev, char *container)
+{
+	struct mdstat_ent *mdstat = mdstat_read(0, 0);
+
+	while (mdstat) {
+		struct mdstat_ent *ent = mdstat;
+
+		/* detach the head so it can be returned or freed alone */
+		mdstat = mdstat->next;
+		ent->next = NULL;
+
+		if (ent->metadata_version != NULL &&
+		    strncmp(ent->metadata_version, "external:", 9) == 0 &&
+		    metadata_container_matches(ent->metadata_version+9,
+					       container) &&
+		    metadata_subdev_matches(ent->metadata_version+9,
+					    subdev)) {
+			free_mdstat(mdstat);
+			return ent;
+		}
+
+		/*
+		 * Not the one we want.  Free it right away so that the
+		 * last entry examined is not leaked when nothing matches.
+		 */
+		free_mdstat(ent);
+	}
+	return NULL;
+}
diff --git a/misc/mdcheck b/misc/mdcheck
new file mode 100644
index 0000000..700c3e2
--- /dev/null
+++ b/misc/mdcheck
@@ -0,0 +1,166 @@
+#!/bin/bash
+
+# Copyright (C) 2014-2017 Neil Brown <neilb@suse.de>
+#
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# Author: Neil Brown
+# Email: <neilb@suse.com>
+
+# This script should be run periodically to automatically
+# perform a 'check' on any md arrays.
+#
+# It supports a 'time budget' such that any incomplete 'check'
+# will be checkpointed when that time has expired.
+# A subsequent invocation can allow the 'check' to continue.
+#
+# Options are:
+# --continue Don't start new checks, only continue old ones.
+# --duration This is passed to "date --date=$duration" to find out
+# when to finish
+#
+# To support '--continue', arrays are identified by UUID and the 'sync_completed'
+# value is stored in /var/lib/mdcheck/$UUID
+
+# convert a /dev/md name into /sys/.../md equivalent
+#
+# $1 is a device node (symlinks are followed because of 'ls -L').
+# The output of 'ls -lLd' is split into positional parameters; $5 and $6
+# are used as the "MAJOR," and "MINOR" columns, and the trailing comma is
+# stripped from the major number.
+# Prints the resolved path of /sys/dev/block/MAJOR:MINOR on stdout.
+# Note: this overwrites the function's positional parameters and sets
+# the global variables 'maj' and 'min'.
+sysname() {
+ set `ls -lLd $1`
+ maj=${5%,}
+ min=$6
+ readlink -f /sys/dev/block/$maj:$min
+}
+
+args=$(getopt -o hcd: -l help,continue,duration: -n mdcheck -- "$@")
+rv=$?
+if [ $rv -ne 0 ]; then exit $rv; fi
+
+eval set -- $args
+
+cont=
+endtime=
+while [ " $1" != " --" ]
+do
+ case $1 in
+ --help )
+ echo >&2 'Usage: mdcheck [--continue] [--duration time-offset]'
+ echo >&2 ' time-offset must be understood by "date --date"'
+ exit 0
+ ;;
+ --continue ) cont=yes ;;
+ --duration ) shift; dur=$1
+ endtime=$(date --date "$dur" "+%s")
+ ;;
+ esac
+ shift
+done
+shift
+
+# We need a temp file occasionally...
+tmp=/var/lib/mdcheck/.md-check-$$
+trap 'rm -f "$tmp"' 0 2 3 15
+
+
+# firstly, clean out really old state files
+mkdir -p /var/lib/mdcheck
+find /var/lib/mdcheck -name "MD_UUID*" -type f -mtime +180 -exec rm {} \;
+
+# Now look at each md device.
+cnt=0
+for dev in /dev/md?*
+do
+ [ -e "$dev" ] || continue
+ sys=`sysname $dev`
+ if [ ! -f "$sys/md/sync_action" ]
+ then # cannot check this array
+ continue
+ fi
+ if [ "`cat $sys/md/sync_action`" != 'idle' ]
+ then # This array is busy
+ continue
+ fi
+
+ mdadm --detail --export "$dev" | grep '^MD_UUID=' > $tmp || continue
+ source $tmp
+ fl="/var/lib/mdcheck/MD_UUID_$MD_UUID"
+ if [ -z "$cont" ]
+ then
+ start=0
+ logger -p daemon.info mdcheck start checking $dev
+ elif [ -z "$MD_UUID" -o ! -f "$fl" ]
+ then
+ # Nothing to continue here
+ continue
+ else
+ start=`cat "$fl"`
+ logger -p daemon.info mdcheck continue checking $dev from $start
+ fi
+
+ cnt=$[cnt+1]
+ eval MD_${cnt}_fl=\$fl
+ eval MD_${cnt}_sys=\$sys
+ eval MD_${cnt}_dev=\$dev
+ echo $start > $fl
+ echo $start > $sys/md/sync_min
+ echo check > $sys/md/sync_action
+done
+
+# Without --duration there is no time budget: leave the checks running.
+if [ -z "$endtime" ]
+then
+ exit 0
+fi
+
+# Until the end time, poll every 120 seconds: checkpoint the progress of
+# each running check into its state file, and forget arrays whose check
+# is no longer running.  Exit early once no check is left.
+while [ `date +%s` -lt $endtime ]
+do
+ any=
+ # With cnt=0 the brace expansion yields "1 0"; those indexes have an
+ # empty MD_<n>_fl and are skipped by the -z test below.
+ for i in `eval echo {1..$cnt}`
+ do
+ eval fl=\$MD_${i}_fl
+ eval sys=\$MD_${i}_sys
+ eval dev=\$MD_${i}_dev
+
+ if [ -z "$fl" ]; then continue; fi
+
+ if [ "`cat $sys/md/sync_action`" != 'check' ]
+ then
+ logger -p daemon.info mdcheck finished checking $dev
+ # Clearing MD_<n>_fl marks this slot as done.
+ eval MD_${i}_fl=
+ rm -f $fl
+ continue;
+ fi
+ # Only the first word of sync_completed is saved.
+ read a rest < $sys/md/sync_completed
+ echo $a > $fl
+ any=yes
+ done
+ if [ -z "$any" ]; then exit 0; fi
+ sleep 120
+done
+
+# We've waited, and there are still checks running.
+# Time to stop them.
+# Each check that is still running is set back to 'idle', and the value
+# then read from sync_min is saved as the resume position for a later
+# --continue run.
+for i in `eval echo {1..$cnt}`
+do
+ eval fl=\$MD_${i}_fl
+ eval sys=\$MD_${i}_sys
+ eval dev=\$MD_${i}_dev
+
+ if [ -z "$fl" ]; then continue; fi
+
+ if [ "`cat $sys/md/sync_action`" != 'check' ]
+ then
+ # Finished in the meantime: nothing to resume, drop the state file.
+ eval MD_${i}_fl=
+ rm -f $fl
+ continue;
+ fi
+ echo idle > $sys/md/sync_action
+ cat $sys/md/sync_min > $fl
+ logger -p daemon.info pause checking $dev at `cat $fl`
+done
diff --git a/misc/syslog-events b/misc/syslog-events
new file mode 100644
index 0000000..fe8c14e
--- /dev/null
+++ b/misc/syslog-events
@@ -0,0 +1,27 @@
+#!/bin/sh
+#
+# sample event handling script for mdadm
+# e.g. mdadm --follow --program=/sbin/syslog-events --scan
+#
+# License: GPL ver.2
+# Copyright (C) 2004 SEKINE Tatsuo <tsekine@sdri.co.jp>
+
+event="$1"
+dev="$2"
+disc="$3"
+
+facility="kern"
+tag="mdmonitor"
+
+case x"${event}" in
+ xFail*) priority="error" ;;
+ xTest*) priority="debug" ;;
+ x*) priority="info" ;;
+esac
+
+msg="${event} event on ${dev}"
+if [ x"${disc}" != x ]; then
+ msg="${msg}, related to disc ${disc}"
+fi
+
+exec logger -t "${tag}" -p "${facility}.${priority}" -- "${msg}"
diff --git a/mkinitramfs b/mkinitramfs
new file mode 100644
index 0000000..c6275dd
--- /dev/null
+++ b/mkinitramfs
@@ -0,0 +1,55 @@
+#!/bin/sh
+
+# make sure we are being run in the right directory...
+if [ -f mkinitramfs ]
+then :
+else
+	echo >&2 mkinitramfs must be run from the mdadm source directory.
+	exit 1
+fi
+
+# A statically linked busybox provides the shell and tools of the image.
+if [ -f /bin/busybox ]
+then : good, it exists
+	case `file /bin/busybox` in
+	*statically* ) : good ;;
+	* ) echo >&2 mkinitramfs: /bin/busybox is not statically linked: cannot proceed.
+		exit 1
+	esac
+else
+	echo >&2 "mkinitramfs: /bin/busybox doesn't exist - please install it statically linked."
+	exit 1
+fi
+
+# Populate a scratch tree with a static mdadm and busybox.
+rm -rf initramfs
+mkdir initramfs
+mkdir initramfs/bin
+# Stop here if the build fails rather than packing an image without mdadm.
+make mdadm.static || exit 1
+cp mdadm.static initramfs/bin/mdadm
+cp /bin/busybox initramfs/bin/busybox
+ln initramfs/bin/busybox initramfs/bin/sh
+
+# Generate /init.  The delimiter is quoted so that $rootuuid, $mdminor and
+# $arg are written literally and evaluated when init runs, not expanded
+# (to empty strings) while this script builds the image.  '<<-' strips the
+# leading tabs from each line.
+cat <<- 'END' > initramfs/init
+	#!/bin/sh
+
+	echo 'Auto-assembling boot md array'
+	mkdir /proc
+	mount -t proc proc /proc
+	if [ -n "$rootuuid" ]
+	then arg=--uuid=$rootuuid
+	elif [ -n "$mdminor" ]
+	then arg=--super-minor=$mdminor
+	else arg=--super-minor=0
+	fi
+	echo "Using $arg"
+	mdadm -Acpartitions $arg --auto=part /dev/mda
+	cd /
+	mount /dev/mda1 /root ||  mount /dev/mda /root
+	umount /proc
+	cd /root
+	exec chroot . /sbin/init < /dev/console > /dev/console 2>&1
+END
+chmod +x initramfs/init
+
+# Pack the tree as a gzip-compressed newc cpio archive and clean up.
+(cd initramfs
+ find init bin | cpio -o -H newc | gzip --best
+) > init.cpio.gz
+rm -rf initramfs
+ls -l init.cpio.gz
diff --git a/monitor.c b/monitor.c
new file mode 100644
index 0000000..e0d3be6
--- /dev/null
+++ b/monitor.c
@@ -0,0 +1,909 @@
+/*
+ * mdmon - monitor external metadata arrays
+ *
+ * Copyright (C) 2007-2009 Neil Brown <neilb@suse.de>
+ * Copyright (C) 2007-2009 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "mdadm.h"
+#include "mdmon.h"
+#include <sys/syscall.h>
+#include <sys/select.h>
+#include <signal.h>
+
+/*
+ * Words that may be read from the array_state and sync_action sysfs
+ * attributes.  read_state() and read_action() cast the value returned by
+ * sysfs_match_word() for these tables to enum array_state /
+ * enum sync_action, so the order here presumably has to match those
+ * enums - verify against their declarations before reordering.
+ * Both tables are NULL-terminated.
+ */
+static char *array_states[] = {
+ "clear", "inactive", "suspended", "readonly", "read-auto",
+ "clean", "active", "write-pending", "active-idle", "broken", NULL };
+static char *sync_actions[] = {
+ "idle", "reshape", "resync", "recover", "check", "repair", NULL
+};
+
+/* What read_bb_file() should do with each bad-block entry it parses. */
+enum bb_action {
+ RECORD_BB = 1,
+ COMPARE_BB,
+};
+
+/*
+ * Write the string 'attr' (without its terminating NUL) to 'fd'.
+ * Returns whatever write() returns.
+ */
+static int write_attr(char *attr, int fd)
+{
+	size_t len = strlen(attr);
+
+	return write(fd, attr, len);
+}
+
+/*
+ * Add 'fd' to the set 'fds' and raise '*maxfd' if necessary.
+ * Negative descriptors, descriptors that fstat() rejects, and
+ * descriptors whose file has a link count of zero are left out.
+ */
+static void add_fd(fd_set *fds, int *maxfd, int fd)
+{
+	struct stat st;
+
+	if (fd < 0)
+		return;
+
+	if (fstat(fd, &st) == -1) {
+		dprintf("Invalid fd %d\n", fd);
+	} else if (st.st_nlink == 0) {
+		dprintf("fd %d was deleted\n", fd);
+	} else {
+		if (*maxfd < fd)
+			*maxfd = fd;
+		FD_SET(fd, fds);
+	}
+}
+
+/*
+ * Read an attribute from the start of 'fd' into 'buf' (capacity 'len').
+ * The result is always NUL-terminated and one trailing newline is
+ * removed.
+ *
+ * Returns the number of bytes read (counting a stripped newline), or 0
+ * with an empty 'buf' when 'fd' is negative or nothing could be read.
+ */
+static int read_attr(char *buf, int len, int fd)
+{
+	int n = 0;
+
+	if (fd >= 0) {
+		lseek(fd, 0, 0);
+		n = read(fd, buf, len - 1);
+	}
+	if (n <= 0) {
+		buf[0] = 0;
+		return 0;
+	}
+
+	buf[n] = 0;
+	if (buf[n - 1] == '\n')
+		buf[n - 1] = 0;
+	return n;
+}
+
+/*
+ * Read a resync/recovery start position from 'fd' into '*v'.
+ * The word "none" is stored as MaxSector; anything else is parsed as a
+ * decimal number.  '*v' is left untouched when nothing can be read.
+ */
+static void read_resync_start(int fd, unsigned long long *v)
+{
+	char buf[30];
+
+	if (read_attr(buf, 30, fd) <= 0) {
+		dprintf("Failed to read resync_start (%d)\n", fd);
+		return;
+	}
+
+	if (strncmp(buf, "none", 4) != 0)
+		*v = strtoull(buf, NULL, 10);
+	else
+		*v = MaxSector;
+}
+
+/*
+ * Read the leading number of the sync_completed attribute on 'fd'.
+ * The number (any base strtoull() detects) must be followed by end of
+ * string, a newline or a space.  Returns 0 if the attribute cannot be
+ * read or does not start with such a number.
+ */
+static unsigned long long read_sync_completed(int fd)
+{
+	char buf[50];
+	char *end;
+	unsigned long long completed;
+	int n = read_attr(buf, 50, fd);
+
+	if (n <= 0)
+		return 0;
+
+	buf[n] = 0;
+	completed = strtoull(buf, &end, 0);
+	if (end == buf)
+		return 0;
+
+	switch (*end) {
+	case 0:
+	case '\n':
+	case ' ':
+		return completed;
+	default:
+		return 0;
+	}
+}
+
+/*
+ * Read the array_state attribute on 'fd' and translate it through the
+ * array_states table.  Returns bad_word if nothing could be read.
+ */
+static enum array_state read_state(int fd)
+{
+	char word[20];
+
+	if (read_attr(word, 20, fd) <= 0)
+		return bad_word;
+
+	return (enum array_state) sysfs_match_word(word, array_states);
+}
+
+/*
+ * Read the sync_action attribute on 'fd' and translate it through the
+ * sync_actions table.  Returns bad_action if nothing could be read.
+ */
+static enum sync_action read_action( int fd)
+{
+	char word[20];
+
+	if (read_attr(word, 20, fd) <= 0)
+		return bad_action;
+
+	return (enum sync_action) sysfs_match_word(word, sync_actions);
+}
+
+/*
+ * Read a device state attribute from 'fd' - a comma separated list of
+ * flag words - and return the matching DS_* bits or'ed together.
+ * Returns 0 when the attribute cannot be read.
+ */
+int read_dev_state(int fd)
+{
+	char buf[100];
+	char *word;
+	int flags = 0;
+
+	if (read_attr(buf, sizeof(buf), fd) <= 0)
+		return 0;
+
+	for (word = buf; word; ) {
+		char *comma;
+
+		if (sysfs_attr_match(word, "faulty"))
+			flags |= DS_FAULTY;
+		if (sysfs_attr_match(word, "in_sync"))
+			flags |= DS_INSYNC;
+		if (sysfs_attr_match(word, "write_mostly"))
+			flags |= DS_WRITE_MOSTLY;
+		if (sysfs_attr_match(word, "spare"))
+			flags |= DS_SPARE;
+		if (sysfs_attr_match(word, "blocked"))
+			flags |= DS_BLOCKED;
+
+		/* continue with the text after the next comma, if any */
+		comma = strchr(word, ',');
+		word = comma ? comma + 1 : NULL;
+	}
+	return flags;
+}
+
+/*
+ * Handle one unacknowledged bad block of device 'mdi' in array 'a'.
+ *
+ * The block is recorded through the metadata handler first and then
+ * acknowledged to the driver by writing 'buf' ('buf_len' bytes) to the
+ * device's bb_fd.
+ *
+ * Returns 1 on success.  If either step fails, "-external_bbl" is
+ * written to the device's state attribute - switching off bad block
+ * support to get the device out of the blocked state - and -1 is
+ * returned.
+ */
+int process_ubb(struct active_array *a, struct mdinfo *mdi,
+		const unsigned long long sector, const int length,
+		const char *buf, const int buf_len)
+{
+	struct superswitch *ss = a->container->ss;
+	int acknowledged = 0;
+
+	if (ss->record_bad_block(a, mdi->disk.raid_disk, sector, length))
+		acknowledged = (write(mdi->bb_fd, buf, buf_len) == buf_len);
+
+	if (acknowledged)
+		return 1;
+
+	sysfs_set_str(&a->info, mdi, "state", "-external_bbl");
+	return -1;
+}
+
+/*
+ * Compare one bad block reported by the kernel (sector/length) for device
+ * 'mdi' against the metadata's bad block list passed in 'arg'
+ * (a struct md_bb *).
+ *
+ * - An entry that matches exactly is removed from the list and nothing
+ *   is recorded.
+ * - If a list entry fully contains the kernel's block, that entry is
+ *   cleared in the metadata and the kernel's block is then recorded.
+ * - A block found in neither way is recorded as well.
+ *
+ * Returns 1 on success.  If recording fails, "-external_bbl" is written
+ * to the device's state attribute and -1 is returned.
+ */
+int compare_bb(struct active_array *a, struct mdinfo *mdi, const unsigned long
+ long sector, const unsigned int length, void *arg)
+{
+ struct superswitch *ss = a->container->ss;
+ struct md_bb *bb = (struct md_bb *) arg;
+ int record = 1;
+ int i;
+
+ for (i = 0; i < bb->count; i++) {
+ unsigned long long start = bb->entries[i].sector;
+ unsigned long long len = bb->entries[i].length;
+
+ /*
+ * bad block in metadata exactly matches bad block in kernel
+ * list, just remove it from a list
+ */
+ if ((start == sector) && (len == length)) {
+ /* fill the hole with the last entry; order is not kept */
+ if (i < bb->count - 1)
+ bb->entries[i] = bb->entries[bb->count - 1];
+ bb->count -= 1;
+ record = 0;
+ break;
+ }
+ /*
+ * bad block in metadata spans bad block in kernel list,
+ * clear it and record new bad block
+ */
+ if ((sector >= start) && (sector + length <= start + len)) {
+ ss->clear_bad_block(a, mdi->disk.raid_disk, start, len);
+ /* 'record' is still set, so the block is recorded below */
+ break;
+ }
+ }
+
+ /* record all bad blocks not in metadata list */
+ if (record && (ss->record_bad_block(a, mdi->disk.raid_disk, sector,
+ length) <= 0)) {
+ sysfs_set_str(&a->info, mdi, "state", "-external_bbl");
+ return -1;
+ }
+
+ return 1;
+}
+
+/*
+ * Read a bad block list from 'fd' and apply 'action' to every entry.
+ *
+ * The file is read from its start in chunks of at most 29 bytes; each
+ * entry has the form "sector length\n".  With RECORD_BB every entry is
+ * passed to process_ubb(); with COMPARE_BB it is passed to compare_bb()
+ * together with 'arg'.
+ *
+ * Returns the sum of the helpers' return values, or a negative value if
+ * seeking or reading fails, an entry is malformed, or a helper fails.
+ */
+static int read_bb_file(int fd, struct active_array *a, struct mdinfo *mdi,
+ enum bb_action action, void *arg)
+{
+ char buf[30];
+ int n = 0;
+ int ret = 0;
+ int read_again = 0;
+ int off = 0;
+ int pos = 0;
+ int preserve_pos = (action == RECORD_BB ? 0 : 1);
+
+ if (lseek(fd, 0, SEEK_SET) == (off_t) -1)
+ return -1;
+
+ do {
+ read_again = 0;
+ /* 'pos' bytes of an unfinished entry may already sit at buf[0] */
+ n = read(fd, buf + pos, sizeof(buf) - 1 - pos);
+ if (n < 0)
+ return -1;
+ n += pos;
+
+ buf[n] = '\0';
+ off = 0;
+
+ while (off < n) {
+ unsigned long long sector;
+ int length;
+ char newline;
+ int consumed;
+ int matched;
+ int rc;
+
+ /* kernel sysfs file format: "sector length\n" */
+ matched = sscanf(buf + off, "%llu %d%c%n", &sector,
+ &length, &newline, &consumed);
+ /*
+ * A parse failure after at least one good entry in this
+ * buffer is taken to be an entry cut off by the buffer end.
+ */
+ if ((matched != 3) && (off > 0)) {
+ /* truncated entry, read again */
+ if (preserve_pos) {
+ /*
+ * Keep the tail and append the next read to it.
+ * NOTE(review): 'pos' is derived from sizeof(buf),
+ * not from 'n'; the two agree only when the buffer
+ * was filled completely - confirm that a short
+ * read cannot reach this branch.
+ */
+ pos = sizeof(buf) - off - 1;
+ memmove(buf, buf + off, pos);
+ } else {
+ /*
+ * RECORD_BB: start over from offset 0.
+ * NOTE(review): presumably entries acknowledged so
+ * far no longer appear in the file - confirm.
+ */
+ if (lseek(fd, 0, SEEK_SET) ==
+ (off_t) -1)
+ return -1;
+ }
+ read_again = 1;
+ break;
+ }
+ if (matched != 3)
+ return -1;
+ if (newline != '\n')
+ return -1;
+ if (length <= 0)
+ return -1;
+
+ if (action == RECORD_BB)
+ rc = process_ubb(a, mdi, sector, length,
+ buf + off, consumed);
+ else if (action == COMPARE_BB)
+ rc = compare_bb(a, mdi, sector, length, arg);
+ else
+ rc = -1;
+
+ if (rc < 0)
+ return rc;
+ ret += rc;
+ off += consumed;
+ }
+ } while (read_again);
+
+ return ret;
+}
+
+/*
+ * Record and acknowledge every entry in the device's list of
+ * unacknowledged bad blocks (read from mdi->ubb_fd).
+ * Returns read_bb_file()'s result.
+ */
+static int process_dev_ubb(struct active_array *a, struct mdinfo *mdi)
+{
+	int fd = mdi->ubb_fd;
+
+	return read_bb_file(fd, a, mdi, RECORD_BB, NULL);
+}
+
+/*
+ * Bring the metadata's bad block list for device 'mdi' in line with the
+ * kernel's.
+ *
+ * The metadata list is fetched, the kernel's acknowledged list is read
+ * from mdi->bb_fd and compared against it (compare_bb() drops exact
+ * matches from the list), and whatever remains on the metadata list
+ * afterwards is cleared.
+ *
+ * Returns 0 on success, -1 if the list cannot be fetched or the
+ * comparison fails.
+ */
+static int check_for_cleared_bb(struct active_array *a, struct mdinfo *mdi)
+{
+	struct superswitch *ss = a->container->ss;
+	struct md_bb *bb = ss->get_bad_blocks(a, mdi->disk.raid_disk);
+	int idx;
+
+	if (bb == NULL)
+		return -1;
+	if (read_bb_file(mdi->bb_fd, a, mdi, COMPARE_BB, bb) < 0)
+		return -1;
+
+	idx = 0;
+	while (idx < bb->count) {
+		unsigned long long sector = bb->entries[idx].sector;
+		int length = bb->entries[idx].length;
+
+		ss->clear_bad_block(a, mdi->disk.raid_disk, sector, length);
+		idx++;
+	}
+
+	return 0;
+}
+
+/* Send SIGUSR1 to the thread 'mgr_tid' of this process. */
+static void signal_manager(void)
+{
+	/* raw syscall form of tgkill(getpid(), mgr_tid, SIGUSR1) */
+	int self = getpid();
+
+	syscall(SYS_tgkill, self, mgr_tid, SIGUSR1);
+}
+
+/* Monitor a set of active md arrays - all of which share the
+ * same metadata - and respond to events that require
+ * metadata update.
+ *
+ * New arrays are detected by another thread which allocates
+ * required memory and attaches the data structure to our list.
+ *
+ * Events:
+ * Array stops.
+ * This is detected by array_state going to 'clear' or 'inactive'.
+ * while we thought it was active.
+ * Response is to mark metadata as clean and 'clear' the array(??)
+ * write-pending
+ * array_state is 'write-pending'
+ * We mark metadata as 'dirty' then set array to 'active'.
+ * active_idle
+ * Either ignore, or mark clean, then mark metadata as clean.
+ *
+ * device fails
+ * detected by rd-N/state reporting "faulty"
+ * mark device as 'failed' in metadata, let the kernel release the
+ * device by writing '-blocked' to rd/state, and finally write 'remove' to
+ * rd/state. Before a disk can be replaced it must be failed and removed
+ * from all container members, this will be preemptive for the other
+ * arrays... safe?
+ *
+ * sync completes
+ * sync_action was 'resync' and becomes 'idle' and resync_start becomes
+ * MaxSector
+ * Notify metadata that sync is complete.
+ *
+ * recovery completes
+ * sync_action changes from 'recover' to 'idle'
+ * Check each device state and mark metadata if 'faulty' or 'in_sync'.
+ *
+ * deal with resync
+ * This only happens on finding a new array... mdadm will have set
+ * 'resync_start' to the correct value. If 'resync_start' indicates that a
+ * resync needs to occur, set the array to the 'active' state rather than the
+ * initial read-auto state.
+ *
+ *
+ *
+ * We wait for a change (poll/select) on array_state, sync_action, and
+ * each rd-X/state file.
+ * When we get any change, we check everything. So read each state file,
+ * then decide what to do.
+ *
+ * The core action is to write new metadata to all devices in the array.
+ * This is done at most once on any wakeup.
+ * After that we might:
+ * - update the array_state
+ * - set the role of some devices.
+ * - request a sync_action
+ *
+ */
+
+/* Bits of the value returned by read_and_act():
+ * ARRAY_DIRTY - set on the write_pending, active_idle, active, suspended
+ * and readonly->active paths below; wait_and_act() counts such arrays
+ * ARRAY_BUSY - a "remove" written to a device state file did not succeed;
+ * wait_and_act() reacts by setting container->retry_soon
+ */
+#define ARRAY_DIRTY 1
+#define ARRAY_BUSY 2
+/* Sample the sysfs attributes of array 'a' and of each of its devices,
+ * report what was found to the metadata handler (a->container->ss), write
+ * the requested state changes back to sysfs and flag follow-up work for
+ * the manager thread.
+ *
+ * a: array to process; a->container is set to NULL on return when the
+ * array was found to have been stopped.
+ * fds: descriptor set filled in by the caller; it is only consulted here
+ * for each device's bb_fd.
+ *
+ * Returns a mask built from ARRAY_DIRTY and ARRAY_BUSY (0 when neither
+ * applies).
+ */
+static int read_and_act(struct active_array *a, fd_set *fds)
+{
+ unsigned long long sync_completed;
+ int check_degraded = 0;
+ int check_reshape = 0;
+ int deactivate = 0;
+ struct mdinfo *mdi;
+ int ret = 0;
+ int count = 0;
+ struct timeval tv;
+
+ /* bad_word / bad_action mean "nothing to write back"; see the
+ * "Effect state changes" section further down
+ */
+ a->next_state = bad_word;
+ a->next_action = bad_action;
+
+ a->curr_state = read_state(a->info.state_fd);
+ a->curr_action = read_action(a->action_fd);
+ if (a->curr_state != clear)
+ /*
+ * In "clear" state, resync_start may wrongly be set to "0"
+ * when the kernel called md_clean but didn't remove the
+ * sysfs attributes yet
+ */
+ read_resync_start(a->resync_start_fd, &a->info.resync_start);
+ sync_completed = read_sync_completed(a->sync_completed_fd);
+ for (mdi = a->info.devs; mdi ; mdi = mdi->next) {
+ mdi->next_state = 0;
+ mdi->curr_state = 0;
+ if (mdi->state_fd >= 0) {
+ read_resync_start(mdi->recovery_fd,
+ &mdi->recovery_start);
+ mdi->curr_state = read_dev_state(mdi->state_fd);
+ }
+ /*
+ * If array is blocked and metadata handler is able to handle
+ * BB, check if you can acknowledge them to md driver. If
+ * successful, clear faulty state and unblock the array.
+ */
+ if ((mdi->curr_state & DS_BLOCKED) &&
+ a->container->ss->record_bad_block &&
+ (process_dev_ubb(a, mdi) > 0)) {
+ mdi->next_state |= DS_UNBLOCK;
+ }
+ /* NOTE(review): bb_fd is used without a validity check here, and
+ * the removal path below closes it without resetting it - confirm
+ * it can never be negative or stale at this point
+ */
+ if (FD_ISSET(mdi->bb_fd, fds))
+ check_for_cleared_bb(a, mdi);
+ }
+
+ gettimeofday(&tv, NULL);
+ dprintf("(%d): %ld.%06ld state:%s prev:%s action:%s prev: %s start:%llu\n",
+ a->info.container_member,
+ tv.tv_sec, tv.tv_usec,
+ array_states[a->curr_state],
+ array_states[a->prev_state],
+ sync_actions[a->curr_action],
+ sync_actions[a->prev_action],
+ a->info.resync_start
+ );
+
+ /* NOTE(review): the second argument of set_array_state() looks like a
+ * "consistent" indicator (1 on the clean/stopped paths, 0 when going
+ * active, 2 on the readonly path) - confirm against the handler
+ * definition
+ */
+ if ((a->curr_state == bad_word || a->curr_state <= inactive) &&
+ a->prev_state > inactive) {
+ /* array has been stopped */
+ a->container->ss->set_array_state(a, 1);
+ a->next_state = clear;
+ deactivate = 1;
+ }
+ if (a->curr_state == write_pending) {
+ a->container->ss->set_array_state(a, 0);
+ a->next_state = active;
+ ret |= ARRAY_DIRTY;
+ }
+ if (a->curr_state == active_idle) {
+ /* Set array to 'clean' FIRST, then mark clean
+ * in the metadata
+ */
+ a->next_state = clean;
+ ret |= ARRAY_DIRTY;
+ }
+ if ((a->curr_state == clean) || (a->curr_state == broken)) {
+ a->container->ss->set_array_state(a, 1);
+ }
+ if (a->curr_state == active ||
+ a->curr_state == suspended)
+ ret |= ARRAY_DIRTY;
+ if (a->curr_state == readonly) {
+ /* Well, I'm ready to handle things. If readonly
+ * wasn't requested, transition to read-auto.
+ */
+ char buf[64];
+ read_attr(buf, sizeof(buf), a->metadata_fd);
+ if (strncmp(buf, "external:-", 10) == 0) {
+ /* explicit request for readonly array. Leave it alone */
+ ;
+ } else {
+ if (a->container->ss->set_array_state(a, 2))
+ a->next_state = read_auto; /* array is clean */
+ else {
+ a->next_state = active; /* Now active for recovery etc */
+ ret |= ARRAY_DIRTY;
+ }
+ }
+ }
+
+ if (!deactivate &&
+ a->curr_action == idle &&
+ a->prev_action == resync) {
+ /* A resync has finished. The endpoint is recorded in
+ * 'sync_start'. We don't update the metadata
+ * until the array goes inactive or readonly though.
+ * Just check if we need to fiddle spares.
+ */
+ a->container->ss->set_array_state(a, a->curr_state <= clean);
+ check_degraded = 1;
+ }
+
+ if (!deactivate &&
+ a->curr_action == idle &&
+ a->prev_action == recover) {
+ /* A recovery has finished. Some disks may be in sync now,
+ * and the array may no longer be degraded
+ */
+ for (mdi = a->info.devs ; mdi ; mdi = mdi->next) {
+ a->container->ss->set_disk(a, mdi->disk.raid_disk,
+ mdi->curr_state);
+ if (! (mdi->curr_state & DS_INSYNC))
+ check_degraded = 1;
+ count++;
+ }
+ /* fewer (or more) devices than raid_disks also needs a check */
+ if (count != a->info.array.raid_disks)
+ check_degraded = 1;
+ }
+
+ if (!deactivate &&
+ a->curr_action == reshape &&
+ a->prev_action != reshape)
+ /* reshape was requested by mdadm. Need to see if
+ * new devices have been added. Manager does that
+ * when it sees check_reshape
+ */
+ check_reshape = 1;
+
+ /* Check for failures and if found:
+ * 1/ Record the failure in the metadata and unblock the device.
+ * FIXME update the kernel to stop notifying on failed drives when
+ * the array is readonly and we have cleared 'blocked'
+ * 2/ Try to remove the device if the array is writable, or can be
+ * made writable.
+ */
+ for (mdi = a->info.devs ; mdi ; mdi = mdi->next) {
+ if (mdi->curr_state & DS_FAULTY) {
+ a->container->ss->set_disk(a, mdi->disk.raid_disk,
+ mdi->curr_state);
+ check_degraded = 1;
+ if (mdi->curr_state & DS_BLOCKED)
+ mdi->next_state |= DS_UNBLOCK;
+ if (a->curr_state == read_auto) {
+ a->container->ss->set_array_state(a, 0);
+ a->next_state = active;
+ }
+ if (a->curr_state > readonly)
+ mdi->next_state |= DS_REMOVE;
+ }
+ }
+
+ /* Check for recovery checkpoint notifications. We need to be a
+ * minimum distance away from the last checkpoint to prevent
+ * over checkpointing. Note reshape checkpointing is handled
+ * in the second branch.
+ */
+ if (sync_completed > a->last_checkpoint &&
+ sync_completed - a->last_checkpoint > a->info.component_size >> 4 &&
+ a->curr_action > reshape) {
+ /* A (non-reshape) sync_action has reached a checkpoint.
+ * Record the updated position in the metadata
+ */
+ a->last_checkpoint = sync_completed;
+ a->container->ss->set_array_state(a, a->curr_state <= clean);
+ } else if ((a->curr_action == idle && a->prev_action == reshape) ||
+ (a->curr_action == reshape &&
+ sync_completed > a->last_checkpoint)) {
+ /* Reshape has progressed or completed so we need to
+ * update the array state - and possibly the array size
+ */
+ if (sync_completed != 0)
+ a->last_checkpoint = sync_completed;
+ /* We might need to update last_checkpoint depending on
+ * the reason that reshape finished.
+ * if array reshape is really finished:
+ * set check point to the end, this allows
+ * set_array_state() to finalize reshape in metadata
+ * if reshape if broken: do not set checkpoint to the end
+ * this allows for reshape restart from checkpoint
+ */
+ if ((a->curr_action != reshape) &&
+ (a->prev_action == reshape)) {
+ char buf[40];
+ if ((sysfs_get_str(&a->info, NULL,
+ "reshape_position",
+ buf,
+ sizeof(buf)) >= 0) &&
+ strncmp(buf, "none", 4) == 0)
+ a->last_checkpoint = a->info.component_size;
+ }
+ a->container->ss->set_array_state(a, a->curr_state <= clean);
+ /* the value chosen above was only for the handler call; go back
+ * to the position read from sync_completed
+ */
+ a->last_checkpoint = sync_completed;
+ }
+
+ if (sync_completed > a->last_checkpoint)
+ a->last_checkpoint = sync_completed;
+
+ if (sync_completed >= a->info.component_size)
+ a->last_checkpoint = 0;
+
+ a->container->ss->sync_metadata(a->container);
+ dprintf("(%d): state:%s action:%s next(", a->info.container_member,
+ array_states[a->curr_state], sync_actions[a->curr_action]);
+
+ /* Effect state changes in the array */
+ if (a->next_state != bad_word) {
+ dprintf_cont(" state:%s", array_states[a->next_state]);
+ write_attr(array_states[a->next_state], a->info.state_fd);
+ }
+ if (a->next_action != bad_action) {
+ write_attr(sync_actions[a->next_action], a->action_fd);
+ dprintf_cont(" action:%s", sync_actions[a->next_action]);
+ }
+ for (mdi = a->info.devs; mdi ; mdi = mdi->next) {
+ if (mdi->next_state & DS_UNBLOCK) {
+ dprintf_cont(" %d:-blocked", mdi->disk.raid_disk);
+ write_attr("-blocked", mdi->state_fd);
+ }
+
+ if ((mdi->next_state & DS_REMOVE) && mdi->state_fd >= 0) {
+ int remove_result;
+
+ /* The kernel may not be able to immediately remove the
+ * disk. In that case we wait a little while and
+ * try again.
+ */
+ remove_result = write_attr("remove", mdi->state_fd);
+ if (remove_result > 0) {
+ dprintf_cont(" %d:removed", mdi->disk.raid_disk);
+ /* only state_fd is reset to -1; recovery_fd, bb_fd and
+ * ubb_fd keep their (now closed) numbers
+ */
+ close(mdi->state_fd);
+ close(mdi->recovery_fd);
+ close(mdi->bb_fd);
+ close(mdi->ubb_fd);
+ mdi->state_fd = -1;
+ } else
+ ret |= ARRAY_BUSY;
+ }
+ if (mdi->next_state & DS_INSYNC) {
+ write_attr("+in_sync", mdi->state_fd);
+ dprintf_cont(" %d:+in_sync", mdi->disk.raid_disk);
+ }
+ }
+ dprintf_cont(" )\n");
+
+ /* move curr_ to prev_ */
+ a->prev_state = a->curr_state;
+
+ a->prev_action = a->curr_action;
+
+ for (mdi = a->info.devs; mdi ; mdi = mdi->next) {
+ mdi->prev_state = mdi->curr_state;
+ mdi->next_state = 0;
+ }
+
+ if (check_degraded || check_reshape) {
+ /* manager will do the actual check */
+ if (check_degraded)
+ a->check_degraded = 1;
+ if (check_reshape)
+ a->check_reshape = 1;
+ signal_manager();
+ }
+
+ /* a NULL container is how wait_and_act() and reconcile_failed()
+ * recognise an array that must no longer be touched
+ */
+ if (deactivate)
+ a->container = NULL;
+
+ return ret;
+}
+
+/* Return the device of array @a whose disk major/minor numbers equal
+ * @major/@minor, or NULL when the array has no such device.
+ */
+static struct mdinfo *
+find_device(struct active_array *a, int major, int minor)
+{
+	struct mdinfo *dev = a->info.devs;
+
+	while (dev) {
+		if (dev->disk.major == major && dev->disk.minor == minor)
+			break;
+		dev = dev->next;
+	}
+
+	return dev;
+}
+
+/* Walk the array list starting at @aa and, in every array that is still
+ * being handled, look up the device with the same major/minor as @failed.
+ * When that device's current state lacks DS_FAULTY, write "faulty" to its
+ * state file.
+ */
+static void reconcile_failed(struct active_array *aa, struct mdinfo *failed)
+{
+	struct active_array *arr;
+
+	for (arr = aa; arr; arr = arr->next) {
+		struct mdinfo *peer;
+
+		/* deactivated (no container) or queued for removal */
+		if (!arr->container || arr->to_remove)
+			continue;
+
+		peer = find_device(arr, failed->disk.major, failed->disk.minor);
+		if (peer && !(peer->curr_state & DS_FAULTY))
+			write_attr("faulty", peer->state_fd);
+	}
+}
+
+#ifdef DEBUG
+/* Debug aid: list on stderr every descriptor that is set in @fds, followed
+ * by the last path component of the target of its /proc/<pid>/fd/<n> link
+ * ("unknown" when the link cannot be read).
+ */
+static void dprint_wake_reasons(fd_set *fds)
+{
+	char proc_path[256];
+	char link[256];
+	int fd;
+
+	fprintf(stderr, "monitor: wake ( ");
+	for (fd = 0; fd < FD_SETSIZE; fd++) {
+		char *slash;
+		int len;
+
+		if (!FD_ISSET(fd, fds))
+			continue;
+
+		sprintf(proc_path, "/proc/%d/fd/%d", (int) getpid(), fd);
+		/* readlink() does not terminate the string itself */
+		len = readlink(proc_path, link, sizeof(link) - 1);
+		if (len < 0) {
+			fprintf(stderr, "%d:unknown ", fd);
+			continue;
+		}
+		link[len] = '\0';
+
+		slash = strrchr(link, '/');
+		fprintf(stderr, "%d:%s ", fd, slash ? slash + 1 : link);
+	}
+	fprintf(stderr, ")\n");
+}
+#endif
+
+/* Progress counter of the monitor loop: wait_and_act() ORs in 1 right before
+ * it blocks in pselect() and adds 1 as soon as pselect() returns, so the
+ * value is odd while blocked and even otherwise.
+ */
+int monitor_loop_cnt;
+
+/* One iteration of the monitor loop for @container:
+ * 1. hand arrays that are deactivated or marked to_remove to the manager
+ * (one at a time, through discard_this) and collect the sysfs
+ * descriptors of the remaining arrays,
+ * 2. exit the process when there is nothing left to monitor, or when
+ * SIGTERM was received and no array is dirty,
+ * 3. unless @nowait is set, block in pselect() until a descriptor
+ * reports a change, SIGUSR1 arrives or the timeout expires,
+ * 4. apply queued metadata updates,
+ * 5. run read_and_act() on every live array and propagate device
+ * failures to the other arrays of the container.
+ *
+ * Returns 1 when at least one array was processed, 0 otherwise.
+ */
+static int wait_and_act(struct supertype *container, int nowait)
+{
+ fd_set rfds;
+ int maxfd = 0;
+ struct active_array **aap = &container->arrays;
+ struct active_array *a, **ap;
+ int rv;
+ struct mdinfo *mdi;
+ static unsigned int dirty_arrays = ~0; /* start at some non-zero value */
+
+ FD_ZERO(&rfds);
+
+ for (ap = aap ; *ap ;) {
+ a = *ap;
+ /* once an array has been deactivated we want to
+ * ask the manager to discard it.
+ */
+ if (!a->container || a->to_remove) {
+ /* discard_this holds a single array; while it is occupied
+ * leave this one on the list for a later pass
+ */
+ if (discard_this) {
+ ap = &(*ap)->next;
+ continue;
+ }
+ *ap = a->next;
+ a->next = NULL;
+ discard_this = a;
+ signal_manager();
+ continue;
+ }
+
+ add_fd(&rfds, &maxfd, a->info.state_fd);
+ add_fd(&rfds, &maxfd, a->action_fd);
+ add_fd(&rfds, &maxfd, a->sync_completed_fd);
+ for (mdi = a->info.devs ; mdi ; mdi = mdi->next) {
+ add_fd(&rfds, &maxfd, mdi->state_fd);
+ add_fd(&rfds, &maxfd, mdi->bb_fd);
+ add_fd(&rfds, &maxfd, mdi->ubb_fd);
+ }
+
+ ap = &(*ap)->next;
+ }
+
+ if (manager_ready && (*aap == NULL || (sigterm && !dirty_arrays))) {
+ /* No interesting arrays, or we have been told to
+ * terminate and everything is clean. Lets see about
+ * exiting. Note that blocking at this point is not a
+ * problem as there are no active arrays, there is
+ * nothing that we need to be ready to do.
+ */
+ int fd;
+ if (sigterm)
+ fd = open_dev_excl(container->devnm);
+ else
+ fd = open_dev_flags(container->devnm, O_RDONLY|O_EXCL);
+ /* only an EBUSY failure keeps us running */
+ if (fd >= 0 || errno != EBUSY) {
+ /* OK, we are safe to leave */
+ if (sigterm && !dirty_arrays)
+ dprintf("caught sigterm, all clean... exiting\n");
+ else
+ dprintf("no arrays to monitor... exiting\n");
+ if (!sigterm)
+ /* On SIGTERM, someone (the take-over mdmon) will
+ * clean up
+ */
+ remove_pidfile(container->devnm);
+ exit_now = 1;
+ signal_manager();
+ close(fd);
+ exit(0);
+ }
+ }
+
+ if (!nowait) {
+ sigset_t set;
+ struct timespec ts;
+ ts.tv_sec = 24*3600;
+ ts.tv_nsec = 0;
+ if (*aap == NULL || container->retry_soon) {
+ /* just waiting to get O_EXCL access */
+ ts.tv_sec = 0;
+ ts.tv_nsec = 20000000ULL;
+ }
+ /* with a NULL new-set this only fetches the current mask; SIGUSR1
+ * is then taken out so that it can interrupt pselect()
+ */
+ sigprocmask(SIG_UNBLOCK, NULL, &set);
+ sigdelset(&set, SIGUSR1);
+ monitor_loop_cnt |= 1;
+ /* rfds is handed over as the exceptional-conditions set.
+ * NOTE(review): presumably because changes of sysfs attributes are
+ * signalled that way - confirm
+ */
+ rv = pselect(maxfd+1, NULL, NULL, &rfds, &ts, &set);
+ monitor_loop_cnt += 1;
+ if (rv == -1) {
+ if (errno == EINTR) {
+ rv = 0;
+ FD_ZERO(&rfds);
+ dprintf("monitor: caught signal\n");
+ } else
+ dprintf("monitor: error %d in pselect\n",
+ errno);
+ }
+ #ifdef DEBUG
+ else
+ dprint_wake_reasons(&rfds);
+ #endif
+ container->retry_soon = 0;
+ }
+
+ if (update_queue) {
+ struct metadata_update *this;
+
+ for (this = update_queue; this ; this = this->next)
+ container->ss->process_update(container, this);
+
+ /* pass the processed list on via update_queue_handled and
+ * empty the queue before signalling the manager
+ */
+ update_queue_handled = update_queue;
+ update_queue = NULL;
+ signal_manager();
+ container->ss->sync_metadata(container);
+ }
+
+ rv = 0;
+ dirty_arrays = 0;
+ for (a = *aap; a ; a = a->next) {
+
+ if (a->replaces && !discard_this) {
+ struct active_array **ap;
+ /* unlink the array that 'a' replaces and pass it to the
+ * manager through discard_this
+ */
+ for (ap = &a->next; *ap && *ap != a->replaces;
+ ap = & (*ap)->next)
+ ;
+ if (*ap)
+ *ap = (*ap)->next;
+ discard_this = a->replaces;
+ a->replaces = NULL;
+ /* FIXME check if device->state_fd need to be cleared?*/
+ signal_manager();
+ }
+ if (a->container && !a->to_remove) {
+ int ret = read_and_act(a, &rfds);
+ rv |= 1;
+ dirty_arrays += !!(ret & ARRAY_DIRTY);
+ /* when terminating stop manipulating the array after it
+ * is clean, but make sure read_and_act() is given a
+ * chance to handle 'active_idle'
+ */
+ if (sigterm && !(ret & ARRAY_DIRTY))
+ a->container = NULL; /* stop touching this array */
+ if (ret & ARRAY_BUSY)
+ container->retry_soon = 1;
+ }
+ }
+
+ /* propagate failures across container members */
+ for (a = *aap; a ; a = a->next) {
+ if (!a->container || a->to_remove)
+ continue;
+ for (mdi = a->info.devs ; mdi ; mdi = mdi->next)
+ if (mdi->curr_state & DS_FAULTY)
+ reconcile_failed(*aap, mdi);
+ }
+
+ return rv;
+}
+
+/* Monitor main loop: keep calling wait_and_act() for as long as it returns
+ * a non-negative value. Only the very first call is made with a non-zero
+ * 'nowait' argument.
+ */
+void do_monitor(struct supertype *container)
+{
+	int nowait = 1;
+
+	while (wait_and_act(container, nowait) >= 0)
+		nowait = 0;
+}
diff --git a/msg.c b/msg.c
new file mode 100644
index 0000000..45cd450
--- /dev/null
+++ b/msg.c
@@ -0,0 +1,475 @@
+/*
+ * Copyright (C) 2008 Intel Corporation
+ *
+ * mdmon socket / message handling
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#ifndef _GNU_SOURCE
+#define _GNU_SOURCE
+#endif
+#include <unistd.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+#include "mdadm.h"
+#include "mdmon.h"
+
+/* Framing words of a message: send_message() emits start_magic first and
+ * end_magic last; receive_message() rejects anything framed differently.
+ */
+static const __u32 start_magic = 0x5a5aa5a5;
+static const __u32 end_magic = 0xa5a55a5a;
+
+/* Write exactly @len bytes from @buf to @fd.
+ *
+ * Each write() is preceded by a select() for writability; @tmo is the
+ * timeout in seconds handed to select(), 0 meaning no timeout at all.
+ *
+ * Returns 0 once everything has been written (also when @len is 0) and -1
+ * on timeout, on a select()/write() failure, or when @fd cannot be stored
+ * in an fd_set.
+ */
+static int send_buf(int fd, const void* buf, int len, int tmo)
+{
+	/* byte cursor: arithmetic on a 'void *' is not valid ISO C */
+	const char *pos = buf;
+	fd_set set;
+	int rv;
+	struct timeval timeout = {tmo, 0};
+	struct timeval *ptmo = tmo ? &timeout : NULL;
+
+	if (len == 0)
+		return 0;
+
+	/* FD_SET() is undefined for descriptors outside [0, FD_SETSIZE) */
+	if (fd < 0 || fd >= FD_SETSIZE)
+		return -1;
+
+	while (len) {
+		FD_ZERO(&set);
+		FD_SET(fd, &set);
+		rv = select(fd+1, NULL, &set, NULL, ptmo);
+		if (rv <= 0)
+			return -1;
+		rv = write(fd, pos, len);
+		if (rv <= 0)
+			return -1;
+		len -= rv;
+		pos += rv;
+	}
+	return 0;
+}
+
+/* Read exactly @len bytes from @fd into @buf.
+ *
+ * Each read() is preceded by a select() for readability; @tmo is the
+ * timeout in seconds handed to select(), 0 meaning no timeout at all.
+ *
+ * Returns 0 once @len bytes have arrived (also when @len is 0) and -1 on
+ * timeout, on end-of-file, on a select()/read() failure, or when @fd
+ * cannot be stored in an fd_set.
+ */
+static int recv_buf(int fd, void* buf, int len, int tmo)
+{
+	/* byte cursor: arithmetic on a 'void *' is not valid ISO C */
+	char *pos = buf;
+	fd_set set;
+	int rv;
+	struct timeval timeout = {tmo, 0};
+	struct timeval *ptmo = tmo ? &timeout : NULL;
+
+	if (len == 0)
+		return 0;
+
+	/* FD_SET() is undefined for descriptors outside [0, FD_SETSIZE) */
+	if (fd < 0 || fd >= FD_SETSIZE)
+		return -1;
+
+	while (len) {
+		FD_ZERO(&set);
+		FD_SET(fd, &set);
+		rv = select(fd+1, &set, NULL, NULL, ptmo);
+		if (rv <= 0)
+			return -1;
+		rv = read(fd, pos, len);
+		if (rv <= 0)
+			return -1;
+		len -= rv;
+		pos += rv;
+	}
+	return 0;
+}
+
+/* Send @msg over @fd as one frame: start_magic, the 32-bit length, the
+ * payload (only when the length is positive) and end_magic. @tmo is passed
+ * to send_buf() for every piece.
+ *
+ * Returns 0 when all pieces were sent, otherwise the result of the first
+ * send_buf() call that failed; nothing further is sent after a failure.
+ */
+int send_message(int fd, struct metadata_update *msg, int tmo)
+{
+	__s32 len = msg->len;
+	int rv;
+
+	rv = send_buf(fd, &start_magic, 4, tmo);
+	rv = rv ?: send_buf(fd, &len, 4, tmo);
+	if (len > 0)
+		rv = rv ?: send_buf(fd, msg->buf, msg->len, tmo);
+	/* keep an earlier failure: once a piece is missing the frame is
+	 * broken, and a trailer that happens to go through must not turn
+	 * that into a success
+	 */
+	rv = rv ?: send_buf(fd, &end_magic, 4, tmo);
+
+	return rv;
+}
+
+/* Receive one frame from @fd into @msg: start_magic, a 32-bit length, the
+ * payload (only when the length is positive) and end_magic. @tmo is passed
+ * to recv_buf() for every piece.
+ *
+ * On success returns 0 with msg->len set to the received length and
+ * msg->buf pointing to a freshly allocated payload (NULL when there is
+ * none). Returns -1 when a piece cannot be read, a framing word is wrong
+ * or the length exceeds MSG_MAX_LEN; a payload buffer allocated by then is
+ * freed and msg->len is left untouched.
+ */
+int receive_message(int fd, struct metadata_update *msg, int tmo)
+{
+	__u32 word;
+	__s32 payload_len;
+
+	/* leading framing word */
+	if (recv_buf(fd, &word, 4, tmo) < 0 || word != start_magic)
+		return -1;
+
+	/* only an upper bound is enforced: zero and negative lengths are
+	 * accepted and simply carry no payload
+	 */
+	if (recv_buf(fd, &payload_len, 4, tmo) < 0 ||
+	    payload_len > MSG_MAX_LEN)
+		return -1;
+
+	msg->buf = NULL;
+	if (payload_len > 0) {
+		msg->buf = xmalloc(payload_len);
+		if (recv_buf(fd, msg->buf, payload_len, tmo) < 0)
+			goto discard;
+	}
+
+	/* trailing framing word */
+	if (recv_buf(fd, &word, 4, tmo) < 0 || word != end_magic)
+		goto discard;
+
+	msg->len = payload_len;
+	return 0;
+
+discard:
+	free(msg->buf);
+	return -1;
+}
+
+/* Send a message without payload (length 0) on @fd and return the result
+ * of send_message().
+ */
+int ack(int fd, int tmo)
+{
+	struct metadata_update empty = { .len = 0 };
+
+	return send_message(fd, &empty, tmo);
+}
+
+/* Wait for one well-formed message on @fd and throw its payload away.
+ * Returns 0 when a message was received, otherwise the error reported by
+ * receive_message().
+ */
+int wait_reply(int fd, int tmo)
+{
+	struct metadata_update reply;
+	int err;
+
+	err = receive_message(fd, &reply, tmo);
+	if (err != 0)
+		return err;
+
+	/* mdmon may attach data to its reply; the caller only wants to
+	 * know that a reply arrived
+	 */
+	if (reply.len > 0)
+		free(reply.buf);
+
+	return 0;
+}
+
+/* Connect to the mdmon socket MDMON_DIR/<name>.sock.
+ *
+ * @devname is either the name itself or, when is_subarray() accepts it, a
+ * subarray reference: its first character is skipped and the name is the
+ * text up to the next '/'.
+ *
+ * Returns a connected socket with O_NONBLOCK set, or -1 when the subarray
+ * reference has no '/', the resulting path does not fit into sun_path, or
+ * socket()/connect() fails.
+ */
+int connect_monitor(char *devname)
+{
+	struct sockaddr_un addr;
+	int name_len;
+	int path_len;
+	int sfd;
+	long fl;
+
+	if (is_subarray(devname)) {
+		char *slash;
+
+		devname++;
+		slash = strchr(devname, '/');
+		if (!slash)
+			return -1;
+		name_len = slash - devname;
+	} else
+		name_len = strlen(devname);
+
+	/* format straight into the address and refuse names that would be
+	 * truncated rather than overflowing an intermediate buffer
+	 */
+	memset(&addr, 0, sizeof(addr));
+	addr.sun_family = PF_LOCAL;
+	path_len = snprintf(addr.sun_path, sizeof(addr.sun_path),
+			    "%s/%.*s.sock", MDMON_DIR, name_len, devname);
+	if (path_len < 0 || (size_t)path_len >= sizeof(addr.sun_path))
+		return -1;
+
+	sfd = socket(PF_LOCAL, SOCK_STREAM, 0);
+	if (sfd < 0)
+		return -1;
+
+	if (connect(sfd, (struct sockaddr*)&addr, sizeof(addr)) < 0) {
+		close(sfd);
+		return -1;
+	}
+
+	fl = fcntl(sfd, F_GETFL, 0);
+	fl |= O_NONBLOCK;
+	fcntl(sfd, F_SETFL, fl);
+
+	return sfd;
+}
+
+/* Ping over an already connected socket: send an empty message and wait
+ * for a reply, each with a timeout of 20.
+ * Returns @sfd itself when it is negative, 0 when a reply arrived and -1
+ * otherwise.
+ */
+int fping_monitor(int sfd)
+{
+	if (sfd < 0)
+		return sfd;
+
+	/* the reply is only awaited when the ping itself went out */
+	if (ack(sfd, 20) != 0 || wait_reply(sfd, 20) != 0)
+		return -1;
+
+	return 0;
+}
+
+/* Give the monitor a chance to update the metadata: connect to the socket
+ * for @devname, ping it and close the connection again.
+ * Returns the result of fping_monitor(), or -1 when connecting fails.
+ */
+int ping_monitor(char *devname)
+{
+	int sfd = connect_monitor(devname);
+	int err;
+
+	if (sfd < 0)
+		return -1;
+
+	err = fping_monitor(sfd);
+	close(sfd);
+
+	return err;
+}
+
+/* Ping the mdmon socket for @devname and hand back the payload of its
+ * reply; check_mdmon_version() parses that payload as a version string.
+ * The caller owns the returned buffer and must free() it.
+ * Returns NULL when connecting, pinging or receiving fails, or when the
+ * reply carries no payload.
+ */
+static char *ping_monitor_version(char *devname)
+{
+	struct metadata_update reply;
+	int failed;
+	int sfd = connect_monitor(devname);
+
+	if (sfd < 0)
+		return NULL;
+
+	/* 'reply' is only filled in when both steps succeed */
+	failed = ack(sfd, 20) != 0 || receive_message(sfd, &reply, 20) != 0;
+	close(sfd);
+
+	if (failed || !reply.len || !reply.buf)
+		return NULL;
+	return reply.buf;
+}
+
+/* Undo block_subarray(): rewrite metadata_version of @sra as
+ * "external:<text_version>" with the first character of the text version
+ * (offset 9, right after "external:") forced to '/'. When @unfreeze is set
+ * and the array has a sync_action attribute, "idle" is written to it as
+ * well.
+ * Returns 0 on success, -1 when @sra is NULL or a sysfs write fails.
+ */
+int unblock_subarray(struct mdinfo *sra, const int unfreeze)
+{
+	char buf[64];
+
+	if (!sra)
+		return -1;
+
+	sprintf(buf, "external:%s\n", sra->text_version);
+	buf[9] = '/';
+
+	if (sysfs_set_str(sra, NULL, "metadata_version", buf))
+		return -1;
+
+	if (unfreeze &&
+	    sysfs_attribute_available(sra, NULL, "sync_action") &&
+	    sysfs_set_str(sra, NULL, "sync_action", "idle"))
+		return -1;
+
+	return 0;
+}
+
+/* Mark the array @sra as blocked: rewrite its metadata_version as
+ * "external:<text_version>" with the first character of the text version
+ * (offset 9, right after "external:") forced to '-'.
+ * Returns 0 on success, -1 when the sysfs write fails.
+ */
+int block_subarray(struct mdinfo *sra)
+{
+	char buf[64];
+
+	sprintf(buf, "external:%s\n", sra->text_version);
+	buf[9] = '-';
+
+	return sysfs_set_str(sra, NULL, "metadata_version", buf) ? -1 : 0;
+}
+
+/* Check whether the mdmon instance serving @container supports the array
+ * blocking mechanism, i.e. reports a version of at least 3002000.
+ * Returns 0 when it does or when no mdmon is running, -1 (after printing
+ * an error) when the version is too old or cannot be obtained.
+ */
+int check_mdmon_version(char *container)
+{
+	char *version;
+	int ver;
+
+	/* If mdmon is not active we assume that any instance that is
+	 * started later will match the current mdadm version. If this
+	 * assumption is violated we may inadvertently rebuild an array
+	 * that was meant for reshape, or start rebuild on a spare that
+	 * was to be moved to another container.
+	 */
+	if (!mdmon_running(container))
+		return 0;
+
+	version = ping_monitor_version(container);
+	ver = version ? mdadm_version(version) : -1;
+	free(version);
+
+	if (ver < 3002000) {
+		pr_err("mdmon instance for %s cannot be disabled\n",
+		       container);
+		return -1;
+	}
+
+	return 0;
+}
+
+/**
+ * block_monitor - prevent mdmon spare assignment
+ * @container - container to block
+ * @freeze - flag to additionally freeze sync_action
+ *
+ * This is used by the reshape code to freeze the container, and the
+ * auto-rebuild implementation to atomically move spares.
+ * In both cases we need to stop mdmon from assigning spares to replace
+ * failed devices as we might have other plans for the spare.
+ * For the reshape case we also need to 'freeze' sync_action so that
+ * no recovery happens until we have fully prepared for the reshape.
+ *
+ * We tell mdmon that the array is frozen by marking the 'metadata' name
+ * with a leading '-'. This previously told mdmon "Don't make this array
+ * read/write, leave it readonly". Now it means a more general "Don't
+ * reconfigure this array at all".
+ * As older versions of mdmon (which might run from initrd) don't understand
+ * this, we first check that the running mdmon is new enough.
+ *
+ * Returns 0 when every member array of the container was blocked, and -1
+ * when the mdmon version check fails, /proc/mdstat cannot be read or a
+ * member cannot be blocked; in the last case the members handled before
+ * the failing one are unblocked again.
+ */
+int block_monitor(char *container, const int freeze)
+{
+ struct mdstat_ent *ent, *e, *e2;
+ struct mdinfo *sra = NULL;
+ char buf[64];
+ int rv = 0;
+
+ if (check_mdmon_version(container))
+ return -1;
+
+ ent = mdstat_read(0, 0);
+ if (!ent) {
+ pr_err("failed to read /proc/mdstat while disabling mdmon\n");
+ return -1;
+ }
+
+ /* freeze container contents */
+ /* every 'break' below leaves 'e' pointing at the member that failed;
+ * the loop only ends with e == NULL when all members succeeded
+ */
+ for (e = ent; e; e = e->next) {
+ if (!is_container_member(e, container))
+ continue;
+ sysfs_free(sra);
+ sra = sysfs_read(-1, e->devnm, GET_VERSION);
+ if (!sra) {
+ pr_err("failed to read sysfs for subarray%s\n",
+ to_subarray(e, container));
+ break;
+ }
+ /* can't reshape an array that we can't monitor */
+ if (sra->text_version[0] == '-')
+ break;
+
+ if (freeze && sysfs_freeze_array(sra) < 1)
+ break;
+ /* flag this array to not be modified by mdmon (close race with
+ * takeover in reshape case and spare reassignment in the
+ * auto-rebuild case)
+ */
+ if (block_subarray(sra))
+ break;
+ ping_monitor(container);
+
+ /* check that we did not race with recovery */
+ /* NOTE(review): both alternatives require 'freeze', so with
+ * freeze == 0 the else branch is always taken and the call fails
+ * on the first member - confirm that this is intended
+ */
+ if ((freeze &&
+ !sysfs_attribute_available(sra, NULL, "sync_action")) ||
+ (freeze &&
+ sysfs_attribute_available(sra, NULL, "sync_action") &&
+ sysfs_get_str(sra, NULL, "sync_action", buf, 20) > 0 &&
+ strcmp(buf, "frozen\n") == 0))
+ /* pass */;
+ else {
+ unblock_subarray(sra, 0);
+ break;
+ }
+ /* Double check against races - there should be no spares
+ * or part-spares
+ */
+ sysfs_free(sra);
+ sra = sysfs_read(-1, e->devnm, GET_DEVS | GET_STATE);
+ if (sra && sra->array.spare_disks > 0) {
+ unblock_subarray(sra, freeze);
+ break;
+ }
+ }
+
+ if (e) {
+ pr_err("failed to freeze subarray%s\n",
+ to_subarray(e, container));
+
+ /* thaw the partially frozen container */
+ for (e2 = ent; e2 && e2 != e; e2 = e2->next) {
+ if (!is_container_member(e2, container))
+ continue;
+ sysfs_free(sra);
+ sra = sysfs_read(-1, e2->devnm, GET_VERSION);
+ /* unblock_subarray() copes with a NULL 'sra' by returning -1 */
+ if (unblock_subarray(sra, freeze))
+ pr_err("Failed to unfreeze %s\n", e2->devnm);
+ }
+
+ ping_monitor(container); /* cleared frozen */
+ rv = -1;
+ }
+
+ sysfs_free(sra);
+ free_mdstat(ent);
+
+ return rv;
+}
+
+/* Counterpart of block_monitor(): unblock every member array of
+ * @container and, when @unfreeze is set, set its sync_action back to
+ * "idle" (see unblock_subarray()). mdmon is pinged afterwards if at least
+ * one member with a level above 0 was found. Failures are reported with
+ * pr_err() only.
+ */
+void unblock_monitor(char *container, const int unfreeze)
+{
+	struct mdstat_ent *list, *member;
+	int need_ping = 0;
+
+	list = mdstat_read(0, 0);
+	if (!list) {
+		pr_err("failed to read /proc/mdstat while unblocking container\n");
+		return;
+	}
+
+	/* unfreeze container contents */
+	for (member = list; member; member = member->next) {
+		struct mdinfo *sra;
+
+		if (!is_container_member(member, container))
+			continue;
+
+		sra = sysfs_read(-1, member->devnm, GET_VERSION|GET_LEVEL);
+		if (!sra)
+			continue;
+
+		if (sra->array.level > 0)
+			need_ping = 1;
+		if (unblock_subarray(sra, unfreeze))
+			pr_err("Failed to unfreeze %s\n", member->devnm);
+		sysfs_free(sra);
+	}
+
+	if (need_ping)
+		ping_monitor(container);
+
+	free_mdstat(list);
+}
+
+/* Give the manager a chance to view the updated container state. The
+ * manager would notice a change in /proc/mdstat by itself sooner or later;
+ * pinging encourages the detection to happen while an exclusive open() on
+ * the container is active.
+ *
+ * The ping is a message whose length field is -1. Returns 0 when a reply
+ * arrived, the negative result of connect_monitor() when connecting
+ * fails, and a non-zero error otherwise.
+ */
+int ping_manager(char *devname)
+{
+	struct metadata_update ping = { .len = -1 };
+	int sfd = connect_monitor(devname);
+	int err;
+
+	if (sfd < 0)
+		return sfd;
+
+	err = send_message(sfd, &ping, 20);
+	/* wait for the reply only when the ping went out */
+	if (err == 0 && wait_reply(sfd, 20) != 0)
+		err = -1;
+
+	close(sfd);
+	return err;
+}
+
+/* When takeover is used for grow purposes, mdadm has to be sure that mdmon
+ * has processed all updates (and, if necessary, that mdmon gets closed on a
+ * takeover to raid0). This is done by pinging the manager first and the
+ * monitor second; the results of both pings are ignored.
+ */
+void flush_mdmon(char *container)
+{
+ ping_manager(container);
+ ping_monitor(container);
+}
diff --git a/msg.h b/msg.h
new file mode 100644
index 0000000..016612c
--- /dev/null
+++ b/msg.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (C) 2008 Intel Corporation
+ *
+ * mdmon socket / message handling
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+/* only pointers to these types appear in the prototypes below */
+struct mdinfo;
+struct metadata_update;
+
+/* Framed message transfer over a socket. 'tmo' is a timeout in seconds
+ * for each transfer step, 0 meaning no timeout. All return 0 on success.
+ */
+extern int receive_message(int fd, struct metadata_update *msg, int tmo);
+extern int send_message(int fd, struct metadata_update *msg, int tmo);
+/* send a message without payload */
+extern int ack(int fd, int tmo);
+/* receive one message and discard its payload */
+extern int wait_reply(int fd, int tmo);
+/* open a non-blocking connection to the mdmon socket; fd or -1 */
+extern int connect_monitor(char *devname);
+/* connect, ping, disconnect; 0 when mdmon answered */
+extern int ping_monitor(char *devname);
+/* set / clear the '-' marker in an array's metadata_version */
+extern int block_subarray(struct mdinfo *sra);
+extern int unblock_subarray(struct mdinfo *sra, const int unfreeze);
+/* block / unblock all member arrays of a container */
+extern int block_monitor(char *container, const int freeze);
+extern void unblock_monitor(char *container, const int unfreeze);
+/* ping over an already connected socket */
+extern int fping_monitor(int sock);
+/* ping with a message of length -1 */
+extern int ping_manager(char *devname);
+/* ping_manager() followed by ping_monitor() */
+extern void flush_mdmon(char *container);
+
+/* largest payload length receive_message() accepts */
+#define MSG_MAX_LEN (4*1024*1024)
diff --git a/part.h b/part.h
new file mode 100644
index 0000000..e697fb4
--- /dev/null
+++ b/part.h
@@ -0,0 +1,79 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2010 Neil Brown <neilb@suse.de>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email: <neil@brown.name>
+ *
+ */
+
+/* Structure definitions etc. for MBR and GPT partition tables
+ */
+
+/* 0xAA55 converted to little-endian 16-bit form; presumably the value
+ * expected in the 'magic' field of struct MBR - confirm at the users
+ */
+#define MBR_SIGNATURE_MAGIC __cpu_to_le16(0xAA55)
+/* number of entries in the 'parts' table of struct MBR */
+#define MBR_PARTITIONS 4
+
+/* One entry of the MBR partition table. Packed; the fields add up to 16
+ * bytes (eight single bytes followed by two 32-bit words).
+ */
+struct MBR_part_record {
+ __u8 bootable;
+ __u8 first_head;
+ __u8 first_sector;
+ __u8 first_cyl;
+ __u8 part_type;
+ __u8 last_head;
+ __u8 last_sector;
+ __u8 last_cyl;
+ __u32 first_sect_lba;
+ __u32 blocks_num;
+} __attribute__((packed));
+
+/* Whole MBR. Packed; 446 bytes of padding, MBR_PARTITIONS partition
+ * records and a 16-bit magic word add up to 512 bytes.
+ */
+struct MBR {
+ __u8 pad[446];
+ struct MBR_part_record parts[MBR_PARTITIONS];
+ __u16 magic;
+} __attribute__((packed));
+
+/* 64-bit signature in little-endian form; stored that way its bytes read
+ * "EFI PART". Presumably compared against the 'magic' field of struct GPT
+ * - confirm at the users.
+ */
+#define GPT_SIGNATURE_MAGIC __cpu_to_le64(0x5452415020494645ULL)
+/* NOTE(review): by its name, the MBR part_type value that announces a GPT
+ * disk - confirm at the users
+ */
+#define MBR_GPT_PARTITION_TYPE 0xEE
+
+/* One entry of the GPT partition array. Packed; the fields add up to 128
+ * bytes.
+ */
+struct GPT_part_entry {
+ unsigned char type_guid[16];
+ unsigned char partition_guid[16];
+ __u64 starting_lba;
+ __u64 ending_lba;
+ unsigned char attr_bits[8];
+ unsigned char name[72];
+} __attribute__((packed));
+
+/* GPT header. Packed; the named fields take 92 bytes and 'pad2' extends
+ * the structure to 512 bytes.
+ */
+struct GPT {
+ __u64 magic;
+ __u32 revision;
+ __u32 header_size;
+ __u32 crc;
+ __u32 pad1;
+ __u64 current_lba;
+ __u64 backup_lba;
+ __u64 first_lba;
+ __u64 last_lba;
+ __u8 guid[16];
+ __u64 part_start;
+ __u32 part_cnt;
+ __u32 part_size;
+ __u32 part_crc;
+ __u8 pad2[420];
+} __attribute__((packed));
diff --git a/platform-intel.c b/platform-intel.c
new file mode 100644
index 0000000..5a8729e
--- /dev/null
+++ b/platform-intel.c
@@ -0,0 +1,969 @@
+/*
+ * Intel(R) Matrix Storage Manager hardware and firmware support routines
+ *
+ * Copyright (C) 2008 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#include "mdadm.h"
+#include "platform-intel.h"
+#include "probe_roms.h"
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <dirent.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <limits.h>
+
+/* sysfs directory prefix; get_nvme_multipath_dev_hw_path() and
+ * is_multipath_nvme() compare device paths against it.
+ */
+#define NVME_SUBSYS_PATH "/sys/devices/virtual/nvme-subsystem/"
+
+/* Forward declaration: defined further down, used by find_driver_devices(). */
+static int devpath_to_ll(const char *dev_path, const char *entry,
+ unsigned long long *val);
+
+/* Release every node of the list that starts at *list, together with each
+ * node's path string, and leave *list set to NULL.
+ */
+static void free_sys_dev(struct sys_dev **list)
+{
+	struct sys_dev *node = *list;
+
+	while (node) {
+		struct sys_dev *following = node->next;
+
+		free(node->path);	/* free(NULL) is a no-op */
+		free(node);
+		node = following;
+	}
+	*list = NULL;
+}
+
+/* Search sysfs for Intel (vendor 0x8086) devices bound to 'driver'.
+ * Parameters:
+ *	bus - bus name, used as /sys/bus/<bus>/drivers/<driver>
+ *	driver - driver name; "isci", "ahci", "nvme" and "vmd" select the
+ *		 device type, any other name yields SYS_DEV_UNKNOWN
+ * Returns:
+ *	a newly allocated list of matching devices, or NULL when the driver
+ *	directory cannot be opened or nothing matched. When searching for
+ *	"nvme" the VMD controllers are appended to the list and NVMe devices
+ *	located below a VMD domain are left out.
+ */
+struct sys_dev *find_driver_devices(const char *bus, const char *driver)
+{
+	char path[PATH_MAX];
+	char link[PATH_MAX];
+	char *c, *p;
+	DIR *driver_dir;
+	struct dirent *de;
+	struct sys_dev *head = NULL;
+	struct sys_dev *list = NULL;
+	struct sys_dev *vmd = NULL;
+	enum sys_dev_type type;
+	unsigned long long dev_id;
+	unsigned long long class;
+
+	if (strcmp(driver, "isci") == 0)
+		type = SYS_DEV_SAS;
+	else if (strcmp(driver, "ahci") == 0)
+		type = SYS_DEV_SATA;
+	else if (strcmp(driver, "nvme") == 0) {
+		/* if looking for nvme devs, first look for vmd */
+		vmd = find_driver_devices("pci", "vmd");
+		type = SYS_DEV_NVME;
+	} else if (strcmp(driver, "vmd") == 0)
+		type = SYS_DEV_VMD;
+	else
+		type = SYS_DEV_UNKNOWN;
+
+	/* snprintf: bus, driver and d_name are not length-checked by callers */
+	snprintf(path, sizeof(path), "/sys/bus/%s/drivers/%s", bus, driver);
+	driver_dir = opendir(path);
+	if (!driver_dir) {
+		free_sys_dev(&vmd);
+		return NULL;
+	}
+	for (de = readdir(driver_dir); de; de = readdir(driver_dir)) {
+		int n;
+		int skip = 0;
+
+		/* is 'de' a device? check that the 'subsystem' link exists and
+		 * that its target matches 'bus'
+		 */
+		snprintf(path, sizeof(path),
+			 "/sys/bus/%s/drivers/%s/%s/subsystem",
+			 bus, driver, de->d_name);
+		n = readlink(path, link, sizeof(link));
+		if (n < 0 || n >= (int)sizeof(link))
+			continue;
+		link[n] = '\0';
+		c = strrchr(link, '/');
+		if (!c)
+			continue;
+		if (strncmp(bus, c+1, strlen(bus)) != 0)
+			continue;
+
+		snprintf(path, sizeof(path), "/sys/bus/%s/drivers/%s/%s",
+			 bus, driver, de->d_name);
+
+		/* if searching for nvme - skip vmd connected one */
+		if (type == SYS_DEV_NVME) {
+			struct sys_dev *dev;
+			char *rp = realpath(path, NULL);
+
+			/* realpath() may fail; never hand NULL to strncmp().
+			 * The failure is reported by the realpath() call
+			 * further down.
+			 */
+			for (dev = vmd; rp && dev; dev = dev->next) {
+				if ((strncmp(dev->path, rp, strlen(dev->path)) == 0))
+					skip = 1;
+			}
+			free(rp);
+		}
+
+		/* if it's not Intel device or mark as VMD connected - skip it. */
+		if (devpath_to_vendor(path) != 0x8086 || skip == 1)
+			continue;
+
+		if (devpath_to_ll(path, "device", &dev_id) != 0)
+			continue;
+
+		if (devpath_to_ll(path, "class", &class) != 0)
+			continue;
+
+		/*
+		 * Each VMD device (domain) adds separate PCI bus, it is better
+		 * to store path as a path to that bus (easier further
+		 * determination which NVMe dev is connected to this particular
+		 * VMD domain).
+		 */
+		if (type == SYS_DEV_VMD) {
+			snprintf(path, sizeof(path),
+				 "/sys/bus/%s/drivers/%s/%s/domain/device",
+				 bus, driver, de->d_name);
+		}
+		p = realpath(path, NULL);
+		if (p == NULL) {
+			pr_err("Unable to get real path for '%s'\n", path);
+			continue;
+		}
+
+		/* start / add list entry */
+		if (!head) {
+			head = xmalloc(sizeof(*head));
+			list = head;
+		} else {
+			list->next = xmalloc(sizeof(*head));
+			list = list->next;
+		}
+
+		if (!list) {
+			free_sys_dev(&head);
+			break;
+		}
+
+		list->dev_id = (__u16) dev_id;
+		list->class = (__u32) class;
+		list->type = type;
+		list->next = NULL;
+		list->path = p;
+
+		/* pci_id points into path, just past its last '/' */
+		if ((list->pci_id = strrchr(list->path, '/')) != NULL)
+			list->pci_id++;
+	}
+	closedir(driver_dir);
+
+	/* hand the VMD controllers back at the end of the list */
+	if (vmd) {
+		if (list)
+			list->next = vmd;
+		else
+			head = vmd;
+	}
+
+	return head;
+}
+
+/* Device list cached by find_intel_devices() and the time() value recorded
+ * when that list was built.
+ */
+static struct sys_dev *intel_devices=NULL;
+static time_t valid_time = 0;
+
+/* Return the first entry of the cached device list whose dev_id equals
+ * 'device_id', or NULL when there is none.
+ */
+struct sys_dev *device_by_id(__u16 device_id)
+{
+	struct sys_dev *dev = intel_devices;
+
+	while (dev && dev->dev_id != device_id)
+		dev = dev->next;
+
+	return dev;
+}
+
+/* Return the first entry of the cached device list whose dev_id equals
+ * 'device_id' and whose path contains 'path' as a substring, or NULL.
+ */
+struct sys_dev *device_by_id_and_path(__u16 device_id, const char *path)
+{
+	struct sys_dev *dev;
+
+	for (dev = intel_devices; dev; dev = dev->next) {
+		if (dev->dev_id != device_id)
+			continue;
+		if (strstr(dev->path, path))
+			return dev;
+	}
+
+	return NULL;
+}
+
+/* Read the numeric value stored in the file dev_path/entry.
+ * Returns -1 when the file cannot be opened, otherwise the result of
+ * sysfs_fd_get_ll(), which stores the value in *val.
+ */
+static int devpath_to_ll(const char *dev_path, const char *entry, unsigned long long *val)
+{
+	/* room for dev_path, '/', entry and the terminating NUL */
+	char path[strlen(dev_path) + strlen(entry) + 2];
+	int rc = -1;
+	int fd;
+
+	sprintf(path, "%s/%s", dev_path, entry);
+
+	fd = open(path, O_RDONLY);
+	if (fd >= 0) {
+		rc = sysfs_fd_get_ll(fd, val);
+		close(fd);
+	}
+
+	return rc;
+}
+
+/* Read the hexadecimal vendor id stored in dev_path/vendor.
+ * Returns 0xffff when the file cannot be opened or holds fewer than
+ * seven bytes.
+ */
+__u16 devpath_to_vendor(const char *dev_path)
+{
+	char path[strlen(dev_path) + strlen("/vendor") + 1];
+	char vendor[7];
+	__u16 id = 0xffff;
+	int fd;
+
+	sprintf(path, "%s/vendor", dev_path);
+
+	fd = open(path, O_RDONLY);
+	if (fd < 0)
+		return id;
+
+	/* only a full seven-byte read is parsed; the last byte read is
+	 * replaced by the string terminator
+	 */
+	if (read(fd, vendor, sizeof(vendor)) == sizeof(vendor)) {
+		vendor[sizeof(vendor) - 1] = '\0';
+		id = strtoul(vendor, NULL, 16);
+	}
+	close(fd);
+
+	return id;
+}
+
+/* Description: Load the text stored in dev_path/entry into buf
+ * Parameters:
+ *	dev_path - sysfs path to the device
+ *	entry - name of the attribute file below dev_path
+ *	buf - buffer receiving the text
+ *	len - size of buf
+ *	verbose - when non-zero, report a failed read with pr_err
+ * Returns:
+ *	0 on success, 1 when load_sys() reports a failure
+ */
+int devpath_to_char(const char *dev_path, const char *entry, char *buf, int len,
+		    int verbose)
+{
+	char path[PATH_MAX];
+
+	snprintf(path, sizeof(path), "%s/%s", dev_path, entry);
+	if (load_sys(path, buf, len) == 0)
+		return 0;
+
+	if (verbose)
+		pr_err("Cannot read %s, aborting\n", path);
+	return 1;
+}
+
+/* Return the list of Intel controllers, rebuilding it when the cached copy
+ * is 10 or more seconds old. The list holds the ahci devices first, then
+ * the isci devices, then the result of the nvme search (which also carries
+ * the VMD controllers).
+ */
+struct sys_dev *find_intel_devices(void)
+{
+	struct sys_dev *lists[3];
+	struct sys_dev *head = NULL;
+	struct sys_dev **link = &head;
+	int i;
+
+	if (valid_time > time(0) - 10)
+		return intel_devices;
+
+	free_sys_dev(&intel_devices);
+
+	/* sysfs is searched in the order isci, ahci, nvme ... */
+	lists[1] = find_driver_devices("pci", "isci");
+	lists[0] = find_driver_devices("pci", "ahci");
+	/* Searching for NVMe will return list of NVMe and VMD controllers */
+	lists[2] = find_driver_devices("pci", "nvme");
+
+	/* ... and the results are chained in the order ahci, isci, nvme */
+	for (i = 0; i < 3; i++) {
+		*link = lists[i];
+		while (*link)
+			link = &(*link)->next;
+	}
+
+	intel_devices = head;
+	valid_time = time(0);
+	return intel_devices;
+}
+
+/*
+ * PCI Expansion ROM Data Structure Format
+ *
+ * Packed view of the leading fields of the structure that scan() receives
+ * through its 'data' argument; scan() reads vendorID, deviceID,
+ * devListOffset and pciDataStructRev from it.
+ */
+struct pciExpDataStructFormat {
+ __u8 ver[4];
+ __u16 vendorID;
+ __u16 deviceID;
+ __u16 devListOffset;
+ __u16 pciDataStructLen;
+ __u8 pciDataStructRev;
+} __attribute__ ((packed));
+
+/* Head of the list of known capability tables; add_orom() appends to it. */
+struct orom_entry *orom_entries;
+
+/* Return the first entry of orom_entries whose device id list contains
+ * 'dev_id', or NULL when no entry lists it.
+ */
+const struct orom_entry *get_orom_entry_by_device_id(__u16 dev_id)
+{
+	struct orom_entry *entry = orom_entries;
+
+	while (entry) {
+		struct devid_list *id = entry->devid_list;
+
+		while (id) {
+			if (id->devid == dev_id)
+				return entry;
+			id = id->next;
+		}
+		entry = entry->next;
+	}
+
+	return NULL;
+}
+
+/* Return the capability table registered for 'dev_id', or NULL when no
+ * entry of orom_entries lists that device id.
+ */
+const struct imsm_orom *get_orom_by_device_id(__u16 dev_id)
+{
+	const struct orom_entry *match = get_orom_entry_by_device_id(dev_id);
+
+	return match ? &match->orom : NULL;
+}
+
+/* Append a copy of *orom, with an empty device id list, to the end of
+ * orom_entries and return the new entry. The entry's 'type' member is not
+ * assigned here.
+ */
+static struct orom_entry *add_orom(const struct imsm_orom *orom)
+{
+	struct orom_entry **slot = &orom_entries;
+	struct orom_entry *entry;
+
+	/* find the NULL link that terminates the list */
+	while (*slot)
+		slot = &(*slot)->next;
+
+	entry = xmalloc(sizeof(struct orom_entry));
+	entry->orom = *orom;
+	entry->devid_list = NULL;
+	entry->next = NULL;
+	*slot = entry;
+
+	return entry;
+}
+
+/* Append 'dev_id' to the device id list of 'entry' unless the list already
+ * contains it.
+ */
+static void add_orom_device_id(struct orom_entry *entry, __u16 dev_id)
+{
+	struct devid_list **slot = &entry->devid_list;
+	struct devid_list *node;
+
+	for (; *slot; slot = &(*slot)->next) {
+		if ((*slot)->devid == dev_id)
+			return;
+	}
+
+	node = xmalloc(sizeof(struct devid_list));
+	node->devid = dev_id;
+	node->next = NULL;
+	*slot = node;
+}
+
+/* Callback handed to scan_adapter_roms(): inspect one option-rom image.
+ * Parameters:
+ *	start, end - bounds of the image; 'end' is treated as one past the
+ *		     last valid byte
+ *	data - location of the PCI data structure inside the image
+ * When the image belongs to an Intel device whose id is not registered
+ * yet and contains an IMSM signature, the capability table is copied into
+ * orom_entries together with the device ids it applies to.
+ * Returns:
+ *	always 0
+ */
+static int scan(const void *start, const void *end, const void *data)
+{
+	int offset;
+	const struct imsm_orom *imsm_mem = NULL;
+	int len = (end - start);
+	struct pciExpDataStructFormat *ptr= (struct pciExpDataStructFormat *)data;
+
+	if (data + 0x18 > end) {
+		dprintf("cannot find pciExpDataStruct \n");
+		return 0;
+	}
+
+	dprintf("ptr->vendorID: %lx __le16_to_cpu(ptr->deviceID): %lx \n",
+		(ulong) __le16_to_cpu(ptr->vendorID),
+		(ulong) __le16_to_cpu(ptr->deviceID));
+
+	if (__le16_to_cpu(ptr->vendorID) != 0x8086)
+		return 0;
+
+	/* ids are registered in cpu byte order, so look them up the same way */
+	if (get_orom_by_device_id(__le16_to_cpu(ptr->deviceID)))
+		return 0;
+
+	/* add_orom() copies a whole struct imsm_orom, so only accept a
+	 * signature when the complete table lies inside the image
+	 */
+	for (offset = 0; offset + (int)sizeof(*imsm_mem) <= len; offset += 4) {
+		const void *mem = start + offset;
+
+		if ((memcmp(mem, IMSM_OROM_SIGNATURE, 4) == 0)) {
+			imsm_mem = mem;
+			break;
+		}
+	}
+
+	if (!imsm_mem)
+		return 0;
+
+	struct orom_entry *orom = add_orom(imsm_mem);
+
+	/* only PciDataStructure with revision 3 and above supports devices list. */
+	if (ptr->pciDataStructRev >= 3 && ptr->devListOffset) {
+		const __u16 *dev_list = (void *)ptr +
+					__le16_to_cpu(ptr->devListOffset);
+		int i;
+
+		/* stop at the zero terminator or at the end of the image,
+		 * whichever comes first
+		 */
+		for (i = 0; (const void *)(dev_list + i + 1) <= end &&
+			    dev_list[i] != 0; i++)
+			add_orom_device_id(orom, __le16_to_cpu(dev_list[i]));
+	} else {
+		add_orom_device_id(orom, __le16_to_cpu(ptr->deviceID));
+	}
+
+	return 0;
+}
+
+/* Register a built-in capability table for 'hba' and return it; the
+ * callers in this file use it when one of the IMSM_TEST_* environment
+ * variables is set.
+ * The table advertises RAID 0/1/10/5 unless one of the *_NORAID5 variables
+ * applies, in which case RAID 5 is dropped from 'rlc' ('attr' is computed
+ * before that and keeps the RAID 5 bit).
+ * NOTE(review): the AHCI variable is paired with SYS_DEV_SAS and the SCU
+ * variable with SYS_DEV_SATA - confirm this pairing is intended.
+ */
+const struct imsm_orom *imsm_platform_test(struct sys_dev *hba)
+{
+	const __u16 rlc_without_raid5 = IMSM_OROM_RLC_RAID0 |
+					IMSM_OROM_RLC_RAID1 |
+					IMSM_OROM_RLC_RAID10;
+	struct orom_entry *entry;
+	struct imsm_orom orom = {
+		.signature = IMSM_OROM_SIGNATURE,
+		.rlc = IMSM_OROM_RLC_RAID0 | IMSM_OROM_RLC_RAID1 |
+		       IMSM_OROM_RLC_RAID10 | IMSM_OROM_RLC_RAID5,
+		.sss = IMSM_OROM_SSS_4kB | IMSM_OROM_SSS_8kB |
+		       IMSM_OROM_SSS_16kB | IMSM_OROM_SSS_32kB |
+		       IMSM_OROM_SSS_64kB | IMSM_OROM_SSS_128kB |
+		       IMSM_OROM_SSS_256kB | IMSM_OROM_SSS_512kB |
+		       IMSM_OROM_SSS_1MB | IMSM_OROM_SSS_2MB,
+		.dpa = IMSM_OROM_DISKS_PER_ARRAY,
+		.tds = IMSM_OROM_TOTAL_DISKS,
+		.vpa = IMSM_OROM_VOLUMES_PER_ARRAY,
+		.vphba = IMSM_OROM_VOLUMES_PER_HBA
+	};
+
+	orom.attr = orom.rlc | IMSM_OROM_ATTR_ChecksumVerify;
+
+	if (check_env("IMSM_TEST_OROM_NORAID5"))
+		orom.rlc = rlc_without_raid5;
+	if (check_env("IMSM_TEST_AHCI_EFI_NORAID5") && (hba->type == SYS_DEV_SAS))
+		orom.rlc = rlc_without_raid5;
+	if (check_env("IMSM_TEST_SCU_EFI_NORAID5") && (hba->type == SYS_DEV_SATA))
+		orom.rlc = rlc_without_raid5;
+
+	entry = add_orom(&orom);
+	add_orom_device_id(entry, hba->dev_id);
+
+	return &entry->orom;
+}
+
+/* Look for an IMSM capability table in the adapter option roms.
+ * Returns the table registered for hba->dev_id after the scan, the test
+ * table when IMSM_TEST_OROM is set, or NULL when an EFI test mode is
+ * active, no Intel device is known or the rom probe cannot be set up.
+ */
+static const struct imsm_orom *find_imsm_hba_orom(struct sys_dev *hba)
+{
+	unsigned long align;
+
+	if (check_env("IMSM_TEST_OROM"))
+		return imsm_platform_test(hba);
+
+	/* return empty OROM capabilities in EFI test mode */
+	if (check_env("IMSM_TEST_AHCI_EFI") || check_env("IMSM_TEST_SCU_EFI"))
+		return NULL;
+
+	find_intel_devices();
+	if (!intel_devices)
+		return NULL;
+
+	/* scan option-rom memory looking for an imsm signature */
+	align = check_env("IMSM_SAFE_OROM_SCAN") ? 2048 : 512;
+	if (probe_roms_init(align) != 0)
+		return NULL;
+
+	probe_roms();
+	/* ignore return value - True is returned if both adapter roms are found */
+	scan_adapter_roms(scan);
+	probe_roms_exit();
+
+	return get_orom_by_device_id(hba->dev_id);
+}
+
+/* Buffer size for guid_str(): the 36 characters of
+ * xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx plus the terminating NUL.
+ */
+#define GUID_STR_MAX 37
+
+/* Build a struct efi_guid: a, b and c are stored least significant byte
+ * first, d0..d7 are stored in the order given.
+ */
+#define EFI_GUID(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7) \
+((struct efi_guid) \
+{{ (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff, \
+ (b) & 0xff, ((b) >> 8) & 0xff, \
+ (c) & 0xff, ((c) >> 8) & 0xff, \
+ (d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7) }})
+
+/* Directories of the two EFI variable interfaces: read_efi_variable() uses
+ * the first, read_efi_var() the second.
+ */
+#define SYS_EFI_VAR_PATH "/sys/firmware/efi/vars"
+#define SYS_EFIVARS_PATH "/sys/firmware/efi/efivars"
+/* EFI variable names queried by find_imsm_efi() */
+#define SCU_PROP "RstScuV"
+#define AHCI_PROP "RstSataV"
+#define AHCI_SSATA_PROP "RstsSatV"
+#define AHCI_TSATA_PROP "RsttSatV"
+#define VMD_PROP "RstUefiV"
+
+/* GUID appended to the variable names above when building their paths */
+#define VENDOR_GUID \
+ EFI_GUID(0x193dfefa, 0xa445, 0x4302, 0x99, 0xd8, 0xef, 0x3a, 0xad, 0x1a, 0x04, 0xc6)
+
+/* value find_imsm_efi() requires in sys_dev.class for SATA controllers */
+#define PCI_CLASS_RAID_CNTRL 0x010400
+
+/* Read an EFI variable through the efivarfs interface.
+ * Parameters:
+ *	buffer, buf_size - destination and the number of data bytes wanted
+ *	variable_name, guid - identify the file below SYS_EFIVARS_PATH
+ * Returns:
+ *	0 when at least buf_size data bytes were read, 1 otherwise
+ */
+static int read_efi_var(void *buffer, ssize_t buf_size,
+			const char *variable_name, struct efi_guid guid)
+{
+	char path[PATH_MAX];
+	char buf[GUID_STR_MAX];
+	int failed = 1;
+	int fd;
+
+	snprintf(path, PATH_MAX, "%s/%s-%s", SYS_EFIVARS_PATH, variable_name, guid_str(buf, guid));
+
+	fd = open(path, O_RDONLY);
+	if (fd < 0)
+		return 1;
+
+	/* the file starts with the variable attributes, which are read and
+	 * ignored; the variable data follows
+	 */
+	if (read(fd, buf, sizeof(__u32)) >= 0)
+		failed = read(fd, buffer, buf_size) < buf_size;
+	close(fd);
+
+	return failed;
+}
+
+/* Read an EFI variable, trying efivarfs first and the older sysfs
+ * interface second.
+ * Parameters:
+ *	buffer, buf_size - destination and the number of data bytes wanted
+ *	variable_name, guid - identify the variable
+ * Returns:
+ *	0 when buf_size bytes of variable data were stored in buffer,
+ *	1 otherwise
+ */
+static int read_efi_variable(void *buffer, ssize_t buf_size,
+			     const char *variable_name, struct efi_guid guid)
+{
+	char path[PATH_MAX];
+	char buf[GUID_STR_MAX];
+	int dfd;
+	ssize_t n, var_data_len;
+
+	/* Try to read the variable using the new efivarfs interface first.
+	 * If that fails, fall back to the old sysfs-efivars interface. */
+	if (!read_efi_var(buffer, buf_size, variable_name, guid))
+		return 0;
+
+	snprintf(path, PATH_MAX, "%s/%s-%s/size", SYS_EFI_VAR_PATH, variable_name, guid_str(buf, guid));
+
+	dprintf("EFI VAR: path=%s\n", path);
+	/* get size of variable data */
+	dfd = open(path, O_RDONLY);
+	if (dfd < 0)
+		return 1;
+
+	/* leave room for the terminator written below */
+	n = read(dfd, buf, sizeof(buf) - 1);
+	close(dfd);
+	if (n < 0)
+		return 1;
+	buf[n] = '\0';
+
+	/* strtoul() reports overflow through errno only */
+	errno = 0;
+	var_data_len = strtoul(buf, NULL, 16);
+	if (errno != 0)
+		return 1;
+
+	/* get data */
+	snprintf(path, PATH_MAX, "%s/%s-%s/data", SYS_EFI_VAR_PATH, variable_name, guid_str(buf, guid));
+
+	dprintf("EFI VAR: path=%s\n", path);
+	dfd = open(path, O_RDONLY);
+	if (dfd < 0)
+		return 1;
+
+	n = read(dfd, buffer, buf_size);
+	close(dfd);
+	/* the data must match the advertised size and fill the buffer */
+	if (n != var_data_len || n < buf_size) {
+		return 1;
+	}
+
+	return 0;
+}
+
+/* Look up the IMSM capability table that the firmware publishes in an EFI
+ * variable for 'hba' and register it for hba->dev_id.
+ * Returns the registered table, the test table when an EFI test variable
+ * is set, or NULL when IMSM_TEST_OROM is set, the controller type has no
+ * EFI variable, a SATA controller is not of class PCI_CLASS_RAID_CNTRL or
+ * no variable could be read.
+ */
+const struct imsm_orom *find_imsm_efi(struct sys_dev *hba)
+{
+	static const char * const sata_efivars[] = {AHCI_PROP, AHCI_SSATA_PROP,
+						    AHCI_TSATA_PROP};
+	struct imsm_orom orom;
+	struct orom_entry *entry;
+	unsigned long i;
+	int found = 0;
+
+	if (check_env("IMSM_TEST_AHCI_EFI") || check_env("IMSM_TEST_SCU_EFI"))
+		return imsm_platform_test(hba);
+
+	/* OROM test is set, return that there is no EFI capabilities */
+	if (check_env("IMSM_TEST_OROM"))
+		return NULL;
+
+	switch (hba->type) {
+	case SYS_DEV_SAS:
+		found = !read_efi_variable(&orom, sizeof(orom), SCU_PROP,
+					   VENDOR_GUID);
+		break;
+	case SYS_DEV_SATA:
+		if (hba->class != PCI_CLASS_RAID_CNTRL)
+			break;
+		/* the first variable that can be read wins */
+		for (i = 0; !found && i < ARRAY_SIZE(sata_efivars); i++)
+			found = !read_efi_variable(&orom, sizeof(orom),
+						   sata_efivars[i], VENDOR_GUID);
+		break;
+	case SYS_DEV_VMD:
+		found = !read_efi_variable(&orom, sizeof(orom), VMD_PROP,
+					   VENDOR_GUID);
+		break;
+	default:
+		break;
+	}
+
+	if (!found)
+		return NULL;
+
+	entry = add_orom(&orom);
+	add_orom_device_id(entry, hba->dev_id);
+	entry->type = hba->type;
+
+	return &entry->orom;
+}
+
+/* Return the built-in capability table used for NVMe controllers, or NULL
+ * when 'hba' is not of type SYS_DEV_NVME. The table is registered on first
+ * use and shared afterwards; every call adds hba->dev_id to its id list.
+ */
+const struct imsm_orom *find_imsm_nvme(struct sys_dev *hba)
+{
+	static struct orom_entry *nvme_orom;
+
+	if (hba->type != SYS_DEV_NVME)
+		return NULL;
+
+	if (nvme_orom == NULL) {
+		struct imsm_orom nvme_orom_compat = {
+			.signature = IMSM_NVME_OROM_COMPAT_SIGNATURE,
+			.rlc = IMSM_OROM_RLC_RAID0 | IMSM_OROM_RLC_RAID1 |
+			       IMSM_OROM_RLC_RAID10 | IMSM_OROM_RLC_RAID5,
+			.sss = IMSM_OROM_SSS_4kB | IMSM_OROM_SSS_8kB |
+			       IMSM_OROM_SSS_16kB | IMSM_OROM_SSS_32kB |
+			       IMSM_OROM_SSS_64kB | IMSM_OROM_SSS_128kB,
+			.dpa = IMSM_OROM_DISKS_PER_ARRAY_NVME,
+			.tds = IMSM_OROM_TOTAL_DISKS_NVME,
+			.vpa = IMSM_OROM_VOLUMES_PER_ARRAY,
+			.vphba = IMSM_OROM_TOTAL_DISKS_NVME / 2 * IMSM_OROM_VOLUMES_PER_ARRAY,
+			.attr = IMSM_OROM_ATTR_2TB | IMSM_OROM_ATTR_2TB_DISK,
+			.driver_features = IMSM_OROM_CAPABILITIES_EnterpriseSystem
+		};
+
+		nvme_orom = add_orom(&nvme_orom_compat);
+	}
+
+	add_orom_device_id(nvme_orom, hba->dev_id);
+	nvme_orom->type = SYS_DEV_NVME;
+
+	return &nvme_orom->orom;
+}
+
+/* Return the IMSM capability table for 'hba', or NULL when none is found.
+ * A table already registered for hba->dev_id is returned first; otherwise
+ * NVMe controllers get the built-in NVMe table and all others are looked
+ * up in the EFI variables and then in the option rom.
+ */
+const struct imsm_orom *find_imsm_capability(struct sys_dev *hba)
+{
+	const struct imsm_orom *cap;
+
+	cap = get_orom_by_device_id(hba->dev_id);
+	if (cap)
+		return cap;
+
+	if (hba->type == SYS_DEV_NVME)
+		return find_imsm_nvme(hba);
+
+	cap = find_imsm_efi(hba);
+	if (cap)
+		return cap;
+
+	return find_imsm_hba_orom(hba);
+}
+
+/* Check whether the nvme device is represented by nvme subsystem,
+ * if yes virtual path should be changed to hardware device path,
+ * to allow IMSM capabilities detection.
+ * Parameters:
+ *	dev_path - resolved sysfs path of the device
+ * Returns:
+ *	hardware path to device - if dev_path starts with NVME_SUBSYS_PATH
+ *		and contains a controller entry; the string is allocated by
+ *		realpath() and must be freed by the caller
+ *	NULL - if the device is not represented via nvme virtual subsystem,
+ *		the directory cannot be read or the path cannot be resolved
+ */
+char *get_nvme_multipath_dev_hw_path(const char *dev_path)
+{
+	DIR *dir;
+	struct dirent *ent;
+	char *rp = NULL;
+
+	if (strncmp(dev_path, NVME_SUBSYS_PATH, strlen(NVME_SUBSYS_PATH)) != 0)
+		return NULL;
+
+	dir = opendir(dev_path);
+	if (!dir)
+		return NULL;
+
+	for (ent = readdir(dir); ent; ent = readdir(dir)) {
+		/* room for dev_path, '/', d_name and the terminating NUL */
+		char buf[strlen(dev_path) + strlen(ent->d_name) + 2];
+
+		/* Check if dir is a controller, ignore namespaces: the name
+		 * must start with "nvme" and contain no further 'n'
+		 */
+		if (!(strncmp(ent->d_name, "nvme", 4) == 0) ||
+		    (strrchr(ent->d_name, 'n') != &ent->d_name[0]))
+			continue;
+
+		snprintf(buf, sizeof(buf), "%s/%s", dev_path, ent->d_name);
+		rp = realpath(buf, NULL);
+		break;
+	}
+
+	closedir(dir);
+	return rp;
+}
+
+/* Description: Return part or whole realpath for the dev
+ * Parameters:
+ *	dev - the device to be queried
+ *	dev_level - number of "/device" components appended to the block
+ *		    device path before it is resolved. It allows the caller
+ *		    to access virtual or physical devices which are on the
+ *		    "path" to the queried one.
+ *	buf - optional, must be PATH_MAX size. If set, then will be used.
+ * Returns:
+ *	the result of realpath() on the assembled path, or NULL when the
+ *	path does not fit or cannot be resolved
+ */
+char *devt_to_devpath(dev_t dev, int dev_level, char *buf)
+{
+	char suffix[] = "/device";
+	unsigned long suffix_len = strlen(suffix);
+	char device[PATH_MAX];
+	unsigned long room = sizeof(device) - 1;
+	int level;
+
+	snprintf(device, sizeof(device), "/sys/dev/block/%d:%d", major(dev),
+		 minor(dev));
+
+	/* If caller wants block device, return path to it even if it is exposed
+	 * via virtual layer.
+	 */
+	if (dev_level == 0)
+		return realpath(device, buf);
+
+	room -= strlen(device);
+	for (level = 0; level < dev_level; level++) {
+		char resolved[PATH_MAX];
+		char *hw_path;
+
+		if (room < suffix_len)
+			return NULL;
+
+		strncat(device, suffix, room);
+		room -= suffix_len;
+
+		/* only the first level may need the nvme-subsystem
+		 * abstraction resolved
+		 */
+		if (level != 0)
+			continue;
+
+		if (!realpath(device, resolved))
+			return NULL;
+
+		hw_path = get_nvme_multipath_dev_hw_path(resolved);
+		if (!hw_path)
+			continue;
+
+		/* continue from the hardware path of the controller */
+		strcpy(device, hw_path);
+		room = sizeof(device) - strlen(device) - 1;
+		free(hw_path);
+	}
+
+	return realpath(device, buf);
+}
+
+/* Return the device path for the block device open on 'fd', resolved as
+ * devt_to_devpath() does for 'dev_level' and 'buf'.
+ * Returns NULL when fstat() fails or fd does not refer to a block device.
+ */
+char *diskfd_to_devpath(int fd, int dev_level, char *buf)
+{
+	struct stat st;
+
+	if (fstat(fd, &st) != 0 || !S_ISBLK(st.st_mode))
+		return NULL;
+
+	return devt_to_devpath(st.st_rdev, dev_level, buf);
+}
+
+/* Tell whether 'disk_path' lies below 'hba_path'.
+ * Returns 1 when hba_path is a prefix of disk_path, or unconditionally
+ * when IMSM_TEST_AHCI_DEV or IMSM_TEST_SCU_DEV is set; 0 otherwise,
+ * including when either path is NULL.
+ */
+int path_attached_to_hba(const char *disk_path, const char *hba_path)
+{
+	if (check_env("IMSM_TEST_AHCI_DEV") ||
+	    check_env("IMSM_TEST_SCU_DEV"))
+		return 1;
+
+	if (!disk_path || !hba_path)
+		return 0;
+
+	dprintf("hba: %s - disk: %s\n", hba_path, disk_path);
+
+	return strncmp(disk_path, hba_path, strlen(hba_path)) == 0;
+}
+
+/* Tell whether block device 'dev' is attached to the controller at
+ * 'hba_path'; see path_attached_to_hba() for the result.
+ */
+int devt_attached_to_hba(dev_t dev, const char *hba_path)
+{
+	char *resolved = devt_to_devpath(dev, 1, NULL);
+	int attached = path_attached_to_hba(resolved, hba_path);
+
+	free(resolved);		/* free(NULL) is a no-op */
+	return attached;
+}
+
+/* Tell whether the block device open on 'fd' is attached to the
+ * controller at 'hba_path'; see path_attached_to_hba() for the result.
+ */
+int disk_attached_to_hba(int fd, const char *hba_path)
+{
+	char *resolved = diskfd_to_devpath(fd, 1, NULL);
+	int attached = path_attached_to_hba(resolved, hba_path);
+
+	free(resolved);		/* free(NULL) is a no-op */
+	return attached;
+}
+
+/* Find the VMD controller whose domain device path is a prefix of
+ * hba->path.
+ * Parameters:
+ *	hba - device of type SYS_DEV_VMD
+ *	buf - buffer handed to realpath(); it is also used as scratch space
+ *	      while searching
+ * Returns:
+ *	the resolved path of the controller's driver entry, or NULL when hba
+ *	is NULL or not a VMD device, the driver directory cannot be opened
+ *	or no domain matches
+ */
+char *vmd_domain_to_controller(struct sys_dev *hba, char *buf)
+{
+	struct dirent *ent;
+	DIR *dir;
+	char path[PATH_MAX];
+
+	if (!hba || hba->type != SYS_DEV_VMD)
+		return NULL;
+
+	dir = opendir("/sys/bus/pci/drivers/vmd");
+	if (!dir)
+		return NULL;
+
+	while ((ent = readdir(dir)) != NULL) {
+		sprintf(path, "/sys/bus/pci/drivers/vmd/%s/domain/device",
+			ent->d_name);
+
+		if (!realpath(path, buf))
+			continue;
+		if (strncmp(buf, hba->path, strlen(buf)) != 0)
+			continue;
+
+		/* build the path before the directory stream is closed */
+		sprintf(path, "/sys/bus/pci/drivers/vmd/%s", ent->d_name);
+		closedir(dir);
+		return realpath(path, buf);
+	}
+
+	closedir(dir);
+	return NULL;
+}
+
+/* Scan over all controller's namespaces and compare nsid value to verify if
+ * current one is supported. The routine doesn't check IMSM capabilities for
+ * namespace. Only one nvme namespace is supported by IMSM.
+ * Parameters:
+ *	fd - open descriptor to the nvme namespace
+ *	verbose - error logging level
+ * Returns:
+ *	1 - if namespace is supported
+ *	0 - otherwise, including when a path or an nsid cannot be read
+ */
+int imsm_is_nvme_namespace_supported(int fd, int verbose)
+{
+	DIR *dir = NULL;
+	struct dirent *ent;
+	char cntrl_path[PATH_MAX];
+	char ns_path[PATH_MAX];
+	unsigned long long lowest_nsid = ULLONG_MAX;
+	unsigned long long this_nsid;
+	int rv = 0;
+
+	if (!diskfd_to_devpath(fd, 1, cntrl_path) ||
+	    !diskfd_to_devpath(fd, 0, ns_path)) {
+		if (verbose)
+			pr_err("Cannot get device paths\n");
+		goto abort;
+	}
+
+	if (devpath_to_ll(ns_path, "nsid", &this_nsid)) {
+		if (verbose)
+			pr_err("Cannot read nsid value for %s\n",
+			       basename(ns_path));
+		goto abort;
+	}
+
+	dir = opendir(cntrl_path);
+	if (!dir)
+		goto abort;
+
+	/* The lowest nvme namespace is supported */
+	for (ent = readdir(dir); ent; ent = readdir(dir)) {
+		unsigned long long curr_nsid;
+		char curr_ns_path[PATH_MAX + 256];
+
+		/* only entries whose name contains "nvme" are inspected */
+		if (!strstr(ent->d_name, "nvme"))
+			continue;
+
+		snprintf(curr_ns_path, sizeof(curr_ns_path), "%s/%s",
+			 cntrl_path, ent->d_name);
+
+		if (devpath_to_ll(curr_ns_path, "nsid", &curr_nsid))
+			goto abort;
+
+		if (lowest_nsid > curr_nsid)
+			lowest_nsid = curr_nsid;
+	}
+
+	if (this_nsid == lowest_nsid)
+		rv = 1;
+	else if (verbose)
+		pr_err("IMSM is supported on the lowest NVMe namespace\n");
+
+abort:
+	if (dir)
+		closedir(dir);
+
+	return rv;
+}
+
+/* Verify if the disk is exposed through the NVMe subsystem (multipath)
+ * layer, i.e. its resolved sysfs path starts with NVME_SUBSYS_PATH.
+ * Returns:
+ *	0 - not supported, or the path cannot be resolved
+ *	1 - supported
+ */
+int is_multipath_nvme(int disk_fd)
+{
+	char resolved[PATH_MAX];
+
+	if (!diskfd_to_devpath(disk_fd, 0, resolved))
+		return 0;
+
+	return strncmp(resolved, NVME_SUBSYS_PATH,
+		       strlen(NVME_SUBSYS_PATH)) == 0;
+}
diff --git a/platform-intel.h b/platform-intel.h
new file mode 100644
index 0000000..6238d23
--- /dev/null
+++ b/platform-intel.h
@@ -0,0 +1,259 @@
+/*
+ * Intel(R) Matrix Storage Manager hardware and firmware support routines
+ *
+ * Copyright (C) 2008 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#include <asm/types.h>
+#include <strings.h>
+
+/* The IMSM Capability (IMSM AHCI and ISCU OROM/EFI variable) Version Table
+ * definition.
+ * The structure is packed; the #define lines placed after a member name
+ * the values or bit masks used with that member.
+ */
+struct imsm_orom {
+ __u8 signature[4];
+ #define IMSM_OROM_SIGNATURE "$VER"
+ #define IMSM_NVME_OROM_COMPAT_SIGNATURE "$NVM"
+ __u8 table_ver_major; /* Currently 2 (can change with future revs) */
+ __u8 table_ver_minor; /* Currently 2 (can change with future revs) */
+ __u16 major_ver; /* Example: 8 as in 8.6.0.1020 */
+ __u16 minor_ver; /* Example: 6 as in 8.6.0.1020 */
+ __u16 hotfix_ver; /* Example: 0 as in 8.6.0.1020 */
+ __u16 build; /* Example: 1020 as in 8.6.0.1020 */
+ __u8 len; /* number of bytes in this entire table */
+ __u8 checksum; /* checksum of all the bytes in this table */
+ __u16 rlc; /* RAID Level Capability */
+ /* we assume the cpu is x86 as the orom should not be found
+ * anywhere else
+ */
+ #define IMSM_OROM_RLC_RAID0 (1 << 0)
+ #define IMSM_OROM_RLC_RAID1 (1 << 1)
+ #define IMSM_OROM_RLC_RAID10 (1 << 2)
+ #define IMSM_OROM_RLC_RAID1E (1 << 3)
+ #define IMSM_OROM_RLC_RAID5 (1 << 4)
+ #define IMSM_OROM_RLC_RAID_CNG (1 << 5)
+ __u16 sss; /* Strip Size Supported */
+ #define IMSM_OROM_SSS_2kB (1 << 0)
+ #define IMSM_OROM_SSS_4kB (1 << 1)
+ #define IMSM_OROM_SSS_8kB (1 << 2)
+ #define IMSM_OROM_SSS_16kB (1 << 3)
+ #define IMSM_OROM_SSS_32kB (1 << 4)
+ #define IMSM_OROM_SSS_64kB (1 << 5)
+ #define IMSM_OROM_SSS_128kB (1 << 6)
+ #define IMSM_OROM_SSS_256kB (1 << 7)
+ #define IMSM_OROM_SSS_512kB (1 << 8)
+ #define IMSM_OROM_SSS_1MB (1 << 9)
+ #define IMSM_OROM_SSS_2MB (1 << 10)
+ #define IMSM_OROM_SSS_4MB (1 << 11)
+ #define IMSM_OROM_SSS_8MB (1 << 12)
+ #define IMSM_OROM_SSS_16MB (1 << 13)
+ #define IMSM_OROM_SSS_32MB (1 << 14)
+ #define IMSM_OROM_SSS_64MB (1 << 15)
+ __u16 dpa; /* Disks Per Array supported */
+ #define IMSM_OROM_DISKS_PER_ARRAY 6
+ #define IMSM_OROM_DISKS_PER_ARRAY_NVME 12
+ __u16 tds; /* Total Disks Supported */
+ #define IMSM_OROM_TOTAL_DISKS 6
+ #define IMSM_OROM_TOTAL_DISKS_NVME 12
+ __u8 vpa; /* # Volumes Per Array supported */
+ #define IMSM_OROM_VOLUMES_PER_ARRAY 2
+ __u8 vphba; /* # Volumes Per Host Bus Adapter supported */
+ #define IMSM_OROM_VOLUMES_PER_HBA 4
+ #define IMSM_OROM_VOLUMES_PER_HBA_NVME 4
+ /* Attributes supported. This should map to the
+ * attributes in the MPB. Also, lower 16 bits
+ * should match/duplicate RLC bits above.
+ */
+ __u32 attr;
+ #define IMSM_OROM_ATTR_RAID0 IMSM_OROM_RLC_RAID0
+ #define IMSM_OROM_ATTR_RAID1 IMSM_OROM_RLC_RAID1
+ #define IMSM_OROM_ATTR_RAID10 IMSM_OROM_RLC_RAID10
+ #define IMSM_OROM_ATTR_RAID1E IMSM_OROM_RLC_RAID1E
+ #define IMSM_OROM_ATTR_RAID5 IMSM_OROM_RLC_RAID5
+ #define IMSM_OROM_ATTR_RAID_CNG IMSM_OROM_RLC_RAID_CNG
+ #define IMSM_OROM_ATTR_2TB_DISK (1 << 26)
+ #define IMSM_OROM_ATTR_2TB (1 << 29)
+ #define IMSM_OROM_ATTR_PM (1 << 30)
+ #define IMSM_OROM_ATTR_ChecksumVerify (1 << 31)
+ __u32 capabilities;
+ #define IMSM_OROM_CAPABILITIES_Ext_SATA (1 << 0)
+ #define IMSM_OROM_CAPABILITIES_TurboMemory (1 << 1)
+ #define IMSM_OROM_CAPABILITIES_HddPassword (1 << 2)
+ #define IMSM_OROM_CAPABILITIES_DiskCoercion (1 << 3)
+ __u32 driver_features;
+ #define IMSM_OROM_CAPABILITIES_HDDUnlock (1 << 0)
+ #define IMSM_OROM_CAPABILITIES_LEDLoc (1 << 1)
+ #define IMSM_OROM_CAPABILITIES_EnterpriseSystem (1 << 2)
+ #define IMSM_OROM_CAPABILITIES_Zpodd (1 << 3)
+ #define IMSM_OROM_CAPABILITIES_LargeDramCache (1 << 4)
+ #define IMSM_OROM_CAPABILITIES_Rohi (1 << 5)
+ #define IMSM_OROM_CAPABILITIES_ReadPatrol (1 << 6)
+ #define IMSM_OROM_CAPABILITIES_XorHw (1 << 7)
+ #define IMSM_OROM_CAPABILITIES_SKUMode ((1 << 8)|(1 << 9))
+ #define IMSM_OROM_CAPABILITIES_TPV (1 << 10)
+} __attribute__((packed));
+
+/* Return 1 when the RAID0 bit is set in orom->rlc, 0 otherwise. */
+static inline int imsm_orom_has_raid0(const struct imsm_orom *orom)
+{
+	return (orom->rlc & IMSM_OROM_RLC_RAID0) != 0;
+}
+/* Return 1 when the RAID1 bit is set in orom->rlc, 0 otherwise. */
+static inline int imsm_orom_has_raid1(const struct imsm_orom *orom)
+{
+	return (orom->rlc & IMSM_OROM_RLC_RAID1) != 0;
+}
+/* Return 1 when the RAID1E bit is set in orom->rlc, 0 otherwise. */
+static inline int imsm_orom_has_raid1e(const struct imsm_orom *orom)
+{
+	return (orom->rlc & IMSM_OROM_RLC_RAID1E) != 0;
+}
+/* Return 1 when the RAID10 bit is set in orom->rlc, 0 otherwise. */
+static inline int imsm_orom_has_raid10(const struct imsm_orom *orom)
+{
+	return (orom->rlc & IMSM_OROM_RLC_RAID10) != 0;
+}
+/* Return 1 when the RAID5 bit is set in orom->rlc, 0 otherwise. */
+static inline int imsm_orom_has_raid5(const struct imsm_orom *orom)
+{
+	return (orom->rlc & IMSM_OROM_RLC_RAID5) != 0;
+}
+
+/**
+ * imsm_orom_has_chunk - check if the orom supports the given chunk size
+ * @orom: orom pointer from find_imsm_orom
+ * @chunk: chunk size in kibibytes
+ *
+ * Returns 1 when @chunk is a power of two whose strip-size bit is set in
+ * orom->sss, 0 otherwise. The smallest size with a bit is 2 kibibytes
+ * (bit 0), so 1, 0 and negative values are never supported.
+ */
+static inline int imsm_orom_has_chunk(const struct imsm_orom *orom, int chunk)
+{
+	int fs;
+
+	/* rejecting chunk < 2 first keeps both "chunk - 1" and the shift
+	 * below within defined behaviour
+	 */
+	if (chunk < 2)
+		return 0;
+	if (chunk & (chunk-1))
+		return 0; /* not a power of 2 */
+
+	fs = ffs(chunk) - 1; /* bit num to bit index, at least 1 here */
+	return !!(orom->sss & (1 << (fs - 1)));
+}
+
+/**
+ * fls - find last (most-significant) bit set
+ * @x: the word to search
+ * The function is borrowed from Linux kernel code
+ * include/asm-generic/bitops/fls.h
+ *
+ * Returns the 1-based position of the highest set bit of @x, or 0 when
+ * @x is 0; a negative @x yields 32.
+ */
+static inline int fls(int x)
+{
+	/* shift an unsigned copy: shifting a set bit into the sign bit of
+	 * an int is undefined behaviour
+	 */
+	unsigned int v = (unsigned int)x;
+	int r = 32;
+
+	if (!v)
+		return 0;
+	if (!(v & 0xffff0000u)) {
+		v <<= 16;
+		r -= 16;
+	}
+	if (!(v & 0xff000000u)) {
+		v <<= 8;
+		r -= 8;
+	}
+	if (!(v & 0xf0000000u)) {
+		v <<= 4;
+		r -= 4;
+	}
+	if (!(v & 0xc0000000u)) {
+		v <<= 2;
+		r -= 2;
+	}
+	if (!(v & 0x80000000u)) {
+		r -= 1;
+	}
+	return r;
+}
+
+/* Return 1 when the EnterpriseSystem bit is set in orom->driver_features,
+ * 0 otherwise.
+ */
+static inline int imsm_orom_is_enterprise(const struct imsm_orom *orom)
+{
+	return (orom->driver_features &
+		IMSM_OROM_CAPABILITIES_EnterpriseSystem) != 0;
+}
+
+/* Return 1 when the table carries the NVMe compatibility signature,
+ * 0 otherwise.
+ */
+static inline int imsm_orom_is_nvme(const struct imsm_orom *orom)
+{
+	return !memcmp(orom->signature, IMSM_NVME_OROM_COMPAT_SIGNATURE,
+		       sizeof(orom->signature));
+}
+
+/* Return 1 when the TPV bit is set in orom->driver_features, 0 otherwise. */
+static inline int imsm_orom_has_tpv_support(const struct imsm_orom *orom)
+{
+	return (orom->driver_features & IMSM_OROM_CAPABILITIES_TPV) != 0;
+}
+
+/* Kind of controller a struct sys_dev describes. find_driver_devices()
+ * derives it from the driver name: isci -> SAS, ahci -> SATA,
+ * nvme -> NVME, vmd -> VMD, anything else -> UNKNOWN.
+ */
+enum sys_dev_type {
+ SYS_DEV_UNKNOWN = 0,
+ SYS_DEV_SAS,
+ SYS_DEV_SATA,
+ SYS_DEV_NVME,
+ SYS_DEV_VMD,
+ SYS_DEV_MAX /* presumably a count/sentinel rather than a device kind - confirm */
+};
+
+/* One controller found in sysfs; nodes are chained through 'next'. */
+struct sys_dev {
+ enum sys_dev_type type;
+ char *path; /* path allocated by realpath(), freed together with the node */
+ char *pci_id; /* points into 'path', just past its last '/' */
+ __u16 dev_id; /* sysfs 'device' attribute, truncated to 16 bits */
+ __u32 class; /* sysfs 'class' attribute, truncated to 32 bits */
+ struct sys_dev *next;
+};
+
+/* 16 raw GUID bytes, printed by guid_str() */
+struct efi_guid {
+ __u8 b[16];
+};
+
+/* Singly linked list of device ids */
+struct devid_list {
+ __u16 devid;
+ struct devid_list *next;
+};
+
+/* One capability table together with the device ids it applies to;
+ * entries are chained through 'next'.
+ */
+struct orom_entry {
+ struct imsm_orom orom;
+ struct devid_list *devid_list;
+ enum sys_dev_type type;
+ struct orom_entry *next;
+};
+
+/* Head of the list of orom_entry nodes, defined in platform-intel.c */
+extern struct orom_entry *orom_entries;
+
+/* Format 'guid' into 'buf' as the lower-case text
+ * xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx and return 'buf'.
+ * The first three groups are printed most significant byte first, i.e. in
+ * reverse storage order; the last two groups are printed in storage order.
+ * 'buf' must have room for 37 bytes (36 characters and the terminator).
+ */
+static inline char *guid_str(char *buf, struct efi_guid guid)
+{
+	const __u8 *g = guid.b;
+
+	sprintf(buf, "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
+		g[3], g[2], g[1], g[0],
+		g[5], g[4],
+		g[7], g[6],
+		g[8], g[9],
+		g[10], g[11], g[12], g[13], g[14], g[15]);
+
+	return buf;
+}
+
+/* Prototypes of the platform helpers.
+ * NOTE(review): find_imsm_orom() and get_sys_dev_type() have no definition
+ * in platform-intel.c - presumably defined elsewhere or stale; confirm.
+ */
+char *get_nvme_multipath_dev_hw_path(const char *dev_path);
+char *diskfd_to_devpath(int fd, int dev_level, char *buf);
+int devpath_to_char(const char *dev_path, const char *entry, char *buf,
+ int len, int verbose);
+__u16 devpath_to_vendor(const char *dev_path);
+struct sys_dev *find_driver_devices(const char *bus, const char *driver);
+struct sys_dev *find_intel_devices(void);
+const struct imsm_orom *find_imsm_capability(struct sys_dev *hba);
+const struct imsm_orom *find_imsm_orom(void);
+int disk_attached_to_hba(int fd, const char *hba_path);
+int devt_attached_to_hba(dev_t dev, const char *hba_path);
+char *devt_to_devpath(dev_t dev, int dev_level, char *buf);
+int path_attached_to_hba(const char *disk_path, const char *hba_path);
+const char *get_sys_dev_type(enum sys_dev_type);
+const struct orom_entry *get_orom_entry_by_device_id(__u16 dev_id);
+const struct imsm_orom *get_orom_by_device_id(__u16 device_id);
+struct sys_dev *device_by_id(__u16 device_id);
+struct sys_dev *device_by_id_and_path(__u16 device_id, const char *path);
+int is_multipath_nvme(int disk_fd);
+int imsm_is_nvme_namespace_supported(int disk_fd, int verbose);
+char *vmd_domain_to_controller(struct sys_dev *hba, char *buf);
diff --git a/policy.c b/policy.c
new file mode 100644
index 0000000..eee9ef6
--- /dev/null
+++ b/policy.c
@@ -0,0 +1,931 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email: <neilb@suse.de>
+ */
+
+#include "mdadm.h"
+#include <dirent.h>
+#include <fnmatch.h>
+#include <ctype.h>
+#include "dlink.h"
+/*
+ * Policy module for mdadm.
+ * A policy statement about a device lists a set of values for each
+ * of a set of names. Each value can have a metadata type as context.
+ *
+ * names include:
+ * action - the actions that can be taken on hot-plug
+ * domain - the domain(s) that the device is part of
+ *
+ * Policy information is extracted from various sources, but
+ * particularly from a set of policy rules in mdadm.conf
+ */
+
+/*
+ * Allocate a policy entry for (name, val, metadata) and push it on the
+ * front of the list at *pol.
+ * A non-NULL 'metadata' is replaced by the ->name pointer of the matching
+ * superlist[] entry; "1", "1.0", "1.1" and "1.2" all map to super1.name.
+ * An unrecognised name is reported and stored as "unknown".
+ * 'name' and 'val' are stored as given - the strings are not copied.
+ */
+static void pol_new(struct dev_policy **pol, char *name, const char *val,
+		    const char *metadata)
+{
+	struct dev_policy *entry = xmalloc(sizeof(*entry));
+	const char *canon = NULL;
+	int idx;
+
+	entry->name = name;
+	entry->value = val;
+
+	if (metadata) {
+		/* exact match against a known metadata handler name */
+		for (idx = 0; superlist[idx] && !canon; idx++)
+			if (strcmp(metadata, superlist[idx]->name) == 0)
+				canon = superlist[idx]->name;
+
+		/* every v1.x spelling shares the one handler */
+		if (!canon &&
+		    (strcmp(metadata, "1") == 0 ||
+		     strcmp(metadata, "1.0") == 0 ||
+		     strcmp(metadata, "1.1") == 0 ||
+		     strcmp(metadata, "1.2") == 0))
+			canon = super1.name;
+
+		if (!canon) {
+			/* only complain once while the same string pointer
+			 * keeps being passed in
+			 */
+			static const char *last_warned = NULL;
+
+			if (last_warned != metadata) {
+				pr_err("metadata=%s unrecognised - ignoring rule\n",
+				       metadata);
+				last_warned = metadata;
+			}
+			canon = "unknown";
+		}
+	}
+
+	entry->metadata = canon;
+	entry->next = *pol;
+	*pol = entry;
+}
+
+/*
+ * Ordering predicate for policy entries: non-zero when 'a' sorts before
+ * or equal to 'b'.
+ * Keys, in priority order: name (pointer comparison), value (strcmp),
+ * metadata (pointer comparison).
+ */
+static int pol_lesseq(struct dev_policy *a, struct dev_policy *b)
+{
+	int order;
+
+	if (a->name != b->name)
+		return a->name < b->name;
+
+	order = strcmp(a->value, b->value);
+	if (order != 0)
+		return order < 0;
+
+	return a->metadata <= b->metadata;
+}
+
+/*
+ * Sort the list at *pol into the order defined by pol_lesseq()
+ * (name, then value, then metadata) and store the new head back in *pol.
+ * Each pass of the outer loop merges the two lists pl[0] and pl[1] and
+ * deals the output back onto pl[0]/pl[1], switching output list whenever
+ * the next item sorts before the one just placed.  The loop ends once one
+ * of the two lists comes out empty.
+ */
+static void pol_sort(struct dev_policy **pol)
+{
+ /* sort policy list in *pol by name/value/metadata
+ * using merge sort
+ */
+
+ struct dev_policy *pl[2];
+ pl[0] = *pol;
+ pl[1] = NULL;
+
+ do {
+ struct dev_policy **plp[2], *p[2];
+ int curr = 0;
+ /* dummy "smaller than everything" entry used as the initial 'prev' */
+ struct dev_policy nul = { NULL, NULL, NULL, NULL };
+ struct dev_policy *prev = &nul;
+ int next = 0;
+
+ /* p[] are the two lists that we are merging.
+ * plp[] are the ends of the two lists we create
+ * from the merge.
+ * 'curr' is which of plp[] that we are currently
+ * adding items to.
+ * 'next' is which of p[] we will take the next
+ * item from.
+ * 'prev' is the last value, which was placed in
+ * plp[curr].
+ */
+ plp[0] = &pl[0];
+ plp[1] = &pl[1];
+ p[0] = pl[0];
+ p[1] = pl[1];
+
+ /* take least of p[0] and p[1]
+ * if it is larger than prev, add to
+ * plp[curr], else swap curr then add
+ */
+ while (p[0] || p[1]) {
+ /* switch source list when the current one is exhausted, or when
+ * the three-way comparison of prev and both heads says so
+ */
+ if (p[next] == NULL ||
+ (p[1-next] != NULL &&
+ !(pol_lesseq(prev, p[1-next])
+ ^pol_lesseq(prev, p[next])
+ ^pol_lesseq(p[next], p[1-next])))
+ )
+ next = 1 - next;
+
+ /* the chosen item breaks the ascending run: start a new run on
+ * the other output list
+ */
+ if (!pol_lesseq(prev, p[next]))
+ curr = 1 - curr;
+
+ *plp[curr] = prev = p[next];
+ plp[curr] = &p[next]->next;
+ p[next] = p[next]->next;
+ }
+ /* terminate both output lists */
+ *plp[0] = NULL;
+ *plp[1] = NULL;
+ } while (pl[0] && pl[1]);
+ if (pl[0])
+ *pol = pl[0];
+ else
+ *pol = pl[1];
+}
+
+/*
+ * Drop duplicate entries from a list already sorted by pol_sort().
+ * Because the list is ascending, "next <= current" can only hold when the
+ * two entries compare equal, so the later one is unlinked and freed.
+ */
+static void pol_dedup(struct dev_policy *pol)
+{
+	struct dev_policy *cur = pol;
+
+	while (cur && cur->next) {
+		struct dev_policy *dup = cur->next;
+
+		if (!pol_lesseq(dup, cur)) {
+			cur = dup;
+			continue;
+		}
+		cur->next = dup->next;
+		free(dup);
+	}
+}
+
+/*
+ * pol_find returns the first entry of the (sorted) policy list whose
+ * name is 'name', or NULL when there is none.
+ * Names are compared as pointers, not as strings.
+ * A non-NULL result only says that at least one value exists; the caller
+ * has to walk on from there to see how many.
+ */
+struct dev_policy *pol_find(struct dev_policy *pol, char *name)
+{
+	for (; pol; pol = pol->next) {
+		if (pol->name < name)
+			continue;
+		/* first entry not sorting before 'name': hit or miss */
+		return pol->name == name ? pol : NULL;
+	}
+	return NULL;
+}
+
+/*
+ * Collect the names of all symlinks in /dev/disk/by-path/ that resolve to
+ * the block device disk->disk.{major,minor}.
+ * Returns a NULL-terminated, heap-allocated array of heap-allocated names
+ * (possibly holding only the terminator); release it with free_paths().
+ */
+static char **disk_paths(struct mdinfo *disk)
+{
+	struct stat stb;
+	int prefix_len;
+	size_t room;
+	DIR *by_path;
+	char symlink[PATH_MAX] = "/dev/disk/by-path/";
+	char **paths;
+	int cnt = 0;
+	struct dirent *ent;
+
+	paths = xmalloc(sizeof(*paths) * (cnt+1));
+
+	by_path = opendir(symlink);
+	if (by_path) {
+		prefix_len = strlen(symlink);
+		room = sizeof(symlink) - prefix_len;
+		while ((ent = readdir(by_path)) != NULL) {
+			int n;
+
+			if (ent->d_type != DT_LNK)
+				continue;
+			/* snprintf always NUL-terminates, unlike strncpy;
+			 * a name that does not fit cannot be stat()ed, so
+			 * skip it
+			 */
+			n = snprintf(symlink + prefix_len, room, "%s",
+				     ent->d_name);
+			if (n < 0 || (size_t)n >= room)
+				continue;
+			/* stat() follows the link to the device node */
+			if (stat(symlink, &stb) < 0)
+				continue;
+			if ((stb.st_mode & S_IFMT) != S_IFBLK)
+				continue;
+			if (stb.st_rdev != makedev(disk->disk.major, disk->disk.minor))
+				continue;
+			paths[cnt++] = xstrdup(ent->d_name);
+			/* always keep one spare slot for the terminator */
+			paths = xrealloc(paths, sizeof(*paths) * (cnt+1));
+		}
+		closedir(by_path);
+	}
+	paths[cnt] = NULL;
+	return paths;
+}
+
+/* Device type keywords; also compared by pointer elsewhere in this file. */
+char type_part[] = "part";
+char type_disk[] = "disk";
+
+/*
+ * Classify the device disk->disk.{major,minor}: type_part when sysfs has a
+ * "partition" attribute for it, type_disk otherwise.
+ */
+static char *disk_type(struct mdinfo *disk)
+{
+	char sysfs_attr[30+20+20];
+	struct stat st;
+
+	sprintf(sysfs_attr, "/sys/dev/block/%d:%d/partition",
+		disk->disk.major, disk->disk.minor);
+	return stat(sysfs_attr, &st) == 0 ? type_part : type_disk;
+}
+
+/*
+ * Check whether 'path' ends in "-part" followed by zero or more digits.
+ * Returns 1 and stores a pointer to the '-' of that "-partNN" suffix in
+ * *part when it does; returns 0 (leaving *part untouched) otherwise or
+ * when 'path' is NULL.
+ */
+static int path_has_part(char *path, char **part)
+{
+	int l;
+
+	if (!path)
+		return 0;
+	l = strlen(path);
+	/* step back over the trailing digits; the cast keeps isdigit()
+	 * defined for bytes >= 0x80 where plain char is signed
+	 */
+	while (l > 1 && isdigit((unsigned char)path[l-1]))
+		l--;
+	if (l < 5 || strncmp(path+l-5, "-part", 5) != 0)
+		return 0;
+	*part = path+l-5;
+	return 1;
+}
+
+/*
+ * Decide whether a rule set applies to a device with the given by-path
+ * names and type.
+ * A rule set with no path= word accepts any path, and one with no type=
+ * word accepts any type; otherwise at least one path= pattern must
+ * fnmatch() one of 'paths' and at least one type= value must equal 'type'.
+ * When 'part' is not NULL only paths ending in -partN are considered; the
+ * suffix is cut off for the duration of the match and *part is left
+ * pointing just past the '-' of the last such path examined.
+ * Returns non-zero on a match.
+ */
+static int pol_match(struct rule *rule, char **paths, char *type, char **part)
+{
+	/* 0: no such word seen, -1: seen but not matched yet, 1: matched */
+	int pathok = 0;
+	int typeok = 0;
+
+	for (; rule; rule = rule->next) {
+		if (rule->name == rule_path) {
+			char **pp;
+
+			if (pathok == 0)
+				pathok = -1;
+			for (pp = paths; pp && *pp; pp++) {
+				char *suffix = NULL;
+
+				if (part) {
+					if (!path_has_part(*pp, &suffix))
+						continue;
+					/* hide "-partN" while matching */
+					*suffix = '\0';
+					*part = suffix + 1;
+				}
+				if (fnmatch(rule->value, *pp, 0) == 0)
+					pathok = 1;
+				if (part)
+					*suffix = '-';
+			}
+		} else if (rule->name == rule_type) {
+			if (typeok == 0)
+				typeok = -1;
+			if (type && strcmp(rule->value, type) == 0)
+				typeok = 1;
+		}
+	}
+	return pathok >= 0 && typeok >= 0;
+}
+
+/*
+ * Copy the action=, domain= and auto= assignments of 'rule' into *pol.
+ * Every copied entry is tagged with the rule set's metadata= value (the
+ * last one met while walking the list), or NULL if there is none.
+ */
+static void pol_merge(struct dev_policy **pol, struct rule *rule)
+{
+	struct rule *r;
+	char *metadata = NULL;
+
+	for (r = rule; r; r = r->next)
+		if (r->name == pol_metadata)
+			metadata = r->value;
+
+	for (r = rule; r; r = r->next) {
+		if (r->name != pol_act &&
+		    r->name != pol_domain &&
+		    r->name != pol_auto)
+			continue;
+		pol_new(pol, r->name, r->value, metadata);
+	}
+}
+
+/*
+ * Copy the action= and domain= assignments of 'rule' into *pol, with
+ * "-<part>" appended to every domain.
+ * The suffixed domain strings are kept on r->dups so that they live as
+ * long as the rule itself (policy_free() releases them) and are reused on
+ * later calls with the same 'part'.
+ * 'part' may be NULL: pol_match() accepts a rule set that has no path=
+ * word without ever setting *part.  No suffix can be built then, so the
+ * domain= words of such a rule set are skipped.
+ */
+static void pol_merge_part(struct dev_policy **pol, struct rule *rule, char *part)
+{
+	struct rule *r;
+	char *metadata = NULL;
+
+	for (r = rule; r ; r = r->next)
+		if (r->name == pol_metadata)
+			metadata = r->value;
+
+	for (r = rule; r ; r = r->next) {
+		char *dom;
+		int len;
+
+		if (r->name == pol_act) {
+			pol_new(pol, r->name, r->value, metadata);
+			continue;
+		}
+		if (r->name != pol_domain)
+			continue;
+		if (!part)
+			/* nothing to append - avoid strlen(NULL) below */
+			continue;
+
+		if (r->dups == NULL)
+			r->dups = dl_head();
+		len = strlen(r->value);
+
+		/* look for an already built "<domain>-<part>" string */
+		for (dom = dl_next(r->dups); dom != r->dups;
+		     dom = dl_next(dom))
+			if (strcmp(dom+len+1, part) == 0)
+				break;
+
+		if (dom == r->dups) {
+			/* none yet: build one and remember it */
+			char *newdom = dl_strndup(
+				r->value, len + 1 + strlen(part));
+			strcat(strcat(newdom, "-"), part);
+			dl_add(r->dups, newdom);
+			dom = newdom;
+		}
+		pol_new(pol, r->name, dom, metadata);
+	}
+}
+
+/* Head of the list of rule sets; policyline() and policy_add() prepend. */
+static struct pol_rule *config_rules = NULL;
+/* Set to &config_rules by policyline() and cleared by policy_free();
+ * not read anywhere else in this file.
+ */
+static struct pol_rule **config_rules_end = NULL;
+/* Non-zero once policyline() has seen a path= word; disk_policy() only
+ * scans /dev/disk/by-path when this is set.
+ */
+static int config_rules_has_path = 0;
+
+/*
+ * Most policy comes from the set of policy rules read from the config
+ * file.
+ * path_policy() gathers the policy that applies to a device given its
+ * by-path names ('paths', may be NULL) and its 'type'.
+ * The result is a sorted, de-duplicated list that the caller releases
+ * with dev_policy_free().
+ */
+struct dev_policy *path_policy(char **paths, char *type)
+{
+	struct pol_rule *pr;
+	struct dev_policy *pol = NULL;
+	int i;
+
+	for (pr = config_rules; pr; pr = pr->next) {
+		char *part = NULL;
+
+		if (pr->type == rule_policy &&
+		    pol_match(pr->rule, paths, type, NULL))
+			pol_merge(&pol, pr->rule);
+
+		/* part-policy rules describe the whole disk but apply to
+		 * its partitions
+		 */
+		if (pr->type == rule_part &&
+		    strcmp(type, type_part) == 0 &&
+		    pol_match(pr->rule, paths, type_disk, &part))
+			pol_merge_part(&pol, pr->rule, part);
+	}
+
+	/* Now add any metadata-specific internal knowledge
+	 * about this path
+	 */
+	if (paths && paths[0]) {
+		for (i = 0; superlist[i]; i++) {
+			const char *d;
+
+			if (!superlist[i]->get_disk_controller_domain)
+				continue;
+			d = superlist[i]->get_disk_controller_domain(paths[0]);
+			if (d)
+				pol_new(&pol, pol_domain, d,
+					superlist[i]->name);
+		}
+	}
+
+	pol_sort(&pol);
+	pol_dedup(pol);
+	return pol;
+}
+
+/*
+ * Add one (name, val, metadata) entry to the list at *pol, then re-sort
+ * and de-duplicate the list so it stays in pol_find() order.
+ */
+void pol_add(struct dev_policy **pol,
+ char *name, char *val,
+ char *metadata)
+{
+ pol_new(pol, name, val, metadata);
+ pol_sort(pol);
+ pol_dedup(*pol);
+}
+
+/*
+ * Release a NULL-terminated array of strings as built by disk_paths():
+ * every element, then the array itself.  A NULL array is accepted.
+ */
+static void free_paths(char **paths)
+{
+	char **p;
+
+	if (!paths)
+		return;
+
+	for (p = paths; *p; p++)
+		free(*p);
+	free(paths);
+}
+
+/*
+ * disk_policy() gathers the policy for the device identified by
+ * disk->disk.{major,minor}.
+ * The by-path names are only looked up when some configured rule
+ * actually contains a path= word.
+ */
+struct dev_policy *disk_policy(struct mdinfo *disk)
+{
+	struct dev_policy *result;
+	char *type = disk_type(disk);
+	char **paths = config_rules_has_path ? disk_paths(disk) : NULL;
+
+	result = path_policy(paths, type);
+
+	free_paths(paths);
+	return result;
+}
+
+/*
+ * Convenience wrapper: look up the policy for a device number by filling
+ * in only the major/minor fields of a temporary mdinfo, which is all that
+ * disk_policy() and its helpers in this file read.
+ */
+struct dev_policy *devid_policy(int dev)
+{
+ struct mdinfo disk;
+ disk.disk.major = major(dev);
+ disk.disk.minor = minor(dev);
+ return disk_policy(&disk);
+}
+
+/*
+ * process policy rules read from config file.
+ *
+ * The arrays below are both the keyword text that try_rule() matches in a
+ * config word and identity tokens: the rest of this file compares
+ * rule->name, pol->name and pol_rule->type against them by pointer, never
+ * with strcmp().
+ */
+
+/* match criteria within a rule set */
+char rule_path[] = "path";
+char rule_type[] = "type";
+
+/* kinds of rule set (pol_rule->type) */
+char rule_policy[] = "policy";
+char rule_part[] = "part-policy";
+
+/* assignments within a rule set */
+char pol_metadata[] = "metadata";
+char pol_act[] = "action";
+char pol_domain[] = "domain";
+char pol_auto[] = "auto";
+
+/*
+ * If the config word 'w' has the form "<name>=<value>", push a new rule
+ * with that name and a private copy of the value onto the front of *rp
+ * and return 1.  Otherwise return 0 and leave *rp alone.
+ */
+static int try_rule(char *w, char *name, struct rule **rp)
+{
+	int len = strlen(name);
+	struct rule *r;
+
+	if (strncmp(w, name, len) != 0)
+		return 0;
+	if (w[len] != '=')
+		return 0;
+
+	r = xmalloc(sizeof(*r));
+	r->name = name;
+	r->value = xstrdup(w + len + 1);
+	r->dups = NULL;
+	r->next = *rp;
+	*rp = r;
+	return 1;
+}
+
+/*
+ * Turn one config line (a dlink list of words headed by 'line') into a
+ * rule set of the given 'type' and prepend it to config_rules.
+ * Words that are not one of the known "<keyword>=<value>" forms are
+ * reported and skipped.
+ */
+void policyline(char *line, char *type)
+{
+	struct pol_rule *pr;
+	char *w;
+
+	if (config_rules_end == NULL)
+		config_rules_end = &config_rules;
+
+	pr = xmalloc(sizeof(*pr));
+	pr->type = type;
+	pr->rule = NULL;
+	for (w = dl_next(line); w != line ; w = dl_next(w)) {
+		if (try_rule(w, rule_path, &pr->rule)) {
+			/* remember that by-path names will be needed */
+			config_rules_has_path = 1;
+			continue;
+		}
+		if (try_rule(w, rule_type, &pr->rule) ||
+		    try_rule(w, pol_metadata, &pr->rule) ||
+		    try_rule(w, pol_act, &pr->rule) ||
+		    try_rule(w, pol_domain, &pr->rule) ||
+		    try_rule(w, pol_auto, &pr->rule))
+			continue;
+		pr_err("policy rule %s unrecognised and ignored\n",
+		       w);
+	}
+	pr->next = config_rules;
+	config_rules = pr;
+}
+
+/*
+ * Build a rule set of the given 'type' from a NULL-terminated list of
+ * (char *name, char *value) argument pairs and prepend it to
+ * config_rules.  Each value is copied; each name is stored as given.
+ */
+void policy_add(char *type, ...)
+{
+	va_list ap;
+	struct pol_rule *pr;
+	char *name, *val;
+
+	pr = xmalloc(sizeof(*pr));
+	pr->type = type;
+	pr->rule = NULL;
+
+	va_start(ap, type);
+	for (name = va_arg(ap, char*); name != NULL;
+	     name = va_arg(ap, char*)) {
+		struct rule *r;
+
+		val = va_arg(ap, char*);
+		r = xmalloc(sizeof(*r));
+		r->name = name;
+		r->value = xstrdup(val);
+		r->dups = NULL;
+		r->next = pr->rule;
+		pr->rule = r;
+	}
+	va_end(ap);
+
+	pr->next = config_rules;
+	config_rules = pr;
+}
+
+/*
+ * Release every configured rule set: each rule's value copy, its list of
+ * suffixed domain strings (if any), the rule and the rule set itself,
+ * then reset the module state to "no rules".
+ */
+void policy_free(void)
+{
+	struct pol_rule *pr, *next_pr;
+
+	for (pr = config_rules; pr; pr = next_pr) {
+		struct rule *r = pr->rule;
+
+		next_pr = pr->next;
+		while (r) {
+			struct rule *dead = r;
+
+			r = r->next;
+			free(dead->value);
+			if (dead->dups)
+				free_line(dead->dups);
+			free(dead);
+		}
+		free(pr);
+	}
+	config_rules = NULL;
+	config_rules_end = NULL;
+	config_rules_has_path = 0;
+}
+
+/*
+ * Free every node of a dev_policy list.  Only the nodes are released;
+ * the name/value/metadata strings they point at are not.
+ */
+void dev_policy_free(struct dev_policy *p)
+{
+	while (p) {
+		struct dev_policy *next = p->next;
+
+		free(p);
+		p = next;
+	}
+}
+
+/*
+ * Translate an action= keyword into its policy_action value.
+ * Unknown keywords map to act_err.
+ */
+static enum policy_action map_act(const char *act)
+{
+	static const struct {
+		const char *word;
+		enum policy_action action;
+	} known[] = {
+		{ "include",		act_include },
+		{ "re-add",		act_re_add },
+		{ "spare",		act_spare },
+		{ "spare-same-slot",	act_spare_same_slot },
+		{ "force-spare",	act_force_spare },
+	};
+	unsigned int i;
+
+	for (i = 0; i < sizeof(known) / sizeof(known[0]); i++)
+		if (strcmp(act, known[i].word) == 0)
+			return known[i].action;
+	return act_err;
+}
+
+/*
+ * Return the numerically largest action that 'plist' grants for
+ * 'metadata', or act_default when no action= entry applies.
+ */
+static enum policy_action policy_action(struct dev_policy *plist, const char *metadata)
+{
+	enum policy_action best = act_default;
+	struct dev_policy *p;
+
+	plist = pol_find(plist, pol_act);
+	pol_for_each(p, plist, metadata) {
+		enum policy_action cand = map_act(p->value);
+
+		best = cand > best ? cand : best;
+	}
+	return best;
+}
+
+/*
+ * Return 1 when the action that 'plist' grants for 'metadata' is at least
+ * 'want', 0 otherwise.  A granted action of act_err never allows anything.
+ */
+int policy_action_allows(struct dev_policy *plist, const char *metadata, enum policy_action want)
+{
+	enum policy_action granted = policy_action(plist, metadata);
+
+	return granted != act_err && granted >= want;
+}
+
+/*
+ * As policy_action_allows(), but for the device described by 'disk': the
+ * policy list is gathered with disk_policy(), queried, and freed again.
+ */
+int disk_action_allows(struct mdinfo *disk, const char *metadata, enum policy_action want)
+{
+ struct dev_policy *pol = disk_policy(disk);
+ int rv = policy_action_allows(pol, metadata, want);
+
+ dev_policy_free(pol);
+ return rv;
+}
+
+/* Domain policy:
+ * Any device can have a list of domains asserted by different policy
+ * statements.
+ * An array also has a list of domains comprising all the domains of
+ * all the devices in the array.
+ * Where an array has a spare-group, that becomes an additional domain for
+ * every device in the array and thus for the array.
+ *
+ * We keep the list of domains in a sorted linked list.
+ * As dev policies are already sorted, this is fairly easy to manage.
+ */
+
+/*
+ * Make sure 'domain' is present in the sorted list at *domp, inserting a
+ * new node if needed, and return the address of the link that points at
+ * its node.  The string is not copied - the node keeps the pointer.
+ */
+static struct domainlist **domain_merge_one(struct domainlist **domp,
+					    const char *domain)
+{
+	struct domainlist *dom;
+
+	/* advance to the first node that does not sort before 'domain' */
+	for (dom = *domp; dom; dom = *domp) {
+		if (strcmp(dom->dom, domain) >= 0)
+			break;
+		domp = &dom->next;
+	}
+
+	if (dom == NULL || strcmp(dom->dom, domain) != 0) {
+		struct domainlist *fresh = xmalloc(sizeof(*fresh));
+
+		fresh->dom = domain;
+		fresh->next = *domp;
+		*domp = fresh;
+	}
+	return domp;
+}
+
+#if (DEBUG)
+/* Debug aid: print every entry of a policy list through dprintf(). */
+void dump_policy(struct dev_policy *policy)
+{
+	struct dev_policy *p;
+
+	for (p = policy; p; p = p->next)
+		dprintf("policy: %p name: %s value: %s metadata: %s\n",
+			p,
+			p->name,
+			p->value,
+			p->metadata);
+}
+#endif
+
+/*
+ * Add to the sorted list at *domp every domain in 'pollist' that applies
+ * to 'metadata' and is not in the list yet.
+ */
+void domain_merge(struct domainlist **domp, struct dev_policy *pollist,
+		  const char *metadata)
+{
+	struct dev_policy *first = pol_find(pollist, pol_domain);
+	struct dev_policy *pol;
+
+	pol_for_each(pol, first, metadata)
+		domain_merge_one(domp, pol->value);
+}
+
+/*
+ * Check that every domain of 'pol' that applies to 'metadata' is also in
+ * 'dom'.  Both lists are sorted, so 'dom' is only ever walked forwards.
+ * Returns:
+ *   -1: pol has no applicable domain, so nothing is known about the
+ *       device and the caller may choose; also returned for metadata
+ *       "imsm" when exactly one domain applied and it matched
+ *    0: some applicable domain is missing from dom
+ *    1: pol has applicable domains and all of them are in dom
+ */
+int domain_test(struct domainlist *dom, struct dev_policy *pol,
+		const char *metadata)
+{
+	int is_imsm = metadata && strcmp(metadata, "imsm") == 0;
+	int matched = 0;
+	struct dev_policy *p;
+
+	pol = pol_find(pol, pol_domain);
+	pol_for_each(p, pol, metadata) {
+		while (dom && strcmp(dom->dom, p->value) < 0)
+			dom = dom->next;
+		if (!dom || strcmp(dom->dom, p->value) != 0)
+			return 0;
+		matched++;
+	}
+
+	if (matched == 0)
+		return -1;
+	if (matched == 1 && is_imsm)
+		return -1;
+	return 1;
+}
+
+/*
+ * Merge into *dom the domains that apply, for 'metadata', to the device
+ * with device number 'devid'.
+ * NOTE(review): the domain strings are stored by pointer and outlive the
+ * dev_policy list freed here - they point into rule/superswitch storage,
+ * not into the list nodes.
+ */
+void domainlist_add_dev(struct domainlist **dom, int devid, const char *metadata)
+{
+ struct dev_policy *pol = devid_policy(devid);
+ domain_merge(dom, pol, metadata);
+ dev_policy_free(pol);
+}
+
+/*
+ * Build the domain list of an array: the union of the domains that apply,
+ * for 'metadata', to each device on mdi->devs.
+ * Returns NULL when 'mdi' is NULL or no domain applies; release the
+ * result with domain_free().
+ */
+struct domainlist *domain_from_array(struct mdinfo *mdi, const char *metadata)
+{
+	struct domainlist *domlist = NULL;
+	struct mdinfo *dev;
+
+	if (!mdi)
+		return NULL;
+
+	for (dev = mdi->devs; dev; dev = dev->next)
+		domainlist_add_dev(&domlist,
+				   makedev(dev->disk.major, dev->disk.minor),
+				   metadata);
+
+	return domlist;
+}
+
+/*
+ * Public wrapper around domain_merge_one(): make sure 'domain' is in the
+ * sorted list at *domp.  The string is stored by pointer, not copied.
+ */
+void domain_add(struct domainlist **domp, char *domain)
+{
+ domain_merge_one(domp, domain);
+}
+
+/*
+ * Free every node of a domain list.  The domain strings themselves are
+ * not owned by the list and are left alone.
+ */
+void domain_free(struct domainlist *dl)
+{
+	struct domainlist *next;
+
+	for (; dl; dl = next) {
+		next = dl->next;
+		free(dl);
+	}
+}
+
+/*
+ * same-path policy.
+ * Some policy decisions are guided by knowledge of which
+ * array previously owned the device at a given physical location (path).
+ * When removing a device from an array we might record the array against
+ * the path, and when finding a new device, we might look for which
+ * array previously used that path.
+ *
+ * The 'array' is described by a map_ent, and the path by the disk in an
+ * mdinfo, or a string.
+ */
+
+/*
+ * Record 'array' (metadata name and uuid) against the path 'id_path' by
+ * writing a one-line cookie file FAILED_SLOTS_DIR/<id_path>, creating the
+ * directory first if necessary.  Failures are reported, not returned.
+ */
+void policy_save_path(char *id_path, struct map_ent *array)
+{
+	char path[PATH_MAX];
+	FILE *f = NULL;
+	int written;
+
+	if (mkdir(FAILED_SLOTS_DIR, S_IRWXU) < 0 && errno != EEXIST) {
+		pr_err("can't create file to save path to old disk: %s\n", strerror(errno));
+		return;
+	}
+
+	snprintf(path, PATH_MAX, FAILED_SLOTS_DIR "/%s", id_path);
+	f = fopen(path, "w");
+	if (f == NULL) {
+		pr_err("can't create file to save path to old disk: %s\n",
+		       strerror(errno));
+		return;
+	}
+
+	/* same layout that policy_check_path() parses */
+	written = fprintf(f, "%20s %08x:%08x:%08x:%08x\n",
+			  array->metadata,
+			  array->uuid[0], array->uuid[1],
+			  array->uuid[2], array->uuid[3]);
+	if (written <= 0)
+		pr_err("Failed to write to <id_path> cookie\n");
+
+	fclose(f);
+}
+
+/*
+ * Look for a cookie saved by policy_save_path() under any of the by-path
+ * names of 'disk'.  The first cookie file that can be opened is parsed
+ * into array->metadata and array->uuid[0..3].
+ * Returns 1 when all five fields were read, 0 otherwise (including when
+ * no cookie exists).
+ */
+int policy_check_path(struct mdinfo *disk, struct map_ent *array)
+{
+	char path[PATH_MAX];
+	FILE *f = NULL;
+	char **id_paths = disk_paths(disk);
+	int i;
+	int rv = 0;
+
+	for (i = 0; id_paths[i]; i++) {
+		snprintf(path, PATH_MAX, FAILED_SLOTS_DIR "/%s", id_paths[i]);
+		f = fopen(path, "r");
+		if (!f)
+			continue;
+
+		/* A %Ns conversion stores up to N characters plus a NUL.
+		 * NOTE(review): struct map_ent is declared in mdadm.h (not
+		 * visible here); upstream declares metadata as char[20],
+		 * so the width is capped at 19 to leave room for the
+		 * terminator - confirm against the header.
+		 */
+		rv = fscanf(f, " %19s %x:%x:%x:%x\n",
+			    array->metadata,
+			    array->uuid,
+			    array->uuid+1,
+			    array->uuid+2,
+			    array->uuid+3);
+		fclose(f);
+		break;
+	}
+	free_paths(id_paths);
+	return rv == 5;
+}
+
+/* Fixed header that Write_rules() emits at the top of the generated udev
+ * rule file, ahead of the per-policy entries.
+ */
+char udev_template_start[] =
+"# do not edit this file, it is automatically generated by mdadm\n"
+"\n";
+
+/*
+ * Return the value of the first rule in the list whose name is
+ * 'rule_type' (compared by pointer), or NULL when there is none.
+ */
+char *find_rule(struct rule *rule, char *rule_type)
+{
+	struct rule *r;
+
+	for (r = rule; r; r = r->next)
+		if (r->name == rule_type)
+			return r->value;
+	return NULL;
+}
+
+/* udev rule for one path, restricted to a device type (disk or part) */
+#define UDEV_RULE_FORMAT \
+"ACTION==\"add\", SUBSYSTEM==\"block\", " \
+"ENV{DEVTYPE}==\"%s\", ENV{ID_PATH}==\"%s\", " \
+"RUN+=\"" BINDIR "/mdadm --incremental $env{DEVNAME}\"\n"
+
+/* udev rule for one path, any device type */
+#define UDEV_RULE_FORMAT_NOTYPE \
+"ACTION==\"add\", SUBSYSTEM==\"block\", " \
+"ENV{ID_PATH}==\"%s\", " \
+"RUN+=\"" BINDIR "/mdadm --incremental $env{DEVNAME}\"\n"
+
+/*
+ * Format one udev rule for the rule set 'rule' and write it to 'fd'.
+ * The device type comes from the type= word, or is forced to "part" when
+ * 'force_part' is set; without either the type-less format is used.
+ * Returns -1 when the rule set has no path= word, otherwise 1 when the
+ * whole line was written and 0 when it was not.
+ */
+int write_rule(struct rule *rule, int fd, int force_part)
+{
+	char line[1024];
+	char *pth = find_rule(rule, rule_path);
+	char *typ = force_part ? type_part : find_rule(rule, rule_type);
+	size_t len;
+
+	if (!pth)
+		return -1;
+
+	if (typ)
+		snprintf(line, sizeof(line) - 1, UDEV_RULE_FORMAT, typ, pth);
+	else
+		snprintf(line, sizeof(line) - 1, UDEV_RULE_FORMAT_NOTYPE, pth);
+
+	len = strlen(line);
+	return write(fd, line, len) == (int)len;
+}
+
+/*
+ * Return the path= value of 'pr' if that rule set qualifies for a udev
+ * entry, NULL otherwise.  To qualify it must be a policy or part-policy
+ * rule set, carry an action that handles bare disks
+ * (>= act_spare_same_slot) and have a path= word.
+ */
+static char *udev_entry_path(struct pol_rule *pr)
+{
+	char *act;
+
+	if (pr->type != rule_policy && pr->type != rule_part)
+		return NULL;
+	act = find_rule(pr->rule, pol_act);
+	if (!act || map_act(act) < act_spare_same_slot)
+		return NULL;
+	return find_rule(pr->rule, rule_path);
+}
+
+/* Generate the udev rule entries from the POLICY lines found in the
+ * config file.  Only rule sets that qualify (see udev_entry_path()) are
+ * used, and only the first occurrence when several of them name the same
+ * path.
+ * The duplicate scan applies the same qualifying test to the earlier rule
+ * set as to the current one; the original tested the current rule set's
+ * type there by mistake.
+ * Returns 1 on success, 0 as soon as a rule could not be written.
+ */
+int generate_entries(int fd)
+{
+	struct pol_rule *loop, *dup;
+	char *loop_value, *dup_value;
+	int duplicate;
+
+	for (loop = config_rules; loop; loop = loop->next) {
+		loop_value = udev_entry_path(loop);
+		if (!loop_value)
+			continue;
+
+		/* has an earlier qualifying rule set claimed this path? */
+		duplicate = 0;
+		for (dup = config_rules; dup != loop; dup = dup->next) {
+			dup_value = udev_entry_path(dup);
+			if (dup_value && strcmp(loop_value, dup_value) == 0) {
+				duplicate = 1;
+				break;
+			}
+		}
+
+		/* not a dup or first occurrence */
+		if (!duplicate &&
+		    !write_rule(loop->rule, fd, loop->type == rule_part))
+			return 0;
+	}
+	return 1;
+}
+
+/* Write_rules creates the dynamic udev rules used to handle hot-plug
+ * events for bare devices (and making them spares).
+ * With a 'rule_name' the rules are written to "<rule_name>.temp", which is
+ * renamed over 'rule_name' once complete; with NULL they go to stdout.
+ * Returns 0 on success and 1 on failure; on failure the temporary file is
+ * removed.
+ */
+int Write_rules(char *rule_name)
+{
+	int fd;
+	char udev_rule_file[PATH_MAX];
+
+	if (rule_name) {
+		/* leave room for ".temp" and the terminator */
+		strncpy(udev_rule_file, rule_name, sizeof(udev_rule_file) - 6);
+		udev_rule_file[sizeof(udev_rule_file) - 6] = '\0';
+		strcat(udev_rule_file, ".temp");
+		fd = creat(udev_rule_file,
+			   S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
+		if (fd == -1)
+			return 1;
+	} else
+		fd = 1;
+
+	/* write static invocation */
+	if (write(fd, udev_template_start, sizeof(udev_template_start) - 1) !=
+	    (int)sizeof(udev_template_start) - 1)
+		goto abort;
+
+	/* generate_entries() returns 0 when a rule could not be written
+	 * and 1 otherwise - it never returns a negative value
+	 */
+	if (!generate_entries(fd))
+		goto abort;
+
+	fsync(fd);
+	if (rule_name) {
+		close(fd);
+		/* the rules only take effect once the rename succeeds */
+		if (rename(udev_rule_file, rule_name) < 0) {
+			unlink(udev_rule_file);
+			return 1;
+		}
+	}
+	return 0;
+abort:
+	if (rule_name) {
+		close(fd);
+		unlink(udev_rule_file);
+	}
+	return 1;
+}
diff --git a/probe_roms.c b/probe_roms.c
new file mode 100644
index 0000000..7ea04c7
--- /dev/null
+++ b/probe_roms.c
@@ -0,0 +1,331 @@
+/*
+ * probe_roms - scan for Adapter ROMS
+ *
+ * (based on linux-2.6:arch/x86/kernel/probe_roms_32.c)
+ *
+ * Copyright (C) 2008 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#include "probe_roms.h"
+#include "mdadm.h"
+#include <unistd.h>
+#include <signal.h>
+#include <fcntl.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <asm/types.h>
+
+/* mapping of the option-rom region of /dev/mem, MAP_FAILED when unmapped */
+static void *rom_mem = MAP_FAILED;
+/* descriptor of /dev/mem, -1 when the module is not initialised */
+static int rom_fd = -1;
+static const int rom_len = 0xf0000 - 0xc0000; /* option-rom memory region */
+/* set by the SIGBUS handler and polled by probe_address8/16(); it must be
+ * volatile sig_atomic_t so that the write from the handler is well
+ * defined and the compiler re-reads it after each probed access
+ */
+static volatile sig_atomic_t _sigbus;
+/* scan granularity chosen in probe_roms_init(): 512 or 2048 */
+static unsigned long rom_align;
+
+static void roms_deinit(void);
+static int roms_init(void);
+
+/* SIGBUS handler installed by probe_roms_init(): just flags that a fault
+ * happened so probe_address8/16() can report it; 'sig' is unused.
+ */
+static void sigbus(int sig)
+{
+ _sigbus = 1;
+}
+
+/*
+ * Read one byte from 'ptr' into *val.
+ * Returns -1 if the SIGBUS flag was set, 0 otherwise; the flag is cleared
+ * either way.
+ */
+static int probe_address8(const __u8 *ptr, __u8 *val)
+{
+	int faulted;
+
+	*val = *ptr;
+	faulted = _sigbus;
+	_sigbus = 0;
+
+	return faulted ? -1 : 0;
+}
+
+/*
+ * Read one 16-bit word from 'ptr' into *val.
+ * Returns -1 if the SIGBUS flag was set, 0 otherwise; the flag is cleared
+ * either way.
+ */
+static int probe_address16(const __u16 *ptr, __u16 *val)
+{
+	int faulted;
+
+	*val = *ptr;
+	faulted = _sigbus;
+	_sigbus = 0;
+
+	return faulted ? -1 : 0;
+}
+
+/*
+ * Undo probe_roms_init(): restore the default SIGBUS disposition, close
+ * /dev/mem, unmap the option-rom region and free the adapter rom list.
+ * rom_fd and rom_mem are reset to their "not initialised" values.
+ */
+void probe_roms_exit(void)
+{
+ signal(SIGBUS, SIG_DFL);
+ if (rom_fd >= 0) {
+ close(rom_fd);
+ rom_fd = -1;
+ }
+ if (rom_mem != MAP_FAILED) {
+ munmap(rom_mem, rom_len);
+ rom_mem = MAP_FAILED;
+ }
+ roms_deinit();
+}
+
+/*
+ * Prepare for scanning: record the scan alignment, allocate the first
+ * adapter rom resource, install the SIGBUS handler and map the option-rom
+ * region (physical 0xc0000 up to 0xf0000) of /dev/mem read-only.
+ *
+ * Valid values for 'align' are 2048 and 512.  512 is for PCI-3.0
+ * compliant systems, or systems that do not have dangerous/legacy ISA
+ * devices.  2048 should always be safe.
+ *
+ * Returns 0 on success and -1 on failure; after a failure past the
+ * argument check everything set up so far has been torn down again.
+ */
+int probe_roms_init(unsigned long align)
+{
+	int fd;
+
+	if (align != 512 && align != 2048)
+		return -1;
+	rom_align = align;
+
+	if (roms_init())
+		return -1;
+
+	if (signal(SIGBUS, sigbus) == SIG_ERR)
+		goto fail;
+
+	fd = open("/dev/mem", O_RDONLY);
+	if (fd < 0)
+		goto fail;
+
+	rom_mem = mmap(NULL, rom_len, PROT_READ, MAP_PRIVATE, fd, 0xc0000);
+	if (rom_mem == MAP_FAILED) {
+		close(fd);
+		goto fail;
+	}
+
+	rom_fd = fd;
+	return 0;
+
+fail:
+	probe_roms_exit();
+	return -1;
+}
+
+/**
+ * isa_bus_to_virt - convert physical address to mmap'd region
+ * @addr - physical address to convert
+ *
+ * The mapping starts at physical 0xc0000, so the result is rom_mem plus
+ * the byte offset of @addr from there.  Only valid between a successful
+ * call to probe_roms_init and the corresponding probe_roms_exit.
+ */
+static void *isa_bus_to_virt(unsigned long addr)
+{
+	char *base = rom_mem;
+
+	return base + (addr - 0xc0000);
+}
+
+/*
+ * One rom region.  start and end are physical addresses (end inclusive);
+ * probe_roms() sets data to start plus the 16-bit value read at rom
+ * offset 0x18.  Adapter roms are chained through next.
+ */
+struct resource {
+ unsigned long start;
+ unsigned long end;
+ unsigned long data;
+ const char *name;
+ struct resource *next;
+};
+
+/* upper bound of the adapter rom scan in probe_roms() */
+static struct resource system_rom_resource = {
+ .name = "System ROM",
+ .start = 0xf0000,
+ .data = 0,
+ .end = 0xfffff,
+};
+
+/* if a valid rom is found here, the adapter rom scan stops at its start */
+static struct resource extension_rom_resource = {
+ .name = "Extension ROM",
+ .start = 0xe0000,
+ .data = 0,
+ .end = 0xeffff,
+};
+
+/* list of adapter roms; first node allocated by roms_init(), further
+ * nodes appended by probe_roms(), all freed by roms_deinit()
+ */
+static struct resource *adapter_rom_resources;
+
+/* default video rom window; probe_roms() narrows it when a rom is found */
+static struct resource video_rom_resource = {
+ .name = "Video ROM",
+ .start = 0xc0000,
+ .data = 0,
+ .end = 0xc7fff,
+};
+
+/*
+ * Allocate the first node of the adapter rom list, starting at 0xc8000
+ * with no end, data or successor yet.
+ * Returns 0 on success, 1 when the allocation fails.
+ */
+static int roms_init(void)
+{
+	struct resource *first = malloc(sizeof(*first));
+
+	adapter_rom_resources = first;
+	if (!first)
+		return 1;
+
+	first->name = "Adapter ROM";
+	first->start = 0xc8000;
+	first->end = 0;
+	first->data = 0;
+	first->next = NULL;
+	return 0;
+}
+
+/*
+ * Free the whole adapter rom list and leave the list head NULL, so that a
+ * repeated probe_roms_exit() (or a scan after exit) never touches freed
+ * nodes.
+ */
+static void roms_deinit(void)
+{
+	struct resource *res;
+
+	res = adapter_rom_resources;
+	/* drop the head first: nothing may reach the nodes while freeing */
+	adapter_rom_resources = NULL;
+	while (res) {
+		struct resource *tmp = res;
+
+		res = res->next;
+		free(tmp);
+	}
+}
+
+/* 16-bit value that marks the start of an option rom image */
+#define ROMSIGNATURE 0xaa55
+
+/*
+ * Return non-zero when the 16-bit word at 'rom' could be read without a
+ * fault and equals ROMSIGNATURE.
+ */
+static int romsignature(const unsigned char *rom)
+{
+	unsigned short sig = 0;
+
+	if (probe_address16((const unsigned short *)rom, &sig) != 0)
+		return 0;
+	return sig == ROMSIGNATURE;
+}
+
+/*
+ * Return non-zero when all 'length' bytes starting at 'rom' could be read
+ * and their sum modulo 256 is zero.
+ */
+static int romchecksum(const unsigned char *rom, unsigned long length)
+{
+	unsigned char sum = 0;
+	unsigned char c;
+
+	while (length) {
+		if (probe_address8(rom++, &c) != 0)
+			break;
+		sum += c;
+		length--;
+	}
+	/* a non-zero length here means the loop stopped on a fault */
+	return !length && !sum;
+}
+
+/*
+ * Let 'fn' examine each of the adapter roms recorded by probe_roms(),
+ * passing the mapped addresses of the rom's start, end and data.
+ * The walk stops at the first rom for which 'fn' returns non-zero, or at
+ * the first list node whose start is 0.
+ * Returns the last value returned by 'fn', or 0 when the module is not
+ * initialised or 'fn' was never called.
+ */
+int scan_adapter_roms(scan_fn fn)
+{
+	struct resource *res;
+	int found = 0;
+
+	if (rom_fd < 0)
+		return 0;
+
+	for (res = adapter_rom_resources; res && res->start; res = res->next) {
+		found = fn(isa_bus_to_virt(res->start),
+			   isa_bus_to_virt(res->end),
+			   isa_bus_to_virt(res->data));
+		if (found)
+			break;
+	}
+
+	return found;
+}
+
+/*
+ * Round 'addr' up to the next multiple of 'alignment'.  The mask
+ * arithmetic is only meaningful when 'alignment' is a power of two.
+ */
+static unsigned long align(unsigned long addr, unsigned long alignment)
+{
+	unsigned long mask = alignment - 1;
+
+	return (addr + mask) & ~mask;
+}
+
+/*
+ * Scan the mapped option-rom region and record what is found:
+ *  1. locate the video rom between 0xc0000 and the first adapter rom
+ *     start, narrowing video_rom_resource accordingly;
+ *  2. decide where the adapter rom area ends: at the system rom, or at the
+ *     extension rom if a valid one is present;
+ *  3. walk that area in rom_align steps and store every rom with a valid
+ *     signature, length and checksum in the adapter_rom_resources list,
+ *     growing the list as needed.
+ * Does nothing unless probe_roms_init() succeeded.
+ */
+void probe_roms(void)
+{
+ const void *rom;
+ unsigned long start, length, upper;
+ unsigned char c;
+ struct resource *res = adapter_rom_resources;
+ __u16 val=0;
+
+ if (rom_fd < 0)
+ return;
+
+ /* video rom */
+ upper = res->start;
+ for (start = video_rom_resource.start; start < upper; start += rom_align) {
+ rom = isa_bus_to_virt(start);
+ if (!romsignature(rom))
+ continue;
+
+ video_rom_resource.start = start;
+
+ /* byte 2 of the image holds its length in 512-byte units */
+ if (probe_address8(rom + 2, &c) != 0)
+ continue;
+
+ /* 0 < length <= 0x7f * 512, historically */
+ length = c * 512;
+
+ /* if checksum okay, trust length byte */
+ if (length && romchecksum(rom, length))
+ video_rom_resource.end = start + length - 1;
+ break;
+ }
+
+ /* adapter roms start after the video rom, but never below the
+ * start of the first adapter resource
+ */
+ start = align(video_rom_resource.end + 1, rom_align);
+ if (start < upper)
+ start = upper;
+
+ /* system rom */
+ upper = system_rom_resource.start;
+
+ /* check for extension rom (ignore length byte!) */
+ rom = isa_bus_to_virt(extension_rom_resource.start);
+ if (romsignature(rom)) {
+ length = extension_rom_resource.end - extension_rom_resource.start + 1;
+ if (romchecksum(rom, length))
+ upper = extension_rom_resource.start;
+ }
+
+ /* last list node that was filled in (or the head); new nodes are linked behind it */
+ struct resource *prev_res = res;
+ /* check for adapter roms on rom_align (512 or 2048 byte) boundaries */
+ for (; start < upper; start += rom_align) {
+ rom = isa_bus_to_virt(start);
+ if (!romsignature(rom))
+ continue;
+
+ if (probe_address8(rom + 2, &c) != 0)
+ continue;
+
+ /* 0 < length <= 0x7f * 512, historically */
+ length = c * 512;
+
+ /* Retrieve 16-bit pointer to PCI Data Structure (offset 18h-19h)
+ * The data can be within 64KB forward of the first location
+ * of this code image. The pointer is in little-endian order
+ */
+
+ if (probe_address16(rom + 0x18, &val) != 0)
+ continue;
+ val = __le16_to_cpu(val);
+
+ /* but accept any length that fits if checksum okay */
+ if (!length || start + length > upper || !romchecksum(rom, length))
+ continue;
+
+ /* ran off the end of the list: append a zeroed node; on allocation
+ * failure the scan is silently abandoned
+ */
+ if (res == NULL) {
+ res = calloc(1, sizeof(struct resource));
+ if (res == NULL)
+ return;
+ prev_res->next = res;
+ }
+
+ res->start = start;
+ res->data = start + (unsigned long)val;
+ res->end = start + length - 1;
+
+ /* continue at the aligned address holding the rom's last byte; the
+ * loop increment then steps past it
+ */
+ start = res->end & ~(rom_align - 1);
+ prev_res = res;
+ res = res->next;
+ }
+}
diff --git a/probe_roms.h b/probe_roms.h
new file mode 100644
index 0000000..6d70411
--- /dev/null
+++ b/probe_roms.h
@@ -0,0 +1,24 @@
+/*
+ * probe_roms - scan for Adapter ROMS
+ *
+ * Copyright (C) 2008 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+/* release everything set up by probe_roms_init() */
+void probe_roms_exit(void);
+/* map the option-rom region of /dev/mem; align must be 512 or 2048.
+ * Returns 0 on success, -1 on failure.
+ */
+int probe_roms_init(unsigned long align);
+/* callback for scan_adapter_roms(): receives the mapped start, end and
+ * data addresses of one adapter rom; a non-zero return stops the scan
+ */
+typedef int (*scan_fn)(const void *start, const void *end, const void *data);
+/* call fn for each adapter rom found by probe_roms(); returns the last
+ * value fn returned, or 0
+ */
+int scan_adapter_roms(scan_fn fn);
+/* scan the mapped region and record the adapter roms found */
+void probe_roms(void);
diff --git a/pwgr.c b/pwgr.c
new file mode 100644
index 0000000..a07de33
--- /dev/null
+++ b/pwgr.c
@@ -0,0 +1,17 @@
+
+/*
+ * We cannot link a static binary with passwd/group support, so
+ * just do without
+ */
+#include <stdlib.h>
+#include <pwd.h>
+#include <grp.h>
+
+/* Stub used for static builds: ignores 'name' and always returns NULL. */
+struct passwd *getpwnam(const char *name)
+{
+ return NULL;
+}
+/* Stub used for static builds: ignores 'name' and always returns NULL. */
+struct group *getgrnam(const char *name)
+{
+ return NULL;
+}
diff --git a/raid5extend.c b/raid5extend.c
new file mode 100644
index 0000000..d8e62c2
--- /dev/null
+++ b/raid5extend.c
@@ -0,0 +1,80 @@
+
+int phys2log(int phys, int stripe, int n, int layout)
+{
+	/* In an 'n' disk array using 'layout',
+	 * in stripe 'stripe', the physical disc 'phys'
+	 * stores what logical chunk?
+	 * -1 means parity.
+	 * -2 is returned when 'layout' is not one of the four
+	 * ALGORITHM_* values handled below.
+	 */
+	int pd;	/* physical disc that holds parity in this stripe */
+
+	switch(layout) {
+	case ALGORITHM_LEFT_ASYMMETRIC:
+		pd = (n-1) - (stripe % n);
+		if (phys < pd)
+			return phys;
+		else if (phys == pd)
+			return -1;
+		else return phys-1;
+
+	case ALGORITHM_RIGHT_ASYMMETRIC:
+		pd = stripe % n;
+		if (phys < pd)
+			return phys;
+		else if (phys == pd)
+			return -1;
+		else return phys-1;
+
+	case ALGORITHM_LEFT_SYMMETRIC:
+		pd = (n-1) - (stripe %n);
+		if (phys < pd)
+			return phys+ n-1-pd;
+		else if (phys == pd)
+			return -1;
+		else return phys-pd-1;
+
+	case ALGORITHM_RIGHT_SYMMETRIC:
+		pd = stripe % n;
+		if (phys < pd)
+			return phys+ n-1-pd;
+		else if (phys == pd)
+			return -1;
+		else return phys-pd-1;
+	}
+	return -2;
+}
+
+/*
+ * Sketch of copying RAID5 data from an 'n' device set (rfds[]) to an
+ * 'm' device set (wfds[]): for every 4096-byte block it reads one block
+ * from each rfds[] entry and, unless phys2log() reports parity (-1),
+ * seeks and writes that block to one wfds[] entry.
+ *
+ * NOTE(review): this does not compile as written -- the function has no
+ * declared return type, 'dstripe' is never declared, and log2phys(),
+ * error(), read(), write() and llseek() have no prototype in this file.
+ * NOTE(review): returns 0 after a short read but otherwise falls off the
+ * end without returning a value.
+ * NOTE(review): 'blocks' is len/4 although each block is 4096 bytes --
+ * presumably 'len' is given in KiB; confirm.
+ * NOTE(review): the results of llseek() and write() are ignored.
+ */
+raid5_extend(unsigned long len, int chunksize, int layout, int n, int m, int rfds[], int wfds[])
+{
+
+ static char buf[4096];
+
+ unsigned long blocks = len/4;
+ unsigned int blocksperchunk= chunksize/4096;
+
+ unsigned long b;
+
+ for (b=0; b<blocks; b++) {
+ unsigned long stripe = b / blocksperchunk;
+ unsigned int offset = b - (stripe*blocksperchunk);
+ unsigned long chunk = stripe * (n-1);
+ int src;
+ for (src=0; src<n; src++) {
+ int dnum, snum;
+ if (read(rfds[src], buf, sizeof(buf)) != sizeof(buf)) {
+ error();
+ return 0;
+ }
+
+ snum = phys2log(src, stripe, n, layout);
+
+ /* parity is not copied; only data chunks are */
+ if (snum == -1)
+ continue;
+ chunk = stripe*(n-1)+snum;
+
+ dstripe = chunk/(m-1);
+ /* NOTE(review): the offset inside the destination stripe is computed
+ * with the source 'stripe', not 'dstripe' -- looks suspicious; confirm. */
+ dnum = log2phys(chunk-(stripe*(m-1)), dstripe, m, layout);
+ llseek(wfds[dnum], dstripe*chunksize+(offset*4096), 0);
+ write(wfds[dnum], buf, sizeof(buf));
+ }
+ }
+}
diff --git a/raid6check.8 b/raid6check.8
new file mode 100644
index 0000000..8999ca8
--- /dev/null
+++ b/raid6check.8
@@ -0,0 +1,96 @@
+.\" -*- nroff -*-
+.\" Copyright Piergiorgio Sartor and others.
+.\" This program is free software; you can redistribute it and/or modify
+.\" it under the terms of the GNU General Public License as published by
+.\" the Free Software Foundation; either version 2 of the License, or
+.\" (at your option) any later version.
+.\" See file COPYING in distribution for details.
+.TH RAID6CHECK 8 "" v1.0.0
+.SH NAME
+raid6check \- check MD RAID6 device for errors
+.I aka
+Linux Software RAID
+
+.SH SYNOPSIS
+
+.BI raid6check " <raid6 device> <start stripe> <number of stripes>"
+
+.SH DESCRIPTION
+RAID6 devices in which one single component drive has errors can use
+the double parity in order to find out which component drive is affected.
+The "raid6check" tool checks, for each stripe, the double parity
+consistency, reports mismatches and, if possible, which
+component drive has the mismatch.
+Since it works at stripe level, it can report different drives with
+mismatches at different stripes.
+
+"raid6check" requires a non-degraded RAID6 MD device as first
+parameter, a starting stripe (usually 0) and the number of stripes
+to be checked.
+If this third parameter is also 0, it will check the array up to
+the end.
+
+"raid6check" will start printing information about the RAID6, then
+for each stripe, it will report the parity rotation status.
+In case of parity mismatches, "raid6check" reports, if possible,
+which component drive could be responsible. Otherwise it reports
+that it is not possible to find the component drive.
+
+If the given MD device is not a RAID6, "raid6check" will, of
+course, not continue.
+
+If the RAID6 MD device is degraded, "raid6check" will report
+an error and it will not proceed further.
+
+No write operations are performed on the array or the components.
+Furthermore, the checked array can be online and in use during
+the operation of "raid6check".
+
+.SH EXAMPLES
+
+.B " raid6check /dev/md0 0 0"
+.br
+This will check /dev/md0 from start to end.
+
+.B " raid6check /dev/md3 0 1"
+.br
+This will check the first stripe of /dev/md3.
+
+.B " raid6check /dev/md1 1000 0"
+.br
+This will check /dev/md1 from stripe 1000 up to the end.
+
+.B " raid6check /dev/md127 128 256"
+.br
+This will check 256 stripes of /dev/md127 starting from stripe 128.
+
+.B " raid6check /dev/md0 0 0 | grep -i error > md0_err.log"
+.br
+This will check /dev/md0 completely and create a log file only
+with errors, if any.
+
+.SH FILES
+
+"raid6check" directly uses the component drives as found in /dev.
+Furthermore, the sysfs interface is needed in order to find out
+the RAID6 parameters.
+
+.SH BUGS
+Negative parameters can lead to unexpected results.
+
+It is not clear what will happen if the RAID6 MD device gets
+degraded during the check.
+
+.PP
+The latest version of
+.I raid6check
+should always be available from
+.IP
+.B https://www.kernel.org/pub/linux/utils/raid/mdadm/
+.PP
+Related man pages:
+.PP
+.IR mdadm (8),
+.IR mdmon (8),
+.IR mdadm.conf (5),
+.IR md (4).
diff --git a/raid6check.c b/raid6check.c
new file mode 100644
index 0000000..a8e6005
--- /dev/null
+++ b/raid6check.c
@@ -0,0 +1,714 @@
+/*
+ * raid6check - extended consistency check for RAID-6
+ *
+ * Copyright (C) 2011 Piergiorgio Sartor
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Piergiorgio Sartor
+ * Based on "restripe.c" from "mdadm" codebase
+ */
+
+#include "mdadm.h"
+#include <stdint.h>
+#include <signal.h>
+#include <sys/mman.h>
+
+/* Mismatches are analysed in pages of CHECK_PAGE_SIZE (1 << 12 = 4096) bytes. */
+#define CHECK_PAGE_BITS (12)
+#define CHECK_PAGE_SIZE (1 << CHECK_PAGE_BITS)
+
+char const Name[] = "raid6check";
+
+/* What check_stripes() should do about a stripe besides reporting on it. */
+enum repair {
+ NO_REPAIR = 0, /* check and report only */
+ MANUAL_REPAIR, /* "repair" sub-command: rebuild two slots named by the user */
+ AUTO_REPAIR /* "autorepair" argument: rebuild pages whose bad block was identified */
+};
+
+/* Helpers shared with restripe.c (raid6_datap_recov is presumably defined
+ * there as well -- its definition is not visible here). */
+int geo_map(int block, unsigned long long stripe, int raid_disks,
+ int level, int layout);
+int is_ddf(int layout);
+void qsyndrome(uint8_t *p, uint8_t *q, uint8_t **sources, int disks, int size);
+void make_tables(void);
+void ensure_zero_has_size(int chunk_size);
+void raid6_datap_recov(int disks, size_t bytes, int faila, uint8_t **ptrs,
+ int neg_offset);
+void raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
+ uint8_t **ptrs, int neg_offset);
+void xor_blocks(char *target, char **sources, int disks, int size);
+
+/* Collect per stripe consistency information.
+ *
+ * For each of the chunk_size bytes, the stored parity (chunkP/chunkQ) is
+ * XOR-ed with the recomputed one (p/q) and results[] receives:
+ *   -255  P and Q both match
+ *   -1    only P differs
+ *   -2    only Q differs
+ *   0..254  both differ: raid6_gflog[Qx] - raid6_gflog[Px], plus 255 if negative
+ */
+void raid6_collect(int chunk_size, uint8_t *p, uint8_t *q,
+		   char *chunkP, char *chunkQ, int *results)
+{
+	extern uint8_t raid6_gflog[];
+	int pos;
+
+	for (pos = 0; pos < chunk_size; pos++) {
+		uint8_t delta_p = (uint8_t)chunkP[pos] ^ (uint8_t)p[pos];
+		uint8_t delta_q = (uint8_t)chunkQ[pos] ^ (uint8_t)q[pos];
+		int verdict;
+
+		if (delta_p == 0 && delta_q == 0) {
+			verdict = -255;
+		} else if (delta_q == 0) {
+			verdict = -1;
+		} else if (delta_p == 0) {
+			verdict = -2;
+		} else {
+			verdict = raid6_gflog[delta_q] - raid6_gflog[delta_p];
+			if (verdict < 0)
+				verdict += 255;
+		}
+		results[pos] = verdict;
+	}
+}
+
+/* Try to find out if a specific disk has problems in a CHECK_PAGE_SIZE page size.
+ *
+ * results[] holds one verdict per byte (see raid6_collect()).
+ * Returns:
+ *   -255    no byte of the page mismatched
+ *   -65535  the mismatches disagree with each other, or name an index
+ *           >= raid_disks, so no single block can be blamed
+ *   other   the one index every mismatching byte agrees on
+ *           (-1 = P, -2 = Q, >= 0 = data block)
+ */
+int raid6_stats_blk(int *results, int raid_disks)
+{
+	int i;
+	int curr_broken_disk = -255;
+	int prev_broken_disk = -255;
+	/* 0: nothing seen yet, 1: one candidate so far, 2: ambiguous */
+	int broken_status = 0;
+
+	for(i = 0; i < CHECK_PAGE_SIZE; i++) {
+
+		if(results[i] != -255)
+			curr_broken_disk = results[i];
+
+		if(curr_broken_disk >= raid_disks)
+			broken_status = 2;
+
+		switch(broken_status) {
+		case 0:
+			if(curr_broken_disk != -255) {
+				prev_broken_disk = curr_broken_disk;
+				broken_status = 1;
+			}
+			break;
+
+		case 1:
+			if(curr_broken_disk != prev_broken_disk)
+				broken_status = 2;
+			break;
+
+		case 2:
+		default:
+			curr_broken_disk = prev_broken_disk = -65535;
+			break;
+		}
+	}
+
+	/* A disagreement found on the very last byte switches to state 2
+	 * without passing through the reset above; never name a disk then. */
+	if (broken_status == 2)
+		return -65535;
+
+	return curr_broken_disk;
+}
+
+/* Collect disks status for a stripe in CHECK_PAGE_SIZE page size blocks:
+ * disk[k] receives the raid6_stats_blk() verdict for the k-th page of results[]. */
+void raid6_stats(int *disk, int *results, int raid_disks, int chunk_size)
+{
+	int page = 0;
+	int offset;
+
+	for (offset = 0; offset < chunk_size; offset += CHECK_PAGE_SIZE)
+		disk[page++] = raid6_stats_blk(results + offset, raid_disks);
+}
+
+/* Lock process memory, ignore SIGTERM/SIGINT/SIGQUIT (the previous handlers
+ * are saved in sig[0..2]) and write suspend_lo / suspend_hi through sysfs
+ * with values derived from 'start', 'chunk_size' and 'data_disks'.
+ *
+ * Returns 2 if mlockall() fails (signals and sysfs are then untouched),
+ * otherwise the OR of both sysfs_set_num() results multiplied by 256.
+ */
+int lock_stripe(struct mdinfo *info, unsigned long long start,
+		int chunk_size, int data_disks, sighandler_t *sig) {
+	unsigned long long lo = start * chunk_size * data_disks;
+	unsigned long long hi = (start + 1) * chunk_size * data_disks;
+	int status;
+
+	if (mlockall(MCL_CURRENT | MCL_FUTURE) != 0)
+		return 2;
+
+	sig[0] = signal(SIGTERM, SIG_IGN);
+	sig[1] = signal(SIGINT, SIG_IGN);
+	sig[2] = signal(SIGQUIT, SIG_IGN);
+
+	status = sysfs_set_num(info, NULL, "suspend_lo", lo);
+	status |= sysfs_set_num(info, NULL, "suspend_hi", hi);
+
+	return status * 256;
+}
+
+/* Counterpart of lock_stripe(): rewrite suspend_lo / suspend_hi, reinstall
+ * the signal handlers saved in sig[0..2] and unlock process memory.
+ *
+ * Returns 3 if munlockall() fails, otherwise the OR of the three
+ * sysfs_set_num() results multiplied by 256.
+ */
+int unlock_all_stripes(struct mdinfo *info, sighandler_t *sig) {
+	int status = sysfs_set_num(info, NULL, "suspend_lo", 0x7FFFFFFFFFFFFFFFULL);
+
+	status |= sysfs_set_num(info, NULL, "suspend_hi", 0);
+	status |= sysfs_set_num(info, NULL, "suspend_lo", 0);
+
+	signal(SIGQUIT, sig[2]);
+	signal(SIGINT, sig[1]);
+	signal(SIGTERM, sig[0]);
+
+	return munlockall() != 0 ? 3 : status * 256;
+}
+
+/* Autorepair
+ *
+ * disk[j] names the block blamed for page j of the stripe (-1 = P, -2 = Q,
+ * 0..syndrome_disks-1 = data; anything else means clean or ambiguous).
+ * For every page with a usable verdict the blamed block is rebuilt in
+ * memory from the other blocks and the page is written back to its device.
+ *
+ * Returns 0 on success (including "nothing to repair"), -1 as soon as a
+ * seek or a write fails; later pages are then not written.
+ */
+int autorepair(int *disk, unsigned long long start, int chunk_size,
+		char *name[], int raid_disks, int syndrome_disks, char **blocks_page,
+		char **blocks, uint8_t *p, int *block_index_for_slot,
+		int *source, unsigned long long *offsets)
+{
+	int i, j;
+	int pages = chunk_size >> CHECK_PAGE_BITS;
+	int pages_to_write_count = 0;
+	/* never declare a zero-length VLA when chunk_size < CHECK_PAGE_SIZE */
+	int page_to_write[pages > 0 ? pages : 1];
+
+	for(j = 0; j < pages; j++) {
+		page_to_write[j] = 0;
+
+		/* block_index_for_slot[] is only valid for -2..syndrome_disks-1 */
+		if (disk[j] < -2 || disk[j] >= syndrome_disks)
+			continue;
+		/* a negative entry is a zero-filled placeholder, not a device */
+		if (block_index_for_slot[disk[j]] < 0)
+			continue;
+
+		int slot = block_index_for_slot[disk[j]];
+		printf("Auto-repairing slot %d (%s)\n", slot, name[slot]);
+		pages_to_write_count++;
+		page_to_write[j] = 1;
+		for(i = -2; i < syndrome_disks; i++) {
+			blocks_page[i] = blocks[i] + j * CHECK_PAGE_SIZE;
+		}
+		if (disk[j] == -2) {
+			/* Q is bad: recompute it from the data blocks */
+			qsyndrome(p, (uint8_t*)blocks_page[-2],
+				  (uint8_t**)blocks_page,
+				  syndrome_disks, CHECK_PAGE_SIZE);
+		}
+		else {
+			/* P or one data block is bad: XOR of all the others,
+			 * with P standing in for the bad data block */
+			char *all_but_failed_blocks[syndrome_disks];
+			for(i = 0; i < syndrome_disks; i++) {
+				if (i == disk[j])
+					all_but_failed_blocks[i] = blocks_page[-1];
+				else
+					all_but_failed_blocks[i] = blocks_page[i];
+			}
+			xor_blocks(blocks_page[disk[j]],
+				   all_but_failed_blocks, syndrome_disks,
+				   CHECK_PAGE_SIZE);
+		}
+	}
+
+	if (pages_to_write_count == 0)
+		return 0;
+
+	for(j = 0; j < pages; j++) {
+		if (page_to_write[j] != 1)
+			continue;
+
+		int slot = block_index_for_slot[disk[j]];
+
+		/* Writing at a stale offset would corrupt the device, so a
+		 * failed seek must stop the repair. */
+		if (lseek64(source[slot],
+			    offsets[slot] + start * chunk_size + j * CHECK_PAGE_SIZE,
+			    SEEK_SET) < 0) {
+			fprintf(stderr, "lseek failed for slot %d\n", slot);
+			return -1;
+		}
+		if (write(source[slot],
+			  blocks[disk[j]] + j * CHECK_PAGE_SIZE,
+			  CHECK_PAGE_SIZE) != CHECK_PAGE_SIZE) {
+			fprintf(stderr, "Failed to write a full chunk.\n");
+			return -1;
+		}
+	}
+
+	return 0;
+}
+
+/* Manual repair
+ *
+ * Rebuild the two blocks of stripe 'start' identified by failed_slot1 and
+ * failed_slot2 (-1 = P, -2 = Q, >= 0 = data index) from the remaining
+ * blocks, then write both chunks back to their devices.
+ *
+ * Returns 0 on success, -1 if a seek fails, -2 if either write is short.
+ */
+int manual_repair(int chunk_size, int syndrome_disks,
+		  int failed_slot1, int failed_slot2,
+		  unsigned long long start, int *block_index_for_slot,
+		  char *name[], char **stripes, char **blocks, uint8_t *p,
+		  int *source, unsigned long long *offsets)
+{
+	/* raid-disk numbers of the two devices that will be rewritten */
+	const int dev1 = block_index_for_slot[failed_slot1];
+	const int dev2 = block_index_for_slot[failed_slot2];
+	const int q_failed = (failed_slot1 == -2 || failed_slot2 == -2);
+	const int p_failed = (failed_slot1 == -1 || failed_slot2 == -1);
+	int written1, written2;
+	int idx;
+
+	printf("Repairing stripe %llu\n", start);
+	printf("Assuming slots %d (%s) and %d (%s) are incorrect\n",
+	       dev1, name[dev1],
+	       dev2, name[dev2]);
+
+	if (q_failed) {
+		/* Q plus one other block: rebuild the other block by XOR,
+		 * then recompute Q from the now complete data. */
+		char *survivors[syndrome_disks];
+		int other = (failed_slot1 == -2) ? failed_slot2 : failed_slot1;
+
+		printf("Repairing D/P(%d) and Q\n", other);
+
+		for (idx = 0; idx < syndrome_disks; idx++)
+			survivors[idx] = (idx == other) ? blocks[-1] : blocks[idx];
+
+		xor_blocks(blocks[other], survivors, syndrome_disks, chunk_size);
+		qsyndrome(p, (uint8_t*)blocks[-2], (uint8_t**)blocks,
+			  syndrome_disks, chunk_size);
+	} else {
+		ensure_zero_has_size(chunk_size);
+		if (p_failed) {
+			int data_slot = (failed_slot1 == -1) ? failed_slot2 : failed_slot1;
+
+			printf("Repairing D(%d) and P\n", data_slot);
+			raid6_datap_recov(syndrome_disks+2, chunk_size,
+					  data_slot, (uint8_t**)blocks, 1);
+		} else {
+			printf("Repairing D and D\n");
+			raid6_2data_recov(syndrome_disks+2, chunk_size,
+					  failed_slot1, failed_slot2,
+					  (uint8_t**)blocks, 1);
+		}
+	}
+
+	if (lseek64(source[dev1],
+		    offsets[dev1] + start * chunk_size, SEEK_SET) < 0) {
+		fprintf(stderr, "lseek failed for failed_disk1\n");
+		return -1;
+	}
+	written1 = write(source[dev1], blocks[failed_slot1], chunk_size);
+
+	if (lseek64(source[dev2],
+		    offsets[dev2] + start * chunk_size, SEEK_SET) < 0) {
+		fprintf(stderr, "lseek failed for failed_disk2\n");
+		return -1;
+	}
+	written2 = write(source[dev2], blocks[failed_slot2], chunk_size);
+
+	if (written1 == chunk_size && written2 == chunk_size)
+		return 0;
+
+	fprintf(stderr, "Failed to write a complete chunk.\n");
+	return -2;
+}
+
+/* Check, and optionally repair, 'length' stripes beginning at stripe 'start'.
+ *
+ * For each stripe: lock_stripe(), read one chunk from every device in
+ * source[], recompute P and Q, print per-page mismatches to stdout, run
+ * the repair selected by 'repair', then unlock_all_stripes().
+ *
+ * Returns 0 when all stripes were processed.  Otherwise returns the first
+ * non-zero code from lock_stripe(), unlock_all_stripes(), autorepair() or
+ * manual_repair(), or -1 after a failed seek or a short read.
+ */
+int check_stripes(struct mdinfo *info, int *source, unsigned long long *offsets,
+		  int raid_disks, int chunk_size, int level, int layout,
+		  unsigned long long start, unsigned long long length, char *name[],
+		  enum repair repair, int failed_disk1, int failed_disk2)
+{
+	/* read the data and p and q blocks, and check we got them right */
+	int data_disks = raid_disks - 2;
+	int syndrome_disks = data_disks + is_ddf(layout) * 2;
+	char *stripe_buf;
+
+	/* stripes[] is indexed by raid_disk and holds chunks from each device */
+	char **stripes = xmalloc(raid_disks * sizeof(char*));
+
+	/* blocks[] is indexed by syndrome number and points to either one of the
+	 * chunks from 'stripes[]', or to a chunk of zeros. -1 and -2 are
+	 * P and Q */
+	char **blocks = xmalloc((syndrome_disks + 2) * sizeof(char*));
+
+	/* blocks_page[] is a temporary index to just one page of the chunks
+	 * that blocks[] points to. */
+	char **blocks_page = xmalloc((syndrome_disks + 2) * sizeof(char*));
+
+	/* block_index_for_slot[] provides the reverse mapping from blocks to stripes.
+	 * The index is a syndrome position, the content is a raid_disk number.
+	 * indices -1 and -2 work, and are P and Q disks */
+	int *block_index_for_slot = xmalloc((syndrome_disks+2) * sizeof(int));
+
+	/* 'p' and 'q' contain calculated P and Q, to be compared with
+	 * blocks[-1] and blocks[-2];
+	 */
+	uint8_t *p = xmalloc(chunk_size);
+	uint8_t *q = xmalloc(chunk_size);
+	char *zero = xmalloc(chunk_size);
+	int *results = xmalloc(chunk_size * sizeof(int));
+	sighandler_t *sig = xmalloc(3 * sizeof(sighandler_t));
+
+	int i, j;
+	int diskP, diskQ, diskD;
+	int err = 0;
+
+	extern int tables_ready;
+
+	if (!tables_ready)
+		make_tables();
+
+	if (posix_memalign((void**)&stripe_buf, 4096, raid_disks * chunk_size) != 0)
+		exit(4);
+	/* shift the bases so that indices -2 and -1 are addressable */
+	block_index_for_slot += 2;
+	blocks += 2;
+	blocks_page += 2;
+
+	memset(zero, 0, chunk_size);
+	for ( i = 0 ; i < raid_disks ; i++)
+		stripes[i] = stripe_buf + i * chunk_size;
+
+	while (length > 0) {
+		/* The syndrome number of the broken disk is recorded
+		 * in 'disk[]' which allows a different broken disk for
+		 * each page.
+		 */
+		int disk[chunk_size >> CHECK_PAGE_BITS];
+
+		err = lock_stripe(info, start, chunk_size, data_disks, sig);
+		if(err != 0) {
+			/* 2 means mlockall() failed and nothing was changed */
+			if (err != 2)
+				unlock_all_stripes(info, sig);
+			goto exitCheck;
+		}
+		for (i = 0 ; i < raid_disks ; i++) {
+			off64_t seek_res = lseek64(source[i], offsets[i] + start * chunk_size,
+						   SEEK_SET);
+			if (seek_res < 0) {
+				fprintf(stderr, "lseek to source %d failed\n", i);
+				unlock_all_stripes(info, sig);
+				err = -1;
+				goto exitCheck;
+			}
+			int read_res = read(source[i], stripes[i], chunk_size);
+			if (read_res < chunk_size) {
+				fprintf(stderr, "Failed to read complete chunk disk %d, aborting\n", i);
+				unlock_all_stripes(info, sig);
+				err = -1;
+				goto exitCheck;
+			}
+		}
+
+		diskP = geo_map(-1, start, raid_disks, level, layout);
+		block_index_for_slot[-1] = diskP;
+		blocks[-1] = stripes[diskP];
+
+		diskQ = geo_map(-2, start, raid_disks, level, layout);
+		block_index_for_slot[-2] = diskQ;
+		blocks[-2] = stripes[diskQ];
+
+		if (!is_ddf(layout)) {
+			/* The syndrome-order of disks starts immediately after 'Q',
+			 * but skips P */
+			diskD = diskQ;
+			for (i = 0 ; i < data_disks ; i++) {
+				diskD = diskD + 1;
+				if (diskD >= raid_disks)
+					diskD = 0;
+				if (diskD == diskP)
+					diskD += 1;
+				if (diskD >= raid_disks)
+					diskD = 0;
+				blocks[i] = stripes[diskD];
+				block_index_for_slot[i] = diskD;
+			}
+		} else {
+			/* The syndrome-order exactly follows raid-disk
+			 * numbers, with ZERO in place of P and Q
+			 */
+			for (i = 0 ; i < raid_disks; i++) {
+				if (i == diskP || i == diskQ) {
+					blocks[i] = zero;
+					block_index_for_slot[i] = -1;
+				} else {
+					blocks[i] = stripes[i];
+					block_index_for_slot[i] = i;
+				}
+			}
+		}
+
+		qsyndrome(p, q, (uint8_t**)blocks, syndrome_disks, chunk_size);
+
+		raid6_collect(chunk_size, p, q, stripes[diskP], stripes[diskQ], results);
+		/* The verdicts are syndrome positions, so the valid range is
+		 * bounded by syndrome_disks: block_index_for_slot[] has no
+		 * entries beyond syndrome_disks - 1. */
+		raid6_stats(disk, results, syndrome_disks, chunk_size);
+
+		for(j = 0; j < (chunk_size >> CHECK_PAGE_BITS); j++) {
+			int role = disk[j];
+			if (role >= -2) {
+				int slot = block_index_for_slot[role];
+				if (slot >= 0)
+					printf("Error detected at stripe %llu, page %d: possible failed disk slot %d: %d --> %s\n",
+					       start, j, role, slot, name[slot]);
+				else
+					printf("Error detected at stripe %llu, page %d: failed slot %d should be zeros\n",
+					       start, j, role);
+			} else if(disk[j] == -65535) {
+				printf("Error detected at stripe %llu, page %d: disk slot unknown\n", start, j);
+			}
+		}
+
+		if(repair == AUTO_REPAIR) {
+			err = autorepair(disk, start, chunk_size,
+					 name, raid_disks, syndrome_disks, blocks_page,
+					 blocks, p, block_index_for_slot,
+					 source, offsets);
+			if(err != 0) {
+				unlock_all_stripes(info, sig);
+				goto exitCheck;
+			}
+		}
+
+		if(repair == MANUAL_REPAIR) {
+			int failed_slot1 = -1, failed_slot2 = -1;
+			for (i = -2; i < syndrome_disks; i++) {
+				if (block_index_for_slot[i] == failed_disk1)
+					failed_slot1 = i;
+				if (block_index_for_slot[i] == failed_disk2)
+					failed_slot2 = i;
+			}
+			err = manual_repair(chunk_size, syndrome_disks,
+					    failed_slot1, failed_slot2,
+					    start, block_index_for_slot,
+					    name, stripes, blocks, p,
+					    source, offsets);
+			/* manual_repair() reports seek failures as -1 and short
+			 * writes as -2; neither may be ignored. */
+			if(err != 0) {
+				unlock_all_stripes(info, sig);
+				goto exitCheck;
+			}
+		}
+
+		err = unlock_all_stripes(info, sig);
+		if(err != 0) {
+			goto exitCheck;
+		}
+
+		length--;
+		start++;
+	}
+
+exitCheck:
+
+	free(stripe_buf);
+	free(stripes);
+	free(blocks-2);
+	free(blocks_page-2);
+	free(block_index_for_slot-2);
+	free(p);
+	free(q);
+	free(zero);
+	free(results);
+	free(sig);
+
+	return err;
+}
+
+/* Parse 'str' as an unsigned decimal number.
+ * When nothing can be parsed or trailing characters remain, *err is set to
+ * 'str' and 0 is returned; on success *err is left untouched.
+ */
+unsigned long long getnum(char *str, char **err)
+{
+	char *tail;
+	unsigned long long value = strtoull(str, &tail, 10);
+
+	if (tail != str && *tail == '\0')
+		return value;
+
+	*err = str;
+	return 0;
+}
+
+/* raid6check entry point.
+ *
+ *   raid6check md_device start_stripe length_stripes [autorepair]
+ *   raid6check md_device repair stripe failed_slot_1 failed_slot_2
+ *
+ * Exit status: 0 success, 1 usage error, 2 cannot open md device,
+ * 3 not RAID-6, 4 bad number or bad slot, 6 cannot open a component,
+ * 7 check_stripes() failed, 8 degraded array, 9 sysfs read failed.
+ */
+int main(int argc, char *argv[])
+{
+	/* md_device start length */
+	int *fds = NULL;
+	char **disk_name = NULL;
+	unsigned long long *offsets = NULL;
+	int raid_disks = 0;
+	int active_disks;
+	int chunk_size = 0;
+	int layout = -1;
+	int level = 6;
+	enum repair repair = NO_REPAIR;
+	int failed_disk1 = -1;
+	int failed_disk2 = -1;
+	unsigned long long start, length;
+	unsigned long long total_stripes;
+	int i;
+	int rv;
+	int mdfd = -1;
+	struct mdinfo *info = NULL, *comp = NULL;
+	char *err = NULL;
+	int exit_err = 0;
+	int close_flag = 0;
+	char *prg = strrchr(argv[0], '/');
+
+	if (prg == NULL)
+		prg = argv[0];
+	else
+		prg++;
+
+	if (argc < 4) {
+		fprintf(stderr, "Usage: %s md_device start_stripe length_stripes [autorepair]\n", prg);
+		fprintf(stderr, "   or: %s md_device repair stripe failed_slot_1 failed_slot_2\n", prg);
+		exit_err = 1;
+		goto exitHere;
+	}
+
+	mdfd = open(argv[1], O_RDONLY);
+	if(mdfd < 0) {
+		perror(argv[1]);
+		fprintf(stderr, "%s: cannot open %s\n", prg, argv[1]);
+		exit_err = 2;
+		goto exitHere;
+	}
+
+	info = sysfs_read(mdfd, NULL,
+			  GET_LEVEL|
+			  GET_LAYOUT|
+			  GET_DISKS|
+			  GET_STATE |
+			  GET_COMPONENT|
+			  GET_CHUNK|
+			  GET_DEVS|
+			  GET_OFFSET|
+			  GET_SIZE);
+
+	if(info == NULL) {
+		fprintf(stderr, "%s: Error reading sysfs information of %s\n", prg, argv[1]);
+		exit_err = 9;
+		goto exitHere;
+	}
+
+	if(info->array.level != level) {
+		fprintf(stderr, "%s: %s not a RAID-6\n", prg, argv[1]);
+		exit_err = 3;
+		goto exitHere;
+	}
+
+	if(info->array.failed_disks > 0) {
+		fprintf(stderr, "%s: %s degraded array\n", prg, argv[1]);
+		exit_err = 8;
+		goto exitHere;
+	}
+
+	printf("layout: %d\n", info->array.layout);
+	printf("disks: %d\n", info->array.raid_disks);
+	printf("component size: %llu\n", info->component_size * 512);
+	printf("total stripes: %llu\n", (info->component_size * 512) / info->array.chunk_size);
+	printf("chunk size: %d\n", info->array.chunk_size);
+	printf("\n");
+
+	/* Stop at the end of the list even if fewer active devices than
+	 * raid_disks are listed, instead of dereferencing NULL. */
+	comp = info->devs;
+	for(i = 0, active_disks = 0;
+	    comp != NULL && active_disks < info->array.raid_disks; i++) {
+		printf("disk: %d - offset: %llu - size: %llu - name: %s - slot: %d\n",
+		       i, comp->data_offset * 512, comp->component_size * 512,
+		       map_dev(comp->disk.major, comp->disk.minor, 0),
+		       comp->disk.raid_disk);
+		if(comp->disk.raid_disk >= 0)
+			active_disks++;
+		comp = comp->next;
+	}
+	printf("\n");
+
+	close(mdfd);
+	mdfd = -1;
+
+	raid_disks = info->array.raid_disks;
+	chunk_size = info->array.chunk_size;
+	layout = info->array.layout;
+	if (strcmp(argv[2], "repair")==0) {
+		unsigned long long slot1, slot2;
+
+		if (argc < 6) {
+			fprintf(stderr, "For repair mode, call %s md_device repair stripe failed_slot_1 failed_slot_2\n", prg);
+			exit_err = 1;
+			goto exitHere;
+		}
+		repair = MANUAL_REPAIR;
+		start = getnum(argv[3], &err);
+		length = 1;
+		slot1 = getnum(argv[4], &err);
+		slot2 = getnum(argv[5], &err);
+
+		/* Compare before narrowing to int: a negative or huge number
+		 * must not wrap into something that looks like a valid slot. */
+		if(slot1 >= (unsigned long long)info->array.raid_disks) {
+			fprintf(stderr, "%s: failed_slot_1 index is higher than number of devices in raid\n", prg);
+			exit_err = 4;
+			goto exitHere;
+		}
+		if(slot2 >= (unsigned long long)info->array.raid_disks) {
+			fprintf(stderr, "%s: failed_slot_2 index is higher than number of devices in raid\n", prg);
+			exit_err = 4;
+			goto exitHere;
+		}
+		if(slot1 == slot2) {
+			fprintf(stderr, "%s: failed_slot_1 and failed_slot_2 are the same\n", prg);
+			exit_err = 4;
+			goto exitHere;
+		}
+		failed_disk1 = (int)slot1;
+		failed_disk2 = (int)slot2;
+	}
+	else {
+		start = getnum(argv[2], &err);
+		length = getnum(argv[3], &err);
+		if (argc >= 5 && strcmp(argv[4], "autorepair")==0)
+			repair = AUTO_REPAIR;
+	}
+
+	if (err) {
+		fprintf(stderr, "%s: Bad number: %s\n", prg, err);
+		exit_err = 4;
+		goto exitHere;
+	}
+
+	total_stripes = (info->component_size * 512) / chunk_size;
+
+	if(start > total_stripes) {
+		start = total_stripes;
+		fprintf(stderr, "%s: start beyond disks size\n", prg);
+	}
+
+	/* start <= total_stripes here, so the subtraction cannot wrap and
+	 * the comparison cannot overflow the way length + start could. */
+	if((length == 0) || (length > total_stripes - start)) {
+		length = total_stripes - start;
+	}
+
+	disk_name = xmalloc(raid_disks * sizeof(*disk_name));
+	fds = xmalloc(raid_disks * sizeof(*fds));
+	offsets = xcalloc(raid_disks, sizeof(*offsets));
+
+	for(i=0; i<raid_disks; i++) {
+		fds[i] = -1;
+	}
+	close_flag = 1;
+
+	active_disks = 0;
+	for (comp = info->devs;
+	     comp != NULL && active_disks < raid_disks; comp = comp->next) {
+		int disk_slot = comp->disk.raid_disk;
+
+		/* skip spares, and slots that would index past the arrays */
+		if(disk_slot < 0 || disk_slot >= raid_disks)
+			continue;
+
+		disk_name[disk_slot] = map_dev(comp->disk.major, comp->disk.minor, 0);
+		offsets[disk_slot] = comp->data_offset * 512;
+		fds[disk_slot] = open(disk_name[disk_slot], O_RDWR | O_DIRECT);
+		if (fds[disk_slot] < 0) {
+			perror(disk_name[disk_slot]);
+			fprintf(stderr,"%s: cannot open %s\n", prg, disk_name[disk_slot]);
+			exit_err = 6;
+			goto exitHere;
+		}
+		active_disks++;
+	}
+	if (active_disks < raid_disks) {
+		fprintf(stderr, "%s: only %d of %d component devices found\n",
+			prg, active_disks, raid_disks);
+		exit_err = 6;
+		goto exitHere;
+	}
+
+	rv = check_stripes(info, fds, offsets,
+			   raid_disks, chunk_size, level, layout,
+			   start, length, disk_name, repair, failed_disk1, failed_disk2);
+	if (rv != 0) {
+		fprintf(stderr, "%s: check_stripes returned %d\n", prg, rv);
+		exit_err = 7;
+		goto exitHere;
+	}
+
+exitHere:
+
+	if (mdfd >= 0)
+		close(mdfd);
+
+	if (close_flag)
+		for(i = 0; i < raid_disks; i++)
+			if (fds[i] >= 0)
+				close(fds[i]);
+
+	free(disk_name);
+	free(fds);
+	free(offsets);
+
+	exit(exit_err);
+}
diff --git a/restripe.c b/restripe.c
new file mode 100644
index 0000000..a7a7229
--- /dev/null
+++ b/restripe.c
@@ -0,0 +1,1038 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2006-2009 Neil Brown <neilb@suse.de>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email: <neilb@suse.de>
+ */
+
+#include "mdadm.h"
+#include <stdint.h>
+
+/* To restripe, we read from old geometry to a buffer, and
+ * read from buffer to new geometry.
+ * When reading, we might have missing devices and so could need
+ * to reconstruct.
+ * When writing, we need to create correct parity and Q.
+ *
+ */
+
+/* Map a logical block of one stripe to the raid-disk that stores it.
+ * The layout is selected by the combined key level*100 + layout.
+ * Returns -1 when that combination is not handled below.
+ */
+int geo_map(int block, unsigned long long stripe, int raid_disks,
+ int level, int layout)
+{
+ /* On the given stripe, find which disk in the array will have
+ * block numbered 'block'.
+ * '-1' means the parity block.
+ * '-2' means the Q syndrome.
+ */
+ int pd;
+
+ /* layout is not relevant for raid0 and raid4 */
+ if ((level == 0) ||
+ (level == 4))
+ layout = 0;
+
+ switch(level*100 + layout) {
+ case 000:
+ case 400:
+ case 500 + ALGORITHM_PARITY_N:
+ /* raid 4 isn't messed around by parity blocks */
+ if (block == -1)
+ return raid_disks-1; /* parity block */
+ return block;
+ case 500 + ALGORITHM_LEFT_ASYMMETRIC:
+ pd = (raid_disks-1) - stripe % raid_disks;
+ if (block == -1)
+ return pd;
+ if (block >= pd)
+ block++;
+ return block;
+
+ case 500 + ALGORITHM_RIGHT_ASYMMETRIC:
+ pd = stripe % raid_disks;
+ if (block == -1)
+ return pd;
+ if (block >= pd)
+ block++;
+ return block;
+
+ case 500 + ALGORITHM_LEFT_SYMMETRIC:
+ pd = (raid_disks - 1) - stripe % raid_disks;
+ if (block == -1)
+ return pd;
+ return (pd + 1 + block) % raid_disks;
+
+ case 500 + ALGORITHM_RIGHT_SYMMETRIC:
+ pd = stripe % raid_disks;
+ if (block == -1)
+ return pd;
+ return (pd + 1 + block) % raid_disks;
+
+ case 500 + ALGORITHM_PARITY_0:
+ return block + 1;
+
+ /* The *_6 cases keep Q on the last disk and apply the matching
+ * raid5 formula to the remaining raid_disks-1 disks. */
+ case 600 + ALGORITHM_PARITY_N_6:
+ if (block == -2)
+ return raid_disks - 1;
+ if (block == -1)
+ return raid_disks - 2; /* parity block */
+ return block;
+ case 600 + ALGORITHM_LEFT_ASYMMETRIC_6:
+ if (block == -2)
+ return raid_disks - 1;
+ raid_disks--;
+ pd = (raid_disks-1) - stripe % raid_disks;
+ if (block == -1)
+ return pd;
+ if (block >= pd)
+ block++;
+ return block;
+
+ case 600 + ALGORITHM_RIGHT_ASYMMETRIC_6:
+ if (block == -2)
+ return raid_disks - 1;
+ raid_disks--;
+ pd = stripe % raid_disks;
+ if (block == -1)
+ return pd;
+ if (block >= pd)
+ block++;
+ return block;
+
+ case 600 + ALGORITHM_LEFT_SYMMETRIC_6:
+ if (block == -2)
+ return raid_disks - 1;
+ raid_disks--;
+ pd = (raid_disks - 1) - stripe % raid_disks;
+ if (block == -1)
+ return pd;
+ return (pd + 1 + block) % raid_disks;
+
+ case 600 + ALGORITHM_RIGHT_SYMMETRIC_6:
+ if (block == -2)
+ return raid_disks - 1;
+ raid_disks--;
+ pd = stripe % raid_disks;
+ if (block == -1)
+ return pd;
+ return (pd + 1 + block) % raid_disks;
+
+ case 600 + ALGORITHM_PARITY_0_6:
+ if (block == -2)
+ return raid_disks - 1;
+ return block + 1;
+
+ case 600 + ALGORITHM_PARITY_0:
+ if (block == -1)
+ return 0;
+ if (block == -2)
+ return 1;
+ return block + 2;
+
+ case 600 + ALGORITHM_LEFT_ASYMMETRIC:
+ pd = raid_disks - 1 - (stripe % raid_disks);
+ if (block == -1)
+ return pd;
+ if (block == -2)
+ return (pd+1) % raid_disks;
+ /* P on the last disk means Q wrapped to disk 0 */
+ if (pd == raid_disks - 1)
+ return block+1;
+ if (block >= pd)
+ return block+2;
+ return block;
+
+ case 600 + ALGORITHM_ROTATING_ZERO_RESTART:
+ /* Different order for calculating Q, otherwise same as ... */
+ case 600 + ALGORITHM_RIGHT_ASYMMETRIC:
+ pd = stripe % raid_disks;
+ if (block == -1)
+ return pd;
+ if (block == -2)
+ return (pd+1) % raid_disks;
+ /* P on the last disk means Q wrapped to disk 0 */
+ if (pd == raid_disks - 1)
+ return block+1;
+ if (block >= pd)
+ return block+2;
+ return block;
+
+ case 600 + ALGORITHM_LEFT_SYMMETRIC:
+ pd = raid_disks - 1 - (stripe % raid_disks);
+ if (block == -1)
+ return pd;
+ if (block == -2)
+ return (pd+1) % raid_disks;
+ return (pd + 2 + block) % raid_disks;
+
+ case 600 + ALGORITHM_RIGHT_SYMMETRIC:
+ pd = stripe % raid_disks;
+ if (block == -1)
+ return pd;
+ if (block == -2)
+ return (pd+1) % raid_disks;
+ return (pd + 2 + block) % raid_disks;
+
+ case 600 + ALGORITHM_ROTATING_N_RESTART:
+ /* Same as left_asymmetric, but first stripe is
+ * D D D P Q rather than
+ * Q D D D P
+ */
+ pd = raid_disks - 1 - ((stripe + 1) % raid_disks);
+ if (block == -1)
+ return pd;
+ if (block == -2)
+ return (pd+1) % raid_disks;
+ if (pd == raid_disks - 1)
+ return block+1;
+ if (block >= pd)
+ return block+2;
+ return block;
+
+ case 600 + ALGORITHM_ROTATING_N_CONTINUE:
+ /* Same as left_symmetric but Q is before P */
+ pd = raid_disks - 1 - (stripe % raid_disks);
+ if (block == -1)
+ return pd;
+ if (block == -2)
+ return (pd+raid_disks-1) % raid_disks;
+ return (pd + 1 + block) % raid_disks;
+ }
+ /* unknown level/layout combination */
+ return -1;
+}
+
+/* Return 1 when 'layout' is one of the three ALGORITHM_ROTATING_* layouts,
+ * 0 for every other value. */
+int is_ddf(int layout)
+{
+	if (layout == ALGORITHM_ROTATING_N_CONTINUE ||
+	    layout == ALGORITHM_ROTATING_N_RESTART ||
+	    layout == ALGORITHM_ROTATING_ZERO_RESTART)
+		return 1;
+
+	return 0;
+}
+
+/* Store in target[0..size) the byte-wise XOR of the 'disks' buffers listed
+ * in sources[].  With disks == 0 the target is filled with zeros. */
+void xor_blocks(char *target, char **sources, int disks, int size)
+{
+	int pos;
+
+	/* Amazingly inefficient... */
+	for (pos = 0; pos < size; pos++) {
+		char acc = 0;
+		int d = 0;
+
+		while (d < disks) {
+			acc ^= sources[d][pos];
+			d++;
+		}
+		target[pos] = acc;
+	}
+}
+
+/* Compute, byte by byte over 'size' bytes, the XOR parity (into p[]) and the
+ * Q value (into q[]) of the 'disks' buffers in sources[].  Q is accumulated
+ * from the last source down to the first, doubling the running value
+ * (shift left, XOR 0x1d when the top bit was set) before each new source. */
+void qsyndrome(uint8_t *p, uint8_t *q, uint8_t **sources, int disks, int size)
+{
+	int pos, src;
+
+	for (pos = 0; pos < size; pos++) {
+		uint8_t parity = sources[disks-1][pos];
+		uint8_t syndrome = parity;
+
+		for (src = disks - 2; src >= 0; src--) {
+			uint8_t data = sources[src][pos];
+			uint8_t doubled = (syndrome << 1) & 0xff;
+
+			if (syndrome & 0x80)
+				doubled ^= 0x1d;
+
+			parity ^= data;
+			syndrome = doubled ^ data;
+		}
+		p[pos] = parity;
+		q[pos] = syndrome;
+	}
+}
+
+/*
+ * The following was taken from linux/drivers/md/mktables.c, and modified
+ * to create in-memory tables rather than C code
+ */
+/* Shift-and-add product of a and b; each doubling of 'a' is reduced by
+ * XOR-ing 0x1d when its top bit was set. */
+static uint8_t gfmul(uint8_t a, uint8_t b)
+{
+	uint8_t product = 0;
+
+	for (; b != 0; b >>= 1) {
+		if (b & 1)
+			product ^= a;
+		a = (a << 1) ^ (a & 0x80 ? 0x1d : 0);
+	}
+
+	return product;
+}
+
+/* Raise 'a' to the power 'b' by square-and-multiply using gfmul().
+ * The exponent is first reduced modulo 255 into the range 0..254. */
+static uint8_t gfpow(uint8_t a, int b)
+{
+	uint8_t result = 1;
+	int exponent = b % 255;
+
+	if (exponent < 0)
+		exponent += 255;
+
+	for (; exponent != 0; exponent >>= 1) {
+		if (exponent & 1)
+			result = gfmul(result, a);
+		a = gfmul(a, a);
+	}
+
+	return result;
+}
+
+/* Lookup tables filled in by make_tables(); tables_ready is set to 1 once
+ * they are valid. */
+int tables_ready = 0;
+uint8_t raid6_gfmul[256][256];
+uint8_t raid6_gfexp[256];
+uint8_t raid6_gfinv[256];
+uint8_t raid6_gfexi[256];
+uint8_t raid6_gflog[256];
+uint8_t raid6_gfilog[256];
+
+/* Populate all of the tables above and mark them ready. */
+void make_tables(void)
+{
+	int row, col, n;
+	uint8_t power;
+	uint32_t value, exponent;
+
+	/* Compute multiplication table */
+	for (row = 0; row < 256; row++) {
+		for (col = 0; col < 256; col++)
+			raid6_gfmul[row][col] = gfmul(row, col);
+	}
+
+	/* Compute power-of-2 table (exponent) */
+	power = 1;
+	for (n = 0; n < 256; n++) {
+		raid6_gfexp[n] = power;
+		power = gfmul(power, 2);
+		if (power == 1)
+			power = 0;	/* For entry 255, not a real entry */
+	}
+
+	/* Compute inverse table x^-1 == x^254 */
+	for (n = 0; n < 256; n++)
+		raid6_gfinv[n] = gfpow(n, 254);
+
+	/* Compute inv(2^x + 1) (exponent-xor-inverse) table */
+	for (n = 0; n < 256; n++)
+		raid6_gfexi[n] = raid6_gfinv[raid6_gfexp[n] ^ 1];
+
+	/* Compute log and inverse log */
+	/* Modified code from:
+	 * https://web.eecs.utk.edu/~plank/plank/papers/CS-96-332.html
+	 */
+	raid6_gflog[0] = 0;
+	raid6_gfilog[255] = 0;
+
+	value = 1;
+	for (exponent = 0; exponent < 255; exponent++) {
+		raid6_gflog[value] = (uint8_t) exponent;
+		raid6_gfilog[exponent] = (uint8_t) value;
+		value <<= 1;
+		if (value & 256)
+			value ^= 0x11d;	/* same constant as octal 0435 */
+	}
+
+	tables_ready = 1;
+}
+
+uint8_t *zero;
+int zero_size;
+/* Make the shared 'zero' buffer at least chunk_size bytes, re-creating it via xcalloc() if needed */
+void ensure_zero_has_size(int chunk_size)
+{
+	if (zero != NULL && chunk_size <= zero_size)
+		return;
+
+	free(zero);
+	zero = xcalloc(1, chunk_size);
+	zero_size = chunk_size;
+}
+
+/* Following was taken from linux/drivers/md/raid6recov.c */
+
+/* Recover two failed data blocks (indices faila and failb in ptrs) in place, using P and Q. */
+/* Reads the global 'zero' block and the raid6_gf* tables; nothing here sets them up. */
+void raid6_2data_recov(int disks, size_t bytes, int faila, int failb,
+ uint8_t **ptrs, int neg_offset)
+{
+ uint8_t *p, *q, *dp, *dq;
+ uint8_t px, qx, db;
+ const uint8_t *pbmul; /* P multiplier table for B data */
+ const uint8_t *qmul; /* Q multiplier table (for both) */
+ /* Order the pair so faila <= failb; raid6_gfexi[] is indexed by failb-faila below */
+ if (faila > failb) {
+ int t = faila;
+ faila = failb;
+ failb = t;
+ }
+
+ if (neg_offset) { /* P and Q sit just before ptrs[0] */
+ p = ptrs[-1];
+ q = ptrs[-2];
+ } else { /* P and Q are the last two entries */
+ p = ptrs[disks-2];
+ q = ptrs[disks-1];
+ }
+
+ /* Compute syndrome with zero for the missing data pages
+ Use the dead data pages as temporary storage for
+ delta p and delta q */
+ dp = ptrs[faila];
+ ptrs[faila] = zero;
+ dq = ptrs[failb];
+ ptrs[failb] = zero;
+
+ qsyndrome(dp, dq, ptrs, disks-2, bytes); /* dp/dq now hold P/Q of the surviving data only */
+
+ /* Restore pointer table */
+ ptrs[faila] = dp;
+ ptrs[failb] = dq;
+
+ /* Now, pick the proper data tables */
+ pbmul = raid6_gfmul[raid6_gfexi[failb-faila]];
+ qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]]];
+
+ /* Now do it... */
+ while ( bytes-- ) {
+ px = *p ^ *dp; /* stored P xor partial P */
+ qx = qmul[*q ^ *dq]; /* (stored Q xor partial Q), scaled by qmul */
+ *dq++ = db = pbmul[px] ^ qx; /* Reconstructed B */
+ *dp++ = db ^ px; /* Reconstructed A */
+ p++; q++;
+ }
+}
+
+/* Recover failure of one data block (index faila in ptrs) plus the P block, both rebuilt in place */
+void raid6_datap_recov(int disks, size_t bytes, int faila, uint8_t **ptrs,
+ int neg_offset)
+{
+ uint8_t *p, *q, *dq;
+ const uint8_t *qmul; /* Q multiplier table */
+ /* P/Q are ptrs[-1]/ptrs[-2] when neg_offset is set, else the last two entries of ptrs */
+ if (neg_offset) {
+ p = ptrs[-1];
+ q = ptrs[-2];
+ } else {
+ p = ptrs[disks-2];
+ q = ptrs[disks-1];
+ }
+
+ /* Compute syndrome with zero for the missing data page
+ Use the dead data page as temporary storage for delta q */
+ dq = ptrs[faila];
+ ptrs[faila] = zero;
+
+ qsyndrome(p, dq, ptrs, disks-2, bytes); /* p = XOR of surviving data, dq = their Q */
+
+ /* Restore pointer table */
+ ptrs[faila] = dq;
+
+ /* Now, pick the proper data tables */
+ qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]]];
+
+ /* Now do it... */
+ while ( bytes-- ) {
+ *p++ ^= *dq = qmul[*q ^ *dq]; /* dq gets the rebuilt data byte, which is then XORed into p */
+ q++; dq++;
+ }
+}
+
+/* Try to find out if a specific disk has a problem.
+ * p/q are freshly computed P and Q, stripes[] is what was read from each disk.
+ * Returns the one disk every mismatching byte points at, -1 if nothing
+ * mismatches, -2 if the bytes disagree or point outside the array. */
+int raid6_check_disks(int data_disks, int start, int chunk_size,
+		      int level, int layout, int diskP, int diskQ,
+		      uint8_t *p, uint8_t *q, char **stripes)
+{
+	int i;
+	int data_id, diskD;
+	uint8_t Px, Qx;
+	int curr_broken_disk = -1;
+	int prev_broken_disk = -1;
+	int broken_status = 0;	/* 0: clean so far, 1: one suspect, 2: ambiguous */
+
+	for(i = 0; i < chunk_size; i++) {
+		Px = (uint8_t)stripes[diskP][i] ^ (uint8_t)p[i];
+		Qx = (uint8_t)stripes[diskQ][i] ^ (uint8_t)q[i];
+
+		if((Px != 0) && (Qx == 0))
+			curr_broken_disk = diskP;
+		if((Px == 0) && (Qx != 0))
+			curr_broken_disk = diskQ;
+		if((Px != 0) && (Qx != 0)) {
+			/* log(Qx) - log(Px), mod 255, is taken as the bad data block */
+			data_id = (raid6_gflog[Qx] - raid6_gflog[Px]);
+			if(data_id < 0) data_id += 255;
+			diskD = geo_map(data_id, start/chunk_size,
+					data_disks + 2, level, layout);
+			curr_broken_disk = diskD;
+		}
+		if((Px == 0) && (Qx == 0))
+			curr_broken_disk = prev_broken_disk;
+
+		if(curr_broken_disk >= data_disks + 2)
+			broken_status = 2;
+		switch(broken_status) {
+		case 0:
+			if(curr_broken_disk != -1) {
+				prev_broken_disk = curr_broken_disk;
+				broken_status = 1;
+			}
+			break;
+		case 1:
+			if(curr_broken_disk == prev_broken_disk)
+				break;
+			/* A second suspect: record "unknown" right away, so the
+			 * result is -2 even when this is the last byte checked. */
+			broken_status = 2;
+			/* fallthrough */
+		case 2:
+		default:
+			curr_broken_disk = prev_broken_disk = -2;
+			break;
+		}
+	}
+	return curr_broken_disk;
+}
+
+/*******************************************************************************
+ * Function: save_stripes
+ * Description:
+ * Function reads data (only data without P and Q) from array and writes
+ * it to buf and optionally to backup files
+ * Parameters:
+ * source : A list of 'fds' of the active disks.
+ * Some may be absent
+ * offsets : A list of offsets on disk belonging
+ * to the array [bytes]
+ * raid_disks : geometry: number of disks in the array
+ * chunk_size : geometry: chunk size [bytes]
+ * level : geometry: RAID level
+ * layout : geometry: layout
+ * nwrites : number of backup files
+ * dest : A list of 'fds' for mirrored targets
+ * (e.g. backup files). They are already seeked to right
+ * (write) location. If NULL, data will be written
+ * to the buf only
+ * start : start address of data to read (must be stripe-aligned)
+ * [bytes]
+ * length : length of data to read (must be stripe-aligned)
+ * [bytes]
+ * buf : buffer for data. It is large enough to hold
+ * one stripe. It is stripe aligned
+ * Returns:
+ * 0 : success
+ * -1 : fail (too many failed blocks in a stripe, or a short write); a misaligned length calls abort()
+ ******************************************************************************/
+int save_stripes(int *source, unsigned long long *offsets,
+ int raid_disks, int chunk_size, int level, int layout,
+ int nwrites, int *dest,
+ unsigned long long start, unsigned long long length,
+ char *buf)
+{
+ int len;
+ int data_disks = raid_disks - (level == 0 ? 0 : level <=5 ? 1 : 2);
+ int disk;
+ int i;
+ unsigned long long length_test;
+ /* The GF tables and the zero block are only needed by the RAID6 recovery branch below. */
+ if (!tables_ready)
+ make_tables();
+ ensure_zero_has_size(chunk_size);
+
+ len = data_disks * chunk_size;
+ length_test = length / len;
+ length_test *= len;
+ /* length must be a whole number of data stripes of len bytes each. */
+ if (length != length_test) {
+ dprintf("Error: save_stripes(): Data are not alligned. EXIT\n");
+ dprintf("\tArea for saving stripes (length) = %llu\n", length);
+ dprintf("\tWork step (len) = %i\n", len);
+ dprintf("\tExpected save area (length_test) = %llu\n",
+ length_test);
+ abort();
+ }
+
+ while (length > 0) {
+ int failed = 0;
+ int fdisk[3], fblock[3]; /* per failed read: device number, and block index within buf */
+ for (disk = 0; disk < raid_disks ; disk++) {
+ unsigned long long offset;
+ int dnum;
+
+ offset = (start/chunk_size/data_disks)*chunk_size;
+ dnum = geo_map(disk < data_disks ? disk : data_disks - disk - 1, /* blocks past the data become -1 (P) and -2 (Q) */
+ start/chunk_size/data_disks,
+ raid_disks, level, layout);
+ if (dnum < 0) abort();
+ if (source[dnum] < 0 ||
+ lseek64(source[dnum],
+ offsets[dnum] + offset, 0) < 0 ||
+ read(source[dnum], buf+disk * chunk_size,
+ chunk_size) != chunk_size) {
+ if (failed <= 2) {
+ fdisk[failed] = dnum;
+ fblock[failed] = disk;
+ failed++;
+ }
+ }
+ }
+ if (failed == 0 || fblock[0] >= data_disks)
+ /* all data disks are good */
+ ;
+ else if (failed == 1 || fblock[1] >= data_disks+1) {
+ /* one failed data disk and good parity */
+ char *bufs[data_disks];
+ for (i=0; i < data_disks; i++)
+ if (fblock[0] == i)
+ bufs[i] = buf + data_disks*chunk_size; /* the P block stands in for the missing one */
+ else
+ bufs[i] = buf + i*chunk_size;
+
+ xor_blocks(buf + fblock[0]*chunk_size,
+ bufs, data_disks, chunk_size);
+ } else if (failed > 2 || level != 6)
+ /* too much failure */
+ return -1;
+ else {
+ /* RAID6 computations needed. */
+ uint8_t *bufs[data_disks+4];
+ int qdisk;
+ int syndrome_disks;
+ disk = geo_map(-1, start/chunk_size/data_disks,
+ raid_disks, level, layout);
+ qdisk = geo_map(-2, start/chunk_size/data_disks,
+ raid_disks, level, layout);
+ if (is_ddf(layout)) {
+ /* q over 'raid_disks' blocks, in device order.
+ * 'p' and 'q' get to be all zero
+ */
+ for (i = 0; i < raid_disks; i++)
+ bufs[i] = zero;
+ for (i = 0; i < data_disks; i++) {
+ int dnum = geo_map(i,
+ start/chunk_size/data_disks,
+ raid_disks, level, layout);
+ int snum;
+ /* i is the logical block number, so is index to 'buf'.
+ * dnum is physical disk number
+ * and thus the syndrome number.
+ */
+ snum = dnum;
+ bufs[snum] = (uint8_t*)buf + chunk_size * i;
+ }
+ syndrome_disks = raid_disks;
+ } else {
+ /* for md, q is over 'data_disks' blocks,
+ * starting immediately after 'q'
+ * Note that for the '_6' variety, the p block
+ * makes a hole that we need to be careful of.
+ */
+ int j;
+ int snum = 0;
+ for (j = 0; j < raid_disks; j++) {
+ int dnum = (qdisk + 1 + j) % raid_disks;
+ if (dnum == disk || dnum == qdisk)
+ continue;
+ for (i = 0; i < data_disks; i++)
+ if (geo_map(i,
+ start/chunk_size/data_disks,
+ raid_disks, level, layout) == dnum)
+ break;
+ /* i is the logical block number, so is index to 'buf'.
+ * dnum is physical disk number
+ * snum is syndrome disk for which 0 is immediately after Q
+ */
+ bufs[snum] = (uint8_t*)buf + chunk_size * i;
+
+ if (fblock[0] == i)
+ fdisk[0] = snum;
+ if (fblock[1] == i)
+ fdisk[1] = snum;
+ snum++;
+ }
+
+ syndrome_disks = data_disks;
+ }
+
+ /* Place P and Q blocks at end of bufs */
+ bufs[syndrome_disks] = (uint8_t*)buf + chunk_size * data_disks;
+ bufs[syndrome_disks+1] = (uint8_t*)buf + chunk_size * (data_disks+1);
+
+ if (fblock[1] == data_disks)
+ /* One data failed, and parity failed */
+ raid6_datap_recov(syndrome_disks+2, chunk_size,
+ fdisk[0], bufs, 0);
+ else {
+ /* Two data blocks failed, P,Q OK */
+ raid6_2data_recov(syndrome_disks+2, chunk_size,
+ fdisk[0], fdisk[1], bufs, 0);
+ }
+ }
+ if (dest) {
+ for (i = 0; i < nwrites; i++)
+ if (write(dest[i], buf, len) != len)
+ return -1;
+ } else {
+ /* build next stripe in buffer */
+ buf += len;
+ }
+ length -= len;
+ start += len;
+ }
+ return 0;
+}
+
+/* Restore data:
+ * We are given:
+ * A list of 'fds' of the active disks. Some may be '-1' for not-available.
+ * A geometry: raid_disks, chunk_size, level, layout
+ * An 'fd' to read from at read_offset, used only when src_buf is NULL; otherwise data comes from src_buf + read_offset.
+ * A start and length.
+ * The length must be a multiple of the stripe size.
+ * Returns 0 on success, -1 on a seek/read/write failure, -2 if a buffer is missing, -3 if length ends mid-stripe.
+ * We build a full stripe in memory and then write it out.
+ * We assume that there are enough working devices.
+ */
+int restore_stripes(int *dest, unsigned long long *offsets,
+ int raid_disks, int chunk_size, int level, int layout,
+ int source, unsigned long long read_offset,
+ unsigned long long start, unsigned long long length,
+ char *src_buf)
+{
+ char *stripe_buf;
+ char **stripes = xmalloc(raid_disks * sizeof(char*));
+ char **blocks = xmalloc(raid_disks * sizeof(char*));
+ int i;
+ int rv;
+
+ int data_disks = raid_disks - (level == 0 ? 0 : level <= 5 ? 1 : 2);
+
+ if (posix_memalign((void**)&stripe_buf, 4096, raid_disks * chunk_size))
+ stripe_buf = NULL;
+
+ if (zero == NULL || chunk_size > zero_size) { /* same steps as ensure_zero_has_size() */
+ if (zero)
+ free(zero);
+ zero = xcalloc(1, chunk_size);
+ zero_size = chunk_size;
+ }
+
+ if (stripe_buf == NULL || stripes == NULL || blocks == NULL ||
+ zero == NULL) {
+ rv = -2;
+ goto abort;
+ }
+ for (i = 0; i < raid_disks; i++)
+ stripes[i] = stripe_buf + i * chunk_size; /* stripes[d] is the chunk destined for device d */
+ while (length > 0) {
+ unsigned int len = data_disks * chunk_size;
+ unsigned long long offset;
+ int disk, qdisk;
+ int syndrome_disks;
+ if (length < len) {
+ rv = -3;
+ goto abort;
+ }
+ for (i = 0; i < data_disks; i++) {
+ int disk = geo_map(i, start/chunk_size/data_disks,
+ raid_disks, level, layout);
+ if (src_buf == NULL) {
+ /* read from file */
+ if (lseek64(source, read_offset, 0) !=
+ (off64_t)read_offset) {
+ rv = -1;
+ goto abort;
+ }
+ if (read(source,
+ stripes[disk],
+ chunk_size) != chunk_size) {
+ rv = -1;
+ goto abort;
+ }
+ } else {
+ /* read from input buffer */
+ memcpy(stripes[disk],
+ src_buf + read_offset,
+ chunk_size);
+ }
+ read_offset += chunk_size;
+ }
+ /* We have the data, now do the parity */
+ offset = (start/chunk_size/data_disks) * chunk_size;
+ switch (level) {
+ case 4:
+ case 5:
+ disk = geo_map(-1, start/chunk_size/data_disks,
+ raid_disks, level, layout);
+ for (i = 0; i < data_disks; i++)
+ blocks[i] = stripes[(disk+1+i) % raid_disks];
+ xor_blocks(stripes[disk], blocks, data_disks, chunk_size);
+ break;
+ case 6:
+ disk = geo_map(-1, start/chunk_size/data_disks,
+ raid_disks, level, layout);
+ qdisk = geo_map(-2, start/chunk_size/data_disks,
+ raid_disks, level, layout);
+ if (is_ddf(layout)) {
+ /* q over 'raid_disks' blocks, in device order.
+ * 'p' and 'q' get to be all zero
+ */
+ for (i = 0; i < raid_disks; i++)
+ if (i == disk || i == qdisk)
+ blocks[i] = (char*)zero;
+ else
+ blocks[i] = stripes[i];
+ syndrome_disks = raid_disks;
+ } else {
+ /* for md, q is over 'data_disks' blocks,
+ * starting immediately after 'q'
+ */
+ for (i = 0; i < data_disks; i++)
+ blocks[i] = stripes[(qdisk+1+i) % raid_disks];
+
+ syndrome_disks = data_disks;
+ }
+ qsyndrome((uint8_t*)stripes[disk],
+ (uint8_t*)stripes[qdisk],
+ (uint8_t**)blocks,
+ syndrome_disks, chunk_size);
+ break;
+ }
+ for (i=0; i < raid_disks ; i++)
+ if (dest[i] >= 0) { /* negative entries are skipped */
+ if (lseek64(dest[i],
+ offsets[i]+offset, 0) < 0) {
+ rv = -1;
+ goto abort;
+ }
+ if (write(dest[i], stripes[i],
+ chunk_size) != chunk_size) {
+ rv = -1;
+ goto abort;
+ }
+ }
+ length -= len;
+ start += len;
+ }
+ rv = 0;
+
+abort:
+ free(stripe_buf);
+ free(stripes);
+ free(blocks);
+ return rv;
+}
+
+#ifdef MAIN
+
+int test_stripes(int *source, unsigned long long *offsets,
+		 int raid_disks, int chunk_size, int level, int layout,
+		 unsigned long long start, unsigned long long length)
+{
+	/* Read the data and P (and Q) blocks, and check we got them right.
+	 * Only level 6 is verified; findings go to stdout.  Returns 0 after
+	 * 'length' bytes per device, -1 on a seek/read failure. */
+	char *stripe_buf = xmalloc(raid_disks * chunk_size);
+	char **stripes = xmalloc(raid_disks * sizeof(char*));
+	char **blocks = xmalloc(raid_disks * sizeof(char*));
+	uint8_t *p = xmalloc(chunk_size);
+	uint8_t *q = xmalloc(chunk_size);
+	int i, rv = 0;
+	int diskP, diskQ;
+	int data_disks = raid_disks - (level == 5 ? 1: 2);
+
+	if (!tables_ready)
+		make_tables();
+	for (i = 0; i < raid_disks; i++)
+		stripes[i] = stripe_buf + i * chunk_size;
+	while (length > 0) {
+		int disk;
+		for (i = 0; i < raid_disks; i++) {
+			if ((lseek64(source[i], offsets[i]+start, 0) < 0) ||
+			    (read(source[i], stripes[i], chunk_size) !=
+			     chunk_size)) {
+				rv = -1;	/* buffers are released at 'out' */
+				goto out;
+			}
+		}
+		for (i = 0; i < data_disks; i++) {
+			int dev = geo_map(i, start/chunk_size, raid_disks,
+					  level, layout);
+			blocks[i] = stripes[dev];
+			printf("%d->%d\n", i, dev);
+		}
+		switch(level) {
+		case 6:
+			qsyndrome(p, q, (uint8_t**)blocks, data_disks, chunk_size);
+			diskP = geo_map(-1, start/chunk_size, raid_disks,
+					level, layout);
+			if (memcmp(p, stripes[diskP], chunk_size) != 0) {
+				printf("P(%d) wrong at %llu\n", diskP,
+				       start / chunk_size);
+			}
+			diskQ = geo_map(-2, start/chunk_size, raid_disks,
+					level, layout);
+			if (memcmp(q, stripes[diskQ], chunk_size) != 0) {
+				printf("Q(%d) wrong at %llu\n", diskQ,
+				       start / chunk_size);
+			}
+			disk = raid6_check_disks(data_disks, start, chunk_size,
+						 level, layout, diskP, diskQ,
+						 p, q, stripes);
+			if(disk >= 0) {
+				printf("Possible failed disk: %d\n", disk);
+			}
+			if(disk == -2) {
+				printf("Failure detected, but disk unknown\n");
+			}
+			break;
+		}
+		length -= chunk_size;
+		start += chunk_size;
+	}
+out:
+	free(q);
+	free(p);
+	free(blocks);
+	free(stripes);
+	free(stripe_buf);
+	return rv;
+}
+
+unsigned long long getnum(char *str, char **err)
+{
+	/* Decimal parse of the whole string; on empty input or trailing junk, set *err = str and yield 0 */
+	char *tail = NULL;
+	unsigned long long value = strtoull(str, &tail, 10);
+
+	if (tail != str && *tail == '\0')
+		return value;
+	*err = str;
+	return 0;
+}
+
+char const Name[] = "test_restripe";
+int main(int argc, char *argv[])
+{
+	/* save/restore/test file raid_disks chunk_size level layout start length devices...
+	 * Each device may be written as path:offset, the offset being in 512-byte sectors.
+	 */
+	int save;	/* 1 = save, 0 = restore, 2 = test */
+	int *fds;
+	char *file;
+	char *buf;
+	int storefd;
+	unsigned long long *offsets;
+	int raid_disks, chunk_size, level, layout;
+	unsigned long long start, length;
+	int i;
+	char *err = NULL;
+	if (argc < 10) {
+		fprintf(stderr, "Usage: test_stripe save/restore/test file raid_disks chunk_size level layout start length devices...\n");
+		exit(1);
+	}
+	if (strcmp(argv[1], "save")==0)
+		save = 1;
+	else if (strcmp(argv[1], "restore") == 0)
+		save = 0;
+	else if (strcmp(argv[1], "test") == 0)
+		save = 2;
+	else {
+		fprintf(stderr, "test_stripe: must give 'save', 'restore' or 'test'.\n");
+		exit(2);
+	}
+
+	file = argv[2];
+	raid_disks = getnum(argv[3], &err);
+	chunk_size = getnum(argv[4], &err);
+	level = getnum(argv[5], &err);
+	layout = getnum(argv[6], &err);
+	start = getnum(argv[7], &err);
+	length = getnum(argv[8], &err);
+	if (!err && chunk_size <= 0)
+		err = argv[4];	/* the stripe helpers divide by chunk_size */
+	if (err) {
+		fprintf(stderr, "test_stripe: Bad number: %s\n", err);
+		exit(2);
+	}
+	if (argc != raid_disks + 9) {
+		fprintf(stderr, "test_stripe: wrong number of devices: want %d found %d\n",
+			raid_disks, argc-9);
+		exit(2);
+	}
+	fds = xmalloc(raid_disks * sizeof(*fds));
+	offsets = xcalloc(raid_disks, sizeof(*offsets));
+
+	storefd = open(file, O_RDWR);
+	if (storefd < 0) {
+		perror(file);
+		fprintf(stderr, "test_stripe: could not open %s.\n", file);
+		exit(3);
+	}
+	for (i=0; i<raid_disks; i++) {
+		char *p;
+		p = strchr(argv[9+i], ':');
+		if(p != NULL) {
+			*p++ = '\0';
+			offsets[i] = atoll(p) * 512;	/* sectors to bytes */
+		}
+		fds[i] = open(argv[9+i], O_RDWR);
+		if (fds[i] < 0) {
+			perror(argv[9+i]);
+			fprintf(stderr,"test_stripe: cannot open %s.\n", argv[9+i]);
+			exit(3);
+		}
+	}
+
+	buf = xmalloc(raid_disks * chunk_size);
+
+	if (save == 1) {
+		int rv = save_stripes(fds, offsets,
+				      raid_disks, chunk_size, level, layout,
+				      1, &storefd,
+				      start, length, buf);
+		if (rv != 0) {
+			fprintf(stderr,
+				"test_stripe: save_stripes returned %d\n", rv);
+			exit(1);
+		}
+	} else if (save == 2) {
+		int rv = test_stripes(fds, offsets,
+				      raid_disks, chunk_size, level, layout,
+				      start, length);
+		if (rv != 0) {
+			fprintf(stderr,
+				"test_stripe: test_stripes returned %d\n", rv);
+			exit(1);
+		}
+	} else {
+		int rv = restore_stripes(fds, offsets,
+					 raid_disks, chunk_size, level, layout,
+					 storefd, 0ULL,
+					 start, length, NULL);
+		if (rv != 0) {
+			fprintf(stderr,
+				"test_stripe: restore_stripes returned %d\n",
+				rv);
+			exit(1);
+		}
+	}
+	exit(0);
+}
+
+#endif /* MAIN */
diff --git a/sg_io.c b/sg_io.c
new file mode 100644
index 0000000..7889a95
--- /dev/null
+++ b/sg_io.c
@@ -0,0 +1,60 @@
+/*
+ * Copyright (C) 2007-2008 Intel Corporation
+ *
+ * Retrieve drive serial numbers for scsi disks
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#include <string.h>
+#include <scsi/scsi.h>
+#include <scsi/sg.h>
+#include <sys/ioctl.h>
+
+int scsi_get_serial(int fd, void *buf, size_t buf_len)
+{
+	/* INQUIRY with EVPD=1, page 0x80: copy the page payload (the serial) to buf.
+	 * Returns 0 on success, otherwise the ioctl() result or -1; buf is not
+	 * NUL-terminated here. */
+	unsigned char rsp_buf[255];
+	unsigned char inq_cmd[] = {INQUIRY, 1, 0x80, 0, sizeof(rsp_buf), 0};
+	unsigned char sense[32];
+	struct sg_io_hdr io_hdr;
+	unsigned int rsp_len;
+	int rv;
+
+	memset(&io_hdr, 0, sizeof(io_hdr));
+	io_hdr.interface_id = 'S';
+	io_hdr.cmdp = inq_cmd;
+	io_hdr.cmd_len = sizeof(inq_cmd);
+	io_hdr.dxferp = rsp_buf;
+	io_hdr.dxfer_len = sizeof(rsp_buf);
+	io_hdr.dxfer_direction = SG_DXFER_FROM_DEV;
+	io_hdr.sbp = sense;
+	io_hdr.mx_sb_len = sizeof(sense);
+	io_hdr.timeout = 5000;
+
+	rv = ioctl(fd, SG_IO, &io_hdr);
+	if (rv)
+		return rv;
+	if ((io_hdr.info & SG_INFO_OK_MASK) != SG_INFO_OK)
+		return -1;
+
+	/* Byte 3 is the payload length and the payload starts at byte 4, so it
+	 * has to fit in what is left of rsp_buf as well as in the caller's buf. */
+	rsp_len = rsp_buf[3];
+	if (!rsp_len || buf_len < rsp_len || rsp_len > sizeof(rsp_buf) - 4)
+		return -1;
+	memcpy(buf, &rsp_buf[4], rsp_len);
+	return 0;
+}
diff --git a/sha1.c b/sha1.c
new file mode 100644
index 0000000..89b32f4
--- /dev/null
+++ b/sha1.c
@@ -0,0 +1,415 @@
+/* sha1.c - Functions to compute SHA1 message digest of files or
+ memory blocks according to the NIST specification FIPS-180-1.
+
+ Copyright (C) 2000, 2001, 2003, 2004, 2005, 2006, 2008 Free Software
+ Foundation, Inc.
+
+ This program is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by the
+ Free Software Foundation; either version 2, or (at your option) any
+ later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software Foundation,
+ Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
+
+/* Written by Scott G. Miller
+ Credits:
+ Robert Klep <robert@ilse.nl> -- Expansion function fix
+*/
+
+//#include <config.h>
+
+#include "sha1.h"
+
+#include <stddef.h>
+#include <string.h>
+
+#if USE_UNLOCKED_IO
+# include "unlocked-io.h"
+#endif
+
+#ifdef WORDS_BIGENDIAN
+# define SWAP(n) (n)
+#else /* otherwise SWAP reverses the four bytes of a 32-bit value */
+# define SWAP(n) \
+ (((n) << 24) | (((n) & 0xff00) << 8) | (((n) >> 8) & 0xff00) | ((n) >> 24))
+#endif
+/* Read size used by sha1_stream; the check below keeps it a multiple of 64. */
+#define BLOCKSIZE 4096
+#if BLOCKSIZE % 64 != 0
+# error "invalid BLOCKSIZE"
+#endif
+
+/* This array contains the bytes used to pad the buffer to the next
+ 64-byte boundary. (RFC 1321, 3.1: Step 1) */
+static const unsigned char fillbuf[64] = { 0x80, 0 /* , 0, 0, ... */ };
+
+/* Reset CTX to the SHA1 starting state: the five initial chaining
+   values, a zero byte count and an empty block buffer.  Call this
+   before feeding any data through the context. */
+void
+sha1_init_ctx (struct sha1_ctx *ctx)
+{
+  ctx->buflen = 0;
+  ctx->total[0] = 0;
+  ctx->total[1] = 0;
+  ctx->A = 0x67452301;
+  ctx->B = 0xefcdab89;
+  ctx->C = 0x98badcfe;
+  ctx->D = 0x10325476;
+  ctx->E = 0xc3d2e1f0;
+}
+
+/* Copy the five state words of CTX, each passed through SWAP, into the
+   20 bytes starting at RESBUF, and return RESBUF.
+
+   IMPORTANT: On some systems it is required that RESBUF is correctly
+   aligned for a 32-bit value.  */
+void *
+sha1_read_ctx (const struct sha1_ctx *ctx, void *resbuf)
+{
+  sha1_uint32 *out = (sha1_uint32 *) resbuf;
+  out[0] = SWAP (ctx->A);
+  out[1] = SWAP (ctx->B);
+  out[2] = SWAP (ctx->C);
+  out[3] = SWAP (ctx->D);
+  out[4] = SWAP (ctx->E);
+  return resbuf;
+}
+
+/* Process the remaining bytes in the internal buffer and the usual
+ prolog according to the standard and write the result to RESBUF.
+
+ IMPORTANT: On some systems it is required that RESBUF is correctly
+ aligned for a 32-bit value. Returns what sha1_read_ctx (CTX, RESBUF) returns. */
+void *
+sha1_finish_ctx (struct sha1_ctx *ctx, void *resbuf)
+{
+ /* Take yet unprocessed bytes into account. */
+ sha1_uint32 bytes = ctx->buflen;
+ size_t size = (bytes < 56) ? 64 / 4 : 64 * 2 / 4; /* padded length in 32-bit words: one block, or two when the pad byte plus 8 length bytes no longer fit */
+
+ /* Now count remaining bytes. */
+ ctx->total[0] += bytes;
+ if (ctx->total[0] < bytes) /* low word wrapped: carry into the high word */
+ ++ctx->total[1];
+
+ /* Put the 64-bit file length in *bits* at the end of the buffer. */
+ ctx->buffer[size - 2] = SWAP ((ctx->total[1] << 3) | (ctx->total[0] >> 29));
+ ctx->buffer[size - 1] = SWAP (ctx->total[0] << 3);
+ /* Pad from the end of the data up to the length field: 0x80, then zeros. */
+ memcpy (&((char *) ctx->buffer)[bytes], fillbuf, (size - 2) * 4 - bytes);
+
+ /* Process last bytes. */
+ sha1_process_block (ctx->buffer, size * 4, ctx);
+
+ return sha1_read_ctx (ctx, resbuf);
+}
+
+/* Compute SHA1 message digest for bytes read from STREAM. The
+ resulting message digest number will be written into the 20 bytes
+ beginning at RESBLOCK. Returns 0 on success, 1 if a read error occurred. */
+int
+sha1_stream (FILE *stream, void *resblock)
+{
+ struct sha1_ctx ctx;
+ char buffer[BLOCKSIZE + 72];
+ size_t sum;
+
+ /* Initialize the computation context. */
+ sha1_init_ctx (&ctx);
+
+ /* Iterate over full file contents. */
+ while (1)
+ {
+ /* We read the file in blocks of BLOCKSIZE bytes. One call of the
+ computation function processes the whole buffer so that with the
+ next round of the loop another block can be read. */
+ size_t n;
+ sum = 0;
+
+ /* Read block. Take care for partial reads. */
+ while (1)
+ {
+ n = fread (buffer + sum, 1, BLOCKSIZE - sum, stream);
+
+ sum += n;
+
+ if (sum == BLOCKSIZE)
+ break;
+
+ if (n == 0)
+ {
+ /* Check for the error flag IFF N == 0, so that we don't
+ exit the loop after a partial read due to e.g., EAGAIN
+ or EWOULDBLOCK. */
+ if (ferror (stream))
+ return 1;
+ goto process_partial_block;
+ }
+
+ /* We've read at least one byte, so ignore errors. But always
+ check for EOF, since feof may be true even though N > 0.
+ Otherwise, we could end up calling fread after EOF. */
+ if (feof (stream))
+ goto process_partial_block;
+ }
+
+ /* Process buffer with BLOCKSIZE bytes. Note that
+ BLOCKSIZE % 64 == 0
+ */
+ sha1_process_block (buffer, BLOCKSIZE, &ctx);
+ }
+
+ process_partial_block:;
+
+ /* Process any remaining bytes. */
+ if (sum > 0)
+ sha1_process_bytes (buffer, sum, &ctx);
+
+ /* Construct result in desired memory. */
+ sha1_finish_ctx (&ctx, resblock);
+ return 0;
+}
+
+/* One-shot SHA1 of the LEN bytes starting at BUFFER.  The digest is
+   produced by sha1_finish_ctx, so it is laid out in RESBLOCK exactly
+   as sha1_read_ctx stores it, and RESBLOCK is what gets handed back
+   to the caller.  */
+void *
+sha1_buffer (const char *buffer, size_t len, void *resblock)
+{
+  struct sha1_ctx state;
+
+  /* Start from the initial chaining values.  */
+  sha1_init_ctx (&state);
+
+  /* Feed everything; a trailing partial block stays buffered in STATE.  */
+  sha1_process_bytes (buffer, len, &state);
+
+  /* Pad, append the bit length and emit the digest.  */
+  return sha1_finish_ctx (&state, resblock);
+}
+/* Feed LEN bytes at BUFFER into CTX: whole 64-byte blocks are hashed now, any remainder is kept in CTX->buffer. */
+void
+sha1_process_bytes (const void *buffer, size_t len, struct sha1_ctx *ctx)
+{
+ /* When we already have some bits in our internal buffer concatenate
+ both inputs first. */
+ if (ctx->buflen != 0)
+ {
+ size_t left_over = ctx->buflen;
+ size_t add = 128 - left_over > len ? len : 128 - left_over; /* top the internal buffer up to at most 128 bytes */
+
+ memcpy (&((char *) ctx->buffer)[left_over], buffer, add);
+ ctx->buflen += add;
+
+ if (ctx->buflen > 64)
+ {
+ sha1_process_block (ctx->buffer, ctx->buflen & ~63, ctx);
+
+ ctx->buflen &= 63;
+ /* The regions in the following copy operation cannot overlap. */
+ memcpy (ctx->buffer,
+ &((char *) ctx->buffer)[(left_over + add) & ~63],
+ ctx->buflen);
+ }
+
+ buffer = (const char *) buffer + add;
+ len -= add;
+ }
+
+ /* Process available complete blocks. */
+ if (len >= 64)
+ {
+#if !_STRING_ARCH_unaligned
+# define alignof(type) offsetof (struct { char c; type x; }, x)
+# define UNALIGNED_P(p) (((size_t) p) % alignof (sha1_uint32) != 0)
+ if (UNALIGNED_P (buffer))
+ while (len > 64) /* can stop with exactly 64 bytes left; the final section below handles that */
+ {
+ sha1_process_block (memcpy (ctx->buffer, buffer, 64), 64, ctx);
+ buffer = (const char *) buffer + 64;
+ len -= 64;
+ }
+ else
+#endif
+ {
+ sha1_process_block (buffer, len & ~63, ctx);
+ buffer = (const char *) buffer + (len & ~63);
+ len &= 63;
+ }
+ }
+
+ /* Move remaining bytes in internal buffer. */
+ if (len > 0)
+ {
+ size_t left_over = ctx->buflen;
+
+ memcpy (&((char *) ctx->buffer)[left_over], buffer, len);
+ left_over += len;
+ if (left_over >= 64)
+ {
+ sha1_process_block (ctx->buffer, 64, ctx);
+ left_over -= 64;
+ memmove (ctx->buffer, &ctx->buffer[16], left_over); /* presumably buffer[16] is byte 64, i.e. 32-bit elements - confirm in sha1.h */
+ }
+ ctx->buflen = left_over;
+ }
+}
+
+/* --- Code below is the primary difference between md5.c and sha1.c --- */
+
+/* SHA1 round constants */
+#define K1 0x5a827999
+#define K2 0x6ed9eba1
+#define K3 0x8f1bbcdc
+#define K4 0xca62c1d6
+/* Kn is paired with Fn: sha1_process_block uses pair n for 20 consecutive rounds. */
+/* Round functions. Note that F2 is the same as F4. */
+#define F1(B,C,D) ( D ^ ( B & ( C ^ D ) ) )
+#define F2(B,C,D) (B ^ C ^ D)
+#define F3(B,C,D) ( ( B & C ) | ( D & ( B | C ) ) )
+#define F4(B,C,D) (B ^ C ^ D)
+
+/* Process LEN bytes of BUFFER, accumulating context into CTX.
+ It is assumed that LEN % 64 == 0.
+ Most of this code comes from GnuPG's cipher/sha1.c. */
+
+void
+sha1_process_block (const void *buffer, size_t len, struct sha1_ctx *ctx)
+{
+ const sha1_uint32 *words = (const sha1_uint32*) buffer;
+ size_t nwords = len / sizeof (sha1_uint32);
+ const sha1_uint32 *endp = words + nwords;
+ sha1_uint32 x[16];
+ sha1_uint32 a = ctx->A;
+ sha1_uint32 b = ctx->B;
+ sha1_uint32 c = ctx->C;
+ sha1_uint32 d = ctx->D;
+ sha1_uint32 e = ctx->E;
+
+ /* First increment the byte count. RFC 1321 specifies the possible
+ length of the file up to 2^64 bits. Here we only compute the
+ number of bytes. Do a double word increment. */
+ ctx->total[0] += len;
+ if (ctx->total[0] < len)
+ ++ctx->total[1];
+ /* Helpers: rol rotates left, M derives the next word in place in x[], R performs one round. */
+#define rol(x, n) (((x) << (n)) | ((sha1_uint32) (x) >> (32 - (n))))
+
+#define M(I) ( tm = x[I&0x0f] ^ x[(I-14)&0x0f] \
+ ^ x[(I-8)&0x0f] ^ x[(I-3)&0x0f] \
+ , (x[I&0x0f] = rol(tm, 1)) )
+
+#define R(A,B,C,D,E,F,K,M) do { E += rol( A, 5 ) \
+ + F( B, C, D ) \
+ + K \
+ + M; \
+ B = rol( B, 30 ); \
+ } while(0)
+ /* Each pass consumes 16 words (64 bytes) of input and runs the 80 rounds on them. */
+ while (words < endp)
+ {
+ sha1_uint32 tm;
+ int t;
+ for (t = 0; t < 16; t++)
+ {
+ x[t] = SWAP (*words);
+ words++;
+ }
+
+ R( a, b, c, d, e, F1, K1, x[ 0] );
+ R( e, a, b, c, d, F1, K1, x[ 1] );
+ R( d, e, a, b, c, F1, K1, x[ 2] );
+ R( c, d, e, a, b, F1, K1, x[ 3] );
+ R( b, c, d, e, a, F1, K1, x[ 4] );
+ R( a, b, c, d, e, F1, K1, x[ 5] );
+ R( e, a, b, c, d, F1, K1, x[ 6] );
+ R( d, e, a, b, c, F1, K1, x[ 7] );
+ R( c, d, e, a, b, F1, K1, x[ 8] );
+ R( b, c, d, e, a, F1, K1, x[ 9] );
+ R( a, b, c, d, e, F1, K1, x[10] );
+ R( e, a, b, c, d, F1, K1, x[11] );
+ R( d, e, a, b, c, F1, K1, x[12] );
+ R( c, d, e, a, b, F1, K1, x[13] );
+ R( b, c, d, e, a, F1, K1, x[14] );
+ R( a, b, c, d, e, F1, K1, x[15] );
+ R( e, a, b, c, d, F1, K1, M(16) );
+ R( d, e, a, b, c, F1, K1, M(17) );
+ R( c, d, e, a, b, F1, K1, M(18) );
+ R( b, c, d, e, a, F1, K1, M(19) );
+ R( a, b, c, d, e, F2, K2, M(20) );
+ R( e, a, b, c, d, F2, K2, M(21) );
+ R( d, e, a, b, c, F2, K2, M(22) );
+ R( c, d, e, a, b, F2, K2, M(23) );
+ R( b, c, d, e, a, F2, K2, M(24) );
+ R( a, b, c, d, e, F2, K2, M(25) );
+ R( e, a, b, c, d, F2, K2, M(26) );
+ R( d, e, a, b, c, F2, K2, M(27) );
+ R( c, d, e, a, b, F2, K2, M(28) );
+ R( b, c, d, e, a, F2, K2, M(29) );
+ R( a, b, c, d, e, F2, K2, M(30) );
+ R( e, a, b, c, d, F2, K2, M(31) );
+ R( d, e, a, b, c, F2, K2, M(32) );
+ R( c, d, e, a, b, F2, K2, M(33) );
+ R( b, c, d, e, a, F2, K2, M(34) );
+ R( a, b, c, d, e, F2, K2, M(35) );
+ R( e, a, b, c, d, F2, K2, M(36) );
+ R( d, e, a, b, c, F2, K2, M(37) );
+ R( c, d, e, a, b, F2, K2, M(38) );
+ R( b, c, d, e, a, F2, K2, M(39) );
+ R( a, b, c, d, e, F3, K3, M(40) );
+ R( e, a, b, c, d, F3, K3, M(41) );
+ R( d, e, a, b, c, F3, K3, M(42) );
+ R( c, d, e, a, b, F3, K3, M(43) );
+ R( b, c, d, e, a, F3, K3, M(44) );
+ R( a, b, c, d, e, F3, K3, M(45) );
+ R( e, a, b, c, d, F3, K3, M(46) );
+ R( d, e, a, b, c, F3, K3, M(47) );
+ R( c, d, e, a, b, F3, K3, M(48) );
+ R( b, c, d, e, a, F3, K3, M(49) );
+ R( a, b, c, d, e, F3, K3, M(50) );
+ R( e, a, b, c, d, F3, K3, M(51) );
+ R( d, e, a, b, c, F3, K3, M(52) );
+ R( c, d, e, a, b, F3, K3, M(53) );
+ R( b, c, d, e, a, F3, K3, M(54) );
+ R( a, b, c, d, e, F3, K3, M(55) );
+ R( e, a, b, c, d, F3, K3, M(56) );
+ R( d, e, a, b, c, F3, K3, M(57) );
+ R( c, d, e, a, b, F3, K3, M(58) );
+ R( b, c, d, e, a, F3, K3, M(59) );
+ R( a, b, c, d, e, F4, K4, M(60) );
+ R( e, a, b, c, d, F4, K4, M(61) );
+ R( d, e, a, b, c, F4, K4, M(62) );
+ R( c, d, e, a, b, F4, K4, M(63) );
+ R( b, c, d, e, a, F4, K4, M(64) );
+ R( a, b, c, d, e, F4, K4, M(65) );
+ R( e, a, b, c, d, F4, K4, M(66) );
+ R( d, e, a, b, c, F4, K4, M(67) );
+ R( c, d, e, a, b, F4, K4, M(68) );
+ R( b, c, d, e, a, F4, K4, M(69) );
+ R( a, b, c, d, e, F4, K4, M(70) );
+ R( e, a, b, c, d, F4, K4, M(71) );
+ R( d, e, a, b, c, F4, K4, M(72) );
+ R( c, d, e, a, b, F4, K4, M(73) );
+ R( b, c, d, e, a, F4, K4, M(74) );
+ R( a, b, c, d, e, F4, K4, M(75) );
+ R( e, a, b, c, d, F4, K4, M(76) );
+ R( d, e, a, b, c, F4, K4, M(77) );
+ R( c, d, e, a, b, F4, K4, M(78) );
+ R( b, c, d, e, a, F4, K4, M(79) );
+ /* Add this block's result into CTX and continue from the updated state. */
+ a = ctx->A += a;
+ b = ctx->B += b;
+ c = ctx->C += c;
+ d = ctx->D += d;
+ e = ctx->E += e;
+ }
+}
diff --git a/sha1.h b/sha1.h
new file mode 100644
index 0000000..999fc6a
--- /dev/null
+++ b/sha1.h
@@ -0,0 +1,136 @@
+/* Declarations of functions and data types used for SHA1 sum
+ library functions.
+ Copyright (C) 2000, 2001, 2003, 2005, 2006, 2008
+ Free Software Foundation, Inc.
+
+ This program is free software; you can redistribute it and/or modify it
+ under the terms of the GNU General Public License as published by the
+ Free Software Foundation; either version 2, or (at your option) any
+ later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program; if not, write to the Free Software Foundation,
+ Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
+
+#ifndef SHA1_H
+# define SHA1_H 1
+
+#include <stdio.h>
+
+#if defined HAVE_LIMITS_H || _LIBC
+# include <limits.h>
+#endif
+
+/* The following contortions are an attempt to use the C preprocessor
+ to determine an unsigned integral type that is 32 bits wide. An
+ alternative approach is to use autoconf's AC_CHECK_SIZEOF macro, but
+ doing that would require that the configure script compile and *run*
+ the resulting executable. Locally running cross-compiled executables
+ is usually not possible. */
+
+#ifdef _LIBC
+# include <sys/types.h>
+typedef u_int32_t sha1_uint32;
+typedef uintptr_t sha1_uintptr;
+#else
+# define INT_MAX_32_BITS 2147483647
+
+/* If INT_MAX isn't defined, assume int is a 32-bit type.
+ This should be valid for all systems GNU cares about because
+ that doesn't include 16-bit systems, and only modern systems
+ (that certainly have <limits.h>) have 64+-bit integral types. */
+
+# ifndef INT_MAX
+# define INT_MAX INT_MAX_32_BITS
+# endif
+
+# if INT_MAX == INT_MAX_32_BITS
+ typedef unsigned int sha1_uint32;
+# else
+# if SHRT_MAX == INT_MAX_32_BITS
+ typedef unsigned short sha1_uint32;
+# else
+# if LONG_MAX == INT_MAX_32_BITS
+ typedef unsigned long sha1_uint32;
+# else
+ /* The following line is intended to evoke an error.
+ Using #error is not portable enough. */
+ "Cannot determine unsigned 32-bit data type."
+# endif
+# endif
+# endif
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Structure to save state of computation between the single steps. */
+struct sha1_ctx
+{
+ /* Five 32-bit state words of the hash computation. */
+ sha1_uint32 A;
+ sha1_uint32 B;
+ sha1_uint32 C;
+ sha1_uint32 D;
+ sha1_uint32 E;
+
+ /* NOTE(review): presumably the running input length split over two
+    32-bit words -- confirm against sha1.c. */
+ sha1_uint32 total[2];
+ /* NOTE(review): presumably the number of bytes currently staged in
+    buffer -- confirm against sha1.c. */
+ sha1_uint32 buflen;
+ /* Staging area of 32 words, i.e. 128 bytes. */
+ sha1_uint32 buffer[32];
+};
+
+/* Initialize structure containing state of computation. */
+extern void sha1_init_ctx (struct sha1_ctx *ctx);
+
+/* Starting with the result of former calls of this function (or the
+ initialization function), update the context for the next LEN bytes
+ starting at BUFFER.
+ It is necessary that LEN is a multiple of 64!!! */
+extern void sha1_process_block (const void *buffer, size_t len,
+ struct sha1_ctx *ctx);
+
+/* Starting with the result of former calls of this function (or the
+ initialization function), update the context for the next LEN bytes
+ starting at BUFFER.
+ It is NOT required that LEN is a multiple of 64. */
+extern void sha1_process_bytes (const void *buffer, size_t len,
+ struct sha1_ctx *ctx);
+
+/* Process the remaining bytes in the buffer and put result from CTX
+ in first 20 bytes following RESBUF. The result is always in little
+ endian byte order, so that a byte-wise output yields to the wanted
+ ASCII representation of the message digest.
+
+ IMPORTANT: On some systems it is required that RESBUF be correctly
+ aligned for a 32 bits value. */
+extern void *sha1_finish_ctx (struct sha1_ctx *ctx, void *resbuf);
+
+/* Put result from CTX in first 20 bytes following RESBUF. The result is
+ always in little endian byte order, so that a byte-wise output yields
+ to the wanted ASCII representation of the message digest.
+
+ IMPORTANT: On some systems it is required that RESBUF is correctly
+ aligned for a 32 bits value. */
+extern void *sha1_read_ctx (const struct sha1_ctx *ctx, void *resbuf);
+
+/* Compute SHA1 message digest for bytes read from STREAM. The
+ resulting message digest number will be written into the 20 bytes
+ beginning at RESBLOCK. */
+extern int sha1_stream (FILE *stream, void *resblock);
+
+/* Compute SHA1 message digest for LEN bytes beginning at BUFFER. The
+ result is always in little endian byte order, so that a byte-wise
+ output yields to the wanted ASCII representation of the message
+ digest. */
+extern void *sha1_buffer (const char *buffer, size_t len, void *resblock);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/super-ddf.c b/super-ddf.c
new file mode 100644
index 0000000..3f304cd
--- /dev/null
+++ b/super-ddf.c
@@ -0,0 +1,5244 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2006-2014 Neil Brown <neilb@suse.de>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email: <neil@brown.name>
+ *
+ * Specifications for DDF taken from Common RAID DDF Specification Revision 1.2
+ * (July 28 2006). Reused by permission of SNIA.
+ */
+
+#define HAVE_STDINT_H 1
+#include "mdadm.h"
+#include "mdmon.h"
+#include "sha1.h"
+#include <values.h>
+#include <stddef.h>
+
+/* a non-official T10 name for creation GUIDs */
+static char T10[] = "Linux-MD";
+
+/* DDF timestamps are 1980 based, so we need to add the number of
+ * seconds in the decade of the seventies to convert to linux timestamps.
+ * 10 years with 2 leap years.
+ */
+#define DECADE (3600*24*(365*10+2))
+unsigned long crc32(
+ unsigned long crc,
+ const unsigned char *buf,
+ unsigned len);
+
+#define DDF_NOTFOUND (~0U)
+#define DDF_CONTAINER (DDF_NOTFOUND-1)
+
+/* Default for safe_mode_delay. Same value as for IMSM.
+ */
+static const int DDF_SAFE_MODE_DELAY = 4000;
+
+/* The DDF metadata handling.
+ * DDF metadata lives at the end of the device.
+ * The last 512 byte block provides an 'anchor' which is used to locate
+ * the rest of the metadata which usually lives immediately behind the anchor.
+ *
+ * Note:
+ * - all multibyte numeric fields are bigendian.
+ * - all strings are space padded.
+ *
+ */
+
+/* Big-endian integer wrappers.  Each type holds the raw on-disk
+ * (big-endian) value inside a single-member struct, so it cannot be used
+ * in arithmetic or mixed with host-order integers without going through
+ * the cpu_to_beNN()/beNN_to_cpu() conversions.
+ */
+typedef struct __be16 {
+ __u16 _v16;
+} be16;
+/* Equality and bit tests work on the raw stored values, so no byte
+ * swapping is needed; be16_and/be16_or yield a raw __u16, while
+ * be16_clear/be16_set modify x in place.
+ */
+#define be16_eq(x, y) ((x)._v16 == (y)._v16)
+#define be16_and(x, y) ((x)._v16 & (y)._v16)
+#define be16_or(x, y) ((x)._v16 | (y)._v16)
+#define be16_clear(x, y) ((x)._v16 &= ~(y)._v16)
+#define be16_set(x, y) ((x)._v16 |= (y)._v16)
+
+typedef struct __be32 {
+ __u32 _v32;
+} be32;
+#define be32_eq(x, y) ((x)._v32 == (y)._v32)
+
+typedef struct __be64 {
+ __u64 _v64;
+} be64;
+#define be64_eq(x, y) ((x)._v64 == (y)._v64)
+
+#define be16_to_cpu(be) __be16_to_cpu((be)._v16)
+/* Wrap the host-order 16-bit value x as a big-endian be16. */
+static inline be16 cpu_to_be16(__u16 x)
+{
+	return (be16){ ._v16 = __cpu_to_be16(x) };
+}
+
+#define be32_to_cpu(be) __be32_to_cpu((be)._v32)
+/* Wrap the host-order 32-bit value x as a big-endian be32. */
+static inline be32 cpu_to_be32(__u32 x)
+{
+	return (be32){ ._v32 = __cpu_to_be32(x) };
+}
+
+#define be64_to_cpu(be) __be64_to_cpu((be)._v64)
+/* Wrap the host-order 64-bit value x as a big-endian be64. */
+static inline be64 cpu_to_be64(__u64 x)
+{
+	return (be64){ ._v64 = __cpu_to_be64(x) };
+}
+
+/* Primary Raid Level (PRL) */
+#define DDF_RAID0 0x00
+#define DDF_RAID1 0x01
+#define DDF_RAID3 0x03
+#define DDF_RAID4 0x04
+#define DDF_RAID5 0x05
+#define DDF_RAID1E 0x11
+#define DDF_JBOD 0x0f
+#define DDF_CONCAT 0x1f
+#define DDF_RAID5E 0x15
+#define DDF_RAID5EE 0x25
+#define DDF_RAID6 0x06
+
+/* Raid Level Qualifier (RLQ) */
+#define DDF_RAID0_SIMPLE 0x00
+#define DDF_RAID1_SIMPLE 0x00 /* just 2 devices in this plex */
+#define DDF_RAID1_MULTI 0x01 /* exactly 3 devices in this plex */
+#define DDF_RAID3_0 0x00 /* parity in first extent */
+#define DDF_RAID3_N 0x01 /* parity in last extent */
+#define DDF_RAID4_0 0x00 /* parity in first extent */
+#define DDF_RAID4_N 0x01 /* parity in last extent */
+/* these apply to raid5e and raid5ee as well */
+#define DDF_RAID5_0_RESTART 0x00 /* same as 'right asymmetric' - layout 1 */
+#define DDF_RAID6_0_RESTART 0x01 /* raid6 different from raid5 here!!! */
+#define DDF_RAID5_N_RESTART 0x02 /* same as 'left asymmetric' - layout 0 */
+#define DDF_RAID5_N_CONTINUE 0x03 /* same as 'left symmetric' - layout 2 */
+
+#define DDF_RAID1E_ADJACENT 0x00 /* raid10 nearcopies==2 */
+#define DDF_RAID1E_OFFSET 0x01 /* raid10 offsetcopies==2 */
+
+/* Secondary RAID Level (SRL) */
+#define DDF_2STRIPED 0x00 /* This is weirder than RAID0 !! */
+#define DDF_2MIRRORED 0x01
+#define DDF_2CONCAT 0x02
+#define DDF_2SPANNED 0x03 /* This is also weird - be careful */
+
+/* Magic numbers */
+#define DDF_HEADER_MAGIC cpu_to_be32(0xDE11DE11)
+#define DDF_CONTROLLER_MAGIC cpu_to_be32(0xAD111111)
+#define DDF_PHYS_RECORDS_MAGIC cpu_to_be32(0x22222222)
+#define DDF_PHYS_DATA_MAGIC cpu_to_be32(0x33333333)
+#define DDF_VIRT_RECORDS_MAGIC cpu_to_be32(0xDDDDDDDD)
+#define DDF_VD_CONF_MAGIC cpu_to_be32(0xEEEEEEEE)
+#define DDF_SPARE_ASSIGN_MAGIC cpu_to_be32(0x55555555)
+#define DDF_VU_CONF_MAGIC cpu_to_be32(0x88888888)
+#define DDF_VENDOR_LOG_MAGIC cpu_to_be32(0x01dBEEF0)
+#define DDF_BBM_LOG_MAGIC cpu_to_be32(0xABADB10C)
+
+#define DDF_GUID_LEN 24
+#define DDF_REVISION_0 "01.00.00"
+#define DDF_REVISION_2 "01.02.00"
+
+/* DDF header block; the fields below add up to 512 bytes.  The same
+ * layout is used for the anchor, primary and secondary copies (see the
+ * DDF_HEADER_* values for 'type').  load_ddf_header() requires everything
+ * from pad2 onwards to match the anchor, and calc_crc() relies on 'crc'
+ * being the second 32-bit field.
+ */
+struct ddf_header {
+ be32 magic; /* DDF_HEADER_MAGIC */
+ be32 crc;
+ char guid[DDF_GUID_LEN];
+ char revision[8]; /* 01.02.00 */
+ be32 seq; /* starts at '1' */
+ be32 timestamp;
+ __u8 openflag;
+ __u8 foreignflag;
+ __u8 enforcegroups;
+ __u8 pad0; /* 0xff */
+ __u8 pad1[12]; /* 12 * 0xff */
+ /* 64 bytes so far */
+ __u8 header_ext[32]; /* reserved: fill with 0xff */
+ be64 primary_lba;
+ be64 secondary_lba;
+ __u8 type;
+ __u8 pad2[3]; /* 0xff */
+ be32 workspace_len; /* sectors for vendor space -
+ * at least 32768(sectors) */
+ be64 workspace_lba;
+ be16 max_pd_entries; /* one of 15, 63, 255, 1023, 4095 */
+ be16 max_vd_entries; /* 2^(4,6,8,10,12)-1 : i.e. as above */
+ be16 max_partitions; /* i.e. max num of configuration
+ record entries per disk */
+ be16 config_record_len; /* 1 +ROUNDUP(max_primary_element_entries
+ *12/512) */
+ be16 max_primary_element_entries; /* 16, 64, 256, 1024, or 4096 */
+ __u8 pad3[54]; /* 0xff */
+ /* 192 bytes so far */
+ /* Section offsets are relative to the LBA of the active header (see
+ * load_section()); offsets and lengths are in 512-byte sectors. */
+ be32 controller_section_offset;
+ be32 controller_section_length;
+ be32 phys_section_offset;
+ be32 phys_section_length;
+ be32 virt_section_offset;
+ be32 virt_section_length;
+ be32 config_section_offset;
+ be32 config_section_length;
+ be32 data_section_offset;
+ be32 data_section_length;
+ be32 bbm_section_offset;
+ be32 bbm_section_length;
+ be32 diag_space_offset;
+ be32 diag_space_length;
+ be32 vendor_offset;
+ be32 vendor_length;
+ /* 256 bytes so far */
+ __u8 pad4[256]; /* 0xff */
+};
+
+/* type field */
+#define DDF_HEADER_ANCHOR 0x00
+#define DDF_HEADER_PRIMARY 0x01
+#define DDF_HEADER_SECONDARY 0x02
+
+/* The content of the 'controller section' - global scope */
+/* Controller section record; the fields below add up to 512 bytes. */
+struct ddf_controller_data {
+ be32 magic; /* DDF_CONTROLLER_MAGIC */
+ be32 crc;
+ char guid[DDF_GUID_LEN];
+ struct controller_type {
+ be16 vendor_id;
+ be16 device_id;
+ be16 sub_vendor_id;
+ be16 sub_device_id;
+ } type;
+ char product_id[16];
+ __u8 pad[8]; /* 0xff */
+ __u8 vendor_data[448];
+};
+
+/* The content of phys_section - global scope */
+/* Physical disk records: a 64-byte header followed by an array of
+ * 64-byte entries.  'entries' is a zero-length trailing array; the real
+ * number of entries depends on how large a section was loaded.
+ */
+struct phys_disk {
+ be32 magic; /* DDF_PHYS_RECORDS_MAGIC */
+ be32 crc;
+ be16 used_pdes; /* This is a counter, not a max - the list
+ * of used entries may not be dense */
+ be16 max_pdes;
+ __u8 pad[52];
+ struct phys_disk_entry {
+ char guid[DDF_GUID_LEN];
+ be32 refnum;
+ be16 type;
+ be16 state;
+ be64 config_size; /* DDF structures must be after here */
+ char path[18]; /* Another horrible structure really
+ * but is "used for information
+ * purposes only" */
+ __u8 pad[6];
+ } entries[0];
+};
+
+/* phys_disk_entry.type is a bitmap - bigendian remember */
+#define DDF_Forced_PD_GUID 1
+#define DDF_Active_in_VD 2
+#define DDF_Global_Spare 4 /* VD_CONF records are ignored */
+#define DDF_Spare 8 /* overrides Global_spare */
+#define DDF_Foreign 16
+#define DDF_Legacy 32 /* no DDF on this device */
+
+#define DDF_Interface_mask 0xf00
+#define DDF_Interface_SCSI 0x100
+#define DDF_Interface_SAS 0x200
+#define DDF_Interface_SATA 0x300
+#define DDF_Interface_FC 0x400
+
+/* phys_disk_entry.state is a bigendian bitmap */
+#define DDF_Online 1
+#define DDF_Failed 2 /* overrides 1,4,8 */
+#define DDF_Rebuilding 4
+#define DDF_Transition 8
+#define DDF_SMART 16
+#define DDF_ReadErrors 32
+#define DDF_Missing 64
+
+/* The content of the virt_section global scope */
+/* Virtual disk records: a 64-byte header followed by an array of
+ * 64-byte entries.  'entries' is a zero-length trailing array; the real
+ * number of entries depends on how large a section was loaded.
+ */
+struct virtual_disk {
+ be32 magic; /* DDF_VIRT_RECORDS_MAGIC */
+ be32 crc;
+ be16 populated_vdes;
+ be16 max_vdes;
+ __u8 pad[52];
+ struct virtual_entry {
+ char guid[DDF_GUID_LEN];
+ be16 unit;
+ __u16 pad0; /* 0xffff */
+ be16 guid_crc;
+ be16 type;
+ __u8 state;
+ __u8 init_state;
+ __u8 pad1[14];
+ char name[16];
+ } entries[0];
+};
+
+/* virtual_entry.type is a bitmap - bigendian */
+#define DDF_Shared 1
+#define DDF_Enforce_Groups 2
+#define DDF_Unicode 4
+#define DDF_Owner_Valid 8
+
+/* virtual_entry.state is a bigendian bitmap */
+#define DDF_state_mask 0x7
+#define DDF_state_optimal 0x0
+#define DDF_state_degraded 0x1
+#define DDF_state_deleted 0x2
+#define DDF_state_missing 0x3
+#define DDF_state_failed 0x4
+#define DDF_state_part_optimal 0x5
+
+#define DDF_state_morphing 0x8
+#define DDF_state_inconsistent 0x10
+
+/* virtual_entry.init_state is a bigendian bitmap */
+#define DDF_initstate_mask 0x03
+#define DDF_init_not 0x00
+#define DDF_init_quick 0x01 /* initialisation in progress.
+ * i.e. 'state_inconsistent' */
+#define DDF_init_full 0x02
+
+#define DDF_access_mask 0xc0
+#define DDF_access_rw 0x00
+#define DDF_access_ro 0x80
+#define DDF_access_blocked 0xc0
+
+/* The content of the config_section - local scope
+ * It has multiple records each config_record_len sectors
+ * They can be vd_config or spare_assign
+ */
+
+/* Virtual disk configuration record.  The named fields occupy 512 bytes
+ * and are followed by the variable-length phys_refnum array and then an
+ * array of be64 LBA offsets (see LBA_OFFSET below).
+ */
+struct vd_config {
+ be32 magic; /* DDF_VD_CONF_MAGIC */
+ be32 crc;
+ char guid[DDF_GUID_LEN];
+ be32 timestamp;
+ be32 seqnum;
+ __u8 pad0[24];
+ be16 prim_elmnt_count;
+ __u8 chunk_shift; /* 0 == 512, 1==1024 etc */
+ __u8 prl;
+ __u8 rlq;
+ __u8 sec_elmnt_count;
+ __u8 sec_elmnt_seq;
+ __u8 srl;
+ be64 blocks; /* blocks per component could be different
+ * on different component devices...(only
+ * for concat I hope) */
+ be64 array_blocks; /* blocks in array */
+ __u8 pad1[8];
+ be32 spare_refs[8]; /* This is used to detect missing spares.
+ * As we don't have an interface for that
+ * the values are ignored.
+ */
+ __u8 cache_pol[8];
+ __u8 bg_rate;
+ __u8 pad2[3];
+ __u8 pad3[52];
+ __u8 pad4[192];
+ __u8 v0[32]; /* reserved- 0xff */
+ __u8 v1[32]; /* reserved- 0xff */
+ __u8 v2[16]; /* reserved- 0xff */
+ __u8 v3[16]; /* reserved- 0xff */
+ __u8 vendor[32];
+ be32 phys_refnum[0]; /* refnum of each disk in sequence */
+ /*__u64 lba_offset[0]; LBA offset in each phys. Note extents in a
+ bvd are always the same size */
+};
+/* Address of the be64 LBA-offset array, which starts right after
+ * (ddf)->mppe phys_refnum slots. */
+#define LBA_OFFSET(ddf, vd) ((be64 *) &(vd)->phys_refnum[(ddf)->mppe])
+
+/* vd_config.cache_pol[7] is a bitmap */
+#define DDF_cache_writeback 1 /* else writethrough */
+#define DDF_cache_wadaptive 2 /* only applies if writeback */
+#define DDF_cache_readahead 4
+#define DDF_cache_radaptive 8 /* only if doing read-ahead */
+#define DDF_cache_ifnobatt 16 /* even to write cache if battery is poor */
+#define DDF_cache_wallowed 32 /* enable write caching */
+#define DDF_cache_rallowed 64 /* enable read caching */
+
+/* Spare assignment record: a 32-byte header followed by an array of
+ * 32-byte entries ('spare_ents' is a zero-length trailing array). */
+struct spare_assign {
+ be32 magic; /* DDF_SPARE_ASSIGN_MAGIC */
+ be32 crc;
+ be32 timestamp;
+ __u8 reserved[7];
+ __u8 type;
+ be16 populated; /* SAEs used */
+ be16 max; /* max SAEs */
+ __u8 pad[8];
+ struct spare_assign_entry {
+ char guid[DDF_GUID_LEN];
+ be16 secondary_element;
+ __u8 pad[6];
+ } spare_ents[0];
+};
+/* spare_assign.type is a bitmap */
+#define DDF_spare_dedicated 0x1 /* else global */
+#define DDF_spare_revertible 0x2 /* else committable */
+#define DDF_spare_active 0x4 /* else not active */
+#define DDF_spare_affinity 0x8 /* enclosure affinity */
+
+/* The data_section contents - local scope */
+/* Per-disk data record; the fields below add up to 512 bytes. */
+struct disk_data {
+ be32 magic; /* DDF_PHYS_DATA_MAGIC */
+ be32 crc;
+ char guid[DDF_GUID_LEN];
+ be32 refnum; /* crc of some magic drive data ... */
+ __u8 forced_ref; /* set when above was not result of magic */
+ __u8 forced_guid; /* set if guid was forced rather than magic */
+ __u8 vendor[32];
+ __u8 pad[442];
+};
+
+/* bbm_section content */
+/* Bad block log header followed by a zero-length trailing array of
+ * mapped-block entries.
+ * NOTE(review): spare_count is a 32-bit field placed directly after the
+ * 16-bit entry_count, so on ABIs that align __u32 to 4 bytes the compiler
+ * inserts padding there and the in-memory offsets may not match the
+ * on-disk record -- confirm against the DDF specification before using
+ * this struct for I/O.
+ */
+struct bad_block_log {
+ be32 magic;
+ be32 crc;
+ be16 entry_count;
+ be32 spare_count;
+ __u8 pad[10];
+ be64 first_spare;
+ struct mapped_block {
+ be64 defective_start;
+ be32 replacement_start;
+ be16 remap_count;
+ __u8 pad[2];
+ } entries[0];
+};
+
+/* Struct for internally holding ddf structures */
+/* The DDF structure stored on each device is potentially
+ * quite different, as some data is global and some is local.
+ * The global data is:
+ * - ddf header
+ * - controller_data
+ * - Physical disk records
+ * - Virtual disk records
+ * The local data is:
+ * - Configuration records
+ * - Physical Disk data section
+ * ( and Bad block and vendor which I don't care about yet).
+ *
+ * The local data is parsed into separate lists as it is read
+ * and reconstructed for writing. This means that we only need
+ * to make config changes once and they are automatically
+ * propagated to all devices.
+ * The local (config and disk data) records are each in a list
+ * of separate data structures. When writing we find the entry
+ * or entries applicable to the particular device.
+ */
+struct ddf_super {
+ struct ddf_header anchor, primary, secondary;
+ struct ddf_controller_data controller;
+ struct ddf_header *active; /* &primary or &secondary, chosen by load_ddf_headers() */
+ struct phys_disk *phys; /* phys section, loaded by load_ddf_global() */
+ struct virtual_disk *virt; /* virt section, loaded by load_ddf_global() */
+ char *conf;
+ int pdsize, vdsize; /* byte sizes of the phys and virt sections */
+ /* Copied from the active header by load_ddf_global(); conf_rec_len
+ * is counted in 512-byte sectors. */
+ unsigned int max_part, mppe, conf_rec_len;
+ int currentdev;
+ int updates_pending; /* set by _ddf_set_updates_pending() */
+ struct vcl {
+ union {
+ char space[512]; /* pads the bookkeeping fields to 512 bytes */
+ struct {
+ struct vcl *next;
+ unsigned int vcnum; /* index into ->virt */
+ /* For an array with a secondary level there are
+ * multiple vd_config structures, all with the same
+ * guid but with different sec_elmnt_seq.
+ * One of these structures is in 'conf' below.
+ * The others are in other_bvds, not in any
+ * particular order.
+ */
+ struct vd_config **other_bvds;
+ __u64 *block_sizes; /* NULL if all the same */
+ };
+ };
+ struct vd_config conf;
+ } *conflist, *currentconf;
+ struct dl {
+ union {
+ char space[512]; /* pads the bookkeeping fields to 512 bytes */
+ struct {
+ struct dl *next;
+ int major, minor;
+ char *devname;
+ int fd;
+ unsigned long long size; /* sectors */
+ be64 primary_lba; /* sectors */
+ be64 secondary_lba; /* sectors */
+ be64 workspace_lba; /* sectors */
+ int pdnum; /* index in ->phys */
+ struct spare_assign *spare;
+ void *mdupdate; /* hold metadata update */
+
+ /* These fields used by auto-layout */
+ int raiddisk; /* slot to fill in autolayout */
+ __u64 esize;
+ int displayed;
+ };
+ };
+ struct disk_data disk;
+ struct vcl *vlist[0]; /* max_part in size */
+ } *dlist, *add_list;
+};
+
+static int load_super_ddf_all(struct supertype *st, int fd,
+ void **sbp, char *devname);
+static int get_svd_state(const struct ddf_super *, const struct vcl *);
+static int
+validate_geometry_ddf_container(struct supertype *st,
+ int level, int layout, int raiddisks,
+ int chunk, unsigned long long size,
+ unsigned long long data_offset,
+ char *dev, unsigned long long *freesize,
+ int verbose);
+
+static int validate_geometry_ddf_bvd(struct supertype *st,
+ int level, int layout, int raiddisks,
+ int *chunk, unsigned long long size,
+ unsigned long long data_offset,
+ char *dev, unsigned long long *freesize,
+ int verbose);
+
+static void free_super_ddf(struct supertype *st);
+static int all_ff(const char *guid);
+static unsigned int get_pd_index_from_refnum(const struct vcl *vc,
+ be32 refnum, unsigned int nmax,
+ const struct vd_config **bvd,
+ unsigned int *idx);
+static void getinfo_super_ddf(struct supertype *st, struct mdinfo *info, char *map);
+static void uuid_from_ddf_guid(const char *guid, int uuid[4]);
+static void uuid_from_super_ddf(struct supertype *st, int uuid[4]);
+static void _ddf_array_name(char *name, const struct ddf_super *ddf, int i);
+static void getinfo_super_ddf_bvd(struct supertype *st, struct mdinfo *info, char *map);
+static int init_super_ddf_bvd(struct supertype *st,
+ mdu_array_info_t *info,
+ unsigned long long size,
+ char *name, char *homehost,
+ int *uuid, unsigned long long data_offset);
+
+#if DEBUG
+/* Debug helper: print msg followed by the index, state and init_state of
+ * every virtual disk entry whose GUID is not all 0xff.
+ */
+static void pr_state(struct ddf_super *ddf, const char *msg)
+{
+	const unsigned int max_vds = be16_to_cpu(ddf->active->max_vd_entries);
+	unsigned int vd;
+
+	dprintf("%s: ", msg);
+	for (vd = 0; vd < max_vds; vd++) {
+		const struct virtual_entry *ve = &ddf->virt->entries[vd];
+
+		if (!all_ff(ve->guid)) {
+			dprintf_cont("%u(s=%02x i=%02x) ", vd,
+				     ve->state, ve->init_state);
+		}
+	}
+	dprintf_cont("\n");
+}
+#else
+/* Non-debug builds: intentionally empty. */
+static void pr_state(const struct ddf_super *ddf, const char *msg)
+{
+}
+#endif
+
+/* Mark the metadata as needing to be written out.
+ *
+ * If vc is given, its timestamp is refreshed (DDF time is 1980 based,
+ * hence the DECADE adjustment) and its sequence number is bumped on every
+ * call.  The header sequence number is bumped only on the transition from
+ * "no updates pending" to "updates pending".  func is the caller's name,
+ * passed on to pr_state() for debugging.
+ */
+static void _ddf_set_updates_pending(struct ddf_super *ddf, struct vd_config *vc,
+				     const char *func)
+{
+	if (vc != NULL) {
+		__u32 next_seq = be32_to_cpu(vc->seqnum) + 1;
+
+		vc->timestamp = cpu_to_be32(time(0)-DECADE);
+		vc->seqnum = cpu_to_be32(next_seq);
+	}
+
+	if (!ddf->updates_pending) {
+		ddf->updates_pending = 1;
+		ddf->active->seq =
+			cpu_to_be32(be32_to_cpu(ddf->active->seq) + 1);
+		pr_state(ddf, func);
+	}
+}
+
+#define ddf_set_updates_pending(x,v) _ddf_set_updates_pending((x), (v), __func__)
+
+/* Compute the CRC of a DDF section of len bytes starting at buf.
+ *
+ * Every CRC-protected section keeps its crc field at the same offset as
+ * struct ddf_header, so buf is viewed through that type.  The stored crc
+ * is temporarily replaced by 0xffffffff while the checksum is computed
+ * and restored afterwards; buf must therefore be writable.
+ * The result is returned big-endian, ready to compare with or store into
+ * the section's crc field.
+ */
+static be32 calc_crc(void *buf, int len)
+{
+	struct ddf_header *hdr = buf;
+	const be32 saved = hdr->crc;
+	__u32 sum;
+
+	hdr->crc = cpu_to_be32(0xffffffff);
+	sum = crc32(0, buf, len);
+	hdr->crc = saved;
+
+	return cpu_to_be32(sum);
+}
+
+#define DDF_INVALID_LEVEL 0xff
+#define DDF_NO_SECONDARY 0xff
+/* Report an md level/layout/disk-count combination that has no DDF
+ * equivalent.  Always returns -1 so callers can write
+ * "return err_bad_md_layout(array);".
+ */
+static int err_bad_md_layout(const mdu_array_info_t *array)
+{
+	const int rc = -1;
+
+	pr_err("RAID%d layout %x with %d disks is unsupported for DDF\n",
+	       array->level, array->layout, array->raid_disks);
+
+	return rc;
+}
+
+/* Translate md array geometry (level, layout, raid_disks) into the DDF
+ * fields prl, rlq, srl, prim_elmnt_count and sec_elmnt_count of *conf.
+ *
+ * Returns 0 on success.  For a combination with no DDF equivalent the
+ * problem is reported through err_bad_md_layout() and -1 is returned; in
+ * that case *conf is left untouched because the fields are only stored
+ * at the very end.
+ */
+static int layout_md2ddf(const mdu_array_info_t *array,
+ struct vd_config *conf)
+{
+ be16 prim_elmnt_count = cpu_to_be16(array->raid_disks);
+ __u8 prl = DDF_INVALID_LEVEL, rlq = 0;
+ __u8 sec_elmnt_count = 1;
+ __u8 srl = DDF_NO_SECONDARY;
+
+ switch (array->level) {
+ case LEVEL_LINEAR:
+ prl = DDF_CONCAT;
+ break;
+ case 0:
+ rlq = DDF_RAID0_SIMPLE;
+ prl = DDF_RAID0;
+ break;
+ case 1:
+ /* only 2-disk and 3-disk mirrors have a qualifier here */
+ switch (array->raid_disks) {
+ case 2:
+ rlq = DDF_RAID1_SIMPLE;
+ break;
+ case 3:
+ rlq = DDF_RAID1_MULTI;
+ break;
+ default:
+ return err_bad_md_layout(array);
+ }
+ prl = DDF_RAID1;
+ break;
+ case 4:
+ if (array->layout != 0)
+ return err_bad_md_layout(array);
+ rlq = DDF_RAID4_N;
+ prl = DDF_RAID4;
+ break;
+ case 5:
+ switch (array->layout) {
+ case ALGORITHM_LEFT_ASYMMETRIC:
+ rlq = DDF_RAID5_N_RESTART;
+ break;
+ case ALGORITHM_RIGHT_ASYMMETRIC:
+ rlq = DDF_RAID5_0_RESTART;
+ break;
+ case ALGORITHM_LEFT_SYMMETRIC:
+ rlq = DDF_RAID5_N_CONTINUE;
+ break;
+ case ALGORITHM_RIGHT_SYMMETRIC:
+ /* not mentioned in standard */
+ default:
+ return err_bad_md_layout(array);
+ }
+ prl = DDF_RAID5;
+ break;
+ case 6:
+ switch (array->layout) {
+ case ALGORITHM_ROTATING_N_RESTART:
+ rlq = DDF_RAID5_N_RESTART;
+ break;
+ case ALGORITHM_ROTATING_ZERO_RESTART:
+ rlq = DDF_RAID6_0_RESTART;
+ break;
+ case ALGORITHM_ROTATING_N_CONTINUE:
+ rlq = DDF_RAID5_N_CONTINUE;
+ break;
+ default:
+ return err_bad_md_layout(array);
+ }
+ prl = DDF_RAID6;
+ break;
+ case 10:
+ /* The order of these tests matters: layout 0x102 with an even
+ * disk count becomes spanned 2-disk RAID1 sets, and only an odd
+ * disk count falls through to the RAID1E "adjacent" case below.
+ */
+ if (array->raid_disks % 2 == 0 && array->layout == 0x102) {
+ rlq = DDF_RAID1_SIMPLE;
+ prim_elmnt_count = cpu_to_be16(2);
+ sec_elmnt_count = array->raid_disks / 2;
+ srl = DDF_2SPANNED;
+ prl = DDF_RAID1;
+ } else if (array->raid_disks % 3 == 0 &&
+ array->layout == 0x103) {
+ rlq = DDF_RAID1_MULTI;
+ prim_elmnt_count = cpu_to_be16(3);
+ sec_elmnt_count = array->raid_disks / 3;
+ srl = DDF_2SPANNED;
+ prl = DDF_RAID1;
+ } else if (array->layout == 0x201) {
+ prl = DDF_RAID1E;
+ rlq = DDF_RAID1E_OFFSET;
+ } else if (array->layout == 0x102) {
+ prl = DDF_RAID1E;
+ rlq = DDF_RAID1E_ADJACENT;
+ } else
+ return err_bad_md_layout(array);
+ break;
+ default:
+ return err_bad_md_layout(array);
+ }
+ /* all checks passed: commit the result */
+ conf->prl = prl;
+ conf->prim_elmnt_count = prim_elmnt_count;
+ conf->rlq = rlq;
+ conf->srl = srl;
+ conf->sec_elmnt_count = sec_elmnt_count;
+ return 0;
+}
+
+/* Report a DDF level/qualifier/element-count combination that md cannot
+ * represent.  Always returns -1 so callers can write
+ * "return err_bad_ddf_layout(conf);".
+ */
+static int err_bad_ddf_layout(const struct vd_config *conf)
+{
+	const int rc = -1;
+
+	pr_err("DDF RAID %u qualifier %u with %u disks is unsupported\n",
+	       conf->prl, conf->rlq, be16_to_cpu(conf->prim_elmnt_count));
+
+	return rc;
+}
+
+/* Translate the DDF fields prl, rlq, srl and the element counts of *conf
+ * into md geometry (level, layout, raid_disks) in *array.
+ *
+ * Returns 0 on success.  For an unsupported combination an error is
+ * printed and -1 is returned; *array is then left untouched because it is
+ * only written at the 'good' label.
+ */
+static int layout_ddf2md(const struct vd_config *conf,
+ mdu_array_info_t *array)
+{
+ int level = LEVEL_UNSUPPORTED;
+ int layout = 0;
+ int raiddisks = be16_to_cpu(conf->prim_elmnt_count);
+
+ /* More than one secondary element: only RAID1 sets that are striped
+ * or spanned are accepted, and they are presented as md RAID10.
+ */
+ if (conf->sec_elmnt_count > 1) {
+ /* see also check_secondary() */
+ if (conf->prl != DDF_RAID1 ||
+ (conf->srl != DDF_2STRIPED && conf->srl != DDF_2SPANNED)) {
+ pr_err("Unsupported secondary RAID level %u/%u\n",
+ conf->prl, conf->srl);
+ return -1;
+ }
+ if (raiddisks == 2 && conf->rlq == DDF_RAID1_SIMPLE)
+ layout = 0x102;
+ else if (raiddisks == 3 && conf->rlq == DDF_RAID1_MULTI)
+ layout = 0x103;
+ else
+ return err_bad_ddf_layout(conf);
+ /* md counts every disk of every mirror set */
+ raiddisks *= conf->sec_elmnt_count;
+ level = 10;
+ goto good;
+ }
+
+ switch (conf->prl) {
+ case DDF_CONCAT:
+ level = LEVEL_LINEAR;
+ break;
+ case DDF_RAID0:
+ if (conf->rlq != DDF_RAID0_SIMPLE)
+ return err_bad_ddf_layout(conf);
+ level = 0;
+ break;
+ case DDF_RAID1:
+ if (!((conf->rlq == DDF_RAID1_SIMPLE && raiddisks == 2) ||
+ (conf->rlq == DDF_RAID1_MULTI && raiddisks == 3)))
+ return err_bad_ddf_layout(conf);
+ level = 1;
+ break;
+ case DDF_RAID1E:
+ if (conf->rlq == DDF_RAID1E_ADJACENT)
+ layout = 0x102;
+ else if (conf->rlq == DDF_RAID1E_OFFSET)
+ layout = 0x201;
+ else
+ return err_bad_ddf_layout(conf);
+ level = 10;
+ break;
+ case DDF_RAID4:
+ if (conf->rlq != DDF_RAID4_N)
+ return err_bad_ddf_layout(conf);
+ level = 4;
+ break;
+ case DDF_RAID5:
+ switch (conf->rlq) {
+ case DDF_RAID5_N_RESTART:
+ layout = ALGORITHM_LEFT_ASYMMETRIC;
+ break;
+ case DDF_RAID5_0_RESTART:
+ layout = ALGORITHM_RIGHT_ASYMMETRIC;
+ break;
+ case DDF_RAID5_N_CONTINUE:
+ layout = ALGORITHM_LEFT_SYMMETRIC;
+ break;
+ default:
+ return err_bad_ddf_layout(conf);
+ }
+ level = 5;
+ break;
+ case DDF_RAID6:
+ switch (conf->rlq) {
+ case DDF_RAID5_N_RESTART:
+ layout = ALGORITHM_ROTATING_N_RESTART;
+ break;
+ case DDF_RAID6_0_RESTART:
+ layout = ALGORITHM_ROTATING_ZERO_RESTART;
+ break;
+ case DDF_RAID5_N_CONTINUE:
+ layout = ALGORITHM_ROTATING_N_CONTINUE;
+ break;
+ default:
+ return err_bad_ddf_layout(conf);
+ }
+ level = 6;
+ break;
+ default:
+ return err_bad_ddf_layout(conf);
+ };
+
+good:
+ /* all checks passed: commit the result */
+ array->level = level;
+ array->layout = layout;
+ array->raid_disks = raiddisks;
+ return 0;
+}
+
+/* Read a primary or secondary DDF header and check it against the anchor.
+ *
+ * fd     - device to read from
+ * lba    - sector at which the header is expected
+ * size   - device size in sectors; lba must lie below size-1
+ * type   - header type the block must carry (compared with hdr->type)
+ * hdr    - 512-byte destination
+ * anchor - anchor header already loaded from the device
+ *
+ * The header must carry the DDF magic and a valid CRC, must agree with
+ * the anchor on guid, revision and both header LBAs, and every byte from
+ * pad2 to the end of the block must equal the anchor's.
+ * Returns 1 if the header is acceptable, 0 otherwise.
+ */
+static int load_ddf_header(int fd, unsigned long long lba,
+			   unsigned long long size,
+			   int type,
+			   struct ddf_header *hdr, struct ddf_header *anchor)
+{
+	int matches_anchor;
+
+	if (lba >= size-1)
+		return 0;
+
+	/* seek is absolute (whence 0), then read exactly one block */
+	if (lseek64(fd, lba<<9, 0) < 0 || read(fd, hdr, 512) != 512)
+		return 0;
+
+	if (!be32_eq(hdr->magic, DDF_HEADER_MAGIC)) {
+		pr_err("bad header magic\n");
+		return 0;
+	}
+	if (!be32_eq(calc_crc(hdr, 512), hdr->crc)) {
+		pr_err("bad CRC\n");
+		return 0;
+	}
+
+	matches_anchor =
+		memcmp(anchor->guid, hdr->guid, DDF_GUID_LEN) == 0 &&
+		memcmp(anchor->revision, hdr->revision, 8) == 0 &&
+		be64_eq(anchor->primary_lba, hdr->primary_lba) &&
+		be64_eq(anchor->secondary_lba, hdr->secondary_lba) &&
+		hdr->type == type &&
+		memcmp(anchor->pad2, hdr->pad2, 512 -
+		       offsetof(struct ddf_header, pad2)) == 0;
+	if (!matches_anchor) {
+		pr_err("header mismatch\n");
+		return 0;
+	}
+
+	/* Looks good enough to me... */
+	return 1;
+}
+
+/* Read one metadata section into memory.
+ *
+ * fd        - device to read from
+ * super     - supplies the active header, whose LBA is the base for offset_be
+ * buf       - destination, or NULL to have a 512-byte-aligned buffer
+ *             allocated here (the caller then owns and frees it)
+ * offset_be - section offset in sectors, relative to the active header
+ * len_be    - section length in sectors
+ * check     - if non-zero, the length must be one of 2, 8, 32, 128, 512
+ *
+ * Lengths above 1024 sectors are always rejected.  When buf is supplied
+ * by the caller, the caller must guarantee that it can hold the whole
+ * section; no other bound is enforced here.
+ * Returns the buffer on success, NULL on any failure (a buffer allocated
+ * here is freed first).
+ */
+static void *load_section(int fd, struct ddf_super *super, void *buf,
+			  be32 offset_be, be32 len_be, int check)
+{
+	unsigned long long start = be32_to_cpu(offset_be);
+	unsigned long long nsect = be32_to_cpu(len_be);
+	const int own_buf = (buf == NULL);
+
+	if (check) {
+		switch (nsect) {
+		case 2:
+		case 8:
+		case 32:
+		case 128:
+		case 512:
+			break;
+		default:
+			return NULL;
+		}
+	}
+	if (nsect > 1024)
+		return NULL;
+
+	if (own_buf && posix_memalign(&buf, 512, nsect<<9) != 0)
+		buf = NULL;
+	if (buf == NULL)
+		return NULL;
+
+	/* section offsets are relative to whichever header is active */
+	start += (super->active->type == DDF_HEADER_PRIMARY)
+		? be64_to_cpu(super->active->primary_lba)
+		: be64_to_cpu(super->active->secondary_lba);
+
+	if ((unsigned long long)lseek64(fd, start<<9, 0) != (start<<9))
+		goto fail;
+	if ((unsigned long long)read(fd, buf, nsect<<9) != (nsect<<9))
+		goto fail;
+	return buf;
+
+fail:
+	if (own_buf)
+		free(buf);
+	return NULL;
+}
+
+/* Load the anchor block from the last 512 bytes of the device, then the
+ * primary and secondary headers it points to, and select the active one.
+ *
+ * The secondary header is preferred when the primary could not be loaded,
+ * when it has a higher sequence number and is not marked open, or when
+ * the sequence numbers are equal and only the primary is marked open.
+ *
+ * devname is used only for error messages and may be NULL to stay quiet.
+ *
+ * Returns 0 on success, 1 if the anchor block could not be located or
+ * read, 2 if no usable DDF metadata was found.
+ */
+static int load_ddf_headers(int fd, struct ddf_super *super, char *devname)
+{
+	/* Start from 0 so that a failed size query is caught by the check
+	 * below instead of seeking to an indeterminate offset.
+	 */
+	unsigned long long dsize = 0;
+
+	get_dev_size(fd, NULL, &dsize);
+
+	/* The anchor lives in the last 512 bytes; without this check
+	 * dsize-512 would wrap around on a tiny or unsized device.
+	 */
+	if (dsize < 512) {
+		if (devname)
+			pr_err("Cannot locate anchor block on %s: device size unknown or too small\n",
+			       devname);
+		return 1;
+	}
+
+	if (lseek64(fd, dsize-512, 0) < 0) {
+		if (devname)
+			pr_err("Cannot seek to anchor block on %s: %s\n",
+			       devname, strerror(errno));
+		return 1;
+	}
+	if (read(fd, &super->anchor, 512) != 512) {
+		if (devname)
+			pr_err("Cannot read anchor block on %s: %s\n",
+			       devname, strerror(errno));
+		return 1;
+	}
+	if (!be32_eq(super->anchor.magic, DDF_HEADER_MAGIC)) {
+		if (devname)
+			pr_err("no DDF anchor found on %s\n",
+				devname);
+		return 2;
+	}
+	if (!be32_eq(calc_crc(&super->anchor, 512), super->anchor.crc)) {
+		if (devname)
+			pr_err("bad CRC on anchor on %s\n",
+				devname);
+		return 2;
+	}
+	if (memcmp(super->anchor.revision, DDF_REVISION_0, 8) != 0 &&
+	    memcmp(super->anchor.revision, DDF_REVISION_2, 8) != 0) {
+		if (devname)
+			pr_err("can only support super revision %.8s and earlier, not %.8s on %s\n",
+				DDF_REVISION_2, super->anchor.revision,devname);
+		return 2;
+	}
+	super->active = NULL;
+	if (load_ddf_header(fd, be64_to_cpu(super->anchor.primary_lba),
+			    dsize >> 9, 1,
+			    &super->primary, &super->anchor) == 0) {
+		if (devname)
+			pr_err("Failed to load primary DDF header on %s\n", devname);
+	} else
+		super->active = &super->primary;
+
+	if (load_ddf_header(fd, be64_to_cpu(super->anchor.secondary_lba),
+			    dsize >> 9, 2,
+			    &super->secondary, &super->anchor)) {
+		if (super->active == NULL ||
+		    (be32_to_cpu(super->primary.seq)
+		     < be32_to_cpu(super->secondary.seq) &&
+			!super->secondary.openflag) ||
+		    (be32_to_cpu(super->primary.seq) ==
+		     be32_to_cpu(super->secondary.seq) &&
+			super->primary.openflag && !super->secondary.openflag))
+			super->active = &super->secondary;
+	} else if (devname &&
+		   be64_to_cpu(super->anchor.secondary_lba) != ~(__u64)0)
+		/* an all-ones LBA is not reported as a load failure */
+		pr_err("Failed to load secondary DDF header on %s\n",
+		       devname);
+	if (super->active == NULL)
+		return 2;
+	return 0;
+}
+
+/* Load the global sections (controller data, physical disk records and
+ * virtual disk records) described by the active header, and cache a few
+ * header values in *super.
+ *
+ * devname is currently unused.
+ *
+ * Returns 0 on success.  If any section cannot be loaded, the phys and
+ * virt buffers are freed, both pointers are set to NULL and 2 is
+ * returned.
+ */
+static int load_ddf_global(int fd, struct ddf_super *super, char *devname)
+{
+	void *ok = NULL;
+
+	/* The controller section is read into the fixed-size
+	 * super->controller, and load_section() only caps the length at
+	 * 1024 sectors.  Refuse an on-disk length that would not fit
+	 * rather than letting the read overrun the structure.
+	 */
+	if (be32_to_cpu(super->active->controller_section_length) <=
+	    sizeof(super->controller) / 512)
+		ok = load_section(fd, super, &super->controller,
+				  super->active->controller_section_offset,
+				  super->active->controller_section_length,
+				  0);
+	super->phys = load_section(fd, super, NULL,
+				   super->active->phys_section_offset,
+				   super->active->phys_section_length,
+				   1);
+	super->pdsize = be32_to_cpu(super->active->phys_section_length) * 512;
+
+	super->virt = load_section(fd, super, NULL,
+				   super->active->virt_section_offset,
+				   super->active->virt_section_length,
+				   1);
+	super->vdsize = be32_to_cpu(super->active->virt_section_length) * 512;
+	if (!ok ||
+	    !super->phys ||
+	    !super->virt) {
+		free(super->phys);
+		free(super->virt);
+		super->phys = NULL;
+		super->virt = NULL;
+		return 2;
+	}
+	super->conflist = NULL;
+	super->dlist = NULL;
+
+	super->max_part = be16_to_cpu(super->active->max_partitions);
+	super->mppe = be16_to_cpu(super->active->max_primary_element_entries);
+	super->conf_rec_len = be16_to_cpu(super->active->config_record_len);
+	return 0;
+}
+
+#define DDF_UNUSED_BVD 0xff
+/*
+ * Allocate room for the config records of all secondary elements
+ * (BVDs) of 'vcl' other than the one held in vcl->conf.
+ *
+ * One 512-byte aligned buffer holds sec_elmnt_count-1 records of
+ * conf_rec_len*512 bytes each, followed by the array of pointers to
+ * those records (vcl->other_bvds).  other_bvds[0] is therefore the
+ * start of the allocation.  Every record is zeroed and its
+ * sec_elmnt_seq set to DDF_UNUSED_BVD.
+ *
+ * Returns 0 on success (other_bvds is NULL when there is only one
+ * element), -1 on an invalid element count or allocation failure.
+ */
+static int alloc_other_bvds(const struct ddf_super *ddf, struct vcl *vcl)
+{
+	unsigned int n_vds, i, vdsize;
+	size_t per_bvd;
+	void *mem;
+	char *p;
+
+	if (vcl->conf.sec_elmnt_count == 0)
+		/* Corrupt record: "count - 1" would wrap to UINT_MAX */
+		return -1;
+	n_vds = vcl->conf.sec_elmnt_count - 1;
+	if (n_vds == 0) {
+		vcl->other_bvds = NULL;
+		return 0;
+	}
+	vdsize = ddf->conf_rec_len * 512;
+	per_bvd = (size_t)vdsize + sizeof(struct vd_config *);
+	if (per_bvd > (size_t)-1 / n_vds)
+		/* total size does not fit in size_t */
+		return -1;
+	if (posix_memalign(&mem, 512, n_vds * per_bvd) != 0)
+		return -1;
+	p = mem;
+	/* The pointer array sits right behind the last record */
+	vcl->other_bvds = (struct vd_config **)(p + (size_t)n_vds * vdsize);
+	for (i = 0; i < n_vds; i++) {
+		vcl->other_bvds[i] =
+			(struct vd_config *)(p + (size_t)i * vdsize);
+		memset(vcl->other_bvds[i], 0, vdsize);
+		vcl->other_bvds[i]->sec_elmnt_seq = DDF_UNUSED_BVD;
+	}
+	return 0;
+}
+
+/*
+ * Store the config record 'vd' ('len' bytes) in vcl->other_bvds.
+ *
+ * If a slot already holds a record with the same sec_elmnt_seq, it is
+ * replaced only when 'vd' has a higher seqnum.  Otherwise the first
+ * slot still marked DDF_UNUSED_BVD is used; if none is left an error
+ * is reported and the record is dropped.
+ */
+static void add_other_bvd(struct vcl *vcl, struct vd_config *vd,
+			  unsigned int len)
+{
+	int n_other = vcl->conf.sec_elmnt_count - 1;
+	int slot;
+
+	/* Look for a slot already assigned to this sequence number */
+	slot = 0;
+	while (slot < n_other &&
+	       vcl->other_bvds[slot]->sec_elmnt_seq != vd->sec_elmnt_seq)
+		slot++;
+
+	if (slot < n_other) {
+		/* Known element: keep whichever record is newer */
+		if (be32_to_cpu(vd->seqnum) <=
+		    be32_to_cpu(vcl->other_bvds[slot]->seqnum))
+			return;
+	} else {
+		/* New element: take the first free slot */
+		slot = 0;
+		while (slot < n_other &&
+		       vcl->other_bvds[slot]->sec_elmnt_seq != DDF_UNUSED_BVD)
+			slot++;
+		if (slot == n_other) {
+			pr_err("no space for sec level config %u, count is %u\n",
+			       vd->sec_elmnt_seq, vcl->conf.sec_elmnt_count);
+			return;
+		}
+	}
+	memcpy(vcl->other_bvds[slot], vd, len);
+}
+
+/*
+ * Load the metadata that is specific to the device open as 'fd': its
+ * disk data section and the config records stored on it.
+ *
+ * A new 'struct dl' describing the device is put at the head of
+ * super->dlist.  Every VD config record found is merged into
+ * super->conflist (for a given GUID the record with the highest
+ * seqnum is kept; records of other secondary elements go to
+ * vcl->other_bvds) and referenced from dl->vlist.  The first spare
+ * assignment record is copied to dl->spare.  If 'keep' is non-zero,
+ * fd is remembered in dl->fd, otherwise dl->fd is -1.
+ *
+ * Returns 0 on success, 1 on failure.  After a failure 'super' may
+ * still reference buffers allocated here (dlist, conflist, conf).
+ */
+static int load_ddf_local(int fd, struct ddf_super *super,
+			  char *devname, int keep)
+{
+	struct dl *dl;
+	struct stat stb;
+	char *conf;
+	unsigned int i;
+	unsigned int confsec;
+	unsigned int conf_len;
+	int vnum;
+	unsigned int max_virt_disks =
+		be16_to_cpu(super->active->max_vd_entries);
+	unsigned long long dsize;
+
+	/* First the local disk info */
+	if (posix_memalign((void**)&dl, 512,
+			   sizeof(*dl) +
+			   (super->max_part) * sizeof(dl->vlist[0])) != 0) {
+		pr_err("could not allocate disk info buffer\n");
+		return 1;
+	}
+
+	/* As in load_ddf_global(), a NULL result means the section
+	 * could not be read; dl->disk would be uninitialised. */
+	if (load_section(fd, super, &dl->disk,
+			 super->active->data_section_offset,
+			 super->active->data_section_length,
+			 0) == NULL) {
+		pr_err("could not load disk data section\n");
+		free(dl);
+		return 1;
+	}
+	if (fstat(fd, &stb) != 0) {
+		pr_err("could not stat device\n");
+		free(dl);
+		return 1;
+	}
+	dl->devname = devname ? xstrdup(devname) : NULL;
+
+	dl->major = major(stb.st_rdev);
+	dl->minor = minor(stb.st_rdev);
+	dl->next = super->dlist;
+	dl->fd = keep ? fd : -1;
+
+	dl->size = 0;
+	if (get_dev_size(fd, devname, &dsize))
+		dl->size = dsize >> 9;
+	/* If the disks have different sizes, the LBAs will differ
+	 * between phys disks.
+	 * At this point here, the values in super->active must be valid
+	 * for this phys disk. */
+	dl->primary_lba = super->active->primary_lba;
+	dl->secondary_lba = super->active->secondary_lba;
+	dl->workspace_lba = super->active->workspace_lba;
+	dl->spare = NULL;
+	for (i = 0 ; i < super->max_part ; i++)
+		dl->vlist[i] = NULL;
+	super->dlist = dl;
+	dl->pdnum = -1;
+	for (i = 0; i < be16_to_cpu(super->active->max_pd_entries); i++)
+		if (memcmp(super->phys->entries[i].guid,
+			   dl->disk.guid, DDF_GUID_LEN) == 0)
+			dl->pdnum = i;
+
+	/* Now the config list. */
+	/* 'conf' is an array of config entries, some of which are
+	 * probably invalid.  Those which are good need to be copied into
+	 * the conflist
+	 */
+
+	conf = load_section(fd, super, super->conf,
+			    super->active->config_section_offset,
+			    super->active->config_section_length,
+			    0);
+	super->conf = conf;
+	conf_len = be32_to_cpu(super->active->config_section_length);
+	if (conf_len != 0 && conf == NULL) {
+		/* The loop below would dereference a NULL section */
+		pr_err("could not load config section\n");
+		return 1;
+	}
+	if (conf_len != 0 && super->conf_rec_len == 0) {
+		/* A zero record length would never advance the loop */
+		pr_err("invalid config record length\n");
+		return 1;
+	}
+	vnum = 0;
+	/* Only visit records that lie completely inside the section */
+	for (confsec = 0;
+	     confsec < conf_len &&
+		     conf_len - confsec >= super->conf_rec_len;
+	     confsec += super->conf_rec_len) {
+		struct vd_config *vd =
+			(struct vd_config *)((char*)conf +
+					     (size_t)confsec*512);
+		struct vcl *vcl;
+
+		if (be32_eq(vd->magic, DDF_SPARE_ASSIGN_MAGIC)) {
+			if (dl->spare)
+				continue;
+			if (posix_memalign((void**)&dl->spare, 512,
+					   super->conf_rec_len*512) != 0) {
+				pr_err("could not allocate spare info buf\n");
+				return 1;
+			}
+
+			memcpy(dl->spare, vd, super->conf_rec_len*512);
+			continue;
+		}
+		if (!be32_eq(vd->magic, DDF_VD_CONF_MAGIC))
+			/* Must be vendor-unique - I cannot handle those */
+			continue;
+
+		for (vcl = super->conflist; vcl; vcl = vcl->next) {
+			if (memcmp(vcl->conf.guid,
+				   vd->guid, DDF_GUID_LEN) == 0)
+				break;
+		}
+
+		if (vcl) {
+			/* dl->vlist only has max_part slots */
+			if ((unsigned int)vnum < super->max_part)
+				dl->vlist[vnum++] = vcl;
+			else
+				pr_err("too many config records, ignoring extra reference\n");
+			if (vcl->other_bvds != NULL &&
+			    vcl->conf.sec_elmnt_seq != vd->sec_elmnt_seq) {
+				add_other_bvd(vcl, vd, super->conf_rec_len*512);
+				continue;
+			}
+			if (be32_to_cpu(vd->seqnum) <=
+			    be32_to_cpu(vcl->conf.seqnum))
+				continue;
+		} else {
+			if (posix_memalign((void**)&vcl, 512,
+					   (super->conf_rec_len*512 +
+					    offsetof(struct vcl, conf))) != 0) {
+				pr_err("could not allocate vcl buf\n");
+				return 1;
+			}
+			vcl->next = super->conflist;
+			vcl->block_sizes = NULL; /* FIXME not for CONCAT */
+			vcl->conf.sec_elmnt_count = vd->sec_elmnt_count;
+			if (alloc_other_bvds(super, vcl) != 0) {
+				pr_err("could not allocate other bvds\n");
+				free(vcl);
+				return 1;
+			}
+			super->conflist = vcl;
+			if ((unsigned int)vnum < super->max_part)
+				dl->vlist[vnum++] = vcl;
+			else
+				pr_err("too many config records, ignoring extra reference\n");
+		}
+		memcpy(&vcl->conf, vd, super->conf_rec_len*512);
+		/* Remember which virtual disk entry this config describes */
+		for (i=0; i < max_virt_disks ; i++)
+			if (memcmp(super->virt->entries[i].guid,
+				   vcl->conf.guid, DDF_GUID_LEN)==0)
+				break;
+		if (i < max_virt_disks)
+			vcl->vcnum = i;
+	}
+
+	return 0;
+}
+
+/*
+ * Load the DDF metadata found on the device open as 'fd' into st->sb,
+ * replacing whatever st->sb held before.
+ *
+ * The device must be a whole disk (not a partition), larger than 32MB
+ * and a multiple of 512 bytes.  'devname' is only used for messages
+ * and may be NULL, which silences most of them.
+ *
+ * Returns 0 on success.  Returns 1 if the device is unsuitable, on
+ * allocation failure or if the per-device sections cannot be loaded;
+ * otherwise the non-zero result of the header/global section loaders
+ * is passed on.  On failure st->sb is left NULL.
+ */
+static int load_super_ddf(struct supertype *st, int fd,
+			  char *devname)
+{
+	unsigned long long dsize;
+	struct ddf_super *super;
+	int rv;
+
+	if (get_dev_size(fd, devname, &dsize) == 0)
+		return 1;
+
+	if (test_partition(fd))
+		/* DDF is not allowed on partitions */
+		return 1;
+
+	/* 32M is a lower bound */
+	if (dsize <= 32*1024*1024) {
+		if (devname)
+			pr_err("%s is too small for ddf: size is %llu sectors.\n",
+			       devname, dsize>>9);
+		return 1;
+	}
+	if (dsize & 511) {
+		if (devname)
+			pr_err("%s is an odd size for ddf: size is %llu bytes.\n",
+			       devname, dsize);
+		return 1;
+	}
+
+	free_super_ddf(st);
+
+	if (posix_memalign((void**)&super, 512, sizeof(*super))!= 0) {
+		pr_err("malloc of %zu failed.\n",
+		       sizeof(*super));
+		return 1;
+	}
+	memset(super, 0, sizeof(*super));
+
+	rv = load_ddf_headers(fd, super, devname);
+	if (rv) {
+		free(super);
+		return rv;
+	}
+
+	/* Have valid headers and have chosen the best. Let's read in the rest*/
+
+	rv = load_ddf_global(fd, super, devname);
+
+	if (rv) {
+		if (devname)
+			pr_err("Failed to load all information sections on %s\n", devname);
+		/* load_ddf_global() released its own buffers already */
+		free(super);
+		return rv;
+	}
+
+	rv = load_ddf_local(fd, super, devname, 0);
+
+	if (rv) {
+		if (devname)
+			pr_err("Failed to load all information sections on %s\n", devname);
+		/* 'super' owns phys/virt by now and possibly conf, dlist
+		 * and conflist entries as well; free(super) alone would
+		 * leak all of them.  free_super_ddf() releases the lot
+		 * and resets st->sb to NULL. */
+		st->sb = super;
+		free_super_ddf(st);
+		return rv;
+	}
+
+	/* Should possibly check the sections .... */
+
+	st->sb = super;
+	if (st->ss == NULL) {
+		st->ss = &super_ddf;
+		st->minor_version = 0;
+		st->max_devs = 512;
+	}
+	return 0;
+
+}
+
+/*
+ * Release everything hanging off st->sb (section buffers, the config
+ * list, the disk list and the add list), closing any file descriptor
+ * a disk entry still holds, then free the superblock itself and set
+ * st->sb to NULL.  Does nothing if st->sb is already NULL.
+ */
+static void free_super_ddf(struct supertype *st)
+{
+	struct ddf_super *ddf = st->sb;
+	struct vcl *v, *v_next;
+	struct dl *d, *d_next;
+
+	if (!ddf)
+		return;
+
+	free(ddf->phys);
+	free(ddf->virt);
+	free(ddf->conf);
+
+	for (v = ddf->conflist; v != NULL; v = v_next) {
+		v_next = v->next;
+		free(v->block_sizes);
+		/* other_bvds[0] is the start of the buffer that also
+		 * contains the pointer array, see alloc_other_bvds() */
+		if (v->other_bvds)
+			free(v->other_bvds[0]);
+		free(v);
+	}
+	ddf->conflist = NULL;
+
+	for (d = ddf->dlist; d != NULL; d = d_next) {
+		d_next = d->next;
+		if (d->fd >= 0)
+			close(d->fd);
+		free(d->spare);
+		free(d);
+	}
+	ddf->dlist = NULL;
+
+	for (d = ddf->add_list; d != NULL; d = d_next) {
+		d_next = d->next;
+		if (d->fd >= 0)
+			close(d->fd);
+		free(d->spare);
+		free(d);
+	}
+	ddf->add_list = NULL;
+
+	free(ddf);
+	st->sb = NULL;
+}
+
+/*
+ * Return a freshly allocated supertype for the DDF handler if 'arg'
+ * is "ddf" or "default", NULL for any other metadata name.
+ * 'ddf' only supports containers.
+ */
+static struct supertype *match_metadata_desc_ddf(char *arg)
+{
+	struct supertype *st;
+
+	if (strcmp(arg, "ddf") == 0 || strcmp(arg, "default") == 0) {
+		st = xcalloc(1, sizeof(*st));
+		st->ss = &super_ddf;
+		st->max_devs = 512;
+		st->minor_version = 0;
+		st->sb = NULL;
+		return st;
+	}
+	return NULL;
+}
+
+/* Names for the low three bits of a virtual entry's 'state' field
+ * (looked up with "ve->state & 7" in examine_vds()). */
+static mapping_t ddf_state[] = {
+ { "Optimal", 0},
+ { "Degraded", 1},
+ { "Deleted", 2},
+ { "Missing", 3},
+ { "Failed", 4},
+ { "Partially Optimal", 5},
+ { "-reserved-", 6},
+ { "-reserved-", 7},
+ { NULL, 0}
+};
+
+/* Names for the DDF_initstate_mask part of a virtual entry's
+ * 'init_state' field. */
+static mapping_t ddf_init_state[] = {
+ { "Not Initialised", 0},
+ { "QuickInit in Progress", 1},
+ { "Fully Initialised", 2},
+ { "*UNKNOWN*", 3},
+ { NULL, 0}
+};
+/* Names for the DDF_access_mask part of 'init_state', after it has
+ * been shifted down by 6 bits. */
+static mapping_t ddf_access[] = {
+ { "Read/Write", 0},
+ { "Reserved", 1},
+ { "Read Only", 2},
+ { "Blocked (no access)", 3},
+ { NULL ,0}
+};
+
+/* Names for the primary RAID level of a config record (vc->prl). */
+static mapping_t ddf_level[] = {
+ { "RAID0", DDF_RAID0},
+ { "RAID1", DDF_RAID1},
+ { "RAID3", DDF_RAID3},
+ { "RAID4", DDF_RAID4},
+ { "RAID5", DDF_RAID5},
+ { "RAID1E",DDF_RAID1E},
+ { "JBOD", DDF_JBOD},
+ { "CONCAT",DDF_CONCAT},
+ { "RAID5E",DDF_RAID5E},
+ { "RAID5EE",DDF_RAID5EE},
+ { "RAID6", DDF_RAID6},
+ { NULL, 0}
+};
+/* Names for the secondary RAID level of a config record (vc->srl). */
+static mapping_t ddf_sec_level[] = {
+ { "Striped", DDF_2STRIPED},
+ { "Mirrored", DDF_2MIRRORED},
+ { "Concat", DDF_2CONCAT},
+ { "Spanned", DDF_2SPANNED},
+ { NULL, 0}
+};
+
+/*
+ * Return 1 if all DDF_GUID_LEN bytes of 'guid' are 0xff, 0 otherwise.
+ * Callers in this file skip virtual entries for which this is true.
+ */
+static int all_ff(const char *guid)
+{
+	const char *p = guid;
+	const char *end = guid + DDF_GUID_LEN;
+
+	while (p < end) {
+		if (*p != (char)0xff)
+			return 0;
+		p++;
+	}
+	return 1;
+}
+
+/*
+ * Format a GUID for messages: printable ASCII bytes (32..126) are
+ * copied as they are, every other byte becomes two lower-case hex
+ * digits.  The result lives in a static buffer that is overwritten
+ * by the next call.
+ */
+static const char *guid_str(const char *guid)
+{
+	static char buf[DDF_GUID_LEN*2+1];
+	char *out = buf;
+	int pos;
+
+	for (pos = 0; pos < DDF_GUID_LEN; pos++) {
+		unsigned char byte = guid[pos];
+
+		if (byte < 32 || byte >= 127)
+			out += sprintf(out, "%02x", byte);
+		else
+			*out++ = (char)byte;
+	}
+	*out = '\0';
+	return (const char *) buf;
+}
+
+/*
+ * Print a GUID to stdout.
+ *
+ * GUIDs are part (or all) ASCII and part binary, and tend to be
+ * space padded.  The GUID is printed in hex, in groups of four
+ * bytes separated by ':'.  Then, in parentheses, comes any leading
+ * printable ASCII sequence (trailing spaces ignored) and, if
+ * 'tstamp' is non-zero, the time stamp held big-endian in bytes
+ * 16-19, offset by DECADE.
+ */
+static void print_guid(char *guid, int tstamp)
+{
+	int l = DDF_GUID_LEN;
+	int i;
+
+	for (i=0 ; i<DDF_GUID_LEN ; i++) {
+		if ((i&3)==0 && i != 0) printf(":");
+		printf("%02X", guid[i]&255);
+	}
+
+	printf("\n (");
+	while (l && guid[l-1] == ' ')
+		l--;
+	for (i=0 ; i<l ; i++) {
+		if (guid[i] >= 0x20 && guid[i] < 0x7f)
+			fputc(guid[i], stdout);
+		else
+			break;
+	}
+	if (tstamp) {
+		__u32 raw;
+		time_t then;
+		char tbuf[100];
+		struct tm *tm;
+
+		/* Copy the bytes out rather than casting the char
+		 * pointer: the GUID need not be aligned for a __u32 */
+		memcpy(&raw, guid + 16, sizeof(raw));
+		then = __be32_to_cpu(raw) + DECADE;
+		tm = localtime(&then);
+		/* localtime() can fail and strftime() returns 0 when
+		 * nothing usable was written; print no stamp then */
+		if (tm && strftime(tbuf, sizeof(tbuf), " %D %T", tm) > 0)
+			fputs(tbuf, stdout);
+	}
+	printf(")");
+}
+
+/*
+ * Print the details of virtual disk number 'n' (identified by
+ * 'guid') for every config record in sb->conflist that carries this
+ * GUID and has a valid CRC: member disks with their offsets, chunk
+ * size, RAID level(s) and sizes.
+ */
+static void examine_vd(int n, struct ddf_super *sb, char *guid)
+{
+	int rec_bytes = sb->conf_rec_len * 512;
+	struct vcl *entry;
+
+	for (entry = sb->conflist ; entry ; entry = entry->next) {
+		struct vd_config *vc = &entry->conf;
+		unsigned int member;
+		int n_prim;
+
+		if (!be32_eq(calc_crc(vc, rec_bytes), vc->crc))
+			continue;
+		if (memcmp(vc->guid, guid, DDF_GUID_LEN) != 0)
+			continue;
+
+		/* Ok, we know about this VD, let's give more details */
+		n_prim = be16_to_cpu(vc->prim_elmnt_count);
+		printf(" Raid Devices[%d] : %d (", n, n_prim);
+		for (member = 0; member < (unsigned int)n_prim; member++) {
+			int n_pd = be16_to_cpu(sb->phys->max_pdes);
+			int pd = 0;
+
+			/* Find the physical disk entry with this refnum */
+			while (pd < n_pd &&
+			       !be32_eq(vc->phys_refnum[member],
+					sb->phys->entries[pd].refnum))
+				pd++;
+
+			if (member)
+				printf(" ");
+			if (pd < n_pd)
+				printf("%d", pd);
+			else
+				printf("--");
+			printf("@%lluK", (unsigned long long) be64_to_cpu(LBA_OFFSET(sb, vc)[member])/2);
+		}
+		printf(")\n");
+		if (vc->chunk_shift != 255)
+			printf(" Chunk Size[%d] : %d sectors\n", n,
+			       1 << vc->chunk_shift);
+		printf(" Raid Level[%d] : %s\n", n,
+		       map_num(ddf_level, vc->prl)?:"-unknown-");
+		if (vc->sec_elmnt_count != 1) {
+			printf(" Secondary Position[%d] : %d of %d\n", n,
+			       vc->sec_elmnt_seq, vc->sec_elmnt_count);
+			printf(" Secondary Level[%d] : %s\n", n,
+			       map_num(ddf_sec_level, vc->srl) ?: "-unknown-");
+		}
+		printf(" Device Size[%d] : %llu\n", n,
+		       be64_to_cpu(vc->blocks)/2);
+		printf(" Array Size[%d] : %llu\n", n,
+		       be64_to_cpu(vc->array_blocks)/2);
+	}
+}
+
+/*
+ * Print the number of populated virtual disks, followed by GUID,
+ * unit, state, init state, access mode, name and config details of
+ * every virtual entry whose GUID is not all 0xff.
+ */
+static void examine_vds(struct ddf_super *sb)
+{
+	int populated = be16_to_cpu(sb->virt->populated_vdes);
+	unsigned int max_vdes = be16_to_cpu(sb->virt->max_vdes);
+	unsigned int i;
+
+	printf(" Virtual Disks : %d\n", populated);
+
+	for (i = 0; i < max_vdes; i++) {
+		struct virtual_entry *ve = &sb->virt->entries[i];
+		const char *morphing;
+		const char *consistency;
+
+		if (all_ff(ve->guid))
+			continue;
+
+		morphing = (ve->state & DDF_state_morphing) ? "Morphing, ": "";
+		consistency = (ve->state & DDF_state_inconsistent) ?
+			"Not Consistent" : "Consistent";
+
+		printf("\n");
+		printf(" VD GUID[%d] : ", i);
+		print_guid(ve->guid, 1);
+		printf("\n");
+		printf(" unit[%d] : %d\n", i, be16_to_cpu(ve->unit));
+		printf(" state[%d] : %s, %s%s\n", i,
+		       map_num(ddf_state, ve->state & 7),
+		       morphing, consistency);
+		printf(" init state[%d] : %s\n", i,
+		       map_num(ddf_init_state, ve->init_state&DDF_initstate_mask));
+		printf(" access[%d] : %s\n", i,
+		       map_num(ddf_access, (ve->init_state & DDF_access_mask) >> 6));
+		printf(" Name[%d] : %.16s\n", i, ve->name);
+		examine_vd(i, sb, ve->guid);
+	}
+	if (populated)
+		printf("\n");
+}
+
+/*
+ * Print one line per physical disk entry that is in use (refnum is
+ * not 0xffffffff): index, refnum, configured size, the device node
+ * if the disk is in sb->dlist, and its type and state flags.
+ * Disks in sb->dlist that did not get printed this way are listed
+ * afterwards as "not in metadata".
+ */
+static void examine_pds(struct ddf_super *sb)
+{
+ int cnt = be16_to_cpu(sb->phys->max_pdes);
+ int i;
+ struct dl *dl;
+ int unlisted = 0;
+ printf(" Physical Disks : %d\n", cnt);
+ printf(" Number RefNo Size Device Type/State\n");
+
+ /* 'displayed' records which dlist entries the table below covered */
+ for (dl = sb->dlist; dl; dl = dl->next)
+ dl->displayed = 0;
+
+ for (i=0 ; i<cnt ; i++) {
+ struct phys_disk_entry *pd = &sb->phys->entries[i];
+ int type = be16_to_cpu(pd->type);
+ int state = be16_to_cpu(pd->state);
+
+ if (be32_to_cpu(pd->refnum) == 0xffffffff)
+ /* Not in use */
+ continue;
+ //printf(" PD GUID[%d] : ", i); print_guid(pd->guid, 0);
+ //printf("\n");
+ printf(" %3d %08x ", i,
+ be32_to_cpu(pd->refnum));
+ printf("%8lluK ",
+ be64_to_cpu(pd->config_size)>>1);
+ /* Find a loaded disk with this refnum whose device node is known;
+ * dl is left NULL if there is none */
+ for (dl = sb->dlist; dl ; dl = dl->next) {
+ if (be32_eq(dl->disk.refnum, pd->refnum)) {
+ char *dv = map_dev(dl->major, dl->minor, 0);
+ if (dv) {
+ printf("%-15s", dv);
+ break;
+ }
+ }
+ }
+ if (!dl)
+ printf("%15s","");
+ else
+ dl->displayed = 1;
+ printf(" %s%s%s%s%s",
+ (type&2) ? "active":"",
+ (type&4) ? "Global-Spare":"",
+ (type&8) ? "spare" : "",
+ (type&16)? ", foreign" : "",
+ (type&32)? "pass-through" : "");
+ if (state & DDF_Failed)
+ /* This over-rides these three */
+ state &= ~(DDF_Online|DDF_Rebuilding|DDF_Transition);
+ printf("/%s%s%s%s%s%s%s",
+ (state&1)? "Online": "Offline",
+ (state&2)? ", Failed": "",
+ (state&4)? ", Rebuilding": "",
+ (state&8)? ", in-transition": "",
+ (state&16)? ", SMART-errors": "",
+ (state&32)? ", Unrecovered-Read-Errors": "",
+ (state&64)? ", Missing" : "");
+ printf("\n");
+ }
+ /* Report loaded disks that the table above did not show */
+ for (dl = sb->dlist; dl; dl = dl->next) {
+ char *dv;
+ if (dl->displayed)
+ continue;
+ if (!unlisted)
+ printf(" Physical disks not in metadata!:\n");
+ unlisted = 1;
+ dv = map_dev(dl->major, dl->minor, 0);
+ printf(" %08x %s\n", be32_to_cpu(dl->disk.refnum),
+ dv ? dv : "-unknown-");
+ }
+ if (unlisted)
+ printf("\n");
+}
+
+/*
+ * Print a full description of the loaded DDF metadata: anchor magic
+ * and revision, controller and container GUIDs, sequence number of
+ * the active header, whether the secondary header carries the DDF
+ * header magic, then all virtual and physical disks.
+ * 'homehost' is not used.
+ */
+static void examine_super_ddf(struct supertype *st, char *homehost)
+{
+	struct ddf_super *sb = st->sb;
+	const char *redundant;
+
+	printf(" Magic : %08x\n", be32_to_cpu(sb->anchor.magic));
+	printf(" Version : %.8s\n", sb->anchor.revision);
+
+	printf("Controller GUID : ");
+	print_guid(sb->controller.guid, 0);
+	printf("\n");
+
+	printf(" Container GUID : ");
+	print_guid(sb->anchor.guid, 1);
+	printf("\n");
+
+	printf(" Seq : %08x\n", be32_to_cpu(sb->active->seq));
+
+	if (be32_eq(sb->secondary.magic, DDF_HEADER_MAGIC))
+		redundant = "yes";
+	else
+		redundant = "no";
+	printf(" Redundant hdr : %s\n", redundant);
+
+	examine_vds(sb);
+	examine_pds(sb);
+}
+
+/*
+ * Figure out the VD number for this supertype.
+ *
+ * Returns DDF_CONTAINER for the container itself (st->container_devnm
+ * is empty), the member number parsed from the array's sysfs text
+ * version ("/<container>/<member>") for a subarray, and DDF_NOTFOUND
+ * on error: sysfs cannot be read, the array is not an external
+ * metadata subarray, the member part is missing, empty or not a
+ * plain decimal number, or it is not below max_vd_entries.
+ */
+static unsigned int get_vd_num_of_subarray(struct supertype *st)
+{
+	struct ddf_super *ddf = st->sb;
+	struct mdinfo *sra;
+	char *sub, *end;
+	unsigned long member;
+	unsigned int vcnum = DDF_NOTFOUND;
+
+	if (*st->container_devnm == '\0')
+		return DDF_CONTAINER;
+
+	sra = sysfs_read(-1, st->devnm, GET_VERSION);
+	if (!sra)
+		return DDF_NOTFOUND;
+	if (sra->array.major_version != -1 ||
+	    sra->array.minor_version != -2 ||
+	    !is_subarray(sra->text_version))
+		goto out;
+
+	sub = strchr(sra->text_version + 1, '/');
+	if (sub == NULL)
+		goto out;
+	member = strtoul(sub + 1, &end, 10);
+	/* "end == sub + 1" means no digits were consumed at all.  The
+	 * range is checked on the untruncated value. */
+	if (end == sub + 1 || *end != '\0' ||
+	    member >= be16_to_cpu(ddf->active->max_vd_entries))
+		goto out;
+
+	vcnum = member;
+out:
+	/* sysfs_read() hands over an allocated mdinfo; release it on
+	 * every path, but only once 'end' is no longer needed. */
+	sysfs_free(sra);
+	return vcnum;
+}
+
+/*
+ * Print a generic one-line DDF ARRAY entry that identifies the
+ * metadata by UUID only.  'verbose' is not used.
+ */
+static void brief_examine_super_ddf(struct supertype *st, int verbose)
+{
+	struct mdinfo info;
+	char uuid_text[64];
+	const char *uuid_only;
+
+	getinfo_super_ddf(st, &info, NULL);
+	fname_from_uuid(st, &info, uuid_text, ':');
+
+	/* The first five characters of the formatted name are skipped */
+	uuid_only = uuid_text + 5;
+	printf("ARRAY metadata=ddf UUID=%s\n", uuid_only);
+}
+
+/*
+ * Print a DDF ARRAY member entry for each virtual disk, identifying
+ * the container by uuid and the member by number and uuid.
+ *
+ * ddf->currentconf is pointed at a temporary config while the member
+ * uuids are computed, and restored to its previous value before
+ * returning.  'verbose' is not used.
+ */
+static void brief_examine_subarrays_ddf(struct supertype *st, int verbose)
+{
+	struct ddf_super *ddf = st->sb;
+	struct vcl *saved_conf = ddf->currentconf;
+	struct mdinfo info;
+	unsigned int i;
+	char nbuf[64];
+	getinfo_super_ddf(st, &info, NULL);
+	fname_from_uuid(st, &info, nbuf, ':');
+
+	for (i = 0; i < be16_to_cpu(ddf->virt->max_vdes); i++) {
+		struct virtual_entry *ve = &ddf->virt->entries[i];
+		struct vcl vcl;
+		char nbuf1[64];
+		char namebuf[17];
+		if (all_ff(ve->guid))
+			continue;
+		/* uuid_from_super_ddf() selects the subarray through
+		 * ddf->currentconf, so give it a stand-in config */
+		memcpy(vcl.conf.guid, ve->guid, DDF_GUID_LEN);
+		ddf->currentconf =&vcl;
+		vcl.vcnum = i;
+		uuid_from_super_ddf(st, info.uuid);
+		fname_from_uuid(st, &info, nbuf1, ':');
+		_ddf_array_name(namebuf, ddf, i);
+		printf("ARRAY%s%s container=%s member=%d UUID=%s\n",
+		       namebuf[0] == '\0' ? "" : " /dev/md/", namebuf,
+		       nbuf+5, i, nbuf1+5);
+	}
+	/* 'vcl' lives on the stack: never leave currentconf pointing
+	 * at it once this function has returned */
+	ddf->currentconf = saved_conf;
+}
+
+/*
+ * Print the metadata description as KEY=value lines: metadata type,
+ * level, UUID and the number of used physical disk entries.
+ */
+static void export_examine_super_ddf(struct supertype *st)
+{
+	struct ddf_super *ddf = st->sb;
+	struct mdinfo info;
+	char uuid_text[64];
+
+	getinfo_super_ddf(st, &info, NULL);
+	fname_from_uuid(st, &info, uuid_text, ':');
+
+	printf("MD_METADATA=ddf\n");
+	printf("MD_LEVEL=container\n");
+	printf("MD_UUID=%s\n", uuid_text+5);
+	printf("MD_DEVICES=%u\n", be16_to_cpu(ddf->phys->used_pdes));
+}
+
+/*
+ * Copy the DDF metadata from device 'from' to the same offsets on
+ * device 'to'.
+ *
+ * The metadata consists of an anchor, a primary and a secondary
+ * header, all living at the end of the device.  The anchor is in the
+ * last 512 bytes and contains primary_lba and secondary_lba, so it is
+ * easiest to find the earliest of anchor, primary and secondary and
+ * copy everything from there to the end of the device.
+ *
+ * Returns 0 on success, 1 on any failure (allocation, I/O, or an
+ * anchor with bad magic, CRC or revision).  'st' is not used.
+ */
+static int copy_metadata_ddf(struct supertype *st, int from, int to)
+{
+	void *buf;
+	unsigned long long dsize, offset;
+	/* 64-bit counters: an 'int' would truncate, and possibly turn
+	 * negative, for a metadata area of 2GB or more, in which case
+	 * nothing was copied and success was still reported. */
+	unsigned long long bytes, written = 0;
+	struct ddf_header *ddf;
+
+	if (posix_memalign(&buf, 4096, 4096) != 0)
+		return 1;
+
+	if (!get_dev_size(from, NULL, &dsize))
+		goto err;
+	if (dsize < 512)
+		/* no room for an anchor; dsize-512 would wrap */
+		goto err;
+
+	if (lseek64(from, dsize-512, SEEK_SET) < 0)
+		goto err;
+	if (read(from, buf, 512) != 512)
+		goto err;
+	ddf = buf;
+	if (!be32_eq(ddf->magic, DDF_HEADER_MAGIC) ||
+	    !be32_eq(calc_crc(ddf, 512), ddf->crc) ||
+	    (memcmp(ddf->revision, DDF_REVISION_0, 8) != 0 &&
+	     memcmp(ddf->revision, DDF_REVISION_2, 8) != 0))
+		goto err;
+
+	/* Start at whichever of anchor, primary, secondary comes first */
+	offset = dsize - 512;
+	if ((be64_to_cpu(ddf->primary_lba) << 9) < offset)
+		offset = be64_to_cpu(ddf->primary_lba) << 9;
+	if ((be64_to_cpu(ddf->secondary_lba) << 9) < offset)
+		offset = be64_to_cpu(ddf->secondary_lba) << 9;
+
+	bytes = dsize - offset;
+
+	if (lseek64(from, offset, SEEK_SET) < 0 ||
+	    lseek64(to, offset, SEEK_SET) < 0)
+		goto err;
+	/* 'buf' is reused from here on; 'ddf' must not be read again */
+	while (written < bytes) {
+		unsigned long long left = bytes - written;
+		int n = left > 4096 ? 4096 : (int)left;
+
+		if (read(from, buf, n) != n)
+			goto err;
+		if (write(to, buf, n) != n)
+			goto err;
+		written += n;
+	}
+	free(buf);
+	return 0;
+err:
+	free(buf);
+	return 1;
+}
+
+/*
+ * Print the container GUID, the sequence number of the active header
+ * and the number of populated virtual disks.
+ * 'homehost' and 'subarray' are not used.
+ */
+static void detail_super_ddf(struct supertype *st, char *homehost,
+			     char *subarray)
+{
+	struct ddf_super *sb = st->sb;
+	int populated = be16_to_cpu(sb->virt->populated_vdes);
+
+	printf(" Container GUID : ");
+	print_guid(sb->anchor.guid, 1);
+	printf("\n");
+
+	printf(" Seq : %08x\n", be32_to_cpu(sb->active->seq));
+	printf(" Virtual Disks : %d\n", populated);
+	printf("\n");
+}
+
+/*
+ * Vendor ids of controllers whose VD GUIDs cannot be used to identify
+ * an array.  Each entry is compared against the first 8 bytes of the
+ * controller GUID, so it must be exactly 8 characters long, padded
+ * with spaces.
+ */
+static const char *vendors_with_variable_volume_UUID[] = {
+	"LSI     ",
+};
+
+/*
+ * Return 0 if the controller GUID of 'ddf' starts with one of the
+ * vendor ids listed above, 1 otherwise.
+ */
+static int volume_id_is_reliable(const struct ddf_super *ddf)
+{
+	int n = ARRAY_SIZE(vendors_with_variable_volume_UUID);
+	int i;
+	for (i = 0; i < n; i++)
+		if (!memcmp(ddf->controller.guid,
+			    vendors_with_variable_volume_UUID[i], 8))
+			return 0;
+	return 1;
+}
+
+/*
+ * Compute the uuid of subarray number 'vcnum'.
+ *
+ * When the VD GUID is reliable (see volume_id_is_reliable()) the uuid
+ * is derived from that GUID alone.
+ *
+ * Some fake RAID BIOSes (in particular, LSI ones) change the VD GUID
+ * at every boot.  These GUIDs are not suitable for identifying an
+ * array.  Luckily the header GUID appears to remain constant, so for
+ * those a pseudo-UUID is built from the first 16 bytes of the SHA1 of
+ * the header GUID, the VD name and the VD number (stored as a 16-bit
+ * value in host byte order).
+ */
+static void uuid_of_ddf_subarray(const struct ddf_super *ddf,
+				 unsigned int vcnum, int uuid[4])
+{
+	char buf[DDF_GUID_LEN+18], sha[20], *p;
+	struct sha1_ctx ctx;
+	__u16 vcnum16;
+
+	if (volume_id_is_reliable(ddf)) {
+		uuid_from_ddf_guid(ddf->virt->entries[vcnum].guid, uuid);
+		return;
+	}
+	memset(buf, 0, sizeof(buf));
+	p = buf;
+	memcpy(p, ddf->anchor.guid, DDF_GUID_LEN);
+	p += DDF_GUID_LEN;
+	memcpy(p, ddf->virt->entries[vcnum].name, 16);
+	p += 16;
+	/* Same bytes as a direct 16-bit store, without writing through
+	 * a cast pointer into the (possibly unaligned) char buffer */
+	vcnum16 = vcnum;
+	memcpy(p, &vcnum16, sizeof(vcnum16));
+	sha1_init_ctx(&ctx);
+	sha1_process_bytes(buf, sizeof(buf), &ctx);
+	sha1_finish_ctx(&ctx, sha);
+	memcpy(uuid, sha, 4*4);
+}
+
+/*
+ * Print " UUID=..." for the container or subarray that 'st' refers
+ * to.  Prints nothing if the subarray number cannot be determined.
+ * 'subarray' is not used.
+ */
+static void brief_detail_super_ddf(struct supertype *st, char *subarray)
+{
+	struct ddf_super *ddf = st->sb;
+	struct mdinfo info;
+	char uuid_text[64];
+	unsigned int vcnum = get_vd_num_of_subarray(st);
+
+	if (vcnum != DDF_CONTAINER) {
+		if (vcnum == DDF_NOTFOUND)
+			return;
+		uuid_of_ddf_subarray(ddf, vcnum, info.uuid);
+	} else {
+		uuid_from_super_ddf(st, info.uuid);
+	}
+
+	fname_from_uuid(st, &info, uuid_text,':');
+	printf(" UUID=%s", uuid_text + 5);
+}
+
+/*
+ * Return 1 if the metadata belongs to 'this' host, 0 otherwise.
+ *
+ * It matches if the controller GUID starts with the 8 bytes of T10
+ * and the controller's vendor_data holds exactly 'homehost' followed
+ * by a NUL.  It would be nice if we could test against a controller
+ * found in /sys or somewhere...
+ * A NULL 'homehost' never matches.
+ */
+static int match_home_ddf(struct supertype *st, char *homehost)
+{
+	struct ddf_super *ddf = st->sb;
+	unsigned int host_len;
+
+	if (!homehost)
+		return 0;
+	host_len = strlen(homehost);
+
+	if (memcmp(ddf->controller.guid, T10, 8) != 0)
+		return 0;
+	/* The name and its terminating NUL must fit in vendor_data */
+	if (host_len >= sizeof(ddf->controller.vendor_data))
+		return 0;
+	if (memcmp(ddf->controller.vendor_data, homehost, host_len) != 0)
+		return 0;
+	return ddf->controller.vendor_data[host_len] == 0;
+}
+
+/*
+ * Find the index of the n-th valid physical disk in this BVD.
+ * Unused entries (refnum 0xffffffff) can be sprinkled in with the
+ * used entries, but don't count.
+ *
+ * Returns 1 with the phys_refnum index stored in *n_bvd, or 0 if
+ * fewer than n+1 used entries are found within the first ddf->mppe
+ * slots and prim_elmnt_count used entries.
+ */
+static int find_index_in_bvd(const struct ddf_super *ddf,
+			     const struct vd_config *conf, unsigned int n,
+			     unsigned int *n_bvd)
+{
+	unsigned int n_prim = be16_to_cpu(conf->prim_elmnt_count);
+	unsigned int slot;
+	unsigned int used = 0;
+
+	for (slot = 0; slot < ddf->mppe && used < n_prim; slot++) {
+		if (be32_to_cpu(conf->phys_refnum[slot]) == 0xffffffff)
+			continue;
+		if (used == n) {
+			*n_bvd = slot;
+			return 1;
+		}
+		used++;
+	}
+	dprintf("couldn't find BVD member %u (total %u)\n",
+		n, n_prim);
+	return 0;
+}
+
+/* Given a member array instance number, and a raid disk within that instance,
+ * find the vd_config structure.  The offset of the given disk in the phys_refnum
+ * table is returned in n_bvd.
+ * For two-level members with a secondary raid level the vd_config for
+ * the appropriate BVD is returned; raid disk n lives in BVD number
+ * n / prim_elmnt_count, at position n % prim_elmnt_count among the
+ * used entries of that BVD.
+ * On success the vcl that owns the returned config is stored in *vcl.
+ * Returns NULL (after printing an error) if the disk cannot be found.
+ */
+static struct vd_config *find_vdcr(struct ddf_super *ddf, unsigned int inst,
+				   unsigned int n,
+				   unsigned int *n_bvd, struct vcl **vcl)
+{
+	struct vcl *v;
+
+	for (v = ddf->conflist; v; v = v->next) {
+		unsigned int nsec, n_prim, ibvd = 0;
+		struct vd_config *conf;
+		if (inst != v->vcnum)
+			continue;
+		conf = &v->conf;
+		if (conf->sec_elmnt_count == 1) {
+			if (find_index_in_bvd(ddf, conf, n, n_bvd)) {
+				*vcl = v;
+				return conf;
+			} else
+				goto bad;
+		}
+		if (v->other_bvds == NULL) {
+			pr_err("BUG: other_bvds is NULL, nsec=%u\n",
+			       conf->sec_elmnt_count);
+			goto bad;
+		}
+		n_prim = be16_to_cpu(conf->prim_elmnt_count);
+		if (n_prim == 0)
+			/* corrupt record, avoid dividing by zero */
+			goto bad;
+		nsec = n / n_prim;
+		if (conf->sec_elmnt_seq != nsec) {
+			for (ibvd = 1; ibvd < conf->sec_elmnt_count; ibvd++) {
+				if (v->other_bvds[ibvd-1]->sec_elmnt_seq ==
+				    nsec)
+					break;
+			}
+			if (ibvd == conf->sec_elmnt_count)
+				goto bad;
+			conf = v->other_bvds[ibvd-1];
+		}
+		/* Position inside the BVD: the raid disk number is
+		 * sec_elmnt_seq * prim_elmnt_count + index, so the
+		 * BVD's base must be scaled by the primary element
+		 * count, not by the number of BVDs. */
+		if (!find_index_in_bvd(ddf, conf,
+				       n - nsec*n_prim, n_bvd))
+			goto bad;
+		dprintf("found disk %u as member %u in bvd %d of array %u\n",
+			n, *n_bvd, ibvd, inst);
+		*vcl = v;
+		return conf;
+	}
+bad:
+	pr_err("Couldn't find disk %d in array %u\n", n, inst);
+	return NULL;
+}
+
+/*
+ * Find the entry in the physical disk table which has the given
+ * refnum and return its index, or -1 if there is none.
+ */
+static int find_phys(const struct ddf_super *ddf, be32 phys_refnum)
+{
+	unsigned int idx = 0;
+
+	while (idx < be16_to_cpu(ddf->phys->max_pdes)) {
+		if (be32_eq(ddf->phys->entries[idx].refnum, phys_refnum))
+			return idx;
+		idx++;
+	}
+	return -1;
+}
+
+/*
+ * Derive a 16-byte uuid from a DDF GUID: the first 16 bytes of the
+ * SHA1 digest of the DDF_GUID_LEN GUID bytes.
+ */
+static void uuid_from_ddf_guid(const char *guid, int uuid[4])
+{
+	struct sha1_ctx sha;
+	char digest[20];
+
+	sha1_init_ctx(&sha);
+	sha1_process_bytes(guid, DDF_GUID_LEN, &sha);
+	sha1_finish_ctx(&sha, digest);
+
+	memcpy(uuid, digest, 16);
+}
+
+/*
+ * Return the uuid for whatever 'st' currently describes.
+ *
+ * The uuid returned here is used for:
+ *  - the uuid to put into a bitmap file (Create, Grow)
+ *  - the uuid for the backup header when saving a critical section
+ *    (Grow)
+ *  - comparing uuids when re-adding a device into an array
+ *    In these cases the uuid required is that of the data-array,
+ *    not the device-set.
+ *  - recognising the same set when adding a missing device back to
+ *    an array.  This is a uuid for the device-set.
+ *
+ * For each of these a truncated or hashed uuid will do instead of
+ * the original, as long as everyone agrees.
+ *
+ * When a config is selected (ddf->currentconf), the subarray uuid of
+ * that config's VD number is returned; otherwise the uuid is derived
+ * from the anchor (container) GUID.
+ */
+static void uuid_from_super_ddf(struct supertype *st, int uuid[4])
+{
+	struct ddf_super *ddf = st->sb;
+	struct vcl *current = ddf->currentconf;
+
+	if (current == NULL) {
+		uuid_from_ddf_guid(ddf->anchor.guid, uuid);
+		return;
+	}
+	uuid_of_ddf_subarray(ddf, current->vcnum, uuid);
+}
+
+/*
+ * Fill 'info' with a description of the container, or of the selected
+ * subarray when ddf->currentconf is set (handled by
+ * getinfo_super_ddf_bvd()).
+ *
+ * info->array.raid_disks is read on entry, before 'info' is cleared,
+ * and gives the number of entries in 'map'.  If 'map' is not NULL,
+ * map[i] is set to 1 when the i-th in-use physical disk entry exists,
+ * i is below the reported raid_disks and the entry is not failed;
+ * otherwise to 0.
+ */
+static void getinfo_super_ddf(struct supertype *st, struct mdinfo *info, char *map)
+{
+	struct ddf_super *ddf = st->sb;
+	int map_disks = info->array.raid_disks;
+	__u32 *cptr;
+
+	if (ddf->currentconf) {
+		getinfo_super_ddf_bvd(st, info, map);
+		return;
+	}
+	memset(info, 0, sizeof(*info));
+
+	info->array.raid_disks = be16_to_cpu(ddf->phys->used_pdes);
+	info->array.level = LEVEL_CONTAINER;
+	info->array.layout = 0;
+	info->array.md_minor = -1;
+	/* creation time stamp: bytes 16-19 of the GUID, big-endian */
+	cptr = (__u32 *)(ddf->anchor.guid + 16);
+	info->array.ctime = DECADE + __be32_to_cpu(*cptr);
+
+	info->array.chunk_size = 0;
+	info->container_enough = 1;
+
+	info->disk.major = 0;
+	info->disk.minor = 0;
+	if (ddf->dlist) {
+		struct phys_disk_entry *pde = NULL;
+		info->disk.number = be32_to_cpu(ddf->dlist->disk.refnum);
+		info->disk.raid_disk = find_phys(ddf, ddf->dlist->disk.refnum);
+
+		/* find_phys() returns -1 for a disk that is not in the
+		 * physical disk table; only index the table when the
+		 * lookup succeeded.  data_offset stays 0 otherwise. */
+		if (info->disk.raid_disk >= 0) {
+			pde = ddf->phys->entries + info->disk.raid_disk;
+			info->data_offset = be64_to_cpu(pde->config_size);
+		}
+		info->component_size = ddf->dlist->size - info->data_offset;
+		if (pde &&
+		    !(be16_to_cpu(pde->state) & DDF_Failed) &&
+		    !(be16_to_cpu(pde->state) & DDF_Missing))
+			info->disk.state = (1 << MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE);
+		else
+			info->disk.state = 1 << MD_DISK_FAULTY;
+
+	} else {
+		/* There should always be a dlist, but just in case...*/
+		info->disk.number = -1;
+		info->disk.raid_disk = -1;
+		info->disk.state = (1 << MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE);
+	}
+	info->events = be32_to_cpu(ddf->active->seq);
+	info->array.utime = DECADE + be32_to_cpu(ddf->active->timestamp);
+
+	info->recovery_start = MaxSector;
+	info->reshape_active = 0;
+	info->recovery_blocked = 0;
+	info->name[0] = 0;
+
+	info->array.major_version = -1;
+	info->array.minor_version = -2;
+	strcpy(info->text_version, "ddf");
+	info->safe_mode_delay = 0;
+
+	uuid_from_super_ddf(st, info->uuid);
+
+	if (map) {
+		int i, e = 0;
+		int max = be16_to_cpu(ddf->phys->max_pdes);
+		for (i = e = 0 ; i < map_disks ; i++, e++) {
+			/* skip physical disk entries that are not in use */
+			while (e < max &&
+			       be32_to_cpu(ddf->phys->entries[e].refnum) == 0xffffffff)
+				e++;
+			if (i < info->array.raid_disks && e < max &&
+			    !(be16_to_cpu(ddf->phys->entries[e].state) &
+			      DDF_Failed))
+				map[i] = 1;
+			else
+				map[i] = 0;
+		}
+	}
+}
+
+/*
+ * Copy the 16-byte name of virtual entry 'i' into 'name', turning
+ * every space into a NUL and adding a terminating NUL.
+ * The size of 'name' must be at least 17 bytes!
+ */
+static void _ddf_array_name(char *name, const struct ddf_super *ddf, int i)
+{
+	const char *src = ddf->virt->entries[i].name;
+	int pos;
+
+	for (pos = 0; pos < 16; pos++)
+		name[pos] = (src[pos] == ' ') ? 0 : src[pos];
+	name[16] = 0;
+}
+
+/*
+ * Fill 'info' with a description of the subarray selected by
+ * ddf->currentconf, as seen from the member slot ddf->currentdev.
+ *
+ * For arrays with more than one secondary element the slot number is
+ * split into a BVD (slot / prim_elmnt_count) and a position within
+ * that BVD (slot % prim_elmnt_count).
+ *
+ * info->array.raid_disks is read on entry, before 'info' is cleared,
+ * and gives the number of entries in 'map'.  If 'map' is not NULL,
+ * map[j] is set to 1 when the disk named by vc->conf.phys_refnum[j]
+ * is known, Online and not Failed, and to 0 otherwise.
+ * If the layout cannot be converted, 'info' is left zeroed and 'map'
+ * is not touched.
+ */
+static void getinfo_super_ddf_bvd(struct supertype *st, struct mdinfo *info, char *map)
+{
+	struct ddf_super *ddf = st->sb;
+	struct vcl *vc = ddf->currentconf;
+	int cd = ddf->currentdev;
+	int n_prim;
+	int j;
+	struct dl *dl = NULL;
+	int map_disks = info->array.raid_disks;
+	__u32 *cptr;
+	struct vd_config *conf;
+
+	memset(info, 0, sizeof(*info));
+	if (layout_ddf2md(&vc->conf, &info->array) == -1)
+		return;
+	info->array.md_minor = -1;
+	/* creation time stamp: bytes 16-19 of the GUID, big-endian */
+	cptr = (__u32 *)(vc->conf.guid + 16);
+	info->array.ctime = DECADE + __be32_to_cpu(*cptr);
+	info->array.utime = DECADE + be32_to_cpu(vc->conf.timestamp);
+	info->array.chunk_size = 512 << vc->conf.chunk_shift;
+	info->custom_array_size = be64_to_cpu(vc->conf.array_blocks);
+
+	conf = &vc->conf;
+	n_prim = be16_to_cpu(conf->prim_elmnt_count);
+	if (conf->sec_elmnt_count > 1 && cd >= n_prim) {
+		/* The slot belongs to one of the other BVDs */
+		int ibvd = cd / n_prim - 1;
+		cd %= n_prim;
+		conf = vc->other_bvds[ibvd];
+	}
+
+	if (cd >= 0 && (unsigned)cd < ddf->mppe) {
+		info->data_offset =
+			be64_to_cpu(LBA_OFFSET(ddf, conf)[cd]);
+		if (vc->block_sizes)
+			info->component_size = vc->block_sizes[cd];
+		else
+			info->component_size = be64_to_cpu(conf->blocks);
+
+		for (dl = ddf->dlist; dl ; dl = dl->next)
+			if (be32_eq(dl->disk.refnum, conf->phys_refnum[cd]))
+				break;
+	}
+
+	info->disk.major = 0;
+	info->disk.minor = 0;
+	info->disk.state = 0;
+	if (dl && dl->pdnum >= 0) {
+		info->disk.major = dl->major;
+		info->disk.minor = dl->minor;
+		info->disk.raid_disk = cd + conf->sec_elmnt_seq
+			* be16_to_cpu(conf->prim_elmnt_count);
+		info->disk.number = dl->pdnum;
+		info->disk.state = 0;
+		if (info->disk.number >= 0 &&
+		    (be16_to_cpu(ddf->phys->entries[info->disk.number].state) & DDF_Online) &&
+		    !(be16_to_cpu(ddf->phys->entries[info->disk.number].state) & DDF_Failed))
+			info->disk.state = (1<<MD_DISK_SYNC)|(1<<MD_DISK_ACTIVE);
+		info->events = be32_to_cpu(ddf->active->seq);
+	}
+
+	info->container_member = ddf->currentconf->vcnum;
+
+	info->recovery_start = MaxSector;
+	info->resync_start = 0;
+	info->reshape_active = 0;
+	info->recovery_blocked = 0;
+	/* A consistent, fully initialised VD needs no resync */
+	if (!(ddf->virt->entries[info->container_member].state &
+	      DDF_state_inconsistent) &&
+	    (ddf->virt->entries[info->container_member].init_state &
+	     DDF_initstate_mask) == DDF_init_full)
+		info->resync_start = MaxSector;
+
+	uuid_from_super_ddf(st, info->uuid);
+
+	info->array.major_version = -1;
+	info->array.minor_version = -2;
+	sprintf(info->text_version, "/%s/%d",
+		st->container_devnm,
+		info->container_member);
+	info->safe_mode_delay = DDF_SAFE_MODE_DELAY;
+
+	_ddf_array_name(info->name, ddf, info->container_member);
+
+	if (map)
+		for (j = 0; j < map_disks; j++) {
+			map[j] = 0;
+			if (j < info->array.raid_disks) {
+				int i = find_phys(ddf, vc->conf.phys_refnum[j]);
+				/* 'i' indexes the physical disk table,
+				 * 'j' the raid slot.  The map is
+				 * indexed by slot: writing map[i]
+				 * could land outside the map and was
+				 * reset again when j reached i. */
+				if (i >= 0 &&
+				    (be16_to_cpu(ddf->phys->entries[i].state)
+				     & DDF_Online) &&
+				    !(be16_to_cpu(ddf->phys->entries[i].state)
+				      & DDF_Failed))
+					map[j] = 1;
+			}
+		}
+}
+
+/*
+ * Apply the metadata update named by 'update'.
+ *
+ * For 'assemble' and 'force' the caller needs a non-zero return if
+ * any change was made; for other updates the return value is ignored.
+ *
+ * Nothing is modified for DDF at present.  There is no need to handle
+ * "force-*" or "assemble" specially as there is no need to 'trick'
+ * the kernel: when the metadata is first updated to activate the
+ * array, all the implied modifications will just happen.
+ *
+ *  "grow"               accepted, not implemented (FIXME)
+ *  "resync"             accepted, nothing recorded
+ *  "_reshape_progress"  accepted, reshape is not supported yet
+ *  "assemble"           accepted, nothing to do
+ *  "homehost", "name"   rejected (would live in the controller's
+ *                       vendor_data and the virtual entry's name)
+ *  anything else        rejected
+ *
+ * Returns 0 for accepted updates and -1 for rejected ones.
+ * All parameters other than 'update' are unused.
+ */
+static int update_super_ddf(struct supertype *st, struct mdinfo *info,
+			    char *update,
+			    char *devname, int verbose,
+			    int uuid_set, char *homehost)
+{
+	static const char *const accepted[] = {
+		"grow",
+		"resync",
+		"_reshape_progress",
+		"assemble",
+	};
+	unsigned int k;
+
+	for (k = 0; k < sizeof(accepted) / sizeof(accepted[0]); k++)
+		if (strcmp(update, accepted[k]) == 0)
+			return 0;
+
+	/* "homehost", "name" and every unknown update */
+	return -1;
+}
+
+static void make_header_guid(char *guid)
+{
+	/* Build the 24 byte GUID used for a DDF header or a virtual disk.
+	 *
+	 * Layout written into 'guid':
+	 *   bytes  0-7 : vendor id, taken from T10
+	 *   bytes  8-15: controller type, 0xDEADBEEF followed by zero
+	 *   bytes 16-19: timestamp, time(0) - DECADE, big-endian
+	 *   bytes 20-23: value from random32()
+	 */
+	be32 ctrl_hi = cpu_to_be32(0xdeadbeef);
+	be32 ctrl_lo = cpu_to_be32(0);
+	be32 when = cpu_to_be32(time(0) - DECADE);
+	be32 noise;
+
+	noise._v32 = random32();
+
+	/* The vendor id copy must come first: it copies sizeof(T10) bytes
+	 * and anything beyond byte 7 is overwritten below.
+	 */
+	memcpy(guid, T10, sizeof(T10));
+	memcpy(guid+8, &ctrl_hi, 4);
+	memcpy(guid+12, &ctrl_lo, 4);
+	memcpy(guid+16, &when, 4);
+	memcpy(guid+20, &noise, 4);
+}
+
+static unsigned int find_unused_vde(const struct ddf_super *ddf)
+{
+	/* Return the index of the first virtual disk entry whose GUID is
+	 * all 0xff (i.e. unused), or DDF_NOTFOUND if every entry is taken.
+	 */
+	unsigned int slot = 0;
+
+	while (slot < be16_to_cpu(ddf->virt->max_vdes)) {
+		if (all_ff(ddf->virt->entries[slot].guid))
+			return slot;
+		slot++;
+	}
+	return DDF_NOTFOUND;
+}
+
+static unsigned int find_vde_by_name(const struct ddf_super *ddf,
+				     const char *name)
+{
+	/* Return the index of the in-use virtual disk entry whose name
+	 * field matches 'name' (compared over the size of the name field),
+	 * or DDF_NOTFOUND.  A NULL name never matches.
+	 */
+	unsigned int slot;
+
+	if (!name)
+		return DDF_NOTFOUND;
+
+	for (slot = 0; slot < be16_to_cpu(ddf->virt->max_vdes); slot++) {
+		/* entries with an all-0xff GUID are unused */
+		if (!all_ff(ddf->virt->entries[slot].guid) &&
+		    strncmp(name, ddf->virt->entries[slot].name,
+			    sizeof(ddf->virt->entries[slot].name)) == 0)
+			return slot;
+	}
+	return DDF_NOTFOUND;
+}
+
+static unsigned int find_vde_by_guid(const struct ddf_super *ddf,
+				     const char *guid)
+{
+	/* Return the index of the virtual disk entry carrying 'guid', or
+	 * DDF_NOTFOUND.  A NULL or all-0xff guid is never looked up.
+	 */
+	unsigned int slot;
+
+	if (!guid)
+		return DDF_NOTFOUND;
+	if (all_ff(guid))
+		return DDF_NOTFOUND;
+
+	slot = 0;
+	while (slot < be16_to_cpu(ddf->virt->max_vdes)) {
+		if (memcmp(ddf->virt->entries[slot].guid, guid,
+			   DDF_GUID_LEN) == 0)
+			return slot;
+		slot++;
+	}
+	return DDF_NOTFOUND;
+}
+
+static int init_super_ddf(struct supertype *st,
+			  mdu_array_info_t *info,
+			  struct shape *s, char *name, char *homehost,
+			  int *uuid, unsigned long long data_offset)
+{
+	/* This is primarily called by Create when creating a new array.
+	 * We will then get add_to_super called for each component, and then
+	 * write_init_super called to write it out to each device.
+	 * For DDF, Create can create on fresh devices or on a pre-existing
+	 * array.
+	 * To create on a pre-existing array a different method will be called.
+	 * This one is just for fresh drives.
+	 *
+	 * We need to create the entire 'ddf' structure which includes:
+	 *  DDF headers - these are easy.
+	 *  Controller data - a Sector describing this controller .. not that
+	 *                    this is a controller exactly.
+	 *  Physical Disk Record - one entry per device, so
+	 *                    leave plenty of space.
+	 *  Virtual Disk Records - again, just leave plenty of space.
+	 *                    This just lists VDs, doesn't give details.
+	 *  Config records - describe the VDs that use this disk
+	 *  DiskData  - describes 'this' device.
+	 *  BadBlockManagement - empty
+	 *  Diag Space - empty
+	 *  Vendor Logs - Could we put bitmaps here?
+	 *
+	 * Returns 1 when a fresh container superblock was built, 0 on
+	 * allocation failure or when info is NULL (in which case st->sb is
+	 * left pointing at a zeroed superblock).  When st->sb is already
+	 * set the call is forwarded to init_super_ddf_bvd().
+	 * On a later allocation failure st->sb keeps pointing at the
+	 * partially initialised superblock.
+	 */
+	struct ddf_super *ddf;
+	char hostname[17];
+	int hostlen;
+	int max_phys_disks, max_virt_disks;
+	unsigned long long sector;
+	int clen;
+	int i;
+	int pdsize, vdsize;
+	struct phys_disk *pd;
+	struct virtual_disk *vd;
+
+	if (st->sb)
+		return init_super_ddf_bvd(st, info, s->size, name, homehost, uuid,
+					  data_offset);
+
+	if (posix_memalign((void**)&ddf, 512, sizeof(*ddf)) != 0) {
+		pr_err("could not allocate superblock\n");
+		return 0;
+	}
+	memset(ddf, 0, sizeof(*ddf));
+	st->sb = ddf;
+
+	if (info == NULL) {
+		/* zeroing superblock */
+		return 0;
+	}
+
+	/* At least 32MB *must* be reserved for the ddf.  So let's just
+	 * start 32MB from the end, and put the primary header there.
+	 * Don't do secondary for now.
+	 * We don't know exactly where that will be yet as it could be
+	 * different on each device.  So just set up the lengths.
+	 */
+
+	ddf->anchor.magic = DDF_HEADER_MAGIC;
+	make_header_guid(ddf->anchor.guid);
+
+	memcpy(ddf->anchor.revision, DDF_REVISION_2, 8);
+	ddf->anchor.seq = cpu_to_be32(1);
+	ddf->anchor.timestamp = cpu_to_be32(time(0) - DECADE);
+	ddf->anchor.openflag = 0xFF;
+	ddf->anchor.foreignflag = 0;
+	ddf->anchor.enforcegroups = 0; /* Is this best?? */
+	ddf->anchor.pad0 = 0xff;
+	memset(ddf->anchor.pad1, 0xff, 12);
+	memset(ddf->anchor.header_ext, 0xff, 32);
+	ddf->anchor.primary_lba = cpu_to_be64(~(__u64)0);
+	ddf->anchor.secondary_lba = cpu_to_be64(~(__u64)0);
+	ddf->anchor.type = DDF_HEADER_ANCHOR;
+	memset(ddf->anchor.pad2, 0xff, 3);
+	ddf->anchor.workspace_len = cpu_to_be32(32768); /* Must be reserved */
+	/* Put this at bottom of 32M reserved.. */
+	ddf->anchor.workspace_lba = cpu_to_be64(~(__u64)0);
+	max_phys_disks = 1023; /* Should be enough, 4095 is also allowed */
+	ddf->anchor.max_pd_entries = cpu_to_be16(max_phys_disks);
+	max_virt_disks = 255; /* 15, 63, 255, 1024, 4095 are all allowed */
+	ddf->anchor.max_vd_entries = cpu_to_be16(max_virt_disks);
+	ddf->max_part = 64;
+	ddf->anchor.max_partitions = cpu_to_be16(ddf->max_part);
+	ddf->mppe = 256; /* 16, 64, 256, 1024, 4096 are all allowed */
+	ddf->conf_rec_len = 1 + ROUND_UP(ddf->mppe * (4+8), 512)/512;
+	ddf->anchor.config_record_len = cpu_to_be16(ddf->conf_rec_len);
+	ddf->anchor.max_primary_element_entries = cpu_to_be16(ddf->mppe);
+	memset(ddf->anchor.pad3, 0xff, 54);
+	/* Controller section is one sector long immediately
+	 * after the ddf header */
+	sector = 1;
+	ddf->anchor.controller_section_offset = cpu_to_be32(sector);
+	ddf->anchor.controller_section_length = cpu_to_be32(1);
+	sector += 1;
+
+	/* phys is 8 sectors after that */
+	pdsize = ROUND_UP(sizeof(struct phys_disk) +
+			  sizeof(struct phys_disk_entry)*max_phys_disks,
+			  512);
+	switch(pdsize/512) {
+	case 2: case 8: case 32: case 128: case 512: break;
+	default: abort();
+	}
+	ddf->anchor.phys_section_offset = cpu_to_be32(sector);
+	ddf->anchor.phys_section_length =
+		cpu_to_be32(pdsize/512); /* max_primary_element_entries/8 */
+	sector += pdsize/512;
+
+	/* virt is another 32 sectors */
+	vdsize = ROUND_UP(sizeof(struct virtual_disk) +
+			  sizeof(struct virtual_entry) * max_virt_disks,
+			  512);
+	switch(vdsize/512) {
+	case 2: case 8: case 32: case 128: case 512: break;
+	default: abort();
+	}
+	ddf->anchor.virt_section_offset = cpu_to_be32(sector);
+	ddf->anchor.virt_section_length =
+		cpu_to_be32(vdsize/512); /* max_vd_entries/8 */
+	sector += vdsize/512;
+
+	/* one config record per partition plus one more (the spare slot) */
+	clen = ddf->conf_rec_len * (ddf->max_part+1);
+	ddf->anchor.config_section_offset = cpu_to_be32(sector);
+	ddf->anchor.config_section_length = cpu_to_be32(clen);
+	sector += clen;
+
+	ddf->anchor.data_section_offset = cpu_to_be32(sector);
+	ddf->anchor.data_section_length = cpu_to_be32(1);
+	sector += 1;
+
+	ddf->anchor.bbm_section_length = cpu_to_be32(0);
+	ddf->anchor.bbm_section_offset = cpu_to_be32(0xFFFFFFFF);
+	ddf->anchor.diag_space_length = cpu_to_be32(0);
+	ddf->anchor.diag_space_offset = cpu_to_be32(0xFFFFFFFF);
+	ddf->anchor.vendor_length = cpu_to_be32(0);
+	ddf->anchor.vendor_offset = cpu_to_be32(0xFFFFFFFF);
+
+	memset(ddf->anchor.pad4, 0xff, 256);
+
+	/* primary and secondary start out as copies of the anchor */
+	memcpy(&ddf->primary, &ddf->anchor, 512);
+	memcpy(&ddf->secondary, &ddf->anchor, 512);
+
+	ddf->primary.openflag = 1; /* I guess.. */
+	ddf->primary.type = DDF_HEADER_PRIMARY;
+
+	ddf->secondary.openflag = 1; /* I guess.. */
+	ddf->secondary.type = DDF_HEADER_SECONDARY;
+
+	ddf->active = &ddf->primary;
+
+	ddf->controller.magic = DDF_CONTROLLER_MAGIC;
+
+	/* 24 more bytes of fiction required.
+	 * first 8 are a 'vendor-id'  - "Linux-MD"
+	 * Remaining 16 are serial number.... maybe a hostname would do?
+	 */
+	memcpy(ddf->controller.guid, T10, sizeof(T10));
+	/* If the host name cannot be obtained the buffer contents are
+	 * unspecified, so fall back to an empty name rather than running
+	 * strlen() over uninitialised bytes.
+	 */
+	if (gethostname(hostname, sizeof(hostname)) != 0)
+		hostname[0] = 0;
+	hostname[sizeof(hostname) - 1] = 0;
+	hostlen = strlen(hostname);
+	/* host name is right-aligned in the GUID, space padded on the left */
+	memcpy(ddf->controller.guid + 24 - hostlen, hostname, hostlen);
+	for (i = strlen(T10) ; i+hostlen < 24; i++)
+		ddf->controller.guid[i] = ' ';
+
+	ddf->controller.type.vendor_id = cpu_to_be16(0xDEAD);
+	ddf->controller.type.device_id = cpu_to_be16(0xBEEF);
+	ddf->controller.type.sub_vendor_id = cpu_to_be16(0);
+	ddf->controller.type.sub_device_id = cpu_to_be16(0);
+	memcpy(ddf->controller.product_id, "What Is My PID??", 16);
+	memset(ddf->controller.pad, 0xff, 8);
+	memset(ddf->controller.vendor_data, 0xff, 448);
+	if (homehost && strlen(homehost) < 440)
+		strcpy((char*)ddf->controller.vendor_data, homehost);
+
+	if (posix_memalign((void**)&pd, 512, pdsize) != 0) {
+		pr_err("could not allocate pd\n");
+		return 0;
+	}
+	ddf->phys = pd;
+	ddf->pdsize = pdsize;
+
+	memset(pd, 0xff, pdsize);
+	memset(pd, 0, sizeof(*pd));
+	pd->magic = DDF_PHYS_RECORDS_MAGIC;
+	pd->used_pdes = cpu_to_be16(0);
+	pd->max_pdes = cpu_to_be16(max_phys_disks);
+	memset(pd->pad, 0xff, 52);
+	/* an all-0xff GUID marks a physical disk entry as unused */
+	for (i = 0; i < max_phys_disks; i++)
+		memset(pd->entries[i].guid, 0xff, DDF_GUID_LEN);
+
+	if (posix_memalign((void**)&vd, 512, vdsize) != 0) {
+		pr_err("could not allocate vd\n");
+		return 0;
+	}
+	ddf->virt = vd;
+	ddf->vdsize = vdsize;
+	memset(vd, 0, vdsize);
+	vd->magic = DDF_VIRT_RECORDS_MAGIC;
+	vd->populated_vdes = cpu_to_be16(0);
+	vd->max_vdes = cpu_to_be16(max_virt_disks);
+	memset(vd->pad, 0xff, 52);
+
+	/* unused virtual disk entries are entirely 0xff */
+	for (i=0; i<max_virt_disks; i++)
+		memset(&vd->entries[i], 0xff, sizeof(struct virtual_entry));
+
+	st->sb = ddf;
+	ddf_set_updates_pending(ddf, NULL);
+	return 1;
+}
+
+static int chunk_to_shift(int chunksize)
+{
+	/* Convert a chunk size in bytes into a shift count relative to
+	 * 512-byte sectors: the position of the lowest set bit of
+	 * chunksize/512.  Yields -1 when chunksize/512 is 0.
+	 */
+	int sectors = chunksize / 512;
+
+	return ffs(sectors) - 1;
+}
+
+/* A range on a physical device: 'start' is its first block and
+ * 'size' its length.  get_extents() terminates its list with an
+ * entry whose size is 0.
+ */
+struct extent {
+	unsigned long long start;
+	unsigned long long size;
+};
+static int cmp_extent(const void *av, const void *bv)
+{
+	/* qsort() comparator: order extents by ascending start.
+	 * Returns -1, 0 or 1.
+	 */
+	const struct extent *lhs = av;
+	const struct extent *rhs = bv;
+
+	return (lhs->start > rhs->start) - (lhs->start < rhs->start);
+}
+
+static struct extent *get_extents(struct ddf_super *ddf, struct dl *dl)
+{
+	/* Build the list of extents already used on physical device 'dl'
+	 * of the given ddf.
+	 *
+	 * Returns a malloced array of 'struct extent' sorted by start and
+	 * terminated by an entry with size 0 whose start is the device's
+	 * config_size.  The caller frees it.
+	 * Returns NULL when the device has no physical disk entry
+	 * (pdnum < 0) or is not Online, or is Failed or Missing.
+	 */
+	struct extent *list;
+	int used = 0;
+	unsigned int slot;
+	__u16 pd_state;
+
+	if (dl->pdnum < 0)
+		return NULL;
+
+	pd_state = be16_to_cpu(ddf->phys->entries[dl->pdnum].state);
+	if ((pd_state & (DDF_Online|DDF_Failed|DDF_Missing)) != DDF_Online)
+		return NULL;
+
+	/* room for one extent per partition plus the terminator */
+	list = xmalloc(sizeof(struct extent) * (ddf->max_part + 2));
+
+	for (slot = 0; slot < ddf->max_part; slot++) {
+		const struct vd_config *bvd;
+		unsigned int ibvd;
+		struct vcl *conf = dl->vlist[slot];
+
+		if (conf == NULL)
+			continue;
+		if (get_pd_index_from_refnum(conf, dl->disk.refnum,
+					     ddf->mppe,
+					     &bvd, &ibvd) == DDF_NOTFOUND)
+			continue;
+
+		list[used].start = be64_to_cpu(LBA_OFFSET(ddf, bvd)[ibvd]);
+		list[used].size = be64_to_cpu(bvd->blocks);
+		used++;
+	}
+	qsort(list, used, sizeof(*list), cmp_extent);
+
+	/* terminator: end of the configurable area, zero length */
+	list[used].start =
+		be64_to_cpu(ddf->phys->entries[dl->pdnum].config_size);
+	list[used].size = 0;
+	return list;
+}
+
+static unsigned long long find_space(
+	struct ddf_super *ddf, struct dl *dl,
+	unsigned long long data_offset,
+	unsigned long long *size)
+{
+	/* Find if the requested amount of space is available.
+	 * If it is, return start.
+	 * If not, set *size to largest space and return INVALID_SECTORS.
+	 * If data_offset != INVALID_SECTORS, then the space must start
+	 * at this location.
+	 * When no extent list can be obtained for the device, *size is
+	 * set to 0 and INVALID_SECTORS is returned.
+	 */
+	struct extent *e = get_extents(ddf, dl);
+	int i = 0;
+	unsigned long long pos = 0;
+	unsigned long long max_size = 0;
+
+	if (!e) {
+		*size = 0;
+		return INVALID_SECTORS;
+	}
+	do {
+		/* free gap between the end of the previous extent (pos)
+		 * and the start of this one
+		 */
+		unsigned long long esize = e[i].start - pos;
+		if (data_offset != INVALID_SECTORS &&
+		    pos <= data_offset &&
+		    e[i].start > data_offset) {
+			/* requested offset lies inside this gap */
+			pos = data_offset;
+			esize = e[i].start - pos;
+		}
+		if (data_offset != INVALID_SECTORS &&
+		    pos != data_offset) {
+			/* This gap does not contain the requested offset.
+			 * Step past the used extent before moving on:
+			 * with a stale 'pos' the test above would treat
+			 * the used extent as free space and could accept
+			 * an offset that lies inside it.
+			 */
+			pos = e[i].start + e[i].size;
+			i++;
+			continue;
+		}
+		if (esize >= *size) {
+			/* Found! */
+			free(e);
+			return pos;
+		}
+		if (esize > max_size)
+			max_size = esize;
+		pos = e[i].start + e[i].size;
+		i++;
+	} while (e[i-1].size);	/* the terminating extent has size 0 */
+	*size = max_size;
+	free(e);
+	return INVALID_SECTORS;
+}
+
+static int init_super_ddf_bvd(struct supertype *st,
+			      mdu_array_info_t *info,
+			      unsigned long long size,
+			      char *name, char *homehost,
+			      int *uuid, unsigned long long data_offset)
+{
+	/* We are creating a BVD inside a pre-existing container.
+	 * so st->sb is already set.
+	 * We need to create a new vd_config and a new virtual_entry
+	 *
+	 * Returns 1 on success, with the new configuration linked at the
+	 * head of ddf->conflist and made ddf->currentconf.
+	 * Returns 0 on failure (name already in use, no free virtual disk
+	 * entry, allocation failure, unsupported level/layout).  On
+	 * failure the virtual disk entry that was claimed is released
+	 * again so that it does not stay marked as in use.
+	 */
+	struct ddf_super *ddf = st->sb;
+	unsigned int venum, i;
+	struct virtual_entry *ve;
+	struct vcl *vcl;
+	struct vd_config *vc;
+
+	if (find_vde_by_name(ddf, name) != DDF_NOTFOUND) {
+		pr_err("This ddf already has an array called %s\n", name);
+		return 0;
+	}
+	venum = find_unused_vde(ddf);
+	if (venum == DDF_NOTFOUND) {
+		pr_err("Cannot find spare slot for virtual disk\n");
+		return 0;
+	}
+	ve = &ddf->virt->entries[venum];
+
+	/* A Virtual Disk GUID contains the T10 Vendor ID, controller type,
+	 * timestamp, random number
+	 */
+	make_header_guid(ve->guid);
+	ve->unit = cpu_to_be16(info->md_minor);
+	ve->pad0 = 0xFFFF;
+	ve->guid_crc._v16 = crc32(0, (unsigned char *)ddf->anchor.guid,
+				  DDF_GUID_LEN);
+	ve->type = cpu_to_be16(0);
+	ve->state = DDF_state_degraded; /* Will be modified as devices are added */
+	if (info->state & 1) /* clean */
+		ve->init_state = DDF_init_full;
+	else
+		ve->init_state = DDF_init_not;
+
+	memset(ve->pad1, 0xff, 14);
+	memset(ve->name, '\0', sizeof(ve->name));
+	if (name) {
+		int l = strnlen(name, sizeof(ve->name));
+		memcpy(ve->name, name, l);
+	}
+	ddf->virt->populated_vdes =
+		cpu_to_be16(be16_to_cpu(ddf->virt->populated_vdes)+1);
+
+	/* Now create a new vd_config */
+	if (posix_memalign((void**)&vcl, 512,
+			   (offsetof(struct vcl, conf) + ddf->conf_rec_len * 512)) != 0) {
+		pr_err("could not allocate vd_config\n");
+		goto release_ve;
+	}
+	vcl->vcnum = venum;
+	vcl->block_sizes = NULL; /* FIXME not for CONCAT */
+	vc = &vcl->conf;
+
+	vc->magic = DDF_VD_CONF_MAGIC;
+	memcpy(vc->guid, ve->guid, DDF_GUID_LEN);
+	vc->timestamp = cpu_to_be32(time(0)-DECADE);
+	vc->seqnum = cpu_to_be32(1);
+	memset(vc->pad0, 0xff, 24);
+	vc->chunk_shift = chunk_to_shift(info->chunk_size);
+	if (layout_md2ddf(info, vc) == -1 ||
+	    be16_to_cpu(vc->prim_elmnt_count) > ddf->mppe) {
+		pr_err("unsupported RAID level/layout %d/%d with %d disks\n",
+		       info->level, info->layout, info->raid_disks);
+		goto free_vcl;
+	}
+	vc->sec_elmnt_seq = 0;
+	if (alloc_other_bvds(ddf, vcl) != 0) {
+		pr_err("could not allocate other bvds\n");
+		goto free_vcl;
+	}
+	/* 'size' is doubled to express it in 512-byte blocks;
+	 * presumably it is given in KiB - confirm against callers.
+	 */
+	vc->blocks = cpu_to_be64(size * 2);
+	vc->array_blocks = cpu_to_be64(
+		calc_array_size(info->level, info->raid_disks, info->layout,
+				info->chunk_size, size * 2));
+	memset(vc->pad1, 0xff, 8);
+	/* no spares are associated with the new configuration */
+	for (i = 0; i < 8; i++)
+		vc->spare_refs[i] = cpu_to_be32(0xffffffff);
+	memset(vc->cache_pol, 0, 8);
+	vc->bg_rate = 0x80;
+	memset(vc->pad2, 0xff, 3);
+	memset(vc->pad3, 0xff, 52);
+	memset(vc->pad4, 0xff, 192);
+	memset(vc->v0, 0xff, 32);
+	memset(vc->v1, 0xff, 32);
+	memset(vc->v2, 0xff, 16);
+	memset(vc->v3, 0xff, 16);
+	memset(vc->vendor, 0xff, 32);
+
+	/* mppe 4-byte reference numbers (all unused), followed by
+	 * mppe 8-byte LBA offsets (all zero)
+	 */
+	memset(vc->phys_refnum, 0xff, 4*ddf->mppe);
+	memset(vc->phys_refnum+ddf->mppe, 0x00, 8*ddf->mppe);
+
+	/* every further BVD starts as a copy of the first one and only
+	 * differs in its sequence number
+	 */
+	for (i = 1; i < vc->sec_elmnt_count; i++) {
+		memcpy(vcl->other_bvds[i-1], vc, ddf->conf_rec_len * 512);
+		vcl->other_bvds[i-1]->sec_elmnt_seq = i;
+	}
+
+	vcl->next = ddf->conflist;
+	ddf->conflist = vcl;
+	ddf->currentconf = vcl;
+	ddf_set_updates_pending(ddf, NULL);
+	return 1;
+
+free_vcl:
+	free(vcl);
+release_ve:
+	/* Undo the claim on the virtual disk entry: an unused entry is
+	 * all 0xff, which is what find_unused_vde() looks for.
+	 */
+	memset(ve, 0xff, sizeof(*ve));
+	ddf->virt->populated_vdes =
+		cpu_to_be16(be16_to_cpu(ddf->virt->populated_vdes)-1);
+	return 0;
+}
+
+static void add_to_super_ddf_bvd(struct supertype *st,
+				 mdu_disk_info_t *dk, int fd, char *devname,
+				 unsigned long long data_offset)
+{
+	/* fd and devname identify a device within the ddf container (st).
+	 * dk identifies a location in the new BVD.
+	 * We need to find suitable free space in that device and update
+	 * the phys_refnum and lba_offset for the newly created vd_config.
+	 * We might also want to update the type in the phys_disk
+	 * section.
+	 *
+	 * Alternately: fd == -1 and we have already chosen which device to
+	 * use and recorded in dlist->raid_disk;
+	 *
+	 * Nothing is changed when the device cannot be found, has no
+	 * physical disk entry, is not marked in-sync in dk->state, has no
+	 * suitable free space, or has no free slot left in its vlist.
+	 */
+	struct dl *dl;
+	struct ddf_super *ddf = st->sb;
+	struct vd_config *vc;
+	unsigned int i;
+	unsigned long long blocks, pos;
+	unsigned int raid_disk = dk->raid_disk;
+
+	if (fd == -1) {
+		for (dl = ddf->dlist; dl ; dl = dl->next)
+			if (dl->raiddisk == dk->raid_disk)
+				break;
+	} else {
+		for (dl = ddf->dlist; dl ; dl = dl->next)
+			if (dl->major == dk->major &&
+			    dl->minor == dk->minor)
+				break;
+	}
+	if (!dl || dl->pdnum < 0 || ! (dk->state & (1<<MD_DISK_SYNC)))
+		return;
+
+	vc = &ddf->currentconf->conf;
+	if (vc->sec_elmnt_count > 1) {
+		/* secondary RAID level: pick the BVD that holds this
+		 * disk and the position inside it
+		 */
+		unsigned int n = be16_to_cpu(vc->prim_elmnt_count);
+		if (raid_disk >= n)
+			vc = ddf->currentconf->other_bvds[raid_disk / n - 1];
+		raid_disk %= n;
+	}
+
+	blocks = be64_to_cpu(vc->blocks);
+	if (ddf->currentconf->block_sizes)
+		blocks = ddf->currentconf->block_sizes[dk->raid_disk];
+
+	pos = find_space(ddf, dl, data_offset, &blocks);
+	if (pos == INVALID_SECTORS)
+		return;
+
+	/* Make sure the disk can record one more configuration before
+	 * the vd_config is touched; otherwise the config would refer to
+	 * a disk that does not list it.
+	 */
+	for (i = 0; i < ddf->max_part ; i++)
+		if (dl->vlist[i] == NULL)
+			break;
+	if (i == ddf->max_part)
+		return;
+
+	ddf->currentdev = dk->raid_disk;
+	vc->phys_refnum[raid_disk] = dl->disk.refnum;
+	LBA_OFFSET(ddf, vc)[raid_disk] = cpu_to_be64(pos);
+	dl->vlist[i] = ddf->currentconf;
+
+	if (fd >= 0)
+		dl->fd = fd;
+	if (devname)
+		dl->devname = devname;
+
+	/* Check if we can mark array as optimal yet */
+	i = ddf->currentconf->vcnum;
+	ddf->virt->entries[i].state =
+		(ddf->virt->entries[i].state & ~DDF_state_mask)
+		| get_svd_state(ddf, ddf->currentconf);
+	/* the disk is no longer a global spare, it is part of a VD now */
+	be16_clear(ddf->phys->entries[dl->pdnum].type,
+		   cpu_to_be16(DDF_Global_Spare));
+	be16_set(ddf->phys->entries[dl->pdnum].type,
+		 cpu_to_be16(DDF_Active_in_VD));
+	dprintf("added disk %d/%08x to VD %d/%s as disk %d\n",
+		dl->pdnum, be32_to_cpu(dl->disk.refnum),
+		ddf->currentconf->vcnum, guid_str(vc->guid),
+		dk->raid_disk);
+	ddf_set_updates_pending(ddf, vc);
+}
+
+static unsigned int find_unused_pde(const struct ddf_super *ddf)
+{
+	/* Return the index of the first physical disk entry whose GUID is
+	 * all 0xff (i.e. unused), or DDF_NOTFOUND if there is none.
+	 */
+	unsigned int slot = 0;
+
+	while (slot < be16_to_cpu(ddf->phys->max_pdes)) {
+		if (all_ff(ddf->phys->entries[slot].guid))
+			return slot;
+		slot++;
+	}
+	return DDF_NOTFOUND;
+}
+
+static void _set_config_size(struct phys_disk_entry *pde, const struct dl *dl)
+{
+	/* Work out config_size for 'dl' and store it (big-endian) in 'pde'.
+	 *
+	 * config_size is the lowest of: the device size less 32MB, the
+	 * primary header LBA, the secondary header LBA (when one is set)
+	 * and the workspace LBA (when its value looks sane).
+	 */
+	__u64 limit = dl->size - 32*1024*2ULL;
+	__u64 primary = be64_to_cpu(dl->primary_lba);
+	__u64 secondary = be64_to_cpu(dl->secondary_lba);
+	__u64 workspace = be64_to_cpu(dl->workspace_lba);
+	__u64 cfs;
+
+	cfs = limit < primary ? limit : primary;
+
+	/* all ones means there is no secondary header */
+	if (secondary != ~(__u64)0 && secondary < cfs)
+		cfs = secondary;
+
+	/*
+	 * Some vendor DDF structures interpret workspace_lba
+	 * very differently than we do: Make a sanity check on the value.
+	 */
+	if (workspace < cfs) {
+		__u64 wsp = cfs - workspace;
+		int too_big = wsp > 1024*1024*2ULL && wsp > dl->size / 16;
+
+		if (too_big)
+			pr_err("%x:%x: workspace size 0x%llx too big, ignoring\n",
+			       dl->major, dl->minor, (unsigned long long)wsp);
+		else
+			cfs = workspace;
+	}
+
+	pde->config_size = cpu_to_be64(cfs);
+	dprintf("%x:%x config_size %llx, DDF structure is %llx blocks\n",
+		dl->major, dl->minor,
+		(unsigned long long)cfs, (unsigned long long)(dl->size-cfs));
+}
+
+/* Add a device to a container, either while creating it or while
+ * expanding a pre-existing container
+ *
+ * When ddf->currentconf is set the device is instead added to that
+ * virtual disk via add_to_super_ddf_bvd() and 0 is returned.
+ * Returns 0 on success and 1 on failure (device cannot be examined,
+ * no free physical disk entry, device not larger than 32MB,
+ * allocation failure).
+ */
+static int add_to_super_ddf(struct supertype *st,
+			    mdu_disk_info_t *dk, int fd, char *devname,
+			    unsigned long long data_offset)
+{
+	struct ddf_super *ddf = st->sb;
+	struct dl *dd;
+	time_t now;
+	struct tm *tm;
+	/* start at 0 so that a failing get_dev_size() is caught by the
+	 * minimum size check below
+	 */
+	unsigned long long size = 0;
+	struct phys_disk_entry *pde;
+	unsigned int n, i;
+	struct stat stb;
+	__u32 *tptr;
+
+	if (ddf->currentconf) {
+		add_to_super_ddf_bvd(st, dk, fd, devname, data_offset);
+		return 0;
+	}
+
+	/* This is device numbered dk->number.  We need to create
+	 * a phys_disk entry and a more detailed disk_data entry.
+	 */
+	if (fstat(fd, &stb) != 0) {
+		pr_err("cannot stat device, cannot add disk\n");
+		return 1;
+	}
+	n = find_unused_pde(ddf);
+	if (n == DDF_NOTFOUND) {
+		pr_err("No free slot in array, cannot add disk\n");
+		return 1;
+	}
+	pde = &ddf->phys->entries[n];
+	get_dev_size(fd, NULL, &size);
+	if (size <= 32*1024*1024) {
+		pr_err("device size must be at least 32MB\n");
+		return 1;
+	}
+	/* bytes -> 512-byte sectors */
+	size >>= 9;
+
+	if (posix_memalign((void**)&dd, 512,
+			   sizeof(*dd) + sizeof(dd->vlist[0]) * ddf->max_part) != 0) {
+		pr_err("could not allocate buffer for new disk, aborting\n");
+		return 1;
+	}
+	dd->major = major(stb.st_rdev);
+	dd->minor = minor(stb.st_rdev);
+	dd->devname = devname;
+	dd->fd = fd;
+	dd->spare = NULL;
+
+	dd->disk.magic = DDF_PHYS_DATA_MAGIC;
+	now = time(0);
+	tm = localtime(&now);
+	/* GUID: 8 bytes of vendor id, 8 digits of date (YYYYMMDD), then
+	 * 8 random bytes which overwrite the terminating NUL
+	 */
+	sprintf(dd->disk.guid, "%8s%04d%02d%02d", T10,
+		(__u16)tm->tm_year+1900,
+		(__u8)tm->tm_mon+1, (__u8)tm->tm_mday);
+	tptr = (__u32 *)(dd->disk.guid + 16);
+	*tptr++ = random32();
+	*tptr = random32();
+
+	do {
+		/* Cannot be bothered finding a CRC of some irrelevant details*/
+		dd->disk.refnum._v32 = random32();
+		/* retry until the reference number is not used by any
+		 * physical disk entry
+		 */
+		for (i = be16_to_cpu(ddf->active->max_pd_entries);
+		     i > 0; i--)
+			if (be32_eq(ddf->phys->entries[i-1].refnum,
+				    dd->disk.refnum))
+				break;
+	} while (i > 0);
+
+	dd->disk.forced_ref = 1;
+	dd->disk.forced_guid = 1;
+	memset(dd->disk.vendor, ' ', 32);
+	memcpy(dd->disk.vendor, "Linux", 5);
+	memset(dd->disk.pad, 0xff, 442);
+	for (i = 0; i < ddf->max_part ; i++)
+		dd->vlist[i] = NULL;
+
+	dd->pdnum = n;
+
+	if (st->update_tail) {
+		/* Updates are being queued: describe the new disk in a
+		 * single-entry phys_disk record instead of editing the
+		 * live table.  used_pdes carries the target index.
+		 */
+		int len = (sizeof(struct phys_disk) +
+			   sizeof(struct phys_disk_entry));
+		struct phys_disk *pd;
+
+		pd = xmalloc(len);
+		pd->magic = DDF_PHYS_RECORDS_MAGIC;
+		pd->used_pdes = cpu_to_be16(n);
+		pde = &pd->entries[0];
+		dd->mdupdate = pd;
+	} else
+		ddf->phys->used_pdes = cpu_to_be16(
+			1 + be16_to_cpu(ddf->phys->used_pdes));
+
+	memcpy(pde->guid, dd->disk.guid, DDF_GUID_LEN);
+	pde->refnum = dd->disk.refnum;
+	pde->type = cpu_to_be16(DDF_Forced_PD_GUID | DDF_Global_Spare);
+	pde->state = cpu_to_be16(DDF_Online);
+	dd->size = size;
+	/*
+	 * If there is already a device in dlist, try to reserve the same
+	 * amount of workspace. Otherwise, use 32MB.
+	 * We checked disk size above already.
+	 */
+#define __calc_lba(new, old, lba, mb) do { \
+		unsigned long long dif; \
+		if ((old) != NULL) \
+			dif = (old)->size - be64_to_cpu((old)->lba); \
+		else \
+			dif = (new)->size; \
+		if ((new)->size > dif) \
+			(new)->lba = cpu_to_be64((new)->size - dif); \
+		else \
+			(new)->lba = cpu_to_be64((new)->size - (mb*1024*2)); \
+	} while (0)
+	__calc_lba(dd, ddf->dlist, workspace_lba, 32);
+	__calc_lba(dd, ddf->dlist, primary_lba, 16);
+	if (ddf->dlist == NULL ||
+	    be64_to_cpu(ddf->dlist->secondary_lba) != ~(__u64)0)
+		__calc_lba(dd, ddf->dlist, secondary_lba, 32);
+	_set_config_size(pde, dd);
+
+	sprintf(pde->path, "%17.17s","Information: nil") ;
+	memset(pde->pad, 0xff, 6);
+
+	if (st->update_tail) {
+		dd->next = ddf->add_list;
+		ddf->add_list = dd;
+	} else {
+		dd->next = ddf->dlist;
+		ddf->dlist = dd;
+		ddf_set_updates_pending(ddf, NULL);
+	}
+
+	return 0;
+}
+
+static int remove_from_super_ddf(struct supertype *st, mdu_disk_info_t *dk)
+{
+	/* mdmon has noticed that this disk (dk->major/dk->minor) has
+	 * disappeared from the container.
+	 * We need to arrange that it disappears from the metadata and
+	 * internal data structures too.
+	 * Most of the work is done by ddf_process_update which edits
+	 * the metadata and closes the file handle and attaches the memory
+	 * where free_updates will free it.
+	 *
+	 * Returns -1 when the disk is not in dlist or has no physical
+	 * disk entry, 0 otherwise.  An update is only queued when
+	 * st->update_tail is set.
+	 */
+	struct ddf_super *ddf = st->sb;
+	struct dl *victim = ddf->dlist;
+	struct phys_disk *pd;
+	int len;
+
+	while (victim &&
+	       !(victim->major == dk->major && victim->minor == dk->minor))
+		victim = victim->next;
+
+	if (!victim || victim->pdnum < 0)
+		return -1;
+
+	if (!st->update_tail)
+		return 0;
+
+	/* single-entry phys_disk record: used_pdes carries the index of
+	 * the entry and the entry itself is marked Missing
+	 */
+	len = (sizeof(struct phys_disk) +
+	       sizeof(struct phys_disk_entry));
+	pd = xmalloc(len);
+	pd->magic = DDF_PHYS_RECORDS_MAGIC;
+	pd->used_pdes = cpu_to_be16(victim->pdnum);
+	pd->entries[0].state = cpu_to_be16(DDF_Missing);
+	append_metadata_update(st, pd, len);
+	return 0;
+}
+
+/*
+ * This is the write_init_super method for a ddf container. It is
+ * called when creating a container or adding another device to a
+ * container.
+ */
+
+static int __write_ddf_structure(struct dl *d, struct ddf_super *ddf, __u8 type)
+{
+	/* Write one complete copy of the DDF structure (header,
+	 * controller data, physical and virtual disk records, config
+	 * records and disk data) to device 'd'.
+	 *
+	 * 'type' selects the copy: DDF_HEADER_PRIMARY or
+	 * DDF_HEADER_SECONDARY.  The header is first written with
+	 * openflag set, and written again with openflag cleared once the
+	 * sections are done (or a write failed).
+	 *
+	 * Returns 1 when everything was written, 0 for any other type,
+	 * when the selected header has no location (LBA all ones) or
+	 * when a seek or write failed or was short.
+	 */
+	unsigned long long sector;
+	struct ddf_header *header;
+	int fd, i, n_config, conf_size, buf_size;
+	int ret = 0;
+	char *conf;
+
+	fd = d->fd;
+
+	switch (type) {
+	case DDF_HEADER_PRIMARY:
+		header = &ddf->primary;
+		sector = be64_to_cpu(header->primary_lba);
+		break;
+	case DDF_HEADER_SECONDARY:
+		header = &ddf->secondary;
+		sector = be64_to_cpu(header->secondary_lba);
+		break;
+	default:
+		return 0;
+	}
+	if (sector == ~(__u64)0)
+		return 0;
+
+	header->type = type;
+	header->openflag = 1;
+	header->crc = calc_crc(header, 512);
+
+	if (lseek64(fd, sector<<9, 0) < 0)
+		goto out;
+	/* a short write leaves the section incomplete, so it counts as
+	 * a failure just like an error return
+	 */
+	if (write(fd, header, 512) != 512)
+		goto out;
+
+	ddf->controller.crc = calc_crc(&ddf->controller, 512);
+	if (write(fd, &ddf->controller, 512) != 512)
+		goto out;
+
+	ddf->phys->crc = calc_crc(ddf->phys, ddf->pdsize);
+	if (write(fd, ddf->phys, ddf->pdsize) != (ssize_t)ddf->pdsize)
+		goto out;
+	ddf->virt->crc = calc_crc(ddf->virt, ddf->vdsize);
+	if (write(fd, ddf->virt, ddf->vdsize) != (ssize_t)ddf->vdsize)
+		goto out;
+
+	/* Now write lots of config records. */
+	n_config = ddf->max_part;
+	conf_size = ddf->conf_rec_len * 512;
+	conf = ddf->conf;
+	buf_size = conf_size * (n_config + 1);
+	if (!conf) {
+		/* the buffer is allocated once and kept in ddf->conf */
+		if (posix_memalign((void**)&conf, 512, buf_size) != 0)
+			goto out;
+		ddf->conf = conf;
+	}
+	for (i = 0 ; i <= n_config ; i++) {
+		struct vcl *c;
+		struct vd_config *vdc = NULL;
+		if (i == n_config) {
+			/* the last slot holds the spare record, if any */
+			c = (struct vcl *)d->spare;
+			if (c)
+				vdc = &c->conf;
+		} else {
+			unsigned int dummy;
+			c = d->vlist[i];
+			if (c)
+				get_pd_index_from_refnum(
+					c, d->disk.refnum,
+					ddf->mppe,
+					(const struct vd_config **)&vdc,
+					&dummy);
+		}
+		if (vdc) {
+			dprintf("writing conf record %i on disk %08x for %s/%u\n",
+				i, be32_to_cpu(d->disk.refnum),
+				guid_str(vdc->guid),
+				vdc->sec_elmnt_seq);
+			vdc->crc = calc_crc(vdc, conf_size);
+			memcpy(conf + i*conf_size, vdc, conf_size);
+		} else
+			/* unused config slots are all 0xff */
+			memset(conf + i*conf_size, 0xff, conf_size);
+	}
+	if (write(fd, conf, buf_size) != buf_size)
+		goto out;
+
+	d->disk.crc = calc_crc(&d->disk, 512);
+	if (write(fd, &d->disk, 512) != 512)
+		goto out;
+
+	ret = 1;
+out:
+	header->openflag = 0;
+	header->crc = calc_crc(header, 512);
+
+	/* only rewrite the header if we really are positioned on it */
+	if (lseek64(fd, sector<<9, 0) < 0 ||
+	    write(fd, header, 512) != 512)
+		ret = 0;
+
+	return ret;
+}
+
+static int _write_super_to_disk(struct ddf_super *ddf, struct dl *d)
+{
+	/* Write the complete DDF metadata to device 'd'.
+	 * Returns 1 on success, 0 when the device has no open file
+	 * descriptor, its size cannot be determined, or any part of the
+	 * write fails.
+	 */
+	/* start at 0 so that a failing get_dev_size() is detected */
+	unsigned long long size = 0;
+	int fd = d->fd;
+	if (fd < 0)
+		return 0;
+
+	/* We need to fill in the primary, (secondary) and workspace
+	 * lba's in the headers, set their checksums,
+	 * Also checksum phys, virt....
+	 *
+	 * Then write everything out, finally the anchor is written.
+	 */
+	get_dev_size(fd, NULL, &size);
+	size /= 512;
+	if (size == 0)
+		return 0;
+	memcpy(&ddf->anchor, ddf->active, 512);
+	/* use the per-device locations when set, otherwise defaults
+	 * measured back from the end of the device
+	 */
+	if (be64_to_cpu(d->workspace_lba) != 0ULL)
+		ddf->anchor.workspace_lba = d->workspace_lba;
+	else
+		ddf->anchor.workspace_lba =
+			cpu_to_be64(size - 32*1024*2);
+	if (be64_to_cpu(d->primary_lba) != 0ULL)
+		ddf->anchor.primary_lba = d->primary_lba;
+	else
+		ddf->anchor.primary_lba =
+			cpu_to_be64(size - 16*1024*2);
+	if (be64_to_cpu(d->secondary_lba) != 0ULL)
+		ddf->anchor.secondary_lba = d->secondary_lba;
+	else
+		ddf->anchor.secondary_lba =
+			cpu_to_be64(size - 32*1024*2);
+	ddf->anchor.timestamp = cpu_to_be32(time(0) - DECADE);
+	memcpy(&ddf->primary, &ddf->anchor, 512);
+	memcpy(&ddf->secondary, &ddf->anchor, 512);
+
+	ddf->anchor.type = DDF_HEADER_ANCHOR;
+	ddf->anchor.openflag = 0xFF; /* 'open' means nothing */
+	ddf->anchor.seq = cpu_to_be32(0xFFFFFFFF); /* no sequencing in anchor */
+	ddf->anchor.crc = calc_crc(&ddf->anchor, 512);
+
+	if (!__write_ddf_structure(d, ddf, DDF_HEADER_PRIMARY))
+		return 0;
+
+	if (!__write_ddf_structure(d, ddf, DDF_HEADER_SECONDARY))
+		return 0;
+
+	/* the anchor lives in the very last sector of the device */
+	if (lseek64(fd, (size-1)*512, SEEK_SET) < 0)
+		return 0;
+	if (write(fd, &ddf->anchor, 512) != 512)
+		return 0;
+
+	return 1;
+}
+
+static int __write_init_super_ddf(struct supertype *st)
+{
+	/* Write the metadata to every device in the container's dlist.
+	 * A failure on one disk does not stop the others from being
+	 * written.  Returns 0 when every write succeeded, 1 otherwise.
+	 */
+	struct ddf_super *ddf = st->sb;
+	struct dl *disk = ddf->dlist;
+	int tried = 0;
+	int written = 0;
+
+	pr_state(ddf, __func__);
+
+	while (disk) {
+		tried++;
+		written += _write_super_to_disk(ddf, disk);
+		disk = disk->next;
+	}
+
+	return tried != written;
+}
+
+static int write_init_super_ddf(struct supertype *st)
+{
+	/* Finish a create/add operation.
+	 *
+	 * Without st->update_tail the metadata is written straight to
+	 * all devices (after calling Kill() on each device when a
+	 * container, rather than a virtual disk, was being set up) and
+	 * the result of __write_init_super_ddf() is returned.
+	 *
+	 * With st->update_tail the change is queued as metadata updates
+	 * instead and 0 is returned: either the phys_disk record prepared
+	 * for a newly added disk, or the virtual_disk entry followed by
+	 * the vd_config(s) of a newly created virtual disk.
+	 */
+	struct ddf_super *ddf = st->sb;
+	struct vcl *newconf = ddf->currentconf;
+	struct virtual_disk *vd;
+	struct vd_config *vc;
+	int rec_len, total_len, vd_len;
+	unsigned int k;
+
+	/* We are done with currentconf - reset it so st refers to the container */
+	ddf->currentconf = NULL;
+
+	if (!st->update_tail) {
+		struct dl *d;
+
+		if (!newconf)
+			for (d = ddf->dlist; d; d=d->next)
+				while (Kill(d->devname, NULL, 0, -1, 1) == 0)
+					;
+		/* Note: we don't close the fd's now, but a subsequent
+		 * ->free_super() will
+		 */
+		return __write_init_super_ddf(st);
+	}
+
+	if (!newconf) {
+		/* Must be adding a physical disk to the container */
+		int pd_len = (sizeof(struct phys_disk) +
+			      sizeof(struct phys_disk_entry));
+
+		if (ddf->add_list) {
+			append_metadata_update(st, ddf->add_list->mdupdate,
+					       pd_len);
+			ddf->add_list->mdupdate = NULL;
+		}
+		return 0;
+	}
+
+	/* Newly created VD */
+
+	/* First the virtual disk.  We have a slightly fake header:
+	 * populated_vdes carries the index of the single entry.
+	 */
+	vd_len = sizeof(struct virtual_disk) + sizeof(struct virtual_entry);
+	vd = xmalloc(vd_len);
+	*vd = *ddf->virt;
+	vd->entries[0] = ddf->virt->entries[newconf->vcnum];
+	vd->populated_vdes = cpu_to_be16(newconf->vcnum);
+	append_metadata_update(st, vd, vd_len);
+
+	/* Then the vd_config: the first BVD followed by the others */
+	rec_len = ddf->conf_rec_len * 512;
+	total_len = rec_len * newconf->conf.sec_elmnt_count;
+	vc = xmalloc(total_len);
+	memcpy(vc, &newconf->conf, rec_len);
+	for (k = 1; k < newconf->conf.sec_elmnt_count; k++)
+		memcpy((char *)vc + k*rec_len, newconf->other_bvds[k-1],
+		       rec_len);
+	append_metadata_update(st, vc, total_len);
+
+	return 0;
+}
+
+static __u64 avail_size_ddf(struct supertype *st, __u64 devsize,
+			    unsigned long long data_offset)
+{
+	/* Usable size of a device of 'devsize' sectors: everything but
+	 * the last 32Meg, which must stay reserved.  Devices that are
+	 * not larger than that yield 0.
+	 */
+	return devsize > 32*1024*2 ? devsize - 32*1024*2 : 0;
+}
+
+static int reserve_space(struct supertype *st, int raiddisks,
+			 unsigned long long size, int chunk,
+			 unsigned long long data_offset,
+			 unsigned long long *freesize)
+{
+	/* Find 'raiddisks' spare extents at least 'size' big (but
+	 * only caring about multiples of 'chunk') and remember
+	 * them.  If size==0, find the largest size possible and
+	 * report it in *freesize (*freesize is not written otherwise).
+	 * The chosen devices get dl->raiddisk set to their slot, all
+	 * others keep -1.
+	 * Returns 1 on success, 0 if space cannot be found.
+	 */
+	struct ddf_super *ddf = st->sb;
+	struct dl *dev;
+	int found = 0;
+
+	for (dev = ddf->dlist; dev; dev = dev->next) {
+		dev->raiddisk = -1;
+		dev->esize = 0;
+	}
+
+	/* Now find largest extent on each device */
+	for (dev = ddf->dlist; dev; dev = dev->next) {
+		/* asking for ULLONG_MAX makes find_space report the
+		 * largest free extent instead
+		 */
+		unsigned long long largest = ULLONG_MAX;
+
+		find_space(ddf, dev, data_offset, &largest);
+		if (largest < size || largest < (unsigned)chunk)
+			continue;
+		found++;
+		dev->esize = largest;
+	}
+	if (found < raiddisks) {
+		pr_err("not enough devices with space to create array.\n");
+		return 0; /* No enough free spaces large enough */
+	}
+
+	if (size == 0) {
+		/* choose the largest size of which there are at least 'raiddisk' */
+		for (dev = ddf->dlist; dev; dev = dev->next) {
+			struct dl *other;
+
+			if (dev->esize <= size)
+				continue;
+			/* This is bigger than 'size', see if there are enough */
+			found = 0;
+			for (other = ddf->dlist; other; other = other->next)
+				if (other->esize >= dev->esize)
+					found++;
+			if (found >= raiddisks)
+				size = dev->esize;
+		}
+		if (chunk) {
+			/* round down to a whole number of chunks */
+			size = size / chunk;
+			size *= chunk;
+		}
+		*freesize = size;
+		if (size < 32) {
+			pr_err("not enough spare devices to create array.\n");
+			return 0;
+		}
+	}
+
+	/* We have a 'size' of which there are enough spaces.
+	 * We simply do a first-fit */
+	found = 0;
+	for (dev = ddf->dlist; dev && found < raiddisks; dev = dev->next) {
+		if (dev->esize >= size) {
+			dev->raiddisk = found;
+			found++;
+		}
+	}
+	return 1;
+}
+
+/*
+ * Decide whether an array with the requested geometry can be created
+ * using DDF metadata.  Returns 1 if it can, 0 if not.
+ *
+ * - *chunk is replaced by DEFAULT_CHUNK when it is UnSet.
+ * - LEVEL_NONE is treated as LEVEL_CONTAINER, and container requests are
+ *   handed to validate_geometry_ddf_container().
+ * - With no 'dev', only the level/layout/disk-count combination is checked
+ *   (via layout_md2ddf()); if a container is already loaded and 'freesize'
+ *   was supplied, reserve_space() additionally picks the extents.
+ * - With a 'dev' and a loaded container, validate_geometry_ddf_bvd() decides.
+ * - With a 'dev' and no container loaded, the device must be busy (i.e. it
+ *   cannot be opened O_EXCL) and belong to a "ddf" container, which is then
+ *   loaded into st->sb before validate_geometry_ddf_bvd() is consulted.
+ *
+ * 'consistency_policy' is not consulted.
+ */
+static int validate_geometry_ddf(struct supertype *st,
+				 int level, int layout, int raiddisks,
+				 int *chunk, unsigned long long size,
+				 unsigned long long data_offset,
+				 char *dev, unsigned long long *freesize,
+				 int consistency_policy, int verbose)
+{
+	int fd;
+	struct mdinfo *sra;
+	struct ddf_super *ddf;
+	int cfd;
+
+	/* ddf potentially supports lots of things, but it depends on
+	 * what devices are offered (and maybe kernel version?)
+	 * If given unused devices, we will make a container.
+	 * If given devices in a container, we will make a BVD.
+	 * If given BVDs, we make an SVD, changing all the GUIDs in the process.
+	 */
+
+	if (*chunk == UnSet)
+		*chunk = DEFAULT_CHUNK;
+
+	if (level == LEVEL_NONE)
+		level = LEVEL_CONTAINER;
+	if (level == LEVEL_CONTAINER) {
+		/* Must be a fresh device to add to a container */
+		return validate_geometry_ddf_container(st, level, layout,
+						       raiddisks, *chunk,
+						       size, data_offset, dev,
+						       freesize,
+						       verbose);
+	}
+
+	if (!dev) {
+		mdu_array_info_t array = {
+			.level = level,
+			.layout = layout,
+			.raid_disks = raiddisks
+		};
+		struct vd_config conf;
+		if (layout_md2ddf(&array, &conf) == -1) {
+			if (verbose)
+				pr_err("DDF does not support level %d /layout %d arrays with %d disks\n",
+				       level, layout, raiddisks);
+			return 0;
+		}
+		/* Should check layout? etc */
+
+		if (st->sb && freesize) {
+			/* --create was given a container to create in.
+			 * So we need to check that there are enough
+			 * free spaces and return the amount of space.
+			 * We may as well remember which drives were
+			 * chosen so that add_to_super/getinfo_super
+			 * can return them.
+			 */
+			return reserve_space(st, raiddisks, size, *chunk,
+					     data_offset, freesize);
+		}
+		return 1;
+	}
+
+	if (st->sb) {
+		/* A container has already been opened, so we are
+		 * creating in there.  Maybe a BVD, maybe an SVD.
+		 * Should make a distinction one day.
+		 */
+		return validate_geometry_ddf_bvd(st, level, layout, raiddisks,
+						 chunk, size, data_offset, dev,
+						 freesize,
+						 verbose);
+	}
+	/* This is the first device for the array.
+	 * If it is a container, we read it in and do automagic allocations,
+	 * no other devices should be given.
+	 * Otherwise it must be a member device of a container, and we
+	 * do manual allocation.
+	 * Later we should check for a BVD and make an SVD.
+	 */
+	fd = open(dev, O_RDONLY|O_EXCL, 0);
+	if (fd >= 0) {
+		close(fd);
+		/* Just a bare device, no good to us */
+		if (verbose)
+			pr_err("ddf: Cannot create this array on device %s - a container is required.\n",
+			       dev);
+		return 0;
+	}
+	if (errno != EBUSY || (fd = open(dev, O_RDONLY, 0)) < 0) {
+		if (verbose)
+			pr_err("ddf: Cannot open %s: %s\n",
+			       dev, strerror(errno));
+		return 0;
+	}
+	/* Well, it is in use by someone, maybe a 'ddf' container. */
+	cfd = open_container(fd);
+	if (cfd < 0) {
+		close(fd);
+		if (verbose)
+			pr_err("ddf: Cannot use %s: %s\n",
+			       dev, strerror(EBUSY));
+		return 0;
+	}
+	/* NOTE(review): 'sra' is not released on any path below; the
+	 * matching release helper is not visible from here - confirm.
+	 */
+	sra = sysfs_read(cfd, NULL, GET_VERSION);
+	close(fd);
+	if (!sra || sra->array.major_version != -1 ||
+	    strcmp(sra->text_version, "ddf") != 0) {
+		/* device may belong to a different container.
+		 * The container fd is ours, so it must not be left open.
+		 */
+		close(cfd);
+		return 0;
+	}
+
+	/* This is a member of a ddf container.  Load the container
+	 * and try to create a bvd
+	 */
+	if (load_super_ddf_all(st, cfd, (void **)&ddf, NULL) == 0) {
+		st->sb = ddf;
+		strcpy(st->container_devnm, fd2devnm(cfd));
+		close(cfd);
+		return validate_geometry_ddf_bvd(st, level, layout,
+						 raiddisks, chunk, size,
+						 data_offset,
+						 dev, freesize,
+						 verbose);
+	}
+	close(cfd);
+
+	return 1;
+}
+
+/*
+ * Check whether 'dev' can be used as a fresh member of a DDF container.
+ * Returns 1 if so, 0 if not.
+ *
+ * Only LEVEL_CONTAINER is accepted.  Without a device there is nothing
+ * more to check.  Otherwise the device must open exclusively and report
+ * a size; when 'freesize' is supplied it receives what avail_size_ddf()
+ * leaves usable, and a result of 0 is a failure.
+ */
+static int
+validate_geometry_ddf_container(struct supertype *st,
+				int level, int layout, int raiddisks,
+				int chunk, unsigned long long size,
+				unsigned long long data_offset,
+				char *dev, unsigned long long *freesize,
+				int verbose)
+{
+	unsigned long long dev_bytes;
+	int devfd;
+
+	if (level != LEVEL_CONTAINER)
+		return 0;
+	if (dev == NULL)
+		return 1;
+
+	devfd = dev_open(dev, O_RDONLY|O_EXCL);
+	if (devfd < 0) {
+		if (verbose)
+			pr_err("ddf: Cannot open %s: %s\n",
+			       dev, strerror(errno));
+		return 0;
+	}
+	if (!get_dev_size(devfd, dev, &dev_bytes)) {
+		close(devfd);
+		return 0;
+	}
+	close(devfd);
+
+	if (freesize == NULL)
+		return 1;
+
+	/* avail_size_ddf() works in 512-byte sectors */
+	*freesize = avail_size_ddf(st, dev_bytes >> 9, INVALID_SECTORS);
+	return *freesize != 0;
+}
+
+static int validate_geometry_ddf_bvd(struct supertype *st,
+ int level, int layout, int raiddisks,
+ int *chunk, unsigned long long size,
+ unsigned long long data_offset,
+ char *dev, unsigned long long *freesize,
+ int verbose)
+{ /* Validate creating a member array (BVD) inside the DDF container
+ * already loaded in st->sb.  Returns 1 if acceptable, 0 if not.
+ * With dev == NULL only a general capacity check is made: at least
+ * 'raiddisks' devices of the container must pass find_space().
+ * With a device named, it must match a dlist entry by major/minor,
+ * and *freesize receives the value find_space() leaves in maxsize.
+ * 'layout' and 'chunk' are not consulted here.
+ * NOTE(review): freesize is dereferenced unconditionally on the
+ * named-device path - confirm callers never pass NULL there.
+ */
+ dev_t rdev;
+ struct ddf_super *ddf = st->sb;
+ struct dl *dl;
+ unsigned long long maxsize;
+ /* ddf/bvd supports lots of things, but not containers */
+ if (level == LEVEL_CONTAINER) {
+ if (verbose)
+ pr_err("DDF cannot create a container within an container\n");
+ return 0;
+ }
+ /* We must have the container info already read in. */
+ if (!ddf)
+ return 0;
+
+ if (!dev) {
+ /* General test: make sure there is space for
+ * 'raiddisks' device extents of size 'size'.
+ * When no size was requested, a minimum of 8 is asked for.
+ */
+ unsigned long long minsize = size;
+ int dcnt = 0;
+ if (minsize == 0)
+ minsize = 8;
+ for (dl = ddf->dlist; dl ; dl = dl->next) {
+ if (find_space(ddf, dl, data_offset, &minsize) !=
+ INVALID_SECTORS)
+ dcnt++;
+ }
+ if (dcnt < raiddisks) {
+ if (verbose)
+ pr_err("ddf: Not enough devices with space for this array (%d < %d)\n",
+ dcnt, raiddisks);
+ return 0;
+ }
+ return 1;
+ }
+ /* This device must be a member of the set */
+ if (!stat_is_blkdev(dev, &rdev))
+ return 0;
+ for (dl = ddf->dlist ; dl ; dl = dl->next) {
+ if (dl->major == (int)major(rdev) &&
+ dl->minor == (int)minor(rdev))
+ break;
+ }
+ if (!dl) {
+ if (verbose)
+ pr_err("ddf: %s is not in the same DDF set\n",
+ dev);
+ return 0;
+ }
+ maxsize = ULLONG_MAX; /* ask find_space() for as much as it can offer */
+ find_space(ddf, dl, data_offset, &maxsize);
+ *freesize = maxsize;
+
+ return 1;
+}
+
+static int load_super_ddf_all(struct supertype *st, int fd,
+ void **sbp, char *devname)
+{ /* Load the DDF metadata for the whole container open on 'fd'.
+ * The member devices are taken from sysfs; the array must report
+ * version -1/-2 with text_version "ddf".
+ * Three passes are made over the members:
+ *  1. read the headers of each and remember the device with the
+ *     highest sequence number (one is subtracted when 'openflag'
+ *     is set in the active header);
+ *  2. reload headers and global sections from that best device;
+ *  3. reload headers and load the device-local sections from
+ *     every member.
+ * On success the new super is stored in *sbp, st->ss/minor_version/
+ * max_devs are filled in if st->ss was unset, the container name
+ * is recorded in st->container_devnm, and 0 is returned.
+ * Failures return 1, or 2 when a member device cannot be opened.
+ * 'devname' is not consulted.
+ * NOTE(review): 'sra' and, on the error returns, 'super' are not
+ * released anywhere in this function - confirm whether that is
+ * intended.
+ */
+ struct mdinfo *sra;
+ struct ddf_super *super;
+ struct mdinfo *sd, *best = NULL;
+ int bestseq = 0;
+ int seq;
+ char nm[20];
+ int dfd;
+
+ sra = sysfs_read(fd, NULL, GET_LEVEL|GET_VERSION|GET_DEVS|GET_STATE);
+ if (!sra)
+ return 1;
+ if (sra->array.major_version != -1 ||
+ sra->array.minor_version != -2 ||
+ strcmp(sra->text_version, "ddf") != 0)
+ return 1;
+
+ if (posix_memalign((void**)&super, 512, sizeof(*super)) != 0)
+ return 1;
+ memset(super, 0, sizeof(*super));
+
+ /* first, try each device, and choose the best ddf */
+ for (sd = sra->devs ; sd ; sd = sd->next) {
+ int rv;
+ sprintf(nm, "%d:%d", sd->disk.major, sd->disk.minor);
+ dfd = dev_open(nm, O_RDONLY);
+ if (dfd < 0)
+ return 2;
+ rv = load_ddf_headers(dfd, super, NULL);
+ close(dfd);
+ if (rv == 0) {
+ seq = be32_to_cpu(super->active->seq);
+ if (super->active->openflag)
+ seq--;
+ if (!best || seq > bestseq) {
+ bestseq = seq;
+ best = sd;
+ }
+ }
+ }
+ if (!best)
+ return 1;
+ /* OK, load this ddf */
+ sprintf(nm, "%d:%d", best->disk.major, best->disk.minor);
+ dfd = dev_open(nm, O_RDONLY);
+ if (dfd < 0)
+ return 1;
+ load_ddf_headers(dfd, super, NULL);
+ load_ddf_global(dfd, super, NULL);
+ close(dfd);
+ /* Now we need the device-local bits.
+ * NOTE(review): dfd is not closed in this loop; presumably the
+ * final argument (1) asks load_ddf_local() to keep the descriptor -
+ * confirm, and confirm what happens to it when the header load fails.
+ */
+ for (sd = sra->devs ; sd ; sd = sd->next) {
+ int rv;
+
+ sprintf(nm, "%d:%d", sd->disk.major, sd->disk.minor);
+ dfd = dev_open(nm, O_RDWR);
+ if (dfd < 0)
+ return 2;
+ rv = load_ddf_headers(dfd, super, NULL);
+ if (rv == 0)
+ rv = load_ddf_local(dfd, super, NULL, 1);
+ if (rv)
+ return 1;
+ }
+
+ *sbp = super;
+ if (st->ss == NULL) { /* caller had not chosen a metadata handler yet */
+ st->ss = &super_ddf;
+ st->minor_version = 0;
+ st->max_devs = 512;
+ }
+ strcpy(st->container_devnm, fd2devnm(fd));
+ return 0;
+}
+
+/*
+ * Load the DDF container open on 'fd' straight into st->sb.
+ * The result of load_super_ddf_all() is passed back unchanged.
+ */
+static int load_container_ddf(struct supertype *st, int fd,
+			      char *devname)
+{
+	int rv;
+
+	rv = load_super_ddf_all(st, fd, &st->sb, devname);
+	return rv;
+}
+
+/*
+ * Check whether the secondary RAID configuration of 'vc' is one that md
+ * can handle.  Returns 0 if it is, -1 (usually after printing a message)
+ * if it is not or if any BVD of the set has not been seen.
+ */
+static int check_secondary(const struct vcl *vc)
+{
+	const struct vd_config *conf = &vc->conf;
+	int i;
+
+	/* The only DDF secondary RAID level md can support is
+	 * RAID 10, if the stripe sizes and Basic volume sizes
+	 * are all equal.
+	 * Other configurations could in theory be supported by exposing
+	 * the BVDs to user space and using device mapper for the secondary
+	 * mapping. So far we don't support that.
+	 */
+
+	/* Bitmap of the sec_elmnt_seq values seen so far, 64 per word.
+	 * The constant being shifted must be 64 bits wide: shifting a
+	 * plain int by up to 63 is undefined and would lose every
+	 * sequence number whose low six bits are 31 or more.
+	 */
+	__u64 sec_elements[4] = {0, 0, 0, 0};
+#define __set_sec_seen(n) (sec_elements[(n)>>6] |= ((__u64)1<<((n)&63)))
+#define __was_sec_seen(n) ((sec_elements[(n)>>6] & ((__u64)1<<((n)&63))) != 0)
+
+	if (vc->other_bvds == NULL) {
+		pr_err("No BVDs for secondary RAID found\n");
+		return -1;
+	}
+	if (conf->prl != DDF_RAID1) {
+		pr_err("Secondary RAID level only supported for mirrored BVD\n");
+		return -1;
+	}
+	if (conf->srl != DDF_2STRIPED && conf->srl != DDF_2SPANNED) {
+		pr_err("Secondary RAID level %d is unsupported\n",
+		       conf->srl);
+		return -1;
+	}
+	__set_sec_seen(conf->sec_elmnt_seq);
+	for (i = 0; i < conf->sec_elmnt_count-1; i++) {
+		const struct vd_config *bvd = vc->other_bvds[i];
+		if (bvd->sec_elmnt_seq == DDF_UNUSED_BVD)
+			continue;
+		/* Every other BVD must agree with the first one */
+		if (bvd->srl != conf->srl) {
+			pr_err("Inconsistent secondary RAID level across BVDs\n");
+			return -1;
+		}
+		if (bvd->prl != conf->prl) {
+			pr_err("Different RAID levels for BVDs are unsupported\n");
+			return -1;
+		}
+		if (!be16_eq(bvd->prim_elmnt_count, conf->prim_elmnt_count)) {
+			pr_err("All BVDs must have the same number of primary elements\n");
+			return -1;
+		}
+		if (bvd->chunk_shift != conf->chunk_shift) {
+			pr_err("Different strip sizes for BVDs are unsupported\n");
+			return -1;
+		}
+		if (!be64_eq(bvd->array_blocks, conf->array_blocks)) {
+			pr_err("Different BVD sizes are unsupported\n");
+			return -1;
+		}
+		__set_sec_seen(bvd->sec_elmnt_seq);
+	}
+	/* Every sequence number 0..sec_elmnt_count-1 must have shown up */
+	for (i = 0; i < conf->sec_elmnt_count; i++) {
+		if (!__was_sec_seen(i)) {
+			/* BVD 'i' is missing; no message is printed */
+			return -1;
+		}
+	}
+	return 0;
+}
+
+static unsigned int get_pd_index_from_refnum(const struct vcl *vc,
+ be32 refnum, unsigned int nmax,
+ const struct vd_config **bvd,
+ unsigned int *idx)
+{ /* Look for the physical disk 'refnum' in the BVD(s) of 'vc',
+ * examining at most 'nmax' phys_refnum slots per BVD.
+ * On a match, *bvd is the BVD that lists the disk, *idx the slot
+ * inside that BVD's phys_refnum[], and the return value is
+ * sec * cnt + j, where 'sec' is the BVD's sequence number (0 when
+ * there is only one BVD), 'cnt' the primary element count of the
+ * first BVD and 'j' the number of earlier slots in that BVD that
+ * are not 0xffffffff.
+ * When nothing matches, *bvd is set to NULL, *idx is left alone
+ * and DDF_NOTFOUND is returned.
+ */
+ unsigned int i, j, n, sec, cnt;
+
+ cnt = be16_to_cpu(vc->conf.prim_elmnt_count);
+ sec = (vc->conf.sec_elmnt_count == 1 ? 0 : vc->conf.sec_elmnt_seq);
+
+ for (i = 0, j = 0 ; i < nmax ; i++) {
+ /* j counts valid entries for this BVD */
+ if (be32_eq(vc->conf.phys_refnum[i], refnum)) {
+ *bvd = &vc->conf;
+ *idx = i;
+ return sec * cnt + j;
+ }
+ if (be32_to_cpu(vc->conf.phys_refnum[i]) != 0xffffffff)
+ j++;
+ }
+ if (vc->other_bvds == NULL)
+ goto bad;
+
+ for (n = 1; n < vc->conf.sec_elmnt_count; n++) { /* the remaining BVDs */
+ struct vd_config *vd = vc->other_bvds[n-1];
+ sec = vd->sec_elmnt_seq;
+ if (sec == DDF_UNUSED_BVD)
+ continue;
+ for (i = 0, j = 0 ; i < nmax ; i++) {
+ if (be32_eq(vd->phys_refnum[i], refnum)) {
+ *bvd = vd;
+ *idx = i;
+ return sec * cnt + j;
+ }
+ if (be32_to_cpu(vd->phys_refnum[i]) != 0xffffffff)
+ j++;
+ }
+ }
+bad:
+ *bvd = NULL;
+ return DDF_NOTFOUND;
+}
+
+static struct mdinfo *container_content_ddf(struct supertype *st, char *subarray)
+{
+ /* Given a container loaded by load_super_ddf_all,
+ * extract information about all the arrays into
+ * an mdinfo tree.
+ *
+ * For each vcl in conflist: create an mdinfo, fill it in,
+ * then look for matching devices (phys_refnum) in dlist
+ * and create appropriate device mdinfo.
+ *
+ * If 'subarray' is given, only the VD whose number equals that
+ * decimal string is reported, and ddf->currentconf is left
+ * pointing at it; otherwise currentconf ends up NULL.
+ * VDs with more than one BVD are skipped unless
+ * check_secondary() accepts them.
+ * Returns the head of the list (each new entry is put in front),
+ * or NULL when nothing was reported.
+ * NOTE(review): when layout_ddf2md() fails the entry has already
+ * been linked into the list and is returned otherwise zeroed -
+ * confirm this is intended.
+ */
+ struct ddf_super *ddf = st->sb;
+ struct mdinfo *rest = NULL;
+ struct vcl *vc;
+
+ for (vc = ddf->conflist ; vc ; vc=vc->next) {
+ unsigned int i;
+ struct mdinfo *this;
+ char *ep;
+ __u32 *cptr;
+ unsigned int pd;
+
+ if (subarray &&
+ (strtoul(subarray, &ep, 10) != vc->vcnum ||
+ *ep != '\0'))
+ continue;
+
+ if (vc->conf.sec_elmnt_count > 1) {
+ if (check_secondary(vc) != 0)
+ continue;
+ }
+
+ this = xcalloc(1, sizeof(*this));
+ this->next = rest;
+ rest = this;
+
+ if (layout_ddf2md(&vc->conf, &this->array))
+ continue;
+ this->array.md_minor = -1;
+ this->array.major_version = -1;
+ this->array.minor_version = -2;
+ this->safe_mode_delay = DDF_SAFE_MODE_DELAY;
+ cptr = (__u32 *)(vc->conf.guid + 16); /* creation time is read from bytes 16..19 of the GUID */
+ this->array.ctime = DECADE + __be32_to_cpu(*cptr);
+ this->array.utime = DECADE +
+ be32_to_cpu(vc->conf.timestamp);
+ this->array.chunk_size = 512 << vc->conf.chunk_shift;
+
+ i = vc->vcnum;
+ if ((ddf->virt->entries[i].state & DDF_state_inconsistent) ||
+ (ddf->virt->entries[i].init_state & DDF_initstate_mask) !=
+ DDF_init_full) { /* inconsistent or not fully initialised: resync from 0 */
+ this->array.state = 0;
+ this->resync_start = 0;
+ } else {
+ this->array.state = 1;
+ this->resync_start = MaxSector;
+ }
+ _ddf_array_name(this->name, ddf, i);
+ memset(this->uuid, 0, sizeof(this->uuid));
+ this->component_size = be64_to_cpu(vc->conf.blocks);
+ this->array.size = this->component_size / 2; /* presumably sectors -> KiB - confirm */
+ this->container_member = i;
+
+ ddf->currentconf = vc; /* uuid_from_super_ddf() is called with currentconf pointing at this VD */
+ uuid_from_super_ddf(st, this->uuid);
+ if (!subarray)
+ ddf->currentconf = NULL;
+
+ sprintf(this->text_version, "/%s/%d",
+ st->container_devnm, this->container_member);
+
+ for (pd = 0; pd < be16_to_cpu(ddf->phys->max_pdes); pd++) {
+ struct mdinfo *dev;
+ struct dl *d;
+ const struct vd_config *bvd;
+ unsigned int iphys;
+ int stt;
+
+ if (be32_to_cpu(ddf->phys->entries[pd].refnum) ==
+ 0xffffffff) /* unused table entry */
+ continue;
+
+ stt = be16_to_cpu(ddf->phys->entries[pd].state);
+ if ((stt & (DDF_Online|DDF_Failed|DDF_Rebuilding)) !=
+ DDF_Online) /* only disks that are online and neither failed nor rebuilding */
+ continue;
+
+ i = get_pd_index_from_refnum(
+ vc, ddf->phys->entries[pd].refnum,
+ ddf->mppe, &bvd, &iphys);
+ if (i == DDF_NOTFOUND) /* disk is not part of this VD */
+ continue;
+
+ this->array.working_disks++;
+
+ for (d = ddf->dlist; d ; d=d->next)
+ if (be32_eq(d->disk.refnum,
+ ddf->phys->entries[pd].refnum))
+ break;
+ if (d == NULL)
+ /* Haven't found that one yet, maybe there are others */
+ continue;
+
+ dev = xcalloc(1, sizeof(*dev));
+ dev->next = this->devs;
+ this->devs = dev;
+
+ dev->disk.number = be32_to_cpu(d->disk.refnum);
+ dev->disk.major = d->major;
+ dev->disk.minor = d->minor;
+ dev->disk.raid_disk = i;
+ dev->disk.state = (1<<MD_DISK_SYNC)|(1<<MD_DISK_ACTIVE);
+ dev->recovery_start = MaxSector;
+
+ dev->events = be32_to_cpu(ddf->active->seq);
+ dev->data_offset =
+ be64_to_cpu(LBA_OFFSET(ddf, bvd)[iphys]);
+ dev->component_size = be64_to_cpu(bvd->blocks);
+ if (d->devname)
+ strcpy(dev->name, d->devname); /* NOTE(review): unbounded copy - confirm dev->name is large enough */
+ }
+ }
+ return rest;
+}
+
+/*
+ * Write the DDF metadata held in st->sb to the device open on 'fd'.
+ * Returns 0 on success and 1 on failure.
+ *
+ * When the super block knows about devices or configurations, 'fd' must be
+ * a block device that appears in the device list; the metadata is written
+ * through that entry with its descriptor temporarily replaced by 'fd'.
+ * Otherwise the last 512 bytes of the device are overwritten with zeros.
+ */
+static int store_super_ddf(struct supertype *st, int fd)
+{
+	struct ddf_super *ddf = st->sb;
+	unsigned long long dsize;
+	void *buf;
+	int rc;
+
+	if (!ddf)
+		return 1;
+
+	if (!get_dev_size(fd, NULL, &dsize))
+		return 1;
+
+	if (ddf->dlist || ddf->conflist) {
+		struct stat sta;
+		struct dl *dl;
+		int ofd, ret;
+
+		if (fstat(fd, &sta) == -1 || !S_ISBLK(sta.st_mode)) {
+			pr_err("file descriptor for invalid device\n");
+			return 1;
+		}
+		for (dl = ddf->dlist; dl; dl = dl->next)
+			if (dl->major == (int)major(sta.st_rdev) &&
+			    dl->minor == (int)minor(sta.st_rdev))
+				break;
+		if (!dl) {
+			pr_err("couldn't find disk %d/%d\n",
+			       (int)major(sta.st_rdev),
+			       (int)minor(sta.st_rdev));
+			return 1;
+		}
+		/* Borrow the list entry, but write through the caller's fd */
+		ofd = dl->fd;
+		dl->fd = fd;
+		ret = (_write_super_to_disk(ddf, dl) != 1);
+		dl->fd = ofd;
+		return ret;
+	}
+
+	if (posix_memalign(&buf, 512, 512) != 0)
+		return 1;
+	memset(buf, 0, 512);
+
+	/* If the seek fails the file offset is wherever it was before, so
+	 * writing anyway could zero an arbitrary block of the device.
+	 */
+	if (lseek64(fd, dsize-512, SEEK_SET) < 0) {
+		free(buf);
+		return 1;
+	}
+	rc = write(fd, buf, 512);
+	free(buf);
+	/* Anything but a complete block is a failure */
+	if (rc != 512)
+		return 1;
+	return 0;
+}
+
+/*
+ * Compare the DDF metadata loaded in 'tst' with that in 'st' and, when both
+ * belong to the same container, merge the device-local information of 'tst'
+ * (configuration records and disk entries not yet known) into 'st'.
+ * If 'st' has no metadata yet it simply takes over that of 'tst'.
+ * 'verbose' is not consulted.
+ */
+static int compare_super_ddf(struct supertype *st, struct supertype *tst,
+			     int verbose)
+{
+	/*
+	 * return:
+	 *  0 same, or first was empty, and second was copied
+	 *  1 second had wrong magic number - but that isn't possible
+	 *  2 wrong uuid
+	 *  3 wrong other info
+	 *
+	 * 3 is also what an allocation failure returns.  A sequence
+	 * number mismatch returns 0 without merging anything.
+	 */
+	struct ddf_super *first = st->sb;
+	struct ddf_super *second = tst->sb;
+	struct dl *dl1, *dl2;
+	struct vcl *vl1, *vl2;
+	unsigned int max_vds, max_pds, pd, vd;
+
+	if (!first) {
+		st->sb = tst->sb;
+		tst->sb = NULL;
+		return 0;
+	}
+
+	if (memcmp(first->anchor.guid, second->anchor.guid, DDF_GUID_LEN) != 0)
+		return 2;
+
+	/* It is only OK to compare info in the anchor.  Anything else
+	 * could be changing due to a reconfig so must be ignored.
+	 * guid really should be enough anyway.
+	 */
+
+	if (!be32_eq(first->active->seq, second->active->seq)) {
+		dprintf("sequence number mismatch %u<->%u\n",
+			be32_to_cpu(first->active->seq),
+			be32_to_cpu(second->active->seq));
+		return 0;
+	}
+
+	/*
+	 * At this point we are fairly sure that the meta data matches.
+	 * But the new disk may contain additional local data.
+	 * Add it to the super block.
+	 */
+	max_vds = be16_to_cpu(first->active->max_vd_entries);
+	max_pds = be16_to_cpu(first->phys->max_pdes);
+	for (vl2 = second->conflist; vl2; vl2 = vl2->next) {
+		for (vl1 = first->conflist; vl1; vl1 = vl1->next)
+			if (!memcmp(vl1->conf.guid, vl2->conf.guid,
+				    DDF_GUID_LEN))
+				break;
+		if (vl1) {
+			/* Known VD: at most a further BVD needs recording */
+			if (vl1->other_bvds != NULL &&
+			    vl1->conf.sec_elmnt_seq !=
+			    vl2->conf.sec_elmnt_seq) {
+				dprintf("adding BVD %u\n",
+					vl2->conf.sec_elmnt_seq);
+				add_other_bvd(vl1, &vl2->conf,
+					      first->conf_rec_len*512);
+			}
+			continue;
+		}
+
+		/* Unknown VD: copy its configuration record */
+		if (posix_memalign((void **)&vl1, 512,
+				   (first->conf_rec_len*512 +
+				    offsetof(struct vcl, conf))) != 0) {
+			pr_err("could not allocate vcl buf\n");
+			return 3;
+		}
+
+		vl1->next = first->conflist;
+		vl1->block_sizes = NULL;
+		memcpy(&vl1->conf, &vl2->conf, first->conf_rec_len*512);
+		if (alloc_other_bvds(first, vl1) != 0) {
+			pr_err("could not allocate other bvds\n");
+			free(vl1);
+			return 3;
+		}
+		for (vd = 0; vd < max_vds; vd++)
+			if (!memcmp(first->virt->entries[vd].guid,
+				    vl1->conf.guid, DDF_GUID_LEN))
+				break;
+		vl1->vcnum = vd;
+		dprintf("added config for VD %u\n", vl1->vcnum);
+		first->conflist = vl1;
+	}
+
+	for (dl2 = second->dlist; dl2; dl2 = dl2->next) {
+		for (dl1 = first->dlist; dl1; dl1 = dl1->next)
+			if (be32_eq(dl1->disk.refnum, dl2->disk.refnum))
+				break;
+		if (dl1)
+			continue;
+
+		/* Unknown disk: copy its entry, without the open fd */
+		if (posix_memalign((void **)&dl1, 512,
+				   sizeof(*dl1) + (first->max_part) *
+				   sizeof(dl1->vlist[0])) != 0) {
+			pr_err("could not allocate disk info buffer\n");
+			return 3;
+		}
+		memcpy(dl1, dl2, sizeof(*dl1));
+		dl1->mdupdate = NULL;
+		dl1->next = first->dlist;
+		dl1->fd = -1;
+		for (pd = 0; pd < max_pds; pd++)
+			if (be32_eq(first->phys->entries[pd].refnum,
+				    dl1->disk.refnum))
+				break;
+		dl1->pdnum = pd < max_pds ? (int)pd : -1;
+		if (dl2->spare) {
+			if (posix_memalign((void **)&dl1->spare, 512,
+					   first->conf_rec_len*512) != 0) {
+				pr_err("could not allocate spare info buf\n");
+				/* dl1 is not linked anywhere yet */
+				free(dl1);
+				return 3;
+			}
+			memcpy(dl1->spare, dl2->spare, first->conf_rec_len*512);
+		}
+		for (vd = 0 ; vd < first->max_part ; vd++) {
+			if (!dl2->vlist[vd]) {
+				dl1->vlist[vd] = NULL;
+				continue;
+			}
+			/* Point the slot at first's own record for the
+			 * same VD.  The assignment has to happen after
+			 * the search: doing it inside the loop stores
+			 * the entries that did NOT match and leaves the
+			 * slot uninitialised when the head matches.
+			 */
+			for (vl1 = first->conflist; vl1; vl1 = vl1->next)
+				if (!memcmp(vl1->conf.guid,
+					    dl2->vlist[vd]->conf.guid,
+					    DDF_GUID_LEN))
+					break;
+			dl1->vlist[vd] = vl1; /* NULL when first has no such VD */
+		}
+		first->dlist = dl1;
+		dprintf("added disk %d: %08x\n", dl1->pdnum,
+			be32_to_cpu(dl1->disk.refnum));
+	}
+
+	return 0;
+}
+
+/*
+ * A new array 'a' has been started which claims to be instance 'inst'
+ * within container 'c'.
+ * We need to confirm that the array matches the metadata in 'c' so
+ * that we don't corrupt any metadata.
+ */
+static int ddf_open_new(struct supertype *c, struct active_array *a, int inst)
+{ /* Returns 0 when the array is accepted, in which case
+ * a->info.container_member is set to 'inst'.
+ * Returns -ENODEV when virtual disk entry 'inst' has an all-0xff
+ * GUID, and -1 when one of the array's devices is not in the
+ * container's device list or has no physical disk entry.
+ * A device whose physical disk state is not plain "online"
+ * (i.e. missing or failed is set, or online is clear) does not
+ * reject the array; it is marked faulty instead.
+ */
+ struct ddf_super *ddf = c->sb;
+ struct mdinfo *dev;
+ struct dl *dl;
+ static const char faulty[] = "faulty";
+
+ if (all_ff(ddf->virt->entries[inst].guid)) {
+ pr_err("subarray %d doesn't exist\n", inst);
+ return -ENODEV;
+ }
+ dprintf("new subarray %d, GUID: %s\n", inst,
+ guid_str(ddf->virt->entries[inst].guid));
+ for (dev = a->info.devs; dev; dev = dev->next) {
+ for (dl = ddf->dlist; dl; dl = dl->next)
+ if (dl->major == dev->disk.major &&
+ dl->minor == dev->disk.minor)
+ break;
+ if (!dl || dl->pdnum < 0) {
+ pr_err("device %d/%d of subarray %d not found in meta data\n",
+ dev->disk.major, dev->disk.minor, inst);
+ return -1;
+ }
+ if ((be16_to_cpu(ddf->phys->entries[dl->pdnum].state) &
+ (DDF_Online|DDF_Missing|DDF_Failed)) != DDF_Online) {
+ pr_err("new subarray %d contains broken device %d/%d (%02x)\n",
+ inst, dl->major, dl->minor,
+ be16_to_cpu(ddf->phys->entries[dl->pdnum].state));
+ if (write(dev->state_fd, faulty, sizeof(faulty)-1) !=
+ sizeof(faulty) - 1) /* the string is written without its trailing NUL */
+ pr_err("Write to state_fd failed\n");
+ dev->curr_state = DS_FAULTY;
+ }
+ }
+ a->info.container_member = inst;
+ return 0;
+}
+
+static void handle_missing(struct ddf_super *ddf, struct active_array *a, int inst)
+{
+ /* This member array is being activated. If any devices
+ * are missing they must now be marked as failed.
+ *
+ * Slots n = 0, 1, ... of virtual disk 'inst' are walked until
+ * find_vdcr() returns NULL.  A slot whose refnum has no entry in
+ * ddf->dlist counts as missing: if its physical disk entry is
+ * still online it is switched to failed+missing and the slot's
+ * refnum is set to 0.  The VD state is then recomputed with
+ * get_svd_state(); when it changes, a->check_degraded is set.
+ * Every change flags the metadata as needing to be written.
+ */
+ struct vd_config *vc;
+ unsigned int n_bvd;
+ struct vcl *vcl;
+ struct dl *dl;
+ int pd;
+ int n;
+ int state;
+
+ for (n = 0; ; n++) {
+ vc = find_vdcr(ddf, inst, n, &n_bvd, &vcl);
+ if (!vc)
+ break;
+ for (dl = ddf->dlist; dl; dl = dl->next)
+ if (be32_eq(dl->disk.refnum, vc->phys_refnum[n_bvd]))
+ break;
+ if (dl)
+ /* Found this disk, so not missing */
+ continue;
+
+ /* Mark the device as failed/missing. */
+ pd = find_phys(ddf, vc->phys_refnum[n_bvd]);
+ if (pd >= 0 && be16_and(ddf->phys->entries[pd].state,
+ cpu_to_be16(DDF_Online))) {
+ be16_clear(ddf->phys->entries[pd].state,
+ cpu_to_be16(DDF_Online));
+ be16_set(ddf->phys->entries[pd].state,
+ cpu_to_be16(DDF_Failed|DDF_Missing));
+ vc->phys_refnum[n_bvd] = cpu_to_be32(0);
+ ddf_set_updates_pending(ddf, vc);
+ }
+
+ /* Mark the array as Degraded (or whatever get_svd_state()
+ * now reports); only the DDF_state_mask bits are replaced.
+ */
+ state = get_svd_state(ddf, vcl);
+ if (ddf->virt->entries[inst].state !=
+ ((ddf->virt->entries[inst].state & ~DDF_state_mask)
+ | state)) {
+ ddf->virt->entries[inst].state =
+ (ddf->virt->entries[inst].state & ~DDF_state_mask)
+ | state;
+ a->check_degraded = 1;
+ ddf_set_updates_pending(ddf, vc);
+ }
+ }
+}
+
+/*
+ * The array 'a' is to be marked clean in the metadata.
+ * If '->resync_start' is not ~(unsigned long long)0, then the array is only
+ * clean up to that point (in sectors). If that cannot be recorded in the
+ * metadata, then leave it as dirty.
+ *
+ * For DDF, we need to clear the DDF_state_inconsistent bit in the
+ * !global! virtual_disk.virtual_entry structure.
+ */
+static int ddf_set_array_state(struct active_array *a, int consistent)
+{ /* Record the clean/dirty state of array 'a' in its virtual disk
+ * entry and return the state that was recorded (0 dirty,
+ * non-zero clean).
+ * consistent == 2 is a special request: handle_missing() is run
+ * first and the array is then treated as clean only if
+ * is_resync_complete() says so.
+ * The init_state field is refreshed as well: "full" once resync
+ * is complete, "not" while resync_start is 0, "quick" otherwise.
+ * Any change flags the metadata as needing to be written.
+ */
+ struct ddf_super *ddf = a->container->sb;
+ int inst = a->info.container_member;
+ int old = ddf->virt->entries[inst].state;
+ if (consistent == 2) {
+ handle_missing(ddf, a, inst);
+ consistent = 1;
+ if (!is_resync_complete(&a->info))
+ consistent = 0;
+ }
+ if (consistent)
+ ddf->virt->entries[inst].state &= ~DDF_state_inconsistent;
+ else
+ ddf->virt->entries[inst].state |= DDF_state_inconsistent;
+ if (old != ddf->virt->entries[inst].state)
+ ddf_set_updates_pending(ddf, NULL);
+
+ old = ddf->virt->entries[inst].init_state; /* 'old' is reused for the init_state comparison */
+ ddf->virt->entries[inst].init_state &= ~DDF_initstate_mask;
+ if (is_resync_complete(&a->info))
+ ddf->virt->entries[inst].init_state |= DDF_init_full;
+ else if (a->info.resync_start == 0)
+ ddf->virt->entries[inst].init_state |= DDF_init_not;
+ else
+ ddf->virt->entries[inst].init_state |= DDF_init_quick;
+ if (old != ddf->virt->entries[inst].init_state)
+ ddf_set_updates_pending(ddf, NULL);
+
+ dprintf("ddf mark %d/%s (%d) %s %llu\n", inst,
+ guid_str(ddf->virt->entries[inst].guid), a->curr_state,
+ consistent?"clean":"dirty",
+ a->info.resync_start);
+ return consistent;
+}
+
+/*
+ * Work out the DDF state (one of the DDF_state_* values) of the single
+ * BVD described by 'vc'.
+ *
+ * A primary element counts as working when its physical disk is found and
+ * is online without being failed or rebuilding.  With every element working
+ * the BVD is optimal; otherwise the result depends on the primary RAID
+ * level and defaults to degraded.
+ */
+static int get_bvd_state(const struct ddf_super *ddf,
+			 const struct vd_config *vc)
+{
+	unsigned int i, n_bvd, working = 0;
+	unsigned int n_prim = be16_to_cpu(vc->prim_elmnt_count);
+	int pd, st, state;
+	/* One flag per primary element, consumed by enough() below */
+	char *avail = xcalloc(1, n_prim);
+	mdu_array_info_t array;
+
+	layout_ddf2md(vc, &array);
+
+	for (i = 0; i < n_prim; i++) {
+		if (!find_index_in_bvd(ddf, vc, i, &n_bvd))
+			continue;
+		pd = find_phys(ddf, vc->phys_refnum[n_bvd]);
+		if (pd < 0)
+			continue;
+		st = be16_to_cpu(ddf->phys->entries[pd].state);
+		if ((st & (DDF_Online|DDF_Failed|DDF_Rebuilding)) ==
+		    DDF_Online) {
+			working++;
+			avail[i] = 1;
+		}
+	}
+
+	state = DDF_state_degraded;
+	if (working == n_prim)
+		state = DDF_state_optimal;
+	else
+		switch (vc->prl) {
+		case DDF_RAID0:
+		case DDF_CONCAT:
+		case DDF_JBOD:
+			/* no redundancy: any loss is fatal */
+			state = DDF_state_failed;
+			break;
+		case DDF_RAID1:
+			if (working == 0)
+				state = DDF_state_failed;
+			else if (working >= 2)
+				state = DDF_state_part_optimal;
+			break;
+		case DDF_RAID1E:
+			if (!enough(10, n_prim, array.layout, 1, avail))
+				state = DDF_state_failed;
+			break;
+		case DDF_RAID4:
+		case DDF_RAID5:
+			if (working < n_prim - 1)
+				state = DDF_state_failed;
+			break;
+		case DDF_RAID6:
+			if (working < n_prim - 2)
+				state = DDF_state_failed;
+			else if (working == n_prim - 1)
+				state = DDF_state_part_optimal;
+			break;
+		}
+	/* The availability map is purely local; release it on every call */
+	free(avail);
+	return state;
+}
+
+/*
+ * Combine the states of two BVDs ('state' and 'other') into the state of
+ * the secondary virtual disk built from them at secondary level 'seclevel'.
+ *
+ * Two optimal halves are always optimal.  For DDF_2MIRRORED one optimal
+ * half gives part_optimal and only two failed halves give failed; every
+ * other secondary level fails as soon as one half has failed.
+ */
+static int secondary_state(int state, int other, int seclevel)
+{
+	int combined;
+
+	if (state == DDF_state_optimal && other == DDF_state_optimal) {
+		combined = DDF_state_optimal;
+	} else if (seclevel == DDF_2MIRRORED) {
+		if (state == DDF_state_optimal || other == DDF_state_optimal)
+			combined = DDF_state_part_optimal;
+		else if (state == DDF_state_failed &&
+			 other == DDF_state_failed)
+			combined = DDF_state_failed;
+		else
+			combined = DDF_state_degraded;
+	} else {
+		if (state == DDF_state_failed || other == DDF_state_failed)
+			combined = DDF_state_failed;
+		else if (state == DDF_state_degraded ||
+			 other == DDF_state_degraded)
+			combined = DDF_state_degraded;
+		else
+			combined = DDF_state_part_optimal;
+	}
+	return combined;
+}
+
+/*
+ * State of a whole virtual disk: the state of its first BVD folded, one
+ * BVD at a time, with the states of the entries in other_bvds[] using
+ * secondary_state() and the VD's secondary RAID level.
+ */
+static int get_svd_state(const struct ddf_super *ddf, const struct vcl *vcl)
+{
+	unsigned int k;
+	int combined = get_bvd_state(ddf, &vcl->conf);
+
+	/* other_bvds[] holds sec_elmnt_count - 1 entries */
+	for (k = 0; k + 1 < vcl->conf.sec_elmnt_count; k++) {
+		int bvd_state = get_bvd_state(ddf, vcl->other_bvds[k]);
+
+		combined = secondary_state(combined, bvd_state,
+					   vcl->conf.srl);
+	}
+	return combined;
+}
+
+/*
+ * The state of each disk is stored in the global phys_disk structure
+ * in phys_disk.entries[n].state.
+ * This makes various combinations awkward.
+ * - When a device fails in any array, it must be failed in all arrays
+ * that include a part of this device.
+ * - When a component is rebuilding, we cannot include it officially in the
+ * array unless this is the only array that uses the device.
+ *
+ * So: when transitioning:
+ * Online -> failed, just set failed flag. monitor will propagate
+ * spare -> online, the device might need to be added to the array.
+ * spare -> failed, just set failed. Don't worry if in array or not.
+ */
+static void ddf_set_disk(struct active_array *a, int n, int state)
+{ /* Record in the metadata that raid disk 'n' of array 'a' now has
+ * the DS_* flags given in 'state'.
+ * The slot is resolved three ways: to its BVD entry (find_vdcr),
+ * to the array's mdinfo device, and to the container's dl entry.
+ * If the slot's refnum does not resolve to that dl's physical
+ * disk and the device is in-sync and not faulty, the device is
+ * inserted into the slot (refnum, LBA offset, type flags).
+ * Otherwise the existing physical disk state is updated.
+ * Finally the VD state is recomputed with get_svd_state().
+ * Any change flags the metadata as needing to be written.
+ */
+ struct ddf_super *ddf = a->container->sb;
+ unsigned int inst = a->info.container_member, n_bvd;
+ struct vcl *vcl;
+ struct vd_config *vc = find_vdcr(ddf, inst, (unsigned int)n,
+ &n_bvd, &vcl);
+ int pd;
+ struct mdinfo *mdi;
+ struct dl *dl;
+ int update = 0;
+
+ dprintf("%d to %x\n", n, state);
+ if (vc == NULL) {
+ dprintf("ddf: cannot find instance %d!!\n", inst);
+ return;
+ }
+ /* Find the matching slot in 'info'. */
+ for (mdi = a->info.devs; mdi; mdi = mdi->next)
+ if (mdi->disk.raid_disk == n)
+ break;
+ if (!mdi) {
+ pr_err("cannot find raid disk %d\n", n);
+ return;
+ }
+
+ /* and find the 'dl' entry corresponding to that.
+ * A device without an open state_fd never matches.
+ */
+ for (dl = ddf->dlist; dl; dl = dl->next)
+ if (mdi->state_fd >= 0 &&
+ mdi->disk.major == dl->major &&
+ mdi->disk.minor == dl->minor)
+ break;
+ if (!dl) {
+ pr_err("cannot find raid disk %d (%d/%d)\n",
+ n, mdi->disk.major, mdi->disk.minor);
+ return;
+ }
+
+ pd = find_phys(ddf, vc->phys_refnum[n_bvd]);
+ if (pd < 0 || pd != dl->pdnum) {
+ /* disk doesn't currently exist or has changed.
+ * If it is now in_sync, insert it. */
+ dprintf("phys disk not found for %d: %d/%d ref %08x\n",
+ dl->pdnum, dl->major, dl->minor,
+ be32_to_cpu(dl->disk.refnum));
+ dprintf("array %u disk %u ref %08x pd %d\n",
+ inst, n_bvd,
+ be32_to_cpu(vc->phys_refnum[n_bvd]), pd);
+ if ((state & DS_INSYNC) && ! (state & DS_FAULTY) &&
+ dl->pdnum >= 0) {
+ pd = dl->pdnum;
+ vc->phys_refnum[n_bvd] = dl->disk.refnum;
+ LBA_OFFSET(ddf, vc)[n_bvd] =
+ cpu_to_be64(mdi->data_offset);
+ be16_clear(ddf->phys->entries[pd].type,
+ cpu_to_be16(DDF_Global_Spare));
+ be16_set(ddf->phys->entries[pd].type,
+ cpu_to_be16(DDF_Active_in_VD));
+ update = 1;
+ }
+ } else {
+ be16 old = ddf->phys->entries[pd].state;
+ if (state & DS_FAULTY)
+ be16_set(ddf->phys->entries[pd].state,
+ cpu_to_be16(DDF_Failed));
+ if (state & DS_INSYNC) {
+ be16_set(ddf->phys->entries[pd].state,
+ cpu_to_be16(DDF_Online));
+ be16_clear(ddf->phys->entries[pd].state,
+ cpu_to_be16(DDF_Rebuilding));
+ }
+ if (!be16_eq(old, ddf->phys->entries[pd].state))
+ update = 1;
+ }
+
+ dprintf("ddf: set_disk %d (%08x) to %x->%02x\n", n,
+ be32_to_cpu(dl->disk.refnum), state,
+ be16_to_cpu(ddf->phys->entries[pd].state)); /* NOTE(review): pd can still be negative here when the disk was not found and not inserted above - confirm this index is safe */
+
+ /* Now we need to check the state of the array and update
+ * virtual_disk.entries[n].state.
+ * It needs to be one of "optimal", "degraded", "failed".
+ * I don't understand 'deleted' or 'missing'.
+ */
+ state = get_svd_state(ddf, vcl); /* from here on 'state' is the VD state, not the DS_* flags */
+
+ if (ddf->virt->entries[inst].state !=
+ ((ddf->virt->entries[inst].state & ~DDF_state_mask)
+ | state)) {
+ ddf->virt->entries[inst].state =
+ (ddf->virt->entries[inst].state & ~DDF_state_mask)
+ | state;
+ update = 1;
+ }
+ if (update)
+ ddf_set_updates_pending(ddf, vc);
+}
+
+/*
+ * Flush pending metadata changes: when the super block is flagged as
+ * having updates pending, clear the flag and write everything out.
+ * Does nothing when no update is pending.
+ */
+static void ddf_sync_metadata(struct supertype *st)
+{
+	/*
+	 * Write all data to all devices.
+	 * Later, we might be able to track whether only local changes
+	 * have been made, or whether any global data has been changed,
+	 * but ddf is sufficiently weird that it probably always
+	 * changes global data ....
+	 */
+	struct ddf_super *sb = st->sb;
+
+	if (sb->updates_pending) {
+		/* the flag is dropped before the write is attempted */
+		sb->updates_pending = 0;
+		__write_init_super_ddf(st);
+		dprintf("ddf: sync_metadata\n");
+	}
+}
+
+/*
+ * Unlink every entry whose configuration GUID equals 'guid' from the
+ * list headed by *list.  Returns 1 if at least one entry was unlinked,
+ * 0 otherwise.  Unlinked entries are not freed here.
+ */
+static int del_from_conflist(struct vcl **list, const char *guid)
+{
+	struct vcl **p = list;
+	int found = 0;
+
+	while (p && *p) {
+		if (!memcmp((*p)->conf.guid, guid, DDF_GUID_LEN)) {
+			found = 1;
+			/* Unlink and look at the same link again:
+			 * stepping forward here would skip the successor
+			 * and, after removing the tail, would compute
+			 * &(NULL)->next.
+			 */
+			*p = (*p)->next;
+		} else {
+			p = &(*p)->next;
+		}
+	}
+	return found;
+}
+
+/*
+ * Remove the virtual disk identified by 'guid' from the in-memory metadata:
+ * drop it from the configuration list, clear every per-disk reference to
+ * it, and overwrite its GUID in the virtual disk table with 0xff bytes.
+ * Returns 0 on success, -1 if the VD or its configuration is not found.
+ */
+static int _kill_subarray_ddf(struct ddf_super *ddf, const char *guid)
+{
+	struct dl *disk;
+	unsigned int slot;
+	unsigned int vdnum = find_vde_by_guid(ddf, guid);
+
+	if (vdnum == DDF_NOTFOUND) {
+		pr_err("could not find VD %s\n", guid_str(guid));
+		return -1;
+	}
+	if (!del_from_conflist(&ddf->conflist, guid)) {
+		pr_err("could not find conf %s\n", guid_str(guid));
+		return -1;
+	}
+
+	/* Forget the VD on every disk that carried part of it */
+	disk = ddf->dlist;
+	while (disk) {
+		for (slot = 0; slot < ddf->max_part; slot++) {
+			if (disk->vlist[slot] == NULL)
+				continue;
+			if (memcmp(disk->vlist[slot]->conf.guid, guid,
+				   DDF_GUID_LEN) == 0)
+				disk->vlist[slot] = NULL;
+		}
+		disk = disk->next;
+	}
+
+	memset(ddf->virt->entries[vdnum].guid, 0xff, DDF_GUID_LEN);
+	dprintf("deleted %s\n", guid_str(guid));
+	return 0;
+}
+
+/*
+ * Delete the subarray currently selected in ddf->currentconf.
+ * Returns 0 on success, -1 if nothing is selected or the VD cannot be
+ * found.  'subarray_id' is not consulted.
+ *
+ * Without an update queue (st->update_tail unset) the VD is removed and
+ * the metadata written at once.  With one, a metadata update is queued
+ * instead: a virtual_disk record holding a single entry that carries the
+ * victim's GUID and the state DDF_state_deleted.
+ */
+static int kill_subarray_ddf(struct supertype *st, char *subarray_id)
+{
+	struct ddf_super *ddf = st->sb;
+	/*
+	 * currentconf is set in container_content_ddf,
+	 * called with subarray arg
+	 */
+	struct vcl *victim = ddf->currentconf;
+	struct vd_config *conf;
+	struct virtual_disk *vd;
+	unsigned int vdnum;
+	int len;
+
+	ddf->currentconf = NULL;
+	if (victim == NULL) {
+		pr_err("nothing to kill\n");
+		return -1;
+	}
+	conf = &victim->conf;
+	vdnum = find_vde_by_guid(ddf, conf->guid);
+	if (vdnum == DDF_NOTFOUND) {
+		pr_err("could not find VD %s\n", guid_str(conf->guid));
+		return -1;
+	}
+
+	if (!st->update_tail) {
+		/* act on the metadata directly */
+		_kill_subarray_ddf(ddf, conf->guid);
+		ddf_set_updates_pending(ddf, NULL);
+		ddf_sync_metadata(st);
+		return 0;
+	}
+
+	len = sizeof(struct virtual_disk)
+		+ sizeof(struct virtual_entry);
+	vd = xmalloc(len);
+	if (vd == NULL) {
+		pr_err("failed to allocate %d bytes\n", len);
+		return -1;
+	}
+	memset(vd, 0 , len);
+	vd->magic = DDF_VIRT_RECORDS_MAGIC;
+	vd->populated_vdes = cpu_to_be16(0);
+	memcpy(vd->entries[0].guid, conf->guid, DDF_GUID_LEN);
+	/* we use DDF_state_deleted as marker */
+	vd->entries[0].state = DDF_state_deleted;
+	append_metadata_update(st, vd, len);
+	return 0;
+}
+
+/*
+ * Find, among the configuration records packed in update->buf, the one
+ * whose sec_elmnt_seq equals conf->sec_elmnt_seq, and copy its
+ * phys_refnum area into 'conf'.
+ *
+ * The copy is mppe * (sizeof(__u32) + sizeof(__u64)) bytes starting at
+ * phys_refnum, i.e. it also covers whatever immediately follows the
+ * refnum array (presumably the LBA offsets).  An error is logged when
+ * no record matches.
+ */
+static void copy_matching_bvd(struct ddf_super *ddf,
+			      struct vd_config *conf,
+			      const struct metadata_update *update)
+{
+	unsigned int mppe =
+		be16_to_cpu(ddf->anchor.max_primary_element_entries);
+	unsigned int rec_len = ddf->conf_rec_len * 512;
+	char *rec = update->buf;
+
+	while (rec < update->buf + update->len) {
+		struct vd_config *cand = (struct vd_config *) rec;
+
+		if (cand->sec_elmnt_seq == conf->sec_elmnt_seq) {
+			memcpy(conf->phys_refnum, cand->phys_refnum,
+			       mppe * (sizeof(__u32) + sizeof(__u64)));
+			return;
+		}
+		rec += rec_len;
+	}
+	pr_err("no match for BVD %d of %s in update\n",
+	       conf->sec_elmnt_seq, guid_str(conf->guid));
+}
+
+/*
+ * Apply a DDF_PHYS_RECORDS_MAGIC update to the in-memory physical disk
+ * table.  update->buf holds a struct phys_disk followed by one entry;
+ * its used_pdes field carries the index of the table slot concerned.
+ *
+ * If the entry has DDF_Missing set, the slot is flagged missing and the
+ * matching dl is closed and unlinked from ddf->dlist; its memory is
+ * handed back through update->space / update->space_list instead of
+ * being freed here.
+ * Otherwise the entry is stored only if all_ff() accepts the slot's
+ * current GUID (presumably meaning the slot is unused), and the first
+ * device on ddf->add_list, if any, is moved to ddf->dlist.
+ */
+static void ddf_process_phys_update(struct supertype *st,
+ struct metadata_update *update)
+{
+ struct ddf_super *ddf = st->sb;
+ struct phys_disk *pd;
+ unsigned int ent;
+
+ pd = (struct phys_disk*)update->buf;
+ ent = be16_to_cpu(pd->used_pdes);
+ /* Ignore updates that name a slot beyond the table. */
+ if (ent >= be16_to_cpu(ddf->phys->max_pdes))
+ return;
+ if (be16_and(pd->entries[0].state, cpu_to_be16(DDF_Missing))) {
+ struct dl **dlp;
+ /* removing this disk. */
+ be16_set(ddf->phys->entries[ent].state,
+ cpu_to_be16(DDF_Missing));
+ for (dlp = &ddf->dlist; *dlp; dlp = &(*dlp)->next) {
+ struct dl *dl = *dlp;
+ if (dl->pdnum == (signed)ent) {
+ close(dl->fd);
+ dl->fd = -1;
+ *dlp = dl->next;
+ /* Pass the name buffer back via update->space. */
+ update->space = dl->devname;
+ /* Chain the dl itself onto space_list, reusing
+ * its first pointer-sized word as the link. */
+ *(void**)dl = update->space_list;
+ update->space_list = (void**)dl;
+ break;
+ }
+ }
+ ddf_set_updates_pending(ddf, NULL);
+ return;
+ }
+ /* Adding: refuse to overwrite a slot that all_ff() rejects. */
+ if (!all_ff(ddf->phys->entries[ent].guid))
+ return;
+ ddf->phys->entries[ent] = pd->entries[0];
+ ddf->phys->used_pdes = cpu_to_be16
+ (1 + be16_to_cpu(ddf->phys->used_pdes));
+ ddf_set_updates_pending(ddf, NULL);
+ if (ddf->add_list) {
+ struct active_array *a;
+ /* Move the head of add_list to the head of dlist. */
+ struct dl *al = ddf->add_list;
+ ddf->add_list = al->next;
+
+ al->next = ddf->dlist;
+ ddf->dlist = al;
+
+ /* As a device has been added, we should check
+ * for any degraded devices that might make
+ * use of this spare */
+ for (a = st->arrays ; a; a=a->next)
+ a->check_degraded = 1;
+ }
+}
+
+/*
+ * Apply a DDF_VIRT_RECORDS_MAGIC update.  update->buf holds a
+ * struct virtual_disk followed by a single entry.
+ *
+ * An entry whose state is DDF_state_deleted asks for that VD to be
+ * removed.  Any other entry is a new VD, stored in the first unused
+ * slot unless a VD with the same GUID already exists.  The metadata is
+ * flagged for writing only when something was actually changed.
+ */
+static void ddf_process_virt_update(struct supertype *st,
+				    struct metadata_update *update)
+{
+	struct ddf_super *ddf = st->sb;
+	struct virtual_disk *vd = (struct virtual_disk*)update->buf;
+	unsigned int slot;
+
+	if (vd->entries[0].state == DDF_state_deleted) {
+		if (_kill_subarray_ddf(ddf, vd->entries[0].guid))
+			return;
+		ddf_set_updates_pending(ddf, NULL);
+		return;
+	}
+
+	slot = find_vde_by_guid(ddf, vd->entries[0].guid);
+	if (slot != DDF_NOTFOUND) {
+		dprintf("VD %s exists already in slot %d\n",
+			guid_str(vd->entries[0].guid),
+			slot);
+		return;
+	}
+
+	slot = find_unused_vde(ddf);
+	if (slot == DDF_NOTFOUND)
+		return;
+
+	ddf->virt->entries[slot] = vd->entries[0];
+	ddf->virt->populated_vdes =
+		cpu_to_be16(1 + be16_to_cpu(ddf->virt->populated_vdes));
+	dprintf("added VD %s in slot %d(s=%02x i=%02x)\n",
+		guid_str(vd->entries[0].guid), slot,
+		ddf->virt->entries[slot].state,
+		ddf->virt->entries[slot].init_state);
+	ddf_set_updates_pending(ddf, NULL);
+}
+
+/*
+ * Compact the physical disk table.
+ *
+ * Entries with refnum 0xFFFFFFFF are dropped, as are entries that are
+ * both Failed and in Transition and have no dl on ddf->dlist.  The
+ * surviving entries are packed towards index 0, every dl->pdnum is
+ * renumbered to follow its entry, used_pdes is set to the new count
+ * and the GUID of each vacated slot is filled with 0xff.
+ */
+static void ddf_remove_failed(struct ddf_super *ddf)
+{
+	unsigned int src;
+	unsigned int dst = 0;
+	struct dl *dl;
+
+	for (src = 0; src < be16_to_cpu(ddf->phys->max_pdes); src++) {
+		if (be32_to_cpu(ddf->phys->entries[src].refnum) ==
+		    0xFFFFFFFF)
+			continue;
+
+		if (be16_and(ddf->phys->entries[src].state,
+			     cpu_to_be16(DDF_Failed)) &&
+		    be16_and(ddf->phys->entries[src].state,
+			     cpu_to_be16(DDF_Transition))) {
+			/* Keep it only if a dl still refers to it. */
+			dl = ddf->dlist;
+			while (dl && dl->pdnum != (int)src)
+				dl = dl->next;
+			if (!dl)
+				continue;
+		}
+
+		if (src != dst) {
+			/* Slide the entry down and renumber its dl(s). */
+			ddf->phys->entries[dst] = ddf->phys->entries[src];
+			for (dl = ddf->dlist; dl; dl = dl->next)
+				if (dl->pdnum == (int)src)
+					dl->pdnum = dst;
+		}
+		dst++;
+	}
+
+	ddf->phys->used_pdes = cpu_to_be16(dst);
+	/* src is now one past the last slot examined. */
+	for (; dst < src; dst++)
+		memset(ddf->phys->entries[dst].guid, 0xff,
+		       DDF_GUID_LEN);
+}
+
+/*
+ * Rebuild dl->vlist from ddf->conflist and bring the type/state flags
+ * of dl's physical-disk entry into line with the result.
+ *
+ * Nothing is done for a dl with a negative pdnum.
+ */
+static void ddf_update_vlist(struct ddf_super *ddf, struct dl *dl)
+{
+ struct vcl *vcl;
+ unsigned int vn = 0;
+ int in_degraded = 0;
+
+ if (dl->pdnum < 0)
+ return;
+ /* Collect every configuration that references this disk's refnum. */
+ for (vcl = ddf->conflist; vcl ; vcl = vcl->next) {
+ unsigned int dn, ibvd;
+ const struct vd_config *conf;
+ int vstate;
+ dn = get_pd_index_from_refnum(vcl,
+ dl->disk.refnum,
+ ddf->mppe,
+ &conf, &ibvd);
+ if (dn == DDF_NOTFOUND)
+ continue;
+ dprintf("dev %d/%08x has %s (sec=%u) at %d\n",
+ dl->pdnum,
+ be32_to_cpu(dl->disk.refnum),
+ guid_str(conf->guid),
+ conf->sec_elmnt_seq, vn);
+ /* Clear the Transition flag */
+ if (be16_and
+ (ddf->phys->entries[dl->pdnum].state,
+ cpu_to_be16(DDF_Failed)))
+ be16_clear(ddf->phys
+ ->entries[dl->pdnum].state,
+ cpu_to_be16(DDF_Transition));
+ /* NOTE(review): vn is not checked against ddf->max_part here;
+ * confirm vlist can hold every VD a disk may belong to. */
+ dl->vlist[vn++] = vcl;
+ vstate = ddf->virt->entries[vcl->vcnum].state
+ & DDF_state_mask;
+ if (vstate == DDF_state_degraded ||
+ vstate == DDF_state_part_optimal)
+ in_degraded = 1;
+ }
+ /* Clear the remaining vlist slots. */
+ while (vn < ddf->max_part)
+ dl->vlist[vn++] = NULL;
+ if (dl->vlist[0]) {
+ /* Member of at least one VD: not a global spare any more. */
+ be16_clear(ddf->phys->entries[dl->pdnum].type,
+ cpu_to_be16(DDF_Global_Spare));
+ if (!be16_and(ddf->phys
+ ->entries[dl->pdnum].type,
+ cpu_to_be16(DDF_Active_in_VD))) {
+ be16_set(ddf->phys
+ ->entries[dl->pdnum].type,
+ cpu_to_be16(DDF_Active_in_VD));
+ /* Newly active in a degraded VD: flag it as rebuilding. */
+ if (in_degraded)
+ be16_set(ddf->phys
+ ->entries[dl->pdnum]
+ .state,
+ cpu_to_be16
+ (DDF_Rebuilding));
+ }
+ }
+ if (dl->spare) {
+ /* Has a spare record: typed Spare rather than Global_Spare. */
+ be16_clear(ddf->phys->entries[dl->pdnum].type,
+ cpu_to_be16(DDF_Global_Spare));
+ be16_set(ddf->phys->entries[dl->pdnum].type,
+ cpu_to_be16(DDF_Spare));
+ }
+ if (!dl->vlist[0] && !dl->spare) {
+ /* In no VD and no spare record: make it a global spare. */
+ be16_set(ddf->phys->entries[dl->pdnum].type,
+ cpu_to_be16(DDF_Global_Spare));
+ be16_clear(ddf->phys->entries[dl->pdnum].type,
+ cpu_to_be16(DDF_Spare));
+ be16_clear(ddf->phys->entries[dl->pdnum].type,
+ cpu_to_be16(DDF_Active_in_VD));
+ }
+}
+
+/*
+ * Apply a DDF_VD_CONF_MAGIC update.  update->buf must hold exactly
+ * sec_elmnt_count configuration records of conf_rec_len * 512 bytes
+ * each, one per BVD.
+ *
+ * If a configuration with the same GUID is already on ddf->conflist,
+ * only the phys_refnum area of each BVD is refreshed from the matching
+ * record.  Otherwise the configuration is new: it is stored in the vcl
+ * that ddf_prepare_update() left in update->space and linked onto
+ * ddf->conflist.
+ *
+ * Afterwards every Failed disk is flagged DDF_Transition, each dl's
+ * vlist is rebuilt (which clears that flag again for disks still
+ * referenced by a VD) and ddf_remove_failed() is run.
+ */
+static void ddf_process_conf_update(struct supertype *st,
+				    struct metadata_update *update)
+{
+	struct ddf_super *ddf = st->sb;
+	struct vd_config *vc;
+	struct vcl *vcl;
+	struct dl *dl;
+	unsigned int ent;
+	unsigned int pdnum, len;
+
+	vc = (struct vd_config*)update->buf;
+	len = ddf->conf_rec_len * 512;
+	if ((unsigned int)update->len != len * vc->sec_elmnt_count) {
+		pr_err("%s: insufficient data (%d) for %u BVDs\n",
+		       guid_str(vc->guid), update->len,
+		       vc->sec_elmnt_count);
+		return;
+	}
+	for (vcl = ddf->conflist; vcl ; vcl = vcl->next)
+		if (memcmp(vcl->conf.guid, vc->guid, DDF_GUID_LEN) == 0)
+			break;
+	dprintf("conf update for %s (%s)\n",
+		guid_str(vc->guid), (vcl ? "old" : "new"));
+	if (vcl) {
+		/* An update, just copy the phys_refnum and lba_offset
+		 * fields
+		 */
+		unsigned int i;
+		unsigned int k;
+
+		copy_matching_bvd(ddf, &vcl->conf, update);
+		for (k = 0; k < be16_to_cpu(vc->prim_elmnt_count); k++)
+			dprintf("BVD %u has %08x at %llu\n", 0,
+				be32_to_cpu(vcl->conf.phys_refnum[k]),
+				be64_to_cpu(LBA_OFFSET(ddf,
+						       &vcl->conf)[k]));
+		for (i = 1; i < vc->sec_elmnt_count; i++) {
+			copy_matching_bvd(ddf, vcl->other_bvds[i-1],
+					  update);
+			for (k = 0; k < be16_to_cpu(
+				     vc->prim_elmnt_count); k++)
+				dprintf("BVD %u has %08x at %llu\n", i,
+					be32_to_cpu
+					(vcl->other_bvds[i-1]->
+					 phys_refnum[k]),
+					be64_to_cpu
+					(LBA_OFFSET
+					 (ddf,
+					  vcl->other_bvds[i-1])[k]));
+		}
+	} else {
+		/* A new VD_CONF */
+		unsigned int i;
+
+		if (!update->space)
+			return;
+		/*
+		 * Look the VD up before taking the pre-allocated vcl out
+		 * of the update.  If the lookup fails, the buffer stays in
+		 * update->space, where the owner of the update can still
+		 * reach it, instead of being stranded.
+		 */
+		ent = find_vde_by_guid(ddf, vc->guid);
+		if (ent == DDF_NOTFOUND)
+			return;
+		vcl = update->space;
+		update->space = NULL;
+		memcpy(&vcl->conf, vc, len);
+		vcl->vcnum = ent;
+		for (i = 1; i < vc->sec_elmnt_count; i++)
+			memcpy(vcl->other_bvds[i-1],
+			       update->buf + len * i, len);
+		vcl->next = ddf->conflist;
+		ddf->conflist = vcl;
+	}
+	/* Set DDF_Transition on all Failed devices - to help
+	 * us detect those that are no longer in use
+	 */
+	for (pdnum = 0; pdnum < be16_to_cpu(ddf->phys->max_pdes);
+	     pdnum++)
+		if (be16_and(ddf->phys->entries[pdnum].state,
+			     cpu_to_be16(DDF_Failed)))
+			be16_set(ddf->phys->entries[pdnum].state,
+				 cpu_to_be16(DDF_Transition));
+
+	/* Now make sure vlist is correct for each dl. */
+	for (dl = ddf->dlist; dl; dl = dl->next)
+		ddf_update_vlist(ddf, dl);
+	ddf_remove_failed(ddf);
+
+	ddf_set_updates_pending(ddf, vc);
+}
+
+/*
+ * Apply one queued update to the in-memory metadata.
+ *
+ * The first 4 bytes of update->buf are a DDF_*_MAGIC that selects the
+ * handler:
+ *   DDF_PHYS_RECORDS_MAGIC
+ *     Add a new physical device or remove an old one.
+ *     Changes to this record only happen implicitly.
+ *     used_pdes is the device number.
+ *   DDF_VIRT_RECORDS_MAGIC
+ *     Add a new VD.  Possibly also change the 'access' bits.
+ *     populated_vdes is the entry number.
+ *   DDF_VD_CONF_MAGIC
+ *     New or updated VD.  The VIRT_RECORD must already exist.
+ *     For an update, phys_refnum and lba_offset (at least) are
+ *     updated, and the VD_CONF must be written to precisely those
+ *     devices listed with a phys_refnum.
+ *   DDF_SPARE_ASSIGN_MAGIC
+ *     Replacement Spare Assignment Record... but for which device?
+ *     (Not handled.)
+ *
+ * So, e.g.:
+ *  - to create a new array, we send a VIRT_RECORD and a VD_CONF,
+ *    then assemble and start the array.
+ *  - to activate a spare we send a VD_CONF to add the phys_refnum
+ *    and offset.  This will also mark the spare as active with a
+ *    spare-assignment record.
+ *
+ * PHYS and VIRT updates are applied only when update->len is exactly
+ * the size of the record header plus one entry.
+ */
+static void ddf_process_update(struct supertype *st,
+			       struct metadata_update *update)
+{
+	be32 *magic = (be32 *)update->buf;
+
+	dprintf("Process update %x\n", be32_to_cpu(*magic));
+
+	if (be32_eq(*magic, DDF_PHYS_RECORDS_MAGIC)) {
+		if (update->len == (sizeof(struct phys_disk) +
+				    sizeof(struct phys_disk_entry)))
+			ddf_process_phys_update(st, update);
+		return;
+	}
+	if (be32_eq(*magic, DDF_VIRT_RECORDS_MAGIC)) {
+		if (update->len == (sizeof(struct virtual_disk) +
+				    sizeof(struct virtual_entry)))
+			ddf_process_virt_update(st, update);
+		return;
+	}
+	if (be32_eq(*magic, DDF_VD_CONF_MAGIC))
+		ddf_process_conf_update(st, update);
+	/* case DDF_SPARE_ASSIGN_MAGIC */
+}
+
+/*
+ * Called in managemon for an update that is about to be handed to the
+ * monitor: any memory the monitor will need is allocated here.
+ *
+ * Only DDF_VD_CONF_MAGIC updates need memory: a 512-byte-aligned vcl
+ * large enough for one configuration record is placed in
+ * update->space, and alloc_other_bvds() is asked for the additional
+ * BVD buffers.
+ *
+ * Returns 1 if the update may proceed, 0 if it is too short or an
+ * allocation failed (update->space is then NULL).
+ */
+static int ddf_prepare_update(struct supertype *st,
+			      struct metadata_update *update)
+{
+	struct ddf_super *ddf = st->sb;
+	struct vd_config *conf;
+	struct vcl *vcl;
+	be32 *magic;
+
+	if (update->len < 4)
+		return 0;
+	magic = (be32 *)update->buf;
+	if (!be32_eq(*magic, DDF_VD_CONF_MAGIC))
+		return 1;	/* nothing to allocate for other types */
+
+	if (update->len < (int)sizeof(*conf))
+		return 0;
+	conf = (struct vd_config *) update->buf;
+
+	if (posix_memalign(&update->space, 512,
+			   offsetof(struct vcl, conf)
+			   + ddf->conf_rec_len * 512) != 0) {
+		update->space = NULL;
+		return 0;
+	}
+
+	vcl = update->space;
+	/* alloc_other_bvds() is given the BVD count through the vcl. */
+	vcl->conf.sec_elmnt_count = conf->sec_elmnt_count;
+	if (alloc_other_bvds(ddf, vcl) != 0) {
+		free(update->space);
+		update->space = NULL;
+		return 0;
+	}
+	return 1;
+}
+
+/*
+ * Check degraded state of a RAID10.
+ *
+ * The number of devices per BVD is taken from the layout with bit
+ * 0x100 masked off; raid_disks divided by that gives the number of
+ * BVDs.  A device counts as present when its state_fd is valid.
+ *
+ * returns 2 for good, 1 for degraded, 0 for failed, and -1 for error
+ * (unusable layout, or a device whose raid_disk maps outside the
+ * array).
+ */
+static int raid10_degraded(struct mdinfo *info)
+{
+	int n_prim, n_bvds;
+	int i;
+	struct mdinfo *d;
+	char *found;
+	int ret = -1;
+
+	n_prim = info->array.layout & ~0x100;
+	if (n_prim <= 0) {
+		/* would divide by zero (or make no sense) below */
+		pr_err("BUG: invalid raid10 layout\n");
+		return ret;
+	}
+	n_bvds = info->array.raid_disks / n_prim;
+	found = xmalloc(n_bvds);
+	if (found == NULL)
+		return ret;
+	memset(found, 0, n_bvds);
+	for (d = info->devs; d; d = d->next) {
+		i = d->disk.raid_disk / n_prim;
+		/* reject both ends: a negative index would also write
+		 * outside found[] */
+		if (i < 0 || i >= n_bvds) {
+			pr_err("BUG: invalid raid disk\n");
+			goto out;
+		}
+		if (is_fd_valid(d->state_fd))
+			found[i]++;
+	}
+	ret = 2;
+	for (i = 0; i < n_bvds; i++)
+		if (!found[i]) {
+			/* a BVD with no member at all: the array is dead */
+			dprintf("BVD %d/%d failed\n", i, n_bvds);
+			ret = 0;
+			goto out;
+		} else if (found[i] < n_prim) {
+			dprintf("BVD %d/%d degraded\n", i, n_bvds);
+			ret = 1;
+		}
+out:
+	free(found);
+	return ret;
+}
+
+/*
+ * Check if the array 'a' is degraded but not failed.
+ * If it is, find as many spares as are available and needed and
+ * arrange for their inclusion.
+ * We only choose devices which are not already in the array,
+ * and prefer those with a spare-assignment to this array.
+ * Otherwise we choose global spares - assuming always that
+ * there is enough room.
+ * For each spare that we assign, we return an 'mdinfo' which
+ * describes the position for the device in the array.
+ * We also add to 'updates' a DDF_VD_CONF_MAGIC update with
+ * the new phys_refnum and lba_offset values.
+ *
+ * Only worry about BVDs at the moment.
+ *
+ * Returns the list of new devices, or NULL when nothing is to be done
+ * or an error occurred.  On a NULL return *updates is left untouched
+ * and everything allocated here has been released.
+ */
+static struct mdinfo *ddf_activate_spare(struct active_array *a,
+					 struct metadata_update **updates)
+{
+	int working = 0;
+	struct mdinfo *d;
+	struct ddf_super *ddf = a->container->sb;
+	int global_ok = 0;
+	struct mdinfo *rv = NULL;
+	struct mdinfo *di;
+	struct metadata_update *mu = NULL;
+	struct dl *dl;
+	int i;
+	unsigned int j;
+	struct vcl *vcl;
+	struct vd_config *vc;
+	unsigned int n_bvd;
+
+	for (d = a->info.devs ; d ; d = d->next) {
+		if ((d->curr_state & DS_FAULTY) &&
+		    d->state_fd >= 0)
+			/* wait for Removal to happen */
+			return NULL;
+		if (d->state_fd >= 0)
+			working ++;
+	}
+
+	dprintf("working=%d (%d) level=%d\n", working,
+		a->info.array.raid_disks,
+		a->info.array.level);
+	if (working == a->info.array.raid_disks)
+		return NULL; /* array not degraded */
+	switch (a->info.array.level) {
+	case 1:
+		if (working == 0)
+			return NULL; /* failed */
+		break;
+	case 4:
+	case 5:
+		if (working < a->info.array.raid_disks - 1)
+			return NULL; /* failed */
+		break;
+	case 6:
+		if (working < a->info.array.raid_disks - 2)
+			return NULL; /* failed */
+		break;
+	case 10:
+		if (raid10_degraded(&a->info) < 1)
+			return NULL;
+		break;
+	default: /* concat or stripe */
+		return NULL; /* failed */
+	}
+
+	/* For each slot, if it is not working, find a spare */
+	dl = ddf->dlist;
+	for (i = 0; i < a->info.array.raid_disks; i++) {
+		for (d = a->info.devs ; d ; d = d->next)
+			if (d->disk.raid_disk == i)
+				break;
+		dprintf("found %d: %p %x\n", i, d, d?d->curr_state:0);
+		if (d && (d->state_fd >= 0))
+			continue;
+
+		/* OK, this device needs recovery.  Find a spare */
+	again:
+		for ( ; dl ; dl = dl->next) {
+			unsigned long long esize;
+			unsigned long long pos;
+			struct mdinfo *d2;
+			int is_global = 0;
+			int is_dedicated = 0;
+			be16 state;
+
+			if (dl->pdnum < 0)
+				continue;
+			state = ddf->phys->entries[dl->pdnum].state;
+			if (be16_and(state,
+				     cpu_to_be16(DDF_Failed|DDF_Missing)) ||
+			    !be16_and(state,
+				      cpu_to_be16(DDF_Online)))
+				continue;
+
+			/* If in this array, skip */
+			for (d2 = a->info.devs ; d2 ; d2 = d2->next)
+				if (d2->state_fd >= 0 &&
+				    d2->disk.major == dl->major &&
+				    d2->disk.minor == dl->minor) {
+					dprintf("%x:%x (%08x) already in array\n",
+						dl->major, dl->minor,
+						be32_to_cpu(dl->disk.refnum));
+					break;
+				}
+			if (d2)
+				continue;
+			if (be16_and(ddf->phys->entries[dl->pdnum].type,
+				     cpu_to_be16(DDF_Spare))) {
+				/* Check spare assign record */
+				if (dl->spare) {
+					if (dl->spare->type & DDF_spare_dedicated) {
+						/* check spare_ents for guid */
+						for (j = 0 ;
+						     j < be16_to_cpu
+							     (dl->spare
+							      ->populated);
+						     j++) {
+							if (memcmp(dl->spare->spare_ents[j].guid,
+								   ddf->virt->entries[a->info.container_member].guid,
+								   DDF_GUID_LEN) == 0)
+								is_dedicated = 1;
+						}
+					} else
+						is_global = 1;
+				}
+			} else if (be16_and(ddf->phys->entries[dl->pdnum].type,
+					    cpu_to_be16(DDF_Global_Spare))) {
+				is_global = 1;
+			} else if (!be16_and(ddf->phys
+					     ->entries[dl->pdnum].state,
+					     cpu_to_be16(DDF_Failed))) {
+				/* we can possibly use some of this */
+				is_global = 1;
+			}
+			if ( ! (is_dedicated ||
+				(is_global && global_ok))) {
+				dprintf("%x:%x not suitable: %d %d\n", dl->major, dl->minor,
+					is_dedicated, is_global);
+				continue;
+			}
+
+			/* We are allowed to use this device - is there space?
+			 * We need a->info.component_size sectors */
+			esize = a->info.component_size;
+			pos = find_space(ddf, dl, INVALID_SECTORS, &esize);
+
+			if (esize < a->info.component_size) {
+				dprintf("%x:%x has no room: %llu %llu\n",
+					dl->major, dl->minor,
+					esize, a->info.component_size);
+				/* No room */
+				continue;
+			}
+
+			/* Cool, we have a device with some space at pos */
+			di = xcalloc(1, sizeof(*di));
+			di->disk.number = i;
+			di->disk.raid_disk = i;
+			di->disk.major = dl->major;
+			di->disk.minor = dl->minor;
+			di->disk.state = 0;
+			di->recovery_start = 0;
+			di->data_offset = pos;
+			di->component_size = a->info.component_size;
+			di->next = rv;
+			rv = di;
+			dprintf("%x:%x (%08x) to be %d at %llu\n",
+				dl->major, dl->minor,
+				be32_to_cpu(dl->disk.refnum), i, pos);
+
+			break;
+		}
+		if (!dl && ! global_ok) {
+			/* not enough dedicated spares, try global */
+			global_ok = 1;
+			dl = ddf->dlist;
+			goto again;
+		}
+	}
+
+	if (!rv)
+		/* No spares found */
+		return rv;
+	/* Now 'rv' has a list of devices to return.
+	 * Create a metadata_update record to update the
+	 * phys_refnum and lba_offset values
+	 */
+	vc = find_vdcr(ddf, a->info.container_member, rv->disk.raid_disk,
+		       &n_bvd, &vcl);
+	if (vc == NULL)
+		goto fail;
+
+	/*
+	 * This update targets a configuration that already exists, so no
+	 * scratch vcl is needed: 'space' stays NULL.  (An aligned block
+	 * used to be allocated here and immediately dropped, and a failed
+	 * allocation led to a NULL dereference.)
+	 */
+	mu = xmalloc(sizeof(*mu));
+	mu->len = ddf->conf_rec_len * 512 * vcl->conf.sec_elmnt_count;
+	mu->buf = xmalloc(mu->len);
+	mu->space = NULL;
+	mu->space_list = NULL;
+	mu->next = *updates;
+	memcpy(mu->buf, &vcl->conf, ddf->conf_rec_len * 512);
+	for (j = 1; j < vcl->conf.sec_elmnt_count; j++)
+		memcpy(mu->buf + j * ddf->conf_rec_len * 512,
+		       vcl->other_bvds[j-1], ddf->conf_rec_len * 512);
+
+	for (di = rv ; di ; di = di->next) {
+		unsigned int i_sec, i_prim;
+
+		/* which BVD, and which position inside it */
+		i_sec = di->disk.raid_disk
+			/ be16_to_cpu(vcl->conf.prim_elmnt_count);
+		i_prim = di->disk.raid_disk
+			% be16_to_cpu(vcl->conf.prim_elmnt_count);
+		vc = (struct vd_config *)(mu->buf
+					  + i_sec * ddf->conf_rec_len * 512);
+		for (dl = ddf->dlist; dl; dl = dl->next)
+			if (dl->major == di->disk.major &&
+			    dl->minor == di->disk.minor)
+				break;
+		if (!dl || dl->pdnum < 0) {
+			pr_err("BUG: can't find disk %d (%d/%d)\n",
+			       di->disk.raid_disk,
+			       di->disk.major, di->disk.minor);
+			goto fail;
+		}
+		vc->phys_refnum[i_prim] = ddf->phys->entries[dl->pdnum].refnum;
+		LBA_OFFSET(ddf, vc)[i_prim] = cpu_to_be64(di->data_offset);
+		dprintf("BVD %u gets %u: %08x at %llu\n", i_sec, i_prim,
+			be32_to_cpu(vc->phys_refnum[i_prim]),
+			be64_to_cpu(LBA_OFFSET(ddf, vc)[i_prim]));
+	}
+	*updates = mu;
+	return rv;
+
+fail:
+	/* Release the half-built update and the candidate list. */
+	if (mu) {
+		free(mu->buf);
+		free(mu);
+	}
+	while (rv) {
+		di = rv->next;
+		free(rv);
+		rv = di;
+	}
+	return NULL;
+}
+
+/*
+ * Default layout for a RAID level: 0 for levels 0 and 1,
+ * ALGORITHM_LEFT_SYMMETRIC for 5, ALGORITHM_ROTATING_N_CONTINUE for 6,
+ * 0x102 for 10, and UnSet for anything else.
+ */
+static int ddf_level_to_layout(int level)
+{
+	if (level == 0 || level == 1)
+		return 0;
+	if (level == 5)
+		return ALGORITHM_LEFT_SYMMETRIC;
+	if (level == 6)
+		return ALGORITHM_ROTATING_N_CONTINUE;
+	if (level == 10)
+		return 0x102;
+	return UnSet;
+}
+
+/*
+ * Fill in defaults for values still UnSet: the level becomes
+ * LEVEL_CONTAINER, and the layout becomes whatever
+ * ddf_level_to_layout() gives for the (possibly just defaulted) level.
+ * Nothing is done without a 'level' pointer; 'chunk' is not touched.
+ */
+static void default_geometry_ddf(struct supertype *st, int *level, int *layout, int *chunk)
+{
+	if (!level)
+		return;
+
+	if (*level == UnSet)
+		*level = LEVEL_CONTAINER;
+
+	if (layout && *layout == UnSet)
+		*layout = ddf_level_to_layout(*level);
+}
+
+/* Method table binding the "ddf" metadata handlers defined in this file. */
+struct superswitch super_ddf = {
+	/* reporting */
+	.examine_super		= examine_super_ddf,
+	.brief_examine_super	= brief_examine_super_ddf,
+	.brief_examine_subarrays = brief_examine_subarrays_ddf,
+	.export_examine_super	= export_examine_super_ddf,
+	.detail_super		= detail_super_ddf,
+	.brief_detail_super	= brief_detail_super_ddf,
+
+	/* creation and management */
+	.validate_geometry	= validate_geometry_ddf,
+	.write_init_super	= write_init_super_ddf,
+	.add_to_super		= add_to_super_ddf,
+	.remove_from_super	= remove_from_super_ddf,
+	.load_container		= load_container_ddf,
+	.copy_metadata		= copy_metadata_ddf,
+	.kill_subarray		= kill_subarray_ddf,
+	.match_home		= match_home_ddf,
+	.uuid_from_super	= uuid_from_super_ddf,
+	.getinfo_super		= getinfo_super_ddf,
+	.update_super		= update_super_ddf,
+
+	.avail_size		= avail_size_ddf,
+
+	.compare_super		= compare_super_ddf,
+
+	.load_super		= load_super_ddf,
+	.init_super		= init_super_ddf,
+	.store_super		= store_super_ddf,
+	.free_super		= free_super_ddf,
+	.match_metadata_desc	= match_metadata_desc_ddf,
+	.container_content	= container_content_ddf,
+	.default_geometry	= default_geometry_ddf,
+
+	.external		= 1,
+
+	/* for mdmon */
+	.open_new		= ddf_open_new,
+	.set_array_state	= ddf_set_array_state,
+	.set_disk		= ddf_set_disk,
+	.sync_metadata		= ddf_sync_metadata,
+	.process_update		= ddf_process_update,
+	.prepare_update		= ddf_prepare_update,
+	.activate_spare		= ddf_activate_spare,
+	.name			= "ddf",
+};
diff --git a/super-gpt.c b/super-gpt.c
new file mode 100644
index 0000000..a1e9aa9
--- /dev/null
+++ b/super-gpt.c
@@ -0,0 +1,220 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2010 Neil Brown <neilb@suse.de>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email: <neil@brown.name>
+ *
+ */
+
+/*
+ * 'gpt' is a pseudo metadata type for devices which have a
+ * GPT partition table.
+ *
+ * Obviously arrays cannot be created or assembled for this type.
+ * It is used to allow a new bare device to have a partition table
+ * added so the member partitions can then be included in other
+ * arrays as relevant.
+ *
+ * The meaningful operations are:
+ * examine_super, but not brief_examine_super or export_examine
+ * load_super
+ * store_super
+ */
+
+#include "mdadm.h"
+#include "part.h"
+
+/* Release the loaded partition-table image, if any, and forget it. */
+static void free_gpt(struct supertype *st)
+{
+	void *image = st->sb;
+
+	st->sb = NULL;
+	free(image);
+}
+
+/*
+ * Print the GPT magic, the revision and one line per partition entry.
+ *
+ * The image in st->sb is read with the GPT header at byte 512 and the
+ * partition entries at byte 1024.  'homehost' is unused.
+ */
+static void examine_gpt(struct supertype *st, char *homehost)
+{
+	struct GPT *gpt = st->sb + 512;
+	struct GPT_part_entry *gpe = st->sb + 1024;
+	unsigned int i;
+
+	printf(" GPT Magic : %llx\n", (unsigned long long)__le64_to_cpu(gpt->magic));
+	printf(" GPT Revision : %ld\n", (long)__le32_to_cpu(gpt->revision));
+	for (i = 0; i < __le32_to_cpu(gpt->part_cnt); i++) {
+		unsigned long long first =
+			(unsigned long long)__le64_to_cpu(gpe[i].starting_lba);
+		unsigned long long last =
+			(unsigned long long)__le64_to_cpu(gpe[i].ending_lba);
+
+		/* "<size> sectors at <start>": size first, then start LBA */
+		printf(" Partition[%02d] : %12llu sectors at %12llu\n",
+		       i,
+		       last - first + 1,
+		       first
+			);
+	}
+}
+
+/*
+ * Load the protective MBR, the GPT header and the GPT partition
+ * entries from 'fd' into one freshly allocated image, stored in st->sb
+ * as [MBR][GPT header][entries...].
+ *
+ * The MBR is read from offset 0, the header from one device sector in,
+ * and the entries from two device sectors in.
+ *
+ * Returns 0 on success.  Returns 1 (after reporting on 'devname', when
+ * given) if the table cannot be read, is not a GPT, or describes more
+ * entries than this loader accepts.
+ */
+static int load_gpt(struct supertype *st, int fd, char *devname)
+{
+	/* size of the image buffer: 32 sectors of 512 bytes */
+	const size_t sb_size = 32*512;
+	struct MBR *super;
+	struct GPT *gpt_head;
+	int to_read;
+	unsigned int sector_size;
+
+	free_gpt(st);
+
+	if (posix_memalign((void**)&super, 4096, sb_size) != 0) {
+		pr_err("could not allocate superblock\n");
+		return 1;
+	}
+
+	if (!get_dev_sector_size(fd, devname, &sector_size)) {
+		free(super);
+		return 1;
+	}
+
+	if (lseek(fd, 0, SEEK_SET) < 0)
+		goto no_read;
+	if (read(fd, super, sizeof(*super)) != sizeof(*super))
+		goto no_read;
+
+	if (super->magic != MBR_SIGNATURE_MAGIC ||
+	    super->parts[0].part_type != MBR_GPT_PARTITION_TYPE)
+		goto not_found;
+
+	/* Set offset to second block (GPT header) */
+	if (lseek(fd, sector_size, SEEK_SET) < 0)
+		goto no_read;
+	/* Seem to have GPT, load the header */
+	gpt_head = (struct GPT*)(super+1);
+	if (read(fd, gpt_head, sizeof(*gpt_head)) != sizeof(*gpt_head))
+		goto no_read;
+	if (gpt_head->magic != GPT_SIGNATURE_MAGIC)
+		goto not_found;
+	if (__le32_to_cpu(gpt_head->part_cnt) >= 128)
+		goto not_found;
+
+	to_read = __le32_to_cpu(gpt_head->part_cnt) * sizeof(struct GPT_part_entry);
+	to_read = ((to_read+511)/512) * 512;
+	/*
+	 * The rounded-up entry table must fit behind the MBR and the
+	 * header inside the image buffer; never read past its end.
+	 */
+	if (sizeof(*super) + sizeof(*gpt_head) + (size_t)to_read > sb_size)
+		goto not_found;
+	/* Set offset to third block (GPT entries) */
+	if (lseek(fd, sector_size*2, SEEK_SET) < 0)
+		goto no_read;
+	if (read(fd, gpt_head+1, to_read) != to_read)
+		goto no_read;
+
+	st->sb = super;
+
+	if (st->ss == NULL) {
+		st->ss = &gpt;
+		st->minor_version = 0;
+		st->max_devs = 1;
+		st->info = NULL;
+	}
+	return 0;
+
+no_read:
+	if (devname)
+		pr_err("Cannot read partition table on %s\n",
+		       devname);
+	free(super);
+	return 1;
+
+not_found:
+	if (devname)
+		pr_err("No partition table found on %s\n",
+		       devname);
+	free(super);
+	return 1;
+}
+
+/*
+ * Write the image held in st->sb (MBR, GPT header, then the partition
+ * entries) to the start of 'fd' and ask the kernel to re-read the
+ * partition table.
+ *
+ * Returns 0 on success, 4 if the image could not be written in full.
+ */
+static int store_gpt(struct supertype *st, int fd)
+{
+	/* FIXME should I save the boot loader */
+	/* need to write two copies! */
+	/* FIXME allow for blocks != 512 bytes
+	 *etc
+	 */
+	struct MBR *super = st->sb;
+	struct GPT *gpt;
+	int to_write;
+
+	gpt = (struct GPT*)(super+1);
+
+	to_write = __le32_to_cpu(gpt->part_cnt) * sizeof(struct GPT_part_entry);
+	to_write = ((to_write+511)/512) * 512;
+	/*
+	 * The entries sit behind the MBR and the GPT header in the image,
+	 * so those must be counted too or the tail of the entry table is
+	 * never written.
+	 */
+	to_write += sizeof(*super) + sizeof(*gpt);
+	/* load_gpt() allocates 32 sectors of 512 bytes for the image */
+	if (to_write > 32*512)
+		return 4;
+
+	if (lseek(fd, 0, SEEK_SET) < 0)
+		return 4;
+	if (write(fd, st->sb, to_write) != to_write)
+		return 4;
+
+	fsync(fd);
+	ioctl(fd, BLKRRPART, 0);
+	return 0;
+}
+
+/*
+ * Fill 'info' for a GPT pseudo-array: array and disk details are
+ * zeroed, the text version and name are both "gpt", and
+ * component_size is the largest ending LBA among the partition
+ * entries (0 if there are none).  'map' is unused.
+ */
+static void getinfo_gpt(struct supertype *st, struct mdinfo *info, char *map)
+{
+	struct GPT *gpt = st->sb + 512;
+	struct GPT_part_entry *entry = st->sb + 1024;
+	unsigned int remaining;
+
+	memset(&info->array, 0, sizeof(info->array));
+	memset(&info->disk, 0, sizeof(info->disk));
+	strcpy(info->text_version, "gpt");
+	strcpy(info->name, "gpt");
+	info->component_size = 0;
+
+	/* Walk the entries, remembering the furthest ending LBA. */
+	for (remaining = __le32_to_cpu(gpt->part_cnt); remaining > 0;
+	     remaining--, entry++) {
+		unsigned long long last =
+			(unsigned long long)__le64_to_cpu(entry->ending_lba);
+
+		if (last > info->component_size)
+			info->component_size = last;
+	}
+}
+
+/*
+ * Return a new supertype bound to the 'gpt' handler if 'arg' is
+ * exactly "gpt", otherwise NULL.
+ */
+static struct supertype *match_metadata_desc(char *arg)
+{
+	struct supertype *st;
+
+	/* Decide before allocating: nothing to free on a mismatch. */
+	if (strcmp(arg, "gpt") != 0)
+		return NULL;
+
+	st = xmalloc(sizeof(*st));
+	if (!st)
+		return st;
+
+	st->ss = &gpt;
+	st->info = NULL;
+	st->minor_version = 0;
+	st->max_devs = 1;
+	st->sb = NULL;
+	return st;
+}
+
+/*
+ * 'gpt' cannot describe an array, so every requested geometry is
+ * refused: an error is reported and 0 is returned, whatever the
+ * arguments.
+ */
+static int validate_geometry(struct supertype *st, int level,
+			     int layout, int raiddisks,
+			     int *chunk, unsigned long long size,
+			     unsigned long long data_offset,
+			     char *subdev, unsigned long long *freesize,
+			     int consistency_policy, int verbose)
+{
+	const int refused = 0;
+
+	pr_err("gpt metadata cannot be used this way\n");
+	return refused;
+}
+
+/* Method table for the 'gpt' pseudo metadata type. */
+struct superswitch gpt = {
+	.examine_super		= examine_gpt,
+	.validate_geometry	= validate_geometry,
+	.match_metadata_desc	= match_metadata_desc,
+	.load_super		= load_gpt,
+	.store_super		= store_gpt,
+	.getinfo_super		= getinfo_gpt,
+	.free_super		= free_gpt,
+	.name			= "gpt",
+};
diff --git a/super-intel.c b/super-intel.c
new file mode 100644
index 0000000..d5fad10
--- /dev/null
+++ b/super-intel.c
@@ -0,0 +1,12894 @@
+/*
+ * mdadm - Intel(R) Matrix Storage Manager Support
+ *
+ * Copyright (C) 2002-2008 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+
+#define HAVE_STDINT_H 1
+#include "mdadm.h"
+#include "mdmon.h"
+#include "sha1.h"
+#include "platform-intel.h"
+#include <values.h>
+#include <scsi/sg.h>
+#include <ctype.h>
+#include <dirent.h>
+
+/* MPB == Metadata Parameter Block */
+#define MPB_SIGNATURE "Intel Raid ISM Cfg Sig. "
+#define MPB_SIG_LEN (strlen(MPB_SIGNATURE))
+#define MPB_VERSION_RAID0 "1.0.00"
+#define MPB_VERSION_RAID1 "1.1.00"
+#define MPB_VERSION_MANY_VOLUMES_PER_ARRAY "1.2.00"
+#define MPB_VERSION_3OR4_DISK_ARRAY "1.2.01"
+#define MPB_VERSION_RAID5 "1.2.02"
+#define MPB_VERSION_5OR6_DISK_ARRAY "1.2.04"
+#define MPB_VERSION_CNG "1.2.06"
+#define MPB_VERSION_ATTRIBS "1.3.00"
+#define MAX_SIGNATURE_LENGTH 32
+#define MAX_RAID_SERIAL_LEN 16
+
+/* supports RAID0 */
+#define MPB_ATTRIB_RAID0 __cpu_to_le32(0x00000001)
+/* supports RAID1 */
+#define MPB_ATTRIB_RAID1 __cpu_to_le32(0x00000002)
+/* supports RAID10 */
+#define MPB_ATTRIB_RAID10 __cpu_to_le32(0x00000004)
+/* supports RAID1E */
+#define MPB_ATTRIB_RAID1E __cpu_to_le32(0x00000008)
+/* supports RAID5 */
+#define MPB_ATTRIB_RAID5 __cpu_to_le32(0x00000010)
+/* supports RAID CNG */
+#define MPB_ATTRIB_RAIDCNG __cpu_to_le32(0x00000020)
+/* supports expanded stripe sizes of 256K, 512K and 1MB */
+#define MPB_ATTRIB_EXP_STRIPE_SIZE __cpu_to_le32(0x00000040)
+
+/* The OROM Support RST Caching of Volumes */
+#define MPB_ATTRIB_NVM __cpu_to_le32(0x02000000)
+/* The OROM supports creating disks greater than 2TB */
+#define MPB_ATTRIB_2TB_DISK __cpu_to_le32(0x04000000)
+/* The OROM supports Bad Block Management */
+#define MPB_ATTRIB_BBM __cpu_to_le32(0x08000000)
+
+/* The OROM supports NVM Caching of Volumes */
+#define MPB_ATTRIB_NEVER_USE2 __cpu_to_le32(0x10000000)
+/* The OROM supports creating volumes greater than 2TB */
+#define MPB_ATTRIB_2TB __cpu_to_le32(0x20000000)
+/* originally for PMP, now it's wasted b/c. Never use this bit! */
+#define MPB_ATTRIB_NEVER_USE __cpu_to_le32(0x40000000)
+/* Verify MPB contents against checksum after reading MPB */
+#define MPB_ATTRIB_CHECKSUM_VERIFY __cpu_to_le32(0x80000000)
+
+/* Define all supported attributes that have to be accepted by mdadm
+ */
+#define MPB_ATTRIB_SUPPORTED (MPB_ATTRIB_CHECKSUM_VERIFY | \
+ MPB_ATTRIB_2TB | \
+ MPB_ATTRIB_2TB_DISK | \
+ MPB_ATTRIB_RAID0 | \
+ MPB_ATTRIB_RAID1 | \
+ MPB_ATTRIB_RAID10 | \
+ MPB_ATTRIB_RAID5 | \
+ MPB_ATTRIB_EXP_STRIPE_SIZE | \
+ MPB_ATTRIB_BBM)
+
+/* Define attributes that are unused but not harmful */
+#define MPB_ATTRIB_IGNORED (MPB_ATTRIB_NEVER_USE)
+
+#define MPB_SECTOR_CNT 2210
+#define IMSM_RESERVED_SECTORS 8192
+#define NUM_BLOCKS_DIRTY_STRIPE_REGION 2048
+#define SECT_PER_MB_SHIFT 11
+#define MAX_SECTOR_SIZE 4096
+#define MULTIPLE_PPL_AREA_SIZE_IMSM (1024 * 1024) /* Size of the whole
+ * multiple PPL area
+ */
+
+/*
+ * Internal Write-intent bitmap is stored in the same area where PPL.
+ * Both features are mutually exclusive, so it is not an issue.
+ * The first 8KiB of the area are reserved and shall not be used.
+ */
+#define IMSM_BITMAP_AREA_RESERVED_SIZE 8192
+
+#define IMSM_BITMAP_HEADER_OFFSET (IMSM_BITMAP_AREA_RESERVED_SIZE)
+#define IMSM_BITMAP_HEADER_SIZE MAX_SECTOR_SIZE
+
+#define IMSM_BITMAP_START_OFFSET (IMSM_BITMAP_HEADER_OFFSET + IMSM_BITMAP_HEADER_SIZE)
+#define IMSM_BITMAP_AREA_SIZE (MULTIPLE_PPL_AREA_SIZE_IMSM - IMSM_BITMAP_START_OFFSET)
+#define IMSM_BITMAP_AND_HEADER_SIZE (IMSM_BITMAP_AREA_SIZE + IMSM_BITMAP_HEADER_SIZE)
+
+#define IMSM_DEFAULT_BITMAP_CHUNKSIZE (64 * 1024 * 1024)
+#define IMSM_DEFAULT_BITMAP_DAEMON_SLEEP 5
+
+/*
+ * This macro lets us ensure that no-one accidentally
+ * changes the size of a struct.
+ *
+ * It expands to an inline function containing a switch with the labels
+ * "case 0" and "case (sizeof(struct _struct) == size)".  When the size
+ * matches, the second label is 1 and the switch is valid; when it does
+ * not, the second label is also 0 and the duplicate case label is
+ * rejected at compile time.
+ */
+#define ASSERT_SIZE(_struct, size) \
+static inline void __assert_size_##_struct(void) \
+{ \
+ switch (0) { \
+ case 0: break; \
+ case (sizeof(struct _struct) == size): break; \
+ } \
+}
+
+/* Disk configuration info.
+ * One entry of the disk table embedded in struct imsm_super; the offsets
+ * in the field comments are relative to the start of the MPB for the
+ * first table entry (the table starts at 0xD8).
+ */
+#define IMSM_MAX_DEVICES 255
+struct imsm_disk {
+ __u8 serial[MAX_RAID_SERIAL_LEN];/* 0xD8 - 0xE7 ascii serial number */
+ __u32 total_blocks_lo; /* 0xE8 - 0xEB total blocks lo */
+ __u32 scsi_id; /* 0xEC - 0xEF scsi ID */
+#define SPARE_DISK __cpu_to_le32(0x01) /* Spare */
+#define CONFIGURED_DISK __cpu_to_le32(0x02) /* Member of some RaidDev */
+#define FAILED_DISK __cpu_to_le32(0x04) /* Permanent failure */
+#define JOURNAL_DISK __cpu_to_le32(0x2000000) /* Device marked as Journaling Drive */
+ __u32 status; /* 0xF0 - 0xF3 */
+ __u32 owner_cfg_num; /* 0xF4 - 0xF7 which config 0,1,2... owns this disk */
+ __u32 total_blocks_hi; /* 0xF8 - 0xFB total blocks hi */
+#define IMSM_DISK_FILLERS 3
+ __u32 filler[IMSM_DISK_FILLERS]; /* 0xFC - 0x107 MPB_DISK_FILLERS for future expansion */
+};
+ASSERT_SIZE(imsm_disk, 48)
+
+/* map selector for map management
+ */
+#define MAP_0 0
+#define MAP_1 1
+#define MAP_X -1
+
+/* RAID map configuration infos.
+ * 64-bit quantities are stored as separate _lo/_hi 32-bit halves.
+ * The structure is variable length: disk_ord_tbl is declared with one
+ * element but holds num_members entries (see sizeof_imsm_map()).
+ */
+struct imsm_map {
+ __u32 pba_of_lba0_lo; /* start address of partition */
+ __u32 blocks_per_member_lo;/* blocks per member */
+ __u32 num_data_stripes_lo; /* number of data stripes */
+ __u16 blocks_per_strip;
+ __u8 map_state; /* Normal, Uninitialized, Degraded, Failed */
+#define IMSM_T_STATE_NORMAL 0
+#define IMSM_T_STATE_UNINITIALIZED 1
+#define IMSM_T_STATE_DEGRADED 2
+#define IMSM_T_STATE_FAILED 3
+ __u8 raid_level;
+#define IMSM_T_RAID0 0
+#define IMSM_T_RAID1 1
+#define IMSM_T_RAID5 5 /* since metadata version 1.2.02 ? */
+ __u8 num_members; /* number of member disks */
+ __u8 num_domains; /* number of parity domains */
+ __u8 failed_disk_num; /* valid only when state is degraded */
+ __u8 ddf;
+ __u32 pba_of_lba0_hi;
+ __u32 blocks_per_member_hi;
+ __u32 num_data_stripes_hi;
+ __u32 filler[4]; /* expansion area */
+#define IMSM_ORD_REBUILD (1 << 24)
+ __u32 disk_ord_tbl[1]; /* disk_ord_tbl[num_members],
+ * top byte contains some flags
+ */
+};
+ASSERT_SIZE(imsm_map, 52)
+
+/* Volume state: migration progress plus the map(s) describing the layout.
+ * map[] is declared with one element; a second, variable-length map
+ * follows the first one when migr_state is non-zero (see get_imsm_map()).
+ */
+struct imsm_vol {
+ __u32 curr_migr_unit_lo;
+ __u32 checkpoint_id; /* id to access curr_migr_unit */
+ __u8 migr_state; /* Normal or Migrating */
+#define MIGR_INIT 0
+#define MIGR_REBUILD 1
+#define MIGR_VERIFY 2 /* analogous to echo check > sync_action */
+#define MIGR_GEN_MIGR 3
+#define MIGR_STATE_CHANGE 4
+#define MIGR_REPAIR 5
+ __u8 migr_type; /* Initializing, Rebuilding, ... */
+#define RAIDVOL_CLEAN 0
+#define RAIDVOL_DIRTY 1
+#define RAIDVOL_DSRECORD_VALID 2
+ __u8 dirty;
+ __u8 fs_state; /* fast-sync state for CnG (0xff == disabled) */
+ __u16 verify_errors; /* number of mismatches */
+ __u16 bad_blocks; /* number of bad blocks during verify */
+ __u32 curr_migr_unit_hi;
+ __u32 filler[3];
+ struct imsm_map map[1];
+ /* here comes another one if migr_state */
+};
+ASSERT_SIZE(imsm_vol, 84)
+
+/* Description of one RAID device (volume) in the metadata.
+ * Ends with a struct imsm_vol, so its real length depends on the maps
+ * stored there (see sizeof_imsm_dev()).  The DEV_* flags below are bits
+ * of 'status', already in little-endian form.
+ */
+struct imsm_dev {
+ __u8 volume[MAX_RAID_SERIAL_LEN];
+ __u32 size_low;
+ __u32 size_high;
+#define DEV_BOOTABLE __cpu_to_le32(0x01)
+#define DEV_BOOT_DEVICE __cpu_to_le32(0x02)
+#define DEV_READ_COALESCING __cpu_to_le32(0x04)
+#define DEV_WRITE_COALESCING __cpu_to_le32(0x08)
+#define DEV_LAST_SHUTDOWN_DIRTY __cpu_to_le32(0x10)
+#define DEV_HIDDEN_AT_BOOT __cpu_to_le32(0x20)
+#define DEV_CURRENTLY_HIDDEN __cpu_to_le32(0x40)
+#define DEV_VERIFY_AND_FIX __cpu_to_le32(0x80)
+#define DEV_MAP_STATE_UNINIT __cpu_to_le32(0x100)
+#define DEV_NO_AUTO_RECOVERY __cpu_to_le32(0x200)
+#define DEV_CLONE_N_GO __cpu_to_le32(0x400)
+#define DEV_CLONE_MAN_SYNC __cpu_to_le32(0x800)
+#define DEV_CNG_MASTER_DISK_NUM __cpu_to_le32(0x1000)
+ __u32 status; /* Persistent RaidDev status */
+ __u32 reserved_blocks; /* Reserved blocks at beginning of volume */
+ __u8 migr_priority;
+ __u8 num_sub_vols;
+ __u8 tid;
+ __u8 cng_master_disk;
+ __u16 cache_policy;
+ __u8 cng_state;
+ __u8 cng_sub_state;
+ __u16 my_vol_raid_dev_num; /* Used in Unique volume Id for this RaidDev */
+
+ /* NVM_EN */
+ __u8 nv_cache_mode;
+ __u8 nv_cache_flags;
+
+ /* Unique Volume Id of the NvCache Volume associated with this volume */
+ __u32 nvc_vol_orig_family_num;
+ __u16 nvc_vol_raid_dev_num;
+
+/* values for rwh_policy */
+#define RWH_OFF 0
+#define RWH_DISTRIBUTED 1
+#define RWH_JOURNALING_DRIVE 2
+#define RWH_MULTIPLE_DISTRIBUTED 3
+#define RWH_MULTIPLE_PPLS_JOURNALING_DRIVE 4
+#define RWH_MULTIPLE_OFF 5
+#define RWH_BITMAP 6
+ __u8 rwh_policy; /* Raid Write Hole Policy */
+ __u8 jd_serial[MAX_RAID_SERIAL_LEN]; /* Journal Drive serial number */
+ __u8 filler1;
+
+#define IMSM_DEV_FILLERS 3
+ __u32 filler[IMSM_DEV_FILLERS];
+ struct imsm_vol vol;
+};
+ASSERT_SIZE(imsm_dev, 164)
+
+/* The MPB anchor: fixed header followed by variable-length tables.
+ * disk[] is declared with one element but holds num_disks entries; the
+ * imsm_dev records and the BBM log follow it, as noted at the end.
+ */
+struct imsm_super {
+ __u8 sig[MAX_SIGNATURE_LENGTH]; /* 0x00 - 0x1F */
+ __u32 check_sum; /* 0x20 - 0x23 MPB Checksum */
+ __u32 mpb_size; /* 0x24 - 0x27 Size of MPB */
+ __u32 family_num; /* 0x28 - 0x2B Checksum from first time this config was written */
+ __u32 generation_num; /* 0x2C - 0x2F Incremented each time this array's MPB is written */
+ __u32 error_log_size; /* 0x30 - 0x33 in bytes */
+ __u32 attributes; /* 0x34 - 0x37 */
+ __u8 num_disks; /* 0x38 Number of configured disks */
+ __u8 num_raid_devs; /* 0x39 Number of configured volumes */
+ __u8 error_log_pos; /* 0x3A */
+ __u8 fill[1]; /* 0x3B */
+ __u32 cache_size; /* 0x3C - 0x3F in mb */
+ __u32 orig_family_num; /* 0x40 - 0x43 original family num */
+ __u32 pwr_cycle_count; /* 0x44 - 0x47 simulated power cycle count for array */
+ __u32 bbm_log_size; /* 0x48 - 0x4B - size of bad Block Mgmt Log in bytes */
+ __u16 num_raid_devs_created; /* 0x4C - 0x4D Used for generating unique
+ * volume IDs for raid_dev created in this array
+ * (starts at 1)
+ */
+ __u16 filler1; /* 0x4E - 0x4F */
+ __u64 creation_time; /* 0x50 - 0x57 Array creation time */
+#define IMSM_FILLERS 32
+ __u32 filler[IMSM_FILLERS]; /* 0x58 - 0xD7 RAID_MPB_FILLERS */
+ struct imsm_disk disk[1]; /* 0xD8 diskTbl[numDisks] */
+ /* here comes imsm_dev[num_raid_devs] */
+ /* here comes BBM logs */
+};
+ASSERT_SIZE(imsm_super, 264)
+
+#define BBM_LOG_MAX_ENTRIES 254
+#define BBM_LOG_MAX_LBA_ENTRY_VAL 256 /* Represents 256 LBAs */
+#define BBM_LOG_SIGNATURE 0xabadb10c
+
+/* 48-bit little-endian block address: w1 carries bits 0-15 and dw1 bits
+ * 16-47 (see __le48_to_cpu() / __cpu_to_le48()).  Packed, so 6 bytes.
+ */
+struct bbm_log_block_addr {
+ __u16 w1;
+ __u32 dw1;
+} __attribute__ ((__packed__));
+
+/* One bad-block range: marked_count + 1 blocks starting at
+ * defective_block_start on the disk numbered disk_ordinal.
+ */
+struct bbm_log_entry {
+ __u8 marked_count; /* Number of blocks marked - 1 */
+ __u8 disk_ordinal; /* Disk entry within the imsm_super */
+ struct bbm_log_block_addr defective_block_start;
+} __attribute__ ((__packed__));
+
+/* Bad Block Management log: an 8-byte header followed by up to
+ * BBM_LOG_MAX_ENTRIES entries, of which the first entry_count are in use.
+ */
+struct bbm_log {
+ __u32 signature; /* 0xABADB10C */
+ __u32 entry_count;
+ struct bbm_log_entry marked_block_entries[BBM_LOG_MAX_ENTRIES];
+};
+ASSERT_SIZE(bbm_log, 2040)
+
+/* printable names, in the order of the IMSM_T_STATE_* values (0..3) */
+static char *map_state_str[] = { "normal", "uninitialized", "degraded", "failed" };
+
+#define BLOCKS_PER_KB (1024/512)
+
+#define RAID_DISK_RESERVED_BLOCKS_IMSM_HI 2209
+
+#define GEN_MIGR_AREA_SIZE 2048 /* General Migration Copy Area size in blocks */
+
+#define MIGR_REC_BUF_SECTORS 1 /* size of migr_record i/o buffer in sectors */
+#define MIGR_REC_SECTOR_POSITION 1 /* migr_record position offset on disk,
+ * MIGR_REC_BUF_SECTORS <= MIGR_REC_SECTOR_POS
+ */
+
+#define UNIT_SRC_NORMAL 0 /* Source data for curr_migr_unit must
+ * be recovered using srcMap */
+#define UNIT_SRC_IN_CP_AREA 1 /* Source data for curr_migr_unit has
+ * already been migrated and must
+ * be recovered from checkpoint area */
+
+#define PPL_ENTRY_SPACE (128 * 1024) /* Size of single PPL, without the header */
+
+/* Migration checkpoint record.
+ * 64-bit values are split into _lo/_hi 32-bit halves; the structure is
+ * fixed at 128 bytes by the ASSERT_SIZE below.
+ */
+struct migr_record {
+ __u32 rec_status; /* Status used to determine how to restart
+ * migration in case it aborts
+ * in some fashion */
+ __u32 curr_migr_unit_lo; /* 0..numMigrUnits-1 */
+ __u32 family_num; /* Family number of MPB
+ * containing the RaidDev
+ * that is migrating */
+ __u32 ascending_migr; /* True if migrating in increasing
+ * order of lbas */
+ __u32 blocks_per_unit; /* Num disk blocks per unit of operation */
+ __u32 dest_depth_per_unit; /* Num member blocks each destMap
+ * member disk
+ * advances per unit-of-operation */
+ __u32 ckpt_area_pba_lo; /* Pba of first block of ckpt copy area */
+ __u32 dest_1st_member_lba_lo; /* First member lba on first
+ * stripe of destination */
+ __u32 num_migr_units_lo; /* Total num migration units-of-op */
+ __u32 post_migr_vol_cap; /* Size of volume after
+ * migration completes */
+ __u32 post_migr_vol_cap_hi; /* Expansion space for LBA64 */
+ __u32 ckpt_read_disk_num; /* Which member disk in destSubMap[0] the
+ * migration ckpt record was read from
+ * (for recovered migrations) */
+ __u32 curr_migr_unit_hi; /* 0..numMigrUnits-1 high order 32 bits */
+ __u32 ckpt_area_pba_hi; /* Pba of first block of ckpt copy area
+ * high order 32 bits */
+ __u32 dest_1st_member_lba_hi; /* First member lba on first stripe of
+ * destination - high order 32 bits */
+ __u32 num_migr_units_hi; /* Total num migration units-of-op
+ * high order 32 bits */
+ __u32 filler[16];
+};
+ASSERT_SIZE(migr_record, 128)
+
+/* Singly linked list node describing a device, linked through 'next'.
+ * NOTE(review): the users of this list are outside this part of the
+ * file; the meaning of 'found' and 'container' should be confirmed there.
+ */
+struct md_list {
+ /* usage marker:
+ * 1: load metadata
+ * 2: metadata does not match
+ * 4: already checked
+ */
+ int used;
+ char *devname;
+ int found;
+ int container;
+ dev_t st_rdev;
+ struct md_list *next;
+};
+
+/* Call pr_err() only when a variable named 'verbose', visible at the
+ * point of use, is non-zero; the result is discarded.
+ */
+#define pr_vrb(fmt, arg...) (void) (verbose && pr_err(fmt, ##arg))
+
+/* Report the migration type of 'dev'.
+ * A stored MIGR_VERIFY combined with the DEV_VERIFY_AND_FIX status bit
+ * is reported as MIGR_REPAIR; anything else is returned as stored.
+ */
+static __u8 migr_type(struct imsm_dev *dev)
+{
+	int repairing = dev->vol.migr_type == MIGR_VERIFY &&
+			(dev->status & DEV_VERIFY_AND_FIX);
+
+	return repairing ? MIGR_REPAIR : dev->vol.migr_type;
+}
+
+/* Store a migration type in 'dev'.
+ * For compatibility with older oroms MIGR_REPAIR is never written as
+ * such: it is stored as MIGR_VERIFY with DEV_VERIFY_AND_FIX set in the
+ * status.  Every other type is stored directly and clears that bit.
+ */
+static void set_migr_type(struct imsm_dev *dev, __u8 migr_type)
+{
+	if (migr_type != MIGR_REPAIR) {
+		dev->vol.migr_type = migr_type;
+		dev->status &= ~DEV_VERIFY_AND_FIX;
+		return;
+	}
+
+	dev->vol.migr_type = MIGR_VERIFY;
+	dev->status |= DEV_VERIFY_AND_FIX;
+}
+
+/* Number of sectors of 'sector_size' bytes needed to hold 'bytes' bytes:
+ * 'bytes' is rounded up with ROUND_UP() and then divided by the sector
+ * size.  NOTE(review): ROUND_UP is defined outside this file; confirm
+ * what it requires of 'sector_size'.
+ */
+static unsigned int sector_count(__u32 bytes, unsigned int sector_size)
+{
+ return ROUND_UP(bytes, sector_size) / sector_size;
+}
+
+/* Size of the MPB in sectors of 'sector_size' bytes, derived from the
+ * little-endian mpb_size field of the anchor.
+ */
+static unsigned int mpb_sectors(struct imsm_super *mpb,
+				unsigned int sector_size)
+{
+	__u32 mpb_bytes = __le32_to_cpu(mpb->mpb_size);
+
+	return sector_count(mpb_bytes, sector_size);
+}
+
+/* List node pairing an imsm_dev with its index; get_imsm_dev() walks
+ * this list looking for a matching 'index'.
+ */
+struct intel_dev {
+ struct imsm_dev *dev;
+ struct intel_dev *next;
+ unsigned index;
+};
+
+/* List node describing one HBA attached to a super (see
+ * alloc_intel_hba()): 'path' is a private copy of the device path and
+ * 'pci_id' points just past the last '/' inside that same string.
+ */
+struct intel_hba {
+ enum sys_dev_type type;
+ char *path;
+ char *pci_id;
+ struct intel_hba *next;
+};
+
+/* pending operation recorded in struct dl ('action' member) */
+enum action {
+ DISK_REMOVE = 1,
+ DISK_ADD
+};
+/* internal representation of IMSM metadata
+ *
+ * The two anonymous unions give typed and untyped views of the same
+ * pointers: buf/anchor for the metadata buffer and
+ * migr_rec_buf/migr_rec for the migration record.
+ */
+struct intel_super {
+ union {
+ void *buf; /* O_DIRECT buffer for reading/writing metadata */
+ struct imsm_super *anchor; /* immovable parameters */
+ };
+ union {
+ void *migr_rec_buf; /* buffer for I/O operations */
+ struct migr_record *migr_rec; /* migration record */
+ };
+ int clean_migration_record_by_mdmon; /* when reshape is switched to next
+ array, it indicates that mdmon is allowed to clean migration
+ record */
+ size_t len; /* size of the 'buf' allocation */
+ size_t extra_space; /* extra space in 'buf' that is not used yet */
+ void *next_buf; /* for realloc'ing buf from the manager */
+ size_t next_len;
+ int updates_pending; /* count of pending updates for mdmon */
+ int current_vol; /* index of raid device undergoing creation */
+ unsigned long long create_offset; /* common start for 'current_vol' */
+ __u32 random; /* random data for seeding new family numbers */
+ struct intel_dev *devlist;
+ unsigned int sector_size; /* sector size of used member drives */
+ /* per-disk state; get_imsm_dl_disk() looks entries up by 'index' */
+ struct dl {
+ struct dl *next;
+ int index;
+ __u8 serial[MAX_RAID_SERIAL_LEN];
+ int major, minor;
+ char *devname;
+ struct imsm_disk disk;
+ int fd;
+ int extent_cnt;
+ struct extent *e; /* for determining freespace @ create */
+ int raiddisk; /* slot to fill in autolayout */
+ enum action action;
+ } *disks, *current_disk;
+ struct dl *disk_mgmt_list; /* list of disks to add/remove while mdmon
+ active */
+ struct dl *missing; /* disks removed while we weren't looking */
+ struct bbm_log *bbm_log;
+ struct intel_hba *hba; /* device path of the raid controller for this metadata */
+ const struct imsm_orom *orom; /* platform firmware support */
+ struct intel_super *next; /* (temp) list for disambiguating family_num */
+ struct md_bb bb; /* memory for get_bad_blocks call */
+};
+
+/* List node holding a copy of an imsm_disk together with an 'owner'
+ * value; IMSM_UNKNOWN_OWNER (-1) is the sentinel defined for 'owner'.
+ */
+struct intel_disk {
+ struct imsm_disk disk;
+ #define IMSM_UNKNOWN_OWNER (-1)
+ int owner;
+ struct intel_disk *next;
+};
+
+/* A range given by its start and size; cmp_extent() orders these by
+ * 'start'.
+ */
+struct extent {
+ unsigned long long start, size;
+};
+
+/* definitions of reshape process types; values are assigned implicitly,
+ * starting at 0
+ */
+enum imsm_reshape_type {
+ CH_TAKEOVER,
+ CH_MIGRATION,
+ CH_ARRAY_SIZE,
+};
+
+/* definition of messages passed to imsm_process_update
+ *
+ * Each imsm_update_* structure below starts with a member of this type.
+ */
+enum imsm_update_type {
+ update_activate_spare,
+ update_create_array,
+ update_kill_array,
+ update_rename_array,
+ update_add_remove_disk,
+ update_reshape_container_disks,
+ update_reshape_migration,
+ update_takeover,
+ update_general_migration_checkpoint,
+ update_size_change,
+ update_prealloc_badblocks_mem,
+ update_rwh_policy,
+};
+
+/* Update message naming a disk ('dl'), a slot and an array; messages can
+ * be chained through 'next'.  Presumably sent with
+ * type == update_activate_spare - confirm against the sender.
+ */
+struct imsm_update_activate_spare {
+ enum imsm_update_type type;
+ struct dl *dl;
+ int slot;
+ int array;
+ struct imsm_update_activate_spare *next;
+};
+
+/* Bundle of geometry values (device names, size, level, layout, chunk
+ * size, number of raid disks).  NOTE(review): its users are outside this
+ * part of the file; units of 'size' and 'chunksize' are not shown here.
+ */
+struct geo_params {
+ char devnm[32];
+ char *dev_name;
+ unsigned long long size;
+ int level;
+ int layout;
+ int chunksize;
+ int raid_disks;
+};
+
+/* direction carried by struct imsm_update_takeover */
+enum takeover_direction {
+ R10_TO_R0,
+ R0_TO_R10
+};
+/* Update message carrying a subarray number and a takeover direction.
+ * Presumably sent with type == update_takeover - confirm against sender.
+ */
+struct imsm_update_takeover {
+ enum imsm_update_type type;
+ int subarray;
+ enum takeover_direction direction;
+};
+
+/* Update message for a change in the number of raid disks.
+ * Variable length: new_disks is declared with one element but, per its
+ * comment, carries one makedev number per added disk.
+ */
+struct imsm_update_reshape {
+ enum imsm_update_type type;
+ int old_raid_disks;
+ int new_raid_disks;
+
+ int new_disks[1]; /* new_raid_disks - old_raid_disks makedev number */
+};
+
+/* Update message like imsm_update_reshape with additional fields for the
+ * target device, level, layout and chunk size.  Variable length in the
+ * same way (new_disks).
+ */
+struct imsm_update_reshape_migration {
+ enum imsm_update_type type;
+ int old_raid_disks;
+ int new_raid_disks;
+ /* fields for array migration changes
+ */
+ int subdev;
+ int new_level;
+ int new_layout;
+ int new_chunksize;
+
+ int new_disks[1]; /* new_raid_disks - old_raid_disks makedev number */
+};
+
+/* Update message carrying a sub-device number and its new size.
+ * NOTE(review): the unit of new_size is not visible here.
+ */
+struct imsm_update_size_change {
+ enum imsm_update_type type;
+ int subdev;
+ long long new_size;
+};
+
+/* Update message carrying a 64-bit current migration unit. */
+struct imsm_update_general_migration_checkpoint {
+ enum imsm_update_type type;
+ __u64 curr_migr_unit;
+};
+
+/* A disk serial number; get_disk_info() locates these records after the
+ * imsm_dev inside a create-array update.
+ */
+struct disk_info {
+ __u8 serial[MAX_RAID_SERIAL_LEN];
+};
+
+/* Update message carrying a device index and a full imsm_dev.
+ * 'dev' is variable length (see sizeof_imsm_dev()); get_disk_info()
+ * treats the bytes following it as struct disk_info records.
+ */
+struct imsm_update_create_array {
+ enum imsm_update_type type;
+ int dev_idx;
+ struct imsm_dev dev;
+};
+
+/* Update message identifying a device by index. */
+struct imsm_update_kill_array {
+ enum imsm_update_type type;
+ int dev_idx;
+};
+
+/* Update message carrying a name (same length as imsm_dev.volume) and a
+ * device index.
+ */
+struct imsm_update_rename_array {
+ enum imsm_update_type type;
+ __u8 name[MAX_RAID_SERIAL_LEN];
+ int dev_idx;
+};
+
+/* Update message with no payload beyond its type. */
+struct imsm_update_add_remove_disk {
+ enum imsm_update_type type;
+};
+
+/* Update message with no payload beyond its type. */
+struct imsm_update_prealloc_bb_mem {
+ enum imsm_update_type type;
+};
+
+/* Update message carrying a new policy value and a device index.
+ * Presumably new_policy takes the RWH_* values defined in
+ * struct imsm_dev - confirm against the sender.
+ */
+struct imsm_update_rwh_policy {
+ enum imsm_update_type type;
+ int new_policy;
+ int dev_idx;
+};
+
+/* printable names indexed by enum sys_dev_type; used by
+ * get_sys_dev_type()
+ */
+static const char *_sys_dev_type[] = {
+ [SYS_DEV_UNKNOWN] = "Unknown",
+ [SYS_DEV_SAS] = "SAS",
+ [SYS_DEV_SATA] = "SATA",
+ [SYS_DEV_NVME] = "NVMe",
+ [SYS_DEV_VMD] = "VMD"
+};
+
+/* Return the printable name of a device type.  Values at or beyond
+ * SYS_DEV_MAX are reported with the name of SYS_DEV_UNKNOWN.
+ */
+const char *get_sys_dev_type(enum sys_dev_type type)
+{
+	return _sys_dev_type[type >= SYS_DEV_MAX ? SYS_DEV_UNKNOWN : type];
+}
+
+/* Allocate an unlinked intel_hba describing 'device'.
+ * The path is duplicated; pci_id is set to the character following the
+ * last '/' of that copy, or to NULL when the path has no '/'.
+ */
+static struct intel_hba * alloc_intel_hba(struct sys_dev *device)
+{
+	struct intel_hba *hba = xmalloc(sizeof(*hba));
+
+	hba->type = device->type;
+	hba->path = xstrdup(device->path);
+	hba->next = NULL;
+
+	if (hba->path) {
+		hba->pci_id = strrchr(hba->path, '/');
+		if (hba->pci_id != NULL)
+			hba->pci_id++;
+	}
+
+	return hba;
+}
+
+/* Search the list starting at 'hba' for an entry with the same type and
+ * path as 'device'.  Returns the entry, or NULL when there is none.
+ */
+static struct intel_hba * find_intel_hba(struct intel_hba *hba, struct sys_dev *device)
+{
+	struct intel_hba *cur = hba;
+
+	while (cur) {
+		if (cur->type == device->type &&
+		    strcmp(cur->path, device->path) == 0)
+			return cur;
+		cur = cur->next;
+	}
+	return NULL;
+}
+
+/* Record that 'device' (an HBA) is used by 'super'.
+ *
+ * Returns 1 when the HBA is already listed or has been added, and 2 when
+ * it cannot be combined with the HBAs already attached: either its type
+ * differs from the first attached HBA, or its OROM is not super->orom.
+ */
+static int attach_hba_to_super(struct intel_super *super, struct sys_dev *device)
+{
+	const struct imsm_orom *device_orom;
+	struct intel_hba *tail;
+
+	/* already known to this super */
+	if (find_intel_hba(super->hba, device) != NULL)
+		return 1;
+
+	/* first HBA for this super */
+	if (super->hba == NULL) {
+		super->hba = alloc_intel_hba(device);
+		return 1;
+	}
+
+	/* Intel metadata allows for all disks attached to the same type HBA.
+	 * Do not support HBA types mixing
+	 */
+	tail = super->hba;
+	if (tail->type != device->type)
+		return 2;
+
+	/* Multiple same type HBAs can be used if they share the same OROM */
+	device_orom = get_orom_by_device_id(device->dev_id);
+	if (device_orom != super->orom)
+		return 2;
+
+	for (; tail->next; tail = tail->next)
+		;
+	tail->next = alloc_intel_hba(device);
+	return 1;
+}
+
+/* Find the Intel HBA that a disk is attached to.
+ *
+ * The disk is identified by 'fd' when that is a valid descriptor,
+ * otherwise by the path in 'devname'.  Returns the matching entry of the
+ * list from find_intel_devices(), or NULL when there are no Intel
+ * devices, no path could be determined, or no HBA matches.
+ *
+ * The path obtained from diskfd_to_devpath() is owned by this function
+ * and is released on every exit path, including the one where a match is
+ * found (which previously leaked it).
+ */
+static struct sys_dev* find_disk_attached_hba(int fd, const char *devname)
+{
+	struct sys_dev *list, *elem;
+	char *disk_path;
+
+	list = find_intel_devices();
+	if (list == NULL)
+		return NULL;
+
+	if (!is_fd_valid(fd))
+		disk_path = (char *) devname;
+	else
+		disk_path = diskfd_to_devpath(fd, 1, NULL);
+
+	if (!disk_path)
+		return NULL;
+
+	/* elem is NULL after the loop when nothing matched */
+	for (elem = list; elem; elem = elem->next)
+		if (path_attached_to_hba(disk_path, elem->path))
+			break;
+
+	/* only free what we allocated, never the caller's devname */
+	if (disk_path != devname)
+		free(disk_path);
+
+	return elem;
+}
+
+static int find_intel_hba_capability(int fd, struct intel_super *super,
+ char *devname);
+
+/* Return a new zero-filled supertype bound to super_imsm when 'arg' is
+ * "imsm" or "default"; otherwise return NULL.
+ */
+static struct supertype *match_metadata_desc_imsm(char *arg)
+{
+	struct supertype *st;
+	int wanted = strcmp(arg, "imsm") == 0 ||
+		     strcmp(arg, "default") == 0;
+
+	if (!wanted)
+		return NULL;
+
+	st = xcalloc(1, sizeof(*st));
+	st->ss = &super_imsm;
+	st->max_devs = IMSM_MAX_DEVICES;
+	st->minor_version = 0;
+	st->sb = NULL;
+	return st;
+}
+
+/* Return a pointer to the bytes of the signature field that follow the
+ * fixed MPB_SIGNATURE text.
+ */
+static __u8 *get_imsm_version(struct imsm_super *mpb)
+{
+	return mpb->sig + MPB_SIG_LEN;
+}
+
+/* Fetch entry 'index' of the disk table straight from the anchor.
+ * Meant for when the anchor is known to be up-to-date (currently only
+ * at load time).  Returns NULL when 'index' is not below num_disks.
+ */
+static struct imsm_disk *__get_imsm_disk(struct imsm_super *mpb, __u8 index)
+{
+	return index < mpb->num_disks ? &mpb->disk[index] : NULL;
+}
+
+/* Look up the struct dl whose 'index' equals the given disk index.
+ * Returns NULL when super->disks holds no such entry.
+ */
+static struct dl *get_imsm_dl_disk(struct intel_super *super, __u8 index)
+{
+	struct dl *cur = super->disks;
+
+	while (cur) {
+		if (cur->index == index)
+			return cur;
+		cur = cur->next;
+	}
+
+	return NULL;
+}
+/* Return the imsm_disk stored in the parsed disk list for 'index', or
+ * NULL when get_imsm_dl_disk() finds no entry.
+ */
+static struct imsm_disk *get_imsm_disk(struct intel_super *super, __u8 index)
+{
+	struct dl *found = get_imsm_dl_disk(super, index);
+
+	return found ? &found->disk : NULL;
+}
+
+/* generate a checksum directly from the anchor when the anchor is known to be
+ * up-to-date, currently only at load or write_super after coalescing
+ *
+ * The checksum is the sum of all little-endian 32-bit words in the first
+ * mpb_size bytes of the MPB, minus the stored check_sum word so that the
+ * checksum field does not contribute to its own value.
+ *
+ * mpb_size is an on-disk little-endian field and is converted before
+ * use, as mpb_sectors() and load_bbm_log() already do; using it raw gave
+ * a wrong word count on big-endian hosts.
+ */
+static __u32 __gen_imsm_checksum(struct imsm_super *mpb)
+{
+	__u32 end = __le32_to_cpu(mpb->mpb_size) / sizeof(end);
+	__u32 *p = (__u32 *) mpb;
+	__u32 sum = 0;
+
+	while (end--) {
+		sum += __le32_to_cpu(*p);
+		p++;
+	}
+
+	return sum - __le32_to_cpu(mpb->check_sum);
+}
+
+/* Size in bytes of a map including its full disk_ord_tbl: the structure
+ * already contains one table slot, so num_members - 1 further slots are
+ * added.
+ */
+static size_t sizeof_imsm_map(struct imsm_map *map)
+{
+	size_t extra_slots = sizeof(__u32) * (map->num_members - 1);
+
+	return sizeof(struct imsm_map) + extra_slots;
+}
+
+/* Select one of the maps of 'dev'.
+ *
+ * A device has a second map, stored directly after the first, while
+ * vol.migr_state is set.  'second_map' chooses what is returned:
+ *   MAP_0 - the first map
+ *   MAP_1 - the second map, or NULL if there is none
+ *   MAP_X - the second map if there is one, else the first
+ * Any other selector yields NULL.
+ */
+struct imsm_map *get_imsm_map(struct imsm_dev *dev, int second_map)
+{
+	struct imsm_map *first = &dev->vol.map[0];
+	struct imsm_map *second = NULL;
+
+	if (dev->vol.migr_state)
+		second = (void *)first + sizeof_imsm_map(first);
+
+	if (second_map == MAP_0)
+		return first;
+	if (second_map == MAP_1)
+		return second;
+	if (second_map == MAP_X)
+		return second ? second : first;
+
+	return NULL;
+}
+
+/* Size in bytes of an imsm_dev including its variable-length map(s).
+ * When the device is migrating, the second map is counted.  Otherwise a
+ * non-zero 'migr_state' argument asks for the size the device would have
+ * if map[0] were duplicated.
+ */
+static size_t sizeof_imsm_dev(struct imsm_dev *dev, int migr_state)
+{
+	struct imsm_map *extra = NULL;
+	size_t size;
+
+	/* fixed part with the embedded map replaced by its real size */
+	size = sizeof(*dev) - sizeof(struct imsm_map) +
+	       sizeof_imsm_map(get_imsm_map(dev, MAP_0));
+
+	/* migrating means an additional map */
+	if (dev->vol.migr_state)
+		extra = get_imsm_map(dev, MAP_1);
+	else if (migr_state)
+		extra = get_imsm_map(dev, MAP_0);
+
+	if (extra)
+		size += sizeof_imsm_map(extra);
+
+	return size;
+}
+
+/* Locate the disk serial number list of a create-array update: it starts
+ * right after the variable-length imsm_dev embedded in the message.
+ */
+static struct disk_info *get_disk_info(struct imsm_update_create_array *update)
+{
+	void *base = update;
+	size_t fixed_part = sizeof(*update) - sizeof(struct imsm_dev);
+
+	return base + fixed_part + sizeof_imsm_dev(&update->dev, 0);
+}
+
+/* Locate raid device 'index' inside the raw MPB.
+ * The device records start after the last disk table entry and are
+ * variable length, so all preceding records are stepped over one by one.
+ * Returns NULL when 'index' is not below num_raid_devs.
+ */
+static struct imsm_dev *__get_imsm_dev(struct imsm_super *mpb, __u8 index)
+{
+	void *cursor;
+	int n;
+
+	if (index >= mpb->num_raid_devs)
+		return NULL;
+
+	/* devices start after all disks */
+	cursor = &mpb->disk[mpb->num_disks];
+
+	for (n = 0; n < index; n++)
+		cursor += sizeof_imsm_dev(cursor, 0);
+
+	return cursor;
+}
+
+/* Return the parsed imsm_dev with the given index from super->devlist.
+ * Returns NULL when 'index' is not below the anchor's num_raid_devs or
+ * no list entry carries that index.
+ */
+static struct imsm_dev *get_imsm_dev(struct intel_super *super, __u8 index)
+{
+	struct intel_dev *cur;
+
+	if (index >= super->anchor->num_raid_devs)
+		return NULL;
+
+	cur = super->devlist;
+	while (cur && cur->index != index)
+		cur = cur->next;
+
+	return cur ? cur->dev : NULL;
+}
+
+/* Decode a 48-bit little-endian block address: dw1 supplies bits 16-47
+ * and w1 bits 0-15.
+ */
+static inline unsigned long long __le48_to_cpu(const struct bbm_log_block_addr
+						*addr)
+{
+	__u64 upper = __le32_to_cpu(addr->dw1);
+	__u64 lower = __le16_to_cpu(addr->w1);
+
+	return (upper << 16) | lower;
+}
+
+/* Encode the low 48 bits of 'sec' as a little-endian block address:
+ * bits 0-15 go to w1 and bits 16-47 to dw1.
+ */
+static inline struct bbm_log_block_addr __cpu_to_le48(unsigned long long sec)
+{
+	struct bbm_log_block_addr out;
+	__u16 lower = (__u16)(sec & 0xffff);
+	__u32 upper = (__u32)(sec >> 16) & 0xffffffff;
+
+	out.w1 = __cpu_to_le16(lower);
+	out.dw1 = __cpu_to_le32(upper);
+	return out;
+}
+
+/* Size in bytes of the used part of a BBM log: the signature and
+ * entry_count header plus entry_count entries.  A missing or empty log
+ * has size 0.
+ */
+static __u32 get_imsm_bbm_log_size(struct bbm_log *log)
+{
+	size_t header = sizeof(log->signature) + sizeof(log->entry_count);
+
+	if (log == NULL || log->entry_count == 0)
+		return 0;
+
+	return header + log->entry_count * sizeof(struct bbm_log_entry);
+}
+
+/* Search the log, starting at entry *pos, for an entry of disk 'idx'
+ * that lies completely inside [sector, sector + length).
+ * On success *pos is set to the index of that entry and 1 is returned;
+ * otherwise 0 is returned and *pos is left unchanged.
+ */
+static int is_stored_in_bbm(struct bbm_log *log, const __u8 idx, const unsigned
+			    long long sector, const int length, __u32 *pos)
+{
+	__u32 n = *pos;
+
+	while (n < log->entry_count) {
+		struct bbm_log_entry *ent = &log->marked_block_entries[n];
+		unsigned long long first =
+			__le48_to_cpu(&ent->defective_block_start);
+		unsigned long long past = first + (ent->marked_count + 1);
+
+		if (ent->disk_ordinal == idx && first >= sector &&
+		    past <= sector + length) {
+			*pos = n;
+			return 1;
+		}
+		n++;
+	}
+	return 0;
+}
+
+/* record new bad block in bbm log
+ *
+ * The range [sector, sector + length) on disk 'idx' is recorded in three
+ * steps:
+ *  1. Existing entries lying inside the range are looked up.  An entry
+ *     that starts exactly at 'sector' and already covers the maximum of
+ *     BBM_LOG_MAX_LBA_ENTRY_VAL blocks is kept and the range is advanced
+ *     past it; the first other match is remembered for reuse.
+ *  2. A remembered entry is rewritten to start at 'sector' and to cover
+ *     as much of the remaining range as one entry can hold.
+ *  3. Whatever is left is appended as new entries of at most
+ *     BBM_LOG_MAX_LBA_ENTRY_VAL blocks each.
+ *
+ * Returns 1 when the reused entry covered the whole remaining range,
+ * 0 when the new entries would not fit in BBM_LOG_MAX_ENTRIES (nothing
+ * is appended in that case), and otherwise the number of entries
+ * appended.
+ */
+static int record_new_badblock(struct bbm_log *log, const __u8 idx, unsigned
+ long long sector, int length)
+{
+ int new_bb = 0;
+ __u32 pos = 0;
+ struct bbm_log_entry *entry = NULL;
+
+ while (is_stored_in_bbm(log, idx, sector, length, &pos)) {
+ struct bbm_log_entry *e = &log->marked_block_entries[pos];
+
+ /* a full entry at the head of the range stays as it is */
+ if ((e->marked_count + 1 == BBM_LOG_MAX_LBA_ENTRY_VAL) &&
+ (__le48_to_cpu(&e->defective_block_start) == sector)) {
+ sector += BBM_LOG_MAX_LBA_ENTRY_VAL;
+ length -= BBM_LOG_MAX_LBA_ENTRY_VAL;
+ pos = pos + 1;
+ continue;
+ }
+ entry = e;
+ break;
+ }
+
+ if (entry) {
+ int cnt = (length <= BBM_LOG_MAX_LBA_ENTRY_VAL) ? length :
+ BBM_LOG_MAX_LBA_ENTRY_VAL;
+ entry->defective_block_start = __cpu_to_le48(sector);
+ entry->marked_count = cnt - 1;
+ if (cnt == length)
+ return 1;
+ sector += cnt;
+ length -= cnt;
+ }
+
+ /* number of additional entries needed for the rest of the range */
+ new_bb = ROUND_UP(length, BBM_LOG_MAX_LBA_ENTRY_VAL) /
+ BBM_LOG_MAX_LBA_ENTRY_VAL;
+ if (log->entry_count + new_bb > BBM_LOG_MAX_ENTRIES)
+ return 0;
+
+ while (length > 0) {
+ int cnt = (length <= BBM_LOG_MAX_LBA_ENTRY_VAL) ? length :
+ BBM_LOG_MAX_LBA_ENTRY_VAL;
+ struct bbm_log_entry *entry =
+ &log->marked_block_entries[log->entry_count];
+
+ entry->defective_block_start = __cpu_to_le48(sector);
+ entry->marked_count = cnt - 1;
+ entry->disk_ordinal = idx;
+
+ sector += cnt;
+ length -= cnt;
+
+ log->entry_count++;
+ }
+
+ return new_bb;
+}
+
+/* Remove every log entry that belongs to disk 'idx'.
+ * A removed entry is overwritten with the last entry of the log and the
+ * log is shortened by one, so the order of entries is not preserved.
+ */
+static void clear_disk_badblocks(struct bbm_log *log, const __u8 idx)
+{
+	struct bbm_log_entry *entries = log->marked_block_entries;
+	__u32 i = 0;
+
+	while (i < log->entry_count) {
+		if (entries[i].disk_ordinal != idx) {
+			i++;
+			continue;
+		}
+
+		/* drop slot i; the moved-in entry is examined next round */
+		log->entry_count--;
+		if (i < log->entry_count)
+			entries[i] = entries[log->entry_count];
+	}
+}
+
+/* Remove the first log entry of disk 'idx' that starts at 'sector' and
+ * covers exactly 'length' blocks, replacing it with the last entry of
+ * the log.  Always returns 1, whether or not an entry was found.
+ */
+static int clear_badblock(struct bbm_log *log, const __u8 idx, const unsigned
+			  long long sector, const int length) {
+	struct bbm_log_entry *entries = log->marked_block_entries;
+	__u32 i;
+
+	for (i = 0; i < log->entry_count; i++) {
+		if (entries[i].disk_ordinal != idx)
+			continue;
+		if (__le48_to_cpu(&entries[i].defective_block_start) != sector)
+			continue;
+		if (entries[i].marked_count + 1 != length)
+			continue;
+
+		log->entry_count--;
+		if (i < log->entry_count)
+			entries[i] = entries[log->entry_count];
+		break;
+	}
+
+	return 1;
+}
+
+/* allocate and load BBM log from metadata
+ *
+ * A zero-filled struct bbm_log is allocated into super->bbm_log.  When
+ * the anchor reports a non-empty log, the log is taken from the last
+ * bbm_log_size bytes of the MPB and validated before it is copied; when
+ * it reports none, an empty log with a valid signature is set up.
+ *
+ * Returns 0 on success, or:
+ *   1 - allocation failed
+ *   2 - bbm_log_size is smaller than the log header, or larger than the
+ *       MPB itself (the log could not lie inside the buffer)
+ *   3 - bad signature or too many entries
+ *   4 - bbm_log_size does not match the entry count
+ * On a non-zero return super->bbm_log stays allocated.
+ */
+static int load_bbm_log(struct intel_super *super)
+{
+	struct imsm_super *mpb = super->anchor;
+	__u32 mpb_size = __le32_to_cpu(mpb->mpb_size);
+	__u32 bbm_log_size = __le32_to_cpu(mpb->bbm_log_size);
+
+	super->bbm_log = xcalloc(1, sizeof(struct bbm_log));
+	if (!super->bbm_log)
+		return 1;
+
+	if (bbm_log_size) {
+		struct bbm_log *log;
+		__u32 entry_count;
+
+		if (bbm_log_size < sizeof(log->signature) +
+		    sizeof(log->entry_count))
+			return 2;
+
+		/* Both sizes come from disk: refuse a log size that would
+		 * place the log before the start of the MPB buffer.
+		 */
+		if (bbm_log_size > mpb_size)
+			return 2;
+
+		log = (void *)mpb + mpb_size - bbm_log_size;
+
+		entry_count = __le32_to_cpu(log->entry_count);
+		if ((__le32_to_cpu(log->signature) != BBM_LOG_SIGNATURE) ||
+		    (entry_count > BBM_LOG_MAX_ENTRIES))
+			return 3;
+
+		/* with entry_count bounded above, a matching size also
+		 * guarantees the copy fits in struct bbm_log
+		 */
+		if (bbm_log_size !=
+		    sizeof(log->signature) + sizeof(log->entry_count) +
+		    entry_count * sizeof(struct bbm_log_entry))
+			return 4;
+
+		memcpy(super->bbm_log, log, bbm_log_size);
+	} else {
+		super->bbm_log->signature = __cpu_to_le32(BBM_LOG_SIGNATURE);
+		super->bbm_log->entry_count = 0;
+	}
+
+	return 0;
+}
+
+/* Test a bad-block entry against a volume occupying
+ * [start_sector, start_sector + size).
+ * Returns 1 when the first block of the entry lies inside the volume, or
+ * when the position just past the entry's last block lies between
+ * start_sector and the end of the volume (both inclusive); else 0.
+ */
+static int is_bad_block_in_volume(const struct bbm_log_entry *entry,
+				  const unsigned long long start_sector,
+				  const unsigned long long size)
+{
+	unsigned long long first =
+		__le48_to_cpu(&entry->defective_block_start);
+	unsigned long long past = first + (entry->marked_count + 1);
+
+	if ((first >= start_sector) && (first < start_sector + size))
+		return 1;
+	if ((past >= start_sector) && (past <= start_sector + size))
+		return 1;
+
+	return 0;
+}
+
+/* Collect into 'bbs' the bad blocks of disk 'idx' that
+ * is_bad_block_in_volume() accepts for the given volume range.
+ * bbs->entries is allocated on first use, with room for
+ * BBM_LOG_MAX_ENTRIES items, if it is not already set; bbs->count is
+ * set to the number of entries written.
+ */
+static void get_volume_badblocks(const struct bbm_log *log, const __u8 idx,
+				 const unsigned long long start_sector,
+				 const unsigned long long size,
+				 struct md_bb *bbs)
+{
+	__u32 found = 0;
+	__u32 n;
+
+	for (n = 0; n < log->entry_count; n++) {
+		const struct bbm_log_entry *ent =
+			&log->marked_block_entries[n];
+		struct md_bb_entry *out;
+
+		if (ent->disk_ordinal != idx)
+			continue;
+		if (!is_bad_block_in_volume(ent, start_sector, size))
+			continue;
+
+		if (!bbs->entries) {
+			bbs->entries = xmalloc(BBM_LOG_MAX_ENTRIES *
+					       sizeof(*out));
+			if (!bbs->entries)
+				break;
+		}
+
+		out = &bbs->entries[found++];
+		out->sector = __le48_to_cpu(&ent->defective_block_start);
+		out->length = ent->marked_count + 1;
+	}
+	bbs->count = found;
+}
+
+/*
+ * Read entry 'slot' of a map's disk_ord_tbl in CPU byte order.
+ * 'second_map' selects the map as for get_imsm_map():
+ *    MAP_0 - first map
+ *    MAP_1 - second map
+ *    MAP_X - map according to the current migr_state
+ * The value is returned whole; its top byte identifies a disk under
+ * rebuild.
+ */
+static __u32 get_imsm_ord_tbl_ent(struct imsm_dev *dev,
+				  int slot,
+				  int second_map)
+{
+	struct imsm_map *selected = get_imsm_map(dev, second_map);
+
+	return __le32_to_cpu(selected->disk_ord_tbl[slot]);
+}
+
+ /* drop the top byte of a 32-bit ord table entry, keeping the low 24 bits */
+ #define ord_to_idx(ord) (((ord) << 8) >> 8)
+ /*
+  * Return the ord table entry of @slot (map selected by @second_map, see
+  * get_imsm_ord_tbl_ent()) with its top byte stripped.
+  */
+ static __u32 get_imsm_disk_idx(struct imsm_dev *dev, int slot, int second_map)
+ {
+     const __u32 entry = get_imsm_ord_tbl_ent(dev, slot, second_map);
+
+     return ord_to_idx(entry);
+ }
+
+ /* Store @ord, converted to little-endian, in @slot of @map's disk_ord_tbl. */
+ static void set_imsm_ord_tbl_ent(struct imsm_map *map, int slot, __u32 ord)
+ {
+     const __u32 le_ord = __cpu_to_le32(ord);
+
+     map->disk_ord_tbl[slot] = le_ord;
+ }
+
+ /*
+  * Find the slot of @map whose ord table entry, top byte stripped, equals @idx.
+  * Returns the slot number, or -1 when none of the num_members slots matches.
+  */
+ static int get_imsm_disk_slot(struct imsm_map *map, unsigned idx)
+ {
+     int pos = 0;
+
+     while (pos < map->num_members) {
+         const __u32 entry = __le32_to_cpu(map->disk_ord_tbl[pos]);
+
+         if (ord_to_idx(entry) == idx)
+             return pos;
+         pos++;
+     }
+
+     return -1;
+ }
+
+ /*
+  * Translate the raid_level stored in @map: a stored level 1 is reported as 1
+  * for a two-member map and as 10 otherwise; any other level is returned
+  * unchanged.
+  */
+ static int get_imsm_raid_level(struct imsm_map *map)
+ {
+     if (map->raid_level != 1)
+         return map->raid_level;
+
+     return (map->num_members == 2) ? 1 : 10;
+ }
+
+ /* qsort() comparator ordering struct extent records by ascending start. */
+ static int cmp_extent(const void *av, const void *bv)
+ {
+     const struct extent *lhs = av;
+     const struct extent *rhs = bv;
+
+     /* yields -1, 0 or 1 without risking overflow of a subtraction */
+     return (lhs->start > rhs->start) - (lhs->start < rhs->start);
+ }
+
+ /* Count the raid devices whose first map (MAP_0) has a slot for disk @dl. */
+ static int count_memberships(struct dl *dl, struct intel_super *super)
+ {
+     int found = 0;
+     int vol;
+
+     for (vol = 0; vol < super->anchor->num_raid_devs; vol++) {
+         struct imsm_dev *dev = get_imsm_dev(super, vol);
+         struct imsm_map *map = get_imsm_map(dev, MAP_0);
+
+         if (get_imsm_disk_slot(map, dl->index) < 0)
+             continue;
+         found++;
+     }
+
+     return found;
+ }
+
+static __u32 imsm_min_reserved_sectors(struct intel_super *super);
+
+ /*
+  * Split @n into a low and a high 32-bit half, convert each to little-endian
+  * and store them through __put_unaligned32() at @lo and @hi.
+  * Returns 0 on success, 1 (nothing stored) when @lo or @hi is NULL.
+  */
+ static int split_ull(unsigned long long n, void *lo, void *hi)
+ {
+     if (!lo || !hi)
+         return 1;
+
+     __put_unaligned32(__cpu_to_le32((__u32)n), lo);
+     __put_unaligned32(__cpu_to_le32((n >> 32)), hi);
+
+     return 0;
+ }
+
+ /*
+  * Build a 64-bit value from two little-endian 32-bit halves: @hi supplies
+  * bits 63..32 and @lo bits 31..0, each converted to CPU byte order first.
+  */
+ static unsigned long long join_u32(__u32 lo, __u32 hi)
+ {
+     unsigned long long value = (unsigned long long)__le32_to_cpu(hi);
+
+     value <<= 32;
+     value |= (unsigned long long)__le32_to_cpu(lo);
+
+     return value;
+ }
+
+ /* total_blocks_lo/hi of @disk joined into one value; 0 when @disk is NULL. */
+ static unsigned long long total_blocks(struct imsm_disk *disk)
+ {
+     return disk ? join_u32(disk->total_blocks_lo, disk->total_blocks_hi) : 0;
+ }
+
+/**
+ * imsm_num_data_members() - get data drives count for an array.
+ * @map: Map to analyze.
+ *
+ * num_data_members value represents minimal count of drives for level.
+ * The name of the property could be misleading for RAID5 with asymmetric layout
+ * because some data required to be calculated from parity.
+ * The property is extracted from level and num_members value.
+ *
+ * Return: num_data_members value on success, zero otherwise.
+ */
+static __u8 imsm_num_data_members(struct imsm_map *map)
+{
+ switch (get_imsm_raid_level(map)) {
+ case 0:
+ return map->num_members;
+ case 1:
+ case 10:
+ return map->num_members / 2;
+ case 5:
+ return map->num_members - 1;
+ default:
+ dprintf("unsupported raid level\n");
+ return 0;
+ }
+}
+
+ /* pba_of_lba0_lo/hi of @map joined into one value; 0 when @map is NULL. */
+ static unsigned long long pba_of_lba0(struct imsm_map *map)
+ {
+     return map ? join_u32(map->pba_of_lba0_lo, map->pba_of_lba0_hi) : 0;
+ }
+
+ /* blocks_per_member_lo/hi of @map joined into one value; 0 for a NULL @map. */
+ static unsigned long long blocks_per_member(struct imsm_map *map)
+ {
+     return map ? join_u32(map->blocks_per_member_lo,
+                           map->blocks_per_member_hi) : 0;
+ }
+
+ /* num_data_stripes_lo/hi of @map joined into one value; 0 for a NULL @map. */
+ static unsigned long long num_data_stripes(struct imsm_map *map)
+ {
+     return map ? join_u32(map->num_data_stripes_lo,
+                           map->num_data_stripes_hi) : 0;
+ }
+
+ /* vol.curr_migr_unit_lo/hi of @dev joined into one value; 0 for NULL @dev. */
+ static unsigned long long vol_curr_migr_unit(struct imsm_dev *dev)
+ {
+     return dev ? join_u32(dev->vol.curr_migr_unit_lo,
+                           dev->vol.curr_migr_unit_hi) : 0;
+ }
+
+ /* size_low/size_high of @dev joined into one value; 0 when @dev is NULL. */
+ static unsigned long long imsm_dev_size(struct imsm_dev *dev)
+ {
+     return dev ? join_u32(dev->size_low, dev->size_high) : 0;
+ }
+
+ /* ckpt_area_pba_lo/hi of @migr_rec joined; 0 when @migr_rec is NULL. */
+ static unsigned long long migr_chkp_area_pba(struct migr_record *migr_rec)
+ {
+     return migr_rec ? join_u32(migr_rec->ckpt_area_pba_lo,
+                                migr_rec->ckpt_area_pba_hi) : 0;
+ }
+
+ /* curr_migr_unit_lo/hi of @migr_rec joined; 0 when @migr_rec is NULL. */
+ static unsigned long long current_migr_unit(struct migr_record *migr_rec)
+ {
+     return migr_rec ? join_u32(migr_rec->curr_migr_unit_lo,
+                                migr_rec->curr_migr_unit_hi) : 0;
+ }
+
+ /* dest_1st_member_lba_lo/hi of @migr_rec joined; 0 when @migr_rec is NULL. */
+ static unsigned long long migr_dest_1st_member_lba(struct migr_record *migr_rec)
+ {
+     return migr_rec ? join_u32(migr_rec->dest_1st_member_lba_lo,
+                                migr_rec->dest_1st_member_lba_hi) : 0;
+ }
+
+ /* num_migr_units_lo/hi of @migr_rec joined; 0 when @migr_rec is NULL. */
+ static unsigned long long get_num_migr_units(struct migr_record *migr_rec)
+ {
+     return migr_rec ? join_u32(migr_rec->num_migr_units_lo,
+                                migr_rec->num_migr_units_hi) : 0;
+ }
+
+ /* Store @n in @disk's total_blocks_lo/total_blocks_hi pair via split_ull(). */
+ static void set_total_blocks(struct imsm_disk *disk, unsigned long long n)
+ {
+     void *lo = &disk->total_blocks_lo;
+     void *hi = &disk->total_blocks_hi;
+
+     split_ull(n, lo, hi);
+ }
+
+/**
+ * set_num_domains() - Set number of domains for an array.
+ * @map: Map to be updated.
+ *
+ * num_domains property represents copies count of each data drive, thus make
+ * it meaningful only for RAID1 and RAID10. IMSM supports two domains for
+ * raid1 and raid10.
+ */
+static void set_num_domains(struct imsm_map *map)
+{
+ int level = get_imsm_raid_level(map);
+
+ if (level == 1 || level == 10)
+ map->num_domains = 2;
+ else
+ map->num_domains = 1;
+}
+
+ /* Store @n in @map's pba_of_lba0_lo/pba_of_lba0_hi pair via split_ull(). */
+ static void set_pba_of_lba0(struct imsm_map *map, unsigned long long n)
+ {
+     void *lo = &map->pba_of_lba0_lo;
+     void *hi = &map->pba_of_lba0_hi;
+
+     split_ull(n, lo, hi);
+ }
+
+ /* Store @n in @map's blocks_per_member_lo/hi pair via split_ull(). */
+ static void set_blocks_per_member(struct imsm_map *map, unsigned long long n)
+ {
+     void *lo = &map->blocks_per_member_lo;
+     void *hi = &map->blocks_per_member_hi;
+
+     split_ull(n, lo, hi);
+ }
+
+ /* Store @n in @map's num_data_stripes_lo/hi pair via split_ull(). */
+ static void set_num_data_stripes(struct imsm_map *map, unsigned long long n)
+ {
+     void *lo = &map->num_data_stripes_lo;
+     void *hi = &map->num_data_stripes_hi;
+
+     split_ull(n, lo, hi);
+ }
+
+/**
+ * update_num_data_stripes() - Calculate and update num_data_stripes value.
+ * @map: map to be updated.
+ * @dev_size: size of volume.
+ *
+ * num_data_stripes value is addictionally divided by num_domains, therefore for
+ * levels where num_domains is not 1, nds is a part of real value.
+ */
+static void update_num_data_stripes(struct imsm_map *map,
+ unsigned long long dev_size)
+{
+ unsigned long long nds = dev_size / imsm_num_data_members(map);
+
+ nds /= map->num_domains;
+ nds /= map->blocks_per_strip;
+ set_num_data_stripes(map, nds);
+}
+
+ /*
+  * Store @n in @dev's vol.curr_migr_unit_lo/hi pair via split_ull().
+  * Does nothing when @dev is NULL.
+  */
+ static void set_vol_curr_migr_unit(struct imsm_dev *dev, unsigned long long n)
+ {
+     if (dev != NULL)
+         split_ull(n, &dev->vol.curr_migr_unit_lo,
+                   &dev->vol.curr_migr_unit_hi);
+ }
+
+ /* Store @n in @dev's size_low/size_high pair via split_ull(). */
+ static void set_imsm_dev_size(struct imsm_dev *dev, unsigned long long n)
+ {
+     void *lo = &dev->size_low;
+     void *hi = &dev->size_high;
+
+     split_ull(n, lo, hi);
+ }
+
+ /* Store @n in @migr_rec's ckpt_area_pba_lo/hi pair via split_ull(). */
+ static void set_migr_chkp_area_pba(struct migr_record *migr_rec,
+                                    unsigned long long n)
+ {
+     void *lo = &migr_rec->ckpt_area_pba_lo;
+     void *hi = &migr_rec->ckpt_area_pba_hi;
+
+     split_ull(n, lo, hi);
+ }
+
+ /* Store @n in @migr_rec's curr_migr_unit_lo/hi pair via split_ull(). */
+ static void set_current_migr_unit(struct migr_record *migr_rec,
+                                   unsigned long long n)
+ {
+     void *lo = &migr_rec->curr_migr_unit_lo;
+     void *hi = &migr_rec->curr_migr_unit_hi;
+
+     split_ull(n, lo, hi);
+ }
+
+ /* Store @n in @migr_rec's dest_1st_member_lba_lo/hi pair via split_ull(). */
+ static void set_migr_dest_1st_member_lba(struct migr_record *migr_rec,
+                                          unsigned long long n)
+ {
+     void *lo = &migr_rec->dest_1st_member_lba_lo;
+     void *hi = &migr_rec->dest_1st_member_lba_hi;
+
+     split_ull(n, lo, hi);
+ }
+
+ /* Store @n in @migr_rec's num_migr_units_lo/hi pair via split_ull(). */
+ static void set_num_migr_units(struct migr_record *migr_rec,
+                                unsigned long long n)
+ {
+     void *lo = &migr_rec->num_migr_units_lo;
+     void *hi = &migr_rec->num_migr_units_hi;
+
+     split_ull(n, lo, hi);
+ }
+
+ /*
+  * Compute num_data_stripes(map) * blocks_per_strip, doubled when
+  * get_imsm_raid_level() reports level 1 or 10. Returns 0 for a NULL @map.
+  */
+ static unsigned long long per_dev_array_size(struct imsm_map *map)
+ {
+     unsigned long long sectors;
+     int level;
+
+     if (!map)
+         return 0;
+
+     sectors = num_data_stripes(map) * map->blocks_per_strip;
+     level = get_imsm_raid_level(map);
+     if (level == 1 || level == 10)
+         sectors *= 2;
+
+     return sectors;
+ }
+
+ /**
+  * get_extents() - find a list of used extents on the given physical device.
+  * @super: container metadata.
+  * @dl: disk to inspect.
+  * @get_minimal_reservation: when non-zero, the metadata reservation comes
+  *     from imsm_min_reserved_sectors() even if @dl is not a spare.
+  *
+  * Return: array obtained from xcalloc() holding one entry (start from
+  * pba_of_lba0(), size from per_dev_array_size()) for every volume whose MAP_0
+  * has a slot for @dl, sorted by start, followed by a terminating entry of
+  * size 0 whose start is total_blocks minus the metadata reservation.
+  * Callers in this file release the array with free().
+  */
+ static struct extent *get_extents(struct intel_super *super, struct dl *dl,
+                                   int get_minimal_reservation)
+ {
+     struct extent *rv, *e;
+     int i;
+     int memberships = count_memberships(dl, super);
+     __u32 reservation;
+
+     /* trim the reserved area for spares, so they can join any array
+      * regardless of whether the OROM has assigned sectors from the
+      * IMSM_RESERVED_SECTORS region
+      */
+     if (dl->index == -1 || get_minimal_reservation)
+         reservation = imsm_min_reserved_sectors(super);
+     else
+         reservation = MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS;
+
+     rv = xcalloc(sizeof(struct extent), (memberships + 1));
+     e = rv;
+
+     for (i = 0; i < super->anchor->num_raid_devs; i++) {
+         struct imsm_dev *dev = get_imsm_dev(super, i);
+         struct imsm_map *map = get_imsm_map(dev, MAP_0);
+
+         if (get_imsm_disk_slot(map, dl->index) >= 0) {
+             e->start = pba_of_lba0(map);
+             e->size = per_dev_array_size(map);
+             e++;
+         }
+     }
+     qsort(rv, memberships, sizeof(*rv), cmp_extent);
+
+     /* determine the start of the metadata
+      * when no raid devices are defined use the default
+      * ...otherwise allow the metadata to truncate the value
+      * as is the case with older versions of imsm
+      */
+     if (memberships) {
+         struct extent *last = &rv[memberships - 1];
+         unsigned long long remainder;
+
+         remainder = total_blocks(&dl->disk) - (last->start + last->size);
+         /* round down to 1k block to satisfy precision of the kernel
+          * 'size' interface; the mask must be as wide as remainder,
+          * a 32-bit ~1UL would also clear bits 63..32
+          */
+         remainder &= ~1ULL;
+         /* make sure remainder is still sane */
+         if (remainder < (unsigned)ROUND_UP(super->len, 512) >> 9)
+             remainder = ROUND_UP(super->len, 512) >> 9;
+         if (reservation > remainder)
+             reservation = remainder;
+     }
+     e->start = total_blocks(&dl->disk) - reservation;
+     e->size = 0;
+     return rv;
+ }
+
+ /*
+  * Work out how many sectors at the end of @dl are reserved for metadata:
+  * total_blocks minus the start of the terminating (size 0) entry returned by
+  * get_extents(). A spare (index -1) gets MPB_SECTOR_CNT; if get_extents()
+  * returns NULL the default MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS is used.
+  */
+ static __u32 imsm_reserved_sectors(struct intel_super *super, struct dl *dl)
+ {
+     struct extent *extents;
+     struct extent *last;
+     __u32 reserved;
+
+     /* a spare only needs a minimal reservation, which will grow once
+      * the spare is picked up by an array
+      */
+     if (dl->index == -1)
+         return MPB_SECTOR_CNT;
+
+     extents = get_extents(super, dl, 0);
+     if (extents == NULL)
+         return MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS;
+
+     /* walk to the first entry whose size is 0 */
+     last = extents;
+     while (last->size)
+         last++;
+
+     reserved = total_blocks(&dl->disk) - last->start;
+     free(extents);
+
+     return reserved;
+ }
+
+ /* Returns 1 when every SPARE_DISK bit is set in @disk's status, else 0. */
+ static int is_spare(struct imsm_disk *disk)
+ {
+     if ((disk->status & SPARE_DISK) != SPARE_DISK)
+         return 0;
+     return 1;
+ }
+
+ /* Returns 1 when every CONFIGURED_DISK bit is set in @disk's status, else 0. */
+ static int is_configured(struct imsm_disk *disk)
+ {
+     if ((disk->status & CONFIGURED_DISK) != CONFIGURED_DISK)
+         return 0;
+     return 1;
+ }
+
+ /* Returns 1 when every FAILED_DISK bit is set in @disk's status, else 0. */
+ static int is_failed(struct imsm_disk *disk)
+ {
+     if ((disk->status & FAILED_DISK) != FAILED_DISK)
+         return 0;
+     return 1;
+ }
+
+ /* Returns 1 when every JOURNAL_DISK bit is set in @disk's status, else 0. */
+ static int is_journal(struct imsm_disk *disk)
+ {
+     if ((disk->status & JOURNAL_DISK) != JOURNAL_DISK)
+         return 0;
+     return 1;
+ }
+
+ /*
+  * Round an array size down so that it splits evenly between members: the
+  * per-member share (size / disk_count) is truncated to a multiple of
+  * (1 << SECT_PER_MB_SHIFT) and multiplied back by disk_count.
+  */
+ static unsigned long long round_size_to_mb(unsigned long long size, unsigned int
+                                            disk_count)
+ {
+     unsigned long long per_disk = size / disk_count;
+
+     /* clear the low SECT_PER_MB_SHIFT bits */
+     per_disk >>= SECT_PER_MB_SHIFT;
+     per_disk <<= SECT_PER_MB_SHIFT;
+
+     return per_disk * disk_count;
+ }
+
+ /*
+  * Tell whether @missing_disks absent members still allow a resync: level 10
+  * tolerates one missing disk, every other level none.
+  * Returns 1 when the count is within the limit, 0 otherwise.
+  */
+ static int able_to_resync(int raid_level, int missing_disks)
+ {
+     const int max_missing_disks = (raid_level == 10) ? 1 : 0;
+
+     return missing_disks <= max_missing_disks;
+ }
+
+ /*
+  * Determine how much space is reserved for metadata on the smallest active
+  * disk (index >= 0), using the terminating entry of its get_extents() list.
+  * The result is capped at MPB_SECTOR_CNT + NUM_BLOCKS_DIRTY_STRIPE_REGION.
+  * Falls back to MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS when @super is NULL,
+  * no active disk exists or get_extents() returns NULL.
+  */
+ static __u32 imsm_min_reserved_sectors(struct intel_super *super)
+ {
+     struct extent *extents;
+     struct dl *dl;
+     struct dl *smallest = NULL;
+     unsigned long long smallest_blocks = 0;
+     __u32 remainder;
+     __u32 rv = MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS;
+     int last = 0;
+
+     if (!super)
+         return rv;
+
+     for (dl = super->disks; dl; dl = dl->next) {
+         unsigned long long blocks;
+
+         if (dl->index < 0)
+             continue;
+
+         blocks = total_blocks(&dl->disk);
+         /* a stored size of 0 means "nothing picked yet" */
+         if (smallest_blocks == 0 || blocks < smallest_blocks) {
+             smallest = dl;
+             smallest_blocks = blocks;
+         }
+     }
+     if (!smallest)
+         return rv;
+
+     /* find last lba used by subarrays on the smallest active disk */
+     extents = get_extents(super, smallest, 0);
+     if (!extents)
+         return rv;
+     while (extents[last].size)
+         last++;
+
+     remainder = smallest_blocks - extents[last].start;
+     free(extents);
+
+     /* to give priority to recovery we should not require full
+      * IMSM_RESERVED_SECTORS from the spare
+      */
+     rv = MPB_SECTOR_CNT + NUM_BLOCKS_DIRTY_STRIPE_REGION;
+
+     /* if real reservation is smaller use that value */
+     if (remainder < rv)
+         return remainder;
+     return rv;
+ }
+
+/*
+ * Return minimum size of a spare and sector size
+ * that can be used in this array
+ */
+int get_spare_criteria_imsm(struct supertype *st, struct spare_criteria *c)
+{
+ struct intel_super *super = st->sb;
+ struct dl *dl;
+ struct extent *e;
+ int i;
+ unsigned long long size = 0;
+
+ c->min_size = 0;
+ c->sector_size = 0;
+
+ if (!super)
+ return -EINVAL;
+ /* find first active disk in array */
+ dl = super->disks;
+ while (dl && (is_failed(&dl->disk) || dl->index == -1))
+ dl = dl->next;
+ if (!dl)
+ return -EINVAL;
+ /* find last lba used by subarrays */
+ e = get_extents(super, dl, 0);
+ if (!e)
+ return -EINVAL;
+ for (i = 0; e[i].size; i++)
+ continue;
+ if (i > 0)
+ size = e[i-1].start + e[i-1].size;
+ free(e);
+
+ /* add the amount of space needed for metadata */
+ size += imsm_min_reserved_sectors(super);
+
+ c->min_size = size * 512;
+ c->sector_size = super->sector_size;
+
+ return 0;
+}
+
+static bool is_gen_migration(struct imsm_dev *dev);
+
+#define IMSM_4K_DIV 8
+
+static __u64 blocks_per_migr_unit(struct intel_super *super,
+ struct imsm_dev *dev);
+
+ /*
+  * Print a human-readable description of one volume to stdout.
+  * @super: container; supplies current_vol (printed as "Subarray") and
+  *         sector_size (used to scale the printed sizes).
+  * @dev: volume to describe.
+  * @uuid: UUID string printed as is.
+  * @disk_idx: disk index looked up in MAP_0 to report "This Slot".
+  *
+  * Values printed after "<--" come from MAP_1 and only appear when
+  * get_imsm_map(dev, MAP_1) returns a map.
+  */
+ static void print_imsm_dev(struct intel_super *super,
+ struct imsm_dev *dev,
+ char *uuid,
+ int disk_idx)
+ {
+ __u64 sz;
+ int slot, i;
+ struct imsm_map *map = get_imsm_map(dev, MAP_0);
+ struct imsm_map *map2 = get_imsm_map(dev, MAP_1);
+ __u32 ord;
+
+ printf("\n");
+ printf("[%.16s]:\n", dev->volume);
+ printf(" Subarray : %d\n", super->current_vol);
+ printf(" UUID : %s\n", uuid);
+ printf(" RAID Level : %d", get_imsm_raid_level(map));
+ if (map2)
+ printf(" <-- %d", get_imsm_raid_level(map2));
+ printf("\n");
+ printf(" Members : %d", map->num_members);
+ if (map2)
+ printf(" <-- %d", map2->num_members);
+ printf("\n");
+ /* one character per slot: "_" when IMSM_ORD_REBUILD is set, else "U" */
+ printf(" Slots : [");
+ for (i = 0; i < map->num_members; i++) {
+ ord = get_imsm_ord_tbl_ent(dev, i, MAP_0);
+ printf("%s", ord & IMSM_ORD_REBUILD ? "_" : "U");
+ }
+ printf("]");
+ if (map2) {
+ printf(" <-- [");
+ for (i = 0; i < map2->num_members; i++) {
+ ord = get_imsm_ord_tbl_ent(dev, i, MAP_1);
+ printf("%s", ord & IMSM_ORD_REBUILD ? "_" : "U");
+ }
+ printf("]");
+ }
+ printf("\n");
+ /* failed_disk_num 0xff is reported as "none" */
+ printf(" Failed disk : ");
+ if (map->failed_disk_num == 0xff)
+ printf("none");
+ else
+ printf("%i", map->failed_disk_num);
+ printf("\n");
+ slot = get_imsm_disk_slot(map, disk_idx);
+ if (slot >= 0) {
+ ord = get_imsm_ord_tbl_ent(dev, slot, MAP_X);
+ printf(" This Slot : %d%s\n", slot,
+ ord & IMSM_ORD_REBUILD ? " (out-of-sync)" : "");
+ } else
+ printf(" This Slot : ?\n");
+ printf(" Sector Size : %u\n", super->sector_size);
+ /* sizes are printed in units of sector_size; human_size() gets bytes */
+ sz = imsm_dev_size(dev);
+ printf(" Array Size : %llu%s\n",
+ (unsigned long long)sz * 512 / super->sector_size,
+ human_size(sz * 512));
+ sz = blocks_per_member(map);
+ printf(" Per Dev Size : %llu%s\n",
+ (unsigned long long)sz * 512 / super->sector_size,
+ human_size(sz * 512));
+ printf(" Sector Offset : %llu\n",
+ pba_of_lba0(map) * 512 / super->sector_size);
+ printf(" Num Stripes : %llu\n",
+ num_data_stripes(map));
+ /* blocks_per_strip / 2 is shown as KiB, i.e. presumably 512-byte blocks */
+ printf(" Chunk Size : %u KiB",
+ __le16_to_cpu(map->blocks_per_strip) / 2);
+ if (map2)
+ printf(" <-- %u KiB",
+ __le16_to_cpu(map2->blocks_per_strip) / 2);
+ printf("\n");
+ printf(" Reserved : %d\n", __le32_to_cpu(dev->reserved_blocks));
+ printf(" Migrate State : ");
+ if (dev->vol.migr_state) {
+ if (migr_type(dev) == MIGR_INIT)
+ printf("initialize\n");
+ else if (migr_type(dev) == MIGR_REBUILD)
+ printf("rebuild\n");
+ else if (migr_type(dev) == MIGR_VERIFY)
+ printf("check\n");
+ else if (migr_type(dev) == MIGR_GEN_MIGR)
+ printf("general migration\n");
+ else if (migr_type(dev) == MIGR_STATE_CHANGE)
+ printf("state change\n");
+ else if (migr_type(dev) == MIGR_REPAIR)
+ printf("repair\n");
+ else
+ printf("<unknown:%d>\n", migr_type(dev));
+ } else
+ printf("idle\n");
+ /* NOTE(review): map_state_str[] is indexed by map_state with no range
+ * check visible here - confirm the value is validated elsewhere
+ */
+ printf(" Map State : %s", map_state_str[map->map_state]);
+ if (dev->vol.migr_state) {
+ /* this inner "map" shadows the MAP_0 pointer declared above */
+ struct imsm_map *map = get_imsm_map(dev, MAP_1);
+
+ printf(" <-- %s", map_state_str[map->map_state]);
+ printf("\n Checkpoint : %llu ", vol_curr_migr_unit(dev));
+ /* during general migration the unit size is only printed when
+ * this disk sits in slot 0 or 1
+ */
+ if (is_gen_migration(dev) && (slot > 1 || slot < 0))
+ printf("(N/A)");
+ else
+ printf("(%llu)", (unsigned long long)
+ blocks_per_migr_unit(super, dev));
+ }
+ printf("\n");
+ printf(" Dirty State : %s\n", (dev->vol.dirty & RAIDVOL_DIRTY) ?
+ "dirty" : "clean");
+ printf(" RWH Policy : ");
+ if (dev->rwh_policy == RWH_OFF || dev->rwh_policy == RWH_MULTIPLE_OFF)
+ printf("off\n");
+ else if (dev->rwh_policy == RWH_DISTRIBUTED)
+ printf("PPL distributed\n");
+ else if (dev->rwh_policy == RWH_JOURNALING_DRIVE)
+ printf("PPL journaling drive\n");
+ else if (dev->rwh_policy == RWH_MULTIPLE_DISTRIBUTED)
+ printf("Multiple distributed PPLs\n");
+ else if (dev->rwh_policy == RWH_MULTIPLE_PPLS_JOURNALING_DRIVE)
+ printf("Multiple PPLs on journaling drive\n");
+ else if (dev->rwh_policy == RWH_BITMAP)
+ printf("Write-intent bitmap\n");
+ else
+ printf("<unknown:%d>\n", dev->rwh_policy);
+
+ printf(" Volume ID : %u\n", dev->my_vol_raid_dev_num);
+ }
+
+ /*
+  * Print serial, state flags, id and usable size of one disk to stdout.
+  * @disk: disk record to describe; nothing is printed when it is NULL.
+  * @index: disk index shown in the "DiskNN Serial" label; -1 prints a plain
+  *         "Disk Serial" label; anything below -1 prints nothing.
+  * @reserved: number of blocks subtracted from total_blocks() to get the
+  *            usable size.
+  * @sector_size: unit the usable size is printed in.
+  */
+ static void print_imsm_disk(struct imsm_disk *disk,
+                             int index,
+                             __u32 reserved,
+                             unsigned int sector_size) {
+     char str[MAX_RAID_SERIAL_LEN + 1];
+     __u64 sz;
+
+     if (index < -1 || !disk)
+         return;
+
+     printf("\n");
+     /* serial is presumably a fixed MAX_RAID_SERIAL_LEN-byte field that
+      * need not be NUL-terminated (TODO confirm against struct imsm_disk);
+      * the precision keeps snprintf from reading past those bytes
+      */
+     snprintf(str, sizeof(str), "%.*s", (int)MAX_RAID_SERIAL_LEN,
+              disk->serial);
+     if (index >= 0)
+         printf(" Disk%02d Serial : %s\n", index, str);
+     else
+         printf(" Disk Serial : %s\n", str);
+     printf(" State :%s%s%s%s\n", is_spare(disk) ? " spare" : "",
+            is_configured(disk) ? " active" : "",
+            is_failed(disk) ? " failed" : "",
+            is_journal(disk) ? " journal" : "");
+     printf(" Id : %08x\n", __le32_to_cpu(disk->scsi_id));
+     sz = total_blocks(disk) - reserved;
+     printf(" Usable Size : %llu%s\n",
+            (unsigned long long)sz * 512 / sector_size,
+            human_size(sz * 512));
+ }
+
+ /*
+  * Divide the block-based fields of the migration record by IMSM_4K_DIV:
+  * blocks_per_unit, dest_depth_per_unit, post_migr_vol_cap (lo/hi pair),
+  * the checkpoint area pba and the destination first member lba.
+  */
+ void convert_to_4k_imsm_migr_rec(struct intel_super *super)
+ {
+     struct migr_record *rec = super->migr_rec;
+     unsigned long long vol_cap;
+
+     rec->blocks_per_unit /= IMSM_4K_DIV;
+     rec->dest_depth_per_unit /= IMSM_4K_DIV;
+
+     vol_cap = join_u32(rec->post_migr_vol_cap, rec->post_migr_vol_cap_hi);
+     split_ull(vol_cap / IMSM_4K_DIV,
+               &rec->post_migr_vol_cap, &rec->post_migr_vol_cap_hi);
+
+     set_migr_chkp_area_pba(rec, migr_chkp_area_pba(rec) / IMSM_4K_DIV);
+     set_migr_dest_1st_member_lba(rec,
+                                  migr_dest_1st_member_lba(rec) / IMSM_4K_DIV);
+ }
+
+ /* Divide the total block count stored in @disk by IMSM_4K_DIV. */
+ void convert_to_4k_imsm_disk(struct imsm_disk *disk)
+ {
+     const unsigned long long blocks = total_blocks(disk);
+
+     set_total_blocks(disk, blocks / IMSM_4K_DIV);
+ }
+
+ /**
+  * convert_to_4k() - divide the block-based anchor fields by IMSM_4K_DIV.
+  * @super: container whose anchor (mpb) is rewritten in place.
+  *
+  * Converted fields: total blocks of every disk; size and current migration
+  * unit of every volume; blocks_per_member, blocks_per_strip and pba_of_lba0
+  * of MAP_0, and of MAP_1 while migr_state is set; start and length of every
+  * bad block log entry (a converted entry always keeps at least one block).
+  * The checksum is regenerated afterwards.
+  */
+ void convert_to_4k(struct intel_super *super)
+ {
+     struct imsm_super *mpb = super->anchor;
+     struct imsm_disk *disk;
+     int i;
+     __u32 bbm_log_size = __le32_to_cpu(mpb->bbm_log_size);
+
+     for (i = 0; i < mpb->num_disks ; i++) {
+         disk = __get_imsm_disk(mpb, i);
+         /* disk */
+         convert_to_4k_imsm_disk(disk);
+     }
+     for (i = 0; i < mpb->num_raid_devs; i++) {
+         struct imsm_dev *dev = __get_imsm_dev(mpb, i);
+         struct imsm_map *map = get_imsm_map(dev, MAP_0);
+         /* dev */
+         set_imsm_dev_size(dev, imsm_dev_size(dev)/IMSM_4K_DIV);
+         set_vol_curr_migr_unit(dev,
+                                vol_curr_migr_unit(dev) / IMSM_4K_DIV);
+
+         /* map0 */
+         set_blocks_per_member(map, blocks_per_member(map)/IMSM_4K_DIV);
+         map->blocks_per_strip /= IMSM_4K_DIV;
+         set_pba_of_lba0(map, pba_of_lba0(map)/IMSM_4K_DIV);
+
+         if (dev->vol.migr_state) {
+             /* map1 */
+             map = get_imsm_map(dev, MAP_1);
+             set_blocks_per_member(map,
+                                   blocks_per_member(map)/IMSM_4K_DIV);
+             map->blocks_per_strip /= IMSM_4K_DIV;
+             set_pba_of_lba0(map, pba_of_lba0(map)/IMSM_4K_DIV);
+         }
+     }
+     if (bbm_log_size) {
+         /* the log occupies the last bbm_log_size bytes of the mpb */
+         struct bbm_log *log = (void *)mpb +
+             __le32_to_cpu(mpb->mpb_size) - bbm_log_size;
+         __u32 ent;
+
+         for (ent = 0; ent < log->entry_count; ent++) {
+             struct bbm_log_entry *entry =
+                 &log->marked_block_entries[ent];
+
+             /* int rather than __u8: marked_count + 1 is 256 for a
+              * marked_count of 255 and would wrap to 0 in 8 bits
+              */
+             int count = entry->marked_count + 1;
+             unsigned long long sector =
+                 __le48_to_cpu(&entry->defective_block_start);
+
+             entry->defective_block_start =
+                 __cpu_to_le48(sector/IMSM_4K_DIV);
+             entry->marked_count = max(count/IMSM_4K_DIV, 1) - 1;
+         }
+     }
+
+     mpb->check_sum = __gen_imsm_checksum(mpb);
+ }
+
+ /*
+  * Print the migration record to stdout for the first volume that is under
+  * general migration; prints nothing when no volume is. The record fields are
+  * only shown when the examined disk (super->disks) sits in slot 0 or 1 of
+  * that volume's MAP_0, otherwise " Empty" and a hint are printed.
+  */
+ void examine_migr_rec_imsm(struct intel_super *super)
+ {
+     struct migr_record *migr_rec = super->migr_rec;
+     struct imsm_super *mpb = super->anchor;
+     struct imsm_dev *dev = NULL;
+     struct imsm_map *map;
+     int slot = -1;
+     int vol;
+
+     /* locate the first volume under general migration */
+     for (vol = 0; vol < mpb->num_raid_devs; vol++) {
+         struct imsm_dev *candidate = __get_imsm_dev(mpb, vol);
+
+         if (is_gen_migration(candidate) == false)
+             continue;
+         dev = candidate;
+         break;
+     }
+     if (vol >= mpb->num_raid_devs)
+         return;
+
+     printf("\nMigration Record Information:");
+
+     /* first map under migration */
+     map = get_imsm_map(dev, MAP_0);
+     if (map)
+         slot = get_imsm_disk_slot(map, super->disks->index);
+     if (map == NULL || slot > 1 || slot < 0) {
+         printf(" Empty\n ");
+         printf("Examine one of first two disks in array\n");
+         return;
+     }
+
+     printf("\n Status : ");
+     if (__le32_to_cpu(migr_rec->rec_status) == UNIT_SRC_NORMAL)
+         printf("Normal\n");
+     else
+         printf("Contains Data\n");
+     printf(" Current Unit : %llu\n",
+            current_migr_unit(migr_rec));
+     printf(" Family : %u\n",
+            __le32_to_cpu(migr_rec->family_num));
+     printf(" Ascending : %u\n",
+            __le32_to_cpu(migr_rec->ascending_migr));
+     printf(" Blocks Per Unit : %u\n",
+            __le32_to_cpu(migr_rec->blocks_per_unit));
+     printf(" Dest. Depth Per Unit : %u\n",
+            __le32_to_cpu(migr_rec->dest_depth_per_unit));
+     printf(" Checkpoint Area pba : %llu\n",
+            migr_chkp_area_pba(migr_rec));
+     printf(" First member lba : %llu\n",
+            migr_dest_1st_member_lba(migr_rec));
+     printf(" Total Number of Units : %llu\n",
+            get_num_migr_units(migr_rec));
+     printf(" Size of volume : %llu\n",
+            join_u32(migr_rec->post_migr_vol_cap,
+                     migr_rec->post_migr_vol_cap_hi));
+     printf(" Record was read from : %u\n",
+            __le32_to_cpu(migr_rec->ckpt_read_disk_num));
+ }
+
+ /*
+  * Multiply the block-based fields of the migration record by IMSM_4K_DIV:
+  * blocks_per_unit, dest_depth_per_unit, post_migr_vol_cap (lo/hi pair),
+  * the checkpoint area pba and the destination first member lba.
+  */
+ void convert_from_4k_imsm_migr_rec(struct intel_super *super)
+ {
+     struct migr_record *rec = super->migr_rec;
+     unsigned long long vol_cap;
+
+     rec->blocks_per_unit *= IMSM_4K_DIV;
+     rec->dest_depth_per_unit *= IMSM_4K_DIV;
+
+     vol_cap = join_u32(rec->post_migr_vol_cap, rec->post_migr_vol_cap_hi);
+     split_ull(vol_cap * IMSM_4K_DIV,
+               &rec->post_migr_vol_cap, &rec->post_migr_vol_cap_hi);
+
+     set_migr_chkp_area_pba(rec, migr_chkp_area_pba(rec) * IMSM_4K_DIV);
+     set_migr_dest_1st_member_lba(rec,
+                                  migr_dest_1st_member_lba(rec) * IMSM_4K_DIV);
+ }
+
+ /*
+  * Multiply the block-based anchor fields by IMSM_4K_DIV: total blocks of
+  * every disk; size and current migration unit of every volume;
+  * blocks_per_member, blocks_per_strip and pba_of_lba0 of MAP_0, and of MAP_1
+  * while migr_state is set; start and length of every bad block log entry.
+  * The checksum is regenerated afterwards.
+  */
+ void convert_from_4k(struct intel_super *super)
+ {
+     struct imsm_super *mpb = super->anchor;
+     const __u32 bbm_log_size = __le32_to_cpu(mpb->bbm_log_size);
+     int idx;
+
+     /* disks */
+     for (idx = 0; idx < mpb->num_disks; idx++) {
+         struct imsm_disk *disk = __get_imsm_disk(mpb, idx);
+
+         set_total_blocks(disk, (total_blocks(disk)*IMSM_4K_DIV));
+     }
+
+     for (idx = 0; idx < mpb->num_raid_devs; idx++) {
+         struct imsm_dev *dev = __get_imsm_dev(mpb, idx);
+         struct imsm_map *map = get_imsm_map(dev, MAP_0);
+
+         /* dev */
+         set_imsm_dev_size(dev, imsm_dev_size(dev)*IMSM_4K_DIV);
+         set_vol_curr_migr_unit(dev,
+                                vol_curr_migr_unit(dev) * IMSM_4K_DIV);
+
+         /* map0 */
+         set_blocks_per_member(map, blocks_per_member(map)*IMSM_4K_DIV);
+         map->blocks_per_strip *= IMSM_4K_DIV;
+         set_pba_of_lba0(map, pba_of_lba0(map)*IMSM_4K_DIV);
+
+         if (!dev->vol.migr_state)
+             continue;
+
+         /* map1 */
+         map = get_imsm_map(dev, MAP_1);
+         set_blocks_per_member(map, blocks_per_member(map)*IMSM_4K_DIV);
+         map->blocks_per_strip *= IMSM_4K_DIV;
+         set_pba_of_lba0(map, pba_of_lba0(map)*IMSM_4K_DIV);
+     }
+
+     if (bbm_log_size) {
+         /* the log occupies the last bbm_log_size bytes of the mpb */
+         struct bbm_log *log = (void *)mpb +
+             __le32_to_cpu(mpb->mpb_size) - bbm_log_size;
+         __u32 ent;
+
+         for (ent = 0; ent < log->entry_count; ent++) {
+             struct bbm_log_entry *entry =
+                 &log->marked_block_entries[ent];
+             __u8 count = entry->marked_count + 1;
+             unsigned long long sector =
+                 __le48_to_cpu(&entry->defective_block_start);
+
+             entry->defective_block_start =
+                 __cpu_to_le48(sector*IMSM_4K_DIV);
+             entry->marked_count = count*IMSM_4K_DIV - 1;
+         }
+     }
+
+     mpb->check_sum = __gen_imsm_checksum(mpb);
+ }
+
+/*******************************************************************************
+ * function: imsm_check_attributes
+ * Description: Function checks if features represented by attributes flags
+ * are supported by mdadm.
+ * Parameters:
+ * attributes - Attributes read from metadata
+ * Returns:
+ * 0 - passed attributes contains unsupported features flags
+ * 1 - all features are supported
+ ******************************************************************************/
+static int imsm_check_attributes(__u32 attributes)
+{
+ int ret_val = 1;
+ __u32 not_supported = MPB_ATTRIB_SUPPORTED^0xffffffff;
+
+ not_supported &= ~MPB_ATTRIB_IGNORED;
+
+ not_supported &= attributes;
+ if (not_supported) {
+ pr_err("(IMSM): Unsupported attributes : %x\n",
+ (unsigned)__le32_to_cpu(not_supported));
+ if (not_supported & MPB_ATTRIB_CHECKSUM_VERIFY) {
+ dprintf("\t\tMPB_ATTRIB_CHECKSUM_VERIFY \n");
+ not_supported ^= MPB_ATTRIB_CHECKSUM_VERIFY;
+ }
+ if (not_supported & MPB_ATTRIB_2TB) {
+ dprintf("\t\tMPB_ATTRIB_2TB\n");
+ not_supported ^= MPB_ATTRIB_2TB;
+ }
+ if (not_supported & MPB_ATTRIB_RAID0) {
+ dprintf("\t\tMPB_ATTRIB_RAID0\n");
+ not_supported ^= MPB_ATTRIB_RAID0;
+ }
+ if (not_supported & MPB_ATTRIB_RAID1) {
+ dprintf("\t\tMPB_ATTRIB_RAID1\n");
+ not_supported ^= MPB_ATTRIB_RAID1;
+ }
+ if (not_supported & MPB_ATTRIB_RAID10) {
+ dprintf("\t\tMPB_ATTRIB_RAID10\n");
+ not_supported ^= MPB_ATTRIB_RAID10;
+ }
+ if (not_supported & MPB_ATTRIB_RAID1E) {
+ dprintf("\t\tMPB_ATTRIB_RAID1E\n");
+ not_supported ^= MPB_ATTRIB_RAID1E;
+ }
+ if (not_supported & MPB_ATTRIB_RAID5) {
+ dprintf("\t\tMPB_ATTRIB_RAID5\n");
+ not_supported ^= MPB_ATTRIB_RAID5;
+ }
+ if (not_supported & MPB_ATTRIB_RAIDCNG) {
+ dprintf("\t\tMPB_ATTRIB_RAIDCNG\n");
+ not_supported ^= MPB_ATTRIB_RAIDCNG;
+ }
+ if (not_supported & MPB_ATTRIB_BBM) {
+ dprintf("\t\tMPB_ATTRIB_BBM\n");
+ not_supported ^= MPB_ATTRIB_BBM;
+ }
+ if (not_supported & MPB_ATTRIB_CHECKSUM_VERIFY) {
+ dprintf("\t\tMPB_ATTRIB_CHECKSUM_VERIFY (== MPB_ATTRIB_LEGACY)\n");
+ not_supported ^= MPB_ATTRIB_CHECKSUM_VERIFY;
+ }
+ if (not_supported & MPB_ATTRIB_EXP_STRIPE_SIZE) {
+ dprintf("\t\tMPB_ATTRIB_EXP_STRIP_SIZE\n");
+ not_supported ^= MPB_ATTRIB_EXP_STRIPE_SIZE;
+ }
+ if (not_supported & MPB_ATTRIB_2TB_DISK) {
+ dprintf("\t\tMPB_ATTRIB_2TB_DISK\n");
+ not_supported ^= MPB_ATTRIB_2TB_DISK;
+ }
+ if (not_supported & MPB_ATTRIB_NEVER_USE2) {
+ dprintf("\t\tMPB_ATTRIB_NEVER_USE2\n");
+ not_supported ^= MPB_ATTRIB_NEVER_USE2;
+ }
+ if (not_supported & MPB_ATTRIB_NEVER_USE) {
+ dprintf("\t\tMPB_ATTRIB_NEVER_USE\n");
+ not_supported ^= MPB_ATTRIB_NEVER_USE;
+ }
+
+ if (not_supported)
+ dprintf("(IMSM): Unknown attributes : %x\n", not_supported);
+
+ ret_val = 0;
+ }
+
+ return ret_val;
+}
+
+static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info, char *map);
+
+ /*
+  * Print the full description of an IMSM container to stdout: anchor fields,
+  * the examined disk (super->disks), the bad block log header when
+  * get_imsm_bbm_log_size() is non-zero, every volume, every other disk listed
+  * in the anchor, every spare (index -1) and finally the migration record.
+  * @homehost is not used here.
+  * NOTE(review): super->current_vol is set to each volume index in turn and
+  * is not restored before returning - confirm callers do not rely on it.
+  */
+ static void examine_super_imsm(struct supertype *st, char *homehost)
+ {
+ struct intel_super *super = st->sb;
+ struct imsm_super *mpb = super->anchor;
+ char str[MAX_SIGNATURE_LENGTH];
+ int i;
+ struct mdinfo info;
+ char nbuf[64];
+ __u32 sum;
+ /* computed once for super->disks and reused for every disk printed below */
+ __u32 reserved = imsm_reserved_sectors(super, super->disks);
+ struct dl *dl;
+ time_t creation_time;
+
+ /* presumably MPB_SIG_LEN <= MAX_SIGNATURE_LENGTH - TODO confirm */
+ strncpy(str, (char *)mpb->sig, MPB_SIG_LEN);
+ str[MPB_SIG_LEN-1] = '\0';
+ printf(" Magic : %s\n", str);
+ printf(" Version : %s\n", get_imsm_version(mpb));
+ printf(" Orig Family : %08x\n", __le32_to_cpu(mpb->orig_family_num));
+ printf(" Family : %08x\n", __le32_to_cpu(mpb->family_num));
+ printf(" Generation : %08x\n", __le32_to_cpu(mpb->generation_num));
+ creation_time = __le64_to_cpu(mpb->creation_time);
+ /* %.24s drops the newline ctime() appends; 0 is shown as "Unknown" */
+ printf(" Creation Time : %.24s\n",
+ creation_time ? ctime(&creation_time) : "Unknown");
+ printf(" Attributes : ");
+ if (imsm_check_attributes(mpb->attributes))
+ printf("All supported\n");
+ else
+ printf("not supported\n");
+ getinfo_super_imsm(st, &info, NULL);
+ fname_from_uuid(st, &info, nbuf, ':');
+ /* the first five characters written to nbuf are skipped when printing */
+ printf(" UUID : %s\n", nbuf + 5);
+ sum = __le32_to_cpu(mpb->check_sum);
+ printf(" Checksum : %08x %s\n", sum,
+ __gen_imsm_checksum(mpb) == sum ? "correct" : "incorrect");
+ printf(" MPB Sectors : %d\n", mpb_sectors(mpb, super->sector_size));
+ printf(" Disks : %d\n", mpb->num_disks);
+ printf(" RAID Devices : %d\n", mpb->num_raid_devs);
+ print_imsm_disk(__get_imsm_disk(mpb, super->disks->index),
+ super->disks->index, reserved, super->sector_size);
+ if (get_imsm_bbm_log_size(super->bbm_log)) {
+ struct bbm_log *log = super->bbm_log;
+
+ printf("\n");
+ printf("Bad Block Management Log:\n");
+ printf(" Log Size : %d\n", __le32_to_cpu(mpb->bbm_log_size));
+ printf(" Signature : %x\n", __le32_to_cpu(log->signature));
+ printf(" Entry Count : %d\n", __le32_to_cpu(log->entry_count));
+ }
+ for (i = 0; i < mpb->num_raid_devs; i++) {
+ /* shadows the function-level info */
+ struct mdinfo info;
+ struct imsm_dev *dev = __get_imsm_dev(mpb, i);
+
+ /* getinfo_super_imsm() is called again after selecting volume i */
+ super->current_vol = i;
+ getinfo_super_imsm(st, &info, NULL);
+ fname_from_uuid(st, &info, nbuf, ':');
+ print_imsm_dev(super, dev, nbuf + 5, super->disks->index);
+ }
+ /* remaining disks of the anchor; the examined one was printed above */
+ for (i = 0; i < mpb->num_disks; i++) {
+ if (i == super->disks->index)
+ continue;
+ print_imsm_disk(__get_imsm_disk(mpb, i), i, reserved,
+ super->sector_size);
+ }
+
+ /* disks with index -1 are printed from the dl list */
+ for (dl = super->disks; dl; dl = dl->next)
+ if (dl->index == -1)
+ print_imsm_disk(&dl->disk, -1, reserved,
+ super->sector_size);
+
+ examine_migr_rec_imsm(super);
+ }
+
+ /*
+  * Print a generic one-line IMSM ARRAY entry for the container to stdout.
+  * @verbose is not used here.
+  */
+ static void brief_examine_super_imsm(struct supertype *st, int verbose)
+ {
+     struct mdinfo info;
+     char nbuf[64];
+     /* printing starts five characters into what fname_from_uuid() wrote */
+     const char *uuid = nbuf + 5;
+
+     getinfo_super_imsm(st, &info, NULL);
+     fname_from_uuid(st, &info, nbuf, ':');
+     printf("ARRAY metadata=imsm UUID=%s\n", uuid);
+ }
+
+ /*
+  * Print one ARRAY line per volume to stdout, giving the volume name, the
+  * container UUID, the member number and the volume UUID. Prints nothing when
+  * the container has no volumes. super->current_vol is set to each volume
+  * index in turn. @verbose is not used here.
+  */
+ static void brief_examine_subarrays_imsm(struct supertype *st, int verbose)
+ {
+     struct intel_super *super = st->sb;
+     struct mdinfo info;
+     char container_uuid[64];
+     char volume_uuid[64];
+     int vol;
+
+     if (super->anchor->num_raid_devs == 0)
+         return;
+
+     getinfo_super_imsm(st, &info, NULL);
+     fname_from_uuid(st, &info, container_uuid, ':');
+
+     for (vol = 0; vol < super->anchor->num_raid_devs; vol++) {
+         struct imsm_dev *dev = get_imsm_dev(super, vol);
+
+         super->current_vol = vol;
+         getinfo_super_imsm(st, &info, NULL);
+         fname_from_uuid(st, &info, volume_uuid, ':');
+         printf("ARRAY /dev/md/%.16s container=%s member=%d UUID=%s\n",
+                dev->volume, container_uuid + 5, vol, volume_uuid + 5);
+     }
+ }
+
+ /*
+  * Print the container description to stdout as KEY=value lines:
+  * MD_METADATA, MD_LEVEL, MD_UUID, MD_DEVICES and MD_CREATION_TIME.
+  */
+ static void export_examine_super_imsm(struct supertype *st)
+ {
+     struct intel_super *super = st->sb;
+     struct imsm_super *mpb = super->anchor;
+     struct mdinfo info;
+     char nbuf[64];
+     const char *uuid = nbuf + 5;
+
+     getinfo_super_imsm(st, &info, NULL);
+     fname_from_uuid(st, &info, nbuf, ':');
+
+     printf("MD_METADATA=imsm\n");
+     printf("MD_LEVEL=container\n");
+     printf("MD_UUID=%s\n", uuid);
+     printf("MD_DEVICES=%u\n", mpb->num_disks);
+     printf("MD_CREATION_TIME=%llu\n", __le64_to_cpu(mpb->creation_time));
+ }
+
+ /*
+  * Print the UUID line of the container, or of the volume named by @subarray
+  * (a decimal volume index) when it is given. super->current_vol is switched
+  * for the lookup and restored before returning. @homehost is not used here.
+  */
+ static void detail_super_imsm(struct supertype *st, char *homehost,
+                               char *subarray)
+ {
+     struct intel_super *super = st->sb;
+     const int saved_vol = super->current_vol;
+     struct mdinfo info;
+     char nbuf[64];
+
+     if (subarray != NULL)
+         super->current_vol = strtoul(subarray, NULL, 10);
+
+     getinfo_super_imsm(st, &info, NULL);
+     fname_from_uuid(st, &info, nbuf, ':');
+     printf("\n UUID : %s\n", nbuf + 5);
+
+     super->current_vol = saved_vol;
+ }
+
+ /*
+  * Print " UUID=<uuid>" (no newline) for the container, or for the volume
+  * named by @subarray (a decimal volume index) when it is given.
+  * super->current_vol is switched for the lookup and restored afterwards.
+  */
+ static void brief_detail_super_imsm(struct supertype *st, char *subarray)
+ {
+     struct intel_super *super = st->sb;
+     const int saved_vol = super->current_vol;
+     struct mdinfo info;
+     char nbuf[64];
+
+     if (subarray != NULL)
+         super->current_vol = strtoul(subarray, NULL, 10);
+
+     getinfo_super_imsm(st, &info, NULL);
+     fname_from_uuid(st, &info, nbuf, ':');
+     printf(" UUID=%s", nbuf + 5);
+
+     super->current_vol = saved_vol;
+ }
+
+static int imsm_read_serial(int fd, char *devname, __u8 *serial,
+ size_t serial_buf_len);
+static void fd2devname(int fd, char *name);
+
+ /* Copy @src into @dst (capacity @dst_len, always NUL-terminated) and strip
+  * trailing whitespace. The trim is bounded by the start of @dst, so an
+  * empty or all-blank string cannot make it walk out of the buffer.
+  */
+ static void ahci_copy_trimmed(char *dst, size_t dst_len, const char *src)
+ {
+ 	size_t len;
+
+ 	snprintf(dst, dst_len, "%s", src);
+ 	len = strlen(dst);
+ 	while (len > 0 && isspace((unsigned char)dst[len - 1]))
+ 		dst[--len] = '\0';
+ }
+
+ /* Print one line per port of the AHCI controller at @hba_path.
+  *
+  * @hba_path:   sysfs path of the controller
+  * @port_count: number of ports to report on (0 .. bits in unsigned long)
+  * @host_base:  lowest ataN/hostN number of the controller; it is subtracted
+  *              from the number found in the device path to get the port
+  * @verbose:    print diagnostics with pr_err() when > 0
+  *
+  * Returns 0 on success, 1 when /sys/dev/block cannot be opened and 2 on
+  * any other failure.
+  */
+ static int ahci_enumerate_ports(const char *hba_path, int port_count, int host_base, int verbose)
+ {
+ 	/* dump an unsorted list of devices attached to AHCI Intel storage
+ 	 * controller, as well as non-connected ports
+ 	 */
+ 	const int mask_bits = (int)sizeof(unsigned long) * 8;
+ 	int hba_len = strlen(hba_path) + 1;
+ 	struct dirent *ent;
+ 	DIR *dir;
+ 	char *path = NULL;
+ 	int err = 0;
+ 	unsigned long port_mask;
+
+ 	/* validate before shifting: the shift is undefined for a negative
+ 	 * count or one that is not smaller than the width of the operand
+ 	 */
+ 	if (port_count < 0 || port_count > mask_bits) {
+ 		if (verbose > 0)
+ 			pr_err("port_count %d out of range\n", port_count);
+ 		return 2;
+ 	}
+ 	if (port_count == mask_bits)
+ 		port_mask = ~0UL;
+ 	else
+ 		port_mask = (1UL << port_count) - 1;
+
+ 	/* scroll through /sys/dev/block looking for devices attached to
+ 	 * this hba
+ 	 */
+ 	dir = opendir("/sys/dev/block");
+ 	if (!dir)
+ 		return 1;
+
+ 	for (ent = readdir(dir); ent; ent = readdir(dir)) {
+ 		int fd;
+ 		char model[64];
+ 		char vendor[64];
+ 		char buf[1024];
+ 		int major, minor;
+ 		char device[PATH_MAX];
+ 		char *c;
+ 		int port;
+ 		int type;
+
+ 		if (sscanf(ent->d_name, "%d:%d", &major, &minor) != 2)
+ 			continue;
+ 		path = devt_to_devpath(makedev(major, minor), 1, NULL);
+ 		if (!path)
+ 			continue;
+ 		if (!path_attached_to_hba(path, hba_path)) {
+ 			free(path);
+ 			path = NULL;
+ 			continue;
+ 		}
+
+ 		/* retrieve the scsi device */
+ 		if (!devt_to_devpath(makedev(major, minor), 1, device)) {
+ 			if (verbose > 0)
+ 				pr_err("failed to get device\n");
+ 			err = 2;
+ 			break;
+ 		}
+ 		if (devpath_to_char(device, "type", buf, sizeof(buf), 0)) {
+ 			err = 2;
+ 			break;
+ 		}
+ 		type = strtoul(buf, NULL, 10);
+
+ 		/* if it's not a disk print the vendor and model */
+ 		if (!(type == 0 || type == 7 || type == 14)) {
+ 			vendor[0] = '\0';
+ 			model[0] = '\0';
+
+ 			if (devpath_to_char(device, "vendor", buf,
+ 					    sizeof(buf), 0) == 0)
+ 				ahci_copy_trimmed(vendor, sizeof(vendor), buf);
+
+ 			if (devpath_to_char(device, "model", buf,
+ 					    sizeof(buf), 0) == 0)
+ 				ahci_copy_trimmed(model, sizeof(model), buf);
+
+ 			if (vendor[0] && model[0])
+ 				sprintf(buf, "%.64s %.64s", vendor, model);
+ 			else
+ 				switch (type) { /* numbers from hald/linux/device.c */
+ 				case 1: sprintf(buf, "tape"); break;
+ 				case 2: sprintf(buf, "printer"); break;
+ 				case 3: sprintf(buf, "processor"); break;
+ 				case 4:
+ 				case 5: sprintf(buf, "cdrom"); break;
+ 				case 6: sprintf(buf, "scanner"); break;
+ 				case 8: sprintf(buf, "media_changer"); break;
+ 				case 9: sprintf(buf, "comm"); break;
+ 				case 12: sprintf(buf, "raid"); break;
+ 				default: sprintf(buf, "unknown");
+ 				}
+ 		} else
+ 			buf[0] = '\0';
+
+ 		/* chop device path to 'host%d' and calculate the port number */
+ 		c = strchr(&path[hba_len], '/');
+ 		if (!c) {
+ 			if (verbose > 0)
+ 				pr_err("%s - invalid path name\n", path + hba_len);
+ 			err = 2;
+ 			break;
+ 		}
+ 		*c = '\0';
+ 		if ((sscanf(&path[hba_len], "ata%d", &port) == 1) ||
+ 		    ((sscanf(&path[hba_len], "host%d", &port) == 1)))
+ 			port -= host_base;
+ 		else {
+ 			if (verbose > 0) {
+ 				*c = '/'; /* repair the full string */
+ 				pr_err("failed to determine port number for %s\n",
+ 				       path);
+ 			}
+ 			err = 2;
+ 			break;
+ 		}
+
+ 		/* mark this port as used; a port outside the mask has no bit
+ 		 * to clear and shifting by it would be undefined
+ 		 */
+ 		if (port >= 0 && port < port_count)
+ 			port_mask &= ~(1UL << port);
+
+ 		/* print out the device information */
+ 		if (buf[0]) {
+ 			printf(" Port%d : - non-disk device (%s) -\n", port, buf);
+ 			/* release the path here too, the next iteration
+ 			 * overwrites the pointer
+ 			 */
+ 			free(path);
+ 			path = NULL;
+ 			continue;
+ 		}
+
+ 		fd = dev_open(ent->d_name, O_RDONLY);
+ 		if (!is_fd_valid(fd))
+ 			printf(" Port%d : - disk info unavailable -\n", port);
+ 		else {
+ 			fd2devname(fd, buf);
+ 			printf(" Port%d : %s", port, buf);
+ 			if (imsm_read_serial(fd, NULL, (__u8 *)buf,
+ 					     sizeof(buf)) == 0)
+ 				printf(" (%s)\n", buf);
+ 			else
+ 				printf(" ()\n");
+ 			close(fd);
+ 		}
+ 		free(path);
+ 		path = NULL;
+ 	}
+ 	/* a 'break' above leaves the current path allocated */
+ 	free(path);
+ 	closedir(dir);
+ 	if (err == 0) {
+ 		int i;
+
+ 		for (i = 0; i < port_count; i++)
+ 			if (port_mask & (1UL << i))
+ 				printf(" Port%d : - no device attached -\n", i);
+ 	}
+
+ 	return err;
+ }
+
+ /* List the NVMe block devices found in /sys/block/ whose controller path
+  * is attached to @hba and whose namespace passes
+  * imsm_is_nvme_namespace_supported().
+  *
+  * Returns 1 when /sys/block/ cannot be opened, 0 otherwise.
+  */
+ static int print_nvme_info(struct sys_dev *hba)
+ {
+ 	DIR *blockdir = opendir("/sys/block/");
+ 	struct dirent *de;
+
+ 	if (blockdir == NULL)
+ 		return 1;
+
+ 	while ((de = readdir(blockdir)) != NULL) {
+ 		char ns_path[PATH_MAX];
+ 		char cntrl_path[PATH_MAX];
+ 		char buf[PATH_MAX];
+ 		int fd = -1;
+
+ 		if (strstr(de->d_name, "nvme") != NULL) {
+ 			fd = open_dev(de->d_name);
+
+ 			/* the checks short-circuit in this order */
+ 			if (is_fd_valid(fd) &&
+ 			    diskfd_to_devpath(fd, 0, ns_path) &&
+ 			    diskfd_to_devpath(fd, 1, cntrl_path) &&
+ 			    path_attached_to_hba(cntrl_path, hba->path) &&
+ 			    imsm_is_nvme_namespace_supported(fd, 0)) {
+ 				fd2devname(fd, buf);
+ 				if (hba->type == SYS_DEV_VMD)
+ 					printf(" NVMe under VMD : %s", buf);
+ 				else if (hba->type == SYS_DEV_NVME)
+ 					printf(" NVMe Device : %s", buf);
+
+ 				/* buf is reused for the serial number */
+ 				if (imsm_read_serial(fd, NULL, (__u8 *)buf,
+ 						     sizeof(buf)) == 0)
+ 					printf(" (%s)\n", buf);
+ 				else
+ 					printf("()\n");
+ 			}
+ 		}
+
+ 		/* called for every entry, including the ones never opened */
+ 		close_fd(&fd);
+ 	}
+
+ 	closedir(blockdir);
+ 	return 0;
+ }
+
+ /* Report every controller in the list starting at @elem on stderr, one
+  * "found Intel(R) ..." line each, then flush stderr.
+  */
+ static void print_found_intel_controllers(struct sys_dev *elem)
+ {
+ 	struct sys_dev *hba;
+
+ 	for (hba = elem; hba != NULL; hba = hba->next) {
+ 		pr_err("found Intel(R) ");
+
+ 		/* optional bus name */
+ 		switch (hba->type) {
+ 		case SYS_DEV_SATA:
+ 			fprintf(stderr, "SATA ");
+ 			break;
+ 		case SYS_DEV_SAS:
+ 			fprintf(stderr, "SAS ");
+ 			break;
+ 		case SYS_DEV_NVME:
+ 			fprintf(stderr, "NVMe ");
+ 			break;
+ 		default:
+ 			break;
+ 		}
+
+ 		fprintf(stderr, hba->type == SYS_DEV_VMD ?
+ 				"VMD domain" : "RAID controller");
+
+ 		if (hba->pci_id)
+ 			fprintf(stderr, " at %s", hba->pci_id);
+ 		fprintf(stderr, ".\n");
+ 	}
+ 	fflush(stderr);
+ }
+
+ /* Scan @hba_path for "ataN" / "hostN" entries.
+  *
+  * @hba_path:   directory to scan
+  * @port_count: out: highest N - lowest N + 1, or 0 when nothing matched
+  *
+  * Returns the lowest N that was found, or -1 when the directory cannot be
+  * opened or holds no matching entry.
+  */
+ static int ahci_get_port_count(const char *hba_path, int *port_count)
+ {
+ 	struct dirent *ent;
+ 	DIR *dir;
+ 	int lowest = -1;
+ 	int highest = -1;
+ 	int found = 0;
+
+ 	*port_count = 0;
+ 	if ((dir = opendir(hba_path)) == NULL)
+ 		return -1;
+
+ 	for (ent = readdir(dir); ent; ent = readdir(dir)) {
+ 		int host;
+
+ 		if ((sscanf(ent->d_name, "ata%d", &host) != 1) &&
+ 		    ((sscanf(ent->d_name, "host%d", &host) != 1)))
+ 			continue;
+
+ 		/* readdir() returns entries in no particular order, so keep
+ 		 * both extremes and derive the count once at the end;
+ 		 * updating the count incrementally loses ports whenever a
+ 		 * lower number shows up after a higher one
+ 		 */
+ 		if (!found) {
+ 			lowest = host;
+ 			highest = host;
+ 			found = 1;
+ 			continue;
+ 		}
+ 		if (host < lowest)
+ 			lowest = host;
+ 		if (host > highest)
+ 			highest = host;
+ 	}
+ 	closedir(dir);
+
+ 	if (found)
+ 		*port_count = highest - lowest + 1;
+ 	return lowest;
+ }
+
+ /* Print the human-readable capability summary of @orom to stdout:
+  * platform name, optional version, RAID levels, chunk sizes, 2TB
+  * volume/disk support, maximum disk count and maximum volume counts.
+  */
+ static void print_imsm_capability(const struct imsm_orom *orom)
+ {
+ printf(" Platform : Intel(R) ");
+ /* platform name is picked from the capability/driver feature words, the
+  * enterprise flag and the major version
+  */
+ if (orom->capabilities == 0 && orom->driver_features == 0)
+ printf("Matrix Storage Manager\n");
+ else if (imsm_orom_is_enterprise(orom) && orom->major_ver >= 6)
+ printf("Virtual RAID on CPU\n");
+ else
+ printf("Rapid Storage Technology%s\n",
+ imsm_orom_is_enterprise(orom) ? " enterprise" : "");
+ /* the version line is omitted when all four fields are zero */
+ if (orom->major_ver || orom->minor_ver || orom->hotfix_ver || orom->build)
+ printf(" Version : %d.%d.%d.%d\n", orom->major_ver,
+ orom->minor_ver, orom->hotfix_ver, orom->build);
+ printf(" RAID Levels :%s%s%s%s%s\n",
+ imsm_orom_has_raid0(orom) ? " raid0" : "",
+ imsm_orom_has_raid1(orom) ? " raid1" : "",
+ imsm_orom_has_raid1e(orom) ? " raid1e" : "",
+ imsm_orom_has_raid10(orom) ? " raid10" : "",
+ imsm_orom_has_raid5(orom) ? " raid5" : "");
+ printf(" Chunk Sizes :%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n",
+ imsm_orom_has_chunk(orom, 2) ? " 2k" : "",
+ imsm_orom_has_chunk(orom, 4) ? " 4k" : "",
+ imsm_orom_has_chunk(orom, 8) ? " 8k" : "",
+ imsm_orom_has_chunk(orom, 16) ? " 16k" : "",
+ imsm_orom_has_chunk(orom, 32) ? " 32k" : "",
+ imsm_orom_has_chunk(orom, 64) ? " 64k" : "",
+ imsm_orom_has_chunk(orom, 128) ? " 128k" : "",
+ imsm_orom_has_chunk(orom, 256) ? " 256k" : "",
+ imsm_orom_has_chunk(orom, 512) ? " 512k" : "",
+ imsm_orom_has_chunk(orom, 1024*1) ? " 1M" : "",
+ imsm_orom_has_chunk(orom, 1024*2) ? " 2M" : "",
+ imsm_orom_has_chunk(orom, 1024*4) ? " 4M" : "",
+ imsm_orom_has_chunk(orom, 1024*8) ? " 8M" : "",
+ imsm_orom_has_chunk(orom, 1024*16) ? " 16M" : "",
+ imsm_orom_has_chunk(orom, 1024*32) ? " 32M" : "",
+ imsm_orom_has_chunk(orom, 1024*64) ? " 64M" : "");
+ printf(" 2TB volumes :%s supported\n",
+ (orom->attr & IMSM_OROM_ATTR_2TB)?"":" not");
+ printf(" 2TB disks :%s supported\n",
+ (orom->attr & IMSM_OROM_ATTR_2TB_DISK)?"":" not");
+ printf(" Max Disks : %d\n", orom->tds);
+ /* the per-HBA limit is labelled "platform" for NVMe, "controller" otherwise */
+ printf(" Max Volumes : %d per array, %d per %s\n",
+ orom->vpa, orom->vphba,
+ imsm_orom_is_nvme(orom) ? "platform" : "controller");
+ return;
+ }
+
+ /* Print the capabilities of @orom to stdout as KEY=value lines
+  * (MD_FIRMWARE_TYPE, IMSM_VERSION, IMSM_SUPPORTED_RAID_LEVELS, ...).
+  * IMSM_VERSION is omitted when all four version fields are zero.
+  */
+ static void print_imsm_capability_export(const struct imsm_orom *orom)
+ {
+ 	printf("MD_FIRMWARE_TYPE=imsm\n");
+ 	if (orom->major_ver || orom->minor_ver || orom->hotfix_ver || orom->build)
+ 		printf("IMSM_VERSION=%d.%d.%d.%d\n", orom->major_ver, orom->minor_ver,
+ 		       orom->hotfix_ver, orom->build);
+ 	/* every label is paired with the test for the same level, in the
+ 	 * order print_imsm_capability() uses
+ 	 */
+ 	printf("IMSM_SUPPORTED_RAID_LEVELS=%s%s%s%s%s\n",
+ 	       imsm_orom_has_raid0(orom) ? "raid0 " : "",
+ 	       imsm_orom_has_raid1(orom) ? "raid1 " : "",
+ 	       imsm_orom_has_raid1e(orom) ? "raid1e " : "",
+ 	       imsm_orom_has_raid10(orom) ? "raid10 " : "",
+ 	       imsm_orom_has_raid5(orom) ? "raid5 " : "");
+ 	printf("IMSM_SUPPORTED_CHUNK_SIZES=%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n",
+ 	       imsm_orom_has_chunk(orom, 2) ? "2k " : "",
+ 	       imsm_orom_has_chunk(orom, 4) ? "4k " : "",
+ 	       imsm_orom_has_chunk(orom, 8) ? "8k " : "",
+ 	       imsm_orom_has_chunk(orom, 16) ? "16k " : "",
+ 	       imsm_orom_has_chunk(orom, 32) ? "32k " : "",
+ 	       imsm_orom_has_chunk(orom, 64) ? "64k " : "",
+ 	       imsm_orom_has_chunk(orom, 128) ? "128k " : "",
+ 	       imsm_orom_has_chunk(orom, 256) ? "256k " : "",
+ 	       imsm_orom_has_chunk(orom, 512) ? "512k " : "",
+ 	       imsm_orom_has_chunk(orom, 1024*1) ? "1M " : "",
+ 	       imsm_orom_has_chunk(orom, 1024*2) ? "2M " : "",
+ 	       imsm_orom_has_chunk(orom, 1024*4) ? "4M " : "",
+ 	       imsm_orom_has_chunk(orom, 1024*8) ? "8M " : "",
+ 	       imsm_orom_has_chunk(orom, 1024*16) ? "16M " : "",
+ 	       imsm_orom_has_chunk(orom, 1024*32) ? "32M " : "",
+ 	       imsm_orom_has_chunk(orom, 1024*64) ? "64M " : "");
+ 	printf("IMSM_2TB_VOLUMES=%s\n",(orom->attr & IMSM_OROM_ATTR_2TB) ? "yes" : "no");
+ 	printf("IMSM_2TB_DISKS=%s\n",(orom->attr & IMSM_OROM_ATTR_2TB_DISK) ? "yes" : "no");
+ 	printf("IMSM_MAX_DISKS=%d\n",orom->tds);
+ 	printf("IMSM_MAX_VOLUMES_PER_ARRAY=%d\n",orom->vpa);
+ 	printf("IMSM_MAX_VOLUMES_PER_CONTROLLER=%d\n",orom->vphba);
+ }
+
+ /* Report the IMSM platform capabilities and attached devices.
+  *
+  * @verbose:         print diagnostics with pr_err() when > 0
+  * @enumerate_only:  when non-zero only probe for a capable controller and
+  *                   print nothing
+  * @controller_path: when non-NULL restrict the capability probe to
+  *                   controllers whose path matches
+  *
+  * Returns 0 when at least one probed controller has IMSM capabilities,
+  * 1 when none has, 2 when no Intel device is found; in the full report
+  * 2 is OR-ed in when port/device enumeration fails. In enumerate-only
+  * mode 0 is also returned when check_env("IMSM_NO_PLATFORM") is true.
+  */
+ static int detail_platform_imsm(int verbose, int enumerate_only, char *controller_path)
+ {
+ /* There are two components to imsm platform support, the ahci SATA
+ * controller and the option-rom. To find the SATA controller we
+ * simply look in /sys/bus/pci/drivers/ahci to see if an ahci
+ * controller with the Intel vendor id is present. This approach
+ * allows mdadm to leverage the kernel's ahci detection logic, with the
+ * caveat that if ahci.ko is not loaded mdadm will not be able to
+ * detect platform raid capabilities. The option-rom resides in a
+ * platform "Adapter ROM". We scan for its signature to retrieve the
+ * platform capabilities. If raid support is disabled in the BIOS the
+ * option-rom capability structure will not be available.
+ */
+ struct sys_dev *list, *hba;
+ int host_base = 0;
+ int port_count = 0;
+ int result=1;
+
+ /* probe only: stop at the first controller that has capabilities */
+ if (enumerate_only) {
+ if (check_env("IMSM_NO_PLATFORM"))
+ return 0;
+ list = find_intel_devices();
+ if (!list)
+ return 2;
+ for (hba = list; hba; hba = hba->next) {
+ if (find_imsm_capability(hba)) {
+ result = 0;
+ break;
+ }
+ else
+ result = 2;
+ }
+ return result;
+ }
+
+ list = find_intel_devices();
+ if (!list) {
+ if (verbose > 0)
+ pr_err("no active Intel(R) RAID controller found.\n");
+ return 2;
+ } else if (verbose > 0)
+ print_found_intel_controllers(list);
+
+ /* probe every (matching) controller; one success clears result */
+ for (hba = list; hba; hba = hba->next) {
+ if (controller_path && (compare_paths(hba->path, controller_path) != 0))
+ continue;
+ if (!find_imsm_capability(hba)) {
+ char buf[PATH_MAX];
+ pr_err("imsm capabilities not found for controller: %s (type %s)\n",
+ hba->type == SYS_DEV_VMD ? vmd_domain_to_controller(hba, buf) : hba->path,
+ get_sys_dev_type(hba->type));
+ continue;
+ }
+ result = 0;
+ }
+
+ /* result is still 1 here only if no probed controller was capable */
+ if (controller_path && result == 1) {
+ pr_err("no active Intel(R) RAID controller found under %s\n",
+ controller_path);
+ return result;
+ }
+
+ const struct orom_entry *entry;
+
+ /* print one capability section per entry of the orom_entries list */
+ for (entry = orom_entries; entry; entry = entry->next) {
+ if (entry->type == SYS_DEV_VMD) {
+ print_imsm_capability(&entry->orom);
+ printf(" 3rd party NVMe :%s supported\n",
+ imsm_orom_has_tpv_support(&entry->orom)?"":" not");
+ /* list the NVMe devices of every VMD controller found */
+ for (hba = list; hba; hba = hba->next) {
+ if (hba->type == SYS_DEV_VMD) {
+ char buf[PATH_MAX];
+ printf(" I/O Controller : %s (%s)\n",
+ vmd_domain_to_controller(hba, buf), get_sys_dev_type(hba->type));
+ if (print_nvme_info(hba)) {
+ if (verbose > 0)
+ pr_err("failed to get devices attached to VMD domain.\n");
+ result |= 2;
+ }
+ }
+ }
+ printf("\n");
+ continue;
+ }
+
+ print_imsm_capability(&entry->orom);
+ if (entry->type == SYS_DEV_NVME) {
+ /* NOTE(review): unlike the VMD branch, the return value of
+ * print_nvme_info() is ignored here
+ */
+ for (hba = list; hba; hba = hba->next) {
+ if (hba->type == SYS_DEV_NVME)
+ print_nvme_info(hba);
+ }
+ printf("\n");
+ continue;
+ }
+
+ /* remaining entry types: walk the device ids recorded in the entry */
+ struct devid_list *devid;
+ for (devid = entry->devid_list; devid; devid = devid->next) {
+ hba = device_by_id(devid->devid);
+ if (!hba)
+ continue;
+
+ printf(" I/O Controller : %s (%s)\n",
+ hba->path, get_sys_dev_type(hba->type));
+ if (hba->type == SYS_DEV_SATA) {
+ host_base = ahci_get_port_count(hba->path, &port_count);
+ if (ahci_enumerate_ports(hba->path, port_count, host_base, verbose)) {
+ if (verbose > 0)
+ pr_err("failed to enumerate ports on SATA controller at %s.\n", hba->pci_id);
+ result |= 2;
+ }
+ }
+ }
+ printf("\n");
+ }
+
+ return result;
+ }
+
+ /* Print the platform capabilities in KEY=value form.
+  *
+  * @verbose:         print diagnostics with pr_err() when > 0
+  * @controller_path: when non-NULL restrict the capability probe to
+  *                   controllers whose path matches
+  *
+  * Returns 0 when at least one probed controller has IMSM capabilities,
+  * 1 when none has and 2 when no Intel device is found. The return value
+  * does not depend on @verbose.
+  */
+ static int export_detail_platform_imsm(int verbose, char *controller_path)
+ {
+ 	struct sys_dev *list, *hba;
+ 	const struct orom_entry *entry;
+ 	int result = 1;
+
+ 	list = find_intel_devices();
+ 	if (!list) {
+ 		if (verbose > 0)
+ 			pr_err("IMSM_DETAIL_PLATFORM_ERROR=NO_INTEL_DEVICES\n");
+ 		return 2;
+ 	}
+
+ 	for (hba = list; hba; hba = hba->next) {
+ 		if (controller_path && (compare_paths(hba->path, controller_path) != 0))
+ 			continue;
+
+ 		/* only a successful probe may clear the result; verbosity
+ 		 * merely decides whether the failure is reported
+ 		 */
+ 		if (find_imsm_capability(hba)) {
+ 			result = 0;
+ 			continue;
+ 		}
+ 		if (verbose > 0) {
+ 			char buf[PATH_MAX];
+ 			pr_err("IMSM_DETAIL_PLATFORM_ERROR=NO_IMSM_CAPABLE_DEVICE_UNDER_%s\n",
+ 			       hba->type == SYS_DEV_VMD ? vmd_domain_to_controller(hba, buf) : hba->path);
+ 		}
+ 	}
+
+ 	for (entry = orom_entries; entry; entry = entry->next) {
+ 		if (entry->type == SYS_DEV_VMD) {
+ 			/* NOTE(review): this prints the same VMD entry once
+ 			 * for every device in the list, whatever its type -
+ 			 * confirm that this is intended
+ 			 */
+ 			for (hba = list; hba; hba = hba->next)
+ 				print_imsm_capability_export(&entry->orom);
+ 			continue;
+ 		}
+ 		print_imsm_capability_export(&entry->orom);
+ 	}
+
+ 	return result;
+ }
+
+ /* Always returns -1: neither "matches" nor "does not match". */
+ static int match_home_imsm(struct supertype *st, char *homehost)
+ {
+ 	/* The imsm metadata format carries no host identification, so it
+ 	 * can never be confirmed or denied that an array is "meant" for
+ 	 * this host.
+ 	 * Member disks that do not belong are excluded through
+ 	 * compare_super and the 'family_num' fields, and mdadm.conf is
+ 	 * relied upon to name the arrays that should be assembled.
+ 	 * Auto-assembly may still pick up "foreign" arrays.
+ 	 */
+
+ 	return -1;
+ }
+
+ /* Fill @uuid with the first 16 bytes of a SHA1 digest that identifies the
+  * container, or the current volume when super->current_vol selects one.
+  */
+ static void uuid_from_super_imsm(struct supertype *st, int uuid[4])
+ {
+ 	/* The uuid returned here is used for:
+ 	 *  - the bitmap file (Create, Grow)
+ 	 *  - the backup header when saving a critical section (Grow)
+ 	 *  - comparing uuids when re-adding a device into an array
+ 	 *    (these want the uuid of the data-array, not the device-set)
+ 	 *  - recognising the same set when adding a missing device back
+ 	 *    to an array (this is a uuid for the device-set)
+ 	 *
+ 	 * A truncated or hashed uuid is good enough for all of them as
+ 	 * long as everyone agrees on it.
+ 	 *
+ 	 * imsm does not track uuids, so one is synthesised with sha1 over
+ 	 *  - the signature (constant for all imsm arrays, but no matter)
+ 	 *  - the orig_family_num of the container
+ 	 *  - the index number of the volume
+ 	 *  - the 'serial' number of the volume
+ 	 * Hopefully these are all constant.
+ 	 */
+ 	struct intel_super *super = st->sb;
+ 	struct imsm_super *mpb = super->anchor;
+ 	struct imsm_dev *vol = NULL;
+ 	struct sha1_ctx sha;
+ 	char digest[20];
+ 	__u32 family;
+
+ 	/* some mdadm versions failed to set ->orig_family_num, in which
+ 	 * case fall back to ->family_num. orig_family_num will be
+ 	 * fixed up with the first metadata update.
+ 	 */
+ 	family = mpb->orig_family_num;
+ 	if (!family)
+ 		family = mpb->family_num;
+
+ 	sha1_init_ctx(&sha);
+ 	sha1_process_bytes(mpb->sig, MPB_SIG_LEN, &sha);
+ 	sha1_process_bytes(&family, sizeof(__u32), &sha);
+
+ 	/* the volume part is hashed only when a volume is selected */
+ 	if (super->current_vol >= 0)
+ 		vol = get_imsm_dev(super, super->current_vol);
+ 	if (vol != NULL) {
+ 		__u32 index = super->current_vol;
+
+ 		sha1_process_bytes(&index, sizeof(index), &sha);
+ 		sha1_process_bytes(vol->volume, MAX_RAID_SERIAL_LEN, &sha);
+ 	}
+
+ 	sha1_finish_ctx(&sha, digest);
+ 	memcpy(uuid, digest, 4*4);
+ }
+
+ #if 0
+ /* Compiled out by the surrounding "#if 0".
+  * Splits the dot-separated version text returned by get_imsm_version()
+  * into up to two characters per field and stores the second and third
+  * fields in *m and *p; the first field is collected but never returned.
+  * NOTE(review): the loop reads *v before checking v < end, and 'i' is not
+  * limited to the three entries of ver_parse - fix both before enabling.
+  */
+ static void
+ get_imsm_numerical_version(struct imsm_super *mpb, int *m, int *p)
+ {
+ __u8 *v = get_imsm_version(mpb);
+ __u8 *end = mpb->sig + MAX_SIGNATURE_LENGTH;
+ char major[] = { 0, 0, 0 };
+ char minor[] = { 0 ,0, 0 };
+ char patch[] = { 0, 0, 0 };
+ char *ver_parse[] = { major, minor, patch };
+ int i, j;
+
+ i = j = 0;
+ while (*v != '\0' && v < end) {
+ if (*v != '.' && j < 2)
+ ver_parse[i][j++] = *v;
+ else {
+ i++;
+ j = 0;
+ }
+ v++;
+ }
+
+ *m = strtol(minor, NULL, 0);
+ *p = strtol(patch, NULL, 0);
+ }
+ #endif
+
+ /* migr_strip_size when repairing or initializing parity: the strip size
+  * of the first map for levels 5 and 10, 128KiB worth of 512-byte blocks
+  * for every other level.
+  */
+ static __u32 migr_strip_blocks_resync(struct imsm_dev *dev)
+ {
+ 	struct imsm_map *map = get_imsm_map(dev, MAP_0);
+ 	__u32 strip_blocks = __le32_to_cpu(map->blocks_per_strip);
+ 	int level = get_imsm_raid_level(map);
+
+ 	if (level == 5 || level == 10)
+ 		return strip_blocks;
+
+ 	return 128*1024 >> 9;
+ }
+
+ /* migr_strip_size when rebuilding a degraded disk, computed from the
+  * second map (MAP_1). No idea why this differs from
+  * migr_strip_blocks_resync(), but it's good to be compatible.
+  */
+ static __u32 migr_strip_blocks_rebuild(struct imsm_dev *dev)
+ {
+ 	struct imsm_map *map = get_imsm_map(dev, MAP_1);
+ 	__u32 strip_blocks = __le32_to_cpu(map->blocks_per_strip);
+ 	int level = get_imsm_raid_level(map);
+
+ 	if (level == 5)
+ 		return max((__u32) 64*1024 >> 9, strip_blocks);
+
+ 	/* mirrors use the strip size only when the member count is not a
+ 	 * multiple of the domain count
+ 	 */
+ 	if ((level == 1 || level == 10) &&
+ 	    map->num_members % map->num_domains != 0)
+ 		return strip_blocks;
+
+ 	return 128*1024 >> 9;
+ }
+
+ /* Ratio of the MAP_1 strip size to the MAP_0 strip size, but never less
+  * than 1.
+  */
+ static __u32 num_stripes_per_unit_resync(struct imsm_dev *dev)
+ {
+ 	struct imsm_map *first = get_imsm_map(dev, MAP_0);
+ 	struct imsm_map *second = get_imsm_map(dev, MAP_1);
+ 	__u32 first_strip = __le32_to_cpu(first->blocks_per_strip);
+ 	__u32 second_strip = __le32_to_cpu(second->blocks_per_strip);
+ 	__u32 ratio = second_strip / first_strip;
+
+ 	return ratio > (__u32) 1 ? ratio : (__u32) 1;
+ }
+
+ /* Stripes per migration unit while rebuilding: the domain count of the
+  * second map for levels 1 and 10, otherwise the same value as for resync.
+  */
+ static __u32 num_stripes_per_unit_rebuild(struct imsm_dev *dev)
+ {
+ 	struct imsm_map *first = get_imsm_map(dev, MAP_0);
+
+ 	switch (get_imsm_raid_level(first)) {
+ 	case 1:
+ 	case 10:
+ 		return get_imsm_map(dev, MAP_1)->num_domains;
+ 	default:
+ 		return num_stripes_per_unit_resync(dev);
+ 	}
+ }
+
+ /* Return the per-member size to use for @map of @dev.
+  *
+  * Normally this is per_dev_array_size(map). When that value, multiplied by
+  * the number of data members, is further from imsm_dev_size(dev) than the
+  * rounding tolerance, the device size divided by the number of data
+  * members is returned instead. Returns 0 when the map has no data members.
+  */
+ static unsigned long long calc_component_size(struct imsm_map *map,
+ 					      struct imsm_dev *dev)
+ {
+ 	unsigned long long dev_size = imsm_dev_size(dev);
+ 	unsigned int members = imsm_num_data_members(map);
+ 	unsigned long long per_member;
+ 	long long from_map;
+
+ 	if (!members)
+ 		return 0;
+
+ 	per_member = per_dev_array_size(map);
+ 	from_map = per_member * members;
+
+ 	/* Component size is rounded to 1MB so difference between size from
+ 	 * metadata and size calculated from num_data_stripes equals up to
+ 	 * 2048 blocks per each device. If the difference is higher it means
+ 	 * that array size was expanded and num_data_stripes was not updated.
+ 	 */
+ 	if (llabs(from_map - (long long)dev_size) <=
+ 	    (1 << SECT_PER_MB_SHIFT) * members)
+ 		return per_member;
+
+ 	per_member = dev_size / members;
+ 	dprintf("Invalid num_data_stripes in metadata; expected=%llu, found=%llu\n",
+ 		per_member / map->blocks_per_strip,
+ 		num_data_stripes(map));
+
+ 	return per_member;
+ }
+
+ /* Strip size of the first map multiplied by the domain count (levels 1
+  * and 10) or by the member count (level 5); the plain strip size for any
+  * other level.
+  */
+ static __u32 parity_segment_depth(struct imsm_dev *dev)
+ {
+ 	struct imsm_map *map = get_imsm_map(dev, MAP_0);
+ 	__u32 strip_blocks = __le32_to_cpu(map->blocks_per_strip);
+ 	int level = get_imsm_raid_level(map);
+
+ 	if (level == 1 || level == 10)
+ 		return strip_blocks * map->num_domains;
+ 	if (level == 5)
+ 		return strip_blocks * map->num_members;
+
+ 	return strip_blocks;
+ }
+
+ /* Translate @block through the layout of the second map (MAP_1).
+  * Levels 1, 10 and 5 are handled; 0 is returned for any other level.
+  */
+ static __u32 map_migr_block(struct imsm_dev *dev, __u32 block)
+ {
+ 	struct imsm_map *map = get_imsm_map(dev, MAP_1);
+ 	__u32 strip_blocks = __le32_to_cpu(map->blocks_per_strip);
+ 	__u32 strip_nr = block / strip_blocks;
+ 	int level = get_imsm_raid_level(map);
+
+ 	if (level == 1 || level == 10) {
+ 		__u32 vol_strip = (strip_nr * map->num_domains) + 1;
+ 		__u32 vol_stripe = vol_strip / map->num_members;
+
+ 		return vol_stripe * strip_blocks + block % strip_blocks;
+ 	}
+
+ 	if (level == 5) {
+ 		/* one member of each stripe does not carry data */
+ 		__u32 stripe_nr = strip_nr / (map->num_members - 1);
+
+ 		return stripe_nr * strip_blocks + block % strip_blocks;
+ 	}
+
+ 	return 0;
+ }
+
+ /* Number of per-member blocks that one imsm migration unit of @dev covers.
+  *
+  * @super: supplies the migration record used for general migrations
+  * @dev:   volume being examined
+  *
+  * Returns 0 when the volume is not migrating or the migration type is not
+  * handled here.
+  */
+ static __u64 blocks_per_migr_unit(struct intel_super *super,
+ struct imsm_dev *dev)
+ {
+ /* calculate the conversion factor between per member 'blocks'
+ * (md/{resync,rebuild}_start) and imsm migration units, return
+ * 0 for the 'not migrating' and 'unsupported migration' cases
+ */
+ if (!dev->vol.migr_state)
+ return 0;
+
+ switch (migr_type(dev)) {
+ case MIGR_GEN_MIGR: {
+ /* general migration: the value is stored in the migration record */
+ struct migr_record *migr_rec = super->migr_rec;
+ return __le32_to_cpu(migr_rec->blocks_per_unit);
+ }
+ case MIGR_VERIFY:
+ case MIGR_REPAIR:
+ case MIGR_INIT: {
+ struct imsm_map *map = get_imsm_map(dev, MAP_0);
+ __u32 stripes_per_unit;
+ __u32 blocks_per_unit;
+ __u32 parity_depth;
+ __u32 migr_chunk;
+ __u32 block_map;
+ __u32 block_rel;
+ __u32 segment;
+ __u32 stripe;
+ __u8 disks;
+
+ /* yes, this is really the translation of migr_units to
+ * per-member blocks in the 'resync' case
+ */
+ stripes_per_unit = num_stripes_per_unit_resync(dev);
+ migr_chunk = migr_strip_blocks_resync(dev);
+ /* NOTE(review): 'disks' is a __u8, so the member count is cut to 8 bits */
+ disks = imsm_num_data_members(map);
+ blocks_per_unit = stripes_per_unit * migr_chunk * disks;
+ /* NOTE(review): blocks_per_strip is converted with __le16_to_cpu here but
+ * with __le32_to_cpu in the helper functions of this file - confirm the
+ * field width
+ */
+ stripe = __le16_to_cpu(map->blocks_per_strip) * disks;
+ /* whole stripes in a unit, and the blocks left over after them */
+ segment = blocks_per_unit / stripe;
+ block_rel = blocks_per_unit - segment * stripe;
+ parity_depth = parity_segment_depth(dev);
+ block_map = map_migr_block(dev, block_rel);
+ return block_map + parity_depth * segment;
+ }
+ case MIGR_REBUILD: {
+ __u32 stripes_per_unit;
+ __u32 migr_chunk;
+
+ stripes_per_unit = num_stripes_per_unit_rebuild(dev);
+ migr_chunk = migr_strip_blocks_rebuild(dev);
+ return migr_chunk * stripes_per_unit;
+ }
+ case MIGR_STATE_CHANGE:
+ default:
+ return 0;
+ }
+ }
+
+ /* Layout value reported for a raid @level: 0 for levels 0 and 1,
+  * ALGORITHM_LEFT_ASYMMETRIC for levels 5 and 6, 0x102 for level 10 and
+  * UnSet for anything else.
+  */
+ static int imsm_level_to_layout(int level)
+ {
+ 	if (level == 0 || level == 1)
+ 		return 0;
+ 	if (level == 5 || level == 6)
+ 		return ALGORITHM_LEFT_ASYMMETRIC;
+ 	if (level == 10)
+ 		return 0x102;
+
+ 	return UnSet;
+ }
+
+/*******************************************************************************
+ * Function: read_imsm_migr_rec
+ * Description: Function reads imsm migration record from last sector of disk
+ * Parameters:
+ * fd : disk descriptor
+ * super : metadata info
+ * Returns:
+ * 0 : success,
+ * -1 : fail
+ ******************************************************************************/
+static int read_imsm_migr_rec(int fd, struct intel_super *super)
+{
+ int ret_val = -1;
+ unsigned int sector_size = super->sector_size;
+ unsigned long long dsize;
+
+ get_dev_size(fd, NULL, &dsize);
+ if (lseek64(fd, dsize - (sector_size*MIGR_REC_SECTOR_POSITION),
+ SEEK_SET) < 0) {
+ pr_err("Cannot seek to anchor block: %s\n",
+ strerror(errno));
+ goto out;
+ }
+ if ((unsigned int)read(fd, super->migr_rec_buf,
+ MIGR_REC_BUF_SECTORS*sector_size) !=
+ MIGR_REC_BUF_SECTORS*sector_size) {
+ pr_err("Cannot read migr record block: %s\n",
+ strerror(errno));
+ goto out;
+ }
+ ret_val = 0;
+ if (sector_size == 4096)
+ convert_from_4k_imsm_migr_rec(super);
+
+out:
+ return ret_val;
+}
+
+ /* Return the first volume in super->devlist for which is_gen_migration()
+  * is true, or NULL when there is none.
+  */
+ static struct imsm_dev *imsm_get_device_during_migration(
+ 	struct intel_super *super)
+ {
+ 	struct intel_dev *entry = super->devlist;
+
+ 	while (entry != NULL) {
+ 		if (is_gen_migration(entry->dev))
+ 			return entry->dev;
+ 		entry = entry->next;
+ 	}
+
+ 	return NULL;
+ }
+
+/*******************************************************************************
+ * Function: load_imsm_migr_rec
+ * Description: Function reads imsm migration record (it is stored at the last
+ * sector of disk)
+ * Parameters:
+ * super : imsm internal array info
+ * Returns:
+ * 0 : success
+ * -1 : fail
+ * -2 : no migration in progress
+ ******************************************************************************/
+static int load_imsm_migr_rec(struct intel_super *super)
+{
+ struct dl *dl;
+ char nm[30];
+ int retval = -1;
+ int fd = -1;
+ struct imsm_dev *dev;
+ struct imsm_map *map;
+ int slot = -1;
+ int keep_fd = 1;
+
+ /* find map under migration */
+ dev = imsm_get_device_during_migration(super);
+ /* nothing to load,no migration in progress?
+ */
+ if (dev == NULL)
+ return -2;
+
+ map = get_imsm_map(dev, MAP_0);
+ if (!map)
+ return -1;
+
+ for (dl = super->disks; dl; dl = dl->next) {
+ /* skip spare and failed disks
+ */
+ if (dl->index < 0)
+ continue;
+ /* read only from one of the first two slots
+ */
+ slot = get_imsm_disk_slot(map, dl->index);
+ if (slot > 1 || slot < 0)
+ continue;
+
+ if (!is_fd_valid(dl->fd)) {
+ sprintf(nm, "%d:%d", dl->major, dl->minor);
+ fd = dev_open(nm, O_RDONLY);
+
+ if (is_fd_valid(fd)) {
+ keep_fd = 0;
+ break;
+ }
+ } else {
+ fd = dl->fd;
+ break;
+ }
+ }
+
+ if (!is_fd_valid(fd))
+ return retval;
+ retval = read_imsm_migr_rec(fd, super);
+ if (!keep_fd)
+ close(fd);
+
+ return retval;
+}
+
+/*******************************************************************************
+ * function: imsm_create_metadata_checkpoint_update
+ * Description: It creates update for checkpoint change.
+ * Parameters:
+ * super : imsm internal array info
+ * u : pointer to prepared update
+ * Returns:
+ * Uptate length.
+ * If length is equal to 0, input pointer u contains no update
+ ******************************************************************************/
+static int imsm_create_metadata_checkpoint_update(
+ struct intel_super *super,
+ struct imsm_update_general_migration_checkpoint **u)
+{
+
+ int update_memory_size = 0;
+
+ dprintf("(enter)\n");
+
+ if (u == NULL)
+ return 0;
+ *u = NULL;
+
+ /* size of all update data without anchor */
+ update_memory_size =
+ sizeof(struct imsm_update_general_migration_checkpoint);
+
+ *u = xcalloc(1, update_memory_size);
+ if (*u == NULL) {
+ dprintf("error: cannot get memory\n");
+ return 0;
+ }
+ (*u)->type = update_general_migration_checkpoint;
+ (*u)->curr_migr_unit = current_migr_unit(super->migr_rec);
+ dprintf("prepared for %llu\n", (unsigned long long)(*u)->curr_migr_unit);
+
+ return update_memory_size;
+}
+
+static void imsm_update_metadata_locally(struct supertype *st,
+ void *buf, int len);
+
+/*******************************************************************************
+ * Function: write_imsm_migr_rec
+ * Description: Function writes imsm migration record
+ * (at the last sector of disk)
+ * Parameters:
+ * super : imsm internal array info
+ * Returns:
+ * 0 : success
+ * -1 : if fail
+ ******************************************************************************/
+static int write_imsm_migr_rec(struct supertype *st)
+{
+ struct intel_super *super = st->sb;
+ unsigned int sector_size = super->sector_size;
+ unsigned long long dsize;
+ int retval = -1;
+ struct dl *sd;
+ int len;
+ struct imsm_update_general_migration_checkpoint *u;
+ struct imsm_dev *dev;
+ struct imsm_map *map;
+
+ /* find map under migration */
+ dev = imsm_get_device_during_migration(super);
+ /* if no migration, write buffer anyway to clear migr_record
+ * on disk based on first available device
+ */
+ if (dev == NULL)
+ dev = get_imsm_dev(super, super->current_vol < 0 ? 0 :
+ super->current_vol);
+
+ map = get_imsm_map(dev, MAP_0);
+
+ if (sector_size == 4096)
+ convert_to_4k_imsm_migr_rec(super);
+ for (sd = super->disks ; sd ; sd = sd->next) {
+ int slot = -1;
+
+ /* skip failed and spare devices */
+ if (sd->index < 0)
+ continue;
+ /* write to 2 first slots only */
+ if (map)
+ slot = get_imsm_disk_slot(map, sd->index);
+ if (map == NULL || slot > 1 || slot < 0)
+ continue;
+
+ get_dev_size(sd->fd, NULL, &dsize);
+ if (lseek64(sd->fd, dsize - (MIGR_REC_SECTOR_POSITION *
+ sector_size),
+ SEEK_SET) < 0) {
+ pr_err("Cannot seek to anchor block: %s\n",
+ strerror(errno));
+ goto out;
+ }
+ if ((unsigned int)write(sd->fd, super->migr_rec_buf,
+ MIGR_REC_BUF_SECTORS*sector_size) !=
+ MIGR_REC_BUF_SECTORS*sector_size) {
+ pr_err("Cannot write migr record block: %s\n",
+ strerror(errno));
+ goto out;
+ }
+ }
+ if (sector_size == 4096)
+ convert_from_4k_imsm_migr_rec(super);
+ /* update checkpoint information in metadata */
+ len = imsm_create_metadata_checkpoint_update(super, &u);
+ if (len <= 0) {
+ dprintf("imsm: Cannot prepare update\n");
+ goto out;
+ }
+ /* update metadata locally */
+ imsm_update_metadata_locally(st, u, len);
+ /* and possibly remotely */
+ if (st->update_tail) {
+ append_metadata_update(st, u, len);
+ /* during reshape we do all work inside metadata handler
+ * manage_reshape(), so metadata update has to be triggered
+ * insida it
+ */
+ flush_metadata_updates(st);
+ st->update_tail = &st->updates;
+ } else
+ free(u);
+
+ retval = 0;
+ out:
+ return retval;
+}
+
+ /* Spare/missing disk activations are not allowed while an array in the
+  * container performs a reshape operation, because all arrays in the
+  * container work on the same set of disks.
+  *
+  * Returns 1 when any volume of the container is under general migration,
+  * 0 otherwise.
+  */
+ int imsm_reshape_blocks_arrays_changes(struct intel_super *super)
+ {
+ 	struct intel_dev *entry;
+
+ 	/* check whole container */
+ 	for (entry = super->devlist; entry != NULL; entry = entry->next) {
+ 		/* No repair during any migration in container */
+ 		if (is_gen_migration(entry->dev))
+ 			return 1;
+ 	}
+
+ 	return 0;
+ }
+ /* Round @component_size down to a multiple of the chunk size expressed in
+  * sectors (@chunk_size / @sector_size).
+  *
+  * The size is returned unchanged for level 1, for level UnSet, when it is
+  * already aligned, and when the chunk does not span at least one sector
+  * (including @sector_size == 0), where the alignment cannot be computed.
+  */
+ static unsigned long long imsm_component_size_alignment_check(int level,
+ 					      int chunk_size,
+ 					      unsigned int sector_size,
+ 					      unsigned long long component_size)
+ {
+ 	unsigned int component_size_alignment;
+
+ 	/* both the division and the modulo below would divide by zero */
+ 	if (sector_size == 0 || chunk_size/sector_size == 0) {
+ 		dprintf("imsm: cannot check component size alignment (chunk_size = %i, sector_size = %u)\n",
+ 			chunk_size, sector_size);
+ 		return component_size;
+ 	}
+
+ 	/* check component size alignment
+ 	 */
+ 	component_size_alignment = component_size % (chunk_size/sector_size);
+
+ 	dprintf("(Level: %i, chunk_size = %i, component_size = %llu), component_size_alignment = %u\n",
+ 		level, chunk_size, component_size,
+ 		component_size_alignment);
+
+ 	if (component_size_alignment && (level != 1) && (level != UnSet)) {
+ 		dprintf("imsm: reported component size aligned from %llu ",
+ 			component_size);
+ 		component_size -= component_size_alignment;
+ 		dprintf_cont("to %llu (%i).\n",
+ 			     component_size, component_size_alignment);
+ 	}
+
+ 	return component_size;
+ }
+
+/*******************************************************************************
+ * Function: get_bitmap_header_sector
+ * Description: Returns the sector where the bitmap header is placed.
+ * Parameters:
+ * st : supertype information
+ * dev_idx : index of the device with bitmap
+ *
+ * Returns:
+ * The sector where the bitmap header is placed
+ ******************************************************************************/
+static unsigned long long get_bitmap_header_sector(struct intel_super *super,
+ int dev_idx)
+{
+ struct imsm_dev *dev = get_imsm_dev(super, dev_idx);
+ struct imsm_map *map = get_imsm_map(dev, MAP_0);
+
+ if (!super->sector_size) {
+ dprintf("sector size is not set\n");
+ return 0;
+ }
+
+ return pba_of_lba0(map) + calc_component_size(map, dev) +
+ (IMSM_BITMAP_HEADER_OFFSET / super->sector_size);
+}
+
+/*******************************************************************************
+ * Function: get_bitmap_sector
+ * Description: Returns the sector where the bitmap is placed.
+ * Parameters:
+ * st : supertype information
+ * dev_idx : index of the device with bitmap
+ *
+ * Returns:
+ * The sector where the bitmap is placed
+ ******************************************************************************/
+static unsigned long long get_bitmap_sector(struct intel_super *super,
+ int dev_idx)
+{
+ if (!super->sector_size) {
+ dprintf("sector size is not set\n");
+ return 0;
+ }
+
+ return get_bitmap_header_sector(super, dev_idx) +
+ (IMSM_BITMAP_HEADER_SIZE / super->sector_size);
+}
+
+ /* Returns pba_of_lba0() of the first map of volume @dev_idx plus the
+  * number of data stripes multiplied by the blocks per strip.
+  */
+ static unsigned long long get_ppl_sector(struct intel_super *super, int dev_idx)
+ {
+ 	struct imsm_map *first_map =
+ 		get_imsm_map(get_imsm_dev(super, dev_idx), MAP_0);
+
+ 	return pba_of_lba0(first_map) +
+ 	       (num_data_stripes(first_map) * first_map->blocks_per_strip);
+ }
+
+/*
+ * getinfo_super_imsm_volume - fill @info for the volume selected by
+ * super->current_vol.
+ *
+ * @st:   supertype whose ->sb holds the loaded imsm metadata
+ * @info: output; it is zeroed first, but info->array.raid_disks is read
+ *        beforehand and used as the number of @dmap entries to fill
+ * @dmap: optional per-slot map; an entry is set to 1 when the disk in that
+ *        slot has CONFIGURED_DISK set, 0 otherwise
+ *
+ * When a second map (MAP_1) exists it is the one analysed for level, chunk
+ * size, data offset and slot lookup; MAP_0 supplies the member count and,
+ * during a general migration, the new level/layout/chunk.
+ */
+static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info, char *dmap)
+{
+ struct intel_super *super = st->sb;
+ struct migr_record *migr_rec = super->migr_rec;
+ struct imsm_dev *dev = get_imsm_dev(super, super->current_vol);
+ struct imsm_map *map = get_imsm_map(dev, MAP_0);
+ struct imsm_map *prev_map = get_imsm_map(dev, MAP_1);
+ struct imsm_map *map_to_analyse = map;
+ struct dl *dl;
+ /* must be read here: info is zeroed by the memset() below */
+ int map_disks = info->array.raid_disks;
+
+ memset(info, 0, sizeof(*info));
+ if (prev_map)
+ map_to_analyse = prev_map;
+
+ dl = super->current_disk;
+
+ info->container_member = super->current_vol;
+ info->array.raid_disks = map->num_members;
+ info->array.level = get_imsm_raid_level(map_to_analyse);
+ info->array.layout = imsm_level_to_layout(info->array.level);
+ info->array.md_minor = -1;
+ info->array.ctime = 0;
+ info->array.utime = 0;
+ /* blocks_per_strip is shifted by 9, i.e. multiplied by 512 */
+ info->array.chunk_size =
+ __le16_to_cpu(map_to_analyse->blocks_per_strip) << 9;
+ info->array.state = !(dev->vol.dirty & RAIDVOL_DIRTY);
+ info->custom_array_size = imsm_dev_size(dev);
+ info->recovery_blocked = imsm_reshape_blocks_arrays_changes(st->sb);
+
+ if (is_gen_migration(dev)) {
+ /*
+ * device prev_map should be added if it is in the middle
+ * of migration
+ */
+ assert(prev_map);
+
+ info->reshape_active = 1;
+ info->new_level = get_imsm_raid_level(map);
+ info->new_layout = imsm_level_to_layout(info->new_level);
+ info->new_chunk = __le16_to_cpu(map->blocks_per_strip) << 9;
+ info->delta_disks = map->num_members - prev_map->num_members;
+ if (info->delta_disks) {
+ /* this needs to be applied to every array
+ * in the container.
+ */
+ info->reshape_active = CONTAINER_RESHAPE;
+ }
+ /* The shape information that we give to md might have to be
+ * modified to cope with md's requirement for reshaping arrays.
+ * For example, when reshaping a RAID0, md requires it to be
+ * presented as a degraded RAID4.
+ * Also if a RAID0 is migrating to a RAID5 we need to specify
+ * the array as already being RAID5, but the 'before' layout
+ * is a RAID4-like layout.
+ */
+ switch (info->array.level) {
+ case 0:
+ switch(info->new_level) {
+ case 0:
+ /* conversion is happening as RAID4 */
+ info->array.level = 4;
+ info->array.raid_disks += 1;
+ break;
+ case 5:
+ /* conversion is happening as RAID5 */
+ info->array.level = 5;
+ info->array.layout = ALGORITHM_PARITY_N;
+ info->delta_disks -= 1;
+ break;
+ default:
+ /* FIXME error message */
+ info->array.level = UnSet;
+ break;
+ }
+ break;
+ }
+ } else {
+ info->new_level = UnSet;
+ info->new_layout = UnSet;
+ info->new_chunk = info->array.chunk_size;
+ info->delta_disks = 0;
+ }
+
+ if (dl) {
+ info->disk.major = dl->major;
+ info->disk.minor = dl->minor;
+ info->disk.number = dl->index;
+ info->disk.raid_disk = get_imsm_disk_slot(map_to_analyse,
+ dl->index);
+ }
+
+ info->data_offset = pba_of_lba0(map_to_analyse);
+ info->component_size = calc_component_size(map, dev);
+ info->component_size = imsm_component_size_alignment_check(
+ info->array.level,
+ info->array.chunk_size,
+ super->sector_size,
+ info->component_size);
+ info->bb.supported = 1;
+
+ memset(info->uuid, 0, sizeof(info->uuid));
+ info->recovery_start = MaxSector;
+
+ /* consistency policy: PPL for RAID5 with a distributed rwh policy,
+ * none for level <= 0, otherwise bitmap or plain resync
+ */
+ if (info->array.level == 5 &&
+ (dev->rwh_policy == RWH_DISTRIBUTED ||
+ dev->rwh_policy == RWH_MULTIPLE_DISTRIBUTED)) {
+ info->consistency_policy = CONSISTENCY_POLICY_PPL;
+ info->ppl_sector = get_ppl_sector(super, super->current_vol);
+ if (dev->rwh_policy == RWH_MULTIPLE_DISTRIBUTED)
+ info->ppl_size = MULTIPLE_PPL_AREA_SIZE_IMSM >> 9;
+ else
+ info->ppl_size = (PPL_HEADER_SIZE + PPL_ENTRY_SPACE)
+ >> 9;
+ } else if (info->array.level <= 0) {
+ info->consistency_policy = CONSISTENCY_POLICY_NONE;
+ } else {
+ if (dev->rwh_policy == RWH_BITMAP) {
+ info->bitmap_offset = get_bitmap_sector(super, super->current_vol);
+ info->consistency_policy = CONSISTENCY_POLICY_BITMAP;
+ } else {
+ info->consistency_policy = CONSISTENCY_POLICY_RESYNC;
+ }
+ }
+
+ info->reshape_progress = 0;
+ info->resync_start = MaxSector;
+ /* an uninitialized or dirty volume resyncs from the start, unless a
+ * reshape currently blocks array changes
+ */
+ if ((map_to_analyse->map_state == IMSM_T_STATE_UNINITIALIZED ||
+ !(info->array.state & 1)) &&
+ imsm_reshape_blocks_arrays_changes(super) == 0) {
+ info->resync_start = 0;
+ }
+ if (dev->vol.migr_state) {
+ switch (migr_type(dev)) {
+ case MIGR_REPAIR:
+ case MIGR_INIT: {
+ __u64 blocks_per_unit = blocks_per_migr_unit(super,
+ dev);
+ __u64 units = vol_curr_migr_unit(dev);
+
+ info->resync_start = blocks_per_unit * units;
+ break;
+ }
+ case MIGR_GEN_MIGR: {
+ __u64 blocks_per_unit = blocks_per_migr_unit(super,
+ dev);
+ __u64 units = current_migr_unit(migr_rec);
+ int used_disks;
+
+ if (__le32_to_cpu(migr_rec->ascending_migr) &&
+ (units <
+ (get_num_migr_units(migr_rec)-1)) &&
+ (super->migr_rec->rec_status ==
+ __cpu_to_le32(UNIT_SRC_IN_CP_AREA)))
+ units++;
+
+ info->reshape_progress = blocks_per_unit * units;
+
+ dprintf("IMSM: General Migration checkpoint : %llu (%llu) -> read reshape progress : %llu\n",
+ (unsigned long long)units,
+ (unsigned long long)blocks_per_unit,
+ info->reshape_progress);
+
+ used_disks = imsm_num_data_members(prev_map);
+ if (used_disks > 0) {
+ info->custom_array_size = per_dev_array_size(map) *
+ used_disks;
+ }
+ }
+ /* no break above: this case falls through to the default
+ * label, which sets resync_start to MaxSector.
+ * NOTE(review): looks intentional - confirm.
+ */
+ case MIGR_VERIFY:
+ /* we could emulate the checkpointing of
+ * 'sync_action=check' migrations, but for now
+ * we just immediately complete them
+ */
+ case MIGR_REBUILD:
+ /* this is handled by container_content_imsm() */
+ case MIGR_STATE_CHANGE:
+ /* FIXME handle other migrations */
+ default:
+ /* we are not dirty, so... */
+ info->resync_start = MaxSector;
+ }
+ }
+
+ strncpy(info->name, (char *) dev->volume, MAX_RAID_SERIAL_LEN);
+ /* strncpy() does not terminate when the source fills the limit */
+ info->name[MAX_RAID_SERIAL_LEN] = 0;
+
+ info->array.major_version = -1;
+ info->array.minor_version = -2;
+ sprintf(info->text_version, "/%s/%d", st->container_devnm, info->container_member);
+ info->safe_mode_delay = 4000; /* 4 secs like the Matrix driver */
+ uuid_from_super_imsm(st, info->uuid);
+
+ if (dmap) {
+ int i, j;
+ /* fill as many entries as the caller announced via
+ * info->array.raid_disks; slots beyond the array stay 0
+ */
+ for (i=0; i<map_disks; i++) {
+ dmap[i] = 0;
+ if (i < info->array.raid_disks) {
+ struct imsm_disk *dsk;
+ j = get_imsm_disk_idx(dev, i, MAP_X);
+ dsk = get_imsm_disk(super, j);
+ if (dsk && (dsk->status & CONFIGURED_DISK))
+ dmap[i] = 1;
+ }
+ }
+ }
+}
+
+/* forward declarations: both functions are used below, before the point
+ * where they are defined
+ */
+static __u8 imsm_check_degraded(struct intel_super *super, struct imsm_dev *dev,
+ int failed, int look_in_map);
+
+static int imsm_count_failed(struct intel_super *super, struct imsm_dev *dev,
+ int look_in_map);
+
+/* During a general migration, recompute the state of the second map (MAP_1)
+ * from the number of failed disks and record a pending metadata update when
+ * the stored state changes. Does nothing for other volumes.
+ */
+static void manage_second_map(struct intel_super *super, struct imsm_dev *dev)
+{
+	struct imsm_map *second;
+	__u8 new_state;
+	int nfailed;
+
+	if (!is_gen_migration(dev))
+		return;
+
+	second = get_imsm_map(dev, MAP_1);
+	nfailed = imsm_count_failed(super, dev, MAP_1);
+	new_state = imsm_check_degraded(super, dev, nfailed, MAP_1);
+	if (second->map_state == new_state)
+		return;
+
+	second->map_state = new_state;
+	super->updates_pending++;
+}
+
+/* Look up a disk by index on the super->missing list.
+ * Returns the matching imsm_disk, or NULL when no entry has that index.
+ */
+static struct imsm_disk *get_imsm_missing(struct intel_super *super, __u8 index)
+{
+	struct dl *cur = super->missing;
+
+	while (cur) {
+		if (cur->index == index)
+			return &cur->disk;
+		cur = cur->next;
+	}
+
+	return NULL;
+}
+
+/*
+ * getinfo_super_imsm - fill @info from the loaded imsm metadata.
+ *
+ * When a volume is selected (super->current_vol >= 0) the work is delegated
+ * to getinfo_super_imsm_volume(). Otherwise @info describes the container:
+ * level LEVEL_CONTAINER, raid_disks 0, text_version "imsm", and
+ * info->container_enough summarising whether the member arrays have the
+ * disks they need (see the loop below).
+ *
+ * @st:   supertype whose ->sb holds the loaded imsm metadata
+ * @info: output; zeroed first, but info->array.raid_disks is read beforehand
+ *        and used as the number of @map entries to fill
+ * @map:  optional per-slot map; every entry is set to 1 in container mode
+ */
+static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info, char *map)
+{
+ struct intel_super *super = st->sb;
+ struct imsm_disk *disk;
+ /* must be read here: info is zeroed by the memset() below */
+ int map_disks = info->array.raid_disks;
+ int max_enough = -1;
+ int i;
+ struct imsm_super *mpb;
+
+ if (super->current_vol >= 0) {
+ getinfo_super_imsm_volume(st, info, map);
+ return;
+ }
+ memset(info, 0, sizeof(*info));
+
+ /* Set raid_disks to zero so that Assemble will always pull in valid
+ * spares
+ */
+ info->array.raid_disks = 0;
+ info->array.level = LEVEL_CONTAINER;
+ info->array.layout = 0;
+ info->array.md_minor = -1;
+ info->array.ctime = 0; /* N/A for imsm */
+ info->array.utime = 0;
+ info->array.chunk_size = 0;
+
+ info->disk.major = 0;
+ info->disk.minor = 0;
+ info->disk.raid_disk = -1;
+ info->reshape_active = 0;
+ info->array.major_version = -1;
+ info->array.minor_version = -2;
+ strcpy(info->text_version, "imsm");
+ info->safe_mode_delay = 0;
+ info->disk.number = -1;
+ info->disk.state = 0;
+ info->name[0] = 0;
+ info->recovery_start = MaxSector;
+ info->recovery_blocked = imsm_reshape_blocks_arrays_changes(st->sb);
+ info->bb.supported = 1;
+
+ /* do we have all the insync disks that we expect? */
+ mpb = super->anchor;
+ info->events = __le32_to_cpu(mpb->generation_num);
+
+ /* per volume: enough = -1 (failed), 0 (newly degraded) or 1 (normal
+ * or already degraded); the container gets the maximum over volumes
+ */
+ for (i = 0; i < mpb->num_raid_devs; i++) {
+ struct imsm_dev *dev = get_imsm_dev(super, i);
+ int failed, enough, j, missing = 0;
+ struct imsm_map *map;
+ __u8 state;
+
+ failed = imsm_count_failed(super, dev, MAP_0);
+ state = imsm_check_degraded(super, dev, failed, MAP_0);
+ map = get_imsm_map(dev, MAP_0);
+
+ /* any newly missing disks?
+ * (catches single-degraded vs double-degraded)
+ */
+ for (j = 0; j < map->num_members; j++) {
+ __u32 ord = get_imsm_ord_tbl_ent(dev, j, MAP_0);
+ __u32 idx = ord_to_idx(ord);
+
+ /* remember the slot of the first disk on super->disks */
+ if (super->disks && super->disks->index == (int)idx)
+ info->disk.raid_disk = j;
+
+ if (!(ord & IMSM_ORD_REBUILD) &&
+ get_imsm_missing(super, idx)) {
+ missing = 1;
+ break;
+ }
+ }
+
+ if (state == IMSM_T_STATE_FAILED)
+ enough = -1;
+ else if (state == IMSM_T_STATE_DEGRADED &&
+ (state != map->map_state || missing))
+ enough = 0;
+ else /* we're normal, or already degraded */
+ enough = 1;
+ if (is_gen_migration(dev) && missing) {
+ /* during general migration we need all disks
+ * that process is running on.
+ * No new missing disk is allowed.
+ */
+ max_enough = -1;
+ enough = -1;
+ /* no more checks necessary
+ */
+ break;
+ }
+ /* in the missing/failed disk case check to see
+ * if at least one array is runnable
+ */
+ max_enough = max(max_enough, enough);
+ }
+ dprintf("enough: %d\n", max_enough);
+ info->container_enough = max_enough;
+
+ if (super->disks) {
+ __u32 reserved = imsm_reserved_sectors(super, super->disks);
+
+ disk = &super->disks->disk;
+ info->data_offset = total_blocks(&super->disks->disk) - reserved;
+ info->component_size = reserved;
+ info->disk.state = is_configured(disk) ? (1 << MD_DISK_ACTIVE) : 0;
+ /* we don't change info->disk.raid_disk here because
+ * this state will be finalized in mdmon after we have
+ * found the 'most fresh' version of the metadata
+ */
+ info->disk.state |= is_failed(disk) ? (1 << MD_DISK_FAULTY) : 0;
+ info->disk.state |= (is_spare(disk) || is_journal(disk)) ?
+ 0 : (1 << MD_DISK_SYNC);
+ }
+
+ /* only call uuid_from_super_imsm when this disk is part of a populated container,
+ * ->compare_super may have updated the 'num_raid_devs' field for spares
+ */
+ if (info->disk.state & (1 << MD_DISK_SYNC) || super->anchor->num_raid_devs)
+ uuid_from_super_imsm(st, info->uuid);
+ else
+ memcpy(info->uuid, uuid_zero, sizeof(uuid_zero));
+
+ /* I don't know how to compute 'map' on imsm, so use safe default */
+ if (map) {
+ int i;
+ for (i = 0; i < map_disks; i++)
+ map[i] = 1;
+ }
+
+}
+
+/* Allocates an mdinfo whose ->devs list holds one entry per disk on
+ * super->disks, with major/minor and state flags filled in.
+ * Entries are prepended, so the list is in reverse order of super->disks.
+ * Returns NULL when there is no metadata or no disk loaded.
+ */
+struct mdinfo *getinfo_super_disks_imsm(struct supertype *st)
+{
+	struct intel_super *super = st->sb;
+	struct mdinfo *mddev;
+	struct dl *dl;
+	int count = 0;
+
+	if (!super || !super->disks)
+		return NULL;
+
+	mddev = xcalloc(1, sizeof(*mddev));
+	for (dl = super->disks; dl; dl = dl->next) {
+		struct imsm_disk *disk = &dl->disk;
+		struct mdinfo *entry = xcalloc(1, sizeof(*entry));
+		int state = 0;
+
+		if (is_configured(disk))
+			state |= 1 << MD_DISK_ACTIVE;
+		if (is_failed(disk))
+			state |= 1 << MD_DISK_FAULTY;
+		if (!is_spare(disk))
+			state |= 1 << MD_DISK_SYNC;
+
+		entry->disk.number = count++;
+		entry->disk.major = dl->major;
+		entry->disk.minor = dl->minor;
+		entry->disk.state = state;
+		entry->disk.raid_disk = -1;
+
+		/* push at the head of the device list */
+		entry->next = mddev->devs;
+		mddev->devs = entry;
+	}
+
+	return mddev;
+}
+
+/*
+ * update_super_imsm - apply a named update to the container metadata.
+ *
+ * Only container-level metadata can be updated; when no metadata is loaded,
+ * no anchor is present, or a volume is currently selected, nothing is done
+ * and 1 is returned.
+ *
+ * Updates handled here:
+ *   "uuid"     : store info->uuid[0] as the original family number
+ *   "assemble" : nothing to change, only the checksum is regenerated
+ * Any other update name is rejected.
+ *
+ * Returns 0 when the update was accepted (the mpb checksum is recomputed),
+ * -1 when the update is unknown or the given uuid is not acceptable, and 1
+ * when the update could not be attempted at all.
+ */
+static int update_super_imsm(struct supertype *st, struct mdinfo *info,
+			     char *update, char *devname, int verbose,
+			     int uuid_set, char *homehost)
+{
+	int rv = 1;
+	struct intel_super *super = st->sb;
+	struct imsm_super *mpb;
+
+	/* we can only update container info */
+	if (!super || super->current_vol >= 0 || !super->anchor)
+		return 1;
+
+	mpb = super->anchor;
+
+	if (strcmp(update, "uuid") == 0) {
+		/* We take this to mean that the family_num should be updated.
+		 * However that is much smaller than the uuid so we cannot
+		 * really allow an explicit uuid to be given. And it is hard
+		 * to reliably know if one was.
+		 * So if !uuid_set we know the current uuid is random and just
+		 * use the first 'int' and copy it to the other 3 positions.
+		 * Otherwise we require the 4 'int's to be the same as would
+		 * be the case if we are using a random uuid. So an explicit
+		 * uuid will be accepted as long as all four ints are the
+		 * same... which shouldn't hurt.
+		 */
+		if (!uuid_set) {
+			info->uuid[1] = info->uuid[2] = info->uuid[3] =
+				info->uuid[0];
+			rv = 0;
+		} else if (info->uuid[0] != info->uuid[1] ||
+			   info->uuid[1] != info->uuid[2] ||
+			   info->uuid[2] != info->uuid[3]) {
+			rv = -1;
+		} else {
+			rv = 0;
+		}
+		if (rv == 0)
+			mpb->orig_family_num = info->uuid[0];
+	} else if (strcmp(update, "assemble") == 0) {
+		rv = 0;
+	} else {
+		rv = -1;
+	}
+
+	/* successful update? recompute checksum.
+	 * The value goes from cpu order into the little-endian metadata,
+	 * hence __cpu_to_le32 (the same byte swap as __le32_to_cpu, but
+	 * stating the actual direction).
+	 */
+	if (rv == 0)
+		mpb->check_sum = __cpu_to_le32(__gen_imsm_checksum(mpb));
+
+	return rv;
+}
+
+/* Estimate the mpb size in bytes for a container with the given number of
+ * disks. The sum is made of:
+ *  - the imsm_super header plus (disks - 1) further imsm_disk entries
+ *  - 2 raid devices
+ *  - up to 2 maps per raid device (-2 for the imsm_maps in imsm_dev)
+ *  - 4 possible disk_ord_tbl's
+ *  - the maximum bbm log
+ */
+static size_t disks_to_mpb_size(int disks)
+{
+	return sizeof(struct imsm_super) +
+	       (disks - 1) * sizeof(struct imsm_disk) +
+	       2 * sizeof(struct imsm_dev) +
+	       (4 - 2) * sizeof(struct imsm_map) +
+	       4 * (disks - 1) * sizeof(__u32) +
+	       sizeof(struct bbm_log);
+}
+
+/* Number of sectors of a device usable for data: the device size minus the
+ * mpb and reserved sectors, or 0 when the device cannot even hold those.
+ * st and data_offset are not used.
+ */
+static __u64 avail_size_imsm(struct supertype *st, __u64 devsize,
+			     unsigned long long data_offset)
+{
+	return devsize < (MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS) ?
+	       0 : devsize - (MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS);
+}
+
+/* Release every entry of super->devlist together with the imsm_dev copy
+ * it owns; the list is empty (NULL) afterwards.
+ */
+static void free_devlist(struct intel_super *super)
+{
+	struct intel_dev *cur = super->devlist;
+
+	while (cur) {
+		struct intel_dev *next = cur->next;
+
+		free(cur->dev);
+		free(cur);
+		cur = next;
+	}
+	super->devlist = NULL;
+}
+
+/* Copy a raid device description; the number of bytes copied is the size
+ * reported by sizeof_imsm_dev(src, 0).
+ */
+static void imsm_copy_dev(struct imsm_dev *dest, struct imsm_dev *src)
+{
+	size_t nbytes = sizeof_imsm_dev(src, 0);
+
+	memcpy(dest, src, nbytes);
+}
+
+/*
+ * compare_super_imsm - decide whether the metadata loaded in @tst can belong
+ * to the same container as the metadata in @st.
+ *
+ * When @st has no metadata yet, it takes over the metadata of @tst.
+ * Otherwise the HBA type and orom, the signature and the family number are
+ * compared as described by the comments below.
+ *
+ * @verbose: when set, HBA mismatches are reported with pr_err()
+ */
+static int compare_super_imsm(struct supertype *st, struct supertype *tst,
+ int verbose)
+{
+ /* return:
+ * 0 same, or first was empty, and second was copied
+ * 1 sb are different
+ */
+ struct intel_super *first = st->sb;
+ struct intel_super *sec = tst->sb;
+
+ if (!first) {
+ /* ownership of the metadata moves from tst to st */
+ st->sb = tst->sb;
+ tst->sb = NULL;
+ return 0;
+ }
+
+ /* in platform dependent environment test if the disks
+ * use the same Intel hba
+ * if not on Intel hba at all, allow anything.
+ * doesn't check HBAs if num_raid_devs is not set, as it means
+ * it is a free floating spare, and all spares regardless of HBA type
+ * will fall into separate container during the assembly
+ */
+ if (first->hba && sec->hba && first->anchor->num_raid_devs != 0) {
+ if (first->hba->type != sec->hba->type) {
+ if (verbose)
+ pr_err("HBAs of devices do not match %s != %s\n",
+ get_sys_dev_type(first->hba->type),
+ get_sys_dev_type(sec->hba->type));
+ return 1;
+ }
+ /* same HBA type but a different orom pointer; the message
+ * prints the pci ids of the two HBAs
+ */
+ if (first->orom != sec->orom) {
+ if (verbose)
+ pr_err("HBAs of devices do not match %s != %s\n",
+ first->hba->pci_id, sec->hba->pci_id);
+ return 1;
+ }
+ }
+
+ if (first->anchor->num_raid_devs > 0 &&
+ sec->anchor->num_raid_devs > 0) {
+ /* Determine if these disks might ever have been
+ * related. Further disambiguation can only take place
+ * in load_super_imsm_all
+ */
+ __u32 first_family = first->anchor->orig_family_num;
+ __u32 sec_family = sec->anchor->orig_family_num;
+
+ if (memcmp(first->anchor->sig, sec->anchor->sig,
+ MAX_SIGNATURE_LENGTH) != 0)
+ return 1;
+
+ /* fall back to family_num when orig_family_num is not set */
+ if (first_family == 0)
+ first_family = first->anchor->family_num;
+ if (sec_family == 0)
+ sec_family = sec->anchor->family_num;
+
+ if (first_family != sec_family)
+ return 1;
+
+ }
+
+ /* if an anchor does not have num_raid_devs set then it is a free
+ * floating spare. don't associate spare with any array, as during assembly
+ * spares shall fall into separate container, from which they can be moved
+ * when necessary
+ */
+ /* NOTE(review): the XOR below is non-zero whenever the two counts
+ * differ at all (e.g. 1 vs 2), not only when exactly one of them is
+ * zero as the comment above describes - confirm this is intended.
+ */
+ if (first->anchor->num_raid_devs ^ sec->anchor->num_raid_devs)
+ return 1;
+
+ return 0;
+}
+
+/* Write "/dev/<kernel name>" of the device open on fd into name, limited to
+ * MAX_RAID_SERIAL_LEN bytes. name is left untouched when the kernel name
+ * cannot be determined.
+ */
+static void fd2devname(int fd, char *name)
+{
+	char *kname = fd2kname(fd);
+
+	if (kname)
+		snprintf(name, MAX_RAID_SERIAL_LEN, "/dev/%s", kname);
+}
+
+/* Read the "serial" attribute of an NVMe device into buf.
+ * Returns 1 when fd is not an nvme device (kernel name not starting with
+ * "nvme") or its device path cannot be resolved; otherwise the result of
+ * devpath_to_char().
+ */
+static int nvme_get_serial(int fd, void *buf, size_t buf_len)
+{
+	char path[PATH_MAX];
+	char *kname = fd2kname(fd);
+
+	if (!kname || strncmp(kname, "nvme", 4) != 0)
+		return 1;
+
+	if (!diskfd_to_devpath(fd, 1, path))
+		return 1;
+
+	return devpath_to_char(path, "serial", buf, buf_len, 0);
+}
+
+/* defined outside of this file; used by imsm_read_serial() below */
+extern int scsi_get_serial(int fd, void *buf, size_t buf_len);
+
+/* Read the serial number of the device open on fd into serial.
+ *
+ * The NVMe path is tried first, then SCSI. When both fail and the
+ * IMSM_DEVNAME_AS_SERIAL environment setting is active, the device name is
+ * used as the serial instead. Otherwise the failure is reported (when
+ * devname is given) and the error code is returned.
+ *
+ * The raw serial is filtered: characters <= 0x20 are dropped and ':' becomes
+ * ';'. When the result is longer than serial_buf_len, the leading characters
+ * are cut off. Returns 0 on success.
+ */
+static int imsm_read_serial(int fd, char *devname,
+			    __u8 *serial, size_t serial_buf_len)
+{
+	char buf[50];
+	size_t in;
+	size_t out = 0;
+	size_t skip = 0;
+	int rv;
+
+	memset(buf, 0, sizeof(buf));
+
+	rv = nvme_get_serial(fd, buf, sizeof(buf));
+	if (rv)
+		rv = scsi_get_serial(fd, buf, sizeof(buf));
+
+	if (rv) {
+		if (check_env("IMSM_DEVNAME_AS_SERIAL")) {
+			memset(serial, 0, MAX_RAID_SERIAL_LEN);
+			fd2devname(fd, (char *) serial);
+			return 0;
+		}
+
+		if (devname)
+			pr_err("Failed to retrieve serial for %s\n",
+			       devname);
+		return rv;
+	}
+
+	/* compact the buffer in place: trim all whitespace and
+	 * non-printable characters and convert ':' to ';'
+	 */
+	for (in = 0; in < sizeof(buf) && buf[in]; in++) {
+		char c = buf[in];
+
+		if (c <= 0x20)
+			continue;
+		/* ':' is reserved for use in placeholder serial
+		 * numbers for missing disks
+		 */
+		buf[out++] = (c == ':') ? ';' : c;
+	}
+
+	if (out > serial_buf_len) {
+		/* truncate leading characters */
+		skip = out - serial_buf_len;
+		out = serial_buf_len;
+	}
+
+	memset(serial, 0, serial_buf_len);
+	memcpy(serial, buf + skip, out);
+
+	return 0;
+}
+
+/* Compare two serial numbers over at most MAX_RAID_SERIAL_LEN characters;
+ * the result follows strncmp().
+ */
+static int serialcmp(__u8 *s1, __u8 *s2)
+{
+	const char *lhs = (char *) s1;
+	const char *rhs = (char *) s2;
+
+	return strncmp(lhs, rhs, MAX_RAID_SERIAL_LEN);
+}
+
+/* Copy a serial number with strncpy() semantics, limited to
+ * MAX_RAID_SERIAL_LEN characters.
+ */
+static void serialcpy(__u8 *dest, __u8 *src)
+{
+	char *to = (char *) dest;
+	const char *from = (char *) src;
+
+	strncpy(to, from, MAX_RAID_SERIAL_LEN);
+}
+
+/* Find the entry of super->disks carrying the given serial number.
+ * Returns NULL when no disk matches.
+ */
+static struct dl *serial_to_dl(__u8 *serial, struct intel_super *super)
+{
+	struct dl *cur = super->disks;
+
+	while (cur && serialcmp(cur->serial, serial) != 0)
+		cur = cur->next;
+
+	return cur;
+}
+
+/* Find the disk entry of the mpb carrying the given serial number.
+ * On a match the position of the entry is stored in *idx (when idx is not
+ * NULL) and the entry is returned; otherwise NULL is returned and *idx is
+ * left untouched.
+ */
+static struct imsm_disk *
+__serial_to_disk(__u8 *serial, struct imsm_super *mpb, int *idx)
+{
+	int pos;
+
+	for (pos = 0; pos < mpb->num_disks; pos++) {
+		struct imsm_disk *candidate = __get_imsm_disk(mpb, pos);
+
+		if (serialcmp(candidate->serial, serial) != 0)
+			continue;
+
+		if (idx)
+			*idx = pos;
+		return candidate;
+	}
+
+	return NULL;
+}
+
+/*
+ * load_imsm_disk - create the struct dl entry for the device open on fd and
+ * make it the single entry of super->disks (the list must be empty).
+ *
+ * @fd:      open descriptor of the member device
+ * @super:   metadata the disk is attached to; super->anchor is searched for
+ *           a disk entry with the same serial number
+ * @devname: name used for messages and stored in the entry; when NULL the
+ *           name derived from fd is stored instead
+ * @keep_fd: when non-zero fd is remembered in the entry, otherwise -1 is
+ *           stored
+ *
+ * Returns 0 on success and 2 when the serial number or the device numbers
+ * cannot be read.
+ */
+static int
+load_imsm_disk(int fd, struct intel_super *super, char *devname, int keep_fd)
+{
+	struct imsm_disk *disk;
+	struct dl *dl;
+	struct stat stb;
+	int rv;
+	/* stays an empty string if fd2devname() cannot resolve the name */
+	char name[40] = "";
+	__u8 serial[MAX_RAID_SERIAL_LEN];
+
+	rv = imsm_read_serial(fd, devname, serial, MAX_RAID_SERIAL_LEN);
+
+	if (rv != 0)
+		return 2;
+
+	/* without a successful fstat() the major/minor numbers are garbage */
+	if (fstat(fd, &stb) != 0) {
+		if (devname)
+			pr_err("Cannot stat %s: %s\n",
+			       devname, strerror(errno));
+		return 2;
+	}
+
+	dl = xcalloc(1, sizeof(*dl));
+
+	dl->major = major(stb.st_rdev);
+	dl->minor = minor(stb.st_rdev);
+	dl->next = super->disks;
+	dl->fd = keep_fd ? fd : -1;
+	assert(super->disks == NULL);
+	super->disks = dl;
+	serialcpy(dl->serial, serial);
+	dl->index = -2;
+	dl->e = NULL;
+	fd2devname(fd, name);
+	if (devname)
+		dl->devname = xstrdup(devname);
+	else
+		dl->devname = xstrdup(name);
+
+	/* look up this disk's index in the current anchor */
+	disk = __serial_to_disk(dl->serial, super->anchor, &dl->index);
+	if (disk) {
+		dl->disk = *disk;
+		/* only keep the index on disks that are a member of a
+		 * populated container, i.e. one with raid_devs:
+		 * failed disks get -2, spares and journal disks get -1
+		 */
+		if (is_failed(&dl->disk))
+			dl->index = -2;
+		else if (is_spare(&dl->disk) || is_journal(&dl->disk))
+			dl->index = -1;
+	}
+
+	return 0;
+}
+
+/* When migrating, map0 contains the 'destination' state while map1
+ * contains the current state. When not migrating, map0 contains the
+ * current state. This routine assumes that map[0].map_state is set to
+ * the current array state before being called.
+ *
+ * Migration is indicated by one of the following states
+ *  1/ Idle (migr_state=0 map0state=normal||uninitialized||degraded||failed)
+ *  2/ Initialize (migr_state=1 migr_type=MIGR_INIT map0state=normal
+ *     map1state=uninitialized)
+ *  3/ Repair (Resync) (migr_state=1 migr_type=MIGR_REPAIR map0state=normal
+ *     map1state=normal)
+ *  4/ Rebuild (migr_state=1 migr_type=MIGR_REBUILD map0state=normal
+ *     map1state=degraded)
+ *  5/ Migration (migr_state=1 migr_type=MIGR_GEN_MIGR map0state=normal
+ *     map1state=normal)
+ */
+static void migrate(struct imsm_dev *dev, struct intel_super *super,
+		    __u8 to_state, int migr_type)
+{
+	struct imsm_map *cur_map = get_imsm_map(dev, MAP_0);
+	struct imsm_map *prev_map;
+
+	dev->vol.migr_state = 1;
+	set_migr_type(dev, migr_type);
+	set_vol_curr_migr_unit(dev, 0);
+
+	/* the second map is fetched once the volume is flagged as migrating */
+	prev_map = get_imsm_map(dev, MAP_1);
+
+	/* duplicate and then set the target end state in map[0] */
+	memcpy(prev_map, cur_map, sizeof_imsm_map(cur_map));
+
+	if (migr_type == MIGR_GEN_MIGR) {
+		int slot;
+
+		/* keep only the disk index of every ord table entry */
+		for (slot = 0; slot < cur_map->num_members; slot++) {
+			__u32 ord = __le32_to_cpu(cur_map->disk_ord_tbl[slot]);
+
+			set_imsm_ord_tbl_ent(cur_map, slot, ord_to_idx(ord));
+		}
+
+		/* Clear migration record */
+		memset(super->migr_rec, 0, sizeof(struct migr_record));
+	}
+
+	cur_map->map_state = to_state;
+}
+
+/* Finish the migration of dev: clear the migration state, type and
+ * checkpoint and store map_state in map[0].
+ *
+ * When the requested final state differs from the one recorded in map[0]
+ * (and this is neither a general migration nor an initialization), the ord
+ * table flags of the previous map are merged into map[0] first and the
+ * final state is recomputed from the number of failed disks.
+ */
+static void end_migration(struct imsm_dev *dev, struct intel_super *super,
+			  __u8 map_state)
+{
+	struct imsm_map *map = get_imsm_map(dev, MAP_0);
+	struct imsm_map *prev = get_imsm_map(dev, dev->vol.migr_state == 0 ?
+						  MAP_0 : MAP_1);
+
+	/* merge any IMSM_ORD_REBUILD bits that were not successfully
+	 * completed in the last migration.
+	 *
+	 * FIXME add support for raid-level-migration
+	 */
+	if (map_state != map->map_state && !is_gen_migration(dev) &&
+	    prev->map_state != IMSM_T_STATE_UNINITIALIZED) {
+		/* when final map state is other than expected
+		 * merge maps (not for migration)
+		 */
+		int nfailed;
+		int p, m;
+
+		for (p = 0; p < prev->num_members; p++) {
+			for (m = 0; m < map->num_members; m++) {
+				/* during online capacity expansion
+				 * disks position can be changed
+				 * if takeover is used
+				 */
+				if (ord_to_idx(map->disk_ord_tbl[m]) !=
+				    ord_to_idx(prev->disk_ord_tbl[p]))
+					continue;
+
+				map->disk_ord_tbl[m] |= prev->disk_ord_tbl[p];
+				break;
+			}
+		}
+
+		nfailed = imsm_count_failed(super, dev, MAP_0);
+		map_state = imsm_check_degraded(super, dev, nfailed, MAP_0);
+	}
+
+	dev->vol.migr_state = 0;
+	set_migr_type(dev, 0);
+	set_vol_curr_migr_unit(dev, 0);
+	map->map_state = map_state;
+}
+
+/* Copy every raid device of the anchor into its own buffer on
+ * super->devlist (newest entry first) and make sure super->buf can hold the
+ * mpb plus the additional space accumulated while scanning the devices.
+ *
+ * Returns 0 on success, 1 when the enlarged mpb buffer cannot be allocated.
+ */
+static int parse_raid_devices(struct intel_super *super)
+{
+	struct imsm_super *mpb = super->anchor;
+	size_t largest = 0;
+	size_t extra = 0;
+	size_t wanted;
+	int i;
+
+	for (i = 0; i < super->anchor->num_raid_devs; i++) {
+		struct imsm_dev *on_disk = __get_imsm_dev(super->anchor, i);
+		size_t plain_len = sizeof_imsm_dev(on_disk, 0);
+		size_t migr_len = sizeof_imsm_dev(on_disk, 1);
+		struct intel_dev *dv;
+
+		if (migr_len > plain_len)
+			extra += migr_len - plain_len;
+
+		dv = xmalloc(sizeof(*dv));
+
+		/* every copy is allocated with the largest size seen so far */
+		if (largest < migr_len)
+			largest = migr_len;
+		if (largest > migr_len)
+			extra += largest - migr_len;
+
+		dv->dev = xmalloc(largest);
+		imsm_copy_dev(dv->dev, on_disk);
+		dv->index = i;
+		dv->next = super->devlist;
+		super->devlist = dv;
+	}
+
+	/* ensure that super->buf is large enough when all raid devices
+	 * are migrating
+	 */
+	wanted = __le32_to_cpu(mpb->mpb_size) + extra;
+	if (wanted > super->len) {
+		size_t new_len = ROUND_UP(wanted, super->sector_size);
+		void *buf;
+
+		if (posix_memalign(&buf, MAX_SECTOR_SIZE, new_len) != 0)
+			return 1;
+
+		/* keep the current content, zero the newly gained tail */
+		memcpy(buf, super->buf, super->len);
+		memset((char *)buf + super->len, 0, new_len - super->len);
+		free(super->buf);
+		super->buf = buf;
+		super->len = new_len;
+	}
+
+	super->extra_space += extra;
+
+	return 0;
+}
+
+/*******************************************************************************
+ * Function:	check_mpb_migr_compatibility
+ * Description:	Checks every raid device that is in general migration
+ *		(MIGR_GEN_MIGR) for unsupported migration features:
+ *		- migration optimization area (pba_of_lba0 differs between
+ *		  the two maps)
+ *		- descending reshape (ascending_migr not set)
+ * Parameters:
+ *	super	: imsm metadata information
+ * Returns:
+ *	 0 : migration is compatible
+ *	-1 : migration is not compatible
+ ******************************************************************************/
+int check_mpb_migr_compatibility(struct intel_super *super)
+{
+	struct migr_record *migr_rec = super->migr_rec;
+	int i;
+
+	for (i = 0; i < super->anchor->num_raid_devs; i++) {
+		struct imsm_dev *dev_iter = __get_imsm_dev(super->anchor, i);
+		struct imsm_map *map0, *map1;
+
+		/* only devices in general migration are of interest */
+		if (!dev_iter ||
+		    dev_iter->vol.migr_state != 1 ||
+		    dev_iter->vol.migr_type != MIGR_GEN_MIGR)
+			continue;
+
+		map0 = get_imsm_map(dev_iter, MAP_0);
+		map1 = get_imsm_map(dev_iter, MAP_1);
+
+		/* migration optimization area was used */
+		if (pba_of_lba0(map0) != pba_of_lba0(map1))
+			return -1;
+
+		/* descending reshape not supported yet */
+		if (migr_rec->ascending_migr == 0 &&
+		    migr_rec->dest_depth_per_unit > 0)
+			return -1;
+	}
+
+	return 0;
+}
+
+/* forward declaration: load_imsm_mpb() below releases previously loaded
+ * metadata through it
+ */
+static void __free_imsm(struct intel_super *super, int free_disks);
+
+/* load_imsm_mpb - read matrix metadata
+ *
+ * Reads the anchor sector (second to last sector of the device) and, when
+ * the mpb spans more than one sector, the extended mpb stored in front of
+ * it. Anything previously loaded in super (except the disk lists) is
+ * released first.
+ *
+ * Allocates super->buf and super->migr_rec_buf, to be freed by free_imsm.
+ *
+ * Returns:
+ *  0 : metadata loaded and checksum verified
+ *  1 : device too small, seek failure, or the anchor could not be
+ *      allocated/read
+ *  2 : no IMSM signature, invalid mpb size, allocation failure, checksum
+ *      mismatch of a single-sector mpb, or short read of the extended mpb
+ *  3 : checksum mismatch of a multi-sector mpb
+ */
+static int load_imsm_mpb(int fd, struct intel_super *super, char *devname)
+{
+	/* stays 0 when the size cannot be determined, so that the
+	 * "too small" check below rejects the device
+	 */
+	unsigned long long dsize = 0;
+	unsigned long long sectors;
+	unsigned int sector_size = super->sector_size;
+	struct imsm_super *anchor;
+	__u32 check_sum;
+
+	get_dev_size(fd, NULL, &dsize);
+	if (dsize < 2*sector_size) {
+		if (devname)
+			pr_err("%s: device to small for imsm\n",
+			       devname);
+		return 1;
+	}
+
+	if (lseek64(fd, dsize - (sector_size * 2), SEEK_SET) < 0) {
+		if (devname)
+			pr_err("Cannot seek to anchor block on %s: %s\n",
+			       devname, strerror(errno));
+		return 1;
+	}
+
+	if (posix_memalign((void **)&anchor, sector_size, sector_size) != 0) {
+		if (devname)
+			pr_err("Failed to allocate imsm anchor buffer on %s\n", devname);
+		return 1;
+	}
+	if ((unsigned int)read(fd, anchor, sector_size) != sector_size) {
+		if (devname)
+			pr_err("Cannot read anchor block on %s: %s\n",
+			       devname, strerror(errno));
+		free(anchor);
+		return 1;
+	}
+
+	if (strncmp((char *) anchor->sig, MPB_SIGNATURE, MPB_SIG_LEN) != 0) {
+		if (devname)
+			pr_err("no IMSM anchor on %s\n", devname);
+		free(anchor);
+		return 2;
+	}
+
+	__free_imsm(super, 0);
+	/* reload capability and hba */
+
+	/* capability and hba must be updated with new super allocation */
+	find_intel_hba_capability(fd, super, devname);
+	/* mpb_size is stored little-endian in the metadata */
+	super->len = ROUND_UP(__le32_to_cpu(anchor->mpb_size), sector_size);
+	if (super->len < sector_size) {
+		/* a whole anchor sector is copied below; a smaller mpb size
+		 * can only come from corrupted metadata
+		 */
+		if (devname)
+			pr_err("invalid mpb size on %s\n", devname);
+		super->len = 0;
+		free(anchor);
+		return 2;
+	}
+	if (posix_memalign(&super->buf, MAX_SECTOR_SIZE, super->len) != 0) {
+		if (devname)
+			pr_err("unable to allocate %zu byte mpb buffer\n",
+			       super->len);
+		free(anchor);
+		return 2;
+	}
+	memcpy(super->buf, anchor, sector_size);
+
+	/* number of sectors of the mpb beyond the anchor sector */
+	sectors = mpb_sectors(anchor, sector_size) - 1;
+	free(anchor);
+
+	if (posix_memalign(&super->migr_rec_buf, MAX_SECTOR_SIZE,
+			   MIGR_REC_BUF_SECTORS*MAX_SECTOR_SIZE) != 0) {
+		pr_err("could not allocate migr_rec buffer\n");
+		free(super->buf);
+		/* do not leave a dangling pointer for __free_imsm() */
+		super->buf = NULL;
+		return 2;
+	}
+	super->clean_migration_record_by_mdmon = 0;
+
+	if (!sectors) {
+		check_sum = __gen_imsm_checksum(super->anchor);
+		if (check_sum != __le32_to_cpu(super->anchor->check_sum)) {
+			if (devname)
+				pr_err("IMSM checksum %x != %x on %s\n",
+				       check_sum,
+				       __le32_to_cpu(super->anchor->check_sum),
+				       devname);
+			return 2;
+		}
+
+		return 0;
+	}
+
+	/* read the extended mpb */
+	if (lseek64(fd, dsize - (sector_size * (2 + sectors)), SEEK_SET) < 0) {
+		if (devname)
+			pr_err("Cannot seek to extended mpb on %s: %s\n",
+			       devname, strerror(errno));
+		return 1;
+	}
+
+	if ((unsigned int)read(fd, super->buf + sector_size,
+		    super->len - sector_size) != super->len - sector_size) {
+		if (devname)
+			pr_err("Cannot read extended mpb on %s: %s\n",
+			       devname, strerror(errno));
+		return 2;
+	}
+
+	check_sum = __gen_imsm_checksum(super->anchor);
+	if (check_sum != __le32_to_cpu(super->anchor->check_sum)) {
+		if (devname)
+			pr_err("IMSM checksum %x != %x on %s\n",
+			       check_sum, __le32_to_cpu(super->anchor->check_sum),
+			       devname);
+		return 3;
+	}
+
+	return 0;
+}
+
+/* forward declaration; the definition is not part of this section */
+static int read_imsm_migr_rec(int fd, struct intel_super *super);
+
+/* Clears the high parts of the 64-bit fields in the metadata (disk sizes
+ * and map offsets/sizes) unless MPB_ATTRIB_2TB_DISK is set.
+ * Stops at the first raid device that cannot be looked up.
+ */
+static void clear_hi(struct intel_super *super)
+{
+	struct imsm_super *mpb = super->anchor;
+	int d, v, m;
+
+	if (mpb->attributes & MPB_ATTRIB_2TB_DISK)
+		return;
+
+	for (d = 0; d < mpb->num_disks; ++d)
+		mpb->disk[d].total_blocks_hi = 0;
+
+	for (v = 0; v < mpb->num_raid_devs; ++v) {
+		struct imsm_dev *dev = get_imsm_dev(super, v);
+
+		if (!dev)
+			return;
+
+		/* both maps of the device, when present */
+		for (m = 0; m < 2; ++m) {
+			struct imsm_map *map = get_imsm_map(dev, m);
+
+			if (map) {
+				map->pba_of_lba0_hi = 0;
+				map->blocks_per_member_hi = 0;
+				map->num_data_stripes_hi = 0;
+			}
+		}
+	}
+}
+
+/* Load the metadata from fd and build the in-memory state from it: mpb
+ * (converted from 4k sectors when needed), disk entry, raid device list and
+ * bad block log. The high 64-bit parts are cleared after the bad block log
+ * has been loaded, whether or not that load succeeded.
+ *
+ * Returns 0 on success or the error code of the first step that failed.
+ */
+static int
+load_and_parse_mpb(int fd, struct intel_super *super, char *devname, int keep_fd)
+{
+	int rc;
+
+	rc = load_imsm_mpb(fd, super, devname);
+	if (rc == 0) {
+		if (super->sector_size == 4096)
+			convert_from_4k(super);
+		rc = load_imsm_disk(fd, super, devname, keep_fd);
+	}
+	if (rc == 0)
+		rc = parse_raid_devices(super);
+	if (rc == 0) {
+		rc = load_bbm_log(super);
+		clear_hi(super);
+	}
+
+	return rc;
+}
+
+/* Release one disk entry together with its name and the data hanging off
+ * d->e; the descriptor is closed first when do_close is set.
+ */
+static void __free_imsm_disk(struct dl *d, int do_close)
+{
+	if (do_close)
+		close_fd(&d->fd);
+
+	/* free() accepts NULL, no need to test the members */
+	free(d->devname);
+	free(d->e);
+	free(d);
+}
+
+/* Empty the three disk lists of super (disks, disk_mgmt_list and missing,
+ * in that order), closing and freeing every entry.
+ */
+static void free_imsm_disks(struct intel_super *super)
+{
+	struct dl **lists[] = {
+		&super->disks,
+		&super->disk_mgmt_list,
+		&super->missing,
+	};
+	unsigned int k;
+
+	for (k = 0; k < sizeof(lists) / sizeof(lists[0]); k++) {
+		struct dl **head = lists[k];
+
+		while (*head) {
+			struct dl *d = *head;
+
+			*head = d->next;
+			__free_imsm_disk(d, 1);
+		}
+	}
+}
+
+/* free all the pieces hanging off of a super pointer
+ *
+ * The super structure itself stays allocated and can be reused: every
+ * pointer released here is reset to NULL so that a later call (or a reload
+ * of the metadata) does not free it a second time. The disk lists are only
+ * released when free_disks is set.
+ */
+static void __free_imsm(struct intel_super *super, int free_disks)
+{
+	struct intel_hba *elem, *next;
+
+	free(super->buf);
+	super->buf = NULL;
+
+	/* unlink capability description */
+	super->orom = NULL;
+
+	free(super->migr_rec_buf);
+	super->migr_rec_buf = NULL;
+
+	if (free_disks)
+		free_imsm_disks(super);
+	free_devlist(super);
+
+	for (elem = super->hba; elem; elem = next) {
+		next = elem->next;
+		free((void *)elem->path);
+		free(elem);
+	}
+	super->hba = NULL;
+
+	free(super->bbm_log);
+	super->bbm_log = NULL;
+}
+
+/* Release a super completely: everything hanging off it (including the
+ * disk lists), the bad block entries and the structure itself.
+ */
+static void free_imsm(struct intel_super *super)
+{
+	void *bb_entries;
+
+	__free_imsm(super, 1);
+
+	bb_entries = super->bb.entries;
+	free(bb_entries);
+	free(super);
+}
+
+/* Release the imsm metadata attached to st, if any, and detach it. */
+static void free_super_imsm(struct supertype *st)
+{
+	if (st->sb) {
+		free_imsm(st->sb);
+		st->sb = NULL;
+	}
+}
+
+ /*
+  * Allocate a zeroed super with room for BBM_LOG_MAX_ENTRIES bad block
+  * entries. current_vol starts at -1 and create_offset at all ones.
+  * Returns NULL if the entries table could not be allocated.
+  */
+ static struct intel_super *alloc_super(void)
+ {
+ 	struct intel_super *sb = xcalloc(1, sizeof(*sb));
+
+ 	sb->bb.entries = xmalloc(BBM_LOG_MAX_ENTRIES *
+ 				 sizeof(struct md_bb_entry));
+ 	if (sb->bb.entries == NULL) {
+ 		free(sb);
+ 		return NULL;
+ 	}
+
+ 	sb->current_vol = -1;
+ 	sb->create_offset = ~((unsigned long long) 0);
+
+ 	return sb;
+ }
+
+/*
+ * find and allocate hba and OROM/EFI based on valid fd of RAID component device
+ */
+static int find_intel_hba_capability(int fd, struct intel_super *super, char *devname)
+{
+ struct sys_dev *hba_name;
+ int rv = 0;
+
+ if (is_fd_valid(fd) && test_partition(fd)) {
+ pr_err("imsm: %s is a partition, cannot be used in IMSM\n",
+ devname);
+ return 1;
+ }
+ if (!is_fd_valid(fd) || check_env("IMSM_NO_PLATFORM")) {
+ super->orom = NULL;
+ super->hba = NULL;
+ return 0;
+ }
+ hba_name = find_disk_attached_hba(fd, NULL);
+ if (!hba_name) {
+ if (devname)
+ pr_err("%s is not attached to Intel(R) RAID controller.\n",
+ devname);
+ return 1;
+ }
+ rv = attach_hba_to_super(super, hba_name);
+ if (rv == 2) {
+ if (devname) {
+ struct intel_hba *hba = super->hba;
+
+ pr_err("%s is attached to Intel(R) %s %s (%s),\n"
+ " but the container is assigned to Intel(R) %s %s (",
+ devname,
+ get_sys_dev_type(hba_name->type),
+ hba_name->type == SYS_DEV_VMD ? "domain" : "RAID controller",
+ hba_name->pci_id ? : "Err!",
+ get_sys_dev_type(super->hba->type),
+ hba->type == SYS_DEV_VMD ? "domain" : "RAID controller");
+
+ while (hba) {
+ fprintf(stderr, "%s", hba->pci_id ? : "Err!");
+ if (hba->next)
+ fprintf(stderr, ", ");
+ hba = hba->next;
+ }
+ fprintf(stderr, ").\n"
+ " Mixing devices attached to different controllers is not allowed.\n");
+ }
+ return 2;
+ }
+ super->orom = find_imsm_capability(hba_name);
+ if (!super->orom)
+ return 3;
+
+ return 0;
+}
+
+ /* find_missing - helper routine for load_super_imsm_all that identifies
+  * disks that have disappeared from the system. This routine relies on
+  * the mpb being uptodate, which it is at load time.
+  *
+  * Every mpb disk record whose serial has no dl entry gets a placeholder
+  * named "missing" (fd -1, major/minor 0) pushed onto super->missing.
+  * Always returns 0.
+  */
+ static int find_missing(struct intel_super *super)
+ {
+ 	struct imsm_super *mpb = super->anchor;
+ 	int idx;
+
+ 	for (idx = 0; idx < mpb->num_disks; idx++) {
+ 		struct imsm_disk *rec = __get_imsm_disk(mpb, idx);
+ 		struct dl *entry;
+
+ 		if (serial_to_dl(rec->serial, super))
+ 			continue;
+
+ 		entry = xmalloc(sizeof(*entry));
+ 		entry->major = 0;
+ 		entry->minor = 0;
+ 		entry->fd = -1;
+ 		entry->devname = xstrdup("missing");
+ 		entry->index = idx;
+ 		serialcpy(entry->serial, rec->serial);
+ 		entry->disk = *rec;
+ 		entry->e = NULL;
+
+ 		/* push onto the head of the missing list */
+ 		entry->next = super->missing;
+ 		super->missing = entry;
+ 	}
+
+ 	return 0;
+ }
+
+ /* Return the first entry of 'disk_list' whose serial equals 'serial', or NULL */
+ static struct intel_disk *disk_list_get(__u8 *serial, struct intel_disk *disk_list)
+ {
+ 	struct intel_disk *cur;
+
+ 	for (cur = disk_list; cur; cur = cur->next) {
+ 		if (serialcmp(cur->disk.serial, serial) == 0)
+ 			return cur;
+ 	}
+
+ 	return NULL;
+ }
+ /* __prep_thunderdome - offer 'super' as a candidate for the per-family
+ * table and fold its disk records into the merged list at *disk_list.
+ *
+ * A table entry with the same family_num is replaced by 'super' when the
+ * check sums match, or when 'super' carries the higher generation_num and
+ * passes the is_configured() comparison below. Otherwise the table entry
+ * is kept and the function returns early without merging disk records.
+ * When no entry of the same family exists 'super' is appended.
+ * NOTE(review): the caller presumably guarantees room for one more table
+ * entry - confirm.
+ *
+ * Returns the resulting number of used table entries.
+ */
+ static int __prep_thunderdome(struct intel_super **table, int tbl_size,
+ struct intel_super *super,
+ struct intel_disk **disk_list)
+ {
+ struct imsm_disk *d = &super->disks->disk;
+ struct imsm_super *mpb = super->anchor;
+ int i, j;
+
+ for (i = 0; i < tbl_size; i++) {
+ struct imsm_super *tbl_mpb = table[i]->anchor;
+ struct imsm_disk *tbl_d = &table[i]->disks->disk;
+
+ if (tbl_mpb->family_num == mpb->family_num) {
+ if (tbl_mpb->check_sum == mpb->check_sum) {
+ dprintf("mpb from %d:%d matches %d:%d\n",
+ super->disks->major,
+ super->disks->minor,
+ table[i]->disks->major,
+ table[i]->disks->minor);
+ break;
+ }
+
+ if (((is_configured(d) && !is_configured(tbl_d)) ||
+ is_configured(d) == is_configured(tbl_d)) &&
+ tbl_mpb->generation_num < mpb->generation_num) {
+ /* current version of the mpb is a
+ * better candidate than the one in
+ * super_table, but copy over "cross
+ * generational" status
+ */
+ struct intel_disk *idisk;
+
+ dprintf("mpb from %d:%d replaces %d:%d\n",
+ super->disks->major,
+ super->disks->minor,
+ table[i]->disks->major,
+ table[i]->disks->minor);
+
+ idisk = disk_list_get(tbl_d->serial, *disk_list);
+ if (idisk && is_failed(&idisk->disk))
+ tbl_d->status |= FAILED_DISK;
+ break;
+ } else {
+ struct intel_disk *idisk;
+ struct imsm_disk *disk;
+
+ /* tbl_mpb is more up to date, but copy
+ * over cross generational status before
+ * returning
+ */
+ disk = __serial_to_disk(d->serial, mpb, NULL);
+ if (disk && is_failed(disk))
+ d->status |= FAILED_DISK;
+
+ idisk = disk_list_get(d->serial, *disk_list);
+ if (idisk) {
+ idisk->owner = i;
+ if (disk && is_configured(disk))
+ idisk->disk.status |= CONFIGURED_DISK;
+ }
+
+ dprintf("mpb from %d:%d prefer %d:%d\n",
+ super->disks->major,
+ super->disks->minor,
+ table[i]->disks->major,
+ table[i]->disks->minor);
+
+ return tbl_size;
+ }
+ }
+ }
+
+ if (i >= tbl_size) /* loop ran to completion: no entry of this family yet */
+ table[tbl_size++] = super;
+ else
+ table[i] = super;
+
+ /* update/extend the merged list of imsm_disk records */
+ for (j = 0; j < mpb->num_disks; j++) {
+ struct imsm_disk *disk = __get_imsm_disk(mpb, j);
+ struct intel_disk *idisk;
+
+ idisk = disk_list_get(disk->serial, *disk_list);
+ if (idisk) {
+ idisk->disk.status |= disk->status;
+ if (is_configured(&idisk->disk) ||
+ is_failed(&idisk->disk))
+ idisk->disk.status &= ~(SPARE_DISK);
+ } else {
+ idisk = xcalloc(1, sizeof(*idisk));
+ idisk->owner = IMSM_UNKNOWN_OWNER;
+ idisk->disk = *disk;
+ idisk->next = *disk_list;
+ *disk_list = idisk;
+ }
+
+ if (serialcmp(idisk->disk.serial, d->serial) == 0) /* record of the disk 'super' was read from */
+ idisk->owner = i;
+ }
+
+ return tbl_size;
+ }
+
+ /*
+  * Check every disk recorded in super's mpb against the merged 'disk_list'.
+  * A disk counts as valid when its list entry is owned by 'owner' or has no
+  * known owner. Scanning stops at the first disk that is absent from the
+  * list.
+  * Returns 'super' when all mpb disks are valid, NULL otherwise.
+  */
+ static struct intel_super *
+ validate_members(struct intel_super *super, struct intel_disk *disk_list,
+ 		 const int owner)
+ {
+ 	struct imsm_super *mpb = super->anchor;
+ 	int matched = 0;
+ 	int n = 0;
+
+ 	while (n < mpb->num_disks) {
+ 		struct imsm_disk *member = __get_imsm_disk(mpb, n);
+ 		struct intel_disk *known = disk_list_get(member->serial,
+ 							 disk_list);
+
+ 		if (!known) {
+ 			dprintf("unknown disk %x [%d]: %.16s\n",
+ 				__le32_to_cpu(mpb->family_num), n,
+ 				member->serial);
+ 			break;
+ 		}
+
+ 		if (known->owner == owner ||
+ 		    known->owner == IMSM_UNKNOWN_OWNER) {
+ 			matched++;
+ 		} else {
+ 			dprintf("'%.16s' owner %d != %d\n",
+ 				member->serial, known->owner,
+ 				owner);
+ 		}
+ 		n++;
+ 	}
+
+ 	return matched == mpb->num_disks ? super : NULL;
+ }
+
+ /* Report every super on 'super_list' that belongs to family 'family_num' */
+ static void show_conflicts(__u32 family_num, struct intel_super *super_list)
+ {
+ 	struct intel_super *cur;
+
+ 	for (cur = super_list; cur; cur = cur->next) {
+ 		if (cur->anchor->family_num == family_num) {
+ 			pr_err("Conflict, offlining family %#x on '%s'\n",
+ 			       __le32_to_cpu(family_num), cur->disks->devname);
+ 		}
+ 	}
+ }
+ /* imsm_thunderdome - choose the single super (the "champion") that will
+ * represent the container out of all supers on *super_list.
+ *
+ * The list is first reduced to one candidate per family by
+ * __prep_thunderdome(), then each candidate is checked with
+ * validate_members(). The first surviving candidate that has raid devices
+ * wins; a candidate without raid devices is only used as a fallback.
+ * The head dl of every other super is then either moved onto the champion
+ * or marked with index -2 and left where it is, and the champion is
+ * unlinked from *super_list.
+ *
+ * Returns the champion, or NULL when no candidate survived. 'len' sizes
+ * the internal candidate table.
+ */
+ static struct intel_super *
+ imsm_thunderdome(struct intel_super **super_list, int len)
+ {
+ struct intel_super *super_table[len]; /* NOTE(review): VLA sized by 'len'; presumably len > 0 here - confirm */
+ struct intel_disk *disk_list = NULL;
+ struct intel_super *champion, *spare;
+ struct intel_super *s, **del;
+ int tbl_size = 0;
+ int conflict;
+ int i;
+
+ memset(super_table, 0, sizeof(super_table));
+ for (s = *super_list; s; s = s->next)
+ tbl_size = __prep_thunderdome(super_table, tbl_size, s, &disk_list);
+
+ for (i = 0; i < tbl_size; i++) {
+ struct imsm_disk *d;
+ struct intel_disk *idisk;
+ struct imsm_super *mpb = super_table[i]->anchor;
+
+ s = super_table[i];
+ d = &s->disks->disk;
+
+ /* 'd' must appear in merged disk list for its
+ * configuration to be valid
+ */
+ idisk = disk_list_get(d->serial, disk_list);
+ if (idisk && idisk->owner == i)
+ s = validate_members(s, disk_list, i);
+ else
+ s = NULL;
+
+ if (!s)
+ dprintf("marking family: %#x from %d:%d offline\n",
+ mpb->family_num,
+ super_table[i]->disks->major,
+ super_table[i]->disks->minor);
+ super_table[i] = s; /* NULL marks the family as offline */
+ }
+
+ /* This is where the mdadm implementation differs from the Windows
+ * driver which has no strict concept of a container. We can only
+ * assemble one family from a container, so when returning a prodigal
+ * array member to this system the code will not be able to disambiguate
+ * the container contents that should be assembled ("foreign" versus
+ * "local"). It requires user intervention to set the orig_family_num
+ * to a new value to establish a new container. The Windows driver in
+ * this situation fixes up the volume name in place and manages the
+ * foreign array as an independent entity.
+ */
+ s = NULL;
+ spare = NULL;
+ conflict = 0;
+ for (i = 0; i < tbl_size; i++) {
+ struct intel_super *tbl_ent = super_table[i];
+ int is_spare = 0;
+
+ if (!tbl_ent)
+ continue;
+
+ if (tbl_ent->anchor->num_raid_devs == 0) {
+ spare = tbl_ent;
+ is_spare = 1;
+ }
+
+ if (s && !is_spare) {
+ show_conflicts(tbl_ent->anchor->family_num, *super_list);
+ conflict++;
+ } else if (!s && !is_spare)
+ s = tbl_ent;
+ }
+
+ if (!s)
+ s = spare;
+ if (!s) {
+ champion = NULL;
+ goto out;
+ }
+ champion = s;
+
+ if (conflict)
+ pr_err("Chose family %#x on '%s', assemble conflicts to new container with '--update=uuid'\n",
+ __le32_to_cpu(s->anchor->family_num), s->disks->devname);
+
+ /* collect all dl's onto 'champion', and update them to
+ * champion's version of the status
+ */
+ for (s = *super_list; s; s = s->next) {
+ struct imsm_super *mpb = champion->anchor;
+ struct dl *dl = s->disks;
+
+ if (s == champion)
+ continue;
+
+ mpb->attributes |= s->anchor->attributes & MPB_ATTRIB_2TB_DISK;
+
+ for (i = 0; i < mpb->num_disks; i++) {
+ struct imsm_disk *disk;
+
+ disk = __serial_to_disk(dl->serial, mpb, &dl->index);
+ if (disk) {
+ dl->disk = *disk;
+ /* only set index on disks that are a member of
+ * a populated container, i.e. one with
+ * raid_devs
+ */
+ if (is_failed(&dl->disk))
+ dl->index = -2;
+ else if (is_spare(&dl->disk))
+ dl->index = -1;
+ break;
+ }
+ }
+
+ if (i >= mpb->num_disks) { /* serial not present in the champion's mpb */
+ struct intel_disk *idisk;
+
+ idisk = disk_list_get(dl->serial, disk_list);
+ if (idisk && is_spare(&idisk->disk) &&
+ !is_failed(&idisk->disk) && !is_configured(&idisk->disk))
+ dl->index = -1;
+ else {
+ dl->index = -2;
+ continue;
+ }
+ }
+
+ dl->next = champion->disks;
+ champion->disks = dl;
+ s->disks = NULL;
+ }
+
+ /* delete 'champion' from super_list */
+ for (del = super_list; *del; ) {
+ if (*del == champion) {
+ *del = (*del)->next;
+ break;
+ } else
+ del = &(*del)->next;
+ }
+ champion->next = NULL;
+
+ out:
+ while (disk_list) {
+ struct intel_disk *idisk = disk_list;
+
+ disk_list = disk_list->next;
+ free(idisk);
+ }
+
+ return champion;
+ }
+
+static int
+get_sra_super_block(int fd, struct intel_super **super_list, char *devname, int *max, int keep_fd);
+static int get_super_block(struct intel_super **super_list, char *devnm, char *devname,
+ int major, int minor, int keep_fd);
+static int
+get_devlist_super_block(struct md_list *devlist, struct intel_super **super_list,
+ int *max, int keep_fd);
+
+ /* load_super_imsm_all - load the metadata of all container members and
+  * merge it into a single super.
+  *
+  * The member supers are read through 'fd' when it is a valid (opened
+  * container) descriptor, otherwise from 'devlist'. On success the merged
+  * super is stored in *sbp, st->container_devnm is filled in and st->ss,
+  * st->minor_version and st->max_devs are initialised if st->ss was unset.
+  *
+  * Returns 0 on success. Non-zero on failure: a code from the loading
+  * helpers, 1 when no usable super was found, 2 when find_missing() failed,
+  * 4 when the migration record could not be loaded, 5 on an unsupported
+  * migration. Nothing is stored in *sbp on failure.
+  */
+ static int load_super_imsm_all(struct supertype *st, int fd, void **sbp,
+ 			       char *devname, struct md_list *devlist,
+ 			       int keep_fd)
+ {
+ 	struct intel_super *super_list = NULL;
+ 	struct intel_super *super = NULL;
+ 	int err = 0;
+ 	int i = 0;
+
+ 	if (is_fd_valid(fd))
+ 		/* 'fd' is an opened container */
+ 		err = get_sra_super_block(fd, &super_list, devname, &i, keep_fd);
+ 	else
+ 		/* get super block from devlist devices */
+ 		err = get_devlist_super_block(devlist, &super_list, &i, keep_fd);
+ 	if (err)
+ 		goto error;
+ 	/* all mpbs enter, maybe one leaves */
+ 	super = imsm_thunderdome(&super_list, i);
+ 	if (!super) {
+ 		err = 1;
+ 		goto error;
+ 	}
+
+ 	if (find_missing(super) != 0) {
+ 		err = 2;
+ 		goto error;
+ 	}
+
+ 	/* load migration record */
+ 	err = load_imsm_migr_rec(super);
+ 	if (err == -1) {
+ 		/* migration is in progress,
+ 		 * but migr_rec cannot be loaded,
+ 		 */
+ 		err = 4;
+ 		goto error;
+ 	}
+
+ 	/* Check migration compatibility */
+ 	if (err == 0 && check_mpb_migr_compatibility(super) != 0) {
+ 		pr_err("Unsupported migration detected");
+ 		if (devname)
+ 			fprintf(stderr, " on %s\n", devname);
+ 		else
+ 			fprintf(stderr, " (IMSM).\n");
+
+ 		err = 5;
+ 		goto error;
+ 	}
+
+ 	err = 0;
+
+  error:
+ 	while (super_list) {
+ 		struct intel_super *s = super_list;
+
+ 		super_list = super_list->next;
+ 		free_imsm(s);
+ 	}
+
+ 	if (err) {
+ 		/* the champion was unlinked from super_list by
+ 		 * imsm_thunderdome(), so it has to be released separately
+ 		 * on every failure path or it is leaked
+ 		 */
+ 		if (super)
+ 			free_imsm(super);
+ 		return err;
+ 	}
+
+ 	*sbp = super;
+ 	if (is_fd_valid(fd))
+ 		strcpy(st->container_devnm, fd2devnm(fd));
+ 	else
+ 		st->container_devnm[0] = 0;
+ 	if (st->ss == NULL) {
+ 		st->ss = &super_imsm;
+ 		st->minor_version = 0;
+ 		st->max_devs = IMSM_MAX_DEVICES;
+ 	}
+ 	return 0;
+ }
+ /* get_devlist_super_block - load a super for every entry of 'devlist'
+ * whose 'used' field is 1 and add it to *super_list.
+ *
+ * Entries flagged as container are opened and read through
+ * get_sra_super_block(); all others are loaded by device number with
+ * get_super_block(). *max receives the running count of devices handled.
+ *
+ * Returns 0 on success, 8 when a container cannot be opened, 7 when
+ * loading from a container fails, 6 when loading a single device fails.
+ */
+ static int
+ get_devlist_super_block(struct md_list *devlist, struct intel_super **super_list,
+ int *max, int keep_fd)
+ {
+ struct md_list *tmpdev;
+ int err = 0;
+ int i = 0;
+
+ for (i = 0, tmpdev = devlist; tmpdev; tmpdev = tmpdev->next) {
+ if (tmpdev->used != 1)
+ continue;
+ if (tmpdev->container == 1) {
+ int lmax = 0;
+ int fd = dev_open(tmpdev->devname, O_RDONLY|O_EXCL);
+ if (!is_fd_valid(fd)) {
+ pr_err("cannot open device %s: %s\n",
+ tmpdev->devname, strerror(errno));
+ err = 8;
+ goto error;
+ }
+ err = get_sra_super_block(fd, super_list,
+ tmpdev->devname, &lmax,
+ keep_fd);
+ i += lmax; /* lmax is the count reported by get_sra_super_block() */
+ close(fd);
+ if (err) {
+ err = 7;
+ goto error;
+ }
+ } else {
+ int major = major(tmpdev->st_rdev);
+ int minor = minor(tmpdev->st_rdev);
+ err = get_super_block(super_list,
+ NULL,
+ tmpdev->devname,
+ major, minor,
+ keep_fd);
+ i++;
+ if (err) {
+ err = 6;
+ goto error;
+ }
+ }
+ }
+ error:
+ *max = i;
+ return err;
+ }
+ /* get_super_block - load the metadata of the device major:minor into a
+ * freshly allocated super and push it onto the head of *super_list.
+ *
+ * When loading returns 3 and mdmon is running for 'devnm', the load is
+ * retried up to three times with a 3000 microsecond pause before each try.
+ * The local descriptor is closed on failure and whenever keep_fd is zero.
+ *
+ * Returns 0 on success; 1 if no super could be allocated, 2 if the device
+ * cannot be opened or its sector size cannot be read, otherwise the code
+ * from load_and_parse_mpb(). On failure nothing is added to *super_list.
+ */
+ static int get_super_block(struct intel_super **super_list, char *devnm, char *devname,
+ int major, int minor, int keep_fd)
+ {
+ struct intel_super *s;
+ char nm[32];
+ int dfd = -1;
+ int err = 0;
+ int retry;
+
+ s = alloc_super();
+ if (!s) {
+ err = 1;
+ goto error;
+ }
+
+ sprintf(nm, "%d:%d", major, minor);
+ dfd = dev_open(nm, O_RDWR);
+ if (!is_fd_valid(dfd)) {
+ err = 2;
+ goto error;
+ }
+
+ if (!get_dev_sector_size(dfd, NULL, &s->sector_size)) {
+ err = 2;
+ goto error;
+ }
+ find_intel_hba_capability(dfd, s, devname); /* return value is not checked here */
+ err = load_and_parse_mpb(dfd, s, NULL, keep_fd);
+
+ /* retry the load if we might have raced against mdmon */
+ if (err == 3 && devnm && mdmon_running(devnm))
+ for (retry = 0; retry < 3; retry++) {
+ usleep(3000);
+ err = load_and_parse_mpb(dfd, s, NULL, keep_fd);
+ if (err != 3)
+ break;
+ }
+ error:
+ if (!err) {
+ s->next = *super_list;
+ *super_list = s;
+ } else {
+ if (s)
+ free_imsm(s);
+ close_fd(&dfd);
+ }
+ if (!keep_fd)
+ close_fd(&dfd);
+ return err;
+
+ }
+
+ /*
+  * Load a super for every member device of the container opened at 'fd'
+  * and add it to *super_list.
+  * Returns 1 (leaving *max untouched) if the sysfs data cannot be read.
+  * Otherwise *max receives the number of members loaded successfully and
+  * the result is 0 on success, 1 if the array is not an external "imsm"
+  * one, or 7 if loading a member failed.
+  */
+ static int
+ get_sra_super_block(int fd, struct intel_super **super_list, char *devname, int *max, int keep_fd)
+ {
+ 	struct mdinfo *sra = sysfs_read(fd, NULL,
+ 					GET_LEVEL|GET_VERSION|GET_DEVS|GET_STATE);
+ 	struct mdinfo *member;
+ 	int loaded = 0;
+ 	int rc = 1;
+
+ 	if (!sra)
+ 		return 1;
+
+ 	if (sra->array.major_version == -1 &&
+ 	    sra->array.minor_version == -2 &&
+ 	    strcmp(sra->text_version, "imsm") == 0) {
+ 		char *devnm = fd2devnm(fd);
+
+ 		/* load all mpbs */
+ 		rc = 0;
+ 		for (member = sra->devs; member; member = member->next) {
+ 			if (get_super_block(super_list, devnm, devname,
+ 					    member->disk.major,
+ 					    member->disk.minor, keep_fd) != 0) {
+ 				rc = 7;
+ 				break;
+ 			}
+ 			loaded++;
+ 		}
+ 	}
+
+ 	sysfs_free(sra);
+ 	*max = loaded;
+ 	return rc;
+ }
+
+ /*
+  * Load the container opened at 'fd' into st->sb, without a device list
+  * and with keep_fd set.
+  */
+ static int load_container_imsm(struct supertype *st, int fd, char *devname)
+ {
+ 	const int keep_fd = 1;
+
+ 	return load_super_imsm_all(st, fd, &st->sb, devname, NULL, keep_fd);
+ }
+ /* load_super_imsm - load the IMSM metadata of the single device 'fd' into
+ * a new super and attach it to 'st', replacing any super already there.
+ *
+ * Returns:
+ * 0 : success
+ * 1 : 'fd' is a partition, allocation failed, or the sector size could
+ * not be read
+ * 2 : platform capability check failed and st->ignore_hw_compat is 0
+ * 3 : unsupported migration detected (the super stays attached to 'st')
+ * otherwise the non-zero code returned by load_and_parse_mpb()
+ */
+ static int load_super_imsm(struct supertype *st, int fd, char *devname)
+ {
+ struct intel_super *super;
+ int rv;
+ int retry;
+
+ if (test_partition(fd))
+ /* IMSM not allowed on partitions */
+ return 1;
+
+ free_super_imsm(st);
+
+ super = alloc_super();
+ if (!super)
+ return 1;
+
+ if (!get_dev_sector_size(fd, NULL, &super->sector_size)) {
+ free_imsm(super);
+ return 1;
+ }
+ /* Load hba and capabilities if they exist.
+ * But do not preclude loading metadata in case capabilities or hba are
+ * non-compliant and ignore_hw_compat is set.
+ */
+ rv = find_intel_hba_capability(fd, super, devname);
+ /* no orom/efi or non-intel hba of the disk */
+ if (rv != 0 && st->ignore_hw_compat == 0) {
+ if (devname)
+ pr_err("No OROM/EFI properties for %s\n", devname);
+ free_imsm(super);
+ return 2;
+ }
+ rv = load_and_parse_mpb(fd, super, devname, 0);
+
+ /* retry the load if we might have raced against mdmon */
+ if (rv == 3) {
+ struct mdstat_ent *mdstat = NULL;
+ char *name = fd2kname(fd);
+
+ if (name)
+ mdstat = mdstat_by_component(name);
+
+ if (mdstat && mdmon_running(mdstat->devnm) && getpid() != mdmon_pid(mdstat->devnm)) { /* skipped when this process is that mdmon */
+ for (retry = 0; retry < 3; retry++) {
+ usleep(3000);
+ rv = load_and_parse_mpb(fd, super, devname, 0);
+ if (rv != 3)
+ break;
+ }
+ }
+
+ free_mdstat(mdstat);
+ }
+
+ if (rv) {
+ if (devname)
+ pr_err("Failed to load all information sections on %s\n", devname);
+ free_imsm(super);
+ return rv;
+ }
+
+ st->sb = super;
+ if (st->ss == NULL) {
+ st->ss = &super_imsm;
+ st->minor_version = 0;
+ st->max_devs = IMSM_MAX_DEVICES;
+ }
+
+ /* load migration record */
+ if (load_imsm_migr_rec(super) == 0) {
+ /* Check for unsupported migration features */
+ if (check_mpb_migr_compatibility(super) != 0) {
+ pr_err("Unsupported migration detected");
+ if (devname)
+ fprintf(stderr, " on %s\n", devname);
+ else
+ fprintf(stderr, " (IMSM).\n");
+ return 3; /* st->sb still points at 'super' on this path */
+ }
+ }
+
+ return 0;
+ }
+
+ /*
+  * Strip size for the array described by 'info': fixed at 128 for level 1,
+  * otherwise chunk_size shifted right by 9.
+  */
+ static __u16 info_to_blocks_per_strip(mdu_array_info_t *info)
+ {
+ 	return info->level == 1 ? 128 : info->chunk_size >> 9;
+ }
+
+ /*
+  * Per-member block count: twice 'size', and for every level other than 1
+  * masked with ~(blocks per strip - 1).
+  * NOTE(review): presumably 'size' is in KiB and the result in 512-byte
+  * blocks - confirm.
+  */
+ static unsigned long long info_to_blocks_per_member(mdu_array_info_t *info,
+ 						    unsigned long long size)
+ {
+ 	unsigned long long blocks = size * 2;
+
+ 	if (info->level != 1)
+ 		blocks &= ~(info_to_blocks_per_strip(info) - 1);
+
+ 	return blocks;
+ }
+ /* imsm_update_version_info - refresh mpb->attributes and the version
+ * string stored in mpb->sig right after MPB_SIGNATURE.
+ *
+ * Runs once per raid device; each pass rewrites the version string, so the
+ * value left behind is the one computed for the last device. With more
+ * than one raid device, or attributes other than plain
+ * MPB_ATTRIB_CHECKSUM_VERIFY, the attribute based version is used and the
+ * raid level attribute bit is set. Otherwise a version is picked from the
+ * member count, device status and raid level.
+ */
+ static void imsm_update_version_info(struct intel_super *super)
+ {
+ /* update the version and attributes */
+ struct imsm_super *mpb = super->anchor;
+ char *version;
+ struct imsm_dev *dev;
+ struct imsm_map *map;
+ int i;
+
+ for (i = 0; i < mpb->num_raid_devs; i++) {
+ dev = get_imsm_dev(super, i);
+ map = get_imsm_map(dev, MAP_0);
+ if (__le32_to_cpu(dev->size_high) > 0)
+ mpb->attributes |= MPB_ATTRIB_2TB;
+
+ /* FIXME detect when an array spans a port multiplier */
+ #if 0
+ mpb->attributes |= MPB_ATTRIB_PM;
+ #endif
+
+ if (mpb->num_raid_devs > 1 ||
+ mpb->attributes != MPB_ATTRIB_CHECKSUM_VERIFY) {
+ version = MPB_VERSION_ATTRIBS;
+ switch (get_imsm_raid_level(map)) {
+ case 0: mpb->attributes |= MPB_ATTRIB_RAID0; break;
+ case 1: mpb->attributes |= MPB_ATTRIB_RAID1; break;
+ case 10: mpb->attributes |= MPB_ATTRIB_RAID10; break;
+ case 5: mpb->attributes |= MPB_ATTRIB_RAID5; break;
+ }
+ } else {
+ if (map->num_members >= 5)
+ version = MPB_VERSION_5OR6_DISK_ARRAY;
+ else if (dev->status == DEV_CLONE_N_GO)
+ version = MPB_VERSION_CNG;
+ else if (get_imsm_raid_level(map) == 5)
+ version = MPB_VERSION_RAID5;
+ else if (map->num_members >= 3)
+ version = MPB_VERSION_3OR4_DISK_ARRAY;
+ else if (get_imsm_raid_level(map) == 1)
+ version = MPB_VERSION_RAID1;
+ else
+ version = MPB_VERSION_RAID0;
+ }
+ strcpy(((char *) mpb->sig) + strlen(MPB_SIGNATURE), version); /* version text follows the signature inside sig */
+ }
+ }
+
+ /* check_name - validate a new volume name for the container 'super'
+  *
+  * Leading and trailing white space is removed from 'name' in place. The
+  * name is rejected when it is empty, longer than MAX_RAID_SERIAL_LEN, or
+  * already used by a volume recorded in 'super'. The reason is printed
+  * unless 'quiet' is set.
+  *
+  * Returns 1 if the name is acceptable, 0 otherwise.
+  */
+ static int check_name(struct intel_super *super, char *name, int quiet)
+ {
+ 	struct imsm_super *mpb = super->anchor;
+ 	char *reason = NULL;
+ 	char *start = name;
+ 	size_t len = strlen(name);
+ 	int i;
+
+ 	/* Trim trailing white space. The 'len > 0' test stops the scan at
+ 	 * the start of the buffer; without it a name made only of white
+ 	 * space would be read at index -1.
+ 	 */
+ 	while (len > 0 && isspace((unsigned char) start[len - 1]))
+ 		start[--len] = 0;
+ 	/* Trim leading white space; this stops at the terminator at the latest */
+ 	while (*start && isspace((unsigned char) *start))
+ 		++start, --len;
+ 	memmove(name, start, len + 1);
+
+ 	if (len > MAX_RAID_SERIAL_LEN)
+ 		reason = "must be 16 characters or less";
+ 	else if (len == 0)
+ 		reason = "must be a non-empty string";
+
+ 	for (i = 0; i < mpb->num_raid_devs; i++) {
+ 		struct imsm_dev *dev = get_imsm_dev(super, i);
+
+ 		if (strncmp((char *) dev->volume, name, MAX_RAID_SERIAL_LEN) == 0) {
+ 			reason = "already exists";
+ 			break;
+ 		}
+ 	}
+
+ 	if (reason && !quiet)
+ 		pr_err("imsm volume name %s\n", reason);
+
+ 	return !reason;
+ }
+
+ /* init_super_imsm_volume - describe a new volume inside the container
+  * already attached to 'st'.
+  *
+  * Grows the mpb buffer if the requested number of disks needs more room,
+  * sets up dummy 'missing' disks for the first volume (or verifies there
+  * are enough of them for later volumes), validates 'name' and links a new
+  * imsm_dev onto super->devlist.
+  *
+  * Returns 1 on success and 0 on failure. On failure 'super' stays owned
+  * by 'st' and is never freed here.
+  */
+ static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info,
+ 				  struct shape *s, char *name,
+ 				  char *homehost, int *uuid,
+ 				  long long data_offset)
+ {
+ 	/* We are creating a volume inside a pre-existing container.
+ 	 * so st->sb is already set.
+ 	 */
+ 	struct intel_super *super = st->sb;
+ 	unsigned int sector_size = super->sector_size;
+ 	struct imsm_super *mpb = super->anchor;
+ 	struct intel_dev *dv;
+ 	struct imsm_dev *dev;
+ 	struct imsm_vol *vol;
+ 	struct imsm_map *map;
+ 	int idx = mpb->num_raid_devs;
+ 	int i;
+ 	int namelen;
+ 	unsigned long long array_blocks;
+ 	size_t size_old, size_new;
+ 	unsigned int data_disks;
+ 	unsigned long long size_per_member;
+
+ 	if (super->orom && mpb->num_raid_devs >= super->orom->vpa) {
+ 		pr_err("This imsm-container already has the maximum of %d volumes\n", super->orom->vpa);
+ 		return 0;
+ 	}
+
+ 	/* ensure the mpb is large enough for the new data */
+ 	size_old = __le32_to_cpu(mpb->mpb_size);
+ 	size_new = disks_to_mpb_size(info->nr_disks);
+ 	if (size_new > size_old) {
+ 		void *mpb_new;
+ 		void *migr_rec_new;
+ 		size_t size_round = ROUND_UP(size_new, sector_size);
+
+ 		if (posix_memalign(&mpb_new, sector_size, size_round) != 0) {
+ 			pr_err("could not allocate new mpb\n");
+ 			return 0;
+ 		}
+ 		if (posix_memalign(&migr_rec_new, MAX_SECTOR_SIZE,
+ 				   MIGR_REC_BUF_SECTORS*
+ 				   MAX_SECTOR_SIZE) != 0) {
+ 			pr_err("could not allocate migr_rec buffer\n");
+ 			/* st->sb still refers to 'super', so it must stay
+ 			 * intact; only the new mpb is ours to release
+ 			 */
+ 			free(mpb_new);
+ 			return 0;
+ 		}
+ 		/* swap in the new buffer only once both allocations have
+ 		 * succeeded, releasing the one it replaces
+ 		 */
+ 		free(super->migr_rec_buf);
+ 		super->migr_rec_buf = migr_rec_new;
+ 		memcpy(mpb_new, mpb, size_old);
+ 		free(mpb);
+ 		mpb = mpb_new;
+ 		super->anchor = mpb_new;
+ 		mpb->mpb_size = __cpu_to_le32(size_new);
+ 		memset((char *) mpb_new + size_old, 0, size_round - size_old);
+ 		super->len = size_round;
+ 	}
+ 	super->current_vol = idx;
+
+ 	/* handle 'failed_disks' by either:
+ 	 * a) create dummy disk entries in the table if this the first
+ 	 *    volume in the array.  We add them here as this is the only
+ 	 *    opportunity to add them. add_to_super_imsm_volume()
+ 	 *    handles the non-failed disks and continues incrementing
+ 	 *    mpb->num_disks.
+ 	 * b) validate that 'failed_disks' matches the current number
+ 	 *    of missing disks if the container is populated
+ 	 */
+ 	if (super->current_vol == 0) {
+ 		mpb->num_disks = 0;
+ 		for (i = 0; i < info->failed_disks; i++) {
+ 			struct imsm_disk *disk;
+
+ 			mpb->num_disks++;
+ 			disk = __get_imsm_disk(mpb, i);
+ 			disk->status = CONFIGURED_DISK | FAILED_DISK;
+ 			disk->scsi_id = __cpu_to_le32(~(__u32)0);
+ 			snprintf((char *) disk->serial, MAX_RAID_SERIAL_LEN,
+ 				 "missing:%d", (__u8)i);
+ 		}
+ 		find_missing(super);
+ 	} else {
+ 		int missing = 0;
+ 		struct dl *d;
+
+ 		for (d = super->missing; d; d = d->next)
+ 			missing++;
+ 		if (info->failed_disks > missing) {
+ 			pr_err("unable to add 'missing' disk to container\n");
+ 			return 0;
+ 		}
+ 	}
+
+ 	if (!check_name(super, name, 0))
+ 		return 0;
+ 	dv = xmalloc(sizeof(*dv));
+ 	dev = xcalloc(1, sizeof(*dev) + sizeof(__u32) * (info->raid_disks - 1));
+ 	/*
+ 	 * Explicitly allow truncating to not confuse gcc's
+ 	 * -Werror=stringop-truncation
+ 	 */
+ 	namelen = min((int) strlen(name), MAX_RAID_SERIAL_LEN);
+ 	memcpy(dev->volume, name, namelen);
+ 	array_blocks = calc_array_size(info->level, info->raid_disks,
+ 				       info->layout, info->chunk_size,
+ 				       s->size * BLOCKS_PER_KB);
+ 	data_disks = get_data_disks(info->level, info->layout,
+ 				    info->raid_disks);
+ 	array_blocks = round_size_to_mb(array_blocks, data_disks);
+ 	size_per_member = array_blocks / data_disks;
+
+ 	set_imsm_dev_size(dev, array_blocks);
+ 	dev->status = (DEV_READ_COALESCING | DEV_WRITE_COALESCING);
+ 	vol = &dev->vol;
+ 	vol->migr_state = 0;
+ 	set_migr_type(dev, MIGR_INIT);
+ 	vol->dirty = !info->state;
+ 	set_vol_curr_migr_unit(dev, 0);
+ 	map = get_imsm_map(dev, MAP_0);
+ 	set_pba_of_lba0(map, super->create_offset);
+ 	map->blocks_per_strip = __cpu_to_le16(info_to_blocks_per_strip(info));
+ 	map->failed_disk_num = ~0;
+ 	if (info->level > 0)
+ 		map->map_state = (info->state ? IMSM_T_STATE_NORMAL
+ 				  : IMSM_T_STATE_UNINITIALIZED);
+ 	else
+ 		map->map_state = info->failed_disks ? IMSM_T_STATE_FAILED :
+ 						      IMSM_T_STATE_NORMAL;
+ 	map->ddf = 1;
+
+ 	if (info->level == 1 && info->raid_disks > 2) {
+ 		free(dev);
+ 		free(dv);
+ 		pr_err("imsm does not support more than 2 disks in a raid1 volume\n");
+ 		return 0;
+ 	}
+
+ 	/* level 10 is recorded as raid level 1 in the map */
+ 	map->raid_level = info->level;
+ 	if (info->level == 10)
+ 		map->raid_level = 1;
+ 	set_num_domains(map);
+
+ 	size_per_member += NUM_BLOCKS_DIRTY_STRIPE_REGION;
+ 	set_blocks_per_member(map, info_to_blocks_per_member(info,
+ 							     size_per_member /
+ 							     BLOCKS_PER_KB));
+
+ 	map->num_members = info->raid_disks;
+ 	update_num_data_stripes(map, array_blocks);
+ 	for (i = 0; i < map->num_members; i++) {
+ 		/* initialized in add_to_super */
+ 		set_imsm_ord_tbl_ent(map, i, IMSM_ORD_REBUILD);
+ 	}
+ 	mpb->num_raid_devs++;
+ 	mpb->num_raid_devs_created++;
+ 	dev->my_vol_raid_dev_num = mpb->num_raid_devs_created;
+
+ 	if (s->consistency_policy <= CONSISTENCY_POLICY_RESYNC) {
+ 		dev->rwh_policy = RWH_MULTIPLE_OFF;
+ 	} else if (s->consistency_policy == CONSISTENCY_POLICY_PPL) {
+ 		dev->rwh_policy = RWH_MULTIPLE_DISTRIBUTED;
+ 	} else {
+ 		free(dev);
+ 		free(dv);
+ 		pr_err("imsm does not support consistency policy %s\n",
+ 		       map_num(consistency_policies, s->consistency_policy));
+ 		return 0;
+ 	}
+
+ 	dv->dev = dev;
+ 	dv->index = super->current_vol;
+ 	dv->next = super->devlist;
+ 	super->devlist = dv;
+
+ 	imsm_update_version_info(super);
+
+ 	return 1;
+ }
+
+ /* init_super_imsm - prepare IMSM metadata for a new array
+  *
+  * When 'st' already carries a super the request is handed over to
+  * init_super_imsm_volume(). Otherwise a new super with a zeroed mpb and a
+  * migration record buffer is allocated and attached to 'st'.
+  *
+  * Returns 1 on success and 0 on failure. 0 is also returned when 'info'
+  * is NULL (zeroing superblock), in which case the new super is still
+  * attached to 'st'.
+  */
+ static int init_super_imsm(struct supertype *st, mdu_array_info_t *info,
+ 			   struct shape *s, char *name,
+ 			   char *homehost, int *uuid,
+ 			   unsigned long long data_offset)
+ {
+ 	/* This is primarily called by Create when creating a new array.
+ 	 * We will then get add_to_super called for each component, and then
+ 	 * write_init_super called to write it out to each device.
+ 	 * For IMSM, Create can create on fresh devices or on a pre-existing
+ 	 * array.
+ 	 * To create on a pre-existing array a different method will be called.
+ 	 * This one is just for fresh drives.
+ 	 */
+ 	struct intel_super *super;
+ 	struct imsm_super *mpb;
+ 	size_t mpb_size;
+ 	char *version;
+
+ 	if (data_offset != INVALID_SECTORS) {
+ 		pr_err("data-offset not supported by imsm\n");
+ 		return 0;
+ 	}
+
+ 	if (st->sb)
+ 		return init_super_imsm_volume(st, info, s, name, homehost, uuid,
+ 					      data_offset);
+
+ 	if (info)
+ 		mpb_size = disks_to_mpb_size(info->nr_disks);
+ 	else
+ 		mpb_size = MAX_SECTOR_SIZE;
+
+ 	super = alloc_super();
+ 	if (super &&
+ 	    posix_memalign(&super->buf, MAX_SECTOR_SIZE, mpb_size) != 0) {
+ 		free_imsm(super);
+ 		super = NULL;
+ 	}
+ 	if (!super) {
+ 		pr_err("could not allocate superblock\n");
+ 		return 0;
+ 	}
+ 	if (posix_memalign(&super->migr_rec_buf, MAX_SECTOR_SIZE,
+ 			   MIGR_REC_BUF_SECTORS*MAX_SECTOR_SIZE) != 0) {
+ 		pr_err("could not allocate migr_rec buffer\n");
+ 		/* free_imsm() releases super->buf itself; freeing it here
+ 		 * as well would free the buffer twice
+ 		 */
+ 		free_imsm(super);
+ 		return 0;
+ 	}
+ 	memset(super->buf, 0, mpb_size);
+ 	mpb = super->buf;
+ 	mpb->mpb_size = __cpu_to_le32(mpb_size);
+ 	st->sb = super;
+
+ 	if (info == NULL) {
+ 		/* zeroing superblock */
+ 		return 0;
+ 	}
+
+ 	mpb->attributes = MPB_ATTRIB_CHECKSUM_VERIFY;
+
+ 	/* signature first, then the version text right behind it */
+ 	version = (char *) mpb->sig;
+ 	strcpy(version, MPB_SIGNATURE);
+ 	version += strlen(MPB_SIGNATURE);
+ 	strcpy(version, MPB_VERSION_RAID0);
+
+ 	return 1;
+ }
+
+ /*
+  * Check that the drive behind 'dl' has the same sector size as 'super'.
+  * Returns 1 when the sizes match; 0 when they differ, when dl->fd is not
+  * valid, or when the sector size cannot be read.
+  */
+ static int drive_validate_sector_size(struct intel_super *super, struct dl *dl)
+ {
+ 	unsigned int drive_sector_size;
+
+ 	if (!is_fd_valid(dl->fd)) {
+ 		pr_err("Invalid file descriptor for %s\n", dl->devname);
+ 		return 0;
+ 	}
+
+ 	if (!get_dev_sector_size(dl->fd, dl->devname, &drive_sector_size))
+ 		return 0;
+
+ 	return drive_sector_size == super->sector_size;
+ }
+ /* add_to_super_imsm_volume - place a container member into slot
+ * dk->raid_disk of the volume currently being created
+ * (super->current_vol).
+ *
+ * The member is looked up among super->disks: by its pre-marked raiddisk
+ * when 'fd' is not valid (autolayout), by major/minor otherwise. Spares
+ * and drives with a different sector size are refused. For the first
+ * volume the family number and creation time of the mpb are set as well.
+ *
+ * Returns 0 on success and 1 on failure.
+ */
+ static int add_to_super_imsm_volume(struct supertype *st, mdu_disk_info_t *dk,
+ int fd, char *devname)
+ {
+ struct intel_super *super = st->sb;
+ struct imsm_super *mpb = super->anchor;
+ struct imsm_disk *_disk;
+ struct imsm_dev *dev;
+ struct imsm_map *map;
+ struct dl *dl, *df;
+ int slot;
+
+ dev = get_imsm_dev(super, super->current_vol);
+ map = get_imsm_map(dev, MAP_0);
+
+ if (! (dk->state & (1<<MD_DISK_SYNC))) {
+ pr_err("%s: Cannot add spare devices to IMSM volume\n",
+ devname);
+ return 1;
+ }
+
+ if (!is_fd_valid(fd)) {
+ /* we're doing autolayout so grab the pre-marked (in
+ * validate_geometry) raid_disk
+ */
+ for (dl = super->disks; dl; dl = dl->next)
+ if (dl->raiddisk == dk->raid_disk)
+ break;
+ } else {
+ for (dl = super->disks; dl ; dl = dl->next)
+ if (dl->major == dk->major &&
+ dl->minor == dk->minor)
+ break;
+ }
+
+ if (!dl) {
+ pr_err("%s is not a member of the same container\n", devname);
+ return 1;
+ }
+
+ if (mpb->num_disks == 0) /* first disk: its sector size becomes the super's sector size */
+ if (!get_dev_sector_size(dl->fd, dl->devname,
+ &super->sector_size))
+ return 1;
+
+ if (!drive_validate_sector_size(super, dl)) {
+ pr_err("Combining drives of different sector size in one volume is not allowed\n");
+ return 1;
+ }
+
+ /* add a pristine spare to the metadata */
+ if (dl->index < 0) {
+ dl->index = super->anchor->num_disks;
+ super->anchor->num_disks++;
+ }
+ /* Check the device has not already been added */
+ slot = get_imsm_disk_slot(map, dl->index);
+ if (slot >= 0 &&
+ (get_imsm_ord_tbl_ent(dev, slot, MAP_X) & IMSM_ORD_REBUILD) == 0) {
+ pr_err("%s has been included in this array twice\n",
+ devname);
+ return 1;
+ }
+ set_imsm_ord_tbl_ent(map, dk->raid_disk, dl->index);
+ dl->disk.status = CONFIGURED_DISK;
+
+ /* update size of 'missing' disks to be at least as large as the
+ * largest active member (we only have dummy missing disks when
+ * creating the first volume)
+ */
+ if (super->current_vol == 0) {
+ for (df = super->missing; df; df = df->next) {
+ if (total_blocks(&dl->disk) > total_blocks(&df->disk))
+ set_total_blocks(&df->disk, total_blocks(&dl->disk));
+ _disk = __get_imsm_disk(mpb, df->index);
+ *_disk = df->disk;
+ }
+ }
+
+ /* refresh unset/failed slots to point to valid 'missing' entries */
+ for (df = super->missing; df; df = df->next)
+ for (slot = 0; slot < mpb->num_disks; slot++) {
+ __u32 ord = get_imsm_ord_tbl_ent(dev, slot, MAP_X);
+
+ if ((ord & IMSM_ORD_REBUILD) == 0)
+ continue;
+ set_imsm_ord_tbl_ent(map, slot, df->index | IMSM_ORD_REBUILD);
+ if (is_gen_migration(dev)) {
+ struct imsm_map *map2 = get_imsm_map(dev,
+ MAP_1);
+ int slot2 = get_imsm_disk_slot(map2, df->index);
+ if (slot2 < map2->num_members && slot2 >= 0) {
+ __u32 ord2 = get_imsm_ord_tbl_ent(dev,
+ slot2,
+ MAP_1);
+ if ((unsigned)df->index ==
+ ord_to_idx(ord2))
+ set_imsm_ord_tbl_ent(map2,
+ slot2,
+ df->index |
+ IMSM_ORD_REBUILD);
+ }
+ }
+ dprintf("set slot:%d to missing disk:%d\n", slot, df->index);
+ break; /* each missing disk claims at most one slot */
+ }
+
+ /* if we are creating the first raid device update the family number */
+ if (super->current_vol == 0) {
+ __u32 sum;
+ struct imsm_dev *_dev = __get_imsm_dev(mpb, 0);
+
+ _disk = __get_imsm_disk(mpb, dl->index);
+ if (!_dev || !_disk) {
+ pr_err("BUG mpb setup error\n");
+ return 1;
+ }
+ *_dev = *dev;
+ *_disk = dl->disk;
+ sum = random32();
+ sum += __gen_imsm_checksum(mpb);
+ mpb->family_num = __cpu_to_le32(sum);
+ mpb->orig_family_num = mpb->family_num;
+ mpb->creation_time = __cpu_to_le64((__u64)time(NULL));
+ }
+ super->current_disk = dl;
+ return 0;
+ }
+
+ /* mark_spare()
+  *	Marks a disk as spare (status SPARE_DISK, index -1) and restores the
+  *	disk serial in case it was previously marked as failed by a takeover
+  *	operation.
+  * returns:
+  *	-1 : critical error ('disk' is NULL)
+  *	 0 : disk is marked as spare but serial is not set
+  *	 1 : success
+  */
+ int mark_spare(struct dl *disk)
+ {
+ 	__u8 serial[MAX_RAID_SERIAL_LEN];
+ 	int have_serial;
+
+ 	if (disk == NULL)
+ 		return -1;
+
+ 	have_serial = imsm_read_serial(disk->fd, NULL, serial,
+ 				       MAX_RAID_SERIAL_LEN) == 0;
+ 	if (have_serial) {
+ 		/* Restore disk serial number, because takeover marks disk
+ 		 * as failed and adds to serial ':0' before it becomes
+ 		 * a spare disk.
+ 		 */
+ 		serialcpy(disk->serial, serial);
+ 		serialcpy(disk->disk.serial, serial);
+ 	}
+
+ 	disk->disk.status = SPARE_DISK;
+ 	disk->index = -1;
+
+ 	return have_serial ? 1 : 0;
+ }
+
+
+static int write_super_imsm_spare(struct intel_super *super, struct dl *d);
+
+ /* add_to_super_imsm()
+  * Add the device open on @fd to the IMSM container referenced by @st.
+  *
+  * When a volume is being set up (super->current_vol >= 0) the request is
+  * forwarded to add_to_super_imsm_volume().  Otherwise a new 'struct dl'
+  * is built for the device, its migration record area is zeroed, it is
+  * marked as a spare and then either queued on super->disk_mgmt_list
+  * (st->update_tail set) or linked into super->disks and written out as
+  * spare metadata right away.
+  *
+  * @st:          supertype whose ->sb is the intel_super to update
+  * @dk:          disk info, only passed on in the volume case
+  * @fd:          open descriptor of the device; stored in the new entry
+  * @devname:     device name used in messages, may be NULL
+  * @data_offset: not used by this function
+  *
+  * Returns 0 on success and 1 on failure.  The process is aborted when
+  * the serial number of the device cannot be read.
+  */
+ static int add_to_super_imsm(struct supertype *st, mdu_disk_info_t *dk,
+ 			     int fd, char *devname,
+ 			     unsigned long long data_offset)
+ {
+ 	struct intel_super *super = st->sb;
+ 	struct dl *dd;
+ 	unsigned long long size;
+ 	unsigned int member_sector_size;
+ 	__u32 id;
+ 	int rv;
+ 	struct stat stb;
+
+ 	/* If we are on an RAID enabled platform check that the disk is
+ 	 * attached to the raid controller.
+ 	 * We do not need to test disks attachment for container based additions,
+ 	 * they shall be already tested when container was created/assembled.
+ 	 */
+ 	rv = find_intel_hba_capability(fd, super, devname);
+ 	/* no orom/efi or non-intel hba of the disk */
+ 	if (rv != 0) {
+ 		dprintf("capability: %p fd: %d ret: %d\n",
+ 			super->orom, fd, rv);
+ 		return 1;
+ 	}
+
+ 	if (super->current_vol >= 0)
+ 		return add_to_super_imsm_volume(st, dk, fd, devname);
+
+ 	/* major/minor are taken from st_rdev, so a failed fstat() must not
+ 	 * be ignored: stb would be used uninitialised.
+ 	 */
+ 	if (fstat(fd, &stb) != 0) {
+ 		pr_err("fstat failed for %s: %s\n",
+ 		       devname ? devname : "device", strerror(errno));
+ 		return 1;
+ 	}
+
+ 	dd = xcalloc(1, sizeof(*dd));
+ 	dd->major = major(stb.st_rdev);
+ 	dd->minor = minor(stb.st_rdev);
+ 	dd->devname = devname ? xstrdup(devname) : NULL;
+ 	dd->fd = fd;
+ 	dd->e = NULL;
+ 	dd->action = DISK_ADD;
+ 	rv = imsm_read_serial(fd, devname, dd->serial, MAX_RAID_SERIAL_LEN);
+ 	if (rv) {
+ 		pr_err("failed to retrieve scsi serial, aborting\n");
+ 		__free_imsm_disk(dd, 0);
+ 		abort();
+ 	}
+
+ 	if (super->hba && ((super->hba->type == SYS_DEV_NVME) ||
+ 	    (super->hba->type == SYS_DEV_VMD))) {
+ 		int i;
+ 		char cntrl_path[PATH_MAX];
+ 		char *cntrl_name;
+ 		char pci_dev_path[PATH_MAX];
+
+ 		if (!diskfd_to_devpath(fd, 2, pci_dev_path) ||
+ 		    !diskfd_to_devpath(fd, 1, cntrl_path)) {
+ 			pr_err("failed to get dev paths, aborting\n");
+ 			__free_imsm_disk(dd, 0);
+ 			return 1;
+ 		}
+
+ 		cntrl_name = basename(cntrl_path);
+ 		if (is_multipath_nvme(fd))
+ 			pr_err("%s controller supports Multi-Path I/O, Intel (R) VROC does not support multipathing\n",
+ 			       cntrl_name);
+
+ 		if (devpath_to_vendor(pci_dev_path) == 0x8086) {
+ 			/*
+ 			 * If Intel's NVMe drive has serial ended with
+ 			 * "-A","-B","-1" or "-2" it means that this is "x8"
+ 			 * device (double drive on single PCIe card).
+ 			 * User should be warned about potential data loss.
+ 			 */
+ 			for (i = MAX_RAID_SERIAL_LEN-1; i > 0; i--) {
+ 				/* Skip empty character at the end */
+ 				if (dd->serial[i] == 0)
+ 					continue;
+
+ 				if (((dd->serial[i] == 'A') ||
+ 				     (dd->serial[i] == 'B') ||
+ 				     (dd->serial[i] == '1') ||
+ 				     (dd->serial[i] == '2')) &&
+ 				    (dd->serial[i-1] == '-'))
+ 					pr_err("\tThe action you are about to take may put your data at risk.\n"
+ 					       "\tPlease note that x8 devices may consist of two separate x4 devices "
+ 					       "located on a single PCIe port.\n"
+ 					       "\tRAID 0 is the only supported configuration for this type of x8 device.\n");
+ 				/* only the last non-empty character is examined */
+ 				break;
+ 			}
+ 		} else if (super->hba->type == SYS_DEV_VMD && super->orom &&
+ 			   !imsm_orom_has_tpv_support(super->orom)) {
+ 			pr_err("\tPlatform configuration does not support non-Intel NVMe drives.\n"
+ 			       "\tPlease refer to Intel(R) RSTe/VROC user guide.\n");
+ 			__free_imsm_disk(dd, 0);
+ 			return 1;
+ 		}
+ 	}
+
+ 	/* 'size' feeds the seek offset and the block count below, so it
+ 	 * must be valid before going any further.
+ 	 */
+ 	if (!get_dev_size(fd, NULL, &size) ||
+ 	    !get_dev_sector_size(fd, NULL, &member_sector_size)) {
+ 		__free_imsm_disk(dd, 0);
+ 		return 1;
+ 	}
+
+ 	if (super->sector_size == 0) {
+ 		/* this a first device, so sector_size is not set yet */
+ 		super->sector_size = member_sector_size;
+ 	}
+
+ 	/* clear migr_rec when adding disk to container */
+ 	memset(super->migr_rec_buf, 0, MIGR_REC_BUF_SECTORS*MAX_SECTOR_SIZE);
+ 	if (lseek64(fd, size - MIGR_REC_SECTOR_POSITION*member_sector_size,
+ 		    SEEK_SET) >= 0) {
+ 		if ((unsigned int)write(fd, super->migr_rec_buf,
+ 		    MIGR_REC_BUF_SECTORS*member_sector_size) !=
+ 		    MIGR_REC_BUF_SECTORS*member_sector_size)
+ 			perror("Write migr_rec failed");
+ 	}
+
+ 	/* from here on 'size' is counted in 512-byte blocks */
+ 	size /= 512;
+ 	serialcpy(dd->disk.serial, dd->serial);
+ 	set_total_blocks(&dd->disk, size);
+ 	if (__le32_to_cpu(dd->disk.total_blocks_hi) > 0) {
+ 		struct imsm_super *mpb = super->anchor;
+ 		mpb->attributes |= MPB_ATTRIB_2TB_DISK;
+ 	}
+ 	mark_spare(dd);
+ 	if (sysfs_disk_to_scsi_id(fd, &id) == 0)
+ 		dd->disk.scsi_id = __cpu_to_le32(id);
+ 	else
+ 		dd->disk.scsi_id = __cpu_to_le32(0);
+
+ 	if (st->update_tail) {
+ 		dd->next = super->disk_mgmt_list;
+ 		super->disk_mgmt_list = dd;
+ 	} else {
+ 		/* this is called outside of mdmon
+ 		 * write initial spare metadata
+ 		 * mdmon will overwrite it.
+ 		 */
+ 		dd->next = super->disks;
+ 		super->disks = dd;
+ 		write_super_imsm_spare(super, dd);
+ 	}
+
+ 	return 0;
+ }
+
+ /* Queue a DISK_REMOVE request for the device dk->major:dk->minor on
+  * super->disk_mgmt_list.
+  * Returns 0 when the request was queued, 1 when st->update_tail is not
+  * set.
+  */
+ static int remove_from_super_imsm(struct supertype *st, mdu_disk_info_t *dk)
+ {
+ 	struct intel_super *super = st->sb;
+ 	struct dl *request;
+
+ 	/* remove from super works only in mdmon - for communication
+ 	 * manager - monitor. Check if communication memory buffer
+ 	 * is prepared.
+ 	 */
+ 	if (st->update_tail == NULL) {
+ 		pr_err("shall be used in mdmon context only\n");
+ 		return 1;
+ 	}
+
+ 	request = xcalloc(1, sizeof(*request));
+ 	request->major = dk->major;
+ 	request->minor = dk->minor;
+ 	request->fd = -1;
+ 	mark_spare(request);
+ 	request->action = DISK_REMOVE;
+
+ 	/* push the request on the head of the pending list */
+ 	request->next = super->disk_mgmt_list;
+ 	super->disk_mgmt_list = request;
+
+ 	return 0;
+ }
+
+static int store_imsm_mpb(int fd, struct imsm_super *mpb);
+
+ /* Statically allocated, MAX_SECTOR_SIZE-aligned scratch record in which
+ * write_super_imsm_spare() assembles the metadata of a single spare.
+ * The union lets the same storage be addressed as raw bytes (buf) or as
+ * an imsm_super header (anchor). Being static, its contents persist
+ * from one call to the next.
+ */
+ static union {
+ char buf[MAX_SECTOR_SIZE];
+ struct imsm_super anchor;
+ } spare_record __attribute__ ((aligned(MAX_SECTOR_SIZE)));
+
+
+ /* Build a stand-alone metadata record for the spare @d in the static
+  * spare_record buffer and store it on the disk through d->fd.
+  * Returns 0 on success, 1 when @d is not a spare (index != -1) or the
+  * record could not be stored.
+  */
+ static int write_super_imsm_spare(struct intel_super *super, struct dl *d)
+ {
+ 	struct imsm_super *container = super->anchor;
+ 	struct imsm_super *rec = &spare_record.anchor;
+ 	__u32 csum;
+
+ 	if (d->index != -1)
+ 		return 1;
+
+ 	/* fixed header: one disk, no raid devices */
+ 	rec->mpb_size = __cpu_to_le32(sizeof(struct imsm_super));
+ 	rec->generation_num = __cpu_to_le32(1UL);
+ 	rec->attributes = MPB_ATTRIB_CHECKSUM_VERIFY;
+ 	rec->num_disks = 1;
+ 	rec->num_raid_devs = 0;
+ 	rec->cache_size = container->cache_size;
+ 	rec->pwr_cycle_count = __cpu_to_le32(1);
+
+ 	snprintf((char *) rec->sig, MAX_SIGNATURE_LENGTH,
+ 		 MPB_SIGNATURE MPB_VERSION_RAID0);
+
+ 	rec->disk[0] = d->disk;
+ 	if (__le32_to_cpu(d->disk.total_blocks_hi) > 0)
+ 		rec->attributes |= MPB_ATTRIB_2TB_DISK;
+
+ 	if (super->sector_size == 4096)
+ 		convert_to_4k_imsm_disk(&rec->disk[0]);
+
+ 	/* the family number of a spare is the checksum of its own record,
+ 	 * taken before the final checksum is computed
+ 	 */
+ 	csum = __gen_imsm_checksum(rec);
+ 	rec->family_num = __cpu_to_le32(csum);
+ 	rec->orig_family_num = 0;
+ 	csum = __gen_imsm_checksum(rec);
+ 	rec->check_sum = __cpu_to_le32(csum);
+
+ 	if (store_imsm_mpb(d->fd, rec) == 0)
+ 		return 0;
+
+ 	pr_err("failed for device %d:%d %s\n",
+ 	       d->major, d->minor, strerror(errno));
+ 	return 1;
+ }
+ /* Write a spare record to every spare (index == -1) on super->disks.
+  * Spare records have their own family number and do not have any
+  * defined raid devices.
+  * When @doclose is non-zero the fd of each written spare is closed.
+  * Returns 0 on success, 1 as soon as one spare cannot be written.
+  */
+ static int write_super_imsm_spares(struct intel_super *super, int doclose)
+ {
+ 	struct dl *spare;
+
+ 	for (spare = super->disks; spare; spare = spare->next) {
+ 		if (spare->index == -1) {
+ 			if (write_super_imsm_spare(super, spare) != 0)
+ 				return 1;
+
+ 			if (doclose)
+ 				close_fd(&spare->fd);
+ 		}
+ 	}
+
+ 	return 0;
+ }
+
+ /* write_super_imsm()
+ * Rebuild the on-disk metadata block (mpb) from the in-memory state and
+ * store it on every member disk that is neither a spare nor failed.
+ * Steps: bump the generation number, refill the disk table from
+ * super->disks and super->missing, copy each raid device, append the
+ * bad block log, recompute size and checksum, optionally zero the
+ * migration record, then write. Spares are handled afterwards by
+ * write_super_imsm_spares().
+ *
+ * @doclose: when non-zero, close the fd of each disk after writing it.
+ *
+ * Returns 0, or the result of write_super_imsm_spares() when at least one
+ * spare is present. A store_imsm_mpb() failure on an individual disk is
+ * only reported on stderr and does not change the return value.
+ */
+ static int write_super_imsm(struct supertype *st, int doclose)
+ {
+ struct intel_super *super = st->sb;
+ unsigned int sector_size = super->sector_size;
+ struct imsm_super *mpb = super->anchor;
+ struct dl *d;
+ __u32 generation;
+ __u32 sum;
+ int spares = 0;
+ int i;
+ /* size of the header without its disk table; disks are added below */
+ __u32 mpb_size = sizeof(struct imsm_super) - sizeof(struct imsm_disk);
+ int num_disks = 0;
+ int clear_migration_record = 1;
+ __u32 bbm_log_size;
+
+ /* 'generation' is incremented everytime the metadata is written */
+ generation = __le32_to_cpu(mpb->generation_num);
+ generation++;
+ mpb->generation_num = __cpu_to_le32(generation);
+
+ /* fix up cases where previous mdadm releases failed to set
+ * orig_family_num
+ */
+ if (mpb->orig_family_num == 0)
+ mpb->orig_family_num = mpb->family_num;
+
+ /* refill the disk table: spares are only counted, every other disk
+ * is copied into the slot given by its index
+ */
+ for (d = super->disks; d; d = d->next) {
+ if (d->index == -1)
+ spares++;
+ else {
+ mpb->disk[d->index] = d->disk;
+ num_disks++;
+ }
+ }
+ /* missing disks keep their slot in the table as well */
+ for (d = super->missing; d; d = d->next) {
+ mpb->disk[d->index] = d->disk;
+ num_disks++;
+ }
+ mpb->num_disks = num_disks;
+ mpb_size += sizeof(struct imsm_disk) * mpb->num_disks;
+
+ /* copy every raid device into the mpb; a volume that is in general
+ * migration keeps the migration record from being cleared
+ */
+ for (i = 0; i < mpb->num_raid_devs; i++) {
+ struct imsm_dev *dev = __get_imsm_dev(mpb, i);
+ struct imsm_dev *dev2 = get_imsm_dev(super, i);
+ if (dev && dev2) {
+ imsm_copy_dev(dev, dev2);
+ mpb_size += sizeof_imsm_dev(dev, 0);
+ }
+ /* NOTE(review): dev2 may be NULL here - presumably
+ * is_gen_migration() copes with that; confirm.
+ */
+ if (is_gen_migration(dev2))
+ clear_migration_record = 0;
+ }
+
+ bbm_log_size = get_imsm_bbm_log_size(super->bbm_log);
+
+ /* the bad block log, if any, is placed right after the raid devices */
+ if (bbm_log_size) {
+ memcpy((void *)mpb + mpb_size, super->bbm_log, bbm_log_size);
+ mpb->attributes |= MPB_ATTRIB_BBM;
+ } else
+ mpb->attributes &= ~MPB_ATTRIB_BBM;
+
+ super->anchor->bbm_log_size = __cpu_to_le32(bbm_log_size);
+ mpb_size += bbm_log_size;
+ mpb->mpb_size = __cpu_to_le32(mpb_size);
+
+ #ifdef DEBUG
+ assert(super->len == 0 || mpb_size <= super->len);
+ #endif
+
+ /* recalculate checksum */
+ sum = __gen_imsm_checksum(mpb);
+ mpb->check_sum = __cpu_to_le32(sum);
+
+ /* a pending one-shot request forces the clearing of the record */
+ if (super->clean_migration_record_by_mdmon) {
+ clear_migration_record = 1;
+ super->clean_migration_record_by_mdmon = 0;
+ }
+ if (clear_migration_record)
+ memset(super->migr_rec_buf, 0,
+ MIGR_REC_BUF_SECTORS*MAX_SECTOR_SIZE);
+
+ if (sector_size == 4096)
+ convert_to_4k(super);
+
+ /* write the mpb for disks that compose raid devices */
+ for (d = super->disks; d ; d = d->next) {
+ if (d->index < 0 || is_failed(&d->disk))
+ continue;
+
+ if (clear_migration_record) {
+ unsigned long long dsize;
+
+ /* NOTE(review): the result of get_dev_size() is not
+ * checked; if it fails dsize is presumably left
+ * uninitialised and the seek target is garbage.
+ */
+ get_dev_size(d->fd, NULL, &dsize);
+ /* overwrite the area starting sector_size bytes before dsize */
+ if (lseek64(d->fd, dsize - sector_size,
+ SEEK_SET) >= 0) {
+ if ((unsigned int)write(d->fd,
+ super->migr_rec_buf,
+ MIGR_REC_BUF_SECTORS*sector_size) !=
+ MIGR_REC_BUF_SECTORS*sector_size)
+ perror("Write migr_rec failed");
+ }
+ }
+
+ /* failure is reported but does not stop the remaining disks */
+ if (store_imsm_mpb(d->fd, mpb))
+ fprintf(stderr,
+ "failed for device %d:%d (fd: %d)%s\n",
+ d->major, d->minor,
+ d->fd, strerror(errno));
+
+ if (doclose)
+ close_fd(&d->fd);
+ }
+
+ if (spares)
+ return write_super_imsm_spares(super, doclose);
+
+ return 0;
+ }
+
+ /* create_array()
+  * Queue an update_create_array metadata update for the volume at
+  * @dev_idx.  The update carries a copy of the raid device followed by one
+  * disk_info entry (the serial number) per member of its map 0.
+  *
+  * Returns 0 when the update was queued, 1 when the volume or one of its
+  * member disks cannot be resolved; nothing is queued in that case.
+  */
+ static int create_array(struct supertype *st, int dev_idx)
+ {
+ 	size_t len;
+ 	struct imsm_update_create_array *u;
+ 	struct intel_super *super = st->sb;
+ 	struct imsm_dev *dev = get_imsm_dev(super, dev_idx);
+ 	struct imsm_map *map;
+ 	struct disk_info *inf;
+ 	struct imsm_disk *disk;
+ 	int i;
+
+ 	if (!dev) {
+ 		pr_err("BUG: no volume at index %d\n", dev_idx);
+ 		return 1;
+ 	}
+ 	map = get_imsm_map(dev, MAP_0);
+
+ 	/* the update embeds a variable-sized imsm_dev, so swap the static
+ 	 * size for the real one and append the per-member serial table
+ 	 */
+ 	len = sizeof(*u) - sizeof(*dev) + sizeof_imsm_dev(dev, 0) +
+ 	      sizeof(*inf) * map->num_members;
+ 	u = xmalloc(len);
+ 	u->type = update_create_array;
+ 	u->dev_idx = dev_idx;
+ 	imsm_copy_dev(&u->dev, dev);
+ 	inf = get_disk_info(u);
+ 	for (i = 0; i < map->num_members; i++) {
+ 		int idx = get_imsm_disk_idx(dev, i, MAP_X);
+
+ 		disk = get_imsm_disk(super, idx);
+ 		if (!disk)
+ 			disk = get_imsm_missing(super, idx);
+ 		if (!disk) {
+ 			/* neither present nor missing: do not dereference */
+ 			pr_err("BUG: cannot find disk %d of volume %d\n",
+ 			       idx, dev_idx);
+ 			free(u);
+ 			return 1;
+ 		}
+ 		serialcpy(inf[i].serial, disk->serial);
+ 	}
+ 	append_metadata_update(st, u, len);
+
+ 	return 0;
+ }
+
+ /* Queue an update_add_remove_disk metadata update when disk add/remove
+  * requests are pending on super->disk_mgmt_list.  Always returns 0.
+  */
+ static int mgmt_disk(struct supertype *st)
+ {
+ 	struct intel_super *super = st->sb;
+ 	struct imsm_update_add_remove_disk *update;
+ 	size_t update_len = sizeof(*update);
+
+ 	if (super->disk_mgmt_list) {
+ 		update = xmalloc(update_len);
+ 		update->type = update_add_remove_disk;
+ 		append_metadata_update(st, update, update_len);
+ 	}
+
+ 	return 0;
+ }
+
+__u32 crc32c_le(__u32 crc, unsigned char const *p, size_t len);
+
+ /* write_ppl_header()
+  * Finalise the checksum of the PPL header in @buf and write
+  * PPL_HEADER_SIZE bytes of it to @fd at @ppl_sector (counted in
+  * 512-byte units).
+  *
+  * Returns 0 on success or a negative errno value.  A short write is
+  * reported as -EIO, because write() does not set errno in that case.
+  */
+ static int write_ppl_header(unsigned long long ppl_sector, int fd, void *buf)
+ {
+ 	struct ppl_header *ppl_hdr = buf;
+ 	ssize_t written;
+ 	int ret;
+
+ 	ppl_hdr->checksum = __cpu_to_le32(~crc32c_le(~0, buf, PPL_HEADER_SIZE));
+
+ 	if (lseek64(fd, ppl_sector * 512, SEEK_SET) < 0) {
+ 		ret = -errno;
+ 		perror("Failed to seek to PPL header location");
+ 		return ret;
+ 	}
+
+ 	written = write(fd, buf, PPL_HEADER_SIZE);
+ 	if (written < 0) {
+ 		ret = -errno;
+ 		perror("Write PPL header failed");
+ 		return ret;
+ 	}
+ 	if (written != PPL_HEADER_SIZE) {
+ 		/* errno is stale here, do not derive the result from it */
+ 		pr_err("Write PPL header failed: short write\n");
+ 		return -EIO;
+ 	}
+
+ 	/* best effort, as before: a failing fsync() is not reported */
+ 	fsync(fd);
+
+ 	return 0;
+ }
+
+ /* Initialise the PPL area of one member disk: zero the whole area
+  * described by @info, then write a fresh header at its start.
+  * Returns 0 on success, the non-zero result of zero_disk_range() or
+  * write_ppl_header(), or the negated posix_memalign() error.
+  */
+ static int write_init_ppl_imsm(struct supertype *st, struct mdinfo *info, int fd)
+ {
+ 	struct intel_super *super = st->sb;
+ 	struct ppl_header *hdr;
+ 	void *hdr_buf;
+ 	int rc;
+
+ 	/* first clear entire ppl space */
+ 	rc = zero_disk_range(fd, info->ppl_sector, info->ppl_size);
+ 	if (rc != 0)
+ 		return rc;
+
+ 	rc = posix_memalign(&hdr_buf, MAX_SECTOR_SIZE, PPL_HEADER_SIZE);
+ 	if (rc != 0) {
+ 		pr_err("Failed to allocate PPL header buffer\n");
+ 		return -rc;
+ 	}
+
+ 	hdr = hdr_buf;
+ 	memset(hdr_buf, 0, PPL_HEADER_SIZE);
+ 	memset(hdr->reserved, 0xff, PPL_HDR_RESERVED);
+ 	hdr->signature = __cpu_to_le32(super->anchor->orig_family_num);
+
+ 	if (info->mismatch_cnt) {
+ 		/*
+ 		 * We are overwriting an invalid ppl. Make one entry with wrong
+ 		 * checksum to prevent the kernel from skipping resync.
+ 		 */
+ 		hdr->entries_count = __cpu_to_le32(1);
+ 		hdr->entries[0].checksum = ~0;
+ 	}
+
+ 	rc = write_ppl_header(info->ppl_sector, fd, hdr_buf);
+ 	free(hdr_buf);
+
+ 	return rc;
+ }
+
+static int is_rebuilding(struct imsm_dev *dev);
+
+ /* validate_ppl_imsm()
+ * Check the PPL header(s) stored on the member disk described by @disk
+ * and repair or re-initialise them where the volume state allows it.
+ *
+ * The PPL area is scanned header by header; for each one the checksum,
+ * the generation number and the signature are verified. While scanning,
+ * 'ret' is 1 until a valid header has been seen, 0 afterwards, and -1
+ * after a seek/read error.
+ *
+ * Returns 0 immediately when @disk is not an active member or its dl
+ * entry is unusable, -1 on allocation or I/O failure, otherwise the scan
+ * result or the result of write_init_ppl()/write_ppl_header() when the
+ * header gets rewritten. info->mismatch_cnt is incremented when no valid
+ * PPL was found and it cannot simply be re-initialised.
+ */
+ static int validate_ppl_imsm(struct supertype *st, struct mdinfo *info,
+ struct mdinfo *disk)
+ {
+ struct intel_super *super = st->sb;
+ struct dl *d;
+ void *buf_orig, *buf, *buf_prev = NULL;
+ int ret = 0;
+ struct ppl_header *ppl_hdr = NULL;
+ __u32 crc;
+ struct imsm_dev *dev;
+ __u32 idx;
+ unsigned int i;
+ unsigned long long ppl_offset = 0;
+ unsigned long long prev_gen_num = 0;
+
+ if (disk->disk.raid_disk < 0)
+ return 0;
+
+ dev = get_imsm_dev(super, info->container_member);
+ idx = get_imsm_disk_idx(dev, disk->disk.raid_disk, MAP_0);
+ d = get_imsm_dl_disk(super, idx);
+
+ if (!d || d->index < 0 || is_failed(&d->disk))
+ return 0;
+
+ /* room for two headers: the one being read and the last valid one */
+ if (posix_memalign(&buf_orig, MAX_SECTOR_SIZE, PPL_HEADER_SIZE * 2)) {
+ pr_err("Failed to allocate PPL header buffer\n");
+ return -1;
+ }
+ buf = buf_orig;
+
+ ret = 1;
+ while (ppl_offset < MULTIPLE_PPL_AREA_SIZE_IMSM) {
+ void *tmp;
+
+ dprintf("Checking potential PPL at offset: %llu\n", ppl_offset);
+
+ if (lseek64(d->fd, info->ppl_sector * 512 + ppl_offset,
+ SEEK_SET) < 0) {
+ perror("Failed to seek to PPL header location");
+ ret = -1;
+ break;
+ }
+
+ if (read(d->fd, buf, PPL_HEADER_SIZE) != PPL_HEADER_SIZE) {
+ perror("Read PPL header failed");
+ ret = -1;
+ break;
+ }
+
+ ppl_hdr = buf;
+
+ /* the checksum is computed with the checksum field zeroed */
+ crc = __le32_to_cpu(ppl_hdr->checksum);
+ ppl_hdr->checksum = 0;
+
+ if (crc != ~crc32c_le(~0, buf, PPL_HEADER_SIZE)) {
+ dprintf("Wrong PPL header checksum on %s\n",
+ d->devname);
+ break;
+ }
+
+ if (prev_gen_num > __le64_to_cpu(ppl_hdr->generation)) {
+ /* previous was newest, it was already checked */
+ break;
+ }
+
+ if ((__le32_to_cpu(ppl_hdr->signature) !=
+ super->anchor->orig_family_num)) {
+ dprintf("Wrong PPL header signature on %s\n",
+ d->devname);
+ ret = 1;
+ break;
+ }
+
+ ret = 0;
+ prev_gen_num = __le64_to_cpu(ppl_hdr->generation);
+
+ /* the next header follows this header and all of its entries */
+ ppl_offset += PPL_HEADER_SIZE;
+ for (i = 0; i < __le32_to_cpu(ppl_hdr->entries_count); i++)
+ ppl_offset +=
+ __le32_to_cpu(ppl_hdr->entries[i].pp_size);
+
+ /* swap the two halves of the buffer: the header just
+ * validated becomes buf_prev, the next read goes into the
+ * other half
+ */
+ if (!buf_prev)
+ buf_prev = buf + PPL_HEADER_SIZE;
+ tmp = buf_prev;
+ buf_prev = buf;
+ buf = tmp;
+ }
+
+ /* continue with the last header that passed all checks, if any */
+ if (buf_prev) {
+ buf = buf_prev;
+ ppl_hdr = buf_prev;
+ }
+
+ /*
+ * Update metadata to use multiple PPLs area (1MB).
+ * This is done once for all RAID members
+ */
+ if (info->consistency_policy == CONSISTENCY_POLICY_PPL &&
+ info->ppl_size != (MULTIPLE_PPL_AREA_SIZE_IMSM >> 9)) {
+ char subarray[20];
+ struct mdinfo *member_dev;
+
+ sprintf(subarray, "%d", info->container_member);
+
+ /* with mdmon running the change is queued as an update */
+ if (mdmon_running(st->container_devnm))
+ st->update_tail = &st->updates;
+
+ if (st->ss->update_subarray(st, subarray, "ppl", NULL)) {
+ pr_err("Failed to update subarray %s\n",
+ subarray);
+ } else {
+ if (st->update_tail)
+ flush_metadata_updates(st);
+ else
+ st->ss->sync_metadata(st);
+ info->ppl_size = (MULTIPLE_PPL_AREA_SIZE_IMSM >> 9);
+ for (member_dev = info->devs; member_dev;
+ member_dev = member_dev->next)
+ member_dev->ppl_size =
+ (MULTIPLE_PPL_AREA_SIZE_IMSM >> 9);
+ }
+ }
+
+ if (ret == 1) {
+ /* no valid PPL: re-initialise it when the volume state
+ * permits, otherwise record a mismatch
+ */
+ struct imsm_map *map = get_imsm_map(dev, MAP_X);
+
+ if (map->map_state == IMSM_T_STATE_UNINITIALIZED ||
+ (map->map_state == IMSM_T_STATE_NORMAL &&
+ !(dev->vol.dirty & RAIDVOL_DIRTY)) ||
+ (is_rebuilding(dev) &&
+ vol_curr_migr_unit(dev) == 0 &&
+ get_imsm_disk_idx(dev, disk->disk.raid_disk, MAP_1) != idx))
+ ret = st->ss->write_init_ppl(st, info, d->fd);
+ else
+ info->mismatch_cnt++;
+ } else if (ret == 0 &&
+ ppl_hdr->entries_count == 0 &&
+ is_rebuilding(dev) &&
+ info->resync_start == 0) {
+ /*
+ * The header has no entries - add a single empty entry and
+ * rewrite the header to prevent the kernel from going into
+ * resync after an interrupted rebuild.
+ */
+ ppl_hdr->entries_count = __cpu_to_le32(1);
+ ret = write_ppl_header(info->ppl_sector, d->fd, buf);
+ }
+
+ free(buf_orig);
+
+ return ret;
+ }
+
+ /* Initialise the PPL on every active, non-failed disk of the container.
+  * Nothing is done unless @info describes a RAID5 volume using the PPL
+  * consistency policy.
+  * Returns 0 on success or the first non-zero write_init_ppl() result.
+  */
+ static int write_init_ppl_imsm_all(struct supertype *st, struct mdinfo *info)
+ {
+ 	struct intel_super *super = st->sb;
+ 	struct dl *member;
+
+ 	if (info->consistency_policy == CONSISTENCY_POLICY_PPL &&
+ 	    info->array.level == 5) {
+ 		for (member = super->disks; member; member = member->next) {
+ 			int rc;
+
+ 			if (member->index >= 0 && !is_failed(&member->disk)) {
+ 				rc = st->ss->write_init_ppl(st, info, member->fd);
+ 				if (rc != 0)
+ 					return rc;
+ 			}
+ 		}
+ 	}
+
+ 	return 0;
+ }
+
+/*******************************************************************************
+ * Function: write_init_bitmap_imsm_vol
+ * Description: Write a bitmap header and prepares the area for the bitmap.
+ * Parameters:
+ * st : supertype information
+ * vol_idx : the volume index to use
+ *
+ * Returns:
+ * 0 : success
+ * -1 : fail
+ ******************************************************************************/
+static int write_init_bitmap_imsm_vol(struct supertype *st, int vol_idx)
+{
+ struct intel_super *super = st->sb;
+ int prev_current_vol = super->current_vol;
+ struct dl *d;
+ int ret = 0;
+
+ super->current_vol = vol_idx;
+ for (d = super->disks; d; d = d->next) {
+ if (d->index < 0 || is_failed(&d->disk))
+ continue;
+ ret = st->ss->write_bitmap(st, d->fd, NoUpdate);
+ if (ret)
+ break;
+ }
+ super->current_vol = prev_current_vol;
+ return ret;
+}
+
+/*******************************************************************************
+ * Function: write_init_bitmap_imsm_all
+ * Description: Write a bitmap header and prepares the area for the bitmap.
+ * Operation is executed for volumes with CONSISTENCY_POLICY_BITMAP.
+ * Parameters:
+ * st : supertype information
+ * info : info about the volume where the bitmap should be written
+ * vol_idx : the volume index to use
+ *
+ * Returns:
+ * 0 : success
+ * -1 : fail
+ ******************************************************************************/
+static int write_init_bitmap_imsm_all(struct supertype *st, struct mdinfo *info,
+ int vol_idx)
+{
+ int ret = 0;
+
+ if (info && (info->consistency_policy == CONSISTENCY_POLICY_BITMAP))
+ ret = write_init_bitmap_imsm_vol(st, vol_idx);
+
+ return ret;
+}
+
+ /* Commit a freshly created volume or a disk add/remove.
+  * With st->update_tail set the change is queued as a metadata update;
+  * otherwise the member devices are passed to Kill() and the metadata is
+  * written directly with write_super_imsm().  For a volume, the PPL and
+  * bitmap areas are initialised first.
+  * Returns 0 on success or the first non-zero result of a step.
+  */
+ static int write_init_super_imsm(struct supertype *st)
+ {
+ 	struct intel_super *super = st->sb;
+ 	const int vol = super->current_vol;
+ 	struct mdinfo info;
+ 	struct dl *d;
+ 	int rv = 0;
+
+ 	getinfo_super_imsm(st, &info, NULL);
+
+ 	/* we are done with current_vol reset it to point st at the container */
+ 	super->current_vol = -1;
+
+ 	if (!st->update_tail) {
+ 		for (d = super->disks; d; d = d->next)
+ 			Kill(d->devname, NULL, 0, -1, 1);
+
+ 		if (vol >= 0) {
+ 			rv = write_init_ppl_imsm_all(st, &info);
+ 			if (rv == 0)
+ 				rv = write_init_bitmap_imsm_all(st, &info, vol);
+ 		}
+ 		if (rv != 0)
+ 			return rv;
+
+ 		return write_super_imsm(st, 1);
+ 	}
+
+ 	/* queue the recently created array / added disk
+ 	 * as a metadata update
+ 	 */
+ 	if (vol < 0) {
+ 		/* in the mgmt (add/remove) disk case we are running
+ 		 * in mdmon context, so don't close fd's
+ 		 */
+ 		return mgmt_disk(st);
+ 	}
+
+ 	/* adding the second volume to the array */
+ 	rv = write_init_ppl_imsm_all(st, &info);
+ 	if (rv != 0)
+ 		return rv;
+
+ 	rv = write_init_bitmap_imsm_all(st, &info, vol);
+ 	if (rv != 0)
+ 		return rv;
+
+ 	return create_array(st, vol);
+ }
+
+ /* Store the current anchor of @st on @fd, converting the metadata with
+  * convert_to_4k() first when the container sector size is 4096.
+  * Returns 1 when no metadata is loaded, otherwise the result of
+  * store_imsm_mpb().
+  */
+ static int store_super_imsm(struct supertype *st, int fd)
+ {
+ 	struct intel_super *super = st->sb;
+ 	struct imsm_super *mpb = NULL;
+
+ 	if (super)
+ 		mpb = super->anchor;
+ 	if (mpb == NULL)
+ 		return 1;
+
+ 	if (super->sector_size == 4096)
+ 		convert_to_4k(super);
+
+ 	return store_imsm_mpb(fd, mpb);
+ }
+
+ /* validate_geometry_imsm_container()
+ * Check whether @dev can become a member of an IMSM container.
+ *
+ * The device is opened exclusively, its size and sector size are read and
+ * the capabilities of the HBA it is attached to are looked up. When an
+ * option ROM description is available, the number of disks, the 2TB
+ * limit and (for NVMe/VMD) namespace support are verified against it.
+ *
+ * Returns:
+ * 0 : @level is not LEVEL_CONTAINER, the device cannot be opened or
+ * queried, or a platform limit is exceeded
+ * 1 : @dev is NULL, or all checks passed; in the latter case *freesize
+ * (when @freesize is not NULL) receives the result of
+ * avail_size_imsm()
+ */
+ static int validate_geometry_imsm_container(struct supertype *st, int level,
+ int raiddisks,
+ unsigned long long data_offset,
+ char *dev,
+ unsigned long long *freesize,
+ int verbose)
+ {
+ int fd;
+ unsigned long long ldsize;
+ struct intel_super *super = NULL;
+ int rv = 0;
+
+ if (level != LEVEL_CONTAINER)
+ return 0;
+ if (!dev)
+ return 1;
+
+ fd = dev_open(dev, O_RDONLY|O_EXCL);
+ if (!is_fd_valid(fd)) {
+ pr_vrb("imsm: Cannot open %s: %s\n", dev, strerror(errno));
+ return 0;
+ }
+ /* from here on every exit goes through 'exit' to release fd/super */
+ if (!get_dev_size(fd, dev, &ldsize))
+ goto exit;
+
+ /* capabilities retrieve could be possible
+ * note that there is no fd for the disks in array.
+ */
+ super = alloc_super();
+ if (!super)
+ goto exit;
+
+ if (!get_dev_sector_size(fd, NULL, &super->sector_size))
+ goto exit;
+
+ rv = find_intel_hba_capability(fd, super, verbose > 0 ? dev : NULL);
+ if (rv != 0) {
+ #if DEBUG
+ char str[256];
+ fd2devname(fd, str);
+ dprintf("fd: %d %s orom: %p rv: %d raiddisk: %d\n",
+ fd, str, super->orom, rv, raiddisks);
+ #endif
+ /* no orom/efi or non-intel hba of the disk */
+ rv = 0;
+ goto exit;
+ }
+ /* rv is 0 here, so each 'goto exit' below reports a rejection */
+ if (super->orom) {
+ if (raiddisks > super->orom->tds) {
+ if (verbose)
+ pr_err("%d exceeds maximum number of platform supported disks: %d\n",
+ raiddisks, super->orom->tds);
+ goto exit;
+ }
+ /* sector count does not fit in 32 bits and the platform lacks
+ * the 2TB disk attribute
+ */
+ if ((super->orom->attr & IMSM_OROM_ATTR_2TB_DISK) == 0 &&
+ (ldsize >> 9) >> 32 > 0) {
+ if (verbose)
+ pr_err("%s exceeds maximum platform supported size\n", dev);
+ goto exit;
+ }
+
+ if (super->hba->type == SYS_DEV_VMD ||
+ super->hba->type == SYS_DEV_NVME) {
+ if (!imsm_is_nvme_namespace_supported(fd, 1)) {
+ if (verbose)
+ pr_err("NVMe namespace %s is not supported by IMSM\n",
+ basename(dev));
+ goto exit;
+ }
+ }
+ }
+ if (freesize)
+ *freesize = avail_size_imsm(st, ldsize >> 9, data_offset);
+ rv = 1;
+ exit:
+ if (super)
+ free_imsm(super);
+ close(fd);
+
+ return rv;
+ }
+
+ /* Starting at extent e[*idx], merge every following extent that begins
+  * inside the growing span and return the length of the merged span.
+  * *idx is advanced by one, or to the first extent that starts beyond the
+  * span when such an extent is found.
+  * Returns 0 when the starting extent is empty or when an empty extent is
+  * found inside the span.
+  */
+ static unsigned long long find_size(struct extent *e, int *idx, int num_extents)
+ {
+ 	const struct extent *first = &e[*idx];
+ 	const unsigned long long span_start = first->start;
+ 	unsigned long long span_end = span_start + first->size;
+ 	int k;
+
+ 	if (span_end == span_start)
+ 		return 0;
+
+ 	(*idx)++;
+ 	k = *idx;
+ 	while (k < num_extents) {
+ 		const struct extent *cur = &e[k];
+ 		const unsigned long long cur_end = cur->start + cur->size;
+
+ 		if (cur->start > span_end) {
+ 			/* first extent past the span: resume from here */
+ 			*idx = k;
+ 			break;
+ 		}
+ 		if (cur->start >= span_start) {
+ 			/* extend overlapping extents */
+ 			if (cur->size == 0)
+ 				return 0;
+ 			if (cur_end > span_end)
+ 				span_end = cur_end;
+ 		}
+ 		k++;
+ 	}
+
+ 	return span_end - span_start;
+ }
+
+ /* merge_extents()
+ * Collect the extents of all disks on super->disks into one sorted list,
+ * merge the overlapping ones and look for the largest gap between them.
+ *
+ * @sum_extents: total number of extents over all disks; sizes the
+ * temporary array, which is freed before returning
+ *
+ * Returns the size of the largest gap less the reserved space, or 0 when
+ * no usable gap exists. On success super->create_offset is set to the
+ * start of that gap plus the reserved space.
+ */
+ static unsigned long long merge_extents(struct intel_super *super, int sum_extents)
+ {
+ /* build a composite disk with all known extents and generate a new
+ * 'maxsize' given the "all disks in an array must share a common start
+ * offset" constraint
+ */
+ struct extent *e = xcalloc(sum_extents, sizeof(*e));
+ struct dl *dl;
+ int i, j;
+ int start_extent;
+ unsigned long long pos;
+ unsigned long long start = 0;
+ unsigned long long maxsize;
+ unsigned long reserve;
+
+ /* coalesce and sort all extents. also, check to see if we need to
+ * reserve space between member arrays
+ */
+ j = 0;
+ for (dl = super->disks; dl; dl = dl->next) {
+ if (!dl->e)
+ continue;
+ for (i = 0; i < dl->extent_cnt; i++)
+ e[j++] = dl->e[i];
+ }
+ qsort(e, sum_extents, sizeof(*e), cmp_extent);
+
+ /* merge extents */
+ /* i walks the sorted input, j the merged output written in place;
+ * merging stops at the first merged extent of size 0
+ */
+ i = 0;
+ j = 0;
+ while (i < sum_extents) {
+ e[j].start = e[i].start;
+ e[j].size = find_size(e, &i, sum_extents);
+ j++;
+ if (e[j-1].size == 0)
+ break;
+ }
+
+ /* scan the gaps in front of each merged extent and remember the
+ * largest one; the loop ends after handling an extent of size 0
+ * NOTE(review): presumably the input always contains such a
+ * zero-sized terminator - confirm, the loop has no other bound
+ */
+ pos = 0;
+ maxsize = 0;
+ start_extent = 0;
+ i = 0;
+ do {
+ unsigned long long esize;
+
+ esize = e[i].start - pos;
+ if (esize >= maxsize) {
+ maxsize = esize;
+ start = pos;
+ start_extent = i;
+ }
+ pos = e[i].start + e[i].size;
+ i++;
+ } while (e[i-1].size);
+ free(e);
+
+ if (maxsize == 0)
+ return 0;
+
+ /* FIXME assumes volume at offset 0 is the first volume in a
+ * container
+ */
+ if (start_extent > 0)
+ reserve = IMSM_RESERVED_SECTORS; /* gap between raid regions */
+ else
+ reserve = 0;
+
+ if (maxsize < reserve)
+ return 0;
+
+ /* NOTE(review): create_offset is set to all ones right before the
+ * comparison, so the test below looks like it can never be true -
+ * confirm the intended overflow check
+ */
+ super->create_offset = ~((unsigned long long) 0);
+ if (start + reserve > super->create_offset)
+ return 0; /* start overflows create_offset */
+ super->create_offset = start + reserve;
+
+ return maxsize - reserve;
+ }
+
+ /* Tell whether @level with @raiddisks members may be created.
+  * Negative levels, raid4 and raid6 are never supported.  Without an
+  * option ROM description (@orom NULL) every other level is accepted;
+  * with one, the level must be advertised by the platform and the disk
+  * count must fit the level.
+  * Returns non-zero when supported, 0 otherwise.
+  */
+ static int is_raid_level_supported(const struct imsm_orom *orom, int level, int raiddisks)
+ {
+ 	if (level < 0 || level == 4 || level == 6)
+ 		return 0;
+
+ 	/* not on an Intel RAID platform so anything goes */
+ 	if (!orom)
+ 		return 1;
+
+ 	/* if we have an orom prevent invalid raid levels */
+ 	switch (level) {
+ 	case 0:
+ 		return imsm_orom_has_raid0(orom);
+ 	case 1:
+ 		if (raiddisks > 2)
+ 			return imsm_orom_has_raid1e(orom);
+ 		return imsm_orom_has_raid1(orom) && raiddisks == 2;
+ 	case 5:
+ 		return imsm_orom_has_raid5(orom) && raiddisks > 2;
+ 	case 10:
+ 		return imsm_orom_has_raid10(orom) && raiddisks == 4;
+ 	default:
+ 		return 0;
+ 	}
+ }
+
+ /* Count the active volumes of all running "external:<name>" containers
+  * that have a member device attached to @hba.
+  * When *devlist is not empty, each such container with fewer than @dpa
+  * active volumes is pushed on the head of *devlist as "/dev/<devnm>",
+  * with ->found set to its number of active volumes.
+  * Returns the total number of active volumes counted.
+  */
+ static int
+ active_arrays_by_format(char *name, char* hba, struct md_list **devlist,
+ 			int dpa, int verbose)
+ {
+ 	struct mdstat_ent *mdstat = mdstat_read(0, 0);
+ 	struct mdstat_ent *cont;
+ 	int total = 0;
+
+ 	for (cont = mdstat; cont; cont = cont->next) {
+ 		struct dev_member *member;
+ 		int fd = -1;
+ 		int volumes = 0;
+
+ 		/* only containers of the requested format that have members */
+ 		if (!cont->metadata_version ||
+ 		    strncmp(cont->metadata_version, "external:", 9) != 0 ||
+ 		    strcmp(&cont->metadata_version[9], name) != 0 ||
+ 		    is_subarray(cont->metadata_version+9) ||
+ 		    !cont->members)
+ 			continue;
+
+ 		/* open the first member device that can be opened */
+ 		for (member = cont->members; member && !is_fd_valid(fd);
+ 		     member = member->next) {
+ 			char *path = xmalloc(strlen(member->name) +
+ 					     strlen("/dev/") + 1);
+ 			int len = sprintf(path, "%s%s", "/dev/", member->name);
+
+ 			if (len > 0)
+ 				fd = open(path, O_RDONLY, 0);
+ 			if (len <= 0 || !is_fd_valid(fd)) {
+ 				pr_vrb("Cannot open %s: %s\n",
+ 				       member->name, strerror(errno));
+ 			}
+ 			free(path);
+ 		}
+
+ 		if (is_fd_valid(fd) && disk_attached_to_hba(fd, hba)) {
+ 			struct mdstat_ent *vol;
+
+ 			for (vol = mdstat; vol; vol = vol->next) {
+ 				if (vol->active > 0 &&
+ 				    vol->metadata_version &&
+ 				    is_container_member(vol, cont->devnm)) {
+ 					volumes++;
+ 					total++;
+ 				}
+ 			}
+
+ 			if (*devlist && (volumes < dpa)) {
+ 				struct md_list *node = xcalloc(1, sizeof(*node));
+
+ 				node->devname = xmalloc(strlen(cont->devnm) +
+ 							strlen("/dev/") + 1);
+ 				sprintf(node->devname, "%s%s", "/dev/", cont->devnm);
+ 				node->found = volumes;
+ 				node->used = 0;
+ 				node->next = *devlist;
+ 				*devlist = node;
+ 			}
+ 		}
+ 		close_fd(&fd);
+ 	}
+
+ 	free_mdstat(mdstat);
+ 	return total;
+ }
+
+ #ifdef DEBUG_LOOP
+ /* Debug helper: build a device list naming /dev/loop0 .. /dev/loop11.
+  * Each entry is pushed on the head, so the list ends up in descending
+  * order.
+  */
+ static struct md_list*
+ get_loop_devices(void)
+ {
+ 	struct md_list *head = NULL;
+ 	int n = 0;
+
+ 	while (n < 12) {
+ 		struct md_list *node = xcalloc(1, sizeof(*node));
+
+ 		node->devname = xmalloc(40);
+ 		sprintf(node->devname, "/dev/loop%d", n);
+ 		node->next = head;
+ 		head = node;
+ 		n++;
+ 	}
+
+ 	return head;
+ }
+ #endif
+
+ /* get_devices()
+  * Build a list of the block devices attached to the HBA at @hba_path by
+  * scanning /sys/dev/block.  Each entry carries the device name reported
+  * by fd2devname(); the caller owns the list and its devname strings.
+  *
+  * Returns the list head, or NULL when no device was found or
+  * /sys/dev/block cannot be opened.  Devices that cannot be opened are
+  * reported and skipped.
+  */
+ static struct md_list*
+ get_devices(const char *hba_path)
+ {
+ 	struct md_list *devlist = NULL;
+ 	struct md_list *dv;
+ 	struct dirent *ent;
+ 	DIR *dir;
+
+ #ifdef DEBUG_LOOP
+ 	/* same guard as the definition of get_loop_devices() */
+ 	devlist = get_loop_devices();
+ 	return devlist;
+ #endif
+ 	/* scroll through /sys/dev/block looking for devices attached to
+ 	 * this hba
+ 	 */
+ 	dir = opendir("/sys/dev/block");
+ 	if (!dir)
+ 		return NULL;	/* nothing to scan and nothing to close */
+
+ 	for (ent = readdir(dir); ent; ent = readdir(dir)) {
+ 		int fd;
+ 		char buf[1024];
+ 		int major, minor;
+ 		char *path = NULL;
+ 		int attached;
+
+ 		/* entries are named "<major>:<minor>" */
+ 		if (sscanf(ent->d_name, "%d:%d", &major, &minor) != 2)
+ 			continue;
+ 		path = devt_to_devpath(makedev(major, minor), 1, NULL);
+ 		if (!path)
+ 			continue;
+ 		attached = path_attached_to_hba(path, hba_path);
+ 		free(path);
+ 		if (!attached)
+ 			continue;
+
+ 		fd = dev_open(ent->d_name, O_RDONLY);
+ 		if (is_fd_valid(fd)) {
+ 			fd2devname(fd, buf);
+ 			close(fd);
+ 		} else {
+ 			pr_err("cannot open device: %s\n",
+ 			       ent->d_name);
+ 			continue;
+ 		}
+
+ 		dv = xcalloc(1, sizeof(*dv));
+ 		dv->devname = xstrdup(buf);
+ 		dv->next = devlist;
+ 		devlist = dv;
+ 	}
+ 	closedir(dir);
+
+ 	return devlist;
+ }
+
+ /* count_volumes_list()
+ * Make one pass over @devlist looking for a set of devices whose imsm
+ * metadata belongs together, and count the volumes that set defines.
+ *
+ * Each entry is classified through its 'used' field:
+ * 1 : metadata matches the set collected in this pass
+ * 2 : cannot be opened, is not a block device or carries no usable
+ * metadata
+ * 3 : floating spare (all-zero uuid)
+ * 4 : was part of the set of a pass (entries with used == 1 are
+ * switched to 4 before returning)
+ * Entries with used > 1 are skipped, so repeated calls move on to the
+ * next set.
+ *
+ * @homehost is not referenced in this function.
+ * @found is set to 1 when at least one device matched, 0 otherwise.
+ *
+ * Returns the number of volumes of the set that are not flagged
+ * MD_SB_BLOCK_VOLUME, reduced by the 'found' count of the matched
+ * entries (never below 0); 0 when nothing matched or allocation failed.
+ */
+ static int
+ count_volumes_list(struct md_list *devlist, char *homehost,
+ int verbose, int *found)
+ {
+ struct md_list *tmpdev;
+ int count = 0;
+ struct supertype *st;
+
+ /* first walk the list of devices to find a consistent set
+ * that match the criteria, if that is possible.
+ * We flag the ones we like with 'used'.
+ */
+ *found = 0;
+ st = match_metadata_desc_imsm("imsm");
+ if (st == NULL) {
+ pr_vrb("cannot allocate memory for imsm supertype\n");
+ return 0;
+ }
+
+ for (tmpdev = devlist; tmpdev; tmpdev = tmpdev->next) {
+ char *devname = tmpdev->devname;
+ dev_t rdev;
+ struct supertype *tst;
+ int dfd;
+ if (tmpdev->used > 1)
+ continue;
+ tst = dup_super(st);
+ if (tst == NULL) {
+ pr_vrb("cannot allocate memory for imsm supertype\n");
+ goto err_1;
+ }
+ tmpdev->container = 0;
+ dfd = dev_open(devname, O_RDONLY|O_EXCL);
+ if (!is_fd_valid(dfd)) {
+ dprintf("cannot open device %s: %s\n",
+ devname, strerror(errno));
+ tmpdev->used = 2;
+ } else if (!fstat_is_blkdev(dfd, devname, &rdev)) {
+ tmpdev->used = 2;
+ } else if (must_be_container(dfd)) {
+ struct supertype *cst;
+ cst = super_by_fd(dfd, NULL);
+ if (cst == NULL) {
+ dprintf("cannot recognize container type %s\n",
+ devname);
+ tmpdev->used = 2;
+ /* NOTE(review): tst comes from dup_super(st), so this
+ * comparison looks like it can never differ; presumably
+ * cst->ss was meant - confirm
+ */
+ } else if (tst->ss != st->ss) {
+ dprintf("non-imsm container - ignore it: %s\n",
+ devname);
+ tmpdev->used = 2;
+ } else if (!tst->ss->load_container ||
+ tst->ss->load_container(tst, dfd, NULL))
+ tmpdev->used = 2;
+ else {
+ tmpdev->container = 1;
+ }
+ if (cst)
+ cst->ss->free_super(cst);
+ } else {
+ tmpdev->st_rdev = rdev;
+ if (tst->ss->load_super(tst,dfd, NULL)) {
+ dprintf("no RAID superblock on %s\n",
+ devname);
+ tmpdev->used = 2;
+ } else if (tst->ss->compare_super == NULL) {
+ dprintf("Cannot assemble %s metadata on %s\n",
+ tst->ss->name, devname);
+ tmpdev->used = 2;
+ }
+ }
+ close_fd(&dfd);
+
+ if (tmpdev->used == 2 || tmpdev->used == 4) {
+ /* Ignore unrecognised devices during auto-assembly */
+ goto loop;
+ }
+ else {
+ struct mdinfo info;
+ tst->ss->getinfo_super(tst, &info, NULL);
+
+ /* the first usable device defines the minor version */
+ if (st->minor_version == -1)
+ st->minor_version = tst->minor_version;
+
+ if (memcmp(info.uuid, uuid_zero,
+ sizeof(int[4])) == 0) {
+ /* this is a floating spare. It cannot define
+ * an array unless there are no more arrays of
+ * this type to be found. It can be included
+ * in an array of this type though.
+ */
+ tmpdev->used = 3;
+ goto loop;
+ }
+
+ if (st->ss != tst->ss ||
+ st->minor_version != tst->minor_version ||
+ st->ss->compare_super(st, tst, 1) != 0) {
+ /* Some mismatch. If exactly one array matches this host,
+ * we can resolve on that one.
+ * Or, if we are auto assembling, we just ignore the second
+ * for now.
+ */
+ dprintf("superblock on %s doesn't match others - assembly aborted\n",
+ devname);
+ goto loop;
+ }
+ tmpdev->used = 1;
+ *found = 1;
+ dprintf("found: devname: %s\n", devname);
+ }
+ loop:
+ /* NOTE(review): only free_super() is called; whether the
+ * supertype returned by dup_super() itself is released
+ * cannot be seen from here - confirm
+ */
+ if (tst)
+ tst->ss->free_super(tst);
+ }
+ if (*found != 0) {
+ int err;
+ /* load the metadata of the whole set and count its volumes */
+ if ((err = load_super_imsm_all(st, -1, &st->sb, NULL, devlist, 0)) == 0) {
+ struct mdinfo *iter, *head = st->ss->container_content(st, NULL);
+ for (iter = head; iter; iter = iter->next) {
+ dprintf("content->text_version: %s vol\n",
+ iter->text_version);
+ if (iter->array.state & (1<<MD_SB_BLOCK_VOLUME)) {
+ /* do not assemble arrays with unsupported
+ configurations */
+ dprintf("Cannot activate member %s.\n",
+ iter->text_version);
+ } else
+ count++;
+ }
+ sysfs_free(head);
+
+ } else {
+ dprintf("No valid super block on device list: err: %d %p\n",
+ err, st->sb);
+ }
+ } else {
+ dprintf("no more devices to examine\n");
+ }
+
+ /* subtract the 'found' count recorded on matched entries, then
+ * retire the matched entries
+ */
+ for (tmpdev = devlist; tmpdev; tmpdev = tmpdev->next) {
+ if (tmpdev->used == 1 && tmpdev->found) {
+ if (count) {
+ if (count < tmpdev->found)
+ count = 0;
+ else
+ count -= tmpdev->found;
+ }
+ }
+ if (tmpdev->used == 1)
+ tmpdev->used = 4;
+ }
+ err_1:
+ if (st)
+ st->ss->free_super(st);
+ return count;
+ }
+
+ /* Count the imsm volumes found on the devices of every controller that
+  * shares an option ROM entry with the Intel device whose path contains
+  * @hba_path.  Active arrays and volumes found on the remaining devices
+  * are both counted.
+  * @cmp_hba_path selects whether the controllers are looked up by id and
+  * path or by id only.
+  * Returns the volume count; 0 when the HBA, its orom entry, one of the
+  * controllers or its device list cannot be found.
+  */
+ static int __count_volumes(char *hba_path, int dpa, int verbose,
+ 			   int cmp_hba_path)
+ {
+ 	struct sys_dev *intel_devices = find_intel_devices();
+ 	struct sys_dev *idev;
+ 	const struct orom_entry *entry;
+ 	struct devid_list *id;
+ 	int count = 0;
+
+ 	if (hba_path == NULL)
+ 		return 0;
+
+ 	idev = intel_devices;
+ 	while (idev && !strstr(idev->path, hba_path))
+ 		idev = idev->next;
+
+ 	if (idev == NULL || !idev->dev_id)
+ 		return 0;
+
+ 	entry = get_orom_entry_by_device_id(idev->dev_id);
+ 	if (entry == NULL || !entry->devid_list)
+ 		return 0;
+
+ 	for (id = entry->devid_list; id; id = id->next) {
+ 		struct sys_dev *device;
+ 		struct md_list *devlist;
+ 		char *hpath;
+ 		int found;
+
+ 		device = cmp_hba_path ?
+ 			 device_by_id_and_path(id->devid, hba_path) :
+ 			 device_by_id(id->devid);
+ 		if (device == NULL)
+ 			return 0;
+ 		hpath = device->path;
+
+ 		devlist = get_devices(hpath);
+ 		/* if no intel devices return zero volumes */
+ 		if (!devlist)
+ 			return 0;
+
+ 		count += active_arrays_by_format("imsm", hpath, &devlist, dpa,
+ 						 verbose);
+ 		dprintf("path: %s active arrays: %d\n", hpath, count);
+ 		if (!devlist)
+ 			return 0;
+
+ 		/* one pass per set of devices, until a pass finds nothing */
+ 		do {
+ 			found = 0;
+ 			count += count_volumes_list(devlist, NULL, verbose,
+ 						    &found);
+ 			dprintf("found %d count: %d\n", found, count);
+ 		} while (found);
+
+ 		dprintf("path: %s total number of volumes: %d\n", hpath, count);
+
+ 		while (devlist) {
+ 			struct md_list *node = devlist;
+
+ 			devlist = node->next;
+ 			free(node->devname);
+ 			free(node);
+ 		}
+ 	}
+
+ 	return count;
+ }
+
+ /* Count volumes for 'hba'.  For a VMD controller every VMD device returned by
+  * find_intel_devices() is counted (matching on its own path); any other
+  * controller type is counted through its own path without path comparison.
+  * Returns 0 when 'hba' is NULL.
+  */
+ static int count_volumes(struct intel_hba *hba, int dpa, int verbose)
+ {
+ 	struct sys_dev *vmd;
+ 	int total = 0;
+
+ 	if (hba == NULL)
+ 		return 0;
+
+ 	if (hba->type != SYS_DEV_VMD)
+ 		return __count_volumes(hba->path, dpa, verbose, 0);
+
+ 	for (vmd = find_intel_devices(); vmd != NULL; vmd = vmd->next) {
+ 		if (vmd->type != SYS_DEV_VMD)
+ 			continue;
+ 		total += __count_volumes(vmd->path, dpa, verbose, 1);
+ 	}
+ 	return total;
+ }
+
+ /* Pick the default chunk size: 1 << max(7, fls(orom->sss)), capped at 512.
+  * Without a platform ('orom' is NULL) this yields 128.
+  */
+ static int imsm_default_chunk(const struct imsm_orom *orom)
+ {
+ 	int top_bit = orom ? fls(orom->sss) : 0;
+ 	int shift = max(7, top_bit);
+
+ 	return min(512, (1 << shift));
+ }
+
+ /* Check the requested geometry against platform (option-rom) and metadata
+  * limits, filling in a default chunk size when *chunk is 0 or UnSet.
+  * Returns 1 when the geometry is acceptable, 0 otherwise (a message is
+  * emitted through pr_vrb for each rejection).
+  */
+ static int
+ validate_geometry_imsm_orom(struct intel_super *super, int level, int layout,
+ 			    int raiddisks, int *chunk, unsigned long long size, int verbose)
+ {
+ 	/* disks-per-array limit */
+ 	if (super->orom && raiddisks > super->orom->dpa) {
+ 		pr_vrb("platform supports a maximum of %d disks per array\n",
+ 		       super->orom->dpa);
+ 		return 0;
+ 	}
+
+ 	/* capabilities of OROM tested - copied from validate_geometry_imsm_volume */
+ 	if (!is_raid_level_supported(super->orom, level, raiddisks)) {
+ 		pr_vrb("platform does not support raid%d with %d disk%s\n",
+ 		       level, raiddisks, raiddisks > 1 ? "s" : "");
+ 		return 0;
+ 	}
+
+ 	/* chunk size: default it first, then verify it */
+ 	if (*chunk == 0 || *chunk == UnSet)
+ 		*chunk = imsm_default_chunk(super->orom);
+
+ 	if (super->orom && !imsm_orom_has_chunk(super->orom, *chunk)) {
+ 		pr_vrb("platform does not support a chunk size of: %d\n", *chunk);
+ 		return 0;
+ 	}
+
+ 	/* only the layout derived from the level is accepted */
+ 	if (layout != imsm_level_to_layout(level)) {
+ 		switch (level) {
+ 		case 5:
+ 			pr_vrb("imsm raid 5 only supports the left-asymmetric layout\n");
+ 			break;
+ 		case 10:
+ 			pr_vrb("imsm raid 10 only supports the n2 layout\n");
+ 			break;
+ 		default:
+ 			pr_vrb("imsm unknown layout %#x for this raid level %d\n",
+ 			       layout, level);
+ 			break;
+ 		}
+ 		return 0;
+ 	}
+
+ 	/* array size must fit in 32 bits unless the platform has the 2TB attribute */
+ 	if (super->orom && (super->orom->attr & IMSM_OROM_ATTR_2TB) == 0) {
+ 		if ((calc_array_size(level, raiddisks, layout, *chunk, size) >> 32) > 0) {
+ 			pr_vrb("platform does not support a volume size over 2TB\n");
+ 			return 0;
+ 		}
+ 	}
+
+ 	return 1;
+ }
+
+/* validate_geometry_imsm_volume - lifted from validate_geometry_ddf_bvd
+ * FIX ME add ahci details
+ */
+ /* Decide whether a volume with the requested geometry can be created in the
+  * container already loaded in st->sb.
+  *
+  * With dev == NULL only a general check is made: at least 'raiddisks' disks
+  * must offer a free gap of 'size' sectors (or MPB_SECTOR_CNT +
+  * IMSM_RESERVED_SECTORS when size is 0) at a common start offset.
+  * With dev != NULL the device must be a member of the container; its extents
+  * are recorded in dl->e and the merged free size is stored in *freesize.
+  *
+  * 'data_offset' is not used by this function.
+  * Returns 1 when the geometry is acceptable, 0 otherwise.
+  */
+ static int validate_geometry_imsm_volume(struct supertype *st, int level,
+ int layout, int raiddisks, int *chunk,
+ unsigned long long size,
+ unsigned long long data_offset,
+ char *dev,
+ unsigned long long *freesize,
+ int verbose)
+ {
+ dev_t rdev;
+ struct intel_super *super = st->sb;
+ struct imsm_super *mpb;
+ struct dl *dl;
+ unsigned long long pos = 0;
+ unsigned long long maxsize;
+ struct extent *e;
+ int i;
+
+ /* We must have the container info already read in. */
+ if (!super)
+ return 0;
+
+ mpb = super->anchor;
+
+ if (!validate_geometry_imsm_orom(super, level, layout, raiddisks, chunk, size, verbose)) {
+ pr_err("RAID geometry validation failed. Cannot proceed with the action(s).\n");
+ return 0;
+ }
+ if (!dev) {
+ /* General test: make sure there is space for
+ * 'raiddisks' device extents of size 'size' at a given
+ * offset
+ */
+ unsigned long long minsize = size;
+ unsigned long long start_offset = MaxSector;
+ int dcnt = 0;
+ if (minsize == 0)
+ minsize = MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS;
+ for (dl = super->disks; dl ; dl = dl->next) {
+ int found = 0;
+
+ pos = 0;
+ i = 0;
+ e = get_extents(super, dl, 0);
+ if (!e) continue;
+ /* walk the extents; the loop ends after the entry whose size is 0 */
+ do {
+ unsigned long long esize;
+ /* gap between the end of the previous extent and this one */
+ esize = e[i].start - pos;
+ if (esize >= minsize)
+ found = 1;
+ if (found && start_offset == MaxSector) {
+ /* first usable gap seen on any disk fixes the common offset */
+ start_offset = pos;
+ break;
+ } else if (found && pos != start_offset) {
+ /* gap is large enough but not at the common offset */
+ found = 0;
+ break;
+ }
+ pos = e[i].start + e[i].size;
+ i++;
+ } while (e[i-1].size);
+ if (found)
+ dcnt++;
+ free(e);
+ }
+ if (dcnt < raiddisks) {
+ if (verbose)
+ pr_err("imsm: Not enough devices with space for this array (%d < %d)\n",
+ dcnt, raiddisks);
+ return 0;
+ }
+ return 1;
+ }
+
+ /* This device must be a member of the set */
+ if (!stat_is_blkdev(dev, &rdev))
+ return 0;
+ for (dl = super->disks ; dl ; dl = dl->next) {
+ if (dl->major == (int)major(rdev) &&
+ dl->minor == (int)minor(rdev))
+ break;
+ }
+ if (!dl) {
+ if (verbose)
+ pr_err("%s is not in the same imsm set\n", dev);
+ return 0;
+ } else if (super->orom && dl->index < 0 && mpb->num_raid_devs) {
+ /* If a volume is present then the current creation attempt
+ * cannot incorporate new spares because the orom may not
+ * understand this configuration (all member disks must be
+ * members of each array in the container).
+ */
+ pr_err("%s is a spare and a volume is already defined for this container\n", dev);
+ pr_err("The option-rom requires all member disks to be a member of all volumes\n");
+ return 0;
+ } else if (super->orom && mpb->num_raid_devs > 0 &&
+ mpb->num_disks != raiddisks) {
+ pr_err("The option-rom requires all member disks to be a member of all volumes\n");
+ return 0;
+ }
+
+ /* retrieve the largest free space block */
+ e = get_extents(super, dl, 0);
+ maxsize = 0;
+ i = 0;
+ if (e) {
+ do {
+ unsigned long long esize;
+
+ esize = e[i].start - pos;
+ if (esize >= maxsize)
+ maxsize = esize;
+ pos = e[i].start + e[i].size;
+ i++;
+ } while (e[i-1].size);
+ /* keep the extent array on the disk entry for merge_extents() below */
+ dl->e = e;
+ dl->extent_cnt = i;
+ } else {
+ if (verbose)
+ pr_err("unable to determine free space for: %s\n",
+ dev);
+ return 0;
+ }
+ if (maxsize < size) {
+ if (verbose)
+ pr_err("%s not enough space (%llu < %llu)\n",
+ dev, maxsize, size);
+ return 0;
+ }
+
+ /* count total number of extents for merge */
+ i = 0;
+ for (dl = super->disks; dl; dl = dl->next)
+ if (dl->e)
+ i += dl->extent_cnt;
+
+ maxsize = merge_extents(super, i);
+
+ /* warning only: execution continues */
+ if (mpb->num_raid_devs > 0 && size && size != maxsize)
+ pr_err("attempting to create a second volume with size less then remaining space.\n");
+
+ if (maxsize < size || maxsize == 0) {
+ if (verbose) {
+ if (maxsize == 0)
+ pr_err("no free space left on device. Aborting...\n");
+ else
+ pr_err("not enough space to create volume of given size (%llu < %llu). Aborting...\n",
+ maxsize, size);
+ }
+ return 0;
+ }
+
+ *freesize = maxsize;
+
+ /* refuse when the platform's volumes-per-HBA limit is already reached */
+ if (super->orom) {
+ int count = count_volumes(super->hba,
+ super->orom->dpa, verbose);
+ if (super->orom->vphba <= count) {
+ pr_vrb("platform does not support more than %d raid volumes.\n",
+ super->orom->vphba);
+ return 0;
+ }
+ }
+ return 1;
+ }
+
+ /* Find the largest common free region across the usable disks of the
+  * container and report the size a new volume may take.
+  *
+  * Every disk gets raiddisk reset to -1; disks that yield extents get them
+  * stored in dl->e / dl->extent_cnt, and on success every disk with dl->e set
+  * is numbered consecutively in dl->raiddisk.
+  *
+  * size == 0 means "use all available space", rounded down to a multiple of
+  * 2 * chunk (chunk is in K).
+  *
+  * Returns 1 and sets *freesize on success, 0 when there is not enough room.
+  */
+ static int imsm_get_free_size(struct supertype *st, int raiddisks,
+ 			      unsigned long long size, int chunk,
+ 			      unsigned long long *freesize)
+ {
+ 	struct intel_super *super = st->sb;
+ 	struct imsm_super *mpb = super->anchor;
+ 	struct dl *dl;
+ 	unsigned long long largest;
+ 	unsigned long long smallest;
+ 	int total_extents = 0;
+ 	int candidates = 0;
+ 	int members = 0;
+ 	int slot;
+ 	int fits;
+
+ 	/* find the largest common start free region of the possible disks */
+ 	for (dl = super->disks; dl != NULL; dl = dl->next) {
+ 		struct extent *ext;
+ 		int n;
+
+ 		dl->raiddisk = -1;
+
+ 		if (dl->index >= 0)
+ 			members++;
+
+ 		/* don't activate new spares if we are orom constrained
+ 		 * and there is already a volume active in the container
+ 		 */
+ 		if (super->orom && dl->index < 0 && mpb->num_raid_devs)
+ 			continue;
+
+ 		ext = get_extents(super, dl, 0);
+ 		if (ext == NULL)
+ 			continue;
+
+ 		/* count entries up to and including the one with size 0 */
+ 		n = 1;
+ 		while (ext[n - 1].size)
+ 			n++;
+
+ 		dl->e = ext;
+ 		dl->extent_cnt = n;
+ 		total_extents += n;
+ 		candidates++;
+ 	}
+
+ 	largest = merge_extents(super, total_extents);
+
+ 	if (size == 0)
+ 		/* chunk is in K */
+ 		smallest = chunk * 2;
+ 	else
+ 		smallest = size;
+
+ 	fits = candidates >= raiddisks &&
+ 	       !(super->orom && members && members != raiddisks) &&
+ 	       largest >= smallest &&
+ 	       largest != 0;
+ 	if (!fits) {
+ 		pr_err("not enough devices with space to create array.\n");
+ 		return 0; /* No enough free spaces large enough */
+ 	}
+
+ 	if (size == 0) {
+ 		size = largest;
+ 		if (chunk) {
+ 			size /= 2 * chunk;
+ 			size *= 2 * chunk;
+ 		}
+ 		largest = size;
+ 	}
+ 	if (mpb->num_raid_devs > 0 && size && size != largest)
+ 		pr_err("attempting to create a second volume with size less then remaining space.\n");
+
+ 	/* hand out raid disk numbers to every disk that has extents recorded */
+ 	slot = 0;
+ 	for (dl = super->disks; dl != NULL; dl = dl->next) {
+ 		if (dl->e)
+ 			dl->raiddisk = slot++;
+ 	}
+
+ 	*freesize = size;
+
+ 	dprintf("imsm: imsm_get_free_size() returns : %llu\n", size);
+
+ 	return 1;
+ }
+
+ /* Ask imsm_get_free_size() for room for the new volume and, when it succeeds,
+  * number every disk that has extents recorded (dl->e) in dl->raiddisk.
+  * Returns 1 on success, 0 when no suitable space was found.
+  */
+ static int reserve_space(struct supertype *st, int raiddisks,
+ 			 unsigned long long size, int chunk,
+ 			 unsigned long long *freesize)
+ {
+ 	struct intel_super *super = st->sb;
+ 	struct dl *disk;
+ 	int slot = 0;
+
+ 	if (!imsm_get_free_size(st, raiddisks, size, chunk, freesize))
+ 		return 0;
+
+ 	for (disk = super->disks; disk != NULL; disk = disk->next) {
+ 		if (disk->e)
+ 			disk->raiddisk = slot++;
+ 	}
+
+ 	return 1;
+ }
+
+ /* Entry point for geometry validation of imsm containers and volumes.
+  *
+  * - level == LEVEL_CONTAINER: delegated to validate_geometry_imsm_container().
+  * - dev == NULL: validate against the loaded container (st->sb), if any, and
+  *   reserve space when 'freesize' is given.
+  * - dev != NULL with st->sb loaded: delegated to
+  *   validate_geometry_imsm_volume().
+  * - dev != NULL without st->sb: the device must already be held by an 'imsm'
+  *   container; that container is loaded into st->sb and the volume check is
+  *   run against it.
+  *
+  * 'consistency_policy' is not used by this function.
+  * Returns 1 on success and 0 or -1 on failure, depending on the path taken.
+  */
+ static int validate_geometry_imsm(struct supertype *st, int level, int layout,
+ int raiddisks, int *chunk, unsigned long long size,
+ unsigned long long data_offset,
+ char *dev, unsigned long long *freesize,
+ int consistency_policy, int verbose)
+ {
+ int fd, cfd;
+ struct mdinfo *sra;
+ int is_member = 0;
+
+ /* load capability
+ * if given unused devices create a container
+ * if given devices in a container create a member volume
+ */
+ if (level == LEVEL_CONTAINER)
+ /* Must be a fresh device to add to a container */
+ return validate_geometry_imsm_container(st, level, raiddisks,
+ data_offset, dev,
+ freesize, verbose);
+
+ /*
+ * Size is given in sectors.
+ */
+ if (size && (size < 2048)) {
+ pr_err("Given size must be greater than 1M.\n");
+ /* Depends on algorithm in Create.c :
+ * if container was given (dev == NULL) return -1,
+ * if block device was given ( dev != NULL) return 0.
+ *
+ * NOTE(review): the statement below returns -1 when dev != NULL
+ * and 0 when dev == NULL, i.e. the opposite of what the text
+ * above says - confirm which one Create.c expects.
+ */
+ return dev ? -1 : 0;
+ }
+
+ if (!dev) {
+ if (st->sb) {
+ struct intel_super *super = st->sb;
+ if (!validate_geometry_imsm_orom(st->sb, level, layout,
+ raiddisks, chunk, size,
+ verbose))
+ return 0;
+ /* we are being asked to automatically layout a
+ * new volume based on the current contents of
+ * the container. If the parameters can be
+ * satisfied reserve_space will record the disks,
+ * start offset, and size of the volume to be
+ * created. add_to_super and getinfo_super
+ * detect when autolayout is in progress.
+ */
+ /* assuming that freesize is always given when array is
+ created */
+ if (super->orom && freesize) {
+ int count;
+ count = count_volumes(super->hba,
+ super->orom->dpa, verbose);
+ if (super->orom->vphba <= count) {
+ pr_vrb("platform does not support more than %d raid volumes.\n",
+ super->orom->vphba);
+ return 0;
+ }
+ }
+ if (freesize)
+ return reserve_space(st, raiddisks, size,
+ *chunk, freesize);
+ }
+ return 1;
+ }
+ if (st->sb) {
+ /* creating in a given container */
+ return validate_geometry_imsm_volume(st, level, layout,
+ raiddisks, chunk, size,
+ data_offset,
+ dev, freesize, verbose);
+ }
+
+ /* This device needs to be a device in an 'imsm' container */
+ fd = open(dev, O_RDONLY|O_EXCL, 0);
+
+ /* a successful exclusive open is treated as failure here: the device is rejected */
+ if (is_fd_valid(fd)) {
+ pr_vrb("Cannot create this array on device %s\n", dev);
+ close(fd);
+ return 0;
+ }
+ /* exclusive open refused with EBUSY: retry without O_EXCL */
+ if (errno == EBUSY)
+ fd = open(dev, O_RDONLY, 0);
+
+ if (!is_fd_valid(fd)) {
+ pr_vrb("Cannot open %s: %s\n", dev, strerror(errno));
+ return 0;
+ }
+
+ /* Well, it is in use by someone, maybe an 'imsm' container. */
+ cfd = open_container(fd);
+ close_fd(&fd);
+
+ if (!is_fd_valid(cfd)) {
+ pr_vrb("Cannot use %s: It is busy\n", dev);
+ return 0;
+ }
+ /* the holder counts as an imsm container when sysfs reports
+ * major_version -1 and text_version "imsm"
+ */
+ sra = sysfs_read(cfd, NULL, GET_VERSION);
+ if (sra && sra->array.major_version == -1 &&
+ strcmp(sra->text_version, "imsm") == 0)
+ is_member = 1;
+ sysfs_free(sra);
+ if (is_member) {
+ /* This is a member of a imsm container. Load the container
+ * and try to create a volume
+ */
+ struct intel_super *super;
+
+ if (load_super_imsm_all(st, cfd, (void **) &super, NULL, NULL, 1) == 0) {
+ st->sb = super;
+ strcpy(st->container_devnm, fd2devnm(cfd));
+ close(cfd);
+ /* verbose is forced to 1 for this check; failure maps to -1 */
+ return validate_geometry_imsm_volume(st, level, layout,
+ raiddisks, chunk,
+ size, data_offset, dev,
+ freesize, 1)
+ ? 1 : -1;
+ }
+ }
+
+ if (verbose)
+ pr_err("failed container membership check\n");
+
+ close(cfd);
+ return 0;
+ }
+
+ /* Fill in defaults for any of level / layout / chunk that are still UnSet
+  * (chunk also when 0).  Each pointer may be NULL, in which case that value
+  * is left alone; layout is only defaulted when 'level' is available.
+  */
+ static void default_geometry_imsm(struct supertype *st, int *level, int *layout, int *chunk)
+ {
+ 	struct intel_super *super = st->sb;
+
+ 	if (level) {
+ 		if (*level == UnSet)
+ 			*level = LEVEL_CONTAINER;
+
+ 		if (layout && *layout == UnSet)
+ 			*layout = imsm_level_to_layout(*level);
+ 	}
+
+ 	if (!chunk)
+ 		return;
+
+ 	if (*chunk == UnSet || *chunk == 0)
+ 		*chunk = imsm_default_chunk(super->orom);
+ }
+
+static void handle_missing(struct intel_super *super, struct imsm_dev *dev);
+
+ /* Remove the subarray whose index is given (as decimal text) in
+  * 'subarray_id'.
+  *
+  * Returns 0 on success (or when the removal was queued as a metadata
+  * update), 2 when the container has no volumes or when the removal would
+  * renumber an active subarray.
+  */
+ static int kill_subarray_imsm(struct supertype *st, char *subarray_id)
+ {
+ 	struct intel_super *super = st->sb;
+ 	struct imsm_super *mpb = super->anchor;
+ 	__u8 current_vol = strtoul(subarray_id, NULL, 10);
+ 	struct intel_dev **link;
+ 	__u8 i;
+
+ 	if (mpb->num_raid_devs == 0)
+ 		return 2;
+
+ 	/* block deletions that would change the uuid of active subarrays
+ 	 *
+ 	 * FIXME when immutable ids are available, but note that we'll
+ 	 * also need to fixup the invalidated/active subarray indexes in
+ 	 * mdstat
+ 	 */
+ 	for (i = current_vol; i < mpb->num_raid_devs; i++) {
+ 		char subarray[4];
+
+ 		sprintf(subarray, "%u", i);
+ 		if (!is_subarray_active(subarray, st->devnm))
+ 			continue;
+
+ 		pr_err("deleting subarray-%d would change the UUID of active subarray-%d, aborting\n",
+ 		       current_vol, i);
+
+ 		return 2;
+ 	}
+
+ 	/* with an update queue, hand the removal over as a metadata update */
+ 	if (st->update_tail) {
+ 		struct imsm_update_kill_array *u = xmalloc(sizeof(*u));
+
+ 		u->type = update_kill_array;
+ 		u->dev_idx = current_vol;
+ 		append_metadata_update(st, u, sizeof(*u));
+
+ 		return 0;
+ 	}
+
+ 	/* unlink the victim and shift down the indexes that follow it */
+ 	link = &super->devlist;
+ 	while (*link) {
+ 		struct intel_dev *cur = *link;
+
+ 		if (cur->index == current_vol) {
+ 			*link = cur->next;
+ 			continue;
+ 		}
+
+ 		handle_missing(super, cur->dev);
+ 		if (cur->index > current_vol)
+ 			cur->index--;
+ 		link = &cur->next;
+ 	}
+
+ 	/* no more raid devices, all active components are now spares,
+ 	 * but of course failed are still failed
+ 	 */
+ 	mpb->num_raid_devs--;
+ 	if (mpb->num_raid_devs == 0) {
+ 		struct dl *d;
+
+ 		for (d = super->disks; d != NULL; d = d->next) {
+ 			if (d->index > -2)
+ 				mark_spare(d);
+ 		}
+ 	}
+
+ 	super->updates_pending++;
+
+ 	return 0;
+ }
+
+ /* Translate an update keyword ("ppl", "no-ppl", "bitmap", "no-bitmap") into
+  * the matching RWH_* policy value.  Returns -1 for any other keyword.
+  */
+ static int get_rwh_policy_from_update(char *update)
+ {
+ 	static const struct {
+ 		const char *keyword;
+ 		int policy;
+ 	} known[] = {
+ 		{ "ppl",       RWH_MULTIPLE_DISTRIBUTED },
+ 		{ "no-ppl",    RWH_MULTIPLE_OFF },
+ 		{ "bitmap",    RWH_BITMAP },
+ 		{ "no-bitmap", RWH_OFF },
+ 	};
+ 	unsigned int k;
+
+ 	for (k = 0; k < sizeof(known) / sizeof(known[0]); k++) {
+ 		if (strcmp(update, known[k].keyword) == 0)
+ 			return known[k].policy;
+ 	}
+ 	return -1;
+ }
+
+ /* Apply 'update' to the subarray whose index is given (as decimal text) in
+  * 'subarray'.
+  *
+  * Supported updates:
+  *   "name"                       - rename the volume to ident->name
+  *   "ppl" / "no-ppl" /
+  *   "bitmap" / "no-bitmap"       - change the volume's rwh_policy
+  *
+  * When st->update_tail is set the change is queued as a metadata update,
+  * otherwise it is applied to the in-memory metadata directly.
+  *
+  * Returns 0 on success, 2 on any rejection (unknown update, bad or
+  * out-of-range subarray index, active subarray, invalid name).  For the
+  * "bitmap" policy the result of write_init_bitmap_imsm_vol() is returned.
+  */
+ static int update_subarray_imsm(struct supertype *st, char *subarray,
+ 				char *update, struct mddev_ident *ident)
+ {
+ 	struct intel_super *super = st->sb;
+ 	struct imsm_super *mpb = super->anchor;
+
+ 	if (strcmp(update, "name") == 0) {
+ 		char *name = ident->name;
+ 		char *ep;
+ 		int vol;
+
+ 		if (is_subarray_active(subarray, st->devnm)) {
+ 			pr_err("Unable to update name of active subarray\n");
+ 			return 2;
+ 		}
+
+ 		if (!check_name(super, name, 0))
+ 			return 2;
+
+ 		/* strtoul() yields an unsigned long; once stored in an int a
+ 		 * huge or negated input can become negative, which the upper
+ 		 * bound check alone would let through
+ 		 */
+ 		vol = strtoul(subarray, &ep, 10);
+ 		if (*ep != '\0' || vol < 0 ||
+ 		    vol >= super->anchor->num_raid_devs)
+ 			return 2;
+
+ 		if (st->update_tail) {
+ 			struct imsm_update_rename_array *u = xmalloc(sizeof(*u));
+
+ 			u->type = update_rename_array;
+ 			u->dev_idx = vol;
+ 			strncpy((char *) u->name, name, MAX_RAID_SERIAL_LEN);
+ 			u->name[MAX_RAID_SERIAL_LEN-1] = '\0';
+ 			append_metadata_update(st, u, sizeof(*u));
+ 		} else {
+ 			struct imsm_dev *dev;
+ 			int i, namelen;
+
+ 			dev = get_imsm_dev(super, vol);
+ 			/* the volume field is zero padded, not necessarily
+ 			 * NUL terminated
+ 			 */
+ 			memset(dev->volume, '\0', MAX_RAID_SERIAL_LEN);
+ 			namelen = min((int)strlen(name), MAX_RAID_SERIAL_LEN);
+ 			memcpy(dev->volume, name, namelen);
+ 			for (i = 0; i < mpb->num_raid_devs; i++) {
+ 				dev = get_imsm_dev(super, i);
+ 				handle_missing(super, dev);
+ 			}
+ 			super->updates_pending++;
+ 		}
+ 	} else {
+ 		/* look the keyword up once; -1 means "not a policy update" */
+ 		int new_policy = get_rwh_policy_from_update(update);
+ 		char *ep;
+ 		int vol;
+
+ 		if (new_policy == -1)
+ 			return 2;
+
+ 		vol = strtoul(subarray, &ep, 10);
+ 		if (*ep != '\0' || vol < 0 ||
+ 		    vol >= super->anchor->num_raid_devs)
+ 			return 2;
+
+ 		if (st->update_tail) {
+ 			struct imsm_update_rwh_policy *u = xmalloc(sizeof(*u));
+
+ 			u->type = update_rwh_policy;
+ 			u->dev_idx = vol;
+ 			u->new_policy = new_policy;
+ 			append_metadata_update(st, u, sizeof(*u));
+ 		} else {
+ 			struct imsm_dev *dev;
+
+ 			dev = get_imsm_dev(super, vol);
+ 			dev->rwh_policy = new_policy;
+ 			super->updates_pending++;
+ 		}
+ 		if (new_policy == RWH_BITMAP)
+ 			return write_init_bitmap_imsm_vol(st, vol);
+ 	}
+
+ 	return 0;
+ }
+
+ /* True when 'dev' is non-NULL, has a migration in progress and that
+  * migration is of type MIGR_GEN_MIGR.
+  */
+ static bool is_gen_migration(struct imsm_dev *dev)
+ {
+ 	if (!dev || !dev->vol.migr_state)
+ 		return false;
+
+ 	return migr_type(dev) == MIGR_GEN_MIGR;
+ }
+
+ /* Returns 1 when 'dev' is migrating with type MIGR_REBUILD and its second
+  * map (MAP_1) is in the degraded state, 0 otherwise.
+  */
+ static int is_rebuilding(struct imsm_dev *dev)
+ {
+ 	struct imsm_map *prev_map;
+
+ 	if (!dev->vol.migr_state || migr_type(dev) != MIGR_REBUILD)
+ 		return 0;
+
+ 	prev_map = get_imsm_map(dev, MAP_1);
+
+ 	return prev_map->map_state == IMSM_T_STATE_DEGRADED ? 1 : 0;
+ }
+
+ /* Returns 1 when 'dev' is migrating with type MIGR_INIT and its second map
+  * (MAP_1) is still uninitialized, 0 otherwise.
+  */
+ static int is_initializing(struct imsm_dev *dev)
+ {
+ 	struct imsm_map *prev_map;
+
+ 	if (!dev->vol.migr_state || migr_type(dev) != MIGR_INIT)
+ 		return 0;
+
+ 	prev_map = get_imsm_map(dev, MAP_1);
+
+ 	return prev_map->map_state == IMSM_T_STATE_UNINITIALIZED ? 1 : 0;
+ }
+
+ /* For a rebuilding volume, set recovery_start of the single out-of-sync
+  * member in 'array' to the current migration checkpoint
+  * (current migration unit * blocks per migration unit).
+  *
+  * Nothing is changed when the volume is not rebuilding, when more than one
+  * member has recovery_start == 0 (dual rebuild), or when none has.
+  */
+ static void update_recovery_start(struct intel_super *super,
+ 				  struct imsm_dev *dev,
+ 				  struct mdinfo *array)
+ {
+ 	struct mdinfo *rebuild = NULL;
+ 	struct mdinfo *d;
+ 	/* wide type so the checkpoint is not computed in 32-bit arithmetic */
+ 	unsigned long long units;
+
+ 	if (!is_rebuilding(dev))
+ 		return;
+
+ 	/* Find the rebuild target, but punt on the dual rebuild case */
+ 	for (d = array->devs; d; d = d->next)
+ 		if (d->recovery_start == 0) {
+ 			if (rebuild)
+ 				return;
+ 			rebuild = d;
+ 		}
+
+ 	if (!rebuild) {
+ 		/* (?) none of the disks are marked with
+ 		 * IMSM_ORD_REBUILD, so assume they are missing and the
+ 		 * disk_ord_tbl was not correctly updated
+ 		 */
+ 		dprintf("failed to locate out-of-sync disk\n");
+ 		return;
+ 	}
+
+ 	units = vol_curr_migr_unit(dev);
+ 	rebuild->recovery_start = units * blocks_per_migr_unit(super, dev);
+ }
+
+static int recover_backup_imsm(struct supertype *st, struct mdinfo *info);
+
+ /* Build the list of mdinfo structures describing the volumes of a loaded
+  * container.  Returns the head of the list (linked through ->next, last
+  * volume processed first) or NULL when no volume was published.
+  * super->current_vol is changed while iterating and restored on return.
+  */
+ static struct mdinfo *container_content_imsm(struct supertype *st, char *subarray)
+ {
+ /* Given a container loaded by load_super_imsm_all,
+ * extract information about all the arrays into
+ * an mdinfo tree.
+ * If 'subarray' is given, just extract info about that array.
+ *
+ * For each imsm_dev create an mdinfo, fill it in,
+ * then look for matching devices in super->disks
+ * and create appropriate device mdinfo.
+ */
+ struct intel_super *super = st->sb;
+ struct imsm_super *mpb = super->anchor;
+ struct mdinfo *rest = NULL;
+ unsigned int i;
+ int sb_errors = 0;
+ struct dl *d;
+ int spare_disks = 0;
+ int current_vol = super->current_vol;
+
+ /* do not assemble arrays when not all attributes are supported */
+ if (imsm_check_attributes(mpb->attributes) == 0) {
+ sb_errors = 1;
+ pr_err("Unsupported attributes in IMSM metadata.Arrays activation is blocked.\n");
+ }
+
+ /* count spare devices, not used in maps
+ */
+ for (d = super->disks; d; d = d->next)
+ if (d->index == -1)
+ spare_disks++;
+
+ for (i = 0; i < mpb->num_raid_devs; i++) {
+ struct imsm_dev *dev;
+ struct imsm_map *map;
+ struct imsm_map *map2;
+ struct mdinfo *this;
+ int slot;
+ int chunk;
+ char *ep;
+ int level;
+
+ /* when a subarray was requested, skip every other index */
+ if (subarray &&
+ (i != strtoul(subarray, &ep, 10) || *ep != '\0'))
+ continue;
+
+ dev = get_imsm_dev(super, i);
+ map = get_imsm_map(dev, MAP_0);
+ map2 = get_imsm_map(dev, MAP_1);
+ level = get_imsm_raid_level(map);
+
+ /* do not publish arrays that are in the middle of an
+ * unsupported migration
+ */
+ if (dev->vol.migr_state &&
+ (migr_type(dev) == MIGR_STATE_CHANGE)) {
+ pr_err("cannot assemble volume '%.16s': unsupported migration in progress\n",
+ dev->volume);
+ continue;
+ }
+ /* do not publish arrays that are not support by controller's
+ * OROM/EFI
+ */
+
+ this = xmalloc(sizeof(*this));
+
+ super->current_vol = i;
+ getinfo_super_imsm_volume(st, this, NULL);
+ this->next = rest;
+ /* blocks_per_strip is halved here to obtain the chunk value */
+ chunk = __le16_to_cpu(map->blocks_per_strip) >> 1;
+ /* mdadm does not support all metadata features- set the bit in all arrays state */
+ if (!validate_geometry_imsm_orom(super,
+ level, /* RAID level */
+ imsm_level_to_layout(level),
+ map->num_members, /* raid disks */
+ &chunk, imsm_dev_size(dev),
+ 1 /* verbose */)) {
+ pr_err("IMSM RAID geometry validation failed. Array %s activation is blocked.\n",
+ dev->volume);
+ this->array.state |=
+ (1<<MD_SB_BLOCK_CONTAINER_RESHAPE) |
+ (1<<MD_SB_BLOCK_VOLUME);
+ }
+
+ /* if the metadata attributes check above failed, set the blocking bits in this array's state */
+ if (sb_errors)
+ this->array.state |=
+ (1<<MD_SB_BLOCK_CONTAINER_RESHAPE) |
+ (1<<MD_SB_BLOCK_VOLUME);
+
+ for (slot = 0 ; slot < map->num_members; slot++) {
+ unsigned long long recovery_start;
+ struct mdinfo *info_d;
+ struct dl *d;
+ int idx;
+ int skip;
+ __u32 ord;
+ /* note: declared per slot, so it is reset to 0 on every iteration */
+ int missing = 0;
+
+ skip = 0;
+ idx = get_imsm_disk_idx(dev, slot, MAP_0);
+ ord = get_imsm_ord_tbl_ent(dev, slot, MAP_X);
+ /* find the disk entry holding this slot */
+ for (d = super->disks; d ; d = d->next)
+ if (d->index == idx)
+ break;
+
+ recovery_start = MaxSector;
+ if (d == NULL)
+ skip = 1;
+ if (d && is_failed(&d->disk))
+ skip = 1;
+ if (!skip && (ord & IMSM_ORD_REBUILD))
+ recovery_start = 0;
+ if (!(ord & IMSM_ORD_REBUILD))
+ this->array.working_disks++;
+ /*
+ * if we skip some disks the array will be assembled degraded;
+ * reset resync start to avoid a dirty-degraded
+ * situation when performing the initial sync
+ */
+ if (skip)
+ missing++;
+
+ if (!(dev->vol.dirty & RAIDVOL_DIRTY)) {
+ if ((!able_to_resync(level, missing) ||
+ recovery_start == 0))
+ this->resync_start = MaxSector;
+ }
+
+ if (skip)
+ continue;
+
+ /* prepend a device entry for this member */
+ info_d = xcalloc(1, sizeof(*info_d));
+ info_d->next = this->devs;
+ this->devs = info_d;
+
+ info_d->disk.number = d->index;
+ info_d->disk.major = d->major;
+ info_d->disk.minor = d->minor;
+ info_d->disk.raid_disk = slot;
+ info_d->recovery_start = recovery_start;
+ if (map2) {
+ if (slot < map2->num_members)
+ info_d->disk.state = (1 << MD_DISK_ACTIVE);
+ else
+ this->array.spare_disks++;
+ } else {
+ if (slot < map->num_members)
+ info_d->disk.state = (1 << MD_DISK_ACTIVE);
+ else
+ this->array.spare_disks++;
+ }
+
+ info_d->events = __le32_to_cpu(mpb->generation_num);
+ info_d->data_offset = pba_of_lba0(map);
+ info_d->component_size = calc_component_size(map, dev);
+
+ if (map->raid_level == 5) {
+ info_d->ppl_sector = this->ppl_sector;
+ info_d->ppl_size = this->ppl_size;
+ if (this->consistency_policy == CONSISTENCY_POLICY_PPL &&
+ recovery_start == 0)
+ this->resync_start = 0;
+ }
+
+ info_d->bb.supported = 1;
+ get_volume_badblocks(super->bbm_log, ord_to_idx(ord),
+ info_d->data_offset,
+ info_d->component_size,
+ &info_d->bb);
+ }
+ /* now that the disk list is up-to-date fixup recovery_start */
+ update_recovery_start(super, dev, this);
+ this->array.spare_disks += spare_disks;
+
+ /* check for reshape */
+ if (this->reshape_active == 1)
+ recover_backup_imsm(st, this);
+ rest = this;
+ }
+
+ super->current_vol = current_vol;
+ return rest;
+ }
+
+ /* Derive the map state of 'dev' from the number of failed disks.
+  *
+  * failed      - number of failed members (as counted by the caller)
+  * look_in_map - which map of 'dev' to inspect
+  *
+  * With no failures the result is UNINITIALIZED if the map already is,
+  * NORMAL otherwise.  With failures the result depends on the raid level:
+  *   0  - always FAILED
+  *   1  - DEGRADED while fewer than all members failed, else FAILED
+  *   10 - FAILED as soon as both slots of a mirror pair are out of sync,
+  *        DEGRADED otherwise
+  *   5  - DEGRADED for a single failure, FAILED for more
+  * Any other level returns the map's current state unchanged.
+  */
+ static __u8 imsm_check_degraded(struct intel_super *super, struct imsm_dev *dev,
+ 				int failed, int look_in_map)
+ {
+ 	struct imsm_map *map;
+
+ 	map = get_imsm_map(dev, look_in_map);
+
+ 	if (!failed)
+ 		return map->map_state == IMSM_T_STATE_UNINITIALIZED ?
+ 			IMSM_T_STATE_UNINITIALIZED : IMSM_T_STATE_NORMAL;
+
+ 	switch (get_imsm_raid_level(map)) {
+ 	case 0:
+ 		return IMSM_T_STATE_FAILED;
+ 	case 1:
+ 		if (failed < map->num_members)
+ 			return IMSM_T_STATE_DEGRADED;
+ 		else
+ 			return IMSM_T_STATE_FAILED;
+ 	case 10:
+ 	{
+ 		/**
+ 		 * check to see if any mirrors have failed, otherwise we
+ 		 * are degraded. Even numbered slots are mirrored on
+ 		 * slot+1
+ 		 */
+ 		int i;
+ 		/* properly initialized (rather than self-assigned) so no
+ 		 * indeterminate value is ever read; slot 0 resets it anyway
+ 		 */
+ 		int insync = 2;
+
+ 		for (i = 0; i < map->num_members; i++) {
+ 			__u32 ord = get_imsm_ord_tbl_ent(dev, i, MAP_X);
+ 			int idx = ord_to_idx(ord);
+ 			struct imsm_disk *disk;
+
+ 			/* reset the potential in-sync count on even-numbered
+ 			 * slots. num_copies is always 2 for imsm raid10
+ 			 */
+ 			if ((i & 1) == 0)
+ 				insync = 2;
+
+ 			disk = get_imsm_disk(super, idx);
+ 			if (!disk || is_failed(disk) || ord & IMSM_ORD_REBUILD)
+ 				insync--;
+
+ 			/* no in-sync disks left in this mirror the
+ 			 * array has failed
+ 			 */
+ 			if (insync == 0)
+ 				return IMSM_T_STATE_FAILED;
+ 		}
+
+ 		return IMSM_T_STATE_DEGRADED;
+ 	}
+ 	case 5:
+ 		if (failed < 2)
+ 			return IMSM_T_STATE_DEGRADED;
+ 		else
+ 			return IMSM_T_STATE_FAILED;
+ 	default:
+ 		break;
+ 	}
+
+ 	return map->map_state;
+ }
+
+ /* Count the failed members of 'dev'.
+  *
+  * look_in_map selects what is inspected: MAP_0 (current map), MAP_1
+  * (previous map) or MAP_X (both; a slot that refers to the same disk index
+  * in both maps is counted once).  A member counts as failed when its disk
+  * cannot be found, is marked failed, or its ord entry carries
+  * IMSM_ORD_REBUILD.
+  */
+ static int imsm_count_failed(struct intel_super *super, struct imsm_dev *dev,
+ 			     int look_in_map)
+ {
+ 	struct imsm_map *cur = get_imsm_map(dev, MAP_0);
+ 	struct imsm_map *old = get_imsm_map(dev, MAP_1);
+ 	int want_cur = (look_in_map == MAP_0 || look_in_map == MAP_X);
+ 	int want_old = old && (look_in_map == MAP_1 || look_in_map == MAP_X);
+ 	int slots = cur->num_members;
+ 	int bad = 0;
+ 	int slot;
+
+ 	/* at the beginning of migration we set IMSM_ORD_REBUILD on
+ 	 * disks that are being rebuilt.  New failures are recorded to
+ 	 * map[0].  So we look through all the disks we started with and
+ 	 * see if any failures are still present, or if any new ones
+ 	 * have arrived
+ 	 */
+ 	if (old && old->num_members > slots)
+ 		slots = old->num_members;
+
+ 	for (slot = 0; slot < slots; slot++) {
+ 		struct imsm_disk *disk;
+ 		int old_idx = -255;
+ 		__u32 ord;
+
+ 		/* when MAP_X is passed both maps failures are counted */
+ 		if (want_old && slot < old->num_members) {
+ 			ord = __le32_to_cpu(old->disk_ord_tbl[slot]);
+ 			old_idx = ord_to_idx(ord);
+
+ 			disk = get_imsm_disk(super, old_idx);
+ 			if (!disk || is_failed(disk) || ord & IMSM_ORD_REBUILD)
+ 				bad++;
+ 		}
+
+ 		if (want_cur && slot < cur->num_members) {
+ 			int cur_idx;
+
+ 			ord = __le32_to_cpu(cur->disk_ord_tbl[slot]);
+ 			cur_idx = ord_to_idx(ord);
+
+ 			/* same disk already examined through the old map */
+ 			if (cur_idx == old_idx)
+ 				continue;
+
+ 			disk = get_imsm_disk(super, cur_idx);
+ 			if (!disk || is_failed(disk) || ord & IMSM_ORD_REBUILD)
+ 				bad++;
+ 		}
+ 	}
+
+ 	return bad;
+ }
+
+ /* Bind active array 'a' to subarray number 'inst' of container 'c' and apply
+  * an update_prealloc_badblocks_mem update locally.
+  * Returns 0 on success, -ENODEV when 'inst' is not below the number of raid
+  * devices in the container.
+  */
+ static int imsm_open_new(struct supertype *c, struct active_array *a,
+ 			 int inst)
+ {
+ 	struct intel_super *super = c->sb;
+ 	struct imsm_super *mpb = super->anchor;
+ 	struct imsm_update_prealloc_bb_mem request;
+
+ 	if (!(inst < mpb->num_raid_devs)) {
+ 		pr_err("subarry index %d, out of range\n", inst);
+ 		return -ENODEV;
+ 	}
+
+ 	dprintf("imsm: open_new %d\n", inst);
+ 	a->info.container_member = inst;
+
+ 	request.type = update_prealloc_badblocks_mem;
+ 	imsm_update_metadata_locally(c, &request, sizeof(request));
+
+ 	return 0;
+ }
+
+ /* Returns 1 when 'dev' is resyncing: a migration is in progress and it is
+  * either of type MIGR_INIT / MIGR_REPAIR, or (for types other than
+  * MIGR_GEN_MIGR) the second map is in the normal state.  Returns 0 otherwise.
+  */
+ static int is_resyncing(struct imsm_dev *dev)
+ {
+ 	struct imsm_map *prev_map;
+
+ 	if (!dev->vol.migr_state)
+ 		return 0;
+
+ 	if (migr_type(dev) == MIGR_INIT)
+ 		return 1;
+ 	if (migr_type(dev) == MIGR_REPAIR)
+ 		return 1;
+ 	if (migr_type(dev) == MIGR_GEN_MIGR)
+ 		return 0;
+
+ 	prev_map = get_imsm_map(dev, MAP_1);
+
+ 	if (prev_map->map_state != IMSM_T_STATE_NORMAL)
+ 		return 0;
+
+ 	return dev->vol.migr_type != MIGR_GEN_MIGR ? 1 : 0;
+ }
+
+/* return true if we recorded new information */
+ /* Record in the metadata that 'disk' (disk index 'idx') has failed in 'dev':
+  * tag its serial with a ":0" suffix, set FAILED_DISK, flag its slot with
+  * IMSM_ORD_REBUILD in map[0] (and in the second map while migrating), update
+  * failed_disk_num and clear the disk's bad block entries.
+  * Returns 0 when the disk is not in map[0] or the failure was already
+  * recorded, 1 otherwise.
+  */
+ static int mark_failure(struct intel_super *super,
+ struct imsm_dev *dev, struct imsm_disk *disk, int idx)
+ {
+ __u32 ord;
+ int slot;
+ struct imsm_map *map;
+ char buf[MAX_RAID_SERIAL_LEN+3];
+ unsigned int len, shift = 0;
+
+ /* new failures are always set in map[0] */
+ map = get_imsm_map(dev, MAP_0);
+
+ slot = get_imsm_disk_slot(map, idx);
+ if (slot < 0)
+ return 0;
+
+ ord = __le32_to_cpu(map->disk_ord_tbl[slot]);
+ /* already marked failed and flagged for rebuild: nothing new to record */
+ if (is_failed(disk) && (ord & IMSM_ORD_REBUILD))
+ return 0;
+
+ /* append ":0" to the serial; when the result would not fit in
+ * MAX_RAID_SERIAL_LEN bytes including the terminator, drop leading
+ * characters so that the suffix is kept
+ */
+ memcpy(buf, disk->serial, MAX_RAID_SERIAL_LEN);
+ buf[MAX_RAID_SERIAL_LEN] = '\000';
+ strcat(buf, ":0");
+ if ((len = strlen(buf)) >= MAX_RAID_SERIAL_LEN)
+ shift = len - MAX_RAID_SERIAL_LEN + 1;
+ memcpy(disk->serial, &buf[shift], len + 1 - shift);
+
+ disk->status |= FAILED_DISK;
+ set_imsm_ord_tbl_ent(map, slot, idx | IMSM_ORD_REBUILD);
+ /* mark failures in second map if second map exists and this disk
+ * in this slot.
+ * This is valid for migration, initialization and rebuild
+ */
+ if (dev->vol.migr_state) {
+ struct imsm_map *map2 = get_imsm_map(dev, MAP_1);
+ int slot2 = get_imsm_disk_slot(map2, idx);
+
+ if (slot2 < map2->num_members && slot2 >= 0)
+ set_imsm_ord_tbl_ent(map2, slot2,
+ idx | IMSM_ORD_REBUILD);
+ }
+ /* remember the failed slot: always when none is recorded (0xff),
+ * otherwise only a lower slot and only when not rebuilding
+ */
+ if (map->failed_disk_num == 0xff ||
+ (!is_rebuilding(dev) && map->failed_disk_num > slot))
+ map->failed_disk_num = slot;
+
+ clear_disk_badblocks(super->bbm_log, ord_to_idx(ord));
+
+ return 1;
+ }
+
+ /* Mark 'disk' as failed in 'dev' and, unless that was already done, flag it
+  * as missing: scsi_id is set to all ones and the serial is shifted left by
+  * one byte.
+  */
+ static void mark_missing(struct intel_super *super,
+ 			 struct imsm_dev *dev, struct imsm_disk *disk, int idx)
+ {
+ 	mark_failure(super, dev, disk, idx);
+
+ 	/* an all-ones scsi_id means the disk was handled before */
+ 	if (disk->scsi_id != __cpu_to_le32(~(__u32)0)) {
+ 		disk->scsi_id = __cpu_to_le32(~(__u32)0);
+ 		memmove(&disk->serial[0], &disk->serial[1],
+ 			MAX_RAID_SERIAL_LEN - 1);
+ 	}
+ }
+
+ /* Propagate the container's list of missing disks (super->missing) into
+  * 'dev': possibly end a running initialization/rebuild, mark every missing
+  * disk as failed/missing in the volume and flag the metadata as needing an
+  * update.  Does nothing when there are no missing disks.
+  */
+ static void handle_missing(struct intel_super *super, struct imsm_dev *dev)
+ {
+ struct dl *dl;
+
+ if (!super->missing)
+ return;
+
+ /* When orom adds replacement for missing disk it does
+ * not remove entry of missing disk, but just updates map with
+ * new added disk. So it is not enough just to test if there is
+ * any missing disk, we have to look if there are any failed disks
+ * in map to stop migration */
+
+ dprintf("imsm: mark missing\n");
+ /* end process for initialization and rebuild only
+ */
+ if (is_gen_migration(dev) == false) {
+ int failed = imsm_count_failed(super, dev, MAP_0);
+
+ if (failed) {
+ __u8 map_state;
+ struct imsm_map *map = get_imsm_map(dev, MAP_0);
+ struct imsm_map *map1;
+ int i, ord, ord_map1;
+ int rebuilt = 1;
+
+ /* 'rebuilt' stays 1 unless some slot is flagged with
+ * IMSM_ORD_REBUILD in both map[0] and the second map
+ */
+ for (i = 0; i < map->num_members; i++) {
+ ord = get_imsm_ord_tbl_ent(dev, i, MAP_0);
+ if (!(ord & IMSM_ORD_REBUILD))
+ continue;
+
+ map1 = get_imsm_map(dev, MAP_1);
+ if (!map1)
+ continue;
+
+ ord_map1 = __le32_to_cpu(map1->disk_ord_tbl[i]);
+ if (ord_map1 & IMSM_ORD_REBUILD)
+ rebuilt = 0;
+ }
+
+ if (rebuilt) {
+ map_state = imsm_check_degraded(super, dev,
+ failed, MAP_0);
+ end_migration(dev, super, map_state);
+ }
+ }
+ }
+ for (dl = super->missing; dl; dl = dl->next)
+ mark_missing(super, dev, &dl->disk, dl->index);
+ super->updates_pending++;
+ }
+
+ /* Recompute and store the array size of 'dev'.
+  *
+  * new_size <= 0 - derive the size from the per-device size times the number
+  *                 of data members (OLCE: size change caused by added disks)
+  * new_size  > 0 - use the given size (online volume size change)
+  *
+  * The value is passed through round_size_to_mb() before being stored.  When
+  * the map reports no data members nothing is changed and the current size
+  * is returned.  Returns the resulting array size.
+  */
+ static unsigned long long imsm_set_array_size(struct imsm_dev *dev,
+ 					      long long new_size)
+ {
+ 	struct imsm_map *map = get_imsm_map(dev, MAP_0);
+ 	int data_disks = imsm_num_data_members(map);
+ 	unsigned long long blocks;
+
+ 	/* when problems occur return the current array size */
+ 	if (data_disks == 0)
+ 		return imsm_dev_size(dev);
+
+ 	if (new_size > 0)
+ 		/* Online Volume Size Change
+ 		 * Using available free space
+ 		 */
+ 		blocks = new_size;
+ 	else
+ 		/* OLCE size change is caused by added disks */
+ 		blocks = per_dev_array_size(map) * data_disks;
+
+ 	/* set array size in metadata */
+ 	blocks = round_size_to_mb(blocks, data_disks);
+ 	set_imsm_dev_size(dev, blocks);
+
+ 	return blocks;
+ }
+
+static void imsm_set_disk(struct active_array *a, int n, int state);
+
+ /* Put the next volume of the container into reshape (MIGR_GEN_MIGR) mode when
+  * its member count differs from that of the first volume examined.
+  * Returns immediately when any volume visited already has a migration in
+  * progress.
+  */
+ static void imsm_progress_container_reshape(struct intel_super *super)
+ {
+ /* if no device has a migr_state, but some device has a
+ * different number of members than the previous device, start
+ * changing the number of devices in this device to match
+ * previous.
+ */
+ struct imsm_super *mpb = super->anchor;
+ int prev_disks = -1;
+ int i;
+ int copy_map_size;
+
+ for (i = 0; i < mpb->num_raid_devs; i++) {
+ struct imsm_dev *dev = get_imsm_dev(super, i);
+ struct imsm_map *map = get_imsm_map(dev, MAP_0);
+ struct imsm_map *map2;
+ int prev_num_members;
+
+ if (dev->vol.migr_state)
+ return;
+
+ /* the first volume examined sets the reference member count */
+ if (prev_disks == -1)
+ prev_disks = map->num_members;
+ if (prev_disks == map->num_members)
+ continue;
+
+ /* OK, this array needs to enter reshape mode.
+ * i.e it needs a migr_state
+ */
+
+ /* size of the map is taken before num_members is changed */
+ copy_map_size = sizeof_imsm_map(map);
+ prev_num_members = map->num_members;
+ map->num_members = prev_disks;
+ dev->vol.migr_state = 1;
+ set_vol_curr_migr_unit(dev, 0);
+ set_migr_type(dev, MIGR_GEN_MIGR);
+ /* NOTE(review): this loop reuses the outer loop index 'i', so the
+ * outer loop resumes from this loop's final value rather than from
+ * the next raid device - confirm whether that is intended.
+ */
+ for (i = prev_num_members;
+ i < map->num_members; i++)
+ set_imsm_ord_tbl_ent(map, i, i);
+ map2 = get_imsm_map(dev, MAP_1);
+ /* Copy the current map */
+ memcpy(map2, map, copy_map_size);
+ map2->num_members = prev_num_members;
+
+ imsm_set_array_size(dev, -1);
+ super->clean_migration_record_by_mdmon = 1;
+ super->updates_pending++;
+ }
+ }
+
+/* Handle dirty -> clean transitions, resync and reshape. Degraded and rebuild
+ * states are handled in imsm_set_disk() with one exception, when a
+ * resync is stopped due to a new failure this routine will set the
+ * 'degraded' state for the array.
+ *
+ * 'consistent' is the requested state: zero marks the volume dirty, non-zero
+ * marks it clean, and the value 2 additionally runs handle_missing() before
+ * the array is activated. A request of 2 is turned into 0 when the resync is
+ * not complete, the map state is not normal, or a migration is in progress.
+ *
+ * Returns the value of 'consistent' that was finally applied.
+ */
+static int imsm_set_array_state(struct active_array *a, int consistent)
+{
+ int inst = a->info.container_member;
+ struct intel_super *super = a->container->sb;
+ struct imsm_dev *dev = get_imsm_dev(super, inst);
+ struct imsm_map *map = get_imsm_map(dev, MAP_0);
+ int failed = imsm_count_failed(super, dev, MAP_0);
+ __u8 map_state = imsm_check_degraded(super, dev, failed, MAP_0);
+ __u32 blocks_per_unit;
+
+ if (dev->vol.migr_state &&
+ dev->vol.migr_type == MIGR_GEN_MIGR) {
+ /* array state change is blocked due to reshape action
+ * We might need to
+ * - abort the reshape (if last_checkpoint is 0 and action!= reshape)
+ * - finish the reshape (if last_checkpoint is big and action != reshape)
+ * - update vol_curr_migr_unit
+ */
+ if (a->curr_action == reshape) {
+ /* still reshaping, maybe update vol_curr_migr_unit */
+ goto mark_checkpoint;
+ } else {
+ if (a->last_checkpoint == 0 && a->prev_action == reshape) {
+ /* for some reason we aborted the reshape.
+ *
+ * disable automatic metadata rollback
+ * user action is required to recover process
+ */
+ /* the rollback below is deliberately compiled out by 'if (0)' */
+ if (0) {
+ struct imsm_map *map2 =
+ get_imsm_map(dev, MAP_1);
+ dev->vol.migr_state = 0;
+ set_migr_type(dev, 0);
+ set_vol_curr_migr_unit(dev, 0);
+ memcpy(map, map2,
+ sizeof_imsm_map(map2));
+ super->updates_pending++;
+ }
+ }
+ /* checkpoint has reached the component size: finish the reshape */
+ if (a->last_checkpoint >= a->info.component_size) {
+ unsigned long long array_blocks;
+ int used_disks;
+ struct mdinfo *mdi;
+
+ used_disks = imsm_num_data_members(map);
+ if (used_disks > 0) {
+ array_blocks =
+ per_dev_array_size(map) *
+ used_disks;
+ array_blocks =
+ round_size_to_mb(array_blocks,
+ used_disks);
+ a->info.custom_array_size = array_blocks;
+ /* encourage manager to update array
+ * size
+ */
+
+ a->check_reshape = 1;
+ }
+ /* finalize online capacity expansion/reshape */
+ for (mdi = a->info.devs; mdi; mdi = mdi->next)
+ imsm_set_disk(a,
+ mdi->disk.raid_disk,
+ mdi->curr_state);
+
+ imsm_progress_container_reshape(super);
+ }
+ }
+ }
+
+ /* before we activate this array handle any missing disks */
+ if (consistent == 2)
+ handle_missing(super, dev);
+
+ /* an activation request (2) cannot be honoured as 'clean' in these cases */
+ if (consistent == 2 &&
+ (!is_resync_complete(&a->info) ||
+ map_state != IMSM_T_STATE_NORMAL ||
+ dev->vol.migr_state))
+ consistent = 0;
+
+ if (is_resync_complete(&a->info)) {
+ /* complete intialization / resync,
+ * recovery and interrupted recovery is completed in
+ * ->set_disk
+ */
+ if (is_resyncing(dev)) {
+ dprintf("imsm: mark resync done\n");
+ end_migration(dev, super, map_state);
+ super->updates_pending++;
+ a->last_checkpoint = 0;
+ }
+ } else if ((!is_resyncing(dev) && !failed) &&
+ (imsm_reshape_blocks_arrays_changes(super) == 0)) {
+ /* mark the start of the init process if nothing is failed */
+ dprintf("imsm: mark resync start\n");
+ /* uninitialized volumes start MIGR_INIT, all others MIGR_REPAIR */
+ if (map->map_state == IMSM_T_STATE_UNINITIALIZED)
+ migrate(dev, super, IMSM_T_STATE_NORMAL, MIGR_INIT);
+ else
+ migrate(dev, super, IMSM_T_STATE_NORMAL, MIGR_REPAIR);
+ super->updates_pending++;
+ }
+
+mark_checkpoint:
+ /* skip checkpointing for general migration,
+ * it is controlled in mdadm
+ */
+ if (is_gen_migration(dev))
+ goto skip_mark_checkpoint;
+
+ /* check if we can update vol_curr_migr_unit from resync_start,
+ * recovery_start
+ */
+ blocks_per_unit = blocks_per_migr_unit(super, dev);
+ /* a zero unit size would divide by zero below, so it is skipped */
+ if (blocks_per_unit) {
+ set_vol_curr_migr_unit(dev,
+ a->last_checkpoint / blocks_per_unit);
+ dprintf("imsm: mark checkpoint (%llu)\n",
+ vol_curr_migr_unit(dev));
+ super->updates_pending++;
+ }
+
+skip_mark_checkpoint:
+ /* mark dirty / clean */
+ /* only act when the recorded RAIDVOL_DIRTY bit disagrees with 'consistent' */
+ if (((dev->vol.dirty & RAIDVOL_DIRTY) && consistent) ||
+ (!(dev->vol.dirty & RAIDVOL_DIRTY) && !consistent)) {
+ dprintf("imsm: mark '%s'\n", consistent ? "clean" : "dirty");
+ if (consistent) {
+ dev->vol.dirty = RAIDVOL_CLEAN;
+ } else {
+ dev->vol.dirty = RAIDVOL_DIRTY;
+ /* distributed write-hole policies also set RAIDVOL_DSRECORD_VALID */
+ if (dev->rwh_policy == RWH_DISTRIBUTED ||
+ dev->rwh_policy == RWH_MULTIPLE_DISTRIBUTED)
+ dev->vol.dirty |= RAIDVOL_DSRECORD_VALID;
+ }
+ super->updates_pending++;
+ }
+
+ return consistent;
+}
+
+/* Translate a raid slot of array 'a' into its MAP_0 ord-table entry.
+ *
+ * Valid slots are 0 .. num_members - 1. A negative slot yields -1 silently;
+ * a slot at or beyond num_members is reported with pr_err() and also yields
+ * -1. (The original test used '>' and so let slot == num_members through,
+ * contradicting the range printed in its own error message.)
+ */
+static int imsm_disk_slot_to_ord(struct active_array *a, int slot)
+{
+	int inst = a->info.container_member;
+	struct intel_super *super = a->container->sb;
+	struct imsm_dev *dev = get_imsm_dev(super, inst);
+	struct imsm_map *map = get_imsm_map(dev, MAP_0);
+
+	if (slot < 0)
+		return -1;
+
+	if (slot >= map->num_members) {
+		pr_err("imsm: imsm_disk_slot_to_ord %d out of range 0..%d\n",
+		       slot, map->num_members - 1);
+		return -1;
+	}
+
+	return get_imsm_ord_tbl_ent(dev, slot, MAP_0);
+}
+
+/* Record a state change of member slot 'n' of array 'a' in the metadata.
+ *
+ * 'state' is a mask of DS_* flags; DS_FAULTY and DS_INSYNC are the ones
+ * examined here. A faulty member is handed to mark_failure(). A member that
+ * reports in-sync while its ord entry carries IMSM_ORD_REBUILD during a
+ * rebuild gets its plain disk index (ord_to_idx()) written to the MAP_1
+ * table. Afterwards the map state is recomputed with imsm_check_degraded()
+ * and the switch below reacts to the normal / degraded / failed result,
+ * ending or adjusting a running migration where needed. Metadata changes
+ * are flagged through super->updates_pending.
+ */
+static void imsm_set_disk(struct active_array *a, int n, int state)
+{
+ int inst = a->info.container_member;
+ struct intel_super *super = a->container->sb;
+ struct imsm_dev *dev = get_imsm_dev(super, inst);
+ struct imsm_map *map = get_imsm_map(dev, MAP_0);
+ struct imsm_disk *disk;
+ struct mdinfo *mdi;
+ int recovery_not_finished = 0;
+ int failed;
+ int ord;
+ __u8 map_state;
+ int rebuild_done = 0;
+ int i;
+
+ ord = get_imsm_ord_tbl_ent(dev, n, MAP_X);
+ /* negative ord: no usable table entry for this slot, nothing to do */
+ if (ord < 0)
+ return;
+
+ dprintf("imsm: set_disk %d:%x\n", n, state);
+ disk = get_imsm_disk(super, ord_to_idx(ord));
+
+ /* check for new failures */
+ if (disk && (state & DS_FAULTY)) {
+ if (mark_failure(super, dev, disk, ord_to_idx(ord)))
+ super->updates_pending++;
+ }
+
+ /* check if in_sync */
+ if (state & DS_INSYNC && ord & IMSM_ORD_REBUILD && is_rebuilding(dev)) {
+ struct imsm_map *migr_map = get_imsm_map(dev, MAP_1);
+
+ set_imsm_ord_tbl_ent(migr_map, n, ord_to_idx(ord));
+ rebuild_done = 1;
+ super->updates_pending++;
+ }
+
+ /* recompute after the updates above may have changed the failure count */
+ failed = imsm_count_failed(super, dev, MAP_0);
+ map_state = imsm_check_degraded(super, dev, failed, MAP_0);
+
+ /* check if recovery complete, newly degraded, or failed */
+ dprintf("imsm: Detected transition to state ");
+ switch (map_state) {
+ case IMSM_T_STATE_NORMAL: /* transition to normal state */
+ dprintf("normal: ");
+ if (is_rebuilding(dev)) {
+ dprintf_cont("while rebuilding");
+ /* check if recovery is really finished */
+ for (mdi = a->info.devs; mdi ; mdi = mdi->next)
+ if (mdi->recovery_start != MaxSector) {
+ recovery_not_finished = 1;
+ break;
+ }
+ /* mdi points at the first member still recovering (loop broke on it) */
+ if (recovery_not_finished) {
+ dprintf_cont("\n");
+ dprintf("Rebuild has not finished yet, state not changed");
+ if (a->last_checkpoint < mdi->recovery_start) {
+ a->last_checkpoint = mdi->recovery_start;
+ super->updates_pending++;
+ }
+ break;
+ }
+ end_migration(dev, super, map_state);
+ /* ~0: presumably the 'no failed disk' marker - TODO confirm */
+ map->failed_disk_num = ~0;
+ super->updates_pending++;
+ a->last_checkpoint = 0;
+ break;
+ }
+ if (is_gen_migration(dev)) {
+ dprintf_cont("while general migration");
+ /* migration ends only once the checkpoint covers the whole component */
+ if (a->last_checkpoint >= a->info.component_size)
+ end_migration(dev, super, map_state);
+ else
+ map->map_state = map_state;
+ map->failed_disk_num = ~0;
+ super->updates_pending++;
+ break;
+ }
+ break;
+ case IMSM_T_STATE_DEGRADED: /* transition to degraded state */
+ dprintf_cont("degraded: ");
+ if (map->map_state != map_state && !dev->vol.migr_state) {
+ dprintf_cont("mark degraded");
+ map->map_state = map_state;
+ super->updates_pending++;
+ a->last_checkpoint = 0;
+ break;
+ }
+ if (is_rebuilding(dev)) {
+ dprintf_cont("while rebuilding ");
+ if (state & DS_FAULTY) {
+ dprintf_cont("removing failed drive ");
+ /* the slot being rebuilt failed again: the rebuild is over */
+ if (n == map->failed_disk_num) {
+ dprintf_cont("end migration");
+ end_migration(dev, super, map_state);
+ a->last_checkpoint = 0;
+ } else {
+ dprintf_cont("fail detected during rebuild, changing map state");
+ map->map_state = map_state;
+ }
+ super->updates_pending++;
+ }
+
+ /* the rest only applies when this call completed a member's rebuild */
+ if (!rebuild_done)
+ break;
+
+ /* check if recovery is really finished */
+ for (mdi = a->info.devs; mdi ; mdi = mdi->next)
+ if (mdi->recovery_start != MaxSector) {
+ recovery_not_finished = 1;
+ break;
+ }
+ if (recovery_not_finished) {
+ dprintf_cont("\n");
+ dprintf_cont("Rebuild has not finished yet");
+ if (a->last_checkpoint < mdi->recovery_start) {
+ a->last_checkpoint =
+ mdi->recovery_start;
+ super->updates_pending++;
+ }
+ break;
+ }
+
+ dprintf_cont(" Rebuild done, still degraded");
+ end_migration(dev, super, map_state);
+ a->last_checkpoint = 0;
+ super->updates_pending++;
+
+ /* remember a slot still flagged IMSM_ORD_REBUILD (the last one found) */
+ for (i = 0; i < map->num_members; i++) {
+ int idx = get_imsm_ord_tbl_ent(dev, i, MAP_0);
+
+ if (idx & IMSM_ORD_REBUILD)
+ map->failed_disk_num = i;
+ }
+ super->updates_pending++;
+ break;
+ }
+ if (is_gen_migration(dev)) {
+ dprintf_cont("while general migration");
+ if (a->last_checkpoint >= a->info.component_size)
+ end_migration(dev, super, map_state);
+ else {
+ map->map_state = map_state;
+ manage_second_map(super, dev);
+ }
+ super->updates_pending++;
+ break;
+ }
+ if (is_initializing(dev)) {
+ dprintf_cont("while initialization.");
+ map->map_state = map_state;
+ super->updates_pending++;
+ break;
+ }
+ break;
+ case IMSM_T_STATE_FAILED: /* transition to failed state */
+ dprintf_cont("failed: ");
+ if (is_gen_migration(dev)) {
+ dprintf_cont("while general migration");
+ map->map_state = map_state;
+ super->updates_pending++;
+ break;
+ }
+ if (map->map_state != map_state) {
+ dprintf_cont("mark failed");
+ end_migration(dev, super, map_state);
+ super->updates_pending++;
+ a->last_checkpoint = 0;
+ break;
+ }
+ break;
+ default:
+ dprintf_cont("state %i\n", map_state);
+ }
+ dprintf_cont("\n");
+}
+
+/* Write the metadata block 'mpb' to the device open on 'fd'.
+ *
+ * The first sector of the mpb (the anchor) goes to the second to last sector
+ * of the device. When the mpb is larger than one sector, the remaining
+ * sectors are written directly in front of the anchor.
+ *
+ * Returns 0 on success, 1 when the device geometry cannot be read or a seek
+ * or write fails or is short.
+ */
+static int store_imsm_mpb(int fd, struct imsm_super *mpb)
+{
+	/* byte pointer: arithmetic on 'void *' is a GNU extension */
+	const char *buf = (const char *)mpb;
+	__u32 mpb_size = __le32_to_cpu(mpb->mpb_size);
+	unsigned long long dsize;
+	unsigned long long sectors;
+	unsigned int sector_size;
+
+	if (!get_dev_sector_size(fd, NULL, &sector_size))
+		return 1;
+	/* Without a valid size every offset below would be computed from an
+	 * uninitialised value. get_dev_size() is taken to report failure
+	 * with 0, like get_dev_sector_size() above.
+	 */
+	if (!get_dev_size(fd, NULL, &dsize))
+		return 1;
+
+	if (mpb_size > sector_size) {
+		/* -1 to account for anchor */
+		sectors = mpb_sectors(mpb, sector_size) - 1;
+
+		/* write the extended mpb to the sectors preceding the anchor */
+		if (lseek64(fd, dsize - (sector_size * (2 + sectors)),
+			    SEEK_SET) < 0)
+			return 1;
+
+		if ((unsigned long long)write(fd, buf + sector_size,
+		    sector_size * sectors) != sector_size * sectors)
+			return 1;
+	}
+
+	/* first block is stored on second to last sector of the disk */
+	if (lseek64(fd, dsize - (sector_size * 2), SEEK_SET) < 0)
+		return 1;
+
+	if ((unsigned int)write(fd, buf, sector_size) != sector_size)
+		return 1;
+
+	return 0;
+}
+
+/* Write the container metadata out if any update is pending, then clear the
+ * pending counter. Does nothing (besides the debug print) when there are no
+ * pending updates.
+ */
+static void imsm_sync_metadata(struct supertype *container)
+{
+	struct intel_super *super = container->sb;
+
+	dprintf("sync metadata: %d\n", super->updates_pending);
+	if (super->updates_pending) {
+		write_super_imsm(container, 0);
+		super->updates_pending = 0;
+	}
+}
+
+/* Look for the disk that the metadata records at slot 'idx' of array 'a'.
+ *
+ * Returns the matching entry of super->disks, or NULL when no disk carries
+ * that index or the disk found is marked failed.
+ */
+static struct dl *imsm_readd(struct intel_super *super, int idx, struct active_array *a)
+{
+	struct imsm_dev *dev = get_imsm_dev(super, a->info.container_member);
+	int wanted = get_imsm_disk_idx(dev, idx, MAP_X);
+	struct dl *cand = super->disks;
+
+	while (cand && cand->index != wanted)
+		cand = cand->next;
+
+	if (!cand || is_failed(&cand->disk))
+		return NULL;
+
+	dprintf("found %x:%x\n", cand->major, cand->minor);
+	return cand;
+}
+
+/* Find a container disk that can take over 'slot' of array 'a'.
+ *
+ * A disk from super->disks is rejected when it
+ *  - is already part of the array (valid state_fd, same major/minor),
+ *  - is already listed in 'additional_test_list',
+ *  - is failed, is the disk currently recorded for the slot, or has
+ *    index -2,
+ *  - is a pristine spare (index -1) while 'activate_new' is zero,
+ *  - does not pass drive_validate_sector_size(), or
+ *  - lacks a free extent covering the data area of any volume it is not
+ *    already a member of.
+ *
+ * Returns the first acceptable disk, or NULL if there is none.
+ */
+static struct dl *imsm_add_spare(struct intel_super *super, int slot,
+				 struct active_array *a, int activate_new,
+				 struct mdinfo *additional_test_list)
+{
+	struct imsm_dev *dev = get_imsm_dev(super, a->info.container_member);
+	int idx = get_imsm_disk_idx(dev, slot, MAP_X);
+	struct imsm_super *mpb = super->anchor;
+	struct imsm_map *map;
+	unsigned long long pos;
+	struct mdinfo *d;
+	struct extent *ex;
+	int i, j;
+	int found;
+	/* 64-bit: these are compared against the unsigned long long extent
+	 * positions; the former __u32 type truncated large sector offsets.
+	 */
+	unsigned long long array_start = 0;
+	unsigned long long array_end = 0;
+	struct dl *dl;
+	struct mdinfo *test_list;
+
+	for (dl = super->disks; dl; dl = dl->next) {
+		/* If in this array, skip */
+		for (d = a->info.devs ; d ; d = d->next)
+			if (is_fd_valid(d->state_fd) &&
+			    d->disk.major == dl->major &&
+			    d->disk.minor == dl->minor) {
+				dprintf("%x:%x already in array\n",
+					dl->major, dl->minor);
+				break;
+			}
+		if (d)
+			continue;
+		test_list = additional_test_list;
+		while (test_list) {
+			if (test_list->disk.major == dl->major &&
+			    test_list->disk.minor == dl->minor) {
+				dprintf("%x:%x already in additional test list\n",
+					dl->major, dl->minor);
+				break;
+			}
+			test_list = test_list->next;
+		}
+		if (test_list)
+			continue;
+
+		/* skip in use or failed drives */
+		if (is_failed(&dl->disk) || idx == dl->index ||
+		    dl->index == -2) {
+			dprintf("%x:%x status (failed: %d index: %d)\n",
+				dl->major, dl->minor, is_failed(&dl->disk), idx);
+			continue;
+		}
+
+		/* skip pure spares when we are looking for partially
+		 * assimilated drives
+		 */
+		if (dl->index == -1 && !activate_new)
+			continue;
+
+		if (!drive_validate_sector_size(super, dl))
+			continue;
+
+		/* Does this unused device have the requisite free space?
+		 * It needs to be able to cover all member volumes
+		 */
+		ex = get_extents(super, dl, 1);
+		if (!ex) {
+			dprintf("cannot get extents\n");
+			continue;
+		}
+		for (i = 0; i < mpb->num_raid_devs; i++) {
+			dev = get_imsm_dev(super, i);
+			map = get_imsm_map(dev, MAP_0);
+
+			/* check if this disk is already a member of
+			 * this array
+			 */
+			if (get_imsm_disk_slot(map, dl->index) >= 0)
+				continue;
+
+			found = 0;
+			j = 0;
+			pos = 0;
+			array_start = pba_of_lba0(map);
+			array_end = array_start +
+				    per_dev_array_size(map) - 1;
+
+			/* walk the used extents; the list ends with an entry
+			 * of size zero, which is still examined once
+			 */
+			do {
+				/* check that we can start at pba_of_lba0 with
+				 * num_data_stripes*blocks_per_stripe of space
+				 */
+				if (array_start >= pos && array_end < ex[j].start) {
+					found = 1;
+					break;
+				}
+				pos = ex[j].start + ex[j].size;
+				j++;
+			} while (ex[j-1].size);
+
+			if (!found)
+				break;
+		}
+
+		free(ex);
+		/* leaving the volume loop early means one volume did not fit */
+		if (i < mpb->num_raid_devs) {
+			dprintf("%x:%x does not have %llu to %llu available\n",
+				dl->major, dl->minor, array_start, array_end);
+			/* No room */
+			continue;
+		}
+		return dl;
+	}
+
+	/* list exhausted: dl is NULL here */
+	return dl;
+}
+
+/* Decide whether volume 'dev_idx' of container 'cont' permits a rebuild of
+ * another volume.
+ *
+ * Returns 0 only when the volume is in the failed state and one of its
+ * members is a failed disk that is still on the container's disk list and
+ * has not been marked for removal (DISK_REMOVE). Returns 1 in every other
+ * case, including when the volume or its map cannot be found.
+ */
+static int imsm_rebuild_allowed(struct supertype *cont, int dev_idx, int failed)
+{
+	struct imsm_dev *vol;
+	struct imsm_map *vol_map;
+	__u8 state;
+	int slot;
+
+	vol = get_imsm_dev(cont->sb, dev_idx);
+	if (!vol)
+		return 1;
+
+	state = imsm_check_degraded(cont->sb, vol, failed, MAP_0);
+	if (state != IMSM_T_STATE_FAILED)
+		return 1;
+
+	vol_map = get_imsm_map(vol, MAP_0);
+	if (!vol_map)
+		return 1;
+
+	for (slot = 0; slot < vol_map->num_members; slot++) {
+		/*
+		 * Check if failed disks are deleted from intel
+		 * disk list or are marked to be deleted
+		 */
+		int disk_idx = get_imsm_disk_idx(vol, slot, MAP_X);
+		struct dl *member = get_imsm_dl_disk(cont->sb, disk_idx);
+
+		if (!member)
+			continue;
+		/*
+		 * Do not rebuild the array if failed disks
+		 * from failed sub-array are not removed from
+		 * container.
+		 */
+		if (is_failed(&member->disk) &&
+		    member->action != DISK_REMOVE)
+			return 0;
+	}
+
+	return 1;
+}
+
+/* Choose replacement disks for the non-working slots of array 'a'.
+ *
+ * Returns a list of mdinfo entries, one per chosen disk, or NULL when no
+ * spare can or should be activated. When the list is not empty, a
+ * metadata_update holding one imsm_update_activate_spare record per entry is
+ * linked in at the head of '*updates'.
+ */
+static struct mdinfo *imsm_activate_spare(struct active_array *a,
+ struct metadata_update **updates)
+{
+ /**
+ * Find a device with unused free space and use it to replace a
+ * failed/vacant region in an array. We replace failed regions one
+ * array at a time. The result is that a new spare disk will be added
+ * to the first failed array and after the monitor has finished
+ * propagating failures the remainder will be consumed.
+ *
+ * FIXME add a capability for mdmon to request spares from another
+ * container.
+ */
+
+ struct intel_super *super = a->container->sb;
+ int inst = a->info.container_member;
+ struct imsm_dev *dev = get_imsm_dev(super, inst);
+ struct imsm_map *map = get_imsm_map(dev, MAP_0);
+ int failed = a->info.array.raid_disks;
+ struct mdinfo *rv = NULL;
+ struct mdinfo *d;
+ struct mdinfo *di;
+ struct metadata_update *mu;
+ struct dl *dl;
+ struct imsm_update_activate_spare *u;
+ int num_spares = 0;
+ int i;
+ int allowed;
+
+ /* 'failed' starts at raid_disks and drops by one for every member that
+ * has a valid state_fd and is not faulty
+ */
+ for (d = a->info.devs ; d; d = d->next) {
+ if (!is_fd_valid(d->state_fd))
+ continue;
+
+ if (d->curr_state & DS_FAULTY)
+ /* wait for Removal to happen */
+ return NULL;
+
+ failed--;
+ }
+
+ dprintf("imsm: activate spare: inst=%d failed=%d (%d) level=%d\n",
+ inst, failed, a->info.array.raid_disks, a->info.array.level);
+
+ if (imsm_reshape_blocks_arrays_changes(super))
+ return NULL;
+
+ /* Cannot activate another spare if rebuild is in progress already
+ */
+ if (is_rebuilding(dev)) {
+ dprintf("imsm: No spare activation allowed. Rebuild in progress already.\n");
+ return NULL;
+ }
+
+ if (a->info.array.level == 4)
+ /* No repair for takeovered array
+ * imsm doesn't support raid4
+ */
+ return NULL;
+
+ /* only a degraded array is a candidate for spare activation */
+ if (imsm_check_degraded(super, dev, failed, MAP_0) !=
+ IMSM_T_STATE_DEGRADED)
+ return NULL;
+
+ if (get_imsm_map(dev, MAP_0)->map_state == IMSM_T_STATE_UNINITIALIZED) {
+ dprintf("imsm: No spare activation allowed. Volume is not initialized.\n");
+ return NULL;
+ }
+
+ /*
+ * If there are any failed disks check state of the other volume.
+ * Block rebuild if the another one is failed until failed disks
+ * are removed from container.
+ */
+ if (failed) {
+ dprintf("found failed disks in %.*s, check if there anotherfailed sub-array.\n",
+ MAX_RAID_SERIAL_LEN, dev->volume);
+ /* check if states of the other volumes allow for rebuild */
+ for (i = 0; i < super->anchor->num_raid_devs; i++) {
+ if (i != inst) {
+ allowed = imsm_rebuild_allowed(a->container,
+ i, failed);
+ if (!allowed)
+ return NULL;
+ }
+ }
+ }
+
+ /* For each slot, if it is not working, find a spare */
+ for (i = 0; i < a->info.array.raid_disks; i++) {
+ for (d = a->info.devs ; d ; d = d->next)
+ if (d->disk.raid_disk == i)
+ break;
+ dprintf("found %d: %p %x\n", i, d, d?d->curr_state:0);
+ if (d && is_fd_valid(d->state_fd))
+ continue;
+
+ /*
+ * OK, this device needs recovery. Try to re-add the
+ * previous occupant of this slot, if this fails see if
+ * we can continue the assimilation of a spare that was
+ * partially assimilated, finally try to activate a new
+ * spare.
+ */
+ /* 'rv' is passed so disks already chosen in this call are not picked twice */
+ dl = imsm_readd(super, i, a);
+ if (!dl)
+ dl = imsm_add_spare(super, i, a, 0, rv);
+ if (!dl)
+ dl = imsm_add_spare(super, i, a, 1, rv);
+ if (!dl)
+ continue;
+
+ /* found a usable disk with enough space */
+ di = xcalloc(1, sizeof(*di));
+
+ /* dl->index will be -1 in the case we are activating a
+ * pristine spare. imsm_process_update() will create a
+ * new index in this case. Once a disk is found to be
+ * failed in all member arrays it is kicked from the
+ * metadata
+ */
+ di->disk.number = dl->index;
+
+ /* (ab)use di->devs to store a pointer to the device
+ * we chose
+ */
+ di->devs = (struct mdinfo *) dl;
+
+ di->disk.raid_disk = i;
+ di->disk.major = dl->major;
+ di->disk.minor = dl->minor;
+ di->disk.state = 0;
+ di->recovery_start = 0;
+ di->data_offset = pba_of_lba0(map);
+ di->component_size = a->info.component_size;
+ di->container_member = inst;
+ di->bb.supported = 1;
+ if (a->info.consistency_policy == CONSISTENCY_POLICY_PPL) {
+ di->ppl_sector = get_ppl_sector(super, inst);
+ di->ppl_size = MULTIPLE_PPL_AREA_SIZE_IMSM >> 9;
+ }
+ super->random = random32();
+ /* push onto the front of the result list */
+ di->next = rv;
+ rv = di;
+ num_spares++;
+ dprintf("%x:%x to be %d at %llu\n", dl->major, dl->minor,
+ i, di->data_offset);
+ }
+
+ if (!rv)
+ /* No spares found */
+ return rv;
+ /* Now 'rv' has a list of devices to return.
+ * Create a metadata_update record to update the
+ * disk_ord_tbl for the array
+ */
+ mu = xmalloc(sizeof(*mu));
+ mu->buf = xcalloc(num_spares,
+ sizeof(struct imsm_update_activate_spare));
+ mu->space = NULL;
+ mu->space_list = NULL;
+ mu->len = sizeof(struct imsm_update_activate_spare) * num_spares;
+ mu->next = *updates;
+ u = (struct imsm_update_activate_spare *) mu->buf;
+
+ /* the records sit back to back in mu->buf and are chained via ->next */
+ for (di = rv ; di ; di = di->next) {
+ u->type = update_activate_spare;
+ u->dl = (struct dl *) di->devs;
+ /* the borrowed di->devs pointer is handed over and cleared */
+ di->devs = NULL;
+ u->slot = di->disk.raid_disk;
+ u->array = inst;
+ u->next = u + 1;
+ u++;
+ }
+ /* terminate the chain at the last record that was filled in */
+ (u-1)->next = NULL;
+ *updates = mu;
+
+ return rv;
+}
+
+/* Check whether the array described by the create-array update 'u' shares a
+ * disk with the existing volume 'idx'.
+ *
+ * Disks are matched by serial number. Members of the existing volume for
+ * which get_imsm_disk() finds no disk are skipped; they cannot overlap and
+ * must not be dereferenced.
+ *
+ * Returns 1 if at least one disk is used by both, 0 otherwise.
+ */
+static int disks_overlap(struct intel_super *super, int idx, struct imsm_update_create_array *u)
+{
+	struct imsm_dev *dev = get_imsm_dev(super, idx);
+	struct imsm_map *map = get_imsm_map(dev, MAP_0);
+	struct imsm_map *new_map = get_imsm_map(&u->dev, MAP_0);
+	struct disk_info *inf = get_disk_info(u);
+	struct imsm_disk *disk;
+	int i;
+	int j;
+
+	for (i = 0; i < map->num_members; i++) {
+		disk = get_imsm_disk(super, get_imsm_disk_idx(dev, i, MAP_X));
+		if (!disk)
+			continue;
+		for (j = 0; j < new_map->num_members; j++)
+			if (serialcmp(disk->serial, inf[j].serial) == 0)
+				return 1;
+	}
+
+	return 0;
+}
+
+/* Return the entry of super->disks with the given device numbers, or NULL
+ * when no such disk is on the list.
+ */
+static struct dl *get_disk_super(struct intel_super *super, int major, int minor)
+{
+	struct dl *cur = super->disks;
+
+	while (cur) {
+		if (cur->major == major && cur->minor == minor)
+			break;
+		cur = cur->next;
+	}
+	return cur;
+}
+
+/* Unlink the first disk with the given device numbers from super->disks and
+ * release it with __free_imsm_disk(). Always returns 0, whether or not a
+ * disk was found.
+ */
+static int remove_disk_super(struct intel_super *super, int major, int minor)
+{
+	struct dl **link;
+
+	/* 'link' always points at the pointer that refers to the current
+	 * entry, so unlinking needs no special case for the list head
+	 */
+	for (link = &super->disks; *link; link = &(*link)->next) {
+		struct dl *entry = *link;
+
+		if (entry->major != major || entry->minor != minor)
+			continue;
+
+		/* remove */
+		*link = entry->next;
+		entry->next = NULL;
+		__free_imsm_disk(entry, 1);
+		dprintf("removed %x:%x\n", major, minor);
+		break;
+	}
+	return 0;
+}
+
+static void imsm_delete(struct intel_super *super, struct dl **dlp, unsigned index);
+
+/* Drain super->disk_mgmt_list, applying each queued add/remove request.
+ *
+ * DISK_ADD:    the request entry itself is moved to the head of super->disks.
+ * DISK_REMOVE: the matching disk on super->disks is marked DISK_REMOVE; a
+ *              spare (index -1) is dropped from the list right away, any
+ *              other disk hands its fd over to the request entry. The
+ *              request entry is then released.
+ *
+ * Returns 1 if at least one disk was added, 0 otherwise.
+ */
+static int add_remove_disk_update(struct intel_super *super)
+{
+	int check_degraded = 0;
+
+	/* add/remove some spares to/from the metadata/container */
+	while (super->disk_mgmt_list) {
+		struct dl *req = super->disk_mgmt_list;
+		struct dl *known;
+
+		/* detach the request from the queue */
+		super->disk_mgmt_list = req->next;
+		req->next = NULL;
+
+		if (req->action == DISK_ADD) {
+			req->next = super->disks;
+			super->disks = req;
+			check_degraded = 1;
+			dprintf("added %x:%x\n",
+				req->major, req->minor);
+			continue;
+		}
+
+		if (req->action != DISK_REMOVE)
+			continue;
+
+		dprintf("Disk remove action processed: %x.%x\n",
+			req->major, req->minor);
+		known = get_disk_super(super, req->major, req->minor);
+		if (known) {
+			/* store action status */
+			known->action = DISK_REMOVE;
+			/* remove spare disks only */
+			if (known->index == -1) {
+				remove_disk_super(super,
+						  req->major,
+						  req->minor);
+			} else {
+				req->fd = known->fd;
+				known->fd = -1;
+			}
+		}
+		/* release allocate disk structure */
+		__free_imsm_disk(req, 1);
+	}
+	return check_degraded;
+}
+
+/* Apply a reshape-migration update (level and/or chunk size change) to
+ * volume u->subdev.
+ *
+ * A fresh imsm_dev buffer is taken from '*space_list', filled from the
+ * current device, switched into MIGR_GEN_MIGR and installed in the device
+ * list; the old buffer is put back on '*space_list'. For a raid0 -> raid5
+ * change the map gains one member, taken from u->new_disks[0].
+ *
+ * Returns 1 when the update was applied, 0 on any error (bad subdev, no
+ * buffer available, migration already in progress, missing disk).
+ */
+static int apply_reshape_migration_update(struct imsm_update_reshape_migration *u,
+					  struct intel_super *super,
+					  void ***space_list)
+{
+	struct intel_dev *id;
+	void **tofree = NULL;
+	int ret_val = 0;
+
+	dprintf("(enter)\n");
+	if (u->subdev < 0 || u->subdev > 1) {
+		dprintf("imsm: Error: Wrong subdev: %i\n", u->subdev);
+		return ret_val;
+	}
+	if (space_list == NULL || *space_list == NULL) {
+		dprintf("imsm: Error: Memory is not allocated\n");
+		return ret_val;
+	}
+
+	for (id = super->devlist ; id; id = id->next) {
+		if (id->index == (unsigned)u->subdev) {
+			struct imsm_dev *dev = get_imsm_dev(super, u->subdev);
+			struct imsm_map *map;
+			struct imsm_dev *new_dev =
+				(struct imsm_dev *)*space_list;
+			struct imsm_map *migr_map = get_imsm_map(dev, MAP_1);
+			int to_state;
+			struct dl *new_disk;
+
+			if (new_dev == NULL)
+				return ret_val;
+			/* pop the buffer off the free list */
+			*space_list = **space_list;
+			memcpy(new_dev, dev, sizeof_imsm_dev(dev, 0));
+			map = get_imsm_map(new_dev, MAP_0);
+			if (migr_map) {
+				dprintf("imsm: Error: migration in progress");
+				return ret_val;
+			}
+
+			to_state = map->map_state;
+			if ((u->new_level == 5) && (map->raid_level == 0)) {
+				map->num_members++;
+				/* this should not happen */
+				if (u->new_disks[0] < 0) {
+					map->failed_disk_num =
+						map->num_members - 1;
+					to_state = IMSM_T_STATE_DEGRADED;
+				} else
+					to_state = IMSM_T_STATE_NORMAL;
+			}
+			migrate(new_dev, super, to_state, MIGR_GEN_MIGR);
+			if (u->new_level > -1)
+				map->raid_level = u->new_level;
+			migr_map = get_imsm_map(new_dev, MAP_1);
+			if ((u->new_level == 5) &&
+			    (migr_map->raid_level == 0)) {
+				int ord = map->num_members - 1;
+				/* the second map keeps the old member count */
+				migr_map->num_members--;
+				if (u->new_disks[0] < 0)
+					ord |= IMSM_ORD_REBUILD;
+				set_imsm_ord_tbl_ent(map,
+						     map->num_members - 1,
+						     ord);
+			}
+			id->dev = new_dev;
+			tofree = (void **)dev;
+
+			/* update chunk size
+			 */
+			if (u->new_chunksize > 0) {
+				struct imsm_map *dest_map =
+					get_imsm_map(dev, MAP_0);
+				int used_disks =
+					imsm_num_data_members(dest_map);
+
+				if (used_disks == 0)
+					return ret_val;
+
+				map->blocks_per_strip =
+					__cpu_to_le16(u->new_chunksize * 2);
+				update_num_data_stripes(map, imsm_dev_size(dev));
+			}
+
+			/* ensure blocks_per_member has valid value
+			 */
+			set_blocks_per_member(map,
+					      per_dev_array_size(map) +
+					      NUM_BLOCKS_DIRTY_STRIPE_REGION);
+
+			/* add disk
+			 */
+			if (u->new_level != 5 || migr_map->raid_level != 0 ||
+			    migr_map->raid_level == map->raid_level)
+				goto skip_disk_add;
+
+			if (u->new_disks[0] >= 0) {
+				/* use passes spare
+				 */
+				new_disk = get_disk_super(super,
+							major(u->new_disks[0]),
+							minor(u->new_disks[0]));
+				/* new_disk may be NULL here: do not
+				 * dereference it for the debug print
+				 */
+				dprintf("imsm: new disk for reshape is: %i:%i (%p, index = %i)\n",
+					major(u->new_disks[0]),
+					minor(u->new_disks[0]),
+					new_disk,
+					new_disk ? new_disk->index : -1);
+				if (new_disk == NULL)
+					goto error_disk_add;
+
+				new_disk->index = map->num_members - 1;
+				/* slot to fill in autolayout
+				 */
+				new_disk->raiddisk = new_disk->index;
+				new_disk->disk.status |= CONFIGURED_DISK;
+				new_disk->disk.status &= ~SPARE_DISK;
+			} else
+				goto error_disk_add;
+
+skip_disk_add:
+			/* chain the old device buffer in front of the free list */
+			*tofree = *space_list;
+			/* calculate new size
+			 */
+			imsm_set_array_size(new_dev, -1);
+
+			ret_val = 1;
+		}
+	}
+
+	if (tofree)
+		*space_list = tofree;
+	return ret_val;
+
+error_disk_add:
+	dprintf("Error: imsm: Cannot find disk.\n");
+	return ret_val;
+}
+
+/* Apply a volume size change to volume u->subdev.
+ *
+ * Updates the array size, the blocks-per-member value (new size divided by
+ * the number of data members, plus the dirty stripe region) and the number
+ * of data stripes in the metadata.
+ *
+ * Returns 1 when the volume was found and updated, 0 when the subdev is out
+ * of range, not on the device list, or its map has no data members.
+ */
+static int apply_size_change_update(struct imsm_update_size_change *u,
+				    struct intel_super *super)
+{
+	struct intel_dev *id;
+	struct imsm_dev *dev;
+	struct imsm_map *map;
+	unsigned long long per_disk;
+	unsigned long long member_blocks;
+	int data_disks;
+
+	dprintf("(enter)\n");
+	if (u->subdev < 0 || u->subdev > 1) {
+		dprintf("imsm: Error: Wrong subdev: %i\n", u->subdev);
+		return 0;
+	}
+
+	/* make sure the volume is actually on the device list */
+	for (id = super->devlist; id; id = id->next)
+		if (id->index == (unsigned)u->subdev)
+			break;
+	if (!id)
+		return 0;
+
+	dev = get_imsm_dev(super, u->subdev);
+	map = get_imsm_map(dev, MAP_0);
+	data_disks = imsm_num_data_members(map);
+	if (data_disks == 0)
+		return 0;
+
+	/* calculate new size
+	 */
+	per_disk = u->new_size / data_disks;
+	member_blocks = per_disk + NUM_BLOCKS_DIRTY_STRIPE_REGION;
+
+	imsm_set_array_size(dev, u->new_size);
+	set_blocks_per_member(map, member_blocks);
+	update_num_data_stripes(map, u->new_size);
+
+	return 1;
+}
+
+/* Prepare the spare named in update 'u' before it joins its volume.
+ *
+ * Additional initialization (adding bitmap header, filling the bitmap area
+ * with '1's to force initial rebuild for a whole data-area) is required when
+ * adding the spare to the volume with write-intent bitmap. Arrays that are
+ * not the target of 'u' or do not use the bitmap policy are left alone.
+ *
+ * Returns 1 on success (including "nothing to do"), 0 when writing the
+ * bitmap failed.
+ */
+static int prepare_spare_to_activate(struct supertype *st,
+				     struct imsm_update_activate_spare *u)
+{
+	struct intel_super *super = st->sb;
+	int saved_vol = super->current_vol;
+	struct active_array *a;
+	int ret = 1;
+
+	for (a = st->arrays; a; a = a->next) {
+		struct dl *spare;
+
+		if (a->info.container_member != u->array ||
+		    a->info.consistency_policy != CONSISTENCY_POLICY_BITMAP)
+			continue;
+
+		/* the update must refer to a disk of this container */
+		spare = super->disks;
+		while (spare && spare != u->dl)
+			spare = spare->next;
+		if (!spare)
+			break;
+
+		/* write_bitmap works on the current volume: select it for
+		 * the call and restore the previous selection afterwards
+		 */
+		super->current_vol = u->array;
+		if (st->ss->write_bitmap(st, spare->fd, NoUpdate))
+			ret = 0;
+		super->current_vol = saved_vol;
+	}
+	return ret;
+}
+
+/* Apply a chain of activate-spare updates to volume u->array.
+ *
+ * For every record in the chain the chosen disk is configured, entered into
+ * the ord tables of both maps (flagged IMSM_ORD_REBUILD in the second map)
+ * and the volume is put into MIGR_REBUILD on the first record. The disk
+ * previously recorded for the slot (the "victim") is deleted once none of
+ * the arrays on 'active_array' reference it any more.
+ *
+ * Returns 1 on success, 0 when a slot has no valid victim index or the
+ * update names a disk that is not on super->disks.
+ */
+static int apply_update_activate_spare(struct imsm_update_activate_spare *u,
+				       struct intel_super *super,
+				       struct active_array *active_array)
+{
+	struct imsm_super *mpb = super->anchor;
+	struct imsm_dev *dev = get_imsm_dev(super, u->array);
+	struct imsm_map *map = get_imsm_map(dev, MAP_0);
+	struct imsm_map *migr_map;
+	struct active_array *a;
+	struct imsm_disk *disk;
+	__u8 to_state;
+	struct dl *dl;
+	unsigned int found;
+	int failed;
+	int victim;
+	int i;
+	int second_map_created = 0;
+
+	for (; u; u = u->next) {
+		victim = get_imsm_disk_idx(dev, u->slot, MAP_X);
+
+		if (victim < 0)
+			return 0;
+
+		for (dl = super->disks; dl; dl = dl->next)
+			if (dl == u->dl)
+				break;
+
+		if (!dl) {
+			pr_err("error: imsm_activate_spare passed an unknown disk (index: %d)\n",
+			       u->dl->index);
+			return 0;
+		}
+
+		/* count failures (excluding rebuilds and the victim)
+		 * to determine map[0] state
+		 */
+		failed = 0;
+		for (i = 0; i < map->num_members; i++) {
+			if (i == u->slot)
+				continue;
+			disk = get_imsm_disk(super,
+					     get_imsm_disk_idx(dev, i, MAP_X));
+			if (!disk || is_failed(disk))
+				failed++;
+		}
+
+		/* adding a pristine spare, assign a new index */
+		if (dl->index < 0) {
+			dl->index = super->anchor->num_disks;
+			super->anchor->num_disks++;
+		}
+		disk = &dl->disk;
+		disk->status |= CONFIGURED_DISK;
+		disk->status &= ~SPARE_DISK;
+
+		/* mark rebuild */
+		to_state = imsm_check_degraded(super, dev, failed, MAP_0);
+		if (!second_map_created) {
+			second_map_created = 1;
+			map->map_state = IMSM_T_STATE_DEGRADED;
+			migrate(dev, super, to_state, MIGR_REBUILD);
+		} else
+			map->map_state = to_state;
+		migr_map = get_imsm_map(dev, MAP_1);
+		set_imsm_ord_tbl_ent(map, u->slot, dl->index);
+		set_imsm_ord_tbl_ent(migr_map, u->slot,
+				     dl->index | IMSM_ORD_REBUILD);
+
+		/* update the family_num to mark a new container
+		 * generation, being careful to record the existing
+		 * family_num in orig_family_num to clean up after
+		 * earlier mdadm versions that neglected to set it.
+		 */
+		if (mpb->orig_family_num == 0)
+			mpb->orig_family_num = mpb->family_num;
+		mpb->family_num += super->random;
+
+		/* count arrays using the victim in the metadata
+		 *
+		 * Loop-local pointers are used on purpose: 'dev' and 'map'
+		 * must keep referring to volume u->array for the remaining
+		 * records of the chain. The original code overwrote them
+		 * here.
+		 */
+		found = 0;
+		for (a = active_array; a ; a = a->next) {
+			struct imsm_dev *a_dev =
+				get_imsm_dev(super, a->info.container_member);
+			struct imsm_map *a_map = get_imsm_map(a_dev, MAP_0);
+
+			if (get_imsm_disk_slot(a_map, victim) >= 0)
+				found++;
+		}
+
+		/* delete the victim if it is no longer being
+		 * utilized anywhere
+		 */
+		if (!found) {
+			struct dl **dlp;
+
+			/* We know that 'manager' isn't touching anything,
+			 * so it is safe to delete
+			 */
+			for (dlp = &super->disks; *dlp; dlp = &(*dlp)->next)
+				if ((*dlp)->index == victim)
+					break;
+
+			/* victim may be on the missing list */
+			if (!*dlp)
+				for (dlp = &super->missing; *dlp;
+				     dlp = &(*dlp)->next)
+					if ((*dlp)->index == victim)
+						break;
+			imsm_delete(super, dlp, victim);
+		}
+	}
+
+	return 1;
+}
+
+/* Apply a container reshape update that grows the number of raid disks from
+ * u->old_raid_disks to u->new_raid_disks.
+ *
+ * The disks named in u->new_disks are turned from spares into configured
+ * members and given consecutive indexes after the old ones. Every volume is
+ * then copied into a buffer taken from '*space_list'; the first volume is
+ * additionally switched into MIGR_GEN_MIGR with the enlarged map, keeping
+ * the old map as the second map. The replaced device buffers are returned
+ * through '*space_list'.
+ *
+ * Returns 1 on success, 0 when a new disk cannot be found or already holds
+ * an index below u->old_raid_disks.
+ */
+static int apply_reshape_container_disks_update(struct imsm_update_reshape *u,
+						struct intel_super *super,
+						void ***space_list)
+{
+	struct dl *new_disk;
+	struct intel_dev *id;
+	int i;
+	int delta_disks = u->new_raid_disks - u->old_raid_disks;
+	int disk_count = u->old_raid_disks;
+	void **tofree = NULL;
+	int devices_to_reshape = 1;
+	struct imsm_super *mpb = super->anchor;
+	int ret_val = 0;
+	unsigned int dev_id;
+
+	dprintf("(enter)\n");
+
+	/* enable spares to use in array */
+	for (i = 0; i < delta_disks; i++) {
+		new_disk = get_disk_super(super,
+					  major(u->new_disks[i]),
+					  minor(u->new_disks[i]));
+		/* new_disk may be NULL here: do not dereference it for the
+		 * debug print
+		 */
+		dprintf("imsm: new disk for reshape is: %i:%i (%p, index = %i)\n",
+			major(u->new_disks[i]), minor(u->new_disks[i]),
+			new_disk, new_disk ? new_disk->index : -1);
+		if (new_disk == NULL ||
+		    (new_disk->index >= 0 &&
+		     new_disk->index < u->old_raid_disks))
+			goto update_reshape_exit;
+		new_disk->index = disk_count++;
+		/* slot to fill in autolayout
+		 */
+		new_disk->raiddisk = new_disk->index;
+		new_disk->disk.status |=
+			CONFIGURED_DISK;
+		new_disk->disk.status &= ~SPARE_DISK;
+	}
+
+	dprintf("imsm: volume set mpb->num_raid_devs = %i\n",
+		mpb->num_raid_devs);
+	/* manage changes in volume
+	 */
+	for (dev_id = 0; dev_id < mpb->num_raid_devs; dev_id++) {
+		void **sp = *space_list;
+		struct imsm_dev *newdev;
+		struct imsm_map *newmap, *oldmap;
+
+		for (id = super->devlist ; id; id = id->next) {
+			if (id->index == dev_id)
+				break;
+		}
+		if (id == NULL)
+			break;
+		if (!sp)
+			continue;
+		/* pop one buffer off the free list */
+		*space_list = *sp;
+		newdev = (void*)sp;
+		/* Copy the dev, but not (all of) the map */
+		memcpy(newdev, id->dev, sizeof(*newdev));
+		oldmap = get_imsm_map(id->dev, MAP_0);
+		newmap = get_imsm_map(newdev, MAP_0);
+		/* Copy the current map */
+		memcpy(newmap, oldmap, sizeof_imsm_map(oldmap));
+		/* update one device only
+		 */
+		if (devices_to_reshape) {
+			dprintf("imsm: modifying subdev: %i\n",
+				id->index);
+			devices_to_reshape--;
+			newdev->vol.migr_state = 1;
+			set_vol_curr_migr_unit(newdev, 0);
+			set_migr_type(newdev, MIGR_GEN_MIGR);
+			newmap->num_members = u->new_raid_disks;
+			for (i = 0; i < delta_disks; i++) {
+				set_imsm_ord_tbl_ent(newmap,
+						     u->old_raid_disks + i,
+						     u->old_raid_disks + i);
+			}
+			/* New map is correct, now need to save old map
+			 */
+			newmap = get_imsm_map(newdev, MAP_1);
+			memcpy(newmap, oldmap, sizeof_imsm_map(oldmap));
+
+			imsm_set_array_size(newdev, -1);
+		}
+
+		/* install the new device and chain the old buffer onto the
+		 * list of buffers to give back
+		 */
+		sp = (void **)id->dev;
+		id->dev = newdev;
+		*sp = tofree;
+		tofree = sp;
+
+		/* Clear migration record */
+		memset(super->migr_rec, 0, sizeof(struct migr_record));
+	}
+	if (tofree)
+		*space_list = tofree;
+	ret_val = 1;
+
+update_reshape_exit:
+
+	return ret_val;
+}
+
+/*
+ * Apply an update_takeover request (RAID10 <-> RAID0 conversion) to the
+ * in-memory metadata of subarray u->subarray.
+ *
+ * R10_TO_R0: requires that exactly half of the members are failed; failed
+ * disks are turned into spares, the remaining indexes are compacted and the
+ * map is rewritten as RAID0.
+ * R0_TO_R10: existing disk indexes are doubled, one "missing" placeholder
+ * disk per member is created on the odd indexes from buffers taken off
+ * *space_list, and a copy of the device (also taken from *space_list) is
+ * rewritten as a degraded RAID level 1 map and installed in the devlist.
+ *
+ * In both cases the disk order table of the resulting map is rebuilt from
+ * the disk and missing lists.
+ *
+ * Returns 1 on success, 0 when the subarray is not found, the failed-disk
+ * count does not match, or no buffer is left for the new device.
+ */
+static int apply_takeover_update(struct imsm_update_takeover *u,
+ struct intel_super *super,
+ void ***space_list)
+{
+ struct imsm_dev *dev = NULL;
+ struct intel_dev *dv;
+ struct imsm_dev *dev_new;
+ struct imsm_map *map;
+ struct dl *dm, *du;
+ int i;
+
+ /* locate the devlist entry of the subarray; 'dv' is used again below */
+ for (dv = super->devlist; dv; dv = dv->next)
+ if (dv->index == (unsigned int)u->subarray) {
+ dev = dv->dev;
+ break;
+ }
+
+ if (dev == NULL)
+ return 0;
+
+ map = get_imsm_map(dev, MAP_0);
+
+ if (u->direction == R10_TO_R0) {
+ /* Number of failed disks must be half of initial disk number */
+ if (imsm_count_failed(super, dev, MAP_0) !=
+ (map->num_members / 2))
+ return 0;
+
+ /* iterate through devices to mark removed disks as spare */
+ for (dm = super->disks; dm; dm = dm->next) {
+ if (dm->disk.status & FAILED_DISK) {
+ int idx = dm->index;
+ /* update indexes on the disk list */
+/* FIXME this loop-with-the-loop looks wrong, I'm not convinced
+ the index values will end up being correct.... NB */
+ for (du = super->disks; du; du = du->next)
+ if (du->index > idx)
+ du->index--;
+ /* mark as spare disk */
+ mark_spare(dm);
+ }
+ }
+ /* update map */
+ map->num_members /= map->num_domains;
+ map->map_state = IMSM_T_STATE_NORMAL;
+ map->raid_level = 0;
+ set_num_domains(map);
+ update_num_data_stripes(map, imsm_dev_size(dev));
+ map->failed_disk_num = -1;
+ }
+
+ if (u->direction == R0_TO_R10) {
+ void **space;
+
+ /* update slots in current disk list */
+ for (dm = super->disks; dm; dm = dm->next) {
+ if (dm->index >= 0)
+ dm->index *= 2;
+ }
+ /* create new *missing* disks */
+ for (i = 0; i < map->num_members; i++) {
+ /* pop one pre-allocated buffer; when the list is exhausted
+ * this placeholder is silently skipped
+ */
+ space = *space_list;
+ if (!space)
+ continue;
+ *space_list = *space;
+ du = (void *)space;
+ /* start from a copy of the first real disk, then strip
+ * everything that ties it to an actual device
+ */
+ memcpy(du, super->disks, sizeof(*du));
+ du->fd = -1;
+ du->minor = 0;
+ du->major = 0;
+ du->index = (i * 2) + 1;
+ sprintf((char *)du->disk.serial,
+ " MISSING_%d", du->index);
+ sprintf((char *)du->serial,
+ "MISSING_%d", du->index);
+ du->next = super->missing;
+ super->missing = du;
+ }
+ /* create new dev and map */
+ space = *space_list;
+ if (!space)
+ return 0;
+ *space_list = *space;
+ dev_new = (void *)space;
+ memcpy(dev_new, dev, sizeof(*dev));
+ /* update new map */
+ map = get_imsm_map(dev_new, MAP_0);
+
+ map->map_state = IMSM_T_STATE_DEGRADED;
+ map->raid_level = 1;
+ set_num_domains(map);
+ map->num_members = map->num_members * map->num_domains;
+ update_num_data_stripes(map, imsm_dev_size(dev));
+
+ /* replace dev<->dev_new */
+ dv->dev = dev_new;
+ }
+ /* update disk order table */
+ for (du = super->disks; du; du = du->next)
+ if (du->index >= 0)
+ set_imsm_ord_tbl_ent(map, du->index, du->index);
+ /* placeholders additionally get flagged as missing in the volume */
+ for (du = super->missing; du; du = du->next)
+ if (du->index >= 0) {
+ set_imsm_ord_tbl_ent(map, du->index, du->index);
+ mark_missing(super, dv->dev, &du->disk, du->index);
+ }
+
+ return 1;
+}
+
+/*
+ * Apply one metadata update record to the in-memory IMSM metadata.
+ *
+ * The update type is read from the first bytes of update->buf and the
+ * matching handler is run.  Handlers that changed something bump
+ * super->updates_pending; nothing is returned to the caller.
+ */
+static void imsm_process_update(struct supertype *st,
+ struct metadata_update *update)
+{
+ /**
+ * crack open the metadata_update envelope to find the update record
+ * update can be one of:
+ * update_reshape_container_disks - all the arrays in the container
+ * are being reshaped to have more devices. We need to mark
+ * the arrays for general migration and convert selected spares
+ * into active devices.
+ * update_activate_spare - a spare device has replaced a failed
+ * device in an array, update the disk_ord_tbl. If this disk is
+ * present in all member arrays then also clear the SPARE_DISK
+ * flag
+ * update_create_array
+ * update_kill_array
+ * update_rename_array
+ * update_add_remove_disk
+ */
+ struct intel_super *super = st->sb;
+ struct imsm_super *mpb;
+ enum imsm_update_type type = *(enum imsm_update_type *) update->buf;
+
+ /* update requires a larger buf but the allocation failed */
+ if (super->next_len && !super->next_buf) {
+ super->next_len = 0;
+ return;
+ }
+
+ /* a larger buffer was prepared (see imsm_prepare_update): migrate the
+ * current contents into it and make it the active buffer
+ */
+ if (super->next_buf) {
+ memcpy(super->next_buf, super->buf, super->len);
+ free(super->buf);
+ super->len = super->next_len;
+ super->buf = super->next_buf;
+
+ super->next_len = 0;
+ super->next_buf = NULL;
+ }
+
+ /* read the anchor only now, after the possible buffer switch */
+ mpb = super->anchor;
+
+ switch (type) {
+ case update_general_migration_checkpoint: {
+ struct intel_dev *id;
+ struct imsm_update_general_migration_checkpoint *u =
+ (void *)update->buf;
+
+ dprintf("called for update_general_migration_checkpoint\n");
+
+ /* find device under general migration */
+ for (id = super->devlist ; id; id = id->next) {
+ if (is_gen_migration(id->dev)) {
+ set_vol_curr_migr_unit(id->dev,
+ u->curr_migr_unit);
+ super->updates_pending++;
+ }
+ }
+ break;
+ }
+ case update_takeover: {
+ struct imsm_update_takeover *u = (void *)update->buf;
+ if (apply_takeover_update(u, super, &update->space_list)) {
+ imsm_update_version_info(super);
+ super->updates_pending++;
+ }
+ break;
+ }
+
+ case update_reshape_container_disks: {
+ struct imsm_update_reshape *u = (void *)update->buf;
+ if (apply_reshape_container_disks_update(
+ u, super, &update->space_list))
+ super->updates_pending++;
+ break;
+ }
+ case update_reshape_migration: {
+ struct imsm_update_reshape_migration *u = (void *)update->buf;
+ if (apply_reshape_migration_update(
+ u, super, &update->space_list))
+ super->updates_pending++;
+ break;
+ }
+ case update_size_change: {
+ struct imsm_update_size_change *u = (void *)update->buf;
+ if (apply_size_change_update(u, super))
+ super->updates_pending++;
+ break;
+ }
+ case update_activate_spare: {
+ struct imsm_update_activate_spare *u = (void *) update->buf;
+
+ if (prepare_spare_to_activate(st, u) &&
+ apply_update_activate_spare(u, super, st->arrays))
+ super->updates_pending++;
+ break;
+ }
+ case update_create_array: {
+ /* someone wants to create a new array, we need to be aware of
+ * a few races/collisions:
+ * 1/ 'Create' called by two separate instances of mdadm
+ * 2/ 'Create' versus 'activate_spare': mdadm has chosen
+ * devices that have since been assimilated via
+ * activate_spare.
+ * In the event this update can not be carried out mdadm will
+ * (FIX ME) notice that its update did not take hold.
+ */
+ struct imsm_update_create_array *u = (void *) update->buf;
+ struct intel_dev *dv;
+ struct imsm_dev *dev;
+ struct imsm_map *map, *new_map;
+ unsigned long long start, end;
+ unsigned long long new_start, new_end;
+ int i;
+ struct disk_info *inf;
+ struct dl *dl;
+
+ /* handle racing creates: first come first serve */
+ if (u->dev_idx < mpb->num_raid_devs) {
+ dprintf("subarray %d already defined\n", u->dev_idx);
+ goto create_error;
+ }
+
+ /* check update is next in sequence */
+ if (u->dev_idx != mpb->num_raid_devs) {
+ dprintf("can not create array %d expected index %d\n",
+ u->dev_idx, mpb->num_raid_devs);
+ goto create_error;
+ }
+
+ /* per-disk extent [new_start, new_end] the new array wants */
+ new_map = get_imsm_map(&u->dev, MAP_0);
+ new_start = pba_of_lba0(new_map);
+ new_end = new_start + per_dev_array_size(new_map);
+ inf = get_disk_info(u);
+
+ /* handle activate_spare versus create race:
+ * check to make sure that overlapping arrays do not include
+ * overlapping disks
+ */
+ for (i = 0; i < mpb->num_raid_devs; i++) {
+ dev = get_imsm_dev(super, i);
+ map = get_imsm_map(dev, MAP_0);
+ start = pba_of_lba0(map);
+ end = start + per_dev_array_size(map);
+ /* both interval tests are inclusive of the end value */
+ if ((new_start >= start && new_start <= end) ||
+ (start >= new_start && start <= new_end))
+ /* overlap */;
+ else
+ continue;
+
+ if (disks_overlap(super, i, u)) {
+ dprintf("arrays overlap\n");
+ goto create_error;
+ }
+ }
+
+ /* check that prepare update was successful */
+ if (!update->space) {
+ dprintf("prepare update failed\n");
+ goto create_error;
+ }
+
+ /* check that all disks are still active before committing
+ * changes. FIXME: could we instead handle this by creating a
+ * degraded array? That's probably not what the user expects,
+ * so better to drop this update on the floor.
+ */
+ for (i = 0; i < new_map->num_members; i++) {
+ dl = serial_to_dl(inf[i].serial, super);
+ if (!dl) {
+ dprintf("disk disappeared\n");
+ goto create_error;
+ }
+ }
+
+ /* from here on the update is committed */
+ super->updates_pending++;
+
+ /* convert spares to members and fixup ord_tbl */
+ for (i = 0; i < new_map->num_members; i++) {
+ /* non-NULL: every serial was verified in the loop above */
+ dl = serial_to_dl(inf[i].serial, super);
+ if (dl->index == -1) {
+ dl->index = mpb->num_disks;
+ mpb->num_disks++;
+ dl->disk.status |= CONFIGURED_DISK;
+ dl->disk.status &= ~SPARE_DISK;
+ }
+ set_imsm_ord_tbl_ent(new_map, i, dl->index);
+ }
+
+ /* take ownership of the pre-allocated devlist entry and link it
+ * at the head of super->devlist
+ */
+ dv = update->space;
+ dev = dv->dev;
+ update->space = NULL;
+ imsm_copy_dev(dev, &u->dev);
+ dv->index = u->dev_idx;
+ dv->next = super->devlist;
+ super->devlist = dv;
+ mpb->num_raid_devs++;
+
+ imsm_update_version_info(super);
+ break;
+ create_error:
+ /* mdmon knows how to release update->space, but not
+ * ((struct intel_dev *) update->space)->dev
+ */
+ if (update->space) {
+ dv = update->space;
+ free(dv->dev);
+ }
+ break;
+ }
+ case update_kill_array: {
+ struct imsm_update_kill_array *u = (void *) update->buf;
+ int victim = u->dev_idx;
+ struct active_array *a;
+ struct intel_dev **dp;
+ struct imsm_dev *dev;
+
+ /* sanity check that we are not affecting the uuid of
+ * active arrays, or deleting an active array
+ *
+ * FIXME when immutable ids are available, but note that
+ * we'll also need to fixup the invalidated/active
+ * subarray indexes in mdstat
+ */
+ for (a = st->arrays; a; a = a->next)
+ if (a->info.container_member >= victim)
+ break;
+ /* by definition if mdmon is running at least one array
+ * is active in the container, so checking
+ * mpb->num_raid_devs is just extra paranoia
+ */
+ dev = get_imsm_dev(super, victim);
+ if (a || !dev || mpb->num_raid_devs == 1) {
+ dprintf("failed to delete subarray-%d\n", victim);
+ break;
+ }
+
+ /* unlink the entry and renumber everything above the victim.
+ * NOTE(review): the entry removed is the one whose index equals
+ * super->current_vol, not 'victim' - confirm that current_vol
+ * always equals the victim index on this path.  The unlinked
+ * entry is not freed here.
+ */
+ for (dp = &super->devlist; *dp;)
+ if ((*dp)->index == (unsigned)super->current_vol) {
+ *dp = (*dp)->next;
+ } else {
+ if ((*dp)->index > (unsigned)victim)
+ (*dp)->index--;
+ dp = &(*dp)->next;
+ }
+ mpb->num_raid_devs--;
+ super->updates_pending++;
+ break;
+ }
+ case update_rename_array: {
+ struct imsm_update_rename_array *u = (void *) update->buf;
+ char name[MAX_RAID_SERIAL_LEN+1];
+ int target = u->dev_idx;
+ struct active_array *a;
+ struct imsm_dev *dev;
+
+ /* sanity check that we are not affecting the uuid of
+ * an active array
+ */
+ memset(name, 0, sizeof(name));
+ /* NOTE(review): the size argument limits the copy to
+ * MAX_RAID_SERIAL_LEN - 1 characters, and "%s" expects u->name
+ * to be NUL-terminated - confirm both are intended.
+ */
+ snprintf(name, MAX_RAID_SERIAL_LEN, "%s", (char *) u->name);
+ name[MAX_RAID_SERIAL_LEN] = '\0';
+ for (a = st->arrays; a; a = a->next)
+ if (a->info.container_member == target)
+ break;
+ dev = get_imsm_dev(super, u->dev_idx);
+ if (a || !dev || !check_name(super, name, 1)) {
+ dprintf("failed to rename subarray-%d\n", target);
+ break;
+ }
+
+ memcpy(dev->volume, name, MAX_RAID_SERIAL_LEN);
+ super->updates_pending++;
+ break;
+ }
+ case update_add_remove_disk: {
+ /* we may be able to repair some arrays if disks are
+ * being added, check the status of add_remove_disk
+ * if discs has been added.
+ */
+ if (add_remove_disk_update(super)) {
+ struct active_array *a;
+
+ super->updates_pending++;
+ for (a = st->arrays; a; a = a->next)
+ a->check_degraded = 1;
+ }
+ break;
+ }
+ case update_prealloc_badblocks_mem:
+ /* nothing to apply here; the space accounting for this type is
+ * done in imsm_prepare_update
+ */
+ break;
+ case update_rwh_policy: {
+ struct imsm_update_rwh_policy *u = (void *)update->buf;
+ int target = u->dev_idx;
+ struct imsm_dev *dev = get_imsm_dev(super, target);
+ if (!dev) {
+ dprintf("could not find subarray-%d\n", target);
+ break;
+ }
+
+ /* only count a pending update when the policy really changes */
+ if (dev->rwh_policy != u->new_policy) {
+ dev->rwh_policy = u->new_policy;
+ super->updates_pending++;
+ }
+ break;
+ }
+ default:
+ pr_err("error: unsupported process update type:(type: %d)\n", type);
+ }
+}
+
+static struct mdinfo *get_spares_for_grow(struct supertype *st);
+
+/*
+ * Validate an update record and pre-allocate everything that applying it
+ * will need.
+ *
+ * For each update type the record length is checked against the size of
+ * the matching structure.  Types that grow the metadata chain the needed
+ * buffers on update->space_list (or update->space for array creation) and
+ * compute in 'len' how much room the new metadata will need.  If the
+ * current (or already pending) metadata buffer is too small, a larger
+ * sector-aligned, zeroed buffer is allocated into super->next_buf with
+ * super->next_len set; when that allocation fails next_buf stays NULL so
+ * that imsm_process_update can detect it.
+ *
+ * Returns 1 when the update may be processed, 0 when the record is too
+ * short, of unknown type, or refers to a subarray that does not exist.
+ */
+static int imsm_prepare_update(struct supertype *st,
+			       struct metadata_update *update)
+{
+	/**
+	 * Allocate space to hold new disk entries, raid-device entries or a new
+	 * mpb if necessary.  The manager synchronously waits for updates to
+	 * complete in the monitor, so new mpb buffers allocated here can be
+	 * integrated by the monitor thread without worrying about live pointers
+	 * in the manager thread.
+	 */
+	enum imsm_update_type type;
+	struct intel_super *super = st->sb;
+	unsigned int sector_size = super->sector_size;
+	struct imsm_super *mpb = super->anchor;
+	size_t buf_len;
+	size_t len = 0;
+
+	if (update->len < (int)sizeof(type))
+		return 0;
+
+	type = *(enum imsm_update_type *) update->buf;
+
+	switch (type) {
+	case update_general_migration_checkpoint:
+		if (update->len < (int)sizeof(struct imsm_update_general_migration_checkpoint))
+			return 0;
+		dprintf("called for update_general_migration_checkpoint\n");
+		break;
+	case update_takeover: {
+		struct imsm_update_takeover *u = (void *)update->buf;
+		if (update->len < (int)sizeof(*u))
+			return 0;
+		if (u->direction == R0_TO_R10) {
+			void **tail = (void **)&update->space_list;
+			struct imsm_dev *dev = get_imsm_dev(super, u->subarray);
+			struct imsm_map *map;
+			int num_members;
+			void *space;
+			int size, i;
+
+			/* reject a subarray we do not know instead of
+			 * dereferencing a NULL device below
+			 */
+			if (!dev)
+				return 0;
+			map = get_imsm_map(dev, MAP_0);
+			num_members = map->num_members;
+
+			/* allocate memory for added disks */
+			for (i = 0; i < num_members; i++) {
+				size = sizeof(struct dl);
+				space = xmalloc(size);
+				*tail = space;
+				tail = space;
+				*tail = NULL;
+			}
+			/* allocate memory for new device */
+			size = sizeof_imsm_dev(super->devlist->dev, 0) +
+				(num_members * sizeof(__u32));
+			space = xmalloc(size);
+			*tail = space;
+			tail = space;
+			*tail = NULL;
+			len = disks_to_mpb_size(num_members * 2);
+		}
+
+		break;
+	}
+	case update_reshape_container_disks: {
+		/* Every raid device in the container is about to
+		 * gain some more devices, and we will enter a
+		 * reconfiguration.
+		 * So each 'imsm_map' will be bigger, and the imsm_vol
+		 * will now hold 2 of them.
+		 * Thus we need new 'struct imsm_dev' allocations sized
+		 * as sizeof_imsm_dev but with more devices in both maps.
+		 */
+		struct imsm_update_reshape *u = (void *)update->buf;
+		struct intel_dev *dl;
+		void **space_tail = (void**)&update->space_list;
+
+		if (update->len < (int)sizeof(*u))
+			return 0;
+
+		dprintf("for update_reshape\n");
+
+		/* one enlarged imsm_dev buffer per existing volume */
+		for (dl = super->devlist; dl; dl = dl->next) {
+			int size = sizeof_imsm_dev(dl->dev, 1);
+			void *s;
+			if (u->new_raid_disks > u->old_raid_disks)
+				size += sizeof(__u32)*2*
+					(u->new_raid_disks - u->old_raid_disks);
+			s = xmalloc(size);
+			*space_tail = s;
+			space_tail = s;
+			*space_tail = NULL;
+		}
+
+		len = disks_to_mpb_size(u->new_raid_disks);
+		dprintf("New anchor length is %llu\n", (unsigned long long)len);
+		break;
+	}
+	case update_reshape_migration: {
+		/* for migration level 0->5 we need to add disks
+		 * so the same as for container operation we will copy
+		 * device to the bigger location.
+		 * in memory prepared device and new disk area are prepared
+		 * for usage in process update
+		 */
+		struct imsm_update_reshape_migration *u = (void *)update->buf;
+		struct intel_dev *id;
+		void **space_tail = (void **)&update->space_list;
+		int size;
+		void *s;
+		int current_level = -1;
+
+		if (update->len < (int)sizeof(*u))
+			return 0;
+
+		dprintf("for update_reshape\n");
+
+		/* add space for bigger array in update
+		 */
+		for (id = super->devlist; id; id = id->next) {
+			if (id->index == (unsigned)u->subdev) {
+				size = sizeof_imsm_dev(id->dev, 1);
+				if (u->new_raid_disks > u->old_raid_disks)
+					size += sizeof(__u32)*2*
+						(u->new_raid_disks - u->old_raid_disks);
+				s = xmalloc(size);
+				*space_tail = s;
+				space_tail = s;
+				*space_tail = NULL;
+				break;
+			}
+		}
+		/* nothing was allocated: the subdev is unknown */
+		if (update->space_list == NULL)
+			break;
+
+		/* add space for disk in update
+		 */
+		size = sizeof(struct dl);
+		s = xmalloc(size);
+		*space_tail = s;
+		space_tail = s;
+		*space_tail = NULL;
+
+		/* add spare device to update
+		 */
+		for (id = super->devlist ; id; id = id->next)
+			if (id->index == (unsigned)u->subdev) {
+				struct imsm_dev *dev;
+				struct imsm_map *map;
+
+				dev = get_imsm_dev(super, u->subdev);
+				map = get_imsm_map(dev, MAP_0);
+				current_level = map->raid_level;
+				break;
+			}
+		if (u->new_level == 5 && u->new_level != current_level) {
+			struct mdinfo *spares;
+
+			spares = get_spares_for_grow(st);
+			if (spares) {
+				struct dl *dl;
+				struct mdinfo *dev;
+
+				dev = spares->devs;
+				if (dev) {
+					u->new_disks[0] =
+						makedev(dev->disk.major,
+							dev->disk.minor);
+					dl = get_disk_super(super,
+							    dev->disk.major,
+							    dev->disk.minor);
+					/* the spare may be unknown to this
+					 * super; only renumber a disk that
+					 * was actually found
+					 */
+					if (dl)
+						dl->index = u->old_raid_disks;
+				}
+				sysfs_free(spares);
+			}
+		}
+		len = disks_to_mpb_size(u->new_raid_disks);
+		dprintf("New anchor length is %llu\n", (unsigned long long)len);
+		break;
+	}
+	case update_size_change: {
+		if (update->len < (int)sizeof(struct imsm_update_size_change))
+			return 0;
+		break;
+	}
+	case update_activate_spare: {
+		if (update->len < (int)sizeof(struct imsm_update_activate_spare))
+			return 0;
+		break;
+	}
+	case update_create_array: {
+		struct imsm_update_create_array *u = (void *) update->buf;
+		struct intel_dev *dv;
+		struct imsm_dev *dev = &u->dev;
+		struct imsm_map *map;
+		struct dl *dl;
+		struct disk_info *inf;
+		int i;
+		int activate = 0;
+
+		if (update->len < (int)sizeof(*u))
+			return 0;
+
+		/* only look inside the record once its length is verified */
+		map = get_imsm_map(dev, MAP_0);
+		inf = get_disk_info(u);
+		len = sizeof_imsm_dev(dev, 1);
+		/* allocate a new super->devlist entry */
+		dv = xmalloc(sizeof(*dv));
+		dv->dev = xmalloc(len);
+		update->space = dv;
+
+		/* count how many spares will be converted to members */
+		for (i = 0; i < map->num_members; i++) {
+			dl = serial_to_dl(inf[i].serial, super);
+			if (!dl) {
+				/* hmm maybe it failed?, nothing we can do about
+				 * it here
+				 */
+				continue;
+			}
+			if (count_memberships(dl, super) == 0)
+				activate++;
+		}
+		len += activate * sizeof(struct imsm_disk);
+		break;
+	}
+	case update_kill_array: {
+		if (update->len < (int)sizeof(struct imsm_update_kill_array))
+			return 0;
+		break;
+	}
+	case update_rename_array: {
+		if (update->len < (int)sizeof(struct imsm_update_rename_array))
+			return 0;
+		break;
+	}
+	case update_add_remove_disk:
+		/* no update->len needed */
+		break;
+	case update_prealloc_badblocks_mem:
+		/* reserve the difference between a full bbm_log structure
+		 * and the size of the log currently held
+		 */
+		super->extra_space += sizeof(struct bbm_log) -
+			get_imsm_bbm_log_size(super->bbm_log);
+		break;
+	case update_rwh_policy: {
+		if (update->len < (int)sizeof(struct imsm_update_rwh_policy))
+			return 0;
+		break;
+	}
+	default:
+		return 0;
+	}
+
+	/* check if we need a larger metadata buffer */
+	if (super->next_buf)
+		buf_len = super->next_len;
+	else
+		buf_len = super->len;
+
+	if (__le32_to_cpu(mpb->mpb_size) + super->extra_space + len > buf_len) {
+		/* ok we need a larger buf than what is currently allocated
+		 * if this allocation fails process_update will notice that
+		 * ->next_len is set and ->next_buf is NULL
+		 */
+		buf_len = ROUND_UP(__le32_to_cpu(mpb->mpb_size) +
+				   super->extra_space + len, sector_size);
+		if (super->next_buf)
+			free(super->next_buf);
+
+		super->next_len = buf_len;
+		if (posix_memalign(&super->next_buf, sector_size, buf_len) == 0)
+			memset(super->next_buf, 0, buf_len);
+		else
+			super->next_buf = NULL;
+	}
+	return 1;
+}
+
+/*
+ * Remove the disk with metadata index 'index' from the container.
+ *
+ * Every disk index, disk order table entry and bad block log ordinal that
+ * is greater than 'index' is shifted down by one, mpb->num_disks is
+ * decremented and, when *dlp is not NULL, that list element is unlinked
+ * and released.
+ *
+ * must be called while manager is quiesced
+ */
+static void imsm_delete(struct intel_super *super, struct dl **dlp, unsigned index)
+{
+	struct imsm_super *mpb = super->anchor;
+	struct dl *iter;
+	struct imsm_dev *dev;
+	struct imsm_map *map0, *map1;
+	unsigned int i, j, num_members;
+	__u32 ord, ord_map0;
+	struct bbm_log *log = super->bbm_log;
+
+	dprintf("deleting device[%u] from imsm_super\n", index);
+
+	/* shift all indexes down one */
+	for (iter = super->disks; iter; iter = iter->next)
+		if (iter->index > (int)index)
+			iter->index--;
+	for (iter = super->missing; iter; iter = iter->next)
+		if (iter->index > (int)index)
+			iter->index--;
+
+	for (i = 0; i < mpb->num_raid_devs; i++) {
+		dev = get_imsm_dev(super, i);
+		map0 = get_imsm_map(dev, MAP_0);
+		map1 = get_imsm_map(dev, MAP_1);
+		num_members = map0->num_members;
+		for (j = 0; j < num_members; j++) {
+			/* update ord entries being careful not to propagate
+			 * ord-flags to the first map
+			 */
+			ord = get_imsm_ord_tbl_ent(dev, j, MAP_X);
+			ord_map0 = get_imsm_ord_tbl_ent(dev, j, MAP_0);
+
+			if (ord_to_idx(ord) <= index)
+				continue;
+
+			set_imsm_ord_tbl_ent(map0, j, ord_map0 - 1);
+			/* the second map exists only for some volumes */
+			if (map1)
+				set_imsm_ord_tbl_ent(map1, j, ord - 1);
+		}
+	}
+
+	/* renumber bad block entries; the log is treated as optional, as it
+	 * is in examine_badblocks_imsm
+	 */
+	if (log) {
+		for (i = 0; i < log->entry_count; i++) {
+			struct bbm_log_entry *entry =
+				&log->marked_block_entries[i];
+
+			if (entry->disk_ordinal <= index)
+				continue;
+			entry->disk_ordinal--;
+		}
+	}
+
+	mpb->num_disks--;
+	super->updates_pending++;
+	if (*dlp) {
+		struct dl *dl = *dlp;
+
+		*dlp = (*dlp)->next;
+		__free_imsm_disk(dl, 1);
+	}
+}
+
+/*
+ * Tell whether both members of mirror pair 'pair' (slots 2*pair and
+ * 2*pair + 1 of 'map') are absent from super->disks.  'second_map' selects
+ * the disk order table that belongs to 'map'.  Slots the map does not
+ * describe are counted as absent.
+ */
+static int imsm_mirror_pair_lost(struct intel_super *super,
+				 struct imsm_dev *dev, struct imsm_map *map,
+				 int second_map, int pair)
+{
+	int slot;
+
+	for (slot = 2 * pair; slot <= 2 * pair + 1; slot++) {
+		__u32 ord;
+
+		if (slot >= map->num_members)
+			continue;
+		ord = get_imsm_ord_tbl_ent(dev, slot, second_map);
+		if (get_imsm_disk(super, ord_to_idx(ord)) != NULL)
+			return 0;
+	}
+	return 1;
+}
+
+/*
+ * Return how many member disks an array of the given level may lose.
+ *
+ * level 1/10: raid_disks / 2, or 0 as soon as one mirror pair has lost
+ *             both of its disks in the first map or, when one exists,
+ *             in the second map
+ * level 5:    1
+ * level 6:    2
+ * otherwise:  0
+ *
+ * The previous implementation compared disk indexes (i, i + 1), so the
+ * pairs overlapped and the last pair was never examined; it also fetched
+ * the maps without ever consulting them and ran the same test twice.
+ * Pairs are now taken as slots (2i, 2i + 1) and resolved through the disk
+ * order table of each map.
+ */
+static int imsm_get_allowed_degradation(int level, int raid_disks,
+					struct intel_super *super,
+					struct imsm_dev *dev)
+{
+	switch (level) {
+	case 1:
+	case 10:{
+		int ret_val = raid_disks / 2;
+		struct imsm_map *map;
+		int i;
+
+		/* check map if all disks pairs not failed
+		 * in both maps
+		 */
+		map = get_imsm_map(dev, MAP_0);
+		for (i = 0; i < ret_val; i++)
+			if (imsm_mirror_pair_lost(super, dev, map, MAP_0, i))
+				return 0;
+
+		map = get_imsm_map(dev, MAP_1);
+		/* if there is no second map
+		 * result can be returned
+		 */
+		if (map == NULL)
+			return ret_val;
+		/* check degradation in second map
+		 */
+		for (i = 0; i < ret_val; i++)
+			if (imsm_mirror_pair_lost(super, dev, map, MAP_1, i))
+				return 0;
+		return ret_val;
+	}
+	case 5:
+		return 1;
+	case 6:
+		return 2;
+	default:
+		return 0;
+	}
+}
+
+/*
+ * Look up, in 'hba_list', the controller whose sysfs path is contained in
+ * the device path of block device disk_major:disk_minor.
+ * Returns the matching entry, or NULL when the device path can not be
+ * resolved or no controller matches.
+ */
+static struct sys_dev *imsm_find_hba_for_disk(struct sys_dev *hba_list,
+					      int disk_major, int disk_minor)
+{
+	struct sys_dev *hba;
+	char *path = devt_to_devpath(makedev(disk_major, disk_minor), 1, NULL);
+
+	if (!path)
+		return NULL;
+
+	for (hba = hba_list; hba; hba = hba->next)
+		if (strstr(path, hba->path))
+			break;
+
+	free(path);
+	return hba;
+}
+
+/*******************************************************************************
+ * Function:	validate_container_imsm
+ * Description:	Validates a container after assembly: the controller of the
+ *		first device is compared with the controller of every
+ *		following device in the list (type and option ROM).
+ *		The check is skipped when IMSM_NO_PLATFORM is set.
+ *
+ * Parameters:
+ *	info	: linked list with info about devices used in array
+ * Returns:
+ *	1 : controller not detected, controllers differ, or no IMSM
+ *	    platform support
+ *	0 : Success
+ ******************************************************************************/
+int validate_container_imsm(struct mdinfo *info)
+{
+	struct sys_dev *hba_list;
+	struct sys_dev *first_hba;
+	const struct imsm_orom *first_orom;
+	struct mdinfo *member;
+
+	if (check_env("IMSM_NO_PLATFORM"))
+		return 0;
+
+	hba_list = find_intel_devices();
+	first_hba = imsm_find_hba_for_disk(hba_list, info->disk.major,
+					   info->disk.minor);
+	if (first_hba == NULL) {
+		pr_err("WARNING - Cannot detect HBA for device %s!\n",
+		       devid2kname(makedev(info->disk.major, info->disk.minor)));
+		return 1;
+	}
+
+	first_orom = get_orom_by_device_id(first_hba->dev_id);
+
+	for (member = info->next; member; member = member->next) {
+		struct sys_dev *member_hba;
+		const struct imsm_orom *member_orom = NULL;
+
+		member_hba = imsm_find_hba_for_disk(hba_list,
+						    member->disk.major,
+						    member->disk.minor);
+		if (member_hba != NULL)
+			member_orom = get_orom_by_device_id(member_hba->dev_id);
+
+		/* different kind of controller */
+		if (member_hba != NULL && first_hba->type != member_hba->type) {
+			pr_err("WARNING - HBAs of devices do not match %s != %s\n",
+			       get_sys_dev_type(first_hba->type),
+			       get_sys_dev_type(member_hba->type));
+			return 1;
+		}
+
+		/* same kind, but not the same option ROM */
+		if (first_orom != member_orom) {
+			pr_err("WARNING - IMSM container assembled with disks under different HBAs!\n"
+			       " This operation is not supported and can lead to data loss.\n");
+			return 1;
+		}
+
+		/* identical, but without any option ROM at all */
+		if (first_orom == NULL) {
+			pr_err("WARNING - IMSM container assembled with disks under HBAs without IMSM platform support!\n"
+			       " This operation is not supported and can lead to data loss.\n");
+			return 1;
+		}
+	}
+
+	return 0;
+}
+
+/*******************************************************************************
+* Function:	imsm_record_badblock
+* Description:	Adds a bad block range to the BBM log of the container and
+*		notes a pending metadata update when the log accepted it
+*
+* Parameters:
+*	a	: array containing a bad block
+*	slot	: disk number containing a bad block
+*	sector	: bad block sector
+*	length	: bad block sectors range
+* Returns:
+*	0 when the slot can not be translated to a disk ordinal, otherwise
+*	the result of record_new_badblock() (documented upstream as
+*	1 : Success, 0 : Error)
+******************************************************************************/
+static int imsm_record_badblock(struct active_array *a, int slot,
+				unsigned long long sector, int length)
+{
+	struct intel_super *super = a->container->sb;
+	int disk_ord = imsm_disk_slot_to_ord(a, slot);
+	int recorded;
+
+	if (disk_ord < 0)
+		return 0;
+
+	recorded = record_new_badblock(super->bbm_log, ord_to_idx(disk_ord),
+				       sector, length);
+	if (recorded)
+		super->updates_pending++;
+
+	return recorded;
+}
+/*******************************************************************************
+* Function:	imsm_clear_badblock
+* Description:	Removes a bad block range from the BBM log of the container
+*		and notes a pending metadata update when the log changed
+*
+* Parameters:
+*	a	: array containing a bad block
+*	slot	: disk number containing a bad block
+*	sector	: bad block sector
+*	length	: bad block sectors range
+* Returns:
+*	0 when the slot can not be translated to a disk ordinal, otherwise
+*	the result of clear_badblock() (documented upstream as
+*	1 : Success, 0 : Error)
+******************************************************************************/
+static int imsm_clear_badblock(struct active_array *a, int slot,
+			       unsigned long long sector, int length)
+{
+	struct intel_super *super = a->container->sb;
+	int disk_ord = imsm_disk_slot_to_ord(a, slot);
+	int cleared;
+
+	if (disk_ord < 0)
+		return 0;
+
+	cleared = clear_badblock(super->bbm_log, ord_to_idx(disk_ord), sector,
+				 length);
+	if (cleared)
+		super->updates_pending++;
+
+	return cleared;
+}
+/*******************************************************************************
+* Function:	imsm_get_badblocks
+* Description:	Collects the bad blocks of one member disk that fall into the
+*		area used by the array.  The result is stored in super->bb,
+*		so it is overwritten by the next call.
+*
+* Parameters:
+*	a	: array
+*	slot	: disk number
+* Returns:
+*	bb	: structure containing bad blocks
+*	NULL	: error
+******************************************************************************/
+static struct md_bb *imsm_get_badblocks(struct active_array *a, int slot)
+{
+	int member = a->info.container_member;
+	struct intel_super *super = a->container->sb;
+	struct imsm_dev *volume = get_imsm_dev(super, member);
+	struct imsm_map *volume_map = get_imsm_map(volume, MAP_0);
+	int disk_ord = imsm_disk_slot_to_ord(a, slot);
+
+	if (disk_ord < 0)
+		return NULL;
+
+	/* restrict the lookup to the per-disk extent used by this volume */
+	get_volume_badblocks(super->bbm_log, ord_to_idx(disk_ord),
+			     pba_of_lba0(volume_map),
+			     per_dev_array_size(volume_map), &super->bb);
+
+	return &super->bb;
+}
+/*******************************************************************************
+* Function:	examine_badblocks_imsm
+* Description:	Prints the bad blocks recorded in the BBM log for one disk to
+*		the standard output
+*
+* Parameters:
+*	st	: metadata handler
+*	fd	: open file descriptor for device (not used here)
+*	devname	: device name
+* Returns:
+*	0 : Success
+*	1 : Error, the device is not an array member of this container
+******************************************************************************/
+static int examine_badblocks_imsm(struct supertype *st, int fd, char *devname)
+{
+	struct intel_super *super = st->sb;
+	struct bbm_log *log = super->bbm_log;
+	struct dl *disk = super->disks;
+	int header_printed = 0;
+
+	/* find the disk that carries the requested name */
+	while (disk && strcmp(disk->devname, devname) != 0)
+		disk = disk->next;
+
+	if (!disk || disk->index < 0) { /* serial mismatch probably */
+		pr_err("%s doesn't appear to be part of a raid array\n",
+		       devname);
+		return 1;
+	}
+
+	if (log != NULL) {
+		struct bbm_log_entry *entries = &log->marked_block_entries[0];
+		unsigned int n;
+
+		for (n = 0; n < log->entry_count; n++) {
+			unsigned long long first_sector;
+			int sectors;
+
+			if (entries[n].disk_ordinal != disk->index)
+				continue;
+
+			first_sector = __le48_to_cpu(
+					&entries[n].defective_block_start);
+			/* the count is stored minus one */
+			sectors = entries[n].marked_count + 1;
+
+			if (!header_printed) {
+				printf("Bad-blocks on %s:\n", devname);
+				header_printed = 1;
+			}
+
+			printf("%20llu for %d sectors\n", first_sector,
+			       sectors);
+		}
+	}
+
+	if (!header_printed)
+		printf("No bad-blocks list configured on %s\n", devname);
+
+	return 0;
+}
+/*******************************************************************************
+ * Function:	init_migr_record_imsm
+ * Description:	Initializes the imsm migration record for a general
+ *		migration and writes it out with write_imsm_migr_rec().
+ *		The unit depth is GEN_MIGR_AREA_SIZE rounded down to a
+ *		multiple of the larger strip size of the two maps; the
+ *		checkpoint area is placed relative to the end of the smallest
+ *		member disk.
+ * Parameters:
+ *	st		: supertype information (st->sb is the intel_super)
+ *	dev		: device under migration
+ *	info		: general array info, component_size is used to
+ *			  compute the number of migration units
+ * Returns:
+ *	 none
+ ******************************************************************************/
+void init_migr_record_imsm(struct supertype *st, struct imsm_dev *dev,
+			   struct mdinfo *info)
+{
+	struct intel_super *super = st->sb;
+	struct migr_record *migr_rec = super->migr_rec;
+	int new_data_disks;
+	unsigned long long dsize, dev_sectors;
+	long long unsigned min_dev_sectors = -1LLU;
+	struct imsm_map *map_dest = get_imsm_map(dev, MAP_0);
+	struct imsm_map *map_src = get_imsm_map(dev, MAP_1);
+	unsigned long long num_migr_units;
+	unsigned long long array_blocks;
+	struct dl *dl_disk = NULL;
+	__u32 strip_blocks;
+	__u32 depth_per_unit;
+	__u32 blocks_per_unit;
+
+	memset(migr_rec, 0, sizeof(struct migr_record));
+	migr_rec->family_num = __cpu_to_le32(super->anchor->family_num);
+
+	/* only ascending reshape supported now */
+	migr_rec->ascending_migr = __cpu_to_le32(1);
+
+	/* blocks_per_strip is kept little-endian in the map (it is
+	 * converted the same way in save_backup_imsm)
+	 */
+	strip_blocks = max(__le16_to_cpu(map_dest->blocks_per_strip),
+			   __le16_to_cpu(map_src->blocks_per_strip));
+	/* round the copy area down to a whole number of strips */
+	depth_per_unit = (GEN_MIGR_AREA_SIZE / strip_blocks) * strip_blocks;
+	new_data_disks = imsm_num_data_members(map_dest);
+	blocks_per_unit = depth_per_unit * new_data_disks;
+	migr_rec->blocks_per_unit = __cpu_to_le32(blocks_per_unit);
+	migr_rec->dest_depth_per_unit = __cpu_to_le32(depth_per_unit);
+	array_blocks = info->component_size * new_data_disks;
+	num_migr_units = array_blocks / blocks_per_unit;
+
+	/* a partial last unit still needs a migration step */
+	if (array_blocks % blocks_per_unit)
+		num_migr_units++;
+	set_num_migr_units(migr_rec, num_migr_units);
+
+	migr_rec->post_migr_vol_cap = dev->size_low;
+	migr_rec->post_migr_vol_cap_hi = dev->size_high;
+
+	/* Find the smallest dev */
+	for (dl_disk = super->disks; dl_disk ; dl_disk = dl_disk->next) {
+		/* ignore spares in container */
+		if (dl_disk->index < 0)
+			continue;
+		/* Start from a known value so that a failed size query can
+		 * be told apart; previously 'dsize' was read uninitialized
+		 * (or stale) in that case.
+		 */
+		dsize = 0;
+		get_dev_size(dl_disk->fd, NULL, &dsize);
+		if (dsize == 0)
+			continue;
+		dev_sectors = dsize / 512;
+		if (dev_sectors < min_dev_sectors)
+			min_dev_sectors = dev_sectors;
+	}
+	/* NOTE(review): when no member reports a size min_dev_sectors is
+	 * still -1LLU here - confirm callers never hit that case.
+	 */
+	set_migr_chkp_area_pba(migr_rec, min_dev_sectors -
+					RAID_DISK_RESERVED_BLOCKS_IMSM_HI);
+
+	write_imsm_migr_rec(st);
+
+	return;
+}
+
+/*******************************************************************************
+ * Function:	save_backup_imsm
+ * Description:	Function saves critical data stripes to Migration Copy Area
+ *		and updates the current migration unit status.
+ *		Use restore_stripes() to form a destination stripe,
+ *		and to write it to the Copy Area.
+ * Parameters:
+ *	st		: supertype information
+ *	dev		: imsm device that backup is saved for
+ *	info		: general array info
+ *	buf		: input buffer
+ *	length		: length of data to backup (blocks_per_unit)
+ * Returns:
+ *	 0 : success
+ *	-1 : fail
+ ******************************************************************************/
+int save_backup_imsm(struct supertype *st,
+		     struct imsm_dev *dev,
+		     struct mdinfo *info,
+		     void *buf,
+		     int length)
+{
+	struct intel_super *super = st->sb;
+	struct imsm_map *map_dest = get_imsm_map(dev, MAP_0);
+	int member_count = map_dest->num_members;
+	int layout = 0;
+	int chunk_bytes, fds[member_count];
+	unsigned long long start_bytes, offsets[member_count];
+	int data_disks = imsm_num_data_members(map_dest);
+	int n;
+
+	/* every member of the destination map needs a usable descriptor */
+	for (n = 0; n < member_count; n++) {
+		struct dl *member = get_imsm_dl_disk(super, n);
+
+		if (!member || !is_fd_valid(member->fd))
+			return -1;
+		fds[n] = member->fd;
+	}
+
+	start_bytes = info->reshape_progress * 512;
+	for (n = 0; n < member_count; n++) {
+		offsets[n] = migr_chkp_area_pba(super->migr_rec) * 512;
+		/* move back copy area address, it will be moved forward
+		 * in restore_stripes() using start input variable
+		 */
+		offsets[n] -= start_bytes/data_disks;
+	}
+
+	layout = imsm_level_to_layout(map_dest->raid_level);
+	chunk_bytes = __le16_to_cpu(map_dest->blocks_per_strip) * 512;
+
+	if (restore_stripes(fds, /* list of dest devices */
+			    offsets, /* migration record offsets */
+			    member_count,
+			    chunk_bytes,
+			    map_dest->raid_level,
+			    layout,
+			    -1, /* source backup file descriptor */
+			    0, /* input buf offset
+				* always 0 buf is already offseted */
+			    start_bytes,
+			    length,
+			    buf) != 0) {
+		pr_err("Error restoring stripes\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*******************************************************************************
+ * Function:	save_checkpoint_imsm
+ * Description:	Function called for current unit status update
+ *		in the migration record. It writes it to disk.
+ * Parameters:
+ *	st	: supertype information (st->sb is the imsm internal super)
+ *	info	: general array info (reshape_progress is read from it)
+ *	state	: value stored in the migration record's rec_status field
+ * Returns:
+ *	0: success
+ *	1: failure
+ *	2: failure, means no valid migration record
+ *	   / no general migration in progress /
+ ******************************************************************************/
+int save_checkpoint_imsm(struct supertype *st, struct mdinfo *info, int state)
+{
+	struct intel_super *super = st->sb;
+	unsigned long long blocks_per_unit;
+	unsigned long long curr_migr_unit;
+	unsigned long long dest_depth_per_unit;
+
+	if (load_imsm_migr_rec(super) != 0) {
+		dprintf("imsm: ERROR: Cannot read migration record for checkpoint save.\n");
+		return 1;
+	}
+
+	blocks_per_unit = __le32_to_cpu(super->migr_rec->blocks_per_unit);
+	if (blocks_per_unit == 0) {
+		dprintf("imsm: no migration in progress.\n");
+		return 2;
+	}
+	curr_migr_unit = info->reshape_progress / blocks_per_unit;
+	/* check if array is aligned to copy area
+	 * if it is not aligned, add one to current migration unit value
+	 * this can happen on array reshape finish only
+	 */
+	if (info->reshape_progress % blocks_per_unit)
+		curr_migr_unit++;
+
+	set_current_migr_unit(super->migr_rec, curr_migr_unit);
+	super->migr_rec->rec_status = __cpu_to_le32(state);
+
+	/* dest_depth_per_unit is kept little-endian in the record (it is
+	 * stored with __cpu_to_le32() when the record is initialised), so
+	 * convert it to CPU order before doing arithmetic on it.
+	 */
+	dest_depth_per_unit =
+		__le32_to_cpu(super->migr_rec->dest_depth_per_unit);
+	set_migr_dest_1st_member_lba(super->migr_rec,
+				     dest_depth_per_unit * curr_migr_unit);
+
+	if (write_imsm_migr_rec(st) < 0) {
+		dprintf("imsm: Cannot write migration record outside backup area\n");
+		return 1;
+	}
+
+	return 0;
+}
+
+/*******************************************************************************
+ * Function:	recover_backup_imsm
+ * Description:	Function recovers critical data from the Migration Copy Area
+ *		while assembling an array: one migration unit is copied, on
+ *		each member disk, from the copy area to its destination.
+ * Parameters:
+ *	st	: supertype information (st->sb is the imsm internal super)
+ *	info	: general array info
+ * Returns:
+ *	0 : success (or there is no data to recover)
+ *	1 : fail
+ ******************************************************************************/
+int recover_backup_imsm(struct supertype *st, struct mdinfo *info)
+{
+	struct intel_super *super = st->sb;
+	struct migr_record *rec = super->migr_rec;
+	unsigned int sector_size = super->sector_size;
+	unsigned long long cur_unit = current_migr_unit(rec);
+	unsigned long long unit_cnt = get_num_migr_units(rec);
+	struct intel_dev *vol;
+	struct imsm_map *dest_map;
+	struct dl *disk;
+	unsigned long long src_off;
+	unsigned long long dst_off;
+	unsigned unit_len;
+	char state[20];
+	char *buf = NULL;
+	int members;
+	int failed_disks = 0;
+	int retval = 1;
+
+	if (sysfs_get_str(info, NULL, "array_state", (char *)state, 20) < 1)
+		return 1;
+
+	/* data is recovered only while the array is being assembled */
+	if (strncmp(state, "inactive", 8) != 0)
+		return 0;
+	/* record says the source is intact: nothing to recover */
+	if (__le32_to_cpu(rec->rec_status) == UNIT_SRC_NORMAL)
+		return 0;
+	if (cur_unit >= unit_cnt)
+		return 1;
+
+	/* locate the volume that is under general migration */
+	vol = super->devlist;
+	while (vol && !is_gen_migration(vol->dev))
+		vol = vol->next;
+	if (vol == NULL)
+		return 1;
+
+	dest_map = get_imsm_map(vol->dev, MAP_0);
+	members = dest_map->num_members;
+
+	src_off = migr_chkp_area_pba(rec) * 512;
+	dst_off = (migr_dest_1st_member_lba(rec) +
+		   pba_of_lba0(dest_map)) * 512;
+
+	unit_len = __le32_to_cpu(rec->dest_depth_per_unit) * 512;
+	if (posix_memalign((void **)&buf, sector_size, unit_len) != 0)
+		goto abort;
+
+	for (disk = super->disks; disk; disk = disk->next) {
+		/* negative index: not a member of the array */
+		if (disk->index < 0)
+			continue;
+
+		if (!is_fd_valid(disk->fd)) {
+			failed_disks++;
+			continue;
+		}
+
+		/* seek/read from the copy area, seek/write to the target;
+		 * the first step that fails marks this disk as skipped
+		 */
+		if (lseek64(disk->fd, src_off, SEEK_SET) < 0)
+			pr_err("Cannot seek to block: %s\n",
+			       strerror(errno));
+		else if (read(disk->fd, buf, unit_len) != (ssize_t)unit_len)
+			pr_err("Cannot read copy area block: %s\n",
+			       strerror(errno));
+		else if (lseek64(disk->fd, dst_off, SEEK_SET) < 0)
+			pr_err("Cannot seek to block: %s\n",
+			       strerror(errno));
+		else if (write(disk->fd, buf, unit_len) != (ssize_t)unit_len)
+			pr_err("Cannot restore block: %s\n",
+			       strerror(errno));
+		else
+			continue;
+
+		failed_disks++;
+	}
+
+	if (failed_disks > imsm_get_allowed_degradation(info->new_level,
+							members,
+							super,
+							vol->dev)) {
+		pr_err("Cannot restore data from backup. Too many failed disks\n");
+		goto abort;
+	}
+
+	/* NOTE(review): any non-zero result, including 2, currently leaves
+	 * retval at 1 - confirm whether 2 (no migration in progress) was
+	 * meant to be tolerated here.
+	 */
+	if (save_checkpoint_imsm(st, info, UNIT_SRC_NORMAL) == 0)
+		retval = 0;
+	else
+		dprintf("imsm: Cannot write checkpoint to migration record (UNIT_SRC_NORMAL) during restart\n");
+
+abort:
+	free(buf);
+	return retval;
+}
+
+static char disk_by_path[] = "/dev/disk/by-path/";
+
+/* Map a /dev/disk/by-path/ entry name to the name of the controller driver
+ * domain of the HBA the disk is attached to.
+ * Returns "isci", "ahci", "vmd", "nvme" or "unknown"; returns NULL when the
+ * by-path entry cannot be stat()ed.
+ */
+static const char *imsm_get_disk_controller_domain(const char *path)
+{
+	char link_path[PATH_MAX];
+	struct stat sb;
+	struct sys_dev *hba;
+	char *devpath;
+	char *drv;
+
+	strcpy(link_path, disk_by_path);
+	strncat(link_path, path, PATH_MAX - strlen(link_path) - 1);
+	if (stat(link_path, &sb) != 0)
+		return NULL;
+
+	devpath = devt_to_devpath(sb.st_rdev, 1, NULL);
+	if (devpath == NULL)
+		return "unknown";
+
+	hba = find_disk_attached_hba(-1, devpath);
+	if (!hba) {
+		drv = "unknown";
+	} else {
+		switch (hba->type) {
+		case SYS_DEV_SAS:
+			drv = "isci";
+			break;
+		case SYS_DEV_SATA:
+			drv = "ahci";
+			break;
+		case SYS_DEV_VMD:
+			drv = "vmd";
+			break;
+		case SYS_DEV_NVME:
+			drv = "nvme";
+			break;
+		default:
+			drv = "unknown";
+			break;
+		}
+	}
+	dprintf("path: %s hba: %s attached: %s\n",
+		devpath, (hba) ? hba->path : "NULL", drv);
+	free(devpath);
+
+	return drv;
+}
+
+/* Look up the md device name of the member array with index subdev inside
+ * the given container.
+ * Returns a pointer to a static buffer (overwritten by the next call), or
+ * NULL when mdstat_by_subdev() finds no matching entry.
+ */
+static char *imsm_find_array_devnm_by_subdev(int subdev, char *container)
+{
+	static char devnm[32];
+	char subdev_str[20];
+	struct mdstat_ent *ent;
+
+	sprintf(subdev_str, "%d", subdev);
+	ent = mdstat_by_subdev(subdev_str, container);
+	if (ent == NULL)
+		return NULL;
+
+	strcpy(devnm, ent->devnm);
+	free_mdstat(ent);
+
+	return devnm;
+}
+
+/* Check whether a container-wide reshape (Online Capacity Expansion) may be
+ * performed.
+ *
+ * st		   : supertype of the container
+ * geo		   : requested geometry; only raid_disks may be set
+ * old_raid_disks  : in/out; when non-zero on entry every member must have
+ *		     this many raid disks, on exit it holds the raid disk
+ *		     count of the checked members
+ * direction	   : metadata change direction; rollback is rejected
+ *
+ * Every member volume is validated individually: it must be RAID0 or RAID5,
+ * supported by the platform with the new disk count, have a component size
+ * aligned to its chunk size, and be assembled.
+ *
+ * Returns 1 when the operation is allowed, 0 otherwise.
+ */
+static int imsm_reshape_is_allowed_on_container(struct supertype *st,
+						struct geo_params *geo,
+						int *old_raid_disks,
+						int direction)
+{
+	/* currently we only support increasing the number of devices
+	 * for a container.  This increases the number of device for each
+	 * member array.  They must all be RAID0 or RAID5.
+	 */
+	int ret_val = 0;
+	struct mdinfo *info, *member;
+	int devices_that_can_grow = 0;
+
+	dprintf("imsm: imsm_reshape_is_allowed_on_container(ENTER): st->devnm = (%s)\n", st->devnm);
+
+	if (geo->size > 0 ||
+	    geo->level != UnSet ||
+	    geo->layout != UnSet ||
+	    geo->chunksize != 0 ||
+	    geo->raid_disks == UnSet) {
+		dprintf("imsm: Container operation is allowed for raid disks number change only.\n");
+		return ret_val;
+	}
+
+	if (direction == ROLLBACK_METADATA_CHANGES) {
+		dprintf("imsm: Metadata changes rollback is not supported for container operation.\n");
+		return ret_val;
+	}
+
+	info = container_content_imsm(st, NULL);
+	for (member = info; member; member = member->next) {
+		char *result;
+
+		dprintf("imsm: checking device_num: %i\n",
+			member->container_member);
+
+		if (geo->raid_disks <= member->array.raid_disks) {
+			/* we work on container for Online Capacity Expansion
+			 * only so raid_disks has to grow
+			 */
+			dprintf("imsm: for container operation raid disks increase is required\n");
+			break;
+		}
+
+		/* All checks below look at the member being iterated, not
+		 * at the head of the list, so that every volume in the
+		 * container is validated.
+		 */
+		if (member->array.level != 0 && member->array.level != 5) {
+			/* we cannot use this container with other raid level
+			 */
+			dprintf("imsm: for container operation wrong raid level (%i) detected\n",
+				member->array.level);
+			break;
+		} else {
+			/* check for platform support
+			 * for this raid level configuration
+			 */
+			struct intel_super *super = st->sb;
+			if (!is_raid_level_supported(super->orom,
+						     member->array.level,
+						     geo->raid_disks)) {
+				dprintf("platform does not support raid%d with %d disk%s\n",
+					 member->array.level,
+					 geo->raid_disks,
+					 geo->raid_disks > 1 ? "s" : "");
+				break;
+			}
+			/* check if component size is aligned to chunk size
+			 */
+			if (member->component_size %
+			    (member->array.chunk_size/512)) {
+				dprintf("Component size is not aligned to chunk size\n");
+				break;
+			}
+		}
+
+		/* all members have to agree on the current disk count */
+		if (*old_raid_disks &&
+		    member->array.raid_disks != *old_raid_disks)
+			break;
+		*old_raid_disks = member->array.raid_disks;
+
+		/* All raid5 and raid0 volumes in container
+		 * have to be ready for Online Capacity Expansion
+		 * so they need to be assembled.  We have already
+		 * checked that no recovery etc is happening.
+		 */
+		result = imsm_find_array_devnm_by_subdev(member->container_member,
+							 st->container_devnm);
+		if (result == NULL) {
+			dprintf("imsm: cannot find array\n");
+			break;
+		}
+		devices_that_can_grow++;
+	}
+	sysfs_free(info);
+	/* member is NULL only when the loop ran to the end of the list */
+	if (!member && devices_that_can_grow)
+		ret_val = 1;
+
+	if (ret_val)
+		dprintf("Container operation allowed\n");
+	else
+		dprintf("Error: %i\n", ret_val);
+
+	return ret_val;
+}
+
+/* Function:	get_spares_for_grow
+ * Description:	Builds the list of spare devices available in the container,
+ *		using the spare criteria reported by
+ *		get_spare_criteria_imsm() to select acceptable drives.
+ * Parameters:	Pointer to the supertype structure
+ * Returns:	Pointer to the list of spare devices (mdinfo structure) on
+ *		success, NULL if fail
+ */
+static struct mdinfo *get_spares_for_grow(struct supertype *st)
+{
+	struct spare_criteria criteria;
+	struct mdinfo *spares;
+
+	get_spare_criteria_imsm(st, &criteria);
+	spares = container_choose_spares(st, &criteria, NULL, NULL, NULL, 0);
+
+	return spares;
+}
+
+/******************************************************************************
+ * function: imsm_create_metadata_update_for_reshape
+ *           Function creates update for whole IMSM container.
+ *
+ * st             : supertype of the container
+ * geo            : requested geometry (raid_disks is the new disk count)
+ * old_raid_disks : current number of raid disks
+ * updatep        : on success receives the allocated update; the caller
+ *                  owns it
+ *
+ * Returns the size of the update in bytes on success, 0 on failure (the
+ * update is freed and *updatep is left untouched).
+ ******************************************************************************/
+static int imsm_create_metadata_update_for_reshape(
+	struct supertype *st,
+	struct geo_params *geo,
+	int old_raid_disks,
+	struct imsm_update_reshape **updatep)
+{
+	struct intel_super *super = st->sb;
+	struct imsm_super *mpb = super->anchor;
+	int update_memory_size;
+	struct imsm_update_reshape *u;
+	struct mdinfo *spares;
+	int i;
+	int delta_disks;
+	struct mdinfo *dev;
+
+	dprintf("(enter) raid_disks = %i\n", geo->raid_disks);
+
+	delta_disks = geo->raid_disks - old_raid_disks;
+
+	/* size of all update data without anchor */
+	update_memory_size = sizeof(struct imsm_update_reshape);
+
+	/* now add space for spare disks that we need to add.
+	 * One new_disks[] slot is already part of the structure.
+	 */
+	update_memory_size += sizeof(u->new_disks[0]) * (delta_disks - 1);
+
+	u = xcalloc(1, update_memory_size);
+	u->type = update_reshape_container_disks;
+	u->old_raid_disks = old_raid_disks;
+	u->new_raid_disks = geo->raid_disks;
+
+	/* now get spare disks list
+	 */
+	spares = get_spares_for_grow(st);
+
+	if (spares == NULL || delta_disks > spares->array.spare_disks) {
+		pr_err("imsm: ERROR: Cannot get spare devices for %s.\n", geo->dev_name);
+		i = -1;
+		goto abort;
+	}
+
+	/* we have got spares
+	 * update disk list in imsm_disk list table in anchor
+	 */
+	dprintf("imsm: %i spares are available.\n\n",
+		spares->array.spare_disks);
+
+	dev = spares->devs;
+	for (i = 0; i < delta_disks; i++) {
+		struct dl *dl;
+
+		if (dev == NULL)
+			break;
+		dl = get_disk_super(super, dev->disk.major, dev->disk.minor);
+		if (dl == NULL) {
+			/* the spare is not present in this super's disk
+			 * list; stop here so that the i != delta_disks
+			 * check below reports the failure
+			 */
+			break;
+		}
+		u->new_disks[i] = makedev(dev->disk.major,
+					  dev->disk.minor);
+		dl->index = mpb->num_disks;
+		mpb->num_disks++;
+		dev = dev->next;
+	}
+
+abort:
+	/* free spares
+	 */
+	sysfs_free(spares);
+
+	dprintf("imsm: reshape update preparation :");
+	/* success only when every required spare has been assigned */
+	if (i == delta_disks) {
+		dprintf_cont(" OK\n");
+		*updatep = u;
+		return update_memory_size;
+	}
+	free(u);
+	dprintf_cont(" Error\n");
+
+	return 0;
+}
+
+/******************************************************************************
+ * function: imsm_create_metadata_update_for_size_change()
+ *           Creates update for IMSM array for array size change.
+ *
+ * The update targets the volume selected by super->current_vol and carries
+ * geo->size as the new size. The allocated update is returned through
+ * updatep and is owned by the caller.
+ *
+ * Returns the size of the update in bytes.
+ ******************************************************************************/
+static int imsm_create_metadata_update_for_size_change(
+	struct supertype *st,
+	struct geo_params *geo,
+	struct imsm_update_size_change **updatep)
+{
+	struct intel_super *super = st->sb;
+	/* size of all update data without anchor */
+	int len = sizeof(struct imsm_update_size_change);
+	struct imsm_update_size_change *upd;
+
+	dprintf("(enter) New size = %llu\n", geo->size);
+
+	upd = xcalloc(1, len);
+	upd->type = update_size_change;
+	upd->subdev = super->current_vol;
+	upd->new_size = geo->size;
+
+	dprintf("imsm: reshape update preparation : OK\n");
+	*updatep = upd;
+
+	return len;
+}
+
+/******************************************************************************
+ * function: imsm_create_metadata_update_for_migration()
+ *           Creates update for IMSM array.
+ *
+ * The update targets the volume selected by super->current_vol and carries
+ * the level, layout, raid disk count and chunk size taken from geo. For a
+ * RAID0 -> RAID5 migration one more raid disk is requested and at least one
+ * spare has to be available.
+ *
+ * Returns the size of the update in bytes (the update is handed to the
+ * caller through updatep), or 0 when no spare is available.
+ ******************************************************************************/
+static int imsm_create_metadata_update_for_migration(
+	struct supertype *st,
+	struct geo_params *geo,
+	struct imsm_update_reshape_migration **updatep)
+{
+	struct intel_super *super = st->sb;
+	/* size of all update data without anchor */
+	int len = sizeof(struct imsm_update_reshape_migration);
+	struct imsm_update_reshape_migration *upd;
+	struct imsm_dev *dev;
+	struct imsm_map *map;
+	int old_level = -1;
+
+	dprintf("(enter) New Level = %i\n", geo->level);
+
+	upd = xcalloc(1, len);
+	upd->type = update_reshape_migration;
+	upd->subdev = super->current_vol;
+	upd->new_level = geo->level;
+	upd->new_layout = geo->layout;
+	upd->new_raid_disks = upd->old_raid_disks = geo->raid_disks;
+	upd->new_disks[0] = -1;
+	upd->new_chunksize = -1;
+
+	dev = get_imsm_dev(super, upd->subdev);
+	map = dev ? get_imsm_map(dev, MAP_0) : NULL;
+	if (map) {
+		int current_chunk_size =
+			__le16_to_cpu(map->blocks_per_strip) / 2;
+
+		/* NOTE(review): current_chunk_size is blocks_per_strip / 2
+		 * while new_chunksize is derived as geo->chunksize / 1024,
+		 * so the two sides of this comparison look like different
+		 * units - confirm this is intended.
+		 */
+		if (geo->chunksize != current_chunk_size) {
+			upd->new_chunksize = geo->chunksize / 1024;
+			dprintf("imsm: chunk size change from %i to %i\n",
+				current_chunk_size, upd->new_chunksize);
+		}
+		old_level = map->raid_level;
+	}
+
+	if (geo->level == 5 && old_level == 0) {
+		struct mdinfo *spares;
+		int have_spare;
+
+		/* a disk has to be added when going from RAID0 to RAID5 */
+		upd->new_raid_disks++;
+		spares = get_spares_for_grow(st);
+		have_spare = spares != NULL && spares->array.spare_disks >= 1;
+		if (!have_spare)
+			free(upd);
+		sysfs_free(spares);
+		if (!have_spare) {
+			pr_err("cannot get spare device for requested migration\n");
+			return 0;
+		}
+	}
+	dprintf("imsm: reshape update preparation : OK\n");
+	*updatep = upd;
+
+	return len;
+}
+
+/* Apply a metadata update to the in-memory metadata of this process:
+ * the update is wrapped in a metadata_update, prepared with
+ * imsm_prepare_update() and, if that succeeds, applied with
+ * imsm_process_update(). Whatever is left on the update's space_list
+ * afterwards is released. buf itself is not freed here.
+ */
+static void imsm_update_metadata_locally(struct supertype *st,
+					 void *buf, int len)
+{
+	struct metadata_update update;
+	void **entry;
+
+	update.next = NULL;
+	update.space_list = NULL;
+	update.space = NULL;
+	update.len = len;
+	update.buf = buf;
+
+	if (imsm_prepare_update(st, &update))
+		imsm_process_update(st, &update);
+
+	/* space_list is a singly linked list: each entry starts with the
+	 * pointer to the next one
+	 */
+	for (entry = update.space_list; entry; entry = update.space_list) {
+		update.space_list = *entry;
+		free(entry);
+	}
+}
+
+/***************************************************************************
+* Function:	imsm_analyze_change
+* Description:	Function analyze change for single volume
+*		and validate if transition is supported
+* Parameters:	st        - supertype structure (st->sb->current_vol selects
+*			    the volume)
+*		geo       - geometry parameters; level, layout, chunksize,
+*			    size and raid_disks may be rewritten with the
+*			    resolved values
+*		direction - metadata change direction (apply/rollback)
+* Returns:	Operation type code on success, -1 if fail
+****************************************************************************/
+enum imsm_reshape_type imsm_analyze_change(struct supertype *st,
+					    struct geo_params *geo,
+					    int direction)
+{
+	struct mdinfo info;
+	int change = -1;
+	int check_devs = 0;
+	int chunk;
+	/* number of added/removed disks in operation result */
+	int devNumChange = 0;
+	/* imsm compatible layout value for array geometry verification */
+	int imsm_layout = -1;
+	int data_disks;
+	struct imsm_dev *dev;
+	struct imsm_map *map;
+	struct intel_super *super;
+	unsigned long long current_size;
+	unsigned long long free_size;
+	unsigned long long max_size;
+	int rv;
+
+	getinfo_super_imsm_volume(st, &info, NULL);
+
+	/* --- level change --- */
+	if (geo->level != info.array.level && geo->level >= 0 &&
+	    geo->level != UnSet) {
+		switch (info.array.level) {
+		case 0:
+			if (geo->level == 5) {
+				change = CH_MIGRATION;
+				if (geo->layout != ALGORITHM_LEFT_ASYMMETRIC) {
+					pr_err("Error. Requested Layout not supported (left-asymmetric layout is supported only)!\n");
+					change = -1;
+					goto analyse_change_exit;
+				}
+				imsm_layout = geo->layout;
+				check_devs = 1;
+				devNumChange = 1; /* parity disk added */
+			} else if (geo->level == 10) {
+				change = CH_TAKEOVER;
+				check_devs = 1;
+				devNumChange = 2; /* two mirrors added */
+				imsm_layout = 0x102; /* imsm supported layout */
+			}
+			break;
+		case 1:
+		case 10:
+			if (geo->level == 0) {
+				change = CH_TAKEOVER;
+				check_devs = 1;
+				devNumChange = -(geo->raid_disks/2);
+				imsm_layout = 0; /* imsm raid0 layout */
+			}
+			break;
+		}
+		if (change == -1) {
+			pr_err("Error. Level Migration from %d to %d not supported!\n",
+			       info.array.level, geo->level);
+			goto analyse_change_exit;
+		}
+	} else
+		geo->level = info.array.level;
+
+	/* --- layout change --- */
+	if (geo->layout != info.array.layout &&
+	    (geo->layout != UnSet && geo->layout != -1)) {
+		change = CH_MIGRATION;
+		if (info.array.layout == 0 && info.array.level == 5 &&
+		    geo->layout == 5) {
+			/* reshape 5 -> 4 */
+		} else if (info.array.layout == 5 && info.array.level == 5 &&
+			   geo->layout == 0) {
+			/* reshape 4 -> 5 */
+			geo->layout = 0;
+			geo->level = 5;
+		} else {
+			pr_err("Error. Layout Migration from %d to %d not supported!\n",
+			       info.array.layout, geo->layout);
+			change = -1;
+			goto analyse_change_exit;
+		}
+	} else {
+		geo->layout = info.array.layout;
+		if (imsm_layout == -1)
+			imsm_layout = info.array.layout;
+	}
+
+	/* --- chunk size change --- */
+	if (geo->chunksize > 0 && geo->chunksize != UnSet &&
+	    geo->chunksize != info.array.chunk_size) {
+		if (info.array.level == 10) {
+			pr_err("Error. Chunk size change for RAID 10 is not supported.\n");
+			change = -1;
+			goto analyse_change_exit;
+		} else if (info.component_size % (geo->chunksize/512)) {
+			pr_err("New chunk size (%dK) does not evenly divide device size (%lluk). Aborting...\n",
+			       geo->chunksize/1024, info.component_size/2);
+			change = -1;
+			goto analyse_change_exit;
+		}
+		change = CH_MIGRATION;
+	} else {
+		geo->chunksize = info.array.chunk_size;
+	}
+
+	chunk = geo->chunksize / 1024;
+
+	super = st->sb;
+	dev = get_imsm_dev(super, super->current_vol);
+	map = get_imsm_map(dev, MAP_0);
+	data_disks = imsm_num_data_members(map);
+	/* compute current size per disk member
+	 */
+	current_size = info.custom_array_size / data_disks;
+
+	/* --- size change --- */
+	if (geo->size > 0 && geo->size != MAX_SIZE) {
+		/* align component size
+		 */
+		geo->size = imsm_component_size_alignment_check(
+				get_imsm_raid_level(dev->vol.map),
+				chunk * 1024, super->sector_size,
+				geo->size * 2);
+		if (geo->size == 0) {
+			pr_err("Error. Size expansion is supported only (current size is %llu, requested size /rounded/ is 0).\n",
+				   current_size);
+			/* a level/layout/chunk change may already have been
+			 * recorded above; the request as a whole is invalid
+			 */
+			change = -1;
+			goto analyse_change_exit;
+		}
+	}
+
+	if (current_size != geo->size && geo->size > 0) {
+		if (change != -1) {
+			pr_err("Error. Size change should be the only one at a time.\n");
+			change = -1;
+			goto analyse_change_exit;
+		}
+		if ((super->current_vol + 1) != super->anchor->num_raid_devs) {
+			pr_err("Error. The last volume in container can be expanded only (%i/%s).\n",
+			       super->current_vol, st->devnm);
+			goto analyse_change_exit;
+		}
+		/* check the maximum available size
+		 */
+		rv = imsm_get_free_size(st, dev->vol.map->num_members,
+					0, chunk, &free_size);
+		if (rv == 0)
+			/* Cannot find maximum available space
+			 */
+			max_size = 0;
+		else {
+			max_size = free_size + current_size;
+			/* align component size
+			 */
+			max_size = imsm_component_size_alignment_check(
+					get_imsm_raid_level(dev->vol.map),
+					chunk * 1024, super->sector_size,
+					max_size);
+		}
+		if (geo->size == MAX_SIZE) {
+			/* requested size change to the maximum available size
+			 */
+			if (max_size == 0) {
+				pr_err("Error. Cannot find maximum available space.\n");
+				change = -1;
+				goto analyse_change_exit;
+			} else
+				geo->size = max_size;
+		}
+
+		if (direction == ROLLBACK_METADATA_CHANGES) {
+			/* accept size for rollback only
+			 */
+		} else {
+			/* round size due to metadata compatibility
+			 */
+			geo->size = (geo->size >> SECT_PER_MB_SHIFT)
+				    << SECT_PER_MB_SHIFT;
+			dprintf("Prepare update for size change to %llu\n",
+				geo->size );
+			if (current_size >= geo->size) {
+				pr_err("Error. Size expansion is supported only (current size is %llu, requested size /rounded/ is %llu).\n",
+				       current_size, geo->size);
+				goto analyse_change_exit;
+			}
+			if (max_size && geo->size > max_size) {
+				pr_err("Error. Requested size is larger than maximum available size (maximum available size is %llu, requested size /rounded/ is %llu).\n",
+				       max_size, geo->size);
+				goto analyse_change_exit;
+			}
+		}
+		/* from here on geo->size is the size of the whole array */
+		geo->size *= data_disks;
+		geo->raid_disks = dev->vol.map->num_members;
+		change = CH_ARRAY_SIZE;
+	}
+
+	/* --- validate the resulting geometry --- */
+	if (!validate_geometry_imsm(st,
+				    geo->level,
+				    imsm_layout,
+				    geo->raid_disks + devNumChange,
+				    &chunk,
+				    geo->size, INVALID_SECTORS,
+				    0, 0, info.consistency_policy, 1))
+		change = -1;
+
+	if (check_devs) {
+		struct imsm_super *mpb = super->anchor;
+
+		if (mpb->num_raid_devs > 1) {
+			pr_err("Error. Cannot perform operation on %s- for this operation it MUST be single array in container\n",
+			       geo->dev_name);
+			change = -1;
+		}
+	}
+
+analyse_change_exit:
+	if (direction == ROLLBACK_METADATA_CHANGES &&
+	    (change == CH_MIGRATION || change == CH_TAKEOVER)) {
+		dprintf("imsm: Metadata changes rollback is not supported for migration and takeover operations.\n");
+		change = -1;
+	}
+	return change;
+}
+
+/* Build and apply a takeover update for the current volume.
+ *
+ * geo->level selects the direction: 0 means RAID10 -> RAID0 and 10 means
+ * RAID0 -> RAID10. The update is applied to the local metadata and, when
+ * st->update_tail is set, queued with append_metadata_update(), which then
+ * owns the buffer; otherwise it is freed here.
+ *
+ * Returns 0 on success, 1 when geo->level is not a supported takeover
+ * target.
+ */
+int imsm_takeover(struct supertype *st, struct geo_params *geo)
+{
+	struct intel_super *super = st->sb;
+	struct imsm_update_takeover *u;
+	int direction;
+
+	switch (geo->level) {
+	case 0:
+		/* 10->0 transition */
+		direction = R10_TO_R0;
+		break;
+	case 10:
+		/* 0->10 transition */
+		direction = R0_TO_R10;
+		break;
+	default:
+		/* never send an update with an undefined direction */
+		dprintf("imsm: takeover to level %i is not supported\n",
+			geo->level);
+		return 1;
+	}
+
+	/* zeroed allocation, like the other update builders */
+	u = xcalloc(1, sizeof(struct imsm_update_takeover));
+
+	u->type = update_takeover;
+	u->subarray = super->current_vol;
+	u->direction = direction;
+
+	/* update metadata locally */
+	imsm_update_metadata_locally(st, u,
+				     sizeof(struct imsm_update_takeover));
+	/* and possibly remotely */
+	if (st->update_tail)
+		append_metadata_update(st, u,
+				       sizeof(struct imsm_update_takeover));
+	else
+		free(u);
+
+	return 0;
+}
+
+/* Flush size update if size calculated by num_data_stripes is higher than
+ * imsm_dev_size to eliminate differences during reshape.
+ * Mdmon will recalculate them correctly.
+ * If subarray index is not set then check whole container.
+ *
+ * super->current_vol is changed while iterating and restored before
+ * returning. Volumes under general migration are skipped.
+ *
+ * Returns:
+ *	0 - no error occurred
+ *	1 - error detected
+ */
+static int imsm_fix_size_mismatch(struct supertype *st, int subarray_index)
+{
+	struct intel_super *super = st->sb;
+	int tmp = super->current_vol;
+	int ret_val = 1;
+	int i;
+
+	for (i = 0; i < super->anchor->num_raid_devs; i++) {
+		if (subarray_index >= 0 && i != subarray_index)
+			continue;
+		super->current_vol = i;
+		struct imsm_dev *dev = get_imsm_dev(super, super->current_vol);
+		struct imsm_map *map = get_imsm_map(dev, MAP_0);
+		unsigned int disc_count = imsm_num_data_members(map);
+		struct geo_params geo;
+		struct imsm_update_size_change *update;
+		unsigned long long calc_size = per_dev_array_size(map) * disc_count;
+		unsigned long long d_size = imsm_dev_size(dev);
+		int u_size;
+
+		if (calc_size == d_size || dev->vol.migr_type == MIGR_GEN_MIGR)
+			continue;
+
+		/* There is a difference, confirm that imsm_dev_size is
+		 * smaller and push update.
+		 */
+		if (d_size > calc_size) {
+			pr_err("imsm: dev size of subarray %d is incorrect\n",
+			       i);
+			goto exit;
+		}
+		memset(&geo, 0, sizeof(struct geo_params));
+		geo.size = d_size;
+		u_size = imsm_create_metadata_update_for_size_change(st, &geo,
+								     &update);
+		if (u_size < 1) {
+			dprintf("imsm: Cannot prepare size change update\n");
+			goto exit;
+		}
+		imsm_update_metadata_locally(st, update, u_size);
+		if (st->update_tail) {
+			/* The update queue takes ownership of the buffer
+			 * once it is appended, so it must not be freed
+			 * here as well.
+			 */
+			append_metadata_update(st, update, u_size);
+			flush_metadata_updates(st);
+			st->update_tail = &st->updates;
+		} else {
+			imsm_sync_metadata(st);
+			free(update);
+		}
+	}
+	ret_val = 0;
+exit:
+	super->current_vol = tmp;
+	return ret_val;
+}
+
+/* Apply a prepared update to the local metadata, then either queue it for
+ * the monitor (the queue keeps the buffer) or release it.
+ */
+static void imsm_reshape_push_update(struct supertype *st, void *update,
+				     int len)
+{
+	/* update metadata locally */
+	imsm_update_metadata_locally(st, update, len);
+	/* and possibly remotely */
+	if (st->update_tail)
+		append_metadata_update(st, update, len);
+	else
+		free(update);
+}
+
+/* Entry point for a reshape request on an IMSM container or volume.
+ *
+ * When st refers to the container itself only a raid disk count increase
+ * is handled. For a volume the request is classified by
+ * imsm_analyze_change() as takeover, migration or array size change and the
+ * matching metadata update is created and applied.
+ *
+ * backup and verbose are not used in this function.
+ *
+ * Returns 0 on success, 1 on failure.
+ */
+static int imsm_reshape_super(struct supertype *st, unsigned long long size,
+			      int level,
+			      int layout, int chunksize, int raid_disks,
+			      int delta_disks, char *backup, char *dev,
+			      int direction, int verbose)
+{
+	int ret_val = 1;
+	struct geo_params geo;
+
+	dprintf("(enter)\n");
+
+	memset(&geo, 0, sizeof(struct geo_params));
+
+	geo.dev_name = dev;
+	strcpy(geo.devnm, st->devnm);
+	geo.size = size;
+	geo.level = level;
+	geo.layout = layout;
+	geo.chunksize = chunksize;
+	geo.raid_disks = raid_disks;
+	if (delta_disks != UnSet)
+		geo.raid_disks += delta_disks;
+
+	dprintf("for level      : %i\n", geo.level);
+	dprintf("for raid_disks : %i\n", geo.raid_disks);
+
+	if (strcmp(st->container_devnm, st->devnm) == 0) {
+		/* On container level we can only increase number of devices. */
+		struct imsm_update_reshape *upd = NULL;
+		int old_raid_disks = 0;
+		int upd_len;
+
+		dprintf("imsm: info: Container operation\n");
+
+		if (!imsm_reshape_is_allowed_on_container(
+			    st, &geo, &old_raid_disks, direction)) {
+			pr_err("(imsm) Operation is not allowed on this container\n");
+			goto exit_imsm_reshape_super;
+		}
+
+		if (imsm_fix_size_mismatch(st, -1)) {
+			dprintf("imsm: Cannot fix size mismatch\n");
+			goto exit_imsm_reshape_super;
+		}
+
+		upd_len = imsm_create_metadata_update_for_reshape(
+			st, &geo, old_raid_disks, &upd);
+		if (upd_len <= 0) {
+			dprintf("imsm: Cannot prepare update\n");
+			goto exit_imsm_reshape_super;
+		}
+
+		ret_val = 0;
+		imsm_reshape_push_update(st, upd, upd_len);
+	} else {
+		/* On volume level we support following operations
+		 * - takeover: raid10 -> raid0; raid0 -> raid10
+		 * - chunk size migration
+		 * - migration: raid5 -> raid0; raid0 -> raid5
+		 */
+		struct intel_super *super = st->sb;
+		struct intel_dev *vol;
+		int change;
+
+		dprintf("imsm: info: Volume operation\n");
+		/* find requested device */
+		for (vol = super->devlist; vol; vol = vol->next) {
+			char *devnm = imsm_find_array_devnm_by_subdev(
+				vol->index, st->container_devnm);
+
+			if (devnm && strcmp(devnm, geo.devnm) == 0)
+				break;
+		}
+		if (vol == NULL) {
+			pr_err("Cannot find %s (%s) subarray\n",
+				geo.dev_name, geo.devnm);
+			goto exit_imsm_reshape_super;
+		}
+		super->current_vol = vol->index;
+		change = imsm_analyze_change(st, &geo, direction);
+
+		if (change == CH_TAKEOVER) {
+			ret_val = imsm_takeover(st, &geo);
+		} else if (change == CH_MIGRATION) {
+			struct imsm_update_reshape_migration *upd = NULL;
+			int upd_len =
+				imsm_create_metadata_update_for_migration(
+					st, &geo, &upd);
+
+			if (upd_len < 1) {
+				dprintf("imsm: Cannot prepare update\n");
+			} else {
+				ret_val = 0;
+				imsm_reshape_push_update(st, upd, upd_len);
+			}
+		} else if (change == CH_ARRAY_SIZE) {
+			struct imsm_update_size_change *upd = NULL;
+			int upd_len =
+				imsm_create_metadata_update_for_size_change(
+					st, &geo, &upd);
+
+			if (upd_len < 1) {
+				dprintf("imsm: Cannot prepare update\n");
+			} else {
+				ret_val = 0;
+				imsm_reshape_push_update(st, upd, upd_len);
+			}
+		} else {
+			/* unsupported or invalid request */
+			ret_val = 1;
+		}
+	}
+
+exit_imsm_reshape_super:
+	dprintf("imsm: reshape_super Exit code = %i\n", ret_val);
+	return ret_val;
+}
+
+#define COMPLETED_OK		0
+#define COMPLETED_NONE		1
+#define COMPLETED_DELAYED	2
+
+/* Read and classify the content of an open sysfs attribute descriptor.
+ *
+ * Returns:
+ *	COMPLETED_NONE    - the text starts with "none"
+ *	COMPLETED_DELAYED - the text starts with "delayed"
+ *	COMPLETED_OK      - the text starts with a number, stored in *val
+ *	negative          - the read failed (the value returned by
+ *			    sysfs_fd_get_str() is passed on) or the text is
+ *			    not a number (-1)
+ */
+static int read_completed(int fd, unsigned long long *val)
+{
+	char text[50];
+	char *end;
+	int rc;
+
+	rc = sysfs_fd_get_str(fd, text, 50);
+	if (rc < 0)
+		return rc;
+
+	if (strncmp(text, "none", 4) == 0)
+		return COMPLETED_NONE;
+	if (strncmp(text, "delayed", 7) == 0)
+		return COMPLETED_DELAYED;
+
+	/* *val is written even when the text turns out not to be valid */
+	*val = strtoull(text, &end, 0);
+	if (end == text || (*end != 0 && *end != '\n' && *end != ' '))
+		return -1;
+
+	return COMPLETED_OK;
+}
+
+/*******************************************************************************
+ * Function:	wait_for_reshape_imsm
+ * Description:	Function writes new sync_max value and waits until
+ *		reshape process reach new position
+ * Parameters:
+ *	sra	: general array info (reshape_progress is the target)
+ *	ndata	: number of disks in new array's layout
+ * Returns:
+ *	 0 : success,
+ *	 1 : sync_completed cannot be opened or read
+ *	     (there is no reshape in progress),
+ *	-1 : fail
+ ******************************************************************************/
+int wait_for_reshape_imsm(struct mdinfo *sra, int ndata)
+{
+	int fd = sysfs_get_fd(sra, NULL, "sync_completed");
+	int attempts_left;
+	unsigned long long completed;
+	/* to_complete : new sync_max position */
+	unsigned long long to_complete = sra->reshape_progress;
+	unsigned long long position_to_set = to_complete / ndata;
+
+	if (!is_fd_valid(fd)) {
+		dprintf("cannot open reshape_position\n");
+		return 1;
+	}
+
+	/* read the current position; up to three retries, 30ms apart */
+	for (attempts_left = 3; ; attempts_left--) {
+		if (sysfs_fd_get_ll(fd, &completed) >= 0)
+			break;
+		if (attempts_left == 0) {
+			dprintf("cannot read reshape_position (no reshape in progres)\n");
+			close(fd);
+			return 1;
+		}
+		usleep(30000);
+	}
+
+	if (completed > position_to_set) {
+		dprintf("wrong next position to set %llu (%llu)\n",
+			to_complete, position_to_set);
+		close(fd);
+		return -1;
+	}
+	dprintf("Position set: %llu\n", position_to_set);
+	if (sysfs_set_num(sra, NULL, "sync_max",
+			  position_to_set) != 0) {
+		dprintf("cannot set reshape position to %llu\n",
+			position_to_set);
+		close(fd);
+		return -1;
+	}
+
+	/* wait until the reported position reaches the requested one */
+	for (;;) {
+		char action[20];
+		int timeout = 3000;
+		int rc;
+
+		sysfs_wait(fd, &timeout);
+		if (sysfs_get_str(sra, NULL, "sync_action",
+				  action, 20) > 0 &&
+		    strncmp(action, "reshape", 7) != 0) {
+			/* "idle" ends the wait; anything else is a failure */
+			if (strncmp(action, "idle", 4) != 0) {
+				close(fd);
+				return -1;
+			}
+			break;
+		}
+
+		rc = read_completed(fd, &completed);
+		if (rc < 0) {
+			dprintf("cannot read reshape_position (in loop)\n");
+			close(fd);
+			return 1;
+		}
+		if (rc == COMPLETED_NONE)
+			break;
+		if (completed >= position_to_set)
+			break;
+	}
+
+	close(fd);
+	return 0;
+}
+
+/*******************************************************************************
+ * Function:	check_degradation_change
+ * Description:	Check that array hasn't become failed.
+ *		The "degraded" sysfs attribute is read; when it cannot be
+ *		read or differs from the previous value, every in-sync
+ *		member is re-checked, members found dead are marked faulty
+ *		and their source descriptor is closed.
+ * Parameters:
+ *	info	: for sysfs access
+ *	sources	: source disks descriptors (indexed by raid_disk)
+ *	degraded: previous degradation level
+ * Returns:
+ *	degradation level: the value read from sysfs when it is unchanged,
+ *	otherwise the number of members found dead during the re-check
+ ******************************************************************************/
+int check_degradation_change(struct mdinfo *info,
+			     int *sources,
+			     int degraded)
+{
+	/* initialised so that a failed read never exposes garbage */
+	unsigned long long new_degraded = 0;
+	int rv;
+
+	rv = sysfs_get_ll(info, NULL, "degraded", &new_degraded);
+	/* any negative result means the attribute could not be obtained */
+	if (rv < 0 || (new_degraded != (unsigned long long)degraded)) {
+		/* check each device to ensure it is still working */
+		struct mdinfo *sd;
+		new_degraded = 0;
+		for (sd = info->devs ; sd ; sd = sd->next) {
+			/* already known as failed: not counted again */
+			if (sd->disk.state & (1<<MD_DISK_FAULTY))
+				continue;
+			if (sd->disk.state & (1<<MD_DISK_SYNC)) {
+				char sbuf[100];
+				int raid_disk = sd->disk.raid_disk;
+
+				if (sysfs_get_str(info,
+					sd, "state", sbuf, sizeof(sbuf)) < 0 ||
+				    strstr(sbuf, "faulty") ||
+				    strstr(sbuf, "in_sync") == NULL) {
+					/* this device is dead */
+					sd->disk.state = (1<<MD_DISK_FAULTY);
+					if (raid_disk >= 0)
+						close_fd(&sources[raid_disk]);
+					new_degraded++;
+				}
+			}
+		}
+	}
+
+	return new_degraded;
+}
+
+/*******************************************************************************
+ * Function: imsm_manage_reshape
+ * Description: Function finds array under reshape and it manages reshape
+ * process. It creates stripes backups (if required) and sets
+ * checkpoints.
+ * For each migration unit the degradation level is re-checked,
+ * the unit is backed up when the gap between the old-layout and
+ * new-layout positions is within dest_depth_per_unit, the
+ * suspend_lo/suspend_hi window is moved and
+ * wait_for_reshape_imsm() is called before the next checkpoint.
+ * Every exit taken after the initial 'sra' check resets the
+ * suspend window.
+ * Parameters:
+ * afd : Backup handle (native) - not used
+ * sra : general array info
+ * reshape : reshape parameters - not used
+ * st : supertype structure
+ * backup_blocks : size of critical section [blocks] - not referenced here
+ * fds : table of source device descriptor
+ * offsets : start of array (offset per devices)
+ * dests : not used
+ * destfd : table of destination device descriptor - not referenced here
+ * destoffsets : table of destination offsets (per device) - not
+ * referenced here
+ * Returns:
+ * 1 : success, reshape is done
+ * 0 : fail (also returned when 'sra' is NULL)
+ ******************************************************************************/
+static int imsm_manage_reshape(
+ int afd, struct mdinfo *sra, struct reshape *reshape,
+ struct supertype *st, unsigned long backup_blocks,
+ int *fds, unsigned long long *offsets,
+ int dests, int *destfd, unsigned long long *destoffsets)
+{
+ int ret_val = 0;
+ struct intel_super *super = st->sb;
+ struct intel_dev *dv;
+ unsigned int sector_size = super->sector_size;
+ struct imsm_dev *dev = NULL;
+ struct imsm_map *map_src, *map_dest;
+ int migr_vol_qan = 0;
+ int ndata, odata; /* data members of map_dest and of map_src */
+ int chunk; /* [bytes] */
+ struct migr_record *migr_rec;
+ char *buf = NULL;
+ unsigned int buf_size; /* [bytes] */
+ unsigned long long max_position; /* component_size * ndata */
+ unsigned long long next_step; /* [blocks]/[bytes] */
+ unsigned long long old_data_stripe_length;
+ unsigned long long start_src; /* [bytes] */
+ unsigned long long start; /* [bytes] */
+ unsigned long long start_buf_shift; /* [bytes] */
+ int degraded = 0;
+ int source_layout = 0;
+ int subarray_index = -1;
+
+ if (!sra)
+ return ret_val;
+
+ if (!fds || !offsets)
+ goto abort;
+
+ /* Find volume during the reshape */
+ for (dv = super->devlist; dv; dv = dv->next) {
+ if (dv->dev->vol.migr_type == MIGR_GEN_MIGR &&
+ dv->dev->vol.migr_state == 1) {
+ dev = dv->dev;
+ migr_vol_qan++;
+ subarray_index = dv->index;
+ }
+ }
+ /* Only one volume can migrate at the same time */
+ if (migr_vol_qan != 1) {
+ pr_err("%s", migr_vol_qan ?
+ "Number of migrating volumes greater than 1\n" :
+ "There is no volume during migrationg\n");
+ goto abort;
+ }
+
+ map_dest = get_imsm_map(dev, MAP_0);
+ map_src = get_imsm_map(dev, MAP_1);
+ if (map_src == NULL)
+ goto abort;
+
+ ndata = imsm_num_data_members(map_dest);
+ odata = imsm_num_data_members(map_src);
+
+ chunk = __le16_to_cpu(map_src->blocks_per_strip) * 512;
+ old_data_stripe_length = odata * chunk;
+
+ migr_rec = super->migr_rec;
+
+ /* initialize migration record for start condition */
+ if (sra->reshape_progress == 0)
+ init_migr_record_imsm(st, dev, sra);
+ else {
+ if (__le32_to_cpu(migr_rec->rec_status) != UNIT_SRC_NORMAL) {
+ dprintf("imsm: cannot restart migration when data are present in copy area.\n");
+ goto abort;
+ }
+ /* Save checkpoint to update migration record for current
+ * reshape position (in md). It can be farther than current
+ * reshape position in metadata.
+ */
+ if (save_checkpoint_imsm(st, sra, UNIT_SRC_NORMAL) == 1) {
+ /* ignore error == 2, this can mean end of reshape here
+ */
+ dprintf("imsm: Cannot write checkpoint to migration record (UNIT_SRC_NORMAL, initial save)\n");
+ goto abort;
+ }
+ }
+
+ /* size for data */
+ buf_size = __le32_to_cpu(migr_rec->blocks_per_unit) * 512;
+ /* extend buffer size for parity disk */
+ buf_size += __le32_to_cpu(migr_rec->dest_depth_per_unit) * 512;
+ /* add space for stripe alignment */
+ buf_size += old_data_stripe_length;
+ if (posix_memalign((void **)&buf, MAX_SECTOR_SIZE, buf_size)) {
+ dprintf("imsm: Cannot allocate checkpoint buffer\n");
+ goto abort;
+ }
+
+ max_position = sra->component_size * ndata;
+ source_layout = imsm_level_to_layout(map_src->raid_level);
+
+ while (current_migr_unit(migr_rec) <
+ get_num_migr_units(migr_rec)) {
+ /* current reshape position [blocks] */
+ unsigned long long current_position =
+ __le32_to_cpu(migr_rec->blocks_per_unit)
+ * current_migr_unit(migr_rec);
+ unsigned long long border;
+
+ /* Check that array hasn't become failed.
+ * More than one failed member aborts the reshape.
+ */
+ degraded = check_degradation_change(sra, fds, degraded);
+ if (degraded > 1) {
+ dprintf("imsm: Abort reshape due to degradation level (%i)\n", degraded);
+ goto abort;
+ }
+
+ next_step = __le32_to_cpu(migr_rec->blocks_per_unit);
+
+ if ((current_position + next_step) > max_position)
+ next_step = max_position - current_position;
+
+ start = current_position * 512;
+
+ /* align reading start to old geometry */
+ start_buf_shift = start % old_data_stripe_length;
+ start_src = start - start_buf_shift;
+
+ border = (start_src / odata) - (start / ndata);
+ border /= 512;
+ if (border <= __le32_to_cpu(migr_rec->dest_depth_per_unit)) {
+ /* save critical stripes to buf
+ * start - start address of current unit
+ * to backup [bytes]
+ * start_src - start address of current unit
+ * to backup aligned to source array
+ * [bytes]
+ */
+ unsigned long long next_step_filler;
+ unsigned long long copy_length = next_step * 512;
+
+ /* align copy area length to stripe in old geometry */
+ next_step_filler = ((copy_length + start_buf_shift)
+ % old_data_stripe_length);
+ if (next_step_filler)
+ next_step_filler = (old_data_stripe_length
+ - next_step_filler);
+ dprintf("save_stripes() parameters: start = %llu,\tstart_src = %llu,\tnext_step*512 = %llu,\tstart_in_buf_shift = %llu,\tnext_step_filler = %llu\n",
+ start, start_src, copy_length,
+ start_buf_shift, next_step_filler);
+
+ if (save_stripes(fds, offsets, map_src->num_members,
+ chunk, map_src->raid_level,
+ source_layout, 0, NULL, start_src,
+ copy_length +
+ next_step_filler + start_buf_shift,
+ buf)) {
+ dprintf("imsm: Cannot save stripes to buffer\n");
+ goto abort;
+ }
+ /* Convert data to destination format and store it
+ * in backup general migration area
+ */
+ if (save_backup_imsm(st, dev, sra,
+ buf + start_buf_shift, copy_length)) {
+ dprintf("imsm: Cannot save stripes to target devices\n");
+ goto abort;
+ }
+ if (save_checkpoint_imsm(st, sra,
+ UNIT_SRC_IN_CP_AREA)) {
+ dprintf("imsm: Cannot write checkpoint to migration record (UNIT_SRC_IN_CP_AREA)\n");
+ goto abort;
+ }
+ } else {
+ /* set next step to use whole border area */
+ border /= next_step;
+ if (border > 1)
+ next_step *= border;
+ }
+ /* When data backed up, checkpoint stored,
+ * kick the kernel to reshape unit of data
+ */
+ next_step = next_step + sra->reshape_progress;
+ /* limit next step to array max position */
+ if (next_step > max_position)
+ next_step = max_position;
+ sysfs_set_num(sra, NULL, "suspend_lo", sra->reshape_progress);
+ sysfs_set_num(sra, NULL, "suspend_hi", next_step);
+ sra->reshape_progress = next_step;
+
+ /* wait until reshape finish */
+ if (wait_for_reshape_imsm(sra, ndata)) {
+ dprintf("wait_for_reshape_imsm returned error!\n");
+ goto abort;
+ }
+ if (sigterm)
+ goto abort;
+
+ if (save_checkpoint_imsm(st, sra, UNIT_SRC_NORMAL) == 1) {
+ /* ignore error == 2, this can mean end of reshape here
+ */
+ dprintf("imsm: Cannot write checkpoint to migration record (UNIT_SRC_NORMAL)\n");
+ goto abort;
+ }
+
+ }
+
+ /* clear migr_rec on disks after successful migration */
+ struct dl *d;
+
+ memset(super->migr_rec_buf, 0, MIGR_REC_BUF_SECTORS*MAX_SECTOR_SIZE);
+ for (d = super->disks; d; d = d->next) {
+ if (d->index < 0 || is_failed(&d->disk))
+ continue;
+ unsigned long long dsize;
+
+ get_dev_size(d->fd, NULL, &dsize); /* NOTE(review): result is not checked before dsize is used */
+ if (lseek64(d->fd, dsize - MIGR_REC_SECTOR_POSITION*sector_size,
+ SEEK_SET) >= 0) {
+ if ((unsigned int)write(d->fd, super->migr_rec_buf,
+ MIGR_REC_BUF_SECTORS*sector_size) !=
+ MIGR_REC_BUF_SECTORS*sector_size)
+ perror("Write migr_rec failed");
+ }
+ }
+
+ /* return '1' if done */
+ ret_val = 1;
+
+ /* After the reshape eliminate size mismatch in metadata.
+ * Don't update md/component_size here, volume hasn't
+ * to take whole space. It is allowed by kernel.
+ * md/component_size will be set properly after next assembly.
+ */
+ imsm_fix_size_mismatch(st, subarray_index);
+
+abort:
+ free(buf);
+ /* See Grow.c: abort_reshape() for further explanation.
+ * The success path falls through to here as well.
+ */
+ sysfs_set_num(sra, NULL, "suspend_lo", 0x7FFFFFFFFFFFFFFFULL);
+ sysfs_set_num(sra, NULL, "suspend_hi", 0);
+ sysfs_set_num(sra, NULL, "suspend_lo", 0);
+
+ return ret_val;
+}
+
+/*******************************************************************************
+ * Function:	calculate_bitmap_min_chunksize
+ * Description:	Calculates the minimal valid bitmap chunk size.
+ *		Starting from 4096, the chunk size is doubled until the number
+ *		of bits needed to cover the data area fits in max_bits.
+ * Parameters:
+ *	max_bits	: indicate how many bits can be used for the bitmap;
+ *			  0 is handled as 1, because the bit count never drops
+ *			  below 1 and the search would otherwise not terminate
+ *	data_area_size	: the size of the data area covered by the bitmap
+ *
+ * Returns:
+ *	 The bitmap chunk size
+ ******************************************************************************/
+static unsigned long long
+calculate_bitmap_min_chunksize(unsigned long long max_bits,
+			       unsigned long long data_area_size)
+{
+	unsigned long long min_chunk =
+		4096; /* sub-page chunks don't work yet.. */
+	unsigned long long bits = data_area_size / min_chunk + 1;
+
+	/* (bits + 1) / 2 settles at 1, so a limit of 0 could never be met */
+	if (max_bits == 0)
+		max_bits = 1;
+
+	while (bits > max_bits) {
+		min_chunk *= 2;
+		bits = (bits + 1) / 2;
+	}
+	return min_chunk;
+}
+
+/*******************************************************************************
+ * Function:	calculate_bitmap_chunksize
+ * Description:	Picks the bitmap chunk size for a device: the default chunk
+ *		size, raised to the minimal valid chunk size when the default
+ *		is smaller than that.
+ * Parameters:
+ *	st	: supertype information
+ *	dev	: device for the bitmap
+ *
+ * Returns:
+ *	 The bitmap chunk size
+ ******************************************************************************/
+static unsigned long long calculate_bitmap_chunksize(struct supertype *st,
+						     struct imsm_dev *dev)
+{
+	struct intel_super *super = st->sb;
+	size_t dev_size = imsm_dev_size(dev);
+	unsigned long long lower_bound;
+
+	lower_bound = calculate_bitmap_min_chunksize(
+		IMSM_BITMAP_AREA_SIZE * super->sector_size, dev_size);
+
+	/* never go below the minimal chunk size computed for this device */
+	return lower_bound > IMSM_DEFAULT_BITMAP_CHUNKSIZE ?
+	       lower_bound : IMSM_DEFAULT_BITMAP_CHUNKSIZE;
+}
+
+/*******************************************************************************
+ * Function:	init_bitmap_header
+ * Description:	Initialize the bitmap header structure.
+ *		Every numeric field, including the chunk size, is stored through
+ *		the __cpu_to_le* helpers.
+ * Parameters:
+ *	st	: supertype information
+ *	bms	: bitmap header struct to initialize
+ *	dev	: device for the bitmap
+ *
+ * Returns:
+ *	 0 : success
+ *	-1 : fail ('bms' or 'dev' is NULL)
+ ******************************************************************************/
+static int init_bitmap_header(struct supertype *st, struct bitmap_super_s *bms,
+			      struct imsm_dev *dev)
+{
+	int vol_uuid[4];
+
+	if (!bms || !dev)
+		return -1;
+
+	bms->magic = __cpu_to_le32(BITMAP_MAGIC);
+	bms->version = __cpu_to_le32(BITMAP_MAJOR_HI);
+	bms->daemon_sleep = __cpu_to_le32(IMSM_DEFAULT_BITMAP_DAEMON_SLEEP);
+	bms->sync_size = __cpu_to_le64(IMSM_BITMAP_AREA_SIZE);
+	bms->write_behind = __cpu_to_le32(0);
+
+	uuid_from_super_imsm(st, vol_uuid);
+	memcpy(bms->uuid, vol_uuid, 16);
+
+	/* converted like the other fields so the header is the same on any host */
+	bms->chunksize = __cpu_to_le32(calculate_bitmap_chunksize(st, dev));
+
+	return 0;
+}
+
+/*******************************************************************************
+ * Function:	validate_internal_bitmap_for_drive
+ * Description:	Reads the bitmap header of one drive and checks its magic,
+ *		version and uuid against the values expected for this array.
+ * Parameters:
+ *	st	: supertype information
+ *	offset	: The offset from the beginning of the drive where to look for
+ *		  the bitmap header (multiplied by the sector size before
+ *		  seeking).
+ *	d	: the drive info
+ *
+ * Returns:
+ *	 0 : success
+ *	-1 : fail
+ ******************************************************************************/
+static int validate_internal_bitmap_for_drive(struct supertype *st,
+					      unsigned long long offset,
+					      struct dl *d)
+{
+	struct intel_super *super = st->sb;
+	bitmap_super_t *header;
+	void *raw = NULL;
+	int vol_uuid[4];
+	int header_ok;
+	int fd;
+	int ret = -1;
+
+	if (!d)
+		return -1;
+
+	if (posix_memalign(&raw, MAX_SECTOR_SIZE, IMSM_BITMAP_HEADER_SIZE))
+		return -1;
+
+	/* use the descriptor held for the drive; open one only if it has none */
+	fd = d->fd;
+	if (!is_fd_valid(fd)) {
+		fd = open(d->devname, O_RDONLY, 0);
+		if (!is_fd_valid(fd)) {
+			dprintf("cannot open the device %s\n", d->devname);
+			goto out;
+		}
+	}
+
+	if (lseek64(fd, offset * super->sector_size, SEEK_SET) < 0)
+		goto out;
+	if (read(fd, raw, IMSM_BITMAP_HEADER_SIZE) != IMSM_BITMAP_HEADER_SIZE)
+		goto out;
+
+	uuid_from_super_imsm(st, vol_uuid);
+
+	header = raw;
+	header_ok = header->magic == __cpu_to_le32(BITMAP_MAGIC) &&
+		    header->version == __cpu_to_le32(BITMAP_MAJOR_HI) &&
+		    same_uuid((int *)header->uuid, vol_uuid, st->ss->swapuuid);
+	if (!header_ok) {
+		dprintf("wrong bitmap header detected\n");
+		goto out;
+	}
+
+	ret = 0;
+out:
+	/* the drive's own descriptor is left open */
+	if (!is_fd_valid(d->fd))
+		close_fd(&fd);
+
+	free(raw);
+	return ret;
+}
+
+/*******************************************************************************
+ * Function:	validate_internal_bitmap_imsm
+ * Description:	Checks the bitmap header of the current volume on every disk
+ *		that has a non-negative index and is not failed.
+ * Parameters:
+ *	st	: supertype information
+ *
+ * Returns:
+ *	 0 : success or device w/o RWH_BITMAP
+ *	-1 : fail
+ ******************************************************************************/
+static int validate_internal_bitmap_imsm(struct supertype *st)
+{
+	struct intel_super *super = st->sb;
+	struct imsm_dev *dev = get_imsm_dev(super, super->current_vol);
+	unsigned long long header_sector;
+	struct dl *disk;
+
+	if (!dev)
+		return -1;
+
+	/* nothing to validate unless the volume uses the bitmap policy */
+	if (dev->rwh_policy != RWH_BITMAP)
+		return 0;
+
+	header_sector = get_bitmap_header_sector(super, super->current_vol);
+	for (disk = super->disks; disk; disk = disk->next) {
+		if (disk->index >= 0 && !is_failed(&disk->disk) &&
+		    validate_internal_bitmap_for_drive(st, header_sector, disk)) {
+			pr_err("imsm: bitmap validation failed\n");
+			return -1;
+		}
+	}
+	return 0;
+}
+
+/*******************************************************************************
+ * Function:	add_internal_bitmap_imsm
+ * Description:	Switches the current volume to the RWH_BITMAP policy and
+ *		reports the chunk size calculated for it.
+ * Parameters:
+ *	st		: supertype information
+ *	chunkp		: receives the bitmap chunk size
+ *	delay		: not used for imsm
+ *	write_behind	: not used for imsm
+ *	size		: not used for imsm
+ *	may_change	: not used for imsm
+ *	amajor		: not used for imsm
+ *
+ * Returns:
+ *	 0 : success
+ *	-1 : fail
+ ******************************************************************************/
+static int add_internal_bitmap_imsm(struct supertype *st, int *chunkp,
+				    int delay, int write_behind,
+				    unsigned long long size, int may_change,
+				    int amajor)
+{
+	struct intel_super *super = st->sb;
+	int volume = super->current_vol;
+	struct imsm_dev *target;
+
+	if (!super->devlist)
+		return -1;
+	if (volume == -1)
+		return -1;
+	if (!chunkp)
+		return -1;
+
+	target = get_imsm_dev(super, volume);
+	if (!target) {
+		dprintf("cannot find the device for volume index %d\n",
+			volume);
+		return -1;
+	}
+
+	target->rwh_policy = RWH_BITMAP;
+	*chunkp = calculate_bitmap_chunksize(st, target);
+
+	return 0;
+}
+
+/*******************************************************************************
+ * Function:	locate_bitmap_imsm
+ * Description:	Seek 'fd' to start of write-intent-bitmap.
+ *		The target is the bitmap header sector of the current volume,
+ *		shifted left by 9 to form the byte offset.
+ * Parameters:
+ *	st		: supertype information
+ *	fd		: file descriptor for the device
+ *	node_num	: not used for imsm
+ *
+ * Returns:
+ *	 0 : success
+ *	-1 : fail (no device list, no current volume, or the seek failed)
+ ******************************************************************************/
+static int locate_bitmap_imsm(struct supertype *st, int fd, int node_num)
+{
+	struct intel_super *super = st->sb;
+	unsigned long long offset;
+	int vol_idx = super->current_vol;
+
+	if (!super->devlist || vol_idx == -1)
+		return -1;
+
+	offset = get_bitmap_header_sector(super, super->current_vol);
+	dprintf("bitmap header offset is %llu\n", offset);
+
+	/* a failed seek must be reported, or the caller would use the wrong place */
+	if (lseek64(fd, offset << 9, SEEK_SET) < 0) {
+		dprintf("cannot seek to the bitmap header\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*******************************************************************************
+ * Function:	write_init_bitmap_imsm
+ * Description:	Write a bitmap header and prepares the area for the bitmap.
+ *		The header space is zeroed, the bitmap area is filled with 0xFF
+ *		bytes, and the initialized header is then written at the
+ *		position found by locate_bitmap_imsm().
+ * Parameters:
+ *	st	: supertype information
+ *	fd	: file descriptor for the device
+ *	update	: not used for imsm
+ *
+ * Returns:
+ *	 0 : success
+ *	-1 : fail
+ ******************************************************************************/
+static int write_init_bitmap_imsm(struct supertype *st, int fd,
+				  enum bitmap_update update)
+{
+	struct intel_super *super = st->sb;
+	int vol_idx = super->current_vol;
+	int ret = -1;
+	unsigned long long offset;
+	unsigned long long bitmap_area_start;
+	bitmap_super_t bms = { 0 };
+	size_t written = 0;
+	size_t to_write;
+	ssize_t rv_num;
+	struct imsm_dev *dev;
+	void *buf;
+
+	if (!super->devlist || !super->sector_size || vol_idx == -1)
+		return -1;
+
+	dev = get_imsm_dev(super, vol_idx);
+	if (!dev) {
+		dprintf("cannot find the device for volume index %d\n",
+			vol_idx);
+		return -1;
+	}
+
+	/* first clear the space for bitmap header */
+	bitmap_area_start = get_bitmap_header_sector(super, vol_idx);
+
+	dprintf("zeroing area start (%llu) and size (%u)\n", bitmap_area_start,
+		IMSM_BITMAP_HEADER_SIZE / super->sector_size);
+	if (zero_disk_range(fd, bitmap_area_start,
+			    IMSM_BITMAP_HEADER_SIZE / super->sector_size)) {
+		pr_err("imsm: cannot zeroing the space for the bitmap\n");
+		return -1;
+	}
+
+	/* The bitmap area should be filled with "1"s to perform initial
+	 * synchronization.
+	 */
+	if (posix_memalign(&buf, MAX_SECTOR_SIZE, MAX_SECTOR_SIZE))
+		return -1;
+	memset(buf, 0xFF, MAX_SECTOR_SIZE);
+	offset = get_bitmap_sector(super, vol_idx);
+	if (lseek64(fd, offset << 9, SEEK_SET) < 0) {
+		dprintf("cannot seek to the bitmap area\n");
+		goto abort;
+	}
+	while (written < IMSM_BITMAP_AREA_SIZE) {
+		to_write = IMSM_BITMAP_AREA_SIZE - written;
+		if (to_write > MAX_SECTOR_SIZE)
+			to_write = MAX_SECTOR_SIZE;
+		/* write only what is left, so the area is never overrun */
+		rv_num = write(fd, buf, to_write);
+		if (rv_num < 0 || (size_t)rv_num != to_write) {
+			dprintf("cannot initialize bitmap area\n");
+			goto abort;
+		}
+		written += rv_num;
+	}
+
+	/* write a bitmap header */
+	if (init_bitmap_header(st, &bms, dev)) {
+		dprintf("cannot initialize the bitmap header\n");
+		goto abort;
+	}
+	memset(buf, 0, MAX_SECTOR_SIZE);
+	memcpy(buf, &bms, sizeof(bitmap_super_t));
+	if (locate_bitmap_imsm(st, fd, 0)) {
+		dprintf("cannot locate the bitmap\n");
+		goto abort;
+	}
+	if (write(fd, buf, MAX_SECTOR_SIZE) != MAX_SECTOR_SIZE) {
+		dprintf("cannot write the bitmap header\n");
+		goto abort;
+	}
+	fsync(fd);
+	ret = 0;
+
+abort:
+	free(buf);
+
+	return ret;
+}
+
+/*******************************************************************************
+ * Function:	is_vol_to_setup_bitmap
+ * Description:	Checks if a bitmap should be activated on the dev, i.e. the
+ *		volume name matches info->name and its policy is RWH_BITMAP.
+ *		The result is meant to be used as a truth value.
+ * Parameters:
+ *	info	: info about the volume to setup the bitmap
+ *	dev	: the device to check against bitmap creation
+ *
+ * Returns:
+ *	-1 (non-zero)	: bitmap should be set up on the device
+ *	 0		: otherwise, including when 'info' or 'dev' is NULL
+ ******************************************************************************/
+static int is_vol_to_setup_bitmap(struct mdinfo *info, struct imsm_dev *dev)
+{
+	/* without both arguments there is nothing to set a bitmap up on */
+	if (!dev || !info)
+		return 0;
+
+	if ((strcmp((char *)dev->volume, info->name) == 0) &&
+	    (dev->rwh_policy == RWH_BITMAP))
+		return -1;
+
+	return 0;
+}
+
+/*******************************************************************************
+ * Function:	set_bitmap_sysfs
+ * Description:	Set the sysfs attributes of a given volume to activate the
+ *		bitmap. Numbers are handed to sysfs_set_num() as plain host
+ *		values, as in the other sysfs_set_num() calls in this file.
+ * Parameters:
+ *	info		: info about the volume where the bitmap should be setup
+ *	chunksize	: bitmap chunk size
+ *	location	: location of the bitmap
+ *
+ * Returns:
+ *	 0 : success
+ *	-1 : fail
+ ******************************************************************************/
+static int set_bitmap_sysfs(struct mdinfo *info, unsigned long long chunksize,
+			    char *location)
+{
+	/* The bitmap/metadata is set to external to allow changing of value for
+	 * bitmap/location. When external is used, the kernel will treat an offset
+	 * related to the device's first lba (in opposition to the "internal" case
+	 * when this value is related to the beginning of the superblock).
+	 */
+	if (sysfs_set_str(info, NULL, "bitmap/metadata", "external")) {
+		dprintf("failed to set bitmap/metadata\n");
+		return -1;
+	}
+
+	/* It can only be changed when no bitmap is active.
+	 * Should be bigger than 512 and must be power of 2.
+	 * It is expecting the value in bytes.
+	 */
+	if (sysfs_set_num(info, NULL, "bitmap/chunksize", chunksize)) {
+		dprintf("failed to set bitmap/chunksize\n");
+		return -1;
+	}
+
+	/* It is expecting the value in sectors. */
+	if (sysfs_set_num(info, NULL, "bitmap/space",
+			  IMSM_BITMAP_AREA_SIZE)) {
+		dprintf("failed to set bitmap/space\n");
+		return -1;
+	}
+
+	/* Determines the delay between the bitmap updates.
+	 * It is expecting the value in seconds.
+	 */
+	if (sysfs_set_num(info, NULL, "bitmap/time_base",
+			  IMSM_DEFAULT_BITMAP_DAEMON_SLEEP)) {
+		dprintf("failed to set bitmap/time_base\n");
+		return -1;
+	}
+
+	/* It is expecting the value in sectors with a sign at the beginning. */
+	if (sysfs_set_str(info, NULL, "bitmap/location", location)) {
+		dprintf("failed to set bitmap/location\n");
+		return -1;
+	}
+
+	return 0;
+}
+
+/*******************************************************************************
+ * Function:	set_bitmap_imsm
+ * Description:	Walks the volumes of the container and, for each one selected
+ *		by is_vol_to_setup_bitmap(), validates the on-disk bitmap
+ *		header and programs the bitmap attributes through sysfs.
+ *		super->current_vol is changed while iterating and restored
+ *		before returning.
+ * Parameters:
+ *	st	: supertype information
+ *	info	: info about the volume where the bitmap should be setup
+ *
+ * Returns:
+ *	 0 : success
+ *	-1 : fail
+ ******************************************************************************/
+static int set_bitmap_imsm(struct supertype *st, struct mdinfo *info)
+{
+	struct intel_super *super = st->sb;
+	int saved_vol = super->current_vol;
+	struct intel_dev *entry;
+	struct imsm_dev *dev;
+	unsigned long long chunksize;
+	char location[16] = "";
+	int ret = -1;
+
+	for (entry = super->devlist; entry; entry = entry->next) {
+		super->current_vol = entry->index;
+		dev = get_imsm_dev(super, super->current_vol);
+
+		if (!is_vol_to_setup_bitmap(info, dev))
+			continue;
+
+		if (validate_internal_bitmap_imsm(st)) {
+			dprintf("bitmap header validation failed\n");
+			goto out;
+		}
+
+		chunksize = calculate_bitmap_chunksize(st, dev);
+		dprintf("chunk size is %llu\n", chunksize);
+
+		snprintf(location, sizeof(location), "+%llu",
+			 get_bitmap_sector(super, super->current_vol));
+		dprintf("bitmap offset is %s\n", location);
+
+		if (set_bitmap_sysfs(info, chunksize, location)) {
+			dprintf("cannot setup the bitmap\n");
+			goto out;
+		}
+	}
+	ret = 0;
+out:
+	super->current_vol = saved_vol;
+	return ret;
+}
+
+/* Handler table for the "imsm" metadata format. */
+struct superswitch super_imsm = {
+	.name				= "imsm",
+	.external			= 1,
+
+	/* examine / detail output */
+	.examine_super			= examine_super_imsm,
+	.brief_examine_super		= brief_examine_super_imsm,
+	.brief_examine_subarrays	= brief_examine_subarrays_imsm,
+	.export_examine_super		= export_examine_super_imsm,
+	.detail_super			= detail_super_imsm,
+	.brief_detail_super		= brief_detail_super_imsm,
+	.detail_platform		= detail_platform_imsm,
+	.export_detail_platform		= export_detail_platform_imsm,
+	.examine_badblocks		= examine_badblocks_imsm,
+
+	/* creation and member management */
+	.write_init_super		= write_init_super_imsm,
+	.validate_geometry		= validate_geometry_imsm,
+	.add_to_super			= add_to_super_imsm,
+	.remove_from_super		= remove_from_super_imsm,
+	.kill_subarray			= kill_subarray_imsm,
+	.update_subarray		= update_subarray_imsm,
+	.load_container			= load_container_imsm,
+	.default_geometry		= default_geometry_imsm,
+	.get_disk_controller_domain	= imsm_get_disk_controller_domain,
+
+	/* reshape */
+	.reshape_super			= imsm_reshape_super,
+	.manage_reshape			= imsm_manage_reshape,
+	.recover_backup			= recover_backup_imsm,
+
+	/* superblock access */
+	.match_home			= match_home_imsm,
+	.uuid_from_super		= uuid_from_super_imsm,
+	.getinfo_super			= getinfo_super_imsm,
+	.getinfo_super_disks		= getinfo_super_disks_imsm,
+	.update_super			= update_super_imsm,
+	.avail_size			= avail_size_imsm,
+	.get_spare_criteria		= get_spare_criteria_imsm,
+	.compare_super			= compare_super_imsm,
+	.load_super			= load_super_imsm,
+	.init_super			= init_super_imsm,
+	.store_super			= store_super_imsm,
+	.free_super			= free_super_imsm,
+	.match_metadata_desc		= match_metadata_desc_imsm,
+	.container_content		= container_content_imsm,
+	.validate_container		= validate_container_imsm,
+
+	/* bitmap */
+	.add_internal_bitmap		= add_internal_bitmap_imsm,
+	.locate_bitmap			= locate_bitmap_imsm,
+	.write_bitmap			= write_init_bitmap_imsm,
+	.set_bitmap			= set_bitmap_imsm,
+
+	/* ppl */
+	.write_init_ppl			= write_init_ppl_imsm,
+	.validate_ppl			= validate_ppl_imsm,
+
+	/* for mdmon */
+	.open_new			= imsm_open_new,
+	.set_array_state		= imsm_set_array_state,
+	.set_disk			= imsm_set_disk,
+	.sync_metadata			= imsm_sync_metadata,
+	.activate_spare			= imsm_activate_spare,
+	.process_update			= imsm_process_update,
+	.prepare_update			= imsm_prepare_update,
+	.record_bad_block		= imsm_record_badblock,
+	.clear_bad_block		= imsm_clear_badblock,
+	.get_bad_blocks			= imsm_get_badblocks,
+};
diff --git a/super-mbr.c b/super-mbr.c
new file mode 100644
index 0000000..839f000
--- /dev/null
+++ b/super-mbr.c
@@ -0,0 +1,206 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2010 Neil Brown <neilb@suse.de>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email: <neil@brown.name>
+ *
+ */
+
+/*
+ * 'mbr' is a pseudo metadata type for devices which have a
+ * partition table in the Master Boot Record (mbr) also known
+ * as a dos partition table.
+ *
+ * Obviously arrays cannot be created or assembled for this type.
+ * It is used to allow a new bare device to have a partition table
+ * added so the member partitions can then be included in other
+ * arrays as relevant.
+ *
+ * The meaningful operations are:
+ * examine_super, but not brief_examine_super or export_examine
+ * load_super
+ * store_super
+ */
+
+#include "mdadm.h"
+#include "part.h"
+
+/* Release the MBR copy held by 'st' and clear the pointer to it. */
+static void free_mbr(struct supertype *st)
+{
+	void *held = st->sb;
+
+	st->sb = NULL;
+	free(held);
+}
+
+/* Print the MBR magic and one line for every partition entry whose
+ * blocks_num is non-zero.
+ */
+static void examine_mbr(struct supertype *st, char *homehost)
+{
+	struct MBR *sb = st->sb;
+	int part;
+
+	printf(" MBR Magic : %04x\n", sb->magic);
+
+	for (part = 0; part < MBR_PARTITIONS; part++) {
+		/*
+		 * Every access goes through sb instead of a pointer to the
+		 * partition table (or an entry), because the entries are not
+		 * properly aligned.
+		 */
+		if (!sb->parts[part].blocks_num)
+			continue;
+
+		printf("Partition[%d] : %12lu sectors at %12lu (type %02x)\n",
+		       part,
+		       (unsigned long)__le32_to_cpu(sb->parts[part].blocks_num),
+		       (unsigned long)__le32_to_cpu(sb->parts[part].first_sect_lba),
+		       sb->parts[part].part_type);
+	}
+}
+
+/* Try to read an mbr from the start of 'fd' into st->sb.
+ * Any superblock previously held by 'st' is freed first.
+ * Return
+ *  0 on success
+ *  1 when the buffer cannot be allocated, the record cannot be read,
+ *    or its magic is not MBR_SIGNATURE_MAGIC
+ */
+static int load_super_mbr(struct supertype *st, int fd, char *devname)
+{
+	struct MBR *record;
+
+	free_mbr(st);
+
+	if (posix_memalign((void**)&record, 512, 512) != 0) {
+		pr_err("could not allocate superblock\n");
+		return 1;
+	}
+
+	lseek(fd, 0, 0);
+	if (read(fd, record, sizeof(*record)) != sizeof(*record)) {
+		if (devname)
+			pr_err("Cannot read partition table on %s\n",
+			       devname);
+		goto reject;
+	}
+
+	if (record->magic != MBR_SIGNATURE_MAGIC) {
+		if (devname)
+			pr_err("No partition table found on %s\n",
+			       devname);
+		goto reject;
+	}
+
+	st->sb = record;
+
+	/* fill in the handler only when the caller has not chosen one yet */
+	if (st->ss == NULL) {
+		st->ss = &mbr;
+		st->minor_version = 0;
+		st->max_devs = 1;
+		st->info = NULL;
+	}
+	return 0;
+
+reject:
+	free(record);
+	return 1;
+}
+
+/* Write st->sb to the start of 'fd', keeping the 'pad' bytes that are
+ * currently on the device, then ask for the partition table to be re-read.
+ * Return
+ *  0 on success
+ *  1 when the buffer cannot be allocated or the current record cannot be read
+ *  4 when the write fails
+ */
+static int store_mbr(struct supertype *st, int fd)
+{
+	struct MBR *on_disk;
+	struct MBR *sb;
+
+	if (posix_memalign((void**)&on_disk, 512, 512) != 0) {
+		pr_err("could not allocate superblock\n");
+		return 1;
+	}
+
+	lseek(fd, 0, 0);
+	if (read(fd, on_disk, sizeof(*on_disk)) != sizeof(*on_disk)) {
+		free(on_disk);
+		return 1;
+	}
+
+	/* carry the existing pad bytes over into the record being stored */
+	sb = st->sb;
+	memcpy(sb->pad, on_disk->pad, sizeof(sb->pad));
+	free(on_disk);
+
+	lseek(fd, 0, 0);
+	if (write(fd, sb, sizeof(*sb)) != sizeof(*sb))
+		return 4;
+
+	fsync(fd);
+	ioctl(fd, BLKRRPART, 0);
+	return 0;
+}
+
+/* Fill 'info' for an MBR pseudo-superblock: array and disk details are
+ * zeroed, the text version and name are set to "mbr", and component_size is
+ * set to the highest end sector (first_sect_lba + blocks_num) of any
+ * partition entry with a non-zero blocks_num. 'map' is not used.
+ */
+static void getinfo_mbr(struct supertype *st, struct mdinfo *info, char *map)
+{
+	struct MBR *sb = st->sb;
+	int i;
+
+	memset(&info->array, 0, sizeof(info->array));
+	memset(&info->disk, 0, sizeof(info->disk));
+	strcpy(info->text_version, "mbr");
+	strcpy(info->name, "mbr");
+	info->component_size = 0;
+
+	for (i = 0; i < MBR_PARTITIONS ; i++)
+		/*
+		 * Have to make every access through sb rather than using a
+		 * pointer to the partition table (or an entry), since the
+		 * entries are not properly aligned.
+		 */
+		if (sb->parts[i].blocks_num) {
+			/* two 32-bit values are added: use a 64-bit type so the
+			 * sum cannot wrap where unsigned long is 32 bits wide
+			 */
+			unsigned long long last =
+				(unsigned long long)__le32_to_cpu(sb->parts[i].blocks_num)
+				+ (unsigned long long)__le32_to_cpu(sb->parts[i].first_sect_lba);
+			if (last > info->component_size)
+				info->component_size = last;
+		}
+
+}
+
+/* Return a newly allocated supertype bound to the mbr handlers when 'arg' is
+ * exactly "mbr", NULL for any other name.
+ */
+static struct supertype *match_metadata_desc(char *arg)
+{
+	struct supertype *fresh;
+
+	if (strcmp(arg, "mbr") == 0) {
+		fresh = xmalloc(sizeof(*fresh));
+		fresh->ss = &mbr;
+		fresh->info = NULL;
+		fresh->minor_version = 0;
+		fresh->max_devs = 1;
+		fresh->sb = NULL;
+		return fresh;
+	}
+
+	return NULL;
+}
+
+static int validate_geometry(struct supertype *st, int level,
+ int layout, int raiddisks,
+ int *chunk, unsigned long long size,
+ unsigned long long data_offset,
+ char *subdev, unsigned long long *freesize,
+ int consistency_policy, int verbose)
+{
+ pr_err("mbr metadata cannot be used this way\n");
+ return 0;
+}
+
+/* Handler table for the "mbr" pseudo metadata type. */
+struct superswitch mbr = {
+	.name			= "mbr",
+	.match_metadata_desc	= match_metadata_desc,
+	.validate_geometry	= validate_geometry,
+	.load_super		= load_super_mbr,
+	.store_super		= store_mbr,
+	.free_super		= free_mbr,
+	.examine_super		= examine_mbr,
+	.getinfo_super		= getinfo_mbr,
+};
diff --git a/super0.c b/super0.c
new file mode 100644
index 0000000..b79b97a
--- /dev/null
+++ b/super0.c
@@ -0,0 +1,1350 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email: <neilb@suse.de>
+ */
+
+#define HAVE_STDINT_H 1
+#include "mdadm.h"
+#include "sha1.h"
+/*
+ * All handling for the 0.90.0 version superblock is in
+ * this file.
+ * This includes:
+ * - finding, loading, and writing the superblock.
+ * - initialising a new superblock
+ * - printing the superblock for --examine
+ * - printing part of the superblock for --detail
+ * .. other stuff
+ */
+
+/*
+ * Checksum a 0.90 superblock as it would read with sb_csum set to zero.
+ * The stored sb_csum is put back before returning, so the superblock is
+ * unchanged once the call completes.
+ */
+static unsigned long calc_sb0_csum(mdp_super_t *super)
+{
+	const unsigned long saved = super->sb_csum;
+	unsigned long result;
+
+	super->sb_csum = 0;
+	result = calc_csum(super, MD_SB_BYTES);
+	super->sb_csum = saved;
+
+	return result;
+}
+
+/*
+ * Byte-swap a 0.90 superblock in place.
+ *
+ * super0 superblocks are host-endian and (almost) everything in them is a
+ * u32, so every 4-byte group in the first MD_SB_BYTES is reversed.  The
+ * hi/lo halves of the two event counters are then exchanged as well.
+ */
+static void super0_swap_endian(struct mdp_superblock_s *sb)
+{
+	char *bytes = (char *)sb;
+	__u32 hold;
+	int off;
+
+	for (off = 0; off < MD_SB_BYTES; off += 4) {
+		char *w = bytes + off;
+		char outer = w[0];
+		char inner = w[1];
+
+		w[0] = w[3];
+		w[3] = outer;
+		w[1] = w[2];
+		w[2] = inner;
+	}
+
+	hold = sb->events_hi;
+	sb->events_hi = sb->events_lo;
+	sb->events_lo = hold;
+
+	hold = sb->cp_events_hi;
+	sb->cp_events_hi = sb->cp_events_lo;
+	sb->cp_events_lo = hold;
+}
+
+static void examine_super0(struct supertype *st, char *homehost)
+{ /* Print a multi-line, human readable description of the 0.90
+   * superblock held in st->sb to stdout.  When 'homehost' is non-NULL the
+   * UUID line also says whether the array is local to that host.
+   * Side effect: the write-mostly and failfast bits are cleared from the
+   * in-memory state of every disk entry that gets printed.
+   */
+ mdp_super_t *sb = st->sb;
+ time_t atime;
+ int d;
+ int delta_extra = 0;
+ char *c;
+
+ printf(" Magic : %08x\n", sb->md_magic);
+ printf(" Version : %d.%02d.%02d\n",
+ sb->major_version, sb->minor_version, sb->patch_version);
+ if (sb->minor_version >= 90) {
+ printf(" UUID : %08x:%08x:%08x:%08x", sb->set_uuid0,
+ sb->set_uuid1, sb->set_uuid2, sb->set_uuid3);
+ if (homehost) {
+ char buf[20];
+ void *hash;
+
+ hash = sha1_buffer(homehost, strlen(homehost), buf);
+ if (memcmp(&sb->set_uuid2, hash, 8) == 0) /* uuid words 2-3 equal the first 8 bytes of the hash of homehost */
+ printf(" (local to host %s)", homehost);
+ }
+ printf("\n");
+ } else
+ printf(" UUID : %08x\n", sb->set_uuid0);
+
+ if (sb->not_persistent)
+ printf(" Eedk : not persistent\n"); /* NOTE(review): the "Eedk" label looks like a typo - confirm before changing the output */
+
+ atime = sb->ctime;
+ printf(" Creation Time : %.24s\n", ctime(&atime));
+ c = map_num(pers, sb->level);
+ printf(" Raid Level : %s\n", c?c:"-unknown-");
+ if ((int)sb->level > 0) {
+ int ddsks = 0, ddsks_denom = 1; /* array size is printed as size * ddsks / ddsks_denom */
+ printf(" Used Dev Size : %d%s\n", sb->size,
+ human_size((long long)sb->size<<10));
+ switch(sb->level) {
+ case 1:
+ ddsks=1;
+ break;
+ case 4:
+ case 5:
+ ddsks = sb->raid_disks - 1;
+ break;
+ case 6:
+ ddsks = sb->raid_disks - 2;
+ break;
+ case 10:
+ ddsks = sb->raid_disks;
+ ddsks_denom =
+ (sb->layout & 255) * ((sb->layout >> 8) & 255);
+ }
+ if (ddsks) {
+ long long asize = sb->size;
+ asize = (asize << 10) * ddsks / ddsks_denom;
+ printf(" Array Size : %llu%s\n",
+ asize >> 10, human_size(asize));
+ }
+ }
+ printf(" Raid Devices : %d\n", sb->raid_disks);
+ printf(" Total Devices : %d\n", sb->nr_disks);
+ printf("Preferred Minor : %d\n", sb->md_minor);
+ printf("\n");
+ if (sb->minor_version > 90 && (sb->reshape_position + 1) != 0) { /* reshape details are skipped when reshape_position + 1 wraps to 0 */
+ printf(" Reshape pos'n : %llu%s\n",
+ (unsigned long long)sb->reshape_position / 2,
+ human_size((long long)sb->reshape_position << 9));
+ if (sb->delta_disks) {
+ printf(" Delta Devices : %d", sb->delta_disks);
+ printf(" (%d->%d)\n", sb->raid_disks-sb->delta_disks,
+ sb->raid_disks);
+ if (((int)sb->delta_disks) < 0)
+ delta_extra = - sb->delta_disks; /* widens the device table loop below when the array is shrinking */
+ }
+ if (sb->new_level != sb->level) {
+ c = map_num(pers, sb->new_level);
+ printf(" New Level : %s\n", c?c:"-unknown-");
+ }
+ if (sb->new_layout != sb->layout) {
+ if (sb->level == 5) {
+ c = map_num(r5layout, sb->new_layout);
+ printf(" New Layout : %s\n",
+ c?c:"-unknown-");
+ }
+ if (sb->level == 6) {
+ c = map_num(r6layout, sb->new_layout);
+ printf(" New Layout : %s\n",
+ c?c:"-unknown-");
+ }
+ if (sb->level == 10) {
+ printf(" New Layout : near=%d, %s=%d\n",
+ sb->new_layout&255,
+ (sb->new_layout&0x10000)?"offset":"far",
+ (sb->new_layout>>8)&255);
+ }
+ }
+ if (sb->new_chunk != sb->chunk_size)
+ printf(" New Chunksize : %d\n", sb->new_chunk);
+ printf("\n");
+ }
+ atime = sb->utime;
+ printf(" Update Time : %.24s\n", ctime(&atime));
+ printf(" State : %s\n",
+ (sb->state&(1 << MD_SB_CLEAN)) ? "clean":"active");
+ if (sb->state & (1 << MD_SB_BITMAP_PRESENT))
+ printf("Internal Bitmap : present\n");
+ printf(" Active Devices : %d\n", sb->active_disks);
+ printf("Working Devices : %d\n", sb->working_disks);
+ printf(" Failed Devices : %d\n", sb->failed_disks);
+ printf(" Spare Devices : %d\n", sb->spare_disks);
+ if (calc_sb0_csum(sb) == sb->sb_csum)
+ printf(" Checksum : %x - correct\n", sb->sb_csum);
+ else
+ printf(" Checksum : %x - expected %lx\n",
+ sb->sb_csum, calc_sb0_csum(sb));
+ printf(" Events : %llu\n",
+ ((unsigned long long)sb->events_hi << 32) + sb->events_lo);
+ printf("\n");
+ if (sb->level == 5) {
+ c = map_num(r5layout, sb->layout);
+ printf(" Layout : %s\n", c?c:"-unknown-");
+ }
+ if (sb->level == 6) {
+ c = map_num(r6layout, sb->layout);
+ printf(" Layout : %s\n", c?c:"-unknown-");
+ }
+ if (sb->level == 10) {
+ printf(" Layout :");
+ print_r10_layout(sb->layout);
+ printf("\n");
+ }
+ switch(sb->level) {
+ case 0:
+ case 4:
+ case 5:
+ case 6:
+ case 10:
+ printf(" Chunk Size : %dK\n", sb->chunk_size / 1024);
+ break;
+ case -1:
+ printf(" Rounding : %dK\n", sb->chunk_size / 1024);
+ break;
+ default:
+ break;
+ }
+ printf("\n");
+ printf(" Number Major Minor RaidDevice State\n");
+ for (d = -1;
+ d < (signed int)(sb->raid_disks + delta_extra + sb->spare_disks);
+ d++) { /* d == -1 prints this_disk, d >= 0 prints disks[d].
+   * NOTE(review): the upper bound comes straight from superblock
+   * fields and is not checked against the size of disks[] here -
+   * confirm that callers validate it.
+   */
+ mdp_disk_t *dp;
+ char *dv;
+ char nb[11];
+ int wonly, failfast;
+ if (d>=0) dp = &sb->disks[d];
+ else dp = &sb->this_disk;
+ snprintf(nb, sizeof(nb), "%4d", d);
+ printf("%4s %5d %5d %5d %5d ", d < 0 ? "this" : nb,
+ dp->number, dp->major, dp->minor, dp->raid_disk);
+ wonly = dp->state & (1 << MD_DISK_WRITEMOSTLY);
+ failfast = dp->state & (1<<MD_DISK_FAILFAST);
+ dp->state &= ~(wonly | failfast); /* modifies the in-memory superblock so the "spare" test below ignores these two flags */
+ if (dp->state & (1 << MD_DISK_FAULTY))
+ printf(" faulty");
+ if (dp->state & (1 << MD_DISK_ACTIVE))
+ printf(" active");
+ if (dp->state & (1 << MD_DISK_SYNC))
+ printf(" sync");
+ if (dp->state & (1 << MD_DISK_REMOVED))
+ printf(" removed");
+ if (wonly)
+ printf(" write-mostly");
+ if (failfast)
+ printf(" failfast");
+ if (dp->state == 0)
+ printf(" spare");
+ if ((dv = map_dev(dp->major, dp->minor, 0)))
+ printf(" %s", dv);
+ printf("\n");
+ if (d == -1)
+ printf("\n");
+ }
+}
+
+/*
+ * Print a one-line "ARRAY ..." description of the superblock in st->sb.
+ * The level and device count are only included when 'verbose' is set;
+ * the UUID is always appended.
+ */
+static void brief_examine_super0(struct supertype *st, int verbose)
+{
+	mdp_super_t *sb = st->sb;
+	char devname[20];
+	char *level_name = map_num(pers, sb->level);
+
+	sprintf(devname, "/dev/md%d", sb->md_minor);
+
+	if (!verbose)
+		printf("ARRAY %s", devname);
+	else
+		printf("ARRAY %s level=%s num-devices=%d",
+		       devname,
+		       level_name ? level_name : "-unknown-", sb->raid_disks);
+
+	/* superblocks older than minor version 90 only print one uuid word */
+	if (sb->minor_version < 90)
+		printf(" UUID=%08x", sb->set_uuid0);
+	else
+		printf(" UUID=%08x:%08x:%08x:%08x", sb->set_uuid0, sb->set_uuid1,
+		       sb->set_uuid2, sb->set_uuid3);
+	printf("\n");
+}
+
+/*
+ * Print the superblock in st->sb as KEY=value lines on stdout:
+ * MD_LEVEL, MD_DEVICES, MD_UUID, MD_UPDATE_TIME and MD_EVENTS.
+ */
+static void export_examine_super0(struct supertype *st)
+{
+	mdp_super_t *sb = st->sb;
+	char *c = map_num(pers, sb->level);
+
+	/*
+	 * map_num() yields NULL for a level it does not know (the other
+	 * printers in this file guard against that), and NULL must not be
+	 * handed to %s.
+	 */
+	printf("MD_LEVEL=%s\n", c ? c : "-unknown-");
+	printf("MD_DEVICES=%d\n", sb->raid_disks);
+	if (sb->minor_version >= 90)
+		printf("MD_UUID=%08x:%08x:%08x:%08x\n",
+		       sb->set_uuid0, sb->set_uuid1,
+		       sb->set_uuid2, sb->set_uuid3);
+	else
+		printf("MD_UUID=%08x\n", sb->set_uuid0);
+	/*
+	 * Report the update time (utime), not the creation time.  0.90
+	 * superblocks are host-endian, so the field is used as-is without
+	 * a little-endian conversion.
+	 */
+	printf("MD_UPDATE_TIME=%llu\n",
+	       (unsigned long long)sb->utime & 0xFFFFFFFFFFULL);
+	printf("MD_EVENTS=%llu\n",
+	       ((unsigned long long)sb->events_hi << 32)
+	       + sb->events_lo);
+}
+
+/*
+ * Copy the 64K metadata area from 'from' to the same offset of 'to'.
+ *
+ * The area is only written when it looks like a 0.90 superblock (right
+ * magic, major version 0, matching checksum).
+ * Returns 0 when the copy was written, 1 on any failure or mismatch.
+ */
+static int copy_metadata0(struct supertype *st, int from, int to)
+{
+	const int nbytes = 64*1024;
+	unsigned long long dsize, offset;
+	mdp_super_t *super;
+	void *buf;
+	int rv = 1;
+
+	if (posix_memalign(&buf, 4096, nbytes) != 0)
+		return 1;
+
+	if (!get_dev_size(from, NULL, &dsize) ||
+	    dsize < MD_RESERVED_SECTORS*512)
+		goto out;
+
+	offset = MD_NEW_SIZE_SECTORS(dsize>>9);
+	offset *= 512;
+
+	if (lseek64(from, offset, 0) < 0LL ||
+	    read(from, buf, nbytes) != nbytes)
+		goto out;
+
+	/* the destination is positioned before the contents are vetted */
+	if (lseek64(to, offset, 0) < 0LL)
+		goto out;
+
+	super = buf;
+	if (super->md_magic != MD_SB_MAGIC)
+		goto out;
+	if (super->major_version != 0)
+		goto out;
+	if (calc_sb0_csum(super) != super->sb_csum)
+		goto out;
+
+	if (write(to, buf, nbytes) == nbytes)
+		rv = 0;
+out:
+	free(buf);
+	return rv;
+}
+
+/*
+ * Print the UUID and event-count lines for the superblock in st->sb.
+ * When 'homehost' is given and its hash matches the second half of the
+ * uuid, the UUID line is tagged as local to that host.
+ * 'subarray' is not used.
+ */
+static void detail_super0(struct supertype *st, char *homehost, char *subarray)
+{
+	mdp_super_t *sb = st->sb;
+
+	printf(" UUID : ");
+	if (sb->minor_version < 90)
+		printf("%08x", sb->set_uuid0);
+	else
+		printf("%08x:%08x:%08x:%08x", sb->set_uuid0, sb->set_uuid1,
+		       sb->set_uuid2, sb->set_uuid3);
+
+	if (homehost) {
+		char digest[20];
+		void *hash = sha1_buffer(homehost, strlen(homehost), digest);
+
+		if (memcmp(&sb->set_uuid2, hash, 8) == 0)
+			printf(" (local to host %s)", homehost);
+	}
+	printf("\n Events : %d.%d\n\n", sb->events_hi, sb->events_lo);
+}
+
+/*
+ * Print " UUID=<uuid>" for the superblock in st->sb, without a trailing
+ * newline.  'subarray' is not used.
+ */
+static void brief_detail_super0(struct supertype *st, char *subarray)
+{
+	mdp_super_t *sb = st->sb;
+
+	printf(" UUID=");
+	if (sb->minor_version < 90) {
+		printf("%08x", sb->set_uuid0);
+		return;
+	}
+	printf("%08x:%08x:%08x:%08x", sb->set_uuid0, sb->set_uuid1,
+	       sb->set_uuid2, sb->set_uuid3);
+}
+
+/*
+ * Return 1 when the first 8 bytes of the hash of 'homehost' equal the
+ * 8 bytes starting at set_uuid2, 0 otherwise (including when 'homehost'
+ * is NULL).
+ */
+static int match_home0(struct supertype *st, char *homehost)
+{
+	mdp_super_t *sb = st->sb;
+	char digest[20];
+	char *hash;
+
+	if (homehost == NULL)
+		return 0;
+
+	hash = sha1_buffer(homehost, strlen(homehost), digest);
+	if (memcmp(&sb->set_uuid2, hash, 8) != 0)
+		return 0;
+	return 1;
+}
+
+/*
+ * Copy the array uuid out of the superblock in st->sb.
+ * Superblocks with a minor version below 90 only provide the first
+ * word; the other three are reported as zero.
+ */
+static void uuid_from_super0(struct supertype *st, int uuid[4])
+{
+	mdp_super_t *super = st->sb;
+	const int full = super->minor_version >= 90;
+
+	uuid[0] = super->set_uuid0;
+	uuid[1] = full ? super->set_uuid1 : 0;
+	uuid[2] = full ? super->set_uuid2 : 0;
+	uuid[3] = full ? super->set_uuid3 : 0;
+}
+
+/*
+ * Fill 'info' from the 0.90 superblock in st->sb.
+ *
+ * 'info' is wiped first; the only value taken from it beforehand is
+ * array.raid_disks, which limits how many entries of 'map' are written.
+ * When 'map' is non-NULL, map[i] is set to 1 if slot i of the superblock
+ * holds a working device (in sync, active, not faulty, with a valid
+ * raid_disk) and to 0 otherwise.
+ */
+static void getinfo_super0(struct supertype *st, struct mdinfo *info, char *map)
+{
+	mdp_super_t *sb = st->sb;
+	/* must be sampled before 'info' is cleared below */
+	int map_disks = info->array.raid_disks;
+	int working = 0;
+	int slot;
+
+	memset(info, 0, sizeof(*info));
+
+	/* array-wide description */
+	info->array.major_version = sb->major_version;
+	info->array.minor_version = sb->minor_version;
+	info->array.patch_version = sb->patch_version;
+	info->array.raid_disks = sb->raid_disks;
+	info->array.level = sb->level;
+	info->array.layout = sb->layout;
+	info->array.md_minor = sb->md_minor;
+	info->array.ctime = sb->ctime;
+	info->array.utime = sb->utime;
+	info->array.chunk_size = sb->chunk_size;
+	info->array.state = sb->state;
+
+	info->component_size = sb->size;
+	info->component_size *= 2;
+
+	if (sb->state & (1<<MD_SB_BITMAP_PRESENT))
+		info->bitmap_offset = 8;
+
+	/* the device this superblock was read from */
+	info->disk.number = sb->this_disk.number;
+	info->disk.major = sb->this_disk.major;
+	info->disk.minor = sb->this_disk.minor;
+	info->disk.raid_disk = sb->this_disk.raid_disk;
+	info->disk.state = sb->this_disk.state;
+
+	info->events = md_event(sb);
+	info->data_offset = 0;
+	info->safe_mode_delay = 200;
+
+	sprintf(info->text_version, "0.%d", sb->minor_version);
+	sprintf(info->name, "%d", sb->md_minor);
+
+	uuid_from_super0(st, info->uuid);
+
+	info->recovery_start = MaxSector;
+	if (sb->minor_version > 90 && (sb->reshape_position+1) != 0) {
+		info->reshape_active = 1;
+		info->reshape_progress = sb->reshape_position;
+		info->new_level = sb->new_level;
+		info->delta_disks = sb->delta_disks;
+		info->new_layout = sb->new_layout;
+		info->new_chunk = sb->new_chunk;
+		if (info->delta_disks < 0)
+			info->array.raid_disks -= info->delta_disks;
+	} else {
+		info->reshape_active = 0;
+	}
+	info->recovery_blocked = info->reshape_active;
+
+	/* working_disks is calculated rather than read directly */
+	for (slot = 0; slot < MD_SB_DISKS; slot++) {
+		int up = (sb->disks[slot].state & (1<<MD_DISK_SYNC)) &&
+			 (sb->disks[slot].raid_disk < (unsigned)info->array.raid_disks) &&
+			 (sb->disks[slot].state & (1<<MD_DISK_ACTIVE)) &&
+			 !(sb->disks[slot].state & (1<<MD_DISK_FAULTY));
+
+		if (up)
+			working++;
+		if (map && slot < map_disks)
+			map[slot] = up;
+	}
+	info->array.working_disks = working;
+}
+
+/*
+ * Return a newly allocated mdinfo describing the array in st->sb.
+ * 0.90 metadata has no subarrays, so NULL is returned when 'subarray'
+ * is given.  The caller owns the result.
+ */
+static struct mdinfo *container_content0(struct supertype *st, char *subarray)
+{
+	struct mdinfo *info;
+
+	if (subarray)
+		return NULL;
+
+	/*
+	 * getinfo_super0() reads info->array.raid_disks before it clears
+	 * the structure, so hand it zero-filled memory rather than an
+	 * uninitialised buffer.
+	 */
+	info = xcalloc(1, sizeof(*info));
+	getinfo_super0(st, info, NULL);
+	return info;
+}
+
+static int update_super0(struct supertype *st, struct mdinfo *info,
+ char *update,
+ char *devname, int verbose,
+ int uuid_set, char *homehost)
+{ /* Apply the change named by 'update' to the in-memory superblock
+   * st->sb, then recompute sb_csum.
+   * Return value: 0 by default, 1 where a branch records that it changed
+   * something, -1 for an unrecognised update name, -2 where an update
+   * is refused.
+   */
+ /* NOTE: for 'assemble' and 'force' we need to return non-zero
+ * if any change was made. For others, the return value is
+ * ignored.
+ */
+ int rv = 0;
+ int uuid[4];
+ mdp_super_t *sb = st->sb;
+
+ if (strcmp(update, "homehost") == 0 &&
+ homehost) {
+ /* note that 'homehost' is special as it is really
+ * a "uuid" update.
+ */
+ uuid_set = 0;
+ update = "uuid";
+ info->uuid[0] = sb->set_uuid0; /* keep the first two uuid words; the "uuid" branch below fills words 2-3 from the hash */
+ info->uuid[1] = sb->set_uuid1;
+ }
+
+ if (strcmp(update, "sparc2.2")==0 ) {
+ /* 2.2 sparc put the events in the wrong place
+ * So we copy the tail of the superblock
+ * up 4 bytes before continuing
+ */
+ __u32 *sb32 = (__u32*)sb;
+
+ memmove(sb32+MD_SB_GENERIC_CONSTANT_WORDS+7,
+ sb32+MD_SB_GENERIC_CONSTANT_WORDS+7+1,
+ (MD_SB_WORDS - (MD_SB_GENERIC_CONSTANT_WORDS+7+1))*4);
+ if (verbose >= 0)
+ pr_err("adjusting superblock of %s for 2.2/sparc compatibility.\n",
+ devname);
+ } else if (strcmp(update, "super-minor") ==0) {
+ sb->md_minor = info->array.md_minor;
+ if (verbose > 0)
+ pr_err("updating superblock of %s with minor number %d\n",
+ devname, info->array.md_minor);
+ } else if (strcmp(update, "summaries") == 0) {
+ unsigned int i;
+ /* set nr_disks, active_disks, working_disks,
+ * failed_disks, spare_disks based on disks[]
+ * array in superblock.
+ * Also make sure extra slots aren't 'failed'
+ */
+ sb->nr_disks = sb->active_disks =
+ sb->working_disks = sb->failed_disks =
+ sb->spare_disks = 0;
+ for (i=0; i < MD_SB_DISKS ; i++)
+ if (sb->disks[i].major ||
+ sb->disks[i].minor) {
+ int state = sb->disks[i].state;
+ if (state & (1<<MD_DISK_REMOVED))
+ continue;
+ sb->nr_disks++;
+ if (state & (1<<MD_DISK_ACTIVE))
+ sb->active_disks++;
+ if (state & (1<<MD_DISK_FAULTY))
+ sb->failed_disks++;
+ else
+ sb->working_disks++;
+ if (state == 0)
+ sb->spare_disks++;
+ } else if (i >= sb->raid_disks && sb->disks[i].number == 0)
+ sb->disks[i].state = 0;
+ } else if (strcmp(update, "force-one")==0) {
+ /* Not enough devices for a working array, so
+ * bring this one up-to-date.
+ */
+ __u32 ehi = sb->events_hi, elo = sb->events_lo;
+ sb->events_hi = (info->events>>32) & 0xFFFFFFFF;
+ sb->events_lo = (info->events) & 0xFFFFFFFF;
+ if (sb->events_hi != ehi ||
+ sb->events_lo != elo)
+ rv = 1;
+ } else if (strcmp(update, "force-array")==0) {
+ /* degraded array and 'force' requested, so
+ * maybe need to mark it 'clean'
+ */
+ if ((sb->level == 5 || sb->level == 4 || sb->level == 6) &&
+ (sb->state & (1 << MD_SB_CLEAN)) == 0) {
+ /* need to force clean */
+ sb->state |= (1 << MD_SB_CLEAN);
+ rv = 1;
+ }
+ } else if (strcmp(update, "assemble")==0) {
+ int d = info->disk.number;
+ int wonly = sb->disks[d].state & (1<<MD_DISK_WRITEMOSTLY);
+ int failfast = sb->disks[d].state & (1<<MD_DISK_FAILFAST);
+ int mask = (1<<MD_DISK_WRITEMOSTLY)|(1<<MD_DISK_FAILFAST);
+ int add = 0;
+ if (sb->minor_version >= 91)
+ /* During reshape we don't insist on everything
+ * being marked 'sync'
+ */
+ add = (1<<MD_DISK_SYNC);
+ if (((sb->disks[d].state & ~mask) | add) !=
+ (unsigned)info->disk.state) {
+ sb->disks[d].state = info->disk.state | wonly |failfast; /* the write-mostly and failfast bits already stored are preserved */
+ rv = 1;
+ }
+ if (info->reshape_active &&
+ sb->minor_version > 90 && (sb->reshape_position+1) != 0 &&
+ info->delta_disks >= 0 &&
+ info->reshape_progress < sb->reshape_position) {
+ sb->reshape_position = info->reshape_progress;
+ rv = 1;
+ }
+ if (info->reshape_active &&
+ sb->minor_version > 90 && (sb->reshape_position+1) != 0 &&
+ info->delta_disks < 0 &&
+ info->reshape_progress > sb->reshape_position) {
+ sb->reshape_position = info->reshape_progress;
+ rv = 1;
+ }
+ } else if (strcmp(update, "linear-grow-new") == 0) {
+ memset(&sb->disks[info->disk.number], 0, sizeof(sb->disks[0]));
+ sb->disks[info->disk.number].number = info->disk.number;
+ sb->disks[info->disk.number].major = info->disk.major;
+ sb->disks[info->disk.number].minor = info->disk.minor;
+ sb->disks[info->disk.number].raid_disk = info->disk.raid_disk;
+ sb->disks[info->disk.number].state = info->disk.state;
+ sb->this_disk = sb->disks[info->disk.number];
+ } else if (strcmp(update, "linear-grow-update") == 0) {
+ sb->raid_disks = info->array.raid_disks;
+ sb->nr_disks = info->array.nr_disks;
+ sb->active_disks = info->array.active_disks;
+ sb->working_disks = info->array.working_disks;
+ memset(&sb->disks[info->disk.number], 0, sizeof(sb->disks[0]));
+ sb->disks[info->disk.number].number = info->disk.number;
+ sb->disks[info->disk.number].major = info->disk.major;
+ sb->disks[info->disk.number].minor = info->disk.minor;
+ sb->disks[info->disk.number].raid_disk = info->disk.raid_disk;
+ sb->disks[info->disk.number].state = info->disk.state;
+ } else if (strcmp(update, "resync") == 0) {
+ /* make sure resync happens */
+ sb->state &= ~(1<<MD_SB_CLEAN);
+ sb->recovery_cp = 0;
+ } else if (strcmp(update, "uuid") == 0) {
+ if (!uuid_set && homehost) {
+ char buf[20];
+ char *hash = sha1_buffer(homehost,
+ strlen(homehost),
+ buf);
+ memcpy(info->uuid+2, hash, 8); /* uuid words 2-3 become the first 8 bytes of the homehost hash */
+ }
+ sb->set_uuid0 = info->uuid[0];
+ sb->set_uuid1 = info->uuid[1];
+ sb->set_uuid2 = info->uuid[2];
+ sb->set_uuid3 = info->uuid[3];
+ if (sb->state & (1<<MD_SB_BITMAP_PRESENT)) {
+ struct bitmap_super_s *bm;
+ bm = (struct bitmap_super_s*)(sb+1); /* bitmap superblock is kept directly after the md superblock in this buffer */
+ uuid_from_super0(st, uuid);
+ memcpy(bm->uuid, uuid, 16);
+ }
+ } else if (strcmp(update, "metadata") == 0) {
+ /* Create some v1.0 metadata to match ours but make the
+ * ctime bigger. Also update info->array.*_version.
+ * We need to arrange that store_super writes out
+ * the v1.0 metadata.
+ * Not permitted for unclean array, or array with
+ * bitmap.
+ */
+ if (info->bitmap_offset) {
+ pr_err("Cannot update metadata when bitmap is present\n");
+ rv = -2;
+ } else if (info->array.state != 1) {
+ pr_err("Cannot update metadata on unclean array\n");
+ rv = -2;
+ } else {
+ info->array.major_version = 1;
+ info->array.minor_version = 0;
+ uuid_from_super0(st, info->uuid);
+ st->other = super1_make_v0(st, info, st->sb); /* store_super0() writes st->other out and then frees it */
+ }
+ } else if (strcmp(update, "revert-reshape") == 0) {
+ rv = -2;
+ if (sb->minor_version <= 90)
+ pr_err("No active reshape to revert on %s\n",
+ devname);
+ else if (sb->delta_disks == 0)
+ pr_err("%s: Can only revert reshape which changes number of devices\n",
+ devname);
+ else {
+ int tmp;
+ int parity = sb->level == 6 ? 2 : 1;
+ rv = 0;
+
+ if (sb->level >= 4 && sb->level <= 6 &&
+ sb->reshape_position % (
+ sb->new_chunk/512 *
+ (sb->raid_disks - sb->delta_disks - parity))) {
+ pr_err("Reshape position is not suitably aligned.\n");
+ pr_err("Try normal assembly and stop again\n");
+ return -2; /* early exit: nothing in this branch was modified and sb_csum is not recomputed */
+ }
+ sb->raid_disks -= sb->delta_disks;
+ sb->delta_disks = -sb->delta_disks;
+
+ tmp = sb->new_layout;
+ sb->new_layout = sb->layout;
+ sb->layout = tmp;
+
+ tmp = sb->new_chunk;
+ sb->new_chunk = sb->chunk_size;
+ sb->chunk_size = tmp;
+ }
+ } else if (strcmp(update, "no-bitmap") == 0) {
+ sb->state &= ~(1<<MD_SB_BITMAP_PRESENT);
+ } else if (strcmp(update, "_reshape_progress")==0)
+ sb->reshape_position = info->reshape_progress;
+ else if (strcmp(update, "writemostly")==0)
+ sb->state |= (1<<MD_DISK_WRITEMOSTLY); /* NOTE(review): sets an MD_DISK_* bit in the array-wide sb->state rather than in a disks[] entry - confirm this is intended */
+ else if (strcmp(update, "readwrite")==0)
+ sb->state &= ~(1<<MD_DISK_WRITEMOSTLY); /* NOTE(review): same concern as the "writemostly" case above */
+ else
+ rv = -1;
+
+ sb->sb_csum = calc_sb0_csum(sb);
+ return rv;
+}
+
+/*
+ * For version-0 superblock, the homehost is 'stored' in the uuid.
+ * 8 bytes for a hash of the host leaving 8 bytes of random material.
+ * We use the first 8 bytes (64bits) of the sha1 of the host name
+ *
+ * init_super0() allocates a zeroed superblock buffer (with room for a
+ * bitmap superblock behind it), stores it in st->sb and, when 'info' is
+ * given, fills it from 'info' and 's'.
+ *
+ * Returns 1 when a superblock was initialised.  Returns 0 when
+ * data_offset is set, when allocation fails, when 'info' is NULL (the
+ * buffer is then left all-zero in st->sb), when too many devices are
+ * requested, or when s->size does not fit the superblock's size field.
+ * 'ignored_name' is not used.
+ */
+static int init_super0(struct supertype *st, mdu_array_info_t *info,
+		       struct shape *s, char *ignored_name,
+		       char *homehost, int *uuid,
+		       unsigned long long data_offset)
+{
+	mdp_super_t *sb;
+	int spares;
+	size_t sb_alloc = MD_SB_BYTES + ROUND_UP(sizeof(bitmap_super_t), 4096);
+
+	if (data_offset != INVALID_SECTORS) {
+		pr_err("data-offset not support for 0.90\n");
+		return 0;
+	}
+
+	if (posix_memalign((void**)&sb, 4096, sb_alloc) != 0) {
+		pr_err("could not allocate superblock\n");
+		return 0;
+	}
+	/*
+	 * Clear the whole allocation, not just the two structures:
+	 * store_super0() may write the complete rounded-up bitmap area to
+	 * disk, and that must never contain stale heap contents.
+	 */
+	memset(sb, 0, sb_alloc);
+
+	st->sb = sb;
+	if (info == NULL) {
+		/* zeroing the superblock */
+		return 0;
+	}
+
+	spares = info->working_disks - info->active_disks;
+	if (info->raid_disks + spares > MD_SB_DISKS) {
+		pr_err("too many devices requested: %d+%d > %d\n",
+			info->raid_disks , spares, MD_SB_DISKS);
+		return 0;
+	}
+
+	sb->md_magic = MD_SB_MAGIC;
+	sb->major_version = 0;
+	sb->minor_version = 90;
+	sb->patch_version = 0;
+	sb->gvalid_words = 0; /* ignored */
+	sb->ctime = time(0);
+	sb->level = info->level;
+	sb->size = s->size;
+	/* refuse sizes that were truncated by the narrower superblock field */
+	if (s->size != (unsigned long long)sb->size)
+		return 0;
+	sb->nr_disks = info->nr_disks;
+	sb->raid_disks = info->raid_disks;
+	sb->md_minor = info->md_minor;
+	sb->not_persistent = 0;
+	if (uuid) {
+		sb->set_uuid0 = uuid[0];
+		sb->set_uuid1 = uuid[1];
+		sb->set_uuid2 = uuid[2];
+		sb->set_uuid3 = uuid[3];
+	} else {
+		__u32 r[4];
+		random_uuid((__u8 *)r);
+		sb->set_uuid0 = r[0];
+		sb->set_uuid1 = r[1];
+		sb->set_uuid2 = r[2];
+		sb->set_uuid3 = r[3];
+	}
+	/* only a generated uuid gets the homehost hash folded in */
+	if (homehost && !uuid) {
+		char buf[20];
+		char *hash = sha1_buffer(homehost,
+					 strlen(homehost),
+					 buf);
+		memcpy(&sb->set_uuid2, hash, 8);
+	}
+
+	sb->utime = sb->ctime;
+	sb->state = info->state;
+	sb->active_disks = info->active_disks;
+	sb->working_disks = info->working_disks;
+	sb->failed_disks = info->failed_disks;
+	sb->spare_disks = info->spare_disks;
+	sb->events_hi = 0;
+	sb->events_lo = 1;
+
+	sb->layout = info->layout;
+	sb->chunk_size = info->chunk_size;
+
+	return 1;
+}
+
+struct devinfo { /* one device queued by add_to_super0(); the list hangs off st->info */
+ int fd; /* write_init_super0() skips entries with fd == -1; free_super0() closes fds >= 0 */
+ char *devname; /* the caller's pointer is stored, the string is not copied */
+ mdu_disk_info_t disk; /* copy of the disk info passed to add_to_super0() */
+ struct devinfo *next; /* NULL terminates the list */
+};
+
+/* Add a device to the superblock being created */
+/*
+ * Record the device described by 'dinfo' in slot dinfo->number of the
+ * superblock being created, make it the current this_disk, refresh the
+ * checksum and append the device to the list kept in st->info.
+ * Only the ACTIVE and SYNC state bits are carried over.
+ * Always returns 0.  'data_offset' is not used.
+ */
+static int add_to_super0(struct supertype *st, mdu_disk_info_t *dinfo,
+			 int fd, char *devname, unsigned long long data_offset)
+{
+	mdp_super_t *sb = st->sb;
+	mdp_disk_t *slot = &sb->disks[dinfo->number];
+	const int keep = (1<<MD_DISK_ACTIVE) | (1<<MD_DISK_SYNC);
+	struct devinfo *entry;
+	struct devinfo **tail;
+
+	slot->number = dinfo->number;
+	slot->major = dinfo->major;
+	slot->minor = dinfo->minor;
+	slot->raid_disk = dinfo->raid_disk;
+	slot->state = dinfo->state & keep;
+
+	sb->this_disk = *slot;
+	sb->sb_csum = calc_sb0_csum(sb);
+
+	entry = xmalloc(sizeof(*entry));
+	entry->fd = fd;
+	entry->devname = devname;
+	entry->disk = *dinfo;
+	entry->next = NULL;
+
+	/* walk to the end of the list so devices stay in insertion order */
+	for (tail = (struct devinfo **)&st->info; *tail; tail = &(*tail)->next)
+		;
+	*tail = entry;
+
+	return 0;
+}
+
+/*
+ * Write the metadata held in 'st' to the device open on 'fd'.
+ *
+ * When st->other is set it is written as 1024 bytes of v1.0 metadata
+ * and then freed; otherwise the 0.90 superblock is written, followed by
+ * the bitmap superblock area when one is present and has a valid magic.
+ *
+ * Returns 0 on success, 1 if the device size is unknown, 2 if the
+ * device is too small, 3 on seek failure, 4 if the superblock write
+ * fails and 5 if the bitmap superblock write fails.
+ */
+static int store_super0(struct supertype *st, int fd)
+{
+	mdp_super_t *super = st->sb;
+	unsigned long long dsize;
+	unsigned long long offset;
+
+	if (!get_dev_size(fd, NULL, &dsize))
+		return 1;
+	if (dsize < MD_RESERVED_SECTORS*512)
+		return 2;
+
+	if (st->other) {
+		/* Writing out v1.0 metadata for --update=metadata */
+		int ret;
+
+		offset = dsize/512 - 8*2;
+		offset &= ~(4*2-1);
+		offset *= 512;
+
+		if (lseek64(fd, offset, 0) < 0LL) {
+			ret = 3;
+		} else if (write(fd, st->other, 1024) != 1024) {
+			ret = 4;
+		} else {
+			fsync(fd);
+			ret = 0;
+		}
+		free(st->other);
+		st->other = NULL;
+		return ret;
+	}
+
+	offset = MD_NEW_SIZE_SECTORS(dsize>>9);
+	offset *= 512;
+
+	if (lseek64(fd, offset, 0) < 0LL)
+		return 3;
+	if (write(fd, super, sizeof(*super)) != sizeof(*super))
+		return 4;
+
+	if (super->state & (1<<MD_SB_BITMAP_PRESENT)) {
+		/* the bitmap superblock sits right behind the md superblock */
+		struct bitmap_super_s *bm = (struct bitmap_super_s*)(super+1);
+
+		if (__le32_to_cpu(bm->magic) == BITMAP_MAGIC &&
+		    write(fd, bm, ROUND_UP(sizeof(*bm),4096)) !=
+		    ROUND_UP(sizeof(*bm),4096))
+			return 5;
+	}
+
+	fsync(fd);
+	return 0;
+}
+
+/*
+ * Write the freshly created superblock to every device queued in
+ * st->info, skipping faulty devices and entries without a descriptor.
+ * Stops at the first failure and returns its error code; returns 0 when
+ * all writes succeed.
+ */
+static int write_init_super0(struct supertype *st)
+{
+	mdp_super_t *sb = st->sb;
+	struct devinfo *di;
+	int rv = 0;
+
+	for (di = st->info; di != NULL && rv == 0; di = di->next) {
+		if ((di->disk.state & (1 << MD_DISK_FAULTY)) || di->fd == -1)
+			continue;
+
+		/* call Kill() repeatedly for as long as it keeps returning 0 */
+		while (Kill(di->devname, NULL, 0, -1, 1) == 0)
+			continue;
+
+		sb->disks[di->disk.number].state &= ~(1<<MD_DISK_FAULTY);
+
+		/* each device gets its own this_disk, so re-checksum per device */
+		sb->this_disk = sb->disks[di->disk.number];
+		sb->sb_csum = calc_sb0_csum(sb);
+
+		rv = store_super0(st, di->fd);
+		if (rv == 0 && (sb->state & (1<<MD_SB_BITMAP_PRESENT)))
+			rv = st->ss->write_bitmap(st, di->fd, NoUpdate);
+
+		if (rv != 0)
+			pr_err("failed to write superblock to %s\n",
+			       di->devname);
+	}
+	return rv;
+}
+
+/*
+ * Compare the superblock of 'tst' against the one in 'st'.
+ *
+ * return:
+ *  0 same, or first was empty, and second was copied
+ *  1 second had wrong number (also used when allocation fails)
+ *  2 wrong uuid
+ *  3 wrong other info
+ */
+static int compare_super0(struct supertype *st, struct supertype *tst,
+			  int verbose)
+{
+	mdp_super_t *first = st->sb;
+	mdp_super_t *second = tst->sb;
+	int uuid1[4], uuid2[4];
+
+	if (second->md_magic != MD_SB_MAGIC)
+		return 1;
+
+	if (first == NULL) {
+		/* nothing to compare with yet: adopt a copy of 'second' */
+		if (posix_memalign((void**)&first, 4096,
+				   MD_SB_BYTES +
+				   ROUND_UP(sizeof(struct bitmap_super_s), 4096)) != 0) {
+			pr_err("could not allocate superblock\n");
+			return 1;
+		}
+		memcpy(first, second, MD_SB_BYTES + sizeof(struct bitmap_super_s));
+		st->sb = first;
+		return 0;
+	}
+
+	uuid_from_super0(st, uuid1);
+	uuid_from_super0(tst, uuid2);
+	if (!same_uuid(uuid1, uuid2, 0))
+		return 2;
+
+	if (first->major_version == second->major_version &&
+	    first->minor_version == second->minor_version &&
+	    first->patch_version == second->patch_version &&
+	    first->gvalid_words == second->gvalid_words &&
+	    first->ctime == second->ctime &&
+	    first->level == second->level &&
+	    first->size == second->size &&
+	    first->raid_disks == second->raid_disks)
+		return 0;
+
+	return 3;
+}
+
+static void free_super0(struct supertype *st);
+
+/*
+ * Read the 0.90 superblock from the device open on 'fd' into a newly
+ * allocated buffer stored in st->sb.  Any superblock previously held by
+ * 'st' is released first.  Error messages are only printed when
+ * 'devname' is non-NULL (allocation failure is always reported).
+ *
+ * When the superblock says a bitmap is present, the bitmap superblock
+ * is read as well; if it cannot be read or does not match, the
+ * bitmap-present bit is cleared in the in-memory copy.
+ */
+static int load_super0(struct supertype *st, int fd, char *devname)
+{
+	/* try to read in the superblock
+	 * Return:
+	 *  0 on success
+	 *  1 on cannot get superblock
+	 *  2 on superblock meaningless
+	 */
+	unsigned long long dsize;
+	unsigned long long offset;
+	mdp_super_t *super;
+	int uuid[4];
+	struct bitmap_super_s *bsb;
+
+	free_super0(st);
+
+	if (!get_dev_size(fd, devname, &dsize))
+		return 1;
+
+	if (dsize < MD_RESERVED_SECTORS*512) {
+		/* dsize is in bytes; convert so the number matches the unit printed */
+		if (devname)
+			pr_err("%s is too small for md: size is %llu sectors.\n",
+			       devname, dsize >> 9);
+		return 1;
+	}
+	st->devsize = dsize;
+
+	offset = MD_NEW_SIZE_SECTORS(dsize>>9);
+
+	offset *= 512;
+
+	if (lseek64(fd, offset, 0)< 0LL) {
+		if (devname)
+			pr_err("Cannot seek to superblock on %s: %s\n",
+			       devname, strerror(errno));
+		return 1;
+	}
+
+	if (posix_memalign((void**)&super, 4096,
+			   MD_SB_BYTES +
+			   ROUND_UP(sizeof(bitmap_super_t), 4096)) != 0) {
+		pr_err("could not allocate superblock\n");
+		return 1;
+	}
+
+	if (read(fd, super, sizeof(*super)) != MD_SB_BYTES) {
+		if (devname)
+			pr_err("Cannot read superblock on %s\n",
+			       devname);
+		free(super);
+		return 1;
+	}
+
+	/* minor_version 9 is the flag for a byte-swapped superblock */
+	if (st->ss && st->minor_version == 9)
+		super0_swap_endian(super);
+
+	if (super->md_magic != MD_SB_MAGIC) {
+		if (devname)
+			pr_err("No super block found on %s (Expected magic %08x, got %08x)\n",
+			       devname, MD_SB_MAGIC, super->md_magic);
+		free(super);
+		return 2;
+	}
+
+	if (super->major_version != 0) {
+		if (devname)
+			pr_err("Cannot interpret superblock on %s - version is %d\n",
+			       devname, super->major_version);
+		free(super);
+		return 2;
+	}
+	st->sb = super;
+
+	/* no handler chosen yet: adopt this one and fill in its defaults */
+	if (st->ss == NULL) {
+		st->ss = &super0;
+		st->minor_version = super->minor_version;
+		st->max_devs = MD_SB_DISKS;
+		st->info = NULL;
+	}
+
+	/* Now check on the bitmap superblock */
+	if ((super->state & (1<<MD_SB_BITMAP_PRESENT)) == 0)
+		return 0;
+	/* Read the bitmap superblock and make sure it looks
+	 * valid.  If it doesn't clear the bit.  An --assemble --force
+	 * should get that written out.
+	 */
+	if (read(fd, super+1, ROUND_UP(sizeof(struct bitmap_super_s),4096)) !=
+	    ROUND_UP(sizeof(struct bitmap_super_s), 4096))
+		goto no_bitmap;
+
+	uuid_from_super0(st, uuid);
+	bsb = (struct bitmap_super_s *)(super+1);
+	if (__le32_to_cpu(bsb->magic) != BITMAP_MAGIC ||
+	    memcmp(bsb->uuid, uuid, 16) != 0)
+		goto no_bitmap;
+	return 0;
+
+ no_bitmap:
+	super->state &= ~(1<<MD_SB_BITMAP_PRESENT);
+
+	return 0;
+}
+
+/*
+ * Map a metadata name to a new supertype for the 0.90 handler.
+ *
+ * "0", "0.90" and "" (plus "default" when DEFAULT_OLD_METADATA is
+ * defined) select minor version 90, "0.91" selects 91 (reshape in
+ * progress) and "0.swap" / "0.9" select 9 (the byte-swapped flag).
+ * Returns NULL for any other name.  The caller owns the result.
+ */
+static struct supertype *match_metadata_desc0(char *arg)
+{
+	struct supertype *st = xcalloc(1, sizeof(*st));
+	int minor;
+
+	st->container_devnm[0] = 0;
+	st->ss = &super0;
+	st->info = NULL;
+	st->max_devs = MD_SB_DISKS;
+	st->sb = NULL;
+
+	/* we sometimes get 00.90 */
+	while (arg[0] == '0' && arg[1] == '0')
+		arg++;
+
+	if (strcmp(arg, "0") == 0 ||
+#ifdef DEFAULT_OLD_METADATA /* ifndef in super1.c */
+	    strcmp(arg, "default") == 0 ||
+#endif /* DEFAULT_OLD_METADATA */
+	    strcmp(arg, "0.90") == 0 ||
+	    strcmp(arg, "") == 0 /* no metadata  - i.e. non_persistent */
+		) {
+		minor = 90;
+	} else if (strcmp(arg, "0.91") == 0) {
+		/* reshape in progress; for dup_super support */
+		minor = 91;
+	} else if (strcmp(arg, "0.swap") == 0 ||
+		   strcmp(arg, "0.9") == 0) {
+		/* flag for 'byte-swapped'; "0.9" is for dup_super support */
+		minor = 9;
+	} else {
+		free(st);
+		return NULL;
+	}
+
+	st->minor_version = minor;
+	return st;
+}
+
+/*
+ * Return the usable size of a device of 'devsize' sectors, or 0 when an
+ * explicit non-zero data offset is requested or the device is smaller
+ * than the reserved area.
+ */
+static __u64 avail_size0(struct supertype *st, __u64 devsize,
+			 unsigned long long data_offset)
+{
+	if ((data_offset != 0 && data_offset != INVALID_SECTORS) ||
+	    devsize < MD_RESERVED_SECTORS)
+		return 0ULL;
+
+	return MD_NEW_SIZE_SECTORS(devsize);
+}
+
+/*
+ * Set up an internal bitmap superblock behind the md superblock.
+ *
+ * The bitmap comes immediately after the superblock and must be 60K in
+ * size at most.  size is in sectors, chunk is in bytes !!!
+ *
+ * When *chunkp is UnSet a chunk size is chosen (never below 64M) and
+ * stored back through 'chunkp'.  A requested chunk smaller than the
+ * minimum that fits 60K is rejected with -EINVAL.  Returns 0 on
+ * success.  'may_change' is not used.
+ */
+static int add_internal_bitmap0(struct supertype *st, int *chunkp,
+				int delay, int write_behind,
+				unsigned long long size, int may_change,
+				int major)
+{
+	mdp_super_t *sb = st->sb;
+	bitmap_super_t *bms = (bitmap_super_t*)(((char*)sb) + MD_SB_BYTES);
+	const unsigned long long max_bits = (60*1024 - sizeof(bitmap_super_t))*8;
+	unsigned long long min_chunk = 4096; /* sub-page chunks don't work yet.. */
+	unsigned long long bits = (size * 512) / min_chunk + 1;
+	int chunk = *chunkp;
+	int uuid[4];
+
+	/* double the chunk until the bitmap fits into the space available */
+	for (; bits > max_bits; bits = (bits+1)/2)
+		min_chunk *= 2;
+
+	if (chunk != UnSet) {
+		if ((unsigned long long)chunk < min_chunk)
+			return -EINVAL; /* chunk size too small */
+	} else {
+		/* A chunk size less than a few Megabytes gives poor
+		 * performance without increasing resync noticeably
+		 */
+		chunk = min_chunk;
+		if (chunk < 64*1024*1024)
+			chunk = 64*1024*1024;
+	}
+
+	sb->state |= (1<<MD_SB_BITMAP_PRESENT);
+
+	memset(bms, 0, sizeof(*bms));
+	uuid_from_super0(st, uuid);
+	memcpy(bms->uuid, uuid, 16);
+	bms->magic = __cpu_to_le32(BITMAP_MAGIC);
+	bms->version = __cpu_to_le32(major);
+	bms->chunksize = __cpu_to_le32(chunk);
+	bms->daemon_sleep = __cpu_to_le32(delay);
+	bms->sync_size = __cpu_to_le64(size);
+	bms->write_behind = __cpu_to_le32(write_behind);
+
+	*chunkp = chunk;
+	return 0;
+}
+
+/*
+ * Position 'fd' at the start of the internal bitmap, which sits
+ * MD_SB_BYTES after the start of the 0.90 superblock.
+ * Returns 0 on success, -1 if the device size cannot be determined,
+ * the device is too small, or the seek fails.
+ * 'st' and 'node_num' are not used.
+ */
+static int locate_bitmap0(struct supertype *st, int fd, int node_num)
+{
+	unsigned long long dsize;
+	unsigned long long offset;
+
+	if (!get_dev_size(fd, NULL, &dsize))
+		return -1;
+
+	if (dsize < MD_RESERVED_SECTORS*512)
+		return -1;
+
+	offset = MD_NEW_SIZE_SECTORS(dsize>>9);
+
+	offset *= 512;
+
+	offset += MD_SB_BYTES;
+
+	/* a failed seek must not be reported as success */
+	if (lseek64(fd, offset, 0) < 0LL)
+		return -1;
+	return 0;
+}
+
+/*
+ * Write a 60K bitmap area, 4096 bytes past the start of the superblock,
+ * to the device open on 'fd'.  The first block starts with the bitmap
+ * superblock kept behind st->sb; everything else is filled with 0xff.
+ *
+ * Returns 0 on success, 1 if the device size is unknown, -1 if the
+ * device is too small, 3 on seek failure, -ENOMEM if no buffer could be
+ * allocated and -2 if the area was not written completely.
+ * 'update' is not used.
+ */
+static int write_bitmap0(struct supertype *st, int fd, enum bitmap_update update)
+{
+	mdp_super_t *sb = st->sb;
+	unsigned long long dsize;
+	unsigned long long offset;
+	int remaining, n;
+	void *buf;
+	int rv = 0;
+
+	if (!get_dev_size(fd, NULL, &dsize))
+		return 1;
+	if (dsize < MD_RESERVED_SECTORS*512)
+		return -1;
+
+	offset = MD_NEW_SIZE_SECTORS(dsize>>9);
+	offset *= 512;
+
+	if (lseek64(fd, offset + 4096, 0) < 0LL)
+		return 3;
+
+	if (posix_memalign(&buf, 4096, 4096))
+		return -ENOMEM;
+
+	/* first block: bitmap superblock followed by all-ones padding */
+	memset(buf, 0xff, 4096);
+	memcpy(buf, ((char*)sb)+MD_SB_BYTES, sizeof(bitmap_super_t));
+
+	for (remaining = 60*1024; remaining > 0; remaining -= n) {
+		n = remaining > 4096 ? 4096 : remaining;
+		n = write(fd, buf, n);
+		if (n <= 0)
+			break;
+		/* every later block is pure 0xff */
+		memset(buf, 0xff, 4096);
+	}
+	fsync(fd);
+	if (remaining != 0)
+		rv = -2;
+
+	free(buf);
+	return rv;
+}
+
+/*
+ * Release the superblock buffer held by 'st' and the list of queued
+ * devices in st->info, closing every descriptor that is >= 0.
+ * st->sb and st->info are both NULL afterwards.
+ */
+static void free_super0(struct supertype *st)
+{
+	struct devinfo *di;
+
+	free(st->sb);
+	st->sb = NULL;
+
+	while ((di = st->info) != NULL) {
+		st->info = di->next;
+		if (di->fd >= 0)
+			close(di->fd);
+		free(di);
+	}
+}
+
+/*
+ * Check whether the requested array geometry can be described by 0.90
+ * metadata and, when 'subdev' is given, report through 'freesize' how
+ * many sectors of that device are usable.
+ *
+ * *chunk is set to DEFAULT_CHUNK when it is UnSet.  Rejections are only
+ * explained when 'verbose' is set.  Returns 1 if acceptable, 0 if not.
+ * 'st', 'data_offset' and 'consistency_policy' are not used.
+ */
+static int validate_geometry0(struct supertype *st, int level,
+			      int layout, int raiddisks,
+			      int *chunk, unsigned long long size,
+			      unsigned long long data_offset,
+			      char *subdev, unsigned long long *freesize,
+			      int consistency_policy, int verbose)
+{
+	unsigned long long ldsize;
+	int fd;
+	unsigned int tbmax = 4;
+
+	/* prior to linux 3.1, a bug limits usable device size to 2TB.
+	 * It was introduced in 2.6.29, but we won't worry about that detail
+	 */
+	if (get_linux_version() < 3001000)
+		tbmax = 2;
+
+	if (level == LEVEL_CONTAINER) {
+		if (verbose)
+			pr_err("0.90 metadata does not support containers\n");
+		return 0;
+	}
+	if (raiddisks > MD_SB_DISKS) {
+		if (verbose)
+			pr_err("0.90 metadata supports at most %d devices per array\n",
+				MD_SB_DISKS);
+		return 0;
+	}
+	/* 'size' is compared against tbmax terabytes expressed in sectors */
+	if (size >= tbmax * 2ULL*1024*1024*1024) {
+		/* tbmax is unsigned, so it is printed with %u */
+		if (verbose)
+			pr_err("0.90 metadata supports at most %u terabytes per device\n", tbmax);
+		return 0;
+	}
+	if (*chunk == UnSet)
+		*chunk = DEFAULT_CHUNK;
+
+	if (level == 0 && layout != UnSet) {
+		if (verbose)
+			pr_err("0.90 metadata does not support layouts for RAID0\n");
+		return 0;
+	}
+
+	if (!subdev)
+		return 1;
+
+	fd = open(subdev, O_RDONLY|O_EXCL, 0);
+	if (fd < 0) {
+		if (verbose)
+			pr_err("super0.90 cannot open %s: %s\n",
+				subdev, strerror(errno));
+		return 0;
+	}
+
+	if (!get_dev_size(fd, subdev, &ldsize)) {
+		close(fd);
+		return 0;
+	}
+	close(fd);
+
+	if (ldsize < MD_RESERVED_SECTORS * 512)
+		return 0;
+	*freesize = MD_NEW_SIZE_SECTORS(ldsize >> 9);
+	return 1;
+}
+
+struct superswitch super0 = { /* handler table for 0.90 metadata; members
+   * not named here are left zero by the designated initialiser.
+   */
+ .examine_super = examine_super0,
+ .brief_examine_super = brief_examine_super0,
+ .export_examine_super = export_examine_super0,
+ .detail_super = detail_super0,
+ .brief_detail_super = brief_detail_super0,
+ .write_init_super = write_init_super0,
+ .validate_geometry = validate_geometry0,
+ .add_to_super = add_to_super0,
+ .copy_metadata = copy_metadata0,
+ .match_home = match_home0,
+ .uuid_from_super = uuid_from_super0,
+ .getinfo_super = getinfo_super0,
+ .container_content = container_content0,
+ .update_super = update_super0,
+ .init_super = init_super0,
+ .store_super = store_super0,
+ .compare_super = compare_super0,
+ .load_super = load_super0,
+ .match_metadata_desc = match_metadata_desc0,
+ .avail_size = avail_size0,
+ .add_internal_bitmap = add_internal_bitmap0,
+ .locate_bitmap = locate_bitmap0,
+ .write_bitmap = write_bitmap0,
+ .free_super = free_super0,
+ .name = "0.90",
+};
diff --git a/super1.c b/super1.c
new file mode 100644
index 0000000..a12a5bc
--- /dev/null
+++ b/super1.c
@@ -0,0 +1,2980 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2001-2016 Neil Brown <neilb@suse.com>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email: <neilb@suse.de>
+ */
+
+#include <stddef.h>
+#include "mdadm.h"
+/*
+ * The version-1 superblock :
+ * All numeric fields are little-endian.
+ *
+ * total size: 256 bytes plus 2 per device.
+ * 1K allows 384 devices.
+ */
+struct mdp_superblock_1 {
+ /* constant array information - 128 bytes */
+ __u32 magic; /* MD_SB_MAGIC: 0xa92b4efc - little endian */
+ __u32 major_version; /* 1 */
+ __u32 feature_map; /* 0 for now */
+ __u32 pad0; /* always set to 0 when writing */
+
+ __u8 set_uuid[16]; /* user-space generated. */
+ char set_name[32]; /* set and interpreted by user-space */
+
+ __u64 ctime; /* lo 40 bits are seconds, top 24 are microseconds or 0*/
+ __u32 level; /* -4 (multipath), -1 (linear), 0,1,4,5 */
+ __u32 layout; /* used for raid5, raid6, raid10, and raid0 */
+ __u64 size; /* used size of component devices, in 512byte sectors */
+
+ __u32 chunksize; /* in 512byte sectors */
+ __u32 raid_disks;
+ union {
+ __u32 bitmap_offset; /* sectors after start of superblock that bitmap starts
+ * NOTE: signed, so bitmap can be before superblock
+ * only meaningful of feature_map[0] is set.
+ */
+
+ /* only meaningful when feature_map[MD_FEATURE_PPL] is set */
+ struct {
+ __s16 offset; /* sectors from start of superblock that ppl starts */
+ __u16 size; /* ppl size in sectors */
+ } ppl;
+ };
+
+ /* These are only valid with feature bit '4' */
+ __u32 new_level; /* new level we are reshaping to */
+ __u64 reshape_position; /* next address in array-space for reshape */
+ __u32 delta_disks; /* change in number of raid_disks */
+ __u32 new_layout; /* new layout */
+ __u32 new_chunk; /* new chunk size (sectors) */
+ __u32 new_offset; /* signed number to add to data_offset in new
+ * layout. 0 == no-change. This can be
+ * different on each device in the array.
+ */
+
+ /* constant this-device information - 64 bytes */
+ __u64 data_offset; /* sector start of data, often 0 */
+ __u64 data_size; /* sectors in this device that can be used for data */
+ __u64 super_offset; /* sector start of this superblock */
+ union {
+ __u64 recovery_offset;/* sectors before this offset (from data_offset) have been recovered */
+ __u64 journal_tail;/* journal tail of journal device (from data_offset) */
+ };
+ __u32 dev_number; /* permanent identifier of this device - not role in raid */
+ __u32 cnt_corrected_read; /* number of read errors that were corrected by re-writing */
+ __u8 device_uuid[16]; /* user-space setable, ignored by kernel */
+ __u8 devflags; /* per-device flags. Only one defined...*/
+#define WriteMostly1 1 /* mask for writemostly flag in above */
+#define FailFast1 2 /* Device should get FailFast requests */
+ /* bad block log. If there are any bad blocks the feature flag is set.
+ * if offset and size are non-zero, that space is reserved and available.
+ */
+ __u8 bblog_shift; /* shift from sectors to block size for badblock list */
+ __u16 bblog_size; /* number of sectors reserved for badblock list */
+ __u32 bblog_offset; /* sector offset from superblock to bblog, signed */
+
+ /* array state information - 64 bytes */
+ __u64 utime; /* 40 bits second, 24 bits microseconds */
+ __u64 events; /* incremented when superblock updated */
+ __u64 resync_offset; /* data before this offset (from data_offset) known to be in sync */
+ __u32 sb_csum; /* checksum upto dev_roles[max_dev] */
+ __u32 max_dev; /* size of dev_roles[] array to consider */
+ __u8 pad3[64-32]; /* set to 0 when writing */
+
+ /* device state information. Indexed by dev_number.
+ * 2 bytes per device
+ * Note there are no per-device state flags. State information is rolled
+ * into the 'roles' value. If a device is spare or faulty, then it doesn't
+ * have a meaningful role.
+ */
+ __u16 dev_roles[0]; /* role in array, or 0xffff for a spare, or 0xfffe for faulty */
+};
+
+ #define MAX_SB_SIZE 4096
+ /* bitmap super size is 256, but we round up to a sector for alignment.
+ * In this file the bitmap superblock is addressed MAX_SB_SIZE bytes after the
+ * start of the superblock buffer, and struct misc_dev_info BM_SUPER_SIZE
+ * bytes after that; SUPER1_SIZE is the sum of all three pieces.
+ * MAX_DEVS is the number of 2-byte dev_roles[] entries that fit in
+ * MAX_SB_SIZE after the fixed part of struct mdp_superblock_1.
+ */
+ #define BM_SUPER_SIZE 512
+ #define MAX_DEVS ((int)(MAX_SB_SIZE - sizeof(struct mdp_superblock_1)) / 2)
+ #define SUPER1_SIZE (MAX_SB_SIZE + BM_SUPER_SIZE \
+ + sizeof(struct misc_dev_info))
+
+ struct misc_dev_info {
+ __u64 device_size; /* used by getinfo_super1() when computing space_after; presumably in sectors - TODO confirm */
+ };
+
+ #define MULTIPLE_PPL_AREA_SIZE_SUPER1 (1024 * 1024) /* Size of the whole
+ 						       * multiple PPL area
+ 						       */
+ /* feature_map bits */
+ #define MD_FEATURE_BITMAP_OFFSET 1
+ #define MD_FEATURE_RECOVERY_OFFSET 2 /* recovery_offset is present and
+ 				       * must be honoured
+ 				       */
+ #define MD_FEATURE_RESHAPE_ACTIVE 4
+ #define MD_FEATURE_BAD_BLOCKS 8 /* badblock list is not empty */
+ #define MD_FEATURE_REPLACEMENT 16 /* This device is replacing an
+ 				    * active device with same 'role'.
+ 				    * 'recovery_offset' is also set.
+ 				    */
+ #define MD_FEATURE_RESHAPE_BACKWARDS 32 /* Reshape doesn't change number
+ 					  * of devices, but is going
+ 					  * backwards anyway.
+ 					  */
+ #define MD_FEATURE_NEW_OFFSET 64 /* new_offset must be honoured */
+ #define MD_FEATURE_BITMAP_VERSIONED 256 /* bitmap version number checked properly */
+ #define MD_FEATURE_JOURNAL 512 /* support write journal */
+ #define MD_FEATURE_PPL 1024 /* support PPL */
+ /* The historical misspelling "MUTLIPLE" is the name the rest of this file uses. */
+ #define MD_FEATURE_MUTLIPLE_PPLS 2048 /* support for multiple PPLs */
+ #define MD_FEATURE_RAID0_LAYOUT 4096 /* layout is meaningful in RAID0 */
+ /*
+  * Union of every feature bit defined above.  It must only name macros that
+  * are actually defined here, otherwise any use of it fails to compile.
+  */
+ #define MD_FEATURE_ALL (MD_FEATURE_BITMAP_OFFSET \
+ 			|MD_FEATURE_RECOVERY_OFFSET \
+ 			|MD_FEATURE_RESHAPE_ACTIVE \
+ 			|MD_FEATURE_BAD_BLOCKS \
+ 			|MD_FEATURE_REPLACEMENT \
+ 			|MD_FEATURE_RESHAPE_BACKWARDS \
+ 			|MD_FEATURE_NEW_OFFSET \
+ 			|MD_FEATURE_BITMAP_VERSIONED \
+ 			|MD_FEATURE_JOURNAL \
+ 			|MD_FEATURE_PPL \
+ 			|MD_FEATURE_MUTLIPLE_PPLS \
+ 			|MD_FEATURE_RAID0_LAYOUT \
+ 			)
+
+ /*
+  * Return the role recorded for this device in dev_roles[].
+  * A dev_number outside the first max_dev entries is reported as
+  * MD_DISK_ROLE_SPARE.
+  */
+ static int role_from_sb(struct mdp_superblock_1 *sb)
+ {
+ 	unsigned int slot = __le32_to_cpu(sb->dev_number);
+
+ 	if (slot >= __le32_to_cpu(sb->max_dev))
+ 		return MD_DISK_ROLE_SPARE;
+
+ 	return __le16_to_cpu(sb->dev_roles[slot]);
+ }
+
+ /*
+  * Return the number of bytes needed for one bitmap: the bits packed into
+  * bytes, plus the bitmap superblock, rounded up to 'boundary'.
+  * For cluster-md each node should have its own bitmap.
+  */
+ static unsigned int calc_bitmap_size(bitmap_super_t *bms, unsigned int boundary)
+ {
+ 	unsigned long long nbits;
+ 	unsigned long long total;
+
+ 	nbits = bitmap_bits(__le64_to_cpu(bms->sync_size),
+ 			    __le32_to_cpu(bms->chunksize));
+
+ 	/* eight bits per byte, rounding up, then the header */
+ 	total = ((nbits+7) >> 3) + sizeof(bitmap_super_t);
+ 	total = ROUND_UP(total, boundary);
+
+ 	return total;
+ }
+
+ /*
+  * Compute the checksum of a version-1 superblock.
+  *
+  * The sum covers the fixed part plus max_dev 2-byte roles, with sb_csum
+  * treated as zero; the stored sb_csum is restored before returning.
+  * The 64-bit running sum is folded to 32 bits and returned little-endian,
+  * so it can be compared directly against sb->sb_csum.
+  */
+ static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb)
+ {
+ 	unsigned int saved_csum, folded;
+ 	unsigned long long sum = 0;
+ 	int remaining = sizeof(*sb) + __le32_to_cpu(sb->max_dev)*2;
+ 	unsigned int *word = (unsigned int*)sb;
+
+ 	/* make sure I can count... */
+ 	if (!(offsetof(struct mdp_superblock_1,data_offset) == 128 &&
+ 	      offsetof(struct mdp_superblock_1, utime) == 192 &&
+ 	      sizeof(struct mdp_superblock_1) == 256))
+ 		fprintf(stderr, "WARNING - superblock isn't sized correctly\n");
+
+ 	saved_csum = sb->sb_csum;
+ 	sb->sb_csum = 0;
+
+ 	while (remaining >= 4) {
+ 		sum += __le32_to_cpu(*word);
+ 		word++;
+ 		remaining -= 4;
+ 	}
+
+ 	/* an odd number of roles leaves one trailing 16-bit value */
+ 	if (remaining == 2)
+ 		sum += __le16_to_cpu(*(unsigned short*) word);
+
+ 	folded = (sum & 0xffffffff) + (sum >> 32);
+ 	sb->sb_csum = saved_csum;
+ 	return __cpu_to_le32(folded);
+ }
+
+/*
+ * Information related to file descriptor used for aligned reads/writes.
+ * Cache the block size.
+ */
+struct align_fd {
+ int fd;
+ int blk_sz;
+};
+
+ /*
+  * Bind 'afd' to 'fd' and cache the device's sector size, falling back to
+  * 512 when get_dev_sector_size() reports failure.
+  */
+ static void init_afd(struct align_fd *afd, int fd)
+ {
+ 	unsigned int *sector_size = (unsigned int *)&afd->blk_sz;
+
+ 	afd->fd = fd;
+ 	if (get_dev_sector_size(fd, NULL, sector_size))
+ 		return;
+
+ 	afd->blk_sz = 512;
+ }
+
+ /* Bounce buffer shared by aread() and awrite(); oversized so that a
+  * 4096-aligned window of 4096 bytes always fits inside it.
+  */
+ static char abuf[4096+4096];
+
+ /*
+  * Aligned read.
+  * The request is rounded up to whole sectors, read into the aligned bounce
+  * buffer, and at most 'len' bytes are copied out to 'buf'.  The file offset
+  * is adjusted so that it advances by 'len' rather than by the rounded size.
+  * Returns the number of bytes delivered, the read() result when that is
+  * <= 0, or -1 for an unusable block size or a request over 4096 bytes.
+  */
+ static int aread(struct align_fd *afd, void *buf, int len)
+ {
+ 	int sector = afd->blk_sz;
+ 	int io_len = 0;
+ 	char *bounce;
+ 	int got;
+
+ 	if (!sector) {
+ 		fprintf(stderr, "WARNING - aread() called with invalid block size\n");
+ 		return -1;
+ 	}
+ 	if (sector > 4096 || len > 4096)
+ 		return -1;
+
+ 	bounce = ROUND_UP_PTR((char *)abuf, 4096);
+
+ 	/* smallest multiple of the sector size that covers len */
+ 	while (io_len < len)
+ 		io_len += sector;
+
+ 	got = read(afd->fd, bounce, io_len);
+ 	if (got <= 0)
+ 		return got;
+
+ 	/* whence 1: relative to the current position */
+ 	lseek(afd->fd, len - got, 1);
+ 	if (got > len)
+ 		got = len;
+ 	memcpy(buf, bounce, got);
+ 	return got;
+ }
+
+ /*
+  * Aligned write.
+  * The request is rounded up to whole sectors.  When that is larger than
+  * 'len' the existing sectors are read first so the bytes beyond 'len' are
+  * written back unchanged.  The address must be sector-aligned.
+  * Returns 'len' on success, the read()/write() result when that is <= 0,
+  * or -1 for an unusable block size or a request over 4096 bytes.
+  */
+ static int awrite(struct align_fd *afd, void *buf, int len)
+ {
+ 	int sector = afd->blk_sz;
+ 	int io_len = 0;
+ 	char *bounce;
+ 	int rc;
+
+ 	if (!sector) {
+ 		fprintf(stderr, "WARNING - awrite() called with invalid block size\n");
+ 		return -1;
+ 	}
+ 	if (sector > 4096 || len > 4096)
+ 		return -1;
+
+ 	bounce = ROUND_UP_PTR((char *)abuf, 4096);
+
+ 	/* smallest multiple of the sector size that covers len */
+ 	while (io_len < len)
+ 		io_len += sector;
+
+ 	if (io_len != len) {
+ 		/* pre-read, then step back to where we started */
+ 		rc = read(afd->fd, bounce, io_len);
+ 		if (rc <= 0)
+ 			return rc;
+ 		lseek(afd->fd, -rc, 1);
+ 	}
+
+ 	memcpy(bounce, buf, len);
+ 	rc = write(afd->fd, bounce, io_len);
+ 	if (rc <= 0)
+ 		return rc;
+
+ 	/* leave the offset 'len' bytes past the start, not io_len */
+ 	lseek(afd->fd, len - rc, 1);
+ 	return len;
+ }
+
+ /*
+  * Return non-zero when either PPL feature bit is set.
+  * 'feature_map' is the raw little-endian value from the superblock, so it
+  * is converted to CPU order before being tested against the bit masks.
+  */
+ static inline unsigned int md_feature_any_ppl_on(__u32 feature_map)
+ {
+ 	return __le32_to_cpu(feature_map) &
+ 	       (MD_FEATURE_PPL | MD_FEATURE_MUTLIPLE_PPLS);
+ }
+
+ /*
+  * PPL space to reserve: the header (PPL_HEADER_SIZE bytes, expressed in
+  * 512-byte units) plus the larger of 'chunk' and 128*2.
+  */
+ static inline unsigned int choose_ppl_space(int chunk)
+ {
+ 	int payload = 128*2;
+
+ 	if (chunk > payload)
+ 		payload = chunk;
+
+ 	return (PPL_HEADER_SIZE >> 9) + payload;
+ }
+
+ /*
+  * Print a human-readable report of the version-1 superblock held in st->sb.
+  *
+  * st       - supertype whose ->sb points at a loaded superblock; the bitmap
+  *            superblock is expected MAX_SB_SIZE bytes after it.
+  * homehost - optional host name; when set_name starts with "<homehost>:"
+  *            the array is reported as local to that host.
+  *
+  * All multi-byte fields are converted from little-endian before printing.
+  */
+ static void examine_super1(struct supertype *st, char *homehost)
+ {
+ 	struct mdp_superblock_1 *sb = st->sb;
+ 	bitmap_super_t *bms = (bitmap_super_t*)(((char*)sb)+MAX_SB_SIZE);
+ 	time_t atime;
+ 	unsigned int d;
+ 	int role;
+ 	int delta_extra = 0;
+ 	int i;
+ 	char *c;
+ 	int l = homehost ? strlen(homehost) : 0;
+ 	int layout;
+ 	unsigned long long sb_offset;
+ 	struct mdinfo info;
+ 	int inconsistent = 0;
+
+ 	printf(" Magic : %08x\n", __le32_to_cpu(sb->magic));
+ 	printf(" Version : 1");
+ 	/* the minor version is inferred from where the superblock lives */
+ 	sb_offset = __le64_to_cpu(sb->super_offset);
+ 	if (sb_offset <= 4)
+ 		printf(".1\n");
+ 	else if (sb_offset <= 8)
+ 		printf(".2\n");
+ 	else
+ 		printf(".0\n");
+ 	printf(" Feature Map : 0x%x\n", __le32_to_cpu(sb->feature_map));
+ 	printf(" Array UUID : ");
+ 	for (i=0; i<16; i++) {
+ 		if ((i&3)==0 && i != 0)
+ 			printf(":");
+ 		printf("%02x", sb->set_uuid[i]);
+ 	}
+ 	printf("\n");
+ 	printf(" Name : %.32s", sb->set_name);
+ 	if (l > 0 && l < 32 &&
+ 	    sb->set_name[l] == ':' &&
+ 	    strncmp(sb->set_name, homehost, l) == 0)
+ 		printf(" (local to host %s)", homehost);
+ 	printf("\n");
+ 	if (bms->nodes > 0 &&
+ 	    (__le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET))
+ 		printf(" Cluster Name : %-64s\n", bms->cluster_name);
+ 	atime = __le64_to_cpu(sb->ctime) & 0xFFFFFFFFFFULL;
+ 	printf(" Creation Time : %.24s\n", ctime(&atime));
+ 	c=map_num(pers, __le32_to_cpu(sb->level));
+ 	printf(" Raid Level : %s\n", c?c:"-unknown-");
+ 	printf(" Raid Devices : %d\n", __le32_to_cpu(sb->raid_disks));
+ 	printf("\n");
+ 	printf(" Avail Dev Size : %llu sectors%s\n",
+ 	       (unsigned long long)__le64_to_cpu(sb->data_size),
+ 	       human_size(__le64_to_cpu(sb->data_size)<<9));
+ 	if (__le32_to_cpu(sb->level) > 0) {
+ 		int ddsks = 0, ddsks_denom = 1;
+ 		switch(__le32_to_cpu(sb->level)) {
+ 		case 1: ddsks=1;break;
+ 		case 4:
+ 		case 5: ddsks = __le32_to_cpu(sb->raid_disks)-1; break;
+ 		case 6: ddsks = __le32_to_cpu(sb->raid_disks)-2; break;
+ 		case 10:
+ 			layout = __le32_to_cpu(sb->layout);
+ 			ddsks = __le32_to_cpu(sb->raid_disks);
+ 			ddsks_denom = (layout&255) * ((layout>>8)&255);
+ 			break;
+ 		default:
+ 			break;
+ 		}
+ 		/* a RAID10 layout with a zero copy count would otherwise
+ 		 * divide by zero; skip the array size in that case
+ 		 */
+ 		if (ddsks && ddsks_denom) {
+ 			long long asize = __le64_to_cpu(sb->size);
+ 			asize = (asize << 9) * ddsks / ddsks_denom;
+ 			printf(" Array Size : %llu KiB%s\n",
+ 			       asize >> 10, human_size(asize));
+ 		}
+ 		if (sb->size != sb->data_size)
+ 			printf(" Used Dev Size : %llu sectors%s\n",
+ 			       (unsigned long long)__le64_to_cpu(sb->size),
+ 			       human_size(__le64_to_cpu(sb->size)<<9));
+ 	}
+ 	if (sb->data_offset)
+ 		printf(" Data Offset : %llu sectors\n",
+ 		       (unsigned long long)__le64_to_cpu(sb->data_offset));
+ 	if (sb->new_offset &&
+ 	    (__le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET)) {
+ 		unsigned long long offset = __le64_to_cpu(sb->data_offset);
+ 		offset += (signed)(int32_t)__le32_to_cpu(sb->new_offset);
+ 		printf(" New Offset : %llu sectors\n", offset);
+ 	}
+ 	printf(" Super Offset : %llu sectors\n",
+ 	       (unsigned long long)__le64_to_cpu(sb->super_offset));
+ 	if (__le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET)
+ 		printf("Recovery Offset : %llu sectors\n",
+ 		       (unsigned long long)__le64_to_cpu(sb->recovery_offset));
+
+ 	st->ss->getinfo_super(st, &info, NULL);
+ 	if (info.space_after != 1 &&
+ 	    !(__le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET))
+ 		printf(" Unused Space : before=%llu sectors, after=%llu sectors\n",
+ 		       info.space_before, info.space_after);
+
+ 	/* resync_offset of all-ones (+1 wraps to 0) is reported as clean */
+ 	printf(" State : %s\n",
+ 	       (__le64_to_cpu(sb->resync_offset)+1)? "active":"clean");
+ 	printf(" Device UUID : ");
+ 	for (i=0; i<16; i++) {
+ 		if ((i&3)==0 && i != 0)
+ 			printf(":");
+ 		printf("%02x", sb->device_uuid[i]);
+ 	}
+ 	printf("\n");
+ 	printf("\n");
+ 	if (sb->feature_map & __cpu_to_le32(MD_FEATURE_BITMAP_OFFSET)) {
+ 		printf("Internal Bitmap : %ld sectors from superblock\n",
+ 		       (long)(int32_t)__le32_to_cpu(sb->bitmap_offset));
+ 	} else if (md_feature_any_ppl_on(sb->feature_map)) {
+ 		/* ppl.offset is signed: cast so negative offsets print as such */
+ 		printf(" PPL : %u sectors at offset %d sectors from superblock\n",
+ 		       __le16_to_cpu(sb->ppl.size),
+ 		       (int16_t)__le16_to_cpu(sb->ppl.offset));
+ 	}
+ 	if (sb->feature_map & __cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE)) {
+ 		printf(" Reshape pos'n : %llu%s\n", (unsigned long long)
+ 		       __le64_to_cpu(sb->reshape_position)/2,
+ 		       human_size(__le64_to_cpu(sb->reshape_position)<<9));
+ 		if (__le32_to_cpu(sb->delta_disks)) {
+ 			printf(" Delta Devices : %d",
+ 			       __le32_to_cpu(sb->delta_disks));
+ 			printf(" (%d->%d)\n",
+ 			       __le32_to_cpu(sb->raid_disks) -
+ 			       __le32_to_cpu(sb->delta_disks),
+ 			       __le32_to_cpu(sb->raid_disks));
+ 			/* when shrinking, the old slots are still shown below */
+ 			if ((int)__le32_to_cpu(sb->delta_disks) < 0)
+ 				delta_extra = -__le32_to_cpu(sb->delta_disks);
+ 		}
+ 		if (__le32_to_cpu(sb->new_level) != __le32_to_cpu(sb->level)) {
+ 			c = map_num(pers, __le32_to_cpu(sb->new_level));
+ 			printf(" New Level : %s\n", c?c:"-unknown-");
+ 		}
+ 		if (__le32_to_cpu(sb->new_layout) !=
+ 		    __le32_to_cpu(sb->layout)) {
+ 			if (__le32_to_cpu(sb->level) == 5) {
+ 				c = map_num(r5layout,
+ 					    __le32_to_cpu(sb->new_layout));
+ 				printf(" New Layout : %s\n", c?c:"-unknown-");
+ 			}
+ 			if (__le32_to_cpu(sb->level) == 6) {
+ 				c = map_num(r6layout,
+ 					    __le32_to_cpu(sb->new_layout));
+ 				printf(" New Layout : %s\n", c?c:"-unknown-");
+ 			}
+ 			if (__le32_to_cpu(sb->level) == 10) {
+ 				printf(" New Layout :");
+ 				print_r10_layout(__le32_to_cpu(sb->new_layout));
+ 				printf("\n");
+ 			}
+ 		}
+ 		if (__le32_to_cpu(sb->new_chunk) !=
+ 		    __le32_to_cpu(sb->chunksize))
+ 			printf(" New Chunksize : %dK\n",
+ 			       __le32_to_cpu(sb->new_chunk)/2);
+ 		printf("\n");
+ 	}
+ 	if (sb->devflags) {
+ 		printf(" Flags :");
+ 		if (sb->devflags & WriteMostly1)
+ 			printf(" write-mostly");
+ 		if (sb->devflags & FailFast1)
+ 			printf(" failfast");
+ 		printf("\n");
+ 	}
+
+ 	atime = __le64_to_cpu(sb->utime) & 0xFFFFFFFFFFULL;
+ 	printf(" Update Time : %.24s\n", ctime(&atime));
+
+ 	if (sb->bblog_size && sb->bblog_offset) {
+ 		/* 8 bytes per entry, bblog_size counts 512-byte sectors */
+ 		printf(" Bad Block Log : %d entries available at offset %ld sectors",
+ 		       __le16_to_cpu(sb->bblog_size)*512/8,
+ 		       (long)(int32_t)__le32_to_cpu(sb->bblog_offset));
+ 		if (sb->feature_map & __cpu_to_le32(MD_FEATURE_BAD_BLOCKS))
+ 			printf(" - bad blocks present.");
+ 		printf("\n");
+ 	}
+
+ 	if (calc_sb_1_csum(sb) == sb->sb_csum)
+ 		printf(" Checksum : %x - correct\n",
+ 		       __le32_to_cpu(sb->sb_csum));
+ 	else
+ 		printf(" Checksum : %x - expected %x\n",
+ 		       __le32_to_cpu(sb->sb_csum),
+ 		       __le32_to_cpu(calc_sb_1_csum(sb)));
+ 	printf(" Events : %llu\n",
+ 	       (unsigned long long)__le64_to_cpu(sb->events));
+ 	printf("\n");
+ 	if (__le32_to_cpu(sb->level) == 0 &&
+ 	    (sb->feature_map & __cpu_to_le32(MD_FEATURE_RAID0_LAYOUT))) {
+ 		c = map_num(r0layout, __le32_to_cpu(sb->layout));
+ 		printf(" Layout : %s\n", c?c:"-unknown-");
+ 	}
+ 	if (__le32_to_cpu(sb->level) == 5) {
+ 		c = map_num(r5layout, __le32_to_cpu(sb->layout));
+ 		printf(" Layout : %s\n", c?c:"-unknown-");
+ 	}
+ 	if (__le32_to_cpu(sb->level) == 6) {
+ 		c = map_num(r6layout, __le32_to_cpu(sb->layout));
+ 		printf(" Layout : %s\n", c?c:"-unknown-");
+ 	}
+ 	if (__le32_to_cpu(sb->level) == 10) {
+ 		int lo = __le32_to_cpu(sb->layout);
+ 		printf(" Layout :");
+ 		print_r10_layout(lo);
+ 		printf("\n");
+ 	}
+ 	switch(__le32_to_cpu(sb->level)) {
+ 	case 0:
+ 	case 4:
+ 	case 5:
+ 	case 6:
+ 	case 10:
+ 		printf(" Chunk Size : %dK\n",
+ 		       __le32_to_cpu(sb->chunksize)/2);
+ 		break;
+ 	case -1:
+ 		printf(" Rounding : %dK\n",
+ 		       __le32_to_cpu(sb->chunksize)/2);
+ 		break;
+ 	default:
+ 		break;
+ 	}
+ 	printf("\n");
+
+ 	printf(" Device Role : ");
+ 	role = role_from_sb(sb);
+ 	if (role >= MD_DISK_ROLE_FAULTY)
+ 		printf("spare\n");
+ 	else if (role == MD_DISK_ROLE_JOURNAL)
+ 		printf("Journal\n");
+ 	else if (sb->feature_map & __cpu_to_le32(MD_FEATURE_REPLACEMENT))
+ 		printf("Replacement device %d\n", role);
+ 	else
+ 		printf("Active device %d\n", role);
+
+ 	/* one character per slot, based on how many devices claim it */
+ 	printf(" Array State : ");
+ 	for (d = 0; d < __le32_to_cpu(sb->raid_disks) + delta_extra; d++) {
+ 		int cnt = 0;
+ 		unsigned int i;
+ 		for (i = 0; i < __le32_to_cpu(sb->max_dev); i++) {
+ 			unsigned int role = __le16_to_cpu(sb->dev_roles[i]);
+ 			if (role == d)
+ 				cnt++;
+ 		}
+ 		if (cnt == 2 && __le32_to_cpu(sb->level) > 0)
+ 			printf("R");
+ 		else if (cnt == 1)
+ 			printf("A");
+ 		else if (cnt == 0)
+ 			printf(".");
+ 		else {
+ 			printf("?");
+ 			inconsistent = 1;
+ 		}
+ 	}
+ 	printf(" ('A' == active, '.' == missing, 'R' == replacing)");
+ 	printf("\n");
+ 	for (d = 0; d < __le32_to_cpu(sb->max_dev); d++) {
+ 		unsigned int r = __le16_to_cpu(sb->dev_roles[d]);
+ 		if (r <= MD_DISK_ROLE_MAX &&
+ 		    r > __le32_to_cpu(sb->raid_disks) + delta_extra)
+ 			inconsistent = 1;
+ 	}
+ 	if (inconsistent) {
+ 		printf("WARNING Array state is inconsistent - each number should appear only once\n");
+ 		for (d = 0; d < __le32_to_cpu(sb->max_dev); d++)
+ 			if (__le16_to_cpu(sb->dev_roles[d]) >= MD_DISK_ROLE_FAULTY)
+ 				printf(" %d:-", d);
+ 			else
+ 				printf(" %d:%d", d, __le16_to_cpu(sb->dev_roles[d]));
+ 		printf("\n");
+ 	}
+ }
+
+ /*
+  * Print a one-line "ARRAY ..." description of the superblock in st->sb.
+  * The device name is the part of set_name after the first ':' (or all of
+  * it when there is no ':'); 'verbose' adds the level and device count.
+  */
+ static void brief_examine_super1(struct supertype *st, int verbose)
+ {
+ 	struct mdp_superblock_1 *sb = st->sb;
+ 	char *level_name = map_num(pers, __le32_to_cpu(sb->level));
+ 	char *colon = strchr(sb->set_name, ':');
+ 	char *short_name = NULL;
+ 	unsigned long long sb_offset;
+ 	const char *metadata;
+ 	int idx;
+
+ 	if (colon)
+ 		short_name = colon + 1;
+ 	else if (sb->set_name[0])
+ 		short_name = sb->set_name;
+
+ 	printf("ARRAY ");
+ 	if (short_name) {
+ 		printf("/dev/md/");
+ 		print_escape(short_name);
+ 		putchar(' ');
+ 	}
+ 	if (verbose && level_name)
+ 		printf(" level=%s", level_name);
+
+ 	/* the minor version follows from the superblock location */
+ 	sb_offset = __le64_to_cpu(sb->super_offset);
+ 	if (sb_offset <= 4)
+ 		metadata = " metadata=1.1 ";
+ 	else if (sb_offset <= 8)
+ 		metadata = " metadata=1.2 ";
+ 	else
+ 		metadata = " metadata=1.0 ";
+ 	fputs(metadata, stdout);
+
+ 	if (verbose)
+ 		printf("num-devices=%d ", __le32_to_cpu(sb->raid_disks));
+
+ 	printf("UUID=");
+ 	for (idx = 0; idx < 16; idx++) {
+ 		if (idx != 0 && (idx & 3) == 0)
+ 			printf(":");
+ 		printf("%02x", sb->set_uuid[idx]);
+ 	}
+ 	if (sb->set_name[0]) {
+ 		printf(" name=");
+ 		print_quoted(sb->set_name);
+ 	}
+ 	printf("\n");
+ }
+
+ /*
+  * Print the superblock in st->sb as KEY=value lines (MD_LEVEL, MD_DEVICES,
+  * MD_NAME, MD_ARRAY_SIZE, MD_UUID, MD_UPDATE_TIME, MD_DEV_UUID, MD_EVENTS).
+  * MD_NAME is cut at the first newline or NUL and omitted when empty;
+  * MD_ARRAY_SIZE is omitted when it cannot be computed.
+  */
+ static void export_examine_super1(struct supertype *st)
+ {
+ 	struct mdp_superblock_1 *sb = st->sb;
+ 	int i;
+ 	int len = 32;
+ 	int layout;
+
+ 	printf("MD_LEVEL=%s\n", map_num(pers, __le32_to_cpu(sb->level)));
+ 	printf("MD_DEVICES=%d\n", __le32_to_cpu(sb->raid_disks));
+ 	for (i = 0; i < 32; i++)
+ 		if (sb->set_name[i] == '\n' || sb->set_name[i] == '\0') {
+ 			len = i;
+ 			break;
+ 		}
+ 	if (len)
+ 		printf("MD_NAME=%.*s\n", len, sb->set_name);
+ 	if (__le32_to_cpu(sb->level) > 0) {
+ 		int ddsks = 0, ddsks_denom = 1;
+ 		switch(__le32_to_cpu(sb->level)) {
+ 		case 1:
+ 			ddsks = 1;
+ 			break;
+ 		case 4:
+ 		case 5:
+ 			ddsks = __le32_to_cpu(sb->raid_disks)-1;
+ 			break;
+ 		case 6:
+ 			ddsks = __le32_to_cpu(sb->raid_disks)-2;
+ 			break;
+ 		case 10:
+ 			layout = __le32_to_cpu(sb->layout);
+ 			ddsks = __le32_to_cpu(sb->raid_disks);
+ 			ddsks_denom = (layout&255) * ((layout>>8)&255);
+ 			break;
+ 		default:
+ 			break;
+ 		}
+ 		/* a RAID10 layout with a zero copy count would otherwise
+ 		 * divide by zero; leave MD_ARRAY_SIZE out in that case
+ 		 */
+ 		if (ddsks && ddsks_denom) {
+ 			long long asize = __le64_to_cpu(sb->size);
+ 			asize = (asize << 9) * ddsks / ddsks_denom;
+ 			printf("MD_ARRAY_SIZE=%s\n",
+ 			       human_size_brief(asize, JEDEC));
+ 		}
+ 	}
+ 	printf("MD_UUID=");
+ 	for (i = 0; i < 16; i++) {
+ 		if ((i&3) == 0 && i != 0)
+ 			printf(":");
+ 		printf("%02x", sb->set_uuid[i]);
+ 	}
+ 	printf("\n");
+ 	/* only the low 40 bits of utime are printed */
+ 	printf("MD_UPDATE_TIME=%llu\n",
+ 	       __le64_to_cpu(sb->utime) & 0xFFFFFFFFFFULL);
+ 	printf("MD_DEV_UUID=");
+ 	for (i = 0; i < 16; i++) {
+ 		if ((i&3) == 0 && i != 0)
+ 			printf(":");
+ 		printf("%02x", sb->device_uuid[i]);
+ 	}
+ 	printf("\n");
+ 	printf("MD_EVENTS=%llu\n",
+ 	       (unsigned long long)__le64_to_cpu(sb->events));
+ }
+
+ /*
+  * Copy version-1 metadata from descriptor 'from' to descriptor 'to'.
+  *
+  * Read superblock. If it looks good, write it out.
+  * Then if a bitmap is present, copy that.
+  * And if a bad-block-list is present, copy that too.
+  *
+  * st->minor_version selects where the superblock is expected (0: near the
+  * end of the device, 1: sector 0, 2: sector 8).
+  * Returns 0 on success and 1 on any failure.
+  */
+ static int copy_metadata1(struct supertype *st, int from, int to)
+ {
+ 	void *buf;
+ 	unsigned long long dsize, sb_offset;
+ 	const int bufsize = 4*1024;
+ 	struct mdp_superblock_1 super, *sb;
+
+ 	if (posix_memalign(&buf, 4096, bufsize) != 0)
+ 		return 1;
+
+ 	if (!get_dev_size(from, NULL, &dsize))
+ 		goto err;
+
+ 	dsize >>= 9;
+ 	if (dsize < 24)
+ 		goto err;
+ 	switch(st->minor_version) {
+ 	case 0:
+ 		/* at least 8K from the end, aligned down to 4K */
+ 		sb_offset = dsize;
+ 		sb_offset -= 8*2;
+ 		sb_offset &= ~(4*2-1);
+ 		break;
+ 	case 1:
+ 		sb_offset = 0;
+ 		break;
+ 	case 2:
+ 		sb_offset = 4*2;
+ 		break;
+ 	default:
+ 		goto err;
+ 	}
+
+ 	if (lseek64(from, sb_offset << 9, 0) < 0LL)
+ 		goto err;
+ 	if (read(from, buf, bufsize) != bufsize)
+ 		goto err;
+
+ 	sb = buf;
+ 	super = *sb; // save most of sb for when we reuse buf
+
+ 	/*
+ 	 * max_dev is checked before the checksum: calc_sb_1_csum() walks
+ 	 * sizeof(*sb) + 2*max_dev bytes, which must stay inside the
+ 	 * bufsize-byte buffer.
+ 	 */
+ 	if (__le32_to_cpu(super.magic) != MD_SB_MAGIC ||
+ 	    __le32_to_cpu(super.major_version) != 1 ||
+ 	    __le64_to_cpu(super.super_offset) != sb_offset ||
+ 	    __le32_to_cpu(super.max_dev) > MAX_DEVS ||
+ 	    calc_sb_1_csum(sb) != super.sb_csum)
+ 		goto err;
+
+ 	if (lseek64(to, sb_offset << 9, 0) < 0LL)
+ 		goto err;
+ 	if (write(to, buf, bufsize) != bufsize)
+ 		goto err;
+
+ 	if (super.feature_map & __cpu_to_le32(MD_FEATURE_BITMAP_OFFSET)) {
+ 		unsigned long long bitmap_offset = sb_offset;
+ 		int bytes = 4096; // just an estimate.
+ 		int written = 0;
+ 		struct align_fd afrom, ato;
+
+ 		init_afd(&afrom, from);
+ 		init_afd(&ato, to);
+
+ 		/* bitmap_offset is signed and relative to the superblock */
+ 		bitmap_offset += (int32_t)__le32_to_cpu(super.bitmap_offset);
+
+ 		if (lseek64(from, bitmap_offset<<9, 0) < 0)
+ 			goto err;
+ 		if (lseek64(to, bitmap_offset<<9, 0) < 0)
+ 			goto err;
+
+ 		for (written = 0; written < bytes ; ) {
+ 			int n = bytes - written;
+ 			if (n > 4096)
+ 				n = 4096;
+ 			if (aread(&afrom, buf, n) != n)
+ 				goto err;
+ 			if (written == 0) {
+ 				/* have the header, can calculate
+ 				 * correct bitmap bytes */
+ 				bitmap_super_t *bms;
+ 				bms = (void*)buf;
+ 				bytes = calc_bitmap_size(bms, 512);
+ 				if (n > bytes)
+ 					n = bytes;
+ 			}
+ 			if (awrite(&ato, buf, n) != n)
+ 				goto err;
+ 			written += n;
+ 		}
+ 	}
+
+ 	if (super.bblog_size != 0 &&
+ 	    __le16_to_cpu(super.bblog_size) <= 100 &&
+ 	    super.bblog_offset != 0 &&
+ 	    (super.feature_map & __cpu_to_le32(MD_FEATURE_BAD_BLOCKS))) {
+ 		/* There is a bad block log */
+ 		unsigned long long bb_offset = sb_offset;
+ 		int bytes = __le16_to_cpu(super.bblog_size) * 512;
+ 		int written = 0;
+ 		struct align_fd afrom, ato;
+
+ 		init_afd(&afrom, from);
+ 		init_afd(&ato, to);
+
+ 		/* bblog_offset is signed and relative to the superblock */
+ 		bb_offset += (int32_t)__le32_to_cpu(super.bblog_offset);
+
+ 		if (lseek64(from, bb_offset<<9, 0) < 0)
+ 			goto err;
+ 		if (lseek64(to, bb_offset<<9, 0) < 0)
+ 			goto err;
+
+ 		for (written = 0; written < bytes ; ) {
+ 			int n = bytes - written;
+ 			if (n > 4096)
+ 				n = 4096;
+ 			if (aread(&afrom, buf, n) != n)
+ 				goto err;
+
+ 			if (awrite(&ato, buf, n) != n)
+ 				goto err;
+ 			written += n;
+ 		}
+ 	}
+
+ 	free(buf);
+ 	return 0;
+
+ err:
+ 	free(buf);
+ 	return 1;
+ }
+
+ /*
+  * Print the name, optional cluster name, UUID and event count of the
+  * superblock in st->sb.  'homehost' (may be NULL) is used to flag arrays
+  * whose name starts with "<homehost>:"; 'subarray' is not used here.
+  */
+ static void detail_super1(struct supertype *st, char *homehost, char *subarray)
+ {
+ 	struct mdp_superblock_1 *sb = st->sb;
+ 	bitmap_super_t *bms = (bitmap_super_t*)(((char*)sb) + MAX_SB_SIZE);
+ 	int host_len = 0;
+ 	int idx;
+
+ 	if (homehost)
+ 		host_len = strlen(homehost);
+
+ 	printf(" Name : %.32s", sb->set_name);
+ 	if (host_len > 0 && host_len < 32 &&
+ 	    sb->set_name[host_len] == ':' &&
+ 	    strncmp(sb->set_name, homehost, host_len) == 0)
+ 		printf(" (local to host %s)", homehost);
+
+ 	if (bms->nodes > 0 &&
+ 	    (__le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET))
+ 		printf("\n Cluster Name : %-64s", bms->cluster_name);
+
+ 	printf("\n UUID : ");
+ 	for (idx = 0; idx < 16; idx++) {
+ 		if (idx != 0 && (idx & 3) == 0)
+ 			printf(":");
+ 		printf("%02x", sb->set_uuid[idx]);
+ 	}
+
+ 	printf("\n Events : %llu\n\n",
+ 	       (unsigned long long)__le64_to_cpu(sb->events));
+ }
+
+ /*
+  * Print " name=<quoted name>" (when a name is set) followed by
+  * " UUID=<hex>" with a ':' between each group of four bytes.
+  * 'subarray' is not used here.
+  */
+ static void brief_detail_super1(struct supertype *st, char *subarray)
+ {
+ 	struct mdp_superblock_1 *sb = st->sb;
+ 	int idx = 0;
+
+ 	if (sb->set_name[0] != 0) {
+ 		printf(" name=");
+ 		print_quoted(sb->set_name);
+ 	}
+
+ 	printf(" UUID=");
+ 	while (idx < 16) {
+ 		if (idx != 0 && (idx & 3) == 0)
+ 			printf(":");
+ 		printf("%02x", sb->set_uuid[idx]);
+ 		idx++;
+ 	}
+ }
+
+ /*
+  * Print "MD_NAME=<name>" using the part of set_name before the first
+  * newline or NUL (at most 32 bytes); nothing is printed for an empty name.
+  */
+ static void export_detail_super1(struct supertype *st)
+ {
+ 	struct mdp_superblock_1 *sb = st->sb;
+ 	int len = 0;
+
+ 	while (len < 32 &&
+ 	       sb->set_name[len] != '\n' &&
+ 	       sb->set_name[len] != '\0')
+ 		len++;
+
+ 	if (len == 0)
+ 		return;
+
+ 	printf("MD_NAME=%.*s\n", len, sb->set_name);
+ }
+
+ /*
+  * Read the bad-block log described by the superblock in st->sb from 'fd'
+  * and print every entry up to the first all-ones terminator.
+  *
+  * 'devname' is only used in messages.
+  * Returns 1 when seeking to or reading the log fails, 0 otherwise
+  * (including "no list configured", "list empty" and allocation failure,
+  * which are reported but not treated as errors).
+  * The temporary buffer is released on every path.
+  */
+ static int examine_badblocks_super1(struct supertype *st, int fd, char *devname)
+ {
+ 	struct mdp_superblock_1 *sb = st->sb;
+ 	unsigned long long offset;
+ 	int size;
+ 	__u64 *bbl, *bbp;
+ 	int i;
+ 	int rv = 1;
+
+ 	if (!sb->bblog_size || __le16_to_cpu(sb->bblog_size) > 100 ||
+ 	    !sb->bblog_offset){
+ 		printf("No bad-blocks list configured on %s\n", devname);
+ 		return 0;
+ 	}
+ 	if ((sb->feature_map & __cpu_to_le32(MD_FEATURE_BAD_BLOCKS)) == 0) {
+ 		printf("Bad-blocks list is empty in %s\n", devname);
+ 		return 0;
+ 	}
+
+ 	size = __le16_to_cpu(sb->bblog_size)* 512;
+ 	if (posix_memalign((void**)&bbl, 4096, size) != 0) {
+ 		pr_err("could not allocate badblocks list\n");
+ 		return 0;
+ 	}
+ 	/* bblog_offset is signed and relative to the superblock */
+ 	offset = __le64_to_cpu(sb->super_offset) +
+ 		(int)__le32_to_cpu(sb->bblog_offset);
+ 	offset <<= 9;
+ 	if (lseek64(fd, offset, 0) < 0) {
+ 		pr_err("Cannot seek to bad-blocks list\n");
+ 		goto out;
+ 	}
+ 	if (read(fd, bbl, size) != size) {
+ 		pr_err("Cannot read bad-blocks list\n");
+ 		goto out;
+ 	}
+ 	/* 64bits per entry. 10 bits is block-count, 54 bits is block
+ 	 * offset. Blocks are sectors unless bblog->shift makes them bigger
+ 	 */
+ 	bbp = (__u64*)bbl;
+ 	printf("Bad-blocks on %s:\n", devname);
+ 	for (i = 0; i < size/8; i++, bbp++) {
+ 		__u64 bb = __le64_to_cpu(*bbp);
+ 		int count = bb & 0x3ff;
+ 		unsigned long long sector = bb >> 10;
+
+ 		/* an all-ones entry terminates the list */
+ 		if (bb + 1 == 0)
+ 			break;
+
+ 		sector <<= sb->bblog_shift;
+ 		count <<= sb->bblog_shift;
+
+ 		printf("%20llu for %d sectors\n", sector, count);
+ 	}
+ 	rv = 0;
+
+ out:
+ 	free(bbl);
+ 	return rv;
+ }
+
+ /*
+  * Return 1 when set_name has the form "<homehost>:...", 0 otherwise
+  * (including when homehost is NULL, empty, or 32 or more characters long).
+  */
+ static int match_home1(struct supertype *st, char *homehost)
+ {
+ 	struct mdp_superblock_1 *sb = st->sb;
+ 	int host_len;
+
+ 	if (!homehost)
+ 		return 0;
+
+ 	host_len = strlen(homehost);
+ 	if (host_len <= 0 || host_len >= 32)
+ 		return 0;
+ 	if (sb->set_name[host_len] != ':')
+ 		return 0;
+
+ 	return strncmp(sb->set_name, homehost, host_len) == 0;
+ }
+
+ /*
+  * Copy the 16 raw bytes of set_uuid from the superblock in st->sb into
+  * 'uuid', byte for byte (no endian conversion).
+  */
+ static void uuid_from_super1(struct supertype *st, int uuid[4])
+ {
+ 	struct mdp_superblock_1 *super = st->sb;
+
+ 	memcpy(uuid, super->set_uuid, 16);
+ }
+
+static void getinfo_super1(struct supertype *st, struct mdinfo *info, char *map)
+{
+ struct mdp_superblock_1 *sb = st->sb;
+ struct bitmap_super_s *bsb = (void*)(((char*)sb)+MAX_SB_SIZE);
+ struct misc_dev_info *misc =
+ (void*)(((char*)sb)+MAX_SB_SIZE+BM_SUPER_SIZE);
+ int working = 0;
+ unsigned int i;
+ unsigned int role;
+ unsigned int map_disks = info->array.raid_disks;
+ unsigned long long super_offset;
+ unsigned long long data_size;
+
+ memset(info, 0, sizeof(*info));
+ info->array.major_version = 1;
+ info->array.minor_version = st->minor_version;
+ info->array.patch_version = 0;
+ info->array.raid_disks = __le32_to_cpu(sb->raid_disks);
+ info->array.level = __le32_to_cpu(sb->level);
+ info->array.layout = __le32_to_cpu(sb->layout);
+ info->array.md_minor = -1;
+ info->array.ctime = __le64_to_cpu(sb->ctime);
+ info->array.utime = __le64_to_cpu(sb->utime);
+ info->array.chunk_size = __le32_to_cpu(sb->chunksize)*512;
+ info->array.state =
+ (__le64_to_cpu(sb->resync_offset) == MaxSector) ? 1 : 0;
+
+ super_offset = __le64_to_cpu(sb->super_offset);
+ info->data_offset = __le64_to_cpu(sb->data_offset);
+ info->component_size = __le64_to_cpu(sb->size);
+ if (sb->feature_map & __le32_to_cpu(MD_FEATURE_BITMAP_OFFSET)) {
+ info->bitmap_offset = (int32_t)__le32_to_cpu(sb->bitmap_offset);
+ if (__le32_to_cpu(bsb->nodes) > 1)
+ info->array.state |= (1 << MD_SB_CLUSTERED);
+ } else if (md_feature_any_ppl_on(sb->feature_map)) {
+ info->ppl_offset = __le16_to_cpu(sb->ppl.offset);
+ info->ppl_size = __le16_to_cpu(sb->ppl.size);
+ info->ppl_sector = super_offset + info->ppl_offset;
+ }
+
+ info->disk.major = 0;
+ info->disk.minor = 0;
+ info->disk.number = __le32_to_cpu(sb->dev_number);
+ if (__le32_to_cpu(sb->dev_number) >= __le32_to_cpu(sb->max_dev) ||
+ __le32_to_cpu(sb->dev_number) >= MAX_DEVS)
+ role = MD_DISK_ROLE_FAULTY;
+ else
+ role = __le16_to_cpu(sb->dev_roles[__le32_to_cpu(sb->dev_number)]);
+
+ if (info->array.level <= 0)
+ data_size = __le64_to_cpu(sb->data_size);
+ else
+ data_size = __le64_to_cpu(sb->size);
+ if (info->data_offset < super_offset) {
+ unsigned long long end;
+ info->space_before = info->data_offset;
+ end = super_offset;
+
+ if (sb->bblog_offset && sb->bblog_size) {
+ unsigned long long bboffset = super_offset;
+ bboffset += (int32_t)__le32_to_cpu(sb->bblog_offset);
+ if (bboffset < end)
+ end = bboffset;
+ }
+
+ if (super_offset + info->bitmap_offset + info->ppl_offset < end)
+ end = super_offset + info->bitmap_offset +
+ info->ppl_offset;
+
+ if (info->data_offset + data_size < end)
+ info->space_after = end - data_size - info->data_offset;
+ else
+ info->space_after = 0;
+ } else {
+ unsigned long long earliest;
+ earliest = super_offset + (32+4)*2; /* match kernel */
+ if (info->bitmap_offset > 0) {
+ unsigned long long bmend = info->bitmap_offset;
+ unsigned long long size = calc_bitmap_size(bsb, 4096);
+ size /= 512;
+ bmend += size;
+ if (bmend > earliest)
+ earliest = bmend;
+ } else if (info->ppl_offset > 0) {
+ unsigned long long pplend;
+
+ pplend = info->ppl_offset + info->ppl_size;
+ if (pplend > earliest)
+ earliest = pplend;
+ }
+ if (sb->bblog_offset && sb->bblog_size) {
+ unsigned long long bbend = super_offset;
+ bbend += (int32_t)__le32_to_cpu(sb->bblog_offset);
+ bbend += __le16_to_cpu(sb->bblog_size);
+ if (bbend > earliest)
+ earliest = bbend;
+ }
+ if (earliest < info->data_offset)
+ info->space_before = info->data_offset - earliest;
+ else
+ info->space_before = 0;
+ info->space_after = misc->device_size - data_size -
+ info->data_offset;
+ }
+ if (info->space_before == 0 && info->space_after == 0) {
+ /* It will look like we don't support data_offset changes,
+ * be we do - it's just that there is no room.
+ * A change that reduced the number of devices should
+ * still be allowed, so set the otherwise useless value of '1'
+ */
+ info->space_after = 1;
+ }
+
+ info->disk.raid_disk = -1;
+ switch(role) {
+ case MD_DISK_ROLE_SPARE:
+ /* spare: not active, not sync, not faulty */
+ info->disk.state = 0;
+ break;
+ case MD_DISK_ROLE_FAULTY:
+ info->disk.state = (1 << MD_DISK_FAULTY); /* faulty */
+ break;
+ case MD_DISK_ROLE_JOURNAL:
+ info->disk.state = (1 << MD_DISK_JOURNAL);
+ info->disk.raid_disk = role;
+ /* journal uses all 4kB blocks*/
+ info->space_after = (misc->device_size - info->data_offset) % 8;
+ break;
+ default:
+ info->disk.state = 6; /* active and in sync */
+ info->disk.raid_disk = role;
+ }
+ if (sb->devflags & WriteMostly1)
+ info->disk.state |= (1 << MD_DISK_WRITEMOSTLY);
+ if (sb->devflags & FailFast1)
+ info->disk.state |= (1 << MD_DISK_FAILFAST);
+ info->events = __le64_to_cpu(sb->events);
+ sprintf(info->text_version, "1.%d", st->minor_version);
+ info->safe_mode_delay = 200;
+
+ memcpy(info->uuid, sb->set_uuid, 16);
+
+ strncpy(info->name, sb->set_name, 32);
+ info->name[32] = 0;
+
+ if ((__le32_to_cpu(sb->feature_map)&MD_FEATURE_REPLACEMENT)) {
+ info->disk.state &= ~(1 << MD_DISK_SYNC);
+ info->disk.state |= 1 << MD_DISK_REPLACEMENT;
+ }
+
+ if (sb->feature_map & __le32_to_cpu(MD_FEATURE_RECOVERY_OFFSET))
+ info->recovery_start = __le32_to_cpu(sb->recovery_offset);
+ else
+ info->recovery_start = MaxSector;
+
+ if (sb->feature_map & __le32_to_cpu(MD_FEATURE_RESHAPE_ACTIVE)) {
+ info->reshape_active = 1;
+ if ((sb->feature_map & __le32_to_cpu(MD_FEATURE_NEW_OFFSET)) &&
+ sb->new_offset != 0)
+ info->reshape_active |= RESHAPE_NO_BACKUP;
+ info->reshape_progress = __le64_to_cpu(sb->reshape_position);
+ info->new_level = __le32_to_cpu(sb->new_level);
+ info->delta_disks = __le32_to_cpu(sb->delta_disks);
+ info->new_layout = __le32_to_cpu(sb->new_layout);
+ info->new_chunk = __le32_to_cpu(sb->new_chunk)<<9;
+ if (info->delta_disks < 0)
+ info->array.raid_disks -= info->delta_disks;
+ } else
+ info->reshape_active = 0;
+
+ info->recovery_blocked = info->reshape_active;
+
+ if (map)
+ for (i=0; i<map_disks; i++)
+ map[i] = 0;
+ for (i = 0; i < __le32_to_cpu(sb->max_dev); i++) {
+ role = __le16_to_cpu(sb->dev_roles[i]);
+ if (/*role == MD_DISK_ROLE_SPARE || */role < (unsigned) info->array.raid_disks) {
+ working++;
+ if (map && role < map_disks)
+ map[role] = 1;
+ }
+ }
+
+ info->array.working_disks = working;
+
+ if (sb->feature_map & __le32_to_cpu(MD_FEATURE_JOURNAL)) {
+ info->journal_device_required = 1;
+ info->consistency_policy = CONSISTENCY_POLICY_JOURNAL;
+ } else if (md_feature_any_ppl_on(sb->feature_map)) {
+ info->consistency_policy = CONSISTENCY_POLICY_PPL;
+ } else if (sb->feature_map & __le32_to_cpu(MD_FEATURE_BITMAP_OFFSET)) {
+ info->consistency_policy = CONSISTENCY_POLICY_BITMAP;
+ } else if (info->array.level <= 0) {
+ info->consistency_policy = CONSISTENCY_POLICY_NONE;
+ } else {
+ info->consistency_policy = CONSISTENCY_POLICY_RESYNC;
+ }
+
+ info->journal_clean = 0;
+}
+
+/*
+ * Return a newly allocated mdinfo filled in by getinfo_super1() for @st.
+ * A non-NULL @subarray is rejected: NULL is returned and nothing is
+ * allocated.
+ */
+static struct mdinfo *container_content1(struct supertype *st, char *subarray)
+{
+	struct mdinfo *content = NULL;
+
+	if (subarray == NULL) {
+		content = xmalloc(sizeof(*content));
+		getinfo_super1(st, content, NULL);
+	}
+	return content;
+}
+
+/*
+ * Apply one named modification to the v1.x superblock held in st->sb.
+ *
+ * @update selects the operation by string comparison; the names handled
+ * here are: homehost (rewritten to "name"), force-one, force-array,
+ * assemble, linear-grow-new, linear-grow-update, resync, uuid, no-bitmap,
+ * bbl, no-bbl, force-no-bbl, ppl, no-ppl, name, devicesize,
+ * revert-reshape[-nobackup], _reshape_progress, writemostly, readwrite,
+ * failfast, nofailfast and layout-{original,alternate,unspecified}.
+ *
+ * @info supplies the new values (events, name, uuid, disk, reshape state),
+ * @devname is used for messages and, for linear-grow-new, to size the device.
+ * @verbose and @uuid_set are not referenced in this function.
+ *
+ * Returns rv: 0 by default, 1 when force-one/force-array/assemble changed
+ * something, -1 for an unrecognised @update or a layout-* request on a
+ * non-RAID0 array, -2 on the explicit error returns below.  The superblock
+ * checksum is recomputed on every path that reaches the end of the function;
+ * the early "return -2" paths leave sb_csum untouched.
+ */
+static int update_super1(struct supertype *st, struct mdinfo *info,
+ char *update, char *devname, int verbose,
+ int uuid_set, char *homehost)
+{
+ /* NOTE: for 'assemble' and 'force' we need to return non-zero
+ * if any change was made. For others, the return value is
+ * ignored.
+ */
+ int rv = 0;
+ struct mdp_superblock_1 *sb = st->sb;
+ /* the bitmap superblock lives MAX_SB_SIZE bytes into the same buffer */
+ bitmap_super_t *bms = (bitmap_super_t*)(((char*)sb) + MAX_SB_SIZE);
+
+ if (strcmp(update, "homehost") == 0 &&
+ homehost) {
+ /* Note that 'homehost' is special as it is really
+ * a "name" update.
+ */
+ char *c;
+ update = "name";
+ /* keep only the part after any existing "host:" prefix */
+ c = strchr(sb->set_name, ':');
+ if (c)
+ strncpy(info->name, c+1, 31 - (c-sb->set_name));
+ else
+ strncpy(info->name, sb->set_name, 32);
+ info->name[32] = 0;
+ }
+
+ if (strcmp(update, "force-one")==0) {
+ /* Not enough devices for a working array,
+ * so bring this one up-to-date
+ */
+ if (sb->events != __cpu_to_le64(info->events))
+ rv = 1;
+ sb->events = __cpu_to_le64(info->events);
+ } else if (strcmp(update, "force-array")==0) {
+ /* Degraded array and 'force' requests to
+ * maybe need to mark it 'clean'.
+ */
+ switch(__le32_to_cpu(sb->level)) {
+ case 4:
+ case 5:
+ case 6:
+ /* need to force clean */
+ if (sb->resync_offset != MaxSector)
+ rv = 1;
+ sb->resync_offset = MaxSector;
+ }
+ } else if (strcmp(update, "assemble")==0) {
+ /* NOTE(review): 'd' indexes dev_roles[] without a range check here */
+ int d = info->disk.number;
+ int want;
+ if (info->disk.state & (1<<MD_DISK_ACTIVE))
+ want = info->disk.raid_disk;
+ else if (info->disk.state & (1<<MD_DISK_JOURNAL))
+ want = MD_DISK_ROLE_JOURNAL;
+ else
+ want = MD_DISK_ROLE_SPARE;
+ if (sb->dev_roles[d] != __cpu_to_le16(want)) {
+ sb->dev_roles[d] = __cpu_to_le16(want);
+ rv = 1;
+ }
+ /* growing/same-size reshape: only ever move reshape_position down */
+ if (info->reshape_active &&
+ sb->feature_map &
+ __le32_to_cpu(MD_FEATURE_RESHAPE_ACTIVE) &&
+ info->delta_disks >= 0 &&
+ info->reshape_progress <
+ __le64_to_cpu(sb->reshape_position)) {
+ sb->reshape_position =
+ __cpu_to_le64(info->reshape_progress);
+ rv = 1;
+ }
+ /* shrinking reshape: only ever move reshape_position up */
+ if (info->reshape_active &&
+ sb->feature_map &
+ __le32_to_cpu(MD_FEATURE_RESHAPE_ACTIVE) &&
+ info->delta_disks < 0 &&
+ info->reshape_progress >
+ __le64_to_cpu(sb->reshape_position)) {
+ sb->reshape_position =
+ __cpu_to_le64(info->reshape_progress);
+ rv = 1;
+ }
+ } else if (strcmp(update, "linear-grow-new") == 0) {
+ int i;
+ int fd;
+ int max = __le32_to_cpu(sb->max_dev);
+
+ if (max > MAX_DEVS)
+ return -2;
+
+ /* find the first slot whose role is >= MD_DISK_ROLE_FAULTY */
+ for (i = 0; i < max; i++)
+ if (__le16_to_cpu(sb->dev_roles[i]) >=
+ MD_DISK_ROLE_FAULTY)
+ break;
+ if (i != info->disk.number)
+ return -2;
+ sb->dev_number = __cpu_to_le32(i);
+
+ if (i == max)
+ sb->max_dev = __cpu_to_le32(max+1);
+ if (i > max)
+ return -2;
+
+ random_uuid(sb->device_uuid);
+
+ sb->dev_roles[i] = __cpu_to_le16(info->disk.raid_disk);
+
+ /* best effort: if the device cannot be opened the sizes are left as-is */
+ fd = open(devname, O_RDONLY);
+ if (fd >= 0) {
+ unsigned long long ds;
+ /* NOTE(review): return value of get_dev_size() is not checked */
+ get_dev_size(fd, devname, &ds);
+ close(fd);
+ ds >>= 9;
+ if (__le64_to_cpu(sb->super_offset) <
+ __le64_to_cpu(sb->data_offset)) {
+ sb->data_size = __cpu_to_le64(
+ ds - __le64_to_cpu(sb->data_offset));
+ } else {
+ /* superblock after the data: same placement rule as minor version 0 */
+ ds -= 8*2;
+ ds &= ~(unsigned long long)(4*2-1);
+ sb->super_offset = __cpu_to_le64(ds);
+ sb->data_size = __cpu_to_le64(
+ ds - __le64_to_cpu(sb->data_offset));
+ }
+ }
+ } else if (strcmp(update, "linear-grow-update") == 0) {
+ int max = __le32_to_cpu(sb->max_dev);
+ int i = info->disk.number;
+ if (max > MAX_DEVS || i > MAX_DEVS)
+ return -2;
+ if (i > max)
+ return -2;
+ if (i == max)
+ sb->max_dev = __cpu_to_le32(max+1);
+ sb->raid_disks = __cpu_to_le32(info->array.raid_disks);
+ sb->dev_roles[info->disk.number] =
+ __cpu_to_le16(info->disk.raid_disk);
+ } else if (strcmp(update, "resync") == 0) {
+ /* make sure resync happens */
+ sb->resync_offset = 0ULL;
+ } else if (strcmp(update, "uuid") == 0) {
+ copy_uuid(sb->set_uuid, info->uuid, super1.swapuuid);
+
+ /* keep the bitmap superblock's uuid in step with the array's */
+ if (__le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET)
+ memcpy(bms->uuid, sb->set_uuid, 16);
+ } else if (strcmp(update, "no-bitmap") == 0) {
+ sb->feature_map &= ~__cpu_to_le32(MD_FEATURE_BITMAP_OFFSET);
+ /* NOTE(review): bms->version is compared without a byte-order
+ * conversion, unlike bm->magic in store_super1 - confirm.
+ */
+ if (bms->version == BITMAP_MAJOR_CLUSTERED && !IsBitmapDirty(devname))
+ sb->resync_offset = MaxSector;
+ } else if (strcmp(update, "bbl") == 0) {
+ /* only possible if there is room after the bitmap, or if
+ * there is no bitmap
+ */
+ unsigned long long sb_offset = __le64_to_cpu(sb->super_offset);
+ unsigned long long data_offset = __le64_to_cpu(sb->data_offset);
+ long bitmap_offset = 0;
+ long bm_sectors = 0;
+ long space;
+
+ /* a PPL area, when present, is treated like a bitmap for placement */
+ if (sb->feature_map & __cpu_to_le32(MD_FEATURE_BITMAP_OFFSET)) {
+ bitmap_offset = (long)__le32_to_cpu(sb->bitmap_offset);
+ bm_sectors = calc_bitmap_size(bms, 4096) >> 9;
+ } else if (md_feature_any_ppl_on(sb->feature_map)) {
+ bitmap_offset = (long)__le16_to_cpu(sb->ppl.offset);
+ bm_sectors = (long)__le16_to_cpu(sb->ppl.size);
+ }
+
+ if (sb_offset < data_offset) {
+ /*
+ * 1.1 or 1.2. Put bbl after bitmap leaving
+ * at least 32K
+ */
+ long bb_offset;
+ bb_offset = sb_offset + 8;
+ if (bm_sectors && bitmap_offset > 0)
+ bb_offset = bitmap_offset + bm_sectors;
+ while (bb_offset < (long)sb_offset + 8 + 32*2 &&
+ bb_offset + 8+8 <= (long)data_offset)
+ /* too close to bitmap, and room to grow */
+ bb_offset += 8;
+ if (bb_offset + 8 <= (long)data_offset) {
+ sb->bblog_size = __cpu_to_le16(8);
+ sb->bblog_offset = __cpu_to_le32(bb_offset);
+ }
+ } else {
+ /* 1.0 - Put bbl just before super block */
+ if (bm_sectors && bitmap_offset < 0)
+ space = -bitmap_offset - bm_sectors;
+ else
+ space = sb_offset - data_offset -
+ __le64_to_cpu(sb->data_size);
+ if (space >= 8) {
+ sb->bblog_size = __cpu_to_le16(8);
+ sb->bblog_offset = __cpu_to_le32((unsigned)-8);
+ }
+ }
+ } else if (strcmp(update, "no-bbl") == 0) {
+ if (sb->feature_map & __cpu_to_le32(MD_FEATURE_BAD_BLOCKS))
+ pr_err("Cannot remove active bbl from %s\n",devname);
+ else {
+ sb->bblog_size = 0;
+ sb->bblog_shift = 0;
+ sb->bblog_offset = 0;
+ }
+ } else if (strcmp(update, "force-no-bbl") == 0) {
+ sb->feature_map &= ~ __cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
+ sb->bblog_size = 0;
+ sb->bblog_shift = 0;
+ sb->bblog_offset = 0;
+ } else if (strcmp(update, "ppl") == 0) {
+ unsigned long long sb_offset = __le64_to_cpu(sb->super_offset);
+ unsigned long long data_offset = __le64_to_cpu(sb->data_offset);
+ unsigned long long data_size = __le64_to_cpu(sb->data_size);
+ /* NOTE(review): bblog_offset is read without the (int32_t) cast
+ * used for it elsewhere in this file - confirm negative offsets
+ * behave as intended where long is 64 bits.
+ */
+ long bb_offset = __le32_to_cpu(sb->bblog_offset);
+ int space;
+ int offset;
+
+ if (sb->feature_map & __cpu_to_le32(MD_FEATURE_BITMAP_OFFSET)) {
+ pr_err("Cannot add PPL to array with bitmap\n");
+ return -2;
+ }
+
+ if (sb->feature_map & __cpu_to_le32(MD_FEATURE_JOURNAL)) {
+ pr_err("Cannot add PPL to array with journal\n");
+ return -2;
+ }
+
+ if (sb_offset < data_offset) {
+ /* superblock before data: PPL starts 8 sectors after the superblock */
+ if (bb_offset)
+ space = bb_offset - 8;
+ else
+ space = data_offset - sb_offset - 8;
+ offset = 8;
+ } else {
+ /* superblock after data: PPL gets a negative offset, clamped to 16 bits */
+ offset = -(sb_offset - data_offset - data_size);
+ if (offset < INT16_MIN)
+ offset = INT16_MIN;
+ space = -(offset - bb_offset);
+ }
+
+ if (space < (PPL_HEADER_SIZE >> 9) + 8) {
+ pr_err("Not enough space to add ppl\n");
+ return -2;
+ }
+
+ if (space >= (MULTIPLE_PPL_AREA_SIZE_SUPER1 >> 9)) {
+ space = (MULTIPLE_PPL_AREA_SIZE_SUPER1 >> 9);
+ } else {
+ int optimal_space = choose_ppl_space(
+ __le32_to_cpu(sb->chunksize));
+ if (space > optimal_space)
+ space = optimal_space;
+ if (space > UINT16_MAX)
+ space = UINT16_MAX;
+ }
+
+ sb->ppl.offset = __cpu_to_le16(offset);
+ sb->ppl.size = __cpu_to_le16(space);
+ sb->feature_map |= __cpu_to_le32(MD_FEATURE_PPL);
+ } else if (strcmp(update, "no-ppl") == 0) {
+ sb->feature_map &= ~__cpu_to_le32(MD_FEATURE_PPL |
+ MD_FEATURE_MUTLIPLE_PPLS);
+ } else if (strcmp(update, "name") == 0) {
+ /* an empty name falls back to the md minor number */
+ if (info->name[0] == 0)
+ sprintf(info->name, "%d", info->array.md_minor);
+ memset(sb->set_name, 0, sizeof(sb->set_name));
+ /* use "homehost:name" only when the name has no ':' and it all fits */
+ if (homehost &&
+ strchr(info->name, ':') == NULL &&
+ strlen(homehost)+1+strlen(info->name) < 32) {
+ strcpy(sb->set_name, homehost);
+ strcat(sb->set_name, ":");
+ strcat(sb->set_name, info->name);
+ } else {
+ int namelen;
+
+ namelen = min((int)strlen(info->name),
+ (int)sizeof(sb->set_name) - 1);
+ memcpy(sb->set_name, info->name, namelen);
+ memset(&sb->set_name[namelen], '\0',
+ sizeof(sb->set_name) - namelen);
+ }
+ } else if (strcmp(update, "devicesize") == 0 &&
+ __le64_to_cpu(sb->super_offset) <
+ __le64_to_cpu(sb->data_offset)) {
+ /* set data_size to device size less data_offset */
+ struct misc_dev_info *misc = (struct misc_dev_info*)
+ (st->sb + MAX_SB_SIZE + BM_SUPER_SIZE);
+ sb->data_size = __cpu_to_le64(
+ misc->device_size - __le64_to_cpu(sb->data_offset));
+ } else if (strncmp(update, "revert-reshape", 14) == 0) {
+ /* prefix match: also accepts "revert-reshape-nobackup" */
+ rv = -2;
+ if (!(sb->feature_map &
+ __cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE)))
+ pr_err("No active reshape to revert on %s\n",
+ devname);
+ else {
+ __u32 temp;
+ unsigned long long reshape_sectors;
+ long reshape_chunk;
+ rv = 0;
+ /* If the reshape hasn't started, just stop it.
+ * It is conceivable that a stripe was modified but
+ * the metadata not updated. In that case the backup
+ * should have been used to get passed the critical stage.
+ * If that couldn't happen, the "-nobackup" version
+ * will be used.
+ */
+ if (strcmp(update, "revert-reshape-nobackup") == 0 &&
+ sb->reshape_position == 0 &&
+ (__le32_to_cpu(sb->delta_disks) > 0 ||
+ (__le32_to_cpu(sb->delta_disks) == 0 &&
+ !(sb->feature_map & __cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS))))) {
+ sb->feature_map &= ~__cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE);
+ sb->raid_disks = __cpu_to_le32(__le32_to_cpu(sb->raid_disks) -
+ __le32_to_cpu(sb->delta_disks));
+ sb->delta_disks = 0;
+ goto done;
+ }
+ /* reshape_position is a little messy.
+ * Its value must be a multiple of the larger
+ * chunk size, and of the "after" data disks.
+ * So when reverting we need to change it to
+ * be a multiple of the new "after" data disks,
+ * which is the old "before".
+ * If it isn't already a multiple of 'before',
+ * the only thing we could do would be
+ * copy some block around on the disks, which
+ * is easy to get wrong.
+ * So we reject a revert-reshape unless the
+ * alignment is good.
+ */
+ if (__le32_to_cpu(sb->level) >= 4 &&
+ __le32_to_cpu(sb->level) <= 6) {
+ reshape_sectors =
+ __le64_to_cpu(sb->reshape_position);
+ reshape_chunk = __le32_to_cpu(sb->new_chunk);
+ reshape_chunk *= __le32_to_cpu(sb->raid_disks) -
+ __le32_to_cpu(sb->delta_disks) -
+ (__le32_to_cpu(sb->level)==6 ? 2 : 1);
+ /* NOTE(review): reshape_chunk == 0 would divide by zero here */
+ if (reshape_sectors % reshape_chunk) {
+ pr_err("Reshape position is not suitably aligned.\n");
+ pr_err("Try normal assembly and stop again\n");
+ return -2;
+ }
+ }
+ /* swap "before" and "after": disk count, layout, chunk size, data offset */
+ sb->raid_disks =
+ __cpu_to_le32(__le32_to_cpu(sb->raid_disks) -
+ __le32_to_cpu(sb->delta_disks));
+ if (sb->delta_disks == 0)
+ sb->feature_map ^= __cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS);
+ else
+ sb->delta_disks = __cpu_to_le32(-__le32_to_cpu(sb->delta_disks));
+
+ temp = sb->new_layout;
+ sb->new_layout = sb->layout;
+ sb->layout = temp;
+
+ temp = sb->new_chunk;
+ sb->new_chunk = sb->chunksize;
+ sb->chunksize = temp;
+
+ if (sb->feature_map &
+ __cpu_to_le32(MD_FEATURE_NEW_OFFSET)) {
+ long offset_delta =
+ (int32_t)__le32_to_cpu(sb->new_offset);
+ sb->data_offset = __cpu_to_le64(__le64_to_cpu(sb->data_offset) + offset_delta);
+ sb->new_offset = __cpu_to_le32(-offset_delta);
+ sb->data_size = __cpu_to_le64(__le64_to_cpu(sb->data_size) - offset_delta);
+ }
+ done:;
+ }
+ } else if (strcmp(update, "_reshape_progress") == 0)
+ sb->reshape_position = __cpu_to_le64(info->reshape_progress);
+ else if (strcmp(update, "writemostly") == 0)
+ sb->devflags |= WriteMostly1;
+ else if (strcmp(update, "readwrite") == 0)
+ sb->devflags &= ~WriteMostly1;
+ else if (strcmp(update, "failfast") == 0)
+ sb->devflags |= FailFast1;
+ else if (strcmp(update, "nofailfast") == 0)
+ sb->devflags &= ~FailFast1;
+ else if (strcmp(update, "layout-original") == 0 ||
+ strcmp(update, "layout-alternate") == 0 ||
+ strcmp(update, "layout-unspecified") == 0) {
+ if (__le32_to_cpu(sb->level) != 0) {
+ pr_err("%s: %s only supported for RAID0\n",
+ devname?:"", update);
+ rv = -1;
+ } else if (strcmp(update, "layout-unspecified") == 0) {
+ sb->feature_map &= ~__cpu_to_le32(MD_FEATURE_RAID0_LAYOUT);
+ sb->layout = 0;
+ } else {
+ sb->feature_map |= __cpu_to_le32(MD_FEATURE_RAID0_LAYOUT);
+ /* update[7] is the first letter after "layout-": 'o'riginal -> 1, otherwise 2 */
+ sb->layout = __cpu_to_le32(update[7] == 'o' ? 1 : 2);
+ }
+ } else
+ rv = -1;
+
+ /* any field may have changed above, so refresh the checksum */
+ sb->sb_csum = calc_sb_1_csum(sb);
+
+ return rv;
+}
+
+/*
+ * Allocate a zeroed, 4K-aligned v1.x superblock, attach it to st->sb and,
+ * when @info is given, populate it for a new array.
+ *
+ * @st:          receives the new superblock in st->sb
+ * @info:        array description; NULL means only the zeroed superblock
+ *               is wanted
+ * @s:           supplies size and consistency_policy
+ * @name:        array name; NULL or "" falls back to the md minor number
+ * @homehost:    when set, the name has no ':' and the result fits,
+ *               set_name becomes "homehost:name"
+ * @uuid:        set UUID to copy, or NULL to generate a random one
+ * @data_offset: stored in sb->data_offset
+ *
+ * Returns 1 once the superblock has been populated from @info, and 0 on
+ * allocation failure, when @info is NULL, or when more than MAX_DEVS
+ * devices are requested.  In the last two cases st->sb still points at
+ * the allocated buffer.
+ */
+static int init_super1(struct supertype *st, mdu_array_info_t *info,
+		       struct shape *s, char *name, char *homehost,
+		       int *uuid, unsigned long long data_offset)
+{
+	struct mdp_superblock_1 *sb;
+	int spares;
+	/* room for any "%d" rendering of an int, including sign and NUL */
+	char defname[16];
+	int sbsize;
+
+	if (posix_memalign((void**)&sb, 4096, SUPER1_SIZE) != 0) {
+		pr_err("could not allocate superblock\n");
+		return 0;
+	}
+	memset(sb, 0, SUPER1_SIZE);
+
+	st->sb = sb;
+	if (info == NULL) {
+		/* zeroing superblock */
+		return 0;
+	}
+
+	spares = info->working_disks - info->active_disks;
+	if (info->raid_disks + spares > MAX_DEVS) {
+		pr_err("too many devices requested: %d+%d > %d\n",
+		       info->raid_disks , spares, MAX_DEVS);
+		return 0;
+	}
+
+	sb->magic = __cpu_to_le32(MD_SB_MAGIC);
+	sb->major_version = __cpu_to_le32(1);
+	sb->feature_map = 0;
+	sb->pad0 = 0;
+
+	if (uuid)
+		copy_uuid(sb->set_uuid, uuid, super1.swapuuid);
+	else
+		random_uuid(sb->set_uuid);
+
+	if (name == NULL || *name == 0) {
+		/* bounded: md_minor is a plain int and may be large or negative */
+		snprintf(defname, sizeof(defname), "%d", info->md_minor);
+		name = defname;
+	}
+	if (homehost &&
+	    strchr(name, ':')== NULL &&
+	    strlen(homehost)+1+strlen(name) < 32) {
+		/* length was checked above, so the unbounded copies cannot overrun */
+		strcpy(sb->set_name, homehost);
+		strcat(sb->set_name, ":");
+		strcat(sb->set_name, name);
+	} else {
+		int namelen;
+
+		/* truncate so that at least one trailing NUL always remains */
+		namelen = min((int)strlen(name),
+			      (int)sizeof(sb->set_name) - 1);
+		memcpy(sb->set_name, name, namelen);
+		memset(&sb->set_name[namelen], '\0',
+		       sizeof(sb->set_name) - namelen);
+	}
+
+	sb->ctime = __cpu_to_le64((unsigned long long)time(0));
+	sb->level = __cpu_to_le32(info->level);
+	sb->layout = __cpu_to_le32(info->layout);
+	/* NOTE(review): the *2 suggests s->size is in KiB and sb->size in sectors - confirm */
+	sb->size = __cpu_to_le64(s->size*2ULL);
+	sb->chunksize = __cpu_to_le32(info->chunk_size>>9);
+	sb->raid_disks = __cpu_to_le32(info->raid_disks);
+
+	/* data_size and super_offset are left at 0 here */
+	sb->data_offset = __cpu_to_le64(data_offset);
+	sb->data_size = __cpu_to_le64(0);
+	sb->super_offset = __cpu_to_le64(0);
+	sb->recovery_offset = __cpu_to_le64(0);
+
+	sb->utime = sb->ctime;
+	sb->events = __cpu_to_le64(1);
+	if (info->state & (1<<MD_SB_CLEAN))
+		sb->resync_offset = MaxSector;
+	else
+		sb->resync_offset = 0;
+
+	/* max_dev is however many 2-byte roles fit once the size is rounded up to 512 */
+	sbsize = sizeof(struct mdp_superblock_1) +
+		2 * (info->raid_disks + spares);
+	sbsize = ROUND_UP(sbsize, 512);
+	sb->max_dev =
+		__cpu_to_le32((sbsize - sizeof(struct mdp_superblock_1)) / 2);
+
+	/* fill every role slot with 0xff bytes (0xffff is what add_to_super1
+	 * stores for a spare)
+	 */
+	memset(sb->dev_roles, 0xff,
+	       MAX_SB_SIZE - sizeof(struct mdp_superblock_1));
+
+	if (s->consistency_policy == CONSISTENCY_POLICY_PPL)
+		sb->feature_map |= __cpu_to_le32(MD_FEATURE_PPL);
+
+	return 1;
+}
+
+/*
+ * One entry per device handed to add_to_super1(); the entries form a
+ * singly linked list hanging off st->info that write_init_super1() walks.
+ */
+struct devinfo {
+ /* open descriptor for the device; write_init_super1() skips entries with fd < 0 */
+ int fd;
+ /* device name as passed by the caller (pointer is stored, not copied) */
+ char *devname;
+ /* requested data offset; INVALID_SECTORS is treated as "not specified" */
+ long long data_offset;
+ /* size as reported by get_dev_size(); NOTE(review): other callers in this
+  * file shift that value right by 9 before treating it as sectors - confirm units
+  */
+ unsigned long long dev_size;
+ /* copy of the caller's disk description (number, raid_disk, state) */
+ mdu_disk_info_t disk;
+ /* next entry, NULL at the tail */
+ struct devinfo *next;
+};
+
+/*
+ * Add a device to the superblock being created.
+ *
+ * Records the device's role in sb->dev_roles[dk->number], grows max_dev
+ * if needed (while it is still below MAX_DEVS), and appends a devinfo
+ * entry describing the device to the list at st->info for later use by
+ * write_init_super1().
+ *
+ * @st:          supertype whose st->sb is being built
+ * @dk:          disk description (number, raid_disk, state); copied
+ * @fd:          open descriptor for the device, stored in the list entry
+ * @devname:     device name; the pointer is stored, not duplicated
+ * @data_offset: per-device data offset, stored in the list entry
+ *
+ * Always returns 0.
+ */
+static int add_to_super1(struct supertype *st, mdu_disk_info_t *dk,
+			 int fd, char *devname, unsigned long long data_offset)
+{
+	struct mdp_superblock_1 *sb = st->sb;
+	__u16 *rp = sb->dev_roles + dk->number;
+	struct devinfo *di, **dip;
+	int dk_state;
+
+	/* failfast is a per-device flag and must not influence the role */
+	dk_state = dk->state & ~(1<<MD_DISK_FAILFAST);
+
+	/*
+	 * dev_roles[] is little-endian on disk (every reader goes through
+	 * __le16_to_cpu), so the special role values need converting too,
+	 * not just raid_disk.
+	 */
+	if ((dk_state & (1<<MD_DISK_ACTIVE)) &&
+	    (dk_state & (1<<MD_DISK_SYNC)))/* active, sync */
+		*rp = __cpu_to_le16(dk->raid_disk);
+	else if (dk_state & (1<<MD_DISK_JOURNAL))
+		*rp = __cpu_to_le16(MD_DISK_ROLE_JOURNAL);
+	else if ((dk_state & ~(1<<MD_DISK_ACTIVE)) == 0)
+		/* active or idle -> spare */
+		*rp = __cpu_to_le16(MD_DISK_ROLE_SPARE);
+	else
+		*rp = __cpu_to_le16(MD_DISK_ROLE_FAULTY);
+
+	if (dk->number >= (int)__le32_to_cpu(sb->max_dev) &&
+	    __le32_to_cpu(sb->max_dev) < MAX_DEVS)
+		sb->max_dev = __cpu_to_le32(dk->number+1);
+
+	sb->dev_number = __cpu_to_le32(dk->number);
+	sb->devflags = 0; /* don't copy another disks flags */
+	sb->sb_csum = calc_sb_1_csum(sb);
+
+	/* append to the tail so devices are later written in insertion order */
+	dip = (struct devinfo **)&st->info;
+	while (*dip)
+		dip = &(*dip)->next;
+	di = xmalloc(sizeof(struct devinfo));
+	di->fd = fd;
+	di->devname = devname;
+	di->disk = *dk;
+	di->data_offset = data_offset;
+	/* never leave dev_size uninitialised if the size query fails */
+	if (!get_dev_size(fd, NULL, &di->dev_size))
+		di->dev_size = 0;
+	di->next = NULL;
+	*dip = di;
+
+	return 0;
+}
+
+static int locate_bitmap1(struct supertype *st, int fd, int node_num);
+
+/*
+ * Write the superblock in st->sb (and, when the bitmap feature bit is set
+ * and the in-memory bitmap superblock carries BITMAP_MAGIC, that bitmap
+ * superblock as well) to the device open on @fd.
+ *
+ * Returns 0 on success, 1 if the device size cannot be read, 2 if the
+ * device is smaller than 24 sectors, 3 if seeking fails, 4 if the
+ * superblock write fails, 5 if the bitmap superblock write fails, and
+ * -EINVAL for an unknown minor version.  Aborts if a non-zero
+ * sb->super_offset disagrees with the computed position.
+ */
+static int store_super1(struct supertype *st, int fd)
+{
+	struct mdp_superblock_1 *sb = st->sb;
+	struct align_fd afd;
+	unsigned long long nsectors;
+	unsigned long long where;
+	unsigned long long recorded;
+	int nbytes;
+
+	if (!get_dev_size(fd, NULL, &nsectors))
+		return 1;
+
+	nsectors >>= 9;
+	if (nsectors < 24)
+		return 2;
+
+	init_afd(&afd, fd);
+
+	/*
+	 * The superblock is always 4K aligned.  Where it sits depends on
+	 * the minor version:
+	 *   0: at least 8K, but less than 12K, from the end of the device
+	 *   1: at the start of the device
+	 *   2: 4K from the start of the device
+	 */
+	if (st->minor_version == 0)
+		where = (nsectors - 8*2) & ~(unsigned long long)(4*2-1);
+	else if (st->minor_version == 1)
+		where = 0;
+	else if (st->minor_version == 2)
+		where = 4*2;
+	else
+		return -EINVAL;
+
+	/* a recorded offset of 0 is accepted without being cross-checked */
+	recorded = __le64_to_cpu(sb->super_offset);
+	if (recorded != 0 && recorded != where) {
+		pr_err("internal error - sb_offset is wrong\n");
+		abort();
+	}
+
+	if (lseek64(fd, where << 9, SEEK_SET) < 0LL)
+		return 3;
+
+	/* fixed header plus two bytes per device role, in whole 512-byte sectors */
+	nbytes = ROUND_UP(sizeof(*sb) + 2 * __le32_to_cpu(sb->max_dev), 512);
+	if (awrite(&afd, sb, nbytes) != nbytes)
+		return 4;
+
+	if (sb->feature_map & __cpu_to_le32(MD_FEATURE_BITMAP_OFFSET)) {
+		/* the bitmap superblock is kept right after the superblock buffer */
+		struct bitmap_super_s *bm =
+			(struct bitmap_super_s *)((char *)sb + MAX_SB_SIZE);
+
+		if (__le32_to_cpu(bm->magic) == BITMAP_MAGIC) {
+			locate_bitmap1(st, fd, 0);
+			if (awrite(&afd, bm, sizeof(*bm)) != sizeof(*bm))
+				return 5;
+		}
+	}
+	fsync(fd);
+
+	return 0;
+}
+
+static int load_super1(struct supertype *st, int fd, char *devname);
+
+/*
+ * Pick how many sectors to reserve for a bitmap on a device of @devsize
+ * sectors:
+ *   - under 64K of device:             nothing
+ *   - 200Gig or more beyond that 64K:  128K
+ *   - more than 8Gig beyond 4K:        64K
+ *   - anything else:                   4K
+ * NOTE: the result must stay a multiple of 4K, otherwise bad things
+ * happen on 4K-sector devices.
+ */
+static unsigned long choose_bm_space(unsigned long devsize)
+{
+	const unsigned long kib = 2;			/* sectors per KiB */
+	const unsigned long gib = 1024UL * 1024UL * kib;	/* sectors per GiB */
+	unsigned long reserve;
+
+	if (devsize < 64 * kib)
+		reserve = 0;
+	else if (devsize - 64 * kib >= 200 * gib)
+		reserve = 128 * kib;
+	else if (devsize - 4 * kib > 8 * gib)
+		reserve = 64 * kib;
+	else
+		reserve = 4 * kib;
+
+	return reserve;
+}
+
+static void free_super1(struct supertype *st);
+
+__u32 crc32c_le(__u32 crc, unsigned char const *p, size_t len);
+
+/*
+ * Initialise the PPL area of one member device: zero the whole area
+ * described by @info (ppl_sector, ppl_size) and then write an empty PPL
+ * header at its start.
+ *
+ * The header's signature is derived from the array's set_uuid and its
+ * checksum covers the full PPL_HEADER_SIZE bytes.
+ *
+ * Returns 0 on success, otherwise the non-zero result of
+ * zero_disk_range()/posix_memalign() or an errno value from the failed
+ * seek or write.  A short write that leaves errno unset is reported as
+ * EIO instead of being mistaken for success.
+ */
+static int write_init_ppl1(struct supertype *st, struct mdinfo *info, int fd)
+{
+	struct mdp_superblock_1 *sb = st->sb;
+	void *buf;
+	struct ppl_header *ppl_hdr;
+	int ret;
+
+	/* first clear entire ppl space */
+	ret = zero_disk_range(fd, info->ppl_sector, info->ppl_size);
+	if (ret)
+		return ret;
+
+	ret = posix_memalign(&buf, 4096, PPL_HEADER_SIZE);
+	if (ret) {
+		pr_err("Failed to allocate PPL header buffer\n");
+		return ret;
+	}
+
+	memset(buf, 0, PPL_HEADER_SIZE);
+	ppl_hdr = buf;
+	memset(ppl_hdr->reserved, 0xff, PPL_HDR_RESERVED);
+	ppl_hdr->signature = __cpu_to_le32(~crc32c_le(~0, sb->set_uuid,
+						       sizeof(sb->set_uuid)));
+	/* checksum is computed last, over the header with checksum still 0 */
+	ppl_hdr->checksum = __cpu_to_le32(~crc32c_le(~0, buf, PPL_HEADER_SIZE));
+
+	if (lseek64(fd, info->ppl_sector * 512, SEEK_SET) < 0) {
+		ret = errno;
+		perror("Failed to seek to PPL header location");
+	}
+
+	if (!ret) {
+		errno = 0;
+		if (write(fd, buf, PPL_HEADER_SIZE) != PPL_HEADER_SIZE) {
+			/* a short write does not set errno; never report it as 0 */
+			if (errno == 0)
+				errno = EIO;
+			ret = errno;
+			perror("Write PPL header failed");
+		}
+	}
+
+	if (!ret)
+		fsync(fd);
+
+	free(buf);
+	return ret;
+}
+
+#define META_BLOCK_SIZE 4096
+
+/*
+ * Write an empty raid5 log meta block at the data offset of the device
+ * open on @fd.
+ *
+ * The block is META_BLOCK_SIZE bytes and carries the log magic and
+ * version, its own header size, a random sequence number and position 0.
+ * The checksum is seeded with the array's set_uuid and covers the whole
+ * block (computed while the checksum field is still zero).
+ *
+ * Returns 0 on success and 1 on allocation, seek or write failure.
+ */
+static int write_empty_r5l_meta_block(struct supertype *st, int fd)
+{
+	struct r5l_meta_block *mb;
+	struct mdp_superblock_1 *sb = st->sb;
+	struct align_fd afd;
+	__u32 crc;
+	int rv = 1;
+
+	init_afd(&afd, fd);
+
+	if (posix_memalign((void**)&mb, 4096, META_BLOCK_SIZE) != 0) {
+		pr_err("Could not allocate memory for the meta block.\n");
+		return 1;
+	}
+
+	memset(mb, 0, META_BLOCK_SIZE);
+
+	mb->magic = __cpu_to_le32(R5LOG_MAGIC);
+	mb->version = R5LOG_VERSION;
+	mb->meta_size = __cpu_to_le32(sizeof(struct r5l_meta_block));
+	mb->seq = __cpu_to_le64(random32());
+	mb->position = __cpu_to_le64(0);
+
+	crc = crc32c_le(0xffffffff, sb->set_uuid, sizeof(sb->set_uuid));
+	crc = crc32c_le(crc, (void *)mb, META_BLOCK_SIZE);
+	/* stored little-endian like every other multi-byte field in the block */
+	mb->checksum = __cpu_to_le32(crc);
+
+	if (lseek64(fd, __le64_to_cpu(sb->data_offset) * 512, 0) < 0LL) {
+		pr_err("cannot seek to offset of the meta block\n");
+		goto out;
+	}
+
+	if (awrite(&afd, mb, META_BLOCK_SIZE) != META_BLOCK_SIZE) {
+		pr_err("failed to store write the meta block \n");
+		goto out;
+	}
+	fsync(fd);
+	rv = 0;
+
+out:
+	/* single exit so the buffer is released on every path */
+	free(mb);
+	return rv;
+}
+
+/*
+ * Write the newly created superblock in st->sb to every device queued on
+ * st->info (the list built by add_to_super1).
+ *
+ * A first pass over the list sets the journal feature bit if any device is
+ * a journal and decides whether a RAID0 layout flag is needed.  The second
+ * pass, per device: wipes existing metadata via Kill(), fills in the
+ * per-device fields (dev_number, devflags, device_uuid, events), works out
+ * super_offset/data_offset/data_size and the bad-block-log placement for
+ * the metadata minor version, stores the superblock and then writes the
+ * journal meta block, bitmap or PPL as applicable.  Devices marked faulty
+ * or with fd < 0 are skipped.  Each device's fd is closed after a write.
+ *
+ * Returns 0 on success; 1 if a device size cannot be read; 2 if a device
+ * is smaller than 24 sectors; -EINVAL for an unknown minor version;
+ * otherwise the non-zero result of the failing store/write helper.
+ */
+static int write_init_super1(struct supertype *st)
+{
+ struct mdp_superblock_1 *sb = st->sb;
+ struct supertype *refst;
+ int rv = 0;
+ unsigned long long bm_space;
+ struct devinfo *di;
+ unsigned long long dsize, array_size;
+ unsigned long long sb_offset;
+ unsigned long long data_offset;
+ long bm_offset;
+ int raid0_need_layout = 0;
+
+ for (di = st->info; di; di = di->next) {
+ if (di->disk.state & (1 << MD_DISK_JOURNAL))
+ sb->feature_map |= __cpu_to_le32(MD_FEATURE_JOURNAL);
+ /* RAID0 with a layout set: compare each device's chunk count with the first device's */
+ if (sb->level == 0 && sb->layout != 0) {
+ struct devinfo *di2 = st->info;
+ unsigned long long s1, s2;
+ /* NOTE(review): dev_size comes straight from get_dev_size() while
+ * data_offset is subtracted from it unscaled - confirm units match.
+ */
+ s1 = di->dev_size;
+ if (di->data_offset != INVALID_SECTORS)
+ s1 -= di->data_offset;
+ s1 /= __le32_to_cpu(sb->chunksize);
+ s2 = di2->dev_size;
+ if (di2->data_offset != INVALID_SECTORS)
+ s2 -= di2->data_offset;
+ s2 /= __le32_to_cpu(sb->chunksize);
+ if (s1 != s2)
+ raid0_need_layout = 1;
+ }
+ }
+
+ for (di = st->info; di; di = di->next) {
+ if (di->disk.state & (1 << MD_DISK_FAULTY))
+ continue;
+ if (di->fd < 0)
+ continue;
+
+ /* repeat until Kill() reports there is nothing (more) to erase */
+ while (Kill(di->devname, NULL, 0, -1, 1) == 0)
+ ;
+
+ sb->dev_number = __cpu_to_le32(di->disk.number);
+ if (di->disk.state & (1<<MD_DISK_WRITEMOSTLY))
+ sb->devflags |= WriteMostly1;
+ else
+ sb->devflags &= ~WriteMostly1;
+ if (di->disk.state & (1<<MD_DISK_FAILFAST))
+ sb->devflags |= FailFast1;
+ else
+ sb->devflags &= ~FailFast1;
+
+ random_uuid(sb->device_uuid);
+
+ if (!(di->disk.state & (1<<MD_DISK_JOURNAL)))
+ sb->events = 0;
+
+ /* if the device already carries a readable v1 superblock, reuse its device_uuid */
+ refst = dup_super(st);
+ if (load_super1(refst, di->fd, NULL)==0) {
+ struct mdp_superblock_1 *refsb = refst->sb;
+
+ memcpy(sb->device_uuid, refsb->device_uuid, 16);
+ if (memcmp(sb->set_uuid, refsb->set_uuid, 16)==0) {
+ /* same array, so preserve events and
+ * dev_number */
+ sb->events = refsb->events;
+ /* bugs in 2.6.17 and earlier mean the
+ * dev_number chosen in Manage must be preserved
+ */
+ if (get_linux_version() >= 2006018)
+ sb->dev_number = refsb->dev_number;
+ }
+ free_super1(refst);
+ }
+ free(refst);
+
+ if (!get_dev_size(di->fd, NULL, &dsize)) {
+ rv = 1;
+ goto error_out;
+ }
+ dsize >>= 9;
+
+ if (dsize < 24) {
+ /* NOTE(review): fd is closed here but di->fd is not reset to -1 */
+ close(di->fd);
+ rv = 2;
+ goto error_out;
+ }
+
+ /*
+ * Calculate the position of the superblock.
+ * It is always aligned to a 4K boundary and
+ * depending on minor_version, it can be:
+ * 0: At least 8K, but less than 12K, from end of device
+ * 1: At start of device
+ * 2: 4K from start of device.
+ * data_offset has already been set.
+ */
+ array_size = __le64_to_cpu(sb->size);
+
+ /* work out how much space we left for a bitmap */
+ if (sb->feature_map & __cpu_to_le32(MD_FEATURE_BITMAP_OFFSET)) {
+ bitmap_super_t *bms = (bitmap_super_t *)
+ (((char *)sb) + MAX_SB_SIZE);
+ bm_space = calc_bitmap_size(bms, 4096) >> 9;
+ bm_offset = (long)__le32_to_cpu(sb->bitmap_offset);
+ } else if (md_feature_any_ppl_on(sb->feature_map)) {
+ /* PPL takes the place of the bitmap; record where it goes */
+ bm_space = MULTIPLE_PPL_AREA_SIZE_SUPER1 >> 9;
+ if (st->minor_version == 0)
+ bm_offset = -bm_space - 8;
+ else
+ bm_offset = 8;
+ sb->ppl.offset = __cpu_to_le16(bm_offset);
+ sb->ppl.size = __cpu_to_le16(bm_space);
+ } else {
+ bm_space = choose_bm_space(array_size);
+ bm_offset = 8;
+ }
+
+ /* per-device offset wins; otherwise fall back to the supertype's */
+ data_offset = di->data_offset;
+ if (data_offset == INVALID_SECTORS)
+ data_offset = st->data_offset;
+ switch(st->minor_version) {
+ case 0:
+ /* Add 8 sectors for bad block log */
+ bm_space += 8;
+ if (data_offset == INVALID_SECTORS)
+ data_offset = 0;
+ sb_offset = dsize;
+ sb_offset -= 8*2;
+ sb_offset &= ~(4*2-1);
+ sb->data_offset = __cpu_to_le64(data_offset);
+ sb->super_offset = __cpu_to_le64(sb_offset);
+ /* shrink the reserved space if the array would not otherwise fit */
+ if (sb_offset < array_size + bm_space)
+ bm_space = sb_offset - array_size;
+ sb->data_size = __cpu_to_le64(sb_offset - bm_space);
+ if (bm_space >= 8) {
+ sb->bblog_size = __cpu_to_le16(8);
+ sb->bblog_offset = __cpu_to_le32((unsigned)-8);
+ }
+ break;
+ case 1:
+ case 2:
+ sb_offset = st->minor_version == 2 ? 8 : 0;
+ sb->super_offset = __cpu_to_le64(sb_offset);
+ if (data_offset == INVALID_SECTORS)
+ data_offset = sb_offset + 16;
+
+ sb->data_offset = __cpu_to_le64(data_offset);
+ sb->data_size = __cpu_to_le64(dsize - data_offset);
+ /* prefer the bad block log right after the bitmap/PPL area */
+ if (data_offset >= sb_offset+bm_offset+bm_space+8) {
+ sb->bblog_size = __cpu_to_le16(8);
+ sb->bblog_offset = __cpu_to_le32(bm_offset +
+ bm_space);
+ } else if (data_offset >= sb_offset + 16) {
+ sb->bblog_size = __cpu_to_le16(8);
+ /* '8' sectors for the bblog, and 'sb_offset'
+ * because we want offset from superblock, not
+ * start of device.
+ */
+ sb->bblog_offset = __cpu_to_le32(data_offset -
+ 8 - sb_offset);
+ }
+ break;
+ default:
+ pr_err("Failed to write invalid metadata format 1.%i to %s\n",
+ st->minor_version, di->devname);
+ rv = -EINVAL;
+ goto out;
+ }
+ /*
+ * Disable badblock log on clusters, or when
+ * explicitly requested
+ */
+ if (st->nodes > 0 || conf_get_create_info()->bblist == 0) {
+ sb->bblog_size = 0;
+ sb->bblog_offset = 0;
+ }
+
+ /* RAID0 needs a layout if devices aren't all the same size */
+ if (raid0_need_layout)
+ sb->feature_map |= __cpu_to_le32(MD_FEATURE_RAID0_LAYOUT);
+
+ sb->sb_csum = calc_sb_1_csum(sb);
+ rv = store_super1(st, di->fd);
+
+ if (rv == 0 && (di->disk.state & (1 << MD_DISK_JOURNAL))) {
+ rv = write_empty_r5l_meta_block(st, di->fd);
+ /* NOTE(review): this error path leaves di->fd open */
+ if (rv)
+ goto error_out;
+ }
+
+ if (rv == 0 &&
+ (__le32_to_cpu(sb->feature_map) &
+ MD_FEATURE_BITMAP_OFFSET)) {
+ rv = st->ss->write_bitmap(st, di->fd, NodeNumUpdate);
+ } else if (rv == 0 &&
+ md_feature_any_ppl_on(sb->feature_map)) {
+ struct mdinfo info;
+
+ st->ss->getinfo_super(st, &info, NULL);
+ rv = st->ss->write_init_ppl(st, &info, di->fd);
+ }
+
+ close(di->fd);
+ di->fd = -1;
+ if (rv)
+ goto error_out;
+ }
+error_out:
+ /* di is only dereferenced when rv != 0, i.e. when the loop stopped on a device */
+ if (rv)
+ pr_err("Failed to write metadata to %s\n", di->devname);
+out:
+ return rv;
+}
+
+/*
+ * Decide whether the superblock in @tst belongs to the same array as the
+ * one already held in @st.
+ *
+ * Return values:
+ *   0  same array - or @st held no superblock yet, in which case a copy
+ *      of @tst's superblock is installed in st->sb
+ *   1  @tst's superblock has the wrong magic or major version (also
+ *      returned when the copy cannot be allocated)
+ *   2  the set UUIDs differ
+ *   3  the UUIDs match but some other identifying field differs
+ *
+ * @verbose is not referenced in this function.
+ */
+static int compare_super1(struct supertype *st, struct supertype *tst,
+			  int verbose)
+{
+	struct mdp_superblock_1 *have = st->sb;
+	struct mdp_superblock_1 *cand = tst->sb;
+
+	if (cand->magic != __cpu_to_le32(MD_SB_MAGIC) ||
+	    cand->major_version != __cpu_to_le32(1))
+		return 1;
+
+	if (have == NULL) {
+		/* nothing to compare against yet: adopt a private copy */
+		if (posix_memalign((void**)&have, 4096, SUPER1_SIZE) != 0) {
+			pr_err("could not allocate superblock\n");
+			return 1;
+		}
+		memcpy(have, cand, SUPER1_SIZE);
+		st->sb = have;
+		return 0;
+	}
+
+	if (memcmp(have->set_uuid, cand->set_uuid, 16) != 0)
+		return 2;
+
+	/* raw on-disk values are compared, so no byte swapping is needed */
+	if (have->ctime == cand->ctime &&
+	    have->level == cand->level &&
+	    have->layout == cand->layout &&
+	    have->size == cand->size &&
+	    have->chunksize == cand->chunksize &&
+	    have->raid_disks == cand->raid_disks)
+		return 0;
+
+	return 3;
+}
+
+/*
+ * Load the v1.x superblock found on 'fd' into st->sb.
+ *
+ * When st->ss is NULL or st->minor_version is -1, minor versions 0, 1 and 2
+ * are each probed and the one whose superblock has the latest ctime is
+ * loaded; on success *st is overwritten with the probing supertype.
+ *
+ * Return values visible in this function:
+ *   0        superblock loaded into st->sb
+ *   1        device unusable (size query, seek, allocation or read failed,
+ *            or the device is smaller than 24 sectors)
+ *   2        no valid v1.x superblock at the expected location
+ *   -EINVAL  st->minor_version is not 0, 1 or 2
+ *
+ * 'devname' is only used for messages; the messages guarded by
+ * "if (devname)" are suppressed when it is NULL.
+ */
+static int load_super1(struct supertype *st, int fd, char *devname)
+{
+ unsigned long long dsize;
+ unsigned long long sb_offset;
+ struct mdp_superblock_1 *super;
+ int uuid[4];
+ struct bitmap_super_s *bsb;
+ struct misc_dev_info *misc;
+ struct align_fd afd;
+
+ /* Drop anything previously attached to st before loading afresh */
+ free_super1(st);
+
+ init_afd(&afd, fd);
+
+ if (st->ss == NULL || st->minor_version == -1) {
+ int bestvers = -1;
+ struct supertype tst;
+ __u64 bestctime = 0;
+ /* guess... choose latest ctime */
+ memset(&tst, 0, sizeof(tst));
+ tst.ss = &super1;
+ for (tst.minor_version = 0; tst.minor_version <= 2;
+ tst.minor_version++) {
+ switch(load_super1(&tst, fd, devname)) {
+ case 0: super = tst.sb;
+ if (bestvers == -1 ||
+ bestctime < __le64_to_cpu(super->ctime)) {
+ bestvers = tst.minor_version;
+ bestctime = __le64_to_cpu(super->ctime);
+ }
+ /* only the version number is kept; the winner is re-read below */
+ free(super);
+ tst.sb = NULL;
+ break;
+ case 1: return 1; /*bad device */
+ case 2: break; /* bad, try next */
+ }
+ }
+ if (bestvers != -1) {
+ int rv;
+ tst.minor_version = bestvers;
+ tst.ss = &super1;
+ tst.max_devs = MAX_DEVS;
+ rv = load_super1(&tst, fd, devname);
+ if (rv == 0)
+ *st = tst;
+ return rv;
+ }
+ return 2;
+ }
+ if (!get_dev_size(fd, devname, &dsize))
+ return 1;
+ /* bytes -> 512-byte sectors */
+ dsize >>= 9;
+
+ if (dsize < 24) {
+ if (devname)
+ pr_err("%s is too small for md: size is %llu sectors.\n",
+ devname, dsize);
+ return 1;
+ }
+
+ /*
+ * Calculate the position of the superblock.
+ * It is always aligned to a 4K boundary and
+ * depending on minor_version, it can be:
+ * 0: At least 8K, but less than 12K, from end of device
+ * 1: At start of device
+ * 2: 4K from start of device.
+ */
+ switch(st->minor_version) {
+ case 0:
+ sb_offset = dsize;
+ sb_offset -= 8*2;
+ sb_offset &= ~(4*2-1);
+ break;
+ case 1:
+ sb_offset = 0;
+ break;
+ case 2:
+ sb_offset = 4*2;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ /* sb_offset is in sectors; the seek offset is in bytes */
+ if (lseek64(fd, sb_offset << 9, 0)< 0LL) {
+ if (devname)
+ pr_err("Cannot seek to superblock on %s: %s\n",
+ devname, strerror(errno));
+ return 1;
+ }
+
+ if (posix_memalign((void**)&super, 4096, SUPER1_SIZE) != 0) {
+ pr_err("could not allocate superblock\n");
+ return 1;
+ }
+
+ memset(super, 0, SUPER1_SIZE);
+
+ if (aread(&afd, super, MAX_SB_SIZE) != MAX_SB_SIZE) {
+ if (devname)
+ pr_err("Cannot read superblock on %s\n",
+ devname);
+ free(super);
+ return 1;
+ }
+
+ if (__le32_to_cpu(super->magic) != MD_SB_MAGIC) {
+ if (devname)
+ pr_err("No super block found on %s (Expected magic %08x, got %08x)\n",
+ devname, MD_SB_MAGIC,
+ __le32_to_cpu(super->magic));
+ free(super);
+ return 2;
+ }
+
+ if (__le32_to_cpu(super->major_version) != 1) {
+ if (devname)
+ pr_err("Cannot interpret superblock on %s - version is %d\n",
+ devname, __le32_to_cpu(super->major_version));
+ free(super);
+ return 2;
+ }
+ /* The superblock records its own location; it must match where we read */
+ if (__le64_to_cpu(super->super_offset) != sb_offset) {
+ if (devname)
+ pr_err("No superblock found on %s (super_offset is wrong)\n",
+ devname);
+ free(super);
+ return 2;
+ }
+ st->sb = super;
+
+ /*
+ * The allocation holds more than the superblock: the bitmap superblock
+ * sits MAX_SB_SIZE bytes in, and the misc_dev_info BM_SUPER_SIZE bytes
+ * after that.
+ */
+ bsb = (struct bitmap_super_s *)(((char*)super)+MAX_SB_SIZE);
+
+ misc = (struct misc_dev_info*)
+ (((char*)super)+MAX_SB_SIZE+BM_SUPER_SIZE);
+ misc->device_size = dsize;
+ if (st->data_offset == INVALID_SECTORS)
+ st->data_offset = __le64_to_cpu(super->data_offset);
+
+ /* Now check on the bitmap superblock */
+ if ((__le32_to_cpu(super->feature_map)&MD_FEATURE_BITMAP_OFFSET) == 0)
+ return 0;
+ /* Read the bitmap superblock and make sure it looks
+ * valid. If it doesn't clear the bit. An --assemble --force
+ * should get that written out.
+ */
+ locate_bitmap1(st, fd, 0);
+ if (aread(&afd, bsb, 512) != 512)
+ goto no_bitmap;
+
+ uuid_from_super1(st, uuid);
+ if (__le32_to_cpu(bsb->magic) != BITMAP_MAGIC ||
+ memcmp(bsb->uuid, uuid, 16) != 0)
+ goto no_bitmap;
+ return 0;
+
+ no_bitmap:
+ /* Only the in-memory copy is changed here; nothing is written to fd */
+ super->feature_map = __cpu_to_le32(__le32_to_cpu(super->feature_map)
+ & ~MD_FEATURE_BITMAP_OFFSET);
+ return 0;
+}
+
+/*
+ * Turn a textual metadata description into a freshly allocated supertype
+ * for the 1.x handler.
+ *
+ * Accepted spellings (after skipping any leading '0' characters):
+ *   "1.0" / "1.00"                       -> minor_version 0
+ *   "1.1" / "1.01"                       -> minor_version 1
+ *   "1.2" / "1.02" (and "default" unless
+ *   DEFAULT_OLD_METADATA is defined)     -> minor_version 2
+ *   "1" / "default"                      -> minor_version -1 (unspecified)
+ *
+ * Returns the new supertype, or NULL when 'arg' names none of the above.
+ */
+static struct supertype *match_metadata_desc1(char *arg)
+{
+	struct supertype *st = xcalloc(1, sizeof(*st));
+	int minor;
+
+	st->container_devnm[0] = 0;
+	st->ss = &super1;
+	st->max_devs = MAX_DEVS;
+	st->sb = NULL;
+	st->data_offset = INVALID_SECTORS;
+
+	/* --detail prints leading zeros; they carry no meaning here */
+	for (; *arg == '0'; arg++)
+		;
+
+	if (!strcmp(arg, "1.0") || !strcmp(arg, "1.00")) {
+		minor = 0;
+	} else if (!strcmp(arg, "1.1") || !strcmp(arg, "1.01")) {
+		minor = 1;
+	} else if (!strcmp(arg, "1.2") ||
+#ifndef DEFAULT_OLD_METADATA /* ifdef in super0.c */
+		   !strcmp(arg, "default") ||
+#endif /* DEFAULT_OLD_METADATA */
+		   !strcmp(arg, "1.02")) {
+		minor = 2;
+	} else if (!strcmp(arg, "1") || !strcmp(arg, "default")) {
+		minor = -1;
+	} else {
+		/* not a 1.x description */
+		free(st);
+		return NULL;
+	}
+
+	st->minor_version = minor;
+	return st;
+}
+
+/*
+ * Work out how many sectors of a device of 'devsize' sectors are usable
+ * for data with the superblock currently held in st->sb.
+ *
+ * Space for an internal bitmap (or PPL) and for the bad block log is taken
+ * from the superblock.  If 'data_offset' (or, failing that,
+ * st->data_offset) is known, the answer is derived from it directly;
+ * otherwise the layout implied by st->minor_version is assumed.  An
+ * unspecified (negative) minor_version is set to 2 as a side effect.
+ *
+ * Returns 0 for devices smaller than 24 sectors or an unknown
+ * minor_version.
+ */
+static __u64 avail_size1(struct supertype *st, __u64 devsize,
+			 unsigned long long data_offset)
+{
+	struct mdp_superblock_1 *super = st->sb;
+	int bmspace = 0;
+	int bbspace;
+
+	if (devsize < 24)
+		return 0;
+
+	if (__le32_to_cpu(super->feature_map) & MD_FEATURE_BITMAP_OFFSET) {
+		/* hot-add. allow for actual size of bitmap */
+		struct bitmap_super_s *bsb =
+			(struct bitmap_super_s *)(((char*)super)+MAX_SB_SIZE);
+
+		bmspace = calc_bitmap_size(bsb, 4096) >> 9;
+	} else if (md_feature_any_ppl_on(super->feature_map)) {
+		bmspace = __le16_to_cpu(super->ppl.size);
+	}
+
+	/* Allow space for bad block log (zero stays zero after conversion) */
+	bbspace = __le16_to_cpu(super->bblog_size);
+
+	/* not specified, so time to set default */
+	if (st->minor_version < 0)
+		st->minor_version = 2;
+
+	if (data_offset == INVALID_SECTORS)
+		data_offset = st->data_offset;
+
+	if (data_offset != INVALID_SECTORS) {
+		if (st->minor_version == 0)
+			return devsize - data_offset - 8*2 - bbspace;
+		if (st->minor_version == 1 || st->minor_version == 2)
+			return devsize - data_offset;
+		return 0;
+	}
+
+	devsize -= bmspace;
+
+	/* at end */
+	if (st->minor_version == 0)
+		return ((devsize - 8*2 - bbspace ) & ~(4*2-1));
+	/* at start, 4K for superblock and possible bitmap */
+	if (st->minor_version == 1)
+		return devsize - 4*2 - bbspace;
+	/* 4k from start, 4K for superblock and possible bitmap */
+	if (st->minor_version == 2)
+		return devsize - (4+4)*2 - bbspace;
+	return 0;
+}
+
+/*
+ * Set up an internal bitmap in the in-memory superblock st->sb: choose
+ * where it goes and how large a chunk each bit covers, set
+ * MD_FEATURE_BITMAP_OFFSET and fill in the bitmap superblock that lives
+ * MAX_SB_SIZE bytes into the buffer.  Nothing is written to disk here.
+ *
+ * *chunkp is the requested chunk size in bytes, or UnSet to have one
+ * chosen; the value actually used is stored back on success.
+ *
+ * Returns 0 on success, -ENOSPC when the minor version is unknown or no
+ * room is available, -EINVAL when the requested chunk is too small for the
+ * available room (or is zero).
+ */
+static int
+add_internal_bitmap1(struct supertype *st,
+ int *chunkp, int delay, int write_behind,
+ unsigned long long size,
+ int may_change, int major)
+{
+ /*
+ * If not may_change, then this is a 'Grow' without sysfs support for
+ * bitmaps, and the bitmap must fit after the superblock at 1K offset.
+ * If may_change, then this is create or a Grow with sysfs support,
+ * and we can put the bitmap wherever we like.
+ *
+ * size is in sectors, chunk is in bytes !!!
+ */
+
+ unsigned long long bits;
+ unsigned long long max_bits;
+ unsigned long long min_chunk;
+ long offset;
+ long bbl_offset, bbl_size;
+ unsigned long long chunk = *chunkp;
+ int room = 0;
+ int creating = 0;
+ int len;
+ struct mdp_superblock_1 *sb = st->sb;
+ bitmap_super_t *bms = (bitmap_super_t*)(((char*)sb) + MAX_SB_SIZE);
+ int uuid[4];
+
+ if (__le64_to_cpu(sb->data_size) == 0)
+ /*
+ * Must be creating the array, else data_size
+ * would be non-zero
+ */
+ creating = 1;
+ /* 'room', 'offset' and 'bbl_size' below are all in 512-byte sectors */
+ switch(st->minor_version) {
+ case 0:
+ /*
+ * either 3K after the superblock (when hot-add),
+ * or some amount of space before.
+ */
+ if (creating) {
+ /*
+ * We are creating array, so we *know* how much room has
+ * been left.
+ */
+ offset = 0;
+ bbl_size = 8;
+ room =
+ choose_bm_space(__le64_to_cpu(sb->size)) + bbl_size;
+ } else {
+ room = __le64_to_cpu(sb->super_offset)
+ - __le64_to_cpu(sb->data_offset)
+ - __le64_to_cpu(sb->data_size);
+ bbl_size = __le16_to_cpu(sb->bblog_size);
+ if (bbl_size < 8)
+ bbl_size = 8;
+ bbl_offset = (__s32)__le32_to_cpu(sb->bblog_offset);
+ if (bbl_size < -bbl_offset)
+ bbl_size = -bbl_offset;
+
+ if (!may_change ||
+ (room < 3*2 && __le32_to_cpu(sb->max_dev) <= 384)) {
+ room = 3*2;
+ offset = 1*2;
+ bbl_size = 0;
+ } else {
+ offset = 0; /* means movable offset */
+ }
+ }
+ break;
+ case 1:
+ case 2: /* between superblock and data */
+ if (creating) {
+ offset = 4*2;
+ bbl_size = 8;
+ room =
+ choose_bm_space(__le64_to_cpu(sb->size)) + bbl_size;
+ } else {
+ room = __le64_to_cpu(sb->data_offset)
+ - __le64_to_cpu(sb->super_offset);
+ bbl_size = __le16_to_cpu(sb->bblog_size);
+ if (bbl_size)
+ room =
+ __le32_to_cpu(sb->bblog_offset) + bbl_size;
+ else
+ bbl_size = 8;
+
+ if (!may_change) {
+ room -= 2; /* Leave 1K for superblock */
+ offset = 2;
+ bbl_size = 0;
+ } else {
+ room -= 4*2; /* leave 4K for superblock */
+ offset = 4*2;
+ }
+ }
+ break;
+ default:
+ return -ENOSPC;
+ }
+
+ room -= bbl_size;
+ if (chunk == UnSet && room > 128*2)
+ /* Limit to 128K of bitmap when chunk size not requested */
+ room = 128*2;
+
+ if (room <= 1)
+ /* No room for a bitmap */
+ return -ENOSPC;
+
+ /* Bits that fit in 'room' sectors once the bitmap superblock is deducted */
+ max_bits = (room * 512 - sizeof(bitmap_super_t)) * 8;
+
+ min_chunk = 4096; /* sub-page chunks don't work yet.. */
+ bits = (size*512)/min_chunk +1;
+ /* Double the chunk size until the bit count fits in the room */
+ while (bits > max_bits) {
+ min_chunk *= 2;
+ bits = (bits+1)/2;
+ }
+ if (chunk == UnSet) {
+ /* For practical purpose, 64Meg is a good
+ * default chunk size for internal bitmaps.
+ */
+ chunk = min_chunk;
+ if (chunk < 64*1024*1024)
+ chunk = 64*1024*1024;
+ } else if (chunk < min_chunk)
+ return -EINVAL; /* chunk size too small */
+ if (chunk == 0) /* rounding problem */
+ return -EINVAL;
+
+ if (offset == 0) {
+ /* start bitmap on a 4K boundary with enough space for
+ * the bitmap
+ */
+ bits = (size*512) / chunk + 1;
+ room = ((bits+7)/8 + sizeof(bitmap_super_t) +4095)/4096;
+ room *= 8; /* convert 4K blocks to sectors */
+ /* negative: the bitmap is placed before the superblock */
+ offset = -room - bbl_size;
+ }
+
+ sb->bitmap_offset = (int32_t)__cpu_to_le32(offset);
+
+ sb->feature_map = __cpu_to_le32(__le32_to_cpu(sb->feature_map) |
+ MD_FEATURE_BITMAP_OFFSET);
+ memset(bms, 0, sizeof(*bms));
+ bms->magic = __cpu_to_le32(BITMAP_MAGIC);
+ bms->version = __cpu_to_le32(major);
+ uuid_from_super1(st, uuid);
+ memcpy(bms->uuid, uuid, 16);
+ bms->chunksize = __cpu_to_le32(chunk);
+ bms->daemon_sleep = __cpu_to_le32(delay);
+ bms->sync_size = __cpu_to_le64(size);
+ bms->write_behind = __cpu_to_le32(write_behind);
+ bms->nodes = __cpu_to_le32(st->nodes);
+ if (st->nodes)
+ sb->feature_map = __cpu_to_le32(__le32_to_cpu(sb->feature_map) |
+ MD_FEATURE_BITMAP_VERSIONED);
+ if (st->cluster_name) {
+ len = sizeof(bms->cluster_name);
+ strncpy((char *)bms->cluster_name, st->cluster_name, len);
+ /* strncpy does not terminate when the source fills the field */
+ bms->cluster_name[len - 1] = '\0';
+ }
+
+ *chunkp = chunk;
+ return 0;
+}
+
+/*
+ * Position 'fd' at the start of the internal bitmap belonging to node
+ * 'node_num' (0 for the first/only bitmap).
+ *
+ * If st has no superblock loaded, one is loaded just for the duration of
+ * this call and released again before returning.
+ *
+ * Returns 0 when the superblock advertises a bitmap
+ * (MD_FEATURE_BITMAP_OFFSET) and the seek succeeded, -1 otherwise.  The
+ * seek is attempted even when the feature bit is clear.
+ */
+static int locate_bitmap1(struct supertype *st, int fd, int node_num)
+{
+	unsigned long long offset, bm_sectors_per_node;
+	struct mdp_superblock_1 *sb;
+	bitmap_super_t *bms;
+	int mustfree = 0;
+	int ret;
+
+	if (!st->sb) {
+		if (st->ss->load_super(st, fd, NULL))
+			return -1; /* no error I hope... */
+		mustfree = 1;
+	}
+	sb = st->sb;
+
+	if ((__le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET))
+		ret = 0;
+	else
+		ret = -1;
+
+	/* bitmap_offset is a signed sector count relative to the superblock */
+	offset = __le64_to_cpu(sb->super_offset) +
+		(int32_t)__le32_to_cpu(sb->bitmap_offset);
+	if (node_num) {
+		/* per-node bitmaps follow each other, each 4K-rounded */
+		bms = (bitmap_super_t*)(((char*)sb)+MAX_SB_SIZE);
+		bm_sectors_per_node = calc_bitmap_size(bms, 4096) >> 9;
+		offset += bm_sectors_per_node * node_num;
+	}
+	if (mustfree) {
+		free(sb);
+		/*
+		 * Do not leave st->sb pointing at freed memory: a later
+		 * free_super would otherwise free it a second time.
+		 */
+		st->sb = NULL;
+	}
+	/* A failed seek would make the caller read/write the wrong place */
+	if (lseek64(fd, offset<<9, 0) < 0)
+		ret = -1;
+	return ret;
+}
+
+/*
+ * Write the internal bitmap(s) described by the bitmap superblock held
+ * after st->sb to 'fd', optionally updating the bitmap superblock first:
+ *
+ *   NameUpdate     copy st->cluster_name into the bitmap superblock
+ *   NodeNumUpdate  change the node count of a clustered bitmap, after
+ *                  checking that the bitmaps still fit between the
+ *                  superblock and the data
+ *   NoUpdate       write as is
+ *
+ * One bitmap is written per node; the first is filled with 1 bits and the
+ * remaining ones with 0 bits.
+ *
+ * Returns 0 on success, -EINVAL (cluster bitmap on non-1.2 metadata, or
+ * no bitmap location), -ENOMEM (allocation failure or too many nodes for
+ * the available space) or -2 when a write came up short.
+ *
+ * All fields of the bitmap superblock are little-endian, so they are
+ * converted to host order before being compared.
+ */
+static int write_bitmap1(struct supertype *st, int fd, enum bitmap_update update)
+{
+	struct mdp_superblock_1 *sb = st->sb;
+	bitmap_super_t *bms = (bitmap_super_t*)(((char*)sb)+MAX_SB_SIZE);
+	int rv = 0;
+	void *buf;
+	int towrite, n, len;
+	struct align_fd afd;
+	unsigned int i = 0;
+	unsigned long long total_bm_space, bm_space_per_node;
+
+	switch (update) {
+	case NameUpdate:
+		/* update cluster name */
+		if (st->cluster_name) {
+			len = sizeof(bms->cluster_name);
+			memset((char *)bms->cluster_name, 0, len);
+			strncpy((char *)bms->cluster_name,
+				st->cluster_name, len);
+			bms->cluster_name[len - 1] = '\0';
+		}
+		break;
+	case NodeNumUpdate:
+		/* cluster md only supports superblock 1.2 now */
+		if (st->minor_version != 2 &&
+		    __le32_to_cpu(bms->version) == BITMAP_MAJOR_CLUSTERED) {
+			pr_err("Warning: cluster md only works with superblock 1.2\n");
+			return -EINVAL;
+		}
+
+		if (__le32_to_cpu(bms->version) == BITMAP_MAJOR_CLUSTERED) {
+			if (st->nodes < __le32_to_cpu(bms->nodes)) {
+				/*
+				 * Since the nodes num is not increased, no
+				 * need to check the space enough or not,
+				 * just update bms->nodes
+				 */
+				bms->nodes = __cpu_to_le32(st->nodes);
+				break;
+			}
+		} else {
+			/*
+			 * no need to change bms->nodes for other
+			 * bitmap types
+			 */
+			if (st->nodes)
+				pr_err("Warning: --nodes option is only suitable for clustered bitmap\n");
+			break;
+		}
+
+		/*
+		 * Each node has an independent bitmap, it is necessary to
+		 * calculate the space is enough or not, first get how many
+		 * bytes for the total bitmap
+		 */
+		bm_space_per_node = calc_bitmap_size(bms, 4096);
+
+		total_bm_space = 512 * (__le64_to_cpu(sb->data_offset) -
+					__le64_to_cpu(sb->super_offset));
+		/* leave another 4k for superblock */
+		total_bm_space = total_bm_space - 4096;
+
+		if (bm_space_per_node * st->nodes > total_bm_space) {
+			pr_err("Warning: The max num of nodes can't exceed %llu\n",
+			       total_bm_space / bm_space_per_node);
+			return -ENOMEM;
+		}
+
+		bms->nodes = __cpu_to_le32(st->nodes);
+		break;
+	case NoUpdate:
+	default:
+		break;
+	}
+
+	init_afd(&afd, fd);
+
+	/* Seeks fd to the first bitmap; later ones are written contiguously */
+	if (locate_bitmap1(st, fd, 0) < 0) {
+		pr_err("Error: Invalid bitmap\n");
+		return -EINVAL;
+	}
+
+	if (posix_memalign(&buf, 4096, 4096))
+		return -ENOMEM;
+
+	do {
+		/* Only the bitmap[0] should resync
+		 * whole device on initial assembly
+		 */
+		if (i)
+			memset(buf, 0x00, 4096);
+		else
+			memset(buf, 0xff, 4096);
+		/* the first block of every bitmap starts with the superblock */
+		memcpy(buf, (char *)bms, sizeof(bitmap_super_t));
+
+		/*
+		 * use 4096 boundary if bitmap_offset is aligned
+		 * with 8 sectors, then it should compatible with
+		 * older mdadm.
+		 */
+		if (__le32_to_cpu(sb->bitmap_offset) & 7)
+			towrite = calc_bitmap_size(bms, 512);
+		else
+			towrite = calc_bitmap_size(bms, 4096);
+		while (towrite > 0) {
+			n = towrite;
+			if (n > 4096)
+				n = 4096;
+			n = awrite(&afd, buf, n);
+			if (n > 0)
+				towrite -= n;
+			else
+				break;
+			/* following blocks carry only the fill pattern */
+			if (i)
+				memset(buf, 0x00, 4096);
+			else
+				memset(buf, 0xff, 4096);
+		}
+		fsync(fd);
+		if (towrite) {
+			rv = -2;
+			break;
+		}
+	} while (++i < __le32_to_cpu(bms->nodes));
+
+	free(buf);
+	return rv;
+}
+
+/*
+ * Release everything hanging off 'st': the superblock buffer and the
+ * list of devinfo entries (closing any descriptor they hold).  Both
+ * st->sb and st->info are NULL afterwards.
+ */
+static void free_super1(struct supertype *st)
+{
+	struct devinfo *di, *next;
+
+	/* free(NULL) is a no-op, so no guard is needed */
+	free(st->sb);
+
+	for (di = st->info; di != NULL; di = next) {
+		next = di->next;
+		if (di->fd >= 0)
+			close(di->fd);
+		free(di);
+	}
+	st->info = NULL;
+	st->sb = NULL;
+}
+
+/*
+ * Check whether an array with the given geometry can use 1.x metadata and,
+ * when 'subdev' is given, how much of that device would be available for
+ * data.
+ *
+ * Side effects: an UnSet *chunk is replaced by DEFAULT_CHUNK, a negative
+ * st->minor_version becomes 2, and st->data_offset is filled in if it was
+ * INVALID_SECTORS.
+ *
+ * Returns 1 when acceptable (with *freesize set, in sectors, if 'subdev'
+ * was given) and 0 otherwise: containers are not supported, the device
+ * could not be opened or sized, or it is too small for the metadata (in
+ * which case *freesize is set to 0).
+ */
+static int validate_geometry1(struct supertype *st, int level,
+ int layout, int raiddisks,
+ int *chunk, unsigned long long size,
+ unsigned long long data_offset,
+ char *subdev, unsigned long long *freesize,
+ int consistency_policy, int verbose)
+{
+ unsigned long long ldsize, devsize;
+ int bmspace;
+ unsigned long long headroom;
+ unsigned long long overhead;
+ int fd;
+
+ if (level == LEVEL_CONTAINER) {
+ if (verbose)
+ pr_err("1.x metadata does not support containers\n");
+ return 0;
+ }
+ if (*chunk == UnSet)
+ *chunk = DEFAULT_CHUNK;
+
+ /* No device to measure: the geometry itself is acceptable */
+ if (!subdev)
+ return 1;
+
+ if (st->minor_version < 0)
+ /* not specified, so time to set default */
+ st->minor_version = 2;
+
+ fd = open(subdev, O_RDONLY|O_EXCL, 0);
+ if (fd < 0) {
+ if (verbose)
+ pr_err("super1.x cannot open %s: %s\n",
+ subdev, strerror(errno));
+ return 0;
+ }
+
+ if (!get_dev_size(fd, subdev, &ldsize)) {
+ close(fd);
+ return 0;
+ }
+ close(fd);
+
+ /* bytes -> 512-byte sectors */
+ devsize = ldsize >> 9;
+
+ /* creating: allow suitable space for bitmap or PPL */
+ if (consistency_policy == CONSISTENCY_POLICY_PPL)
+ bmspace = MULTIPLE_PPL_AREA_SIZE_SUPER1 >> 9;
+ else
+ bmspace = choose_bm_space(devsize);
+
+ /* Precedence: explicit argument, then st->data_offset, then computed */
+ if (data_offset == INVALID_SECTORS)
+ data_offset = st->data_offset;
+ if (data_offset == INVALID_SECTORS)
+ switch (st->minor_version) {
+ case 0:
+ data_offset = 0;
+ break;
+ case 1:
+ case 2:
+ /* Choose data offset appropriate for this device
+ * and use as default for whole array.
+ * The data_offset must allow for bitmap space
+ * and base metadata, should allow for some headroom
+ * for reshape, and should be rounded to multiple
+ * of 1M.
+ * Headroom is limited to 128M, but aim for about 0.1%
+ */
+ headroom = 128*1024*2;
+ while ((headroom << 10) > devsize &&
+ (*chunk == 0 ||
+ headroom / 2 >= ((unsigned)(*chunk)*2)*2))
+ headroom >>= 1;
+ data_offset = 12*2 + bmspace + headroom;
+ #define ONE_MEG (2*1024)
+ data_offset = ROUND_UP(data_offset, ONE_MEG);
+ break;
+ }
+ if (st->data_offset == INVALID_SECTORS)
+ st->data_offset = data_offset;
+ switch(st->minor_version) {
+ case 0: /* metadata at end. Round down and subtract space to reserve */
+ devsize = (devsize & ~(4ULL*2-1));
+ /* space for metadata, bblog, bitmap/ppl */
+ overhead = 8*2 + 8 + bmspace;
+ if (devsize < overhead) /* detect underflow */
+ goto dev_too_small_err;
+ devsize -= overhead;
+ break;
+ case 1:
+ case 2:
+ if (devsize < data_offset) /* detect underflow */
+ goto dev_too_small_err;
+ devsize -= data_offset;
+ break;
+ }
+ *freesize = devsize;
+ return 1;
+
+/* Error condition, device cannot even hold the overhead. */
+dev_too_small_err:
+ /* devsize is in sectors; >>1 reports it in K. Printed even if !verbose */
+ fprintf(stderr, "device %s is too small (%lluK) for "
+ "required metadata!\n", subdev, devsize>>1);
+ *freesize = 0;
+ return 0;
+}
+
+/*
+ * Build a v1.0 superblock describing the array in 'info', using the 0.90
+ * superblock 'sb0' for the md minor number and the per-disk states.
+ *
+ * Returns a 1024-byte, 4096-aligned buffer that the caller must free(),
+ * or NULL if it could not be allocated.
+ *
+ * Every multi-byte field of the v1 superblock is little-endian, so each
+ * value is converted with the helper matching the field's width.
+ */
+void *super1_make_v0(struct supertype *st, struct mdinfo *info, mdp_super_t *sb0)
+{
+	void *ret;
+	struct mdp_superblock_1 *sb;
+	int i;
+	unsigned long long offset;
+
+	if (posix_memalign(&ret, 4096, 1024) != 0)
+		return NULL;
+	sb = ret;
+	memset(ret, 0, 1024);
+	sb->magic = __cpu_to_le32(MD_SB_MAGIC);
+	sb->major_version = __cpu_to_le32(1);
+
+	copy_uuid(sb->set_uuid, info->uuid, super1.swapuuid);
+	sprintf(sb->set_name, "%d", sb0->md_minor);
+	/*
+	 * ctime is a 64-bit field (it is read back with __le64_to_cpu), so
+	 * it needs the 64-bit conversion.  The value is widened as unsigned
+	 * first so little-endian hosts store exactly what they did before.
+	 */
+	sb->ctime = __cpu_to_le64((unsigned int)(info->array.ctime+1));
+	sb->level = __cpu_to_le32(info->array.level);
+	sb->layout = __cpu_to_le32(info->array.layout);
+	sb->size = __cpu_to_le64(info->component_size);
+	sb->chunksize = __cpu_to_le32(info->array.chunk_size/512);
+	sb->raid_disks = __cpu_to_le32(info->array.raid_disks);
+	if (info->array.level > 0)
+		sb->data_size = sb->size;
+	else
+		/* avail_size returns a host-order value */
+		sb->data_size = __cpu_to_le64(
+			st->ss->avail_size(st, st->devsize/512, 0));
+	sb->resync_offset = MaxSector;
+	sb->max_dev = __cpu_to_le32(MD_SB_DISKS);
+	sb->dev_number = __cpu_to_le32(info->disk.number);
+	sb->utime = __cpu_to_le64(info->array.utime);
+
+	/* v1.0 lives at least 8K from the end, rounded down to 4K */
+	offset = st->devsize/512 - 8*2;
+	offset &= ~(4*2-1);
+	sb->super_offset = __cpu_to_le64(offset);
+
+	random_uuid(sb->device_uuid);
+
+	/* Only in-sync, non-faulty disks keep their role; the rest are spares */
+	for (i = 0; i < MD_SB_DISKS; i++) {
+		int state = sb0->disks[i].state;
+		sb->dev_roles[i] = MD_DISK_ROLE_SPARE;
+		if ((state & (1<<MD_DISK_SYNC)) &&
+		    !(state & (1<<MD_DISK_FAULTY)))
+			sb->dev_roles[i] = __cpu_to_le16(sb0->disks[i].raid_disk);
+	}
+	sb->sb_csum = calc_sb_1_csum(sb);
+	return ret;
+}
+
+/*
+ * Method table for the "1.x" metadata format: each member points at the
+ * 1.x implementation of that operation.
+ */
+struct superswitch super1 = {
+ .examine_super = examine_super1,
+ .brief_examine_super = brief_examine_super1,
+ .export_examine_super = export_examine_super1,
+ .detail_super = detail_super1,
+ .brief_detail_super = brief_detail_super1,
+ .export_detail_super = export_detail_super1,
+ .write_init_super = write_init_super1,
+ .validate_geometry = validate_geometry1,
+ .add_to_super = add_to_super1,
+ .examine_badblocks = examine_badblocks_super1,
+ .copy_metadata = copy_metadata1,
+ .write_init_ppl = write_init_ppl1,
+ .match_home = match_home1,
+ .uuid_from_super = uuid_from_super1,
+ .getinfo_super = getinfo_super1,
+ .container_content = container_content1,
+ .update_super = update_super1,
+ .init_super = init_super1,
+ .store_super = store_super1,
+ .compare_super = compare_super1,
+ .load_super = load_super1,
+ .match_metadata_desc = match_metadata_desc1,
+ .avail_size = avail_size1,
+ .add_internal_bitmap = add_internal_bitmap1,
+ .locate_bitmap = locate_bitmap1,
+ .write_bitmap = write_bitmap1,
+ .free_super = free_super1,
+ /* swapuuid depends on host byte order; it is passed to copy_uuid() */
+#if __BYTE_ORDER == BIG_ENDIAN
+ .swapuuid = 0,
+#else
+ .swapuuid = 1,
+#endif
+ .name = "1.x",
+};
diff --git a/swap_super.c b/swap_super.c
new file mode 100644
index 0000000..b6db574
--- /dev/null
+++ b/swap_super.c
@@ -0,0 +1,81 @@
+#include <unistd.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <sys/mount.h>
+/*
+ * This is a tiny test program to endian-swap
+ * the superblock on a given device.
+ * We simply read 4k from where the superblock should be
+ * do the swap, and write it back
+ * Don't use this on a real array, use mdadm.
+ */
+
+/* Size of the area reserved at the end of the device, in bytes and sectors */
+#define MD_RESERVED_BYTES (64 * 1024)
+#define MD_RESERVED_SECTORS (MD_RESERVED_BYTES / 512)
+
+/*
+ * Round the sector count x down to a multiple of MD_RESERVED_SECTORS, then
+ * step back one reserved area.  main() uses the result as the sector at
+ * which the superblock is read and rewritten.
+ * NOTE(review): x is not parenthesized in the expansion; pass a plain
+ * variable only.
+ */
+#define MD_NEW_SIZE_SECTORS(x) ((x & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS)
+
+/* Declared by hand rather than taken from a header */
+extern long long lseek64(int, long long, int);
+
+/*
+ * swap_super <device>
+ *
+ * Reads the 4K block at the superblock position of <device>, reverses the
+ * byte order of every 32-bit word in it, exchanges the two halves of the
+ * events and cp_events counters, and writes the block back in place.
+ * Exits 0 on success and 1 (after a message on stderr) on any failure.
+ */
+int main(int argc, char *argv[])
+{
+	int fd, i, word;
+	unsigned long size;
+	unsigned long long offset;
+	char super[4096];
+	char *p;
+
+	if (argc != 2) {
+		fprintf(stderr, "Usage: swap_super device\n");
+		exit(1);
+	}
+
+	if ((fd = open(argv[1], O_RDWR)) < 0) {
+		perror(argv[1]);
+		exit(1);
+	}
+	if (ioctl(fd, BLKGETSIZE, &size) != 0) {
+		perror("BLKGETSIZE");
+		exit(1);
+	}
+
+	/* byte offset of the superblock */
+	offset = MD_NEW_SIZE_SECTORS(size) * 512LL;
+	if (lseek64(fd, offset, 0) < 0LL) {
+		perror("lseek64");
+		exit(1);
+	}
+	if (read(fd, super, 4096) != 4096) {
+		perror("read");
+		exit(1);
+	}
+
+	/* reverse the four bytes of each 32-bit word */
+	for (p = super; p < super + 4096; p += 4) {
+		char t;
+
+		t = p[0];
+		p[0] = p[3];
+		p[3] = t;
+
+		t = p[1];
+		p[1] = p[2];
+		p[2] = t;
+	}
+
+	/*
+	 * swap the u64 events counters: words 32+7/32+8 hold events_hi and
+	 * events_lo, words 32+9/32+10 hold cp_events_hi and cp_events_lo
+	 */
+	for (word = 32 + 7; word <= 32 + 9; word += 2) {
+		char *first = super + word * 4;
+		char *second = first + 4;
+
+		for (i = 0; i < 4; i++) {
+			char t = first[i];
+
+			first[i] = second[i];
+			second[i] = t;
+		}
+	}
+
+	if (lseek64(fd, offset, 0) < 0LL) {
+		perror("lseek64");
+		exit(1);
+	}
+	if (write(fd, super, 4096) != 4096) {
+		perror("write");
+		exit(1);
+	}
+	exit(0);
+
+}
diff --git a/sysfs.c b/sysfs.c
new file mode 100644
index 0000000..2995713
--- /dev/null
+++ b/sysfs.c
@@ -0,0 +1,1167 @@
+/*
+ * sysfs - extract md related information from sysfs. Part of:
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2006-2009 Neil Brown <neilb@suse.de>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email: <neilb@suse.de>
+ */
+
+#include "mdadm.h"
+#include <dirent.h>
+#include <ctype.h>
+#include "dlink.h"
+
+/* Size of the buffers used below to build /sys/block/... path names */
+#define MAX_SYSFS_PATH_LEN 120
+
+/*
+ * One entry in a singly linked list of rules, each carrying its own list
+ * of name/value pairs.
+ * NOTE(review): presumably a rule is selected by devname or by uuid and
+ * its entries are sysfs attributes to set; the code using this structure
+ * is not in this part of the file, so confirm there.
+ */
+struct dev_sysfs_rule {
+ struct dev_sysfs_rule *next;
+ char *devname;
+ int uuid[4];
+ /* presumably non-zero when uuid[] holds a value; verify against users */
+ int uuid_set;
+ struct sysfs_entry {
+ struct sysfs_entry *next;
+ char *name;
+ char *value;
+ } *entry;
+};
+
+/*
+ * Read the file 'path' into 'buf' (capacity 'len' bytes) as a
+ * NUL-terminated string, dropping one trailing newline if present.
+ *
+ * Returns 0 on success, -1 if the file cannot be opened or read, or if
+ * the content does not leave room for the terminating NUL.
+ */
+int load_sys(char *path, char *buf, int len)
+{
+	int nread;
+	int fd;
+
+	fd = open(path, O_RDONLY);
+	if (fd < 0)
+		return -1;
+
+	nread = read(fd, buf, len);
+	close(fd);
+
+	/* a read that fills the buffer may have been truncated */
+	if (!(nread >= 0 && nread < len))
+		return -1;
+
+	buf[nread] = 0;
+	if (nread > 0 && buf[nread - 1] == '\n')
+		buf[nread - 1] = 0;
+	return 0;
+}
+
+/*
+ * Free a list of mdinfo structures linked through ->next, together with
+ * each one's ->devs list and the bb.entries array of every element.
+ * A NULL argument is accepted.
+ */
+void sysfs_free(struct mdinfo *sra)
+{
+	struct mdinfo *next_array;
+
+	for (; sra != NULL; sra = next_array) {
+		struct mdinfo *d, *next_dev;
+
+		/* remember the successor before this element is released */
+		next_array = sra->next;
+
+		for (d = sra->devs; d != NULL; d = next_dev) {
+			next_dev = d->next;
+			free(d->bb.entries);
+			free(d);
+		}
+		free(sra->bb.entries);
+		free(sra);
+	}
+}
+
+/*
+ * Open the sysfs attribute /sys/block/<devnm>/md/[<devname>/]<attr>.
+ *
+ * The file is opened read-write; if that is refused with EACCES it is
+ * opened read-only instead.
+ *
+ * Returns the file descriptor, or -1 on failure.  A path that does not
+ * fit in MAX_SYSFS_PATH_LEN bytes fails with errno set to ENAMETOOLONG
+ * rather than being truncated.
+ */
+int sysfs_open(char *devnm, char *devname, char *attr)
+{
+	char fname[MAX_SYSFS_PATH_LEN];
+	int fd;
+	int n;
+
+	/*
+	 * Build the whole path with one bounded call.  Appending with
+	 * strncat(dst, src, size - strlen(dst)) can write the terminating
+	 * NUL one byte past the end of dst.
+	 */
+	n = snprintf(fname, sizeof(fname), "/sys/block/%s/md/%s%s%s",
+		     devnm,
+		     devname ? devname : "",
+		     devname ? "/" : "",
+		     attr);
+	if (n < 0 || n >= (int)sizeof(fname)) {
+		errno = ENAMETOOLONG;
+		return -1;
+	}
+
+	fd = open(fname, O_RDWR);
+	if (fd < 0 && errno == EACCES)
+		fd = open(fname, O_RDONLY);
+	return fd;
+}
+
+/*
+ * Set mdi->sys_name to "dev-" followed by whatever devid2kname()
+ * returns for 'devid', truncated to fit the field.
+ */
+void sysfs_init_dev(struct mdinfo *mdi, dev_t devid)
+{
+	const char *kname = devid2kname(devid);
+
+	snprintf(mdi->sys_name, sizeof(mdi->sys_name), "dev-%s", kname);
+}
+
+/*
+ * Initialise mdi->sys_name with the name of an md device.
+ *
+ * The name comes from fd2devnm(fd) when 'fd' is non-negative, otherwise
+ * from 'devnm'.  It is accepted only if /sys/block/<name>/md exists and
+ * is a directory.
+ *
+ * Returns 0 on success; on failure returns -ENODEV and leaves
+ * mdi->sys_name as an empty string.
+ */
+int sysfs_init(struct mdinfo *mdi, int fd, char *devnm)
+{
+	struct stat stb;
+	char fname[MAX_SYSFS_PATH_LEN];
+
+	mdi->sys_name[0] = 0;
+
+	if (fd >= 0)
+		devnm = fd2devnm(fd);
+	if (!devnm)
+		return -ENODEV;
+
+	snprintf(fname, MAX_SYSFS_PATH_LEN, "/sys/block/%s/md", devnm);
+
+	if (stat(fname, &stb) != 0 || !S_ISDIR(stb.st_mode))
+		return -ENODEV;
+
+	strcpy(mdi->sys_name, devnm);
+	return 0;
+}
+
+/*
+ * Collect information about an md array from /sys/block/<dev>/md.
+ *
+ * The array is identified by 'fd' (if non-negative) or 'devnm', as in
+ * sysfs_init().  'options' is a mask of GET_* flags selecting which
+ * attributes to read; with GET_DEVS the dev-* subdirectories are scanned
+ * too and one mdinfo per member device is linked onto ->devs.
+ *
+ * Returns a newly allocated mdinfo, to be released with sysfs_free(), or
+ * NULL if the array cannot be found or a requested attribute cannot be
+ * read.  stripe_cache_size and consistency_policy are the exceptions: an
+ * unreadable file yields 0 / CONSISTENCY_POLICY_UNKNOWN instead of failure.
+ */
+struct mdinfo *sysfs_read(int fd, char *devnm, unsigned long options)
+{
+	char fname[PATH_MAX];
+	char buf[PATH_MAX];
+	char *base;
+	char *dbase;
+	struct mdinfo *sra;
+	struct mdinfo *dev, **devp;
+	DIR *dir = NULL;
+	struct dirent *de;
+
+	sra = xcalloc(1, sizeof(*sra));
+	if (sysfs_init(sra, fd, devnm)) {
+		free(sra);
+		return NULL;
+	}
+
+	/* 'base' marks where attribute names are appended to the path */
+	sprintf(fname, "/sys/block/%s/md/", sra->sys_name);
+	base = fname + strlen(fname);
+
+	sra->devs = NULL;
+	if (options & GET_VERSION) {
+		strcpy(base, "metadata_version");
+		if (load_sys(fname, buf, sizeof(buf)))
+			goto abort;
+		if (strncmp(buf, "none", 4) == 0) {
+			sra->array.major_version =
+				sra->array.minor_version = -1;
+			strcpy(sra->text_version, "");
+		} else if (strncmp(buf, "external:", 9) == 0) {
+			sra->array.major_version = -1;
+			sra->array.minor_version = -2;
+			strcpy(sra->text_version, buf+9);
+		} else {
+			sscanf(buf, "%d.%d",
+			       &sra->array.major_version,
+			       &sra->array.minor_version);
+			strcpy(sra->text_version, buf);
+		}
+	}
+	if (options & GET_LEVEL) {
+		strcpy(base, "level");
+		if (load_sys(fname, buf, sizeof(buf)))
+			goto abort;
+		sra->array.level = map_name(pers, buf);
+	}
+	if (options & GET_LAYOUT) {
+		strcpy(base, "layout");
+		if (load_sys(fname, buf, sizeof(buf)))
+			goto abort;
+		sra->array.layout = strtoul(buf, NULL, 0);
+	}
+	if (options & (GET_DISKS|GET_STATE)) {
+		strcpy(base, "raid_disks");
+		if (load_sys(fname, buf, sizeof(buf)))
+			goto abort;
+		sra->array.raid_disks = strtoul(buf, NULL, 0);
+	}
+	if (options & GET_COMPONENT) {
+		strcpy(base, "component_size");
+		if (load_sys(fname, buf, sizeof(buf)))
+			goto abort;
+		sra->component_size = strtoull(buf, NULL, 0);
+		/* sysfs reports "K", but we want sectors */
+		sra->component_size *= 2;
+	}
+	if (options & GET_CHUNK) {
+		strcpy(base, "chunk_size");
+		if (load_sys(fname, buf, sizeof(buf)))
+			goto abort;
+		sra->array.chunk_size = strtoul(buf, NULL, 0);
+	}
+	if (options & GET_CACHE) {
+		strcpy(base, "stripe_cache_size");
+		if (load_sys(fname, buf, sizeof(buf)))
+			/* Probably level doesn't support it */
+			sra->cache_size = 0;
+		else
+			sra->cache_size = strtoul(buf, NULL, 0);
+	}
+	if (options & GET_MISMATCH) {
+		strcpy(base, "mismatch_cnt");
+		if (load_sys(fname, buf, sizeof(buf)))
+			goto abort;
+		sra->mismatch_cnt = strtoul(buf, NULL, 0);
+	}
+	if (options & GET_SAFEMODE) {
+		int scale = 1;
+		int dot = 0;
+		unsigned i;
+		unsigned long msec;
+		size_t len;
+
+		strcpy(base, "safe_mode_delay");
+		if (load_sys(fname, buf, sizeof(buf)))
+			goto abort;
+
+		/* remove a period, and count digits after it */
+		len = strlen(buf);
+		for (i = 0; i < len; i++) {
+			if (dot) {
+				/* <ctype.h> functions need an unsigned char value */
+				if (isdigit((unsigned char)buf[i])) {
+					buf[i-1] = buf[i];
+					scale *= 10;
+				}
+				buf[i] = 0;
+			} else if (buf[i] == '.') {
+				dot=1;
+				buf[i] = 0;
+			}
+		}
+		/* buf now holds the digits as an integer scaled by 'scale' */
+		msec = strtoul(buf, NULL, 10);
+		msec = (msec * 1000) / scale;
+		sra->safe_mode_delay = msec;
+	}
+	if (options & GET_BITMAP_LOCATION) {
+		strcpy(base, "bitmap/location");
+		if (load_sys(fname, buf, sizeof(buf)))
+			goto abort;
+		if (strncmp(buf, "file", 4) == 0)
+			sra->bitmap_offset = 1;
+		else if (strncmp(buf, "none", 4) == 0)
+			sra->bitmap_offset = 0;
+		else if (buf[0] == '+')
+			sra->bitmap_offset = strtol(buf+1, NULL, 10);
+		else
+			goto abort;
+	}
+
+	if (options & GET_ARRAY_STATE) {
+		strcpy(base, "array_state");
+		if (load_sys(fname, buf, sizeof(buf)))
+			goto abort;
+		sra->array_state = map_name(sysfs_array_states, buf);
+	}
+
+	if (options & GET_CONSISTENCY_POLICY) {
+		strcpy(base, "consistency_policy");
+		if (load_sys(fname, buf, sizeof(buf)))
+			sra->consistency_policy = CONSISTENCY_POLICY_UNKNOWN;
+		else
+			sra->consistency_policy = map_name(consistency_policies,
+							   buf);
+	}
+
+	if (! (options & GET_DEVS))
+		return sra;
+
+	/* Get all the devices as well */
+	*base = 0;
+	dir = opendir(fname);
+	if (!dir)
+		goto abort;
+	sra->array.spare_disks = 0;
+	sra->array.active_disks = 0;
+	sra->array.failed_disks = 0;
+	sra->array.working_disks = 0;
+
+	devp = &sra->devs;
+	sra->devs = NULL;
+	while ((de = readdir(dir)) != NULL) {
+		char *ep;
+		if (de->d_ino == 0 ||
+		    strncmp(de->d_name, "dev-", 4) != 0)
+			continue;
+		/* 'dbase' marks where per-device attribute names are appended */
+		strcpy(base, de->d_name);
+		dbase = base + strlen(base);
+		*dbase++ = '/';
+
+		dev = xcalloc(1, sizeof(*dev));
+
+		/* Always get slot, major, minor */
+		strcpy(dbase, "slot");
+		if (load_sys(fname, buf, sizeof(buf))) {
+			/* hmm... unable to read 'slot' maybe the device
+			 * is going away?
+			 */
+			strcpy(dbase, "block");
+			if (readlink(fname, buf, sizeof(buf)) < 0 &&
+			    errno != ENAMETOOLONG) {
+				/* ...yup device is gone */
+				free(dev);
+				continue;
+			} else {
+				/* slot is unreadable but 'block' link
+				 * still intact... something bad is happening
+				 * so abort
+				 */
+				free(dev);
+				goto abort;
+			}
+
+		}
+		strcpy(dev->sys_name, de->d_name);
+		/* a non-numeric slot leaves raid_disk at -1 */
+		dev->disk.raid_disk = strtoul(buf, &ep, 10);
+		if (*ep) dev->disk.raid_disk = -1;
+
+		sra->array.nr_disks++;
+		strcpy(dbase, "block/dev");
+		if (load_sys(fname, buf, sizeof(buf))) {
+			/* assume this is a stale reference to a hot
+			 * removed device
+			 */
+			if (!(options & GET_DEVS_ALL)) {
+				free(dev);
+				continue;
+			}
+		} else {
+			sscanf(buf, "%d:%d", &dev->disk.major, &dev->disk.minor);
+		}
+
+		if (!(options & GET_DEVS_ALL)) {
+			/* special case check for block devices that can go 'offline' */
+			strcpy(dbase, "block/device/state");
+			if (load_sys(fname, buf, sizeof(buf)) == 0 &&
+			    strncmp(buf, "offline", 7) == 0) {
+				free(dev);
+				continue;
+			}
+		}
+
+		/* finally add this disk to the array */
+		/* from here on 'dev' is owned by sra; abort frees it via sysfs_free */
+		*devp = dev;
+		devp = & dev->next;
+		dev->next = NULL;
+
+		if (options & GET_OFFSET) {
+			strcpy(dbase, "offset");
+			if (load_sys(fname, buf, sizeof(buf)))
+				goto abort;
+			dev->data_offset = strtoull(buf, NULL, 0);
+			strcpy(dbase, "new_offset");
+			if (load_sys(fname, buf, sizeof(buf)) == 0)
+				dev->new_data_offset = strtoull(buf, NULL, 0);
+			else
+				dev->new_data_offset = dev->data_offset;
+		}
+		if (options & GET_SIZE) {
+			strcpy(dbase, "size");
+			if (load_sys(fname, buf, sizeof(buf)))
+				goto abort;
+			dev->component_size = strtoull(buf, NULL, 0) * 2;
+		}
+		if (options & GET_STATE) {
+			dev->disk.state = 0;
+			strcpy(dbase, "state");
+			if (load_sys(fname, buf, sizeof(buf)))
+				goto abort;
+			if (strstr(buf, "faulty"))
+				dev->disk.state |= (1<<MD_DISK_FAULTY);
+			else {
+				sra->array.working_disks++;
+				if (strstr(buf, "in_sync")) {
+					dev->disk.state |= (1<<MD_DISK_SYNC);
+					sra->array.active_disks++;
+				}
+				if (dev->disk.state == 0)
+					sra->array.spare_disks++;
+			}
+		}
+		if (options & GET_ERROR) {
+			/*
+			 * The attribute name belongs in the path (dbase), not
+			 * in the read buffer; otherwise whichever file fname
+			 * last named would be read instead of "errors".
+			 */
+			strcpy(dbase, "errors");
+			if (load_sys(fname, buf, sizeof(buf)))
+				goto abort;
+			dev->errors = strtoul(buf, NULL, 0);
+		}
+	}
+
+	if ((options & GET_STATE) && sra->array.raid_disks)
+		sra->array.failed_disks = sra->array.raid_disks -
+			sra->array.active_disks - sra->array.spare_disks;
+
+	closedir(dir);
+	return sra;
+
+ abort:
+	if (dir)
+		closedir(dir);
+	sysfs_free(sra);
+	return NULL;
+}
+
+/*
+ * Compare 'attr', as read from a sysfs file, with the word 'str'.
+ *
+ * Returns 1 when 'attr' is exactly 'str', or is 'str' immediately
+ * followed by a comma or a newline; returns 0 otherwise.
+ */
+int sysfs_attr_match(const char *attr, const char *str)
+{
+	size_t i = 0;
+
+	/* length of the common prefix */
+	while (attr[i] != '\0' && str[i] != '\0' && attr[i] == str[i])
+		i++;
+
+	/* all of str must have been consumed */
+	if (str[i] != '\0')
+		return 0;
+
+	/* attr must end here or continue with a separator */
+	return attr[i] == '\0' || attr[i] == ',' || attr[i] == '\n';
+}
+
+/*
+ * Find 'word' in the NULL-terminated array 'list' using
+ * sysfs_attr_match().  Returns the index of the first matching entry, or
+ * the number of entries in the list when none matches.
+ */
+int sysfs_match_word(const char *word, char **list)
+{
+	int idx = 0;
+
+	while (list[idx] != NULL) {
+		if (sysfs_attr_match(word, list[idx]))
+			return idx;
+		idx++;
+	}
+	return idx;
+}
+
+/*
+ * Return the component size, in sectors, of the md array open on 'fd'.
+ *
+ * The value is taken from /sys/block/<dev>/md/component_size (which is
+ * multiplied by 2 here) because the size field of the GET_ARRAY_INFO
+ * ioctl is only 32 bits wide.
+ *
+ * Returns 0 if the device cannot be stat'ed or the sysfs file cannot be
+ * opened or read.
+ */
+unsigned long long get_component_size(int fd)
+{
+	struct stat stb;
+	char fname[MAX_SYSFS_PATH_LEN];
+	int sysfd;
+	int n;
+
+	if (fstat(fd, &stb) != 0)
+		return 0;
+
+	snprintf(fname, MAX_SYSFS_PATH_LEN,
+		 "/sys/block/%s/md/component_size", stat2devnm(&stb));
+
+	sysfd = open(fname, O_RDONLY);
+	if (sysfd < 0)
+		return 0;
+
+	/* the path buffer is reused to hold the file's content */
+	n = read(sysfd, fname, sizeof(fname));
+	close(sysfd);
+	if (n < 0 || (size_t)n == sizeof(fname))
+		return 0;
+
+	fname[n] = 0;
+	return strtoull(fname, NULL, 10) * 2;
+}
+
+/*
+ * Write the string 'val' to the sysfs attribute 'name' of array 'sra',
+ * or of its member device 'dev' when 'dev' is not NULL.
+ *
+ * Returns 0 when the whole string was written, -1 otherwise.  On a failed
+ * write, errno is left as write() set it.
+ */
+int sysfs_set_str(struct mdinfo *sra, struct mdinfo *dev,
+		  char *name, char *val)
+{
+	char fname[MAX_SYSFS_PATH_LEN];
+	size_t len = strlen(val);
+	ssize_t n;
+	int saved_errno;
+	int fd;
+
+	snprintf(fname, MAX_SYSFS_PATH_LEN, "/sys/block/%s/md/%s/%s",
+		 sra->sys_name, dev?dev->sys_name:"", name);
+	fd = open(fname, O_WRONLY);
+	if (fd < 0)
+		return -1;
+	n = write(fd, val, len);
+	/* close() may change errno; keep write()'s value for reporting */
+	saved_errno = errno;
+	close(fd);
+	if (n < 0 || (size_t)n != len) {
+		dprintf("failed to write '%s' to '%s' (%s)\n",
+			val, fname, strerror(saved_errno));
+		errno = saved_errno;
+		return -1;
+	}
+	return 0;
+}
+
+int sysfs_set_num(struct mdinfo *sra, struct mdinfo *dev,
+		  char *name, unsigned long long val)
+{
+	/* Format val as an unsigned decimal number and write it to
+	 * the attribute with sysfs_set_str(); returns its result.
+	 */
+	char text[50];
+
+	snprintf(text, sizeof(text), "%llu", val);
+	return sysfs_set_str(sra, dev, name, text);
+}
+
+int sysfs_set_num_signed(struct mdinfo *sra, struct mdinfo *dev,
+			 char *name, long long val)
+{
+	/* Format val as a signed decimal number and write it to
+	 * the attribute with sysfs_set_str(); returns its result.
+	 */
+	char text[50];
+
+	snprintf(text, sizeof(text), "%lli", val);
+	return sysfs_set_str(sra, dev, name, text);
+}
+
+int sysfs_uevent(struct mdinfo *sra, char *event)
+{
+	/* Write the string event to /sys/block/<sra->sys_name>/uevent.
+	 * Returns 0 when the whole string was written, -1 when the
+	 * file cannot be opened or the write fails or is short.
+	 * errno from a failed write() is preserved across close().
+	 */
+	char fname[MAX_SYSFS_PATH_LEN];
+	size_t len = strlen(event);
+	ssize_t n;
+	int saved_errno;
+	int fd;
+
+	snprintf(fname, MAX_SYSFS_PATH_LEN, "/sys/block/%s/uevent",
+		 sra->sys_name);
+	fd = open(fname, O_WRONLY);
+	if (fd < 0)
+		return -1;
+	n = write(fd, event, len);
+	/* close() may overwrite errno, so remember the write() result */
+	saved_errno = errno;
+	close(fd);
+	if (n < 0 || (size_t)n != len) {
+		dprintf("failed to write '%s' to '%s' (%s)\n",
+			event, fname, strerror(saved_errno));
+		errno = saved_errno;
+		return -1;
+	}
+	return 0;
+}
+
+int sysfs_attribute_available(struct mdinfo *sra, struct mdinfo *dev, char *name)
+{
+	/* Returns 1 if stat() succeeds on
+	 *   /sys/block/<sra->sys_name>/md/<dev->sys_name>/<name>
+	 * (device component empty when dev is NULL), otherwise 0.
+	 */
+	struct stat st;
+	char path[MAX_SYSFS_PATH_LEN];
+
+	snprintf(path, MAX_SYSFS_PATH_LEN, "/sys/block/%s/md/%s/%s",
+		 sra->sys_name, dev?dev->sys_name:"", name);
+
+	if (stat(path, &st) != 0)
+		return 0;
+	return 1;
+}
+
+int sysfs_get_fd(struct mdinfo *sra, struct mdinfo *dev,
+		 char *name)
+{
+	/* Open /sys/block/<sra->sys_name>/md/<dev->sys_name>/<name>
+	 * (device component empty when dev is NULL).
+	 * Read-write access is tried first, then read-only.
+	 * Returns the file descriptor, or a negative value if both
+	 * attempts fail.
+	 */
+	char path[MAX_SYSFS_PATH_LEN];
+	int fd;
+
+	snprintf(path, MAX_SYSFS_PATH_LEN, "/sys/block/%s/md/%s/%s",
+		 sra->sys_name, dev?dev->sys_name:"", name);
+
+	fd = open(path, O_RDWR);
+	if (fd >= 0)
+		return fd;
+	return open(path, O_RDONLY);
+}
+
+int sysfs_fd_get_ll(int fd, unsigned long long *val)
+{
+	/* Rewind fd, read it and parse one number (base chosen by
+	 * strtoull from the prefix) into *val.
+	 * Returns 0 on success, -2 if nothing could be read or the
+	 * content fills the whole buffer, -1 if the text does not
+	 * start with a number or the number is followed by anything
+	 * other than end-of-string, newline or space.
+	 */
+	char text[50];
+	char *end;
+	int len;
+
+	lseek(fd, 0, SEEK_SET);
+	len = read(fd, text, sizeof(text));
+	if (len <= 0)
+		return -2;
+	if (len == sizeof(text))
+		return -2;
+	text[len] = '\0';
+
+	*val = strtoull(text, &end, 0);
+	if (end == text)
+		return -1;
+
+	switch (*end) {
+	case '\0':
+	case '\n':
+	case ' ':
+		return 0;
+	default:
+		return -1;
+	}
+}
+
+int sysfs_get_ll(struct mdinfo *sra, struct mdinfo *dev,
+		 char *name, unsigned long long *val)
+{
+	/* Open the named attribute with sysfs_get_fd(), parse one
+	 * number from it with sysfs_fd_get_ll() and close it again.
+	 * Returns -1 if the attribute cannot be opened, otherwise
+	 * the result of sysfs_fd_get_ll().
+	 */
+	int fd = sysfs_get_fd(sra, dev, name);
+	int rv;
+
+	if (fd < 0)
+		return -1;
+	rv = sysfs_fd_get_ll(fd, val);
+	close(fd);
+	return rv;
+}
+
+int sysfs_fd_get_two(int fd, unsigned long long *v1, unsigned long long *v2)
+{
+	/* Parse a sysfs file holding two numbers, written either as
+	 *    NNN (NNN)
+	 * or
+	 *    NNN / NNN
+	 * Returns 2 when both numbers were parsed, 1 when only the
+	 * first was (*v2 is then set to *v1), -1 when the first
+	 * number is malformed and -2 when nothing could be read or
+	 * the content fills the whole buffer.
+	 */
+	char text[80];
+	char *first_end, *second_end;
+	int len;
+
+	lseek(fd, 0, SEEK_SET);
+	len = read(fd, text, sizeof(text));
+	if (len <= 0 || len == sizeof(text))
+		return -2;
+	text[len] = 0;
+
+	*v1 = strtoull(text, &first_end, 0);
+	if (first_end == text)
+		return -1;
+	if (!(*first_end == 0 || *first_end == '\n' || *first_end == ' '))
+		return -1;
+
+	/* step over the separator between the two numbers */
+	first_end += strspn(first_end, " /(");
+
+	*v2 = strtoull(first_end, &second_end, 0);
+	if (second_end != first_end &&
+	    (*second_end == 0 || *second_end == '\n' ||
+	     *second_end == ' ' || *second_end == ')'))
+		return 2;
+
+	*v2 = *v1;
+	return 1;
+}
+
+int sysfs_get_two(struct mdinfo *sra, struct mdinfo *dev,
+		  char *name, unsigned long long *v1, unsigned long long *v2)
+{
+	/* Open the named attribute with sysfs_get_fd(), parse two
+	 * numbers from it with sysfs_fd_get_two() and close it again.
+	 * Returns -1 if the attribute cannot be opened, otherwise
+	 * the result of sysfs_fd_get_two().
+	 */
+	int fd = sysfs_get_fd(sra, dev, name);
+	int rv;
+
+	if (fd < 0)
+		return -1;
+	rv = sysfs_fd_get_two(fd, v1, v2);
+	close(fd);
+	return rv;
+}
+
+int sysfs_fd_get_str(int fd, char *val, int size)
+{
+	/* Rewind fd and read its content into val (size bytes
+	 * available), adding a terminating NUL.
+	 * Returns the number of bytes read, or -1 if nothing could
+	 * be read or the content fills the whole buffer.
+	 */
+	int got;
+
+	lseek(fd, 0, SEEK_SET);
+	got = read(fd, val, size);
+	if (got > 0 && got != size) {
+		val[got] = 0;
+		return got;
+	}
+	return -1;
+}
+
+int sysfs_get_str(struct mdinfo *sra, struct mdinfo *dev,
+		  char *name, char *val, int size)
+{
+	/* Open the named attribute with sysfs_get_fd(), read its
+	 * content with sysfs_fd_get_str() and close it again.
+	 * Returns -1 if the attribute cannot be opened, otherwise
+	 * the result of sysfs_fd_get_str().
+	 */
+	int fd = sysfs_get_fd(sra, dev, name);
+	int rv;
+
+	if (fd < 0)
+		return -1;
+	rv = sysfs_fd_get_str(fd, val, size);
+	close(fd);
+	return rv;
+}
+
+int sysfs_set_safemode(struct mdinfo *sra, unsigned long ms)
+{
+	/* Write a delay given in milliseconds to the array's
+	 * 'safe_mode_delay' attribute, formatted as
+	 * "<seconds>.<milliseconds>".
+	 * Returns the result of sysfs_set_str().
+	 */
+	unsigned long sec;
+	unsigned long msec;
+	char delay[30];
+
+	sec = ms / 1000;
+	msec = ms % 1000;
+
+	/* the values are unsigned long, so they need %lu, not %ld;
+	 * the trailing '\n' is needed for kernels older than 2.6.28
+	 */
+	snprintf(delay, sizeof(delay), "%lu.%03lu\n", sec, msec);
+	return sysfs_set_str(sra, NULL, "safe_mode_delay", delay);
+}
+
+/*
+ * Write the array description held in 'info' to the md sysfs attributes
+ * of the array named by info->sys_name.
+ *
+ * Returns 1 if the external metadata version cannot be set, 0 if
+ * info->array.level is negative (nothing else is written in that case),
+ * otherwise the OR of the results of the individual attribute writes
+ * (0 when all of them succeeded).
+ *
+ * NOTE(review): 'vers' is only used in the (vers % 100) < 2 test below;
+ * presumably it is the md driver version - confirm against the callers.
+ */
+int sysfs_set_array(struct mdinfo *info, int vers)
+{
+ int rv = 0;
+ char ver[100];
+ int raid_disks = info->array.raid_disks;
+
+ ver[0] = 0;
+ /* major -1 / minor -2 is handled as externally managed metadata */
+ if (info->array.major_version == -1 &&
+ info->array.minor_version == -2) {
+ char buf[1024];
+
+ strcat(strcpy(ver, "external:"), info->text_version);
+
+ /* meta version might already be set if we are setting
+ * new geometry for a reshape. In that case we don't
+ * want to over-write the 'readonly' flag that is
+ * stored in the metadata version. So read the current
+ * version first, and preserve the flag
+ */
+ if (sysfs_get_str(info, NULL, "metadata_version",
+ buf, 1024) > 0)
+ /* index 9 is the character right after "external:" */
+ if (strlen(buf) >= 9 && buf[9] == '-')
+ ver[9] = '-';
+
+ if ((vers % 100) < 2 ||
+ sysfs_set_str(info, NULL, "metadata_version",
+ ver) < 0) {
+ pr_err("This kernel does not support external metadata.\n");
+ return 1;
+ }
+ }
+ if (info->array.level < 0)
+ return 0; /* FIXME */
+ rv |= sysfs_set_str(info, NULL, "level",
+ map_num(pers, info->array.level));
+ /* while a reshape is active, first write the disk count without
+ * delta_disks; info->array.raid_disks is written further down
+ */
+ if (info->reshape_active && info->delta_disks != UnSet)
+ raid_disks -= info->delta_disks;
+ rv |= sysfs_set_num(info, NULL, "raid_disks", raid_disks);
+ rv |= sysfs_set_num(info, NULL, "chunk_size", info->array.chunk_size);
+ rv |= sysfs_set_num(info, NULL, "layout", info->array.layout);
+ rv |= sysfs_set_num(info, NULL, "component_size", info->component_size/2);
+ if (info->custom_array_size) {
+ int rc;
+
+ rc = sysfs_set_num(info, NULL, "array_size",
+ info->custom_array_size/2);
+ /* a missing attribute is reported but not treated as failure */
+ if (rc && errno == ENOENT) {
+ pr_err("This kernel does not have the md/array_size attribute, the array may be larger than expected\n");
+ rc = 0;
+ }
+ rv |= rc;
+ }
+
+ if (info->array.level > 0)
+ rv |= sysfs_set_num(info, NULL, "resync_start", info->resync_start);
+
+ if (info->reshape_active) {
+ rv |= sysfs_set_num(info, NULL, "reshape_position",
+ info->reshape_progress);
+ rv |= sysfs_set_num(info, NULL, "chunk_size", info->new_chunk);
+ rv |= sysfs_set_num(info, NULL, "layout", info->new_layout);
+ rv |= sysfs_set_num(info, NULL, "raid_disks",
+ info->array.raid_disks);
+ /* We don't set 'new_level' here. That can only happen
+ * once the reshape completes.
+ */
+ }
+
+ /* a failure to select PPL is not added to rv: the policy in
+ * 'info' is downgraded to resync instead
+ */
+ if (info->consistency_policy == CONSISTENCY_POLICY_PPL) {
+ if (sysfs_set_str(info, NULL, "consistency_policy",
+ map_num(consistency_policies,
+ info->consistency_policy))) {
+ pr_err("This kernel does not support PPL. Falling back to consistency-policy=resync.\n");
+ info->consistency_policy = CONSISTENCY_POLICY_RESYNC;
+ }
+ }
+
+ return rv;
+}
+
+int sysfs_add_disk(struct mdinfo *sra, struct mdinfo *sd, int resume)
+{
+	/* Add the device described by sd to the array sra through
+	 * sysfs: write its "major:minor" to 'new_dev', record the
+	 * resulting "dev-<kernel name>" in sd->sys_name and then set
+	 * the per-device attributes (offset, size, PPL location,
+	 * state, slot, recovery_start, bad blocks).
+	 * When resume is set and sd->recovery_start is below
+	 * MaxSector, a test write of 0 to 'recovery_start' is done
+	 * first; if it fails the device is removed again.
+	 * Returns the 'new_dev' error if that write fails, -1 if the
+	 * test write fails or known bad blocks cannot be handed to
+	 * the kernel, otherwise the OR of the attribute write results
+	 * (0 when all of them succeeded).
+	 */
+	char dv[32];	/* "major:minor" */
+	char *dname;
+	int rv;
+	int i;
+
+	snprintf(dv, sizeof(dv), "%d:%d", sd->disk.major, sd->disk.minor);
+	rv = sysfs_set_str(sra, NULL, "new_dev", dv);
+	if (rv)
+		return rv;
+
+	dname = devid2kname(makedev(sd->disk.major, sd->disk.minor));
+	strcpy(sd->sys_name, "dev-");
+	strcpy(sd->sys_name+4, dname);
+
+	/* test write to see if 'recovery_start' is available */
+	if (resume && sd->recovery_start < MaxSector &&
+	    sysfs_set_num(sra, sd, "recovery_start", 0)) {
+		sysfs_set_str(sra, sd, "state", "remove");
+		return -1;
+	}
+
+	rv = sysfs_set_num(sra, sd, "offset", sd->data_offset);
+	rv |= sysfs_set_num(sra, sd, "size", (sd->component_size+1) / 2);
+	if (sra->array.level != LEVEL_CONTAINER) {
+		if (sra->consistency_policy == CONSISTENCY_POLICY_PPL) {
+			rv |= sysfs_set_num(sra, sd, "ppl_sector", sd->ppl_sector);
+			rv |= sysfs_set_num(sra, sd, "ppl_size", sd->ppl_size);
+		}
+		if (sd->recovery_start == MaxSector)
+			/* This can correctly fail if array isn't started,
+			 * yet, so just ignore status for now.
+			 */
+			sysfs_set_str(sra, sd, "state", "insync");
+		if (sd->disk.raid_disk >= 0)
+			rv |= sysfs_set_num(sra, sd, "slot", sd->disk.raid_disk);
+		if (resume)
+			sysfs_set_num(sra, sd, "recovery_start", sd->recovery_start);
+	}
+	if (sd->bb.supported) {
+		if (sysfs_set_str(sra, sd, "state", "external_bbl")) {
+			/*
+			 * backward compatibility - if kernel doesn't support
+			 * bad blocks for external metadata, let it continue
+			 * as long as there are none known so far
+			 */
+			if (sd->bb.count) {
+				pr_err("The kernel has no support for bad blocks in external metadata\n");
+				return -1;
+			}
+		}
+
+		/* hand every known bad block range to the kernel */
+		for (i = 0; i < sd->bb.count; i++) {
+			char s[30];
+			const struct md_bb_entry *entry = &sd->bb.entries[i];
+
+			snprintf(s, sizeof(s) - 1, "%llu %d\n", entry->sector,
+				 entry->length);
+			rv |= sysfs_set_str(sra, sd, "bad_blocks", s);
+		}
+	}
+	return rv;
+}
+
+/* Disabled code: everything up to the matching #endif is not compiled. */
+#if 0
+int sysfs_disk_to_sg(int fd)
+{
+ /* from an open block device, try find and open its corresponding
+ * scsi_generic interface
+ */
+ struct stat st;
+ char path[256];
+ char sg_path[256];
+ char sg_major_minor[10];
+ char *c;
+ DIR *dir;
+ struct dirent *de;
+ int major, minor, rv;
+
+ if (fstat(fd, &st))
+ return -1;
+
+ snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/device",
+ major(st.st_rdev), minor(st.st_rdev));
+
+ dir = opendir(path);
+ if (!dir)
+ return -1;
+
+ /* look for the first entry whose name starts with "scsi_generic:" */
+ de = readdir(dir);
+ while (de) {
+ if (strncmp("scsi_generic:", de->d_name,
+ strlen("scsi_generic:")) == 0)
+ break;
+ de = readdir(dir);
+ }
+ closedir(dir);
+
+ if (!de)
+ return -1;
+
+ /* NOTE(review): de->d_name is read here after closedir(dir);
+ * fix before re-enabling this code.
+ */
+ snprintf(sg_path, sizeof(sg_path), "%s/%s/dev", path, de->d_name);
+ fd = open(sg_path, O_RDONLY);
+ if (fd < 0)
+ return fd;
+
+ rv = read(fd, sg_major_minor, sizeof(sg_major_minor));
+ close(fd);
+ /* NOTE(review): rv == 0 is not rejected, so the store below
+ * would index sg_major_minor[-1].
+ */
+ if (rv < 0 || rv == sizeof(sg_major_minor))
+ return -1;
+ else
+ sg_major_minor[rv - 1] = '\0';
+
+ /* NOTE(review): the strchr() result is used without a NULL check */
+ c = strchr(sg_major_minor, ':');
+ *c = '\0';
+ c++;
+ major = strtol(sg_major_minor, NULL, 10);
+ minor = strtol(c, NULL, 10);
+ /* create a temporary char device node, open it, and unlink it
+ * again right away so only the open descriptor remains
+ */
+ snprintf(path, sizeof(path), "/dev/.tmp.md.%d:%d:%d",
+ (int) getpid(), major, minor);
+ if (mknod(path, S_IFCHR|0600, makedev(major, minor))==0) {
+ fd = open(path, O_RDONLY);
+ unlink(path);
+ return fd;
+ }
+
+ return -1;
+}
+#endif
+
+int sysfs_disk_to_scsi_id(int fd, __u32 *id)
+{
+	/* From an open block device, work out a scsi id.
+	 * The first directory below
+	 *   /sys/dev/block/<major>:<minor>/device/scsi_device
+	 * whose name parses as four ':'-separated numbers is used;
+	 * the numbers are packed into *id one byte each, first
+	 * number in the top byte.
+	 * Returns 0 on success and 1 on any failure.
+	 */
+	int host, bus, target, lun;
+	struct dirent *de;
+	char path[256];
+	struct stat st;
+	DIR *dir;
+
+	if (fstat(fd, &st))
+		return 1;
+
+	snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/device/scsi_device",
+		 major(st.st_rdev), minor(st.st_rdev));
+
+	dir = opendir(path);
+	if (!dir)
+		return 1;
+
+	de = readdir(dir);
+	while (de) {
+		if (de->d_type == DT_DIR &&
+		    sscanf(de->d_name, "%d:%d:%d:%d",
+			   &host, &bus, &target, &lun) == 4)
+			break;
+		de = readdir(dir);
+	}
+	closedir(dir);
+
+	/* de is only tested against NULL here, never dereferenced */
+	if (!de)
+		return 1;
+
+	*id = (host << 24) | (bus << 16) | (target << 8) | (lun << 0);
+	return 0;
+}
+
+int sysfs_unique_holder(char *devnm, long rdev)
+{
+	/* Check that devnm is a holder of rdev,
+	 * and is the only holder.
+	 * we should be locked against races by
+	 * an O_EXCL on devnm
+	 * Return values:
+	 *  0 - not unique, not even a holder
+	 *  1 - unique, this is the only holder.
+	 *  2/3 - not unique, there is another holder
+	 * -1 - error, cannot find the holders
+	 */
+	DIR *dir;
+	struct dirent *de;
+	char dirname[100];
+	/* room for the directory, a '/' and any entry name */
+	char path[sizeof(dirname) + sizeof(de->d_name) + 1];
+	int ret = 0;
+
+	snprintf(dirname, sizeof(dirname), "/sys/dev/block/%d:%d/holders",
+		 major(rdev), minor(rdev));
+	dir = opendir(dirname);
+	if (!dir)
+		return -1;
+	while ((de = readdir(dir)) != NULL) {
+		char buf[100];
+		char *sl;
+		int n;
+
+		if (de->d_ino == 0)
+			continue;
+		if (de->d_name[0] == '.')
+			continue;
+		/* build the entry path in its own bounded buffer rather
+		 * than appending to dirname without a length check
+		 */
+		n = snprintf(path, sizeof(path), "%s/%s", dirname, de->d_name);
+		if (n < 0 || (size_t)n >= sizeof(path))
+			continue;
+		n = readlink(path, buf, sizeof(buf)-1);
+		if (n <= 0)
+			continue;
+		buf[n] = 0;
+		/* the holder's name is the last component of the link target */
+		sl = strrchr(buf, '/');
+		if (!sl)
+			continue;
+		sl++;
+
+		if (strcmp(devnm, sl) == 0)
+			ret |= 1;
+		else
+			ret |= 2;
+	}
+	closedir(dir);
+	return ret;
+}
+
+int sysfs_freeze_array(struct mdinfo *sra)
+{
+	/* Try to freeze resync/rebuild on this array/container by
+	 * writing "frozen" to 'sync_action'.
+	 * Returns:
+	 *   1 - 'sync_action' does not exist (treated as frozen), or
+	 *       "frozen" was written successfully
+	 *   0 - 'sync_action' could not be read, already reads
+	 *       "frozen", or the write failed
+	 *  -1 - the array is busy: 'sync_action' is neither "idle"
+	 *       nor "recover"
+	 */
+	char action[20];
+
+	if (!sysfs_attribute_available(sra, NULL, "sync_action"))
+		return 1; /* no sync_action == frozen */
+
+	if (sysfs_get_str(sra, NULL, "sync_action", action, 20) <= 0)
+		return 0;
+
+	if (!strcmp(action, "frozen\n"))
+		return 0; /* Already frozen */
+
+	if (!(strcmp(action, "idle\n") == 0 ||
+	      strcmp(action, "recover\n") == 0))
+		return -1;
+
+	return sysfs_set_str(sra, NULL, "sync_action", "frozen") < 0 ? 0 : 1;
+}
+
+int sysfs_wait(int fd, int *msec)
+{
+	/* Wait up to '*msec' milliseconds for fd to have an exception
+	 * condition.
+	 * if msec == NULL, wait indefinitely.
+	 * If *msec is negative, return 0 at once without waiting.
+	 * Otherwise *msec is reduced by the time spent waiting plus
+	 * one millisecond, so it can be passed to the next call.
+	 * Returns the result of select().
+	 */
+	fd_set fds;
+	int n;
+	FD_ZERO(&fds);
+	FD_SET(fd, &fds);
+	if (msec == NULL)
+		n = select(fd+1, NULL, NULL, &fds, NULL);
+	else if (*msec < 0)
+		n = 0;
+	else {
+		struct timeval start, end, tv;
+		gettimeofday(&start, NULL);
+		/* keep the sub-second part for timeouts of a second or
+		 * more, so select() does not time out early
+		 */
+		tv.tv_sec = (*msec) / 1000;
+		tv.tv_usec = ((*msec) % 1000) * 1000;
+		n = select(fd+1, NULL, NULL, &fds, &tv);
+		gettimeofday(&end, NULL);
+		end.tv_sec -= start.tv_sec;
+		*msec -= (end.tv_sec * 1000 + end.tv_usec/1000
+			  - start.tv_usec/1000) + 1;
+	}
+	return n;
+}
+
+int sysfs_rules_apply_check(const struct mdinfo *sra,
+			    const struct sysfs_entry *ent)
+{
+	/* Check that the attribute named by ent exists and, once
+	 * symlinks and ".." are resolved, lies below the array's
+	 * /sys/block/<sra->sys_name>/md/ directory.
+	 * Returns 0 if it does, -1 otherwise (including NULL
+	 * arguments and over-long paths).
+	 */
+	char fname[MAX_SYSFS_PATH_LEN];
+	char dname[MAX_SYSFS_PATH_LEN];
+	char resolved_path[PATH_MAX];
+	char resolved_dir[PATH_MAX];
+	size_t dir_len;
+	int result;
+
+	if (sra == NULL || ent == NULL)
+		return -1;
+
+	result = snprintf(dname, MAX_SYSFS_PATH_LEN,
+			  "/sys/block/%s/md/", sra->sys_name);
+	if (result < 0 || result >= MAX_SYSFS_PATH_LEN)
+		return -1;
+
+	result = snprintf(fname, MAX_SYSFS_PATH_LEN,
+			  "%s/%s", dname, ent->name);
+	if (result < 0 || result >= MAX_SYSFS_PATH_LEN)
+		return -1;
+
+	/* realpath() fails if the path does not exist */
+	if (realpath(fname, resolved_path) == NULL ||
+	    realpath(dname, resolved_dir) == NULL)
+		return -1;
+
+	dir_len = strnlen(resolved_dir, PATH_MAX);
+	if (strncmp(resolved_dir, resolved_path, dir_len) != 0)
+		return -1;
+
+	/* A bare prefix match would also accept a sibling such as
+	 * ".../mdX/attr" for the directory ".../md": the prefix must
+	 * be followed by a path separator.  (The root directory
+	 * already ends in '/'.)
+	 */
+	if (dir_len > 1 && resolved_path[dir_len] != '/')
+		return -1;
+
+	return 0;
+}
+
+/* List of rules collected by sysfsline(), most recently added first. */
+static struct dev_sysfs_rule *sysfs_rules;
+
+void sysfs_rules_apply(char *devnm, struct mdinfo *dev)
+{
+	/* Apply every stored rule that matches this array.
+	 * A rule with a uuid matches on dev->uuid; otherwise a rule
+	 * with a device name matches on devnm.  Each entry of a
+	 * matching rule is validated with sysfs_rules_apply_check()
+	 * and then written with sysfs_set_str().
+	 */
+	struct dev_sysfs_rule *rule;
+
+	for (rule = sysfs_rules; rule; rule = rule->next) {
+		struct sysfs_entry *ent;
+		int match;
+
+		if (rule->uuid_set)
+			match = memcmp(dev->uuid, rule->uuid,
+				       sizeof(int[4])) == 0;
+		else if (rule->devname)
+			match = strcmp(devnm, rule->devname) == 0;
+		else
+			match = 0;
+
+		if (!match)
+			continue;
+
+		for (ent = rule->entry; ent; ent = ent->next) {
+			if (sysfs_rules_apply_check(dev, ent) < 0) {
+				pr_err("SYSFS: failed to write '%s' to '%s'\n",
+				       ent->value, ent->name);
+				continue;
+			}
+			sysfs_set_str(dev, NULL, ent->name, ent->value);
+		}
+	}
+}
+
+static void sysfs_rule_free(struct dev_sysfs_rule *rule)
+{
+	/* Free a chain of rules starting at rule: for each one its
+	 * entries (name, value and the entry itself), its device
+	 * name and the rule structure.
+	 */
+	struct dev_sysfs_rule *next_rule;
+
+	for (; rule; rule = next_rule) {
+		struct sysfs_entry *ent = rule->entry;
+
+		next_rule = rule->next;
+
+		while (ent) {
+			struct sysfs_entry *next_ent = ent->next;
+
+			free(ent->name);
+			free(ent->value);
+			free(ent);
+			ent = next_ent;
+		}
+
+		/* free(NULL) is a no-op, so no guard is needed */
+		free(rule->devname);
+		free(rule);
+	}
+}
+
+void sysfsline(char *line)
+{
+	/* Turn the words of one SYSFS config line into a rule and
+	 * put it at the head of sysfs_rules.
+	 *   name=/dev/md/...  selects the array by device name
+	 *   uuid=...          selects the array by uuid
+	 *   <attr>=<value>    attribute to set on a matching array
+	 * A line giving neither a valid name nor a valid uuid is
+	 * reported and dropped.
+	 */
+	struct dev_sysfs_rule *rule = xcalloc(1, sizeof(*rule));
+	char *word;
+
+	for (word = dl_next(line); word != line; word = dl_next(word)) {
+		struct sysfs_entry *prop;
+		char *eq;
+
+		if (strncasecmp(word, "name=", 5) == 0) {
+			char *devname = word + 5;
+
+			if (strncmp(devname, "/dev/md/", 8) != 0) {
+				pr_err("%s is an invalid name for an md device - ignored.\n",
+				       devname);
+			} else if (rule->devname) {
+				pr_err("Only give one device per SYSFS line: %s\n",
+				       devname);
+			} else {
+				rule->devname = xstrdup(devname);
+			}
+			continue;
+		}
+
+		if (strncasecmp(word, "uuid=", 5) == 0) {
+			char *uuid = word + 5;
+
+			if (rule->uuid_set) {
+				pr_err("Only give one uuid per SYSFS line: %s\n",
+				       uuid);
+			} else if (parse_uuid(uuid, rule->uuid) &&
+				   memcmp(rule->uuid, uuid_zero,
+					  sizeof(int[4])) != 0) {
+				rule->uuid_set = 1;
+			} else {
+				pr_err("Invalid uuid: %s\n", uuid);
+			}
+			continue;
+		}
+
+		/* anything else has to look like attr=value */
+		eq = strchr(word, '=');
+		if (eq == NULL || eq[1] == 0) {
+			pr_err("Cannot parse \"%s\" - ignoring.\n", word);
+			continue;
+		}
+
+		prop = xmalloc(sizeof(*prop));
+		prop->value = xstrdup(eq + 1);
+		/* cut the word at '=' so that it holds just the name */
+		*eq = 0;
+		prop->name = xstrdup(word);
+		prop->next = rule->entry;
+		rule->entry = prop;
+	}
+
+	if (!rule->devname && !rule->uuid_set) {
+		pr_err("Device name not found in sysfs config entry - ignoring.\n");
+		sysfs_rule_free(rule);
+		return;
+	}
+
+	rule->next = sysfs_rules;
+	sysfs_rules = rule;
+}
diff --git a/systemd/SUSE-mdadm_env.sh b/systemd/SUSE-mdadm_env.sh
new file mode 100644
index 0000000..c13b48a
--- /dev/null
+++ b/systemd/SUSE-mdadm_env.sh
@@ -0,0 +1,48 @@
+#!/bin/sh
+
+# extract configuration from /etc/sysconfig/mdadm and write
+# environment to /run/sysconfig/mdadm to be used by
+# systemd unit files.  Values are quoted in tests so they may contain spaces.
+
+MDADM_SCAN="yes"
+
+# Following adapted from /etc/init.d/mdadmd on openSUSE
+
+mdadmd_CONFIG=/etc/sysconfig/mdadm
+if test -r "$mdadmd_CONFIG"; then
+    . "$mdadmd_CONFIG"
+fi
+
+if [ -n "$MDADM_DELAY" ]; then
+    MDADM_DELAY="-d $MDADM_DELAY"
+fi
+
+if [ -n "$MDADM_MAIL" ]; then
+    MDADM_MAIL="-m \"$MDADM_MAIL\""
+fi
+
+if [ -n "$MDADM_PROGRAM" ]; then
+    MDADM_PROGRAM="-p \"$MDADM_PROGRAM\""
+fi
+
+if [ "x$MDADM_SCAN" = "xyes" ]; then
+    MDADM_SCAN="--scan"
+else
+    MDADM_SCAN=""
+fi
+
+if [ "x$MDADM_SEND_MAIL_ON_START" = "xyes" ]; then
+    MDADM_SEND_MAIL="-t"
+else
+    MDADM_SEND_MAIL=""
+fi
+
+if [ -n "$MDADM_CONFIG" ]; then
+    MDADM_CONFIG="-c \"$MDADM_CONFIG\""
+fi
+
+mkdir -p /run/sysconfig
+echo "MDADM_MONITOR_ARGS=$MDADM_RAIDDEVICES $MDADM_DELAY $MDADM_MAIL $MDADM_PROGRAM $MDADM_SCAN $MDADM_SEND_MAIL $MDADM_CONFIG" > /run/sysconfig/mdadm
+if [ -n "$MDADM_CHECK_DURATION" ]; then
+    echo "MDADM_CHECK_DURATION=$MDADM_CHECK_DURATION" >> /run/sysconfig/mdadm
+fi
diff --git a/systemd/mdadm-grow-continue@.service b/systemd/mdadm-grow-continue@.service
new file mode 100644
index 0000000..5c667d2
--- /dev/null
+++ b/systemd/mdadm-grow-continue@.service
@@ -0,0 +1,17 @@
+# This file is part of mdadm.
+#
+# mdadm is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+
+[Unit]
+Description=Manage MD Reshape on /dev/%I
+DefaultDependencies=no
+
+[Service]
+ExecStart=BINDIR/mdadm --grow --continue /dev/%I
+StandardInput=null
+StandardOutput=null
+StandardError=null
+KillMode=none
diff --git a/systemd/mdadm-last-resort@.service b/systemd/mdadm-last-resort@.service
new file mode 100644
index 0000000..efeb3f6
--- /dev/null
+++ b/systemd/mdadm-last-resort@.service
@@ -0,0 +1,8 @@
+[Unit]
+Description=Activate md array %I even though degraded
+DefaultDependencies=no
+ConditionPathExists=!/sys/devices/virtual/block/%i/md/sync_action
+
+[Service]
+Type=oneshot
+ExecStart=BINDIR/mdadm --run /dev/%i
diff --git a/systemd/mdadm-last-resort@.timer b/systemd/mdadm-last-resort@.timer
new file mode 100644
index 0000000..45ad223
--- /dev/null
+++ b/systemd/mdadm-last-resort@.timer
@@ -0,0 +1,7 @@
+[Unit]
+Description=Timer to wait for more drives before activating degraded array %I.
+DefaultDependencies=no
+Conflicts=sys-devices-virtual-block-%i.device
+
+[Timer]
+OnActiveSec=30
diff --git a/systemd/mdadm.shutdown b/systemd/mdadm.shutdown
new file mode 100644
index 0000000..33f2778
--- /dev/null
+++ b/systemd/mdadm.shutdown
@@ -0,0 +1,4 @@
+#!/bin/sh
+# We need to ensure all md arrays with external metadata
+# (e.g. IMSM, DDF) are clean before completing the shutdown.
+BINDIR/mdadm --wait-clean --scan
diff --git a/systemd/mdcheck_continue.service b/systemd/mdcheck_continue.service
new file mode 100644
index 0000000..854317f
--- /dev/null
+++ b/systemd/mdcheck_continue.service
@@ -0,0 +1,17 @@
+# This file is part of mdadm.
+#
+# mdadm is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+
+[Unit]
+Description=MD array scrubbing - continuation
+ConditionPathExistsGlob = /var/lib/mdcheck/MD_UUID_*
+
+[Service]
+Type=oneshot
+Environment="MDADM_CHECK_DURATION=6 hours"
+EnvironmentFile=-/run/sysconfig/mdadm
+ExecStartPre=-/usr/lib/mdadm/mdadm_env.sh
+ExecStart=/usr/share/mdadm/mdcheck --continue --duration ${MDADM_CHECK_DURATION}
diff --git a/systemd/mdcheck_continue.timer b/systemd/mdcheck_continue.timer
new file mode 100644
index 0000000..dba1074
--- /dev/null
+++ b/systemd/mdcheck_continue.timer
@@ -0,0 +1,15 @@
+# This file is part of mdadm.
+#
+# mdadm is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+
+[Unit]
+Description=MD array scrubbing - continuation
+
+[Timer]
+OnCalendar= 1:05:00
+
+[Install]
+WantedBy= mdmonitor.service
diff --git a/systemd/mdcheck_start.service b/systemd/mdcheck_start.service
new file mode 100644
index 0000000..3bb3d13
--- /dev/null
+++ b/systemd/mdcheck_start.service
@@ -0,0 +1,17 @@
+# This file is part of mdadm.
+#
+# mdadm is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+
+[Unit]
+Description=MD array scrubbing
+Wants=mdcheck_continue.timer
+
+[Service]
+Type=oneshot
+Environment="MDADM_CHECK_DURATION=6 hours"
+EnvironmentFile=-/run/sysconfig/mdadm
+ExecStartPre=-/usr/lib/mdadm/mdadm_env.sh
+ExecStart=/usr/share/mdadm/mdcheck --duration ${MDADM_CHECK_DURATION}
diff --git a/systemd/mdcheck_start.timer b/systemd/mdcheck_start.timer
new file mode 100644
index 0000000..9e7e02a
--- /dev/null
+++ b/systemd/mdcheck_start.timer
@@ -0,0 +1,16 @@
+# This file is part of mdadm.
+#
+# mdadm is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+
+[Unit]
+Description=MD array scrubbing
+
+[Timer]
+OnCalendar=Sun *-*-1..7 1:00:00
+
+[Install]
+WantedBy= mdmonitor.service
+Also= mdcheck_continue.timer
diff --git a/systemd/mdmon@.service b/systemd/mdmon@.service
new file mode 100644
index 0000000..85a3a7c
--- /dev/null
+++ b/systemd/mdmon@.service
@@ -0,0 +1,28 @@
+# This file is part of mdadm.
+#
+# mdadm is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+
+[Unit]
+Description=MD Metadata Monitor on /dev/%I
+DefaultDependencies=no
+Before=initrd-switch-root.target
+
+[Service]
+# mdmon should never complain due to lack of a platform,
+# that is mdadm's job if at all.
+Environment=IMSM_NO_PLATFORM=1
+# The mdmon starting in the initramfs (with dracut at least)
+# cannot see sysfs after root is mounted, so we will have to
+# 'takeover'. As the '--offroot --takeover' don't hurt when
+# not necessary, and are useful with root-on-md in dracut,
+# have them always present.
+ExecStart=BINDIR/mdmon --offroot --takeover %I
+Type=forking
+# Don't set the PIDFile. It isn't necessary (systemd can work
+# it out) and systemd will remove it when transitioning from
+# initramfs to rootfs.
+#PIDFile=/run/mdadm/%I.pid
+KillMode=none
diff --git a/systemd/mdmonitor-oneshot.service b/systemd/mdmonitor-oneshot.service
new file mode 100644
index 0000000..373955a
--- /dev/null
+++ b/systemd/mdmonitor-oneshot.service
@@ -0,0 +1,15 @@
+# This file is part of mdadm.
+#
+# mdadm is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+
+[Unit]
+Description=Reminder for degraded MD arrays
+
+[Service]
+Environment=MDADM_MONITOR_ARGS=--scan
+EnvironmentFile=-/run/sysconfig/mdadm
+ExecStartPre=-/usr/lib/mdadm/mdadm_env.sh
+ExecStart=BINDIR/mdadm --monitor --oneshot $MDADM_MONITOR_ARGS
diff --git a/systemd/mdmonitor-oneshot.timer b/systemd/mdmonitor-oneshot.timer
new file mode 100644
index 0000000..cb54bda
--- /dev/null
+++ b/systemd/mdmonitor-oneshot.timer
@@ -0,0 +1,15 @@
+# This file is part of mdadm.
+#
+# mdadm is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+
+[Unit]
+Description=Reminder for degraded MD arrays
+
+[Timer]
+OnCalendar= 2:00:00
+
+[Install]
+WantedBy= mdmonitor.service
diff --git a/systemd/mdmonitor.service b/systemd/mdmonitor.service
new file mode 100644
index 0000000..46f7b88
--- /dev/null
+++ b/systemd/mdmonitor.service
@@ -0,0 +1,16 @@
+# This file is part of mdadm.
+#
+# mdadm is free software; you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+
+[Unit]
+Description=MD array monitor
+DefaultDependencies=no
+
+[Service]
+Environment= MDADM_MONITOR_ARGS=--scan
+EnvironmentFile=-/run/sysconfig/mdadm
+ExecStartPre=-/usr/lib/mdadm/mdadm_env.sh
+ExecStart=BINDIR/mdadm --monitor $MDADM_MONITOR_ARGS
diff --git a/test b/test
new file mode 100755
index 0000000..711a3c7
--- /dev/null
+++ b/test
@@ -0,0 +1,283 @@
+#!/bin/bash
+#
+# run test suite for mdadm
+mdadm=$PWD/mdadm
+targetdir="/var/tmp"
+logdir="$targetdir"
+config=/tmp/mdadm.conf
+testdir=$PWD/tests
+devlist=
+
+savelogs=0
+exitonerror=1
+prefix='[0-9][0-9]'
+
+# use loop devices by default if doesn't specify --dev
+DEVTYPE=loop
+INTEGRITY=yes
+LVM_VOLGROUP=mdtest
+
+# make sure to test local mdmon, not system one
+export MDADM_NO_SYSTEMCTL=1
+
+# assume md0, md1, md2 exist in /dev
+md0=/dev/md0
+md1=/dev/md1
+md2=/dev/md2
+mdp0=/dev/md_d0
+mdp1=/dev/md_d1
+
+# Print an error message built from all arguments, call 'save_log fail'
+# and terminate the test run with exit status 2.
+die() {
+ echo -e "\n\tERROR: $* \n"
+ save_log fail
+ exit 2
+}
+
+# Turn exit-on-error back on, so the run stops at the next failing test.
+# NOTE(review): presumably installed as an interrupt trap elsewhere - confirm.
+ctrl_c() {
+ exitonerror=1
+}
+
+# mdadm always adds --quiet, and we want to see any unexpected messages
+# Wrapper around the binary in $mdadm: stderr is captured in
+# $targetdir/stderr and replayed on our stderr afterwards; the return
+# value is the exit status of the real mdadm command.
+mdadm() {
+ rm -f $targetdir/stderr
+ # when any argument contains -S: let udev settle, remember the
+ # current speed_limit_max in $p and set it to 20000
+ case $* in
+ *-S* )
+ udevadm settle
+ p=`cat /proc/sys/dev/raid/speed_limit_max`
+ echo 20000 > /proc/sys/dev/raid/speed_limit_max
+ ;;
+ esac
+ case $* in
+ *-C* | *--create* | *-B* | *--build* )
+ # clear superblock every time once creating or
+ # building arrays, because it's always creating
+ # and building array many times in a test case.
+ for args in $*
+ do
+ # only arguments naming a /dev/ path that does not contain "md"
+ [[ $args =~ "/dev/" ]] && {
+ [[ $args =~ "md" ]] ||
+ $mdadm --zero $args > /dev/null
+ }
+ done
+ $mdadm 2> $targetdir/stderr --quiet "$@" --auto=yes
+ ;;
+ * )
+ $mdadm 2> $targetdir/stderr --quiet "$@"
+ ;;
+ esac
+ rv=$?
+ # restore the speed limit saved above
+ case $* in
+ *-S* )
+ udevadm settle
+ echo $p > /proc/sys/dev/raid/speed_limit_max
+ ;;
+ esac
+ cat >&2 $targetdir/stderr
+ return $rv
+}
+
+# Run one test script ($1) if it is a regular file.
+# The script's output goes to $targetdir/log.  A failing script has its
+# log saved with 'save_log fail' and, when $exitonerror is 1, ends the
+# whole run with exit status 1.  With $savelogs set to 1 the log is
+# moved to $logdir/<script name>.log.
+do_test() {
+ _script=$1
+ _basename=`basename $_script`
+ if [ -f "$_script" ]
+ then
+ rm -f $targetdir/stderr
+ # this might have been reset: restore the default.
+ echo 2000 > /proc/sys/dev/raid/speed_limit_max
+ do_clean
+ # source script in a subshell, so it has access to our
+ # namespace, but cannot change it.
+ echo -ne "$_script... "
+ if ( set -ex ; . $_script ) &> $targetdir/log
+ then
+ # a passing script still fails the run if dmesg shows trouble
+ dmesg | grep -iq "error\|call trace\|segfault" &&
+ die "dmesg prints errors when testing $_basename!"
+ echo "succeeded"
+ _fail=0
+ else
+ save_log fail
+ _fail=1
+ fi
+ [ "$savelogs" == "1" ] &&
+ mv -f $targetdir/log $logdir/$_basename.log
+ [ "$_fail" == "1" -a "$exitonerror" == "1" ] && exit 1
+ fi
+}
+
+# Print the usage text for this script on stdout.
+do_help() {
+ cat <<-EOF
+ Usage: $0 [options]
+ Example for disk mode: ./test --dev=disk --disks=/dev/sda{2..15}
+ Options:
+ --tests=test1,test2,... Comma separated list of tests to run
+ --testdir= Specify testdir as tests|clustermd_tests
+ --raidtype= raid0|linear|raid1|raid456|raid10|ddf|imsm
+ --disable-multipath Disable any tests involving multipath
+ --disable-integrity Disable slow tests of RAID[56] consistency
+ --logdir=directory Directory to save all logfiles in
+ --save-logs Usually use with --logdir together
+ --keep-going | --no-error Don't stop on error, ie. run all tests
+ --dev=loop|lvm|ram|disk Use loop devices (default), LVM, RAM or disk
+ --disks= Provide a bunch of physical devices for test
+ --volgroup=name LVM volume group for LVM test
+ setup Setup test environment and exit
+ cleanup Cleanup test environment
+ prefix Run tests with <prefix>
+ --help | -h Print this usage
+ EOF
+}
+
+# Parse the command line in two passes.
+# The first pass only looks at --testdir= so that func.sh can be sourced
+# from the selected test directory; the second pass handles every other
+# argument.  Unknown arguments print the usage text and exit with 1.
+parse_args() {
+ # pass 1: pick the test directory
+ for i in $*
+ do
+ case $i in
+ --testdir=* )
+ case ${i##*=} in
+ tests )
+ testdir=tests
+ ;;
+ clustermd_tests )
+ testdir=clustermd_tests
+ CLUSTER_CONF="$PWD/$testdir/cluster_conf"
+ ;;
+ * )
+ echo "Unknown argument: $i"
+ do_help
+ exit 1
+ ;;
+ esac
+ ;;
+ esac
+ done
+ [ -z "$testdir" ] && testdir=tests
+ # helper functions used below (do_setup, cleanup, ...) are not defined
+ # in this file; presumably they come from func.sh - confirm
+ . $testdir/func.sh
+ # pass 2: everything else
+ for i in $*
+ do
+ case $i in
+ # two digits: only run tests whose name starts with them
+ [0-9][0-9] )
+ prefix=$i
+ ;;
+ setup )
+ echo "mdadm test environment setup"
+ do_setup
+ trap 0
+ exit 0
+ ;;
+ cleanup )
+ cleanup
+ exit 0
+ ;;
+ # already handled in pass 1
+ --testdir=* )
+ ;;
+ --tests=* )
+ TESTLIST=($(echo ${i##*=} | sed -e 's/,/ /g'))
+ ;;
+ # select tests by matching their file names in $testdir
+ --raidtype=* )
+ case ${i##*=} in
+ raid0 )
+ TESTLIST=($(ls $testdir | grep "[0-9][0-9]r0\|raid0"))
+ ;;
+ linear )
+ TESTLIST=($(ls $testdir | grep "linear"))
+ ;;
+ raid1 )
+ TESTLIST=($(ls $testdir | grep "[0-9][0-9]r1\|raid1" | grep -vi "r10\|raid10"))
+ ;;
+ raid456 )
+ TESTLIST=($(ls $testdir | grep "[0-9][0-9]r[4-6]\|raid[4-6]"))
+ ;;
+ raid10 )
+ TESTLIST=($(ls $testdir | grep "[0-9][0-9]r10\|raid10"))
+ ;;
+ ddf )
+ TESTLIST=($(ls $testdir | grep "[0-9][0-9]ddf"))
+ ;;
+ imsm )
+ TESTLIST=($(ls $testdir | grep "[0-9][0-9]imsm"))
+ ;;
+ * )
+ echo "Unknown argument: $i"
+ do_help
+ exit 1
+ ;;
+ esac
+ ;;
+ --logdir=* )
+ logdir="${i##*=}"
+ ;;
+ --save-logs )
+ savelogs=1
+ ;;
+ --keep-going | --no-error )
+ exitonerror=0
+ ;;
+ --disable-multipath )
+ unset MULTIPATH
+ ;;
+ --disable-integrity )
+ unset INTEGRITY
+ ;;
+ --dev=* )
+ case ${i##*=} in
+ loop )
+ DEVTYPE=loop
+ ;;
+ lvm )
+ DEVTYPE=lvm
+ ;;
+ ram )
+ DEVTYPE=ram
+ ;;
+ disk )
+ DEVTYPE=disk
+ ;;
+ * )
+ echo "Unknown argument: $i"
+ do_help
+ exit 1
+ ;;
+ esac
+ ;;
+ # may be given several times; the values accumulate in $disks
+ --disks=* )
+ disks=(${disks[*]} ${i##*=})
+ ;;
+ --volgroup=* )
+ LVM_VOLGROUP=`expr "x$i" : 'x[^=]*=\(.*\)'`
+ ;;
+ --help | -h )
+ do_help
+ exit 0
+ ;;
+ * )
+ echo " $0: Unknown argument: $i"
+ do_help
+ exit 1
+ ;;
+ esac
+ done
+}
+
+# Set up the test environment and run the selected tests through do_test:
+# the scripts named in TESTLIST when it is set, otherwise every script in
+# $testdir whose name starts with $prefix and does not end in '~'.
+# Exits with status 0 once all tests have been run.
+main() {
+ do_setup
+
+ echo "Testing on linux-$(uname -r) kernel"
+ [ "$savelogs" == "1" ] &&
+ echo "Saving logs to $logdir"
+ if [ "x$TESTLIST" != "x" ]
+ then
+ for script in ${TESTLIST[@]}
+ do
+ do_test $testdir/$script
+ done
+ else
+ for script in $testdir/$prefix $testdir/$prefix*[^~]
+ do
+ do_test $script
+ done
+ fi
+
+ exit 0
+}
+
+parse_args $@
+main
diff --git a/tests/00linear b/tests/00linear
new file mode 100644
index 0000000..e3ac655
--- /dev/null
+++ b/tests/00linear
@@ -0,0 +1,25 @@
+# linear arrays: default, 0.90 and 1.0 superblocks, then built without one
+# create a simple linear
+
+mdadm -CR $md0 -l linear -n3 $dev0 $dev1 $dev2
+check linear
+testdev $md0 3 $mdsize2_l 1
+mdadm -S $md0
+
+# now with version-0.90 superblock
+mdadm -CR $md0 -e0.90 --level=linear -n4 $dev0 $dev1 $dev2 $dev3
+check linear
+testdev $md0 4 $mdsize0 1
+mdadm -S $md0
+
+# now with version-1.0 superblock
+mdadm -CR $md0 -e1.0 --level=linear -n4 $dev0 $dev1 $dev2 $dev3
+check linear
+testdev $md0 4 $mdsize1 1
+mdadm -S $md0
+
+# now with no superblock
+mdadm -B $md0 -l linear -n5 $dev0 $dev1 $dev2 $dev3 $dev4
+check linear
+testdev $md0 5 $size 64
+mdadm -S $md0
diff --git a/tests/00multipath b/tests/00multipath
new file mode 100644
index 0000000..84e4d69
--- /dev/null
+++ b/tests/00multipath
@@ -0,0 +1,29 @@
+
+#
+# create a multipath array, then fail, remove and re-add its paths
+
+if [ "$MULTIPATH" != "yes" ]; then
+ echo -ne 'skipping... '
+ exit 0
+fi
+
+mdadm -CR $md1 -l multipath -n2 $path0 $path1
+
+testdev $md1 1 $mdsize12 1
+
+mdadm $md1 -f $path0
+rotest $md1
+testdev $md1 1 $mdsize12 1
+
+mdadm $md1 -r $path0
+mdadm $md1 -a $path0
+
+rotest $md1
+testdev $md1 1 $mdsize12 1
+
+mdadm $md1 -f $path1
+mdadm $md1 -r $path1
+rotest $md1
+testdev $md1 1 $mdsize12 1
+
+mdadm -S $md1
diff --git a/tests/00names b/tests/00names
new file mode 100644
index 0000000..7a066d8
--- /dev/null
+++ b/tests/00names
@@ -0,0 +1,13 @@
+set -x -e
+
+# create arrays with non-numeric names
+conf=$targetdir/mdadm.conf
+echo "CREATE names=yes" > $conf
+
+for i in linear raid0 raid1 raid4 raid5 raid6
+do
+ mdadm -CR --config $conf /dev/md/$i -l $i -n 4 $dev4 $dev3 $dev2 $dev1
+ check $i
+ [ -d /sys/class/block/md_$i/md ]
+ mdadm -S md_$i
+done
diff --git a/tests/00raid0 b/tests/00raid0
new file mode 100644
index 0000000..8bc1898
--- /dev/null
+++ b/tests/00raid0
@@ -0,0 +1,43 @@
+
+# create a simple raid0
+
+mdadm -CR $md0 -l raid0 -n3 $dev0 $dev1 $dev2
+check raid0
+testdev $md0 3 $mdsize2_l 512
+mdadm -S $md0
+
+# now with version-0.90 superblock
+mdadm -CR $md0 -e0.90 -l0 -n4 $dev0 $dev1 $dev2 $dev3
+check raid0
+testdev $md0 4 $mdsize0 512
+mdadm -S $md0
+
+# now with no superblock
+mdadm -B $md0 -l0 -n5 $dev0 $dev1 $dev2 $dev3 $dev4
+check raid0
+testdev $md0 5 $size 512
+mdadm -S $md0
+
+
+# now same again with different chunk size
+for chunk in 4 32 256
+do
+ mdadm -CR $md0 -e0.90 -l raid0 --chunk $chunk -n3 $dev0 $dev1 $dev2
+ check raid0
+ testdev $md0 3 $mdsize0 $chunk
+ mdadm -S $md0
+
+ # now with version-1 superblock
+ mdadm -CR $md0 -e1.0 -l0 -c $chunk -n4 $dev0 $dev1 $dev2 $dev3
+ check raid0
+ testdev $md0 4 $mdsize1 $chunk
+ mdadm -S $md0
+
+ # now with no superblock
+ mdadm -B $md0 -l0 -n5 --chun=$chunk $dev0 $dev1 $dev2 $dev3 $dev4
+ check raid0
+ testdev $md0 5 $size $chunk
+ mdadm -S $md0
+
+done
+exit 0
diff --git a/tests/00raid1 b/tests/00raid1
new file mode 100644
index 0000000..f6b8be1
--- /dev/null
+++ b/tests/00raid1
@@ -0,0 +1,38 @@
+
+# create a simple mirror
+# test version0, version1, and no super
+# test resync and recovery.
+
+# It's just a sanity check. This command shouldn't run successfully
+mdadm -CR $md0 -l 1 -n2 missing missing
+check opposite_result
+
+mdadm -CR $md0 -l 1 -n2 $dev0 $dev1
+check resync
+check raid1
+testdev $md0 1 $mdsize1a 64
+mdadm -S $md0
+
+# now with version-0.90 superblock, spare
+mdadm -CR $md0 -e0.90 --level=raid1 -n3 -x2 $dev0 missing missing $dev1 $dev2
+check recovery
+check raid1
+testdev $md0 1 $mdsize0 64
+mdadm -S $md0
+
+# now with no superblock
+mdadm -B $md0 -l mirror -n2 $dev0 $dev1
+check resync
+check raid1
+testdev $md0 1 $size 1
+mdadm -S $md0
+
+# again, but with no resync
+mdadm -B $md0 -l 1 --assume-clean -n2 $dev0 $dev1
+check raid1
+check nosync
+testdev $md0 1 $size 1
+mdadm -S $md0
+
+
+exit 0
diff --git a/tests/00raid10 b/tests/00raid10
new file mode 100644
index 0000000..796b970
--- /dev/null
+++ b/tests/00raid10
@@ -0,0 +1,18 @@
+
+# Create some raid10 arrays, all with 6 devices and one spare
+devs="$dev0 $dev1 $dev2 $dev3 $dev4 $dev5 $dev6"
+
+for lo in n2 n3 f2 f3
+do
+ cm=1
+ case $lo in
+ f2 ) m=3 cm=2;;
+ f3 ) m=2 cm=3;;
+ n2 ) m=3;;
+ n3 ) m=2;;
+ esac
+ mdadm --create --run --level=raid10 --layout $lo --raid-disks 6 -x 1 $md0 $devs
+ check resync ; check raid10
+ testdev $md0 $m $mdsize1 $[512*cm]
+ mdadm -S $md0
+done
diff --git a/tests/00raid4 b/tests/00raid4
new file mode 100644
index 0000000..00a14f2
--- /dev/null
+++ b/tests/00raid4
@@ -0,0 +1,16 @@
+
+# create a simple raid4 set
+
+mdadm -CfR $md0 -l 4 -n3 $dev0 $dev1 $dev2
+check resync ; check raid[45]
+testdev $md0 2 $mdsize1 512
+mdadm -S $md0
+
+# now with version-1 superblock
+mdadm -CR $md0 -e1 --level=raid4 -n4 $dev0 $dev1 $dev2 $dev3
+check recovery; check raid[45]
+testdev $md0 3 $mdsize1 512
+mdadm -S $md0
+
+
+exit 0
diff --git a/tests/00raid5 b/tests/00raid5
new file mode 100644
index 0000000..b2b7a97
--- /dev/null
+++ b/tests/00raid5
@@ -0,0 +1,33 @@
+
+# create a simple raid5 set
+
+mdadm -CfR $md0 -e 0.90 -l 5 -n3 $dev0 $dev1 $dev2
+check resync
+testdev $md0 2 $mdsize0 512
+mdadm -S $md0
+
+# now with version-1 superblock
+mdadm -CR $md0 -e1 --level=raid5 -n4 $dev0 $dev1 $dev2 $dev3
+check recovery
+testdev $md0 3 $mdsize1 512
+mdadm -S $md0
+
+# now same again with explicit layout
+
+for lo in la ra left-symmetric right-symmetric
+do
+
+ mdadm -CfR $md0 -l 5 -p $lo -n3 $dev0 $dev1 $dev2
+ check resync ; check raid5
+ testdev $md0 2 $mdsize1 512
+ mdadm -S $md0
+
+ # now with version-1 superblock
+ mdadm -CR $md0 -e1 --level=raid5 --layout $lo -n4 $dev0 $dev1 $dev2 $dev3
+ check recovery ; check raid5
+ testdev $md0 3 $mdsize1 512
+ mdadm -S $md0
+
+done
+
+exit 0
diff --git a/tests/00raid6 b/tests/00raid6
new file mode 100644
index 0000000..6977af9
--- /dev/null
+++ b/tests/00raid6
@@ -0,0 +1,16 @@
+
+# create a simple raid6 set
+
+mdadm -CfR $md0 -e0.90 -l 6 -n4 $dev0 $dev1 $dev2 $dev3
+check resync ; check raid6
+testdev $md0 2 $mdsize0 512
+mdadm -S $md0
+
+# now with version-1 superblock
+mdadm -CR $md0 -e1 --level=raid6 -n5 $dev0 $dev1 $dev2 $dev3 $dev4
+check resync ; check raid6
+testdev $md0 3 $mdsize1 512
+mdadm -S $md0
+
+
+exit 0
diff --git a/tests/00readonly b/tests/00readonly
new file mode 100644
index 0000000..28b0fa1
--- /dev/null
+++ b/tests/00readonly
@@ -0,0 +1,22 @@
+#!/bin/bash
+
+for metadata in 0.9 1.0 1.1 1.2
+do
+ for level in linear raid0 raid1 raid4 raid5 raid6 raid10
+ do
+ mdadm -CR $md0 -l $level -n 4 --metadata=$metadata \
+ $dev1 $dev2 $dev3 $dev4 --assume-clean
+ check nosync
+ check $level
+ mdadm -ro $md0
+ check readonly
+ state=$(cat /sys/block/md0/md/array_state)
+ [ "$state" == "readonly" ] ||
+ die "array_state should be 'readonly', but is $state"
+ mdadm -w $md0
+ check $level
+ mdadm -S $md0
+ done
+done
+
+exit 0
diff --git a/tests/01r1fail b/tests/01r1fail
new file mode 100644
index 0000000..389b813
--- /dev/null
+++ b/tests/01r1fail
@@ -0,0 +1,29 @@
+
+# create a raid1 with one slot missing, fail two drives during initial sync
+# Add two more, wait for recovery, then remove the failed ones
+# finally zero the superblock of one removed drive and re-add it
+
+mdadm -CR $md0 -l1 -n4 $dev0 $dev1 $dev2 missing
+check resync
+mdadm $md0 --fail $dev2
+check resync
+mdadm $md0 --fail $dev1
+sleep 1
+check nosync
+check state U___
+mdadm $md0 --add $dev4 $dev3
+check recovery
+# there could be two separate recoveries, one for each dev
+check wait
+check wait
+mdadm $md0 --remove $dev2 $dev1
+check nosync
+check state UUU_
+
+mdadm --zero-superblock $dev2
+mdadm $md0 -a $dev2
+check recovery
+check wait
+check state UUUU
+
+mdadm -S $md0
diff --git a/tests/01r5fail b/tests/01r5fail
new file mode 100644
index 0000000..873dba5
--- /dev/null
+++ b/tests/01r5fail
@@ -0,0 +1,27 @@
+
+
+# create a raid5, fail and remove a drive during initial sync
+# Add two more, fail and remove one
+# wait for sync to complete, fail, remove, re-add
+
+mdadm -CR $md0 -l5 -n4 $dev0 $dev1 $dev2 $dev3
+check recovery
+mdadm $md0 --fail $dev3
+sleep 1
+check nosync
+check state UUU_
+
+mdadm $md0 --add $dev4 $dev5
+check recovery
+check wait
+mdadm $md0 --fail $dev0
+mdadm $md0 --remove $dev3 $dev0
+check recovery
+check state _UUU
+
+mdadm $md0 -a $dev3
+check recovery
+check wait
+check state UUUU
+
+mdadm -S $md0
\ No newline at end of file
diff --git a/tests/01r5integ b/tests/01r5integ
new file mode 100644
index 0000000..48676a2
--- /dev/null
+++ b/tests/01r5integ
@@ -0,0 +1,33 @@
+
+# Check integrity of raid5 in degraded mode
+# Create a 4 disk raid5, create a filesystem and
+# sha1sum it with each device failed
+
+if [ "$INTEGRITY" != "yes" ]; then
+ echo -ne 'skipping... '
+ exit 0
+fi
+
+for layout in ls rs la ra
+do
+ mdadm -CR $md0 -l5 --layout $layout -n4 $dev0 $dev1 $dev2 $dev3
+ check wait
+ tar cf - /etc > $md0
+ sum=`sha1sum $md0`
+
+ for i in $dev0 $dev1 $dev2 $dev3
+ do
+ mdadm $md0 -f $i
+ mdadm $md0 -r $i
+ blockdev --flushbufs $md0
+ sum1=`sha1sum $md0`
+ if [ "$sum" != "$sum1" ]
+ then
+ echo $sum does not match $sum1 with $i missing
+ exit 1
+ fi
+ mdadm $md0 -a $i
+ while ! (check state 'U*'); do check wait; sleep 0.2; done
+ done
+ mdadm -S $md0
+done
diff --git a/tests/01raid6integ b/tests/01raid6integ
new file mode 100644
index 0000000..12f4d81
--- /dev/null
+++ b/tests/01raid6integ
@@ -0,0 +1,57 @@
+
+# Check integrity of raid6 in degraded modes
+# Create a 5 disk raid6, dump some data to it, then
+# sha1sum it with different pairs of devices failed
+
+if [ "$INTEGRITY" != "yes" ]; then
+ echo -ne 'skipping... '
+ exit 0
+fi
+
+layouts='ls rs la ra'
+lv=`uname -r`
+if [ "$(printf '%s\n' 2.6.30 "$lv" | sort -V | head -n 1)" = 2.6.30 ]	# version-aware "$lv >= 2.6.30"; expr compared these as strings
+then
+ layouts="$layouts parity-first ddf-zero-restart ddf-N-restart ddf-N-continue \
+ left-asymmetric-6 right-asymmetric-6 left-symmetric-6 right-symmetric-6 parity-first-6"
+fi
+
+for layout in $layouts
+do
+ mdadm -CR $md0 -l6 --layout $layout -n5 $dev0 $dev1 $dev2 $dev3 $dev4
+ check wait
+ tar cf - /etc > $md0
+ sum=`sha1sum $md0`
+
+ totest=
+ for second in $dev0 $dev1 $dev2 $dev3 $dev4
+ do
+ mdadm $md0 -f $second
+ mdadm $md0 -r $second
+ blockdev --flushbufs $md0
+ sum1=`sha1sum $md0`
+ if [ "$sum" != "$sum1" ]
+ then
+ echo $sum does not match $sum1 with $second missing
+ exit 1
+ fi
+ for first in $totest
+ do
+ mdadm $md0 -f $first
+ mdadm $md0 -r $first
+ blockdev --flushbufs $md0
+ sum1=`sha1sum $md0`
+ if [ "$sum" != "$sum1" ]
+ then
+ echo $sum does not match $sum1 with $first and $second missing
+ exit 1
+ fi
+ mdadm $md0 -a $first
+ while ! (check state 'U*_U*'); do check wait; sleep 0.2; done
+ done
+ mdadm $md0 -a $second
+ while ! (check state 'U*'); do check wait; sleep 0.2; done
+ totest="$totest $second"
+ done
+ mdadm -S $md0
+done
diff --git a/tests/01replace b/tests/01replace
new file mode 100644
index 0000000..6223a22
--- /dev/null
+++ b/tests/01replace
@@ -0,0 +1,52 @@
+set -x -e
+
+## test --replace for raid5 raid6 raid1 and raid10
+#1/ after replace, can remove replaced device
+#2/ after --replace-with cannot remove the 'with' device
+#3/ preserve integrity with concurrent failure
+
+for level in 1 5 6 10
+do
+ dd if=/dev/zero of=$dev4 bs=1M || true
+ dd if=/dev/zero of=$dev5 bs=1M || true
+ mdadm -CR $md0 -l $level -n4 -x2 $devlist5
+ dd if=/dev/urandom of=$md0 bs=1M || true
+ sum=`sha1sum < $md0`
+ check wait
+ mdadm $md0 --replace $dev1
+ check wait
+ mdadm $md0 --remove $dev1
+ mdadm $md0 --remove $dev5 && exit 1
+ mdadm -S $md0
+ dd if=/dev/zero of=$dev4 bs=1M || true
+ dd if=/dev/zero of=$dev5 bs=1M || true
+ mdadm -CR $md0 -l $level -n4 -x2 $devlist5
+ check wait
+ sum1=`sha1sum < $md0`
+ [ "$sum" == "$sum1" ]
+
+ mdadm $md0 --replace $dev1 --with $dev4
+ check wait
+ mdadm $md0 --remove $dev1
+ mdadm $md0 --remove $dev5
+ mdadm $md0 --remove $dev4 && exit 1
+
+ mdadm $md0 --add $dev1 $dev5
+ mdadm $md0 --replace $dev0
+ sleep 1
+ mdadm $md0 --fail $dev2
+ check wait
+ sum2=`sha1sum < $md0`
+ [ "$sum" == "$sum2" ]
+
+ mdadm $md0 --remove $dev0 $dev2
+ mdadm $md0 --add $dev0 $dev2
+ mdadm $md0 --replace $dev3
+ sleep 1
+ mdadm $md0 --fail $dev0 $dev2
+ check wait
+ sum3=`sha1sum < $md0`
+ [ "$sum" == "$sum3" ]
+
+ mdadm -S $md0
+done
diff --git a/tests/02lineargrow b/tests/02lineargrow
new file mode 100644
index 0000000..e05c219
--- /dev/null
+++ b/tests/02lineargrow
@@ -0,0 +1,23 @@
+
+# create a linear array, and add more drives to it.
+
+for e in 0.90 1 1.1 1.2
+do
+ case $e in
+ 0.90 ) sz=$mdsize0 ;;
+ 1 ) sz=$mdsize2_l ;;
+ 1.0 ) sz=$mdsize1 ;;
+ 1.1 ) sz=$mdsize1_l ;;
+ 1.2 ) sz=$mdsize2_l ;;
+ esac
+ mdadm -CRf $md0 --level linear -e $e --raid-disks=1 $dev1
+ testdev $md0 1 $sz 1
+
+ mdadm --grow $md0 --add $dev2
+ testdev $md0 2 $sz 1
+
+ mdadm --grow $md0 --add $dev3
+ testdev $md0 3 $sz 1
+
+ mdadm -S $md0
+done
diff --git a/tests/02r1add b/tests/02r1add
new file mode 100644
index 0000000..757f696
--- /dev/null
+++ b/tests/02r1add
@@ -0,0 +1,40 @@
+
+# Make a raid1, add a device, then remove it again.
+
+mdadm -CR $md0 -l1 -n2 -x1 $dev0 $dev1 $dev2
+check resync
+check wait
+check state UU
+
+mdadm --grow $md0 -n 3
+check recovery
+check wait
+check state UUU
+
+mdadm $md0 --fail $dev0
+check state _UU
+
+mdadm --grow $md0 -n 2
+check state UU
+
+mdadm -S $md0
+# same again for version-1
+
+
+mdadm -CR $md0 -l1 -n2 -e1.2 -x1 $dev0 $dev1 $dev2
+check resync
+check wait
+check state UU
+
+mdadm --grow $md0 -n 3
+check recovery
+check wait
+check state UUU
+
+mdadm $md0 --fail $dev0
+check state _UU
+
+mdadm --grow $md0 -n 2
+check state UU
+
+mdadm -S $md0
diff --git a/tests/02r1grow b/tests/02r1grow
new file mode 100644
index 0000000..5754c88
--- /dev/null
+++ b/tests/02r1grow
@@ -0,0 +1,36 @@
+
+
+# create a small raid1 array, make it larger. Then make it smaller
+
+mdadm -CR $md0 -e 0.90 --level raid1 --raid-disks 3 --size $[size/2] $dev1 $dev2 $dev3
+check wait
+check state UUU
+testdev $md0 1 $[size/2] 1
+
+mdadm --grow $md0 --size max
+check resync
+check wait
+testdev $md0 1 $mdsize0 1
+
+mdadm --grow $md0 --size $[size/2]
+check nosync
+testdev $md0 1 $[size/2] 1
+
+mdadm -S $md0
+
+# same again with version 1.1 superblock
+mdadm -CR $md0 --level raid1 --metadata=1.1 --raid-disks 3 --size $[size/2] $dev1 $dev2 $dev3
+check wait
+check state UUU
+testdev $md0 1 $[size/2] 1
+
+mdadm --grow $md0 --size max
+check resync
+check wait
+testdev $md0 1 $mdsize1_l 1
+
+mdadm --grow $md0 --size $[size/2]
+check nosync
+testdev $md0 1 $[size/2] 1
+
+mdadm -S $md0
diff --git a/tests/02r5grow b/tests/02r5grow
new file mode 100644
index 0000000..2da78ee
--- /dev/null
+++ b/tests/02r5grow
@@ -0,0 +1,53 @@
+
+
+# create a small raid5 array, make it larger. Then make it smaller
+
+mdadm -CR $md0 -e0.90 --level raid5 --chunk=64 --raid-disks 3 --size $[size/2] $dev1 $dev2 $dev3
+check wait
+check state UUU
+testdev $md0 2 $[size/2] 32
+
+mdadm --grow $md0 --size max
+check resync
+check wait
+testdev $md0 2 $mdsize0 32
+
+mdadm --grow $md0 --size $[size/2]
+check nosync
+testdev $md0 2 $[size/2] 32
+
+mdadm -S $md0
+
+# same again with version 1.1 superblock
+mdadm -CR $md0 --level raid5 --metadata=1.1 --chunk=128 --raid-disks 4 --size $[size/2] $dev1 $dev2 $dev3 $dev4
+check wait
+check state UUUU
+testdev $md0 3 $[size/2] 128
+
+mdadm --grow $md0 --size max
+check resync
+check wait
+testdev $md0 3 $[mdsize1_l] 128
+
+mdadm --grow $md0 --size $[size/2]
+check nosync
+testdev $md0 3 $[size/2] 128
+
+mdadm -S $md0
+
+# create a raid5 array and change the chunk
+mdadm -CR $md0 --level raid5 --metadata=1.1 --chunk=32 --raid-disks 3 --size $[size/2] $dev1 $dev2 $dev3
+check wait
+check state UUU
+check chunk 32
+
+mdadm $md0 --grow --chunk=64
+check reshape
+check wait
+check chunk 64
+
+mdadm -S $md0
+mdadm -A $md0 $dev1 $dev2 $dev3
+check state UUU
+check chunk 64
+mdadm -S $md0
diff --git a/tests/02r6grow b/tests/02r6grow
new file mode 100644
index 0000000..759e627
--- /dev/null
+++ b/tests/02r6grow
@@ -0,0 +1,36 @@
+
+
+# create a small raid6 array, make it larger. Then make it smaller
+
+mdadm -CR $md0 -e 0.90 --level raid6 --chunk=64 --raid-disks 4 --size $[size/2] $dev1 $dev2 $dev3 $dev4
+check wait
+check state UUUU
+testdev $md0 2 $[size/2] 32
+
+mdadm --grow $md0 --size max
+check resync
+check wait
+testdev $md0 2 $mdsize0 32
+
+mdadm --grow $md0 --size $[size/2]
+check nosync
+testdev $md0 2 $[size/2] 32
+
+mdadm -S $md0
+
+# same again with version 1.1 superblock
+mdadm -CR $md0 --level raid6 --metadata=1.1 --chunk=128 --raid-disks 4 --size $[size/2] $dev1 $dev2 $dev3 $dev4
+check wait
+check state UUUU
+testdev $md0 2 $[size/2] 128
+
+mdadm --grow $md0 --size max
+check resync
+check wait
+testdev $md0 2 $[mdsize1_l] 128
+
+mdadm --grow $md0 --size $[size/2]
+check nosync
+testdev $md0 2 $[size/2] 128
+
+mdadm -S $md0
diff --git a/tests/03assem-incr b/tests/03assem-incr
new file mode 100644
index 0000000..f10a1a4
--- /dev/null
+++ b/tests/03assem-incr
@@ -0,0 +1,17 @@
+set -x -e
+
+# Test interaction between -I and -A
+# there are locking issues too, but those are hard to test for.
+#
+# Here just test that a partly "-I" assembled array can
+# be completed with "-A"
+
+for l in 0 1 5 linear
+do
+ mdadm -CR $md0 -l $l -n5 $dev0 $dev1 $dev2 $dev3 $dev4 --assume-clean
+ mdadm -S md0
+ mdadm -I $dev1
+ mdadm -I $dev3
+ mdadm -A /dev/md0 $dev0 $dev1 $dev2 $dev3 $dev4
+ mdadm -S /dev/md0
+done
diff --git a/tests/03r0assem b/tests/03r0assem
new file mode 100644
index 0000000..6744e32
--- /dev/null
+++ b/tests/03r0assem
@@ -0,0 +1,137 @@
+
+# create a raid0 array from 3 devices, and assemble it in a multitude of ways.
+# explicitly list devices
+# uuid, md-minor on command line with wildcard devices
+# mdadm.conf file
+
+mdadm -CR $md2 -l0 -n3 $dev0 $dev1 $dev2
+check raid0
+tst="testdev $md2 3 $mdsize1_l 512"
+$tst
+uuid=`mdadm -Db $md2 | sed 's/.*UUID=//'`
+mdadm -S $md2
+
+mdadm -A $md2 $dev0 $dev1 $dev2
+$tst
+mdadm -S $md2
+
+mdadm -A $md2 -u $uuid $devlist
+$tst
+mdadm -S $md2
+
+mdadm --assemble $md2 --name=2 $devlist
+$tst
+mdadm -S $md2
+
+conf=$targetdir/mdadm.conf
+{
+ echo DEVICE $devlist
+ echo array $md2 UUID=$uuid
+} > $conf
+
+mdadm -As -c $conf $md2
+$tst
+mdadm -S $md2
+
+{
+ echo DEVICE $devlist
+ echo array $md2 name=2
+} > $conf
+
+mdadm -As -c $conf $md2
+$tst
+mdadm -S $md2
+
+
+{
+ echo DEVICE $devlist
+ echo array $md2 devices=$dev0,$dev1,$dev2
+} > $conf
+
+mdadm -As -c $conf $md2
+$tst
+
+echo "DEVICE $devlist" > $conf
+mdadm -Db $md2 >> $conf
+mdadm -S $md2
+
+mdadm --assemble --scan --config=$conf $md2
+$tst
+mdadm -S $md2
+
+echo " metadata=0.90 devices=$dev0,$dev1,$dev2" >> $conf
+mdadm --assemble --scan --config=$conf $md2
+$tst
+mdadm -S $md2
+
+
+### Now for version 0...
+
+mdadm --zero-superblock $dev0 $dev1 $dev2
+mdadm -CR $md2 -l0 --metadata=0.90 -n3 $dev0 $dev1 $dev2
+check raid0
+tst="testdev $md2 3 $mdsize0 512"
+$tst
+
+uuid=`mdadm -Db $md2 | sed 's/.*UUID=//'`
+mdadm -S $md2
+
+mdadm -A $md2 $dev0 $dev1 $dev2
+$tst
+mdadm -S $md2
+
+mdadm -A $md2 -u $uuid $devlist
+$tst
+mdadm -S $md2
+
+mdadm --assemble $md2 --super-minor=2 $devlist #
+$tst
+mdadm -S $md2
+
+conf=$targetdir/mdadm.conf
+{
+ echo DEVICE $devlist
+ echo array $md2 UUID=$uuid
+} > $conf
+
+mdadm -As -c $conf $md2
+$tst
+mdadm -S $md2
+
+{
+ echo DEVICE $devlist
+ echo array $md2 super-minor=2
+} > $conf
+
+mdadm -As -c $conf $md2
+$tst
+mdadm -S $md2
+
+
+{
+ echo DEVICE $devlist
+ echo array $md2 devices=$dev0,$dev1,$dev2
+} > $conf
+
+mdadm -As -c $conf $md2
+$tst
+
+echo "DEVICE $devlist" > $conf
+mdadm -Db $md2 >> $conf
+mdadm -S $md2
+
+mdadm --assemble --scan --config=$conf $md2
+$tst
+mdadm -S $md2
+
+echo " metadata=1 devices=$dev0,$dev1,$dev2" >> $conf
+mdadm --assemble --scan --config=$conf $md2
+$tst
+mdadm -S $md2
+
+# Now use incremental assembly.
+mdadm -I --config=$conf $dev0
+mdadm -I --config=$conf $dev1
+mdadm -I --config=$conf $dev2
+$tst
+mdadm -S $md2
diff --git a/tests/03r5assem b/tests/03r5assem
new file mode 100644
index 0000000..0c7fb8c
--- /dev/null
+++ b/tests/03r5assem
@@ -0,0 +1,109 @@
+
+# create a raid5 array and assemble it in various ways,
+# including with missing devices.
+
+mdadm -CR -e 0.90 $md1 -l5 -n3 $dev0 $dev1 $dev2
+tst="check raid5 ;testdev $md1 2 $mdsize0 512 ; mdadm -S $md1"
+uuid=`mdadm -Db $md1 | sed 's/.*UUID=//'`
+check wait
+eval $tst
+
+mdadm -A $md1 $dev0 $dev1 $dev2
+eval $tst
+
+mdadm -A $md1 -u $uuid $devlist
+eval $tst
+
+mdadm -A $md1 -m 1 $devlist
+eval $tst
+
+
+conf=$targetdir/mdadm.conf
+{
+ echo DEVICE $devlist
+ echo array $md1 UUID=$uuid
+} > $conf
+
+mdadm -As -c $conf $md1
+eval $tst
+
+{
+ echo DEVICE $devlist
+ echo array $md1 super-minor=1
+} > $conf
+
+mdadm -As -c $conf
+eval $tst
+
+{
+ echo DEVICE $devlist
+ echo array $md1 devices=$dev0,$dev1,$dev2
+} > $conf
+
+mdadm -As -c $conf
+
+echo "DEVICE $devlist" > $conf
+mdadm -Db $md1 >> $conf
+eval $tst
+
+mdadm --assemble --scan --config=$conf $md1
+eval $tst
+
+echo " metadata=0.90 devices=$dev0,$dev1,$dev2" >> $conf
+mdadm --assemble --scan --config=$conf $md1
+eval $tst
+
+### Now with a missing device
+
+mdadm -AR $md1 $dev0 $dev2 #
+check state U_U
+eval $tst
+
+mdadm -A $md1 -u $uuid $devlist
+check state U_U
+eval $tst
+
+mdadm -A $md1 -m 1 $devlist
+check state U_U
+eval $tst
+
+
+conf=$targetdir/mdadm.conf
+{
+ echo DEVICE $devlist
+ echo array $md1 UUID=$uuid
+} > $conf
+
+mdadm -As -c $conf $md1
+check state U_U
+eval $tst
+
+{
+ echo DEVICE $devlist
+ echo array $md1 super-minor=1
+} > $conf
+
+mdadm -As -c $conf
+check state U_U
+eval $tst
+
+{
+ echo DEVICE $devlist
+ echo array $md1 devices=$dev0,$dev1,$dev2
+} > $conf
+
+mdadm -As -c $conf
+
+echo "DEVICE $devlist" > $conf
+mdadm -Db $md1 >> $conf
+check state U_U
+eval $tst
+
+mdadm --assemble --scan --config=$conf $md1
+check state U_U
+eval $tst
+
+echo " metadata=0.90 devices=$dev0,$dev1,$dev2" >> $conf
+mdadm --assemble --scan --config=$conf $md1
+check state U_U
+eval $tst
diff --git a/tests/03r5assem-failed b/tests/03r5assem-failed
new file mode 100644
index 0000000..d38241d
--- /dev/null
+++ b/tests/03r5assem-failed
@@ -0,0 +1,12 @@
+
+# Create an array, fail one device while array is active, stop array,
+# then re-assemble listing the failed device first.
+
+mdadm -CR $md1 -l5 -n4 $dev0 $dev1 $dev2 $dev3
+check wait
+
+echo 2000 > /sys/block/md1/md/safe_mode_delay
+mkfs $md1
+mdadm $md1 -f $dev0
+mdadm -S $md1
+mdadm -A $md1 $dev0 $dev1 $dev2 $dev3 || exit 1
diff --git a/tests/03r5assemV1 b/tests/03r5assemV1
new file mode 100644
index 0000000..bca0c58
--- /dev/null
+++ b/tests/03r5assemV1
@@ -0,0 +1,128 @@
+
+# create a v-1 raid5 array and assemble in various ways
+
+mdadm -CR -e1 --name one $md1 -l5 -n3 -x2 $dev0 $dev1 $dev2 $dev3 $dev4
+tst="check raid5 ;testdev $md1 2 $mdsize1 512 ; mdadm -S $md1"
+uuid=`mdadm -Db $md1 | sed 's/.*UUID=//'`
+check wait
+
+eval $tst
+
+mdadm -A $md1 $dev0 $dev1 $dev2
+mdadm $md1 --add $dev3 $dev4
+check spares 2
+eval $tst
+
+mdadm -A $md1 -u $uuid $devlist
+check spares 2
+eval $tst
+
+mdadm -A $md1 --name one $devlist
+check spares 2
+eval $tst
+
+
+conf=$targetdir/mdadm.conf
+{
+ echo DEVICE $devlist
+ echo array $md1 UUID=$uuid
+} > $conf
+
+mdadm -As -c $conf $md1
+eval $tst
+
+{
+ echo DEVICE $devlist
+ echo array $md1 name=one
+} > $conf
+
+mdadm -As -c $conf
+eval $tst
+
+{
+ echo DEVICE $devlist
+ echo array $md1 devices=$dev0,$dev1,$dev2,$dev3,$dev4
+} > $conf
+
+mdadm -As -c $conf
+
+echo "DEVICE $devlist" > $conf
+mdadm -Db $md1 >> $conf
+eval $tst
+mdadm --assemble --scan --config=$conf $md1
+eval $tst
+echo PING >&2
+
+echo " metadata=1.0 devices=$dev0,$dev1,$dev2,$dev3,$dev4" >> $conf
+mdadm --assemble --scan --config=$conf $md1
+eval $tst
+
+### Now with a missing device
+# We don't want the recovery to complete while we are
+# messing about here.
+echo 100 > /proc/sys/dev/raid/speed_limit_max
+echo 100 > /proc/sys/dev/raid/speed_limit_min
+
+mdadm -AR $md1 $dev0 $dev2 $dev3 $dev4 #
+check state U_U
+check spares 1
+eval $tst
+
+mdadm -A $md1 -u $uuid $devlist
+check state U_U
+eval $tst
+
+mdadm -A $md1 --name=one $devlist
+check state U_U
+check spares 1
+eval $tst
+
+
+conf=$targetdir/mdadm.conf
+{
+ echo DEVICE $devlist
+ echo array $md1 UUID=$uuid
+} > $conf
+
+mdadm -As -c $conf $md1
+check state U_U
+eval $tst
+
+{
+ echo DEVICE $devlist
+ echo array $md1 name=one
+} > $conf
+
+mdadm -As -c $conf
+check state U_U
+eval $tst
+
+{
+ echo DEVICE $devlist
+ echo array $md1 devices=$dev0,$dev1,$dev2
+} > $conf
+
+mdadm -As -c $conf
+
+echo "DEVICE $devlist" > $conf
+mdadm -Db $md1 >> $conf
+check state U_U
+eval $tst
+
+mdadm --assemble --scan --config=$conf $md1
+check state U_U
+eval $tst
+
+echo " metadata=1.0 devices=$dev0,$dev1,$dev2" >> $conf
+mdadm --assemble --scan --config=$conf $md1
+check state U_U
+eval $tst
+
+# And now assemble with -I
+mdadm -Ss
+mdadm -I -c $conf $dev0
+mdadm -I -c $conf $dev1
+mdadm -I -c $conf $dev2
+eval $tst
+echo 2000 > /proc/sys/dev/raid/speed_limit_max
+echo 1000 > /proc/sys/dev/raid/speed_limit_min
diff --git a/tests/04r0update b/tests/04r0update
new file mode 100644
index 0000000..73ee3b9
--- /dev/null
+++ b/tests/04r0update
@@ -0,0 +1,20 @@
+
+# create a raid0, re-assemble with a different super-minor
+mdadm -CR -e 0.90 $md0 -l0 -n3 $dev0 $dev1 $dev2
+testdev $md0 3 $mdsize0 512
+minor1=`mdadm -E $dev0 | sed -n -e 's/.*Preferred Minor : //p'`
+mdadm -S /dev/md0
+
+mdadm -A $md1 $dev0 $dev1 $dev2
+minor2=`mdadm -E $dev0 | sed -n -e 's/.*Preferred Minor : //p'`
+mdadm -S /dev/md1
+
+mdadm -A $md1 --update=super-minor $dev0 $dev1 $dev2
+minor3=`mdadm -E $dev0 | sed -n -e 's/.*Preferred Minor : //p'`
+mdadm -S /dev/md1
+
+case "$minor1 $minor2 $minor3" in
+ "0 0 1" ) ;;
+ * ) echo >&2 "ERROR minors should be '0 0 1' but are '$minor1 $minor2 $minor3'"
+ exit 1
+esac
diff --git a/tests/04r1update b/tests/04r1update
new file mode 100644
index 0000000..e22965b
--- /dev/null
+++ b/tests/04r1update
@@ -0,0 +1,15 @@
+set -i
+
+# create a raid1 array, let it sync, then re-assemble with a force-sync
+
+mdadm -CR $md0 -l1 -n2 $dev0 $dev1
+check wait
+mdadm -S $md0
+
+mdadm -A $md0 $dev0 $dev1
+check nosync
+mdadm -S $md0
+
+mdadm -A $md0 -U resync $dev0 $dev1
+check resync
+mdadm -S $md0
diff --git a/tests/04r5swap b/tests/04r5swap
new file mode 100644
index 0000000..5373a60
--- /dev/null
+++ b/tests/04r5swap
@@ -0,0 +1,18 @@
+
+# make a raid5 array, byte swap the superblocks, then assemble...
+
+mdadm -CR $md0 -e 0.90 -l5 -n4 $dev0 $dev1 $dev2 $dev3
+sleep 4
+mdadm -S $md0
+
+mdadm -E --metadata=0 $dev1 > $targetdir/d1
+for d in $dev0 $dev1 $dev2 $dev3
+do $dir/swap_super $d
+done
+mdadm -E --metadata=0.swap $dev1 > $targetdir/d1s
+diff -u $targetdir/d1 $targetdir/d1s
+
+mdadm --assemble --update=byteorder $md0 $dev0 $dev1 $dev2 $dev3
+sleep 3
+check recovery
+mdadm -S $md0
diff --git a/tests/04update-metadata b/tests/04update-metadata
new file mode 100644
index 0000000..232fc1f
--- /dev/null
+++ b/tests/04update-metadata
@@ -0,0 +1,48 @@
+set -xe
+
+# test converting v0.90 to v1.0
+# check for different levels
+# check it fails for non-v0.90
+# check it fails during reshape or recovery
+# check it fails when bitmap is present
+
+dlist="$dev0 $dev1 $dev2 $dev3"
+
+for ls in raid0/4 linear/4 raid1/1 raid5/3 raid6/2
+do
+ s=${ls#*/} l=${ls%/*}
+ mdadm -CR --assume-clean -e 0.90 $md0 --level $l -n 4 -c 64 $dlist
+ testdev $md0 $s 19904 64
+ mdadm -S $md0
+ mdadm -A $md0 --update=metadata $dlist
+ testdev $md0 $s 19904 64 check
+ mdadm -S $md0
+done
+
+if mdadm -A $md0 --update=metadata $dlist
+then echo >&2 should fail with v1.0 metadata
+ exit 1
+fi
+
+mdadm -CR -e 0.90 $md0 --level=6 -n4 -c32 $dlist
+mdadm -S $md0
+
+if mdadm -A $md0 --update=metadata $dlist
+then echo >&2 should fail during resync
+ exit 1
+fi
+mdadm -A $md0 $dlist
+mdadm --wait $md0 || true
+mdadm -S $md0
+
+# should succeed now
+mdadm -A $md0 --update=metadata $dlist
+
+mdadm -S /dev/md0
+mdadm -CR --assume-clean -e 0.90 $md0 --level=6 -n4 -c32 $dlist --bitmap=internal
+mdadm -S $md0
+
+if mdadm -A $md0 --update=metadata $dlist
+then echo >&2 should fail when bitmap present
+ exit 1
+fi
diff --git a/tests/04update-uuid b/tests/04update-uuid
new file mode 100644
index 0000000..a4409e7
--- /dev/null
+++ b/tests/04update-uuid
@@ -0,0 +1,82 @@
+set -x
+
+# create an array, then change the uuid.
+
+mdadm -CR --assume-clean $md0 -l5 -n3 $dev0 $dev1 $dev2
+mdadm -S /dev/md0
+mdadm -A /dev/md0 --update=uuid --uuid=0123456789abcdef:fedcba9876543210 $dev0 $dev1 $dev2
+no_errors
+mdadm -D /dev/md0 | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || {
+ echo Wrong uuid; mdadm -D /dev/md0 ; exit 2;
+}
+mdadm -S /dev/md0
+
+# try v1 superblock
+
+mdadm -CR --assume-clean -e1 $md0 -l5 -n3 $dev0 $dev1 $dev2
+mdadm -S /dev/md0
+mdadm -A /dev/md0 --update=uuid --uuid=0123456789abcdef:fedcba9876543210 $dev0 $dev1 $dev2
+no_errors
+mdadm -D /dev/md0 | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || {
+ echo Wrong uuid; mdadm -D /dev/md0 ; exit 2;
+}
+mdadm -S /dev/md0
+
+
+# now if we have a bitmap, that needs updating too.
+rm -f $targetdir/bitmap
+mdadm -CR --assume-clean -b $targetdir/bitmap $md0 -l5 -n3 $dev0 $dev1 $dev2
+mdadm -S /dev/md0
+mdadm -A /dev/md0 -b $targetdir/bitmap --update=uuid --uuid=0123456789abcdef:fedcba9876543210 $dev0 $dev1 $dev2
+no_errors
+mdadm -D /dev/md0 | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || {
+ echo Wrong uuid; mdadm -D /dev/md0 ; exit 2;
+}
+if mdadm -X $targetdir/bitmap | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 ||
+ mdadm -X $targetdir/bitmap | grep -s > /dev/null 67452301:efcdab89:98badcfe:10325476
+then : ; else
+ echo Wrong uuid; mdadm -X $targetdir/bitmap ; exit 2;
+fi
+mdadm -S /dev/md0
+
+# and bitmap for version1
+rm -f $targetdir/bitmap
+mdadm -CR --assume-clean -e1.1 -b $targetdir/bitmap $md0 -l5 -n3 $dev0 $dev1 $dev2
+mdadm -S /dev/md0
+mdadm -A /dev/md0 -b $targetdir/bitmap --update=uuid --uuid=0123456789abcdef:fedcba9876543210 $dev0 $dev1 $dev2
+no_errors
+mdadm -D /dev/md0 | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || {
+ echo Wrong uuid; mdadm -D /dev/md0 ; exit 2;
+}
+# -X cannot tell which byteorder to use for the UUID, so allow both.
+if mdadm -X $targetdir/bitmap | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 ||
+ mdadm -X $targetdir/bitmap | grep -s > /dev/null 67452301:efcdab89:98badcfe:10325476
+then : ; else
+ echo Wrong uuid; mdadm -X $targetdir/bitmap ; exit 2;
+fi
+mdadm -S /dev/md0
+
+# Internal bitmaps too.
+mdadm -CR --assume-clean -b internal --bitmap-chunk 4 $md0 -l5 -n3 $dev0 $dev1 $dev2
+mdadm -S /dev/md0
+mdadm -A /dev/md0 --update=uuid --uuid=0123456789abcdef:fedcba9876543210 $dev0 $dev1 $dev2
+no_errors
+mdadm -D /dev/md0 | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || {
+ echo Wrong uuid; mdadm -D /dev/md0 ; exit 2;
+}
+mdadm -X $dev0 | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || {
+ echo Wrong uuid; mdadm -X $dev0; exit 2;
+}
+mdadm -S /dev/md0
+
+mdadm -CR --assume-clean -e1.2 -b internal --bitmap-chunk=4 $md0 -l5 -n3 $dev0 $dev1 $dev2
+mdadm -S /dev/md0
+mdadm -A /dev/md0 --update=uuid --uuid=0123456789abcdef:fedcba9876543210 $dev0 $dev1 $dev2
+no_errors
+mdadm -D /dev/md0 | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || {
+ echo Wrong uuid; mdadm -D /dev/md0 ; exit 2;
+}
+mdadm -X $dev0 | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || {
+ echo Wrong uuid; mdadm -X $dev0; exit 2;
+}
+mdadm -S /dev/md0
diff --git a/tests/05r1-add-internalbitmap b/tests/05r1-add-internalbitmap
new file mode 100644
index 0000000..4e20305
--- /dev/null
+++ b/tests/05r1-add-internalbitmap
@@ -0,0 +1,20 @@
+#
+# create a raid1 without any bitmap, add the bitmap and then write to
+# the device. This should catch the case where the bitmap is created
+# but not reloaded correctly, such as the case fixed by
+# 4474ca42e2577563a919fd3ed782e2ec55bf11a2
+#
+mdadm --create --run $md0 --metadata=0.9 --level=1 -n2 --delay=1 $dev1 $dev2
+check wait
+check nobitmap
+testdev $md0 1 $mdsize1b 64
+mdadm -Gb internal --bitmap-chunk=4 $md0
+check bitmap
+testdev $md0 1 $mdsize1b 64
+mdadm -S $md0
+
+# Re-assemble the array and verify the bitmap is still present
+mdadm --assemble $md0 $dev1 $dev2
+check bitmap
+testdev $md0 1 $mdsize1b 64
+mdadm -S $md0
diff --git a/tests/05r1-add-internalbitmap-v1a b/tests/05r1-add-internalbitmap-v1a
new file mode 100644
index 0000000..721a41c
--- /dev/null
+++ b/tests/05r1-add-internalbitmap-v1a
@@ -0,0 +1,20 @@
+#
+# create a raid1 without any bitmap, add the bitmap and then write to
+# the device. This should catch the case where the bitmap is created
+# but not reloaded correctly, such as the case fixed by
+# 4474ca42e2577563a919fd3ed782e2ec55bf11a2
+#
+mdadm --create --run $md0 --metadata=1.0 --level=1 -n2 --delay=1 $dev1 $dev2
+check wait
+check nobitmap
+testdev $md0 1 $mdsize1b 64
+mdadm -Gb internal --bitmap-chunk=4 $md0
+check bitmap
+testdev $md0 1 $mdsize1b 64
+mdadm -S $md0
+
+# Re-assemble the array and verify the bitmap is still present
+mdadm --assemble $md0 $dev1 $dev2
+check bitmap
+testdev $md0 1 $mdsize1b 64
+mdadm -S $md0
diff --git a/tests/05r1-add-internalbitmap-v1b b/tests/05r1-add-internalbitmap-v1b
new file mode 100644
index 0000000..da78fd6
--- /dev/null
+++ b/tests/05r1-add-internalbitmap-v1b
@@ -0,0 +1,20 @@
+#
+# create a raid1 without any bitmap, add the bitmap and then write to
+# the device. This should catch the case where the bitmap is created
+# but not reloaded correctly, such as the case fixed by
+# 4474ca42e2577563a919fd3ed782e2ec55bf11a2
+#
+mdadm --create --run $md0 --metadata=1.1 --level=1 -n2 --delay=1 $dev1 $dev2
+check wait
+check nobitmap
+testdev $md0 1 $mdsize1b 64
+mdadm -Gb internal --bitmap-chunk=4 $md0
+check bitmap
+testdev $md0 1 $mdsize1b 64
+mdadm -S $md0
+
+# Re-assemble the array and verify the bitmap is still present
+mdadm --assemble $md0 $dev1 $dev2
+check bitmap
+testdev $md0 1 $mdsize1b 64
+mdadm -S $md0
diff --git a/tests/05r1-add-internalbitmap-v1c b/tests/05r1-add-internalbitmap-v1c
new file mode 100644
index 0000000..9f2f128
--- /dev/null
+++ b/tests/05r1-add-internalbitmap-v1c
@@ -0,0 +1,20 @@
+#
+# create a raid1 without any bitmap, add the bitmap and then write to
+# the device. This should catch the case where the bitmap is created
+# but not reloaded correctly, such as the case fixed by
+# 4474ca42e2577563a919fd3ed782e2ec55bf11a2
+#
+mdadm --create --run $md0 --metadata=1.2 --level=1 -n2 --delay=1 $dev1 $dev2
+check wait
+check nobitmap
+testdev $md0 1 $mdsize1b 64
+mdadm -Gb internal --bitmap-chunk=4 $md0
+check bitmap
+testdev $md0 1 $mdsize1b 64
+mdadm -S $md0
+
+# Re-assemble the array and verify the bitmap is still present
+mdadm --assemble $md0 $dev1 $dev2
+check bitmap
+testdev $md0 1 $mdsize1b 64
+mdadm -S $md0
diff --git a/tests/05r1-bitmapfile b/tests/05r1-bitmapfile
new file mode 100644
index 0000000..f384f0e
--- /dev/null
+++ b/tests/05r1-bitmapfile
@@ -0,0 +1,49 @@
+
+#
+# create a raid1 with a bitmap file
+#
+bmf=$targetdir/bitmap
+rm -f $bmf
+mdadm --create --run $md0 --level=1 -n2 --delay=1 --bitmap $bmf $dev1 $dev2
+check wait
+testdev $md0 1 $mdsize1a 64
+mdadm -S $md0
+
+mdadm --assemble $md0 --bitmap=$bmf $dev1 $dev2
+testdev $md0 1 $mdsize1a 64
+dirty1=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+sleep 4
+dirty2=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+
+if [ $dirty1 -lt 400 -o $dirty2 -ne 0 ]
+then echo >&2 "ERROR bad 'dirty' counts: $dirty1 and $dirty2"
+ exit 1
+fi
+mdadm $md0 -f $dev1
+testdev $md0 1 $mdsize1a 64
+sleep 4
+dirty3=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+if [ $dirty3 -lt 400 ]
+then
+ echo >&2 "ERROR dirty count $dirty3 is too small"
+ exit 2
+fi
+
+mdadm -S $md0
+
+mdadm --assemble -R $md0 --bitmap=$bmf $dev2
+dirty4=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+mdadm --zero $dev1 # force --add, not --re-add
+mdadm $md0 --add $dev1
+#it is too fast# check recovery
+
+check wait
+sleep 4
+dirty5=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+
+if [ $dirty4 -lt 400 -o $dirty5 -ne 0 ]  # dirty4 must be >= 400 and dirty5 must be 0
+then echo >&2 "ERROR bad 'dirty' counts at end: $dirty4 $dirty5"
+ exit 1
+fi
+
+mdadm -S $md0
diff --git a/tests/05r1-failfast b/tests/05r1-failfast
new file mode 100644
index 0000000..823dd6f
--- /dev/null
+++ b/tests/05r1-failfast
@@ -0,0 +1,74 @@
+
+# create a simple mirror and check failfast flag works
+mdadm -CR $md0 -e1.2 --level=raid1 --failfast -n2 $dev0 $dev1
+check raid1
+if grep -v failfast /sys/block/md0/md/rd*/state > /dev/null
+then
+ die "failfast missing"
+fi
+
+# Removing works with the failfast flag
+mdadm $md0 -f $dev0
+mdadm $md0 -r $dev0
+if grep -v failfast /sys/block/md0/md/rd1/state > /dev/null
+then
+ die "failfast missing"
+fi
+
+# Adding works with the failfast flag
+mdadm $md0 -a --failfast $dev0
+check wait
+if grep -v failfast /sys/block/md0/md/rd0/state > /dev/null
+then
+ die "failfast missing"
+fi
+
+mdadm -S $md0
+
+# Assembling works with the failfast flag
+mdadm -A $md0 $dev0 $dev1
+check raid1
+if grep -v failfast /sys/block/md0/md/rd*/state > /dev/null
+then
+ die "failfast missing"
+fi
+
+# Adding works with the nofailfast flag
+mdadm $md0 -f $dev0
+mdadm $md0 -r $dev0
+mdadm $md0 -a --nofailfast $dev0
+check wait
+if grep failfast /sys/block/md0/md/rd0/state > /dev/null
+then
+ die "failfast should be missing"
+fi
+
+# Assembling with one faulty slave works with the failfast flag
+mdadm $md0 -f $dev0
+mdadm $md0 -r $dev0
+mdadm -S $md0
+mdadm -A $md0 $dev0 $dev1
+check raid1
+mdadm -S $md0
+
+# Spare works with the failfast flag
+mdadm -CR $md0 -e1.2 --level=raid1 --failfast -n2 $dev0 $dev1
+check raid1
+mdadm $md0 -a --failfast $dev2
+check wait
+check spares 1
+if grep -v failfast /sys/block/md0/md/rd*/state > /dev/null
+then
+ die "failfast missing"
+fi
+
+# Grow works with the failfast flag
+mdadm -G $md0 --raid-devices=3
+check wait
+if grep -v failfast /sys/block/md0/md/rd*/state > /dev/null
+then
+ die "failfast missing"
+fi
+mdadm -S $md0
+
+exit 0
diff --git a/tests/05r1-grow-external b/tests/05r1-grow-external
new file mode 100644
index 0000000..69da3e9
--- /dev/null
+++ b/tests/05r1-grow-external
@@ -0,0 +1,33 @@
+
+#
+# create a raid1 array, add an external bitmap
+#
+mdadm --create --run $md0 -l 1 -n 2 $dev1 $dev2
+check wait
+testdev $md0 1 $mdsize1a 64
+
+bmf=$targetdir/bm
+rm -f $bmf
+#mdadm -E $dev1
+mdadm --grow $md0 --bitmap=$bmf --delay=1 || { mdadm -X $bmf ; exit 1; }
+dirty1=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+sleep 4
+dirty2=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+
+testdev $md0 1 $mdsize1a 64
+dirty3=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+sleep 4
+dirty4=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+
+#echo $dirty1 $dirty2 $dirty3 $dirty4
+if [ $dirty2 -ne 0 -o $dirty4 -ne 0 -o $dirty3 -lt 400 ]
+then
+ echo bad dirty counts
+ exit 1
+fi
+
+# now to remove the bitmap
+check bitmap
+mdadm --grow $md0 --bitmap=none
+check nobitmap
+mdadm -S $md0
diff --git a/tests/05r1-grow-internal b/tests/05r1-grow-internal
new file mode 100644
index 0000000..24b3aec
--- /dev/null
+++ b/tests/05r1-grow-internal
@@ -0,0 +1,31 @@
+
+#
+# create a raid1 array, add an internal bitmap
+#
+mdadm --create --run $md0 -l 1 -n 2 $dev1 $dev2
+check wait
+testdev $md0 1 $mdsize1a 64
+
+#mdadm -E $dev1
+mdadm --grow $md0 --bitmap=internal --bitmap-chunk=4 --delay=1 || { mdadm -X $dev2 ; exit 1; }
+dirty1=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+sleep 4
+dirty2=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+
+testdev $md0 1 $mdsize1a 64
+dirty3=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+sleep 4
+dirty4=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+
+#echo $dirty1 $dirty2 $dirty3 $dirty4
+if [ $dirty2 -ne 0 -o $dirty4 -ne 0 -o $dirty3 -lt 400 ]
+then
+ echo bad dirty counts
+ exit 1
+fi
+
+# now to remove the bitmap
+check bitmap
+mdadm --grow $md0 --bitmap=none
+check nobitmap
+mdadm -S $md0
diff --git a/tests/05r1-grow-internal-1 b/tests/05r1-grow-internal-1
new file mode 100644
index 0000000..2f0d823
--- /dev/null
+++ b/tests/05r1-grow-internal-1
@@ -0,0 +1,31 @@
+
+#
+# create a raid1 array, version 1 superblock, add an internal bitmap
+#
+mdadm --create --run $md0 -e1 -l 1 -n 2 $dev1 $dev2
+check wait
+testdev $md0 1 $mdsize1b 64
+
+#mdadm -E $dev1
+mdadm --grow $md0 --bitmap=internal --bitmap-chunk=4 --delay=1
+dirty1=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+sleep 4
+dirty2=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+
+testdev $md0 1 $mdsize1b 64
+dirty3=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+sleep 4
+dirty4=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+
+#echo $dirty1 $dirty2 $dirty3 $dirty4
+if [ $dirty2 -ne 0 -o $dirty4 -ne 0 -o $dirty3 -lt 400 ]
+then
+ echo bad dirty counts
+ exit 1
+fi
+
+# now to remove the bitmap
+check bitmap
+mdadm --grow $md0 --bitmap=none
+check nobitmap
+mdadm -S $md0
diff --git a/tests/05r1-internalbitmap b/tests/05r1-internalbitmap
new file mode 100644
index 0000000..dd7232a
--- /dev/null
+++ b/tests/05r1-internalbitmap
@@ -0,0 +1,47 @@
+
+#
+# create a raid1 with an internal bitmap
+#
+mdadm --create -e0.90 --run $md0 --level=1 -n2 --delay=1 --bitmap internal --bitmap-chunk=4 $dev1 $dev2
+check wait
+testdev $md0 1 $mdsize0 64
+mdadm -S $md0
+
+mdadm --assemble $md0 $dev1 $dev2
+testdev $md0 1 $mdsize0 64
+dirty1=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+sleep 4
+dirty2=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+
+if [ $dirty1 -lt 400 -o $dirty2 -ne 0 ]
+then echo >&2 "ERROR bad 'dirty' counts: $dirty1 and $dirty2"
+ exit 1
+fi
+mdadm $md0 -f $dev1
+testdev $md0 1 $mdsize0 64
+sleep 4
+dirty3=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+if [ $dirty3 -lt 400 ]
+then
+ echo >&2 "ERROR dirty count $dirty3 is too small"
+ exit 2
+fi
+
+mdadm -S $md0
+
+mdadm --assemble -R $md0 $dev2
+mdadm --zero-superblock $dev1
+mdadm $md0 --add $dev1
+check recovery
+
+dirty4=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+check wait
+sleep 4
+dirty5=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+
+if [ $dirty4 -lt 400 -o $dirty5 -ne 0 ]  # dirty4 must be >= 400 and dirty5 must be 0
+then echo >&2 "ERROR bad 'dirty' counts at end: $dirty4 $dirty5"
+ exit 1
+fi
+
+mdadm -S $md0
diff --git a/tests/05r1-internalbitmap-v1a b/tests/05r1-internalbitmap-v1a
new file mode 100644
index 0000000..3ddc082
--- /dev/null
+++ b/tests/05r1-internalbitmap-v1a
@@ -0,0 +1,48 @@
+
+#
+# create a raid1 with an internal bitmap
+#
+mdadm --create --run $md0 --metadata=1.0 --level=1 -n2 --delay=1 --bitmap internal --bitmap-chunk=4 $dev1 $dev2
+check wait
+check bitmap
+testdev $md0 1 $mdsize1b 64
+mdadm -S $md0
+
+mdadm --assemble $md0 $dev1 $dev2
+testdev $md0 1 $mdsize1b 64
+dirty1=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+sleep 4
+dirty2=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+
+if [ $dirty1 -lt 400 -o $dirty2 -ne 0 ]
+then echo >&2 "ERROR bad 'dirty' counts: $dirty1 and $dirty2"
+ exit 1
+fi
+mdadm $md0 -f $dev1
+testdev $md0 1 $mdsize1b 64
+sleep 4
+dirty3=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+if [ $dirty3 -lt 400 ]
+then
+ echo >&2 "ERROR dirty count $dirty3 is too small"
+ exit 2
+fi
+
+mdadm -S $md0
+
+mdadm --zero-superblock $dev1
+mdadm --assemble -R $md0 $dev2
+mdadm $md0 --add $dev1
+check recovery
+
+dirty4=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+check wait
+sleep 4
+dirty5=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+
+if [ $dirty4 -lt 400 -o $dirty5 -ne 0 ]  # dirty4 must be >= 400 and dirty5 must be 0
+then echo >&2 "ERROR bad 'dirty' counts at end: $dirty4 $dirty5"
+ exit 1
+fi
+
+mdadm -S $md0
diff --git a/tests/05r1-internalbitmap-v1b b/tests/05r1-internalbitmap-v1b
new file mode 100644
index 0000000..40f7abe
--- /dev/null
+++ b/tests/05r1-internalbitmap-v1b
@@ -0,0 +1,49 @@
+
+#
+# create a raid1 with an internal bitmap
+#
+mdadm --create --run $md0 --metadata=1.1 --level=1 -n2 --delay=1 --bitmap internal --bitmap-chunk=4 $dev1 $dev2
+check wait
+check bitmap
+testdev $md0 1 $mdsize11 64
+mdadm -S $md0
+
+mdadm --assemble $md0 $dev1 $dev2
+check bitmap
+testdev $md0 1 $mdsize11 64
+dirty1=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+sleep 4
+dirty2=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+
+if [ $dirty1 -lt 400 -o $dirty2 -ne 0 ]
+then echo >&2 "ERROR bad 'dirty' counts: $dirty1 and $dirty2"
+ exit 1
+fi
+mdadm $md0 -f $dev1
+testdev $md0 1 $mdsize11 64
+sleep 4
+dirty3=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+if [ $dirty3 -lt 400 ]
+then
+ echo >&2 "ERROR dirty count $dirty3 is too small"
+ exit 2
+fi
+
+mdadm -S $md0
+
+mdadm --zero-superblock $dev1
+mdadm --assemble -R $md0 $dev2
+mdadm $md0 --add $dev1
+check recovery
+
+dirty4=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+check wait
+sleep 4
+dirty5=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+
+if [ $dirty4 -lt 400 -o $dirty5 -ne 0 ]  # dirty4 must be >= 400 and dirty5 must be 0
+then echo >&2 "ERROR bad 'dirty' counts at end: $dirty4 $dirty5"
+ exit 1
+fi
+
+mdadm -S $md0
diff --git a/tests/05r1-internalbitmap-v1c b/tests/05r1-internalbitmap-v1c
new file mode 100644
index 0000000..2eaea59
--- /dev/null
+++ b/tests/05r1-internalbitmap-v1c
@@ -0,0 +1,48 @@
+
+#
+# create a raid1 with an internal bitmap
+#
+mdadm --create --run $md0 --metadata=1.2 --level=1 -n2 --delay=1 --bitmap internal --bitmap-chunk 4 $dev1 $dev2
+check wait
+check bitmap
+testdev $md0 1 $mdsize12 64
+mdadm -S $md0
+
+mdadm --assemble $md0 $dev1 $dev2
+testdev $md0 1 $mdsize12 64
+dirty1=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+sleep 4
+dirty2=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+
+if [ $dirty1 -lt 400 -o $dirty2 -ne 0 ]
+then echo >&2 "ERROR bad 'dirty' counts: $dirty1 and $dirty2"
+ exit 1
+fi
+mdadm $md0 -f $dev1
+testdev $md0 1 $mdsize12 64
+sleep 4
+dirty3=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+if [ $dirty3 -lt 400 ]
+then
+ echo >&2 "ERROR dirty count $dirty3 is too small"
+ exit 2
+fi
+
+mdadm -S $md0
+
+mdadm --zero-superblock $dev1
+mdadm --assemble -R $md0 $dev2
+mdadm $md0 --add $dev1
+check recovery
+
+dirty4=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+check wait
+sleep 4
+dirty5=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+
+if [ $dirty4 -lt 400 -o $dirty5 -ne 0 ]  # dirty4 must be >= 400 and dirty5 must be 0
+then echo >&2 "ERROR bad 'dirty' counts at end: $dirty4 $dirty5"
+ exit 1
+fi
+
+mdadm -S $md0
diff --git a/tests/05r1-n3-bitmapfile b/tests/05r1-n3-bitmapfile
new file mode 100644
index 0000000..f1c3f1e
--- /dev/null
+++ b/tests/05r1-n3-bitmapfile
@@ -0,0 +1,53 @@
+
+#
+# create a raid1 with 3 devices and a bitmap file
+# make sure resync does right thing.
+#
+#
+bmf=$targetdir/bitmap
+rm -f $bmf
+mdadm --create -e0.90 --run $md0 --level=1 -n3 --delay=1 --bitmap $bmf $dev1 $dev2 $dev3
+check wait
+testdev $md0 1 $mdsize0 64
+mdadm -S $md0
+
+mdadm --assemble $md0 --bitmap=$bmf $dev1 $dev2 $dev3
+testdev $md0 1 $mdsize0 64
+dirty1=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+sleep 4
+dirty2=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+
+if [ $dirty1 -lt 400 -o $dirty2 -ne 0 ]
+then echo >&2 "ERROR bad 'dirty' counts: $dirty1 and $dirty2"
+ exit 1
+fi
+mdadm $md0 -f $dev2
+testdev $md0 1 $mdsize0 64
+sleep 4
+dirty3=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+if [ $dirty3 -lt 400 ]
+then
+ echo >&2 "ERROR dirty count $dirty3 is too small"
+ exit 2
+fi
+
+mdadm -S $md0
+
+mdadm --assemble -R $md0 --bitmap=$bmf $dev1 $dev3
+check nosync
+mdadm --zero-superblock $dev2
+mdadm $md0 --add $dev2
+check recovery
+
+dirty4=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+check wait
+sleep 4
+dirty5=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+
+if [ $dirty4 -lt 400 -o $dirty5 -ne 0 ]  # dirty4 must be >= 400 and dirty5 must be 0
+then echo >&2 "ERROR bad 'dirty' counts at end: $dirty4 $dirty5"
+ exit 1
+fi
+
+mdadm -S $md0
+exit 0
diff --git a/tests/05r1-re-add b/tests/05r1-re-add
new file mode 100644
index 0000000..fa6bbcb
--- /dev/null
+++ b/tests/05r1-re-add
@@ -0,0 +1,39 @@
+
+#
+# create a raid1, remove a drive, and readd it.
+# resync should be instant.
+# Then do some IO first. Resync should still be very fast
+#
+
+mdadm -CR $md0 -l1 -n2 -binternal --bitmap-chunk=4 -d1 $dev1 $dev2
+check resync
+check wait
+testdev $md0 1 $mdsize1a 64
+sleep 4
+
+mdadm $md0 -f $dev2
+sleep 1
+mdadm $md0 -r $dev2
+mdadm $md0 -a $dev2
+#cat /proc/mdstat
+check nosync
+
+mdadm $md0 -f $dev2
+sleep 1
+mdadm $md0 -r $dev2
+testdev $md0 1 $mdsize1a 64
+mdadm $md0 -a $dev2
+check wait
+blockdev --flushbufs $dev1 $dev2
+cmp --ignore-initial=$[64*512] --bytes=$[$mdsize0*1024] $dev1 $dev2
+
+mdadm $md0 -f $dev2; sleep 1
+mdadm $md0 -r $dev2
+if dd if=/dev/zero of=$md0 ; then : ; fi
+blockdev --flushbufs $md0 # ensure writes have been sent.
+mdadm $md0 -a $dev2
+check recovery
+check wait
+blockdev --flushbufs $dev1 $dev2
+cmp --ignore-initial=$[64*512] --bytes=$[$mdsize0*1024] $dev1 $dev2
+mdadm -S $md0
diff --git a/tests/05r1-re-add-nosuper b/tests/05r1-re-add-nosuper
new file mode 100644
index 0000000..058d602
--- /dev/null
+++ b/tests/05r1-re-add-nosuper
@@ -0,0 +1,38 @@
+
+#
+# create a raid1, remove a drive, and readd it.
+# resync should be instant.
+# Then do some IO first. Resync should still be very fast
+#
+bmf=$targetdir/bitmap2
+rm -f $bmf
+mdadm -B $md0 -l1 -n2 -b$bmf -d1 $dev1 $dev2
+check resync
+check wait
+testdev $md0 1 $size 1
+sleep 4
+
+mdadm $md0 -f $dev2
+sleep 1
+mdadm $md0 -r $dev2
+mdadm $md0 --re-add $dev2
+check nosync
+
+mdadm $md0 -f $dev2
+sleep 1
+mdadm $md0 -r $dev2
+testdev $md0 1 $size 1
+mdadm $md0 --re-add $dev2
+check wait
+cmp --bytes=$[$mdsize0*1024] $dev1 $dev2
+
+mdadm $md0 -f $dev2; sleep 1
+mdadm $md0 -r $dev2
+if dd if=/dev/zero of=$md0 ; then : ; fi
+blockdev --flushbufs $md0 # make sure writes have been sent
+mdadm $md0 --re-add $dev2
+check recovery
+check wait
+# should BLKFLSBUF and then read $dev1/$dev2...
+cmp --bytes=$[$mdsize0*1024] $file1 $file2
+mdadm -S $md0
diff --git a/tests/05r1-remove-internalbitmap b/tests/05r1-remove-internalbitmap
new file mode 100644
index 0000000..712fd56
--- /dev/null
+++ b/tests/05r1-remove-internalbitmap
@@ -0,0 +1,18 @@
+#
+# create a raid1 with bitmap, remove the bitmap and verify it is still
+# gone when re-assembling the array
+#
+mdadm --create --run $md0 --metadata=0.9 --level=1 -n2 --bitmap internal --bitmap-chunk=4 --delay=1 $dev1 $dev2
+check wait
+check bitmap
+testdev $md0 1 $mdsize1b 64
+mdadm -Gb none $md0
+check nobitmap
+testdev $md0 1 $mdsize1b 64
+mdadm -S $md0
+
+# Re-assemble the array and verify the bitmap is still absent
+mdadm --assemble $md0 $dev1 $dev2
+check nobitmap
+testdev $md0 1 $mdsize1b 64
+mdadm -S $md0
diff --git a/tests/05r1-remove-internalbitmap-v1a b/tests/05r1-remove-internalbitmap-v1a
new file mode 100644
index 0000000..a4a9aaf
--- /dev/null
+++ b/tests/05r1-remove-internalbitmap-v1a
@@ -0,0 +1,18 @@
+#
+# create a raid1 with bitmap, remove the bitmap and verify it is still
+# gone when re-assembling the array
+#
+mdadm --create --run $md0 --metadata=1.0 --level=1 -n2 --bitmap internal --bitmap-chunk=4 --delay=1 $dev1 $dev2
+check wait
+check bitmap
+testdev $md0 1 $mdsize1b 64
+mdadm -Gb none $md0
+check nobitmap
+testdev $md0 1 $mdsize1b 64
+mdadm -S $md0
+
+# Re-assemble the array and verify the bitmap is still absent
+mdadm --assemble $md0 $dev1 $dev2
+check nobitmap
+testdev $md0 1 $mdsize1b 64
+mdadm -S $md0
diff --git a/tests/05r1-remove-internalbitmap-v1b b/tests/05r1-remove-internalbitmap-v1b
new file mode 100644
index 0000000..c0918eb
--- /dev/null
+++ b/tests/05r1-remove-internalbitmap-v1b
@@ -0,0 +1,18 @@
+#
+# create a raid1 with bitmap, remove the bitmap and verify it is still
+# gone when re-assembling the array
+#
+mdadm --create --run $md0 --metadata=1.1 --level=1 -n2 --bitmap internal --bitmap-chunk=4 --delay=1 $dev1 $dev2
+check wait
+check bitmap
+testdev $md0 1 $mdsize1b 64
+mdadm -Gb none $md0
+check nobitmap
+testdev $md0 1 $mdsize1b 64
+mdadm -S $md0
+
+# Re-assemble the array and verify the bitmap is still absent
+mdadm --assemble $md0 $dev1 $dev2
+check nobitmap
+testdev $md0 1 $mdsize1b 64
+mdadm -S $md0
diff --git a/tests/05r1-remove-internalbitmap-v1c b/tests/05r1-remove-internalbitmap-v1c
new file mode 100644
index 0000000..15f1fbb
--- /dev/null
+++ b/tests/05r1-remove-internalbitmap-v1c
@@ -0,0 +1,18 @@
+#
+# create a raid1 with bitmap, remove the bitmap and verify it is still
+# gone when re-assembling the array
+#
+mdadm --create --run $md0 --metadata=1.2 --level=1 -n2 --bitmap internal --bitmap-chunk=4 --delay=1 $dev1 $dev2
+check wait
+check bitmap
+testdev $md0 1 $mdsize1b 64
+mdadm -Gb none $md0
+check nobitmap
+testdev $md0 1 $mdsize1b 64
+mdadm -S $md0
+
+# Re-assemble the array and verify the bitmap is still absent
+mdadm --assemble $md0 $dev1 $dev2
+check nobitmap
+testdev $md0 1 $mdsize1b 64
+mdadm -S $md0
diff --git a/tests/05r5-bitmapfile b/tests/05r5-bitmapfile
new file mode 100644
index 0000000..6d173d8
--- /dev/null
+++ b/tests/05r5-bitmapfile
@@ -0,0 +1,49 @@
+
+#
+# create a raid5 with a bitmap file
+#
+bmf=$targetdir/bitmap
+rm -f $bmf
+mdadm --create --run $md0 --level=5 -n3 --delay=1 --bitmap $bmf $dev1 $dev2 $dev3
+check wait
+testdev $md0 2 $mdsize1 512
+mdadm -S $md0
+
+mdadm --assemble $md0 --bitmap=$bmf $dev1 $dev2 $dev3
+testdev $md0 2 $mdsize1 512
+dirty1=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+sleep 4
+dirty2=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+
+if [ $dirty1 -lt 400 -o $dirty2 -ne 0 ]
+then echo >&2 "ERROR bad 'dirty' counts: $dirty1 and $dirty2"
+ exit 1
+fi
+mdadm $md0 -f $dev1
+testdev $md0 2 $mdsize1 512
+sleep 4
+dirty3=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+if [ $dirty3 -lt 400 ]
+then
+ echo >&2 "ERROR dirty count $dirty3 is too small"
+ exit 2
+fi
+
+mdadm -S $md0
+
+mdadm --assemble -R $md0 --bitmap=$bmf $dev2 $dev3
+mdadm --zero $dev1 # force add, not re-add
+mdadm $md0 --add $dev1
+check recovery
+
+dirty4=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+check wait
+sleep 4
+dirty5=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+
+if [ $dirty4 -lt 400 -o $dirty5 -ne 0 ]  # dirty4 must be >= 400 and dirty5 must be 0
+then echo >&2 "ERROR bad 'dirty' counts at end: $dirty4 $dirty5"
+ exit 1
+fi
+
+mdadm -S $md0
diff --git a/tests/05r5-internalbitmap b/tests/05r5-internalbitmap
new file mode 100644
index 0000000..13dc592
--- /dev/null
+++ b/tests/05r5-internalbitmap
@@ -0,0 +1,47 @@
+
+#
+# create a raid5 with an internal bitmap
+#
+mdadm --create --run $md0 --level=5 -n3 --delay=1 --bitmap internal --bitmap-chunk=4 $dev1 $dev2 $dev3
+check wait
+testdev $md0 2 $mdsize1 512
+mdadm -S $md0
+
+mdadm --assemble $md0 $dev1 $dev2 $dev3
+testdev $md0 2 $mdsize1 512
+dirty1=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+sleep 4
+dirty2=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+
+if [ $dirty1 -lt 400 -o $dirty2 -ne 0 ]
+then echo >&2 "ERROR bad 'dirty' counts: $dirty1 and $dirty2"
+ exit 1
+fi
+mdadm $md0 -f $dev1
+testdev $md0 2 $mdsize1 512
+sleep 4
+dirty3=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+if [ $dirty3 -lt 400 ]
+then
+ echo >&2 "ERROR dirty count $dirty3 is too small"
+ exit 2
+fi
+
+mdadm -S $md0
+
+mdadm --assemble -R $md0 $dev2 $dev3
+mdadm --zero $dev1 # force --add, not --re-add
+mdadm $md0 --add $dev1
+check recovery
+
+dirty4=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+check wait
+sleep 4
+dirty5=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+
+if [ $dirty4 -lt 400 -o $dirty5 -ne 0 ]  # dirty4 must be >= 400 and dirty5 must be 0
+then echo >&2 "ERROR bad 'dirty' counts at end: $dirty4 $dirty5"
+ exit 1
+fi
+
+mdadm -S $md0
diff --git a/tests/05r6-bitmapfile b/tests/05r6-bitmapfile
new file mode 100644
index 0000000..d11896d
--- /dev/null
+++ b/tests/05r6-bitmapfile
@@ -0,0 +1,49 @@
+
+#
+# create a raid6 with a bitmap file
+#
+bmf=$targetdir/bitmap
+rm -f $bmf
+mdadm --create --run $md0 --level=6 -n4 --delay=1 --bitmap $bmf $dev1 $dev2 $dev3 $dev4
+check wait
+testdev $md0 2 $mdsize1 512
+mdadm -S $md0
+
+mdadm --assemble $md0 --bitmap=$bmf $dev1 $dev2 $dev3 $dev4
+testdev $md0 2 $mdsize1 512
+dirty1=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+sleep 4
+dirty2=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+
+if [ $dirty1 -lt 400 -o $dirty2 -ne 0 ]
+then echo >&2 "ERROR bad 'dirty' counts: $dirty1 and $dirty2"
+ exit 1
+fi
+mdadm $md0 -f $dev3
+testdev $md0 2 $mdsize1 512
+sleep 4
+dirty3=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+if [ $dirty3 -lt 400 ]
+then
+ echo >&2 "ERROR dirty count $dirty3 is too small"
+ exit 2
+fi
+
+mdadm -S $md0
+
+mdadm --assemble -R $md0 --bitmap=$bmf $dev1 $dev2 $dev4
+mdadm --zero $dev3 # force --add, not --re-add
+mdadm $md0 --add $dev3
+check recovery
+
+dirty4=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+check wait
+sleep 4
+dirty5=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'`
+
+if [ $dirty4 -lt 400 -o $dirty5 -ne 0 ]  # dirty4 must be >= 400 and dirty5 must be 0
+then echo >&2 "ERROR bad 'dirty' counts at end: $dirty4 $dirty5"
+ exit 1
+fi
+
+mdadm -S $md0
diff --git a/tests/05r6tor0 b/tests/05r6tor0
new file mode 100644
index 0000000..2fd51f2
--- /dev/null
+++ b/tests/05r6tor0
@@ -0,0 +1,27 @@
+set -x -e
+
+# reshape a RAID6 to RAID5 and then RAID0.
+# then reshape back up to RAID5 and RAID6
+
+mdadm -CR $md0 -l6 -n5 $dev0 $dev1 $dev2 $dev3 $dev4
+check wait; sleep 1
+check raid6
+testdev $md0 3 19456 512
+mdadm -G $md0 -l5
+check wait; sleep 1
+check raid5
+testdev $md0 3 19456 512
+mdadm -G $md0 -l0
+check wait; sleep 1
+check raid0
+testdev $md0 3 19456 512
+mdadm -G $md0 -l5 --add $dev3 $dev4
+check wait; sleep 1
+check raid5
+check algorithm 2
+testdev $md0 3 19456 512
+mdadm -G $md0 -l 6
+check wait; sleep 1
+check raid6
+check algorithm 2
+testdev $md0 3 19456 512
diff --git a/tests/06name b/tests/06name
new file mode 100644
index 0000000..4d5e824
--- /dev/null
+++ b/tests/06name
@@ -0,0 +1,12 @@
+set -x
+
+# create an array with a name
+
+mdadm -CR $md0 -l0 -n2 --metadata=1 --name="Fred" $dev0 $dev1
+mdadm -E $dev0 | grep 'Name : [^:]*:Fred ' > /dev/null || exit 1
+mdadm -D $md0 | grep 'Name : [^:]*:Fred ' > /dev/null || exit 1
+mdadm -S $md0
+
+mdadm -A $md0 --name="Fred" $devlist
+#mdadm -Db $md0
+mdadm -S $md0
diff --git a/tests/06sysfs b/tests/06sysfs
new file mode 100644
index 0000000..af63ef4
--- /dev/null
+++ b/tests/06sysfs
@@ -0,0 +1,11 @@
+exit 0
+mdadm -CR $md0 -l1 -n3 $dev1 $dev2 $dev3
+
+ls -Rl /sys/block/md0
+
+cat /sys/block/md0/md/level
+cat /sys/block/md0/md/raid_disks
+
+mdadm -S $md0
+
+exit 1
diff --git a/tests/06wrmostly b/tests/06wrmostly
new file mode 100644
index 0000000..968c197
--- /dev/null
+++ b/tests/06wrmostly
@@ -0,0 +1,13 @@
+
+# create a raid1 array with a wrmostly device
+
+mdadm -CR $md0 -l1 -n3 $dev0 $dev1 --write-mostly $dev2
+testdev $md0 1 $mdsize1a 64
+
+# unfortunately, we cannot measure if any read requests are going to $dev2
+
+mdadm -S $md0
+
+mdadm -CR $md0 -l1 -n3 --write-behind --bitmap=internal --bitmap-chunk=4 $dev0 $dev1 --write-mostly $dev2
+testdev $md0 1 $mdsize1a 64
+mdadm -S $md0
diff --git a/tests/07autoassemble b/tests/07autoassemble
new file mode 100644
index 0000000..e689be7
--- /dev/null
+++ b/tests/07autoassemble
@@ -0,0 +1,24 @@
+
+# create two raid1s, build a raid0 on top, then
+# tear it down and get auto-assemble to rebuild it.
+
+mdadm -CR $md1 -l1 -n2 $dev0 $dev1 --homehost=testing
+mdadm -CR $md2 -l1 -n2 $dev2 $dev3 --homehost=testing
+mdadm -CR $md0 -l0 -n2 $md1 $md2 --homehost=testing
+
+mdadm -Ss
+mdadm -As -c /dev/null --homehost=testing -vvv
+testdev $md1 1 $mdsize1a 64
+testdev $md2 1 $mdsize1a 64
+testdev $md0 2 $mdsize11a 512
+mdadm -Ss
+
+mdadm --zero-superblock $dev0 $dev1 $dev2 $dev3
+## Now the raid0 uses one stacked and one not
+mdadm -CR $md1 -l1 -n2 $dev0 $dev1 --homehost=testing
+mdadm -CR $md0 -l0 -n2 $md1 $dev2 --homehost=testing
+mdadm -Ss
+mdadm -As -c /dev/null --homehost=testing -vvv
+testdev $md1 1 $mdsize1a 64
+testdev $md0 1 $[mdsize1a+mdsize11a] 512
+mdadm -Ss
diff --git a/tests/07autodetect b/tests/07autodetect
new file mode 100644
index 0000000..917e0d6
--- /dev/null
+++ b/tests/07autodetect
@@ -0,0 +1,34 @@
+
+#
+# Test in-kernel autodetect.
+# Create a partitionable array on each of two devices,
+# put a partition on each, create an array, and see if we can
+# use autodetect to restart the array.
+
+if lsmod | grep md_mod > /dev/null 2>&1
+then
+ echo md is a module - cannot test autodetect
+ exit 0
+fi
+
+
+mdadm -CR -e 0 $mdp0 -l0 -f -n1 $dev0
+mdadm -CR -e 0 $mdp1 -l0 -f -n1 $dev1
+udevadm settle
+sfdisk $mdp0 >&2 << END
+,,FD
+END
+sfdisk $mdp1 >&2 << END
+,,FD
+END
+udevadm settle
+mdadm -CR -e 0 $md0 -l1 -n2 ${mdp0}p1 ${mdp1}p1
+check resync
+check raid1
+check wait
+mdadm -S $md0
+mdadm --auto-detect
+check raid1
+
+mdadm -Ss
+exit 0
diff --git a/tests/07changelevelintr b/tests/07changelevelintr
new file mode 100644
index 0000000..18c6309
--- /dev/null
+++ b/tests/07changelevelintr
@@ -0,0 +1,61 @@
+
+#
+# test that we can stop and restart a level change.
+# just test a few in-place changes, and a few
+# size-reducing changes.
+
+
+checkgeo() {
+ # check the geometry of an array
+ # level raid_disks chunk_size layout
+ dev=$1
+ shift
+ sleep 0.5
+ check wait
+ sleep 1
+ for attr in level raid_disks chunk_size layout
+ do
+ if [ $# -gt 0 ] ; then
+ val=$1
+ shift
+ if [ " `cat /sys/block/$dev/md/$attr`" != " $val" ]
+ then echo "$attr doesn't match for $dev"
+ exit 1
+ fi
+ fi
+ done
+}
+
+restart() {
+ sleep 0.5
+ check reshape
+ mdadm -S $md0
+ mdadm -A $md0 $devs --backup-file=$bu
+ sleep 0.5
+ check reshape
+}
+
+bu=/tmp/md-backup
+rm -f $bu
+devs="$dev0 $dev1 $dev2 $dev3 $dev4"
+mdadm -CR $md0 -l5 -n5 -c 256 $devs
+checkgeo md0 raid5 5 $[256*1024] 2
+
+mdadm -G $md0 -c 128 --backup-file=$bu
+restart
+checkgeo md0 raid5 5 $[128*1024] 2
+
+mdadm -G $md0 --layout rs --backup-file=$bu
+restart
+checkgeo md0 raid5 5 $[128*1024] 3
+
+mdadm -G $md0 --array-size 58368
+mdadm -G $md0 --raid-disks 4 -c 64 --backup-file=$bu
+restart
+checkgeo md0 raid5 4 $[64*1024] 3
+
+devs="$dev0 $dev1 $dev2 $dev3"
+mdadm -G $md0 --array-size 19456
+mdadm -G $md0 -n 2 -c 256 --backup-file=$bu
+restart
+checkgeo md0 raid5 2 $[256*1024] 3
diff --git a/tests/07changelevels b/tests/07changelevels
new file mode 100644
index 0000000..a328874
--- /dev/null
+++ b/tests/07changelevels
@@ -0,0 +1,114 @@
+
+# Test changing of level, chunksize etc.
+# Create a RAID1, convert to RAID5, add a disk, add another disk
+# convert to RAID6, back to RAID5 and ultimately to RAID1
+
+testK=$[64*3*6]
+dd if=/dev/urandom of=/tmp/RandFile bs=1024 count=$testK
+export MDADM_GROW_VERIFY=1
+
+dotest() {  # $1 is forwarded to testdev; verify array contents, then rewrite them
+ sleep 2
+ check wait
+ testdev $md0 $1 19968 64 nd
+ blockdev --flushbufs $md0
+ cmp -s -n $[testK*1024] $md0 /tmp/RandFile || { echo cmp failed; exit 2; }  # compare the testK KiB of RandFile
+ # write something new - rotate the printable characters by 4 positions
+ tr ' -~' '$-~ -#' < /tmp/RandFile > /tmp/RandFile2
+ mv /tmp/RandFile2 /tmp/RandFile
+ dd if=/tmp/RandFile of=$md0
+}
+
+checkgeo() {
+ # check the geometry of an array
+ # level raid_disks chunk_size layout
+ dev=$1
+ shift
+ sleep 0.5
+ check wait
+ sleep 1
+ for attr in level raid_disks chunk_size layout
+ do
+ if [ $# -gt 0 ] ; then
+ val=$1
+ shift
+ if [ " `cat /sys/block/$dev/md/$attr`" != " $val" ]
+ then echo "$attr doesn't match for $dev"
+ exit 1
+ fi
+ fi
+ done
+}
+
+
+bu=/tmp/md-test-backup
+rm -f $bu
+mdadm -CR $md0 -l1 -n2 -x1 $dev0 $dev1 $dev2 -z 19968
+testdev $md0 1 $mdsize1a 64
+dd if=/tmp/RandFile of=$md0
+dotest 1
+
+mdadm --grow $md0 -l5 -n3 --chunk 64
+dotest 2
+
+mdadm $md0 --add $dev3 $dev4
+mdadm --grow $md0 -n4 --chunk 32
+dotest 3
+
+mdadm -G $md0 -l6 --backup-file $bu
+dotest 3
+
+mdadm -G /dev/md0 --array-size 39936
+mdadm -G $md0 -n4 --backup-file $bu
+checkgeo md0 raid6 4 $[32*1024]
+dotest 2
+
+mdadm -G $md0 -l5 --backup-file $bu
+checkgeo md0 raid5 3 $[32*1024]
+dotest 2
+
+mdadm -G /dev/md0 --array-size 19968
+mdadm -G $md0 -n2 --backup-file $bu
+checkgeo md0 raid5 2 $[32*1024]
+dotest 1
+
+mdadm -G --level=1 $md0
+dotest 1
+
+# now repeat the last few steps, only with a degraded array.
+mdadm -S $md0
+mdadm -CR $md0 -l6 -n5 $dev0 $dev1 $dev2 $dev3 $dev4
+dd if=/tmp/RandFile of=$md0
+dotest 3
+
+mdadm $md0 --fail $dev0
+
+mdadm -G /dev/md0 --array-size 37888
+mdadm -G $md0 -n4 --backup-file $bu
+dotest 2
+checkgeo md0 raid6 4 $[512*1024]
+mdadm $md0 --fail $dev4
+
+mdadm $md0 --fail $dev3
+# now double-degraded.
+# switch layout to a DDF layout and back to make sure that works.
+
+mdadm -G /dev/md0 --layout=ddf-N-continue --backup-file $bu
+checkgeo md0 raid6 4 $[512*1024] 10
+dotest 2
+mdadm -G /dev/md0 --layout=ra --backup-file $bu
+checkgeo md0 raid6 4 $[512*1024] 1
+dotest 2
+
+mdadm -G $md0 -l5 --backup-file $bu
+dotest 2
+
+mdadm -G /dev/md0 --array-size 18944
+mdadm -G $md0 -n2 --backup-file $bu
+dotest 1
+checkgeo md0 raid5 2 $[512*1024]
+mdadm $md0 --fail $dev2
+
+mdadm -G --level=1 $md0
+dotest 1
+checkgeo md0 raid1 2
diff --git a/tests/07layouts b/tests/07layouts
new file mode 100644
index 0000000..acd1a80
--- /dev/null
+++ b/tests/07layouts
@@ -0,0 +1,91 @@
+
+# check that the kernel and restripe interpret all the different layouts
+# the same
+# This involves changing the layout to each different possibility
+# while MDADM_GROW_VERIFY is set.
+
+testK=$[64*3*6]
+dd if=/dev/urandom of=/tmp/RandFile bs=1024 count=$testK
+export MDADM_GROW_VERIFY=1 # spelled as in the comment at the top of this test
+
+
+dotest() { # $1 is passed through as testdev's second argument
+ sleep 0.5
+ check wait
+ testdev $md0 $1 $mdsize1 512 nd
+ blockdev --flushbufs $md0
+ cmp -s -n $[testK*1024] $md0 /tmp/RandFile || { echo cmp failed; exit 2; } # compare the testK KiB written earlier
+ # write something new - rotate the printable ASCII range by 4 characters
+ tr ' -~' '$-~ -#' < /tmp/RandFile > /tmp/RandFile2
+ mv /tmp/RandFile2 /tmp/RandFile
+ dd if=/tmp/RandFile of=$md0
+}
+
+checkgeo() {
+ # Check the geometry of an array against sysfs; exit 1 on the first mismatch.
+ # usage: checkgeo <name under /sys/block> [level [raid_disks [chunk_size [layout]]]]
+ dev=$1
+ shift
+ sleep 0.5
+ check wait
+ for attr in level raid_disks chunk_size layout
+ do
+ if [ $# -gt 0 ] ; then # only attributes that were passed are compared
+ val=$1
+ shift
+ if [ " `sed 's/ .*//' /sys/block/$dev/md/$attr`" != " $val" ] # sed keeps only the text before the first space
+ then echo "$attr doesn't match for $dev"
+ exit 1
+ fi
+ fi
+ done
+}
+
+
+bu=/tmp/md-test-backup
+rm -f $bu
+
+# first a degraded 5 device raid5
+mdadm -CR $md0 -l5 -n5 $dev0 $dev1 missing $dev2 $dev3
+dd if=/tmp/RandFile of=$md0
+dotest 4
+
+l5[0]=la
+l5[1]=ra
+l5[2]=ls
+l5[3]=rs
+l5[4]=parity-first
+l5[5]=parity-last
+for layout in 0 1 2 3 4 5 0
+do
+ mdadm -G $md0 --layout=${l5[$layout]} --backup-file $bu
+ checkgeo md0 raid5 5 $[512*1024] $layout
+ dotest 4
+done
+
+mdadm -S $md0
+# now a doubly degraded raid6
+mdadm -CR $md0 -l6 -n5 $dev0 missing $dev2 missing $dev4
+dd if=/tmp/RandFile of=$md0
+dotest 3
+
+l6[0]=la
+l6[1]=ra
+l6[2]=ls
+l6[3]=rs
+l6[4]=parity-first
+l6[5]=parity-last
+l6[8]=ddf-zero-restart
+l6[9]=ddf-N-restart
+l6[10]=ddf-N-continue
+l6[16]=left-asymmetric-6
+l6[17]=right-asymmetric-6
+l6[18]=left-symmetric-6
+l6[19]=right-symmetric-6
+l6[20]=parity-first-6
+for layout in 0 1 2 3 4 5 8 9 10 16 17 18 19 20 0
+do
+ mdadm -G $md0 --layout=${l6[$layout]} --backup-file $bu
+ checkgeo md0 raid6 5 $[512*1024] $layout
+ dotest 3
+done
diff --git a/tests/07reshape5intr b/tests/07reshape5intr
new file mode 100644
index 0000000..0f4803a
--- /dev/null
+++ b/tests/07reshape5intr
@@ -0,0 +1,41 @@
+
+#
+# test interrupting and restarting raid5 reshape.
+set -x
+devs="$dev1"
+st=UU
+for disks in 2 3 4 5
+do
+ eval devs=\"$devs \$dev$disks\"
+ st=U$st
+ for d in $devs
+ do dd if=/dev/urandom of=$d bs=1024 || true
+ done
+
+ case $disks in
+ 2 | 3) chunk=1024;;
+ 4 ) chunk=512;;
+ 5 ) chunk=256;;
+ esac
+
+ mdadm -CR $md0 -amd -l5 -c $chunk -n$disks --assume-clean $devs
+ mdadm $md0 --add $dev6
+ echo 20 > /proc/sys/dev/raid/speed_limit_min
+ echo 20 > /proc/sys/dev/raid/speed_limit_max
+ mdadm --grow $md0 -n $[disks+1]
+ check reshape
+ check state $st
+ mdadm --stop $md0
+ mdadm --assemble $md0 $devs $dev6
+ check reshape
+ echo 1000 > /proc/sys/dev/raid/speed_limit_min
+ echo 2000 > /proc/sys/dev/raid/speed_limit_max
+ check wait
+ while ! echo check > /sys/block/md0/md/sync_action; do sleep 0.1; done
+ check wait
+ mm=`cat /sys/block/md0/md/mismatch_cnt`
+ if [ $mm -gt 0 ]
+ then echo >&2 "ERROR mismatch_cnt non-zero : $mm" ; exit 1
+ fi
+ mdadm -S $md0
+done
diff --git a/tests/07revert-grow b/tests/07revert-grow
new file mode 100644
index 0000000..c8c4e85
--- /dev/null
+++ b/tests/07revert-grow
@@ -0,0 +1,52 @@
+set -e -x
+
+# revert a reshape that is increasing the number of devices,
+# raid5, raid6, and raid10
+
+# metadata 0.90 cannot handle RAID10 growth
+# metadata 1.0 doesn't get a default headspace, so don't try it either.
+
+for metadata in 0.90 1.1 1.2
+do
+# RAID5
+mdadm -CR --assume-clean $md0 -l5 -n4 -x1 $devlist4 --metadata=$metadata
+check raid5
+testdev $md0 3 $mdsize1 512
+mdadm -G $md0 -n 5
+sleep 3
+mdadm -S $md0
+mdadm -A $md0 --update=revert-reshape $devlist4 --backup-file=/tmp/md-backup
+check wait
+check raid5
+testdev $md0 3 $mdsize1 512
+mdadm -S $md0
+
+# RAID6
+mdadm -CR --assume-clean $md0 -l6 -n4 -x1 $devlist4 --metadata=$metadata
+check raid6
+testdev $md0 2 $mdsize1 512
+mdadm -G $md0 -n 5
+sleep 3
+mdadm -S $md0
+mdadm -A $md0 --update=revert-reshape $devlist4 --backup-file=/tmp/md-backup
+check wait
+check raid6
+testdev $md0 2 $mdsize1 512
+mdadm -S $md0
+
+if [ $metadata = 0.90 ]; then continue; fi
+
+# RAID10
+mdadm -CR --assume-clean $md0 -l10 -n4 -x1 $devlist4 --metadata=$metadata
+check raid10
+testdev $md0 2 $mdsize1 512
+mdadm -G $md0 -n 5
+sleep 3
+mdadm -S $md0
+strace -o /tmp/str ./mdadm -A $md0 --update=revert-reshape $devlist4
+check wait
+check raid10
+testdev $md0 2 $mdsize1 512
+mdadm -S $md0
+
+done
diff --git a/tests/07revert-inplace b/tests/07revert-inplace
new file mode 100644
index 0000000..a73eb97
--- /dev/null
+++ b/tests/07revert-inplace
@@ -0,0 +1,44 @@
+set -e -x
+
+# revert a reshape that is not changing the number of data devices,
+# raid5, raid6, and raid10
+
+# RAID5 -> RAID6
+mdadm -CR --assume-clean $md0 -l5 -n4 -x1 $devlist4
+check raid5
+testdev $md0 3 $mdsize1 512
+mdadm -G $md0 -l 6
+sleep 2
+mdadm -S $md0
+mdadm -A $md0 --update=revert-reshape $devlist4 --backup-file=/tmp/md-backup
+check wait
+check raid6
+check algorithm 18
+testdev $md0 3 $mdsize1 512
+mdadm -S $md0
+
+# RAID6 -> RAID5
+mdadm -CR --assume-clean $md0 -l6 -n5 $devlist4
+check raid6
+testdev $md0 3 $mdsize1 512
+mdadm -G $md0 -l 5
+sleep 2
+mdadm -S $md0
+mdadm -A $md0 --update=revert-reshape $devlist4 --backup-file=/tmp/md-backup
+check wait
+check raid6
+testdev $md0 3 $mdsize1 512
+mdadm -S $md0
+
+# RAID10 - decrease chunk size
+mdadm -CR --assume-clean $md0 -l10 -n6 -c 64 $devlist5
+check raid10
+testdev $md0 3 $mdsize1 64
+mdadm -G $md0 -c 32
+sleep 2
+mdadm -S $md0
+strace -o /tmp/str ./mdadm -A $md0 --update=revert-reshape $devlist5
+check wait
+check raid10
+testdev $md0 3 $mdsize1 64
+mdadm -S $md0
diff --git a/tests/07revert-shrink b/tests/07revert-shrink
new file mode 100644
index 0000000..62b5ae0
--- /dev/null
+++ b/tests/07revert-shrink
@@ -0,0 +1,56 @@
+set -e -x
+
+# revert a reshape that is decreasing the number of devices,
+# raid5, raid6, and raid10
+
+bu=$targetdir/md-backup
+rm -f $bu
+# RAID5
+mdadm -CR --assume-clean $md0 -l5 -n5 $devlist4
+check raid5
+testdev $md0 4 $mdsize1 512
+mdadm --grow $md0 --array-size 56832
+testdev $md0 3 $mdsize1 512
+mdadm -G $md0 -n 4 --backup=$bu
+sleep 3
+mdadm -S $md0
+mdadm -A $md0 --update=revert-reshape $devlist4 --backup-file=$bu
+check wait
+check raid5
+fsck -f -n $md0
+testdev $md0 4 $mdsize1 512
+mdadm -S $md0
+
+#FIXME
+rm -f $bu
+# RAID6
+mdadm -CR --assume-clean $md0 -l6 -n5 $devlist4
+check raid6
+testdev $md0 3 $mdsize1 512
+mdadm --grow $md0 --array-size 37888
+testdev $md0 2 $mdsize1 512
+mdadm -G $md0 -n 4 --backup=$bu
+sleep 2
+mdadm -S $md0
+mdadm -A $md0 --update=revert-reshape $devlist4 --backup-file=$bu
+check wait
+check raid6
+fsck -f -n $md0
+testdev $md0 3 $mdsize1 512
+mdadm -S $md0
+
+# RAID10
+mdadm -CR --assume-clean $md0 -l10 -n6 $devlist5
+check raid10
+testdev $md0 3 $mdsize1 512
+mdadm --grow $md0 --array-size 36864
+testdev $md0 2 $mdsize1 512
+mdadm -G $md0 -n 4
+sleep 3
+mdadm -S $md0
+mdadm -A $md0 --update=revert-reshape $devlist5
+check wait
+check raid10
+fsck -f -n $md0
+testdev $md0 3 $mdsize1 512
+mdadm -S $md0
diff --git a/tests/07testreshape5 b/tests/07testreshape5
new file mode 100644
index 0000000..0e1f25f
--- /dev/null
+++ b/tests/07testreshape5
@@ -0,0 +1,45 @@
+
+#
+# test the reshape code by using test_reshape and the
+# kernel md code to move data into and out of variously
+# shaped md arrays.
+set -x
+layouts=(la ra ls rs)
+for level in 5 6
+do
+for chunk in 4 8 16 32 64 128
+do
+ devs="$dev1"
+ for disks in 2 3 4 5 6
+ do
+ eval devs=\"$devs \$dev$disks\"
+ if [ " $level $disks" = " 6 3" -o " $level $disks" = " 6 2" ]
+ then continue
+ fi
+ for nlayout in 0 1 2 3
+ do
+ layout=${layouts[$nlayout]}
+
+ size=$[chunk*(disks-(level-4))*disks]
+
+ # test restore: make a raid5 from a file, then do a compare
+ dd if=/dev/urandom of=/tmp/RandFile bs=1024 count=$size
+ $dir/test_stripe restore /tmp/RandFile $disks $[chunk*1024] $level $nlayout 0 $[size*1024] $devs
+ mdadm -CR -e 1.0 $md0 -amd -l$level -n$disks --assume-clean -c $chunk -p $layout $devs
+ cmp -s -n $[size*1024] $md0 /tmp/RandFile || { echo cmp failed ; exit 2; }
+
+ # FIXME check parity
+
+ # test save
+ dd if=/dev/urandom of=$md0 bs=1024 count=$size
+ blockdev --flushbufs $md0 $devs; sync
+ > /tmp/NewRand
+ $dir/test_stripe save /tmp/NewRand $disks $[chunk*1024] $level $nlayout 0 $[size*1024] $devs
+ cmp -s -n $[size*1024] $md0 /tmp/NewRand || { echo cmp failed ; exit 2; }
+ mdadm -S $md0
+ udevadm settle
+ done
+ done
+done
+done
+exit 0
diff --git a/tests/09imsm-assemble b/tests/09imsm-assemble
new file mode 100644
index 0000000..d7028c6
--- /dev/null
+++ b/tests/09imsm-assemble
@@ -0,0 +1,73 @@
+# validate the prodigal member disk scenario i.e. a former container
+# member is returned after having been rebuilt on another system
+
+
+imsm_check_hold() { # usage: imsm_check_hold <container> <device>; the removal must be refused
+ if mdadm --remove $1 $2; then # removal succeeded, which is the failure case here
+ echo "$2 removal from $1 should have been blocked" >&2
+ cat /proc/mdstat >&2 # dump current array state for diagnosis
+ mdadm -E $2
+ exit 1
+ fi
+}
+
+imsm_check_removal() { # usage: imsm_check_removal <container> <device>; the removal must succeed
+ if ! mdadm --remove $1 $2 ; then # removal was refused, which is the failure case here
+ echo "$2 removal from $1 should have succeeded" >&2
+ cat /proc/mdstat >&2 # dump current array state for diagnosis
+ mdadm -E $2
+ exit 1
+ fi
+}
+
+export IMSM_DEVNAME_AS_SERIAL=1
+export IMSM_TEST_OROM=1
+export IMSM_NO_PLATFORM=1
+container=/dev/md/container
+member=/dev/md/vol0
+
+
+num_disks=4
+size=$((10*1024))
+mdadm -CR $container -e imsm -n $num_disks $dev0 $dev1 $dev2 $dev3
+mdadm -CR $member $dev0 $dev2 -n 2 -l 1 -z $size
+mdadm --wait $member || true
+mdadm -Ss
+
+# make dev0 and dev1 a new rebuild family
+mdadm -A $container $dev0 $dev1
+mdadm -IR $container
+mdadm --wait ${member}_0 || true
+mdadm -Ss
+
+# make dev2 and dev3 a new rebuild family
+mdadm -A $container $dev2 $dev3
+mdadm -IR $container
+mdadm --wait ${member}_0 || true
+mdadm -Ss
+
+# reassemble and make sure one of the families falls out
+mdadm -A $container $dev0 $dev1 $dev2 $dev3
+mdadm -IR $container
+testdev ${member}_0 1 $size 64
+if mdadm --remove $container $dev0 ; then
+ # the dev[23] family won
+ imsm_check_removal $container $dev1
+ imsm_check_hold $container $dev2
+ imsm_check_hold $container $dev3
+else
+ # the dev[01] family won
+ imsm_check_hold $container $dev1
+ imsm_check_removal $container $dev2
+ imsm_check_removal $container $dev3
+fi
+mdadm -Ss
+
+# reassemble with a new id for the dev[23] family
+mdadm -A $container $dev0 $dev1
+mdadm -IR $container
+mdadm -A ${container}2 $dev2 $dev3 --update=uuid
+mdadm -IR ${container}2
+
+testdev ${member}_0 1 $size 64
+testdev ${member}_1 1 $size 64
diff --git a/tests/09imsm-create-fail-rebuild b/tests/09imsm-create-fail-rebuild
new file mode 100644
index 0000000..f09b437
--- /dev/null
+++ b/tests/09imsm-create-fail-rebuild
@@ -0,0 +1,78 @@
+# sanity check array creation
+
+imsm_check_hold() { # usage: imsm_check_hold <container> <device>; the removal must be refused
+ if mdadm --remove $1 $2; then # removal succeeded, which is the failure case here
+ echo "$2 removal from $1 should have been blocked" >&2
+ cat /proc/mdstat >&2 # dump current array state for diagnosis
+ mdadm -E $2
+ exit 1
+ fi
+}
+
+imsm_check_removal() { # usage: imsm_check_removal <container> <device>; the removal must succeed
+ if ! mdadm --remove $1 $2 ; then # removal was refused, which is the failure case here
+ echo "$2 removal from $1 should have succeeded" >&2
+ cat /proc/mdstat >&2 # dump current array state for diagnosis
+ mdadm -E $2
+ exit 1
+ fi
+}
+
+. tests/env-imsm-template
+
+# IMSM rounds to multiples of one mebibyte - 1024K
+DEV_ROUND_K=1024
+
+num_disks=2
+mdadm -CR $container -e imsm -n $num_disks $dev0 $dev1
+imsm_check container $num_disks
+
+# RAID0 + RAID1
+size=9000
+level=0
+chunk=64
+offset=0
+mdadm -CR $member0 $dev0 $dev1 -n $num_disks -l $level -z $size -c $chunk
+imsm_check member $member0 $num_disks $level $size $((size*2)) $offset $chunk
+testdev $member0 $num_disks $size $chunk
+
+offset=$(((size & ~(1024 - 1)) + 4096))
+size=4000
+level=1
+chunk=0
+mdadm -CR $member1 $dev0 $dev1 -n $num_disks -l $level -z $size
+imsm_check member $member1 $num_disks $level $size $size $offset $chunk
+testdev $member1 1 $size 64
+check wait
+
+mdadm -Ss
+
+# RAID10 + RAID5
+num_disks=4
+mdadm -CR $container -e imsm -n $num_disks $dev0 $dev1 $dev2 $dev3
+imsm_check container $num_disks
+
+size=9000
+level=10
+chunk=64
+offset=0
+mdadm -CR $member0 $dev0 $dev1 $dev2 $dev3 -n $num_disks -l $level -z $size -c $chunk
+imsm_check member $member0 $num_disks $level $size $((size*2)) $offset $chunk
+testdev $member0 $((num_disks-2)) $size $chunk
+
+offset=$(((size & ~(1024 - 1)) + 4096))
+size=4000
+level=5
+mdadm -CR $member1 $dev0 $dev1 $dev2 $dev3 -n $num_disks -l $level -z $size -c $chunk
+imsm_check member $member1 $num_disks $level $size $((size*3)) $offset $chunk
+testdev $member1 $((num_disks-1)) $size $chunk
+check wait
+
+# FAIL / REBUILD
+imsm_check_hold $container $dev0
+mdadm --fail $member0 $dev0
+mdadm --wait-clean --scan || true
+imsm_check_removal $container $dev0
+mdadm --add $container $dev4
+check wait
+imsm_check_hold $container $dev4
diff --git a/tests/09imsm-overlap b/tests/09imsm-overlap
new file mode 100644
index 0000000..ff5d209
--- /dev/null
+++ b/tests/09imsm-overlap
@@ -0,0 +1,28 @@
+
+. tests/env-imsm-template
+
+# create raid arrays with varying degrees of overlap
+mdadm -CR $container -e imsm -n 6 $dev0 $dev1 $dev2 $dev3 $dev4 $dev5
+imsm_check container 6
+
+size=1024
+level=1
+num_disks=2
+mdadm -CR $member0 $dev0 $dev1 -n $num_disks -l $level -z $size
+mdadm -CR $member1 $dev1 $dev2 -n $num_disks -l $level -z $size
+mdadm -CR $member2 $dev2 $dev3 -n $num_disks -l $level -z $size
+mdadm -CR $member3 $dev3 $dev4 -n $num_disks -l $level -z $size
+mdadm -CR $member4 $dev4 $dev5 -n $num_disks -l $level -z $size
+
+udevadm settle
+
+offset=0
+imsm_check member $member0 $num_disks $level $size 1024 $offset
+offset=$((offset+size+4096))
+imsm_check member $member1 $num_disks $level $size 1024 $offset
+offset=$((offset+size+4096))
+imsm_check member $member2 $num_disks $level $size 1024 $offset
+offset=$((offset+size+4096))
+imsm_check member $member3 $num_disks $level $size 1024 $offset
+offset=$((offset+size+4096))
+imsm_check member $member4 $num_disks $level $size 1024 $offset
diff --git a/tests/10ddf-assemble-missing b/tests/10ddf-assemble-missing
new file mode 100644
index 0000000..4bf21b2
--- /dev/null
+++ b/tests/10ddf-assemble-missing
@@ -0,0 +1,61 @@
+# An array is assembled incompletely.
+# Re missing disks get marked as missing and are not allowed back in
+
+. tests/env-ddf-template
+tmp=$(mktemp /tmp/mdtest-XXXXXX)
+rm -f $tmp /var/tmp/mdmon.log
+ret=0
+
+mdadm -CR $container -e ddf -n 4 $dev8 $dev9 $dev10 $dev11
+ddf_check container 4
+
+mdadm -CR $member1 -n 4 -l 10 $dev8 $dev10 $dev9 $dev11 -z 10000
+mdadm -CR $member0 -n 2 -l 1 $dev8 $dev9 -z 10000
+
+mdadm --wait $member0 || true
+mdadm --wait $member1 || true
+
+mdadm -Ss
+sleep 1
+
+# Add all devices except those for $member0
+mdadm -I $dev10
+mdadm -I $dev11
+
+# Start runnable members
+mdadm -IRs || true
+mdadm -Ss
+
+#[ -f /var/tmp/mdmon.log ] && cat /var/tmp/mdmon.log
+
+# Now reassemble
+# This should work because BVDs weren't written to
+for d in $dev8 $dev9 $dev10 $dev11; do
+ mdadm -I $d
+done
+mdadm -Ss
+
+# Expect consistent state
+for d in $dev10 $dev11; do
+ mdadm -E $d>$tmp
+ egrep 'state\[0\] : Degraded, Consistent' $tmp || {
+ ret=1
+ echo ERROR: $member0 has unexpected state on $d
+ }
+ egrep 'state\[1\] : Optimal, Consistent' $tmp || {
+ ret=1
+ echo ERROR: $member1 has unexpected state on $d
+ }
+
+ if [ x$(egrep -c 'active/Online$' $tmp) != x2 ]; then
+ ret=1
+ echo ERROR: unexpected number of online disks on $d
+ fi
+done
+
+if [ $ret -ne 0 ]; then
+ mdadm -E $dev10
+ mdadm -E $dev8
+fi
+rm -f $tmp /var/tmp/mdmon.log
+[ $ret -eq 0 ]
diff --git a/tests/10ddf-create b/tests/10ddf-create
new file mode 100644
index 0000000..44e9544
--- /dev/null
+++ b/tests/10ddf-create
@@ -0,0 +1,89 @@
+#
+# Test basic DDF functionality.
+#
+# Create a container with 5 drives
+# create a small raid0 across them all,
+# then a small raid10 using 4 drives, then a 2disk raid1
+# and a 3disk raid5 using the remaining space
+#
+# add some data, tear down the array, reassemble
+# and make sure it is still there.
+set -e
+. tests/env-ddf-template
+sda=$(get_rootdev) || exit 1
+
+mdadm -CR /dev/md/ddf0 -e ddf -n 5 $dev8 $dev9 $dev10 $dev11 $dev12
+mdadm -CR r5 -l5 -n5 /dev/md/ddf0 -z 5000
+if mdadm -CR r5 -l1 -n2 /dev/md/ddf0 -z 5000
+then echo >&2 create with same name should fail ; exit 1
+fi
+mdadm -CR r10 -l10 -n4 -pn2 /dev/md/ddf0 -z 5000
+mdadm -CR r1 -l1 -n2 /dev/md/ddf0
+mdadm -CR r0 -l0 -n3 /dev/md/ddf0
+testdev /dev/md/r5 4 5000 512
+testdev /dev/md/r10 2 5000 512
+# r0/r10 will use 4608 due to chunk size, so that leaves 23552 for the rest
+testdev /dev/md/r1 1 23552 64
+testdev /dev/md/r0 3 23552 512
+dd if=$sda of=/dev/md/r0 || true
+dd if=$sda of=/dev/md/r10 || true
+dd if=$sda of=/dev/md/r1 || true
+dd if=$sda of=/dev/md/r5 || true
+
+s0=`sha1sum /dev/md/r0`
+s10=`sha1sum /dev/md/r10`
+s1=`sha1sum /dev/md/r1`
+s5=`sha1sum /dev/md/r5`
+
+
+mdadm -Ss
+mdadm -A /dev/md/ddf0 $dev8 $dev9 $dev10 $dev11 $dev12
+mdadm -I /dev/md/ddf0
+
+udevadm settle
+s0a=`sha1sum /dev/md/r0`
+s10a=`sha1sum /dev/md/r10`
+s1a=`sha1sum /dev/md/r1`
+s5a=`sha1sum /dev/md/r5`
+
+if [ "$s0" != "$s0a" ]; then
+ echo r0 did not match ; exit 1;
+fi
+if [ "$s10" != "$s10a" ]; then
+ echo r10 did not match ; exit 1;
+fi
+if [ "$s1" != "$s1a" ]; then
+ echo r1 did not match ; exit 1;
+fi
+if [ "$s5" != "$s5a" ]; then
+ echo r5 did not match ; exit 1;
+fi
+
+# failure status just means it has completed already, so ignore it.
+mdadm --wait /dev/md/r1 || true
+mdadm --wait /dev/md/r10 || true
+mdadm --wait /dev/md/r5 || true
+
+mdadm -Dbs > /var/tmp/mdadm.conf
+
+mdadm -Ss
+
+# Now try to assemble using mdadm.conf
+mdadm -Asc /var/tmp/mdadm.conf
+check nosync # This failed once. The raid5 was resyncing.
+udevadm settle
+mdadm -Dbs | sort > /tmp/mdadm.conf
+sort /var/tmp/mdadm.conf | diff /tmp/mdadm.conf -
+mdadm -Ss
+
+# and now assemble fully incrementally.
+for i in $dev8 $dev9 $dev10 $dev11 $dev12
+do
+ mdadm -I $i -c /var/tmp/mdadm.conf
+done
+check nosync
+udevadm settle
+mdadm -Dbs | sort > /tmp/mdadm.conf
+sort /var/tmp/mdadm.conf | diff /tmp/mdadm.conf -
+mdadm -Ss
+rm /tmp/mdadm.conf /var/tmp/mdadm.conf
diff --git a/tests/10ddf-create-fail-rebuild b/tests/10ddf-create-fail-rebuild
new file mode 100644
index 0000000..a8e8ced
--- /dev/null
+++ b/tests/10ddf-create-fail-rebuild
@@ -0,0 +1,77 @@
+# sanity check array creation
+
+ddf_check_hold() { # usage: ddf_check_hold <container> <device>; the removal must be refused
+ if mdadm --remove $1 $2; then # removal succeeded, which is the failure case here
+ echo "$2 removal from $1 should have been blocked" >&2
+ cat /proc/mdstat >&2 # dump current array state for diagnosis
+ mdadm -E $2
+ exit 1
+ fi
+}
+
+ddf_check_removal() { # usage: ddf_check_removal <container> <device>; the removal must succeed
+ if ! mdadm --remove $1 $2 ; then # removal was refused, which is the failure case here
+ echo "$2 removal from $1 should have succeeded" >&2
+ cat /proc/mdstat >&2 # dump current array state for diagnosis
+ mdadm -E $2
+ exit 1
+ fi
+}
+
+. tests/env-ddf-template
+
+num_disks=2
+mdadm -CR $container -e ddf -n $num_disks $dev8 $dev9
+ddf_check container $num_disks
+
+# RAID0 + RAID1
+size=9000
+level=0
+chunk=64
+offset=0
+layout=0
+mdadm -CR $member0 $dev8 $dev9 -n $num_disks -l $level -z $size -c $chunk
+ddf_check member $member0 $num_disks $level $size $((size*2)) $offset $chunk $layout
+testdev $member0 $num_disks $size $chunk
+
+offset=$(((size & ~(chunk - 1))))
+size=4000
+level=1
+chunk=0
+mdadm -CR $member1 $dev8 $dev9 -n $num_disks -l $level -z $size
+ddf_check member $member1 $num_disks $level $size $size $offset $chunk $layout
+testdev $member1 1 $size 1
+check wait
+
+mdadm -Ss
+
+# RAID10 + RAID5
+num_disks=4
+mdadm -CR $container -e ddf -n $num_disks $dev8 $dev9 $dev10 $dev11
+ddf_check container $num_disks
+
+size=9000
+level=10
+chunk=64
+offset=0
+layout=2
+mdadm -CR $member0 $dev8 $dev9 $dev10 $dev11 -n $num_disks -l $level -z $size -c $chunk
+ddf_check member $member0 $num_disks $level $size $((size*2)) $offset $chunk $layout
+testdev $member0 $((num_disks-2)) $size $chunk
+
+offset=$(((size & ~(chunk - 1))))
+size=4000
+level=5
+mdadm -CR $member1 $dev8 $dev9 $dev10 $dev11 -n $num_disks -l $level -z $size -c $chunk
+ddf_check member $member1 $num_disks $level $size $((size*3)) $offset $chunk $layout
+testdev $member1 $((num_disks-1)) $size $chunk
+check wait
+
+# FAIL / REBUILD
+ddf_check_hold $container $dev8
+mdadm --fail $member0 $dev8
+mdadm --wait-clean --scan || true
+ddf_check_removal $container $dev8
+mdadm --add $container $dev12
+check wait
+ddf_check_hold $container $dev12
diff --git a/tests/10ddf-fail-create-race b/tests/10ddf-fail-create-race
new file mode 100644
index 0000000..bd5dfb5
--- /dev/null
+++ b/tests/10ddf-fail-create-race
@@ -0,0 +1,66 @@
+# This test creates a RAID1, fails a disk, and immediately
+# (simultaneously) creates a new array. This tests for a possible
+# race where the meta data reflecting the disk failure may not
+# be written when the 2nd array is created.
+. tests/env-ddf-template
+
+mdadm --zero-superblock $dev8 $dev9 $dev10 $dev11 $dev12 $dev13
+
+mdadm -CR $container -e ddf -l container -n 2 $dev11 $dev12
+#$dir/mdadm -CR $member0 -l raid1 -n 2 $container -z 10000 >/tmp/mdmon.txt 2>&1
+mdadm -CR $member0 -l raid1 -n 2 $container -z 10000
+check wait
+fail0=$dev11
+mdadm --fail $member0 $fail0 &
+
+# The test can succeed two ways:
+# 1) mdadm -C member1 fails - in this case the meta data
+# was already on disk when the create attempt was made
+# 2) mdadm -C succeeds in the first place (meta data not on disk yet),
+# but mdmon detects the problem and sets the disk faulty.
+
+if mdadm -CR $member1 -l raid1 -n 2 $container; then
+
+ echo create should have failed / race condition?
+
+ check wait
+ set -- $(get_raiddisks $member0)
+ d0=$1
+ ret=0
+ if [ $1 = $fail0 -o $2 = $fail0 ]; then
+ ret=1
+ else
+ set -- $(get_raiddisks $member1)
+ if [ $1 = $fail0 -o $2 = $fail0 ]; then
+ ret=1
+ fi
+ fi
+ if [ $ret -eq 1 ]; then
+ echo ERROR: failed disk $fail0 is still a RAID member
+ echo $member0: $(get_raiddisks $member0)
+ echo $member1: $(get_raiddisks $member1)
+ fi
+ tmp=$(mktemp /tmp/mdest-XXXXXX)
+ mdadm -E $d0 >$tmp
+ if [ x$(grep -c 'state\[[01]\] : Degraded' $tmp) != x2 ]; then
+ echo ERROR: non-degraded array found
+ mdadm -E $d0
+ ret=1
+ fi
+ if ! grep -q '^ *0 *[0-9a-f]\{8\} .*Offline, Failed' $tmp; then
+ echo ERROR: disk 0 not marked as failed in meta data
+ mdadm -E $d0
+ ret=1
+ fi
+ rm -f $tmp
+else
+ ret=0
+fi
+
+[ -f /tmp/mdmon.txt ] && {
+ cat /tmp/mdmon.txt
+ rm -f /tmp/mdmon.txt
+}
+
+[ $ret -eq 0 ]
+
diff --git a/tests/10ddf-fail-readd b/tests/10ddf-fail-readd
new file mode 100644
index 0000000..9cd7893
--- /dev/null
+++ b/tests/10ddf-fail-readd
@@ -0,0 +1,55 @@
+# Simple fail / re-add test
+. tests/env-ddf-template
+
+tmp=$(mktemp /tmp/mdtest-XXXXXX)
+rm -f $tmp
+
+mdadm --zero-superblock $dev8 $dev9
+mdadm -CR $container -e ddf -l container -n 2 $dev8 $dev9
+
+mdadm -CR $member0 -l raid1 -n 2 $container
+#$dir/mdadm -CR $member0 -l raid1 -n 2 $container >/tmp/mdmon.txt 2>&1
+
+mke2fs -F $member0
+check wait
+
+set -- $(get_raiddisks $member0)
+fail0=$1
+mdadm $member0 --fail $fail0
+
+sleep 1
+mdadm $container --remove $fail0
+
+set -- $(get_raiddisks $member0)
+case $1 in MISSING) shift;; esac
+good0=$1
+
+# We re-add the disk now
+mdadm $container --add $fail0
+
+sleep 1
+mdadm --wait $member0 || true
+
+ret=0
+set -- $(get_raiddisks $member0)
+case $1:$2 in
+ $dev8:$dev9|$dev9:$dev8);;
+ *) echo ERROR: bad raid disks "$@"; ret=1;;
+esac
+
+mdadm -Ss
+for x in $@; do
+ mdadm -E $x >$tmp
+ if ! grep -q 'state\[0\] : Optimal, Consistent' $tmp; then
+ echo ERROR: member 0 should be optimal in meta data on $x
+ ret=1
+ fi
+done
+
+rm -f $tmp
+if [ $ret -ne 0 ]; then
+ mdadm -E $dev8
+ mdadm -E $dev9
+fi
+
+[ $ret -eq 0 ]
diff --git a/tests/10ddf-fail-readd-readonly b/tests/10ddf-fail-readd-readonly
new file mode 100644
index 0000000..6a74d9c
--- /dev/null
+++ b/tests/10ddf-fail-readd-readonly
@@ -0,0 +1,71 @@
+# Simple fail / re-add test
+. tests/env-ddf-template
+
+tmp=$(mktemp /tmp/mdtest-XXXXXX)
+rm -f $tmp
+
+mdadm --zero-superblock $dev8 $dev9
+mdadm -CR $container -e ddf -l container -n 2 $dev8 $dev9
+
+mdadm -CR $member0 -l raid1 -n 2 $container
+#$dir/mdadm -CR $member0 -l raid1 -n 2 $container >/tmp/mdmon.txt 2>&1
+
+check wait
+
+set -- $(get_raiddisks $member0)
+fail0=$1
+mdadm $member0 --fail $fail0
+
+sleep 1
+set -- $(get_raiddisks $member0)
+case $1 in MISSING) shift;; esac
+good0=$1
+
+# Check that the meta data now show one disk as failed
+ret=0
+for x in $@; do
+ mdadm -E $x >$tmp
+ if ! grep -q 'state\[0\] : Degraded, Consistent' $tmp; then
+ echo ERROR: member 0 should be degraded in meta data on $x
+ ret=1
+ fi
+ phys=$(grep $x $tmp)
+ case $x:$phys in
+ $fail0:*active/Offline,\ Failed);;
+ $good0:*active/Online);;
+ *) echo ERROR: wrong phys disk state for $x
+ ret=1
+ ;;
+ esac
+done
+
+mdadm $container --remove $fail0
+
+# We re-add the disk now
+mdadm $container --add $fail0
+
+sleep 1
+mdadm --wait $member0 || true
+
+set -- $(get_raiddisks $member0)
+case $1:$2 in
+ $dev8:$dev9|$dev9:$dev8);;
+ *) echo ERROR: bad raid disks "$@"; ret=1;;
+esac
+
+mdadm -Ss
+for x in $@; do
+ mdadm -E $x >$tmp
+ if ! grep -q 'state\[0\] : Optimal, Consistent' $tmp; then
+ echo ERROR: member 0 should be optimal in meta data on $x
+ ret=1
+ fi
+done
+
+rm -f $tmp
+if [ $ret -ne 0 ]; then
+ mdadm -E $dev8
+ mdadm -E $dev9
+fi
+
+[ $ret -eq 0 ]
diff --git a/tests/10ddf-fail-spare b/tests/10ddf-fail-spare
new file mode 100644
index 0000000..ab737ca
--- /dev/null
+++ b/tests/10ddf-fail-spare
@@ -0,0 +1,86 @@
+# Test suggested by Albert Pauw: Create, fail one disk, have mdmon
+# activate the spare,
+# then run create again. Shouldn't use the failed disk for Create,
+. tests/env-ddf-template
+
+tmp=$(mktemp /tmp/mdtest-XXXXXX)
+rm -f $tmp
+
+mdadm --zero-superblock $dev8 $dev9 $dev10 $dev11 $dev12 $dev13
+mdadm -CR $container -e ddf -l container -n 5 $dev8 $dev9 $dev10 $dev11 $dev12
+
+mdadm -CR $member0 -l raid1 -n 2 $container
+#$dir/mdadm -CR $member0 -l raid1 -n 2 $container >/tmp/mdmon.txt 2>&1
+
+check wait
+
+set -- $(get_raiddisks $member0)
+fail0=$1
+mdadm --fail $member0 $fail0
+
+# To make sure the spare is activated, we may have to sleep
+# 2s has always been enough for me
+sleep 2
+check wait
+
+# This test can succeed both ways - if spare was activated
+# before new array was created, we see only member 0.
+# otherwise, we see both, and member0 is degraded because the
+# new array grabbed the spare
+# which case occurs depends on the sleep time above.
+ret=0
+if mdadm -CR $member1 -l raid5 -n 3 $container; then
+ # Creation successful - must have been quicker than spare activation
+
+ check wait
+ set -- $(get_raiddisks $member1)
+ if [ $1 = $fail0 -o $2 = $fail0 -o $3 = $fail0 ]; then
+ echo ERROR: $member1 must not contain $fail0: $@
+ ret=1
+ fi
+ d1=$1
+ mdadm -E $d1 >$tmp
+ if ! grep -q 'state\[1\] : Optimal, Consistent' $tmp; then
+ echo ERROR: member 1 should be optimal in meta data
+ ret=1
+ fi
+ state0=Degraded
+else
+ # Creation unsuccessful - spare was used for member 0
+ state0=Optimal
+fi
+
+# need to delay a little bit, sometimes the meta data aren't
+# up-to-date yet
+sleep 0.5
+set -- $(get_raiddisks $member0)
+if [ $1 = $fail0 -o $2 = $fail0 ]; then
+ echo ERROR: $member0 must not contain $fail0: $@
+ ret=1
+fi
+d0=$1
+
+[ -f $tmp ] || mdadm -E $d0 >$tmp
+
+if ! grep -q 'state\[0\] : '$state0', Consistent' $tmp; then
+ echo ERROR: member 0 should be $state0 in meta data
+ ret=1
+fi
+if ! grep -q 'Offline, Failed' $tmp; then
+ echo ERROR: Failed disk expected in meta data
+ ret=1
+fi
+if [ $ret -eq 1 ]; then
+ cat /proc/mdstat
+ mdadm -E $d0
+ mdadm -E $d1
+ mdadm -E $fail0
+fi
+
+[ -f /tmp/mdmon.txt ] && {
+ cat /tmp/mdmon.txt
+ rm -f /tmp/mdmon.txt
+}
+
+rm -f $tmp
+[ $ret -eq 0 ]
diff --git a/tests/10ddf-fail-stop-readd b/tests/10ddf-fail-stop-readd
new file mode 100644
index 0000000..f8ebe17
--- /dev/null
+++ b/tests/10ddf-fail-stop-readd
@@ -0,0 +1,66 @@
+# Simple fail / re-add test
+. tests/env-ddf-template
+
+tmp=$(mktemp /tmp/mdtest-XXXXXX)
+rm -f $tmp
+
+mdadm --zero-superblock $dev8 $dev9
+mdadm -CR $container -e ddf -l container -n 2 $dev8 $dev9
+
+mdadm -CR $member0 -l raid1 -n 2 $container
+#$dir/mdadm -CR $member0 -l raid1 -n 2 $container >/tmp/mdmon.txt 2>&1
+
+# Write to the array
+mke2fs -F $member0
+check wait
+
+set -- $(get_raiddisks $member0)
+fail0=$1
+mdadm $member0 --fail $fail0
+
+sleep 1
+mdadm $container --remove $fail0
+
+set -- $(get_raiddisks $member0)
+case $1 in MISSING) shift;; esac
+good0=$1
+
+mdadm -Ss
+
+sleep 1
+# Now simulate incremental assembly
+mdadm -I $good0
+mdadm -IRs || true
+
+# Write to the array
+mke2fs -F $member0
+
+# We re-add the disk now
+mdadm $container --add $fail0
+
+sleep 1
+mdadm --wait $member0 || true
+
+ret=0
+set -- $(get_raiddisks $member0)
+case $1:$2 in
+ $dev8:$dev9|$dev9:$dev8);;
+ *) echo ERROR: bad raid disks "$@"; ret=1;;
+esac
+
+mdadm -Ss
+for x in $@; do
+ mdadm -E $x >$tmp
+ if ! grep -q 'state\[0\] : Optimal, Consistent' $tmp; then
+ echo ERROR: member 0 should be optimal in meta data on $x
+ ret=1
+ fi
+done
+
+rm -f $tmp
+if [ $ret -ne 0 ]; then
+ mdadm -E $dev8
+ mdadm -E $dev9
+fi
+
+[ $ret -eq 0 ]
diff --git a/tests/10ddf-fail-twice b/tests/10ddf-fail-twice
new file mode 100644
index 0000000..6af1943
--- /dev/null
+++ b/tests/10ddf-fail-twice
@@ -0,0 +1,59 @@
+. tests/env-ddf-template
+
+num_disks=5
+mdadm -CR $container -e ddf -n $num_disks $dev8 $dev9 $dev10 $dev11 $dev12
+ddf_check container $num_disks
+
+mdadm -CR $member0 -n 2 -l 1 $container
+mdadm -CR $member1 -n 3 -l 5 $container
+
+mdadm --wait $member1 $member0 || mdadm --wait $member1 $member0 || true
+
+set -- $(get_raiddisks $member0)
+fail0=$1
+mdadm $member0 --fail $fail0
+set -- $(get_raiddisks $member1)
+fail1=$1
+mdadm $member1 --fail $fail1
+
+mdadm $container --add $dev13
+
+mdadm --wait $member1 $member0 || mdadm --wait $member1 $member0 || true
+
+
+devs0="$(get_raiddisks $member0)"
+devs1="$(get_raiddisks $member1)"
+
+present=$(($(get_present $member0) + $(get_present $member1))) # sum of get_present over both members
+[ $present -eq 4 ] || {
+ echo expected 4 present disks, got $present
+ echo devices for $member0: $devs0
+ echo devices for $member1: $devs1
+ exit 1
+}
+
+if echo "$devs0" | grep -q MISSING; then
+ good=1
+ bad=0
+else
+ good=0
+ bad=1
+fi
+
+# find a good device
+eval "set -- \$devs$good"
+check=$1
+
+tmp=$(mktemp /tmp/mdtest-XXXXXX)
+mdadm -E $check >$tmp
+
+{ grep -q 'state\['$bad'\] : Degraded, Consistent' $tmp &&
+ grep -q 'state\['$good'\] : Optimal, Consistent' $tmp; } || {
+ echo unexpected meta data state on $check
+ mdadm -E $check
+ rm -f $tmp
+ exit 1
+}
+
+rm -f $tmp
+exit 0
diff --git a/tests/10ddf-fail-two-spares b/tests/10ddf-fail-two-spares
new file mode 100644
index 0000000..e00810d
--- /dev/null
+++ b/tests/10ddf-fail-two-spares
@@ -0,0 +1,86 @@
+# Simulate two disks failing shortly after each other
+. tests/env-ddf-template
+sda=$(get_rootdev) || exit 1
+tmp=$(mktemp /tmp/mdtest-XXXXXX)
+
+mdadm --zero-superblock $dev8 $dev9 $dev10 $dev11 $dev12 $dev13
+mdadm -CR $container -e ddf -l container -n 6 \
+ $dev8 $dev9 $dev10 $dev11 $dev12 $dev13
+
+#fast_sync
+
+mdadm -CR $member0 -l raid6 -n 4 $dev10 $dev11 $dev12 $dev13 -z 16384
+#$dir/mdadm -CR $member0 -l raid6 -n 4 $dev10 $dev11 $dev12 $dev13 -z 16384 \
+# >/tmp/mdmon.txt 2>&1
+mdadm -CR $member1 -l raid10 -n 4 $dev10 $dev11 $dev12 $dev13 -z 16384
+
+dd if=$sda of=$member0 bs=1M count=32
+dd if=$sda of=$member1 bs=1M skip=16 count=16
+
+check wait
+
+sum0=$(sha1sum $member0)
+sum1=$(sha1sum $member1)
+
+mdadm --fail $member1 $dev11
+sleep 1
+mdadm --fail $member1 $dev12
+
+# We will have 4 resync procedures, 2 spares for 2 arrays.
+mdadm --wait $member1 $member0 || true
+mdadm --wait $member1 $member0 || true
+
+devs0="$(get_raiddisks $member0)"
+devs1="$(get_raiddisks $member1)"
+expected="$dev10
+$dev13
+$dev8
+$dev9"
+
+ret=0
+if [ "$(echo "$devs0" | sort)" != "$expected" \
+ -o "$(echo "$devs1" | sort)" != "$expected" ]; then
+ echo ERROR: unexpected members
+ echo $member0: $devs0
+ echo $member1: $devs1
+ ret=1
+fi
+
+mdadm -E $dev10 >$tmp
+if ! grep -q 'state\[0\] : Optimal, Consistent' $tmp; then
+ echo ERROR: $member0 should be optimal in meta data
+ ret=1
+fi
+if ! grep -q 'state\[1\] : Optimal, Consistent' $tmp; then
+ echo ERROR: $member1 should be optimal in meta data
+ ret=1
+fi
+if [ x"$(grep -c active/Online $tmp)" != x4 ]; then
+ echo ERROR: expected 4 online disks
+ ret=1
+fi
+if [ x"$(grep -c "Offline, Failed" $tmp)" != x2 ]; then
+ echo ERROR: expected 2 failed disks
+ ret=1
+fi
+
+sum0a=$(sha1sum $member0)
+sum1a=$(sha1sum $member1)
+
+if [ "$sum0" != "$sum0a" -o "$sum1" != "$sum1a" ]; then
+ echo ERROR: checksum mismatch
+ ret=1
+fi
+
+if [ $ret -eq 1 ]; then
+ cat /proc/mdstat
+ cat $tmp
+fi
+
+[ -f /tmp/mdmon.txt ] && {
+ cat /tmp/mdmon.txt
+ rm -f /tmp/mdmon.txt
+}
+rm -f $tmp
+
+[ $ret -eq 0 ]
diff --git a/tests/10ddf-geometry b/tests/10ddf-geometry
new file mode 100644
index 0000000..b0cce2f
--- /dev/null
+++ b/tests/10ddf-geometry
@@ -0,0 +1,82 @@
+#
+# Test various RAID geometries, creation and deletion of subarrays
+#
+
+assert_fail() {
+ if mdadm "$@"; then
+ echo mdadm "$@" must fail
+ return 1
+ else
+ return 0
+ fi
+}
+
+assert_kill() {
+ local dev=$1 n=$2
+ mdadm -S $dev
+ mdadm --kill-subarray=$n /dev/md/ddf0
+ if mdadm -Dbs | grep -q $dev; then
+ echo >&2 $dev should be deleted
+ return 1
+ fi
+ return 0
+}
+
+set -e
+mdadm -CR /dev/md/ddf0 -e ddf -n 6 $dev8 $dev9 $dev10 $dev11 $dev12 $dev13
+
+# RAID1 geometries
+# Use different sizes to make offset calculation harder
+mdadm -CR l1s -l1 -n2 /dev/md/ddf0 -z 8000
+mdadm -CR l1m -l1 -n3 $dev8 $dev9 $dev10 -z 10000
+assert_fail -CR badl1 -l1 -n4 /dev/md/ddf0
+
+# RAID10 geometries (assert_fail runs mdadm itself: pass only the arguments)
+mdadm -CR l10_0 -l10 -n3 /dev/md/ddf0 -z 1000
+mdadm -CR l10_1 -l10 -n5 /dev/md/ddf0 -z 1000
+assert_fail -CR badl10 -l10 -n4 -pn3 /dev/md/ddf0
+mdadm -CR l10_2 -l10 -n6 -pn2 /dev/md/ddf0 -z 4000
+mdadm -CR l10_3 -l10 -n6 -pn3 /dev/md/ddf0 -z 4000
+
+assert_fail -CR l10_2 -l10 -n6 -pn2 /dev/md/ddf0 -z 5000
+assert_kill /dev/md/l10_2 4
+# gone now, must be able to create it again
+mdadm -CR l10_2 -l10 -n6 -pn2 /dev/md/ddf0 -z 5000
+
+# Now stop and reassemble
+mdadm -Ss
+mdadm -A /dev/md/ddf0 $dev8 $dev9 $dev10 $dev11 $dev12 $dev13
+
+# Same as above, on inactive container
+assert_fail -CR l10_3 -l10 -n6 -pn2 /dev/md/ddf0 -z 5000
+# Kill subarray without having started anything (no mdmon)
+mdadm --kill-subarray=5 /dev/md/ddf0
+mdadm -I /dev/md/ddf0
+mdadm -CR l10_3 -l10 -n6 -pn3 /dev/md/ddf0 -z 5000
+
+assert_kill /dev/md/l10_2 4
+assert_kill /dev/md/l10_3 5
+
+# RAID5 geometries
+mdadm -CR l5la -l5 -n3 --layout=ddf-N-restart /dev/md/ddf0 -z 5000
+mdadm -CR l5ra -l5 -n3 --layout=ddf-zero-restart /dev/md/ddf0 -z 5000
+mdadm -CR l5ls -l5 -n3 --layout=ddf-N-continue /dev/md/ddf0 -z 5000
+assert_fail -CR l5rs -l5 -n3 -prs /dev/md/ddf0 -z 5000
+
+# Stop and reassemble
+mdadm -Ss
+mdadm -A /dev/md/ddf0 $dev8 $dev9 $dev10 $dev11 $dev12 $dev13
+mdadm -I /dev/md/ddf0
+
+assert_kill /dev/md/l5la 4
+assert_kill /dev/md/l5ls 6
+assert_kill /dev/md/l5ra 5
+
+# RAID6 geometries
+assert_fail -CR l6la -l6 -n3 -pla /dev/md/ddf0 -z 5000
+assert_fail -CR l6rs -l6 -n4 -prs /dev/md/ddf0 -z 5000
+mdadm -CR l6la -l6 -n4 --layout=ddf-N-restart /dev/md/ddf0 -z 5000
+mdadm -CR l6ra -l6 -n4 --layout=ddf-zero-restart $dev8 $dev9 $dev10 $dev11 -z 5000
+mdadm -CR l6ls -l6 -n4 --layout=ddf-N-continue $dev13 $dev8 $dev9 $dev12 -z 5000
+
+mdadm -Ss
diff --git a/tests/10ddf-incremental-wrong-order b/tests/10ddf-incremental-wrong-order
new file mode 100644
index 0000000..9ecf6bc
--- /dev/null
+++ b/tests/10ddf-incremental-wrong-order
@@ -0,0 +1,131 @@
+# An array is assembled incompletely. Some disks will
+# have later metadata than others.
+# The array is then reassembled in the "wrong" order -
+# older meta data first.
+# This FAILS with mdadm 3.3
+. tests/env-ddf-template
+tmp=$(mktemp /tmp/mdtest-XXXXXX)
+rm -f $tmp /var/tmp/mdmon.log
+ret=0
+
+mdadm -CR $container -e ddf -n 4 $dev8 $dev9 $dev10 $dev11
+ddf_check container 4
+
+mdadm -CR $member1 -n 4 -l 10 $dev8 $dev10 $dev9 $dev11 -z 10000
+mdadm -CR $member0 -n 2 -l 1 $dev8 $dev9 -z 10000
+
+mdadm --wait $member0 || true
+mdadm --wait $member1 || true
+
+mke2fs -F $member0
+mke2fs -F $member1
+sha_0a=$(sha1_sum $member0)
+sha_1a=$(sha1_sum $member1)
+
+mdadm -Ss
+sleep 1
+
+# Add all devices except those for $member0
+mdadm -I $dev10
+mdadm -I $dev11
+
+# Start runnable members ($member1) and write
+mdadm -IRs || true
+e2fsck -fy $member1
+sha_1b=$(sha1_sum $member1)
+
+mdadm -Ss
+sleep 1
+
+# Seq number should be different now
+seq8a=$(mdadm -E $dev8 | sed -n 's/^ *Seq : //p')
+seq10a=$(mdadm -E $dev10 | sed -n 's/^ *Seq : //p')
+
+if [ $seq8a -ge $seq10a ]; then
+ ret=1
+ echo ERROR: sequential number of $dev10 not bigger than $dev8
+fi
+if [ x$sha_1a = x$sha_1b ]; then
+ ret=1
+ echo ERROR: sha1sums equal after write
+fi
+
+#[ -f /var/tmp/mdmon.log ] && cat /var/tmp/mdmon.log
+
+# Now reassemble
+# Note that we add the previously missing disks first.
+# $dev10 should have a higher seq number than $dev8
+for d in $dev8 $dev9 $dev10 $dev11; do
+ mdadm -I $d
+done
+
+mdadm -IRs || true
+sha_0c=$(sha1_sum $member0)
+sha_1c=$(sha1_sum $member1)
+
+mdadm -Ss
+sleep 1
+
+seq8c=$(mdadm -E $dev8 | sed -n 's/^ *Seq : //p')
+seq10c=$(mdadm -E $dev10 | sed -n 's/^ *Seq : //p')
+
+if [ x$sha_0a != x$sha_0c ]; then
+ ret=1
+ echo ERROR: sha1sum of $member0 has changed
+fi
+if [ x$sha_1b != x$sha_1c ]; then
+ ret=1
+ echo ERROR: sha1sum of $member1 has changed
+fi
+if [ \( $seq10a -ge $seq10c \) -o \( $seq8c -ne $seq10c \) ]; then
+ ret=1
+ echo ERROR: sequential numbers are wrong
+fi
+
+# Expect consistent state: both subarrays Optimal, all 4 disks online
+for d in $dev10 $dev8; do
+ mdadm -E $d>$tmp
+ for x in 0 1; do
+ egrep 'state\['$x'\] : Optimal, Consistent' $tmp || {
+ ret=1
+ echo ERROR: subarray $x has unexpected state on $d
+ }
+ done
+ if [ x$(egrep -c 'active/Online$' $tmp) != x4 ]; then
+ ret=1
+ echo ERROR: unexpected number of online disks on $d
+ fi
+done
+
+# Now try assembly
+if mdadm -A $container $dev8 $dev9 $dev10 $dev11; then
+ mdadm -IR $container
+ sha_0d=$(sha1_sum $member0)
+ sha_1d=$(sha1_sum $member1)
+ mdadm -Ss
+ sleep 1
+ seq8d=$(mdadm -E $dev8 | sed -n 's/^ *Seq : //p')
+ seq10d=$(mdadm -E $dev10 | sed -n 's/^ *Seq : //p')
+ if [ x$sha_0a != x$sha_0d ]; then
+ ret=1
+ echo ERROR: sha1sum of $member0 has changed
+ fi
+ if [ x$sha_1b != x$sha_1d ]; then
+ ret=1
+ echo ERROR: sha1sum of $member1 has changed
+ fi
+ if [ \( $seq10a -ge $seq10d \) -o \( $seq8d -ne $seq10d \) ]; then
+ ret=1
+ echo ERROR: sequential numbers are wrong
+ fi
+else
+ ret=1
+ echo ERROR: assembly failed
+fi
+
+if [ $ret -ne 0 ]; then
+ mdadm -E $dev10
+ mdadm -E $dev8
+fi
+rm -f $tmp /var/tmp/mdmon.log
+[ $ret -eq 0 ]
diff --git a/tests/10ddf-sudden-degraded b/tests/10ddf-sudden-degraded
new file mode 100644
index 0000000..dc692ae
--- /dev/null
+++ b/tests/10ddf-sudden-degraded
@@ -0,0 +1,18 @@
+#
+# An array is assembled with one device missing.
+# The other device must be marked as Failed in metadata
+
+. tests/env-ddf-template
+
+mdadm -CR $container -e ddf -n 2 $dev8 $dev9
+ddf_check container 2
+
+mdadm -CR $member1 -n 2 -l1 $dev8 $dev9
+mdadm --wait $member1 || true
+mdadm -Ss
+
+mdadm -I $dev8
+mdadm -R $container
+mkfs $member1
+# There must be a missing device recorded
+mdadm --examine $dev8 | grep 'Raid Devices.*--' || exit 1
diff --git a/tests/11spare-migration b/tests/11spare-migration
new file mode 100644
index 0000000..24b6ec6
--- /dev/null
+++ b/tests/11spare-migration
@@ -0,0 +1,454 @@
+# Set of tests for autorebuild functionality using mdadm -F
+# To be able to test ddf one must have all loop devices of bigger size, with the ones
+# above number 7 bigger again by any amount (this is not changed for now as it
+# could affect other tests)
+
+export IMSM_DEVNAME_AS_SERIAL=1
+export IMSM_TEST_OROM=1
+export IMSM_NO_PLATFORM=1
+
+. tests/utils
+set -ex
+verbose="yes"
+sleeptime=10
+
+# if listfailed=yes then don't exit if test failed due to wrong
+# spare-migration and just print a list at the end. Other errors still
+# stop the test.
+# if listfailed=no then exit on first failure
+listfailed="yes"
+
+# start Monitor, set monitorpid
+# uses global scan variable
+# all parameters are numbers of devices to be monitored. only used when $scan="no"
+# eg. monitor 0 1 will start monitoring of containers c0, c1 and subarrays v0, v1
+monitor(){
+ [ -z $monitorpid ] || return
+ if [ "$scan" == "yes" ]; then
+ $mdadm -F -d 1 --scan --mail root@localhost -c $config &
+ monitorpid=$!
+ return
+ fi
+ unset mddevs
+ while [ -n "$1" ]
+ do
+ eval container=\$c$1
+ eval volumes=\$v$1
+ mddevs="$mddevs /dev/$container"
+ if [ "$container" != "$volumes" ]; then
+ for vol in $volumes; do
+ mddevs="$mddevs /dev/$vol"
+ done
+ fi
+ shift
+ done
+ if [ -n "$mddevs" ]; then
+ if [ "$verbose" != "yes" ]; then
+ $mdadm -F -d 1 $mddevs -c $config >&2 &
+ monitorpid=$!
+ else
+ $mdadm -F -t -d 1 $mddevs -c $config &
+ monitorpid=$!
+ fi
+ fi
+ [ "$verbose" != "yes" ] || echo $mddevs $monitorpid
+}
+
+test0()
+{
+dsc "Test 0: No config file, no spare should be moved"
+> $config
+setupdevs 0 0 1 $platform
+setupdevs 1 3 4 $platform
+monitor 0 1
+mdadm -a /dev/$c1 $dev2
+mdadm --fail /dev/$v0 $dev0
+# check that spare loop2 was not moved from container c1 to container c0
+chksparemoved $c1 $c0 $dev2 n
+tidyup
+}
+
+test0a()
+{
+dsc "Test 0a: No domains in config file, no spare should be moved"
+setupdevs 0 0 1 $platform
+setupdevs 1 3 4 $platform
+createconfig a
+monitor 0 1
+mdadm -a /dev/$c1 $dev2
+mdadm --fail /dev/$v0 $dev0
+# check that spare loop2 was not moved from container c1 to container c0
+chksparemoved $c1 $c0 $dev2 n
+tidyup
+}
+
+test1()
+{
+dsc "Test 1: Common domain, add disk to one container and fail first one in another container, spare should be moved"
+setupdevs 0 0 1 $platform
+setupdevs 1 3 4 $platform
+# create config file with arrays and common domain
+createconfig a
+createconfig domain-$platform $platform spare 0 1 2 3 4
+monitor 0 1
+mdadm -a /dev/$c1 $dev2
+mdadm --fail /dev/$v0 $dev0
+# check that spare loop2 was moved from container c1 to container c0
+chksparemoved $c1 $c0 $dev2
+tidyup
+}
+
+test1a()
+{
+dsc "Test 1a: Common domain, add disk to one container and fail second one in another container, spare should be moved"
+setupdevs 0 0 1 $platform
+setupdevs 1 3 4 $platform
+createconfig a
+createconfig domain-$platform $platform spare 0 1 2 3 4
+monitor 0 1
+mdadm -a /dev/$c1 $dev2
+mdadm --fail /dev/$v0 $dev1
+# check that spare loop2 was moved from container c1 to container c0
+chksparemoved $c1 $c0 $dev2
+tidyup
+}
+
+test2()
+{
+dsc "Test 2: Common domain, fail disk in one container and add one to another container, spare should be moved"
+setupdevs 0 0 1 $platform
+setupdevs 1 3 4 $platform
+createconfig a
+createconfig domain-$platform $platform spare 0 1 2 3 4
+monitor 0 1
+mdadm --fail /dev/$v0 $dev1
+mdadm -a /dev/$c1 $dev2
+chksparemoved $c1 $c0 $dev2
+tidyup
+}
+
+test3()
+{
+dsc "Test 3: Two domains, fail a disk in one domain, add a disk to another domain, the spare should not be moved"
+setupdevs 0 0 1 $platform
+setupdevs 1 3 4 $platform
+# create config file with 2 domains
+createconfig a
+createconfig domain-$platform"1" $platform spare 0 1 2
+createconfig domain-$platform"2" $platform spare 3 4 5
+monitor 0 1
+mdadm --fail /dev/$v0 $dev1
+mdadm -a /dev/$c1 $dev5
+chksparemoved $c1 $c0 $dev5 n
+tidyup
+}
+
+test4()
+{
+dsc "Test 4: One domain holds one container, fail a disk in domain, and add disk to a container not described by domain, move if metadata allows"
+setupdevs 0 0 1 $platform
+setupdevs 1 3 4 $platform
+createconfig a
+createconfig domain-$platform $platform spare 0 1 2
+monitor 0 1
+mdadm --fail /dev/$v0 $dev1
+mdadm -a /dev/$c1 $dev5
+unset shouldmove
+[ "$platform" == "imsm" ] || shouldmove="n"
+chksparemoved $c1 $c0 $dev5 $shouldmove
+tidyup
+}
+
+test5()
+{
+dsc "Test 5: Two domains, two containers in each domain"
+setupdevs 0 0 1 $platform
+setupdevs 1 3 4 $platform
+setupdevs 2 5 6 $platform
+setupdevs 3 8 10 $platform
+# 2 and 9 for spares
+createconfig a
+createconfig domain-$platform"1" $platform spare 0 1 2 3 4
+createconfig domain-$platform"2" $platform spare 5 6 8 9 10
+monitor 0 1 2 3
+test5a
+test5b
+test5c
+tidyup
+}
+
+test5a()
+{
+dsc "Test 5a: Two containers in each domain, add spare loop2 to domain1 and fail disk in the other domain, the spare should not be moved"
+mdadm -a /dev/$c0 $dev2
+mdadm --fail /dev/$v2 $dev5
+chksparemoved $c0 $c2 $dev2 n
+}
+
+test5b()
+{
+dsc "Test 5b: Fail disk in the same domain but different container, spare loop2 should be moved"
+mdadm --fail /dev/$v1 $dev3
+chksparemoved $c0 $c1 $dev2
+}
+
+test5c()
+{
+dsc "Test 5c: Add spare loop9 to different container in domain with degraded array, spare should be moved"
+mdadm -a /dev/$c3 $dev9
+chksparemoved $c3 $c2 $dev9
+}
+
+test6()
+{
+dsc "Test 6: One domain has two containers, fail a disk in one container, there is a spare in other container too small to use for rebuild"
+setupdevs 0 0 1 $platform
+setupdevs 1 8 9 $platform
+# all devices in one domain
+createconfig a
+createconfig domain-$platform $platform spare 0 1 2 8 9
+monitor 0 1
+mdadm -a /dev/$c0 $dev2
+mdadm --fail /dev/$v1 $dev8
+chksparemoved $c0 $c1 $dev2 n
+tidyup
+}
+
+test7()
+{
+dsc "Test 7: One domain, add small spare to container, fail disk in array, spare not used, add suitable spare to other container, spare should be moved"
+setupdevs 0 0 1 $platform
+setupdevs 1 8 9 $platform
+createconfig a
+createconfig domain-$platform $platform spare 0 1 2 8 9 10
+monitor 0 1
+mdadm -a /dev/$c1 $dev2
+mdadm --fail /dev/$v1 $dev8
+mdadm -a /dev/$c0 $dev10
+chksparemoved $c0 $c1 $dev10
+tidyup
+}
+
+
+test7a()
+{
+dsc "Test 7a: Small spare in parent, suitable one in other container, $dev2 in $c1 is not in common domain"
+setupdevs 0 0 1 $platform
+setupdevs 1 8 9 $platform
+#all $platform devices in one domain
+createconfig a
+createconfig domain-$platform"1" $platform spare 0 1 8 9 10
+createconfig domain-$platform"2" $platform spare 2
+monitor 0 1
+mdadm -a /dev/$c1 $dev2
+chkspare $c1 $dev2
+mdadm --fail /dev/$v1 $dev8
+mdadm -a /dev/$c0 $dev10
+chksparemoved $c0 $c1 $dev10
+tidyup
+}
+
+test8()
+{
+# ddf does not have getinfo_super_disks implemented so skip this test
+return
+dsc "Test 8: imsm and ddf - spare should not be migrated"
+setupdevs 0 10 11 imsm
+setupdevs 1 8 9 ddf
+createconfig a
+createconfig domain0 noplatform spare 8 9 10 11 12
+monitor 0 1
+mdadm -a /dev/$c1 $dev12
+mdadm --fail /dev/$v0 $dev10
+chksparemoved $c1 $c0 $dev12 n
+tidyup
+}
+
+test9()
+{
+dsc "Test 9: imsm and native 1.2 - one domain, no metadata specified, spare should be moved"
+setupdevs 0 10 11 imsm
+setupdevs 1 8 9 1.2
+createconfig a
+createconfig domain0 noplatform spare 8 9 10 11 12
+monitor 0 1
+mdadm -a /dev/$c1 $dev12
+mdadm --fail /dev/$v0 $dev10
+chksparemoved $c1 $c0 $dev12
+tidyup
+}
+
+test9a()
+{
+dsc "Test 9a: imsm and native 1.2 - spare in global domain, should be moved"
+setupdevs 0 10 11 imsm
+setupdevs 1 8 9 1.2
+createconfig a
+createconfig domain-global noplatform spare 8 9 10 11 12
+createconfig domain-1.2 1.2 spare 8 9
+createconfig domain-imsm imsm spare 10 11
+monitor 0 1
+mdadm -a /dev/$c1 $dev12
+mdadm --fail /dev/$v0 $dev10
+chksparemoved $c1 $c0 $dev12
+tidyup
+}
+
+test10()
+{
+dsc "Test 10: Two arrays on the same devices in container"
+setupdevs 0 0 1 $platform 10000
+setupdevs 1 3 4 $platform
+createconfig a
+createconfig domain-$platform $platform spare 0 1 2 3 4 5
+monitor 0 1
+mdadm -a /dev/$c1 $dev2
+mdadm --fail /dev/md/sub0_ $dev0
+chksparemoved $c1 $c0 $dev2
+if [ $failed -eq 0 ]; then
+# now fail the spare and see if we get another one
+ mdadm --fail /dev/md/sub0_ $dev2
+ mdadm -a /dev/$c1 $dev5
+ chksparemoved $c1 $c0 $dev5
+fi
+tidyup
+}
+
+test11()
+{
+dsc "Test 11: Failed spare from other container should not be used"
+setupdevs 0 0 1 $platform
+setupdevs 1 3 4 $platform
+createconfig a
+createconfig domain-$platform $platform spare 0 1 2 3 4
+monitor 0 1
+mdadm -a /dev/$c1 $dev2
+mdadm --fail /dev/$v1 $dev3
+#wait until recovery finishes so no degraded array in c1
+check wait
+mdadm --fail /dev/$v0 $dev0
+chksparemoved $c1 $c0 $dev3 n
+tidyup
+}
+
+test12()
+{
+dsc "Test 12: Only one spare should be taken for rebuild, second not needed"
+setupdevs 0 0 1 $platform
+setupdevs 1 3 4 $platform
+createconfig a
+createconfig domain-$platform $platform spare 0 1 2 3 4 5
+monitor 0 1
+mdadm -a /dev/$c1 $dev2
+mdadm -a /dev/$c1 $dev5
+mdadm --fail /dev/$v0 $dev0
+sleep $sleeptime
+chkarray $dev2 n
+sc1=$c
+chkarray $dev5 n
+sc2=$c
+[ "$sc1" != "$sc2" ] || err "both spares in the same container $sc1"
+tidyup
+}
+
+test13()
+{
+dsc "Test 13: Common domain, two containers, fail a disk in container, action is below spare, the spare should be moved regadless of action"
+setupdevs 0 0 1 $platform
+setupdevs 1 4 5 $platform
+# same domain name, but devices 4 5 6 get action "include" instead of "spare"
+createconfig a
+createconfig domain-$platform $platform spare 0 1
+createconfig domain-$platform $platform include 4 5 6
+monitor 0 1
+mdadm -a /dev/$c1 $dev6
+mdadm --fail /dev/$v0 $dev0
+chksparemoved $c1 $c0 $dev6
+tidyup
+}
+
+test14()
+{
+dsc "Test 14: One domain, small array on big disks, check if small spare is accepted"
+setupdevs 0 8 9 $platform 10000 1
+setupdevs 1 0 1 $platform
+createconfig a
+createconfig domain-$platform $platform spare 0 1 2 8 9
+monitor 0 1
+mdadm -a /dev/$c1 $dev2
+mdadm --fail /dev/$v0 $dev9
+chksparemoved $c1 $c0 $dev2
+tidyup
+}
+
+test15()
+{
+dsc "Test 15: spare in global domain for $platform metadata, should be moved"
+# this is like 9a but only one metadata used
+setupdevs 0 10 11 $platform
+setupdevs 1 8 9 $platform
+createconfig a
+createconfig domain-global $platform spare 8 9 10 11 12
+createconfig domain-1 $platform spare 8 9
+createconfig domain-2 $platform spare 10 11
+monitor 0 1
+mdadm -a /dev/$c1 $dev12
+mdadm --fail /dev/$v0 $dev10
+chksparemoved $c1 $c0 $dev12
+tidyup
+}
+
+try()
+{
+test0
+test0a
+test1
+test1a
+test2
+test3
+test4
+test5
+test6
+if [ "$platform" != "1.2" ]; then
+# this is because we can't have a small spare added to native array
+ test7
+ test7a
+fi
+test8
+test9
+test9a
+if [ "$platform" != "1.2" ]; then
+# we can't create two subarrays on the same devices for native (without
+# partitions)
+ test10
+fi
+test11
+test12
+test13
+test14
+test15
+}
+
+try_failed()
+{
+platform="1.2"
+scan="no"
+test5
+test9
+test13
+scan="yes"
+test9
+}
+
+#try_failed
+
+for scan in no yes; do
+ for platform in 1.2 imsm; do
+ try
+ done
+done
+
+[ "$listfailed" == "no" ] || [ -z "$flist" ] || echo -e "\n FAILED TESTS: $flist"
+
+#cat $targetdir/log
+rm -f /dev/disk/by-path/loop*
diff --git a/tests/12imsm-r0_2d-grow-r0_3d b/tests/12imsm-r0_2d-grow-r0_3d
new file mode 100644
index 0000000..3c6cf74
--- /dev/null
+++ b/tests/12imsm-r0_2d-grow-r0_3d
@@ -0,0 +1,20 @@
+. tests/env-imsm-template
+
+# RAID 0 volume, 2 disks grow to RAID 0 volume, 3 disks
+# POSITIVE test
+
+num_disks=2
+device_list="$dev0 $dev1"
+spare_list="$dev2"
+
+# Before: RAID 0 volume, 2 disks, 64k chunk size
+vol0_level=0
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=64
+vol0_num_comps=$num_disks
+vol0_offset=0
+
+# After: RAID 0 volume, 3 disks, 64k chunk size
+vol0_new_num_comps=$((num_disks + 1))
+
+. tests/imsm-grow-template 0 0
diff --git a/tests/12imsm-r0_2d-grow-r0_4d b/tests/12imsm-r0_2d-grow-r0_4d
new file mode 100644
index 0000000..e4fccda
--- /dev/null
+++ b/tests/12imsm-r0_2d-grow-r0_4d
@@ -0,0 +1,20 @@
+. tests/env-imsm-template
+
+# RAID 0 volume, 2 disks grow to RAID 0 volume, 4 disks
+# POSITIVE test
+
+num_disks=2
+device_list="$dev0 $dev1"
+spare_list="$dev2 $dev3"
+
+# Before: RAID 0 volume, 2 disks, 64k chunk size
+vol0_level=0
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=64
+vol0_num_comps=$num_disks
+vol0_offset=0
+
+# After: RAID 0 volume, 4 disks, 64k chunk size
+vol0_new_num_comps=$((num_disks + 2))
+
+. tests/imsm-grow-template 0 0
diff --git a/tests/12imsm-r0_2d-grow-r0_5d b/tests/12imsm-r0_2d-grow-r0_5d
new file mode 100644
index 0000000..388a5bb
--- /dev/null
+++ b/tests/12imsm-r0_2d-grow-r0_5d
@@ -0,0 +1,20 @@
+. tests/env-imsm-template
+
+# RAID 0 volume, 2 disks grow to RAID 0 volume, 5 disks
+# POSITIVE test
+
+num_disks=2
+device_list="$dev0 $dev1"
+spare_list="$dev2 $dev3 $dev4"
+
+# Before: RAID 0 volume, 2 disks, 64k chunk size
+vol0_level=0
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=64
+vol0_num_comps=$num_disks
+vol0_offset=0
+
+# After: RAID 0 volume, 5 disks, 64k chunk size
+vol0_new_num_comps=$((num_disks + 3))
+
+. tests/imsm-grow-template 0 0
diff --git a/tests/12imsm-r0_3d-grow-r0_4d b/tests/12imsm-r0_3d-grow-r0_4d
new file mode 100644
index 0000000..7065f07
--- /dev/null
+++ b/tests/12imsm-r0_3d-grow-r0_4d
@@ -0,0 +1,20 @@
+. tests/env-imsm-template
+
+# RAID 0 volume, 3 disks grow to RAID 0 volume, 4 disks
+# POSITIVE test
+
+num_disks=3
+device_list="$dev0 $dev1 $dev2"
+spare_list="$dev3"
+
+# Before: RAID 0 volume, 3 disks, 64k chunk size
+vol0_level=0
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=64
+vol0_num_comps=$num_disks
+vol0_offset=0
+
+# After: RAID 0 volume, 4 disks, 64k chunk size
+vol0_new_num_comps=$((num_disks + 1))
+
+. tests/imsm-grow-template 0 0
diff --git a/tests/12imsm-r5_3d-grow-r5_4d b/tests/12imsm-r5_3d-grow-r5_4d
new file mode 100644
index 0000000..097da0a
--- /dev/null
+++ b/tests/12imsm-r5_3d-grow-r5_4d
@@ -0,0 +1,20 @@
+. tests/env-imsm-template
+
+# RAID 5 volume, 3 disks grow to RAID 5 volume, 4 disks
+# POSITIVE test
+
+num_disks=3
+device_list="$dev0 $dev1 $dev2"
+spare_list="$dev3"
+
+# Before: RAID 5 volume, 3 disks, 64k chunk size
+vol0_level=5
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=64
+vol0_num_comps=$((num_disks - 1))
+vol0_offset=0
+
+# After: RAID 5 volume, 4 disks, 64k chunk size
+vol0_new_num_comps=$num_disks
+
+. tests/imsm-grow-template 0 0
diff --git a/tests/12imsm-r5_3d-grow-r5_5d b/tests/12imsm-r5_3d-grow-r5_5d
new file mode 100644
index 0000000..2e5c7d2
--- /dev/null
+++ b/tests/12imsm-r5_3d-grow-r5_5d
@@ -0,0 +1,20 @@
+. tests/env-imsm-template
+
+# RAID 5 volume, 3 disks grow to RAID 5 volume, 5 disks
+# POSITIVE test
+
+num_disks=3
+device_list="$dev0 $dev1 $dev2"
+spare_list="$dev3 $dev4"
+
+# Before: RAID 5 volume, 3 disks, 64k chunk size
+vol0_level=5
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=64
+vol0_num_comps=$((num_disks - 1))
+vol0_offset=0
+
+# After: RAID 5 volume, 5 disks, 64k chunk size
+vol0_new_num_comps=$((num_disks + 1))
+
+. tests/imsm-grow-template 0 0
diff --git a/tests/13imsm-r0_r0_2d-grow-r0_r0_4d b/tests/13imsm-r0_r0_2d-grow-r0_r0_4d
new file mode 100644
index 0000000..66ceeb3
--- /dev/null
+++ b/tests/13imsm-r0_r0_2d-grow-r0_r0_4d
@@ -0,0 +1,29 @@
+. tests/env-imsm-template
+
+# Grow the container (arrays inside) from 2 disks to 4 disks
+# POSITIVE test
+
+num_disks=2
+device_list="$dev0 $dev1"
+spare_list="$dev2 $dev3"
+
+# Before: RAID 0 volume in slot #0, 2 disks, 128k chunk size
+# RAID 0 volume in slot #1, 2 disks, 64k chunk size
+vol0_level=0
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=128
+vol0_num_comps=$num_disks
+vol0_offset=0
+
+vol1_level=0
+vol1_comp_size=$((5 * 1024))
+vol1_chunk=64
+vol1_num_comps=$num_disks
+vol1_offset=$((vol0_comp_size + 4096))
+
+# After: RAID 0 volume in slot #0, 4 disks, 128k chunk size
+# RAID 0 volume in slot #1, 4 disks, 64k chunk size
+vol0_new_num_comps=$((num_disks + 2))
+vol1_new_num_comps=$vol0_new_num_comps
+
+. tests/imsm-grow-template 0 0
diff --git a/tests/13imsm-r0_r0_2d-grow-r0_r0_5d b/tests/13imsm-r0_r0_2d-grow-r0_r0_5d
new file mode 100644
index 0000000..0da9ef3
--- /dev/null
+++ b/tests/13imsm-r0_r0_2d-grow-r0_r0_5d
@@ -0,0 +1,29 @@
+. tests/env-imsm-template
+
+# Grow both members from 2 disks to 5 disks
+# POSITIVE test
+
+num_disks=2
+device_list="$dev0 $dev1"
+spare_list="$dev2 $dev3 $dev4"
+
+# Before: RAID 0 volume in slot #0, 2 disks, 64k chunk size
+# RAID 0 volume in slot #1, 2 disks, 256k chunk size
+vol0_level=0
+vol0_comp_size=$((4 * 1024))
+vol0_chunk=64
+vol0_num_comps=$num_disks
+vol0_offset=0
+
+vol1_level=0
+vol1_comp_size=$((6 * 1024))
+vol1_chunk=256
+vol1_num_comps=$num_disks
+vol1_offset=$((vol0_comp_size + 4096))
+
+# After: RAID 0 volume in slot #0, 5 disks, 64k chunk size
+# RAID 0 volume in slot #1, 5 disks, 256k chunk size
+vol0_new_num_comps=$((num_disks + 3))
+vol1_new_num_comps=$vol0_new_num_comps
+
+. tests/imsm-grow-template 0 0
diff --git a/tests/13imsm-r0_r0_3d-grow-r0_r0_4d b/tests/13imsm-r0_r0_3d-grow-r0_r0_4d
new file mode 100644
index 0000000..1ff6025
--- /dev/null
+++ b/tests/13imsm-r0_r0_3d-grow-r0_r0_4d
@@ -0,0 +1,29 @@
+. tests/env-imsm-template
+
+# Grow a container (arrays inside) from 3 disks to 4 disks
+# POSITIVE test
+
+num_disks=3
+device_list="$dev0 $dev1 $dev2"
+spare_list="$dev3"
+
+# Before: RAID 0 volume in slot #0, 3 disks, 128k chunk size
+# RAID 0 volume in slot #1, 3 disks, 128k chunk size
+vol0_level=0
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=128
+vol0_num_comps=$num_disks
+vol0_offset=0
+
+vol1_level=0
+vol1_comp_size=$((5 * 1024))
+vol1_chunk=128
+vol1_num_comps=$num_disks
+vol1_offset=$((vol0_comp_size + 4096))
+
+# After: RAID0 volume in slot #0, 4 disks, 128k chunk size
+# RAID0 volume in slot #1, 4 disks, 128k chunk size
+vol0_new_num_comps=$((num_disks + 1))
+vol1_new_num_comps=$vol0_new_num_comps
+
+. tests/imsm-grow-template 0 0
diff --git a/tests/13imsm-r0_r5_3d-grow-r0_r5_4d b/tests/13imsm-r0_r5_3d-grow-r0_r5_4d
new file mode 100644
index 0000000..2977f36
--- /dev/null
+++ b/tests/13imsm-r0_r5_3d-grow-r0_r5_4d
@@ -0,0 +1,29 @@
+. tests/env-imsm-template
+
+# Grow the container (arrays inside) from 3 disks to 4 disks
+# POSITIVE test
+
+num_disks=3
+device_list="$dev0 $dev1 $dev2"
+spare_list="$dev3"
+
+# Before: RAID 0 volume in slot #0, 3 disks, 64k chunk size
+# RAID 5 volume in slot #1, 3 disks, 128k chunk size
+vol0_level=0
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=64
+vol0_num_comps=$num_disks
+vol0_offset=0
+
+vol1_level=5
+vol1_comp_size=$((5 * 1024))
+vol1_chunk=128
+vol1_num_comps=$((num_disks - 1))
+vol1_offset=$((vol0_comp_size + 4096))
+
+# After: RAID 0 volume in slot #0, 4 disks, 64k chunk size
+# RAID 5 volume in slot #1, 4 disks, 128k chunk size
+vol1_new_num_comps=$num_disks
+vol0_new_num_comps=$((num_disks + 1))
+
+. tests/imsm-grow-template 0 0
diff --git a/tests/13imsm-r0_r5_3d-grow-r0_r5_5d b/tests/13imsm-r0_r5_3d-grow-r0_r5_5d
new file mode 100644
index 0000000..ff15ad0
--- /dev/null
+++ b/tests/13imsm-r0_r5_3d-grow-r0_r5_5d
@@ -0,0 +1,29 @@
+. tests/env-imsm-template
+
+# Grow the container (arrays inside) from 3 disks to 5 disks
+# POSITIVE test
+
+num_disks=3
+device_list="$dev0 $dev1 $dev2"
+spare_list="$dev3 $dev4"
+
+# Before: RAID 0 volume in slot #0, 3 disks, 128k chunk size
+# RAID 5 volume in slot #1, 3 disks, 128k chunk size
+vol0_level=0
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=128
+vol0_num_comps=$num_disks
+vol0_offset=0
+
+vol1_level=5
+vol1_comp_size=$((5 * 1024))
+vol1_chunk=128
+vol1_num_comps=$((num_disks - 1))
+vol1_offset=$((vol0_comp_size + 4096))
+
+# After: RAID 0 volume in slot #0, 5 disks, 128k chunk size
+# RAID 5 volume in slot #1, 5 disks, 128k chunk size
+vol0_new_num_comps=$((num_disks + 2))
+vol1_new_num_comps=$((num_disks + 1))
+
+. tests/imsm-grow-template 0 0
diff --git a/tests/13imsm-r5_r0_3d-grow-r5_r0_4d b/tests/13imsm-r5_r0_3d-grow-r5_r0_4d
new file mode 100644
index 0000000..9fed88a
--- /dev/null
+++ b/tests/13imsm-r5_r0_3d-grow-r5_r0_4d
@@ -0,0 +1,29 @@
+. tests/env-imsm-template
+
+# Grow the container (arrays inside) from 3 disks to 4 disks
+# POSITIVE test
+
+num_disks=3
+device_list="$dev0 $dev1 $dev2"
+spare_list="$dev3"
+
+# Before: RAID 5 volume in slot #0, 3 disks, 64k chunk size
+# RAID 0 volume in slot #1, 3 disks, 64k chunk size
+vol0_level=5
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=64
+vol0_num_comps=$((num_disks - 1))
+vol0_offset=0
+
+vol1_level=0
+vol1_comp_size=$((5 * 1024))
+vol1_chunk=64
+vol1_offset=$((vol0_comp_size + 4096))
+vol1_num_comps=$num_disks
+
+# After: RAID 5 volume in slot #0, 4 disks, 64k chunk size
+# RAID 0 volume in slot #1, 4 disks, 64k chunk size
+vol0_new_num_comps=$num_disks
+vol1_new_num_comps=$((num_disks + 1))
+
+. tests/imsm-grow-template 0 0
diff --git a/tests/13imsm-r5_r0_3d-grow-r5_r0_5d b/tests/13imsm-r5_r0_3d-grow-r5_r0_5d
new file mode 100644
index 0000000..e8beddc
--- /dev/null
+++ b/tests/13imsm-r5_r0_3d-grow-r5_r0_5d
@@ -0,0 +1,29 @@
+. tests/env-imsm-template
+
+# Grow the container (arrays inside) from 3 disks to 5 disks
+# POSITIVE test
+
+num_disks=3
+device_list="$dev0 $dev1 $dev2"
+spare_list="$dev3 $dev4"
+
+# Before: RAID 5 volume in slot #0, 3 disks, 64k chunk size
+# RAID 0 volume in slot #1, 3 disks, 64k chunk size
+vol0_level=5
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=64
+vol0_num_comps=$((num_disks - 1))
+vol0_offset=0
+
+vol1_level=0
+vol1_comp_size=$((5 * 1024))
+vol1_chunk=64
+vol1_offset=$((vol0_comp_size + 4096))
+vol1_num_comps=$num_disks
+
+# After: RAID 5 volume in slot #0, 5 disks, 64k chunk size
+# RAID 0 volume in slot #1, 5 disks, 64k chunk size
+vol0_new_num_comps=$((num_disks + 1))
+vol1_new_num_comps=$((num_disks + 2))
+
+. tests/imsm-grow-template 0 0
diff --git a/tests/14imsm-r0_3d-r5_3d-migrate-r5_4d-r5_4d b/tests/14imsm-r0_3d-r5_3d-migrate-r5_4d-r5_4d
new file mode 100644
index 0000000..cb7328a
--- /dev/null
+++ b/tests/14imsm-r0_3d-r5_3d-migrate-r5_4d-r5_4d
@@ -0,0 +1,29 @@
+. tests/env-imsm-template
+
+# RAID 0 and RAID 5 volumes (3 disks) migrate to RAID 5 and RAID 5 volumes (4 disks)
+# NEGATIVE test - migration is not allowed if there is more than one array in a container
+
+num_disks=3
+device_list="$dev0 $dev1 $dev2"
+spare_list="$dev3"
+
+# Before: RAID 0 volume, 3 disks, 64k chunk size, as member #0
+vol0_level=0
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=64
+vol0_num_comps=$num_disks
+vol0_offset=0
+
+# Extra: RAID 5 volume, 3 disks, 64k chunk size, as member #1
+vol1_level=5
+vol1_comp_size=$((5 * 1024))
+vol1_chunk=64
+vol1_num_comps=$((num_disks - 1))
+vol1_offset=$((vol0_comp_size + 4096))
+
+# After: RAID 5 volume, 4 disks, 64k chunk size (only member #0)
+vol0_new_level=5
+vol0_new_num_comps=$num_disks
+vol0_new_chunk=64
+
+. tests/imsm-grow-template 1 1
diff --git a/tests/14imsm-r0_3d_no_spares-migrate-r5_3d b/tests/14imsm-r0_3d_no_spares-migrate-r5_3d
new file mode 100644
index 0000000..10bbab6
--- /dev/null
+++ b/tests/14imsm-r0_3d_no_spares-migrate-r5_3d
@@ -0,0 +1,21 @@
+. tests/env-imsm-template
+
+# RAID 0 volume (3 disks, no spares) migrate to RAID 5 volume (3 disks)
+# NEGATIVE test
+
+num_disks=3
+device_list="$dev0 $dev1 $dev2"
+
+# Before: RAID 0 volume, 3 disks, 64k chunk size
+vol0_level=0
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=64
+vol0_num_comps=$num_disks
+vol0_offset=0
+
+# After: RAID 5, 3 disks, 64k chunk size
+vol0_new_level=5
+vol0_new_num_comps=$((num_disks - 1))
+vol0_new_chunk=64
+
+. tests/imsm-grow-template 1
diff --git a/tests/14imsm-r0_r0_2d-takeover-r10_4d b/tests/14imsm-r0_r0_2d-takeover-r10_4d
new file mode 100644
index 0000000..d068abb
--- /dev/null
+++ b/tests/14imsm-r0_r0_2d-takeover-r10_4d
@@ -0,0 +1,30 @@
+. tests/env-imsm-template
+
+
+# Two RAID 0 volumes (2 disks) migrate to RAID 10 volume (4 disks)
+# NEGATIVE test
+
+num_disks=2
+device_list="$dev0 $dev1"
+
+# Before: RAID 0 volume in slot #0, 2 disks, 64k chunk size
+# RAID 0 volume in slot #1, 2 disks, 64k chunk size
+vol0_level=0
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=64
+vol0_num_comps=$num_disks
+vol0_offset=0
+
+# Second RAID 0 volume (slot #1), 2 disks, 64k chunk size
+vol1_level=0
+vol1_comp_size=$((5 * 1024))
+vol1_chunk=64
+vol1_num_comps=$num_disks
+vol1_offset=$(( $vol0_comp_size + 4096 ))
+
+# After: RAID 10, 4 disks, 64k chunk size
+vol0_new_level=10
+vol0_new_num_comps=$((num_disks - 1))
+vol0_new_chunk=64
+
+. tests/imsm-grow-template 1 1
diff --git a/tests/14imsm-r10_4d-grow-r10_5d b/tests/14imsm-r10_4d-grow-r10_5d
new file mode 100644
index 0000000..bcbe147
--- /dev/null
+++ b/tests/14imsm-r10_4d-grow-r10_5d
@@ -0,0 +1,20 @@
+. tests/env-imsm-template
+
+# RAID 10 volume, 4 disks grow to RAID 10 volume, 5 disks
+# NEGATIVE test
+
+num_disks=4
+device_list="$dev0 $dev1 $dev2 $dev3"
+spare_list="$dev4"
+
+# Before: RAID 10 volume, 4 disks, 128k chunk size
+vol0_level=10
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=128
+vol0_num_comps=$((num_disks - 2))
+vol0_offset=0
+
+# After: RAID 10 volume, 5 disks, 128k chunk size (test should fail)
+vol0_new_num_comps=$((num_disks + 1))
+
+. tests/imsm-grow-template 1 0
diff --git a/tests/14imsm-r10_r5_4d-takeover-r0_2d b/tests/14imsm-r10_r5_4d-takeover-r0_2d
new file mode 100644
index 0000000..720e575
--- /dev/null
+++ b/tests/14imsm-r10_r5_4d-takeover-r0_2d
@@ -0,0 +1,30 @@
+. tests/env-imsm-template
+
+
+# Two RAID volumes: RAID10 and RAID5 (4 disks) migrate to RAID 0 volume (2 disks)
+# NEGATIVE test
+
+num_disks=4
+device_list="$dev0 $dev1 $dev2 $dev3"
+
+# Before: RAID 10 volume in slot #0, 4 disks, 64k chunk size
+# RAID 5 volume in slot #1, 4 disks, 64k chunk size
+vol0_level=10
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=64
+vol0_num_comps=$(( $num_disks - 2 ))
+vol0_offset=0
+
+# Before: RAID 5 volume, 4 disks, 64k chunk size
+vol1_level=5
+vol1_comp_size=$((5 * 1024))
+vol1_chunk=64
+vol1_num_comps=$(( $num_disks - 1 ))
+vol1_offset=$(( $vol0_comp_size + 4096 ))
+
+# After: RAID 0, 2 disks, 64k chunk size
+vol0_new_level=0
+vol0_new_num_comps=2
+vol0_new_chunk=64
+
+. tests/imsm-grow-template 1 1
diff --git a/tests/14imsm-r1_2d-grow-r1_3d b/tests/14imsm-r1_2d-grow-r1_3d
new file mode 100644
index 0000000..be20ab8
--- /dev/null
+++ b/tests/14imsm-r1_2d-grow-r1_3d
@@ -0,0 +1,19 @@
+. tests/env-imsm-template
+
+# RAID 1 volume, 2 disks grow to RAID 1 volume, 3 disks
+# NEGATIVE test
+
+num_disks=2
+device_list="$dev0 $dev1"
+spare_list="$dev4"
+
+# Before: RAID 1 volume, 2 disks, 64k chunk size
+vol0_level=1
+vol0_comp_size=$((5 * 1024))
+vol0_num_comps=$((num_disks - 1))
+vol0_offset=0
+
+# After: RAID 1 volume, 3 disks, 64k chunks size (test should fail)
+vol0_new_num_comps=$num_disks
+
+. tests/imsm-grow-template 1 0
diff --git a/tests/14imsm-r1_2d-takeover-r0_2d b/tests/14imsm-r1_2d-takeover-r0_2d
new file mode 100644
index 0000000..27002e1
--- /dev/null
+++ b/tests/14imsm-r1_2d-takeover-r0_2d
@@ -0,0 +1,21 @@
+. tests/env-imsm-template
+
+# RAID 1 volume, 2 disks change to RAID 0 volume, 2 disks
+#
+#NEGATIVE test
+
+num_disks=2
+device_list="$dev0 $dev1"
+
+# Before: RAID 1 volume, 2 disks, 64k chunk size
+vol0_level=1
+vol0_comp_size=$((5 * 1024))
+vol0_num_comps=$((num_disks - 1))
+vol0_offset=0
+
+# After: RAID 0 volume, 2 disks, 64k chunk size
+vol0_new_level=0
+vol0_new_num_comps=$num_disks
+vol0_new_chunk=64
+
+. tests/imsm-grow-template 1
diff --git a/tests/14imsm-r5_3d-grow-r5_5d-no-spares b/tests/14imsm-r5_3d-grow-r5_5d-no-spares
new file mode 100644
index 0000000..ed18e72
--- /dev/null
+++ b/tests/14imsm-r5_3d-grow-r5_5d-no-spares
@@ -0,0 +1,20 @@
+. tests/env-imsm-template
+
+# RAID 5 volume, 3 disks grow to RAID 5 volume, 5 disks (no spares available)
+# NEGATIVE test
+
+num_disks=3
+device_list="$dev0 $dev1 $dev2"
+
+# Before: RAID 5 volume, 3 disks, 64k chunk size
+vol0_level=5
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=64
+vol0_num_comps=$((num_disks - 1))
+vol0_offset=0
+
+# After: RAID 5 volume, 5 disks, 64k chunk size
+add_to_num_disks=2
+vol0_new_num_comps=$((num_disks + 2))
+
+. tests/imsm-grow-template 1 0
diff --git a/tests/14imsm-r5_3d-migrate-r4_3d b/tests/14imsm-r5_3d-migrate-r4_3d
new file mode 100644
index 0000000..e3b971c
--- /dev/null
+++ b/tests/14imsm-r5_3d-migrate-r4_3d
@@ -0,0 +1,21 @@
+. tests/env-imsm-template
+
+# RAID 5 volume (3 disks) migrate to RAID 4 volume (3 disks)
+# NEGATIVE test
+
+num_disks=3
+device_list="$dev0 $dev1 $dev2"
+
+# Before: RAID 5 volume, 3 disks, 64k chunk size
+vol0_level=5
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=64
+vol0_num_comps=$((num_disks - 1))
+vol0_offset=0
+
+# After: RAID 4, 3 disks, 64k chunk size
+vol0_new_level=4
+vol0_new_num_comps=$((num_disks - 1))
+vol0_new_chunk=64
+
+. tests/imsm-grow-template 1
diff --git a/tests/15imsm-r0_3d_64k-migrate-r0_3d_256k b/tests/15imsm-r0_3d_64k-migrate-r0_3d_256k
new file mode 100644
index 0000000..4fe3807
--- /dev/null
+++ b/tests/15imsm-r0_3d_64k-migrate-r0_3d_256k
@@ -0,0 +1,21 @@
+. tests/env-imsm-template
+
+# RAID 0 volume, Migration from 64k to 256k chunk size.
+# POSITIVE test
+
+num_disks=2
+device_list="$dev0 $dev1"
+
+# RAID 0, 2 disks, 64k chunk size
+vol0_level=0
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=64
+vol0_num_comps=$num_disks
+vol0_offset=0
+
+# RAID 0, 2 disks, 256k chunk size
+vol0_new_level=0
+vol0_new_num_comps=$vol0_num_comps
+vol0_new_chunk=256
+
+. tests/imsm-grow-template 0 1
diff --git a/tests/15imsm-r5_3d_4k-migrate-r5_3d_256k b/tests/15imsm-r5_3d_4k-migrate-r5_3d_256k
new file mode 100644
index 0000000..025e9ef
--- /dev/null
+++ b/tests/15imsm-r5_3d_4k-migrate-r5_3d_256k
@@ -0,0 +1,21 @@
+. tests/env-imsm-template
+
+# RAID 5 volume, Migration from 4k to 256k chunk size.
+# POSITIVE test
+
+num_disks=3
+device_list="$dev0 $dev1 $dev2"
+
+# RAID 5, 3 disks, 4k chunk size
+vol0_level=5
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=4
+vol0_num_comps=$((num_disks - 1))
+vol0_offset=0
+
+# RAID 5, 3 disks, 256k chunk size
+vol0_new_level=5
+vol0_new_num_comps=$vol0_num_comps
+vol0_new_chunk=256
+
+. tests/imsm-grow-template 0 1
diff --git a/tests/15imsm-r5_3d_64k-migrate-r5_3d_256k b/tests/15imsm-r5_3d_64k-migrate-r5_3d_256k
new file mode 100644
index 0000000..37547b7
--- /dev/null
+++ b/tests/15imsm-r5_3d_64k-migrate-r5_3d_256k
@@ -0,0 +1,21 @@
+. tests/env-imsm-template
+
+# RAID 5 volume, Migration from 64k to 256k chunk size.
+# POSITIVE test
+
+num_disks=3
+device_list="$dev0 $dev1 $dev2"
+
+# RAID 5, 3 disks, 64k chunk size
+vol0_level=5
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=64
+vol0_num_comps=$((num_disks - 1))
+vol0_offset=0
+
+# RAID 5, 3 disks, 256k chunk size
+vol0_new_level=5
+vol0_new_num_comps=$vol0_num_comps
+vol0_new_chunk=256
+
+. tests/imsm-grow-template 0 1
diff --git a/tests/15imsm-r5_6d_4k-migrate-r5_6d_256k b/tests/15imsm-r5_6d_4k-migrate-r5_6d_256k
new file mode 100644
index 0000000..d2f6c70
--- /dev/null
+++ b/tests/15imsm-r5_6d_4k-migrate-r5_6d_256k
@@ -0,0 +1,21 @@
+. tests/env-imsm-template
+
+# RAID 5 volume, Migration from 4k to 256k chunk size.
+# POSITIVE test
+
+num_disks=6
+device_list="$dev0 $dev1 $dev2 $dev3 $dev4 $dev5"
+
+# RAID 5, 6 disks, 4k chunk size
+vol0_level=5
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=4
+vol0_num_comps=$((num_disks - 1))
+vol0_offset=0
+
+# RAID 5, 6 disks, 256k chunk size
+vol0_new_level=5
+vol0_new_num_comps=$vol0_num_comps
+vol0_new_chunk=256
+
+. tests/imsm-grow-template 0 1
diff --git a/tests/15imsm-r5_r0_3d_64k-migrate-r5_r0_3d_256k b/tests/15imsm-r5_r0_3d_64k-migrate-r5_r0_3d_256k
new file mode 100644
index 0000000..f9369d5
--- /dev/null
+++ b/tests/15imsm-r5_r0_3d_64k-migrate-r5_r0_3d_256k
@@ -0,0 +1,34 @@
+. tests/env-imsm-template
+
+# Member 0: RAID 5 volume, Member 1: RAID 0 volume
+# Migration from 64k to 256k chunk size (both members)
+# POSITIVE test
+
+num_disks=3
+device_list="$dev0 $dev1 $dev2"
+
+# RAID 5, 3 disks, 64k chunk size
+vol0_level=5
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=64
+vol0_num_comps=$((num_disks - 1))
+vol0_offset=0
+
+# After migration parameters
+vol0_new_level=5
+vol0_new_num_comps=$vol0_num_comps
+vol0_new_chunk=256
+
+# RAID 0, 3 disks, 64k chunk size
+vol1_level=0
+vol1_comp_size=$((5 * 1024))
+vol1_chunk=64
+vol1_num_comps=$num_disks
+vol1_offset=$((vol0_comp_size + 4096))
+
+# After migration parameters
+vol1_new_level=0
+vol1_new_num_comps=$vol1_num_comps
+vol1_new_chunk=256
+
+. tests/imsm-grow-template 0 1
diff --git a/tests/16imsm-r0_3d-migrate-r5_4d b/tests/16imsm-r0_3d-migrate-r5_4d
new file mode 100644
index 0000000..265adf9
--- /dev/null
+++ b/tests/16imsm-r0_3d-migrate-r5_4d
@@ -0,0 +1,22 @@
+. tests/env-imsm-template
+
+# RAID 0 volume (3 disks) migrate to RAID 5 volume (4 disks)
+# POSITIVE test
+
+num_disks=3
+device_list="$dev0 $dev1 $dev2"
+
+# Before: RAID 0, 3 disks, 64k chunk size
+vol0_level=0
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=64
+vol0_num_comps=$num_disks
+vol0_offset=0
+
+# After: RAID 5, 4 disks, 64k chunk size
+vol0_new_level=5
+new_num_disks=4
+vol0_new_num_comps=$num_disks
+vol0_new_chunk=64
+
+. tests/imsm-grow-template 0 1
diff --git a/tests/16imsm-r0_5d-migrate-r5_6d b/tests/16imsm-r0_5d-migrate-r5_6d
new file mode 100644
index 0000000..535b609
--- /dev/null
+++ b/tests/16imsm-r0_5d-migrate-r5_6d
@@ -0,0 +1,22 @@
+. tests/env-imsm-template
+
+# RAID 0 volume (5 disks) migrate to RAID 5 volume (6 disks)
+# POSITIVE test
+
+num_disks=5
+device_list="$dev0 $dev1 $dev2 $dev3 $dev4"
+
+# Before: RAID 0, 5 disks, 64k chunk size
+vol0_level=0
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=64
+vol0_num_comps=$num_disks
+vol0_offset=0
+
+# After: RAID 5, 6 disks, 64k chunk size
+vol0_new_level=5
+vol0_new_num_comps=$num_disks
+vol0_new_chunk=64
+new_num_disks=6
+
+. tests/imsm-grow-template 0 1
diff --git a/tests/16imsm-r5_3d-migrate-r0_3d b/tests/16imsm-r5_3d-migrate-r0_3d
new file mode 100644
index 0000000..bcb5709
--- /dev/null
+++ b/tests/16imsm-r5_3d-migrate-r0_3d
@@ -0,0 +1,21 @@
+. tests/env-imsm-template
+
+# RAID 5 volume (3 disks) migrate to RAID 0 volume (2 disks)
+# NEGATIVE test
+
+num_disks=3
+device_list="$dev0 $dev1 $dev2"
+
+# Before: RAID 5, 3 disks, 64k chunk size
+vol0_level=5
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=64
+vol0_num_comps=$((num_disks - 1))
+vol0_offset=0
+
+# After: RAID 0, 3 disks, 64k chunk size
+vol0_new_level=0
+vol0_new_num_comps=$((num_disks-1))
+vol0_new_chunk=64
+
+. tests/imsm-grow-template 1 1
diff --git a/tests/16imsm-r5_5d-migrate-r0_5d b/tests/16imsm-r5_5d-migrate-r0_5d
new file mode 100644
index 0000000..ca77435
--- /dev/null
+++ b/tests/16imsm-r5_5d-migrate-r0_5d
@@ -0,0 +1,21 @@
+. tests/env-imsm-template
+
+# RAID 5 volume (5 disks) migration to RAID 0 volume (4 disks)
+# NEGATIVE test
+
+num_disks=5
+device_list="$dev0 $dev1 $dev2 $dev3 $dev4"
+
+# Before: RAID 5 volume, 5 disks, 64k chunk size
+vol0_level=5
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=64
+vol0_num_comps=$((num_disks - 1))
+vol0_offset=0
+
+# After: RAID 0 volume, 5 disks, 64k chunk size
+vol0_new_level=0
+vol0_new_num_comps=$((num_disks - 1))
+vol0_new_chunk=64
+
+. tests/imsm-grow-template 1 1
diff --git a/tests/18imsm-1d-takeover-r0_1d b/tests/18imsm-1d-takeover-r0_1d
new file mode 100644
index 0000000..6f5cf5a
--- /dev/null
+++ b/tests/18imsm-1d-takeover-r0_1d
@@ -0,0 +1,22 @@
+. tests/env-imsm-template
+
+# Create RAID 0 from a single disk.
+# POSITIVE test
+
+vol0_num_comps=1
+vol0_comp_size=$((10 * 1024))
+
+# Create container
+mdadm --create --run $container --auto=md --metadata=imsm --force --raid-disks=$vol0_num_comps $dev0
+check wait
+imsm_check container $vol0_num_comps
+
+# Create RAID 0 volume
+mdadm --create --run $member0 --auto=md --level=0 --size=$vol0_comp_size --chunk=64 --force --raid-disks=$vol0_num_comps $dev0
+check wait
+
+# Test the member
+imsm_check member $member0 $vol0_num_comps 0 $vol0_comp_size $((vol0_num_comps * vol0_comp_size)) 0 64
+testdev $member0 $vol0_num_comps $vol0_comp_size 64
+
+exit 0
diff --git a/tests/18imsm-1d-takeover-r1_2d b/tests/18imsm-1d-takeover-r1_2d
new file mode 100644
index 0000000..e38ed89
--- /dev/null
+++ b/tests/18imsm-1d-takeover-r1_2d
@@ -0,0 +1,20 @@
+. tests/env-imsm-template
+
+# Create RAID 1 from a single disk
+# POSITIVE test
+
+vol0_num_comps=1
+vol0_comp_size=$((10 * 1024))
+
+# Create container
+mdadm --create --run $container --auto=md --metadata=imsm --force --raid-disks=$vol0_num_comps $dev0
+check wait
+imsm_check container $vol0_num_comps
+
+# Create RAID 1 volume
+mdadm --create --run $member0 --auto=md --level=1 --size=$vol0_comp_size --raid-disks=$((vol0_num_comps + 1)) $dev0 missing
+check wait
+
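+# Test the member0. NOTE(review): vol_num_comps below is not set in this script (arithmetic treats it as 0, so 1 disk is checked); vol0_num_comps may have been intended - confirm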
+imsm_check member $member0 $((vol_num_comps + 1)) 1 $vol0_comp_size $((vol0_num_comps * vol0_comp_size)) 0 64
+testdev $member0 $vol0_num_comps $vol0_comp_size 64
diff --git a/tests/18imsm-r0_2d-takeover-r10_4d b/tests/18imsm-r0_2d-takeover-r10_4d
new file mode 100644
index 0000000..0e77e5d
--- /dev/null
+++ b/tests/18imsm-r0_2d-takeover-r10_4d
@@ -0,0 +1,22 @@
+. tests/env-imsm-template
+
+# RAID 0 volume, 2 disks change to RAID 10 volume, 4 disks
+# POSITIVE test
+
+num_disks=2
+device_list="$dev0 $dev1"
+spare_list="$dev2 $dev3"
+
+# Before: RAID 0 volume, 2 disks, 128k chunk size
+vol0_level=0
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=128
+vol0_num_comps=$num_disks
+vol0_offset=0
+
+# After: RAID 10 volume, 4 disks, 128k chunk size
+vol0_new_level=10
+vol0_new_num_comps=$vol0_num_comps
+vol0_new_chunk=128
+
+. tests/imsm-grow-template 0 1
diff --git a/tests/18imsm-r10_4d-takeover-r0_2d b/tests/18imsm-r10_4d-takeover-r0_2d
new file mode 100644
index 0000000..8a9606b
--- /dev/null
+++ b/tests/18imsm-r10_4d-takeover-r0_2d
@@ -0,0 +1,22 @@
+. tests/env-imsm-template
+
+# RAID 10 volume, 4 disks change to RAID 0 volume, 2 disks
+# POSITIVE test
+
+num_disks=4
+device_list="$dev0 $dev1 $dev2 $dev3"
+
+# Before: RAID 10 volume, 4 disks, 128k chunk size
+vol0_level=10
+vol0_comp_size=$((5 * 1024))
+vol0_chunk=128
+vol0_num_comps=$((num_disks - 2))
+vol0_offset=0
+
+# After: RAID 0 volume, 2 disks, 128k chunk size
+vol0_new_level=0
+vol0_new_num_comps=2
+vol0_new_chunk=128
+new_num_disks=2
+
+. tests/imsm-grow-template 0 1
diff --git a/tests/18imsm-r1_2d-takeover-r0_1d b/tests/18imsm-r1_2d-takeover-r0_1d
new file mode 100644
index 0000000..049f19c
--- /dev/null
+++ b/tests/18imsm-r1_2d-takeover-r0_1d
@@ -0,0 +1,21 @@
+. tests/env-imsm-template
+
+# RAID 1 volume, 2 disks change to RAID 0 volume, 1 disks
+# POSITIVE test
+
+num_disks=2
+device_list="$dev0 $dev1"
+
+# Before: RAID 1 volume, 2 disks
+vol0_level=1
+vol0_comp_size=$((5 * 1024))
+vol0_num_comps=$(( $num_disks - 1 ))
+vol0_offset=0
+
+# After: RAID 0 volume, 1 disks, 64k chunk size
+vol0_new_level=0
+vol0_new_num_comps=1
+vol0_new_chunk=64
+new_num_disks=1
+
+. tests/imsm-grow-template 0 1
diff --git a/tests/19raid6auto-repair b/tests/19raid6auto-repair
new file mode 100644
index 0000000..ce4a7c0
--- /dev/null
+++ b/tests/19raid6auto-repair
@@ -0,0 +1,49 @@
+number_of_disks=5
+chunksize_in_kib=512
+chunksize_in_b=$[chunksize_in_kib*1024]
+array_data_size_in_kib=$[chunksize_in_kib*(number_of_disks-2)*number_of_disks]
+array_data_size_in_b=$[array_data_size_in_kib*1024]
+devs="$dev0 $dev1 $dev2 $dev3 $dev4"
+
+# default 2048 sectors
+data_offset_in_kib=$[2048/2]
+
+# create the random data file that each raid6 array is filled from
+dd if=/dev/urandom of=/tmp/RandFile bs=1024 count=$array_data_size_in_kib
+
+# perform test for every layout
+layouts="ls rs la ra parity-first ddf-zero-restart ddf-N-restart ddf-N-continue \
+ left-asymmetric-6 right-asymmetric-6 left-symmetric-6 \
+ right-symmetric-6 parity-first-6"
+
+for layout in $layouts
+do
+ mdadm -CR $md0 -l6 --layout=$layout -n$number_of_disks -c $chunksize_in_kib $devs
+ dd if=/tmp/RandFile of=$md0 bs=1024 count=$array_data_size_in_kib
+ blockdev --flushbufs $md0; sync
+ check wait
+ blockdev --flushbufs $devs; sync
+ echo 3 > /proc/sys/vm/drop_caches
+ cmp -s -n $array_data_size_in_b $md0 /tmp/RandFile || { echo sanity cmp failed ; exit 2; }
+
+ # wipe out 5 chunks on each device
+ dd if=/dev/urandom of=$dev0 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*0]
+ dd if=/dev/urandom of=$dev1 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*5]
+ dd if=/dev/urandom of=$dev2 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*10]
+ dd if=/dev/urandom of=$dev3 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*15]
+ dd if=/dev/urandom of=$dev4 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*20]
+
+ blockdev --flushbufs $devs; sync
+ echo 3 > /proc/sys/vm/drop_caches
+
+ $dir/raid6check $md0 0 0 2>&1 | grep -qs "Error" || { echo should detect errors; exit 2; }
+
+ $dir/raid6check $md0 0 0 autorepair > /dev/null || { echo repair failed; exit 2; }
+ blockdev --flushbufs $md0 $devs; sync
+ echo 3 > /proc/sys/vm/drop_caches
+
+ $dir/raid6check $md0 0 0 2>&1 | grep -qs "Error" && { echo errors detected; exit 2; }
+ cmp -s -n $array_data_size_in_b $md0 /tmp/RandFile || { echo cmp failed ; exit 2; }
+
+ mdadm -S $md0
+done
diff --git a/tests/19raid6check b/tests/19raid6check
new file mode 100644
index 0000000..67958c6
--- /dev/null
+++ b/tests/19raid6check
@@ -0,0 +1,27 @@
+#
+# Confirm that raid6check handles all RAID6 layouts.
+# Try both 4 and 5 devices.
+
+layouts='ls rs la ra'
+lv=`uname -r`
+if expr $lv '>=' 2.6.30 > /dev/null
+then
+ layouts="$layouts parity-first ddf-zero-restart ddf-N-restart ddf-N-continue \
+ left-asymmetric-6 right-asymmetric-6 left-symmetric-6 right-symmetric-6 parity-first-6"
+fi
+
+for layout in $layouts
+do
+ for devs in 4 5
+ do
+ dl="$dev0 $dev1 $dev2 $dev3"
+ if [ $devs = 5 ]; then dl="$dl $dev4"; fi
+
+ mdadm -CR $md0 -l6 --layout $layout -n$devs $dl
+ check wait
+ tar cf - /etc > $md0
+ ./raid6check $md0 0 0 | grep 'Error detected' && exit 1
+ mdadm -S $md0
+ done
+done
+
diff --git a/tests/19raid6repair b/tests/19raid6repair
new file mode 100644
index 0000000..26846cc
--- /dev/null
+++ b/tests/19raid6repair
@@ -0,0 +1,56 @@
+number_of_disks=4
+chunksize_in_kib=512
+chunksize_in_b=$[chunksize_in_kib*1024]
+array_data_size_in_kib=$[chunksize_in_kib*(number_of_disks-2)*number_of_disks]
+array_data_size_in_b=$[array_data_size_in_kib*1024]
+devs="$dev1 $dev2 $dev3 $dev4"
+
+# default 2048 sectors
+data_offset_in_kib=$[2048/2]
+
+layouts="ls rs la ra parity-first ddf-zero-restart ddf-N-restart ddf-N-continue \
+ left-asymmetric-6 right-asymmetric-6 left-symmetric-6 \
+ right-symmetric-6 parity-first-6"
+
+for layout in $layouts
+do
+ for failure in "$dev3 3 3 2" "$dev3 3 2 3" "$dev3 3 2 1" "$dev3 3 2 0" \
+ "$dev4 3 3 0" "$dev4 3 3 1" "$dev4 3 3 2" \
+ "$dev1 3 0 1" "$dev1 3 0 2" "$dev1 3 0 3" \
+ "$dev2 3 1 0" "$dev2 3 1 2" "$dev2 3 1 3" ; do
+ failure_split=( $failure )
+ device_with_error=${failure_split[0]}
+ stripe_with_error=${failure_split[1]}
+ repair_params="$stripe_with_error ${failure_split[2]} ${failure_split[3]}"
+ start_of_errors_in_kib=$[data_offset_in_kib+chunksize_in_kib*stripe_with_error]
+
+ # fill a freshly created raid6 array from a random file
+ dd if=/dev/urandom of=/tmp/RandFile bs=1024 count=$array_data_size_in_kib
+ mdadm -CR $md0 -l6 --layout=$layout -n$number_of_disks -c $chunksize_in_kib $devs
+ dd if=/tmp/RandFile of=$md0 bs=1024 count=$array_data_size_in_kib
+ blockdev --flushbufs $md0; sync
+
+ check wait
+ blockdev --flushbufs $devs; sync
+ echo 3 > /proc/sys/vm/drop_caches
+ cmp -s -n $array_data_size_in_b $md0 /tmp/RandFile || { echo sanity cmp failed ; exit 2; }
+
+ dd if=/dev/urandom of=$device_with_error bs=1024 count=$chunksize_in_kib seek=$start_of_errors_in_kib
+ blockdev --flushbufs $device_with_error; sync
+ echo 3 > /proc/sys/vm/drop_caches
+
+ $dir/raid6check $md0 0 0 2>&1 | grep -qs "Error" || { echo should detect errors; exit 2; }
+
+ $dir/raid6check $md0 repair $repair_params > /dev/null || { echo repair failed; exit 2; }
+ blockdev --flushbufs $md0 $devs; sync
+ echo 3 > /proc/sys/vm/drop_caches
+
+ $dir/raid6check $md0 0 0 2>&1 | grep -qs "Error" && { echo errors detected; exit 2; }
+ cmp -s -n $array_data_size_in_b $md0 /tmp/RandFile || { echo cmp failed ; exit 2; }
+
+ mdadm -S $md0
+ udevadm settle
+ sync
+ echo 3 > /proc/sys/vm/drop_caches
+ done
+done
diff --git a/tests/19repair-does-not-destroy b/tests/19repair-does-not-destroy
new file mode 100644
index 0000000..a92883f
--- /dev/null
+++ b/tests/19repair-does-not-destroy
@@ -0,0 +1,28 @@
+number_of_disks=7
+chunksize_in_kib=512
+array_data_size_in_kib=$[chunksize_in_kib*(number_of_disks-2)*number_of_disks]
+array_data_size_in_b=$[array_data_size_in_kib*1024]
+devs="$dev0 $dev1 $dev2 $dev3 $dev4 $dev5 $dev6"
+
+dd if=/dev/urandom of=/tmp/RandFile bs=1024 count=$array_data_size_in_kib
+mdadm -CR $md0 -l6 -n$number_of_disks -c $chunksize_in_kib $devs
+dd if=/tmp/RandFile of=$md0 bs=1024 count=$array_data_size_in_kib
+blockdev --flushbufs $md0; sync
+check wait
+blockdev --flushbufs $devs; sync
+echo 3 > /proc/sys/vm/drop_caches
+$dir/raid6check $md0 repair 1 2 3 > /dev/null # D D
+$dir/raid6check $md0 repair 8 2 5 > /dev/null # D P
+$dir/raid6check $md0 repair 15 4 6 > /dev/null # D Q
+$dir/raid6check $md0 repair 22 5 6 > /dev/null # P Q
+$dir/raid6check $md0 repair 3 4 0 > /dev/null # Q D
+$dir/raid6check $md0 repair 3 3 1 > /dev/null # P D
+$dir/raid6check $md0 repair 6 4 5 > /dev/null # D<D
+$dir/raid6check $md0 repair 13 5 4 > /dev/null # D>D
+blockdev --flushbufs $devs; sync
+echo 3 > /proc/sys/vm/drop_caches
+$dir/raid6check $md0 0 0 2>&1 | grep -qs "Error" && { echo errors detected; exit 2; }
+cmp -s -n $array_data_size_in_b $md0 /tmp/RandFile || { echo should not mess up correct stripe ; exit 2; }
+
+mdadm -S $md0
+udevadm settle
diff --git a/tests/20raid5journal b/tests/20raid5journal
new file mode 100644
index 0000000..f751ace
--- /dev/null
+++ b/tests/20raid5journal
@@ -0,0 +1,64 @@
+# check write journal of raid456
+
+# test --detail: pass only if the "mdadm -D $1" output contains "journal"
+test_detail_shows_journal() {
+ mdadm -D $1 | grep journal || {
+ echo >&2 "ERROR --detail does not show journal device!"; mdadm -D $1 ; exit 1; }
+}
+
+# test --examine: pass only if the "mdadm -E $1" output contains "Journal"
+test_examine_shows_journal() {
+ mdadm -E $1 | grep Journal || {
+ echo >&2 "ERROR --examine does not show Journal device!"; mdadm -E $1 ; exit 1; }
+}
+
+# test --create
+create_with_journal_and_stop() {
+ mdadm -CR $md0 -l5 -n4 $dev0 $dev1 $dev2 $dev3 --write-journal $dev4
+ check wait
+ tar cf - /etc > $md0
+ ./raid6check $md0 0 0 | grep 'Error detected' && exit 1
+ test_detail_shows_journal $md0
+ test_examine_shows_journal $dev4
+ mdadm -S $md0
+}
+
+# test --assemble
+test_assemble() {
+ create_with_journal_and_stop
+ if mdadm -A $md0 $dev0 $dev1 $dev2 $dev3
+ then
+ echo >&2 "ERROR should return 1 when journal is missing!"; cat /proc/mdstat ; exit 1;
+ fi
+ mdadm -S $md0
+
+ mdadm -A $md0 $dev0 $dev1 $dev2 $dev3 --force
+ check readonly
+ mdadm -S $md0
+}
+
+# test --incremental
+test_incremental() {
+ create_with_journal_and_stop
+ for d in $dev0 $dev1 $dev2 $dev3
+ do
+ mdadm -I $d
+ done
+ check inactive
+ mdadm -I $dev4
+ check raid5
+ mdadm -S $md0
+
+ # test --incremental with journal missing
+ for d in $dev0 $dev1 $dev2 $dev3
+ do
+ mdadm -I $d
+ done
+ mdadm -R $md0
+ check readonly
+ mdadm -S $md0
+}
+
+create_with_journal_and_stop
+test_assemble
+test_incremental
diff --git a/tests/21raid5cache b/tests/21raid5cache
new file mode 100644
index 0000000..0dd97bf
--- /dev/null
+++ b/tests/21raid5cache
@@ -0,0 +1,87 @@
+# check data integrity with raid5 write back cache
+
+# create a 4kB random file and 4 files each with a 1kB chunk of the random file:
+# randfile: ABCD randchunk[0-3]: A B C D
+#
+# then create another random 1kB chunk E, and a new random page with A, B, E, D:
+# randchunk4: E newrandfile: ABED
+create_random_data() {
+ dd if=/dev/urandom of=/tmp/randfile bs=4k count=1
+ for x in {0..3}
+ do
+ dd if=/tmp/randfile of=/tmp/randchunk$x bs=1k skip=$x count=1
+ done
+
+ dd if=/dev/urandom of=/tmp/randchunk4 bs=1k count=1
+
+ rm -f /tmp/newrandfile # -f: no error when the file is absent; it is rebuilt by appending below
+ for x in 0 1 4 3
+ do
+ cat /tmp/randchunk$x >> /tmp/newrandfile
+ done
+}
+
+# create a 10-disk array with a write journal; $1 is the level: 5 for raid5, 6 for raid6
+create_array() {
+ if [ $1 -lt 5 -o $1 -gt 6 ]
+ then
+ echo wrong array type $1
+ exit 2
+ fi
+
+ mdadm -CR $md0 -c4 -l$1 -n10 $dev0 $dev1 $dev2 $dev3 $dev4 $dev5 $dev6 $dev11 $dev8 $dev9 --write-journal $dev10
+ check wait
+ echo write-back > /sys/block/md0/md/journal_mode
+}
+
+restart_array_write_back() {
+ mdadm -S $md0
+ mdadm -A $md0 $dev0 $dev1 $dev2 $dev3 $dev4 $dev5 $dev6 $dev11 $dev8 $dev9 $dev10
+ echo write-back > /sys/block/md0/md/journal_mode
+}
+
+# compare the first page of md0 with file in $1
+cmp_first_page() {
+ cmp -n 4096 $1 $md0 || { echo cmp failed ; exit 2 ; }
+}
+
+# write 3 random 4k pages to md0 at page offsets 1..3, leaving the first page untouched
+write_three_pages() {
+ for x in {1..3}
+ do
+ dd if=/dev/urandom of=$md0 bs=4k seek=$x count=1
+ done
+}
+
+# run_test <array_type:5/6> <degraded_or_not:yes/no>
+run_test() {
+ create_random_data
+ create_array $1
+
+ if [ "$2" = yes ]
+ then
+ mdadm --fail $md0 $dev0
+ fi
+
+ dd if=/tmp/randfile of=$md0 bs=4k count=1 # first page := ABCD
+ restart_array_write_back
+ cmp_first_page /tmp/randfile
+ restart_array_write_back
+ write_three_pages
+ cmp_first_page /tmp/randfile
+
+
+ dd if=/tmp/randchunk4 of=$md0 bs=1k count=1 seek=2 # overwrite third 1k chunk: ABCD -> ABED
+ restart_array_write_back
+ cmp_first_page /tmp/newrandfile
+ restart_array_write_back
+ write_three_pages
+ cmp_first_page /tmp/newrandfile
+
+ mdadm -S $md0
+}
+
+run_test 5 no
+run_test 5 yes
+run_test 6 no
+run_test 6 yes
diff --git a/tests/ToTest b/tests/ToTest
new file mode 100644
index 0000000..b98e266
--- /dev/null
+++ b/tests/ToTest
@@ -0,0 +1,44 @@
+
+multipath!!
+
+add/remove/fail
+ raid1 DONE
+ raid5 DONE
+ raid6/10 needed??
+
+assemble
+ by devices DONE
+ by uuid DONE
+ by superminor DONE
+ by config file DONE
+
+ various --updates DONE (not sparc2.2 or summaries)
+
+stop
+ --scan
+
+readonly/readwrite
+
+bitmap
+ separate file
+ internal
+ filename in config file
+
+examine
+ --scan
+ --brief
+
+detail
+
+grow:
+ size
+ raid1/5/6 DONE
+ devices
+ raid1 add DONE
+ raid1 shrink DONE
+
+'--quiet' option, and remove ""
+'--name' option for v1, and configfile etc...
+
+faulty
+ errors in raid1/5/6
diff --git a/tests/env-ddf-template b/tests/env-ddf-template
new file mode 100644
index 0000000..90d7272
--- /dev/null
+++ b/tests/env-ddf-template
@@ -0,0 +1,113 @@
+sha1_sum() { # print the first 40 characters of sha1sum's output for file $1
+ sha1sum "$1" | cut -c 1-40
+}
+
+get_rootdev() { # print /dev/<name> for the device holding /, exit 1 if that is not a block device
+ local dev=$(stat -c %D /) # device number of / in hex
+ local maj=$(expr $dev : '\(..*\)..') # everything except the last two hex digits
+ local min=${dev#$maj} # the remaining (last two) hex digits
+ local bd=/dev/$(basename $(readlink /sys/dev/block/$((0x$maj)):$((0x$min)))) # sysfs link name -> /dev node
+ [ -b $bd ] || exit 1
+ echo $bd
+}
+
+get_sysdir() { # print /sys/class/block/<name>/md for md device $1
+ local mddev=$1
+ [ -L $mddev ] && mddev=$(readlink -f $mddev) # resolve a symlink to its target first
+ echo "/sys/class/block/$(basename $mddev)/md"
+}
+
+get_raiddisks() { # print one line per slot 0..raid_disks-1 of array $1: device path or MISSING
+ sysdir=$(get_sysdir "$1") # note: not declared local, so sysdir stays set after the call
+ for i in $(seq 0 $(($(cat $sysdir/raid_disks)-1))); do
+ if [ -d $sysdir/rd$i ]; then
+ readlink -f /dev/block/$(cat $sysdir/rd$i/block/dev)
+ else
+ echo MISSING
+ fi
+ done
+}
+
+get_present() { # print how many get_raiddisks lines for array $1 do not contain MISSING
+ get_raiddisks $1 | grep -vc MISSING
+}
+
+ddf_check() { # usage: ddf_check container | ddf_check member <dev> <disks> <level> <rd_size> <size> <offset> <chunk> <layout>; exits 1 on failure
+ udevadm settle
+ case $1 in
+ container )
+ grep -s "blocks super external:ddf" /proc/mdstat > /dev/null || {
+ echo >&2 "**Fatal** Correctly formed container not found"; cat /proc/mdstat; exit 1; }
+ ;;
+ member )
+ t_member=$2
+ t_num_disks=$3
+ t_level=$4
+ t_rd_size=$5
+ t_size=$6 # overwritten below for levels 0, 1, 4, 5, 6 and 10
+ t_offset=$7
+ t_chunk=$8
+ t_layout=$9 # only used as the divisor for level 10
+
+ if [ $t_chunk -ne 0 ]; then
+ t_rd_size=$((t_rd_size & ~(t_chunk - 1))) # clear the low bits (rounds down when t_chunk is a power of two)
+ fi
+ case $t_level in
+ 0) t_size=$((t_num_disks*$t_rd_size));;
+ 1) t_size=$t_rd_size;;
+ 4|5) t_size=$(((t_num_disks-1)*$t_rd_size));;
+ 6) t_size=$(((t_num_disks-2)*$t_rd_size));;
+ 10) t_size=$((t_num_disks*$t_rd_size/t_layout));;
+ esac
+
+ err=0 # number of mismatches found
+
+ eval `stat -L -c "let major=0x%t; let minor=0x%T;" $t_member`
+ sysfs=/sys/dev/block/${major}:${minor}
+ if [ ! -f ${sysfs}/md/array_state ]; then
+ echo "**Fatal**: Array member $t_member not found" >&2; cat /proc/mdstat >&2; exit 1
+ fi
+ _chunk=`cat ${sysfs}/md/chunk_size`
+ if [ $t_chunk -ne $((_chunk/1024)) ]; then
+ echo "**Error**: Chunk size mismatch - expected $t_chunk, actual $((_chunk/1024))" >&2
+ err=$((err + 1))
+ fi
+ for i in `seq 0 $((t_num_disks - 1))`; do
+ _offset=`cat ${sysfs}/md/rd${i}/offset`
+ if [ $t_offset -ne $((_offset / 2)) ]; then
+ echo "**Error**: Offset mismatch - expected $t_offset, actual $((_offset/2))" >&2
+ err=$((err + 1))
+ fi
+ _rd_size=`cat ${sysfs}/md/rd${i}/size`
+ if [ $t_rd_size -ne $_rd_size ]; then
+ echo "**Error**: Component size mismatch - expected $t_rd_size, actual $_rd_size" >&2
+ err=$((err + 1))
+ fi
+ done
+ _size=`cat ${sysfs}/md/array_size`
+ [ o$_size = odefault ] && _size=$(($(cat ${sysfs}/size)/2)) # "default": fall back to half the block device's size value
+ if [ $t_size -ne $_size ]; then
+ echo "**Error**: Array size mismatch - expected $t_size, actual $_size" >&2
+ err=$((err + 1))
+ fi
+ if [ $err -gt 0 ]; then
+ echo "$t_member failed check" >&2
+ cat /proc/mdstat >&2
+ mdadm -E /dev/loop8 >&2
+ exit 1
+ fi
+ ;;
+ * )
+ echo >&2 "**Error** unknown check $1"; exit 1;
+ esac
+}
+
+container=/dev/md/ddf0
+member0=/dev/md/vol0
+member1=/dev/md/vol1
+member2=/dev/md/vol2
+member3=/dev/md/vol3
+member4=/dev/md/vol4
+
+# We don't want systemd to start system mdmon; start our own
+export MDADM_NO_SYSTEMCTL=1
diff --git a/tests/env-imsm-template b/tests/env-imsm-template
new file mode 100644
index 0000000..d524771
--- /dev/null
+++ b/tests/env-imsm-template
@@ -0,0 +1,91 @@
+imsm_check() { # usage: imsm_check container | imsm_check member <dev> <disks> <level> <rd_size> <size> <offset> <chunk>; exits 1 on failure
+ udevadm settle
+ case $1 in
+ container )
+ grep -s "blocks super external:imsm" /proc/mdstat > /dev/null || {
+ echo >&2 "**Fatal** Correctly formed container not found"; cat /proc/mdstat; exit 1; }
+ ;;
+ member )
+ t_member=$2
+ t_num_disks=$3
+ t_level=$4
+ t_rd_size=$5
+ t_size=$6 # always recomputed below from t_rd_size and the data disk count
+ t_offset=$7
+ t_chunk=$8
+
+ t_rd_size=$((t_rd_size & ~(1024 - 1))) # round down to a multiple of 1024
+
+ if [ $t_level -eq 1 ]; then
+ t_chunk=64 # level 1 is always checked against a chunk of 64, whatever was passed
+ fi
+
+ t_num_data_disks=0 # stays 0 for any level not listed below
+
+ case $t_level in
+ 0)
+ t_num_data_disks=$t_num_disks
+ ;;
+ 1)
+ t_num_data_disks=1
+ ;;
+ 5)
+ t_num_data_disks=$((t_num_disks-1))
+ ;;
+ 10)
+ t_num_data_disks=$((t_num_disks/2))
+ ;;
+ esac
+
+ t_size=$((t_rd_size*t_num_data_disks))
+
+ err=0 # number of mismatches found
+
+ eval `stat -L -c "let major=0x%t; let minor=0x%T;" $t_member`
+ sysfs=/sys/dev/block/${major}:${minor}
+ if [ ! -f ${sysfs}/md/array_state ]; then
+ echo "**Fatal**: Array member $t_member not found" >&2; cat /proc/mdstat >&2; exit 1
+ fi
+ _chunk=`cat ${sysfs}/md/chunk_size`
+ if [ $t_chunk -ne $((_chunk/1024)) ]; then
+ echo "**Error**: Chunk size mismatch - expected $t_chunk, actual $(($_chunk/1024))" >&2
+ err=$((err + 1))
+ fi
+ for i in `seq 0 $((t_num_disks - 1))`; do
+ _offset=`cat ${sysfs}/md/rd${i}/offset`
+ if [ $t_offset -ne $((_offset / 2)) ]; then
+ echo "**Error**: Offset mismatch - expected $t_offset, actual $((_offset/2))" >&2
+ err=$((err + 1))
+ fi
+ _rd_size=`cat ${sysfs}/md/rd${i}/size`
+ if [ $t_rd_size -ne $_rd_size ]; then
+ echo "**Error**: Component size mismatch - expected $t_rd_size, actual $_rd_size" >&2
+ err=$((err + 1))
+ fi
+ done
+ _size=`cat ${sysfs}/md/array_size`
+ if [ $t_size -ne $_size ]; then
+ echo "**Error**: Array size mismatch - expected $t_size, actual $_size" >&2
+ err=$((err + 1))
+ fi
+ if [ $err -gt 0 ]; then
+ echo "$t_member failed check" >&2
+ cat /proc/mdstat >&2
+ mdadm -E /dev/loop0 >&2
+ exit 1
+ fi
+ ;;
+ * )
+ echo >&2 "**Error** unknown check $1"; exit 1;
+ esac
+}
+
+export IMSM_NO_PLATFORM=1
+export IMSM_DEVNAME_AS_SERIAL=1
+export IMSM_TEST_OROM=1
+container=/dev/md/container
+member0=/dev/md/vol0
+member1=/dev/md/vol1
+member2=/dev/md/vol2
+member3=/dev/md/vol3
+member4=/dev/md/vol4
diff --git a/tests/func.sh b/tests/func.sh
new file mode 100644
index 0000000..9710a53
--- /dev/null
+++ b/tests/func.sh
@@ -0,0 +1,344 @@
+#!/bin/bash
+
+# We test mdadm on loop-back block devices.
+# dir for storing files should be settable by command line maybe
+size=20000
+# super0, round down to multiple of 64 and subtract 64
+mdsize0=19904
+# super00 is nested, subtract 128
+mdsize00=19840
+# super1.0 round down to multiple of 2, subtract 8
+mdsize1=19992
+mdsize1a=19988
+mdsize12=19988
+# super1.2 for linear: round to multiple of 2, subtract 4
+mdsize1_l=19996
+mdsize2_l=19996
+# subtract another 4 for bitmaps
+mdsize1b=19988
+mdsize11=19992
+mdsize11a=19456
+mdsize12=19988
+
+# ddf needs bigger devices as 32Meg is reserved!
+ddfsize=65536
+
+# $1 is optional parameter, it shows why to save log
+save_log() {
+ status=$1
+ logfile="$status""$_basename".log
+
+ cat $targetdir/stderr >> $targetdir/log
+ cp $targetdir/log $logdir/$_basename.log
+ echo "## $HOSTNAME: saving dmesg." >> $logdir/$logfile
+ dmesg -c >> $logdir/$logfile
+ echo "## $HOSTNAME: saving proc mdstat." >> $logdir/$logfile
+ cat /proc/mdstat >> $logdir/$logfile
+ array=($(mdadm -Ds | cut -d' ' -f2))
+ [ "$1" == "fail" ] &&
+ echo "FAILED - see $logdir/$_basename.log and $logdir/$logfile for details"
+ if [ "$DEVTYPE" == 'lvm' ] # quoted so an empty DEVTYPE is a plain mismatch, not a '[' syntax error
+ then
+ # the lvm device type is not supported yet
+ echo
+ elif [ "$DEVTYPE" == 'loop' -o "$DEVTYPE" == 'disk' ]
+ then
+ if [ ! -z "$array" -a ${#array[@]} -ge 1 ]
+ then
+ echo "## $HOSTNAME: mdadm -D ${array[@]}" >> $logdir/$logfile
+ $mdadm -D ${array[@]} >> $logdir/$logfile
+ # stop here for linear arrays and external (external file, imsm...) bitmaps
+ cat /proc/mdstat | grep -q "linear\|external" && return 0
+ md_disks=($($mdadm -D -Y ${array[@]} | grep "/dev/" | cut -d'=' -f2))
+ cat /proc/mdstat | grep -q "bitmap"
+ if [ $? -eq 0 ]
+ then
+ echo "## $HOSTNAME: mdadm -X ${md_disks[@]}" >> $logdir/$logfile
+ $mdadm -X ${md_disks[@]} >> $logdir/$logfile
+ echo "## $HOSTNAME: mdadm -E ${md_disks[@]}" >> $logdir/$logfile
+ $mdadm -E ${md_disks[@]} >> $logdir/$logfile
+ fi
+ else
+ echo "## $HOSTNAME: no array assembled!" >> $logdir/$logfile
+ fi
+ fi
+}
+
+cleanup() { # stop all arrays, then release whatever do_setup created for $DEVTYPE
+ udevadm settle
+ $mdadm -Ssq 2> /dev/null
+ case $DEVTYPE in
+ loop )
+ for d in 0 1 2 3 4 5 6 7 8 9 10 11 12 13
+ do
+ losetup -d /dev/loop$d &> /dev/null
+ rm -f "${targetdir:-/var/tmp}/mdtest$d" # same directory do_setup creates the backing files in
+ done
+ rm -f /dev/disk/by-path/loop* # glob covers every link; once is enough
+ ;;
+ lvm )
+ for d in 0 1 2 3 4 5 6 7 8 9 10 11 12 13
+ do
+ eval "lvremove --quiet -f \$dev$d"
+ done
+ ;;
+ disk )
+ $mdadm --zero ${disks[@]} &> /dev/null
+ ;;
+ esac
+}
+
+do_clean()
+{
+ mdadm -Ss > /dev/null
+ mdadm --zero $devlist 2> /dev/null
+ dmesg -c > /dev/null
+}
+
+check_env() { # abort unless run as root with a built $mdadm, the needed tools and no RAID already listed by lsblk
+ user=$(id -un)
+ [ "X$user" != "Xroot" ] && {
+ echo "test: testing can only be done as 'root'."
+ exit 1
+ }
+ [ ! -x "$mdadm" ] && { # quoted: an empty $mdadm used to turn this into a one-argument test that always passed
+ echo "test: please run make everything before perform testing."
+ exit 1
+ }
+ cmds=(mdadm lsblk df udevadm losetup mkfs.ext3 fsck seq)
+ for cmd in "${cmds[@]}"
+ do
+ which "$cmd" > /dev/null || {
+ echo "$cmd command not found!"
+ exit 1
+ }
+ done
+ if lsblk -a | grep -iq raid
+ then
+ # do not run mdadm -Ss directly if there are RAIDs working.
+ echo "test: please run test suite without running RAIDs environment."
+ exit 1
+ fi
+ # Check whether to run multipath tests
+ modprobe multipath 2> /dev/null
+ grep -sq 'Personalities : .*multipath' /proc/mdstat &&
+ MULTIPATH="yes"
+}
+
+do_setup() { # prepare dev0..dev13, devlist and devlistN for the selected $DEVTYPE
+ trap cleanup 0 1 3 15
+ trap ctrl_c 2
+
+ check_env
+ [ -d $logdir ] || mkdir -p $logdir
+
+ devlist=
+ if [ "$DEVTYPE" == "loop" ]
+ then
+ # make sure there are no loop devices remaining.
+ # udev started things can sometimes prevent them being stopped
+ # immediately
+ while grep loop /proc/partitions > /dev/null 2>&1
+ do
+ $mdadm -Ssq
+ losetup -d /dev/loop[0-9]* 2> /dev/null
+ sleep 0.2
+ done
+ elif [ "$DEVTYPE" == "disk" ]
+ then
+ if [ ! -z "$disks" ]
+ then
+ for d in $(seq 0 $((${#disks[@]} - 1))) # valid indices are 0..N-1; seq includes its upper bound
+ do
+ eval "dev$d=${disks[$d]}"
+ eval devlist=\"\$devlist \$dev$d\"
+ eval devlist$d=\"\$devlist\"
+ done
+ $mdadm --zero ${disks[@]} &> /dev/null
+ else
+ echo "Forget to provide physical devices for disk mode."
+ exit 1
+ fi
+ fi
+ for d in 0 1 2 3 4 5 6 7 8 9 10 11 12 13
+ do
+ sz=$size
+ [ $d -gt 7 ] && sz=$ddfsize
+ case $DEVTYPE in
+ loop)
+ [ -f $targetdir/mdtest$d ] ||
+ dd if=/dev/zero of=$targetdir/mdtest$d count=$sz bs=1K > /dev/null 2>&1
+ # make sure udev doesn't touch
+ mdadm --zero $targetdir/mdtest$d 2> /dev/null
+ [ -b /dev/loop$d ] || mknod /dev/loop$d b 7 $d
+ if [ $d -eq 7 ]
+ then
+ losetup /dev/loop$d $targetdir/mdtest6 # for multipath use
+ else
+ losetup /dev/loop$d $targetdir/mdtest$d
+ fi
+ eval dev$d=/dev/loop$d
+ eval file$d=$targetdir/mdtest$d
+ ;;
+ lvm)
+ unset MULTIPATH
+ eval dev$d=/dev/mapper/${LVM_VOLGROUP}-mdtest$d
+ if ! lvcreate --quiet -L ${sz}K -n mdtest$d $LVM_VOLGROUP
+ then
+ trap '' 0 # make sure lvremove is not called
+ eval echo error creating \$dev$d
+ exit 129
+ fi
+ ;;
+ ram)
+ unset MULTIPATH
+ eval dev$d=/dev/ram$d
+ ;;
+ esac
+ eval devlist=\"\$devlist \$dev$d\"
+ eval devlist$d=\"\$devlist\"
+ #" <-- add this quote to un-confuse vim syntax highlighting
+ done
+ path0=$dev6
+ path1=$dev7
+ ulimit -c unlimited
+ [ -f /proc/mdstat ] || modprobe md_mod
+ echo 2000 > /proc/sys/dev/raid/speed_limit_max
+ echo 0 > /sys/module/md_mod/parameters/start_ro
+}
+
+# check various things
+check() {
+ case $1 in
+ opposite_result )
+ if [ $? -eq 0 ]; then
+ die "This command shouldn't run successfully"
+ fi
+ ;;
+ spares )
+ spares=$(tr '] ' '\012\012' < /proc/mdstat | grep -c '(S)' || exit 0)
+ [ $spares -ne $2 ] &&
+ die "expected $2 spares, found $spares"
+ ;;
+ raid* | linear )
+ grep -sq "active $1 " /proc/mdstat ||
+ die "active $1 not found"
+ ;;
+ algorithm )
+ grep -sq " algorithm $2 " /proc/mdstat ||
+ die "algorithm $2 not found"
+ ;;
+ resync | recovery | reshape )
+ cnt=5
+ while ! grep -sq $1 /proc/mdstat
+ do
+ if [ $cnt -gt 0 ] && grep -v idle /sys/block/md*/md/sync_action > /dev/null
+ then # Something isn't idle - wait a bit
+ sleep 0.5
+ cnt=$((cnt - 1))
+ else
+ die "no $1 happening"
+ fi
+ done
+ ;;
+ nosync )
+ sleep 0.5
+ # Since 4.2 we delay the close of recovery until there has been a chance for
+ # spares to be activated. That means that a recovery that finds nothing
+ # to do can still take a little longer than expected.
+ # add an extra check: if sync_completed shows the end is reached, assume
+ # there is no recovery.
+ if grep -sq -E '(resync|recovery|reshape) *=' /proc/mdstat
+ then
+ incomplete=`grep / /sys/block/md*/md/sync_completed 2> /dev/null | sed '/^ *\([0-9]*\) \/ \1/d'`
+ [ -n "$incomplete" ] &&
+ die "resync or recovery is happening!"
+ fi
+ ;;
+ wait )
+ p=`cat /proc/sys/dev/raid/speed_limit_max`
+ echo 2000000 > /proc/sys/dev/raid/speed_limit_max
+ sleep 0.1
+ while grep -Eq '(resync|recovery|reshape|check|repair) *=' /proc/mdstat ||
+ grep -v idle > /dev/null /sys/block/md*/md/sync_action
+ do
+ sleep 0.5
+ done
+ echo $p > /proc/sys/dev/raid/speed_limit_max
+ ;;
+ state )
+ grep -sq "blocks.*\[$2\]\$" /proc/mdstat ||
+ die "state $2 not found!"
+ sleep 0.5
+ ;;
+ bitmap )
+ grep -sq bitmap /proc/mdstat ||
+ die "no bitmap"
+ ;;
+ nobitmap )
+ grep -sq "bitmap" /proc/mdstat &&
+ die "bitmap present"
+ ;;
+ readonly )
+ grep -sq "read-only" /proc/mdstat ||
+ die "array is not read-only!"
+ ;;
+ inactive )
+ grep -sq "inactive" /proc/mdstat ||
+ die "array is not inactive!"
+ ;;
+ # It only can be used when there is only one raid
+ chunk )
+ chunk_size=`awk -F',' '/chunk/{print $2}' /proc/mdstat | awk -F'[a-z]' '{print $1}'`
+ if [ "$chunk_size" -ne "$2" ] ; then
+ die "chunksize should be $2, but it's $chunk_size"
+ fi
+ ;;
+ * )
+ die "unknown check $1"
+ ;;
+ esac
+}
+
+no_errors() {
+ if [ -s $targetdir/stderr ]
+ then
+ echo Bad errors from mdadm:
+ cat $targetdir/stderr
+ exit 2
+ fi
+}
+
+# basic device test
+testdev() { # args: dev cnt dvsize chunk [skip-mkfs]; dies unless dev's size is within [4/5*rasize, rasize] sectors
+ [ -b "$1" ] || die "$1 isn't a block device." # quoted: an empty $1 used to pass this test
+ [ "$DEVTYPE" == "disk" ] && return 0
+ udevadm settle
+ dev=$1
+ cnt=$2
+ dvsize=$3
+ chunk=$4
+ if [ -z "$5" ]
+ then
+ mkfs.ext3 -F -j $dev > /dev/null 2>&1 && fsck -fn $dev >&2
+ fi
+ dsize=$((dvsize/chunk))
+ dsize=$((dsize*chunk))
+ rasize=$((dsize*2*cnt))
+ # rasize is in sectors
+ if [ -n "$DEV_ROUND_K" ]
+ then
+ rasize=$((rasize/DEV_ROUND_K/2))
+ rasize=$((rasize*DEV_ROUND_K*2))
+ fi
+ [ `/sbin/blockdev --getsize $dev` -eq 0 ] && sleep 2
+ _sz=`/sbin/blockdev --getsize $dev`
+ [ $rasize -lt $_sz -o $((rasize*4/5)) -gt $_sz ] &&
+ die "size is wrong for $dev: $cnt * $dvsize (chunk=$chunk) = $rasize, not $_sz"
+ return 0
+}
+
+rotest() {
+ dev=$1
+ fsck -fn $dev >&2
+}
diff --git a/tests/imsm-grow-template b/tests/imsm-grow-template
new file mode 100644
index 0000000..1a8676e
--- /dev/null
+++ b/tests/imsm-grow-template
@@ -0,0 +1,119 @@
+
+# 0 - POSITIVE test, otherwise NEGATIVE test
+negative_test=$1
+
+# 0 - On-line Capacity Expansion test, otherwise LEVEL migration or CHUNK size migration test
+migration_test=$2
+
+function grow_member() {
+ local member=$1
+ local disks=$2
+ local comps=$3
+ local level=$4
+ local size=$5
+ local offset=$6
+ local chunk=$7
+ local old_chunk=$8
+ local array_size=$((comps * size))
+
+ rm -f $backup_imsm
+ if [ $chunk -eq $old_chunk ]; then
+ ( set -ex; mdadm --grow $member --level=$level )
+ else
+ ( set -ex; mdadm --grow $member --chunk=$chunk )
+ fi
+ local status=$?
+ if [ $negative_test -ne 0 ]; then
+ if [ $status -eq 0 ]; then
+ echo >&2 "**Error**: $member: --grow should failed, but it completed successfuly"
+ exit 1
+ fi
+ return
+ fi
+ check wait
+ sleep 5
+ imsm_check member $member $disks $level $size $array_size $offset $chunk
+ testdev $member $comps $size $chunk
+}
+
+# Create container
+mdadm --create --run $container --auto=md --metadata=imsm --raid-disks=$num_disks $device_list
+check wait
+imsm_check container $num_disks
+
+# Create first volume inside the container
+if [[ ! -z $vol0_chunk ]]; then
+ mdadm --create --run $member0 --auto=md --level=$vol0_level --size=$vol0_comp_size --chunk=$vol0_chunk --raid-disks=$num_disks $device_list
+else
+ mdadm --create --run $member0 --auto=md --level=$vol0_level --size=$vol0_comp_size --raid-disks=$num_disks $device_list
+fi
+check wait
+
+# Create second volume inside the container (if defined)
+if [ ! -z $vol1_level ]; then
+ if [ ! -z $vol1_chunk ]; then
+ mdadm --create --run $member1 --auto=md --level=$vol1_level --size=$vol1_comp_size --chunk=$vol1_chunk --raid-disks=$num_disks $device_list
+ else
+ mdadm --create --run $member1 --auto=md --level=$vol1_level --size=$vol1_comp_size --raid-disks=$num_disks $device_list
+ fi
+ check wait
+fi
+
+# Wait for any RESYNC to complete
+check wait
+
+# Test first volume
+imsm_check member $member0 $num_disks $vol0_level $vol0_comp_size $((vol0_comp_size * vol0_num_comps)) $vol0_offset $vol0_chunk
+testdev $member0 $vol0_num_comps $vol0_comp_size $vol0_chunk
+
+# Test second volume (if defined)
+if [ ! -z $vol1_level ]; then
+ imsm_check member $member1 $num_disks $vol1_level $vol1_comp_size $((vol1_comp_size * vol1_num_comps)) $vol1_offset $vol1_chunk
+ testdev $member1 $vol1_num_comps $vol1_comp_size $vol1_chunk
+fi
+
+# Add extra disks to container if operation requires spares in container.
+for i in $spare_list
+do
+ mdadm --add $container $i
+ check wait
+ num_disks=$((num_disks + 1))
+done
+
+imsm_check container $num_disks
+num_disks=$((num_disks + add_to_num_disks))
+backup_imsm=/tmp/backup_imsm
+
+# Grow each member or a container depending on the type of an operation
+if [ $migration_test -ne 0 ]; then
+ if [ -z $new_num_disks ]; then
+ new_num_disks=$num_disks
+ fi
+ grow_member $member0 $new_num_disks $vol0_new_num_comps $vol0_new_level $vol0_comp_size $vol0_offset $vol0_new_chunk $vol0_chunk
+ if [[ $vol1_new_chunk -ne 0 ]] ; then
+ grow_member $member1 $new_num_disks $vol1_new_num_comps $vol1_new_level $vol1_comp_size $vol1_offset $vol1_new_chunk $vol1_chunk
+ fi
+else
+ rm -f $backup_imsm
+ ( set -x; mdadm --grow $container --raid-disks=$num_disks )
+ grow_status=$?
+ if [ $negative_test -ne 0 ]; then
+ if [ $grow_status -eq 0 ]; then
+ echo >&2 "**Error**: $container: --grow should failed, but it completed successfuly"
+ exit 1
+ fi
+ else
+ sleep 5
+ check wait
+ sleep 5
+ check wait
+ imsm_check member $member0 $num_disks $vol0_level $vol0_comp_size $((vol0_comp_size * vol0_new_num_comps)) $vol0_offset $vol0_chunk
+ testdev $member0 $vol0_new_num_comps $vol0_comp_size $vol0_chunk
+ if [ $vol1_new_num_comps -ne 0 ]; then
+ imsm_check member $member1 $num_disks $vol1_level $vol1_comp_size $((vol1_comp_size * vol1_new_num_comps)) $vol1_offset $vol1_chunk
+ testdev $member1 $vol1_new_num_comps $vol1_comp_size $vol1_chunk
+ fi
+ fi
+fi
+
+exit 0
diff --git a/tests/utils b/tests/utils
new file mode 100644
index 0000000..3acebd7
--- /dev/null
+++ b/tests/utils
@@ -0,0 +1,191 @@
+# set of functions used to test policy framework with assemble, incremental and Monitor
+
+set +e
+#create links to be able to use domains
+for d in 0 1 2 3 4 5 6 7 8 9 10 11 12
+do
+ eval ln -s \$dev$d /dev/disk/by-path/loop$d
+ eval d$d="loop$d"
+ eval mdadm --zero-superblock \$dev$d
+done
+
+devices="/dev/loop[0-9] /dev/loop10 /dev/loop11 /dev/loop12"
+
+# on failure print out few things before exit
+# uses testdsc and platform global variables
+err(){
+ echo >&2 "ERROR: $*"
+ cat "$config" >&2 || true # quoted: with $config unset a bare 'cat' would block reading stdin
+ cat /proc/mdstat >&2
+ [ -z "$testdsc" ] || { echo >&2 $platform: $testdsc "- failed"; }
+ ps -e | grep mdadm >&2 || true
+ if [ "$listfailed" == "yes" ]; then # record the failure and carry on
+ [ "$verbose" != "yes" ] || echo ---FAILED---
+ flist="$flist \n $platform $testdsc"
+ failed=1
+ else
+ exit 1
+ fi
+}
+
+# set test description
+dsc(){
+ failed=0
+ testdsc="$*"
+ [ "$verbose" != "yes" ] || echo $testdsc
+}
+
+killmonitor(){
+ [ -z "$monitorpid" ] || { kill -9 $monitorpid; unset monitorpid; }
+}
+
+tidyup(){
+ killmonitor
+ mdadm -Ss || true
+ mdadm -Ss
+ mdadm --zero-superblock $devices || true
+ udevadm settle
+ rm -f $config
+}
+
+trap tidyup 0 1 2 3 15
+
+# create a RAID 1 array or container and subarray(s) on 2 disks
+# if platform not specified imsm is used
+# if subsize is given, first subarray is created with given size and second one on remaining space
+ccv(){
+ # mddevno used to name created array
+ local mddevno="$1"
+ # numbers of devices to be used in array
+ local devno1="$2"
+ local devno2="$3"
+ local platform="$4"
+ local subsize="$5"
+ local onearray="$6"
+ [ -n "$platform" ] || platform="imsm"
+ if [ "$platform" == "imsm" ] || [ "$platform" == "ddf" ]; then
+ eval mdadm -CR /dev/md/con$mddevno -e $platform -n 2 \$dev$devno1 \$dev$devno2
+ udevadm settle
+ [ -z "$subsize" ] || eval mdadm -CR sub$mddevno"_" -l 1 -n 2 /dev/md/con$mddevno -z $subsize
+ [ -n "$onearray" ] || eval mdadm -CR sub$mddevno -l 1 -n 2 /dev/md/con$mddevno
+ else
+ [ -z "$subsize" ] || sizepar="-z $subsize"
+ eval mdadm -CR arr$mddevno -e $platform -l 1 -n 2 \$dev$devno1 \$dev$devno2 $sizepar
+ unset sizepar
+ fi
+}
+
+# get container and subarray using given device from mdstat
+# sets global variables c and v
+getarray(){
+ local devname=`basename $1`
+ local platformtype=`grep -A 1 $devname /proc/mdstat | awk '/active/ {getline; print $4 }' | awk -F ":" 'END {print $1}'`
+ c=`grep "inactive.*$devname" /proc/mdstat | awk -F " " '{print $1}'`
+ v=`grep " active.*$devname" /proc/mdstat | awk -F " " '{print $1}'`
+ [ "$platformtype" == "external" ] || c=$v
+}
+
+# check if given device belongs to any container and subarray
+# if $2 given then only container checked
+chkarray(){
+ local devname="$1"
+ local subcheck="$2"
+ getarray $devname
+ [ -n "$c" ] || err "$devname not in any container"
+ [ -n "$subcheck" ] || [ -n "$v" ] || err " $devname not in subarray"
+}
+
+# test if two devices in the same container/subarray
+# $1 $2 - devices
+# $3 don't check subarrays, only containers
+tst(){
+ local device1=`basename $1`
+ local device2=`basename $2`
+ local subcheck="$3"
+ chkarray $device1 $subcheck
+ local x="$c"
+ local y="$v"
+ chkarray $device2 $subcheck
+ [ "$c" == "$x" ] || err "$device1 and $device2 not in the same container"
+ [ -n "$subcheck" ] || [ "$v" == "$y" ] || err "$device1 and $device2 not in the same subarray"
+}
+
+# same as tst, just use numbers of devices instead of names as parameters
+dtst(){
+ local devno1="$1"
+ local devno2="$2"
+ local subcheck="$3"
+ eval tst \$dev$devno1 \$dev$devno2 $subcheck
+}
+
+# create containers/subarrays, check if created properly,
+# set global variables c$mddevno v$mddevno, usually c0=md127, v0=md126 , etc.
+setupdevs(){
+ local mddevno="$1"
+ local devno1="$2"
+ local devno2="$3"
+ local p="$4"
+ local subsize="$5"
+ local onearray="$6"
+ [ -n "$p" ] || p=$platform
+ ccv $mddevno $devno1 $devno2 $p $subsize $onearray
+ dtst $devno1 $devno2
+ eval c$mddevno=\"$c\"
+ eval v$mddevno=\"$v\"
+}
+
+# check if given spare in container
+# usage: chkspare container spare [n] (n if spare shouldn't be in container)
+chkspare(){
+ local container=`basename $1`
+ local spare=$2
+ local expected=$3
+ getarray $spare
+ [ -n "$expected" ] || expected="y"
+ if [ "$expected" == "y" ]; then
+ [ "$c" == "$container" ] || err "$spare not in container $container"
+ else
+ [ "$c" != "$container" ] || err "$spare in container $container"
+ fi
+}
+
+#check if spare was moved from one container to another
+# args: from_container to_container spare [yn]
+# n when spare should remain in original container
+chksparemoved(){
+ sleep $sleeptime
+ from_container="$1"
+ to_container="$2"
+ spare="$3"
+ expected="$4"
+ [ -n "$expected" ] || expected="y"
+ notexpected="n"; [ "$expected" == "y" ] || notexpected="y"
+ chkspare $from_container $spare $notexpected
+ [ $failed -eq 1 ] || chkspare $to_container $spare $expected
+}
+
+
+# for domains defined through policy
+createconfig(){
+if [ "$1" != "a" ]; then
+{
+ domain=$1
+ metadata=$2
+ action=$3
+ while [ -n "$4" ]; do
+ echo="policy domain=$domain"
+ [ "$metadata" == "noplatform" ] || echo="$echo metadata=$metadata"
+ echo="$echo path=loop$4"
+ echo="$echo action=$action"
+ echo "$echo"
+ shift
+ done
+} >> $config
+else
+{
+ echo "DEVICES $devlist /dev/md1*"
+ mdadm -Ebs
+} > $config
+fi
+#[ "$verbose" != "yes" ] || cat $config | grep policy || true
+}
diff --git a/udev-md-clustered-confirm-device.rules b/udev-md-clustered-confirm-device.rules
new file mode 100644
index 0000000..3e5381e
--- /dev/null
+++ b/udev-md-clustered-confirm-device.rules
@@ -0,0 +1,21 @@
+# do not edit this file, it will be overwritten on update
+
+SUBSYSTEM!="block", GOTO="clustermd_end"
+
+# handle md arrays
+KERNEL!="md*", GOTO="clustermd_end"
+ENV{DEVTYPE}!="disk", GOTO="clustermd_end"
+ACTION!="change", GOTO="clustermd_end"
+ENV{EVENT}!="ADD_DEVICE", GOTO="clustermd_end"
+ENV{DEVICE_UUID}!="?*", GOTO="clustermd_end"
+ENV{RAID_DISK}!="?*", GOTO="clustermd_end"
+
+# Based on the received UUID, node confirms the device if
+# it is found by blkid, otherwise the node reports it is
+# missing.
+PROGRAM="BINDIR/blkid -o device -t UUID_SUB=$env{DEVICE_UUID}", ENV{.md.newdevice} = "$result"
+
+ENV{.md.newdevice}!="", RUN+="BINDIR/mdadm --manage $env{DEVNAME} --cluster-confirm $env{RAID_DISK}:$env{.md.newdevice}"
+ENV{.md.newdevice}=="", RUN+="BINDIR/mdadm --manage $env{DEVNAME} --cluster-confirm $env{RAID_DISK}:missing"
+
+LABEL="clustermd_end"
diff --git a/udev-md-raid-arrays.rules b/udev-md-raid-arrays.rules
new file mode 100644
index 0000000..13c9076
--- /dev/null
+++ b/udev-md-raid-arrays.rules
@@ -0,0 +1,44 @@
+# do not edit this file, it will be overwritten on update
+
+SUBSYSTEM!="block", GOTO="md_end"
+
+# handle md arrays
+ACTION!="add|change", GOTO="md_end"
+KERNEL!="md*", GOTO="md_end"
+
+# partitions have no md/{array_state,metadata_version}, but should not
+# for that reason be ignored.
+ENV{DEVTYPE}=="partition", GOTO="md_ignore_state"
+
+# container devices have a metadata version of e.g. 'external:ddf' and
+# never leave state 'inactive'
+ATTR{md/metadata_version}=="external:[A-Za-z]*", ATTR{md/array_state}=="inactive", GOTO="md_ignore_state"
+TEST!="md/array_state", ENV{SYSTEMD_READY}="0", GOTO="md_end"
+ATTR{md/array_state}=="clear*|inactive", ENV{SYSTEMD_READY}="0", GOTO="md_end"
+ATTR{md/sync_action}=="reshape", ENV{RESHAPE_ACTIVE}="yes"
+LABEL="md_ignore_state"
+
+IMPORT{program}="BINDIR/mdadm --detail --no-devices --export $devnode"
+ENV{DEVTYPE}=="disk", ENV{MD_NAME}=="?*", SYMLINK+="disk/by-id/md-name-$env{MD_NAME}", OPTIONS+="string_escape=replace"
+ENV{DEVTYPE}=="disk", ENV{MD_UUID}=="?*", SYMLINK+="disk/by-id/md-uuid-$env{MD_UUID}"
+ENV{DEVTYPE}=="disk", ENV{MD_DEVNAME}=="?*", SYMLINK+="md/$env{MD_DEVNAME}"
+ENV{DEVTYPE}=="partition", ENV{MD_NAME}=="?*", SYMLINK+="disk/by-id/md-name-$env{MD_NAME}-part%n", OPTIONS+="string_escape=replace"
+ENV{DEVTYPE}=="partition", ENV{MD_UUID}=="?*", SYMLINK+="disk/by-id/md-uuid-$env{MD_UUID}-part%n"
+ENV{DEVTYPE}=="partition", ENV{MD_DEVNAME}=="*[^0-9]", SYMLINK+="md/$env{MD_DEVNAME}%n"
+ENV{DEVTYPE}=="partition", ENV{MD_DEVNAME}=="*[0-9]", SYMLINK+="md/$env{MD_DEVNAME}p%n"
+
+IMPORT{builtin}="blkid"
+OPTIONS+="link_priority=100"
+OPTIONS+="watch"
+ENV{ID_FS_USAGE}=="filesystem|other|crypto", ENV{ID_FS_UUID_ENC}=="?*", SYMLINK+="disk/by-uuid/$env{ID_FS_UUID_ENC}"
+ENV{ID_FS_USAGE}=="filesystem|other", ENV{ID_PART_ENTRY_UUID}=="?*", SYMLINK+="disk/by-partuuid/$env{ID_PART_ENTRY_UUID}"
+ENV{ID_FS_USAGE}=="filesystem|other", ENV{ID_FS_LABEL_ENC}=="?*", SYMLINK+="disk/by-label/$env{ID_FS_LABEL_ENC}"
+
+ENV{MD_LEVEL}=="raid[1-9]*", ENV{SYSTEMD_WANTS}+="mdmonitor.service"
+
+# Tell systemd to run mdmon for our container, if we need it.
+ENV{MD_LEVEL}=="raid[1-9]*", ENV{MD_CONTAINER}=="?*", PROGRAM="/usr/bin/readlink $env{MD_CONTAINER}", ENV{MD_MON_THIS}="%c"
+ENV{MD_MON_THIS}=="?*", PROGRAM="/usr/bin/basename $env{MD_MON_THIS}", ENV{SYSTEMD_WANTS}+="mdmon@%c.service"
+ENV{RESHAPE_ACTIVE}=="yes", PROGRAM="/usr/bin/basename $env{MD_MON_THIS}", ENV{SYSTEMD_WANTS}+="mdadm-grow-continue@%c.service"
+
+LABEL="md_end"
diff --git a/udev-md-raid-assembly.rules b/udev-md-raid-assembly.rules
new file mode 100644
index 0000000..d668cdd
--- /dev/null
+++ b/udev-md-raid-assembly.rules
@@ -0,0 +1,38 @@
+# do not edit this file, it will be overwritten on update
+
+# Don't process any events if anaconda is running as anaconda brings up
+# raid devices manually
+ENV{ANACONDA}=="?*", GOTO="md_inc_end"
+# assemble md arrays
+
+SUBSYSTEM!="block", GOTO="md_inc_end"
+
+# skip non-initialized devices
+ENV{SYSTEMD_READY}=="0", GOTO="md_inc_end"
+
+# handle potential components of arrays (the ones supported by md)
+ENV{ID_FS_TYPE}=="linux_raid_member", GOTO="md_inc"
+
+# "noiswmd" on kernel command line stops mdadm from handling
+# "isw" (aka IMSM - Intel RAID).
+# "nodmraid" on kernel command line stops mdadm from handling
+# "isw" or "ddf".
+IMPORT{cmdline}="noiswmd"
+IMPORT{cmdline}="nodmraid"
+
+ENV{nodmraid}=="?*", GOTO="md_inc_end"
+ENV{ID_FS_TYPE}=="ddf_raid_member", GOTO="md_inc"
+ENV{noiswmd}=="?*", GOTO="md_inc_end"
+ENV{ID_FS_TYPE}=="isw_raid_member", ACTION!="change", GOTO="md_inc"
+GOTO="md_inc_end"
+
+LABEL="md_inc"
+
+# remember you can limit what gets auto/incrementally assembled by
+# mdadm.conf(5)'s 'AUTO' and selectively whitelist using 'ARRAY'
+ACTION=="add|change", IMPORT{program}="BINDIR/mdadm --incremental --export $devnode --offroot $env{DEVLINKS}"
+ACTION=="add|change", ENV{MD_STARTED}=="*unsafe*", ENV{MD_FOREIGN}=="no", ENV{SYSTEMD_WANTS}+="mdadm-last-resort@$env{MD_DEVICE}.timer"
+ACTION=="remove", ENV{ID_PATH}=="?*", RUN+="BINDIR/mdadm -If $name --path $env{ID_PATH}"
+ACTION=="remove", ENV{ID_PATH}!="?*", RUN+="BINDIR/mdadm -If $name"
+
+LABEL="md_inc_end"
diff --git a/udev-md-raid-creating.rules b/udev-md-raid-creating.rules
new file mode 100644
index 0000000..9bef8d1
--- /dev/null
+++ b/udev-md-raid-creating.rules
@@ -0,0 +1,7 @@
+# do not edit this file, it will be overwritten on update
+# While mdadm is creating an array, it creates a file
+# /run/mdadm/creating-mdXXX. If that file exists, then
+# the array is not "ready" and we should make sure the
+# content is ignored.
+
+KERNEL=="md*", TEST=="/run/mdadm/creating-$kernel", ENV{SYSTEMD_READY}="0"
diff --git a/udev-md-raid-safe-timeouts.rules b/udev-md-raid-safe-timeouts.rules
new file mode 100644
index 0000000..12bdcaa
--- /dev/null
+++ b/udev-md-raid-safe-timeouts.rules
@@ -0,0 +1,61 @@
+# Copyright (C) 2017 by Jonathan G. Underwood
+# This file is part of mdraid-safe-timeouts.
+#
+# mdraid-safe-timeouts is free software: you can redistribute it
+# and/or modify it under the terms of the GNU General Public License
+# as published by the Free Software Foundation, either version 3 of
+# the License, or (at your option) any later version.
+#
+# mdraid-safe-timeouts is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with mdraid-safe-timeouts. If not, see
+# <https://www.gnu.org/licenses/>.
+
+# This file causes block devices with Linux RAID (mdadm) signatures to
+# attempt to set safe timeouts for the drives involved
+# See udev(8) for syntax
+
+# Don't process any events if anaconda is running as anaconda brings up
+# raid devices manually
+ENV{ANACONDA}=="?*", GOTO="md_timeouts_end"
+
+SUBSYSTEM!="block|machinecheck", GOTO="md_timeouts_end"
+
+# "noiswmd" on kernel command line stops mdadm from handling
+# "isw" (aka IMSM - Intel RAID).
+# "nodmraid" on kernel command line stops mdadm from handling
+# "isw" or "ddf".
+IMPORT{cmdline}="nodmraid"
+ENV{nodmraid}=="?*", GOTO="md_timeouts_end"
+IMPORT{cmdline}="noiswmd"
+ENV{noiswmd}=="?*", GOTO="md_timeouts_end"
+
+# Set controller timeout for parent disk of each partition if the
+# partition is a mdraid partition of higher than raid 0, and the disk
+# doesn't have scterc turned on (i.e. if it's disabled or the disk
+# doesn't support it). We determine if the disk has SCTERC turned on
+# by examining the output of smartctl and seeing if it contains the
+# word "seconds". If the word "seconds" is found we take this to imply
+# SCTERC is turned on, and take no action. Otherwise we set the drive
+# controller timeout to 180 seconds. It would be better to check the
+# exit status code of smartctl rather than grepping for "seconds", but
+# it's not clear what that will be in the three cases (supported and
+# turned on, supported but disabled, not supported).
+
+ENV{DEVTYPE}!="partition", GOTO="md_timeouts_end"
+
+IMPORT{program}="/sbin/mdadm --examine --export $devnode"
+
+ACTION=="add|change", \
+ ENV{ID_FS_TYPE}=="linux_raid_member", \
+ ENV{MD_LEVEL}=="raid[1-9]*", \
+ TEST=="/sys/block/$parent/device/timeout", \
+ TEST=="/usr/sbin/smartctl", \
+ PROGRAM!="/bin/sh -c '/usr/sbin/smartctl -l scterc /dev/$parent | grep -q seconds && exit 0 || exit 1'", \
+ RUN+="/bin/sh -c 'echo 180 > /sys/block/$parent/device/timeout && /usr/bin/logger timeout for /dev/$parent set to 180 secs'"
+
+LABEL="md_timeouts_end"
diff --git a/util.c b/util.c
new file mode 100644
index 0000000..3d05d07
--- /dev/null
+++ b/util.c
@@ -0,0 +1,2378 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2001-2013 Neil Brown <neilb@suse.de>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email: <neilb@suse.de>
+ */
+
+#include "mdadm.h"
+#include "md_p.h"
+#include <sys/socket.h>
+#include <sys/utsname.h>
+#include <sys/wait.h>
+#include <sys/un.h>
+#include <sys/resource.h>
+#include <sys/vfs.h>
+#include <sys/mman.h>
+#include <linux/magic.h>
+#include <poll.h>
+#include <ctype.h>
+#include <dirent.h>
+#include <signal.h>
+#include <dlfcn.h>
+
+
+/*
+ * following taken from linux/blkpg.h because they aren't
+ * anywhere else and it isn't safe to #include linux/ * stuff.
+ */
+
+#define BLKPG _IO(0x12,105)
+
+/* The argument structure handed to the BLKPG ioctl */
+struct blkpg_ioctl_arg {
+ int op; /* subfunction: BLKPG_ADD_PARTITION or BLKPG_DEL_PARTITION */
+ int flags; /* this file always passes 0 */
+ int datalen; /* size in bytes of the object 'data' points at */
+ void *data; /* this file points it at a struct blkpg_partition */
+};
+
+/* The subfunctions (for the op field) */
+#define BLKPG_ADD_PARTITION 1
+#define BLKPG_DEL_PARTITION 2
+
+/* Sizes of name fields. Unused at present. */
+#define BLKPG_DEVNAMELTH 64
+#define BLKPG_VOLNAMELTH 64
+
+/* The data structure for ADD_PARTITION and DEL_PARTITION */
+struct blkpg_partition {
+ long long start; /* starting offset in bytes */
+ long long length; /* length in bytes */
+ int pno; /* partition number */
+ char devname[BLKPG_DEVNAMELTH]; /* partition name, like sda5 or c0d1p2,
+ to be used in kernel messages */
+ char volname[BLKPG_VOLNAMELTH]; /* volume label */
+};
+
+#include "part.h"
+
+/* Force a compilation error if condition is true */
+#define BUILD_BUG_ON(condition) ((void)BUILD_BUG_ON_ZERO(condition))
+
+/* Force a compilation error if condition is true, but also produce a
+ result (of value 0 and type size_t), so the expression can be used
+ e.g. in a structure initializer (or where-ever else comma expressions
+ aren't permitted). */
+#define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); }))
+
+static int is_dlm_hooks_ready = 0;
+
+/* Return 1 when is_dlm_hooks_ready is non-zero, otherwise 0. */
+int dlm_funs_ready(void)
+{
+	if (is_dlm_hooks_ready)
+		return 1;
+	return 0;
+}
+
+static struct dlm_hooks *dlm_hooks = NULL;
+struct dlm_lock_resource *dlm_lock_res = NULL;
+static int ast_called = 0;
+
+/* State kept for the single cluster lock taken by cluster_get_dlmlock() */
+struct dlm_lock_resource {
+ dlm_lshandle_t *ls; /* lockspace handle from open_lockspace()/create_lockspace() */
+ struct dlm_lksb lksb; /* status block passed to ls_lock(); sb_status and sb_lkid are read back */
+};
+
+/*
+ * Using poll(2) to wait for and dispatch ASTs.
+ *
+ * Polls the lockspace fd (zero timeout) and runs the dispatch hook until
+ * the AST callback has set ast_called, then clears the flag again.
+ * Returns 0 once the AST has been seen, -1 if poll() fails.
+ */
+static int poll_for_ast(dlm_lshandle_t ls)
+{
+	struct pollfd pfd;
+
+	pfd.fd = dlm_hooks->ls_get_fd(ls);
+	pfd.events = POLLIN;
+
+	for (;;) {
+		if (ast_called)
+			break;
+		if (poll(&pfd, 1, 0) < 0) {
+			perror("poll");
+			return -1;
+		}
+		dlm_hooks->dispatch(dlm_hooks->ls_get_fd(ls));
+	}
+
+	ast_called = 0;
+	return 0;
+}
+
+/* AST callback handed to ls_lock(); it only records that it has run. */
+static void dlm_ast(void *arg)
+{
+	(void)arg;	/* not used */
+	ast_called = 1;
+}
+
+static char *cluster_name = NULL;
+/*
+ * Create (or open) the lockspace named after the cluster and take the
+ * "bitmap<cluster_name>" lock in PW mode.
+ *
+ * Returns 1 when the lock has been granted.  On failure returns a value
+ * other than 1: -1, -ENOMEM when the lockspace cannot be created, or the
+ * non-zero result of ls_lock().
+ */
+int cluster_get_dlmlock(void)
+{
+	int ret = -1;
+	char str[64];
+	int flags = LKF_NOQUEUE;
+	int retry_count = 0;
+
+	if (!dlm_funs_ready()) {
+		pr_err("Something wrong with dlm library\n");
+		return -1;
+	}
+
+	ret = get_cluster_name(&cluster_name);
+	if (ret) {
+		pr_err("The md can't get cluster name\n");
+		return -1;
+	}
+
+	/*
+	 * Zero the resource so lksb.sb_status / lksb.sb_lkid hold defined
+	 * values even if ls_lock() fails before filling them in; the retry
+	 * test below and cluster_release_dlmlock() both read them.
+	 */
+	dlm_lock_res = xcalloc(1, sizeof(*dlm_lock_res));
+	dlm_lock_res->ls = dlm_hooks->open_lockspace(cluster_name);
+	if (!dlm_lock_res->ls) {
+		dlm_lock_res->ls = dlm_hooks->create_lockspace(cluster_name, O_RDWR);
+		if (!dlm_lock_res->ls) {
+			pr_err("%s failed to create lockspace\n", cluster_name);
+			return -ENOMEM;
+		}
+	} else {
+		pr_err("open existed %s lockspace\n", cluster_name);
+	}
+
+	snprintf(str, sizeof(str), "bitmap%s", cluster_name);
+retry:
+	ret = dlm_hooks->ls_lock(dlm_lock_res->ls, LKM_PWMODE,
+				 &dlm_lock_res->lksb, flags, str, strlen(str),
+				 0, dlm_ast, dlm_lock_res, NULL, NULL);
+	if (ret) {
+		pr_err("error %d when get PW mode on lock %s\n", errno, str);
+		/* let's try several times if EAGAIN happened */
+		if (dlm_lock_res->lksb.sb_status == EAGAIN && retry_count < 10) {
+			sleep(10);
+			retry_count++;
+			goto retry;
+		}
+		dlm_hooks->release_lockspace(cluster_name, dlm_lock_res->ls, 1);
+		return ret;
+	}
+
+	/* Wait for it to complete; a poll failure means no AST was seen */
+	if (poll_for_ast(dlm_lock_res->ls) < 0) {
+		pr_err("failed to lock cluster\n");
+		return -1;
+	}
+
+	if (dlm_lock_res->lksb.sb_status) {
+		pr_err("failed to lock cluster\n");
+		return -1;
+	}
+	return 1;
+}
+
+/*
+ * Drop the lock taken by cluster_get_dlmlock() and release the lockspace.
+ *
+ * Returns 0 when the lock was released and the lockspace given up,
+ * otherwise a non-zero value (-1 when there is nothing to release or the
+ * unlock AST reported a failure, else the failing hook's result).
+ */
+int cluster_release_dlmlock(void)
+{
+	int ret = -1;
+
+	if (!cluster_name)
+		goto out;
+
+	/* nothing was ever allocated, or it has already been released */
+	if (!dlm_lock_res)
+		goto out;
+
+	if (!dlm_lock_res->lksb.sb_lkid)
+		goto out;
+
+	ret = dlm_hooks->ls_unlock_wait(dlm_lock_res->ls,
+					dlm_lock_res->lksb.sb_lkid, 0,
+					&dlm_lock_res->lksb);
+	if (ret) {
+		pr_err("error %d happened when unlock\n", errno);
+		/* XXX make sure the lock is unlocked eventually */
+		goto out;
+	}
+
+	/* Wait for it to complete */
+	poll_for_ast(dlm_lock_res->ls);
+
+	errno = dlm_lock_res->lksb.sb_status;
+	if (errno != EUNLOCK) {
+		pr_err("error %d happened in ast when unlock lockspace\n",
+		       errno);
+		/* XXX make sure the lockspace is unlocked eventually */
+		goto out;
+	}
+
+	ret = dlm_hooks->release_lockspace(cluster_name, dlm_lock_res->ls, 1);
+	if (ret) {
+		pr_err("error %d happened when release lockspace\n", errno);
+		/* XXX make sure the lockspace is released eventually */
+		goto out;
+	}
+	free(dlm_lock_res);
+	/* do not leave a dangling pointer behind for a later call */
+	dlm_lock_res = NULL;
+
+out:
+	return ret;
+}
+
+/*
+ * Return 1 if sysfs reports an array state other than
+ * ARRAY_UNKNOWN_STATE for fd, or - when sysfs cannot be read - if the
+ * RAID_VERSION ioctl returns 0.  Return 0 otherwise.
+ */
+int md_array_valid(int fd)
+{
+	struct mdinfo *sra = sysfs_read(fd, NULL, GET_ARRAY_STATE);
+	int ret;
+
+	if (!sra) {
+		/* no sysfs information: fall back to the RAID_VERSION ioctl */
+		ret = ioctl(fd, RAID_VERSION);
+		return !ret;
+	}
+
+	ret = (sra->array_state != ARRAY_UNKNOWN_STATE) ? 0 : -ENODEV;
+	free(sra);
+
+	return !ret;
+}
+
+/*
+ * Return 1 if the array behind fd is active, 0 otherwise.
+ * The sysfs array state is preferred; when sysfs cannot be read the
+ * array counts as active if the GET_ARRAY_INFO ioctl returns 0.
+ */
+int md_array_active(int fd)
+{
+	struct mdu_array_info_s array;
+	struct mdinfo *sra = sysfs_read(fd, NULL, GET_ARRAY_STATE);
+	int ret;
+
+	if (sra) {
+		ret = md_array_is_active(sra) ? 0 : -ENODEV;
+		free(sra);
+	} else {
+		/* sysfs unavailable: fall back to the GET_ARRAY_INFO ioctl */
+		ret = ioctl(fd, GET_ARRAY_INFO, &array);
+	}
+
+	return !ret;
+}
+
+/*
+ * Return 1 unless info->array_state is ARRAY_CLEAR, ARRAY_INACTIVE or
+ * ARRAY_UNKNOWN_STATE, in which case return 0.
+ */
+int md_array_is_active(struct mdinfo *info)
+{
+	if (info->array_state == ARRAY_CLEAR)
+		return 0;
+	if (info->array_state == ARRAY_INACTIVE)
+		return 0;
+	if (info->array_state == ARRAY_UNKNOWN_STATE)
+		return 0;
+	return 1;
+}
+
+/*
+ * Get array info from the kernel with the GET_ARRAY_INFO ioctl and hand
+ * the ioctl's return value back unchanged.  Longer term we want to
+ * deprecate the ioctl and get it from sysfs.
+ */
+int md_get_array_info(int fd, struct mdu_array_info_s *array)
+{
+	int rc = ioctl(fd, GET_ARRAY_INFO, array);
+
+	return rc;
+}
+
+/*
+ * Set array info: issue the SET_ARRAY_INFO ioctl on fd and hand its
+ * return value back unchanged.
+ */
+int md_set_array_info(int fd, struct mdu_array_info_s *array)
+{
+	int rc = ioctl(fd, SET_ARRAY_INFO, array);
+
+	return rc;
+}
+
+/*
+ * Get disk info from the kernel with the GET_DISK_INFO ioctl and hand
+ * the ioctl's return value back unchanged.
+ */
+int md_get_disk_info(int fd, struct mdu_disk_info_s *disk)
+{
+	int rc = ioctl(fd, GET_DISK_INFO, disk);
+
+	return rc;
+}
+
+/*
+ * Encode the running kernel release "a.b.c" as a*1000000 + b*1000 + c.
+ * Components missing from the release string count as 0.
+ * Returns -1 if uname() fails.
+ */
+int get_linux_version()
+{
+	struct utsname uts;
+	int part[3] = { 0, 0, 0 };
+	char *p;
+	int idx;
+
+	if (uname(&uts) < 0)
+		return -1;
+
+	p = uts.release;
+	part[0] = strtoul(p, &p, 10);
+	/* each further component must be introduced by a '.' */
+	for (idx = 1; idx < 3 && *p == '.'; idx++)
+		part[idx] = strtoul(p + 1, &p, 10);
+
+	return (part[0] * 1000000) + (part[1] * 1000) + part[2];
+}
+
+/*
+ * Parse a version banner of the form "... - vA.B[.C] ..." and return
+ * A*1000000 + B*1000 + C.  A NULL argument means the global Version
+ * string.  Returns -1 when the text does not have that shape: no "- v"
+ * marker, no '.' after A, or the number is not followed by ' ' or '-'.
+ */
+int mdadm_version(char *version)
+{
+	int major_no, minor_no, patch_no = 0;
+	char *p;
+
+	if (!version)
+		version = Version;
+
+	p = strchr(version, '-');
+	if (!p || p[1] != ' ' || p[2] != 'v')
+		return -1;
+	p += 3;
+
+	major_no = strtoul(p, &p, 10);
+	if (*p != '.')
+		return -1;
+	minor_no = strtoul(p + 1, &p, 10);
+	if (*p == '.')
+		patch_no = strtoul(p + 1, &p, 10);
+
+	if (*p != ' ' && *p != '-')
+		return -1;
+
+	return (major_no * 1000000) + (minor_no * 1000) + patch_no;
+}
+
+/*
+ * Parse 'size', which should be a positive number optionally followed
+ * by 'K', 'M', 'G', 'T' or 's' (sectors).  Without a suffix, K is assumed.
+ * The number returned is in sectors (half-K).
+ * INVALID_SECTORS is returned on error: a value that is not positive,
+ * or anything left over after the optional suffix.
+ */
+unsigned long long parse_size(char *size)
+{
+	char *end;
+	long long sectors = strtoll(size, &end, 10);
+
+	if (sectors <= 0)
+		return INVALID_SECTORS;
+
+	if (*end == 'K') {
+		end++;
+		sectors *= 2;
+	} else if (*end == 'M') {
+		end++;
+		sectors *= 1024 * 2;
+	} else if (*end == 'G') {
+		end++;
+		sectors *= 1024 * 1024 * 2;
+	} else if (*end == 'T') {
+		end++;
+		sectors *= 1024 * 1024 * 1024 * 2LL;
+	} else if (*end == 's') {
+		/* already in sectors */
+		end++;
+	} else {
+		/* no (known) suffix: kilobytes */
+		sectors *= 2;
+	}
+
+	if (*end)
+		sectors = INVALID_SECTORS;
+	return sectors;
+}
+
+/*
+ * Return 1 if the raid10 layout word has at most one far copy
+ * (bits 8-15) and bit 16 clear, otherwise 0.
+ */
+int is_near_layout_10(int layout)
+{
+	int far_copies = (layout >> 8) & 255;
+	int bit16 = layout & (1 << 16);
+
+	return !(far_copies > 1 || bit16 > 0);
+}
+
+/*
+ * Parse the layout string for raid10: 'n', 'f' or 'o' followed by a
+ * copy count in the range 1..200 and nothing else.
+ * Returns the encoded layout word, or -1 if the string is malformed.
+ */
+int parse_layout_10(char *layout)
+{
+	char kind = layout[0];
+	char *end;
+	int copies;
+
+	if (kind != 'n' && kind != 'f' && kind != 'o')
+		return -1;
+
+	copies = strtoul(layout + 1, &end, 10);
+	if (copies < 1 || copies > 200 || *end)
+		return -1;
+
+	switch (kind) {
+	case 'n':
+		return 256 + copies;
+	case 'o':
+		return 0x10000 + (copies << 8) + 1;
+	default:	/* 'f' */
+		return 1 + (copies << 8);
+	}
+}
+
+/*
+ * Parse the layout string for 'faulty': a mode name looked up in
+ * faultylayout, optionally followed by a decimal number that is placed
+ * above ModeShift.
+ * Returns the encoded layout, or -1 for a NULL string or unknown mode.
+ */
+int parse_layout_faulty(char *layout)
+{
+	int ln, mode;
+	char *m;
+
+	if (!layout)
+		return -1;
+
+	/* the mode name is everything before the first digit */
+	ln = strcspn(layout, "0123456789");
+	m = xstrdup(layout);
+	m[ln] = 0;
+	mode = map_name(faultylayout, m);
+	/* the copy was only needed for the lookup; it used to be leaked */
+	free(m);
+	if (mode == UnSet)
+		return -1;
+
+	return mode | (atoi(layout+ln)<< ModeShift);
+}
+
+/*
+ * Split an argument of the form "<slot>:<devname>".
+ * *slot always receives the parsed leading number.  On success *devname
+ * is pointed at the text after the ':' (inside 'input') and 0 is
+ * returned; -1 is returned if there is no leading number or no ':'
+ * right after it.
+ */
+int parse_cluster_confirm_arg(char *input, char **devname, int *slot)
+{
+	char *rest;
+	unsigned long number = strtoul(input, &rest, 10);
+
+	*slot = number;
+	if (rest != input && rest[0] == ':') {
+		*devname = rest + 1;
+		return 0;
+	}
+	return -1;
+}
+
+/*
+ * Remove partitions from this block device.
+ * This is used for components added to an array.
+ * Issues BLKPG_DEL_PARTITION for partition numbers 0..15; the ioctl
+ * results are deliberately ignored.
+ */
+void remove_partitions(int fd)
+{
+#ifdef BLKPG_DEL_PARTITION
+	struct blkpg_partition part;
+	struct blkpg_ioctl_arg req;
+	int pno;
+
+	memset(&part, 0, sizeof(part));
+	req.op = BLKPG_DEL_PARTITION;
+	req.flags = 0;
+	req.datalen = sizeof(part);
+	req.data = (void *)&part;
+
+	for (pno = 0; pno < 16; pno++) {
+		part.pno = pno;
+		ioctl(fd, BLKPG, &req);
+	}
+#endif
+}
+
+/*
+ * Check if fd is a whole-disk or a partition.
+ * BLKPG will return EINVAL on a partition, and BLKPG_DEL_PARTITION
+ * will return ENXIO on an invalid partition number.
+ * Returns 0 for "not a partition" (ioctl succeeded, or failed with
+ * ENXIO or ENOTTY) and 1 otherwise.
+ */
+int test_partition(int fd)
+{
+	struct blkpg_partition part;
+	struct blkpg_ioctl_arg req;
+
+	memset(&part, 0, sizeof(part));
+	part.pno = 1<<30;	/* a partition number that should not exist */
+
+	req.op = BLKPG_DEL_PARTITION;
+	req.flags = 0;
+	req.datalen = sizeof(part);
+	req.data = (void *)&part;
+
+	if (ioctl(fd, BLKPG, &req) == 0)
+		/* Very unlikely, but not a partition */
+		return 0;
+
+	return !(errno == ENXIO || errno == ENOTTY);
+}
+
+/*
+ * Run test_partition() on the device with number 'id', opened through
+ * dev_open() using its "major:minor" spelling.
+ * Returns -1 if the device cannot be opened, else test_partition()'s result.
+ */
+int test_partition_from_id(dev_t id)
+{
+	char devspec[20];
+	int fd, result;
+
+	sprintf(devspec, "%d:%d", major(id), minor(id));
+	fd = dev_open(devspec, O_RDONLY);
+	if (fd >= 0) {
+		result = test_partition(fd);
+		close(fd);
+		return result;
+	}
+	return -1;
+}
+
+/*
+ * Decide whether the devices flagged in avail[0..raid_disks-1] are
+ * enough to run an array of the given level and layout.  'clean' says
+ * whether the array is clean; raid4/5/6 tolerate missing devices only
+ * when it is (apart from a missing raid4 parity device).
+ * Returns 1 if there are enough devices, 0 if not or if the level is
+ * not handled here.
+ */
+int enough(int level, int raid_disks, int layout, int clean, char *avail)
+{
+	int present = 0;
+	int d;
+
+	for (d = 0; d < raid_disks; d++)
+		if (avail[d])
+			present++;
+
+	switch (level) {
+	case 10: {
+		/* This is the tricky one - we need to check
+		 * which actual disks are present.
+		 */
+		int near = layout & 255;
+		int copies = near * ((layout >> 8) & 255);
+		int start = 0;
+
+		do {
+			/* at least one of the 'copies' devices counted
+			 * from 'start' must be present
+			 */
+			int found = 0;
+			int slot = start;
+			int k;
+
+			for (k = 0; k < copies; k++) {
+				if (avail[slot])
+					found++;
+				slot = (slot + 1) % raid_disks;
+			}
+			if (found == 0)
+				return 0;
+			start = (start + near) % raid_disks;
+		} while (start != 0);
+		return 1;
+	}
+
+	case LEVEL_MULTIPATH:
+	case 1:
+		return present >= 1;
+
+	case LEVEL_LINEAR:
+	case 0:
+		return present == raid_disks;
+
+	case 4:
+		/* If just the parity device is missing, then we
+		 * have enough, even if not clean
+		 */
+		if (present == raid_disks - 1 && !avail[raid_disks - 1])
+			return 1;
+		/* FALL THROUGH */
+	case 5:
+		return present >= (clean ? raid_disks - 1 : raid_disks);
+
+	case 6:
+		return present >= (clean ? raid_disks - 2 : raid_disks);
+
+	default:
+		return 0;
+	}
+}
+
+/*
+ * Format a uuid into 'buf' as "UUID-" followed by four groups of eight
+ * hex digits separated by 'sep'.  The 16 uuid bytes come from
+ * copy_uuid(uuid, id, swap); within each 4-byte group the bytes are
+ * printed from the highest index down.  'buf' must have room for the
+ * whole string; it is returned.
+ */
+char *__fname_from_uuid(int id[4], int swap, char *buf, char sep)
+{
+	char uuid[16];
+	char *out;
+	int word, byte;
+
+	strcpy(buf, "UUID-");
+	out = buf + strlen(buf);
+	copy_uuid(uuid, id, swap);
+
+	for (word = 0; word < 4; word++) {
+		if (word > 0)
+			*out++ = sep;
+		for (byte = 3; byte >= 0; byte--, out += 2)
+			sprintf(out, "%02x",
+				(unsigned char) uuid[word * 4 + byte]);
+	}
+	return buf;
+}
+
+/*
+ * Format info->uuid into 'buf' via __fname_from_uuid() and return 'buf'.
+ * The swap argument passed on is 1 in the big-endian branch; otherwise it
+ * is 1 for super1 metadata and st->ss->swapuuid for everything else.
+ */
+char *fname_from_uuid(struct supertype *st, struct mdinfo *info,
+ char *buf, char sep)
+{
+ // dirty hack to work around an issue with super1 superblocks...
+ // super1 superblocks need swapuuid set in order for assembly to
+ // work, but can't have it set if we want this printout to match
+ // all the other uuid printouts in super1.c, so we force swapuuid
+ // to 1 to make our printout match the rest of super1
+#if __BYTE_ORDER == BIG_ENDIAN
+ return __fname_from_uuid(info->uuid, 1, buf, sep);
+#else
+ return __fname_from_uuid(info->uuid, (st->ss == &super1) ? 1 :
+ st->ss->swapuuid, buf, sep);
+#endif
+}
+
+/*
+ * Check for an ext2fs file system and report it if found.
+ * Returns 1 (after printing a message) when the ext2 magic is present,
+ * 0 when it is not or the superblock cannot be read.
+ */
+int check_ext2(int fd, char *name)
+{
+	/*
+	 * Superblock is always 1K at 1K offset
+	 *
+	 * s_magic is le16 at 56 == 0xEF53
+	 * report mtime - le32 at 44
+	 * blocks - le32 at 4
+	 * logblksize - le32 at 24
+	 */
+	unsigned char sb[1024];
+	time_t mtime;
+	unsigned long long size;
+	unsigned int bsize;
+
+	if (lseek(fd, 1024, SEEK_SET) != 1024)
+		return 0;
+	if (read(fd, sb, 1024) != 1024)
+		return 0;
+	if (sb[56] != 0x53 || sb[57] != 0xef)
+		return 0;
+
+	/*
+	 * Assemble the little-endian fields in unsigned arithmetic: done
+	 * in plain int, a top byte >= 0x80 lands in the sign bit and the
+	 * value is sign-extended when widened.
+	 */
+	mtime = (unsigned int)sb[44] | (unsigned int)sb[45] << 8 |
+		(unsigned int)sb[46] << 16 | (unsigned int)sb[47] << 24;
+	bsize = (unsigned int)sb[24] | (unsigned int)sb[25] << 8 |
+		(unsigned int)sb[26] << 16 | (unsigned int)sb[27] << 24;
+	size = (unsigned int)sb[4] | (unsigned int)sb[5] << 8 |
+		(unsigned int)sb[6] << 16 | (unsigned int)sb[7] << 24;
+	/* skip the shift for absurd values: an oversized shift is undefined */
+	if (bsize < 32)
+		size <<= bsize;
+	pr_err("%s appears to contain an ext2fs file system\n",
+	       name);
+	cont_err("size=%lluK mtime=%s", size, ctime(&mtime));
+	return 1;
+}
+
+/*
+ * Check for a reiserfs file system and report it if found.
+ * Returns 1 (after printing a message) when one of the reiserfs magic
+ * strings is present, 0 when not or the superblock cannot be read.
+ */
+int check_reiser(int fd, char *name)
+{
+	/*
+	 * superblock is at 64K
+	 * size is 1024;
+	 * Magic string "ReIsErFs" or "ReIsEr2Fs" at 52
+	 *
+	 */
+	unsigned char sb[1024];
+	unsigned long long size;
+
+	if (lseek(fd, 64*1024, SEEK_SET) != 64*1024)
+		return 0;
+	if (read(fd, sb, 1024) != 1024)
+		return 0;
+	if (strncmp((char*)sb+52, "ReIsErFs",8) != 0 &&
+	    strncmp((char*)sb+52, "ReIsEr2Fs",9) != 0)
+		return 0;
+	pr_err("%s appears to contain a reiserfs file system\n",name);
+	/*
+	 * le32 at offset 0, assembled unsigned so a top byte >= 0x80
+	 * cannot turn into a sign-extended 64-bit value
+	 */
+	size = (unsigned int)sb[0] | (unsigned int)sb[1] << 8 |
+		(unsigned int)sb[2] << 16 | (unsigned int)sb[3] << 24;
+	cont_err("size = %lluK\n", size*4);
+
+	return 1;
+}
+
+/*
+ * Report whether 'fd' (called 'name' in messages) carries metadata that
+ * guess_super() recognises: either a raid superblock or a partition
+ * table.  Returns 1 after printing a description, 0 if nothing was
+ * recognised.
+ */
+int check_raid(int fd, char *name)
+{
+	struct mdinfo info;
+	time_t crtime;
+	char *level;
+	struct supertype *st = guess_super(fd);
+
+	if (!st)
+		return 0;
+	if (st->ss->add_to_super != NULL) {
+		st->ss->load_super(st, fd, name);
+		/* Looks like a raid array .. */
+		pr_err("%s appears to be part of a raid array:\n", name);
+		st->ss->getinfo_super(st, &info, NULL);
+		st->ss->free_super(st);
+		crtime = info.array.ctime;
+		level = map_num(pers, info.array.level);
+		if (!level)
+			level = "-unknown-";
+		cont_err("level=%s devices=%d ctime=%s",
+			 level, info.array.raid_disks, ctime(&crtime));
+	} else {
+		/* Looks like GPT or MBR */
+		pr_err("partition table exists on %s\n", name);
+	}
+	/*
+	 * The supertype comes from guess_super_type(), which allocates it
+	 * with xcalloc() and hands ownership to the caller.
+	 * NOTE(review): assumes guess_super() is a thin wrapper around
+	 * guess_super_type() - confirm in mdadm.h.
+	 */
+	free(st);
+	return 1;
+}
+
+/*
+ * Return 1 if the open descriptor 'fd' is a block device, storing its
+ * device number in *rdev when rdev is not NULL.  Otherwise print an
+ * error mentioning 'devname' and return 0.
+ */
+int fstat_is_blkdev(int fd, char *devname, dev_t *rdev)
+{
+	struct stat sb;
+
+	if (fstat(fd, &sb) != 0) {
+		pr_err("fstat failed for %s: %s\n", devname, strerror(errno));
+		return 0;
+	}
+	if (!S_ISBLK(sb.st_mode)) {
+		pr_err("%s is not a block device.\n", devname);
+		return 0;
+	}
+
+	if (rdev != NULL)
+		*rdev = sb.st_rdev;
+	return 1;
+}
+
+/*
+ * Return 1 if the path 'devname' is a block device, storing its device
+ * number in *rdev when rdev is not NULL.  Otherwise print an error and
+ * return 0.
+ */
+int stat_is_blkdev(char *devname, dev_t *rdev)
+{
+	struct stat sb;
+
+	if (stat(devname, &sb) != 0) {
+		pr_err("stat failed for %s: %s\n", devname, strerror(errno));
+		return 0;
+	}
+	if (!S_ISBLK(sb.st_mode)) {
+		pr_err("%s is not a block device.\n", devname);
+		return 0;
+	}
+
+	if (rdev != NULL)
+		*rdev = sb.st_rdev;
+	return 1;
+}
+
+/*
+ * Print 'mesg' on stderr and read a line from stdin, up to five times.
+ * Returns 1 for an answer starting with 'y'/'Y'; 0 for 'n'/'N', for end
+ * of input, or after five unrecognised answers.  After the first
+ * unrecognised answer the prompt is followed by "(y/n) ".
+ */
+int ask(char *mesg)
+{
+	char *hint = "";
+	char reply[100];
+	int tries_left = 5;
+
+	while (tries_left-- > 0) {
+		fprintf(stderr, "%s%s", mesg, hint);
+		fflush(stderr);
+		if (fgets(reply, 100, stdin) == NULL)
+			return 0;
+		switch (reply[0]) {
+		case 'y':
+		case 'Y':
+			return 1;
+		case 'n':
+		case 'N':
+			return 0;
+		}
+		hint = "(y/n) ";
+	}
+	pr_err("assuming 'no'\n");
+	return 0;
+}
+
+/*
+ * Test if dev is a "standard" md dev name,
+ * i.e. if the last component is "/dNN", "/md_dNN", "/mdNN" or "md/NN",
+ * where NN is a non-empty string of digits.
+ * Returns 1 if a partitionable standard,
+ *        -1 if non-partitionable,
+ *         0 if not a standard name.
+ * When the name is standard and nump is not NULL, *nump receives NN.
+ */
+int is_standard(char *dev, int *nump)
+{
+	char *tail = strrchr(dev, '/');
+	int kind;
+	int value;
+
+	if (tail == NULL)
+		return 0;
+
+	if (strncmp(tail, "/d", 2) == 0) {
+		tail += 2;	/* /dev/md/dN{pM} */
+		kind = 1;
+	} else if (strncmp(tail, "/md_d", 5) == 0) {
+		tail += 5;	/* /dev/md_dN{pM} */
+		kind = 1;
+	} else if (strncmp(tail, "/md", 3) == 0) {
+		tail += 3;	/* /dev/mdN */
+		kind = -1;
+	} else if (tail - dev > 3 && strncmp(tail - 2, "md/", 3) == 0) {
+		tail += 1;	/* /dev/md/N */
+		kind = -1;
+	} else {
+		return 0;
+	}
+
+	if (*tail == '\0')
+		return 0;
+	value = atoi(tail);
+	/* everything that is left must be digits */
+	tail += strspn(tail, "0123456789");
+	if (*tail != '\0')
+		return 0;
+
+	if (nump)
+		*nump = value;
+	return kind;
+}
+
+/*
+ * Sum the first bytes/4 unsigned int words at 'super' into a 64-bit
+ * total and fold the upper 32 bits back into the lower 32.  On alpha
+ * the result is folded further down to 16 bits.
+ */
+unsigned long calc_csum(void *super, int bytes)
+{
+	unsigned int *words = (unsigned int *)super;
+	unsigned long long total = 0;
+	unsigned int folded;
+	int nwords = bytes / 4;
+	int n;
+
+	for (n = 0; n < nwords; n++)
+		total += words[n];
+	folded = (total & 0xffffffff) + (total >> 32);
+#ifdef __alpha__
+/* The in-kernel checksum calculation is always 16bit on
+ * the alpha, though it is 32 bit on i386...
+ * I wonder what it is elsewhere... (it uses an API in
+ * a way that it shouldn't).
+ */
+	folded = (folded & 0xffff) + (folded >> 16);
+	folded = (folded & 0xffff) + (folded >> 16);
+#endif
+	return folded;
+}
+
+/*
+ * Describe 'bytes' as " (X.XX MiB Y.YY MB)" - or GiB/GB, TiB/TB - in a
+ * static buffer that the next call overwrites.  Sizes below 5000 KiB
+ * give an empty string.
+ *
+ * The value is converted to hundredths of the unit with rounding and
+ * printed as a decimal.  MiB/MB are used below 2048 MiB and GiB/GB
+ * below 2048 GiB, as that shows more precision and isn't too large a
+ * number.
+ */
+char *human_size(long long bytes)
+{
+	static char buf[47];
+	const long long two_gib = 2*1024LL*1024LL*1024LL;
+	const long long two_tib = 2*1024LL*1024LL*1024LL*1024LL;
+	long bin, dec;	/* hundredths of the binary / decimal unit */
+
+	if (bytes < 5000*1024) {
+		buf[0] = 0;
+		return buf;
+	}
+
+	if (bytes < two_gib) {
+		bin = (bytes * 200LL / (1LL<<20) + 1) / 2;
+		dec = (bytes / (1000000LL / 200LL) + 1) / 2;
+		snprintf(buf, sizeof(buf), " (%ld.%02ld MiB %ld.%02ld MB)",
+			 bin/100, bin % 100, dec/100, dec % 100);
+	} else if (bytes < two_tib) {
+		bin = (bytes * 200LL / (1LL<<30) + 1) / 2;
+		dec = (bytes / (1000000000LL / 200LL) + 1) / 2;
+		snprintf(buf, sizeof(buf), " (%ld.%02ld GiB %ld.%02ld GB)",
+			 bin/100, bin % 100, dec/100, dec % 100);
+	} else {
+		bin = (bytes * 200LL / (1LL<<40) + 1) / 2;
+		dec = (bytes / (1000000000000LL / 200LL) + 1) / 2;
+		snprintf(buf, sizeof(buf), " (%ld.%02ld TiB %ld.%02ld TB)",
+			 bin/100, bin % 100, dec/100, dec % 100);
+	}
+	return buf;
+}
+
+/*
+ * Describe 'bytes' as "X.XXMiB"-style text in a static buffer that the
+ * next call overwrites.  Sizes below 5000 KiB, and any 'prefix' other
+ * than IEC or JEDEC, give an empty string.
+ *
+ * The value is converted to hundredths of the unit with rounding and
+ * printed as a decimal.  The mega unit is used below 2048 MiB and the
+ * giga unit below 2048 GiB, as that shows more precision and isn't too
+ * large a number.
+ *
+ * If prefix == IEC, the MiB/GiB/TiB units (powers of 1024) are used.
+ * If prefix == JEDEC, the MB/GB/TB units (powers of 1000) are used.
+ */
+char *human_size_brief(long long bytes, int prefix)
+{
+	static char buf[30];
+	const long long two_gib = 2*1024LL*1024LL*1024LL;
+	const long long two_tib = 2*1024LL*1024LL*1024LL*1024LL;
+	long centi;	/* hundredths of the chosen unit */
+
+	buf[0] = 0;
+	if (bytes < 5000*1024)
+		return buf;
+
+	if (prefix == IEC) {
+		if (bytes < two_gib) {
+			centi = (bytes * 200LL / (1LL<<20) + 1) / 2;
+			snprintf(buf, sizeof(buf), "%ld.%02ldMiB",
+				 centi/100, centi % 100);
+		} else if (bytes < two_tib) {
+			centi = (bytes * 200LL / (1LL<<30) + 1) / 2;
+			snprintf(buf, sizeof(buf), "%ld.%02ldGiB",
+				 centi/100, centi % 100);
+		} else {
+			centi = (bytes * 200LL / (1LL<<40) + 1) / 2;
+			snprintf(buf, sizeof(buf), "%ld.%02ldTiB",
+				 centi/100, centi % 100);
+		}
+	} else if (prefix == JEDEC) {
+		if (bytes < two_gib) {
+			centi = (bytes / (1000000LL / 200LL) + 1) / 2;
+			snprintf(buf, sizeof(buf), "%ld.%02ldMB",
+				 centi/100, centi % 100);
+		} else if (bytes < two_tib) {
+			centi = (bytes / (1000000000LL / 200LL) + 1) / 2;
+			snprintf(buf, sizeof(buf), "%ld.%02ldGB",
+				 centi/100, centi % 100);
+		} else {
+			centi = (bytes / (1000000000000LL / 200LL) + 1) / 2;
+			snprintf(buf, sizeof(buf), "%ld.%02ldTB",
+				 centi/100, centi % 100);
+		}
+	}
+
+	return buf;
+}
+
+/*
+ * Print a raid10 layout word on stdout: " near=N" when the near count
+ * (bits 0-7) is not 1, then " far=N" or " offset=N" (chosen by bit 16)
+ * when the far count (bits 8-15) is not 1, separated by ','.
+ * "NO REDUNDANCY" is printed when both counts are 1.
+ */
+void print_r10_layout(int layout)
+{
+	int near_copies = layout & 255;
+	int far_copies = (layout >> 8) & 255;
+	int use_offset = (layout & 0x10000);
+	char *lead = "";
+
+	if (near_copies != 1) {
+		printf("%s near=%d", lead, near_copies);
+		lead = ",";
+	}
+	if (far_copies != 1) {
+		char *label = use_offset ? "offset" : "far";
+
+		printf("%s %s=%d", lead, label, far_copies);
+	}
+	if (near_copies * far_copies == 1)
+		printf("NO REDUNDANCY");
+}
+
+/*
+ * Compute the array size from the per-device size.  Level 1 returns
+ * devsize unchanged; otherwise devsize is masked with
+ * ~((chunksize >> 9) - 1) and multiplied by
+ * get_data_disks(level, layout, raid_disks).
+ */
+unsigned long long calc_array_size(int level, int raid_disks, int layout,
+				   int chunksize, unsigned long long devsize)
+{
+	unsigned long long usable;
+
+	if (level == 1)
+		return devsize;
+
+	usable = devsize & ~(unsigned long long)((chunksize>>9)-1);
+	return get_data_disks(level, layout, raid_disks) * usable;
+}
+
+/*
+ * Return the number of data-bearing devices for an array of the given
+ * level, layout and raid_disks.  Levels not handled here, and a raid10
+ * layout whose near or far copy count is zero, give 0.
+ */
+int get_data_disks(int level, int layout, int raid_disks)
+{
+	int data_disks = 0;
+	int near, far;
+
+	switch (level) {
+	case 0:
+		data_disks = raid_disks;
+		break;
+	case 1:
+		data_disks = 1;
+		break;
+	case 4:
+	case 5:
+		data_disks = raid_disks - 1;
+		break;
+	case 6:
+		data_disks = raid_disks - 2;
+		break;
+	case 10:
+		near = layout & 255;
+		far = (layout >> 8) & 255;
+		/* a zero copy count would otherwise divide by zero */
+		if (near != 0 && far != 0)
+			data_disks = raid_disks / near / far;
+		break;
+	default:
+		break;
+	}
+
+	return data_disks;
+}
+
+/*
+ * Translate a kernel device name such as "md0" or "md_d1" into a device
+ * number.  Returns 0 if the name cannot be resolved.
+ */
+dev_t devnm2devid(char *devnm)
+{
+	/* First look in /sys/block/$DEVNM/dev for %d:%d
+	 * If that fails, try parsing out a number
+	 */
+	char path[PATH_MAX];
+	char *ep;
+	int fd;
+	int mjr,mnr;
+
+	snprintf(path, sizeof(path), "/sys/block/%s/dev", devnm);
+	fd = open(path, O_RDONLY);
+	if (fd >= 0) {
+		char buf[20];
+		/* leave room for the terminating NUL written below */
+		int n = read(fd, buf, sizeof(buf) - 1);
+		close(fd);
+		if (n > 0)
+			buf[n] = 0;
+		if (n > 0 && sscanf(buf, "%d:%d\n", &mjr, &mnr) == 2)
+			return makedev(mjr, mnr);
+	}
+	if (strncmp(devnm, "md_d", 4) == 0 &&
+	    isdigit(devnm[4]) &&
+	    (mnr = strtoul(devnm+4, &ep, 10)) >= 0 &&
+	    ep > devnm && *ep == 0)
+		return makedev(get_mdp_major(), mnr << MdpMinorShift);
+
+	if (strncmp(devnm, "md", 2) == 0 &&
+	    isdigit(devnm[2]) &&
+	    (mnr = strtoul(devnm+2, &ep, 10)) >= 0 &&
+	    ep > devnm && *ep == 0)
+		return makedev(MD_MAJOR, mnr);
+
+	return 0;
+}
+
+/* Return 1 if 'path' is a block device whose device number is 'rdev'. */
+static int md_name_matches(const char *path, dev_t rdev)
+{
+	struct stat stb;
+
+	return stat(path, &stb) == 0 &&
+	       (S_IFMT & stb.st_mode) == S_IFBLK &&
+	       stb.st_rdev == rdev;
+}
+
+/*
+ * Find a path in /dev for the md device named 'devnm'.
+ * Tried in order: /dev/md/<name without "md_"> (only for "md_" names),
+ * /dev/<devnm>, /dev/md/<devnm without its first two characters>, then
+ * whatever map_dev() knows; failing all that a node /dev/.tmp.<devnm>
+ * is created.  Returns NULL if no usable path could be found or made.
+ * The result may live in a static buffer that the next call overwrites.
+ */
+char *get_md_name(char *devnm)
+{
+	static char devname[50];
+	dev_t rdev = devnm2devid(devnm);
+	char *mapped;
+
+	if (rdev == 0)
+		return 0;
+
+	if (strncmp(devnm, "md_", 3) == 0) {
+		snprintf(devname, sizeof(devname), "/dev/md/%s",
+			 devnm + 3);
+		if (md_name_matches(devname, rdev))
+			return devname;
+	}
+
+	snprintf(devname, sizeof(devname), "/dev/%s", devnm);
+	if (md_name_matches(devname, rdev))
+		return devname;
+
+	snprintf(devname, sizeof(devname), "/dev/md/%s", devnm+2);
+	if (md_name_matches(devname, rdev))
+		return devname;
+
+	mapped = map_dev(major(rdev), minor(rdev), 0);
+	if (mapped)
+		return mapped;
+
+	/* last resort: make a temporary node; an existing one is reused */
+	snprintf(devname, sizeof(devname), "/dev/.tmp.%s", devnm);
+	if (mknod(devname, S_IFBLK | 0600, rdev) == -1 && errno != EEXIST)
+		return NULL;
+
+	if (md_name_matches(devname, rdev))
+		return devname;
+	unlink(devname);
+	return NULL;
+}
+
+/* Unlink 'name' if it starts with "/dev/.tmp.md"; otherwise do nothing. */
+void put_md_name(char *name)
+{
+	static const char tmp_prefix[] = "/dev/.tmp.md";
+
+	if (strncmp(name, tmp_prefix, sizeof(tmp_prefix) - 1) == 0)
+		unlink(name);
+}
+
+/*
+ * Parse "MAJOR:MINOR" (numbers in any base strtoul(…, 0) accepts).
+ * *major is always written; *minor only once the "MAJOR:" part has been
+ * accepted.  Returns 1 if the whole string matched, else 0.
+ */
+int get_maj_min(char *dev, int *major, int *minor)
+{
+	char *end;
+
+	*major = strtoul(dev, &end, 0);
+	if (end == dev || *end != ':' || end[1] == '\0')
+		return 0;
+
+	*minor = strtoul(end + 1, &end, 0);
+	if (*minor < 0)
+		return 0;
+	return *end == 0;
+}
+
+/*
+ * Like 'open' (with O_DIRECT added to 'flags'), but if 'dev' matches
+ * %d:%d, create a temporary block device node, open that and unlink it
+ * again.  The node is tried in /dev first and then in /tmp.
+ * Returns the descriptor, or -1 on failure or when 'dev' is NULL.
+ */
+int dev_open(char *dev, int flags)
+{
+	char devname[32];
+	int fd = -1;
+	int major;
+	int minor;
+
+	if (!dev)
+		return -1;
+	flags |= O_DIRECT;
+
+	if (!get_maj_min(dev, &major, &minor))
+		return open(dev, flags);
+
+	snprintf(devname, sizeof(devname), "/dev/.tmp.md.%d:%d:%d",
+		 (int)getpid(), major, minor);
+	if (mknod(devname, S_IFBLK|0600, makedev(major, minor)) == 0) {
+		fd = open(devname, flags);
+		unlink(devname);
+	}
+	if (fd >= 0)
+		return fd;
+
+	/* Try /tmp as /dev appear to be read-only */
+	snprintf(devname, sizeof(devname), "/tmp/.tmp.md.%d:%d:%d",
+		 (int)getpid(), major, minor);
+	if (mknod(devname, S_IFBLK|0600, makedev(major, minor)) == 0) {
+		fd = open(devname, flags);
+		unlink(devname);
+	}
+	return fd;
+}
+
+/*
+ * Open the device named 'devnm': resolve it with devnm2devid() and pass
+ * the resulting "major:minor" string and 'flags' to dev_open(), whose
+ * result is returned.
+ */
+int open_dev_flags(char *devnm, int flags)
+{
+	char devspec[20];
+	dev_t devid = devnm2devid(devnm);
+
+	sprintf(devspec, "%d:%d", major(devid), minor(devid));
+	return dev_open(devspec, flags);
+}
+
+/* Open the device named 'devnm' read-only; see open_dev_flags(). */
+int open_dev(char *devnm)
+{
+	int fd = open_dev_flags(devnm, O_RDONLY);
+
+	return fd;
+}
+
+/*
+ * Open the device named 'devnm' with O_EXCL, read-write if possible.
+ * EACCES on the read-write attempt switches to read-only; EBUSY is
+ * retried after a sleep that doubles from 1ms until it reaches at least
+ * 200ms.  At most 25 attempts are made in total.
+ * Returns the descriptor, or a negative value on failure.
+ */
+int open_dev_excl(char *devnm)
+{
+	char devspec[20];
+	dev_t devid = devnm2devid(devnm);
+	int flags = O_RDWR;
+	long delay = 1000;
+	int attempt = 0;
+
+	sprintf(devspec, "%d:%d", major(devid), minor(devid));
+	while (attempt < 25) {
+		int fd = dev_open(devspec, flags|O_EXCL);
+
+		attempt++;
+		if (fd >= 0)
+			return fd;
+		if (errno == EACCES && flags == O_RDWR) {
+			/* retry at once without write access */
+			flags = O_RDONLY;
+			continue;
+		}
+		if (errno != EBUSY)
+			return fd;
+		usleep(delay);
+		if (delay < 200000)
+			delay *= 2;
+	}
+	return -1;
+}
+
+/*
+ * Return 1 if both paths can be stat()ed, both are block devices and
+ * both have the same device number; 0 otherwise.
+ */
+int same_dev(char *one, char *two)
+{
+	struct stat sb_one, sb_two;
+
+	return stat(one, &sb_one) == 0 &&
+	       stat(two, &sb_two) == 0 &&
+	       S_ISBLK(sb_one.st_mode) &&
+	       S_ISBLK(sb_two.st_mode) &&
+	       sb_one.st_rdev == sb_two.st_rdev;
+}
+
+/*
+ * Wait for the path 'dev' to become a block device with the same device
+ * number as the open descriptor 'fd'.  Checks up to 25 times, sleeping
+ * in between (starting at 1ms and doubling until the delay reaches at
+ * least 200ms), then prints a timeout message.  Returns at once if 'fd'
+ * is not a block device.
+ */
+void wait_for(char *dev, int fd)
+{
+	struct stat want;
+	long delay = 1000;
+	int tries;
+
+	if (fstat(fd, &want) != 0 || !S_ISBLK(want.st_mode))
+		return;
+
+	for (tries = 25; tries > 0; tries--) {
+		struct stat have;
+
+		if (stat(dev, &have) == 0 &&
+		    S_ISBLK(have.st_mode) &&
+		    have.st_rdev == want.st_rdev)
+			return;
+		usleep(delay);
+		if (delay < 200000)
+			delay *= 2;
+	}
+	/* only reached when every attempt failed */
+	pr_err("timeout waiting for %s\n", dev);
+}
+
+/*
+ * NULL-terminated table of the known metadata handlers.  super_by_fd()
+ * and guess_super_type() walk it from the front, so the order of the
+ * entries decides which handler is asked first.
+ */
+struct superswitch *superlist[] =
+{
+ &super0, &super1,
+ &super_ddf, &super_imsm,
+ &mbr, &gpt,
+ NULL
+};
+
+/*
+ * Work out which metadata handler applies to the array open on 'fd'.
+ *
+ * The metadata version is taken from sysfs, or from the GET_ARRAY_INFO
+ * ioctl when sysfs cannot be read, and offered to each superlist entry's
+ * match_metadata_desc() until one accepts it.
+ *
+ * Returns the matching supertype with sb cleared and devnm /
+ * container_devnm filled in, or NULL when no handler matches.  If
+ * 'subarrayp' is not NULL and a handler matched, *subarrayp receives an
+ * xstrdup()ed subarray name for a container member, or NULL otherwise.
+ */
+struct supertype *super_by_fd(int fd, char **subarrayp)
+{
+ mdu_array_info_t array;
+ int vers;
+ int minor;
+ struct supertype *st = NULL;
+ struct mdinfo *sra;
+ char *verstr;
+ char version[20];
+ int i;
+ char *subarray = NULL;
+ char container[32] = "";
+
+ sra = sysfs_read(fd, NULL, GET_VERSION);
+
+ if (sra) {
+ vers = sra->array.major_version;
+ minor = sra->array.minor_version;
+ verstr = sra->text_version;
+ } else {
+ /* no sysfs: ask the kernel; a failed ioctl is treated as version 0.0 */
+ if (md_get_array_info(fd, &array))
+ array.major_version = array.minor_version = 0;
+ vers = array.major_version;
+ minor = array.minor_version;
+ verstr = "";
+ }
+
+ /* any major version other than -1 is matched as "major.minor" text */
+ if (vers != -1) {
+ sprintf(version, "%d.%d", vers, minor);
+ verstr = version;
+ }
+ if (minor == -2 && is_subarray(verstr)) {
+ /* skip the first character; the rest is "<container>[/<subarray>]" */
+ char *dev = verstr+1;
+
+ subarray = strchr(dev, '/');
+ if (subarray) {
+ /* cut the text at the '/' and keep a private copy of the tail */
+ *subarray++ = '\0';
+ subarray = xstrdup(subarray);
+ }
+ strcpy(container, dev);
+ /* from here on the metadata version of the container is what counts */
+ sysfs_free(sra);
+ sra = sysfs_read(-1, container, GET_VERSION);
+ if (sra && sra->text_version[0])
+ verstr = sra->text_version;
+ else
+ verstr = "-no-metadata-";
+ }
+
+ /* the first handler that recognises the version string wins */
+ for (i = 0; st == NULL && superlist[i]; i++)
+ st = superlist[i]->match_metadata_desc(verstr);
+
+ sysfs_free(sra);
+ if (st) {
+ st->sb = NULL;
+ if (subarrayp)
+ *subarrayp = subarray;
+ strcpy(st->container_devnm, container);
+ strcpy(st->devnm, fd2devnm(fd));
+ } else
+ free(subarray);
+
+ return st;
+}
+
+/*
+ * Look up the size of the device with number 'id' via get_dev_size().
+ * Returns 1 and fills *size on success, 0 if the device cannot be
+ * opened or its size cannot be read.
+ */
+int dev_size_from_id(dev_t id, unsigned long long *size)
+{
+	char devspec[20];
+	int fd, ok;
+
+	sprintf(devspec, "%d:%d", major(id), minor(id));
+	fd = dev_open(devspec, O_RDONLY);
+	if (fd < 0)
+		return 0;
+
+	ok = get_dev_size(fd, NULL, size) ? 1 : 0;
+	close(fd);
+	return ok;
+}
+
+/*
+ * Look up the sector size of the device with number 'id' via
+ * get_dev_sector_size().  Returns 1 and fills *size on success, 0 if
+ * the device cannot be opened or the sector size cannot be read.
+ */
+int dev_sector_size_from_id(dev_t id, unsigned int *size)
+{
+	char devspec[20];
+	int fd, ok;
+
+	sprintf(devspec, "%d:%d", major(id), minor(id));
+	fd = dev_open(devspec, O_RDONLY);
+	if (fd < 0)
+		return 0;
+
+	ok = get_dev_sector_size(fd, NULL, size) ? 1 : 0;
+	close(fd);
+	return ok;
+}
+
+/*
+ * Make a fresh supertype that shares orig's handler and settings (ss,
+ * max_devs, minor_version, ignore_hw_compat, data_offset) but has no
+ * loaded superblock or info.  A NULL 'orig' is passed straight back.
+ */
+struct supertype *dup_super(struct supertype *orig)
+{
+	struct supertype *copy;
+
+	if (orig == NULL)
+		return orig;
+
+	copy = xcalloc(1, sizeof(*copy));
+	copy->sb = NULL;
+	copy->info = NULL;
+	copy->ss = orig->ss;
+	copy->max_devs = orig->max_devs;
+	copy->minor_version = orig->minor_version;
+	copy->ignore_hw_compat = orig->ignore_hw_compat;
+	copy->data_offset = orig->data_offset;
+	return copy;
+}
+
+/*
+ * Try each load_super in superlist to find the best match for 'fd' and
+ * return a supertype for it, or NULL if nothing loads.
+ *
+ * guess_array skips handlers without add_to_super; guess_partitions
+ * skips handlers that have it.  Among the handlers that load, the one
+ * whose info.array.ctime is largest wins (the first on a tie).  The
+ * winner is loaded once more and its superblock freed before the
+ * supertype is returned.
+ */
+struct supertype *guess_super_type(int fd, enum guess_types guess_type)
+{
+	struct supertype *st = xcalloc(1, sizeof(*st));
+	unsigned int newest = 0;
+	int winner = -1;
+	int idx;
+
+	st->container_devnm[0] = 0;
+
+	for (idx = 0; superlist[idx]; idx++) {
+		struct superswitch *ss = superlist[idx];
+		struct mdinfo info;
+		int can_add = (ss->add_to_super != NULL);
+
+		if (guess_type == guess_array && !can_add)
+			continue;
+		if (guess_type == guess_partitions && can_add)
+			continue;
+
+		memset(st, 0, sizeof(*st));
+		st->ignore_hw_compat = 1;
+		if (ss->load_super(st, fd, NULL) != 0)
+			continue;
+
+		st->ss->getinfo_super(st, &info, NULL);
+		if (winner == -1 || newest < info.array.ctime) {
+			winner = idx;
+			newest = info.array.ctime;
+		}
+		ss->free_super(st);
+	}
+
+	if (winner == -1) {
+		free(st);
+		return NULL;
+	}
+
+	/* load again with the winning handler so 'st' describes it */
+	memset(st, 0, sizeof(*st));
+	st->ignore_hw_compat = 1;
+	if (superlist[winner]->load_super(st, fd, NULL) == 0) {
+		superlist[winner]->free_super(st);
+		return st;
+	}
+
+	free(st);
+	return NULL;
+}
+
+/*
+ * Return size of device in bytes.
+ * Regular files report st_size; otherwise BLKGETSIZE64 is tried (when
+ * available at build time) and then BLKGETSIZE, whose result is shifted
+ * left by 9.  Returns 1 with *sizep filled in, or 0 on failure, in
+ * which case an error is printed if 'dname' is not NULL.
+ */
+int get_dev_size(int fd, char *dname, unsigned long long *sizep)
+{
+	unsigned long long bytes;
+	unsigned long sectors;
+	struct stat st;
+
+	if (fstat(fd, &st) != -1 && S_ISREG(st.st_mode)) {
+		*sizep = (unsigned long long)st.st_size;
+		return 1;
+	}
+
+#ifdef BLKGETSIZE64
+	if (ioctl(fd, BLKGETSIZE64, &bytes) == 0) {
+		*sizep = bytes;
+		return 1;
+	}
+#endif
+
+	if (ioctl(fd, BLKGETSIZE, &sectors) != 0) {
+		if (dname)
+			pr_err("Cannot get size of %s: %s\n",
+			       dname, strerror(errno));
+		return 0;
+	}
+
+	bytes = sectors;
+	bytes <<= 9;
+	*sizep = bytes;
+	return 1;
+}
+
+/*
+ * Return sector size of device in bytes, as reported by BLKSSZGET.
+ * Returns 1 with *sectsizep filled in, or 0 on failure, in which case
+ * an error is printed if 'dname' is not NULL.
+ */
+int get_dev_sector_size(int fd, char *dname, unsigned int *sectsizep)
+{
+	unsigned int bytes;
+
+	if (ioctl(fd, BLKSSZGET, &bytes) == 0) {
+		*sectsizep = bytes;
+		return 1;
+	}
+
+	if (dname)
+		pr_err("Cannot get sector size of %s: %s\n",
+		       dname, strerror(errno));
+	return 0;
+}
+
+/* Return true if this can only be a container, not a member device,
+ * i.e. it is an md device (sysfs_read() succeeds) and its size is zero
+ * or cannot be determined.
+ */
+int must_be_container(int fd)
+{
+	unsigned long long size;
+	struct mdinfo *mdi = sysfs_read(fd, NULL, GET_VERSION);
+
+	if (mdi == NULL)
+		return 0;
+	sysfs_free(mdi);
+
+	return get_dev_size(fd, NULL, &size) == 0 || size == 0;
+}
+
+/* Sets endofpart parameter to the last block used by the last GPT partition on the device.
+ * The GPT header is read from the second sector and the partition
+ * entries from the third sector onwards.
+ * Returns: 1 if successful
+ * -1 for unknown partition type (bad signature or implausible header)
+ * 0 for other errors (seek or read failure)
+ */
+static int get_gpt_last_partition_end(int fd, unsigned long long *endofpart)
+{
+	struct GPT gpt;
+	unsigned char empty_gpt_entry[16]= {0};
+	struct GPT_part_entry *part;
+	char buf[512];
+	unsigned long long curr_part_end;
+	unsigned all_partitions, entry_size;
+	unsigned part_nr;
+	unsigned int sector_size = 0;
+
+	*endofpart = 0;
+
+	BUILD_BUG_ON(sizeof(gpt) != 512);
+	/* skip protective MBR */
+	if (!get_dev_sector_size(fd, NULL, &sector_size))
+		return 0;
+	if (lseek(fd, sector_size, SEEK_SET) < 0)
+		return 0;
+	/* read GPT header */
+	if (read(fd, &gpt, 512) != 512)
+		return 0;
+
+	/* Check GPT signature before trusting any other header field */
+	if (gpt.magic != GPT_SIGNATURE_MAGIC)
+		return -1;
+
+	/* get the number of partition entries and the entry size */
+	all_partitions = __le32_to_cpu(gpt.part_cnt);
+	entry_size = __le32_to_cpu(gpt.part_size);
+
+	/* sanity checks: an entry must fit in buf and must be big enough
+	 * to hold the fields examined below, otherwise stale or
+	 * uninitialised bytes of buf would be interpreted
+	 */
+	if (all_partitions > 1024 ||
+	    entry_size > sizeof(buf) ||
+	    entry_size < sizeof(*part))
+		return -1;
+
+	part = (struct GPT_part_entry *)buf;
+
+	/* set offset to third block (GPT entries) */
+	if (lseek(fd, sector_size*2, SEEK_SET) < 0)
+		return 0;
+	for (part_nr = 0; part_nr < all_partitions; part_nr++) {
+		/* read partition entry */
+		if (read(fd, buf, entry_size) != (ssize_t)entry_size)
+			return 0;
+
+		/* is this valid partition? */
+		if (memcmp(part->type_guid, empty_gpt_entry, 16) != 0) {
+			/* check the last lba for the current partition */
+			curr_part_end = __le64_to_cpu(part->ending_lba);
+			if (curr_part_end > *endofpart)
+				*endofpart = curr_part_end;
+		}
+
+	}
+	return 1;
+}
+
+/* Sets endofpart parameter to the last block used by the last partition on the device.
+ * Returns: 1 if successful
+ * -1 for unknown partition type
+ * 0 for other errors
+ */
+static int get_last_partition_end(int fd, unsigned long long *endofpart)
+{
+ struct MBR boot_sect;
+ unsigned long long curr_part_end;
+ unsigned part_nr;
+ unsigned int sector_size;
+ int retval = 0;
+
+ *endofpart = 0;
+
+ BUILD_BUG_ON(sizeof(boot_sect) != 512);
+ /* read MBR */
+ lseek(fd, 0, 0);
+ if (read(fd, &boot_sect, 512) != 512)
+ goto abort;
+
+ /* check MBP signature */
+ if (boot_sect.magic == MBR_SIGNATURE_MAGIC) {
+ retval = 1;
+ /* found the correct signature */
+
+ for (part_nr = 0; part_nr < MBR_PARTITIONS; part_nr++) {
+ /*
+ * Have to make every access through boot_sect rather
+ * than using a pointer to the partition table (or an
+ * entry), since the entries are not properly aligned.
+ */
+
+ /* check for GPT type */
+ if (boot_sect.parts[part_nr].part_type ==
+ MBR_GPT_PARTITION_TYPE) {
+ retval = get_gpt_last_partition_end(fd, endofpart);
+ break;
+ }
+ /* check the last used lba for the current partition */
+ curr_part_end =
+ __le32_to_cpu(boot_sect.parts[part_nr].first_sect_lba) +
+ __le32_to_cpu(boot_sect.parts[part_nr].blocks_num);
+ if (curr_part_end > *endofpart)
+ *endofpart = curr_part_end;
+ }
+ } else {
+ /* Unknown partition table */
+ retval = -1;
+ }
+ /* calculate number of 512-byte blocks */
+ if (get_dev_sector_size(fd, NULL, &sector_size))
+ *endofpart *= (sector_size / 512);
+ abort:
+ return retval;
+}
+
/*
 * Check where the last partition on 'fd' ends and warn if creating an
 * array on the device would damage the partition table.
 *
 * freesize: compared against the end of the last partition; 0 means
 *           the partitions will not be visible at all in the new device.
 * size:     array size, or 0 to skip the truncation check.
 *
 * Returns 1 (after reporting the problem) if a conflict was found,
 * 0 otherwise, including when no partition table was recognised.
 */
int check_partitions(int fd, char *dname, unsigned long long freesize,
		     unsigned long long size)
{
	unsigned long long last_end;

	if (get_last_partition_end(fd, &last_end) <= 0)
		return 0;

	/* There appears to be a partition table here */
	if (freesize == 0) {
		/* partitions will not be visible in new device */
		pr_err("partition table exists on %s but will be lost or\n"
		       " meaningless after creating array\n",
		       dname);
		return 1;
	}
	if (last_end > freesize) {
		/* last partition overlaps metadata */
		pr_err("metadata will over-write last partition on %s.\n",
		       dname);
		return 1;
	}
	if (size && last_end > size) {
		/* partitions will be truncated in new device */
		pr_err("array size is too small to cover all partitions on %s.\n",
		       dname);
		return 1;
	}
	return 0;
}
+
/*
 * 'fd' is a block device.  Find out if it is in use by a container,
 * and return an open fd on that container.
 *
 * The holders of the device are looked up in sysfs; the first holder
 * whose metadata_version is "external:" followed by something other
 * than '/' (i.e. a container, not a volume) and which can be opened is
 * returned.
 *
 * Returns an open read-only descriptor, or -1 if none was found.
 */
int open_container(int fd)
{
	char path[288];
	char *e;
	size_t room;
	DIR *dir;
	struct dirent *de;
	int dfd, n;
	char buf[200];
	int holder_major, holder_minor;
	struct stat st;

	if (fstat(fd, &st) != 0)
		return -1;
	n = snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/holders",
		     (int)major(st.st_rdev), (int)minor(st.st_rdev));
	if (n < 0 || (size_t)n >= sizeof(path))
		return -1;
	e = path + n;
	room = sizeof(path) - (size_t)n;

	dir = opendir(path);
	if (!dir)
		return -1;
	while ((de = readdir(dir))) {
		if (de->d_ino == 0)
			continue;
		if (de->d_name[0] == '.')
			continue;
		/* Need to make sure it is a container and not a volume */
		n = snprintf(e, room, "/%s/md/metadata_version", de->d_name);
		if (n < 0 || (size_t)n >= room)
			continue;	/* name does not fit; cannot inspect */
		dfd = open(path, O_RDONLY);
		if (dfd < 0)
			continue;
		n = read(dfd, buf, sizeof(buf));
		close(dfd);
		if (n <= 0 || (unsigned)n >= sizeof(buf))
			continue;
		buf[n] = 0;
		/* buf[9] is the first character after "external:" */
		if (strncmp(buf, "external", 8) != 0 ||
		    n < 10 ||
		    buf[9] == '/')
			continue;
		n = snprintf(e, room, "/%s/dev", de->d_name);
		if (n < 0 || (size_t)n >= room)
			continue;
		dfd = open(path, O_RDONLY);
		if (dfd < 0)
			continue;
		n = read(dfd, buf, sizeof(buf));
		close(dfd);
		if (n <= 0 || (unsigned)n >= sizeof(buf))
			continue;
		buf[n] = 0;
		if (sscanf(buf, "%d:%d", &holder_major, &holder_minor) != 2)
			continue;
		snprintf(buf, sizeof(buf), "%d:%d", holder_major, holder_minor);
		dfd = dev_open(buf, O_RDONLY);
		if (dfd >= 0) {
			closedir(dir);
			return dfd;
		}
	}
	closedir(dir);
	return -1;
}
+
+struct superswitch *version_to_superswitch(char *vers)
+{
+ int i;
+
+ for (i = 0; superlist[i]; i++) {
+ struct superswitch *ss = superlist[i];
+
+ if (strcmp(vers, ss->name) == 0)
+ return ss;
+ }
+
+ return NULL;
+}
+
/*
 * Check if 'devnm' is the container named in 'metadata', which has
 * the form
 *    /containername/componentname  or
 *    -containername/componentname
 *
 * Returns 1 on a match, 0 otherwise.
 */
int metadata_container_matches(char *metadata, char *devnm)
{
	char *rest;
	int namelen;

	switch (metadata[0]) {
	case '/':
	case '-':
		break;
	default:
		return 0;
	}

	rest = metadata + 1;
	namelen = strlen(devnm);

	/* the container name must be followed directly by a '/' */
	return strncmp(rest, devnm, namelen) == 0 && rest[namelen] == '/';
}
+
/*
 * Check if 'devnm' is the subdev named in 'metadata', which has
 * the form
 *    /containername/subdev  or
 *    -containername/subdev
 *
 * Returns 1 on a match, 0 otherwise.
 */
int metadata_subdev_matches(char *metadata, char *devnm)
{
	char *sep;

	if (metadata[0] != '/' && metadata[0] != '-')
		return 0;

	/* the subdev is everything after the slash ending the container name */
	sep = strchr(metadata + 1, '/');
	return sep != NULL && strcmp(sep + 1, devnm) == 0;
}
+
+int is_container_member(struct mdstat_ent *mdstat, char *container)
+{
+ if (mdstat->metadata_version == NULL ||
+ strncmp(mdstat->metadata_version, "external:", 9) != 0 ||
+ !metadata_container_matches(mdstat->metadata_version+9, container))
+ return 0;
+
+ return 1;
+}
+
+int is_subarray_active(char *subarray, char *container)
+{
+ struct mdstat_ent *mdstat = mdstat_read(0, 0);
+ struct mdstat_ent *ent;
+
+ for (ent = mdstat; ent; ent = ent->next)
+ if (is_container_member(ent, container))
+ if (strcmp(to_subarray(ent, container), subarray) == 0)
+ break;
+
+ free_mdstat(mdstat);
+
+ return ent != NULL;
+}
+
+/* open_subarray - opens a subarray in a container
+ * @dev: container device name
+ * @st: empty supertype
+ * @quiet: block reporting errors flag
+ *
+ * On success returns an fd to a container and fills in *st
+ */
+int open_subarray(char *dev, char *subarray, struct supertype *st, int quiet)
+{
+ struct mdinfo *mdi;
+ struct mdinfo *info;
+ int fd, err = 1;
+ char *_devnm;
+
+ fd = open(dev, O_RDWR|O_EXCL);
+ if (fd < 0) {
+ if (!quiet)
+ pr_err("Couldn't open %s, aborting\n",
+ dev);
+ return -1;
+ }
+
+ _devnm = fd2devnm(fd);
+ if (_devnm == NULL) {
+ if (!quiet)
+ pr_err("Failed to determine device number for %s\n",
+ dev);
+ goto close_fd;
+ }
+ strcpy(st->devnm, _devnm);
+
+ mdi = sysfs_read(fd, st->devnm, GET_VERSION|GET_LEVEL);
+ if (!mdi) {
+ if (!quiet)
+ pr_err("Failed to read sysfs for %s\n",
+ dev);
+ goto close_fd;
+ }
+
+ if (mdi->array.level != UnSet) {
+ if (!quiet)
+ pr_err("%s is not a container\n", dev);
+ goto free_sysfs;
+ }
+
+ st->ss = version_to_superswitch(mdi->text_version);
+ if (!st->ss) {
+ if (!quiet)
+ pr_err("Operation not supported for %s metadata\n",
+ mdi->text_version);
+ goto free_sysfs;
+ }
+
+ if (st->devnm[0] == 0) {
+ if (!quiet)
+ pr_err("Failed to allocate device name\n");
+ goto free_sysfs;
+ }
+
+ if (!st->ss->load_container) {
+ if (!quiet)
+ pr_err("%s is not a container\n", dev);
+ goto free_sysfs;
+ }
+
+ if (st->ss->load_container(st, fd, NULL)) {
+ if (!quiet)
+ pr_err("Failed to load metadata for %s\n",
+ dev);
+ goto free_sysfs;
+ }
+
+ info = st->ss->container_content(st, subarray);
+ if (!info) {
+ if (!quiet)
+ pr_err("Failed to find subarray-%s in %s\n",
+ subarray, dev);
+ goto free_super;
+ }
+ free(info);
+
+ err = 0;
+
+ free_super:
+ if (err)
+ st->ss->free_super(st);
+ free_sysfs:
+ sysfs_free(mdi);
+ close_fd:
+ if (err)
+ close(fd);
+
+ if (err)
+ return -1;
+ else
+ return fd;
+}
+
+int add_disk(int mdfd, struct supertype *st,
+ struct mdinfo *sra, struct mdinfo *info)
+{
+ /* Add a device to an array, in one of 2 ways. */
+ int rv;
+
+ if (st->ss->external) {
+ if (info->disk.state & (1<<MD_DISK_SYNC))
+ info->recovery_start = MaxSector;
+ else
+ info->recovery_start = 0;
+ rv = sysfs_add_disk(sra, info, 0);
+ if (! rv) {
+ struct mdinfo *sd2;
+ for (sd2 = sra->devs; sd2; sd2=sd2->next)
+ if (sd2 == info)
+ break;
+ if (sd2 == NULL) {
+ sd2 = xmalloc(sizeof(*sd2));
+ *sd2 = *info;
+ sd2->next = sra->devs;
+ sra->devs = sd2;
+ }
+ }
+ } else
+ rv = ioctl(mdfd, ADD_NEW_DISK, &info->disk);
+ return rv;
+}
+
+int remove_disk(int mdfd, struct supertype *st,
+ struct mdinfo *sra, struct mdinfo *info)
+{
+ int rv;
+
+ /* Remove the disk given by 'info' from the array */
+ if (st->ss->external)
+ rv = sysfs_set_str(sra, info, "slot", "none");
+ else
+ rv = ioctl(mdfd, HOT_REMOVE_DISK, makedev(info->disk.major,
+ info->disk.minor));
+ return rv;
+}
+
+int hot_remove_disk(int mdfd, unsigned long dev, int force)
+{
+ int cnt = force ? 500 : 5;
+ int ret;
+
+ /* HOT_REMOVE_DISK can fail with EBUSY if there are
+ * outstanding IO requests to the device.
+ * In this case, it can be helpful to wait a little while,
+ * up to 5 seconds if 'force' is set, or 50 msec if not.
+ */
+ while ((ret = ioctl(mdfd, HOT_REMOVE_DISK, dev)) == -1 &&
+ errno == EBUSY &&
+ cnt-- > 0)
+ usleep(10000);
+
+ return ret;
+}
+
/*
 * Write "remove" to the descriptor 'statefd', retrying every 10 msec
 * while the write fails with EBUSY (500 retries if 'force' is set,
 * 5 otherwise).
 *
 * Returns 0 if all 6 bytes were written, -1 otherwise.
 */
int sys_hot_remove_disk(int statefd, int force)
{
	int retries = force ? 500 : 5;
	int ret;

	for (;;) {
		ret = write(statefd, "remove", 6);
		if (ret != -1 || errno != EBUSY)
			break;
		if (retries-- <= 0)
			break;
		usleep(10000);
	}

	if (ret == 6)
		return 0;
	return -1;
}
+
+int set_array_info(int mdfd, struct supertype *st, struct mdinfo *info)
+{
+ /* Initialise kernel's knowledge of array.
+ * This varies between externally managed arrays
+ * and older kernels
+ */
+ mdu_array_info_t inf;
+ int rv;
+
+ if (st->ss->external)
+ return sysfs_set_array(info, 9003);
+
+ memset(&inf, 0, sizeof(inf));
+ inf.major_version = info->array.major_version;
+ inf.minor_version = info->array.minor_version;
+ rv = md_set_array_info(mdfd, &inf);
+
+ return rv;
+}
+
+unsigned long long min_recovery_start(struct mdinfo *array)
+{
+ /* find the minimum recovery_start in an array for metadata
+ * formats that only record per-array recovery progress instead
+ * of per-device
+ */
+ unsigned long long recovery_start = MaxSector;
+ struct mdinfo *d;
+
+ for (d = array->devs; d; d = d->next)
+ recovery_start = min(recovery_start, d->recovery_start);
+
+ return recovery_start;
+}
+
+int mdmon_pid(char *devnm)
+{
+ char path[100];
+ char pid[10];
+ int fd;
+ int n;
+
+ sprintf(path, "%s/%s.pid", MDMON_DIR, devnm);
+
+ fd = open(path, O_RDONLY | O_NOATIME, 0);
+
+ if (fd < 0)
+ return -1;
+ n = read(fd, pid, 9);
+ close(fd);
+ if (n <= 0)
+ return -1;
+ return atoi(pid);
+}
+
/*
 * Return 1 if a pid is recorded for 'devnm' and signal 0 can be
 * delivered to that process, 0 otherwise.
 */
int mdmon_running(char *devnm)
{
	int pid = mdmon_pid(devnm);

	return pid > 0 && kill(pid, 0) == 0;
}
+
+int start_mdmon(char *devnm)
+{
+ int i;
+ int len;
+ pid_t pid;
+ int status;
+ char pathbuf[1024];
+ char *paths[4] = {
+ pathbuf,
+ BINDIR "/mdmon",
+ "./mdmon",
+ NULL
+ };
+
+ if (check_env("MDADM_NO_MDMON"))
+ return 0;
+ if (continue_via_systemd(devnm, MDMON_SERVICE))
+ return 0;
+
+ /* That failed, try running mdmon directly */
+ len = readlink("/proc/self/exe", pathbuf, sizeof(pathbuf)-1);
+ if (len > 0) {
+ char *sl;
+ pathbuf[len] = 0;
+ sl = strrchr(pathbuf, '/');
+ if (sl)
+ sl++;
+ else
+ sl = pathbuf;
+ strcpy(sl, "mdmon");
+ } else
+ pathbuf[0] = '\0';
+
+ switch(fork()) {
+ case 0:
+ manage_fork_fds(1);
+ for (i = 0; paths[i]; i++)
+ if (paths[i][0]) {
+ execl(paths[i], paths[i],
+ devnm, NULL);
+ }
+ exit(1);
+ case -1: pr_err("cannot run mdmon. Array remains readonly\n");
+ return -1;
+ default: /* parent - good */
+ pid = wait(&status);
+ if (pid < 0 || status != 0) {
+ pr_err("failed to launch mdmon. Array remains readonly\n");
+ return -1;
+ }
+ }
+ return 0;
+}
+
+__u32 random32(void)
+{
+ __u32 rv;
+ int rfd = open("/dev/urandom", O_RDONLY);
+ if (rfd < 0 || read(rfd, &rv, 4) != 4)
+ rv = random();
+ if (rfd >= 0)
+ close(rfd);
+ return rv;
+}
+
+void random_uuid(__u8 *buf)
+{
+ int fd, i, len;
+ __u32 r[4];
+
+ fd = open("/dev/urandom", O_RDONLY);
+ if (fd < 0)
+ goto use_random;
+ len = read(fd, buf, 16);
+ close(fd);
+ if (len != 16)
+ goto use_random;
+
+ return;
+
+use_random:
+ for (i = 0; i < 4; i++)
+ r[i] = random();
+ memcpy(buf, r, 16);
+}
+
+int flush_metadata_updates(struct supertype *st)
+{
+ int sfd;
+ if (!st->updates) {
+ st->update_tail = NULL;
+ return -1;
+ }
+
+ sfd = connect_monitor(st->container_devnm);
+ if (sfd < 0)
+ return -1;
+
+ while (st->updates) {
+ struct metadata_update *mu = st->updates;
+ st->updates = mu->next;
+
+ send_message(sfd, mu, 0);
+ wait_reply(sfd, 0);
+ free(mu->buf);
+ free(mu);
+ }
+ ack(sfd, 0);
+ wait_reply(sfd, 0);
+ close(sfd);
+ st->update_tail = NULL;
+ return 0;
+}
+
+void append_metadata_update(struct supertype *st, void *buf, int len)
+{
+
+ struct metadata_update *mu = xmalloc(sizeof(*mu));
+
+ mu->buf = buf;
+ mu->len = len;
+ mu->space = NULL;
+ mu->space_list = NULL;
+ mu->next = NULL;
+ *st->update_tail = mu;
+ st->update_tail = &mu->next;
+}
+
#ifdef __TINYC__
/*
 * tinyc doesn't optimize the size check in ioctl.h out, so the symbol
 * that check refers to has to exist at link time.
 */
unsigned int __invalid_size_argument_for_IOC = 0;
#endif
+
+/* Pick all spares matching given criteria from a container
+ * if min_size == 0 do not check size
+ * if domlist == NULL do not check domains
+ * if spare_group given add it to domains of each spare
+ * metadata allows to test domains using metadata of destination array */
+struct mdinfo *container_choose_spares(struct supertype *st,
+ struct spare_criteria *criteria,
+ struct domainlist *domlist,
+ char *spare_group,
+ const char *metadata, int get_one)
+{
+ struct mdinfo *d, **dp, *disks = NULL;
+
+ /* get list of all disks in container */
+ if (st->ss->getinfo_super_disks)
+ disks = st->ss->getinfo_super_disks(st);
+
+ if (!disks)
+ return disks;
+ /* find spare devices on the list */
+ dp = &disks->devs;
+ disks->array.spare_disks = 0;
+ while (*dp) {
+ int found = 0;
+ d = *dp;
+ if (d->disk.state == 0) {
+ /* check if size is acceptable */
+ unsigned long long dev_size;
+ unsigned int dev_sector_size;
+ int size_valid = 0;
+ int sector_size_valid = 0;
+
+ dev_t dev = makedev(d->disk.major,d->disk.minor);
+
+ if (!criteria->min_size ||
+ (dev_size_from_id(dev, &dev_size) &&
+ dev_size >= criteria->min_size))
+ size_valid = 1;
+
+ if (!criteria->sector_size ||
+ (dev_sector_size_from_id(dev, &dev_sector_size) &&
+ criteria->sector_size == dev_sector_size))
+ sector_size_valid = 1;
+
+ found = size_valid && sector_size_valid;
+
+ /* check if domain matches */
+ if (found && domlist) {
+ struct dev_policy *pol = devid_policy(dev);
+ if (spare_group)
+ pol_add(&pol, pol_domain,
+ spare_group, NULL);
+ if (domain_test(domlist, pol, metadata) != 1)
+ found = 0;
+ dev_policy_free(pol);
+ }
+ }
+ if (found) {
+ dp = &d->next;
+ disks->array.spare_disks++;
+ if (get_one) {
+ sysfs_free(*dp);
+ d->next = NULL;
+ }
+ } else {
+ *dp = d->next;
+ d->next = NULL;
+ sysfs_free(d);
+ }
+ }
+ return disks;
+}
+
/* Checks if paths point to the same device (same inode on the same
 * filesystem, as reported by stat()).
 * Returns 0 if they do.
 * Returns 1 if they don't.
 * Returns -1 if something went wrong,
 * e.g. a path is NULL or the files
 * they point to don't exist */
int compare_paths (char* path1, char* path2)
{
	struct stat first, second;

	if (!path1 || !path2)
		return -1;
	if (stat(path1, &first) != 0 || stat(path2, &second) != 0)
		return -1;

	return (first.st_ino == second.st_ino &&
		first.st_dev == second.st_dev) ? 0 : 1;
}
+
/*
 * Make sure we can open as many devices as needed: raise the soft
 * RLIMIT_NOFILE limit to 'devices' + 20 when it is currently lower,
 * raising the hard limit as well if it is below that.  Failures are
 * silently ignored.
 */
void enable_fds(int devices)
{
	struct rlimit lim;
	unsigned int wanted = 20 + devices;

	if (getrlimit(RLIMIT_NOFILE, &lim) != 0)
		return;
	if (lim.rlim_cur >= wanted)
		return;

	lim.rlim_cur = wanted;
	if (lim.rlim_max < wanted)
		lim.rlim_max = wanted;
	setrlimit(RLIMIT_NOFILE, &lim);
}
+
/* Redirect the standard streams to /dev/null and, if requested, close
 * every other open descriptor.  Meant to be called in a forked child.
 *
 * stdin is always re-opened on /dev/null.  Unless built with DEBUG,
 * stdout and stderr are redirected there too (for debug purposes they
 * are left untouched).
 *
 * @close_all: when non-zero, close all descriptors above 2 that are
 *             listed in /proc/self/fd.
 *
 * Nothing is returned; failure to open /proc/self/fd is reported with
 * pr_err() and the remaining descriptors are left open.
 */
void manage_fork_fds(int close_all)
{
	DIR *dir;
	struct dirent *dirent;
	int dir_fd;

	close(0);
	open("/dev/null", O_RDWR);

#ifndef DEBUG
	dup2(0, 1);
	dup2(0, 2);
#endif

	if (close_all == 0)
		return;

	dir = opendir("/proc/self/fd");
	if (!dir) {
		pr_err("Cannot open /proc/self/fd directory.\n");
		return;
	}
	/*
	 * The directory stream has a descriptor of its own which shows up
	 * in the listing; it must stay open until the scan is finished.
	 */
	dir_fd = dirfd(dir);
	for (dirent = readdir(dir); dirent; dirent = readdir(dir)) {
		int fd = -1;

		if ((strcmp(dirent->d_name, ".") == 0) ||
		    (strcmp(dirent->d_name, "..")) == 0)
			continue;

		fd = strtol(dirent->d_name, NULL, 10);
		if (fd > 2 && fd != dir_fd)
			close(fd);
	}
	closedir(dir);
}
+
/* In a systemd/udev world, it is best to get systemd to
 * run daemon rather than running in the background.
 *
 * Runs "systemctl restart <service_name>@<devnm>.service" in a child
 * process (trying /usr/bin/systemctl, then /bin/systemctl) unless
 * MDADM_NO_SYSTEMCTL is set in the environment.
 *
 * Returns:
 *	1- if systemd service has been started
 *	0- otherwise
 */
int continue_via_systemd(char *devnm, char *service_name)
{
	pid_t child;
	int status;
	char pathbuf[1024];

	/* Simply return that service cannot be started */
	if (check_env("MDADM_NO_SYSTEMCTL"))
		return 0;

	child = fork();
	switch (child) {
	case 0:
		manage_fork_fds(1);
		snprintf(pathbuf, sizeof(pathbuf),
			 "%s@%s.service", service_name, devnm);
		/* execl() only returns on failure; then try the next path */
		execl("/usr/bin/systemctl", "systemctl", "restart",
		      pathbuf, NULL);
		execl("/bin/systemctl", "systemctl", "restart",
		      pathbuf, NULL);
		exit(1);
	case -1: /* Just do it ourselves. */
		break;
	default: /* parent - good */
		/* wait for this child specifically, not any child */
		if (waitpid(child, &status, 0) >= 0 && status == 0)
			return 1;
	}
	return 0;
}
+
/*
 * Return non-zero if the root filesystem is tmpfs or ramfs.
 * This is based on similar function in systemd.
 */
int in_initrd(void)
{
	struct statfs rootfs;
	unsigned long fstype;

	if (statfs("/", &rootfs) < 0)
		return 0;

	/* statfs.f_type is signed long on s390x and MIPS, causing all
	   sorts of sign extension problems with RAMFS_MAGIC being
	   defined as 0x858458f6; compare the low 32 bits only */
	fstype = (unsigned long)rootfs.f_type;
	if (fstype == TMPFS_MAGIC)
		return 1;
	return (fstype & 0xFFFFFFFFUL) ==
		((unsigned long)RAMFS_MAGIC & 0xFFFFFFFFUL);
}
+
/*
 * Re-open the md device behind 'mdfd' without any O_EXCL, but keep
 * the same fd number.
 *
 * If the device name cannot be determined the descriptor is left
 * untouched.  If re-opening fails, 'mdfd' ends up closed.
 */
void reopen_mddev(int mdfd)
{
	char *devnm;
	int fd;

	devnm = fd2devnm(mdfd);
	if (!devnm)
		return;
	close(mdfd);
	fd = open_dev(devnm);
	if (fd >= 0 && fd != mdfd) {
		dup2(fd, mdfd);
		/* mdfd now refers to the device; drop the temporary fd */
		close(fd);
	}
}
+
+static struct cmap_hooks *cmap_hooks = NULL;
+static int is_cmap_hooks_ready = 0;
+
+void set_cmap_hooks(void)
+{
+ cmap_hooks = xmalloc(sizeof(struct cmap_hooks));
+ cmap_hooks->cmap_handle = dlopen("libcmap.so.4", RTLD_NOW | RTLD_LOCAL);
+ if (!cmap_hooks->cmap_handle)
+ return;
+
+ cmap_hooks->initialize =
+ dlsym(cmap_hooks->cmap_handle, "cmap_initialize");
+ cmap_hooks->get_string =
+ dlsym(cmap_hooks->cmap_handle, "cmap_get_string");
+ cmap_hooks->finalize = dlsym(cmap_hooks->cmap_handle, "cmap_finalize");
+
+ if (!cmap_hooks->initialize || !cmap_hooks->get_string ||
+ !cmap_hooks->finalize)
+ dlclose(cmap_hooks->cmap_handle);
+ else
+ is_cmap_hooks_ready = 1;
+}
+
+int get_cluster_name(char **cluster_name)
+{
+ int rv = -1;
+ cmap_handle_t handle;
+
+ if (!is_cmap_hooks_ready)
+ return rv;
+
+ rv = cmap_hooks->initialize(&handle);
+ if (rv != CS_OK)
+ goto out;
+
+ rv = cmap_hooks->get_string(handle, "totem.cluster_name", cluster_name);
+ if (rv != CS_OK) {
+ free(*cluster_name);
+ rv = -1;
+ goto name_err;
+ }
+
+ rv = 0;
+name_err:
+ cmap_hooks->finalize(handle);
+out:
+ return rv;
+}
+
+void set_dlm_hooks(void)
+{
+ dlm_hooks = xmalloc(sizeof(struct dlm_hooks));
+ dlm_hooks->dlm_handle = dlopen("libdlm_lt.so.3", RTLD_NOW | RTLD_LOCAL);
+ if (!dlm_hooks->dlm_handle)
+ return;
+
+ dlm_hooks->open_lockspace =
+ dlsym(dlm_hooks->dlm_handle, "dlm_open_lockspace");
+ dlm_hooks->create_lockspace =
+ dlsym(dlm_hooks->dlm_handle, "dlm_create_lockspace");
+ dlm_hooks->release_lockspace =
+ dlsym(dlm_hooks->dlm_handle, "dlm_release_lockspace");
+ dlm_hooks->ls_lock = dlsym(dlm_hooks->dlm_handle, "dlm_ls_lock");
+ dlm_hooks->ls_unlock_wait =
+ dlsym(dlm_hooks->dlm_handle, "dlm_ls_unlock_wait");
+ dlm_hooks->ls_get_fd = dlsym(dlm_hooks->dlm_handle, "dlm_ls_get_fd");
+ dlm_hooks->dispatch = dlsym(dlm_hooks->dlm_handle, "dlm_dispatch");
+
+ if (!dlm_hooks->open_lockspace || !dlm_hooks->create_lockspace ||
+ !dlm_hooks->ls_lock || !dlm_hooks->ls_unlock_wait ||
+ !dlm_hooks->release_lockspace || !dlm_hooks->ls_get_fd ||
+ !dlm_hooks->dispatch)
+ dlclose(dlm_hooks->dlm_handle);
+ else
+ is_dlm_hooks_ready = 1;
+}
+
/* Resolve the optional run-time library hooks: DLM first, then cmap. */
void set_hooks(void)
{
	set_dlm_hooks();
	set_cmap_hooks();
}
+
/*
 * Overwrite 'count' 512-byte sectors of 'fd', starting at 'sector',
 * with zeros.  The zeros come from a private mapping of /dev/zero.
 *
 * Returns 0 on success, -1 if /dev/zero cannot be opened, or a negative
 * errno value if seeking, mapping or writing fails.  A write that makes
 * no progress is reported as -EIO instead of being retried forever.
 */
int zero_disk_range(int fd, unsigned long long sector, size_t count)
{
	int ret = 0;
	int fd_zero;
	char *addr;
	size_t written = 0;
	size_t len = count * 512;
	ssize_t n;

	fd_zero = open("/dev/zero", O_RDONLY);
	if (fd_zero < 0) {
		pr_err("Cannot open /dev/zero\n");
		return -1;
	}

	if (lseek64(fd, sector * 512, SEEK_SET) < 0) {
		ret = -errno;
		pr_err("Failed to seek offset for zeroing\n");
		goto out;
	}

	addr = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd_zero, 0);

	if (addr == MAP_FAILED) {
		ret = -errno;
		pr_err("Mapping /dev/zero failed\n");
		goto out;
	}

	while (written < len) {
		n = write(fd, addr + written, len - written);
		if (n < 0) {
			if (errno == EINTR)
				continue;
			ret = -errno;
			pr_err("Zeroing disk range failed\n");
			break;
		}
		if (n == 0) {
			/* no progress and no error: give up, don't spin */
			ret = -EIO;
			pr_err("Zeroing disk range failed\n");
			break;
		}
		written += n;
	}

	munmap(addr, len);

out:
	close(fd_zero);
	return ret;
}
diff --git a/uuid.c b/uuid.c
new file mode 100644
index 0000000..94b5abd
--- /dev/null
+++ b/uuid.c
@@ -0,0 +1,112 @@
+/*
+ * mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2001-2013 Neil Brown <neilb@suse.de>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email: <neilb@suse.de>
+ */
+
+#include <string.h>
+
/* The all-zero uuid, in the same 4 x int layout the functions below use */
const int uuid_zero[4] = { 0 };
+
/*
 * Compare two 128-bit uuids held as 4 ints each.
 *
 * Parsed uuids are host-endian while uuids from some superblocks are
 * big-endian; when 'swapuuid' is set each 32-bit word of 'b' is
 * compared byte-reversed against the matching word of 'a'.
 *
 * Returns 1 if the uuids are the same, 0 otherwise.
 */
int same_uuid(int a[4], int b[4], int swapuuid)
{
	const unsigned char *abytes = (const unsigned char *)a;
	const unsigned char *bbytes = (const unsigned char *)b;
	int word, k;

	if (!swapuuid)
		return memcmp(a, b, 16) == 0;

	for (word = 0; word < 16; word += 4)
		for (k = 0; k < 4; k++)
			if (abytes[word + k] != bbytes[word + 3 - k])
				return 0;
	return 1;
}
+
/*
 * Copy the 16-byte uuid 'b' to 'a'.
 *
 * Parsed uuids are host-endian while uuids from some superblocks are
 * big-endian; when 'swapuuid' is set the bytes of each 32-bit word are
 * reversed while copying.
 */
void copy_uuid(void *a, int b[4], int swapuuid)
{
	unsigned char *dst = (unsigned char *)a;
	unsigned char *src = (unsigned char *)b;
	int i;

	if (!swapuuid) {
		memcpy(a, b, 16);
		return;
	}

	/* (i & ~3) is the start of the word, (i & 3) the byte within it */
	for (i = 0; i < 16; i++)
		dst[i] = src[(i & ~3) + (3 - (i & 3))];
}
+
/*
 * Parse a 128 bit uuid into 4 integers.
 * The format is 32 hex nibbles with optional ':', '.', ' ' or '-'
 * separators anywhere between them.
 * If not exactly 32 hex digits are found, or any other character is
 * encountered, return 0, else return 1.
 *
 * uuid[] is cleared first and filled in as digits are consumed, so on
 * failure it holds whatever was parsed up to that point.
 */
int parse_uuid(char *str, int uuid[4])
{
	/* accumulate unsigned: shifting into the sign bit of an int is UB */
	unsigned int word[4] = { 0, 0, 0, 0 };
	int hit = 0; /* number of Hex digIT */
	int i;
	char c;
	for (i = 0; i < 4; i++)
		uuid[i] = 0;

	while ((c = *str++) != 0) {
		unsigned int n;
		if (c >= '0' && c <= '9')
			n = c-'0';
		else if (c >= 'a' && c <= 'f')
			n = 10 + c - 'a';
		else if (c >= 'A' && c <= 'F')
			n = 10 + c - 'A';
		else if (strchr(":. -", c))
			continue;
		else return 0;

		if (hit<32) {
			word[hit/8] = (word[hit/8] << 4) + n;
			uuid[hit/8] = (int)word[hit/8];
		}
		hit++;
	}
	if (hit == 32)
		return 1;
	return 0;
}
diff --git a/xmalloc.c b/xmalloc.c
new file mode 100644
index 0000000..8b3f78a
--- /dev/null
+++ b/xmalloc.c
@@ -0,0 +1,84 @@
+/* mdadm - manage Linux "md" devices aka RAID arrays.
+ *
+ * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de>
+ *
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ * Author: Neil Brown
+ * Email: <neilb@suse.de>
+ */
+
+#include "mdadm.h"
+/*#include <sys/socket.h>
+#include <sys/utsname.h>
+#include <sys/wait.h>
+#include <sys/un.h>
+#include <ctype.h>
+#include <dirent.h>
+#include <signal.h>
+*/
+
+void *xmalloc(size_t len)
+{
+ void *rv = malloc(len);
+ char *msg;
+ int n;
+ if (rv)
+ return rv;
+ msg = ": memory allocation failure - aborting\n";
+ n = write(2, Name, strlen(Name));
+ n += write(2, msg, strlen(msg));
+ exit(4+!!n);
+}
+
+void *xrealloc(void *ptr, size_t len)
+{
+ void *rv = realloc(ptr, len);
+ char *msg;
+ int n;
+ if (rv)
+ return rv;
+ msg = ": memory allocation failure - aborting\n";
+ n = write(2, Name, strlen(Name));
+ n += write(2, msg, strlen(msg));
+ exit(4+!!n);
+}
+
+void *xcalloc(size_t num, size_t size)
+{
+ void *rv = calloc(num, size);
+ char *msg;
+ int n;
+ if (rv)
+ return rv;
+ msg = ": memory allocation failure - aborting\n";
+ n = write(2, Name, strlen(Name));
+ n += write(2, msg, strlen(msg));
+ exit(4+!!n);
+}
+
+char *xstrdup(const char *str)
+{
+ char *rv = strdup(str);
+ char *msg;
+ int n;
+ if (rv)
+ return rv;
+ msg = ": memory allocation failure - aborting\n";
+ n = write(2, Name, strlen(Name));
+ n += write(2, msg, strlen(msg));
+ exit(4+!!n);
+}