diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 17:42:59 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 17:42:59 +0000 |
commit | 0c7a6eb5ccace1d8e9f7b301f6a61a7d3f016369 (patch) | |
tree | 80a778fbd7bb3c7858cfac572df1cb08cfa4f988 | |
parent | Initial commit. (diff) | |
download | mdadm-0c7a6eb5ccace1d8e9f7b301f6a61a7d3f016369.tar.xz mdadm-0c7a6eb5ccace1d8e9f7b301f6a61a7d3f016369.zip |
Adding upstream version 4.2.upstream/4.2upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
279 files changed, 77998 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..217fe76 --- /dev/null +++ b/.gitignore @@ -0,0 +1,18 @@ +/*.o +/*.man +/*-stamp +/mdadm +/mdadm.8 +/mdadm.udeb +/mdassemble +/mdmon +/swap_super +/test_stripe +/TAGS +/mdadm.O2 +/mdadm.Os +/mdadm.static +/mdassemble.auto +/mdassemble.static +/mdmon.O2 +/raid6check diff --git a/ANNOUNCE-3.0 b/ANNOUNCE-3.0 new file mode 100644 index 0000000..f2d4f84 --- /dev/null +++ b/ANNOUNCE-3.0 @@ -0,0 +1,98 @@ +Subject: ANNOUNCE: mdadm 3.0 - A tool for managing Soft RAID under Linux + +I am pleased to (finally) announce the availability of + mdadm version 3.0 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git?p=mdadm + + +This is a major new version and as such should be treated with some +caution. However it has seen substantial testing and is considerred +to be ready for wide use. + + +The significant change which justifies the new major version number is +that mdadm can now handle metadata updates entirely in userspace. +This allows mdadm to support metadata formats that the kernel knows +nothing about. + +Currently two such metadata formats are supported: + - DDF - The SNIA standard format + - Intel Matrix - The metadata used by recent Intel ICH controlers. + +Also the approach to device names has changed significantly. + +If udev is installed on the system, mdadm will not create any devices +in /dev. Rather it allows udev to manage those devices. For this to work +as expected, the included udev rules file should be installed. + +If udev is not installed, mdadm will still create devices and symlinks +as required, and will also remove them when the array is stopped. + +mdadm now requires all devices which do not have a standard name (mdX +or md_dX) to live in the directory /dev/md/. 
Names in this directory +will always be created as symlinks back to the standard name in /dev. + +The man pages contain some information about the new externally managed +metadata. However see below for a more condensed overview. + +Externally managed metadata introduces the concept of a 'container'. +A container is a collection of (normally) physical devices which have +a common set of metadata. A container is assembled as an md array, but +is left 'inactive'. + +A container can contain one or more data arrays. These are composed from +slices (partitions?) of various devices in the container. + +For example, a 5 devices DDF set can contain a RAID1 using the first +half of two devices, a RAID0 using the first half of the remaining 3 devices, +and a RAID5 over the second half of all 5 devices. + +A container can be created with + + mdadm --create /dev/md0 -e ddf -n5 /dev/sd[abcde] + +or "-e imsm" to use the Intel Matrix Storage Manager. + +An array can be created within a container either by giving the +container name as the only member: + + mdadm -C /dev/md1 --level raid1 -n 2 /dev/md0 + +or by listing the component devices + + mdadm -C /dev/md2 --level raid0 -n 3 /dev/sd[cde] + +To assemble a container, it is easiest just to pass each device in turn to +mdadm -I + + for i in /dev/sd[abcde] + do mdadm -I $i + done + +This will assemble the container and the components. + +Alternately the container can be assembled explicitly + + mdadm -A /dev/md0 /dev/sd[abcde] + +Then the components can all be assembled with + + mdadm -I /dev/md0 + +For each container, mdadm will start a program called "mdmon" which will +monitor the array and effect any metadata updates needed. The array is +initially assembled readonly. It is up to "mdmon" to mark the metadata +as 'dirty' and switch the array to 'read-write'. + +The version 0.90 and 1.x metadata formats supported by previous +versions of mdadm are still supported and the kernel still performs +the same updates it used to. 
The new 'mdmon' approach is only used for +newly introduced metadata types. + +NeilBrown 2nd June 2009 diff --git a/ANNOUNCE-3.0.1 b/ANNOUNCE-3.0.1 new file mode 100644 index 0000000..91b4428 --- /dev/null +++ b/ANNOUNCE-3.0.1 @@ -0,0 +1,22 @@ +Subject: ANNOUNCE: mdadm 3.0.1 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.0.1 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git?p=mdadm + + +This contains only minor bug fixes over 3.0. If you are using +3.0, you could consider upgrading. + +The brief change log is: + - Fix various segfaults + - Fixes for --examine with containers + - Lots of other little fixes. + +NeilBrown 25th September 2009 diff --git a/ANNOUNCE-3.0.2 b/ANNOUNCE-3.0.2 new file mode 100644 index 0000000..93643d1 --- /dev/null +++ b/ANNOUNCE-3.0.2 @@ -0,0 +1,21 @@ +Subject: ANNOUNCE: mdadm 3.0.2 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.0.2 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git?p=mdadm + + +This just contains one bugfix over 3.0.1 - I was obviously a bit hasty +in releasing that one. + +The brief change log is: + - Fix crash when homehost is not set, as often happens in + early boot. + +NeilBrown 25th September 2009 diff --git a/ANNOUNCE-3.0.3 b/ANNOUNCE-3.0.3 new file mode 100644 index 0000000..d6117a1 --- /dev/null +++ b/ANNOUNCE-3.0.3 @@ -0,0 +1,29 @@ +Subject: ANNOUNCE: mdadm 3.0.3 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.0.3 + +It is available at the usual places: + countrycode=xx. 
+ http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git?p=mdadm + + +This contains a collection of bug fixes and minor enhancements over +3.0.1. + +The brief change log is: + - Improvements for creating arrays giving just a name, like 'foo', + rather than the full '/dev/md/foo'. + - Improvements for assembling member arrays of containers. + - Improvements to test suite + - Add option to change increment for RebuildNN messages reported + by "mdadm --monitor" + - Improvements to mdmon 'hand-over' from initrd to final root. + - Handle merging of devices that have left an IMSM array and are + being re-incorporated. + - Add missing space in "--detail --brief" output. + +NeilBrown 22nd October 2009 diff --git a/ANNOUNCE-3.1 b/ANNOUNCE-3.1 new file mode 100644 index 0000000..343b85d --- /dev/null +++ b/ANNOUNCE-3.1 @@ -0,0 +1,33 @@ +Subject: ANNOUNCE: mdadm 3.1 - A tool for managing Soft RAID under Linux + +Hot on the heels of 3.0.3 I am pleased to announce the availability of + mdadm version 3.1 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git?p=mdadm + + +It contains significant feature enhancements over 3.0.x + +The brief change log is: + - Support --grow to change the layout of RAID4/5/6 + - Support --grow to change the chunksize of raid 4/5/6 + - Support --grow to change level from RAID1 -> RAID5 -> RAID6 and + back. + - Support --grow to reduce the number of devices in RAID4/5/6. + - Support restart of these grow options when assembling an array + which is partially grown. + - Assorted tests of this code, and of different RAID6 layouts. + +Note that a 2.6.31 or later kernel is needed to have access to these. +Reducing devices in a RAID4/5/6 requires 2.6.32. +Changing RAID5 to RAID1 requires 2.6.33. 
+ +You should only upgrade if you need to use, or wish to test, these +features. + +NeilBrown 22nd October 2009 diff --git a/ANNOUNCE-3.1.1 b/ANNOUNCE-3.1.1 new file mode 100644 index 0000000..9e480dc --- /dev/null +++ b/ANNOUNCE-3.1.1 @@ -0,0 +1,39 @@ +Subject: ANNOUNCE: mdadm 3.1.1 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.1.1 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git?p=mdadm + +This is a bugfix release over 3.1, which was withdrawn due to serious +bugs. So it might be best to ignore 3.1 and say that this is a significant +feature release over 3.0.x + +Significant changes are: + - RAID level conversion between RAID1, RAID5, and RAID6 are + possible where the kernel supports it (2.6.32 at least) + - online chunksize and layout changing for RAID5 and RAID6 + where the kernel supports it. + - reduce the number of devices in a RAID4/5/6 array. + + - The default metadata is now v1.1. This metadata is stored at the + start of the device so is safer in many ways but could interfere with + boot loaders. The old default (0.90) is still available and fully + supported. + + - The default chunksize is now 512K rather than 64K. This seems more + appropriate for modern devices. + + - The default bitmap chunksize for internal bitmaps is now at least + 64Meg as fine grained bitmaps tend to impact performance more for + little extra gain. + +This release is believed to be stable and you should feel free to +upgrade to 3.1.1. 
+ +NeilBrown 19th November 2009 diff --git a/ANNOUNCE-3.1.2 b/ANNOUNCE-3.1.2 new file mode 100644 index 0000000..321b8be --- /dev/null +++ b/ANNOUNCE-3.1.2 @@ -0,0 +1,46 @@ +Subject: ANNOUNCE: mdadm 3.1.2 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.1.2 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git?p=mdadm + +This is a bugfix/stability release over 3.1.1. + +Significant changes are: + - The default metadata has changed again (sorry about that). + It is now v1.2 and will hopefully stay that way. It turned + out there were boot-block issues with v1.1 which make it + unsuitable for a default, though in many cases it is still + suitable to use. + - Stopping a container is not permitted when members are still + active + - Add 'homehost' to the valid words for the "AUTO" config file + line. When followed by "-all", this causes mdadm to + auto-assemble any array belonging to this host, but not + auto-assemble anything else. + - Fix some bugs with "--grow --chunksize=" for changing chunksize. + - VAR_RUN can be easily changed at compile time just like ALT_RUN. + This gives distros more flexibility in how to manage the + pid and sock files that mdmon needs. + - Various mdmon fixes + - Always make bitmap 4K-aligned if at all possible. + - If mdadm.conf lists arrays which have inter-dependencies, + they previously had to be listed in the "right" order. Now + any order should work. + - Fix --force assembly of v1.x arrays which are in the process + of recovering. + - Add section on 'scrubbing' to 'md' man page. + - Various command-line-option parsing improvements. + - ... and lots of other bug fixes. 
+ + +This release is believed to be stable and you should feel free to +upgrade to 3.1.2 + +NeilBrown 10th March 2010 diff --git a/ANNOUNCE-3.1.3 b/ANNOUNCE-3.1.3 new file mode 100644 index 0000000..95b2b6c --- /dev/null +++ b/ANNOUNCE-3.1.3 @@ -0,0 +1,46 @@ +Subject: ANNOUNCE: mdadm 3.1.3 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.1.3 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git?p=mdadm + +This is a bugfix/stability release over 3.1.2 + +Significant changes are: + - mapfile now lives in a fixed location which defaults to + /dev/.mdadm/map but can be changed at compile time. This + location is chosen as most distros provide it during early + boot and preserve it through. As long as /dev exists and is + writable, /dev/.mdadm will be created. + Other files for communication with mdmon live here too. + This fixes a bug reported by Debian and Gentoo users where + udev would spin in early-boot. + - IMSM and DDF metadata will not be recognised on partitions + as they should only be used on whole-disks. + - Various overflows caused by 2G drives have been addressed. + - A subarray of an IMSM container can now be killed with + --kill-subarray. Also subarrays can be renamed with + --update-subarray + - -If (or --incremental --fail) can be used from udev to + fail and remove from all arrays a device which has been + unplugged from the system. i.e. hot-unplug-support. + - "mdadm /dev/mdX --re-add missing" will look for any device + that looks like it should be a member of /dev/mdX but isn't + and will automatically --re-add it + - Now compile with -Wextra to get extra warnings. 
+ - Lots of minor bug fixes, documentation improvements, etc. + +This release is believed to be stable and you should feel free to +upgrade to 3.1.3 + +It is expected that the next release will be 3.2 with a number of new +features. 3.1.4 will only happen if important bugs show up before 3.2 +is stable. + +NeilBrown 6th August 2010 diff --git a/ANNOUNCE-3.1.4 b/ANNOUNCE-3.1.4 new file mode 100644 index 0000000..c157a36 --- /dev/null +++ b/ANNOUNCE-3.1.4 @@ -0,0 +1,37 @@ +Subject: ANNOUNCE: mdadm 3.1.4 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.1.4 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git?p=mdadm + +This is a bugfix/stability release over 3.1.3. +3.1.3 had a couple of embarrassing regressions and a couple of other +issues surfaced which had easy fixes so I decided to make a 3.1.4 +release after all. + +Two fixes related to configs that aren't using udev: + - Don't remove md devices with 'standard' names on --stop + - Allow dev_open to work on read-only /dev +And fixed regressions: + - Allow --incremental to add spares to an array + - Accept --no-degraded as a deprecated option rather than + throwing an error + - Return correct success status when --incremental assembling + a container which does not yet have enough devices. + - Don't link mdadm with pthreads, only mdmon needs it. + - Fix compiler warning due to bad use of snprintf + - Fix spare migration + +This release is believed to be stable and you should feel free to +upgrade to 3.1.4 + +It is expected that the next release will be 3.2 with a number of new +features. 
+ +NeilBrown 31st August 2010 diff --git a/ANNOUNCE-3.1.5 b/ANNOUNCE-3.1.5 new file mode 100644 index 0000000..baa1f92 --- /dev/null +++ b/ANNOUNCE-3.1.5 @@ -0,0 +1,42 @@ +Subject: ANNOUNCE: mdadm 3.1.5 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.1.5 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git?p=mdadm + +This is a bugfix/stability release over 3.1.4. It contains all the +important bugfixes found while working on 3.2 and 3.2.1. It will be +the last 3.1.x release - 3.2.1 is expected to be released in a few days. + +Changes include: + - Fixes for v1.x metadata on big-endian machines. + - man page improvements + - Improve '--detail --export' when run on partitions of an md array. + - Fix regression with removing 'failed' or 'detached' devices. + - Fixes for "--assemble --force" in various unusual cases. + - Allow '-Y' to mean --export. This was documented but not implemented. + - Various fixed for handling 'ddf' metadata. This is now more reliable + but could benefit from more interoperability testing. + - Correctly list subarrays of a container in "--detail" output. + - Improve checks on whether the requested number of devices is supported + by the metadata - both for --create and --grow. + - Don't remove partitions from a device that is being included in an + array until we are fully committed to including it. + - Allow "--assemble --update=no-bitmap" so an array with a corrupt + bitmap can still be assembled. + - Don't allow --add to succeed if it looks like a "--re-add" is probably + wanted, but cannot succeed. This avoids inadvertently turning + devices into spares when an array is failed. 
+ +This release is believed to be stable and you should feel free to +upgrade to 3.1.5 + + +NeilBrown 23rd March 2011 + diff --git a/ANNOUNCE-3.2 b/ANNOUNCE-3.2 new file mode 100644 index 0000000..9e282bc --- /dev/null +++ b/ANNOUNCE-3.2 @@ -0,0 +1,77 @@ +Subject: ANNOUNCE: mdadm 3.2 - A tool for managing Soft RAID under Linux (DEVEL ONLY) + +I am pleased to announce the availability of + mdadm version 3.2 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm devel-3.2 + http://neil.brown.name/git?p=mdadm + +This is a "Developers only" release. Please don't consider using it +or making it available to others without reading the following. + + +By far the most significant change in this release relates to the +management of reshaping arrays. This code has been substantially +re-written so that it can work with 'externally managed metadata' - +Intel's IMSM in particular. We now support level migration and +OnLine Capacity Expansion on these arrays. + +However, while the code largely works it has not been tested +exhaustively so there are likely to be problems. As the reshape code +for native metadata arrays was changed as part of this rewrite these +problems could also result in regressions for reshape of native +metadata. + +It is partly to encourage greater testing that this release is being +made. Any reports of problems - particularly reproducible recipes for +triggering the problems - will be gratefully received. + +It is hoped that a "3.2.1" release will be available in early March +which will be a bugfix release over this and can be considered +suitable for general use. + +Other changes of note: + + - Policy framework. + Various policy statements can be made in the mdadm.conf to guide + the behaviour of mdadm, particularly with regard to how new devices + are treated by "mdadm -I". 
+ Depending on the 'action' associated with a device (identified by + its 'path') such new devices can be automatically re-added to an + existing array that they previously fell out of, or automatically + added as a spare if they appear to contain no data. + + - mdadm now has a limited understanding of partition tables. This + allows the policy framework to make decisions about partitioned + devices as well. + + - --incremental --remove can be told what --path the device was on, + and this info will be recorded so that another device appearing at + the same physical location can be preferentially added to the same + array (provided the spare-same-slot action policy applies to the + path). + + - A new "--invalid-backup" flag is available in --assemble + mode. This can be used to re-assemble an array which was stopped + in the middle of a reshape, and for which the 'backup file' is no + longer available or is corrupted. The array may have some + corruption in it at the point where reshape was up to, but at least + the rest of the array will become available. + + + - Various internal restructuring - more is needed. + + +Any feedback and bug reports are always welcomed at: + linux-raid@vger.kernel.org + +And please: don't use this in production - particularly not the +--grow functionality. + +NeilBrown 1st February 2011 + + diff --git a/ANNOUNCE-3.2.1 b/ANNOUNCE-3.2.1 new file mode 100644 index 0000000..0e7826c --- /dev/null +++ b/ANNOUNCE-3.2.1 @@ -0,0 +1,75 @@ + + +I am pleased to announce the availability of + mdadm version 3.2.1 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git/mdadm + +Many of the changes in this release are of internal interest only, +restructuring and refactoring code and so forth. 
+ +Most of the bugs found and fixed during development for 3.2.1 have been +back-ported for the recently-released 3.1.5 so this release primarily +provides a few new features over 3.1.5. + +They include: + - policy framework + Policy can be expressed for moving spare devices between arrays, and + for how to handle hot-plugged devices. This policy can be different + for devices plugged in to different controllers etc. + This, for example, allows a configuration where when a device is plugged + in it is immediately included in an md array as a hot spare and + possibly starts recovery immediately if an array is degraded. + + - some understanding of mbr and gpt partition tables + This is primarily to support the new hot-plug support. If a + device is plugged in and policy suggests it should have a partition table, + the partition table will be copied from a suitably similar device, and + then the partitions will hot-plug and can then be added to md arrays. + + - "--incremental --remove" can remember where a device was removed from + so if a device gets plugged back in the same place, special policy applies + to it, allowing it to be included in an array even if a general hotplug + will not be included. + + - enhanced reshape options, including growing a RAID0 by converting to RAID4, + restriping, and converting back. Also conversions between RAID0 and + RAID10 and between RAID1 and RAID10 are possible (with a suitably recent + kernel). + + - spare migration for IMSM arrays. + Spare migration can now work across 'containers' using non-native metadata + and specifically Intel's IMSM arrays support spare migrations. + + - OLCE and level migration for Intel IMSM arrays. + OnLine Capacity Expansion and level migration (e.g. RAID0 -> RAID5) is + supported for Intel Matrix Storage Manager arrays. + This support is currently 'experimental' for technical reasons. 
It can + be enabled with "export MDADM_EXPERIMENTAL=1" + + - avoid including wayward devices + If you split a RAID1, mount the two halves as two separate degraded RAID1s, + and then later bring the two back together, it is possible that the md + metadata won't properly show that one must over-ride the other. + mdadm now does extra checking to detect this possibility and avoid + potentially corrupting data. + + - remove any possible confusion between similar options. + e.g. --brief and --bitmap were mapped to 'b' and mdadm wouldn't + notice if one was used where the other was expected. + + - allow K,M,G suffixes on chunk sizes + + +While mdadm-3.2.1 is considered to be reasonably stable, you should +only use it if you want to try out the new features, or if you +generally like to be on the bleeding edge. If the new features are not +important to you, then 3.1.5 is probably the appropriate version to be using +until 3.2.2 comes out. + +NeilBrown 28th March 2011 diff --git a/ANNOUNCE-3.2.2 b/ANNOUNCE-3.2.2 new file mode 100644 index 0000000..b70d18b --- /dev/null +++ b/ANNOUNCE-3.2.2 @@ -0,0 +1,36 @@ +Subject: ANNOUNCE: mdadm 3.2.2 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.2.2 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git/mdadm + +This release is largely a stabilising release for the 3.2 series. +Many of the changes just fix bugs introduced in 3.2 or 3.2.1. + +There are some new features. They are: + - reshaping IMSM (Intel metadata) arrays is no longer 'experimental', + it should work properly and be largely compatible with IMSM drivers in + other platforms. + - --assume-clean can be used with --grow --size to avoid resyncing the + new part of the array. This is only supported with very new kernels. 
+ - RAID0 arrays can have chunksize which is not a power of 2. This has been + supported in the kernel for a while but is only now supported by + mdadm. + + - A new tool 'raid6check' is available which can check a RAID6 array, + or part of it, and report which device is most inconsistent with the + others if any stripe is inconsistent. This is still under development + and does not have a man page yet. If anyone tries it out and has any + questions or experience to report, they would be most welcome on + linux-raid@vger.kernel.org. + +Future releases in the 3.2 series will only be made if bugfixes are needed. +The next release to add features is expected to be 3.3. + +NeilBrown 17th June 2011 diff --git a/ANNOUNCE-3.2.3 b/ANNOUNCE-3.2.3 new file mode 100644 index 0000000..8a8dba4 --- /dev/null +++ b/ANNOUNCE-3.2.3 @@ -0,0 +1,24 @@ +Subject: ANNOUNCE: mdadm 3.2.3 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.2.3 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git/mdadm + +This release is largely a bugfix release for the 3.2 series with many +minor fixes with little or no impact. + +The largest single area of change is support for reshape of Intel +IMSM arrays (OnLine Capacity Expansion and Level Migration). +Among other fixes, this now has a better chance of surviving if a +device fails during reshape. + +Upgrading is recommended - particularly if you use mdadm for IMSM +arrays - but not essential. 
+ +NeilBrown 23rd December 2011 diff --git a/ANNOUNCE-3.2.4 b/ANNOUNCE-3.2.4 new file mode 100644 index 0000000..e321678 --- /dev/null +++ b/ANNOUNCE-3.2.4 @@ -0,0 +1,144 @@ +Subject: ANNOUNCE: mdadm 3.2.4 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.2.4 + +It is available at the usual places, now including github: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://github.com/neilbrown/mdadm + git://neil.brown.name/mdadm + http://neil.brown.name/git/mdadm + +This release is largely a bugfix release for the 3.2 series with many +minor fixes with little or no impact. + +"--oneline" log of changes is below. Some notable ones are: + + - --offroot argument to improve interactions between mdmon and initrd + - --prefer argument to select which /dev names to display in some + circumstances. + - relax restructions on when "--add" will be allowed + - Fix bug with adding write-intent-bitmap to active array + - Now defaults to "/run/mdadm" for storing run-time files. + +Upgrading is encouraged. + +The next mdadm release is expected to be 3.3 with a number of new +features. + +NeilBrown 9th May 2012 + +77b3ac8 monitor: make return from read_and_act more symbolic. +68226a8 monitor: ensure we retry soon when 'remove' fails. +8453f8d fix: Monitor sometimes crashes +90fa1a2 Work around gcc-4.7's strict aliasing checks +0c4304c fix: container creation with --incremental used. 
+5d1c7cd FIX: External metadata sometimes is not updated +3c20f98 FIX: mdmon check in reshape_container() can cause a problem +59ab9f5 FIX: Typo error in fprint command +9587c37 imsm: load_super_imsm_all function refactoring +ec50f7b imsm: load_imsm_super_all supports loading metadata from the device list +ca9de18 imsm: validate the number of imsm volumes per controller +30602f5 imsm: display fd in error trace when when store_imsm_mpb failes +eb155f6 mdmon: Use getopt_long() to parse command line options +08ca2ad Add --offroot argument to mdadm +da82751 Add --offroot argument to mdmon +a0963a8 Spawn mdmon with --offroot if mdadm was launched with --offroot +f878b24 imsm: fix, the second array need to have the whole available space on devices +d597705 getinfo_super1: Use MaxSector in place of sb->size +6ef8905 super1: make aread/awrite always use an aligned buffer. +de5a472 Remove avail_disks arg from 'enough'. +da8fe5a Assemble: fix --force assemble during reshape. +b10c663 config: fix handing of 'homehost' in AUTO line. +92d49ec FIX: NULL pointer to strdup() can be passed +d2bde6d imsm: FIX: No new missing disks are allowed during general migration +111e9fd FIX: Array is not run when expansion disks are added +bf5cf7c imsm: FIX: imsm_get_allowed_degradation() doesn't count degradation for raid1 +50927b1 Fix: Sometimes mdmon throws core dump during reshape +78340e2 Flush mdmon before next reshape step during container operation +e174219 imsm: FIX: Chunk size migration problem +f93346e FIX: use md position to reshape restart +6a75c8c imsm: FIX: use md position to reshape restart +51d83f5 imsm: FIX: Clear migration record when migration switches to next volume. +e1dd332 FIX: restart reshape when reshape process is stopped just between 2 reshapes +1ca90aa FIX: Do not try to (continue) reshape using inactive array +9f1b0f0 config: conf_match should ignore devname when not set. 
+d669228 Use posix_memalign() for memory used to write bitmaps +178950e FIX: Changes in '0' case for reshape position verification +9200d41 avoid double-free upon "old buggy kernel" sysfs_read failure +4011421 Print error message if failing to write super for 1.x metadata +0011874 Use MDMON_DIR for pid files created in Monitor.c +56d1885 Assemble: don't use O_EXCL until we have checked device content. +b720636 Assemble: support assembling of a RAID0 being reshaped. +c69ffac Manage: allow --re-add to failed array. +52f07f5 Reset bad flag on map update +911cead super1: support superblocks up to 4K. +ad6db3c Create: reduce the verbosity of 'default_layout'. +b2bfdfa super1.c don't keep recalculating bitmap pointer +4122675 Define and use SUPER1_SIZE for allocations +1afa930 init_super1() memset full buffer allocated for superblock +2de0b8a match_metadata_desc1(): Use calloc instead of malloc+memset +3c0bcd4 Use 4K buffer alignment for superblock allocations +308340a Use struct align_fd to cache fd's block size for aligned reads/writes +65ed615 match_metadata_desc0(): Use calloc instead of malloc+memset +de89706 Generalize ROUND_UP() macro and introduce matching ROUND_UP_PTR() +0a2f189 super1.c: use ROUND_UP/ROUND_UP_PTR +654a381 super-intel.c: Use ROUND_UP() instead of manually coding it +42d5dfd __write_init_super_ddf(): Use posix_memalign() instead of static aligned buffer +d4633e0 Examine: fix array size calculation for RAID10. +e62b778 Assemble: improve verbose logging when including old devices. +0073a6e Remove possible crash during RAID6 -> RAID5 reshape. +69fe207 Incremental: fix adding devices with --incremental +bcbb311 Manage: replace 'return 1' with 'goto abort'. +9f58469 Manage: freeze recovery while adding multiple devices. +ae6c05a Create: round off size for RAID1 arrays. +5ca3a90 Grow: print useful error when converting RAID1->RAID5 will fail. +c07d640 Fix tests/05r1-re-add-nosupper +2d762ad Fix the new ROUND_UP macro. 
+fd324b0 sysfs: fixed sysfs_freeze_array array to work properly with Manage_subdevs. +5551b11 imsm: avoid overflows for disks over 1TB +97f81ee clear hi bits if not used after loading metadata from disk +e03640b simplify calculating array_blocks +29cd082 show 2TB volumes/disks support in --detail-platform +2cc699a check volume size in validate_geometry_imsm_orom +9126b9a check that no disk over 2TB is used to create container when no support +027c374 imsm: set 2tb disk attribute for spare +3556c2f Fix typo: wan -> want +15632a9 parse_size: distinguish between 0 and error. +fbdef49 Bitmap_offset is a signed number +508a7f1 super1: leave more space in front of data by default. +40110b9 Fix two typos in fprintf messages +342460c mdadm man page: fix typo +0e7f69a imsm: display maximum volumes per controller and array +36fd8cc imsm: FIX: Update function imsm_num_data_members() for Raid1/10 +7abc987 imsm: FIX: Add volume size expand support to imsm_analyze_change() +f3871fd imsm: Add new metadata update for volume size expansion +54397ed imsm: Execute size change for external metatdata +016e00f FIX: Support metadata changes rollback +fbf3d20 imsm: FIX: Support metadata changes rollback +44f6f18 FIX: Extend size of raid0 array +7e7e9a4 FIX: Respect metadata size limitations +65a9798 FIX: Detect error and rollback metadata +13bcac9 imsm: Add function imsm_get_free_size() +b130333 imsm: Support setting max size for size change operation +c41e00b imsm: FIX: Component size alignment check +58d26a2 FIX: Size change is possible as standalone change only +4aecb54 FIX: Assembled second array is in read only state during reshape +ae2416e FIX: resolve make everything compilation error +480f356 Raid limit of 1024 when scanning for devices. +c2ecf5f Add --prefer option for --detail and --monitor +0a99975 Relax restrictions on when --add is permitted. 
+7ce0570 imsm: fix: rebuild does not continue after reboot +b51702b fix: correct extending size of raid0 array +34a1395 Fix sign extension of bitmap_offset in super1.c +012a864 Introduce sysfs_set_num_signed() and use it to set bitmap/offset +5d7b407 imsm: fix: thunderdome may drop 2tb attribute +5ffdc2d Update test for "is udev active". +96fd06e Adjust to new standard of /run +974e039 test: don't worry too much about array size. +b0a658f Grow: failing the set the per-device size is not an error. +36614e9 super-intel.c: Don't try to close negative fd +562aa10 super-intel.c: Fix resource leak from opendir() + diff --git a/ANNOUNCE-3.2.5 b/ANNOUNCE-3.2.5 new file mode 100644 index 0000000..396da12 --- /dev/null +++ b/ANNOUNCE-3.2.5 @@ -0,0 +1,31 @@ +Subject: ANNOUNCE: mdadm 3.2.5 - A tool for managing Soft RAID under Linux + +I am somewhat disappointed to have to announce the availability of + mdadm version 3.2.5 + +It is available at the usual places, now including github: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://github.com/neilbrown/mdadm + git://neil.brown.name/mdadm + http://neil.brown.name/git/mdadm + +This release primarily fixes a serious regression in 3.2.4. +This regression does *not* cause any risk to data. It simply +means that adding a device with "--add" would sometime fail +when it should not. + +The fix also includes a couple of minor fixes such as making +the "--layout=preserve" option to "--grow" work again. + +A reminder that the default location for runtime files is now +"/run/mdadm". If you compile this for a distro that does not +have "/run", you will need to compile with an alternate setting for +MAP_DIR. e.g. 
+ make MAP_DIR=/var/run/mdadm +or + make MAP_DIR=/dev/.mdadm + +NeilBrown 18th May 2012 + diff --git a/ANNOUNCE-3.2.6 b/ANNOUNCE-3.2.6 new file mode 100644 index 0000000..f5cfd49 --- /dev/null +++ b/ANNOUNCE-3.2.6 @@ -0,0 +1,57 @@ +Subject: ANNOUNCE: mdadm 3.2.6 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.2.6 + +It is available at the usual places, now including github: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://github.com/neilbrown/mdadm + git://neil.brown.name/mdadm + http://neil.brown.name/git/mdadm + +This is a stability release which adds a number of bugfixes to 3.2.5. +There are no real stand-out fixes, just lots of little bits and pieces. + +Below is the "git log --oneline --reverse" list of changes since +3.2.5. + +NeilBrown 25th October 2012 + +b7e05d2 udev-rules: prevent systemd from mount devices before they are ready. +0d478e2 mdadm: Fix Segmentation fault. +42f0ca1 imsm: fix: correct checking volume's degradation +fcf2195 Monitor: fix inconsistencies in values for ->percent +5f862fb Monitor: Report NewArray when an array the disappeared, reappears. +6f51b1c Monitor: fix reporting for Fail vs FailSpare etc. +68ad53b mdmon: fix arg parsing. +517f135 Assemble: don't leak memory with fdlist. +090900c udev-rules: prevent systemd from mount devices before they are ready. +446e000 sha1.h: remove ansidecl.h header inclusion +ec894f5 Manage: zero metadata before adding to 'external' array. +3a84db5 ddf: allow a non-spare to be used to recovery a missing device. +c5d61ca ddf: hack to fix container recognition. +23084aa mdmon: fix arg processing for -a +c4e96a3 mdmon: allow --takeover when original was started with --offroot +80841df find_free_devnum: avoid auto-using names in /etc/mdadm.conf +c5c56d6 mapfile: fix mapfile rebuild for containers +aec89f6 fix segfaults in Detail() +2117ad1 Fix 'enough' function for RAID10.
+0bc300d Use --offroot flag when assembling md arrays via --incrmental +ac78f24 Grow: make warning about old metadata more explicit. +14026ab Replace sha1.h with slightly older version. +6f6809f Add zlib license to crc32.c +5267ba0 Handles spaces in array names better. +c51f288 imsm: allow --assume-clean to work. +acf7076 Grow: allow --grow --continue to work for native metadata. +335d2a6 Grow: fix a couple of typos with --assume-clean usage +9ff1427 Fix open_container +3713633 mdadm: super0: do not override uuid with homehost +31bff58 Trivial bugfix and spelling fixes. +e1e539f Detail: don't report a faulty device as 'spare' or 'rebuilding'. +22a6461 super0: allow creation of array on 2TB+ devices. +a5d47a2 Create new md devices consistently +eb48676 Monitor: don't complain about non-monitorable arrays in mdadm.conf +ecdf2d7 Query: don't be confused by partition tables. +f7b75c1 Query: allow member of non-0.90 arrays to be better reported. diff --git a/ANNOUNCE-3.3 b/ANNOUNCE-3.3 new file mode 100644 index 0000000..f770aa1 --- /dev/null +++ b/ANNOUNCE-3.3 @@ -0,0 +1,63 @@ +Subject: ANNOUNCE: mdadm 3.3 - A tool for managing md Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.3 + +It is available at the usual places: + http://www.kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://github.com/neilbrown/mdadm + git://neil.brown.name/mdadm + http://git.neil.brown.name/git/mdadm + +This is a major new release so don't be too surprised if there are a +few issues. If I hear about them they will be fixed in 3.3.1. +git log reports nearly 500 changes since 3.2.6 so I won't list them +all. + +Some highlights are: + +- Some array reshapes can proceed without needing a backup file. + This is done by changing the 'data_offset' so we never need to write + any data back over where it was before. If there is no "head space" + or "tail space" to allow data_offset to change, the old mechanism + with a backup file can still be used.
+- RAID10 arrays can be reshaped to change the number of devices, + change the chunk size, or change the layout between 'near' + and 'offset'. + This will always change data_offset, and will fail if there is no + room for data_offset to be moved. +- "--assemble --update=metadata" can convert a 0.90 array to a 1.0 array. +- bad-block-logs are supported (but not heavily tested yet) +- "--assemble --update=revert-reshape" can be used to undo a reshape + that has just been started but isn't really wanted. This is very + new and while it passes basic tests it cannot be guaranteed. +- improved locking between --incremental and --assemble +- uses systemd to run "mdmon" if systemd is configured to do that. +- kernel names of md devices can be non-numeric. e.g. "md_home" rather than + "md0". This will probably confuse lots of other tools, so you need to + echo CREATE names=yes >> /etc/mdadm.conf + or the feature will not be used. (you also need a reasonably new kernel). +- "--stop" can be given a kernel name instead of a device name. i.e. + mdadm --stop md4 + will work even if /dev/md4 doesn't exist. +- "--detail --export" has some information about the devices in the array +- --dump and --restore can be used to back up and restore the metadata on an + array. +- Hot-replace is supported with + mdadm /dev/mdX --replace /dev/foo + and + mdadm /dev/mdX --replace /dev/foo --with /dev/bar +- Config file can be a directory in which case all "*.conf" files are + read in lexical order. + Default is to read /etc/mdadm.conf and then /etc/mdadm.conf.d + Thus + echo CREATE names=yes > /etc/mdadm.conf.d/names.conf + will also enable the use of named md devices. + +- Lots of improvements to DDF support including adding support for + RAID10 (thanks Martin Wilck). + +and lots of bugfixes and other little changes.
+ +NeilBrown 3rd September 2013 diff --git a/ANNOUNCE-3.3.1 b/ANNOUNCE-3.3.1 new file mode 100644 index 0000000..7d5e666 --- /dev/null +++ b/ANNOUNCE-3.3.1 @@ -0,0 +1,23 @@ +Subject: ANNOUNCE: mdadm 3.3.1 - A tool for managing md Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.3.1 + +It is available at the usual places: + http://www.kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://github.com/neilbrown/mdadm + git://neil.brown.name/mdadm + http://git.neil.brown.name/git/mdadm.git + +The main changes are: + - lots of work on "DDF" support. Hopefully it will be more stable + now. Bug reports are always welcome. + - improved interactions with 'systemd'. Where possible, background + tasks are run from systemd (if it is present) rather than forking + and disassociating from the session. This is important because udev + doesn't really let you disassociate. + +though there are a number of other little bug fixes too. + +NeilBrown 5th June 2014 diff --git a/ANNOUNCE-3.3.2 b/ANNOUNCE-3.3.2 new file mode 100644 index 0000000..6b54961 --- /dev/null +++ b/ANNOUNCE-3.3.2 @@ -0,0 +1,16 @@ +Subject: ANNOUNCE: mdadm 3.3.2 - A tool for managing md Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.3.2 + +It is available at the usual places: + http://www.kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://github.com/neilbrown/mdadm + git://neil.brown.name/mdadm + http://git.neil.brown.name/git/mdadm.git + +Changes since 3.3.1 are mostly little bugfixes and some man-page +updates.
+ +NeilBrown 21st August 2014 diff --git a/ANNOUNCE-3.3.3 b/ANNOUNCE-3.3.3 new file mode 100644 index 0000000..ac1b217 --- /dev/null +++ b/ANNOUNCE-3.3.3 @@ -0,0 +1,18 @@ +Subject: ANNOUNCE: mdadm 3.3.3 - A tool for managing md Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.3.3 + +It is available at the usual places: + http://www.kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://github.com/neilbrown/mdadm + git://neil.brown.name/mdadm + http://git.neil.brown.name/git/mdadm.git + +The 100 changes since 3.3.2 are mostly little bugfixes and some improvements +to the selftests. +raid6check now handles all RAID6 layouts including DDF correctly. +See git log for the rest. + +NeilBrown 24th July 2015 diff --git a/ANNOUNCE-3.3.4 b/ANNOUNCE-3.3.4 new file mode 100644 index 0000000..52b9456 --- /dev/null +++ b/ANNOUNCE-3.3.4 @@ -0,0 +1,37 @@ +Subject: ANNOUNCE: mdadm 3.3.4 - A tool for managing md Soft RAID under Linux + +I am somewhat disappointed to have to announce the availability of + mdadm version 3.3.4 + +It is available at the usual places: + http://www.kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://github.com/neilbrown/mdadm + git://neil.brown.name/mdadm + http://git.neil.brown.name/git/mdadm.git + +In mdadm-3.3 a change was made to how IMSM (Intel Matrix Storage +Manager) metadata was handled. Previously an IMSM array would only +be assembled if it was attached to an IMSM controller. + +In 3.3 this was relaxed as there are circumstances where the +controller is not properly detected. Unfortunately this has negative +consequences which have only just come to light. + +If you have an IMSM RAID1 configured and then disable RAID in the +BIOS, the metadata will remain on the devices.
If you then install +some other OS on one device and then install Linux on the other, Linux +might eventually start noticing the IMSM metadata (depending a bit on whether +mdadm is included in the initramfs) and might start up the RAID1. This could +copy one device over the other, thus trashing one of the installations. + +Not good. + +So with this release IMSM arrays will only be assembled if attached to +an IMSM controller, or if "--force" is given to --assemble, or if the +environment variable IMSM_NO_PLATFORM is set (used primarily for +testing). + +I strongly recommend upgrading to 3.3.4 if you are using 3.3 or later. + +NeilBrown 3rd August 2015. diff --git a/ANNOUNCE-3.4 b/ANNOUNCE-3.4 new file mode 100644 index 0000000..2689732 --- /dev/null +++ b/ANNOUNCE-3.4 @@ -0,0 +1,24 @@ +Subject: ANNOUNCE: mdadm 3.4 - A tool for managing md Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.4 + +It is available at the usual places: + http://www.kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://github.com/neilbrown/mdadm + git://neil.brown.name/mdadm + http://git.neil.brown.name/git/mdadm + +The new second-level version number reflects significant new +functionality, particularly support for journalled RAID5/6 and clustered +RAID1. This new support is probably still buggy. Please report bugs. + +There are also a number of fixes for Intel's IMSM metadata support, +and an assortment of minor bug fixes. + +I plan for this to be the last release of mdadm that I provide as I am +retiring from MD and mdadm maintenance. Jes Sorensen has volunteered +to oversee mdadm for the next while. Thanks Jes!
+ +NeilBrown 28th January 2016 diff --git a/ANNOUNCE-4.0 b/ANNOUNCE-4.0 new file mode 100644 index 0000000..f79c540 --- /dev/null +++ b/ANNOUNCE-4.0 @@ -0,0 +1,22 @@ +Subject: ANNOUNCE: mdadm 4.0 - A tool for managing md Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 4.0 + +It is available at the usual places: + http://www.kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://git.kernel.org/pub/scm/utils/mdadm/mdadm.git + http://git.kernel.org/cgit/utils/mdadm/ + +The update in major version number primarily indicates this is a +release by its new maintainer. In addition it contains a large number +of fixes in particular for IMSM RAID and clustered RAID support. In +addition this release includes support for IMSM 4k sector drives, +failfast and better documentation for journaled RAID. + +This is my first release of mdadm. Please thank Neil Brown for his +previous work as maintainer and blame me for all the bugs I caused +since taking over. + +Jes Sorensen, 2017-01-09 diff --git a/ANNOUNCE-4.1 b/ANNOUNCE-4.1 new file mode 100644 index 0000000..a273b9a --- /dev/null +++ b/ANNOUNCE-4.1 @@ -0,0 +1,16 @@ +Subject: ANNOUNCE: mdadm 4.1 - A tool for managing md Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 4.1 + +It is available at the usual places: + http://www.kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://git.kernel.org/pub/scm/utils/mdadm/mdadm.git + http://git.kernel.org/cgit/utils/mdadm/ + +The update constitutes more than one year of enhancements and bug fixes +including for IMSM RAID, Partial Parity Log, clustered RAID support, +improved testing, and gcc-8 support.
+ +Jes Sorensen, 2018-10-01 diff --git a/ANNOUNCE-4.2 b/ANNOUNCE-4.2 new file mode 100644 index 0000000..8b22d09 --- /dev/null +++ b/ANNOUNCE-4.2 @@ -0,0 +1,19 @@ +Subject: ANNOUNCE: mdadm 4.2 - A tool for managing md Soft RAID under Linux + +I am pleased to finally announce the availability of mdadm-4.2. +get 4.2 out the door soon. + +It is available at the usual places: + http://www.kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://git.kernel.org/pub/scm/utils/mdadm/mdadm.git + http://git.kernel.org/cgit/utils/mdadm/ + +The release includes more than two years of development and bugfixes, +so it is difficult to remember everything. Highlights include +enhancements and bug fixes including for IMSM RAID, Partial Parity +Log, clustered RAID support, improved testing, and gcc-9 support. + +Thank you everyone who contributed to this release! + +Jes Sorensen, 2021-12-30 diff --git a/Assemble.c b/Assemble.c new file mode 100644 index 0000000..704b829 --- /dev/null +++ b/Assemble.c @@ -0,0 +1,2227 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2001-2016 Neil Brown <neilb@suse.com> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +#include "mdadm.h" +#include <ctype.h> + +mapping_t assemble_statuses[] = { + { "but cannot be started", INCR_NO }, + { "but not safe to start", INCR_UNSAFE }, + { "and started", INCR_YES }, + { NULL, INCR_ALREADY } +}; + + +/** + * struct assembly_array_info - General, meaningful information for assembly. + * @name: Array name. + * @new_cnt: Count of drives known to be members, recently added. + * @preexist_cnt: Count of member drives in pre-assembled array. + * @exp_cnt: Count of known expansion targets. + * + * FIXME: @exp_new_cnt for recently added expansion targets. + */ +struct assembly_array_info { + char *name; + int new_cnt; + int preexist_cnt; + int exp_cnt; +}; + +/** + * set_array_assembly_status() - generate status of assembly for an array. + * @c: Global settings. + * @result: Pointer to status mask. + * @status: Status to be set/printed. + * @arr: Array information. + * + * Print status message to user or set it in @result if it is not NULL. + */ +static void set_array_assembly_status(struct context *c, + int *result, int status, + struct assembly_array_info *arr) +{ + int raid_disks = arr->preexist_cnt + arr->new_cnt; + char *status_msg = map_num(assemble_statuses, status); + + if (c->export && result) + *result |= status; + + if (c->export || c->verbose < 0) + return; + + pr_err("%s has been assembled with %d device%s", arr->name, + raid_disks, raid_disks == 1 ? 
"":"s"); + if (arr->preexist_cnt > 0) + fprintf(stderr, " (%d new)", arr->new_cnt); + if (arr->exp_cnt) + fprintf(stderr, " ( + %d for expansion)", arr->exp_cnt); + if (status_msg) + fprintf(stderr, " %s", status_msg); + fprintf(stderr, ".\n"); +} + +static int name_matches(char *found, char *required, char *homehost, int require_homehost) +{ + /* See if the name found matches the required name, possibly + * prefixed with 'homehost' + */ + char *sep; + unsigned int l; + + if (strcmp(found, required)==0) + return 1; + sep = strchr(found, ':'); + if (!sep) + return 0; + l = sep - found; + if (strncmp(found, "any:", 4) == 0 || + (homehost && strcmp(homehost, "any") == 0) || + !require_homehost || + (homehost && strlen(homehost) == l && + strncmp(found, homehost, l) == 0)) { + /* matching homehost */ + if (strcmp(sep+1, required) == 0) + return 1; + } + return 0; +} + +static int is_member_busy(char *metadata_version) +{ + /* check if the given member array is active */ + struct mdstat_ent *mdstat = mdstat_read(0, 0); + struct mdstat_ent *ent; + int busy = 0; + + for (ent = mdstat; ent; ent = ent->next) { + if (ent->metadata_version == NULL) + continue; + if (strncmp(ent->metadata_version, "external:", 9) != 0) + continue; + if (!is_subarray(&ent->metadata_version[9])) + continue; + /* Skip first char - it can be '/' or '-' */ + if (strcmp(&ent->metadata_version[10], metadata_version+1) == 0) { + busy = 1; + break; + } + } + free_mdstat(mdstat); + + return busy; +} + +static int ident_matches(struct mddev_ident *ident, + struct mdinfo *content, + struct supertype *tst, + char *homehost, int require_homehost, + char *update, char *devname) +{ + + if (ident->uuid_set && (!update || strcmp(update, "uuid")!= 0) && + same_uuid(content->uuid, ident->uuid, tst->ss->swapuuid)==0 && + memcmp(content->uuid, uuid_zero, sizeof(int[4])) != 0) { + if (devname) + pr_err("%s has wrong uuid.\n", devname); + return 0; + } + if (ident->name[0] && (!update || strcmp(update, "name")!= 0) 
&& + name_matches(content->name, ident->name, homehost, require_homehost)==0) { + if (devname) + pr_err("%s has wrong name.\n", devname); + return 0; + } + if (ident->super_minor != UnSet && + ident->super_minor != content->array.md_minor) { + if (devname) + pr_err("%s has wrong super-minor.\n", + devname); + return 0; + } + if (ident->level != UnSet && + ident->level != content->array.level) { + if (devname) + pr_err("%s has wrong raid level.\n", + devname); + return 0; + } + if (ident->raid_disks != UnSet && + content->array.raid_disks != 0 && /* metadata doesn't know how many to expect */ + ident->raid_disks!= content->array.raid_disks) { + if (devname) + pr_err("%s requires wrong number of drives.\n", + devname); + return 0; + } + if (ident->member && ident->member[0]) { + /* content->text_version must match */ + char *s = strchr(content->text_version+1, '/'); + if (s == NULL) { + if (devname) + pr_err("%s is not a container and one is required.\n", + devname); + return 0; + } else if (strcmp(ident->member, s+1) != 0) { + if (devname) + pr_err("skipping wrong member %s is %s\n", + content->text_version, devname); + return 0; + } + } + return 1; +} + +static int select_devices(struct mddev_dev *devlist, + struct mddev_ident *ident, + struct supertype **stp, + struct mdinfo **contentp, + struct context *c, + int inargv, int auto_assem) +{ + struct mddev_dev *tmpdev; + int num_devs; + struct supertype *st = *stp; + struct mdinfo *content = NULL; + int report_mismatch = ((inargv && c->verbose >= 0) || c->verbose > 0); + struct domainlist *domains = NULL; + dev_t rdev; + + tmpdev = devlist; num_devs = 0; + while (tmpdev) { + if (tmpdev->used) + tmpdev->used = 2; + else + num_devs++; + tmpdev->disposition = 0; + tmpdev = tmpdev->next; + } + + /* first walk the list of devices to find a consistent set + * that match the criterea, if that is possible. + * We flag the ones we like with 'used'. + */ + for (tmpdev = devlist; + tmpdev; + tmpdev = tmpdev ? 
tmpdev->next : NULL) { + char *devname = tmpdev->devname; + int dfd; + struct supertype *tst; + struct dev_policy *pol = NULL; + int found_container = 0; + + if (tmpdev->used > 1) + continue; + + if (ident->container) { + if (ident->container[0] == '/' && + !same_dev(ident->container, devname)) { + if (report_mismatch) + pr_err("%s is not the container required (%s)\n", + devname, ident->container); + continue; + } + } else if (ident->devices && + !match_oneof(ident->devices, devname)) { + /* Note that we ignore the "device=" identifier if a + * "container=" is given. Checking both is unnecessarily + * complicated. + */ + if (report_mismatch) + pr_err("%s is not one of %s\n", devname, ident->devices); + continue; + } + + tst = dup_super(st); + + dfd = dev_open(devname, O_RDONLY); + if (dfd < 0) { + if (report_mismatch) + pr_err("cannot open device %s: %s\n", + devname, strerror(errno)); + tmpdev->used = 2; + } else if (!fstat_is_blkdev(dfd, devname, &rdev)) { + tmpdev->used = 2; + } else if (must_be_container(dfd)) { + if (st) { + /* already found some components, this cannot + * be another one. 
+ */ + if (report_mismatch) + pr_err("%s is a container, but we are looking for components\n", + devname); + tmpdev->used = 2; + } if (!tst && (tst = super_by_fd(dfd, NULL)) == NULL) { + if (report_mismatch) + pr_err("not a recognisable container: %s\n", + devname); + tmpdev->used = 2; + } else if (!tst->ss->load_container || + tst->ss->load_container(tst, dfd, NULL)) { + if (report_mismatch) + pr_err("no correct container type: %s\n", + devname); + tmpdev->used = 2; + } else if (auto_assem && + !conf_test_metadata(tst->ss->name, + (pol = devid_policy(rdev)), + tst->ss->match_home(tst, c->homehost) == 1)) { + if (report_mismatch) + pr_err("%s has metadata type %s for which auto-assembly is disabled\n", + devname, tst->ss->name); + tmpdev->used = 2; + } else + found_container = 1; + } else { + if (!tst && (tst = guess_super(dfd)) == NULL) { + if (report_mismatch) + pr_err("no recogniseable superblock on %s\n", + devname); + tmpdev->used = 2; + } else if ((tst->ignore_hw_compat = 0), + tst->ss->load_super(tst, dfd, + report_mismatch ? 
devname : NULL)) { + if (report_mismatch) + pr_err("no RAID superblock on %s\n", + devname); + tmpdev->used = 2; + } else if (tst->ss->compare_super == NULL) { + if (report_mismatch) + pr_err("Cannot assemble %s metadata on %s\n", + tst->ss->name, devname); + tmpdev->used = 2; + } else if (auto_assem && st == NULL && + !conf_test_metadata(tst->ss->name, + (pol = devid_policy(rdev)), + tst->ss->match_home(tst, c->homehost) == 1)) { + if (report_mismatch) + pr_err("%s has metadata type %s for which auto-assembly is disabled\n", + devname, tst->ss->name); + tmpdev->used = 2; + } + } + if (dfd >= 0) close(dfd); + if (tmpdev->used == 2) { + if (auto_assem || !inargv) + /* Ignore unrecognised devices during auto-assembly */ + goto loop; + if (ident->name[0] || + ident->super_minor != UnSet) + /* Ignore unrecognised device if looking for + * specific array */ + goto loop; + if (ident->uuid_set) + /* ignore unrecognized device if looking for + * specific uuid + */ + goto loop; + + pr_err("%s has no superblock - assembly aborted\n", + devname); + if (st) + st->ss->free_super(st); + dev_policy_free(pol); + domain_free(domains); + if (tst) + tst->ss->free_super(tst); + return -1; + } + + if (found_container) { + /* tmpdev is a container. 
We need to be either + * looking for a member, or auto-assembling + */ + /* should be safe to try an exclusive open now, we + * have rejected anything that some other mdadm might + * be looking at + */ + dfd = dev_open(devname, O_RDONLY | O_EXCL); + if (dfd < 0) { + if (report_mismatch) + pr_err("%s is busy - skipping\n", devname); + goto loop; + } + close(dfd); + + if (ident->container && ident->container[0] != '/') { + /* we have a uuid */ + int uuid[4]; + + content = *contentp; + tst->ss->getinfo_super(tst, content, NULL); + + if (!parse_uuid(ident->container, uuid) || + !same_uuid(content->uuid, uuid, tst->ss->swapuuid)) { + if (report_mismatch) + pr_err("%s has wrong UUID to be required container\n", + devname); + goto loop; + } + } + /* It is worth looking inside this container. + */ + if (c->verbose > 0) + pr_err("looking in container %s\n", + devname); + + for (content = tst->ss->container_content(tst, NULL); + content; + content = content->next) { + + if (!ident_matches(ident, content, tst, + c->homehost, c->require_homehost, + c->update, + report_mismatch ? 
devname : NULL)) + /* message already printed */; + else if (is_member_busy(content->text_version)) { + if (report_mismatch) + pr_err("member %s in %s is already assembled\n", + content->text_version, + devname); + } else if (content->array.state & (1<<MD_SB_BLOCK_VOLUME)) { + /* do not assemble arrays with unsupported configurations */ + pr_err("Cannot activate member %s in %s.\n", + content->text_version, + devname); + } else + break; + } + if (!content) { + tmpdev->used = 2; + goto loop; /* empty container */ + } + + st = tst; tst = NULL; + if (!auto_assem && inargv && tmpdev->next != NULL) { + pr_err("%s is a container, but is not only device given: confused and aborting\n", + devname); + st->ss->free_super(st); + dev_policy_free(pol); + domain_free(domains); + return -1; + } + if (c->verbose > 0) + pr_err("found match on member %s in %s\n", + content->text_version, devname); + + /* make sure we finished the loop */ + tmpdev = NULL; + goto loop; + } else { + content = *contentp; + tst->ss->getinfo_super(tst, content, NULL); + + if (!ident_matches(ident, content, tst, + c->homehost, c->require_homehost, + c->update, + report_mismatch ? devname : NULL)) + goto loop; + + if (auto_assem) { + /* Never auto-assemble things that conflict + * with mdadm.conf in some way + */ + struct mddev_ident *match; + int rv = 0; + + match = conf_match(tst, content, devname, + report_mismatch ? c->verbose : -1, + &rv); + if (!match && rv == 2) + goto loop; + if (match && match->devname && + strcasecmp(match->devname, "<ignore>") == 0) { + if (report_mismatch) + pr_err("%s is a member of an explicitly ignored array\n", + devname); + goto loop; + } + if (match && !ident_matches(match, content, tst, + c->homehost, c->require_homehost, + c->update, + report_mismatch ? 
devname : NULL)) + /* Array exists in mdadm.conf but some + * details don't match, so reject it + */ + goto loop; + } + + /* should be safe to try an exclusive open now, we + * have rejected anything that some other mdadm might + * be looking at + */ + dfd = dev_open(devname, O_RDONLY | O_EXCL); + if (dfd < 0) { + if (report_mismatch) + pr_err("%s is busy - skipping\n", devname); + goto loop; + } + close(dfd); + + if (st == NULL) + st = dup_super(tst); + if (st->minor_version == -1) + st->minor_version = tst->minor_version; + + if (memcmp(content->uuid, uuid_zero, + sizeof(int[4])) == 0) { + /* this is a floating spare. It cannot define + * an array unless there are no more arrays of + * this type to be found. It can be included + * in an array of this type though. + */ + tmpdev->used = 3; + goto loop; + } + + if (st->ss != tst->ss || + st->minor_version != tst->minor_version || + st->ss->compare_super(st, tst, 1) != 0) { + /* Some mismatch. If exactly one array matches this host, + * we can resolve on that one. + * Or, if we are auto assembling, we just ignore the second + * for now. 
+ */ + if (auto_assem) + goto loop; + if (c->homehost) { + int first = st->ss->match_home(st, c->homehost); + int last = tst->ss->match_home(tst, c->homehost); + if (first != last && + (first == 1 || last == 1)) { + /* We can do something */ + if (first) {/* just ignore this one */ + if (report_mismatch) + pr_err("%s misses out due to wrong homehost\n", + devname); + goto loop; + } else { /* reject all those sofar */ + struct mddev_dev *td; + if (report_mismatch) + pr_err("%s overrides previous devices due to good homehost\n", + devname); + for (td=devlist; td != tmpdev; td=td->next) + if (td->used == 1) + td->used = 0; + tmpdev->used = 1; + goto loop; + } + } + } + pr_err("superblock on %s doesn't match others - assembly aborted\n", + devname); + tst->ss->free_super(tst); + st->ss->free_super(st); + dev_policy_free(pol); + domain_free(domains); + return -1; + } + tmpdev->used = 1; + } + loop: + /* Collect domain information from members only */ + if (tmpdev && tmpdev->used == 1) { + if (!pol) + pol = devid_policy(rdev); + domain_merge(&domains, pol, tst?tst->ss->name:NULL); + } + dev_policy_free(pol); + pol = NULL; + if (tst) + tst->ss->free_super(tst); + } + + /* Check if we found some imsm spares but no members */ + if ((auto_assem || + (ident->uuid_set && + memcmp(uuid_zero, ident->uuid,sizeof(uuid_zero)) == 0)) && + (!st || !st->sb)) + for (tmpdev = devlist; tmpdev; tmpdev = tmpdev->next) { + if (tmpdev->used != 3) + continue; + tmpdev->used = 1; + content = *contentp; + + if (!st->sb) { + /* we need sb from one of the spares */ + int dfd = dev_open(tmpdev->devname, O_RDONLY); + if (dfd < 0 || + st->ss->load_super(st, dfd, NULL)) + tmpdev->used = 2; + close_fd(&dfd); + } + } + + /* Now reject spares that don't match domains of identified members */ + for (tmpdev = devlist; tmpdev; tmpdev = tmpdev->next) { + if (tmpdev->used != 3) + continue; + if (!stat_is_blkdev(tmpdev->devname, &rdev)) { + tmpdev->used = 2; + } else { + struct dev_policy *pol = 
devid_policy(rdev); + int dt = domain_test(domains, pol, NULL); + if (inargv && dt != 0) + /* take this spare as domains match + * if there are any */ + tmpdev->used = 1; + else if (!inargv && dt == 1) + /* device wasn't explicitly listed, so need + * explicit domain match - which we have */ + tmpdev->used = 1; + else + /* if domains don't match mark as unused */ + tmpdev->used = 0; + dev_policy_free(pol); + } + } + domain_free(domains); + *stp = st; + if (st && st->sb && content == *contentp) + st->ss->getinfo_super(st, content, NULL); + *contentp = content; + + return num_devs; +} + +struct devs { + char *devname; + int uptodate; /* set once we decide that this device is as + * recent as everything else in the array. + */ + int included; /* set if the device is already in the array + * due to a previous '-I' + */ + struct mdinfo i; +}; + +static int load_devices(struct devs *devices, char *devmap, + struct mddev_ident *ident, struct supertype **stp, + struct mddev_dev *devlist, struct context *c, + struct mdinfo *content, + int mdfd, char *mddev, + int *most_recentp, int *bestcntp, int **bestp, + int inargv) +{ + struct mddev_dev *tmpdev; + int devcnt = 0; + int nextspare = 0; + int bitmap_done = 0; + int most_recent = -1; + int bestcnt = 0; + int *best = *bestp; + struct supertype *st = *stp; + + for (tmpdev = devlist; tmpdev; tmpdev=tmpdev->next) { + char *devname = tmpdev->devname; + struct stat stb; + struct supertype *tst; + int i; + int dfd; + int disk_state; + + if (tmpdev->used != 1) + continue; + /* looks like a good enough match to update the super block if needed */ + if (c->update) { + /* prepare useful information in info structures */ + struct stat stb2; + int err; + fstat(mdfd, &stb2); + + if (strcmp(c->update, "uuid") == 0 && !ident->uuid_set) + random_uuid((__u8 *)ident->uuid); + + if (strcmp(c->update, "ppl") == 0 && + ident->bitmap_fd >= 0) { + pr_err("PPL is not compatible with bitmap\n"); + close(mdfd); + free(devices); + free(devmap); + 
return -1; + } + + dfd = dev_open(devname, + tmpdev->disposition == 'I' + ? O_RDWR : (O_RDWR|O_EXCL)); + + tst = dup_super(st); + if (dfd < 0 || tst->ss->load_super(tst, dfd, NULL) != 0) { + pr_err("cannot re-read metadata from %s - aborting\n", + devname); + if (dfd >= 0) + close(dfd); + close(mdfd); + free(devices); + free(devmap); + tst->ss->free_super(tst); + free(tst); + *stp = st; + return -1; + } + tst->ss->getinfo_super(tst, content, devmap + devcnt * content->array.raid_disks); + + memcpy(content->uuid, ident->uuid, 16); + strcpy(content->name, ident->name); + content->array.md_minor = minor(stb2.st_rdev); + + if (strcmp(c->update, "byteorder") == 0) + err = 0; + else if (strcmp(c->update, "home-cluster") == 0) { + tst->cluster_name = c->homecluster; + err = tst->ss->write_bitmap(tst, dfd, NameUpdate); + } else if (strcmp(c->update, "nodes") == 0) { + tst->nodes = c->nodes; + err = tst->ss->write_bitmap(tst, dfd, NodeNumUpdate); + } else if (strcmp(c->update, "revert-reshape") == 0 && + c->invalid_backup) + err = tst->ss->update_super(tst, content, + "revert-reshape-nobackup", + devname, c->verbose, + ident->uuid_set, + c->homehost); + else + err = tst->ss->update_super(tst, content, c->update, + devname, c->verbose, + ident->uuid_set, + c->homehost); + if (err < 0) { + if (err == -1) + pr_err("--update=%s not understood for %s metadata\n", + c->update, tst->ss->name); + tst->ss->free_super(tst); + free(tst); + close(mdfd); + close(dfd); + free(devices); + free(devmap); + *stp = st; + return -1; + } + if (strcmp(c->update, "uuid")==0 && + !ident->uuid_set) { + ident->uuid_set = 1; + memcpy(ident->uuid, content->uuid, 16); + } + if (tst->ss->store_super(tst, dfd)) + pr_err("Could not re-write superblock on %s.\n", + devname); + + if (strcmp(c->update, "uuid")==0 && + ident->bitmap_fd >= 0 && !bitmap_done) { + if (bitmap_update_uuid(ident->bitmap_fd, + content->uuid, + tst->ss->swapuuid) != 0) + pr_err("Could not update uuid on external bitmap.\n"); + else + 
bitmap_done = 1; + } + } else { + dfd = dev_open(devname, + tmpdev->disposition == 'I' + ? O_RDWR : (O_RDWR|O_EXCL)); + tst = dup_super(st); + + if (dfd < 0 || tst->ss->load_super(tst, dfd, NULL) != 0) { + pr_err("cannot re-read metadata from %s - aborting\n", + devname); + if (dfd >= 0) + close(dfd); + close(mdfd); + free(devices); + free(devmap); + tst->ss->free_super(tst); + free(tst); + *stp = st; + return -1; + } + tst->ss->getinfo_super(tst, content, devmap + devcnt * content->array.raid_disks); + } + + fstat(dfd, &stb); + close(dfd); + + if (c->verbose > 0) + pr_err("%s is identified as a member of %s, slot %d%s.\n", + devname, mddev, content->disk.raid_disk, + (content->disk.state & (1<<MD_DISK_REPLACEMENT)) ? " replacement":""); + devices[devcnt].devname = devname; + devices[devcnt].uptodate = 0; + devices[devcnt].included = (tmpdev->disposition == 'I'); + devices[devcnt].i = *content; + devices[devcnt].i.disk.major = major(stb.st_rdev); + devices[devcnt].i.disk.minor = minor(stb.st_rdev); + + disk_state = devices[devcnt].i.disk.state & ~((1<<MD_DISK_FAILFAST) | + (1<<MD_DISK_WRITEMOSTLY)); + if (disk_state == ((1<<MD_DISK_ACTIVE) | (1<<MD_DISK_SYNC))) { + if (most_recent < 0 || + devices[devcnt].i.events + > devices[most_recent].i.events) { + struct supertype *tmp = tst; + tst = st; + st = tmp; + most_recent = devcnt; + } + } + tst->ss->free_super(tst); + free(tst); + + if (content->array.level == LEVEL_MULTIPATH) + /* with multipath, the raid_disk from the superblock is meaningless */ + i = devcnt; + else + i = devices[devcnt].i.disk.raid_disk; + if (i+1 == 0 || i == MD_DISK_ROLE_JOURNAL) { + if (nextspare < content->array.raid_disks*2) + nextspare = content->array.raid_disks*2; + i = nextspare++; + } else { + /* i is raid_disk - double it so there is room for + * replacements */ + i *= 2; + if (devices[devcnt].i.disk.state & (1<<MD_DISK_REPLACEMENT)) + i++; + if (i >= content->array.raid_disks*2 && + i >= nextspare) + nextspare = i+1; + } + if (i < 
10000) { + if (i >= bestcnt) { + int newbestcnt = i+10; + int *newbest = xmalloc(sizeof(int)*newbestcnt); + int c; + for (c=0; c < newbestcnt; c++) + if (c < bestcnt) + newbest[c] = best[c]; + else + newbest[c] = -1; + if (best)free(best); + best = newbest; + bestcnt = newbestcnt; + } + if (best[i] >=0 && + devices[best[i]].i.events == + devices[devcnt].i.events && + (devices[best[i]].i.disk.minor != + devices[devcnt].i.disk.minor) && + st->ss == &super0 && + content->array.level != LEVEL_MULTIPATH) { + /* two different devices with identical superblock. + * Could be a mis-detection caused by overlapping + * partitions. fail-safe. + */ + pr_err("WARNING %s and %s appear to have very similar superblocks.\n" + " If they are really different, please --zero the superblock on one\n" + " If they are the same or overlap, please remove one from %s.\n", + devices[best[i]].devname, devname, + inargv ? "the list" : + "the\n DEVICE list in mdadm.conf" + ); + close(mdfd); + free(devices); + free(devmap); + *stp = st; + return -1; + } + if (best[i] == -1 || (devices[best[i]].i.events + < devices[devcnt].i.events)) + best[i] = devcnt; + else if (st->ss == &super_imsm) + best[i+1] = devcnt; + } + devcnt++; + } + if (most_recent >= 0) + *most_recentp = most_recent; + *bestcntp = bestcnt; + *bestp = best; + *stp = st; + return devcnt; +} + +static int force_array(struct mdinfo *content, + struct devs *devices, + int *best, int bestcnt, char *avail, + int most_recent, + struct supertype *st, + struct context *c) +{ + int okcnt = 0; + while (!enough(content->array.level, content->array.raid_disks, + content->array.layout, 1, + avail) || + (content->reshape_active && content->delta_disks > 0 && + !enough(content->array.level, (content->array.raid_disks + - content->delta_disks), + content->new_layout, 1, avail))) { + /* Choose the newest best drive which is + * not up-to-date, update the superblock + * and add it. 
+ */ + int fd; + struct supertype *tst; + unsigned long long current_events; + int chosen_drive = -1; + int i; + + for (i = 0; + i < content->array.raid_disks * 2 && i < bestcnt; + i += 2) { + int j = best[i]; + if (j < 0) + continue; + if (devices[j].uptodate) + continue; + if (devices[j].i.recovery_start != MaxSector) { + int delta; + if (!devices[j].i.reshape_active || + devices[j].i.delta_disks <= 0) + continue; + /* When increasing number of devices, an + * added device also appears to be + * recovering. It is safe to include it + * as long as it won't be a source of + * data. + * For now, just allow for last data + * devices in RAID4 or last devices in RAID4/5/6. + */ + delta = devices[j].i.delta_disks; + if (devices[j].i.array.level >= 4 && + devices[j].i.array.level <= 6 && + i/2 >= content->array.raid_disks - delta) + /* OK */; + else if (devices[j].i.array.level == 4 && + i/2 >= content->array.raid_disks - delta - 1) + /* OK */; + else + continue; + } else if (devices[j].i.reshape_active != + content->reshape_active || + (devices[j].i.reshape_active && + devices[j].i.reshape_progress != + content->reshape_progress)) + /* Here, it may be a source of data. If two + * devices claim different progresses, it + * means that reshape boundaries differ for + * their own devices. Kernel will only treat + * the first one as reshape progress and + * go on. It may cause disaster, so avoid it. + */ + continue; + if (chosen_drive < 0 || + devices[j].i.events + > devices[chosen_drive].i.events) + chosen_drive = j; + } + if (chosen_drive < 0) + break; + current_events = devices[chosen_drive].i.events; + add_another: + if (c->verbose >= 0) + pr_err("forcing event count in %s(%d) from %d up to %d\n", + devices[chosen_drive].devname, + devices[chosen_drive].i.disk.raid_disk, + (int)(devices[chosen_drive].i.events), + (int)(devices[most_recent].i.events)); + fd = dev_open(devices[chosen_drive].devname, + devices[chosen_drive].included ? 
O_RDWR + : (O_RDWR|O_EXCL)); + if (fd < 0) { + pr_err("Couldn't open %s for write - not updating\n", + devices[chosen_drive].devname); + devices[chosen_drive].i.events = 0; + continue; + } + tst = dup_super(st); + if (tst->ss->load_super(tst,fd, NULL)) { + close(fd); + pr_err("RAID superblock disappeared from %s - not updating.\n", + devices[chosen_drive].devname); + devices[chosen_drive].i.events = 0; + continue; + } + content->events = devices[most_recent].i.events; + tst->ss->update_super(tst, content, "force-one", + devices[chosen_drive].devname, c->verbose, + 0, NULL); + + if (tst->ss->store_super(tst, fd)) { + close(fd); + pr_err("Could not re-write superblock on %s\n", + devices[chosen_drive].devname); + devices[chosen_drive].i.events = 0; + tst->ss->free_super(tst); + continue; + } + close(fd); + devices[chosen_drive].i.events = devices[most_recent].i.events; + devices[chosen_drive].uptodate = 1; + avail[chosen_drive] = 1; + okcnt++; + tst->ss->free_super(tst); + /* If there are any other drives of the same vintage, + * add them in as well. 
We can't lose and we might gain + */ + for (i = 0; + i < content->array.raid_disks * 2 && i < bestcnt ; + i += 2) { + int j = best[i]; + if (j >= 0 && + !devices[j].uptodate && + devices[j].i.recovery_start == MaxSector && + devices[j].i.events == current_events && + ((!devices[j].i.reshape_active && + !content->reshape_active) || + (devices[j].i.reshape_active == + content->reshape_active && + devices[j].i.reshape_progress == + content->reshape_progress))) { + chosen_drive = j; + goto add_another; + } + } + } + return okcnt; +} + +static int start_array(int mdfd, + char *mddev, + struct mdinfo *content, + struct supertype *st, + struct mddev_ident *ident, + int *best, int bestcnt, + int chosen_drive, + struct devs *devices, + unsigned int okcnt, + unsigned int sparecnt, + unsigned int rebuilding_cnt, + unsigned int journalcnt, + struct context *c, + int clean, char *avail, + int start_partial_ok, + int err_ok, + int was_forced + ) +{ + int rv; + int i; + unsigned int req_cnt; + + if (content->journal_device_required && (content->journal_clean == 0)) { + if (!c->force) { + pr_err("Not safe to assemble with missing or stale journal device, consider --force.\n"); + return 1; + } + pr_err("Journal is missing or stale, starting array read only.\n"); + c->readonly = 1; + } + + if (content->consistency_policy == CONSISTENCY_POLICY_PPL) + clean = 1; + + rv = set_array_info(mdfd, st, content); + if (rv && !err_ok) { + pr_err("failed to set array info for %s: %s\n", + mddev, strerror(errno)); + return 1; + } + if (ident->bitmap_fd >= 0) { + if (ioctl(mdfd, SET_BITMAP_FILE, ident->bitmap_fd) != 0) { + pr_err("SET_BITMAP_FILE failed.\n"); + return 1; + } + } else if (ident->bitmap_file) { + /* From config file */ + int bmfd = open(ident->bitmap_file, O_RDWR); + if (bmfd < 0) { + pr_err("Could not open bitmap file %s\n", + ident->bitmap_file); + return 1; + } + if (ioctl(mdfd, SET_BITMAP_FILE, bmfd) != 0) { + pr_err("Failed to set bitmapfile for %s\n", mddev); + close(bmfd); + 
return 1; + } + close(bmfd); + } + + /* First, add the raid disks, but add the chosen one last */ + for (i = 0; i <= bestcnt; i++) { + int j; + if (i < bestcnt) { + j = best[i]; + if (j == chosen_drive) + continue; + } else + j = chosen_drive; + + if (j >= 0 && !devices[j].included) { + int dfd; + + dfd = dev_open(devices[j].devname, O_RDWR|O_EXCL); + if (dfd >= 0) { + remove_partitions(dfd); + close(dfd); + } + rv = add_disk(mdfd, st, content, &devices[j].i); + + if (rv) { + pr_err("failed to add %s to %s: %s\n", + devices[j].devname, mddev, + strerror(errno)); + if (errno == EINVAL && content->array.level == 0 && + content->array.layout != 0) { + cont_err("Possibly your kernel doesn't support RAID0 layouts.\n"); + cont_err("Please upgrade.\n"); + } + if (i < content->array.raid_disks * 2 || + i == bestcnt) + okcnt--; + else + sparecnt--; + } else if (c->verbose > 0) { + pr_err("added %s to %s as %d%s%s\n", + devices[j].devname, mddev, + devices[j].i.disk.raid_disk, + devices[j].uptodate?"": + " (possibly out of date)", + (devices[j].i.disk.state & + (1<<MD_DISK_REPLACEMENT)) ? + " replacement":""); + } + } else if (j >= 0) { + if (c->verbose > 0) + pr_err("%s is already in %s as %d\n", + devices[j].devname, mddev, + devices[j].i.disk.raid_disk); + } else if (c->verbose > 0 && + i < content->array.raid_disks * 2 && (i & 1) == 0) + pr_err("no uptodate device for slot %d of %s\n", + i/2, mddev); + } + + if (content->array.level == LEVEL_CONTAINER) { + sysfs_rules_apply(mddev, content); + if (c->verbose >= 0) { + pr_err("Container %s has been assembled with %d drive%s", + mddev, okcnt + sparecnt + journalcnt, + okcnt + sparecnt + journalcnt == 1 ? 
"" : "s"); + if (okcnt < (unsigned)content->array.raid_disks) + fprintf(stderr, " (out of %d)\n", + content->array.raid_disks); + else + fprintf(stderr, "\n"); + } + + if (st->ss->validate_container) { + struct mdinfo *devices_list; + struct mdinfo *info_devices; + unsigned int count; + + devices_list = NULL; + info_devices = xmalloc(sizeof(struct mdinfo) * + (okcnt + sparecnt)); + for (count = 0; count < okcnt + sparecnt; count++) { + info_devices[count] = devices[count].i; + info_devices[count].next = devices_list; + devices_list = &info_devices[count]; + } + if (st->ss->validate_container(devices_list)) + pr_err("Mismatch detected!\n"); + free(info_devices); + } + + st->ss->free_super(st); + sysfs_uevent(content, "change"); + if (err_ok && okcnt < (unsigned)content->array.raid_disks) + /* Was partial, is still partial, so signal an error + * to ensure we don't retry */ + return 1; + return 0; + } + + /* Get number of in-sync devices according to the superblock. + * We must have this number to start the array without -s or -R + */ + req_cnt = content->array.working_disks; + + if (c->runstop == 1 || + (c->runstop <= 0 && + (enough(content->array.level, content->array.raid_disks, + content->array.layout, clean, avail) && + (okcnt + rebuilding_cnt >= req_cnt || start_partial_ok)))) { + /* This array is good-to-go. + * If a reshape is in progress then we might need to + * continue monitoring it. In that case we start + * it read-only and let the grow code make it writable. 
+ */ + int rv; + + if (content->reshape_active && + !(content->reshape_active & RESHAPE_NO_BACKUP) && + content->delta_disks <= 0) { + if (!c->backup_file) { + pr_err("%s: Need a backup file to complete reshape of this array.\n", + mddev); + pr_err("Please provided one with \"--backup-file=...\"\n"); + if (c->update && + strcmp(c->update, "revert-reshape") == 0) + pr_err("(Don't specify --update=revert-reshape again, that part succeeded.)\n"); + return 1; + } + rv = sysfs_set_str(content, NULL, + "array_state", "readonly"); + if (rv == 0) + rv = Grow_continue(mdfd, st, content, + c->backup_file, 0, + c->freeze_reshape); + } else if (c->readonly && + sysfs_attribute_available(content, NULL, + "array_state")) { + rv = sysfs_set_str(content, NULL, + "array_state", "readonly"); + } else + rv = ioctl(mdfd, RUN_ARRAY, NULL); + reopen_mddev(mdfd); /* drop O_EXCL */ + if (rv == 0) { + sysfs_rules_apply(mddev, content); + if (c->verbose >= 0) { + pr_err("%s has been started with %d drive%s", + mddev, okcnt, okcnt==1?"":"s"); + if (okcnt < (unsigned)content->array.raid_disks) + fprintf(stderr, " (out of %d)", + content->array.raid_disks); + if (rebuilding_cnt) + fprintf(stderr, "%s %d rebuilding", + sparecnt?",":" and", + rebuilding_cnt); + if (sparecnt) + fprintf(stderr, " and %d spare%s", + sparecnt, + sparecnt == 1 ? 
"" : "s"); + if (content->journal_clean) + fprintf(stderr, " and %d journal", + journalcnt); + fprintf(stderr, ".\n"); + } + if (content->reshape_active && + content->array.level >= 4 && + content->array.level <= 6) { + /* might need to increase the size + * of the stripe cache - default is 256 + */ + int chunk_size = content->array.chunk_size; + + if (content->reshape_active && + content->new_chunk > chunk_size) + chunk_size = content->new_chunk; + if (256 < 4 * ((chunk_size+4065)/4096)) { + struct mdinfo *sra; + + sra = sysfs_read(mdfd, NULL, 0); + if (sra) + sysfs_set_num(sra, NULL, + "stripe_cache_size", + (4 * chunk_size / 4096) + 1); + sysfs_free(sra); + } + } + if (okcnt < (unsigned)content->array.raid_disks) { + /* If any devices did not get added + * because the kernel rejected them based + * on event count, try adding them + * again providing the action policy is + * 're-add' or greater. The bitmap + * might allow them to be included, or + * they will become spares. + */ + for (i = 0; i < bestcnt; i++) { + int j = best[i]; + if (j >= 0 && !devices[j].uptodate) { + if (!disk_action_allows(&devices[j].i, st->ss->name, act_re_add)) + continue; + rv = add_disk(mdfd, st, content, + &devices[j].i); + if (rv == 0 && c->verbose >= 0) + pr_err("%s has been re-added.\n", + devices[j].devname); + } + } + } + if (content->array.level == 6 && + okcnt + 1 == (unsigned)content->array.raid_disks && + was_forced) { + struct mdinfo *sra; + + sra = sysfs_read(mdfd, NULL, 0); + if (sra) + sysfs_set_str(sra, NULL, + "sync_action", "repair"); + sysfs_free(sra); + } + return 0; + } + pr_err("failed to RUN_ARRAY %s: %s\n", mddev, strerror(errno)); + if (errno == 524 /* ENOTSUP */ && + content->array.level == 0 && content->array.layout == 0) + cont_err("Please use --update=layout-original or --update=layout-alternate\n"); + + if (!enough(content->array.level, content->array.raid_disks, + content->array.layout, 1, avail)) + pr_err("Not enough devices to start the array.\n"); + 
else if (!enough(content->array.level, + content->array.raid_disks, + content->array.layout, clean, avail)) + pr_err("Not enough devices to start the array while not clean - consider --force.\n"); + + return 1; + } + if (c->runstop == -1) { + pr_err("%s assembled from %d drive%s", + mddev, okcnt, okcnt == 1 ? "" : "s"); + if (okcnt != (unsigned)content->array.raid_disks) + fprintf(stderr, " (out of %d)", + content->array.raid_disks); + fprintf(stderr, ", but not started.\n"); + return 2; + } + if (c->verbose >= -1) { + pr_err("%s assembled from %d drive%s", + mddev, okcnt, okcnt == 1 ? "" : "s"); + if (rebuilding_cnt) + fprintf(stderr, "%s %d rebuilding", + sparecnt ? "," : " and", rebuilding_cnt); + if (sparecnt) + fprintf(stderr, " and %d spare%s", sparecnt, + sparecnt == 1 ? "" : "s"); + if (!enough(content->array.level, content->array.raid_disks, + content->array.layout, 1, avail)) + fprintf(stderr, " - not enough to start the array.\n"); + else if (!enough(content->array.level, + content->array.raid_disks, + content->array.layout, clean, avail)) + fprintf(stderr, " - not enough to start the array while not clean - consider --force.\n"); + else { + if (req_cnt == (unsigned)content->array.raid_disks) + fprintf(stderr, " - need all %d to start it", + req_cnt); + else + fprintf(stderr, " - need %d to start", req_cnt); + fprintf(stderr, " (use --run to insist).\n"); + } + } + return 1; +} + +int Assemble(struct supertype *st, char *mddev, + struct mddev_ident *ident, + struct mddev_dev *devlist, + struct context *c) +{ + /* + * The task of Assemble is to find a collection of + * devices that should (according to their superblocks) + * form an array, and to give this collection to the MD driver. + * In Linux-2.4 and later, this involves submitting a + * SET_ARRAY_INFO ioctl with no arg - to prepare + * the array - and then submit a number of + * ADD_NEW_DISK ioctls to add disks into + * the array. Finally RUN_ARRAY might + * be submitted to start the array. 
+ * + * Much of the work of Assemble is in finding and/or + * checking the disks to make sure they look right. + * + * If mddev is not set, then scan must be set and we + * read through the config file for dev+uuid mapping + * We recurse, setting mddev, for each device that + * - isn't running + * - has a valid uuid (or any uuid if !uuidset) + * + * If mddev is set, we try to determine state of md. + * check version - must be at least 0.90.0 + * check kernel version. must be at least 2.4. + * If not, we can possibly fall back on START_ARRAY + * Try to GET_ARRAY_INFO. + * If possible, give up + * If not, try to STOP_ARRAY just to make sure + * + * If !uuidset and scan, look in conf-file for uuid + * If not found, give up + * If !devlist and scan and uuidset, get list of devs from conf-file + * + * For each device: + * Check superblock - discard if bad + * Check uuid (set if we don't have one) - discard if no match + * Check superblock similarity if we have a superblock - discard if different + * Record events, devicenum + * This should give us a list of devices for the array + * We should collect the most recent event number + * + * Count disks with recent enough event count + * While force && !enough disks + * Choose newest rejected disks, update event count + * mark clean and rewrite superblock + * If recent kernel: + * SET_ARRAY_INFO + * foreach device with recent events : ADD_NEW_DISK + * if runstop == 1 || "enough" disks and runstop==0 -> RUN_ARRAY + * If old kernel: + * Check the device numbers in superblock are right + * update superblock if any changes + * START_ARRAY + * + */ + int rv = -1; + int mdfd = -1; + int clean; + int auto_assem = (mddev == NULL && !ident->uuid_set && + ident->super_minor == UnSet && ident->name[0] == 0 && + (ident->container == NULL || ident->member == NULL)); + struct devs *devices = NULL; + char *devmap; + int *best = NULL; /* indexed by raid_disk */ + int bestcnt = 0; + int devcnt; + unsigned int okcnt, sparecnt, rebuilding_cnt, 
replcnt, journalcnt; + int journal_clean = 0; + int i; + int was_forced = 0; + int most_recent = 0; + int chosen_drive; + int change = 0; + int inargv = 0; + int start_partial_ok = (c->runstop >= 0) && + (c->force || devlist==NULL || auto_assem); + int num_devs; + struct mddev_dev *tmpdev; + struct mdinfo info; + struct mdinfo *content = NULL; + struct mdinfo *pre_exist = NULL; + char *avail; + char *name = NULL; + char chosen_name[1024]; + struct map_ent *map = NULL; + struct map_ent *mp; + + /* + * If any subdevs are listed, then any that don't + * match ident are discarded. Remainder must all match and + * become the array. + * If no subdevs, then we scan all devices in the config file, but + * there must be something in the identity + */ + + if (!devlist && + ident->uuid_set == 0 && + (ident->super_minor < 0 || ident->super_minor == UnSet) && + ident->name[0] == 0 && + (ident->container == NULL || ident->member == NULL) && + ident->devices == NULL) { + pr_err("No identity information available for %s - cannot assemble.\n", + mddev ? mddev : "further assembly"); + return 1; + } + + if (devlist == NULL) + devlist = conf_get_devs(); + else if (mddev) + inargv = 1; + +try_again: + /* We come back here when doing auto-assembly and attempting some + * set of devices failed. Those are now marked as ->used==2 and + * we ignore them and try again + */ + if (!st && ident->st) + st = ident->st; + if (c->verbose>0) + pr_err("looking for devices for %s\n", + mddev ? mddev : "further assembly"); + + content = &info; + if (st && c->force) + st->ignore_hw_compat = 1; + num_devs = select_devices(devlist, ident, &st, &content, c, + inargv, auto_assem); + if (num_devs < 0) + return 1; + + if (!st || !st->sb || !content) + return 2; + + /* We have a full set of devices - we now need to find the + * array device. 
+ * However there is a risk that we are racing with "mdadm -I" + * and the array is already partially assembled - we will have + * rejected any devices already in this address. + * So we take a lock on the map file - to prevent further races - + * and look for the uuid in there. If found and the array is + * active, we abort. If found and the array is not active + * we commit to that md device and add all the contained devices + * to our list. We flag them so that we don't try to re-add, + * but can remove if they turn out to not be wanted. + */ + if (map_lock(&map)) + pr_err("failed to get exclusive lock on mapfile - continue anyway...\n"); + if (c->update && strcmp(c->update,"uuid") == 0) + mp = NULL; + else + mp = map_by_uuid(&map, content->uuid); + if (mp) { + struct mdinfo *dv; + /* array already exists. */ + pre_exist = sysfs_read(-1, mp->devnm, GET_LEVEL|GET_DEVS); + if (pre_exist->array.level != UnSet) { + pr_err("Found some drive for an array that is already active: %s\n", + mp->path); + pr_err("giving up.\n"); + goto out; + } + for (dv = pre_exist->devs; dv; dv = dv->next) { + /* We want to add this device to our list, + * but it could already be there if "mdadm -I" + * started *after* we checked for O_EXCL. + * If we add it to the top of the list + * it will be preferred over later copies. 
+ */ + struct mddev_dev *newdev; + char *devname = map_dev(dv->disk.major, + dv->disk.minor, + 0); + if (!devname) + continue; + newdev = xmalloc(sizeof(*newdev)); + newdev->devname = devname; + newdev->disposition = 'I'; + newdev->used = 1; + newdev->next = devlist; + devlist = newdev; + num_devs++; + } + strcpy(chosen_name, mp->path); + if (c->verbose > 0 || mddev == NULL || + strcmp(mddev, chosen_name) != 0) + pr_err("Merging with already-assembled %s\n", + chosen_name); + mdfd = open_dev_excl(mp->devnm); + } else { + int trustworthy = FOREIGN; + name = content->name; + switch (st->ss->match_home(st, c->homehost) + ?: st->ss->match_home(st, "any")) { + case 1: + trustworthy = LOCAL; + name = strchr(content->name, ':'); + if (name) + name++; + else + name = content->name; + break; + } + if (mddev && map_by_name(&map, mddev) != NULL) { + pr_err("Cannot create device with %s because is in use\n", mddev); + goto out; + } + if (!auto_assem) + /* If the array is listed in mdadm.conf or on + * command line, then we trust the name + * even if the array doesn't look local + */ + trustworthy = LOCAL; + + if (name[0] == 0 && + content->array.level == LEVEL_CONTAINER) { + name = content->text_version; + trustworthy = METADATA; + } + + if (name[0] && trustworthy != LOCAL && + ! 
c->require_homehost && + conf_name_is_free(name)) + trustworthy = LOCAL; + + if (trustworthy == LOCAL && + strchr(name, ':')) + /* Ignore 'host:' prefix of name */ + name = strchr(name, ':')+1; + + mdfd = create_mddev(mddev, name, ident->autof, trustworthy, + chosen_name, 0); + } + if (mdfd < 0) { + st->ss->free_super(st); + if (auto_assem) + goto try_again; + goto out; + } + mddev = chosen_name; + if (pre_exist == NULL) { + if (mddev_busy(fd2devnm(mdfd))) { + pr_err("%s already active, cannot restart it!\n", + mddev); + for (tmpdev = devlist ; + tmpdev && tmpdev->used != 1; + tmpdev = tmpdev->next) + ; + if (tmpdev && auto_assem) + pr_err("%s needed for %s...\n", + mddev, tmpdev->devname); + close(mdfd); + mdfd = -3; + st->ss->free_super(st); + if (auto_assem) + goto try_again; + goto out; + } + /* just incase it was started but has no content */ + ioctl(mdfd, STOP_ARRAY, NULL); + } + + if (content != &info) { + /* This is a member of a container. Try starting the array. */ + int err; + err = assemble_container_content(st, mdfd, content, c, + chosen_name, NULL); + close(mdfd); + return err; + } + + /* Ok, no bad inconsistancy, we can try updating etc */ + devices = xcalloc(num_devs, sizeof(*devices)); + devmap = xcalloc(num_devs, content->array.raid_disks); + devcnt = load_devices(devices, devmap, ident, &st, devlist, + c, content, mdfd, mddev, + &most_recent, &bestcnt, &best, inargv); + if (devcnt < 0) { + mdfd = -3; + /* + * devices is already freed in load_devices, so set devices + * to NULL to avoid double free devices. + */ + devices = NULL; + goto out; + } + + if (devcnt == 0) { + pr_err("no devices found for %s\n", + mddev); + if (st) + st->ss->free_super(st); + free(devmap); + goto out; + } + + if (c->update && strcmp(c->update, "byteorder")==0) + st->minor_version = 90; + + st->ss->getinfo_super(st, content, NULL); + clean = content->array.state & 1; + + /* now we have some devices that might be suitable. 
+ * I wonder how many + */ + avail = xcalloc(content->array.raid_disks, 1); + okcnt = 0; + replcnt = 0; + sparecnt=0; + journalcnt=0; + rebuilding_cnt=0; + for (i=0; i< bestcnt; i++) { + int j = best[i]; + int event_margin = 1; /* always allow a difference of '1' + * like the kernel does + */ + if (j < 0) continue; + /* note: we ignore error flags in multipath arrays + * as they don't make sense + */ + if (content->array.level != LEVEL_MULTIPATH) { + if (devices[j].i.disk.state & (1<<MD_DISK_JOURNAL)) { + if (content->journal_device_required) + journalcnt++; + else /* unexpected journal, mark as faulty */ + devices[j].i.disk.state |= (1<<MD_DISK_FAULTY); + } else if (!(devices[j].i.disk.state & (1<<MD_DISK_ACTIVE))) { + if (!(devices[j].i.disk.state + & (1<<MD_DISK_FAULTY))) { + devices[j].uptodate = 1; + sparecnt++; + } + continue; + } + } + /* If this device thinks that 'most_recent' has failed, then + * we must reject this device. + */ + if (j != most_recent && !c->force && + content->array.raid_disks > 0 && + devices[most_recent].i.disk.raid_disk >= 0 && + devmap[j * content->array.raid_disks + devices[most_recent].i.disk.raid_disk] == 0) { + if (c->verbose > -1) + pr_err("ignoring %s as it reports %s as failed\n", + devices[j].devname, devices[most_recent].devname); + best[i] = -1; + continue; + } + /* Require event counter to be same as, or just less than, + * most recent. If it is bigger, it must be a stray spare and + * should be ignored. 
+ */ + if (devices[j].i.events+event_margin >= + devices[most_recent].i.events && + devices[j].i.events <= + devices[most_recent].i.events + ) { + devices[j].uptodate = 1; + if (devices[j].i.disk.state & (1<<MD_DISK_JOURNAL)) + journal_clean = 1; + if (i < content->array.raid_disks * 2) { + if (devices[j].i.recovery_start == MaxSector || + (content->reshape_active && + i >= content->array.raid_disks - content->delta_disks)) { + if (!avail[i/2]) { + okcnt++; + avail[i/2]=1; + } else + replcnt++; + } else + rebuilding_cnt++; + } else if (devices[j].i.disk.raid_disk != MD_DISK_ROLE_JOURNAL) + sparecnt++; + } + } + free(devmap); + if (c->force) { + int force_ok = force_array(content, devices, best, bestcnt, + avail, most_recent, st, c); + okcnt += force_ok; + if (force_ok) + was_forced = 1; + } + /* Now we want to look at the superblock which the kernel will base things on + * and compare the devices that we think are working with the devices that the + * superblock thinks are working. + * If there are differences and --force is given, then update this chosen + * superblock. + */ + chosen_drive = -1; + st->ss->free_super(st); + for (i=0; chosen_drive < 0 && i<bestcnt; i+=2) { + int j = best[i]; + int fd; + + if (j<0) + continue; + if (!devices[j].uptodate) + continue; + if (devices[j].i.events < devices[most_recent].i.events) + continue; + chosen_drive = j; + if ((fd=dev_open(devices[j].devname, + devices[j].included ? 
O_RDONLY + : (O_RDONLY|O_EXCL)))< 0) { + pr_err("Cannot open %s: %s\n", + devices[j].devname, strerror(errno)); + goto out; + } + if (st->ss->load_super(st,fd, NULL)) { + close(fd); + pr_err("RAID superblock has disappeared from %s\n", + devices[j].devname); + goto out; + } + close(fd); + } + if (st->sb == NULL) { + pr_err("No suitable drives found for %s\n", mddev); + goto out; + } + st->ss->getinfo_super(st, content, NULL); + if (sysfs_init(content, mdfd, NULL)) { + pr_err("Unable to initialize sysfs\n"); + goto out; + } + + /* after reload context, store journal_clean in context */ + content->journal_clean = journal_clean; + for (i=0; i<bestcnt; i++) { + int j = best[i]; + unsigned int desired_state; + + if (j < 0) + continue; + if (devices[j].i.disk.raid_disk == MD_DISK_ROLE_JOURNAL) + desired_state = (1<<MD_DISK_JOURNAL); + else if (i >= content->array.raid_disks * 2) + desired_state = 0; + else if (i & 1) + desired_state = (1<<MD_DISK_ACTIVE) | (1<<MD_DISK_REPLACEMENT); + else + desired_state = (1<<MD_DISK_ACTIVE) | (1<<MD_DISK_SYNC); + + desired_state |= devices[j].i.disk.state & ((1<<MD_DISK_FAILFAST) | + (1<<MD_DISK_WRITEMOSTLY)); + + if (!devices[j].uptodate) + continue; + + devices[j].i.disk.state = desired_state; + if (!(devices[j].i.array.state & 1)) + clean = 0; + + if (st->ss->update_super(st, &devices[j].i, "assemble", NULL, + c->verbose, 0, NULL)) { + if (c->force) { + if (c->verbose >= 0) + pr_err("clearing FAULTY flag for device %d in %s for %s\n", + j, mddev, devices[j].devname); + change = 1; + } else { + if (c->verbose >= -1) + pr_err("device %d in %s has wrong state in superblock, but %s seems ok\n", + i, mddev, devices[j].devname); + } + } +#if 0 + if (!(super.disks[i].i.disk.state & (1 << MD_DISK_FAULTY))) { + pr_err("devices %d of %s is not marked FAULTY in superblock, but cannot be found\n", + i, mddev); + } +#endif + } + if (c->force && !clean && + !enough(content->array.level, content->array.raid_disks, + content->array.layout, clean, + 
avail)) { + change += st->ss->update_super(st, content, "force-array", + devices[chosen_drive].devname, c->verbose, + 0, NULL); + was_forced = 1; + clean = 1; + } + + if (change) { + int fd; + fd = dev_open(devices[chosen_drive].devname, + devices[chosen_drive].included ? + O_RDWR : (O_RDWR|O_EXCL)); + if (fd < 0) { + pr_err("Could not open %s for write - cannot Assemble array.\n", + devices[chosen_drive].devname); + goto out; + } + if (st->ss->store_super(st, fd)) { + close(fd); + pr_err("Could not re-write superblock on %s\n", + devices[chosen_drive].devname); + goto out; + } + if (c->verbose >= 0) + pr_err("Marking array %s as 'clean'\n", + mddev); + close(fd); + } + + /* If we are in the middle of a reshape we may need to restore saved data + * that was moved aside due to the reshape overwriting live data + * The code of doing this lives in Grow.c + */ + if (content->reshape_active && + !(content->reshape_active & RESHAPE_NO_BACKUP)) { + int err = 0; + int *fdlist = xmalloc(sizeof(int)* bestcnt); + if (c->verbose > 0) + pr_err("%s has an active reshape - checking if critical section needs to be restored\n", + chosen_name); + if (!c->backup_file) + c->backup_file = locate_backup(content->sys_name); + enable_fds(bestcnt/2); + for (i = 0; i < bestcnt/2; i++) { + int j = best[i*2]; + if (j >= 0) { + fdlist[i] = dev_open(devices[j].devname, + devices[j].included + ? 
O_RDWR : (O_RDWR|O_EXCL)); + if (fdlist[i] < 0) { + pr_err("Could not open %s for write - cannot Assemble array.\n", + devices[j].devname); + err = 1; + break; + } + } else + fdlist[i] = -1; + } + if (!err) { + if (st->ss->external && st->ss->recover_backup) + err = st->ss->recover_backup(st, content); + else + err = Grow_restart(st, content, fdlist, bestcnt/2, + c->backup_file, c->verbose > 0); + if (err && c->invalid_backup) { + if (c->verbose > 0) + pr_err("continuing without restoring backup\n"); + err = 0; + } + } + while (i>0) { + i--; + if (fdlist[i]>=0) close(fdlist[i]); + } + free(fdlist); + if (err) { + pr_err("Failed to restore critical section for reshape, sorry.\n"); + if (c->backup_file == NULL) + cont_err("Possibly you needed to specify the --backup-file\n"); + goto out; + } + } + + /* Almost ready to actually *do* something */ + /* First, fill in the map, so that udev can find our name + * as soon as we become active. + */ + if (c->update && strcmp(c->update, "metadata")==0) { + content->array.major_version = 1; + content->array.minor_version = 0; + strcpy(content->text_version, "1.0"); + } + + map_update(&map, fd2devnm(mdfd), content->text_version, + content->uuid, chosen_name); + + rv = start_array(mdfd, mddev, content, + st, ident, best, bestcnt, + chosen_drive, devices, okcnt, sparecnt, + rebuilding_cnt, journalcnt, + c, + clean, avail, start_partial_ok, + pre_exist != NULL, + was_forced); + if (rv == 1 && !pre_exist) + ioctl(mdfd, STOP_ARRAY, NULL); + free(devices); +out: + map_unlock(&map); + if (rv == 0) { + wait_for(chosen_name, mdfd); + close(mdfd); + if (auto_assem) { + int usecs = 1; + /* There is a nasty race with 'mdadm --monitor'. + * If it opens this device before we close it, + * it gets an incomplete open on which IO + * doesn't work and the capacity is + * wrong. + * If we reopen (to check for layered devices) + * before --monitor closes, we loose. + * + * So: wait upto 1 second for there to be + * a non-zero capacity. 
+ */ + while (usecs < 1000) { + mdfd = open(mddev, O_RDONLY); + if (mdfd >= 0) { + unsigned long long size; + if (get_dev_size(mdfd, NULL, &size) && + size > 0) + break; + close(mdfd); + } + usleep(usecs); + usecs <<= 1; + } + } + } else if (mdfd >= 0) + close(mdfd); + + /* '2' means 'OK, but not started yet' */ + if (rv == -1) { + free(devices); + return 1; + } + return rv == 2 ? 0 : rv; +} + +int assemble_container_content(struct supertype *st, int mdfd, + struct mdinfo *content, struct context *c, + char *chosen_name, int *result) +{ + struct mdinfo *dev, *sra, *dev2; + struct assembly_array_info array = {chosen_name, 0, 0, 0}; + int old_raid_disks; + int start_reshape; + char *avail; + int err; + int is_raid456, is_clean, all_disks; + + if (sysfs_init(content, mdfd, NULL)) { + pr_err("Unable to initialize sysfs\n"); + return 1; + } + + sra = sysfs_read(mdfd, NULL, GET_VERSION|GET_DEVS); + if (sra == NULL || strcmp(sra->text_version, content->text_version) != 0) { + if (content->array.major_version == -1 && + content->array.minor_version == -2 && + c->readonly && + content->text_version[0] == '/') + content->text_version[0] = '-'; + if (sysfs_set_array(content, 9003) != 0) { + sysfs_free(sra); + return 1; + } + } + + /* There are two types of reshape: container wide or sub-array specific + * Check if metadata requests blocking container wide reshapes + */ + start_reshape = (content->reshape_active && + !((content->reshape_active == CONTAINER_RESHAPE) && + (content->array.state & (1<<MD_SB_BLOCK_CONTAINER_RESHAPE)))); + + /* Block subarray here if it is under reshape now + * Do not allow for any changes in this array + */ + if (st->ss->external && content->recovery_blocked && start_reshape) + block_subarray(content); + + for (dev2 = sra->devs; dev2; dev2 = dev2->next) { + for (dev = content->devs; dev; dev = dev->next) + if (dev2->disk.major == dev->disk.major && + dev2->disk.minor == dev->disk.minor) + break; + if (dev) + continue; + /* Don't want this one any 
more */ + if (sysfs_set_str(sra, dev2, "slot", "none") < 0 && + errno == EBUSY) { + pr_err("Cannot remove old device %s: not updating %s\n", dev2->sys_name, sra->sys_name); + sysfs_free(sra); + return 1; + } + sysfs_set_str(sra, dev2, "state", "remove"); + } + old_raid_disks = content->array.raid_disks - content->delta_disks; + avail = xcalloc(content->array.raid_disks, 1); + for (dev = content->devs; dev; dev = dev->next) { + if (dev->disk.raid_disk >= 0) + avail[dev->disk.raid_disk] = 1; + if (sysfs_add_disk(content, dev, 1) == 0) { + if (dev->disk.raid_disk >= old_raid_disks && + content->reshape_active) + array.exp_cnt++; + else + array.new_cnt++; + } else if (errno == EEXIST) + array.preexist_cnt++; + } + sysfs_free(sra); + + all_disks = array.new_cnt + array.exp_cnt + array.preexist_cnt; + + map_update(NULL, fd2devnm(mdfd), content->text_version, + content->uuid, chosen_name); + + if (content->consistency_policy == CONSISTENCY_POLICY_PPL && + st->ss->validate_ppl) { + content->array.state |= 1; + err = 0; + + for (dev = content->devs; dev; dev = dev->next) { + int dfd; + char *devpath; + int ret; + + ret = st->ss->validate_ppl(st, content, dev); + if (ret == 0) + continue; + + if (ret < 0) { + err = 1; + break; + } + + if (!c->force) { + pr_err("%s contains invalid PPL - consider --force or --update-subarray with --update=no-ppl\n", + chosen_name); + content->array.state &= ~1; + avail[dev->disk.raid_disk] = 0; + break; + } + + /* have --force - overwrite the invalid ppl */ + devpath = map_dev(dev->disk.major, dev->disk.minor, 0); + dfd = dev_open(devpath, O_RDWR); + if (dfd < 0) { + pr_err("Failed to open %s\n", devpath); + err = 1; + break; + } + + err = st->ss->write_init_ppl(st, content, dfd); + close(dfd); + + if (err) + break; + } + + if (err) { + free(avail); + return err; + } + } else if (c->force) { + /* Set the array as 'clean' so that we can proceed with starting + * it even if we don't have all devices. 
Mdmon doesn't care + * if the dirty flag is set in metadata, it will start managing + * it anyway. + * This is really important for raid456 (RWH case), other levels + * are started anyway. + */ + content->array.state |= 1; + } + + is_raid456 = (content->array.level >= 4 && content->array.level <= 6); + is_clean = content->array.state & 1; + + if (enough(content->array.level, content->array.raid_disks, + content->array.layout, is_clean, avail) == 0) { + set_array_assembly_status(c, result, INCR_NO, &array); + + if (c->verbose >= 0 && is_raid456 && !is_clean) + pr_err("Consider --force to start dirty degraded array\n"); + + free(avail); + return 1; + } + free(avail); + + if (c->runstop <= 0 && all_disks < content->array.working_disks) { + + set_array_assembly_status(c, result, INCR_UNSAFE, &array); + + if (c->verbose >= 0 && c->force) + pr_err("Consider --run to start array as degraded.\n"); + return 1; + } + + if (is_raid456 && content->resync_start != MaxSector && c->force && + all_disks < content->array.raid_disks) { + + content->resync_start = MaxSector; + err = sysfs_set_num(content, NULL, "resync_start", MaxSector); + if (err) + return 1; + + pr_err("%s array state forced to clean. It may cause data corruption.\n", + chosen_name); + } + + /* + * Before activating the array, perform extra steps required + * to configure the internal write-intent bitmap. + */ + if (content->consistency_policy == CONSISTENCY_POLICY_BITMAP && + st->ss->set_bitmap) + st->ss->set_bitmap(st, content); + + if (start_reshape) { + int spare = content->array.raid_disks + array.exp_cnt; + if (restore_backup(st, content, + array.new_cnt, + spare, &c->backup_file, c->verbose) == 1) + return 1; + + if (content->reshape_progress == 0) { + /* If reshape progress is 0 - we are assembling the + * array that was stopped, before reshape has started. + * Array needs to be started as active, Grow_continue() + * will start the reshape. 
+ */ + sysfs_set_num(content, NULL, "reshape_position", + MaxSector); + err = sysfs_set_str(content, NULL, + "array_state", "active"); + sysfs_set_num(content, NULL, "reshape_position", 0); + } else { + err = sysfs_set_str(content, NULL, + "array_state", "readonly"); + } + + if (err) + return 1; + + if (st->ss->external) { + if (!mdmon_running(st->container_devnm)) + start_mdmon(st->container_devnm); + ping_monitor(st->container_devnm); + if (mdmon_running(st->container_devnm) && + st->update_tail == NULL) + st->update_tail = &st->updates; + } + + err = Grow_continue(mdfd, st, content, c->backup_file, + 0, c->freeze_reshape); + } else switch(content->array.level) { + case LEVEL_LINEAR: + case LEVEL_MULTIPATH: + case 0: + err = sysfs_set_str(content, NULL, "array_state", + c->readonly ? "readonly" : "active"); + break; + default: + err = sysfs_set_str(content, NULL, "array_state", + "readonly"); + /* start mdmon if needed. */ + if (!err) { + if (!mdmon_running(st->container_devnm)) + start_mdmon(st->container_devnm); + ping_monitor(st->container_devnm); + } + break; + } + if (!err) + sysfs_set_safemode(content, content->safe_mode_delay); + + /* Block subarray here if it is not reshaped now + * It has be blocked a little later to allow mdmon to switch in + * in to R/W state + */ + if (st->ss->external && content->recovery_blocked && + !start_reshape) + block_subarray(content); + + if (err) + set_array_assembly_status(c, result, INCR_NO, &array); + else { + set_array_assembly_status(c, result, INCR_YES, &array); + wait_for(chosen_name, mdfd); + sysfs_rules_apply(chosen_name, content); + } + + return err; + /* FIXME should have an O_EXCL and wait for read-auto */ +} @@ -0,0 +1,227 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. 
+ * + * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +#include "mdadm.h" + +int Build(char *mddev, struct mddev_dev *devlist, + struct shape *s, struct context *c) +{ + /* Build a linear or raid0 arrays without superblocks + * We cannot really do any checks, we just do it. + * For md_version < 0.90.0, we call REGISTER_DEV + * with the device numbers, and then + * START_MD giving the "geometry" + * geometry is 0xpp00cc + * where pp is personality: 1==linear, 2=raid0 + * cc = chunk size factor: 0==4k, 1==8k etc. 
+ */ + int i; + dev_t rdev; + int subdevs = 0, missing_disks = 0; + struct mddev_dev *dv; + int bitmap_fd; + unsigned long long bitmapsize; + int mdfd; + char chosen_name[1024]; + int uuid[4] = {0,0,0,0}; + struct map_ent *map = NULL; + mdu_array_info_t array; + mdu_param_t param; /* not used by syscall */ + + if (s->level == UnSet) { + pr_err("a RAID level is needed to Build an array.\n"); + return 1; + } + /* scan all devices, make sure they really are block devices */ + for (dv = devlist; dv; dv=dv->next) { + subdevs++; + if (strcmp("missing", dv->devname) == 0) { + missing_disks++; + continue; + } + if (!stat_is_blkdev(dv->devname, NULL)) + return 1; + } + + if (s->raiddisks != subdevs) { + pr_err("requested %d devices in array but listed %d\n", + s->raiddisks, subdevs); + return 1; + } + + if (s->layout == UnSet) + switch(s->level) { + default: /* no layout */ + s->layout = 0; + break; + case 10: + s->layout = 0x102; /* near=2, far=1 */ + if (c->verbose > 0) + pr_err("layout defaults to n1\n"); + break; + case 5: + case 6: + s->layout = map_name(r5layout, "default"); + if (c->verbose > 0) + pr_err("layout defaults to %s\n", map_num(r5layout, s->layout)); + break; + case LEVEL_FAULTY: + s->layout = map_name(faultylayout, "default"); + + if (c->verbose > 0) + pr_err("layout defaults to %s\n", map_num(faultylayout, s->layout)); + break; + } + + /* We need to create the device. It can have no name. 
*/ + map_lock(&map); + mdfd = create_mddev(mddev, NULL, c->autof, LOCAL, + chosen_name, 0); + if (mdfd < 0) { + map_unlock(&map); + return 1; + } + mddev = chosen_name; + + map_update(&map, fd2devnm(mdfd), "none", uuid, chosen_name); + map_unlock(&map); + + array.level = s->level; + if (s->size == MAX_SIZE) + s->size = 0; + array.size = s->size; + array.nr_disks = s->raiddisks; + array.raid_disks = s->raiddisks; + array.md_minor = 0; + if (fstat_is_blkdev(mdfd, mddev, &rdev)) + array.md_minor = minor(rdev); + array.not_persistent = 1; + array.state = 0; /* not clean, but no errors */ + if (s->assume_clean) + array.state |= 1; + array.active_disks = s->raiddisks - missing_disks; + array.working_disks = s->raiddisks - missing_disks; + array.spare_disks = 0; + array.failed_disks = missing_disks; + if (s->chunk == 0 && (s->level==0 || s->level==LEVEL_LINEAR)) + s->chunk = 64; + array.chunk_size = s->chunk*1024; + array.layout = s->layout; + if (md_set_array_info(mdfd, &array)) { + pr_err("md_set_array_info() failed for %s: %s\n", + mddev, strerror(errno)); + goto abort; + } + + if (s->bitmap_file && strcmp(s->bitmap_file, "none") == 0) + s->bitmap_file = NULL; + if (s->bitmap_file && s->level <= 0) { + pr_err("bitmaps not meaningful with level %s\n", + map_num(pers, s->level)?:"given"); + goto abort; + } + /* now add the devices */ + for ((i=0), (dv = devlist) ; dv ; i++, dv=dv->next) { + mdu_disk_info_t disk; + unsigned long long dsize; + int fd; + + if (strcmp("missing", dv->devname) == 0) + continue; + if (!stat_is_blkdev(dv->devname, &rdev)) + goto abort; + fd = open(dv->devname, O_RDONLY|O_EXCL); + if (fd < 0) { + pr_err("Cannot open %s: %s\n", + dv->devname, strerror(errno)); + goto abort; + } + if (get_dev_size(fd, NULL, &dsize) && + (s->size == 0 || s->size == MAX_SIZE || dsize < s->size)) + s->size = dsize; + close(fd); + disk.number = i; + disk.raid_disk = i; + disk.state = (1<<MD_DISK_SYNC) | (1<<MD_DISK_ACTIVE); + if (dv->writemostly == FlagSet) + 
disk.state |= 1<<MD_DISK_WRITEMOSTLY; + disk.major = major(rdev); + disk.minor = minor(rdev); + if (ioctl(mdfd, ADD_NEW_DISK, &disk)) { + pr_err("ADD_NEW_DISK failed for %s: %s\n", + dv->devname, strerror(errno)); + goto abort; + } + } + /* now to start it */ + if (s->bitmap_file) { + bitmap_fd = open(s->bitmap_file, O_RDWR); + if (bitmap_fd < 0) { + int major = BITMAP_MAJOR_HI; +#if 0 + if (s->bitmap_chunk == UnSet) { + pr_err("%s cannot be opened.\n", s->bitmap_file); + goto abort; + } +#endif + bitmapsize = s->size >> 9; /* FIXME wrong for RAID10 */ + if (CreateBitmap(s->bitmap_file, 1, NULL, + s->bitmap_chunk, c->delay, + s->write_behind, bitmapsize, major)) { + goto abort; + } + bitmap_fd = open(s->bitmap_file, O_RDWR); + if (bitmap_fd < 0) { + pr_err("%s cannot be opened.\n", s->bitmap_file); + goto abort; + } + } + if (bitmap_fd >= 0) { + if (ioctl(mdfd, SET_BITMAP_FILE, bitmap_fd) < 0) { + pr_err("Cannot set bitmap file for %s: %s\n", + mddev, strerror(errno)); + goto abort; + } + } + } + if (ioctl(mdfd, RUN_ARRAY, ¶m)) { + pr_err("RUN_ARRAY failed: %s\n", strerror(errno)); + if (s->chunk & (s->chunk - 1)) { + cont_err("Problem may be that chunk size is not a power of 2\n"); + } + goto abort; + } + + if (c->verbose >= 0) + pr_err("array %s built and started.\n", + mddev); + wait_for(mddev, mdfd); + close(mdfd); + return 0; + + abort: + ioctl(mdfd, STOP_ARRAY, 0); + close(mdfd); + return 1; +} @@ -0,0 +1,339 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. 
By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. 
If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. 
You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. 
If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. 
(This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. 
Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. 
+ +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. 
If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. 
+ + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. 
Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. diff --git a/ChangeLog b/ChangeLog new file mode 100644 index 0000000..a3bf700 --- /dev/null +++ b/ChangeLog @@ -0,0 +1,306 @@ +Please see git logs for detailed change log. +This file just contains highlight. + +Changes Prior to release 3.3 +- Some array reshapes can proceed without needing backup file. + This is done by changing the 'data_offset' so we never need to write + any data back over where it was before. If there is no "head space" + or "tail space" to allow data_offset to change, the old mechanism + with a backup file can still be used. +- RAID10 arrays can be reshaped to change the number of devices, + change the chunk size, or change the layout between 'near' + and 'offset'. + This will always change data_offset, and will fail if there is no + room for data_offset to be moved. +- "--assemble --update=metadata" can convert a 0.90 array to a 1.0 array. +- bad-block-logs are supported (but not heavily tested yet) +- "--assemble --update=revert-reshape" can be used to undo a reshape + that has just been started but isn't really wanted. 
This is very + new and while it passes basic tests it cannot be guaranteed. +- improved locking between --incremental and --assemble +- uses systemd to run "mdmon" if systemd is configured to do that. +- kernel names of md devices can be non-numeric. e.g. "md_home" rather than + "md0". This will probably confuse lots of other tools, so you need to + echo CREATE names=yes >> /etc/mdadm.conf + or the feature will not be used. (you also need a reasonably new kernel). +- "--stop" can be given a kernel name instead of a device name. i.e. + mdadm --stop md4 + will work even if /dev/md4 doesn't exist. +- "--detail --export" has some information about the devices in the array +- --dump and --restore can be used to back up and restore the metadata on an + array. +- Hot-replace is supported with + mdadm /dev/mdX --replace /dev/foo + and + mdadm /dev/mdX --replace /dev/foo --with /dev/bar +- Config file can be a directory in which case all "*.conf" files are + read in lexical order. + Default is to read /etc/mdadm.conf and then /etc/mdadm.conf.d + Thus + echo CREATE names=yes > /etc/mdadm.conf.d/names.conf + will also enable the use of named md devices. + +- Lots of improvements to DDF support including adding support for + RAID10 (thanks Martin Wilck). + +Changes Prior to release 3.2.6 + - There are no real stand-out fixes, just lots of little bits and pieces. + +Changes Prior to release 3.2.5 + - This release primarily fixes a serious regression in 3.2.4. + This regression does *not* cause any risk to data. It simply + means that adding a device with "--add" would sometimes fail + when it should not. + + - The fix also includes a couple of minor fixes such as making + the "--layout=preserve" option to "--grow" work again. + + +Changes Prior to release 3.2.4 +"--oneline" log of changes is below. Some notable ones are: + + - --offroot argument to improve interactions between mdmon and initrd + - --prefer argument to select which /dev names to display in some + circumstances.
+ - relax restrictions on when "--add" will be allowed + - Fix bug with adding write-intent-bitmap to active array + - Now defaults to "/run/mdadm" for storing run-time files. + +Changes Prior to release 3.2.3 + - The largest single area of change is support for reshape of Intel + IMSM arrays (OnLine Capacity Expansion and Level Migration). + - Among other fixes, this now has a better chance of surviving if a + device fails during reshape. + +Changes Prior to release 3.2.2 + - reshaping IMSM (Intel metadata) arrays is no longer 'experimental', + it should work properly and be largely compatible with IMSM drivers in + other platforms. + - --assume-clean can be used with --grow --size to avoid resyncing the + new part of the array. This is only supported with very new kernels. + - RAID0 arrays can have chunksize which is not a power of 2. This has been + supported in the kernel for a while but is only now supported by + mdadm. + + - A new tool 'raid6check' is available which can check a RAID6 array, + or part of it, and report which device is most inconsistent with the + others if any stripe is inconsistent. This is still under development + and does not have a man page yet. If anyone tries it out and has any + questions or experience to report, they would be most welcome on + linux-raid@vger.kernel.org. + +Changes Prior to release 3.2.1 + - policy framework + Policy can be expressed for moving spare devices between arrays, and + for how to handle hot-plugged devices. This policy can be different + for devices plugged in to different controllers etc. + This, for example, allows a configuration where when a device is plugged + in it is immediately included in an md array as a hot spare and + possibly starts recovery immediately if an array is degraded. + + - some understanding of mbr and gpt partition tables + This is primarily to support the new hot-plug support.
If a + device is plugged in and policy suggests it should have a partition table, + the partition table will be copied from a suitably similar device, and + then the partitions will hot-plug and can then be added to md arrays. + + - "--incremental --remove" can remember where a device was removed from + so if a device gets plugged back in the same place, special policy applies + to it, allowing it to be included in an array even if a general hotplug + will not be included. + + - enhanced reshape options, including growing a RAID0 by converting to RAID4, + restriping, and converting back. Also conversions between RAID0 and + RAID10 and between RAID1 and RAID10 are possible (with a suitably recent + kernel). + + - spare migration for IMSM arrays. + Spare migration can now work across 'containers' using non-native metadata + and specifically Intel's IMSM arrays support spare migrations. + + - OLCE and level migration for Intel IMSM arrays. + OnLine Capacity Expansion and level migration (e.g. RAID0 -> RAID5) is + supported for Intel Matrix Storage Manager arrays. + This support is currently 'experimental' for technical reasons. It can + be enabled with "export MDADM_EXPERIMENTAL=1" + + - avoid including wayward devices + If you split a RAID1, mount the two halves as two separate degraded RAID1s, + and then later bring the two back together, it is possible that the md + metadata won't properly show that one must over-ride the other. + mdadm now does extra checking to detect this possibility and avoid + potentially corrupting data. + + - remove any possible confusion between similar options. + e.g. --brief and --bitmap were mapped to 'b' and mdadm wouldn't + notice if one was used where the other was expected. + + - allow K,M,G suffixes on chunk sizes + +Changes Prior to release 3.2 + - By far the most significant change in this release relates to the + management of reshaping arrays.
This code has been substantially + re-written so that it can work with 'externally managed metadata' - + Intel's IMSM in particular. We now support level migration and + OnLine Capacity Expansion on these arrays. + - Policy framework. + Various policy statements can be made in the mdadm.conf to guide + the behaviour of mdadm, particularly with regard to how new devices + are treated by "mdadm -I". + Depending on the 'action' associated with a device (identified by + its 'path') such new devices can be automatically re-added to an + existing array that they previously fell out of, or automatically + added as a spare if they appear to contain no data. + + - mdadm now has a limited understanding of partition tables. This + allows the policy framework to make decisions about partitioned + devices as well. + + - --incremental --remove can be told what --path the device was on, + and this info will be recorded so that another device appearing at + the same physical location can be preferentially added to the same + array (provided the spare-same-slot action policy is applied to the + path). + + - A new "--invalid-backup" flag is available in --assemble + mode. This can be used to re-assemble an array which was stopped + in the middle of a reshape, and for which the 'backup file' is no + longer available or is corrupted. The array may have some + corruption in it at the point where reshape was up to, but at least + the rest of the array will become available. + + + - Various internal restructuring - more is needed. + +Changes Prior to release 3.1.5 + - Fixes for v1.x metadata on big-endian machines. + - man page improvements + - Improve '--detail --export' when run on partitions of an md array. + - Fix regression with removing 'failed' or 'detached' devices. + - Fixes for "--assemble --force" in various unusual cases. + - Allow '-Y' to mean --export. This was documented but not implemented. + - Various fixes for handling 'ddf' metadata.
This is now more reliable + but could benefit from more interoperability testing. + - Correctly list subarrays of a container in "--detail" output. + - Improve checks on whether the requested number of devices is supported + by the metadata - both for --create and --grow. + - Don't remove partitions from a device that is being included in an + array until we are fully committed to including it. + - Allow "--assemble --update=no-bitmap" so an array with a corrupt + bitmap can still be assembled. + - Don't allow --add to succeed if it looks like a "--re-add" is probably + wanted, but cannot succeed. This avoids inadvertently turning + devices into spares when an array is failed. + +Changes Prior to release 3.1.4 + Two fixes related to configs that aren't using udev: + - Don't remove md devices with 'standard' names on --stop + - Allow dev_open to work on read-only /dev + And fixed regressions: + - Allow --incremental to add spares to an array + - Accept --no-degraded as a deprecated option rather than + throwing an error + - Return correct success status when --incremental assembling + a container which does not yet have enough devices. + - Don't link mdadm with pthreads, only mdmon needs it. + - Fix compiler warning due to bad use of snprintf + +Changes Prior to release 3.1.3 + - mapfile now lives in a fixed location which defaults to + /dev/.mdadm/map but can be changed at compile time. This + location is chosen as most distros provide it during early + boot and preserve it through. As long as /dev exists and is + writable, /dev/.mdadm will be created. + Other files for communication with mdmon live here too. + This fixes a bug reported by Debian and Gentoo users where + udev would spin in early-boot. + - IMSM and DDF metadata will not be recognised on partitions + as they should only be used on whole-disks. + - Various overflows caused by 2G drives have been addressed. + - A subarray of an IMSM container can now be killed with + --kill-subarray.
Also subarrays can be renamed with + --update-subarray + - -If (or --incremental --fail) can be used from udev to + fail and remove from all arrays a device which has been + unplugged from the system. i.e. hot-unplug-support. + - "mdadm /dev/mdX --re-add missing" will look for any device + that looks like it should be a member of /dev/mdX but isn't + and will automatically --re-add it + - Now compile with -Wextra to get extra warnings. + - Lots of minor bug fixes, documentation improvements, etc. + +Changes Prior to release 3.1.2 + - The default metadata has changed again (sorry about that). + It is now v1.2 and will hopefully stay that way. It turned + out there were boot-block issues with v1.1 which make it + unsuitable for a default, though in many cases it is still + suitable to use. + - Stopping a container is not permitted when members are still + active + - Add 'homehost' to the valid words for the "AUTO" config file + line. When followed by "-all", this causes mdadm to + auto-assemble any array belonging to this host, but not + auto-assemble anything else. + - Fix some bugs with "--grow --chunksize=" for changing chunksize. + - VAR_RUN can be easily changed at compile time just like ALT_RUN. + This gives distros more flexibility in how to manage the + pid and sock files that mdmon needs. + - Various mdmon fixes + - Always make bitmap 4K-aligned if at all possible. + - If mdadm.conf lists arrays which have inter-dependencies, + they previously had to be listed in the "right" order. Now + any order should work. + - Fix --force assembly of v1.x arrays which are in the process + of recovering. + - Add section on 'scrubbing' to 'md' man page. + - Various command-line-option parsing improvements. + - ... and lots of other bug fixes. + +Changes Prior to release 3.1.1 + - Multiple fixes for new --grow levels including fixes for + serious data corruption problems.
+ - Change default metadata to v1.1 + - Change default chunk size to 512K + - Change default bitmap chunk size to 64Meg + - When --re-add is used, don't fall back to + --add if --re-add fails as this can destroy data. + +Changes Prior to release 3.1 + - Support --grow to change the layout of RAID4/5/6 + - Support --grow to change the chunksize of raid 4/5/6 + - Support --grow to change level from RAID1 -> RAID5 -> RAID6 and + back. + - Support --grow to reduce the number of devices in RAID4/5/6. + - Support restart of these grow options when assembling an array + which is partially grown. + - Assorted tests of this code, and of different RAID6 layouts. + +Changes Prior to release 3.0.3 + - Improvements for creating arrays giving just a name, like 'foo', + rather than the full '/dev/md/foo'. + - Improvements for assembling member arrays of containers. + - Improvements to test suite + - Add option to change increment for RebuildNN messages reported + by "mdadm --monitor" + - Improvements to mdmon 'hand-over' from initrd to final root. + - Handle merging of devices that have left an IMSM array and are + being re-incorporated. + - Add missing space in "--detail --brief" output. + +Changes Prior to release 3.0.2 + - Fix crash when homehost is not set, as often happens in + early boot. + +Changes Prior to release 3.0.1 + - Fix various segfaults + - Fixes for --examine with containers + - Lots of other little fixes. + +Changes Prior to release 3.0 + - Support for externally managed metadata, specifically DDF and IMSM. + - Depend on udev to create entries in /dev, rather than creating them + ourselves. + - remove --auto-update-home-hosts + - new config file line "auto" + - new "<ignore>" and "any" options for "homehost" + - numerous bug fixes and minor enhancements. diff --git a/Create.c b/Create.c new file mode 100644 index 0000000..0ff1922 --- /dev/null +++ b/Create.c @@ -0,0 +1,1118 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays.
+ * + * Copyright (C) 2001-2013 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +#include "mdadm.h" +#include "md_u.h" +#include "md_p.h" +#include <ctype.h> +/* + * round_size_and_verify() - trim *size to the chunk boundary below it. + * + * A *size of 0 is left untouched and 0 is returned. Otherwise the bits of + * (chunk - 1) are masked out of *size in place; this equals rounding down + * to a multiple of chunk only when chunk is a power of two. + * NOTE(review): nothing visible here enforces a power-of-two chunk - + * confirm that callers guarantee it. + * + * Returns 0 on success, or 1 after printing an error through pr_err() + * when the masked size is 0. + */ +static int round_size_and_verify(unsigned long long *size, int chunk) +{ + if (*size == 0) + return 0; + *size &= ~(unsigned long long)(chunk - 1); + if (*size == 0) { + pr_err("Size cannot be smaller than chunk.\n"); + return 1; + } + return 0; +} +/* + * default_layout() - choose the layout to use when none was requested. + * + * If st is set and its handler provides default_geometry(), that hook is + * called first with pointers to the local level and layout. Only if + * layout is still UnSet afterwards is the switch below consulted: + * RAID0_ORIG_LAYOUT for level 0, 0x102 for level 10, the "default" + * entry of r5layout for levels 5 and 6, the "default" entry of + * faultylayout for LEVEL_FAULTY, and 0 for every other level. + * When verbose > 0 the level 10, 5/6 and LEVEL_FAULTY defaults are + * announced through pr_err(). + * + * Returns the selected layout value. + */ +static int default_layout(struct supertype *st, int level, int verbose) +{ + int layout = UnSet; + + if (st && st->ss->default_geometry) + st->ss->default_geometry(st, &level, &layout, NULL); + + if (layout == UnSet) + switch(level) { + default: /* no layout */ + layout = 0; + break; + case 0: + layout = RAID0_ORIG_LAYOUT; + break; + case 10: + layout = 0x102; /* near=2, far=1 */ + if (verbose > 0) + pr_err("layout defaults to n2\n"); + break; + case 5: + case 6: + layout = map_name(r5layout, "default"); + if (verbose > 0) + pr_err("layout defaults to %s\n", map_num(r5layout, layout)); + break; + case LEVEL_FAULTY: + layout = map_name(faultylayout, "default"); + + if (verbose > 0) + pr_err("layout defaults to %s\n", map_num(faultylayout, layout)); + break; + } + + return layout; +} + +int Create(struct
supertype *st, char *mddev, + char *name, int *uuid, + int subdevs, struct mddev_dev *devlist, + struct shape *s, + struct context *c, unsigned long long data_offset) +{ + /* + * Create a new raid array. + * + * First check that necessary details are available + * (i.e. level, raid-disks) + * + * Then check each disk to see what might be on it + * and report anything interesting. + * + * If anything looks odd, and runstop not set, + * abort. + * + * SET_ARRAY_INFO and ADD_NEW_DISK, and + * if runstop==run, or raiddisks disks were used, + * RUN_ARRAY + */ + int mdfd; + unsigned long long minsize = 0, maxsize = 0; + char *mindisc = NULL; + char *maxdisc = NULL; + int dnum, raid_disk_num; + struct mddev_dev *dv; + dev_t rdev; + int fail = 0, warn = 0; + int first_missing = subdevs * 2; + int second_missing = subdevs * 2; + int missing_disks = 0; + int insert_point = subdevs * 2; /* where to insert a missing drive */ + int total_slots; + int pass; + int rv; + int bitmap_fd; + int have_container = 0; + int container_fd = -1; + int need_mdmon = 0; + unsigned long long bitmapsize; + struct mdinfo info, *infos; + int did_default = 0; + int do_default_layout = 0; + int do_default_chunk = 0; + unsigned long safe_mode_delay = 0; + char chosen_name[1024]; + struct map_ent *map = NULL; + unsigned long long newsize; + mdu_array_info_t inf; + + int major_num = BITMAP_MAJOR_HI; + if (s->bitmap_file && strcmp(s->bitmap_file, "clustered") == 0) { + major_num = BITMAP_MAJOR_CLUSTERED; + if (c->nodes <= 1) { + pr_err("At least 2 nodes are needed for cluster-md\n"); + return 1; + } + } + + memset(&info, 0, sizeof(info)); + if (s->level == UnSet && st && st->ss->default_geometry) + st->ss->default_geometry(st, &s->level, NULL, NULL); + if (s->level == UnSet) { + pr_err("a RAID level is needed to create an array.\n"); + return 1; + } + if (s->raiddisks < 4 && s->level == 6) { + pr_err("at least 4 raid-devices needed for level 6\n"); + return 1; + } + if (s->raiddisks > 256 && s->level == 
6) { + pr_err("no more than 256 raid-devices supported for level 6\n"); + return 1; + } + if (s->raiddisks < 2 && s->level >= 4) { + pr_err("at least 2 raid-devices needed for level %d\n", s->level); + return 1; + } + if (s->level <= 0 && s->sparedisks) { + pr_err("This level does not support spare devices\n"); + return 1; + } + + if (subdevs == 1 && strcmp(devlist->devname, "missing") != 0) { + /* If given a single device, it might be a container, and we can + * extract a device list from there + */ + int fd; + + memset(&inf, 0, sizeof(inf)); + fd = open(devlist->devname, O_RDONLY); + if (fd >= 0 && + md_get_array_info(fd, &inf) == 0 && inf.raid_disks == 0) { + /* yep, looks like a container */ + if (st) { + rv = st->ss->load_container(st, fd, + devlist->devname); + if (rv == 0) + have_container = 1; + } else { + st = super_by_fd(fd, NULL); + if (st && !(rv = st->ss-> + load_container(st, fd, + devlist->devname))) + have_container = 1; + else + st = NULL; + } + if (have_container) { + subdevs = s->raiddisks; + first_missing = subdevs * 2; + second_missing = subdevs * 2; + insert_point = subdevs * 2; + } + } + if (fd >= 0) + close(fd); + } + if (st && st->ss->external && s->sparedisks) { + pr_err("This metadata type does not support spare disks at create time\n"); + return 1; + } + if (subdevs > s->raiddisks+s->sparedisks+s->journaldisks) { + pr_err("You have listed more devices (%d) than are in the array(%d)!\n", subdevs, s->raiddisks+s->sparedisks); + return 1; + } + if (!have_container && subdevs < s->raiddisks+s->sparedisks+s->journaldisks) { + pr_err("You haven't given enough devices (real or missing) to create this array\n"); + return 1; + } + if (s->bitmap_file && s->level <= 0) { + pr_err("bitmaps not meaningful with level %s\n", + map_num(pers, s->level)?:"given"); + return 1; + } + + /* now set some defaults */ + + if (s->layout == UnSet) { + do_default_layout = 1; + s->layout = default_layout(st, s->level, c->verbose); + } + + if (s->level == 10) + /* 
check layout fits in array*/ + if ((s->layout&255) * ((s->layout>>8)&255) > s->raiddisks) { + pr_err("that layout requires at least %d devices\n", + (s->layout&255) * ((s->layout>>8)&255)); + return 1; + } + + switch(s->level) { + case 4: + case 5: + case 10: + case 6: + case 0: + if (s->chunk == 0 || s->chunk == UnSet) { + s->chunk = UnSet; + do_default_chunk = 1; + /* chunk will be set later */ + } + break; + case LEVEL_LINEAR: + /* a chunksize of zero 0s perfectly valid (and preferred) since 2.6.16 */ + if (get_linux_version() < 2006016 && s->chunk == 0) { + s->chunk = 64; + if (c->verbose > 0) + pr_err("chunk size defaults to 64K\n"); + } + break; + case 1: + case LEVEL_FAULTY: + case LEVEL_MULTIPATH: + case LEVEL_CONTAINER: + if (s->chunk) { + pr_err("specifying chunk size is forbidden for this level\n"); + return 1; + } + break; + default: + pr_err("unknown level %d\n", s->level); + return 1; + } + + if (s->size == MAX_SIZE) + /* use '0' to mean 'max' now... */ + s->size = 0; + if (s->size && s->chunk && s->chunk != UnSet) + if (round_size_and_verify(&s->size, s->chunk)) + return 1; + + newsize = s->size * 2; + if (st && ! st->ss->validate_geometry(st, s->level, s->layout, s->raiddisks, + &s->chunk, s->size*2, + data_offset, NULL, + &newsize, s->consistency_policy, + c->verbose >= 0)) + return 1; + + if (s->chunk && s->chunk != UnSet) { + newsize &= ~(unsigned long long)(s->chunk*2 - 1); + if (do_default_chunk) { + /* default chunk was just set */ + if (c->verbose > 0) + pr_err("chunk size defaults to %dK\n", s->chunk); + if (round_size_and_verify(&s->size, s->chunk)) + return 1; + do_default_chunk = 0; + } + } + + if (s->size == 0) { + s->size = newsize / 2; + if (s->level == 1) + /* If this is ever reshaped to RAID5, we will + * need a chunksize. 
So round it off a bit + * now just to be safe + */ + s->size &= ~(64ULL-1); + + if (s->size && c->verbose > 0) + pr_err("setting size to %lluK\n", s->size); + } + + /* now look at the subdevs */ + info.array.active_disks = 0; + info.array.working_disks = 0; + dnum = 0; + for (dv = devlist; dv; dv = dv->next) + if (data_offset == VARIABLE_OFFSET) + dv->data_offset = INVALID_SECTORS; + else + dv->data_offset = data_offset; + + for (dv=devlist; dv && !have_container; dv=dv->next, dnum++) { + char *dname = dv->devname; + unsigned long long freesize; + int dfd; + char *doff; + + if (strcasecmp(dname, "missing") == 0) { + if (first_missing > dnum) + first_missing = dnum; + if (second_missing > dnum && dnum > first_missing) + second_missing = dnum; + missing_disks ++; + continue; + } + if (data_offset == VARIABLE_OFFSET) { + doff = strchr(dname, ':'); + if (doff) { + *doff++ = 0; + dv->data_offset = parse_size(doff); + } else + dv->data_offset = INVALID_SECTORS; + } else + dv->data_offset = data_offset; + + dfd = open(dname, O_RDONLY); + if (dfd < 0) { + pr_err("cannot open %s: %s\n", + dname, strerror(errno)); + exit(2); + } + if (!fstat_is_blkdev(dfd, dname, NULL)) { + close(dfd); + exit(2); + } + close(dfd); + info.array.working_disks++; + if (dnum < s->raiddisks && dv->disposition != 'j') + info.array.active_disks++; + if (st == NULL) { + struct createinfo *ci = conf_get_create_info(); + if (ci) + st = ci->supertype; + } + if (st == NULL) { + /* Need to choose a default metadata, which is different + * depending on geometry of array. 
+ */ + int i; + char *name = "default"; + for(i = 0; !st && superlist[i]; i++) { + st = superlist[i]->match_metadata_desc(name); + if (!st) + continue; + if (do_default_layout) + s->layout = default_layout(st, s->level, c->verbose); + switch (st->ss->validate_geometry( + st, s->level, s->layout, s->raiddisks, + &s->chunk, s->size*2, + dv->data_offset, dname, + &freesize, s->consistency_policy, + c->verbose > 0)) { + case -1: /* Not valid, message printed, and not + * worth checking any further */ + exit(2); + break; + case 0: /* Geometry not valid */ + free(st); + st = NULL; + s->chunk = do_default_chunk ? UnSet : s->chunk; + break; + case 1: /* All happy */ + break; + } + } + + if (!st) { + int dfd = open(dname, O_RDONLY|O_EXCL); + if (dfd < 0) { + pr_err("cannot open %s: %s\n", + dname, strerror(errno)); + exit(2); + } + pr_err("device %s not suitable for any style of array\n", + dname); + exit(2); + } + if (st->ss != &super0 || + st->minor_version != 90) + did_default = 1; + } else { + if (do_default_layout) + s->layout = default_layout(st, s->level, 0); + if (!st->ss->validate_geometry(st, s->level, s->layout, + s->raiddisks, + &s->chunk, s->size*2, + dv->data_offset, + dname, &freesize, + s->consistency_policy, + c->verbose >= 0)) { + + pr_err("%s is not suitable for this array.\n", + dname); + fail = 1; + continue; + } + } + + if (dv->disposition == 'j') + goto skip_size_check; /* skip write journal for size check */ + + freesize /= 2; /* convert to K */ + if (s->chunk && s->chunk != UnSet) { + /* round to chunk size */ + freesize = freesize & ~(s->chunk-1); + if (do_default_chunk) { + /* default chunk was just set */ + if (c->verbose > 0) + pr_err("chunk size defaults to %dK\n", s->chunk); + if (round_size_and_verify(&s->size, s->chunk)) + return 1; + do_default_chunk = 0; + } + } + if (!freesize) { + pr_err("no free space left on %s\n", dname); + fail = 1; + continue; + } + + if (s->size && freesize < s->size) { + pr_err("%s is smaller than given size. 
%lluK < %lluK + metadata\n", + dname, freesize, s->size); + fail = 1; + continue; + } + if (maxdisc == NULL || (maxdisc && freesize > maxsize)) { + maxdisc = dname; + maxsize = freesize; + } + if (mindisc ==NULL || (mindisc && freesize < minsize)) { + mindisc = dname; + minsize = freesize; + } + skip_size_check: + if (c->runstop != 1 || c->verbose >= 0) { + int fd = open(dname, O_RDONLY); + if (fd < 0) { + pr_err("Cannot open %s: %s\n", + dname, strerror(errno)); + fail = 1; + continue; + } + warn |= check_ext2(fd, dname); + warn |= check_reiser(fd, dname); + warn |= check_raid(fd, dname); + if (strcmp(st->ss->name, "1.x") == 0 && + st->minor_version >= 1) + /* metadata at front */ + warn |= check_partitions(fd, dname, 0, 0); + else if (s->level == 1 || s->level == LEVEL_CONTAINER || + (s->level == 0 && s->raiddisks == 1)) + /* partitions could be meaningful */ + warn |= check_partitions(fd, dname, freesize*2, s->size*2); + else + /* partitions cannot be meaningful */ + warn |= check_partitions(fd, dname, 0, 0); + if (strcmp(st->ss->name, "1.x") == 0 && + st->minor_version >= 1 && + did_default && + s->level == 1 && + (warn & 1024) == 0) { + warn |= 1024; + pr_err("Note: this array has metadata at the start and\n" + " may not be suitable as a boot device. 
If you plan to\n" + " store '/boot' on this device please ensure that\n" + " your boot-loader understands md/v1.x metadata, or use\n" + " --metadata=0.90\n"); + } + close(fd); + } + } + if (missing_disks == dnum && !have_container) { + pr_err("Subdevs can't be all missing\n"); + return 1; + } + if (s->raiddisks + s->sparedisks > st->max_devs) { + pr_err("Too many devices: %s metadata only supports %d\n", + st->ss->name, st->max_devs); + return 1; + } + if (have_container) + info.array.working_disks = s->raiddisks; + if (fail) { + pr_err("create aborted\n"); + return 1; + } + if (s->size == 0) { + if (mindisc == NULL && !have_container) { + pr_err("no size and no drives given - aborting create.\n"); + return 1; + } + if (s->level > 0 || s->level == LEVEL_MULTIPATH || + s->level == LEVEL_FAULTY || st->ss->external) { + /* size is meaningful */ + if (!st->ss->validate_geometry(st, s->level, s->layout, + s->raiddisks, + &s->chunk, minsize*2, + data_offset, + NULL, NULL, + s->consistency_policy, 0)) { + pr_err("devices too large for RAID level %d\n", s->level); + return 1; + } + s->size = minsize; + if (s->level == 1) + /* If this is ever reshaped to RAID5, we will + * need a chunksize. 
So round it off a bit + * now just to be safe + */ + s->size &= ~(64ULL-1); + if (c->verbose > 0) + pr_err("size set to %lluK\n", s->size); + } + } + + if (!s->bitmap_file && + !st->ss->external && + s->level >= 1 && + st->ss->add_internal_bitmap && + s->journaldisks == 0 && + (s->consistency_policy != CONSISTENCY_POLICY_RESYNC && + s->consistency_policy != CONSISTENCY_POLICY_PPL) && + (s->write_behind || s->size > 100*1024*1024ULL)) { + if (c->verbose > 0) + pr_err("automatically enabling write-intent bitmap on large array\n"); + s->bitmap_file = "internal"; + } + if (s->bitmap_file && strcmp(s->bitmap_file, "none") == 0) + s->bitmap_file = NULL; + + if (s->consistency_policy == CONSISTENCY_POLICY_PPL && + !st->ss->write_init_ppl) { + pr_err("%s metadata does not support PPL\n", st->ss->name); + return 1; + } + + if (!have_container && s->level > 0 && ((maxsize-s->size)*100 > maxsize)) { + if (c->runstop != 1 || c->verbose >= 0) + pr_err("largest drive (%s) exceeds size (%lluK) by more than 1%%\n", + maxdisc, s->size); + warn = 1; + } + + if (st->ss->detail_platform && st->ss->detail_platform(0, 1, NULL) != 0) { + if (c->runstop != 1 || c->verbose >= 0) + pr_err("%s unable to enumerate platform support\n" + " array may not be compatible with hardware/firmware\n", + st->ss->name); + warn = 1; + } + st->nodes = c->nodes; + st->cluster_name = c->homecluster; + + if (warn) { + if (c->runstop!= 1) { + if (!ask("Continue creating array? ")) { + pr_err("create aborted.\n"); + return 1; + } + } else { + if (c->verbose > 0) + pr_err("creation continuing despite oddities due to --run\n"); + } + } + + /* If this is raid4/5, we want to configure the last active slot + * as missing, so that a reconstruct happens (faster than re-parity) + * FIX: Can we do this for raid6 as well? 
+ */ + if (st->ss->external == 0 && s->assume_clean == 0 && + c->force == 0 && first_missing >= s->raiddisks) { + switch (s->level) { + case 4: + case 5: + insert_point = s->raiddisks-1; + s->sparedisks++; + info.array.active_disks--; + missing_disks++; + break; + default: + break; + } + } + /* For raid6, if creating with 1 missing drive, make a good drive + * into a spare, else the create will fail + */ + if (s->assume_clean == 0 && c->force == 0 && first_missing < s->raiddisks && + st->ss->external == 0 && + second_missing >= s->raiddisks && s->level == 6) { + insert_point = s->raiddisks - 1; + if (insert_point == first_missing) + insert_point--; + s->sparedisks ++; + info.array.active_disks--; + missing_disks++; + } + + if (s->level <= 0 && first_missing < subdevs * 2) { + pr_err("This level does not support missing devices\n"); + return 1; + } + + /* We need to create the device */ + map_lock(&map); + mdfd = create_mddev(mddev, name, c->autof, LOCAL, chosen_name, 1); + if (mdfd < 0) { + map_unlock(&map); + return 1; + } + /* verify if chosen_name is not in use, + * it could be in conflict with already existing device + * e.g. container, array + */ + if (strncmp(chosen_name, "/dev/md/", 8) == 0 && + map_by_name(&map, chosen_name+8) != NULL) { + pr_err("Array name %s is in use already.\n", + chosen_name); + close(mdfd); + map_unlock(&map); + udev_unblock(); + return 1; + } + mddev = chosen_name; + + memset(&inf, 0, sizeof(inf)); + md_get_array_info(mdfd, &inf); + if (inf.working_disks != 0) { + pr_err("another array by this name is already running.\n"); + goto abort_locked; + } + + /* Ok, lets try some ioctls */ + + info.array.level = s->level; + info.array.size = s->size; + info.array.raid_disks = s->raiddisks; + /* The kernel should *know* what md_minor we are dealing + * with, but it chooses to trust me instead. 
Sigh + */ + info.array.md_minor = 0; + if (fstat_is_blkdev(mdfd, mddev, &rdev)) + info.array.md_minor = minor(rdev); + info.array.not_persistent = 0; + + if (((s->level == 4 || s->level == 5) && + (insert_point < s->raiddisks || first_missing < s->raiddisks)) || + (s->level == 6 && (insert_point < s->raiddisks || + second_missing < s->raiddisks)) || + (s->level <= 0) || s->assume_clean) { + info.array.state = 1; /* clean, but one+ drive will be missing*/ + info.resync_start = MaxSector; + } else { + info.array.state = 0; /* not clean, but no errors */ + info.resync_start = 0; + } + if (s->level == 10) { + /* for raid10, the bitmap size is the capacity of the array, + * which is array.size * raid_disks / ncopies; + * .. but convert to sectors. + */ + int ncopies = ((s->layout>>8) & 255) * (s->layout & 255); + bitmapsize = s->size * s->raiddisks / ncopies * 2; +/* printf("bms=%llu as=%d rd=%d nc=%d\n", bitmapsize, s->size, s->raiddisks, ncopies);*/ + } else + bitmapsize = s->size * 2; + + /* There is lots of redundancy in these disk counts, + * raid_disks is the most meaningful value + * it describes the geometry of the array + * it is constant + * nr_disks is total number of used slots. + * it should be raid_disks+spare_disks + * spare_disks is the number of extra disks present + * see above + * active_disks is the number of working disks in + * active slots. (With raid_disks) + * working_disks is the total number of working disks, + * including spares + * failed_disks is the number of disks marked failed + * + * Ideally, the kernel would keep these (except raid_disks) + * up-to-date as we ADD_NEW_DISK, but it doesn't (yet). + * So for now, we assume that all raid and spare + * devices will be given. 
+ */ + info.array.spare_disks=s->sparedisks; + info.array.failed_disks=missing_disks; + info.array.nr_disks = info.array.working_disks + + info.array.failed_disks; + info.array.layout = s->layout; + info.array.chunk_size = s->chunk*1024; + + if (name == NULL || *name == 0) { + /* base name on mddev */ + /* /dev/md0 -> 0 + * /dev/md_d0 -> d0 + * /dev/md_foo -> foo + * /dev/md/1 -> 1 + * /dev/md/d1 -> d1 + * /dev/md/home -> home + * /dev/mdhome -> home + */ + /* FIXME compare this with rules in create_mddev */ + name = strrchr(mddev, '/'); + if (name) { + name++; + if (strncmp(name, "md_", 3) == 0 && + strlen(name) > 3 && (name-mddev) == 5 /* /dev/ */) + name += 3; + else if (strncmp(name, "md", 2) == 0 && + strlen(name) > 2 && isdigit(name[2]) && + (name-mddev) == 5 /* /dev/ */) + name += 2; + } + } + if (!st->ss->init_super(st, &info.array, s, name, c->homehost, uuid, + data_offset)) + goto abort_locked; + + total_slots = info.array.nr_disks; + st->ss->getinfo_super(st, &info, NULL); + if (sysfs_init(&info, mdfd, NULL)) { + pr_err("unable to initialize sysfs\n"); + goto abort_locked; + } + + if (did_default && c->verbose >= 0) { + if (is_subarray(info.text_version)) { + char devnm[32]; + char *ep; + struct mdinfo *mdi; + + strncpy(devnm, info.text_version+1, 32); + devnm[31] = 0; + ep = strchr(devnm, '/'); + if (ep) + *ep = 0; + + mdi = sysfs_read(-1, devnm, GET_VERSION); + + pr_err("Creating array inside %s container %s\n", + mdi?mdi->text_version:"managed", devnm); + sysfs_free(mdi); + } else + pr_err("Defaulting to version %s metadata\n", info.text_version); + } + + map_update(&map, fd2devnm(mdfd), info.text_version, + info.uuid, chosen_name); + /* Keep map locked until devices have been added to array + * to stop another mdadm from finding and using those devices. 
+ */ + + if (s->bitmap_file && (strcmp(s->bitmap_file, "internal") == 0 || + strcmp(s->bitmap_file, "clustered") == 0)) { + if (!st->ss->add_internal_bitmap) { + pr_err("internal bitmaps not supported with %s metadata\n", + st->ss->name); + goto abort_locked; + } + if (st->ss->add_internal_bitmap(st, &s->bitmap_chunk, + c->delay, s->write_behind, + bitmapsize, 1, major_num)) { + pr_err("Given bitmap chunk size not supported.\n"); + goto abort_locked; + } + s->bitmap_file = NULL; + } + + if (sysfs_init(&info, mdfd, NULL)) { + pr_err("unable to initialize sysfs\n"); + goto abort_locked; + } + + if (st->ss->external && st->container_devnm[0]) { + /* member */ + + /* When creating a member, we need to be careful + * to negotiate with mdmon properly. + * If it is already running, we cannot write to + * the devices and must ask it to do that part. + * If it isn't running, we write to the devices, + * and then start it. + * We hold an exclusive open on the container + * device to make sure mdmon doesn't exit after + * we checked that it is running. + * + * For now, fail if it is already running. 
+ */ + container_fd = open_dev_excl(st->container_devnm); + if (container_fd < 0) { + pr_err("Cannot get exclusive open on container - weird.\n"); + goto abort_locked; + } + if (mdmon_running(st->container_devnm)) { + if (c->verbose) + pr_err("reusing mdmon for %s.\n", + st->container_devnm); + st->update_tail = &st->updates; + } else + need_mdmon = 1; + } + rv = set_array_info(mdfd, st, &info); + if (rv) { + pr_err("failed to set array info for %s: %s\n", + mddev, strerror(errno)); + goto abort_locked; + } + + if (s->bitmap_file) { + int uuid[4]; + + st->ss->uuid_from_super(st, uuid); + if (CreateBitmap(s->bitmap_file, c->force, (char*)uuid, s->bitmap_chunk, + c->delay, s->write_behind, + bitmapsize, + major_num)) { + goto abort_locked; + } + bitmap_fd = open(s->bitmap_file, O_RDWR); + if (bitmap_fd < 0) { + pr_err("weird: %s cannot be opened\n", + s->bitmap_file); + goto abort_locked; + } + if (ioctl(mdfd, SET_BITMAP_FILE, bitmap_fd) < 0) { + pr_err("Cannot set bitmap file for %s: %s\n", + mddev, strerror(errno)); + goto abort_locked; + } + } + + infos = xmalloc(sizeof(*infos) * total_slots); + enable_fds(total_slots); + for (pass = 1; pass <= 2; pass++) { + struct mddev_dev *moved_disk = NULL; /* the disk that was moved out of the insert point */ + + for (dnum = 0, raid_disk_num = 0, dv = devlist; dv; + dv = (dv->next) ? 
(dv->next) : moved_disk, dnum++) { + int fd; + struct mdinfo *inf = &infos[dnum]; + + if (dnum >= total_slots) + abort(); + if (dnum == insert_point) { + raid_disk_num += 1; + moved_disk = dv; + continue; + } + if (strcasecmp(dv->devname, "missing") == 0) { + raid_disk_num += 1; + continue; + } + if (have_container) + moved_disk = NULL; + if (have_container && dnum < info.array.raid_disks - 1) + /* repeatedly use the container */ + moved_disk = dv; + + switch(pass) { + case 1: + *inf = info; + + inf->disk.number = dnum; + inf->disk.raid_disk = raid_disk_num++; + + if (dv->disposition == 'j') { + inf->disk.raid_disk = MD_DISK_ROLE_JOURNAL; + inf->disk.state = (1<<MD_DISK_JOURNAL); + raid_disk_num--; + } else if (inf->disk.raid_disk < s->raiddisks) + inf->disk.state = (1<<MD_DISK_ACTIVE) | + (1<<MD_DISK_SYNC); + else + inf->disk.state = 0; + + if (dv->writemostly == FlagSet) { + if (major_num == BITMAP_MAJOR_CLUSTERED) { + pr_err("Can not set %s --write-mostly with a clustered bitmap\n",dv->devname); + goto abort_locked; + } else + inf->disk.state |= (1<<MD_DISK_WRITEMOSTLY); + } + if (dv->failfast == FlagSet) + inf->disk.state |= (1<<MD_DISK_FAILFAST); + + if (have_container) + fd = -1; + else { + if (st->ss->external && + st->container_devnm[0]) + fd = open(dv->devname, O_RDWR); + else + fd = open(dv->devname, O_RDWR|O_EXCL); + + if (fd < 0) { + pr_err("failed to open %s after earlier success - aborting\n", + dv->devname); + goto abort_locked; + } + if (!fstat_is_blkdev(fd, dv->devname, &rdev)) + return 1; + inf->disk.major = major(rdev); + inf->disk.minor = minor(rdev); + } + if (fd >= 0) + remove_partitions(fd); + if (st->ss->add_to_super(st, &inf->disk, + fd, dv->devname, + dv->data_offset)) { + ioctl(mdfd, STOP_ARRAY, NULL); + goto abort_locked; + } + st->ss->getinfo_super(st, inf, NULL); + safe_mode_delay = inf->safe_mode_delay; + + if (have_container && c->verbose > 0) + pr_err("Using %s for device %d\n", + map_dev(inf->disk.major, + inf->disk.minor, + 0), 
dnum); + + if (!have_container) { + /* getinfo_super might have lost these ... */ + inf->disk.major = major(rdev); + inf->disk.minor = minor(rdev); + } + break; + case 2: + inf->errors = 0; + + rv = add_disk(mdfd, st, &info, inf); + + if (rv) { + pr_err("ADD_NEW_DISK for %s failed: %s\n", + dv->devname, strerror(errno)); + if (errno == EINVAL && + info.array.level == 0) { + pr_err("Possibly your kernel doesn't support RAID0 layouts.\n"); + pr_err("Either upgrade, or use --layout=dangerous\n"); + } + goto abort_locked; + } + break; + } + if (!have_container && + dv == moved_disk && dnum != insert_point) break; + } + if (pass == 1) { + struct mdinfo info_new; + struct map_ent *me = NULL; + + /* check to see if the uuid has changed due to these + * metadata changes, and if so update the member array + * and container uuid. Note ->write_init_super clears + * the subarray cursor such that ->getinfo_super once + * again returns container info. + */ + st->ss->getinfo_super(st, &info_new, NULL); + if (st->ss->external && s->level != LEVEL_CONTAINER && + !same_uuid(info_new.uuid, info.uuid, 0)) { + map_update(&map, fd2devnm(mdfd), + info_new.text_version, + info_new.uuid, chosen_name); + me = map_by_devnm(&map, st->container_devnm); + } + + if (st->ss->write_init_super(st)) { + st->ss->free_super(st); + goto abort_locked; + } + /* + * Before activating the array, perform extra steps + * required to configure the internal write-intent + * bitmap. 
+ */ + if (info_new.consistency_policy == + CONSISTENCY_POLICY_BITMAP && + st->ss->set_bitmap && + st->ss->set_bitmap(st, &info)) { + st->ss->free_super(st); + goto abort_locked; + } + + /* update parent container uuid */ + if (me) { + char *path = xstrdup(me->path); + + st->ss->getinfo_super(st, &info_new, NULL); + map_update(&map, st->container_devnm, + info_new.text_version, + info_new.uuid, path); + free(path); + } + + flush_metadata_updates(st); + st->ss->free_super(st); + } + } + map_unlock(&map); + free(infos); + + if (s->level == LEVEL_CONTAINER) { + /* No need to start. But we should signal udev to + * create links */ + sysfs_uevent(&info, "change"); + if (c->verbose >= 0) + pr_err("container %s prepared.\n", mddev); + wait_for(chosen_name, mdfd); + } else if (c->runstop == 1 || subdevs >= s->raiddisks) { + if (st->ss->external) { + int err; + switch(s->level) { + case LEVEL_LINEAR: + case LEVEL_MULTIPATH: + case 0: + err = sysfs_set_str(&info, NULL, "array_state", + c->readonly + ? 
"readonly" + : "active"); + need_mdmon = 0; + break; + default: + err = sysfs_set_str(&info, NULL, "array_state", + "readonly"); + break; + } + sysfs_set_safemode(&info, safe_mode_delay); + if (err) { + pr_err("failed to activate array.\n"); + ioctl(mdfd, STOP_ARRAY, NULL); + goto abort; + } + } else if (c->readonly && + sysfs_attribute_available( + &info, NULL, "array_state")) { + if (sysfs_set_str(&info, NULL, + "array_state", "readonly") < 0) { + pr_err("Failed to start array: %s\n", + strerror(errno)); + ioctl(mdfd, STOP_ARRAY, NULL); + goto abort; + } + } else { + /* param is not actually used */ + mdu_param_t param; + if (ioctl(mdfd, RUN_ARRAY, ¶m)) { + pr_err("RUN_ARRAY failed: %s\n", + strerror(errno)); + if (errno == 524 /* ENOTSUP */ && + info.array.level == 0) + cont_err("Please use --layout=original or --layout=alternate\n"); + if (info.array.chunk_size & (info.array.chunk_size-1)) { + cont_err("Problem may be that chunk size is not a power of 2\n"); + } + ioctl(mdfd, STOP_ARRAY, NULL); + goto abort; + } + /* if start_ro module parameter is set, array is + * auto-read-only, which is bad as the resync won't + * start. So lets make it read-write now. + */ + ioctl(mdfd, RESTART_ARRAY_RW, NULL); + } + if (c->verbose >= 0) + pr_err("array %s started.\n", mddev); + if (st->ss->external && st->container_devnm[0]) { + if (need_mdmon) + start_mdmon(st->container_devnm); + + ping_monitor(st->container_devnm); + close(container_fd); + } + wait_for(chosen_name, mdfd); + } else { + pr_err("not starting array - not enough devices.\n"); + } + udev_unblock(); + close(mdfd); + sysfs_uevent(&info, "change"); + return 0; + + abort: + udev_unblock(); + map_lock(&map); + abort_locked: + map_remove(&map, fd2devnm(mdfd)); + map_unlock(&map); + + if (mdfd >= 0) + close(mdfd); + return 1; +} diff --git a/Detail.c b/Detail.c new file mode 100644 index 0000000..95d4cc7 --- /dev/null +++ b/Detail.c @@ -0,0 +1,879 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. 
+ * + * Copyright (C) 2001-2013 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +#include "mdadm.h" +#include "md_p.h" +#include "md_u.h" +#include <ctype.h> +#include <dirent.h> + /* Comparator over two pointers to C strings: each argument is read as a char * const * and the pointed-to strings are compared with strcmp(), whose result is returned unchanged. The (const void *, const void *) signature is the shape qsort() takes. NOTE(review): presumably used to sort a list of device names; the call site is not visible here, confirm. */ +static int cmpstringp(const void *p1, const void *p2) +{ + return strcmp(* (char * const *) p1, * (char * const *) p2); +} + /* Append a copy of dev to the string array *p_devices at index n_devices. dev: string to store, duplicated with xstrdup(). p_devices: address of the array pointer, updated when the array is reallocated. p_max_devices: address of the current capacity, raised by 16 whenever the new element count would reach it. n_devices: number of entries already stored. Returns the new element count, one more than n_devices, or 0 after resetting *p_max_devices to 0 when xrealloc() yields NULL. */ +static int add_device(const char *dev, char ***p_devices, + int *p_max_devices, int n_devices) +{ + if (n_devices + 1 >= *p_max_devices) { /* grow before the new count reaches capacity, so the array always keeps one unused slot */ + *p_max_devices += 16; + *p_devices = xrealloc(*p_devices, *p_max_devices * + sizeof(**p_devices)); + if (!*p_devices) { /* NOTE(review): only reachable if xrealloc() can return NULL; its definition is not visible here, confirm */ + *p_max_devices = 0; + return 0; + } + }; /* NOTE(review): the semicolon after this brace is an empty statement */ + (*p_devices)[n_devices] = xstrdup(dev); + return n_devices + 1; +} + +int Detail(char *dev, struct context *c) +{ + /* + * Print out details for an md array + */ + int fd = open(dev, O_RDONLY); + mdu_array_info_t array; + mdu_disk_info_t *disks = NULL; + int next; + int d; + time_t atime; + char *str; + char **devices = NULL; + int max_devices = 0, n_devices = 0; + int spares = 0; + struct stat stb; + int failed = 0; + struct supertype *st = NULL; + char *subarray = NULL; + int max_disks = MD_SB_DISKS; /* just a default */ + struct mdinfo *info = NULL; + struct mdinfo *sra = NULL; + struct mdinfo *subdev; + 
char *member = NULL; + char *container = NULL; + + int rv = c->test ? 4 : 1; + int avail_disks = 0; + char *avail = NULL; + int external; + int inactive; + int is_container = 0; + char *arrayst; + + if (fd < 0) { + pr_err("cannot open %s: %s\n", + dev, strerror(errno)); + return rv; + } + sra = sysfs_read(fd, NULL, GET_VERSION | GET_DEVS | + GET_ARRAY_STATE | GET_STATE); + if (!sra) { + if (md_get_array_info(fd, &array)) { + pr_err("%s does not appear to be an md device\n", dev); + goto out; + } + } + external = (sra != NULL && sra->array.major_version == -1 && + sra->array.minor_version == -2); + inactive = (sra != NULL && !md_array_is_active(sra)); + st = super_by_fd(fd, &subarray); + if (md_get_array_info(fd, &array)) { + if (errno == ENODEV) { + if (sra->array.major_version == -1 && + sra->array.minor_version == -1 && + sra->devs == NULL) { + pr_err("Array associated with md device %s does not exist.\n", + dev); + goto out; + } + array = sra->array; + } else { + pr_err("cannot get array detail for %s: %s\n", + dev, strerror(errno)); + goto out; + } + } + + if (array.raid_disks == 0 && external) + is_container = 1; + if (fstat(fd, &stb) != 0 && !S_ISBLK(stb.st_mode)) + stb.st_rdev = 0; + rv = 0; + + if (st) + max_disks = st->max_devs; + + if (subarray) { + /* This is a subarray of some container. + * We want the name of the container, and the member + */ + dev_t devid = devnm2devid(st->container_devnm); + int cfd, err; + + member = subarray; + container = map_dev_preferred(major(devid), minor(devid), + 1, c->prefer); + cfd = open_dev(st->container_devnm); + if (cfd >= 0) { + err = st->ss->load_container(st, cfd, NULL); + close(cfd); + if (err == 0) + info = st->ss->container_content(st, subarray); + } + } + + /* try to load a superblock. Try sra->devs first, then try ioctl */ + if (st && !info) + for (d = 0, subdev = sra ? sra->devs : NULL; + d < max_disks || subdev; + subdev ? 
(void)(subdev = subdev->next) : (void)(d++)){ + mdu_disk_info_t disk; + char *dv; + int fd2; + int err; + + if (subdev) + disk = subdev->disk; + else { + disk.number = d; + if (md_get_disk_info(fd, &disk) < 0) + continue; + if (d >= array.raid_disks && + disk.major == 0 && disk.minor == 0) + continue; + } + + if (array.raid_disks > 0 && + (disk.state & (1 << MD_DISK_ACTIVE)) == 0) + continue; + + dv = map_dev(disk.major, disk.minor, 1); + if (!dv) + continue; + + fd2 = dev_open(dv, O_RDONLY); + if (fd2 < 0) + continue; + + if (st->sb) + st->ss->free_super(st); + + err = st->ss->load_super(st, fd2, NULL); + close(fd2); + if (err) + continue; + if (info) + free(info); + if (subarray) + info = st->ss->container_content(st, subarray); + else { + info = xmalloc(sizeof(*info)); + st->ss->getinfo_super(st, info, NULL); + } + if (!info) + continue; + + if (array.raid_disks != 0 && /* container */ + (info->array.ctime != array.ctime || + info->array.level != array.level)) { + st->ss->free_super(st); + continue; + } + /* some formats (imsm) have free-floating-spares + * with a uuid of uuid_zero, they don't + * have very good info about the rest of the + * container, so keep searching when + * encountering such a device. Otherwise, stop + * after the first successful call to + * ->load_super. + */ + if (memcmp(uuid_zero, + info->uuid, + sizeof(uuid_zero)) == 0) { + st->ss->free_super(st); + continue; + } + break; + } + + /* Ok, we have some info to print... 
*/ + if (inactive && info) + str = map_num(pers, info->array.level); + else + str = map_num(pers, array.level); + + if (c->export) { + if (array.raid_disks) { + if (str) + printf("MD_LEVEL=%s\n", str); + printf("MD_DEVICES=%d\n", array.raid_disks); + } else { + if (is_container) + printf("MD_LEVEL=container\n"); + printf("MD_DEVICES=%d\n", array.nr_disks); + } + if (container) { + printf("MD_CONTAINER=%s\n", container); + printf("MD_MEMBER=%s\n", member); + } else { + if (sra && sra->array.major_version < 0) + printf("MD_METADATA=%s\n", sra->text_version); + else + printf("MD_METADATA=%d.%d\n", + array.major_version, + array.minor_version); + } + + if (st && st->sb && info) { + char nbuf[64]; + struct map_ent *mp, *map = NULL; + + fname_from_uuid(st, info, nbuf, ':'); + printf("MD_UUID=%s\n", nbuf + 5); + mp = map_by_uuid(&map, info->uuid); + if (mp && mp->path && + strncmp(mp->path, "/dev/md/", 8) == 0) { + printf("MD_DEVNAME="); + print_escape(mp->path + 8); + putchar('\n'); + } + + if (st->ss->export_detail_super) + st->ss->export_detail_super(st); + map_free(map); + } else { + struct map_ent *mp, *map = NULL; + char nbuf[64]; + mp = map_by_devnm(&map, fd2devnm(fd)); + if (mp) { + __fname_from_uuid(mp->uuid, 0, nbuf, ':'); + printf("MD_UUID=%s\n", nbuf+5); + } + if (mp && mp->path && + strncmp(mp->path, "/dev/md/", 8) == 0) { + printf("MD_DEVNAME="); + print_escape(mp->path+8); + putchar('\n'); + } + map_free(map); + } + if (!c->no_devices && sra) { + struct mdinfo *mdi; + for (mdi = sra->devs; mdi; mdi = mdi->next) { + char *path; + char *sysdev = xstrdup(mdi->sys_name); + char *cp; + + path = map_dev(mdi->disk.major, + mdi->disk.minor, 0); + for (cp = sysdev; *cp; cp++) + if (!isalnum(*cp)) + *cp = '_'; + + if (mdi->disk.raid_disk >= 0) + printf("MD_DEVICE_%s_ROLE=%d\n", + sysdev, + mdi->disk.raid_disk); + else + printf("MD_DEVICE_%s_ROLE=spare\n", + sysdev); + if (path) + printf("MD_DEVICE_%s_DEV=%s\n", + sysdev, path); + } + } + goto out; + } + + disks = 
xmalloc(max_disks * 2 * sizeof(mdu_disk_info_t)); + for (d = 0; d < max_disks * 2; d++) { + disks[d].state = (1 << MD_DISK_REMOVED); + disks[d].major = disks[d].minor = 0; + disks[d].number = -1; + disks[d].raid_disk = d / 2; + } + + next = array.raid_disks * 2; + if (inactive) { + struct mdinfo *mdi; + for (mdi = sra->devs; mdi; mdi = mdi->next) { + disks[next++] = mdi->disk; + disks[next - 1].number = -1; + } + } else for (d = 0; d < max_disks; d++) { + mdu_disk_info_t disk; + disk.number = d; + if (md_get_disk_info(fd, &disk) < 0) { + if (d < array.raid_disks) + pr_err("cannot get device detail for device %d: %s\n", + d, strerror(errno)); + continue; + } + if (disk.major == 0 && disk.minor == 0) + continue; + if (disk.raid_disk >= 0 && disk.raid_disk < array.raid_disks && + disks[disk.raid_disk * 2].state == (1 << MD_DISK_REMOVED) && + ((disk.state & (1 << MD_DISK_JOURNAL)) == 0)) + disks[disk.raid_disk * 2] = disk; + else if (disk.raid_disk >= 0 && + disk.raid_disk < array.raid_disks && + disks[disk.raid_disk * 2 + 1].state == + (1 << MD_DISK_REMOVED) && + !(disk.state & (1 << MD_DISK_JOURNAL))) + disks[disk.raid_disk * 2 + 1] = disk; + else if (next < max_disks * 2) + disks[next++] = disk; + } + + avail = xcalloc(array.raid_disks, 1); + + for (d = 0; d < array.raid_disks; d++) { + char dv[PATH_MAX], dv_rep[PATH_MAX]; + snprintf(dv, PATH_MAX, "/sys/dev/block/%d:%d", + disks[d*2].major, disks[d*2].minor); + snprintf(dv_rep, PATH_MAX, "/sys/dev/block/%d:%d", + disks[d*2+1].major, disks[d*2+1].minor); + + if ((is_dev_alive(dv) && (disks[d*2].state & (1<<MD_DISK_SYNC))) || + (is_dev_alive(dv_rep) && (disks[d*2+1].state & (1<<MD_DISK_SYNC)))) { + avail_disks ++; + avail[d] = 1; + } else + rv |= !! c->test; + } + + if (c->brief) { + mdu_bitmap_file_t bmf; + if (inactive && !is_container) + printf("INACTIVE-ARRAY %s", dev); + else + printf("ARRAY %s", dev); + if (c->verbose > 0) { + if (array.raid_disks) + printf(" level=%s num-devices=%d", + str ? 
str : "-unknown-", + array.raid_disks); + else if (is_container) + printf(" level=container num-devices=%d", + array.nr_disks); + else + printf(" num-devices=%d", array.nr_disks); + } + if (container) { + printf(" container=%s", container); + printf(" member=%s", member); + } else { + if (sra && sra->array.major_version < 0) + printf(" metadata=%s", sra->text_version); + else + printf(" metadata=%d.%d", array.major_version, + array.minor_version); + } + + /* Only try GET_BITMAP_FILE for 0.90.01 and later */ + if (ioctl(fd, GET_BITMAP_FILE, &bmf) == 0 && bmf.pathname[0]) { + printf(" bitmap=%s", bmf.pathname); + } + } else { + mdu_bitmap_file_t bmf; + unsigned long long larray_size; + struct mdstat_ent *ms = mdstat_read(0, 0); + struct mdstat_ent *e; + char *devnm; + + devnm = stat2devnm(&stb); + for (e = ms; e; e = e->next) + if (strcmp(e->devnm, devnm) == 0) + break; + if (!get_dev_size(fd, NULL, &larray_size)) + larray_size = 0; + + printf("%s:\n", dev); + + if (container) + printf(" Container : %s, member %s\n", + container, member); + else { + if (sra && sra->array.major_version < 0) + printf(" Version : %s\n", + sra->text_version); + else + printf(" Version : %d.%d\n", + array.major_version, + array.minor_version); + } + + atime = array.ctime; + if (atime) + printf(" Creation Time : %.24s\n", ctime(&atime)); + if (is_container) + str = "container"; + if (str) + printf(" Raid Level : %s\n", str); + if (larray_size) + printf(" Array Size : %llu%s\n", + (larray_size >> 10), + human_size(larray_size)); + if (array.level >= 1) { + if (sra) + array.major_version = sra->array.major_version; + if (array.major_version != 0 && + (larray_size >= 0xFFFFFFFFULL|| array.size == 0)) { + unsigned long long dsize; + + dsize = get_component_size(fd); + if (dsize > 0) + printf(" Used Dev Size : %llu%s\n", + dsize/2, + human_size((long long)dsize<<9)); + else + printf(" Used Dev Size : unknown\n"); + } else + printf(" Used Dev Size : %lu%s\n", + (unsigned long)array.size, + 
human_size((unsigned long long) + array.size << 10)); + } + if (array.raid_disks) + printf(" Raid Devices : %d\n", array.raid_disks); + printf(" Total Devices : %d\n", array.nr_disks); + if (!container && + ((sra == NULL && array.major_version == 0) || + (sra && sra->array.major_version == 0))) + printf(" Preferred Minor : %d\n", array.md_minor); + if (sra == NULL || sra->array.major_version >= 0) + printf(" Persistence : Superblock is %spersistent\n", + array.not_persistent ? "not " : ""); + printf("\n"); + /* Only try GET_BITMAP_FILE for 0.90.01 and later */ + if (ioctl(fd, GET_BITMAP_FILE, &bmf) == 0 && bmf.pathname[0]) { + printf(" Intent Bitmap : %s\n", bmf.pathname); + printf("\n"); + } else if (array.state & (1<<MD_SB_CLUSTERED)) + printf(" Intent Bitmap : Internal(Clustered)\n\n"); + else if (array.state & (1<<MD_SB_BITMAP_PRESENT)) + printf(" Intent Bitmap : Internal\n\n"); + atime = array.utime; + if (atime) + printf(" Update Time : %.24s\n", ctime(&atime)); + if (array.raid_disks) { + static char *sync_action[] = { + ", recovering", ", resyncing", + ", reshaping", ", checking" }; + char *st; + if (avail_disks == array.raid_disks) + st = ""; + else if (!enough(array.level, array.raid_disks, + array.layout, 1, avail)) + st = ", FAILED"; + else + st = ", degraded"; + + if (array.state & (1 << MD_SB_CLEAN)) { + if ((array.level == 0) || + (array.level == LEVEL_LINEAR)) + arrayst = map_num(sysfs_array_states, + sra->array_state); + else + arrayst = "clean"; + } else { + arrayst = "active"; + if (array.state & (1<<MD_SB_CLUSTERED)) { + for (d = 0; d < max_disks * 2; d++) { + char *dv; + mdu_disk_info_t disk = disks[d]; + + /* only check first valid disk in cluster env */ + if ((disk.state & (MD_DISK_SYNC | MD_DISK_ACTIVE)) + && (disk.major | disk.minor)) { + dv = map_dev_preferred(disk.major, disk.minor, 0, + c->prefer); + if (!dv) + continue; + arrayst = IsBitmapDirty(dv) ? 
"active" : "clean"; + break; + } + } + } + } + + printf(" State : %s%s%s%s%s%s%s \n", + arrayst, st, + (!e || (e->percent < 0 && + e->percent != RESYNC_PENDING && + e->percent != RESYNC_DELAYED && + e->percent != RESYNC_REMOTE)) ? + "" : sync_action[e->resync], + larray_size ? "": ", Not Started", + (e && e->percent == RESYNC_DELAYED) ? + " (DELAYED)": "", + (e && e->percent == RESYNC_PENDING) ? + " (PENDING)": "", + (e && e->percent == RESYNC_REMOTE) ? + " (REMOTE)": ""); + } else if (inactive && !is_container) { + printf(" State : inactive\n"); + } + if (array.raid_disks) + printf(" Active Devices : %d\n", array.active_disks); + if (array.working_disks > 0) + printf(" Working Devices : %d\n", + array.working_disks); + if (array.raid_disks) { + printf(" Failed Devices : %d\n", array.failed_disks); + if (!external) + printf(" Spare Devices : %d\n", array.spare_disks); + } + printf("\n"); + if (array.level == 5) { + str = map_num(r5layout, array.layout); + printf(" Layout : %s\n", + str ? str : "-unknown-"); + } + if (array.level == 0 && array.layout) { + str = map_num(r0layout, array.layout); + printf(" Layout : %s\n", + str ? str : "-unknown-"); + } + if (array.level == 6) { + str = map_num(r6layout, array.layout); + printf(" Layout : %s\n", + str ? 
str : "-unknown-"); + } + if (array.level == 10) { + printf(" Layout :"); + print_r10_layout(array.layout); + printf("\n"); + } + switch (array.level) { + case 0: + case 4: + case 5: + case 10: + case 6: + if (array.chunk_size) + printf(" Chunk Size : %dK\n\n", + array.chunk_size/1024); + break; + case -1: + printf(" Rounding : %dK\n\n", + array.chunk_size/1024); + break; + default: + break; + } + + if (array.raid_disks) { + struct mdinfo *mdi; + + mdi = sysfs_read(fd, NULL, GET_CONSISTENCY_POLICY); + if (mdi) { + char *policy = map_num(consistency_policies, + mdi->consistency_policy); + sysfs_free(mdi); + if (policy) + printf("Consistency Policy : %s\n\n", + policy); + } + } + + if (e && e->percent >= 0) { + static char *sync_action[] = { + "Rebuild", "Resync", "Reshape", "Check"}; + printf(" %7s Status : %d%% complete\n", + sync_action[e->resync], e->percent); + } + + if ((st && st->sb) && (info && info->reshape_active)) { +#if 0 +This is pretty boring + printf(" Reshape pos'n : %llu%s\n", + (unsigned long long) info->reshape_progress << 9, + human_size((unsigned long long) + info->reshape_progress << 9)); +#endif + if (info->delta_disks != 0) + printf(" Delta Devices : %d, (%d->%d)\n", + info->delta_disks, + array.raid_disks - info->delta_disks, + array.raid_disks); + if (info->new_level != array.level) { + str = map_num(pers, info->new_level); + printf(" New Level : %s\n", + str ? str : "-unknown-"); + } + if (info->new_level != array.level || + info->new_layout != array.layout) { + if (info->new_level == 5) { + str = map_num(r5layout, + info->new_layout); + printf(" New Layout : %s\n", + str ? str : "-unknown-"); + } + if (info->new_level == 6) { + str = map_num(r6layout, + info->new_layout); + printf(" New Layout : %s\n", + str ? str : "-unknown-"); + } + if (info->new_level == 10) { + printf(" New Layout : near=%d, %s=%d\n", + info->new_layout & 255, + (info->new_layout & 0x10000) ? 
+ "offset" : "far", + (info->new_layout >> 8) & 255); + } + } + if (info->new_chunk != array.chunk_size) + printf(" New Chunksize : %dK\n", + info->new_chunk/1024); + printf("\n"); + } else if (e && e->percent >= 0) + printf("\n"); + free_mdstat(ms); + + if (st && st->sb) + st->ss->detail_super(st, c->homehost, subarray); + + if (array.raid_disks == 0 && sra && + sra->array.major_version == -1 && + sra->array.minor_version == -2 && + sra->text_version[0] != '/') { + /* This looks like a container. Find any active arrays + * That claim to be a member. + */ + DIR *dir = opendir("/sys/block"); + struct dirent *de; + + printf(" Member Arrays :"); + + while (dir && (de = readdir(dir)) != NULL) { + char path[287]; + char vbuf[1024]; + int nlen = strlen(sra->sys_name); + dev_t devid; + if (de->d_name[0] == '.') + continue; + sprintf(path, + "/sys/block/%s/md/metadata_version", + de->d_name); + if (load_sys(path, vbuf, sizeof(vbuf)) < 0) + continue; + if (strncmp(vbuf, "external:", 9) || + !is_subarray(vbuf + 9) || + strncmp(vbuf + 10, sra->sys_name, nlen) || + vbuf[10 + nlen] != '/') + continue; + devid = devnm2devid(de->d_name); + printf(" %s", + map_dev_preferred(major(devid), + minor(devid), 1, + c->prefer)); + } + if (dir) + closedir(dir); + printf("\n\n"); + } + + if (!c->no_devices) { + if (array.raid_disks) + printf(" Number Major Minor RaidDevice State\n"); + else + printf(" Number Major Minor RaidDevice\n"); + } + } + + /* if --no_devices specified, not print component devices info */ + if (c->no_devices) + goto skip_devices_state; + + for (d = 0; d < max_disks * 2; d++) { + char *dv; + mdu_disk_info_t disk = disks[d]; + + if (d >= array.raid_disks * 2 && + disk.major == 0 && disk.minor == 0) + continue; + if ((d & 1) && disk.major == 0 && disk.minor == 0) + continue; + if (!c->brief) { + if (d == array.raid_disks*2) + printf("\n"); + if (disk.number < 0 && disk.raid_disk < 0) + printf(" - %5d %5d - ", + disk.major, disk.minor); + else if (disk.raid_disk < 0 || + 
disk.state & (1 << MD_DISK_JOURNAL)) + printf(" %5d %5d %5d - ", + disk.number, disk.major, disk.minor); + else if (disk.number < 0) + printf(" - %5d %5d %5d ", + disk.major, disk.minor, disk.raid_disk); + else + printf(" %5d %5d %5d %5d ", + disk.number, disk.major, disk.minor, + disk.raid_disk); + } + if (!c->brief && array.raid_disks) { + if (disk.state & (1 << MD_DISK_FAULTY)) { + printf(" faulty"); + if (disk.raid_disk < array.raid_disks && + disk.raid_disk >= 0) + failed++; + } + if (disk.state & (1 << MD_DISK_ACTIVE)) + printf(" active"); + if (disk.state & (1 << MD_DISK_SYNC)) { + printf(" sync"); + if (array.level == 10 && + (array.layout & ~0x1FFFF) == 0) { + int nc = array.layout & 0xff; + int fc = (array.layout >> 8) & 0xff; + int copies = nc*fc; + if (fc == 1 && + array.raid_disks % copies == 0 && + copies <= 26) { + /* We can divide the devices + into 'sets' */ + int set; + set = disk.raid_disk % copies; + printf(" set-%c", set + 'A'); + } + } + } + if (disk.state & (1 << MD_DISK_REMOVED)) + printf(" removed"); + if (disk.state & (1 << MD_DISK_WRITEMOSTLY)) + printf(" writemostly"); + if (disk.state & (1 << MD_DISK_FAILFAST)) + printf(" failfast"); + if (disk.state & (1 << MD_DISK_JOURNAL)) + printf(" journal"); + if ((disk.state & + ((1 << MD_DISK_ACTIVE) | (1 << MD_DISK_SYNC) | + (1 << MD_DISK_REMOVED) | (1 << MD_DISK_FAULTY) | + (1 << MD_DISK_JOURNAL))) == 0) { + printf(" spare"); + if (disk.raid_disk < array.raid_disks && + disk.raid_disk >= 0) + printf(" rebuilding"); + } + } + if (disk.state == 0) + spares++; + dv = map_dev_preferred(disk.major, disk.minor, 0, c->prefer); + if (dv != NULL) { + if (c->brief) + n_devices = add_device(dv, &devices, + &max_devices, n_devices); + else + printf(" %s", dv); + } else if (disk.major | disk.minor) + printf(" missing"); + if (!c->brief) + printf("\n"); + } + +skip_devices_state: + if (spares && c->brief && array.raid_disks) + printf(" spares=%d", spares); + if (c->brief && st && st->sb) + 
st->ss->brief_detail_super(st, subarray); + if (st) + st->ss->free_super(st); + + if (c->brief && c->verbose > 0 && devices) { + qsort(devices, n_devices, sizeof(*devices), cmpstringp); + printf("\n devices=%s", devices[0]); + for (d = 1; d < n_devices; d++) + printf(",%s", devices[d]); + } + if (c->brief) + printf("\n"); + if (c->test && + !enough(array.level, array.raid_disks, array.layout, 1, avail)) + rv = 2; + +out: + free(info); + free(disks); + close(fd); + free(subarray); + free(avail); + if (devices) + for (d = 0; d < n_devices; d++) + free(devices[d]); + free(devices); + sysfs_free(sra); + free(st); + return rv; +} + +int Detail_Platform(struct superswitch *ss, int scan, int verbose, int export, char *controller_path) +{ + /* display platform capabilities for the given metadata format + * 'scan' in this context means iterate over all metadata types + */ + int i; + int err = 1; + + if (ss && export && ss->export_detail_platform) + err = ss->export_detail_platform(verbose, controller_path); + else if (ss && ss->detail_platform) + err = ss->detail_platform(verbose, 0, controller_path); + else if (ss) { + if (verbose > 0) + pr_err("%s metadata is platform independent\n", + ss->name ? : "[no name]"); + } else if (!scan) { + if (verbose > 0) + pr_err("specify a metadata type or --scan\n"); + } + + if (!scan) + return err; + + err = 0; + for (i = 0; superlist[i]; i++) { + struct superswitch *meta = superlist[i]; + + if (meta == ss) + continue; + if (verbose > 0) + pr_err("checking metadata %s\n", + meta->name ? : "[no name]"); + if (!meta->detail_platform) { + if (verbose > 0) + pr_err("%s metadata is platform independent\n", + meta->name ? : "[no name]"); + } else if (export && meta->export_detail_platform) { + err |= meta->export_detail_platform(verbose, controller_path); + } else + err |= meta->detail_platform(verbose, 0, controller_path); + } + + return err; +} @@ -0,0 +1,319 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. 
+ * + * Copyright (C) 2013 Neil Brown <neilb@suse.de> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +#include "mdadm.h" +#include <sys/dir.h> + +int Dump_metadata(char *dev, char *dir, struct context *c, + struct supertype *st) +{ + /* create a new file in 'dir' named for the basename of 'dev'. + * Truncate to the same size as 'dev' and ask the metadata + * handler to copy metadata there. + * For every name in /dev/disk/by-id that points to this device, + * create a hardlink in 'dir'. + * Complain if any of those hardlinks cannot be created. 
+ */ + int fd, fl; + struct stat stb, dstb; + char *base; + char *fname = NULL; + unsigned long long size; + DIR *dirp; + struct dirent *de; + + if (stat(dir, &stb) != 0 || + (S_IFMT & stb.st_mode) != S_IFDIR) { + pr_err("--dump requires an existing directory, not: %s\n", + dir); + return 16; + } + + fd = dev_open(dev, O_RDONLY); + if (fd < 0) { + pr_err("Cannot open %s to dump metadata: %s\n", + dev, strerror(errno)); + return 1; + } + if (!get_dev_size(fd, dev, &size)) { + close(fd); + return 1; + } + + if (st == NULL) + st = guess_super_type(fd, guess_array); + if (!st) { + pr_err("Cannot find RAID metadata on %s\n", dev); + close(fd); + return 1; + } + + st->ignore_hw_compat = 1; + if (st->ss->load_super(st, fd, NULL) != 0) { + pr_err("No %s metadata found on %s\n", + st->ss->name, dev); + close(fd); + return 1; + } + if (st->ss->copy_metadata == NULL) { + pr_err("%s metadata on %s cannot be copied\n", + st->ss->name, dev); + close(fd); + return 1; + } + + base = strrchr(dev, '/'); + if (base) + base++; + else + base = dev; + xasprintf(&fname, "%s/%s", dir, base); + fl = open(fname, O_RDWR|O_CREAT|O_EXCL, 0666); + if (fl < 0) { + pr_err("Cannot create dump file %s: %s\n", + fname, strerror(errno)); + close(fd); + free(fname); + return 1; + } + if (ftruncate(fl, size) < 0) { + pr_err("failed to set size of dump file: %s\n", + strerror(errno)); + close(fd); + close(fl); + free(fname); + return 1; + } + + if (st->ss->copy_metadata(st, fd, fl) != 0) { + pr_err("Failed to copy metadata from %s to %s\n", + dev, fname); + close(fd); + close(fl); + unlink(fname); + free(fname); + return 1; + } + if (c->verbose >= 0) + printf("%s saved as %s.\n", dev, fname); + fstat(fd, &dstb); + close(fd); + close(fl); + if ((dstb.st_mode & S_IFMT) != S_IFBLK) { + /* Not a block device, so cannot create links */ + free(fname); + return 0; + } + /* mostly done: just want to find some other names */ + dirp = opendir("/dev/disk/by-id"); + if (!dirp) { + free(fname); + return 0; + } + 
while ((de = readdir(dirp)) != NULL) { + char *p = NULL; + if (de->d_name[0] == '.') + continue; + xasprintf(&p, "/dev/disk/by-id/%s", de->d_name); + if (stat(p, &stb) != 0 || + (stb.st_mode & S_IFMT) != S_IFBLK || + stb.st_rdev != dstb.st_rdev) { + /* Not this one */ + free(p); + continue; + } + free(p); + xasprintf(&p, "%s/%s", dir, de->d_name); + if (link(fname, p) == 0) { + if (c->verbose >= 0) + printf("%s also saved as %s.\n", + dev, p); + } else { + pr_err("Could not save %s as %s!!\n", + dev, p); + } + free(p); + } + closedir(dirp); + free(fname); + return 0; +} + +int Restore_metadata(char *dev, char *dir, struct context *c, + struct supertype *st, int only) +{ + /* If 'dir' really is a directory we choose a name + * from it that matches a suitable name in /dev/disk/by-id, + * and copy metadata from the file to the device. + * If two names from by-id match and aren't both the same + * inode, we fail. If none match and basename of 'dev' + * can be found in dir, use that. + * If 'dir' is really a file then it is only permitted if + * 'only' is set (meaning there was only one device given) + * and the metadata is restored irrespective of file names. + */ + int fd, fl; + struct stat stb, dstb; + char *fname = NULL; + unsigned long long size; + + if (stat(dir, &stb) != 0) { + pr_err("%s does not exist: cannot restore from there.\n", + dir); + return 16; + } else if ((S_IFMT & stb.st_mode) != S_IFDIR && !only) { + pr_err("--restore requires a directory when multiple devices given\n"); + return 16; + } + + fd = dev_open(dev, O_RDWR); + if (fd < 0) { + pr_err("Cannot open %s to restore metadata: %s\n", + dev, strerror(errno)); + return 1; + } + if (!get_dev_size(fd, dev, &size)) { + close(fd); + return 1; + } + + if ((S_IFMT & stb.st_mode) == S_IFDIR) { + /* choose one name from the directory. 
*/ + DIR *d = opendir(dir); + struct dirent *de; + char *chosen = NULL; + unsigned int chosen_inode = 0; + + fstat(fd, &dstb); + + while (d && (de = readdir(d)) != NULL) { + if (de->d_name[0] == '.') + continue; + xasprintf(&fname, "/dev/disk/by-id/%s", de->d_name); + if (stat(fname, &stb) != 0) { + free(fname); + continue; + } + free(fname); + if ((S_IFMT & stb.st_mode) != S_IFBLK) + continue; + if (stb.st_rdev != dstb.st_rdev) + continue; + /* This file is a good match for our device. */ + xasprintf(&fname, "%s/%s", dir, de->d_name); + if (stat(fname, &stb) != 0) { + /* Weird! */ + free(fname); + continue; + } + if (chosen == NULL) { + chosen = fname; + chosen_inode = stb.st_ino; + continue; + } + if (chosen_inode == stb.st_ino) { + /* same, no need to change */ + free(fname); + continue; + } + /* Oh dear, two names both match. Must give up. */ + pr_err("Both %s and %s seem suitable for %s. Please choose one.\n", + chosen, fname, dev); + free(fname); + free(chosen); + close(fd); + closedir(d); + return 1; + } + closedir(d); + if (!chosen) { + /* One last chance: try basename of device */ + char *base = strrchr(dev, '/'); + if (base) + base++; + else + base = dev; + xasprintf(&fname, "%s/%s", dir, base); + if (stat(fname, &stb) == 0) + chosen = fname; + else + free(fname); + } + fname = chosen; + } else + fname = strdup(dir); + + if (!fname) { + pr_err("Cannot find suitable file in %s for %s\n", + dir, dev); + close(fd); + return 1; + } + + fl = open(fname, O_RDONLY); + if (!fl) { + pr_err("Could not open %s for --restore.\n", + fname); + goto err; + } + if (stat(fname, &stb) != 0) { + pr_err("Could not stat %s for --restore.\n", + fname); + goto err; + } + if (((unsigned long long)stb.st_size) != size) { + pr_err("%s is not the same size as %s - cannot restore.\n", + fname, dev); + goto err; + } + if (st == NULL) + st = guess_super_type(fl, guess_array); + if (!st) { + pr_err("Cannot find metadata on %s\n", fname); + goto err; + } + st->ignore_hw_compat = 1; + if 
(st->ss->load_super(st, fl, NULL) != 0) { + pr_err("No %s metadata found on %s\n", + st->ss->name, fname); + goto err; + } + if (st->ss->copy_metadata == NULL) { + pr_err("%s metadata on %s cannot be copied\n", + st->ss->name, dev); + goto err; + } + if (st->ss->copy_metadata(st, fl, fd) != 0) { + pr_err("Failed to copy metadata from %s to %s\n", + fname, dev); + goto err; + } + if (c->verbose >= 0) + printf("%s restored from %s.\n", dev, fname); + close(fl); + close(fd); + free(fname); + return 0; + +err: + close(fd); + close(fl); + free(fname); + return 1; +} diff --git a/Examine.c b/Examine.c new file mode 100644 index 0000000..9574a3c --- /dev/null +++ b/Examine.c @@ -0,0 +1,228 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2001-2013 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +#include "mdadm.h" +#include "dlink.h" + +#if ! defined(__BIG_ENDIAN) && ! defined(__LITTLE_ENDIAN) +#error no endian defined +#endif +#include "md_u.h" +#include "md_p.h" +int Examine(struct mddev_dev *devlist, + struct context *c, + struct supertype *forcest) +{ + + /* Read the raid superblock from a device and + * display important content. 
+ * + * If cannot be found, print reason: too small, bad magic + * + * Print: + * version, ctime, level, size, raid+spare+ + * prefered minor + * uuid + * + * utime, state etc + * + * If (brief) gather devices for same array and just print a mdadm.conf + * line including devices= + * if devlist==NULL, use conf_get_devs() + */ + int fd; + int rv = 0; + + struct array { + struct supertype *st; + struct mdinfo info; + void *devs; + struct array *next; + int spares; + } *arrays = NULL; + + for (; devlist ; devlist = devlist->next) { + struct supertype *st; + int have_container = 0; + int err = 0; + int container = 0; + + fd = dev_open(devlist->devname, O_RDONLY); + if (fd < 0) { + if (!c->scan) { + pr_err("cannot open %s: %s\n", + devlist->devname, strerror(errno)); + rv = 1; + } + continue; + } + + if (forcest) + st = dup_super(forcest); + else if (must_be_container(fd)) { + /* might be a container */ + st = super_by_fd(fd, NULL); + container = 1; + } else + st = guess_super(fd); + if (st) { + err = 1; + st->ignore_hw_compat = 1; + if (!container) + err = st->ss->load_super(st, fd, + (c->brief||c->scan) ? NULL + :devlist->devname); + if (err && st->ss->load_container) { + err = st->ss->load_container(st, fd, + (c->brief||c->scan) ? 
NULL + :devlist->devname); + if (!err) + have_container = 1; + } + st->ignore_hw_compat = 0; + } else { + if (!c->brief) { + pr_err("No md superblock detected on %s.\n", devlist->devname); + rv = 1; + } + err = 1; + } + close(fd); + + if (err) { + if (st) + st->ss->free_super(st); + continue; + } + + if (c->SparcAdjust) + st->ss->update_super(st, NULL, "sparc2.2", + devlist->devname, 0, 0, NULL); + /* Ok, its good enough to try, though the checksum could be wrong */ + + if (c->brief && st->ss->brief_examine_super == NULL) { + if (!c->scan) + pr_err("No brief listing for %s on %s\n", + st->ss->name, devlist->devname); + } else if (c->brief) { + struct array *ap; + char *d; + for (ap = arrays; ap; ap = ap->next) { + if (st->ss == ap->st->ss && + st->ss->compare_super(ap->st, st, 0) == 0) + break; + } + if (!ap) { + ap = xmalloc(sizeof(*ap)); + ap->devs = dl_head(); + ap->next = arrays; + ap->spares = 0; + ap->st = st; + arrays = ap; + st->ss->getinfo_super(st, &ap->info, NULL); + } else + st->ss->getinfo_super(st, &ap->info, NULL); + if (!have_container && + !(ap->info.disk.state & (1<<MD_DISK_SYNC))) + ap->spares++; + d = dl_strdup(devlist->devname); + dl_add(ap->devs, d); + } else if (c->export) { + if (st->ss->export_examine_super) + st->ss->export_examine_super(st); + st->ss->free_super(st); + } else { + printf("%s:\n",devlist->devname); + st->ss->examine_super(st, c->homehost); + st->ss->free_super(st); + } + } + if (c->brief) { + struct array *ap; + for (ap = arrays; ap; ap = ap->next) { + char sep='='; + char *d; + int newline = 0; + + ap->st->ss->brief_examine_super(ap->st, c->verbose > 0); + if (ap->spares && !ap->st->ss->external) + newline += printf(" spares=%d", ap->spares); + if (c->verbose > 0) { + newline += printf(" devices"); + for (d = dl_next(ap->devs); + d != ap->devs; + d=dl_next(d)) { + printf("%c%s", sep, d); + sep=','; + } + } + if (ap->st->ss->brief_examine_subarrays) { + if (newline) + printf("\n"); + 
ap->st->ss->brief_examine_subarrays(ap->st, c->verbose); + } + ap->st->ss->free_super(ap->st); + /* FIXME free ap */ + if (ap->spares || c->verbose > 0) + printf("\n"); + } + } + return rv; +} + +int ExamineBadblocks(char *devname, int brief, struct supertype *forcest) +{ + int fd = dev_open(devname, O_RDONLY); + struct supertype *st = forcest; + int err = 1; + + if (fd < 0) { + pr_err("cannot open %s: %s\n", devname, strerror(errno)); + return 1; + } + if (!st) + st = guess_super(fd); + if (!st) { + if (!brief) + pr_err("No md superblock detected on %s\n", devname); + goto out; + } + if (!st->ss->examine_badblocks) { + pr_err("%s metadata does not support badblocks\n", st->ss->name); + goto out; + } + err = st->ss->load_super(st, fd, brief ? NULL : devname); + if (err) + goto out; + err = st->ss->examine_badblocks(st, fd, devname); + +out: + if (fd >= 0) + close(fd); + if (st) { + st->ss->free_super(st); + free(st); + } + return err; +} @@ -0,0 +1,5229 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2001-2013 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ +#include "mdadm.h" +#include "dlink.h" +#include <sys/mman.h> +#include <stddef.h> +#include <stdint.h> +#include <signal.h> +#include <sys/wait.h> + +#if ! defined(__BIG_ENDIAN) && ! defined(__LITTLE_ENDIAN) +#error no endian defined +#endif +#include "md_u.h" +#include "md_p.h" + +int restore_backup(struct supertype *st, + struct mdinfo *content, + int working_disks, + int next_spare, + char **backup_filep, + int verbose) +{ + int i; + int *fdlist; + struct mdinfo *dev; + int err; + int disk_count = next_spare + working_disks; + char *backup_file = *backup_filep; + + dprintf("Called restore_backup()\n"); + fdlist = xmalloc(sizeof(int) * disk_count); + + enable_fds(next_spare); + for (i = 0; i < next_spare; i++) + fdlist[i] = -1; + for (dev = content->devs; dev; dev = dev->next) { + char buf[22]; + int fd; + + sprintf(buf, "%d:%d", dev->disk.major, dev->disk.minor); + fd = dev_open(buf, O_RDWR); + + if (dev->disk.raid_disk >= 0) + fdlist[dev->disk.raid_disk] = fd; + else + fdlist[next_spare++] = fd; + } + + if (!backup_file) { + backup_file = locate_backup(content->sys_name); + *backup_filep = backup_file; + } + + if (st->ss->external && st->ss->recover_backup) + err = st->ss->recover_backup(st, content); + else + err = Grow_restart(st, content, fdlist, next_spare, + backup_file, verbose > 0); + + while (next_spare > 0) { + next_spare--; + if (fdlist[next_spare] >= 0) + close(fdlist[next_spare]); + } + free(fdlist); + if (err) { + pr_err("Failed to restore critical section for reshape - sorry.\n"); + if (!backup_file) + pr_err("Possibly you need to specify a --backup-file\n"); + return 1; + } + + dprintf("restore_backup() returns status OK.\n"); + return 0; +} + +int Grow_Add_device(char 
*devname, int fd, char *newdev) +{ + /* Add a device to an active array. + * Currently, just extend a linear array. + * This requires writing a new superblock on the + * new device, calling the kernel to add the device, + * and if that succeeds, update the superblock on + * all other devices. + * This means that we need to *find* all other devices. + */ + struct mdinfo info; + + dev_t rdev; + int nfd, fd2; + int d, nd; + struct supertype *st = NULL; + char *subarray = NULL; + + if (md_get_array_info(fd, &info.array) < 0) { + pr_err("cannot get array info for %s\n", devname); + return 1; + } + + if (info.array.level != -1) { + pr_err("can only add devices to linear arrays\n"); + return 1; + } + + st = super_by_fd(fd, &subarray); + if (!st) { + pr_err("cannot handle arrays with superblock version %d\n", + info.array.major_version); + return 1; + } + + if (subarray) { + pr_err("Cannot grow linear sub-arrays yet\n"); + free(subarray); + free(st); + return 1; + } + + nfd = open(newdev, O_RDWR|O_EXCL|O_DIRECT); + if (nfd < 0) { + pr_err("cannot open %s\n", newdev); + free(st); + return 1; + } + if (!fstat_is_blkdev(nfd, newdev, &rdev)) { + close(nfd); + free(st); + return 1; + } + /* now check out all the devices and make sure we can read the + * superblock */ + for (d=0 ; d < info.array.raid_disks ; d++) { + mdu_disk_info_t disk; + char *dv; + + st->ss->free_super(st); + + disk.number = d; + if (md_get_disk_info(fd, &disk) < 0) { + pr_err("cannot get device detail for device %d\n", d); + close(nfd); + free(st); + return 1; + } + dv = map_dev(disk.major, disk.minor, 1); + if (!dv) { + pr_err("cannot find device file for device %d\n", d); + close(nfd); + free(st); + return 1; + } + fd2 = dev_open(dv, O_RDWR); + if (fd2 < 0) { + pr_err("cannot open device file %s\n", dv); + close(nfd); + free(st); + return 1; + } + + if (st->ss->load_super(st, fd2, NULL)) { + pr_err("cannot find super block on %s\n", dv); + close(nfd); + close(fd2); + free(st); + return 1; + } + 
close(fd2); + } + /* Ok, looks good. Lets update the superblock and write it out to + * newdev. + */ + + info.disk.number = d; + info.disk.major = major(rdev); + info.disk.minor = minor(rdev); + info.disk.raid_disk = d; + info.disk.state = (1 << MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE); + if (st->ss->update_super(st, &info, "linear-grow-new", newdev, + 0, 0, NULL) != 0) { + pr_err("Preparing new metadata failed on %s\n", newdev); + close(nfd); + return 1; + } + + if (st->ss->store_super(st, nfd)) { + pr_err("Cannot store new superblock on %s\n", newdev); + close(nfd); + return 1; + } + close(nfd); + + if (ioctl(fd, ADD_NEW_DISK, &info.disk) != 0) { + pr_err("Cannot add new disk to this array\n"); + return 1; + } + /* Well, that seems to have worked. + * Now go through and update all superblocks + */ + + if (md_get_array_info(fd, &info.array) < 0) { + pr_err("cannot get array info for %s\n", devname); + return 1; + } + + nd = d; + for (d=0 ; d < info.array.raid_disks ; d++) { + mdu_disk_info_t disk; + char *dv; + + disk.number = d; + if (md_get_disk_info(fd, &disk) < 0) { + pr_err("cannot get device detail for device %d\n", d); + return 1; + } + dv = map_dev(disk.major, disk.minor, 1); + if (!dv) { + pr_err("cannot find device file for device %d\n", d); + return 1; + } + fd2 = dev_open(dv, O_RDWR); + if (fd2 < 0) { + pr_err("cannot open device file %s\n", dv); + return 1; + } + if (st->ss->load_super(st, fd2, NULL)) { + pr_err("cannot find super block on %s\n", dv); + close(fd); + close(fd2); + return 1; + } + info.array.raid_disks = nd+1; + info.array.nr_disks = nd+1; + info.array.active_disks = nd+1; + info.array.working_disks = nd+1; + + if (st->ss->update_super(st, &info, "linear-grow-update", dv, + 0, 0, NULL) != 0) { + pr_err("Updating metadata failed on %s\n", dv); + close(fd2); + return 1; + } + + if (st->ss->store_super(st, fd2)) { + pr_err("Cannot store new superblock on %s\n", dv); + close(fd2); + return 1; + } + close(fd2); + } + + return 0; +} + +int 
Grow_addbitmap(char *devname, int fd, struct context *c, struct shape *s) +{ + /* + * First check that array doesn't have a bitmap + * Then create the bitmap + * Then add it + * + * For internal bitmaps, we need to check the version, + * find all the active devices, and write the bitmap block + * to all devices + */ + mdu_bitmap_file_t bmf; + mdu_array_info_t array; + struct supertype *st; + char *subarray = NULL; + int major = BITMAP_MAJOR_HI; + unsigned long long bitmapsize, array_size; + struct mdinfo *mdi; + + /* + * We only ever get called if s->bitmap_file is != NULL, so this check + * is just here to quiet down static code checkers. + */ + if (!s->bitmap_file) + return 1; + + if (strcmp(s->bitmap_file, "clustered") == 0) + major = BITMAP_MAJOR_CLUSTERED; + + if (ioctl(fd, GET_BITMAP_FILE, &bmf) != 0) { + if (errno == ENOMEM) + pr_err("Memory allocation failure.\n"); + else + pr_err("bitmaps not supported by this kernel.\n"); + return 1; + } + if (bmf.pathname[0]) { + if (strcmp(s->bitmap_file,"none") == 0) { + if (ioctl(fd, SET_BITMAP_FILE, -1) != 0) { + pr_err("failed to remove bitmap %s\n", + bmf.pathname); + return 1; + } + return 0; + } + pr_err("%s already has a bitmap (%s)\n", devname, bmf.pathname); + return 1; + } + if (md_get_array_info(fd, &array) != 0) { + pr_err("cannot get array status for %s\n", devname); + return 1; + } + if (array.state & (1 << MD_SB_BITMAP_PRESENT)) { + if (strcmp(s->bitmap_file, "none")==0) { + array.state &= ~(1 << MD_SB_BITMAP_PRESENT); + if (md_set_array_info(fd, &array) != 0) { + if (array.state & (1 << MD_SB_CLUSTERED)) + pr_err("failed to remove clustered bitmap.\n"); + else + pr_err("failed to remove internal bitmap.\n"); + return 1; + } + return 0; + } + pr_err("bitmap already present on %s\n", devname); + return 1; + } + + if (strcmp(s->bitmap_file, "none") == 0) { + pr_err("no bitmap found on %s\n", devname); + return 1; + } + if (array.level <= 0) { + pr_err("Bitmaps not meaningful with level %s\n", + 
map_num(pers, array.level)?:"of this array"); + return 1; + } + bitmapsize = array.size; + bitmapsize <<= 1; + if (get_dev_size(fd, NULL, &array_size) && + array_size > (0x7fffffffULL << 9)) { + /* Array is big enough that we cannot trust array.size + * try other approaches + */ + bitmapsize = get_component_size(fd); + } + if (bitmapsize == 0) { + pr_err("Cannot reliably determine size of array to create bitmap - sorry.\n"); + return 1; + } + + if (array.level == 10) { + int ncopies; + + ncopies = (array.layout & 255) * ((array.layout >> 8) & 255); + bitmapsize = bitmapsize * array.raid_disks / ncopies; + + if (strcmp(s->bitmap_file, "clustered") == 0 && + !is_near_layout_10(array.layout)) { + pr_err("only near layout is supported with clustered raid10\n"); + return 1; + } + } + + st = super_by_fd(fd, &subarray); + if (!st) { + pr_err("Cannot understand version %d.%d\n", + array.major_version, array.minor_version); + return 1; + } + if (subarray) { + pr_err("Cannot add bitmaps to sub-arrays yet\n"); + free(subarray); + free(st); + return 1; + } + + mdi = sysfs_read(fd, NULL, GET_CONSISTENCY_POLICY); + if (mdi) { + if (mdi->consistency_policy == CONSISTENCY_POLICY_PPL) { + pr_err("Cannot add bitmap to array with PPL\n"); + free(mdi); + free(st); + return 1; + } + free(mdi); + } + + if (strcmp(s->bitmap_file, "internal") == 0 || + strcmp(s->bitmap_file, "clustered") == 0) { + int rv; + int d; + int offset_setable = 0; + if (st->ss->add_internal_bitmap == NULL) { + pr_err("Internal bitmaps not supported with %s metadata\n", st->ss->name); + return 1; + } + st->nodes = c->nodes; + st->cluster_name = c->homecluster; + mdi = sysfs_read(fd, NULL, GET_BITMAP_LOCATION); + if (mdi) + offset_setable = 1; + for (d = 0; d < st->max_devs; d++) { + mdu_disk_info_t disk; + char *dv; + int fd2; + + disk.number = d; + if (md_get_disk_info(fd, &disk) < 0) + continue; + if (disk.major == 0 && disk.minor == 0) + continue; + if ((disk.state & (1 << MD_DISK_SYNC)) == 0) + continue; + dv 
= map_dev(disk.major, disk.minor, 1); + if (!dv) + continue; + if (((disk.state & (1 << MD_DISK_WRITEMOSTLY)) == 0) && + (strcmp(s->bitmap_file, "clustered") == 0)) { + pr_err("%s disks marked write-mostly are not supported with clustered bitmap\n",devname); + return 1; + } + fd2 = dev_open(dv, O_RDWR); + if (fd2 < 0) + continue; + rv = st->ss->load_super(st, fd2, NULL); + if (!rv) { + rv = st->ss->add_internal_bitmap( + st, &s->bitmap_chunk, c->delay, + s->write_behind, bitmapsize, + offset_setable, major); + if (!rv) { + st->ss->write_bitmap(st, fd2, + NodeNumUpdate); + } else { + pr_err("failed to create internal bitmap - chunksize problem.\n"); + } + } else { + pr_err("failed to load super-block.\n"); + } + close(fd2); + if (rv) + return 1; + } + if (offset_setable) { + st->ss->getinfo_super(st, mdi, NULL); + if (sysfs_init(mdi, fd, NULL)) { + pr_err("failed to initialize sysfs.\n"); + free(mdi); + } + rv = sysfs_set_num_signed(mdi, NULL, "bitmap/location", + mdi->bitmap_offset); + free(mdi); + } else { + if (strcmp(s->bitmap_file, "clustered") == 0) + array.state |= (1 << MD_SB_CLUSTERED); + array.state |= (1 << MD_SB_BITMAP_PRESENT); + rv = md_set_array_info(fd, &array); + } + if (rv < 0) { + if (errno == EBUSY) + pr_err("Cannot add bitmap while array is resyncing or reshaping etc.\n"); + pr_err("failed to set internal bitmap.\n"); + return 1; + } + } else { + int uuid[4]; + int bitmap_fd; + int d; + int max_devs = st->max_devs; + + /* try to load a superblock */ + for (d = 0; d < max_devs; d++) { + mdu_disk_info_t disk; + char *dv; + int fd2; + disk.number = d; + if (md_get_disk_info(fd, &disk) < 0) + continue; + if ((disk.major==0 && disk.minor == 0) || + (disk.state & (1 << MD_DISK_REMOVED))) + continue; + dv = map_dev(disk.major, disk.minor, 1); + if (!dv) + continue; + fd2 = dev_open(dv, O_RDONLY); + if (fd2 >= 0) { + if (st->ss->load_super(st, fd2, NULL) == 0) { + close(fd2); + st->ss->uuid_from_super(st, uuid); + break; + } + close(fd2); + } + } + if 
(d == max_devs) { + pr_err("cannot find UUID for array!\n"); + return 1; + } + if (CreateBitmap(s->bitmap_file, c->force, (char*)uuid, + s->bitmap_chunk, c->delay, s->write_behind, + bitmapsize, major)) { + return 1; + } + bitmap_fd = open(s->bitmap_file, O_RDWR); + if (bitmap_fd < 0) { + pr_err("weird: %s cannot be opened\n", s->bitmap_file); + return 1; + } + if (ioctl(fd, SET_BITMAP_FILE, bitmap_fd) < 0) { + int err = errno; + if (errno == EBUSY) + pr_err("Cannot add bitmap while array is resyncing or reshaping etc.\n"); + pr_err("Cannot set bitmap file for %s: %s\n", + devname, strerror(err)); + return 1; + } + } + + return 0; +} + +int Grow_consistency_policy(char *devname, int fd, struct context *c, struct shape *s) +{ + struct supertype *st; + struct mdinfo *sra; + struct mdinfo *sd; + char *subarray = NULL; + int ret = 0; + char container_dev[PATH_MAX]; + char buf[20]; + + if (s->consistency_policy != CONSISTENCY_POLICY_RESYNC && + s->consistency_policy != CONSISTENCY_POLICY_PPL) { + pr_err("Operation not supported for consistency policy %s\n", + map_num(consistency_policies, s->consistency_policy)); + return 1; + } + + st = super_by_fd(fd, &subarray); + if (!st) + return 1; + + sra = sysfs_read(fd, NULL, GET_CONSISTENCY_POLICY|GET_LEVEL| + GET_DEVS|GET_STATE); + if (!sra) { + ret = 1; + goto free_st; + } + + if (s->consistency_policy == CONSISTENCY_POLICY_PPL && + !st->ss->write_init_ppl) { + pr_err("%s metadata does not support PPL\n", st->ss->name); + ret = 1; + goto free_info; + } + + if (sra->array.level != 5) { + pr_err("Operation not supported for array level %d\n", + sra->array.level); + ret = 1; + goto free_info; + } + + if (sra->consistency_policy == (unsigned)s->consistency_policy) { + pr_err("Consistency policy is already %s\n", + map_num(consistency_policies, s->consistency_policy)); + ret = 1; + goto free_info; + } else if (sra->consistency_policy != CONSISTENCY_POLICY_RESYNC && + sra->consistency_policy != CONSISTENCY_POLICY_PPL) { + 
pr_err("Current consistency policy is %s, cannot change to %s\n", + map_num(consistency_policies, sra->consistency_policy), + map_num(consistency_policies, s->consistency_policy)); + ret = 1; + goto free_info; + } + + if (s->consistency_policy == CONSISTENCY_POLICY_PPL) { + if (sysfs_get_str(sra, NULL, "sync_action", buf, 20) <= 0) { + ret = 1; + goto free_info; + } else if (strcmp(buf, "reshape\n") == 0) { + pr_err("PPL cannot be enabled when reshape is in progress\n"); + ret = 1; + goto free_info; + } + } + + if (subarray) { + char *update; + + if (s->consistency_policy == CONSISTENCY_POLICY_PPL) + update = "ppl"; + else + update = "no-ppl"; + + sprintf(container_dev, "/dev/%s", st->container_devnm); + + ret = Update_subarray(container_dev, subarray, update, NULL, + c->verbose); + if (ret) + goto free_info; + } + + if (s->consistency_policy == CONSISTENCY_POLICY_PPL) { + struct mdinfo info; + + if (subarray) { + struct mdinfo *mdi; + int cfd; + + cfd = open(container_dev, O_RDWR|O_EXCL); + if (cfd < 0) { + pr_err("Failed to open %s\n", container_dev); + ret = 1; + goto free_info; + } + + ret = st->ss->load_container(st, cfd, st->container_devnm); + close(cfd); + + if (ret) { + pr_err("Cannot read superblock for %s\n", + container_dev); + goto free_info; + } + + mdi = st->ss->container_content(st, subarray); + info = *mdi; + free(mdi); + } + + for (sd = sra->devs; sd; sd = sd->next) { + int dfd; + char *devpath; + + devpath = map_dev(sd->disk.major, sd->disk.minor, 0); + dfd = dev_open(devpath, O_RDWR); + if (dfd < 0) { + pr_err("Failed to open %s\n", devpath); + ret = 1; + goto free_info; + } + + if (!subarray) { + ret = st->ss->load_super(st, dfd, NULL); + if (ret) { + pr_err("Failed to load super-block.\n"); + close(dfd); + goto free_info; + } + + ret = st->ss->update_super(st, sra, "ppl", + devname, + c->verbose, 0, NULL); + if (ret) { + close(dfd); + st->ss->free_super(st); + goto free_info; + } + st->ss->getinfo_super(st, &info, NULL); + } + + ret |= 
sysfs_set_num(sra, sd, "ppl_sector", + info.ppl_sector); + ret |= sysfs_set_num(sra, sd, "ppl_size", + info.ppl_size); + + if (ret) { + pr_err("Failed to set PPL attributes for %s\n", + sd->sys_name); + close(dfd); + st->ss->free_super(st); + goto free_info; + } + + ret = st->ss->write_init_ppl(st, &info, dfd); + if (ret) + pr_err("Failed to write PPL\n"); + + close(dfd); + + if (!subarray) + st->ss->free_super(st); + + if (ret) + goto free_info; + } + } + + ret = sysfs_set_str(sra, NULL, "consistency_policy", + map_num(consistency_policies, + s->consistency_policy)); + if (ret) + pr_err("Failed to change array consistency policy\n"); + +free_info: + sysfs_free(sra); +free_st: + free(st); + free(subarray); + + return ret; +} + +/* + * When reshaping an array we might need to backup some data. + * This is written to all spares with a 'super_block' describing it. + * The superblock goes 4K from the end of the used space on the + * device. + * It if written after the backup is complete. + * It has the following structure. + */ + +static struct mdp_backup_super { + char magic[16]; /* md_backup_data-1 or -2 */ + __u8 set_uuid[16]; + __u64 mtime; + /* start/sizes in 512byte sectors */ + __u64 devstart; /* address on backup device/file of data */ + __u64 arraystart; + __u64 length; + __u32 sb_csum; /* csum of preceeding bytes. */ + __u32 pad1; + __u64 devstart2; /* offset in to data of second section */ + __u64 arraystart2; + __u64 length2; + __u32 sb_csum2; /* csum of preceeding bytes. */ + __u8 pad[512-68-32]; +} __attribute__((aligned(512))) bsb, bsb2; + +static __u32 bsb_csum(char *buf, int len) +{ + int i; + int csum = 0; + for (i = 0; i < len; i++) + csum = (csum<<3) + buf[0]; + return __cpu_to_le32(csum); +} + +static int check_idle(struct supertype *st) +{ + /* Check that all member arrays for this container, or the + * container of this array, are idle + */ + char *container = (st->container_devnm[0] + ? 
st->container_devnm : st->devnm); + struct mdstat_ent *ent, *e; + int is_idle = 1; + + ent = mdstat_read(0, 0); + for (e = ent ; e; e = e->next) { + if (!is_container_member(e, container)) + continue; + /* frozen array is not idle*/ + if (e->percent >= 0 || e->metadata_version[9] == '-') { + is_idle = 0; + break; + } + } + free_mdstat(ent); + return is_idle; +} + +static int freeze_container(struct supertype *st) +{ + char *container = (st->container_devnm[0] + ? st->container_devnm : st->devnm); + + if (!check_idle(st)) + return -1; + + if (block_monitor(container, 1)) { + pr_err("failed to freeze container\n"); + return -2; + } + + return 1; +} + +static void unfreeze_container(struct supertype *st) +{ + char *container = (st->container_devnm[0] + ? st->container_devnm : st->devnm); + + unblock_monitor(container, 1); +} + +static int freeze(struct supertype *st) +{ + /* Try to freeze resync/rebuild on this array/container. + * Return -1 if the array is busy, + * return -2 container cannot be frozen, + * return 0 if this kernel doesn't support 'frozen' + * return 1 if it worked. 
+ */ + if (st->ss->external) + return freeze_container(st); + else { + struct mdinfo *sra = sysfs_read(-1, st->devnm, GET_VERSION); + int err; + char buf[20]; + + if (!sra) + return -1; + /* Need to clear any 'read-auto' status */ + if (sysfs_get_str(sra, NULL, "array_state", buf, 20) > 0 && + strncmp(buf, "read-auto", 9) == 0) + sysfs_set_str(sra, NULL, "array_state", "clean"); + + err = sysfs_freeze_array(sra); + sysfs_free(sra); + return err; + } +} + +static void unfreeze(struct supertype *st) +{ + if (st->ss->external) + return unfreeze_container(st); + else { + struct mdinfo *sra = sysfs_read(-1, st->devnm, GET_VERSION); + char buf[20]; + + if (sra && + sysfs_get_str(sra, NULL, "sync_action", buf, 20) > 0 && + strcmp(buf, "frozen\n") == 0) + sysfs_set_str(sra, NULL, "sync_action", "idle"); + sysfs_free(sra); + } +} + +static void wait_reshape(struct mdinfo *sra) +{ + int fd = sysfs_get_fd(sra, NULL, "sync_action"); + char action[20]; + + if (fd < 0) + return; + + while (sysfs_fd_get_str(fd, action, 20) > 0 && + strncmp(action, "reshape", 7) == 0) + sysfs_wait(fd, NULL); + close(fd); +} + +static int reshape_super(struct supertype *st, unsigned long long size, + int level, int layout, int chunksize, int raid_disks, + int delta_disks, char *backup_file, char *dev, + int direction, int verbose) +{ + /* nothing extra to check in the native case */ + if (!st->ss->external) + return 0; + if (!st->ss->reshape_super || !st->ss->manage_reshape) { + pr_err("%s metadata does not support reshape\n", + st->ss->name); + return 1; + } + + return st->ss->reshape_super(st, size, level, layout, chunksize, + raid_disks, delta_disks, backup_file, dev, + direction, verbose); +} + +static void sync_metadata(struct supertype *st) +{ + if (st->ss->external) { + if (st->update_tail) { + flush_metadata_updates(st); + st->update_tail = &st->updates; + } else + st->ss->sync_metadata(st); + } +} + +static int subarray_set_num(char *container, struct mdinfo *sra, char *name, int n) +{ + 
/* when dealing with external metadata subarrays we need to be + * prepared to handle EAGAIN. The kernel may need to wait for + * mdmon to mark the array active so the kernel can handle + * allocations/writeback when preparing the reshape action + * (md_allow_write()). We temporarily disable safe_mode_delay + * to close a race with the array_state going clean before the + * next write to raid_disks / stripe_cache_size + */ + char safe[50]; + int rc; + + /* only 'raid_disks' and 'stripe_cache_size' trigger md_allow_write */ + if (!container || + (strcmp(name, "raid_disks") != 0 && + strcmp(name, "stripe_cache_size") != 0)) + return sysfs_set_num(sra, NULL, name, n); + + rc = sysfs_get_str(sra, NULL, "safe_mode_delay", safe, sizeof(safe)); + if (rc <= 0) + return -1; + sysfs_set_num(sra, NULL, "safe_mode_delay", 0); + rc = sysfs_set_num(sra, NULL, name, n); + if (rc < 0 && errno == EAGAIN) { + ping_monitor(container); + /* if we get EAGAIN here then the monitor is not active + * so stop trying + */ + rc = sysfs_set_num(sra, NULL, name, n); + } + sysfs_set_str(sra, NULL, "safe_mode_delay", safe); + return rc; +} + +int start_reshape(struct mdinfo *sra, int already_running, + int before_data_disks, int data_disks, struct supertype *st) +{ + int err; + unsigned long long sync_max_to_set; + + sysfs_set_num(sra, NULL, "suspend_lo", 0x7FFFFFFFFFFFFFFFULL); + err = sysfs_set_num(sra, NULL, "suspend_hi", sra->reshape_progress); + err = err ?: sysfs_set_num(sra, NULL, "suspend_lo", + sra->reshape_progress); + if (before_data_disks <= data_disks) + sync_max_to_set = sra->reshape_progress / data_disks; + else + sync_max_to_set = (sra->component_size * data_disks + - sra->reshape_progress) / data_disks; + + if (!already_running) + sysfs_set_num(sra, NULL, "sync_min", sync_max_to_set); + + if (st->ss->external) + err = err ?: sysfs_set_num(sra, NULL, "sync_max", sync_max_to_set); + else + err = err ?: sysfs_set_str(sra, NULL, "sync_max", "max"); + + if (!already_running && err == 
0) { + int cnt = 5; + do { + err = sysfs_set_str(sra, NULL, "sync_action", + "reshape"); + if (err) + sleep(1); + } while (err && errno == EBUSY && cnt-- > 0); + } + return err; +} + +void abort_reshape(struct mdinfo *sra) +{ + sysfs_set_str(sra, NULL, "sync_action", "idle"); + /* + * Prior to kernel commit: 23ddff3792f6 ("md: allow suspend_lo and + * suspend_hi to decrease as well as increase.") + * you could only increase suspend_{lo,hi} unless the region they + * covered was empty. So to reset to 0, you need to push suspend_lo + * up past suspend_hi first. So to maximize the chance of mdadm + * working on all kernels, we want to keep doing that. + */ + sysfs_set_num(sra, NULL, "suspend_lo", 0x7FFFFFFFFFFFFFFFULL); + sysfs_set_num(sra, NULL, "suspend_hi", 0); + sysfs_set_num(sra, NULL, "suspend_lo", 0); + sysfs_set_num(sra, NULL, "sync_min", 0); + // It isn't safe to reset sync_max as we aren't monitoring. + // Array really should be stopped at this point. +} + +int remove_disks_for_takeover(struct supertype *st, + struct mdinfo *sra, + int layout) +{ + int nr_of_copies; + struct mdinfo *remaining; + int slot; + + if (st->ss->external) { + int rv = 0; + struct mdinfo *arrays = st->ss->container_content(st, NULL); + /* + * containter_content returns list of arrays in container + * If arrays->next is not NULL it means that there are + * 2 arrays in container and operation should be blocked + */ + if (arrays) { + if (arrays->next) + rv = 1; + sysfs_free(arrays); + if (rv) { + pr_err("Error. Cannot perform operation on /dev/%s\n", st->devnm); + pr_err("For this operation it MUST be single array in container\n"); + return rv; + } + } + } + + if (sra->array.level == 10) + nr_of_copies = layout & 0xff; + else if (sra->array.level == 1) + nr_of_copies = sra->array.raid_disks; + else + return 1; + + remaining = sra->devs; + sra->devs = NULL; + /* for each 'copy', select one device and remove from the list. 
*/ + for (slot = 0; slot < sra->array.raid_disks; slot += nr_of_copies) { + struct mdinfo **diskp; + int found = 0; + + /* Find a working device to keep */ + for (diskp = &remaining; *diskp ; diskp = &(*diskp)->next) { + struct mdinfo *disk = *diskp; + + if (disk->disk.raid_disk < slot) + continue; + if (disk->disk.raid_disk >= slot + nr_of_copies) + continue; + if (disk->disk.state & (1<<MD_DISK_REMOVED)) + continue; + if (disk->disk.state & (1<<MD_DISK_FAULTY)) + continue; + if (!(disk->disk.state & (1<<MD_DISK_SYNC))) + continue; + + /* We have found a good disk to use! */ + *diskp = disk->next; + disk->next = sra->devs; + sra->devs = disk; + found = 1; + break; + } + if (!found) + break; + } + + if (slot < sra->array.raid_disks) { + /* didn't find all slots */ + struct mdinfo **e; + e = &remaining; + while (*e) + e = &(*e)->next; + *e = sra->devs; + sra->devs = remaining; + return 1; + } + + /* Remove all 'remaining' devices from the array */ + while (remaining) { + struct mdinfo *sd = remaining; + remaining = sd->next; + + sysfs_set_str(sra, sd, "state", "faulty"); + sysfs_set_str(sra, sd, "slot", "none"); + /* for external metadata disks should be removed in mdmon */ + if (!st->ss->external) + sysfs_set_str(sra, sd, "state", "remove"); + sd->disk.state |= (1<<MD_DISK_REMOVED); + sd->disk.state &= ~(1<<MD_DISK_SYNC); + sd->next = sra->devs; + sra->devs = sd; + } + return 0; +} + +void reshape_free_fdlist(int *fdlist, + unsigned long long *offsets, + int size) +{ + int i; + + for (i = 0; i < size; i++) + if (fdlist[i] >= 0) + close(fdlist[i]); + + free(fdlist); + free(offsets); +} + +int reshape_prepare_fdlist(char *devname, + struct mdinfo *sra, + int raid_disks, + int nrdisks, + unsigned long blocks, + char *backup_file, + int *fdlist, + unsigned long long *offsets) +{ + int d = 0; + struct mdinfo *sd; + + enable_fds(nrdisks); + for (d = 0; d <= nrdisks; d++) + fdlist[d] = -1; + d = raid_disks; + for (sd = sra->devs; sd; sd = sd->next) { + if (sd->disk.state 
& (1<<MD_DISK_FAULTY)) + continue; + if (sd->disk.state & (1<<MD_DISK_SYNC) && + sd->disk.raid_disk < raid_disks) { + char *dn = map_dev(sd->disk.major, sd->disk.minor, 1); + fdlist[sd->disk.raid_disk] = dev_open(dn, O_RDONLY); + offsets[sd->disk.raid_disk] = sd->data_offset*512; + if (fdlist[sd->disk.raid_disk] < 0) { + pr_err("%s: cannot open component %s\n", + devname, dn ? dn : "-unknown-"); + d = -1; + goto release; + } + } else if (backup_file == NULL) { + /* spare */ + char *dn = map_dev(sd->disk.major, sd->disk.minor, 1); + fdlist[d] = dev_open(dn, O_RDWR); + offsets[d] = (sd->data_offset + sra->component_size - blocks - 8)*512; + if (fdlist[d] < 0) { + pr_err("%s: cannot open component %s\n", + devname, dn ? dn : "-unknown-"); + d = -1; + goto release; + } + d++; + } + } +release: + return d; +} + +int reshape_open_backup_file(char *backup_file, + int fd, + char *devname, + long blocks, + int *fdlist, + unsigned long long *offsets, + char *sys_name, + int restart) +{ + /* Return 1 on success, 0 on any form of failure */ + /* need to check backup file is large enough */ + char buf[512]; + struct stat stb; + unsigned int dev; + int i; + + *fdlist = open(backup_file, O_RDWR|O_CREAT|(restart ? O_TRUNC : O_EXCL), + S_IRUSR | S_IWUSR); + *offsets = 8 * 512; + if (*fdlist < 0) { + pr_err("%s: cannot create backup file %s: %s\n", + devname, backup_file, strerror(errno)); + return 0; + } + /* Guard against backup file being on array device. + * If array is partitioned or if LVM etc is in the + * way this will not notice, but it is better than + * nothing. 
+ */ + fstat(*fdlist, &stb); + dev = stb.st_dev; + fstat(fd, &stb); + if (stb.st_rdev == dev) { + pr_err("backup file must NOT be on the array being reshaped.\n"); + close(*fdlist); + return 0; + } + + memset(buf, 0, 512); + for (i=0; i < blocks + 8 ; i++) { + if (write(*fdlist, buf, 512) != 512) { + pr_err("%s: cannot create backup file %s: %s\n", + devname, backup_file, strerror(errno)); + return 0; + } + } + if (fsync(*fdlist) != 0) { + pr_err("%s: cannot create backup file %s: %s\n", + devname, backup_file, strerror(errno)); + return 0; + } + + if (!restart && strncmp(backup_file, MAP_DIR, strlen(MAP_DIR)) != 0) { + char *bu = make_backup(sys_name); + if (symlink(backup_file, bu)) + pr_err("Recording backup file in " MAP_DIR " failed: %s\n", + strerror(errno)); + free(bu); + } + + return 1; +} + +unsigned long compute_backup_blocks(int nchunk, int ochunk, + unsigned int ndata, unsigned int odata) +{ + unsigned long a, b, blocks; + /* So how much do we need to backup. + * We need an amount of data which is both a whole number of + * old stripes and a whole number of new stripes. + * So LCM for (chunksize*datadisks). + */ + a = (ochunk/512) * odata; + b = (nchunk/512) * ndata; + /* Find GCD */ + a = GCD(a, b); + /* LCM == product / GCD */ + blocks = (unsigned long)(ochunk/512) * (unsigned long)(nchunk/512) * + odata * ndata / a; + + return blocks; +} + +char *analyse_change(char *devname, struct mdinfo *info, struct reshape *re) +{ + /* Based on the current array state in info->array and + * the changes in info->new_* etc, determine: + * - whether the change is possible + * - Intermediate level/raid_disks/layout + * - whether a restriping reshape is needed + * - number of sectors in minimum change unit. This + * will cover a whole number of stripes in 'before' and + * 'after'. 
+ * + * Return message if the change should be rejected + * NULL if the change can be achieved + * + * This can be called as part of starting a reshape, or + * when assembling an array that is undergoing reshape. + */ + int near, far, offset, copies; + int new_disks; + int old_chunk, new_chunk; + /* delta_parity records change in number of devices + * caused by level change + */ + int delta_parity = 0; + + memset(re, 0, sizeof(*re)); + + /* If a new level not explicitly given, we assume no-change */ + if (info->new_level == UnSet) + info->new_level = info->array.level; + + if (info->new_chunk) + switch (info->new_level) { + case 0: + case 4: + case 5: + case 6: + case 10: + /* chunk size is meaningful, must divide component_size + * evenly + */ + if (info->component_size % (info->new_chunk/512)) { + unsigned long long shrink = info->component_size; + shrink &= ~(unsigned long long)(info->new_chunk/512-1); + pr_err("New chunk size (%dK) does not evenly divide device size (%lluk)\n", + info->new_chunk/1024, info->component_size/2); + pr_err("After shrinking any filesystem, \"mdadm --grow %s --size %llu\"\n", + devname, shrink/2); + pr_err("will shrink the array so the given chunk size would work.\n"); + return ""; + } + break; + default: + return "chunk size not meaningful for this level"; + } + else + info->new_chunk = info->array.chunk_size; + + switch (info->array.level) { + default: + return "No reshape is possibly for this RAID level"; + case LEVEL_LINEAR: + if (info->delta_disks != UnSet) + return "Only --add is supported for LINEAR, setting --raid-disks is not needed"; + else + return "Only --add is supported for LINEAR, other --grow options are not meaningful"; + case 1: + /* RAID1 can convert to RAID1 with different disks, or + * raid5 with 2 disks, or + * raid0 with 1 disk + */ + if (info->new_level > 1 && (info->component_size & 7)) + return "Cannot convert RAID1 of this size - reduce size to multiple of 4K first."; + if (info->new_level == 0) { + if 
(info->delta_disks != UnSet && + info->delta_disks != 0) + return "Cannot change number of disks with RAID1->RAID0 conversion"; + re->level = 0; + re->before.data_disks = 1; + re->after.data_disks = 1; + return NULL; + } + if (info->new_level == 1) { + if (info->delta_disks == UnSet) + /* Don't know what to do */ + return "no change requested for Growing RAID1"; + re->level = 1; + return NULL; + } + if (info->array.raid_disks != 2 && info->new_level == 5) + return "Can only convert a 2-device array to RAID5"; + if (info->array.raid_disks == 2 && info->new_level == 5) { + re->level = 5; + re->before.data_disks = 1; + if (info->delta_disks != UnSet && + info->delta_disks != 0) + re->after.data_disks = 1 + info->delta_disks; + else + re->after.data_disks = 1; + if (re->after.data_disks < 1) + return "Number of disks too small for RAID5"; + + re->before.layout = ALGORITHM_LEFT_SYMMETRIC; + info->array.chunk_size = 65536; + break; + } + /* Could do some multi-stage conversions, but leave that to + * later. + */ + return "Impossibly level change request for RAID1"; + + case 10: + /* RAID10 can be converted from near mode to + * RAID0 by removing some devices. + * It can also be reshaped if the kernel supports + * new_data_offset. 
+ */ + switch (info->new_level) { + case 0: + if ((info->array.layout & ~0xff) != 0x100) + return "Cannot Grow RAID10 with far/offset layout"; + /* + * number of devices must be multiple of + * number of copies + */ + if (info->array.raid_disks % + (info->array.layout & 0xff)) + return "RAID10 layout too complex for Grow operation"; + + new_disks = (info->array.raid_disks / + (info->array.layout & 0xff)); + if (info->delta_disks == UnSet) + info->delta_disks = (new_disks + - info->array.raid_disks); + + if (info->delta_disks != + new_disks - info->array.raid_disks) + return "New number of raid-devices impossible for RAID10"; + if (info->new_chunk && + info->new_chunk != info->array.chunk_size) + return "Cannot change chunk-size with RAID10 Grow"; + + /* looks good */ + re->level = 0; + re->before.data_disks = new_disks; + re->after.data_disks = re->before.data_disks; + return NULL; + + case 10: + near = info->array.layout & 0xff; + far = (info->array.layout >> 8) & 0xff; + offset = info->array.layout & 0x10000; + if (far > 1 && !offset) + return "Cannot reshape RAID10 in far-mode"; + copies = near * far; + + old_chunk = info->array.chunk_size * far; + + if (info->new_layout == UnSet) + info->new_layout = info->array.layout; + else { + near = info->new_layout & 0xff; + far = (info->new_layout >> 8) & 0xff; + offset = info->new_layout & 0x10000; + if (far > 1 && !offset) + return "Cannot reshape RAID10 to far-mode"; + if (near * far != copies) + return "Cannot change number of copies when reshaping RAID10"; + } + if (info->delta_disks == UnSet) + info->delta_disks = 0; + new_disks = (info->array.raid_disks + + info->delta_disks); + + new_chunk = info->new_chunk * far; + + re->level = 10; + re->before.layout = info->array.layout; + re->before.data_disks = info->array.raid_disks; + re->after.layout = info->new_layout; + re->after.data_disks = new_disks; + /* For RAID10 we don't do backup but do allow reshape, + * so set backup_blocks to INVALID_SECTORS rather than + * 
zero. + * And there is no need to synchronise stripes on both + * 'old' and 'new'. So the important + * number is the minimum data_offset difference + * which is the larger of (offset copies * chunk). + */ + re->backup_blocks = INVALID_SECTORS; + re->min_offset_change = max(old_chunk, new_chunk) / 512; + if (new_disks < re->before.data_disks && + info->space_after < re->min_offset_change) + /* Reduce component size by one chunk */ + re->new_size = (info->component_size - + re->min_offset_change); + else + re->new_size = info->component_size; + re->new_size = re->new_size * new_disks / copies; + return NULL; + + default: + return "RAID10 can only be changed to RAID0"; + } + case 0: + /* RAID0 can be converted to RAID10, or to RAID456 */ + if (info->new_level == 10) { + if (info->new_layout == UnSet && + info->delta_disks == UnSet) { + /* Assume near=2 layout */ + info->new_layout = 0x102; + info->delta_disks = info->array.raid_disks; + } + if (info->new_layout == UnSet) { + int copies = 1 + (info->delta_disks + / info->array.raid_disks); + if (info->array.raid_disks * (copies-1) != + info->delta_disks) + return "Impossible number of devices for RAID0->RAID10"; + info->new_layout = 0x100 + copies; + } + if (info->delta_disks == UnSet) { + int copies = info->new_layout & 0xff; + if (info->new_layout != 0x100 + copies) + return "New layout impossible for RAID0->RAID10";; + info->delta_disks = (copies - 1) * + info->array.raid_disks; + } + if (info->new_chunk && + info->new_chunk != info->array.chunk_size) + return "Cannot change chunk-size with RAID0->RAID10"; + /* looks good */ + re->level = 10; + re->before.data_disks = (info->array.raid_disks + + info->delta_disks); + re->after.data_disks = re->before.data_disks; + re->before.layout = info->new_layout; + return NULL; + } + + /* RAID0 can also covert to RAID0/4/5/6 by first converting to + * a raid4 style layout of the final level. 
+ */ + switch (info->new_level) { + case 4: + delta_parity = 1; + case 0: + re->level = 4; + re->before.layout = 0; + break; + case 5: + delta_parity = 1; + re->level = 5; + re->before.layout = ALGORITHM_PARITY_N; + if (info->new_layout == UnSet) + info->new_layout = map_name(r5layout, "default"); + break; + case 6: + delta_parity = 2; + re->level = 6; + re->before.layout = ALGORITHM_PARITY_N; + if (info->new_layout == UnSet) + info->new_layout = map_name(r6layout, "default"); + break; + default: + return "Impossible level change requested"; + } + re->before.data_disks = info->array.raid_disks; + /* determining 'after' layout happens outside this 'switch' */ + break; + + case 4: + info->array.layout = ALGORITHM_PARITY_N; + case 5: + switch (info->new_level) { + case 0: + delta_parity = -1; + case 4: + re->level = info->array.level; + re->before.data_disks = info->array.raid_disks - 1; + re->before.layout = info->array.layout; + break; + case 5: + re->level = 5; + re->before.data_disks = info->array.raid_disks - 1; + re->before.layout = info->array.layout; + break; + case 6: + delta_parity = 1; + re->level = 6; + re->before.data_disks = info->array.raid_disks - 1; + switch (info->array.layout) { + case ALGORITHM_LEFT_ASYMMETRIC: + re->before.layout = ALGORITHM_LEFT_ASYMMETRIC_6; + break; + case ALGORITHM_RIGHT_ASYMMETRIC: + re->before.layout = ALGORITHM_RIGHT_ASYMMETRIC_6; + break; + case ALGORITHM_LEFT_SYMMETRIC: + re->before.layout = ALGORITHM_LEFT_SYMMETRIC_6; + break; + case ALGORITHM_RIGHT_SYMMETRIC: + re->before.layout = ALGORITHM_RIGHT_SYMMETRIC_6; + break; + case ALGORITHM_PARITY_0: + re->before.layout = ALGORITHM_PARITY_0_6; + break; + case ALGORITHM_PARITY_N: + re->before.layout = ALGORITHM_PARITY_N_6; + break; + default: + return "Cannot convert an array with this layout"; + } + break; + case 1: + if (info->array.raid_disks != 2) + return "Can only convert a 2-device array to RAID1"; + if (info->delta_disks != UnSet && + info->delta_disks != 0) + return 
"Cannot set raid_disk when converting RAID5->RAID1"; + re->level = 1; + info->new_chunk = 0; + return NULL; + default: + return "Impossible level change requested"; + } + break; + case 6: + switch (info->new_level) { + case 4: + case 5: + delta_parity = -1; + case 6: + re->level = 6; + re->before.data_disks = info->array.raid_disks - 2; + re->before.layout = info->array.layout; + break; + default: + return "Impossible level change requested"; + } + break; + } + + /* If we reached here then it looks like a re-stripe is + * happening. We have determined the intermediate level + * and initial raid_disks/layout and stored these in 're'. + * + * We need to deduce the final layout that can be atomically + * converted to the end state. + */ + switch (info->new_level) { + case 0: + /* We can only get to RAID0 from RAID4 or RAID5 + * with appropriate layout and one extra device + */ + if (re->level != 4 && re->level != 5) + return "Cannot covert to RAID0 from this level"; + + switch (re->level) { + case 4: + re->before.layout = 0; + re->after.layout = 0; + break; + case 5: + re->after.layout = ALGORITHM_PARITY_N; + break; + } + break; + + case 4: + /* We can only get to RAID4 from RAID5 */ + if (re->level != 4 && re->level != 5) + return "Cannot convert to RAID4 from this level"; + + switch (re->level) { + case 4: + re->after.layout = 0; + break; + case 5: + re->after.layout = ALGORITHM_PARITY_N; + break; + } + break; + + case 5: + /* We get to RAID5 from RAID5 or RAID6 */ + if (re->level != 5 && re->level != 6) + return "Cannot convert to RAID5 from this level"; + + switch (re->level) { + case 5: + if (info->new_layout == UnSet) + re->after.layout = re->before.layout; + else + re->after.layout = info->new_layout; + break; + case 6: + if (info->new_layout == UnSet) + info->new_layout = re->before.layout; + + /* after.layout needs to be raid6 version of new_layout */ + if (info->new_layout == ALGORITHM_PARITY_N) + re->after.layout = ALGORITHM_PARITY_N; + else { + char 
layout[40]; + char *ls = map_num(r5layout, info->new_layout); + int l; + if (ls) { + /* Current RAID6 layout has a RAID5 + * equivalent - good + */ + strcat(strcpy(layout, ls), "-6"); + l = map_name(r6layout, layout); + if (l == UnSet) + return "Cannot find RAID6 layout to convert to"; + } else { + /* Current RAID6 has no equivalent. + * If it is already a '-6' layout we + * can leave it unchanged, else we must + * fail + */ + ls = map_num(r6layout, + info->new_layout); + if (!ls || + strcmp(ls+strlen(ls)-2, "-6") != 0) + return "Please specify new layout"; + l = info->new_layout; + } + re->after.layout = l; + } + } + break; + + case 6: + /* We must already be at level 6 */ + if (re->level != 6) + return "Impossible level change"; + if (info->new_layout == UnSet) + re->after.layout = info->array.layout; + else + re->after.layout = info->new_layout; + break; + default: + return "Impossible level change requested"; + } + if (info->delta_disks == UnSet) + info->delta_disks = delta_parity; + + re->after.data_disks = + (re->before.data_disks + info->delta_disks - delta_parity); + + switch (re->level) { + case 6: + re->parity = 2; + break; + case 4: + case 5: + re->parity = 1; + break; + default: + re->parity = 0; + break; + } + /* So we have a restripe operation, we need to calculate the number + * of blocks per reshape operation. + */ + re->new_size = info->component_size * re->before.data_disks; + if (info->new_chunk == 0) + info->new_chunk = info->array.chunk_size; + if (re->after.data_disks == re->before.data_disks && + re->after.layout == re->before.layout && + info->new_chunk == info->array.chunk_size) { + /* Nothing to change, can change level immediately. 
*/ + re->level = info->new_level; + re->backup_blocks = 0; + return NULL; + } + if (re->after.data_disks == 1 && re->before.data_disks == 1) { + /* chunk and layout changes make no difference */ + re->level = info->new_level; + re->backup_blocks = 0; + return NULL; + } + + if (re->after.data_disks == re->before.data_disks && + get_linux_version() < 2006032) + return "in-place reshape is not safe before 2.6.32 - sorry."; + + if (re->after.data_disks < re->before.data_disks && + get_linux_version() < 2006030) + return "reshape to fewer devices is not supported before 2.6.30 - sorry."; + + re->backup_blocks = compute_backup_blocks( + info->new_chunk, info->array.chunk_size, + re->after.data_disks, re->before.data_disks); + re->min_offset_change = re->backup_blocks / re->before.data_disks; + + re->new_size = info->component_size * re->after.data_disks; + return NULL; +} + +static int set_array_size(struct supertype *st, struct mdinfo *sra, + char *text_version) +{ + struct mdinfo *info; + char *subarray; + int ret_val = -1; + + if ((st == NULL) || (sra == NULL)) + return ret_val; + + if (text_version == NULL) + text_version = sra->text_version; + subarray = strchr(text_version + 1, '/')+1; + info = st->ss->container_content(st, subarray); + if (info) { + unsigned long long current_size = 0; + unsigned long long new_size = info->custom_array_size/2; + + if (sysfs_get_ll(sra, NULL, "array_size", &current_size) == 0 && + new_size > current_size) { + if (sysfs_set_num(sra, NULL, "array_size", new_size) + < 0) + dprintf("Error: Cannot set array size"); + else { + ret_val = 0; + dprintf("Array size changed"); + } + dprintf_cont(" from %llu to %llu.\n", + current_size, new_size); + } + sysfs_free(info); + } else + dprintf("Error: set_array_size(): info pointer in NULL\n"); + + return ret_val; +} + +static int reshape_array(char *container, int fd, char *devname, + struct supertype *st, struct mdinfo *info, + int force, struct mddev_dev *devlist, + unsigned long long data_offset, + 
char *backup_file, int verbose, int forked, + int restart, int freeze_reshape); +static int reshape_container(char *container, char *devname, + int mdfd, + struct supertype *st, + struct mdinfo *info, + int force, + char *backup_file, int verbose, + int forked, int restart, int freeze_reshape); + +int Grow_reshape(char *devname, int fd, + struct mddev_dev *devlist, + unsigned long long data_offset, + struct context *c, struct shape *s) +{ + /* Make some changes in the shape of an array. + * The kernel must support the change. + * + * There are three different changes. Each can trigger + * a resync or recovery so we freeze that until we have + * requested everything (if kernel supports freezing - 2.6.30). + * The steps are: + * - change size (i.e. component_size) + * - change level + * - change layout/chunksize/ndisks + * + * The last can require a reshape. It is different on different + * levels so we need to check the level before actioning it. + * Some times the level change needs to be requested after the + * reshape (e.g. 
raid6->raid5, raid5->raid0) + * + */ + struct mdu_array_info_s array; + int rv = 0; + struct supertype *st; + char *subarray = NULL; + + int frozen; + int changed = 0; + char *container = NULL; + int cfd = -1; + + struct mddev_dev *dv; + int added_disks; + + struct mdinfo info; + struct mdinfo *sra; + + if (md_get_array_info(fd, &array) < 0) { + pr_err("%s is not an active md array - aborting\n", + devname); + return 1; + } + if (s->level != UnSet && s->chunk) { + pr_err("Cannot change array level in the same operation as changing chunk size.\n"); + return 1; + } + + if (data_offset != INVALID_SECTORS && array.level != 10 && + (array.level < 4 || array.level > 6)) { + pr_err("--grow --data-offset not yet supported\n"); + return 1; + } + + if (s->size > 0 && + (s->chunk || s->level!= UnSet || s->layout_str || s->raiddisks)) { + pr_err("cannot change component size at the same time as other changes.\n" + " Change size first, then check data is intact before making other changes.\n"); + return 1; + } + + if (s->raiddisks && s->raiddisks < array.raid_disks && + array.level > 1 && get_linux_version() < 2006032 && + !check_env("MDADM_FORCE_FEWER")) { + pr_err("reducing the number of devices is not safe before Linux 2.6.32\n" + " Please use a newer kernel\n"); + return 1; + } + + if (array.level > 1 && s->size > 1 && + (unsigned long long) (array.chunk_size / 1024) > s->size) { + pr_err("component size must be larger than chunk size.\n"); + return 1; + } + + st = super_by_fd(fd, &subarray); + if (!st) { + pr_err("Unable to determine metadata format for %s\n", devname); + return 1; + } + if (s->raiddisks > st->max_devs) { + pr_err("Cannot increase raid-disks on this array beyond %d\n", st->max_devs); + return 1; + } + if (s->level == 0 && (array.state & (1 << MD_SB_BITMAP_PRESENT)) && + !(array.state & (1 << MD_SB_CLUSTERED)) && !st->ss->external) { + array.state &= ~(1 << MD_SB_BITMAP_PRESENT); + if (md_set_array_info(fd, &array) != 0) { + pr_err("failed to remove 
internal bitmap.\n"); + return 1; + } + } + + /* in the external case we need to check that the requested reshape is + * supported, and perform an initial check that the container holds the + * pre-requisite spare devices (mdmon owns final validation) + */ + if (st->ss->external) { + int retval; + + if (subarray) { + container = st->container_devnm; + cfd = open_dev_excl(st->container_devnm); + } else { + container = st->devnm; + close(fd); + cfd = open_dev_excl(st->devnm); + fd = cfd; + } + if (cfd < 0) { + pr_err("Unable to open container for %s\n", devname); + free(subarray); + return 1; + } + + retval = st->ss->load_container(st, cfd, NULL); + + if (retval) { + pr_err("Cannot read superblock for %s\n", devname); + free(subarray); + return 1; + } + + /* check if operation is supported for metadata handler */ + if (st->ss->container_content) { + struct mdinfo *cc = NULL; + struct mdinfo *content = NULL; + + cc = st->ss->container_content(st, subarray); + for (content = cc; content ; content = content->next) { + int allow_reshape = 1; + + /* check if reshape is allowed based on metadata + * indications stored in content.array.status + */ + if (content->array.state & + (1 << MD_SB_BLOCK_VOLUME)) + allow_reshape = 0; + if (content->array.state & + (1 << MD_SB_BLOCK_CONTAINER_RESHAPE)) + allow_reshape = 0; + if (!allow_reshape) { + pr_err("cannot reshape arrays in container with unsupported metadata: %s(%s)\n", + devname, container); + sysfs_free(cc); + free(subarray); + return 1; + } + if (content->consistency_policy == + CONSISTENCY_POLICY_PPL) { + pr_err("Operation not supported when ppl consistency policy is enabled\n"); + sysfs_free(cc); + free(subarray); + return 1; + } + if (content->consistency_policy == + CONSISTENCY_POLICY_BITMAP) { + pr_err("Operation not supported when write-intent bitmap is enabled\n"); + sysfs_free(cc); + free(subarray); + return 1; + } + } + sysfs_free(cc); + } + if (mdmon_running(container)) + st->update_tail = &st->updates; + } + + 
added_disks = 0; + for (dv = devlist; dv; dv = dv->next) + added_disks++; + if (s->raiddisks > array.raid_disks && + array.spare_disks + added_disks < + (s->raiddisks - array.raid_disks) && + !c->force) { + pr_err("Need %d spare%s to avoid degraded array, and only have %d.\n" + " Use --force to over-ride this check.\n", + s->raiddisks - array.raid_disks, + s->raiddisks - array.raid_disks == 1 ? "" : "s", + array.spare_disks + added_disks); + return 1; + } + + sra = sysfs_read(fd, NULL, GET_LEVEL | GET_DISKS | GET_DEVS | + GET_STATE | GET_VERSION); + if (sra) { + if (st->ss->external && subarray == NULL) { + array.level = LEVEL_CONTAINER; + sra->array.level = LEVEL_CONTAINER; + } + } else { + pr_err("failed to read sysfs parameters for %s\n", + devname); + return 1; + } + frozen = freeze(st); + if (frozen < -1) { + /* freeze() already spewed the reason */ + sysfs_free(sra); + return 1; + } else if (frozen < 0) { + pr_err("%s is performing resync/recovery and cannot be reshaped\n", devname); + sysfs_free(sra); + return 1; + } + + /* ========= set size =============== */ + if (s->size > 0 && + (s->size == MAX_SIZE || s->size != (unsigned)array.size)) { + unsigned long long orig_size = get_component_size(fd)/2; + unsigned long long min_csize; + struct mdinfo *mdi; + int raid0_takeover = 0; + + if (orig_size == 0) + orig_size = (unsigned) array.size; + + if (orig_size == 0) { + pr_err("Cannot set device size in this type of array.\n"); + rv = 1; + goto release; + } + + if (reshape_super(st, s->size, UnSet, UnSet, 0, 0, UnSet, NULL, + devname, APPLY_METADATA_CHANGES, + c->verbose > 0)) { + rv = 1; + goto release; + } + sync_metadata(st); + if (st->ss->external) { + /* metadata can have size limitation + * update size value according to metadata information + */ + struct mdinfo *sizeinfo = + st->ss->container_content(st, subarray); + if (sizeinfo) { + unsigned long long new_size = + sizeinfo->custom_array_size/2; + int data_disks = get_data_disks( + sizeinfo->array.level, 
+ sizeinfo->array.layout, + sizeinfo->array.raid_disks); + new_size /= data_disks; + dprintf("Metadata size correction from %llu to %llu (%llu)\n", + orig_size, new_size, + new_size * data_disks); + s->size = new_size; + sysfs_free(sizeinfo); + } + } + + /* Update the size of each member device in case + * they have been resized. This will never reduce + * below the current used-size. The "size" attribute + * understands '0' to mean 'max'. + */ + min_csize = 0; + for (mdi = sra->devs; mdi; mdi = mdi->next) { + sysfs_set_num(sra, mdi, "size", + s->size == MAX_SIZE ? 0 : s->size); + if (array.not_persistent == 0 && + array.major_version == 0 && + get_linux_version() < 3001000) { + /* Dangerous to allow size to exceed 2TB */ + unsigned long long csize; + if (sysfs_get_ll(sra, mdi, "size", + &csize) == 0) { + if (csize >= 2ULL*1024*1024*1024) + csize = 2ULL*1024*1024*1024; + if ((min_csize == 0 || + (min_csize > csize))) + min_csize = csize; + } + } + } + if (min_csize && s->size > min_csize) { + pr_err("Cannot safely make this array use more than 2TB per device on this kernel.\n"); + rv = 1; + goto size_change_error; + } + if (min_csize && s->size == MAX_SIZE) { + /* Don't let the kernel choose a size - it will get + * it wrong + */ + pr_err("Limited v0.90 array to 2TB per device\n"); + s->size = min_csize; + } + if (st->ss->external) { + if (sra->array.level == 0) { + rv = sysfs_set_str(sra, NULL, "level", "raid5"); + if (!rv) { + raid0_takeover = 1; + /* get array parameters after takeover + * to change one parameter at time only + */ + rv = md_get_array_info(fd, &array); + } + } + /* make sure mdmon is + * aware of the new level */ + if (!mdmon_running(st->container_devnm)) + start_mdmon(st->container_devnm); + ping_monitor(container); + if (mdmon_running(st->container_devnm) && + st->update_tail == NULL) + st->update_tail = &st->updates; + } + + if (s->size == MAX_SIZE) + s->size = 0; + array.size = s->size; + if (s->size & ~INT32_MAX) { + /* got truncated to 
32bit, write to + * component_size instead + */ + if (sra) + rv = sysfs_set_num(sra, NULL, + "component_size", s->size); + else + rv = -1; + } else { + rv = md_set_array_info(fd, &array); + + /* manage array size when it is managed externally + */ + if ((rv == 0) && st->ss->external) + rv = set_array_size(st, sra, sra->text_version); + } + + if (raid0_takeover) { + /* do not recync non-existing parity, + * we will drop it anyway + */ + sysfs_set_str(sra, NULL, "sync_action", "frozen"); + /* go back to raid0, drop parity disk + */ + sysfs_set_str(sra, NULL, "level", "raid0"); + md_get_array_info(fd, &array); + } + +size_change_error: + if (rv != 0) { + int err = errno; + + /* restore metadata */ + if (reshape_super(st, orig_size, UnSet, UnSet, 0, 0, + UnSet, NULL, devname, + ROLLBACK_METADATA_CHANGES, + c->verbose) == 0) + sync_metadata(st); + pr_err("Cannot set device size for %s: %s\n", + devname, strerror(err)); + if (err == EBUSY && + (array.state & (1<<MD_SB_BITMAP_PRESENT))) + cont_err("Bitmap must be removed before size can be changed\n"); + rv = 1; + goto release; + } + if (s->assume_clean) { + /* This will fail on kernels older than 3.0 unless + * a backport has been arranged. 
+ */ + if (sra == NULL || + sysfs_set_str(sra, NULL, "resync_start", + "none") < 0) + pr_err("--assume-clean not supported with --grow on this kernel\n"); + } + md_get_array_info(fd, &array); + s->size = get_component_size(fd)/2; + if (s->size == 0) + s->size = array.size; + if (c->verbose >= 0) { + if (s->size == orig_size) + pr_err("component size of %s unchanged at %lluK\n", + devname, s->size); + else + pr_err("component size of %s has been set to %lluK\n", + devname, s->size); + } + changed = 1; + } else if (array.level != LEVEL_CONTAINER) { + s->size = get_component_size(fd)/2; + if (s->size == 0) + s->size = array.size; + } + + /* See if there is anything else to do */ + if ((s->level == UnSet || s->level == array.level) && + (s->layout_str == NULL) && + (s->chunk == 0 || s->chunk == array.chunk_size) && + data_offset == INVALID_SECTORS && + (s->raiddisks == 0 || s->raiddisks == array.raid_disks)) { + /* Nothing more to do */ + if (!changed && c->verbose >= 0) + pr_err("%s: no change requested\n", devname); + goto release; + } + + /* ========= check for Raid10/Raid1 -> Raid0 conversion =============== + * current implementation assumes that following conditions must be met: + * - RAID10: + * - far_copies == 1 + * - near_copies == 2 + */ + if ((s->level == 0 && array.level == 10 && sra && + array.layout == ((1 << 8) + 2) && !(array.raid_disks & 1)) || + (s->level == 0 && array.level == 1 && sra)) { + int err; + + err = remove_disks_for_takeover(st, sra, array.layout); + if (err) { + dprintf("Array cannot be reshaped\n"); + if (cfd > -1) + close(cfd); + rv = 1; + goto release; + } + /* Make sure mdmon has seen the device removal + * and updated metadata before we continue with + * level change + */ + if (container) + ping_monitor(container); + } + + memset(&info, 0, sizeof(info)); + info.array = array; + if (sysfs_init(&info, fd, NULL)) { + pr_err("failed to initialize sysfs.\n"); + rv = 1; + goto release; + } + strcpy(info.text_version, sra->text_version); + 
info.component_size = s->size*2; + info.new_level = s->level; + info.new_chunk = s->chunk * 1024; + if (info.array.level == LEVEL_CONTAINER) { + info.delta_disks = UnSet; + info.array.raid_disks = s->raiddisks; + } else if (s->raiddisks) + info.delta_disks = s->raiddisks - info.array.raid_disks; + else + info.delta_disks = UnSet; + if (s->layout_str == NULL) { + info.new_layout = UnSet; + if (info.array.level == 6 && + (info.new_level == 6 || info.new_level == UnSet) && + info.array.layout >= 16) { + pr_err("%s has a non-standard layout. If you wish to preserve this\n", devname); + cont_err("during the reshape, please specify --layout=preserve\n"); + cont_err("If you want to change it, specify a layout or use --layout=normalise\n"); + rv = 1; + goto release; + } + } else if (strcmp(s->layout_str, "normalise") == 0 || + strcmp(s->layout_str, "normalize") == 0) { + /* If we have a -6 RAID6 layout, remove the '-6'. */ + info.new_layout = UnSet; + if (info.array.level == 6 && info.new_level == UnSet) { + char l[40], *h; + strcpy(l, map_num(r6layout, info.array.layout)); + h = strrchr(l, '-'); + if (h && strcmp(h, "-6") == 0) { + *h = 0; + info.new_layout = map_name(r6layout, l); + } + } else { + pr_err("%s is only meaningful when reshaping a RAID6 array.\n", s->layout_str); + rv = 1; + goto release; + } + } else if (strcmp(s->layout_str, "preserve") == 0) { + /* This means that a non-standard RAID6 layout + * is OK. + * In particular: + * - When reshape a RAID6 (e.g. adding a device) + * which is in a non-standard layout, it is OK + * to preserve that layout. + * - When converting a RAID5 to RAID6, leave it in + * the XXX-6 layout, don't re-layout. 
+ */ + if (info.array.level == 6 && info.new_level == UnSet) + info.new_layout = info.array.layout; + else if (info.array.level == 5 && info.new_level == 6) { + char l[40]; + strcpy(l, map_num(r5layout, info.array.layout)); + strcat(l, "-6"); + info.new_layout = map_name(r6layout, l); + } else { + pr_err("%s in only meaningful when reshaping to RAID6\n", s->layout_str); + rv = 1; + goto release; + } + } else { + int l = info.new_level; + if (l == UnSet) + l = info.array.level; + switch (l) { + case 5: + info.new_layout = map_name(r5layout, s->layout_str); + break; + case 6: + info.new_layout = map_name(r6layout, s->layout_str); + break; + case 10: + info.new_layout = parse_layout_10(s->layout_str); + break; + case LEVEL_FAULTY: + info.new_layout = parse_layout_faulty(s->layout_str); + break; + default: + pr_err("layout not meaningful with this level\n"); + rv = 1; + goto release; + } + if (info.new_layout == UnSet) { + pr_err("layout %s not understood for this level\n", + s->layout_str); + rv = 1; + goto release; + } + } + + if (array.level == LEVEL_FAULTY) { + if (s->level != UnSet && s->level != array.level) { + pr_err("cannot change level of Faulty device\n"); + rv =1 ; + } + if (s->chunk) { + pr_err("cannot set chunksize of Faulty device\n"); + rv =1 ; + } + if (s->raiddisks && s->raiddisks != 1) { + pr_err("cannot set raid_disks of Faulty device\n"); + rv =1 ; + } + if (s->layout_str) { + if (md_get_array_info(fd, &array) != 0) { + dprintf("Cannot get array information.\n"); + goto release; + } + array.layout = info.new_layout; + if (md_set_array_info(fd, &array) != 0) { + pr_err("failed to set new layout\n"); + rv = 1; + } else if (c->verbose >= 0) + printf("layout for %s set to %d\n", + devname, array.layout); + } + } else if (array.level == LEVEL_CONTAINER) { + /* This change is to be applied to every array in the + * container. This is only needed when the metadata imposes + * restraints of the various arrays in the container. 
+ * Currently we only know that IMSM requires all arrays + * to have the same number of devices so changing the + * number of devices (On-Line Capacity Expansion) must be + * performed at the level of the container + */ + close_fd(&fd); + rv = reshape_container(container, devname, -1, st, &info, + c->force, c->backup_file, c->verbose, + 0, 0, 0); + frozen = 0; + } else { + /* get spare devices from external metadata + */ + if (st->ss->external) { + struct mdinfo *info2; + + info2 = st->ss->container_content(st, subarray); + if (info2) { + info.array.spare_disks = + info2->array.spare_disks; + sysfs_free(info2); + } + } + + /* Impose these changes on a single array. First + * check that the metadata is OK with the change. */ + + if (reshape_super(st, 0, info.new_level, + info.new_layout, info.new_chunk, + info.array.raid_disks, info.delta_disks, + c->backup_file, devname, + APPLY_METADATA_CHANGES, c->verbose)) { + rv = 1; + goto release; + } + sync_metadata(st); + rv = reshape_array(container, fd, devname, st, &info, c->force, + devlist, data_offset, c->backup_file, + c->verbose, 0, 0, 0); + frozen = 0; + } +release: + sysfs_free(sra); + if (frozen > 0) + unfreeze(st); + return rv; +} + +/* verify_reshape_position() + * Function checks if reshape position in metadata is not farther + * than position in md. 
+ * Return value: + * 0 : not valid sysfs entry + * it can be caused by not started reshape, it should be started + * by reshape array or raid0 array is before takeover + * -1 : error, reshape position is obviously wrong + * 1 : success, reshape progress correct or updated +*/ +static int verify_reshape_position(struct mdinfo *info, int level) +{ + int ret_val = 0; + char buf[40]; + int rv; + + /* read sync_max, failure can mean raid0 array */ + rv = sysfs_get_str(info, NULL, "sync_max", buf, 40); + + if (rv > 0) { + char *ep; + unsigned long long position = strtoull(buf, &ep, 0); + + dprintf("Read sync_max sysfs entry is: %s\n", buf); + if (!(ep == buf || (*ep != 0 && *ep != '\n' && *ep != ' '))) { + position *= get_data_disks(level, + info->new_layout, + info->array.raid_disks); + if (info->reshape_progress < position) { + dprintf("Corrected reshape progress (%llu) to md position (%llu)\n", + info->reshape_progress, position); + info->reshape_progress = position; + ret_val = 1; + } else if (info->reshape_progress > position) { + pr_err("Fatal error: array reshape was not properly frozen (expected reshape position is %llu, but reshape progress is %llu.\n", + position, info->reshape_progress); + ret_val = -1; + } else { + dprintf("Reshape position in md and metadata are the same;"); + ret_val = 1; + } + } + } else if (rv == 0) { + /* for valid sysfs entry, 0-length content + * should be indicated as error + */ + ret_val = -1; + } + + return ret_val; +} + +static unsigned long long choose_offset(unsigned long long lo, + unsigned long long hi, + unsigned long long min, + unsigned long long max) +{ + /* Choose a new offset between hi and lo. + * It must be between min and max, but + * we would prefer something near the middle of hi/lo, and also + * prefer to be aligned to a big power of 2. 
+ * + * So we start with the middle, then for each bit, + * starting at '1' and increasing, if it is set, we either + * add it or subtract it if possible, preferring the option + * which is furthest from the boundary. + * + * We stop once we get a 1MB alignment. As units are in sectors, + * 1MB = 2*1024 sectors. + */ + unsigned long long choice = (lo + hi) / 2; + unsigned long long bit = 1; + + for (bit = 1; bit < 2*1024; bit = bit << 1) { + unsigned long long bigger, smaller; + if (! (bit & choice)) + continue; + bigger = choice + bit; + smaller = choice - bit; + if (bigger > max && smaller < min) + break; + if (bigger > max) + choice = smaller; + else if (smaller < min) + choice = bigger; + else if (hi - bigger > smaller - lo) + choice = bigger; + else + choice = smaller; + } + return choice; +} + +static int set_new_data_offset(struct mdinfo *sra, struct supertype *st, + char *devname, int delta_disks, + unsigned long long data_offset, + unsigned long long min, + int can_fallback) +{ + struct mdinfo *sd; + int dir = 0; + int err = 0; + unsigned long long before, after; + + /* Need to find min space before and after so same is used + * on all devices + */ + before = UINT64_MAX; + after = UINT64_MAX; + for (sd = sra->devs; sd; sd = sd->next) { + char *dn; + int dfd; + int rv; + struct supertype *st2; + struct mdinfo info2; + + if (sd->disk.state & (1<<MD_DISK_FAULTY)) + continue; + dn = map_dev(sd->disk.major, sd->disk.minor, 0); + dfd = dev_open(dn, O_RDONLY); + if (dfd < 0) { + pr_err("%s: cannot open component %s\n", + devname, dn ? 
dn : "-unknown-"); + goto release; + } + st2 = dup_super(st); + rv = st2->ss->load_super(st2,dfd, NULL); + close(dfd); + if (rv) { + free(st2); + pr_err("%s: cannot get superblock from %s\n", + devname, dn); + goto release; + } + st2->ss->getinfo_super(st2, &info2, NULL); + st2->ss->free_super(st2); + free(st2); + if (info2.space_before == 0 && + info2.space_after == 0) { + /* Metadata doesn't support data_offset changes */ + if (!can_fallback) + pr_err("%s: Metadata version doesn't support data_offset changes\n", + devname); + goto fallback; + } + if (before > info2.space_before) + before = info2.space_before; + if (after > info2.space_after) + after = info2.space_after; + + if (data_offset != INVALID_SECTORS) { + if (dir == 0) { + if (info2.data_offset == data_offset) { + pr_err("%s: already has that data_offset\n", + dn); + goto release; + } + if (data_offset < info2.data_offset) + dir = -1; + else + dir = 1; + } else if ((data_offset <= info2.data_offset && + dir == 1) || + (data_offset >= info2.data_offset && + dir == -1)) { + pr_err("%s: differing data offsets on devices make this --data-offset setting impossible\n", + dn); + goto release; + } + } + } + if (before == UINT64_MAX) + /* impossible really, there must be no devices */ + return 1; + + for (sd = sra->devs; sd; sd = sd->next) { + char *dn = map_dev(sd->disk.major, sd->disk.minor, 0); + unsigned long long new_data_offset; + + if (sd->disk.state & (1<<MD_DISK_FAULTY)) + continue; + if (delta_disks < 0) { + /* Don't need any space as array is shrinking + * just move data_offset up by min + */ + if (data_offset == INVALID_SECTORS) + new_data_offset = sd->data_offset + min; + else { + if (data_offset < sd->data_offset + min) { + pr_err("--data-offset too small for %s\n", + dn); + goto release; + } + new_data_offset = data_offset; + } + } else if (delta_disks > 0) { + /* need space before */ + if (before < min) { + if (can_fallback) + goto fallback; + pr_err("Insufficient head-space for reshape on %s\n", + 
dn); + goto release; + } + if (data_offset == INVALID_SECTORS) + new_data_offset = sd->data_offset - min; + else { + if (data_offset > sd->data_offset - min) { + pr_err("--data-offset too large for %s\n", + dn); + goto release; + } + new_data_offset = data_offset; + } + } else { + if (dir == 0) { + /* can move up or down. If 'data_offset' + * was set we would have already decided, + * so just choose direction with most space. + */ + if (before > after) + dir = -1; + else + dir = 1; + } + sysfs_set_str(sra, NULL, "reshape_direction", + dir == 1 ? "backwards" : "forwards"); + if (dir > 0) { + /* Increase data offset */ + if (after < min) { + if (can_fallback) + goto fallback; + pr_err("Insufficient tail-space for reshape on %s\n", + dn); + goto release; + } + if (data_offset != INVALID_SECTORS && + data_offset < sd->data_offset + min) { + pr_err("--data-offset too small on %s\n", + dn); + goto release; + } + if (data_offset != INVALID_SECTORS) + new_data_offset = data_offset; + else + new_data_offset = choose_offset(sd->data_offset, + sd->data_offset + after, + sd->data_offset + min, + sd->data_offset + after); + } else { + /* Decrease data offset */ + if (before < min) { + if (can_fallback) + goto fallback; + pr_err("insufficient head-room on %s\n", + dn); + goto release; + } + if (data_offset != INVALID_SECTORS && + data_offset > sd->data_offset - min) { + pr_err("--data-offset too large on %s\n", + dn); + goto release; + } + if (data_offset != INVALID_SECTORS) + new_data_offset = data_offset; + else + new_data_offset = choose_offset(sd->data_offset - before, + sd->data_offset, + sd->data_offset - before, + sd->data_offset - min); + } + } + err = sysfs_set_num(sra, sd, "new_offset", new_data_offset); + if (err < 0 && errno == E2BIG) { + /* try again after increasing data size to max */ + err = sysfs_set_num(sra, sd, "size", 0); + if (err < 0 && errno == EINVAL && + !(sd->disk.state & (1<<MD_DISK_SYNC))) { + /* some kernels have a bug where you cannot + * use '0' on 
spare devices. */ + sysfs_set_num(sra, sd, "size", + (sra->component_size + after)/2); + } + err = sysfs_set_num(sra, sd, "new_offset", + new_data_offset); + } + if (err < 0) { + if (errno == E2BIG && data_offset != INVALID_SECTORS) { + pr_err("data-offset is too big for %s\n", dn); + goto release; + } + if (sd == sra->devs && + (errno == ENOENT || errno == E2BIG)) + /* Early kernel, no 'new_offset' file, + * or kernel doesn't like us. + * For RAID5/6 this is not fatal + */ + return 1; + pr_err("Cannot set new_offset for %s\n", dn); + break; + } + } + return err; +release: + return -1; +fallback: + /* Just use a backup file */ + return 1; +} + +static int raid10_reshape(char *container, int fd, char *devname, + struct supertype *st, struct mdinfo *info, + struct reshape *reshape, + unsigned long long data_offset, + int force, int verbose) +{ + /* Changing raid_disks, layout, chunksize or possibly + * just data_offset for a RAID10. + * We must always change data_offset. We change by at least + * ->min_offset_change which is the largest of the old and new + * chunk sizes. + * If raid_disks is increasing, then data_offset must decrease + * by at least this copy size. + * If raid_disks is unchanged, data_offset must increase or + * decrease by at least min_offset_change but preferably by much more. + * We choose half of the available space. + * If raid_disks is decreasing, data_offset must increase by + * at least min_offset_change. To allow of this, component_size + * must be decreased by the same amount. + * + * So we calculate the required minimum and direction, possibly + * reduce the component_size, then iterate through the devices + * and set the new_data_offset. 
+ * If that all works, we set chunk_size, layout, raid_disks, and start + * 'reshape' + */ + struct mdinfo *sra; + unsigned long long min; + int err = 0; + + sra = sysfs_read(fd, NULL, + GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE|GET_CHUNK + ); + if (!sra) { + pr_err("%s: Cannot get array details from sysfs\n", devname); + goto release; + } + min = reshape->min_offset_change; + + if (info->delta_disks) + sysfs_set_str(sra, NULL, "reshape_direction", + info->delta_disks < 0 ? "backwards" : "forwards"); + if (info->delta_disks < 0 && info->space_after < min) { + int rv = sysfs_set_num(sra, NULL, "component_size", + (sra->component_size - min)/2); + if (rv) { + pr_err("cannot reduce component size\n"); + goto release; + } + } + err = set_new_data_offset(sra, st, devname, info->delta_disks, + data_offset, min, 0); + if (err == 1) { + pr_err("Cannot set new_data_offset: RAID10 reshape not\n"); + cont_err("supported on this kernel\n"); + err = -1; + } + if (err < 0) + goto release; + + if (!err && sysfs_set_num(sra, NULL, "chunk_size", info->new_chunk) < 0) + err = errno; + if (!err && sysfs_set_num(sra, NULL, "layout", + reshape->after.layout) < 0) + err = errno; + if (!err && + sysfs_set_num(sra, NULL, "raid_disks", + info->array.raid_disks + info->delta_disks) < 0) + err = errno; + if (!err && sysfs_set_str(sra, NULL, "sync_action", "reshape") < 0) + err = errno; + if (err) { + pr_err("Cannot set array shape for %s\n", + devname); + if (err == EBUSY && + (info->array.state & (1<<MD_SB_BITMAP_PRESENT))) + cont_err(" Bitmap must be removed before shape can be changed\n"); + goto release; + } + sysfs_free(sra); + return 0; +release: + sysfs_free(sra); + return 1; +} + +static void get_space_after(int fd, struct supertype *st, struct mdinfo *info) +{ + struct mdinfo *sra, *sd; + /* Initialisation to silence compiler warning */ + unsigned long long min_space_before = 0, min_space_after = 0; + int first = 1; + + sra = sysfs_read(fd, NULL, GET_DEVS); + if (!sra) + return; + 
for (sd = sra->devs; sd; sd = sd->next) { + char *dn; + int dfd; + struct supertype *st2; + struct mdinfo info2; + + if (sd->disk.state & (1<<MD_DISK_FAULTY)) + continue; + dn = map_dev(sd->disk.major, sd->disk.minor, 0); + dfd = dev_open(dn, O_RDONLY); + if (dfd < 0) + break; + st2 = dup_super(st); + if (st2->ss->load_super(st2,dfd, NULL)) { + close(dfd); + free(st2); + break; + } + close(dfd); + st2->ss->getinfo_super(st2, &info2, NULL); + st2->ss->free_super(st2); + free(st2); + if (first || + min_space_before > info2.space_before) + min_space_before = info2.space_before; + if (first || + min_space_after > info2.space_after) + min_space_after = info2.space_after; + first = 0; + } + if (sd == NULL && !first) { + info->space_after = min_space_after; + info->space_before = min_space_before; + } + sysfs_free(sra); +} + +static void update_cache_size(char *container, struct mdinfo *sra, + struct mdinfo *info, + int disks, unsigned long long blocks) +{ + /* Check that the internal stripe cache is + * large enough, or it won't work. + * It must hold at least 4 stripes of the larger + * chunk size + */ + unsigned long cache; + cache = max(info->array.chunk_size, info->new_chunk); + cache *= 4; /* 4 stripes minimum */ + cache /= 512; /* convert to sectors */ + /* make sure there is room for 'blocks' with a bit to spare */ + if (cache < 16 + blocks / disks) + cache = 16 + blocks / disks; + cache /= (4096/512); /* Convert from sectors to pages */ + + if (sra->cache_size < cache) + subarray_set_num(container, sra, "stripe_cache_size", + cache+1); +} + +static int impose_reshape(struct mdinfo *sra, + struct mdinfo *info, + struct supertype *st, + int fd, + int restart, + char *devname, char *container, + struct reshape *reshape) +{ + struct mdu_array_info_s array; + + sra->new_chunk = info->new_chunk; + + if (restart) { + /* for external metadata checkpoint saved by mdmon can be lost + * or missed /due to e.g. crash/. 
Check if md is not during + * restart farther than metadata points to. + * If so, this means metadata information is obsolete. + */ + if (st->ss->external) + verify_reshape_position(info, reshape->level); + sra->reshape_progress = info->reshape_progress; + } else { + sra->reshape_progress = 0; + if (reshape->after.data_disks < reshape->before.data_disks) + /* start from the end of the new array */ + sra->reshape_progress = (sra->component_size + * reshape->after.data_disks); + } + + md_get_array_info(fd, &array); + if (info->array.chunk_size == info->new_chunk && + reshape->before.layout == reshape->after.layout && + st->ss->external == 0) { + /* use SET_ARRAY_INFO but only if reshape hasn't started */ + array.raid_disks = reshape->after.data_disks + reshape->parity; + if (!restart && md_set_array_info(fd, &array) != 0) { + int err = errno; + + pr_err("Cannot set device shape for %s: %s\n", + devname, strerror(errno)); + + if (err == EBUSY && + (array.state & (1<<MD_SB_BITMAP_PRESENT))) + cont_err("Bitmap must be removed before shape can be changed\n"); + + goto release; + } + } else if (!restart) { + /* set them all just in case some old 'new_*' value + * persists from some earlier problem. 
+ */ + int err = 0; + if (sysfs_set_num(sra, NULL, "chunk_size", info->new_chunk) < 0) + err = errno; + if (!err && sysfs_set_num(sra, NULL, "layout", + reshape->after.layout) < 0) + err = errno; + if (!err && subarray_set_num(container, sra, "raid_disks", + reshape->after.data_disks + + reshape->parity) < 0) + err = errno; + if (err) { + pr_err("Cannot set device shape for %s\n", devname); + + if (err == EBUSY && + (array.state & (1<<MD_SB_BITMAP_PRESENT))) + cont_err("Bitmap must be removed before shape can be changed\n"); + goto release; + } + } + return 0; +release: + return -1; +} + +static int impose_level(int fd, int level, char *devname, int verbose) +{ + char *c; + struct mdu_array_info_s array; + struct mdinfo info; + + if (sysfs_init(&info, fd, NULL)) { + pr_err("failed to initialize sysfs.\n"); + return 1; + } + + md_get_array_info(fd, &array); + if (level == 0 && (array.level >= 4 && array.level <= 6)) { + /* To convert to RAID0 we need to fail and + * remove any non-data devices. 
*/ + int found = 0; + int d; + int data_disks = array.raid_disks - 1; + if (array.level == 6) + data_disks -= 1; + if (array.level == 5 && array.layout != ALGORITHM_PARITY_N) + return -1; + if (array.level == 6 && array.layout != ALGORITHM_PARITY_N_6) + return -1; + sysfs_set_str(&info, NULL,"sync_action", "idle"); + /* First remove any spares so no recovery starts */ + for (d = 0, found = 0; + d < MAX_DISKS && found < array.nr_disks; d++) { + mdu_disk_info_t disk; + disk.number = d; + if (md_get_disk_info(fd, &disk) < 0) + continue; + if (disk.major == 0 && disk.minor == 0) + continue; + found++; + if ((disk.state & (1 << MD_DISK_ACTIVE)) && + disk.raid_disk < data_disks) + /* keep this */ + continue; + ioctl(fd, HOT_REMOVE_DISK, + makedev(disk.major, disk.minor)); + } + /* Now fail anything left */ + md_get_array_info(fd, &array); + for (d = 0, found = 0; + d < MAX_DISKS && found < array.nr_disks; d++) { + mdu_disk_info_t disk; + disk.number = d; + if (md_get_disk_info(fd, &disk) < 0) + continue; + if (disk.major == 0 && disk.minor == 0) + continue; + found++; + if ((disk.state & (1 << MD_DISK_ACTIVE)) && + disk.raid_disk < data_disks) + /* keep this */ + continue; + ioctl(fd, SET_DISK_FAULTY, + makedev(disk.major, disk.minor)); + hot_remove_disk(fd, makedev(disk.major, disk.minor), 1); + } + } + c = map_num(pers, level); + if (c) { + int err = sysfs_set_str(&info, NULL, "level", c); + if (err) { + err = errno; + pr_err("%s: could not set level to %s\n", + devname, c); + if (err == EBUSY && + (array.state & (1<<MD_SB_BITMAP_PRESENT))) + cont_err("Bitmap must be removed before level can be changed\n"); + return err; + } + if (verbose >= 0) + pr_err("level of %s changed to %s\n", devname, c); + } + return 0; +} + +int sigterm = 0; +static void catch_term(int sig) +{ + sigterm = 1; +} + +static int reshape_array(char *container, int fd, char *devname, + struct supertype *st, struct mdinfo *info, + int force, struct mddev_dev *devlist, + unsigned long long 
data_offset, + char *backup_file, int verbose, int forked, + int restart, int freeze_reshape) +{ + struct reshape reshape; + int spares_needed; + char *msg; + int orig_level = UnSet; + int odisks; + int delayed; + + struct mdu_array_info_s array; + char *c; + + struct mddev_dev *dv; + int added_disks; + + int *fdlist = NULL; + unsigned long long *offsets = NULL; + int d; + int nrdisks; + int err; + unsigned long blocks; + unsigned long long array_size; + int done; + struct mdinfo *sra = NULL; + char buf[20]; + + /* when reshaping a RAID0, the component_size might be zero. + * So try to fix that up. + */ + if (md_get_array_info(fd, &array) != 0) { + dprintf("Cannot get array information.\n"); + goto release; + } + if (array.level == 0 && info->component_size == 0) { + get_dev_size(fd, NULL, &array_size); + info->component_size = array_size / array.raid_disks; + } + + if (array.level == 10) + /* Need space_after info */ + get_space_after(fd, st, info); + + if (info->reshape_active) { + int new_level = info->new_level; + info->new_level = UnSet; + if (info->delta_disks > 0) + info->array.raid_disks -= info->delta_disks; + msg = analyse_change(devname, info, &reshape); + info->new_level = new_level; + if (info->delta_disks > 0) + info->array.raid_disks += info->delta_disks; + if (!restart) + /* Make sure the array isn't read-only */ + ioctl(fd, RESTART_ARRAY_RW, 0); + } else + msg = analyse_change(devname, info, &reshape); + if (msg) { + /* if msg == "", error has already been printed */ + if (msg[0]) + pr_err("%s\n", msg); + goto release; + } + if (restart && (reshape.level != info->array.level || + reshape.before.layout != info->array.layout || + reshape.before.data_disks + reshape.parity != + info->array.raid_disks - max(0, info->delta_disks))) { + pr_err("reshape info is not in native format - cannot continue.\n"); + goto release; + } + + if (st->ss->external && restart && (info->reshape_progress == 0) && + !((sysfs_get_str(info, NULL, "sync_action", + buf, 
sizeof(buf)) > 0) && + (strncmp(buf, "reshape", 7) == 0))) { + /* When reshape is restarted from '0', very begin of array + * it is possible that for external metadata reshape and array + * configuration doesn't happen. + * Check if md has the same opinion, and reshape is restarted + * from 0. If so, this is regular reshape start after reshape + * switch in metadata to next array only. + */ + if ((verify_reshape_position(info, reshape.level) >= 0) && + (info->reshape_progress == 0)) + restart = 0; + } + if (restart) { + /* + * reshape already started. just skip to monitoring + * the reshape + */ + if (reshape.backup_blocks == 0) + return 0; + if (restart & RESHAPE_NO_BACKUP) + return 0; + + /* Need 'sra' down at 'started:' */ + sra = sysfs_read(fd, NULL, + GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE| + GET_CHUNK|GET_CACHE); + if (!sra) { + pr_err("%s: Cannot get array details from sysfs\n", + devname); + goto release; + } + + if (!backup_file) + backup_file = locate_backup(sra->sys_name); + + goto started; + } + /* The container is frozen but the array may not be. + * So freeze the array so spares don't get put to the wrong use + * FIXME there should probably be a cleaner separation between + * freeze_array and freeze_container. + */ + sysfs_freeze_array(info); + /* Check we have enough spares to not be degraded */ + added_disks = 0; + for (dv = devlist; dv ; dv=dv->next) + added_disks++; + spares_needed = max(reshape.before.data_disks, + reshape.after.data_disks) + + reshape.parity - array.raid_disks; + + if (!force && info->new_level > 1 && info->array.level > 1 && + spares_needed > info->array.spare_disks + added_disks) { + pr_err("Need %d spare%s to avoid degraded array, and only have %d.\n" + " Use --force to over-ride this check.\n", + spares_needed, + spares_needed == 1 ? 
"" : "s", + info->array.spare_disks + added_disks); + goto release; + } + /* Check we have enough spares to not fail */ + spares_needed = max(reshape.before.data_disks, + reshape.after.data_disks) + - array.raid_disks; + if ((info->new_level > 1 || info->new_level == 0) && + spares_needed > info->array.spare_disks +added_disks) { + pr_err("Need %d spare%s to create working array, and only have %d.\n", + spares_needed, spares_needed == 1 ? "" : "s", + info->array.spare_disks + added_disks); + goto release; + } + + if (reshape.level != array.level) { + int err = impose_level(fd, reshape.level, devname, verbose); + if (err) + goto release; + info->new_layout = UnSet; /* after level change, + * layout is meaningless */ + orig_level = array.level; + sysfs_freeze_array(info); + + if (reshape.level > 0 && st->ss->external) { + /* make sure mdmon is aware of the new level */ + if (mdmon_running(container)) + flush_mdmon(container); + + if (!mdmon_running(container)) + start_mdmon(container); + ping_monitor(container); + if (mdmon_running(container) && st->update_tail == NULL) + st->update_tail = &st->updates; + } + } + /* ->reshape_super might have chosen some spares from the + * container that it wants to be part of the new array. + * We can collect them with ->container_content and give + * them to the kernel. + */ + if (st->ss->reshape_super && st->ss->container_content) { + char *subarray = strchr(info->text_version+1, '/')+1; + struct mdinfo *info2 = + st->ss->container_content(st, subarray); + struct mdinfo *d; + + if (info2) { + if (sysfs_init(info2, fd, st->devnm)) { + pr_err("unable to initialize sysfs for %s\n", + st->devnm); + free(info2); + goto release; + } + /* When increasing number of devices, we need to set + * new raid_disks before adding these, or they might + * be rejected. 
+ */ + if (reshape.backup_blocks && + reshape.after.data_disks > + reshape.before.data_disks) + subarray_set_num(container, info2, "raid_disks", + reshape.after.data_disks + + reshape.parity); + for (d = info2->devs; d; d = d->next) { + if (d->disk.state == 0 && + d->disk.raid_disk >= 0) { + /* This is a spare that wants to + * be part of the array. + */ + add_disk(fd, st, info2, d); + } + } + sysfs_free(info2); + } + } + /* We might have been given some devices to add to the + * array. Now that the array has been changed to the right + * level and frozen, we can safely add them. + */ + if (devlist) { + if (Manage_subdevs(devname, fd, devlist, verbose, 0, NULL, 0)) + goto release; + } + + if (reshape.backup_blocks == 0 && data_offset != INVALID_SECTORS) + reshape.backup_blocks = reshape.before.data_disks * info->array.chunk_size/512; + if (reshape.backup_blocks == 0) { + /* No restriping needed, but we might need to impose + * some more changes: layout, raid_disks, chunk_size + */ + /* read current array info */ + if (md_get_array_info(fd, &array) != 0) { + dprintf("Cannot get array information.\n"); + goto release; + } + /* compare current array info with new values and if + * it is different update them to new */ + if (info->new_layout != UnSet && + info->new_layout != array.layout) { + array.layout = info->new_layout; + if (md_set_array_info(fd, &array) != 0) { + pr_err("failed to set new layout\n"); + goto release; + } else if (verbose >= 0) + printf("layout for %s set to %d\n", + devname, array.layout); + } + if (info->delta_disks != UnSet && info->delta_disks != 0 && + array.raid_disks != + (info->array.raid_disks + info->delta_disks)) { + array.raid_disks += info->delta_disks; + if (md_set_array_info(fd, &array) != 0) { + pr_err("failed to set raid disks\n"); + goto release; + } else if (verbose >= 0) { + printf("raid_disks for %s set to %d\n", + devname, array.raid_disks); + } + } + if (info->new_chunk != 0 && + info->new_chunk != array.chunk_size) { + if 
(sysfs_set_num(info, NULL, + "chunk_size", info->new_chunk) != 0) { + pr_err("failed to set chunk size\n"); + goto release; + } else if (verbose >= 0) + printf("chunk size for %s set to %d\n", + devname, info->new_chunk); + } + unfreeze(st); + return 0; + } + + /* + * There are three possibilities. + * 1/ The array will shrink. + * We need to ensure the reshape will pause before reaching + * the 'critical section'. We also need to fork and wait for + * that to happen. When it does we + * suspend/backup/complete/unfreeze + * + * 2/ The array will not change size. + * This requires that we keep a backup of a sliding window + * so that we can restore data after a crash. So we need + * to fork and monitor progress. + * In future we will allow the data_offset to change, so + * a sliding backup becomes unnecessary. + * + * 3/ The array will grow. This is relatively easy. + * However the kernel's restripe routines will cheerfully + * overwrite some early data before it is safe. So we + * need to make a backup of the early parts of the array + * and be ready to restore it if rebuild aborts very early. + * For externally managed metadata, we still need a forked + * child to monitor the reshape and suspend IO over the region + * that is being reshaped. + * + * We backup data by writing it to one spare, or to a + * file which was given on command line. + * + * In each case, we first make sure that storage is available + * for the required backup. + * Then we: + * - request the shape change. + * - fork to handle backup etc. + */ + /* Check that we can hold all the data */ + get_dev_size(fd, NULL, &array_size); + if (reshape.new_size < (array_size/512)) { + pr_err("this change will reduce the size of the array.\n" + " use --grow --array-size first to truncate array.\n" + " e.g. mdadm --grow %s --array-size %llu\n", + devname, reshape.new_size/2); + goto release; + } + + if (array.level == 10) { + /* Reshaping RAID10 does not require any data backup by + * user-space. 
Instead it requires that the data_offset + * is changed to avoid the need for backup. + * So this is handled very separately + */ + if (restart) + /* Nothing to do. */ + return 0; + return raid10_reshape(container, fd, devname, st, info, + &reshape, data_offset, force, verbose); + } + sra = sysfs_read(fd, NULL, + GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE|GET_CHUNK| + GET_CACHE); + if (!sra) { + pr_err("%s: Cannot get array details from sysfs\n", + devname); + goto release; + } + + if (!backup_file) + switch(set_new_data_offset(sra, st, devname, + reshape.after.data_disks - reshape.before.data_disks, + data_offset, + reshape.min_offset_change, 1)) { + case -1: + goto release; + case 0: + /* Updated data_offset, so it's easy now */ + update_cache_size(container, sra, info, + min(reshape.before.data_disks, + reshape.after.data_disks), + reshape.backup_blocks); + + /* Right, everything seems fine. Let's kick things off. + */ + sync_metadata(st); + + if (impose_reshape(sra, info, st, fd, restart, + devname, container, &reshape) < 0) + goto release; + if (sysfs_set_str(sra, NULL, "sync_action", "reshape") < 0) { + struct mdinfo *sd; + if (errno != EINVAL) { + pr_err("Failed to initiate reshape!\n"); + goto release; + } + /* revert data_offset and try the old way */ + for (sd = sra->devs; sd; sd = sd->next) { + sysfs_set_num(sra, sd, "new_offset", + sd->data_offset); + sysfs_set_str(sra, NULL, "reshape_direction", + "forwards"); + } + break; + } + if (info->new_level == reshape.level) + return 0; + /* need to adjust level when reshape completes */ + switch(fork()) { + case -1: /* ignore error, but don't wait */ + return 0; + default: /* parent */ + return 0; + case 0: + manage_fork_fds(0); + map_fork(); + break; + } + close(fd); + wait_reshape(sra); + fd = open_dev(sra->sys_name); + if (fd >= 0) + impose_level(fd, info->new_level, devname, verbose); + return 0; + case 1: /* Couldn't set data_offset, try the old way */ + if (data_offset != INVALID_SECTORS) { + 
pr_err("Cannot update data_offset on this array\n"); + goto release; + } + break; + } + +started: + /* Decide how many blocks (sectors) for a reshape + * unit. The number we have so far is just a minimum + */ + blocks = reshape.backup_blocks; + if (reshape.before.data_disks == + reshape.after.data_disks) { + /* Make 'blocks' bigger for better throughput, but + * not so big that we reject it below. + * Try for 16 megabytes + */ + while (blocks * 32 < sra->component_size && blocks < 16*1024*2) + blocks *= 2; + } else + pr_err("Need to backup %luK of critical section..\n", blocks/2); + + if (blocks >= sra->component_size/2) { + pr_err("%s: Something wrong - reshape aborted\n", devname); + goto release; + } + + /* Now we need to open all these devices so we can read/write. + */ + nrdisks = max(reshape.before.data_disks, + reshape.after.data_disks) + reshape.parity + + sra->array.spare_disks; + fdlist = xcalloc((1+nrdisks), sizeof(int)); + offsets = xcalloc((1+nrdisks), sizeof(offsets[0])); + + odisks = reshape.before.data_disks + reshape.parity; + d = reshape_prepare_fdlist(devname, sra, odisks, nrdisks, blocks, + backup_file, fdlist, offsets); + if (d < odisks) { + goto release; + } + if ((st->ss->manage_reshape == NULL) || + (st->ss->recover_backup == NULL)) { + if (backup_file == NULL) { + if (reshape.after.data_disks <= + reshape.before.data_disks) { + pr_err("%s: Cannot grow - need backup-file\n", + devname); + pr_err(" Please provide one with \"--backup=...\"\n"); + goto release; + } else if (d == odisks) { + pr_err("%s: Cannot grow - need a spare or backup-file to backup critical section\n", devname); + goto release; + } + } else { + if (!reshape_open_backup_file(backup_file, fd, devname, + (signed)blocks, + fdlist+d, offsets+d, + sra->sys_name, restart)) { + goto release; + } + d++; + } + } + + update_cache_size(container, sra, info, + min(reshape.before.data_disks, + reshape.after.data_disks), blocks); + + /* Right, everything seems fine. 
Let's kick things off. + * If only changing raid_disks, use ioctl, else use + * sysfs. + */ + sync_metadata(st); + + if (impose_reshape(sra, info, st, fd, restart, + devname, container, &reshape) < 0) + goto release; + + err = start_reshape(sra, restart, reshape.before.data_disks, + reshape.after.data_disks, st); + if (err) { + pr_err("Cannot %s reshape for %s\n", + restart ? "continue" : "start", devname); + goto release; + } + if (restart) + sysfs_set_str(sra, NULL, "array_state", "active"); + if (freeze_reshape) { + free(fdlist); + free(offsets); + sysfs_free(sra); + pr_err("Reshape has to be continued from location %llu when root filesystem has been mounted.\n", + sra->reshape_progress); + return 1; + } + + if (!forked) + if (continue_via_systemd(container ?: sra->sys_name, + GROW_SERVICE)) { + free(fdlist); + free(offsets); + sysfs_free(sra); + return 0; + } + + close(fd); + /* Now we just need to kick off the reshape and watch, while + * handling backups of the data... + * This is all done by a forked background process. + */ + switch(forked ? 0 : fork()) { + case -1: + pr_err("Cannot run child to monitor reshape: %s\n", + strerror(errno)); + abort_reshape(sra); + goto release; + default: + free(fdlist); + free(offsets); + sysfs_free(sra); + return 0; + case 0: + map_fork(); + break; + } + + /* If another array on the same devices is busy, the + * reshape will wait for them. This would mean that + * the first section that we suspend will stay suspended + * for a long time. 
So check on that possibility + * by looking for "DELAYED" in /proc/mdstat, and if found, + * wait a while + */ + do { + struct mdstat_ent *mds, *m; + delayed = 0; + mds = mdstat_read(1, 0); + for (m = mds; m; m = m->next) + if (strcmp(m->devnm, sra->sys_name) == 0) { + if (m->resync && m->percent == RESYNC_DELAYED) + delayed = 1; + if (m->resync == 0) + /* Haven't started the reshape thread + * yet, wait a bit + */ + delayed = 2; + break; + } + free_mdstat(mds); + if (delayed == 1 && get_linux_version() < 3007000) { + pr_err("Reshape is delayed, but cannot wait carefully with this kernel.\n" + " You might experience problems until other reshapes complete.\n"); + delayed = 0; + } + if (delayed) + mdstat_wait(30 - (delayed-1) * 25); + } while (delayed); + mdstat_close(); + if (check_env("MDADM_GROW_VERIFY")) + fd = open(devname, O_RDONLY | O_DIRECT); + else + fd = -1; + mlockall(MCL_FUTURE); + + signal(SIGTERM, catch_term); + + if (st->ss->external) { + /* metadata handler takes it from here */ + done = st->ss->manage_reshape( + fd, sra, &reshape, st, blocks, + fdlist, offsets, d - odisks, fdlist + odisks, + offsets + odisks); + } else + done = child_monitor( + fd, sra, &reshape, st, blocks, fdlist, offsets, + d - odisks, fdlist + odisks, offsets + odisks); + + free(fdlist); + free(offsets); + + if (backup_file && done) { + char *bul; + bul = make_backup(sra->sys_name); + if (bul) { + char buf[1024]; + int l = readlink(bul, buf, sizeof(buf) - 1); + if (l > 0) { + buf[l]=0; + unlink(buf); + } + unlink(bul); + free(bul); + } + unlink(backup_file); + } + if (!done) { + abort_reshape(sra); + goto out; + } + + if (!st->ss->external && + !(reshape.before.data_disks != reshape.after.data_disks && + info->custom_array_size) && info->new_level == reshape.level && + !forked) { + /* no need to wait for the reshape to finish as + * there is nothing more to do. 
+ */ + sysfs_free(sra); + exit(0); + } + wait_reshape(sra); + + if (st->ss->external) { + /* Re-load the metadata as much could have changed */ + int cfd = open_dev(st->container_devnm); + if (cfd >= 0) { + flush_mdmon(container); + st->ss->free_super(st); + st->ss->load_container(st, cfd, container); + close(cfd); + } + } + + /* set new array size if required customer_array_size is used + * by this metadata. + */ + if (reshape.before.data_disks != reshape.after.data_disks && + info->custom_array_size) + set_array_size(st, info, info->text_version); + + if (info->new_level != reshape.level) { + if (fd < 0) + fd = open(devname, O_RDONLY); + impose_level(fd, info->new_level, devname, verbose); + close(fd); + if (info->new_level == 0) + st->update_tail = NULL; + } +out: + sysfs_free(sra); + if (forked) + return 0; + unfreeze(st); + exit(0); + +release: + free(fdlist); + free(offsets); + if (orig_level != UnSet && sra) { + c = map_num(pers, orig_level); + if (c && sysfs_set_str(sra, NULL, "level", c) == 0) + pr_err("aborting level change\n"); + } + sysfs_free(sra); + if (!forked) + unfreeze(st); + return 1; +} + +/* mdfd handle is passed to be closed in child process (after fork). 
+ */ +int reshape_container(char *container, char *devname, + int mdfd, + struct supertype *st, + struct mdinfo *info, + int force, + char *backup_file, int verbose, + int forked, int restart, int freeze_reshape) +{ + struct mdinfo *cc = NULL; + int rv = restart; + char last_devnm[32] = ""; + + /* component_size is not meaningful for a container, + * so pass '0' meaning 'no change' + */ + if (!restart && + reshape_super(st, 0, info->new_level, + info->new_layout, info->new_chunk, + info->array.raid_disks, info->delta_disks, + backup_file, devname, APPLY_METADATA_CHANGES, + verbose)) { + unfreeze(st); + return 1; + } + + sync_metadata(st); + + /* ping monitor to be sure that update is on disk + */ + ping_monitor(container); + + if (!forked && !freeze_reshape) + if (continue_via_systemd(container, GROW_SERVICE)) + return 0; + + switch (forked ? 0 : fork()) { + case -1: /* error */ + perror("Cannot fork to complete reshape\n"); + unfreeze(st); + return 1; + default: /* parent */ + if (!freeze_reshape) + printf("%s: multi-array reshape continues in background\n", Name); + return 0; + case 0: /* child */ + manage_fork_fds(0); + map_fork(); + break; + } + + /* close unused handle in child process + */ + if (mdfd > -1) + close(mdfd); + + while(1) { + /* For each member array with reshape_active, + * we need to perform the reshape. + * We pick the first array that needs reshaping and + * reshape it. reshape_array() will re-read the metadata + * so the next time through a different array should be + * ready for reshape. + * It is possible that the 'different' array will not + * be assembled yet. In that case we simple exit. + * When it is assembled, the mdadm which assembles it + * will take over the reshape. 
+ */ + struct mdinfo *content; + int fd; + struct mdstat_ent *mdstat; + char *adev; + dev_t devid; + + sysfs_free(cc); + + cc = st->ss->container_content(st, NULL); + + for (content = cc; content ; content = content->next) { + char *subarray; + if (!content->reshape_active) + continue; + + subarray = strchr(content->text_version+1, '/')+1; + mdstat = mdstat_by_subdev(subarray, container); + if (!mdstat) + continue; + if (mdstat->active == 0) { + pr_err("Skipping inactive array %s.\n", + mdstat->devnm); + free_mdstat(mdstat); + mdstat = NULL; + continue; + } + break; + } + if (!content) + break; + + devid = devnm2devid(mdstat->devnm); + adev = map_dev(major(devid), minor(devid), 0); + if (!adev) + adev = content->text_version; + + fd = open_dev(mdstat->devnm); + if (fd < 0) { + pr_err("Device %s cannot be opened for reshape.\n", + adev); + break; + } + + if (strcmp(last_devnm, mdstat->devnm) == 0) { + /* Do not allow for multiple reshape_array() calls for + * the same array. + * It can happen when reshape_array() returns without + * error, when reshape is not finished (wrong reshape + * starting/continuation conditions). Mdmon doesn't + * switch to next array in container and reentry + * conditions for the same array occur. + * This is possibly interim until the behaviour of + * reshape_array is resolved(). 
+ */ + printf("%s: Multiple reshape execution detected for device %s.\n", Name, adev); + close(fd); + break; + } + strcpy(last_devnm, mdstat->devnm); + + if (sysfs_init(content, fd, mdstat->devnm)) { + pr_err("Unable to initialize sysfs for %s\n", + mdstat->devnm); + rv = 1; + break; + } + + if (mdmon_running(container)) + flush_mdmon(container); + + rv = reshape_array(container, fd, adev, st, + content, force, NULL, INVALID_SECTORS, + backup_file, verbose, 1, restart, + freeze_reshape); + close(fd); + + if (freeze_reshape) { + sysfs_free(cc); + exit(0); + } + + restart = 0; + if (rv) + break; + + if (mdmon_running(container)) + flush_mdmon(container); + } + if (!rv) + unfreeze(st); + sysfs_free(cc); + exit(0); +} + +/* + * We run a child process in the background which performs the following + * steps: + * - wait for resync to reach a certain point + * - suspend io to the following section + * - backup that section + * - allow resync to proceed further + * - resume io + * - discard the backup. + * + * When are combined in slightly different ways in the three cases. + * Grow: + * - suspend/backup/allow/wait/resume/discard + * Shrink: + * - allow/wait/suspend/backup/allow/wait/resume/discard + * same-size: + * - wait/resume/discard/suspend/backup/allow + * + * suspend/backup/allow always come together + * wait/resume/discard do too. + * For the same-size case we have two backups to improve flow. + * + */ + +int progress_reshape(struct mdinfo *info, struct reshape *reshape, + unsigned long long backup_point, + unsigned long long wait_point, + unsigned long long *suspend_point, + unsigned long long *reshape_completed, int *frozen) +{ + /* This function is called repeatedly by the reshape manager. + * It determines how much progress can safely be made and allows + * that progress. 
+ * - 'info' identifies the array and particularly records in + * ->reshape_progress the metadata's knowledge of progress + * This is a sector offset from the start of the array + * of the next array block to be relocated. This number + * may increase from 0 or decrease from array_size, depending + * on the type of reshape that is happening. + * Note that in contrast, 'sync_completed' is a block count of the + * reshape so far. It gives the distance between the start point + * (head or tail of device) and the next place that data will be + * written. It always increases. + * - 'reshape' is the structure created by analyse_change + * - 'backup_point' shows how much the metadata manager has backed-up + * data. For reshapes with increasing progress, it is the next address + * to be backed up, previous addresses have been backed-up. For + * decreasing progress, it is the earliest address that has been + * backed up - later address are also backed up. + * So addresses between reshape_progress and backup_point are + * backed up providing those are in the 'correct' order. + * - 'wait_point' is an array address. When reshape_completed + * passes this point, progress_reshape should return. It might + * return earlier if it determines that ->reshape_progress needs + * to be updated or further backup is needed. + * - suspend_point is maintained by progress_reshape and the caller + * should not touch it except to initialise to zero. + * It is an array address and it only increases in 2.6.37 and earlier. + * This makes it difficult to handle reducing reshapes with + * external metadata. + * However: it is similar to backup_point in that it records the + * other end of a suspended region from reshape_progress. + * it is moved to extend the region that is safe to backup and/or + * reshape + * - reshape_completed is read from sysfs and returned. 
The caller + * should copy this into ->reshape_progress when it has reason to + * believe that the metadata knows this, and any backup outside this + * has been erased. + * + * Return value is: + * 1 if more data from backup_point - but only as far as suspend_point, + * should be backed up + * 0 if things are progressing smoothly + * -1 if the reshape is finished because it is all done, + * -2 if the reshape is finished due to an error. + */ + + int advancing = (reshape->after.data_disks + >= reshape->before.data_disks); + unsigned long long need_backup; /* All data between start of array and + * here will at some point need to + * be backed up. + */ + unsigned long long read_offset, write_offset; + unsigned long long write_range; + unsigned long long max_progress, target, completed; + unsigned long long array_size = (info->component_size + * reshape->before.data_disks); + int fd; + char buf[20]; + + /* First, we unsuspend any region that is now known to be safe. + * If suspend_point is on the 'wrong' side of reshape_progress, then + * we don't have or need suspension at the moment. This is true for + * native metadata when we don't need to back-up. + */ + if (advancing) { + if (info->reshape_progress <= *suspend_point) + sysfs_set_num(info, NULL, "suspend_lo", + info->reshape_progress); + } else { + /* Note: this won't work in 2.6.37 and before. + * Something somewhere should make sure we don't need it! + */ + if (info->reshape_progress >= *suspend_point) + sysfs_set_num(info, NULL, "suspend_hi", + info->reshape_progress); + } + + /* Now work out how far it is safe to progress. + * If the read_offset for ->reshape_progress is less than + * 'blocks' beyond the write_offset, we can only progress as far + * as a backup. + * Otherwise we can progress until the write_offset for the new location + * reaches (within 'blocks' of) the read_offset at the current location. + * However that region must be suspended unless we are using native + * metadata. 
+ * If we need to suspend more, we limit it to 128M per device, which is + * rather arbitrary and should be some time-based calculation. + */ + read_offset = info->reshape_progress / reshape->before.data_disks; + write_offset = info->reshape_progress / reshape->after.data_disks; + write_range = info->new_chunk/512; + if (reshape->before.data_disks == reshape->after.data_disks) + need_backup = array_size; + else + need_backup = reshape->backup_blocks; + if (advancing) { + if (read_offset < write_offset + write_range) + max_progress = backup_point; + else + max_progress = + read_offset * reshape->after.data_disks; + } else { + if (read_offset > write_offset - write_range) + /* Can only progress as far as has been backed up, + * which must be suspended */ + max_progress = backup_point; + else if (info->reshape_progress <= need_backup) + max_progress = backup_point; + else { + if (info->array.major_version >= 0) + /* Can progress until backup is needed */ + max_progress = need_backup; + else { + /* Can progress until metadata update is required */ + max_progress = + read_offset * reshape->after.data_disks; + /* but data must be suspended */ + if (max_progress < *suspend_point) + max_progress = *suspend_point; + } + } + } + + /* We know it is safe to progress to 'max_progress' providing + * it is suspended or we are using native metadata. + * Consider extending suspend_point 128M per device if it + * is less than 64M per device beyond reshape_progress. + * But always do a multiple of 'blocks' + * FIXME this is too big - it takes to long to complete + * this much. + */ + target = 64*1024*2 * min(reshape->before.data_disks, + reshape->after.data_disks); + target /= reshape->backup_blocks; + if (target < 2) + target = 2; + target *= reshape->backup_blocks; + + /* For externally managed metadata we always need to suspend IO to + * the area being reshaped so we regularly push suspend_point forward. 
+ * For native metadata we only need the suspend if we are going to do + * a backup. + */ + if (advancing) { + if ((need_backup > info->reshape_progress || + info->array.major_version < 0) && + *suspend_point < info->reshape_progress + target) { + if (need_backup < *suspend_point + 2 * target) + *suspend_point = need_backup; + else if (*suspend_point + 2 * target < array_size) + *suspend_point += 2 * target; + else + *suspend_point = array_size; + sysfs_set_num(info, NULL, "suspend_hi", *suspend_point); + if (max_progress > *suspend_point) + max_progress = *suspend_point; + } + } else { + if (info->array.major_version >= 0) { + /* Only need to suspend when about to backup */ + if (info->reshape_progress < need_backup * 2 && + *suspend_point > 0) { + *suspend_point = 0; + sysfs_set_num(info, NULL, "suspend_lo", 0); + sysfs_set_num(info, NULL, "suspend_hi", + need_backup); + } + } else { + /* Need to suspend continually */ + if (info->reshape_progress < *suspend_point) + *suspend_point = info->reshape_progress; + if (*suspend_point + target < info->reshape_progress) + /* No need to move suspend region yet */; + else { + if (*suspend_point >= 2 * target) + *suspend_point -= 2 * target; + else + *suspend_point = 0; + sysfs_set_num(info, NULL, "suspend_lo", + *suspend_point); + } + if (max_progress < *suspend_point) + max_progress = *suspend_point; + } + } + + /* now set sync_max to allow that progress. sync_max, like + * sync_completed is a count of sectors written per device, so + * we find the difference between max_progress and the start point, + * and divide that by after.data_disks to get a sync_max + * number. + * At the same time we convert wait_point to a similar number + * for comparing against sync_completed. 
+ */ + /* scale down max_progress to per_disk */ + max_progress /= reshape->after.data_disks; + /* + * Round to chunk size as some kernels give an erroneously + * high number + */ + max_progress /= info->new_chunk/512; + max_progress *= info->new_chunk/512; + /* And round to old chunk size as the kernel wants that */ + max_progress /= info->array.chunk_size/512; + max_progress *= info->array.chunk_size/512; + /* Limit progress to the whole device */ + if (max_progress > info->component_size) + max_progress = info->component_size; + wait_point /= reshape->after.data_disks; + if (!advancing) { + /* switch from 'device offset' to 'processed block count' */ + max_progress = info->component_size - max_progress; + wait_point = info->component_size - wait_point; + } + + if (!*frozen) + sysfs_set_num(info, NULL, "sync_max", max_progress); + + /* Now wait. If we have already reached the point that we were + * asked to wait to, don't wait at all, else wait for any change. + * We need to select on 'sync_completed' as that is the place that + * notifications happen, but we are really interested in + * 'reshape_position' + */ + fd = sysfs_get_fd(info, NULL, "sync_completed"); + if (fd < 0) + goto check_progress; + + if (sysfs_fd_get_ll(fd, &completed) < 0) + goto check_progress; + + while (completed < max_progress && completed < wait_point) { + /* Check that sync_action is still 'reshape' to avoid + * waiting forever on a dead array + */ + char action[20]; + if (sysfs_get_str(info, NULL, "sync_action", action, 20) <= 0 || + strncmp(action, "reshape", 7) != 0) + break; + /* Some kernels reset 'sync_completed' to zero + * before setting 'sync_action' to 'idle'. + * So we need these extra tests. 
+ */ + if (completed == 0 && advancing && + strncmp(action, "idle", 4) == 0 && + info->reshape_progress > 0) + break; + if (completed == 0 && !advancing && + strncmp(action, "idle", 4) == 0 && + info->reshape_progress < + (info->component_size * reshape->after.data_disks)) + break; + sysfs_wait(fd, NULL); + if (sysfs_fd_get_ll(fd, &completed) < 0) + goto check_progress; + } + /* Some kernels reset 'sync_completed' to zero, + * we need to have real point we are in md. + * So in that case, read 'reshape_position' from sysfs. + */ + if (completed == 0) { + unsigned long long reshapep; + char action[20]; + if (sysfs_get_str(info, NULL, "sync_action", action, 20) > 0 && + strncmp(action, "idle", 4) == 0 && + sysfs_get_ll(info, NULL, + "reshape_position", &reshapep) == 0) + *reshape_completed = reshapep; + } else { + /* some kernels can give an incorrectly high + * 'completed' number, so round down */ + completed /= (info->new_chunk/512); + completed *= (info->new_chunk/512); + /* Convert 'completed' back in to a 'progress' number */ + completed *= reshape->after.data_disks; + if (!advancing) + completed = (info->component_size + * reshape->after.data_disks + - completed); + *reshape_completed = completed; + } + + close(fd); + + /* We return the need_backup flag. Caller will decide + * how much - a multiple of ->backup_blocks up to *suspend_point + */ + if (advancing) + return need_backup > info->reshape_progress; + else + return need_backup >= info->reshape_progress; + +check_progress: + /* if we couldn't read a number from sync_completed, then + * either the reshape did complete, or it aborted. + * We can tell which by checking for 'none' in reshape_position. + * If it did abort, then it might immediately restart if it + * it was just a device failure that leaves us degraded but + * functioning. + */ + if (sysfs_get_str(info, NULL, "reshape_position", buf, + sizeof(buf)) < 0 || strncmp(buf, "none", 4) != 0) { + /* The abort might only be temporary. 
Wait up to 10 + * seconds for fd to contain a valid number again. + */ + int wait = 10000; + int rv = -2; + unsigned long long new_sync_max; + while (fd >= 0 && rv < 0 && wait > 0) { + if (sysfs_wait(fd, &wait) != 1) + break; + switch (sysfs_fd_get_ll(fd, &completed)) { + case 0: + /* all good again */ + rv = 1; + /* If "sync_max" is no longer max_progress + * we need to freeze things + */ + sysfs_get_ll(info, NULL, "sync_max", + &new_sync_max); + *frozen = (new_sync_max != max_progress); + break; + case -2: /* read error - abort */ + wait = 0; + break; + } + } + if (fd >= 0) + close(fd); + return rv; /* abort */ + } else { + /* Maybe racing with array shutdown - check state */ + if (fd >= 0) + close(fd); + if (sysfs_get_str(info, NULL, "array_state", buf, + sizeof(buf)) < 0 || + strncmp(buf, "inactive", 8) == 0 || + strncmp(buf, "clear",5) == 0) + return -2; /* abort */ + return -1; /* complete */ + } +} + +/* FIXME return status is never checked */ +static int grow_backup(struct mdinfo *sra, + unsigned long long offset, /* per device */ + unsigned long stripes, /* per device, in old chunks */ + int *sources, unsigned long long *offsets, + int disks, int chunk, int level, int layout, + int dests, int *destfd, unsigned long long *destoffsets, + int part, int *degraded, + char *buf) +{ + /* Backup 'blocks' sectors at 'offset' on each device of the array, + * to storage 'destfd' (offset 'destoffsets'), after first + * suspending IO. Then allow resync to continue + * over the suspended section. + * Use part 'part' of the backup-super-block. 
+ */ + int odata = disks; + int rv = 0; + int i; + unsigned long long ll; + int new_degraded; + //printf("offset %llu\n", offset); + if (level >= 4) + odata--; + if (level == 6) + odata--; + + /* Check that array hasn't become degraded, else we might backup the wrong data */ + if (sysfs_get_ll(sra, NULL, "degraded", &ll) < 0) + return -1; /* FIXME this error is ignored */ + new_degraded = (int)ll; + if (new_degraded != *degraded) { + /* check each device to ensure it is still working */ + struct mdinfo *sd; + for (sd = sra->devs ; sd ; sd = sd->next) { + if (sd->disk.state & (1<<MD_DISK_FAULTY)) + continue; + if (sd->disk.state & (1<<MD_DISK_SYNC)) { + char sbuf[100]; + + if (sysfs_get_str(sra, sd, "state", + sbuf, sizeof(sbuf)) < 0 || + strstr(sbuf, "faulty") || + strstr(sbuf, "in_sync") == NULL) { + /* this device is dead */ + sd->disk.state = (1<<MD_DISK_FAULTY); + if (sd->disk.raid_disk >= 0 && + sources[sd->disk.raid_disk] >= 0) { + close(sources[sd->disk.raid_disk]); + sources[sd->disk.raid_disk] = -1; + } + } + } + } + *degraded = new_degraded; + } + if (part) { + bsb.arraystart2 = __cpu_to_le64(offset * odata); + bsb.length2 = __cpu_to_le64(stripes * (chunk/512) * odata); + } else { + bsb.arraystart = __cpu_to_le64(offset * odata); + bsb.length = __cpu_to_le64(stripes * (chunk/512) * odata); + } + if (part) + bsb.magic[15] = '2'; + for (i = 0; i < dests; i++) + if (part) + lseek64(destfd[i], destoffsets[i] + + __le64_to_cpu(bsb.devstart2)*512, 0); + else + lseek64(destfd[i], destoffsets[i], 0); + + rv = save_stripes(sources, offsets, disks, chunk, level, layout, + dests, destfd, offset * 512 * odata, + stripes * chunk * odata, buf); + + if (rv) + return rv; + bsb.mtime = __cpu_to_le64(time(0)); + for (i = 0; i < dests; i++) { + bsb.devstart = __cpu_to_le64(destoffsets[i]/512); + + bsb.sb_csum = bsb_csum((char*)&bsb, + ((char*)&bsb.sb_csum)-((char*)&bsb)); + if (memcmp(bsb.magic, "md_backup_data-2", 16) == 0) + bsb.sb_csum2 = bsb_csum((char*)&bsb, + 
((char*)&bsb.sb_csum2)-((char*)&bsb)); + + rv = -1; + if ((unsigned long long)lseek64(destfd[i], + destoffsets[i] - 4096, 0) != + destoffsets[i] - 4096) + break; + if (write(destfd[i], &bsb, 512) != 512) + break; + if (destoffsets[i] > 4096) { + if ((unsigned long long)lseek64(destfd[i], destoffsets[i]+stripes*chunk*odata, 0) != + destoffsets[i]+stripes*chunk*odata) + break; + if (write(destfd[i], &bsb, 512) != 512) + break; + } + fsync(destfd[i]); + rv = 0; + } + + return rv; +} + +/* in 2.6.30, the value reported by sync_completed can be + * less that it should be by one stripe. + * This only happens when reshape hits sync_max and pauses. + * So allow wait_backup to either extent sync_max further + * than strictly necessary, or return before the + * sync has got quite as far as we would really like. + * This is what 'blocks2' is for. + * The various caller give appropriate values so that + * every works. + */ +/* FIXME return value is often ignored */ +static int forget_backup(int dests, int *destfd, + unsigned long long *destoffsets, + int part) +{ + /* + * Erase backup 'part' (which is 0 or 1) + */ + int i; + int rv; + + if (part) { + bsb.arraystart2 = __cpu_to_le64(0); + bsb.length2 = __cpu_to_le64(0); + } else { + bsb.arraystart = __cpu_to_le64(0); + bsb.length = __cpu_to_le64(0); + } + bsb.mtime = __cpu_to_le64(time(0)); + rv = 0; + for (i = 0; i < dests; i++) { + bsb.devstart = __cpu_to_le64(destoffsets[i]/512); + bsb.sb_csum = bsb_csum((char*)&bsb, + ((char*)&bsb.sb_csum)-((char*)&bsb)); + if (memcmp(bsb.magic, "md_backup_data-2", 16) == 0) + bsb.sb_csum2 = bsb_csum((char*)&bsb, + ((char*)&bsb.sb_csum2)-((char*)&bsb)); + if ((unsigned long long)lseek64(destfd[i], destoffsets[i]-4096, 0) != + destoffsets[i]-4096) + rv = -1; + if (rv == 0 && write(destfd[i], &bsb, 512) != 512) + rv = -1; + fsync(destfd[i]); + } + return rv; +} + +static void fail(char *msg) +{ + int rv; + rv = (write(2, msg, strlen(msg)) != (int)strlen(msg)); + rv |= (write(2, "\n", 1) != 
1); + exit(rv ? 1 : 2); +} + +static char *abuf, *bbuf; +static unsigned long long abuflen; +static void validate(int afd, int bfd, unsigned long long offset) +{ + /* check that the data in the backup against the array. + * This is only used for regression testing and should not + * be used while the array is active + */ + if (afd < 0) + return; + lseek64(bfd, offset - 4096, 0); + if (read(bfd, &bsb2, 512) != 512) + fail("cannot read bsb"); + if (bsb2.sb_csum != bsb_csum((char*)&bsb2, + ((char*)&bsb2.sb_csum)-((char*)&bsb2))) + fail("first csum bad"); + if (memcmp(bsb2.magic, "md_backup_data", 14) != 0) + fail("magic is bad"); + if (memcmp(bsb2.magic, "md_backup_data-2", 16) == 0 && + bsb2.sb_csum2 != bsb_csum((char*)&bsb2, + ((char*)&bsb2.sb_csum2)-((char*)&bsb2))) + fail("second csum bad"); + + if (__le64_to_cpu(bsb2.devstart)*512 != offset) + fail("devstart is wrong"); + + if (bsb2.length) { + unsigned long long len = __le64_to_cpu(bsb2.length)*512; + + if (abuflen < len) { + free(abuf); + free(bbuf); + abuflen = len; + if (posix_memalign((void**)&abuf, 4096, abuflen) || + posix_memalign((void**)&bbuf, 4096, abuflen)) { + abuflen = 0; + /* just stop validating on mem-alloc failure */ + return; + } + } + + lseek64(bfd, offset, 0); + if ((unsigned long long)read(bfd, bbuf, len) != len) { + //printf("len %llu\n", len); + fail("read first backup failed"); + } + lseek64(afd, __le64_to_cpu(bsb2.arraystart)*512, 0); + if ((unsigned long long)read(afd, abuf, len) != len) + fail("read first from array failed"); + if (memcmp(bbuf, abuf, len) != 0) { +#if 0 + int i; + printf("offset=%llu len=%llu\n", + (unsigned long long)__le64_to_cpu(bsb2.arraystart)*512, len); + for (i=0; i<len; i++) + if (bbuf[i] != abuf[i]) { + printf("first diff byte %d\n", i); + break; + } +#endif + fail("data1 compare failed"); + } + } + if (bsb2.length2) { + unsigned long long len = __le64_to_cpu(bsb2.length2)*512; + + if (abuflen < len) { + free(abuf); + free(bbuf); + abuflen = len; + abuf = 
xmalloc(abuflen); + bbuf = xmalloc(abuflen); + } + + lseek64(bfd, offset+__le64_to_cpu(bsb2.devstart2)*512, 0); + if ((unsigned long long)read(bfd, bbuf, len) != len) + fail("read second backup failed"); + lseek64(afd, __le64_to_cpu(bsb2.arraystart2)*512, 0); + if ((unsigned long long)read(afd, abuf, len) != len) + fail("read second from array failed"); + if (memcmp(bbuf, abuf, len) != 0) + fail("data2 compare failed"); + } +} + +int child_monitor(int afd, struct mdinfo *sra, struct reshape *reshape, + struct supertype *st, unsigned long blocks, + int *fds, unsigned long long *offsets, + int dests, int *destfd, unsigned long long *destoffsets) +{ + /* Monitor a reshape where backup is being performed using + * 'native' mechanism - either to a backup file, or + * to some space in a spare. + */ + char *buf; + int degraded = -1; + unsigned long long speed; + unsigned long long suspend_point, array_size; + unsigned long long backup_point, wait_point; + unsigned long long reshape_completed; + int done = 0; + int increasing = reshape->after.data_disks >= + reshape->before.data_disks; + int part = 0; /* The next part of the backup area to fill. It + * may already be full, so we need to check */ + int level = reshape->level; + int layout = reshape->before.layout; + int data = reshape->before.data_disks; + int disks = reshape->before.data_disks + reshape->parity; + int chunk = sra->array.chunk_size; + struct mdinfo *sd; + unsigned long stripes; + int uuid[4]; + int frozen = 0; + + /* set up the backup-super-block. This requires the + * uuid from the array. 
+ */ + /* Find a superblock */ + for (sd = sra->devs; sd; sd = sd->next) { + char *dn; + int devfd; + int ok; + if (sd->disk.state & (1<<MD_DISK_FAULTY)) + continue; + dn = map_dev(sd->disk.major, sd->disk.minor, 1); + devfd = dev_open(dn, O_RDONLY); + if (devfd < 0) + continue; + ok = st->ss->load_super(st, devfd, NULL); + close(devfd); + if (ok == 0) + break; + } + if (!sd) { + pr_err("Cannot find a superblock\n"); + return 0; + } + + memset(&bsb, 0, 512); + memcpy(bsb.magic, "md_backup_data-1", 16); + st->ss->uuid_from_super(st, uuid); + memcpy(bsb.set_uuid, uuid, 16); + bsb.mtime = __cpu_to_le64(time(0)); + bsb.devstart2 = blocks; + + stripes = blocks / (sra->array.chunk_size/512) / + reshape->before.data_disks; + + if (posix_memalign((void**)&buf, 4096, disks * chunk)) + /* Don't start the 'reshape' */ + return 0; + if (reshape->before.data_disks == reshape->after.data_disks) { + sysfs_get_ll(sra, NULL, "sync_speed_min", &speed); + sysfs_set_num(sra, NULL, "sync_speed_min", 200000); + } + + if (increasing) { + array_size = sra->component_size * reshape->after.data_disks; + backup_point = sra->reshape_progress; + suspend_point = 0; + } else { + array_size = sra->component_size * reshape->before.data_disks; + backup_point = reshape->backup_blocks; + suspend_point = array_size; + } + + while (!done) { + int rv; + + /* Want to return as soon the oldest backup slot can + * be released as that allows us to start backing up + * some more, providing suspend_point has been + * advanced, which it should have. 
+ */ + if (increasing) { + wait_point = array_size; + if (part == 0 && __le64_to_cpu(bsb.length) > 0) + wait_point = (__le64_to_cpu(bsb.arraystart) + + __le64_to_cpu(bsb.length)); + if (part == 1 && __le64_to_cpu(bsb.length2) > 0) + wait_point = (__le64_to_cpu(bsb.arraystart2) + + __le64_to_cpu(bsb.length2)); + } else { + wait_point = 0; + if (part == 0 && __le64_to_cpu(bsb.length) > 0) + wait_point = __le64_to_cpu(bsb.arraystart); + if (part == 1 && __le64_to_cpu(bsb.length2) > 0) + wait_point = __le64_to_cpu(bsb.arraystart2); + } + + reshape_completed = sra->reshape_progress; + rv = progress_reshape(sra, reshape, + backup_point, wait_point, + &suspend_point, &reshape_completed, + &frozen); + /* external metadata would need to ping_monitor here */ + sra->reshape_progress = reshape_completed; + + /* Clear any backup region that is before 'here' */ + if (increasing) { + if (__le64_to_cpu(bsb.length) > 0 && + reshape_completed >= (__le64_to_cpu(bsb.arraystart) + + __le64_to_cpu(bsb.length))) + forget_backup(dests, destfd, + destoffsets, 0); + if (__le64_to_cpu(bsb.length2) > 0 && + reshape_completed >= (__le64_to_cpu(bsb.arraystart2) + + __le64_to_cpu(bsb.length2))) + forget_backup(dests, destfd, + destoffsets, 1); + } else { + if (__le64_to_cpu(bsb.length) > 0 && + reshape_completed <= (__le64_to_cpu(bsb.arraystart))) + forget_backup(dests, destfd, + destoffsets, 0); + if (__le64_to_cpu(bsb.length2) > 0 && + reshape_completed <= (__le64_to_cpu(bsb.arraystart2))) + forget_backup(dests, destfd, + destoffsets, 1); + } + if (sigterm) + rv = -2; + if (rv < 0) { + if (rv == -1) + done = 1; + break; + } + if (rv == 0 && increasing && !st->ss->external) { + /* No longer need to monitor this reshape */ + sysfs_set_str(sra, NULL, "sync_max", "max"); + done = 1; + break; + } + + while (rv) { + unsigned long long offset; + unsigned long actual_stripes; + /* Need to backup some data. 
+ * If 'part' is not used and the desired + * backup size is suspended, do a backup, + * then consider the next part. + */ + /* Check that 'part' is unused */ + if (part == 0 && __le64_to_cpu(bsb.length) != 0) + break; + if (part == 1 && __le64_to_cpu(bsb.length2) != 0) + break; + + offset = backup_point / data; + actual_stripes = stripes; + if (increasing) { + if (offset + actual_stripes * (chunk/512) > + sra->component_size) + actual_stripes = ((sra->component_size - offset) + / (chunk/512)); + if (offset + actual_stripes * (chunk/512) > + suspend_point/data) + break; + } else { + if (offset < actual_stripes * (chunk/512)) + actual_stripes = offset / (chunk/512); + offset -= actual_stripes * (chunk/512); + if (offset < suspend_point/data) + break; + } + if (actual_stripes == 0) + break; + grow_backup(sra, offset, actual_stripes, fds, offsets, + disks, chunk, level, layout, dests, destfd, + destoffsets, part, °raded, buf); + validate(afd, destfd[0], destoffsets[0]); + /* record where 'part' is up to */ + part = !part; + if (increasing) + backup_point += actual_stripes * (chunk/512) * data; + else + backup_point -= actual_stripes * (chunk/512) * data; + } + } + + /* FIXME maybe call progress_reshape one more time instead */ + /* remove any remaining suspension */ + sysfs_set_num(sra, NULL, "suspend_lo", 0x7FFFFFFFFFFFFFFFULL); + sysfs_set_num(sra, NULL, "suspend_hi", 0); + sysfs_set_num(sra, NULL, "suspend_lo", 0); + sysfs_set_num(sra, NULL, "sync_min", 0); + + if (reshape->before.data_disks == reshape->after.data_disks) + sysfs_set_num(sra, NULL, "sync_speed_min", speed); + free(buf); + return done; +} + +/* + * If any spare contains md_back_data-1 which is recent wrt mtime, + * write that data into the array and update the super blocks with + * the new reshape_progress + */ +int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, + int cnt, char *backup_file, int verbose) +{ + int i, j; + int old_disks; + unsigned long long *offsets; + unsigned 
long long nstripe, ostripe; + int ndata, odata; + + odata = info->array.raid_disks - info->delta_disks - 1; + if (info->array.level == 6) + odata--; /* number of data disks */ + ndata = info->array.raid_disks - 1; + if (info->new_level == 6) + ndata--; + + old_disks = info->array.raid_disks - info->delta_disks; + + if (info->delta_disks <= 0) + /* Didn't grow, so the backup file must have + * been used + */ + old_disks = cnt; + for (i=old_disks-(backup_file?1:0); i<cnt; i++) { + struct mdinfo dinfo; + int fd; + int bsbsize; + char *devname, namebuf[20]; + unsigned long long lo, hi; + + /* This was a spare and may have some saved data on it. + * Load the superblock, find and load the + * backup_super_block. + * If either fail, go on to next device. + * If the backup contains no new info, just return + * else restore data and update all superblocks + */ + if (i == old_disks-1) { + fd = open(backup_file, O_RDONLY); + if (fd<0) { + pr_err("backup file %s inaccessible: %s\n", + backup_file, strerror(errno)); + continue; + } + devname = backup_file; + } else { + fd = fdlist[i]; + if (fd < 0) + continue; + if (st->ss->load_super(st, fd, NULL)) + continue; + + st->ss->getinfo_super(st, &dinfo, NULL); + st->ss->free_super(st); + + if (lseek64(fd, + (dinfo.data_offset + dinfo.component_size - 8) <<9, + 0) < 0) { + pr_err("Cannot seek on device %d\n", i); + continue; /* Cannot seek */ + } + sprintf(namebuf, "device-%d", i); + devname = namebuf; + } + if (read(fd, &bsb, sizeof(bsb)) != sizeof(bsb)) { + if (verbose) + pr_err("Cannot read from %s\n", devname); + continue; /* Cannot read */ + } + if (memcmp(bsb.magic, "md_backup_data-1", 16) != 0 && + memcmp(bsb.magic, "md_backup_data-2", 16) != 0) { + if (verbose) + pr_err("No backup metadata on %s\n", devname); + continue; + } + if (bsb.sb_csum != bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum)-((char*)&bsb))) { + if (verbose) + pr_err("Bad backup-metadata checksum on %s\n", + devname); + continue; /* bad checksum */ + } + if 
(memcmp(bsb.magic, "md_backup_data-2", 16) == 0 && + bsb.sb_csum2 != bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum2)-((char*)&bsb))) { + if (verbose) + pr_err("Bad backup-metadata checksum2 on %s\n", + devname); + continue; /* Bad second checksum */ + } + if (memcmp(bsb.set_uuid,info->uuid, 16) != 0) { + if (verbose) + pr_err("Wrong uuid on backup-metadata on %s\n", + devname); + continue; /* Wrong uuid */ + } + + /* + * array utime and backup-mtime should be updated at + * much the same time, but it seems that sometimes + * they aren't... So allow considerable flexability in + * matching, and allow this test to be overridden by + * an environment variable. + */ + if(time_after(info->array.utime, (unsigned int)__le64_to_cpu(bsb.mtime) + 2*60*60) || + time_before(info->array.utime, (unsigned int)__le64_to_cpu(bsb.mtime) - 10*60)) { + if (check_env("MDADM_GROW_ALLOW_OLD")) { + pr_err("accepting backup with timestamp %lu for array with timestamp %lu\n", + (unsigned long)__le64_to_cpu(bsb.mtime), + (unsigned long)info->array.utime); + } else { + pr_err("too-old timestamp on backup-metadata on %s\n", devname); + pr_err("If you think it is should be safe, try 'export MDADM_GROW_ALLOW_OLD=1'\n"); + continue; /* time stamp is too bad */ + } + } + + if (bsb.magic[15] == '1') { + if (bsb.length == 0) + continue; + if (info->delta_disks >= 0) { + /* reshape_progress is increasing */ + if (__le64_to_cpu(bsb.arraystart) + + __le64_to_cpu(bsb.length) + < info->reshape_progress) { + nonew: + if (verbose) + pr_err("backup-metadata found on %s but is not needed\n", devname); + continue; /* No new data here */ + } + } else { + /* reshape_progress is decreasing */ + if (__le64_to_cpu(bsb.arraystart) >= + info->reshape_progress) + goto nonew; /* No new data here */ + } + } else { + if (bsb.length == 0 && bsb.length2 == 0) + continue; + if (info->delta_disks >= 0) { + /* reshape_progress is increasing */ + if ((__le64_to_cpu(bsb.arraystart) + + __le64_to_cpu(bsb.length) + < 
info->reshape_progress) && + (__le64_to_cpu(bsb.arraystart2) + + __le64_to_cpu(bsb.length2) + < info->reshape_progress)) + goto nonew; /* No new data here */ + } else { + /* reshape_progress is decreasing */ + if (__le64_to_cpu(bsb.arraystart) >= + info->reshape_progress && + __le64_to_cpu(bsb.arraystart2) >= + info->reshape_progress) + goto nonew; /* No new data here */ + } + } + if (lseek64(fd, __le64_to_cpu(bsb.devstart)*512, 0)< 0) { + second_fail: + if (verbose) + pr_err("Failed to verify secondary backup-metadata block on %s\n", + devname); + continue; /* Cannot seek */ + } + /* There should be a duplicate backup superblock 4k before here */ + if (lseek64(fd, -4096, 1) < 0 || + read(fd, &bsb2, sizeof(bsb2)) != sizeof(bsb2)) + goto second_fail; /* Cannot find leading superblock */ + if (bsb.magic[15] == '1') + bsbsize = offsetof(struct mdp_backup_super, pad1); + else + bsbsize = offsetof(struct mdp_backup_super, pad); + if (memcmp(&bsb2, &bsb, bsbsize) != 0) + goto second_fail; /* Cannot find leading superblock */ + + /* Now need the data offsets for all devices. 
*/ + offsets = xmalloc(sizeof(*offsets)*info->array.raid_disks); + for(j=0; j<info->array.raid_disks; j++) { + if (fdlist[j] < 0) + continue; + if (st->ss->load_super(st, fdlist[j], NULL)) + /* FIXME should be this be an error */ + continue; + st->ss->getinfo_super(st, &dinfo, NULL); + st->ss->free_super(st); + offsets[j] = dinfo.data_offset * 512; + } + printf("%s: restoring critical section\n", Name); + + if (restore_stripes(fdlist, offsets, info->array.raid_disks, + info->new_chunk, info->new_level, + info->new_layout, fd, + __le64_to_cpu(bsb.devstart)*512, + __le64_to_cpu(bsb.arraystart)*512, + __le64_to_cpu(bsb.length)*512, NULL)) { + /* didn't succeed, so giveup */ + if (verbose) + pr_err("Error restoring backup from %s\n", + devname); + free(offsets); + return 1; + } + + if (bsb.magic[15] == '2' && + restore_stripes(fdlist, offsets, info->array.raid_disks, + info->new_chunk, info->new_level, + info->new_layout, fd, + __le64_to_cpu(bsb.devstart)*512 + + __le64_to_cpu(bsb.devstart2)*512, + __le64_to_cpu(bsb.arraystart2)*512, + __le64_to_cpu(bsb.length2)*512, NULL)) { + /* didn't succeed, so giveup */ + if (verbose) + pr_err("Error restoring second backup from %s\n", + devname); + free(offsets); + return 1; + } + + free(offsets); + + /* Ok, so the data is restored. Let's update those superblocks. 
*/ + + lo = hi = 0; + if (bsb.length) { + lo = __le64_to_cpu(bsb.arraystart); + hi = lo + __le64_to_cpu(bsb.length); + } + if (bsb.magic[15] == '2' && bsb.length2) { + unsigned long long lo1, hi1; + lo1 = __le64_to_cpu(bsb.arraystart2); + hi1 = lo1 + __le64_to_cpu(bsb.length2); + if (lo == hi) { + lo = lo1; + hi = hi1; + } else if (lo < lo1) + hi = hi1; + else + lo = lo1; + } + if (lo < hi && (info->reshape_progress < lo || + info->reshape_progress > hi)) + /* backup does not affect reshape_progress*/ ; + else if (info->delta_disks >= 0) { + info->reshape_progress = __le64_to_cpu(bsb.arraystart) + + __le64_to_cpu(bsb.length); + if (bsb.magic[15] == '2') { + unsigned long long p2; + + p2 = __le64_to_cpu(bsb.arraystart2) + + __le64_to_cpu(bsb.length2); + if (p2 > info->reshape_progress) + info->reshape_progress = p2; + } + } else { + info->reshape_progress = __le64_to_cpu(bsb.arraystart); + if (bsb.magic[15] == '2') { + unsigned long long p2; + + p2 = __le64_to_cpu(bsb.arraystart2); + if (p2 < info->reshape_progress) + info->reshape_progress = p2; + } + } + for (j=0; j<info->array.raid_disks; j++) { + if (fdlist[j] < 0) + continue; + if (st->ss->load_super(st, fdlist[j], NULL)) + continue; + st->ss->getinfo_super(st, &dinfo, NULL); + dinfo.reshape_progress = info->reshape_progress; + st->ss->update_super(st, &dinfo, "_reshape_progress", + NULL,0, 0, NULL); + st->ss->store_super(st, fdlist[j]); + st->ss->free_super(st); + } + return 0; + } + /* Didn't find any backup data, try to see if any + * was needed. + */ + if (info->delta_disks < 0) { + /* When shrinking, the critical section is at the end. + * So see if we are before the critical section. 
+ */ + unsigned long long first_block; + nstripe = ostripe = 0; + first_block = 0; + while (ostripe >= nstripe) { + ostripe += info->array.chunk_size / 512; + first_block = ostripe * odata; + nstripe = first_block / ndata / (info->new_chunk/512) * + (info->new_chunk/512); + } + + if (info->reshape_progress >= first_block) + return 0; + } + if (info->delta_disks > 0) { + /* See if we are beyond the critical section. */ + unsigned long long last_block; + nstripe = ostripe = 0; + last_block = 0; + while (nstripe >= ostripe) { + nstripe += info->new_chunk / 512; + last_block = nstripe * ndata; + ostripe = last_block / odata / (info->array.chunk_size/512) * + (info->array.chunk_size/512); + } + + if (info->reshape_progress >= last_block) + return 0; + } + /* needed to recover critical section! */ + if (verbose) + pr_err("Failed to find backup of critical section\n"); + return 1; +} + +int Grow_continue_command(char *devname, int fd, + char *backup_file, int verbose) +{ + int ret_val = 0; + struct supertype *st = NULL; + struct mdinfo *content = NULL; + struct mdinfo array; + char *subarray = NULL; + struct mdinfo *cc = NULL; + struct mdstat_ent *mdstat = NULL; + int cfd = -1; + int fd2; + + dprintf("Grow continue from command line called for %s\n", devname); + + st = super_by_fd(fd, &subarray); + if (!st || !st->ss) { + pr_err("Unable to determine metadata format for %s\n", devname); + return 1; + } + dprintf("Grow continue is run for "); + if (st->ss->external == 0) { + int d; + int cnt = 5; + dprintf_cont("native array (%s)\n", devname); + if (md_get_array_info(fd, &array.array) < 0) { + pr_err("%s is not an active md array - aborting\n", + devname); + ret_val = 1; + goto Grow_continue_command_exit; + } + content = &array; + sysfs_init(content, fd, NULL); + /* Need to load a superblock. 
+ * FIXME we should really get what we need from + * sysfs + */ + do { + for (d = 0; d < MAX_DISKS; d++) { + mdu_disk_info_t disk; + char *dv; + int err; + disk.number = d; + if (md_get_disk_info(fd, &disk) < 0) + continue; + if (disk.major == 0 && disk.minor == 0) + continue; + if ((disk.state & (1 << MD_DISK_ACTIVE)) == 0) + continue; + dv = map_dev(disk.major, disk.minor, 1); + if (!dv) + continue; + fd2 = dev_open(dv, O_RDONLY); + if (fd2 < 0) + continue; + err = st->ss->load_super(st, fd2, NULL); + close(fd2); + if (err) + continue; + break; + } + if (d == MAX_DISKS) { + pr_err("Unable to load metadata for %s\n", + devname); + ret_val = 1; + goto Grow_continue_command_exit; + } + st->ss->getinfo_super(st, content, NULL); + if (!content->reshape_active) + sleep(3); + else + break; + } while (cnt-- > 0); + } else { + char *container; + + if (subarray) { + dprintf_cont("subarray (%s)\n", subarray); + container = st->container_devnm; + cfd = open_dev_excl(st->container_devnm); + } else { + container = st->devnm; + close(fd); + cfd = open_dev_excl(st->devnm); + dprintf_cont("container (%s)\n", container); + fd = cfd; + } + if (cfd < 0) { + pr_err("Unable to open container for %s\n", devname); + ret_val = 1; + goto Grow_continue_command_exit; + } + + /* find in container array under reshape + */ + ret_val = st->ss->load_container(st, cfd, NULL); + if (ret_val) { + pr_err("Cannot read superblock for %s\n", devname); + ret_val = 1; + goto Grow_continue_command_exit; + } + + cc = st->ss->container_content(st, subarray); + for (content = cc; content ; content = content->next) { + char *array_name; + int allow_reshape = 1; + + if (content->reshape_active == 0) + continue; + /* The decision about array or container wide + * reshape is taken in Grow_continue based + * content->reshape_active state, therefore we + * need to check_reshape based on + * reshape_active and subarray name + */ + if (content->array.state & (1<<MD_SB_BLOCK_VOLUME)) + allow_reshape = 0; + if 
(content->reshape_active == CONTAINER_RESHAPE && + (content->array.state + & (1<<MD_SB_BLOCK_CONTAINER_RESHAPE))) + allow_reshape = 0; + + if (!allow_reshape) { + pr_err("cannot continue reshape of an array in container with unsupported metadata: %s(%s)\n", + devname, container); + ret_val = 1; + goto Grow_continue_command_exit; + } + + array_name = strchr(content->text_version+1, '/')+1; + mdstat = mdstat_by_subdev(array_name, container); + if (!mdstat) + continue; + if (mdstat->active == 0) { + pr_err("Skipping inactive array %s.\n", + mdstat->devnm); + free_mdstat(mdstat); + mdstat = NULL; + continue; + } + break; + } + if (!content) { + pr_err("Unable to determine reshaped array for %s\n", devname); + ret_val = 1; + goto Grow_continue_command_exit; + } + fd2 = open_dev(mdstat->devnm); + if (fd2 < 0) { + pr_err("cannot open (%s)\n", mdstat->devnm); + ret_val = 1; + goto Grow_continue_command_exit; + } + + if (sysfs_init(content, fd2, mdstat->devnm)) { + pr_err("Unable to initialize sysfs for %s, Grow cannot continue.\n", + mdstat->devnm); + ret_val = 1; + close(fd2); + goto Grow_continue_command_exit; + } + + close(fd2); + + /* start mdmon in case it is not running + */ + if (!mdmon_running(container)) + start_mdmon(container); + ping_monitor(container); + + if (mdmon_running(container)) + st->update_tail = &st->updates; + else { + pr_err("No mdmon found. 
Grow cannot continue.\n"); + ret_val = 1; + goto Grow_continue_command_exit; + } + } + + /* verify that array under reshape is started from + * correct position + */ + if (verify_reshape_position(content, content->array.level) < 0) { + ret_val = 1; + goto Grow_continue_command_exit; + } + + /* continue reshape + */ + ret_val = Grow_continue(fd, st, content, backup_file, 1, 0); + +Grow_continue_command_exit: + if (cfd > -1) + close(cfd); + st->ss->free_super(st); + free_mdstat(mdstat); + sysfs_free(cc); + free(subarray); + + return ret_val; +} + +int Grow_continue(int mdfd, struct supertype *st, struct mdinfo *info, + char *backup_file, int forked, int freeze_reshape) +{ + int ret_val = 2; + + if (!info->reshape_active) + return ret_val; + + if (st->ss->external) { + int cfd = open_dev(st->container_devnm); + + if (cfd < 0) + return 1; + + st->ss->load_container(st, cfd, st->container_devnm); + close(cfd); + ret_val = reshape_container(st->container_devnm, NULL, mdfd, + st, info, 0, backup_file, 0, + forked, 1 | info->reshape_active, + freeze_reshape); + } else + ret_val = reshape_array(NULL, mdfd, "array", st, info, 1, + NULL, INVALID_SECTORS, backup_file, + 0, forked, 1 | info->reshape_active, + freeze_reshape); + + return ret_val; +} + +char *make_backup(char *name) +{ + char *base = "backup_file-"; + int len; + char *fname; + + len = strlen(MAP_DIR) + 1 + strlen(base) + strlen(name)+1; + fname = xmalloc(len); + sprintf(fname, "%s/%s%s", MAP_DIR, base, name); + return fname; +} + +char *locate_backup(char *name) +{ + char *fl = make_backup(name); + struct stat stb; + + if (stat(fl, &stb) == 0 && S_ISREG(stb.st_mode)) + return fl; + + free(fl); + return NULL; +} @@ -0,0 +1,13 @@ + +To build mdadm, simply run: + + make + +to install, run + + make install + +as root. + + +No configuration is necessary. 
diff --git a/Incremental.c b/Incremental.c new file mode 100644 index 0000000..a57fc32 --- /dev/null +++ b/Incremental.c @@ -0,0 +1,1764 @@ +/* + * Incremental.c - support --incremental. Part of: + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2006-2013 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + * Paper: Neil Brown + * Novell Inc + * GPO Box Q1283 + * QVB Post Office, NSW 1230 + * Australia + */ + +#include "mdadm.h" +#include <sys/wait.h> +#include <dirent.h> +#include <ctype.h> + +static int count_active(struct supertype *st, struct mdinfo *sra, + int mdfd, char **availp, + struct mdinfo *info); +static void find_reject(int mdfd, struct supertype *st, struct mdinfo *sra, + int number, __u64 events, int verbose, + char *array_name); +static int try_spare(char *devname, int *dfdp, struct dev_policy *pol, + struct map_ent *target, + struct supertype *st, int verbose); + +static int Incremental_container(struct supertype *st, char *devname, + struct context *c, char *only); + +int Incremental(struct mddev_dev *devlist, struct context *c, + struct supertype *st) +{ + /* Add this device to an array, creating the array if necessary + * and starting the array if sensible or - if 
runstop>0 - if possible. + * + * This has several steps: + * + * 1/ Check if device is permitted by mdadm.conf, reject if not. + * 2/ Find metadata, reject if none appropriate (check + * version/name from args) + * 3/ Check if there is a match in mdadm.conf + * 3a/ if not, check for homehost match. If no match, assemble as + * a 'foreign' array. + * 4/ Determine device number. + * - If in mdadm.conf with std name, use that + * - UUID in /var/run/mdadm.map use that + * - If name is suggestive, use that. unless in use with different uuid. + * - Choose a free, high number. + * - Use a partitioned device unless strong suggestion not to. + * e.g. auto=md + * Don't choose partitioned for containers. + * 5/ Find out if array already exists + * 5a/ if it does not + * - choose a name, from mdadm.conf or 'name' field in array. + * - create the array + * - add the device + * 5b/ if it does + * - check one drive in array to make sure metadata is a reasonably + * close match. Reject if not (e.g. different type) + * - add the device + * 6/ Make sure /var/run/mdadm.map contains this array. + * 7/ Is there enough devices to possibly start the array? + * For a container, this means running Incremental_container. + * 7a/ if not, finish with success. + * 7b/ if yes, + * - read all metadata and arrange devices like -A does + * - if number of OK devices match expected, or -R and there are enough, + * start the array (auto-readonly). 
+ */ + dev_t rdev, rdev2; + struct mdinfo info, dinfo; + struct mdinfo *sra = NULL, *d; + struct mddev_ident *match; + char chosen_name[1024]; + char *md_devname; + int rv = 1; + struct map_ent *mp, *map = NULL; + int dfd = -1, mdfd = -1; + char *avail = NULL; + int active_disks; + int trustworthy; + char *name_to_use; + struct dev_policy *policy = NULL; + struct map_ent target_array; + int have_target; + char *devname = devlist->devname; + int journal_device_missing = 0; + + struct createinfo *ci = conf_get_create_info(); + + if (!stat_is_blkdev(devname, &rdev)) + return rv; + dfd = dev_open(devname, O_RDONLY); + if (dfd < 0) { + if (c->verbose >= 0) + pr_err("cannot open %s: %s.\n", + devname, strerror(errno)); + return rv; + } + /* If the device is a container, we do something very different */ + if (must_be_container(dfd)) { + if (!st) + st = super_by_fd(dfd, NULL); + if (st && st->ss->load_container) + rv = st->ss->load_container(st, dfd, NULL); + + close(dfd); + if (!rv && st->ss->container_content) { + if (map_lock(&map)) + pr_err("failed to get exclusive lock on mapfile\n"); + if (c->export) + printf("MD_DEVNAME=%s\n", devname); + rv = Incremental_container(st, devname, c, NULL); + map_unlock(&map); + return rv; + } + + pr_err("%s is not part of an md array.\n", + devname); + return rv; + } + + /* 1/ Check if device is permitted by mdadm.conf */ + + for (;devlist; devlist = devlist->next) + if (conf_test_dev(devlist->devname)) + break; + if (!devlist) { + devlist = conf_get_devs(); + for (;devlist; devlist = devlist->next) { + if (stat_is_blkdev(devlist->devname, &rdev2) && + rdev2 == rdev) + break; + } + } + if (!devlist) { + if (c->verbose >= 0) + pr_err("%s not permitted by mdadm.conf.\n", + devname); + goto out; + } + + /* 2/ Find metadata, reject if none appropriate (check + * version/name from args) */ + + if (!fstat_is_blkdev(dfd, devname, &rdev)) + goto out; + + dinfo.disk.major = major(rdev); + dinfo.disk.minor = minor(rdev); + + policy = 
disk_policy(&dinfo); + have_target = policy_check_path(&dinfo, &target_array); + + if (st == NULL && (st = guess_super_type(dfd, guess_array)) == NULL) { + if (c->verbose >= 0) + pr_err("no recognisable superblock on %s.\n", + devname); + rv = try_spare(devname, &dfd, policy, + have_target ? &target_array : NULL, + NULL, c->verbose); + goto out; + } + st->ignore_hw_compat = 0; + + if (st->ss->compare_super == NULL || + st->ss->load_super(st, dfd, c->verbose >= 0 ? devname : NULL)) { + if (c->verbose >= 0) + pr_err("no RAID superblock on %s.\n", + devname); + rv = try_spare(devname, &dfd, policy, + have_target ? &target_array : NULL, + st, c->verbose); + free(st); + goto out; + } + close (dfd); dfd = -1; + + st->ss->getinfo_super(st, &info, NULL); + + /* 3/ Check if there is a match in mdadm.conf */ + match = conf_match(st, &info, devname, c->verbose, &rv); + if (!match && rv == 2) + goto out; + + if (match && match->devname && + strcasecmp(match->devname, "<ignore>") == 0) { + if (c->verbose >= 0) + pr_err("array containing %s is explicitly ignored by mdadm.conf\n", + devname); + goto out; + } + + /* 3a/ if not, check for homehost match. If no match, continue + * but don't trust the 'name' in the array. Thus a 'random' minor + * number will be assigned, and the device name will be based + * on that. */ + if (match) + trustworthy = LOCAL; + else if (st->ss->match_home(st, c->homehost) == 1) + trustworthy = LOCAL; + else if (st->ss->match_home(st, "any") == 1) + trustworthy = LOCAL_ANY; + else + trustworthy = FOREIGN; + + if (!match && !conf_test_metadata(st->ss->name, policy, + (trustworthy == LOCAL))) { + if (c->verbose >= 1) + pr_err("%s has metadata type %s for which auto-assembly is disabled\n", + devname, st->ss->name); + goto out; + } + if (trustworthy == LOCAL_ANY) + trustworthy = LOCAL; + + /* There are three possible sources for 'autof': command line, + * ARRAY line in mdadm.conf, or CREATE line in mdadm.conf. 
+ * ARRAY takes precedence, then command line, then + * CREATE. + */ + if (match && match->autof) + c->autof = match->autof; + if (c->autof == 0) + c->autof = ci->autof; + + name_to_use = info.name; + if (name_to_use[0] == 0 && info.array.level == LEVEL_CONTAINER) { + name_to_use = info.text_version; + trustworthy = METADATA; + } + if (name_to_use[0] && trustworthy != LOCAL && + ! c->require_homehost && + conf_name_is_free(name_to_use)) + trustworthy = LOCAL; + + /* strip "hostname:" prefix from name if we have decided + * to treat it as LOCAL + */ + if (trustworthy == LOCAL && strchr(name_to_use, ':') != NULL) + name_to_use = strchr(name_to_use, ':')+1; + + /* 4/ Check if array exists. + */ + if (map_lock(&map)) + pr_err("failed to get exclusive lock on mapfile\n"); + /* Now check we can get O_EXCL. If not, probably "mdadm -A" has + * taken over + */ + dfd = dev_open(devname, O_RDONLY|O_EXCL); + if (dfd < 0) { + if (c->verbose >= 0) + pr_err("cannot reopen %s: %s.\n", + devname, strerror(errno)); + goto out_unlock; + } + /* Cannot hold it open while we add the device to the array, + * so we must release the O_EXCL and depend on the map_lock() + * So now is the best time to remove any partitions. + */ + remove_partitions(dfd); + close(dfd); + dfd = -1; + + mp = map_by_uuid(&map, info.uuid); + if (mp) + mdfd = open_dev(mp->devnm); + else + mdfd = -1; + + if (mdfd < 0) { + + /* Skip the clustered ones. This should be started by + * clustering resource agents + */ + if (info.array.state & (1 << MD_SB_CLUSTERED)) + goto out; + + /* Couldn't find an existing array, maybe make a new one */ + mdfd = create_mddev(match ? 
match->devname : NULL, + name_to_use, c->autof, trustworthy, chosen_name, 0); + + if (mdfd < 0) + goto out_unlock; + + if (sysfs_init(&info, mdfd, NULL)) { + pr_err("unable to initialize sysfs for %s\n", + chosen_name); + rv = 2; + goto out_unlock; + } + + if (set_array_info(mdfd, st, &info) != 0) { + pr_err("failed to set array info for %s: %s\n", + chosen_name, strerror(errno)); + rv = 2; + goto out_unlock; + } + + dinfo = info; + dinfo.disk.major = major(rdev); + dinfo.disk.minor = minor(rdev); + if (add_disk(mdfd, st, &info, &dinfo) != 0) { + pr_err("failed to add %s to new array %s: %s.\n", + devname, chosen_name, strerror(errno)); + ioctl(mdfd, STOP_ARRAY, 0); + rv = 2; + goto out_unlock; + } + sra = sysfs_read(mdfd, NULL, (GET_DEVS | GET_STATE | + GET_OFFSET | GET_SIZE)); + + if (!sra || !sra->devs || sra->devs->disk.raid_disk >= 0) { + /* It really should be 'none' - must be old buggy + * kernel, and mdadm -I may not be able to complete. + * So reject it. + */ + ioctl(mdfd, STOP_ARRAY, NULL); + pr_err("You have an old buggy kernel which cannot support\n --incremental reliably. Aborting.\n"); + rv = 2; + goto out_unlock; + } + info.array.working_disks = 1; + /* 6/ Make sure /var/run/mdadm.map contains this array. */ + map_update(&map, fd2devnm(mdfd), + info.text_version, + info.uuid, chosen_name); + } else { + /* 5b/ if it does */ + /* - check one drive in array to make sure metadata is a reasonably */ + /* close match. Reject if not (e.g. different type) */ + /* - add the device */ + char dn[20]; + int dfd2; + int err; + struct supertype *st2; + struct mdinfo info2, *d; + + sra = sysfs_read(mdfd, NULL, (GET_DEVS | GET_STATE | + GET_OFFSET | GET_SIZE)); + + if (mp->path) + strcpy(chosen_name, mp->path); + else + strcpy(chosen_name, mp->devnm); + + /* It is generally not OK to add non-spare drives to a + * running array as they are probably missing because + * they failed. 
However if runstop is 1, then the + * array was possibly started early and our best bet is + * to add this anyway. + * Also if action policy is re-add or better we allow + * re-add. + * This doesn't apply to containers as the 'non-spare' + * flag has a different meaning. The test has to happen + * at the device level there + */ + if (!st->ss->external && + (info.disk.state & (1 << MD_DISK_SYNC)) != 0 && + !policy_action_allows(policy, st->ss->name, act_re_add) && + c->runstop < 1) { + if (md_array_active(mdfd)) { + pr_err("not adding %s to active array (without --run) %s\n", + devname, chosen_name); + rv = 2; + goto out_unlock; + } + } + if (!sra) { + rv = 2; + goto out_unlock; + } + if (sra->devs) { + sprintf(dn, "%d:%d", sra->devs->disk.major, + sra->devs->disk.minor); + dfd2 = dev_open(dn, O_RDONLY); + if (dfd2 < 0) { + pr_err("unable to open %s\n", devname); + rv = 2; + goto out_unlock; + } + st2 = dup_super(st); + if (st2->ss->load_super(st2, dfd2, NULL) || + st->ss->compare_super(st, st2, 1) != 0) { + pr_err("metadata mismatch between %s and chosen array %s\n", + devname, chosen_name); + close(dfd2); + rv = 2; + goto out_unlock; + } + close(dfd2); + st2->ss->getinfo_super(st2, &info2, NULL); + st2->ss->free_super(st2); + if (info.array.level != info2.array.level || + memcmp(info.uuid, info2.uuid, 16) != 0 || + info.array.raid_disks != info2.array.raid_disks) { + pr_err("unexpected difference between %s and %s.\n", + chosen_name, devname); + rv = 2; + goto out_unlock; + } + } + info.disk.major = major(rdev); + info.disk.minor = minor(rdev); + /* add disk needs to know about containers */ + if (st->ss->external) + sra->array.level = LEVEL_CONTAINER; + + if (info.array.state & (1 << MD_SB_CLUSTERED)) + info.disk.state |= (1 << MD_DISK_CLUSTER_ADD); + + err = add_disk(mdfd, st, sra, &info); + if (err < 0 && errno == EBUSY) { + /* could be another device present with the same + * disk.number. 
Find and reject any such + */ + find_reject(mdfd, st, sra, info.disk.number, + info.events, c->verbose, chosen_name); + err = add_disk(mdfd, st, sra, &info); + } + if (err < 0 && errno == EINVAL && + info.disk.state & (1<<MD_DISK_SYNC)) { + /* Maybe it needs to be added as a spare */ + if (policy_action_allows(policy, st->ss->name, + act_force_spare)) { + info.disk.state &= ~(1<<MD_DISK_SYNC); + err = add_disk(mdfd, st, sra, &info); + } else + if (c->verbose >= 0) + pr_err("can only add %s to %s as a spare, and force-spare is not set.\n", + devname, chosen_name); + } + if (err < 0) { + pr_err("failed to add %s to existing array %s: %s.\n", + devname, chosen_name, strerror(errno)); + rv = 2; + goto out_unlock; + } + info.array.working_disks = 0; + for (d = sra->devs; d; d=d->next) + info.array.working_disks ++; + + } + if (strncmp(chosen_name, "/dev/md/", 8) == 0) + md_devname = chosen_name+8; + else + md_devname = chosen_name; + if (c->export) { + printf("MD_DEVICE=%s\n", fd2devnm(mdfd)); + printf("MD_DEVNAME=%s\n", md_devname); + printf("MD_FOREIGN=%s\n", trustworthy == FOREIGN ? "yes" : "no"); + } + + /* 7/ Is there enough devices to possibly start the array? */ + /* 7a/ if not, finish with success. */ + if (info.array.level == LEVEL_CONTAINER) { + char devnm[32]; + /* Try to assemble within the container */ + sysfs_uevent(sra, "change"); + if (!c->export && c->verbose >= 0) + pr_err("container %s now has %d device%s\n", + chosen_name, info.array.working_disks, + info.array.working_disks == 1?"":"s"); + sysfs_rules_apply(chosen_name, &info); + wait_for(chosen_name, mdfd); + if (st->ss->external) + strcpy(devnm, fd2devnm(mdfd)); + if (st->ss->load_container) + rv = st->ss->load_container(st, mdfd, NULL); + close(mdfd); + sysfs_free(sra); + if (!rv) + rv = Incremental_container(st, chosen_name, c, NULL); + map_unlock(&map); + /* after spare is added, ping monitor for external metadata + * so that it can eg. 
try to rebuild degraded array */ + if (st->ss->external) + ping_monitor(devnm); + return rv; + } + + /* We have added something to the array, so need to re-read the + * state. Eventually this state should be kept up-to-date as + * things change. + */ + sysfs_free(sra); + sra = sysfs_read(mdfd, NULL, (GET_DEVS | GET_STATE | + GET_OFFSET | GET_SIZE)); + active_disks = count_active(st, sra, mdfd, &avail, &info); + + journal_device_missing = (info.journal_device_required) && (info.journal_clean == 0); + + if (info.consistency_policy == CONSISTENCY_POLICY_PPL) + info.array.state |= 1; + + if (enough(info.array.level, info.array.raid_disks, + info.array.layout, info.array.state & 1, avail) == 0) { + if (c->export) { + printf("MD_STARTED=no\n"); + } else if (c->verbose >= 0) + pr_err("%s attached to %s, not enough to start (%d).\n", + devname, chosen_name, active_disks); + rv = 0; + goto out_unlock; + } + + /* 7b/ if yes, */ + /* - if number of OK devices match expected, or -R and there */ + /* are enough, */ + /* + add any bitmap file */ + /* + start the array (auto-readonly). */ + + if (md_array_active(mdfd)) { + if (c->export) { + printf("MD_STARTED=already\n"); + } else if (c->verbose >= 0) + pr_err("%s attached to %s which is already active.\n", + devname, chosen_name); + rv = 0; + goto out_unlock; + } + + map_unlock(&map); + if (c->runstop > 0 || (!journal_device_missing && active_disks >= info.array.working_disks)) { + struct mdinfo *dsk; + /* Let's try to start it */ + + if (journal_device_missing) + pr_err("Trying to run with missing journal device\n"); + if (info.reshape_active && !(info.reshape_active & RESHAPE_NO_BACKUP)) { + pr_err("%s: This array is being reshaped and cannot be started\n", + chosen_name); + cont_err("by --incremental. 
Please use --assemble\n"); + goto out; + } + if (match && match->bitmap_file) { + int bmfd = open(match->bitmap_file, O_RDWR); + if (bmfd < 0) { + pr_err("Could not open bitmap file %s.\n", + match->bitmap_file); + goto out; + } + if (ioctl(mdfd, SET_BITMAP_FILE, bmfd) != 0) { + close(bmfd); + pr_err("Failed to set bitmapfile for %s.\n", + chosen_name); + goto out; + } + close(bmfd); + } + /* Need to remove from the array any devices which + * 'count_active' discerned were too old or inappropriate + */ + for (d = sra ? sra->devs : NULL ; d ; d = d->next) + if (d->disk.state & (1<<MD_DISK_REMOVED)) + remove_disk(mdfd, st, sra, d); + + if ((sra == NULL || active_disks >= info.array.working_disks) && + trustworthy != FOREIGN) + rv = ioctl(mdfd, RUN_ARRAY, NULL); + else + rv = sysfs_set_str(sra, NULL, + "array_state", "read-auto"); + /* Array might be O_EXCL which will interfere with + * fsck and mount. So re-open without O_EXCL. + */ + reopen_mddev(mdfd); + if (rv == 0) { + if (c->export) { + printf("MD_STARTED=yes\n"); + } else if (c->verbose >= 0) + pr_err("%s attached to %s, which has been started.\n", + devname, chosen_name); + rv = 0; + wait_for(chosen_name, mdfd); + /* We just started the array, so some devices + * might have been evicted from the array + * because their event counts were too old. + * If the action=re-add policy is in-force for + * those devices we should re-add them now. 
+ */ + for (dsk = sra->devs; dsk ; dsk = dsk->next) { + if (disk_action_allows(dsk, st->ss->name, + act_re_add) && + add_disk(mdfd, st, sra, dsk) == 0) + pr_err("%s re-added to %s\n", + dsk->sys_name, chosen_name); + } + } else { + pr_err("%s attached to %s, but failed to start: %s.\n", + devname, chosen_name, strerror(errno)); + rv = 1; + } + } else { + if (c->export) { + printf("MD_STARTED=unsafe\n"); + } else if (journal_device_missing) { + pr_err("Journal device is missing, not safe to start yet.\n"); + } else if (c->verbose >= 0) + pr_err("%s attached to %s, not enough to start safely.\n", + devname, chosen_name); + rv = 0; + } +out: + free(avail); + if (dfd >= 0) + close(dfd); + if (mdfd >= 0) + close(mdfd); + if (policy) + dev_policy_free(policy); + sysfs_free(sra); + return rv; +out_unlock: + map_unlock(&map); + goto out; +} + +static void find_reject(int mdfd, struct supertype *st, struct mdinfo *sra, + int number, __u64 events, int verbose, + char *array_name) +{ + /* Find a device attached to this array with a disk.number of number + * and events less than the passed events, and remove the device. 
+ */ + struct mdinfo *d; + + if (md_array_active(mdfd)) + return; /* not safe to remove from active arrays + * without thinking more */ + + for (d = sra->devs; d ; d = d->next) { + char dn[24]; // 2*11 bytes for ints (including sign) + colon + null byte + int dfd; + struct mdinfo info; + sprintf(dn, "%d:%d", d->disk.major, d->disk.minor); + dfd = dev_open(dn, O_RDONLY); + if (dfd < 0) + continue; + if (st->ss->load_super(st, dfd, NULL)) { + close(dfd); + continue; + } + st->ss->getinfo_super(st, &info, NULL); + st->ss->free_super(st); + close(dfd); + + if (info.disk.number != number || info.events >= events) + continue; + + if (d->disk.raid_disk > -1) + sysfs_set_str(sra, d, "slot", "none"); + if (sysfs_set_str(sra, d, "state", "remove") == 0) + if (verbose >= 0) + pr_err("removing old device %s from %s\n", + d->sys_name+4, array_name); + } +} + +static int count_active(struct supertype *st, struct mdinfo *sra, + int mdfd, char **availp, + struct mdinfo *bestinfo) +{ + /* count how many devices in sra think they are active */ + struct mdinfo *d; + int cnt = 0; + int replcnt = 0; + __u64 max_events = 0; + __u64 max_journal_events = 0; + char *avail = NULL; + int *best = NULL; + char *devmap = NULL; + int numdevs = 0; + int devnum; + int b, i; + int raid_disks = 0; + + if (!sra) + return 0; + + for (d = sra->devs ; d ; d = d->next) + numdevs++; + for (d = sra->devs, devnum = 0 ; d ; d = d->next, devnum++) { + char dn[30]; + int dfd; + int ok; + struct mdinfo info; + + sprintf(dn, "%d:%d", d->disk.major, d->disk.minor); + dfd = dev_open(dn, O_RDONLY); + if (dfd < 0) + continue; + ok = st->ss->load_super(st, dfd, NULL); + close(dfd); + if (ok != 0) + continue; + + info.array.raid_disks = raid_disks; + st->ss->getinfo_super(st, &info, devmap + raid_disks * devnum); + if (info.disk.raid_disk == MD_DISK_ROLE_JOURNAL && + info.events > max_journal_events) + max_journal_events = info.events; + if (!avail) { + raid_disks = info.array.raid_disks; + avail = xcalloc(raid_disks, 
1); + *availp = avail; + + best = xcalloc(raid_disks, sizeof(int)); + devmap = xcalloc(raid_disks, numdevs); + + st->ss->getinfo_super(st, &info, devmap); + } + + if (info.disk.state & (1<<MD_DISK_SYNC)) + { + if (cnt == 0) { + cnt++; + max_events = info.events; + avail[info.disk.raid_disk] = 2; + best[info.disk.raid_disk] = devnum; + st->ss->getinfo_super(st, bestinfo, NULL); + } else if (info.events == max_events) { + avail[info.disk.raid_disk] = 2; + best[info.disk.raid_disk] = devnum; + } else if (info.events == max_events-1) { + if (avail[info.disk.raid_disk] == 0) { + avail[info.disk.raid_disk] = 1; + best[info.disk.raid_disk] = devnum; + } + } else if (info.events < max_events - 1) + ; + else if (info.events == max_events+1) { + int i; + max_events = info.events; + for (i = 0; i < raid_disks; i++) + if (avail[i]) + avail[i]--; + avail[info.disk.raid_disk] = 2; + best[info.disk.raid_disk] = devnum; + st->ss->getinfo_super(st, bestinfo, NULL); + } else { /* info.events much bigger */ + memset(avail, 0, raid_disks); + max_events = info.events; + avail[info.disk.raid_disk] = 2; + best[info.disk.raid_disk] = devnum; + st->ss->getinfo_super(st, bestinfo, NULL); + } + } else if (info.disk.state & (1<<MD_DISK_REPLACEMENT)) + replcnt++; + st->ss->free_super(st); + } + if (max_journal_events >= max_events - 1) + bestinfo->journal_clean = 1; + + if (!avail) + return 0; + /* We need to reject any device that thinks the best device is + * failed or missing */ + for (b = 0; b < raid_disks; b++) + if (avail[b] == 2) + break; + cnt = 0; + for (i = 0 ; i < raid_disks ; i++) { + if (i != b && avail[i]) + if (devmap[raid_disks * best[i] + b] == 0) { + /* This device thinks 'b' is failed - + * don't use it */ + devnum = best[i]; + for (d=sra->devs ; devnum; d = d->next) + devnum--; + d->disk.state |= (1 << MD_DISK_REMOVED); + avail[i] = 0; + } + if (avail[i]) + cnt++; + } + /* Also need to reject any spare device with an event count that + * is too high + */ + for (d = 
sra->devs; d; d = d->next) { + if (!(d->disk.state & (1<<MD_DISK_SYNC)) && + d->events > max_events) + d->disk.state |= (1 << MD_DISK_REMOVED); + } + free(best); + free(devmap); + return cnt + replcnt; +} + +/* test if container has degraded member(s) */ +static int +container_members_max_degradation(struct map_ent *map, struct map_ent *me) +{ + struct mdinfo *sra; + int degraded, max_degraded = 0; + + for(; map; map = map->next) { + if (!metadata_container_matches(map->metadata, me->devnm)) + continue; + /* most accurate information regarding array degradation */ + sra = sysfs_read(-1, map->devnm, + GET_DISKS | GET_DEVS | GET_STATE); + if (!sra) + continue; + degraded = sra->array.raid_disks - sra->array.active_disks - + sra->array.spare_disks; + if (degraded > max_degraded) + max_degraded = degraded; + sysfs_free(sra); + } + + return max_degraded; +} + +static int array_try_spare(char *devname, int *dfdp, struct dev_policy *pol, + struct map_ent *target, int bare, + struct supertype *st, int verbose) +{ + /* This device doesn't have any md metadata + * The device policy allows 'spare' and if !bare, it allows spare-same-slot. + * If 'st' is not set, then we only know that some metadata allows this, + * others possibly don't. + * So look for a container or array to attach the device to. + * Prefer 'target' if that is set and the array is found. + * + * If st is set, then only arrays of that type are considered + * Return 0 on success, or some exit code on failure, probably 1. + */ + int rv = 1; + dev_t rdev; + struct map_ent *mp, *map = NULL; + struct mdinfo *chosen = NULL; + int dfd = *dfdp; + + if (!fstat_is_blkdev(dfd, devname, &rdev)) + return 1; + + /* + * Now we need to find a suitable array to add this to. 
+ * We only accept arrays that: + * - match 'st' + * - are in the same domains as the device + * - are of an size for which the device will be useful + * and we choose the one that is the most degraded + */ + + if (map_lock(&map)) { + pr_err("failed to get exclusive lock on mapfile\n"); + return 1; + } + for (mp = map ; mp ; mp = mp->next) { + struct supertype *st2; + struct domainlist *dl = NULL; + struct mdinfo *sra; + unsigned long long devsize, freesize = 0; + struct spare_criteria sc = {0, 0}; + + if (is_subarray(mp->metadata)) + continue; + if (st) { + st2 = st->ss->match_metadata_desc(mp->metadata); + if (!st2 || + (st->minor_version >= 0 && + st->minor_version != st2->minor_version)) { + if (verbose > 1) + pr_err("not adding %s to %s as metadata type doesn't match\n", + devname, mp->path); + free(st2); + continue; + } + free(st2); + } + sra = sysfs_read(-1, mp->devnm, + GET_DEVS|GET_OFFSET|GET_SIZE|GET_STATE| + GET_COMPONENT|GET_VERSION); + if (sra) + sra->array.failed_disks = -1; + else + continue; + if (st == NULL) { + int i; + st2 = NULL; + for(i = 0; !st2 && superlist[i]; i++) + st2 = superlist[i]->match_metadata_desc( + sra->text_version); + if (!st2) { + if (verbose > 1) + pr_err("not adding %s to %s as metadata not recognised.\n", + devname, mp->path); + goto next; + } + /* Need to double check the 'act_spare' permissions applies + * to this metadata. 
+ */ + if (!policy_action_allows(pol, st2->ss->name, act_spare)) + goto next; + if (!bare && !policy_action_allows(pol, st2->ss->name, + act_spare_same_slot)) + goto next; + } else + st2 = st; + /* update number of failed disks for mostly degraded + * container member */ + if (sra->array.failed_disks == -1) + sra->array.failed_disks = container_members_max_degradation(map, mp); + + get_dev_size(dfd, NULL, &devsize); + if (sra->component_size == 0) { + /* true for containers, here we must read superblock + * to obtain minimum spare size */ + struct supertype *st3 = dup_super(st2); + int mdfd = open_dev(mp->devnm); + if (mdfd < 0) { + free(st3); + goto next; + } + if (st3->ss->load_container && + !st3->ss->load_container(st3, mdfd, mp->path)) { + if (st3->ss->get_spare_criteria) + st3->ss->get_spare_criteria(st3, &sc); + st3->ss->free_super(st3); + } + free(st3); + close(mdfd); + } + if ((sra->component_size > 0 && + st2->ss->validate_geometry(st2, sra->array.level, sra->array.layout, + sra->array.raid_disks, &sra->array.chunk_size, + sra->component_size, + sra->devs ? sra->devs->data_offset : INVALID_SECTORS, + devname, &freesize, sra->consistency_policy, + 0) && + freesize < sra->component_size) || + (sra->component_size == 0 && devsize < sc.min_size)) { + if (verbose > 1) + pr_err("not adding %s to %s as it is too small\n", + devname, mp->path); + goto next; + } + /* test against target. + * If 'target' is set and 'bare' is false, we only accept + * arrays/containers that match 'target'. + * If 'target' is set and 'bare' is true, we prefer the + * array which matches 'target'. + * target is considered only if we deal with degraded array + */ + if (target && policy_action_allows(pol, st2->ss->name, + act_spare_same_slot)) { + if (strcmp(target->metadata, mp->metadata) == 0 && + memcmp(target->uuid, mp->uuid, + sizeof(target->uuid)) == 0 && + sra->array.failed_disks > 0) { + /* This is our target!! 
*/ + sysfs_free(chosen); + chosen = sra; + sra = NULL; + /* skip to end so we don't check any more */ + while (mp->next) + mp = mp->next; + goto next; + } + /* not our target */ + if (!bare) + goto next; + } + + dl = domain_from_array(sra, st2->ss->name); + if (domain_test(dl, pol, st2->ss->name) != 1) { + /* domain test fails */ + if (verbose > 1) + pr_err("not adding %s to %s as it is not in a compatible domain\n", + devname, mp->path); + + goto next; + } + /* all tests passed, OK to add to this array */ + if (!chosen) { + chosen = sra; + sra = NULL; + } else if (chosen->array.failed_disks < sra->array.failed_disks) { + sysfs_free(chosen); + chosen = sra; + sra = NULL; + } + next: + sysfs_free(sra); + if (st != st2) + free(st2); + if (dl) + domain_free(dl); + } + if (chosen) { + /* add current device to chosen array as a spare */ + int mdfd = open_dev(chosen->sys_name); + if (mdfd >= 0) { + struct mddev_dev devlist; + char chosen_devname[24]; // 2*11 for int (including signs) + colon + null + devlist.next = NULL; + devlist.used = 0; + devlist.writemostly = FlagDefault; + devlist.failfast = FlagDefault; + devlist.devname = chosen_devname; + sprintf(chosen_devname, "%d:%d", major(rdev), + minor(rdev)); + devlist.disposition = 'a'; + close(dfd); + *dfdp = -1; + rv = Manage_subdevs(chosen->sys_name, mdfd, &devlist, + -1, 0, NULL, 0); + close(mdfd); + } + if (verbose > 0) { + if (rv == 0) + pr_err("added %s as spare for %s\n", + devname, chosen->sys_name); + else + pr_err("failed to add %s as spare for %s\n", + devname, chosen->sys_name); + } + sysfs_free(chosen); + } + map_unlock(&map); + return rv; +} + +static int partition_try_spare(char *devname, int *dfdp, struct dev_policy *pol, + struct supertype *st, int verbose) +{ + /* we know that at least one partition virtual-metadata is + * allowed to incorporate spares like this device. We need to + * find a suitable device to copy partition information from. 
+ * + * Getting a list of all disk (not partition) devices is + * slightly non-trivial. We could look at /sys/block, but + * that is theoretically due to be removed. Maybe best to use + * /dev/disk/by-path/?* and ignore names ending '-partNN' as + * we depend on this directory of 'path' info. But that fails + * to find loop devices and probably others. Maybe don't + * worry about that, they aren't the real target. + * + * So: check things in /dev/disk/by-path to see if they are in + * a compatible domain, then load the partition table and see + * if it is OK for the new device, and choose the largest + * partition table that fits. + */ + DIR *dir; + struct dirent *de; + char *chosen = NULL; + unsigned long long chosen_size = 0; + struct supertype *chosen_st = NULL; + int fd; + + dir = opendir("/dev/disk/by-path"); + if (!dir) + return 1; + while ((de = readdir(dir)) != NULL) { + char *ep; + struct dev_policy *pol2 = NULL; + struct domainlist *domlist = NULL; + int fd = -1; + struct mdinfo info; + struct supertype *st2 = NULL; + char *devname = NULL; + unsigned long long devsectors; + char *pathlist[2]; + + if (de->d_ino == 0 || de->d_name[0] == '.' || + (de->d_type != DT_LNK && de->d_type != DT_UNKNOWN)) + goto next; + + ep = de->d_name + strlen(de->d_name); + while (ep > de->d_name && + isdigit(ep[-1])) + ep--; + if (ep > de->d_name + 5 && + strncmp(ep-5, "-part", 5) == 0) + /* This is a partition - skip it */ + goto next; + + pathlist[0] = de->d_name; + pathlist[1] = NULL; + pol2 = path_policy(pathlist, type_disk); + + domain_merge(&domlist, pol2, st ? st->ss->name : NULL); + if (domain_test(domlist, pol, st ? st->ss->name : NULL) != 1) + /* new device is incompatible with this device. 
*/ + goto next; + + domain_free(domlist); + domlist = NULL; + + if (asprintf(&devname, "/dev/disk/by-path/%s", de->d_name) != 1) { + devname = NULL; + goto next; + } + fd = open(devname, O_RDONLY); + if (fd < 0) + goto next; + if (get_dev_size(fd, devname, &devsectors) == 0) + goto next; + devsectors >>= 9; + + if (st) + st2 = dup_super(st); + else + st2 = guess_super_type(fd, guess_partitions); + if (st2 == NULL || st2->ss->load_super(st2, fd, NULL) < 0) + goto next; + st2->ignore_hw_compat = 0; + + if (!st) { + /* Check domain policy again, this time referring to metadata */ + domain_merge(&domlist, pol2, st2->ss->name); + if (domain_test(domlist, pol, st2->ss->name) != 1) + /* Incompatible devices for this metadata type */ + goto next; + if (!policy_action_allows(pol, st2->ss->name, act_spare)) + /* Some partition types allow sparing, but not + * this one. + */ + goto next; + } + + st2->ss->getinfo_super(st2, &info, NULL); + if (info.component_size > devsectors) + /* This partitioning doesn't fit in the device */ + goto next; + + /* This is an acceptable device to copy partition + * metadata from. We could just stop here, but I + * think I want to keep looking incase a larger + * metadata which makes better use of the device can + * be found. + */ + if (chosen == NULL || chosen_size < info.component_size) { + chosen_size = info.component_size; + free(chosen); + chosen = devname; + devname = NULL; + if (chosen_st) { + chosen_st->ss->free_super(chosen_st); + free(chosen_st); + } + chosen_st = st2; + st2 = NULL; + } + + next: + free(devname); + domain_free(domlist); + dev_policy_free(pol2); + if (st2) + st2->ss->free_super(st2); + free(st2); + + if (fd >= 0) + close(fd); + } + + closedir(dir); + + if (!chosen) + return 1; + + /* 'chosen' is the best device we can find. 
Let's write its + * metadata to devname dfd is read-only so don't use that + */ + fd = open(devname, O_RDWR); + if (fd >= 0) { + chosen_st->ss->store_super(chosen_st, fd); + close(fd); + } + free(chosen); + chosen_st->ss->free_super(chosen_st); + free(chosen_st); + return 0; +} + +static int is_bare(int dfd) +{ + unsigned long long size = 0; + char bufpad[4096 + 4096]; + char *buf = (char*)(((long)bufpad + 4096) & ~4095); + + if (lseek(dfd, 0, SEEK_SET) != 0 || + read(dfd, buf, 4096) != 4096) + return 0; + + if (buf[0] != '\0' && buf[0] != '\x5a' && buf[0] != '\xff') + return 0; + if (memcmp(buf, buf+1, 4095) != 0) + return 0; + + /* OK, first 4K appear blank, try the end. */ + get_dev_size(dfd, NULL, &size); + if (lseek(dfd, size-4096, SEEK_SET) < 0 || + read(dfd, buf, 4096) != 4096) + return 0; + + if (buf[0] != '\0' && buf[0] != '\x5a' && buf[0] != '\xff') + return 0; + if (memcmp(buf, buf+1, 4095) != 0) + return 0; + + return 1; +} + +/* adding a spare to a regular array is quite different from adding one to + * a set-of-partitions virtual array. + * This function determines which is worth trying and tries as appropriate. + * Arrays are given priority over partitions. + */ +static int try_spare(char *devname, int *dfdp, struct dev_policy *pol, + struct map_ent *target, + struct supertype *st, int verbose) +{ + int i; + int rv; + int arrays_ok = 0; + int partitions_ok = 0; + int dfd = *dfdp; + int bare; + + /* Can only add a spare if device has at least one domain */ + if (pol_find(pol, pol_domain) == NULL) + return 1; + /* And only if some action allows spares */ + if (!policy_action_allows(pol, st?st->ss->name:NULL, act_spare)) + return 1; + + /* Now check if the device is bare. + * bare devices can always be added as a spare + * non-bare devices can only be added if spare-same-slot is permitted, + * and this device is replacing a previous device - in which case 'target' + * will be set. 
+ */ + if (!is_bare(dfd)) { + /* Must have a target and allow same_slot */ + /* Later - may allow force_spare without target */ + if (!target || + !policy_action_allows(pol, st?st->ss->name:NULL, + act_spare_same_slot)) { + if (verbose > 1) + pr_err("%s is not bare, so not considering as a spare\n", + devname); + return 1; + } + bare = 0; + } else + bare = 1; + + /* It might be OK to add this device to an array - need to see + * what arrays might be candidates. + */ + if (st) { + /* just try to add 'array' or 'partition' based on this metadata */ + if (st->ss->add_to_super) + return array_try_spare(devname, dfdp, pol, target, bare, + st, verbose); + else + return partition_try_spare(devname, dfdp, pol, + st, verbose); + } + /* No metadata was specified or found so options are open. + * Check for whether any array metadata, or any partition metadata + * might allow adding the spare. This check is just help to avoid + * a more costly scan of all arrays when we can be sure that will + * fail. + */ + for (i = 0; (!arrays_ok || !partitions_ok) && superlist[i] ; i++) { + if (superlist[i]->add_to_super && !arrays_ok && + policy_action_allows(pol, superlist[i]->name, act_spare)) + arrays_ok = 1; + if (superlist[i]->add_to_super == NULL && !partitions_ok && + policy_action_allows(pol, superlist[i]->name, act_spare)) + partitions_ok = 1; + } + rv = 1; + if (arrays_ok) + rv = array_try_spare(devname, dfdp, pol, target, bare, + st, verbose); + if (rv != 0 && partitions_ok) + rv = partition_try_spare(devname, dfdp, pol, st, verbose); + return rv; +} + +int IncrementalScan(struct context *c, char *devnm) +{ + /* look at every device listed in the 'map' file. + * If one is found that is not running then: + * look in mdadm.conf for bitmap file. + * if one exists, but array has none, add it. 
+ * try to start array in auto-readonly mode + */ + struct map_ent *mapl = NULL; + struct map_ent *me; + struct mddev_ident *devs, *mddev; + int rv = 0; + char container[32]; + char *only = NULL; + + map_read(&mapl); + devs = conf_get_ident(NULL); + +restart: + for (me = mapl ; me ; me = me->next) { + struct mdinfo *sra; + int mdfd; + + if (devnm && strcmp(devnm, me->devnm) != 0) + continue; + if (me->metadata[0] == '/') { + char *sl; + + if (!devnm) + continue; + + /* member array, need to work on container */ + strncpy(container, me->metadata+1, 32); + container[31] = 0; + sl = strchr(container, '/'); + if (sl) + *sl = 0; + only = devnm; + devnm = container; + goto restart; + } + mdfd = open_dev(me->devnm); + + if (!is_fd_valid(mdfd)) + continue; + if (!isdigit(me->metadata[0])) { + /* must be a container */ + struct supertype *st = super_by_fd(mdfd, NULL); + int ret = 0; + struct map_ent *map = NULL; + + if (st && st->ss->load_container) + ret = st->ss->load_container(st, mdfd, NULL); + close_fd(&mdfd); + if (!ret && st && st->ss->container_content) { + if (map_lock(&map)) + pr_err("failed to get exclusive lock on mapfile\n"); + ret = Incremental_container(st, me->path, c, only); + map_unlock(&map); + } + if (ret) + rv = 1; + continue; + } + if (md_array_active(mdfd)) { + close_fd(&mdfd); + continue; + } + /* Ok, we can try this one. 
Maybe it needs a bitmap */ + for (mddev = devs ; mddev ; mddev = mddev->next) + if (mddev->devname && me->path && + devname_matches(mddev->devname, me->path)) + break; + if (mddev && mddev->bitmap_file) { + /* + * Note: early kernels will wrongly fail this, so it + * is a hint only + */ + int added = -1; + int bmfd; + + bmfd = open(mddev->bitmap_file, O_RDWR); + if (is_fd_valid(bmfd)) { + added = ioctl(mdfd, SET_BITMAP_FILE, bmfd); + close_fd(&bmfd); + } + if (c->verbose >= 0) { + if (added == 0) + pr_err("Added bitmap %s to %s\n", + mddev->bitmap_file, me->path); + else if (errno != EEXIST) + pr_err("Failed to add bitmap to %s: %s\n", + me->path, strerror(errno)); + } + } + /* FIXME check for reshape_active and consider not + * starting array. + */ + sra = sysfs_read(mdfd, NULL, 0); + if (sra) { + if (sysfs_set_str(sra, NULL, + "array_state", "read-auto") == 0) { + if (c->verbose >= 0) + pr_err("started array %s\n", + me->path ?: me->devnm); + } else { + pr_err("failed to start array %s: %s\n", + me->path ?: me->devnm, + strerror(errno)); + rv = 1; + } + sysfs_free(sra); + } + close_fd(&mdfd); + } + map_free(mapl); + return rv; +} + +static char *container2devname(char *devname) +{ + char *mdname = NULL; + + if (devname[0] == '/') { + int fd = open(devname, O_RDONLY); + if (fd >= 0) { + mdname = xstrdup(fd2devnm(fd)); + close(fd); + } + } else { + int uuid[4]; + struct map_ent *mp, *map = NULL; + + if (!parse_uuid(devname, uuid)) + return mdname; + mp = map_by_uuid(&map, uuid); + if (mp) + mdname = xstrdup(mp->devnm); + map_free(map); + } + + return mdname; +} + +static int Incremental_container(struct supertype *st, char *devname, + struct context *c, char *only) +{ + /* Collect the contents of this container and for each + * array, choose a device name and assemble the array. 
+ */ + + struct mdinfo *list; + struct mdinfo *ra; + struct map_ent *map = NULL; + struct mdinfo info; + int trustworthy; + struct mddev_ident *match; + int rv = 0; + int result = 0; + + st->ss->getinfo_super(st, &info, NULL); + + if ((c->runstop > 0 && info.container_enough >= 0) || + info.container_enough > 0) + /* pass */; + else { + if (c->export) { + printf("MD_STARTED=no\n"); + } else if (c->verbose) + pr_err("not enough devices to start the container\n"); + return 0; + } + + match = conf_match(st, &info, devname, c->verbose, &rv); + if (match == NULL && rv == 2) + return rv; + + /* Need to compute 'trustworthy' */ + if (match) + trustworthy = LOCAL; + else if (st->ss->match_home(st, c->homehost) == 1) + trustworthy = LOCAL; + else if (st->ss->match_home(st, "any") == 1) + trustworthy = LOCAL; + else + trustworthy = FOREIGN; + + list = st->ss->container_content(st, NULL); + /* when nothing to activate - quit */ + if (list == NULL) { + if (c->export) { + printf("MD_STARTED=nothing\n"); + } + return 0; + } + for (ra = list ; ra ; ra = ra->next) { + int mdfd; + char chosen_name[1024]; + struct map_ent *mp; + struct mddev_ident *match = NULL; + + /* do not activate arrays blocked by metadata handler */ + if (ra->array.state & (1 << MD_SB_BLOCK_VOLUME)) { + pr_err("Cannot activate array %s in %s.\n", + ra->text_version, devname); + continue; + } + mp = map_by_uuid(&map, ra->uuid); + + if (mp) { + mdfd = open_dev(mp->devnm); + if (mp->path) + strcpy(chosen_name, mp->path); + else + strcpy(chosen_name, mp->devnm); + } else if (!only) { + + /* Check in mdadm.conf for container == devname and + * member == ra->text_version after second slash. 
+ */ + char *sub = strchr(ra->text_version+1, '/'); + struct mddev_ident *array_list; + if (sub) { + sub++; + array_list = conf_get_ident(NULL); + } else + array_list = NULL; + for(; array_list ; array_list = array_list->next) { + char *dn; + if (array_list->member == NULL || + array_list->container == NULL) + continue; + if (strcmp(array_list->member, sub) != 0) + continue; + if (array_list->uuid_set && + !same_uuid(ra->uuid, array_list->uuid, st->ss->swapuuid)) + continue; + dn = container2devname(array_list->container); + if (dn == NULL) + continue; + if (strncmp(dn, ra->text_version+1, + strlen(dn)) != 0 || + ra->text_version[strlen(dn)+1] != '/') { + free(dn); + continue; + } + free(dn); + /* we have a match */ + match = array_list; + if (c->verbose>0) + pr_err("match found for member %s\n", + array_list->member); + break; + } + + if (match && match->devname && + strcasecmp(match->devname, "<ignore>") == 0) { + if (c->verbose > 0) + pr_err("array %s/%s is explicitly ignored by mdadm.conf\n", + match->container, match->member); + continue; + } + if (match) + trustworthy = LOCAL; + + mdfd = create_mddev(match ? 
match->devname : NULL, + ra->name, + c->autof, + trustworthy, + chosen_name, 0); + } + if (only && (!mp || strcmp(mp->devnm, only) != 0)) + continue; + + if (mdfd < 0) { + pr_err("failed to open %s: %s.\n", + chosen_name, strerror(errno)); + return 2; + } + + assemble_container_content(st, mdfd, ra, c, + chosen_name, &result); + map_free(map); + map = NULL; + close(mdfd); + } + if (c->export && result) { + char sep = '='; + printf("MD_STARTED"); + if (result & INCR_NO) { + printf("%cno", sep); + sep = ','; + } + if (result & INCR_UNSAFE) { + printf("%cunsafe", sep); + sep = ','; + } + if (result & INCR_ALREADY) { + printf("%calready", sep); + sep = ','; + } + if (result & INCR_YES) { + printf("%cyes", sep); + sep = ','; + } + printf("\n"); + } + return 0; +} + +static void run_udisks(char *arg1, char *arg2) +{ + int pid = fork(); + int status; + if (pid == 0) { + manage_fork_fds(1); + execl("/usr/bin/udisks", "udisks", arg1, arg2, NULL); + execl("/bin/udisks", "udisks", arg1, arg2, NULL); + exit(1); + } + while (pid > 0 && wait(&status) != pid) + ; +} + +static int force_remove(char *devnm, int fd, struct mdinfo *mdi, int verbose) +{ + int rv; + int devid = devnm2devid(devnm); + + run_udisks("--unmount", map_dev(major(devid), minor(devid), 0)); + rv = Manage_stop(devnm, fd, verbose, 1); + if (rv) { + /* At least we can try to trigger a 'remove' */ + sysfs_uevent(mdi, "remove"); + if (verbose) + pr_err("Fail to stop %s too.\n", devnm); + } + return rv; +} + +static void remove_from_member_array(struct mdstat_ent *memb, + struct mddev_dev *devlist, int verbose) +{ + int rv; + struct mdinfo mmdi; + int subfd = open_dev(memb->devnm); + + if (subfd >= 0) { + rv = Manage_subdevs(memb->devnm, subfd, devlist, verbose, + 0, NULL, 0); + if (rv & 2) { + if (sysfs_init(&mmdi, -1, memb->devnm)) + pr_err("unable to initialize sysfs for: %s\n", + memb->devnm); + else + force_remove(memb->devnm, subfd, &mmdi, + verbose); + } + close(subfd); + } +} + +/* + * IncrementalRemove - 
Attempt to see if the passed in device belongs to any + * raid arrays, and if so first fail (if needed) and then remove the device. + * + * @devname - The device we want to remove + * @id_path - name as found in /dev/disk/by-path for this device + * + * Note: the device name must be a kernel name like "sda", so + * that we can find it in /proc/mdstat + */ +int IncrementalRemove(char *devname, char *id_path, int verbose) +{ + int mdfd; + int rv = 0; + struct mdstat_ent *ent; + struct mddev_dev devlist; + struct mdinfo mdi; + char buf[32]; + + if (!id_path) + dprintf("incremental removal without --path <id_path> lacks the possibility to re-add new device in this port\n"); + + if (strchr(devname, '/')) { + pr_err("incremental removal requires a kernel device name, not a file: %s\n", devname); + return 1; + } + ent = mdstat_by_component(devname); + if (!ent) { + if (verbose >= 0) + pr_err("%s does not appear to be a component of any array\n", devname); + return 1; + } + if (sysfs_init(&mdi, -1, ent->devnm)) { + pr_err("unable to initialize sysfs for: %s\n", devname); + return 1; + } + mdfd = open_dev_excl(ent->devnm); + if (is_fd_valid(mdfd)) { + close_fd(&mdfd); + if (sysfs_get_str(&mdi, NULL, "array_state", + buf, sizeof(buf)) > 0) { + if (strncmp(buf, "active", 6) == 0 || + strncmp(buf, "clean", 5) == 0) + sysfs_set_str(&mdi, NULL, + "array_state", "read-auto"); + } + } + mdfd = open_dev(ent->devnm); + if (mdfd < 0) { + if (verbose >= 0) + pr_err("Cannot open array %s!!\n", ent->devnm); + free_mdstat(ent); + return 1; + } + + if (id_path) { + struct map_ent *map = NULL, *me; + me = map_by_devnm(&map, ent->devnm); + if (me) + policy_save_path(id_path, me); + map_free(map); + } + + memset(&devlist, 0, sizeof(devlist)); + devlist.devname = devname; + devlist.disposition = 'f'; + /* for a container, we must fail each member array */ + if (ent->metadata_version && + strncmp(ent->metadata_version, "external:", 9) == 0) { + struct mdstat_ent *mdstat = mdstat_read(0, 0); + 
struct mdstat_ent *memb; + for (memb = mdstat ; memb ; memb = memb->next) { + if (is_container_member(memb, ent->devnm)) + remove_from_member_array(memb, + &devlist, verbose); + } + free_mdstat(mdstat); + } else { + rv |= Manage_subdevs(ent->devnm, mdfd, &devlist, + verbose, 0, NULL, 0); + if (rv & 2) { + /* Failed due to EBUSY, try to stop the array. + * Give udisks a chance to unmount it first. + */ + rv = force_remove(ent->devnm, mdfd, &mdi, verbose); + goto end; + } + } + + devlist.disposition = 'r'; + rv = Manage_subdevs(ent->devnm, mdfd, &devlist, + verbose, 0, NULL, 0); +end: + close(mdfd); + free_mdstat(ent); + return rv; +} @@ -0,0 +1,147 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + * + * Added by Dale Stephenson + * steph@snapserver.com + */ + +#include "mdadm.h" +#include "md_u.h" +#include "md_p.h" + +int Kill(char *dev, struct supertype *st, int force, int verbose, int noexcl) +{ + /* + * Nothing fancy about Kill. It just zeroes out a superblock + * Definitely not safe. 
+ * Returns: + * 0 - a zero superblock was successfully written out + * 1 - failed to write the zero superblock + * 2 - failed to open the device. + * 4 - failed to find a superblock. + */ + + int fd, rv = 0; + + if (force) + noexcl = 1; + fd = open(dev, O_RDWR|(noexcl ? 0 : O_EXCL)); + if (fd < 0) { + if (verbose >= 0) + pr_err("Couldn't open %s for write - not zeroing\n", + dev); + return 2; + } + if (st == NULL) + st = guess_super(fd); + if (st == NULL || st->ss->init_super == NULL) { + if (verbose >= 0) + pr_err("Unrecognised md component device - %s\n", dev); + close(fd); + return 4; + } + st->ignore_hw_compat = 1; + rv = st->ss->load_super(st, fd, dev); + if (rv == 0 || (force && rv >= 2)) { + st->ss->free_super(st); + st->ss->init_super(st, NULL, NULL, "", NULL, NULL, + INVALID_SECTORS); + if (st->ss->store_super(st, fd)) { + if (verbose >= 0) + pr_err("Could not zero superblock on %s\n", + dev); + rv = 1; + } else if (rv) { + if (verbose >= 0) + pr_err("superblock zeroed anyway\n"); + rv = 0; + } + } + close(fd); + return rv; +} + +int Kill_subarray(char *dev, char *subarray, int verbose) +{ + /* Delete a subarray out of a container, the subarry must be + * inactive. The subarray string must be a subarray index + * number. 
+ * + * 0 = successfully deleted subarray from all container members + * 1 = failed to sync metadata to one or more devices + * 2 = failed to find the container, subarray, or other resource + * issue + */ + struct supertype supertype, *st = &supertype; + int fd, rv = 2; + + memset(st, 0, sizeof(*st)); + + fd = open_subarray(dev, subarray, st, verbose < 0); + if (fd < 0) + return 2; + + if (!st->ss->kill_subarray) { + if (verbose >= 0) + pr_err("Operation not supported for %s metadata\n", + st->ss->name); + goto free_super; + } + + if (is_subarray_active(subarray, st->devnm)) { + if (verbose >= 0) + pr_err("Subarray-%s still active, aborting\n", + subarray); + goto free_super; + } + + if (mdmon_running(st->devnm)) + st->update_tail = &st->updates; + + /* ok we've found our victim, drop the axe */ + rv = st->ss->kill_subarray(st, subarray); + if (rv) { + if (verbose >= 0) + pr_err("Failed to delete subarray-%s from %s\n", + subarray, dev); + goto free_super; + } + + /* FIXME these routines do not report success/failure */ + if (st->update_tail) + flush_metadata_updates(st); + else + st->ss->sync_metadata(st); + + if (verbose >= 0) + pr_err("Deleted subarray-%s from %s, UUIDs may have changed\n", + subarray, dev); + + rv = 0; + + free_super: + st->ss->free_super(st); + close(fd); + + return rv; +} diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..2a51d81 --- /dev/null +++ b/Makefile @@ -0,0 +1,332 @@ +# +# mdadm - manage Linux "md" devices aka RAID arrays. +# +# Copyright (C) 2001-2002 Neil Brown <neilb@cse.unsw.edu.au> +# Copyright (C) 2013 Neil Brown <neilb@suse.de> +# +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# +# Author: Neil Brown +# Email: <neilb@cse.unsw.edu.au> +# Paper: Neil Brown +# School of Computer Science and Engineering +# The University of New South Wales +# Sydney, 2052 +# Australia +# + +# define "CXFLAGS" to give extra flags to CC. +# e.g. make CXFLAGS=-O to optimise +CXFLAGS ?=-O2 +TCC = tcc +UCLIBC_GCC = $(shell for nm in i386-uclibc-linux-gcc i386-uclibc-gcc; do which $$nm > /dev/null && { echo $$nm ; exit; } ; done; echo false No uclibc found ) +#DIET_GCC = diet gcc +# sorry, but diet-libc doesn't know about posix_memalign, +# so we cannot use it any more. 
+DIET_GCC = gcc -DHAVE_STDINT_H + +KLIBC=/home/src/klibc/klibc-0.77 + +KLIBC_GCC = gcc -nostdinc -iwithprefix include -I$(KLIBC)/klibc/include -I$(KLIBC)/linux/include -I$(KLIBC)/klibc/arch/i386/include -I$(KLIBC)/klibc/include/bits32 + +ifdef COVERITY +COVERITY_FLAGS=-include coverity-gcc-hack.h +endif + +ifeq ($(origin CC),default) +CC := $(CROSS_COMPILE)gcc +endif +CXFLAGS ?= -ggdb +CWFLAGS = -Wall -Werror -Wstrict-prototypes -Wextra -Wno-unused-parameter +ifdef WARN_UNUSED +CWFLAGS += -Wp,-D_FORTIFY_SOURCE=2 -O3 +endif + +FALLTHROUGH := $(shell gcc -v --help 2>&1 | grep "implicit-fallthrough" | wc -l) +ifneq "$(FALLTHROUGH)" "0" +CWFLAGS += -Wimplicit-fallthrough=0 +endif + +ifdef DEBIAN +CPPFLAGS += -DDEBIAN +endif +ifdef DEFAULT_OLD_METADATA + CPPFLAGS += -DDEFAULT_OLD_METADATA + DEFAULT_METADATA=0.90 +else + DEFAULT_METADATA=1.2 +endif +CPPFLAGS += -DBINDIR=\"$(BINDIR)\" + +PKG_CONFIG ?= pkg-config + +SYSCONFDIR = /etc +CONFFILE = $(SYSCONFDIR)/mdadm.conf +CONFFILE2 = $(SYSCONFDIR)/mdadm/mdadm.conf +MAILCMD =/usr/sbin/sendmail -t +CONFFILEFLAGS = -DCONFFILE=\"$(CONFFILE)\" -DCONFFILE2=\"$(CONFFILE2)\" +# Both MAP_DIR and MDMON_DIR should be somewhere that persists across the +# pivotroot from early boot to late boot. +# /run is best, but for distros that don't support that. 
+# /dev can work, in which case you probably want /dev/.mdadm +RUN_DIR=/run/mdadm +CHECK_RUN_DIR=1 +MAP_DIR=$(RUN_DIR) +MAP_FILE = map +MAP_PATH = $(MAP_DIR)/$(MAP_FILE) +MDMON_DIR = $(RUN_DIR) +# place for autoreplace cookies +FAILED_SLOTS_DIR = $(RUN_DIR)/failed-slots +SYSTEMD_DIR=/lib/systemd/system +LIB_DIR=/usr/libexec/mdadm + +COROSYNC:=$(shell [ -d /usr/include/corosync ] || echo -DNO_COROSYNC) +DLM:=$(shell [ -f /usr/include/libdlm.h ] || echo -DNO_DLM) + +DIRFLAGS = -DMAP_DIR=\"$(MAP_DIR)\" -DMAP_FILE=\"$(MAP_FILE)\" +DIRFLAGS += -DMDMON_DIR=\"$(MDMON_DIR)\" +DIRFLAGS += -DFAILED_SLOTS_DIR=\"$(FAILED_SLOTS_DIR)\" +CFLAGS = $(CWFLAGS) $(CXFLAGS) -DSendmail=\""$(MAILCMD)"\" $(CONFFILEFLAGS) $(DIRFLAGS) $(COROSYNC) $(DLM) + +VERSION = $(shell [ -d .git ] && git describe HEAD | sed 's/mdadm-//') +VERS_DATE = $(shell [ -d .git ] && date --iso-8601 --date="`git log -n1 --format=format:%cd --date=iso --date=short`") +DVERS = $(if $(VERSION),-DVERSION=\"$(VERSION)\",) +DDATE = $(if $(VERS_DATE),-DVERS_DATE="\"$(VERS_DATE)\"",) +DEXTRAVERSION = $(if $(EXTRAVERSION),-DEXTRAVERSION="\" - $(EXTRAVERSION)\"",) +CFLAGS += $(DVERS) $(DDATE) $(DEXTRAVERSION) + +# The glibc TLS ABI requires applications that call clone(2) to set up +# TLS data structures, use pthreads until mdmon implements this support +USE_PTHREADS = 1 +ifdef USE_PTHREADS +CFLAGS += -DUSE_PTHREADS +MON_LDFLAGS += -pthread +endif + +# If you want a static binary, you might uncomment these +# LDFLAGS = -static +# STRIP = -s +LDLIBS = -ldl + +# To explicitly disable libudev, set -DNO_LIBUDEV in CXFLAGS +ifeq (, $(findstring -DNO_LIBUDEV, $(CXFLAGS))) + LDLIBS += -ludev +endif + +INSTALL = /usr/bin/install +DESTDIR = +BINDIR = /sbin +MANDIR = /usr/share/man +MAN4DIR = $(MANDIR)/man4 +MAN5DIR = $(MANDIR)/man5 +MAN8DIR = $(MANDIR)/man8 + +UDEVDIR := $(shell $(PKG_CONFIG) --variable=udevdir udev 2>/dev/null) +ifndef UDEVDIR + UDEVDIR = /lib/udev +endif + +ifeq (,$(findstring s,$(MAKEFLAGS))) + ECHO=echo +else + 
ECHO=: +endif + +OBJS = mdadm.o config.o policy.o mdstat.o ReadMe.o uuid.o util.o maps.o lib.o \ + Manage.o Assemble.o Build.o \ + Create.o Detail.o Examine.o Grow.o Monitor.o dlink.o Kill.o Query.o \ + Incremental.o Dump.o \ + mdopen.o super0.o super1.o super-ddf.o super-intel.o bitmap.o \ + super-mbr.o super-gpt.o \ + restripe.o sysfs.o sha1.o mapfile.o crc32.o sg_io.o msg.o xmalloc.o \ + platform-intel.o probe_roms.o crc32c.o + +CHECK_OBJS = restripe.o uuid.o sysfs.o maps.o lib.o xmalloc.o dlink.o + +SRCS = $(patsubst %.o,%.c,$(OBJS)) + +INCL = mdadm.h part.h bitmap.h + +MON_OBJS = mdmon.o monitor.o managemon.o uuid.o util.o maps.o mdstat.o sysfs.o \ + policy.o lib.o \ + Kill.o sg_io.o dlink.o ReadMe.o super-intel.o \ + super-mbr.o super-gpt.o \ + super-ddf.o sha1.o crc32.o msg.o bitmap.o xmalloc.o \ + platform-intel.o probe_roms.o crc32c.o + +MON_SRCS = $(patsubst %.o,%.c,$(MON_OBJS)) + +STATICSRC = pwgr.c +STATICOBJS = pwgr.o + +all : mdadm mdmon +man : mdadm.man md.man mdadm.conf.man mdmon.man raid6check.man + +check_rundir: + @if [ ! -d "$(dir $(RUN_DIR))" -a "$(CHECK_RUN_DIR)" = 1 ]; then \ + echo "***** Parent of $(RUN_DIR) does not exist. Maybe set different RUN_DIR="; \ + echo "***** e.g. make RUN_DIR=/dev/.mdadm" ; \ + echo "***** or set CHECK_RUN_DIR=0"; exit 1; \ + fi + +everything: all mdadm.static swap_super test_stripe raid6check \ + mdadm.Os mdadm.O2 man +everything-test: all mdadm.static swap_super test_stripe \ + mdadm.Os mdadm.O2 man +# mdadm.uclibc doesn't work on x86-64 +# mdadm.tcc doesn't work.. 
+ +%.o: %.c + $(CC) $(CFLAGS) $(CPPFLAGS) $(COVERITY_FLAGS) -o $@ -c $< + +mdadm : $(OBJS) | check_rundir + $(CC) $(CFLAGS) $(LDFLAGS) -o mdadm $(OBJS) $(LDLIBS) + +mdadm.static : $(OBJS) $(STATICOBJS) + $(CC) $(CFLAGS) $(LDFLAGS) -static -o mdadm.static $(OBJS) $(STATICOBJS) $(LDLIBS) + +mdadm.tcc : $(SRCS) $(INCL) + $(TCC) -o mdadm.tcc $(SRCS) + +mdadm.klibc : $(SRCS) $(INCL) + rm -f $(OBJS) + $(CC) -nostdinc -iwithprefix include -I$(KLIBC)/klibc/include -I$(KLIBC)/linux/include -I$(KLIBC)/klibc/arch/i386/include -I$(KLIBC)/klibc/include/bits32 $(CFLAGS) $(SRCS) + +mdadm.Os : $(SRCS) $(INCL) + $(CC) -o mdadm.Os $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) -DHAVE_STDINT_H -Os $(SRCS) $(LDLIBS) + +mdadm.O2 : $(SRCS) $(INCL) mdmon.O2 + $(CC) -o mdadm.O2 $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) -DHAVE_STDINT_H -O2 -D_FORTIFY_SOURCE=2 $(SRCS) $(LDLIBS) + +mdmon.O2 : $(MON_SRCS) $(INCL) mdmon.h + $(CC) -o mdmon.O2 $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $(MON_LDFLAGS) -DHAVE_STDINT_H -O2 -D_FORTIFY_SOURCE=2 $(MON_SRCS) $(LDLIBS) + +# use '-z now' to guarantee no dynamic linker interactions with the monitor thread +mdmon : $(MON_OBJS) | check_rundir + $(CC) $(CFLAGS) $(LDFLAGS) $(MON_LDFLAGS) -Wl,-z,now -o mdmon $(MON_OBJS) $(LDLIBS) +msg.o: msg.c msg.h + +test_stripe : restripe.c xmalloc.o mdadm.h + $(CC) $(CFLAGS) $(CXFLAGS) $(LDFLAGS) -o test_stripe xmalloc.o -DMAIN restripe.c + +raid6check : raid6check.o mdadm.h $(CHECK_OBJS) + $(CC) $(CXFLAGS) $(LDFLAGS) -o raid6check raid6check.o $(CHECK_OBJS) + +mdadm.8 : mdadm.8.in + sed -e 's/{DEFAULT_METADATA}/$(DEFAULT_METADATA)/g' \ + -e 's,{MAP_PATH},$(MAP_PATH),g' mdadm.8.in > mdadm.8 + +mdadm.man : mdadm.8 + man -l mdadm.8 > mdadm.man + +mdmon.man : mdmon.8 + man -l mdmon.8 > mdmon.man + +md.man : md.4 + man -l md.4 > md.man + +mdadm.conf.man : mdadm.conf.5 + man -l mdadm.conf.5 > mdadm.conf.man + +raid6check.man : raid6check.8 + man -l raid6check.8 > raid6check.man + +$(OBJS) : $(INCL) mdmon.h +$(MON_OBJS) : $(INCL) mdmon.h + +sha1.o : sha1.c 
sha1.h md5.h + $(CC) $(CFLAGS) -DHAVE_STDINT_H -o sha1.o -c sha1.c + +install : install-bin install-man install-udev + +install-static : mdadm.static install-man + $(INSTALL) -D $(STRIP) -m 755 mdadm.static $(DESTDIR)$(BINDIR)/mdadm + +install-tcc : mdadm.tcc install-man + $(INSTALL) -D $(STRIP) -m 755 mdadm.tcc $(DESTDIR)$(BINDIR)/mdadm + +install-uclibc : mdadm.uclibc install-man + $(INSTALL) -D $(STRIP) -m 755 mdadm.uclibc $(DESTDIR)$(BINDIR)/mdadm + +install-klibc : mdadm.klibc install-man + $(INSTALL) -D $(STRIP) -m 755 mdadm.klibc $(DESTDIR)$(BINDIR)/mdadm + +install-man: mdadm.8 md.4 mdadm.conf.5 mdmon.8 + $(INSTALL) -D -m 644 mdadm.8 $(DESTDIR)$(MAN8DIR)/mdadm.8 + $(INSTALL) -D -m 644 mdmon.8 $(DESTDIR)$(MAN8DIR)/mdmon.8 + $(INSTALL) -D -m 644 md.4 $(DESTDIR)$(MAN4DIR)/md.4 + $(INSTALL) -D -m 644 mdadm.conf.5 $(DESTDIR)$(MAN5DIR)/mdadm.conf.5 + +install-udev: udev-md-raid-arrays.rules udev-md-raid-assembly.rules udev-md-raid-creating.rules \ + udev-md-clustered-confirm-device.rules + @for file in 01-md-raid-creating.rules 63-md-raid-arrays.rules 64-md-raid-assembly.rules \ + 69-md-clustered-confirm-device.rules ; \ + do sed -e 's,BINDIR,$(BINDIR),g' udev-$${file#??-} > .install.tmp.1 && \ + $(ECHO) $(INSTALL) -D -m 644 udev-$${file#??-} $(DESTDIR)$(UDEVDIR)/rules.d/$$file ; \ + $(INSTALL) -D -m 644 .install.tmp.1 $(DESTDIR)$(UDEVDIR)/rules.d/$$file ; \ + rm -f .install.tmp.1; \ + done + +install-systemd: systemd/mdmon@.service + @for file in mdmon@.service mdmonitor.service mdadm-last-resort@.timer \ + mdadm-last-resort@.service mdadm-grow-continue@.service \ + mdcheck_start.timer mdcheck_start.service \ + mdcheck_continue.timer mdcheck_continue.service \ + mdmonitor-oneshot.timer mdmonitor-oneshot.service \ + ; \ + do sed -e 's,BINDIR,$(BINDIR),g' systemd/$$file > .install.tmp.2 && \ + $(ECHO) $(INSTALL) -D -m 644 systemd/$$file $(DESTDIR)$(SYSTEMD_DIR)/$$file ; \ + $(INSTALL) -D -m 644 .install.tmp.2 $(DESTDIR)$(SYSTEMD_DIR)/$$file ; \ + rm -f 
.install.tmp.2; \ + done + @for file in mdadm.shutdown ; \ + do sed -e 's,BINDIR,$(BINDIR),g' systemd/$$file > .install.tmp.3 && \ + $(ECHO) $(INSTALL) -D -m 755 systemd/$$file $(DESTDIR)$(SYSTEMD_DIR)-shutdown/$$file ; \ + $(INSTALL) -D -m 755 .install.tmp.3 $(DESTDIR)$(SYSTEMD_DIR)-shutdown/$$file ; \ + rm -f .install.tmp.3; \ + done + if [ -f /etc/SuSE-release -o -n "$(SUSE)" ] ;then $(INSTALL) -D -m 755 systemd/SUSE-mdadm_env.sh $(DESTDIR)$(LIB_DIR)/mdadm_env.sh ;fi + +install-bin: mdadm mdmon + $(INSTALL) -D $(STRIP) -m 755 mdadm $(DESTDIR)$(BINDIR)/mdadm + $(INSTALL) -D $(STRIP) -m 755 mdmon $(DESTDIR)$(BINDIR)/mdmon + +uninstall: + rm -f $(DESTDIR)$(MAN8DIR)/mdadm.8 $(DESTDIR)$(MAN8DIR)/mdmon.8 $(DESTDIR)$(MAN4DIR)/md.4 $(DESTDIR)$(MAN5DIR)/mdadm.conf.5 $(DESTDIR)$(BINDIR)/mdadm + +test: mdadm mdmon test_stripe swap_super raid6check + @echo "Please run './test' as root" + +clean : + rm -f mdadm mdmon $(OBJS) $(MON_OBJS) $(STATICOBJS) core *.man \ + mdadm.tcc mdadm.uclibc mdadm.static *.orig *.porig *.rej *.alt \ + .merge_file_* mdadm.Os mdadm.O2 mdmon.O2 swap_super init.cpio.gz \ + mdadm.uclibc.static test_stripe raid6check raid6check.o mdmon mdadm.8 + rm -rf cov-int + +dist : clean + ./makedist + +testdist : everything-test clean + ./makedist test + +TAGS : + etags *.h *.c + +DISTRO_MAKEFILE := $(wildcard distropkg/Makefile) +ifdef DISTRO_MAKEFILE +include $(DISTRO_MAKEFILE) +endif diff --git a/Manage.c b/Manage.c new file mode 100644 index 0000000..f789e0c --- /dev/null +++ b/Manage.c @@ -0,0 +1,1767 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2001-2013 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +#include "mdadm.h" +#include "md_u.h" +#include "md_p.h" +#include <ctype.h> + +int Manage_ro(char *devname, int fd, int readonly) +{ + /* switch to readonly or rw + * + * requires >= 0.90.0 + * first check that array is runing + * use RESTART_ARRAY_RW or STOP_ARRAY_RO + * + */ + struct mdinfo *mdi; + int rv = 0; + + /* If this is an externally-managed array, we need to modify the + * metadata_version so that mdmon doesn't undo our change. + */ + mdi = sysfs_read(fd, NULL, GET_LEVEL|GET_VERSION); + if (mdi && + mdi->array.major_version == -1 && + is_subarray(mdi->text_version)) { + char vers[64]; + strcpy(vers, "external:"); + strcat(vers, mdi->text_version); + if (readonly > 0) { + int rv; + /* We set readonly ourselves. 
*/ + vers[9] = '-'; + sysfs_set_str(mdi, NULL, "metadata_version", vers); + + close(fd); + rv = sysfs_set_str(mdi, NULL, "array_state", "readonly"); + + if (rv < 0) { + pr_err("failed to set readonly for %s: %s\n", + devname, strerror(errno)); + + vers[9] = mdi->text_version[0]; + sysfs_set_str(mdi, NULL, "metadata_version", vers); + rv = 1; + goto out; + } + } else { + char *cp; + /* We cannot set read/write - must signal mdmon */ + vers[9] = '/'; + sysfs_set_str(mdi, NULL, "metadata_version", vers); + + cp = strchr(vers+10, '/'); + if (cp) + *cp = 0; + ping_monitor(vers+10); + if (mdi->array.level <= 0) + sysfs_set_str(mdi, NULL, "array_state", "active"); + } + goto out; + } + + if (!md_array_active(fd)) { + pr_err("%s does not appear to be active.\n", devname); + rv = 1; + goto out; + } + + if (readonly > 0) { + if (ioctl(fd, STOP_ARRAY_RO, NULL)) { + pr_err("failed to set readonly for %s: %s\n", + devname, strerror(errno)); + rv = 1; + goto out; + } + } else if (readonly < 0) { + if (ioctl(fd, RESTART_ARRAY_RW, NULL)) { + pr_err("failed to set writable for %s: %s\n", + devname, strerror(errno)); + rv = 1; + goto out; + } + } +out: + sysfs_free(mdi); + return rv; +} + +static void remove_devices(char *devnm, char *path) +{ + /* + * Remove names at 'path' - possibly with + * partition suffixes - which link to the 'standard' + * name for devnm. These were probably created + * by mdadm when the array was assembled. 
+ */ + char base[40]; + char *path2; + char link[1024]; + int n; + int part; + char *be; + char *pe; + + if (!path) + return; + + sprintf(base, "/dev/%s", devnm); + be = base + strlen(base); + + path2 = xmalloc(strlen(path)+20); + strcpy(path2, path); + pe = path2 + strlen(path2); + + for (part = 0; part < 16; part++) { + if (part) { + sprintf(be, "p%d", part); + + if (isdigit(pe[-1])) + sprintf(pe, "p%d", part); + else + sprintf(pe, "%d", part); + } + n = readlink(path2, link, sizeof(link)); + if (n > 0 && (int)strlen(base) == n && + strncmp(link, base, n) == 0) + unlink(path2); + } + free(path2); +} + +int Manage_run(char *devname, int fd, struct context *c) +{ + /* Run the array. Array must already be configured + * Requires >= 0.90.0 + */ + char nm[32], *nmp; + + nmp = fd2devnm(fd); + if (!nmp) { + pr_err("Cannot find %s in sysfs!!\n", devname); + return 1; + } + strcpy(nm, nmp); + return IncrementalScan(c, nm); +} + +int Manage_stop(char *devname, int fd, int verbose, int will_retry) +{ + /* Stop the array. Array must already be configured + * 'will_retry' means that error messages are not wanted. + */ + int rv = 0; + struct map_ent *map = NULL; + struct mdinfo *mdi; + char devnm[32]; + char container[32]; + int err; + int count; + char buf[32]; + unsigned long long rd1, rd2; + + if (will_retry && verbose == 0) + verbose = -1; + + strcpy(devnm, fd2devnm(fd)); + /* Get EXCL access first. If this fails, then attempting + * to stop is probably a bad idea. 
+ */ + mdi = sysfs_read(fd, NULL, GET_LEVEL|GET_COMPONENT|GET_VERSION); + if (mdi && is_subarray(mdi->text_version)) { + char *sl; + strncpy(container, mdi->text_version+1, sizeof(container)); + container[sizeof(container)-1] = 0; + sl = strchr(container, '/'); + if (sl) + *sl = 0; + } else + container[0] = 0; + close(fd); + count = 5; + while (((fd = ((devname[0] == '/') + ?open(devname, O_RDONLY|O_EXCL) + :open_dev_flags(devnm, O_RDONLY|O_EXCL))) < 0 || + strcmp(fd2devnm(fd), devnm) != 0) && container[0] && + mdmon_running(container) && count) { + /* Can't open, so something might be wrong. However it + * is a container, so we might be racing with mdmon, so + * retry for a bit. + */ + if (fd >= 0) + close(fd); + flush_mdmon(container); + count--; + } + if (fd < 0 || strcmp(fd2devnm(fd), devnm) != 0) { + if (fd >= 0) + close(fd); + if (verbose >= 0) + pr_err("Cannot get exclusive access to %s:Perhaps a running process, mounted filesystem or active volume group?\n", + devname); + return 1; + } + /* If this is an mdmon managed array, just write 'inactive' + * to the array state and let mdmon clear up. + */ + if (mdi && + mdi->array.level > 0 && + is_subarray(mdi->text_version)) { + int err; + /* This is mdmon managed. */ + close(fd); + + /* As we had an O_EXCL open, any use of the device + * which blocks STOP_ARRAY is probably a transient use, + * so it is reasonable to retry for a while - 5 seconds. 
+ */ + count = 25; + while (count && + (err = sysfs_set_str(mdi, NULL, + "array_state", + "inactive")) < 0 && + errno == EBUSY) { + usleep(200000); + count--; + } + if (err) { + if (verbose >= 0) + pr_err("failed to stop array %s: %s\n", + devname, strerror(errno)); + rv = 1; + goto out; + } + + /* Give monitor a chance to act */ + ping_monitor(mdi->text_version); + + fd = open_dev_excl(devnm); + if (fd < 0) { + if (verbose >= 0) + pr_err("failed to completely stop %s: Device is busy\n", + devname); + rv = 1; + goto out; + } + } else if (mdi && + mdi->array.major_version == -1 && + mdi->array.minor_version == -2 && + !is_subarray(mdi->text_version)) { + struct mdstat_ent *mds, *m; + /* container, possibly mdmon-managed. + * Make sure mdmon isn't opening it, which + * would interfere with the 'stop' + */ + ping_monitor(mdi->sys_name); + + /* now check that there are no existing arrays + * which are members of this array + */ + mds = mdstat_read(0, 0); + for (m = mds; m; m = m->next) + if (m->metadata_version && + strncmp(m->metadata_version, "external:", 9)==0 && + metadata_container_matches(m->metadata_version+9, + devnm)) { + if (verbose >= 0) + pr_err("Cannot stop container %s: member %s still active\n", + devname, m->devnm); + free_mdstat(mds); + rv = 1; + goto out; + } + } + + /* If the array is undergoing a reshape which changes the number + * of devices, then it would be nice to stop it at a point where + * it has completed a full number of stripes in both old and + * new layouts as this will allow the reshape to be reverted. + * So if 'sync_action' is "reshape" and 'raid_disks' shows two + * different numbers, then + * - freeze reshape + * - set sync_max to next multiple of both data_disks and + * chunk sizes (or next but one) + * - unfreeze reshape + * - wait on 'sync_completed' for that point to be reached. 
+ */ + if (mdi && (mdi->array.level >= 4 && mdi->array.level <= 6) && + sysfs_attribute_available(mdi, NULL, "sync_action") && + sysfs_attribute_available(mdi, NULL, "reshape_direction") && + sysfs_get_str(mdi, NULL, "sync_action", buf, 20) > 0 && + strcmp(buf, "reshape\n") == 0 && + sysfs_get_two(mdi, NULL, "raid_disks", &rd1, &rd2) == 2) { + unsigned long long position, curr; + unsigned long long chunk1, chunk2; + unsigned long long rddiv, chunkdiv; + unsigned long long sectors; + unsigned long long sync_max, old_sync_max; + unsigned long long completed; + int backwards = 0; + int delay; + int scfd; + + delay = 40; + while (rd1 > rd2 && delay > 0 && + sysfs_get_ll(mdi, NULL, "sync_max", &old_sync_max) == 0) { + /* must be in the critical section - wait a bit */ + delay -= 1; + usleep(100000); + } + + if (sysfs_set_str(mdi, NULL, "sync_action", "frozen") != 0) + goto done; + /* Array is frozen */ + + rd1 -= mdi->array.level == 6 ? 2 : 1; + rd2 -= mdi->array.level == 6 ? 2 : 1; + sysfs_get_str(mdi, NULL, "reshape_direction", buf, sizeof(buf)); + if (strncmp(buf, "back", 4) == 0) + backwards = 1; + if (sysfs_get_ll(mdi, NULL, "reshape_position", &position) != 0) { + /* reshape must have finished now */ + sysfs_set_str(mdi, NULL, "sync_action", "idle"); + goto done; + } + sysfs_get_two(mdi, NULL, "chunk_size", &chunk1, &chunk2); + chunk1 /= 512; + chunk2 /= 512; + rddiv = GCD(rd1, rd2); + chunkdiv = GCD(chunk1, chunk2); + sectors = (chunk1/chunkdiv) * chunk2 * (rd1/rddiv) * rd2; + + if (backwards) { + /* Need to subtract 'reshape_position' from + * array size to get equivalent of sync_max. + * Size calculation based on raid5_size in kernel. 
+ */ + unsigned long long size = mdi->component_size; + size &= ~(chunk1-1); + size &= ~(chunk2-1); + /* rd1 must be smaller */ + /* Reshape may have progressed further backwards than + * recorded, so target even further back (hence "-1") + */ + position = (position / sectors - 1) * sectors; + /* rd1 is always the conversion factor between 'sync' + * position and 'reshape' position. + * We read 1 "new" stripe worth of data from where-ever, + * and when write out that full stripe. + */ + sync_max = size - position/rd1; + } else { + /* Reshape will very likely be beyond position, and it may + * be too late to stop at '+1', so aim for '+2' + */ + position = (position / sectors + 2) * sectors; + sync_max = position/rd1; + } + if (sysfs_get_ll(mdi, NULL, "sync_max", &old_sync_max) < 0) + old_sync_max = mdi->component_size; + /* Must not advance sync_max as that could confuse + * the reshape monitor */ + if (sync_max < old_sync_max) + sysfs_set_num(mdi, NULL, "sync_max", sync_max); + sysfs_set_str(mdi, NULL, "sync_action", "idle"); + + /* That should have set things going again. Now we + * wait a little while (3 second max) for sync_completed + * to reach the target. + * The reshape process can block for 500msec if + * the sync speed limit is hit, so we need to wait + * a lot longer than that. 1 second is usually + * enough. 3 is safe. + */ + delay = 3000; + scfd = sysfs_open(mdi->sys_name, NULL, "sync_completed"); + while (scfd >= 0 && delay > 0 && old_sync_max > 0) { + unsigned long long max_completed; + sysfs_get_ll(mdi, NULL, "reshape_position", &curr); + sysfs_fd_get_str(scfd, buf, sizeof(buf)); + if (strncmp(buf, "none", 4) == 0) { + /* Either reshape has aborted, or hasn't + * quite started yet. Wait a bit and + * check 'sync_action' to see. 
+ */ + usleep(10000); + sysfs_get_str(mdi, NULL, "sync_action", buf, sizeof(buf)); + if (strncmp(buf, "reshape", 7) != 0) + break; + } + + if (sysfs_fd_get_two(scfd, &completed, + &max_completed) == 2 && + /* 'completed' sometimes reads as max-uulong */ + completed < max_completed && + (completed > sync_max || + (completed == sync_max && curr != position))) { + while (completed > sync_max) { + sync_max += sectors / rd1; + if (backwards) + position -= sectors; + else + position += sectors; + } + if (sync_max < old_sync_max) + sysfs_set_num(mdi, NULL, "sync_max", sync_max); + } + + if (!backwards && curr >= position) + break; + if (backwards && curr <= position) + break; + sysfs_wait(scfd, &delay); + } + if (scfd >= 0) + close(scfd); + + } +done: + + /* As we have an O_EXCL open, any use of the device + * which blocks STOP_ARRAY is probably a transient use, + * so it is reasonable to retry for a while - 5 seconds. + */ + count = 25; err = 0; + while (count && fd >= 0 && + (err = ioctl(fd, STOP_ARRAY, NULL)) < 0 && errno == EBUSY) { + usleep(200000); + count --; + } + if (fd >= 0 && err) { + if (verbose >= 0) { + pr_err("failed to stop array %s: %s\n", + devname, strerror(errno)); + if (errno == EBUSY) + cont_err("Perhaps a running process, mounted filesystem or active volume group?\n"); + } + rv = 1; + goto out; + } + + if (get_linux_version() < 2006028) { + /* prior to 2.6.28, KOBJ_CHANGE was not sent when an md array + * was stopped, so We'll do it here just to be sure. Drop any + * partitions as well... + */ + if (fd >= 0) + ioctl(fd, BLKRRPART, 0); + if (mdi) + sysfs_uevent(mdi, "change"); + } + + if (devnm[0] && use_udev()) { + struct map_ent *mp = map_by_devnm(&map, devnm); + remove_devices(devnm, mp ? 
mp->path : NULL); + } + + if (verbose >= 0) + pr_err("stopped %s\n", devname); + map_lock(&map); + map_remove(&map, devnm); + map_unlock(&map); +out: + sysfs_free(mdi); + + return rv; +} + +static struct mddev_dev *add_one(struct mddev_dev *dv, char *name, char disp) +{ + struct mddev_dev *new; + new = xmalloc(sizeof(*new)); + memset(new, 0, sizeof(*new)); + new->devname = xstrdup(name); + new->disposition = disp; + new->next = dv->next; + dv->next = new; + return new; +} + +static void add_faulty(struct mddev_dev *dv, int fd, char disp) +{ + mdu_array_info_t array; + mdu_disk_info_t disk; + int remaining_disks; + int i; + + if (md_get_array_info(fd, &array) != 0) + return; + + remaining_disks = array.nr_disks; + for (i = 0; i < MAX_DISKS && remaining_disks > 0; i++) { + char buf[40]; + disk.number = i; + if (md_get_disk_info(fd, &disk) != 0) + continue; + if (disk.major == 0 && disk.minor == 0) + continue; + remaining_disks--; + if ((disk.state & 1) == 0) /* not faulty */ + continue; + sprintf(buf, "%d:%d", disk.major, disk.minor); + dv = add_one(dv, buf, disp); + } +} + +static void add_detached(struct mddev_dev *dv, int fd, char disp) +{ + mdu_array_info_t array; + mdu_disk_info_t disk; + int remaining_disks; + int i; + + if (md_get_array_info(fd, &array) != 0) + return; + + remaining_disks = array.nr_disks; + for (i = 0; i < MAX_DISKS && remaining_disks > 0; i++) { + char buf[40]; + int sfd; + disk.number = i; + if (md_get_disk_info(fd, &disk) != 0) + continue; + if (disk.major == 0 && disk.minor == 0) + continue; + remaining_disks--; + if (disp == 'f' && (disk.state & 1) != 0) /* already faulty */ + continue; + sprintf(buf, "%d:%d", disk.major, disk.minor); + sfd = dev_open(buf, O_RDONLY); + if (sfd >= 0) { + /* Not detached */ + close(sfd); + continue; + } + if (errno != ENXIO) + /* Probably not detached */ + continue; + dv = add_one(dv, buf, disp); + } +} + +static void add_set(struct mddev_dev *dv, int fd, char set_char) +{ + mdu_array_info_t array; + 
mdu_disk_info_t disk; + int remaining_disks; + int copies, set; + int i; + + if (md_get_array_info(fd, &array) != 0) + return; + if (array.level != 10) + return; + copies = ((array.layout & 0xff) * + ((array.layout >> 8) & 0xff)); + if (array.raid_disks % copies) + return; + + remaining_disks = array.nr_disks; + for (i = 0; i < MAX_DISKS && remaining_disks > 0; i++) { + char buf[40]; + disk.number = i; + if (md_get_disk_info(fd, &disk) != 0) + continue; + if (disk.major == 0 && disk.minor == 0) + continue; + remaining_disks--; + set = disk.raid_disk % copies; + if (set_char != set + 'A') + continue; + sprintf(buf, "%d:%d", disk.major, disk.minor); + dv = add_one(dv, buf, dv->disposition); + } +} + +int attempt_re_add(int fd, int tfd, struct mddev_dev *dv, + struct supertype *dev_st, struct supertype *tst, + unsigned long rdev, + char *update, char *devname, int verbose, + mdu_array_info_t *array) +{ + struct mdinfo mdi; + int duuid[4]; + int ouuid[4]; + + dev_st->ss->getinfo_super(dev_st, &mdi, NULL); + dev_st->ss->uuid_from_super(dev_st, ouuid); + if (tst->sb) + tst->ss->uuid_from_super(tst, duuid); + else + /* Assume uuid matches: kernel will check */ + memcpy(duuid, ouuid, sizeof(ouuid)); + if ((mdi.disk.state & (1<<MD_DISK_ACTIVE)) && + !(mdi.disk.state & (1<<MD_DISK_FAULTY)) && + memcmp(duuid, ouuid, sizeof(ouuid))==0) { + /* Looks like it is worth a + * try. Need to make sure + * kernel will accept it + * though. 
+ */ + mdu_disk_info_t disc; + /* re-add doesn't work for version-1 superblocks + * before 2.6.18 :-( + */ + if (array->major_version == 1 && + get_linux_version() <= 2006018) + goto skip_re_add; + disc.number = mdi.disk.number; + if (md_get_disk_info(fd, &disc) != 0 || + disc.major != 0 || disc.minor != 0) + goto skip_re_add; + disc.major = major(rdev); + disc.minor = minor(rdev); + disc.number = mdi.disk.number; + disc.raid_disk = mdi.disk.raid_disk; + disc.state = mdi.disk.state; + if (array->state & (1 << MD_SB_CLUSTERED)) { + /* extra flags are needed when adding to a cluster as + * there are two cases to distinguish + */ + if (dv->disposition == 'c') + disc.state |= (1 << MD_DISK_CANDIDATE); + else + disc.state |= (1 << MD_DISK_CLUSTER_ADD); + } + if (dv->writemostly == FlagSet) + disc.state |= 1 << MD_DISK_WRITEMOSTLY; + if (dv->writemostly == FlagClear) + disc.state &= ~(1 << MD_DISK_WRITEMOSTLY); + if (dv->failfast == FlagSet) + disc.state |= 1 << MD_DISK_FAILFAST; + if (dv->failfast == FlagClear) + disc.state &= ~(1 << MD_DISK_FAILFAST); + remove_partitions(tfd); + if (update || dv->writemostly != FlagDefault || + dv->failfast != FlagDefault) { + int rv = -1; + tfd = dev_open(dv->devname, O_RDWR); + if (tfd < 0) { + pr_err("failed to open %s for superblock update during re-add\n", dv->devname); + return -1; + } + + if (dv->writemostly == FlagSet) + rv = dev_st->ss->update_super( + dev_st, NULL, "writemostly", + devname, verbose, 0, NULL); + if (dv->writemostly == FlagClear) + rv = dev_st->ss->update_super( + dev_st, NULL, "readwrite", + devname, verbose, 0, NULL); + if (dv->failfast == FlagSet) + rv = dev_st->ss->update_super( + dev_st, NULL, "failfast", + devname, verbose, 0, NULL); + if (dv->failfast == FlagClear) + rv = dev_st->ss->update_super( + dev_st, NULL, "nofailfast", + devname, verbose, 0, NULL); + if (update) + rv = dev_st->ss->update_super( + dev_st, NULL, update, + devname, verbose, 0, NULL); + if (rv == 0) + rv = 
dev_st->ss->store_super(dev_st, tfd); + close(tfd); + if (rv != 0) { + pr_err("failed to update superblock during re-add\n"); + return -1; + } + } + /* don't even try if disk is marked as faulty */ + errno = 0; + if (ioctl(fd, ADD_NEW_DISK, &disc) == 0) { + if (verbose >= 0) + pr_err("re-added %s\n", dv->devname); + return 1; + } + if (errno == ENOMEM || errno == EROFS) { + pr_err("add new device failed for %s: %s\n", + dv->devname, strerror(errno)); + if (dv->disposition == 'M') + return 0; + return -1; + } + } +skip_re_add: + return 0; +} + +int Manage_add(int fd, int tfd, struct mddev_dev *dv, + struct supertype *tst, mdu_array_info_t *array, + int force, int verbose, char *devname, + char *update, unsigned long rdev, unsigned long long array_size, + int raid_slot) +{ + unsigned long long ldsize; + struct supertype *dev_st; + int j; + mdu_disk_info_t disc; + + if (!get_dev_size(tfd, dv->devname, &ldsize)) { + if (dv->disposition == 'M') + return 0; + else + return -1; + } + + if (tst->ss == &super0 && ldsize > 4ULL*1024*1024*1024*1024) { + /* More than 4TB is wasted on v0.90 */ + if (!force) { + pr_err("%s is larger than %s can effectively use.\n" + " Add --force is you really want to add this device.\n", + dv->devname, devname); + return -1; + } + pr_err("%s is larger than %s can effectively use.\n" + " Adding anyway as --force was given.\n", + dv->devname, devname); + } + + if (array->not_persistent == 0 || tst->ss->external) { + + /* need to find a sample superblock to copy, and + * a spare slot to use. 
+ * For 'external' array (well, container based), + * We can just load the metadata for the array-> + */ + int array_failed; + if (tst->sb) + /* already loaded */; + else if (tst->ss->external) { + tst->ss->load_container(tst, fd, NULL); + } else for (j = 0; j < tst->max_devs; j++) { + char *dev; + int dfd; + disc.number = j; + if (md_get_disk_info(fd, &disc)) + continue; + if (disc.major==0 && disc.minor==0) + continue; + if ((disc.state & 4)==0) /* sync */ + continue; + /* Looks like a good device to try */ + dev = map_dev(disc.major, disc.minor, 1); + if (!dev) + continue; + dfd = dev_open(dev, O_RDONLY); + if (dfd < 0) + continue; + if (tst->ss->load_super(tst, dfd, + NULL)) { + close(dfd); + continue; + } + close(dfd); + break; + } + /* FIXME this is a bad test to be using */ + if (!tst->sb && (dv->disposition != 'a' && + dv->disposition != 'S')) { + /* we are re-adding a device to a + * completely dead array - have to depend + * on kernel to check + */ + } else if (!tst->sb) { + pr_err("cannot load array metadata from %s\n", devname); + return -1; + } + + /* Make sure device is large enough */ + if (dv->disposition != 'j' && /* skip size check for Journal */ + tst->sb && + tst->ss->avail_size(tst, ldsize/512, INVALID_SECTORS) < + array_size) { + if (dv->disposition == 'M') + return 0; + pr_err("%s not large enough to join array\n", + dv->devname); + return -1; + } + + /* Possibly this device was recently part of + * the array and was temporarily removed, and + * is now being re-added. If so, we can + * simply re-add it. 
+ */ + + if (array->not_persistent == 0) { + dev_st = dup_super(tst); + dev_st->ss->load_super(dev_st, tfd, NULL); + if (dev_st->sb && dv->disposition != 'S') { + int rv; + + rv = attempt_re_add(fd, tfd, dv, dev_st, tst, + rdev, update, devname, + verbose, array); + dev_st->ss->free_super(dev_st); + if (rv) + return rv; + } + } + if (dv->disposition == 'M') { + if (verbose > 0) + pr_err("--re-add for %s to %s is not possible\n", + dv->devname, devname); + return 0; + } + if (dv->disposition == 'A') { + pr_err("--re-add for %s to %s is not possible\n", + dv->devname, devname); + return -1; + } + if (array->active_disks < array->raid_disks) { + char *avail = xcalloc(array->raid_disks, 1); + int d; + int found = 0; + + for (d = 0; d < MAX_DISKS && found < array->nr_disks; d++) { + disc.number = d; + if (md_get_disk_info(fd, &disc)) + continue; + if (disc.major == 0 && disc.minor == 0) + continue; + if (!(disc.state & (1<<MD_DISK_SYNC))) + continue; + avail[disc.raid_disk] = 1; + found++; + } + array_failed = !enough(array->level, array->raid_disks, + array->layout, 1, avail); + free(avail); + } else + array_failed = 0; + if (array_failed) { + pr_err("%s has failed so using --add cannot work and might destroy\n", + devname); + pr_err("data on %s. You should stop the array and re-assemble it.\n", + dv->devname); + return -1; + } + } else { + /* non-persistent. Must ensure that new drive + * is at least array->size big. + */ + if (ldsize/512 < array_size) { + pr_err("%s not large enough to join array\n", + dv->devname); + return -1; + } + } + /* committed to really trying this device now*/ + remove_partitions(tfd); + + /* in 2.6.17 and earlier, version-1 superblocks won't + * use the number we write, but will choose a free number. 
+ * we must choose the same free number, which requires + * starting at 'raid_disks' and counting up + */ + for (j = array->raid_disks; j < tst->max_devs; j++) { + disc.number = j; + if (md_get_disk_info(fd, &disc)) + break; + if (disc.major==0 && disc.minor==0) + break; + if (disc.state & 8) /* removed */ + break; + } + disc.major = major(rdev); + disc.minor = minor(rdev); + if (raid_slot < 0) + disc.number = j; + else + disc.number = raid_slot; + disc.state = 0; + + /* only add journal to array that supports journaling */ + if (dv->disposition == 'j') { + struct mdinfo *mdp; + + mdp = sysfs_read(fd, NULL, GET_ARRAY_STATE); + if (!mdp) { + pr_err("%s unable to read array state.\n", devname); + return -1; + } + + if (mdp->array_state != ARRAY_READONLY) { + sysfs_free(mdp); + pr_err("%s is not readonly, cannot add journal.\n", devname); + return -1; + } + + sysfs_free(mdp); + + disc.raid_disk = 0; + } + + if (array->not_persistent==0) { + int dfd; + if (dv->disposition == 'j') + disc.state |= (1 << MD_DISK_JOURNAL) | (1 << MD_DISK_SYNC); + if (dv->writemostly == FlagSet) + disc.state |= 1 << MD_DISK_WRITEMOSTLY; + if (dv->failfast == FlagSet) + disc.state |= 1 << MD_DISK_FAILFAST; + dfd = dev_open(dv->devname, O_RDWR | O_EXCL|O_DIRECT); + if (tst->ss->add_to_super(tst, &disc, dfd, + dv->devname, INVALID_SECTORS)) + return -1; + if (tst->ss->write_init_super(tst)) + return -1; + } else if (dv->disposition == 'A') { + /* this had better be raid1. + * As we are "--re-add"ing we must find a spare slot + * to fill. 
+ */ + char *used = xcalloc(array->raid_disks, 1); + for (j = 0; j < tst->max_devs; j++) { + mdu_disk_info_t disc2; + disc2.number = j; + if (md_get_disk_info(fd, &disc2)) + continue; + if (disc2.major==0 && disc2.minor==0) + continue; + if (disc2.state & 8) /* removed */ + continue; + if (disc2.raid_disk < 0) + continue; + if (disc2.raid_disk > array->raid_disks) + continue; + used[disc2.raid_disk] = 1; + } + for (j = 0 ; j < array->raid_disks; j++) + if (!used[j]) { + disc.raid_disk = j; + disc.state |= (1<<MD_DISK_SYNC); + break; + } + free(used); + } + + if (array->state & (1 << MD_SB_CLUSTERED)) { + if (dv->disposition == 'c') + disc.state |= (1 << MD_DISK_CANDIDATE); + else + disc.state |= (1 << MD_DISK_CLUSTER_ADD); + } + + if (dv->writemostly == FlagSet) + disc.state |= (1 << MD_DISK_WRITEMOSTLY); + if (dv->failfast == FlagSet) + disc.state |= (1 << MD_DISK_FAILFAST); + if (tst->ss->external) { + /* add a disk + * to an external metadata container */ + struct mdinfo new_mdi; + struct mdinfo *sra; + int container_fd; + char devnm[32]; + int dfd; + + strcpy(devnm, fd2devnm(fd)); + + container_fd = open_dev_excl(devnm); + if (container_fd < 0) { + pr_err("add failed for %s: could not get exclusive access to container\n", + dv->devname); + tst->ss->free_super(tst); + return -1; + } + + /* Check if metadata handler is able to accept the drive */ + if (!tst->ss->validate_geometry(tst, LEVEL_CONTAINER, 0, 1, NULL, + 0, 0, dv->devname, NULL, 0, 1)) { + close(container_fd); + return -1; + } + + Kill(dv->devname, NULL, 0, -1, 0); + dfd = dev_open(dv->devname, O_RDWR | O_EXCL|O_DIRECT); + if (tst->ss->add_to_super(tst, &disc, dfd, + dv->devname, INVALID_SECTORS)) { + close(dfd); + close(container_fd); + return -1; + } + if (!mdmon_running(tst->container_devnm)) + tst->ss->sync_metadata(tst); + + sra = sysfs_read(container_fd, NULL, 0); + if (!sra) { + pr_err("add failed for %s: sysfs_read failed\n", + dv->devname); + close(container_fd); + tst->ss->free_super(tst); + 
return -1; + } + sra->array.level = LEVEL_CONTAINER; + /* Need to set data_offset and component_size */ + tst->ss->getinfo_super(tst, &new_mdi, NULL); + new_mdi.disk.major = disc.major; + new_mdi.disk.minor = disc.minor; + new_mdi.recovery_start = 0; + /* Make sure fds are closed as they are O_EXCL which + * would block add_disk */ + tst->ss->free_super(tst); + if (sysfs_add_disk(sra, &new_mdi, 0) != 0) { + pr_err("add new device to external metadata failed for %s\n", dv->devname); + close(container_fd); + sysfs_free(sra); + return -1; + } + ping_monitor(devnm); + sysfs_free(sra); + close(container_fd); + } else { + tst->ss->free_super(tst); + if (ioctl(fd, ADD_NEW_DISK, &disc)) { + if (dv->disposition == 'j') + pr_err("Failed to hot add %s as journal, " + "please try restart %s.\n", dv->devname, devname); + else + pr_err("add new device failed for %s as %d: %s\n", + dv->devname, j, strerror(errno)); + return -1; + } + if (dv->disposition == 'j') { + pr_err("Journal added successfully, making %s read-write\n", devname); + if (Manage_ro(devname, fd, -1)) + pr_err("Failed to make %s read-write\n", devname); + } + + } + if (verbose >= 0) + pr_err("added %s\n", dv->devname); + return 1; +} + +int Manage_remove(struct supertype *tst, int fd, struct mddev_dev *dv, + int sysfd, unsigned long rdev, int force, int verbose, char *devname) +{ + int lfd = -1; + int err; + + if (tst->ss->external) { + /* To remove a device from a container, we must + * check that it isn't in use in an array. + * This involves looking in the 'holders' + * directory - there must be just one entry, + * the container. 
+ * To ensure that it doesn't get used as a + * hot spare while we are checking, we + * get an O_EXCL open on the container + */ + int ret; + char devnm[32]; + strcpy(devnm, fd2devnm(fd)); + lfd = open_dev_excl(devnm); + if (lfd < 0) { + pr_err("Cannot get exclusive access to container - odd\n"); + return -1; + } + /* We may not be able to check on holders in + * sysfs, either because we don't have the dev num + * (rdev == 0) or because the device has been detached + * and the 'holders' directory no longer exists + * (ret == -1). In that case, assume it is OK to + * remove. + */ + if (rdev == 0) + ret = -1; + else { + /* + * The drive has already been set to 'faulty', however + * monitor might not have had time to process it and the + * drive might still have an entry in the 'holders' + * directory. Try a few times to avoid a false error + */ + int count = 20; + + do { + ret = sysfs_unique_holder(devnm, rdev); + if (ret < 2) + break; + usleep(100 * 1000); /* 100ms */ + } while (--count > 0); + + if (ret == 0) { + pr_err("%s is not a member, cannot remove.\n", + dv->devname); + close(lfd); + return -1; + } + if (ret >= 2) { + pr_err("%s is still in use, cannot remove.\n", + dv->devname); + close(lfd); + return -1; + } + } + } + /* FIXME check that it is a current member */ + if (sysfd >= 0) { + /* device has been removed and we don't know + * the major:minor number + */ + err = sys_hot_remove_disk(sysfd, force); + } else { + err = hot_remove_disk(fd, rdev, force); + if (err && errno == ENODEV) { + /* Old kernels rejected this if no personality + * is registered */ + struct mdinfo *sra = sysfs_read(fd, NULL, GET_DEVS); + struct mdinfo *dv = NULL; + if (sra) + dv = sra->devs; + for ( ; dv ; dv=dv->next) + if (dv->disk.major == (int)major(rdev) && + dv->disk.minor == (int)minor(rdev)) + break; + if (dv) + err = sysfs_set_str(sra, dv, + "state", "remove"); + else + err = -1; + sysfs_free(sra); + } + } + if (err) { + pr_err("hot remove failed for %s: %s\n", dv->devname, 
+ strerror(errno)); + if (lfd >= 0) + close(lfd); + return -1; + } + if (tst->ss->external) { + /* + * Before dropping our exclusive open we make an + * attempt at preventing mdmon from seeing an + * 'add' event before reconciling this 'remove' + * event. + */ + char *devnm = fd2devnm(fd); + + if (!devnm) { + pr_err("unable to get container name\n"); + return -1; + } + + ping_manager(devnm); + } + if (lfd >= 0) + close(lfd); + if (verbose >= 0) + pr_err("hot removed %s from %s\n", + dv->devname, devname); + return 1; +} + +int Manage_replace(struct supertype *tst, int fd, struct mddev_dev *dv, + unsigned long rdev, int verbose, char *devname) +{ + struct mdinfo *mdi, *di; + if (tst->ss->external) { + pr_err("--replace only supported for native metadata (0.90 or 1.x)\n"); + return -1; + } + /* Need to find the device in sysfs and add 'want_replacement' to the + * status. + */ + mdi = sysfs_read(fd, NULL, GET_DEVS); + if (!mdi || !mdi->devs) { + pr_err("Cannot find status of %s to enable replacement - strange\n", + devname); + return -1; + } + for (di = mdi->devs; di; di = di->next) + if (di->disk.major == (int)major(rdev) && + di->disk.minor == (int)minor(rdev)) + break; + if (di) { + int rv; + if (di->disk.raid_disk < 0) { + pr_err("%s is not active and so cannot be replaced.\n", + dv->devname); + sysfs_free(mdi); + return -1; + } + rv = sysfs_set_str(mdi, di, + "state", "want_replacement"); + if (rv) { + sysfs_free(mdi); + pr_err("Failed to request replacement for %s\n", + dv->devname); + return -1; + } + if (verbose >= 0) + pr_err("Marked %s (device %d in %s) for replacement\n", + dv->devname, di->disk.raid_disk, devname); + /* If there is a matching 'with', we need to tell it which + * raid disk + */ + while (dv && dv->disposition != 'W') + dv = dv->next; + if (dv) { + dv->disposition = 'w'; + dv->used = di->disk.raid_disk; + } + return 1; + } + sysfs_free(mdi); + pr_err("%s not found in %s so cannot --replace it\n", + dv->devname, devname); + return -1; +} + 
+int Manage_with(struct supertype *tst, int fd, struct mddev_dev *dv, + unsigned long rdev, int verbose, char *devname) +{ + struct mdinfo *mdi, *di; + /* try to set 'slot' for 'rdev' in 'fd' to 'dv->used' */ + mdi = sysfs_read(fd, NULL, GET_DEVS|GET_STATE); + if (!mdi || !mdi->devs) { + pr_err("Cannot find status of %s to enable replacement - strange\n", + devname); + return -1; + } + for (di = mdi->devs; di; di = di->next) + if (di->disk.major == (int)major(rdev) && + di->disk.minor == (int)minor(rdev)) + break; + if (di) { + int rv; + if (di->disk.state & (1<<MD_DISK_FAULTY)) { + pr_err("%s is faulty and cannot be a replacement\n", + dv->devname); + sysfs_free(mdi); + return -1; + } + if (di->disk.raid_disk >= 0) { + pr_err("%s is active and cannot be a replacement\n", + dv->devname); + sysfs_free(mdi); + return -1; + } + rv = sysfs_set_num(mdi, di, + "slot", dv->used); + if (rv) { + sysfs_free(mdi); + pr_err("Failed to set %s as preferred replacement.\n", + dv->devname); + return -1; + } + if (verbose >= 0) + pr_err("Marked %s in %s as replacement for device %d\n", + dv->devname, devname, dv->used); + return 1; + } + sysfs_free(mdi); + pr_err("%s not found in %s so cannot make it preferred replacement\n", + dv->devname, devname); + return -1; +} + +int Manage_subdevs(char *devname, int fd, + struct mddev_dev *devlist, int verbose, int test, + char *update, int force) +{ + /* Do something to each dev. + * devmode can be + * 'a' - add the device + * 'S' - add the device as a spare - don't try re-add + * 'j' - add the device as a journal device + * 'A' - re-add the device + * 'r' - remove the device: HOT_REMOVE_DISK + * device can be 'faulty' or 'detached' in which case all + * matching devices are removed. + * 'f' - set the device faulty SET_DISK_FAULTY + * device can be 'detached' in which case any device that + * is inaccessible will be marked faulty. + * 'R' - mark this device as wanting replacement. 
+ * 'W' - this device is added if necessary and activated as + * a replacement for a previous 'R' device. + * ----- + * 'w' - 'W' will be changed to 'w' when it is paired with + * a 'R' device. If a 'W' is found while walking the list + * it must be unpaired, and is an error. + * 'M' - this is created by a 'missing' target. It is a slight + * variant on 'A' + * 'F' - Another variant of 'A', where the device was faulty + * so must be removed from the array first. + * 'c' - confirm the device as found (for clustered environments) + * + * For 'f' and 'r', the device can also be a kernel-internal + * name such as 'sdb'. + */ + mdu_array_info_t array; + unsigned long long array_size; + struct mddev_dev *dv; + int tfd = -1; + struct supertype *tst; + char *subarray = NULL; + int sysfd = -1; + int count = 0; /* number of actions taken */ + struct mdinfo info; + struct mdinfo devinfo; + int frozen = 0; + int busy = 0; + int raid_slot = -1; + + if (sysfs_init(&info, fd, NULL)) { + pr_err("sysfs not availabile for %s\n", devname); + goto abort; + } + + if (md_get_array_info(fd, &array)) { + pr_err("Cannot get array info for %s\n", devname); + goto abort; + } + /* array.size is only 32 bits and may be truncated. 
+ * So read from sysfs if possible, and record number of sectors + */ + + array_size = get_component_size(fd); + if (array_size <= 0) + array_size = array.size * 2; + + tst = super_by_fd(fd, &subarray); + if (!tst) { + pr_err("unsupport array - version %d.%d\n", + array.major_version, array.minor_version); + goto abort; + } + + for (dv = devlist; dv; dv = dv->next) { + dev_t rdev = 0; /* device to add/remove etc */ + int rv; + int mj,mn; + + raid_slot = -1; + if (dv->disposition == 'c') { + rv = parse_cluster_confirm_arg(dv->devname, + &dv->devname, + &raid_slot); + if (rv) { + pr_err("Could not get the devname of cluster\n"); + goto abort; + } + } + + if (strcmp(dv->devname, "failed") == 0 || + strcmp(dv->devname, "faulty") == 0) { + if (dv->disposition != 'A' && dv->disposition != 'r') { + pr_err("%s only meaningful with -r or --re-add, not -%c\n", + dv->devname, dv->disposition); + goto abort; + } + add_faulty(dv, fd, (dv->disposition == 'A' + ? 'F' : 'r')); + continue; + } + if (strcmp(dv->devname, "detached") == 0) { + if (dv->disposition != 'r' && dv->disposition != 'f') { + pr_err("%s only meaningful with -r of -f, not -%c\n", + dv->devname, dv->disposition); + goto abort; + } + add_detached(dv, fd, dv->disposition); + continue; + } + + if (strcmp(dv->devname, "missing") == 0) { + struct mddev_dev *add_devlist; + struct mddev_dev **dp; + if (dv->disposition == 'c') { + rv = ioctl(fd, CLUSTERED_DISK_NACK, NULL); + break; + } + + if (dv->disposition != 'A') { + pr_err("'missing' only meaningful with --re-add\n"); + goto abort; + } + add_devlist = conf_get_devs(); + if (add_devlist == NULL) { + pr_err("no devices to scan for missing members.\n"); + continue; + } + for (dp = &add_devlist; *dp; dp = & (*dp)->next) + /* 'M' (for 'missing') is like 'A' without errors */ + (*dp)->disposition = 'M'; + *dp = dv->next; + dv->next = add_devlist; + continue; + } + + if (strncmp(dv->devname, "set-", 4) == 0 && + strlen(dv->devname) == 5) { + int copies; + + if 
(dv->disposition != 'r' && + dv->disposition != 'f') { + pr_err("'%s' only meaningful with -r or -f\n", + dv->devname); + goto abort; + } + if (array.level != 10) { + pr_err("'%s' only meaningful with RAID10 arrays\n", + dv->devname); + goto abort; + } + copies = ((array.layout & 0xff) * + ((array.layout >> 8) & 0xff)); + if (array.raid_disks % copies != 0 || + dv->devname[4] < 'A' || + dv->devname[4] >= 'A' + copies || + copies > 26) { + pr_err("'%s' not meaningful with this array\n", + dv->devname); + goto abort; + } + add_set(dv, fd, dv->devname[4]); + continue; + } + + if (strchr(dv->devname, '/') == NULL && + strchr(dv->devname, ':') == NULL && + strlen(dv->devname) < 50) { + /* Assume this is a kernel-internal name like 'sda1' */ + int found = 0; + char dname[55]; + if (dv->disposition != 'r' && dv->disposition != 'f') { + pr_err("%s only meaningful with -r or -f, not -%c\n", + dv->devname, dv->disposition); + goto abort; + } + + sprintf(dname, "dev-%s", dv->devname); + sysfd = sysfs_open(fd2devnm(fd), dname, "block/dev"); + if (sysfd >= 0) { + char dn[20]; + if (sysfs_fd_get_str(sysfd, dn, 20) > 0 && + sscanf(dn, "%d:%d", &mj,&mn) == 2) { + rdev = makedev(mj,mn); + found = 1; + } + close(sysfd); + sysfd = -1; + } + if (!found) { + sysfd = sysfs_open(fd2devnm(fd), dname, "state"); + if (sysfd < 0) { + pr_err("%s does not appear to be a component of %s\n", + dv->devname, devname); + goto abort; + } + } + } else if ((dv->disposition == 'r' || + dv->disposition == 'f') && + get_maj_min(dv->devname, &mj, &mn)) { + /* for 'fail' and 'remove', the device might + * not exist. + */ + rdev = makedev(mj, mn); + } else { + tfd = dev_open(dv->devname, O_RDONLY); + if (tfd >= 0) { + fstat_is_blkdev(tfd, dv->devname, &rdev); + close(tfd); + } else { + int open_err = errno; + if (!stat_is_blkdev(dv->devname, &rdev)) { + if (dv->disposition == 'M') + /* non-fatal. 
Also improbable */ + continue; + goto abort; + } + if (dv->disposition == 'r') + /* Be happy, the stat worked, that is + * enough for --remove + */ + ; + else { + if (dv->disposition == 'M') + /* non-fatal */ + continue; + pr_err("Cannot open %s: %s\n", + dv->devname, strerror(open_err)); + goto abort; + } + } + } + switch(dv->disposition){ + default: + pr_err("internal error - devmode[%s]=%d\n", + dv->devname, dv->disposition); + goto abort; + case 'a': + case 'S': /* --add-spare */ + case 'j': /* --add-journal */ + case 'A': + case 'M': /* --re-add missing */ + case 'F': /* --re-add faulty */ + case 'c': /* --cluster-confirm */ + /* add the device */ + if (subarray) { + pr_err("Cannot add disks to a \'member\' array, perform this operation on the parent container\n"); + goto abort; + } + + /* Let's first try to write re-add to sysfs */ + if (rdev != 0 && + (dv->disposition == 'A' || dv->disposition == 'F')) { + sysfs_init_dev(&devinfo, rdev); + if (sysfs_set_str(&info, &devinfo, "state", "re-add") == 0) { + pr_err("re-add %s to %s succeed\n", + dv->devname, info.sys_name); + break; + } + } + + if (dv->disposition == 'F') + /* Need to remove first */ + hot_remove_disk(fd, rdev, force); + /* Make sure it isn't in use (in 2.6 or later) */ + tfd = dev_open(dv->devname, O_RDONLY|O_EXCL); + if (tfd >= 0) { + /* We know no-one else is using it. We'll + * need non-exclusive access to add it, so + * do that now. 
+ */ + close(tfd); + tfd = dev_open(dv->devname, O_RDONLY); + } + if (tfd < 0) { + if (dv->disposition == 'M') + continue; + pr_err("Cannot open %s: %s\n", + dv->devname, strerror(errno)); + goto abort; + } + if (!frozen) { + if (sysfs_freeze_array(&info) == 1) + frozen = 1; + else + frozen = -1; + } + rv = Manage_add(fd, tfd, dv, tst, &array, + force, verbose, devname, update, + rdev, array_size, raid_slot); + close(tfd); + tfd = -1; + if (rv < 0) + goto abort; + if (rv > 0) + count++; + break; + + case 'r': + /* hot remove */ + if (subarray) { + pr_err("Cannot remove disks from a \'member\' array, perform this operation on the parent container\n"); + rv = -1; + } else + rv = Manage_remove(tst, fd, dv, sysfd, + rdev, verbose, force, + devname); + if (sysfd >= 0) + close(sysfd); + sysfd = -1; + if (rv < 0) + goto abort; + if (rv > 0) + count++; + break; + + case 'f': /* set faulty */ + /* FIXME check current member */ + if ((sysfd >= 0 && write(sysfd, "faulty", 6) != 6) || + (sysfd < 0 && ioctl(fd, SET_DISK_FAULTY, + rdev))) { + if (errno == EBUSY) + busy = 1; + pr_err("set device faulty failed for %s: %s\n", + dv->devname, strerror(errno)); + if (sysfd >= 0) + close(sysfd); + goto abort; + } + if (sysfd >= 0) + close(sysfd); + sysfd = -1; + count++; + if (verbose >= 0) + pr_err("set %s faulty in %s\n", + dv->devname, devname); + break; + case 'R': /* Mark as replaceable */ + if (subarray) { + pr_err("Cannot replace disks in a \'member\' array, perform this operation on the parent container\n"); + rv = -1; + } else { + if (!frozen) { + if (sysfs_freeze_array(&info) == 1) + frozen = 1; + else + frozen = -1; + } + rv = Manage_replace(tst, fd, dv, + rdev, verbose, + devname); + } + if (rv < 0) + goto abort; + if (rv > 0) + count++; + break; + case 'W': /* --with device that doesn't match */ + pr_err("No matching --replace device for --with %s\n", + dv->devname); + goto abort; + case 'w': /* --with device which was matched */ + rv = Manage_with(tst, fd, dv, + rdev, 
verbose, devname); + if (rv < 0) + goto abort; + break; + } + } + if (frozen > 0) + sysfs_set_str(&info, NULL, "sync_action","idle"); + if (test && count == 0) + return 2; + return 0; + +abort: + if (frozen > 0) + sysfs_set_str(&info, NULL, "sync_action","idle"); + return !test && busy ? 2 : 1; +} + +int autodetect(void) +{ + /* Open any md device, and issue the RAID_AUTORUN ioctl */ + int rv = 1; + int fd = dev_open("9:0", O_RDONLY); + if (fd >= 0) { + if (ioctl(fd, RAID_AUTORUN, 0) == 0) + rv = 0; + close(fd); + } + return rv; +} + +int Update_subarray(char *dev, char *subarray, char *update, struct mddev_ident *ident, int verbose) +{ + struct supertype supertype, *st = &supertype; + int fd, rv = 2; + + memset(st, 0, sizeof(*st)); + + fd = open_subarray(dev, subarray, st, verbose < 0); + if (fd < 0) + return 2; + + if (!st->ss->update_subarray) { + if (verbose >= 0) + pr_err("Operation not supported for %s metadata\n", + st->ss->name); + goto free_super; + } + + if (mdmon_running(st->devnm)) + st->update_tail = &st->updates; + + rv = st->ss->update_subarray(st, subarray, update, ident); + + if (rv) { + if (verbose >= 0) + pr_err("Failed to update %s of subarray-%s in %s\n", + update, subarray, dev); + } else if (st->update_tail) + flush_metadata_updates(st); + else + st->ss->sync_metadata(st); + + if (rv == 0 && strcmp(update, "name") == 0 && verbose >= 0) + pr_err("Updated subarray-%s name from %s, UUIDs may have changed\n", + subarray, dev); + + free_super: + st->ss->free_super(st); + close(fd); + + return rv; +} + +/* Move spare from one array to another If adding to destination array fails + * add back to original array. 
+ * Returns 1 on success, 0 on failure */ +int move_spare(char *from_devname, char *to_devname, dev_t devid) +{ + struct mddev_dev devlist; + char devname[20]; + + /* try to remove and add */ + int fd1 = open(to_devname, O_RDONLY); + int fd2 = open(from_devname, O_RDONLY); + + if (fd1 < 0 || fd2 < 0) { + if (fd1 >= 0) + close(fd1); + if (fd2 >= 0) + close(fd2); + return 0; + } + + devlist.next = NULL; + devlist.used = 0; + devlist.writemostly = FlagDefault; + devlist.failfast = FlagDefault; + devlist.devname = devname; + sprintf(devname, "%d:%d", major(devid), minor(devid)); + + devlist.disposition = 'r'; + if (Manage_subdevs(from_devname, fd2, &devlist, -1, 0, NULL, 0) == 0) { + devlist.disposition = 'a'; + if (Manage_subdevs(to_devname, fd1, &devlist, -1, 0, + NULL, 0) == 0) { + /* make sure manager is aware of changes */ + ping_manager(to_devname); + ping_manager(from_devname); + close(fd1); + close(fd2); + return 1; + } + else + Manage_subdevs(from_devname, fd2, &devlist, + -1, 0, NULL, 0); + } + close(fd1); + close(fd2); + return 0; +} diff --git a/Monitor.c b/Monitor.c new file mode 100644 index 0000000..30c031a --- /dev/null +++ b/Monitor.c @@ -0,0 +1,1275 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +#include "mdadm.h" +#include "md_p.h" +#include "md_u.h" +#include <sys/wait.h> +#include <signal.h> +#include <limits.h> +#include <syslog.h> +#ifndef NO_LIBUDEV +#include <libudev.h> +#endif + +struct state { + char *devname; + char devnm[32]; /* to sync with mdstat info */ + unsigned int utime; + int err; + char *spare_group; + int active, working, failed, spare, raid; + int from_config; + int from_auto; + int expected_spares; + int devstate[MAX_DISKS]; + dev_t devid[MAX_DISKS]; + int percent; + char parent_devnm[32]; /* For subarray, devnm of parent. + * For others, "" + */ + struct supertype *metadata; + struct state *subarray;/* for a container it is a link to first subarray + * for a subarray it is a link to next subarray + * in the same container */ + struct state *parent; /* for a subarray it is a link to its container + */ + struct state *next; +}; + +struct alert_info { + char *mailaddr; + char *mailfrom; + char *alert_cmd; + int dosyslog; +}; +static int make_daemon(char *pidfile); +static int check_one_sharer(int scan); +static void write_autorebuild_pid(void); +static void alert(char *event, char *dev, char *disc, struct alert_info *info); +static int check_array(struct state *st, struct mdstat_ent *mdstat, + int test, struct alert_info *info, + int increments, char *prefer); +static int add_new_arrays(struct mdstat_ent *mdstat, struct state **statelist, + int test, struct alert_info *info); +static void try_spare_migration(struct state *statelist, struct alert_info *info); +static void link_containers_with_subarrays(struct state *list); +#ifndef NO_LIBUDEV +static int check_udev_activity(void); +#endif + +int Monitor(struct mddev_dev *devlist, + char *mailaddr, char *alert_cmd, + 
struct context *c, + int daemonise, int oneshot, + int dosyslog, char *pidfile, int increments, + int share) +{ + /* + * Every few seconds, scan every md device looking for changes + * When a change is found, log it, possibly run the alert command, + * and possibly send Email + * + * For each array, we record: + * Update time + * active/working/failed/spare drives + * State of each device. + * %rebuilt if rebuilding + * + * If the update time changes, check out all the data again + * It is possible that we cannot get the state of each device + * due to bugs in the md kernel module. + * We also read /proc/mdstat to get rebuild percent, + * and to get state on all active devices incase of kernel bug. + * + * Events are: + * Fail + * An active device had Faulty set or Active/Sync removed + * FailSpare + * A spare device had Faulty set + * SpareActive + * An active device had a reverse transition + * RebuildStarted + * percent went from -1 to +ve + * RebuildNN + * percent went from below to not-below NN% + * DeviceDisappeared + * Couldn't access a device which was previously visible + * + * if we detect an array with active<raid and spare==0 + * we look at other arrays that have same spare-group + * If we find one with active==raid and spare>0, + * and if we can get_disk_info and find a name + * Then we hot-remove and hot-add to the other array + * + * If devlist is NULL, then we can monitor everything because --scan + * was given. We get an initial list from config file and add anything + * that appears in /proc/mdstat + */ + + struct state *statelist = NULL; + struct state *st2; + int finished = 0; + struct mdstat_ent *mdstat = NULL; + char *mailfrom; + struct alert_info info; + struct mddev_ident *mdlist; + int delay_for_event = c->delay; + + if (!mailaddr) { + mailaddr = conf_get_mailaddr(); + if (mailaddr && ! 
c->scan) + pr_err("Monitor using email address \"%s\" from config file\n", + mailaddr); + } + mailfrom = conf_get_mailfrom(); + + if (!alert_cmd) { + alert_cmd = conf_get_program(); + if (alert_cmd && !c->scan) + pr_err("Monitor using program \"%s\" from config file\n", + alert_cmd); + } + if (c->scan && !mailaddr && !alert_cmd && !dosyslog) { + pr_err("No mail address or alert command - not monitoring.\n"); + return 1; + } + info.alert_cmd = alert_cmd; + info.mailaddr = mailaddr; + info.mailfrom = mailfrom; + info.dosyslog = dosyslog; + + if (share){ + if (check_one_sharer(c->scan)) + return 1; + } + + if (daemonise) { + int rv = make_daemon(pidfile); + if (rv >= 0) + return rv; + } + + if (share) + write_autorebuild_pid(); + + if (devlist == NULL) { + mdlist = conf_get_ident(NULL); + for (; mdlist; mdlist = mdlist->next) { + struct state *st; + + if (mdlist->devname == NULL) + continue; + if (strcasecmp(mdlist->devname, "<ignore>") == 0) + continue; + st = xcalloc(1, sizeof *st); + if (mdlist->devname[0] == '/') + st->devname = xstrdup(mdlist->devname); + else { + st->devname = xmalloc(8+strlen(mdlist->devname)+1); + strcpy(strcpy(st->devname, "/dev/md/"), + mdlist->devname); + } + st->next = statelist; + st->devnm[0] = 0; + st->percent = RESYNC_UNKNOWN; + st->from_config = 1; + st->expected_spares = mdlist->spare_disks; + if (mdlist->spare_group) + st->spare_group = xstrdup(mdlist->spare_group); + statelist = st; + } + } else { + struct mddev_dev *dv; + + for (dv = devlist; dv; dv = dv->next) { + struct state *st = xcalloc(1, sizeof *st); + mdlist = conf_get_ident(dv->devname); + st->devname = xstrdup(dv->devname); + st->next = statelist; + st->devnm[0] = 0; + st->percent = RESYNC_UNKNOWN; + st->expected_spares = -1; + if (mdlist) { + st->expected_spares = mdlist->spare_disks; + if (mdlist->spare_group) + st->spare_group = xstrdup(mdlist->spare_group); + } + statelist = st; + } + } + + while (!finished) { + int new_found = 0; + struct state *st, **stp; + int 
anydegraded = 0; + int anyredundant = 0; + + if (mdstat) + free_mdstat(mdstat); + mdstat = mdstat_read(oneshot ? 0 : 1, 0); + + for (st = statelist; st; st = st->next) { + if (check_array(st, mdstat, c->test, &info, + increments, c->prefer)) + anydegraded = 1; + /* for external arrays, metadata is filled for + * containers only + */ + if (st->metadata && st->metadata->ss->external) + continue; + if (st->err == 0 && !anyredundant) + anyredundant = 1; + } + + /* now check if there are any new devices found in mdstat */ + if (c->scan) + new_found = add_new_arrays(mdstat, &statelist, c->test, + &info); + + /* If an array has active < raid && spare == 0 && spare_group != NULL + * Look for another array with spare > 0 and active == raid and same spare_group + * if found, choose a device and hotremove/hotadd + */ + if (share && anydegraded) + try_spare_migration(statelist, &info); + if (!new_found) { + if (oneshot) + break; + else if (!anyredundant) { + pr_err("No array with redundancy detected, stopping\n"); + break; + } + else { +#ifndef NO_LIBUDEV + /* + * Wait for udevd to finish new devices + * processing. 
/*
 * make_daemon - fork into the background and record the child's pid.
 *
 * In the parent the child's pid is printed to stdout, or written to
 * 'pidfile' when one is given.  Failure to write the pid file is
 * reported but is not treated as an error.
 *
 * Return:
 *	-1 in the forked daemon
 *	 0 in the parent
 *	 1 on error
 * so a non-negative value becomes the exit code.
 */
static int make_daemon(char *pidfile)
{
	int pid = fork();

	if (pid < 0) {
		perror("daemonise");
		return 1;
	}

	if (pid == 0) {
		/* Child: continue as the daemon in its own session. */
		manage_fork_fds(0);
		setsid();
		return -1;
	}

	/* Parent: report the child's pid. */
	if (!pidfile) {
		printf("%d\n", pid);
	} else {
		FILE *pid_file = NULL;
		int fd = open(pidfile, O_WRONLY | O_CREAT | O_TRUNC,
			      0644);

		if (fd >= 0)
			pid_file = fdopen(fd, "w");
		if (!pid_file) {
			perror("cannot create pid file");
			/* fdopen() failed: the raw descriptor is still ours. */
			if (fd >= 0)
				close(fd);
		} else {
			fprintf(pid_file, "%d\n", pid);
			fclose(pid_file);
		}
	}
	return 0;
}
fclose(fp); + return 1; + } else { + pr_err("Warning: One autorebuild process already running.\n"); + } + } + fclose(comm_fp); + } + fclose(fp); + } + return 0; +} + +static void write_autorebuild_pid() +{ + char path[PATH_MAX]; + int pid; + FILE *fp = NULL; + sprintf(path, "%s/autorebuild.pid", MDMON_DIR); + + if (mkdir(MDMON_DIR, 0700) < 0 && errno != EEXIST) { + pr_err("Can't create autorebuild.pid file\n"); + } else { + int fd = open(path, O_WRONLY | O_CREAT | O_TRUNC, 0700); + + if (fd >= 0) + fp = fdopen(fd, "w"); + + if (!fp) + pr_err("Can't create autorebuild.pid file\n"); + else { + pid = getpid(); + fprintf(fp, "%d\n", pid); + fclose(fp); + } + } +} + +static void alert(char *event, char *dev, char *disc, struct alert_info *info) +{ + int priority; + + if (!info->alert_cmd && !info->mailaddr && !info->dosyslog) { + time_t now = time(0); + + printf("%1.15s: %s on %s %s\n", ctime(&now) + 4, + event, dev, disc?disc:"unknown device"); + } + if (info->alert_cmd) { + int pid = fork(); + switch(pid) { + default: + waitpid(pid, NULL, 0); + break; + case -1: + break; + case 0: + execl(info->alert_cmd, info->alert_cmd, + event, dev, disc, NULL); + exit(2); + } + } + if (info->mailaddr && (strncmp(event, "Fail", 4) == 0 || + strncmp(event, "Test", 4) == 0 || + strncmp(event, "Spares", 6) == 0 || + strncmp(event, "Degrade", 7) == 0)) { + FILE *mp = popen(Sendmail, "w"); + if (mp) { + FILE *mdstat; + char hname[256]; + gethostname(hname, sizeof(hname)); + signal(SIGPIPE, SIG_IGN); + if (info->mailfrom) + fprintf(mp, "From: %s\n", info->mailfrom); + else + fprintf(mp, "From: %s monitoring <root>\n", + Name); + fprintf(mp, "To: %s\n", info->mailaddr); + fprintf(mp, "Subject: %s event on %s:%s\n\n", + event, dev, hname); + + fprintf(mp, + "This is an automatically generated mail message from %s\n", Name); + fprintf(mp, "running on %s\n\n", hname); + + fprintf(mp, + "A %s event had been detected on md device %s.\n\n", event, dev); + + if (disc && disc[0] != ' ') + 
fprintf(mp, + "It could be related to component device %s.\n\n", disc); + if (disc && disc[0] == ' ') + fprintf(mp, "Extra information:%s.\n\n", disc); + + fprintf(mp, "Faithfully yours, etc.\n"); + + mdstat = fopen("/proc/mdstat", "r"); + if (mdstat) { + char buf[8192]; + int n; + fprintf(mp, + "\nP.S. The /proc/mdstat file currently contains the following:\n\n"); + while ((n = fread(buf, 1, sizeof(buf), + mdstat)) > 0) + n = fwrite(buf, 1, n, mp); + fclose(mdstat); + } + pclose(mp); + } + } + + /* log the event to syslog maybe */ + if (info->dosyslog) { + /* Log at a different severity depending on the event. + * + * These are the critical events: */ + if (strncmp(event, "Fail", 4) == 0 || + strncmp(event, "Degrade", 7) == 0 || + strncmp(event, "DeviceDisappeared", 17) == 0) + priority = LOG_CRIT; + /* Good to know about, but are not failures: */ + else if (strncmp(event, "Rebuild", 7) == 0 || + strncmp(event, "MoveSpare", 9) == 0 || + strncmp(event, "Spares", 6) != 0) + priority = LOG_WARNING; + /* Everything else: */ + else + priority = LOG_INFO; + + if (disc && disc[0] != ' ') + syslog(priority, + "%s event detected on md device %s, component device %s", event, dev, disc); + else if (disc) + syslog(priority, + "%s event detected on md device %s: %s", + event, dev, disc); + else + syslog(priority, + "%s event detected on md device %s", + event, dev); + } +} + +static int check_array(struct state *st, struct mdstat_ent *mdstat, + int test, struct alert_info *ainfo, + int increments, char *prefer) +{ + /* Update the state 'st' to reflect any changes shown in mdstat, + * or found by directly examining the array, and return + * '1' if the array is degraded, or '0' if it is optimal (or dead). 
+ */ + struct { int state, major, minor; } info[MAX_DISKS]; + struct mdinfo *sra = NULL; + mdu_array_info_t array; + struct mdstat_ent *mse = NULL, *mse2; + char *dev = st->devname; + int fd; + int i; + int remaining_disks; + int last_disk; + int new_array = 0; + int retval; + int is_container = 0; + unsigned long redundancy_only_flags = 0; + + if (test) + alert("TestMessage", dev, NULL, ainfo); + + retval = 0; + + fd = open(dev, O_RDONLY); + if (fd < 0) + goto disappeared; + + if (st->devnm[0] == 0) + strcpy(st->devnm, fd2devnm(fd)); + + for (mse2 = mdstat; mse2; mse2 = mse2->next) + if (strcmp(mse2->devnm, st->devnm) == 0) { + mse2->devnm[0] = 0; /* flag it as "used" */ + mse = mse2; + } + + if (!mse) { + /* duplicated array in statelist + * or re-created after reading mdstat + */ + st->err++; + goto out; + } + + if (mse->level == NULL) + is_container = 1; + + if (!is_container && !md_array_active(fd)) + goto disappeared; + + fcntl(fd, F_SETFD, FD_CLOEXEC); + if (md_get_array_info(fd, &array) < 0) + goto disappeared; + + if (!is_container && map_name(pers, mse->level) > 0) + redundancy_only_flags |= GET_MISMATCH; + + sra = sysfs_read(-1, st->devnm, GET_LEVEL | GET_DISKS | GET_DEVS | + GET_STATE | redundancy_only_flags); + + if (!sra) + goto disappeared; + + /* It's much easier to list what array levels can't + * have a device disappear than all of them that can + */ + if (sra->array.level == 0 || sra->array.level == -1) { + if (!st->err && !st->from_config) + alert("DeviceDisappeared", dev, " Wrong-Level", ainfo); + st->err++; + goto out; + } + + /* this array is in /proc/mdstat */ + if (array.utime == 0) + /* external arrays don't update utime, so + * just make sure it is always different. 
*/ + array.utime = st->utime + 1;; + + if (st->err) { + /* New array appeared where previously had an error */ + st->err = 0; + st->percent = RESYNC_NONE; + new_array = 1; + if (!is_container) + alert("NewArray", st->devname, NULL, ainfo); + } + + if (st->utime == array.utime && st->failed == sra->array.failed_disks && + st->working == sra->array.working_disks && + st->spare == sra->array.spare_disks && + (mse == NULL || (mse->percent == st->percent))) { + if ((st->active < st->raid) && st->spare == 0) + retval = 1; + goto out; + } + if (st->utime == 0 && /* new array */ + mse->pattern && strchr(mse->pattern, '_') /* degraded */) + alert("DegradedArray", dev, NULL, ainfo); + + if (st->utime == 0 && /* new array */ st->expected_spares > 0 && + sra->array.spare_disks < st->expected_spares) + alert("SparesMissing", dev, NULL, ainfo); + if (st->percent < 0 && st->percent != RESYNC_UNKNOWN && + mse->percent >= 0) + alert("RebuildStarted", dev, NULL, ainfo); + if (st->percent >= 0 && mse->percent >= 0 && + (mse->percent / increments) > (st->percent / increments)) { + char percentalert[18]; + /* + * "RebuildNN" (10 chars) or "RebuildStarted" (15 chars) + */ + + if((mse->percent / increments) == 0) + snprintf(percentalert, sizeof(percentalert), + "RebuildStarted"); + else + snprintf(percentalert, sizeof(percentalert), + "Rebuild%02d", mse->percent); + + alert(percentalert, dev, NULL, ainfo); + } + + if (mse->percent == RESYNC_NONE && st->percent >= 0) { + /* Rebuild/sync/whatever just finished. + * If there is a number in /mismatch_cnt, + * we should report that. 
+ */ + if (sra && sra->mismatch_cnt > 0) { + char cnt[80]; + snprintf(cnt, sizeof(cnt), + " mismatches found: %d (on raid level %d)", + sra->mismatch_cnt, sra->array.level); + alert("RebuildFinished", dev, cnt, ainfo); + } else + alert("RebuildFinished", dev, NULL, ainfo); + } + st->percent = mse->percent; + + remaining_disks = sra->array.nr_disks; + for (i = 0; i < MAX_DISKS && remaining_disks > 0; i++) { + mdu_disk_info_t disc; + disc.number = i; + if (md_get_disk_info(fd, &disc) >= 0) { + info[i].state = disc.state; + info[i].major = disc.major; + info[i].minor = disc.minor; + if (disc.major || disc.minor) + remaining_disks --; + } else + info[i].major = info[i].minor = 0; + } + last_disk = i; + + if (mse->metadata_version && + strncmp(mse->metadata_version, "external:", 9) == 0 && + is_subarray(mse->metadata_version+9)) { + char *sl; + strcpy(st->parent_devnm, mse->metadata_version + 10); + sl = strchr(st->parent_devnm, '/'); + if (sl) + *sl = 0; + } else + st->parent_devnm[0] = 0; + if (st->metadata == NULL && st->parent_devnm[0] == 0) + st->metadata = super_by_fd(fd, NULL); + + for (i = 0; i < MAX_DISKS; i++) { + mdu_disk_info_t disc = {0, 0, 0, 0, 0}; + int newstate = 0; + int change; + char *dv = NULL; + disc.number = i; + if (i < last_disk && (info[i].major || info[i].minor)) { + newstate = info[i].state; + dv = map_dev_preferred(info[i].major, info[i].minor, 1, + prefer); + disc.state = newstate; + disc.major = info[i].major; + disc.minor = info[i].minor; + } else + newstate = (1 << MD_DISK_REMOVED); + + if (dv == NULL && st->devid[i]) + dv = map_dev_preferred(major(st->devid[i]), + minor(st->devid[i]), 1, prefer); + change = newstate ^ st->devstate[i]; + if (st->utime && change && !st->err && !new_array) { + if ((st->devstate[i]&change) & (1 << MD_DISK_SYNC)) + alert("Fail", dev, dv, ainfo); + else if ((newstate & (1 << MD_DISK_FAULTY)) && + (disc.major || disc.minor) && + st->devid[i] == makedev(disc.major, + disc.minor)) + alert("FailSpare", dev, dv, 
ainfo); + else if ((newstate&change) & (1 << MD_DISK_SYNC)) + alert("SpareActive", dev, dv, ainfo); + } + st->devstate[i] = newstate; + st->devid[i] = makedev(disc.major, disc.minor); + } + st->active = sra->array.active_disks; + st->working = sra->array.working_disks; + st->spare = sra->array.spare_disks; + st->failed = sra->array.failed_disks; + st->utime = array.utime; + st->raid = sra->array.raid_disks; + st->err = 0; + if ((st->active < st->raid) && st->spare == 0) + retval = 1; + + out: + if (sra) + sysfs_free(sra); + if (fd >= 0) + close(fd); + return retval; + + disappeared: + if (!st->err && !is_container) + alert("DeviceDisappeared", dev, NULL, ainfo); + st->err++; + goto out; +} + +static int add_new_arrays(struct mdstat_ent *mdstat, struct state **statelist, + int test, struct alert_info *info) +{ + struct mdstat_ent *mse; + int new_found = 0; + char *name; + + for (mse = mdstat; mse; mse = mse->next) + if (mse->devnm[0] && (!mse->level || /* retrieve containers */ + (strcmp(mse->level, "raid0") != 0 && + strcmp(mse->level, "linear") != 0))) { + struct state *st = xcalloc(1, sizeof *st); + mdu_array_info_t array; + int fd; + + name = get_md_name(mse->devnm); + if (!name) { + free(st); + continue; + } + + st->devname = xstrdup(name); + if ((fd = open(st->devname, O_RDONLY)) < 0 || + md_get_array_info(fd, &array) < 0) { + /* no such array */ + if (fd >= 0) + close(fd); + put_md_name(st->devname); + free(st->devname); + if (st->metadata) { + st->metadata->ss->free_super(st->metadata); + free(st->metadata); + } + free(st); + continue; + } + close(fd); + st->next = *statelist; + st->err = 1; + st->from_auto = 1; + strcpy(st->devnm, mse->devnm); + st->percent = RESYNC_UNKNOWN; + st->expected_spares = -1; + if (mse->metadata_version && + strncmp(mse->metadata_version, + "external:", 9) == 0 && + is_subarray(mse->metadata_version+9)) { + char *sl; + strcpy(st->parent_devnm, + mse->metadata_version+10); + sl = strchr(st->parent_devnm, '/'); + *sl = 0; + } else + 
st->parent_devnm[0] = 0; + *statelist = st; + if (test) + alert("TestMessage", st->devname, NULL, info); + new_found = 1; + } + return new_found; +} + +static int get_required_spare_criteria(struct state *st, + struct spare_criteria *sc) +{ + int fd; + + if (!st->metadata || !st->metadata->ss->get_spare_criteria) { + sc->min_size = 0; + sc->sector_size = 0; + return 0; + } + + fd = open(st->devname, O_RDONLY); + if (fd < 0) + return 1; + if (st->metadata->ss->external) + st->metadata->ss->load_container(st->metadata, fd, st->devname); + else + st->metadata->ss->load_super(st->metadata, fd, st->devname); + close(fd); + if (!st->metadata->sb) + return 1; + + st->metadata->ss->get_spare_criteria(st->metadata, sc); + st->metadata->ss->free_super(st->metadata); + + return 0; +} + +static int check_donor(struct state *from, struct state *to) +{ + struct state *sub; + + if (from == to) + return 0; + if (from->parent) + /* Cannot move from a member */ + return 0; + if (from->err) + return 0; + for (sub = from->subarray; sub; sub = sub->subarray) + /* If source array has degraded subarrays, don't + * remove anything + */ + if (sub->active < sub->raid) + return 0; + if (from->metadata->ss->external == 0) + if (from->active < from->raid) + return 0; + if (from->spare <= 0) + return 0; + return 1; +} + +static dev_t choose_spare(struct state *from, struct state *to, + struct domainlist *domlist, struct spare_criteria *sc) +{ + int d; + dev_t dev = 0; + + for (d = from->raid; !dev && d < MAX_DISKS; d++) { + if (from->devid[d] > 0 && from->devstate[d] == 0) { + struct dev_policy *pol; + unsigned long long dev_size; + unsigned int dev_sector_size; + + if (to->metadata->ss->external && + test_partition_from_id(from->devid[d])) + continue; + + if (sc->min_size && + dev_size_from_id(from->devid[d], &dev_size) && + dev_size < sc->min_size) + continue; + + if (sc->sector_size && + dev_sector_size_from_id(from->devid[d], + &dev_sector_size) && + sc->sector_size != dev_sector_size) + 
continue; + + pol = devid_policy(from->devid[d]); + if (from->spare_group) + pol_add(&pol, pol_domain, + from->spare_group, NULL); + if (domain_test(domlist, pol, + to->metadata->ss->name) == 1) + dev = from->devid[d]; + dev_policy_free(pol); + } + } + return dev; +} + +static dev_t container_choose_spare(struct state *from, struct state *to, + struct domainlist *domlist, + struct spare_criteria *sc, int active) +{ + /* This is similar to choose_spare, but we cannot trust devstate, + * so we need to read the metadata instead + */ + struct mdinfo *list; + struct supertype *st = from->metadata; + int fd = open(from->devname, O_RDONLY); + int err; + dev_t dev = 0; + + if (fd < 0) + return 0; + if (!st->ss->getinfo_super_disks) { + close(fd); + return 0; + } + + err = st->ss->load_container(st, fd, NULL); + close(fd); + if (err) + return 0; + < |