diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 17:42:59 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-04 17:42:59 +0000 |
commit | 0c7a6eb5ccace1d8e9f7b301f6a61a7d3f016369 (patch) | |
tree | 80a778fbd7bb3c7858cfac572df1cb08cfa4f988 | |
parent | Initial commit. (diff) | |
download | mdadm-upstream.tar.xz mdadm-upstream.zip |
Adding upstream version 4.2. [upstream/4.2] [upstream]
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
279 files changed, 77998 insertions, 0 deletions
diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..217fe76 --- /dev/null +++ b/.gitignore @@ -0,0 +1,18 @@ +/*.o +/*.man +/*-stamp +/mdadm +/mdadm.8 +/mdadm.udeb +/mdassemble +/mdmon +/swap_super +/test_stripe +/TAGS +/mdadm.O2 +/mdadm.Os +/mdadm.static +/mdassemble.auto +/mdassemble.static +/mdmon.O2 +/raid6check diff --git a/ANNOUNCE-3.0 b/ANNOUNCE-3.0 new file mode 100644 index 0000000..f2d4f84 --- /dev/null +++ b/ANNOUNCE-3.0 @@ -0,0 +1,98 @@ +Subject: ANNOUNCE: mdadm 3.0 - A tool for managing Soft RAID under Linux + +I am pleased to (finally) announce the availability of + mdadm version 3.0 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git?p=mdadm + + +This is a major new version and as such should be treated with some +caution. However it has seen substantial testing and is considerred +to be ready for wide use. + + +The significant change which justifies the new major version number is +that mdadm can now handle metadata updates entirely in userspace. +This allows mdadm to support metadata formats that the kernel knows +nothing about. + +Currently two such metadata formats are supported: + - DDF - The SNIA standard format + - Intel Matrix - The metadata used by recent Intel ICH controlers. + +Also the approach to device names has changed significantly. + +If udev is installed on the system, mdadm will not create any devices +in /dev. Rather it allows udev to manage those devices. For this to work +as expected, the included udev rules file should be installed. + +If udev is not installed, mdadm will still create devices and symlinks +as required, and will also remove them when the array is stopped. + +mdadm now requires all devices which do not have a standard name (mdX +or md_dX) to live in the directory /dev/md/. 
Names in this directory +will always be created as symlinks back to the standard name in /dev. + +The man pages contain some information about the new externally managed +metadata. However see below for a more condensed overview. + +Externally managed metadata introduces the concept of a 'container'. +A container is a collection of (normally) physical devices which have +a common set of metadata. A container is assembled as an md array, but +is left 'inactive'. + +A container can contain one or more data arrays. These are composed from +slices (partitions?) of various devices in the container. + +For example, a 5 devices DDF set can contain a RAID1 using the first +half of two devices, a RAID0 using the first half of the remaining 3 devices, +and a RAID5 over the second half of all 5 devices. + +A container can be created with + + mdadm --create /dev/md0 -e ddf -n5 /dev/sd[abcde] + +or "-e imsm" to use the Intel Matrix Storage Manager. + +An array can be created within a container either by giving the +container name as the only member: + + mdadm -C /dev/md1 --level raid1 -n 2 /dev/md0 + +or by listing the component devices + + mdadm -C /dev/md2 --level raid0 -n 3 /dev/sd[cde] + +To assemble a container, it is easiest just to pass each device in turn to +mdadm -I + + for i in /dev/sd[abcde] + do mdadm -I $i + done + +This will assemble the container and the components. + +Alternately the container can be assembled explicitly + + mdadm -A /dev/md0 /dev/sd[abcde] + +Then the components can all be assembled with + + mdadm -I /dev/md0 + +For each container, mdadm will start a program called "mdmon" which will +monitor the array and effect any metadata updates needed. The array is +initially assembled readonly. It is up to "mdmon" to mark the metadata +as 'dirty' and switch the array to 'read-write'. + +The version 0.90 and 1.x metadata formats supported by previous +versions of mdadm are still supported and the kernel still performs +the same updates it used to. 
The new 'mdmon' approach is only used for +newly introduced metadata types. + +NeilBrown 2nd June 2009 diff --git a/ANNOUNCE-3.0.1 b/ANNOUNCE-3.0.1 new file mode 100644 index 0000000..91b4428 --- /dev/null +++ b/ANNOUNCE-3.0.1 @@ -0,0 +1,22 @@ +Subject: ANNOUNCE: mdadm 3.0.1 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.0.1 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git?p=mdadm + + +This contains only minor bug fixes over 3.0. If you are using +3.0, you could consider upgrading. + +The brief change log is: + - Fix various segfaults + - Fixed for --examine with containers + - Lots of other little fixes. + +NeilBrown 25th September 2009 diff --git a/ANNOUNCE-3.0.2 b/ANNOUNCE-3.0.2 new file mode 100644 index 0000000..93643d1 --- /dev/null +++ b/ANNOUNCE-3.0.2 @@ -0,0 +1,21 @@ +Subject: ANNOUNCE: mdadm 3.0.2 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.0.2 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git?p=mdadm + + +This just contains one bugfix over 3.0.1 - I was obviously a bit hasty +in releasing that one. + +The brief change log is: + - Fix crash when hosthost is not set, as often happens in + early boot. + +NeilBrown 25th September 2009 diff --git a/ANNOUNCE-3.0.3 b/ANNOUNCE-3.0.3 new file mode 100644 index 0000000..d6117a1 --- /dev/null +++ b/ANNOUNCE-3.0.3 @@ -0,0 +1,29 @@ +Subject: ANNOUNCE: mdadm 3.0.3 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.0.3 + +It is available at the usual places: + countrycode=xx. 
+ http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git?p=mdadm + + +This contains a collection of bug fixes and minor enhancements over +3.0.1. + +The brief change log is: + - Improvements for creating arrays giving just a name, like 'foo', + rather than the full '/dev/md/foo'. + - Improvements for assembling member arrays of containers. + - Improvements to test suite + - Add option to change increment for RebuildNN messages reported + by "mdadm --monitor" + - Improvements to mdmon 'hand-over' from initrd to final root. + - Handle merging of devices that have left an IMSM array and are + being re-incorporated. + - Add missing space in "--detail --brief" output. + +NeilBrown 22nd October 2009 diff --git a/ANNOUNCE-3.1 b/ANNOUNCE-3.1 new file mode 100644 index 0000000..343b85d --- /dev/null +++ b/ANNOUNCE-3.1 @@ -0,0 +1,33 @@ +Subject: ANNOUNCE: mdadm 3.1 - A tool for managing Soft RAID under Linux + +Hot on the heals of 3.0.3 I am pleased to announce the availability of + mdadm version 3.1 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git?p=mdadm + + +It contains significant feature enhancements over 3.0.x + +The brief change log is: + - Support --grow to change the layout of RAID4/5/6 + - Support --grow to change the chunksize of raid 4/5/6 + - Support --grow to change level from RAID1 -> RAID5 -> RAID6 and + back. + - Support --grow to reduce the number of devices in RAID4/5/6. + - Support restart of these grow options which assembling an array + which is partially grown. + - Assorted tests of this code, and of different RAID6 layouts. + +Note that a 2.6.31 or later is needed to have access to these. +Reducing devices in a RAID4/5/6 requires 2.6.32. +Changing RAID5 to RAID1 requires 2.6.33. 
+ +You should only upgrade if you need to use, or wish to test, these +features. + +NeilBrown 22nd October 2009 diff --git a/ANNOUNCE-3.1.1 b/ANNOUNCE-3.1.1 new file mode 100644 index 0000000..9e480dc --- /dev/null +++ b/ANNOUNCE-3.1.1 @@ -0,0 +1,39 @@ +Subject: ANNOUNCE: mdadm 3.1.1 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.1.1 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git?p=mdadm + +This is a bugfix release over 3.1, which was withdrawn due to serious +bugs. So it might be best to ignore 3.1 and say that this is a significant +feature release over 3.0.x + +Significant changes are: + - RAID level conversion between RAID1, RAID5, and RAID6 are + possible where the kernel supports it (2.6.32 at least) + - online chunksize and layout changing for RAID5 and RAID6 + where the kernel supports it. + - reduce the number of devices in a RAID4/5/6 array. + + - The default metadata is now v1.1. This metadata is stored at the + start of the device so is safer in many ways but could interfere with + boot loaders. The old default (0.90) is still available and fully + supported. + + - The default chunksize is now 512K rather than 64K. This seems more + appropriate for modern devices. + + - The default bitmap chunksize for internal bitmaps is now at least + 64Meg as fine grained bitmaps tend to impact performance more for + little extra gain. + +This release is believed to be stable and you should feel free to +upgrade to 3.1.1. 
+ +NeilBrown 19th November 2009 diff --git a/ANNOUNCE-3.1.2 b/ANNOUNCE-3.1.2 new file mode 100644 index 0000000..321b8be --- /dev/null +++ b/ANNOUNCE-3.1.2 @@ -0,0 +1,46 @@ +Subject: ANNOUNCE: mdadm 3.1.2 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.1.2 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git?p=mdadm + +This is a bugfix/stability release over 3.1.1. + +Significant changes are: + - The default metadata has change again (sorry about that). + It is now v1.2 and will hopefully stay that way. It turned + out there with boot-block issues with v1.1 which make it + unsuitable for a default, though in many cases it is still + suitable to use. + - Stopping a container is not permitted when members are still + active + - Add 'homehost' to the valid words for the "AUTO" config file + line. When followed by "-all", this causes mdadm to + auto-assemble any array belonging to this host, but not + auto-assemble anything else. + - Fix some bugs with "--grow --chunksize=" for changing chunksize. + - VAR_RUN can be easily changed at compile time just like ALT_RUN. + This gives distros more flexability in how to manage the + pid and sock files that mdmon needs. + - Various mdmon fixes + - Alway make bitmap 4K-aligned if at all possible. + - If mdadm.conf lists arrays which have inter-dependencies, + the previously had to be listed in the "right" order. Now + any order should work. + - Fix --force assembly of v1.x arrays which are in the process + of recovering. + - Add section on 'scrubbing' to 'md' man page. + - Various command-line-option parsing improvements. + - ... and lots of other bug fixes. 
+ + +This release is believed to be stable and you should feel free to +upgrade to 3.1.2 + +NeilBrown 10th March 2010 diff --git a/ANNOUNCE-3.1.3 b/ANNOUNCE-3.1.3 new file mode 100644 index 0000000..95b2b6c --- /dev/null +++ b/ANNOUNCE-3.1.3 @@ -0,0 +1,46 @@ +Subject: ANNOUNCE: mdadm 3.1.3 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.1.3 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git?p=mdadm + +This is a bugfix/stability release over 3.1.2 + +Significant changes are: + - mapfile now lives in a fixed location which defaults to + /dev/.mdadm/map but can be changed at compile time. This + location is chosen as most distros provide it during early + boot and preserve it through. As long as /dev exists and is + writable, /dev/.mdadm will be created. + Other files for communication with mdmon live here too. + This fixes a bug reported by Debian and Gentoo users where + udev would spin in early-boot. + - IMSM and DDF metadata will not be recognised on partitions + as they should only be used on whole-disks. + - Various overflows caused by 2G drives have been addressed. + - A subarray of an IMSM container can now be killed with + --kill-subarray. Also subarrays can be renamed with + --update-subarray + - -If (or --incremental --fail) can be used from udev to + fail and remove from all arrays a device which has been + unplugged from the system. i.e. hot-unplug-support. + - "mdadm /dev/mdX --re-add missing" will look for any device + that looks like it should be a member of /dev/mdX but isn't + and will automatically --re-add it + - Now compile with -Wextra to get extra warnings. 
+ - Lots of minor bug fixes, documentation improvements, etc. + +This release is believed to be stable and you should feel free to +upgrade to 3.1.3 + +It is expected that the next release will be 3.2 with a number of new +features. 3.1.4 will only happen if important bugs show up before 3.2 +is stable. + +NeilBrown 6th August 2010 diff --git a/ANNOUNCE-3.1.4 b/ANNOUNCE-3.1.4 new file mode 100644 index 0000000..c157a36 --- /dev/null +++ b/ANNOUNCE-3.1.4 @@ -0,0 +1,37 @@ +Subject: ANNOUNCE: mdadm 3.1.4 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.1.4 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git?p=mdadm + +This is a bugfix/stability release over 3.1.3. +3.1.3 had a couple of embarrassing regressions and a couple of other +issues surfaced which had easy fixes so I decided to make a 3.1.4 +release after all. + +Two fixes related to configs that aren't using udev: + - Don't remove md devices with 'standard' names on --stop + - Allow dev_open to work on read-only /dev +And fixed regressions: + - Allow --incremental to add spares to an array + - Accept --no-degraded as a deprecated option rather than + throwing an error + - Return correct success status when --incremental assembling + a container which does not yet have enough devices. + - Don't link mdadm with pthreads, only mdmon needs it. + - Fix compiler warning due to bad use of snprintf + - Fix spare migration + +This release is believed to be stable and you should feel free to +upgrade to 3.1.4 + +It is expected that the next release will be 3.2 with a number of new +features. 
+ +NeilBrown 31st August 2010 diff --git a/ANNOUNCE-3.1.5 b/ANNOUNCE-3.1.5 new file mode 100644 index 0000000..baa1f92 --- /dev/null +++ b/ANNOUNCE-3.1.5 @@ -0,0 +1,42 @@ +Subject: ANNOUNCE: mdadm 3.1.5 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.1.5 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git?p=mdadm + +This is a bugfix/stability release over 3.1.4. It contains all the +important bugfixes found while working on 3.2 and 3.2.1. It will be +the last 3.1.x release - 3.2.1 is expected to be released in a few days. + +Changes include: + - Fixes for v1.x metadata on big-endian machines. + - man page improvements + - Improve '--detail --export' when run on partitions of an md array. + - Fix regression with removing 'failed' or 'detached' devices. + - Fixes for "--assemble --force" in various unusual cases. + - Allow '-Y' to mean --export. This was documented but not implemented. + - Various fixed for handling 'ddf' metadata. This is now more reliable + but could benefit from more interoperability testing. + - Correctly list subarrays of a container in "--detail" output. + - Improve checks on whether the requested number of devices is supported + by the metadata - both for --create and --grow. + - Don't remove partitions from a device that is being included in an + array until we are fully committed to including it. + - Allow "--assemble --update=no-bitmap" so an array with a corrupt + bitmap can still be assembled. + - Don't allow --add to succeed if it looks like a "--re-add" is probably + wanted, but cannot succeed. This avoids inadvertently turning + devices into spares when an array is failed. 
+ +This release is believed to be stable and you should feel free to +upgrade to 3.1.5 + + +NeilBrown 23rd March 2011 + diff --git a/ANNOUNCE-3.2 b/ANNOUNCE-3.2 new file mode 100644 index 0000000..9e282bc --- /dev/null +++ b/ANNOUNCE-3.2 @@ -0,0 +1,77 @@ +Subject: ANNOUNCE: mdadm 3.2 - A tool for managing Soft RAID under Linux (DEVEL ONLY) + +I am pleased to announce the availability of + mdadm version 3.2 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm devel-3.2 + http://neil.brown.name/git?p=mdadm + +This is a "Developers only" release. Please don't consider using it +or making it available to others without reading the following. + + +By far the most significant change in this release related to the +management of reshaping arrays. This code has been substantially +re-written so that it can work with 'externally managed metadata' - +Intel's IMSM in particular. We now support level migration and +OnLine Capacity Expansion on these arrays. + +However, while the code largely works it has not been tested +exhaustively so there are likely to be problems. As the reshape code +for native metadata arrays was changed as part of this rewrite these +problems could also result in regressions for reshape of native +metadata. + +It is partly to encourage greater testing that this release is being +made. Any reports of problem - particular reproducible recipes for +triggering the problems - will be gratefully received. + +It is hopped that a "3.2.1" release will be available in early March +which will be a bugfix release over this and can be considered +suitable for general use. + +Other changes of note: + + - Policy framework. + Various policy statements can be made in the mdadm.conf to guide + the behaviour of mdadm, particular with regards to how new devices + are treated by "mdadm -I". 
+ Depending on the 'action' associated with a device (identified by + its 'path') such need devices can be automatically re-added to and + existing array that they previously fell out off, or automatically + added as a spare if they appear to contain no data. + + - mdadm now has a limited understanding of partition tables. This + allows the policy framework to make decisions about partitioned + devices as well. + + - --incremental --remove can be told what --path the device was on, + and this info will be recorded so that another device appearing at + the same physical location can be preferentially added to the same + array (provides the spare-same-slot action policy applied to the + path). + + - A new flags "--invalid-backup" flag is available in --assemble + mode. This can be used to re-assemble an array which was stopping + in the middle of a reshape, and for which the 'backup file' is no + longer available or is corrupted. The array may have some + corruption in it at the point where reshape was up to, but at least + the rest of the array will become available. + + + - Various internal restructuring - more is needed. + + +Any feed back and bug reports are always welcomed at: + linux-raid@vger.kernel.org + +And please: don't use this in production - particularly not the +--grow functionality. + +NeilBrown 1st February 2011 + + diff --git a/ANNOUNCE-3.2.1 b/ANNOUNCE-3.2.1 new file mode 100644 index 0000000..0e7826c --- /dev/null +++ b/ANNOUNCE-3.2.1 @@ -0,0 +1,75 @@ + + +I am pleased to announce the availability of + mdadm version 3.2.1 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git/mdadm + +Many of the changes in this release are of internal interest only, +restructuring and refactoring code and so forth. 
+ +Most of the bugs found and fixed during development for 3.2.1 have been +back-ported for the recently-release 3.1.5 so this release primarily +provides a few new features over 3.1.5. + +They include: + - policy framework + Policy can be expressed for moving spare devices between arrays, and + for how to handle hot-plugged devices. This policy can be different + for devices plugged in to different controllers etc. + This, for example, allows a configuration where when a device is plugged + in it is immediately included in an md array as a hot spare and + possibly starts recovery immediately if an array is degraded. + + - some understanding of mbr and gpt paritition tables + This is primarly to support the new hot-plug support. If a + device is plugged in and policy suggests it should have a partition table, + the partition table will be copied from a suitably similar device, and + then the partitions will hot-plug and can then be added to md arrays. + + - "--incremental --remove" can remember where a device was removed from + so if a device gets plugged back in the same place, special policy applies + to it, allowing it to be included in an array even if a general hotplug + will not be included. + + - enhanced reshape options, including growing a RAID0 by converting to RAID4, + restriping, and converting back. Also convertions between RAID0 and + RAID10 and between RAID1 and RAID10 are possible (with a suitably recent + kernel). + + - spare migration for IMSM arrays. + Spare migration can now work across 'containers' using non-native metadata + and specifically Intel's IMSM arrays support spare migrations. + + - OLCE and level migration for Intel IMSM arrays. + OnLine Capacity Expansion and level migration (e.g. RAID0 -> RAID5) is + supported for Intel Matrix Storage Manager arrays. + This support is currently 'experimental' for technical reasons. 
It can + be enabled with "export MDADM_EXPERIMENTAL=1" + + - avoid including wayward devices + If you split a RAID1, mount the two halves as two separate degraded RAID1s, + and then later bring the two back together, it is possible that the md + metadata won't properly show that one must over-ride the other. + mdadm now does extra checking to detect this possibilty and avoid + potentially corrupting data. + + - remove any possible confusion between similar options. + e.g. --brief and --bitmap were mapped to 'b' and mdadm wouldn't + notice if one was used where the other was expected. + + - allow K,M,G suffixes on chunk sizes + + +While mdadm-3.2.1 is considered to be reasonably stable, you should +only use it if you want to try out the new features, or if you +generally like to be on the bleeding edge. If the new features are not +important to you, then 3.1.5 is probably the appropriate version to be using +until 3.2.2 comes out. + +NeilBrown 28th March 2011 diff --git a/ANNOUNCE-3.2.2 b/ANNOUNCE-3.2.2 new file mode 100644 index 0000000..b70d18b --- /dev/null +++ b/ANNOUNCE-3.2.2 @@ -0,0 +1,36 @@ +Subject: ANNOUNCE: mdadm 3.2.2 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.2.2 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git/mdadm + +This release is largely a stablising release for the 3.2 series. +Many of the changes just fix bugs introduces in 3.2 or 3.2.1. + +There are some new features. They are: + - reshaping IMSM (Intel metadata) arrays is no longer 'experimental', + it should work properly and be largely compatible with IMSM drivers in + other platforms. + - --assume-clean can be used with --grow --size to avoid resyncing the + new part of the array. This is only support with very new kernels. 
+ - RAID0 arrays can have chunksize which is not a power of 2. This has been + supported in the kernel for a while but is only now supported by + mdadm. + + - A new tool 'raid6check' is available which can check a RAID6 array, + or part of it, and report which device is most inconsistent with the + others if any stripe is inconsistent. This is still under development + and does not have a man page yet. If anyone tries it out and has any + questions or experience to report, they would be most welcome on + linux-raid@vger.kernel.org. + +Future releases in the 3.2 series will only be made if bugfixes are needed. +The next release to add features is expected to be 3.3. + +NeilBrown 17th June 2011 diff --git a/ANNOUNCE-3.2.3 b/ANNOUNCE-3.2.3 new file mode 100644 index 0000000..8a8dba4 --- /dev/null +++ b/ANNOUNCE-3.2.3 @@ -0,0 +1,24 @@ +Subject: ANNOUNCE: mdadm 3.2.3 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.2.3 + +It is available at the usual places: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://neil.brown.name/mdadm + http://neil.brown.name/git/mdadm + +This release is largely a bugfix release for the 3.2 series with many +minor fixes with little or no impact. + +The largest single area of change is support for reshape of Intel +IMSM arrays (OnLine Capacity Expansion and Level Migration). +Among other fixes, this now has a better chance of surviving if a +device fails during reshape. + +Upgrading is recommended - particularly if you use mdadm for IMSM +arrays - but not essential. 
+ +NeilBrown 23rd December 2011 diff --git a/ANNOUNCE-3.2.4 b/ANNOUNCE-3.2.4 new file mode 100644 index 0000000..e321678 --- /dev/null +++ b/ANNOUNCE-3.2.4 @@ -0,0 +1,144 @@ +Subject: ANNOUNCE: mdadm 3.2.4 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.2.4 + +It is available at the usual places, now including github: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://github.com/neilbrown/mdadm + git://neil.brown.name/mdadm + http://neil.brown.name/git/mdadm + +This release is largely a bugfix release for the 3.2 series with many +minor fixes with little or no impact. + +"--oneline" log of changes is below. Some notable ones are: + + - --offroot argument to improve interactions between mdmon and initrd + - --prefer argument to select which /dev names to display in some + circumstances. + - relax restructions on when "--add" will be allowed + - Fix bug with adding write-intent-bitmap to active array + - Now defaults to "/run/mdadm" for storing run-time files. + +Upgrading is encouraged. + +The next mdadm release is expected to be 3.3 with a number of new +features. + +NeilBrown 9th May 2012 + +77b3ac8 monitor: make return from read_and_act more symbolic. +68226a8 monitor: ensure we retry soon when 'remove' fails. +8453f8d fix: Monitor sometimes crashes +90fa1a2 Work around gcc-4.7's strict aliasing checks +0c4304c fix: container creation with --incremental used. 
+5d1c7cd FIX: External metadata sometimes is not updated +3c20f98 FIX: mdmon check in reshape_container() can cause a problem +59ab9f5 FIX: Typo error in fprint command +9587c37 imsm: load_super_imsm_all function refactoring +ec50f7b imsm: load_imsm_super_all supports loading metadata from the device list +ca9de18 imsm: validate the number of imsm volumes per controller +30602f5 imsm: display fd in error trace when when store_imsm_mpb failes +eb155f6 mdmon: Use getopt_long() to parse command line options +08ca2ad Add --offroot argument to mdadm +da82751 Add --offroot argument to mdmon +a0963a8 Spawn mdmon with --offroot if mdadm was launched with --offroot +f878b24 imsm: fix, the second array need to have the whole available space on devices +d597705 getinfo_super1: Use MaxSector in place of sb->size +6ef8905 super1: make aread/awrite always use an aligned buffer. +de5a472 Remove avail_disks arg from 'enough'. +da8fe5a Assemble: fix --force assemble during reshape. +b10c663 config: fix handing of 'homehost' in AUTO line. +92d49ec FIX: NULL pointer to strdup() can be passed +d2bde6d imsm: FIX: No new missing disks are allowed during general migration +111e9fd FIX: Array is not run when expansion disks are added +bf5cf7c imsm: FIX: imsm_get_allowed_degradation() doesn't count degradation for raid1 +50927b1 Fix: Sometimes mdmon throws core dump during reshape +78340e2 Flush mdmon before next reshape step during container operation +e174219 imsm: FIX: Chunk size migration problem +f93346e FIX: use md position to reshape restart +6a75c8c imsm: FIX: use md position to reshape restart +51d83f5 imsm: FIX: Clear migration record when migration switches to next volume. +e1dd332 FIX: restart reshape when reshape process is stopped just between 2 reshapes +1ca90aa FIX: Do not try to (continue) reshape using inactive array +9f1b0f0 config: conf_match should ignore devname when not set. 
+d669228 Use posix_memalign() for memory used to write bitmaps +178950e FIX: Changes in '0' case for reshape position verification +9200d41 avoid double-free upon "old buggy kernel" sysfs_read failure +4011421 Print error message if failing to write super for 1.x metadata +0011874 Use MDMON_DIR for pid files created in Monitor.c +56d1885 Assemble: don't use O_EXCL until we have checked device content. +b720636 Assemble: support assembling of a RAID0 being reshaped. +c69ffac Manage: allow --re-add to failed array. +52f07f5 Reset bad flag on map update +911cead super1: support superblocks up to 4K. +ad6db3c Create: reduce the verbosity of 'default_layout'. +b2bfdfa super1.c don't keep recalculating bitmap pointer +4122675 Define and use SUPER1_SIZE for allocations +1afa930 init_super1() memset full buffer allocated for superblock +2de0b8a match_metadata_desc1(): Use calloc instead of malloc+memset +3c0bcd4 Use 4K buffer alignment for superblock allocations +308340a Use struct align_fd to cache fd's block size for aligned reads/writes +65ed615 match_metadata_desc0(): Use calloc instead of malloc+memset +de89706 Generalize ROUND_UP() macro and introduce matching ROUND_UP_PTR() +0a2f189 super1.c: use ROUND_UP/ROUND_UP_PTR +654a381 super-intel.c: Use ROUND_UP() instead of manually coding it +42d5dfd __write_init_super_ddf(): Use posix_memalign() instead of static aligned buffer +d4633e0 Examine: fix array size calculation for RAID10. +e62b778 Assemble: improve verbose logging when including old devices. +0073a6e Remove possible crash during RAID6 -> RAID5 reshape. +69fe207 Incremental: fix adding devices with --incremental +bcbb311 Manage: replace 'return 1' with 'goto abort'. +9f58469 Manage: freeze recovery while adding multiple devices. +ae6c05a Create: round off size for RAID1 arrays. +5ca3a90 Grow: print useful error when converting RAID1->RAID5 will fail. +c07d640 Fix tests/05r1-re-add-nosupper +2d762ad Fix the new ROUND_UP macro. 
+fd324b0 sysfs: fixed sysfs_freeze_array array to work properly with Manage_subdevs. +5551b11 imsm: avoid overflows for disks over 1TB +97f81ee clear hi bits if not used after loading metadata from disk +e03640b simplify calculating array_blocks +29cd082 show 2TB volumes/disks support in --detail-platform +2cc699a check volume size in validate_geometry_imsm_orom +9126b9a check that no disk over 2TB is used to create container when no support +027c374 imsm: set 2tb disk attribute for spare +3556c2f Fix typo: wan -> want +15632a9 parse_size: distinguish between 0 and error. +fbdef49 Bitmap_offset is a signed number +508a7f1 super1: leave more space in front of data by default. +40110b9 Fix two typos in fprintf messages +342460c mdadm man page: fix typo +0e7f69a imsm: display maximum volumes per controller and array +36fd8cc imsm: FIX: Update function imsm_num_data_members() for Raid1/10 +7abc987 imsm: FIX: Add volume size expand support to imsm_analyze_change() +f3871fd imsm: Add new metadata update for volume size expansion +54397ed imsm: Execute size change for external metatdata +016e00f FIX: Support metadata changes rollback +fbf3d20 imsm: FIX: Support metadata changes rollback +44f6f18 FIX: Extend size of raid0 array +7e7e9a4 FIX: Respect metadata size limitations +65a9798 FIX: Detect error and rollback metadata +13bcac9 imsm: Add function imsm_get_free_size() +b130333 imsm: Support setting max size for size change operation +c41e00b imsm: FIX: Component size alignment check +58d26a2 FIX: Size change is possible as standalone change only +4aecb54 FIX: Assembled second array is in read only state during reshape +ae2416e FIX: resolve make everything compilation error +480f356 Raid limit of 1024 when scanning for devices. +c2ecf5f Add --prefer option for --detail and --monitor +0a99975 Relax restrictions on when --add is permitted. 
+7ce0570 imsm: fix: rebuild does not continue after reboot +b51702b fix: correct extending size of raid0 array +34a1395 Fix sign extension of bitmap_offset in super1.c +012a864 Introduce sysfs_set_num_signed() and use it to set bitmap/offset +5d7b407 imsm: fix: thunderdome may drop 2tb attribute +5ffdc2d Update test for "is udev active". +96fd06e Adjust to new standard of /run +974e039 test: don't worry too much about array size. +b0a658f Grow: failing the set the per-device size is not an error. +36614e9 super-intel.c: Don't try to close negative fd +562aa10 super-intel.c: Fix resource leak from opendir() + diff --git a/ANNOUNCE-3.2.5 b/ANNOUNCE-3.2.5 new file mode 100644 index 0000000..396da12 --- /dev/null +++ b/ANNOUNCE-3.2.5 @@ -0,0 +1,31 @@ +Subject: ANNOUNCE: mdadm 3.2.5 - A tool for managing Soft RAID under Linux + +I am somewhat disappointed to have to announce the availability of + mdadm version 3.2.5 + +It is available at the usual places, now including github: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://github.com/neilbrown/mdadm + git://neil.brown.name/mdadm + http://neil.brown.name/git/mdadm + +This release primarily fixes a serious regression in 3.2.4. +This regression does *not* cause any risk to data. It simply +means that adding a device with "--add" would sometime fail +when it should not. + +The fix also includes a couple of minor fixes such as making +the "--layout=preserve" option to "--grow" work again. + +A reminder that the default location for runtime files is now +"/run/mdadm". If you compile this for a distro that does not +have "/run", you will need to compile with an alternate setting for +MAP_DIR. e.g. 
+ make MAP_DIR=/var/run/mdadm +or + make MAP_DIR=/dev/.mdadm + +NeilBrown 18th May 2012 + diff --git a/ANNOUNCE-3.2.6 b/ANNOUNCE-3.2.6 new file mode 100644 index 0000000..f5cfd49 --- /dev/null +++ b/ANNOUNCE-3.2.6 @@ -0,0 +1,57 @@ +Subject: ANNOUNCE: mdadm 3.2.6 - A tool for managing Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.2.6 + +It is available at the usual places, now including github: + countrycode=xx. + http://www.${countrycode}kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://github.com/neilbrown/mdadm + git://neil.brown.name/mdadm + http://neil.brown.name/git/mdadm + +This is a stability release which adds a number of bugfixes to 3.2.5. +There are no real stand-out fixes, just lots of little bits and pieces. + +Below is the "git log --oneline --reverse" list of changes since +3.2.5. + +NeilBrown 25th October 2012 + +b7e05d2 udev-rules: prevent systemd from mount devices before they are ready. +0d478e2 mdadm: Fix Segmentation fault. +42f0ca1 imsm: fix: correct checking volume's degradation +fcf2195 Monitor: fix inconsistencies in values for ->percent +5f862fb Monitor: Report NewArray when an array the disappeared, reappears. +6f51b1c Monitor: fix reporting for Fail vs FailSpare etc. +68ad53b mdmon: fix arg parsing. +517f135 Assemble: don't leak memory with fdlist. +090900c udev-rules: prevent systemd from mount devices before they are ready. +446e000 sha1.h: remove ansidecl.h header inclusion +ec894f5 Manage: zero metadata before adding to 'external' array. +3a84db5 ddf: allow a non-spare to be used to recovery a missing device. +c5d61ca ddf: hack to fix container recognition. +23084aa mdmon: fix arg processing for -a +c4e96a3 mdmon: allow --takeover when original was started with --offroot +80841df find_free_devnum: avoid auto-using names in /etc/mdadm.conf +c5c56d6 mapfile: fix mapfile rebuild for containers +aec89f6 fix segfaults in Detail() +2117ad1 Fix 'enough' function for RAID10. 
+0bc300d Use --offroot flag when assembling md arrays via --incrmental +ac78f24 Grow: make warning about old metadata more explicit. +14026ab Replace sha1.h with slightly older version. +6f6809f Add zlib license to crc32.c +5267ba0 Handles spaces in array names better. +c51f288 imsm: allow --assume-clean to work. +acf7076 Grow: allow --grow --continue to work for native metadata. +335d2a6 Grow: fix a couple of typos with --assume-clean usage +9ff1427 Fix open_container +3713633 mdadm: super0: do not override uuid with homehost +31bff58 Trivial bugfix and spelling fixes. +e1e539f Detail: don't report a faulty device as 'spare' or 'rebuilding'. +22a6461 super0: allow creation of array on 2TB+ devices. +a5d47a2 Create new md devices consistently +eb48676 Monitor: don't complain about non-monitorable arrays in mdadm.conf +ecdf2d7 Query: don't be confused by partition tables. +f7b75c1 Query: allow member of non-0.90 arrays to be better reported. diff --git a/ANNOUNCE-3.3 b/ANNOUNCE-3.3 new file mode 100644 index 0000000..f770aa1 --- /dev/null +++ b/ANNOUNCE-3.3 @@ -0,0 +1,63 @@ +Subject: ANNOUNCE: mdadm 3.3 - A tools for managing md Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.3 + +It is available at the usual places: + http://www.kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://github.com/neilbrown/mdadm + git://neil.brown.name/mdadm + http://git.neil.brown.name/git/mdadm + +This is a major new release so don't be too surprised if there are a +few issues. If I hear about them they will be fixed in 3.3.1. +git log reports nearly 500 changes since 3.2.6 so I won't list them +all. + +Some highlights are: + +- Some array reshapes can proceed without needing backup file. + This is done by changing the 'data_offset' so we never need to write + any data back over where it was before. If there is no "head space" + or "tail space" to allow data_offset to change, the old mechanism + with a backup file can still be used. 
+- RAID10 arrays can be reshaped to change the number of devices, + change the chunk size, or change the layout between 'near' + and 'offset'. + This will always change data_offset, and will fail if there is no + room for data_offset to be moved. +- "--assemble --update=metadata" can convert a 0.90 array to a 1.0 array. +- bad-block-logs are supported (but not heavily tested yet) +- "--assemble --update=revert-reshape" can be used to undo a reshape + that has just been started but isn't really wanted. This is very + new and while it passes basic tests it cannot be guaranteed. +- improved locking between --incremental and --assemble +- uses systemd to run "mdmon" if systemd is configured to do that. +- kernel names of md devices can be non-numeric. e.g. "md_home" rather than + "md0". This will probably confuse lots of other tools, so you need to + echo CREATE names=yes >> /etc/mdadm.conf + or the feature will not be used. (you also need a reasonably new kernel). +- "--stop" can be given a kernel name instead of a device name. i.e. + mdadm --stop md4 + will work even if /dev/md4 doesn't exist. +- "--detail --export" has some information about the devices in the array +- --dump and --restore can be used to backup and restore the metadata on an + array. +- Hot-replace is supported with + mdadm /dev/mdX --replace /dev/foo + and + mdadm /dev/mdX --replace /dev/foo --with /dev/bar +- Config file can be a directory in which case all "*.conf" files are + read in lexical order. + Default is to read /etc/mdadm.conf and then /etc/mdadm.conf.d + Thus + echo CREATE names=yes > /etc/mdadm.conf.d/names.conf + will also enable the use of named md devices. + +- Lots of improvements to DDF support including adding support for + RAID10 (thanks Martin Wilck). + +and lots of bugfixes and other little changes. 
+ +NeilBrown 3rd September 2013 diff --git a/ANNOUNCE-3.3.1 b/ANNOUNCE-3.3.1 new file mode 100644 index 0000000..7d5e666 --- /dev/null +++ b/ANNOUNCE-3.3.1 @@ -0,0 +1,23 @@ +Subject: ANNOUNCE: mdadm 3.3.1 - A tool for managing md Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.3.1 + +It is available at the usual places: + http://www.kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://github.com/neilbrown/mdadm + git://neil.brown.name/mdadm + http://git.neil.brown.name/git/mdadm.git + +The main changes are: + - lots of work on "DDF" support. Hopefully it will be more stable + now. Bug reports are always welcome. + - improved interactions with 'systemd'. Where possible, background + tasks are run from systemd (if it is present) rather than forking and + disassociating from the session. This is important because udev + doesn't really let you disassociate. + +though there are a number of other little bug fixes too. + +NeilBrown 5th June 2014 diff --git a/ANNOUNCE-3.3.2 b/ANNOUNCE-3.3.2 new file mode 100644 index 0000000..6b54961 --- /dev/null +++ b/ANNOUNCE-3.3.2 @@ -0,0 +1,16 @@ +Subject: ANNOUNCE: mdadm 3.3.2 - A tool for managing md Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.3.2 + +It is available at the usual places: + http://www.kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://github.com/neilbrown/mdadm + git://neil.brown.name/mdadm + http://git.neil.brown.name/git/mdadm.git + +Changes since 3.3.1 are mostly little bugfixes and some man-page +updates. 
+ +NeilBrown 21st August 2014 diff --git a/ANNOUNCE-3.3.3 b/ANNOUNCE-3.3.3 new file mode 100644 index 0000000..ac1b217 --- /dev/null +++ b/ANNOUNCE-3.3.3 @@ -0,0 +1,18 @@ +Subject: ANNOUNCE: mdadm 3.3.3 - A tool for managing md Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.3.3 + +It is available at the usual places: + http://www.kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://github.com/neilbrown/mdadm + git://neil.brown.name/mdadm + http://git.neil.brown.name/git/mdadm.git + +The 100 changes since 3.3.2 are mostly little bugfixes and some improvements +to the selftests. +raid6check now handles all RAID6 layouts including DDF correctly. +See git log for the rest. + +NeilBrown 24th July 2015 diff --git a/ANNOUNCE-3.3.4 b/ANNOUNCE-3.3.4 new file mode 100644 index 0000000..52b9456 --- /dev/null +++ b/ANNOUNCE-3.3.4 @@ -0,0 +1,37 @@ +Subject: ANNOUNCE: mdadm 3.3.4 - A tool for managing md Soft RAID under Linux + +I am somewhat disappointed to have to announce the availability of + mdadm version 3.3.4 + +It is available at the usual places: + http://www.kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://github.com/neilbrown/mdadm + git://neil.brown.name/mdadm + http://git.neil.brown.name/git/mdadm.git + +In mdadm-3.3 a change was made to how IMSM (Intel Matrix Storage +Manager) metadata was handled. Previously an IMSM array would only +be assembled if it was attached to an IMSM controller. + +In 3.3 this was relaxed as there are circumstances where the +controller is not properly detected. Unfortunately this has negative +consequences which have only just come to light. + +If you have an IMSM RAID1 configured and then disable RAID in the +BIOS, the metadata will remain on the devices. 
If you then install +some other OS on one device and then install Linux on the other, Linux +might eventually start noticing the IMSM metadata (depending a bit on whether +mdadm is included in the initramfs) and might start up the RAID1. This could +copy one device over the other, thus trashing one of the installations. + +Not good. + +So with this release IMSM arrays will only be assembled if attached to +an IMSM controller, or if "--force" is given to --assemble, or if the +environment variable IMSM_NO_PLATFORM is set (used primarily for +testing). + +I strongly recommend upgrading to 3.3.4 if you are using 3.3 or later. + +NeilBrown 3rd August 2015. diff --git a/ANNOUNCE-3.4 b/ANNOUNCE-3.4 new file mode 100644 index 0000000..2689732 --- /dev/null +++ b/ANNOUNCE-3.4 @@ -0,0 +1,24 @@ +Subject: ANNOUNCE: mdadm 3.4 - A tool for managing md Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 3.4 + +It is available at the usual places: + http://www.kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://github.com/neilbrown/mdadm + git://neil.brown.name/mdadm + http://git.neil.brown.name/git/mdadm + +The new second-level version number reflects significant new +functionality, particular support for journalled RAID5/6 and clustered +RAID1. This new support is probably still buggy. Please report bugs. + +There are also a number of fixes for Intel's IMSM metadata support, +and an assortment of minor bug fixes. + +I plan for this to be the last release of mdadm that I provide as I am +retiring from MD and mdadm maintenance. Jes Sorensen has volunteered +to oversee mdadm for the next while. Thanks Jes! 
+ +NeilBrown 28th January 2016 diff --git a/ANNOUNCE-4.0 b/ANNOUNCE-4.0 new file mode 100644 index 0000000..f79c540 --- /dev/null +++ b/ANNOUNCE-4.0 @@ -0,0 +1,22 @@ +Subject: ANNOUNCE: mdadm 4.0 - A tool for managing md Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 4.0 + +It is available at the usual places: + http://www.kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://git.kernel.org/pub/scm/utils/mdadm/mdadm.git + http://git.kernel.org/cgit/utils/mdadm/ + +The update in major version number primarily indicates this is a +release by its new maintainer. In addition it contains a large number +of fixes in particular for IMSM RAID and clustered RAID support. In +addition this release includes support for IMSM 4k sector drives, +failfast and better documentation for journaled RAID. + +This is my first release of mdadm. Please thank Neil Brown for his +previous work as maintainer and blame me for all the bugs I caused +since taking over. + +Jes Sorensen, 2017-01-09 diff --git a/ANNOUNCE-4.1 b/ANNOUNCE-4.1 new file mode 100644 index 0000000..a273b9a --- /dev/null +++ b/ANNOUNCE-4.1 @@ -0,0 +1,16 @@ +Subject: ANNOUNCE: mdadm 4.1 - A tool for managing md Soft RAID under Linux + +I am pleased to announce the availability of + mdadm version 4.1 + +It is available at the usual places: + http://www.kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://git.kernel.org/pub/scm/utils/mdadm/mdadm.git + http://git.kernel.org/cgit/utils/mdadm/ + +The update constitutes more than one year of enhancements and bug fixes +including for IMSM RAID, Partial Parity Log, clustered RAID support, +improved testing, and gcc-8 support. 
+ +Jes Sorensen, 2018-10-01 diff --git a/ANNOUNCE-4.2 b/ANNOUNCE-4.2 new file mode 100644 index 0000000..8b22d09 --- /dev/null +++ b/ANNOUNCE-4.2 @@ -0,0 +1,19 @@ +Subject: ANNOUNCE: mdadm 4.2 - A tool for managing md Soft RAID under Linux + +I am pleased to finally announce the availability of mdadm-4.2. +get 4.2 out the door soon. + +It is available at the usual places: + http://www.kernel.org/pub/linux/utils/raid/mdadm/ +and via git at + git://git.kernel.org/pub/scm/utils/mdadm/mdadm.git + http://git.kernel.org/cgit/utils/mdadm/ + +The release includes more than two years of development and bugfixes, +so it is difficult to remember everything. Highlights include +enhancements and bug fixes including for IMSM RAID, Partial Parity +Log, clustered RAID support, improved testing, and gcc-9 support. + +Thank you everyone who contributed to this release! + +Jes Sorensen, 2021-12-30 diff --git a/Assemble.c b/Assemble.c new file mode 100644 index 0000000..704b829 --- /dev/null +++ b/Assemble.c @@ -0,0 +1,2227 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2001-2016 Neil Brown <neilb@suse.com> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +#include "mdadm.h" +#include <ctype.h> + /* Message suffixes used by set_array_assembly_status(), looked up by INCR_* status through map_num(). */ +mapping_t assemble_statuses[] = { + { "but cannot be started", INCR_NO }, + { "but not safe to start", INCR_UNSAFE }, + { "and started", INCR_YES }, + { NULL, INCR_ALREADY } +}; + + +/** + * struct assembly_array_info - General, meaningful information for assembly. + * @name: Array name. + * @new_cnt: Count of drives known to be members, recently added. + * @preexist_cnt: Count of member drives in pre-assembled array. + * @exp_cnt: Count of known expansion targets. + * + * FIXME: @exp_new_cnt for recently added expansion targets. + */ +struct assembly_array_info { + char *name; + int new_cnt; + int preexist_cnt; + int exp_cnt; +}; + +/** + * set_array_assembly_status() - generate status of assembly for an array. + * @c: Global settings. + * @result: Pointer to status mask. + * @status: Status to be set/printed. + * @arr: Array information. + * + * Print status message to user or set it in @result if it is not NULL. + */ +static void set_array_assembly_status(struct context *c, + int *result, int status, + struct assembly_array_info *arr) +{ + int raid_disks = arr->preexist_cnt + arr->new_cnt; + char *status_msg = map_num(assemble_statuses, status); + /* In export mode the status is only OR-ed into *result (when result is given). */ + if (c->export && result) + *result |= status; + /* Nothing is printed in export mode or when verbose is negative. */ + if (c->export || c->verbose < 0) + return; + /* Report the device count, then the optional new/expansion counts and the status text. */ + pr_err("%s has been assembled with %d device%s", arr->name, + raid_disks, raid_disks == 1 ? 
"":"s"); + if (arr->preexist_cnt > 0) + fprintf(stderr, " (%d new)", arr->new_cnt); + if (arr->exp_cnt) + fprintf(stderr, " ( + %d for expansion)", arr->exp_cnt); + if (status_msg) + fprintf(stderr, " %s", status_msg); + fprintf(stderr, ".\n"); +} + /* Return 1 if found equals required, or if found is PREFIX:required with an acceptable PREFIX; return 0 otherwise. */ +static int name_matches(char *found, char *required, char *homehost, int require_homehost) +{ + /* See if the name found matches the required name, possibly + * prefixed with 'homehost' + */ + char *sep; + unsigned int l; + + if (strcmp(found, required)==0) + return 1; + sep = strchr(found, ':'); + if (!sep) + return 0; + l = sep - found; /* length of the prefix in front of the first colon */ + if (strncmp(found, "any:", 4) == 0 || + (homehost && strcmp(homehost, "any") == 0) || + !require_homehost || + (homehost && strlen(homehost) == l && + strncmp(found, homehost, l) == 0)) { + /* prefix is acceptable: it is any, homehost is any, homehost is not required, or the prefix equals homehost */ + if (strcmp(sep+1, required) == 0) + return 1; + } + return 0; +} + /* Return 1 if mdstat_read() lists an external-metadata subarray whose reference matches metadata_version, else 0. */ +static int is_member_busy(char *metadata_version) +{ + /* check if the given member array is active */ + struct mdstat_ent *mdstat = mdstat_read(0, 0); + struct mdstat_ent *ent; + int busy = 0; + + for (ent = mdstat; ent; ent = ent->next) { + if (ent->metadata_version == NULL) + continue; + if (strncmp(ent->metadata_version, "external:", 9) != 0) + continue; + if (!is_subarray(&ent->metadata_version[9])) + continue; + /* Skip first char - it can be '/' or '-' */ + if (strcmp(&ent->metadata_version[10], metadata_version+1) == 0) { + busy = 1; + break; + } + } + free_mdstat(mdstat); + + return busy; +} + /* Compare content against every field that is set in ident. Returns 1 if all set fields match, 0 at the first mismatch; a message is printed only when devname is not NULL. */ +static int ident_matches(struct mddev_ident *ident, + struct mdinfo *content, + struct supertype *tst, + char *homehost, int require_homehost, + char *update, char *devname) +{ + /* The uuid test is skipped when update is uuid, and an all-zero uuid in content is never reported as a mismatch. */ + if (ident->uuid_set && (!update || strcmp(update, "uuid")!= 0) && + same_uuid(content->uuid, ident->uuid, tst->ss->swapuuid)==0 && + memcmp(content->uuid, uuid_zero, sizeof(int[4])) != 0) { + if (devname) + pr_err("%s has wrong uuid.\n", devname); + return 0; + } + if (ident->name[0] && (!update || strcmp(update, "name")!= 0) 
&& + name_matches(content->name, ident->name, homehost, require_homehost)==0) { + if (devname) + pr_err("%s has wrong name.\n", devname); + return 0; + } + if (ident->super_minor != UnSet && + ident->super_minor != content->array.md_minor) { + if (devname) + pr_err("%s has wrong super-minor.\n", + devname); + return 0; + } + if (ident->level != UnSet && + ident->level != content->array.level) { + if (devname) + pr_err("%s has wrong raid level.\n", + devname); + return 0; + } + if (ident->raid_disks != UnSet && + content->array.raid_disks != 0 && /* metadata doesn't know how many to expect */ + ident->raid_disks!= content->array.raid_disks) { + if (devname) + pr_err("%s requires wrong number of drives.\n", + devname); + return 0; + } + if (ident->member && ident->member[0]) { + /* content->text_version must match */ + char *s = strchr(content->text_version+1, '/'); /* the text after this slash is compared with ident->member */ + if (s == NULL) { + if (devname) + pr_err("%s is not a container and one is required.\n", + devname); + return 0; + } else if (strcmp(ident->member, s+1) != 0) { + if (devname) + pr_err("skipping wrong member %s is %s\n", + content->text_version, devname); + return 0; + } + } + return 1; +} + /* Walk devlist and flag each entry through its used field: 1 = selected member, 2 = rejected, 3 = floating spare, 0 = unused. Updates *stp and *contentp. Returns the number of devices that were not already marked used on entry, or -1 when assembly is aborted. */ +static int select_devices(struct mddev_dev *devlist, + struct mddev_ident *ident, + struct supertype **stp, + struct mdinfo **contentp, + struct context *c, + int inargv, int auto_assem) +{ + struct mddev_dev *tmpdev; + int num_devs; + struct supertype *st = *stp; + struct mdinfo *content = NULL; + int report_mismatch = ((inargv && c->verbose >= 0) || c->verbose > 0); + struct domainlist *domains = NULL; + dev_t rdev; + + tmpdev = devlist; num_devs = 0; /* entries already used are re-marked 2 and skipped by the main loop; the rest are counted */ + while (tmpdev) { + if (tmpdev->used) + tmpdev->used = 2; + else + num_devs++; + tmpdev->disposition = 0; + tmpdev = tmpdev->next; + } + + /* first walk the list of devices to find a consistent set + * that match the criteria, if that is possible. + * We flag the ones we like with 'used'. + */ + for (tmpdev = devlist; + tmpdev; + tmpdev = tmpdev ? 
tmpdev->next : NULL) { + char *devname = tmpdev->devname; + int dfd; + struct supertype *tst; + struct dev_policy *pol = NULL; + int found_container = 0; + + if (tmpdev->used > 1) + continue; + + if (ident->container) { + if (ident->container[0] == '/' && + !same_dev(ident->container, devname)) { + if (report_mismatch) + pr_err("%s is not the container required (%s)\n", + devname, ident->container); + continue; + } + } else if (ident->devices && + !match_oneof(ident->devices, devname)) { + /* Note that we ignore the "device=" identifier if a + * "container=" is given. Checking both is unnecessarily + * complicated. + */ + if (report_mismatch) + pr_err("%s is not one of %s\n", devname, ident->devices); + continue; + } + + tst = dup_super(st); + + dfd = dev_open(devname, O_RDONLY); + if (dfd < 0) { + if (report_mismatch) + pr_err("cannot open device %s: %s\n", + devname, strerror(errno)); + tmpdev->used = 2; + } else if (!fstat_is_blkdev(dfd, devname, &rdev)) { + tmpdev->used = 2; + } else if (must_be_container(dfd)) { + if (st) { + /* already found some components, this cannot + * be another one. 
+ */ + if (report_mismatch) + pr_err("%s is a container, but we are looking for components\n", + devname); + tmpdev->used = 2; + } if (!tst && (tst = super_by_fd(dfd, NULL)) == NULL) { + if (report_mismatch) + pr_err("not a recognisable container: %s\n", + devname); + tmpdev->used = 2; + } else if (!tst->ss->load_container || + tst->ss->load_container(tst, dfd, NULL)) { + if (report_mismatch) + pr_err("no correct container type: %s\n", + devname); + tmpdev->used = 2; + } else if (auto_assem && + !conf_test_metadata(tst->ss->name, + (pol = devid_policy(rdev)), + tst->ss->match_home(tst, c->homehost) == 1)) { + if (report_mismatch) + pr_err("%s has metadata type %s for which auto-assembly is disabled\n", + devname, tst->ss->name); + tmpdev->used = 2; + } else + found_container = 1; + } else { + if (!tst && (tst = guess_super(dfd)) == NULL) { + if (report_mismatch) + pr_err("no recogniseable superblock on %s\n", + devname); + tmpdev->used = 2; + } else if ((tst->ignore_hw_compat = 0), + tst->ss->load_super(tst, dfd, + report_mismatch ? 
devname : NULL)) { + if (report_mismatch) + pr_err("no RAID superblock on %s\n", + devname); + tmpdev->used = 2; + } else if (tst->ss->compare_super == NULL) { + if (report_mismatch) + pr_err("Cannot assemble %s metadata on %s\n", + tst->ss->name, devname); + tmpdev->used = 2; + } else if (auto_assem && st == NULL && + !conf_test_metadata(tst->ss->name, + (pol = devid_policy(rdev)), + tst->ss->match_home(tst, c->homehost) == 1)) { + if (report_mismatch) + pr_err("%s has metadata type %s for which auto-assembly is disabled\n", + devname, tst->ss->name); + tmpdev->used = 2; + } + } + if (dfd >= 0) close(dfd); + if (tmpdev->used == 2) { + if (auto_assem || !inargv) + /* Ignore unrecognised devices during auto-assembly */ + goto loop; + if (ident->name[0] || + ident->super_minor != UnSet) + /* Ignore unrecognised device if looking for + * specific array */ + goto loop; + if (ident->uuid_set) + /* ignore unrecognized device if looking for + * specific uuid + */ + goto loop; + + pr_err("%s has no superblock - assembly aborted\n", + devname); + if (st) + st->ss->free_super(st); + dev_policy_free(pol); + domain_free(domains); + if (tst) + tst->ss->free_super(tst); + return -1; + } + + if (found_container) { + /* tmpdev is a container. 
We need to be either + * looking for a member, or auto-assembling + */ + /* should be safe to try an exclusive open now, we + * have rejected anything that some other mdadm might + * be looking at + */ + dfd = dev_open(devname, O_RDONLY | O_EXCL); + if (dfd < 0) { + if (report_mismatch) + pr_err("%s is busy - skipping\n", devname); + goto loop; + } + close(dfd); + + if (ident->container && ident->container[0] != '/') { + /* we have a uuid */ + int uuid[4]; + + content = *contentp; + tst->ss->getinfo_super(tst, content, NULL); + + if (!parse_uuid(ident->container, uuid) || + !same_uuid(content->uuid, uuid, tst->ss->swapuuid)) { + if (report_mismatch) + pr_err("%s has wrong UUID to be required container\n", + devname); + goto loop; + } + } + /* It is worth looking inside this container. + */ + if (c->verbose > 0) + pr_err("looking in container %s\n", + devname); + + for (content = tst->ss->container_content(tst, NULL); + content; + content = content->next) { + + if (!ident_matches(ident, content, tst, + c->homehost, c->require_homehost, + c->update, + report_mismatch ? 
devname : NULL)) + /* message already printed */; + else if (is_member_busy(content->text_version)) { + if (report_mismatch) + pr_err("member %s in %s is already assembled\n", + content->text_version, + devname); + } else if (content->array.state & (1<<MD_SB_BLOCK_VOLUME)) { + /* do not assemble arrays with unsupported configurations */ + pr_err("Cannot activate member %s in %s.\n", + content->text_version, + devname); + } else + break; + } + if (!content) { + tmpdev->used = 2; + goto loop; /* empty container */ + } + + st = tst; tst = NULL; + if (!auto_assem && inargv && tmpdev->next != NULL) { + pr_err("%s is a container, but is not only device given: confused and aborting\n", + devname); + st->ss->free_super(st); + dev_policy_free(pol); + domain_free(domains); + return -1; + } + if (c->verbose > 0) + pr_err("found match on member %s in %s\n", + content->text_version, devname); + + /* make sure we finished the loop */ + tmpdev = NULL; + goto loop; + } else { + content = *contentp; + tst->ss->getinfo_super(tst, content, NULL); + + if (!ident_matches(ident, content, tst, + c->homehost, c->require_homehost, + c->update, + report_mismatch ? devname : NULL)) + goto loop; + + if (auto_assem) { + /* Never auto-assemble things that conflict + * with mdadm.conf in some way + */ + struct mddev_ident *match; + int rv = 0; + + match = conf_match(tst, content, devname, + report_mismatch ? c->verbose : -1, + &rv); + if (!match && rv == 2) + goto loop; + if (match && match->devname && + strcasecmp(match->devname, "<ignore>") == 0) { + if (report_mismatch) + pr_err("%s is a member of an explicitly ignored array\n", + devname); + goto loop; + } + if (match && !ident_matches(match, content, tst, + c->homehost, c->require_homehost, + c->update, + report_mismatch ? 
devname : NULL)) + /* Array exists in mdadm.conf but some + * details don't match, so reject it + */ + goto loop; + } + + /* should be safe to try an exclusive open now, we + * have rejected anything that some other mdadm might + * be looking at + */ + dfd = dev_open(devname, O_RDONLY | O_EXCL); + if (dfd < 0) { + if (report_mismatch) + pr_err("%s is busy - skipping\n", devname); + goto loop; + } + close(dfd); + + if (st == NULL) + st = dup_super(tst); + if (st->minor_version == -1) + st->minor_version = tst->minor_version; + + if (memcmp(content->uuid, uuid_zero, + sizeof(int[4])) == 0) { + /* this is a floating spare. It cannot define + * an array unless there are no more arrays of + * this type to be found. It can be included + * in an array of this type though. + */ + tmpdev->used = 3; + goto loop; + } + + if (st->ss != tst->ss || + st->minor_version != tst->minor_version || + st->ss->compare_super(st, tst, 1) != 0) { + /* Some mismatch. If exactly one array matches this host, + * we can resolve on that one. + * Or, if we are auto assembling, we just ignore the second + * for now. 
+ */ + if (auto_assem) + goto loop; + if (c->homehost) { + int first = st->ss->match_home(st, c->homehost); + int last = tst->ss->match_home(tst, c->homehost); + if (first != last && + (first == 1 || last == 1)) { + /* We can do something */ + if (first) {/* just ignore this one */ + if (report_mismatch) + pr_err("%s misses out due to wrong homehost\n", + devname); + goto loop; + } else { /* reject all those sofar */ + struct mddev_dev *td; + if (report_mismatch) + pr_err("%s overrides previous devices due to good homehost\n", + devname); + for (td=devlist; td != tmpdev; td=td->next) + if (td->used == 1) + td->used = 0; + tmpdev->used = 1; + goto loop; + } + } + } + pr_err("superblock on %s doesn't match others - assembly aborted\n", + devname); + tst->ss->free_super(tst); + st->ss->free_super(st); + dev_policy_free(pol); + domain_free(domains); + return -1; + } + tmpdev->used = 1; + } + loop: + /* Collect domain information from members only */ + if (tmpdev && tmpdev->used == 1) { + if (!pol) + pol = devid_policy(rdev); + domain_merge(&domains, pol, tst?tst->ss->name:NULL); + } + dev_policy_free(pol); + pol = NULL; + if (tst) + tst->ss->free_super(tst); + } + + /* Check if we found some imsm spares but no members */ + if ((auto_assem || + (ident->uuid_set && + memcmp(uuid_zero, ident->uuid,sizeof(uuid_zero)) == 0)) && + (!st || !st->sb)) + for (tmpdev = devlist; tmpdev; tmpdev = tmpdev->next) { + if (tmpdev->used != 3) + continue; + tmpdev->used = 1; + content = *contentp; + + if (!st->sb) { + /* we need sb from one of the spares */ + int dfd = dev_open(tmpdev->devname, O_RDONLY); + if (dfd < 0 || + st->ss->load_super(st, dfd, NULL)) + tmpdev->used = 2; + close_fd(&dfd); + } + } + + /* Now reject spares that don't match domains of identified members */ + for (tmpdev = devlist; tmpdev; tmpdev = tmpdev->next) { + if (tmpdev->used != 3) + continue; + if (!stat_is_blkdev(tmpdev->devname, &rdev)) { + tmpdev->used = 2; + } else { + struct dev_policy *pol = 
devid_policy(rdev); + int dt = domain_test(domains, pol, NULL); + if (inargv && dt != 0) + /* take this spare as domains match + * if there are any */ + tmpdev->used = 1; + else if (!inargv && dt == 1) + /* device wasn't explicitly listed, so need + * explicit domain match - which we have */ + tmpdev->used = 1; + else + /* if domains don't match mark as unused */ + tmpdev->used = 0; + dev_policy_free(pol); + } + } + domain_free(domains); + *stp = st; + if (st && st->sb && content == *contentp) + st->ss->getinfo_super(st, content, NULL); + *contentp = content; + + return num_devs; +} + +struct devs { + char *devname; + int uptodate; /* set once we decide that this device is as + * recent as everything else in the array. + */ + int included; /* set if the device is already in the array + * due to a previous '-I' + */ + struct mdinfo i; +}; + +static int load_devices(struct devs *devices, char *devmap, + struct mddev_ident *ident, struct supertype **stp, + struct mddev_dev *devlist, struct context *c, + struct mdinfo *content, + int mdfd, char *mddev, + int *most_recentp, int *bestcntp, int **bestp, + int inargv) +{ + struct mddev_dev *tmpdev; + int devcnt = 0; + int nextspare = 0; + int bitmap_done = 0; + int most_recent = -1; + int bestcnt = 0; + int *best = *bestp; + struct supertype *st = *stp; + + for (tmpdev = devlist; tmpdev; tmpdev=tmpdev->next) { + char *devname = tmpdev->devname; + struct stat stb; + struct supertype *tst; + int i; + int dfd; + int disk_state; + + if (tmpdev->used != 1) + continue; + /* looks like a good enough match to update the super block if needed */ + if (c->update) { + /* prepare useful information in info structures */ + struct stat stb2; + int err; + fstat(mdfd, &stb2); + + if (strcmp(c->update, "uuid") == 0 && !ident->uuid_set) + random_uuid((__u8 *)ident->uuid); + + if (strcmp(c->update, "ppl") == 0 && + ident->bitmap_fd >= 0) { + pr_err("PPL is not compatible with bitmap\n"); + close(mdfd); + free(devices); + free(devmap); + 
return -1; + } + + dfd = dev_open(devname, + tmpdev->disposition == 'I' + ? O_RDWR : (O_RDWR|O_EXCL)); + + tst = dup_super(st); + if (dfd < 0 || tst->ss->load_super(tst, dfd, NULL) != 0) { + pr_err("cannot re-read metadata from %s - aborting\n", + devname); + if (dfd >= 0) + close(dfd); + close(mdfd); + free(devices); + free(devmap); + tst->ss->free_super(tst); + free(tst); + *stp = st; + return -1; + } + tst->ss->getinfo_super(tst, content, devmap + devcnt * content->array.raid_disks); + + memcpy(content->uuid, ident->uuid, 16); + strcpy(content->name, ident->name); + content->array.md_minor = minor(stb2.st_rdev); + + if (strcmp(c->update, "byteorder") == 0) + err = 0; + else if (strcmp(c->update, "home-cluster") == 0) { + tst->cluster_name = c->homecluster; + err = tst->ss->write_bitmap(tst, dfd, NameUpdate); + } else if (strcmp(c->update, "nodes") == 0) { + tst->nodes = c->nodes; + err = tst->ss->write_bitmap(tst, dfd, NodeNumUpdate); + } else if (strcmp(c->update, "revert-reshape") == 0 && + c->invalid_backup) + err = tst->ss->update_super(tst, content, + "revert-reshape-nobackup", + devname, c->verbose, + ident->uuid_set, + c->homehost); + else + err = tst->ss->update_super(tst, content, c->update, + devname, c->verbose, + ident->uuid_set, + c->homehost); + if (err < 0) { + if (err == -1) + pr_err("--update=%s not understood for %s metadata\n", + c->update, tst->ss->name); + tst->ss->free_super(tst); + free(tst); + close(mdfd); + close(dfd); + free(devices); + free(devmap); + *stp = st; + return -1; + } + if (strcmp(c->update, "uuid")==0 && + !ident->uuid_set) { + ident->uuid_set = 1; + memcpy(ident->uuid, content->uuid, 16); + } + if (tst->ss->store_super(tst, dfd)) + pr_err("Could not re-write superblock on %s.\n", + devname); + + if (strcmp(c->update, "uuid")==0 && + ident->bitmap_fd >= 0 && !bitmap_done) { + if (bitmap_update_uuid(ident->bitmap_fd, + content->uuid, + tst->ss->swapuuid) != 0) + pr_err("Could not update uuid on external bitmap.\n"); + else + 
bitmap_done = 1; + } + } else { + dfd = dev_open(devname, + tmpdev->disposition == 'I' + ? O_RDWR : (O_RDWR|O_EXCL)); + tst = dup_super(st); + + if (dfd < 0 || tst->ss->load_super(tst, dfd, NULL) != 0) { + pr_err("cannot re-read metadata from %s - aborting\n", + devname); + if (dfd >= 0) + close(dfd); + close(mdfd); + free(devices); + free(devmap); + tst->ss->free_super(tst); + free(tst); + *stp = st; + return -1; + } + tst->ss->getinfo_super(tst, content, devmap + devcnt * content->array.raid_disks); + } + + fstat(dfd, &stb); + close(dfd); + + if (c->verbose > 0) + pr_err("%s is identified as a member of %s, slot %d%s.\n", + devname, mddev, content->disk.raid_disk, + (content->disk.state & (1<<MD_DISK_REPLACEMENT)) ? " replacement":""); + devices[devcnt].devname = devname; + devices[devcnt].uptodate = 0; + devices[devcnt].included = (tmpdev->disposition == 'I'); + devices[devcnt].i = *content; + devices[devcnt].i.disk.major = major(stb.st_rdev); + devices[devcnt].i.disk.minor = minor(stb.st_rdev); + + disk_state = devices[devcnt].i.disk.state & ~((1<<MD_DISK_FAILFAST) | + (1<<MD_DISK_WRITEMOSTLY)); + if (disk_state == ((1<<MD_DISK_ACTIVE) | (1<<MD_DISK_SYNC))) { + if (most_recent < 0 || + devices[devcnt].i.events + > devices[most_recent].i.events) { + struct supertype *tmp = tst; + tst = st; + st = tmp; + most_recent = devcnt; + } + } + tst->ss->free_super(tst); + free(tst); + + if (content->array.level == LEVEL_MULTIPATH) + /* with multipath, the raid_disk from the superblock is meaningless */ + i = devcnt; + else + i = devices[devcnt].i.disk.raid_disk; + if (i+1 == 0 || i == MD_DISK_ROLE_JOURNAL) { + if (nextspare < content->array.raid_disks*2) + nextspare = content->array.raid_disks*2; + i = nextspare++; + } else { + /* i is raid_disk - double it so there is room for + * replacements */ + i *= 2; + if (devices[devcnt].i.disk.state & (1<<MD_DISK_REPLACEMENT)) + i++; + if (i >= content->array.raid_disks*2 && + i >= nextspare) + nextspare = i+1; + } + if (i < 
10000) { + if (i >= bestcnt) { + int newbestcnt = i+10; + int *newbest = xmalloc(sizeof(int)*newbestcnt); + int c; + for (c=0; c < newbestcnt; c++) + if (c < bestcnt) + newbest[c] = best[c]; + else + newbest[c] = -1; + if (best)free(best); + best = newbest; + bestcnt = newbestcnt; + } + if (best[i] >=0 && + devices[best[i]].i.events == + devices[devcnt].i.events && + (devices[best[i]].i.disk.minor != + devices[devcnt].i.disk.minor) && + st->ss == &super0 && + content->array.level != LEVEL_MULTIPATH) { + /* two different devices with identical superblock. + * Could be a mis-detection caused by overlapping + * partitions. fail-safe. + */ + pr_err("WARNING %s and %s appear to have very similar superblocks.\n" + " If they are really different, please --zero the superblock on one\n" + " If they are the same or overlap, please remove one from %s.\n", + devices[best[i]].devname, devname, + inargv ? "the list" : + "the\n DEVICE list in mdadm.conf" + ); + close(mdfd); + free(devices); + free(devmap); + *stp = st; + return -1; + } + if (best[i] == -1 || (devices[best[i]].i.events + < devices[devcnt].i.events)) + best[i] = devcnt; + else if (st->ss == &super_imsm) + best[i+1] = devcnt; + } + devcnt++; + } + if (most_recent >= 0) + *most_recentp = most_recent; + *bestcntp = bestcnt; + *bestp = best; + *stp = st; + return devcnt; +} + +static int force_array(struct mdinfo *content, + struct devs *devices, + int *best, int bestcnt, char *avail, + int most_recent, + struct supertype *st, + struct context *c) +{ + int okcnt = 0; + while (!enough(content->array.level, content->array.raid_disks, + content->array.layout, 1, + avail) || + (content->reshape_active && content->delta_disks > 0 && + !enough(content->array.level, (content->array.raid_disks + - content->delta_disks), + content->new_layout, 1, avail))) { + /* Choose the newest best drive which is + * not up-to-date, update the superblock + * and add it. 
+ */ + int fd; + struct supertype *tst; + unsigned long long current_events; + int chosen_drive = -1; + int i; + + for (i = 0; + i < content->array.raid_disks * 2 && i < bestcnt; + i += 2) { + int j = best[i]; + if (j < 0) + continue; + if (devices[j].uptodate) + continue; + if (devices[j].i.recovery_start != MaxSector) { + int delta; + if (!devices[j].i.reshape_active || + devices[j].i.delta_disks <= 0) + continue; + /* When increasing number of devices, an + * added device also appears to be + * recovering. It is safe to include it + * as long as it won't be a source of + * data. + * For now, just allow for last data + * devices in RAID4 or last devices in RAID4/5/6. + */ + delta = devices[j].i.delta_disks; + if (devices[j].i.array.level >= 4 && + devices[j].i.array.level <= 6 && + i/2 >= content->array.raid_disks - delta) + /* OK */; + else if (devices[j].i.array.level == 4 && + i/2 >= content->array.raid_disks - delta - 1) + /* OK */; + else + continue; + } else if (devices[j].i.reshape_active != + content->reshape_active || + (devices[j].i.reshape_active && + devices[j].i.reshape_progress != + content->reshape_progress)) + /* Here, it may be a source of data. If two + * devices claim different progresses, it + * means that reshape boundaries differ for + * their own devices. Kernel will only treat + * the first one as reshape progress and + * go on. It may cause disaster, so avoid it. + */ + continue; + if (chosen_drive < 0 || + devices[j].i.events + > devices[chosen_drive].i.events) + chosen_drive = j; + } + if (chosen_drive < 0) + break; + current_events = devices[chosen_drive].i.events; + add_another: + if (c->verbose >= 0) + pr_err("forcing event count in %s(%d) from %d up to %d\n", + devices[chosen_drive].devname, + devices[chosen_drive].i.disk.raid_disk, + (int)(devices[chosen_drive].i.events), + (int)(devices[most_recent].i.events)); + fd = dev_open(devices[chosen_drive].devname, + devices[chosen_drive].included ? 
O_RDWR + : (O_RDWR|O_EXCL)); + if (fd < 0) { + pr_err("Couldn't open %s for write - not updating\n", + devices[chosen_drive].devname); + devices[chosen_drive].i.events = 0; + continue; + } + tst = dup_super(st); + if (tst->ss->load_super(tst,fd, NULL)) { + close(fd); + pr_err("RAID superblock disappeared from %s - not updating.\n", + devices[chosen_drive].devname); + devices[chosen_drive].i.events = 0; + continue; + } + content->events = devices[most_recent].i.events; + tst->ss->update_super(tst, content, "force-one", + devices[chosen_drive].devname, c->verbose, + 0, NULL); + + if (tst->ss->store_super(tst, fd)) { + close(fd); + pr_err("Could not re-write superblock on %s\n", + devices[chosen_drive].devname); + devices[chosen_drive].i.events = 0; + tst->ss->free_super(tst); + continue; + } + close(fd); + devices[chosen_drive].i.events = devices[most_recent].i.events; + devices[chosen_drive].uptodate = 1; + avail[chosen_drive] = 1; + okcnt++; + tst->ss->free_super(tst); + /* If there are any other drives of the same vintage, + * add them in as well. 
We can't lose and we might gain + */ + for (i = 0; + i < content->array.raid_disks * 2 && i < bestcnt ; + i += 2) { + int j = best[i]; + if (j >= 0 && + !devices[j].uptodate && + devices[j].i.recovery_start == MaxSector && + devices[j].i.events == current_events && + ((!devices[j].i.reshape_active && + !content->reshape_active) || + (devices[j].i.reshape_active == + content->reshape_active && + devices[j].i.reshape_progress == + content->reshape_progress))) { + chosen_drive = j; + goto add_another; + } + } + } + return okcnt; +} + +static int start_array(int mdfd, + char *mddev, + struct mdinfo *content, + struct supertype *st, + struct mddev_ident *ident, + int *best, int bestcnt, + int chosen_drive, + struct devs *devices, + unsigned int okcnt, + unsigned int sparecnt, + unsigned int rebuilding_cnt, + unsigned int journalcnt, + struct context *c, + int clean, char *avail, + int start_partial_ok, + int err_ok, + int was_forced + ) +{ + int rv; + int i; + unsigned int req_cnt; + + if (content->journal_device_required && (content->journal_clean == 0)) { + if (!c->force) { + pr_err("Not safe to assemble with missing or stale journal device, consider --force.\n"); + return 1; + } + pr_err("Journal is missing or stale, starting array read only.\n"); + c->readonly = 1; + } + + if (content->consistency_policy == CONSISTENCY_POLICY_PPL) + clean = 1; + + rv = set_array_info(mdfd, st, content); + if (rv && !err_ok) { + pr_err("failed to set array info for %s: %s\n", + mddev, strerror(errno)); + return 1; + } + if (ident->bitmap_fd >= 0) { + if (ioctl(mdfd, SET_BITMAP_FILE, ident->bitmap_fd) != 0) { + pr_err("SET_BITMAP_FILE failed.\n"); + return 1; + } + } else if (ident->bitmap_file) { + /* From config file */ + int bmfd = open(ident->bitmap_file, O_RDWR); + if (bmfd < 0) { + pr_err("Could not open bitmap file %s\n", + ident->bitmap_file); + return 1; + } + if (ioctl(mdfd, SET_BITMAP_FILE, bmfd) != 0) { + pr_err("Failed to set bitmapfile for %s\n", mddev); + close(bmfd); + 
return 1; + } + close(bmfd); + } + + /* First, add the raid disks, but add the chosen one last */ + for (i = 0; i <= bestcnt; i++) { + int j; + if (i < bestcnt) { + j = best[i]; + if (j == chosen_drive) + continue; + } else + j = chosen_drive; + + if (j >= 0 && !devices[j].included) { + int dfd; + + dfd = dev_open(devices[j].devname, O_RDWR|O_EXCL); + if (dfd >= 0) { + remove_partitions(dfd); + close(dfd); + } + rv = add_disk(mdfd, st, content, &devices[j].i); + + if (rv) { + pr_err("failed to add %s to %s: %s\n", + devices[j].devname, mddev, + strerror(errno)); + if (errno == EINVAL && content->array.level == 0 && + content->array.layout != 0) { + cont_err("Possibly your kernel doesn't support RAID0 layouts.\n"); + cont_err("Please upgrade.\n"); + } + if (i < content->array.raid_disks * 2 || + i == bestcnt) + okcnt--; + else + sparecnt--; + } else if (c->verbose > 0) { + pr_err("added %s to %s as %d%s%s\n", + devices[j].devname, mddev, + devices[j].i.disk.raid_disk, + devices[j].uptodate?"": + " (possibly out of date)", + (devices[j].i.disk.state & + (1<<MD_DISK_REPLACEMENT)) ? + " replacement":""); + } + } else if (j >= 0) { + if (c->verbose > 0) + pr_err("%s is already in %s as %d\n", + devices[j].devname, mddev, + devices[j].i.disk.raid_disk); + } else if (c->verbose > 0 && + i < content->array.raid_disks * 2 && (i & 1) == 0) + pr_err("no uptodate device for slot %d of %s\n", + i/2, mddev); + } + + if (content->array.level == LEVEL_CONTAINER) { + sysfs_rules_apply(mddev, content); + if (c->verbose >= 0) { + pr_err("Container %s has been assembled with %d drive%s", + mddev, okcnt + sparecnt + journalcnt, + okcnt + sparecnt + journalcnt == 1 ? 
"" : "s"); + if (okcnt < (unsigned)content->array.raid_disks) + fprintf(stderr, " (out of %d)\n", + content->array.raid_disks); + else + fprintf(stderr, "\n"); + } + + if (st->ss->validate_container) { + struct mdinfo *devices_list; + struct mdinfo *info_devices; + unsigned int count; + + devices_list = NULL; + info_devices = xmalloc(sizeof(struct mdinfo) * + (okcnt + sparecnt)); + for (count = 0; count < okcnt + sparecnt; count++) { + info_devices[count] = devices[count].i; + info_devices[count].next = devices_list; + devices_list = &info_devices[count]; + } + if (st->ss->validate_container(devices_list)) + pr_err("Mismatch detected!\n"); + free(info_devices); + } + + st->ss->free_super(st); + sysfs_uevent(content, "change"); + if (err_ok && okcnt < (unsigned)content->array.raid_disks) + /* Was partial, is still partial, so signal an error + * to ensure we don't retry */ + return 1; + return 0; + } + + /* Get number of in-sync devices according to the superblock. + * We must have this number to start the array without -s or -R + */ + req_cnt = content->array.working_disks; + + if (c->runstop == 1 || + (c->runstop <= 0 && + (enough(content->array.level, content->array.raid_disks, + content->array.layout, clean, avail) && + (okcnt + rebuilding_cnt >= req_cnt || start_partial_ok)))) { + /* This array is good-to-go. + * If a reshape is in progress then we might need to + * continue monitoring it. In that case we start + * it read-only and let the grow code make it writable. 
+ */ + int rv; + + if (content->reshape_active && + !(content->reshape_active & RESHAPE_NO_BACKUP) && + content->delta_disks <= 0) { + if (!c->backup_file) { + pr_err("%s: Need a backup file to complete reshape of this array.\n", + mddev); + pr_err("Please provided one with \"--backup-file=...\"\n"); + if (c->update && + strcmp(c->update, "revert-reshape") == 0) + pr_err("(Don't specify --update=revert-reshape again, that part succeeded.)\n"); + return 1; + } + rv = sysfs_set_str(content, NULL, + "array_state", "readonly"); + if (rv == 0) + rv = Grow_continue(mdfd, st, content, + c->backup_file, 0, + c->freeze_reshape); + } else if (c->readonly && + sysfs_attribute_available(content, NULL, + "array_state")) { + rv = sysfs_set_str(content, NULL, + "array_state", "readonly"); + } else + rv = ioctl(mdfd, RUN_ARRAY, NULL); + reopen_mddev(mdfd); /* drop O_EXCL */ + if (rv == 0) { + sysfs_rules_apply(mddev, content); + if (c->verbose >= 0) { + pr_err("%s has been started with %d drive%s", + mddev, okcnt, okcnt==1?"":"s"); + if (okcnt < (unsigned)content->array.raid_disks) + fprintf(stderr, " (out of %d)", + content->array.raid_disks); + if (rebuilding_cnt) + fprintf(stderr, "%s %d rebuilding", + sparecnt?",":" and", + rebuilding_cnt); + if (sparecnt) + fprintf(stderr, " and %d spare%s", + sparecnt, + sparecnt == 1 ? 
"" : "s"); + if (content->journal_clean) + fprintf(stderr, " and %d journal", + journalcnt); + fprintf(stderr, ".\n"); + } + if (content->reshape_active && + content->array.level >= 4 && + content->array.level <= 6) { + /* might need to increase the size + * of the stripe cache - default is 256 + */ + int chunk_size = content->array.chunk_size; + + if (content->reshape_active && + content->new_chunk > chunk_size) + chunk_size = content->new_chunk; + if (256 < 4 * ((chunk_size+4065)/4096)) { + struct mdinfo *sra; + + sra = sysfs_read(mdfd, NULL, 0); + if (sra) + sysfs_set_num(sra, NULL, + "stripe_cache_size", + (4 * chunk_size / 4096) + 1); + sysfs_free(sra); + } + } + if (okcnt < (unsigned)content->array.raid_disks) { + /* If any devices did not get added + * because the kernel rejected them based + * on event count, try adding them + * again providing the action policy is + * 're-add' or greater. The bitmap + * might allow them to be included, or + * they will become spares. + */ + for (i = 0; i < bestcnt; i++) { + int j = best[i]; + if (j >= 0 && !devices[j].uptodate) { + if (!disk_action_allows(&devices[j].i, st->ss->name, act_re_add)) + continue; + rv = add_disk(mdfd, st, content, + &devices[j].i); + if (rv == 0 && c->verbose >= 0) + pr_err("%s has been re-added.\n", + devices[j].devname); + } + } + } + if (content->array.level == 6 && + okcnt + 1 == (unsigned)content->array.raid_disks && + was_forced) { + struct mdinfo *sra; + + sra = sysfs_read(mdfd, NULL, 0); + if (sra) + sysfs_set_str(sra, NULL, + "sync_action", "repair"); + sysfs_free(sra); + } + return 0; + } + pr_err("failed to RUN_ARRAY %s: %s\n", mddev, strerror(errno)); + if (errno == 524 /* ENOTSUP */ && + content->array.level == 0 && content->array.layout == 0) + cont_err("Please use --update=layout-original or --update=layout-alternate\n"); + + if (!enough(content->array.level, content->array.raid_disks, + content->array.layout, 1, avail)) + pr_err("Not enough devices to start the array.\n"); + 
else if (!enough(content->array.level, + content->array.raid_disks, + content->array.layout, clean, avail)) + pr_err("Not enough devices to start the array while not clean - consider --force.\n"); + + return 1; + } + if (c->runstop == -1) { + pr_err("%s assembled from %d drive%s", + mddev, okcnt, okcnt == 1 ? "" : "s"); + if (okcnt != (unsigned)content->array.raid_disks) + fprintf(stderr, " (out of %d)", + content->array.raid_disks); + fprintf(stderr, ", but not started.\n"); + return 2; + } + if (c->verbose >= -1) { + pr_err("%s assembled from %d drive%s", + mddev, okcnt, okcnt == 1 ? "" : "s"); + if (rebuilding_cnt) + fprintf(stderr, "%s %d rebuilding", + sparecnt ? "," : " and", rebuilding_cnt); + if (sparecnt) + fprintf(stderr, " and %d spare%s", sparecnt, + sparecnt == 1 ? "" : "s"); + if (!enough(content->array.level, content->array.raid_disks, + content->array.layout, 1, avail)) + fprintf(stderr, " - not enough to start the array.\n"); + else if (!enough(content->array.level, + content->array.raid_disks, + content->array.layout, clean, avail)) + fprintf(stderr, " - not enough to start the array while not clean - consider --force.\n"); + else { + if (req_cnt == (unsigned)content->array.raid_disks) + fprintf(stderr, " - need all %d to start it", + req_cnt); + else + fprintf(stderr, " - need %d to start", req_cnt); + fprintf(stderr, " (use --run to insist).\n"); + } + } + return 1; +} + +int Assemble(struct supertype *st, char *mddev, + struct mddev_ident *ident, + struct mddev_dev *devlist, + struct context *c) +{ + /* + * The task of Assemble is to find a collection of + * devices that should (according to their superblocks) + * form an array, and to give this collection to the MD driver. + * In Linux-2.4 and later, this involves submitting a + * SET_ARRAY_INFO ioctl with no arg - to prepare + * the array - and then submit a number of + * ADD_NEW_DISK ioctls to add disks into + * the array. Finally RUN_ARRAY might + * be submitted to start the array. 
+ * + * Much of the work of Assemble is in finding and/or + * checking the disks to make sure they look right. + * + * If mddev is not set, then scan must be set and we + * read through the config file for dev+uuid mapping + * We recurse, setting mddev, for each device that + * - isn't running + * - has a valid uuid (or any uuid if !uuidset) + * + * If mddev is set, we try to determine state of md. + * check version - must be at least 0.90.0 + * check kernel version. must be at least 2.4. + * If not, we can possibly fall back on START_ARRAY + * Try to GET_ARRAY_INFO. + * If possible, give up + * If not, try to STOP_ARRAY just to make sure + * + * If !uuidset and scan, look in conf-file for uuid + * If not found, give up + * If !devlist and scan and uuidset, get list of devs from conf-file + * + * For each device: + * Check superblock - discard if bad + * Check uuid (set if we don't have one) - discard if no match + * Check superblock similarity if we have a superblock - discard if different + * Record events, devicenum + * This should give us a list of devices for the array + * We should collect the most recent event number + * + * Count disks with recent enough event count + * While force && !enough disks + * Choose newest rejected disks, update event count + * mark clean and rewrite superblock + * If recent kernel: + * SET_ARRAY_INFO + * foreach device with recent events : ADD_NEW_DISK + * if runstop == 1 || "enough" disks and runstop==0 -> RUN_ARRAY + * If old kernel: + * Check the device numbers in superblock are right + * update superblock if any changes + * START_ARRAY + * + */ + int rv = -1; + int mdfd = -1; + int clean; + int auto_assem = (mddev == NULL && !ident->uuid_set && + ident->super_minor == UnSet && ident->name[0] == 0 && + (ident->container == NULL || ident->member == NULL)); + struct devs *devices = NULL; + char *devmap; + int *best = NULL; /* indexed by raid_disk */ + int bestcnt = 0; + int devcnt; + unsigned int okcnt, sparecnt, rebuilding_cnt, 
replcnt, journalcnt; + int journal_clean = 0; + int i; + int was_forced = 0; + int most_recent = 0; + int chosen_drive; + int change = 0; + int inargv = 0; + int start_partial_ok = (c->runstop >= 0) && + (c->force || devlist==NULL || auto_assem); + int num_devs; + struct mddev_dev *tmpdev; + struct mdinfo info; + struct mdinfo *content = NULL; + struct mdinfo *pre_exist = NULL; + char *avail; + char *name = NULL; + char chosen_name[1024]; + struct map_ent *map = NULL; + struct map_ent *mp; + + /* + * If any subdevs are listed, then any that don't + * match ident are discarded. Remainder must all match and + * become the array. + * If no subdevs, then we scan all devices in the config file, but + * there must be something in the identity + */ + + if (!devlist && + ident->uuid_set == 0 && + (ident->super_minor < 0 || ident->super_minor == UnSet) && + ident->name[0] == 0 && + (ident->container == NULL || ident->member == NULL) && + ident->devices == NULL) { + pr_err("No identity information available for %s - cannot assemble.\n", + mddev ? mddev : "further assembly"); + return 1; + } + + if (devlist == NULL) + devlist = conf_get_devs(); + else if (mddev) + inargv = 1; + +try_again: + /* We come back here when doing auto-assembly and attempting some + * set of devices failed. Those are now marked as ->used==2 and + * we ignore them and try again + */ + if (!st && ident->st) + st = ident->st; + if (c->verbose>0) + pr_err("looking for devices for %s\n", + mddev ? mddev : "further assembly"); + + content = &info; + if (st && c->force) + st->ignore_hw_compat = 1; + num_devs = select_devices(devlist, ident, &st, &content, c, + inargv, auto_assem); + if (num_devs < 0) + return 1; + + if (!st || !st->sb || !content) + return 2; + + /* We have a full set of devices - we now need to find the + * array device. 
+ * However there is a risk that we are racing with "mdadm -I" + * and the array is already partially assembled - we will have + * rejected any devices already in this address. + * So we take a lock on the map file - to prevent further races - + * and look for the uuid in there. If found and the array is + * active, we abort. If found and the array is not active + * we commit to that md device and add all the contained devices + * to our list. We flag them so that we don't try to re-add, + * but can remove if they turn out to not be wanted. + */ + if (map_lock(&map)) + pr_err("failed to get exclusive lock on mapfile - continue anyway...\n"); + if (c->update && strcmp(c->update,"uuid") == 0) + mp = NULL; + else + mp = map_by_uuid(&map, content->uuid); + if (mp) { + struct mdinfo *dv; + /* array already exists. */ + pre_exist = sysfs_read(-1, mp->devnm, GET_LEVEL|GET_DEVS); + if (pre_exist->array.level != UnSet) { + pr_err("Found some drive for an array that is already active: %s\n", + mp->path); + pr_err("giving up.\n"); + goto out; + } + for (dv = pre_exist->devs; dv; dv = dv->next) { + /* We want to add this device to our list, + * but it could already be there if "mdadm -I" + * started *after* we checked for O_EXCL. + * If we add it to the top of the list + * it will be preferred over later copies. 
+ */ + struct mddev_dev *newdev; + char *devname = map_dev(dv->disk.major, + dv->disk.minor, + 0); + if (!devname) + continue; + newdev = xmalloc(sizeof(*newdev)); + newdev->devname = devname; + newdev->disposition = 'I'; + newdev->used = 1; + newdev->next = devlist; + devlist = newdev; + num_devs++; + } + strcpy(chosen_name, mp->path); + if (c->verbose > 0 || mddev == NULL || + strcmp(mddev, chosen_name) != 0) + pr_err("Merging with already-assembled %s\n", + chosen_name); + mdfd = open_dev_excl(mp->devnm); + } else { + int trustworthy = FOREIGN; + name = content->name; + switch (st->ss->match_home(st, c->homehost) + ?: st->ss->match_home(st, "any")) { + case 1: + trustworthy = LOCAL; + name = strchr(content->name, ':'); + if (name) + name++; + else + name = content->name; + break; + } + if (mddev && map_by_name(&map, mddev) != NULL) { + pr_err("Cannot create device with %s because is in use\n", mddev); + goto out; + } + if (!auto_assem) + /* If the array is listed in mdadm.conf or on + * command line, then we trust the name + * even if the array doesn't look local + */ + trustworthy = LOCAL; + + if (name[0] == 0 && + content->array.level == LEVEL_CONTAINER) { + name = content->text_version; + trustworthy = METADATA; + } + + if (name[0] && trustworthy != LOCAL && + ! 
c->require_homehost && + conf_name_is_free(name)) + trustworthy = LOCAL; + + if (trustworthy == LOCAL && + strchr(name, ':')) + /* Ignore 'host:' prefix of name */ + name = strchr(name, ':')+1; + + mdfd = create_mddev(mddev, name, ident->autof, trustworthy, + chosen_name, 0); + } + if (mdfd < 0) { + st->ss->free_super(st); + if (auto_assem) + goto try_again; + goto out; + } + mddev = chosen_name; + if (pre_exist == NULL) { + if (mddev_busy(fd2devnm(mdfd))) { + pr_err("%s already active, cannot restart it!\n", + mddev); + for (tmpdev = devlist ; + tmpdev && tmpdev->used != 1; + tmpdev = tmpdev->next) + ; + if (tmpdev && auto_assem) + pr_err("%s needed for %s...\n", + mddev, tmpdev->devname); + close(mdfd); + mdfd = -3; + st->ss->free_super(st); + if (auto_assem) + goto try_again; + goto out; + } + /* just incase it was started but has no content */ + ioctl(mdfd, STOP_ARRAY, NULL); + } + + if (content != &info) { + /* This is a member of a container. Try starting the array. */ + int err; + err = assemble_container_content(st, mdfd, content, c, + chosen_name, NULL); + close(mdfd); + return err; + } + + /* Ok, no bad inconsistancy, we can try updating etc */ + devices = xcalloc(num_devs, sizeof(*devices)); + devmap = xcalloc(num_devs, content->array.raid_disks); + devcnt = load_devices(devices, devmap, ident, &st, devlist, + c, content, mdfd, mddev, + &most_recent, &bestcnt, &best, inargv); + if (devcnt < 0) { + mdfd = -3; + /* + * devices is already freed in load_devices, so set devices + * to NULL to avoid double free devices. + */ + devices = NULL; + goto out; + } + + if (devcnt == 0) { + pr_err("no devices found for %s\n", + mddev); + if (st) + st->ss->free_super(st); + free(devmap); + goto out; + } + + if (c->update && strcmp(c->update, "byteorder")==0) + st->minor_version = 90; + + st->ss->getinfo_super(st, content, NULL); + clean = content->array.state & 1; + + /* now we have some devices that might be suitable. 
+ * I wonder how many + */ + avail = xcalloc(content->array.raid_disks, 1); + okcnt = 0; + replcnt = 0; + sparecnt=0; + journalcnt=0; + rebuilding_cnt=0; + for (i=0; i< bestcnt; i++) { + int j = best[i]; + int event_margin = 1; /* always allow a difference of '1' + * like the kernel does + */ + if (j < 0) continue; + /* note: we ignore error flags in multipath arrays + * as they don't make sense + */ + if (content->array.level != LEVEL_MULTIPATH) { + if (devices[j].i.disk.state & (1<<MD_DISK_JOURNAL)) { + if (content->journal_device_required) + journalcnt++; + else /* unexpected journal, mark as faulty */ + devices[j].i.disk.state |= (1<<MD_DISK_FAULTY); + } else if (!(devices[j].i.disk.state & (1<<MD_DISK_ACTIVE))) { + if (!(devices[j].i.disk.state + & (1<<MD_DISK_FAULTY))) { + devices[j].uptodate = 1; + sparecnt++; + } + continue; + } + } + /* If this device thinks that 'most_recent' has failed, then + * we must reject this device. + */ + if (j != most_recent && !c->force && + content->array.raid_disks > 0 && + devices[most_recent].i.disk.raid_disk >= 0 && + devmap[j * content->array.raid_disks + devices[most_recent].i.disk.raid_disk] == 0) { + if (c->verbose > -1) + pr_err("ignoring %s as it reports %s as failed\n", + devices[j].devname, devices[most_recent].devname); + best[i] = -1; + continue; + } + /* Require event counter to be same as, or just less than, + * most recent. If it is bigger, it must be a stray spare and + * should be ignored. 
+ */ + if (devices[j].i.events+event_margin >= + devices[most_recent].i.events && + devices[j].i.events <= + devices[most_recent].i.events + ) { + devices[j].uptodate = 1; + if (devices[j].i.disk.state & (1<<MD_DISK_JOURNAL)) + journal_clean = 1; + if (i < content->array.raid_disks * 2) { + if (devices[j].i.recovery_start == MaxSector || + (content->reshape_active && + i >= content->array.raid_disks - content->delta_disks)) { + if (!avail[i/2]) { + okcnt++; + avail[i/2]=1; + } else + replcnt++; + } else + rebuilding_cnt++; + } else if (devices[j].i.disk.raid_disk != MD_DISK_ROLE_JOURNAL) + sparecnt++; + } + } + free(devmap); + if (c->force) { + int force_ok = force_array(content, devices, best, bestcnt, + avail, most_recent, st, c); + okcnt += force_ok; + if (force_ok) + was_forced = 1; + } + /* Now we want to look at the superblock which the kernel will base things on + * and compare the devices that we think are working with the devices that the + * superblock thinks are working. + * If there are differences and --force is given, then update this chosen + * superblock. + */ + chosen_drive = -1; + st->ss->free_super(st); + for (i=0; chosen_drive < 0 && i<bestcnt; i+=2) { + int j = best[i]; + int fd; + + if (j<0) + continue; + if (!devices[j].uptodate) + continue; + if (devices[j].i.events < devices[most_recent].i.events) + continue; + chosen_drive = j; + if ((fd=dev_open(devices[j].devname, + devices[j].included ? 
O_RDONLY + : (O_RDONLY|O_EXCL)))< 0) { + pr_err("Cannot open %s: %s\n", + devices[j].devname, strerror(errno)); + goto out; + } + if (st->ss->load_super(st,fd, NULL)) { + close(fd); + pr_err("RAID superblock has disappeared from %s\n", + devices[j].devname); + goto out; + } + close(fd); + } + if (st->sb == NULL) { + pr_err("No suitable drives found for %s\n", mddev); + goto out; + } + st->ss->getinfo_super(st, content, NULL); + if (sysfs_init(content, mdfd, NULL)) { + pr_err("Unable to initialize sysfs\n"); + goto out; + } + + /* after reload context, store journal_clean in context */ + content->journal_clean = journal_clean; + for (i=0; i<bestcnt; i++) { + int j = best[i]; + unsigned int desired_state; + + if (j < 0) + continue; + if (devices[j].i.disk.raid_disk == MD_DISK_ROLE_JOURNAL) + desired_state = (1<<MD_DISK_JOURNAL); + else if (i >= content->array.raid_disks * 2) + desired_state = 0; + else if (i & 1) + desired_state = (1<<MD_DISK_ACTIVE) | (1<<MD_DISK_REPLACEMENT); + else + desired_state = (1<<MD_DISK_ACTIVE) | (1<<MD_DISK_SYNC); + + desired_state |= devices[j].i.disk.state & ((1<<MD_DISK_FAILFAST) | + (1<<MD_DISK_WRITEMOSTLY)); + + if (!devices[j].uptodate) + continue; + + devices[j].i.disk.state = desired_state; + if (!(devices[j].i.array.state & 1)) + clean = 0; + + if (st->ss->update_super(st, &devices[j].i, "assemble", NULL, + c->verbose, 0, NULL)) { + if (c->force) { + if (c->verbose >= 0) + pr_err("clearing FAULTY flag for device %d in %s for %s\n", + j, mddev, devices[j].devname); + change = 1; + } else { + if (c->verbose >= -1) + pr_err("device %d in %s has wrong state in superblock, but %s seems ok\n", + i, mddev, devices[j].devname); + } + } +#if 0 + if (!(super.disks[i].i.disk.state & (1 << MD_DISK_FAULTY))) { + pr_err("devices %d of %s is not marked FAULTY in superblock, but cannot be found\n", + i, mddev); + } +#endif + } + if (c->force && !clean && + !enough(content->array.level, content->array.raid_disks, + content->array.layout, clean, + 
avail)) { + change += st->ss->update_super(st, content, "force-array", + devices[chosen_drive].devname, c->verbose, + 0, NULL); + was_forced = 1; + clean = 1; + } + + if (change) { + int fd; + fd = dev_open(devices[chosen_drive].devname, + devices[chosen_drive].included ? + O_RDWR : (O_RDWR|O_EXCL)); + if (fd < 0) { + pr_err("Could not open %s for write - cannot Assemble array.\n", + devices[chosen_drive].devname); + goto out; + } + if (st->ss->store_super(st, fd)) { + close(fd); + pr_err("Could not re-write superblock on %s\n", + devices[chosen_drive].devname); + goto out; + } + if (c->verbose >= 0) + pr_err("Marking array %s as 'clean'\n", + mddev); + close(fd); + } + + /* If we are in the middle of a reshape we may need to restore saved data + * that was moved aside due to the reshape overwriting live data + * The code of doing this lives in Grow.c + */ + if (content->reshape_active && + !(content->reshape_active & RESHAPE_NO_BACKUP)) { + int err = 0; + int *fdlist = xmalloc(sizeof(int)* bestcnt); + if (c->verbose > 0) + pr_err("%s has an active reshape - checking if critical section needs to be restored\n", + chosen_name); + if (!c->backup_file) + c->backup_file = locate_backup(content->sys_name); + enable_fds(bestcnt/2); + for (i = 0; i < bestcnt/2; i++) { + int j = best[i*2]; + if (j >= 0) { + fdlist[i] = dev_open(devices[j].devname, + devices[j].included + ? 
O_RDWR : (O_RDWR|O_EXCL)); + if (fdlist[i] < 0) { + pr_err("Could not open %s for write - cannot Assemble array.\n", + devices[j].devname); + err = 1; + break; + } + } else + fdlist[i] = -1; + } + if (!err) { + if (st->ss->external && st->ss->recover_backup) + err = st->ss->recover_backup(st, content); + else + err = Grow_restart(st, content, fdlist, bestcnt/2, + c->backup_file, c->verbose > 0); + if (err && c->invalid_backup) { + if (c->verbose > 0) + pr_err("continuing without restoring backup\n"); + err = 0; + } + } + while (i>0) { + i--; + if (fdlist[i]>=0) close(fdlist[i]); + } + free(fdlist); + if (err) { + pr_err("Failed to restore critical section for reshape, sorry.\n"); + if (c->backup_file == NULL) + cont_err("Possibly you needed to specify the --backup-file\n"); + goto out; + } + } + + /* Almost ready to actually *do* something */ + /* First, fill in the map, so that udev can find our name + * as soon as we become active. + */ + if (c->update && strcmp(c->update, "metadata")==0) { + content->array.major_version = 1; + content->array.minor_version = 0; + strcpy(content->text_version, "1.0"); + } + + map_update(&map, fd2devnm(mdfd), content->text_version, + content->uuid, chosen_name); + + rv = start_array(mdfd, mddev, content, + st, ident, best, bestcnt, + chosen_drive, devices, okcnt, sparecnt, + rebuilding_cnt, journalcnt, + c, + clean, avail, start_partial_ok, + pre_exist != NULL, + was_forced); + if (rv == 1 && !pre_exist) + ioctl(mdfd, STOP_ARRAY, NULL); + free(devices); +out: + map_unlock(&map); + if (rv == 0) { + wait_for(chosen_name, mdfd); + close(mdfd); + if (auto_assem) { + int usecs = 1; + /* There is a nasty race with 'mdadm --monitor'. + * If it opens this device before we close it, + * it gets an incomplete open on which IO + * doesn't work and the capacity is + * wrong. + * If we reopen (to check for layered devices) + * before --monitor closes, we loose. + * + * So: wait upto 1 second for there to be + * a non-zero capacity. 
+ */ + while (usecs < 1000) { + mdfd = open(mddev, O_RDONLY); + if (mdfd >= 0) { + unsigned long long size; + if (get_dev_size(mdfd, NULL, &size) && + size > 0) + break; + close(mdfd); + } + usleep(usecs); + usecs <<= 1; + } + } + } else if (mdfd >= 0) + close(mdfd); + + /* '2' means 'OK, but not started yet' */ + if (rv == -1) { + free(devices); + return 1; + } + return rv == 2 ? 0 : rv; +} + +/* + * assemble_container_content() - configure one array through sysfs and try + * to start it. + * + * @st: metadata handler; st->ss supplies the optional validate_ppl, + * write_init_ppl and set_bitmap hooks used below + * @mdfd: descriptor of the md device, used for the sysfs lookups and wait_for() + * @content: description of the array and its member devices (content->devs) + * @c: command context (force, readonly, runstop, verbose, backup_file, ...) + * @chosen_name: array name used in messages, the map file and wait_for() + * @result: passed through to set_array_assembly_status() + * + * Returns 0 on success and non-zero on failure. + */ +int assemble_container_content(struct supertype *st, int mdfd, + struct mdinfo *content, struct context *c, + char *chosen_name, int *result) +{ + struct mdinfo *dev, *sra, *dev2; + struct assembly_array_info array = {chosen_name, 0, 0, 0}; + int old_raid_disks; + int start_reshape; + char *avail; + int err; + int is_raid456, is_clean, all_disks; + + if (sysfs_init(content, mdfd, NULL)) { + pr_err("Unable to initialize sysfs\n"); + return 1; + } + + sra = sysfs_read(mdfd, NULL, GET_VERSION|GET_DEVS); + if (sra == NULL || strcmp(sra->text_version, content->text_version) != 0) { + if (content->array.major_version == -1 && + content->array.minor_version == -2 && + c->readonly && + content->text_version[0] == '/') + content->text_version[0] = '-'; + if (sysfs_set_array(content, 9003) != 0) { + sysfs_free(sra); + return 1; + } + } + + /* There are two types of reshape: container wide or sub-array specific + * Check if metadata requests blocking container wide reshapes + */ + start_reshape = (content->reshape_active && + !((content->reshape_active == CONTAINER_RESHAPE) && + (content->array.state & (1<<MD_SB_BLOCK_CONTAINER_RESHAPE)))); + + /* Block subarray here if it is under reshape now + * Do not allow for any changes in this array + */ + if (st->ss->external && content->recovery_blocked && start_reshape) + block_subarray(content); + + /* sra may be NULL here (a failed sysfs_read() is tolerated above); + * in that case there are no old devices to drop. + */ + for (dev2 = sra ? sra->devs : NULL; dev2; dev2 = dev2->next) { + for (dev = content->devs; dev; dev = dev->next) + if (dev2->disk.major == dev->disk.major && + dev2->disk.minor == dev->disk.minor) + break; + if (dev) + continue; + /* Don't want this one any
more */ + if (sysfs_set_str(sra, dev2, "slot", "none") < 0 && + errno == EBUSY) { + pr_err("Cannot remove old device %s: not updating %s\n", dev2->sys_name, sra->sys_name); + sysfs_free(sra); + return 1; + } + sysfs_set_str(sra, dev2, "state", "remove"); + } + old_raid_disks = content->array.raid_disks - content->delta_disks; + /* avail[] holds one byte per raid slot; 1 marks a slot that has a device */ + avail = xcalloc(content->array.raid_disks, 1); + for (dev = content->devs; dev; dev = dev->next) { + if (dev->disk.raid_disk >= 0) + avail[dev->disk.raid_disk] = 1; + if (sysfs_add_disk(content, dev, 1) == 0) { + if (dev->disk.raid_disk >= old_raid_disks && + content->reshape_active) + array.exp_cnt++; + else + array.new_cnt++; + } else if (errno == EEXIST) + array.preexist_cnt++; + } + sysfs_free(sra); + + all_disks = array.new_cnt + array.exp_cnt + array.preexist_cnt; + + map_update(NULL, fd2devnm(mdfd), content->text_version, + content->uuid, chosen_name); + + if (content->consistency_policy == CONSISTENCY_POLICY_PPL && + st->ss->validate_ppl) { + content->array.state |= 1; + err = 0; + + for (dev = content->devs; dev; dev = dev->next) { + int dfd; + char *devpath; + int ret; + + ret = st->ss->validate_ppl(st, content, dev); + if (ret == 0) + continue; + + if (ret < 0) { + err = 1; + break; + } + + if (!c->force) { + pr_err("%s contains invalid PPL - consider --force or --update-subarray with --update=no-ppl\n", + chosen_name); + content->array.state &= ~1; + avail[dev->disk.raid_disk] = 0; + break; + } + + /* have --force - overwrite the invalid ppl */ + devpath = map_dev(dev->disk.major, dev->disk.minor, 0); + dfd = dev_open(devpath, O_RDWR); + if (dfd < 0) { + pr_err("Failed to open %s\n", devpath); + err = 1; + break; + } + + err = st->ss->write_init_ppl(st, content, dfd); + close(dfd); + + if (err) + break; + } + + if (err) { + free(avail); + return err; + } + } else if (c->force) { + /* Set the array as 'clean' so that we can proceed with starting + * it even if we don't have all devices.
Mdmon doesn't care + * if the dirty flag is set in metadata, it will start managing + * it anyway. + * This is really important for raid456 (RWH case), other levels + * are started anyway. + */ + content->array.state |= 1; + } + + is_raid456 = (content->array.level >= 4 && content->array.level <= 6); + is_clean = content->array.state & 1; + + if (enough(content->array.level, content->array.raid_disks, + content->array.layout, is_clean, avail) == 0) { + set_array_assembly_status(c, result, INCR_NO, &array); + + if (c->verbose >= 0 && is_raid456 && !is_clean) + pr_err("Consider --force to start dirty degraded array\n"); + + free(avail); + return 1; + } + free(avail); + + if (c->runstop <= 0 && all_disks < content->array.working_disks) { + + set_array_assembly_status(c, result, INCR_UNSAFE, &array); + + if (c->verbose >= 0 && c->force) + pr_err("Consider --run to start array as degraded.\n"); + return 1; + } + + if (is_raid456 && content->resync_start != MaxSector && c->force && + all_disks < content->array.raid_disks) { + + content->resync_start = MaxSector; + err = sysfs_set_num(content, NULL, "resync_start", MaxSector); + if (err) + return 1; + + pr_err("%s array state forced to clean. It may cause data corruption.\n", + chosen_name); + } + + /* + * Before activating the array, perform extra steps required + * to configure the internal write-intent bitmap. + */ + if (content->consistency_policy == CONSISTENCY_POLICY_BITMAP && + st->ss->set_bitmap) + st->ss->set_bitmap(st, content); + + if (start_reshape) { + int spare = content->array.raid_disks + array.exp_cnt; + if (restore_backup(st, content, + array.new_cnt, + spare, &c->backup_file, c->verbose) == 1) + return 1; + + if (content->reshape_progress == 0) { + /* If reshape progress is 0 - we are assembling the + * array that was stopped, before reshape has started. + * Array needs to be started as active, Grow_continue() + * will start the reshape.
+ */ + sysfs_set_num(content, NULL, "reshape_position", + MaxSector); + err = sysfs_set_str(content, NULL, + "array_state", "active"); + sysfs_set_num(content, NULL, "reshape_position", 0); + } else { + err = sysfs_set_str(content, NULL, + "array_state", "readonly"); + } + + if (err) + return 1; + + if (st->ss->external) { + if (!mdmon_running(st->container_devnm)) + start_mdmon(st->container_devnm); + ping_monitor(st->container_devnm); + if (mdmon_running(st->container_devnm) && + st->update_tail == NULL) + st->update_tail = &st->updates; + } + + err = Grow_continue(mdfd, st, content, c->backup_file, + 0, c->freeze_reshape); + } else switch(content->array.level) { + case LEVEL_LINEAR: + case LEVEL_MULTIPATH: + case 0: + err = sysfs_set_str(content, NULL, "array_state", + c->readonly ? "readonly" : "active"); + break; + default: + err = sysfs_set_str(content, NULL, "array_state", + "readonly"); + /* start mdmon if needed. */ + if (!err) { + if (!mdmon_running(st->container_devnm)) + start_mdmon(st->container_devnm); + ping_monitor(st->container_devnm); + } + break; + } + if (!err) + sysfs_set_safemode(content, content->safe_mode_delay); + + /* Block subarray here if it is not reshaped now + * It has to be blocked a little later to allow mdmon to switch + * in to R/W state + */ + if (st->ss->external && content->recovery_blocked && + !start_reshape) + block_subarray(content); + + if (err) + set_array_assembly_status(c, result, INCR_NO, &array); + else { + set_array_assembly_status(c, result, INCR_YES, &array); + wait_for(chosen_name, mdfd); + sysfs_rules_apply(chosen_name, content); + } + + return err; + /* FIXME should have an O_EXCL and wait for read-auto */ +} @@ -0,0 +1,227 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays.
+ * + * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +#include "mdadm.h" + +/* + * Build() - create and start an md array that has no superblocks. + * + * @mddev: requested device name, handed to create_mddev() + * @devlist: member devices in slot order; an entry named "missing" leaves + * its slot empty + * @s: requested geometry (level, raiddisks, layout, chunk, size, bitmap_file) + * @c: command context (autof, verbose, delay) + * + * Defaults are written back into @s (layout, chunk, size). + * Returns 0 once the array is running, 1 on any failure; after the md device + * has been created a failure also issues STOP_ARRAY on it. + */ +int Build(char *mddev, struct mddev_dev *devlist, + struct shape *s, struct context *c) +{ + /* Build a linear or raid0 arrays without superblocks + * We cannot really do any checks, we just do it. + * For md_version < 0.90.0, we call REGISTER_DEV + * with the device numbers, and then + * START_MD giving the "geometry" + * geometry is 0xpp00cc + * where pp is personality: 1==linear, 2=raid0 + * cc = chunk size factor: 0==4k, 1==8k etc.
+ */ + int i; + dev_t rdev; + int subdevs = 0, missing_disks = 0; + struct mddev_dev *dv; + int bitmap_fd; + unsigned long long bitmapsize; + int mdfd; + char chosen_name[1024]; + int uuid[4] = {0,0,0,0}; + struct map_ent *map = NULL; + mdu_array_info_t array; + mdu_param_t param; /* not used by syscall */ + + if (s->level == UnSet) { + pr_err("a RAID level is needed to Build an array.\n"); + return 1; + } + /* scan all devices, make sure they really are block devices */ + for (dv = devlist; dv; dv=dv->next) { + subdevs++; + if (strcmp("missing", dv->devname) == 0) { + missing_disks++; + continue; + } + if (!stat_is_blkdev(dv->devname, NULL)) + return 1; + } + + if (s->raiddisks != subdevs) { + pr_err("requested %d devices in array but listed %d\n", + s->raiddisks, subdevs); + return 1; + } + + if (s->layout == UnSet) + switch(s->level) { + default: /* no layout */ + s->layout = 0; + break; + case 10: + s->layout = 0x102; /* near=2, far=1 */ + if (c->verbose > 0) + pr_err("layout defaults to n2\n"); + break; + case 5: + case 6: + s->layout = map_name(r5layout, "default"); + if (c->verbose > 0) + pr_err("layout defaults to %s\n", map_num(r5layout, s->layout)); + break; + case LEVEL_FAULTY: + s->layout = map_name(faultylayout, "default"); + + if (c->verbose > 0) + pr_err("layout defaults to %s\n", map_num(faultylayout, s->layout)); + break; + } + + /* We need to create the device. It can have no name.
+ */ + map_lock(&map); + mdfd = create_mddev(mddev, NULL, c->autof, LOCAL, + chosen_name, 0); + if (mdfd < 0) { + map_unlock(&map); + return 1; + } + mddev = chosen_name; + + map_update(&map, fd2devnm(mdfd), "none", uuid, chosen_name); + map_unlock(&map); + + array.level = s->level; + if (s->size == MAX_SIZE) + s->size = 0; + array.size = s->size; + array.nr_disks = s->raiddisks; + array.raid_disks = s->raiddisks; + array.md_minor = 0; + if (fstat_is_blkdev(mdfd, mddev, &rdev)) + array.md_minor = minor(rdev); + array.not_persistent = 1; + array.state = 0; /* not clean, but no errors */ + if (s->assume_clean) + array.state |= 1; + array.active_disks = s->raiddisks - missing_disks; + array.working_disks = s->raiddisks - missing_disks; + array.spare_disks = 0; + array.failed_disks = missing_disks; + if (s->chunk == 0 && (s->level==0 || s->level==LEVEL_LINEAR)) + s->chunk = 64; + array.chunk_size = s->chunk*1024; + array.layout = s->layout; + if (md_set_array_info(mdfd, &array)) { + pr_err("md_set_array_info() failed for %s: %s\n", + mddev, strerror(errno)); + goto abort; + } + + if (s->bitmap_file && strcmp(s->bitmap_file, "none") == 0) + s->bitmap_file = NULL; + if (s->bitmap_file && s->level <= 0) { + pr_err("bitmaps not meaningful with level %s\n", + map_num(pers, s->level)?:"given"); + goto abort; + } + /* now add the devices */ + for ((i=0), (dv = devlist) ; dv ; i++, dv=dv->next) { + mdu_disk_info_t disk; + unsigned long long dsize; + int fd; + + if (strcmp("missing", dv->devname) == 0) + continue; + if (!stat_is_blkdev(dv->devname, &rdev)) + goto abort; + fd = open(dv->devname, O_RDONLY|O_EXCL); + if (fd < 0) { + pr_err("Cannot open %s: %s\n", + dv->devname, strerror(errno)); + goto abort; + } + /* shrink s->size to the smallest member seen so far */ + if (get_dev_size(fd, NULL, &dsize) && + (s->size == 0 || s->size == MAX_SIZE || dsize < s->size)) + s->size = dsize; + close(fd); + disk.number = i; + disk.raid_disk = i; + disk.state = (1<<MD_DISK_SYNC) | (1<<MD_DISK_ACTIVE); + if (dv->writemostly == FlagSet) +
disk.state |= 1<<MD_DISK_WRITEMOSTLY; + disk.major = major(rdev); + disk.minor = minor(rdev); + if (ioctl(mdfd, ADD_NEW_DISK, &disk)) { + pr_err("ADD_NEW_DISK failed for %s: %s\n", + dv->devname, strerror(errno)); + goto abort; + } + } + /* now to start it */ + if (s->bitmap_file) { + bitmap_fd = open(s->bitmap_file, O_RDWR); + if (bitmap_fd < 0) { + int major = BITMAP_MAJOR_HI; +#if 0 + if (s->bitmap_chunk == UnSet) { + pr_err("%s cannot be opened.\n", s->bitmap_file); + goto abort; + } +#endif + bitmapsize = s->size >> 9; /* FIXME wrong for RAID10 */ + if (CreateBitmap(s->bitmap_file, 1, NULL, + s->bitmap_chunk, c->delay, + s->write_behind, bitmapsize, major)) { + goto abort; + } + bitmap_fd = open(s->bitmap_file, O_RDWR); + if (bitmap_fd < 0) { + pr_err("%s cannot be opened.\n", s->bitmap_file); + goto abort; + } + } + if (bitmap_fd >= 0) { + if (ioctl(mdfd, SET_BITMAP_FILE, bitmap_fd) < 0) { + pr_err("Cannot set bitmap file for %s: %s\n", + mddev, strerror(errno)); + goto abort; + } + } + } + if (ioctl(mdfd, RUN_ARRAY, &param)) { + pr_err("RUN_ARRAY failed: %s\n", strerror(errno)); + if (s->chunk & (s->chunk - 1)) { + cont_err("Problem may be that chunk size is not a power of 2\n"); + } + goto abort; + } + + if (c->verbose >= 0) + pr_err("array %s built and started.\n", + mddev); + wait_for(mddev, mdfd); + close(mdfd); + return 0; + + abort: + ioctl(mdfd, STOP_ARRAY, 0); + close(mdfd); + return 1; +} @@ -0,0 +1,339 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it.
By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. 
If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. 
You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. 
If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. 
(This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. 
Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. 
+ +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. 
If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. 
+ + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. 
Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. diff --git a/ChangeLog b/ChangeLog new file mode 100644 index 0000000..a3bf700 --- /dev/null +++ b/ChangeLog @@ -0,0 +1,306 @@ +Please see git logs for detailed change log. +This file just contains highlight. + +Changes Prior to release 3.3 +- Some array reshapes can proceed without needing backup file. + This is done by changing the 'data_offset' so we never need to write + any data back over where it was before. If there is no "head space" + or "tail space" to allow data_offset to change, the old mechanism + with a backup file can still be used. +- RAID10 arrays can be reshaped to change the number of devices, + change the chunk size, or change the layout between 'near' + and 'offset'. + This will always change data_offset, and will fail if there is no + room for data_offset to be moved. +- "--assemble --update=metadata" can convert a 0.90 array to a 1.0 array. +- bad-block-logs are supported (but not heavily tested yet) +- "--assemble --update=revert-reshape" can be used to undo a reshape + that has just been started but isn't really wanted. 
This is very + new and while it passes basic tests it cannot be guaranteed. +- improved locking between --incremental and --assemble +- uses systemd to run "mdmon" if systemd is configured to do that. +- kernel names of md devices can be non-numeric. e.g. "md_home" rather than + "md0". This will probably confuse lots of other tools, so you need to + echo CREATE names=yes >> /etc/mdadm.conf + or the feature will not be used. (you also need a reasonably new kernel). +- "--stop" can be given a kernel name instead of a device name. i.e. + mdadm --stop md4 + will work even if /dev/md4 doesn't exist. +- "--detail --export" has some information about the devices in the array +- --dump and --restore can be used to backup and restore the metadata on an + array. +- Hot-replace is supported with + mdadm /dev/mdX --replace /dev/foo + and + mdadm /dev/mdX --replace /dev/foo --with /dev/bar +- Config file can be a directory in which case all "*.conf" files are + read in lexical order. + Default is to read /etc/mdadm.conf and then /etc/mdadm.conf.d + Thus + echo CREATE names=yes > /etc/mdadm.conf.d/names.conf + will also enable the use of named md devices. + +- Lots of improvements to DDF support including adding support for + RAID10 (thanks Martin Wilck). + +Changes Prior to release 3.2.6 + - There are no real stand-out fixes, just lots of little bits and pieces. + +Changes Prior to release 3.2.5 + - This release primarily fixes a serious regression in 3.2.4. + This regression does *not* cause any risk to data. It simply + means that adding a device with "--add" would sometimes fail + when it should not. + + - The fix also includes a couple of minor fixes such as making + the "--layout=preserve" option to "--grow" work again. + + +Changes Prior to release 3.2.4 +"--oneline" log of changes is below. Some notable ones are: + + - --offroot argument to improve interactions between mdmon and initrd + - --prefer argument to select which /dev names to display in some + circumstances.
+ - relax restrictions on when "--add" will be allowed + - Fix bug with adding write-intent-bitmap to active array + - Now defaults to "/run/mdadm" for storing run-time files. + +Changes Prior to release 3.2.3 + - The largest single area of change is support for reshape of Intel + IMSM arrays (OnLine Capacity Expansion and Level Migration). + - Among other fixes, this now has a better chance of surviving if a + device fails during reshape. + +Changes Prior to release 3.2.2 + - reshaping IMSM (Intel metadata) arrays is no longer 'experimental', + it should work properly and be largely compatible with IMSM drivers in + other platforms. + - --assume-clean can be used with --grow --size to avoid resyncing the + new part of the array. This is only supported with very new kernels. + - RAID0 arrays can have chunksize which is not a power of 2. This has been + supported in the kernel for a while but is only now supported by + mdadm. + + - A new tool 'raid6check' is available which can check a RAID6 array, + or part of it, and report which device is most inconsistent with the + others if any stripe is inconsistent. This is still under development + and does not have a man page yet. If anyone tries it out and has any + questions or experience to report, they would be most welcome on + linux-raid@vger.kernel.org. + +Changes Prior to release 3.2.1 + - policy framework + Policy can be expressed for moving spare devices between arrays, and + for how to handle hot-plugged devices. This policy can be different + for devices plugged in to different controllers etc. + This, for example, allows a configuration where when a device is plugged + in it is immediately included in an md array as a hot spare and + possibly starts recovery immediately if an array is degraded. + + - some understanding of mbr and gpt partition tables + This is primarily to support the new hot-plug support.
If a + device is plugged in and policy suggests it should have a partition table, + the partition table will be copied from a suitably similar device, and + then the partitions will hot-plug and can then be added to md arrays. + + - "--incremental --remove" can remember where a device was removed from + so if a device gets plugged back in the same place, special policy applies + to it, allowing it to be included in an array even if a general hotplug + will not be included. + + - enhanced reshape options, including growing a RAID0 by converting to RAID4, + restriping, and converting back. Also conversions between RAID0 and + RAID10 and between RAID1 and RAID10 are possible (with a suitably recent + kernel). + + - spare migration for IMSM arrays. + Spare migration can now work across 'containers' using non-native metadata + and specifically Intel's IMSM arrays support spare migrations. + + - OLCE and level migration for Intel IMSM arrays. + OnLine Capacity Expansion and level migration (e.g. RAID0 -> RAID5) is + supported for Intel Matrix Storage Manager arrays. + This support is currently 'experimental' for technical reasons. It can + be enabled with "export MDADM_EXPERIMENTAL=1" + + - avoid including wayward devices + If you split a RAID1, mount the two halves as two separate degraded RAID1s, + and then later bring the two back together, it is possible that the md + metadata won't properly show that one must over-ride the other. + mdadm now does extra checking to detect this possibility and avoid + potentially corrupting data. + + - remove any possible confusion between similar options. + e.g. --brief and --bitmap were mapped to 'b' and mdadm wouldn't + notice if one was used where the other was expected. + + - allow K,M,G suffixes on chunk sizes + +Changes Prior to release 3.2 + - By far the most significant change in this release relates to the + management of reshaping arrays.
This code has been substantially + re-written so that it can work with 'externally managed metadata' - + Intel's IMSM in particular. We now support level migration and + OnLine Capacity Expansion on these arrays. + - Policy framework. + Various policy statements can be made in the mdadm.conf to guide + the behaviour of mdadm, particularly with regard to how new devices + are treated by "mdadm -I". + Depending on the 'action' associated with a device (identified by + its 'path') such new devices can be automatically re-added to an + existing array that they previously fell out of, or automatically + added as a spare if they appear to contain no data. + + - mdadm now has a limited understanding of partition tables. This + allows the policy framework to make decisions about partitioned + devices as well. + + - --incremental --remove can be told what --path the device was on, + and this info will be recorded so that another device appearing at + the same physical location can be preferentially added to the same + array (provided the spare-same-slot action policy is applied to the + path). + + - A new "--invalid-backup" flag is available in --assemble + mode. This can be used to re-assemble an array which was stopped + in the middle of a reshape, and for which the 'backup file' is no + longer available or is corrupted. The array may have some + corruption in it at the point where reshape was up to, but at least + the rest of the array will become available. + + + - Various internal restructuring - more is needed. + +Changes Prior to release 3.1.5 + - Fixes for v1.x metadata on big-endian machines. + - man page improvements + - Improve '--detail --export' when run on partitions of an md array. + - Fix regression with removing 'failed' or 'detached' devices. + - Fixes for "--assemble --force" in various unusual cases. + - Allow '-Y' to mean --export. This was documented but not implemented. + - Various fixes for handling 'ddf' metadata.
This is now more reliable + but could benefit from more interoperability testing. + - Correctly list subarrays of a container in "--detail" output. + - Improve checks on whether the requested number of devices is supported + by the metadata - both for --create and --grow. + - Don't remove partitions from a device that is being included in an + array until we are fully committed to including it. + - Allow "--assemble --update=no-bitmap" so an array with a corrupt + bitmap can still be assembled. + - Don't allow --add to succeed if it looks like a "--re-add" is probably + wanted, but cannot succeed. This avoids inadvertently turning + devices into spares when an array is failed. + +Changes Prior to release 3.1.4 + Two fixes related to configs that aren't using udev: + - Don't remove md devices with 'standard' names on --stop + - Allow dev_open to work on read-only /dev + And fixed regressions: + - Allow --incremental to add spares to an array + - Accept --no-degraded as a deprecated option rather than + throwing an error + - Return correct success status when --incremental assembling + a container which does not yet have enough devices. + - Don't link mdadm with pthreads, only mdmon needs it. + - Fix compiler warning due to bad use of snprintf + +Changes Prior to release 3.1.3 + - mapfile now lives in a fixed location which defaults to + /dev/.mdadm/map but can be changed at compile time. This + location is chosen as most distros provide it during early + boot and preserve it through. As long as /dev exists and is + writable, /dev/.mdadm will be created. + Other files for communication with mdmon live here too. + This fixes a bug reported by Debian and Gentoo users where + udev would spin in early-boot. + - IMSM and DDF metadata will not be recognised on partitions + as they should only be used on whole-disks. + - Various overflows caused by 2G drives have been addressed. + - A subarray of an IMSM container can now be killed with + --kill-subarray.
Also subarrays can be renamed with + --update-subarray + - -If (or --incremental --fail) can be used from udev to + fail and remove from all arrays a device which has been + unplugged from the system. i.e. hot-unplug-support. + - "mdadm /dev/mdX --re-add missing" will look for any device + that looks like it should be a member of /dev/mdX but isn't + and will automatically --re-add it + - Now compile with -Wextra to get extra warnings. + - Lots of minor bug fixes, documentation improvements, etc. + +Changes Prior to release 3.1.2 + - The default metadata has changed again (sorry about that). + It is now v1.2 and will hopefully stay that way. It turned + out there were boot-block issues with v1.1 which make it + unsuitable for a default, though in many cases it is still + suitable to use. + - Stopping a container is not permitted when members are still + active + - Add 'homehost' to the valid words for the "AUTO" config file + line. When followed by "-all", this causes mdadm to + auto-assemble any array belonging to this host, but not + auto-assemble anything else. + - Fix some bugs with "--grow --chunksize=" for changing chunksize. + - VAR_RUN can be easily changed at compile time just like ALT_RUN. + This gives distros more flexibility in how to manage the + pid and sock files that mdmon needs. + - Various mdmon fixes + - Always make bitmap 4K-aligned if at all possible. + - If mdadm.conf lists arrays which have inter-dependencies, + they previously had to be listed in the "right" order. Now + any order should work. + - Fix --force assembly of v1.x arrays which are in the process + of recovering. + - Add section on 'scrubbing' to 'md' man page. + - Various command-line-option parsing improvements. + - ... and lots of other bug fixes. + +Changes Prior to release 3.1.1 + - Multiple fixes for new --grow levels including fixes for + serious data corruption problems.
+ - Change default metadata to v1.1 + - Change default chunk size to 512K + - Change default bitmap chunk size to 64Meg + - When --re-add is used, don't fall back to + --add if --re-add fails as this can destroy data. + +Changes Prior to release 3.1 + - Support --grow to change the layout of RAID4/5/6 + - Support --grow to change the chunksize of raid 4/5/6 + - Support --grow to change level from RAID1 -> RAID5 -> RAID6 and + back. + - Support --grow to reduce the number of devices in RAID4/5/6. + - Support restart of these grow options when assembling an array + which is partially grown. + - Assorted tests of this code, and of different RAID6 layouts. + +Changes Prior to release 3.0.3 + - Improvements for creating arrays giving just a name, like 'foo', + rather than the full '/dev/md/foo'. + - Improvements for assembling member arrays of containers. + - Improvements to test suite + - Add option to change increment for RebuildNN messages reported + by "mdadm --monitor" + - Improvements to mdmon 'hand-over' from initrd to final root. + - Handle merging of devices that have left an IMSM array and are + being re-incorporated. + - Add missing space in "--detail --brief" output. + +Changes Prior to release 3.0.2 + - Fix crash when homehost is not set, as often happens in + early boot. + +Changes Prior to release 3.0.1 + - Fix various segfaults + - Fixes for --examine with containers + - Lots of other little fixes. + +Changes Prior to release 3.0 + - Support for externally managed metadata, specifically DDF and IMSM. + - Depend on udev to create entries in /dev, rather than creating them + ourselves. + - remove --auto-update-home-hosts + - new config file line "auto" + - new "<ignore>" and "any" options for "homehost" + - numerous bug fixes and minor enhancements. diff --git a/Create.c b/Create.c new file mode 100644 index 0000000..0ff1922 --- /dev/null +++ b/Create.c @@ -0,0 +1,1118 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays.
+ * + * Copyright (C) 2001-2013 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +#include "mdadm.h" +#include "md_u.h" +#include "md_p.h" +#include <ctype.h> + /* round_size_and_verify: clear the low bits of *size using the mask ~(chunk - 1); this rounds *size down to a whole multiple of chunk only when chunk is a power of two. A *size of 0 is left untouched and accepted. Returns 0 on success, or 1 after printing an error when the rounding leaves *size at 0. */ +static int round_size_and_verify(unsigned long long *size, int chunk) +{ + if (*size == 0) + return 0; + *size &= ~(unsigned long long)(chunk - 1); + if (*size == 0) { + pr_err("Size cannot be smaller than chunk.\n"); + return 1; + } + return 0; +} + /* default_layout: choose the layout to use when none was requested. If st is non-NULL and its handler provides a default_geometry hook, the hook is called first with pointers to the local level and layout. If layout is still UnSet afterwards, a per-level default is picked by the switch below (0 for any level without its own case). When verbose > 0 the chosen default is reported through pr_err for levels 10, 5, 6 and LEVEL_FAULTY. Returns the resulting layout. */ +static int default_layout(struct supertype *st, int level, int verbose) +{ + int layout = UnSet; + + if (st && st->ss->default_geometry) + st->ss->default_geometry(st, &level, &layout, NULL); + + if (layout == UnSet) + switch(level) { + default: /* no layout */ + layout = 0; + break; + case 0: + layout = RAID0_ORIG_LAYOUT; + break; + case 10: + layout = 0x102; /* near=2, far=1 */ + if (verbose > 0) + pr_err("layout defaults to n2\n"); + break; + case 5: + case 6: + layout = map_name(r5layout, "default"); + if (verbose > 0) + pr_err("layout defaults to %s\n", map_num(r5layout, layout)); + break; + case LEVEL_FAULTY: + layout = map_name(faultylayout, "default"); + + if (verbose > 0) + pr_err("layout defaults to %s\n", map_num(faultylayout, layout)); + break; + } + + return layout; +} + +int Create(struct
supertype *st, char *mddev, + char *name, int *uuid, + int subdevs, struct mddev_dev *devlist, + struct shape *s, + struct context *c, unsigned long long data_offset) +{ + /* + * Create a new raid array. + * + * First check that necessary details are available + * (i.e. level, raid-disks) + * + * Then check each disk to see what might be on it + * and report anything interesting. + * + * If anything looks odd, and runstop not set, + * abort. + * + * SET_ARRAY_INFO and ADD_NEW_DISK, and + * if runstop==run, or raiddisks disks were used, + * RUN_ARRAY + */ + int mdfd; + unsigned long long minsize = 0, maxsize = 0; + char *mindisc = NULL; + char *maxdisc = NULL; + int dnum, raid_disk_num; + struct mddev_dev *dv; + dev_t rdev; + int fail = 0, warn = 0; + int first_missing = subdevs * 2; + int second_missing = subdevs * 2; + int missing_disks = 0; + int insert_point = subdevs * 2; /* where to insert a missing drive */ + int total_slots; + int pass; + int rv; + int bitmap_fd; + int have_container = 0; + int container_fd = -1; + int need_mdmon = 0; + unsigned long long bitmapsize; + struct mdinfo info, *infos; + int did_default = 0; + int do_default_layout = 0; + int do_default_chunk = 0; + unsigned long safe_mode_delay = 0; + char chosen_name[1024]; + struct map_ent *map = NULL; + unsigned long long newsize; + mdu_array_info_t inf; + + int major_num = BITMAP_MAJOR_HI; + if (s->bitmap_file && strcmp(s->bitmap_file, "clustered") == 0) { + major_num = BITMAP_MAJOR_CLUSTERED; + if (c->nodes <= 1) { + pr_err("At least 2 nodes are needed for cluster-md\n"); + return 1; + } + } + + memset(&info, 0, sizeof(info)); + if (s->level == UnSet && st && st->ss->default_geometry) + st->ss->default_geometry(st, &s->level, NULL, NULL); + if (s->level == UnSet) { + pr_err("a RAID level is needed to create an array.\n"); + return 1; + } + if (s->raiddisks < 4 && s->level == 6) { + pr_err("at least 4 raid-devices needed for level 6\n"); + return 1; + } + if (s->raiddisks > 256 && s->level == 
6) { + pr_err("no more than 256 raid-devices supported for level 6\n"); + return 1; + } + if (s->raiddisks < 2 && s->level >= 4) { + pr_err("at least 2 raid-devices needed for level %d\n", s->level); + return 1; + } + if (s->level <= 0 && s->sparedisks) { + pr_err("This level does not support spare devices\n"); + return 1; + } + + if (subdevs == 1 && strcmp(devlist->devname, "missing") != 0) { + /* If given a single device, it might be a container, and we can + * extract a device list from there + */ + int fd; + + memset(&inf, 0, sizeof(inf)); + fd = open(devlist->devname, O_RDONLY); + if (fd >= 0 && + md_get_array_info(fd, &inf) == 0 && inf.raid_disks == 0) { + /* yep, looks like a container */ + if (st) { + rv = st->ss->load_container(st, fd, + devlist->devname); + if (rv == 0) + have_container = 1; + } else { + st = super_by_fd(fd, NULL); + if (st && !(rv = st->ss-> + load_container(st, fd, + devlist->devname))) + have_container = 1; + else + st = NULL; + } + if (have_container) { + subdevs = s->raiddisks; + first_missing = subdevs * 2; + second_missing = subdevs * 2; + insert_point = subdevs * 2; + } + } + if (fd >= 0) + close(fd); + } + if (st && st->ss->external && s->sparedisks) { + pr_err("This metadata type does not support spare disks at create time\n"); + return 1; + } + if (subdevs > s->raiddisks+s->sparedisks+s->journaldisks) { + pr_err("You have listed more devices (%d) than are in the array(%d)!\n", subdevs, s->raiddisks+s->sparedisks); + return 1; + } + if (!have_container && subdevs < s->raiddisks+s->sparedisks+s->journaldisks) { + pr_err("You haven't given enough devices (real or missing) to create this array\n"); + return 1; + } + if (s->bitmap_file && s->level <= 0) { + pr_err("bitmaps not meaningful with level %s\n", + map_num(pers, s->level)?:"given"); + return 1; + } + + /* now set some defaults */ + + if (s->layout == UnSet) { + do_default_layout = 1; + s->layout = default_layout(st, s->level, c->verbose); + } + + if (s->level == 10) + /* 
check layout fits in array*/ + if ((s->layout&255) * ((s->layout>>8)&255) > s->raiddisks) { + pr_err("that layout requires at least %d devices\n", + (s->layout&255) * ((s->layout>>8)&255)); + return 1; + } + + switch(s->level) { + case 4: + case 5: + case 10: + case 6: + case 0: + if (s->chunk == 0 || s->chunk == UnSet) { + s->chunk = UnSet; + do_default_chunk = 1; + /* chunk will be set later */ + } + break; + case LEVEL_LINEAR: + /* a chunksize of zero 0s perfectly valid (and preferred) since 2.6.16 */ + if (get_linux_version() < 2006016 && s->chunk == 0) { + s->chunk = 64; + if (c->verbose > 0) + pr_err("chunk size defaults to 64K\n"); + } + break; + case 1: + case LEVEL_FAULTY: + case LEVEL_MULTIPATH: + case LEVEL_CONTAINER: + if (s->chunk) { + pr_err("specifying chunk size is forbidden for this level\n"); + return 1; + } + break; + default: + pr_err("unknown level %d\n", s->level); + return 1; + } + + if (s->size == MAX_SIZE) + /* use '0' to mean 'max' now... */ + s->size = 0; + if (s->size && s->chunk && s->chunk != UnSet) + if (round_size_and_verify(&s->size, s->chunk)) + return 1; + + newsize = s->size * 2; + if (st && ! st->ss->validate_geometry(st, s->level, s->layout, s->raiddisks, + &s->chunk, s->size*2, + data_offset, NULL, + &newsize, s->consistency_policy, + c->verbose >= 0)) + return 1; + + if (s->chunk && s->chunk != UnSet) { + newsize &= ~(unsigned long long)(s->chunk*2 - 1); + if (do_default_chunk) { + /* default chunk was just set */ + if (c->verbose > 0) + pr_err("chunk size defaults to %dK\n", s->chunk); + if (round_size_and_verify(&s->size, s->chunk)) + return 1; + do_default_chunk = 0; + } + } + + if (s->size == 0) { + s->size = newsize / 2; + if (s->level == 1) + /* If this is ever reshaped to RAID5, we will + * need a chunksize. 
So round it off a bit + * now just to be safe + */ + s->size &= ~(64ULL-1); + + if (s->size && c->verbose > 0) + pr_err("setting size to %lluK\n", s->size); + } + + /* now look at the subdevs */ + info.array.active_disks = 0; + info.array.working_disks = 0; + dnum = 0; + for (dv = devlist; dv; dv = dv->next) + if (data_offset == VARIABLE_OFFSET) + dv->data_offset = INVALID_SECTORS; + else + dv->data_offset = data_offset; + + for (dv=devlist; dv && !have_container; dv=dv->next, dnum++) { + char *dname = dv->devname; + unsigned long long freesize; + int dfd; + char *doff; + + if (strcasecmp(dname, "missing") == 0) { + if (first_missing > dnum) + first_missing = dnum; + if (second_missing > dnum && dnum > first_missing) + second_missing = dnum; + missing_disks ++; + continue; + } + if (data_offset == VARIABLE_OFFSET) { + doff = strchr(dname, ':'); + if (doff) { + *doff++ = 0; + dv->data_offset = parse_size(doff); + } else + dv->data_offset = INVALID_SECTORS; + } else + dv->data_offset = data_offset; + + dfd = open(dname, O_RDONLY); + if (dfd < 0) { + pr_err("cannot open %s: %s\n", + dname, strerror(errno)); + exit(2); + } + if (!fstat_is_blkdev(dfd, dname, NULL)) { + close(dfd); + exit(2); + } + close(dfd); + info.array.working_disks++; + if (dnum < s->raiddisks && dv->disposition != 'j') + info.array.active_disks++; + if (st == NULL) { + struct createinfo *ci = conf_get_create_info(); + if (ci) + st = ci->supertype; + } + if (st == NULL) { + /* Need to choose a default metadata, which is different + * depending on geometry of array. 
+ */ + int i; + char *name = "default"; + for(i = 0; !st && superlist[i]; i++) { + st = superlist[i]->match_metadata_desc(name); + if (!st) + continue; + if (do_default_layout) + s->layout = default_layout(st, s->level, c->verbose); + switch (st->ss->validate_geometry( + st, s->level, s->layout, s->raiddisks, + &s->chunk, s->size*2, + dv->data_offset, dname, + &freesize, s->consistency_policy, + c->verbose > 0)) { + case -1: /* Not valid, message printed, and not + * worth checking any further */ + exit(2); + break; + case 0: /* Geometry not valid */ + free(st); + st = NULL; + s->chunk = do_default_chunk ? UnSet : s->chunk; + break; + case 1: /* All happy */ + break; + } + } + + if (!st) { + int dfd = open(dname, O_RDONLY|O_EXCL); + if (dfd < 0) { + pr_err("cannot open %s: %s\n", + dname, strerror(errno)); + exit(2); + } + pr_err("device %s not suitable for any style of array\n", + dname); + exit(2); + } + if (st->ss != &super0 || + st->minor_version != 90) + did_default = 1; + } else { + if (do_default_layout) + s->layout = default_layout(st, s->level, 0); + if (!st->ss->validate_geometry(st, s->level, s->layout, + s->raiddisks, + &s->chunk, s->size*2, + dv->data_offset, + dname, &freesize, + s->consistency_policy, + c->verbose >= 0)) { + + pr_err("%s is not suitable for this array.\n", + dname); + fail = 1; + continue; + } + } + + if (dv->disposition == 'j') + goto skip_size_check; /* skip write journal for size check */ + + freesize /= 2; /* convert to K */ + if (s->chunk && s->chunk != UnSet) { + /* round to chunk size */ + freesize = freesize & ~(s->chunk-1); + if (do_default_chunk) { + /* default chunk was just set */ + if (c->verbose > 0) + pr_err("chunk size defaults to %dK\n", s->chunk); + if (round_size_and_verify(&s->size, s->chunk)) + return 1; + do_default_chunk = 0; + } + } + if (!freesize) { + pr_err("no free space left on %s\n", dname); + fail = 1; + continue; + } + + if (s->size && freesize < s->size) { + pr_err("%s is smaller than given size. 
%lluK < %lluK + metadata\n", + dname, freesize, s->size); + fail = 1; + continue; + } + if (maxdisc == NULL || (maxdisc && freesize > maxsize)) { + maxdisc = dname; + maxsize = freesize; + } + if (mindisc ==NULL || (mindisc && freesize < minsize)) { + mindisc = dname; + minsize = freesize; + } + skip_size_check: + if (c->runstop != 1 || c->verbose >= 0) { + int fd = open(dname, O_RDONLY); + if (fd < 0) { + pr_err("Cannot open %s: %s\n", + dname, strerror(errno)); + fail = 1; + continue; + } + warn |= check_ext2(fd, dname); + warn |= check_reiser(fd, dname); + warn |= check_raid(fd, dname); + if (strcmp(st->ss->name, "1.x") == 0 && + st->minor_version >= 1) + /* metadata at front */ + warn |= check_partitions(fd, dname, 0, 0); + else if (s->level == 1 || s->level == LEVEL_CONTAINER || + (s->level == 0 && s->raiddisks == 1)) + /* partitions could be meaningful */ + warn |= check_partitions(fd, dname, freesize*2, s->size*2); + else + /* partitions cannot be meaningful */ + warn |= check_partitions(fd, dname, 0, 0); + if (strcmp(st->ss->name, "1.x") == 0 && + st->minor_version >= 1 && + did_default && + s->level == 1 && + (warn & 1024) == 0) { + warn |= 1024; + pr_err("Note: this array has metadata at the start and\n" + " may not be suitable as a boot device. 
If you plan to\n" + " store '/boot' on this device please ensure that\n" + " your boot-loader understands md/v1.x metadata, or use\n" + " --metadata=0.90\n"); + } + close(fd); + } + } + if (missing_disks == dnum && !have_container) { + pr_err("Subdevs can't be all missing\n"); + return 1; + } + if (s->raiddisks + s->sparedisks > st->max_devs) { + pr_err("Too many devices: %s metadata only supports %d\n", + st->ss->name, st->max_devs); + return 1; + } + if (have_container) + info.array.working_disks = s->raiddisks; + if (fail) { + pr_err("create aborted\n"); + return 1; + } + if (s->size == 0) { + if (mindisc == NULL && !have_container) { + pr_err("no size and no drives given - aborting create.\n"); + return 1; + } + if (s->level > 0 || s->level == LEVEL_MULTIPATH || + s->level == LEVEL_FAULTY || st->ss->external) { + /* size is meaningful */ + if (!st->ss->validate_geometry(st, s->level, s->layout, + s->raiddisks, + &s->chunk, minsize*2, + data_offset, + NULL, NULL, + s->consistency_policy, 0)) { + pr_err("devices too large for RAID level %d\n", s->level); + return 1; + } + s->size = minsize; + if (s->level == 1) + /* If this is ever reshaped to RAID5, we will + * need a chunksize. 
So round it off a bit + * now just to be safe + */ + s->size &= ~(64ULL-1); + if (c->verbose > 0) + pr_err("size set to %lluK\n", s->size); + } + } + + if (!s->bitmap_file && + !st->ss->external && + s->level >= 1 && + st->ss->add_internal_bitmap && + s->journaldisks == 0 && + (s->consistency_policy != CONSISTENCY_POLICY_RESYNC && + s->consistency_policy != CONSISTENCY_POLICY_PPL) && + (s->write_behind || s->size > 100*1024*1024ULL)) { + if (c->verbose > 0) + pr_err("automatically enabling write-intent bitmap on large array\n"); + s->bitmap_file = "internal"; + } + if (s->bitmap_file && strcmp(s->bitmap_file, "none") == 0) + s->bitmap_file = NULL; + + if (s->consistency_policy == CONSISTENCY_POLICY_PPL && + !st->ss->write_init_ppl) { + pr_err("%s metadata does not support PPL\n", st->ss->name); + return 1; + } + + if (!have_container && s->level > 0 && ((maxsize-s->size)*100 > maxsize)) { + if (c->runstop != 1 || c->verbose >= 0) + pr_err("largest drive (%s) exceeds size (%lluK) by more than 1%%\n", + maxdisc, s->size); + warn = 1; + } + + if (st->ss->detail_platform && st->ss->detail_platform(0, 1, NULL) != 0) { + if (c->runstop != 1 || c->verbose >= 0) + pr_err("%s unable to enumerate platform support\n" + " array may not be compatible with hardware/firmware\n", + st->ss->name); + warn = 1; + } + st->nodes = c->nodes; + st->cluster_name = c->homecluster; + + if (warn) { + if (c->runstop!= 1) { + if (!ask("Continue creating array? ")) { + pr_err("create aborted.\n"); + return 1; + } + } else { + if (c->verbose > 0) + pr_err("creation continuing despite oddities due to --run\n"); + } + } + + /* If this is raid4/5, we want to configure the last active slot + * as missing, so that a reconstruct happens (faster than re-parity) + * FIX: Can we do this for raid6 as well? 
+ */ + if (st->ss->external == 0 && s->assume_clean == 0 && + c->force == 0 && first_missing >= s->raiddisks) { + switch (s->level) { + case 4: + case 5: + insert_point = s->raiddisks-1; + s->sparedisks++; + info.array.active_disks--; + missing_disks++; + break; + default: + break; + } + } + /* For raid6, if creating with 1 missing drive, make a good drive + * into a spare, else the create will fail + */ + if (s->assume_clean == 0 && c->force == 0 && first_missing < s->raiddisks && + st->ss->external == 0 && + second_missing >= s->raiddisks && s->level == 6) { + insert_point = s->raiddisks - 1; + if (insert_point == first_missing) + insert_point--; + s->sparedisks ++; + info.array.active_disks--; + missing_disks++; + } + + if (s->level <= 0 && first_missing < subdevs * 2) { + pr_err("This level does not support missing devices\n"); + return 1; + } + + /* We need to create the device */ + map_lock(&map); + mdfd = create_mddev(mddev, name, c->autof, LOCAL, chosen_name, 1); + if (mdfd < 0) { + map_unlock(&map); + return 1; + } + /* verify if chosen_name is not in use, + * it could be in conflict with already existing device + * e.g. container, array + */ + if (strncmp(chosen_name, "/dev/md/", 8) == 0 && + map_by_name(&map, chosen_name+8) != NULL) { + pr_err("Array name %s is in use already.\n", + chosen_name); + close(mdfd); + map_unlock(&map); + udev_unblock(); + return 1; + } + mddev = chosen_name; + + memset(&inf, 0, sizeof(inf)); + md_get_array_info(mdfd, &inf); + if (inf.working_disks != 0) { + pr_err("another array by this name is already running.\n"); + goto abort_locked; + } + + /* Ok, lets try some ioctls */ + + info.array.level = s->level; + info.array.size = s->size; + info.array.raid_disks = s->raiddisks; + /* The kernel should *know* what md_minor we are dealing + * with, but it chooses to trust me instead. 
Sigh + */ + info.array.md_minor = 0; + if (fstat_is_blkdev(mdfd, mddev, &rdev)) + info.array.md_minor = minor(rdev); + info.array.not_persistent = 0; + + if (((s->level == 4 || s->level == 5) && + (insert_point < s->raiddisks || first_missing < s->raiddisks)) || + (s->level == 6 && (insert_point < s->raiddisks || + second_missing < s->raiddisks)) || + (s->level <= 0) || s->assume_clean) { + info.array.state = 1; /* clean, but one+ drive will be missing*/ + info.resync_start = MaxSector; + } else { + info.array.state = 0; /* not clean, but no errors */ + info.resync_start = 0; + } + if (s->level == 10) { + /* for raid10, the bitmap size is the capacity of the array, + * which is array.size * raid_disks / ncopies; + * .. but convert to sectors. + */ + int ncopies = ((s->layout>>8) & 255) * (s->layout & 255); + bitmapsize = s->size * s->raiddisks / ncopies * 2; +/* printf("bms=%llu as=%d rd=%d nc=%d\n", bitmapsize, s->size, s->raiddisks, ncopies);*/ + } else + bitmapsize = s->size * 2; + + /* There is lots of redundancy in these disk counts, + * raid_disks is the most meaningful value + * it describes the geometry of the array + * it is constant + * nr_disks is total number of used slots. + * it should be raid_disks+spare_disks + * spare_disks is the number of extra disks present + * see above + * active_disks is the number of working disks in + * active slots. (With raid_disks) + * working_disks is the total number of working disks, + * including spares + * failed_disks is the number of disks marked failed + * + * Ideally, the kernel would keep these (except raid_disks) + * up-to-date as we ADD_NEW_DISK, but it doesn't (yet). + * So for now, we assume that all raid and spare + * devices will be given. 
+ */ + info.array.spare_disks=s->sparedisks; + info.array.failed_disks=missing_disks; + info.array.nr_disks = info.array.working_disks + + info.array.failed_disks; + info.array.layout = s->layout; + info.array.chunk_size = s->chunk*1024; + + if (name == NULL || *name == 0) { + /* base name on mddev */ + /* /dev/md0 -> 0 + * /dev/md_d0 -> d0 + * /dev/md_foo -> foo + * /dev/md/1 -> 1 + * /dev/md/d1 -> d1 + * /dev/md/home -> home + * /dev/mdhome -> home + */ + /* FIXME compare this with rules in create_mddev */ + name = strrchr(mddev, '/'); + if (name) { + name++; + if (strncmp(name, "md_", 3) == 0 && + strlen(name) > 3 && (name-mddev) == 5 /* /dev/ */) + name += 3; + else if (strncmp(name, "md", 2) == 0 && + strlen(name) > 2 && isdigit(name[2]) && + (name-mddev) == 5 /* /dev/ */) + name += 2; + } + } + if (!st->ss->init_super(st, &info.array, s, name, c->homehost, uuid, + data_offset)) + goto abort_locked; + + total_slots = info.array.nr_disks; + st->ss->getinfo_super(st, &info, NULL); + if (sysfs_init(&info, mdfd, NULL)) { + pr_err("unable to initialize sysfs\n"); + goto abort_locked; + } + + if (did_default && c->verbose >= 0) { + if (is_subarray(info.text_version)) { + char devnm[32]; + char *ep; + struct mdinfo *mdi; + + strncpy(devnm, info.text_version+1, 32); + devnm[31] = 0; + ep = strchr(devnm, '/'); + if (ep) + *ep = 0; + + mdi = sysfs_read(-1, devnm, GET_VERSION); + + pr_err("Creating array inside %s container %s\n", + mdi?mdi->text_version:"managed", devnm); + sysfs_free(mdi); + } else + pr_err("Defaulting to version %s metadata\n", info.text_version); + } + + map_update(&map, fd2devnm(mdfd), info.text_version, + info.uuid, chosen_name); + /* Keep map locked until devices have been added to array + * to stop another mdadm from finding and using those devices. 
+ */ + + if (s->bitmap_file && (strcmp(s->bitmap_file, "internal") == 0 || + strcmp(s->bitmap_file, "clustered") == 0)) { + if (!st->ss->add_internal_bitmap) { + pr_err("internal bitmaps not supported with %s metadata\n", + st->ss->name); + goto abort_locked; + } + if (st->ss->add_internal_bitmap(st, &s->bitmap_chunk, + c->delay, s->write_behind, + bitmapsize, 1, major_num)) { + pr_err("Given bitmap chunk size not supported.\n"); + goto abort_locked; + } + s->bitmap_file = NULL; + } + + if (sysfs_init(&info, mdfd, NULL)) { + pr_err("unable to initialize sysfs\n"); + goto abort_locked; + } + + if (st->ss->external && st->container_devnm[0]) { + /* member */ + + /* When creating a member, we need to be careful + * to negotiate with mdmon properly. + * If it is already running, we cannot write to + * the devices and must ask it to do that part. + * If it isn't running, we write to the devices, + * and then start it. + * We hold an exclusive open on the container + * device to make sure mdmon doesn't exit after + * we checked that it is running. + * + * For now, fail if it is already running. 
+ */ + container_fd = open_dev_excl(st->container_devnm); + if (container_fd < 0) { + pr_err("Cannot get exclusive open on container - weird.\n"); + goto abort_locked; + } + if (mdmon_running(st->container_devnm)) { + if (c->verbose) + pr_err("reusing mdmon for %s.\n", + st->container_devnm); + st->update_tail = &st->updates; + } else + need_mdmon = 1; + } + rv = set_array_info(mdfd, st, &info); + if (rv) { + pr_err("failed to set array info for %s: %s\n", + mddev, strerror(errno)); + goto abort_locked; + } + + if (s->bitmap_file) { + int uuid[4]; + + st->ss->uuid_from_super(st, uuid); + if (CreateBitmap(s->bitmap_file, c->force, (char*)uuid, s->bitmap_chunk, + c->delay, s->write_behind, + bitmapsize, + major_num)) { + goto abort_locked; + } + bitmap_fd = open(s->bitmap_file, O_RDWR); + if (bitmap_fd < 0) { + pr_err("weird: %s cannot be opened\n", + s->bitmap_file); + goto abort_locked; + } + if (ioctl(mdfd, SET_BITMAP_FILE, bitmap_fd) < 0) { + pr_err("Cannot set bitmap file for %s: %s\n", + mddev, strerror(errno)); + goto abort_locked; + } + } + + infos = xmalloc(sizeof(*infos) * total_slots); + enable_fds(total_slots); + for (pass = 1; pass <= 2; pass++) { + struct mddev_dev *moved_disk = NULL; /* the disk that was moved out of the insert point */ + + for (dnum = 0, raid_disk_num = 0, dv = devlist; dv; + dv = (dv->next) ? 
(dv->next) : moved_disk, dnum++) { + int fd; + struct mdinfo *inf = &infos[dnum]; + + if (dnum >= total_slots) + abort(); + if (dnum == insert_point) { + raid_disk_num += 1; + moved_disk = dv; + continue; + } + if (strcasecmp(dv->devname, "missing") == 0) { + raid_disk_num += 1; + continue; + } + if (have_container) + moved_disk = NULL; + if (have_container && dnum < info.array.raid_disks - 1) + /* repeatedly use the container */ + moved_disk = dv; + + switch(pass) { + case 1: + *inf = info; + + inf->disk.number = dnum; + inf->disk.raid_disk = raid_disk_num++; + + if (dv->disposition == 'j') { + inf->disk.raid_disk = MD_DISK_ROLE_JOURNAL; + inf->disk.state = (1<<MD_DISK_JOURNAL); + raid_disk_num--; + } else if (inf->disk.raid_disk < s->raiddisks) + inf->disk.state = (1<<MD_DISK_ACTIVE) | + (1<<MD_DISK_SYNC); + else + inf->disk.state = 0; + + if (dv->writemostly == FlagSet) { + if (major_num == BITMAP_MAJOR_CLUSTERED) { + pr_err("Can not set %s --write-mostly with a clustered bitmap\n",dv->devname); + goto abort_locked; + } else + inf->disk.state |= (1<<MD_DISK_WRITEMOSTLY); + } + if (dv->failfast == FlagSet) + inf->disk.state |= (1<<MD_DISK_FAILFAST); + + if (have_container) + fd = -1; + else { + if (st->ss->external && + st->container_devnm[0]) + fd = open(dv->devname, O_RDWR); + else + fd = open(dv->devname, O_RDWR|O_EXCL); + + if (fd < 0) { + pr_err("failed to open %s after earlier success - aborting\n", + dv->devname); + goto abort_locked; + } + if (!fstat_is_blkdev(fd, dv->devname, &rdev)) + return 1; + inf->disk.major = major(rdev); + inf->disk.minor = minor(rdev); + } + if (fd >= 0) + remove_partitions(fd); + if (st->ss->add_to_super(st, &inf->disk, + fd, dv->devname, + dv->data_offset)) { + ioctl(mdfd, STOP_ARRAY, NULL); + goto abort_locked; + } + st->ss->getinfo_super(st, inf, NULL); + safe_mode_delay = inf->safe_mode_delay; + + if (have_container && c->verbose > 0) + pr_err("Using %s for device %d\n", + map_dev(inf->disk.major, + inf->disk.minor, + 0), 
dnum); + + if (!have_container) { + /* getinfo_super might have lost these ... */ + inf->disk.major = major(rdev); + inf->disk.minor = minor(rdev); + } + break; + case 2: + inf->errors = 0; + + rv = add_disk(mdfd, st, &info, inf); + + if (rv) { + pr_err("ADD_NEW_DISK for %s failed: %s\n", + dv->devname, strerror(errno)); + if (errno == EINVAL && + info.array.level == 0) { + pr_err("Possibly your kernel doesn't support RAID0 layouts.\n"); + pr_err("Either upgrade, or use --layout=dangerous\n"); + } + goto abort_locked; + } + break; + } + if (!have_container && + dv == moved_disk && dnum != insert_point) break; + } + if (pass == 1) { + struct mdinfo info_new; + struct map_ent *me = NULL; + + /* check to see if the uuid has changed due to these + * metadata changes, and if so update the member array + * and container uuid. Note ->write_init_super clears + * the subarray cursor such that ->getinfo_super once + * again returns container info. + */ + st->ss->getinfo_super(st, &info_new, NULL); + if (st->ss->external && s->level != LEVEL_CONTAINER && + !same_uuid(info_new.uuid, info.uuid, 0)) { + map_update(&map, fd2devnm(mdfd), + info_new.text_version, + info_new.uuid, chosen_name); + me = map_by_devnm(&map, st->container_devnm); + } + + if (st->ss->write_init_super(st)) { + st->ss->free_super(st); + goto abort_locked; + } + /* + * Before activating the array, perform extra steps + * required to configure the internal write-intent + * bitmap. 
+ */ + if (info_new.consistency_policy == + CONSISTENCY_POLICY_BITMAP && + st->ss->set_bitmap && + st->ss->set_bitmap(st, &info)) { + st->ss->free_super(st); + goto abort_locked; + } + + /* update parent container uuid */ + if (me) { + char *path = xstrdup(me->path); + + st->ss->getinfo_super(st, &info_new, NULL); + map_update(&map, st->container_devnm, + info_new.text_version, + info_new.uuid, path); + free(path); + } + + flush_metadata_updates(st); + st->ss->free_super(st); + } + } + map_unlock(&map); + free(infos); + + if (s->level == LEVEL_CONTAINER) { + /* No need to start. But we should signal udev to + * create links */ + sysfs_uevent(&info, "change"); + if (c->verbose >= 0) + pr_err("container %s prepared.\n", mddev); + wait_for(chosen_name, mdfd); + } else if (c->runstop == 1 || subdevs >= s->raiddisks) { + if (st->ss->external) { + int err; + switch(s->level) { + case LEVEL_LINEAR: + case LEVEL_MULTIPATH: + case 0: + err = sysfs_set_str(&info, NULL, "array_state", + c->readonly + ? 
"readonly" + : "active"); + need_mdmon = 0; + break; + default: + err = sysfs_set_str(&info, NULL, "array_state", + "readonly"); + break; + } + sysfs_set_safemode(&info, safe_mode_delay); + if (err) { + pr_err("failed to activate array.\n"); + ioctl(mdfd, STOP_ARRAY, NULL); + goto abort; + } + } else if (c->readonly && + sysfs_attribute_available( + &info, NULL, "array_state")) { + if (sysfs_set_str(&info, NULL, + "array_state", "readonly") < 0) { + pr_err("Failed to start array: %s\n", + strerror(errno)); + ioctl(mdfd, STOP_ARRAY, NULL); + goto abort; + } + } else { + /* param is not actually used */ + mdu_param_t param; + if (ioctl(mdfd, RUN_ARRAY, ¶m)) { + pr_err("RUN_ARRAY failed: %s\n", + strerror(errno)); + if (errno == 524 /* ENOTSUP */ && + info.array.level == 0) + cont_err("Please use --layout=original or --layout=alternate\n"); + if (info.array.chunk_size & (info.array.chunk_size-1)) { + cont_err("Problem may be that chunk size is not a power of 2\n"); + } + ioctl(mdfd, STOP_ARRAY, NULL); + goto abort; + } + /* if start_ro module parameter is set, array is + * auto-read-only, which is bad as the resync won't + * start. So lets make it read-write now. + */ + ioctl(mdfd, RESTART_ARRAY_RW, NULL); + } + if (c->verbose >= 0) + pr_err("array %s started.\n", mddev); + if (st->ss->external && st->container_devnm[0]) { + if (need_mdmon) + start_mdmon(st->container_devnm); + + ping_monitor(st->container_devnm); + close(container_fd); + } + wait_for(chosen_name, mdfd); + } else { + pr_err("not starting array - not enough devices.\n"); + } + udev_unblock(); + close(mdfd); + sysfs_uevent(&info, "change"); + return 0; + + abort: + udev_unblock(); + map_lock(&map); + abort_locked: + map_remove(&map, fd2devnm(mdfd)); + map_unlock(&map); + + if (mdfd >= 0) + close(mdfd); + return 1; +} diff --git a/Detail.c b/Detail.c new file mode 100644 index 0000000..95d4cc7 --- /dev/null +++ b/Detail.c @@ -0,0 +1,879 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. 
+ * + * Copyright (C) 2001-2013 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +#include "mdadm.h" +#include "md_p.h" +#include "md_u.h" +#include <ctype.h> +#include <dirent.h> + +static int cmpstringp(const void *p1, const void *p2) +{ + return strcmp(* (char * const *) p1, * (char * const *) p2); +} + +static int add_device(const char *dev, char ***p_devices, + int *p_max_devices, int n_devices) +{ + if (n_devices + 1 >= *p_max_devices) { + *p_max_devices += 16; + *p_devices = xrealloc(*p_devices, *p_max_devices * + sizeof(**p_devices)); + if (!*p_devices) { + *p_max_devices = 0; + return 0; + } + }; + (*p_devices)[n_devices] = xstrdup(dev); + return n_devices + 1; +} + +int Detail(char *dev, struct context *c) +{ + /* + * Print out details for an md array + */ + int fd = open(dev, O_RDONLY); + mdu_array_info_t array; + mdu_disk_info_t *disks = NULL; + int next; + int d; + time_t atime; + char *str; + char **devices = NULL; + int max_devices = 0, n_devices = 0; + int spares = 0; + struct stat stb; + int failed = 0; + struct supertype *st = NULL; + char *subarray = NULL; + int max_disks = MD_SB_DISKS; /* just a default */ + struct mdinfo *info = NULL; + struct mdinfo *sra = NULL; + struct mdinfo *subdev; + 
char *member = NULL; + char *container = NULL; + + int rv = c->test ? 4 : 1; + int avail_disks = 0; + char *avail = NULL; + int external; + int inactive; + int is_container = 0; + char *arrayst; + + if (fd < 0) { + pr_err("cannot open %s: %s\n", + dev, strerror(errno)); + return rv; + } + sra = sysfs_read(fd, NULL, GET_VERSION | GET_DEVS | + GET_ARRAY_STATE | GET_STATE); + if (!sra) { + if (md_get_array_info(fd, &array)) { + pr_err("%s does not appear to be an md device\n", dev); + goto out; + } + } + external = (sra != NULL && sra->array.major_version == -1 && + sra->array.minor_version == -2); + inactive = (sra != NULL && !md_array_is_active(sra)); + st = super_by_fd(fd, &subarray); + if (md_get_array_info(fd, &array)) { + if (errno == ENODEV) { + if (sra->array.major_version == -1 && + sra->array.minor_version == -1 && + sra->devs == NULL) { + pr_err("Array associated with md device %s does not exist.\n", + dev); + goto out; + } + array = sra->array; + } else { + pr_err("cannot get array detail for %s: %s\n", + dev, strerror(errno)); + goto out; + } + } + + if (array.raid_disks == 0 && external) + is_container = 1; + if (fstat(fd, &stb) != 0 && !S_ISBLK(stb.st_mode)) + stb.st_rdev = 0; + rv = 0; + + if (st) + max_disks = st->max_devs; + + if (subarray) { + /* This is a subarray of some container. + * We want the name of the container, and the member + */ + dev_t devid = devnm2devid(st->container_devnm); + int cfd, err; + + member = subarray; + container = map_dev_preferred(major(devid), minor(devid), + 1, c->prefer); + cfd = open_dev(st->container_devnm); + if (cfd >= 0) { + err = st->ss->load_container(st, cfd, NULL); + close(cfd); + if (err == 0) + info = st->ss->container_content(st, subarray); + } + } + + /* try to load a superblock. Try sra->devs first, then try ioctl */ + if (st && !info) + for (d = 0, subdev = sra ? sra->devs : NULL; + d < max_disks || subdev; + subdev ? 
(void)(subdev = subdev->next) : (void)(d++)){ + mdu_disk_info_t disk; + char *dv; + int fd2; + int err; + + if (subdev) + disk = subdev->disk; + else { + disk.number = d; + if (md_get_disk_info(fd, &disk) < 0) + continue; + if (d >= array.raid_disks && + disk.major == 0 && disk.minor == 0) + continue; + } + + if (array.raid_disks > 0 && + (disk.state & (1 << MD_DISK_ACTIVE)) == 0) + continue; + + dv = map_dev(disk.major, disk.minor, 1); + if (!dv) + continue; + + fd2 = dev_open(dv, O_RDONLY); + if (fd2 < 0) + continue; + + if (st->sb) + st->ss->free_super(st); + + err = st->ss->load_super(st, fd2, NULL); + close(fd2); + if (err) + continue; + if (info) + free(info); + if (subarray) + info = st->ss->container_content(st, subarray); + else { + info = xmalloc(sizeof(*info)); + st->ss->getinfo_super(st, info, NULL); + } + if (!info) + continue; + + if (array.raid_disks != 0 && /* container */ + (info->array.ctime != array.ctime || + info->array.level != array.level)) { + st->ss->free_super(st); + continue; + } + /* some formats (imsm) have free-floating-spares + * with a uuid of uuid_zero, they don't + * have very good info about the rest of the + * container, so keep searching when + * encountering such a device. Otherwise, stop + * after the first successful call to + * ->load_super. + */ + if (memcmp(uuid_zero, + info->uuid, + sizeof(uuid_zero)) == 0) { + st->ss->free_super(st); + continue; + } + break; + } + + /* Ok, we have some info to print... 
*/ + if (inactive && info) + str = map_num(pers, info->array.level); + else + str = map_num(pers, array.level); + + if (c->export) { + if (array.raid_disks) { + if (str) + printf("MD_LEVEL=%s\n", str); + printf("MD_DEVICES=%d\n", array.raid_disks); + } else { + if (is_container) + printf("MD_LEVEL=container\n"); + printf("MD_DEVICES=%d\n", array.nr_disks); + } + if (container) { + printf("MD_CONTAINER=%s\n", container); + printf("MD_MEMBER=%s\n", member); + } else { + if (sra && sra->array.major_version < 0) + printf("MD_METADATA=%s\n", sra->text_version); + else + printf("MD_METADATA=%d.%d\n", + array.major_version, + array.minor_version); + } + + if (st && st->sb && info) { + char nbuf[64]; + struct map_ent *mp, *map = NULL; + + fname_from_uuid(st, info, nbuf, ':'); + printf("MD_UUID=%s\n", nbuf + 5); + mp = map_by_uuid(&map, info->uuid); + if (mp && mp->path && + strncmp(mp->path, "/dev/md/", 8) == 0) { + printf("MD_DEVNAME="); + print_escape(mp->path + 8); + putchar('\n'); + } + + if (st->ss->export_detail_super) + st->ss->export_detail_super(st); + map_free(map); + } else { + struct map_ent *mp, *map = NULL; + char nbuf[64]; + mp = map_by_devnm(&map, fd2devnm(fd)); + if (mp) { + __fname_from_uuid(mp->uuid, 0, nbuf, ':'); + printf("MD_UUID=%s\n", nbuf+5); + } + if (mp && mp->path && + strncmp(mp->path, "/dev/md/", 8) == 0) { + printf("MD_DEVNAME="); + print_escape(mp->path+8); + putchar('\n'); + } + map_free(map); + } + if (!c->no_devices && sra) { + struct mdinfo *mdi; + for (mdi = sra->devs; mdi; mdi = mdi->next) { + char *path; + char *sysdev = xstrdup(mdi->sys_name); + char *cp; + + path = map_dev(mdi->disk.major, + mdi->disk.minor, 0); + for (cp = sysdev; *cp; cp++) + if (!isalnum(*cp)) + *cp = '_'; + + if (mdi->disk.raid_disk >= 0) + printf("MD_DEVICE_%s_ROLE=%d\n", + sysdev, + mdi->disk.raid_disk); + else + printf("MD_DEVICE_%s_ROLE=spare\n", + sysdev); + if (path) + printf("MD_DEVICE_%s_DEV=%s\n", + sysdev, path); + } + } + goto out; + } + + disks = 
xmalloc(max_disks * 2 * sizeof(mdu_disk_info_t)); + for (d = 0; d < max_disks * 2; d++) { + disks[d].state = (1 << MD_DISK_REMOVED); + disks[d].major = disks[d].minor = 0; + disks[d].number = -1; + disks[d].raid_disk = d / 2; + } + + next = array.raid_disks * 2; + if (inactive) { + struct mdinfo *mdi; + for (mdi = sra->devs; mdi; mdi = mdi->next) { + disks[next++] = mdi->disk; + disks[next - 1].number = -1; + } + } else for (d = 0; d < max_disks; d++) { + mdu_disk_info_t disk; + disk.number = d; + if (md_get_disk_info(fd, &disk) < 0) { + if (d < array.raid_disks) + pr_err("cannot get device detail for device %d: %s\n", + d, strerror(errno)); + continue; + } + if (disk.major == 0 && disk.minor == 0) + continue; + if (disk.raid_disk >= 0 && disk.raid_disk < array.raid_disks && + disks[disk.raid_disk * 2].state == (1 << MD_DISK_REMOVED) && + ((disk.state & (1 << MD_DISK_JOURNAL)) == 0)) + disks[disk.raid_disk * 2] = disk; + else if (disk.raid_disk >= 0 && + disk.raid_disk < array.raid_disks && + disks[disk.raid_disk * 2 + 1].state == + (1 << MD_DISK_REMOVED) && + !(disk.state & (1 << MD_DISK_JOURNAL))) + disks[disk.raid_disk * 2 + 1] = disk; + else if (next < max_disks * 2) + disks[next++] = disk; + } + + avail = xcalloc(array.raid_disks, 1); + + for (d = 0; d < array.raid_disks; d++) { + char dv[PATH_MAX], dv_rep[PATH_MAX]; + snprintf(dv, PATH_MAX, "/sys/dev/block/%d:%d", + disks[d*2].major, disks[d*2].minor); + snprintf(dv_rep, PATH_MAX, "/sys/dev/block/%d:%d", + disks[d*2+1].major, disks[d*2+1].minor); + + if ((is_dev_alive(dv) && (disks[d*2].state & (1<<MD_DISK_SYNC))) || + (is_dev_alive(dv_rep) && (disks[d*2+1].state & (1<<MD_DISK_SYNC)))) { + avail_disks ++; + avail[d] = 1; + } else + rv |= !! c->test; + } + + if (c->brief) { + mdu_bitmap_file_t bmf; + if (inactive && !is_container) + printf("INACTIVE-ARRAY %s", dev); + else + printf("ARRAY %s", dev); + if (c->verbose > 0) { + if (array.raid_disks) + printf(" level=%s num-devices=%d", + str ? 
str : "-unknown-", + array.raid_disks); + else if (is_container) + printf(" level=container num-devices=%d", + array.nr_disks); + else + printf(" num-devices=%d", array.nr_disks); + } + if (container) { + printf(" container=%s", container); + printf(" member=%s", member); + } else { + if (sra && sra->array.major_version < 0) + printf(" metadata=%s", sra->text_version); + else + printf(" metadata=%d.%d", array.major_version, + array.minor_version); + } + + /* Only try GET_BITMAP_FILE for 0.90.01 and later */ + if (ioctl(fd, GET_BITMAP_FILE, &bmf) == 0 && bmf.pathname[0]) { + printf(" bitmap=%s", bmf.pathname); + } + } else { + mdu_bitmap_file_t bmf; + unsigned long long larray_size; + struct mdstat_ent *ms = mdstat_read(0, 0); + struct mdstat_ent *e; + char *devnm; + + devnm = stat2devnm(&stb); + for (e = ms; e; e = e->next) + if (strcmp(e->devnm, devnm) == 0) + break; + if (!get_dev_size(fd, NULL, &larray_size)) + larray_size = 0; + + printf("%s:\n", dev); + + if (container) + printf(" Container : %s, member %s\n", + container, member); + else { + if (sra && sra->array.major_version < 0) + printf(" Version : %s\n", + sra->text_version); + else + printf(" Version : %d.%d\n", + array.major_version, + array.minor_version); + } + + atime = array.ctime; + if (atime) + printf(" Creation Time : %.24s\n", ctime(&atime)); + if (is_container) + str = "container"; + if (str) + printf(" Raid Level : %s\n", str); + if (larray_size) + printf(" Array Size : %llu%s\n", + (larray_size >> 10), + human_size(larray_size)); + if (array.level >= 1) { + if (sra) + array.major_version = sra->array.major_version; + if (array.major_version != 0 && + (larray_size >= 0xFFFFFFFFULL|| array.size == 0)) { + unsigned long long dsize; + + dsize = get_component_size(fd); + if (dsize > 0) + printf(" Used Dev Size : %llu%s\n", + dsize/2, + human_size((long long)dsize<<9)); + else + printf(" Used Dev Size : unknown\n"); + } else + printf(" Used Dev Size : %lu%s\n", + (unsigned long)array.size, + 
human_size((unsigned long long) + array.size << 10)); + } + if (array.raid_disks) + printf(" Raid Devices : %d\n", array.raid_disks); + printf(" Total Devices : %d\n", array.nr_disks); + if (!container && + ((sra == NULL && array.major_version == 0) || + (sra && sra->array.major_version == 0))) + printf(" Preferred Minor : %d\n", array.md_minor); + if (sra == NULL || sra->array.major_version >= 0) + printf(" Persistence : Superblock is %spersistent\n", + array.not_persistent ? "not " : ""); + printf("\n"); + /* Only try GET_BITMAP_FILE for 0.90.01 and later */ + if (ioctl(fd, GET_BITMAP_FILE, &bmf) == 0 && bmf.pathname[0]) { + printf(" Intent Bitmap : %s\n", bmf.pathname); + printf("\n"); + } else if (array.state & (1<<MD_SB_CLUSTERED)) + printf(" Intent Bitmap : Internal(Clustered)\n\n"); + else if (array.state & (1<<MD_SB_BITMAP_PRESENT)) + printf(" Intent Bitmap : Internal\n\n"); + atime = array.utime; + if (atime) + printf(" Update Time : %.24s\n", ctime(&atime)); + if (array.raid_disks) { + static char *sync_action[] = { + ", recovering", ", resyncing", + ", reshaping", ", checking" }; + char *st; + if (avail_disks == array.raid_disks) + st = ""; + else if (!enough(array.level, array.raid_disks, + array.layout, 1, avail)) + st = ", FAILED"; + else + st = ", degraded"; + + if (array.state & (1 << MD_SB_CLEAN)) { + if ((array.level == 0) || + (array.level == LEVEL_LINEAR)) + arrayst = map_num(sysfs_array_states, + sra->array_state); + else + arrayst = "clean"; + } else { + arrayst = "active"; + if (array.state & (1<<MD_SB_CLUSTERED)) { + for (d = 0; d < max_disks * 2; d++) { + char *dv; + mdu_disk_info_t disk = disks[d]; + + /* only check first valid disk in cluster env */ + if ((disk.state & (MD_DISK_SYNC | MD_DISK_ACTIVE)) + && (disk.major | disk.minor)) { + dv = map_dev_preferred(disk.major, disk.minor, 0, + c->prefer); + if (!dv) + continue; + arrayst = IsBitmapDirty(dv) ? 
"active" : "clean"; + break; + } + } + } + } + + printf(" State : %s%s%s%s%s%s%s \n", + arrayst, st, + (!e || (e->percent < 0 && + e->percent != RESYNC_PENDING && + e->percent != RESYNC_DELAYED && + e->percent != RESYNC_REMOTE)) ? + "" : sync_action[e->resync], + larray_size ? "": ", Not Started", + (e && e->percent == RESYNC_DELAYED) ? + " (DELAYED)": "", + (e && e->percent == RESYNC_PENDING) ? + " (PENDING)": "", + (e && e->percent == RESYNC_REMOTE) ? + " (REMOTE)": ""); + } else if (inactive && !is_container) { + printf(" State : inactive\n"); + } + if (array.raid_disks) + printf(" Active Devices : %d\n", array.active_disks); + if (array.working_disks > 0) + printf(" Working Devices : %d\n", + array.working_disks); + if (array.raid_disks) { + printf(" Failed Devices : %d\n", array.failed_disks); + if (!external) + printf(" Spare Devices : %d\n", array.spare_disks); + } + printf("\n"); + if (array.level == 5) { + str = map_num(r5layout, array.layout); + printf(" Layout : %s\n", + str ? str : "-unknown-"); + } + if (array.level == 0 && array.layout) { + str = map_num(r0layout, array.layout); + printf(" Layout : %s\n", + str ? str : "-unknown-"); + } + if (array.level == 6) { + str = map_num(r6layout, array.layout); + printf(" Layout : %s\n", + str ? 
str : "-unknown-"); + } + if (array.level == 10) { + printf(" Layout :"); + print_r10_layout(array.layout); + printf("\n"); + } + switch (array.level) { + case 0: + case 4: + case 5: + case 10: + case 6: + if (array.chunk_size) + printf(" Chunk Size : %dK\n\n", + array.chunk_size/1024); + break; + case -1: + printf(" Rounding : %dK\n\n", + array.chunk_size/1024); + break; + default: + break; + } + + if (array.raid_disks) { + struct mdinfo *mdi; + + mdi = sysfs_read(fd, NULL, GET_CONSISTENCY_POLICY); + if (mdi) { + char *policy = map_num(consistency_policies, + mdi->consistency_policy); + sysfs_free(mdi); + if (policy) + printf("Consistency Policy : %s\n\n", + policy); + } + } + + if (e && e->percent >= 0) { + static char *sync_action[] = { + "Rebuild", "Resync", "Reshape", "Check"}; + printf(" %7s Status : %d%% complete\n", + sync_action[e->resync], e->percent); + } + + if ((st && st->sb) && (info && info->reshape_active)) { +#if 0 +This is pretty boring + printf(" Reshape pos'n : %llu%s\n", + (unsigned long long) info->reshape_progress << 9, + human_size((unsigned long long) + info->reshape_progress << 9)); +#endif + if (info->delta_disks != 0) + printf(" Delta Devices : %d, (%d->%d)\n", + info->delta_disks, + array.raid_disks - info->delta_disks, + array.raid_disks); + if (info->new_level != array.level) { + str = map_num(pers, info->new_level); + printf(" New Level : %s\n", + str ? str : "-unknown-"); + } + if (info->new_level != array.level || + info->new_layout != array.layout) { + if (info->new_level == 5) { + str = map_num(r5layout, + info->new_layout); + printf(" New Layout : %s\n", + str ? str : "-unknown-"); + } + if (info->new_level == 6) { + str = map_num(r6layout, + info->new_layout); + printf(" New Layout : %s\n", + str ? str : "-unknown-"); + } + if (info->new_level == 10) { + printf(" New Layout : near=%d, %s=%d\n", + info->new_layout & 255, + (info->new_layout & 0x10000) ? 
+ "offset" : "far", + (info->new_layout >> 8) & 255); + } + } + if (info->new_chunk != array.chunk_size) + printf(" New Chunksize : %dK\n", + info->new_chunk/1024); + printf("\n"); + } else if (e && e->percent >= 0) + printf("\n"); + free_mdstat(ms); + + if (st && st->sb) + st->ss->detail_super(st, c->homehost, subarray); + + if (array.raid_disks == 0 && sra && + sra->array.major_version == -1 && + sra->array.minor_version == -2 && + sra->text_version[0] != '/') { + /* This looks like a container. Find any active arrays + * That claim to be a member. + */ + DIR *dir = opendir("/sys/block"); + struct dirent *de; + + printf(" Member Arrays :"); + + while (dir && (de = readdir(dir)) != NULL) { + char path[287]; + char vbuf[1024]; + int nlen = strlen(sra->sys_name); + dev_t devid; + if (de->d_name[0] == '.') + continue; + sprintf(path, + "/sys/block/%s/md/metadata_version", + de->d_name); + if (load_sys(path, vbuf, sizeof(vbuf)) < 0) + continue; + if (strncmp(vbuf, "external:", 9) || + !is_subarray(vbuf + 9) || + strncmp(vbuf + 10, sra->sys_name, nlen) || + vbuf[10 + nlen] != '/') + continue; + devid = devnm2devid(de->d_name); + printf(" %s", + map_dev_preferred(major(devid), + minor(devid), 1, + c->prefer)); + } + if (dir) + closedir(dir); + printf("\n\n"); + } + + if (!c->no_devices) { + if (array.raid_disks) + printf(" Number Major Minor RaidDevice State\n"); + else + printf(" Number Major Minor RaidDevice\n"); + } + } + + /* if --no_devices specified, not print component devices info */ + if (c->no_devices) + goto skip_devices_state; + + for (d = 0; d < max_disks * 2; d++) { + char *dv; + mdu_disk_info_t disk = disks[d]; + + if (d >= array.raid_disks * 2 && + disk.major == 0 && disk.minor == 0) + continue; + if ((d & 1) && disk.major == 0 && disk.minor == 0) + continue; + if (!c->brief) { + if (d == array.raid_disks*2) + printf("\n"); + if (disk.number < 0 && disk.raid_disk < 0) + printf(" - %5d %5d - ", + disk.major, disk.minor); + else if (disk.raid_disk < 0 || + 
disk.state & (1 << MD_DISK_JOURNAL)) + printf(" %5d %5d %5d - ", + disk.number, disk.major, disk.minor); + else if (disk.number < 0) + printf(" - %5d %5d %5d ", + disk.major, disk.minor, disk.raid_disk); + else + printf(" %5d %5d %5d %5d ", + disk.number, disk.major, disk.minor, + disk.raid_disk); + } + if (!c->brief && array.raid_disks) { + if (disk.state & (1 << MD_DISK_FAULTY)) { + printf(" faulty"); + if (disk.raid_disk < array.raid_disks && + disk.raid_disk >= 0) + failed++; + } + if (disk.state & (1 << MD_DISK_ACTIVE)) + printf(" active"); + if (disk.state & (1 << MD_DISK_SYNC)) { + printf(" sync"); + if (array.level == 10 && + (array.layout & ~0x1FFFF) == 0) { + int nc = array.layout & 0xff; + int fc = (array.layout >> 8) & 0xff; + int copies = nc*fc; + if (fc == 1 && + array.raid_disks % copies == 0 && + copies <= 26) { + /* We can divide the devices + into 'sets' */ + int set; + set = disk.raid_disk % copies; + printf(" set-%c", set + 'A'); + } + } + } + if (disk.state & (1 << MD_DISK_REMOVED)) + printf(" removed"); + if (disk.state & (1 << MD_DISK_WRITEMOSTLY)) + printf(" writemostly"); + if (disk.state & (1 << MD_DISK_FAILFAST)) + printf(" failfast"); + if (disk.state & (1 << MD_DISK_JOURNAL)) + printf(" journal"); + if ((disk.state & + ((1 << MD_DISK_ACTIVE) | (1 << MD_DISK_SYNC) | + (1 << MD_DISK_REMOVED) | (1 << MD_DISK_FAULTY) | + (1 << MD_DISK_JOURNAL))) == 0) { + printf(" spare"); + if (disk.raid_disk < array.raid_disks && + disk.raid_disk >= 0) + printf(" rebuilding"); + } + } + if (disk.state == 0) + spares++; + dv = map_dev_preferred(disk.major, disk.minor, 0, c->prefer); + if (dv != NULL) { + if (c->brief) + n_devices = add_device(dv, &devices, + &max_devices, n_devices); + else + printf(" %s", dv); + } else if (disk.major | disk.minor) + printf(" missing"); + if (!c->brief) + printf("\n"); + } + +skip_devices_state: + if (spares && c->brief && array.raid_disks) + printf(" spares=%d", spares); + if (c->brief && st && st->sb) + 
st->ss->brief_detail_super(st, subarray); + if (st) + st->ss->free_super(st); + + if (c->brief && c->verbose > 0 && devices) { + qsort(devices, n_devices, sizeof(*devices), cmpstringp); + printf("\n devices=%s", devices[0]); + for (d = 1; d < n_devices; d++) + printf(",%s", devices[d]); + } + if (c->brief) + printf("\n"); + if (c->test && + !enough(array.level, array.raid_disks, array.layout, 1, avail)) + rv = 2; + +out: + free(info); + free(disks); + close(fd); + free(subarray); + free(avail); + if (devices) + for (d = 0; d < n_devices; d++) + free(devices[d]); + free(devices); + sysfs_free(sra); + free(st); + return rv; +} + +int Detail_Platform(struct superswitch *ss, int scan, int verbose, int export, char *controller_path) +{ + /* display platform capabilities for the given metadata format + * 'scan' in this context means iterate over all metadata types + */ + int i; + int err = 1; + + if (ss && export && ss->export_detail_platform) + err = ss->export_detail_platform(verbose, controller_path); + else if (ss && ss->detail_platform) + err = ss->detail_platform(verbose, 0, controller_path); + else if (ss) { + if (verbose > 0) + pr_err("%s metadata is platform independent\n", + ss->name ? : "[no name]"); + } else if (!scan) { + if (verbose > 0) + pr_err("specify a metadata type or --scan\n"); + } + + if (!scan) + return err; + + err = 0; + for (i = 0; superlist[i]; i++) { + struct superswitch *meta = superlist[i]; + + if (meta == ss) + continue; + if (verbose > 0) + pr_err("checking metadata %s\n", + meta->name ? : "[no name]"); + if (!meta->detail_platform) { + if (verbose > 0) + pr_err("%s metadata is platform independent\n", + meta->name ? : "[no name]"); + } else if (export && meta->export_detail_platform) { + err |= meta->export_detail_platform(verbose, controller_path); + } else + err |= meta->detail_platform(verbose, 0, controller_path); + } + + return err; +} @@ -0,0 +1,319 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. 
+ * + * Copyright (C) 2013 Neil Brown <neilb@suse.de> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +#include "mdadm.h" +#include <sys/dir.h> + +int Dump_metadata(char *dev, char *dir, struct context *c, + struct supertype *st) +{ + /* create a new file in 'dir' named for the basename of 'dev'. + * Truncate to the same size as 'dev' and ask the metadata + * handler to copy metadata there. + * For every name in /dev/disk/by-id that points to this device, + * create a hardlink in 'dir'. + * Complain if any of those hardlinks cannot be created. 
+ */ + int fd, fl; + struct stat stb, dstb; + char *base; + char *fname = NULL; + unsigned long long size; + DIR *dirp; + struct dirent *de; + + if (stat(dir, &stb) != 0 || + (S_IFMT & stb.st_mode) != S_IFDIR) { + pr_err("--dump requires an existing directory, not: %s\n", + dir); + return 16; + } + + fd = dev_open(dev, O_RDONLY); + if (fd < 0) { + pr_err("Cannot open %s to dump metadata: %s\n", + dev, strerror(errno)); + return 1; + } + if (!get_dev_size(fd, dev, &size)) { + close(fd); + return 1; + } + + if (st == NULL) + st = guess_super_type(fd, guess_array); + if (!st) { + pr_err("Cannot find RAID metadata on %s\n", dev); + close(fd); + return 1; + } + + st->ignore_hw_compat = 1; + if (st->ss->load_super(st, fd, NULL) != 0) { + pr_err("No %s metadata found on %s\n", + st->ss->name, dev); + close(fd); + return 1; + } + if (st->ss->copy_metadata == NULL) { + pr_err("%s metadata on %s cannot be copied\n", + st->ss->name, dev); + close(fd); + return 1; + } + + base = strrchr(dev, '/'); + if (base) + base++; + else + base = dev; + xasprintf(&fname, "%s/%s", dir, base); + fl = open(fname, O_RDWR|O_CREAT|O_EXCL, 0666); + if (fl < 0) { + pr_err("Cannot create dump file %s: %s\n", + fname, strerror(errno)); + close(fd); + free(fname); + return 1; + } + if (ftruncate(fl, size) < 0) { + pr_err("failed to set size of dump file: %s\n", + strerror(errno)); + close(fd); + close(fl); + free(fname); + return 1; + } + + if (st->ss->copy_metadata(st, fd, fl) != 0) { + pr_err("Failed to copy metadata from %s to %s\n", + dev, fname); + close(fd); + close(fl); + unlink(fname); + free(fname); + return 1; + } + if (c->verbose >= 0) + printf("%s saved as %s.\n", dev, fname); + fstat(fd, &dstb); + close(fd); + close(fl); + if ((dstb.st_mode & S_IFMT) != S_IFBLK) { + /* Not a block device, so cannot create links */ + free(fname); + return 0; + } + /* mostly done: just want to find some other names */ + dirp = opendir("/dev/disk/by-id"); + if (!dirp) { + free(fname); + return 0; + } + 
while ((de = readdir(dirp)) != NULL) { + char *p = NULL; + if (de->d_name[0] == '.') + continue; + xasprintf(&p, "/dev/disk/by-id/%s", de->d_name); + if (stat(p, &stb) != 0 || + (stb.st_mode & S_IFMT) != S_IFBLK || + stb.st_rdev != dstb.st_rdev) { + /* Not this one */ + free(p); + continue; + } + free(p); + xasprintf(&p, "%s/%s", dir, de->d_name); + if (link(fname, p) == 0) { + if (c->verbose >= 0) + printf("%s also saved as %s.\n", + dev, p); + } else { + pr_err("Could not save %s as %s!!\n", + dev, p); + } + free(p); + } + closedir(dirp); + free(fname); + return 0; +} + +int Restore_metadata(char *dev, char *dir, struct context *c, + struct supertype *st, int only) +{ + /* If 'dir' really is a directory we choose a name + * from it that matches a suitable name in /dev/disk/by-id, + * and copy metadata from the file to the device. + * If two names from by-id match and aren't both the same + * inode, we fail. If none match and basename of 'dev' + * can be found in dir, use that. + * If 'dir' is really a file then it is only permitted if + * 'only' is set (meaning there was only one device given) + * and the metadata is restored irrespective of file names. + */ + int fd, fl; + struct stat stb, dstb; + char *fname = NULL; + unsigned long long size; + + if (stat(dir, &stb) != 0) { + pr_err("%s does not exist: cannot restore from there.\n", + dir); + return 16; + } else if ((S_IFMT & stb.st_mode) != S_IFDIR && !only) { + pr_err("--restore requires a directory when multiple devices given\n"); + return 16; + } + + fd = dev_open(dev, O_RDWR); + if (fd < 0) { + pr_err("Cannot open %s to restore metadata: %s\n", + dev, strerror(errno)); + return 1; + } + if (!get_dev_size(fd, dev, &size)) { + close(fd); + return 1; + } + + if ((S_IFMT & stb.st_mode) == S_IFDIR) { + /* choose one name from the directory. 
*/ + DIR *d = opendir(dir); + struct dirent *de; + char *chosen = NULL; + unsigned int chosen_inode = 0; + + fstat(fd, &dstb); + + while (d && (de = readdir(d)) != NULL) { + if (de->d_name[0] == '.') + continue; + xasprintf(&fname, "/dev/disk/by-id/%s", de->d_name); + if (stat(fname, &stb) != 0) { + free(fname); + continue; + } + free(fname); + if ((S_IFMT & stb.st_mode) != S_IFBLK) + continue; + if (stb.st_rdev != dstb.st_rdev) + continue; + /* This file is a good match for our device. */ + xasprintf(&fname, "%s/%s", dir, de->d_name); + if (stat(fname, &stb) != 0) { + /* Weird! */ + free(fname); + continue; + } + if (chosen == NULL) { + chosen = fname; + chosen_inode = stb.st_ino; + continue; + } + if (chosen_inode == stb.st_ino) { + /* same, no need to change */ + free(fname); + continue; + } + /* Oh dear, two names both match. Must give up. */ + pr_err("Both %s and %s seem suitable for %s. Please choose one.\n", + chosen, fname, dev); + free(fname); + free(chosen); + close(fd); + closedir(d); + return 1; + } + closedir(d); + if (!chosen) { + /* One last chance: try basename of device */ + char *base = strrchr(dev, '/'); + if (base) + base++; + else + base = dev; + xasprintf(&fname, "%s/%s", dir, base); + if (stat(fname, &stb) == 0) + chosen = fname; + else + free(fname); + } + fname = chosen; + } else + fname = strdup(dir); + + if (!fname) { + pr_err("Cannot find suitable file in %s for %s\n", + dir, dev); + close(fd); + return 1; + } + + fl = open(fname, O_RDONLY); + if (!fl) { + pr_err("Could not open %s for --restore.\n", + fname); + goto err; + } + if (stat(fname, &stb) != 0) { + pr_err("Could not stat %s for --restore.\n", + fname); + goto err; + } + if (((unsigned long long)stb.st_size) != size) { + pr_err("%s is not the same size as %s - cannot restore.\n", + fname, dev); + goto err; + } + if (st == NULL) + st = guess_super_type(fl, guess_array); + if (!st) { + pr_err("Cannot find metadata on %s\n", fname); + goto err; + } + st->ignore_hw_compat = 1; + if 
(st->ss->load_super(st, fl, NULL) != 0) { + pr_err("No %s metadata found on %s\n", + st->ss->name, fname); + goto err; + } + if (st->ss->copy_metadata == NULL) { + pr_err("%s metadata on %s cannot be copied\n", + st->ss->name, dev); + goto err; + } + if (st->ss->copy_metadata(st, fl, fd) != 0) { + pr_err("Failed to copy metadata from %s to %s\n", + fname, dev); + goto err; + } + if (c->verbose >= 0) + printf("%s restored from %s.\n", dev, fname); + close(fl); + close(fd); + free(fname); + return 0; + +err: + close(fd); + close(fl); + free(fname); + return 1; +} diff --git a/Examine.c b/Examine.c new file mode 100644 index 0000000..9574a3c --- /dev/null +++ b/Examine.c @@ -0,0 +1,228 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2001-2013 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +#include "mdadm.h" +#include "dlink.h" + +#if ! defined(__BIG_ENDIAN) && ! defined(__LITTLE_ENDIAN) +#error no endian defined +#endif +#include "md_u.h" +#include "md_p.h" +int Examine(struct mddev_dev *devlist, + struct context *c, + struct supertype *forcest) +{ + + /* Read the raid superblock from a device and + * display important content. 
+ * + * If cannot be found, print reason: too small, bad magic + * + * Print: + * version, ctime, level, size, raid+spare+ + * prefered minor + * uuid + * + * utime, state etc + * + * If (brief) gather devices for same array and just print a mdadm.conf + * line including devices= + * if devlist==NULL, use conf_get_devs() + */ + int fd; + int rv = 0; + + struct array { + struct supertype *st; + struct mdinfo info; + void *devs; + struct array *next; + int spares; + } *arrays = NULL; + + for (; devlist ; devlist = devlist->next) { + struct supertype *st; + int have_container = 0; + int err = 0; + int container = 0; + + fd = dev_open(devlist->devname, O_RDONLY); + if (fd < 0) { + if (!c->scan) { + pr_err("cannot open %s: %s\n", + devlist->devname, strerror(errno)); + rv = 1; + } + continue; + } + + if (forcest) + st = dup_super(forcest); + else if (must_be_container(fd)) { + /* might be a container */ + st = super_by_fd(fd, NULL); + container = 1; + } else + st = guess_super(fd); + if (st) { + err = 1; + st->ignore_hw_compat = 1; + if (!container) + err = st->ss->load_super(st, fd, + (c->brief||c->scan) ? NULL + :devlist->devname); + if (err && st->ss->load_container) { + err = st->ss->load_container(st, fd, + (c->brief||c->scan) ? 
NULL + :devlist->devname); + if (!err) + have_container = 1; + } + st->ignore_hw_compat = 0; + } else { + if (!c->brief) { + pr_err("No md superblock detected on %s.\n", devlist->devname); + rv = 1; + } + err = 1; + } + close(fd); + + if (err) { + if (st) + st->ss->free_super(st); + continue; + } + + if (c->SparcAdjust) + st->ss->update_super(st, NULL, "sparc2.2", + devlist->devname, 0, 0, NULL); + /* Ok, its good enough to try, though the checksum could be wrong */ + + if (c->brief && st->ss->brief_examine_super == NULL) { + if (!c->scan) + pr_err("No brief listing for %s on %s\n", + st->ss->name, devlist->devname); + } else if (c->brief) { + struct array *ap; + char *d; + for (ap = arrays; ap; ap = ap->next) { + if (st->ss == ap->st->ss && + st->ss->compare_super(ap->st, st, 0) == 0) + break; + } + if (!ap) { + ap = xmalloc(sizeof(*ap)); + ap->devs = dl_head(); + ap->next = arrays; + ap->spares = 0; + ap->st = st; + arrays = ap; + st->ss->getinfo_super(st, &ap->info, NULL); + } else + st->ss->getinfo_super(st, &ap->info, NULL); + if (!have_container && + !(ap->info.disk.state & (1<<MD_DISK_SYNC))) + ap->spares++; + d = dl_strdup(devlist->devname); + dl_add(ap->devs, d); + } else if (c->export) { + if (st->ss->export_examine_super) + st->ss->export_examine_super(st); + st->ss->free_super(st); + } else { + printf("%s:\n",devlist->devname); + st->ss->examine_super(st, c->homehost); + st->ss->free_super(st); + } + } + if (c->brief) { + struct array *ap; + for (ap = arrays; ap; ap = ap->next) { + char sep='='; + char *d; + int newline = 0; + + ap->st->ss->brief_examine_super(ap->st, c->verbose > 0); + if (ap->spares && !ap->st->ss->external) + newline += printf(" spares=%d", ap->spares); + if (c->verbose > 0) { + newline += printf(" devices"); + for (d = dl_next(ap->devs); + d != ap->devs; + d=dl_next(d)) { + printf("%c%s", sep, d); + sep=','; + } + } + if (ap->st->ss->brief_examine_subarrays) { + if (newline) + printf("\n"); + 
ap->st->ss->brief_examine_subarrays(ap->st, c->verbose); + } + ap->st->ss->free_super(ap->st); + /* FIXME free ap */ + if (ap->spares || c->verbose > 0) + printf("\n"); + } + } + return rv; +} + +int ExamineBadblocks(char *devname, int brief, struct supertype *forcest) +{ + int fd = dev_open(devname, O_RDONLY); + struct supertype *st = forcest; + int err = 1; + + if (fd < 0) { + pr_err("cannot open %s: %s\n", devname, strerror(errno)); + return 1; + } + if (!st) + st = guess_super(fd); + if (!st) { + if (!brief) + pr_err("No md superblock detected on %s\n", devname); + goto out; + } + if (!st->ss->examine_badblocks) { + pr_err("%s metadata does not support badblocks\n", st->ss->name); + goto out; + } + err = st->ss->load_super(st, fd, brief ? NULL : devname); + if (err) + goto out; + err = st->ss->examine_badblocks(st, fd, devname); + +out: + if (fd >= 0) + close(fd); + if (st) { + st->ss->free_super(st); + free(st); + } + return err; +} @@ -0,0 +1,5229 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2001-2013 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ +#include "mdadm.h" +#include "dlink.h" +#include <sys/mman.h> +#include <stddef.h> +#include <stdint.h> +#include <signal.h> +#include <sys/wait.h> + +#if ! defined(__BIG_ENDIAN) && ! defined(__LITTLE_ENDIAN) +#error no endian defined +#endif +#include "md_u.h" +#include "md_p.h" + +int restore_backup(struct supertype *st, + struct mdinfo *content, + int working_disks, + int next_spare, + char **backup_filep, + int verbose) +{ + int i; + int *fdlist; + struct mdinfo *dev; + int err; + int disk_count = next_spare + working_disks; + char *backup_file = *backup_filep; + + dprintf("Called restore_backup()\n"); + fdlist = xmalloc(sizeof(int) * disk_count); + + enable_fds(next_spare); + for (i = 0; i < next_spare; i++) + fdlist[i] = -1; + for (dev = content->devs; dev; dev = dev->next) { + char buf[22]; + int fd; + + sprintf(buf, "%d:%d", dev->disk.major, dev->disk.minor); + fd = dev_open(buf, O_RDWR); + + if (dev->disk.raid_disk >= 0) + fdlist[dev->disk.raid_disk] = fd; + else + fdlist[next_spare++] = fd; + } + + if (!backup_file) { + backup_file = locate_backup(content->sys_name); + *backup_filep = backup_file; + } + + if (st->ss->external && st->ss->recover_backup) + err = st->ss->recover_backup(st, content); + else + err = Grow_restart(st, content, fdlist, next_spare, + backup_file, verbose > 0); + + while (next_spare > 0) { + next_spare--; + if (fdlist[next_spare] >= 0) + close(fdlist[next_spare]); + } + free(fdlist); + if (err) { + pr_err("Failed to restore critical section for reshape - sorry.\n"); + if (!backup_file) + pr_err("Possibly you need to specify a --backup-file\n"); + return 1; + } + + dprintf("restore_backup() returns status OK.\n"); + return 0; +} + +int Grow_Add_device(char 
*devname, int fd, char *newdev) +{ + /* Add a device to an active array. + * Currently, just extend a linear array. + * This requires writing a new superblock on the + * new device, calling the kernel to add the device, + * and if that succeeds, update the superblock on + * all other devices. + * This means that we need to *find* all other devices. + */ + struct mdinfo info; + + dev_t rdev; + int nfd, fd2; + int d, nd; + struct supertype *st = NULL; + char *subarray = NULL; + + if (md_get_array_info(fd, &info.array) < 0) { + pr_err("cannot get array info for %s\n", devname); + return 1; + } + + if (info.array.level != -1) { + pr_err("can only add devices to linear arrays\n"); + return 1; + } + + st = super_by_fd(fd, &subarray); + if (!st) { + pr_err("cannot handle arrays with superblock version %d\n", + info.array.major_version); + return 1; + } + + if (subarray) { + pr_err("Cannot grow linear sub-arrays yet\n"); + free(subarray); + free(st); + return 1; + } + + nfd = open(newdev, O_RDWR|O_EXCL|O_DIRECT); + if (nfd < 0) { + pr_err("cannot open %s\n", newdev); + free(st); + return 1; + } + if (!fstat_is_blkdev(nfd, newdev, &rdev)) { + close(nfd); + free(st); + return 1; + } + /* now check out all the devices and make sure we can read the + * superblock */ + for (d=0 ; d < info.array.raid_disks ; d++) { + mdu_disk_info_t disk; + char *dv; + + st->ss->free_super(st); + + disk.number = d; + if (md_get_disk_info(fd, &disk) < 0) { + pr_err("cannot get device detail for device %d\n", d); + close(nfd); + free(st); + return 1; + } + dv = map_dev(disk.major, disk.minor, 1); + if (!dv) { + pr_err("cannot find device file for device %d\n", d); + close(nfd); + free(st); + return 1; + } + fd2 = dev_open(dv, O_RDWR); + if (fd2 < 0) { + pr_err("cannot open device file %s\n", dv); + close(nfd); + free(st); + return 1; + } + + if (st->ss->load_super(st, fd2, NULL)) { + pr_err("cannot find super block on %s\n", dv); + close(nfd); + close(fd2); + free(st); + return 1; + } + 
close(fd2); + } + /* Ok, looks good. Lets update the superblock and write it out to + * newdev. + */ + + info.disk.number = d; + info.disk.major = major(rdev); + info.disk.minor = minor(rdev); + info.disk.raid_disk = d; + info.disk.state = (1 << MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE); + if (st->ss->update_super(st, &info, "linear-grow-new", newdev, + 0, 0, NULL) != 0) { + pr_err("Preparing new metadata failed on %s\n", newdev); + close(nfd); + return 1; + } + + if (st->ss->store_super(st, nfd)) { + pr_err("Cannot store new superblock on %s\n", newdev); + close(nfd); + return 1; + } + close(nfd); + + if (ioctl(fd, ADD_NEW_DISK, &info.disk) != 0) { + pr_err("Cannot add new disk to this array\n"); + return 1; + } + /* Well, that seems to have worked. + * Now go through and update all superblocks + */ + + if (md_get_array_info(fd, &info.array) < 0) { + pr_err("cannot get array info for %s\n", devname); + return 1; + } + + nd = d; + for (d=0 ; d < info.array.raid_disks ; d++) { + mdu_disk_info_t disk; + char *dv; + + disk.number = d; + if (md_get_disk_info(fd, &disk) < 0) { + pr_err("cannot get device detail for device %d\n", d); + return 1; + } + dv = map_dev(disk.major, disk.minor, 1); + if (!dv) { + pr_err("cannot find device file for device %d\n", d); + return 1; + } + fd2 = dev_open(dv, O_RDWR); + if (fd2 < 0) { + pr_err("cannot open device file %s\n", dv); + return 1; + } + if (st->ss->load_super(st, fd2, NULL)) { + pr_err("cannot find super block on %s\n", dv); + close(fd); + close(fd2); + return 1; + } + info.array.raid_disks = nd+1; + info.array.nr_disks = nd+1; + info.array.active_disks = nd+1; + info.array.working_disks = nd+1; + + if (st->ss->update_super(st, &info, "linear-grow-update", dv, + 0, 0, NULL) != 0) { + pr_err("Updating metadata failed on %s\n", dv); + close(fd2); + return 1; + } + + if (st->ss->store_super(st, fd2)) { + pr_err("Cannot store new superblock on %s\n", dv); + close(fd2); + return 1; + } + close(fd2); + } + + return 0; +} + +int 
Grow_addbitmap(char *devname, int fd, struct context *c, struct shape *s) +{ + /* + * First check that array doesn't have a bitmap + * Then create the bitmap + * Then add it + * + * For internal bitmaps, we need to check the version, + * find all the active devices, and write the bitmap block + * to all devices + */ + mdu_bitmap_file_t bmf; + mdu_array_info_t array; + struct supertype *st; + char *subarray = NULL; + int major = BITMAP_MAJOR_HI; + unsigned long long bitmapsize, array_size; + struct mdinfo *mdi; + + /* + * We only ever get called if s->bitmap_file is != NULL, so this check + * is just here to quiet down static code checkers. + */ + if (!s->bitmap_file) + return 1; + + if (strcmp(s->bitmap_file, "clustered") == 0) + major = BITMAP_MAJOR_CLUSTERED; + + if (ioctl(fd, GET_BITMAP_FILE, &bmf) != 0) { + if (errno == ENOMEM) + pr_err("Memory allocation failure.\n"); + else + pr_err("bitmaps not supported by this kernel.\n"); + return 1; + } + if (bmf.pathname[0]) { + if (strcmp(s->bitmap_file,"none") == 0) { + if (ioctl(fd, SET_BITMAP_FILE, -1) != 0) { + pr_err("failed to remove bitmap %s\n", + bmf.pathname); + return 1; + } + return 0; + } + pr_err("%s already has a bitmap (%s)\n", devname, bmf.pathname); + return 1; + } + if (md_get_array_info(fd, &array) != 0) { + pr_err("cannot get array status for %s\n", devname); + return 1; + } + if (array.state & (1 << MD_SB_BITMAP_PRESENT)) { + if (strcmp(s->bitmap_file, "none")==0) { + array.state &= ~(1 << MD_SB_BITMAP_PRESENT); + if (md_set_array_info(fd, &array) != 0) { + if (array.state & (1 << MD_SB_CLUSTERED)) + pr_err("failed to remove clustered bitmap.\n"); + else + pr_err("failed to remove internal bitmap.\n"); + return 1; + } + return 0; + } + pr_err("bitmap already present on %s\n", devname); + return 1; + } + + if (strcmp(s->bitmap_file, "none") == 0) { + pr_err("no bitmap found on %s\n", devname); + return 1; + } + if (array.level <= 0) { + pr_err("Bitmaps not meaningful with level %s\n", + 
map_num(pers, array.level)?:"of this array"); + return 1; + } + bitmapsize = array.size; + bitmapsize <<= 1; + if (get_dev_size(fd, NULL, &array_size) && + array_size > (0x7fffffffULL << 9)) { + /* Array is big enough that we cannot trust array.size + * try other approaches + */ + bitmapsize = get_component_size(fd); + } + if (bitmapsize == 0) { + pr_err("Cannot reliably determine size of array to create bitmap - sorry.\n"); + return 1; + } + + if (array.level == 10) { + int ncopies; + + ncopies = (array.layout & 255) * ((array.layout >> 8) & 255); + bitmapsize = bitmapsize * array.raid_disks / ncopies; + + if (strcmp(s->bitmap_file, "clustered") == 0 && + !is_near_layout_10(array.layout)) { + pr_err("only near layout is supported with clustered raid10\n"); + return 1; + } + } + + st = super_by_fd(fd, &subarray); + if (!st) { + pr_err("Cannot understand version %d.%d\n", + array.major_version, array.minor_version); + return 1; + } + if (subarray) { + pr_err("Cannot add bitmaps to sub-arrays yet\n"); + free(subarray); + free(st); + return 1; + } + + mdi = sysfs_read(fd, NULL, GET_CONSISTENCY_POLICY); + if (mdi) { + if (mdi->consistency_policy == CONSISTENCY_POLICY_PPL) { + pr_err("Cannot add bitmap to array with PPL\n"); + free(mdi); + free(st); + return 1; + } + free(mdi); + } + + if (strcmp(s->bitmap_file, "internal") == 0 || + strcmp(s->bitmap_file, "clustered") == 0) { + int rv; + int d; + int offset_setable = 0; + if (st->ss->add_internal_bitmap == NULL) { + pr_err("Internal bitmaps not supported with %s metadata\n", st->ss->name); + return 1; + } + st->nodes = c->nodes; + st->cluster_name = c->homecluster; + mdi = sysfs_read(fd, NULL, GET_BITMAP_LOCATION); + if (mdi) + offset_setable = 1; + for (d = 0; d < st->max_devs; d++) { + mdu_disk_info_t disk; + char *dv; + int fd2; + + disk.number = d; + if (md_get_disk_info(fd, &disk) < 0) + continue; + if (disk.major == 0 && disk.minor == 0) + continue; + if ((disk.state & (1 << MD_DISK_SYNC)) == 0) + continue; + dv 
= map_dev(disk.major, disk.minor, 1); + if (!dv) + continue; + if (((disk.state & (1 << MD_DISK_WRITEMOSTLY)) == 0) && + (strcmp(s->bitmap_file, "clustered") == 0)) { + pr_err("%s disks marked write-mostly are not supported with clustered bitmap\n",devname); + return 1; + } + fd2 = dev_open(dv, O_RDWR); + if (fd2 < 0) + continue; + rv = st->ss->load_super(st, fd2, NULL); + if (!rv) { + rv = st->ss->add_internal_bitmap( + st, &s->bitmap_chunk, c->delay, + s->write_behind, bitmapsize, + offset_setable, major); + if (!rv) { + st->ss->write_bitmap(st, fd2, + NodeNumUpdate); + } else { + pr_err("failed to create internal bitmap - chunksize problem.\n"); + } + } else { + pr_err("failed to load super-block.\n"); + } + close(fd2); + if (rv) + return 1; + } + if (offset_setable) { + st->ss->getinfo_super(st, mdi, NULL); + if (sysfs_init(mdi, fd, NULL)) { + pr_err("failed to initialize sysfs.\n"); + free(mdi); + } + rv = sysfs_set_num_signed(mdi, NULL, "bitmap/location", + mdi->bitmap_offset); + free(mdi); + } else { + if (strcmp(s->bitmap_file, "clustered") == 0) + array.state |= (1 << MD_SB_CLUSTERED); + array.state |= (1 << MD_SB_BITMAP_PRESENT); + rv = md_set_array_info(fd, &array); + } + if (rv < 0) { + if (errno == EBUSY) + pr_err("Cannot add bitmap while array is resyncing or reshaping etc.\n"); + pr_err("failed to set internal bitmap.\n"); + return 1; + } + } else { + int uuid[4]; + int bitmap_fd; + int d; + int max_devs = st->max_devs; + + /* try to load a superblock */ + for (d = 0; d < max_devs; d++) { + mdu_disk_info_t disk; + char *dv; + int fd2; + disk.number = d; + if (md_get_disk_info(fd, &disk) < 0) + continue; + if ((disk.major==0 && disk.minor == 0) || + (disk.state & (1 << MD_DISK_REMOVED))) + continue; + dv = map_dev(disk.major, disk.minor, 1); + if (!dv) + continue; + fd2 = dev_open(dv, O_RDONLY); + if (fd2 >= 0) { + if (st->ss->load_super(st, fd2, NULL) == 0) { + close(fd2); + st->ss->uuid_from_super(st, uuid); + break; + } + close(fd2); + } + } + if 
(d == max_devs) { + pr_err("cannot find UUID for array!\n"); + return 1; + } + if (CreateBitmap(s->bitmap_file, c->force, (char*)uuid, + s->bitmap_chunk, c->delay, s->write_behind, + bitmapsize, major)) { + return 1; + } + bitmap_fd = open(s->bitmap_file, O_RDWR); + if (bitmap_fd < 0) { + pr_err("weird: %s cannot be opened\n", s->bitmap_file); + return 1; + } + if (ioctl(fd, SET_BITMAP_FILE, bitmap_fd) < 0) { + int err = errno; + if (errno == EBUSY) + pr_err("Cannot add bitmap while array is resyncing or reshaping etc.\n"); + pr_err("Cannot set bitmap file for %s: %s\n", + devname, strerror(err)); + return 1; + } + } + + return 0; +} + +int Grow_consistency_policy(char *devname, int fd, struct context *c, struct shape *s) +{ + struct supertype *st; + struct mdinfo *sra; + struct mdinfo *sd; + char *subarray = NULL; + int ret = 0; + char container_dev[PATH_MAX]; + char buf[20]; + + if (s->consistency_policy != CONSISTENCY_POLICY_RESYNC && + s->consistency_policy != CONSISTENCY_POLICY_PPL) { + pr_err("Operation not supported for consistency policy %s\n", + map_num(consistency_policies, s->consistency_policy)); + return 1; + } + + st = super_by_fd(fd, &subarray); + if (!st) + return 1; + + sra = sysfs_read(fd, NULL, GET_CONSISTENCY_POLICY|GET_LEVEL| + GET_DEVS|GET_STATE); + if (!sra) { + ret = 1; + goto free_st; + } + + if (s->consistency_policy == CONSISTENCY_POLICY_PPL && + !st->ss->write_init_ppl) { + pr_err("%s metadata does not support PPL\n", st->ss->name); + ret = 1; + goto free_info; + } + + if (sra->array.level != 5) { + pr_err("Operation not supported for array level %d\n", + sra->array.level); + ret = 1; + goto free_info; + } + + if (sra->consistency_policy == (unsigned)s->consistency_policy) { + pr_err("Consistency policy is already %s\n", + map_num(consistency_policies, s->consistency_policy)); + ret = 1; + goto free_info; + } else if (sra->consistency_policy != CONSISTENCY_POLICY_RESYNC && + sra->consistency_policy != CONSISTENCY_POLICY_PPL) { + 
pr_err("Current consistency policy is %s, cannot change to %s\n", + map_num(consistency_policies, sra->consistency_policy), + map_num(consistency_policies, s->consistency_policy)); + ret = 1; + goto free_info; + } + + if (s->consistency_policy == CONSISTENCY_POLICY_PPL) { + if (sysfs_get_str(sra, NULL, "sync_action", buf, 20) <= 0) { + ret = 1; + goto free_info; + } else if (strcmp(buf, "reshape\n") == 0) { + pr_err("PPL cannot be enabled when reshape is in progress\n"); + ret = 1; + goto free_info; + } + } + + if (subarray) { + char *update; + + if (s->consistency_policy == CONSISTENCY_POLICY_PPL) + update = "ppl"; + else + update = "no-ppl"; + + sprintf(container_dev, "/dev/%s", st->container_devnm); + + ret = Update_subarray(container_dev, subarray, update, NULL, + c->verbose); + if (ret) + goto free_info; + } + + if (s->consistency_policy == CONSISTENCY_POLICY_PPL) { + struct mdinfo info; + + if (subarray) { + struct mdinfo *mdi; + int cfd; + + cfd = open(container_dev, O_RDWR|O_EXCL); + if (cfd < 0) { + pr_err("Failed to open %s\n", container_dev); + ret = 1; + goto free_info; + } + + ret = st->ss->load_container(st, cfd, st->container_devnm); + close(cfd); + + if (ret) { + pr_err("Cannot read superblock for %s\n", + container_dev); + goto free_info; + } + + mdi = st->ss->container_content(st, subarray); + info = *mdi; + free(mdi); + } + + for (sd = sra->devs; sd; sd = sd->next) { + int dfd; + char *devpath; + + devpath = map_dev(sd->disk.major, sd->disk.minor, 0); + dfd = dev_open(devpath, O_RDWR); + if (dfd < 0) { + pr_err("Failed to open %s\n", devpath); + ret = 1; + goto free_info; + } + + if (!subarray) { + ret = st->ss->load_super(st, dfd, NULL); + if (ret) { + pr_err("Failed to load super-block.\n"); + close(dfd); + goto free_info; + } + + ret = st->ss->update_super(st, sra, "ppl", + devname, + c->verbose, 0, NULL); + if (ret) { + close(dfd); + st->ss->free_super(st); + goto free_info; + } + st->ss->getinfo_super(st, &info, NULL); + } + + ret |= 
sysfs_set_num(sra, sd, "ppl_sector", + info.ppl_sector); + ret |= sysfs_set_num(sra, sd, "ppl_size", + info.ppl_size); + + if (ret) { + pr_err("Failed to set PPL attributes for %s\n", + sd->sys_name); + close(dfd); + st->ss->free_super(st); + goto free_info; + } + + ret = st->ss->write_init_ppl(st, &info, dfd); + if (ret) + pr_err("Failed to write PPL\n"); + + close(dfd); + + if (!subarray) + st->ss->free_super(st); + + if (ret) + goto free_info; + } + } + + ret = sysfs_set_str(sra, NULL, "consistency_policy", + map_num(consistency_policies, + s->consistency_policy)); + if (ret) + pr_err("Failed to change array consistency policy\n"); + +free_info: + sysfs_free(sra); +free_st: + free(st); + free(subarray); + + return ret; +} + +/* + * When reshaping an array we might need to backup some data. + * This is written to all spares with a 'super_block' describing it. + * The superblock goes 4K from the end of the used space on the + * device. + * It if written after the backup is complete. + * It has the following structure. + */ + +static struct mdp_backup_super { + char magic[16]; /* md_backup_data-1 or -2 */ + __u8 set_uuid[16]; + __u64 mtime; + /* start/sizes in 512byte sectors */ + __u64 devstart; /* address on backup device/file of data */ + __u64 arraystart; + __u64 length; + __u32 sb_csum; /* csum of preceeding bytes. */ + __u32 pad1; + __u64 devstart2; /* offset in to data of second section */ + __u64 arraystart2; + __u64 length2; + __u32 sb_csum2; /* csum of preceeding bytes. */ + __u8 pad[512-68-32]; +} __attribute__((aligned(512))) bsb, bsb2; + +static __u32 bsb_csum(char *buf, int len) +{ + int i; + int csum = 0; + for (i = 0; i < len; i++) + csum = (csum<<3) + buf[0]; + return __cpu_to_le32(csum); +} + +static int check_idle(struct supertype *st) +{ + /* Check that all member arrays for this container, or the + * container of this array, are idle + */ + char *container = (st->container_devnm[0] + ? 
st->container_devnm : st->devnm); + struct mdstat_ent *ent, *e; + int is_idle = 1; + + ent = mdstat_read(0, 0); + for (e = ent ; e; e = e->next) { + if (!is_container_member(e, container)) + continue; + /* frozen array is not idle*/ + if (e->percent >= 0 || e->metadata_version[9] == '-') { + is_idle = 0; + break; + } + } + free_mdstat(ent); + return is_idle; +} + +static int freeze_container(struct supertype *st) +{ + char *container = (st->container_devnm[0] + ? st->container_devnm : st->devnm); + + if (!check_idle(st)) + return -1; + + if (block_monitor(container, 1)) { + pr_err("failed to freeze container\n"); + return -2; + } + + return 1; +} + +static void unfreeze_container(struct supertype *st) +{ + char *container = (st->container_devnm[0] + ? st->container_devnm : st->devnm); + + unblock_monitor(container, 1); +} + +static int freeze(struct supertype *st) +{ + /* Try to freeze resync/rebuild on this array/container. + * Return -1 if the array is busy, + * return -2 container cannot be frozen, + * return 0 if this kernel doesn't support 'frozen' + * return 1 if it worked. 
+ */ + if (st->ss->external) + return freeze_container(st); + else { + struct mdinfo *sra = sysfs_read(-1, st->devnm, GET_VERSION); + int err; + char buf[20]; + + if (!sra) + return -1; + /* Need to clear any 'read-auto' status */ + if (sysfs_get_str(sra, NULL, "array_state", buf, 20) > 0 && + strncmp(buf, "read-auto", 9) == 0) + sysfs_set_str(sra, NULL, "array_state", "clean"); + + err = sysfs_freeze_array(sra); + sysfs_free(sra); + return err; + } +} + +static void unfreeze(struct supertype *st) +{ + if (st->ss->external) + return unfreeze_container(st); + else { + struct mdinfo *sra = sysfs_read(-1, st->devnm, GET_VERSION); + char buf[20]; + + if (sra && + sysfs_get_str(sra, NULL, "sync_action", buf, 20) > 0 && + strcmp(buf, "frozen\n") == 0) + sysfs_set_str(sra, NULL, "sync_action", "idle"); + sysfs_free(sra); + } +} + +static void wait_reshape(struct mdinfo *sra) +{ + int fd = sysfs_get_fd(sra, NULL, "sync_action"); + char action[20]; + + if (fd < 0) + return; + + while (sysfs_fd_get_str(fd, action, 20) > 0 && + strncmp(action, "reshape", 7) == 0) + sysfs_wait(fd, NULL); + close(fd); +} + +static int reshape_super(struct supertype *st, unsigned long long size, + int level, int layout, int chunksize, int raid_disks, + int delta_disks, char *backup_file, char *dev, + int direction, int verbose) +{ + /* nothing extra to check in the native case */ + if (!st->ss->external) + return 0; + if (!st->ss->reshape_super || !st->ss->manage_reshape) { + pr_err("%s metadata does not support reshape\n", + st->ss->name); + return 1; + } + + return st->ss->reshape_super(st, size, level, layout, chunksize, + raid_disks, delta_disks, backup_file, dev, + direction, verbose); +} + +static void sync_metadata(struct supertype *st) +{ + if (st->ss->external) { + if (st->update_tail) { + flush_metadata_updates(st); + st->update_tail = &st->updates; + } else + st->ss->sync_metadata(st); + } +} + +static int subarray_set_num(char *container, struct mdinfo *sra, char *name, int n) +{ + 
/* when dealing with external metadata subarrays we need to be + * prepared to handle EAGAIN. The kernel may need to wait for + * mdmon to mark the array active so the kernel can handle + * allocations/writeback when preparing the reshape action + * (md_allow_write()). We temporarily disable safe_mode_delay + * to close a race with the array_state going clean before the + * next write to raid_disks / stripe_cache_size + */ + char safe[50]; + int rc; + + /* only 'raid_disks' and 'stripe_cache_size' trigger md_allow_write */ + if (!container || + (strcmp(name, "raid_disks") != 0 && + strcmp(name, "stripe_cache_size") != 0)) + return sysfs_set_num(sra, NULL, name, n); + + rc = sysfs_get_str(sra, NULL, "safe_mode_delay", safe, sizeof(safe)); + if (rc <= 0) + return -1; + sysfs_set_num(sra, NULL, "safe_mode_delay", 0); + rc = sysfs_set_num(sra, NULL, name, n); + if (rc < 0 && errno == EAGAIN) { + ping_monitor(container); + /* if we get EAGAIN here then the monitor is not active + * so stop trying + */ + rc = sysfs_set_num(sra, NULL, name, n); + } + sysfs_set_str(sra, NULL, "safe_mode_delay", safe); + return rc; +} + +int start_reshape(struct mdinfo *sra, int already_running, + int before_data_disks, int data_disks, struct supertype *st) +{ + int err; + unsigned long long sync_max_to_set; + + sysfs_set_num(sra, NULL, "suspend_lo", 0x7FFFFFFFFFFFFFFFULL); + err = sysfs_set_num(sra, NULL, "suspend_hi", sra->reshape_progress); + err = err ?: sysfs_set_num(sra, NULL, "suspend_lo", + sra->reshape_progress); + if (before_data_disks <= data_disks) + sync_max_to_set = sra->reshape_progress / data_disks; + else + sync_max_to_set = (sra->component_size * data_disks + - sra->reshape_progress) / data_disks; + + if (!already_running) + sysfs_set_num(sra, NULL, "sync_min", sync_max_to_set); + + if (st->ss->external) + err = err ?: sysfs_set_num(sra, NULL, "sync_max", sync_max_to_set); + else + err = err ?: sysfs_set_str(sra, NULL, "sync_max", "max"); + + if (!already_running && err == 
0) { + int cnt = 5; + do { + err = sysfs_set_str(sra, NULL, "sync_action", + "reshape"); + if (err) + sleep(1); + } while (err && errno == EBUSY && cnt-- > 0); + } + return err; +} + +void abort_reshape(struct mdinfo *sra) +{ + sysfs_set_str(sra, NULL, "sync_action", "idle"); + /* + * Prior to kernel commit: 23ddff3792f6 ("md: allow suspend_lo and + * suspend_hi to decrease as well as increase.") + * you could only increase suspend_{lo,hi} unless the region they + * covered was empty. So to reset to 0, you need to push suspend_lo + * up past suspend_hi first. So to maximize the chance of mdadm + * working on all kernels, we want to keep doing that. + */ + sysfs_set_num(sra, NULL, "suspend_lo", 0x7FFFFFFFFFFFFFFFULL); + sysfs_set_num(sra, NULL, "suspend_hi", 0); + sysfs_set_num(sra, NULL, "suspend_lo", 0); + sysfs_set_num(sra, NULL, "sync_min", 0); + // It isn't safe to reset sync_max as we aren't monitoring. + // Array really should be stopped at this point. +} + +int remove_disks_for_takeover(struct supertype *st, + struct mdinfo *sra, + int layout) +{ + int nr_of_copies; + struct mdinfo *remaining; + int slot; + + if (st->ss->external) { + int rv = 0; + struct mdinfo *arrays = st->ss->container_content(st, NULL); + /* + * containter_content returns list of arrays in container + * If arrays->next is not NULL it means that there are + * 2 arrays in container and operation should be blocked + */ + if (arrays) { + if (arrays->next) + rv = 1; + sysfs_free(arrays); + if (rv) { + pr_err("Error. Cannot perform operation on /dev/%s\n", st->devnm); + pr_err("For this operation it MUST be single array in container\n"); + return rv; + } + } + } + + if (sra->array.level == 10) + nr_of_copies = layout & 0xff; + else if (sra->array.level == 1) + nr_of_copies = sra->array.raid_disks; + else + return 1; + + remaining = sra->devs; + sra->devs = NULL; + /* for each 'copy', select one device and remove from the list. 
*/ + for (slot = 0; slot < sra->array.raid_disks; slot += nr_of_copies) { + struct mdinfo **diskp; + int found = 0; + + /* Find a working device to keep */ + for (diskp = &remaining; *diskp ; diskp = &(*diskp)->next) { + struct mdinfo *disk = *diskp; + + if (disk->disk.raid_disk < slot) + continue; + if (disk->disk.raid_disk >= slot + nr_of_copies) + continue; + if (disk->disk.state & (1<<MD_DISK_REMOVED)) + continue; + if (disk->disk.state & (1<<MD_DISK_FAULTY)) + continue; + if (!(disk->disk.state & (1<<MD_DISK_SYNC))) + continue; + + /* We have found a good disk to use! */ + *diskp = disk->next; + disk->next = sra->devs; + sra->devs = disk; + found = 1; + break; + } + if (!found) + break; + } + + if (slot < sra->array.raid_disks) { + /* didn't find all slots */ + struct mdinfo **e; + e = &remaining; + while (*e) + e = &(*e)->next; + *e = sra->devs; + sra->devs = remaining; + return 1; + } + + /* Remove all 'remaining' devices from the array */ + while (remaining) { + struct mdinfo *sd = remaining; + remaining = sd->next; + + sysfs_set_str(sra, sd, "state", "faulty"); + sysfs_set_str(sra, sd, "slot", "none"); + /* for external metadata disks should be removed in mdmon */ + if (!st->ss->external) + sysfs_set_str(sra, sd, "state", "remove"); + sd->disk.state |= (1<<MD_DISK_REMOVED); + sd->disk.state &= ~(1<<MD_DISK_SYNC); + sd->next = sra->devs; + sra->devs = sd; + } + return 0; +} + +void reshape_free_fdlist(int *fdlist, + unsigned long long *offsets, + int size) +{ + int i; + + for (i = 0; i < size; i++) + if (fdlist[i] >= 0) + close(fdlist[i]); + + free(fdlist); + free(offsets); +} + +int reshape_prepare_fdlist(char *devname, + struct mdinfo *sra, + int raid_disks, + int nrdisks, + unsigned long blocks, + char *backup_file, + int *fdlist, + unsigned long long *offsets) +{ + int d = 0; + struct mdinfo *sd; + + enable_fds(nrdisks); + for (d = 0; d <= nrdisks; d++) + fdlist[d] = -1; + d = raid_disks; + for (sd = sra->devs; sd; sd = sd->next) { + if (sd->disk.state 
& (1<<MD_DISK_FAULTY)) + continue; + if (sd->disk.state & (1<<MD_DISK_SYNC) && + sd->disk.raid_disk < raid_disks) { + char *dn = map_dev(sd->disk.major, sd->disk.minor, 1); + fdlist[sd->disk.raid_disk] = dev_open(dn, O_RDONLY); + offsets[sd->disk.raid_disk] = sd->data_offset*512; + if (fdlist[sd->disk.raid_disk] < 0) { + pr_err("%s: cannot open component %s\n", + devname, dn ? dn : "-unknown-"); + d = -1; + goto release; + } + } else if (backup_file == NULL) { + /* spare */ + char *dn = map_dev(sd->disk.major, sd->disk.minor, 1); + fdlist[d] = dev_open(dn, O_RDWR); + offsets[d] = (sd->data_offset + sra->component_size - blocks - 8)*512; + if (fdlist[d] < 0) { + pr_err("%s: cannot open component %s\n", + devname, dn ? dn : "-unknown-"); + d = -1; + goto release; + } + d++; + } + } +release: + return d; +} + +int reshape_open_backup_file(char *backup_file, + int fd, + char *devname, + long blocks, + int *fdlist, + unsigned long long *offsets, + char *sys_name, + int restart) +{ + /* Return 1 on success, 0 on any form of failure */ + /* need to check backup file is large enough */ + char buf[512]; + struct stat stb; + unsigned int dev; + int i; + + *fdlist = open(backup_file, O_RDWR|O_CREAT|(restart ? O_TRUNC : O_EXCL), + S_IRUSR | S_IWUSR); + *offsets = 8 * 512; + if (*fdlist < 0) { + pr_err("%s: cannot create backup file %s: %s\n", + devname, backup_file, strerror(errno)); + return 0; + } + /* Guard against backup file being on array device. + * If array is partitioned or if LVM etc is in the + * way this will not notice, but it is better than + * nothing. 
+ */ + fstat(*fdlist, &stb); + dev = stb.st_dev; + fstat(fd, &stb); + if (stb.st_rdev == dev) { + pr_err("backup file must NOT be on the array being reshaped.\n"); + close(*fdlist); + return 0; + } + + memset(buf, 0, 512); + for (i=0; i < blocks + 8 ; i++) { + if (write(*fdlist, buf, 512) != 512) { + pr_err("%s: cannot create backup file %s: %s\n", + devname, backup_file, strerror(errno)); + return 0; + } + } + if (fsync(*fdlist) != 0) { + pr_err("%s: cannot create backup file %s: %s\n", + devname, backup_file, strerror(errno)); + return 0; + } + + if (!restart && strncmp(backup_file, MAP_DIR, strlen(MAP_DIR)) != 0) { + char *bu = make_backup(sys_name); + if (symlink(backup_file, bu)) + pr_err("Recording backup file in " MAP_DIR " failed: %s\n", + strerror(errno)); + free(bu); + } + + return 1; +} + +unsigned long compute_backup_blocks(int nchunk, int ochunk, + unsigned int ndata, unsigned int odata) +{ + unsigned long a, b, blocks; + /* So how much do we need to backup. + * We need an amount of data which is both a whole number of + * old stripes and a whole number of new stripes. + * So LCM for (chunksize*datadisks). + */ + a = (ochunk/512) * odata; + b = (nchunk/512) * ndata; + /* Find GCD */ + a = GCD(a, b); + /* LCM == product / GCD */ + blocks = (unsigned long)(ochunk/512) * (unsigned long)(nchunk/512) * + odata * ndata / a; + + return blocks; +} + +char *analyse_change(char *devname, struct mdinfo *info, struct reshape *re) +{ + /* Based on the current array state in info->array and + * the changes in info->new_* etc, determine: + * - whether the change is possible + * - Intermediate level/raid_disks/layout + * - whether a restriping reshape is needed + * - number of sectors in minimum change unit. This + * will cover a whole number of stripes in 'before' and + * 'after'. 
+ * + * Return message if the change should be rejected + * NULL if the change can be achieved + * + * This can be called as part of starting a reshape, or + * when assembling an array that is undergoing reshape. + */ + int near, far, offset, copies; + int new_disks; + int old_chunk, new_chunk; + /* delta_parity records change in number of devices + * caused by level change + */ + int delta_parity = 0; + + memset(re, 0, sizeof(*re)); + + /* If a new level not explicitly given, we assume no-change */ + if (info->new_level == UnSet) + info->new_level = info->array.level; + + if (info->new_chunk) + switch (info->new_level) { + case 0: + case 4: + case 5: + case 6: + case 10: + /* chunk size is meaningful, must divide component_size + * evenly + */ + if (info->component_size % (info->new_chunk/512)) { + unsigned long long shrink = info->component_size; + shrink &= ~(unsigned long long)(info->new_chunk/512-1); + pr_err("New chunk size (%dK) does not evenly divide device size (%lluk)\n", + info->new_chunk/1024, info->component_size/2); + pr_err("After shrinking any filesystem, \"mdadm --grow %s --size %llu\"\n", + devname, shrink/2); + pr_err("will shrink the array so the given chunk size would work.\n"); + return ""; + } + break; + default: + return "chunk size not meaningful for this level"; + } + else + info->new_chunk = info->array.chunk_size; + + switch (info->array.level) { + default: + return "No reshape is possibly for this RAID level"; + case LEVEL_LINEAR: + if (info->delta_disks != UnSet) + return "Only --add is supported for LINEAR, setting --raid-disks is not needed"; + else + return "Only --add is supported for LINEAR, other --grow options are not meaningful"; + case 1: + /* RAID1 can convert to RAID1 with different disks, or + * raid5 with 2 disks, or + * raid0 with 1 disk + */ + if (info->new_level > 1 && (info->component_size & 7)) + return "Cannot convert RAID1 of this size - reduce size to multiple of 4K first."; + if (info->new_level == 0) { + if 
(info->delta_disks != UnSet && + info->delta_disks != 0) + return "Cannot change number of disks with RAID1->RAID0 conversion"; + re->level = 0; + re->before.data_disks = 1; + re->after.data_disks = 1; + return NULL; + } + if (info->new_level == 1) { + if (info->delta_disks == UnSet) + /* Don't know what to do */ + return "no change requested for Growing RAID1"; + re->level = 1; + return NULL; + } + if (info->array.raid_disks != 2 && info->new_level == 5) + return "Can only convert a 2-device array to RAID5"; + if (info->array.raid_disks == 2 && info->new_level == 5) { + re->level = 5; + re->before.data_disks = 1; + if (info->delta_disks != UnSet && + info->delta_disks != 0) + re->after.data_disks = 1 + info->delta_disks; + else + re->after.data_disks = 1; + if (re->after.data_disks < 1) + return "Number of disks too small for RAID5"; + + re->before.layout = ALGORITHM_LEFT_SYMMETRIC; + info->array.chunk_size = 65536; + break; + } + /* Could do some multi-stage conversions, but leave that to + * later. + */ + return "Impossibly level change request for RAID1"; + + case 10: + /* RAID10 can be converted from near mode to + * RAID0 by removing some devices. + * It can also be reshaped if the kernel supports + * new_data_offset. 
+ */ + switch (info->new_level) { + case 0: + if ((info->array.layout & ~0xff) != 0x100) + return "Cannot Grow RAID10 with far/offset layout"; + /* + * number of devices must be multiple of + * number of copies + */ + if (info->array.raid_disks % + (info->array.layout & 0xff)) + return "RAID10 layout too complex for Grow operation"; + + new_disks = (info->array.raid_disks / + (info->array.layout & 0xff)); + if (info->delta_disks == UnSet) + info->delta_disks = (new_disks + - info->array.raid_disks); + + if (info->delta_disks != + new_disks - info->array.raid_disks) + return "New number of raid-devices impossible for RAID10"; + if (info->new_chunk && + info->new_chunk != info->array.chunk_size) + return "Cannot change chunk-size with RAID10 Grow"; + + /* looks good */ + re->level = 0; + re->before.data_disks = new_disks; + re->after.data_disks = re->before.data_disks; + return NULL; + + case 10: + near = info->array.layout & 0xff; + far = (info->array.layout >> 8) & 0xff; + offset = info->array.layout & 0x10000; + if (far > 1 && !offset) + return "Cannot reshape RAID10 in far-mode"; + copies = near * far; + + old_chunk = info->array.chunk_size * far; + + if (info->new_layout == UnSet) + info->new_layout = info->array.layout; + else { + near = info->new_layout & 0xff; + far = (info->new_layout >> 8) & 0xff; + offset = info->new_layout & 0x10000; + if (far > 1 && !offset) + return "Cannot reshape RAID10 to far-mode"; + if (near * far != copies) + return "Cannot change number of copies when reshaping RAID10"; + } + if (info->delta_disks == UnSet) + info->delta_disks = 0; + new_disks = (info->array.raid_disks + + info->delta_disks); + + new_chunk = info->new_chunk * far; + + re->level = 10; + re->before.layout = info->array.layout; + re->before.data_disks = info->array.raid_disks; + re->after.layout = info->new_layout; + re->after.data_disks = new_disks; + /* For RAID10 we don't do backup but do allow reshape, + * so set backup_blocks to INVALID_SECTORS rather than + * 
zero. + * And there is no need to synchronise stripes on both + * 'old' and 'new'. So the important + * number is the minimum data_offset difference + * which is the larger of (offset copies * chunk). + */ + re->backup_blocks = INVALID_SECTORS; + re->min_offset_change = max(old_chunk, new_chunk) / 512; + if (new_disks < re->before.data_disks && + info->space_after < re->min_offset_change) + /* Reduce component size by one chunk */ + re->new_size = (info->component_size - + re->min_offset_change); + else + re->new_size = info->component_size; + re->new_size = re->new_size * new_disks / copies; + return NULL; + + default: + return "RAID10 can only be changed to RAID0"; + } + case 0: + /* RAID0 can be converted to RAID10, or to RAID456 */ + if (info->new_level == 10) { + if (info->new_layout == UnSet && + info->delta_disks == UnSet) { + /* Assume near=2 layout */ + info->new_layout = 0x102; + info->delta_disks = info->array.raid_disks; + } + if (info->new_layout == UnSet) { + int copies = 1 + (info->delta_disks + / info->array.raid_disks); + if (info->array.raid_disks * (copies-1) != + info->delta_disks) + return "Impossible number of devices for RAID0->RAID10"; + info->new_layout = 0x100 + copies; + } + if (info->delta_disks == UnSet) { + int copies = info->new_layout & 0xff; + if (info->new_layout != 0x100 + copies) + return "New layout impossible for RAID0->RAID10";; + info->delta_disks = (copies - 1) * + info->array.raid_disks; + } + if (info->new_chunk && + info->new_chunk != info->array.chunk_size) + return "Cannot change chunk-size with RAID0->RAID10"; + /* looks good */ + re->level = 10; + re->before.data_disks = (info->array.raid_disks + + info->delta_disks); + re->after.data_disks = re->before.data_disks; + re->before.layout = info->new_layout; + return NULL; + } + + /* RAID0 can also covert to RAID0/4/5/6 by first converting to + * a raid4 style layout of the final level. 
+ */ + switch (info->new_level) { + case 4: + delta_parity = 1; + case 0: + re->level = 4; + re->before.layout = 0; + break; + case 5: + delta_parity = 1; + re->level = 5; + re->before.layout = ALGORITHM_PARITY_N; + if (info->new_layout == UnSet) + info->new_layout = map_name(r5layout, "default"); + break; + case 6: + delta_parity = 2; + re->level = 6; + re->before.layout = ALGORITHM_PARITY_N; + if (info->new_layout == UnSet) + info->new_layout = map_name(r6layout, "default"); + break; + default: + return "Impossible level change requested"; + } + re->before.data_disks = info->array.raid_disks; + /* determining 'after' layout happens outside this 'switch' */ + break; + + case 4: + info->array.layout = ALGORITHM_PARITY_N; + case 5: + switch (info->new_level) { + case 0: + delta_parity = -1; + case 4: + re->level = info->array.level; + re->before.data_disks = info->array.raid_disks - 1; + re->before.layout = info->array.layout; + break; + case 5: + re->level = 5; + re->before.data_disks = info->array.raid_disks - 1; + re->before.layout = info->array.layout; + break; + case 6: + delta_parity = 1; + re->level = 6; + re->before.data_disks = info->array.raid_disks - 1; + switch (info->array.layout) { + case ALGORITHM_LEFT_ASYMMETRIC: + re->before.layout = ALGORITHM_LEFT_ASYMMETRIC_6; + break; + case ALGORITHM_RIGHT_ASYMMETRIC: + re->before.layout = ALGORITHM_RIGHT_ASYMMETRIC_6; + break; + case ALGORITHM_LEFT_SYMMETRIC: + re->before.layout = ALGORITHM_LEFT_SYMMETRIC_6; + break; + case ALGORITHM_RIGHT_SYMMETRIC: + re->before.layout = ALGORITHM_RIGHT_SYMMETRIC_6; + break; + case ALGORITHM_PARITY_0: + re->before.layout = ALGORITHM_PARITY_0_6; + break; + case ALGORITHM_PARITY_N: + re->before.layout = ALGORITHM_PARITY_N_6; + break; + default: + return "Cannot convert an array with this layout"; + } + break; + case 1: + if (info->array.raid_disks != 2) + return "Can only convert a 2-device array to RAID1"; + if (info->delta_disks != UnSet && + info->delta_disks != 0) + return 
"Cannot set raid_disk when converting RAID5->RAID1"; + re->level = 1; + info->new_chunk = 0; + return NULL; + default: + return "Impossible level change requested"; + } + break; + case 6: + switch (info->new_level) { + case 4: + case 5: + delta_parity = -1; + case 6: + re->level = 6; + re->before.data_disks = info->array.raid_disks - 2; + re->before.layout = info->array.layout; + break; + default: + return "Impossible level change requested"; + } + break; + } + + /* If we reached here then it looks like a re-stripe is + * happening. We have determined the intermediate level + * and initial raid_disks/layout and stored these in 're'. + * + * We need to deduce the final layout that can be atomically + * converted to the end state. + */ + switch (info->new_level) { + case 0: + /* We can only get to RAID0 from RAID4 or RAID5 + * with appropriate layout and one extra device + */ + if (re->level != 4 && re->level != 5) + return "Cannot covert to RAID0 from this level"; + + switch (re->level) { + case 4: + re->before.layout = 0; + re->after.layout = 0; + break; + case 5: + re->after.layout = ALGORITHM_PARITY_N; + break; + } + break; + + case 4: + /* We can only get to RAID4 from RAID5 */ + if (re->level != 4 && re->level != 5) + return "Cannot convert to RAID4 from this level"; + + switch (re->level) { + case 4: + re->after.layout = 0; + break; + case 5: + re->after.layout = ALGORITHM_PARITY_N; + break; + } + break; + + case 5: + /* We get to RAID5 from RAID5 or RAID6 */ + if (re->level != 5 && re->level != 6) + return "Cannot convert to RAID5 from this level"; + + switch (re->level) { + case 5: + if (info->new_layout == UnSet) + re->after.layout = re->before.layout; + else + re->after.layout = info->new_layout; + break; + case 6: + if (info->new_layout == UnSet) + info->new_layout = re->before.layout; + + /* after.layout needs to be raid6 version of new_layout */ + if (info->new_layout == ALGORITHM_PARITY_N) + re->after.layout = ALGORITHM_PARITY_N; + else { + char 
layout[40]; + char *ls = map_num(r5layout, info->new_layout); + int l; + if (ls) { + /* Current RAID6 layout has a RAID5 + * equivalent - good + */ + strcat(strcpy(layout, ls), "-6"); + l = map_name(r6layout, layout); + if (l == UnSet) + return "Cannot find RAID6 layout to convert to"; + } else { + /* Current RAID6 has no equivalent. + * If it is already a '-6' layout we + * can leave it unchanged, else we must + * fail + */ + ls = map_num(r6layout, + info->new_layout); + if (!ls || + strcmp(ls+strlen(ls)-2, "-6") != 0) + return "Please specify new layout"; + l = info->new_layout; + } + re->after.layout = l; + } + } + break; + + case 6: + /* We must already be at level 6 */ + if (re->level != 6) + return "Impossible level change"; + if (info->new_layout == UnSet) + re->after.layout = info->array.layout; + else + re->after.layout = info->new_layout; + break; + default: + return "Impossible level change requested"; + } + if (info->delta_disks == UnSet) + info->delta_disks = delta_parity; + + re->after.data_disks = + (re->before.data_disks + info->delta_disks - delta_parity); + + switch (re->level) { + case 6: + re->parity = 2; + break; + case 4: + case 5: + re->parity = 1; + break; + default: + re->parity = 0; + break; + } + /* So we have a restripe operation, we need to calculate the number + * of blocks per reshape operation. + */ + re->new_size = info->component_size * re->before.data_disks; + if (info->new_chunk == 0) + info->new_chunk = info->array.chunk_size; + if (re->after.data_disks == re->before.data_disks && + re->after.layout == re->before.layout && + info->new_chunk == info->array.chunk_size) { + /* Nothing to change, can change level immediately. 
*/ + re->level = info->new_level; + re->backup_blocks = 0; + return NULL; + } + if (re->after.data_disks == 1 && re->before.data_disks == 1) { + /* chunk and layout changes make no difference */ + re->level = info->new_level; + re->backup_blocks = 0; + return NULL; + } + + if (re->after.data_disks == re->before.data_disks && + get_linux_version() < 2006032) + return "in-place reshape is not safe before 2.6.32 - sorry."; + + if (re->after.data_disks < re->before.data_disks && + get_linux_version() < 2006030) + return "reshape to fewer devices is not supported before 2.6.30 - sorry."; + + re->backup_blocks = compute_backup_blocks( + info->new_chunk, info->array.chunk_size, + re->after.data_disks, re->before.data_disks); + re->min_offset_change = re->backup_blocks / re->before.data_disks; + + re->new_size = info->component_size * re->after.data_disks; + return NULL; +} + +static int set_array_size(struct supertype *st, struct mdinfo *sra, + char *text_version) +{ + struct mdinfo *info; + char *subarray; + int ret_val = -1; + + if ((st == NULL) || (sra == NULL)) + return ret_val; + + if (text_version == NULL) + text_version = sra->text_version; + subarray = strchr(text_version + 1, '/')+1; + info = st->ss->container_content(st, subarray); + if (info) { + unsigned long long current_size = 0; + unsigned long long new_size = info->custom_array_size/2; + + if (sysfs_get_ll(sra, NULL, "array_size", ¤t_size) == 0 && + new_size > current_size) { + if (sysfs_set_num(sra, NULL, "array_size", new_size) + < 0) + dprintf("Error: Cannot set array size"); + else { + ret_val = 0; + dprintf("Array size changed"); + } + dprintf_cont(" from %llu to %llu.\n", + current_size, new_size); + } + sysfs_free(info); + } else + dprintf("Error: set_array_size(): info pointer in NULL\n"); + + return ret_val; +} + +static int reshape_array(char *container, int fd, char *devname, + struct supertype *st, struct mdinfo *info, + int force, struct mddev_dev *devlist, + unsigned long long data_offset, + 
char *backup_file, int verbose, int forked, + int restart, int freeze_reshape); +static int reshape_container(char *container, char *devname, + int mdfd, + struct supertype *st, + struct mdinfo *info, + int force, + char *backup_file, int verbose, + int forked, int restart, int freeze_reshape); + +int Grow_reshape(char *devname, int fd, + struct mddev_dev *devlist, + unsigned long long data_offset, + struct context *c, struct shape *s) +{ + /* Make some changes in the shape of an array. + * The kernel must support the change. + * + * There are three different changes. Each can trigger + * a resync or recovery so we freeze that until we have + * requested everything (if kernel supports freezing - 2.6.30). + * The steps are: + * - change size (i.e. component_size) + * - change level + * - change layout/chunksize/ndisks + * + * The last can require a reshape. It is different on different + * levels so we need to check the level before actioning it. + * Some times the level change needs to be requested after the + * reshape (e.g. 
raid6->raid5, raid5->raid0) + * + */ + struct mdu_array_info_s array; + int rv = 0; + struct supertype *st; + char *subarray = NULL; + + int frozen; + int changed = 0; + char *container = NULL; + int cfd = -1; + + struct mddev_dev *dv; + int added_disks; + + struct mdinfo info; + struct mdinfo *sra; + + if (md_get_array_info(fd, &array) < 0) { + pr_err("%s is not an active md array - aborting\n", + devname); + return 1; + } + if (s->level != UnSet && s->chunk) { + pr_err("Cannot change array level in the same operation as changing chunk size.\n"); + return 1; + } + + if (data_offset != INVALID_SECTORS && array.level != 10 && + (array.level < 4 || array.level > 6)) { + pr_err("--grow --data-offset not yet supported\n"); + return 1; + } + + if (s->size > 0 && + (s->chunk || s->level!= UnSet || s->layout_str || s->raiddisks)) { + pr_err("cannot change component size at the same time as other changes.\n" + " Change size first, then check data is intact before making other changes.\n"); + return 1; + } + + if (s->raiddisks && s->raiddisks < array.raid_disks && + array.level > 1 && get_linux_version() < 2006032 && + !check_env("MDADM_FORCE_FEWER")) { + pr_err("reducing the number of devices is not safe before Linux 2.6.32\n" + " Please use a newer kernel\n"); + return 1; + } + + if (array.level > 1 && s->size > 1 && + (unsigned long long) (array.chunk_size / 1024) > s->size) { + pr_err("component size must be larger than chunk size.\n"); + return 1; + } + + st = super_by_fd(fd, &subarray); + if (!st) { + pr_err("Unable to determine metadata format for %s\n", devname); + return 1; + } + if (s->raiddisks > st->max_devs) { + pr_err("Cannot increase raid-disks on this array beyond %d\n", st->max_devs); + return 1; + } + if (s->level == 0 && (array.state & (1 << MD_SB_BITMAP_PRESENT)) && + !(array.state & (1 << MD_SB_CLUSTERED)) && !st->ss->external) { + array.state &= ~(1 << MD_SB_BITMAP_PRESENT); + if (md_set_array_info(fd, &array) != 0) { + pr_err("failed to remove 
internal bitmap.\n"); + return 1; + } + } + + /* in the external case we need to check that the requested reshape is + * supported, and perform an initial check that the container holds the + * pre-requisite spare devices (mdmon owns final validation) + */ + if (st->ss->external) { + int retval; + + if (subarray) { + container = st->container_devnm; + cfd = open_dev_excl(st->container_devnm); + } else { + container = st->devnm; + close(fd); + cfd = open_dev_excl(st->devnm); + fd = cfd; + } + if (cfd < 0) { + pr_err("Unable to open container for %s\n", devname); + free(subarray); + return 1; + } + + retval = st->ss->load_container(st, cfd, NULL); + + if (retval) { + pr_err("Cannot read superblock for %s\n", devname); + free(subarray); + return 1; + } + + /* check if operation is supported for metadata handler */ + if (st->ss->container_content) { + struct mdinfo *cc = NULL; + struct mdinfo *content = NULL; + + cc = st->ss->container_content(st, subarray); + for (content = cc; content ; content = content->next) { + int allow_reshape = 1; + + /* check if reshape is allowed based on metadata + * indications stored in content.array.status + */ + if (content->array.state & + (1 << MD_SB_BLOCK_VOLUME)) + allow_reshape = 0; + if (content->array.state & + (1 << MD_SB_BLOCK_CONTAINER_RESHAPE)) + allow_reshape = 0; + if (!allow_reshape) { + pr_err("cannot reshape arrays in container with unsupported metadata: %s(%s)\n", + devname, container); + sysfs_free(cc); + free(subarray); + return 1; + } + if (content->consistency_policy == + CONSISTENCY_POLICY_PPL) { + pr_err("Operation not supported when ppl consistency policy is enabled\n"); + sysfs_free(cc); + free(subarray); + return 1; + } + if (content->consistency_policy == + CONSISTENCY_POLICY_BITMAP) { + pr_err("Operation not supported when write-intent bitmap is enabled\n"); + sysfs_free(cc); + free(subarray); + return 1; + } + } + sysfs_free(cc); + } + if (mdmon_running(container)) + st->update_tail = &st->updates; + } + + 
added_disks = 0; + for (dv = devlist; dv; dv = dv->next) + added_disks++; + if (s->raiddisks > array.raid_disks && + array.spare_disks + added_disks < + (s->raiddisks - array.raid_disks) && + !c->force) { + pr_err("Need %d spare%s to avoid degraded array, and only have %d.\n" + " Use --force to over-ride this check.\n", + s->raiddisks - array.raid_disks, + s->raiddisks - array.raid_disks == 1 ? "" : "s", + array.spare_disks + added_disks); + return 1; + } + + sra = sysfs_read(fd, NULL, GET_LEVEL | GET_DISKS | GET_DEVS | + GET_STATE | GET_VERSION); + if (sra) { + if (st->ss->external && subarray == NULL) { + array.level = LEVEL_CONTAINER; + sra->array.level = LEVEL_CONTAINER; + } + } else { + pr_err("failed to read sysfs parameters for %s\n", + devname); + return 1; + } + frozen = freeze(st); + if (frozen < -1) { + /* freeze() already spewed the reason */ + sysfs_free(sra); + return 1; + } else if (frozen < 0) { + pr_err("%s is performing resync/recovery and cannot be reshaped\n", devname); + sysfs_free(sra); + return 1; + } + + /* ========= set size =============== */ + if (s->size > 0 && + (s->size == MAX_SIZE || s->size != (unsigned)array.size)) { + unsigned long long orig_size = get_component_size(fd)/2; + unsigned long long min_csize; + struct mdinfo *mdi; + int raid0_takeover = 0; + + if (orig_size == 0) + orig_size = (unsigned) array.size; + + if (orig_size == 0) { + pr_err("Cannot set device size in this type of array.\n"); + rv = 1; + goto release; + } + + if (reshape_super(st, s->size, UnSet, UnSet, 0, 0, UnSet, NULL, + devname, APPLY_METADATA_CHANGES, + c->verbose > 0)) { + rv = 1; + goto release; + } + sync_metadata(st); + if (st->ss->external) { + /* metadata can have size limitation + * update size value according to metadata information + */ + struct mdinfo *sizeinfo = + st->ss->container_content(st, subarray); + if (sizeinfo) { + unsigned long long new_size = + sizeinfo->custom_array_size/2; + int data_disks = get_data_disks( + sizeinfo->array.level, 
+ sizeinfo->array.layout, + sizeinfo->array.raid_disks); + new_size /= data_disks; + dprintf("Metadata size correction from %llu to %llu (%llu)\n", + orig_size, new_size, + new_size * data_disks); + s->size = new_size; + sysfs_free(sizeinfo); + } + } + + /* Update the size of each member device in case + * they have been resized. This will never reduce + * below the current used-size. The "size" attribute + * understands '0' to mean 'max'. + */ + min_csize = 0; + for (mdi = sra->devs; mdi; mdi = mdi->next) { + sysfs_set_num(sra, mdi, "size", + s->size == MAX_SIZE ? 0 : s->size); + if (array.not_persistent == 0 && + array.major_version == 0 && + get_linux_version() < 3001000) { + /* Dangerous to allow size to exceed 2TB */ + unsigned long long csize; + if (sysfs_get_ll(sra, mdi, "size", + &csize) == 0) { + if (csize >= 2ULL*1024*1024*1024) + csize = 2ULL*1024*1024*1024; + if ((min_csize == 0 || + (min_csize > csize))) + min_csize = csize; + } + } + } + if (min_csize && s->size > min_csize) { + pr_err("Cannot safely make this array use more than 2TB per device on this kernel.\n"); + rv = 1; + goto size_change_error; + } + if (min_csize && s->size == MAX_SIZE) { + /* Don't let the kernel choose a size - it will get + * it wrong + */ + pr_err("Limited v0.90 array to 2TB per device\n"); + s->size = min_csize; + } + if (st->ss->external) { + if (sra->array.level == 0) { + rv = sysfs_set_str(sra, NULL, "level", "raid5"); + if (!rv) { + raid0_takeover = 1; + /* get array parameters after takeover + * to change one parameter at time only + */ + rv = md_get_array_info(fd, &array); + } + } + /* make sure mdmon is + * aware of the new level */ + if (!mdmon_running(st->container_devnm)) + start_mdmon(st->container_devnm); + ping_monitor(container); + if (mdmon_running(st->container_devnm) && + st->update_tail == NULL) + st->update_tail = &st->updates; + } + + if (s->size == MAX_SIZE) + s->size = 0; + array.size = s->size; + if (s->size & ~INT32_MAX) { + /* got truncated to 
32bit, write to + * component_size instead + */ + if (sra) + rv = sysfs_set_num(sra, NULL, + "component_size", s->size); + else + rv = -1; + } else { + rv = md_set_array_info(fd, &array); + + /* manage array size when it is managed externally + */ + if ((rv == 0) && st->ss->external) + rv = set_array_size(st, sra, sra->text_version); + } + + if (raid0_takeover) { + /* do not recync non-existing parity, + * we will drop it anyway + */ + sysfs_set_str(sra, NULL, "sync_action", "frozen"); + /* go back to raid0, drop parity disk + */ + sysfs_set_str(sra, NULL, "level", "raid0"); + md_get_array_info(fd, &array); + } + +size_change_error: + if (rv != 0) { + int err = errno; + + /* restore metadata */ + if (reshape_super(st, orig_size, UnSet, UnSet, 0, 0, + UnSet, NULL, devname, + ROLLBACK_METADATA_CHANGES, + c->verbose) == 0) + sync_metadata(st); + pr_err("Cannot set device size for %s: %s\n", + devname, strerror(err)); + if (err == EBUSY && + (array.state & (1<<MD_SB_BITMAP_PRESENT))) + cont_err("Bitmap must be removed before size can be changed\n"); + rv = 1; + goto release; + } + if (s->assume_clean) { + /* This will fail on kernels older than 3.0 unless + * a backport has been arranged. 
+ */ + if (sra == NULL || + sysfs_set_str(sra, NULL, "resync_start", + "none") < 0) + pr_err("--assume-clean not supported with --grow on this kernel\n"); + } + md_get_array_info(fd, &array); + s->size = get_component_size(fd)/2; + if (s->size == 0) + s->size = array.size; + if (c->verbose >= 0) { + if (s->size == orig_size) + pr_err("component size of %s unchanged at %lluK\n", + devname, s->size); + else + pr_err("component size of %s has been set to %lluK\n", + devname, s->size); + } + changed = 1; + } else if (array.level != LEVEL_CONTAINER) { + s->size = get_component_size(fd)/2; + if (s->size == 0) + s->size = array.size; + } + + /* See if there is anything else to do */ + if ((s->level == UnSet || s->level == array.level) && + (s->layout_str == NULL) && + (s->chunk == 0 || s->chunk == array.chunk_size) && + data_offset == INVALID_SECTORS && + (s->raiddisks == 0 || s->raiddisks == array.raid_disks)) { + /* Nothing more to do */ + if (!changed && c->verbose >= 0) + pr_err("%s: no change requested\n", devname); + goto release; + } + + /* ========= check for Raid10/Raid1 -> Raid0 conversion =============== + * current implementation assumes that following conditions must be met: + * - RAID10: + * - far_copies == 1 + * - near_copies == 2 + */ + if ((s->level == 0 && array.level == 10 && sra && + array.layout == ((1 << 8) + 2) && !(array.raid_disks & 1)) || + (s->level == 0 && array.level == 1 && sra)) { + int err; + + err = remove_disks_for_takeover(st, sra, array.layout); + if (err) { + dprintf("Array cannot be reshaped\n"); + if (cfd > -1) + close(cfd); + rv = 1; + goto release; + } + /* Make sure mdmon has seen the device removal + * and updated metadata before we continue with + * level change + */ + if (container) + ping_monitor(container); + } + + memset(&info, 0, sizeof(info)); + info.array = array; + if (sysfs_init(&info, fd, NULL)) { + pr_err("failed to initialize sysfs.\n"); + rv = 1; + goto release; + } + strcpy(info.text_version, sra->text_version); + 
info.component_size = s->size*2; + info.new_level = s->level; + info.new_chunk = s->chunk * 1024; + if (info.array.level == LEVEL_CONTAINER) { + info.delta_disks = UnSet; + info.array.raid_disks = s->raiddisks; + } else if (s->raiddisks) + info.delta_disks = s->raiddisks - info.array.raid_disks; + else + info.delta_disks = UnSet; + if (s->layout_str == NULL) { + info.new_layout = UnSet; + if (info.array.level == 6 && + (info.new_level == 6 || info.new_level == UnSet) && + info.array.layout >= 16) { + pr_err("%s has a non-standard layout. If you wish to preserve this\n", devname); + cont_err("during the reshape, please specify --layout=preserve\n"); + cont_err("If you want to change it, specify a layout or use --layout=normalise\n"); + rv = 1; + goto release; + } + } else if (strcmp(s->layout_str, "normalise") == 0 || + strcmp(s->layout_str, "normalize") == 0) { + /* If we have a -6 RAID6 layout, remove the '-6'. */ + info.new_layout = UnSet; + if (info.array.level == 6 && info.new_level == UnSet) { + char l[40], *h; + strcpy(l, map_num(r6layout, info.array.layout)); + h = strrchr(l, '-'); + if (h && strcmp(h, "-6") == 0) { + *h = 0; + info.new_layout = map_name(r6layout, l); + } + } else { + pr_err("%s is only meaningful when reshaping a RAID6 array.\n", s->layout_str); + rv = 1; + goto release; + } + } else if (strcmp(s->layout_str, "preserve") == 0) { + /* This means that a non-standard RAID6 layout + * is OK. + * In particular: + * - When reshape a RAID6 (e.g. adding a device) + * which is in a non-standard layout, it is OK + * to preserve that layout. + * - When converting a RAID5 to RAID6, leave it in + * the XXX-6 layout, don't re-layout. 
+ */ + if (info.array.level == 6 && info.new_level == UnSet) + info.new_layout = info.array.layout; + else if (info.array.level == 5 && info.new_level == 6) { + char l[40]; + strcpy(l, map_num(r5layout, info.array.layout)); + strcat(l, "-6"); + info.new_layout = map_name(r6layout, l); + } else { + pr_err("%s in only meaningful when reshaping to RAID6\n", s->layout_str); + rv = 1; + goto release; + } + } else { + int l = info.new_level; + if (l == UnSet) + l = info.array.level; + switch (l) { + case 5: + info.new_layout = map_name(r5layout, s->layout_str); + break; + case 6: + info.new_layout = map_name(r6layout, s->layout_str); + break; + case 10: + info.new_layout = parse_layout_10(s->layout_str); + break; + case LEVEL_FAULTY: + info.new_layout = parse_layout_faulty(s->layout_str); + break; + default: + pr_err("layout not meaningful with this level\n"); + rv = 1; + goto release; + } + if (info.new_layout == UnSet) { + pr_err("layout %s not understood for this level\n", + s->layout_str); + rv = 1; + goto release; + } + } + + if (array.level == LEVEL_FAULTY) { + if (s->level != UnSet && s->level != array.level) { + pr_err("cannot change level of Faulty device\n"); + rv =1 ; + } + if (s->chunk) { + pr_err("cannot set chunksize of Faulty device\n"); + rv =1 ; + } + if (s->raiddisks && s->raiddisks != 1) { + pr_err("cannot set raid_disks of Faulty device\n"); + rv =1 ; + } + if (s->layout_str) { + if (md_get_array_info(fd, &array) != 0) { + dprintf("Cannot get array information.\n"); + goto release; + } + array.layout = info.new_layout; + if (md_set_array_info(fd, &array) != 0) { + pr_err("failed to set new layout\n"); + rv = 1; + } else if (c->verbose >= 0) + printf("layout for %s set to %d\n", + devname, array.layout); + } + } else if (array.level == LEVEL_CONTAINER) { + /* This change is to be applied to every array in the + * container. This is only needed when the metadata imposes + * restraints of the various arrays in the container. 
+ * Currently we only know that IMSM requires all arrays + * to have the same number of devices so changing the + * number of devices (On-Line Capacity Expansion) must be + * performed at the level of the container + */ + close_fd(&fd); + rv = reshape_container(container, devname, -1, st, &info, + c->force, c->backup_file, c->verbose, + 0, 0, 0); + frozen = 0; + } else { + /* get spare devices from external metadata + */ + if (st->ss->external) { + struct mdinfo *info2; + + info2 = st->ss->container_content(st, subarray); + if (info2) { + info.array.spare_disks = + info2->array.spare_disks; + sysfs_free(info2); + } + } + + /* Impose these changes on a single array. First + * check that the metadata is OK with the change. */ + + if (reshape_super(st, 0, info.new_level, + info.new_layout, info.new_chunk, + info.array.raid_disks, info.delta_disks, + c->backup_file, devname, + APPLY_METADATA_CHANGES, c->verbose)) { + rv = 1; + goto release; + } + sync_metadata(st); + rv = reshape_array(container, fd, devname, st, &info, c->force, + devlist, data_offset, c->backup_file, + c->verbose, 0, 0, 0); + frozen = 0; + } +release: + sysfs_free(sra); + if (frozen > 0) + unfreeze(st); + return rv; +} + +/* verify_reshape_position() + * Function checks if reshape position in metadata is not farther + * than position in md. 
static unsigned long long choose_offset(unsigned long long lo,
					unsigned long long hi,
					unsigned long long min,
					unsigned long long max)
{
	/* Pick a new offset for the range lo..hi.
	 * The result has to stay within min..max, should sit close to
	 * the midpoint of lo/hi, and should be aligned to as large a
	 * power of 2 as possible.
	 *
	 * Starting from the midpoint, every set bit is examined from
	 * the lowest upwards and cleared by either adding or
	 * subtracting it, whichever is permitted; when both are
	 * permitted the candidate that stays further from its boundary
	 * wins.  If neither is permitted the search stops.
	 *
	 * Bits from 1MB upwards are left alone.  Units are sectors, so
	 * 1MB = 2*1024 sectors.
	 */
	const unsigned long long one_mb = 2*1024;
	unsigned long long pick = (lo + hi) / 2;
	unsigned long long mask = 1;

	while (mask < one_mb) {
		if (pick & mask) {
			unsigned long long up = pick + mask;
			unsigned long long down = pick - mask;
			int up_fits = !(up > max);
			int down_fits = !(down < min);

			if (!up_fits && !down_fits)
				break;

			if (!up_fits)
				pick = down;
			else if (!down_fits)
				pick = up;
			else
				pick = (hi - up > down - lo) ? up : down;
		}
		mask <<= 1;
	}
	return pick;
}
dn : "-unknown-"); + goto release; + } + st2 = dup_super(st); + rv = st2->ss->load_super(st2,dfd, NULL); + close(dfd); + if (rv) { + free(st2); + pr_err("%s: cannot get superblock from %s\n", + devname, dn); + goto release; + } + st2->ss->getinfo_super(st2, &info2, NULL); + st2->ss->free_super(st2); + free(st2); + if (info2.space_before == 0 && + info2.space_after == 0) { + /* Metadata doesn't support data_offset changes */ + if (!can_fallback) + pr_err("%s: Metadata version doesn't support data_offset changes\n", + devname); + goto fallback; + } + if (before > info2.space_before) + before = info2.space_before; + if (after > info2.space_after) + after = info2.space_after; + + if (data_offset != INVALID_SECTORS) { + if (dir == 0) { + if (info2.data_offset == data_offset) { + pr_err("%s: already has that data_offset\n", + dn); + goto release; + } + if (data_offset < info2.data_offset) + dir = -1; + else + dir = 1; + } else if ((data_offset <= info2.data_offset && + dir == 1) || + (data_offset >= info2.data_offset && + dir == -1)) { + pr_err("%s: differing data offsets on devices make this --data-offset setting impossible\n", + dn); + goto release; + } + } + } + if (before == UINT64_MAX) + /* impossible really, there must be no devices */ + return 1; + + for (sd = sra->devs; sd; sd = sd->next) { + char *dn = map_dev(sd->disk.major, sd->disk.minor, 0); + unsigned long long new_data_offset; + + if (sd->disk.state & (1<<MD_DISK_FAULTY)) + continue; + if (delta_disks < 0) { + /* Don't need any space as array is shrinking + * just move data_offset up by min + */ + if (data_offset == INVALID_SECTORS) + new_data_offset = sd->data_offset + min; + else { + if (data_offset < sd->data_offset + min) { + pr_err("--data-offset too small for %s\n", + dn); + goto release; + } + new_data_offset = data_offset; + } + } else if (delta_disks > 0) { + /* need space before */ + if (before < min) { + if (can_fallback) + goto fallback; + pr_err("Insufficient head-space for reshape on %s\n", + 
dn); + goto release; + } + if (data_offset == INVALID_SECTORS) + new_data_offset = sd->data_offset - min; + else { + if (data_offset > sd->data_offset - min) { + pr_err("--data-offset too large for %s\n", + dn); + goto release; + } + new_data_offset = data_offset; + } + } else { + if (dir == 0) { + /* can move up or down. If 'data_offset' + * was set we would have already decided, + * so just choose direction with most space. + */ + if (before > after) + dir = -1; + else + dir = 1; + } + sysfs_set_str(sra, NULL, "reshape_direction", + dir == 1 ? "backwards" : "forwards"); + if (dir > 0) { + /* Increase data offset */ + if (after < min) { + if (can_fallback) + goto fallback; + pr_err("Insufficient tail-space for reshape on %s\n", + dn); + goto release; + } + if (data_offset != INVALID_SECTORS && + data_offset < sd->data_offset + min) { + pr_err("--data-offset too small on %s\n", + dn); + goto release; + } + if (data_offset != INVALID_SECTORS) + new_data_offset = data_offset; + else + new_data_offset = choose_offset(sd->data_offset, + sd->data_offset + after, + sd->data_offset + min, + sd->data_offset + after); + } else { + /* Decrease data offset */ + if (before < min) { + if (can_fallback) + goto fallback; + pr_err("insufficient head-room on %s\n", + dn); + goto release; + } + if (data_offset != INVALID_SECTORS && + data_offset > sd->data_offset - min) { + pr_err("--data-offset too large on %s\n", + dn); + goto release; + } + if (data_offset != INVALID_SECTORS) + new_data_offset = data_offset; + else + new_data_offset = choose_offset(sd->data_offset - before, + sd->data_offset, + sd->data_offset - before, + sd->data_offset - min); + } + } + err = sysfs_set_num(sra, sd, "new_offset", new_data_offset); + if (err < 0 && errno == E2BIG) { + /* try again after increasing data size to max */ + err = sysfs_set_num(sra, sd, "size", 0); + if (err < 0 && errno == EINVAL && + !(sd->disk.state & (1<<MD_DISK_SYNC))) { + /* some kernels have a bug where you cannot + * use '0' on 
spare devices. */ + sysfs_set_num(sra, sd, "size", + (sra->component_size + after)/2); + } + err = sysfs_set_num(sra, sd, "new_offset", + new_data_offset); + } + if (err < 0) { + if (errno == E2BIG && data_offset != INVALID_SECTORS) { + pr_err("data-offset is too big for %s\n", dn); + goto release; + } + if (sd == sra->devs && + (errno == ENOENT || errno == E2BIG)) + /* Early kernel, no 'new_offset' file, + * or kernel doesn't like us. + * For RAID5/6 this is not fatal + */ + return 1; + pr_err("Cannot set new_offset for %s\n", dn); + break; + } + } + return err; +release: + return -1; +fallback: + /* Just use a backup file */ + return 1; +} + +static int raid10_reshape(char *container, int fd, char *devname, + struct supertype *st, struct mdinfo *info, + struct reshape *reshape, + unsigned long long data_offset, + int force, int verbose) +{ + /* Changing raid_disks, layout, chunksize or possibly + * just data_offset for a RAID10. + * We must always change data_offset. We change by at least + * ->min_offset_change which is the largest of the old and new + * chunk sizes. + * If raid_disks is increasing, then data_offset must decrease + * by at least this copy size. + * If raid_disks is unchanged, data_offset must increase or + * decrease by at least min_offset_change but preferably by much more. + * We choose half of the available space. + * If raid_disks is decreasing, data_offset must increase by + * at least min_offset_change. To allow of this, component_size + * must be decreased by the same amount. + * + * So we calculate the required minimum and direction, possibly + * reduce the component_size, then iterate through the devices + * and set the new_data_offset. 
+ * If that all works, we set chunk_size, layout, raid_disks, and start + * 'reshape' + */ + struct mdinfo *sra; + unsigned long long min; + int err = 0; + + sra = sysfs_read(fd, NULL, + GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE|GET_CHUNK + ); + if (!sra) { + pr_err("%s: Cannot get array details from sysfs\n", devname); + goto release; + } + min = reshape->min_offset_change; + + if (info->delta_disks) + sysfs_set_str(sra, NULL, "reshape_direction", + info->delta_disks < 0 ? "backwards" : "forwards"); + if (info->delta_disks < 0 && info->space_after < min) { + int rv = sysfs_set_num(sra, NULL, "component_size", + (sra->component_size - min)/2); + if (rv) { + pr_err("cannot reduce component size\n"); + goto release; + } + } + err = set_new_data_offset(sra, st, devname, info->delta_disks, + data_offset, min, 0); + if (err == 1) { + pr_err("Cannot set new_data_offset: RAID10 reshape not\n"); + cont_err("supported on this kernel\n"); + err = -1; + } + if (err < 0) + goto release; + + if (!err && sysfs_set_num(sra, NULL, "chunk_size", info->new_chunk) < 0) + err = errno; + if (!err && sysfs_set_num(sra, NULL, "layout", + reshape->after.layout) < 0) + err = errno; + if (!err && + sysfs_set_num(sra, NULL, "raid_disks", + info->array.raid_disks + info->delta_disks) < 0) + err = errno; + if (!err && sysfs_set_str(sra, NULL, "sync_action", "reshape") < 0) + err = errno; + if (err) { + pr_err("Cannot set array shape for %s\n", + devname); + if (err == EBUSY && + (info->array.state & (1<<MD_SB_BITMAP_PRESENT))) + cont_err(" Bitmap must be removed before shape can be changed\n"); + goto release; + } + sysfs_free(sra); + return 0; +release: + sysfs_free(sra); + return 1; +} + +static void get_space_after(int fd, struct supertype *st, struct mdinfo *info) +{ + struct mdinfo *sra, *sd; + /* Initialisation to silence compiler warning */ + unsigned long long min_space_before = 0, min_space_after = 0; + int first = 1; + + sra = sysfs_read(fd, NULL, GET_DEVS); + if (!sra) + return; + 
for (sd = sra->devs; sd; sd = sd->next) { + char *dn; + int dfd; + struct supertype *st2; + struct mdinfo info2; + + if (sd->disk.state & (1<<MD_DISK_FAULTY)) + continue; + dn = map_dev(sd->disk.major, sd->disk.minor, 0); + dfd = dev_open(dn, O_RDONLY); + if (dfd < 0) + break; + st2 = dup_super(st); + if (st2->ss->load_super(st2,dfd, NULL)) { + close(dfd); + free(st2); + break; + } + close(dfd); + st2->ss->getinfo_super(st2, &info2, NULL); + st2->ss->free_super(st2); + free(st2); + if (first || + min_space_before > info2.space_before) + min_space_before = info2.space_before; + if (first || + min_space_after > info2.space_after) + min_space_after = info2.space_after; + first = 0; + } + if (sd == NULL && !first) { + info->space_after = min_space_after; + info->space_before = min_space_before; + } + sysfs_free(sra); +} + +static void update_cache_size(char *container, struct mdinfo *sra, + struct mdinfo *info, + int disks, unsigned long long blocks) +{ + /* Check that the internal stripe cache is + * large enough, or it won't work. + * It must hold at least 4 stripes of the larger + * chunk size + */ + unsigned long cache; + cache = max(info->array.chunk_size, info->new_chunk); + cache *= 4; /* 4 stripes minimum */ + cache /= 512; /* convert to sectors */ + /* make sure there is room for 'blocks' with a bit to spare */ + if (cache < 16 + blocks / disks) + cache = 16 + blocks / disks; + cache /= (4096/512); /* Convert from sectors to pages */ + + if (sra->cache_size < cache) + subarray_set_num(container, sra, "stripe_cache_size", + cache+1); +} + +static int impose_reshape(struct mdinfo *sra, + struct mdinfo *info, + struct supertype *st, + int fd, + int restart, + char *devname, char *container, + struct reshape *reshape) +{ + struct mdu_array_info_s array; + + sra->new_chunk = info->new_chunk; + + if (restart) { + /* for external metadata checkpoint saved by mdmon can be lost + * or missed /due to e.g. crash/. 
Check if md is not during + * restart farther than metadata points to. + * If so, this means metadata information is obsolete. + */ + if (st->ss->external) + verify_reshape_position(info, reshape->level); + sra->reshape_progress = info->reshape_progress; + } else { + sra->reshape_progress = 0; + if (reshape->after.data_disks < reshape->before.data_disks) + /* start from the end of the new array */ + sra->reshape_progress = (sra->component_size + * reshape->after.data_disks); + } + + md_get_array_info(fd, &array); + if (info->array.chunk_size == info->new_chunk && + reshape->before.layout == reshape->after.layout && + st->ss->external == 0) { + /* use SET_ARRAY_INFO but only if reshape hasn't started */ + array.raid_disks = reshape->after.data_disks + reshape->parity; + if (!restart && md_set_array_info(fd, &array) != 0) { + int err = errno; + + pr_err("Cannot set device shape for %s: %s\n", + devname, strerror(errno)); + + if (err == EBUSY && + (array.state & (1<<MD_SB_BITMAP_PRESENT))) + cont_err("Bitmap must be removed before shape can be changed\n"); + + goto release; + } + } else if (!restart) { + /* set them all just in case some old 'new_*' value + * persists from some earlier problem. 
+ */ + int err = 0; + if (sysfs_set_num(sra, NULL, "chunk_size", info->new_chunk) < 0) + err = errno; + if (!err && sysfs_set_num(sra, NULL, "layout", + reshape->after.layout) < 0) + err = errno; + if (!err && subarray_set_num(container, sra, "raid_disks", + reshape->after.data_disks + + reshape->parity) < 0) + err = errno; + if (err) { + pr_err("Cannot set device shape for %s\n", devname); + + if (err == EBUSY && + (array.state & (1<<MD_SB_BITMAP_PRESENT))) + cont_err("Bitmap must be removed before shape can be changed\n"); + goto release; + } + } + return 0; +release: + return -1; +} +

/*
 * impose_level - switch the array open on 'fd' to RAID level 'level'.
 *
 * When converting a RAID4/5/6 array to RAID0, every device that is not an
 * active data disk is first hot-removed (so that no recovery starts) and
 * whatever is left is then marked faulty and removed.  RAID5 must already
 * use ALGORITHM_PARITY_N and RAID6 ALGORITHM_PARITY_N_6, otherwise nothing
 * is changed.  The new level is then written to the sysfs "level" attribute.
 *
 * 'devname' is only used in messages; the success message is printed when
 * 'verbose' >= 0.
 *
 * Returns:
 *    0  on success (also when map_num() knows no name for 'level'),
 *    1  if sysfs_init() fails,
 *   -1  if the current layout does not allow the conversion to RAID0,
 *   the errno value if writing "level" fails.
 */
static int impose_level(int fd, int level, char *devname, int verbose)
{
	char *c;
	struct mdu_array_info_s array;
	struct mdinfo info;

	if (sysfs_init(&info, fd, NULL)) {
		pr_err("failed to initialize sysfs.\n");
		return 1;
	}

	md_get_array_info(fd, &array);
	if (level == 0 && (array.level >= 4 && array.level <= 6)) {
		/* To convert to RAID0 we need to fail and
		 * remove any non-data devices.
		 */
		int found = 0;
		int d;
		int data_disks = array.raid_disks - 1;
		if (array.level == 6)
			data_disks -= 1;
		if (array.level == 5 && array.layout != ALGORITHM_PARITY_N)
			return -1;
		if (array.level == 6 && array.layout != ALGORITHM_PARITY_N_6)
			return -1;
		sysfs_set_str(&info, NULL,"sync_action", "idle");
		/* First remove any spares so no recovery starts */
		for (d = 0, found = 0;
		     d < MAX_DISKS && found < array.nr_disks; d++) {
			mdu_disk_info_t disk;
			disk.number = d;
			if (md_get_disk_info(fd, &disk) < 0)
				continue;
			if (disk.major == 0 && disk.minor == 0)
				continue;
			found++;
			if ((disk.state & (1 << MD_DISK_ACTIVE)) &&
			    disk.raid_disk < data_disks)
				/* keep this */
				continue;
			ioctl(fd, HOT_REMOVE_DISK,
			      makedev(disk.major, disk.minor));
		}
		/* Now fail anything left */
		md_get_array_info(fd, &array);
		for (d = 0, found = 0;
		     d < MAX_DISKS && found < array.nr_disks; d++) {
			mdu_disk_info_t disk;
			disk.number = d;
			if (md_get_disk_info(fd, &disk) < 0)
				continue;
			if (disk.major == 0 && disk.minor == 0)
				continue;
			found++;
			if ((disk.state & (1 << MD_DISK_ACTIVE)) &&
			    disk.raid_disk < data_disks)
				/* keep this */
				continue;
			ioctl(fd, SET_DISK_FAULTY,
			      makedev(disk.major, disk.minor));
			hot_remove_disk(fd, makedev(disk.major, disk.minor), 1);
		}
	}
	c = map_num(pers, level);
	if (c) {
		int err = sysfs_set_str(&info, NULL, "level", c);
		if (err) {
			/* save errno before pr_err() can disturb it */
			err = errno;
			pr_err("%s: could not set level to %s\n",
			       devname, c);
			if (err == EBUSY &&
			    (array.state & (1<<MD_SB_BITMAP_PRESENT)))
				cont_err("Bitmap must be removed before level can be changed\n");
			return err;
		}
		if (verbose >= 0)
			pr_err("level of %s changed to %s\n", devname, c);
	}
	return 0;
}

/* Set to 1 by catch_term() once SIGTERM has been received. */
int sigterm = 0;

/* SIGTERM handler installed by reshape_array(): only records the signal. */
static void catch_term(int sig)
{
	sigterm = 1;
}

/*
 * reshape_array - start (or continue) the reshape of one array and, where
 * needed, monitor it from a background process.
 *
 *  container      container name, or NULL (then sra->sys_name is used for
 *                 continue_via_systemd())
 *  fd             open descriptor of the array; it is closed here before the
 *                 monitoring phase
 *  devname        array name used in messages and for re-opening the array
 *  st, info       metadata handler and description of the requested change
 *  force          skip the "enough spares to avoid a degraded array" check
 *  devlist        extra devices handed to Manage_subdevs() once the array
 *                 is at the right level and frozen
 *  data_offset    requested data offset, or INVALID_SECTORS
 *  backup_file    backup file name, or NULL
 *  verbose        progress messages are printed when >= 0
 *  forked         non-zero if the caller already runs in a background
 *                 process: no fork() is done and the function returns
 *                 instead of calling exit()
 *  restart        non-zero when continuing a reshape that already started
 *  freeze_reshape non-zero to set the reshape up but return before
 *                 monitoring it
 *
 * Returns 1 on failure (and after the 'freeze_reshape' set-up), 0 on
 * success.  When it forks itself, the parent returns 0 and the child ends
 * with exit(0).
 */
static int reshape_array(char *container, int fd, char *devname,
			 struct supertype *st, struct mdinfo *info,
			 int force, struct mddev_dev *devlist,
			 unsigned long long data_offset,
			 char *backup_file, int verbose, int forked,
			 int restart, int freeze_reshape)
{
	struct reshape reshape;
	int spares_needed;
	char *msg;
	int orig_level = UnSet;
	int odisks;
	int delayed;

	struct mdu_array_info_s array;
	char *c;

	struct mddev_dev *dv;
	int added_disks;

	int *fdlist = NULL;
	unsigned long long *offsets = NULL;
	int d;
	int nrdisks;
	int err;
	unsigned long blocks;
	unsigned long long array_size;
	int done;
	struct mdinfo *sra = NULL;
	char buf[20];

	/* when reshaping a RAID0, the component_size might be zero.
	 * So try to fix that up.
	 */
	if (md_get_array_info(fd, &array) != 0) {
		dprintf("Cannot get array information.\n");
		goto release;
	}
	if (array.level == 0 && info->component_size == 0) {
		get_dev_size(fd, NULL, &array_size);
		info->component_size = array_size / array.raid_disks;
	}

	if (array.level == 10)
		/* Need space_after info */
		get_space_after(fd, st, info);

	if (info->reshape_active) {
		/* Analyse the change as seen from the 'before' geometry:
		 * temporarily hide new_level and the already-added disks.
		 */
		int new_level = info->new_level;
		info->new_level = UnSet;
		if (info->delta_disks > 0)
			info->array.raid_disks -= info->delta_disks;
		msg = analyse_change(devname, info, &reshape);
		info->new_level = new_level;
		if (info->delta_disks > 0)
			info->array.raid_disks += info->delta_disks;
		if (!restart)
			/* Make sure the array isn't read-only */
			ioctl(fd, RESTART_ARRAY_RW, 0);
	} else
		msg = analyse_change(devname, info, &reshape);
	if (msg) {
		/* if msg == "", error has already been printed */
		if (msg[0])
			pr_err("%s\n", msg);
		goto release;
	}
	if (restart && (reshape.level != info->array.level ||
			reshape.before.layout != info->array.layout ||
			reshape.before.data_disks + reshape.parity !=
			info->array.raid_disks - max(0, info->delta_disks))) {
		pr_err("reshape info is not in native format - cannot continue.\n");
		goto release;
	}

	if (st->ss->external && restart && (info->reshape_progress == 0) &&
	    !((sysfs_get_str(info, NULL, "sync_action",
			     buf, sizeof(buf)) > 0) &&
	      (strncmp(buf, "reshape", 7) == 0))) {
		/* When a reshape is restarted from '0', the very beginning
		 * of the array, it is possible that for external metadata
		 * the reshape and array configuration has not happened yet.
		 * Check if md has the same opinion, and reshape is restarted
		 * from 0. If so, this is a regular reshape start after the
		 * reshape switched in metadata to the next array only.
		 */
		if ((verify_reshape_position(info, reshape.level) >= 0) &&
		    (info->reshape_progress == 0))
			restart = 0;
	}
	if (restart) {
		/*
		 * reshape already started. just skip to monitoring
		 * the reshape
		 */
		if (reshape.backup_blocks == 0)
			return 0;
		if (restart & RESHAPE_NO_BACKUP)
			return 0;

		/* Need 'sra' down at 'started:' */
		sra = sysfs_read(fd, NULL,
				 GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE|
				 GET_CHUNK|GET_CACHE);
		if (!sra) {
			pr_err("%s: Cannot get array details from sysfs\n",
			       devname);
			goto release;
		}

		if (!backup_file)
			backup_file = locate_backup(sra->sys_name);

		goto started;
	}
	/* The container is frozen but the array may not be.
	 * So freeze the array so spares don't get put to the wrong use
	 * FIXME there should probably be a cleaner separation between
	 * freeze_array and freeze_container.
	 */
	sysfs_freeze_array(info);
	/* Check we have enough spares to not be degraded */
	added_disks = 0;
	for (dv = devlist; dv ; dv=dv->next)
		added_disks++;
	spares_needed = max(reshape.before.data_disks,
			    reshape.after.data_disks)
		+ reshape.parity - array.raid_disks;

	if (!force && info->new_level > 1 && info->array.level > 1 &&
	    spares_needed > info->array.spare_disks + added_disks) {
		pr_err("Need %d spare%s to avoid degraded array, and only have %d.\n"
		       " Use --force to over-ride this check.\n",
		       spares_needed,
		       spares_needed == 1 ? "" : "s",
		       info->array.spare_disks + added_disks);
		goto release;
	}
	/* Check we have enough spares to not fail */
	spares_needed = max(reshape.before.data_disks,
			    reshape.after.data_disks)
		- array.raid_disks;
	if ((info->new_level > 1 || info->new_level == 0) &&
	    spares_needed > info->array.spare_disks +added_disks) {
		pr_err("Need %d spare%s to create working array, and only have %d.\n",
		       spares_needed, spares_needed == 1 ? "" : "s",
		       info->array.spare_disks + added_disks);
		goto release;
	}

	if (reshape.level != array.level) {
		int level_err = impose_level(fd, reshape.level, devname,
					     verbose);
		if (level_err)
			goto release;
		info->new_layout = UnSet; /* after level change,
					   * layout is meaningless */
		/* remembered so that 'release:' can revert the level */
		orig_level = array.level;
		sysfs_freeze_array(info);

		if (reshape.level > 0 && st->ss->external) {
			/* make sure mdmon is aware of the new level */
			if (mdmon_running(container))
				flush_mdmon(container);

			if (!mdmon_running(container))
				start_mdmon(container);
			ping_monitor(container);
			if (mdmon_running(container) && st->update_tail == NULL)
				st->update_tail = &st->updates;
		}
	}
	/* ->reshape_super might have chosen some spares from the
	 * container that it wants to be part of the new array.
	 * We can collect them with ->container_content and give
	 * them to the kernel.
	 */
	if (st->ss->reshape_super && st->ss->container_content) {
		char *subarray = strchr(info->text_version+1, '/')+1;
		struct mdinfo *info2 =
			st->ss->container_content(st, subarray);
		struct mdinfo *spare;

		if (info2) {
			if (sysfs_init(info2, fd, st->devnm)) {
				pr_err("unable to initialize sysfs for %s\n",
				       st->devnm);
				/* release it the same way as the success
				 * path below does
				 */
				sysfs_free(info2);
				goto release;
			}
			/* When increasing number of devices, we need to set
			 * new raid_disks before adding these, or they might
			 * be rejected.
			 */
			if (reshape.backup_blocks &&
			    reshape.after.data_disks >
			    reshape.before.data_disks)
				subarray_set_num(container, info2, "raid_disks",
						 reshape.after.data_disks +
						 reshape.parity);
			for (spare = info2->devs; spare; spare = spare->next) {
				if (spare->disk.state == 0 &&
				    spare->disk.raid_disk >= 0) {
					/* This is a spare that wants to
					 * be part of the array.
					 */
					add_disk(fd, st, info2, spare);
				}
			}
			sysfs_free(info2);
		}
	}
	/* We might have been given some devices to add to the
	 * array. Now that the array has been changed to the right
	 * level and frozen, we can safely add them.
	 */
	if (devlist) {
		if (Manage_subdevs(devname, fd, devlist, verbose, 0, NULL, 0))
			goto release;
	}

	if (reshape.backup_blocks == 0 && data_offset != INVALID_SECTORS)
		reshape.backup_blocks = reshape.before.data_disks * info->array.chunk_size/512;
	if (reshape.backup_blocks == 0) {
		/* No restriping needed, but we might need to impose
		 * some more changes: layout, raid_disks, chunk_size
		 */
		/* read current array info */
		if (md_get_array_info(fd, &array) != 0) {
			dprintf("Cannot get array information.\n");
			goto release;
		}
		/* compare current array info with new values and if
		 * it is different update them to new */
		if (info->new_layout != UnSet &&
		    info->new_layout != array.layout) {
			array.layout = info->new_layout;
			if (md_set_array_info(fd, &array) != 0) {
				pr_err("failed to set new layout\n");
				goto release;
			} else if (verbose >= 0)
				printf("layout for %s set to %d\n",
				       devname, array.layout);
		}
		if (info->delta_disks != UnSet && info->delta_disks != 0 &&
		    array.raid_disks !=
		    (info->array.raid_disks + info->delta_disks)) {
			array.raid_disks += info->delta_disks;
			if (md_set_array_info(fd, &array) != 0) {
				pr_err("failed to set raid disks\n");
				goto release;
			} else if (verbose >= 0) {
				printf("raid_disks for %s set to %d\n",
				       devname, array.raid_disks);
			}
		}
		if (info->new_chunk != 0 &&
		    info->new_chunk != array.chunk_size) {
			if (sysfs_set_num(info, NULL,
					  "chunk_size", info->new_chunk) != 0) {
				pr_err("failed to set chunk size\n");
				goto release;
			} else if (verbose >= 0)
				printf("chunk size for %s set to %d\n",
				       devname, info->new_chunk);
		}
		unfreeze(st);
		return 0;
	}

	/*
	 * There are three possibilities.
	 * 1/ The array will shrink.
	 *    We need to ensure the reshape will pause before reaching
	 *    the 'critical section'.  We also need to fork and wait for
	 *    that to happen.  When it does we
	 *       suspend/backup/complete/unfreeze
	 *
	 * 2/ The array will not change size.
	 *    This requires that we keep a backup of a sliding window
	 *    so that we can restore data after a crash.  So we need
	 *    to fork and monitor progress.
	 *    In future we will allow the data_offset to change, so
	 *    a sliding backup becomes unnecessary.
	 *
	 * 3/ The array will grow. This is relatively easy.
	 *    However the kernel's restripe routines will cheerfully
	 *    overwrite some early data before it is safe.  So we
	 *    need to make a backup of the early parts of the array
	 *    and be ready to restore it if rebuild aborts very early.
	 *    For externally managed metadata, we still need a forked
	 *    child to monitor the reshape and suspend IO over the region
	 *    that is being reshaped.
	 *
	 *    We backup data by writing it to one spare, or to a
	 *    file which was given on command line.
	 *
	 * In each case, we first make sure that storage is available
	 * for the required backup.
	 * Then we:
	 *   -  request the shape change.
	 *   -  fork to handle backup etc.
	 */
	/* Check that we can hold all the data */
	get_dev_size(fd, NULL, &array_size);
	if (reshape.new_size < (array_size/512)) {
		pr_err("this change will reduce the size of the array.\n"
		       " use --grow --array-size first to truncate array.\n"
		       " e.g. mdadm --grow %s --array-size %llu\n",
		       devname, reshape.new_size/2);
		goto release;
	}

	if (array.level == 10) {
		/* Reshaping RAID10 does not require any data backup by
		 * user-space.  Instead it requires that the data_offset
		 * is changed to avoid the need for backup.
		 * So this is handled very separately
		 */
		if (restart)
			/* Nothing to do. */
			return 0;
		return raid10_reshape(container, fd, devname, st, info,
				      &reshape, data_offset, force, verbose);
	}
	sra = sysfs_read(fd, NULL,
			 GET_COMPONENT|GET_DEVS|GET_OFFSET|GET_STATE|GET_CHUNK|
			 GET_CACHE);
	if (!sra) {
		pr_err("%s: Cannot get array details from sysfs\n",
		       devname);
		goto release;
	}

	if (!backup_file)
		switch(set_new_data_offset(sra, st, devname,
					   reshape.after.data_disks - reshape.before.data_disks,
					   data_offset,
					   reshape.min_offset_change, 1)) {
	case -1:
		goto release;
	case 0:
		/* Updated data_offset, so it's easy now */
		update_cache_size(container, sra, info,
				  min(reshape.before.data_disks,
				      reshape.after.data_disks),
				  reshape.backup_blocks);

		/* Right, everything seems fine. Let's kick things off.
		 */
		sync_metadata(st);

		if (impose_reshape(sra, info, st, fd, restart,
				   devname, container, &reshape) < 0)
			goto release;
		if (sysfs_set_str(sra, NULL, "sync_action", "reshape") < 0) {
			struct mdinfo *sd;
			if (errno != EINVAL) {
				pr_err("Failed to initiate reshape!\n");
				goto release;
			}
			/* revert data_offset and try the old way */
			for (sd = sra->devs; sd; sd = sd->next) {
				sysfs_set_num(sra, sd, "new_offset",
					      sd->data_offset);
				sysfs_set_str(sra, NULL, "reshape_direction",
					      "forwards");
			}
			break;
		}
		if (info->new_level == reshape.level)
			return 0;
		/* need to adjust level when reshape completes */
		switch(fork()) {
		case -1: /* ignore error, but don't wait */
			return 0;
		default: /* parent */
			return 0;
		case 0:
			manage_fork_fds(0);
			map_fork();
			break;
		}
		close(fd);
		wait_reshape(sra);
		fd = open_dev(sra->sys_name);
		if (fd >= 0)
			impose_level(fd, info->new_level, devname, verbose);
		return 0;
	case 1: /* Couldn't set data_offset, try the old way */
		if (data_offset != INVALID_SECTORS) {
			pr_err("Cannot update data_offset on this array\n");
			goto release;
		}
		break;
	}

started:
	/* Decide how many blocks (sectors) for a reshape
	 * unit.  The number we have so far is just a minimum
	 */
	blocks = reshape.backup_blocks;
	if (reshape.before.data_disks ==
	    reshape.after.data_disks) {
		/* Make 'blocks' bigger for better throughput, but
		 * not so big that we reject it below.
		 * Try for 16 megabytes
		 */
		while (blocks * 32 < sra->component_size && blocks < 16*1024*2)
			blocks *= 2;
	} else
		pr_err("Need to backup %luK of critical section..\n", blocks/2);

	if (blocks >= sra->component_size/2) {
		pr_err("%s: Something wrong - reshape aborted\n", devname);
		goto release;
	}

	/* Now we need to open all these devices so we can read/write.
	 */
	nrdisks = max(reshape.before.data_disks,
		      reshape.after.data_disks) + reshape.parity
		+ sra->array.spare_disks;
	fdlist = xcalloc((1+nrdisks), sizeof(int));
	offsets = xcalloc((1+nrdisks), sizeof(offsets[0]));

	odisks = reshape.before.data_disks + reshape.parity;
	d = reshape_prepare_fdlist(devname, sra, odisks, nrdisks, blocks,
				   backup_file, fdlist, offsets);
	if (d < odisks) {
		goto release;
	}
	if ((st->ss->manage_reshape == NULL) ||
	    (st->ss->recover_backup == NULL)) {
		if (backup_file == NULL) {
			if (reshape.after.data_disks <=
			    reshape.before.data_disks) {
				pr_err("%s: Cannot grow - need backup-file\n",
				       devname);
				pr_err(" Please provide one with \"--backup=...\"\n");
				goto release;
			} else if (d == odisks) {
				pr_err("%s: Cannot grow - need a spare or backup-file to backup critical section\n", devname);
				goto release;
			}
		} else {
			if (!reshape_open_backup_file(backup_file, fd, devname,
						      (signed)blocks,
						      fdlist+d, offsets+d,
						      sra->sys_name, restart)) {
				goto release;
			}
			d++;
		}
	}

	update_cache_size(container, sra, info,
			  min(reshape.before.data_disks,
			      reshape.after.data_disks), blocks);

	/* Right, everything seems fine. Let's kick things off.
	 * If only changing raid_disks, use ioctl, else use
	 * sysfs.
	 */
	sync_metadata(st);

	if (impose_reshape(sra, info, st, fd, restart,
			   devname, container, &reshape) < 0)
		goto release;

	err = start_reshape(sra, restart, reshape.before.data_disks,
			    reshape.after.data_disks, st);
	if (err) {
		pr_err("Cannot %s reshape for %s\n",
		       restart ? "continue" : "start", devname);
		goto release;
	}
	if (restart)
		sysfs_set_str(sra, NULL, "array_state", "active");
	if (freeze_reshape) {
		/* 'sra' is released below, so fetch the value needed for
		 * the message first instead of reading freed memory.
		 */
		unsigned long long resume_from = sra->reshape_progress;

		free(fdlist);
		free(offsets);
		sysfs_free(sra);
		pr_err("Reshape has to be continued from location %llu when root filesystem has been mounted.\n",
		       resume_from);
		return 1;
	}

	if (!forked)
		if (continue_via_systemd(container ?: sra->sys_name,
					 GROW_SERVICE)) {
			free(fdlist);
			free(offsets);
			sysfs_free(sra);
			return 0;
		}

	close(fd);
	/* Now we just need to kick off the reshape and watch, while
	 * handling backups of the data...
	 * This is all done by a forked background process.
	 */
	switch(forked ? 0 : fork()) {
	case -1:
		pr_err("Cannot run child to monitor reshape: %s\n",
		       strerror(errno));
		abort_reshape(sra);
		goto release;
	default:
		free(fdlist);
		free(offsets);
		sysfs_free(sra);
		return 0;
	case 0:
		map_fork();
		break;
	}

	/* If another array on the same devices is busy, the
	 * reshape will wait for them.  This would mean that
	 * the first section that we suspend will stay suspended
	 * for a long time.  So check on that possibility
	 * by looking for "DELAYED" in /proc/mdstat, and if found,
	 * wait a while
	 */
	do {
		struct mdstat_ent *mds, *m;
		delayed = 0;
		mds = mdstat_read(1, 0);
		for (m = mds; m; m = m->next)
			if (strcmp(m->devnm, sra->sys_name) == 0) {
				if (m->resync && m->percent == RESYNC_DELAYED)
					delayed = 1;
				if (m->resync == 0)
					/* Haven't started the reshape thread
					 * yet, wait a bit
					 */
					delayed = 2;
				break;
			}
		free_mdstat(mds);
		if (delayed == 1 && get_linux_version() < 3007000) {
			pr_err("Reshape is delayed, but cannot wait carefully with this kernel.\n"
			       " You might experience problems until other reshapes complete.\n");
			delayed = 0;
		}
		if (delayed)
			/* 30 for delayed == 1, 5 for delayed == 2 */
			mdstat_wait(30 - (delayed-1) * 25);
	} while (delayed);
	mdstat_close();
	if (check_env("MDADM_GROW_VERIFY"))
		fd = open(devname, O_RDONLY | O_DIRECT);
	else
		fd = -1;
	mlockall(MCL_FUTURE);

	signal(SIGTERM, catch_term);

	if (st->ss->external) {
		/* metadata handler takes it from here */
		done = st->ss->manage_reshape(
			fd, sra, &reshape, st, blocks,
			fdlist, offsets, d - odisks, fdlist + odisks,
			offsets + odisks);
	} else
		done = child_monitor(
			fd, sra, &reshape, st, blocks, fdlist, offsets,
			d - odisks, fdlist + odisks, offsets + odisks);

	free(fdlist);
	free(offsets);

	if (backup_file && done) {
		char *bul;
		bul = make_backup(sra->sys_name);
		if (bul) {
			char target[1024];
			int l = readlink(bul, target, sizeof(target) - 1);
			if (l > 0) {
				/* readlink() does not terminate the string */
				target[l]=0;
				unlink(target);
			}
			unlink(bul);
			free(bul);
		}
		unlink(backup_file);
	}
	if (!done) {
		abort_reshape(sra);
		goto out;
	}

	if (!st->ss->external &&
	    !(reshape.before.data_disks != reshape.after.data_disks &&
	      info->custom_array_size) && info->new_level == reshape.level &&
	    !forked) {
		/* no need to wait for the reshape to finish as
		 * there is nothing more to do.
		 */
		sysfs_free(sra);
		exit(0);
	}
	wait_reshape(sra);

	if (st->ss->external) {
		/* Re-load the metadata as much could have changed */
		int cfd = open_dev(st->container_devnm);
		if (cfd >= 0) {
			flush_mdmon(container);
			st->ss->free_super(st);
			st->ss->load_container(st, cfd, container);
			close(cfd);
		}
	}

	/* set new array size if required; custom_array_size is used
	 * by this metadata.
	 */
	if (reshape.before.data_disks != reshape.after.data_disks &&
	    info->custom_array_size)
		set_array_size(st, info, info->text_version);

	if (info->new_level != reshape.level) {
		if (fd < 0)
			fd = open(devname, O_RDONLY);
		impose_level(fd, info->new_level, devname, verbose);
		close(fd);
		if (info->new_level == 0)
			st->update_tail = NULL;
	}
out:
	sysfs_free(sra);
	if (forked)
		return 0;
	unfreeze(st);
	exit(0);

release:
	free(fdlist);
	free(offsets);
	if (orig_level != UnSet && sra) {
		/* a level change was imposed above: try to undo it */
		c = map_num(pers, orig_level);
		if (c && sysfs_set_str(sra, NULL, "level", c) == 0)
			pr_err("aborting level change\n");
	}
	sysfs_free(sra);
	if (!forked)
		unfreeze(st);
	return 1;
}

/* mdfd handle is passed to be closed in child process (after fork).
+ */ +int reshape_container(char *container, char *devname, + int mdfd, + struct supertype *st, + struct mdinfo *info, + int force, + char *backup_file, int verbose, + int forked, int restart, int freeze_reshape) +{ + struct mdinfo *cc = NULL; + int rv = restart; + char last_devnm[32] = ""; + + /* component_size is not meaningful for a container, + * so pass '0' meaning 'no change' + */ + if (!restart && + reshape_super(st, 0, info->new_level, + info->new_layout, info->new_chunk, + info->array.raid_disks, info->delta_disks, + backup_file, devname, APPLY_METADATA_CHANGES, + verbose)) { + unfreeze(st); + return 1; + } + + sync_metadata(st); + + /* ping monitor to be sure that update is on disk + */ + ping_monitor(container); + + if (!forked && !freeze_reshape) + if (continue_via_systemd(container, GROW_SERVICE)) + return 0; + + switch (forked ? 0 : fork()) { + case -1: /* error */ + perror("Cannot fork to complete reshape\n"); + unfreeze(st); + return 1; + default: /* parent */ + if (!freeze_reshape) + printf("%s: multi-array reshape continues in background\n", Name); + return 0; + case 0: /* child */ + manage_fork_fds(0); + map_fork(); + break; + } + + /* close unused handle in child process + */ + if (mdfd > -1) + close(mdfd); + + while(1) { + /* For each member array with reshape_active, + * we need to perform the reshape. + * We pick the first array that needs reshaping and + * reshape it. reshape_array() will re-read the metadata + * so the next time through a different array should be + * ready for reshape. + * It is possible that the 'different' array will not + * be assembled yet. In that case we simple exit. + * When it is assembled, the mdadm which assembles it + * will take over the reshape. 
+ */ + struct mdinfo *content; + int fd; + struct mdstat_ent *mdstat; + char *adev; + dev_t devid; + + sysfs_free(cc); + + cc = st->ss->container_content(st, NULL); + + for (content = cc; content ; content = content->next) { + char *subarray; + if (!content->reshape_active) + continue; + + subarray = strchr(content->text_version+1, '/')+1; + mdstat = mdstat_by_subdev(subarray, container); + if (!mdstat) + continue; + if (mdstat->active == 0) { + pr_err("Skipping inactive array %s.\n", + mdstat->devnm); + free_mdstat(mdstat); + mdstat = NULL; + continue; + } + break; + } + if (!content) + break; + + devid = devnm2devid(mdstat->devnm); + adev = map_dev(major(devid), minor(devid), 0); + if (!adev) + adev = content->text_version; + + fd = open_dev(mdstat->devnm); + if (fd < 0) { + pr_err("Device %s cannot be opened for reshape.\n", + adev); + break; + } + + if (strcmp(last_devnm, mdstat->devnm) == 0) { + /* Do not allow for multiple reshape_array() calls for + * the same array. + * It can happen when reshape_array() returns without + * error, when reshape is not finished (wrong reshape + * starting/continuation conditions). Mdmon doesn't + * switch to next array in container and reentry + * conditions for the same array occur. + * This is possibly interim until the behaviour of + * reshape_array is resolved(). 
+ */ + printf("%s: Multiple reshape execution detected for device %s.\n", Name, adev); + close(fd); + break; + } + strcpy(last_devnm, mdstat->devnm); + + if (sysfs_init(content, fd, mdstat->devnm)) { + pr_err("Unable to initialize sysfs for %s\n", + mdstat->devnm); + rv = 1; + break; + } + + if (mdmon_running(container)) + flush_mdmon(container); + + rv = reshape_array(container, fd, adev, st, + content, force, NULL, INVALID_SECTORS, + backup_file, verbose, 1, restart, + freeze_reshape); + close(fd); + + if (freeze_reshape) { + sysfs_free(cc); + exit(0); + } + + restart = 0; + if (rv) + break; + + if (mdmon_running(container)) + flush_mdmon(container); + } + if (!rv) + unfreeze(st); + sysfs_free(cc); + exit(0); +} + +/* + * We run a child process in the background which performs the following + * steps: + * - wait for resync to reach a certain point + * - suspend io to the following section + * - backup that section + * - allow resync to proceed further + * - resume io + * - discard the backup. + * + * When are combined in slightly different ways in the three cases. + * Grow: + * - suspend/backup/allow/wait/resume/discard + * Shrink: + * - allow/wait/suspend/backup/allow/wait/resume/discard + * same-size: + * - wait/resume/discard/suspend/backup/allow + * + * suspend/backup/allow always come together + * wait/resume/discard do too. + * For the same-size case we have two backups to improve flow. + * + */ + +int progress_reshape(struct mdinfo *info, struct reshape *reshape, + unsigned long long backup_point, + unsigned long long wait_point, + unsigned long long *suspend_point, + unsigned long long *reshape_completed, int *frozen) +{ + /* This function is called repeatedly by the reshape manager. + * It determines how much progress can safely be made and allows + * that progress. 
+ * - 'info' identifies the array and particularly records in + * ->reshape_progress the metadata's knowledge of progress + * This is a sector offset from the start of the array + * of the next array block to be relocated. This number + * may increase from 0 or decrease from array_size, depending + * on the type of reshape that is happening. + * Note that in contrast, 'sync_completed' is a block count of the + * reshape so far. It gives the distance between the start point + * (head or tail of device) and the next place that data will be + * written. It always increases. + * - 'reshape' is the structure created by analyse_change + * - 'backup_point' shows how much the metadata manager has backed-up + * data. For reshapes with increasing progress, it is the next address + * to be backed up, previous addresses have been backed-up. For + * decreasing progress, it is the earliest address that has been + * backed up - later address are also backed up. + * So addresses between reshape_progress and backup_point are + * backed up providing those are in the 'correct' order. + * - 'wait_point' is an array address. When reshape_completed + * passes this point, progress_reshape should return. It might + * return earlier if it determines that ->reshape_progress needs + * to be updated or further backup is needed. + * - suspend_point is maintained by progress_reshape and the caller + * should not touch it except to initialise to zero. + * It is an array address and it only increases in 2.6.37 and earlier. + * This makes it difficult to handle reducing reshapes with + * external metadata. + * However: it is similar to backup_point in that it records the + * other end of a suspended region from reshape_progress. + * it is moved to extend the region that is safe to backup and/or + * reshape + * - reshape_completed is read from sysfs and returned. 
The caller + * should copy this into ->reshape_progress when it has reason to + * believe that the metadata knows this, and any backup outside this + * has been erased. + * + * Return value is: + * 1 if more data from backup_point - but only as far as suspend_point, + * should be backed up + * 0 if things are progressing smoothly + * -1 if the reshape is finished because it is all done, + * -2 if the reshape is finished due to an error. + */ + + int advancing = (reshape->after.data_disks + >= reshape->before.data_disks); + unsigned long long need_backup; /* All data between start of array and + * here will at some point need to + * be backed up. + */ + unsigned long long read_offset, write_offset; + unsigned long long write_range; + unsigned long long max_progress, target, completed; + unsigned long long array_size = (info->component_size + * reshape->before.data_disks); + int fd; + char buf[20]; + + /* First, we unsuspend any region that is now known to be safe. + * If suspend_point is on the 'wrong' side of reshape_progress, then + * we don't have or need suspension at the moment. This is true for + * native metadata when we don't need to back-up. + */ + if (advancing) { + if (info->reshape_progress <= *suspend_point) + sysfs_set_num(info, NULL, "suspend_lo", + info->reshape_progress); + } else { + /* Note: this won't work in 2.6.37 and before. + * Something somewhere should make sure we don't need it! + */ + if (info->reshape_progress >= *suspend_point) + sysfs_set_num(info, NULL, "suspend_hi", + info->reshape_progress); + } + + /* Now work out how far it is safe to progress. + * If the read_offset for ->reshape_progress is less than + * 'blocks' beyond the write_offset, we can only progress as far + * as a backup. + * Otherwise we can progress until the write_offset for the new location + * reaches (within 'blocks' of) the read_offset at the current location. + * However that region must be suspended unless we are using native + * metadata. 
+ * If we need to suspend more, we limit it to 128M per device, which is + * rather arbitrary and should be some time-based calculation. + */ + read_offset = info->reshape_progress / reshape->before.data_disks; + write_offset = info->reshape_progress / reshape->after.data_disks; + write_range = info->new_chunk/512; + if (reshape->before.data_disks == reshape->after.data_disks) + need_backup = array_size; + else + need_backup = reshape->backup_blocks; + if (advancing) { + if (read_offset < write_offset + write_range) + max_progress = backup_point; + else + max_progress = + read_offset * reshape->after.data_disks; + } else { + if (read_offset > write_offset - write_range) + /* Can only progress as far as has been backed up, + * which must be suspended */ + max_progress = backup_point; + else if (info->reshape_progress <= need_backup) + max_progress = backup_point; + else { + if (info->array.major_version >= 0) + /* Can progress until backup is needed */ + max_progress = need_backup; + else { + /* Can progress until metadata update is required */ + max_progress = + read_offset * reshape->after.data_disks; + /* but data must be suspended */ + if (max_progress < *suspend_point) + max_progress = *suspend_point; + } + } + } + + /* We know it is safe to progress to 'max_progress' providing + * it is suspended or we are using native metadata. + * Consider extending suspend_point 128M per device if it + * is less than 64M per device beyond reshape_progress. + * But always do a multiple of 'blocks' + * FIXME this is too big - it takes to long to complete + * this much. + */ + target = 64*1024*2 * min(reshape->before.data_disks, + reshape->after.data_disks); + target /= reshape->backup_blocks; + if (target < 2) + target = 2; + target *= reshape->backup_blocks; + + /* For externally managed metadata we always need to suspend IO to + * the area being reshaped so we regularly push suspend_point forward. 
+ * For native metadata we only need the suspend if we are going to do + * a backup. + */ + if (advancing) { + if ((need_backup > info->reshape_progress || + info->array.major_version < 0) && + *suspend_point < info->reshape_progress + target) { + if (need_backup < *suspend_point + 2 * target) + *suspend_point = need_backup; + else if (*suspend_point + 2 * target < array_size) + *suspend_point += 2 * target; + else + *suspend_point = array_size; + sysfs_set_num(info, NULL, "suspend_hi", *suspend_point); + if (max_progress > *suspend_point) + max_progress = *suspend_point; + } + } else { + if (info->array.major_version >= 0) { + /* Only need to suspend when about to backup */ + if (info->reshape_progress < need_backup * 2 && + *suspend_point > 0) { + *suspend_point = 0; + sysfs_set_num(info, NULL, "suspend_lo", 0); + sysfs_set_num(info, NULL, "suspend_hi", + need_backup); + } + } else { + /* Need to suspend continually */ + if (info->reshape_progress < *suspend_point) + *suspend_point = info->reshape_progress; + if (*suspend_point + target < info->reshape_progress) + /* No need to move suspend region yet */; + else { + if (*suspend_point >= 2 * target) + *suspend_point -= 2 * target; + else + *suspend_point = 0; + sysfs_set_num(info, NULL, "suspend_lo", + *suspend_point); + } + if (max_progress < *suspend_point) + max_progress = *suspend_point; + } + } + + /* now set sync_max to allow that progress. sync_max, like + * sync_completed is a count of sectors written per device, so + * we find the difference between max_progress and the start point, + * and divide that by after.data_disks to get a sync_max + * number. + * At the same time we convert wait_point to a similar number + * for comparing against sync_completed. 
+ */ + /* scale down max_progress to per_disk */ + max_progress /= reshape->after.data_disks; + /* + * Round to chunk size as some kernels give an erroneously + * high number + */ + max_progress /= info->new_chunk/512; + max_progress *= info->new_chunk/512; + /* And round to old chunk size as the kernel wants that */ + max_progress /= info->array.chunk_size/512; + max_progress *= info->array.chunk_size/512; + /* Limit progress to the whole device */ + if (max_progress > info->component_size) + max_progress = info->component_size; + wait_point /= reshape->after.data_disks; + if (!advancing) { + /* switch from 'device offset' to 'processed block count' */ + max_progress = info->component_size - max_progress; + wait_point = info->component_size - wait_point; + } + + if (!*frozen) + sysfs_set_num(info, NULL, "sync_max", max_progress); + + /* Now wait. If we have already reached the point that we were + * asked to wait to, don't wait at all, else wait for any change. + * We need to select on 'sync_completed' as that is the place that + * notifications happen, but we are really interested in + * 'reshape_position' + */ + fd = sysfs_get_fd(info, NULL, "sync_completed"); + if (fd < 0) + goto check_progress; + + if (sysfs_fd_get_ll(fd, &completed) < 0) + goto check_progress; + + while (completed < max_progress && completed < wait_point) { + /* Check that sync_action is still 'reshape' to avoid + * waiting forever on a dead array + */ + char action[20]; + if (sysfs_get_str(info, NULL, "sync_action", action, 20) <= 0 || + strncmp(action, "reshape", 7) != 0) + break; + /* Some kernels reset 'sync_completed' to zero + * before setting 'sync_action' to 'idle'. + * So we need these extra tests. 
+ */ + if (completed == 0 && advancing && + strncmp(action, "idle", 4) == 0 && + info->reshape_progress > 0) + break; + if (completed == 0 && !advancing && + strncmp(action, "idle", 4) == 0 && + info->reshape_progress < + (info->component_size * reshape->after.data_disks)) + break; + sysfs_wait(fd, NULL); + if (sysfs_fd_get_ll(fd, &completed) < 0) + goto check_progress; + } + /* Some kernels reset 'sync_completed' to zero, + * we need to have real point we are in md. + * So in that case, read 'reshape_position' from sysfs. + */ + if (completed == 0) { + unsigned long long reshapep; + char action[20]; + if (sysfs_get_str(info, NULL, "sync_action", action, 20) > 0 && + strncmp(action, "idle", 4) == 0 && + sysfs_get_ll(info, NULL, + "reshape_position", &reshapep) == 0) + *reshape_completed = reshapep; + } else { + /* some kernels can give an incorrectly high + * 'completed' number, so round down */ + completed /= (info->new_chunk/512); + completed *= (info->new_chunk/512); + /* Convert 'completed' back in to a 'progress' number */ + completed *= reshape->after.data_disks; + if (!advancing) + completed = (info->component_size + * reshape->after.data_disks + - completed); + *reshape_completed = completed; + } + + close(fd); + + /* We return the need_backup flag. Caller will decide + * how much - a multiple of ->backup_blocks up to *suspend_point + */ + if (advancing) + return need_backup > info->reshape_progress; + else + return need_backup >= info->reshape_progress; + +check_progress: + /* if we couldn't read a number from sync_completed, then + * either the reshape did complete, or it aborted. + * We can tell which by checking for 'none' in reshape_position. + * If it did abort, then it might immediately restart if it + * it was just a device failure that leaves us degraded but + * functioning. + */ + if (sysfs_get_str(info, NULL, "reshape_position", buf, + sizeof(buf)) < 0 || strncmp(buf, "none", 4) != 0) { + /* The abort might only be temporary. 
Wait up to 10 + * seconds for fd to contain a valid number again. + */ + int wait = 10000; + int rv = -2; + unsigned long long new_sync_max; + while (fd >= 0 && rv < 0 && wait > 0) { + if (sysfs_wait(fd, &wait) != 1) + break; + switch (sysfs_fd_get_ll(fd, &completed)) { + case 0: + /* all good again */ + rv = 1; + /* If "sync_max" is no longer max_progress + * we need to freeze things + */ + sysfs_get_ll(info, NULL, "sync_max", + &new_sync_max); + *frozen = (new_sync_max != max_progress); + break; + case -2: /* read error - abort */ + wait = 0; + break; + } + } + if (fd >= 0) + close(fd); + return rv; /* abort */ + } else { + /* Maybe racing with array shutdown - check state */ + if (fd >= 0) + close(fd); + if (sysfs_get_str(info, NULL, "array_state", buf, + sizeof(buf)) < 0 || + strncmp(buf, "inactive", 8) == 0 || + strncmp(buf, "clear",5) == 0) + return -2; /* abort */ + return -1; /* complete */ + } +} + +/* FIXME return status is never checked */ +static int grow_backup(struct mdinfo *sra, + unsigned long long offset, /* per device */ + unsigned long stripes, /* per device, in old chunks */ + int *sources, unsigned long long *offsets, + int disks, int chunk, int level, int layout, + int dests, int *destfd, unsigned long long *destoffsets, + int part, int *degraded, + char *buf) +{ + /* Backup 'blocks' sectors at 'offset' on each device of the array, + * to storage 'destfd' (offset 'destoffsets'), after first + * suspending IO. Then allow resync to continue + * over the suspended section. + * Use part 'part' of the backup-super-block. 
+ */ + int odata = disks; + int rv = 0; + int i; + unsigned long long ll; + int new_degraded; + //printf("offset %llu\n", offset); + if (level >= 4) + odata--; + if (level == 6) + odata--; + + /* Check that array hasn't become degraded, else we might backup the wrong data */ + if (sysfs_get_ll(sra, NULL, "degraded", &ll) < 0) + return -1; /* FIXME this error is ignored */ + new_degraded = (int)ll; + if (new_degraded != *degraded) { + /* check each device to ensure it is still working */ + struct mdinfo *sd; + for (sd = sra->devs ; sd ; sd = sd->next) { + if (sd->disk.state & (1<<MD_DISK_FAULTY)) + continue; + if (sd->disk.state & (1<<MD_DISK_SYNC)) { + char sbuf[100]; + + if (sysfs_get_str(sra, sd, "state", + sbuf, sizeof(sbuf)) < 0 || + strstr(sbuf, "faulty") || + strstr(sbuf, "in_sync") == NULL) { + /* this device is dead */ + sd->disk.state = (1<<MD_DISK_FAULTY); + if (sd->disk.raid_disk >= 0 && + sources[sd->disk.raid_disk] >= 0) { + close(sources[sd->disk.raid_disk]); + sources[sd->disk.raid_disk] = -1; + } + } + } + } + *degraded = new_degraded; + } + if (part) { + bsb.arraystart2 = __cpu_to_le64(offset * odata); + bsb.length2 = __cpu_to_le64(stripes * (chunk/512) * odata); + } else { + bsb.arraystart = __cpu_to_le64(offset * odata); + bsb.length = __cpu_to_le64(stripes * (chunk/512) * odata); + } + if (part) + bsb.magic[15] = '2'; + for (i = 0; i < dests; i++) + if (part) + lseek64(destfd[i], destoffsets[i] + + __le64_to_cpu(bsb.devstart2)*512, 0); + else + lseek64(destfd[i], destoffsets[i], 0); + + rv = save_stripes(sources, offsets, disks, chunk, level, layout, + dests, destfd, offset * 512 * odata, + stripes * chunk * odata, buf); + + if (rv) + return rv; + bsb.mtime = __cpu_to_le64(time(0)); + for (i = 0; i < dests; i++) { + bsb.devstart = __cpu_to_le64(destoffsets[i]/512); + + bsb.sb_csum = bsb_csum((char*)&bsb, + ((char*)&bsb.sb_csum)-((char*)&bsb)); + if (memcmp(bsb.magic, "md_backup_data-2", 16) == 0) + bsb.sb_csum2 = bsb_csum((char*)&bsb, + 
((char*)&bsb.sb_csum2)-((char*)&bsb)); + + rv = -1; + if ((unsigned long long)lseek64(destfd[i], + destoffsets[i] - 4096, 0) != + destoffsets[i] - 4096) + break; + if (write(destfd[i], &bsb, 512) != 512) + break; + if (destoffsets[i] > 4096) { + if ((unsigned long long)lseek64(destfd[i], destoffsets[i]+stripes*chunk*odata, 0) != + destoffsets[i]+stripes*chunk*odata) + break; + if (write(destfd[i], &bsb, 512) != 512) + break; + } + fsync(destfd[i]); + rv = 0; + } + + return rv; +} + +/* in 2.6.30, the value reported by sync_completed can be + * less than it should be by one stripe. + * This only happens when reshape hits sync_max and pauses. + * So allow wait_backup to either extend sync_max further + * than strictly necessary, or return before the + * sync has got quite as far as we would really like. + * This is what 'blocks2' is for. + * The various callers give appropriate values so that + * everything works. + */ +/* FIXME return value is often ignored */ +static int forget_backup(int dests, int *destfd, + unsigned long long *destoffsets, + int part) +{ + /* + * Erase backup 'part' (which is 0 or 1) + */ + int i; + int rv; + + if (part) { + bsb.arraystart2 = __cpu_to_le64(0); + bsb.length2 = __cpu_to_le64(0); + } else { + bsb.arraystart = __cpu_to_le64(0); + bsb.length = __cpu_to_le64(0); + } + bsb.mtime = __cpu_to_le64(time(0)); + rv = 0; + for (i = 0; i < dests; i++) { + bsb.devstart = __cpu_to_le64(destoffsets[i]/512); + bsb.sb_csum = bsb_csum((char*)&bsb, + ((char*)&bsb.sb_csum)-((char*)&bsb)); + if (memcmp(bsb.magic, "md_backup_data-2", 16) == 0) + bsb.sb_csum2 = bsb_csum((char*)&bsb, + ((char*)&bsb.sb_csum2)-((char*)&bsb)); + if ((unsigned long long)lseek64(destfd[i], destoffsets[i]-4096, 0) != + destoffsets[i]-4096) + rv = -1; + if (rv == 0 && write(destfd[i], &bsb, 512) != 512) + rv = -1; + fsync(destfd[i]); + } + return rv; +} + +static void fail(char *msg) +{ + int rv; + rv = (write(2, msg, strlen(msg)) != (int)strlen(msg)); + rv |= (write(2, "\n", 1) != 
1); + exit(rv ? 1 : 2); +} + +static char *abuf, *bbuf; +static unsigned long long abuflen; +static void validate(int afd, int bfd, unsigned long long offset) +{ + /* check that the data in the backup against the array. + * This is only used for regression testing and should not + * be used while the array is active + */ + if (afd < 0) + return; + lseek64(bfd, offset - 4096, 0); + if (read(bfd, &bsb2, 512) != 512) + fail("cannot read bsb"); + if (bsb2.sb_csum != bsb_csum((char*)&bsb2, + ((char*)&bsb2.sb_csum)-((char*)&bsb2))) + fail("first csum bad"); + if (memcmp(bsb2.magic, "md_backup_data", 14) != 0) + fail("magic is bad"); + if (memcmp(bsb2.magic, "md_backup_data-2", 16) == 0 && + bsb2.sb_csum2 != bsb_csum((char*)&bsb2, + ((char*)&bsb2.sb_csum2)-((char*)&bsb2))) + fail("second csum bad"); + + if (__le64_to_cpu(bsb2.devstart)*512 != offset) + fail("devstart is wrong"); + + if (bsb2.length) { + unsigned long long len = __le64_to_cpu(bsb2.length)*512; + + if (abuflen < len) { + free(abuf); + free(bbuf); + abuflen = len; + if (posix_memalign((void**)&abuf, 4096, abuflen) || + posix_memalign((void**)&bbuf, 4096, abuflen)) { + abuflen = 0; + /* just stop validating on mem-alloc failure */ + return; + } + } + + lseek64(bfd, offset, 0); + if ((unsigned long long)read(bfd, bbuf, len) != len) { + //printf("len %llu\n", len); + fail("read first backup failed"); + } + lseek64(afd, __le64_to_cpu(bsb2.arraystart)*512, 0); + if ((unsigned long long)read(afd, abuf, len) != len) + fail("read first from array failed"); + if (memcmp(bbuf, abuf, len) != 0) { +#if 0 + int i; + printf("offset=%llu len=%llu\n", + (unsigned long long)__le64_to_cpu(bsb2.arraystart)*512, len); + for (i=0; i<len; i++) + if (bbuf[i] != abuf[i]) { + printf("first diff byte %d\n", i); + break; + } +#endif + fail("data1 compare failed"); + } + } + if (bsb2.length2) { + unsigned long long len = __le64_to_cpu(bsb2.length2)*512; + + if (abuflen < len) { + free(abuf); + free(bbuf); + abuflen = len; + abuf = 
xmalloc(abuflen); + bbuf = xmalloc(abuflen); + } + + lseek64(bfd, offset+__le64_to_cpu(bsb2.devstart2)*512, 0); + if ((unsigned long long)read(bfd, bbuf, len) != len) + fail("read second backup failed"); + lseek64(afd, __le64_to_cpu(bsb2.arraystart2)*512, 0); + if ((unsigned long long)read(afd, abuf, len) != len) + fail("read second from array failed"); + if (memcmp(bbuf, abuf, len) != 0) + fail("data2 compare failed"); + } +} + +int child_monitor(int afd, struct mdinfo *sra, struct reshape *reshape, + struct supertype *st, unsigned long blocks, + int *fds, unsigned long long *offsets, + int dests, int *destfd, unsigned long long *destoffsets) +{ + /* Monitor a reshape where backup is being performed using + * 'native' mechanism - either to a backup file, or + * to some space in a spare. + */ + char *buf; + int degraded = -1; + unsigned long long speed; + unsigned long long suspend_point, array_size; + unsigned long long backup_point, wait_point; + unsigned long long reshape_completed; + int done = 0; + int increasing = reshape->after.data_disks >= + reshape->before.data_disks; + int part = 0; /* The next part of the backup area to fill. It + * may already be full, so we need to check */ + int level = reshape->level; + int layout = reshape->before.layout; + int data = reshape->before.data_disks; + int disks = reshape->before.data_disks + reshape->parity; + int chunk = sra->array.chunk_size; + struct mdinfo *sd; + unsigned long stripes; + int uuid[4]; + int frozen = 0; + + /* set up the backup-super-block. This requires the + * uuid from the array. 
+ */ + /* Find a superblock */ + for (sd = sra->devs; sd; sd = sd->next) { + char *dn; + int devfd; + int ok; + if (sd->disk.state & (1<<MD_DISK_FAULTY)) + continue; + dn = map_dev(sd->disk.major, sd->disk.minor, 1); + devfd = dev_open(dn, O_RDONLY); + if (devfd < 0) + continue; + ok = st->ss->load_super(st, devfd, NULL); + close(devfd); + if (ok == 0) + break; + } + if (!sd) { + pr_err("Cannot find a superblock\n"); + return 0; + } + + memset(&bsb, 0, 512); + memcpy(bsb.magic, "md_backup_data-1", 16); + st->ss->uuid_from_super(st, uuid); + memcpy(bsb.set_uuid, uuid, 16); + bsb.mtime = __cpu_to_le64(time(0)); + bsb.devstart2 = blocks; + + stripes = blocks / (sra->array.chunk_size/512) / + reshape->before.data_disks; + + if (posix_memalign((void**)&buf, 4096, disks * chunk)) + /* Don't start the 'reshape' */ + return 0; + if (reshape->before.data_disks == reshape->after.data_disks) { + sysfs_get_ll(sra, NULL, "sync_speed_min", &speed); + sysfs_set_num(sra, NULL, "sync_speed_min", 200000); + } + + if (increasing) { + array_size = sra->component_size * reshape->after.data_disks; + backup_point = sra->reshape_progress; + suspend_point = 0; + } else { + array_size = sra->component_size * reshape->before.data_disks; + backup_point = reshape->backup_blocks; + suspend_point = array_size; + } + + while (!done) { + int rv; + + /* Want to return as soon the oldest backup slot can + * be released as that allows us to start backing up + * some more, providing suspend_point has been + * advanced, which it should have. 
+ */ + if (increasing) { + wait_point = array_size; + if (part == 0 && __le64_to_cpu(bsb.length) > 0) + wait_point = (__le64_to_cpu(bsb.arraystart) + + __le64_to_cpu(bsb.length)); + if (part == 1 && __le64_to_cpu(bsb.length2) > 0) + wait_point = (__le64_to_cpu(bsb.arraystart2) + + __le64_to_cpu(bsb.length2)); + } else { + wait_point = 0; + if (part == 0 && __le64_to_cpu(bsb.length) > 0) + wait_point = __le64_to_cpu(bsb.arraystart); + if (part == 1 && __le64_to_cpu(bsb.length2) > 0) + wait_point = __le64_to_cpu(bsb.arraystart2); + } + + reshape_completed = sra->reshape_progress; + rv = progress_reshape(sra, reshape, + backup_point, wait_point, + &suspend_point, &reshape_completed, + &frozen); + /* external metadata would need to ping_monitor here */ + sra->reshape_progress = reshape_completed; + + /* Clear any backup region that is before 'here' */ + if (increasing) { + if (__le64_to_cpu(bsb.length) > 0 && + reshape_completed >= (__le64_to_cpu(bsb.arraystart) + + __le64_to_cpu(bsb.length))) + forget_backup(dests, destfd, + destoffsets, 0); + if (__le64_to_cpu(bsb.length2) > 0 && + reshape_completed >= (__le64_to_cpu(bsb.arraystart2) + + __le64_to_cpu(bsb.length2))) + forget_backup(dests, destfd, + destoffsets, 1); + } else { + if (__le64_to_cpu(bsb.length) > 0 && + reshape_completed <= (__le64_to_cpu(bsb.arraystart))) + forget_backup(dests, destfd, + destoffsets, 0); + if (__le64_to_cpu(bsb.length2) > 0 && + reshape_completed <= (__le64_to_cpu(bsb.arraystart2))) + forget_backup(dests, destfd, + destoffsets, 1); + } + if (sigterm) + rv = -2; + if (rv < 0) { + if (rv == -1) + done = 1; + break; + } + if (rv == 0 && increasing && !st->ss->external) { + /* No longer need to monitor this reshape */ + sysfs_set_str(sra, NULL, "sync_max", "max"); + done = 1; + break; + } + + while (rv) { + unsigned long long offset; + unsigned long actual_stripes; + /* Need to backup some data. 
+ * If 'part' is not used and the desired + * backup size is suspended, do a backup, + * then consider the next part. + */ + /* Check that 'part' is unused */ + if (part == 0 && __le64_to_cpu(bsb.length) != 0) + break; + if (part == 1 && __le64_to_cpu(bsb.length2) != 0) + break; + + offset = backup_point / data; + actual_stripes = stripes; + if (increasing) { + if (offset + actual_stripes * (chunk/512) > + sra->component_size) + actual_stripes = ((sra->component_size - offset) + / (chunk/512)); + if (offset + actual_stripes * (chunk/512) > + suspend_point/data) + break; + } else { + if (offset < actual_stripes * (chunk/512)) + actual_stripes = offset / (chunk/512); + offset -= actual_stripes * (chunk/512); + if (offset < suspend_point/data) + break; + } + if (actual_stripes == 0) + break; + grow_backup(sra, offset, actual_stripes, fds, offsets, + disks, chunk, level, layout, dests, destfd, + destoffsets, part, &degraded, buf); + validate(afd, destfd[0], destoffsets[0]); + /* record where 'part' is up to */ + part = !part; + if (increasing) + backup_point += actual_stripes * (chunk/512) * data; + else + backup_point -= actual_stripes * (chunk/512) * data; + } + } + + /* FIXME maybe call progress_reshape one more time instead */ + /* remove any remaining suspension */ + sysfs_set_num(sra, NULL, "suspend_lo", 0x7FFFFFFFFFFFFFFFULL); + sysfs_set_num(sra, NULL, "suspend_hi", 0); + sysfs_set_num(sra, NULL, "suspend_lo", 0); + sysfs_set_num(sra, NULL, "sync_min", 0); + + if (reshape->before.data_disks == reshape->after.data_disks) + sysfs_set_num(sra, NULL, "sync_speed_min", speed); + free(buf); + return done; +} + +/* + * If any spare contains md_backup_data-1 which is recent wrt mtime, + * write that data into the array and update the super blocks with + * the new reshape_progress + */ +int Grow_restart(struct supertype *st, struct mdinfo *info, int *fdlist, + int cnt, char *backup_file, int verbose) +{ + int i, j; + int old_disks; + unsigned long long *offsets; + unsigned 
long long nstripe, ostripe; + int ndata, odata; + + odata = info->array.raid_disks - info->delta_disks - 1; + if (info->array.level == 6) + odata--; /* number of data disks */ + ndata = info->array.raid_disks - 1; + if (info->new_level == 6) + ndata--; + + old_disks = info->array.raid_disks - info->delta_disks; + + if (info->delta_disks <= 0) + /* Didn't grow, so the backup file must have + * been used + */ + old_disks = cnt; + for (i=old_disks-(backup_file?1:0); i<cnt; i++) { + struct mdinfo dinfo; + int fd; + int bsbsize; + char *devname, namebuf[20]; + unsigned long long lo, hi; + + /* This was a spare and may have some saved data on it. + * Load the superblock, find and load the + * backup_super_block. + * If either fail, go on to next device. + * If the backup contains no new info, just return + * else restore data and update all superblocks + */ + if (i == old_disks-1) { + fd = open(backup_file, O_RDONLY); + if (fd<0) { + pr_err("backup file %s inaccessible: %s\n", + backup_file, strerror(errno)); + continue; + } + devname = backup_file; + } else { + fd = fdlist[i]; + if (fd < 0) + continue; + if (st->ss->load_super(st, fd, NULL)) + continue; + + st->ss->getinfo_super(st, &dinfo, NULL); + st->ss->free_super(st); + + if (lseek64(fd, + (dinfo.data_offset + dinfo.component_size - 8) <<9, + 0) < 0) { + pr_err("Cannot seek on device %d\n", i); + continue; /* Cannot seek */ + } + sprintf(namebuf, "device-%d", i); + devname = namebuf; + } + if (read(fd, &bsb, sizeof(bsb)) != sizeof(bsb)) { + if (verbose) + pr_err("Cannot read from %s\n", devname); + continue; /* Cannot read */ + } + if (memcmp(bsb.magic, "md_backup_data-1", 16) != 0 && + memcmp(bsb.magic, "md_backup_data-2", 16) != 0) { + if (verbose) + pr_err("No backup metadata on %s\n", devname); + continue; + } + if (bsb.sb_csum != bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum)-((char*)&bsb))) { + if (verbose) + pr_err("Bad backup-metadata checksum on %s\n", + devname); + continue; /* bad checksum */ + } + if 
(memcmp(bsb.magic, "md_backup_data-2", 16) == 0 && + bsb.sb_csum2 != bsb_csum((char*)&bsb, ((char*)&bsb.sb_csum2)-((char*)&bsb))) { + if (verbose) + pr_err("Bad backup-metadata checksum2 on %s\n", + devname); + continue; /* Bad second checksum */ + } + if (memcmp(bsb.set_uuid,info->uuid, 16) != 0) { + if (verbose) + pr_err("Wrong uuid on backup-metadata on %s\n", + devname); + continue; /* Wrong uuid */ + } + + /* + * array utime and backup-mtime should be updated at + * much the same time, but it seems that sometimes + * they aren't... So allow considerable flexability in + * matching, and allow this test to be overridden by + * an environment variable. + */ + if(time_after(info->array.utime, (unsigned int)__le64_to_cpu(bsb.mtime) + 2*60*60) || + time_before(info->array.utime, (unsigned int)__le64_to_cpu(bsb.mtime) - 10*60)) { + if (check_env("MDADM_GROW_ALLOW_OLD")) { + pr_err("accepting backup with timestamp %lu for array with timestamp %lu\n", + (unsigned long)__le64_to_cpu(bsb.mtime), + (unsigned long)info->array.utime); + } else { + pr_err("too-old timestamp on backup-metadata on %s\n", devname); + pr_err("If you think it is should be safe, try 'export MDADM_GROW_ALLOW_OLD=1'\n"); + continue; /* time stamp is too bad */ + } + } + + if (bsb.magic[15] == '1') { + if (bsb.length == 0) + continue; + if (info->delta_disks >= 0) { + /* reshape_progress is increasing */ + if (__le64_to_cpu(bsb.arraystart) + + __le64_to_cpu(bsb.length) + < info->reshape_progress) { + nonew: + if (verbose) + pr_err("backup-metadata found on %s but is not needed\n", devname); + continue; /* No new data here */ + } + } else { + /* reshape_progress is decreasing */ + if (__le64_to_cpu(bsb.arraystart) >= + info->reshape_progress) + goto nonew; /* No new data here */ + } + } else { + if (bsb.length == 0 && bsb.length2 == 0) + continue; + if (info->delta_disks >= 0) { + /* reshape_progress is increasing */ + if ((__le64_to_cpu(bsb.arraystart) + + __le64_to_cpu(bsb.length) + < 
info->reshape_progress) && + (__le64_to_cpu(bsb.arraystart2) + + __le64_to_cpu(bsb.length2) + < info->reshape_progress)) + goto nonew; /* No new data here */ + } else { + /* reshape_progress is decreasing */ + if (__le64_to_cpu(bsb.arraystart) >= + info->reshape_progress && + __le64_to_cpu(bsb.arraystart2) >= + info->reshape_progress) + goto nonew; /* No new data here */ + } + } + if (lseek64(fd, __le64_to_cpu(bsb.devstart)*512, 0)< 0) { + second_fail: + if (verbose) + pr_err("Failed to verify secondary backup-metadata block on %s\n", + devname); + continue; /* Cannot seek */ + } + /* There should be a duplicate backup superblock 4k before here */ + if (lseek64(fd, -4096, 1) < 0 || + read(fd, &bsb2, sizeof(bsb2)) != sizeof(bsb2)) + goto second_fail; /* Cannot find leading superblock */ + if (bsb.magic[15] == '1') + bsbsize = offsetof(struct mdp_backup_super, pad1); + else + bsbsize = offsetof(struct mdp_backup_super, pad); + if (memcmp(&bsb2, &bsb, bsbsize) != 0) + goto second_fail; /* Cannot find leading superblock */ + + /* Now need the data offsets for all devices. 
*/ + offsets = xmalloc(sizeof(*offsets)*info->array.raid_disks); + for(j=0; j<info->array.raid_disks; j++) { + if (fdlist[j] < 0) + continue; + if (st->ss->load_super(st, fdlist[j], NULL)) + /* FIXME should be this be an error */ + continue; + st->ss->getinfo_super(st, &dinfo, NULL); + st->ss->free_super(st); + offsets[j] = dinfo.data_offset * 512; + } + printf("%s: restoring critical section\n", Name); + + if (restore_stripes(fdlist, offsets, info->array.raid_disks, + info->new_chunk, info->new_level, + info->new_layout, fd, + __le64_to_cpu(bsb.devstart)*512, + __le64_to_cpu(bsb.arraystart)*512, + __le64_to_cpu(bsb.length)*512, NULL)) { + /* didn't succeed, so giveup */ + if (verbose) + pr_err("Error restoring backup from %s\n", + devname); + free(offsets); + return 1; + } + + if (bsb.magic[15] == '2' && + restore_stripes(fdlist, offsets, info->array.raid_disks, + info->new_chunk, info->new_level, + info->new_layout, fd, + __le64_to_cpu(bsb.devstart)*512 + + __le64_to_cpu(bsb.devstart2)*512, + __le64_to_cpu(bsb.arraystart2)*512, + __le64_to_cpu(bsb.length2)*512, NULL)) { + /* didn't succeed, so giveup */ + if (verbose) + pr_err("Error restoring second backup from %s\n", + devname); + free(offsets); + return 1; + } + + free(offsets); + + /* Ok, so the data is restored. Let's update those superblocks. 
*/ + + lo = hi = 0; + if (bsb.length) { + lo = __le64_to_cpu(bsb.arraystart); + hi = lo + __le64_to_cpu(bsb.length); + } + if (bsb.magic[15] == '2' && bsb.length2) { + unsigned long long lo1, hi1; + lo1 = __le64_to_cpu(bsb.arraystart2); + hi1 = lo1 + __le64_to_cpu(bsb.length2); + if (lo == hi) { + lo = lo1; + hi = hi1; + } else if (lo < lo1) + hi = hi1; + else + lo = lo1; + } + if (lo < hi && (info->reshape_progress < lo || + info->reshape_progress > hi)) + /* backup does not affect reshape_progress*/ ; + else if (info->delta_disks >= 0) { + info->reshape_progress = __le64_to_cpu(bsb.arraystart) + + __le64_to_cpu(bsb.length); + if (bsb.magic[15] == '2') { + unsigned long long p2; + + p2 = __le64_to_cpu(bsb.arraystart2) + + __le64_to_cpu(bsb.length2); + if (p2 > info->reshape_progress) + info->reshape_progress = p2; + } + } else { + info->reshape_progress = __le64_to_cpu(bsb.arraystart); + if (bsb.magic[15] == '2') { + unsigned long long p2; + + p2 = __le64_to_cpu(bsb.arraystart2); + if (p2 < info->reshape_progress) + info->reshape_progress = p2; + } + } + for (j=0; j<info->array.raid_disks; j++) { + if (fdlist[j] < 0) + continue; + if (st->ss->load_super(st, fdlist[j], NULL)) + continue; + st->ss->getinfo_super(st, &dinfo, NULL); + dinfo.reshape_progress = info->reshape_progress; + st->ss->update_super(st, &dinfo, "_reshape_progress", + NULL,0, 0, NULL); + st->ss->store_super(st, fdlist[j]); + st->ss->free_super(st); + } + return 0; + } + /* Didn't find any backup data, try to see if any + * was needed. + */ + if (info->delta_disks < 0) { + /* When shrinking, the critical section is at the end. + * So see if we are before the critical section. 
+ */ + unsigned long long first_block; + nstripe = ostripe = 0; + first_block = 0; + while (ostripe >= nstripe) { + ostripe += info->array.chunk_size / 512; + first_block = ostripe * odata; + nstripe = first_block / ndata / (info->new_chunk/512) * + (info->new_chunk/512); + } + + if (info->reshape_progress >= first_block) + return 0; + } + if (info->delta_disks > 0) { + /* See if we are beyond the critical section. */ + unsigned long long last_block; + nstripe = ostripe = 0; + last_block = 0; + while (nstripe >= ostripe) { + nstripe += info->new_chunk / 512; + last_block = nstripe * ndata; + ostripe = last_block / odata / (info->array.chunk_size/512) * + (info->array.chunk_size/512); + } + + if (info->reshape_progress >= last_block) + return 0; + } + /* needed to recover critical section! */ + if (verbose) + pr_err("Failed to find backup of critical section\n"); + return 1; +} + +int Grow_continue_command(char *devname, int fd, + char *backup_file, int verbose) +{ + int ret_val = 0; + struct supertype *st = NULL; + struct mdinfo *content = NULL; + struct mdinfo array; + char *subarray = NULL; + struct mdinfo *cc = NULL; + struct mdstat_ent *mdstat = NULL; + int cfd = -1; + int fd2; + + dprintf("Grow continue from command line called for %s\n", devname); + + st = super_by_fd(fd, &subarray); + if (!st || !st->ss) { + pr_err("Unable to determine metadata format for %s\n", devname); + return 1; + } + dprintf("Grow continue is run for "); + if (st->ss->external == 0) { + int d; + int cnt = 5; + dprintf_cont("native array (%s)\n", devname); + if (md_get_array_info(fd, &array.array) < 0) { + pr_err("%s is not an active md array - aborting\n", + devname); + ret_val = 1; + goto Grow_continue_command_exit; + } + content = &array; + sysfs_init(content, fd, NULL); + /* Need to load a superblock. 
+ * FIXME we should really get what we need from + * sysfs + */ + do { + for (d = 0; d < MAX_DISKS; d++) { + mdu_disk_info_t disk; + char *dv; + int err; + disk.number = d; + if (md_get_disk_info(fd, &disk) < 0) + continue; + if (disk.major == 0 && disk.minor == 0) + continue; + if ((disk.state & (1 << MD_DISK_ACTIVE)) == 0) + continue; + dv = map_dev(disk.major, disk.minor, 1); + if (!dv) + continue; + fd2 = dev_open(dv, O_RDONLY); + if (fd2 < 0) + continue; + err = st->ss->load_super(st, fd2, NULL); + close(fd2); + if (err) + continue; + break; + } + if (d == MAX_DISKS) { + pr_err("Unable to load metadata for %s\n", + devname); + ret_val = 1; + goto Grow_continue_command_exit; + } + st->ss->getinfo_super(st, content, NULL); + if (!content->reshape_active) + sleep(3); + else + break; + } while (cnt-- > 0); + } else { + char *container; + + if (subarray) { + dprintf_cont("subarray (%s)\n", subarray); + container = st->container_devnm; + cfd = open_dev_excl(st->container_devnm); + } else { + container = st->devnm; + close(fd); + cfd = open_dev_excl(st->devnm); + dprintf_cont("container (%s)\n", container); + fd = cfd; + } + if (cfd < 0) { + pr_err("Unable to open container for %s\n", devname); + ret_val = 1; + goto Grow_continue_command_exit; + } + + /* find in container array under reshape + */ + ret_val = st->ss->load_container(st, cfd, NULL); + if (ret_val) { + pr_err("Cannot read superblock for %s\n", devname); + ret_val = 1; + goto Grow_continue_command_exit; + } + + cc = st->ss->container_content(st, subarray); + for (content = cc; content ; content = content->next) { + char *array_name; + int allow_reshape = 1; + + if (content->reshape_active == 0) + continue; + /* The decision about array or container wide + * reshape is taken in Grow_continue based + * content->reshape_active state, therefore we + * need to check_reshape based on + * reshape_active and subarray name + */ + if (content->array.state & (1<<MD_SB_BLOCK_VOLUME)) + allow_reshape = 0; + if 
(content->reshape_active == CONTAINER_RESHAPE && + (content->array.state + & (1<<MD_SB_BLOCK_CONTAINER_RESHAPE))) + allow_reshape = 0; + + if (!allow_reshape) { + pr_err("cannot continue reshape of an array in container with unsupported metadata: %s(%s)\n", + devname, container); + ret_val = 1; + goto Grow_continue_command_exit; + } + + array_name = strchr(content->text_version+1, '/')+1; + mdstat = mdstat_by_subdev(array_name, container); + if (!mdstat) + continue; + if (mdstat->active == 0) { + pr_err("Skipping inactive array %s.\n", + mdstat->devnm); + free_mdstat(mdstat); + mdstat = NULL; + continue; + } + break; + } + if (!content) { + pr_err("Unable to determine reshaped array for %s\n", devname); + ret_val = 1; + goto Grow_continue_command_exit; + } + fd2 = open_dev(mdstat->devnm); + if (fd2 < 0) { + pr_err("cannot open (%s)\n", mdstat->devnm); + ret_val = 1; + goto Grow_continue_command_exit; + } + + if (sysfs_init(content, fd2, mdstat->devnm)) { + pr_err("Unable to initialize sysfs for %s, Grow cannot continue.\n", + mdstat->devnm); + ret_val = 1; + close(fd2); + goto Grow_continue_command_exit; + } + + close(fd2); + + /* start mdmon in case it is not running + */ + if (!mdmon_running(container)) + start_mdmon(container); + ping_monitor(container); + + if (mdmon_running(container)) + st->update_tail = &st->updates; + else { + pr_err("No mdmon found. 
Grow cannot continue.\n"); + ret_val = 1; + goto Grow_continue_command_exit; + } + } + + /* verify that array under reshape is started from + * correct position + */ + if (verify_reshape_position(content, content->array.level) < 0) { + ret_val = 1; + goto Grow_continue_command_exit; + } + + /* continue reshape + */ + ret_val = Grow_continue(fd, st, content, backup_file, 1, 0); + +Grow_continue_command_exit: + if (cfd > -1) + close(cfd); + st->ss->free_super(st); + free_mdstat(mdstat); + sysfs_free(cc); + free(subarray); + + return ret_val; +} + +int Grow_continue(int mdfd, struct supertype *st, struct mdinfo *info, + char *backup_file, int forked, int freeze_reshape) +{ + int ret_val = 2; + + if (!info->reshape_active) + return ret_val; + + if (st->ss->external) { + int cfd = open_dev(st->container_devnm); + + if (cfd < 0) + return 1; + + st->ss->load_container(st, cfd, st->container_devnm); + close(cfd); + ret_val = reshape_container(st->container_devnm, NULL, mdfd, + st, info, 0, backup_file, 0, + forked, 1 | info->reshape_active, + freeze_reshape); + } else + ret_val = reshape_array(NULL, mdfd, "array", st, info, 1, + NULL, INVALID_SECTORS, backup_file, + 0, forked, 1 | info->reshape_active, + freeze_reshape); + + return ret_val; +} + +char *make_backup(char *name) +{ + char *base = "backup_file-"; + int len; + char *fname; + + len = strlen(MAP_DIR) + 1 + strlen(base) + strlen(name)+1; + fname = xmalloc(len); + sprintf(fname, "%s/%s%s", MAP_DIR, base, name); + return fname; +} + +char *locate_backup(char *name) +{ + char *fl = make_backup(name); + struct stat stb; + + if (stat(fl, &stb) == 0 && S_ISREG(stb.st_mode)) + return fl; + + free(fl); + return NULL; +} @@ -0,0 +1,13 @@ + +To build mdadm, simply run: + + make + +to install, run + + make install + +as root. + + +No configuration is necessary. 
diff --git a/Incremental.c b/Incremental.c new file mode 100644 index 0000000..a57fc32 --- /dev/null +++ b/Incremental.c @@ -0,0 +1,1764 @@ +/* + * Incremental.c - support --incremental. Part of: + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2006-2013 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + * Paper: Neil Brown + * Novell Inc + * GPO Box Q1283 + * QVB Post Office, NSW 1230 + * Australia + */ + +#include "mdadm.h" +#include <sys/wait.h> +#include <dirent.h> +#include <ctype.h> + +static int count_active(struct supertype *st, struct mdinfo *sra, + int mdfd, char **availp, + struct mdinfo *info); +static void find_reject(int mdfd, struct supertype *st, struct mdinfo *sra, + int number, __u64 events, int verbose, + char *array_name); +static int try_spare(char *devname, int *dfdp, struct dev_policy *pol, + struct map_ent *target, + struct supertype *st, int verbose); + +static int Incremental_container(struct supertype *st, char *devname, + struct context *c, char *only); + +int Incremental(struct mddev_dev *devlist, struct context *c, + struct supertype *st) +{ + /* Add this device to an array, creating the array if necessary + * and starting the array if sensible or - if 
runstop>0 - if possible. + * + * This has several steps: + * + * 1/ Check if device is permitted by mdadm.conf, reject if not. + * 2/ Find metadata, reject if none appropriate (check + * version/name from args) + * 3/ Check if there is a match in mdadm.conf + * 3a/ if not, check for homehost match. If no match, assemble as + * a 'foreign' array. + * 4/ Determine device number. + * - If in mdadm.conf with std name, use that + * - UUID in /var/run/mdadm.map use that + * - If name is suggestive, use that. unless in use with different uuid. + * - Choose a free, high number. + * - Use a partitioned device unless strong suggestion not to. + * e.g. auto=md + * Don't choose partitioned for containers. + * 5/ Find out if array already exists + * 5a/ if it does not + * - choose a name, from mdadm.conf or 'name' field in array. + * - create the array + * - add the device + * 5b/ if it does + * - check one drive in array to make sure metadata is a reasonably + * close match. Reject if not (e.g. different type) + * - add the device + * 6/ Make sure /var/run/mdadm.map contains this array. + * 7/ Is there enough devices to possibly start the array? + * For a container, this means running Incremental_container. + * 7a/ if not, finish with success. + * 7b/ if yes, + * - read all metadata and arrange devices like -A does + * - if number of OK devices match expected, or -R and there are enough, + * start the array (auto-readonly). 
+ */ + dev_t rdev, rdev2; + struct mdinfo info, dinfo; + struct mdinfo *sra = NULL, *d; + struct mddev_ident *match; + char chosen_name[1024]; + char *md_devname; + int rv = 1; + struct map_ent *mp, *map = NULL; + int dfd = -1, mdfd = -1; + char *avail = NULL; + int active_disks; + int trustworthy; + char *name_to_use; + struct dev_policy *policy = NULL; + struct map_ent target_array; + int have_target; + char *devname = devlist->devname; + int journal_device_missing = 0; + + struct createinfo *ci = conf_get_create_info(); + + if (!stat_is_blkdev(devname, &rdev)) + return rv; + dfd = dev_open(devname, O_RDONLY); + if (dfd < 0) { + if (c->verbose >= 0) + pr_err("cannot open %s: %s.\n", + devname, strerror(errno)); + return rv; + } + /* If the device is a container, we do something very different */ + if (must_be_container(dfd)) { + if (!st) + st = super_by_fd(dfd, NULL); + if (st && st->ss->load_container) + rv = st->ss->load_container(st, dfd, NULL); + + close(dfd); + if (!rv && st->ss->container_content) { + if (map_lock(&map)) + pr_err("failed to get exclusive lock on mapfile\n"); + if (c->export) + printf("MD_DEVNAME=%s\n", devname); + rv = Incremental_container(st, devname, c, NULL); + map_unlock(&map); + return rv; + } + + pr_err("%s is not part of an md array.\n", + devname); + return rv; + } + + /* 1/ Check if device is permitted by mdadm.conf */ + + for (;devlist; devlist = devlist->next) + if (conf_test_dev(devlist->devname)) + break; + if (!devlist) { + devlist = conf_get_devs(); + for (;devlist; devlist = devlist->next) { + if (stat_is_blkdev(devlist->devname, &rdev2) && + rdev2 == rdev) + break; + } + } + if (!devlist) { + if (c->verbose >= 0) + pr_err("%s not permitted by mdadm.conf.\n", + devname); + goto out; + } + + /* 2/ Find metadata, reject if none appropriate (check + * version/name from args) */ + + if (!fstat_is_blkdev(dfd, devname, &rdev)) + goto out; + + dinfo.disk.major = major(rdev); + dinfo.disk.minor = minor(rdev); + + policy = 
disk_policy(&dinfo); + have_target = policy_check_path(&dinfo, &target_array); + + if (st == NULL && (st = guess_super_type(dfd, guess_array)) == NULL) { + if (c->verbose >= 0) + pr_err("no recognisable superblock on %s.\n", + devname); + rv = try_spare(devname, &dfd, policy, + have_target ? &target_array : NULL, + NULL, c->verbose); + goto out; + } + st->ignore_hw_compat = 0; + + if (st->ss->compare_super == NULL || + st->ss->load_super(st, dfd, c->verbose >= 0 ? devname : NULL)) { + if (c->verbose >= 0) + pr_err("no RAID superblock on %s.\n", + devname); + rv = try_spare(devname, &dfd, policy, + have_target ? &target_array : NULL, + st, c->verbose); + free(st); + goto out; + } + close (dfd); dfd = -1; + + st->ss->getinfo_super(st, &info, NULL); + + /* 3/ Check if there is a match in mdadm.conf */ + match = conf_match(st, &info, devname, c->verbose, &rv); + if (!match && rv == 2) + goto out; + + if (match && match->devname && + strcasecmp(match->devname, "<ignore>") == 0) { + if (c->verbose >= 0) + pr_err("array containing %s is explicitly ignored by mdadm.conf\n", + devname); + goto out; + } + + /* 3a/ if not, check for homehost match. If no match, continue + * but don't trust the 'name' in the array. Thus a 'random' minor + * number will be assigned, and the device name will be based + * on that. */ + if (match) + trustworthy = LOCAL; + else if (st->ss->match_home(st, c->homehost) == 1) + trustworthy = LOCAL; + else if (st->ss->match_home(st, "any") == 1) + trustworthy = LOCAL_ANY; + else + trustworthy = FOREIGN; + + if (!match && !conf_test_metadata(st->ss->name, policy, + (trustworthy == LOCAL))) { + if (c->verbose >= 1) + pr_err("%s has metadata type %s for which auto-assembly is disabled\n", + devname, st->ss->name); + goto out; + } + if (trustworthy == LOCAL_ANY) + trustworthy = LOCAL; + + /* There are three possible sources for 'autof': command line, + * ARRAY line in mdadm.conf, or CREATE line in mdadm.conf. 
+ * ARRAY takes precedence, then command line, then + * CREATE. + */ + if (match && match->autof) + c->autof = match->autof; + if (c->autof == 0) + c->autof = ci->autof; + + name_to_use = info.name; + if (name_to_use[0] == 0 && info.array.level == LEVEL_CONTAINER) { + name_to_use = info.text_version; + trustworthy = METADATA; + } + if (name_to_use[0] && trustworthy != LOCAL && + ! c->require_homehost && + conf_name_is_free(name_to_use)) + trustworthy = LOCAL; + + /* strip "hostname:" prefix from name if we have decided + * to treat it as LOCAL + */ + if (trustworthy == LOCAL && strchr(name_to_use, ':') != NULL) + name_to_use = strchr(name_to_use, ':')+1; + + /* 4/ Check if array exists. + */ + if (map_lock(&map)) + pr_err("failed to get exclusive lock on mapfile\n"); + /* Now check we can get O_EXCL. If not, probably "mdadm -A" has + * taken over + */ + dfd = dev_open(devname, O_RDONLY|O_EXCL); + if (dfd < 0) { + if (c->verbose >= 0) + pr_err("cannot reopen %s: %s.\n", + devname, strerror(errno)); + goto out_unlock; + } + /* Cannot hold it open while we add the device to the array, + * so we must release the O_EXCL and depend on the map_lock() + * So now is the best time to remove any partitions. + */ + remove_partitions(dfd); + close(dfd); + dfd = -1; + + mp = map_by_uuid(&map, info.uuid); + if (mp) + mdfd = open_dev(mp->devnm); + else + mdfd = -1; + + if (mdfd < 0) { + + /* Skip the clustered ones. This should be started by + * clustering resource agents + */ + if (info.array.state & (1 << MD_SB_CLUSTERED)) + goto out; + + /* Couldn't find an existing array, maybe make a new one */ + mdfd = create_mddev(match ? 
match->devname : NULL, + name_to_use, c->autof, trustworthy, chosen_name, 0); + + if (mdfd < 0) + goto out_unlock; + + if (sysfs_init(&info, mdfd, NULL)) { + pr_err("unable to initialize sysfs for %s\n", + chosen_name); + rv = 2; + goto out_unlock; + } + + if (set_array_info(mdfd, st, &info) != 0) { + pr_err("failed to set array info for %s: %s\n", + chosen_name, strerror(errno)); + rv = 2; + goto out_unlock; + } + + dinfo = info; + dinfo.disk.major = major(rdev); + dinfo.disk.minor = minor(rdev); + if (add_disk(mdfd, st, &info, &dinfo) != 0) { + pr_err("failed to add %s to new array %s: %s.\n", + devname, chosen_name, strerror(errno)); + ioctl(mdfd, STOP_ARRAY, 0); + rv = 2; + goto out_unlock; + } + sra = sysfs_read(mdfd, NULL, (GET_DEVS | GET_STATE | + GET_OFFSET | GET_SIZE)); + + if (!sra || !sra->devs || sra->devs->disk.raid_disk >= 0) { + /* It really should be 'none' - must be old buggy + * kernel, and mdadm -I may not be able to complete. + * So reject it. + */ + ioctl(mdfd, STOP_ARRAY, NULL); + pr_err("You have an old buggy kernel which cannot support\n --incremental reliably. Aborting.\n"); + rv = 2; + goto out_unlock; + } + info.array.working_disks = 1; + /* 6/ Make sure /var/run/mdadm.map contains this array. */ + map_update(&map, fd2devnm(mdfd), + info.text_version, + info.uuid, chosen_name); + } else { + /* 5b/ if it does */ + /* - check one drive in array to make sure metadata is a reasonably */ + /* close match. Reject if not (e.g. different type) */ + /* - add the device */ + char dn[20]; + int dfd2; + int err; + struct supertype *st2; + struct mdinfo info2, *d; + + sra = sysfs_read(mdfd, NULL, (GET_DEVS | GET_STATE | + GET_OFFSET | GET_SIZE)); + + if (mp->path) + strcpy(chosen_name, mp->path); + else + strcpy(chosen_name, mp->devnm); + + /* It is generally not OK to add non-spare drives to a + * running array as they are probably missing because + * they failed. 
However if runstop is 1, then the + * array was possibly started early and our best bet is + * to add this anyway. + * Also if action policy is re-add or better we allow + * re-add. + * This doesn't apply to containers as the 'non-spare' + * flag has a different meaning. The test has to happen + * at the device level there + */ + if (!st->ss->external && + (info.disk.state & (1 << MD_DISK_SYNC)) != 0 && + !policy_action_allows(policy, st->ss->name, act_re_add) && + c->runstop < 1) { + if (md_array_active(mdfd)) { + pr_err("not adding %s to active array (without --run) %s\n", + devname, chosen_name); + rv = 2; + goto out_unlock; + } + } + if (!sra) { + rv = 2; + goto out_unlock; + } + if (sra->devs) { + sprintf(dn, "%d:%d", sra->devs->disk.major, + sra->devs->disk.minor); + dfd2 = dev_open(dn, O_RDONLY); + if (dfd2 < 0) { + pr_err("unable to open %s\n", devname); + rv = 2; + goto out_unlock; + } + st2 = dup_super(st); + if (st2->ss->load_super(st2, dfd2, NULL) || + st->ss->compare_super(st, st2, 1) != 0) { + pr_err("metadata mismatch between %s and chosen array %s\n", + devname, chosen_name); + close(dfd2); + rv = 2; + goto out_unlock; + } + close(dfd2); + st2->ss->getinfo_super(st2, &info2, NULL); + st2->ss->free_super(st2); + if (info.array.level != info2.array.level || + memcmp(info.uuid, info2.uuid, 16) != 0 || + info.array.raid_disks != info2.array.raid_disks) { + pr_err("unexpected difference between %s and %s.\n", + chosen_name, devname); + rv = 2; + goto out_unlock; + } + } + info.disk.major = major(rdev); + info.disk.minor = minor(rdev); + /* add disk needs to know about containers */ + if (st->ss->external) + sra->array.level = LEVEL_CONTAINER; + + if (info.array.state & (1 << MD_SB_CLUSTERED)) + info.disk.state |= (1 << MD_DISK_CLUSTER_ADD); + + err = add_disk(mdfd, st, sra, &info); + if (err < 0 && errno == EBUSY) { + /* could be another device present with the same + * disk.number. 
Find and reject any such + */ + find_reject(mdfd, st, sra, info.disk.number, + info.events, c->verbose, chosen_name); + err = add_disk(mdfd, st, sra, &info); + } + if (err < 0 && errno == EINVAL && + info.disk.state & (1<<MD_DISK_SYNC)) { + /* Maybe it needs to be added as a spare */ + if (policy_action_allows(policy, st->ss->name, + act_force_spare)) { + info.disk.state &= ~(1<<MD_DISK_SYNC); + err = add_disk(mdfd, st, sra, &info); + } else + if (c->verbose >= 0) + pr_err("can only add %s to %s as a spare, and force-spare is not set.\n", + devname, chosen_name); + } + if (err < 0) { + pr_err("failed to add %s to existing array %s: %s.\n", + devname, chosen_name, strerror(errno)); + rv = 2; + goto out_unlock; + } + info.array.working_disks = 0; + for (d = sra->devs; d; d=d->next) + info.array.working_disks ++; + + } + if (strncmp(chosen_name, "/dev/md/", 8) == 0) + md_devname = chosen_name+8; + else + md_devname = chosen_name; + if (c->export) { + printf("MD_DEVICE=%s\n", fd2devnm(mdfd)); + printf("MD_DEVNAME=%s\n", md_devname); + printf("MD_FOREIGN=%s\n", trustworthy == FOREIGN ? "yes" : "no"); + } + + /* 7/ Is there enough devices to possibly start the array? */ + /* 7a/ if not, finish with success. */ + if (info.array.level == LEVEL_CONTAINER) { + char devnm[32]; + /* Try to assemble within the container */ + sysfs_uevent(sra, "change"); + if (!c->export && c->verbose >= 0) + pr_err("container %s now has %d device%s\n", + chosen_name, info.array.working_disks, + info.array.working_disks == 1?"":"s"); + sysfs_rules_apply(chosen_name, &info); + wait_for(chosen_name, mdfd); + if (st->ss->external) + strcpy(devnm, fd2devnm(mdfd)); + if (st->ss->load_container) + rv = st->ss->load_container(st, mdfd, NULL); + close(mdfd); + sysfs_free(sra); + if (!rv) + rv = Incremental_container(st, chosen_name, c, NULL); + map_unlock(&map); + /* after spare is added, ping monitor for external metadata + * so that it can eg. 
try to rebuild degraded array */ + if (st->ss->external) + ping_monitor(devnm); + return rv; + } + + /* We have added something to the array, so need to re-read the + * state. Eventually this state should be kept up-to-date as + * things change. + */ + sysfs_free(sra); + sra = sysfs_read(mdfd, NULL, (GET_DEVS | GET_STATE | + GET_OFFSET | GET_SIZE)); + active_disks = count_active(st, sra, mdfd, &avail, &info); + + journal_device_missing = (info.journal_device_required) && (info.journal_clean == 0); + + if (info.consistency_policy == CONSISTENCY_POLICY_PPL) + info.array.state |= 1; + + if (enough(info.array.level, info.array.raid_disks, + info.array.layout, info.array.state & 1, avail) == 0) { + if (c->export) { + printf("MD_STARTED=no\n"); + } else if (c->verbose >= 0) + pr_err("%s attached to %s, not enough to start (%d).\n", + devname, chosen_name, active_disks); + rv = 0; + goto out_unlock; + } + + /* 7b/ if yes, */ + /* - if number of OK devices match expected, or -R and there */ + /* are enough, */ + /* + add any bitmap file */ + /* + start the array (auto-readonly). */ + + if (md_array_active(mdfd)) { + if (c->export) { + printf("MD_STARTED=already\n"); + } else if (c->verbose >= 0) + pr_err("%s attached to %s which is already active.\n", + devname, chosen_name); + rv = 0; + goto out_unlock; + } + + map_unlock(&map); + if (c->runstop > 0 || (!journal_device_missing && active_disks >= info.array.working_disks)) { + struct mdinfo *dsk; + /* Let's try to start it */ + + if (journal_device_missing) + pr_err("Trying to run with missing journal device\n"); + if (info.reshape_active && !(info.reshape_active & RESHAPE_NO_BACKUP)) { + pr_err("%s: This array is being reshaped and cannot be started\n", + chosen_name); + cont_err("by --incremental. 
Please use --assemble\n"); + goto out; + } + if (match && match->bitmap_file) { + int bmfd = open(match->bitmap_file, O_RDWR); + if (bmfd < 0) { + pr_err("Could not open bitmap file %s.\n", + match->bitmap_file); + goto out; + } + if (ioctl(mdfd, SET_BITMAP_FILE, bmfd) != 0) { + close(bmfd); + pr_err("Failed to set bitmapfile for %s.\n", + chosen_name); + goto out; + } + close(bmfd); + } + /* Need to remove from the array any devices which + * 'count_active' discerned were too old or inappropriate + */ + for (d = sra ? sra->devs : NULL ; d ; d = d->next) + if (d->disk.state & (1<<MD_DISK_REMOVED)) + remove_disk(mdfd, st, sra, d); + + if ((sra == NULL || active_disks >= info.array.working_disks) && + trustworthy != FOREIGN) + rv = ioctl(mdfd, RUN_ARRAY, NULL); + else + rv = sysfs_set_str(sra, NULL, + "array_state", "read-auto"); + /* Array might be O_EXCL which will interfere with + * fsck and mount. So re-open without O_EXCL. + */ + reopen_mddev(mdfd); + if (rv == 0) { + if (c->export) { + printf("MD_STARTED=yes\n"); + } else if (c->verbose >= 0) + pr_err("%s attached to %s, which has been started.\n", + devname, chosen_name); + rv = 0; + wait_for(chosen_name, mdfd); + /* We just started the array, so some devices + * might have been evicted from the array + * because their event counts were too old. + * If the action=re-add policy is in-force for + * those devices we should re-add them now. 
+ */ + for (dsk = sra->devs; dsk ; dsk = dsk->next) { + if (disk_action_allows(dsk, st->ss->name, + act_re_add) && + add_disk(mdfd, st, sra, dsk) == 0) + pr_err("%s re-added to %s\n", + dsk->sys_name, chosen_name); + } + } else { + pr_err("%s attached to %s, but failed to start: %s.\n", + devname, chosen_name, strerror(errno)); + rv = 1; + } + } else { + if (c->export) { + printf("MD_STARTED=unsafe\n"); + } else if (journal_device_missing) { + pr_err("Journal device is missing, not safe to start yet.\n"); + } else if (c->verbose >= 0) + pr_err("%s attached to %s, not enough to start safely.\n", + devname, chosen_name); + rv = 0; + } +out: + free(avail); + if (dfd >= 0) + close(dfd); + if (mdfd >= 0) + close(mdfd); + if (policy) + dev_policy_free(policy); + sysfs_free(sra); + return rv; +out_unlock: + map_unlock(&map); + goto out; +} + +static void find_reject(int mdfd, struct supertype *st, struct mdinfo *sra, + int number, __u64 events, int verbose, + char *array_name) +{ + /* Find a device attached to this array with a disk.number of number + * and events less than the passed events, and remove the device. 
+ */ + struct mdinfo *d; + + if (md_array_active(mdfd)) + return; /* not safe to remove from active arrays + * without thinking more */ + + for (d = sra->devs; d ; d = d->next) { + char dn[24]; // 2*11 bytes for ints (including sign) + colon + null byte + int dfd; + struct mdinfo info; + sprintf(dn, "%d:%d", d->disk.major, d->disk.minor); + dfd = dev_open(dn, O_RDONLY); + if (dfd < 0) + continue; + if (st->ss->load_super(st, dfd, NULL)) { + close(dfd); + continue; + } + st->ss->getinfo_super(st, &info, NULL); + st->ss->free_super(st); + close(dfd); + + if (info.disk.number != number || info.events >= events) + continue; + + if (d->disk.raid_disk > -1) + sysfs_set_str(sra, d, "slot", "none"); + if (sysfs_set_str(sra, d, "state", "remove") == 0) + if (verbose >= 0) + pr_err("removing old device %s from %s\n", + d->sys_name+4, array_name); + } +} + +static int count_active(struct supertype *st, struct mdinfo *sra, + int mdfd, char **availp, + struct mdinfo *bestinfo) +{ + /* count how many devices in sra think they are active */ + struct mdinfo *d; + int cnt = 0; + int replcnt = 0; + __u64 max_events = 0; + __u64 max_journal_events = 0; + char *avail = NULL; + int *best = NULL; + char *devmap = NULL; + int numdevs = 0; + int devnum; + int b, i; + int raid_disks = 0; + + if (!sra) + return 0; + + for (d = sra->devs ; d ; d = d->next) + numdevs++; + for (d = sra->devs, devnum = 0 ; d ; d = d->next, devnum++) { + char dn[30]; + int dfd; + int ok; + struct mdinfo info; + + sprintf(dn, "%d:%d", d->disk.major, d->disk.minor); + dfd = dev_open(dn, O_RDONLY); + if (dfd < 0) + continue; + ok = st->ss->load_super(st, dfd, NULL); + close(dfd); + if (ok != 0) + continue; + + info.array.raid_disks = raid_disks; + st->ss->getinfo_super(st, &info, devmap + raid_disks * devnum); + if (info.disk.raid_disk == MD_DISK_ROLE_JOURNAL && + info.events > max_journal_events) + max_journal_events = info.events; + if (!avail) { + raid_disks = info.array.raid_disks; + avail = xcalloc(raid_disks, 
1); + *availp = avail; + + best = xcalloc(raid_disks, sizeof(int)); + devmap = xcalloc(raid_disks, numdevs); + + st->ss->getinfo_super(st, &info, devmap); + } + + if (info.disk.state & (1<<MD_DISK_SYNC)) + { + if (cnt == 0) { + cnt++; + max_events = info.events; + avail[info.disk.raid_disk] = 2; + best[info.disk.raid_disk] = devnum; + st->ss->getinfo_super(st, bestinfo, NULL); + } else if (info.events == max_events) { + avail[info.disk.raid_disk] = 2; + best[info.disk.raid_disk] = devnum; + } else if (info.events == max_events-1) { + if (avail[info.disk.raid_disk] == 0) { + avail[info.disk.raid_disk] = 1; + best[info.disk.raid_disk] = devnum; + } + } else if (info.events < max_events - 1) + ; + else if (info.events == max_events+1) { + int i; + max_events = info.events; + for (i = 0; i < raid_disks; i++) + if (avail[i]) + avail[i]--; + avail[info.disk.raid_disk] = 2; + best[info.disk.raid_disk] = devnum; + st->ss->getinfo_super(st, bestinfo, NULL); + } else { /* info.events much bigger */ + memset(avail, 0, raid_disks); + max_events = info.events; + avail[info.disk.raid_disk] = 2; + best[info.disk.raid_disk] = devnum; + st->ss->getinfo_super(st, bestinfo, NULL); + } + } else if (info.disk.state & (1<<MD_DISK_REPLACEMENT)) + replcnt++; + st->ss->free_super(st); + } + if (max_journal_events >= max_events - 1) + bestinfo->journal_clean = 1; + + if (!avail) + return 0; + /* We need to reject any device that thinks the best device is + * failed or missing */ + for (b = 0; b < raid_disks; b++) + if (avail[b] == 2) + break; + cnt = 0; + for (i = 0 ; i < raid_disks ; i++) { + if (i != b && avail[i]) + if (devmap[raid_disks * best[i] + b] == 0) { + /* This device thinks 'b' is failed - + * don't use it */ + devnum = best[i]; + for (d=sra->devs ; devnum; d = d->next) + devnum--; + d->disk.state |= (1 << MD_DISK_REMOVED); + avail[i] = 0; + } + if (avail[i]) + cnt++; + } + /* Also need to reject any spare device with an event count that + * is too high + */ + for (d = 
sra->devs; d; d = d->next) { + if (!(d->disk.state & (1<<MD_DISK_SYNC)) && + d->events > max_events) + d->disk.state |= (1 << MD_DISK_REMOVED); + } + free(best); + free(devmap); + return cnt + replcnt; +} + +/* test if container has degraded member(s) */ +static int +container_members_max_degradation(struct map_ent *map, struct map_ent *me) +{ + struct mdinfo *sra; + int degraded, max_degraded = 0; + + for(; map; map = map->next) { + if (!metadata_container_matches(map->metadata, me->devnm)) + continue; + /* most accurate information regarding array degradation */ + sra = sysfs_read(-1, map->devnm, + GET_DISKS | GET_DEVS | GET_STATE); + if (!sra) + continue; + degraded = sra->array.raid_disks - sra->array.active_disks - + sra->array.spare_disks; + if (degraded > max_degraded) + max_degraded = degraded; + sysfs_free(sra); + } + + return max_degraded; +} + +static int array_try_spare(char *devname, int *dfdp, struct dev_policy *pol, + struct map_ent *target, int bare, + struct supertype *st, int verbose) +{ + /* This device doesn't have any md metadata + * The device policy allows 'spare' and if !bare, it allows spare-same-slot. + * If 'st' is not set, then we only know that some metadata allows this, + * others possibly don't. + * So look for a container or array to attach the device to. + * Prefer 'target' if that is set and the array is found. + * + * If st is set, then only arrays of that type are considered + * Return 0 on success, or some exit code on failure, probably 1. + */ + int rv = 1; + dev_t rdev; + struct map_ent *mp, *map = NULL; + struct mdinfo *chosen = NULL; + int dfd = *dfdp; + + if (!fstat_is_blkdev(dfd, devname, &rdev)) + return 1; + + /* + * Now we need to find a suitable array to add this to. 
+ * We only accept arrays that: + * - match 'st' + * - are in the same domains as the device + * - are of an size for which the device will be useful + * and we choose the one that is the most degraded + */ + + if (map_lock(&map)) { + pr_err("failed to get exclusive lock on mapfile\n"); + return 1; + } + for (mp = map ; mp ; mp = mp->next) { + struct supertype *st2; + struct domainlist *dl = NULL; + struct mdinfo *sra; + unsigned long long devsize, freesize = 0; + struct spare_criteria sc = {0, 0}; + + if (is_subarray(mp->metadata)) + continue; + if (st) { + st2 = st->ss->match_metadata_desc(mp->metadata); + if (!st2 || + (st->minor_version >= 0 && + st->minor_version != st2->minor_version)) { + if (verbose > 1) + pr_err("not adding %s to %s as metadata type doesn't match\n", + devname, mp->path); + free(st2); + continue; + } + free(st2); + } + sra = sysfs_read(-1, mp->devnm, + GET_DEVS|GET_OFFSET|GET_SIZE|GET_STATE| + GET_COMPONENT|GET_VERSION); + if (sra) + sra->array.failed_disks = -1; + else + continue; + if (st == NULL) { + int i; + st2 = NULL; + for(i = 0; !st2 && superlist[i]; i++) + st2 = superlist[i]->match_metadata_desc( + sra->text_version); + if (!st2) { + if (verbose > 1) + pr_err("not adding %s to %s as metadata not recognised.\n", + devname, mp->path); + goto next; + } + /* Need to double check the 'act_spare' permissions applies + * to this metadata. 
+ */ + if (!policy_action_allows(pol, st2->ss->name, act_spare)) + goto next; + if (!bare && !policy_action_allows(pol, st2->ss->name, + act_spare_same_slot)) + goto next; + } else + st2 = st; + /* update number of failed disks for mostly degraded + * container member */ + if (sra->array.failed_disks == -1) + sra->array.failed_disks = container_members_max_degradation(map, mp); + + get_dev_size(dfd, NULL, &devsize); + if (sra->component_size == 0) { + /* true for containers, here we must read superblock + * to obtain minimum spare size */ + struct supertype *st3 = dup_super(st2); + int mdfd = open_dev(mp->devnm); + if (mdfd < 0) { + free(st3); + goto next; + } + if (st3->ss->load_container && + !st3->ss->load_container(st3, mdfd, mp->path)) { + if (st3->ss->get_spare_criteria) + st3->ss->get_spare_criteria(st3, &sc); + st3->ss->free_super(st3); + } + free(st3); + close(mdfd); + } + if ((sra->component_size > 0 && + st2->ss->validate_geometry(st2, sra->array.level, sra->array.layout, + sra->array.raid_disks, &sra->array.chunk_size, + sra->component_size, + sra->devs ? sra->devs->data_offset : INVALID_SECTORS, + devname, &freesize, sra->consistency_policy, + 0) && + freesize < sra->component_size) || + (sra->component_size == 0 && devsize < sc.min_size)) { + if (verbose > 1) + pr_err("not adding %s to %s as it is too small\n", + devname, mp->path); + goto next; + } + /* test against target. + * If 'target' is set and 'bare' is false, we only accept + * arrays/containers that match 'target'. + * If 'target' is set and 'bare' is true, we prefer the + * array which matches 'target'. + * target is considered only if we deal with degraded array + */ + if (target && policy_action_allows(pol, st2->ss->name, + act_spare_same_slot)) { + if (strcmp(target->metadata, mp->metadata) == 0 && + memcmp(target->uuid, mp->uuid, + sizeof(target->uuid)) == 0 && + sra->array.failed_disks > 0) { + /* This is our target!! 
*/ + sysfs_free(chosen); + chosen = sra; + sra = NULL; + /* skip to end so we don't check any more */ + while (mp->next) + mp = mp->next; + goto next; + } + /* not our target */ + if (!bare) + goto next; + } + + dl = domain_from_array(sra, st2->ss->name); + if (domain_test(dl, pol, st2->ss->name) != 1) { + /* domain test fails */ + if (verbose > 1) + pr_err("not adding %s to %s as it is not in a compatible domain\n", + devname, mp->path); + + goto next; + } + /* all tests passed, OK to add to this array */ + if (!chosen) { + chosen = sra; + sra = NULL; + } else if (chosen->array.failed_disks < sra->array.failed_disks) { + sysfs_free(chosen); + chosen = sra; + sra = NULL; + } + next: + sysfs_free(sra); + if (st != st2) + free(st2); + if (dl) + domain_free(dl); + } + if (chosen) { + /* add current device to chosen array as a spare */ + int mdfd = open_dev(chosen->sys_name); + if (mdfd >= 0) { + struct mddev_dev devlist; + char chosen_devname[24]; // 2*11 for int (including signs) + colon + null + devlist.next = NULL; + devlist.used = 0; + devlist.writemostly = FlagDefault; + devlist.failfast = FlagDefault; + devlist.devname = chosen_devname; + sprintf(chosen_devname, "%d:%d", major(rdev), + minor(rdev)); + devlist.disposition = 'a'; + close(dfd); + *dfdp = -1; + rv = Manage_subdevs(chosen->sys_name, mdfd, &devlist, + -1, 0, NULL, 0); + close(mdfd); + } + if (verbose > 0) { + if (rv == 0) + pr_err("added %s as spare for %s\n", + devname, chosen->sys_name); + else + pr_err("failed to add %s as spare for %s\n", + devname, chosen->sys_name); + } + sysfs_free(chosen); + } + map_unlock(&map); + return rv; +} + +static int partition_try_spare(char *devname, int *dfdp, struct dev_policy *pol, + struct supertype *st, int verbose) +{ + /* we know that at least one partition virtual-metadata is + * allowed to incorporate spares like this device. We need to + * find a suitable device to copy partition information from. 
+ * + * Getting a list of all disk (not partition) devices is + * slightly non-trivial. We could look at /sys/block, but + * that is theoretically due to be removed. Maybe best to use + * /dev/disk/by-path/?* and ignore names ending '-partNN' as + * we depend on this directory of 'path' info. But that fails + * to find loop devices and probably others. Maybe don't + * worry about that, they aren't the real target. + * + * So: check things in /dev/disk/by-path to see if they are in + * a compatible domain, then load the partition table and see + * if it is OK for the new device, and choose the largest + * partition table that fits. + */ + DIR *dir; + struct dirent *de; + char *chosen = NULL; + unsigned long long chosen_size = 0; + struct supertype *chosen_st = NULL; + int fd; + + dir = opendir("/dev/disk/by-path"); + if (!dir) + return 1; + while ((de = readdir(dir)) != NULL) { + char *ep; + struct dev_policy *pol2 = NULL; + struct domainlist *domlist = NULL; + int fd = -1; + struct mdinfo info; + struct supertype *st2 = NULL; + char *devname = NULL; + unsigned long long devsectors; + char *pathlist[2]; + + if (de->d_ino == 0 || de->d_name[0] == '.' || + (de->d_type != DT_LNK && de->d_type != DT_UNKNOWN)) + goto next; + + ep = de->d_name + strlen(de->d_name); + while (ep > de->d_name && + isdigit(ep[-1])) + ep--; + if (ep > de->d_name + 5 && + strncmp(ep-5, "-part", 5) == 0) + /* This is a partition - skip it */ + goto next; + + pathlist[0] = de->d_name; + pathlist[1] = NULL; + pol2 = path_policy(pathlist, type_disk); + + domain_merge(&domlist, pol2, st ? st->ss->name : NULL); + if (domain_test(domlist, pol, st ? st->ss->name : NULL) != 1) + /* new device is incompatible with this device. 
*/ + goto next; + + domain_free(domlist); + domlist = NULL; + + if (asprintf(&devname, "/dev/disk/by-path/%s", de->d_name) != 1) { + devname = NULL; + goto next; + } + fd = open(devname, O_RDONLY); + if (fd < 0) + goto next; + if (get_dev_size(fd, devname, &devsectors) == 0) + goto next; + devsectors >>= 9; + + if (st) + st2 = dup_super(st); + else + st2 = guess_super_type(fd, guess_partitions); + if (st2 == NULL || st2->ss->load_super(st2, fd, NULL) < 0) + goto next; + st2->ignore_hw_compat = 0; + + if (!st) { + /* Check domain policy again, this time referring to metadata */ + domain_merge(&domlist, pol2, st2->ss->name); + if (domain_test(domlist, pol, st2->ss->name) != 1) + /* Incompatible devices for this metadata type */ + goto next; + if (!policy_action_allows(pol, st2->ss->name, act_spare)) + /* Some partition types allow sparing, but not + * this one. + */ + goto next; + } + + st2->ss->getinfo_super(st2, &info, NULL); + if (info.component_size > devsectors) + /* This partitioning doesn't fit in the device */ + goto next; + + /* This is an acceptable device to copy partition + * metadata from. We could just stop here, but I + * think I want to keep looking incase a larger + * metadata which makes better use of the device can + * be found. + */ + if (chosen == NULL || chosen_size < info.component_size) { + chosen_size = info.component_size; + free(chosen); + chosen = devname; + devname = NULL; + if (chosen_st) { + chosen_st->ss->free_super(chosen_st); + free(chosen_st); + } + chosen_st = st2; + st2 = NULL; + } + + next: + free(devname); + domain_free(domlist); + dev_policy_free(pol2); + if (st2) + st2->ss->free_super(st2); + free(st2); + + if (fd >= 0) + close(fd); + } + + closedir(dir); + + if (!chosen) + return 1; + + /* 'chosen' is the best device we can find. 
Let's write its + * metadata to devname dfd is read-only so don't use that + */ + fd = open(devname, O_RDWR); + if (fd >= 0) { + chosen_st->ss->store_super(chosen_st, fd); + close(fd); + } + free(chosen); + chosen_st->ss->free_super(chosen_st); + free(chosen_st); + return 0; +} + +static int is_bare(int dfd) +{ + unsigned long long size = 0; + char bufpad[4096 + 4096]; + char *buf = (char*)(((long)bufpad + 4096) & ~4095); + + if (lseek(dfd, 0, SEEK_SET) != 0 || + read(dfd, buf, 4096) != 4096) + return 0; + + if (buf[0] != '\0' && buf[0] != '\x5a' && buf[0] != '\xff') + return 0; + if (memcmp(buf, buf+1, 4095) != 0) + return 0; + + /* OK, first 4K appear blank, try the end. */ + get_dev_size(dfd, NULL, &size); + if (lseek(dfd, size-4096, SEEK_SET) < 0 || + read(dfd, buf, 4096) != 4096) + return 0; + + if (buf[0] != '\0' && buf[0] != '\x5a' && buf[0] != '\xff') + return 0; + if (memcmp(buf, buf+1, 4095) != 0) + return 0; + + return 1; +} + +/* adding a spare to a regular array is quite different from adding one to + * a set-of-partitions virtual array. + * This function determines which is worth trying and tries as appropriate. + * Arrays are given priority over partitions. + */ +static int try_spare(char *devname, int *dfdp, struct dev_policy *pol, + struct map_ent *target, + struct supertype *st, int verbose) +{ + int i; + int rv; + int arrays_ok = 0; + int partitions_ok = 0; + int dfd = *dfdp; + int bare; + + /* Can only add a spare if device has at least one domain */ + if (pol_find(pol, pol_domain) == NULL) + return 1; + /* And only if some action allows spares */ + if (!policy_action_allows(pol, st?st->ss->name:NULL, act_spare)) + return 1; + + /* Now check if the device is bare. + * bare devices can always be added as a spare + * non-bare devices can only be added if spare-same-slot is permitted, + * and this device is replacing a previous device - in which case 'target' + * will be set. 
+ */ + if (!is_bare(dfd)) { + /* Must have a target and allow same_slot */ + /* Later - may allow force_spare without target */ + if (!target || + !policy_action_allows(pol, st?st->ss->name:NULL, + act_spare_same_slot)) { + if (verbose > 1) + pr_err("%s is not bare, so not considering as a spare\n", + devname); + return 1; + } + bare = 0; + } else + bare = 1; + + /* It might be OK to add this device to an array - need to see + * what arrays might be candidates. + */ + if (st) { + /* just try to add 'array' or 'partition' based on this metadata */ + if (st->ss->add_to_super) + return array_try_spare(devname, dfdp, pol, target, bare, + st, verbose); + else + return partition_try_spare(devname, dfdp, pol, + st, verbose); + } + /* No metadata was specified or found so options are open. + * Check for whether any array metadata, or any partition metadata + * might allow adding the spare. This check is just help to avoid + * a more costly scan of all arrays when we can be sure that will + * fail. + */ + for (i = 0; (!arrays_ok || !partitions_ok) && superlist[i] ; i++) { + if (superlist[i]->add_to_super && !arrays_ok && + policy_action_allows(pol, superlist[i]->name, act_spare)) + arrays_ok = 1; + if (superlist[i]->add_to_super == NULL && !partitions_ok && + policy_action_allows(pol, superlist[i]->name, act_spare)) + partitions_ok = 1; + } + rv = 1; + if (arrays_ok) + rv = array_try_spare(devname, dfdp, pol, target, bare, + st, verbose); + if (rv != 0 && partitions_ok) + rv = partition_try_spare(devname, dfdp, pol, st, verbose); + return rv; +} + +int IncrementalScan(struct context *c, char *devnm) +{ + /* look at every device listed in the 'map' file. + * If one is found that is not running then: + * look in mdadm.conf for bitmap file. + * if one exists, but array has none, add it. 
+ * try to start array in auto-readonly mode + */ + struct map_ent *mapl = NULL; + struct map_ent *me; + struct mddev_ident *devs, *mddev; + int rv = 0; + char container[32]; + char *only = NULL; + + map_read(&mapl); + devs = conf_get_ident(NULL); + +restart: + for (me = mapl ; me ; me = me->next) { + struct mdinfo *sra; + int mdfd; + + if (devnm && strcmp(devnm, me->devnm) != 0) + continue; + if (me->metadata[0] == '/') { + char *sl; + + if (!devnm) + continue; + + /* member array, need to work on container */ + strncpy(container, me->metadata+1, 32); + container[31] = 0; + sl = strchr(container, '/'); + if (sl) + *sl = 0; + only = devnm; + devnm = container; + goto restart; + } + mdfd = open_dev(me->devnm); + + if (!is_fd_valid(mdfd)) + continue; + if (!isdigit(me->metadata[0])) { + /* must be a container */ + struct supertype *st = super_by_fd(mdfd, NULL); + int ret = 0; + struct map_ent *map = NULL; + + if (st && st->ss->load_container) + ret = st->ss->load_container(st, mdfd, NULL); + close_fd(&mdfd); + if (!ret && st && st->ss->container_content) { + if (map_lock(&map)) + pr_err("failed to get exclusive lock on mapfile\n"); + ret = Incremental_container(st, me->path, c, only); + map_unlock(&map); + } + if (ret) + rv = 1; + continue; + } + if (md_array_active(mdfd)) { + close_fd(&mdfd); + continue; + } + /* Ok, we can try this one. 
Maybe it needs a bitmap */ + for (mddev = devs ; mddev ; mddev = mddev->next) + if (mddev->devname && me->path && + devname_matches(mddev->devname, me->path)) + break; + if (mddev && mddev->bitmap_file) { + /* + * Note: early kernels will wrongly fail this, so it + * is a hint only + */ + int added = -1; + int bmfd; + + bmfd = open(mddev->bitmap_file, O_RDWR); + if (is_fd_valid(bmfd)) { + added = ioctl(mdfd, SET_BITMAP_FILE, bmfd); + close_fd(&bmfd); + } + if (c->verbose >= 0) { + if (added == 0) + pr_err("Added bitmap %s to %s\n", + mddev->bitmap_file, me->path); + else if (errno != EEXIST) + pr_err("Failed to add bitmap to %s: %s\n", + me->path, strerror(errno)); + } + } + /* FIXME check for reshape_active and consider not + * starting array. + */ + sra = sysfs_read(mdfd, NULL, 0); + if (sra) { + if (sysfs_set_str(sra, NULL, + "array_state", "read-auto") == 0) { + if (c->verbose >= 0) + pr_err("started array %s\n", + me->path ?: me->devnm); + } else { + pr_err("failed to start array %s: %s\n", + me->path ?: me->devnm, + strerror(errno)); + rv = 1; + } + sysfs_free(sra); + } + close_fd(&mdfd); + } + map_free(mapl); + return rv; +} + +static char *container2devname(char *devname) +{ + char *mdname = NULL; + + if (devname[0] == '/') { + int fd = open(devname, O_RDONLY); + if (fd >= 0) { + mdname = xstrdup(fd2devnm(fd)); + close(fd); + } + } else { + int uuid[4]; + struct map_ent *mp, *map = NULL; + + if (!parse_uuid(devname, uuid)) + return mdname; + mp = map_by_uuid(&map, uuid); + if (mp) + mdname = xstrdup(mp->devnm); + map_free(map); + } + + return mdname; +} + +static int Incremental_container(struct supertype *st, char *devname, + struct context *c, char *only) +{ + /* Collect the contents of this container and for each + * array, choose a device name and assemble the array. 
+ */ + + struct mdinfo *list; + struct mdinfo *ra; + struct map_ent *map = NULL; + struct mdinfo info; + int trustworthy; + struct mddev_ident *match; + int rv = 0; + int result = 0; + + st->ss->getinfo_super(st, &info, NULL); + + if ((c->runstop > 0 && info.container_enough >= 0) || + info.container_enough > 0) + /* pass */; + else { + if (c->export) { + printf("MD_STARTED=no\n"); + } else if (c->verbose) + pr_err("not enough devices to start the container\n"); + return 0; + } + + match = conf_match(st, &info, devname, c->verbose, &rv); + if (match == NULL && rv == 2) + return rv; + + /* Need to compute 'trustworthy' */ + if (match) + trustworthy = LOCAL; + else if (st->ss->match_home(st, c->homehost) == 1) + trustworthy = LOCAL; + else if (st->ss->match_home(st, "any") == 1) + trustworthy = LOCAL; + else + trustworthy = FOREIGN; + + list = st->ss->container_content(st, NULL); + /* when nothing to activate - quit */ + if (list == NULL) { + if (c->export) { + printf("MD_STARTED=nothing\n"); + } + return 0; + } + for (ra = list ; ra ; ra = ra->next) { + int mdfd; + char chosen_name[1024]; + struct map_ent *mp; + struct mddev_ident *match = NULL; + + /* do not activate arrays blocked by metadata handler */ + if (ra->array.state & (1 << MD_SB_BLOCK_VOLUME)) { + pr_err("Cannot activate array %s in %s.\n", + ra->text_version, devname); + continue; + } + mp = map_by_uuid(&map, ra->uuid); + + if (mp) { + mdfd = open_dev(mp->devnm); + if (mp->path) + strcpy(chosen_name, mp->path); + else + strcpy(chosen_name, mp->devnm); + } else if (!only) { + + /* Check in mdadm.conf for container == devname and + * member == ra->text_version after second slash. 
+ */ + char *sub = strchr(ra->text_version+1, '/'); + struct mddev_ident *array_list; + if (sub) { + sub++; + array_list = conf_get_ident(NULL); + } else + array_list = NULL; + for(; array_list ; array_list = array_list->next) { + char *dn; + if (array_list->member == NULL || + array_list->container == NULL) + continue; + if (strcmp(array_list->member, sub) != 0) + continue; + if (array_list->uuid_set && + !same_uuid(ra->uuid, array_list->uuid, st->ss->swapuuid)) + continue; + dn = container2devname(array_list->container); + if (dn == NULL) + continue; + if (strncmp(dn, ra->text_version+1, + strlen(dn)) != 0 || + ra->text_version[strlen(dn)+1] != '/') { + free(dn); + continue; + } + free(dn); + /* we have a match */ + match = array_list; + if (c->verbose>0) + pr_err("match found for member %s\n", + array_list->member); + break; + } + + if (match && match->devname && + strcasecmp(match->devname, "<ignore>") == 0) { + if (c->verbose > 0) + pr_err("array %s/%s is explicitly ignored by mdadm.conf\n", + match->container, match->member); + continue; + } + if (match) + trustworthy = LOCAL; + + mdfd = create_mddev(match ? 
match->devname : NULL, + ra->name, + c->autof, + trustworthy, + chosen_name, 0); + } + if (only && (!mp || strcmp(mp->devnm, only) != 0)) + continue; + + if (mdfd < 0) { + pr_err("failed to open %s: %s.\n", + chosen_name, strerror(errno)); + return 2; + } + + assemble_container_content(st, mdfd, ra, c, + chosen_name, &result); + map_free(map); + map = NULL; + close(mdfd); + } + if (c->export && result) { + char sep = '='; + printf("MD_STARTED"); + if (result & INCR_NO) { + printf("%cno", sep); + sep = ','; + } + if (result & INCR_UNSAFE) { + printf("%cunsafe", sep); + sep = ','; + } + if (result & INCR_ALREADY) { + printf("%calready", sep); + sep = ','; + } + if (result & INCR_YES) { + printf("%cyes", sep); + sep = ','; + } + printf("\n"); + } + return 0; +} + +static void run_udisks(char *arg1, char *arg2) +{ + int pid = fork(); + int status; + if (pid == 0) { + manage_fork_fds(1); + execl("/usr/bin/udisks", "udisks", arg1, arg2, NULL); + execl("/bin/udisks", "udisks", arg1, arg2, NULL); + exit(1); + } + while (pid > 0 && wait(&status) != pid) + ; +} + +static int force_remove(char *devnm, int fd, struct mdinfo *mdi, int verbose) +{ + int rv; + int devid = devnm2devid(devnm); + + run_udisks("--unmount", map_dev(major(devid), minor(devid), 0)); + rv = Manage_stop(devnm, fd, verbose, 1); + if (rv) { + /* At least we can try to trigger a 'remove' */ + sysfs_uevent(mdi, "remove"); + if (verbose) + pr_err("Fail to stop %s too.\n", devnm); + } + return rv; +} + +static void remove_from_member_array(struct mdstat_ent *memb, + struct mddev_dev *devlist, int verbose) +{ + int rv; + struct mdinfo mmdi; + int subfd = open_dev(memb->devnm); + + if (subfd >= 0) { + rv = Manage_subdevs(memb->devnm, subfd, devlist, verbose, + 0, NULL, 0); + if (rv & 2) { + if (sysfs_init(&mmdi, -1, memb->devnm)) + pr_err("unable to initialize sysfs for: %s\n", + memb->devnm); + else + force_remove(memb->devnm, subfd, &mmdi, + verbose); + } + close(subfd); + } +} + +/* + * IncrementalRemove - 
Attempt to see if the passed in device belongs to any + * raid arrays, and if so first fail (if needed) and then remove the device. + * + * @devname - The device we want to remove + * @id_path - name as found in /dev/disk/by-path for this device + * + * Note: the device name must be a kernel name like "sda", so + * that we can find it in /proc/mdstat + */ +int IncrementalRemove(char *devname, char *id_path, int verbose) +{ + int mdfd; + int rv = 0; + struct mdstat_ent *ent; + struct mddev_dev devlist; + struct mdinfo mdi; + char buf[32]; + + if (!id_path) + dprintf("incremental removal without --path <id_path> lacks the possibility to re-add new device in this port\n"); + + if (strchr(devname, '/')) { + pr_err("incremental removal requires a kernel device name, not a file: %s\n", devname); + return 1; + } + ent = mdstat_by_component(devname); + if (!ent) { + if (verbose >= 0) + pr_err("%s does not appear to be a component of any array\n", devname); + return 1; + } + if (sysfs_init(&mdi, -1, ent->devnm)) { + pr_err("unable to initialize sysfs for: %s\n", devname); + return 1; + } + mdfd = open_dev_excl(ent->devnm); + if (is_fd_valid(mdfd)) { + close_fd(&mdfd); + if (sysfs_get_str(&mdi, NULL, "array_state", + buf, sizeof(buf)) > 0) { + if (strncmp(buf, "active", 6) == 0 || + strncmp(buf, "clean", 5) == 0) + sysfs_set_str(&mdi, NULL, + "array_state", "read-auto"); + } + } + mdfd = open_dev(ent->devnm); + if (mdfd < 0) { + if (verbose >= 0) + pr_err("Cannot open array %s!!\n", ent->devnm); + free_mdstat(ent); + return 1; + } + + if (id_path) { + struct map_ent *map = NULL, *me; + me = map_by_devnm(&map, ent->devnm); + if (me) + policy_save_path(id_path, me); + map_free(map); + } + + memset(&devlist, 0, sizeof(devlist)); + devlist.devname = devname; + devlist.disposition = 'f'; + /* for a container, we must fail each member array */ + if (ent->metadata_version && + strncmp(ent->metadata_version, "external:", 9) == 0) { + struct mdstat_ent *mdstat = mdstat_read(0, 0); + 
struct mdstat_ent *memb; + for (memb = mdstat ; memb ; memb = memb->next) { + if (is_container_member(memb, ent->devnm)) + remove_from_member_array(memb, + &devlist, verbose); + } + free_mdstat(mdstat); + } else { + rv |= Manage_subdevs(ent->devnm, mdfd, &devlist, + verbose, 0, NULL, 0); + if (rv & 2) { + /* Failed due to EBUSY, try to stop the array. + * Give udisks a chance to unmount it first. + */ + rv = force_remove(ent->devnm, mdfd, &mdi, verbose); + goto end; + } + } + + devlist.disposition = 'r'; + rv = Manage_subdevs(ent->devnm, mdfd, &devlist, + verbose, 0, NULL, 0); +end: + close(mdfd); + free_mdstat(ent); + return rv; +} @@ -0,0 +1,147 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + * + * Added by Dale Stephenson + * steph@snapserver.com + */ + +#include "mdadm.h" +#include "md_u.h" +#include "md_p.h" + +int Kill(char *dev, struct supertype *st, int force, int verbose, int noexcl) +{ + /* + * Nothing fancy about Kill. It just zeroes out a superblock + * Definitely not safe. 
+ * Returns: + * 0 - a zero superblock was successfully written out + * 1 - failed to write the zero superblock + * 2 - failed to open the device. + * 4 - failed to find a superblock. + */ + + int fd, rv = 0; + + if (force) + noexcl = 1; + fd = open(dev, O_RDWR|(noexcl ? 0 : O_EXCL)); + if (fd < 0) { + if (verbose >= 0) + pr_err("Couldn't open %s for write - not zeroing\n", + dev); + return 2; + } + if (st == NULL) + st = guess_super(fd); + if (st == NULL || st->ss->init_super == NULL) { + if (verbose >= 0) + pr_err("Unrecognised md component device - %s\n", dev); + close(fd); + return 4; + } + st->ignore_hw_compat = 1; + rv = st->ss->load_super(st, fd, dev); + if (rv == 0 || (force && rv >= 2)) { + st->ss->free_super(st); + st->ss->init_super(st, NULL, NULL, "", NULL, NULL, + INVALID_SECTORS); + if (st->ss->store_super(st, fd)) { + if (verbose >= 0) + pr_err("Could not zero superblock on %s\n", + dev); + rv = 1; + } else if (rv) { + if (verbose >= 0) + pr_err("superblock zeroed anyway\n"); + rv = 0; + } + } + close(fd); + return rv; +} + +int Kill_subarray(char *dev, char *subarray, int verbose) +{ + /* Delete a subarray out of a container, the subarry must be + * inactive. The subarray string must be a subarray index + * number. 
+ * + * 0 = successfully deleted subarray from all container members + * 1 = failed to sync metadata to one or more devices + * 2 = failed to find the container, subarray, or other resource + * issue + */ + struct supertype supertype, *st = &supertype; + int fd, rv = 2; + + memset(st, 0, sizeof(*st)); + + fd = open_subarray(dev, subarray, st, verbose < 0); + if (fd < 0) + return 2; + + if (!st->ss->kill_subarray) { + if (verbose >= 0) + pr_err("Operation not supported for %s metadata\n", + st->ss->name); + goto free_super; + } + + if (is_subarray_active(subarray, st->devnm)) { + if (verbose >= 0) + pr_err("Subarray-%s still active, aborting\n", + subarray); + goto free_super; + } + + if (mdmon_running(st->devnm)) + st->update_tail = &st->updates; + + /* ok we've found our victim, drop the axe */ + rv = st->ss->kill_subarray(st, subarray); + if (rv) { + if (verbose >= 0) + pr_err("Failed to delete subarray-%s from %s\n", + subarray, dev); + goto free_super; + } + + /* FIXME these routines do not report success/failure */ + if (st->update_tail) + flush_metadata_updates(st); + else + st->ss->sync_metadata(st); + + if (verbose >= 0) + pr_err("Deleted subarray-%s from %s, UUIDs may have changed\n", + subarray, dev); + + rv = 0; + + free_super: + st->ss->free_super(st); + close(fd); + + return rv; +} diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..2a51d81 --- /dev/null +++ b/Makefile @@ -0,0 +1,332 @@ +# +# mdadm - manage Linux "md" devices aka RAID arrays. +# +# Copyright (C) 2001-2002 Neil Brown <neilb@cse.unsw.edu.au> +# Copyright (C) 2013 Neil Brown <neilb@suse.de> +# +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +# +# Author: Neil Brown +# Email: <neilb@cse.unsw.edu.au> +# Paper: Neil Brown +# School of Computer Science and Engineering +# The University of New South Wales +# Sydney, 2052 +# Australia +# + +# define "CXFLAGS" to give extra flags to CC. +# e.g. make CXFLAGS=-O to optimise +CXFLAGS ?=-O2 +TCC = tcc +UCLIBC_GCC = $(shell for nm in i386-uclibc-linux-gcc i386-uclibc-gcc; do which $$nm > /dev/null && { echo $$nm ; exit; } ; done; echo false No uclibc found ) +#DIET_GCC = diet gcc +# sorry, but diet-libc doesn't know about posix_memalign, +# so we cannot use it any more. 
+DIET_GCC = gcc -DHAVE_STDINT_H + +KLIBC=/home/src/klibc/klibc-0.77 + +KLIBC_GCC = gcc -nostdinc -iwithprefix include -I$(KLIBC)/klibc/include -I$(KLIBC)/linux/include -I$(KLIBC)/klibc/arch/i386/include -I$(KLIBC)/klibc/include/bits32 + +ifdef COVERITY +COVERITY_FLAGS=-include coverity-gcc-hack.h +endif + +ifeq ($(origin CC),default) +CC := $(CROSS_COMPILE)gcc +endif +CXFLAGS ?= -ggdb +CWFLAGS = -Wall -Werror -Wstrict-prototypes -Wextra -Wno-unused-parameter +ifdef WARN_UNUSED +CWFLAGS += -Wp,-D_FORTIFY_SOURCE=2 -O3 +endif + +FALLTHROUGH := $(shell gcc -v --help 2>&1 | grep "implicit-fallthrough" | wc -l) +ifneq "$(FALLTHROUGH)" "0" +CWFLAGS += -Wimplicit-fallthrough=0 +endif + +ifdef DEBIAN +CPPFLAGS += -DDEBIAN +endif +ifdef DEFAULT_OLD_METADATA + CPPFLAGS += -DDEFAULT_OLD_METADATA + DEFAULT_METADATA=0.90 +else + DEFAULT_METADATA=1.2 +endif +CPPFLAGS += -DBINDIR=\"$(BINDIR)\" + +PKG_CONFIG ?= pkg-config + +SYSCONFDIR = /etc +CONFFILE = $(SYSCONFDIR)/mdadm.conf +CONFFILE2 = $(SYSCONFDIR)/mdadm/mdadm.conf +MAILCMD =/usr/sbin/sendmail -t +CONFFILEFLAGS = -DCONFFILE=\"$(CONFFILE)\" -DCONFFILE2=\"$(CONFFILE2)\" +# Both MAP_DIR and MDMON_DIR should be somewhere that persists across the +# pivotroot from early boot to late boot. +# /run is best, but for distros that don't support that. 
+# /dev can work, in which case you probably want /dev/.mdadm +RUN_DIR=/run/mdadm +CHECK_RUN_DIR=1 +MAP_DIR=$(RUN_DIR) +MAP_FILE = map +MAP_PATH = $(MAP_DIR)/$(MAP_FILE) +MDMON_DIR = $(RUN_DIR) +# place for autoreplace cookies +FAILED_SLOTS_DIR = $(RUN_DIR)/failed-slots +SYSTEMD_DIR=/lib/systemd/system +LIB_DIR=/usr/libexec/mdadm + +COROSYNC:=$(shell [ -d /usr/include/corosync ] || echo -DNO_COROSYNC) +DLM:=$(shell [ -f /usr/include/libdlm.h ] || echo -DNO_DLM) + +DIRFLAGS = -DMAP_DIR=\"$(MAP_DIR)\" -DMAP_FILE=\"$(MAP_FILE)\" +DIRFLAGS += -DMDMON_DIR=\"$(MDMON_DIR)\" +DIRFLAGS += -DFAILED_SLOTS_DIR=\"$(FAILED_SLOTS_DIR)\" +CFLAGS = $(CWFLAGS) $(CXFLAGS) -DSendmail=\""$(MAILCMD)"\" $(CONFFILEFLAGS) $(DIRFLAGS) $(COROSYNC) $(DLM) + +VERSION = $(shell [ -d .git ] && git describe HEAD | sed 's/mdadm-//') +VERS_DATE = $(shell [ -d .git ] && date --iso-8601 --date="`git log -n1 --format=format:%cd --date=iso --date=short`") +DVERS = $(if $(VERSION),-DVERSION=\"$(VERSION)\",) +DDATE = $(if $(VERS_DATE),-DVERS_DATE="\"$(VERS_DATE)\"",) +DEXTRAVERSION = $(if $(EXTRAVERSION),-DEXTRAVERSION="\" - $(EXTRAVERSION)\"",) +CFLAGS += $(DVERS) $(DDATE) $(DEXTRAVERSION) + +# The glibc TLS ABI requires applications that call clone(2) to set up +# TLS data structures, use pthreads until mdmon implements this support +USE_PTHREADS = 1 +ifdef USE_PTHREADS +CFLAGS += -DUSE_PTHREADS +MON_LDFLAGS += -pthread +endif + +# If you want a static binary, you might uncomment these +# LDFLAGS = -static +# STRIP = -s +LDLIBS = -ldl + +# To explicitly disable libudev, set -DNO_LIBUDEV in CXFLAGS +ifeq (, $(findstring -DNO_LIBUDEV, $(CXFLAGS))) + LDLIBS += -ludev +endif + +INSTALL = /usr/bin/install +DESTDIR = +BINDIR = /sbin +MANDIR = /usr/share/man +MAN4DIR = $(MANDIR)/man4 +MAN5DIR = $(MANDIR)/man5 +MAN8DIR = $(MANDIR)/man8 + +UDEVDIR := $(shell $(PKG_CONFIG) --variable=udevdir udev 2>/dev/null) +ifndef UDEVDIR + UDEVDIR = /lib/udev +endif + +ifeq (,$(findstring s,$(MAKEFLAGS))) + ECHO=echo +else + 
ECHO=: +endif + +OBJS = mdadm.o config.o policy.o mdstat.o ReadMe.o uuid.o util.o maps.o lib.o \ + Manage.o Assemble.o Build.o \ + Create.o Detail.o Examine.o Grow.o Monitor.o dlink.o Kill.o Query.o \ + Incremental.o Dump.o \ + mdopen.o super0.o super1.o super-ddf.o super-intel.o bitmap.o \ + super-mbr.o super-gpt.o \ + restripe.o sysfs.o sha1.o mapfile.o crc32.o sg_io.o msg.o xmalloc.o \ + platform-intel.o probe_roms.o crc32c.o + +CHECK_OBJS = restripe.o uuid.o sysfs.o maps.o lib.o xmalloc.o dlink.o + +SRCS = $(patsubst %.o,%.c,$(OBJS)) + +INCL = mdadm.h part.h bitmap.h + +MON_OBJS = mdmon.o monitor.o managemon.o uuid.o util.o maps.o mdstat.o sysfs.o \ + policy.o lib.o \ + Kill.o sg_io.o dlink.o ReadMe.o super-intel.o \ + super-mbr.o super-gpt.o \ + super-ddf.o sha1.o crc32.o msg.o bitmap.o xmalloc.o \ + platform-intel.o probe_roms.o crc32c.o + +MON_SRCS = $(patsubst %.o,%.c,$(MON_OBJS)) + +STATICSRC = pwgr.c +STATICOBJS = pwgr.o + +all : mdadm mdmon +man : mdadm.man md.man mdadm.conf.man mdmon.man raid6check.man + +check_rundir: + @if [ ! -d "$(dir $(RUN_DIR))" -a "$(CHECK_RUN_DIR)" = 1 ]; then \ + echo "***** Parent of $(RUN_DIR) does not exist. Maybe set different RUN_DIR="; \ + echo "***** e.g. make RUN_DIR=/dev/.mdadm" ; \ + echo "***** or set CHECK_RUN_DIR=0"; exit 1; \ + fi + +everything: all mdadm.static swap_super test_stripe raid6check \ + mdadm.Os mdadm.O2 man +everything-test: all mdadm.static swap_super test_stripe \ + mdadm.Os mdadm.O2 man +# mdadm.uclibc doesn't work on x86-64 +# mdadm.tcc doesn't work.. 
+ +%.o: %.c + $(CC) $(CFLAGS) $(CPPFLAGS) $(COVERITY_FLAGS) -o $@ -c $< + +mdadm : $(OBJS) | check_rundir + $(CC) $(CFLAGS) $(LDFLAGS) -o mdadm $(OBJS) $(LDLIBS) + +mdadm.static : $(OBJS) $(STATICOBJS) + $(CC) $(CFLAGS) $(LDFLAGS) -static -o mdadm.static $(OBJS) $(STATICOBJS) $(LDLIBS) + +mdadm.tcc : $(SRCS) $(INCL) + $(TCC) -o mdadm.tcc $(SRCS) + +mdadm.klibc : $(SRCS) $(INCL) + rm -f $(OBJS) + $(CC) -nostdinc -iwithprefix include -I$(KLIBC)/klibc/include -I$(KLIBC)/linux/include -I$(KLIBC)/klibc/arch/i386/include -I$(KLIBC)/klibc/include/bits32 $(CFLAGS) $(SRCS) + +mdadm.Os : $(SRCS) $(INCL) + $(CC) -o mdadm.Os $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) -DHAVE_STDINT_H -Os $(SRCS) $(LDLIBS) + +mdadm.O2 : $(SRCS) $(INCL) mdmon.O2 + $(CC) -o mdadm.O2 $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) -DHAVE_STDINT_H -O2 -D_FORTIFY_SOURCE=2 $(SRCS) $(LDLIBS) + +mdmon.O2 : $(MON_SRCS) $(INCL) mdmon.h + $(CC) -o mdmon.O2 $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $(MON_LDFLAGS) -DHAVE_STDINT_H -O2 -D_FORTIFY_SOURCE=2 $(MON_SRCS) $(LDLIBS) + +# use '-z now' to guarantee no dynamic linker interactions with the monitor thread +mdmon : $(MON_OBJS) | check_rundir + $(CC) $(CFLAGS) $(LDFLAGS) $(MON_LDFLAGS) -Wl,-z,now -o mdmon $(MON_OBJS) $(LDLIBS) +msg.o: msg.c msg.h + +test_stripe : restripe.c xmalloc.o mdadm.h + $(CC) $(CFLAGS) $(CXFLAGS) $(LDFLAGS) -o test_stripe xmalloc.o -DMAIN restripe.c + +raid6check : raid6check.o mdadm.h $(CHECK_OBJS) + $(CC) $(CXFLAGS) $(LDFLAGS) -o raid6check raid6check.o $(CHECK_OBJS) + +mdadm.8 : mdadm.8.in + sed -e 's/{DEFAULT_METADATA}/$(DEFAULT_METADATA)/g' \ + -e 's,{MAP_PATH},$(MAP_PATH),g' mdadm.8.in > mdadm.8 + +mdadm.man : mdadm.8 + man -l mdadm.8 > mdadm.man + +mdmon.man : mdmon.8 + man -l mdmon.8 > mdmon.man + +md.man : md.4 + man -l md.4 > md.man + +mdadm.conf.man : mdadm.conf.5 + man -l mdadm.conf.5 > mdadm.conf.man + +raid6check.man : raid6check.8 + man -l raid6check.8 > raid6check.man + +$(OBJS) : $(INCL) mdmon.h +$(MON_OBJS) : $(INCL) mdmon.h + +sha1.o : sha1.c 
sha1.h md5.h + $(CC) $(CFLAGS) -DHAVE_STDINT_H -o sha1.o -c sha1.c + +install : install-bin install-man install-udev + +install-static : mdadm.static install-man + $(INSTALL) -D $(STRIP) -m 755 mdadm.static $(DESTDIR)$(BINDIR)/mdadm + +install-tcc : mdadm.tcc install-man + $(INSTALL) -D $(STRIP) -m 755 mdadm.tcc $(DESTDIR)$(BINDIR)/mdadm + +install-uclibc : mdadm.uclibc install-man + $(INSTALL) -D $(STRIP) -m 755 mdadm.uclibc $(DESTDIR)$(BINDIR)/mdadm + +install-klibc : mdadm.klibc install-man + $(INSTALL) -D $(STRIP) -m 755 mdadm.klibc $(DESTDIR)$(BINDIR)/mdadm + +install-man: mdadm.8 md.4 mdadm.conf.5 mdmon.8 + $(INSTALL) -D -m 644 mdadm.8 $(DESTDIR)$(MAN8DIR)/mdadm.8 + $(INSTALL) -D -m 644 mdmon.8 $(DESTDIR)$(MAN8DIR)/mdmon.8 + $(INSTALL) -D -m 644 md.4 $(DESTDIR)$(MAN4DIR)/md.4 + $(INSTALL) -D -m 644 mdadm.conf.5 $(DESTDIR)$(MAN5DIR)/mdadm.conf.5 + +install-udev: udev-md-raid-arrays.rules udev-md-raid-assembly.rules udev-md-raid-creating.rules \ + udev-md-clustered-confirm-device.rules + @for file in 01-md-raid-creating.rules 63-md-raid-arrays.rules 64-md-raid-assembly.rules \ + 69-md-clustered-confirm-device.rules ; \ + do sed -e 's,BINDIR,$(BINDIR),g' udev-$${file#??-} > .install.tmp.1 && \ + $(ECHO) $(INSTALL) -D -m 644 udev-$${file#??-} $(DESTDIR)$(UDEVDIR)/rules.d/$$file ; \ + $(INSTALL) -D -m 644 .install.tmp.1 $(DESTDIR)$(UDEVDIR)/rules.d/$$file ; \ + rm -f .install.tmp.1; \ + done + +install-systemd: systemd/mdmon@.service + @for file in mdmon@.service mdmonitor.service mdadm-last-resort@.timer \ + mdadm-last-resort@.service mdadm-grow-continue@.service \ + mdcheck_start.timer mdcheck_start.service \ + mdcheck_continue.timer mdcheck_continue.service \ + mdmonitor-oneshot.timer mdmonitor-oneshot.service \ + ; \ + do sed -e 's,BINDIR,$(BINDIR),g' systemd/$$file > .install.tmp.2 && \ + $(ECHO) $(INSTALL) -D -m 644 systemd/$$file $(DESTDIR)$(SYSTEMD_DIR)/$$file ; \ + $(INSTALL) -D -m 644 .install.tmp.2 $(DESTDIR)$(SYSTEMD_DIR)/$$file ; \ + rm -f 
.install.tmp.2; \ + done + @for file in mdadm.shutdown ; \ + do sed -e 's,BINDIR,$(BINDIR),g' systemd/$$file > .install.tmp.3 && \ + $(ECHO) $(INSTALL) -D -m 755 systemd/$$file $(DESTDIR)$(SYSTEMD_DIR)-shutdown/$$file ; \ + $(INSTALL) -D -m 755 .install.tmp.3 $(DESTDIR)$(SYSTEMD_DIR)-shutdown/$$file ; \ + rm -f .install.tmp.3; \ + done + if [ -f /etc/SuSE-release -o -n "$(SUSE)" ] ;then $(INSTALL) -D -m 755 systemd/SUSE-mdadm_env.sh $(DESTDIR)$(LIB_DIR)/mdadm_env.sh ;fi + +install-bin: mdadm mdmon + $(INSTALL) -D $(STRIP) -m 755 mdadm $(DESTDIR)$(BINDIR)/mdadm + $(INSTALL) -D $(STRIP) -m 755 mdmon $(DESTDIR)$(BINDIR)/mdmon + +uninstall: + rm -f $(DESTDIR)$(MAN8DIR)/mdadm.8 $(DESTDIR)$(MAN8DIR)/mdmon.8 $(DESTDIR)$(MAN4DIR)/md.4 $(DESTDIR)$(MAN5DIR)/mdadm.conf.5 $(DESTDIR)$(BINDIR)/mdadm + +test: mdadm mdmon test_stripe swap_super raid6check + @echo "Please run './test' as root" + +clean : + rm -f mdadm mdmon $(OBJS) $(MON_OBJS) $(STATICOBJS) core *.man \ + mdadm.tcc mdadm.uclibc mdadm.static *.orig *.porig *.rej *.alt \ + .merge_file_* mdadm.Os mdadm.O2 mdmon.O2 swap_super init.cpio.gz \ + mdadm.uclibc.static test_stripe raid6check raid6check.o mdmon mdadm.8 + rm -rf cov-int + +dist : clean + ./makedist + +testdist : everything-test clean + ./makedist test + +TAGS : + etags *.h *.c + +DISTRO_MAKEFILE := $(wildcard distropkg/Makefile) +ifdef DISTRO_MAKEFILE +include $(DISTRO_MAKEFILE) +endif diff --git a/Manage.c b/Manage.c new file mode 100644 index 0000000..f789e0c --- /dev/null +++ b/Manage.c @@ -0,0 +1,1767 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2001-2013 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +#include "mdadm.h" +#include "md_u.h" +#include "md_p.h" +#include <ctype.h> + +int Manage_ro(char *devname, int fd, int readonly) +{ + /* switch to readonly or rw + * + * requires >= 0.90.0 + * first check that array is runing + * use RESTART_ARRAY_RW or STOP_ARRAY_RO + * + */ + struct mdinfo *mdi; + int rv = 0; + + /* If this is an externally-managed array, we need to modify the + * metadata_version so that mdmon doesn't undo our change. + */ + mdi = sysfs_read(fd, NULL, GET_LEVEL|GET_VERSION); + if (mdi && + mdi->array.major_version == -1 && + is_subarray(mdi->text_version)) { + char vers[64]; + strcpy(vers, "external:"); + strcat(vers, mdi->text_version); + if (readonly > 0) { + int rv; + /* We set readonly ourselves. 
*/ + vers[9] = '-'; + sysfs_set_str(mdi, NULL, "metadata_version", vers); + + close(fd); + rv = sysfs_set_str(mdi, NULL, "array_state", "readonly"); + + if (rv < 0) { + pr_err("failed to set readonly for %s: %s\n", + devname, strerror(errno)); + + vers[9] = mdi->text_version[0]; + sysfs_set_str(mdi, NULL, "metadata_version", vers); + rv = 1; + goto out; + } + } else { + char *cp; + /* We cannot set read/write - must signal mdmon */ + vers[9] = '/'; + sysfs_set_str(mdi, NULL, "metadata_version", vers); + + cp = strchr(vers+10, '/'); + if (cp) + *cp = 0; + ping_monitor(vers+10); + if (mdi->array.level <= 0) + sysfs_set_str(mdi, NULL, "array_state", "active"); + } + goto out; + } + + if (!md_array_active(fd)) { + pr_err("%s does not appear to be active.\n", devname); + rv = 1; + goto out; + } + + if (readonly > 0) { + if (ioctl(fd, STOP_ARRAY_RO, NULL)) { + pr_err("failed to set readonly for %s: %s\n", + devname, strerror(errno)); + rv = 1; + goto out; + } + } else if (readonly < 0) { + if (ioctl(fd, RESTART_ARRAY_RW, NULL)) { + pr_err("failed to set writable for %s: %s\n", + devname, strerror(errno)); + rv = 1; + goto out; + } + } +out: + sysfs_free(mdi); + return rv; +} + +static void remove_devices(char *devnm, char *path) +{ + /* + * Remove names at 'path' - possibly with + * partition suffixes - which link to the 'standard' + * name for devnm. These were probably created + * by mdadm when the array was assembled. 
+ */ + char base[40]; + char *path2; + char link[1024]; + int n; + int part; + char *be; + char *pe; + + if (!path) + return; + + sprintf(base, "/dev/%s", devnm); + be = base + strlen(base); + + path2 = xmalloc(strlen(path)+20); + strcpy(path2, path); + pe = path2 + strlen(path2); + + for (part = 0; part < 16; part++) { + if (part) { + sprintf(be, "p%d", part); + + if (isdigit(pe[-1])) + sprintf(pe, "p%d", part); + else + sprintf(pe, "%d", part); + } + n = readlink(path2, link, sizeof(link)); + if (n > 0 && (int)strlen(base) == n && + strncmp(link, base, n) == 0) + unlink(path2); + } + free(path2); +} + +int Manage_run(char *devname, int fd, struct context *c) +{ + /* Run the array. Array must already be configured + * Requires >= 0.90.0 + */ + char nm[32], *nmp; + + nmp = fd2devnm(fd); + if (!nmp) { + pr_err("Cannot find %s in sysfs!!\n", devname); + return 1; + } + strcpy(nm, nmp); + return IncrementalScan(c, nm); +} + +int Manage_stop(char *devname, int fd, int verbose, int will_retry) +{ + /* Stop the array. Array must already be configured + * 'will_retry' means that error messages are not wanted. + */ + int rv = 0; + struct map_ent *map = NULL; + struct mdinfo *mdi; + char devnm[32]; + char container[32]; + int err; + int count; + char buf[32]; + unsigned long long rd1, rd2; + + if (will_retry && verbose == 0) + verbose = -1; + + strcpy(devnm, fd2devnm(fd)); + /* Get EXCL access first. If this fails, then attempting + * to stop is probably a bad idea. 
+ */ + mdi = sysfs_read(fd, NULL, GET_LEVEL|GET_COMPONENT|GET_VERSION); + if (mdi && is_subarray(mdi->text_version)) { + char *sl; + strncpy(container, mdi->text_version+1, sizeof(container)); + container[sizeof(container)-1] = 0; + sl = strchr(container, '/'); + if (sl) + *sl = 0; + } else + container[0] = 0; + close(fd); + count = 5; + while (((fd = ((devname[0] == '/') + ?open(devname, O_RDONLY|O_EXCL) + :open_dev_flags(devnm, O_RDONLY|O_EXCL))) < 0 || + strcmp(fd2devnm(fd), devnm) != 0) && container[0] && + mdmon_running(container) && count) { + /* Can't open, so something might be wrong. However it + * is a container, so we might be racing with mdmon, so + * retry for a bit. + */ + if (fd >= 0) + close(fd); + flush_mdmon(container); + count--; + } + if (fd < 0 || strcmp(fd2devnm(fd), devnm) != 0) { + if (fd >= 0) + close(fd); + if (verbose >= 0) + pr_err("Cannot get exclusive access to %s:Perhaps a running process, mounted filesystem or active volume group?\n", + devname); + return 1; + } + /* If this is an mdmon managed array, just write 'inactive' + * to the array state and let mdmon clear up. + */ + if (mdi && + mdi->array.level > 0 && + is_subarray(mdi->text_version)) { + int err; + /* This is mdmon managed. */ + close(fd); + + /* As we had an O_EXCL open, any use of the device + * which blocks STOP_ARRAY is probably a transient use, + * so it is reasonable to retry for a while - 5 seconds. 
+ */ + count = 25; + while (count && + (err = sysfs_set_str(mdi, NULL, + "array_state", + "inactive")) < 0 && + errno == EBUSY) { + usleep(200000); + count--; + } + if (err) { + if (verbose >= 0) + pr_err("failed to stop array %s: %s\n", + devname, strerror(errno)); + rv = 1; + goto out; + } + + /* Give monitor a chance to act */ + ping_monitor(mdi->text_version); + + fd = open_dev_excl(devnm); + if (fd < 0) { + if (verbose >= 0) + pr_err("failed to completely stop %s: Device is busy\n", + devname); + rv = 1; + goto out; + } + } else if (mdi && + mdi->array.major_version == -1 && + mdi->array.minor_version == -2 && + !is_subarray(mdi->text_version)) { + struct mdstat_ent *mds, *m; + /* container, possibly mdmon-managed. + * Make sure mdmon isn't opening it, which + * would interfere with the 'stop' + */ + ping_monitor(mdi->sys_name); + + /* now check that there are no existing arrays + * which are members of this array + */ + mds = mdstat_read(0, 0); + for (m = mds; m; m = m->next) + if (m->metadata_version && + strncmp(m->metadata_version, "external:", 9)==0 && + metadata_container_matches(m->metadata_version+9, + devnm)) { + if (verbose >= 0) + pr_err("Cannot stop container %s: member %s still active\n", + devname, m->devnm); + free_mdstat(mds); + rv = 1; + goto out; + } + } + + /* If the array is undergoing a reshape which changes the number + * of devices, then it would be nice to stop it at a point where + * it has completed a full number of stripes in both old and + * new layouts as this will allow the reshape to be reverted. + * So if 'sync_action' is "reshape" and 'raid_disks' shows two + * different numbers, then + * - freeze reshape + * - set sync_max to next multiple of both data_disks and + * chunk sizes (or next but one) + * - unfreeze reshape + * - wait on 'sync_completed' for that point to be reached. 
+ */ + if (mdi && (mdi->array.level >= 4 && mdi->array.level <= 6) && + sysfs_attribute_available(mdi, NULL, "sync_action") && + sysfs_attribute_available(mdi, NULL, "reshape_direction") && + sysfs_get_str(mdi, NULL, "sync_action", buf, 20) > 0 && + strcmp(buf, "reshape\n") == 0 && + sysfs_get_two(mdi, NULL, "raid_disks", &rd1, &rd2) == 2) { + unsigned long long position, curr; + unsigned long long chunk1, chunk2; + unsigned long long rddiv, chunkdiv; + unsigned long long sectors; + unsigned long long sync_max, old_sync_max; + unsigned long long completed; + int backwards = 0; + int delay; + int scfd; + + delay = 40; + while (rd1 > rd2 && delay > 0 && + sysfs_get_ll(mdi, NULL, "sync_max", &old_sync_max) == 0) { + /* must be in the critical section - wait a bit */ + delay -= 1; + usleep(100000); + } + + if (sysfs_set_str(mdi, NULL, "sync_action", "frozen") != 0) + goto done; + /* Array is frozen */ + + rd1 -= mdi->array.level == 6 ? 2 : 1; + rd2 -= mdi->array.level == 6 ? 2 : 1; + sysfs_get_str(mdi, NULL, "reshape_direction", buf, sizeof(buf)); + if (strncmp(buf, "back", 4) == 0) + backwards = 1; + if (sysfs_get_ll(mdi, NULL, "reshape_position", &position) != 0) { + /* reshape must have finished now */ + sysfs_set_str(mdi, NULL, "sync_action", "idle"); + goto done; + } + sysfs_get_two(mdi, NULL, "chunk_size", &chunk1, &chunk2); + chunk1 /= 512; + chunk2 /= 512; + rddiv = GCD(rd1, rd2); + chunkdiv = GCD(chunk1, chunk2); + sectors = (chunk1/chunkdiv) * chunk2 * (rd1/rddiv) * rd2; + + if (backwards) { + /* Need to subtract 'reshape_position' from + * array size to get equivalent of sync_max. + * Size calculation based on raid5_size in kernel. 
+ */ + unsigned long long size = mdi->component_size; + size &= ~(chunk1-1); + size &= ~(chunk2-1); + /* rd1 must be smaller */ + /* Reshape may have progressed further backwards than + * recorded, so target even further back (hence "-1") + */ + position = (position / sectors - 1) * sectors; + /* rd1 is always the conversion factor between 'sync' + * position and 'reshape' position. + * We read 1 "new" stripe worth of data from where-ever, + * and when write out that full stripe. + */ + sync_max = size - position/rd1; + } else { + /* Reshape will very likely be beyond position, and it may + * be too late to stop at '+1', so aim for '+2' + */ + position = (position / sectors + 2) * sectors; + sync_max = position/rd1; + } + if (sysfs_get_ll(mdi, NULL, "sync_max", &old_sync_max) < 0) + old_sync_max = mdi->component_size; + /* Must not advance sync_max as that could confuse + * the reshape monitor */ + if (sync_max < old_sync_max) + sysfs_set_num(mdi, NULL, "sync_max", sync_max); + sysfs_set_str(mdi, NULL, "sync_action", "idle"); + + /* That should have set things going again. Now we + * wait a little while (3 second max) for sync_completed + * to reach the target. + * The reshape process can block for 500msec if + * the sync speed limit is hit, so we need to wait + * a lot longer than that. 1 second is usually + * enough. 3 is safe. + */ + delay = 3000; + scfd = sysfs_open(mdi->sys_name, NULL, "sync_completed"); + while (scfd >= 0 && delay > 0 && old_sync_max > 0) { + unsigned long long max_completed; + sysfs_get_ll(mdi, NULL, "reshape_position", &curr); + sysfs_fd_get_str(scfd, buf, sizeof(buf)); + if (strncmp(buf, "none", 4) == 0) { + /* Either reshape has aborted, or hasn't + * quite started yet. Wait a bit and + * check 'sync_action' to see. 
+ */ + usleep(10000); + sysfs_get_str(mdi, NULL, "sync_action", buf, sizeof(buf)); + if (strncmp(buf, "reshape", 7) != 0) + break; + } + + if (sysfs_fd_get_two(scfd, &completed, + &max_completed) == 2 && + /* 'completed' sometimes reads as max-uulong */ + completed < max_completed && + (completed > sync_max || + (completed == sync_max && curr != position))) { + while (completed > sync_max) { + sync_max += sectors / rd1; + if (backwards) + position -= sectors; + else + position += sectors; + } + if (sync_max < old_sync_max) + sysfs_set_num(mdi, NULL, "sync_max", sync_max); + } + + if (!backwards && curr >= position) + break; + if (backwards && curr <= position) + break; + sysfs_wait(scfd, &delay); + } + if (scfd >= 0) + close(scfd); + + } +done: + + /* As we have an O_EXCL open, any use of the device + * which blocks STOP_ARRAY is probably a transient use, + * so it is reasonable to retry for a while - 5 seconds. + */ + count = 25; err = 0; + while (count && fd >= 0 && + (err = ioctl(fd, STOP_ARRAY, NULL)) < 0 && errno == EBUSY) { + usleep(200000); + count --; + } + if (fd >= 0 && err) { + if (verbose >= 0) { + pr_err("failed to stop array %s: %s\n", + devname, strerror(errno)); + if (errno == EBUSY) + cont_err("Perhaps a running process, mounted filesystem or active volume group?\n"); + } + rv = 1; + goto out; + } + + if (get_linux_version() < 2006028) { + /* prior to 2.6.28, KOBJ_CHANGE was not sent when an md array + * was stopped, so We'll do it here just to be sure. Drop any + * partitions as well... + */ + if (fd >= 0) + ioctl(fd, BLKRRPART, 0); + if (mdi) + sysfs_uevent(mdi, "change"); + } + + if (devnm[0] && use_udev()) { + struct map_ent *mp = map_by_devnm(&map, devnm); + remove_devices(devnm, mp ? 
mp->path : NULL); + } + + if (verbose >= 0) + pr_err("stopped %s\n", devname); + map_lock(&map); + map_remove(&map, devnm); + map_unlock(&map); +out: + sysfs_free(mdi); + + return rv; +} + +static struct mddev_dev *add_one(struct mddev_dev *dv, char *name, char disp) +{ + struct mddev_dev *new; + new = xmalloc(sizeof(*new)); + memset(new, 0, sizeof(*new)); + new->devname = xstrdup(name); + new->disposition = disp; + new->next = dv->next; + dv->next = new; + return new; +} + +static void add_faulty(struct mddev_dev *dv, int fd, char disp) +{ + mdu_array_info_t array; + mdu_disk_info_t disk; + int remaining_disks; + int i; + + if (md_get_array_info(fd, &array) != 0) + return; + + remaining_disks = array.nr_disks; + for (i = 0; i < MAX_DISKS && remaining_disks > 0; i++) { + char buf[40]; + disk.number = i; + if (md_get_disk_info(fd, &disk) != 0) + continue; + if (disk.major == 0 && disk.minor == 0) + continue; + remaining_disks--; + if ((disk.state & 1) == 0) /* not faulty */ + continue; + sprintf(buf, "%d:%d", disk.major, disk.minor); + dv = add_one(dv, buf, disp); + } +} + +static void add_detached(struct mddev_dev *dv, int fd, char disp) +{ + mdu_array_info_t array; + mdu_disk_info_t disk; + int remaining_disks; + int i; + + if (md_get_array_info(fd, &array) != 0) + return; + + remaining_disks = array.nr_disks; + for (i = 0; i < MAX_DISKS && remaining_disks > 0; i++) { + char buf[40]; + int sfd; + disk.number = i; + if (md_get_disk_info(fd, &disk) != 0) + continue; + if (disk.major == 0 && disk.minor == 0) + continue; + remaining_disks--; + if (disp == 'f' && (disk.state & 1) != 0) /* already faulty */ + continue; + sprintf(buf, "%d:%d", disk.major, disk.minor); + sfd = dev_open(buf, O_RDONLY); + if (sfd >= 0) { + /* Not detached */ + close(sfd); + continue; + } + if (errno != ENXIO) + /* Probably not detached */ + continue; + dv = add_one(dv, buf, disp); + } +} + +static void add_set(struct mddev_dev *dv, int fd, char set_char) +{ + mdu_array_info_t array; + 
mdu_disk_info_t disk; + int remaining_disks; + int copies, set; + int i; + + if (md_get_array_info(fd, &array) != 0) + return; + if (array.level != 10) + return; + copies = ((array.layout & 0xff) * + ((array.layout >> 8) & 0xff)); + if (array.raid_disks % copies) + return; + + remaining_disks = array.nr_disks; + for (i = 0; i < MAX_DISKS && remaining_disks > 0; i++) { + char buf[40]; + disk.number = i; + if (md_get_disk_info(fd, &disk) != 0) + continue; + if (disk.major == 0 && disk.minor == 0) + continue; + remaining_disks--; + set = disk.raid_disk % copies; + if (set_char != set + 'A') + continue; + sprintf(buf, "%d:%d", disk.major, disk.minor); + dv = add_one(dv, buf, dv->disposition); + } +} + +int attempt_re_add(int fd, int tfd, struct mddev_dev *dv, + struct supertype *dev_st, struct supertype *tst, + unsigned long rdev, + char *update, char *devname, int verbose, + mdu_array_info_t *array) +{ + struct mdinfo mdi; + int duuid[4]; + int ouuid[4]; + + dev_st->ss->getinfo_super(dev_st, &mdi, NULL); + dev_st->ss->uuid_from_super(dev_st, ouuid); + if (tst->sb) + tst->ss->uuid_from_super(tst, duuid); + else + /* Assume uuid matches: kernel will check */ + memcpy(duuid, ouuid, sizeof(ouuid)); + if ((mdi.disk.state & (1<<MD_DISK_ACTIVE)) && + !(mdi.disk.state & (1<<MD_DISK_FAULTY)) && + memcmp(duuid, ouuid, sizeof(ouuid))==0) { + /* Looks like it is worth a + * try. Need to make sure + * kernel will accept it + * though. 
+ */ + mdu_disk_info_t disc; + /* re-add doesn't work for version-1 superblocks + * before 2.6.18 :-( + */ + if (array->major_version == 1 && + get_linux_version() <= 2006018) + goto skip_re_add; + disc.number = mdi.disk.number; + if (md_get_disk_info(fd, &disc) != 0 || + disc.major != 0 || disc.minor != 0) + goto skip_re_add; + disc.major = major(rdev); + disc.minor = minor(rdev); + disc.number = mdi.disk.number; + disc.raid_disk = mdi.disk.raid_disk; + disc.state = mdi.disk.state; + if (array->state & (1 << MD_SB_CLUSTERED)) { + /* extra flags are needed when adding to a cluster as + * there are two cases to distinguish + */ + if (dv->disposition == 'c') + disc.state |= (1 << MD_DISK_CANDIDATE); + else + disc.state |= (1 << MD_DISK_CLUSTER_ADD); + } + if (dv->writemostly == FlagSet) + disc.state |= 1 << MD_DISK_WRITEMOSTLY; + if (dv->writemostly == FlagClear) + disc.state &= ~(1 << MD_DISK_WRITEMOSTLY); + if (dv->failfast == FlagSet) + disc.state |= 1 << MD_DISK_FAILFAST; + if (dv->failfast == FlagClear) + disc.state &= ~(1 << MD_DISK_FAILFAST); + remove_partitions(tfd); + if (update || dv->writemostly != FlagDefault || + dv->failfast != FlagDefault) { + int rv = -1; + tfd = dev_open(dv->devname, O_RDWR); + if (tfd < 0) { + pr_err("failed to open %s for superblock update during re-add\n", dv->devname); + return -1; + } + + if (dv->writemostly == FlagSet) + rv = dev_st->ss->update_super( + dev_st, NULL, "writemostly", + devname, verbose, 0, NULL); + if (dv->writemostly == FlagClear) + rv = dev_st->ss->update_super( + dev_st, NULL, "readwrite", + devname, verbose, 0, NULL); + if (dv->failfast == FlagSet) + rv = dev_st->ss->update_super( + dev_st, NULL, "failfast", + devname, verbose, 0, NULL); + if (dv->failfast == FlagClear) + rv = dev_st->ss->update_super( + dev_st, NULL, "nofailfast", + devname, verbose, 0, NULL); + if (update) + rv = dev_st->ss->update_super( + dev_st, NULL, update, + devname, verbose, 0, NULL); + if (rv == 0) + rv = 
dev_st->ss->store_super(dev_st, tfd); + close(tfd); + if (rv != 0) { + pr_err("failed to update superblock during re-add\n"); + return -1; + } + } + /* don't even try if disk is marked as faulty */ + errno = 0; + if (ioctl(fd, ADD_NEW_DISK, &disc) == 0) { + if (verbose >= 0) + pr_err("re-added %s\n", dv->devname); + return 1; + } + if (errno == ENOMEM || errno == EROFS) { + pr_err("add new device failed for %s: %s\n", + dv->devname, strerror(errno)); + if (dv->disposition == 'M') + return 0; + return -1; + } + } +skip_re_add: + return 0; +} + +int Manage_add(int fd, int tfd, struct mddev_dev *dv, + struct supertype *tst, mdu_array_info_t *array, + int force, int verbose, char *devname, + char *update, unsigned long rdev, unsigned long long array_size, + int raid_slot) +{ + unsigned long long ldsize; + struct supertype *dev_st; + int j; + mdu_disk_info_t disc; + + if (!get_dev_size(tfd, dv->devname, &ldsize)) { + if (dv->disposition == 'M') + return 0; + else + return -1; + } + + if (tst->ss == &super0 && ldsize > 4ULL*1024*1024*1024*1024) { + /* More than 4TB is wasted on v0.90 */ + if (!force) { + pr_err("%s is larger than %s can effectively use.\n" + " Add --force is you really want to add this device.\n", + dv->devname, devname); + return -1; + } + pr_err("%s is larger than %s can effectively use.\n" + " Adding anyway as --force was given.\n", + dv->devname, devname); + } + + if (array->not_persistent == 0 || tst->ss->external) { + + /* need to find a sample superblock to copy, and + * a spare slot to use. 
+ * For 'external' array (well, container based), + * We can just load the metadata for the array-> + */ + int array_failed; + if (tst->sb) + /* already loaded */; + else if (tst->ss->external) { + tst->ss->load_container(tst, fd, NULL); + } else for (j = 0; j < tst->max_devs; j++) { + char *dev; + int dfd; + disc.number = j; + if (md_get_disk_info(fd, &disc)) + continue; + if (disc.major==0 && disc.minor==0) + continue; + if ((disc.state & 4)==0) /* sync */ + continue; + /* Looks like a good device to try */ + dev = map_dev(disc.major, disc.minor, 1); + if (!dev) + continue; + dfd = dev_open(dev, O_RDONLY); + if (dfd < 0) + continue; + if (tst->ss->load_super(tst, dfd, + NULL)) { + close(dfd); + continue; + } + close(dfd); + break; + } + /* FIXME this is a bad test to be using */ + if (!tst->sb && (dv->disposition != 'a' && + dv->disposition != 'S')) { + /* we are re-adding a device to a + * completely dead array - have to depend + * on kernel to check + */ + } else if (!tst->sb) { + pr_err("cannot load array metadata from %s\n", devname); + return -1; + } + + /* Make sure device is large enough */ + if (dv->disposition != 'j' && /* skip size check for Journal */ + tst->sb && + tst->ss->avail_size(tst, ldsize/512, INVALID_SECTORS) < + array_size) { + if (dv->disposition == 'M') + return 0; + pr_err("%s not large enough to join array\n", + dv->devname); + return -1; + } + + /* Possibly this device was recently part of + * the array and was temporarily removed, and + * is now being re-added. If so, we can + * simply re-add it. 
+ */ + + if (array->not_persistent == 0) { + dev_st = dup_super(tst); + dev_st->ss->load_super(dev_st, tfd, NULL); + if (dev_st->sb && dv->disposition != 'S') { + int rv; + + rv = attempt_re_add(fd, tfd, dv, dev_st, tst, + rdev, update, devname, + verbose, array); + dev_st->ss->free_super(dev_st); + if (rv) + return rv; + } + } + if (dv->disposition == 'M') { + if (verbose > 0) + pr_err("--re-add for %s to %s is not possible\n", + dv->devname, devname); + return 0; + } + if (dv->disposition == 'A') { + pr_err("--re-add for %s to %s is not possible\n", + dv->devname, devname); + return -1; + } + if (array->active_disks < array->raid_disks) { + char *avail = xcalloc(array->raid_disks, 1); + int d; + int found = 0; + + for (d = 0; d < MAX_DISKS && found < array->nr_disks; d++) { + disc.number = d; + if (md_get_disk_info(fd, &disc)) + continue; + if (disc.major == 0 && disc.minor == 0) + continue; + if (!(disc.state & (1<<MD_DISK_SYNC))) + continue; + avail[disc.raid_disk] = 1; + found++; + } + array_failed = !enough(array->level, array->raid_disks, + array->layout, 1, avail); + free(avail); + } else + array_failed = 0; + if (array_failed) { + pr_err("%s has failed so using --add cannot work and might destroy\n", + devname); + pr_err("data on %s. You should stop the array and re-assemble it.\n", + dv->devname); + return -1; + } + } else { + /* non-persistent. Must ensure that new drive + * is at least array->size big. + */ + if (ldsize/512 < array_size) { + pr_err("%s not large enough to join array\n", + dv->devname); + return -1; + } + } + /* committed to really trying this device now*/ + remove_partitions(tfd); + + /* in 2.6.17 and earlier, version-1 superblocks won't + * use the number we write, but will choose a free number. 
+ * we must choose the same free number, which requires + * starting at 'raid_disks' and counting up + */ + for (j = array->raid_disks; j < tst->max_devs; j++) { + disc.number = j; + if (md_get_disk_info(fd, &disc)) + break; + if (disc.major==0 && disc.minor==0) + break; + if (disc.state & 8) /* removed */ + break; + } + disc.major = major(rdev); + disc.minor = minor(rdev); + if (raid_slot < 0) + disc.number = j; + else + disc.number = raid_slot; + disc.state = 0; + + /* only add journal to array that supports journaling */ + if (dv->disposition == 'j') { + struct mdinfo *mdp; + + mdp = sysfs_read(fd, NULL, GET_ARRAY_STATE); + if (!mdp) { + pr_err("%s unable to read array state.\n", devname); + return -1; + } + + if (mdp->array_state != ARRAY_READONLY) { + sysfs_free(mdp); + pr_err("%s is not readonly, cannot add journal.\n", devname); + return -1; + } + + sysfs_free(mdp); + + disc.raid_disk = 0; + } + + if (array->not_persistent==0) { + int dfd; + if (dv->disposition == 'j') + disc.state |= (1 << MD_DISK_JOURNAL) | (1 << MD_DISK_SYNC); + if (dv->writemostly == FlagSet) + disc.state |= 1 << MD_DISK_WRITEMOSTLY; + if (dv->failfast == FlagSet) + disc.state |= 1 << MD_DISK_FAILFAST; + dfd = dev_open(dv->devname, O_RDWR | O_EXCL|O_DIRECT); + if (tst->ss->add_to_super(tst, &disc, dfd, + dv->devname, INVALID_SECTORS)) + return -1; + if (tst->ss->write_init_super(tst)) + return -1; + } else if (dv->disposition == 'A') { + /* this had better be raid1. + * As we are "--re-add"ing we must find a spare slot + * to fill. 
+ */ + char *used = xcalloc(array->raid_disks, 1); + for (j = 0; j < tst->max_devs; j++) { + mdu_disk_info_t disc2; + disc2.number = j; + if (md_get_disk_info(fd, &disc2)) + continue; + if (disc2.major==0 && disc2.minor==0) + continue; + if (disc2.state & 8) /* removed */ + continue; + if (disc2.raid_disk < 0) + continue; + if (disc2.raid_disk > array->raid_disks) + continue; + used[disc2.raid_disk] = 1; + } + for (j = 0 ; j < array->raid_disks; j++) + if (!used[j]) { + disc.raid_disk = j; + disc.state |= (1<<MD_DISK_SYNC); + break; + } + free(used); + } + + if (array->state & (1 << MD_SB_CLUSTERED)) { + if (dv->disposition == 'c') + disc.state |= (1 << MD_DISK_CANDIDATE); + else + disc.state |= (1 << MD_DISK_CLUSTER_ADD); + } + + if (dv->writemostly == FlagSet) + disc.state |= (1 << MD_DISK_WRITEMOSTLY); + if (dv->failfast == FlagSet) + disc.state |= (1 << MD_DISK_FAILFAST); + if (tst->ss->external) { + /* add a disk + * to an external metadata container */ + struct mdinfo new_mdi; + struct mdinfo *sra; + int container_fd; + char devnm[32]; + int dfd; + + strcpy(devnm, fd2devnm(fd)); + + container_fd = open_dev_excl(devnm); + if (container_fd < 0) { + pr_err("add failed for %s: could not get exclusive access to container\n", + dv->devname); + tst->ss->free_super(tst); + return -1; + } + + /* Check if metadata handler is able to accept the drive */ + if (!tst->ss->validate_geometry(tst, LEVEL_CONTAINER, 0, 1, NULL, + 0, 0, dv->devname, NULL, 0, 1)) { + close(container_fd); + return -1; + } + + Kill(dv->devname, NULL, 0, -1, 0); + dfd = dev_open(dv->devname, O_RDWR | O_EXCL|O_DIRECT); + if (tst->ss->add_to_super(tst, &disc, dfd, + dv->devname, INVALID_SECTORS)) { + close(dfd); + close(container_fd); + return -1; + } + if (!mdmon_running(tst->container_devnm)) + tst->ss->sync_metadata(tst); + + sra = sysfs_read(container_fd, NULL, 0); + if (!sra) { + pr_err("add failed for %s: sysfs_read failed\n", + dv->devname); + close(container_fd); + tst->ss->free_super(tst); + 
return -1; + } + sra->array.level = LEVEL_CONTAINER; + /* Need to set data_offset and component_size */ + tst->ss->getinfo_super(tst, &new_mdi, NULL); + new_mdi.disk.major = disc.major; + new_mdi.disk.minor = disc.minor; + new_mdi.recovery_start = 0; + /* Make sure fds are closed as they are O_EXCL which + * would block add_disk */ + tst->ss->free_super(tst); + if (sysfs_add_disk(sra, &new_mdi, 0) != 0) { + pr_err("add new device to external metadata failed for %s\n", dv->devname); + close(container_fd); + sysfs_free(sra); + return -1; + } + ping_monitor(devnm); + sysfs_free(sra); + close(container_fd); + } else { + tst->ss->free_super(tst); + if (ioctl(fd, ADD_NEW_DISK, &disc)) { + if (dv->disposition == 'j') + pr_err("Failed to hot add %s as journal, " + "please try restart %s.\n", dv->devname, devname); + else + pr_err("add new device failed for %s as %d: %s\n", + dv->devname, j, strerror(errno)); + return -1; + } + if (dv->disposition == 'j') { + pr_err("Journal added successfully, making %s read-write\n", devname); + if (Manage_ro(devname, fd, -1)) + pr_err("Failed to make %s read-write\n", devname); + } + + } + if (verbose >= 0) + pr_err("added %s\n", dv->devname); + return 1; +} + +int Manage_remove(struct supertype *tst, int fd, struct mddev_dev *dv, + int sysfd, unsigned long rdev, int force, int verbose, char *devname) +{ + int lfd = -1; + int err; + + if (tst->ss->external) { + /* To remove a device from a container, we must + * check that it isn't in use in an array. + * This involves looking in the 'holders' + * directory - there must be just one entry, + * the container. 
+ * To ensure that it doesn't get used as a + * hot spare while we are checking, we + * get an O_EXCL open on the container + */ + int ret; + char devnm[32]; + strcpy(devnm, fd2devnm(fd)); + lfd = open_dev_excl(devnm); + if (lfd < 0) { + pr_err("Cannot get exclusive access to container - odd\n"); + return -1; + } + /* We may not be able to check on holders in + * sysfs, either because we don't have the dev num + * (rdev == 0) or because the device has been detached + * and the 'holders' directory no longer exists + * (ret == -1). In that case, assume it is OK to + * remove. + */ + if (rdev == 0) + ret = -1; + else { + /* + * The drive has already been set to 'faulty', however + * monitor might not have had time to process it and the + * drive might still have an entry in the 'holders' + * directory. Try a few times to avoid a false error + */ + int count = 20; + + do { + ret = sysfs_unique_holder(devnm, rdev); + if (ret < 2) + break; + usleep(100 * 1000); /* 100ms */ + } while (--count > 0); + + if (ret == 0) { + pr_err("%s is not a member, cannot remove.\n", + dv->devname); + close(lfd); + return -1; + } + if (ret >= 2) { + pr_err("%s is still in use, cannot remove.\n", + dv->devname); + close(lfd); + return -1; + } + } + } + /* FIXME check that it is a current member */ + if (sysfd >= 0) { + /* device has been removed and we don't know + * the major:minor number + */ + err = sys_hot_remove_disk(sysfd, force); + } else { + err = hot_remove_disk(fd, rdev, force); + if (err && errno == ENODEV) { + /* Old kernels rejected this if no personality + * is registered */ + struct mdinfo *sra = sysfs_read(fd, NULL, GET_DEVS); + struct mdinfo *dv = NULL; + if (sra) + dv = sra->devs; + for ( ; dv ; dv=dv->next) + if (dv->disk.major == (int)major(rdev) && + dv->disk.minor == (int)minor(rdev)) + break; + if (dv) + err = sysfs_set_str(sra, dv, + "state", "remove"); + else + err = -1; + sysfs_free(sra); + } + } + if (err) { + pr_err("hot remove failed for %s: %s\n", dv->devname, 
+ strerror(errno)); + if (lfd >= 0) + close(lfd); + return -1; + } + if (tst->ss->external) { + /* + * Before dropping our exclusive open we make an + * attempt at preventing mdmon from seeing an + * 'add' event before reconciling this 'remove' + * event. + */ + char *devnm = fd2devnm(fd); + + if (!devnm) { + pr_err("unable to get container name\n"); + return -1; + } + + ping_manager(devnm); + } + if (lfd >= 0) + close(lfd); + if (verbose >= 0) + pr_err("hot removed %s from %s\n", + dv->devname, devname); + return 1; +} + +int Manage_replace(struct supertype *tst, int fd, struct mddev_dev *dv, + unsigned long rdev, int verbose, char *devname) +{ + struct mdinfo *mdi, *di; + if (tst->ss->external) { + pr_err("--replace only supported for native metadata (0.90 or 1.x)\n"); + return -1; + } + /* Need to find the device in sysfs and add 'want_replacement' to the + * status. + */ + mdi = sysfs_read(fd, NULL, GET_DEVS); + if (!mdi || !mdi->devs) { + pr_err("Cannot find status of %s to enable replacement - strange\n", + devname); + return -1; + } + for (di = mdi->devs; di; di = di->next) + if (di->disk.major == (int)major(rdev) && + di->disk.minor == (int)minor(rdev)) + break; + if (di) { + int rv; + if (di->disk.raid_disk < 0) { + pr_err("%s is not active and so cannot be replaced.\n", + dv->devname); + sysfs_free(mdi); + return -1; + } + rv = sysfs_set_str(mdi, di, + "state", "want_replacement"); + if (rv) { + sysfs_free(mdi); + pr_err("Failed to request replacement for %s\n", + dv->devname); + return -1; + } + if (verbose >= 0) + pr_err("Marked %s (device %d in %s) for replacement\n", + dv->devname, di->disk.raid_disk, devname); + /* If there is a matching 'with', we need to tell it which + * raid disk + */ + while (dv && dv->disposition != 'W') + dv = dv->next; + if (dv) { + dv->disposition = 'w'; + dv->used = di->disk.raid_disk; + } + return 1; + } + sysfs_free(mdi); + pr_err("%s not found in %s so cannot --replace it\n", + dv->devname, devname); + return -1; +} + 
+int Manage_with(struct supertype *tst, int fd, struct mddev_dev *dv, + unsigned long rdev, int verbose, char *devname) +{ + struct mdinfo *mdi, *di; + /* try to set 'slot' for 'rdev' in 'fd' to 'dv->used' */ + mdi = sysfs_read(fd, NULL, GET_DEVS|GET_STATE); + if (!mdi || !mdi->devs) { + pr_err("Cannot find status of %s to enable replacement - strange\n", + devname); + return -1; + } + for (di = mdi->devs; di; di = di->next) + if (di->disk.major == (int)major(rdev) && + di->disk.minor == (int)minor(rdev)) + break; + if (di) { + int rv; + if (di->disk.state & (1<<MD_DISK_FAULTY)) { + pr_err("%s is faulty and cannot be a replacement\n", + dv->devname); + sysfs_free(mdi); + return -1; + } + if (di->disk.raid_disk >= 0) { + pr_err("%s is active and cannot be a replacement\n", + dv->devname); + sysfs_free(mdi); + return -1; + } + rv = sysfs_set_num(mdi, di, + "slot", dv->used); + if (rv) { + sysfs_free(mdi); + pr_err("Failed to set %s as preferred replacement.\n", + dv->devname); + return -1; + } + if (verbose >= 0) + pr_err("Marked %s in %s as replacement for device %d\n", + dv->devname, devname, dv->used); + return 1; + } + sysfs_free(mdi); + pr_err("%s not found in %s so cannot make it preferred replacement\n", + dv->devname, devname); + return -1; +} + +int Manage_subdevs(char *devname, int fd, + struct mddev_dev *devlist, int verbose, int test, + char *update, int force) +{ + /* Do something to each dev. + * devmode can be + * 'a' - add the device + * 'S' - add the device as a spare - don't try re-add + * 'j' - add the device as a journal device + * 'A' - re-add the device + * 'r' - remove the device: HOT_REMOVE_DISK + * device can be 'faulty' or 'detached' in which case all + * matching devices are removed. + * 'f' - set the device faulty SET_DISK_FAULTY + * device can be 'detached' in which case any device that + * is inaccessible will be marked faulty. + * 'R' - mark this device as wanting replacement. 
+ * 'W' - this device is added if necessary and activated as + * a replacement for a previous 'R' device. + * ----- + * 'w' - 'W' will be changed to 'w' when it is paired with + * a 'R' device. If a 'W' is found while walking the list + * it must be unpaired, and is an error. + * 'M' - this is created by a 'missing' target. It is a slight + * variant on 'A' + * 'F' - Another variant of 'A', where the device was faulty + * so must be removed from the array first. + * 'c' - confirm the device as found (for clustered environments) + * + * For 'f' and 'r', the device can also be a kernel-internal + * name such as 'sdb'. + */ + mdu_array_info_t array; + unsigned long long array_size; + struct mddev_dev *dv; + int tfd = -1; + struct supertype *tst; + char *subarray = NULL; + int sysfd = -1; + int count = 0; /* number of actions taken */ + struct mdinfo info; + struct mdinfo devinfo; + int frozen = 0; + int busy = 0; + int raid_slot = -1; + + if (sysfs_init(&info, fd, NULL)) { + pr_err("sysfs not availabile for %s\n", devname); + goto abort; + } + + if (md_get_array_info(fd, &array)) { + pr_err("Cannot get array info for %s\n", devname); + goto abort; + } + /* array.size is only 32 bits and may be truncated. 
+ * So read from sysfs if possible, and record number of sectors + */ + + array_size = get_component_size(fd); + if (array_size <= 0) + array_size = array.size * 2; + + tst = super_by_fd(fd, &subarray); + if (!tst) { + pr_err("unsupport array - version %d.%d\n", + array.major_version, array.minor_version); + goto abort; + } + + for (dv = devlist; dv; dv = dv->next) { + dev_t rdev = 0; /* device to add/remove etc */ + int rv; + int mj,mn; + + raid_slot = -1; + if (dv->disposition == 'c') { + rv = parse_cluster_confirm_arg(dv->devname, + &dv->devname, + &raid_slot); + if (rv) { + pr_err("Could not get the devname of cluster\n"); + goto abort; + } + } + + if (strcmp(dv->devname, "failed") == 0 || + strcmp(dv->devname, "faulty") == 0) { + if (dv->disposition != 'A' && dv->disposition != 'r') { + pr_err("%s only meaningful with -r or --re-add, not -%c\n", + dv->devname, dv->disposition); + goto abort; + } + add_faulty(dv, fd, (dv->disposition == 'A' + ? 'F' : 'r')); + continue; + } + if (strcmp(dv->devname, "detached") == 0) { + if (dv->disposition != 'r' && dv->disposition != 'f') { + pr_err("%s only meaningful with -r of -f, not -%c\n", + dv->devname, dv->disposition); + goto abort; + } + add_detached(dv, fd, dv->disposition); + continue; + } + + if (strcmp(dv->devname, "missing") == 0) { + struct mddev_dev *add_devlist; + struct mddev_dev **dp; + if (dv->disposition == 'c') { + rv = ioctl(fd, CLUSTERED_DISK_NACK, NULL); + break; + } + + if (dv->disposition != 'A') { + pr_err("'missing' only meaningful with --re-add\n"); + goto abort; + } + add_devlist = conf_get_devs(); + if (add_devlist == NULL) { + pr_err("no devices to scan for missing members.\n"); + continue; + } + for (dp = &add_devlist; *dp; dp = & (*dp)->next) + /* 'M' (for 'missing') is like 'A' without errors */ + (*dp)->disposition = 'M'; + *dp = dv->next; + dv->next = add_devlist; + continue; + } + + if (strncmp(dv->devname, "set-", 4) == 0 && + strlen(dv->devname) == 5) { + int copies; + + if 
(dv->disposition != 'r' && + dv->disposition != 'f') { + pr_err("'%s' only meaningful with -r or -f\n", + dv->devname); + goto abort; + } + if (array.level != 10) { + pr_err("'%s' only meaningful with RAID10 arrays\n", + dv->devname); + goto abort; + } + copies = ((array.layout & 0xff) * + ((array.layout >> 8) & 0xff)); + if (array.raid_disks % copies != 0 || + dv->devname[4] < 'A' || + dv->devname[4] >= 'A' + copies || + copies > 26) { + pr_err("'%s' not meaningful with this array\n", + dv->devname); + goto abort; + } + add_set(dv, fd, dv->devname[4]); + continue; + } + + if (strchr(dv->devname, '/') == NULL && + strchr(dv->devname, ':') == NULL && + strlen(dv->devname) < 50) { + /* Assume this is a kernel-internal name like 'sda1' */ + int found = 0; + char dname[55]; + if (dv->disposition != 'r' && dv->disposition != 'f') { + pr_err("%s only meaningful with -r or -f, not -%c\n", + dv->devname, dv->disposition); + goto abort; + } + + sprintf(dname, "dev-%s", dv->devname); + sysfd = sysfs_open(fd2devnm(fd), dname, "block/dev"); + if (sysfd >= 0) { + char dn[20]; + if (sysfs_fd_get_str(sysfd, dn, 20) > 0 && + sscanf(dn, "%d:%d", &mj,&mn) == 2) { + rdev = makedev(mj,mn); + found = 1; + } + close(sysfd); + sysfd = -1; + } + if (!found) { + sysfd = sysfs_open(fd2devnm(fd), dname, "state"); + if (sysfd < 0) { + pr_err("%s does not appear to be a component of %s\n", + dv->devname, devname); + goto abort; + } + } + } else if ((dv->disposition == 'r' || + dv->disposition == 'f') && + get_maj_min(dv->devname, &mj, &mn)) { + /* for 'fail' and 'remove', the device might + * not exist. + */ + rdev = makedev(mj, mn); + } else { + tfd = dev_open(dv->devname, O_RDONLY); + if (tfd >= 0) { + fstat_is_blkdev(tfd, dv->devname, &rdev); + close(tfd); + } else { + int open_err = errno; + if (!stat_is_blkdev(dv->devname, &rdev)) { + if (dv->disposition == 'M') + /* non-fatal. 
Also improbable */ + continue; + goto abort; + } + if (dv->disposition == 'r') + /* Be happy, the stat worked, that is + * enough for --remove + */ + ; + else { + if (dv->disposition == 'M') + /* non-fatal */ + continue; + pr_err("Cannot open %s: %s\n", + dv->devname, strerror(open_err)); + goto abort; + } + } + } + switch(dv->disposition){ + default: + pr_err("internal error - devmode[%s]=%d\n", + dv->devname, dv->disposition); + goto abort; + case 'a': + case 'S': /* --add-spare */ + case 'j': /* --add-journal */ + case 'A': + case 'M': /* --re-add missing */ + case 'F': /* --re-add faulty */ + case 'c': /* --cluster-confirm */ + /* add the device */ + if (subarray) { + pr_err("Cannot add disks to a \'member\' array, perform this operation on the parent container\n"); + goto abort; + } + + /* Let's first try to write re-add to sysfs */ + if (rdev != 0 && + (dv->disposition == 'A' || dv->disposition == 'F')) { + sysfs_init_dev(&devinfo, rdev); + if (sysfs_set_str(&info, &devinfo, "state", "re-add") == 0) { + pr_err("re-add %s to %s succeed\n", + dv->devname, info.sys_name); + break; + } + } + + if (dv->disposition == 'F') + /* Need to remove first */ + hot_remove_disk(fd, rdev, force); + /* Make sure it isn't in use (in 2.6 or later) */ + tfd = dev_open(dv->devname, O_RDONLY|O_EXCL); + if (tfd >= 0) { + /* We know no-one else is using it. We'll + * need non-exclusive access to add it, so + * do that now. 
+ */ + close(tfd); + tfd = dev_open(dv->devname, O_RDONLY); + } + if (tfd < 0) { + if (dv->disposition == 'M') + continue; + pr_err("Cannot open %s: %s\n", + dv->devname, strerror(errno)); + goto abort; + } + if (!frozen) { + if (sysfs_freeze_array(&info) == 1) + frozen = 1; + else + frozen = -1; + } + rv = Manage_add(fd, tfd, dv, tst, &array, + force, verbose, devname, update, + rdev, array_size, raid_slot); + close(tfd); + tfd = -1; + if (rv < 0) + goto abort; + if (rv > 0) + count++; + break; + + case 'r': + /* hot remove */ + if (subarray) { + pr_err("Cannot remove disks from a \'member\' array, perform this operation on the parent container\n"); + rv = -1; + } else + rv = Manage_remove(tst, fd, dv, sysfd, + rdev, verbose, force, + devname); + if (sysfd >= 0) + close(sysfd); + sysfd = -1; + if (rv < 0) + goto abort; + if (rv > 0) + count++; + break; + + case 'f': /* set faulty */ + /* FIXME check current member */ + if ((sysfd >= 0 && write(sysfd, "faulty", 6) != 6) || + (sysfd < 0 && ioctl(fd, SET_DISK_FAULTY, + rdev))) { + if (errno == EBUSY) + busy = 1; + pr_err("set device faulty failed for %s: %s\n", + dv->devname, strerror(errno)); + if (sysfd >= 0) + close(sysfd); + goto abort; + } + if (sysfd >= 0) + close(sysfd); + sysfd = -1; + count++; + if (verbose >= 0) + pr_err("set %s faulty in %s\n", + dv->devname, devname); + break; + case 'R': /* Mark as replaceable */ + if (subarray) { + pr_err("Cannot replace disks in a \'member\' array, perform this operation on the parent container\n"); + rv = -1; + } else { + if (!frozen) { + if (sysfs_freeze_array(&info) == 1) + frozen = 1; + else + frozen = -1; + } + rv = Manage_replace(tst, fd, dv, + rdev, verbose, + devname); + } + if (rv < 0) + goto abort; + if (rv > 0) + count++; + break; + case 'W': /* --with device that doesn't match */ + pr_err("No matching --replace device for --with %s\n", + dv->devname); + goto abort; + case 'w': /* --with device which was matched */ + rv = Manage_with(tst, fd, dv, + rdev, 
verbose, devname); + if (rv < 0) + goto abort; + break; + } + } + if (frozen > 0) + sysfs_set_str(&info, NULL, "sync_action","idle"); + if (test && count == 0) + return 2; + return 0; + +abort: + if (frozen > 0) + sysfs_set_str(&info, NULL, "sync_action","idle"); + return !test && busy ? 2 : 1; +} + +int autodetect(void) +{ + /* Open any md device, and issue the RAID_AUTORUN ioctl */ + int rv = 1; + int fd = dev_open("9:0", O_RDONLY); + if (fd >= 0) { + if (ioctl(fd, RAID_AUTORUN, 0) == 0) + rv = 0; + close(fd); + } + return rv; +} + +int Update_subarray(char *dev, char *subarray, char *update, struct mddev_ident *ident, int verbose) +{ + struct supertype supertype, *st = &supertype; + int fd, rv = 2; + + memset(st, 0, sizeof(*st)); + + fd = open_subarray(dev, subarray, st, verbose < 0); + if (fd < 0) + return 2; + + if (!st->ss->update_subarray) { + if (verbose >= 0) + pr_err("Operation not supported for %s metadata\n", + st->ss->name); + goto free_super; + } + + if (mdmon_running(st->devnm)) + st->update_tail = &st->updates; + + rv = st->ss->update_subarray(st, subarray, update, ident); + + if (rv) { + if (verbose >= 0) + pr_err("Failed to update %s of subarray-%s in %s\n", + update, subarray, dev); + } else if (st->update_tail) + flush_metadata_updates(st); + else + st->ss->sync_metadata(st); + + if (rv == 0 && strcmp(update, "name") == 0 && verbose >= 0) + pr_err("Updated subarray-%s name from %s, UUIDs may have changed\n", + subarray, dev); + + free_super: + st->ss->free_super(st); + close(fd); + + return rv; +} + +/* Move spare from one array to another If adding to destination array fails + * add back to original array. 
+ * Returns 1 on success, 0 on failure */ +int move_spare(char *from_devname, char *to_devname, dev_t devid) +{ + struct mddev_dev devlist; + char devname[20]; + + /* try to remove and add */ + int fd1 = open(to_devname, O_RDONLY); + int fd2 = open(from_devname, O_RDONLY); + + if (fd1 < 0 || fd2 < 0) { + if (fd1 >= 0) + close(fd1); + if (fd2 >= 0) + close(fd2); + return 0; + } + + devlist.next = NULL; + devlist.used = 0; + devlist.writemostly = FlagDefault; + devlist.failfast = FlagDefault; + devlist.devname = devname; + sprintf(devname, "%d:%d", major(devid), minor(devid)); + + devlist.disposition = 'r'; + if (Manage_subdevs(from_devname, fd2, &devlist, -1, 0, NULL, 0) == 0) { + devlist.disposition = 'a'; + if (Manage_subdevs(to_devname, fd1, &devlist, -1, 0, + NULL, 0) == 0) { + /* make sure manager is aware of changes */ + ping_manager(to_devname); + ping_manager(from_devname); + close(fd1); + close(fd2); + return 1; + } + else + Manage_subdevs(from_devname, fd2, &devlist, + -1, 0, NULL, 0); + } + close(fd1); + close(fd2); + return 0; +} diff --git a/Monitor.c b/Monitor.c new file mode 100644 index 0000000..30c031a --- /dev/null +++ b/Monitor.c @@ -0,0 +1,1275 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +#include "mdadm.h" +#include "md_p.h" +#include "md_u.h" +#include <sys/wait.h> +#include <signal.h> +#include <limits.h> +#include <syslog.h> +#ifndef NO_LIBUDEV +#include <libudev.h> +#endif + +struct state { + char *devname; + char devnm[32]; /* to sync with mdstat info */ + unsigned int utime; + int err; + char *spare_group; + int active, working, failed, spare, raid; + int from_config; + int from_auto; + int expected_spares; + int devstate[MAX_DISKS]; + dev_t devid[MAX_DISKS]; + int percent; + char parent_devnm[32]; /* For subarray, devnm of parent. + * For others, "" + */ + struct supertype *metadata; + struct state *subarray;/* for a container it is a link to first subarray + * for a subarray it is a link to next subarray + * in the same container */ + struct state *parent; /* for a subarray it is a link to its container + */ + struct state *next; +}; + +struct alert_info { + char *mailaddr; + char *mailfrom; + char *alert_cmd; + int dosyslog; +}; +static int make_daemon(char *pidfile); +static int check_one_sharer(int scan); +static void write_autorebuild_pid(void); +static void alert(char *event, char *dev, char *disc, struct alert_info *info); +static int check_array(struct state *st, struct mdstat_ent *mdstat, + int test, struct alert_info *info, + int increments, char *prefer); +static int add_new_arrays(struct mdstat_ent *mdstat, struct state **statelist, + int test, struct alert_info *info); +static void try_spare_migration(struct state *statelist, struct alert_info *info); +static void link_containers_with_subarrays(struct state *list); +#ifndef NO_LIBUDEV +static int check_udev_activity(void); +#endif + +int Monitor(struct mddev_dev *devlist, + char *mailaddr, char *alert_cmd, + 
struct context *c, + int daemonise, int oneshot, + int dosyslog, char *pidfile, int increments, + int share) +{ + /* + * Every few seconds, scan every md device looking for changes + * When a change is found, log it, possibly run the alert command, + * and possibly send Email + * + * For each array, we record: + * Update time + * active/working/failed/spare drives + * State of each device. + * %rebuilt if rebuilding + * + * If the update time changes, check out all the data again + * It is possible that we cannot get the state of each device + * due to bugs in the md kernel module. + * We also read /proc/mdstat to get rebuild percent, + * and to get state on all active devices incase of kernel bug. + * + * Events are: + * Fail + * An active device had Faulty set or Active/Sync removed + * FailSpare + * A spare device had Faulty set + * SpareActive + * An active device had a reverse transition + * RebuildStarted + * percent went from -1 to +ve + * RebuildNN + * percent went from below to not-below NN% + * DeviceDisappeared + * Couldn't access a device which was previously visible + * + * if we detect an array with active<raid and spare==0 + * we look at other arrays that have same spare-group + * If we find one with active==raid and spare>0, + * and if we can get_disk_info and find a name + * Then we hot-remove and hot-add to the other array + * + * If devlist is NULL, then we can monitor everything because --scan + * was given. We get an initial list from config file and add anything + * that appears in /proc/mdstat + */ + + struct state *statelist = NULL; + struct state *st2; + int finished = 0; + struct mdstat_ent *mdstat = NULL; + char *mailfrom; + struct alert_info info; + struct mddev_ident *mdlist; + int delay_for_event = c->delay; + + if (!mailaddr) { + mailaddr = conf_get_mailaddr(); + if (mailaddr && ! 
c->scan) + pr_err("Monitor using email address \"%s\" from config file\n", + mailaddr); + } + mailfrom = conf_get_mailfrom(); + + if (!alert_cmd) { + alert_cmd = conf_get_program(); + if (alert_cmd && !c->scan) + pr_err("Monitor using program \"%s\" from config file\n", + alert_cmd); + } + if (c->scan && !mailaddr && !alert_cmd && !dosyslog) { + pr_err("No mail address or alert command - not monitoring.\n"); + return 1; + } + info.alert_cmd = alert_cmd; + info.mailaddr = mailaddr; + info.mailfrom = mailfrom; + info.dosyslog = dosyslog; + + if (share){ + if (check_one_sharer(c->scan)) + return 1; + } + + if (daemonise) { + int rv = make_daemon(pidfile); + if (rv >= 0) + return rv; + } + + if (share) + write_autorebuild_pid(); + + if (devlist == NULL) { + mdlist = conf_get_ident(NULL); + for (; mdlist; mdlist = mdlist->next) { + struct state *st; + + if (mdlist->devname == NULL) + continue; + if (strcasecmp(mdlist->devname, "<ignore>") == 0) + continue; + st = xcalloc(1, sizeof *st); + if (mdlist->devname[0] == '/') + st->devname = xstrdup(mdlist->devname); + else { + st->devname = xmalloc(8+strlen(mdlist->devname)+1); + strcpy(strcpy(st->devname, "/dev/md/"), + mdlist->devname); + } + st->next = statelist; + st->devnm[0] = 0; + st->percent = RESYNC_UNKNOWN; + st->from_config = 1; + st->expected_spares = mdlist->spare_disks; + if (mdlist->spare_group) + st->spare_group = xstrdup(mdlist->spare_group); + statelist = st; + } + } else { + struct mddev_dev *dv; + + for (dv = devlist; dv; dv = dv->next) { + struct state *st = xcalloc(1, sizeof *st); + mdlist = conf_get_ident(dv->devname); + st->devname = xstrdup(dv->devname); + st->next = statelist; + st->devnm[0] = 0; + st->percent = RESYNC_UNKNOWN; + st->expected_spares = -1; + if (mdlist) { + st->expected_spares = mdlist->spare_disks; + if (mdlist->spare_group) + st->spare_group = xstrdup(mdlist->spare_group); + } + statelist = st; + } + } + + while (!finished) { + int new_found = 0; + struct state *st, **stp; + int 
anydegraded = 0; + int anyredundant = 0; + + if (mdstat) + free_mdstat(mdstat); + mdstat = mdstat_read(oneshot ? 0 : 1, 0); + + for (st = statelist; st; st = st->next) { + if (check_array(st, mdstat, c->test, &info, + increments, c->prefer)) + anydegraded = 1; + /* for external arrays, metadata is filled for + * containers only + */ + if (st->metadata && st->metadata->ss->external) + continue; + if (st->err == 0 && !anyredundant) + anyredundant = 1; + } + + /* now check if there are any new devices found in mdstat */ + if (c->scan) + new_found = add_new_arrays(mdstat, &statelist, c->test, + &info); + + /* If an array has active < raid && spare == 0 && spare_group != NULL + * Look for another array with spare > 0 and active == raid and same spare_group + * if found, choose a device and hotremove/hotadd + */ + if (share && anydegraded) + try_spare_migration(statelist, &info); + if (!new_found) { + if (oneshot) + break; + else if (!anyredundant) { + pr_err("No array with redundancy detected, stopping\n"); + break; + } + else { +#ifndef NO_LIBUDEV + /* + * Wait for udevd to finish new devices + * processing. 
+ */ + if (mdstat_wait(delay_for_event) && + check_udev_activity()) + pr_err("Error while waiting for UDEV to complete new devices processing\n"); +#else + int wait_result = mdstat_wait(delay_for_event); + /* + * Give chance to process new device + */ + if (wait_result != 0) { + if (c->delay > 5) + delay_for_event = 5; + } else + delay_for_event = c->delay; +#endif + mdstat_close(); + } + } + c->test = 0; + + for (stp = &statelist; (st = *stp) != NULL; ) { + if (st->from_auto && st->err > 5) { + *stp = st->next; + free(st->devname); + free(st->spare_group); + free(st); + } else + stp = &st->next; + } + } + for (st2 = statelist; st2; st2 = statelist) { + statelist = st2->next; + free(st2); + } + + if (pidfile) + unlink(pidfile); + return 0; +} + +static int make_daemon(char *pidfile) +{ + /* Return: + * -1 in the forked daemon + * 0 in the parent + * 1 on error + * so a none-negative becomes the exit code. + */ + int pid = fork(); + if (pid > 0) { + if (!pidfile) + printf("%d\n", pid); + else { + FILE *pid_file = NULL; + int fd = open(pidfile, O_WRONLY | O_CREAT | O_TRUNC, + 0644); + if (fd >= 0) + pid_file = fdopen(fd, "w"); + if (!pid_file) + perror("cannot create pid file"); + else { + fprintf(pid_file,"%d\n", pid); + fclose(pid_file); + } + } + return 0; + } + if (pid < 0) { + perror("daemonise"); + return 1; + } + manage_fork_fds(0); + setsid(); + return -1; +} + +static int check_one_sharer(int scan) +{ + int pid; + FILE *comm_fp; + FILE *fp; + char comm_path[PATH_MAX]; + char path[PATH_MAX]; + char comm[20]; + + sprintf(path, "%s/autorebuild.pid", MDMON_DIR); + fp = fopen(path, "r"); + if (fp) { + if (fscanf(fp, "%d", &pid) != 1) + pid = -1; + snprintf(comm_path, sizeof(comm_path), + "/proc/%d/comm", pid); + comm_fp = fopen(comm_path, "r"); + if (comm_fp) { + if (fscanf(comm_fp, "%19s", comm) && + strncmp(basename(comm), Name, strlen(Name)) == 0) { + if (scan) { + pr_err("Only one autorebuild process allowed in scan mode, aborting\n"); + fclose(comm_fp); + 
fclose(fp); + return 1; + } else { + pr_err("Warning: One autorebuild process already running.\n"); + } + } + fclose(comm_fp); + } + fclose(fp); + } + return 0; +} + +static void write_autorebuild_pid() +{ + char path[PATH_MAX]; + int pid; + FILE *fp = NULL; + sprintf(path, "%s/autorebuild.pid", MDMON_DIR); + + if (mkdir(MDMON_DIR, 0700) < 0 && errno != EEXIST) { + pr_err("Can't create autorebuild.pid file\n"); + } else { + int fd = open(path, O_WRONLY | O_CREAT | O_TRUNC, 0700); + + if (fd >= 0) + fp = fdopen(fd, "w"); + + if (!fp) + pr_err("Can't create autorebuild.pid file\n"); + else { + pid = getpid(); + fprintf(fp, "%d\n", pid); + fclose(fp); + } + } +} + +static void alert(char *event, char *dev, char *disc, struct alert_info *info) +{ + int priority; + + if (!info->alert_cmd && !info->mailaddr && !info->dosyslog) { + time_t now = time(0); + + printf("%1.15s: %s on %s %s\n", ctime(&now) + 4, + event, dev, disc?disc:"unknown device"); + } + if (info->alert_cmd) { + int pid = fork(); + switch(pid) { + default: + waitpid(pid, NULL, 0); + break; + case -1: + break; + case 0: + execl(info->alert_cmd, info->alert_cmd, + event, dev, disc, NULL); + exit(2); + } + } + if (info->mailaddr && (strncmp(event, "Fail", 4) == 0 || + strncmp(event, "Test", 4) == 0 || + strncmp(event, "Spares", 6) == 0 || + strncmp(event, "Degrade", 7) == 0)) { + FILE *mp = popen(Sendmail, "w"); + if (mp) { + FILE *mdstat; + char hname[256]; + gethostname(hname, sizeof(hname)); + signal(SIGPIPE, SIG_IGN); + if (info->mailfrom) + fprintf(mp, "From: %s\n", info->mailfrom); + else + fprintf(mp, "From: %s monitoring <root>\n", + Name); + fprintf(mp, "To: %s\n", info->mailaddr); + fprintf(mp, "Subject: %s event on %s:%s\n\n", + event, dev, hname); + + fprintf(mp, + "This is an automatically generated mail message from %s\n", Name); + fprintf(mp, "running on %s\n\n", hname); + + fprintf(mp, + "A %s event had been detected on md device %s.\n\n", event, dev); + + if (disc && disc[0] != ' ') + 
fprintf(mp, + "It could be related to component device %s.\n\n", disc); + if (disc && disc[0] == ' ') + fprintf(mp, "Extra information:%s.\n\n", disc); + + fprintf(mp, "Faithfully yours, etc.\n"); + + mdstat = fopen("/proc/mdstat", "r"); + if (mdstat) { + char buf[8192]; + int n; + fprintf(mp, + "\nP.S. The /proc/mdstat file currently contains the following:\n\n"); + while ((n = fread(buf, 1, sizeof(buf), + mdstat)) > 0) + n = fwrite(buf, 1, n, mp); + fclose(mdstat); + } + pclose(mp); + } + } + + /* log the event to syslog maybe */ + if (info->dosyslog) { + /* Log at a different severity depending on the event. + * + * These are the critical events: */ + if (strncmp(event, "Fail", 4) == 0 || + strncmp(event, "Degrade", 7) == 0 || + strncmp(event, "DeviceDisappeared", 17) == 0) + priority = LOG_CRIT; + /* Good to know about, but are not failures: */ + else if (strncmp(event, "Rebuild", 7) == 0 || + strncmp(event, "MoveSpare", 9) == 0 || + strncmp(event, "Spares", 6) != 0) + priority = LOG_WARNING; + /* Everything else: */ + else + priority = LOG_INFO; + + if (disc && disc[0] != ' ') + syslog(priority, + "%s event detected on md device %s, component device %s", event, dev, disc); + else if (disc) + syslog(priority, + "%s event detected on md device %s: %s", + event, dev, disc); + else + syslog(priority, + "%s event detected on md device %s", + event, dev); + } +} + +static int check_array(struct state *st, struct mdstat_ent *mdstat, + int test, struct alert_info *ainfo, + int increments, char *prefer) +{ + /* Update the state 'st' to reflect any changes shown in mdstat, + * or found by directly examining the array, and return + * '1' if the array is degraded, or '0' if it is optimal (or dead). 
+ */ + struct { int state, major, minor; } info[MAX_DISKS]; + struct mdinfo *sra = NULL; + mdu_array_info_t array; + struct mdstat_ent *mse = NULL, *mse2; + char *dev = st->devname; + int fd; + int i; + int remaining_disks; + int last_disk; + int new_array = 0; + int retval; + int is_container = 0; + unsigned long redundancy_only_flags = 0; + + if (test) + alert("TestMessage", dev, NULL, ainfo); + + retval = 0; + + fd = open(dev, O_RDONLY); + if (fd < 0) + goto disappeared; + + if (st->devnm[0] == 0) + strcpy(st->devnm, fd2devnm(fd)); + + for (mse2 = mdstat; mse2; mse2 = mse2->next) + if (strcmp(mse2->devnm, st->devnm) == 0) { + mse2->devnm[0] = 0; /* flag it as "used" */ + mse = mse2; + } + + if (!mse) { + /* duplicated array in statelist + * or re-created after reading mdstat + */ + st->err++; + goto out; + } + + if (mse->level == NULL) + is_container = 1; + + if (!is_container && !md_array_active(fd)) + goto disappeared; + + fcntl(fd, F_SETFD, FD_CLOEXEC); + if (md_get_array_info(fd, &array) < 0) + goto disappeared; + + if (!is_container && map_name(pers, mse->level) > 0) + redundancy_only_flags |= GET_MISMATCH; + + sra = sysfs_read(-1, st->devnm, GET_LEVEL | GET_DISKS | GET_DEVS | + GET_STATE | redundancy_only_flags); + + if (!sra) + goto disappeared; + + /* It's much easier to list what array levels can't + * have a device disappear than all of them that can + */ + if (sra->array.level == 0 || sra->array.level == -1) { + if (!st->err && !st->from_config) + alert("DeviceDisappeared", dev, " Wrong-Level", ainfo); + st->err++; + goto out; + } + + /* this array is in /proc/mdstat */ + if (array.utime == 0) + /* external arrays don't update utime, so + * just make sure it is always different. 
*/ + array.utime = st->utime + 1;; + + if (st->err) { + /* New array appeared where previously had an error */ + st->err = 0; + st->percent = RESYNC_NONE; + new_array = 1; + if (!is_container) + alert("NewArray", st->devname, NULL, ainfo); + } + + if (st->utime == array.utime && st->failed == sra->array.failed_disks && + st->working == sra->array.working_disks && + st->spare == sra->array.spare_disks && + (mse == NULL || (mse->percent == st->percent))) { + if ((st->active < st->raid) && st->spare == 0) + retval = 1; + goto out; + } + if (st->utime == 0 && /* new array */ + mse->pattern && strchr(mse->pattern, '_') /* degraded */) + alert("DegradedArray", dev, NULL, ainfo); + + if (st->utime == 0 && /* new array */ st->expected_spares > 0 && + sra->array.spare_disks < st->expected_spares) + alert("SparesMissing", dev, NULL, ainfo); + if (st->percent < 0 && st->percent != RESYNC_UNKNOWN && + mse->percent >= 0) + alert("RebuildStarted", dev, NULL, ainfo); + if (st->percent >= 0 && mse->percent >= 0 && + (mse->percent / increments) > (st->percent / increments)) { + char percentalert[18]; + /* + * "RebuildNN" (10 chars) or "RebuildStarted" (15 chars) + */ + + if((mse->percent / increments) == 0) + snprintf(percentalert, sizeof(percentalert), + "RebuildStarted"); + else + snprintf(percentalert, sizeof(percentalert), + "Rebuild%02d", mse->percent); + + alert(percentalert, dev, NULL, ainfo); + } + + if (mse->percent == RESYNC_NONE && st->percent >= 0) { + /* Rebuild/sync/whatever just finished. + * If there is a number in /mismatch_cnt, + * we should report that. 
+ */ + if (sra && sra->mismatch_cnt > 0) { + char cnt[80]; + snprintf(cnt, sizeof(cnt), + " mismatches found: %d (on raid level %d)", + sra->mismatch_cnt, sra->array.level); + alert("RebuildFinished", dev, cnt, ainfo); + } else + alert("RebuildFinished", dev, NULL, ainfo); + } + st->percent = mse->percent; + + remaining_disks = sra->array.nr_disks; + for (i = 0; i < MAX_DISKS && remaining_disks > 0; i++) { + mdu_disk_info_t disc; + disc.number = i; + if (md_get_disk_info(fd, &disc) >= 0) { + info[i].state = disc.state; + info[i].major = disc.major; + info[i].minor = disc.minor; + if (disc.major || disc.minor) + remaining_disks --; + } else + info[i].major = info[i].minor = 0; + } + last_disk = i; + + if (mse->metadata_version && + strncmp(mse->metadata_version, "external:", 9) == 0 && + is_subarray(mse->metadata_version+9)) { + char *sl; + strcpy(st->parent_devnm, mse->metadata_version + 10); + sl = strchr(st->parent_devnm, '/'); + if (sl) + *sl = 0; + } else + st->parent_devnm[0] = 0; + if (st->metadata == NULL && st->parent_devnm[0] == 0) + st->metadata = super_by_fd(fd, NULL); + + for (i = 0; i < MAX_DISKS; i++) { + mdu_disk_info_t disc = {0, 0, 0, 0, 0}; + int newstate = 0; + int change; + char *dv = NULL; + disc.number = i; + if (i < last_disk && (info[i].major || info[i].minor)) { + newstate = info[i].state; + dv = map_dev_preferred(info[i].major, info[i].minor, 1, + prefer); + disc.state = newstate; + disc.major = info[i].major; + disc.minor = info[i].minor; + } else + newstate = (1 << MD_DISK_REMOVED); + + if (dv == NULL && st->devid[i]) + dv = map_dev_preferred(major(st->devid[i]), + minor(st->devid[i]), 1, prefer); + change = newstate ^ st->devstate[i]; + if (st->utime && change && !st->err && !new_array) { + if ((st->devstate[i]&change) & (1 << MD_DISK_SYNC)) + alert("Fail", dev, dv, ainfo); + else if ((newstate & (1 << MD_DISK_FAULTY)) && + (disc.major || disc.minor) && + st->devid[i] == makedev(disc.major, + disc.minor)) + alert("FailSpare", dev, dv, 
ainfo); + else if ((newstate&change) & (1 << MD_DISK_SYNC)) + alert("SpareActive", dev, dv, ainfo); + } + st->devstate[i] = newstate; + st->devid[i] = makedev(disc.major, disc.minor); + } + st->active = sra->array.active_disks; + st->working = sra->array.working_disks; + st->spare = sra->array.spare_disks; + st->failed = sra->array.failed_disks; + st->utime = array.utime; + st->raid = sra->array.raid_disks; + st->err = 0; + if ((st->active < st->raid) && st->spare == 0) + retval = 1; + + out: + if (sra) + sysfs_free(sra); + if (fd >= 0) + close(fd); + return retval; + + disappeared: + if (!st->err && !is_container) + alert("DeviceDisappeared", dev, NULL, ainfo); + st->err++; + goto out; +} + +static int add_new_arrays(struct mdstat_ent *mdstat, struct state **statelist, + int test, struct alert_info *info) +{ + struct mdstat_ent *mse; + int new_found = 0; + char *name; + + for (mse = mdstat; mse; mse = mse->next) + if (mse->devnm[0] && (!mse->level || /* retrieve containers */ + (strcmp(mse->level, "raid0") != 0 && + strcmp(mse->level, "linear") != 0))) { + struct state *st = xcalloc(1, sizeof *st); + mdu_array_info_t array; + int fd; + + name = get_md_name(mse->devnm); + if (!name) { + free(st); + continue; + } + + st->devname = xstrdup(name); + if ((fd = open(st->devname, O_RDONLY)) < 0 || + md_get_array_info(fd, &array) < 0) { + /* no such array */ + if (fd >= 0) + close(fd); + put_md_name(st->devname); + free(st->devname); + if (st->metadata) { + st->metadata->ss->free_super(st->metadata); + free(st->metadata); + } + free(st); + continue; + } + close(fd); + st->next = *statelist; + st->err = 1; + st->from_auto = 1; + strcpy(st->devnm, mse->devnm); + st->percent = RESYNC_UNKNOWN; + st->expected_spares = -1; + if (mse->metadata_version && + strncmp(mse->metadata_version, + "external:", 9) == 0 && + is_subarray(mse->metadata_version+9)) { + char *sl; + strcpy(st->parent_devnm, + mse->metadata_version+10); + sl = strchr(st->parent_devnm, '/'); + *sl = 0; + } else + 
st->parent_devnm[0] = 0; + *statelist = st; + if (test) + alert("TestMessage", st->devname, NULL, info); + new_found = 1; + } + return new_found; +} + +static int get_required_spare_criteria(struct state *st, + struct spare_criteria *sc) +{ + int fd; + + if (!st->metadata || !st->metadata->ss->get_spare_criteria) { + sc->min_size = 0; + sc->sector_size = 0; + return 0; + } + + fd = open(st->devname, O_RDONLY); + if (fd < 0) + return 1; + if (st->metadata->ss->external) + st->metadata->ss->load_container(st->metadata, fd, st->devname); + else + st->metadata->ss->load_super(st->metadata, fd, st->devname); + close(fd); + if (!st->metadata->sb) + return 1; + + st->metadata->ss->get_spare_criteria(st->metadata, sc); + st->metadata->ss->free_super(st->metadata); + + return 0; +} + +static int check_donor(struct state *from, struct state *to) +{ + struct state *sub; + + if (from == to) + return 0; + if (from->parent) + /* Cannot move from a member */ + return 0; + if (from->err) + return 0; + for (sub = from->subarray; sub; sub = sub->subarray) + /* If source array has degraded subarrays, don't + * remove anything + */ + if (sub->active < sub->raid) + return 0; + if (from->metadata->ss->external == 0) + if (from->active < from->raid) + return 0; + if (from->spare <= 0) + return 0; + return 1; +} + +static dev_t choose_spare(struct state *from, struct state *to, + struct domainlist *domlist, struct spare_criteria *sc) +{ + int d; + dev_t dev = 0; + + for (d = from->raid; !dev && d < MAX_DISKS; d++) { + if (from->devid[d] > 0 && from->devstate[d] == 0) { + struct dev_policy *pol; + unsigned long long dev_size; + unsigned int dev_sector_size; + + if (to->metadata->ss->external && + test_partition_from_id(from->devid[d])) + continue; + + if (sc->min_size && + dev_size_from_id(from->devid[d], &dev_size) && + dev_size < sc->min_size) + continue; + + if (sc->sector_size && + dev_sector_size_from_id(from->devid[d], + &dev_sector_size) && + sc->sector_size != dev_sector_size) + 
continue; + + pol = devid_policy(from->devid[d]); + if (from->spare_group) + pol_add(&pol, pol_domain, + from->spare_group, NULL); + if (domain_test(domlist, pol, + to->metadata->ss->name) == 1) + dev = from->devid[d]; + dev_policy_free(pol); + } + } + return dev; +} + +static dev_t container_choose_spare(struct state *from, struct state *to, + struct domainlist *domlist, + struct spare_criteria *sc, int active) +{ + /* This is similar to choose_spare, but we cannot trust devstate, + * so we need to read the metadata instead + */ + struct mdinfo *list; + struct supertype *st = from->metadata; + int fd = open(from->devname, O_RDONLY); + int err; + dev_t dev = 0; + + if (fd < 0) + return 0; + if (!st->ss->getinfo_super_disks) { + close(fd); + return 0; + } + + err = st->ss->load_container(st, fd, NULL); + close(fd); + if (err) + return 0; + + if (from == to) { + /* We must check if number of active disks has not increased + * since ioctl in main loop. mdmon may have added spare + * to subarray. 
If so we do not need to look for more spares + * so return non zero value */ + int active_cnt = 0; + struct mdinfo *dp; + list = st->ss->getinfo_super_disks(st); + if (!list) { + st->ss->free_super(st); + return 1; + } + dp = list->devs; + while (dp) { + if (dp->disk.state & (1 << MD_DISK_SYNC) && + !(dp->disk.state & (1 << MD_DISK_FAULTY))) + active_cnt++; + dp = dp->next; + } + sysfs_free(list); + if (active < active_cnt) { + /* Spare just activated.*/ + st->ss->free_super(st); + return 1; + } + } + + /* We only need one spare so full list not needed */ + list = container_choose_spares(st, sc, domlist, from->spare_group, + to->metadata->ss->name, 1); + if (list) { + struct mdinfo *disks = list->devs; + if (disks) + dev = makedev(disks->disk.major, disks->disk.minor); + sysfs_free(list); + } + st->ss->free_super(st); + return dev; +} + +static void try_spare_migration(struct state *statelist, struct alert_info *info) +{ + struct state *from; + struct state *st; + struct spare_criteria sc; + + link_containers_with_subarrays(statelist); + for (st = statelist; st; st = st->next) + if (st->active < st->raid && st->spare == 0 && !st->err) { + struct domainlist *domlist = NULL; + int d; + struct state *to = st; + + if (to->parent_devnm[0] && !to->parent) + /* subarray monitored without parent container + * we can't move spares here */ + continue; + + if (to->parent) + /* member of a container */ + to = to->parent; + + if (get_required_spare_criteria(to, &sc)) + continue; + if (to->metadata->ss->external) { + /* We must make sure there is + * no suitable spare in container already. 
+ * If there is we don't add more */ + dev_t devid = container_choose_spare( + to, to, NULL, &sc, st->active); + if (devid > 0) + continue; + } + for (d = 0; d < MAX_DISKS; d++) + if (to->devid[d]) + domainlist_add_dev(&domlist, + to->devid[d], + to->metadata->ss->name); + if (to->spare_group) + domain_add(&domlist, to->spare_group); + /* + * No spare migration if the destination + * has no domain. Skip this array. + */ + if (!domlist) + continue; + for (from=statelist ; from ; from=from->next) { + dev_t devid; + if (!check_donor(from, to)) + continue; + if (from->metadata->ss->external) + devid = container_choose_spare( + from, to, domlist, &sc, 0); + else + devid = choose_spare(from, to, domlist, + &sc); + if (devid > 0 && + move_spare(from->devname, to->devname, + devid)) { + alert("MoveSpare", to->devname, + from->devname, info); + break; + } + } + domain_free(domlist); + } +} + +/* search the statelist to connect external + * metadata subarrays with their containers + * We always completely rebuild the tree from scratch as + * that is safest considering the possibility of entries + * disappearing or changing. + */ +static void link_containers_with_subarrays(struct state *list) +{ + struct state *st; + struct state *cont; + for (st = list; st; st = st->next) { + st->parent = NULL; + st->subarray = NULL; + } + for (st = list; st; st = st->next) + if (st->parent_devnm[0]) + for (cont = list; cont; cont = cont->next) + if (!cont->err && cont->parent_devnm[0] == 0 && + strcmp(cont->devnm, st->parent_devnm) == 0) { + st->parent = cont; + st->subarray = cont->subarray; + cont->subarray = st; + break; + } +} + +#ifndef NO_LIBUDEV +/* function: check_udev_activity + * Description: Function waits for udev to finish + * events processing. 
+ * Returns: + * 1 - detected error while opening udev + * 2 - timeout + * 0 - successfull completion + */ +static int check_udev_activity(void) +{ + struct udev *udev = NULL; + struct udev_queue *udev_queue = NULL; + int timeout_cnt = 30; + int rc = 0; + + /* + * In rare cases systemd may not have udevm, + * in such cases just exit with rc 0 + */ + if (!use_udev()) + goto out; + + udev = udev_new(); + if (!udev) { + rc = 1; + goto out; + } + + udev_queue = udev_queue_new(udev); + if (!udev_queue) { + rc = 1; + goto out; + } + + if (udev_queue_get_queue_is_empty(udev_queue)) + goto out; + + while (!udev_queue_get_queue_is_empty(udev_queue)) { + sleep(1); + + if (timeout_cnt) + timeout_cnt--; + else { + rc = 2; + goto out; + } + } + +out: + if (udev_queue) + udev_queue_unref(udev_queue); + if (udev) + udev_unref(udev); + return rc; +} +#endif + +/* Not really Monitor but ... */ +int Wait(char *dev) +{ + char devnm[32]; + dev_t rdev; + char *tmp; + int rv = 1; + int frozen_remaining = 3; + + if (!stat_is_blkdev(dev, &rdev)) + return 2; + + tmp = devid2devnm(rdev); + if (!tmp) { + pr_err("Cannot get md device name.\n"); + return 2; + } + + strcpy(devnm, tmp); + + while(1) { + struct mdstat_ent *ms = mdstat_read(1, 0); + struct mdstat_ent *e; + + for (e = ms; e; e = e->next) + if (strcmp(e->devnm, devnm) == 0) + break; + + if (e && e->percent == RESYNC_NONE) { + /* We could be in the brief pause before something + * starts. /proc/mdstat doesn't show that, but + * sync_action does. 
+ */ + struct mdinfo mdi; + char buf[21]; + + if (sysfs_init(&mdi, -1, devnm)) + return 2; + if (sysfs_get_str(&mdi, NULL, "sync_action", + buf, 20) > 0 && + strcmp(buf,"idle\n") != 0) { + e->percent = RESYNC_UNKNOWN; + if (strcmp(buf, "frozen\n") == 0) { + if (frozen_remaining == 0) + e->percent = RESYNC_NONE; + else + frozen_remaining -= 1; + } + } + } + if (!e || e->percent == RESYNC_NONE) { + if (e && e->metadata_version && + strncmp(e->metadata_version, "external:", 9) == 0) { + if (is_subarray(&e->metadata_version[9])) + ping_monitor(&e->metadata_version[9]); + else + ping_monitor(devnm); + } + free_mdstat(ms); + return rv; + } + free_mdstat(ms); + rv = 0; + mdstat_wait(5); + } +} + +/* The state "broken" is used only for RAID0/LINEAR - it's the same as + * "clean", but used in case the array has one or more members missing. + */ +static char *clean_states[] = { + "clear", "inactive", "readonly", "read-auto", "clean", "broken", NULL }; + +int WaitClean(char *dev, int verbose) +{ + int fd; + struct mdinfo *mdi; + int rv = 1; + char devnm[32]; + + if (!stat_is_blkdev(dev, NULL)) + return 2; + fd = open(dev, O_RDONLY); + if (fd < 0) { + if (verbose) + pr_err("Couldn't open %s: %s\n", dev, strerror(errno)); + return 1; + } + + strcpy(devnm, fd2devnm(fd)); + mdi = sysfs_read(fd, devnm, GET_VERSION|GET_LEVEL|GET_SAFEMODE); + if (!mdi) { + if (verbose) + pr_err("Failed to read sysfs attributes for %s\n", dev); + close(fd); + return 0; + } + + switch(mdi->array.level) { + case LEVEL_LINEAR: + case LEVEL_MULTIPATH: + case 0: + /* safemode delay is irrelevant for these levels */ + rv = 0; + } + + /* for internal metadata the kernel handles the final clean + * transition, containers can never be dirty + */ + if (!is_subarray(mdi->text_version)) + rv = 0; + + /* safemode disabled ? 
*/ + if (mdi->safe_mode_delay == 0) + rv = 0; + + if (rv) { + int state_fd = sysfs_open(fd2devnm(fd), NULL, "array_state"); + char buf[20]; + int delay = 5000; + + /* minimize the safe_mode_delay and prepare to wait up to 5s + * for writes to quiesce + */ + sysfs_set_safemode(mdi, 1); + + /* wait for array_state to be clean */ + while (1) { + rv = read(state_fd, buf, sizeof(buf)); + if (rv < 0) + break; + if (sysfs_match_word(buf, clean_states) < + (int)ARRAY_SIZE(clean_states) - 1) + break; + rv = sysfs_wait(state_fd, &delay); + if (rv < 0 && errno != EINTR) + break; + lseek(state_fd, 0, SEEK_SET); + } + if (rv < 0) + rv = 1; + else if (ping_monitor(mdi->text_version) == 0) { + /* we need to ping to close the window between array + * state transitioning to clean and the metadata being + * marked clean + */ + rv = 0; + } else { + rv = 1; + pr_err("Error connecting monitor with %s\n", dev); + } + if (rv && verbose) + pr_err("Error waiting for %s to be clean\n", dev); + + /* restore the original safe_mode_delay */ + sysfs_set_safemode(mdi, mdi->safe_mode_delay); + close(state_fd); + } + + sysfs_free(mdi); + close(fd); + + return rv; +} @@ -0,0 +1,140 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2002-2009 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +#include "mdadm.h" +#include "md_p.h" +#include "md_u.h" + +int Query(char *dev) +{ + /* Give a brief description of the device, + * whether it is an md device and whether it has + * a superblock + * Returns 1 if 'dev' cannot be opened, 0 otherwise. + */ + int fd; + int ioctlerr, staterr; + int superror; + int level, raid_disks, spare_disks; + struct mdinfo info; + struct mdinfo *sra; + struct supertype *st = NULL; + /* stays 0 when the size is not queried (e.g. fstat failed) */ + unsigned long long larray_size = 0; + struct stat stb; + char *mddev; + mdu_disk_info_t disc; + char *activity; + + fd = open(dev, O_RDONLY); + if (fd < 0){ + pr_err("cannot open %s: %s\n", dev, strerror(errno)); + return 1; + } + + if (fstat(fd, &stb) < 0) + staterr = errno; + else + staterr = 0; + + ioctlerr = 0; + + sra = sysfs_read(fd, dev, GET_DISKS | GET_LEVEL | GET_DEVS | GET_STATE); + if (sra) { + level = sra->array.level; + raid_disks = sra->array.raid_disks; + spare_disks = sra->array.spare_disks; + /* only the counts are needed; release the sysfs data now */ + sysfs_free(sra); + } else { + mdu_array_info_t array; + + if (md_get_array_info(fd, &array) < 0) { + ioctlerr = errno; + level = -1; + raid_disks = -1; + spare_disks = -1; + } else { + level = array.level; + raid_disks = array.raid_disks; + spare_disks = array.spare_disks; + } + } + + if (!ioctlerr && !staterr) { + if (!get_dev_size(fd, NULL, &larray_size)) + larray_size = 0; + } + + if (ioctlerr == ENODEV) + printf("%s: is an md device which is not active\n", dev); + else if (ioctlerr && major(stb.st_rdev) != MD_MAJOR) + printf("%s: is not an md array\n", dev); + else if (ioctlerr) + printf("%s: is an md device, but gives \"%s\" when queried\n", + dev, strerror(ioctlerr)); + else { + printf("%s: %s %s %d devices, %d spare%s. 
Use mdadm --detail for more detail.\n", + dev, human_size_brief(larray_size,IEC), + map_num(pers, level), raid_disks, + spare_disks, spare_disks == 1 ? "" : "s"); + } + st = guess_super(fd); + if (st && st->ss->compare_super != NULL) + superror = st->ss->load_super(st, fd, dev); + else + superror = -1; + close(fd); + if (superror == 0) { + /* array might be active... */ + int uuid[4]; + struct map_ent *me, *map = NULL; + st->ss->getinfo_super(st, &info, NULL); + st->ss->uuid_from_super(st, uuid); + me = map_by_uuid(&map, uuid); + if (me) { + mddev = me->path; + disc.number = info.disk.number; + activity = "undetected"; + if (mddev && (fd = open(mddev, O_RDONLY))>=0) { + if (md_array_active(fd)) { + if (md_get_disk_info(fd, &disc) >= 0 && + makedev((unsigned)disc.major,(unsigned)disc.minor) == stb.st_rdev) + activity = "active"; + else + activity = "mismatch"; + } + close(fd); + } + } else { + activity = "inactive"; + mddev = "array"; + } + printf("%s: device %d in %d device %s %s %s. Use mdadm --examine for more detail.\n", + dev, + info.disk.number, info.array.raid_disks, + activity, + map_num(pers, info.array.level), + mddev); + if (st->ss == &super0) + put_md_name(mddev); + } + return 0; +} diff --git a/README.initramfs b/README.initramfs new file mode 100644 index 0000000..c5fa668 --- /dev/null +++ b/README.initramfs @@ -0,0 +1,122 @@ +Assembling md arrays at boot time. +--------------------------------- +December 2005 + +These notes apply to 2.6 kernels only and, in some cases, +to 2.6.15 or later. + +Md arrays can be assembled at boot time using the 'autodetect' functionality +which is triggered by storing components of an array in partitions of type +'fd' - Linux Raid Autodetect. +They can also be assembled by specifying the component devices in a +kernel parameter such as + md=0,/dev/sda,/dev/sdb +In this case, /dev/md0 will be assembled (because of the 0) from the listed +devices.
+ +These mechanisms, while useful, do not provide complete functionality +and are unlikely to be extended. The preferred way to assemble md +arrays at boot time is using 'mdadm'. To assemble an array which +contains the root filesystem, mdadm needs to be run before that +filesystem is mounted, and so needs to be run from an initial-ram-fs. +It is how this can work that is the primary focus of this document. + +It should be noted up front that only the array containing the root +filesystem should be assembled from the initramfs. Any other arrays +should be assembled under the control of files on the main filesystem +as this enhances flexibility and maintainability. + +A minimal initramfs for assembling md arrays can be created using 3 +files and one directory. These are: + +/bin Directory +/bin/mdadm statically linked mdadm binary +/bin/busybox statically linked busybox binary +/bin/sh hard link to /bin/busybox +/init a shell script which calls mdadm appropriately. + +An example init script is: + +============================================== +#!/bin/sh + +echo 'Auto-assembling boot md array' +mkdir /proc +mount -t proc proc /proc +if [ -n "$rootuuid" ] +then arg=--uuid=$rootuuid +elif [ -n "$mdminor" ] +then arg=--super-minor=$mdminor +else arg=--super-minor=0 +fi +echo "Using $arg" +mdadm -Acpartitions $arg --auto=part /dev/mda +cd / +mount /dev/mda1 /root || mount /dev/mda /root +umount /proc +cd /root +exec chroot . /sbin/init < /dev/console > /dev/console 2>&1 +============================================= + +This could certainly be extended, or merged into a larger init script. +Though tested and in production use, it is not presented here as +"The Right Way" to do it, but as a useful example. +Some key points are: + + /proc needs to be mounted so that /proc/partitions can be accessed + by mdadm, and so that /proc/filesystems can be accessed by mount. + + The uuid of the array can be passed in as a kernel parameter + (rootuuid).
As the kernel doesn't use this value, it is made available + in the environment for /init + + If no uuid is given, we default to md0, (--super-minor=0) which is + commonly used to store the root filesystem. This may not work in + all situations. + + We assemble the array as a partitionable array (/dev/mda) even if we + end up using the whole array. There is no cost in using the partitionable + interface, and in this context it is simpler. + + We try mounting both /dev/mda1 and /dev/mda as they are the most likely + part of the array to contain the root filesystem. + + The --auto flag is given to mdadm so that it will create /dev/md* + files automatically. This is needed as /dev will not contain + any md files, and udev will not create them (as udev only creates device + files after the device exists, and mdadm needs the device file to create + the device). Note that the created md files may not exist in /dev + of the mounted root filesystem. This needs to be dealt with separately + from mdadm - possibly using udev. + + We do not need to create device files for the components which will + be assembled into /dev/mda. mdadm finds the major/minor numbers from + /proc/partitions and creates a temporary /dev file if one doesn't already + exist. + +The script "mkinitramfs" which is included with the mdadm distribution +can be used to create a minimal initramfs. It creates a file called +'init.cpio.gz' which can be specified as an 'initrd' to lilo or grub +(or whatever boot loader is being used). + + + + +Resume from an md array +----------------------- + +If you want to make use of the suspend-to-disk/resume functionality in Linux, +and want to have swap on an md array, you will need to assemble the array +before resume is possible. +However, because the array is active in the resumed image, you do not want +anything written to any drives during the resume process, such as superblock +updates or array resync.
+ +This can be achieved in 2.6.15-rc1 and later kernels using the +'start_readonly' module parameter. +Simply include the command + echo 1 > /sys/module/md_mod/parameters/start_ro +before assembling the array with 'mdadm'. +You can then echo + 9:0 +or whatever is appropriate to /sys/power/resume to trigger the resume. diff --git a/ReadMe.c b/ReadMe.c new file mode 100644 index 0000000..8139976 --- /dev/null +++ b/ReadMe.c @@ -0,0 +1,656 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2001-2016 Neil Brown <neilb@suse.com> + * Copyright (C) 2016-2017 Jes Sorensen <Jes.Sorensen@gmail.com> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + * Maintainer: Jes Sorensen + * Email: <Jes.Sorensen@gmail.com> + */ + +#include "mdadm.h" + +#ifndef VERSION +#define VERSION "4.2" +#endif +#ifndef VERS_DATE +#define VERS_DATE "2021-12-30" +#endif +#ifndef EXTRAVERSION +#define EXTRAVERSION "" +#endif +char Version[] = "mdadm - v" VERSION " - " VERS_DATE EXTRAVERSION "\n"; + +/* + * File: ReadMe.c + * + * This file contains general comments about the implementation + * and the various usage messages that can be displayed by mdadm + * + */ + +/* + * mdadm has 7 major modes of operation: + * 1/ Create + * This mode is used to create a new array with a superblock + * 2/ Assemble + * This mode is used to assemble the parts of a previously created + * array into an active array. Components can be explicitly given + * or can be searched for. mdadm (optionally) checks that the components + * do form a bona-fide array, and can, on request, fiddle superblock + * version numbers so as to assemble a faulty array. + * 3/ Build + * This is for building legacy arrays without superblocks + * 4/ Manage + * This is for doing something to one or more devices + * in an array, such as add,remove,fail. + * run/stop/readonly/readwrite are also available + * 5/ Misc + * This is for doing things to individual devices. + * They might be parts of an array so + * zero-superblock, examine might be appropriate + * They might be md arrays so + * run,stop,rw,ro,detail might be appropriate + * Also query will treat it as either + * 6/ Monitor + * This mode never exits but just monitors arrays and reports changes. + * 7/ Grow + * This mode allows for changing of key attributes of a raid array, such + * as size, number of devices, and possibly even layout. 
+ * 8/ Incremental + * Is assembles an array incrementally instead of all at once. + * As devices are discovered they can be passed to "mdadm --incremental" + * which will collect them. When enough devices to for an array are + * found, it is started. + */ + +char short_options[]="-ABCDEFGIQhVXYWZ:vqbc:i:l:p:m:r:n:x:u:c:d:z:U:N:safRSow1tye:k"; +char short_bitmap_options[]= + "-ABCDEFGIQhVXYWZ:vqb:c:i:l:p:m:r:n:x:u:c:d:z:U:N:sarfRSow1tye:k:"; +char short_bitmap_auto_options[]= + "-ABCDEFGIQhVXYWZ:vqb:c:i:l:p:m:r:n:x:u:c:d:z:U:N:sa:rfRSow1tye:k:"; + +struct option long_options[] = { + {"manage", 0, 0, ManageOpt}, + {"misc", 0, 0, MiscOpt}, + {"assemble", 0, 0, 'A'}, + {"build", 0, 0, 'B'}, + {"create", 0, 0, 'C'}, + {"detail", 0, 0, 'D'}, + {"examine", 0, 0, 'E'}, + {"follow", 0, 0, 'F'}, + {"grow", 0, 0, 'G'}, + {"incremental",0,0, 'I'}, + {"zero-superblock", 0, 0, KillOpt}, /* deliberately not a short_option */ + {"query", 0, 0, 'Q'}, + {"examine-bitmap", 0, 0, 'X'}, + {"auto-detect", 0, 0, AutoDetect}, + {"detail-platform", 0, 0, DetailPlatform}, + {"kill-subarray", 1, 0, KillSubarray}, + {"update-subarray", 1, 0, UpdateSubarray}, + {"udev-rules", 2, 0, UdevRules}, + {"offroot", 0, 0, OffRootOpt}, + {"examine-badblocks", 0, 0, ExamineBB}, + + {"dump", 1, 0, Dump}, + {"restore", 1, 0, Restore}, + + /* synonyms */ + {"monitor", 0, 0, 'F'}, + + /* after those will normally come the name of the md device */ + + {"help", 0, 0, 'h'}, + {"help-options",0,0, HelpOptions}, + {"version", 0, 0, 'V'}, + {"verbose", 0, 0, 'v'}, + {"quiet", 0, 0, 'q'}, + + /* For create or build: */ + {"chunk", 1, 0, ChunkSize}, + {"rounding", 1, 0, ChunkSize}, /* for linear, chunk is really a + * rounding number */ + {"level", 1, 0, 'l'}, /* 0,1,4,5,6,linear */ + {"parity", 1, 0, Layout}, /* {left,right}-{a,}symmetric */ + {"layout", 1, 0, Layout}, + {"raid-disks",1, 0, 'n'}, + {"raid-devices",1, 0, 'n'}, + {"spare-disks",1,0, 'x'}, + {"spare-devices",1,0, 'x'}, + {"size", 1, 0, 'z'}, + 
{"auto", 1, 0, Auto}, /* also for --assemble */ + {"assume-clean",0,0, AssumeClean }, + {"metadata", 1, 0, 'e'}, /* superblock format */ + {"bitmap", 1, 0, Bitmap}, + {"bitmap-chunk", 1, 0, BitmapChunk}, + {"write-behind", 2, 0, WriteBehind}, + {"write-mostly",0, 0, WriteMostly}, + {"failfast", 0, 0, FailFast}, + {"nofailfast",0, 0, NoFailFast}, + {"re-add", 0, 0, ReAdd}, + {"homehost", 1, 0, HomeHost}, + {"symlinks", 1, 0, Symlinks}, + {"data-offset",1, 0, DataOffset}, + {"nodes",1, 0, Nodes}, /* also for --assemble */ + {"home-cluster",1, 0, ClusterName}, + {"write-journal",1, 0, WriteJournal}, + {"consistency-policy", 1, 0, 'k'}, + + /* For assemble */ + {"uuid", 1, 0, 'u'}, + {"super-minor",1,0, SuperMinor}, + {"name", 1, 0, 'N'}, + {"config", 1, 0, ConfigFile}, + {"scan", 0, 0, 's'}, + {"force", 0, 0, Force}, + {"update", 1, 0, 'U'}, + {"freeze-reshape", 0, 0, FreezeReshape}, + + /* Management */ + {"add", 0, 0, Add}, + {"add-spare", 0, 0, AddSpare}, + {"add-journal", 0, 0, AddJournal}, + {"remove", 0, 0, Remove}, + {"fail", 0, 0, Fail}, + {"set-faulty",0, 0, Fail}, + {"replace", 0, 0, Replace}, + {"with", 0, 0, With}, + {"run", 0, 0, 'R'}, + {"stop", 0, 0, 'S'}, + {"readonly", 0, 0, 'o'}, + {"readwrite", 0, 0, 'w'}, + {"no-degraded",0,0, NoDegraded }, + {"wait", 0, 0, WaitOpt}, + {"wait-clean", 0, 0, Waitclean }, + {"action", 1, 0, Action }, + {"cluster-confirm", 0, 0, ClusterConfirm}, + + /* For Detail/Examine */ + {"brief", 0, 0, Brief}, + {"no-devices",0, 0, NoDevices}, + {"export", 0, 0, 'Y'}, + {"sparc2.2", 0, 0, Sparc22}, + {"test", 0, 0, 't'}, + {"prefer", 1, 0, Prefer}, + + /* For Follow/monitor */ + {"mail", 1, 0, EMail}, + {"program", 1, 0, ProgramOpt}, + {"alert", 1, 0, ProgramOpt}, + {"increment", 1, 0, Increment}, + {"delay", 1, 0, 'd'}, + {"daemonise", 0, 0, Fork}, + {"daemonize", 0, 0, Fork}, + {"oneshot", 0, 0, '1'}, + {"pid-file", 1, 0, 'i'}, + {"syslog", 0, 0, 'y'}, + {"no-sharing", 0, 0, NoSharing}, + + /* For Grow */ + {"backup-file", 1,0, 
BackupFile}, + {"invalid-backup",0,0,InvalidBackup}, + {"array-size", 1, 0, 'Z'}, + {"continue", 0, 0, Continue}, + + /* For Incremental */ + {"rebuild-map", 0, 0, RebuildMapOpt}, + {"path", 1, 0, IncrementalPath}, + + {0, 0, 0, 0} +}; + +char Usage[] = +"Usage: mdadm --help\n" +" for help\n" +; + +char Help[] = +"mdadm is used for building, managing, and monitoring\n" +"Linux md devices (aka RAID arrays)\n" +"Usage: mdadm --create device options...\n" +" Create a new array from unused devices.\n" +" mdadm --assemble device options...\n" +" Assemble a previously created array.\n" +" mdadm --build device options...\n" +" Create or assemble an array without metadata.\n" +" mdadm --manage device options...\n" +" make changes to an existing array.\n" +" mdadm --misc options... devices\n" +" report on or modify various md related devices.\n" +" mdadm --grow options device\n" +" resize/reshape an active array\n" +" mdadm --incremental device\n" +" add/remove a device to/from an array as appropriate\n" +" mdadm --monitor options...\n" +" Monitor one or more array for significant changes.\n" +" mdadm device options...\n" +" Shorthand for --manage.\n" +"Any parameter that does not start with '-' is treated as a device name\n" +"or, for --examine-bitmap, a file name.\n" +"The first such name is often the name of an md device. Subsequent\n" +"names are often names of component devices.\n" +"\n" +" For detailed help on the above major modes use --help after the mode\n" +" e.g.\n" +" mdadm --assemble --help\n" +" For general help on options use\n" +" mdadm --help-options\n" +; + +char OptionHelp[] = +"Any parameter that does not start with '-' is treated as a device name\n" +"or, for --examine-bitmap, a file name.\n" +"The first such name is often the name of an md device. 
Subsequent\n" +"names are often names of component devices.\n" +"\n" +"Some common options are:\n" +" --help -h : General help message or, after above option,\n" +" mode specific help message\n" +" --help-options : This help message\n" +" --version -V : Print version information for mdadm\n" +" --verbose -v : Be more verbose about what is happening\n" +" --quiet -q : Don't print un-necessary messages\n" +" --brief -b : Be less verbose, more brief\n" +" --export -Y : With --detail, --detail-platform or --examine use\n" +" key=value format for easy import into environment\n" +" --force -f : Override normal checks and be more forceful\n" +"\n" +" --assemble -A : Assemble an array\n" +" --build -B : Build an array without metadata\n" +" --create -C : Create a new array\n" +" --detail -D : Display details of an array\n" +" --examine -E : Examine superblock on an array component\n" +" --examine-bitmap -X: Display the detail of a bitmap file\n" +" --examine-badblocks: Display list of known bad blocks on device\n" +" --monitor -F : monitor (follow) some arrays\n" +" --grow -G : resize/ reshape and array\n" +" --incremental -I : add/remove a single device to/from an array as appropriate\n" +" --query -Q : Display general information about how a\n" +" device relates to the md driver\n" +" --auto-detect : Start arrays auto-detected by the kernel\n" +; +/* +"\n" +" For create or build:\n" +" --bitmap= -b : File to store bitmap in - may pre-exist for --build\n" +" --chunk= -c : chunk size of kibibytes\n" +" --rounding= : rounding factor for linear array (==chunk size)\n" +" --level= -l : raid level: 0,1,4,5,6,10,linear, or mp for create.\n" +" : 0,1,10,mp,faulty or linear for build.\n" +" --parity= -p : raid5/6 parity algorithm: {left,right}-{,a}symmetric\n" +" --layout= : same as --parity, for RAID10: [fno]NN \n" +" --raid-devices= -n : number of active devices in array\n" +" --spare-devices= -x: number of spare (eXtra) devices in initial array\n" +" --size= -z : Size (in K) 
of each drive in RAID1/4/5/6/10 - optional\n" +" --force -f : Honour devices as listed on command line. Don't\n" +" : insert a missing drive for RAID5.\n" +" --assume-clean : Assume the array is already in-sync. This is dangerous for RAID5.\n" +" --bitmap-chunk= : chunksize of bitmap in bitmap file (Kilobytes)\n" +" --delay= -d : seconds between bitmap updates\n" +" --write-behind= : number of simultaneous write-behind requests to allow (requires bitmap)\n" +" --name= -N : Textual name for array - max 32 characters\n" +"\n" +" For assemble:\n" +" --bitmap= -b : File to find bitmap information in\n" +" --uuid= -u : uuid of array to assemble. Devices which don't\n" +" have this uuid are excluded\n" +" --super-minor= -m : minor number to look for in super-block when\n" +" choosing devices to use.\n" +" --name= -N : Array name to look for in super-block.\n" +" --config= -c : config file\n" +" --scan -s : scan config file for missing information\n" +" --force -f : Assemble the array even if some superblocks appear out-of-date\n" +" --update= -U : Update superblock: try '-A --update=?' for list of options.\n" +" --no-degraded : Do not start any degraded arrays - default unless --scan.\n" +"\n" +" For detail or examine:\n" +" --brief -b : Just print device name and UUID\n" +"\n" +" For follow/monitor:\n" +" --mail= -m : Address to mail alerts of failure to\n" +" --program= -p : Program to run when an event is detected\n" +" --alert= : same as --program\n" +" --delay= -d : seconds of delay between polling state. 
default=60\n" +"\n" +" General management:\n" +" --add -a : add, or hotadd subsequent devices\n" +" --re-add : re-add a recently removed device\n" +" --remove -r : remove subsequent devices\n" +" --fail -f : mark subsequent devices as faulty\n" +" --set-faulty : same as --fail\n" +" --replace : mark a device for replacement\n" +" --run -R : start a partially built array\n" +" --stop -S : deactivate array, releasing all resources\n" +" --readonly -o : mark array as readonly\n" +" --readwrite -w : mark array as readwrite\n" +" --zero-superblock : erase the MD superblock from a device.\n" +" --wait -W : wait for recovery/resync/reshape to finish.\n" +; +*/ + +char Help_create[] = +"Usage: mdadm --create device --chunk=X --level=Y --raid-devices=Z devices\n" +"\n" +" This usage will initialise a new md array, associate some\n" +" devices with it, and activate the array. In order to create an\n" +" array with some devices missing, use the special word 'missing' in\n" +" place of the relevant device name.\n" +"\n" +" Before devices are added, they are checked to see if they already contain\n" +" raid superblocks or filesystems. They are also checked to see if\n" +" the variance in device size exceeds 1%.\n" +" If any discrepancy is found, the user will be prompted for confirmation\n" +" before the array is created. 
The presence of a '--run' can override this\n" +" caution.\n" +"\n" +" If the --size option is given then only that many kilobytes of each\n" +" device is used, no matter how big each device is.\n" +" If no --size is given, the apparent size of the smallest drive given\n" +" is used for raid level 1 and greater, and the full device is used for\n" +" other levels.\n" +"\n" +" Options that are valid with --create (-C) are:\n" +" --bitmap= -b : Create a bitmap for the array with the given filename\n" +" : or an internal bitmap if 'internal' is given\n" +" --chunk= -c : chunk size in kibibytes\n" +" --rounding= : rounding factor for linear array (==chunk size)\n" +" --level= -l : raid level: 0,1,4,5,6,10,linear,multipath and synonyms\n" +" --parity= -p : raid5/6 parity algorithm: {left,right}-{,a}symmetric\n" +" --layout= : same as --parity, for RAID10: [fno]NN \n" +" --raid-devices= -n : number of active devices in array\n" +" --spare-devices= -x : number of spare (eXtra) devices in initial array\n" +" --size= -z : Size (in K) of each drive in RAID1/4/5/6/10 - optional\n" +" --data-offset= : Space to leave between start of device and start\n" +" : of array data.\n" +" --force -f : Honour devices as listed on command line. Don't\n" +" : insert a missing drive for RAID5.\n" +" --run -R : insist of running the array even if not all\n" +" : devices are present or some look odd.\n" +" --readonly -o : start the array readonly - not supported yet.\n" +" --name= -N : Textual name for array - max 32 characters\n" +" --bitmap-chunk= : bitmap chunksize in Kilobytes.\n" +" --delay= -d : bitmap update delay in seconds.\n" +" --write-journal= : Specify journal device for RAID-4/5/6 array\n" +" --consistency-policy= : Specify the policy that determines how the array\n" +" -k : maintains consistency in case of unexpected shutdown.\n" +"\n" +; + +char Help_build[] = +"Usage: mdadm --build device -chunk=X --level=Y --raid-devices=Z devices\n" +"\n" +" This usage is similar to --create. 
The difference is that it creates\n" +" a legacy array without a superblock. With these arrays there is no\n" +" difference between initially creating the array and subsequently\n" +" assembling the array, except that hopefully there is useful data\n" +" there in the second case.\n" +"\n" +" The level may only be 0, 1, 10, linear, multipath, or faulty.\n" +" All devices must be listed and the array will be started once complete.\n" +" Options that are valid with --build (-B) are:\n" +" --bitmap= : file to store/find bitmap information in.\n" +" --chunk= -c : chunk size in kibibytes\n" +" --rounding= : rounding factor for linear array (==chunk size)\n" +" --level= -l : 0, 1, 10, linear, multipath, faulty\n" +" --raid-devices= -n : number of active devices in array\n" +" --bitmap-chunk= : bitmap chunksize in Kilobytes.\n" +" --delay= -d : bitmap update delay in seconds.\n" +; + +char Help_assemble[] = +"Usage: mdadm --assemble device options...\n" +" mdadm --assemble --scan options...\n" +"\n" +"This usage assembles one or more raid arrays from pre-existing\n" +"components.\n" +"For each array, mdadm needs to know the md device, the identity of\n" +"the array, and a number of sub devices.
These can be found in a number\n" +"of ways.\n" +"\n" +"The md device is given on the command line, is found listed in the\n" +"config file, or can be deduced from the array identity.\n" +"The array identity is determined either from the --uuid, --name, or\n" +"--super-minor commandline arguments, from the config file,\n" +"or from the first component device on the command line.\n" +"\n" +"The different combinations of these are as follows:\n" +" If the --scan option is not given, then only devices and identities\n" +" listed on the command line are considered.\n" +" The first device will be the array device, and the remainder will be\n" +" examined when looking for components.\n" +" If an explicit identity is given with --uuid or --super-minor, then\n" +" only devices with a superblock which matches that identity is considered,\n" +" otherwise every device listed is considered.\n" +"\n" +" If the --scan option is given, and no devices are listed, then\n" +" every array listed in the config file is considered for assembly.\n" +" The identity of candidate devices are determined from the config file.\n" +" After these arrays are assembled, mdadm will look for other devices\n" +" that could form further arrays and tries to assemble them. This can\n" +" be disabled using the 'AUTO' option in the config file.\n" +"\n" +" If the --scan option is given as well as one or more devices, then\n" +" Those devices are md devices that are to be assembled. Their identity\n" +" and components are determined from the config file.\n" +"\n" +" If mdadm can not find all of the components for an array, it will assemble\n" +" it but not activate it unless --run or --scan is given. To preserve this\n" +" behaviour even with --scan, add --no-degraded. Note that \"all of the\n" +" components\" means as many as were present the last time the array was running\n" +" as recorded in the superblock. 
If the array was already degraded, and\n" +" the missing device is not a new problem, it will still be assembled. It\n" +" is only newly missing devices that cause the array not to be started.\n" +"\n" +"Options that are valid with --assemble (-A) are:\n" +" --bitmap= : bitmap file to use with the array\n" +" --uuid= -u : uuid of array to assemble. Devices which don't\n" +" have this uuid are excluded\n" +" --super-minor= -m : minor number to look for in super-block when\n" +" choosing devices to use.\n" +" --name= -N : Array name to look for in super-block.\n" +" --config= -c : config file\n" +" --scan -s : scan config file for missing information\n" +" --run -R : Try to start the array even if not enough devices\n" +" for a full array are present\n" +" --force -f : Assemble the array even if some superblocks appear\n" +" : out-of-date. This involves modifying the superblocks.\n" +" --update= -U : Update superblock: try '-A --update=?' for option list.\n" +" --no-degraded : Assemble but do not start degraded arrays.\n" +" --readonly -o : Mark the array as read-only. No resync will start.\n" +; + +char Help_manage[] = +"Usage: mdadm arraydevice options component devices...\n" +"\n" +"This usage is for managing the component devices within an array.\n" +"The --manage option is not needed and is assumed if the first argument\n" +"is a device name or a management option.\n" +"The first device listed will be taken to be an md array device, any\n" +"subsequent devices are (potential) components of that array.\n" +"\n" +"Options that are valid with management mode are:\n" +" --add -a : hotadd subsequent devices to the array\n" +" --re-add : subsequent devices are re-added if there were\n" +" : recent members of the array\n" +" --remove -r : remove subsequent devices, which must not be active\n" +" --fail -f : mark subsequent devices a faulty\n" +" --set-faulty : same as --fail\n" +" --replace : mark device(s) to be replaced by spares. 
Once\n" +" : replacement completes, device will be marked faulty\n" +" --with : Indicate which spare a previous '--replace' should\n" +" : prefer to use\n" +" --run -R : start a partially built array\n" +" --stop -S : deactivate array, releasing all resources\n" +" --readonly -o : mark array as readonly\n" +" --readwrite -w : mark array as readwrite\n" +; + +char Help_misc[] = +"Usage: mdadm misc_option devices...\n" +"\n" +"This usage is for performing some task on one or more devices, which\n" +"may be arrays or components, depending on the task.\n" +"The --misc option is not needed (though it is allowed) and is assumed\n" +"if the first argument is a misc option.\n" +"\n" +"Options that are valid with the miscellaneous mode are:\n" +" --query -Q : Display general information about how a\n" +" device relates to the md driver\n" +" --detail -D : Display details of an array\n" +" --detail-platform : Display hardware/firmware details\n" +" --examine -E : Examine superblock on an array component\n" +" --examine-bitmap -X: Display contents of a bitmap file\n" +" --examine-badblocks: Display list of known bad blocks on device\n" +" --zero-superblock : erase the MD superblock from a device.\n" +" --run -R : start a partially built array\n" +" --stop -S : deactivate array, releasing all resources\n" +" --readonly -o : mark array as readonly\n" +" --readwrite -w : mark array as readwrite\n" +" --test -t : exit status 0 if ok, 1 if degraded, 2 if dead, 4 if missing\n" +" --wait -W : wait for resync/rebuild/recovery to finish\n" +" --action= : initiate or abort ('idle' or 'frozen') a 'check' or 'repair'.\n" +; + +char Help_monitor[] = +"Usage: mdadm --monitor options devices\n" +"\n" +"This usage causes mdadm to monitor a number of md arrays by periodically\n" +"polling their status and acting on any changes.\n" +"If any devices are listed then those devices are monitored, otherwise\n" +"all devices listed in the config file are monitored.\n" +"The address for mailing
advisories to, and the program to handle\n" +"each change can be specified in the config file or on the command line.\n" +"There must be at least one destination for advisories, whether\n" +"an email address, a program, or --syslog\n" +"\n" +"Options that are valid with the monitor (-F --follow) mode are:\n" +" --mail= -m : Address to mail alerts of failure to\n" +" --program= -p : Program to run when an event is detected\n" +" --alert= : same as --program\n" +" --syslog -y : Report alerts via syslog\n" +" --increment= -r : Report RebuildNN events in the given increment. default=20\n" +" --delay= -d : seconds of delay between polling state. default=60\n" +" --config= -c : specify a different config file\n" +" --scan -s : find mail-address/program in config file\n" +" --daemonise -f : Fork and continue in child, parent exits\n" +" --pid-file= -i : In daemon mode write pid to specified file instead of stdout\n" +" --oneshot -1 : Check for degraded arrays, then exit\n" +" --test -t : Generate a TestMessage event against each array at startup\n" +; + +char Help_grow[] = +"Usage: mdadm --grow device options\n" +"\n" +"This usage causes mdadm to attempt to reconfigure a running array.\n" +"This is only possible if the kernel being used supports a particular\n" +"reconfiguration.\n" +"\n" +"Options that are valid with the grow (-G --grow) mode are:\n" +" --level= -l : Tell mdadm what level to convert the array to.\n" +" --layout= -p : For a FAULTY array, set/change the error mode.\n" +" : for other arrays, update the layout\n" +" --size= -z : Change the active size of devices in an array.\n" +" : This is useful if all devices have been replaced\n" +" : with larger devices.
Value is in Kilobytes, or\n" +" : the special word 'max' meaning 'as large as possible'.\n" +" --assume-clean : When increasing the --size, this flag will avoid\n" +" : a resync of the new space\n" +" --chunk= -c : Change the chunksize of the array\n" +" --raid-devices= -n : Change the number of active devices in an array.\n" +" --add= -a : Add listed devices as part of reshape. This is\n" +" : needed for resizing a RAID0 which cannot have\n" +" : spares already present.\n" +" --bitmap= -b : Add or remove a write-intent bitmap.\n" +" --backup-file= file : A file on a different device to store data for a\n" +" : short time while increasing raid-devices on a\n" +" : RAID4/5/6 array. Also needed throughout a reshape\n" +" : when changing parameters other than raid-devices\n" +" --array-size= -Z : Change visible size of array. This does not change any\n" +" : data on the device, and is not stable across restarts.\n" +" --data-offset= : Location on device to move start of data to.\n" +" --consistency-policy= : Change the consistency policy of an active array.\n" +" -k : Currently works only for PPL with RAID5.\n" +; + +char Help_incr[] = +"Usage: mdadm --incremental [-Rqrsf] device\n" +"\n" +"This usage allows for incremental assembly of md arrays. Devices can be\n" +"added one at a time as they are discovered. 
Once an array has all expected\n" +"devices, it will be started.\n" +"\n" +"Optionally, the process can be reversed by using the fail option.\n" +"When fail mode is invoked, mdadm will see if the device belongs to an array\n" +"and then both fail (if needed) and remove the device from that array.\n" +"\n" +"Options that are valid with incremental assembly (-I --incremental) are:\n" +" --run -R : Run arrays as soon as a minimal number of devices are\n" +" : present rather than waiting for all expected.\n" +" --quiet -q : Don't print any information messages, just errors.\n" +" --rebuild-map -r : Rebuild the 'map' file that mdadm uses for tracking\n" +" : partial arrays.\n" +" --scan -s : Use with -R to start any arrays that have the minimal\n" +" : required number of devices, but are not yet started.\n" +" --fail -f : First fail (if needed) and then remove device from\n" +" : any array that it is a member of.\n" +; + +char Help_config[] = +"The /etc/mdadm.conf config file:\n\n" +" The config file contains, apart from blank lines and comment lines that\n" +" start with a hash(#), array lines, device lines, and various\n" +" configuration lines.\n" +" Each line is constructed of a number of space separated words, and can\n" +" be continued on subsequent physical lines by indenting those lines.\n" +"\n" +" A device line starts with the word 'device' and then has a number of words\n" +" which identify devices. These words should be names of devices in the\n" +" filesystem, and can contain wildcards. There can be multiple words or each\n" +" device line, and multiple device lines. All devices so listed are checked\n" +" for relevant super blocks when assembling arrays.\n" +"\n" +" An array line start with the word 'array'. This is followed by the name of\n" +" the array device in the filesystem, e.g. '/dev/md2'. Subsequent words\n" +" describe the identity of the array, used to recognise devices to include in the\n" +" array. 
The identity can be given as a UUID with a word starting 'uuid=', or\n" +" as a minor-number stored in the superblock using 'super-minor=', or as a list\n" +" of devices. This is given as a comma separated list of names, possibly\n" +" containing wildcards, preceded by 'devices='. If multiple critea are given,\n" +" than a device must match all of them to be considered.\n" +"\n" +" Other configuration lines include:\n" +" mailaddr, mailfrom, program used for --monitor mode\n" +" create, auto used when creating device names in /dev\n" +" homehost, policy, part-policy used to guide policy in various\n" +" situations\n" +"\n" +; + +char *mode_help[mode_count] = { + [0] = Help, + [ASSEMBLE] = Help_assemble, + [BUILD] = Help_build, + [CREATE] = Help_create, + [MANAGE] = Help_manage, + [MISC] = Help_misc, + [MONITOR] = Help_monitor, + [GROW] = Help_grow, + [INCREMENTAL] = Help_incr, +}; @@ -0,0 +1,213 @@ + - add 'name' field to metadata type and use it. + - use validate_geometry more + - metadata should be able to check/reject bitmap stuff. + +DDF: + Three new metadata types: + ddf - used only to create a container. + ddf-bvd - used to create an array in a container + ddf-svd - used to create a secondary array from bvds. + + Usage: + mdadm -C /dev/ddf1 /dev/sd[abcdef] + mdadm -C /dev/md1 -e ddf /dev/sd[a-f] + mdadm -C /dev/md1 -l container /dev/sd[a-f] + + Each of these create a new ddf container using all those + devices. The name 'ddf*' signals that ddf metadata should be used. + '-e ddf' only supports one level - 'container'. 'container' is only + supported by ddf. + + mdadm -C /dev/md1 -l0 -n4 /dev/ddf1 # or maybe not ??? + mdadm -C /dev/md1 -l1 -n2 /dev/sda /dev/sdb + If exactly one device is given, and it is a container, we select + devices from that container. + If devices are given that are already in use, they must be in use by + a container, and the array is created in the container. + If devices given are bvds, we slip under the hood to make + the svd arrays. 
+ + mdadm -A /dev/ddf ...... + base drives make a container. Anything in that container is started + auto-read-only. + if /dev/ddf is already assembled, we assemble bvds and svds inside it. + + +2005-dec-20 + Want an incremental assembly mode to work nicely with udev. + Core usage would be something like + mdadm --incr-assemble /dev/newdevice + This would + - examine the device to determine uuid etc. + - look for a match in /etc/mdadm.conf, abort if not found + - find that device and collect current contents + - perform an 'assemble' analysis to make sure we have the best set of devices. + - remove or add devices as appropriate + - possibly start the array if it was complete + + Other usages could involve + - specify which array to auto-add to. + This requires an existing array for uuid matching... is there any point? + + - + + +2004-june-02 + * Don't print 'errors' flag, it is meaningless. DONE + * Handle new superblock format + * create device file on demand, particularly partitionable devices. DONE + BUT figure a way to create the partition devices. + auto=partN + * Use Event: interface to listen for events. DONE, untested + * Make sure mdadm -As can assemble multi-level RAIDs ok. + * --build to build raid1 or multipath arrays + clean or not ??? + +---------------------------------------------------------------------------- +* mdadm --monitor to monitor failed multipath paths and re-instate them. + +* Maybe make "--help" fit in 80x24 and have a --long-help with more info. DONE + + +* maybe "missing" instead of <bold>missing</> in doco DONE +* possibly wait for resync to start, or even finish while assembling.- NO + +* -Db should have a devices= entry if possible. - DONE +* when assembling multipath arrays, ignore any error indicators. - DONE +* rationalise --monitor usage: + mdadm --monitor + doesn't do as expected. DONE + +* --assemble could have a --update option. 
- DONE + following word can be: + sparc2.2 + super-minor + +* mdadm /dev/md11, where md11 is raid0 can segfault, particularly when looking in the + [UU_UUU] string ... which doesn't exist ! +It should be more sensible. DONE + +Example: + +from Raimund Sacherer <raimund.sacherer@ngit.at> + +mke2fs -m0 -q /dev/ram1 300 +mount -n -t ext2 /dev/ram1 /tmp +echo DEVICE /dev/[sh]* >> /tmp/mdadm.conf +mdadm -Esb /dev/[sh]* 2>/dev/null >> /tmp/mdadm.conf +mdadm -ARsc /tmp/mdadm.conf +umount /tmp + + +?? Allow -S /dev/md? - current complains subsequent not a/d/r - DONE + +* new "Query" mode to subsume --detail and --examine. + --query or -Q, takes a device and tells if it is an MD device, + and also tells in a raid superblock is found. + DONE + +* write mdstat.c to parse /proc/mdstat file + Build list of arrays: name, rebuild-percent + DONE + +* parse /proc/partitions and map major/minor into /dev/* names, + and use that for default DEVICE list ???? + +* --detail --scan to read /proc/mdstat, and then iterate over these, + but assume --brief. --verbose can override + check each subdevice to see if it is in conf_get_devs. + Warn if not. + DONE, but don't warn yet... + +* Support multipath ... maybe... + maybe DONE + +* --follow to syslog + +* --follow to move spares around DONE + +* --follow to notice other events: DONE + rebuild started + spare activated + spare removed + spare added + +------------------------------------ +- --examine --scan scans all drives and build an mdadm.conf file DONE + +- check superblock checksum in examine DONE +- report "chunk" or "rounding" depending on raid level DONE +- report "linear" instead of "-1" for raid level DONE +- decode ayout depending on raid level DONE +- --verbose and --force flags. DONE + +- set md_minor, *_disks for Create - DONE +- for create raid5, how to choose between + all working, but not insync + one missing, one spare, insync DONE (--force) +- and for raid1 - some failed drives... 
(missing) + +- when RUN_ARRAY, make sure *_disks counts are right + +- get --detail to extract extra stuff from superblock, + like uuid DONE +- --detail --brief to give a config file line DONE +- parse config file. DONE +- test... + +- when --assemble --scan, if an underlying device is an md device, + then try to assemble that device first. + + +- mdadm -S /dev/md0 /dev/md1 gives internal error FIXED + +- mdadm --detail --scan print summary of what it can find? DONE + + +--------- +Assemble doesn't add spares. - DONE +Create to allow "missing" name for devices. +Create to accept "--force" for do exactly what is requested +- get Assemble to upgrade devices if force flag. +ARRAY lines in config file to have super_minor=n +ARRAY lines in config file to have device=pattern, and only accept + those devices + If UUID given, insist on that + If not, but super_minor given, require all found with that minor + to have same uuid + If only device given, all valid supers on those devices must have + same uuid +allow /dev/mdX as first argument before any options +Possible --dry-run option for create and assemble--force + +Assemble to check that all devices mentioned in superblock + are present. + +New mode: --Monitor (or --Follow) + Periodically check status of all arrays (listed in config file). + Log every event and apparent cause - or differences + Email and alert - or run a program - for important events + Move spares around if necessary. + + An Array line can have a spare-group= field that indicates that + the array shares spares with other arrays with the same + spare-group name. + If an array has a failed and no spares, then check all other + arrays in the spare group. If one has no failures and a spare, + then consider that spare. + Choose the smallest considered spare that is large enough. + If there is one, then hot-remove it from it's home, and + hot-add it to the array in question. 
+ + --mail-to address + --alert-handler program + + Will also extract information from /proc/mdstat if present, + and consider 20% marks in rebuild as events. + + Events are: + drive fails - causes mail to be sent + rebuild started + spare activated + spare removed + spare added diff --git a/bitmap.c b/bitmap.c new file mode 100644 index 0000000..9a7ffe3 --- /dev/null +++ b/bitmap.c @@ -0,0 +1,534 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2004 Paul Clements, SteelEye Technology, Inc. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "mdadm.h" + +static inline void sb_le_to_cpu(bitmap_super_t *sb) +{ + sb->magic = __le32_to_cpu(sb->magic); + sb->version = __le32_to_cpu(sb->version); + /* uuid gets no translation */ + sb->events = __le64_to_cpu(sb->events); + sb->events_cleared = __le64_to_cpu(sb->events_cleared); + sb->state = __le32_to_cpu(sb->state); + sb->chunksize = __le32_to_cpu(sb->chunksize); + sb->daemon_sleep = __le32_to_cpu(sb->daemon_sleep); + sb->sync_size = __le64_to_cpu(sb->sync_size); + sb->write_behind = __le32_to_cpu(sb->write_behind); + sb->nodes = __le32_to_cpu(sb->nodes); + sb->sectors_reserved = __le32_to_cpu(sb->sectors_reserved); +} + +static inline void sb_cpu_to_le(bitmap_super_t *sb) +{ + sb_le_to_cpu(sb); /* these are really the same thing */ +} + +mapping_t bitmap_states[] = { + { "OK", 0 }, + { "Out of date", 2 }, + { NULL, -1 } +}; + +static const char *bitmap_state(int state_num) +{ + char *state = map_num(bitmap_states, state_num); + return state ? state : "Unknown"; +} + +static const char *human_chunksize(unsigned long bytes) +{ + static char buf[16]; + char *suffixes[] = { "B", "KB", "MB", "GB", "TB", NULL }; + int i = 0; + + while (bytes >> 10) { + bytes >>= 10; + i++; + } + + snprintf(buf, sizeof(buf), "%lu %s", bytes, suffixes[i]); + + return buf; +} + +typedef struct bitmap_info_s { + bitmap_super_t sb; + unsigned long long total_bits; + unsigned long long dirty_bits; +} bitmap_info_t; + +/* count the dirty bits in the first num_bits of byte */ +static inline int count_dirty_bits_byte(char byte, int num_bits) +{ + int num = 0; + + switch (num_bits) { /* fall through... 
*/ + case 8: if (byte & 128) num++; + case 7: if (byte & 64) num++; + case 6: if (byte & 32) num++; + case 5: if (byte & 16) num++; + case 4: if (byte & 8) num++; + case 3: if (byte & 4) num++; + case 2: if (byte & 2) num++; + case 1: if (byte & 1) num++; + default: break; + } + + return num; +} + +static int count_dirty_bits(char *buf, int num_bits) +{ + int i, num = 0; + + for (i = 0; i < num_bits / 8; i++) + num += count_dirty_bits_byte(buf[i], 8); + + if (num_bits % 8) /* not an even byte boundary */ + num += count_dirty_bits_byte(buf[i], num_bits % 8); + + return num; +} + +static bitmap_info_t *bitmap_fd_read(int fd, int brief) +{ + /* Note: fd might be open O_DIRECT, so we must be + * careful to align reads properly + */ + unsigned long long total_bits = 0, read_bits = 0, dirty_bits = 0; + bitmap_info_t *info; + void *buf; + unsigned int n, skip; + + if (posix_memalign(&buf, 4096, 8192) != 0) { + pr_err("failed to allocate 8192 bytes\n"); + return NULL; + } + n = read(fd, buf, 8192); + + info = xmalloc(sizeof(*info)); + + if (n < sizeof(info->sb)) { + pr_err("failed to read superblock of bitmap file: %s\n", strerror(errno)); + free(info); + free(buf); + return NULL; + } + memcpy(&info->sb, buf, sizeof(info->sb)); + skip = sizeof(info->sb); + + sb_le_to_cpu(&info->sb); /* convert superblock to CPU byte ordering */ + + if (brief || info->sb.sync_size == 0 || info->sb.chunksize == 0) + goto out; + + /* read the rest of the file counting total bits and dirty bits -- + * we stop when either: + * 1) we hit EOF, in which case we assume the rest of the bits (if any) + * are dirty + * 2) we've read the full bitmap, in which case we ignore any trailing + * data in the file + */ + total_bits = bitmap_bits(info->sb.sync_size, info->sb.chunksize); + + while(read_bits < total_bits) { + unsigned long long remaining = total_bits - read_bits; + + if (n == 0) { + n = read(fd, buf, 8192); + skip = 0; + if (n <= 0) + break; + } + if (remaining > (n-skip) * 8) /* we want the 
full buffer */ + remaining = (n-skip) * 8; + + dirty_bits += count_dirty_bits(buf+skip, remaining); + + read_bits += remaining; + n = 0; + } + + if (read_bits < total_bits) { /* file truncated... */ + pr_err("WARNING: bitmap file is not large enough for array size %llu!\n\n", + (unsigned long long)info->sb.sync_size); + total_bits = read_bits; + } +out: + free(buf); + info->total_bits = total_bits; + info->dirty_bits = dirty_bits; + return info; +} + +static int +bitmap_file_open(char *filename, struct supertype **stp, int node_num, int fd) +{ + struct stat stb; + struct supertype *st = *stp; + + /* won't re-open filename when (fd >= 0) */ + if (fd < 0) + fd = open(filename, O_RDONLY|O_DIRECT); + if (fd < 0) { + pr_err("failed to open bitmap file %s: %s\n", + filename, strerror(errno)); + return -1; + } + + if (fstat(fd, &stb) < 0) { + pr_err("fstat failed for %s: %s\n", filename, strerror(errno)); + close(fd); + return -1; + } + if ((stb.st_mode & S_IFMT) == S_IFBLK) { + /* block device, so we are probably after an internal bitmap */ + if (!st) + st = guess_super(fd); + if (!st) { + /* just look at device... 
*/ + lseek(fd, 0, 0); + } else if (!st->ss->locate_bitmap) { + pr_err("No bitmap possible with %s metadata\n", + st->ss->name); + close(fd); + return -1; + } else { + if (st->ss->locate_bitmap(st, fd, node_num)) { + pr_err("%s doesn't have bitmap\n", filename); + close(fd); + fd = -1; + } + } + *stp = st; + } + + return fd; +} + +static __u32 swapl(__u32 l) +{ + char *c = (char*)&l; + char t= c[0]; + c[0] = c[3]; + c[3] = t; + + t = c[1]; + c[1] = c[2]; + c[2] = t; + return l; +} +int ExamineBitmap(char *filename, int brief, struct supertype *st) +{ + /* + * Read the bitmap file and display its contents + */ + + bitmap_super_t *sb; + bitmap_info_t *info; + int rv = 1; + char buf[64]; + int swap; + int fd, i; + __u32 uuid32[4]; + + fd = bitmap_file_open(filename, &st, 0, -1); + if (fd < 0) + return rv; + + info = bitmap_fd_read(fd, brief); + if (!info) + return rv; + sb = &info->sb; + if (sb->magic != BITMAP_MAGIC) { + pr_err("This is an md array. To view a bitmap you need to examine\n"); + pr_err("a member device, not the array.\n"); + pr_err("Reporting bitmap that would be used if this array were used\n"); + pr_err("as a member of some other array\n"); + } + printf(" Filename : %s\n", filename); + printf(" Magic : %08x\n", sb->magic); + if (sb->magic != BITMAP_MAGIC) { + pr_err("invalid bitmap magic 0x%x, the bitmap file appears\n", + sb->magic); + pr_err("to be corrupted or missing.\n"); + } + printf(" Version : %d\n", sb->version); + if (sb->version < BITMAP_MAJOR_LO || + sb->version > BITMAP_MAJOR_CLUSTERED) { + pr_err("unknown bitmap version %d, either the bitmap file\n", + sb->version); + pr_err("is corrupted or you need to upgrade your tools\n"); + goto free_info; + } + + rv = 0; + if (st) + swap = st->ss->swapuuid; + else +#if __BYTE_ORDER == BIG_ENDIAN + swap = 0; +#else + swap = 1; +#endif + memcpy(uuid32, sb->uuid, 16); + if (swap) + printf(" UUID : %08x:%08x:%08x:%08x\n", + swapl(uuid32[0]), + swapl(uuid32[1]), + swapl(uuid32[2]), + swapl(uuid32[3])); + 
else + printf(" UUID : %08x:%08x:%08x:%08x\n", + uuid32[0], + uuid32[1], + uuid32[2], + uuid32[3]); + + if (sb->nodes == 0) { + printf(" Events : %llu\n", (unsigned long long)sb->events); + printf(" Events Cleared : %llu\n", (unsigned long long)sb->events_cleared); + printf(" State : %s\n", bitmap_state(sb->state)); + + } + + printf(" Chunksize : %s\n", human_chunksize(sb->chunksize)); + printf(" Daemon : %ds flush period\n", sb->daemon_sleep); + if (sb->write_behind) + sprintf(buf, "Allow write behind, max %d", sb->write_behind); + else + sprintf(buf, "Normal"); + printf(" Write Mode : %s\n", buf); + printf(" Sync Size : %llu%s\n", (unsigned long long)sb->sync_size/2, + human_size(sb->sync_size * 512)); + + if (sb->nodes == 0) { + if (brief) + goto free_info; + printf(" Bitmap : %llu bits (chunks), %llu dirty (%2.1f%%)\n", + info->total_bits, info->dirty_bits, + 100.0 * info->dirty_bits / (info->total_bits?:1)); + } else { + printf(" Cluster nodes : %d\n", sb->nodes); + printf(" Cluster name : %-64s\n", sb->cluster_name); + for (i = 0; i < (int)sb->nodes; i++) { + st = NULL; + free(info); + fd = bitmap_file_open(filename, &st, i, fd); + if (fd < 0) { + printf(" Unable to open bitmap file on node: %i\n", i); + continue; + } + info = bitmap_fd_read(fd, brief); + if (!info) { + printf(" Unable to read bitmap on node: %i\n", i); + continue; + } + sb = &info->sb; + if (sb->magic != BITMAP_MAGIC) + pr_err("invalid bitmap magic 0x%x, the bitmap file appears to be corrupted\n", sb->magic); + + printf(" Node Slot : %d\n", i); + printf(" Events : %llu\n", + (unsigned long long)sb->events); + printf(" Events Cleared : %llu\n", + (unsigned long long)sb->events_cleared); + printf(" State : %s\n", bitmap_state(sb->state)); + if (brief) + continue; + printf(" Bitmap : %llu bits (chunks), %llu dirty (%2.1f%%)\n", + info->total_bits, info->dirty_bits, + 100.0 * info->dirty_bits / (info->total_bits?:1)); + } + } + +free_info: + close(fd); + free(info); + return rv; +} + +int 
IsBitmapDirty(char *filename) +{ + /* + * Read the bitmap file + * It will break reading bitmap action immediately when meeting any error. + * + * Return: 1(dirty), 0 (clean), -1(error) + */ + + int fd = -1, rv = 0, i; + struct supertype *st = NULL; + bitmap_info_t *info = NULL; + bitmap_super_t *sb = NULL; + + fd = bitmap_file_open(filename, &st, 0, fd); + free(st); + if (fd < 0) + goto out; + + info = bitmap_fd_read(fd, 0); + if (!info) { + close(fd); + goto out; + } + + sb = &info->sb; + for (i = 0; i < (int)sb->nodes; i++) { + st = NULL; + free(info); + info = NULL; + + fd = bitmap_file_open(filename, &st, i, fd); + free(st); + if (fd < 0) + goto out; + + info = bitmap_fd_read(fd, 0); + if (!info) { + close(fd); + goto out; + } + + sb = &info->sb; + if (sb->magic != BITMAP_MAGIC) { /* invalid bitmap magic */ + free(info); + close(fd); + goto out; + } + + if (info->dirty_bits) + rv = 1; + } + close(fd); + free(info); + return rv; +out: + return -1; +} + +int CreateBitmap(char *filename, int force, char uuid[16], + unsigned long chunksize, unsigned long daemon_sleep, + unsigned long write_behind, + unsigned long long array_size /* sectors */, + int major) +{ + /* + * Create a bitmap file with a superblock and (optionally) a full bitmap + */ + + FILE *fp; + int rv = 1; + char block[512]; + bitmap_super_t sb; + long long bytes, filesize; + + if (!force && access(filename, F_OK) == 0) { + pr_err("bitmap file %s already exists, use --force to overwrite\n", filename); + return rv; + } + + fp = fopen(filename, "w"); + if (fp == NULL) { + pr_err("failed to open bitmap file %s: %s\n", + filename, strerror(errno)); + return rv; + } + + if (chunksize == UnSet) { + /* We don't want more than 2^21 chunks, as 2^11 fill up one + * 4K page (2 bytes per chunk), and 2^10 address of those + * fill up a 4K indexing page. 2^20 might be safer, especially + * on 64bit hosts, so use that. 
+ */ + chunksize = DEFAULT_BITMAP_CHUNK; + /* <<20 for 2^20 chunks, >>9 to convert bytes to sectors */ + while (array_size > ((unsigned long long)chunksize << (20-9))) + chunksize <<= 1; + } + + memset(&sb, 0, sizeof(sb)); + sb.magic = BITMAP_MAGIC; + sb.version = major; + if (uuid != NULL) + memcpy(sb.uuid, uuid, 16); + sb.chunksize = chunksize; + sb.daemon_sleep = daemon_sleep; + sb.write_behind = write_behind; + sb.sync_size = array_size; + + sb_cpu_to_le(&sb); /* convert to on-disk byte ordering */ + + if (fwrite(&sb, sizeof(sb), 1, fp) != 1) { + pr_err("failed to write superblock to bitmap file %s: %s\n", filename, strerror(errno)); + goto out; + } + + /* calculate the size of the bitmap and write it to disk */ + bytes = (bitmap_bits(array_size, chunksize) + 7) / 8; + if (!bytes) { + rv = 0; + goto out; + } + + filesize = bytes + sizeof(sb); + + memset(block, 0xff, sizeof(block)); + + while (bytes > 0) { + if (fwrite(block, sizeof(block), 1, fp) != 1) { + pr_err("failed to write bitmap file %s: %s\n", filename, strerror(errno)); + goto out; + } + bytes -= sizeof(block); + } + + rv = 0; + fflush(fp); + /* make the file be the right size (well, to the nearest byte) */ + if (ftruncate(fileno(fp), filesize)) + perror("ftrunace"); +out: + fclose(fp); + if (rv) + unlink(filename); /* possibly corrupted, better get rid of it */ + return rv; +} + +int bitmap_update_uuid(int fd, int *uuid, int swap) +{ + struct bitmap_super_s bm; + if (lseek(fd, 0, 0) != 0) + return 1; + if (read(fd, &bm, sizeof(bm)) != sizeof(bm)) + return 1; + if (bm.magic != __cpu_to_le32(BITMAP_MAGIC)) + return 1; + copy_uuid(bm.uuid, uuid, swap); + if (lseek(fd, 0, 0) != 0) + return 2; + if (write(fd, &bm, sizeof(bm)) != sizeof(bm)) { + lseek(fd, 0, 0); + return 2; + } + lseek(fd, 0, 0); + return 0; +} diff --git a/bitmap.h b/bitmap.h new file mode 100644 index 0000000..7b1f80f --- /dev/null +++ b/bitmap.h @@ -0,0 +1,291 @@ +/* + * bitmap.h: Copyright (C) Peter T. 
Breuer (ptb@ot.uc3m.es) 2003 + * + * additions: Copyright (C) 2003-2004, Paul Clements, SteelEye Technology, Inc. + */ +#ifndef BITMAP_H +#define BITMAP_H 1 + +#define BITMAP_MAJOR_LO 3 +/* version 4 insists the bitmap is in little-endian order + * with version 3, it is host-endian which is non-portable + */ +#define BITMAP_MAJOR_HI 4 +#define BITMAP_MAJOR_HOSTENDIAN 3 +#define BITMAP_MAJOR_CLUSTERED 5 + +#define BITMAP_MINOR 39 + +/* + * in-memory bitmap: + * + * Use 16 bit block counters to track pending writes to each "chunk". + * The 2 high order bits are special-purpose, the first is a flag indicating + * whether a resync is needed. The second is a flag indicating whether a + * resync is active. + * This means that the counter is actually 14 bits: + * + * +--------+--------+------------------------------------------------+ + * | resync | resync | counter | + * | needed | active | | + * | (0-1) | (0-1) | (0-16383) | + * +--------+--------+------------------------------------------------+ + * + * The "resync needed" bit is set when: + * a '1' bit is read from storage at startup. + * a write request fails on some drives + * a resync is aborted on a chunk with 'resync active' set + * It is cleared (and resync-active set) when a resync starts across all drives + * of the chunk. + * + * + * The "resync active" bit is set when: + * a resync is started on all drives, and resync_needed is set. + * resync_needed will be cleared (as long as resync_active wasn't already set). + * It is cleared when a resync completes. + * + * The counter counts pending write requests, plus the on-disk bit. + * When the counter is '1' and the resync bits are clear, the on-disk + * bit can be cleared as well, thus setting the counter to 0. + * When we set a bit, or in the counter (to start a write), if the field is + * 0, we first set the disk bit and set the counter to 1.
+ * + * If the counter is 0, the on-disk bit is clear and the stripe is clean + * Anything that dirties the stripe pushes the counter to 2 (at least) + * and sets the on-disk bit (lazily). + * If a periodic sweep finds the counter at 2, it is decremented to 1. + * If the sweep finds the counter at 1, the on-disk bit is cleared and the + * counter goes to zero. + * + * Also, we'll hijack the "map" pointer itself and use it as two 16 bit block + * counters as a fallback when "page" memory cannot be allocated: + * + * Normal case (page memory allocated): + * + * page pointer (32-bit) + * + * [ ] ------+ + * | + * +-------> [ ][ ]..[ ] (4096 byte page == 2048 counters) + * c1 c2 c2048 + * + * Hijacked case (page memory allocation failed): + * + * hijacked page pointer (32-bit) + * + * [ ][ ] (no page memory allocated) + * counter #1 (16-bit) counter #2 (16-bit) + * + */ + +#ifdef __KERNEL__ + +#define PAGE_BITS (PAGE_SIZE << 3) +#define PAGE_BIT_SHIFT (PAGE_SHIFT + 3) + +typedef __u16 bitmap_counter_t; +#define COUNTER_BITS 16 +#define COUNTER_BIT_SHIFT 4 +#define COUNTER_BYTE_RATIO (COUNTER_BITS / 8) +#define COUNTER_BYTE_SHIFT (COUNTER_BIT_SHIFT - 3) + +#define NEEDED_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 1))) +#define RESYNC_MASK ((bitmap_counter_t) (1 << (COUNTER_BITS - 2))) +#define COUNTER_MAX ((bitmap_counter_t) RESYNC_MASK - 1) +#define NEEDED(x) (((bitmap_counter_t) x) & NEEDED_MASK) +#define RESYNC(x) (((bitmap_counter_t) x) & RESYNC_MASK) +#define COUNTER(x) (((bitmap_counter_t) x) & COUNTER_MAX) + +/* how many counters per page? */ +#define PAGE_COUNTER_RATIO (PAGE_BITS / COUNTER_BITS) +/* same, except a shift value for more efficient bitops */ +#define PAGE_COUNTER_SHIFT (PAGE_BIT_SHIFT - COUNTER_BIT_SHIFT) +/* same, except a mask value for more efficient bitops */ +#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1) + +#define BITMAP_BLOCK_SIZE 512 +#define BITMAP_BLOCK_SHIFT 9 + +/* how many blocks per chunk?
(this is variable) */ +#define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->chunksize >> BITMAP_BLOCK_SHIFT) +#define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT) +#define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1) + +/* when hijacked, the counters and bits represent even larger "chunks" */ +/* there will be 1024 chunks represented by each counter in the page pointers */ +#define PAGEPTR_BLOCK_RATIO(bitmap) \ + (CHUNK_BLOCK_RATIO(bitmap) << PAGE_COUNTER_SHIFT >> 1) +#define PAGEPTR_BLOCK_SHIFT(bitmap) \ + (CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1) +#define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1) + +/* + * on-disk bitmap: + * + * Use one bit per "chunk" (block set). We do the disk I/O on the bitmap + * file a page at a time. There's a superblock at the start of the file. + */ + +/* map chunks (bits) to file pages - offset by the size of the superblock */ +#define CHUNK_BIT_OFFSET(chunk) ((chunk) + (sizeof(bitmap_super_t) << 3)) + +#endif + +/* + * bitmap structures: + */ + +#define BITMAP_MAGIC 0x6d746962 + +/* use these for bitmap->flags and bitmap->sb->state bit-fields */ +enum bitmap_state { + BITMAP_ACTIVE = 0x001, /* the bitmap is in use */ + BITMAP_STALE = 0x002 /* the bitmap file is out of date or had -EIO */ +}; + +/* the superblock at the front of the bitmap file -- little endian */ +typedef struct bitmap_super_s { + __u32 magic; /* 0 BITMAP_MAGIC */ + __u32 version; /* 4 the bitmap major for now, could change... 
*/ + __u8 uuid[16]; /* 8 128 bit uuid - must match md device uuid */ + __u64 events; /* 24 event counter for the bitmap (1)*/ + __u64 events_cleared;/*32 event counter when last bit cleared (2) */ + __u64 sync_size; /* 40 the size of the md device's sync range(3) */ + __u32 state; /* 48 bitmap state information */ + __u32 chunksize; /* 52 the bitmap chunk size in bytes */ + __u32 daemon_sleep; /* 56 seconds between disk flushes */ + __u32 write_behind; /* 60 number of outstanding write-behind writes */ + __u32 sectors_reserved; /* 64 number of 512-byte sectors that are + * reserved for the bitmap. */ + __u32 nodes; /* 68 the maximum number of nodes in cluster. */ + __u8 cluster_name[64]; /* 72 cluster name to which this md belongs */ + __u8 pad[256 - 136]; /* set to zero */ +} bitmap_super_t; + +/* notes: + * (1) This event counter is updated before the eventcounter in the md superblock + * When a bitmap is loaded, it is only accepted if this event counter is equal + * to, or one greater than, the event counter in the superblock. + * (2) This event counter is updated when the other one is *if*and*only*if* the + * array is not degraded. As bits are not cleared when the array is degraded, + * this represents the last time that any bits were cleared. + * If a device is being added that has an event count with this value or + * higher, it is accepted as conforming to the bitmap. + * (3)This is the number of sectors represented by the bitmap, and is the range that + * resync happens across. For raid1 and raid5/6 it is the size of individual + * devices. For raid10 it is the size of the array. 
+ */ + +#ifdef __KERNEL__ + +/* the in-memory bitmap is represented by bitmap_pages */ +struct bitmap_page { + /* + * map points to the actual memory page + */ + char *map; + /* + * in emergencies (when map cannot be allocated), hijack the map + * pointer and use it as two counters itself + */ + unsigned int hijacked; + /* + * count of dirty bits on the page + */ + int count; +}; + +/* keep track of bitmap file pages that have pending writes on them */ +struct page_list { + struct list_head list; + struct page *page; +}; + +/* the main bitmap structure - one per mddev */ +struct bitmap { + struct bitmap_page *bp; + unsigned long pages; /* total number of pages in the bitmap */ + unsigned long missing_pages; /* number of pages not yet allocated */ + + mddev_t *mddev; /* the md device that the bitmap is for */ + + int counter_bits; /* how many bits per block counter */ + + /* bitmap chunksize -- how much data does each bit represent? */ + unsigned long chunksize; + unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */ + unsigned long chunks; /* total number of data chunks for the array */ + + /* We hold a count on the chunk currently being synced, and drop + * it when the last block is started. If the resync is aborted + * midway, we need to be able to drop that count, so we remember + * the counted chunk.. 
+ */ + unsigned long syncchunk; + + __u64 events_cleared; + + /* bitmap spinlock */ + spinlock_t lock; + + struct file *file; /* backing disk file */ + struct page *sb_page; /* cached copy of the bitmap file superblock */ + struct page **filemap; /* list of cache pages for the file */ + unsigned long *filemap_attr; /* attributes associated w/ filemap pages */ + unsigned long file_pages; /* number of pages in the file */ + + unsigned long flags; + + /* + * the bitmap daemon - periodically wakes up and sweeps the bitmap + * file, cleaning up bits and flushing out pages to disk as necessary + */ + mdk_thread_t *daemon; + unsigned long daemon_sleep; /* how many seconds between updates? */ + + /* + * bitmap write daemon - this daemon performs writes to the bitmap file + * this thread is only needed because of a limitation in ext3 (jbd) + * that does not allow a task to have two journal transactions ongoing + * simultaneously (even if the transactions are for two different + * filesystems) -- in the case of bitmap, that would be the filesystem + * that the bitmap file resides on and the filesystem that is mounted + * on the md device -- see current->journal_info in jbd/transaction.c + */ + mdk_thread_t *write_daemon; + mdk_thread_t *writeback_daemon; + spinlock_t write_lock; + struct semaphore write_ready; + struct semaphore write_done; + unsigned long writes_pending; + wait_queue_head_t write_wait; + struct list_head write_pages; + struct list_head complete_pages; + mempool_t *write_pool; +}; + +/* the bitmap API */ + +/* these are used only by md/bitmap */ +int bitmap_create(mddev_t *mddev); +void bitmap_destroy(mddev_t *mddev); +int bitmap_active(struct bitmap *bitmap); + +char *file_path(struct file *file, char *buf, int count); +void bitmap_print_sb(struct bitmap *bitmap); +int bitmap_update_sb(struct bitmap *bitmap); + +int bitmap_setallbits(struct bitmap *bitmap); + +/* these are exported */ +void bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned 
long sectors); +void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, + int success); +int bitmap_start_sync(struct bitmap *bitmap, sector_t offset, int *blocks); +void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, int *blocks, int aborted); +void bitmap_close_sync(struct bitmap *bitmap); + +int bitmap_unplug(struct bitmap *bitmap); +#endif + +#endif diff --git a/clustermd_tests/00r10_Create b/clustermd_tests/00r10_Create new file mode 100644 index 0000000..8aa5a70 --- /dev/null +++ b/clustermd_tests/00r10_Create @@ -0,0 +1,50 @@ +#!/bin/bash + +mdadm -CR $md0 -l10 -b clustered --layout n2 -n2 $dev0 $dev1 +ssh $NODE2 mdadm -A $md0 $dev0 $dev1 +check $NODE1 resync +check $NODE2 PENDING +check all wait +check all raid10 +check all bitmap +check all nosync +check all state UU +check all dmesg +stop_md all $md0 + +mdadm -CR $md0 -l10 -b clustered -n3 --layout n3 $dev0 $dev1 $dev2 --assume-clean +ssh $NODE2 mdadm -A $md0 $dev0 $dev1 $dev2 +check all nosync +check all raid10 +check all bitmap +check all state UUU +check all dmesg +stop_md all $md0 + +mdadm -CR $md0 -l10 -b clustered -n2 -x1 --layout n2 $dev0 $dev1 $dev2 --assume-clean +ssh $NODE2 mdadm -A $md0 $dev0 $dev1 $dev2 +check all nosync +check all raid10 +check all bitmap +check all spares 1 +check all state UU +check all dmesg +stop_md all $md0 + +name=tstmd +mdadm -CR $md0 -l10 -b clustered -n2 $dev0 $dev1 --layout n2 --name=$name --assume-clean +ssh $NODE2 mdadm -A $md0 $dev0 $dev1 +check all nosync +check all raid10 +check all bitmap +check all state UU +for ip in $NODE1 $NODE2 +do + ssh $ip "mdadm -D $md0 | grep 'Name' | grep -q $name" + [ $? -ne '0' ] && + die "$ip: check --name=$name failed." 
+done +check all dmesg +stop_md all $md0 + +exit 0 diff --git a/clustermd_tests/00r1_Create b/clustermd_tests/00r1_Create new file mode 100644 index 0000000..709bb7b --- /dev/null +++ b/clustermd_tests/00r1_Create @@ -0,0 +1,50 @@ +#!/bin/bash + +mdadm -CR $md0 -l1 -b clustered -n2 $dev0 $dev1 +ssh $NODE2 mdadm -A $md0 $dev0 $dev1 +check $NODE1 resync +check $NODE2 PENDING +check all wait +check all raid1 +check all bitmap +check all nosync +check all state UU +check all dmesg +stop_md all $md0 + +mdadm -CR $md0 -l1 -b clustered -n2 $dev0 $dev1 --assume-clean +ssh $NODE2 mdadm -A $md0 $dev0 $dev1 +check all nosync +check all raid1 +check all bitmap +check all state UU +check all dmesg +stop_md all $md0 + +mdadm -CR $md0 -l1 -b clustered -n2 -x1 $dev0 $dev1 $dev2 --assume-clean +ssh $NODE2 mdadm -A $md0 $dev0 $dev1 $dev2 +check all nosync +check all raid1 +check all bitmap +check all spares 1 +check all state UU +check all dmesg +stop_md all $md0 + +name=tstmd +mdadm -CR $md0 -l1 -b clustered -n2 $dev0 $dev1 --name=$name --assume-clean +ssh $NODE2 mdadm -A $md0 $dev0 $dev1 +check all nosync +check all raid1 +check all bitmap +check all state UU +for ip in $NODE1 $NODE2 +do + ssh $ip "mdadm -D $md0 | grep 'Name' | grep -q $name" + [ $? -ne '0' ] && + die "$ip: check --name=$name failed." +done +check all dmesg +stop_md all $md0 + +exit 0 diff --git a/clustermd_tests/01r10_Grow_bitmap-switch b/clustermd_tests/01r10_Grow_bitmap-switch new file mode 100644 index 0000000..1794719 --- /dev/null +++ b/clustermd_tests/01r10_Grow_bitmap-switch @@ -0,0 +1,51 @@ +#!/bin/bash + +mdadm -CR $md0 -l10 -b clustered --layout n2 -n2 $dev0 $dev1 --assume-clean +ssh $NODE2 mdadm -A $md0 $dev0 $dev1 +check all nosync +check all raid10 +check all bitmap +check all state UU + +# switch 'clustered' bitmap to 'none', and then 'none' to 'internal' +stop_md $NODE2 $md0 +mdadm --grow $md0 --bitmap=none +[ $? -eq '0' ] || + die "$NODE1: change bitmap 'clustered' to 'none' failed." 
+mdadm -X $dev0 $dev1 &> /dev/null +[ $? -eq '0' ] && + die "$NODE1: bitmap still exists in member_disks." +check all nobitmap +mdadm --grow $md0 --bitmap=internal +[ $? -eq '0' ] || + die "$NODE1: change bitmap 'none' to 'internal' failed." +sleep 1 +mdadm -X $dev0 $dev1 &> /dev/null +[ $? -eq '0' ] || + die "$NODE1: create 'internal' bitmap failed." +check $NODE1 bitmap + +# switch 'internal' bitmap to 'none', and then 'none' to 'clustered' +mdadm --grow $md0 --bitmap=none +[ $? -eq '0' ] || + die "$NODE1: change bitmap 'internal' to 'none' failed." +mdadm -X $dev0 $dev1 &> /dev/null +[ $? -eq '0' ] && + die "$NODE1: bitmap still exists in member_disks." +check $NODE1 nobitmap +mdadm --grow $md0 --bitmap=clustered +[ $? -eq '0' ] || + die "$NODE1: change bitmap 'none' to 'clustered' failed." +ssh $NODE2 mdadm -A $md0 $dev0 $dev1 +sleep 1 +for ip in $NODES +do + ssh $ip "mdadm -X $dev0 $dev1 | grep -q 'Cluster name'" || + die "$ip: create 'clustered' bitmap failed." +done +check all bitmap +check all state UU +check all dmesg +stop_md all $md0 + +exit 0 diff --git a/clustermd_tests/01r10_Grow_resize b/clustermd_tests/01r10_Grow_resize new file mode 100644 index 0000000..c69b785 --- /dev/null +++ b/clustermd_tests/01r10_Grow_resize @@ -0,0 +1,38 @@ +#!/bin/bash + +size=20000 + +mdadm -CR $md0 -l10 -b clustered --layout n2 --size $size --chunk=64 -n2 $dev0 $dev1 --assume-clean +ssh $NODE2 mdadm -A $md0 $dev0 $dev1 +check all nosync +check all raid10 +check all bitmap +check all state UU + +mdadm --grow $md0 --size max +check $NODE1 resync +check $NODE1 wait +check all state UU + +mdadm --grow $md0 --size $size +check all nosync +check all state UU +check all dmesg +stop_md all $md0 + +mdadm -CR $md0 -l10 -b clustered --layout n2 --chunk=64 -n2 $dev0 $dev1 --assume-clean +ssh $NODE2 mdadm -A $md0 $dev0 $dev1 +check all nosync +check all raid10 +check all bitmap +check all state UU + +mdadm --grow $md0 --chunk=128 +check $NODE1 reshape +check $NODE1 wait +check all 
chunk 128 +check all state UU +check all dmesg +stop_md all $md0 + +exit 0 diff --git a/clustermd_tests/01r1_Grow_add b/clustermd_tests/01r1_Grow_add new file mode 100644 index 0000000..5706114 --- /dev/null +++ b/clustermd_tests/01r1_Grow_add @@ -0,0 +1,68 @@ +#!/bin/bash + +mdadm -CR $md0 -l1 -b clustered -n2 $dev0 $dev1 --assume-clean +ssh $NODE2 mdadm -A $md0 $dev0 $dev1 +check all nosync +check all raid1 +check all bitmap +check all state UU +check all dmesg +mdadm --grow $md0 --raid-devices=3 --add $dev2 +sleep 0.3 +grep recovery /proc/mdstat +if [ $? -eq '0' ] +then + check $NODE1 wait +else + check $NODE2 recovery + check $NODE2 wait +fi +check all state UUU +check all dmesg +stop_md all $md0 + +mdadm -CR $md0 -l1 -b clustered -n2 -x1 $dev0 $dev1 $dev2 --assume-clean +ssh $NODE2 mdadm -A $md0 $dev0 $dev1 $dev2 +check all nosync +check all raid1 +check all bitmap +check all spares 1 +check all state UU +check all dmesg +mdadm --grow $md0 --raid-devices=3 --add $dev3 +sleep 0.3 +grep recovery /proc/mdstat +if [ $? -eq '0' ] +then + check $NODE1 wait +else + check $NODE2 recovery + check $NODE2 wait +fi +check all state UUU +check all dmesg +stop_md all $md0 + +mdadm -CR $md0 -l1 -b clustered -n2 -x1 $dev0 $dev1 $dev2 --assume-clean +ssh $NODE2 mdadm -A $md0 $dev0 $dev1 $dev2 +check all nosync +check all raid1 +check all bitmap +check all spares 1 +check all state UU +check all dmesg +mdadm --grow $md0 --raid-devices=3 +sleep 0.3 +grep recovery /proc/mdstat +if [ $? 
-eq '0' ] +then + check $NODE1 wait +else + check $NODE2 recovery + check $NODE2 wait +fi +check all state UUU +check all dmesg +stop_md all $md0 + +exit 0 diff --git a/clustermd_tests/01r1_Grow_bitmap-switch b/clustermd_tests/01r1_Grow_bitmap-switch new file mode 100644 index 0000000..3b363d9 --- /dev/null +++ b/clustermd_tests/01r1_Grow_bitmap-switch @@ -0,0 +1,51 @@ +#!/bin/bash + +mdadm -CR $md0 -l1 -b clustered -n2 $dev0 $dev1 --assume-clean +ssh $NODE2 mdadm -A $md0 $dev0 $dev1 +check all nosync +check all raid1 +check all bitmap +check all state UU + +# switch 'clustered' bitmap to 'none', and then 'none' to 'internal' +stop_md $NODE2 $md0 +mdadm --grow $md0 --bitmap=none +[ $? -eq '0' ] || + die "$NODE1: change bitmap 'clustered' to 'none' failed." +mdadm -X $dev0 $dev1 &> /dev/null +[ $? -eq '0' ] && + die "$NODE1: bitmap still exists in member_disks." +check all nobitmap +mdadm --grow $md0 --bitmap=internal +[ $? -eq '0' ] || + die "$NODE1: change bitmap 'none' to 'internal' failed." +sleep 2 +mdadm -X $dev0 $dev1 &> /dev/null +[ $? -eq '0' ] || + die "$NODE1: create 'internal' bitmap failed." +check $NODE1 bitmap + +# switch 'internal' bitmap to 'none', and then 'none' to 'clustered' +mdadm --grow $md0 --bitmap=none +[ $? -eq '0' ] || + die "$NODE1: change bitmap 'internal' to 'none' failed." +mdadm -X $dev0 $dev1 &> /dev/null +[ $? -eq '0' ] && + die "$NODE1: bitmap still exists in member_disks." +check $NODE1 nobitmap +mdadm --grow $md0 --bitmap=clustered +[ $? -eq '0' ] || + die "$NODE1: change bitmap 'none' to 'clustered' failed." +ssh $NODE2 mdadm -A $md0 $dev0 $dev1 +sleep 2 +for ip in $NODES +do + ssh $ip "mdadm -X $dev0 $dev1 | grep -q 'Cluster name'" || + die "$ip: create 'clustered' bitmap failed." 
+done +check all bitmap +check all state UU +check all dmesg +stop_md all $md0 + +exit 0 diff --git a/clustermd_tests/01r1_Grow_resize b/clustermd_tests/01r1_Grow_resize new file mode 100644 index 0000000..6d6e22a --- /dev/null +++ b/clustermd_tests/01r1_Grow_resize @@ -0,0 +1,23 @@ +#!/bin/bash + +size=10000 + +mdadm -CR $md0 -l1 -b clustered --size $size -n2 $dev0 $dev1 --assume-clean +ssh $NODE2 mdadm -A $md0 $dev0 $dev1 +check all nosync +check all raid1 +check all bitmap +check all state UU + +mdadm --grow $md0 --size max +check $NODE1 resync +check $NODE1 wait +check all state UU + +mdadm --grow $md0 --size $size +check all nosync +check all state UU +check all dmesg +stop_md all $md0 + +exit 0 diff --git a/clustermd_tests/02r10_Manage_add b/clustermd_tests/02r10_Manage_add new file mode 100644 index 0000000..8e878ab --- /dev/null +++ b/clustermd_tests/02r10_Manage_add @@ -0,0 +1,33 @@ +#!/bin/bash + +mdadm -CR $md0 -l10 -b clustered --layout n2 -n2 $dev0 $dev1 --assume-clean +ssh $NODE2 mdadm -A $md0 $dev0 $dev1 +check all nosync +check all raid10 +check all bitmap +check all state UU +check all dmesg +mdadm --manage $md0 --fail $dev0 --remove $dev0 +mdadm --zero $dev2 +mdadm --manage $md0 --add $dev2 +sleep 0.3 +check $NODE1 recovery +check $NODE1 wait +check all state UU +check all dmesg +stop_md all $md0 + +mdadm -CR $md0 -l10 -b clustered --layout n2 -n2 $dev0 $dev1 --assume-clean +ssh $NODE2 mdadm -A $md0 $dev0 $dev1 +check all nosync +check all raid10 +check all bitmap +check all state UU +check all dmesg +mdadm --manage $md0 --add $dev2 +check all spares 1 +check all state UU +check all dmesg +stop_md all $md0 + +exit 0 diff --git a/clustermd_tests/02r10_Manage_add-spare b/clustermd_tests/02r10_Manage_add-spare new file mode 100644 index 0000000..9924aa8 --- /dev/null +++ b/clustermd_tests/02r10_Manage_add-spare @@ -0,0 +1,30 @@ +#!/bin/bash + +mdadm -CR $md0 -l10 -b clustered --layout n2 -n2 $dev0 $dev1 --assume-clean +ssh $NODE2 mdadm -A $md0 $dev0 
$dev1 +check all nosync +check all raid10 +check all bitmap +check all state UU +check all dmesg +mdadm --manage $md0 --add-spare $dev2 +check all spares 1 +check all state UU +check all dmesg +stop_md all $md0 + +mdadm -CR $md0 -l10 -b clustered --layout n2 -n2 -x1 $dev0 $dev1 $dev2 --assume-clean +ssh $NODE2 mdadm -A $md0 $dev0 $dev1 $dev2 +check all nosync +check all raid10 +check all bitmap +check all spares 1 +check all state UU +check all dmesg +mdadm --manage $md0 --add-spare $dev3 +check all spares 2 +check all state UU +check all dmesg +stop_md all $md0 + +exit 0 diff --git a/clustermd_tests/02r10_Manage_re-add b/clustermd_tests/02r10_Manage_re-add new file mode 100644 index 0000000..2288a00 --- /dev/null +++ b/clustermd_tests/02r10_Manage_re-add @@ -0,0 +1,18 @@ +#!/bin/bash + +mdadm -CR $md0 -l10 -b clustered --layout n2 -n2 $dev0 $dev1 --assume-clean +ssh $NODE2 mdadm -A $md0 $dev0 $dev1 +check all nosync +check all raid10 +check all bitmap +check all state UU +check all dmesg +mdadm --manage $md0 --fail $dev0 --remove $dev0 +mdadm --manage $md0 --re-add $dev0 +check $NODE1 recovery +check all wait +check all state UU +check all dmesg +stop_md all $md0 + +exit 0 diff --git a/clustermd_tests/02r1_Manage_add b/clustermd_tests/02r1_Manage_add new file mode 100644 index 0000000..ab2751c --- /dev/null +++ b/clustermd_tests/02r1_Manage_add @@ -0,0 +1,33 @@ +#!/bin/bash + +mdadm -CR $md0 -l1 -b clustered -n2 $dev0 $dev1 --assume-clean +ssh $NODE2 mdadm -A $md0 $dev0 $dev1 +check all nosync +check all raid1 +check all bitmap +check all state UU +check all dmesg +mdadm --manage $md0 --fail $dev0 --remove $dev0 +mdadm --zero $dev2 +mdadm --manage $md0 --add $dev2 +sleep 0.3 +check $NODE1 recovery +check $NODE1 wait +check all state UU +check all dmesg +stop_md all $md0 + +mdadm -CR $md0 -l1 -b clustered -n2 $dev0 $dev1 --assume-clean +ssh $NODE2 mdadm -A $md0 $dev0 $dev1 +check all nosync +check all raid1 +check all bitmap +check all state UU +check all dmesg 
+mdadm --manage $md0 --add $dev2 +check all spares 1 +check all state UU +check all dmesg +stop_md all $md0 + +exit 0 diff --git a/clustermd_tests/02r1_Manage_add-spare b/clustermd_tests/02r1_Manage_add-spare new file mode 100644 index 0000000..eab8111 --- /dev/null +++ b/clustermd_tests/02r1_Manage_add-spare @@ -0,0 +1,30 @@ +#!/bin/bash + +mdadm -CR $md0 -l1 -b clustered -n2 $dev0 $dev1 --assume-clean +ssh $NODE2 mdadm -A $md0 $dev0 $dev1 +check all nosync +check all raid1 +check all bitmap +check all state UU +check all dmesg +mdadm --manage $md0 --add-spare $dev2 +check all spares 1 +check all state UU +check all dmesg +stop_md all $md0 + +mdadm -CR $md0 -l1 -b clustered -n2 -x1 $dev0 $dev1 $dev2 --assume-clean +ssh $NODE2 mdadm -A $md0 $dev0 $dev1 $dev2 +check all nosync +check all raid1 +check all bitmap +check all spares 1 +check all state UU +check all dmesg +mdadm --manage $md0 --add-spare $dev3 +check all spares 2 +check all state UU +check all dmesg +stop_md all $md0 + +exit 0 diff --git a/clustermd_tests/02r1_Manage_re-add b/clustermd_tests/02r1_Manage_re-add new file mode 100644 index 0000000..d0d13e5 --- /dev/null +++ b/clustermd_tests/02r1_Manage_re-add @@ -0,0 +1,16 @@ +#!/bin/bash + +mdadm -CR $md0 -l1 -b clustered -n2 $dev0 $dev1 --assume-clean +ssh $NODE2 mdadm -A $md0 $dev0 $dev1 +check all nosync +check all raid1 +check all bitmap +check all state UU +check all dmesg +mdadm --manage $md0 --fail $dev0 --remove $dev0 +mdadm --manage $md0 --re-add $dev0 +check all state UU +check all dmesg +stop_md all $md0 + +exit 0 diff --git a/clustermd_tests/03r10_switch-recovery b/clustermd_tests/03r10_switch-recovery new file mode 100644 index 0000000..867388d --- /dev/null +++ b/clustermd_tests/03r10_switch-recovery @@ -0,0 +1,21 @@ +#!/bin/bash + +mdadm -CR $md0 -l10 -b clustered --layout n2 -n2 -x1 $dev0 $dev1 $dev2 --assume-clean +ssh $NODE2 mdadm -A $md0 $dev0 $dev1 $dev2 +check all nosync +check all raid10 +check all bitmap +check all spares 1 +check 
all state UU +check all dmesg +mdadm --manage $md0 --fail $dev0 +sleep 0.2 +check $NODE1 recovery +stop_md $NODE1 $md0 +check $NODE2 recovery +check $NODE2 wait +check $NODE2 state UU +check all dmesg +stop_md $NODE2 $md0 + +exit 0 diff --git a/clustermd_tests/03r10_switch-resync b/clustermd_tests/03r10_switch-resync new file mode 100644 index 0000000..127c569 --- /dev/null +++ b/clustermd_tests/03r10_switch-resync @@ -0,0 +1,18 @@ +#!/bin/bash + +mdadm -CR $md0 -l10 -b clustered --layout n2 -n2 $dev0 $dev1 +ssh $NODE2 mdadm -A $md0 $dev0 $dev1 +check $NODE1 resync +check $NODE2 PENDING +stop_md $NODE1 $md0 +check $NODE2 resync +check $NODE2 wait +mdadm -A $md0 $dev0 $dev1 +check all raid10 +check all bitmap +check all nosync +check all state UU +check all dmesg +stop_md all $md0 + +exit 0 diff --git a/clustermd_tests/03r1_switch-recovery b/clustermd_tests/03r1_switch-recovery new file mode 100644 index 0000000..a1a7cbe --- /dev/null +++ b/clustermd_tests/03r1_switch-recovery @@ -0,0 +1,21 @@ +#!/bin/bash + +mdadm -CR $md0 -l1 -b clustered -n2 -x1 $dev0 $dev1 $dev2 --assume-clean +ssh $NODE2 mdadm -A $md0 $dev0 $dev1 $dev2 +check all nosync +check all raid1 +check all bitmap +check all spares 1 +check all state UU +check all dmesg +mdadm --manage $md0 --fail $dev0 +sleep 0.3 +check $NODE1 recovery +stop_md $NODE1 $md0 +check $NODE2 recovery +check $NODE2 wait +check $NODE2 state UU +check all dmesg +stop_md $NODE2 $md0 + +exit 0 diff --git a/clustermd_tests/03r1_switch-resync b/clustermd_tests/03r1_switch-resync new file mode 100644 index 0000000..d99e1c5 --- /dev/null +++ b/clustermd_tests/03r1_switch-resync @@ -0,0 +1,18 @@ +#!/bin/bash + +mdadm -CR $md0 -l1 -b clustered -n2 $dev0 $dev1 +ssh $NODE2 mdadm -A $md0 $dev0 $dev1 +check $NODE1 resync +check $NODE2 PENDING +stop_md $NODE1 $md0 +check $NODE2 resync +check $NODE2 wait +mdadm -A $md0 $dev0 $dev1 +check all raid1 +check all bitmap +check all nosync +check all state UU +check all dmesg +stop_md all $md0 + 
+exit 0 diff --git a/clustermd_tests/cluster_conf b/clustermd_tests/cluster_conf new file mode 100644 index 0000000..4f0c9fb --- /dev/null +++ b/clustermd_tests/cluster_conf @@ -0,0 +1,43 @@ +# Prerequisite: +# 1. The clustermd_tests/ cases only support testing a 2-node cluster; the cluster +# requires packages: 'pacemaker+corosync+sbd+crmsh', all packages link at +# "https://github.com/ClusterLabs/", and also requires dlm resource running +# on each node of cluster. +# For quick start HA-cluster with SUSE distributions, refer to the chapter 6-8: +# https://www.suse.com/documentation/sle-ha-12/install-quick/data/install-quick.html +# For Redhat distributions, please refer to: +# https://access.redhat.com/documentation/en-us/red_hat_enterprise_linux/7/html-single/high_availability_add-on_administration/index +# 2. Set up ssh-access in no-authorized (passwordless) mode, so that: +# # 'ssh $node1 -l root ls' and 'ssh $node2 -l root ls' succeed on any node. +# 3. Fill in the node-ip part and the disks part as follows. + +# Set node1 as the master node, the cluster-md cases should run on this node, +# and node2 is the slave node. +# For example: +# NODE1=192.168.1.100 (testing run here) +# NODE2=192.168.1.101 +NODE1= +NODE2= + +# Provide the devlist for clustermd-testing; the two steps are alternatives: if you +# set up step 1, don't set step 2, and vice versa. +# 1. Use ISCSI service to provide shared storage, then log in to the ISCSI target via +# ISCSI_TARGET_ID and ISCSI_TARGET_IP on iscsi clients, with commands like: +# Execute on iscsi clients: +# 1) discover the iscsi server. +# # iscsiadm -m discovery -t st -p $ISCSI_TARGET_IP +# 2) login and establish connection. +# # iscsiadm -m node -T $ISCSI_TARGET_ID -p $ISCSI_TARGET_IP -l +# Note: +# On the ISCSI server, all iscsi-luns must be created in one target_id; more +# than 6 luns/disks are recommended for testing, and each disk should be: 100M < disk < 800M. +# 2.
If all cluster-nodes mounted the same disks directly, and the devname are +# the same on all nodes, then put them to 'devlist'. + +# For example: (Only set $ISCSI_TARGET_ID is enough if iscsi has already connected) +# ISCSI_TARGET_ID=iqn.2018-01.example.com:clustermd-testing +# ISCSI_TARGET_IP=192.168.1.102 +ISCSI_TARGET_ID= + +#devlist=/dev/sda /dev/sdb /dev/sdc /dev/sdd +devlist= diff --git a/clustermd_tests/func.sh b/clustermd_tests/func.sh new file mode 100644 index 0000000..801d604 --- /dev/null +++ b/clustermd_tests/func.sh @@ -0,0 +1,332 @@ +#!/bin/bash + +check_ssh() +{ + NODE1="$(grep '^NODE1' $CLUSTER_CONF | cut -d'=' -f2)" + NODE2="$(grep '^NODE2' $CLUSTER_CONF | cut -d'=' -f2)" + [ -z "$NODE1" -o -z "$NODE2" ] && { + echo "Please provide node-ip in $CLUSTER_CONF." + exit 1 + } + for ip in $NODE1 $NODE2 + do + ssh -o NumberOfPasswordPrompts=0 $ip -l root "pwd" > /dev/null + [ $? -ne 0 ] && { + echo "Please setup ssh-access with no-authorized mode." + exit 1 + } + done +} + +fetch_devlist() +{ + ISCSI_ID="$(grep '^ISCSI_TARGET_ID' $CLUSTER_CONF | cut -d'=' -f2)" + devlist="$(grep '^devlist' $CLUSTER_CONF | cut -d'=' -f2)" + if [ ! -z "$ISCSI_ID" -a ! -z "$devlist" ] + then + echo "Config ISCSI_TARGET_ID or devlist in $CLUSTER_CONF." + exit 1 + elif [ ! -z "$ISCSI_ID" -a -z "$devlist" ] + then + for ip in $NODE1 $NODE2 + do + ssh $ip "ls /dev/disk/by-path/*$ISCSI_ID*" > /dev/null + [ $? -ne 0 ] && { + echo "$ip: No disks found in '$ISCSI_ID' connection." + exit 1 + } + done + devlist=($(ls /dev/disk/by-path/*$ISCSI_ID*)) + fi + # sbd disk cannot use in testing + # Init devlist as an array + i='' + devlist=(${devlist[@]#$i}) + for i in ${devlist[@]} + do + sbd -d $i dump &> /dev/null + [ $? -eq '0' ] && devlist=(${devlist[@]#$i}) + done + for i in $(seq 0 ${#devlist[@]}) + do + eval "dev$i=${devlist[$i]}" + done + [ "${#devlist[@]}" -lt 6 ] && { + echo "Cluster-md testing requires 6 disks at least." + exit 1 + } +} + +check_dlm() +{ + if ! 
crm configure show | grep -q dlm + then + crm configure primitive dlm ocf:pacemaker:controld \ + op monitor interval=60 timeout=60 \ + meta target-role=Started &> /dev/null + crm configure group base-group dlm + crm configure clone base-clone base-group \ + meta interleave=true + fi + sleep 1 + for ip in $NODE1 $NODE2 + do + ssh $ip "pgrep dlm_controld > /dev/null" || { + echo "$ip: dlm_controld daemon doesn't exist." + exit 1 + } + done + crm_mon -r -n1 | grep -iq "fail\|not" && { + echo "Please clear cluster-resource errors." + exit 1 + } +} + +check_env() +{ + user=$(id -un) + [ "X$user" = "Xroot" ] || { + echo "testing can only be done as 'root'." + exit 1 + } + [ \! -x $mdadm ] && { + echo "test: please run make everything before perform testing." + exit 1 + } + check_ssh + commands=(mdadm iscsiadm bc modinfo dlm_controld + udevadm crm crm_mon lsblk pgrep sbd) + for ip in $NODE1 $NODE2 + do + for cmd in ${commands[@]} + do + ssh $ip "which $cmd &> /dev/null" || { + echo "$ip: $cmd, command not found!" + exit 1 + } + done + mods=(raid1 raid10 md_mod dlm md-cluster) + for mod in ${mods[@]} + do + ssh $ip "modinfo $mod > /dev/null" || { + echo "$ip: $mod, module doesn't exist." + exit 1 + } + done + ssh $ip "lsblk -a | grep -iq raid" + [ $? -eq 0 ] && { + echo "$ip: Please run testing without running RAIDs environment." + exit 1 + } + ssh $ip "modprobe md_mod" + done + fetch_devlist + check_dlm + [ -d $logdir ] || mkdir -p $logdir +} + +# $1/node, $2/optional +stop_md() +{ + if [ "$1" == "all" ] + then + NODES=($NODE1 $NODE2) + elif [ "$1" == "$NODE1" -o "$1" == "$NODE2" ] + then + NODES=$1 + else + die "$1: unknown parameter." 
# check: $1/cluster_node $2/feature $3/optional
#
# Verify on the selected node(s) that /proc/mdstat (or dmesg) shows the
# requested feature, calling 'die' when it does not.
#   $1  "all", $NODE1 or $NODE2
#   $2  spares | raid* | PENDING | recovery | resync | reshape | wait |
#       bitmap | nobitmap | chunk | state | nosync | readonly | dmesg
#   $3  expected value for 'spares', 'chunk' and 'state'
check()
{
	NODES=()
	if [ "$1" == "all" ]
	then
		NODES=($NODE1 $NODE2)
	elif [ "$1" == "$NODE1" -o "$1" == "$NODE2" ]
	then
		NODES=($1)
	else
		die "$1: unknown parameter."
	fi
	case $2 in
		spares )
			for ip in ${NODES[@]}
			do
				spares=$(ssh $ip "tr '] ' '\012\012' < /proc/mdstat | grep -c '(S)'")
				[ "$spares" -ne "$3" ] &&
					die "$ip: expected $3 spares, but found $spares"
			done
		;;
		raid* )
			for ip in ${NODES[@]}
			do
				ssh $ip "grep -sq '$2' /proc/mdstat" ||
					die "$ip: check '$2' failed."
			done
		;;
		PENDING | recovery | resync | reshape )
			# Retry budget shared by all nodes; kept local so it does
			# not leak into the caller's environment.
			local cnt=5
			for ip in ${NODES[@]}
			do
				while ! ssh $ip "grep -sq '$2' /proc/mdstat"
				do
					if [ "$cnt" -gt '0' ]
					then
						sleep 0.2
						cnt=$((cnt-1))
					else
						die "$ip: no '$2' happening!"
					fi
				done
			done
		;;
		wait )
			local cnt=60
			for ip in ${NODES[@]}
			do
				# Raise the sync speed limit while waiting and put
				# the previous value back afterwards.
				p=$(ssh $ip "cat /proc/sys/dev/raid/speed_limit_max")
				ssh $ip "echo 200000 > /proc/sys/dev/raid/speed_limit_max"
				while ssh $ip "grep -Esq '(resync|recovery|reshape|check|repair)' /proc/mdstat"
				do
					if [ "$cnt" -gt '0' ]
					then
						sleep 5
						cnt=$((cnt-1))
					else
						die "$ip: Check '$2' timeout over 300 seconds."
					fi
				done
				ssh $ip "echo $p > /proc/sys/dev/raid/speed_limit_max"
			done
		;;
		bitmap )
			for ip in ${NODES[@]}
			do
				ssh $ip "grep -sq '$2' /proc/mdstat" ||
					die "$ip: no '$2' found in /proc/mdstat."
			done
		;;
		nobitmap )
			for ip in ${NODES[@]}
			do
				ssh $ip "grep -sq 'bitmap' /proc/mdstat" &&
					die "$ip: 'bitmap' found in /proc/mdstat."
			done
		;;
		chunk )
			for ip in ${NODES[@]}
			do
				# Read the chunk size from the node being checked,
				# not from the local /proc/mdstat.
				chunk_size=$(ssh $ip "awk -F',' '/chunk/{print \$2}' /proc/mdstat | awk -F'[a-z]' '{print \$1}'")
				[ "$chunk_size" -ne "$3" ] &&
					die "$ip: chunksize should be $3, but it's $chunk_size"
			done
		;;
		state )
			for ip in ${NODES[@]}
			do
				ssh $ip "grep -Esq 'blocks.*\[$3\]\$' /proc/mdstat" ||
					die "$ip: no '$3' found in /proc/mdstat."
			done
		;;
		nosync )
			for ip in ${NODES[@]}
			do
				ssh $ip "grep -Eq '(resync|recovery)' /proc/mdstat" &&
					die "$ip: resync or recovery is happening!"
			done
		;;
		readonly )
			for ip in ${NODES[@]}
			do
				ssh $ip "grep -sq 'read-only' /proc/mdstat" ||
					die "$ip: check '$2' failed!"
			done
		;;
		dmesg )
			for ip in ${NODES[@]}
			do
				ssh $ip "dmesg | grep -iq 'error\|call trace\|segfault'" &&
					die "$ip: check '$2' prints errors!"
			done
		;;
		* )
			die "unknown parameter $2"
		;;
	esac
}
+ * I like free format, abhore backslash line continuation, adore + * indentation for structure and am ok about # comments. + * + * So, each line that isn't blank or a #comment must either start + * with a key word, and not be indented, or must start with a + * non-key-word and must be indented. + * + * Keywords are DEVICE and ARRAY ... and several others. + * DEV{ICE} introduces some devices that might contain raid components. + * e.g. + * DEV style=0 /dev/sda* /dev/hd* + * DEV style=1 /dev/sd[b-f]* + * ARR{AY} describes an array giving md device and attributes like uuid=whatever + * e.g. + * ARRAY /dev/md0 uuid=whatever name=something + * Spaces separate words on each line. Quoting, with "" or '' protects them, + * but may not wrap over lines + * + */ +#ifndef _POSIX_C_SOURCE +#define _POSIX_C_SOURCE 200809L +#endif + +#ifndef CONFFILE +#define CONFFILE "/etc/mdadm.conf" +#endif +#ifndef CONFFILE2 +/* for Debian compatibility .... */ +#define CONFFILE2 "/etc/mdadm/mdadm.conf" +#endif +char DefaultConfFile[] = CONFFILE; +char DefaultConfDir[] = CONFFILE ".d"; +char DefaultAltConfFile[] = CONFFILE2; +char DefaultAltConfDir[] = CONFFILE2 ".d"; + +enum linetype { Devices, Array, Mailaddr, Mailfrom, Program, CreateDev, + Homehost, HomeCluster, AutoMode, Policy, PartPolicy, Sysfs, + MonitorDelay, LTEnd }; +char *keywords[] = { + [Devices] = "devices", + [Array] = "array", + [Mailaddr] = "mailaddr", + [Mailfrom] = "mailfrom", + [Program] = "program", + [CreateDev]= "create", + [Homehost] = "homehost", + [HomeCluster] = "homecluster", + [AutoMode] = "auto", + [Policy] = "policy", + [PartPolicy]="part-policy", + [Sysfs] = "sysfs", + [MonitorDelay] = "monitordelay", + [LTEnd] = NULL +}; + +/* + * match_keyword returns an index into the keywords array, or -1 for no match + * case is ignored, and at least three characters must be given + */ + +int match_keyword(char *word) +{ + int len = strlen(word); + int n; + + if (len < 3) + return -1; + for (n = 0; keywords[n]; n++) { + 
if (strncasecmp(word, keywords[n], len) == 0) + return n; + } + + return -1; +} + +struct conf_dev { + struct conf_dev *next; + char *name; +} *cdevlist = NULL; + +struct mddev_dev *load_partitions(void) +{ + FILE *f = fopen("/proc/partitions", "r"); + char buf[1024]; + struct mddev_dev *rv = NULL; + + if (f == NULL) { + pr_err("cannot open /proc/partitions\n"); + return NULL; + } + while (fgets(buf, 1024, f)) { + int major, minor; + char *name, *mp; + struct mddev_dev *d; + + buf[1023] = '\0'; + if (buf[0] != ' ') + continue; + major = strtoul(buf, &mp, 10); + if (mp == buf || *mp != ' ') + continue; + minor = strtoul(mp, NULL, 10); + + name = map_dev(major, minor, 1); + if (!name) + continue; + d = xcalloc(1, sizeof(*d)); + d->devname = xstrdup(name); + d->next = rv; + rv = d; + } + fclose(f); + return rv; +} + +struct mddev_dev *load_containers(void) +{ + struct mdstat_ent *mdstat = mdstat_read(0, 0); + struct mdstat_ent *ent; + struct mddev_dev *d; + struct mddev_dev *rv = NULL; + struct map_ent *map = NULL, *me; + + if (!mdstat) + return NULL; + + for (ent = mdstat; ent; ent = ent->next) + if (ent->metadata_version && + strncmp(ent->metadata_version, "external:", 9) == 0 && + !is_subarray(&ent->metadata_version[9])) { + d = xcalloc(1, sizeof(*d)); + me = map_by_devnm(&map, ent->devnm); + if (me) + d->devname = xstrdup(me->path); + else if (asprintf(&d->devname, "/dev/%s", ent->devnm) < 0) { + free(d); + continue; + } + d->next = rv; + rv = d; + map_free(map); + map = NULL; + } + free_mdstat(mdstat); + + return rv; +} + +struct createinfo createinfo = { + .autof = 2, /* by default, create devices with standard names */ + .symlinks = 1, + .names = 0, /* By default, stick with numbered md devices. 
/*
 * Parse the argument of an "auto=" setting.
 *
 * str:    the text after "auto=" (NULL or empty selects the default).
 * msg:    name of the option, used only in the error message.
 * config: non-zero when the word comes from a CREATE line of the config
 *         file; this selects 5/6 instead of 3/4 for "md"/"mdp".
 *
 * Returns the device style in the low three bits.  For the forms that
 * accept a trailing number ("md", "yes", "mdp", "p", "part...", each
 * optionally followed by '-' and digits) that number, default 4, is
 * stored shifted left by 3.  An unrecognised word is reported and the
 * program exits with status 2.
 */
int parse_auto(char *str, char *msg, int config)
{
	int autof;
	if (str == NULL || *str == 0)
		autof = 2;
	else if (strcasecmp(str, "no") == 0)
		autof = 1;
	else if (strcasecmp(str, "yes") == 0)
		autof = 2;
	else if (strcasecmp(str, "md") == 0)
		autof = config ? 5:3;
	else {
		/* There might be digits, and maybe a hyphen, at the end */
		char *e = str + strlen(str);
		int num = 4;
		int len;
		/* <ctype.h> functions need an unsigned char value */
		while (e > str && isdigit((unsigned char)e[-1]))
			e--;
		if (*e) {
			num = atoi(e);
			if (num <= 0)
				num = 1;
		}
		if (e > str && e[-1] == '-')
			e--;
		len = e - str;
		if ((len == 2 && strncasecmp(str, "md", 2) == 0)) {
			autof = config ? 5 : 3;
		} else if ((len == 3 && strncasecmp(str, "yes", 3) == 0)) {
			autof = 2;
		} else if ((len == 3 && strncasecmp(str, "mdp", 3) == 0)) {
			autof = config ? 6 : 4;
		} else if ((len == 1 && strncasecmp(str, "p", 1) == 0) ||
			   (len >= 4 && strncasecmp(str, "part", 4) == 0)) {
			autof = 6;
		} else {
			pr_err("%s arg of \"%s\" unrecognised: use no,yes,md,mdp,part\n"
				"        optionally followed by a number.\n",
				msg, str);
			exit(2);
		}
		autof |= num << 3;
	}
	return autof;
}
/*
 * Return 1 if 'w' consists of one or more decimal digits and nothing
 * else, 0 otherwise (including for the empty string).
 */
static int is_number(char *w)
{
	int digits = 0;

	/*
	 * isdigit() is only defined for EOF and values representable as
	 * unsigned char, so a plain (possibly negative) char must be cast.
	 */
	while (*w && isdigit((unsigned char)*w)) {
		digits++;
		w++;
	}
	return (digits && ! *w);
}
pr_err("name too long, ignoring %s\n", w); + else + strcpy(mis.name, w + 5); + + } else if (strncasecmp(w, "bitmap=", 7) == 0) { + if (mis.bitmap_file) + pr_err("only specify bitmap file once. %s ignored\n", + w); + else + mis.bitmap_file = xstrdup(w + 7); + + } else if (strncasecmp(w, "devices=", 8 ) == 0) { + if (mis.devices) + pr_err("only specify devices once (use a comma separated list). %s ignored\n", + w); + else + mis.devices = xstrdup(w + 8); + } else if (strncasecmp(w, "spare-group=", 12) == 0) { + if (mis.spare_group) + pr_err("only specify one spare group per array. %s ignored.\n", + w); + else + mis.spare_group = xstrdup(w + 12); + } else if (strncasecmp(w, "level=", 6) == 0 ) { + /* this is mainly for compatability with --brief output */ + mis.level = map_name(pers, w + 6); + } else if (strncasecmp(w, "disks=", 6) == 0) { + /* again, for compat */ + mis.raid_disks = atoi(w + 6); + } else if (strncasecmp(w, "num-devices=", 12) == 0) { + /* again, for compat */ + mis.raid_disks = atoi(w + 12); + } else if (strncasecmp(w, "spares=", 7) == 0) { + /* for warning if not all spares present */ + mis.spare_disks = atoi(w + 7); + } else if (strncasecmp(w, "metadata=", 9) == 0) { + /* style of metadata on the devices. */ + int i; + + for(i=0; superlist[i] && !mis.st; i++) + mis.st = superlist[i]-> + match_metadata_desc(w + 9); + + if (!mis.st) + pr_err("metadata format %s unknown, ignored.\n", + w + 9); + } else if (strncasecmp(w, "auto=", 5) == 0 ) { + /* whether to create device special files as needed */ + mis.autof = parse_auto(w + 5, "auto type", 0); + } else if (strncasecmp(w, "member=", 7) == 0) { + /* subarray within a container */ + mis.member = xstrdup(w + 7); + } else if (strncasecmp(w, "container=", 10) == 0) { + /* The container holding this subarray. 
+ * Either a device name or a uuid */ + mis.container = xstrdup(w + 10); + } else { + pr_err("unrecognised word on ARRAY line: %s\n", + w); + } + } + if (mis.uuid_set == 0 && mis.devices == NULL && + mis.super_minor == UnSet && mis.name[0] == 0 && + (mis.container == NULL || mis.member == NULL)) + pr_err("ARRAY line %s has no identity information.\n", + mis.devname); + else { + mi = xmalloc(sizeof(*mi)); + *mi = mis; + mi->devname = mis.devname ? xstrdup(mis.devname) : NULL; + mi->next = NULL; + *mddevlp = mi; + mddevlp = &mi->next; + } +} + +static char *alert_email = NULL; +void mailline(char *line) +{ + char *w; + + for (w = dl_next(line); w != line; w = dl_next(w)) + if (alert_email == NULL) + alert_email = xstrdup(w); +} + +static char *alert_mail_from = NULL; +void mailfromline(char *line) +{ + char *w; + + for (w = dl_next(line); w != line; w = dl_next(w)) { + if (alert_mail_from == NULL) + alert_mail_from = xstrdup(w); + else { + char *t = NULL; + + if (xasprintf(&t, "%s %s", alert_mail_from, w) > 0) { + free(alert_mail_from); + alert_mail_from = t; + } + } + } +} + +static char *alert_program = NULL; +void programline(char *line) +{ + char *w; + + for (w = dl_next(line); w != line; w = dl_next(w)) + if (alert_program == NULL) + alert_program = xstrdup(w); +} + +static char *home_host = NULL; +static int require_homehost = 1; +void homehostline(char *line) +{ + char *w; + + for (w = dl_next(line); w != line; w = dl_next(w)) { + if (strcasecmp(w, "<ignore>") == 0) + require_homehost = 0; + else if (home_host == NULL) { + if (strcasecmp(w, "<none>") == 0) + home_host = xstrdup(""); + else + home_host = xstrdup(w); + } + } +} + +static char *home_cluster = NULL; +void homeclusterline(char *line) +{ + char *w; + + for (w = dl_next(line); w != line; w = dl_next(w)) { + if (home_cluster == NULL) { + if (strcasecmp(w, "<none>") == 0) + home_cluster = xstrdup(""); + else + home_cluster = xstrdup(w); + } + } +} + +static int monitor_delay; +void 
monitordelayline(char *line) +{ + char *w; + + for (w = dl_next(line); w != line; w = dl_next(w)) { + if (monitor_delay == 0) + monitor_delay = strtol(w, NULL, 10); + } +} + +char auto_yes[] = "yes"; +char auto_no[] = "no"; +char auto_homehost[] = "homehost"; + +static int auto_seen = 0; +void autoline(char *line) +{ + char *w; + char *seen; + int super_cnt; + char *dflt = auto_yes; + int homehost = 0; + int i; + + if (auto_seen) + return; + auto_seen = 1; + + /* + * Parse the 'auto' line creating policy statements for the 'auto' + * policy. + * + * The default is 'yes' but the 'auto' line might over-ride that. + * Words in the line are processed in order with the first + * match winning. + * word can be: + * +version - that version can be assembled + * -version - that version cannot be auto-assembled + * yes or +all - any other version can be assembled + * no or -all - no other version can be assembled. + * homehost - any array associated by 'homehost' to this + * host can be assembled. + * + * Thus: + * +ddf -0.90 homehost -all + * will auto-assemble any ddf array, no 0.90 array, and + * any other array (imsm, 1.x) if and only if it is identified + * as belonging to this host. + * + * We translate that to policy by creating 'auto=yes' when we see + * a '+version' line, 'auto=no' if we see '-version' before 'homehost', + * or 'auto=homehost' if we see '-version' after 'homehost'. + * When we see yes, no, +all or -all we stop and any version that hasn't + * been seen gets an appropriate auto= entry. + */ + + /* + * If environment variable MDADM_CONF_AUTO is defined, then + * it is prepended to the auto line. This allow a script + * to easily disable some metadata types. 
+ */ + w = getenv("MDADM_CONF_AUTO"); + if (w && *w) { + char *l = xstrdup(w); + char *head = line; + w = strtok(l, " \t"); + while (w) { + char *nw = dl_strdup(w); + dl_insert(head, nw); + head = nw; + w = strtok(NULL, " \t"); + } + free(l); + } + + for (super_cnt = 0; superlist[super_cnt]; super_cnt++) + ; + seen = xcalloc(super_cnt, 1); + + for (w = dl_next(line); w != line; w = dl_next(w)) { + char *val; + + if (strcasecmp(w, "yes") == 0) { + dflt = auto_yes; + break; + } + if (strcasecmp(w, "no") == 0) { + if (homehost) + dflt = auto_homehost; + else + dflt = auto_no; + break; + } + if (strcasecmp(w, "homehost") == 0) { + homehost = 1; + continue; + } + if (w[0] == '+') + val = auto_yes; + else if (w[0] == '-') { + if (homehost) + val = auto_homehost; + else + val = auto_no; + } else + continue; + + if (strcasecmp(w + 1, "all") == 0) { + dflt = val; + break; + } + for (i = 0; superlist[i]; i++) { + const char *version = superlist[i]->name; + if (strcasecmp(w + 1, version) == 0) + break; + /* 1 matches 1.x, 0 matches 0.90 */ + if (version[1] == '.' 
/*
 * Process 'f' as configuration input.
 *
 * A regular file is parsed directly with conf_file().  For a directory
 * (only when fdopendir()/openat() are available) every entry that does
 * not start with '.' and whose name is at least one character followed
 * by ".conf" is parsed, in strcmp() order of the entry names.  Anything
 * else is ignored.
 *
 * 'f' stays open and valid; closing it remains the caller's job.
 */
void conf_file_or_dir(FILE *f)
{
	struct stat st;
	DIR *dir;
	struct dirent *dp;
	struct fname *list = NULL;
	int dfd;

	if (fstat(fileno(f), &st) != 0)
		return;
	if (S_ISREG(st.st_mode)) {
		conf_file(f);
		return;
	}
	if (!S_ISDIR(st.st_mode))
		return;
#if _XOPEN_SOURCE >= 700 || _POSIX_C_SOURCE >= 200809L
	/*
	 * fdopendir() takes ownership of the descriptor it is given and
	 * closedir() closes it.  Hand it a duplicate so that the caller's
	 * fclose(f) does not close the same descriptor a second time.
	 */
	dfd = dup(fileno(f));
	if (dfd < 0)
		return;
	dir = fdopendir(dfd);
	if (!dir) {
		close(dfd);
		return;
	}
	/* Collect the matching names into a list sorted by name */
	while ((dp = readdir(dir)) != NULL) {
		int l;
		struct fname *fn, **p;
		if (dp->d_ino == 0)
			continue;
		if (dp->d_name[0] == '.')
			continue;
		l = strlen(dp->d_name);
		if (l < 6 || strcmp(dp->d_name + l - 5, ".conf") != 0)
			continue;
		fn = xmalloc(sizeof(*fn) + l + 1);
		strcpy(fn->name, dp->d_name);
		for (p = &list;
		     *p && strcmp((*p)->name, fn->name) < 0;
		     p = & (*p)->next)
			;
		fn->next = *p;
		*p = fn;
	}
	/* Parse each file in order, freeing the list as we go */
	while (list) {
		int fd;
		FILE *f2;
		struct fname *fn = list;
		list = list->next;
		fd = openat(fileno(f), fn->name, O_RDONLY);
		free(fn);
		if (fd < 0)
			continue;
		f2 = fdopen(fd, "r");
		if (!f2) {
			close(fd);
			continue;
		}
		conf_file(f2);
		fclose(f2);
	}
	closedir(dir);
#endif
}
+ */ + head = dl_strdup("AUTO"); + dl_init(head); + autoline(head); + free_line(head); + + loaded = 1; +} + +char *conf_get_mailaddr(void) +{ + load_conffile(); + return alert_email; +} + +char *conf_get_mailfrom(void) +{ + load_conffile(); + return alert_mail_from; +} + +char *conf_get_program(void) +{ + load_conffile(); + return alert_program; +} + +char *conf_get_homehost(int *require_homehostp) +{ + load_conffile(); + if (require_homehostp) + *require_homehostp = require_homehost; + return home_host; +} + +char *conf_get_homecluster(void) +{ + load_conffile(); + return home_cluster; +} + +int conf_get_monitor_delay(void) +{ + load_conffile(); + return monitor_delay; +} + +struct createinfo *conf_get_create_info(void) +{ + load_conffile(); + return &createinfo; +} + +struct mddev_ident *conf_get_ident(char *dev) +{ + struct mddev_ident *rv; + load_conffile(); + rv = mddevlist; + while (dev && rv && (rv->devname == NULL || + !devname_matches(dev, rv->devname))) + rv = rv->next; + return rv; +} + +static void append_dlist(struct mddev_dev **dlp, struct mddev_dev *list) +{ + while (*dlp) + dlp = &(*dlp)->next; + *dlp = list; +} + +struct mddev_dev *conf_get_devs() +{ + glob_t globbuf; + struct conf_dev *cd; + int flags = 0; + static struct mddev_dev *dlist = NULL; + unsigned int i; + + while (dlist) { + struct mddev_dev *t = dlist; + dlist = dlist->next; + free(t->devname); + free(t); + } + + load_conffile(); + + if (cdevlist == NULL) { + /* default to 'partitions' and 'containers' */ + dlist = load_partitions(); + append_dlist(&dlist, load_containers()); + } + + for (cd = cdevlist; cd; cd = cd->next) { + if (strcasecmp(cd->name, "partitions") == 0) + append_dlist(&dlist, load_partitions()); + else if (strcasecmp(cd->name, "containers") == 0) + append_dlist(&dlist, load_containers()); + else { + glob(cd->name, flags, NULL, &globbuf); + flags |= GLOB_APPEND; + } + } + if (flags & GLOB_APPEND) { + for (i = 0; i < globbuf.gl_pathc; i++) { + struct mddev_dev *t; + t = 
/*
 * Reduce an array name to the part that is compared: drop a leading
 * "/dev/md/" or else a leading "/dev/", then drop "md" when it is
 * directly followed by a digit (so "md12" compares as "12").
 */
static const char *strip_md_prefix(const char *s)
{
	if (strncmp(s, "/dev/md/", 8) == 0)
		s += 8;
	else if (strncmp(s, "/dev/", 5) == 0)
		s += 5;

	/* <ctype.h> functions need an unsigned char value */
	if (strncmp(s, "md", 2) == 0 && isdigit((unsigned char)s[2]))
		s += 2;

	return s;
}

int devname_matches(char *name, char *match)
{
	/* See if the given array name matches the
	 * given match from config file.
	 *
	 * First strip any /dev/md/ or /dev/, then
	 * see if there might be a numeric match of
	 *  mdNN with NN
	 * then just strcmp
	 *
	 * Returns 1 on a match, 0 otherwise.
	 */
	return (strcmp(strip_md_prefix(name), strip_md_prefix(match)) == 0);
}
%s.\n", + array_list->devname); + continue; + } + if (array_list->super_minor != UnSet && + array_list->super_minor != info->array.md_minor) { + if (verbose >= 2 && array_list->devname) + pr_err("Different super-minor to %s.\n", + array_list->devname); + continue; + } + if (!array_list->uuid_set && !array_list->name[0] && + !array_list->devices && array_list->super_minor == UnSet) { + if (verbose >= 2 && array_list->devname) + pr_err("%s doesn't have any identifying information.\n", + array_list->devname); + continue; + } + /* FIXME, should I check raid_disks and level too?? */ + + if (match) { + if (verbose >= 0) { + if (match->devname && array_list->devname) + pr_err("we match both %s and %s - cannot decide which to use.\n", + match->devname, + array_list->devname); + else + pr_err("multiple lines in mdadm.conf match\n"); + } + if (rvp) + *rvp = 2; + match = NULL; + break; + } + match = array_list; + } + return match; +} + +int conf_verify_devnames(struct mddev_ident *array_list) +{ + struct mddev_ident *a1, *a2; + + for (a1 = array_list; a1; a1 = a1->next) { + if (!a1->devname) + continue; + if (strcmp(a1->devname, "<ignore>") == 0) + continue; + for (a2 = a1->next; a2; a2 = a2->next) { + if (!a2->devname) + continue; + if (strcmp(a1->devname, a2->devname) != 0) + continue; + + if (a1->uuid_set && a2->uuid_set) { + char nbuf[64]; + __fname_from_uuid(a1->uuid, 0, nbuf, ':'); + pr_err("Devices %s and ", + nbuf); + __fname_from_uuid(a2->uuid, 0, nbuf, ':'); + fprintf(stderr, + "%s have the same name: %s\n", + nbuf, a1->devname); + } else + pr_err("Device %s given twice in config file\n", a1->devname); + return 1; + } + } + + return 0; +} diff --git a/coverity-gcc-hack.h b/coverity-gcc-hack.h new file mode 100644 index 0000000..2d94a8b --- /dev/null +++ b/coverity-gcc-hack.h @@ -0,0 +1,10 @@ +#if !defined(__KERNEL__) && defined(__x86_64__) && defined(__COVERITY_GCC_VERSION_AT_LEAST) +#if __COVERITY_GCC_VERSION_AT_LEAST(7, 0) +typedef float _Float128 
__attribute__((__vector_size__(128))); +typedef float _Float64 __attribute__((__vector_size__(64))); +typedef float _Float32 __attribute__((__vector_size__(32))); +typedef float _Float128x __attribute__((__vector_size__(128))); +typedef float _Float64x __attribute__((__vector_size__(64))); +typedef float _Float32x __attribute__((__vector_size__(32))); +#endif +#endif @@ -0,0 +1,360 @@ +/* crc32.c -- compute the CRC-32 of a data stream + * Copyright (C) 1995-2003 Mark Adler + * For conditions of distribution and use, see copyright notice in zlib.h + * + * Note: zlib license from from zlib.h added explicitly as mdadm does + * not include zlib.h. License from v1.2.2 of zlib: + * + * This software is provided 'as-is', without any express or implied + * warranty. In no event will the authors be held liable for any damages + * arising from the use of this software. + * + * Permission is granted to anyone to use this software for any purpose, + * including commercial applications, and to alter it and redistribute it + * freely, subject to the following restrictions: + * + * 1. The origin of this software must not be misrepresented; you must not + * claim that you wrote the original software. If you use this software + * in a product, an acknowledgment in the product documentation would be + * appreciated but is not required. + * 2. Altered source versions must be plainly marked as such, and must not be + * misrepresented as being the original software. + * 3. This notice may not be removed or altered from any source distribution. + * + * + * Thanks to Rodney Brown <rbrown64@csc.com.au> for his contribution of faster + * CRC methods: exclusive-oring 32 bits of data at a time, and pre-computing + * tables for updating the shift register in one step with three exclusive-ors + * instead of four steps with four exclusive-ors. This results about a factor + * of two increase in speed on a Power PC G4 (PPC7455) using gcc -O3. 
+ */ + +/* @(#) $Id$ */ + +/* + Note on the use of DYNAMIC_CRC_TABLE: there is no mutex or semaphore + protection on the static variables used to control the first-use generation + of the crc tables. Therefore, if you #define DYNAMIC_CRC_TABLE, you should + first call get_crc_table() to initialize the tables before allowing more than + one thread to use crc32(). + */ + +#ifdef MAKECRCH +# include <stdio.h> +# ifndef DYNAMIC_CRC_TABLE +# define DYNAMIC_CRC_TABLE +# endif /* !DYNAMIC_CRC_TABLE */ +#endif /* MAKECRCH */ + +/* #include "zutil.h" / * for STDC and FAR definitions */ +#define STDC +#define FAR +#define Z_NULL ((void*)0) +#define OF(X) X +#define ZEXPORT +typedef long ptrdiff_t; +#define NOBYFOUR + +#define local static + +/* Find a four-byte integer type for crc32_little() and crc32_big(). */ +#ifndef NOBYFOUR +# ifdef STDC /* need ANSI C limits.h to determine sizes */ +# include <limits.h> +# define BYFOUR +# if (UINT_MAX == 0xffffffffUL) + typedef unsigned int u4; +# else +# if (ULONG_MAX == 0xffffffffUL) + typedef unsigned long u4; +# else +# if (USHRT_MAX == 0xffffffffUL) + typedef unsigned short u4; +# else +# undef BYFOUR /* can't find a four-byte integer type! */ +# endif +# endif +# endif +# endif /* STDC */ +#endif /* !NOBYFOUR */ + +/* Definitions for doing the crc four data bytes at a time. 
*/ +#ifdef BYFOUR +# define REV(w) (((w)>>24)+(((w)>>8)&0xff00)+ \ + (((w)&0xff00)<<8)+(((w)&0xff)<<24)) + local unsigned long crc32_little OF((unsigned long, + const unsigned char FAR *, unsigned)); + local unsigned long crc32_big OF((unsigned long, + const unsigned char FAR *, unsigned)); +# define TBLS 8 +#else +# define TBLS 1 +#endif /* BYFOUR */ + +#ifdef DYNAMIC_CRC_TABLE + +local volatile int crc_table_empty = 1; +local unsigned long FAR crc_table[TBLS][256]; +local void make_crc_table OF((void)); +#ifdef MAKECRCH + local void write_table OF((FILE *, const unsigned long FAR *)); +#endif /* MAKECRCH */ + +/* + Generate tables for a byte-wise 32-bit CRC calculation on the polynomial: + x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1. + + Polynomials over GF(2) are represented in binary, one bit per coefficient, + with the lowest powers in the most significant bit. Then adding polynomials + is just exclusive-or, and multiplying a polynomial by x is a right shift by + one. If we call the above polynomial p, and represent a byte as the + polynomial q, also with the lowest power in the most significant bit (so the + byte 0xb1 is the polynomial x^7+x^3+x+1), then the CRC is (q*x^32) mod p, + where a mod b means the remainder after dividing a by b. + + This calculation is done using the shift-register method of multiplying and + taking the remainder. The register is initialized to zero, and for each + incoming bit, x^32 is added mod p to the register if the bit is a one (where + x^32 mod p is p+x^32 = x^26+...+1), and the register is multiplied mod p by + x (which is shifting right by one and adding x^32 mod p if the bit shifted + out is a one). We start with the highest power (least significant bit) of + q and repeat for all eight bits of q. + + The first table is simply the CRC of all possible eight bit values. 
This is + all the information needed to generate CRCs on data a byte at a time for all + combinations of CRC register values and incoming bytes. The remaining tables + allow for word-at-a-time CRC calculation for both big-endian and little- + endian machines, where a word is four bytes. +*/ +local void make_crc_table() +{ + unsigned long c; + int n, k; + unsigned long poly; /* polynomial exclusive-or pattern */ + /* terms of polynomial defining this crc (except x^32): */ + static volatile int first = 1; /* flag to limit concurrent making */ + static const unsigned char p[] = {0,1,2,4,5,7,8,10,11,12,16,22,23,26}; + + /* See if another task is already doing this (not thread-safe, but better + than nothing -- significantly reduces duration of vulnerability in + case the advice about DYNAMIC_CRC_TABLE is ignored) */ + if (first) { + first = 0; + + /* make exclusive-or pattern from polynomial (0xedb88320UL) */ + poly = 0UL; + for (n = 0; n < sizeof(p)/sizeof(unsigned char); n++) + poly |= 1UL << (31 - p[n]); + + /* generate a crc for every 8-bit value */ + for (n = 0; n < 256; n++) { + c = (unsigned long)n; + for (k = 0; k < 8; k++) + c = c & 1 ? 
poly ^ (c >> 1) : c >> 1; + crc_table[0][n] = c; + } + +#ifdef BYFOUR + /* generate crc for each value followed by one, two, and three zeros, + and then the byte reversal of those as well as the first table */ + for (n = 0; n < 256; n++) { + c = crc_table[0][n]; + crc_table[4][n] = REV(c); + for (k = 1; k < 4; k++) { + c = crc_table[0][c & 0xff] ^ (c >> 8); + crc_table[k][n] = c; + crc_table[k + 4][n] = REV(c); + } + } +#endif /* BYFOUR */ + + crc_table_empty = 0; + } + else { /* not first */ + /* wait for the other guy to finish (not efficient, but rare) */ + while (crc_table_empty) + ; + } + +#ifdef MAKECRCH + /* write out CRC tables to crc32.h */ + { + FILE *out; + + out = fopen("crc32.h", "w"); + if (out == NULL) return; + fprintf(out, "/* crc32.h -- tables for rapid CRC calculation\n"); + fprintf(out, " * Generated automatically by crc32.c\n */\n\n"); + fprintf(out, "local const unsigned long FAR "); + fprintf(out, "crc_table[TBLS][256] =\n{\n {\n"); + write_table(out, crc_table[0]); +# ifdef BYFOUR + fprintf(out, "#ifdef BYFOUR\n"); + for (k = 1; k < 8; k++) { + fprintf(out, " },\n {\n"); + write_table(out, crc_table[k]); + } + fprintf(out, "#endif\n"); +# endif /* BYFOUR */ + fprintf(out, " }\n};\n"); + fclose(out); + } +#endif /* MAKECRCH */ +} + +#ifdef MAKECRCH +local void write_table(out, table) + FILE *out; + const unsigned long FAR *table; +{ + int n; + + for (n = 0; n < 256; n++) + fprintf(out, "%s0x%08lxUL%s", n % 5 ? "" : " ", table[n], + n == 255 ? "\n" : (n % 5 == 4 ? ",\n" : ", ")); +} +#endif /* MAKECRCH */ + +#else /* !DYNAMIC_CRC_TABLE */ +/* ======================================================================== + * Tables of CRC-32s of all single-byte values, made by make_crc_table(). 
+ */ +#include "crc32.h" +#endif /* DYNAMIC_CRC_TABLE */ + +/* ========================================================================= + * This function can be used by asm versions of crc32() + */ +const unsigned long FAR * ZEXPORT get_crc_table(void) +{ +#ifdef DYNAMIC_CRC_TABLE + if (crc_table_empty) + make_crc_table(); +#endif /* DYNAMIC_CRC_TABLE */ + return (const unsigned long FAR *)crc_table; +} + +/* ========================================================================= */ +#define DO1 crc = crc_table[0][((int)crc ^ (*buf++)) & 0xff] ^ (crc >> 8) +#define DO8 DO1; DO1; DO1; DO1; DO1; DO1; DO1; DO1 + +/* ========================================================================= */ +unsigned long ZEXPORT crc32( + unsigned long crc, + const unsigned char FAR *buf, + unsigned len) +{ + if (buf == Z_NULL) return 0UL; + +#ifdef DYNAMIC_CRC_TABLE + if (crc_table_empty) + make_crc_table(); +#endif /* DYNAMIC_CRC_TABLE */ + +#ifdef BYFOUR + if (sizeof(void *) == sizeof(ptrdiff_t)) { + u4 endian; + + endian = 1; + if (*((unsigned char *)(&endian))) + return crc32_little(crc, buf, len); + else + return crc32_big(crc, buf, len); + } +#endif /* BYFOUR */ +/* crc = crc ^ 0xffffffffUL;*/ + while (len >= 8) { + DO8; + len -= 8; + } + if (len) do { + DO1; + } while (--len); + return crc /* ^ 0xffffffffUL*/; +} + +#ifdef BYFOUR + +/* ========================================================================= */ +#define DOLIT4 c ^= *buf4++; \ + c = crc_table[3][c & 0xff] ^ crc_table[2][(c >> 8) & 0xff] ^ \ + crc_table[1][(c >> 16) & 0xff] ^ crc_table[0][c >> 24] +#define DOLIT32 DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4; DOLIT4 + +/* ========================================================================= */ +local unsigned long crc32_little(crc, buf, len) + unsigned long crc; + const unsigned char FAR *buf; + unsigned len; +{ + register u4 c; + register const u4 FAR *buf4; + + c = (u4)crc; + c = ~c; + while (len && ((ptrdiff_t)buf & 3)) { + c = 
crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8); + len--; + } + + buf4 = (const u4 FAR *)buf; + while (len >= 32) { + DOLIT32; + len -= 32; + } + while (len >= 4) { + DOLIT4; + len -= 4; + } + buf = (const unsigned char FAR *)buf4; + + if (len) do { + c = crc_table[0][(c ^ *buf++) & 0xff] ^ (c >> 8); + } while (--len); + c = ~c; + return (unsigned long)c; +} + +/* ========================================================================= */ +#define DOBIG4 c ^= *++buf4; \ + c = crc_table[4][c & 0xff] ^ crc_table[5][(c >> 8) & 0xff] ^ \ + crc_table[6][(c >> 16) & 0xff] ^ crc_table[7][c >> 24] +#define DOBIG32 DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4; DOBIG4 + +/* ========================================================================= */ +local unsigned long crc32_big(crc, buf, len) + unsigned long crc; + const unsigned char FAR *buf; + unsigned len; +{ + register u4 c; + register const u4 FAR *buf4; + + c = REV((u4)crc); + c = ~c; + while (len && ((ptrdiff_t)buf & 3)) { + c = crc_table[4][(c >> 24) ^ *buf++] ^ (c << 8); + len--; + } + + buf4 = (const u4 FAR *)buf; + buf4--; + while (len >= 32) { + DOBIG32; + len -= 32; + } + while (len >= 4) { + DOBIG4; + len -= 4; + } + buf4++; + buf = (const unsigned char FAR *)buf4; + + if (len) do { + c = crc_table[4][(c >> 24) ^ *buf++] ^ (c << 8); + } while (--len); + c = ~c; + return (unsigned long)(REV(c)); +} + +#endif /* BYFOUR */ @@ -0,0 +1,441 @@ +/* crc32.h -- tables for rapid CRC calculation + * Generated automatically by crc32.c + */ + +local const unsigned long FAR crc_table[TBLS][256] = +{ + { + 0x00000000UL, 0x77073096UL, 0xee0e612cUL, 0x990951baUL, 0x076dc419UL, + 0x706af48fUL, 0xe963a535UL, 0x9e6495a3UL, 0x0edb8832UL, 0x79dcb8a4UL, + 0xe0d5e91eUL, 0x97d2d988UL, 0x09b64c2bUL, 0x7eb17cbdUL, 0xe7b82d07UL, + 0x90bf1d91UL, 0x1db71064UL, 0x6ab020f2UL, 0xf3b97148UL, 0x84be41deUL, + 0x1adad47dUL, 0x6ddde4ebUL, 0xf4d4b551UL, 0x83d385c7UL, 0x136c9856UL, + 0x646ba8c0UL, 0xfd62f97aUL, 0x8a65c9ecUL, 0x14015c4fUL, 
0x63066cd9UL, + 0xfa0f3d63UL, 0x8d080df5UL, 0x3b6e20c8UL, 0x4c69105eUL, 0xd56041e4UL, + 0xa2677172UL, 0x3c03e4d1UL, 0x4b04d447UL, 0xd20d85fdUL, 0xa50ab56bUL, + 0x35b5a8faUL, 0x42b2986cUL, 0xdbbbc9d6UL, 0xacbcf940UL, 0x32d86ce3UL, + 0x45df5c75UL, 0xdcd60dcfUL, 0xabd13d59UL, 0x26d930acUL, 0x51de003aUL, + 0xc8d75180UL, 0xbfd06116UL, 0x21b4f4b5UL, 0x56b3c423UL, 0xcfba9599UL, + 0xb8bda50fUL, 0x2802b89eUL, 0x5f058808UL, 0xc60cd9b2UL, 0xb10be924UL, + 0x2f6f7c87UL, 0x58684c11UL, 0xc1611dabUL, 0xb6662d3dUL, 0x76dc4190UL, + 0x01db7106UL, 0x98d220bcUL, 0xefd5102aUL, 0x71b18589UL, 0x06b6b51fUL, + 0x9fbfe4a5UL, 0xe8b8d433UL, 0x7807c9a2UL, 0x0f00f934UL, 0x9609a88eUL, + 0xe10e9818UL, 0x7f6a0dbbUL, 0x086d3d2dUL, 0x91646c97UL, 0xe6635c01UL, + 0x6b6b51f4UL, 0x1c6c6162UL, 0x856530d8UL, 0xf262004eUL, 0x6c0695edUL, + 0x1b01a57bUL, 0x8208f4c1UL, 0xf50fc457UL, 0x65b0d9c6UL, 0x12b7e950UL, + 0x8bbeb8eaUL, 0xfcb9887cUL, 0x62dd1ddfUL, 0x15da2d49UL, 0x8cd37cf3UL, + 0xfbd44c65UL, 0x4db26158UL, 0x3ab551ceUL, 0xa3bc0074UL, 0xd4bb30e2UL, + 0x4adfa541UL, 0x3dd895d7UL, 0xa4d1c46dUL, 0xd3d6f4fbUL, 0x4369e96aUL, + 0x346ed9fcUL, 0xad678846UL, 0xda60b8d0UL, 0x44042d73UL, 0x33031de5UL, + 0xaa0a4c5fUL, 0xdd0d7cc9UL, 0x5005713cUL, 0x270241aaUL, 0xbe0b1010UL, + 0xc90c2086UL, 0x5768b525UL, 0x206f85b3UL, 0xb966d409UL, 0xce61e49fUL, + 0x5edef90eUL, 0x29d9c998UL, 0xb0d09822UL, 0xc7d7a8b4UL, 0x59b33d17UL, + 0x2eb40d81UL, 0xb7bd5c3bUL, 0xc0ba6cadUL, 0xedb88320UL, 0x9abfb3b6UL, + 0x03b6e20cUL, 0x74b1d29aUL, 0xead54739UL, 0x9dd277afUL, 0x04db2615UL, + 0x73dc1683UL, 0xe3630b12UL, 0x94643b84UL, 0x0d6d6a3eUL, 0x7a6a5aa8UL, + 0xe40ecf0bUL, 0x9309ff9dUL, 0x0a00ae27UL, 0x7d079eb1UL, 0xf00f9344UL, + 0x8708a3d2UL, 0x1e01f268UL, 0x6906c2feUL, 0xf762575dUL, 0x806567cbUL, + 0x196c3671UL, 0x6e6b06e7UL, 0xfed41b76UL, 0x89d32be0UL, 0x10da7a5aUL, + 0x67dd4accUL, 0xf9b9df6fUL, 0x8ebeeff9UL, 0x17b7be43UL, 0x60b08ed5UL, + 0xd6d6a3e8UL, 0xa1d1937eUL, 0x38d8c2c4UL, 0x4fdff252UL, 0xd1bb67f1UL, + 0xa6bc5767UL, 0x3fb506ddUL, 
0x48b2364bUL, 0xd80d2bdaUL, 0xaf0a1b4cUL, + 0x36034af6UL, 0x41047a60UL, 0xdf60efc3UL, 0xa867df55UL, 0x316e8eefUL, + 0x4669be79UL, 0xcb61b38cUL, 0xbc66831aUL, 0x256fd2a0UL, 0x5268e236UL, + 0xcc0c7795UL, 0xbb0b4703UL, 0x220216b9UL, 0x5505262fUL, 0xc5ba3bbeUL, + 0xb2bd0b28UL, 0x2bb45a92UL, 0x5cb36a04UL, 0xc2d7ffa7UL, 0xb5d0cf31UL, + 0x2cd99e8bUL, 0x5bdeae1dUL, 0x9b64c2b0UL, 0xec63f226UL, 0x756aa39cUL, + 0x026d930aUL, 0x9c0906a9UL, 0xeb0e363fUL, 0x72076785UL, 0x05005713UL, + 0x95bf4a82UL, 0xe2b87a14UL, 0x7bb12baeUL, 0x0cb61b38UL, 0x92d28e9bUL, + 0xe5d5be0dUL, 0x7cdcefb7UL, 0x0bdbdf21UL, 0x86d3d2d4UL, 0xf1d4e242UL, + 0x68ddb3f8UL, 0x1fda836eUL, 0x81be16cdUL, 0xf6b9265bUL, 0x6fb077e1UL, + 0x18b74777UL, 0x88085ae6UL, 0xff0f6a70UL, 0x66063bcaUL, 0x11010b5cUL, + 0x8f659effUL, 0xf862ae69UL, 0x616bffd3UL, 0x166ccf45UL, 0xa00ae278UL, + 0xd70dd2eeUL, 0x4e048354UL, 0x3903b3c2UL, 0xa7672661UL, 0xd06016f7UL, + 0x4969474dUL, 0x3e6e77dbUL, 0xaed16a4aUL, 0xd9d65adcUL, 0x40df0b66UL, + 0x37d83bf0UL, 0xa9bcae53UL, 0xdebb9ec5UL, 0x47b2cf7fUL, 0x30b5ffe9UL, + 0xbdbdf21cUL, 0xcabac28aUL, 0x53b39330UL, 0x24b4a3a6UL, 0xbad03605UL, + 0xcdd70693UL, 0x54de5729UL, 0x23d967bfUL, 0xb3667a2eUL, 0xc4614ab8UL, + 0x5d681b02UL, 0x2a6f2b94UL, 0xb40bbe37UL, 0xc30c8ea1UL, 0x5a05df1bUL, + 0x2d02ef8dUL +#ifdef BYFOUR + }, + { + 0x00000000UL, 0x191b3141UL, 0x32366282UL, 0x2b2d53c3UL, 0x646cc504UL, + 0x7d77f445UL, 0x565aa786UL, 0x4f4196c7UL, 0xc8d98a08UL, 0xd1c2bb49UL, + 0xfaefe88aUL, 0xe3f4d9cbUL, 0xacb54f0cUL, 0xb5ae7e4dUL, 0x9e832d8eUL, + 0x87981ccfUL, 0x4ac21251UL, 0x53d92310UL, 0x78f470d3UL, 0x61ef4192UL, + 0x2eaed755UL, 0x37b5e614UL, 0x1c98b5d7UL, 0x05838496UL, 0x821b9859UL, + 0x9b00a918UL, 0xb02dfadbUL, 0xa936cb9aUL, 0xe6775d5dUL, 0xff6c6c1cUL, + 0xd4413fdfUL, 0xcd5a0e9eUL, 0x958424a2UL, 0x8c9f15e3UL, 0xa7b24620UL, + 0xbea97761UL, 0xf1e8e1a6UL, 0xe8f3d0e7UL, 0xc3de8324UL, 0xdac5b265UL, + 0x5d5daeaaUL, 0x44469febUL, 0x6f6bcc28UL, 0x7670fd69UL, 0x39316baeUL, + 0x202a5aefUL, 0x0b07092cUL, 0x121c386dUL, 
0xdf4636f3UL, 0xc65d07b2UL, + 0xed705471UL, 0xf46b6530UL, 0xbb2af3f7UL, 0xa231c2b6UL, 0x891c9175UL, + 0x9007a034UL, 0x179fbcfbUL, 0x0e848dbaUL, 0x25a9de79UL, 0x3cb2ef38UL, + 0x73f379ffUL, 0x6ae848beUL, 0x41c51b7dUL, 0x58de2a3cUL, 0xf0794f05UL, + 0xe9627e44UL, 0xc24f2d87UL, 0xdb541cc6UL, 0x94158a01UL, 0x8d0ebb40UL, + 0xa623e883UL, 0xbf38d9c2UL, 0x38a0c50dUL, 0x21bbf44cUL, 0x0a96a78fUL, + 0x138d96ceUL, 0x5ccc0009UL, 0x45d73148UL, 0x6efa628bUL, 0x77e153caUL, + 0xbabb5d54UL, 0xa3a06c15UL, 0x888d3fd6UL, 0x91960e97UL, 0xded79850UL, + 0xc7cca911UL, 0xece1fad2UL, 0xf5facb93UL, 0x7262d75cUL, 0x6b79e61dUL, + 0x4054b5deUL, 0x594f849fUL, 0x160e1258UL, 0x0f152319UL, 0x243870daUL, + 0x3d23419bUL, 0x65fd6ba7UL, 0x7ce65ae6UL, 0x57cb0925UL, 0x4ed03864UL, + 0x0191aea3UL, 0x188a9fe2UL, 0x33a7cc21UL, 0x2abcfd60UL, 0xad24e1afUL, + 0xb43fd0eeUL, 0x9f12832dUL, 0x8609b26cUL, 0xc94824abUL, 0xd05315eaUL, + 0xfb7e4629UL, 0xe2657768UL, 0x2f3f79f6UL, 0x362448b7UL, 0x1d091b74UL, + 0x04122a35UL, 0x4b53bcf2UL, 0x52488db3UL, 0x7965de70UL, 0x607eef31UL, + 0xe7e6f3feUL, 0xfefdc2bfUL, 0xd5d0917cUL, 0xcccba03dUL, 0x838a36faUL, + 0x9a9107bbUL, 0xb1bc5478UL, 0xa8a76539UL, 0x3b83984bUL, 0x2298a90aUL, + 0x09b5fac9UL, 0x10aecb88UL, 0x5fef5d4fUL, 0x46f46c0eUL, 0x6dd93fcdUL, + 0x74c20e8cUL, 0xf35a1243UL, 0xea412302UL, 0xc16c70c1UL, 0xd8774180UL, + 0x9736d747UL, 0x8e2de606UL, 0xa500b5c5UL, 0xbc1b8484UL, 0x71418a1aUL, + 0x685abb5bUL, 0x4377e898UL, 0x5a6cd9d9UL, 0x152d4f1eUL, 0x0c367e5fUL, + 0x271b2d9cUL, 0x3e001cddUL, 0xb9980012UL, 0xa0833153UL, 0x8bae6290UL, + 0x92b553d1UL, 0xddf4c516UL, 0xc4eff457UL, 0xefc2a794UL, 0xf6d996d5UL, + 0xae07bce9UL, 0xb71c8da8UL, 0x9c31de6bUL, 0x852aef2aUL, 0xca6b79edUL, + 0xd37048acUL, 0xf85d1b6fUL, 0xe1462a2eUL, 0x66de36e1UL, 0x7fc507a0UL, + 0x54e85463UL, 0x4df36522UL, 0x02b2f3e5UL, 0x1ba9c2a4UL, 0x30849167UL, + 0x299fa026UL, 0xe4c5aeb8UL, 0xfdde9ff9UL, 0xd6f3cc3aUL, 0xcfe8fd7bUL, + 0x80a96bbcUL, 0x99b25afdUL, 0xb29f093eUL, 0xab84387fUL, 0x2c1c24b0UL, + 0x350715f1UL, 
0x1e2a4632UL, 0x07317773UL, 0x4870e1b4UL, 0x516bd0f5UL, + 0x7a468336UL, 0x635db277UL, 0xcbfad74eUL, 0xd2e1e60fUL, 0xf9ccb5ccUL, + 0xe0d7848dUL, 0xaf96124aUL, 0xb68d230bUL, 0x9da070c8UL, 0x84bb4189UL, + 0x03235d46UL, 0x1a386c07UL, 0x31153fc4UL, 0x280e0e85UL, 0x674f9842UL, + 0x7e54a903UL, 0x5579fac0UL, 0x4c62cb81UL, 0x8138c51fUL, 0x9823f45eUL, + 0xb30ea79dUL, 0xaa1596dcUL, 0xe554001bUL, 0xfc4f315aUL, 0xd7626299UL, + 0xce7953d8UL, 0x49e14f17UL, 0x50fa7e56UL, 0x7bd72d95UL, 0x62cc1cd4UL, + 0x2d8d8a13UL, 0x3496bb52UL, 0x1fbbe891UL, 0x06a0d9d0UL, 0x5e7ef3ecUL, + 0x4765c2adUL, 0x6c48916eUL, 0x7553a02fUL, 0x3a1236e8UL, 0x230907a9UL, + 0x0824546aUL, 0x113f652bUL, 0x96a779e4UL, 0x8fbc48a5UL, 0xa4911b66UL, + 0xbd8a2a27UL, 0xf2cbbce0UL, 0xebd08da1UL, 0xc0fdde62UL, 0xd9e6ef23UL, + 0x14bce1bdUL, 0x0da7d0fcUL, 0x268a833fUL, 0x3f91b27eUL, 0x70d024b9UL, + 0x69cb15f8UL, 0x42e6463bUL, 0x5bfd777aUL, 0xdc656bb5UL, 0xc57e5af4UL, + 0xee530937UL, 0xf7483876UL, 0xb809aeb1UL, 0xa1129ff0UL, 0x8a3fcc33UL, + 0x9324fd72UL + }, + { + 0x00000000UL, 0x01c26a37UL, 0x0384d46eUL, 0x0246be59UL, 0x0709a8dcUL, + 0x06cbc2ebUL, 0x048d7cb2UL, 0x054f1685UL, 0x0e1351b8UL, 0x0fd13b8fUL, + 0x0d9785d6UL, 0x0c55efe1UL, 0x091af964UL, 0x08d89353UL, 0x0a9e2d0aUL, + 0x0b5c473dUL, 0x1c26a370UL, 0x1de4c947UL, 0x1fa2771eUL, 0x1e601d29UL, + 0x1b2f0bacUL, 0x1aed619bUL, 0x18abdfc2UL, 0x1969b5f5UL, 0x1235f2c8UL, + 0x13f798ffUL, 0x11b126a6UL, 0x10734c91UL, 0x153c5a14UL, 0x14fe3023UL, + 0x16b88e7aUL, 0x177ae44dUL, 0x384d46e0UL, 0x398f2cd7UL, 0x3bc9928eUL, + 0x3a0bf8b9UL, 0x3f44ee3cUL, 0x3e86840bUL, 0x3cc03a52UL, 0x3d025065UL, + 0x365e1758UL, 0x379c7d6fUL, 0x35dac336UL, 0x3418a901UL, 0x3157bf84UL, + 0x3095d5b3UL, 0x32d36beaUL, 0x331101ddUL, 0x246be590UL, 0x25a98fa7UL, + 0x27ef31feUL, 0x262d5bc9UL, 0x23624d4cUL, 0x22a0277bUL, 0x20e69922UL, + 0x2124f315UL, 0x2a78b428UL, 0x2bbade1fUL, 0x29fc6046UL, 0x283e0a71UL, + 0x2d711cf4UL, 0x2cb376c3UL, 0x2ef5c89aUL, 0x2f37a2adUL, 0x709a8dc0UL, + 0x7158e7f7UL, 0x731e59aeUL, 0x72dc3399UL, 
0x7793251cUL, 0x76514f2bUL, + 0x7417f172UL, 0x75d59b45UL, 0x7e89dc78UL, 0x7f4bb64fUL, 0x7d0d0816UL, + 0x7ccf6221UL, 0x798074a4UL, 0x78421e93UL, 0x7a04a0caUL, 0x7bc6cafdUL, + 0x6cbc2eb0UL, 0x6d7e4487UL, 0x6f38fadeUL, 0x6efa90e9UL, 0x6bb5866cUL, + 0x6a77ec5bUL, 0x68315202UL, 0x69f33835UL, 0x62af7f08UL, 0x636d153fUL, + 0x612bab66UL, 0x60e9c151UL, 0x65a6d7d4UL, 0x6464bde3UL, 0x662203baUL, + 0x67e0698dUL, 0x48d7cb20UL, 0x4915a117UL, 0x4b531f4eUL, 0x4a917579UL, + 0x4fde63fcUL, 0x4e1c09cbUL, 0x4c5ab792UL, 0x4d98dda5UL, 0x46c49a98UL, + 0x4706f0afUL, 0x45404ef6UL, 0x448224c1UL, 0x41cd3244UL, 0x400f5873UL, + 0x4249e62aUL, 0x438b8c1dUL, 0x54f16850UL, 0x55330267UL, 0x5775bc3eUL, + 0x56b7d609UL, 0x53f8c08cUL, 0x523aaabbUL, 0x507c14e2UL, 0x51be7ed5UL, + 0x5ae239e8UL, 0x5b2053dfUL, 0x5966ed86UL, 0x58a487b1UL, 0x5deb9134UL, + 0x5c29fb03UL, 0x5e6f455aUL, 0x5fad2f6dUL, 0xe1351b80UL, 0xe0f771b7UL, + 0xe2b1cfeeUL, 0xe373a5d9UL, 0xe63cb35cUL, 0xe7fed96bUL, 0xe5b86732UL, + 0xe47a0d05UL, 0xef264a38UL, 0xeee4200fUL, 0xeca29e56UL, 0xed60f461UL, + 0xe82fe2e4UL, 0xe9ed88d3UL, 0xebab368aUL, 0xea695cbdUL, 0xfd13b8f0UL, + 0xfcd1d2c7UL, 0xfe976c9eUL, 0xff5506a9UL, 0xfa1a102cUL, 0xfbd87a1bUL, + 0xf99ec442UL, 0xf85cae75UL, 0xf300e948UL, 0xf2c2837fUL, 0xf0843d26UL, + 0xf1465711UL, 0xf4094194UL, 0xf5cb2ba3UL, 0xf78d95faUL, 0xf64fffcdUL, + 0xd9785d60UL, 0xd8ba3757UL, 0xdafc890eUL, 0xdb3ee339UL, 0xde71f5bcUL, + 0xdfb39f8bUL, 0xddf521d2UL, 0xdc374be5UL, 0xd76b0cd8UL, 0xd6a966efUL, + 0xd4efd8b6UL, 0xd52db281UL, 0xd062a404UL, 0xd1a0ce33UL, 0xd3e6706aUL, + 0xd2241a5dUL, 0xc55efe10UL, 0xc49c9427UL, 0xc6da2a7eUL, 0xc7184049UL, + 0xc25756ccUL, 0xc3953cfbUL, 0xc1d382a2UL, 0xc011e895UL, 0xcb4dafa8UL, + 0xca8fc59fUL, 0xc8c97bc6UL, 0xc90b11f1UL, 0xcc440774UL, 0xcd866d43UL, + 0xcfc0d31aUL, 0xce02b92dUL, 0x91af9640UL, 0x906dfc77UL, 0x922b422eUL, + 0x93e92819UL, 0x96a63e9cUL, 0x976454abUL, 0x9522eaf2UL, 0x94e080c5UL, + 0x9fbcc7f8UL, 0x9e7eadcfUL, 0x9c381396UL, 0x9dfa79a1UL, 0x98b56f24UL, + 0x99770513UL, 
0x9b31bb4aUL, 0x9af3d17dUL, 0x8d893530UL, 0x8c4b5f07UL, + 0x8e0de15eUL, 0x8fcf8b69UL, 0x8a809decUL, 0x8b42f7dbUL, 0x89044982UL, + 0x88c623b5UL, 0x839a6488UL, 0x82580ebfUL, 0x801eb0e6UL, 0x81dcdad1UL, + 0x8493cc54UL, 0x8551a663UL, 0x8717183aUL, 0x86d5720dUL, 0xa9e2d0a0UL, + 0xa820ba97UL, 0xaa6604ceUL, 0xaba46ef9UL, 0xaeeb787cUL, 0xaf29124bUL, + 0xad6fac12UL, 0xacadc625UL, 0xa7f18118UL, 0xa633eb2fUL, 0xa4755576UL, + 0xa5b73f41UL, 0xa0f829c4UL, 0xa13a43f3UL, 0xa37cfdaaUL, 0xa2be979dUL, + 0xb5c473d0UL, 0xb40619e7UL, 0xb640a7beUL, 0xb782cd89UL, 0xb2cddb0cUL, + 0xb30fb13bUL, 0xb1490f62UL, 0xb08b6555UL, 0xbbd72268UL, 0xba15485fUL, + 0xb853f606UL, 0xb9919c31UL, 0xbcde8ab4UL, 0xbd1ce083UL, 0xbf5a5edaUL, + 0xbe9834edUL + }, + { + 0x00000000UL, 0xb8bc6765UL, 0xaa09c88bUL, 0x12b5afeeUL, 0x8f629757UL, + 0x37def032UL, 0x256b5fdcUL, 0x9dd738b9UL, 0xc5b428efUL, 0x7d084f8aUL, + 0x6fbde064UL, 0xd7018701UL, 0x4ad6bfb8UL, 0xf26ad8ddUL, 0xe0df7733UL, + 0x58631056UL, 0x5019579fUL, 0xe8a530faUL, 0xfa109f14UL, 0x42acf871UL, + 0xdf7bc0c8UL, 0x67c7a7adUL, 0x75720843UL, 0xcdce6f26UL, 0x95ad7f70UL, + 0x2d111815UL, 0x3fa4b7fbUL, 0x8718d09eUL, 0x1acfe827UL, 0xa2738f42UL, + 0xb0c620acUL, 0x087a47c9UL, 0xa032af3eUL, 0x188ec85bUL, 0x0a3b67b5UL, + 0xb28700d0UL, 0x2f503869UL, 0x97ec5f0cUL, 0x8559f0e2UL, 0x3de59787UL, + 0x658687d1UL, 0xdd3ae0b4UL, 0xcf8f4f5aUL, 0x7733283fUL, 0xeae41086UL, + 0x525877e3UL, 0x40edd80dUL, 0xf851bf68UL, 0xf02bf8a1UL, 0x48979fc4UL, + 0x5a22302aUL, 0xe29e574fUL, 0x7f496ff6UL, 0xc7f50893UL, 0xd540a77dUL, + 0x6dfcc018UL, 0x359fd04eUL, 0x8d23b72bUL, 0x9f9618c5UL, 0x272a7fa0UL, + 0xbafd4719UL, 0x0241207cUL, 0x10f48f92UL, 0xa848e8f7UL, 0x9b14583dUL, + 0x23a83f58UL, 0x311d90b6UL, 0x89a1f7d3UL, 0x1476cf6aUL, 0xaccaa80fUL, + 0xbe7f07e1UL, 0x06c36084UL, 0x5ea070d2UL, 0xe61c17b7UL, 0xf4a9b859UL, + 0x4c15df3cUL, 0xd1c2e785UL, 0x697e80e0UL, 0x7bcb2f0eUL, 0xc377486bUL, + 0xcb0d0fa2UL, 0x73b168c7UL, 0x6104c729UL, 0xd9b8a04cUL, 0x446f98f5UL, + 0xfcd3ff90UL, 0xee66507eUL, 0x56da371bUL, 
0x0eb9274dUL, 0xb6054028UL, + 0xa4b0efc6UL, 0x1c0c88a3UL, 0x81dbb01aUL, 0x3967d77fUL, 0x2bd27891UL, + 0x936e1ff4UL, 0x3b26f703UL, 0x839a9066UL, 0x912f3f88UL, 0x299358edUL, + 0xb4446054UL, 0x0cf80731UL, 0x1e4da8dfUL, 0xa6f1cfbaUL, 0xfe92dfecUL, + 0x462eb889UL, 0x549b1767UL, 0xec277002UL, 0x71f048bbUL, 0xc94c2fdeUL, + 0xdbf98030UL, 0x6345e755UL, 0x6b3fa09cUL, 0xd383c7f9UL, 0xc1366817UL, + 0x798a0f72UL, 0xe45d37cbUL, 0x5ce150aeUL, 0x4e54ff40UL, 0xf6e89825UL, + 0xae8b8873UL, 0x1637ef16UL, 0x048240f8UL, 0xbc3e279dUL, 0x21e91f24UL, + 0x99557841UL, 0x8be0d7afUL, 0x335cb0caUL, 0xed59b63bUL, 0x55e5d15eUL, + 0x47507eb0UL, 0xffec19d5UL, 0x623b216cUL, 0xda874609UL, 0xc832e9e7UL, + 0x708e8e82UL, 0x28ed9ed4UL, 0x9051f9b1UL, 0x82e4565fUL, 0x3a58313aUL, + 0xa78f0983UL, 0x1f336ee6UL, 0x0d86c108UL, 0xb53aa66dUL, 0xbd40e1a4UL, + 0x05fc86c1UL, 0x1749292fUL, 0xaff54e4aUL, 0x322276f3UL, 0x8a9e1196UL, + 0x982bbe78UL, 0x2097d91dUL, 0x78f4c94bUL, 0xc048ae2eUL, 0xd2fd01c0UL, + 0x6a4166a5UL, 0xf7965e1cUL, 0x4f2a3979UL, 0x5d9f9697UL, 0xe523f1f2UL, + 0x4d6b1905UL, 0xf5d77e60UL, 0xe762d18eUL, 0x5fdeb6ebUL, 0xc2098e52UL, + 0x7ab5e937UL, 0x680046d9UL, 0xd0bc21bcUL, 0x88df31eaUL, 0x3063568fUL, + 0x22d6f961UL, 0x9a6a9e04UL, 0x07bda6bdUL, 0xbf01c1d8UL, 0xadb46e36UL, + 0x15080953UL, 0x1d724e9aUL, 0xa5ce29ffUL, 0xb77b8611UL, 0x0fc7e174UL, + 0x9210d9cdUL, 0x2aacbea8UL, 0x38191146UL, 0x80a57623UL, 0xd8c66675UL, + 0x607a0110UL, 0x72cfaefeUL, 0xca73c99bUL, 0x57a4f122UL, 0xef189647UL, + 0xfdad39a9UL, 0x45115eccUL, 0x764dee06UL, 0xcef18963UL, 0xdc44268dUL, + 0x64f841e8UL, 0xf92f7951UL, 0x41931e34UL, 0x5326b1daUL, 0xeb9ad6bfUL, + 0xb3f9c6e9UL, 0x0b45a18cUL, 0x19f00e62UL, 0xa14c6907UL, 0x3c9b51beUL, + 0x842736dbUL, 0x96929935UL, 0x2e2efe50UL, 0x2654b999UL, 0x9ee8defcUL, + 0x8c5d7112UL, 0x34e11677UL, 0xa9362eceUL, 0x118a49abUL, 0x033fe645UL, + 0xbb838120UL, 0xe3e09176UL, 0x5b5cf613UL, 0x49e959fdUL, 0xf1553e98UL, + 0x6c820621UL, 0xd43e6144UL, 0xc68bceaaUL, 0x7e37a9cfUL, 0xd67f4138UL, + 0x6ec3265dUL, 
0x7c7689b3UL, 0xc4caeed6UL, 0x591dd66fUL, 0xe1a1b10aUL, + 0xf3141ee4UL, 0x4ba87981UL, 0x13cb69d7UL, 0xab770eb2UL, 0xb9c2a15cUL, + 0x017ec639UL, 0x9ca9fe80UL, 0x241599e5UL, 0x36a0360bUL, 0x8e1c516eUL, + 0x866616a7UL, 0x3eda71c2UL, 0x2c6fde2cUL, 0x94d3b949UL, 0x090481f0UL, + 0xb1b8e695UL, 0xa30d497bUL, 0x1bb12e1eUL, 0x43d23e48UL, 0xfb6e592dUL, + 0xe9dbf6c3UL, 0x516791a6UL, 0xccb0a91fUL, 0x740cce7aUL, 0x66b96194UL, + 0xde0506f1UL + }, + { + 0x00000000UL, 0x96300777UL, 0x2c610eeeUL, 0xba510999UL, 0x19c46d07UL, + 0x8ff46a70UL, 0x35a563e9UL, 0xa395649eUL, 0x3288db0eUL, 0xa4b8dc79UL, + 0x1ee9d5e0UL, 0x88d9d297UL, 0x2b4cb609UL, 0xbd7cb17eUL, 0x072db8e7UL, + 0x911dbf90UL, 0x6410b71dUL, 0xf220b06aUL, 0x4871b9f3UL, 0xde41be84UL, + 0x7dd4da1aUL, 0xebe4dd6dUL, 0x51b5d4f4UL, 0xc785d383UL, 0x56986c13UL, + 0xc0a86b64UL, 0x7af962fdUL, 0xecc9658aUL, 0x4f5c0114UL, 0xd96c0663UL, + 0x633d0ffaUL, 0xf50d088dUL, 0xc8206e3bUL, 0x5e10694cUL, 0xe44160d5UL, + 0x727167a2UL, 0xd1e4033cUL, 0x47d4044bUL, 0xfd850dd2UL, 0x6bb50aa5UL, + 0xfaa8b535UL, 0x6c98b242UL, 0xd6c9bbdbUL, 0x40f9bcacUL, 0xe36cd832UL, + 0x755cdf45UL, 0xcf0dd6dcUL, 0x593dd1abUL, 0xac30d926UL, 0x3a00de51UL, + 0x8051d7c8UL, 0x1661d0bfUL, 0xb5f4b421UL, 0x23c4b356UL, 0x9995bacfUL, + 0x0fa5bdb8UL, 0x9eb80228UL, 0x0888055fUL, 0xb2d90cc6UL, 0x24e90bb1UL, + 0x877c6f2fUL, 0x114c6858UL, 0xab1d61c1UL, 0x3d2d66b6UL, 0x9041dc76UL, + 0x0671db01UL, 0xbc20d298UL, 0x2a10d5efUL, 0x8985b171UL, 0x1fb5b606UL, + 0xa5e4bf9fUL, 0x33d4b8e8UL, 0xa2c90778UL, 0x34f9000fUL, 0x8ea80996UL, + 0x18980ee1UL, 0xbb0d6a7fUL, 0x2d3d6d08UL, 0x976c6491UL, 0x015c63e6UL, + 0xf4516b6bUL, 0x62616c1cUL, 0xd8306585UL, 0x4e0062f2UL, 0xed95066cUL, + 0x7ba5011bUL, 0xc1f40882UL, 0x57c40ff5UL, 0xc6d9b065UL, 0x50e9b712UL, + 0xeab8be8bUL, 0x7c88b9fcUL, 0xdf1ddd62UL, 0x492dda15UL, 0xf37cd38cUL, + 0x654cd4fbUL, 0x5861b24dUL, 0xce51b53aUL, 0x7400bca3UL, 0xe230bbd4UL, + 0x41a5df4aUL, 0xd795d83dUL, 0x6dc4d1a4UL, 0xfbf4d6d3UL, 0x6ae96943UL, + 0xfcd96e34UL, 0x468867adUL, 0xd0b860daUL, 
0x732d0444UL, 0xe51d0333UL, + 0x5f4c0aaaUL, 0xc97c0dddUL, 0x3c710550UL, 0xaa410227UL, 0x10100bbeUL, + 0x86200cc9UL, 0x25b56857UL, 0xb3856f20UL, 0x09d466b9UL, 0x9fe461ceUL, + 0x0ef9de5eUL, 0x98c9d929UL, 0x2298d0b0UL, 0xb4a8d7c7UL, 0x173db359UL, + 0x810db42eUL, 0x3b5cbdb7UL, 0xad6cbac0UL, 0x2083b8edUL, 0xb6b3bf9aUL, + 0x0ce2b603UL, 0x9ad2b174UL, 0x3947d5eaUL, 0xaf77d29dUL, 0x1526db04UL, + 0x8316dc73UL, 0x120b63e3UL, 0x843b6494UL, 0x3e6a6d0dUL, 0xa85a6a7aUL, + 0x0bcf0ee4UL, 0x9dff0993UL, 0x27ae000aUL, 0xb19e077dUL, 0x44930ff0UL, + 0xd2a30887UL, 0x68f2011eUL, 0xfec20669UL, 0x5d5762f7UL, 0xcb676580UL, + 0x71366c19UL, 0xe7066b6eUL, 0x761bd4feUL, 0xe02bd389UL, 0x5a7ada10UL, + 0xcc4add67UL, 0x6fdfb9f9UL, 0xf9efbe8eUL, 0x43beb717UL, 0xd58eb060UL, + 0xe8a3d6d6UL, 0x7e93d1a1UL, 0xc4c2d838UL, 0x52f2df4fUL, 0xf167bbd1UL, + 0x6757bca6UL, 0xdd06b53fUL, 0x4b36b248UL, 0xda2b0dd8UL, 0x4c1b0aafUL, + 0xf64a0336UL, 0x607a0441UL, 0xc3ef60dfUL, 0x55df67a8UL, 0xef8e6e31UL, + 0x79be6946UL, 0x8cb361cbUL, 0x1a8366bcUL, 0xa0d26f25UL, 0x36e26852UL, + 0x95770cccUL, 0x03470bbbUL, 0xb9160222UL, 0x2f260555UL, 0xbe3bbac5UL, + 0x280bbdb2UL, 0x925ab42bUL, 0x046ab35cUL, 0xa7ffd7c2UL, 0x31cfd0b5UL, + 0x8b9ed92cUL, 0x1daede5bUL, 0xb0c2649bUL, 0x26f263ecUL, 0x9ca36a75UL, + 0x0a936d02UL, 0xa906099cUL, 0x3f360eebUL, 0x85670772UL, 0x13570005UL, + 0x824abf95UL, 0x147ab8e2UL, 0xae2bb17bUL, 0x381bb60cUL, 0x9b8ed292UL, + 0x0dbed5e5UL, 0xb7efdc7cUL, 0x21dfdb0bUL, 0xd4d2d386UL, 0x42e2d4f1UL, + 0xf8b3dd68UL, 0x6e83da1fUL, 0xcd16be81UL, 0x5b26b9f6UL, 0xe177b06fUL, + 0x7747b718UL, 0xe65a0888UL, 0x706a0fffUL, 0xca3b0666UL, 0x5c0b0111UL, + 0xff9e658fUL, 0x69ae62f8UL, 0xd3ff6b61UL, 0x45cf6c16UL, 0x78e20aa0UL, + 0xeed20dd7UL, 0x5483044eUL, 0xc2b30339UL, 0x612667a7UL, 0xf71660d0UL, + 0x4d476949UL, 0xdb776e3eUL, 0x4a6ad1aeUL, 0xdc5ad6d9UL, 0x660bdf40UL, + 0xf03bd837UL, 0x53aebca9UL, 0xc59ebbdeUL, 0x7fcfb247UL, 0xe9ffb530UL, + 0x1cf2bdbdUL, 0x8ac2bacaUL, 0x3093b353UL, 0xa6a3b424UL, 0x0536d0baUL, + 0x9306d7cdUL, 
0x2957de54UL, 0xbf67d923UL, 0x2e7a66b3UL, 0xb84a61c4UL, + 0x021b685dUL, 0x942b6f2aUL, 0x37be0bb4UL, 0xa18e0cc3UL, 0x1bdf055aUL, + 0x8def022dUL + }, + { + 0x00000000UL, 0x41311b19UL, 0x82623632UL, 0xc3532d2bUL, 0x04c56c64UL, + 0x45f4777dUL, 0x86a75a56UL, 0xc796414fUL, 0x088ad9c8UL, 0x49bbc2d1UL, + 0x8ae8effaUL, 0xcbd9f4e3UL, 0x0c4fb5acUL, 0x4d7eaeb5UL, 0x8e2d839eUL, + 0xcf1c9887UL, 0x5112c24aUL, 0x1023d953UL, 0xd370f478UL, 0x9241ef61UL, + 0x55d7ae2eUL, 0x14e6b537UL, 0xd7b5981cUL, 0x96848305UL, 0x59981b82UL, + 0x18a9009bUL, 0xdbfa2db0UL, 0x9acb36a9UL, 0x5d5d77e6UL, 0x1c6c6cffUL, + 0xdf3f41d4UL, 0x9e0e5acdUL, 0xa2248495UL, 0xe3159f8cUL, 0x2046b2a7UL, + 0x6177a9beUL, 0xa6e1e8f1UL, 0xe7d0f3e8UL, 0x2483dec3UL, 0x65b2c5daUL, + 0xaaae5d5dUL, 0xeb9f4644UL, 0x28cc6b6fUL, 0x69fd7076UL, 0xae6b3139UL, + 0xef5a2a20UL, 0x2c09070bUL, 0x6d381c12UL, 0xf33646dfUL, 0xb2075dc6UL, + 0x715470edUL, 0x30656bf4UL, 0xf7f32abbUL, 0xb6c231a2UL, 0x75911c89UL, + 0x34a00790UL, 0xfbbc9f17UL, 0xba8d840eUL, 0x79dea925UL, 0x38efb23cUL, + 0xff79f373UL, 0xbe48e86aUL, 0x7d1bc541UL, 0x3c2ade58UL, 0x054f79f0UL, + 0x447e62e9UL, 0x872d4fc2UL, 0xc61c54dbUL, 0x018a1594UL, 0x40bb0e8dUL, + 0x83e823a6UL, 0xc2d938bfUL, 0x0dc5a038UL, 0x4cf4bb21UL, 0x8fa7960aUL, + 0xce968d13UL, 0x0900cc5cUL, 0x4831d745UL, 0x8b62fa6eUL, 0xca53e177UL, + 0x545dbbbaUL, 0x156ca0a3UL, 0xd63f8d88UL, 0x970e9691UL, 0x5098d7deUL, + 0x11a9ccc7UL, 0xd2fae1ecUL, 0x93cbfaf5UL, 0x5cd76272UL, 0x1de6796bUL, + 0xdeb55440UL, 0x9f844f59UL, 0x58120e16UL, 0x1923150fUL, 0xda703824UL, + 0x9b41233dUL, 0xa76bfd65UL, 0xe65ae67cUL, 0x2509cb57UL, 0x6438d04eUL, + 0xa3ae9101UL, 0xe29f8a18UL, 0x21cca733UL, 0x60fdbc2aUL, 0xafe124adUL, + 0xeed03fb4UL, 0x2d83129fUL, 0x6cb20986UL, 0xab2448c9UL, 0xea1553d0UL, + 0x29467efbUL, 0x687765e2UL, 0xf6793f2fUL, 0xb7482436UL, 0x741b091dUL, + 0x352a1204UL, 0xf2bc534bUL, 0xb38d4852UL, 0x70de6579UL, 0x31ef7e60UL, + 0xfef3e6e7UL, 0xbfc2fdfeUL, 0x7c91d0d5UL, 0x3da0cbccUL, 0xfa368a83UL, + 0xbb07919aUL, 0x7854bcb1UL, 0x3965a7a8UL, 
0x4b98833bUL, 0x0aa99822UL, + 0xc9fab509UL, 0x88cbae10UL, 0x4f5def5fUL, 0x0e6cf446UL, 0xcd3fd96dUL, + 0x8c0ec274UL, 0x43125af3UL, 0x022341eaUL, 0xc1706cc1UL, 0x804177d8UL, + 0x47d73697UL, 0x06e62d8eUL, 0xc5b500a5UL, 0x84841bbcUL, 0x1a8a4171UL, + 0x5bbb5a68UL, 0x98e87743UL, 0xd9d96c5aUL, 0x1e4f2d15UL, 0x5f7e360cUL, + 0x9c2d1b27UL, 0xdd1c003eUL, 0x120098b9UL, 0x533183a0UL, 0x9062ae8bUL, + 0xd153b592UL, 0x16c5f4ddUL, 0x57f4efc4UL, 0x94a7c2efUL, 0xd596d9f6UL, + 0xe9bc07aeUL, 0xa88d1cb7UL, 0x6bde319cUL, 0x2aef2a85UL, 0xed796bcaUL, + 0xac4870d3UL, 0x6f1b5df8UL, 0x2e2a46e1UL, 0xe136de66UL, 0xa007c57fUL, + 0x6354e854UL, 0x2265f34dUL, 0xe5f3b202UL, 0xa4c2a91bUL, 0x67918430UL, + 0x26a09f29UL, 0xb8aec5e4UL, 0xf99fdefdUL, 0x3accf3d6UL, 0x7bfde8cfUL, + 0xbc6ba980UL, 0xfd5ab299UL, 0x3e099fb2UL, 0x7f3884abUL, 0xb0241c2cUL, + 0xf1150735UL, 0x32462a1eUL, 0x73773107UL, 0xb4e17048UL, 0xf5d06b51UL, + 0x3683467aUL, 0x77b25d63UL, 0x4ed7facbUL, 0x0fe6e1d2UL, 0xccb5ccf9UL, + 0x8d84d7e0UL, 0x4a1296afUL, 0x0b238db6UL, 0xc870a09dUL, 0x8941bb84UL, + 0x465d2303UL, 0x076c381aUL, 0xc43f1531UL, 0x850e0e28UL, 0x42984f67UL, + 0x03a9547eUL, 0xc0fa7955UL, 0x81cb624cUL, 0x1fc53881UL, 0x5ef42398UL, + 0x9da70eb3UL, 0xdc9615aaUL, 0x1b0054e5UL, 0x5a314ffcUL, 0x996262d7UL, + 0xd85379ceUL, 0x174fe149UL, 0x567efa50UL, 0x952dd77bUL, 0xd41ccc62UL, + 0x138a8d2dUL, 0x52bb9634UL, 0x91e8bb1fUL, 0xd0d9a006UL, 0xecf37e5eUL, + 0xadc26547UL, 0x6e91486cUL, 0x2fa05375UL, 0xe836123aUL, 0xa9070923UL, + 0x6a542408UL, 0x2b653f11UL, 0xe479a796UL, 0xa548bc8fUL, 0x661b91a4UL, + 0x272a8abdUL, 0xe0bccbf2UL, 0xa18dd0ebUL, 0x62defdc0UL, 0x23efe6d9UL, + 0xbde1bc14UL, 0xfcd0a70dUL, 0x3f838a26UL, 0x7eb2913fUL, 0xb924d070UL, + 0xf815cb69UL, 0x3b46e642UL, 0x7a77fd5bUL, 0xb56b65dcUL, 0xf45a7ec5UL, + 0x370953eeUL, 0x763848f7UL, 0xb1ae09b8UL, 0xf09f12a1UL, 0x33cc3f8aUL, + 0x72fd2493UL + }, + { + 0x00000000UL, 0x376ac201UL, 0x6ed48403UL, 0x59be4602UL, 0xdca80907UL, + 0xebc2cb06UL, 0xb27c8d04UL, 0x85164f05UL, 0xb851130eUL, 0x8f3bd10fUL, + 
0xd685970dUL, 0xe1ef550cUL, 0x64f91a09UL, 0x5393d808UL, 0x0a2d9e0aUL, + 0x3d475c0bUL, 0x70a3261cUL, 0x47c9e41dUL, 0x1e77a21fUL, 0x291d601eUL, + 0xac0b2f1bUL, 0x9b61ed1aUL, 0xc2dfab18UL, 0xf5b56919UL, 0xc8f23512UL, + 0xff98f713UL, 0xa626b111UL, 0x914c7310UL, 0x145a3c15UL, 0x2330fe14UL, + 0x7a8eb816UL, 0x4de47a17UL, 0xe0464d38UL, 0xd72c8f39UL, 0x8e92c93bUL, + 0xb9f80b3aUL, 0x3cee443fUL, 0x0b84863eUL, 0x523ac03cUL, 0x6550023dUL, + 0x58175e36UL, 0x6f7d9c37UL, 0x36c3da35UL, 0x01a91834UL, 0x84bf5731UL, + 0xb3d59530UL, 0xea6bd332UL, 0xdd011133UL, 0x90e56b24UL, 0xa78fa925UL, + 0xfe31ef27UL, 0xc95b2d26UL, 0x4c4d6223UL, 0x7b27a022UL, 0x2299e620UL, + 0x15f32421UL, 0x28b4782aUL, 0x1fdeba2bUL, 0x4660fc29UL, 0x710a3e28UL, + 0xf41c712dUL, 0xc376b32cUL, 0x9ac8f52eUL, 0xada2372fUL, 0xc08d9a70UL, + 0xf7e75871UL, 0xae591e73UL, 0x9933dc72UL, 0x1c259377UL, 0x2b4f5176UL, + 0x72f11774UL, 0x459bd575UL, 0x78dc897eUL, 0x4fb64b7fUL, 0x16080d7dUL, + 0x2162cf7cUL, 0xa4748079UL, 0x931e4278UL, 0xcaa0047aUL, 0xfdcac67bUL, + 0xb02ebc6cUL, 0x87447e6dUL, 0xdefa386fUL, 0xe990fa6eUL, 0x6c86b56bUL, + 0x5bec776aUL, 0x02523168UL, 0x3538f369UL, 0x087faf62UL, 0x3f156d63UL, + 0x66ab2b61UL, 0x51c1e960UL, 0xd4d7a665UL, 0xe3bd6464UL, 0xba032266UL, + 0x8d69e067UL, 0x20cbd748UL, 0x17a11549UL, 0x4e1f534bUL, 0x7975914aUL, + 0xfc63de4fUL, 0xcb091c4eUL, 0x92b75a4cUL, 0xa5dd984dUL, 0x989ac446UL, + 0xaff00647UL, 0xf64e4045UL, 0xc1248244UL, 0x4432cd41UL, 0x73580f40UL, + 0x2ae64942UL, 0x1d8c8b43UL, 0x5068f154UL, 0x67023355UL, 0x3ebc7557UL, + 0x09d6b756UL, 0x8cc0f853UL, 0xbbaa3a52UL, 0xe2147c50UL, 0xd57ebe51UL, + 0xe839e25aUL, 0xdf53205bUL, 0x86ed6659UL, 0xb187a458UL, 0x3491eb5dUL, + 0x03fb295cUL, 0x5a456f5eUL, 0x6d2fad5fUL, 0x801b35e1UL, 0xb771f7e0UL, + 0xeecfb1e2UL, 0xd9a573e3UL, 0x5cb33ce6UL, 0x6bd9fee7UL, 0x3267b8e5UL, + 0x050d7ae4UL, 0x384a26efUL, 0x0f20e4eeUL, 0x569ea2ecUL, 0x61f460edUL, + 0xe4e22fe8UL, 0xd388ede9UL, 0x8a36abebUL, 0xbd5c69eaUL, 0xf0b813fdUL, + 0xc7d2d1fcUL, 0x9e6c97feUL, 0xa90655ffUL, 0x2c101afaUL, 
0x1b7ad8fbUL, + 0x42c49ef9UL, 0x75ae5cf8UL, 0x48e900f3UL, 0x7f83c2f2UL, 0x263d84f0UL, + 0x115746f1UL, 0x944109f4UL, 0xa32bcbf5UL, 0xfa958df7UL, 0xcdff4ff6UL, + 0x605d78d9UL, 0x5737bad8UL, 0x0e89fcdaUL, 0x39e33edbUL, 0xbcf571deUL, + 0x8b9fb3dfUL, 0xd221f5ddUL, 0xe54b37dcUL, 0xd80c6bd7UL, 0xef66a9d6UL, + 0xb6d8efd4UL, 0x81b22dd5UL, 0x04a462d0UL, 0x33cea0d1UL, 0x6a70e6d3UL, + 0x5d1a24d2UL, 0x10fe5ec5UL, 0x27949cc4UL, 0x7e2adac6UL, 0x494018c7UL, + 0xcc5657c2UL, 0xfb3c95c3UL, 0xa282d3c1UL, 0x95e811c0UL, 0xa8af4dcbUL, + 0x9fc58fcaUL, 0xc67bc9c8UL, 0xf1110bc9UL, 0x740744ccUL, 0x436d86cdUL, + 0x1ad3c0cfUL, 0x2db902ceUL, 0x4096af91UL, 0x77fc6d90UL, 0x2e422b92UL, + 0x1928e993UL, 0x9c3ea696UL, 0xab546497UL, 0xf2ea2295UL, 0xc580e094UL, + 0xf8c7bc9fUL, 0xcfad7e9eUL, 0x9613389cUL, 0xa179fa9dUL, 0x246fb598UL, + 0x13057799UL, 0x4abb319bUL, 0x7dd1f39aUL, 0x3035898dUL, 0x075f4b8cUL, + 0x5ee10d8eUL, 0x698bcf8fUL, 0xec9d808aUL, 0xdbf7428bUL, 0x82490489UL, + 0xb523c688UL, 0x88649a83UL, 0xbf0e5882UL, 0xe6b01e80UL, 0xd1dadc81UL, + 0x54cc9384UL, 0x63a65185UL, 0x3a181787UL, 0x0d72d586UL, 0xa0d0e2a9UL, + 0x97ba20a8UL, 0xce0466aaUL, 0xf96ea4abUL, 0x7c78ebaeUL, 0x4b1229afUL, + 0x12ac6fadUL, 0x25c6adacUL, 0x1881f1a7UL, 0x2feb33a6UL, 0x765575a4UL, + 0x413fb7a5UL, 0xc429f8a0UL, 0xf3433aa1UL, 0xaafd7ca3UL, 0x9d97bea2UL, + 0xd073c4b5UL, 0xe71906b4UL, 0xbea740b6UL, 0x89cd82b7UL, 0x0cdbcdb2UL, + 0x3bb10fb3UL, 0x620f49b1UL, 0x55658bb0UL, 0x6822d7bbUL, 0x5f4815baUL, + 0x06f653b8UL, 0x319c91b9UL, 0xb48adebcUL, 0x83e01cbdUL, 0xda5e5abfUL, + 0xed3498beUL + }, + { + 0x00000000UL, 0x6567bcb8UL, 0x8bc809aaUL, 0xeeafb512UL, 0x5797628fUL, + 0x32f0de37UL, 0xdc5f6b25UL, 0xb938d79dUL, 0xef28b4c5UL, 0x8a4f087dUL, + 0x64e0bd6fUL, 0x018701d7UL, 0xb8bfd64aUL, 0xddd86af2UL, 0x3377dfe0UL, + 0x56106358UL, 0x9f571950UL, 0xfa30a5e8UL, 0x149f10faUL, 0x71f8ac42UL, + 0xc8c07bdfUL, 0xada7c767UL, 0x43087275UL, 0x266fcecdUL, 0x707fad95UL, + 0x1518112dUL, 0xfbb7a43fUL, 0x9ed01887UL, 0x27e8cf1aUL, 0x428f73a2UL, + 0xac20c6b0UL, 
0xc9477a08UL, 0x3eaf32a0UL, 0x5bc88e18UL, 0xb5673b0aUL, + 0xd00087b2UL, 0x6938502fUL, 0x0c5fec97UL, 0xe2f05985UL, 0x8797e53dUL, + 0xd1878665UL, 0xb4e03addUL, 0x5a4f8fcfUL, 0x3f283377UL, 0x8610e4eaUL, + 0xe3775852UL, 0x0dd8ed40UL, 0x68bf51f8UL, 0xa1f82bf0UL, 0xc49f9748UL, + 0x2a30225aUL, 0x4f579ee2UL, 0xf66f497fUL, 0x9308f5c7UL, 0x7da740d5UL, + 0x18c0fc6dUL, 0x4ed09f35UL, 0x2bb7238dUL, 0xc518969fUL, 0xa07f2a27UL, + 0x1947fdbaUL, 0x7c204102UL, 0x928ff410UL, 0xf7e848a8UL, 0x3d58149bUL, + 0x583fa823UL, 0xb6901d31UL, 0xd3f7a189UL, 0x6acf7614UL, 0x0fa8caacUL, + 0xe1077fbeUL, 0x8460c306UL, 0xd270a05eUL, 0xb7171ce6UL, 0x59b8a9f4UL, + 0x3cdf154cUL, 0x85e7c2d1UL, 0xe0807e69UL, 0x0e2fcb7bUL, 0x6b4877c3UL, + 0xa20f0dcbUL, 0xc768b173UL, 0x29c70461UL, 0x4ca0b8d9UL, 0xf5986f44UL, + 0x90ffd3fcUL, 0x7e5066eeUL, 0x1b37da56UL, 0x4d27b90eUL, 0x284005b6UL, + 0xc6efb0a4UL, 0xa3880c1cUL, 0x1ab0db81UL, 0x7fd76739UL, 0x9178d22bUL, + 0xf41f6e93UL, 0x03f7263bUL, 0x66909a83UL, 0x883f2f91UL, 0xed589329UL, + 0x546044b4UL, 0x3107f80cUL, 0xdfa84d1eUL, 0xbacff1a6UL, 0xecdf92feUL, + 0x89b82e46UL, 0x67179b54UL, 0x027027ecUL, 0xbb48f071UL, 0xde2f4cc9UL, + 0x3080f9dbUL, 0x55e74563UL, 0x9ca03f6bUL, 0xf9c783d3UL, 0x176836c1UL, + 0x720f8a79UL, 0xcb375de4UL, 0xae50e15cUL, 0x40ff544eUL, 0x2598e8f6UL, + 0x73888baeUL, 0x16ef3716UL, 0xf8408204UL, 0x9d273ebcUL, 0x241fe921UL, + 0x41785599UL, 0xafd7e08bUL, 0xcab05c33UL, 0x3bb659edUL, 0x5ed1e555UL, + 0xb07e5047UL, 0xd519ecffUL, 0x6c213b62UL, 0x094687daUL, 0xe7e932c8UL, + 0x828e8e70UL, 0xd49eed28UL, 0xb1f95190UL, 0x5f56e482UL, 0x3a31583aUL, + 0x83098fa7UL, 0xe66e331fUL, 0x08c1860dUL, 0x6da63ab5UL, 0xa4e140bdUL, + 0xc186fc05UL, 0x2f294917UL, 0x4a4ef5afUL, 0xf3762232UL, 0x96119e8aUL, + 0x78be2b98UL, 0x1dd99720UL, 0x4bc9f478UL, 0x2eae48c0UL, 0xc001fdd2UL, + 0xa566416aUL, 0x1c5e96f7UL, 0x79392a4fUL, 0x97969f5dUL, 0xf2f123e5UL, + 0x05196b4dUL, 0x607ed7f5UL, 0x8ed162e7UL, 0xebb6de5fUL, 0x528e09c2UL, + 0x37e9b57aUL, 0xd9460068UL, 0xbc21bcd0UL, 0xea31df88UL, 0x8f566330UL, 
+ 0x61f9d622UL, 0x049e6a9aUL, 0xbda6bd07UL, 0xd8c101bfUL, 0x366eb4adUL, + 0x53090815UL, 0x9a4e721dUL, 0xff29cea5UL, 0x11867bb7UL, 0x74e1c70fUL, + 0xcdd91092UL, 0xa8beac2aUL, 0x46111938UL, 0x2376a580UL, 0x7566c6d8UL, + 0x10017a60UL, 0xfeaecf72UL, 0x9bc973caUL, 0x22f1a457UL, 0x479618efUL, + 0xa939adfdUL, 0xcc5e1145UL, 0x06ee4d76UL, 0x6389f1ceUL, 0x8d2644dcUL, + 0xe841f864UL, 0x51792ff9UL, 0x341e9341UL, 0xdab12653UL, 0xbfd69aebUL, + 0xe9c6f9b3UL, 0x8ca1450bUL, 0x620ef019UL, 0x07694ca1UL, 0xbe519b3cUL, + 0xdb362784UL, 0x35999296UL, 0x50fe2e2eUL, 0x99b95426UL, 0xfcdee89eUL, + 0x12715d8cUL, 0x7716e134UL, 0xce2e36a9UL, 0xab498a11UL, 0x45e63f03UL, + 0x208183bbUL, 0x7691e0e3UL, 0x13f65c5bUL, 0xfd59e949UL, 0x983e55f1UL, + 0x2106826cUL, 0x44613ed4UL, 0xaace8bc6UL, 0xcfa9377eUL, 0x38417fd6UL, + 0x5d26c36eUL, 0xb389767cUL, 0xd6eecac4UL, 0x6fd61d59UL, 0x0ab1a1e1UL, + 0xe41e14f3UL, 0x8179a84bUL, 0xd769cb13UL, 0xb20e77abUL, 0x5ca1c2b9UL, + 0x39c67e01UL, 0x80fea99cUL, 0xe5991524UL, 0x0b36a036UL, 0x6e511c8eUL, + 0xa7166686UL, 0xc271da3eUL, 0x2cde6f2cUL, 0x49b9d394UL, 0xf0810409UL, + 0x95e6b8b1UL, 0x7b490da3UL, 0x1e2eb11bUL, 0x483ed243UL, 0x2d596efbUL, + 0xc3f6dbe9UL, 0xa6916751UL, 0x1fa9b0ccUL, 0x7ace0c74UL, 0x9461b966UL, + 0xf10605deUL +#endif + } +}; diff --git a/crc32c.c b/crc32c.c new file mode 100644 index 0000000..156cba1 --- /dev/null +++ b/crc32c.c @@ -0,0 +1,104 @@ +/* + * Oct 28, 2015 Song Liu simplified the code and port it to mdadm + * + * Aug 8, 2011 Bob Pearson with help from Joakim Tjernlund and George Spelvin + * cleaned up code to current version of sparse and added the slicing-by-8 + * algorithm to the closely similar existing slicing-by-4 algorithm. + * + * Oct 15, 2000 Matt Domsch <Matt_Domsch@dell.com> + * Nicer crc32 functions/docs submitted by linux@horizon.com. Thanks! + * Code was from the public domain, copyright abandoned. Code was + * subsequently included in the kernel, thus was re-licensed under the + * GNU GPL v2. 
+ * + * Oct 12, 2000 Matt Domsch <Matt_Domsch@dell.com> + * Same crc32 function was used in 5 other places in the kernel. + * I made one version, and deleted the others. + * There are various incantations of crc32(). Some use a seed of 0 or ~0. + * Some xor at the end with ~0. The generic crc32() function takes + * seed as an argument, and doesn't xor at the end. Then individual + * users can do whatever they need. + * drivers/net/smc9194.c uses seed ~0, doesn't xor with ~0. + * fs/jffs2 uses seed 0, doesn't xor with ~0. + * fs/partitions/efi.c uses seed ~0, xor's with ~0. + * + * This source code is licensed under the GNU General Public License, + * Version 2. See the file COPYING for more details. + */ + +#include <sys/types.h> +#include <asm/types.h> +#include <stdlib.h> + +/* + * There are multiple 16-bit CRC polynomials in common use, but this is + * *the* standard CRC-32 polynomial, first popularized by Ethernet. + * x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x^1+x^0 + */ +#define CRCPOLY_LE 0xedb88320 +#define CRCPOLY_BE 0x04c11db7 + +/* + * This is the CRC32c polynomial, as outlined by Castagnoli. + * x^32+x^28+x^27+x^26+x^25+x^23+x^22+x^20+x^19+x^18+x^14+x^13+x^11+x^10+x^9+ + * x^8+x^6+x^0 + */ +#define CRC32C_POLY_LE 0x82F63B78 + +/** + * crc32_le_generic() - Calculate bitwise little-endian Ethernet AUTODIN II + * CRC32/CRC32C + * @crc: seed value for computation. ~0 for Ethernet, sometimes 0 for other + * uses, or the previous crc32/crc32c value if computing incrementally. + * @p: pointer to buffer over which CRC32/CRC32C is run + * @len: length of buffer @p + * @polynomial: CRC32/CRC32c LE polynomial + */ +static inline __u32 crc32_le_generic(__u32 crc, unsigned char const *p, + size_t len, __u32 polynomial) +{ + int i; + while (len--) { + crc ^= *p++; + for (i = 0; i < 8; i++) + crc = (crc >> 1) ^ ((crc & 1) ? 
polynomial : 0); + } + return crc; +} + +__u32 crc32_le(__u32 crc, unsigned char const *p, size_t len) +{ + return crc32_le_generic(crc, p, len, CRCPOLY_LE); +} + +__u32 crc32c_le(__u32 crc, unsigned char const *p, size_t len) +{ + return crc32_le_generic(crc, p, len, CRC32C_POLY_LE); +} + +/** + * crc32_be_generic() - Calculate bitwise big-endian Ethernet AUTODIN II CRC32 + * @crc: seed value for computation. ~0 for Ethernet, sometimes 0 for + * other uses, or the previous crc32 value if computing incrementally. + * @p: pointer to buffer over which CRC32 is run + * @len: length of buffer @p + * @polynomial: CRC32 BE polynomial + */ +static inline __u32 crc32_be_generic(__u32 crc, unsigned char const *p, + size_t len, __u32 polynomial) +{ + int i; + while (len--) { + crc ^= *p++ << 24; + for (i = 0; i < 8; i++) + crc = + (crc << 1) ^ ((crc & 0x80000000) ? polynomial : + 0); + } + return crc; +} + +__u32 crc32_be(__u32 crc, unsigned char const *p, size_t len) +{ + return crc32_be_generic(crc, p, len, CRCPOLY_BE); +} @@ -0,0 +1,74 @@ + +/* doubly linked lists */ +/* This is free software. No strings attached. 
No copyright claimed */ + +#include <unistd.h> +#include <stdlib.h> +#include <string.h> +#ifdef __dietlibc__ +char *strncpy(char *dest, const char *src, size_t n) __THROW; +#endif +void *xcalloc(size_t num, size_t size); +#include "dlink.h" + +void *dl_head() +{ + void *h; + h = dl_alloc(0); + dl_next(h) = h; + dl_prev(h) = h; + return h; +} + +void dl_free(void *v) +{ + struct __dl_head *vv = v; + free(vv-1); +} + +void dl_init(void *v) +{ + dl_next(v) = v; + dl_prev(v) = v; +} + +void dl_insert(void *head, void *val) +{ + dl_next(val) = dl_next(head); + dl_prev(val) = head; + dl_next(dl_prev(val)) = val; + dl_prev(dl_next(val)) = val; +} + +void dl_add(void *head, void *val) +{ + dl_prev(val) = dl_prev(head); + dl_next(val) = head; + dl_next(dl_prev(val)) = val; + dl_prev(dl_next(val)) = val; +} + +void dl_del(void *val) +{ + if (dl_prev(val) == 0 || dl_next(val) == 0) + return; + dl_prev(dl_next(val)) = dl_prev(val); + dl_next(dl_prev(val)) = dl_next(val); + dl_prev(val) = dl_next(val) = 0; +} + +char *dl_strndup(char *s, int l) +{ + char *n; + if (s == NULL) + return NULL; + n = dl_newv(char, l+1); + strncpy(n, s, l+1); + n[l] = 0; + return n; +} + +char *dl_strdup(char *s) +{ + return dl_strndup(s, (int)strlen(s)); +} @@ -0,0 +1,25 @@ + +/* doubly linked lists */ +/* This is free software. No strings attached. 
No copyright claimed */ + +struct __dl_head +{ + void * dh_prev; + void * dh_next; +}; + +#define dl_alloc(size) ((void*)(((char*)xcalloc(1,(size)+sizeof(struct __dl_head)))+sizeof(struct __dl_head))) +#define dl_new(t) ((t*)dl_alloc(sizeof(t))) +#define dl_newv(t,n) ((t*)dl_alloc(sizeof(t)*n)) + +#define dl_next(p) *(&(((struct __dl_head*)(p))[-1].dh_next)) +#define dl_prev(p) *(&(((struct __dl_head*)(p))[-1].dh_prev)) + +void *dl_head(void); +char *dl_strdup(char *); +char *dl_strndup(char *, int); +void dl_insert(void*, void*); +void dl_add(void*, void*); +void dl_del(void*); +void dl_free(void*); +void dl_init(void*); diff --git a/external-reshape-design.txt b/external-reshape-design.txt new file mode 100644 index 0000000..e4cf4e1 --- /dev/null +++ b/external-reshape-design.txt @@ -0,0 +1,280 @@ +External Reshape + +1 Problem statement + +External (third-party metadata) reshape differs from native-metadata +reshape in three key ways: + +1.1 Format specific constraints + +In the native case reshape is limited by what is implemented in the +generic reshape routine (Grow_reshape()) and what is supported by the +kernel. There are exceptional cases where Grow_reshape() may block +operations when it knows that the kernel implementation is broken, but +otherwise the kernel is relied upon to be the final arbiter of what +reshape operations are supported. + +In the external case the kernel, and the generic checks in +Grow_reshape(), become the super-set of what reshapes are possible. The +metadata format may not support, or have yet to implement a given +reshape type. The implication for Grow_reshape() is that it must query +the metadata handler and effect changes in the metadata before the new +geometry is posted to the kernel. The ->reshape_super method allows +Grow_reshape() to validate the requested operation and post the metadata +update. 
+ +1.2 Scope of reshape + +Native metadata reshape is always performed at the array scope (no +metadata relationship with sibling arrays on the same disks). External +reshape, depending on the format, may not allow the number of member +disks to be changed in a subarray unless the change is simultaneously +applied to all subarrays in the container. For example the imsm format +requires all member disks to be a member of all subarrays, so a 4-disk +raid5 in a container that also houses a 4-disk raid10 array could not be +reshaped to 5 disks as the imsm format does not support a 5-disk raid10 +representation. This requires the ->reshape_super method to check the +contents of the array and ask the user to run the reshape at container +scope (if all subarrays are agreeable to the change), or report an +error in the case where one subarray cannot support the change. + +1.3 Monitoring / checkpointing + +Reshape, unlike rebuild/resync, requires strict checkpointing to survive +interrupted reshape operations. For example when expanding a raid5 +array the first few stripes of the array will be overwritten in a +destructive manner. When restarting the reshape process we need to know +the exact location of the last successfully written stripe, and we need +to restore the data in any partially overwritten stripe. Native +metadata stores this backup data in the unused portion of spares that +are being promoted to array members, or in an external backup file +(located on a non-involved block device). + +The kernel is in charge of recording checkpoints of reshape progress, +but mdadm is delegated the task of managing the backup space which +involves: +1/ Identifying what data will be overwritten in the next unit of reshape + operation +2/ Suspending access to that region so that a snapshot of the data can + be transferred to the backup space. +3/ Allowing the kernel to reshape the saved region and setting the + boundary for the next backup. 
+ +In the external reshape case we want to preserve this mdadm +'reshape-manager' arrangement, but have a third actor, mdmon, to +consider. It is tempting to give the role of managing reshape to mdmon, +but that is counter to its role as a monitor, and conflicts with the +existing capabilities and role of mdadm to manage the progress of +reshape. For clarity the external reshape implementation maintains the +role of mdmon as a (mostly) passive recorder of raid events, and mdadm +treats it as it would the kernel in the native reshape case (modulo +needing to send explicit metadata update messages and checking that +mdmon took the expected action). + +External reshape can use the generic md backup file as a fallback, but in the +optimal/firmware-compatible case the reshape-manager will use the metadata +specific areas for managing reshape. The implementation also needs to spawn a +reshape-manager per subarray when the reshape is being carried out at the +container level. For these two reasons the ->manage_reshape() method is +introduced. This method, in addition to the base tasks mentioned above: +1/ Processes each subarray one at a time in series - where appropriate. +2/ Uses either generic routines in Grow.c for md-style backup file + support, or uses the metadata-format specific location for storing + recovery data. +This aims to avoid a "midlayer mistake"[1] and lets the metadata handler +optionally take advantage of generic infrastructure in Grow.c. + +2 Details for specific reshape requests + +There are quite a few moving pieces spread out across md, mdadm, and mdmon for +the support of external reshape, and there are several different types of +reshape that need to be comprehended by the implementation. A rundown of +these details follows. + +2.0 General provisions: + +Obtain an exclusive open on the container to make sure we are not +running concurrently with a Create() event. 
+ +2.1 Freezing sync_action + + Before making any attempt at a reshape we 'freeze' every array in + the container to ensure no spare assignment or recovery happens. + This involves writing 'frozen' to sync_action and changing the '/' + after 'external:' in metadata_version to a '-'. mdmon knows that + this means not to perform any management. + + Before doing this we check that all sync_actions are 'idle', which + is racy but still useful. + Afterwards we check that all member arrays have no spares + or partial spares (recovery_start != 'none') which would indicate a + race. If they do, we unfreeze again. + + Once this completes we know all the arrays are stable. They may + still have failed devices as devices can fail at any time. However + we treat those like failures that happen during the reshape. + +2.2 Reshape size + + 1/ mdadm::Grow_reshape(): checks if mdmon is running and optionally + initializes st->update_tail + 2/ mdadm::Grow_reshape() calls ->reshape_super() to check that the size change + is allowed (being performed at subarray scope / enough room) prepares a + metadata update + 3/ mdadm::Grow_reshape(): flushes the metadata update (via + flush_metadata_update(), or ->sync_metadata()) + 4/ mdadm::Grow_reshape(): post the new size to the kernel + + +2.3 Reshape level (simple-takeover) + +"simple-takeover" implies the level change can be satisfied without touching +sync_action + + 1/ mdadm::Grow_reshape(): checks if mdmon is running and optionally + initializes st->update_tail + 2/ mdadm::Grow_reshape() calls ->reshape_super() to check that the level change + is allowed (being performed at subarray scope) prepares a + metadata update + 2a/ raid10 --> raid0: degrade all mirror legs prior to calling + ->reshape_super + 3/ mdadm::Grow_reshape(): flushes the metadata update (via + flush_metadata_update(), or ->sync_metadata()) + 4/ mdadm::Grow_reshape(): post the new level to the kernel + +2.4 Reshape chunk, layout + +2.5 Reshape raid disks (grow) + + 1/ 
mdadm::Grow_reshape(): unconditionally initializes st->update_tail + because only redundant raid levels can modify the number of raid disks + 2/ mdadm::Grow_reshape(): calls ->reshape_super() to check that the level + change is allowed (being performed at proper scope / permissible + geometry / proper spares available in the container), chooses + the spares to use, and prepares a metadata update. + 3/ mdadm::Grow_reshape(): Converts each subarray in the container to the + raid level that can perform the reshape and starts mdmon. + 4/ mdadm::Grow_reshape(): Pushes the update to mdmon. + 5/ mdadm::Grow_reshape(): uses container_content to find details of + the spares and passes them to the kernel. + 6/ mdadm::Grow_reshape(): gives raid_disks update to the kernel, + sets sync_max, sync_min, suspend_lo, suspend_hi all to zero, + and starts the reshape by writing 'reshape' to sync_action. + 7/ mdmon::monitor notices the sync_action change and tells + managemon to check for new devices. managemon notices the new + devices, opens relevant sysfs file, and passes them all to + monitor. + 8/ mdadm::Grow_reshape() calls ->manage_reshape to oversee the + rest of the reshape. + + 9/ mdadm::<format>->manage_reshape(): saves data that will be overwritten by + the kernel to either the backup file or the metadata specific location, + advances sync_max, waits for reshape, ping mdmon, repeat. + Meanwhile mdmon::read_and_act(): records checkpoints. + Specifically. + + 9a/ if the 'next' stripe to be reshaped will over-write + itself during reshape then: + 9a.1/ increase suspend_hi to cover a suitable number of + stripes. + 9a.2/ backup those stripes safely. 
+ 9a.3/ advance sync_max to allow those stripes to be reshaped + 9a.4/ when sync_completed indicates that those stripes have + been reshaped, manage_reshape must ping_manager + 9a.5/ when mdmon notices that sync_completed has been updated, + it records the new checkpoint in the metadata + 9a.6/ after the ping_manager, manage_reshape will increase + suspend_lo to allow access to those stripes again + + 9b/ if the 'next' stripe to be reshaped will over-write unused + space during reshape then we apply the same process as above, + except that there is no need to back anything up. + Note that we *do* need to keep suspend_hi progressing as + it is not safe to write to the area-under-reshape. For + kernel-managed-metadata this protection is provided by + ->reshape_safe, but that does not protect us in the case + of user-space-managed-metadata. + + 10/ mdadm::<format>->manage_reshape(): Once reshape completes changes the raid + level back to the nominal raid level (if necessary) + + FIXME: native metadata does not have the capability to record the original + raid level in reshape-restart case because the kernel always records current + raid level to the metadata, whereas external metadata can masquerade at an + alternate level based on the reshape state. + +2.6 Reshape raid disks (shrink) + +3 Interaction with metadata handler. + + The following calls are made into the metadata handler to assist + with initiating and monitoring a 'reshape'. + + 1/ ->reshape_super is called quite early (after only minimal + checks) to make sure that the metadata can record the new shape + and any necessary transitions. It may be passed a 'container' + or an individual array within a container, and it should notice + the difference and act accordingly. + When a reshape is requested against a container it is expected + that it should be applied to every array in the container, + however it is up to the metadata handler to determine final + policy. 
+ + If the reshape is supportable, the internal copy of the metadata + should be updated, and a metadata update suitable for sending + to mdmon should be queued. + + If the reshape will involve converting spares into array members, + this must be recorded in the metadata too. + + 2/ ->container_content will be called to find out the new state + of the array, or of all arrays in the container. Any newly + added devices (with state==0 and raid_disk >= 0) will be added + to the array as spares with the relevant slot number. + + It is likely that the info returned by ->container_content will + have ->reshape_active set, ->reshape_progress set to e.g. 0, and + new_* set appropriately. mdadm will use this information to + cause the correct reshape to start at an appropriate time. + + 3/ ->set_array_state will be called by mdmon when reshape has + started and again periodically as it progresses. This should + record the ->last_checkpoint as the point where reshape has + progressed to. When the reshape finishes, this will be called + again and it should notice that ->curr_action is no longer + 'reshape' and so should record that the reshape has finished + providing 'last_checkpoint' has progressed suitably. + + 4/ ->manage_reshape will be called once the reshape has been set + up in the kernel but before sync_max has been moved from 0, so + no actual reshape will have happened. + + ->manage_reshape should call progress_reshape() to allow the + reshape to progress, and should back up any data as indicated + by the return value. See the documentation of that function + for more details. + ->manage_reshape will be called multiple times when a + container is being reshaped, once for each member array in + the container. + + + The progress of the metadata is as follows: + 1/ mdadm sends a metadata update to mdmon which marks the array + as undergoing a reshape. 
This is set up by + ->reshape_super and applied by ->process_update + For container-wide reshape, this happens once for the whole + container. + 2/ mdmon notices progress via the sysfs files and calls + ->set_array_state to update the state periodically + For container-wide reshape, this happens repeatedly for + one array, then repeatedly for the next, etc. + 3/ mdmon notices when reshape has finished and calls + ->set_array_state to record that the reshape is complete. + For container-wide reshape, this happens once for each + member array. + + + +... + +[1]: Linux kernel design patterns - part 3, Neil Brown https://lwn.net/Articles/336262/ diff --git a/inventory b/inventory new file mode 100755 index 0000000..c4801b4 --- /dev/null +++ b/inventory @@ -0,0 +1,284 @@ + +.gitignore +ANNOUNCE-3.0 +ANNOUNCE-3.0.1 +ANNOUNCE-3.0.2 +ANNOUNCE-3.0.3 +ANNOUNCE-3.1 +ANNOUNCE-3.1.1 +ANNOUNCE-3.1.2 +ANNOUNCE-3.1.3 +ANNOUNCE-3.1.4 +ANNOUNCE-3.1.5 +ANNOUNCE-3.2 +ANNOUNCE-3.2.1 +ANNOUNCE-3.2.2 +ANNOUNCE-3.2.3 +ANNOUNCE-3.2.4 +ANNOUNCE-3.2.5 +ANNOUNCE-3.2.6 +ANNOUNCE-3.3 +ANNOUNCE-3.3.1 +ANNOUNCE-3.3.2 +ANNOUNCE-3.3.3 +ANNOUNCE-3.3.4 +ANNOUNCE-3.4 +ANNOUNCE-4.0 +ANNOUNCE-4.1 +ANNOUNCE-4.2 +Assemble.c +Build.c +COPYING +ChangeLog +Create.c +Detail.c +Dump.c +Examine.c +Grow.c +INSTALL +Incremental.c +Kill.c +Makefile +Manage.c +Monitor.c +Query.c +README.initramfs +ReadMe.c +TODO +bitmap.c +bitmap.h +clustermd_tests/ +clustermd_tests/00r10_Create +clustermd_tests/00r1_Create +clustermd_tests/01r10_Grow_bitmap-switch +clustermd_tests/01r10_Grow_resize +clustermd_tests/01r1_Grow_add +clustermd_tests/01r1_Grow_bitmap-switch +clustermd_tests/01r1_Grow_resize +clustermd_tests/02r10_Manage_add +clustermd_tests/02r10_Manage_add-spare +clustermd_tests/02r10_Manage_re-add +clustermd_tests/02r1_Manage_add +clustermd_tests/02r1_Manage_add-spare +clustermd_tests/02r1_Manage_re-add +clustermd_tests/03r10_switch-recovery +clustermd_tests/03r10_switch-resync +clustermd_tests/03r1_switch-recovery 
+clustermd_tests/03r1_switch-resync +clustermd_tests/cluster_conf +clustermd_tests/func.sh +config.c +coverity-gcc-hack.h +crc32.c +crc32.h +crc32c.c +dlink.c +dlink.h +external-reshape-design.txt +inventory +lib.c +makedist +managemon.c +mapfile.c +maps.c +md.4 +md5.h +md_p.h +md_u.h +mdadm.8.in +mdadm.c +mdadm.conf-example +mdadm.conf.5 +mdadm.h +mdadm.spec +mdmon-design.txt +mdmon.8 +mdmon.c +mdmon.h +mdopen.c +mdstat.c +misc/ +misc/mdcheck +misc/syslog-events +mkinitramfs +monitor.c +msg.c +msg.h +part.h +platform-intel.c +platform-intel.h +policy.c +probe_roms.c +probe_roms.h +pwgr.c +raid5extend.c +raid6check.8 +raid6check.c +restripe.c +sg_io.c +sha1.c +sha1.h +super-ddf.c +super-gpt.c +super-intel.c +super-mbr.c +super0.c +super1.c +swap_super.c +sysfs.c +systemd/ +systemd/SUSE-mdadm_env.sh +systemd/mdadm-grow-continue@.service +systemd/mdadm-last-resort@.service +systemd/mdadm-last-resort@.timer +systemd/mdadm.shutdown +systemd/mdcheck_continue.service +systemd/mdcheck_continue.timer +systemd/mdcheck_start.service +systemd/mdcheck_start.timer +systemd/mdmon@.service +systemd/mdmonitor-oneshot.service +systemd/mdmonitor-oneshot.timer +systemd/mdmonitor.service +test +tests/ +tests/00linear +tests/00multipath +tests/00names +tests/00raid0 +tests/00raid1 +tests/00raid10 +tests/00raid4 +tests/00raid5 +tests/00raid6 +tests/00readonly +tests/01r1fail +tests/01r5fail +tests/01r5integ +tests/01raid6integ +tests/01replace +tests/02lineargrow +tests/02r1add +tests/02r1grow +tests/02r5grow +tests/02r6grow +tests/03assem-incr +tests/03r0assem +tests/03r5assem +tests/03r5assem-failed +tests/03r5assemV1 +tests/04r0update +tests/04r1update +tests/04r5swap +tests/04update-metadata +tests/04update-uuid +tests/05r1-add-internalbitmap +tests/05r1-add-internalbitmap-v1a +tests/05r1-add-internalbitmap-v1b +tests/05r1-add-internalbitmap-v1c +tests/05r1-bitmapfile +tests/05r1-failfast +tests/05r1-grow-external +tests/05r1-grow-internal +tests/05r1-grow-internal-1 
+tests/05r1-internalbitmap +tests/05r1-internalbitmap-v1a +tests/05r1-internalbitmap-v1b +tests/05r1-internalbitmap-v1c +tests/05r1-n3-bitmapfile +tests/05r1-re-add +tests/05r1-re-add-nosuper +tests/05r1-remove-internalbitmap +tests/05r1-remove-internalbitmap-v1a +tests/05r1-remove-internalbitmap-v1b +tests/05r1-remove-internalbitmap-v1c +tests/05r5-bitmapfile +tests/05r5-internalbitmap +tests/05r6-bitmapfile +tests/05r6tor0 +tests/06name +tests/06sysfs +tests/06wrmostly +tests/07autoassemble +tests/07autodetect +tests/07changelevelintr +tests/07changelevels +tests/07layouts +tests/07reshape5intr +tests/07revert-grow +tests/07revert-inplace +tests/07revert-shrink +tests/07testreshape5 +tests/09imsm-assemble +tests/09imsm-create-fail-rebuild +tests/09imsm-overlap +tests/10ddf-assemble-missing +tests/10ddf-create +tests/10ddf-create-fail-rebuild +tests/10ddf-fail-create-race +tests/10ddf-fail-readd +tests/10ddf-fail-readd-readonly +tests/10ddf-fail-spare +tests/10ddf-fail-stop-readd +tests/10ddf-fail-twice +tests/10ddf-fail-two-spares +tests/10ddf-geometry +tests/10ddf-incremental-wrong-order +tests/10ddf-sudden-degraded +tests/11spare-migration +tests/12imsm-r0_2d-grow-r0_3d +tests/12imsm-r0_2d-grow-r0_4d +tests/12imsm-r0_2d-grow-r0_5d +tests/12imsm-r0_3d-grow-r0_4d +tests/12imsm-r5_3d-grow-r5_4d +tests/12imsm-r5_3d-grow-r5_5d +tests/13imsm-r0_r0_2d-grow-r0_r0_4d +tests/13imsm-r0_r0_2d-grow-r0_r0_5d +tests/13imsm-r0_r0_3d-grow-r0_r0_4d +tests/13imsm-r0_r5_3d-grow-r0_r5_4d +tests/13imsm-r0_r5_3d-grow-r0_r5_5d +tests/13imsm-r5_r0_3d-grow-r5_r0_4d +tests/13imsm-r5_r0_3d-grow-r5_r0_5d +tests/14imsm-r0_3d-r5_3d-migrate-r5_4d-r5_4d +tests/14imsm-r0_3d_no_spares-migrate-r5_3d +tests/14imsm-r0_r0_2d-takeover-r10_4d +tests/14imsm-r10_4d-grow-r10_5d +tests/14imsm-r10_r5_4d-takeover-r0_2d +tests/14imsm-r1_2d-grow-r1_3d +tests/14imsm-r1_2d-takeover-r0_2d +tests/14imsm-r5_3d-grow-r5_5d-no-spares +tests/14imsm-r5_3d-migrate-r4_3d +tests/15imsm-r0_3d_64k-migrate-r0_3d_256k 
+tests/15imsm-r5_3d_4k-migrate-r5_3d_256k +tests/15imsm-r5_3d_64k-migrate-r5_3d_256k +tests/15imsm-r5_6d_4k-migrate-r5_6d_256k +tests/15imsm-r5_r0_3d_64k-migrate-r5_r0_3d_256k +tests/16imsm-r0_3d-migrate-r5_4d +tests/16imsm-r0_5d-migrate-r5_6d +tests/16imsm-r5_3d-migrate-r0_3d +tests/16imsm-r5_5d-migrate-r0_5d +tests/18imsm-1d-takeover-r0_1d +tests/18imsm-1d-takeover-r1_2d +tests/18imsm-r0_2d-takeover-r10_4d +tests/18imsm-r10_4d-takeover-r0_2d +tests/18imsm-r1_2d-takeover-r0_1d +tests/19raid6auto-repair +tests/19raid6check +tests/19raid6repair +tests/19repair-does-not-destroy +tests/20raid5journal +tests/21raid5cache +tests/ToTest +tests/env-ddf-template +tests/env-imsm-template +tests/func.sh +tests/imsm-grow-template +tests/utils +udev-md-clustered-confirm-device.rules +udev-md-raid-arrays.rules +udev-md-raid-assembly.rules +udev-md-raid-creating.rules +udev-md-raid-safe-timeouts.rules +util.c +uuid.c +xmalloc.c @@ -0,0 +1,575 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2011 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +#include "mdadm.h" +#include "dlink.h" +#include <ctype.h> +#include <limits.h> + +bool is_dev_alive(char *path) +{ + if (!path) + return false; + + if (access(path, R_OK) == 0) + return true; + + return false; +} + +/* This fill contains various 'library' style function. They + * have no dependency on anything outside this file. + */ + +int get_mdp_major(void) +{ + static int mdp_major = -1; + FILE *fl; + char *w; + int have_block = 0; + int have_devices = 0; + int last_num = -1; + + if (mdp_major != -1) + return mdp_major; + + fl = fopen("/proc/devices", "r"); + if (!fl) + return -1; + + while ((w = conf_word(fl, 1))) { + if (have_block && strcmp(w, "devices:") == 0) + have_devices = 1; + have_block = (strcmp(w, "Block") == 0); + if (isdigit(w[0])) + last_num = atoi(w); + if (have_devices && strcmp(w, "mdp") == 0) + mdp_major = last_num; + free(w); + } + fclose(fl); + + return mdp_major; +} + +char *devid2kname(dev_t devid) +{ + char path[30]; + char link[PATH_MAX]; + static char devnm[32]; + char *cp; + int n; + + /* Look at the + * /sys/dev/block/%d:%d link which must look like + * and take the last component. 
+ */ + sprintf(path, "/sys/dev/block/%d:%d", major(devid), minor(devid)); + n = readlink(path, link, sizeof(link) - 1); + if (n > 0) { + link[n] = 0; + cp = strrchr(link, '/'); + if (cp) { + strcpy(devnm, cp + 1); + return devnm; + } + } + return NULL; +} + +char *stat2kname(struct stat *st) +{ + if ((S_IFMT & st->st_mode) != S_IFBLK) + return NULL; + + return devid2kname(st->st_rdev); +} + +char *fd2kname(int fd) +{ + struct stat stb; + + if (fstat(fd, &stb) == 0) + return stat2kname(&stb); + + return NULL; +} + +char *devid2devnm(dev_t devid) +{ + char path[30]; + char link[200]; + static char devnm[32]; + char *cp, *ep; + int n; + + /* Might be an extended-minor partition or a + * named md device. Look at the + * /sys/dev/block/%d:%d link which must look like + * ../../block/mdXXX/mdXXXpYY + * or + * ...../block/md_FOO + */ + sprintf(path, "/sys/dev/block/%d:%d", major(devid), minor(devid)); + n = readlink(path, link, sizeof(link) - 1); + if (n > 0) { + link[n] = 0; + cp = strstr(link, "/block/"); + if (cp) { + cp += 7; + ep = strchr(cp, '/'); + if (ep) + *ep = 0; + strcpy(devnm, cp); + return devnm; + } + } + if (major(devid) == MD_MAJOR) + sprintf(devnm,"md%d", minor(devid)); + else if (major(devid) == (unsigned)get_mdp_major()) + sprintf(devnm,"md_d%d", + (minor(devid)>>MdpMinorShift)); + else + return NULL; + + return devnm; +} + +char *stat2devnm(struct stat *st) +{ + if ((S_IFMT & st->st_mode) != S_IFBLK) + return NULL; + + return devid2devnm(st->st_rdev); +} + +char *fd2devnm(int fd) +{ + struct stat stb; + + if (fstat(fd, &stb) == 0) + return stat2devnm(&stb); + + return NULL; +} + +/* When we create a new array, we don't want the content to + * be immediately examined by udev - it is probably meaningless. + * So create /run/mdadm/creating-mdXXX and expect that a udev + * rule will noticed this and act accordingly. 
+ */ +static char block_path[] = "/run/mdadm/creating-%s"; +static char *unblock_path = NULL; +void udev_block(char *devnm) +{ + int fd; + char *path = NULL; + + xasprintf(&path, block_path, devnm); + fd = open(path, O_CREAT|O_RDWR, 0600); + if (fd >= 0) { + close(fd); + unblock_path = path; + } else + free(path); +} + +void udev_unblock(void) +{ + if (unblock_path) + unlink(unblock_path); + free(unblock_path); + unblock_path = NULL; +} + +/* + * convert a major/minor pair for a block device into a name in /dev, if possible. + * On the first call, walk /dev collecting name. + * Put them in a simple linked listfor now. + */ +struct devmap { + int major, minor; + char *name; + struct devmap *next; +} *devlist = NULL; +int devlist_ready = 0; + +int add_dev(const char *name, const struct stat *stb, int flag, struct FTW *s) +{ + struct stat st; + + if (S_ISLNK(stb->st_mode)) { + if (stat(name, &st) != 0) + return 0; + stb = &st; + } + + if ((stb->st_mode&S_IFMT)== S_IFBLK) { + char *n = xstrdup(name); + struct devmap *dm = xmalloc(sizeof(*dm)); + if (strncmp(n, "/dev/./", 7) == 0) + strcpy(n + 4, name + 6); + if (dm) { + dm->major = major(stb->st_rdev); + dm->minor = minor(stb->st_rdev); + dm->name = n; + dm->next = devlist; + devlist = dm; + } + } + + return 0; +} + +#ifndef HAVE_NFTW +#ifdef HAVE_FTW +int add_dev_1(const char *name, const struct stat *stb, int flag) +{ + return add_dev(name, stb, flag, NULL); +} +int nftw(const char *path, + int (*han)(const char *name, const struct stat *stb, + int flag, struct FTW *s), int nopenfd, int flags) +{ + return ftw(path, add_dev_1, nopenfd); +} +#else +int nftw(const char *path, + int (*han)(const char *name, const struct stat *stb, + int flag, struct FTW *s), int nopenfd, int flags) +{ + return 0; +} +#endif /* HAVE_FTW */ +#endif /* HAVE_NFTW */ + +/* + * Find a block device with the right major/minor number. + * If we find multiple names, choose the shortest. + * If we find a name in /dev/md/, we prefer that. 
+ * This applies only to names for MD devices. + * If 'prefer' is set (normally to e.g. /by-path/) + * then we prefer a name which contains that string. + */ +char *map_dev_preferred(int major, int minor, int create, + char *prefer) +{ + struct devmap *p; + char *regular = NULL, *preferred=NULL; + int did_check = 0; + + if (major == 0 && minor == 0) + return NULL; + + retry: + if (!devlist_ready) { + char *dev = "/dev"; + struct stat stb; + while(devlist) { + struct devmap *d = devlist; + devlist = d->next; + free(d->name); + free(d); + } + if (lstat(dev, &stb) == 0 && S_ISLNK(stb.st_mode)) + dev = "/dev/."; + nftw(dev, add_dev, 10, FTW_PHYS); + devlist_ready=1; + did_check = 1; + } + + for (p = devlist; p; p = p->next) + if (p->major == major && p->minor == minor) { + if (strncmp(p->name, "/dev/md/",8) == 0 || + (prefer && strstr(p->name, prefer))) { + if (preferred == NULL || + strlen(p->name) < strlen(preferred)) + preferred = p->name; + } else { + if (regular == NULL || + strlen(p->name) < strlen(regular)) + regular = p->name; + } + } + if (!regular && !preferred && !did_check) { + devlist_ready = 0; + goto retry; + } + if (create && !regular && !preferred) { + static char buf[30]; + snprintf(buf, sizeof(buf), "%d:%d", major, minor); + regular = buf; + } + + return preferred ? preferred : regular; +} + +/* conf_word gets one word from the conf file. + * if "allow_key", then accept words at the start of a line, + * otherwise stop when such a word is found. + * We assume that the file pointer is at the end of a word, so the + * next character is a space, or a newline. If not, it is the start of a line. + */ + +char *conf_word(FILE *file, int allow_key) +{ + int wsize = 100; + int len = 0; + int c; + int quote; + int wordfound = 0; + char *word = xmalloc(wsize); + + while (wordfound == 0) { + /* at the end of a word.. 
*/ + c = getc(file); + if (c == '#') + while (c != EOF && c != '\n') + c = getc(file); + if (c == EOF) + break; + if (c == '\n') + continue; + + if (c != ' ' && c != '\t' && ! allow_key) { + ungetc(c, file); + break; + } + /* looks like it is safe to get a word here, if there is one */ + quote = 0; + /* first, skip any spaces */ + while (c == ' ' || c == '\t') + c = getc(file); + if (c != EOF && c != '\n' && c != '#') { + /* we really have a character of a word, so start saving it */ + while (c != EOF && c != '\n' && + (quote || (c != ' ' && c != '\t'))) { + wordfound = 1; + if (quote && c == quote) + quote = 0; + else if (quote == 0 && (c == '\'' || c == '"')) + quote = c; + else { + if (len == wsize-1) { + wsize += 100; + word = xrealloc(word, wsize); + } + word[len++] = c; + } + c = getc(file); + /* Hack for broken kernels (2.6.14-.24) that put + * "active(auto-read-only)" + * in /proc/mdstat instead of + * "active (auto-read-only)" + */ + if (c == '(' && len >= 6 && + strncmp(word + len - 6, "active", 6) == 0) + c = ' '; + } + } + if (c != EOF) + ungetc(c, file); + } + word[len] = 0; + + /* Further HACK for broken kernels.. 2.6.14-2.6.24 */ + if (strcmp(word, "auto-read-only)") == 0) + strcpy(word, "(auto-read-only)"); + +/* printf("word is <%s>\n", word); */ + if (!wordfound) { + free(word); + word = NULL; + } + return word; +} + +void print_quoted(char *str) +{ + /* Printf the string with surrounding quotes + * iff needed. + * If no space, tab, or quote - leave unchanged. + * Else print surrounded by " or ', swapping quotes + * when we find one that will cause confusion. 
+ */ + + char first_quote = 0, q; + char *c; + + for (c = str; *c; c++) { + switch(*c) { + case '\'': + case '"': + first_quote = *c; + break; + case ' ': + case '\t': + first_quote = *c; + continue; + default: + continue; + } + break; + } + if (!first_quote) { + printf("%s", str); + return; + } + + if (first_quote == '"') + q = '\''; + else + q = '"'; + putchar(q); + for (c = str; *c; c++) { + if (*c == q) { + putchar(q); + q ^= '"' ^ '\''; + putchar(q); + } + putchar(*c); + } + putchar(q); +} + +void print_escape(char *str) +{ + /* print str, but change space and tab to '_' + * as is suitable for device names + */ + for (; *str; str++) { + switch (*str) { + case ' ': + case '\t': + putchar('_'); + break; + case '/': + putchar('-'); + break; + default: + putchar(*str); + } + } +} + +int check_env(char *name) +{ + char *val = getenv(name); + + if (val && atoi(val) == 1) + return 1; + + return 0; +} + +int use_udev(void) +{ + static int use = -1; + struct stat stb; + + if (use < 0) { + use = ((stat("/dev/.udev", &stb) == 0 || + stat("/run/udev", &stb) == 0) && + check_env("MDADM_NO_UDEV") == 0); + } + return use; +} + +unsigned long GCD(unsigned long a, unsigned long b) +{ + while (a != b) { + if (a < b) + b -= a; + if (b < a) + a -= b; + } + return a; +} + +/* + * conf_line reads one logical line from the conffile or mdstat. + * It skips comments and continues until it finds a line that starts + * with a non blank/comment. This character is pushed back for the next call + * A doubly linked list of words is returned. + * the first word will be a keyword. Other words will have had quotes removed. 
+ */ + +char *conf_line(FILE *file) +{ + char *w; + char *list; + + w = conf_word(file, 1); + if (w == NULL) + return NULL; + + list = dl_strdup(w); + free(w); + dl_init(list); + + while ((w = conf_word(file, 0))){ + char *w2 = dl_strdup(w); + free(w); + dl_add(list, w2); + } +/* printf("got a line\n");*/ + return list; +} + +void free_line(char *line) +{ + char *w; + for (w = dl_next(line); w != line; w = dl_next(line)) { + dl_del(w); + dl_free(w); + } + dl_free(line); +} + +/** + * parse_num() - Parse int from string. + * @dest: Pointer to destination. + * @num: Pointer to string that is going to be parsed. + * + * If string contains anything after a number, error code is returned. + * The same happens when number is bigger than INT_MAX or smaller than 0. + * Writes to destination only if successfully read the number. + * + * Return: 0 on success, 1 otherwise. + */ +int parse_num(int *dest, char *num) +{ + char *c = NULL; + long temp; + + if (!num) + return 1; + + errno = 0; + temp = strtol(num, &c, 10); + if (temp < 0 || temp > INT_MAX || *c || errno != 0 || num == c) + return 1; + *dest = temp; + return 0; +} diff --git a/makedist b/makedist new file mode 100755 index 0000000..0c4b39e --- /dev/null +++ b/makedist @@ -0,0 +1,96 @@ +#!/bin/sh +# avoid silly sorting +export LANG=C +arg=$1 +target=~/public_html/source/mdadm +if [ " $arg" = " test" ] +then + target=/tmp/mdadm-test + rm -rf $target + mkdir -p $target +fi +if [ -d $target ] +then : +else echo $target is not a directory + exit 2 +fi +set `grep '^#define VERSION' ReadMe.c ` +version=`echo $3 | sed -e 's/"//g'` +grep "^.TH MDADM 8 .. v$version" mdadm.8.in > /dev/null 2>&1 || + { + echo mdadm.8.in does not mention version $version. + exit 1 + } +grep "^.TH MDMON 8 .. v$version" mdmon.8 > /dev/null 2>&1 || + { + echo mdmon.8 does not mention version $version. 
+ exit 1 + } +rpmv=`echo $version | tr - _` +grep "^Version: *$rpmv$" mdadm.spec > /dev/null 2>&1 || + { + echo mdadm.spec does not mention version $version. + exit 1 + } +if [ -f ANNOUNCE-$version ] +then : +else + echo ANNOUNCE-$version does not exist + exit 1 +fi +if grep "^ANNOUNCE-$version\$" inventory +then : +else { cat inventory ; echo ANNOUNCE-$version ; } | sort -o inventory +fi + +echo version = $version +base=mdadm-$rpmv.tar.gz +if [ " $arg" != " diff" ] +then + if [ -f $target/$base ] + then + echo $target/$base exists. + exit 1 + fi + trap "rm $target/$base; exit" 1 2 3 + git archive --prefix=mdadm-$rpmv/ HEAD | gzip --best > $target/$base + chmod a+r $target/$base + ls -l $target/$base + if tar tzf $target/$base | sed 's,[^/]*/,,' | sort | diff -u inventory - + then : correct files found + else echo "Extra files, or inventory is out-of-date" + rm $target/$base + exit 1 + fi + rpmbuild -ta $target/$base || exit 1 + find ~/rpmbuild/RPMS -name "*mdadm-$version-*" \ + -exec cp {} $target/RPM \; + cp ANNOUNCE-$version $target/ANNOUNCE + cp ChangeLog $target/ChangeLog + if [ " $arg" != " test" ] + then + echo -n "Confirm signing this release? " + read a + if [ " $a" != " y" ]; then echo OK - bye. ; exit 1; fi + if zcat $target/$base | gpg -ba > $target/$base.sign && gpg -ba $target/ANNOUNCE + then + kup put $target/$base $target/$base.sign \ + /pub/linux/utils/raid/mdadm/mdadm-$version.tar.gz + kup put $target/ANNOUNCE $target/ANNOUNCE.asc /pub/linux/utils/raid/mdadm/ANNOUNCE + else + echo signing failed + exit 1 + fi + fi +else + if [ ! -f $target/$base ] + then + echo $target/$base does not exist. + exit 1 + fi + ( cd .. 
; ln -s mdadm.v2 mdadm-$version ; tar chf - --exclude=.git --exclude="TAGS" --exclude='*,v' --exclude='*~' --exclude='*.o' --exclude mdadm --exclude=mdadm'.[^ch0-9]' --exclude=RCS mdadm-$version ; rm mdadm-$version ) | gzip --best > /var/tmp/mdadm-new.tgz + mkdir /var/tmp/mdadm-old ; zcat $target/$base | ( cd /var/tmp/mdadm-old ; tar xf - ) + mkdir /var/tmp/mdadm-new ; zcat /var/tmp/mdadm-new.tgz | ( cd /var/tmp/mdadm-new ; tar xf - ) + diff -ru /var/tmp/mdadm-old /var/tmp/mdadm-new + rm -rf /var/tmp/mdadm-old /var/tmp/mdadm-new /var/tmp/mdadm-new.tgz +fi diff --git a/managemon.c b/managemon.c new file mode 100644 index 0000000..bb7334c --- /dev/null +++ b/managemon.c @@ -0,0 +1,943 @@ +/* + * mdmon - monitor external metadata arrays + * + * Copyright (C) 2007-2009 Neil Brown <neilb@suse.de> + * Copyright (C) 2007-2009 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +/* + * The management thread for monitoring active md arrays. + * This thread does things which might block such as memory + * allocation. + * In particular: + * + * - Find out about new arrays in this container. + * Allocate the data structures and open the files. + * + * For this we watch /proc/mdstat and find new arrays with + * metadata type that confirms sharing. e.g. 
"md4" + * When we find a new array we slip it into the list of + * arrays and signal 'monitor' by writing to a pipe. + * + * - Respond to reshape requests by allocating new data structures + * and opening new files. + * + * These come as a change to raid_disks. We allocate a new + * version of the data structures and slip it into the list. + * 'monitor' will notice and release the old version. + * Changes to level, chunksize, layout.. do not need re-allocation. + * Reductions in raid_disks don't really either, but we handle + * them the same way for consistency. + * + * - When a device is added to the container, we add it to the metadata + * as a spare. + * + * - Deal with degraded array + * We only do this when first noticing the array is degraded. + * This can be when we first see the array, when sync completes or + * when recovery completes. + * + * Check if number of failed devices suggests recovery is needed, and + * skip if not. + * Ask metadata to allocate a spare device + * Add device as not in_sync and give a role + * Update metadata. + * Open sysfs files and pass to monitor. + * Make sure that monitor Starts recovery.... + * + * - Pass on metadata updates from external programs such as + * mdadm creating a new array. + * + * This is most-messy. + * It might involve adding a new array or changing the status of + * a spare, or any reconfig that the kernel doesn't get involved in. + * + * The required updates are received via a named pipe. There will + * be one named pipe for each container. Each message contains a + * sync marker: 0x5a5aa5a5, A byte count, and the message. This is + * passed to the metadata handler which will interpret and process it. + * For 'DDF' messages are internal data blocks with the leading + * 'magic number' signifying what sort of data it is. + * + */ + +/* + * We select on /proc/mdstat and the named pipe. 
+ * We create new arrays or updated version of arrays and slip + * them into the head of the list, then signal 'monitor' via a pipe write. + * 'monitor' will notice and place the old array on a return list. + * Metadata updates are placed on a queue just like they arrive + * from the named pipe. + * + * When new arrays are found based on correct metadata string, we + * need to identify them with an entry in the metadata. Maybe we require + * the metadata to be mdX/NN when NN is the index into an appropriate table. + * + */ + +/* + * List of tasks: + * - Watch for spares to be added to the container, and write updated + * metadata to them. + * - Watch for new arrays using this container, confirm they match metadata + * and if so, start monitoring them + * - Watch for spares being added to monitored arrays. This shouldn't + * happen, as we should do all the adding. Just remove them. + * - Watch for change in raid-disks, chunk-size, etc. Update metadata and + * start a reshape. + */ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include "mdadm.h" +#include "mdmon.h" +#include <sys/syscall.h> +#include <sys/socket.h> +#include <signal.h> + +static void close_aa(struct active_array *aa) +{ + struct mdinfo *d; + + for (d = aa->info.devs; d; d = d->next) { + close(d->recovery_fd); + close(d->state_fd); + close(d->bb_fd); + close(d->ubb_fd); + } + + if (aa->action_fd >= 0) + close(aa->action_fd); + if (aa->info.state_fd >= 0) + close(aa->info.state_fd); + if (aa->resync_start_fd >= 0) + close(aa->resync_start_fd); + if (aa->metadata_fd >= 0) + close(aa->metadata_fd); + if (aa->sync_completed_fd >= 0) + close(aa->sync_completed_fd); + if (aa->safe_mode_delay_fd >= 0) + close(aa->safe_mode_delay_fd); +} + +static void free_aa(struct active_array *aa) +{ + /* Note that this doesn't close fds if they are being used + * by a clone. 
->container will be set for a clone + */ + dprintf("sys_name: %s\n", aa->info.sys_name); + if (!aa->container) + close_aa(aa); + while (aa->info.devs) { + struct mdinfo *d = aa->info.devs; + aa->info.devs = d->next; + free(d); + } + free(aa); +} + +static struct active_array *duplicate_aa(struct active_array *aa) +{ + struct active_array *newa = xmalloc(sizeof(*newa)); + struct mdinfo **dp1, **dp2; + + *newa = *aa; + newa->next = NULL; + newa->replaces = NULL; + newa->info.next = NULL; + + dp2 = &newa->info.devs; + + for (dp1 = &aa->info.devs; *dp1; dp1 = &(*dp1)->next) { + struct mdinfo *d; + if ((*dp1)->state_fd < 0) + continue; + + d = xmalloc(sizeof(*d)); + *d = **dp1; + *dp2 = d; + dp2 = & d->next; + } + *dp2 = NULL; + + return newa; +} + +static void wakeup_monitor(void) +{ + /* tgkill(getpid(), mon_tid, SIGUSR1); */ + int pid = getpid(); + syscall(SYS_tgkill, pid, mon_tid, SIGUSR1); +} + +static void remove_old(void) +{ + if (discard_this) { + discard_this->next = NULL; + free_aa(discard_this); + if (pending_discard == discard_this) + pending_discard = NULL; + discard_this = NULL; + wakeup_monitor(); + } +} + +static void replace_array(struct supertype *container, + struct active_array *old, + struct active_array *new) +{ + /* To replace an array, we add it to the top of the list + * marked with ->replaces to point to the original. + * 'monitor' will take the original out of the list + * and put it on 'discard_this'. We take it from there + * and discard it. 
+ */ + remove_old(); + while (pending_discard) { + while (discard_this == NULL) + sleep(1); + remove_old(); + } + pending_discard = old; + new->replaces = old; + new->next = container->arrays; + container->arrays = new; + wakeup_monitor(); +} + +struct metadata_update *update_queue = NULL; +struct metadata_update *update_queue_handled = NULL; +struct metadata_update *update_queue_pending = NULL; + +static void free_updates(struct metadata_update **update) +{ + while (*update) { + struct metadata_update *this = *update; + void **space_list = this->space_list; + + *update = this->next; + free(this->buf); + free(this->space); + while (space_list) { + void *space = space_list; + space_list = *space_list; + free(space); + } + free(this); + } +} + +void check_update_queue(struct supertype *container) +{ + free_updates(&update_queue_handled); + + if (update_queue == NULL && + update_queue_pending) { + update_queue = update_queue_pending; + update_queue_pending = NULL; + wakeup_monitor(); + } +} + +static void queue_metadata_update(struct metadata_update *mu) +{ + struct metadata_update **qp; + + qp = &update_queue_pending; + while (*qp) + qp = & ((*qp)->next); + *qp = mu; +} + +static void add_disk_to_container(struct supertype *st, struct mdinfo *sd) +{ + int dfd; + char nm[20]; + struct metadata_update *update = NULL; + mdu_disk_info_t dk = { + .number = -1, + .major = sd->disk.major, + .minor = sd->disk.minor, + .raid_disk = -1, + .state = 0, + }; + + dprintf("add %d:%d to container\n", sd->disk.major, sd->disk.minor); + + sd->next = st->devs; + st->devs = sd; + + sprintf(nm, "%d:%d", sd->disk.major, sd->disk.minor); + dfd = dev_open(nm, O_RDWR); + if (dfd < 0) + return; + + st->update_tail = &update; + st->ss->add_to_super(st, &dk, dfd, NULL, INVALID_SECTORS); + st->ss->write_init_super(st); + queue_metadata_update(update); + st->update_tail = NULL; +} + +/* + * Create and queue update structure about the removed disks. 
+ * The update is prepared by super type handler and passed to the monitor + * thread. + */ +static void remove_disk_from_container(struct supertype *st, struct mdinfo *sd) +{ + struct metadata_update *update = NULL; + mdu_disk_info_t dk = { + .number = -1, + .major = sd->disk.major, + .minor = sd->disk.minor, + .raid_disk = -1, + .state = 0, + }; + dprintf("remove %d:%d from container\n", + sd->disk.major, sd->disk.minor); + + st->update_tail = &update; + st->ss->remove_from_super(st, &dk); + /* FIXME this write_init_super shouldn't be here. + * We have it after add_to_super to write to new device, + * but with 'remove' we don't ant to write to that device! + */ + st->ss->write_init_super(st); + queue_metadata_update(update); + st->update_tail = NULL; +} + +static void manage_container(struct mdstat_ent *mdstat, + struct supertype *container) +{ + /* Of interest here are: + * - if a new device has been added to the container, we + * add it to the array ignoring any metadata on it. + * - if a device has been removed from the container, we + * remove it from the device list and update the metadata. + * FIXME should we look for compatible metadata and take hints + * about spare assignment.... probably not. + */ + if (mdstat->devcnt != container->devcnt) { + struct mdinfo **cdp, *cd, *di, *mdi; + int found; + + /* read /sys/block/NAME/md/dev-??/block/dev to find out + * what is there, and compare with container->info.devs + * To see what is removed and what is added. 
+ * These need to be remove from, or added to, the array + */ + mdi = sysfs_read(-1, mdstat->devnm, GET_DEVS); + if (!mdi) { + /* invalidate the current count so we can try again */ + container->devcnt = -1; + return; + } + + /* check for removals */ + for (cdp = &container->devs; *cdp; ) { + found = 0; + for (di = mdi->devs; di; di = di->next) + if (di->disk.major == (*cdp)->disk.major && + di->disk.minor == (*cdp)->disk.minor) { + found = 1; + break; + } + if (!found) { + cd = *cdp; + *cdp = (*cdp)->next; + remove_disk_from_container(container, cd); + free(cd); + } else + cdp = &(*cdp)->next; + } + + /* check for additions */ + for (di = mdi->devs; di; di = di->next) { + for (cd = container->devs; cd; cd = cd->next) + if (di->disk.major == cd->disk.major && + di->disk.minor == cd->disk.minor) + break; + if (!cd) { + struct mdinfo *newd = xmalloc(sizeof(*newd)); + + *newd = *di; + add_disk_to_container(container, newd); + } + } + sysfs_free(mdi); + container->devcnt = mdstat->devcnt; + } +} + +static int sysfs_open2(char *devnum, char *name, char *attr) +{ + int fd = sysfs_open(devnum, name, attr); + if (fd >= 0) { + /* seq_file in the kernel allocates buffer space + * on the first read. Do that now so 'monitor' + * never needs too. 
+ */ + char buf[200]; + if (read(fd, buf, sizeof(buf)) < 0) + /* pretend not to ignore return value */ + return fd; + } + return fd; +} + +static int disk_init_and_add(struct mdinfo *disk, struct mdinfo *clone, + struct active_array *aa) +{ + if (!disk || !clone) + return -1; + + *disk = *clone; + disk->recovery_fd = sysfs_open2(aa->info.sys_name, disk->sys_name, + "recovery_start"); + if (disk->recovery_fd < 0) + return -1; + disk->state_fd = sysfs_open2(aa->info.sys_name, disk->sys_name, "state"); + if (disk->state_fd < 0) { + close(disk->recovery_fd); + return -1; + } + disk->bb_fd = sysfs_open2(aa->info.sys_name, disk->sys_name, + "bad_blocks"); + if (disk->bb_fd < 0) { + close(disk->recovery_fd); + close(disk->state_fd); + return -1; + } + disk->ubb_fd = sysfs_open2(aa->info.sys_name, disk->sys_name, + "unacknowledged_bad_blocks"); + if (disk->ubb_fd < 0) { + close(disk->recovery_fd); + close(disk->state_fd); + close(disk->bb_fd); + return -1; + } + disk->prev_state = read_dev_state(disk->state_fd); + disk->curr_state = disk->prev_state; + disk->next = aa->info.devs; + aa->info.devs = disk; + + return 0; +} + +static void manage_member(struct mdstat_ent *mdstat, + struct active_array *a) +{ + /* Compare mdstat info with known state of member array. + * We do not need to look for device state changes here, that + * is dealt with by the monitor. + * + * If a reshape is being requested, monitor will have noticed + * that sync_action changed and will have set check_reshape. + * We just need to see if new devices have appeared. All metadata + * updates will already have been processed. + * + * We also want to handle degraded arrays here by + * trying to find and assign a spare. + * We do that whenever the monitor tells us too. 
+ */ + char buf[64]; + int frozen; + struct supertype *container = a->container; + struct mdinfo *mdi; + + if (container == NULL) + /* Raced with something */ + return; + + if (mdstat->active) { + // FIXME + a->info.array.raid_disks = mdstat->raid_disks; + // MORE + } + + mdi = sysfs_read(-1, mdstat->devnm, + GET_COMPONENT|GET_CONSISTENCY_POLICY); + if (mdi) { + a->info.component_size = mdi->component_size; + a->info.consistency_policy = mdi->consistency_policy; + sysfs_free(mdi); + } + + /* honor 'frozen' */ + if (sysfs_get_str(&a->info, NULL, "metadata_version", buf, sizeof(buf)) > 0) + frozen = buf[9] == '-'; + else + frozen = 1; /* can't read metadata_version assume the worst */ + + /* If sync_action is not 'idle' then don't try recovery now */ + if (!frozen && + sysfs_get_str(&a->info, NULL, "sync_action", + buf, sizeof(buf)) > 0 && strncmp(buf, "idle", 4) != 0) + frozen = 1; + + if (mdstat->level) { + int level = map_name(pers, mdstat->level); + if (level == 0 || level == LEVEL_LINEAR) { + a->to_remove = 1; + wakeup_monitor(); + return; + } + else if (a->info.array.level != level && level > 0) { + struct active_array *newa = duplicate_aa(a); + if (newa) { + newa->info.array.level = level; + replace_array(container, a, newa); + a = newa; + } + } + } + + /* we are after monitor kick, + * so container field can be cleared - check it again + */ + if (a->container == NULL) + return; + + if (sigterm && a->info.safe_mode_delay != 1 && + a->safe_mode_delay_fd >= 0) { + long int new_delay = 1; + char delay[10]; + ssize_t len; + + len = snprintf(delay, sizeof(delay), "0.%03ld\n", new_delay); + if (write(a->safe_mode_delay_fd, delay, len) == len) + a->info.safe_mode_delay = new_delay; + } + + /* We don't check the array while any update is pending, as it + * might container a change (such as a spare assignment) which + * could affect our decisions. 
+ */ + if (a->check_degraded && !frozen && + update_queue == NULL && update_queue_pending == NULL) { + struct metadata_update *updates = NULL; + struct mdinfo *newdev = NULL; + struct active_array *newa; + struct mdinfo *d; + + a->check_degraded = 0; + + /* The array may not be degraded, this is just a good time + * to check. + */ + newdev = container->ss->activate_spare(a, &updates); + if (!newdev) + return; + + newa = duplicate_aa(a); + if (!newa) + goto out; + /* prevent the kernel from activating the disk(s) before we + * finish adding them + */ + dprintf("freezing %s\n", a->info.sys_name); + sysfs_set_str(&a->info, NULL, "sync_action", "frozen"); + + /* Add device to array and set offset/size/slot. + * and open files for each newdev */ + for (d = newdev; d ; d = d->next) { + struct mdinfo *newd; + + newd = xmalloc(sizeof(*newd)); + if (sysfs_add_disk(&newa->info, d, 0) < 0) { + free(newd); + continue; + } + disk_init_and_add(newd, d, newa); + } + queue_metadata_update(updates); + updates = NULL; + while (update_queue_pending || update_queue) { + check_update_queue(container); + usleep(15*1000); + } + replace_array(container, a, newa); + if (sysfs_set_str(&a->info, NULL, + "sync_action", "recover") == 0) + newa->prev_action = recover; + dprintf("recovery started on %s\n", a->info.sys_name); + out: + while (newdev) { + d = newdev->next; + free(newdev); + newdev = d; + } + free_updates(&updates); + } + + if (a->check_reshape) { + /* mdadm might have added some devices to the array. + * We want to disk_init_and_add any such device to a + * duplicate_aa and replace a with that. + * mdstat doesn't have enough info so we sysfs_read + * and look for new stuff. 
+ */ + struct mdinfo *info, *d, *d2, *newd; + unsigned long long array_size; + struct active_array *newa = NULL; + a->check_reshape = 0; + info = sysfs_read(-1, mdstat->devnm, + GET_DEVS|GET_OFFSET|GET_SIZE|GET_STATE); + if (!info) + goto out2; + for (d = info->devs; d; d = d->next) { + if (d->disk.raid_disk < 0) + continue; + for (d2 = a->info.devs; d2; d2 = d2->next) + if (d2->disk.raid_disk == + d->disk.raid_disk) + break; + if (d2) + /* already have this one */ + continue; + if (!newa) { + newa = duplicate_aa(a); + if (!newa) + break; + } + newd = xmalloc(sizeof(*newd)); + disk_init_and_add(newd, d, newa); + } + if (sysfs_get_ll(info, NULL, "array_size", &array_size) == 0 && + a->info.custom_array_size > array_size*2) { + sysfs_set_num(info, NULL, "array_size", + a->info.custom_array_size/2); + } + out2: + sysfs_free(info); + if (newa) + replace_array(container, a, newa); + } +} + +static int aa_ready(struct active_array *aa) +{ + struct mdinfo *d; + int level = aa->info.array.level; + + for (d = aa->info.devs; d; d = d->next) + if (d->state_fd < 0) + return 0; + + if (aa->info.state_fd < 0) + return 0; + + if (level > 0 && (aa->action_fd < 0 || aa->resync_start_fd < 0)) + return 0; + + if (!aa->container) + return 0; + + return 1; +} + +static void manage_new(struct mdstat_ent *mdstat, + struct supertype *container, + struct active_array *victim) +{ + /* A new array has appeared in this container. + * Hopefully it is already recorded in the metadata. + * Check, then create the new array to report it to + * the monitor. 
+ */ + + struct active_array *new = NULL; + struct mdinfo *mdi = NULL, *di; + int i, inst; + int failed = 0; + char buf[40]; + + /* check if array is ready to be monitored */ + if (!mdstat->active || !mdstat->level) + return; + if (strncmp(mdstat->level, "raid0", strlen("raid0")) == 0 || + strncmp(mdstat->level, "linear", strlen("linear")) == 0) + return; + + mdi = sysfs_read(-1, mdstat->devnm, + GET_LEVEL|GET_CHUNK|GET_DISKS|GET_COMPONENT| + GET_SAFEMODE|GET_DEVS|GET_OFFSET|GET_SIZE|GET_STATE| + GET_LAYOUT|GET_DEVS_ALL); + + if (!mdi) + return; + new = xcalloc(1, sizeof(*new)); + + strcpy(new->info.sys_name, mdstat->devnm); + + new->prev_state = new->curr_state = new->next_state = inactive; + new->prev_action= new->curr_action= new->next_action= idle; + + new->container = container; + + if (parse_num(&inst, to_subarray(mdstat, container->devnm)) != 0) + goto error; + + new->info.array = mdi->array; + new->info.component_size = mdi->component_size; + + for (i = 0; i < new->info.array.raid_disks; i++) { + struct mdinfo *newd = xmalloc(sizeof(*newd)); + + for (di = mdi->devs; di; di = di->next) + if (i == di->disk.raid_disk) + break; + + if (disk_init_and_add(newd, di, new) != 0) { + if (newd) + free(newd); + + failed++; + if (failed > new->info.array.failed_disks) { + /* we cannot properly monitor without all working disks */ + new->container = NULL; + break; + } + } + } + + new->action_fd = sysfs_open2(new->info.sys_name, NULL, "sync_action"); + new->info.state_fd = sysfs_open2(new->info.sys_name, NULL, "array_state"); + new->resync_start_fd = sysfs_open2(new->info.sys_name, NULL, "resync_start"); + new->metadata_fd = sysfs_open2(new->info.sys_name, NULL, "metadata_version"); + new->sync_completed_fd = sysfs_open2(new->info.sys_name, NULL, "sync_completed"); + new->safe_mode_delay_fd = sysfs_open2(new->info.sys_name, NULL, + "safe_mode_delay"); + + dprintf("inst: %d action: %d state: %d\n", inst, + new->action_fd, new->info.state_fd); + + if (mdi->safe_mode_delay 
>= 50) + /* Normal start, mdadm set this. */ + new->info.safe_mode_delay = mdi->safe_mode_delay; + else + /* Restart, just pick a number */ + new->info.safe_mode_delay = 5000; + sysfs_set_safemode(&new->info, new->info.safe_mode_delay); + + /* reshape_position is set by mdadm in sysfs + * read this information for new arrays only (empty victim) + */ + if ((victim == NULL) && + (sysfs_get_str(mdi, NULL, "sync_action", buf, 40) > 0) && + (strncmp(buf, "reshape", 7) == 0)) { + if (sysfs_get_ll(mdi, NULL, "reshape_position", + &new->last_checkpoint) != 0) + new->last_checkpoint = 0; + else { + int data_disks = mdi->array.raid_disks; + if (mdi->array.level == 4 || mdi->array.level == 5) + data_disks--; + if (mdi->array.level == 6) + data_disks -= 2; + + new->last_checkpoint /= data_disks; + } + dprintf("mdmon: New monitored array is under reshape.\n" + " Last checkpoint is: %llu\n", + new->last_checkpoint); + } + + sysfs_free(mdi); + mdi = NULL; + + /* if everything checks out tell the metadata handler we want to + * manage this instance + */ + if (!aa_ready(new) || container->ss->open_new(container, new, inst) < 0) { + goto error; + } else { + replace_array(container, victim, new); + if (failed) { + new->check_degraded = 1; + manage_member(mdstat, new); + } + } + return; + +error: + pr_err("failed to monitor %s\n", mdstat->metadata_version); + if (new) { + new->container = NULL; + free_aa(new); + } + if (mdi) + sysfs_free(mdi); +} + +void manage(struct mdstat_ent *mdstat, struct supertype *container) +{ + /* We have just read mdstat and need to compare it with + * the known active arrays. + * Arrays with the wrong metadata are ignored. 
+ */ + + for ( ; mdstat ; mdstat = mdstat->next) { + struct active_array *a; + if (strcmp(mdstat->devnm, container->devnm) == 0) { + manage_container(mdstat, container); + continue; + } + if (!is_container_member(mdstat, container->devnm)) + /* Not for this array */ + continue; + /* Looks like a member of this container */ + for (a = container->arrays; a; a = a->next) { + if (strcmp(mdstat->devnm, a->info.sys_name) == 0) { + if (a->container && a->to_remove == 0) + manage_member(mdstat, a); + break; + } + } + if ((a == NULL || !a->container) && !sigterm) + manage_new(mdstat, container, a); + } +} + +static void handle_message(struct supertype *container, struct metadata_update *msg) +{ + /* queue this metadata update through to the monitor */ + + struct metadata_update *mu; + + if (msg->len <= 0) + while (update_queue_pending || update_queue) { + check_update_queue(container); + usleep(15*1000); + } + + if (msg->len == 0) { /* ping_monitor */ + int cnt; + + cnt = monitor_loop_cnt; + if (cnt & 1) + cnt += 2; /* wait until next pselect */ + else + cnt += 3; /* wait for 2 pselects */ + wakeup_monitor(); + + while (monitor_loop_cnt - cnt < 0) + usleep(10 * 1000); + } else if (msg->len == -1) { /* ping_manager */ + struct mdstat_ent *mdstat = mdstat_read(1, 0); + + manage(mdstat, container); + free_mdstat(mdstat); + } else if (!sigterm) { + mu = xmalloc(sizeof(*mu)); + mu->len = msg->len; + mu->buf = msg->buf; + msg->buf = NULL; + mu->space = NULL; + mu->space_list = NULL; + mu->next = NULL; + if (container->ss->prepare_update) + if (!container->ss->prepare_update(container, mu)) + free_updates(&mu); + queue_metadata_update(mu); + } +} + +void read_sock(struct supertype *container) +{ + int fd; + struct metadata_update msg; + int terminate = 0; + long fl; + int tmo = 3; /* 3 second timeout before hanging up the socket */ + + fd = accept(container->sock, NULL, NULL); + if (fd < 0) + return; + + fl = fcntl(fd, F_GETFL, 0); + fl |= O_NONBLOCK; + fcntl(fd, F_SETFL, fl); + + 
do { + msg.buf = NULL; + + /* read and validate the message */ + if (receive_message(fd, &msg, tmo) == 0) { + handle_message(container, &msg); + if (msg.len == 0) { + /* ping reply with version */ + msg.buf = Version; + msg.len = strlen(Version) + 1; + if (send_message(fd, &msg, tmo) < 0) + terminate = 1; + } else if (ack(fd, tmo) < 0) + terminate = 1; + } else + terminate = 1; + + } while (!terminate); + + close(fd); +} + +int exit_now = 0; +int manager_ready = 0; +void do_manager(struct supertype *container) +{ + struct mdstat_ent *mdstat; + sigset_t set; + + sigprocmask(SIG_UNBLOCK, NULL, &set); + sigdelset(&set, SIGUSR1); + sigdelset(&set, SIGTERM); + + do { + + if (exit_now) + exit(0); + + /* Can only 'manage' things if 'monitor' is not making + * structural changes to metadata, so need to check + * update_queue + */ + if (update_queue == NULL) { + mdstat = mdstat_read(1, 0); + + manage(mdstat, container); + + read_sock(container); + + free_mdstat(mdstat); + } + remove_old(); + + check_update_queue(container); + + manager_ready = 1; + + if (sigterm) + wakeup_monitor(); + + if (update_queue == NULL) + mdstat_wait_fd(container->sock, &set); + else + /* If an update is happening, just wait for signal */ + pselect(0, NULL, NULL, NULL, NULL, &set); + } while(1); +} diff --git a/mapfile.c b/mapfile.c new file mode 100644 index 0000000..6b2207d --- /dev/null +++ b/mapfile.c @@ -0,0 +1,511 @@ +/* + * mapfile - keep track of uuid <-> array mapping. Part of: + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2006-2010 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + * Paper: Neil Brown + * Novell Inc + * GPO Box Q1283 + * QVB Post Office, NSW 1230 + * Australia + */ + +/* The mapfile is used to track arrays being created in --incremental + * mode. It particularly allows lookup from UUID to array device, but + * also allows the array device name to be easily found. + * + * The map file is line based with space separated fields. The fields are: + * Device id - mdX or mdpX where X is a number. + * metadata - 0.90 1.0 1.1 1.2 ddf ... + * UUID - uuid of the array + * path - path where device created: /dev/md/home + * + * The best place for the mapfile is /run/mdadm/map. Distros and users + * which have not switched to /run yet can choose a different location + * at compile time via MAP_DIR and MAP_FILE. + */ +#include "mdadm.h" +#include <sys/file.h> +#include <ctype.h> + +#define MAP_READ 0 +#define MAP_NEW 1 +#define MAP_LOCK 2 +#define MAP_DIRNAME 3 + +char *mapname[4] = { + MAP_DIR "/" MAP_FILE, + MAP_DIR "/" MAP_FILE ".new", + MAP_DIR "/" MAP_FILE ".lock", + MAP_DIR +}; + +int mapmode[3] = { O_RDONLY, O_RDWR|O_CREAT, O_RDWR|O_CREAT|O_TRUNC }; +char *mapsmode[3] = { "r", "w", "w"}; + +FILE *open_map(int modenum) +{ + int fd; + if ((mapmode[modenum] & O_CREAT)) + /* Attempt to create directory, don't worry about + * failure. 
+ */ + (void)mkdir(mapname[MAP_DIRNAME], 0755); + fd = open(mapname[modenum], mapmode[modenum], 0600); + if (fd >= 0) + return fdopen(fd, mapsmode[modenum]); + return NULL; +} + +int map_write(struct map_ent *mel) +{ + FILE *f; + int err; + + f = open_map(MAP_NEW); + + if (!f) + return 0; + for (; mel; mel = mel->next) { + if (mel->bad) + continue; + fprintf(f, "%s ", mel->devnm); + fprintf(f, "%s ", mel->metadata); + fprintf(f, "%08x:%08x:%08x:%08x ", mel->uuid[0], + mel->uuid[1], mel->uuid[2], mel->uuid[3]); + fprintf(f, "%s\n", mel->path?:""); + } + fflush(f); + err = ferror(f); + fclose(f); + if (err) { + unlink(mapname[1]); + return 0; + } + return rename(mapname[1], + mapname[0]) == 0; +} + +static FILE *lf = NULL; +int map_lock(struct map_ent **melp) +{ + while (lf == NULL) { + struct stat buf; + lf = open_map(MAP_LOCK); + if (lf == NULL) + return -1; + if (flock(fileno(lf), LOCK_EX) != 0) { + fclose(lf); + lf = NULL; + return -1; + } + if (fstat(fileno(lf), &buf) != 0 || + buf.st_nlink == 0) { + /* The owner of the lock unlinked it, + * so we have a lock on a stale file, + * try again + */ + fclose(lf); + lf = NULL; + } + } + if (*melp) + map_free(*melp); + map_read(melp); + return 0; +} + +void map_unlock(struct map_ent **melp) +{ + if (lf) { + /* must unlink before closing the file, + * as only the owner of the lock may + * unlink the file + */ + unlink(mapname[2]); + fclose(lf); + } + if (*melp) + map_free(*melp); + lf = NULL; +} + +void map_fork(void) +{ + /* We are forking, so must close the lock file. + * Don't risk flushing anything though. + */ + if (lf) { + close(fileno(lf)); + fclose(lf); + lf = NULL; + } +} + +void map_add(struct map_ent **melp, + char * devnm, char *metadata, int uuid[4], char *path) +{ + struct map_ent *me = xmalloc(sizeof(*me)); + + strcpy(me->devnm, devnm); + strcpy(me->metadata, metadata); + memcpy(me->uuid, uuid, 16); + me->path = path ? 
xstrdup(path) : NULL; + me->next = *melp; + me->bad = 0; + *melp = me; +} + +void map_read(struct map_ent **melp) +{ + FILE *f; + char buf[8192]; + char path[201]; + int uuid[4]; + char devnm[32]; + char metadata[30]; + + *melp = NULL; + + f = open_map(MAP_READ); + if (!f) { + RebuildMap(); + f = open_map(MAP_READ); + } + if (!f) + return; + + while (fgets(buf, sizeof(buf), f)) { + path[0] = 0; + if (sscanf(buf, " %s %s %x:%x:%x:%x %200s", + devnm, metadata, uuid, uuid+1, + uuid+2, uuid+3, path) >= 7) { + map_add(melp, devnm, metadata, uuid, path); + } + } + fclose(f); +} + +void map_free(struct map_ent *map) +{ + while (map) { + struct map_ent *mp = map; + map = mp->next; + free(mp->path); + free(mp); + } +} + +int map_update(struct map_ent **mpp, char *devnm, char *metadata, + int uuid[4], char *path) +{ + struct map_ent *map, *mp; + int rv; + + if (mpp && *mpp) + map = *mpp; + else + map_read(&map); + + for (mp = map ; mp ; mp=mp->next) + if (strcmp(mp->devnm, devnm) == 0) { + strcpy(mp->metadata, metadata); + memcpy(mp->uuid, uuid, 16); + free(mp->path); + mp->path = path ? 
xstrdup(path) : NULL; + mp->bad = 0; + break; + } + if (!mp) + map_add(&map, devnm, metadata, uuid, path); + if (mpp) + *mpp = NULL; + rv = map_write(map); + map_free(map); + return rv; +} + +void map_delete(struct map_ent **mapp, char *devnm) +{ + struct map_ent *mp; + + if (*mapp == NULL) + map_read(mapp); + + for (mp = *mapp; mp; mp = *mapp) { + if (strcmp(mp->devnm, devnm) == 0) { + *mapp = mp->next; + free(mp->path); + free(mp); + } else + mapp = & mp->next; + } +} + +void map_remove(struct map_ent **mapp, char *devnm) +{ + if (devnm[0] == 0) + return; + + map_delete(mapp, devnm); + map_write(*mapp); + map_free(*mapp); + *mapp = NULL; +} + +struct map_ent *map_by_uuid(struct map_ent **map, int uuid[4]) +{ + struct map_ent *mp; + if (!*map) + map_read(map); + + for (mp = *map ; mp ; mp = mp->next) { + if (memcmp(uuid, mp->uuid, 16) != 0) + continue; + if (!mddev_busy(mp->devnm)) { + mp->bad = 1; + continue; + } + return mp; + } + return NULL; +} + +struct map_ent *map_by_devnm(struct map_ent **map, char *devnm) +{ + struct map_ent *mp; + if (!*map) + map_read(map); + + for (mp = *map ; mp ; mp = mp->next) { + if (strcmp(mp->devnm, devnm) != 0) + continue; + if (!mddev_busy(mp->devnm)) { + mp->bad = 1; + continue; + } + return mp; + } + return NULL; +} + +struct map_ent *map_by_name(struct map_ent **map, char *name) +{ + struct map_ent *mp; + if (!*map) + map_read(map); + + for (mp = *map ; mp ; mp = mp->next) { + if (!mp->path) + continue; + if (strncmp(mp->path, "/dev/md/", 8) != 0) + continue; + if (strcmp(mp->path+8, name) != 0) + continue; + if (!mddev_busy(mp->devnm)) { + mp->bad = 1; + continue; + } + return mp; + } + return NULL; +} + +/* sets the proper subarray and container_dev according to the metadata + * version super_by_fd does this automatically, this routine is meant as + * a supplement for guess_super() + */ +static char *get_member_info(struct mdstat_ent *ent) +{ + + if (ent->metadata_version == NULL || + strncmp(ent->metadata_version, 
"external:", 9) != 0) + return NULL; + + if (is_subarray(&ent->metadata_version[9])) { + char *subarray; + + subarray = strrchr(ent->metadata_version, '/'); + return subarray + 1; + } + return NULL; +} + +void RebuildMap(void) +{ + struct mdstat_ent *mdstat = mdstat_read(0, 0); + struct mdstat_ent *md; + struct map_ent *map = NULL; + int require_homehost; + char sys_hostname[256]; + char *homehost = conf_get_homehost(&require_homehost); + + if (homehost == NULL || strcmp(homehost, "<system>")==0) { + if (gethostname(sys_hostname, sizeof(sys_hostname)) == 0) { + sys_hostname[sizeof(sys_hostname)-1] = 0; + homehost = sys_hostname; + } + } + + for (md = mdstat ; md ; md = md->next) { + struct mdinfo *sra = sysfs_read(-1, md->devnm, GET_DEVS); + struct mdinfo *sd; + + if (!sra) + continue; + + for (sd = sra->devs ; sd ; sd = sd->next) { + char namebuf[100]; + char dn[30]; + int dfd; + int ok; + dev_t devid; + struct supertype *st; + char *subarray = NULL; + char *path; + struct mdinfo *info; + + sprintf(dn, "%d:%d", sd->disk.major, sd->disk.minor); + dfd = dev_open(dn, O_RDONLY); + if (dfd < 0) + continue; + st = guess_super(dfd); + if ( st == NULL) + ok = -1; + else { + subarray = get_member_info(md); + ok = st->ss->load_super(st, dfd, NULL); + } + close(dfd); + if (ok != 0) + continue; + if (subarray) + info = st->ss->container_content(st, subarray); + else { + info = xmalloc(sizeof(*info)); + st->ss->getinfo_super(st, info, NULL); + } + if (!info) + continue; + + devid = devnm2devid(md->devnm); + path = map_dev(major(devid), minor(devid), 0); + if (path == NULL || + strncmp(path, "/dev/md/", 8) != 0) { + /* We would really like a name that provides + * an MD_DEVNAME for udev. + * The name needs to be unique both in /dev/md/ + * and in this mapfile. + * It needs to match what -I or -As would come + * up with. + * That means: + * Check if array is in mdadm.conf + * - if so use that. 
+ * determine trustworthy from homehost etc + * find a unique name based on metadata name. + * + */ + struct mddev_ident *match = conf_match(st, info, + NULL, 0, + NULL); + struct stat stb; + if (match && match->devname && match->devname[0] == '/') { + path = match->devname; + if (path[0] != '/') { + strcpy(namebuf, "/dev/md/"); + strcat(namebuf, path); + path = namebuf; + } + } else { + int unum = 0; + char *sep = "_"; + const char *name; + int conflict = 1; + if ((homehost == NULL || + st->ss->match_home(st, homehost) != 1) && + st->ss->match_home(st, "any") != 1 && + (require_homehost || + !conf_name_is_free(info->name))) + /* require a numeric suffix */ + unum = 0; + else + /* allow name to be used as-is if no conflict */ + unum = -1; + name = info->name; + if (!*name) { + name = st->ss->name; + if (!isdigit(name[strlen(name)-1]) && + unum == -1) { + unum = 0; + sep = ""; + } + } + if (strchr(name, ':')) { + /* Probably a uniquifying + * hostname prefix. Allow + * without a suffix, and strip + * hostname if it is us. 
+ */ + if (homehost && unum == -1 && + strncmp(name, homehost, + strlen(homehost)) == 0 && + name[strlen(homehost)] == ':') + name += strlen(homehost)+1; + unum = -1; + } + + while (conflict) { + if (unum >= 0) + sprintf(namebuf, "/dev/md/%s%s%d", + name, sep, unum); + else + sprintf(namebuf, "/dev/md/%s", + name); + unum++; + if (lstat(namebuf, &stb) != 0 && + (map == NULL || + !map_by_name(&map, namebuf+8))) + conflict = 0; + } + path = namebuf; + } + } + map_add(&map, md->devnm, + info->text_version, + info->uuid, path); + st->ss->free_super(st); + free(info); + break; + } + sysfs_free(sra); + } + /* Only trigger a change if we wrote a new map file */ + if (map_write(map)) + for (md = mdstat ; md ; md = md->next) { + struct mdinfo *sra = sysfs_read(-1, md->devnm, + GET_VERSION); + if (sra) + sysfs_uevent(sra, "change"); + sysfs_free(sra); + } + map_free(map); + free_mdstat(mdstat); +} @@ -0,0 +1,185 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2011 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +#include "mdadm.h" + +/* name/number mappings */ + +mapping_t r5layout[] = { + { "left-asymmetric", ALGORITHM_LEFT_ASYMMETRIC}, + { "right-asymmetric", ALGORITHM_RIGHT_ASYMMETRIC}, + { "left-symmetric", ALGORITHM_LEFT_SYMMETRIC}, + { "right-symmetric", ALGORITHM_RIGHT_SYMMETRIC}, + + { "default", ALGORITHM_LEFT_SYMMETRIC}, + { "la", ALGORITHM_LEFT_ASYMMETRIC}, + { "ra", ALGORITHM_RIGHT_ASYMMETRIC}, + { "ls", ALGORITHM_LEFT_SYMMETRIC}, + { "rs", ALGORITHM_RIGHT_SYMMETRIC}, + + { "parity-first", ALGORITHM_PARITY_0}, + { "parity-last", ALGORITHM_PARITY_N}, + { "ddf-zero-restart", ALGORITHM_RIGHT_ASYMMETRIC}, + { "ddf-N-restart", ALGORITHM_LEFT_ASYMMETRIC}, + { "ddf-N-continue", ALGORITHM_LEFT_SYMMETRIC}, + + { NULL, UnSet } +}; +mapping_t r6layout[] = { + { "left-asymmetric", ALGORITHM_LEFT_ASYMMETRIC}, + { "right-asymmetric", ALGORITHM_RIGHT_ASYMMETRIC}, + { "left-symmetric", ALGORITHM_LEFT_SYMMETRIC}, + { "right-symmetric", ALGORITHM_RIGHT_SYMMETRIC}, + + { "default", ALGORITHM_LEFT_SYMMETRIC}, + { "la", ALGORITHM_LEFT_ASYMMETRIC}, + { "ra", ALGORITHM_RIGHT_ASYMMETRIC}, + { "ls", ALGORITHM_LEFT_SYMMETRIC}, + { "rs", ALGORITHM_RIGHT_SYMMETRIC}, + + { "parity-first", ALGORITHM_PARITY_0}, + { "parity-last", ALGORITHM_PARITY_N}, + { "ddf-zero-restart", ALGORITHM_ROTATING_ZERO_RESTART}, + { "ddf-N-restart", ALGORITHM_ROTATING_N_RESTART}, + { "ddf-N-continue", ALGORITHM_ROTATING_N_CONTINUE}, + + { "left-asymmetric-6", ALGORITHM_LEFT_ASYMMETRIC_6}, + { "right-asymmetric-6", ALGORITHM_RIGHT_ASYMMETRIC_6}, + { "left-symmetric-6", ALGORITHM_LEFT_SYMMETRIC_6}, + { "right-symmetric-6", ALGORITHM_RIGHT_SYMMETRIC_6}, + { "parity-first-6", ALGORITHM_PARITY_0_6}, + + { NULL, UnSet } +}; + +/* raid0 layout is 
only needed because of a bug in 3.14 which changed + * the effective layout of raid0 arrays with varying device sizes. + */ +mapping_t r0layout[] = { + { "original", RAID0_ORIG_LAYOUT}, + { "alternate", RAID0_ALT_MULTIZONE_LAYOUT}, + { "1", 1}, /* aka ORIG */ + { "2", 2}, /* aka ALT */ + { "dangerous", 0}, + { NULL, UnSet}, +}; + +mapping_t pers[] = { + { "linear", LEVEL_LINEAR}, + { "raid0", 0}, + { "0", 0}, + { "stripe", 0}, + { "raid1", 1}, + { "1", 1}, + { "mirror", 1}, + { "raid4", 4}, + { "4", 4}, + { "raid5", 5}, + { "5", 5}, + { "multipath", LEVEL_MULTIPATH}, + { "mp", LEVEL_MULTIPATH}, + { "raid6", 6}, + { "6", 6}, + { "raid10", 10}, + { "10", 10}, + { "faulty", LEVEL_FAULTY}, + { "container", LEVEL_CONTAINER}, + { NULL, UnSet } +}; + +mapping_t modes[] = { + { "assemble", ASSEMBLE}, + { "build", BUILD}, + { "create", CREATE}, + { "manage", MANAGE}, + { "misc", MISC}, + { "monitor", MONITOR}, + { "grow", GROW}, + { "incremental", INCREMENTAL}, + { "auto-detect", AUTODETECT}, + { NULL, UnSet } +}; + +mapping_t faultylayout[] = { + { "write-transient", WriteTransient }, + { "wt", WriteTransient }, + { "read-transient", ReadTransient }, + { "rt", ReadTransient }, + { "write-persistent", WritePersistent }, + { "wp", WritePersistent }, + { "read-persistent", ReadPersistent }, + { "rp", ReadPersistent }, + { "write-all", WriteAll }, + { "wa", WriteAll }, + { "read-fixable", ReadFixable }, + { "rf", ReadFixable }, + + { "clear", ClearErrors}, + { "flush", ClearFaults}, + { "none", ClearErrors}, + { "default", ClearErrors}, + { NULL, UnSet } +}; + +mapping_t consistency_policies[] = { + { "unknown", CONSISTENCY_POLICY_UNKNOWN}, + { "none", CONSISTENCY_POLICY_NONE}, + { "resync", CONSISTENCY_POLICY_RESYNC}, + { "bitmap", CONSISTENCY_POLICY_BITMAP}, + { "journal", CONSISTENCY_POLICY_JOURNAL}, + { "ppl", CONSISTENCY_POLICY_PPL}, + { NULL, CONSISTENCY_POLICY_UNKNOWN } +}; + +mapping_t sysfs_array_states[] = { + { "active-idle", ARRAY_ACTIVE_IDLE }, + { "active", 
ARRAY_ACTIVE }, + { "clear", ARRAY_CLEAR }, + { "inactive", ARRAY_INACTIVE }, + { "suspended", ARRAY_SUSPENDED }, + { "readonly", ARRAY_READONLY }, + { "read-auto", ARRAY_READ_AUTO }, + { "clean", ARRAY_CLEAN }, + { "write-pending", ARRAY_WRITE_PENDING }, + { "broken", ARRAY_BROKEN }, + { NULL, ARRAY_UNKNOWN_STATE } +}; + +char *map_num(mapping_t *map, int num) +{ + while (map->name) { + if (map->num == num) + return map->name; + map++; + } + return NULL; +} + +int map_name(mapping_t *map, char *name) +{ + while (map->name && strcmp(map->name, name) != 0) + map++; + + return map->num; +} @@ -0,0 +1,1317 @@ +.\" Copyright Neil Brown and others. +.\" This program is free software; you can redistribute it and/or modify +.\" it under the terms of the GNU General Public License as published by +.\" the Free Software Foundation; either version 2 of the License, or +.\" (at your option) any later version. +.\" See file COPYING in distribution for details. +.if n .pl 1000v +.TH MD 4 +.SH NAME +md \- Multiple Device driver aka Linux Software RAID +.SH SYNOPSIS +.BI /dev/md n +.br +.BI /dev/md/ n +.br +.BR /dev/md/ name +.SH DESCRIPTION +The +.B md +driver provides virtual devices that are created from one or more +independent underlying devices. This array of devices often contains +redundancy and the devices are often disk drives, hence the acronym RAID +which stands for a Redundant Array of Independent Disks. +.PP +.B md +supports RAID levels +1 (mirroring), +4 (striped array with parity device), +5 (striped array with distributed parity information), +6 (striped array with distributed dual redundancy information), and +10 (striped and mirrored). +If some number of underlying devices fails while using one of these +levels, the array will continue to function; this number is one for +RAID levels 4 and 5, two for RAID level 6, and all but one (N-1) for +RAID level 1, and dependent on configuration for level 10. 
+.PP +.B md +also supports a number of pseudo RAID (non-redundant) configurations +including RAID0 (striped array), LINEAR (catenated array), +MULTIPATH (a set of different interfaces to the same device), +and FAULTY (a layer over a single device into which errors can be injected). + +.SS MD METADATA +Each device in an array may have some +.I metadata +stored in the device. This metadata is sometimes called a +.BR superblock . +The metadata records information about the structure and state of the array. +This allows the array to be reliably re-assembled after a shutdown. + +From Linux kernel version 2.6.10, +.B md +provides support for two different formats of metadata, and +other formats can be added. Prior to this release, only one format was +supported. + +The common format \(em known as version 0.90 \(em has +a superblock that is 4K long and is written into a 64K aligned block that +starts at least 64K and less than 128K from the end of the device +(i.e. to get the address of the superblock round the size of the +device down to a multiple of 64K and then subtract 64K). +The available size of each device is the amount of space before the +super block, so between 64K and 128K is lost when a device is +incorporated into an MD array. +This superblock stores multi-byte fields in a processor-dependent +manner, so arrays cannot easily be moved between computers with +different processors. + +The new format \(em known as version 1 \(em has a superblock that is +normally 1K long, but can be longer. It is normally stored between 8K +and 12K from the end of the device, on a 4K boundary, though +variations can be stored at the start of the device (version 1.1) or 4K from +the start of the device (version 1.2). +This metadata format stores multibyte data in a +processor-independent format and supports up to hundreds of +component devices (version 0.90 only supports 28).
+ +The metadata contains, among other things: +.TP +LEVEL +The manner in which the devices are arranged into the array +(LINEAR, RAID0, RAID1, RAID4, RAID5, RAID6, RAID10, MULTIPATH). +.TP +UUID +A 128-bit Universally Unique Identifier that identifies the array that +contains this device. + +.PP +When a version 0.90 array is being reshaped (e.g. adding extra devices +to a RAID5), the version number is temporarily set to 0.91. This +ensures that if the reshape process is stopped in the middle (e.g. by +a system crash) and the machine boots into an older kernel that does +not support reshaping, then the array will not be assembled (which +would cause data corruption) but will be left untouched until a kernel +that can complete the reshape process is used. + +.SS ARRAYS WITHOUT METADATA +While it is usually best to create arrays with superblocks so that +they can be assembled reliably, there are some circumstances when an +array without superblocks is preferred. These include: +.TP +LEGACY ARRAYS +Early versions of the +.B md +driver only supported LINEAR and RAID0 configurations and did not use +a superblock (which is less critical with these configurations). +While such arrays should be rebuilt with superblocks if possible, +.B md +continues to support them. +.TP +FAULTY +Being a largely transparent layer over a different device, the FAULTY +personality doesn't gain anything from having a superblock. +.TP +MULTIPATH +It is often possible to detect devices which are different paths to +the same storage directly rather than having a distinctive superblock +written to the device and searched for on all paths. In this case, +a MULTIPATH array with no superblock makes sense. +.TP +RAID1 +In some configurations it might be desired to create a RAID1 +configuration that does not use a superblock, and to maintain the state of +the array elsewhere. While not encouraged for general use, it does +have special-purpose uses and is supported.
+ +.SS ARRAYS WITH EXTERNAL METADATA + +From release 2.6.28, the +.I md +driver supports arrays with externally managed metadata. That is, +the metadata is not managed by the kernel but rather by a user-space +program which is external to the kernel. This allows support for a +variety of metadata formats without cluttering the kernel with lots of +details. +.PP +.I md +is able to communicate with the user-space program through various +sysfs attributes so that it can make appropriate changes to the +metadata \- for example to mark a device as faulty. When necessary, +.I md +will wait for the program to acknowledge the event by writing to a +sysfs attribute. +The manual page for +.IR mdmon (8) +contains more detail about this interaction. + +.SS CONTAINERS +Many metadata formats use a single block of metadata to describe a +number of different arrays which all use the same set of devices. +In this case it is helpful for the kernel to know about the full set +of devices as a whole. This set is known to md as a +.IR container . +A container is an +.I md +array with externally managed metadata and with device offset and size +so that it just covers the metadata part of the devices. The +remainder of each device is available to be incorporated into various +arrays. + +.SS LINEAR + +A LINEAR array simply catenates the available space on each +drive to form one large virtual drive. + +One advantage of this arrangement over the more common RAID0 +arrangement is that the array may be reconfigured at a later time with +an extra drive, so the array is made bigger without disturbing the +data that is on the array. This can even be done on a live +array. + +If a chunksize is given with a LINEAR array, the usable space on each +device is rounded down to a multiple of this chunksize. + +.SS RAID0 + +A RAID0 array (which has zero redundancy) is also known as a +striped array. 
+A RAID0 array is configured at creation with a +.B "Chunk Size" +which must be a power of two (prior to Linux 2.6.31), and at least 4 +kibibytes. + +The RAID0 driver assigns the first chunk of the array to the first +device, the second chunk to the second device, and so on until all +drives have been assigned one chunk. This collection of chunks forms a +.BR stripe . +Further chunks are gathered into stripes in the same way, and are +assigned to the remaining space in the drives. + +If devices in the array are not all the same size, then once the +smallest device has been exhausted, the RAID0 driver starts +collecting chunks into smaller stripes that only span the drives which +still have remaining space. + +A bug was introduced in Linux 3.14 which changed the layout of blocks in +a RAID0 beyond the region that is striped over all devices. This bug +does not affect an array with all devices the same size, but can affect +other RAID0 arrays. + +Linux 5.4 (and some stable kernels to which the change was backported) +will not normally assemble such an array as it cannot know which layout +to use. There is a module parameter "raid0.default_layout" which can be +set to "1" to force the kernel to use the pre-3.14 layout or to "2" to +force it to use the 3.14-and-later layout. When creating a new RAID0 +array, +.I mdadm +will record the chosen layout in the metadata in a way that allows newer +kernels to assemble the array without needing a module parameter. + +To assemble an old array on a new kernel without using the module parameter, +use either the +.B "--update=layout-original" +option or the +.B "--update=layout-alternate" +option. + +Once you have updated the layout you will not be able to mount the array +on an older kernel. If you need to revert to an older kernel, the +layout information can be erased with the +.B "--update=layout-unspecified" +option.
If you use this option to +.B --assemble +while running a newer kernel, the array will NOT assemble, but the +metadata will be updated so that it can be assembled on an older kernel. + +Note that setting the layout to "unspecified" removes protections against +this bug, and you must be sure that the kernel you use matches the +layout of the array. + +.SS RAID1 + +A RAID1 array is also known as a mirrored set (though mirrors tend to +provide reflected images, which RAID1 does not) or a plex. + +Once initialised, each device in a RAID1 array contains exactly the +same data. Changes are written to all devices in parallel. Data is +read from any one device. The driver attempts to distribute read +requests across all devices to maximise performance. + +All devices in a RAID1 array should be the same size. If they are +not, then only the amount of space available on the smallest device is +used (any extra space on other devices is wasted). + +Note that the read balancing done by the driver does not make the RAID1 +performance profile be the same as for RAID0; a single stream of +sequential input will not be accelerated (e.g. a single dd), but +multiple sequential streams or a random workload will use more than one +spindle. In theory, having an N-disk RAID1 will allow N sequential +threads to read from all disks. + +Individual devices in a RAID1 can be marked as "write-mostly". +These drives are excluded from the normal read balancing and will only +be read from when there is no other option. This can be useful for +devices connected over a slow link. + +.SS RAID4 + +A RAID4 array is like a RAID0 array with an extra device for storing +parity. This device is the last of the active devices in the +array. Unlike RAID0, RAID4 also requires that all stripes span all +drives, so extra space on devices that are larger than the smallest is +wasted. + +When any block in a RAID4 array is modified, the parity block for that +stripe (i.e.
the block in the parity device at the same device offset +as the stripe) is also modified so that the parity block always +contains the "parity" for the whole stripe. I.e. its content is +equivalent to the result of performing an exclusive-or operation +between all the data blocks in the stripe. + +This allows the array to continue to function if one device fails. +The data that was on that device can be calculated as needed from the +parity block and the other data blocks. + +.SS RAID5 + +RAID5 is very similar to RAID4. The difference is that the parity +blocks for each stripe, instead of being on a single device, are +distributed across all devices. This allows more parallelism when +writing, as two different block updates will quite possibly affect +parity blocks on different devices so there is less contention. + +This also allows more parallelism when reading, as read requests are +distributed over all the devices in the array instead of all but one. + +.SS RAID6 + +RAID6 is similar to RAID5, but can handle the loss of any \fItwo\fP +devices without data loss. Accordingly, it requires N+2 drives to +store N drives worth of data. + +The performance for RAID6 is slightly lower but comparable to RAID5 in +normal mode and single disk failure mode. It is very slow in dual +disk failure mode, however. + +.SS RAID10 + +RAID10 provides a combination of RAID1 and RAID0, and is sometimes known +as RAID1+0. Every datablock is duplicated some number of times, and +the resulting collection of datablocks are distributed over multiple +drives. + +When configuring a RAID10 array, it is necessary to specify the number +of replicas of each data block that are required (this will usually +be\ 2) and whether their layout should be "near", "far" or "offset" +(with "offset" being available since Linux\ 2.6.18). + +.B About the RAID10 Layout Examples: +.br +The examples below visualise the chunk distribution on the underlying +devices for the respective layout. 
+ +For simplicity it is assumed that the size of the chunks equals the +size of the blocks of the underlying devices as well as those of the +RAID10 device exported by the kernel (for example \fB/dev/md/\fPname). +.br +Therefore the chunks\ /\ chunk numbers map directly to the blocks\ /\ +block addresses of the exported RAID10 device. + +Decimal numbers (0,\ 1, 2,\ ...) are the chunks of the RAID10 and due +to the above assumption also the blocks and block addresses of the +exported RAID10 device. +.br +Repeated numbers mean copies of a chunk\ /\ block (obviously on +different underlying devices). +.br +Hexadecimal numbers (0x00,\ 0x01, 0x02,\ ...) are the block addresses +of the underlying devices. + +.TP +\fB "near" Layout\fP +When "near" replicas are chosen, the multiple copies of a given chunk are laid +out consecutively ("as close to each other as possible") across the stripes of +the array. + +With an even number of devices, they will likely (unless some misalignment is +present) lay at the very same offset on the different devices. +.br +This is as the "classic" RAID1+0; that is two groups of mirrored devices (in the +example below the groups Device\ #1\ /\ #2 and Device\ #3\ /\ #4 are each a +RAID1) both in turn forming a striped RAID0. + +.ne 10 +.B Example with 2\ copies per chunk and an even number\ (4) of devices: +.TS +tab(;); + C - - - - + C | C | C | C | C | +| - | - | - | - | - | +| C | C | C | C | C | +| C | C | C | C | C | +| C | C | C | C | C | +| C | C | C | C | C | +| C | C | C | C | C | +| C | C | C | C | C | +| - | - | - | - | - | + C C S C S + C C S C S + C C S S S + C C S S S. +; +;Device #1;Device #2;Device #3;Device #4 +0x00;0;0;1;1 +0x01;2;2;3;3 +\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\. +:;:;:;:;: +\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\. 
+0x80;254;254;255;255 +;\\---------v---------/;\\---------v---------/ +;RAID1;RAID1 +;\\---------------------v---------------------/ +;RAID0 +.TE + +.ne 10 +.B Example with 2\ copies per chunk and an odd number\ (5) of devices: +.TS +tab(;); + C - - - - - + C | C | C | C | C | C | +| - | - | - | - | - | - | +| C | C | C | C | C | C | +| C | C | C | C | C | C | +| C | C | C | C | C | C | +| C | C | C | C | C | C | +| C | C | C | C | C | C | +| C | C | C | C | C | C | +| - | - | - | - | - | - | +C. +; +;Dev #1;Dev #2;Dev #3;Dev #4;Dev #5 +0x00;0;0;1;1;2 +0x01;2;3;3;4;4 +\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\. +:;:;:;:;:;: +\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\. +0x80;317;318;318;319;319 +; +.TE + +.TP +\fB "far" Layout\fP +When "far" replicas are chosen, the multiple copies of a given chunk +are laid out quite distant ("as far as reasonably possible") from each +other. + +First a complete sequence of all data blocks (that is all the data one +sees on the exported RAID10 block device) is striped over the +devices. Then another (though "shifted") complete sequence of all data +blocks; and so on (in the case of more than 2\ copies per chunk). + +The "shift" needed to prevent placing copies of the same chunks on the +same devices is actually a cyclic permutation with offset\ 1 of each +of the stripes within a complete sequence of chunks. 
+.br +The offset\ 1 is relative to the previous complete sequence of chunks, +so in case of more than 2\ copies per chunk one gets the following +offsets: +.br +1.\ complete sequence of chunks: offset\ =\ \ 0 +.br +2.\ complete sequence of chunks: offset\ =\ \ 1 +.br +3.\ complete sequence of chunks: offset\ =\ \ 2 +.br + : +.br +n.\ complete sequence of chunks: offset\ =\ n-1 + +.ne 10 +.B Example with 2\ copies per chunk and an even number\ (4) of devices: +.TS +tab(;); + C - - - - + C | C | C | C | C | +| - | - | - | - | - | +| C | C | C | C | C | L +| C | C | C | C | C | L +| C | C | C | C | C | L +| C | C | C | C | C | L +| C | C | C | C | C | L +| C | C | C | C | C | L +| C | C | C | C | C | L +| C | C | C | C | C | L +| C | C | C | C | C | L +| C | C | C | C | C | L +| C | C | C | C | C | L +| C | C | C | C | C | L +| - | - | - | - | - | +C. +; +;Device #1;Device #2;Device #3;Device #4 +; +0x00;0;1;2;3;\\ +0x01;4;5;6;7;> [#] +\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;: +:;:;:;:;:;: +\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;: +0x40;252;253;254;255;/ +0x41;3;0;1;2;\\ +0x42;7;4;5;6;> [#]~ +\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;: +:;:;:;:;:;: +\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;: +0x80;255;252;253;254;/ +; +.TE + +.ne 10 +.B Example with 2\ copies per chunk and an odd number\ (5) of devices: +.TS +tab(;); + C - - - - - + C | C | C | C | C | C | +| - | - | - | - | - | - | +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| - | - | - | - | - | - | +C. 
+; +;Dev #1;Dev #2;Dev #3;Dev #4;Dev #5 +; +0x00;0;1;2;3;4;\\ +0x01;5;6;7;8;9;> [#] +\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;: +:;:;:;:;:;:;: +\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;: +0x40;315;316;317;318;319;/ +0x41;4;0;1;2;3;\\ +0x42;9;5;6;7;8;> [#]~ +\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;: +:;:;:;:;:;:;: +\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;: +0x80;319;315;316;317;318;/ +; +.TE + +With [#]\ being the complete sequence of chunks and [#]~\ the cyclic permutation +with offset\ 1 thereof (in the case of more than 2 copies per chunk there would +be ([#]~)~,\ (([#]~)~)~,\ ...). + +The advantage of this layout is that MD can easily spread sequential reads over +the devices, making them similar to RAID0 in terms of speed. +.br +The cost is more seeking for writes, making them substantially slower. + +.TP +\fB"offset" Layout\fP +When "offset" replicas are chosen, all the copies of a given chunk are +striped consecutively ("offset by the stripe length after each other") +over the devices. + +Explained in detail, <number of devices> consecutive chunks are +striped over the devices, immediately followed by a "shifted" copy of +these chunks (and by further such "shifted" copies in the case of more +than 2\ copies per chunk). +.br +This pattern repeats for all further consecutive chunks of the +exported RAID10 device (in other words: all further data blocks). + +The "shift" needed to prevent placing copies of the same chunks on the +same devices is actually a cyclic permutation with offset\ 1 of each +of the striped copies of <number of devices> consecutive chunks. 
+.br +The offset\ 1 is relative to the previous striped copy of <number of +devices> consecutive chunks, so in case of more than 2\ copies per +chunk one gets the following offsets: +.br +1.\ <number of devices> consecutive chunks: offset\ =\ \ 0 +.br +2.\ <number of devices> consecutive chunks: offset\ =\ \ 1 +.br +3.\ <number of devices> consecutive chunks: offset\ =\ \ 2 +.br + : +.br +n.\ <number of devices> consecutive chunks: offset\ =\ n-1 + +.ne 10 +.B Example with 2\ copies per chunk and an even number\ (4) of devices: +.TS +tab(;); + C - - - - + C | C | C | C | C | +| - | - | - | - | - | +| C | C | C | C | C | L +| C | C | C | C | C | L +| C | C | C | C | C | L +| C | C | C | C | C | L +| C | C | C | C | C | L +| C | C | C | C | C | L +| C | C | C | C | C | L +| C | C | C | C | C | L +| C | C | C | C | C | L +| - | - | - | - | - | +C. +; +;Device #1;Device #2;Device #3;Device #4 +; +0x00;0;1;2;3;) AA +0x01;3;0;1;2;) AA~ +0x02;4;5;6;7;) AB +0x03;7;4;5;6;) AB~ +\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;) \.\.\. +:;:;:;:;:; : +\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;) \.\.\. +0x79;251;252;253;254;) EX +0x80;254;251;252;253;) EX~ +; +.TE + +.ne 10 +.B Example with 2\ copies per chunk and an odd number\ (5) of devices: +.TS +tab(;); + C - - - - - + C | C | C | C | C | C | +| - | - | - | - | - | - | +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| C | C | C | C | C | C | L +| - | - | - | - | - | - | +C. +; +;Dev #1;Dev #2;Dev #3;Dev #4;Dev #5 +; +0x00;0;1;2;3;4;) AA +0x01;4;0;1;2;3;) AA~ +0x02;5;6;7;8;9;) AB +0x03;9;5;6;7;8;) AB~ +\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;) \.\.\. +:;:;:;:;:;:; : +\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;\.\.\.;) \.\.\. +0x79;314;315;316;317;318;) EX +0x80;318;314;315;316;317;) EX~ +; +.TE + +With AA,\ AB,\ ..., AZ,\ BA,\ ... 
being the sets of <number of devices> consecutive +chunks and AA~,\ AB~,\ ..., AZ~,\ BA~,\ ... the cyclic permutations with offset\ 1 +thereof (in the case of more than 2 copies per chunk there would be (AA~)~,\ ... +as well as ((AA~)~)~,\ ... and so on). + +This should give similar read characteristics to "far" if a suitably large chunk +size is used, but without as much seeking for writes. +.PP + + +It should be noted that the number of devices in a RAID10 array need +not be a multiple of the number of replicas of each data block; however, +there must be at least as many devices as replicas. + +If, for example, an array is created with 5 devices and 2 replicas, +then space equivalent to 2.5 of the devices will be available, and +every block will be stored on two different devices. + +Finally, it is possible to have an array with both "near" and "far" +copies. If an array is configured with 2 near copies and 2 far +copies, then there will be a total of 4 copies of each block, each on +a different drive. This is an artifact of the implementation and is +unlikely to be of real value. + +.SS MULTIPATH + +MULTIPATH is not really a RAID at all as there is only one real device +in a MULTIPATH md array. However there are multiple access points +(paths) to this device, and one of these paths might fail, so there +are some similarities. + +A MULTIPATH array is composed of a number of logically different +devices, often fibre channel interfaces, that all refer to the same +real device. If one of these interfaces fails (e.g. due to cable +problems), the MULTIPATH driver will attempt to redirect requests to +another interface. + +The MULTIPATH driver is not receiving any ongoing development and +should be considered a legacy driver. The device-mapper based +multipath drivers should be preferred for new installations. + +.SS FAULTY +The FAULTY md module is provided for testing purposes.
A FAULTY array +has exactly one component device and is normally assembled without a +superblock, so the md array created provides direct access to all of +the data in the component device. + +The FAULTY module may be requested to simulate faults to allow testing +of other md levels or of filesystems. Faults can be chosen to trigger +on read requests or write requests, and can be transient (a subsequent +read/write at the address will probably succeed) or persistent +(subsequent read/write of the same address will fail). Further, read +faults can be "fixable" meaning that they persist until a write +request at the same address. + +Fault types can be requested with a period. In this case, the fault +will recur repeatedly after the given number of requests of the +relevant type. For example if persistent read faults have a period of +100, then every 100th read request would generate a fault, and the +faulty sector would be recorded so that subsequent reads on that +sector would also fail. + +There is a limit to the number of faulty sectors that are remembered. +Faults generated after this limit is exhausted are treated as +transient. + +The list of faulty sectors can be flushed, and the active list of +failure modes can be cleared. + +.SS UNCLEAN SHUTDOWN + +When changes are made to a RAID1, RAID4, RAID5, RAID6, or RAID10 array +there is a possibility of inconsistency for short periods of time as +each update requires at least two blocks to be written to different +devices, and these writes probably won't happen at exactly the same +time. Thus if a system with one of these arrays is shut down in the +middle of a write operation (e.g. due to power failure), the array may +not be consistent. + +To handle this situation, the md driver marks an array as "dirty" +before writing any data to it, and marks it as "clean" when the array +is being disabled, e.g. at shutdown.
If the md driver finds an array +to be dirty at startup, it proceeds to correct any possible +inconsistency. For RAID1, this involves copying the contents of the +first drive onto all other drives. For RAID4, RAID5 and RAID6 this +involves recalculating the parity for each stripe and making sure that +the parity block has the correct data. For RAID10 it involves copying +one of the replicas of each block onto all the others. This process, +known as "resynchronising" or "resync" is performed in the background. +The array can still be used, though possibly with reduced performance. + +If a RAID4, RAID5 or RAID6 array is degraded (missing at least one +drive, two for RAID6) when it is restarted after an unclean shutdown, it cannot +recalculate parity, and so it is possible that data might be +undetectably corrupted. The 2.4 md driver +.B does not +alert the operator to this condition. The 2.6 md driver will fail to +start an array in this condition without manual intervention, though +this behaviour can be overridden by a kernel parameter. + +.SS RECOVERY + +If the md driver detects a write error on a device in a RAID1, RAID4, +RAID5, RAID6, or RAID10 array, it immediately disables that device +(marking it as faulty) and continues operation on the remaining +devices. If there are spare drives, the driver will start recreating +on one of the spare drives the data which was on that failed drive, +either by copying a working drive in a RAID1 configuration, or by +doing calculations with the parity block on RAID4, RAID5 or RAID6, or +by finding and copying originals for RAID10. + +In kernels prior to about 2.6.15, a read error would cause the same +effect as a write error. In later kernels, a read-error will instead +cause md to attempt a recovery by overwriting the bad block, i.e. it +will find the correct data from elsewhere, write it over the block +that failed, and then try to read it back again.
If either the write +or the re-read fails, md will treat the error the same way that a write +error is treated, and will fail the whole device. + +While this recovery process is happening, the md driver will monitor +accesses to the array and will slow down the rate of recovery if other +activity is happening, so that normal access to the array will not be +unduly affected. When no other activity is happening, the recovery +process proceeds at full speed. The actual speed targets for the two +different situations can be controlled by the +.B speed_limit_min +and +.B speed_limit_max +control files mentioned below. + +.SS SCRUBBING AND MISMATCHES + +As storage devices can develop bad blocks at any time it is valuable +to regularly read all blocks on all devices in an array so as to catch +such bad blocks early. This process is called +.IR scrubbing . + +md arrays can be scrubbed by writing either +.I check +or +.I repair +to the file +.I md/sync_action +in the +.I sysfs +directory for the device. + +Requesting a scrub will cause +.I md +to read every block on every device in the array, and check that the +data is consistent. For RAID1 and RAID10, this means checking that the copies +are identical. For RAID4, RAID5, RAID6 this means checking that the +parity block is (or blocks are) correct. + +If a read error is detected during this process, the normal read-error +handling causes correct data to be found from other devices and to be +written back to the faulty device. In many cases this will +effectively +.I fix +the bad block. + +If all blocks read successfully but are found to not be consistent, +then this is regarded as a +.IR mismatch . + +If +.I check +was used, then no action is taken to handle the mismatch, it is simply +recorded. +If +.I repair +was used, then a mismatch will be repaired in the same way that +.I resync +repairs arrays. For RAID5/RAID6 new parity blocks are written.
For RAID1/RAID10, +all but one block are overwritten with the content of that one block. + +A count of mismatches is recorded in the +.I sysfs +file +.IR md/mismatch_cnt . +This is set to zero when a +scrub starts and is incremented whenever a sector is +found that is a mismatch. +.I md +normally works in units much larger than a single sector and when it +finds a mismatch, it does not determine exactly how many actual sectors were +affected but simply adds the number of sectors in the IO unit that was +used. So a value of 128 could simply mean that a single 64KB check +found an error (128 x 512bytes = 64KB). + +If an array is created by +.I mdadm +with +.I \-\-assume\-clean +then a subsequent check could be expected to find some mismatches. + +On a truly clean RAID5 or RAID6 array, any mismatches should indicate +a hardware problem at some level - software issues should never cause +such a mismatch. + +However on RAID1 and RAID10 it is possible for software issues to +cause a mismatch to be reported. This does not necessarily mean that +the data on the array is corrupted. It could simply be that the +system does not care what is stored on that part of the array - it is +unused space. + +The most likely cause for an unexpected mismatch on RAID1 or RAID10 +occurs if a swap partition or swap file is stored on the array. + +When the swap subsystem wants to write a page of memory out, it flags +the page as 'clean' in the memory manager and requests the swap device +to write it out. It is quite possible that the memory will be +changed while the write-out is happening. In that case the 'clean' +flag will be found to be clear when the write completes and so the +swap subsystem will simply forget that the swapout had been attempted, +and will possibly choose a different page to write out. + +If the swap device was on RAID1 (or RAID10), then the data is sent +from memory to a device twice (or more depending on the number of +devices in the array). 
Thus it is possible that the memory gets changed +between the times it is sent, so different data can be written to +the different devices in the array. This will be detected by +.I check +as a mismatch. However it does not reflect any corruption as the +block where this mismatch occurs is being treated by the swap system as +being empty, and the data will never be read from that block. + +It is conceivable for a similar situation to occur on non-swap files, +though it is less likely. + +Thus the +.I mismatch_cnt +value can not be interpreted very reliably on RAID1 or RAID10, +especially when the device is used for swap. + + +.SS BITMAP WRITE-INTENT LOGGING + +From Linux 2.6.13, +.I md +supports a bitmap based write-intent log. If configured, the bitmap +is used to record which blocks of the array may be out of sync. +Before any write request is honoured, md will make sure that the +corresponding bit in the log is set. After a period of time with no +writes to an area of the array, the corresponding bit will be cleared. + +This bitmap is used for two optimisations. + +Firstly, after an unclean shutdown, the resync process will consult +the bitmap and only resync those blocks that correspond to bits in the +bitmap that are set. This can dramatically reduce resync time. + +Secondly, when a drive fails and is removed from the array, md stops +clearing bits in the intent log. If that same drive is re-added to +the array, md will notice and will only recover the sections of the +drive that are covered by bits in the intent log that are set. This +can allow a device to be temporarily removed and reinserted without +causing an enormous recovery cost. + +The intent log can be stored in a file on a separate device, or it can +be stored near the superblocks of an array which has superblocks. + +It is possible to add an intent log to an active array, or remove an +intent log if one is present. + +In 2.6.13, intent bitmaps are only supported with RAID1. 
Other levels +with redundancy are supported from 2.6.15. + +.SS BAD BLOCK LIST + +From Linux 3.5 each device in an +.I md +array can store a list of known-bad-blocks. This list is 4K in size +and usually positioned at the end of the space between the superblock +and the data. + +When a block cannot be read and cannot be repaired by writing data +recovered from other devices, the address of the block is stored in +the bad block list. Similarly if an attempt to write a block fails, +the address will be recorded as a bad block. If attempting to record +the bad block fails, the whole device will be marked faulty. + +Attempting to read from a known bad block will cause a read error. +Attempting to write to a known bad block will be ignored if any write +errors have been reported by the device. If there have been no write +errors then the data will be written to the known bad block and if +that succeeds, the address will be removed from the list. + +This allows an array to fail more gracefully - a few blocks on +different devices can be faulty without taking the whole array out of +action. + +The list is particularly useful when recovering to a spare. If a few blocks +cannot be read from the other devices, the bulk of the recovery can +complete and those few bad blocks will be recorded in the bad block list. + +.SS RAID WRITE HOLE + +Due to the non-atomic nature of RAID write operations, +interruption of write operations (system crash, etc.) to a RAID456 +array can lead to inconsistent parity and data loss (the so-called +RAID-5 write hole). +To plug the write hole md supports two mechanisms described below. + +.TP +DIRTY STRIPE JOURNAL +From Linux 4.4, md supports a write-ahead journal for RAID456. +When the array is created, an additional journal device can be added to +the array through the write-journal option. The RAID write journal works +similarly to file system journals. Before writing to the data +disks, md persists data AND parity of the stripe to the journal +device.
After crashes, md searches the journal device for +incomplete write operations, and replays them to the data disks. + +When the journal device fails, the RAID array is forced to run in +read-only mode. + +.TP +PARTIAL PARITY LOG +From Linux 4.12 md supports Partial Parity Log (PPL) for RAID5 arrays only. +Partial parity for a write operation is the XOR of stripe data chunks not +modified by the write. PPL is stored in the metadata region of RAID member drives, +no additional journal drive is needed. +After crashes, if one of the unmodified data disks of +the stripe is missing, this updated parity can be used to recover its +data. + +This mechanism is documented more fully in the file +Documentation/md/raid5-ppl.rst + +.SS WRITE-BEHIND + +From Linux 2.6.14, +.I md +supports WRITE-BEHIND on RAID1 arrays. + +This allows certain devices in the array to be flagged as +.IR write-mostly . +MD will only read from such devices if there is no +other option. + +If a write-intent bitmap is also provided, write requests to +write-mostly devices will be treated as write-behind requests and md +will not wait for writes to those requests to complete before +reporting the write as complete to the filesystem. + +This allows for a RAID1 with WRITE-BEHIND to be used to mirror data +over a slow link to a remote computer (providing the link isn't too +slow). The extra latency of the remote link will not slow down normal +operations, but the remote system will still have a reasonably +up-to-date copy of all data. + +.SS FAILFAST + +From Linux 4.10, +.I +md +supports FAILFAST for RAID1 and RAID10 arrays. This is a flag that +can be set on individual drives, though it is usually set on all +drives, or no drives. + +When +.I md +sends an I/O request to a drive that is marked as FAILFAST, and when +the array could survive the loss of that drive without losing data, +.I md +will request that the underlying device does not perform any retries.
This means that a failure will be reported to +.I md +promptly, and it can mark the device as faulty and continue using the +other device(s). +.I md +cannot control the timeout that the underlying devices use to +determine failure. Any changes desired to that timeout must be set +explicitly on the underlying device, separately from using +.IR mdadm . + +If a FAILFAST request does fail, and if it is still safe to mark the +device as faulty without data loss, that will be done and the array +will continue functioning on a reduced number of devices. If it is not +possible to safely mark the device as faulty, +.I md +will retry the request without disabling retries in the underlying +device. In any case, +.I md +will not attempt to repair read errors on a device marked as FAILFAST +by writing out the correct data. It will just mark the device as faulty. + +FAILFAST is appropriate for storage arrays that have a low probability +of true failure, but will sometimes introduce unacceptable delays to +I/O requests while performing internal maintenance. The value of +setting FAILFAST involves a trade-off. The gain is that the chance of +unacceptable delays is substantially reduced. The cost is that the +unlikely event of data-loss on one device is slightly more likely to +result in data-loss for the array. + +When a device in an array using FAILFAST is marked as faulty, it will +usually become usable again in a short while. +.I mdadm +makes no attempt to detect that possibility. Some separate +mechanism, tuned to the specific details of the expected failure modes, +needs to be created to monitor devices to see when they return to full +functionality, and to then re-add them to the array. In order for +this "re-add" functionality to be effective, an array using FAILFAST +should always have a write-intent bitmap. + +.SS RESTRIPING + +.IR Restriping , +also known as +.IR Reshaping , +is the process of re-arranging the data stored in each stripe into a +new layout.
This might involve changing the number of devices in the +array (so the stripes are wider), changing the chunk size (so stripes +are deeper or shallower), or changing the arrangement of data and +parity (possibly changing the RAID level, e.g. 1 to 5 or 5 to 6). + +As of Linux 2.6.35, md can reshape a RAID4, RAID5, or RAID6 array to +have a different number of devices (more or fewer) and to have a +different layout or chunk size. It can also convert between these +different RAID levels. It can also convert between RAID0 and RAID10, +and between RAID0 and RAID4 or RAID5. +Other possibilities may follow in future kernels. + +During any restripe process there is a 'critical section' during which +live data is being overwritten on disk. For the operation of +increasing the number of drives in a RAID5, this critical section +covers the first few stripes (the number being the product of the old +and new number of devices). After this critical section is passed, +data is only written to areas of the array which no longer hold live +data \(em the live data has already been relocated away. + +For a reshape which reduces the number of devices, the 'critical +section' is at the end of the reshape process. + +md is not able to ensure data preservation if there is a crash +(e.g. power failure) during the critical section. If md is asked to +start an array which failed during a critical section of restriping, +it will fail to start the array. + +To deal with this possibility, a user-space program must +.IP \(bu 4 +disable writes to that section of the array (using the +.B sysfs +interface), +.IP \(bu 4 +take a copy of the data somewhere (i.e. make a backup), +.IP \(bu 4 +allow the process to continue and invalidate the backup and restore +write access once the critical section is passed, and +.IP \(bu 4 +provide for restoring the critical data before restarting the array +after a system crash. +.PP + +.B mdadm +versions from 2.4 do this for growing a RAID5 array.
+ +For operations that do not change the size of the array, like simply +increasing chunk size, or converting RAID5 to RAID6 with one extra +device, the entire process is the critical section. In this case, the +restripe will need to progress in stages, as a section is suspended, +backed up, restriped, and released. + +.SS SYSFS INTERFACE +Each block device appears as a directory in +.I sysfs +(which is usually mounted at +.BR /sys ). +For MD devices, this directory will contain a subdirectory called +.B md +which contains various files for providing access to information about +the array. + +This interface is documented more fully in the file +.B Documentation/admin-guide/md.rst +which is distributed with the kernel sources. That file should be +consulted for full documentation. The following are just a selection +of attribute files that are available. + +.TP +.B md/sync_speed_min +This value, if set, overrides the system-wide setting in +.B /proc/sys/dev/raid/speed_limit_min +for this array only. +Writing the value +.B "system" +to this file will cause the system-wide setting to have effect. + +.TP +.B md/sync_speed_max +This is the partner of +.B md/sync_speed_min +and overrides +.B /proc/sys/dev/raid/speed_limit_max +described below. + +.TP +.B md/sync_action +This can be used to monitor and control the resync/recovery process of +MD. +In particular, writing "check" here will cause the array to read all +data blocks and check that they are consistent (e.g. parity is correct, +or all mirror replicas are the same). Any discrepancies found are +.B NOT +corrected. + +A count of problems found will be stored in +.BR md/mismatch_cnt . + +Alternatively, "repair" can be written which will cause the same check +to be performed, but any errors will be corrected. + +Finally, "idle" can be written to stop the check/repair process. + +.TP +.B md/stripe_cache_size +This is only available on RAID5 and RAID6.
It records the size (in +pages per device) of the stripe cache which is used for synchronising +all write operations to the array and all read operations if the array +is degraded. The default is 256. Valid values are 17 to 32768. +Increasing this number can increase performance in some situations, at +some cost in system memory. Note, setting this value too high can +result in an "out of memory" condition for the system. + +memory_consumed = system_page_size * nr_disks * stripe_cache_size + +.TP +.B md/preread_bypass_threshold +This is only available on RAID5 and RAID6. This variable sets the +number of times MD will service a full-stripe-write before servicing a +stripe that requires some "prereading". For fairness this defaults to +1. Valid values are 0 to stripe_cache_size. Setting this to 0 +maximizes sequential-write throughput at the cost of fairness to threads +doing small or random writes. + +.TP +.B md/bitmap/backlog +The value stored in the file only has any effect on RAID1 when write-mostly +devices are active, and write requests to those devices are processed in the +background. + +This variable sets a limit on the number of concurrent background writes. +The valid values are 0 to 16383; 0 means that write-behind is not allowed, +while any other number means it can happen. If there are more write requests +than this number, new writes will be synchronous. + +.TP +.B md/bitmap/can_clear +This is for externally managed bitmaps, where the kernel writes the bitmap +itself, but metadata describing the bitmap is managed by mdmon or similar. + +When the array is degraded, bits mustn't be cleared. When the array becomes +optimal again, bits can be cleared, but first the metadata needs to record +the current event count. So md sets this to 'false' and notifies mdmon, +then mdmon updates the metadata and writes 'true'. + +There is no code in mdmon to actually do this, so maybe it doesn't even +work.
+ +.TP +.B md/bitmap/chunksize +The bitmap chunksize can only be changed when no bitmap is active, and +the value should be a power of 2 and at least 512. + +.TP +.B md/bitmap/location +This indicates where the write-intent bitmap for the array is stored. +It can be "none" or "file" or a signed offset from the array metadata +- measured in sectors. You cannot set a file by writing here - that can +only be done with the SET_BITMAP_FILE ioctl. + +Writing 'none' to 'bitmap/location' will clear the bitmap, and the previous +location value must be written to it to restore the bitmap. + +.TP +.B md/bitmap/max_backlog_used +This keeps track of the maximum number of concurrent write-behind requests +for an md array; writing any value to this file will clear it. + +.TP +.B md/bitmap/metadata +This can be 'internal' or 'clustered' or 'external'. 'internal' is set +by default, which means the metadata for bitmap is stored in the first 256 +bytes of the bitmap space. 'clustered' means separate bitmap metadata are +used for each cluster node. 'external' means that bitmap metadata is managed +externally to the kernel. + +.TP +.B md/bitmap/space +This shows the space (in sectors) which is available at md/bitmap/location, +and allows the kernel to know when it is safe to resize the bitmap to match +a resized array. It should be big enough to contain the total bytes in the bitmap. + +For 1.0 metadata, assume we can use up to the superblock if before, else +to 4K beyond superblock. For other metadata versions, assume no change is +possible. + +.TP +.B md/bitmap/time_base +This shows the time (in seconds) between disk flushes, and is used when looking +for bits in the bitmap to be cleared. + +The default value is 5 seconds, and it should be an unsigned long value. + +.SS KERNEL PARAMETERS + +The md driver recognises several different kernel parameters. +.TP +.B raid=noautodetect +This will disable the normal detection of md arrays that happens at +boot time.
If a drive is partitioned with MS-DOS style partitions, +then if any of the 4 main partitions has a partition type of 0xFD, +then that partition will normally be inspected to see if it is part of +an MD array, and if any full arrays are found, they are started. This +kernel parameter disables this behaviour. + +.TP +.B raid=partitionable +.TP +.B raid=part +These are available in 2.6 and later kernels only. They indicate that +autodetected MD arrays should be created as partitionable arrays, with +a different major device number to the original non-partitionable md +arrays. The device number is listed as +.I mdp +in +.IR /proc/devices . + +.TP +.B md_mod.start_ro=1 +.TP +.B /sys/module/md_mod/parameters/start_ro +This tells md to start all arrays in read-only mode. This is a soft +read-only that will automatically switch to read-write on the first +write request. However until that write request, nothing is written +to any device by md, and in particular, no resync or recovery +operation is started. + +.TP +.B md_mod.start_dirty_degraded=1 +.TP +.B /sys/module/md_mod/parameters/start_dirty_degraded +As mentioned above, md will not normally start a RAID4, RAID5, or +RAID6 that is both dirty and degraded as this situation can imply +hidden data loss. This can be awkward if the root filesystem is +affected. Using this module parameter allows such arrays to be started +at boot time. It should be understood that there is a real (though +small) risk of data corruption in this situation. + +.TP +.BI md= n , dev , dev ,... +.TP +.BI md=d n , dev , dev ,... +This tells the md driver to assemble +.B /dev/md n +from the listed devices. It is only necessary to start the device +holding the root filesystem this way. Other arrays are best started +once the system is booted. + +In 2.6 kernels, the +.B d +immediately after the +.B = +indicates that a partitionable device (e.g. +.BR /dev/md/d0 ) +should be created rather than the original non-partitionable device. 
+ +.TP +.BI md= n , l , c , i , dev... +This tells the md driver to assemble a legacy RAID0 or LINEAR array +without a superblock. +.I n +gives the md device number, +.I l +gives the level, 0 for RAID0 or \-1 for LINEAR, +.I c +gives the chunk size as a base-2 logarithm offset by twelve, so 0 +means 4K, 1 means 8K. +.I i +is ignored (legacy support). + +.SH FILES +.TP +.B /proc/mdstat +Contains information about the status of currently running array. +.TP +.B /proc/sys/dev/raid/speed_limit_min +A readable and writable file that reflects the current "goal" rebuild +speed for times when non-rebuild activity is current on an array. +The speed is in Kibibytes per second, and is a per-device rate, not a +per-array rate (which means that an array with more disks will shuffle +more data for a given speed). The default is 1000. + +.TP +.B /proc/sys/dev/raid/speed_limit_max +A readable and writable file that reflects the current "goal" rebuild +speed for times when no non-rebuild activity is current on an array. +The default is 200,000. + +.SH SEE ALSO +.BR mdadm (8), @@ -0,0 +1,136 @@ +/* Declaration of functions and data types used for MD5 sum computing + library functions. + Copyright (C) 1995-1997,1999-2005 Free Software Foundation, Inc. + + NOTE: The canonical source of this file is maintained with the GNU C + Library. Bugs can be reported to bug-glibc@prep.ai.mit.edu. + + This program is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the + Free Software Foundation; either version 2, or (at your option) any + later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. 
+ + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ + +#ifndef _MD5_H +#define _MD5_H 1 + +#include <stdio.h> + +#if HAVE_INTTYPES_H +# include <inttypes.h> +#endif +#if HAVE_STDINT_H || _LIBC || defined __UCLIBC__ +# include <stdint.h> +#endif + +#ifndef __GNUC_PREREQ +# if defined __GNUC__ && defined __GNUC_MINOR__ +# define __GNUC_PREREQ(maj, min) \ + ((__GNUC__ << 16) + __GNUC_MINOR__ >= ((maj) << 16) + (min)) +# else +# define __GNUC_PREREQ(maj, min) 0 +# endif +#endif + +#ifndef __THROW +# if defined __cplusplus && __GNUC_PREREQ (2,8) +# define __THROW throw () +# else +# define __THROW +# endif +#endif + +#ifndef __attribute__ +# if ! __GNUC_PREREQ (2,8) || __STRICT_ANSI__ +# define __attribute__(x) +# endif +#endif + +#ifndef _LIBC +# define __md5_buffer md5_buffer +# define __md5_finish_ctx md5_finish_ctx +# define __md5_init_ctx md5_init_ctx +# define __md5_process_block md5_process_block +# define __md5_process_bytes md5_process_bytes +# define __md5_read_ctx md5_read_ctx +# define __md5_stream md5_stream +#endif + +typedef uint32_t md5_uint32; + +/* Structure to save state of computation between the single steps. */ +struct md5_ctx +{ + md5_uint32 A; + md5_uint32 B; + md5_uint32 C; + md5_uint32 D; + + md5_uint32 total[2]; + md5_uint32 buflen; + char buffer[128] __attribute__ ((__aligned__ (__alignof__ (md5_uint32)))); +}; + +/* + * The following functions make up the low-level interface used by + * the functions `md5_stream' and `md5_buffer'. + */ + +/* Initialize structure containing state of computation. + (RFC 1321, 3.3: Step 3) */ +extern void __md5_init_ctx (struct md5_ctx *ctx) __THROW; + +/* Starting with the result of former calls of this function (or the + initialization function update the context for the next LEN bytes + starting at BUFFER.
+ It is necessary that LEN is a multiple of 64!!! */ +extern void __md5_process_block (const void *buffer, size_t len, + struct md5_ctx *ctx) __THROW; + +/* Starting with the result of former calls of this function (or the + initialization function) update the context for the next LEN bytes + starting at BUFFER. + It is NOT required that LEN is a multiple of 64. */ +extern void __md5_process_bytes (const void *buffer, size_t len, + struct md5_ctx *ctx) __THROW; + +/* Process the remaining bytes in the buffer and put result from CTX + in first 16 bytes following RESBUF. The result is always in little + endian byte order, so that a byte-wise output yields to the wanted + ASCII representation of the message digest. + + IMPORTANT: On some systems it is required that RESBUF be correctly + aligned for a 32-bit value. */ +extern void *__md5_finish_ctx (struct md5_ctx *ctx, void *resbuf) __THROW; + + +/* Put result from CTX in first 16 bytes following RESBUF. The result is + always in little endian byte order, so that a byte-wise output yields + to the wanted ASCII representation of the message digest. + + IMPORTANT: On some systems it is required that RESBUF is correctly + aligned for a 32-bit value. */ +extern void *__md5_read_ctx (const struct md5_ctx *ctx, void *resbuf) __THROW; + + +/* Compute MD5 message digest for bytes read from STREAM. The + resulting message digest number will be written into the 16 bytes + beginning at RESBLOCK. */ +extern int __md5_stream (FILE *stream, void *resblock) __THROW; + +/* Compute MD5 message digest for LEN bytes beginning at BUFFER. The + result is always in little endian byte order, so that a byte-wise + output yields to the wanted ASCII representation of the message + digest.
*/ +extern void *__md5_buffer (const char *buffer, size_t len, + void *resblock) __THROW; + +#endif /* md5.h */ @@ -0,0 +1,295 @@ +/* + md_p.h : physical layout of Linux RAID devices + Copyright (C) 1996-98 Ingo Molnar, Gadi Oxman + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#ifndef _MD_P_H +#define _MD_P_H + +/* + * RAID superblock. + * + * The RAID superblock maintains some statistics on each RAID configuration. + * Each real device in the RAID set contains it near the end of the device. + * Some of the ideas are copied from the ext2fs implementation. + * + * We currently use 4096 bytes as follows: + * + * word offset function + * + * 0 - 31 Constant generic RAID device information. + * 32 - 63 Generic state information. + * 64 - 127 Personality specific information. + * 128 - 991 27 32-word descriptors of the disks in the raid set + * (MD_SB_DISKS * MD_SB_DESCRIPTOR_WORDS). + * 992 - 1023 Disk specific descriptor (MD_SB_DESCRIPTOR_OFFSET). + * + * With MD_SB_DISKS = 27 no reserved words remain between the disk + * descriptors and the disk specific descriptor (MD_SB_RESERVED_WORDS + * evaluates to 0). + */ + +/* + * If x is the real device size in bytes, we return an apparent size of: + * + * y = (x & ~(MD_RESERVED_BYTES - 1)) - MD_RESERVED_BYTES + * + * and place the 4kB superblock at offset y.
+ */ +#define MD_RESERVED_BYTES (64 * 1024) +#define MD_RESERVED_SECTORS (MD_RESERVED_BYTES / 512) +#define MD_RESERVED_BLOCKS (MD_RESERVED_BYTES / BLOCK_SIZE) + +/* + * Apparent size for a device of x sectors/blocks: x rounded down to a + * multiple of the reserved area, minus one reserved area. The argument + * is parenthesised so that expression arguments (e.g. a | b) expand with + * the intended precedence. + */ +#define MD_NEW_SIZE_SECTORS(x) (((x) & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS) +#define MD_NEW_SIZE_BLOCKS(x) (((x) & ~(MD_RESERVED_BLOCKS - 1)) - MD_RESERVED_BLOCKS) + +#define MD_SB_BYTES 4096 +#define MD_SB_WORDS (MD_SB_BYTES / 4) +#define MD_SB_BLOCKS (MD_SB_BYTES / BLOCK_SIZE) +#define MD_SB_SECTORS (MD_SB_BYTES / 512) + +/* + * The following are counted in 32-bit words + */ +#define MD_SB_GENERIC_OFFSET 0 +#define MD_SB_PERSONALITY_OFFSET 64 +#define MD_SB_DISKS_OFFSET 128 +#define MD_SB_DESCRIPTOR_OFFSET 992 + +#define MD_SB_GENERIC_CONSTANT_WORDS 32 +#define MD_SB_GENERIC_STATE_WORDS 32 +#define MD_SB_GENERIC_WORDS (MD_SB_GENERIC_CONSTANT_WORDS + MD_SB_GENERIC_STATE_WORDS) +#define MD_SB_PERSONALITY_WORDS 64 +#define MD_SB_DESCRIPTOR_WORDS 32 +#define MD_SB_DISKS 27 +#define MD_SB_DISKS_WORDS (MD_SB_DISKS*MD_SB_DESCRIPTOR_WORDS) +#define MD_SB_RESERVED_WORDS (1024 - MD_SB_GENERIC_WORDS - MD_SB_PERSONALITY_WORDS - MD_SB_DISKS_WORDS - MD_SB_DESCRIPTOR_WORDS) +#define MD_SB_EQUAL_WORDS (MD_SB_GENERIC_WORDS + MD_SB_PERSONALITY_WORDS + MD_SB_DISKS_WORDS) + +/* + * Device "operational" state bits + */ +#define MD_DISK_FAULTY 0 /* disk is faulty / operational */ +#define MD_DISK_ACTIVE 1 /* disk is running but may not be in sync */ +#define MD_DISK_SYNC 2 /* disk is in sync with the raid set */ +#define MD_DISK_REMOVED 3 /* disk is removed from the raid set -- NOTE(review): the previous text duplicated MD_DISK_SYNC's comment; confirm wording */ +#define MD_DISK_CLUSTER_ADD 4 /* Initiate a disk add across the cluster + * For clustered environments only. + */ +#define MD_DISK_CANDIDATE 5 /* disk is added as spare (local) until confirmed + * For clustered environments only. + */ + +#define MD_DISK_WRITEMOSTLY 9 /* disk is "write-mostly" in RAID1 config.
+ * read requests will only be sent here in + * dire need + */ +#define MD_DISK_FAILFAST 10 /* Fewer retries, more failures */ + +#define MD_DISK_REPLACEMENT 17 +#define MD_DISK_JOURNAL 18 /* disk is used as the write journal in RAID-5/6 */ + +#define MD_DISK_ROLE_SPARE 0xffff +#define MD_DISK_ROLE_FAULTY 0xfffe +#define MD_DISK_ROLE_JOURNAL 0xfffd +#define MD_DISK_ROLE_MAX 0xff00 /* max value of regular disk role */ + +typedef struct mdp_device_descriptor_s { + __u32 number; /* 0 Device number in the entire set */ + __u32 major; /* 1 Device major number */ + __u32 minor; /* 2 Device minor number */ + __u32 raid_disk; /* 3 The role of the device in the raid set */ + __u32 state; /* 4 Operational state */ + __u32 reserved[MD_SB_DESCRIPTOR_WORDS - 5]; +} mdp_disk_t; + +#define MD_SB_MAGIC 0xa92b4efc + +/* + * Superblock state bits + */ +#define MD_SB_CLEAN 0 +#define MD_SB_ERRORS 1 +#define MD_SB_BBM_ERRORS 2 +#define MD_SB_BLOCK_CONTAINER_RESHAPE 3 /* block container wide reshapes */ +#define MD_SB_BLOCK_VOLUME 4 /* block activation of array, other arrays + * in container can be activated */ +#define MD_SB_CLUSTERED 5 /* MD is clustered */ +#define MD_SB_BITMAP_PRESENT 8 /* bitmap may be present nearby */ + +typedef struct mdp_superblock_s { + /* + * Constant generic information + */ + __u32 md_magic; /* 0 MD identifier */ + __u32 major_version; /* 1 major version to which the set conforms */ + __u32 minor_version; /* 2 minor version ... */ + __u32 patch_version; /* 3 patchlevel version ... 
*/ + __u32 gvalid_words; /* 4 Number of used words in this section */ + __u32 set_uuid0; /* 5 Raid set identifier */ + __u32 ctime; /* 6 Creation time */ + __u32 level; /* 7 Raid personality */ + __u32 size; /* 8 Apparent size of each individual disk */ + __u32 nr_disks; /* 9 total disks in the raid set */ + __u32 raid_disks; /* 10 disks in a fully functional raid set */ + __u32 md_minor; /* 11 preferred MD minor device number */ + __u32 not_persistent; /* 12 does it have a persistent superblock */ + __u32 set_uuid1; /* 13 Raid set identifier #2 */ + __u32 set_uuid2; /* 14 Raid set identifier #3 */ + __u32 set_uuid3; /* 15 Raid set identifier #4 */ + __u32 gstate_creserved[MD_SB_GENERIC_CONSTANT_WORDS - 16]; + + /* + * Generic state information + */ + __u32 utime; /* 0 Superblock update time */ + __u32 state; /* 1 State bits (clean, ...) */ + __u32 active_disks; /* 2 Number of currently active disks */ + __u32 working_disks; /* 3 Number of working disks */ + __u32 failed_disks; /* 4 Number of failed disks */ + __u32 spare_disks; /* 5 Number of spare disks */ + __u32 sb_csum; /* 6 checksum of the whole superblock */ +#if __BYTE_ORDER == __BIG_ENDIAN + __u32 events_hi; /* 7 high-order of superblock update count */ + __u32 events_lo; /* 8 low-order of superblock update count */ + __u32 cp_events_hi; /* 9 high-order of checkpoint update count */ + __u32 cp_events_lo; /* 10 low-order of checkpoint update count */ +#else + __u32 events_lo; /* 7 low-order of superblock update count */ + __u32 events_hi; /* 8 high-order of superblock update count */ + __u32 cp_events_lo; /* 9 low-order of checkpoint update count */ + __u32 cp_events_hi; /* 10 high-order of checkpoint update count */ +#endif + __u32 recovery_cp; /* 11 recovery checkpoint sector count */ + /* The following fields are only valid for minor_version > 90 */ + __u64 reshape_position; /* 12,13 next address in array-space for reshape */ + __u32 new_level; /* 14 new level we are reshaping to */ + __u32 delta_disks; /* 15 change in
number of raid_disks */ + __u32 new_layout; /* 16 new layout */ + __u32 new_chunk; /* 17 new chunk size (bytes) */ + __u32 gstate_sreserved[MD_SB_GENERIC_STATE_WORDS - 18]; + + /* + * Personality information + */ + __u32 layout; /* 0 the array's physical layout */ + __u32 chunk_size; /* 1 chunk size in bytes */ + __u32 root_pv; /* 2 LV root PV */ + __u32 root_block; /* 3 LV root block */ + __u32 pstate_reserved[MD_SB_PERSONALITY_WORDS - 4]; + + /* + * Disks information + */ + mdp_disk_t disks[MD_SB_DISKS]; + + /* + * Reserved + */ + __u32 reserved[MD_SB_RESERVED_WORDS]; + + /* + * Active descriptor + */ + mdp_disk_t this_disk; + +} mdp_super_t; + +#ifdef __TINYC__ +typedef unsigned long long __u64; +#endif + +/* + * Return the superblock update (event) count as one 64-bit value, + * combining the separately stored events_hi (upper 32 bits) and + * events_lo (lower 32 bits) words of sb. + */ +static inline __u64 md_event(mdp_super_t *sb) { + __u64 ev = sb->events_hi; + return (ev<<32)| sb->events_lo; +} + +struct r5l_payload_header { + __u16 type; + __u16 flags; +} __attribute__ ((__packed__)); + +enum r5l_payload_type { + R5LOG_PAYLOAD_DATA = 0, + R5LOG_PAYLOAD_PARITY = 1, + R5LOG_PAYLOAD_FLUSH = 2, +}; + +struct r5l_payload_data_parity { + struct r5l_payload_header header; + __u32 size; /* sector. data/parity size. each 4k has a checksum */ + __u64 location; /* sector. For data, it's raid sector. For + parity, it's stripe sector */ + __u32 checksum[]; +} __attribute__ ((__packed__)); + +enum r5l_payload_data_parity_flag { + R5LOG_PAYLOAD_FLAG_DISCARD = 1, /* payload is a discard */ + /* + * RESHAPED/RESHAPING is only set when there is reshape activity.
Note, + * both data/parity of a stripe should have the same flag set + * + * RESHAPED: reshape is running, and this stripe finished reshape + * RESHAPING: reshape is running, and this stripe isn't reshaped + * */ + R5LOG_PAYLOAD_FLAG_RESHAPED = 2, + R5LOG_PAYLOAD_FLAG_RESHAPING = 3, +}; + +struct r5l_payload_flush { + struct r5l_payload_header header; + __u32 size; /* flush_stripes size, bytes */ + __u64 flush_stripes[]; +} __attribute__ ((__packed__)); + +enum r5l_payload_flush_flag { + R5LOG_PAYLOAD_FLAG_FLUSH_STRIPE = 1, /* data represents whole stripe */ +}; + +struct r5l_meta_block { + __u32 magic; + __u32 checksum; + __u8 version; + __u8 __zero_pading_1; + __u16 __zero_pading_2; + __u32 meta_size; /* whole size of the block */ + + __u64 seq; + __u64 position; /* sector, start from rdev->data_offset, current position */ + struct r5l_payload_header payloads[]; +} __attribute__ ((__packed__)); + +#define R5LOG_VERSION 0x1 +#define R5LOG_MAGIC 0x6433c509 + +struct ppl_header_entry { + __u64 data_sector; /* raid sector of the new data */ + __u32 pp_size; /* length of partial parity */ + __u32 data_size; /* length of data */ + __u32 parity_disk; /* member disk containing parity */ + __u32 checksum; /* checksum of this entry's partial parity */ +} __attribute__ ((__packed__)); + +#define PPL_HEADER_SIZE 4096 +#define PPL_HDR_RESERVED 512 +#define PPL_HDR_ENTRY_SPACE \ + (PPL_HEADER_SIZE - PPL_HDR_RESERVED - 4 * sizeof(__u32) - sizeof(__u64)) +#define PPL_HDR_MAX_ENTRIES \ + (PPL_HDR_ENTRY_SPACE / sizeof(struct ppl_header_entry)) + +struct ppl_header { + __u8 reserved[PPL_HDR_RESERVED];/* reserved space, fill with 0xff */ + __u32 signature; /* signature (family number of volume) */ + __u32 padding; /* zero pad */ + __u64 generation; /* generation number of the header */ + __u32 entries_count; /* number of entries in entry array */ + __u32 checksum; /* checksum of the header */ + struct ppl_header_entry entries[PPL_HDR_MAX_ENTRIES]; +} __attribute__ ((__packed__)); + 
+#endif @@ -0,0 +1,115 @@ +/* + md_u.h : user <=> kernel API between Linux raidtools and RAID drivers + Copyright (C) 1998 Ingo Molnar + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + You should have received a copy of the GNU General Public License + (for example /usr/src/linux/COPYING); if not, write to the Free + Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +*/ + +#ifndef _MD_U_H +#define _MD_U_H + +/* ioctls */ + +/* status */ +#define RAID_VERSION _IOR (MD_MAJOR, 0x10, mdu_version_t) +#define GET_ARRAY_INFO _IOR (MD_MAJOR, 0x11, mdu_array_info_t) +#define GET_DISK_INFO _IOR (MD_MAJOR, 0x12, mdu_disk_info_t) +#define RAID_AUTORUN _IO (MD_MAJOR, 0x14) +#define GET_BITMAP_FILE _IOR (MD_MAJOR, 0x15, mdu_bitmap_file_t) + +/* configuration */ +#define ADD_NEW_DISK _IOW (MD_MAJOR, 0x21, mdu_disk_info_t) +#define HOT_REMOVE_DISK _IO (MD_MAJOR, 0x22) +#define SET_ARRAY_INFO _IOW (MD_MAJOR, 0x23, mdu_array_info_t) +#define SET_DISK_FAULTY _IO (MD_MAJOR, 0x29) +#define SET_BITMAP_FILE _IOW (MD_MAJOR, 0x2b, int) + +/* usage */ +#define RUN_ARRAY _IOW (MD_MAJOR, 0x30, mdu_param_t) +#define STOP_ARRAY _IO (MD_MAJOR, 0x32) +#define STOP_ARRAY_RO _IO (MD_MAJOR, 0x33) +#define RESTART_ARRAY_RW _IO (MD_MAJOR, 0x34) +#define CLUSTERED_DISK_NACK _IO (MD_MAJOR, 0x35) + +typedef struct mdu_version_s { + int major; + int minor; + int patchlevel; +} mdu_version_t; + +typedef struct mdu_array_info_s { + /* + * Generic constant information + */ + int major_version; + int minor_version; + int patch_version; + unsigned int ctime; + int level; + int size; + int nr_disks; + int raid_disks; + int md_minor; + int not_persistent; + + /* + * Generic state information + */ + unsigned int utime; /* 0 Superblock update time */ + int state; /* 1 State bits (clean, ...) 
*/ + int active_disks; /* 2 Number of currently active disks */ + int working_disks; /* 3 Number of working disks */ + int failed_disks; /* 4 Number of failed disks */ + int spare_disks; /* 5 Number of spare disks */ + + /* + * Personality information + */ + int layout; /* 0 the array's physical layout */ + int chunk_size; /* 1 chunk size in bytes */ + +} mdu_array_info_t; + +typedef struct mdu_disk_info_s { + /* + * configuration/status of one particular disk + */ + int number; + int major; + int minor; + int raid_disk; + int state; + +} mdu_disk_info_t; + +typedef struct mdu_start_info_s { + /* + * configuration/status of one particular disk + */ + int major; + int minor; + int raid_disk; + int state; + +} mdu_start_info_t; + +typedef struct mdu_bitmap_file_s +{ + char pathname[4096]; +} mdu_bitmap_file_t; + +typedef struct mdu_param_s +{ + int personality; /* 1,2,3,4 */ + int chunk_size; /* in bytes */ + int max_fault; /* unused for now */ +} mdu_param_t; + +#endif diff --git a/mdadm.8.in b/mdadm.8.in new file mode 100644 index 0000000..be902db --- /dev/null +++ b/mdadm.8.in @@ -0,0 +1,3452 @@ +.\" -*- nroff -*- +.\" Copyright Neil Brown and others. +.\" This program is free software; you can redistribute it and/or modify +.\" it under the terms of the GNU General Public License as published by +.\" the Free Software Foundation; either version 2 of the License, or +.\" (at your option) any later version. +.\" See file COPYING in distribution for details. +.TH MDADM 8 "" v4.2 +.SH NAME +mdadm \- manage MD devices +.I aka +Linux Software RAID + +.SH SYNOPSIS + +.BI mdadm " [mode] <raiddevice> [options] <component-devices>" + +.SH DESCRIPTION +RAID devices are virtual devices created from two or more +real block devices. This allows multiple devices (typically disk +drives or partitions thereof) to be combined into a single device to +hold (for example) a single filesystem. +Some RAID levels include redundancy and so can survive some degree of +device failure. 
+ +Linux Software RAID devices are implemented through the md (Multiple +Devices) device driver. + +Currently, Linux supports +.B LINEAR +md devices, +.B RAID0 +(striping), +.B RAID1 +(mirroring), +.BR RAID4 , +.BR RAID5 , +.BR RAID6 , +.BR RAID10 , +.BR MULTIPATH , +.BR FAULTY , +and +.BR CONTAINER . + +.B MULTIPATH +is not a Software RAID mechanism, but does involve +multiple devices: +each device is a path to one common physical storage device. +New installations should not use md/multipath as it is not well +supported and has no ongoing development. Use the Device Mapper based +multipath-tools instead. + +.B FAULTY +is also not true RAID, and it only involves one device. It +provides a layer over a true device that can be used to inject faults. + +.B CONTAINER +is different again. A +.B CONTAINER +is a collection of devices that are +managed as a set. This is similar to the set of devices connected to +a hardware RAID controller. The set of devices may contain a number +of different RAID arrays each utilising some (or all) of the blocks from a +number of the devices in the set. For example, two devices in a 5-device set +might form a RAID1 using the whole devices. The remaining three might +have a RAID5 over the first half of each device, and a RAID0 over the +second half. + +With a +.BR CONTAINER , +there is one set of metadata that describes all of +the arrays in the container. So when +.I mdadm +creates a +.B CONTAINER +device, the device just represents the metadata. Other normal arrays (RAID1 +etc) can be created inside the container. + +.SH MODES +mdadm has several major modes of operation: +.TP +.B Assemble +Assemble the components of a previously created +array into an active array. Components can be explicitly given +or can be searched for. +.I mdadm +checks that the components +do form a bona fide array, and can, on request, fiddle superblock +information so as to assemble a faulty array. 
+ +.TP +.B Build +Build an array that doesn't have per-device metadata (superblocks). For these +sorts of arrays, +.I mdadm +cannot differentiate between initial creation and subsequent assembly +of an array. It also cannot perform any checks that appropriate +components have been requested. Because of this, the +.B Build +mode should only be used together with a complete understanding of +what you are doing. + +.TP +.B Create +Create a new array with per-device metadata (superblocks). +Appropriate metadata is written to each device, and then the array +comprising those devices is activated. A 'resync' process is started +to make sure that the array is consistent (e.g. both sides of a mirror +contain the same data) but the content of the device is left otherwise +untouched. +The array can be used as soon as it has been created. There is no +need to wait for the initial resync to finish. + +.TP +.B "Follow or Monitor" +Monitor one or more md devices and act on any state changes. This is +only meaningful for RAID1, 4, 5, 6, 10 or multipath arrays, as +only these have interesting state. RAID0 or Linear never have +missing, spare, or failed drives, so there is nothing to monitor. + +.TP +.B "Grow" +Grow (or shrink) an array, or otherwise reshape it in some way. +Currently supported growth options include changing the active size +of component devices and changing the number of active devices in +Linear and RAID levels 0/1/4/5/6, +changing the RAID level between 0, 1, 5, and 6, and between 0 and 10, +changing the chunk size and layout for RAID 0,4,5,6,10 as well as adding or +removing a write-intent bitmap and changing the array's consistency policy. + +.TP +.B "Incremental Assembly" +Add a single device to an appropriate array. If the addition of the +device makes the array runnable, the array will be started. +This provides a convenient interface to a +.I hot-plug +system.
As each device is detected, +.I mdadm +has a chance to include it in some array as appropriate. +Optionally, when the +.I \-\-fail +flag is passed in we will remove the device from any active array +instead of adding it. + +If a +.B CONTAINER +is passed to +.I mdadm +in this mode, then any arrays within that container will be assembled +and started. + +.TP +.B Manage +This is for doing things to specific components of an array such as +adding new spares and removing faulty devices. + +.TP +.B Misc +This is an 'everything else' mode that supports operations on active +arrays, operations on component devices such as erasing old superblocks, and +information gathering operations. +.\"This mode allows operations on independent devices such as examine MD +.\"superblocks, erasing old superblocks and stopping active arrays. + +.TP +.B Auto-detect +This mode does not act on a specific device or array, but rather it +requests the Linux Kernel to activate any auto-detected arrays. +.SH OPTIONS + +.SH Options for selecting a mode are: + +.TP +.BR \-A ", " \-\-assemble +Assemble a pre-existing array. + +.TP +.BR \-B ", " \-\-build +Build a legacy array without superblocks. + +.TP +.BR \-C ", " \-\-create +Create a new array. + +.TP +.BR \-F ", " \-\-follow ", " \-\-monitor +Select +.B Monitor +mode. + +.TP +.BR \-G ", " \-\-grow +Change the size or shape of an active array. + +.TP +.BR \-I ", " \-\-incremental +Add/remove a single device to/from an appropriate array, and possibly start the array. + +.TP +.B \-\-auto-detect +Request that the kernel starts any auto-detected arrays. This can only +work if +.I md +is compiled into the kernel \(em not if it is a module. +Arrays can be auto-detected by the kernel if all the components are in +primary MS-DOS partitions with partition type +.BR FD , +and all use v0.90 metadata. +In-kernel autodetect is not recommended for new installations. 
Using +.I mdadm +to detect and assemble arrays \(em possibly in an +.I initrd +\(em is substantially more flexible and should be preferred. + +.P +If a device is given before any options, or if the first option is +one of +.BR \-\-add , +.BR \-\-re\-add , +.BR \-\-add\-spare , +.BR \-\-fail , +.BR \-\-remove , +or +.BR \-\-replace , +then the MANAGE mode is assumed. +Anything other than these will cause the +.B Misc +mode to be assumed. + +.SH Options that are not mode-specific are: + +.TP +.BR \-h ", " \-\-help +Display general help message or, after one of the above options, a +mode-specific help message. + +.TP +.B \-\-help\-options +Display more detailed help about command line parsing and some commonly +used options. + +.TP +.BR \-V ", " \-\-version +Print version information for mdadm. + +.TP +.BR \-v ", " \-\-verbose +Be more verbose about what is happening. This can be used twice to be +extra-verbose. +The extra verbosity currently only affects +.B \-\-detail \-\-scan +and +.BR "\-\-examine \-\-scan" . + +.TP +.BR \-q ", " \-\-quiet +Avoid printing purely informative messages. With this, +.I mdadm +will be silent unless there is something really important to report. + + +.TP +.BR \-f ", " \-\-force +Be more forceful about certain operations. See the various modes for +the exact meaning of this option in different contexts. + +.TP +.BR \-c ", " \-\-config= +Specify the config file or directory. Default is to use +.B /etc/mdadm.conf +and +.BR /etc/mdadm.conf.d , +or if those are missing then +.B /etc/mdadm/mdadm.conf +and +.BR /etc/mdadm/mdadm.conf.d . +If the config file given is +.B "partitions" +then nothing will be read, but +.I mdadm +will act as though the config file contained exactly +.br +.B " DEVICE partitions containers" +.br +and will read +.B /proc/partitions +to find a list of devices to scan, and +.B /proc/mdstat +to find a list of containers to examine. 
+If the word +.B "none" +is given for the config file, then +.I mdadm +will act as though the config file were empty. + +If the name given is of a directory, then +.I mdadm +will collect all the files contained in the directory with a name ending +in +.BR .conf , +sort them lexically, and process all of those files as config files. + +.TP +.BR \-s ", " \-\-scan +Scan config file or +.B /proc/mdstat +for missing information. +In general, this option gives +.I mdadm +permission to get any missing information (like component devices, +array devices, array identities, and alert destination) from the +configuration file (see previous option); +one exception is MISC mode when using +.B \-\-detail +or +.B \-\-stop, +in which case +.B \-\-scan +says to get a list of array devices from +.BR /proc/mdstat . + +.TP +.BR \-e ", " \-\-metadata= +Declare the style of RAID metadata (superblock) to be used. The +default is {DEFAULT_METADATA} for +.BR \-\-create , +and to guess for other operations. +The default can be overridden by setting the +.B metadata +value for the +.B CREATE +keyword in +.BR mdadm.conf . + +Options are: +.RS +.ie '{DEFAULT_METADATA}'0.90' +.IP "0, 0.90, default" +.el +.IP "0, 0.90" +Use the original 0.90 format superblock. This format limits arrays to +28 component devices and limits component devices of levels 1 and +greater to 2 terabytes. It is also possible for there to be confusion +about whether the superblock applies to a whole device or just the +last partition, if that partition starts on a 64K boundary. +.ie '{DEFAULT_METADATA}'0.90' +.IP "1, 1.0, 1.1, 1.2" +.el +.IP "1, 1.0, 1.1, 1.2 default" +Use the new version-1 format superblock. This has fewer restrictions. +It can easily be moved between hosts with different endian-ness, and a +recovery operation can be checkpointed and restarted. 
The different +sub-versions store the superblock at different locations on the +device, either at the end (for 1.0), at the start (for 1.1) or 4K from +the start (for 1.2). "1" is equivalent to "1.2" (the commonly +preferred 1.x format). +'if '{DEFAULT_METADATA}'1.2' "default" is equivalent to "1.2". +.IP ddf +Use the "Industry Standard" DDF (Disk Data Format) format defined by +SNIA. +When creating a DDF array a +.B CONTAINER +will be created, and normal arrays can be created in that container. +.IP imsm +Use the Intel(R) Matrix Storage Manager metadata format. This creates a +.B CONTAINER +which is managed in a similar manner to DDF, and is supported by an +option-rom on some platforms: +.IP +.B https://www.intel.com/content/www/us/en/support/products/122484/memory-and-storage/ssd-software/intel-virtual-raid-on-cpu-intel-vroc.html +.PP +.RE + +.TP +.B \-\-homehost= +This will override any +.B HOMEHOST +setting in the config file and provides the identity of the host which +should be considered the home for any arrays. + +When creating an array, the +.B homehost +will be recorded in the metadata. For version-1 superblocks, it will +be prefixed to the array name. For version-0.90 superblocks, part of +the SHA1 hash of the hostname will be stored in the latter half of the +UUID. + +When reporting information about an array, any array which is tagged +for the given homehost will be reported as such. + +When using Auto-Assemble, only arrays tagged for the given homehost +will be allowed to use 'local' names (i.e. not ending in '_' followed +by a digit string). See below under +.BR "Auto Assembly" . + +The special name "\fBany\fP" can be used as a wild card. If an array +is created with +.B \-\-homehost=any +then the name "\fBany\fP" will be stored in the array and it can be +assembled in the same way on any host. If an array is assembled with +this option, then the homehost recorded on the array will be ignored.
+ +.TP +.B \-\-prefer= +When +.I mdadm +needs to print the name for a device it normally finds the name in +.B /dev +which refers to the device and is shortest. When a path component is +given with +.B \-\-prefer +.I mdadm +will prefer a longer name if it contains that component. For example +.B \-\-prefer=by-uuid +will prefer a name in a subdirectory of +.B /dev +called +.BR by-uuid . + +This functionality is currently only provided by +.B \-\-detail +and +.BR \-\-monitor . + +.TP +.B \-\-home\-cluster= +specifies the cluster name for the md device. The md device can be assembled +only on the cluster which matches the name specified. If this option is not +provided, mdadm tries to detect the cluster name automatically. + +.SH For create, build, or grow: + +.TP +.BR \-n ", " \-\-raid\-devices= +Specify the number of active devices in the array. This, plus the +number of spare devices (see below) must equal the number of +.I component-devices +(including "\fBmissing\fP" devices) +that are listed on the command line for +.BR \-\-create . +Setting a value of 1 is probably +a mistake and so requires that +.B \-\-force +be specified first. A value of 1 will then be allowed for linear, +multipath, RAID0 and RAID1. It is never allowed for RAID4, RAID5 or RAID6. +.br +This number can only be changed using +.B \-\-grow +for RAID1, RAID4, RAID5 and RAID6 arrays, and only on kernels which provide +the necessary support. + +.TP +.BR \-x ", " \-\-spare\-devices= +Specify the number of spare (eXtra) devices in the initial array. +Spares can also be added +and removed later. The number of component devices listed +on the command line must equal the number of RAID devices plus the +number of spare devices. + +.TP +.BR \-z ", " \-\-size= +Amount (in Kilobytes) of space to use from each drive in RAID levels 1/4/5/6. +This must be a multiple of the chunk size, and must leave about 128Kb +of space at the end of the drive for the RAID superblock. 
+If this is not specified +(as it normally is not) the smallest drive (or partition) sets the +size, though if there is a variance among the drives of greater than 1%, a warning is +issued. + +A suffix of 'K', 'M', 'G' or 'T' can be given to indicate Kilobytes, +Megabytes, Gigabytes or Terabytes respectively. + +Sometimes a replacement drive can be a little smaller than the +original drives though this should be minimised by IDEMA standards. +Such a replacement drive will be rejected by +.IR md . +To guard against this it can be useful to set the initial size +slightly smaller than the smaller device with the aim that it will +still be larger than any replacement. + +This value can be set with +.B \-\-grow +for RAID level 1/4/5/6 though +DDF arrays may not be able to support this. +If the array was created with a size smaller than the currently +active drives, the extra space can be accessed using +.BR \-\-grow . +The size can be given as +.B max +which means to choose the largest size that fits on all current drives. + +Before reducing the size of the array (with +.BR "\-\-grow \-\-size=" ) +you should make sure that space isn't needed. If the device holds a +filesystem, you would need to resize the filesystem to use less space. + +After reducing the array size you should check that the data stored in +the device is still available. If the device holds a filesystem, then +an 'fsck' of the filesystem is a minimum requirement. If there are +problems the array can be made bigger again with no loss with another +.B "\-\-grow \-\-size=" +command. + +This value cannot be used when creating a +.B CONTAINER +such as with DDF and IMSM metadata, though it is perfectly valid when +creating an array inside a container. + +.TP +.BR \-Z ", " \-\-array\-size= +This is only meaningful with +.B \-\-grow +and its effect is not persistent: when the array is stopped and +restarted the default array size will be restored.
+ +Setting the array-size causes the array to appear smaller to programs +that access the data. This is particularly needed before reshaping an +array so that it will be smaller. As the reshape is not reversible, +but setting the size with +.B \-\-array\-size +is, it is required that the array size is reduced as appropriate +before the number of devices in the array is reduced. + +Before reducing the size of the array you should make sure that space +isn't needed. If the device holds a filesystem, you would need to +resize the filesystem to use less space. + +After reducing the array size you should check that the data stored in +the device is still available. If the device holds a filesystem, then +an 'fsck' of the filesystem is a minimum requirement. If there are +problems the array can be made bigger again with no loss with another +.B "\-\-grow \-\-array\-size=" +command. + +A suffix of 'K', 'M', 'G' or 'T' can be given to indicate Kilobytes, +Megabytes, Gigabytes or Terabytes respectively. +A value of +.B max +restores the apparent size of the array to be whatever the real +amount of available space is. + +Clustered arrays do not support this parameter yet. + +.TP +.BR \-c ", " \-\-chunk= +Specify chunk size in kilobytes. The default when creating an +array is 512KB. To ensure compatibility with earlier versions, the +default when building an array with no persistent metadata is 64KB. +This is only meaningful for RAID0, RAID4, RAID5, RAID6, and RAID10. + +RAID4, RAID5, RAID6, and RAID10 require the chunk size to be a power +of 2. In any case it must be a multiple of 4KB. + +A suffix of 'K', 'M', 'G' or 'T' can be given to indicate Kilobytes, +Megabytes, Gigabytes or Terabytes respectively. + +.TP +.BR \-\-rounding= +Specify rounding factor for a Linear array. The size of each +component will be rounded down to a multiple of this size. +This is a synonym for +.B \-\-chunk +but highlights the different meaning for Linear as compared to other +RAID levels.
The default is 64K if a kernel earlier than 2.6.16 is in +use, and is 0K (i.e. no rounding) in later kernels. + +.TP +.BR \-l ", " \-\-level= +Set RAID level. When used with +.BR \-\-create , +options are: linear, raid0, 0, stripe, raid1, 1, mirror, raid4, 4, +raid5, 5, raid6, 6, raid10, 10, multipath, mp, faulty, container. +Obviously some of these are synonymous. + +When a +.B CONTAINER +metadata type is requested, only the +.B container +level is permitted, and it does not need to be explicitly given. + +When used with +.BR \-\-build , +only linear, stripe, raid0, 0, raid1, multipath, mp, and faulty are valid. + +Can be used with +.B \-\-grow +to change the RAID level in some cases. See LEVEL CHANGES below. + +.TP +.BR \-p ", " \-\-layout= +This option configures the fine details of data layout for RAID5, RAID6, +and RAID10 arrays, and controls the failure modes for +.IR faulty . +It can also be used for working around a kernel bug with RAID0, but generally +doesn't need to be used explicitly. + +The layout of the RAID5 parity block can be one of +.BR left\-asymmetric , +.BR left\-symmetric , +.BR right\-asymmetric , +.BR right\-symmetric , +.BR la ", " ra ", " ls ", " rs . +The default is +.BR left\-symmetric . + +It is also possible to cause RAID5 to use a RAID4-like layout by +choosing +.BR parity\-first , +or +.BR parity\-last . + +Finally for RAID5 there are DDF\-compatible layouts, +.BR ddf\-zero\-restart , +.BR ddf\-N\-restart , +and +.BR ddf\-N\-continue . + +These same layouts are available for RAID6. There are also 4 layouts +that will provide an intermediate stage for converting between RAID5 +and RAID6. These provide a layout which is identical to the +corresponding RAID5 layout on the first N\-1 devices, and has the 'Q' +syndrome (the second 'parity' block used by RAID6) on the last device. +These layouts are: +.BR left\-symmetric\-6 , +.BR right\-symmetric\-6 , +.BR left\-asymmetric\-6 , +.BR right\-asymmetric\-6 , +and +.BR parity\-first\-6 . 
+ +When setting the failure mode for level +.I faulty, +the options are: +.BR write\-transient ", " wt , +.BR read\-transient ", " rt , +.BR write\-persistent ", " wp , +.BR read\-persistent ", " rp , +.BR write\-all , +.BR read\-fixable ", " rf , +.BR clear ", " flush ", " none . + +Each failure mode can be followed by a number, which is used as a period +between fault generation. Without a number, the fault is generated +once on the first relevant request. With a number, the fault will be +generated after that many requests, and will continue to be generated +every time the period elapses. + +Multiple failure modes can be current simultaneously by using the +.B \-\-grow +option to set subsequent failure modes. + +"clear" or "none" will remove any pending or periodic failure modes, +and "flush" will clear any persistent faults. + +The layout options for RAID10 are one of 'n', 'o' or 'f' followed +by a small number. The default is 'n2'. The supported options are: + +.I 'n' +signals 'near' copies. Multiple copies of one data block are at +similar offsets in different devices. + +.I 'o' +signals 'offset' copies. Rather than the chunks being duplicated +within a stripe, whole stripes are duplicated but are rotated by one +device so duplicate blocks are on different devices. Thus subsequent +copies of a block are in the next drive, and are one chunk further +down. + +.I 'f' +signals 'far' copies +(multiple copies have very different offsets). +See md(4) for more detail about 'near', 'offset', and 'far'. + +The number is the number of copies of each datablock. 2 is normal, 3 +can be useful. This number can be at most equal to the number of +devices in the array. It does not need to divide evenly into that +number (e.g. it is perfectly legal to have an 'n2' layout for an array +with an odd number of devices). + +A bug introduced in Linux 3.14 means that RAID0 arrays +.B "with devices of differing sizes" +started using a different layout. 
This could lead to +data corruption. Since Linux 5.4 (and various stable releases that received +backports), the kernel will not accept such an array unless +a layout is explicitly set. It can be set to +.RB ' original ' +or +.RB ' alternate '. +When creating a new array, +.I mdadm +will select +.RB ' original ' +by default, so the layout does not normally need to be set. +An array created for either +.RB ' original ' +or +.RB ' alternate ' +will not be recognized by an (unpatched) kernel prior to 5.4. To create +a RAID0 array with devices of differing sizes that can be used on an +older kernel, you can set the layout to +.RB ' dangerous '. +This will use whichever layout the running kernel supports, so the data +on the array may become corrupt when changing kernel from pre-3.14 to a +later kernel. + +When an array is converted between RAID5 and RAID6 an intermediate +RAID6 layout is used in which the second parity block (Q) is always on +the last device. To convert a RAID5 to RAID6 and leave it in this new +layout (which does not require re-striping) use +.BR \-\-layout=preserve . +This will try to avoid any restriping. + +The converse of this is +.B \-\-layout=normalise +which will change a non-standard RAID6 layout into a more standard +arrangement. + +.TP +.BR \-\-parity= +same as +.B \-\-layout +(thus explaining the p of +.BR \-p ). + +.TP +.BR \-b ", " \-\-bitmap= +Specify a file to store a write-intent bitmap in. The file should not +exist unless +.B \-\-force +is also given. The same file should be provided +when assembling the array. If the word +.B "internal" +is given, then the bitmap is stored with the metadata on the array, +and so is replicated on all devices. If the word +.B "none" +is given with +.B \-\-grow +mode, then any bitmap that is present is removed. If the word +.B "clustered" +is given, the array is created for a clustered environment. One bitmap +is created for each node as defined by the +.B \-\-nodes +parameter and these bitmaps are stored internally.
+ +To help catch typing errors, the filename must contain at least one +slash ('/') if it is a real file (not 'internal' or 'none'). + +Note: external bitmaps are only known to work on ext2 and ext3. +Storing bitmap files on other filesystems may result in serious problems. + +When creating an array on devices which are 100G or larger, +.I mdadm +automatically adds an internal bitmap as it will usually be +beneficial. This can be suppressed with +.B "\-\-bitmap=none" +or by selecting a different consistency policy with +.BR \-\-consistency\-policy . + +.TP +.BR \-\-bitmap\-chunk= +Set the chunksize of the bitmap. Each bit corresponds to that many +Kilobytes of storage. +When using a file based bitmap, the default is to use the smallest +size that is at least 4 and requires no more than 2^21 chunks. +When using an +.B internal +bitmap, the chunksize defaults to 64Meg, or larger if necessary to +fit the bitmap into the available space. + +A suffix of 'K', 'M', 'G' or 'T' can be given to indicate Kilobytes, +Megabytes, Gigabytes or Terabytes respectively. + +.TP +.BR \-W ", " \-\-write\-mostly +Subsequent devices listed in a +.BR \-\-build , +.BR \-\-create , +or +.B \-\-add +command will be flagged as 'write\-mostly'. This is valid for RAID1 +only and means that the 'md' driver will avoid reading from these +devices if at all possible. This can be useful if mirroring over a +slow link. + +.TP +.BR \-\-write\-behind= +Specify that write-behind mode should be enabled (valid for RAID1 +only). If an argument is specified, it will set the maximum number +of outstanding writes allowed. The default value is 256. +A write-intent bitmap is required in order to use write-behind +mode, and write-behind is only attempted on drives marked as +.IR write-mostly . + +.TP +.BR \-\-failfast +Subsequent devices listed in a +.B \-\-create +or +.B \-\-add +command will be flagged as 'failfast'. This is valid for RAID1 and +RAID10 only.
IO requests to these devices will be encouraged to fail +quickly rather than cause long delays due to error handling. Also no +attempt is made to repair a read error on these devices. + +If an array becomes degraded so that the 'failfast' device is the only +usable device, the 'failfast' flag will then be ignored and extended +delays will be preferred to complete failure. + +The 'failfast' flag is appropriate for storage arrays which have a +low probability of true failure, but which may sometimes +cause unacceptable delays due to internal maintenance functions. + +.TP +.BR \-\-assume\-clean +Tell +.I mdadm +that the array pre-existed and is known to be clean. It can be useful +when trying to recover from a major failure as you can be sure that no +data will be affected unless you actually write to the array. It can +also be used when creating a RAID1 or RAID10 if you want to avoid the +initial resync, however this practice \(em while normally safe \(em is not +recommended. Use this only if you really know what you are doing. +.IP +When the devices that will be part of a new array were filled +with zeros before creation the operator knows the array is +actually clean. If that is the case, such as after running +badblocks, this argument can be used to tell mdadm the +facts the operator knows. +.IP +When an array is resized to a larger size with +.B "\-\-grow \-\-size=" +the new space is normally resynced in the same way that the whole +array is resynced at creation. From Linux version 3.0, +.B \-\-assume\-clean +can be used with that command to avoid the automatic resync. + +.TP +.BR \-\-backup\-file= +This is needed when +.B \-\-grow +is used to increase the number of raid-devices in a RAID5 or RAID6 if +there are no spare devices available, or to shrink, change RAID level +or layout. See the GROW MODE section below on RAID\-DEVICES CHANGES. +The file must be stored on a separate device, not on the RAID array +being reshaped.
+ +.TP +.B \-\-data\-offset= +Arrays with 1.x metadata can leave a gap between the start of the +device and the start of array data. This gap can be used for various +metadata. The start of data is known as the +.IR data\-offset . +Normally an appropriate data offset is computed automatically. +However it can be useful to set it explicitly such as when re-creating +an array which was originally created using a different version of +.I mdadm +which computed a different offset. + +Setting the offset explicitly over-rides the default. The value given +is in Kilobytes unless a suffix of 'K', 'M', 'G' or 'T' is used to explicitly +indicate Kilobytes, Megabytes, Gigabytes or Terabytes respectively. + +Since Linux 3.4, +.B \-\-data\-offset +can also be used with +.B \-\-grow +for some RAID levels (initially on RAID10). This allows the +data\-offset to be changed as part of the reshape process. When the +data offset is changed, no backup file is required as the difference +in offsets is used to provide the same functionality. + +When the new offset is earlier than the old offset, the number of +devices in the array cannot shrink. When it is after the old offset, +the number of devices in the array cannot increase. + +When creating an array, +.B \-\-data\-offset +can be specified as +.BR variable . +In this case each member device is expected to have an offset appended +to the name, separated by a colon. This makes it possible to recreate +exactly an array which has varying data offsets (as can happen when +different versions of +.I mdadm +are used to add different devices). + +.TP +.BR \-\-continue +This option is complementary to the +.B \-\-freeze-reshape +option for assembly. It is needed when +.B \-\-grow +operation is interrupted and it is not restarted automatically due to +.B \-\-freeze-reshape +usage during array assembly. This option is used together with +.BR \-G +, ( +.BR \-\-grow +) command and device for a pending reshape to be continued.
+All parameters required for reshape continuation will be read from array metadata. +If initial +.BR \-\-grow +command had required +.BR \-\-backup\-file= +option to be set, continuation option will require to have exactly the same +backup file given as well. +.IP +Any other parameter passed together with +.BR \-\-continue +option will be ignored. + +.TP +.BR \-N ", " \-\-name= +Set a +.B name +for the array. This is currently only effective when creating an +array with a version-1 superblock, or an array in a DDF container. +The name is a simple textual string that can be used to identify array +components when assembling. If name is needed but not specified, it +is taken from the basename of the device that is being created. +e.g. when creating +.I /dev/md/home +the +.B name +will default to +.IR home . + +.TP +.BR \-R ", " \-\-run +Insist that +.I mdadm +run the array, even if some of the components +appear to be active in another array or filesystem. Normally +.I mdadm +will ask for confirmation before including such components in an +array. This option causes that question to be suppressed. + +.TP +.BR \-f ", " \-\-force +Insist that +.I mdadm +accept the geometry and layout specified without question. Normally +.I mdadm +will not allow creation of an array with only one device, and will try +to create a RAID5 array with one missing drive (as this makes the +initial resync work faster). With +.BR \-\-force , +.I mdadm +will not try to be so clever. + +.TP +.BR \-o ", " \-\-readonly +Start the array +.B read only +rather than read-write as normal. No writes will be allowed to the +array, and no resync, recovery, or reshape will be started. It works with +Create, Assemble, Manage and Misc mode. + +.TP +.BR \-a ", " "\-\-auto{=yes,md,mdp,part,p}{NN}" +Instruct mdadm how to create the device file if needed, possibly allocating +an unused minor number. 
"md" causes a non-partitionable array +to be used (though since Linux 2.6.28, these array devices are in fact +partitionable). "mdp", "part" or "p" causes a partitionable array (2.6 and +later) to be used. "yes" requires the named md device to have +a 'standard' format, and the type and minor number will be determined +from this. With mdadm 3.0, device creation is normally left up to +.I udev +so this option is unlikely to be needed. +See DEVICE NAMES below. + +The argument can also come immediately after +"\-a". e.g. "\-ap". + +If +.B \-\-auto +is not given on the command line or in the config file, then +the default will be +.BR \-\-auto=yes . + +If +.B \-\-scan +is also given, then any +.I auto= +entries in the config file will override the +.B \-\-auto +instruction given on the command line. + +For partitionable arrays, +.I mdadm +will create the device file for the whole array and for the first 4 +partitions. A different number of partitions can be specified at the +end of this option (e.g. +.BR \-\-auto=p7 ). +If the device name ends with a digit, the partition names add a 'p', +and a number, e.g. +.IR /dev/md/home1p3 . +If there is no trailing digit, then the partition names just have a +number added, e.g. +.IR /dev/md/scratch3 . + +If the md device name is in a 'standard' format as described in DEVICE +NAMES, then it will be created, if necessary, with the appropriate +device number based on that name. If the device name is not in one of these +formats, then a unused device number will be allocated. The device +number will be considered unused if there is no active array for that +number, and there is no entry in /dev for that number and with a +non-standard name. Names that are not in 'standard' format are only +allowed in "/dev/md/". + +This is meaningful with +.B \-\-create +or +.BR \-\-build . + +.TP +.BR \-a ", " "\-\-add" +This option can be used in Grow mode in two cases. 
+ +If the target array is a Linear array, then +.B \-\-add +can be used to add one or more devices to the array. They +are simply catenated on to the end of the array. Once added, the +devices cannot be removed. + +If the +.B \-\-raid\-disks +option is being used to increase the number of devices in an array, +then +.B \-\-add +can be used to add some extra devices to be included in the array. +In most cases this is not needed as the extra devices can be added as +spares first, and then the number of raid-disks can be changed. +However for RAID0, it is not possible to add spares. So to increase +the number of devices in a RAID0, it is necessary to set the new +number of devices, and to add the new devices, in the same command. + +.TP +.BR \-\-nodes +Only works when the array is for clustered environment. It specifies +the maximum number of nodes in the cluster that will use this device +simultaneously. If not specified, this defaults to 4. + +.TP +.BR \-\-write-journal +Specify journal device for the RAID-4/5/6 array. The journal device +should be a SSD with reasonable lifetime. + +.TP +.BR \-\-symlinks +Auto creation of symlinks in /dev to /dev/md, option --symlinks must +be 'no' or 'yes' and work with --create and --build. + +.TP +.BR \-k ", " \-\-consistency\-policy= +Specify how the array maintains consistency in case of unexpected shutdown. +Only relevant for RAID levels with redundancy. +Currently supported options are: +.RS + +.TP +.B resync +Full resync is performed and all redundancy is regenerated when the array is +started after unclean shutdown. + +.TP +.B bitmap +Resync assisted by a write-intent bitmap. Implicitly selected when using +.BR \-\-bitmap . + +.TP +.B journal +For RAID levels 4/5/6, journal device is used to log transactions and replay +after unclean shutdown. Implicitly selected when using +.BR \-\-write\-journal . + +.TP +.B ppl +For RAID5 only, Partial Parity Log is used to close the write hole and +eliminate resync. 
PPL is stored in the metadata region of RAID member drives, +no additional journal drive is needed. + +.PP +Can be used with \-\-grow to change the consistency policy of an active array +in some cases. See CONSISTENCY POLICY CHANGES below. +.RE + + +.SH For assemble: + +.TP +.BR \-u ", " \-\-uuid= +uuid of array to assemble. Devices which don't have this uuid are +excluded. + +.TP +.BR \-m ", " \-\-super\-minor= +Minor number of device that array was created for. Devices which +don't have this minor number are excluded. If you create an array as +/dev/md1, then all superblocks will contain the minor number 1, even if +the array is later assembled as /dev/md2. + +Giving the literal word "dev" for +.B \-\-super\-minor +will cause +.I mdadm +to use the minor number of the md device that is being assembled. +e.g. when assembling +.BR /dev/md0 , +.B \-\-super\-minor=dev +will look for super blocks with a minor number of 0. + +.B \-\-super\-minor +is only relevant for v0.90 metadata, and should not normally be used. +Using +.B \-\-uuid +is much safer. + +.TP +.BR \-N ", " \-\-name= +Specify the name of the array to assemble. This must be the name +that was specified when creating the array. It must either match +the name stored in the superblock exactly, or it must match +with the current +.I homehost +prefixed to the start of the given name. + +.TP +.BR \-f ", " \-\-force +Assemble the array even if the metadata on some devices appears to be +out-of-date. If +.I mdadm +cannot find enough working devices to start the array, but can find +some devices that are recorded as having failed, then it will mark +those devices as working so that the array can be started. This works only for +native metadata. For external metadata it allows starting dirty degraded RAID 4, 5, 6 arrays. +An array which requires +.B \-\-force +to be started may contain data corruption. Use it carefully.
+ +.TP +.BR \-R ", " \-\-run +Attempt to start the array even if fewer drives were given than were +present last time the array was active. Normally if not all the +expected drives are found and +.B \-\-scan +is not used, then the array will be assembled but not started. +With +.B \-\-run +an attempt will be made to start it anyway. + +.TP +.B \-\-no\-degraded +This is the reverse of +.B \-\-run +in that it inhibits the startup of array unless all expected drives +are present. This is only needed with +.B \-\-scan, +and can be used if the physical connections to devices are +not as reliable as you would like. + +.TP +.BR \-a ", " "\-\-auto{=no,yes,md,mdp,part}" +See this option under Create and Build options. + +.TP +.BR \-b ", " \-\-bitmap= +Specify the bitmap file that was given when the array was created. If +an array has an +.B internal +bitmap, there is no need to specify this when assembling the array. + +.TP +.BR \-\-backup\-file= +If +.B \-\-backup\-file +was used while reshaping an array (e.g. changing number of devices or +chunk size) and the system crashed during the critical section, then the same +.B \-\-backup\-file +must be presented to +.B \-\-assemble +to allow possibly corrupted data to be restored, and the reshape +to be completed. + +.TP +.BR \-\-invalid\-backup +If the file needed for the above option is not available for any +reason an empty file can be given together with this option to +indicate that the backup file is invalid. In this case the data that +was being rearranged at the time of the crash could be irrecoverably +lost, but the rest of the array may still be recoverable. This option +should only be used as a last resort if there is no way to recover the +backup file. + + +.TP +.BR \-U ", " \-\-update= +Update the superblock on each device while assembling the array. 
The +argument given to this flag can be one of +.BR sparc2.2 , +.BR summaries , +.BR uuid , +.BR name , +.BR nodes , +.BR homehost , +.BR home\-cluster , +.BR resync , +.BR byteorder , +.BR devicesize , +.BR no\-bitmap , +.BR bbl , +.BR no\-bbl , +.BR ppl , +.BR no\-ppl , +.BR layout\-original , +.BR layout\-alternate , +.BR layout\-unspecified , +.BR metadata , +or +.BR super\-minor . + +The +.B sparc2.2 +option will adjust the superblock of an array that was created on a Sparc +machine running a patched 2.2 Linux kernel. This kernel got the +alignment of part of the superblock wrong. You can use the +.B "\-\-examine \-\-sparc2.2" +option to +.I mdadm +to see what effect this would have. + +The +.B super\-minor +option will update the +.B "preferred minor" +field on each superblock to match the minor number of the array being +assembled. +This can be useful if +.B \-\-examine +reports a different "Preferred Minor" to +.BR \-\-detail . +In some cases this update will be performed automatically +by the kernel driver. In particular the update happens automatically +at the first write to an array with redundancy (RAID level 1 or +greater) on a 2.6 (or later) kernel. + +The +.B uuid +option will change the uuid of the array. If a UUID is given with the +.B \-\-uuid +option that UUID will be used as a new UUID and will +.B NOT +be used to help identify the devices in the array. +If no +.B \-\-uuid +is given, a random UUID is chosen. + +The +.B name +option will change the +.I name +of the array as stored in the superblock. This is only supported for +version-1 superblocks. + +The +.B nodes +option will change the +.I nodes +of the array as stored in the bitmap superblock. This option only +works for a clustered environment. + +The +.B homehost +option will change the +.I homehost +as recorded in the superblock. For version-0 superblocks, this is the +same as updating the UUID. +For version-1 superblocks, this involves updating the name.
+ +The +.B home\-cluster +option will change the cluster name as recorded in the superblock and +bitmap. This option only works for clustered environment. + +The +.B resync +option will cause the array to be marked +.I dirty +meaning that any redundancy in the array (e.g. parity for RAID5, +copies for RAID1) may be incorrect. This will cause the RAID system +to perform a "resync" pass to make sure that all redundant information +is correct. + +The +.B byteorder +option allows arrays to be moved between machines with different +byte-order, such as from a big-endian machine like a Sparc or some +MIPS machines, to a little-endian x86_64 machine. +When assembling such an array for the first time after a move, giving +.B "\-\-update=byteorder" +will cause +.I mdadm +to expect superblocks to have their byteorder reversed, and will +correct that order before assembling the array. This is only valid +with original (Version 0.90) superblocks. + +The +.B summaries +option will correct the summaries in the superblock. That is the +counts of total, working, active, failed, and spare devices. + +The +.B devicesize +option will rarely be of use. It applies to version 1.1 and 1.2 metadata +only (where the metadata is at the start of the device) and is only +useful when the component device has changed size (typically become +larger). The version 1 metadata records the amount of the device that +can be used to store data, so if a device in a version 1.1 or 1.2 +array becomes larger, the metadata will still be visible, but the +extra space will not. In this case it might be useful to assemble the +array with +.BR \-\-update=devicesize . +This will cause +.I mdadm +to determine the maximum usable amount of space on each device and +update the relevant field in the metadata. + +The +.B metadata +option only works on v0.90 metadata arrays and will convert them to +v1.0 metadata. The array must not be dirty (i.e. it must not need a +sync) and it must not have a write-intent bitmap. 
+ +The old metadata will remain on the devices, but will appear older +than the new metadata and so will usually be ignored. The old metadata +(or indeed the new metadata) can be removed by giving the appropriate +.B \-\-metadata= +option to +.BR \-\-zero\-superblock . + +The +.B no\-bitmap +option can be used when an array has an internal bitmap which is +corrupt in some way so that assembling the array normally fails. It +will cause any internal bitmap to be ignored. + +The +.B bbl +option will reserve space in each device for a bad block list. This +will be 4K in size and positioned near the end of any free space +between the superblock and the data. + +The +.B no\-bbl +option will cause any reservation of space for a bad block list to be +removed. If the bad block list contains entries, this will fail, as +removing the list could cause data corruption. + +The +.B ppl +option will enable PPL for a RAID5 array and reserve space for PPL on each +device. There must be enough free space between the data and superblock and a +write-intent bitmap or journal must not be used. + +The +.B no\-ppl +option will disable PPL in the superblock. + +The +.B layout\-original +and +.B layout\-alternate +options are for RAID0 arrays with non-uniform device sizes that were in +use before Linux 5.4. If the array was being used with Linux 3.13 or +earlier, then to assemble the array on a new kernel, +.B \-\-update=layout\-original +must be given. If the array was created and used with a kernel from Linux 3.14 to +Linux 5.3, then +.B \-\-update=layout\-alternate +must be given. This only needs to be given once. Subsequent assembly of the array +will happen normally. +For more information, see +.IR md (4). + +The +.B layout\-unspecified +option reverts the effect of +.B layout\-original +or +.B layout\-alternate +and allows the array to be again used on a kernel prior to Linux 5.3. +This option should be used with great caution. 
+ +.TP +.BR \-\-freeze\-reshape +Option is intended to be used in start-up scripts during initrd boot phase. +When array under reshape is assembled during initrd phase, this option +stops reshape after reshape critical section is being restored. This happens +before file system pivot operation and avoids loss of file system context. +Losing file system context would cause reshape to be broken. + +Reshape can be continued later using the +.B \-\-continue +option for the grow command. + +.TP +.BR \-\-symlinks +See this option under Create and Build options. + +.SH For Manage mode: + +.TP +.BR \-t ", " \-\-test +Unless a more serious error occurred, +.I mdadm +will exit with a status of 2 if no changes were made to the array and +0 if at least one change was made. +This can be useful when an indirect specifier such as +.BR missing , +.B detached +or +.B faulty +is used in requesting an operation on the array. +.B \-\-test +will report failure if these specifiers didn't find any match. + +.TP +.BR \-a ", " \-\-add +hot-add listed devices. +If a device appears to have recently been part of the array +(possibly it failed or was removed) the device is re\-added as described +in the next point. +If that fails or the device was never part of the array, the device is +added as a hot-spare. +If the array is degraded, it will immediately start to rebuild data +onto that spare. + +Note that this and the following options are only meaningful on array +with redundancy. They don't apply to RAID0 or Linear. + +.TP +.BR \-\-re\-add +re\-add a device that was previously removed from an array. +If the metadata on the device reports that it is a member of the +array, and the slot that it used is still vacant, then the device will +be added back to the array in the same position. This will normally +cause the data for that device to be recovered. 
However based on the +event count on the device, the recovery may only require sections that +are flagged by a write-intent bitmap to be recovered or may not require +any recovery at all. + +When used on an array that has no metadata (i.e. it was built with +.BR \-\-build) +it will be assumed that bitmap-based recovery is enough to make the +device fully consistent with the array. + +When used with v1.x metadata, +.B \-\-re\-add +can be accompanied by +.BR \-\-update=devicesize , +.BR \-\-update=bbl ", or" +.BR \-\-update=no\-bbl . +See the description of these options when used in Assemble mode for an +explanation of their use. + +If the device name given is +.B missing +then +.I mdadm +will try to find any device that looks like it should be +part of the array but isn't and will try to re\-add all such devices. + +If the device name given is +.B faulty +then +.I mdadm +will find all devices in the array that are marked +.BR faulty , +remove them and attempt to immediately re\-add them. This can be +useful if you are certain that the reason for failure has been +resolved. + +.TP +.B \-\-add\-spare +Add a device as a spare. This is similar to +.B \-\-add +except that it does not attempt +.B \-\-re\-add +first. The device will be added as a spare even if it looks like it +could be a recent member of the array. + +.TP +.BR \-r ", " \-\-remove +remove listed devices. They must not be active. i.e. they should +be failed or spare devices. + +As well as the name of a device file +(e.g. +.BR /dev/sda1 ) +the words +.BR failed , +.B detached +and names like +.B set-A +can be given to +.BR \-\-remove . +The first causes all failed devices to be removed. The second causes +any device which is no longer connected to the system (i.e. an 'open' +returns +.BR ENXIO ) +to be removed. +The third will remove a set as described below under +.BR \-\-fail . + +.TP +.BR \-f ", " \-\-fail +Mark listed devices as faulty. 
+As well as the name of a device file, the word +.B detached +or a set name like +.B set\-A +can be given. The former will cause any device that has been detached from +the system to be marked as failed. It can then be removed. + +For RAID10 arrays where the number of copies evenly divides the number +of devices, the devices can be conceptually divided into sets where +each set contains a single complete copy of the data on the array. +Sometimes a RAID10 array will be configured so that these sets are on +separate controllers. In this case all the devices in one set can be +failed by giving a name like +.B set\-A +or +.B set\-B +to +.BR \-\-fail . +The appropriate set names are reported by +.BR \-\-detail . + +.TP +.BR \-\-set\-faulty +same as +.BR \-\-fail . + +.TP +.B \-\-replace +Mark listed devices as requiring replacement. As soon as a spare is +available, it will be rebuilt and will replace the marked device. +This is similar to marking a device as faulty, but the device remains +in service during the recovery process to increase resilience against +multiple failures. When the replacement process finishes, the +replaced device will be marked as faulty. + +.TP +.B \-\-with +This can follow a list of +.B \-\-replace +devices. The devices listed after +.B \-\-with +will be preferentially used to replace the devices listed after +.BR \-\-replace . +These devices must already be spare devices in the array. + +.TP +.BR \-\-write\-mostly +Subsequent devices that are added or re\-added will have the 'write-mostly' +flag set. This is only valid for RAID1 and means that the 'md' driver +will avoid reading from these devices if possible. +.TP +.BR \-\-readwrite +Subsequent devices that are added or re\-added will have the 'write-mostly' +flag cleared. +.TP +.BR \-\-cluster\-confirm +Confirm the existence of the device. This is issued in response to an \-\-add +request by a node in a cluster. 
When a node adds a device it sends a message +to all nodes in the cluster to look for a device with a UUID. This translates +to a udev notification with the UUID of the device to be added and the slot +number. The receiving node must acknowledge this message +with \-\-cluster\-confirm. Valid arguments are <slot>:<devicename> in case +the device is found or <slot>:missing in case the device is not found. + +.TP +.BR \-\-add-journal +Add journal to an existing array, or recreate journal for RAID-4/5/6 array +that lost a journal device. To avoid interrupting on-going write operations, +.B \-\-add-journal +only works for arrays in Read-Only state. + +.TP +.BR \-\-failfast +Subsequent devices that are added or re\-added will have +the 'failfast' flag set. This is only valid for RAID1 and RAID10 and +means that the 'md' driver will avoid long timeouts on error handling +where possible. +.TP +.BR \-\-nofailfast +Subsequent devices that are re\-added will be re\-added without +the 'failfast' flag set. + +.P +Each of these options requires that the first device listed is the array +to be acted upon, and the remainder are component devices to be added, +removed, marked as faulty, etc. Several different operations can be +specified for different devices, e.g. +.in +5 +mdadm /dev/md0 \-\-add /dev/sda1 \-\-fail /dev/sdb1 \-\-remove /dev/sdb1 +.in -5 +Each operation applies to all devices listed until the next +operation. + +If an array is using a write-intent bitmap, then devices which have +been removed can be re\-added in a way that avoids a full +reconstruction but instead just updates the blocks that have changed +since the device was removed. For arrays with persistent metadata +(superblocks) this is done automatically. For arrays created with +.B \-\-build +mdadm needs to be told that this device was removed recently with +.BR \-\-re\-add . + +Devices can only be removed from an array if they are not in active +use, i.e. they must be spares or failed devices. 
To remove an active +device, it must first be marked as +.B faulty. + +.SH For Misc mode: + +.TP +.BR \-Q ", " \-\-query +Examine a device to see +(1) if it is an md device and (2) if it is a component of an md +array. +Information about what is discovered is presented. + +.TP +.BR \-D ", " \-\-detail +Print details of one or more md devices. + +.TP +.BR \-\-detail\-platform +Print details of the platform's RAID capabilities (firmware / hardware +topology) for a given metadata format. If used without argument, mdadm +will scan all controllers looking for their capabilities. Otherwise, mdadm +will only look at the controller specified by the argument in form of an +absolute filepath or a link, e.g. +.IR /sys/devices/pci0000:00/0000:00:1f.2 . + +.TP +.BR \-Y ", " \-\-export +When used with +.BR \-\-detail , +.BR \-\-detail-platform , +.BR \-\-examine , +or +.B \-\-incremental +output will be formatted as +.B key=value +pairs for easy import into the environment. + +With +.B \-\-incremental +The value +.B MD_STARTED +indicates whether an array was started +.RB ( yes ) +or not, which may include a reason +.RB ( unsafe ", " nothing ", " no ). +Also the value +.B MD_FOREIGN +indicates if the array is expected on this host +.RB ( no ), +or seems to be from elsewhere +.RB ( yes ). + +.TP +.BR \-E ", " \-\-examine +Print contents of the metadata stored on the named device(s). +Note the contrast between +.B \-\-examine +and +.BR \-\-detail . +.B \-\-examine +applies to devices which are components of an array, while +.B \-\-detail +applies to a whole array which is currently active. +.TP +.B \-\-sparc2.2 +If an array was created on a SPARC machine with a 2.2 Linux kernel +patched with RAID support, the superblock will have been created +incorrectly, or at least incompatibly with 2.4 and later kernels. +Using the +.B \-\-sparc2.2 +flag with +.B \-\-examine +will fix the superblock before displaying it. 
If this appears to do +the right thing, then the array can be successfully assembled using +.BR "\-\-assemble \-\-update=sparc2.2" . + +.TP +.BR \-X ", " \-\-examine\-bitmap +Report information about a bitmap file. +The argument is either an external bitmap file or an array component +in case of an internal bitmap. Note that running this on an array +device (e.g. +.BR /dev/md0 ) +does not report the bitmap for that array. + +.TP +.B \-\-examine\-badblocks +List the bad-blocks recorded for the device, if a bad-blocks list has +been configured. Currently only +.B 1.x +and +.B IMSM +metadata support bad-blocks lists. + +.TP +.BI \-\-dump= directory +.TP +.BI \-\-restore= directory +Save metadata from listed devices, or restore metadata to listed devices. + +.TP +.BR \-R ", " \-\-run +start a partially assembled array. If +.B \-\-assemble +did not find enough devices to fully start the array, it might leave +it partially assembled. If you wish, you can then use +.B \-\-run +to start the array in degraded mode. + +.TP +.BR \-S ", " \-\-stop +deactivate array, releasing all resources. + +.TP +.BR \-o ", " \-\-readonly +mark array as readonly. + +.TP +.BR \-w ", " \-\-readwrite +mark array as readwrite. + +.TP +.B \-\-zero\-superblock +If the device contains a valid md superblock, the block is +overwritten with zeros. With +.B \-\-force +the block where the superblock would be is overwritten even if it +doesn't appear to be valid. + +.B Note: +Be careful to call \-\-zero\-superblock with clustered raid, make sure +array isn't used or assembled in other cluster node before execute it. + +.TP +.B \-\-kill\-subarray= +If the device is a container and the argument to \-\-kill\-subarray +specifies an inactive subarray in the container, then the subarray is +deleted. Deleting all subarrays will leave an 'empty-container' or +spare superblock on the drives. See +.B \-\-zero\-superblock +for completely +removing a superblock. 
Note that some formats depend on the subarray +index for generating a UUID, this command will fail if it would change +the UUID of an active subarray. + +.TP +.B \-\-update\-subarray= +If the device is a container and the argument to \-\-update\-subarray +specifies a subarray in the container, then attempt to update the given +superblock field in the subarray. See below in +.B MISC MODE +for details. + +.TP +.BR \-t ", " \-\-test +When used with +.BR \-\-detail , +the exit status of +.I mdadm +is set to reflect the status of the device. See below in +.B MISC MODE +for details. + +.TP +.BR \-W ", " \-\-wait +For each md device given, wait for any resync, recovery, or reshape +activity to finish before returning. +.I mdadm +will return with success if it actually waited for every device +listed, otherwise it will return failure. + +.TP +.BR \-\-wait\-clean +For each md device given, or each device in /proc/mdstat if +.B \-\-scan +is given, arrange for the array to be marked clean as soon as possible. +.I mdadm +will return with success if the array uses external metadata and we +successfully waited. For native arrays this returns immediately as the +kernel handles dirty-clean transitions at shutdown. No action is taken +if safe-mode handling is disabled. + +.TP +.B \-\-action= +Set the "sync_action" for all md devices given to one of +.BR idle , +.BR frozen , +.BR check , +.BR repair . +Setting to +.B idle +will abort any currently running action though some actions will +automatically restart. +Setting to +.B frozen +will abort any current action and ensure no other action starts +automatically. + +Details of +.B check +and +.B repair +can be found in +.IR md (4) +under +.BR "SCRUBBING AND MISMATCHES" . + +.SH For Incremental Assembly mode: +.TP +.BR \-\-rebuild\-map ", " \-r +Rebuild the map file +.RB ( {MAP_PATH} ) +that +.I mdadm +uses to help track which arrays are currently being assembled. 
+ +.TP +.BR \-\-run ", " \-R +Run any array assembled as soon as a minimal number of devices are +available, rather than waiting until all expected devices are present. + +.TP +.BR \-\-scan ", " \-s +Only meaningful with +.B \-R +this will scan the +.B map +file for arrays that are being incrementally assembled and will try to +start any that are not already started. If any such array is listed +in +.B mdadm.conf +as requiring an external bitmap, that bitmap will be attached first. + +.TP +.BR \-\-fail ", " \-f +This allows the hot-plug system to remove devices that have fully disappeared +from the kernel. It will first fail and then remove the device from any +array it belongs to. +The device name given should be a kernel device name such as "sda", +not a name in +.IR /dev . + +.TP +.BR \-\-path= +Only used with \-\-fail. The 'path' given will be recorded so that if +a new device appears at the same location it can be automatically +added to the same array. This allows the failed device to be +automatically replaced by a new device without metadata if it appears +at specified path. This option is normally only set by a +.I udev +script. + +.SH For Monitor mode: +.TP +.BR \-m ", " \-\-mail +Give a mail address to send alerts to. + +.TP +.BR \-p ", " \-\-program ", " \-\-alert +Give a program to be run whenever an event is detected. + +.TP +.BR \-y ", " \-\-syslog +Cause all events to be reported through 'syslog'. The messages have +facility of 'daemon' and varying priorities. + +.TP +.BR \-d ", " \-\-delay +Give a delay in seconds. +.I mdadm +polls the md arrays and then waits this many seconds before polling +again. The default is 60 seconds. Since 2.6.16, there is no need to +reduce this as the kernel alerts +.I mdadm +immediately when there is any change. + +.TP +.BR \-r ", " \-\-increment +Give a percentage increment. +.I mdadm +will generate RebuildNN events with the given percentage increment. 
+ +.TP +.BR \-f ", " \-\-daemonise +Tell +.I mdadm +to run as a background daemon if it decides to monitor anything. This +causes it to fork and run in the child, and to disconnect from the +terminal. The process id of the child is written to stdout. +This is useful with +.B \-\-scan +which will only continue monitoring if a mail address or alert program +is found in the config file. + +.TP +.BR \-i ", " \-\-pid\-file +When +.I mdadm +is running in daemon mode, write the pid of the daemon process to +the specified file, instead of printing it on standard output. + +.TP +.BR \-1 ", " \-\-oneshot +Check arrays only once. This will generate +.B NewArray +events and more significantly +.B DegradedArray +and +.B SparesMissing +events. Running +.in +5 +.B " mdadm \-\-monitor \-\-scan \-1" +.in -5 +from a cron script will ensure regular notification of any degraded arrays. + +.TP +.BR \-t ", " \-\-test +Generate a +.B TestMessage +alert for every array found at startup. This alert gets mailed and +passed to the alert program. This can be used for testing that alert +messages do get through successfully. + +.TP +.BR \-\-no\-sharing +This inhibits the functionality for moving spares between arrays. +Only one monitoring process started with +.B \-\-scan +but without this flag is allowed, otherwise the two could interfere +with each other. + +.SH ASSEMBLE MODE + +.HP 12 +Usage: +.B mdadm \-\-assemble +.I md-device options-and-component-devices... +.HP 12 +Usage: +.B mdadm \-\-assemble \-\-scan +.I md-devices-and-options... +.HP 12 +Usage: +.B mdadm \-\-assemble \-\-scan +.I options... + +.PP +This usage assembles one or more RAID arrays from pre-existing components. +For each array, mdadm needs to know the md device, the identity of the +array, and a number of component-devices. These can be found in a number of ways. + +In the first usage example (without the +.BR \-\-scan ) +the first device given is the md device. 
+In the second usage example, all devices listed are treated as md +devices and assembly is attempted. +In the third (where no devices are listed) all md devices that are +listed in the configuration file are assembled. If no arrays are +described by the configuration file, then any arrays that +can be found on unused devices will be assembled. + +If precisely one device is listed, but +.B \-\-scan +is not given, then +.I mdadm +acts as though +.B \-\-scan +was given and identity information is extracted from the configuration file. + +The identity can be given with the +.B \-\-uuid +option, the +.B \-\-name +option, or the +.B \-\-super\-minor +option, will be taken from the md-device record in the config file, or +will be taken from the super block of the first component-device +listed on the command line. + +Devices can be given on the +.B \-\-assemble +command line or in the config file. Only devices which have an md +superblock which contains the right identity will be considered for +any array. + +The config file is only used if explicitly named with +.B \-\-config +or requested with (a possibly implicit) +.BR \-\-scan . +In the latter case, +.B /etc/mdadm.conf +or +.B /etc/mdadm/mdadm.conf +is used. + +If +.B \-\-scan +is not given, then the config file will only be used to find the +identity of md arrays. + +Normally the array will be started after it is assembled. However if +.B \-\-scan +is not given and not all expected drives were listed, then the array +is not started (to guard against usage errors). To insist that the +array be started in this case (as may work for RAID1, 4, 5, 6, or 10), +give the +.B \-\-run +flag. + +If +.I udev +is active, +.I mdadm +does not create any entries in +.B /dev +but leaves that to +.IR udev . +It does record information in +.B {MAP_PATH} +which will allow +.I udev +to choose the correct name. + +If +.I mdadm +detects that udev is not configured, it will create the devices in +.B /dev +itself. 
+ +In Linux kernels prior to version 2.6.28 there were two distinctly +different types of md devices that could be created: one that could be +partitioned using standard partitioning tools and one that could not. +Since 2.6.28 that distinction is no longer relevant as both type of +devices can be partitioned. +.I mdadm +will normally create the type that originally could not be partitioned +as it has a well defined major number (9). + +Prior to 2.6.28, it is important that mdadm chooses the correct type +of array device to use. This can be controlled with the +.B \-\-auto +option. In particular, a value of "mdp" or "part" or "p" tells mdadm +to use a partitionable device rather than the default. + +In the no-udev case, the value given to +.B \-\-auto +can be suffixed by a number. This tells +.I mdadm +to create that number of partition devices rather than the default of 4. + +The value given to +.B \-\-auto +can also be given in the configuration file as a word starting +.B auto= +on the ARRAY line for the relevant array. + +.SS Auto Assembly +When +.B \-\-assemble +is used with +.B \-\-scan +and no devices are listed, +.I mdadm +will first attempt to assemble all the arrays listed in the config +file. + +If no arrays are listed in the config (other than those marked +.BR <ignore> ) +it will look through the available devices for possible arrays and +will try to assemble anything that it finds. Arrays which are tagged +as belonging to the given homehost will be assembled and started +normally. Arrays which do not obviously belong to this host are given +names that are expected not to conflict with anything local, and are +started "read-auto" so that nothing is written to any device until the +array is written to. i.e. automatic resync etc is delayed. 
+ +If +.I mdadm +finds a consistent set of devices that look like they should comprise +an array, and if the superblock is tagged as belonging to the given +home host, it will automatically choose a device name and try to +assemble the array. If the array uses version-0.90 metadata, then the +.B minor +number as recorded in the superblock is used to create a name in +.B /dev/md/ +so for example +.BR /dev/md/3 . +If the array uses version-1 metadata, then the +.B name +from the superblock is used to similarly create a name in +.B /dev/md/ +(the name will have any 'host' prefix stripped first). + +This behaviour can be modified by the +.I AUTO +line in the +.I mdadm.conf +configuration file. This line can indicate that specific metadata +type should, or should not, be automatically assembled. If an array +is found which is not listed in +.I mdadm.conf +and has a metadata format that is denied by the +.I AUTO +line, then it will not be assembled. +The +.I AUTO +line can also request that all arrays identified as being for this +homehost should be assembled regardless of their metadata type. +See +.IR mdadm.conf (5) +for further details. + +Note: Auto assembly cannot be used for assembling and activating some +arrays which are undergoing reshape. In particular as the +.B backup\-file +cannot be given, any reshape which requires a backup-file to continue +cannot be started by auto assembly. An array which is growing to more +devices and has passed the critical section can be assembled using +auto-assembly. + +.SH BUILD MODE + +.HP 12 +Usage: +.B mdadm \-\-build +.I md-device +.BI \-\-chunk= X +.BI \-\-level= Y +.BI \-\-raid\-devices= Z +.I devices + +.PP +This usage is similar to +.BR \-\-create . +The difference is that it creates an array without a superblock. With +these arrays there is no difference between initially creating the array and +subsequently assembling the array, except that hopefully there is useful +data there in the second case. 
+ +The level may be raid0, linear, raid1, raid10, multipath, or faulty, or +one of their synonyms. All devices must be listed and the array will +be started once complete. It will often be appropriate to use +.B \-\-assume\-clean +with levels raid1 or raid10. + +.SH CREATE MODE + +.HP 12 +Usage: +.B mdadm \-\-create +.I md-device +.BI \-\-chunk= X +.BI \-\-level= Y +.br +.BI \-\-raid\-devices= Z +.I devices + +.PP +This usage will initialise a new md array, associate some devices with +it, and activate the array. + +The named device will normally not exist when +.I "mdadm \-\-create" +is run, but will be created by +.I udev +once the array becomes active. + +The max length md-device name is limited to 32 characters. +Different metadata types have more strict limitation +(like IMSM where only 16 characters are allowed). +For that reason, long name could be truncated or rejected, it depends on metadata policy. + +As devices are added, they are checked to see if they contain RAID +superblocks or filesystems. They are also checked to see if the variance in +device size exceeds 1%. + +If any discrepancy is found, the array will not automatically be run, though +the presence of a +.B \-\-run +can override this caution. + +To create a "degraded" array in which some devices are missing, simply +give the word "\fBmissing\fP" +in place of a device name. This will cause +.I mdadm +to leave the corresponding slot in the array empty. +For a RAID4 or RAID5 array at most one slot can be +"\fBmissing\fP"; for a RAID6 array at most two slots. +For a RAID1 array, only one real device needs to be given. All of the +others can be +"\fBmissing\fP". + +When creating a RAID5 array, +.I mdadm +will automatically create a degraded array with an extra spare drive. +This is because building the spare into a degraded array is in general +faster than resyncing the parity on a non-degraded, but not clean, +array. This feature can be overridden with the +.B \-\-force +option. 
+ +When creating an array with version-1 metadata a name for the array is +required. +If this is not given with the +.B \-\-name +option, +.I mdadm +will choose a name based on the last component of the name of the +device being created. So if +.B /dev/md3 +is being created, then the name +.B 3 +will be chosen. +If +.B /dev/md/home +is being created, then the name +.B home +will be used. + +When creating a partition based array, using +.I mdadm +with version-1.x metadata, the partition type should be set to +.B 0xDA +(non fs-data). This type selection allows for greater precision since +using any other [RAID auto-detect (0xFD) or a GNU/Linux partition (0x83)], +might create problems in the event of array recovery through a live cdrom. + +A new array will normally get a randomly assigned 128bit UUID which is +very likely to be unique. If you have a specific need, you can choose +a UUID for the array by giving the +.B \-\-uuid= +option. Be warned that creating two arrays with the same UUID is a +recipe for disaster. Also, using +.B \-\-uuid= +when creating a v0.90 array will silently override any +.B \-\-homehost= +setting. +.\"If the +.\".B \-\-size +.\"option is given, it is not necessary to list any component-devices in this command. +.\"They can be added later, before a +.\".B \-\-run. +.\"If no +.\".B \-\-size +.\"is given, the apparent size of the smallest drive given is used. + +If the array type supports a write-intent bitmap, and if the devices +in the array exceed 100G in size, an internal write-intent bitmap +will automatically be added unless some other option is explicitly +requested with the +.B \-\-bitmap +option or a different consistency policy is selected with the +.B \-\-consistency\-policy +option. In any case space for a bitmap will be reserved so that one +can be added later with +.BR "\-\-grow \-\-bitmap=internal" . + +If the metadata type supports it (currently only 1.x and IMSM metadata), +space will be allocated to store a bad block list. 
This allows a modest +number of bad blocks to be recorded, allowing the drive to remain in +service while only partially functional. + +When creating an array within a +.B CONTAINER +.I mdadm +can be given either the list of devices to use, or simply the name of +the container. The former case gives control over which devices in +the container will be used for the array. The latter case allows +.I mdadm +to automatically choose which devices to use based on how much spare +space is available. + +The General Management options that are valid with +.B \-\-create +are: +.TP +.B \-\-run +insist on running the array even if some devices look like they might +be in use. + +.TP +.B \-\-readonly +start the array in readonly mode. + +.SH MANAGE MODE +.HP 12 +Usage: +.B mdadm +.I device +.I options... devices... +.PP + +This usage will allow individual devices in an array to be failed, +removed or added. It is possible to perform multiple operations with +one command. For example: +.br +.B " mdadm /dev/md0 \-f /dev/hda1 \-r /dev/hda1 \-a /dev/hda1" +.br +will firstly mark +.B /dev/hda1 +as faulty in +.B /dev/md0 +and will then remove it from the array and finally add it back +in as a spare. However only one md array can be affected by a single +command. + +When a device is added to an active array, mdadm checks to see if it +has metadata on it which suggests that it was recently a member of the +array. If it does, it tries to "re\-add" the device. If there have +been no changes since the device was removed, or if the array has a +write-intent bitmap which has recorded whatever changes there were, +then the device will immediately become a full member of the array and +those differences recorded in the bitmap will be resolved. + +.SH MISC MODE +.HP 12 +Usage: +.B mdadm +.I options ... +.I devices ... +.PP + +MISC mode includes a number of distinct operations that +operate on distinct devices. 
The operations are: +.TP +.B \-\-query +The device is examined to see if it is +(1) an active md array, or +(2) a component of an md array. +The information discovered is reported. + +.TP +.B \-\-detail +The device should be an active md device. +.B mdadm +will display a detailed description of the array. +.B \-\-brief +or +.B \-\-scan +will cause the output to be less detailed and the format to be +suitable for inclusion in +.BR mdadm.conf . +The exit status of +.I mdadm +will normally be 0 unless +.I mdadm +failed to get useful information about the device(s); however, if the +.B \-\-test +option is given, then the exit status will be: +.RS +.TP +0 +The array is functioning normally. +.TP +1 +The array has at least one failed device. +.TP +2 +The array has multiple failed devices such that it is unusable. +.TP +4 +There was an error while trying to get information about the device. +.RE + +.TP +.B \-\-detail\-platform +Print detail of the platform's RAID capabilities (firmware / hardware +topology). If the metadata is specified with +.B \-e +or +.B \-\-metadata= +then the return status will be: +.RS +.TP +0 +metadata successfully enumerated its platform components on this system +.TP +1 +metadata is platform independent +.TP +2 +metadata failed to find its platform components on this system +.RE + +.TP +.B \-\-update\-subarray= +If the device is a container and the argument to \-\-update\-subarray +specifies a subarray in the container, then attempt to update the given +superblock field in the subarray. Similar to updating an array in +"assemble" mode, the field to update is selected by +.B \-U +or +.B \-\-update= +option. The supported options are +.BR name , +.BR ppl , +.BR no\-ppl , +.BR bitmap +and +.BR no\-bitmap . + +The +.B name +option updates the subarray name in the metadata, it may not affect the +device node name or the device node symlink until the subarray is +re\-assembled. 
If updating +.B name +would change the UUID of an active subarray this operation is blocked, +and the command will end in an error. + +The +.B ppl +and +.B no\-ppl +options enable and disable PPL in the metadata. Currently supported only for +IMSM subarrays. + +The +.B bitmap +and +.B no\-bitmap +options enable and disable write-intent bitmap in the metadata. Currently supported only for +IMSM subarrays. + +.TP +.B \-\-examine +The device should be a component of an md array. +.I mdadm +will read the md superblock of the device and display the contents. +If +.B \-\-brief +or +.B \-\-scan +is given, then multiple devices that are components of the one array +are grouped together and reported in a single entry suitable +for inclusion in +.BR mdadm.conf . + +Having +.B \-\-scan +without listing any devices will cause all devices listed in the +config file to be examined. + +.TP +.BI \-\-dump= directory +If the device contains RAID metadata, a file will be created in the +.I directory +and the metadata will be written to it. The file will be the same +size as the device and have the metadata written in the file at the +same location that it exists in the device. However the file will be "sparse" so +that only those blocks containing metadata will be allocated. The +total space used will be small. + +The file name used in the +.I directory +will be the base name of the device. Further if any links appear in +.I /dev/disk/by-id +which point to the device, then hard links to the file will be created +in +.I directory +based on these +.I by-id +names. + +Multiple devices can be listed and their metadata will all be stored +in the one directory. + +.TP +.BI \-\-restore= directory +This is the reverse of +.BR \-\-dump . +.I mdadm +will locate a file in the directory that has a name appropriate for +the given device and will restore metadata from it. 
Names that match +.I /dev/disk/by-id +names are preferred, however if two of those refer to different files, +.I mdadm +will not choose between them but will abort the operation. + +If a file name is given instead of a +.I directory +then +.I mdadm +will restore from that file to a single device, always provided the +size of the file matches that of the device, and the file contains +valid metadata. +.TP +.B \-\-stop +The devices should be active md arrays which will be deactivated, as +long as they are not currently in use. + +.TP +.B \-\-run +This will fully activate a partially assembled md array. + +.TP +.B \-\-readonly +This will mark an active array as read-only, providing that it is +not currently being used. + +.TP +.B \-\-readwrite +This will change a +.B readonly +array back to being read/write. + +.TP +.B \-\-scan +For all operations except +.BR \-\-examine , +.B \-\-scan +will cause the operation to be applied to all arrays listed in +.BR /proc/mdstat . +For +.BR \-\-examine, +.B \-\-scan +causes all devices listed in the config file to be examined. + +.TP +.BR \-b ", " \-\-brief +Be less verbose. This is used with +.B \-\-detail +and +.BR \-\-examine . +Using +.B \-\-brief +with +.B \-\-verbose +gives an intermediate level of verbosity. + +.SH MONITOR MODE + +.HP 12 +Usage: +.B mdadm \-\-monitor +.I options... devices... + +.PP +This usage causes +.I mdadm +to periodically poll a number of md arrays and to report on any events +noticed. +.I mdadm +will never exit once it decides that there are arrays to be checked, +so it should normally be run in the background. + +As well as reporting events, +.I mdadm +may move a spare drive from one array to another if they are in the +same +.B spare-group +or +.B domain +and if the destination array has a failed drive but no spares. + +If any devices are listed on the command line, +.I mdadm +will only monitor those devices. Otherwise all arrays listed in the +configuration file will be monitored. 
Further, if +.B \-\-scan +is given, then any other md devices that appear in +.B /proc/mdstat +will also be monitored. + +The result of monitoring the arrays is the generation of events. +These events are passed to a separate program (if specified) and may +be mailed to a given E-mail address. + +When passing events to a program, the program is run once for each event, +and is given 2 or 3 command-line arguments: the first is the +name of the event (see below), the second is the name of the +md device which is affected, and the third is the name of a related +device if relevant (such as a component device that has failed). + +If +.B \-\-scan +is given, then a program or an E-mail address must be specified on the +command line or in the config file. If neither are available, then +.I mdadm +will not monitor anything. +Without +.B \-\-scan, +.I mdadm +will continue monitoring as long as something was found to monitor. If +no program or email is given, then each event is reported to +.BR stdout . + +The different events are: + +.RS 4 +.TP +.B DeviceDisappeared +An md array which previously was configured appears to no longer be +configured. (syslog priority: Critical) + +If +.I mdadm +was told to monitor an array which is RAID0 or Linear, then it will +report +.B DeviceDisappeared +with the extra information +.BR Wrong-Level . +This is because RAID0 and Linear do not support the device-failed, +hot-spare and resync operations which are monitored. + +.TP +.B RebuildStarted +An md array started reconstruction (e.g. recovery, resync, reshape, +check, repair). (syslog priority: Warning) + +.TP +.BI Rebuild NN +Where +.I NN +is a two-digit number (ie. 05, 48). This indicates that rebuild +has passed that many percent of the total. The events are generated +with fixed increment since 0. Increment size may be specified with +a commandline option (default is 20). 
(syslog priority: Warning) + +.TP +.B RebuildFinished +An md array that was rebuilding, isn't any more, either because it +finished normally or was aborted. (syslog priority: Warning) + +.TP +.B Fail +An active component device of an array has been marked as +faulty. (syslog priority: Critical) + +.TP +.B FailSpare +A spare component device which was being rebuilt to replace a faulty +device has failed. (syslog priority: Critical) + +.TP +.B SpareActive +A spare component device which was being rebuilt to replace a faulty +device has been successfully rebuilt and has been made active. +(syslog priority: Info) + +.TP +.B NewArray +A new md array has been detected in the +.B /proc/mdstat +file. (syslog priority: Info) + +.TP +.B DegradedArray +A newly noticed array appears to be degraded. This message is not +generated when +.I mdadm +notices a drive failure which causes degradation, but only when +.I mdadm +notices that an array is degraded when it first sees the array. +(syslog priority: Critical) + +.TP +.B MoveSpare +A spare drive has been moved from one array in a +.B spare-group +or +.B domain +to another to allow a failed drive to be replaced. +(syslog priority: Info) + +.TP +.B SparesMissing +If +.I mdadm +has been told, via the config file, that an array should have a certain +number of spare devices, and +.I mdadm +detects that it has fewer than this number when it first sees the +array, it will report a +.B SparesMissing +message. +(syslog priority: Warning) + +.TP +.B TestMessage +An array was found at startup, and the +.B \-\-test +flag was given. +(syslog priority: Info) +.RE + +Only +.B Fail, +.B FailSpare, +.B DegradedArray, +.B SparesMissing +and +.B TestMessage +cause Email to be sent. All events cause the program to be run. +The program is run with two or three arguments: the event +name, the array device and possibly a second device. + +Each event has an associated array device (e.g. +.BR /dev/md1 ) +and possibly a second device. 
For +.BR Fail , +.BR FailSpare , +and +.B SpareActive +the second device is the relevant component device. +For +.B MoveSpare +the second device is the array that the spare was moved from. + +For +.I mdadm +to move spares from one array to another, the different arrays need to +be labeled with the same +.B spare-group +or the spares must be allowed to migrate through matching POLICY domains +in the configuration file. The +.B spare-group +name can be any string; it is only necessary that different spare +groups use different names. + +When +.I mdadm +detects that an array in a spare group has fewer active +devices than necessary for the complete array, and has no spare +devices, it will look for another array in the same spare group that +has a full complement of working drives and a spare. It will then +attempt to remove the spare from the second array and add it to the +first. +If the removal succeeds but the adding fails, then it is added back to +the original array. + +If the spare group for a degraded array is not defined, +.I mdadm +will look at the rules of spare migration specified by POLICY lines in +.B mdadm.conf +and then follow similar steps as above if a matching spare is found. + +.SH GROW MODE +The GROW mode is used for changing the size or shape of an active +array. +For this to work, the kernel must support the necessary change. +Various types of growth are being added during 2.6 development. + +Currently the supported changes include +.IP \(bu 4 +change the "size" attribute for RAID1, RAID4, RAID5 and RAID6. +.IP \(bu 4 +increase or decrease the "raid\-devices" attribute of RAID0, RAID1, RAID4, +RAID5, and RAID6. +.IP \(bu 4 +change the chunk-size and layout of RAID0, RAID4, RAID5, RAID6 and RAID10. +.IP \(bu 4 +convert between RAID1 and RAID5, between RAID5 and RAID6, between +RAID0, RAID4, and RAID5, and between RAID0 and RAID10 (in the near-2 mode). 
+.IP \(bu 4 +add a write-intent bitmap to any array which supports these bitmaps, or +remove a write-intent bitmap from such an array. +.IP \(bu 4 +change the array's consistency policy. +.PP + +Using GROW on containers is currently supported only for Intel's IMSM +container format. The number of devices in a container can be +increased - which affects all arrays in the container - or an array +in a container can be converted between levels where those levels are +supported by the container, and the conversion is one of those listed +above. + +.PP +Notes: +.IP \(bu 4 +Intel's native checkpointing doesn't use +.B --backup-file +option and it is transparent for assembly feature. +.IP \(bu 4 +Roaming between Windows(R) and Linux systems for IMSM metadata is not +supported during grow process. +.IP \(bu 4 +When growing a raid0 device, the new component disk size (or external +backup size) should be larger than LCM(old, new) * chunk-size * 2, +where LCM() is the least common multiple of the old and new count of +component disks, and "* 2" comes from the fact that mdadm refuses to +use more than half of a spare device for backup space. + +.SS SIZE CHANGES +Normally when an array is built the "size" is taken from the smallest +of the drives. If all the small drives in an array are, one at a +time, removed and replaced with larger drives, then you could have an +array of large drives with only a small amount used. In this +situation, changing the "size" with "GROW" mode will allow the extra +space to start being used. If the size is increased in this way, a +"resync" process will start to make sure the new parts of the array +are synchronised. + +Note that when an array changes size, any filesystem that may be +stored in the array will not automatically grow or shrink to use or +vacate the space. The +filesystem will need to be explicitly told to use the extra space +after growing, or to reduce its size +.B prior +to shrinking the array. 
+ +Also the size of an array cannot be changed while it has an active +bitmap. If an array has a bitmap, it must be removed before the size +can be changed. Once the change is complete a new bitmap can be created. + +.PP +Note: +.B "--grow --size" +is not yet supported for external file bitmap. + +.SS RAID\-DEVICES CHANGES + +A RAID1 array can work with any number of devices from 1 upwards +(though 1 is not very useful). There may be times when you want to +increase or decrease the number of active devices. Note that this is +different to hot-add or hot-remove which changes the number of +inactive devices. + +When reducing the number of devices in a RAID1 array, the slots which +are to be removed from the array must already be vacant. That is, the +devices which were in those slots must be failed and removed. + +When the number of devices is increased, any hot spares that are +present will be activated immediately. + +Changing the number of active devices in a RAID5 or RAID6 is much more +effort. Every block in the array will need to be read and written +back to a new location. From 2.6.17, the Linux Kernel is able to +increase the number of devices in a RAID5 safely, including restarting +an interrupted "reshape". From 2.6.31, the Linux Kernel is able to +increase or decrease the number of devices in a RAID5 or RAID6. + +From 2.6.35, the Linux Kernel is able to convert a RAID0 into a RAID4 +or RAID5. +.I mdadm +uses this functionality and the ability to add +devices to a RAID4 to allow devices to be added to a RAID0. When +requested to do this, +.I mdadm +will convert the RAID0 to a RAID4, add the necessary disks and make +the reshape happen, and then convert the RAID4 back to RAID0. + +When decreasing the number of devices, the size of the array will also +decrease. If there was data in the array, it could get destroyed and +this is not reversible, so you should firstly shrink the filesystem on +the array to fit within the new size. 
To help prevent accidents, +.I mdadm +requires that the size of the array be decreased first with +.BR "mdadm --grow --array-size" . +This is a reversible change which simply makes the end of the array +inaccessible. The integrity of any data can then be checked before +the non-reversible reduction in the number of devices is requested. + +When relocating the first few stripes on a RAID5 or RAID6, it is not +possible to keep the data on disk completely consistent and +crash-proof. To provide the required safety, mdadm disables writes to +the array while this "critical section" is reshaped, and takes a +backup of the data that is in that section. For grows, this backup may be +stored in any spare devices that the array has, however it can also be +stored in a separate file specified with the +.B \-\-backup\-file +option, and is required to be specified for shrinks, RAID level +changes and layout changes. If this option is used, and the system +does crash during the critical period, the same file must be passed to +.B \-\-assemble +to restore the backup and reassemble the array. When shrinking rather +than growing the array, the reshape is done from the end towards the +beginning, so the "critical section" is at the end of the reshape. + +.SS LEVEL CHANGES + +Changing the RAID level of any array happens instantaneously. However +in the RAID5 to RAID6 case this requires a non-standard layout of the +RAID6 data, and in the RAID6 to RAID5 case that non-standard layout is +required before the change can be accomplished. So while the level +change is instant, the accompanying layout change can take quite a +long time. A +.B \-\-backup\-file +is required. If the array is not simultaneously being grown or +shrunk, so that the array size will remain the same - for example, +reshaping a 3-drive RAID5 into a 4-drive RAID6 - the backup file will +be used not just for a "critical section" but throughout the reshape +operation, as described below under LAYOUT CHANGES. 
+ +.SS CHUNK-SIZE AND LAYOUT CHANGES + +Changing the chunk-size or layout without also changing the number of +devices at the same time will involve re-writing all blocks in-place. +To ensure against data loss in the case of a crash, a +.B --backup-file +must be provided for these changes. Small sections of the array will +be copied to the backup file while they are being rearranged. This +means that all the data is copied twice, once to the backup and once +to the new layout on the array, so this type of reshape will go very +slowly. + +If the reshape is interrupted for any reason, this backup file must be +made available to +.B "mdadm --assemble" +so the array can be reassembled. Consequently the file cannot be +stored on the device being reshaped. + + +.SS BITMAP CHANGES + +A write-intent bitmap can be added to, or removed from, an active +array. Either internal bitmaps, or bitmaps stored in a separate file, +can be added. Note that if you add a bitmap stored in a file which is +in a filesystem that is on the RAID array being affected, the system +will deadlock. The bitmap must be on a separate filesystem. + +.SS CONSISTENCY POLICY CHANGES + +The consistency policy of an active array can be changed by using the +.B \-\-consistency\-policy +option in Grow mode. Currently this works only for the +.B ppl +and +.B resync +policies and allows the RAID5 Partial Parity Log (PPL) to be enabled or disabled. + +.SH INCREMENTAL MODE + +.HP 12 +Usage: +.B mdadm \-\-incremental +.RB [ \-\-run ] +.RB [ \-\-quiet ] +.I component-device +.RI [ optional-aliases-for-device ] +.HP 12 +Usage: +.B mdadm \-\-incremental \-\-fail +.I component-device +.HP 12 +Usage: +.B mdadm \-\-incremental \-\-rebuild\-map +.HP 12 +Usage: +.B mdadm \-\-incremental \-\-run \-\-scan + +.PP +This mode is designed to be used in conjunction with a device +discovery system. As devices are found in a system, they can be +passed to +.B "mdadm \-\-incremental" +to be conditionally added to an appropriate array. 
+ +Conversely, it can also be used with the +.B \-\-fail +flag to do just the opposite and find whatever array a particular device +is part of and remove the device from that array. + +If the device passed is a +.B CONTAINER +device created by a previous call to +.IR mdadm , +then rather than trying to add that device to an array, all the arrays +described by the metadata of the container will be started. + +.I mdadm +performs a number of tests to determine if the device is part of an +array, and which array it should be part of. If an appropriate array +is found, or can be created, +.I mdadm +adds the device to the array and conditionally starts the array. + +Note that +.I mdadm +will normally only add devices to an array which were previously working +(active or spare) parts of that array. The support for automatic +inclusion of a new drive as a spare in some array requires +a configuration through POLICY in config file. + +The tests that +.I mdadm +makes are as follows: +.IP + +Is the device permitted by +.BR mdadm.conf ? +That is, is it listed in a +.B DEVICES +line in that file. If +.B DEVICES +is absent then the default is to allow any device. Similarly if +.B DEVICES +contains the special word +.B partitions +then any device is allowed. Otherwise the device name given to +.IR mdadm , +or one of the aliases given, or an alias found in the filesystem, +must match one of the names or patterns in a +.B DEVICES +line. + +This is the only context where the aliases are used. They are +usually provided by a +.I udev +rule mentioning +.BR $env{DEVLINKS} . + +.IP + +Does the device have a valid md superblock? If a specific metadata +version is requested with +.B \-\-metadata +or +.B \-e +then only that style of metadata is accepted, otherwise +.I mdadm +finds any known version of metadata. If no +.I md +metadata is found, the device may be still added to an array +as a spare if POLICY allows. + +.ig +.IP + +Does the metadata match an expected array? 
+The metadata can match in two ways. Either there is an array listed +in +.B mdadm.conf +which identifies the array (either by UUID, by name, by device list, +or by minor-number), or the array was created with a +.B homehost +specified and that +.B homehost +matches the one in +.B mdadm.conf +or on the command line. +If +.I mdadm +is not able to positively identify the array as belonging to the +current host, the device will be rejected. +.. + +.PP +.I mdadm +keeps a list of arrays that it has partially assembled in +.BR {MAP_PATH} . +If no array exists which matches +the metadata on the new device, +.I mdadm +must choose a device name and unit number. It does this based on any +name given in +.B mdadm.conf +or any name information stored in the metadata. If this name +suggests a unit number, that number will be used, otherwise a free +unit number will be chosen. Normally +.I mdadm +will prefer to create a partitionable array, however if the +.B CREATE +line in +.B mdadm.conf +suggests that a non-partitionable array is preferred, that will be +honoured. + +If the array is not found in the config file and its metadata does not +identify it as belonging to the "homehost", then +.I mdadm +will choose a name for the array which is certain not to conflict with +any array which does belong to this host. It does this by adding an +underscore and a small number to the name preferred by the metadata. + +Once an appropriate array is found or created and the device is added, +.I mdadm +must decide if the array is ready to be started. It will +normally compare the number of available (non-spare) devices to the +number of devices that the metadata suggests need to be active. If +there are at least that many, the array will be started. This means +that if any devices are missing the array will not be restarted. 
+ +As an alternative, +.B \-\-run +may be passed to +.I mdadm +in which case the array will be run as soon as there are enough +devices present for the data to be accessible. For a RAID1, that +means one device will start the array. For a clean RAID5, the array +will be started as soon as all but one drive is present. + +Note that neither of these approaches is really ideal. If it can +be known that all device discovery has completed, then +.br +.B " mdadm \-IRs" +.br +can be run which will try to start all arrays that are being +incrementally assembled. They are started in "read-auto" mode in +which they are read-only until the first write request. This means +that no metadata updates are made and no attempt at resync or recovery +happens. Further devices that are found before the first write can +still be added safely. + +.SH ENVIRONMENT +This section describes environment variables that affect how mdadm +operates. + +.TP +.B MDADM_NO_MDMON +Setting this value to 1 will prevent mdadm from automatically launching +mdmon. This variable is intended primarily for debugging mdadm/mdmon. + +.TP +.B MDADM_NO_UDEV +Normally, +.I mdadm +does not create any device nodes in /dev, but leaves that task to +.IR udev . +If +.I udev +appears not to be configured, or if this environment variable is set +to '1', then +.I mdadm +will create any devices that are needed. + +.TP +.B MDADM_NO_SYSTEMCTL +If +.I mdadm +detects that +.I systemd +is in use it will normally request +.I systemd +to start various background tasks (particularly +.IR mdmon ) +rather than forking and running them in the background. This can be +suppressed by setting +.BR MDADM_NO_SYSTEMCTL=1 . + +.TP +.B IMSM_NO_PLATFORM +A key value of IMSM metadata is that it allows interoperability with +boot ROMs on Intel platforms, and with other major operating systems. 
+Consequently, +.I mdadm +will only allow an IMSM array to be created or modified if it detects +that it is running on an Intel platform which supports IMSM, and +supports the particular configuration of IMSM that is being requested +(some functionality requires newer OROM support). + +These checks can be suppressed by setting IMSM_NO_PLATFORM=1 in the +environment. This can be useful for testing or for disaster +recovery. You should be aware that interoperability may be +compromised by setting this value. + +.TP +.B MDADM_GROW_ALLOW_OLD +If an array is stopped while it is performing a reshape and that +reshape was making use of a backup file, then when the array is +re-assembled +.I mdadm +will sometimes complain that the backup file is too old. If this +happens and you are certain it is the right backup file, you can +over-ride this check by setting +.B MDADM_GROW_ALLOW_OLD=1 +in the environment. + +.TP +.B MDADM_CONF_AUTO +Any string given in this variable is added to the start of the +.B AUTO +line in the config file, or treated as the whole +.B AUTO +line if none is given. It can be used to disable certain metadata +types when +.I mdadm +is called from a boot script. For example +.br +.B " export MDADM_CONF_AUTO='-ddf -imsm'" +.br +will make sure that +.I mdadm +does not automatically assemble any DDF or +IMSM arrays that are found. This can be useful on systems configured +to manage such arrays with +.BR dmraid . + + +.SH EXAMPLES + +.B " mdadm \-\-query /dev/name-of-device" +.br +This will find out if a given device is a RAID array, or is part of +one, and will provide brief information about the device. + +.B " mdadm \-\-assemble \-\-scan" +.br +This will assemble and start all arrays listed in the standard config +file. This command will typically go in a system startup file. + +.B " mdadm \-\-stop \-\-scan" +.br +This will shut down all arrays that can be shut down (i.e. are not +currently in use). This will typically go in a system shutdown script. 
+ +.B " mdadm \-\-follow \-\-scan \-\-delay=120" +.br +If (and only if) there is an Email address or program given in the +standard config file, then +monitor the status of all arrays listed in that file by +polling them every 2 minutes. + +.B " mdadm \-\-create /dev/md0 \-\-level=1 \-\-raid\-devices=2 /dev/hd[ac]1" +.br +Create /dev/md0 as a RAID1 array consisting of /dev/hda1 and /dev/hdc1. + +.br +.B " echo 'DEVICE /dev/hd*[0\-9] /dev/sd*[0\-9]' > mdadm.conf" +.br +.B " mdadm \-\-detail \-\-scan >> mdadm.conf" +.br +This will create a prototype config file that describes currently +active arrays that are known to be made from partitions of IDE or SCSI drives. +This file should be reviewed before being used as it may +contain unwanted detail. + +.B " echo 'DEVICE /dev/hd[a\-z] /dev/sd*[a\-z]' > mdadm.conf" +.br +.B " mdadm \-\-examine \-\-scan \-\-config=mdadm.conf >> mdadm.conf" +.br +This will find arrays which could be assembled from existing IDE and +SCSI whole drives (not partitions), and store the information in the +format of a config file. +This file is very likely to contain unwanted detail, particularly +the +.B devices= +entries. It should be reviewed and edited before being used as an +actual config file. + +.B " mdadm \-\-examine \-\-brief \-\-scan \-\-config=partitions" +.br +.B " mdadm \-Ebsc partitions" +.br +Create a list of devices by reading +.BR /proc/partitions , +scan these for RAID superblocks, and print out a brief listing of all +that were found. + +.B " mdadm \-Ac partitions \-m 0 /dev/md0" +.br +Scan all partitions and devices listed in +.BR /proc/partitions +and assemble +.B /dev/md0 +out of all such devices with a RAID superblock with a minor number of 0. + +.B " mdadm \-\-monitor \-\-scan \-\-daemonise > /run/mdadm/mon.pid" +.br +If config file contains a mail address or alert program, run mdadm in +the background in monitor mode monitoring all md devices. Also write +pid of mdadm daemon to +.BR /run/mdadm/mon.pid . 
+ +.B " mdadm \-Iq /dev/somedevice" +.br +Try to incorporate newly discovered device into some array as +appropriate. + +.B " mdadm \-\-incremental \-\-rebuild\-map \-\-run \-\-scan" +.br +Rebuild the array map from any current arrays, and then start any that +can be started. + +.B " mdadm /dev/md4 --fail detached --remove detached" +.br +Any devices which are components of /dev/md4 will be marked as faulty +and then removed from the array. + +.B " mdadm --grow /dev/md4 --level=6 --backup-file=/root/backup-md4" +.br +The array +.B /dev/md4 +which is currently a RAID5 array will be converted to RAID6. There +should normally already be a spare drive attached to the array as a +RAID6 needs one more drive than a matching RAID5. + +.B " mdadm --create /dev/md/ddf --metadata=ddf --raid-disks 6 /dev/sd[a-f]" +.br +Create a DDF array over 6 devices. + +.B " mdadm --create /dev/md/home -n3 -l5 -z 30000000 /dev/md/ddf" +.br +Create a RAID5 array over any 3 devices in the given DDF set. Use +only 30 gigabytes of each device. + +.B " mdadm -A /dev/md/ddf1 /dev/sd[a-f]" +.br +Assemble a pre-existing ddf array. + +.B " mdadm -I /dev/md/ddf1" +.br +Assemble all arrays contained in the ddf array, assigning names as +appropriate. + +.B " mdadm \-\-create \-\-help" +.br +Provide help about the Create mode. + +.B " mdadm \-\-config \-\-help" +.br +Provide help about the format of the config file. + +.B " mdadm \-\-help" +.br +Provide general help. + +.SH FILES + +.SS /proc/mdstat + +If you're using the +.B /proc +filesystem, +.B /proc/mdstat +lists all active md devices with information about them. +.I mdadm +uses this to find arrays when +.B \-\-scan +is given in Misc mode, and to monitor array reconstruction +on Monitor mode. + +.SS /etc/mdadm.conf + +The config file lists which devices may be scanned to see if +they contain MD super block, and gives identifying information +(e.g. UUID) about known MD arrays. See +.BR mdadm.conf (5) +for more details. 
+ +.SS /etc/mdadm.conf.d + +A directory containing configuration files which are read in lexical +order. + +.SS {MAP_PATH} +When +.B \-\-incremental +mode is used, this file gets a list of arrays currently being created. + +.SH DEVICE NAMES + +.I mdadm +understands two sorts of names for array devices. + +The first is the so-called 'standard' format name, which matches the +names used by the kernel and which appear in +.IR /proc/mdstat . + +The second sort can be freely chosen, but must reside in +.IR /dev/md/ . +When giving a device name to +.I mdadm +to create or assemble an array, either a full path name such as +.I /dev/md0 +or +.I /dev/md/home +can be given, or just the suffix of the second sort of name, such as +.I home +can be given. + +When +.I mdadm +chooses device names during auto-assembly or incremental assembly, it +will sometimes add a small sequence number to the end of the name to +avoid conflicts between multiple arrays that have the same name. If +.I mdadm +can reasonably determine that the array really is meant for this host, +either by a hostname in the metadata, or by the presence of the array +in +.BR mdadm.conf , +then it will leave off the suffix if possible. +Also if the homehost is specified as +.B <ignore> +.I mdadm +will only use a suffix if a different array of the same name already +exists or is listed in the config file. + +The standard names for non-partitioned arrays (the only sort of md +array available in 2.4 and earlier) are of the form +.IP +.RB /dev/md NN +.PP +where NN is a number. +The standard names for partitionable arrays (as available from 2.6 +onwards) are of the form: +.IP +.RB /dev/md_d NN +.PP +Partition numbers should be indicated by adding "pMM" to these, thus "/dev/md/d1p2". +.PP +From kernel version 2.6.28 the "non-partitioned array" can actually +be partitioned. So the "md_d\fBNN\fP" +names are no longer needed, and +partitions such as "/dev/md\fBNN\fPp\fBXX\fP" +are possible. 
+.PP +From kernel version 2.6.29 standard names can be non-numeric following +the form: +.IP +.RB /dev/md_ XXX +.PP +where +.B XXX +is any string. These names are supported by +.I mdadm +since version 3.3 provided they are enabled in +.IR mdadm.conf . + +.SH NOTE +.I mdadm +was previously known as +.IR mdctl . + +.SH SEE ALSO +For further information on mdadm usage, MD and the various levels of +RAID, see: +.IP +.B https://raid.wiki.kernel.org/ +.PP +(based upon Jakob \(/Ostergaard's Software\-RAID.HOWTO) +.PP +The latest version of +.I mdadm +should always be available from +.IP +.B https://www.kernel.org/pub/linux/utils/raid/mdadm/ +.PP +Related man pages: +.PP +.IR mdmon (8), +.IR mdadm.conf (5), +.IR md (4). @@ -0,0 +1,2078 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2001-2013 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + * + * Additions for bitmap and write-behind RAID options, Copyright (C) 2003-2004, + * Paul Clements, SteelEye Technology, Inc. 
+ */ + +#include "mdadm.h" +#include "md_p.h" +#include <ctype.h> + +static int scan_assemble(struct supertype *ss, + struct context *c, + struct mddev_ident *ident); +static int misc_scan(char devmode, struct context *c); +static int stop_scan(int verbose); +static int misc_list(struct mddev_dev *devlist, + struct mddev_ident *ident, + char *dump_directory, + struct supertype *ss, struct context *c); +const char Name[] = "mdadm"; + +int main(int argc, char *argv[]) +{ + int mode = 0; + int opt; + int option_index; + int rv; + int i; + + unsigned long long array_size = 0; + unsigned long long data_offset = INVALID_SECTORS; + struct mddev_ident ident; + char *configfile = NULL; + int devmode = 0; + int bitmap_fd = -1; + struct mddev_dev *devlist = NULL; + struct mddev_dev **devlistend = & devlist; + struct mddev_dev *dv; + mdu_array_info_t array; + int devs_found = 0; + char *symlinks = NULL; + int grow_continue = 0; + /* autof indicates whether and how to create device node. + * bottom 3 bits are style. 
Rest (when shifted) are number of parts + * 0 - unset + * 1 - don't create (no) + * 2 - if is_standard, then create (yes) + * 3 - create as 'md' - reject is_standard mdp (md) + * 4 - create as 'mdp' - reject is_standard md (mdp) + * 5 - default to md if not is_standard (md in config file) + * 6 - default to mdp if not is_standard (part, or mdp in config file) + */ + struct context c = { + .require_homehost = 1, + }; + struct shape s = { + .journaldisks = 0, + .level = UnSet, + .layout = UnSet, + .bitmap_chunk = UnSet, + .consistency_policy = CONSISTENCY_POLICY_UNKNOWN, + }; + + char sys_hostname[256]; + char *mailaddr = NULL; + char *program = NULL; + int increments = 20; + int daemonise = 0; + char *pidfile = NULL; + int oneshot = 0; + int spare_sharing = 1; + struct supertype *ss = NULL; + enum flag_mode writemostly = FlagDefault; + enum flag_mode failfast = FlagDefault; + char *shortopt = short_options; + int dosyslog = 0; + int rebuild_map = 0; + char *remove_path = NULL; + char *udev_filename = NULL; + char *dump_directory = NULL; + + int print_help = 0; + FILE *outf; + + int mdfd = -1; + int locked = 0; + + srandom(time(0) ^ getpid()); + + ident.uuid_set = 0; + ident.level = UnSet; + ident.raid_disks = UnSet; + ident.super_minor = UnSet; + ident.devices = 0; + ident.spare_group = NULL; + ident.autof = 0; + ident.st = NULL; + ident.bitmap_fd = -1; + ident.bitmap_file = NULL; + ident.name[0] = 0; + ident.container = NULL; + ident.member = NULL; + + if (get_linux_version() < 2006015) { + pr_err("This version of mdadm does not support kernels older than 2.6.15\n"); + exit(1); + } + + while ((option_index = -1), + (opt = getopt_long(argc, argv, shortopt, long_options, + &option_index)) != -1) { + int newmode = mode; + /* firstly, some mode-independent options */ + switch(opt) { + case HelpOptions: + print_help = 2; + continue; + case 'h': + print_help = 1; + continue; + + case 'V': + fputs(Version, stderr); + exit(0); + + case 'v': c.verbose++; + continue; + + 
case 'q': c.verbose--; + continue; + + case 'b': + if (mode == ASSEMBLE || mode == BUILD || + mode == CREATE || mode == GROW || + mode == INCREMENTAL || mode == MANAGE) + break; /* b means bitmap */ + case Brief: + c.brief = 1; + continue; + + case NoDevices: + c.no_devices = 1; + continue; + + case 'Y': c.export++; + continue; + + case HomeHost: + if (strcasecmp(optarg, "<ignore>") == 0) + c.require_homehost = 0; + else + c.homehost = optarg; + continue; + + case OffRootOpt: + /* Silently ignore old option */ + continue; + + case Prefer: + if (c.prefer) + free(c.prefer); + if (asprintf(&c.prefer, "/%s/", optarg) <= 0) + c.prefer = NULL; + continue; + + case ':': + case '?': + fputs(Usage, stderr); + exit(2); + } + /* second, figure out the mode. + * Some options force the mode. Others + * set the mode if it isn't already + */ + + switch(opt) { + case ManageOpt: + newmode = MANAGE; + shortopt = short_bitmap_options; + break; + case 'a': + case Add: + case AddSpare: + case AddJournal: + case 'r': + case Remove: + case Replace: + case With: + case 'f': + case Fail: + case ReAdd: /* re-add */ + case ClusterConfirm: + if (!mode) { + newmode = MANAGE; + shortopt = short_bitmap_options; + } + break; + + case 'A': newmode = ASSEMBLE; + shortopt = short_bitmap_auto_options; + break; + case 'B': newmode = BUILD; + shortopt = short_bitmap_auto_options; + break; + case 'C': newmode = CREATE; + shortopt = short_bitmap_auto_options; + break; + case 'F': newmode = MONITOR; + break; + case 'G': newmode = GROW; + shortopt = short_bitmap_options; + break; + case 'I': newmode = INCREMENTAL; + shortopt = short_bitmap_auto_options; + break; + case AutoDetect: + newmode = AUTODETECT; + break; + + case MiscOpt: + case 'D': + case 'E': + case 'X': + case 'Q': + case ExamineBB: + case Dump: + case Restore: + case Action: + newmode = MISC; + break; + + case 'R': + case 'S': + case 'o': + case 'w': + case 'W': + case WaitOpt: + case Waitclean: + case DetailPlatform: + case KillSubarray: + 
case UpdateSubarray: + case UdevRules: + case KillOpt: + if (!mode) + newmode = MISC; + break; + + case NoSharing: + newmode = MONITOR; + break; + } + if (mode && newmode == mode) { + /* everybody happy ! */ + } else if (mode && newmode != mode) { + /* not allowed.. */ + pr_err(""); + if (option_index >= 0) + fprintf(stderr, "--%s", long_options[option_index].name); + else + fprintf(stderr, "-%c", opt); + fprintf(stderr, " would set mdadm mode to \"%s\", but it is already set to \"%s\".\n", + map_num(modes, newmode), + map_num(modes, mode)); + exit(2); + } else if (!mode && newmode) { + mode = newmode; + if (mode == MISC && devs_found) { + pr_err("No action given for %s in --misc mode\n", + devlist->devname); + cont_err("Action options must come before device names\n"); + exit(2); + } + } else { + /* special case of -c --help */ + if ((opt == 'c' || opt == ConfigFile) && + (strncmp(optarg, "--h", 3) == 0 || + strncmp(optarg, "-h", 2) == 0)) { + fputs(Help_config, stdout); + exit(0); + } + + /* If first option is a device, don't force the mode yet */ + if (opt == 1) { + if (devs_found == 0) { + dv = xmalloc(sizeof(*dv)); + dv->devname = optarg; + dv->disposition = devmode; + dv->writemostly = writemostly; + dv->failfast = failfast; + dv->used = 0; + dv->next = NULL; + *devlistend = dv; + devlistend = &dv->next; + + devs_found++; + continue; + } + /* No mode yet, and this is the second device ... 
*/ + pr_err("An option must be given to set the mode before a second device\n" + " (%s) is listed\n", optarg); + exit(2); + } + if (option_index >= 0) + pr_err("--%s", long_options[option_index].name); + else + pr_err("-%c", opt); + fprintf(stderr, " does not set the mode, and so cannot be the first option.\n"); + exit(2); + } + + /* if we just set the mode, then done */ + switch(opt) { + case ManageOpt: + case MiscOpt: + case 'A': + case 'B': + case 'C': + case 'F': + case 'G': + case 'I': + case AutoDetect: + continue; + } + if (opt == 1) { + /* an undecorated option - must be a device name. + */ + + if (devs_found > 0 && devmode == DetailPlatform) { + pr_err("controller may only be specified once. %s ignored\n", + optarg); + continue; + } + + if (devs_found > 0 && mode == MANAGE && !devmode) { + pr_err("Must give one of -a/-r/-f for subsequent devices at %s\n", optarg); + exit(2); + } + if (devs_found > 0 && mode == GROW && !devmode) { + pr_err("Must give -a/--add for devices to add: %s\n", optarg); + exit(2); + } + dv = xmalloc(sizeof(*dv)); + dv->devname = optarg; + dv->disposition = devmode; + dv->writemostly = writemostly; + dv->failfast = failfast; + dv->used = 0; + dv->next = NULL; + *devlistend = dv; + devlistend = &dv->next; + + devs_found++; + continue; + } + + /* We've got a mode, and opt is now something else which + * could depend on the mode */ +#define O(a,b) ((a<<16)|b) + switch (O(mode,opt)) { + case O(GROW,'c'): + case O(GROW,ChunkSize): + case O(CREATE,'c'): + case O(CREATE,ChunkSize): + case O(BUILD,'c'): /* chunk or rounding */ + case O(BUILD,ChunkSize): /* chunk or rounding */ + if (s.chunk) { + pr_err("chunk/rounding may only be specified once. 
Second value is %s.\n", optarg); + exit(2); + } + s.chunk = parse_size(optarg); + if (s.chunk == INVALID_SECTORS || + s.chunk < 8 || (s.chunk&1)) { + pr_err("invalid chunk/rounding value: %s\n", + optarg); + exit(2); + } + /* Convert sectors to K */ + s.chunk /= 2; + continue; + + case O(INCREMENTAL, 'e'): + case O(CREATE,'e'): + case O(ASSEMBLE,'e'): + case O(MISC,'e'): /* set metadata (superblock) information */ + if (ss) { + pr_err("metadata information already given\n"); + exit(2); + } + for(i = 0; !ss && superlist[i]; i++) + ss = superlist[i]->match_metadata_desc(optarg); + + if (!ss) { + pr_err("unrecognised metadata identifier: %s\n", optarg); + exit(2); + } + continue; + + case O(MANAGE,'W'): + case O(MANAGE,WriteMostly): + case O(BUILD,'W'): + case O(BUILD,WriteMostly): + case O(CREATE,'W'): + case O(CREATE,WriteMostly): + /* set write-mostly for following devices */ + writemostly = FlagSet; + continue; + + case O(MANAGE,'w'): + /* clear write-mostly for following devices */ + writemostly = FlagClear; + continue; + + case O(MANAGE,FailFast): + case O(CREATE,FailFast): + failfast = FlagSet; + continue; + case O(MANAGE,NoFailFast): + failfast = FlagClear; + continue; + + case O(GROW,'z'): + case O(CREATE,'z'): + case O(BUILD,'z'): /* size */ + if (s.size > 0) { + pr_err("size may only be specified once. Second value is %s.\n", optarg); + exit(2); + } + if (strcmp(optarg, "max") == 0) + s.size = MAX_SIZE; + else { + s.size = parse_size(optarg); + if (s.size == INVALID_SECTORS || s.size < 8) { + pr_err("invalid size: %s\n", optarg); + exit(2); + } + /* convert sectors to K */ + s.size /= 2; + } + continue; + + case O(GROW,'Z'): /* array size */ + if (array_size > 0) { + pr_err("array-size may only be specified once. 
Second value is %s.\n", optarg); + exit(2); + } + if (strcmp(optarg, "max") == 0) + array_size = MAX_SIZE; + else { + array_size = parse_size(optarg); + if (array_size == 0 || + array_size == INVALID_SECTORS) { + pr_err("invalid array size: %s\n", + optarg); + exit(2); + } + } + continue; + + case O(CREATE,DataOffset): + case O(GROW,DataOffset): + if (data_offset != INVALID_SECTORS) { + pr_err("data-offset may only be specified one. Second value is %s.\n", optarg); + exit(2); + } + if (mode == CREATE && strcmp(optarg, "variable") == 0) + data_offset = VARIABLE_OFFSET; + else + data_offset = parse_size(optarg); + if (data_offset == INVALID_SECTORS) { + pr_err("invalid data-offset: %s\n", + optarg); + exit(2); + } + continue; + + case O(GROW,'l'): + case O(CREATE,'l'): + case O(BUILD,'l'): /* set raid level*/ + if (s.level != UnSet) { + pr_err("raid level may only be set once. Second value is %s.\n", optarg); + exit(2); + } + s.level = map_name(pers, optarg); + if (s.level == UnSet) { + pr_err("invalid raid level: %s\n", + optarg); + exit(2); + } + if (s.level != 0 && s.level != LEVEL_LINEAR && + s.level != 1 && s.level != LEVEL_MULTIPATH && + s.level != LEVEL_FAULTY && s.level != 10 && + mode == BUILD) { + pr_err("Raid level %s not permitted with --build.\n", + optarg); + exit(2); + } + if (s.sparedisks > 0 && s.level < 1 && s.level >= -1) { + pr_err("raid level %s is incompatible with spare-devices setting.\n", + optarg); + exit(2); + } + ident.level = s.level; + continue; + + case O(GROW, 'p'): /* new layout */ + case O(GROW, Layout): + if (s.layout_str) { + pr_err("layout may only be sent once. Second value was %s\n", optarg); + exit(2); + } + s.layout_str = optarg; + /* 'Grow' will parse the value */ + continue; + + case O(CREATE,'p'): /* raid5 layout */ + case O(CREATE,Layout): + case O(BUILD,'p'): /* faulty layout */ + case O(BUILD,Layout): + if (s.layout != UnSet) { + pr_err("layout may only be sent once. 
Second value was %s\n", optarg); + exit(2); + } + switch(s.level) { + default: + pr_err("layout not meaningful for %s arrays.\n", + map_num(pers, s.level)); + exit(2); + case UnSet: + pr_err("raid level must be given before layout.\n"); + exit(2); + + case 0: + s.layout = map_name(r0layout, optarg); + if (s.layout == UnSet) { + pr_err("layout %s not understood for raid0.\n", + optarg); + exit(2); + } + break; + case 5: + s.layout = map_name(r5layout, optarg); + if (s.layout == UnSet) { + pr_err("layout %s not understood for raid5.\n", + optarg); + exit(2); + } + break; + case 6: + s.layout = map_name(r6layout, optarg); + if (s.layout == UnSet) { + pr_err("layout %s not understood for raid6.\n", + optarg); + exit(2); + } + break; + + case 10: + s.layout = parse_layout_10(optarg); + if (s.layout < 0) { + pr_err("layout for raid10 must be 'nNN', 'oNN' or 'fNN' where NN is a number, not %s\n", optarg); + exit(2); + } + break; + case LEVEL_FAULTY: + /* Faulty + * modeNNN + */ + s.layout = parse_layout_faulty(optarg); + if (s.layout == -1) { + pr_err("layout %s not understood for faulty.\n", + optarg); + exit(2); + } + break; + } + continue; + + case O(CREATE,AssumeClean): + case O(BUILD,AssumeClean): /* assume clean */ + case O(GROW,AssumeClean): + s.assume_clean = 1; + continue; + + case O(GROW,'n'): + case O(CREATE,'n'): + case O(BUILD,'n'): /* number of raid disks */ + if (s.raiddisks) { + pr_err("raid-devices set twice: %d and %s\n", + s.raiddisks, optarg); + exit(2); + } + if (parse_num(&s.raiddisks, optarg) != 0 || s.raiddisks <= 0) { + pr_err("invalid number of raid devices: %s\n", + optarg); + exit(2); + } + ident.raid_disks = s.raiddisks; + continue; + case O(ASSEMBLE, Nodes): + case O(GROW, Nodes): + case O(CREATE, Nodes): + if (parse_num(&c.nodes, optarg) != 0 || c.nodes < 2) { + pr_err("clustered array needs two nodes at least: %s\n", + optarg); + exit(2); + } + continue; + case O(CREATE, ClusterName): + case O(ASSEMBLE, ClusterName): + c.homecluster = 
optarg; + if (strlen(c.homecluster) > 64) { + pr_err("Cluster name too big.\n"); + exit(2); + } + continue; + case O(CREATE,'x'): /* number of spare (eXtra) disks */ + if (s.sparedisks) { + pr_err("spare-devices set twice: %d and %s\n", + s.sparedisks, optarg); + exit(2); + } + if (s.level != UnSet && s.level <= 0 && s.level >= -1) { + pr_err("spare-devices setting is incompatible with raid level %d\n", + s.level); + exit(2); + } + if (parse_num(&s.sparedisks, optarg) != 0 || s.sparedisks < 0) { + pr_err("invalid number of spare-devices: %s\n", + optarg); + exit(2); + } + continue; + + case O(CREATE,'a'): + case O(CREATE,Auto): + case O(BUILD,'a'): + case O(BUILD,Auto): + case O(INCREMENTAL,'a'): + case O(INCREMENTAL,Auto): + case O(ASSEMBLE,'a'): + case O(ASSEMBLE,Auto): /* auto-creation of device node */ + c.autof = parse_auto(optarg, "--auto flag", 0); + continue; + + case O(CREATE,Symlinks): + case O(BUILD,Symlinks): + case O(ASSEMBLE,Symlinks): /* auto creation of symlinks in /dev to /dev/md */ + symlinks = optarg; + continue; + + case O(BUILD,'f'): /* force honouring '-n 1' */ + case O(BUILD,Force): /* force honouring '-n 1' */ + case O(GROW,'f'): /* ditto */ + case O(GROW,Force): /* ditto */ + case O(CREATE,'f'): /* force honouring of device list */ + case O(CREATE,Force): /* force honouring of device list */ + case O(ASSEMBLE,'f'): /* force assembly */ + case O(ASSEMBLE,Force): /* force assembly */ + case O(MISC,'f'): /* force zero */ + case O(MISC,Force): /* force zero */ + case O(MANAGE,Force): /* add device which is too large */ + c.force = 1; + continue; + /* now for the Assemble options */ + case O(ASSEMBLE, FreezeReshape): /* Freeze reshape during + * initrd phase */ + case O(INCREMENTAL, FreezeReshape): + c.freeze_reshape = 1; + continue; + case O(CREATE,'u'): /* uuid of array */ + case O(ASSEMBLE,'u'): /* uuid of array */ + if (ident.uuid_set) { + pr_err("uuid cannot be set twice. 
Second value %s.\n", optarg); + exit(2); + } + if (parse_uuid(optarg, ident.uuid)) + ident.uuid_set = 1; + else { + pr_err("Bad uuid: %s\n", optarg); + exit(2); + } + continue; + + case O(CREATE,'N'): + case O(ASSEMBLE,'N'): + case O(MISC,'N'): + if (ident.name[0]) { + pr_err("name cannot be set twice. Second value %s.\n", optarg); + exit(2); + } + if (mode == MISC && !c.subarray) { + pr_err("-N/--name only valid with --update-subarray in misc mode\n"); + exit(2); + } + if (strlen(optarg) > 32) { + pr_err("name '%s' is too long, 32 chars max.\n", + optarg); + exit(2); + } + strcpy(ident.name, optarg); + continue; + + case O(ASSEMBLE,'m'): /* super-minor for array */ + case O(ASSEMBLE,SuperMinor): + if (ident.super_minor != UnSet) { + pr_err("super-minor cannot be set twice. Second value: %s.\n", optarg); + exit(2); + } + if (strcmp(optarg, "dev") == 0) + ident.super_minor = -2; + else if (parse_num(&ident.super_minor, optarg) != 0 || ident.super_minor < 0) { + pr_err("Bad super-minor number: %s.\n", optarg); + exit(2); + } + continue; + + case O(ASSEMBLE,'o'): + case O(MANAGE,'o'): + case O(CREATE,'o'): + c.readonly = 1; + continue; + + case O(ASSEMBLE,'U'): /* update the superblock */ + case O(MISC,'U'): + if (c.update) { + pr_err("Can only update one aspect of superblock, both %s and %s given.\n", + c.update, optarg); + exit(2); + } + if (mode == MISC && !c.subarray) { + pr_err("Only subarrays can be updated in misc mode\n"); + exit(2); + } + c.update = optarg; + if (strcmp(c.update, "sparc2.2") == 0) + continue; + if (strcmp(c.update, "super-minor") == 0) + continue; + if (strcmp(c.update, "summaries") == 0) + continue; + if (strcmp(c.update, "resync") == 0) + continue; + if (strcmp(c.update, "uuid") == 0) + continue; + if (strcmp(c.update, "name") == 0) + continue; + if (strcmp(c.update, "homehost") == 0) + continue; + if (strcmp(c.update, "home-cluster") == 0) + continue; + if (strcmp(c.update, "nodes") == 0) + continue; + if (strcmp(c.update, "devicesize") == 
0) + continue; + if (strcmp(c.update, "bitmap") == 0) + continue; + if (strcmp(c.update, "no-bitmap") == 0) + continue; + if (strcmp(c.update, "bbl") == 0) + continue; + if (strcmp(c.update, "no-bbl") == 0) + continue; + if (strcmp(c.update, "force-no-bbl") == 0) + continue; + if (strcmp(c.update, "ppl") == 0) + continue; + if (strcmp(c.update, "no-ppl") == 0) + continue; + if (strcmp(c.update, "metadata") == 0) + continue; + if (strcmp(c.update, "revert-reshape") == 0) + continue; + if (strcmp(c.update, "layout-original") == 0 || + strcmp(c.update, "layout-alternate") == 0 || + strcmp(c.update, "layout-unspecified") == 0) + continue; + if (strcmp(c.update, "byteorder") == 0) { + if (ss) { + pr_err("must not set metadata type with --update=byteorder.\n"); + exit(2); + } + for(i = 0; !ss && superlist[i]; i++) + ss = superlist[i]->match_metadata_desc( + "0.swap"); + if (!ss) { + pr_err("INTERNAL ERROR cannot find 0.swap\n"); + exit(2); + } + + continue; + } + if (strcmp(c.update,"?") == 0 || + strcmp(c.update, "help") == 0) { + outf = stdout; + fprintf(outf, "%s: ", Name); + } else { + outf = stderr; + fprintf(outf, + "%s: '--update=%s' is invalid. ", + Name, c.update); + } + fprintf(outf, "Valid --update options are:\n" + " 'sparc2.2', 'super-minor', 'uuid', 'name', 'nodes', 'resync',\n" + " 'summaries', 'homehost', 'home-cluster', 'byteorder', 'devicesize',\n" + " 'bitmap', 'no-bitmap', 'metadata', 'revert-reshape'\n" + " 'bbl', 'no-bbl', 'force-no-bbl', 'ppl', 'no-ppl'\n" + " 'layout-original', 'layout-alternate', 'layout-unspecified'\n" + ); + exit(outf == stdout ? 
0 : 2); + + case O(MANAGE,'U'): + /* update=devicesize is allowed with --re-add */ + if (devmode != 'A') { + pr_err("--update in Manage mode only allowed with --re-add.\n"); + exit(1); + } + if (c.update) { + pr_err("Can only update one aspect of superblock, both %s and %s given.\n", + c.update, optarg); + exit(2); + } + c.update = optarg; + if (strcmp(c.update, "devicesize") != 0 && + strcmp(c.update, "bbl") != 0 && + strcmp(c.update, "force-no-bbl") != 0 && + strcmp(c.update, "no-bbl") != 0) { + pr_err("only 'devicesize', 'bbl', 'no-bbl', and 'force-no-bbl' can be updated with --re-add\n"); + exit(2); + } + continue; + + case O(INCREMENTAL,NoDegraded): + pr_err("--no-degraded is deprecated in Incremental mode\n"); + case O(ASSEMBLE,NoDegraded): /* --no-degraded */ + c.runstop = -1; /* --stop isn't allowed for --assemble, + * so we overload slightly */ + continue; + + case O(ASSEMBLE,'c'): + case O(ASSEMBLE,ConfigFile): + case O(INCREMENTAL, 'c'): + case O(INCREMENTAL, ConfigFile): + case O(MISC, 'c'): + case O(MISC, ConfigFile): + case O(MONITOR,'c'): + case O(MONITOR,ConfigFile): + case O(CREATE,ConfigFile): + if (configfile) { + pr_err("configfile cannot be set twice. Second value is %s.\n", optarg); + exit(2); + } + configfile = optarg; + set_conffile(configfile); + /* FIXME possibly check that config file exists. Even parse it */ + continue; + case O(ASSEMBLE,'s'): /* scan */ + case O(MISC,'s'): + case O(MONITOR,'s'): + case O(INCREMENTAL,'s'): + c.scan = 1; + continue; + + case O(MONITOR,'m'): /* mail address */ + case O(MONITOR,EMail): + if (mailaddr) + pr_err("only specify one mailaddress. %s ignored.\n", + optarg); + else + mailaddr = optarg; + continue; + + case O(MONITOR,'p'): /* alert program */ + case O(MONITOR,ProgramOpt): /* alert program */ + if (program) + pr_err("only specify one alter program. 
%s ignored.\n", + optarg); + else + program = optarg; + continue; + + case O(MONITOR,'r'): /* rebuild increments */ + case O(MONITOR,Increment): + if (parse_num(&increments, optarg) != 0 + || increments > 99 || increments < 1) { + pr_err("please specify positive integer between 1 and 99 as rebuild increments.\n"); + exit(2); + } + continue; + + case O(MONITOR,'d'): /* delay in seconds */ + case O(GROW, 'd'): + case O(BUILD,'d'): /* delay for bitmap updates */ + case O(CREATE,'d'): + if (c.delay) + pr_err("only specify delay once. %s ignored.\n", optarg); + else if (parse_num(&c.delay, optarg) != 0 || c.delay < 1) { + pr_err("invalid delay: %s\n", optarg); + exit(2); + } + continue; + case O(MONITOR,'f'): /* daemonise */ + case O(MONITOR,Fork): + daemonise = 1; + continue; + case O(MONITOR,'i'): /* pid */ + if (pidfile) + pr_err("only specify one pid file. %s ignored.\n", + optarg); + else + pidfile = optarg; + continue; + case O(MONITOR,'1'): /* oneshot */ + oneshot = 1; + spare_sharing = 0; + continue; + case O(MONITOR,'t'): /* test */ + c.test = 1; + continue; + case O(MONITOR,'y'): /* log messages to syslog */ + openlog("mdadm", LOG_PID, SYSLOG_FACILITY); + dosyslog = 1; + continue; + case O(MONITOR, NoSharing): + spare_sharing = 0; + continue; + + /* now the general management options. Some are applicable + * to other modes. None have arguments. 
+ */ + case O(GROW,'a'): + case O(GROW,Add): + case O(MANAGE,'a'): + case O(MANAGE,Add): /* add a drive */ + devmode = 'a'; + continue; + case O(MANAGE,AddSpare): /* add drive - never re-add */ + devmode = 'S'; + continue; + case O(MANAGE,AddJournal): /* add journal */ + if (s.journaldisks && (s.level < 4 || s.level > 6)) { + pr_err("--add-journal is only supported for RAID level 4/5/6.\n"); + exit(2); + } + devmode = 'j'; + continue; + case O(MANAGE,ReAdd): + devmode = 'A'; + continue; + case O(MANAGE,'r'): /* remove a drive */ + case O(MANAGE,Remove): + devmode = 'r'; + continue; + case O(MANAGE,'f'): /* set faulty */ + case O(MANAGE,Fail): + case O(INCREMENTAL,'f'): + case O(INCREMENTAL,Remove): + case O(INCREMENTAL,Fail): /* r for incremental is taken, use f + * even though we will both fail and + * remove the device */ + devmode = 'f'; + continue; + case O(MANAGE, ClusterConfirm): + devmode = 'c'; + continue; + case O(MANAGE,Replace): + /* Mark these devices for replacement */ + devmode = 'R'; + continue; + case O(MANAGE,With): + /* These are the replacements to use */ + if (devmode != 'R') { + pr_err("--with must follow --replace\n"); + exit(2); + } + devmode = 'W'; + continue; + case O(INCREMENTAL,'R'): + case O(MANAGE,'R'): + case O(ASSEMBLE,'R'): + case O(BUILD,'R'): + case O(CREATE,'R'): /* Run the array */ + if (c.runstop < 0) { + pr_err("Cannot both Stop and Run an array\n"); + exit(2); + } + c.runstop = 1; + continue; + case O(MANAGE,'S'): + if (c.runstop > 0) { + pr_err("Cannot both Run and Stop an array\n"); + exit(2); + } + c.runstop = -1; + continue; + case O(MANAGE,'t'): + c.test = 1; + continue; + + case O(MISC,'Q'): + case O(MISC,'D'): + case O(MISC,'E'): + case O(MISC,KillOpt): + case O(MISC,'R'): + case O(MISC,'S'): + case O(MISC,'X'): + case O(MISC, ExamineBB): + case O(MISC,'o'): + case O(MISC,'w'): + case O(MISC,'W'): + case O(MISC, WaitOpt): + case O(MISC, Waitclean): + case O(MISC, DetailPlatform): + case O(MISC, KillSubarray): + case 
O(MISC, UpdateSubarray): + case O(MISC, Dump): + case O(MISC, Restore): + case O(MISC ,Action): + if (opt == KillSubarray || opt == UpdateSubarray) { + if (c.subarray) { + pr_err("subarray can only be specified once\n"); + exit(2); + } + c.subarray = optarg; + } + if (opt == Action) { + if (c.action) { + pr_err("Only one --action can be specified\n"); + exit(2); + } + if (strcmp(optarg, "idle") == 0 || + strcmp(optarg, "frozen") == 0 || + strcmp(optarg, "check") == 0 || + strcmp(optarg, "repair") == 0) + c.action = optarg; + else { + pr_err("action must be one of idle, frozen, check, repair\n"); + exit(2); + } + } + if (devmode && devmode != opt && + (devmode == 'E' || + (opt == 'E' && devmode != 'Q'))) { + pr_err("--examine/-E cannot be given with "); + if (devmode == 'E') { + if (option_index >= 0) + fprintf(stderr, "--%s\n", + long_options[option_index].name); + else + fprintf(stderr, "-%c\n", opt); + } else if (isalpha(devmode)) + fprintf(stderr, "-%c\n", devmode); + else + fprintf(stderr, "previous option\n"); + exit(2); + } + devmode = opt; + if (opt == Dump || opt == Restore) { + if (dump_directory != NULL) { + pr_err("dump/restore directory specified twice: %s and %s\n", + dump_directory, optarg); + exit(2); + } + dump_directory = optarg; + } + continue; + case O(MISC, UdevRules): + if (devmode && devmode != opt) { + pr_err("--udev-rules must be the only option.\n"); + } else { + if (udev_filename) + pr_err("only specify one udev rule filename. 
%s ignored.\n", + optarg); + else + udev_filename = optarg; + } + devmode = opt; + continue; + case O(MISC,'t'): + c.test = 1; + continue; + + case O(MISC, Sparc22): + if (devmode != 'E') { + pr_err("--sparc2.2 only allowed with --examine\n"); + exit(2); + } + c.SparcAdjust = 1; + continue; + + case O(ASSEMBLE,'b'): /* here we simply set the bitmap file */ + case O(ASSEMBLE,Bitmap): + if (!optarg) { + pr_err("bitmap file needed with -b in --assemble mode\n"); + exit(2); + } + if (strcmp(optarg, "internal") == 0 || + strcmp(optarg, "clustered") == 0) { + pr_err("no need to specify --bitmap when assembling" + " arrays with internal or clustered bitmap\n"); + continue; + } + bitmap_fd = open(optarg, O_RDWR); + if (!*optarg || bitmap_fd < 0) { + pr_err("cannot open bitmap file %s: %s\n", optarg, strerror(errno)); + exit(2); + } + ident.bitmap_fd = bitmap_fd; /* for Assemble */ + continue; + + case O(ASSEMBLE, BackupFile): + case O(GROW, BackupFile): + /* Specify a file into which grow might place a backup, + * or from which assemble might recover a backup + */ + if (c.backup_file) { + pr_err("backup file already specified, rejecting %s\n", optarg); + exit(2); + } + c.backup_file = optarg; + continue; + + case O(GROW, Continue): + /* Continue interrupted grow + */ + grow_continue = 1; + continue; + case O(ASSEMBLE, InvalidBackup): + /* Acknowledge that the backupfile is invalid, but ask + * to continue anyway + */ + c.invalid_backup = 1; + continue; + + case O(BUILD,'b'): + case O(BUILD,Bitmap): + case O(CREATE,'b'): + case O(CREATE,Bitmap): /* here we create the bitmap */ + case O(GROW,'b'): + case O(GROW,Bitmap): + if (s.bitmap_file) { + pr_err("bitmap cannot be set twice. 
Second value: %s.\n", optarg); + exit(2); + } + if (strcmp(optarg, "internal") == 0 || + strcmp(optarg, "none") == 0 || + strchr(optarg, '/') != NULL) { + s.bitmap_file = optarg; + continue; + } + if (strcmp(optarg, "clustered") == 0) { + s.bitmap_file = optarg; + /* Set the default number of cluster nodes + * to 4 if not already set by user + */ + if (c.nodes < 1) + c.nodes = 4; + continue; + } + /* probable typo */ + pr_err("bitmap file must contain a '/', or be 'internal', or be 'clustered', or 'none'\n" + " not '%s'\n", optarg); + exit(2); + + case O(GROW,BitmapChunk): + case O(BUILD,BitmapChunk): + case O(CREATE,BitmapChunk): /* bitmap chunksize */ + s.bitmap_chunk = parse_size(optarg); + if (s.bitmap_chunk == 0 || + s.bitmap_chunk == INVALID_SECTORS || + s.bitmap_chunk & (s.bitmap_chunk - 1)) { + pr_err("invalid bitmap chunksize: %s\n", + optarg); + exit(2); + } + s.bitmap_chunk = s.bitmap_chunk * 512; + continue; + + case O(GROW, WriteBehind): + case O(BUILD, WriteBehind): + case O(CREATE, WriteBehind): + s.write_behind = DEFAULT_MAX_WRITE_BEHIND; + if (parse_num(&s.write_behind, optarg) != 0 || + s.write_behind < 0 || s.write_behind > 16383) { + pr_err("Invalid value for maximum outstanding write-behind writes: %s.\n\tMust be between 0 and 16383.\n", + optarg); + exit(2); + } + continue; + case O(INCREMENTAL, 'r'): + case O(INCREMENTAL, RebuildMapOpt): + rebuild_map = 1; + continue; + case O(INCREMENTAL, IncrementalPath): + remove_path = optarg; + continue; + case O(CREATE, WriteJournal): + if (s.journaldisks) { + pr_err("Please specify only one journal device for the array.\n"); + pr_err("Ignoring --write-journal %s...\n", optarg); + continue; + } + dv = xmalloc(sizeof(*dv)); + dv->devname = optarg; + dv->disposition = 'j'; /* WriteJournal */ + dv->used = 0; + dv->next = NULL; + *devlistend = dv; + devlistend = &dv->next; + devs_found++; + + s.journaldisks = 1; + continue; + case O(CREATE, 'k'): + case O(GROW, 'k'): + s.consistency_policy = 
map_name(consistency_policies, + optarg); + if (s.consistency_policy < CONSISTENCY_POLICY_RESYNC) { + pr_err("Invalid consistency policy: %s\n", + optarg); + exit(2); + } + continue; + } + /* We have now processed all the valid options. Anything else is + * an error + */ + if (option_index > 0) + pr_err(":option --%s not valid in %s mode\n", + long_options[option_index].name, + map_num(modes, mode)); + else + pr_err("option -%c not valid in %s mode\n", + opt, map_num(modes, mode)); + exit(2); + + } + + if (print_help) { + char *help_text; + if (print_help == 2) + help_text = OptionHelp; + else + help_text = mode_help[mode]; + if (help_text == NULL) + help_text = Help; + fputs(help_text,stdout); + exit(0); + } + + if (s.journaldisks) { + if (s.level < 4 || s.level > 6) { + pr_err("--write-journal is only supported for RAID level 4/5/6.\n"); + exit(2); + } + if (s.consistency_policy != CONSISTENCY_POLICY_UNKNOWN && + s.consistency_policy != CONSISTENCY_POLICY_JOURNAL) { + pr_err("--write-journal is not supported with consistency policy: %s\n", + map_num(consistency_policies, s.consistency_policy)); + exit(2); + } + } + + if (mode == CREATE && + s.consistency_policy != CONSISTENCY_POLICY_UNKNOWN) { + if (s.level <= 0) { + pr_err("--consistency-policy not meaningful with level %s.\n", + map_num(pers, s.level)); + exit(2); + } else if (s.consistency_policy == CONSISTENCY_POLICY_JOURNAL && + !s.journaldisks) { + pr_err("--write-journal is required for consistency policy: %s\n", + map_num(consistency_policies, s.consistency_policy)); + exit(2); + } else if (s.consistency_policy == CONSISTENCY_POLICY_PPL && + s.level != 5) { + pr_err("PPL consistency policy is only supported for RAID level 5.\n"); + exit(2); + } else if (s.consistency_policy == CONSISTENCY_POLICY_BITMAP && + (!s.bitmap_file || + strcmp(s.bitmap_file, "none") == 0)) { + pr_err("--bitmap is required for consistency policy: %s\n", + map_num(consistency_policies, s.consistency_policy)); + exit(2); + } else if 
(s.bitmap_file && + strcmp(s.bitmap_file, "none") != 0 && + s.consistency_policy != CONSISTENCY_POLICY_BITMAP && + s.consistency_policy != CONSISTENCY_POLICY_JOURNAL) { + pr_err("--bitmap is not compatible with consistency policy: %s\n", + map_num(consistency_policies, s.consistency_policy)); + exit(2); + } + } + + if (!mode && devs_found) { + mode = MISC; + devmode = 'Q'; + if (devlist->disposition == 0) + devlist->disposition = devmode; + } + if (!mode) { + fputs(Usage, stderr); + exit(2); + } + + if (symlinks) { + struct createinfo *ci = conf_get_create_info(); + + if (strcasecmp(symlinks, "yes") == 0) + ci->symlinks = 1; + else if (strcasecmp(symlinks, "no") == 0) + ci->symlinks = 0; + else { + pr_err("option --symlinks must be 'no' or 'yes'\n"); + exit(2); + } + } + /* Ok, got the option parsing out of the way + * hopefully it's mostly right but there might be some stuff + * missing + * + * That is mostly checked in the per-mode stuff but... + * + * For @,B,C and A without -s, the first device listed must be + * an md device. We check that here and open it. + */ + + if (mode == MANAGE || mode == BUILD || mode == CREATE || + mode == GROW || (mode == ASSEMBLE && ! 
c.scan)) { + if (devs_found < 1) { + pr_err("an md device must be given in this mode\n"); + exit(2); + } + if ((int)ident.super_minor == -2 && c.autof) { + pr_err("--super-minor=dev is incompatible with --auto\n"); + exit(2); + } + if (mode == MANAGE || mode == GROW) { + mdfd = open_mddev(devlist->devname, 1); + if (mdfd < 0) + exit(1); + } else { + char *bname = basename(devlist->devname); + + if (strlen(bname) > MD_NAME_MAX) { + pr_err("Name %s is too long.\n", devlist->devname); + exit(1); + } + /* non-existent device is OK */ + mdfd = open_mddev(devlist->devname, 0); + } + if (mdfd == -2) { + pr_err("device %s exists but is not an md array.\n", devlist->devname); + exit(1); + } + if ((int)ident.super_minor == -2) { + struct stat stb; + if (mdfd < 0) { + pr_err("--super-minor=dev given, and listed device %s doesn't exist.\n", + devlist->devname); + exit(1); + } + fstat(mdfd, &stb); + ident.super_minor = minor(stb.st_rdev); + } + if (mdfd >= 0 && mode != MANAGE && mode != GROW) { + /* We don't really want this open yet, we just might + * have wanted to check some things + */ + close(mdfd); + mdfd = -1; + } + } + + if (s.raiddisks) { + if (s.raiddisks == 1 && !c.force && s.level != LEVEL_FAULTY) { + pr_err("'1' is an unusual number of drives for an array, so it is probably\n" + " a mistake. 
If you really mean it you will need to specify --force before\n" + " setting the number of drives.\n"); + exit(2); + } + } + + if (c.homehost == NULL && c.require_homehost) + c.homehost = conf_get_homehost(&c.require_homehost); + if (c.homehost == NULL || strcasecmp(c.homehost, "<system>") == 0) { + if (gethostname(sys_hostname, sizeof(sys_hostname)) == 0) { + sys_hostname[sizeof(sys_hostname)-1] = 0; + c.homehost = sys_hostname; + } + } + if (c.homehost && + (!c.homehost[0] || strcasecmp(c.homehost, "<none>") == 0)) { + c.homehost = NULL; + c.require_homehost = 0; + } + + rv = 0; + + set_hooks(); /* set hooks from libs */ + + if (c.homecluster == NULL && (c.nodes > 0)) { + c.homecluster = conf_get_homecluster(); + if (c.homecluster == NULL) + rv = get_cluster_name(&c.homecluster); + if (rv) { + pr_err("The md can't get cluster name\n"); + exit(1); + } + } + + if (c.update && strcmp(c.update, "nodes") == 0 && c.nodes == 0) { + pr_err("Please specify nodes number with --nodes\n"); + exit(1); + } + + if (c.backup_file && data_offset != INVALID_SECTORS) { + pr_err("--backup-file and --data-offset are incompatible\n"); + exit(2); + } + + if ((mode == MISC && devmode == 'E') || + (mode == MONITOR && spare_sharing == 0)) + /* Anyone may try this */; + else if (geteuid() != 0) { + pr_err("must be super-user to perform this action\n"); + exit(1); + } + + ident.autof = c.autof; + + if (c.scan && c.verbose < 2) + /* --scan implied --brief unless -vv */ + c.brief = 1; + + if (mode == CREATE) { + if (s.bitmap_file && strcmp(s.bitmap_file, "clustered") == 0) { + locked = cluster_get_dlmlock(); + if (locked != 1) + exit(1); + } + } else if (mode == MANAGE || mode == GROW || mode == INCREMENTAL) { + if (!md_get_array_info(mdfd, &array) && (devmode != 'c')) { + if (array.state & (1 << MD_SB_CLUSTERED)) { + locked = cluster_get_dlmlock(); + if (locked != 1) + exit(1); + } + } + } + + switch(mode) { + case MANAGE: + /* readonly, add/remove, readwrite, runstop */ + if (c.readonly > 
0) + rv = Manage_ro(devlist->devname, mdfd, c.readonly); + if (!rv && devs_found>1) + rv = Manage_subdevs(devlist->devname, mdfd, + devlist->next, c.verbose, c.test, + c.update, c.force); + if (!rv && c.readonly < 0) + rv = Manage_ro(devlist->devname, mdfd, c.readonly); + if (!rv && c.runstop > 0) + rv = Manage_run(devlist->devname, mdfd, &c); + if (!rv && c.runstop < 0) + rv = Manage_stop(devlist->devname, mdfd, c.verbose, 0); + break; + case ASSEMBLE: + if (!c.scan && c.runstop == -1) { + pr_err("--no-degraded not meaningful without a --scan assembly.\n"); + exit(1); + } else if (devs_found == 1 && ident.uuid_set == 0 && + ident.super_minor == UnSet && ident.name[0] == 0 && + !c.scan) { + /* Only a device has been given, so get details from config file */ + struct mddev_ident *array_ident = conf_get_ident(devlist->devname); + if (array_ident == NULL) { + pr_err("%s not identified in config file.\n", + devlist->devname); + rv |= 1; + if (mdfd >= 0) + close(mdfd); + } else { + if (array_ident->autof == 0) + array_ident->autof = c.autof; + rv |= Assemble(ss, devlist->devname, array_ident, + NULL, &c); + } + } else if (!c.scan) + rv = Assemble(ss, devlist->devname, &ident, + devlist->next, &c); + else if (devs_found > 0) { + if (c.update && devs_found > 1) { + pr_err("can only update a single array at a time\n"); + exit(1); + } + if (c.backup_file && devs_found > 1) { + pr_err("can only assemble a single array when providing a backup file.\n"); + exit(1); + } + for (dv = devlist; dv; dv = dv->next) { + struct mddev_ident *array_ident = conf_get_ident(dv->devname); + if (array_ident == NULL) { + pr_err("%s not identified in config file.\n", + dv->devname); + rv |= 1; + continue; + } + if (array_ident->autof == 0) + array_ident->autof = c.autof; + rv |= Assemble(ss, dv->devname, array_ident, + NULL, &c); + } + } else { + if (c.update) { + pr_err("--update not meaningful with a --scan assembly.\n"); + exit(1); + } + if (c.backup_file) { + pr_err("--backup_file not 
meaningful with a --scan assembly.\n"); + exit(1); + } + rv = scan_assemble(ss, &c, &ident); + } + + break; + case BUILD: + if (c.delay == 0) + c.delay = DEFAULT_BITMAP_DELAY; + if (s.write_behind && !s.bitmap_file) { + pr_err("write-behind mode requires a bitmap.\n"); + rv = 1; + break; + } + if (s.raiddisks == 0) { + pr_err("no raid-devices specified.\n"); + rv = 1; + break; + } + + if (s.bitmap_file) { + if (strcmp(s.bitmap_file, "internal") == 0 || + strcmp(s.bitmap_file, "clustered") == 0) { + pr_err("'internal' and 'clustered' bitmaps not supported with --build\n"); + rv |= 1; + break; + } + } + rv = Build(devlist->devname, devlist->next, &s, &c); + break; + case CREATE: + if (c.delay == 0) + c.delay = DEFAULT_BITMAP_DELAY; + + if (c.nodes) { + if (!s.bitmap_file || + strcmp(s.bitmap_file, "clustered") != 0) { + pr_err("--nodes argument only compatible with --bitmap=clustered\n"); + rv = 1; + break; + } + + if (s.level != 1 && s.level != 10) { + pr_err("--bitmap=clustered is currently supported with raid1/10 only\n"); + rv = 1; + break; + } + if (s.level == 10 && !(is_near_layout_10(s.layout) || s.layout == UnSet)) { + pr_err("only near layout is supported with clustered raid10\n"); + rv = 1; + break; + } + } + + if (s.write_behind && !s.bitmap_file) { + pr_err("write-behind mode requires a bitmap.\n"); + rv = 1; + break; + } + if (s.raiddisks == 0) { + pr_err("no raid-devices specified.\n"); + rv = 1; + break; + } + + rv = Create(ss, devlist->devname, + ident.name, ident.uuid_set ? ident.uuid : NULL, + devs_found-1, devlist->next, + &s, &c, data_offset); + break; + case MISC: + if (devmode == 'E') { + if (devlist == NULL && !c.scan) { + pr_err("No devices to examine\n"); + exit(2); + } + if (devlist == NULL) + devlist = conf_get_devs(); + if (devlist == NULL) { + pr_err("No devices listed in %s\n", configfile?configfile:DefaultConfFile); + exit(1); + } + rv = Examine(devlist, &c, ss); + } else if (devmode == DetailPlatform) { + rv = Detail_Platform(ss ? 
ss->ss : NULL, ss ? c.scan : 1, + c.verbose, c.export, + devlist ? devlist->devname : NULL); + } else if (devlist == NULL) { + if (devmode == 'S' && c.scan) + rv = stop_scan(c.verbose); + else if ((devmode == 'D' || devmode == Waitclean) && + c.scan) + rv = misc_scan(devmode, &c); + else if (devmode == UdevRules) + rv = Write_rules(udev_filename); + else { + pr_err("No devices given.\n"); + exit(2); + } + } else + rv = misc_list(devlist, &ident, dump_directory, ss, &c); + break; + case MONITOR: + if (!devlist && !c.scan) { + pr_err("Cannot monitor: need --scan or at least one device\n"); + rv = 1; + break; + } + if (pidfile && !daemonise) { + pr_err("Cannot write a pid file when not in daemon mode\n"); + rv = 1; + break; + } + if (c.delay == 0) { + c.delay = conf_get_monitor_delay(); + if (!c.delay) + c.delay = 60; + } + rv = Monitor(devlist, mailaddr, program, + &c, daemonise, oneshot, + dosyslog, pidfile, increments, + spare_sharing); + break; + + case GROW: + if (array_size > 0) { + /* alway impose array size first, independent of + * anything else + * Do not allow level or raid_disks changes at the + * same time as that can be irreversibly destructive. + */ + struct mdinfo sra; + int err; + if (s.raiddisks || s.level != UnSet) { + pr_err("cannot change array size in same operation as changing raiddisks or level.\n" + " Change size first, then check that data is still intact.\n"); + rv = 1; + break; + } + if (sysfs_init(&sra, mdfd, NULL)) { + rv = 1; + break; + } + if (array_size == MAX_SIZE) + err = sysfs_set_str(&sra, NULL, "array_size", "default"); + else + err = sysfs_set_num(&sra, NULL, "array_size", array_size / 2); + if (err < 0) { + if (errno == E2BIG) + pr_err("--array-size setting is too large.\n"); + else + pr_err("current kernel does not support setting --array-size\n"); + rv = 1; + break; + } + } + if (devs_found > 1 && s.raiddisks == 0 && s.level == UnSet) { + /* must be '-a'. 
*/ + if (s.size > 0 || s.chunk || + s.layout_str || s.bitmap_file) { + pr_err("--add cannot be used with other geometry changes in --grow mode\n"); + rv = 1; + break; + } + for (dv = devlist->next; dv; dv = dv->next) { + rv = Grow_Add_device(devlist->devname, mdfd, + dv->devname); + if (rv) + break; + } + } else if (s.bitmap_file) { + if (s.size > 0 || s.raiddisks || s.chunk || + s.layout_str || devs_found > 1) { + pr_err("--bitmap changes cannot be used with other geometry changes in --grow mode\n"); + rv = 1; + break; + } + if (c.delay == 0) + c.delay = DEFAULT_BITMAP_DELAY; + rv = Grow_addbitmap(devlist->devname, mdfd, &c, &s); + } else if (grow_continue) + rv = Grow_continue_command(devlist->devname, + mdfd, c.backup_file, + c.verbose); + else if (s.size > 0 || s.raiddisks || s.layout_str || + s.chunk != 0 || s.level != UnSet || + data_offset != INVALID_SECTORS) { + rv = Grow_reshape(devlist->devname, mdfd, + devlist->next, + data_offset, &c, &s); + } else if (s.consistency_policy != CONSISTENCY_POLICY_UNKNOWN) { + rv = Grow_consistency_policy(devlist->devname, mdfd, &c, &s); + } else if (array_size == 0) + pr_err("no changes to --grow\n"); + break; + case INCREMENTAL: + if (rebuild_map) { + RebuildMap(); + } + if (c.scan) { + rv = 1; + if (devlist) { + pr_err("In --incremental mode, a device cannot be given with --scan.\n"); + break; + } + if (c.runstop <= 0) { + pr_err("--incremental --scan meaningless without --run.\n"); + break; + } + if (devmode == 'f') { + pr_err("--incremental --scan --fail not supported.\n"); + break; + } + rv = IncrementalScan(&c, NULL); + } + if (!devlist) { + if (!rebuild_map && !c.scan) { + pr_err("--incremental requires a device.\n"); + rv = 1; + } + break; + } + if (devmode == 'f') { + if (devlist->next) { + pr_err("'--incremental --fail' can only handle one device.\n"); + rv = 1; + break; + } + rv = IncrementalRemove(devlist->devname, remove_path, + c.verbose); + } else + rv = Incremental(devlist, &c, ss); + break; + case 
AUTODETECT: + autodetect(); + break; + } + if (locked) + cluster_release_dlmlock(); + close_fd(&mdfd); + exit(rv); +} + +static int scan_assemble(struct supertype *ss, + struct context *c, + struct mddev_ident *ident) +{ + struct mddev_ident *a, *array_list = conf_get_ident(NULL); + struct mddev_dev *devlist = conf_get_devs(); + struct map_ent *map = NULL; + int cnt = 0; + int rv = 0; + int failures, successes; + + if (conf_verify_devnames(array_list)) { + pr_err("Duplicate MD device names in conf file were found.\n"); + return 1; + } + if (devlist == NULL) { + pr_err("No devices listed in conf file were found.\n"); + return 1; + } + for (a = array_list; a; a = a->next) { + a->assembled = 0; + if (a->autof == 0) + a->autof = c->autof; + } + if (map_lock(&map)) + pr_err("failed to get exclusive lock on mapfile\n"); + do { + failures = 0; + successes = 0; + rv = 0; + for (a = array_list; a; a = a->next) { + int r; + if (a->assembled) + continue; + if (a->devname && + strcasecmp(a->devname, "<ignore>") == 0) + continue; + + r = Assemble(ss, a->devname, + a, NULL, c); + if (r == 0) { + a->assembled = 1; + successes++; + } else + failures++; + rv |= r; + cnt++; + } + } while (failures && successes); + if (c->homehost && cnt == 0) { + /* Maybe we can auto-assemble something. 
+ * Repeatedly call Assemble in auto-assemble mode + * until it fails + */ + int rv2; + int acnt; + ident->autof = c->autof; + do { + struct mddev_dev *devlist = conf_get_devs(); + acnt = 0; + do { + rv2 = Assemble(ss, NULL, + ident, + devlist, c); + if (rv2 == 0) { + cnt++; + acnt++; + } + } while (rv2 != 2); + /* Incase there are stacked devices, we need to go around again */ + } while (acnt); + if (cnt == 0 && rv == 0) { + pr_err("No arrays found in config file or automatically\n"); + rv = 1; + } else if (cnt) + rv = 0; + } else if (cnt == 0 && rv == 0) { + pr_err("No arrays found in config file\n"); + rv = 1; + } + map_unlock(&map); + return rv; +} + +static int misc_scan(char devmode, struct context *c) +{ + /* apply --detail or --wait-clean to + * all devices in /proc/mdstat + */ + struct mdstat_ent *ms = mdstat_read(0, 1); + struct mdstat_ent *e; + struct map_ent *map = NULL; + int members; + int rv = 0; + + for (members = 0; members <= 1; members++) { + for (e = ms; e; e = e->next) { + char *name = NULL; + struct map_ent *me; + struct stat stb; + int member = e->metadata_version && + strncmp(e->metadata_version, + "external:/", 10) == 0; + if (members != member) + continue; + me = map_by_devnm(&map, e->devnm); + if (me && me->path && strcmp(me->path, "/unknown") != 0) + name = me->path; + if (name == NULL || stat(name, &stb) != 0) + name = get_md_name(e->devnm); + + if (!name) { + pr_err("cannot find device file for %s\n", + e->devnm); + continue; + } + if (devmode == 'D') + rv |= Detail(name, c); + else + rv |= WaitClean(name, c->verbose); + put_md_name(name); + map_free(map); + map = NULL; + } + } + free_mdstat(ms); + return rv; +} + +static int stop_scan(int verbose) +{ + /* apply --stop to all devices in /proc/mdstat */ + /* Due to possible stacking of devices, repeat until + * nothing more can be stopped + */ + int progress = 1, err; + int last = 0; + int rv = 0; + do { + struct mdstat_ent *ms = mdstat_read(0, 0); + struct mdstat_ent *e; + + if 
(!progress) last = 1; + progress = 0; err = 0; + for (e = ms; e; e = e->next) { + char *name = get_md_name(e->devnm); + int mdfd; + + if (!name) { + pr_err("cannot find device file for %s\n", + e->devnm); + continue; + } + mdfd = open_mddev(name, 1); + if (mdfd >= 0) { + if (Manage_stop(name, mdfd, verbose, !last)) + err = 1; + else + progress = 1; + close(mdfd); + } + + put_md_name(name); + } + free_mdstat(ms); + } while (!last && err); + if (err) + rv |= 1; + return rv; +} + +static int misc_list(struct mddev_dev *devlist, + struct mddev_ident *ident, + char *dump_directory, + struct supertype *ss, struct context *c) +{ + struct mddev_dev *dv; + int rv = 0; + + for (dv = devlist; dv; dv = (rv & 16) ? NULL : dv->next) { + int mdfd = -1; + + switch(dv->disposition) { + case 'D': + rv |= Detail(dv->devname, c); + continue; + case KillOpt: /* Zero superblock */ + if (ss) + rv |= Kill(dv->devname, ss, c->force, c->verbose,0); + else { + int v = c->verbose; + do { + rv |= Kill(dv->devname, NULL, c->force, v, 0); + v = -1; + } while (rv == 0); + rv &= ~4; + } + continue; + case 'Q': + rv |= Query(dv->devname); + continue; + case 'X': + rv |= ExamineBitmap(dv->devname, c->brief, ss); + continue; + case ExamineBB: + rv |= ExamineBadblocks(dv->devname, c->brief, ss); + continue; + case 'W': + case WaitOpt: + rv |= Wait(dv->devname); + continue; + case Waitclean: + rv |= WaitClean(dv->devname, c->verbose); + continue; + case KillSubarray: + rv |= Kill_subarray(dv->devname, c->subarray, c->verbose); + continue; + case UpdateSubarray: + if (c->update == NULL) { + pr_err("-U/--update must be specified with --update-subarray\n"); + rv |= 1; + continue; + } + rv |= Update_subarray(dv->devname, c->subarray, + c->update, ident, c->verbose); + continue; + case Dump: + rv |= Dump_metadata(dv->devname, dump_directory, c, ss); + continue; + case Restore: + rv |= Restore_metadata(dv->devname, dump_directory, c, ss, + (dv == devlist && dv->next == NULL)); + continue; + case Action: + rv 
|= SetAction(dv->devname, c->action); + continue; + } + + if (dv->devname[0] != '/') + mdfd = open_dev(dv->devname); + if (dv->devname[0] == '/' || mdfd < 0) + mdfd = open_mddev(dv->devname, 1); + + if (mdfd >= 0) { + switch(dv->disposition) { + case 'R': + c->runstop = 1; + rv |= Manage_run(dv->devname, mdfd, c); + break; + case 'S': + if (c->scan) { + pr_err("--stop not meaningful with both a --scan assembly and a device name.\n"); + rv |= 1; + break; + } + rv |= Manage_stop(dv->devname, mdfd, c->verbose, 0); + break; + case 'o': + rv |= Manage_ro(dv->devname, mdfd, 1); + break; + case 'w': + rv |= Manage_ro(dv->devname, mdfd, -1); + break; + } + close(mdfd); + } else + rv |= 1; + } + return rv; +} + +int SetAction(char *dev, char *action) +{ + int fd = open(dev, O_RDONLY); + struct mdinfo mdi; + int retval; + + if (fd < 0) { + pr_err("Couldn't open %s: %s\n", dev, strerror(errno)); + return 1; + } + retval = sysfs_init(&mdi, fd, NULL); + close(fd); + if (retval) { + pr_err("%s is no an md array\n", dev); + return 1; + } + + if (sysfs_set_str(&mdi, NULL, "sync_action", action) < 0) { + pr_err("Count not set action for %s to %s: %s\n", + dev, action, strerror(errno)); + return 1; + } + return 0; +} diff --git a/mdadm.conf-example b/mdadm.conf-example new file mode 100644 index 0000000..35a75d1 --- /dev/null +++ b/mdadm.conf-example @@ -0,0 +1,65 @@ +# mdadm configuration file +# +# mdadm will function properly without the use of a configuration file, +# but this file is useful for keeping track of arrays and member disks. +# In general, a mdadm.conf file is created, and updated, after arrays +# are created. This is the opposite behavior of /etc/raidtab which is +# created prior to array construction. 
+# +# +# the config file takes two types of lines: +# +# DEVICE lines specify a list of devices where to look for +# potential member disks +# +# ARRAY lines specify information about how to identify arrays +# so that they can be activated +# +# You can have more than one device line and use wild cards. The first +# example includes the first partition of SCSI disks /dev/sdb, +# /dev/sdc, /dev/sdd, /dev/sdj, /dev/sdk, and /dev/sdl. The second +# line looks for array slices on IDE disks. +# +#DEVICE /dev/sd[bcdjkl]1 +#DEVICE /dev/hda1 /dev/hdb1 +# +# If you mount devfs on /dev, then a suitable way to list all devices is: +#DEVICE /dev/discs/*/* +# +# +# The AUTO line can control which arrays get assembled by auto-assembly, +# meaning either "mdadm -As" when there are no 'ARRAY' lines in this file, +# or "mdadm --incremental" when the array found is not listed in this file. +# By default, all arrays that are found are assembled. +# If you want to ignore all DDF arrays (maybe they are managed by dmraid), +# and only assemble 1.x arrays which are marked for 'this' homehost, +# but assemble all others, then use +#AUTO -ddf homehost -1.x +all +# +# ARRAY lines specify an array to assemble and a method of identification. +# Arrays can currently be identified by using a UUID, superblock minor number, +# or a listing of devices. +# +# super-minor is usually the minor number of the metadevice +# UUID is the Universally Unique Identifier for the array +# Each can be obtained using +# +# mdadm -D <md> +# +#ARRAY /dev/md0 UUID=3aaa0122:29827cfa:5331ad66:ca767371 +#ARRAY /dev/md1 super-minor=1 +#ARRAY /dev/md2 devices=/dev/hda1,/dev/hdb1 +# +# ARRAY lines can also specify a "spare-group" for each array. 
mdadm --monitor +# will then move a spare between arrays in a spare-group if one array has a failed +# drive but no spare +#ARRAY /dev/md4 uuid=b23f3c6d:aec43a9f:fd65db85:369432df spare-group=group1 +#ARRAY /dev/md5 uuid=19464854:03f71b1b:e0df2edd:246cc977 spare-group=group1 +# +# When used in --follow (aka --monitor) mode, mdadm needs a +# mail address and/or a program. This can be given with "mailaddr" +# and "program" lines so that monitoring can be started using +# mdadm --follow --scan & echo $! > /run/mdadm/mon.pid +# If the lines are not found, mdadm will exit quietly +#MAILADDR root@mydomain.tld +#PROGRAM /usr/sbin/handle-mdadm-events diff --git a/mdadm.conf.5 b/mdadm.conf.5 new file mode 100644 index 0000000..74a21c5 --- /dev/null +++ b/mdadm.conf.5 @@ -0,0 +1,706 @@ +.\" Copyright Neil Brown and others. +.\" This program is free software; you can redistribute it and/or modify +.\" it under the terms of the GNU General Public License as published by +.\" the Free Software Foundation; either version 2 of the License, or +.\" (at your option) any later version. +.\" See file COPYING in distribution for details. +.TH MDADM.CONF 5 +.SH NAME +mdadm.conf \- configuration for management of Software RAID with mdadm +.SH SYNOPSIS +/etc/mdadm.conf +.SH DESCRIPTION +.PP +.I mdadm +is a tool for creating, managing, and monitoring RAID devices using the +.B md +driver in Linux. +.PP +Some common tasks, such as assembling all arrays, can be simplified +by describing the devices and arrays in this configuration file. + +.SS SYNTAX +The file should be seen as a collection of words separated by white +space (space, tab, or newline). +Any word that begins with a hash sign (#) starts a comment and that +word together with the remainder of the line is ignored. + +Spaces can be included in a word using quotation characters. Either +single quotes +.RB ( ' ) +or double quotes (\fB"\fP) +may be used. 
All the characters from one quotation character to the +next identical character are protected and will not be used to +separate words or to start new quoted strings. To include a single quote +it must be between double quotes. To include a double quote it must +be between single quotes. + +Any line that starts with white space (space or tab) is treated as +though it were a continuation of the previous line. + +Empty lines are ignored, but otherwise each (non continuation) line +must start with a keyword as listed below. The keywords are case +insensitive and can be abbreviated to 3 characters. + +The keywords are: +.TP +.B DEVICE +A +.B device +line lists the devices (whole devices or partitions) that might contain +a component of an MD array. When looking for the components of an +array, +.I mdadm +will scan these devices (or any devices listed on the command line). + +The +.B device +line may contain a number of different devices (separated by spaces) +and each device name can contain wild cards as defined by +.BR glob (7). + +Also, there may be several device lines present in the file. + +Alternatively, a +.B device +line can contain either or both of the words +.B containers +and +.BR partitions . +The word +.B containers +will cause +.I mdadm +to look for assembled CONTAINER arrays and include them as a source +for assembling further arrays. + +The word +.I partitions +will cause +.I mdadm +to read +.I /proc/partitions +and include all devices and partitions found therein. +.I mdadm +does not use the names from +.I /proc/partitions +but only the major and minor device numbers. It scans +.I /dev +to find the name that matches the numbers. + +If no DEVICE line is present, then "DEVICE partitions containers" is assumed. + +For example: +.IP +DEVICE /dev/hda* /dev/hdc* +.br +DEV /dev/sd* +.br +DEVICE /dev/disk/by-path/pci* +.br +DEVICE partitions + +.TP +.B ARRAY +The ARRAY lines identify actual arrays. 
The second word on the line +may be the name of the device where the array is normally +assembled, such as +.B /dev/md1 +or +.BR /dev/md/backup . +If the name does not start with a slash +.RB (' / '), +it is treated as being in +.BR /dev/md/ . +Alternately the word +.B <ignore> +(complete with angle brackets) can be given in which case any array +which matches the rest of the line will never be automatically assembled. +If no device name is given, +.I mdadm +will use various heuristics to determine an appropriate name. + +Subsequent words identify the array, or identify the array as a member +of a group. If multiple identities are given, +then a component device must match ALL identities to be considered a +match. Each identity word has a tag, an equals sign, and some value. +The tags are: +.RS 4 +.TP +.B uuid= +The value should be a 128 bit uuid in hexadecimal, with punctuation +interspersed if desired. This must match the uuid stored in the +superblock. +.TP +.B name= +The value should be a simple textual name as was given to +.I mdadm +when the array was created. This must match the name stored in the +superblock on a device for that device to be included in the array. +Not all superblock formats support names. +.TP +.B super\-minor= +The value is an integer which indicates the minor number that was +stored in the superblock when the array was created. When an array is +created as /dev/mdX, then the minor number X is stored. +.TP +.B devices= +The value is a comma separated list of device names or device name +patterns. +Only devices with names which match one entry in the list will be used +to assemble the array. Note that the devices +listed there must also be listed on a DEVICE line. +.TP +.B level= +The value is a RAID level. This is not normally used to +identify an array, but is supported so that the output of + +.B "mdadm \-\-examine \-\-scan" + +can be used directly in the configuration file. 
+.TP +.B num\-devices= +The value is the number of devices in a complete active array. As with +.B level= +this is mainly for compatibility with the output of + +.BR "mdadm \-\-examine \-\-scan" . + +.TP +.B spares= +The value is a number of spare devices to expect the array to have. +The sole use of this keyword and value is as follows: +.B mdadm \-\-monitor +will report an array if it is found to have fewer than this number of +spares when +.B \-\-monitor +starts or when +.B \-\-oneshot +is used. + +.TP +.B spare\-group= +The value is a textual name for a group of arrays. All arrays with +the same +.B spare\-group +name are considered to be part of the same group. The significance of +a group of arrays is that +.I mdadm +will, when monitoring the arrays, move a spare drive from one array in +a group to another array in that group if the first array had a failed +or missing drive but no spare. + +.TP +.B auto= +This option is rarely needed with mdadm-3.0, particularly if used with +the Linux kernel v2.6.28 or later. +It tells +.I mdadm +whether to use partitionable arrays or non-partitionable arrays and, +in the absence of +.IR udev , +how many partition devices to create. From 2.6.28 all md array +devices are partitionable, hence this option is not needed. + +The value of this option can be "yes" or "md" to indicate that a +traditional, non-partitionable md array should be created, or "mdp", +"part" or "partition" to indicate that a partitionable md array (only +available in linux 2.6 and later) should be used. This latter set can +also have a number appended to indicate how many partitions to create +device files for, e.g. +.BR auto=mdp5 . +The default is 4. + +.TP +.B bitmap= +The option specifies a file in which a write-intent bitmap should be +found. When assembling the array, +.I mdadm +will provide this file to the +.B md +driver as the bitmap file. This has the same function as the +.B \-\-bitmap\-file +option to +.BR \-\-assemble . 
+ +.TP +.B metadata= +Specify the metadata format that the array has. This is mainly +recognised for compatibility with the output of +.BR "mdadm \-Es" . + +.TP +.B container= +Specify that this array is a member array of some container. The +value given can be either a path name in /dev, or a UUID of the +container array. + +.TP +.B member= +Specify that this array is a member array of some container. Each +type of container has some way to enumerate member arrays, often a +simple sequence number. The value identifies which member of a +container the array is. It will usually accompany a "container=" word. +.RE + +.TP +.B MAILADDR +The +.B mailaddr +line gives an E-mail address that alerts should be +sent to when +.I mdadm +is running in +.B \-\-monitor +mode (and was given the +.B \-\-scan +option). There should only be one +.B MAILADDR +line and it should have only one address. Any subsequent addresses +are silently ignored. + +.TP +.B MAILFROM +The +.B mailfrom +line (which can only be abbreviated to at least 5 characters) gives an +address to appear in the "From" address for alert mails. This can be +useful if you want to explicitly set a domain, as the default from +address is "root" with no domain. All words on this line are +catenated with spaces to form the address. + +Note that this value cannot be set via the +.I mdadm +commandline. It is only settable via the config file. + +.TP +.B PROGRAM +The +.B program +line gives the name of a program to be run when +.B "mdadm \-\-monitor" +detects potentially interesting events on any of the arrays that it +is monitoring. This program gets run with two or three arguments, they +being the Event, the md device, and possibly the related component +device. + +There should only be one +.B program +line and it should be given only one program. + + +.TP +.B CREATE +The +.B create +line gives default values to be used when creating arrays, new members +of arrays, and device entries for arrays. 
+These include: + +.RS 4 +.TP +.B owner= +.TP +.B group= +These can give user/group ids or names to use instead of system +defaults (root/wheel or root/disk). +.TP +.B mode= +An octal file mode such as 0660 can be given to override the default +of 0600. +.TP +.B auto= +This corresponds to the +.B \-\-auto +flag to mdadm. Give +.BR yes , +.BR md , +.BR mdp , +.B part +\(em possibly followed by a number of partitions \(em to indicate how +missing device entries should be created. + +.TP +.B metadata= +The name of the metadata format to use if none is explicitly given. +This can be useful to impose a system-wide default of version-1 superblocks. + +.TP +.B symlinks=no +Normally when creating devices in +.B /dev/md/ +.I mdadm +will create a matching symlink from +.B /dev/ +with a name starting +.B md +or +.BR md_ . +Give +.B symlinks=no +to suppress this symlink creation. + +.TP +.B names=yes +Since Linux 2.6.29 it has been possible to create +.B md +devices with a name like +.B md_home +rather than just a number, like +.BR md3 . +.I mdadm +will use the numeric alternative by default as other tools that interact +with md arrays may expect only numbers. +If +.B names=yes +is given in +.I mdadm.conf +then +.I mdadm +will use a name when appropriate. +If +.B names=no +is given, then non-numeric +.I md +device names will not be used even if the default changes in a future +release of +.IR mdadm . + +.TP +.B bbl=no +By default, +.I mdadm +will reserve space for a bad block list (bbl) on all devices +included in or added to any array that supports them. Setting +.B bbl=no +will prevent this, so newly added devices will not have a bad +block log. +.RE + +.TP +.B HOMEHOST +The +.B homehost +line gives a default value for the +.B \-\-homehost= +option to mdadm. There should normally be only one other word on the line. +It should either be a host name, or one of the special words +.BR <system>, +.B <none> +and +.BR <ignore> . 
+If +.B <system> +is given, then the +.BR gethostname ( 2 ) +system call is used to get the host name. This is the default. + +If +.B <ignore> +is given, then a flag is set so that when arrays are being +auto-assembled the checking of the recorded +.I homehost +is disabled. +If +.B <ignore> +is given it is also possible to give an explicit name which will be +used when creating arrays. This is the only case when there can be +more than one other word on the +.B HOMEHOST +line. If there are other words, or other +.B HOMEHOST +lines, they are silently ignored. + +If +.B <none> +is given, then the default of using +.BR gethostname ( 2 ) +is over-ridden and no homehost name is assumed. + +When arrays are created, this host name will be stored in the +metadata. When arrays are assembled using auto-assembly, arrays which +do not record the correct homehost name in their metadata will be +assembled using a "foreign" name. A "foreign" name always ends with a +digit string preceded by an underscore to differentiate it +from any possible local name. e.g. +.B /dev/md/1_1 +or +.BR /dev/md/home_0 . +.TP +.B AUTO +A list of names of metadata format can be given, each preceded by a +plus or minus sign. Also the word +.I homehost +is allowed as is +.I all +preceded by plus or minus sign. +.I all +is usually last. + +When +.I mdadm +is auto-assembling an array, either via +.I \-\-assemble +or +.I \-\-incremental +and it finds metadata of a given type, it checks that metadata type +against those listed in this line. The first match wins, where +.I all +matches anything. +If a match is found that was preceded by a plus sign, the auto +assembly is allowed. If the match was preceded by a minus sign, the +auto assembly is disallowed. If no match is found, the auto assembly +is allowed. + +If the metadata indicates that the array was created for +.I this +host, and the word +.I homehost +appears before any other match, then the array is treated as a valid +candidate for auto-assembly.
+ +This can be used to disable all auto-assembly (so that only arrays +explicitly listed in mdadm.conf or on the command line are assembled), +or to disable assembly of certain metadata types which might be +handled by other software. It can also be used to disable assembly of +all foreign arrays - normally such arrays are assembled but given a +non-deterministic name in +.BR /dev/md/ . + +The known metadata types are +.BR 0.90 , +.BR 1.x , +.BR ddf , +.BR imsm . + +.B AUTO +should be given at most once. Subsequent lines are silently ignored. +Thus an earlier config file in a config directory will over-ride +the setting in a later config file. + +.TP +.B POLICY +This is used to specify what automatic behavior is allowed on devices +newly appearing in the system and provides a way of marking spares that can +be moved to other arrays as well as the migration domains. +.I Domain +can be defined through +.I policy +line by specifying a domain name for a number of paths from +.BR /dev/disk/by-path/ . +A device may belong to several domains. The domain of an array is a union +of domains of all devices in that array. A spare can be automatically +moved from one array to another if the set of the destination array's +.I domains +contains all the +.I domains +of the new disk or if both arrays have the same +.IR spare-group . + +To update hot plug configuration it is necessary to execute +.B mdadm \-\-udev\-rules +command after changing the config file + +Keywords used in the +.I POLICY +line and supported values are: + +.RS 4 +.TP +.B domain= +any arbitrary string +.TP +.B metadata= +0.9 1.x ddf or imsm +.TP +.B path= +file glob matching anything from +.B /dev/disk/by-path +.TP +.B type= +either +.B disk +or +.BR part . +.TP +.B action= +include, re-add, spare, spare-same-slot, or force-spare +.TP +.B auto= +yes, no, or homehost. + +.P +The +.I action +item determines the automatic behavior allowed for devices matching the +.I path +and +.I type +in the same line. 
If a device matches several lines with different +.I actions +then the most permissive will apply. The ordering of policy lines +is irrelevant to the end result. +.TP +.B include +allows adding a disk to an array if metadata on that disk matches that array +.TP +.B re\-add +will include the device in the array if it appears to be a current member +or a member that was recently removed and the array has a +write-intent-bitmap to allow the +.B re\-add +functionality. +.TP +.B spare +as above and additionally: if the device is bare it can +become a spare if there is any array that it is a candidate for based +on domains and metadata. +.TP +.B spare\-same\-slot +as above and additionally if given slot was used by an array that went +degraded recently and the device plugged in has no metadata then it will +be automatically added to that array (or its container) +.TP +.B force\-spare +as above and the disk will become a spare in remaining cases +.RE + +.TP +.B PART-POLICY +This is similar to +.B POLICY +and accepts the same keyword assignments. It allows a consistent set +of policies to be applied to each of the partitions of a device. + +A +.B PART-POLICY +line should set +.I type=disk +and identify the path to one or more disk devices. Each partition on +these disks will be treated according to the +.I action= +setting from this line. If a +.I domain +is set in the line, then the domain associated with each partition will +be based on the domain, but with +.RB \(dq -part N\(dq +appended, where N is the partition number for the partition that was +found. + +.TP +.B SYSFS +The +.B SYSFS +line lists custom values of MD device's sysfs attributes which will be +stored in sysfs after the array is assembled. Multiple lines are allowed and each +line has to contain the uuid or the name of the device to which it relates. +.RS 4 +.TP +.B uuid= +hexadecimal identifier of MD device. This has to match the uuid stored in the +superblock.
+.TP +.B name= +name of the MD device as was given to +.I mdadm +when the array was created. It will be ignored if +.B uuid +is not empty. +.RE + +.TP +.B MONITORDELAY +The +.B monitordelay +line gives a delay in seconds +.I mdadm +shall wait before polling md arrays +when +.I mdadm +is running in +.B \-\-monitor +mode. +.B \-d/\-\-delay +command line argument takes precedence over the config file. + +.SH EXAMPLE +DEVICE /dev/sd[bcdjkl]1 +.br +DEVICE /dev/hda1 /dev/hdb1 + +# /dev/md0 is known by its UUID. +.br +ARRAY /dev/md0 UUID=3aaa0122:29827cfa:5331ad66:ca767371 +.br +# /dev/md1 contains all devices with a minor number of +.br +# 1 in the superblock. +.br +ARRAY /dev/md1 super\-minor=1 +.br +# /dev/md2 is made from precisely these two devices +.br +ARRAY /dev/md2 devices=/dev/hda1,/dev/hdb1 + +# /dev/md4 and /dev/md5 are a spare-group and spares +.br +# can be moved between them +.br +ARRAY /dev/md4 uuid=b23f3c6d:aec43a9f:fd65db85:369432df +.br + spare\-group=group1 +.br +ARRAY /dev/md5 uuid=19464854:03f71b1b:e0df2edd:246cc977 +.br + spare\-group=group1 +.br +# /dev/md/home is created if need be as a partitionable md array +.br +# any spare device number is allocated. +.br +ARRAY /dev/md/home UUID=9187a482:5dde19d9:eea3cc4a:d646ab8b +.br + auto=part +.br +# The name of this array contains a space. +.br +ARRAY /dev/md9 name='Data Storage' +.sp +POLICY domain=domain1 metadata=imsm path=pci-0000:00:1f.2-scsi-* +.br + action=spare +.br +POLICY domain=domain1 metadata=imsm path=pci-0000:04:00.0-scsi-[01]* +.br + action=include +.br +# One domain comprising devices attached to specified paths is defined. +.br +# Bare device matching first path will be made an imsm spare on hot plug. +.br +# If more than one array is created on devices belonging to domain1 and +.br +# one of them becomes degraded, then any imsm spare matching any path for +.br +# given domain name can be migrated.
+.br +MAILADDR root@mydomain.tld +.br +PROGRAM /usr/sbin/handle\-mdadm\-events +.br +CREATE group=system mode=0640 auto=part\-8 +.br +HOMEHOST <system> +.br +AUTO +1.x homehost \-all +.br +SYSFS name=/dev/md/raid5 group_thread_cnt=4 sync_speed_max=1000000 +.br +SYSFS uuid=bead5eb6:31c17a27:da120ba2:7dfda40d group_thread_cnt=4 +sync_speed_max=1000000 +.br +MONITORDELAY 60 + +.SH SEE ALSO +.BR mdadm (8), +.BR md (4). @@ -0,0 +1,1887 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +#define _GNU_SOURCE +#define _FILE_OFFSET_BITS 64 +#include <unistd.h> +#ifdef __GLIBC__ +extern __off64_t lseek64 __P ((int __fd, __off64_t __offset, int __whence)); +#elif !defined(lseek64) +# if defined(__NO_STAT64) || __WORDSIZE != 32 +# define lseek64 lseek +# endif +#endif + +#include <sys/types.h> +#include <sys/stat.h> +#include <stdint.h> +#include <stdlib.h> +#include <time.h> +#include <sys/time.h> +#include <getopt.h> +#include <fcntl.h> +#include <stdio.h> +#include <errno.h> +#include <string.h> +#include <syslog.h> +#include <stdbool.h> +/* Newer glibc requires sys/sysmacros.h directly for makedev() */ +#include <sys/sysmacros.h> +#ifdef __dietlibc__ +#include <strings.h> +/* dietlibc has deprecated random and srandom!! 
*/ +#define random rand +#define srandom srand +#endif + +#ifdef NO_COROSYNC +#define CS_OK 1 +typedef uint64_t cmap_handle_t; +#else +#include <corosync/cmap.h> +#endif + +#ifndef NO_DLM +#include <libdlm.h> +#include <errno.h> +#else +#define LKF_NOQUEUE 0x00000001 +#define LKM_PWMODE 4 +#define EUNLOCK 0x10002 + +typedef void *dlm_lshandle_t; + +struct dlm_lksb { + int sb_status; + uint32_t sb_lkid; + char sb_flags; + char *sb_lvbptr; +}; +#endif + +#include <linux/kdev_t.h> +/*#include <linux/fs.h> */ +#include <sys/mount.h> +#include <asm/types.h> +#include <sys/ioctl.h> +#define MD_MAJOR 9 +#define MdpMinorShift 6 + +#ifndef BLKGETSIZE64 +#define BLKGETSIZE64 _IOR(0x12,114,size_t) /* return device size in bytes (u64 *arg) */ +#endif + +#define DEFAULT_CHUNK 512 +#define DEFAULT_BITMAP_CHUNK 4096 +#define DEFAULT_BITMAP_DELAY 5 +#define DEFAULT_MAX_WRITE_BEHIND 256 + +/* MAP_DIR should be somewhere that persists across the pivotroot + * from early boot to late boot. + * /run seems to have emerged as the best standard. + */ +#ifndef MAP_DIR +#define MAP_DIR "/run/mdadm" +#endif /* MAP_DIR */ +/* MAP_FILE is what we name the map file we put in MAP_DIR, in case you + * want something other than the default of "map" + */ +#ifndef MAP_FILE +#define MAP_FILE "map" +#endif /* MAP_FILE */ +/* MDMON_DIR is where pid and socket files used for communicating + * with mdmon normally live. Best is /var/run/mdadm as + * mdmon is needed at early boot then it needs to write there prior + * to /var/run being mounted read/write, and it also then needs to + * persist beyond when /var/run is mounted read-only. So, to be + * safe, the default is somewhere that is read/write early in the + * boot process and stays up as long as possible during shutdown.
+ */ +#ifndef MDMON_DIR +#define MDMON_DIR "/run/mdadm" +#endif /* MDMON_DIR */ + +/* FAILED_SLOTS is where to save files storing recent removal of array + * member in order to allow future reuse of disk inserted in the same + * slot for array recovery + */ +#ifndef FAILED_SLOTS_DIR +#define FAILED_SLOTS_DIR "/run/mdadm/failed-slots" +#endif /* FAILED_SLOTS */ + +#ifndef MDMON_SERVICE +#define MDMON_SERVICE "mdmon" +#endif /* MDMON_SERVICE */ + +#ifndef GROW_SERVICE +#define GROW_SERVICE "mdadm-grow-continue" +#endif /* GROW_SERVICE */ + +#include "md_u.h" +#include "md_p.h" +#include "bitmap.h" +#include "msg.h" + +#include <endian.h> +/* Redhat don't like to #include <asm/byteorder.h>, and + * some time include <linux/byteorder/xxx_endian.h> isn't enough, + * and there is no standard conversion function so... */ +/* And dietlibc doesn't think byteswap is ok, so.. */ +/* #include <byteswap.h> */ +#define __mdadm_bswap_16(x) (((x) & 0x00ffU) << 8 | \ + ((x) & 0xff00U) >> 8) +#define __mdadm_bswap_32(x) (((x) & 0x000000ffU) << 24 | \ + ((x) & 0xff000000U) >> 24 | \ + ((x) & 0x0000ff00U) << 8 | \ + ((x) & 0x00ff0000U) >> 8) +#define __mdadm_bswap_64(x) (((x) & 0x00000000000000ffULL) << 56 | \ + ((x) & 0xff00000000000000ULL) >> 56 | \ + ((x) & 0x000000000000ff00ULL) << 40 | \ + ((x) & 0x00ff000000000000ULL) >> 40 | \ + ((x) & 0x0000000000ff0000ULL) << 24 | \ + ((x) & 0x0000ff0000000000ULL) >> 24 | \ + ((x) & 0x00000000ff000000ULL) << 8 | \ + ((x) & 0x000000ff00000000ULL) >> 8) + +#if !defined(__KLIBC__) +#if BYTE_ORDER == LITTLE_ENDIAN +#define __cpu_to_le16(_x) (unsigned int)(_x) +#define __cpu_to_le32(_x) (unsigned int)(_x) +#define __cpu_to_le64(_x) (unsigned long long)(_x) +#define __le16_to_cpu(_x) (unsigned int)(_x) +#define __le32_to_cpu(_x) (unsigned int)(_x) +#define __le64_to_cpu(_x) (unsigned long long)(_x) + +#define __cpu_to_be16(_x) __mdadm_bswap_16(_x) +#define __cpu_to_be32(_x) __mdadm_bswap_32(_x) +#define __cpu_to_be64(_x) __mdadm_bswap_64(_x) 
+#define __be16_to_cpu(_x) __mdadm_bswap_16(_x) +#define __be32_to_cpu(_x) __mdadm_bswap_32(_x) +#define __be64_to_cpu(_x) __mdadm_bswap_64(_x) +#elif BYTE_ORDER == BIG_ENDIAN +#define __cpu_to_le16(_x) __mdadm_bswap_16(_x) +#define __cpu_to_le32(_x) __mdadm_bswap_32(_x) +#define __cpu_to_le64(_x) __mdadm_bswap_64(_x) +#define __le16_to_cpu(_x) __mdadm_bswap_16(_x) +#define __le32_to_cpu(_x) __mdadm_bswap_32(_x) +#define __le64_to_cpu(_x) __mdadm_bswap_64(_x) + +#define __cpu_to_be16(_x) (unsigned int)(_x) +#define __cpu_to_be32(_x) (unsigned int)(_x) +#define __cpu_to_be64(_x) (unsigned long long)(_x) +#define __be16_to_cpu(_x) (unsigned int)(_x) +#define __be32_to_cpu(_x) (unsigned int)(_x) +#define __be64_to_cpu(_x) (unsigned long long)(_x) +#else +# error "unknown endianness." +#endif +#endif /* __KLIBC__ */ + +/* + * Partially stolen from include/linux/unaligned/packed_struct.h + */ +struct __una_u16 { __u16 x; } __attribute__ ((packed)); +struct __una_u32 { __u32 x; } __attribute__ ((packed)); + +static inline __u16 __get_unaligned16(const void *p) +{ + const struct __una_u16 *ptr = (const struct __una_u16 *)p; + return ptr->x; +} + +static inline __u32 __get_unaligned32(const void *p) +{ + const struct __una_u32 *ptr = (const struct __una_u32 *)p; + return ptr->x; +} + +static inline void __put_unaligned16(__u16 val, void *p) +{ + struct __una_u16 *ptr = (struct __una_u16 *)p; + ptr->x = val; +} + +static inline void __put_unaligned32(__u32 val, void *p) +{ + struct __una_u32 *ptr = (struct __una_u32 *)p; + ptr->x = val; +} + +/* + * Check at compile time that something is of a particular type. + * Always evaluates to 1 so you may use it easily in comparisons. +*/ + +#define typecheck(type,x) \ +({ type __dummy; \ + typeof(x) __dummy2; \ + (void)(&__dummy == &__dummy2); \ + 1; \ +}) + +/* + * These inlines deal with timer wrapping correctly. + * + * time_after(a,b) returns true if the time a is after time b. 
+*/ + +#define time_after(a,b) \ + (typecheck(unsigned int, a) && \ + typecheck(unsigned int, b) && \ + ((int)((b) - (a)) < 0)) + +#define time_before(a,b) time_after(b,a) + +/* + * min()/max()/clamp() macros that also do + * strict type-checking.. See the + * "unnecessary" pointer comparison. + */ +#define min(x, y) ({ \ + typeof(x) _min1 = (x); \ + typeof(y) _min2 = (y); \ + (void) (&_min1 == &_min2); \ + _min1 < _min2 ? _min1 : _min2; }) + +#define max(x, y) ({ \ + typeof(x) _max1 = (x); \ + typeof(y) _max2 = (y); \ + (void) (&_max1 == &_max2); \ + _max1 > _max2 ? _max1 : _max2; }) + +#define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0])) + +extern const char Name[]; + +struct md_bb_entry { + unsigned long long sector; + int length; +}; + +struct md_bb { + int supported; + int count; + struct md_bb_entry *entries; +}; + +/* general information that might be extracted from a superblock */ +struct mdinfo { + mdu_array_info_t array; + mdu_disk_info_t disk; + __u64 events; + int uuid[4]; + char name[33]; + unsigned long long data_offset; + unsigned long long new_data_offset; + unsigned long long component_size; /* same as array.size, except in + * sectors and up to 64bits. 
+ */ + unsigned long long custom_array_size; /* size for non-default sized + * arrays (in sectors) + */ +#define NO_RESHAPE 0 +#define VOLUME_RESHAPE 1 +#define CONTAINER_RESHAPE 2 +#define RESHAPE_NO_BACKUP 16 /* Mask 'or'ed in */ + int reshape_active; + unsigned long long reshape_progress; + int recovery_blocked; /* for external metadata it + * indicates that there is + * reshape in progress in + * container, + * for native metadata it is + * reshape_active field mirror + */ + int journal_device_required; + int journal_clean; + + enum { + CONSISTENCY_POLICY_UNKNOWN, + CONSISTENCY_POLICY_NONE, + CONSISTENCY_POLICY_RESYNC, + CONSISTENCY_POLICY_BITMAP, + CONSISTENCY_POLICY_JOURNAL, + CONSISTENCY_POLICY_PPL, + } consistency_policy; + + /* During reshape we can sometimes change the data_offset to avoid + * over-writing still-valid data. We need to know if there is space. + * So getinfo_super will fill in space_before and space_after in sectors. + * data_offset can be increased or decreased by this amount. 
+ */ + unsigned long long space_before, space_after; + union { + unsigned long long resync_start; /* per-array resync position */ + unsigned long long recovery_start; /* per-device rebuild position */ + #define MaxSector (~0ULL) /* resync/recovery complete position */ + }; + long bitmap_offset; /* 0 == none, 1 == a file */ + unsigned int ppl_size; + int ppl_offset; + unsigned long long ppl_sector; + unsigned long safe_mode_delay; /* ms delay to mark clean */ + int new_level, delta_disks, new_layout, new_chunk; + int errors; + unsigned long cache_size; /* size of raid456 stripe cache*/ + int mismatch_cnt; + char text_version[50]; + + int container_member; /* for assembling external-metadata arrays + * This is to be used internally by metadata + * handler only */ + int container_enough; /* flag external handlers can set to + * indicate that subarrays have not enough (-1), + * enough to start (0), or all expected disks (1) */ + char sys_name[32]; + struct mdinfo *devs; + struct mdinfo *next; + + /* Device info for mdmon: */ + int recovery_fd; + int state_fd; + int bb_fd; + int ubb_fd; + #define DS_FAULTY 1 + #define DS_INSYNC 2 + #define DS_WRITE_MOSTLY 4 + #define DS_SPARE 8 + #define DS_BLOCKED 16 + #define DS_REMOVE 1024 + #define DS_UNBLOCK 2048 + int prev_state, curr_state, next_state; + + /* info read from sysfs */ + enum { + ARRAY_CLEAR, + ARRAY_INACTIVE, + ARRAY_SUSPENDED, + ARRAY_READONLY, + ARRAY_READ_AUTO, + ARRAY_CLEAN, + ARRAY_ACTIVE, + ARRAY_WRITE_PENDING, + ARRAY_ACTIVE_IDLE, + ARRAY_BROKEN, + ARRAY_UNKNOWN_STATE, + } array_state; + struct md_bb bb; +}; + +struct createinfo { + int uid; + int gid; + int autof; + int mode; + int symlinks; + int names; + int bblist; + struct supertype *supertype; +}; + +struct spare_criteria { + unsigned long long min_size; + unsigned int sector_size; +}; + +enum mode { + ASSEMBLE=1, + BUILD, + CREATE, + MANAGE, + MISC, + MONITOR, + GROW, + INCREMENTAL, + AUTODETECT, + mode_count +}; + +extern char short_options[];
+extern char short_bitmap_options[]; +extern char short_bitmap_auto_options[]; +extern struct option long_options[]; +extern char Version[], Usage[], Help[], OptionHelp[], + *mode_help[], + Help_create[], Help_build[], Help_assemble[], Help_grow[], + Help_incr[], + Help_manage[], Help_misc[], Help_monitor[], Help_config[]; + +/* for options that don't have short equivalents, we assign arbitrary + * numbers later than any 'short' character option. + */ +enum special_options { + AssumeClean = 300, + BitmapChunk, + WriteBehind, + ReAdd, + NoDegraded, + Sparc22, + BackupFile, + HomeHost, + AutoHomeHost, + Symlinks, + AutoDetect, + Waitclean, + DetailPlatform, + KillSubarray, + UpdateSubarray, + IncrementalPath, + NoSharing, + HelpOptions, + Brief, + NoDevices, + ManageOpt, + Add, + AddSpare, + AddJournal, + Remove, + Fail, + Replace, + With, + MiscOpt, + WaitOpt, + ConfigFile, + ChunkSize, + WriteMostly, + FailFast, + NoFailFast, + Layout, + Auto, + Force, + SuperMinor, + EMail, + ProgramOpt, + Increment, + Fork, + Bitmap, + RebuildMapOpt, + InvalidBackup, + UdevRules, + FreezeReshape, + Continue, + OffRootOpt, + Prefer, + KillOpt, + DataOffset, + ExamineBB, + Dump, + Restore, + Action, + Nodes, + ClusterName, + ClusterConfirm, + WriteJournal, + ConsistencyPolicy, +}; + +enum prefix_standard { + JEDEC, + IEC +}; + +enum bitmap_update { + NoUpdate, + NameUpdate, + NodeNumUpdate, +}; + +enum flag_mode { + FlagDefault, FlagSet, FlagClear, +}; + +/* structures read from config file */ +/* List of mddevice names and identifiers + * Identifiers can be: + * uuid=128-hex-uuid + * super-minor=decimal-minor-number-from-superblock + * devices=comma,separated,list,of,device,names,with,wildcards + * + * If multiple fields are present, the intersection of all matching + * devices is considered + */ +#define UnSet (0xfffe) +struct mddev_ident { + char *devname; + + int uuid_set; + int uuid[4]; + char name[33]; + + int super_minor; + + char *devices; /* comma separated list of device +
* names with wild cards + */ + int level; + int raid_disks; + int spare_disks; + struct supertype *st; + int autof; /* 1 for normal, 2 for partitioned */ + char *spare_group; + char *bitmap_file; + int bitmap_fd; + + char *container; /* /dev/whatever name of container, or + * uuid of container. You would expect + * this to be the 'devname' or UUID + * of some other entry. + */ + char *member; /* subarray within a container */ + + struct mddev_ident *next; + union { + /* fields needed by different users of this structure */ + int assembled; /* set when assembly succeeds */ + }; +}; + +struct context { + int readonly; + int runstop; + int verbose; + int brief; + int no_devices; + int force; + char *homehost; + int require_homehost; + char *prefer; + int export; + int test; + char *subarray; + char *update; + int scan; + int SparcAdjust; + int autof; + int delay; + int freeze_reshape; + char *backup_file; + int invalid_backup; + char *action; + int nodes; + char *homecluster; +}; + +struct shape { + int raiddisks; + int sparedisks; + int journaldisks; + int level; + int layout; + char *layout_str; + int chunk; + int bitmap_chunk; + char *bitmap_file; + int assume_clean; + int write_behind; + unsigned long long size; + int consistency_policy; +}; + +/* List of device names - wildcards expanded */ +struct mddev_dev { + char *devname; + int disposition; /* 'a' for add, 'r' for remove, 'f' for fail, + * 'A' for re_add. 
+ * Not set for names read from .config + */ + enum flag_mode writemostly; + enum flag_mode failfast; + int used; /* set when used */ + long long data_offset; + struct mddev_dev *next; +}; + +typedef struct mapping { + char *name; + int num; +} mapping_t; + +struct mdstat_ent { + char devnm[32]; + int active; + char *level; + char *pattern; /* U for up, _ for down */ + int percent; /* -1 if no resync */ + int resync; /* 3 if check, 2 if reshape, 1 if resync, 0 if recovery */ + int devcnt; + int raid_disks; + char * metadata_version; + struct dev_member { + char *name; + struct dev_member *next; + } *members; + struct mdstat_ent *next; +}; + +extern struct mdstat_ent *mdstat_read(int hold, int start); +extern void mdstat_close(void); +extern void free_mdstat(struct mdstat_ent *ms); +extern int mdstat_wait(int seconds); +extern void mdstat_wait_fd(int fd, const sigset_t *sigmask); +extern int mddev_busy(char *devnm); +extern struct mdstat_ent *mdstat_by_component(char *name); +extern struct mdstat_ent *mdstat_by_subdev(char *subdev, char *container); + +struct map_ent { + struct map_ent *next; + char devnm[32]; + char metadata[20]; + int uuid[4]; + int bad; + char *path; +}; +extern int map_update(struct map_ent **mpp, char *devnm, char *metadata, + int uuid[4], char *path); +extern void map_remove(struct map_ent **map, char *devnm); +extern struct map_ent *map_by_uuid(struct map_ent **map, int uuid[4]); +extern struct map_ent *map_by_devnm(struct map_ent **map, char *devnm); +extern void map_free(struct map_ent *map); +extern struct map_ent *map_by_name(struct map_ent **map, char *name); +extern void map_read(struct map_ent **melp); +extern int map_write(struct map_ent *mel); +extern void map_delete(struct map_ent **mapp, char *devnm); +extern void map_add(struct map_ent **melp, + char *devnm, char *metadata, int uuid[4], char *path); +extern int map_lock(struct map_ent **melp); +extern void map_unlock(struct map_ent **melp); +extern void map_fork(void); + +/* 
various details can be requested */ +enum sysfs_read_flags { + GET_LEVEL = (1 << 0), + GET_LAYOUT = (1 << 1), + GET_COMPONENT = (1 << 2), + GET_CHUNK = (1 << 3), + GET_CACHE = (1 << 4), + GET_MISMATCH = (1 << 5), + GET_VERSION = (1 << 6), + GET_DISKS = (1 << 7), + GET_SAFEMODE = (1 << 9), + GET_BITMAP_LOCATION = (1 << 10), + + GET_DEVS = (1 << 20), /* gets role, major, minor */ + GET_OFFSET = (1 << 21), + GET_SIZE = (1 << 22), + GET_STATE = (1 << 23), + GET_ERROR = (1 << 24), + GET_ARRAY_STATE = (1 << 25), + GET_CONSISTENCY_POLICY = (1 << 26), + GET_DEVS_ALL = (1 << 27), +}; + +/* If fd >= 0, get the array it is open on, + * else use devnm. + */ +extern int sysfs_open(char *devnm, char *devname, char *attr); +extern int sysfs_init(struct mdinfo *mdi, int fd, char *devnm); +extern void sysfs_init_dev(struct mdinfo *mdi, dev_t devid); +extern void sysfs_free(struct mdinfo *sra); +extern struct mdinfo *sysfs_read(int fd, char *devnm, unsigned long options); +extern int sysfs_attr_match(const char *attr, const char *str); +extern int sysfs_match_word(const char *word, char **list); +extern int sysfs_set_str(struct mdinfo *sra, struct mdinfo *dev, + char *name, char *val); +extern int sysfs_set_num(struct mdinfo *sra, struct mdinfo *dev, + char *name, unsigned long long val); +extern int sysfs_set_num_signed(struct mdinfo *sra, struct mdinfo *dev, + char *name, long long val); +extern int sysfs_uevent(struct mdinfo *sra, char *event); +extern int sysfs_get_fd(struct mdinfo *sra, struct mdinfo *dev, + char *name); +extern int sysfs_fd_get_ll(int fd, unsigned long long *val); +extern int sysfs_get_ll(struct mdinfo *sra, struct mdinfo *dev, + char *name, unsigned long long *val); +extern int sysfs_fd_get_two(int fd, unsigned long long *v1, unsigned long long *v2); +extern int sysfs_get_two(struct mdinfo *sra, struct mdinfo *dev, + char *name, unsigned long long *v1, unsigned long long *v2); +extern int sysfs_fd_get_str(int fd, char *val, int size); +extern int 
sysfs_attribute_available(struct mdinfo *sra, struct mdinfo *dev, + char *name); +extern int sysfs_get_str(struct mdinfo *sra, struct mdinfo *dev, + char *name, char *val, int size); +extern int sysfs_set_safemode(struct mdinfo *sra, unsigned long ms); +extern int sysfs_set_array(struct mdinfo *info, int vers); +extern int sysfs_add_disk(struct mdinfo *sra, struct mdinfo *sd, int resume); +extern int sysfs_disk_to_scsi_id(int fd, __u32 *id); +extern int sysfs_unique_holder(char *devnm, long rdev); +extern int sysfs_freeze_array(struct mdinfo *sra); +extern int sysfs_wait(int fd, int *msec); +extern int load_sys(char *path, char *buf, int len); +extern int zero_disk_range(int fd, unsigned long long sector, size_t count); +extern int reshape_prepare_fdlist(char *devname, + struct mdinfo *sra, + int raid_disks, + int nrdisks, + unsigned long blocks, + char *backup_file, + int *fdlist, + unsigned long long *offsets); +extern void reshape_free_fdlist(int *fdlist, + unsigned long long *offsets, + int size); +extern int reshape_open_backup_file(char *backup, + int fd, + char *devname, + long blocks, + int *fdlist, + unsigned long long *offsets, + char *sysfs_name, + int restart); +extern unsigned long compute_backup_blocks(int nchunk, int ochunk, + unsigned int ndata, unsigned int odata); +extern char *locate_backup(char *name); +extern char *make_backup(char *name); + +extern int save_stripes(int *source, unsigned long long *offsets, + int raid_disks, int chunk_size, int level, int layout, + int nwrites, int *dest, + unsigned long long start, unsigned long long length, + char *buf); +extern int restore_stripes(int *dest, unsigned long long *offsets, + int raid_disks, int chunk_size, int level, int layout, + int source, unsigned long long read_offset, + unsigned long long start, unsigned long long length, + char *src_buf); + +#ifndef Sendmail +#define Sendmail "/usr/lib/sendmail -t" +#endif + +#define SYSLOG_FACILITY LOG_DAEMON + +extern char *map_num(mapping_t *map, int 
num); +extern int map_name(mapping_t *map, char *name); +extern mapping_t r0layout[], r5layout[], r6layout[], + pers[], modes[], faultylayout[]; +extern mapping_t consistency_policies[], sysfs_array_states[]; + +extern char *map_dev_preferred(int major, int minor, int create, + char *prefer); +static inline char *map_dev(int major, int minor, int create) +{ + return map_dev_preferred(major, minor, create, NULL); +} + +/** + * is_fd_valid() - check file descriptor. + * @fd: file descriptor. + * + * The function checks if @fd is a nonnegative integer and shall be used only + * to verify open() result. + */ +static inline int is_fd_valid(int fd) +{ + return (fd > -1); +} + +/** + * close_fd() - verify, close and unset file descriptor. + * @fd: pointer to file descriptor. + * + * The function closes and invalidates file descriptor if appropriate. It + * ignores incorrect file descriptor quietly to simplify error handling. + */ +static inline void close_fd(int *fd) +{ + if (is_fd_valid(*fd) && close(*fd) == 0) + *fd = -1; +} + +struct active_array; +struct metadata_update; + +/* 'struct reshape' records the intermediate states of + * a general reshape. + * The starting geometry is converted to the 'before' geometry + * by at most an atomic level change. They could be the same. + * Similarly the 'after' geometry is converted to the final + * geometry by at most a level change. + * Note that 'before' and 'after' must have the same level. + * 'blocks' is the minimum number of sectors for a reshape unit. + * This will be a multiple of the stripe size in each of the + * 'before' and 'after' geometries. + * If 'blocks' is 0, no restriping is necessary. + * 'min_offset_change' is the minimum change to data_offset to + * allow the reshape to happen. It is at least the larger of + * the old and new chunk sizes, and typically the same as 'blocks' + * divided by number of data disks.
+ */ +struct reshape { + int level; + int parity; /* number of parity blocks/devices */ + struct { + int layout; + int data_disks; + } before, after; + unsigned long long backup_blocks; + unsigned long long min_offset_change; + unsigned long long stripes; /* number of old stripes that comprise 'blocks'*/ + unsigned long long new_size; /* New size of array in sectors */ +}; + +/* A superswitch provides entry point to a metadata handler. + * + * The superswitch primarily operates on some "metadata" that + * is accessed via the 'supertype'. + * This metadata has one of three possible sources. + * 1/ It is read from a single device. In this case it may not completely + * describe the array or arrays as some information might be on other + * devices. + * 2/ It is read from all devices in a container. In this case all + * information is present. + * 3/ It is created by ->init_super / ->add_to_super. In this case it will + * be complete once enough ->add_to_super calls have completed. + * + * When creating an array inside a container, the metadata will be + * formed by a combination of 2 and 3. The metadata or the array is read, + * then new information is added. + * + * The metadata must sometimes have a concept of a 'current' array + * and a 'current' device. + * The 'current' array is set by init_super to be the newly created array, + * or is set by super_by_fd when it finds it is looking at an array inside + * a container. + * + * The 'current' device is either the device that the metadata was read from + * in case 1, or the last device added by add_to_super in case 3. + * Case 2 does not identify a 'current' device. + */ +extern struct superswitch { + + /* Used to report details of metadata read from a component + * device. ->load_super has been called. 
+ */ + void (*examine_super)(struct supertype *st, char *homehost); + void (*brief_examine_super)(struct supertype *st, int verbose); + void (*brief_examine_subarrays)(struct supertype *st, int verbose); + void (*export_examine_super)(struct supertype *st); + int (*examine_badblocks)(struct supertype *st, int fd, char *devname); + int (*copy_metadata)(struct supertype *st, int from, int to); + + /* Used to report details of an active array. + * ->load_super was possibly given a 'component' string. + */ + void (*detail_super)(struct supertype *st, char *homehost, + char *subarray); + void (*brief_detail_super)(struct supertype *st, char *subarray); + void (*export_detail_super)(struct supertype *st); + + /* Optional: platform hardware / firmware details */ + int (*detail_platform)(int verbose, int enumerate_only, char *controller_path); + int (*export_detail_platform)(int verbose, char *controller_path); + + /* Used: + * to get uuid to storing in bitmap metadata + * and 'reshape' backup-data metadata + * To see if a device is being re-added to an array it was part of. + */ + void (*uuid_from_super)(struct supertype *st, int uuid[4]); + + /* Extract generic details from metadata. This could be details about + * the container, or about an individual array within the container. + * The determination is made either by: + * load_super being given a 'component' string. + * validate_geometry determining what to create. + * The info includes both array information and device information. + * The particular device should be: + * The last device added by add_to_super + * The device the metadata was loaded from by load_super + * If 'map' is present, then it is an array raid_disks long + * (raid_disk must already be set and correct) and it is filled + * with 1 for slots that are thought to be active and 0 for slots which + * appear to be failed/missing. + * *info is zeroed out before data is added. 
+ */ + void (*getinfo_super)(struct supertype *st, struct mdinfo *info, char *map); + struct mdinfo *(*getinfo_super_disks)(struct supertype *st); + /* Check if the given metadata is flagged as belonging to "this" + * host. 0 for 'no', 1 for 'yes', -1 for "Don't record homehost" + */ + int (*match_home)(struct supertype *st, char *homehost); + + /* Make one of several generic modifications to metadata + * prior to assembly (or other times). + * sparc2.2 - first bug in early 0.90 metadata + * super-minor - change name of 0.90 metadata + * summaries - 'correct' any redundant data + * resync - mark array as dirty to trigger a resync. + * uuid - set new uuid - only 0.90 or 1.x + * name - change the name of the array (where supported) + * homehost - change which host this array is tied to. + * devicesize - If metadata is at start of device, change recorded + * device size to match actual device size + * byteorder - swap bytes for 0.90 metadata + * + * force-one - mark that device as uptodate, not old or failed. + * force-array - mark array as clean if it would not otherwise + * assemble + * assemble - not sure how this is different from force-one... + * linear-grow-new - add a new device to a linear array, but don't + * change the size: so superblock still matches + * linear-grow-update - now change the size of the array. + * writemostly - set the WriteMostly1 bit in the superblock devflags + * readwrite - clear the WriteMostly1 bit in the superblock devflags + * failfast - set the FailFast1 bit in the superblock + * nofailfast - clear the FailFast1 bit + * no-bitmap - clear any record that a bitmap is present. + * bbl - add a bad-block-log if possible + * no-bbl - remove any bad-block-log if it is empty. + * force-no-bbl - remove any bad-block-log even if it is not empty. + * revert-reshape - If a reshape is in progress, modify metadata so + * it will resume going in the opposite direction. 
+ */ + int (*update_super)(struct supertype *st, struct mdinfo *info, + char *update, + char *devname, int verbose, + int uuid_set, char *homehost); + + /* Create new metadata for new array as described. This could + * be a new container, or an array in a pre-existing container. + * Also used to zero metadata prior to writing it to invalidate old + * metadata. + */ + int (*init_super)(struct supertype *st, mdu_array_info_t *info, + struct shape *s, char *name, + char *homehost, int *uuid, + unsigned long long data_offset); + + /* update the metadata to include new device, either at create or + * when hot-adding a spare. + */ + int (*add_to_super)(struct supertype *st, mdu_disk_info_t *dinfo, + int fd, char *devname, + unsigned long long data_offset); + /* update the metadata to delete a device, + * when hot-removing. + */ + int (*remove_from_super)(struct supertype *st, mdu_disk_info_t *dinfo); + + /* Write metadata to one device when fixing problems or adding + * a new device. + */ + int (*store_super)(struct supertype *st, int fd); + + /* Write all metadata for this array. + */ + int (*write_init_super)(struct supertype *st); + /* Check if metadata read from one device is compatible with an array, + * used when assembling an array, or pseudo-assembling was with + * "--examine --brief" + * If "st" has not yet been loaded the superblock from, "tst" is + * moved in, otherwise the superblock in 'st' is compared with + * 'tst'. + */ + int (*compare_super)(struct supertype *st, struct supertype *tst, + int verbose); + /* Load metadata from a single device. If 'devname' is not NULL + * print error messages as appropriate */ + int (*load_super)(struct supertype *st, int fd, char *devname); + /* 'fd' is a 'container' md array - load array metadata from the + * whole container. 
+ */ + int (*load_container)(struct supertype *st, int fd, char *devname); + /* If 'arg' is a valid name of this metadata type, allocate and + * return a 'supertype' for the particular minor version */ + struct supertype * (*match_metadata_desc)(char *arg); + /* If a device has the given size, and the data_offset has been + * requested - work out how much space is available for data. + * This involves adjusting for reserved space (e.g. bitmaps) + * and for any rounding. + * 'mdadm' only calls this for existing arrays where a possible + * spare is being added. However some super-handlers call it + * internally from validate_geometry when creating an array. + */ + __u64 (*avail_size)(struct supertype *st, __u64 size, + unsigned long long data_offset); + /* + * Return spare criteria for array: + * - minimum disk size can be used in array; + * - sector size can be used in array. + * Return values: 0 - for success and -EINVAL on error. + */ + int (*get_spare_criteria)(struct supertype *st, + struct spare_criteria *sc); + /* Find somewhere to put a bitmap - possibly auto-size it - and + * update the metadata to record this. The array may be newly + * created, in which case data_size may be updated, or it might + * already exist. Metadata handler can know if init_super + * has been called, but not write_init_super. + * 0: Success + * -Exxxx: On error + */ + int (*add_internal_bitmap)(struct supertype *st, int *chunkp, + int delay, int write_behind, + unsigned long long size, int may_change, int major); + /* Perform additional setup required to activate a bitmap. + */ + int (*set_bitmap)(struct supertype *st, struct mdinfo *info); + /* Seek 'fd' to start of write-intent-bitmap. Must be an + * md-native format bitmap + */ + int (*locate_bitmap)(struct supertype *st, int fd, int node_num); + /* if add_internal_bitmap succeeded for existing array, this + * writes it out. 
+ */ + int (*write_bitmap)(struct supertype *st, int fd, enum bitmap_update update); + /* Free the superblock and any other allocated data */ + void (*free_super)(struct supertype *st); + + /* validate_geometry is called with an st returned by + * match_metadata_desc. + * It should check that the geometry described is compatible with + * the metadata type. It will be called repeatedly as devices + * added to validate changing size and new devices. If there are + * inter-device dependencies, it should record sufficient details + * so these can be validated. + * Both 'size' and '*freesize' are in sectors. chunk is KiB. + * Return value is: + * 1: everything is OK + * 0: not OK for some reason - if 'verbose', then error was reported. + * -1: st->sb was NULL, 'subdev' is a member of a container of this + * type, but array is not acceptable for some reason + * message was reported even if verbose is 0. + */ + int (*validate_geometry)(struct supertype *st, int level, int layout, + int raiddisks, + int *chunk, unsigned long long size, + unsigned long long data_offset, + char *subdev, unsigned long long *freesize, + int consistency_policy, int verbose); + + /* Return a linked list of 'mdinfo' structures for all arrays + * in the container. For non-containers, it is like + * getinfo_super with an allocated mdinfo.*/ + struct mdinfo *(*container_content)(struct supertype *st, char *subarray); + /* query the supertype for default geometry */ + void (*default_geometry)(struct supertype *st, int *level, int *layout, int *chunk); /* optional */ + /* Permit subarray's to be deleted from inactive containers */ + int (*kill_subarray)(struct supertype *st, + char *subarray_id); /* optional */ + /* Permit subarray's to be modified */ + int (*update_subarray)(struct supertype *st, char *subarray, + char *update, struct mddev_ident *ident); /* optional */ + /* Check if reshape is supported for this external format. 
+ * st is obtained from super_by_fd() where st->subarray[0] is + * initialized to indicate if reshape is being performed at the + * container or subarray level + */ +#define APPLY_METADATA_CHANGES 1 +#define ROLLBACK_METADATA_CHANGES 0 + + int (*reshape_super)(struct supertype *st, + unsigned long long size, int level, + int layout, int chunksize, int raid_disks, + int delta_disks, char *backup, char *dev, + int direction, + int verbose); /* optional */ + int (*manage_reshape)( /* optional */ + int afd, struct mdinfo *sra, struct reshape *reshape, + struct supertype *st, unsigned long blocks, + int *fds, unsigned long long *offsets, + int dests, int *destfd, unsigned long long *destoffsets); + +/* for mdmon */ + int (*open_new)(struct supertype *c, struct active_array *a, + int inst); + + /* Tell the metadata handler the current state of the array. + * This covers whether it is known to be consistent (no pending writes) + * and how far along a resync is known to have progressed + * (in a->resync_start). + * resync status is really irrelevant if the array is not consistent, + * but some metadata (DDF!) have a place to record the distinction. + * If 'consistent' is '2', then the array can mark it dirty if a + * resync/recovery/whatever is required, or leave it clean if not. + * Return value is 0 dirty (not consistent) and 1 if clean. + * it is only really important if consistent is passed in as '2'. + */ + int (*set_array_state)(struct active_array *a, int consistent); + + /* When the state of a device might have changed, we call set_disk to + * tell the metadata what the current state is. + * Typically this happens on spare->in_sync and (spare|in_sync)->faulty + * transitions. + * set_disk might be called when the state of the particular disk has + * not in fact changed. 
+ */ + void (*set_disk)(struct active_array *a, int n, int state); + void (*sync_metadata)(struct supertype *st); + void (*process_update)(struct supertype *st, + struct metadata_update *update); + /* prepare_update allocates extra memory that might be + * needed. If the update cannot be understood, return 0. + */ + int (*prepare_update)(struct supertype *st, + struct metadata_update *update); + + /* activate_spare will check if the array is degraded and, if it + * is, try to find some spare space in the container. + * On success, it adds appropriate updates (for process_update) to + * the 'updates' list and returns a list of 'mdinfo' identifying + * the device, or devices as there might be multiple missing + * devices and multiple spares available. + */ + struct mdinfo *(*activate_spare)(struct active_array *a, + struct metadata_update **updates); + /* + * Return a statically allocated string that represents the metadata specific + * controller domain of the disk. The domain is used in disk domain + * matching functions. Disks belong to the same domain if they have + * the same domain from mdadm.conf and belong to the same metadata domain. + * Returning NULL or not providing this handler means that the metadata + * does not distinguish the differences between disks that belong to + * different controllers. They are in the domain specified by the + * configuration file (mdadm.conf). + * In case the metadata has the notion of domains based on disk + * it shall return NULL for disks that do not belong to the controllers of + * the supported domains. Such disks will form another domain and won't + * be mixed with supported ones. 
+ */ + const char *(*get_disk_controller_domain)(const char *path); + + /* for external backup area */ + int (*recover_backup)(struct supertype *st, struct mdinfo *info); + + /* validate container after assemble */ + int (*validate_container)(struct mdinfo *info); + + /* write initial empty PPL on device */ + int (*write_init_ppl)(struct supertype *st, struct mdinfo *info, int fd); + + /* validate ppl before assemble */ + int (*validate_ppl)(struct supertype *st, struct mdinfo *info, + struct mdinfo *disk); + + /* records new bad block in metadata */ + int (*record_bad_block)(struct active_array *a, int n, + unsigned long long sector, int length); + + /* clears bad block from metadata */ + int (*clear_bad_block)(struct active_array *a, int n, + unsigned long long sector, int length); + + /* get list of bad blocks from metadata */ + struct md_bb *(*get_bad_blocks)(struct active_array *a, int n); + + int swapuuid; /* true if uuid is big-endian rather than host-endian */ + int external; + const char *name; /* canonical metadata name */ +} *superlist[]; + +extern struct superswitch super0, super1; +extern struct superswitch super_imsm, super_ddf; +extern struct superswitch mbr, gpt; + +struct metadata_update { + int len; + char *buf; + void *space; /* allocated space that monitor will use */ + void **space_list; /* list of allocated spaces that monitor can + * use or that it returned. + */ + struct metadata_update *next; +}; + +/* A supertype holds a particular collection of metadata. + * It identifies the metadata type by the superswitch, and the particular + * sub-version of that metadata type. + * metadata read in or created is stored in 'sb' and 'info'. + * There are also fields used by mdmon to track containers. + * + * A supertype may refer to: + * Just an array, possibly in a container + * A container, not identifying any particular array + * Info read from just one device, not yet fully describing the array/container. 
+ * + * + * A supertype is created by: + * super_by_fd + * guess_super + * dup_super + */ +struct supertype { + struct superswitch *ss; + int minor_version; + int max_devs; + char container_devnm[32]; /* devnm of container */ + void *sb; + void *info; + void *other; /* Hack used to convert v0.90 to v1.0 */ + unsigned long long devsize; + unsigned long long data_offset; /* used by v1.x only */ + int ignore_hw_compat; /* used to inform metadata handlers that it should ignore + HW/firmware related incompatibility to load metadata. + Used when examining metadata to display content of disk + when user has no hw/firmware compatible system. + */ + struct metadata_update *updates; + struct metadata_update **update_tail; + + /* extra stuff used by mdmon */ + struct active_array *arrays; + int sock; /* listen to external programs */ + char devnm[32]; /* e.g. md0. This appears in metadata_version: + * external:/md0/12 + */ + int devcnt; + int retry_soon; + int nodes; + char *cluster_name; + + struct mdinfo *devs; + +}; + +extern struct supertype *super_by_fd(int fd, char **subarray); +enum guess_types { guess_any, guess_array, guess_partitions }; +extern struct supertype *guess_super_type(int fd, enum guess_types guess_type); +static inline struct supertype *guess_super(int fd) { + return guess_super_type(fd, guess_any); +} +extern struct supertype *dup_super(struct supertype *st); +extern int get_dev_size(int fd, char *dname, unsigned long long *sizep); +extern int get_dev_sector_size(int fd, char *dname, unsigned int *sectsizep); +extern int must_be_container(int fd); +extern int dev_size_from_id(dev_t id, unsigned long long *size); +extern int dev_sector_size_from_id(dev_t id, unsigned int *size); +void wait_for(char *dev, int fd); + +/* + * Data structures for policy management. + * Each device can have a policy structure that lists + * various name/value pairs each possibly with a metadata associated. 
+ * The policy list is sorted by name/value/metadata + */ +struct dev_policy { + struct dev_policy *next; + char *name; /* None of these strings are allocated. They are + * all just references to strings which are known + * to exist elsewhere. + * name and metadata can be compared by address equality. + */ + const char *metadata; + const char *value; +}; + +extern char pol_act[], pol_domain[], pol_metadata[], pol_auto[]; + +/* iterate over the sublist starting at list, having the same + * 'name' as 'list', and matching the given metadata (Where + * NULL matches anything + */ +#define pol_for_each(item, list, _metadata) \ + for (item = list; \ + item && item->name == list->name; \ + item = item->next) \ + if (!(!_metadata || !item->metadata || _metadata == item->metadata)) \ + ; else + +/* + * policy records read from mdadm are largely just name-value pairs. + * The names are constants, not strdupped + */ +struct pol_rule { + struct pol_rule *next; + char *type; /* rule_policy or rule_part */ + struct rule { + struct rule *next; + char *name; + char *value; + char *dups; /* duplicates of 'value' with a partNN appended */ + } *rule; +}; + +extern char rule_policy[], rule_part[]; +extern char rule_path[], rule_type[]; +extern char type_part[], type_disk[]; + +extern void policyline(char *line, char *type); +extern void policy_add(char *type, ...); +extern void policy_free(void); + +extern struct dev_policy *path_policy(char **paths, char *type); +extern struct dev_policy *disk_policy(struct mdinfo *disk); +extern struct dev_policy *devid_policy(int devid); +extern void dev_policy_free(struct dev_policy *p); + +//extern void pol_new(struct dev_policy **pol, char *name, char *val, char *metadata); +extern void pol_add(struct dev_policy **pol, char *name, char *val, char *metadata); +extern struct dev_policy *pol_find(struct dev_policy *pol, char *name); + +enum policy_action { + act_default, + act_include, + act_re_add, + act_spare, /* This only applies to bare devices 
*/ + act_spare_same_slot, /* this allows non-bare devices, + * but only if recent removal */ + act_force_spare, /* this allow non-bare devices in any case */ + act_err +}; + +extern int policy_action_allows(struct dev_policy *plist, const char *metadata, + enum policy_action want); +extern int disk_action_allows(struct mdinfo *disk, const char *metadata, + enum policy_action want); + +struct domainlist { + struct domainlist *next; + const char *dom; +}; + +extern int domain_test(struct domainlist *dom, struct dev_policy *pol, + const char *metadata); +extern struct domainlist *domain_from_array(struct mdinfo *mdi, + const char *metadata); +extern void domainlist_add_dev(struct domainlist **dom, int devid, + const char *metadata); +extern void domain_free(struct domainlist *dl); +extern void domain_merge(struct domainlist **domp, struct dev_policy *pol, + const char *metadata); +void domain_add(struct domainlist **domp, char *domain); + +extern void policy_save_path(char *id_path, struct map_ent *array); +extern int policy_check_path(struct mdinfo *disk, struct map_ent *array); + +extern void sysfs_rules_apply(char *devnm, struct mdinfo *dev); +extern void sysfsline(char *line); + +#if __GNUC__ < 3 +struct stat64; +#endif + +#define HAVE_NFTW we assume +#define HAVE_FTW + +#ifdef __UCLIBC__ +# include <features.h> +# ifndef __UCLIBC_HAS_LFS__ +# define lseek64 lseek +# endif +# ifndef __UCLIBC_HAS_FTW__ +# undef HAVE_FTW +# undef HAVE_NFTW +# endif +#endif + +#ifdef __dietlibc__ +# undef HAVE_NFTW +#endif + +#if defined(__KLIBC__) +# undef HAVE_NFTW +# undef HAVE_FTW +#endif + +#ifndef HAVE_NFTW +# define FTW_PHYS 1 +# ifndef HAVE_FTW + struct FTW {}; +# endif +#endif + +#ifdef HAVE_FTW +# include <ftw.h> +#endif + +extern int add_dev(const char *name, const struct stat *stb, int flag, struct FTW *s); + +extern int Manage_ro(char *devname, int fd, int readonly); +extern int Manage_run(char *devname, int fd, struct context *c); +extern int Manage_stop(char *devname, 
int fd, int quiet, + int will_retry); +extern int Manage_subdevs(char *devname, int fd, + struct mddev_dev *devlist, int verbose, int test, + char *update, int force); +extern int autodetect(void); +extern int Grow_Add_device(char *devname, int fd, char *newdev); +extern int Grow_addbitmap(char *devname, int fd, + struct context *c, struct shape *s); +extern int Grow_reshape(char *devname, int fd, + struct mddev_dev *devlist, + unsigned long long data_offset, + struct context *c, struct shape *s); +extern int Grow_restart(struct supertype *st, struct mdinfo *info, + int *fdlist, int cnt, char *backup_file, int verbose); +extern int Grow_continue(int mdfd, struct supertype *st, + struct mdinfo *info, char *backup_file, + int forked, int freeze_reshape); +extern int Grow_consistency_policy(char *devname, int fd, + struct context *c, struct shape *s); + +extern int restore_backup(struct supertype *st, + struct mdinfo *content, + int working_disks, + int spares, + char **backup_filep, + int verbose); +extern int Grow_continue_command(char *devname, int fd, + char *backup_file, int verbose); + +extern int Assemble(struct supertype *st, char *mddev, + struct mddev_ident *ident, + struct mddev_dev *devlist, + struct context *c); + +extern int Build(char *mddev, struct mddev_dev *devlist, + struct shape *s, struct context *c); + +extern int Create(struct supertype *st, char *mddev, + char *name, int *uuid, + int subdevs, struct mddev_dev *devlist, + struct shape *s, + struct context *c, + unsigned long long data_offset); + +extern int Detail(char *dev, struct context *c); +extern int Detail_Platform(struct superswitch *ss, int scan, int verbose, int export, char *controller_path); +extern int Query(char *dev); +extern int ExamineBadblocks(char *devname, int brief, struct supertype *forcest); +extern int Examine(struct mddev_dev *devlist, struct context *c, + struct supertype *forcest); +extern int Monitor(struct mddev_dev *devlist, + char *mailaddr, char *alert_cmd, + 
struct context *c, + int daemonise, int oneshot, + int dosyslog, char *pidfile, int increments, + int share); + +extern int Kill(char *dev, struct supertype *st, int force, int verbose, int noexcl); +extern int Kill_subarray(char *dev, char *subarray, int verbose); +extern int Update_subarray(char *dev, char *subarray, char *update, struct mddev_ident *ident, int quiet); +extern int Wait(char *dev); +extern int WaitClean(char *dev, int verbose); +extern int SetAction(char *dev, char *action); + +extern int Incremental(struct mddev_dev *devlist, struct context *c, + struct supertype *st); +extern void RebuildMap(void); +extern int IncrementalScan(struct context *c, char *devnm); +extern int IncrementalRemove(char *devname, char *path, int verbose); +extern int CreateBitmap(char *filename, int force, char uuid[16], + unsigned long chunksize, unsigned long daemon_sleep, + unsigned long write_behind, + unsigned long long array_size, + int major); +extern int ExamineBitmap(char *filename, int brief, struct supertype *st); +extern int IsBitmapDirty(char *filename); +extern int Write_rules(char *rule_name); +extern int bitmap_update_uuid(int fd, int *uuid, int swap); + +/* calculate the size of the bitmap given the array size and bitmap chunksize */ +static inline unsigned long long +bitmap_bits(unsigned long long array_size, unsigned long chunksize) +{ + return (array_size * 512 + chunksize - 1) / chunksize; +} + +extern int Dump_metadata(char *dev, char *dir, struct context *c, + struct supertype *st); +extern int Restore_metadata(char *dev, char *dir, struct context *c, + struct supertype *st, int only); + +int md_array_valid(int fd); +int md_array_active(int fd); +int md_array_is_active(struct mdinfo *info); +int md_get_array_info(int fd, struct mdu_array_info_s *array); +int md_set_array_info(int fd, struct mdu_array_info_s *array); +int md_get_disk_info(int fd, struct mdu_disk_info_s *disk); +extern int get_linux_version(void); +extern int mdadm_version(char 
*version); +extern unsigned long long parse_size(char *size); +extern int parse_uuid(char *str, int uuid[4]); +extern int is_near_layout_10(int layout); +extern int parse_layout_10(char *layout); +extern int parse_layout_faulty(char *layout); +extern int parse_num(int *dest, char *num); +extern int parse_cluster_confirm_arg(char *inp, char **devname, int *slot); +extern int check_ext2(int fd, char *name); +extern int check_reiser(int fd, char *name); +extern int check_raid(int fd, char *name); +extern int check_partitions(int fd, char *dname, + unsigned long long freesize, + unsigned long long size); +extern int fstat_is_blkdev(int fd, char *devname, dev_t *rdev); +extern int stat_is_blkdev(char *devname, dev_t *rdev); + +extern bool is_dev_alive(char *path); +extern int get_mdp_major(void); +extern int get_maj_min(char *dev, int *major, int *minor); +extern int dev_open(char *dev, int flags); +extern int open_dev(char *devnm); +extern void reopen_mddev(int mdfd); +extern int open_dev_flags(char *devnm, int flags); +extern int open_dev_excl(char *devnm); +extern int is_standard(char *dev, int *nump); +extern int same_dev(char *one, char *two); +extern int compare_paths (char* path1,char* path2); +extern void enable_fds(int devices); +extern void manage_fork_fds(int close_all); +extern int continue_via_systemd(char *devnm, char *service_name); + +extern int parse_auto(char *str, char *msg, int config); +extern struct mddev_ident *conf_get_ident(char *dev); +extern struct mddev_dev *conf_get_devs(void); +extern int conf_test_dev(char *devname); +extern int conf_test_metadata(const char *version, struct dev_policy *pol, int is_homehost); +extern struct createinfo *conf_get_create_info(void); +extern void set_conffile(char *file); +extern char *conf_get_mailaddr(void); +extern char *conf_get_mailfrom(void); +extern char *conf_get_program(void); +extern char *conf_get_homehost(int *require_homehostp); +extern char *conf_get_homecluster(void); +extern int 
conf_get_monitor_delay(void); +extern char *conf_line(FILE *file); +extern char *conf_word(FILE *file, int allow_key); +extern void print_quoted(char *str); +extern void print_escape(char *str); +extern int use_udev(void); +extern unsigned long GCD(unsigned long a, unsigned long b); +extern int conf_name_is_free(char *name); +extern int conf_verify_devnames(struct mddev_ident *array_list); +extern int devname_matches(char *name, char *match); +extern struct mddev_ident *conf_match(struct supertype *st, + struct mdinfo *info, + char *devname, + int verbose, int *rvp); + +extern void free_line(char *line); +extern int match_oneof(char *devices, char *devname); +extern void uuid_from_super(int uuid[4], mdp_super_t *super); +extern const int uuid_zero[4]; +extern int same_uuid(int a[4], int b[4], int swapuuid); +extern void copy_uuid(void *a, int b[4], int swapuuid); +extern char *__fname_from_uuid(int id[4], int swap, char *buf, char sep); +extern char *fname_from_uuid(struct supertype *st, + struct mdinfo *info, char *buf, char sep); +extern unsigned long calc_csum(void *super, int bytes); +extern int enough(int level, int raid_disks, int layout, int clean, + char *avail); +extern int ask(char *mesg); +extern unsigned long long get_component_size(int fd); +extern void remove_partitions(int fd); +extern int test_partition(int fd); +extern int test_partition_from_id(dev_t id); +extern int get_data_disks(int level, int layout, int raid_disks); +extern unsigned long long calc_array_size(int level, int raid_disks, int layout, + int chunksize, unsigned long long devsize); +extern int flush_metadata_updates(struct supertype *st); +extern void append_metadata_update(struct supertype *st, void *buf, int len); +extern int assemble_container_content(struct supertype *st, int mdfd, + struct mdinfo *content, + struct context *c, + char *chosen_name, int *result); +#define INCR_NO 1 +#define INCR_UNSAFE 2 +#define INCR_ALREADY 4 +#define INCR_YES 8 +extern struct mdinfo 
*container_choose_spares(struct supertype *st, + struct spare_criteria *criteria, + struct domainlist *domlist, + char *spare_group, + const char *metadata, int get_one); +extern int move_spare(char *from_devname, char *to_devname, dev_t devid); +extern int add_disk(int mdfd, struct supertype *st, + struct mdinfo *sra, struct mdinfo *info); +extern int remove_disk(int mdfd, struct supertype *st, + struct mdinfo *sra, struct mdinfo *info); +extern int hot_remove_disk(int mdfd, unsigned long dev, int force); +extern int sys_hot_remove_disk(int statefd, int force); +extern int set_array_info(int mdfd, struct supertype *st, struct mdinfo *info); +unsigned long long min_recovery_start(struct mdinfo *array); + +extern char *human_size(long long bytes); +extern char *human_size_brief(long long bytes, int prefix); +extern void print_r10_layout(int layout); + +extern char *find_free_devnm(int use_partitions); + +extern void put_md_name(char *name); +extern char *devid2kname(dev_t devid); +extern char *devid2devnm(dev_t devid); +extern dev_t devnm2devid(char *devnm); +extern char *get_md_name(char *devnm); + +extern char DefaultConfFile[]; + +extern int create_mddev(char *dev, char *name, int autof, int trustworthy, + char *chosen, int block_udev); +/* values for 'trustworthy' */ +#define LOCAL 1 +#define LOCAL_ANY 10 +#define FOREIGN 2 +#define METADATA 3 +extern int open_mddev(char *dev, int report_errors); +extern int open_container(int fd); +extern int metadata_container_matches(char *metadata, char *devnm); +extern int metadata_subdev_matches(char *metadata, char *devnm); +extern int is_container_member(struct mdstat_ent *ent, char *devname); +extern int is_subarray_active(char *subarray, char *devname); +extern int open_subarray(char *dev, char *subarray, struct supertype *st, int quiet); +extern struct superswitch *version_to_superswitch(char *vers); + +extern int mdmon_running(char *devnm); +extern int mdmon_pid(char *devnm); +extern int check_env(char *name); 
+extern __u32 random32(void); +extern void random_uuid(__u8 *buf); +extern int start_mdmon(char *devnm); + +extern int child_monitor(int afd, struct mdinfo *sra, struct reshape *reshape, + struct supertype *st, unsigned long stripes, + int *fds, unsigned long long *offsets, + int dests, int *destfd, unsigned long long *destoffsets); +void abort_reshape(struct mdinfo *sra); + +void *super1_make_v0(struct supertype *st, struct mdinfo *info, mdp_super_t *sb0); + +extern char *stat2kname(struct stat *st); +extern char *fd2kname(int fd); +extern char *stat2devnm(struct stat *st); +extern char *fd2devnm(int fd); +extern void udev_block(char *devnm); +extern void udev_unblock(void); + +extern int in_initrd(void); + +struct cmap_hooks { + void *cmap_handle; /* corosync lib related */ + + int (*initialize)(cmap_handle_t *handle); + int (*get_string)(cmap_handle_t handle, + const char *string, + char **name); + int (*finalize)(cmap_handle_t handle); +}; + +extern void set_cmap_hooks(void); +extern void set_hooks(void); + +struct dlm_hooks { + void *dlm_handle; /* dlm lib related */ + + dlm_lshandle_t (*create_lockspace)(const char *name, + unsigned int mode); + dlm_lshandle_t (*open_lockspace)(const char *name); + int (*release_lockspace)(const char *name, dlm_lshandle_t ls, + int force); + int (*ls_lock)(dlm_lshandle_t lockspace, uint32_t mode, + struct dlm_lksb *lksb, uint32_t flags, + const void *name, unsigned int namelen, + uint32_t parent, void (*astaddr) (void *astarg), + void *astarg, void (*bastaddr) (void *astarg), + void *range); + int (*ls_unlock_wait)(dlm_lshandle_t lockspace, uint32_t lkid, + uint32_t flags, struct dlm_lksb *lksb); + int (*ls_get_fd)(dlm_lshandle_t ls); + int (*dispatch)(int fd); +}; + +extern int get_cluster_name(char **name); +extern int dlm_funs_ready(void); +extern int cluster_get_dlmlock(void); +extern int cluster_release_dlmlock(void); +extern void set_dlm_hooks(void); + +#define _ROUND_UP(val, base) (((val) + (base) - 1) & ~((base) - 1)) /* round val up to a multiple of base; the mask arithmetic is only correct when base is a power of two; every use of base is parenthesised so expression arguments expand safely */
+#define ROUND_UP(val, base) _ROUND_UP(val, (typeof(val))(base)) +#define ROUND_UP_PTR(ptr, base) ((typeof(ptr)) \ + (ROUND_UP((unsigned long)(ptr), base))) + +static inline int is_subarray(char *vers) +{ + /* The version string for a 'subarray' (an array in a container) + * is + * /containername/componentname for normal read-write arrays + * -containername/componentname for arrays which mdmon must not + * reconfigure. They might be read-only + * or might be undergoing reshape etc. + * containername is e.g. md0, md_d1 + * componentname is dependant on the metadata. e.g. '1' 'S1' ... + */ + return (*vers == '/' || *vers == '-'); +} + +static inline char *to_subarray(struct mdstat_ent *ent, char *container) +{ + return &ent->metadata_version[10+strlen(container)+1]; +} + +#ifdef DEBUG +#define dprintf(fmt, arg...) \ + fprintf(stderr, "%s: %s: "fmt, Name, __func__, ##arg) +#define dprintf_cont(fmt, arg...) \ + fprintf(stderr, fmt, ##arg) +#else +#define dprintf(fmt, arg...) \ + ({ if (0) fprintf(stderr, "%s: %s: " fmt, Name, __func__, ##arg); 0; }) +#define dprintf_cont(fmt, arg...) \ + ({ if (0) fprintf(stderr, fmt, ##arg); 0; }) +#endif +#include <assert.h> +#include <stdarg.h> +static inline int xasprintf(char **strp, const char *fmt, ...) { + va_list ap; + int ret; + va_start(ap, fmt); + ret = vasprintf(strp, fmt, ap); + va_end(ap); + assert(ret >= 0); + return ret; +} + +#ifdef DEBUG +#define pr_err(fmt, args...) fprintf(stderr, "%s: %s: "fmt, Name, __func__, ##args) +#else +#define pr_err(fmt, args...) fprintf(stderr, "%s: "fmt, Name, ##args) +#endif +#define cont_err(fmt ...) 
fprintf(stderr, " " fmt) + +void *xmalloc(size_t len); +void *xrealloc(void *ptr, size_t len); +void *xcalloc(size_t num, size_t size); +char *xstrdup(const char *str); + +#define LEVEL_MULTIPATH (-4) +#define LEVEL_LINEAR (-1) +#define LEVEL_FAULTY (-5) + +/* kernel module doesn't know about these */ +#define LEVEL_CONTAINER (-100) +#define LEVEL_UNSUPPORTED (-200) + +/* the kernel does know about this one ... */ +#define LEVEL_NONE (-1000000) + +/* faulty stuff */ + +#define WriteTransient 0 +#define ReadTransient 1 +#define WritePersistent 2 +#define ReadPersistent 3 +#define WriteAll 4 /* doesn't go to device */ +#define ReadFixable 5 +#define Modes 6 + +#define ClearErrors 31 +#define ClearFaults 30 + +#define AllPersist 100 /* internal use only */ +#define NoPersist 101 + +#define ModeMask 0x1f +#define ModeShift 5 + +#ifdef __TINYC__ +#undef minor +#undef major +#undef makedev +#define minor(x) ((x)&0xff) +#define major(x) (((x)>>8)&0xff) +#define makedev(M,m) (((M)<<8) | (m)) +#endif + +enum r0layout { + RAID0_ORIG_LAYOUT = 1, + RAID0_ALT_MULTIZONE_LAYOUT = 2, +}; + +/* for raid4/5/6 */ +#define ALGORITHM_LEFT_ASYMMETRIC 0 +#define ALGORITHM_RIGHT_ASYMMETRIC 1 +#define ALGORITHM_LEFT_SYMMETRIC 2 +#define ALGORITHM_RIGHT_SYMMETRIC 3 + +/* Define non-rotating (raid4) algorithms. These allow + * conversion of raid4 to raid5. + */ +#define ALGORITHM_PARITY_0 4 /* P or P,Q are initial devices */ +#define ALGORITHM_PARITY_N 5 /* P or P,Q are final devices. */ + +/* DDF RAID6 layouts differ from md/raid6 layouts in two ways. + * Firstly, the exact positioning of the parity block is slightly + * different between the 'LEFT_*' modes of md and the "_N_*" modes + * of DDF. + * Secondly, or order of datablocks over which the Q syndrome is computed + * is different. + * Consequently we have different layouts for DDF/raid6 than md/raid6. + * These layouts are from the DDFv1.2 spec. 
+ * Interestingly DDFv1.2-Errata-A does not specify N_CONTINUE but + * leaves RLQ=3 as 'Vendor Specific' + */ + +#define ALGORITHM_ROTATING_ZERO_RESTART 8 /* DDF PRL=6 RLQ=1 */ +#define ALGORITHM_ROTATING_N_RESTART 9 /* DDF PRL=6 RLQ=2 */ +#define ALGORITHM_ROTATING_N_CONTINUE 10 /*DDF PRL=6 RLQ=3 */ + +/* For every RAID5 algorithm we define a RAID6 algorithm + * with exactly the same layout for data and parity, and + * with the Q block always on the last device (N-1). + * This allows trivial conversion from RAID5 to RAID6 + */ +#define ALGORITHM_LEFT_ASYMMETRIC_6 16 +#define ALGORITHM_RIGHT_ASYMMETRIC_6 17 +#define ALGORITHM_LEFT_SYMMETRIC_6 18 +#define ALGORITHM_RIGHT_SYMMETRIC_6 19 +#define ALGORITHM_PARITY_0_6 20 +#define ALGORITHM_PARITY_N_6 ALGORITHM_PARITY_N + +/* Define PATH_MAX in case we don't use glibc or standard library does + * not have PATH_MAX defined. Assume max path length is 4K characters. + */ +#ifndef PATH_MAX +#define PATH_MAX 4096 +#endif + +#define RESYNC_NONE -1 +#define RESYNC_DELAYED -2 +#define RESYNC_PENDING -3 +#define RESYNC_REMOTE -4 +#define RESYNC_UNKNOWN -5 + +/* When using "GET_DISK_INFO" it isn't certain how high + * we need to check. So we impose an absolute limit of + * MAX_DISKS. This needs to be much more than the largest + * number of devices any metadata can support. Currently + * v1.x can support 1920 + */ +#define MAX_DISKS 4096 + +/* Sometimes the 'size' value passed needs to mean "Maximum". + * In those cases with use MAX_SIZE + */ +#define MAX_SIZE 1 + +/* We want to use unsigned numbers for sector counts, but need + * a value for 'invalid'. Use '1'. + */ +#define INVALID_SECTORS 1 +/* And another special number needed for --data_offset=variable */ +#define VARIABLE_OFFSET 3 + +/** + * This is true for native and DDF, IMSM allows 16. 
+ */ +#define MD_NAME_MAX 32 diff --git a/mdadm.spec b/mdadm.spec new file mode 100644 index 0000000..1b7c6bd --- /dev/null +++ b/mdadm.spec @@ -0,0 +1,47 @@ +Summary: mdadm is used for controlling Linux md devices (aka RAID arrays) +Name: mdadm +Version: 4.2 +Release: 1 +Source: https://www.kernel.org/pub/linux/utils/raid/mdadm/mdadm-%{version}.tar.gz +URL: https://neil.brown.name/blog/mdadm +License: GPL +Group: Utilities/System +BuildRoot: %{_tmppath}/%{name}-root +Obsoletes: mdctl + +%description +mdadm is a program that can be used to create, manage, and monitor +Linux MD (Software RAID) devices. + +%prep +%setup -q +# we want to install in /sbin, not /usr/sbin... +%define _exec_prefix %{nil} + +%build +# This is a debatable issue. The author of this RPM spec file feels that +# people who install RPMs (especially given that the default RPM options +# will strip the binary) are not going to be running gdb against the +# program. +make CXFLAGS="$RPM_OPT_FLAGS" SYSCONFDIR="%{_sysconfdir}" + +%install +make DESTDIR=$RPM_BUILD_ROOT MANDIR=%{_mandir} BINDIR=%{_sbindir} install +install -D -m644 mdadm.conf-example $RPM_BUILD_ROOT/%{_sysconfdir}/mdadm.conf + +%clean +rm -rf $RPM_BUILD_ROOT + +%files +%defattr(-,root,root) +%doc TODO ChangeLog mdadm.conf-example COPYING +%{_sbindir}/mdadm +%{_sbindir}/mdmon +/usr/lib/udev/rules.d/01-md-raid-creating.rules +/usr/lib/udev/rules.d/63-md-raid-arrays.rules +/usr/lib/udev/rules.d/64-md-raid-assembly.rules +/usr/lib/udev/rules.d/69-md-clustered-confirm-device.rules +%config(noreplace,missingok)/%{_sysconfdir}/mdadm.conf +%{_mandir}/man*/md* + +%changelog diff --git a/mdmon-design.txt b/mdmon-design.txt new file mode 100644 index 0000000..f09184a --- /dev/null +++ b/mdmon-design.txt @@ -0,0 +1,146 @@ + +When managing a RAID1 array which uses metadata other than the +"native" metadata understood by the kernel, mdadm makes use of a +partner program named 'mdmon' to manage some aspects of updating +that metadata and synchronising 
the metadata with the array state. + +This document provides some details on how mdmon works. + +Containers +---------- + +As background: mdadm makes a distinction between an 'array' and a +'container'. Other sources sometimes use the term 'volume' or +'device' for an 'array', and may use the term 'array' for a +'container'. + +For our purposes: + - a 'container' is a collection of devices which are described by a + single set of metadata. The metadata may be stored equally + on all devices, or different devices may have quite different + subsets of the total metadata. But there is conceptually one set + of metadata that unifies the devices. + + - an 'array' is a set of datablocks from various devices which + together are used to present the abstraction of a single linear + sequence of blocks, which may provide data redundancy or enhanced + performance. + +So a container has some metadata and provides a number of arrays which +are described by that metadata. + +Sometimes this model doesn't work perfectly. For example, global +spares may have their own metadata which is quite different from the +metadata from any device that participates in one or more arrays. +Such a global spare might still need to belong to some container so +that it is available to be used should a failure arise. In that case +we consider the 'metadata' to be the union of the metadata on the +active devices which describes the arrays, and the metadata on the +global spares which only describes the spares. In this case different +devices in the one container will have quite different metadata. + + +Purpose +------- + +The main purpose of mdmon is to update the metadata in response to +changes to the array which need to be reflected in the metadata before +future writes to the array can safely be performed. +These include: + - transitions from 'clean' to 'dirty'. + - recording that devices have failed.
+ - recording the progress of a 'reshape' + +This requires mdmon to be running at any time that the array is +writable (a read-only array does not require mdmon to be running). + +Because mdmon must be able to process these metadata updates at any +time, it must (when running) have exclusive write access to the +metadata. Any other changes (e.g. reconfiguration of the array) must +go through mdmon. + +A secondary role for mdmon is to activate spares when a device fails. +This role is much less time-critical than the other metadata updates, +so it could be performed by a separate process, possibly +"mdadm --monitor" which has a related role of moving devices between +arrays. A main reason for including this functionality in mdmon is +that in the native-metadata case this function is handled in the +kernel, and mdmon's reason for existence is to provide functionality +which is otherwise handled by the kernel. + + +Design overview +--------------- + +mdmon is structured as two threads with a common address space and +common data structures. These threads are known as the 'monitor' and +the 'manager'. + +The 'monitor' has the primary role of monitoring the array for +important state changes and updating the metadata accordingly. As +writes to the array can be blocked until 'monitor' completes and +acknowledges the update, it must be very careful not to block itself. +In particular it must not block waiting for any write to complete else +it could deadlock. This means that it must not allocate memory as +doing this can require dirty memory to be written out and if the +system chooses to write to the array that mdmon is monitoring, the +memory allocation could deadlock. + +So 'monitor' must never allocate memory and must limit the number of +other system calls it performs.
It may: + - use select (or poll) to wait for activity on a file descriptor + - read from a sysfs file descriptor + - write to a sysfs file descriptor + - write the metadata out to the block devices using O_DIRECT + - send a signal (kill) to the manager thread + +It must not e.g. open files or do anything similar that might allocate +resources. + +The 'manager' thread does everything else that is needed. If any +files are to be opened (e.g. because a device has been added to the +array), the manager does that. If any memory needs to be allocated +(e.g. to hold data about a new array as can happen when one set of +metadata describes several arrays), the manager performs that +allocation. + +The 'manager' is also responsible for communicating with mdadm and +assigning spares to replace failed devices. + + +Handling metadata updates +------------------------- + +There are a number of cases in which mdadm needs to update the +metadata which mdmon is managing. These include: + - creating a new array in an active container + - adding a device to a container + - reconfiguring an array +etc. + +To complete these updates, mdadm must send a message to mdmon which +will merge the update into the metadata as it is at that moment. + +To achieve this, mdmon creates a Unix Domain Socket which the manager +thread listens on. mdadm sends a message over this socket. The +manager thread examines the message to see if it will require +allocating any memory and allocates it. This is done in the +'prepare_update' metadata method. + +The update message is then queued for handling by the monitor thread +which it will do when convenient. The monitor thread calls +->process_update which should atomically make the required changes to +the metadata, making use of the pre-allocated memory as required. Any +memory that is no longer needed can be placed back in the request and +the manager thread will free it.
+ +The exact format of a metadata update is up to the implementer of the +metadata handlers. It will simply describe a change that needs to be +made. It will sometimes contain fragments of the metadata to be +copied in to place. However the ->process_update routine must make +sure not to over-write any field that the monitor thread might have +updated, such as a 'device failed' or 'array is dirty' state. + +When the monitor thread has completed the update and written it to the +devices, an acknowledgement message is sent back over the socket so +that mdadm knows it is complete. @@ -0,0 +1,257 @@ +.\" See file COPYING in distribution for details. +.TH MDMON 8 "" v4.2 +.SH NAME +mdmon \- monitor MD external metadata arrays + +.SH SYNOPSIS + +.BI mdmon " [--all] [--takeover] [--foreground] CONTAINER" + +.SH OVERVIEW +The 2.6.27 kernel brings the ability to support external metadata arrays. +External metadata implies that user space handles all updates to the metadata. +The kernel's responsibility is to notify user space when a "metadata event" +occurs, like disk failures and clean-to-dirty transitions. The kernel, in +important cases, waits for user space to take action on these notifications. + +.SH DESCRIPTION +.SS Metadata updates: +To service metadata update requests a daemon, +.IR mdmon , +is introduced. +.I Mdmon +is tasked with polling the sysfs namespace looking for changes in +.BR array_state , +.BR sync_action , +and per disk +.BR state +attributes. When a change is detected it calls a per metadata type +handler to make modifications to the metadata. The following actions +are taken: +.RS +.TP +.B array_state \- inactive +Clear the dirty bit for the volume and let the array be stopped +.TP +.B array_state \- write pending +Set the dirty bit for the array and then set +.B array_state +to +.BR active . +Writes +are blocked until userspace writes +.BR active. 
+.TP +.B array_state \- active-idle +The safe mode timer has expired so set array state to clean to block writes to the array +.TP +.B array_state \- clean +Clear the dirty bit for the volume +.TP +.B array_state \- read-only +This is the initial state that all arrays start at. +.I mdmon +takes one of the three actions: +.RS +.TP +1/ +Transition the array to read-auto keeping the dirty bit clear if the metadata +handler determines that the array does not need resyncing or other modification +.TP +2/ +Transition the array to active if the metadata handler determines a resync or +some other manipulation is necessary +.TP +3/ +Leave the array read\-only if the volume is marked to not be monitored; for +example, the metadata version has been set to "external:\-dev/md127" instead of +"external:/dev/md127" +.RE +.TP +.B sync_action \- resync\-to\-idle +Notify the metadata handler that a resync may have completed. If a resync +process is idled before it completes this event allows the metadata handler to +checkpoint resync. +.TP +.B sync_action \- recover\-to\-idle +A spare may have completed rebuilding so tell the metadata handler about the +state of each disk. This is the metadata handler's opportunity to clear +any "out-of-sync" bits and clear the volume's degraded status. If a recovery +process is idled before it completes this event allows the metadata handler to +checkpoint recovery. +.TP +.B <disk>/state \- faulty +A disk failure kicks off a series of events. First, notify the metadata +handler that a disk has failed, and then notify the kernel that it can unblock +writes that were dependent on this disk. After unblocking the kernel this disk +is set to be removed+ from the member array. Finally the disk is marked failed +in all other member arrays in the container. +.IP ++ Note This behavior differs slightly from native MD arrays where +removal is reserved for a +.B mdadm --remove +event. 
In the external metadata case the container holds the final +reference on a block device and a +.B mdadm --remove <container> <victim> +call is still required. +.RE + +.SS Containers: +.P +External metadata formats, like DDF, differ from the native MD metadata +formats in that they define a set of disks and a series of sub-arrays +within those disks. MD metadata in comparison defines a 1:1 +relationship between a set of block devices and a RAID array. For +example to create 2 arrays at different RAID levels on a single +set of disks, MD metadata requires the disks be partitioned and then +each array can be created with a subset of those partitions. The +supported external formats perform this disk carving internally. +.P +Container devices simply hold references to all member disks and allow +tools like +.I mdmon +to determine which active arrays belong to which +container. Some array management commands like disk removal and disk +add are now only valid at the container level. Attempts to perform +these actions on member arrays are blocked with error messages like: +.IP +"mdadm: Cannot remove disks from a \'member\' array, perform this +operation on the parent container" +.P +Containers are identified in /proc/mdstat with a metadata version string +"external:<metadata name>". Member devices are identified by +"external:/<container device>/<member index>", or "external:-<container +device>/<member index>" if the array is to remain readonly. + +.SH OPTIONS +.TP +CONTAINER +The +.B container +device to monitor. It can be a full path like /dev/md/container, or a +simple md device name like md127. +.TP +.B \-\-foreground +Normally, +.I mdmon +will fork and continue in the background. Adding this option will +skip that step and run +.I mdmon +in the foreground. +.TP +.B \-\-takeover +This instructs +.I mdmon +to replace any active +.I mdmon +which is currently monitoring the array. 
This is primarily used late +in the boot process to replace any +.I mdmon +which was started from an +.B initramfs +before the root filesystem was mounted. This avoids holding a +reference on that +.B initramfs +indefinitely and ensures that the +.I pid +and +.I sock +files used to communicate with +.I mdmon +are in a standard place. +.TP +.B \-\-all +This tells mdmon to find any active containers and start monitoring +each of them if appropriate. This is normally used with +.B \-\-takeover +late in the boot sequence. +A separate +.I mdmon +process is started for each container as the +.B \-\-all +argument is over-written with the name of the container. To allow for +containers with names longer than 5 characters, this argument can be +arbitrarily extended, e.g. to +.BR \-\-all-active-arrays . +.TP + +.PP +Note that +.I mdmon +is automatically started by +.I mdadm +when needed and so does not need to be considered when working with +RAID arrays. The only times it is run other than by +.I mdadm +is when the boot scripts need to restart it after mounting the new +root filesystem. + +.SH START UP AND SHUTDOWN + +As +.I mdmon +needs to be running whenever any filesystem on the monitored device is +mounted there are special considerations when the root filesystem is +mounted from an +.I mdmon +monitored device. +Note that in general +.I mdmon +is needed even if the filesystem is mounted read-only as some +filesystems can still write to the device in those circumstances, for +example to replay a journal after an unclean shutdown. + +When the array is assembled by the +.B initramfs +code, mdadm will automatically start +.I mdmon +as required. This means that +.I mdmon +must be installed on the +.B initramfs +and there must be a writable filesystem (typically tmpfs) in which +.B mdmon +can create a +.B .pid +and +.B .sock +file. The particular filesystem to use is given to mdmon at compile +time and defaults to +.BR /run/mdadm . 
+ +This filesystem must persist through to shutdown time. + +After the final root filesystem has been instantiated (usually with +.BR pivot_root ) +.I mdmon +should be run with +.I "\-\-all \-\-takeover" +so that the +.I mdmon +running from the +.B initramfs +can be replaced with one running in the main root, and so the +memory used by the initramfs can be released. + +At shutdown time, +.I mdmon +should not be killed along with other processes. Also as it holds a +file (socket actually) open in +.B /dev +(by default) it will not be possible to unmount +.B /dev +if it is a separate filesystem. + +.SH EXAMPLES + +.B " mdmon \-\-all-active-arrays \-\-takeover" +.br +Any +.I mdmon +which is currently running is killed and a new instance is started. +This should be run during the boot sequence if an initramfs was +used, so that any mdmon running from the initramfs will not hold +the initramfs active. +.SH SEE ALSO +.IR mdadm (8), +.IR md (4). @@ -0,0 +1,594 @@ +/* + * mdmon - monitor external metadata arrays + * + * Copyright (C) 2007-2009 Neil Brown <neilb@suse.de> + * Copyright (C) 2007-2009 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +/* + * md array manager. + * When md arrays have user-space managed metadata, this is the program + * that does the managing. + * + * Given one argument: the name of the array (e.g.
/dev/md0) that is + * the container. + * We fork off a helper that runs high priority and mlocked. It responds to + * device failures and other events that might stop writeout, or that are + * trivial to deal with. + * The main thread then watches for new arrays being created in the container + * and starts monitoring them too ... along with a few other tasks. + * + * The main thread communicates with the priority thread by writing over + * a pipe. + * Separate programs can communicate with the main thread via Unix-domain + * socket. + * The two threads share address space and open file table. + * + */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include <unistd.h> +#include <stdlib.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/socket.h> +#include <sys/un.h> +#include <sys/mman.h> +#include <sys/syscall.h> +#include <sys/wait.h> +#include <stdio.h> +#include <errno.h> +#include <string.h> +#include <fcntl.h> +#include <signal.h> +#include <dirent.h> +#ifdef USE_PTHREADS +#include <pthread.h> +#else +#include <sched.h> +#endif + +#include "mdadm.h" +#include "mdmon.h" + +char const Name[] = "mdmon"; + +struct active_array *discard_this; +struct active_array *pending_discard; + +int mon_tid, mgr_tid; + +int sigterm; + +#ifdef USE_PTHREADS +static void *run_child(void *v) +{ + struct supertype *c = v; + + mon_tid = syscall(SYS_gettid); + do_monitor(c); + return 0; +} + +static int clone_monitor(struct supertype *container) +{ + pthread_attr_t attr; + pthread_t thread; + int rc; + + mon_tid = -1; + pthread_attr_init(&attr); + pthread_attr_setstacksize(&attr, 4096); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); + rc = pthread_create(&thread, &attr, run_child, container); + if (rc) + return rc; + while (mon_tid == -1) + usleep(10); + pthread_attr_destroy(&attr); + + mgr_tid = syscall(SYS_gettid); + + return mon_tid; +} +#else /* USE_PTHREADS */ +static int run_child(void *v) +{ + struct supertype *c = v; + + do_monitor(c); + 
return 0; +} + +#ifdef __ia64__ +int __clone2(int (*fn)(void *), + void *child_stack_base, size_t stack_size, + int flags, void *arg, ... + /* pid_t *pid, struct user_desc *tls, pid_t *ctid */ ); +#endif +static int clone_monitor(struct supertype *container) +{ + static char stack[4096]; + +#ifdef __ia64__ + mon_tid = __clone2(run_child, stack, sizeof(stack), + CLONE_FS|CLONE_FILES|CLONE_VM|CLONE_SIGHAND|CLONE_THREAD, + container); +#else + mon_tid = clone(run_child, stack+4096-64, + CLONE_FS|CLONE_FILES|CLONE_VM|CLONE_SIGHAND|CLONE_THREAD, + container); +#endif + + mgr_tid = syscall(SYS_gettid); + + return mon_tid; +} +#endif /* USE_PTHREADS */ + +static int make_pidfile(char *devname) +{ + char path[100]; + char pid[10]; + int fd; + int n; + + if (mkdir(MDMON_DIR, 0755) < 0 && + errno != EEXIST) + return -errno; + sprintf(path, "%s/%s.pid", MDMON_DIR, devname); + + fd = open(path, O_RDWR|O_CREAT|O_EXCL, 0600); + if (fd < 0) + return -errno; + sprintf(pid, "%d\n", getpid()); + n = write(fd, pid, strlen(pid)); + close(fd); + if (n < 0) + return -errno; + return 0; +} + +static void try_kill_monitor(pid_t pid, char *devname, int sock) +{ + char buf[100]; + int fd; + int n; + long fl; + int rv; + + /* first rule of survival... don't off yourself */ + if (pid == getpid()) + return; + + /* kill this process if it is mdmon */ + sprintf(buf, "/proc/%lu/cmdline", (unsigned long) pid); + fd = open(buf, O_RDONLY); + if (fd < 0) + return; + + n = read(fd, buf, sizeof(buf)-1); + buf[sizeof(buf)-1] = 0; + close(fd); + + if (n < 0 || !(strstr(buf, "mdmon") || + strstr(buf, "@dmon"))) + return; + + kill(pid, SIGTERM); + + if (sock < 0) + return; + + /* Wait for monitor to exit by reading from the socket, after + * clearing the non-blocking flag */ + fl = fcntl(sock, F_GETFL, 0); + fl &= ~O_NONBLOCK; + fcntl(sock, F_SETFL, fl); + n = read(sock, buf, 100); + + /* If there is I/O going on it might took some time to get to + * clean state. 
Wait for monitor to exit fully to avoid races. + * Ping it with SIGUSR1 in case that it is sleeping */ + for (n = 0; n < 25; n++) { + rv = kill(pid, SIGUSR1); + if (rv < 0) + break; + usleep(200000); + } +} + +void remove_pidfile(char *devname) +{ + char buf[100]; + + sprintf(buf, "%s/%s.pid", MDMON_DIR, devname); + unlink(buf); + sprintf(buf, "%s/%s.sock", MDMON_DIR, devname); + unlink(buf); +} + +static int make_control_sock(char *devname) +{ + char path[100]; + int sfd; + long fl; + struct sockaddr_un addr; + + if (sigterm) + return -1; + + sprintf(path, "%s/%s.sock", MDMON_DIR, devname); + unlink(path); + sfd = socket(PF_LOCAL, SOCK_STREAM, 0); + if (sfd < 0) + return -1; + + addr.sun_family = PF_LOCAL; + strcpy(addr.sun_path, path); + umask(077); /* ensure no world write access */ + if (bind(sfd, (struct sockaddr*)&addr, sizeof(addr)) < 0) { + close(sfd); + return -1; + } + listen(sfd, 10); + fl = fcntl(sfd, F_GETFL, 0); + fl |= O_NONBLOCK; + fcntl(sfd, F_SETFL, fl); + return sfd; +} + +static void term(int sig) +{ + sigterm = 1; +} + +static void wake_me(int sig) +{ + +} + +/* if we are debugging and starting mdmon by hand then don't fork */ +static int do_fork(void) +{ + #ifdef DEBUG + if (check_env("MDADM_NO_MDMON")) + return 0; + #endif + + return 1; +} + +void usage(void) +{ + fprintf(stderr, +"Usage: mdmon [options] CONTAINER\n" +"\n" +"Options are:\n" +" --help -h : This message\n" +" --all -a : All devices\n" +" --foreground -F : Run in foreground (do not fork)\n" +" --takeover -t : Takeover container\n" +); + exit(2); +} + +static int mdmon(char *devnm, int must_fork, int takeover); + +int main(int argc, char *argv[]) +{ + char *container_name = NULL; + char *devnm = NULL; + int status = 0; + int opt; + int all = 0; + int takeover = 0; + int dofork = 1; + static struct option options[] = { + {"all", 0, NULL, 'a'}, + {"takeover", 0, NULL, 't'}, + {"help", 0, NULL, 'h'}, + {"offroot", 0, NULL, OffRootOpt}, + {"foreground", 0, NULL, 'F'}, + {NULL, 0, 
NULL, 0} + }; + + if (in_initrd()) { + /* + * set first char of argv[0] to @. This is used by + * systemd to signal that the task was launched from + * initrd/initramfs and should be preserved during shutdown + */ + argv[0][0] = '@'; + } + + while ((opt = getopt_long(argc, argv, "thaF", options, NULL)) != -1) { + switch (opt) { + case 'a': + container_name = argv[optind-1]; + all = 1; + break; + case 't': + takeover = 1; + break; + case 'F': + dofork = 0; + break; + case OffRootOpt: + argv[0][0] = '@'; + break; + case 'h': + default: + usage(); + break; + } + } + + if (all == 0 && container_name == NULL) { + if (argv[optind]) + container_name = argv[optind]; + } + + if (container_name == NULL) + usage(); + + if (argc - optind > 1) + usage(); + + if (strcmp(container_name, "/proc/mdstat") == 0) + all = 1; + + if (all) { + struct mdstat_ent *mdstat, *e; + int container_len = strlen(container_name); + + /* launch an mdmon instance for each container found */ + mdstat = mdstat_read(0, 0); + for (e = mdstat; e; e = e->next) { + if (e->metadata_version && + strncmp(e->metadata_version, "external:", 9) == 0 && + !is_subarray(&e->metadata_version[9])) { + /* update cmdline so this mdmon instance can be + * distinguished from others in a call to ps(1) + */ + if (strlen(e->devnm) <= (unsigned)container_len) { + memset(container_name, 0, container_len); + sprintf(container_name, "%s", e->devnm); + } + status |= mdmon(e->devnm, 1, takeover); + } + } + free_mdstat(mdstat); + + return status; + } else if (strncmp(container_name, "md", 2) == 0) { + int id = devnm2devid(container_name); + if (id) + devnm = container_name; + } else { + struct stat st; + + if (stat(container_name, &st) == 0) + devnm = xstrdup(stat2devnm(&st)); + } + + if (!devnm) { + pr_err("%s is not a valid md device name\n", + container_name); + exit(1); + } + return mdmon(devnm, dofork && do_fork(), takeover); +} + +static int mdmon(char *devnm, int must_fork, int takeover) +{ + int mdfd; + struct mdinfo *mdi, 
*di; + struct supertype *container; + sigset_t set; + struct sigaction act; + int pfd[2]; + int status; + int ignore; + pid_t victim = -1; + int victim_sock = -1; + + dprintf("starting mdmon for %s\n", devnm); + + mdfd = open_dev(devnm); + if (mdfd < 0) { + pr_err("%s: %s\n", devnm, strerror(errno)); + return 1; + } + + /* Fork, and have the child tell us when they are ready */ + if (must_fork) { + if (pipe(pfd) != 0) { + pr_err("failed to create pipe\n"); + return 1; + } + switch(fork()) { + case -1: + pr_err("failed to fork: %s\n", strerror(errno)); + return 1; + case 0: /* child */ + close(pfd[0]); + break; + default: /* parent */ + close(pfd[1]); + if (read(pfd[0], &status, sizeof(status)) != sizeof(status)) { + wait(&status); + status = WEXITSTATUS(status); + } + close(pfd[0]); + return status; + } + } else + pfd[0] = pfd[1] = -1; + + container = xcalloc(1, sizeof(*container)); + strcpy(container->devnm, devnm); + container->arrays = NULL; + container->sock = -1; + + mdi = sysfs_read(mdfd, container->devnm, GET_VERSION|GET_LEVEL|GET_DEVS); + + if (!mdi) { + pr_err("failed to load sysfs info for %s\n", container->devnm); + exit(3); + } + if (mdi->array.level != UnSet) { + pr_err("%s is not a container - cannot monitor\n", devnm); + exit(3); + } + if (mdi->array.major_version != -1 || + mdi->array.minor_version != -2) { + pr_err("%s does not use external metadata - cannot monitor\n", + devnm); + exit(3); + } + + container->ss = version_to_superswitch(mdi->text_version); + if (container->ss == NULL) { + pr_err("%s uses unsupported metadata: %s\n", + devnm, mdi->text_version); + exit(3); + } + + container->devs = NULL; + for (di = mdi->devs; di; di = di->next) { + struct mdinfo *cd = xmalloc(sizeof(*cd)); + *cd = *di; + cd->next = container->devs; + container->devs = cd; + } + sysfs_free(mdi); + + /* SIGUSR is sent between parent and child. So both block it + * and enable it only with pselect. 
+ */ + sigemptyset(&set); + sigaddset(&set, SIGUSR1); + sigaddset(&set, SIGTERM); + sigprocmask(SIG_BLOCK, &set, NULL); + act.sa_handler = wake_me; + act.sa_flags = 0; + sigaction(SIGUSR1, &act, NULL); + act.sa_handler = term; + sigaction(SIGTERM, &act, NULL); + act.sa_handler = SIG_IGN; + sigaction(SIGPIPE, &act, NULL); + + victim = mdmon_pid(container->devnm); + if (victim >= 0) + victim_sock = connect_monitor(container->devnm); + + ignore = chdir("/"); + if (!takeover && victim > 0 && victim_sock >= 0) { + if (fping_monitor(victim_sock) == 0) { + pr_err("%s already managed\n", container->devnm); + exit(3); + } + close(victim_sock); + victim_sock = -1; + } + if (container->ss->load_container(container, mdfd, devnm)) { + pr_err("Cannot load metadata for %s\n", devnm); + exit(3); + } + close(mdfd); + + /* Ok, this is close enough. We can say goodbye to our parent now. + */ + if (victim > 0) + remove_pidfile(devnm); + if (make_pidfile(devnm) < 0) { + exit(3); + } + container->sock = make_control_sock(devnm); + + status = 0; + if (pfd[1] >= 0) { + if (write(pfd[1], &status, sizeof(status)) < 0) + pr_err("failed to notify our parent: %d\n", + getppid()); + close(pfd[1]); + } + + mlockall(MCL_CURRENT | MCL_FUTURE); + + if (clone_monitor(container) < 0) { + pr_err("failed to start monitor process: %s\n", + strerror(errno)); + exit(2); + } + + if (victim > 0) { + try_kill_monitor(victim, container->devnm, victim_sock); + if (victim_sock >= 0) + close(victim_sock); + } + + setsid(); + manage_fork_fds(0); + + /* This silliness is to stop the compiler complaining + * that we ignore 'ignore' + */ + if (ignore) + ignore++; + + do_manager(container); + + exit(0); +} + +/* Some stub functions so super-* can link with us */ +int child_monitor(int afd, struct mdinfo *sra, struct reshape *reshape, + struct supertype *st, unsigned long blocks, + int *fds, unsigned long long *offsets, + int dests, int *destfd, unsigned long long *destoffsets) +{ + return 0; +} + +int 
restore_stripes(int *dest, unsigned long long *offsets, + int raid_disks, int chunk_size, int level, int layout, + int source, unsigned long long read_offset, + unsigned long long start, unsigned long long length, + char *src_buf) +{ + return 1; +} + +int save_stripes(int *source, unsigned long long *offsets, + int raid_disks, int chunk_size, int level, int layout, + int nwrites, int *dest, + unsigned long long start, unsigned long long length, + char *buf) +{ + return 0; +} + +struct superswitch super0 = { + .name = "0.90", +}; +struct superswitch super1 = { + .name = "1.x", +}; @@ -0,0 +1,111 @@ +/* + * mdmon - monitor external metadata arrays + * + * Copyright (C) 2007-2009 Neil Brown <neilb@suse.de> + * Copyright (C) 2007-2009 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 
+ */ + +extern const char Name[]; + +enum array_state { clear, inactive, suspended, readonly, read_auto, + clean, active, write_pending, active_idle, broken, bad_word}; + +enum sync_action { idle, reshape, resync, recover, check, repair, bad_action }; + +struct active_array { + struct mdinfo info; + struct supertype *container; + struct active_array *next, *replaces; + int to_remove; + + int action_fd; + int resync_start_fd; + int metadata_fd; /* for monitoring rw/ro status */ + int sync_completed_fd; /* for checkpoint notification events */ + int safe_mode_delay_fd; + unsigned long long last_checkpoint; /* sync_completed fires for many + * reasons this field makes sure the + * kernel has made progress before + * moving the checkpoint. It is + * cleared by the metadata handler + * when it determines recovery is + * terminated. + */ + + enum array_state prev_state, curr_state, next_state; + enum sync_action prev_action, curr_action, next_action; + + int check_degraded; /* flag set by mon, read by manage */ + int check_reshape; /* flag set by mon, read by manage */ +}; + +/* + * Metadata updates are handled by the monitor thread, + * as it has exclusive access to the metadata. + * When the manager want to updates metadata, either + * for it's own reason (e.g. committing a spare) or + * on behalf of mdadm, it creates a metadata_update + * structure and queues it to the monitor. + * Updates are created and processed by code under the + * superswitch. All common code sees them as opaque + * blobs. 
+ */ +extern struct metadata_update *update_queue, *update_queue_handled; + +#define MD_MAJOR 9 + +extern struct active_array *container; +extern struct active_array *discard_this; +extern struct active_array *pending_discard; +extern struct md_generic_cmd *active_cmd; + +void remove_pidfile(char *devname); +void do_monitor(struct supertype *container); +void do_manager(struct supertype *container); +extern int sigterm; + +int read_dev_state(int fd); +int is_container_member(struct mdstat_ent *mdstat, char *container); + +struct mdstat_ent *mdstat_read(int hold, int start); + +extern int exit_now, manager_ready; +extern int mon_tid, mgr_tid; +extern int monitor_loop_cnt; + +/* helper routine to determine resync completion since MaxSector is a + * moving target + */ +static inline int is_resync_complete(struct mdinfo *array) +{ + unsigned long long sync_size = 0; + int ncopies, l; + switch(array->array.level) { + case 1: + case 4: + case 5: + case 6: + sync_size = array->component_size; + break; + case 10: + l = array->array.layout; + ncopies = (l & 0xff) * ((l >> 8) & 0xff); + sync_size = array->component_size * array->array.raid_disks; + sync_size /= ncopies; + break; + } + return array->resync_start >= sync_size; +} diff --git a/mdopen.c b/mdopen.c new file mode 100644 index 0000000..245be53 --- /dev/null +++ b/mdopen.c @@ -0,0 +1,509 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2001-2013 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +#include "mdadm.h" +#include "md_p.h" +#include <ctype.h> + +void make_parts(char *dev, int cnt) +{ + /* make 'cnt' partition devices for 'dev' + * If dev is a device name we use the + * major/minor from dev and add 1..cnt + * If it is a symlink, we make similar symlinks. + * If dev ends with a digit, we add "p%d" else "%d" + * If the name exists, we use it's owner/mode, + * else that of dev + */ + struct stat stb; + int major_num; + int minor_num; + int odig; + int i; + int nlen = strlen(dev) + 20; + char *name; + int dig = isdigit(dev[strlen(dev)-1]); + char orig[1001]; + char sym[1024]; + int err; + + if (cnt == 0) + cnt = 4; + if (lstat(dev, &stb)!= 0) + return; + + if (S_ISBLK(stb.st_mode)) { + major_num = major(stb.st_rdev); + minor_num = minor(stb.st_rdev); + odig = -1; + } else if (S_ISLNK(stb.st_mode)) { + int len; + + len = readlink(dev, orig, sizeof(orig)); + if (len < 0 || len >= (int)sizeof(orig)) + return; + orig[len] = 0; + odig = isdigit(orig[len-1]); + major_num = -1; + minor_num = -1; + } else + return; + name = xmalloc(nlen); + for (i = 1; i <= cnt ; i++) { + struct stat stb2; + snprintf(name, nlen, "%s%s%d", dev, dig?"p":"", i); + if (stat(name, &stb2) == 0) { + if (!S_ISBLK(stb2.st_mode) || !S_ISBLK(stb.st_mode)) + continue; + if (stb2.st_rdev == makedev(major_num, minor_num+i)) + continue; + unlink(name); + } else { + stb2 = stb; + } + if (S_ISBLK(stb.st_mode)) { + if (mknod(name, S_IFBLK | 0600, + makedev(major_num, minor_num+i))) + perror("mknod"); + if (chown(name, stb2.st_uid, stb2.st_gid)) + perror("chown"); + if (chmod(name, stb2.st_mode & 07777)) + perror("chmod"); + err = 0; + } else { + snprintf(sym, sizeof(sym), "%s%s%d", orig, odig?"p":"", i); + err = 
symlink(sym, name); + } + + if (err == 0 && stat(name, &stb2) == 0) + add_dev(name, &stb2, 0, NULL); + } + free(name); +} + +int create_named_array(char *devnm) +{ + int fd; + int n = -1; + static const char new_array_file[] = { + "/sys/module/md_mod/parameters/new_array" + }; + + fd = open(new_array_file, O_WRONLY); + if (fd < 0 && errno == ENOENT) { + if (system("modprobe md_mod") == 0) + fd = open(new_array_file, O_WRONLY); + } + if (fd >= 0) { + n = write(fd, devnm, strlen(devnm)); + close(fd); + } + if (fd < 0 || n != (int)strlen(devnm)) { + pr_err("Fail to create %s when using %s, fallback to creation via node\n", + devnm, new_array_file); + return 0; + } + + return 1; +} + +/* + * We need a new md device to assemble/build/create an array. + * 'dev' is a name given us by the user (command line or mdadm.conf) + * It might start with /dev or /dev/md any might end with a digit + * string. + * If it starts with just /dev, it must be /dev/mdX or /dev/md_dX + * If it ends with a digit string, then it must be as above, or + * 'trustworthy' must be 'METADATA' and the 'dev' must be + * /dev/md/'name'NN or 'name'NN + * If it doesn't end with a digit string, it must be /dev/md/'name' + * or 'name' or must be NULL. + * If the digit string is present, it gives the minor number to use + * If not, we choose a high, unused minor number. + * If the 'dev' is a standard name, it devices whether 'md' or 'mdp'. + * else if the name is 'd[0-9]+' then we use mdp + * else if trustworthy is 'METADATA' we use md + * else the choice depends on 'autof'. + * If name is NULL it is assumed to match whatever dev provides. + * If both name and dev are NULL, we choose a name 'mdXX' or 'mdpXX' + * + * If 'name' is given, and 'trustworthy' is 'foreign' and name is not + * supported by 'dev', we add a "_%d" suffix based on the minor number + * use that. + * + * If udev is configured, we create a temporary device, open it, and + * unlink it. 
+ * If not, we create the /dev/mdXX device, and if name is usable, + * /dev/md/name + * In any case we return /dev/md/name or (if that isn't available) + * /dev/mdXX in 'chosen'. + * + * When we create devices, we use uid/gid/umask from config file. + */ + +int create_mddev(char *dev, char *name, int autof, int trustworthy, + char *chosen, int block_udev) +{ + int mdfd; + struct stat stb; + int num = -1; + int use_mdp = -1; + struct createinfo *ci = conf_get_create_info(); + int parts; + char *cname; + char devname[37]; + char devnm[32]; + char cbuf[400]; + + if (!use_udev()) + block_udev = 0; + + if (chosen == NULL) + chosen = cbuf; + + if (autof == 0) + autof = ci->autof; + + parts = autof >> 3; + autof &= 7; + + strcpy(chosen, "/dev/md/"); + cname = chosen + strlen(chosen); + + if (dev) { + if (strncmp(dev, "/dev/md/", 8) == 0) { + strcpy(cname, dev+8); + } else if (strncmp(dev, "/dev/", 5) == 0) { + char *e = dev + strlen(dev); + while (e > dev && isdigit(e[-1])) + e--; + if (e[0]) + num = strtoul(e, NULL, 10); + strcpy(cname, dev+5); + cname[e-(dev+5)] = 0; + /* name *must* be mdXX or md_dXX in this context */ + if (num < 0 || + (strcmp(cname, "md") != 0 && strcmp(cname, "md_d") != 0)) { + pr_err("%s is an invalid name for an md device. Try /dev/md/%s\n", + dev, dev+5); + return -1; + } + if (strcmp(cname, "md") == 0) + use_mdp = 0; + else + use_mdp = 1; + /* recreate name: /dev/md/0 or /dev/md/d0 */ + sprintf(cname, "%s%d", use_mdp?"d":"", num); + } else + strcpy(cname, dev); + + /* 'cname' must not contain a slash, and may not be + * empty. + */ + if (strchr(cname, '/') != NULL) { + pr_err("%s is an invalid name for an md device.\n", dev); + return -1; + } + if (cname[0] == 0) { + pr_err("%s is an invalid name for an md device (empty!).\n", dev); + return -1; + } + if (num < 0) { + /* If cname is 'N' or 'dN', we get dev number + * from there. 
+ */ + char *sp = cname; + char *ep; + if (cname[0] == 'd') + sp++; + if (isdigit(sp[0])) + num = strtoul(sp, &ep, 10); + else + ep = sp; + if (ep == sp || *ep || num < 0) + num = -1; + else if (cname[0] == 'd') + use_mdp = 1; + else + use_mdp = 0; + } + } + + /* Now determine device number */ + /* named 'METADATA' cannot use 'mdp'. */ + if (name && name[0] == 0) + name = NULL; + if (name && trustworthy == METADATA && use_mdp == 1) { + pr_err("%s is not allowed for a %s container. Consider /dev/md%d.\n", dev, name, num); + return -1; + } + if (name && trustworthy == METADATA) + use_mdp = 0; + if (use_mdp == -1) { + if (autof == 4 || autof == 6) + use_mdp = 1; + else + use_mdp = 0; + } + if (num < 0 && trustworthy == LOCAL && name) { + /* if name is numeric, possibly prefixed by + * 'md' or '/dev/md', use that for num + * if it is not already in use */ + char *ep; + char *n2 = name; + if (strncmp(n2, "/dev/", 5) == 0) + n2 += 5; + if (strncmp(n2, "md", 2) == 0) + n2 += 2; + if (*n2 == '/') + n2++; + num = strtoul(n2, &ep, 10); + if (ep == n2 || *ep) + num = -1; + else { + sprintf(devnm, "md%s%d", use_mdp ? "_d":"", num); + if (mddev_busy(devnm)) + num = -1; + } + } + + if (cname[0] == 0 && name) { + /* Need to find a name if we can + * We don't completely trust 'name'. 
Truncate to + * reasonable length and remove '/' + */ + char *cp; + struct map_ent *map = NULL; + int conflict = 1; + int unum = 0; + int cnlen; + strncpy(cname, name, 200); + cname[200] = 0; + for (cp = cname; *cp ; cp++) + switch (*cp) { + case '/': + *cp = '-'; + break; + case ' ': + case '\t': + *cp = '_'; + break; + } + + if (trustworthy == LOCAL || + (trustworthy == FOREIGN && strchr(cname, ':') != NULL)) { + /* Only need suffix if there is a conflict */ + if (map_by_name(&map, cname) == NULL) + conflict = 0; + } + cnlen = strlen(cname); + while (conflict) { + if (trustworthy == METADATA && !isdigit(cname[cnlen-1])) + sprintf(cname+cnlen, "%d", unum); + else + /* add _%d to FOREIGN array that don't + * a 'host:' prefix + */ + sprintf(cname+cnlen, "_%d", unum); + unum++; + if (map_by_name(&map, cname) == NULL) + conflict = 0; + } + } + + devnm[0] = 0; + if (num < 0 && cname && ci->names) { + sprintf(devnm, "md_%s", cname); + if (block_udev) + udev_block(devnm); + if (!create_named_array(devnm)) { + devnm[0] = 0; + udev_unblock(); + } + } + if (num >= 0) { + sprintf(devnm, "md%d", num); + if (block_udev) + udev_block(devnm); + if (!create_named_array(devnm)) { + devnm[0] = 0; + udev_unblock(); + } + } + if (devnm[0] == 0) { + if (num < 0) { + /* need to choose a free number. */ + char *_devnm = find_free_devnm(use_mdp); + if (_devnm == NULL) { + pr_err("No avail md devices - aborting\n"); + return -1; + } + strcpy(devnm, _devnm); + } else { + sprintf(devnm, "%s%d", use_mdp?"md_d":"md", num); + if (mddev_busy(devnm)) { + pr_err("%s is already in use.\n", + dev); + return -1; + } + } + if (block_udev) + udev_block(devnm); + } + + sprintf(devname, "/dev/%s", devnm); + + if (dev && dev[0] == '/') + strcpy(chosen, dev); + else if (cname[0] == 0) + strcpy(chosen, devname); + + /* We have a device number and name. + * If we cannot detect udev, we need to make + * devices and links ourselves. 
+ */ + if (!use_udev()) { + /* Make sure 'devname' exists and 'chosen' is a symlink to it */ + if (lstat(devname, &stb) == 0) { + /* Must be the correct device, else error */ + if ((stb.st_mode&S_IFMT) != S_IFBLK || + stb.st_rdev != devnm2devid(devnm)) { + pr_err("%s exists but looks wrong, please fix\n", + devname); + return -1; + } + } else { + if (mknod(devname, S_IFBLK|0600, + devnm2devid(devnm)) != 0) { + pr_err("failed to create %s\n", + devname); + return -1; + } + if (chown(devname, ci->uid, ci->gid)) + perror("chown"); + if (chmod(devname, ci->mode)) + perror("chmod"); + stat(devname, &stb); + add_dev(devname, &stb, 0, NULL); + } + if (use_mdp == 1) + make_parts(devname, parts); + + if (strcmp(chosen, devname) != 0) { + if (mkdir("/dev/md",0700) == 0) { + if (chown("/dev/md", ci->uid, ci->gid)) + perror("chown /dev/md"); + if (chmod("/dev/md", ci->mode| ((ci->mode>>2) & 0111))) + perror("chmod /dev/md"); + } + + if (dev && strcmp(chosen, dev) == 0) + /* We know we are allowed to use this name */ + unlink(chosen); + + if (lstat(chosen, &stb) == 0) { + char buf[300]; + ssize_t link_len = readlink(chosen, buf, sizeof(buf)-1); + if (link_len >= 0) + buf[link_len] = '\0'; + + if ((stb.st_mode & S_IFMT) != S_IFLNK || + link_len < 0 || + strcmp(buf, devname) != 0) { + pr_err("%s exists - ignoring\n", + chosen); + strcpy(chosen, devname); + } + } else if (symlink(devname, chosen) != 0) + pr_err("failed to create %s: %s\n", + chosen, strerror(errno)); + if (use_mdp && strcmp(chosen, devname) != 0) + make_parts(chosen, parts); + } + } + mdfd = open_dev_excl(devnm); + if (mdfd < 0) + pr_err("unexpected failure opening %s\n", + devname); + return mdfd; +} + +/* Open this and check that it is an md device. + * On success, return filedescriptor. + * On failure, return -1 if it doesn't exist, + * or -2 if it exists but is not an md device. 
/*
 * find_free_devnm - pick an md device name that is not in use.
 *
 * Candidates are tried from 127 down to 0, then from 511 down to 129.
 * A candidate is accepted when it is not busy according to mddev_busy(),
 * conf_name_is_free() allows it and, when udev is not in use, it does not
 * already map to a non-standard name in /dev.
 *
 * use_partitions selects "md_dN" names instead of "mdN".
 *
 * Returns a pointer to a static buffer holding the name (overwritten by the
 * next call), or NULL when every candidate is taken.
 */
char *find_free_devnm(int use_partitions)
{
	static char devnm[32];
	int candidate = 127;

	while (candidate != 128) {
		int usable = 1;

		if (use_partitions)
			sprintf(devnm, "md_d%d", candidate);
		else
			sprintf(devnm, "md%d", candidate);

		if (mddev_busy(devnm) || !conf_name_is_free(devnm)) {
			usable = 0;
		} else if (!use_udev()) {
			/* make sure it is new to /dev too, at least as a
			 * non-standard */
			dev_t devid = devnm2devid(devnm);

			if (devid) {
				char *dn = map_dev(major(devid),
						   minor(devid), 0);
				if (dn && !is_standard(dn, NULL))
					usable = 0;
			}
		}

		if (usable)
			return devnm;

		/* step down; after 0 wrap round to the top of the range */
		candidate = candidate ? candidate - 1 : (1 << 9) - 1;
	}
	return NULL;
}
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +/* + * The /proc/mdstat file comes in at least 3 flavours: + * In an unpatched 2.2 kernel (md 0.36.6): + * Personalities : [n raidx] ... + * read_ahead {not set|%d sectors} + * md0 : {in}active{ raidX /dev/hda... %d blocks{ maxfault=%d}} + * md1 : ..... + * + * Normally only 4 md lines, but all are listed. + * + * In a patched 2.2 kernel (md 0.90.0) + * Personalities : [raidx] ... + * read_ahead {not set|%d sectors} + * mdN : {in}active {(readonly)} raidX dev[%d]{(F)} ... %d blocks STATUS RESYNC + * ... Only initialised arrays listed + * unused devices: {dev dev ... | <none>} + * + * STATUS is personality dependant: + * linear: %dk rounding + * raid0: %dk chunks + * raid1: [%d/%d] [U_U] ( raid/working. operational or not) + * raid5: level 4/5, %dk chunk, algorithm %d [%d/%d] [U_U] + * + * RESYNC is empty or: + * {resync|recovery}=%u%% finish=%u.%umin + * or + * resync=DELAYED + * + * In a 2.4 kernel (md 0.90.0/2.4) + * Personalities : [raidX] ... + * read_ahead {not set|%d sectors} + * mdN : {in}active {(read-only)} raidX dev[%d]{(F)} ... + * %d blocks STATUS + * RESYNC + * unused devices: {dev dev .. | <none>} + * + * STATUS matches 0.90.0/2.2 + * RESYNC includes [===>....], + * adds a space after {resync|recovery} and before and after '=' + * adds a decimal to the recovery percent. + * adds (%d/%d) resync amount and max_blocks, before finish. 
+ * adds speed=%dK/sec after finish + * + * + * + * Out of this we want to extract: + * list of devices, active or not + * pattern of failed drives (so need number of drives) + * percent resync complete + * + * As continuation is indicated by leading space, we use + * conf_line from config.c to read logical lines + * + */ + +#include "mdadm.h" +#include "dlink.h" +#include <sys/select.h> +#include <ctype.h> + +static void free_member_devnames(struct dev_member *m) +{ + while(m) { + struct dev_member *t = m; + + m = m->next; + free(t->name); + free(t); + } +} + +static int add_member_devname(struct dev_member **m, char *name) +{ + struct dev_member *new; + char *t; + + if ((t = strchr(name, '[')) == NULL) + /* not a device */ + return 0; + + new = xmalloc(sizeof(*new)); + new->name = strndup(name, t - name); + new->next = *m; + *m = new; + return 1; +} + +void free_mdstat(struct mdstat_ent *ms) +{ + while (ms) { + struct mdstat_ent *t; + free(ms->level); + free(ms->pattern); + free(ms->metadata_version); + free_member_devnames(ms->members); + t = ms; + ms = ms->next; + free(t); + } +} + +static int mdstat_fd = -1; +struct mdstat_ent *mdstat_read(int hold, int start) +{ + FILE *f; + struct mdstat_ent *all, *rv, **end, **insert_here; + char *line; + int fd; + + if (hold && mdstat_fd != -1) { + off_t offset = lseek(mdstat_fd, 0L, 0); + if (offset == (off_t)-1) { + return NULL; + } + fd = dup(mdstat_fd); + if (fd >= 0) + f = fdopen(fd, "r"); + else + return NULL; + } else + f = fopen("/proc/mdstat", "r"); + if (f == NULL) + return NULL; + else + fcntl(fileno(f), F_SETFD, FD_CLOEXEC); + + all = NULL; + end = &all; + for (; (line = conf_line(f)) ; free_line(line)) { + struct mdstat_ent *ent; + char *w; + char devnm[32]; + int in_devs = 0; + + if (strcmp(line, "Personalities") == 0) + continue; + if (strcmp(line, "read_ahead") == 0) + continue; + if (strcmp(line, "unused") == 0) + continue; + insert_here = NULL; + /* Better be an md line.. 
*/ + if (strncmp(line, "md", 2)!= 0 || strlen(line) >= 32 || + (line[2] != '_' && !isdigit(line[2]))) + continue; + strcpy(devnm, line); + + ent = xmalloc(sizeof(*ent)); + ent->level = ent->pattern= NULL; + ent->next = NULL; + ent->percent = RESYNC_NONE; + ent->active = -1; + ent->resync = 0; + ent->metadata_version = NULL; + ent->raid_disks = 0; + ent->devcnt = 0; + ent->members = NULL; + + strcpy(ent->devnm, devnm); + + for (w=dl_next(line); w!= line ; w=dl_next(w)) { + int l = strlen(w); + char *eq; + if (strcmp(w, "active") == 0) + ent->active = 1; + else if (strcmp(w, "inactive") == 0) { + ent->active = 0; + in_devs = 1; + } else if (strcmp(w, "bitmap:") == 0) { + /* We need to stop parsing here; + * otherwise, ent->raid_disks will be + * overwritten by the wrong value. + */ + break; + } else if (ent->active > 0 && + ent->level == NULL && + w[0] != '(' /*readonly*/) { + ent->level = xstrdup(w); + in_devs = 1; + } else if (in_devs && strcmp(w, "blocks") == 0) + in_devs = 0; + else if (in_devs) { + char *ep = strchr(w, '['); + ent->devcnt += + add_member_devname(&ent->members, w); + if (ep && strncmp(w, "md", 2) == 0) { + /* This has an md device as a component. + * If that device is already in the + * list, make sure we insert before + * there. 
+ */ + struct mdstat_ent **ih; + ih = &all; + while (ih != insert_here && *ih && + ((int)strlen((*ih)->devnm) != + ep-w || + strncmp((*ih)->devnm, w, + ep-w) != 0)) + ih = & (*ih)->next; + insert_here = ih; + } + } else if (strcmp(w, "super") == 0 && + dl_next(w) != line) { + w = dl_next(w); + ent->metadata_version = xstrdup(w); + } else if (w[0] == '[' && isdigit(w[1])) { + ent->raid_disks = atoi(w+1); + } else if (!ent->pattern && + w[0] == '[' && + (w[1] == 'U' || w[1] == '_')) { + ent->pattern = xstrdup(w+1); + if (ent->pattern[l-2] == ']') + ent->pattern[l-2] = '\0'; + } else if (ent->percent == RESYNC_NONE && + strncmp(w, "re", 2) == 0 && + w[l-1] == '%' && + (eq = strchr(w, '=')) != NULL ) { + ent->percent = atoi(eq+1); + if (strncmp(w,"resync", 6) == 0) + ent->resync = 1; + else if (strncmp(w, "reshape", 7) == 0) + ent->resync = 2; + else + ent->resync = 0; + } else if (ent->percent == RESYNC_NONE && + (w[0] == 'r' || w[0] == 'c')) { + if (strncmp(w, "resync", 6) == 0) + ent->resync = 1; + if (strncmp(w, "reshape", 7) == 0) + ent->resync = 2; + if (strncmp(w, "recovery", 8) == 0) + ent->resync = 0; + if (strncmp(w, "check", 5) == 0) + ent->resync = 3; + + if (l > 8 && strcmp(w+l-8, "=DELAYED") == 0) + ent->percent = RESYNC_DELAYED; + if (l > 8 && strcmp(w+l-8, "=PENDING") == 0) + ent->percent = RESYNC_PENDING; + if (l > 7 && strcmp(w+l-7, "=REMOTE") == 0) + ent->percent = RESYNC_REMOTE; + } else if (ent->percent == RESYNC_NONE && + w[0] >= '0' && + w[0] <= '9' && + w[l-1] == '%') { + ent->percent = atoi(w); + } + } + if (insert_here && (*insert_here)) { + ent->next = *insert_here; + *insert_here = ent; + } else { + *end = ent; + end = &ent->next; + } + } + if (hold && mdstat_fd == -1) { + mdstat_fd = dup(fileno(f)); + fcntl(mdstat_fd, F_SETFD, FD_CLOEXEC); + } + fclose(f); + + /* If we might want to start array, + * reverse the order, so that components comes before composites + */ + if (start) { + rv = NULL; + while (all) { + struct mdstat_ent *e = all; + 
all = all->next; + e->next = rv; + rv = e; + } + } else + rv = all; + return rv; +} + +void mdstat_close(void) +{ + if (mdstat_fd >= 0) + close(mdstat_fd); + mdstat_fd = -1; +} + +/* + * function: mdstat_wait + * Description: Function waits for event on mdstat. + * Parameters: + * seconds - timeout for waiting + * Returns: + * > 0 - detected event + * 0 - timeout + * < 0 - detected error + */ +int mdstat_wait(int seconds) +{ + fd_set fds; + struct timeval tm; + int maxfd = 0; + FD_ZERO(&fds); + if (mdstat_fd >= 0) { + FD_SET(mdstat_fd, &fds); + maxfd = mdstat_fd; + } else + return -1; + + tm.tv_sec = seconds; + tm.tv_usec = 0; + + return select(maxfd + 1, NULL, NULL, &fds, &tm); +} + +void mdstat_wait_fd(int fd, const sigset_t *sigmask) +{ + fd_set fds, rfds; + int maxfd = 0; + + FD_ZERO(&fds); + FD_ZERO(&rfds); + if (mdstat_fd >= 0) + FD_SET(mdstat_fd, &fds); + + if (fd >= 0) { + struct stat stb; + fstat(fd, &stb); + if ((stb.st_mode & S_IFMT) == S_IFREG) + /* Must be a /proc or /sys fd, so expect + * POLLPRI + * i.e. an 'exceptional' event. 
+ */ + FD_SET(fd, &fds); + else + FD_SET(fd, &rfds); + + if (fd > maxfd) + maxfd = fd; + + } + if (mdstat_fd > maxfd) + maxfd = mdstat_fd; + + pselect(maxfd + 1, &rfds, NULL, &fds, + NULL, sigmask); +} + +int mddev_busy(char *devnm) +{ + struct mdstat_ent *mdstat = mdstat_read(0, 0); + struct mdstat_ent *me; + + for (me = mdstat ; me ; me = me->next) + if (strcmp(me->devnm, devnm) == 0) + break; + free_mdstat(mdstat); + return me != NULL; +} + +struct mdstat_ent *mdstat_by_component(char *name) +{ + struct mdstat_ent *mdstat = mdstat_read(0, 0); + + while (mdstat) { + struct dev_member *m; + struct mdstat_ent *ent; + if (mdstat->metadata_version && + strncmp(mdstat->metadata_version, "external:", 9) == 0 && + is_subarray(mdstat->metadata_version+9)) + /* don't return subarrays, only containers */ + ; + else for (m = mdstat->members; m; m = m->next) { + if (strcmp(m->name, name) == 0) { + free_mdstat(mdstat->next); + mdstat->next = NULL; + return mdstat; + } + } + ent = mdstat; + mdstat = mdstat->next; + ent->next = NULL; + free_mdstat(ent); + } + return NULL; +} + +struct mdstat_ent *mdstat_by_subdev(char *subdev, char *container) +{ + struct mdstat_ent *mdstat = mdstat_read(0, 0); + struct mdstat_ent *ent = NULL; + + while (mdstat) { + /* metadata version must match: + * external:[/-]%s/%s + * where first %s is 'container' and second %s is 'subdev' + */ + if (ent) + free_mdstat(ent); + ent = mdstat; + mdstat = mdstat->next; + ent->next = NULL; + + if (ent->metadata_version == NULL || + strncmp(ent->metadata_version, "external:", 9) != 0) + continue; + + if (!metadata_container_matches(ent->metadata_version+9, + container) || + !metadata_subdev_matches(ent->metadata_version+9, + subdev)) + continue; + + free_mdstat(mdstat); + return ent; + } + return NULL; +} diff --git a/misc/mdcheck b/misc/mdcheck new file mode 100644 index 0000000..700c3e2 --- /dev/null +++ b/misc/mdcheck @@ -0,0 +1,166 @@ +#!/bin/bash + +# Copyright (C) 2014-2017 Neil Brown <neilb@suse.de> +# 
# convert a /dev/md name into /sys/.../md equivalent
#
# $1 is the device path.  The major and minor numbers are read with stat(1)
# (following symlinks) rather than by parsing "ls -l" output, and the
# matching /sys/dev/block entry is resolved with readlink -f.
# As before, 'maj' and 'min' are left set as globals.
sysname() {
	local rdev
	# %t and %T print the major and minor device numbers in hex
	rdev=$(stat -L -c '%t %T' -- "$1") || return 1
	set -- $rdev
	maj=$((16#$1))
	min=$((16#$2))
	readlink -f "/sys/dev/block/$maj:$min"
}
+tmp=/var/lib/mdcheck/.md-check-$$ +trap 'rm -f "$tmp"' 0 2 3 15 + + +# firstly, clean out really old state files +mkdir -p /var/lib/mdcheck +find /var/lib/mdcheck -name "MD_UUID*" -type f -mtime +180 -exec rm {} \; + +# Now look at each md device. +cnt=0 +for dev in /dev/md?* +do + [ -e "$dev" ] || continue + sys=`sysname $dev` + if [ ! -f "$sys/md/sync_action" ] + then # cannot check this array + continue + fi + if [ "`cat $sys/md/sync_action`" != 'idle' ] + then # This array is busy + continue + fi + + mdadm --detail --export "$dev" | grep '^MD_UUID=' > $tmp || continue + source $tmp + fl="/var/lib/mdcheck/MD_UUID_$MD_UUID" + if [ -z "$cont" ] + then + start=0 + logger -p daemon.info mdcheck start checking $dev + elif [ -z "$MD_UUID" -o ! -f "$fl" ] + then + # Nothing to continue here + continue + else + start=`cat "$fl"` + logger -p daemon.info mdcheck continue checking $dev from $start + fi + + cnt=$[cnt+1] + eval MD_${cnt}_fl=\$fl + eval MD_${cnt}_sys=\$sys + eval MD_${cnt}_dev=\$dev + echo $start > $fl + echo $start > $sys/md/sync_min + echo check > $sys/md/sync_action +done + +if [ -z "$endtime" ] +then + exit 0 +fi + +while [ `date +%s` -lt $endtime ] +do + any= + for i in `eval echo {1..$cnt}` + do + eval fl=\$MD_${i}_fl + eval sys=\$MD_${i}_sys + eval dev=\$MD_${i}_dev + + if [ -z "$fl" ]; then continue; fi + + if [ "`cat $sys/md/sync_action`" != 'check' ] + then + logger -p daemon.info mdcheck finished checking $dev + eval MD_${i}_fl= + rm -f $fl + continue; + fi + read a rest < $sys/md/sync_completed + echo $a > $fl + any=yes + done + if [ -z "$any" ]; then exit 0; fi + sleep 120 +done + +# We've waited, and there are still checks running. +# Time to stop them. 
+for i in `eval echo {1..$cnt}` +do + eval fl=\$MD_${i}_fl + eval sys=\$MD_${i}_sys + eval dev=\$MD_${i}_dev + + if [ -z "$fl" ]; then continue; fi + + if [ "`cat $sys/md/sync_action`" != 'check' ] + then + eval MD_${i}_fl= + rm -f $fl + continue; + fi + echo idle > $sys/md/sync_action + cat $sys/md/sync_min > $fl + logger -p daemon.info pause checking $dev at `cat $fl` +done diff --git a/misc/syslog-events b/misc/syslog-events new file mode 100644 index 0000000..fe8c14e --- /dev/null +++ b/misc/syslog-events @@ -0,0 +1,27 @@ +#!/bin/sh +# +# sample event handling script for mdadm +# e.g. mdadm --follow --program=/sbin/syslog-events --scan +# +# License: GPL ver.2 +# Copyright (C) 2004 SEKINE Tatsuo <tsekine@sdri.co.jp> + +event="$1" +dev="$2" +disc="$3" + +facility="kern" +tag="mdmonitor" + +case x"${event}" in + xFail*) priority="error" ;; + xTest*) priority="debug" ;; + x*) priority="info" ;; +esac + +msg="${event} event on ${dev}" +if [ x"${disc}" != x ]; then + msg="${msg}, related to disc ${disc}" +fi + +exec logger -t "${tag}" -p "${facility}.${priority}" -- "${msg}" diff --git a/mkinitramfs b/mkinitramfs new file mode 100644 index 0000000..c6275dd --- /dev/null +++ b/mkinitramfs @@ -0,0 +1,55 @@ +#!/bin/sh + +# make sure we are being run in the right directory... +if [ -f mkinitramfs ] +then : +else + echo >&2 mkinitramfs must be run from the mdadm source directory. + exit 1 +fi +if [ -f /bin/busybox ] +then : good, it exists + case `file /bin/busybox` in + *statically* ) : good ;; + * ) echo >&2 mkinitramfs: /bin/busybox is not statically linked: cannot proceed. + exit 1 + esac +else + echo >&2 "mkinitramfs: /bin/busybox doesn't exist - please install it statically linked." 
+ exit 1 +fi + +rm -rf initramfs +mkdir initramfs +mkdir initramfs/bin +make mdadm.static +cp mdadm.static initramfs/bin/mdadm +cp /bin/busybox initramfs/bin/busybox +ln initramfs/bin/busybox initramfs/bin/sh +cat <<- END > initramfs/init + #!/bin/sh + + echo 'Auto-assembling boot md array' + mkdir /proc + mount -t proc proc /proc + if [ -n "$rootuuid" ] + then arg=--uuid=$rootuuid + elif [ -n "$mdminor" ] + then arg=--super-minor=$mdminor + else arg=--super-minor=0 + fi + echo "Using $arg" + mdadm -Acpartitions $arg --auto=part /dev/mda + cd / + mount /dev/mda1 /root || mount /dev/mda /root + umount /proc + cd /root + exec chroot . /sbin/init < /dev/console > /dev/console 2>&1 +END +chmod +x initramfs/init + +(cd initramfs + find init bin | cpio -o -H newc | gzip --best +) > init.cpio.gz +rm -rf initramfs +ls -l init.cpio.gz diff --git a/monitor.c b/monitor.c new file mode 100644 index 0000000..e0d3be6 --- /dev/null +++ b/monitor.c @@ -0,0 +1,909 @@ +/* + * mdmon - monitor external metadata arrays + * + * Copyright (C) 2007-2009 Neil Brown <neilb@suse.de> + * Copyright (C) 2007-2009 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 
+ */ + +#include "mdadm.h" +#include "mdmon.h" +#include <sys/syscall.h> +#include <sys/select.h> +#include <signal.h> + +static char *array_states[] = { + "clear", "inactive", "suspended", "readonly", "read-auto", + "clean", "active", "write-pending", "active-idle", "broken", NULL }; +static char *sync_actions[] = { + "idle", "reshape", "resync", "recover", "check", "repair", NULL +}; + +enum bb_action { + RECORD_BB = 1, + COMPARE_BB, +}; + +static int write_attr(char *attr, int fd) +{ + return write(fd, attr, strlen(attr)); +} + +static void add_fd(fd_set *fds, int *maxfd, int fd) +{ + struct stat st; + if (fd < 0) + return; + if (fstat(fd, &st) == -1) { + dprintf("Invalid fd %d\n", fd); + return; + } + if (st.st_nlink == 0) { + dprintf("fd %d was deleted\n", fd); + return; + } + if (fd > *maxfd) + *maxfd = fd; + FD_SET(fd, fds); +} + +static int read_attr(char *buf, int len, int fd) +{ + int n; + + if (fd < 0) { + buf[0] = 0; + return 0; + } + lseek(fd, 0, 0); + n = read(fd, buf, len - 1); + + if (n <= 0) { + buf[0] = 0; + return 0; + } + buf[n] = 0; + if (buf[n-1] == '\n') + buf[n-1] = 0; + return n; +} + +static void read_resync_start(int fd, unsigned long long *v) +{ + char buf[30]; + int n; + + n = read_attr(buf, 30, fd); + if (n <= 0) { + dprintf("Failed to read resync_start (%d)\n", fd); + return; + } + if (strncmp(buf, "none", 4) == 0) + *v = MaxSector; + else + *v = strtoull(buf, NULL, 10); +} + +static unsigned long long read_sync_completed(int fd) +{ + unsigned long long val; + char buf[50]; + int n; + char *ep; + + n = read_attr(buf, 50, fd); + + if (n <= 0) + return 0; + buf[n] = 0; + val = strtoull(buf, &ep, 0); + if (ep == buf || (*ep != 0 && *ep != '\n' && *ep != ' ')) + return 0; + return val; +} + +static enum array_state read_state(int fd) +{ + char buf[20]; + int n = read_attr(buf, 20, fd); + + if (n <= 0) + return bad_word; + return (enum array_state) sysfs_match_word(buf, array_states); +} + +static enum sync_action read_action( int fd) +{ + 
char buf[20]; + int n = read_attr(buf, 20, fd); + + if (n <= 0) + return bad_action; + return (enum sync_action) sysfs_match_word(buf, sync_actions); +} + +int read_dev_state(int fd) +{ + char buf[100]; + int n = read_attr(buf, sizeof(buf), fd); + char *cp; + int rv = 0; + + if (n <= 0) + return 0; + + cp = buf; + while (cp) { + if (sysfs_attr_match(cp, "faulty")) + rv |= DS_FAULTY; + if (sysfs_attr_match(cp, "in_sync")) + rv |= DS_INSYNC; + if (sysfs_attr_match(cp, "write_mostly")) + rv |= DS_WRITE_MOSTLY; + if (sysfs_attr_match(cp, "spare")) + rv |= DS_SPARE; + if (sysfs_attr_match(cp, "blocked")) + rv |= DS_BLOCKED; + cp = strchr(cp, ','); + if (cp) + cp++; + } + return rv; +} + +int process_ubb(struct active_array *a, struct mdinfo *mdi, const unsigned long + long sector, const int length, const char *buf, + const int buf_len) +{ + struct superswitch *ss = a->container->ss; + + /* + * record bad block in metadata first, then acknowledge it to the driver + * via sysfs file + */ + if ((ss->record_bad_block(a, mdi->disk.raid_disk, sector, length)) && + (write(mdi->bb_fd, buf, buf_len) == buf_len)) + return 1; + + /* + * failed to store or acknowledge bad block, switch off bad block support + * to get it out of blocked state + */ + sysfs_set_str(&a->info, mdi, "state", "-external_bbl"); + return -1; +} + +int compare_bb(struct active_array *a, struct mdinfo *mdi, const unsigned long + long sector, const unsigned int length, void *arg) +{ + struct superswitch *ss = a->container->ss; + struct md_bb *bb = (struct md_bb *) arg; + int record = 1; + int i; + + for (i = 0; i < bb->count; i++) { + unsigned long long start = bb->entries[i].sector; + unsigned long long len = bb->entries[i].length; + + /* + * bad block in metadata exactly matches bad block in kernel + * list, just remove it from a list + */ + if ((start == sector) && (len == length)) { + if (i < bb->count - 1) + bb->entries[i] = bb->entries[bb->count - 1]; + bb->count -= 1; + record = 0; + break; + } + /* + * 
bad block in metadata spans bad block in kernel list, + * clear it and record new bad block + */ + if ((sector >= start) && (sector + length <= start + len)) { + ss->clear_bad_block(a, mdi->disk.raid_disk, start, len); + break; + } + } + + /* record all bad blocks not in metadata list */ + if (record && (ss->record_bad_block(a, mdi->disk.raid_disk, sector, + length) <= 0)) { + sysfs_set_str(&a->info, mdi, "state", "-external_bbl"); + return -1; + } + + return 1; +} + +static int read_bb_file(int fd, struct active_array *a, struct mdinfo *mdi, + enum bb_action action, void *arg) +{ + char buf[30]; + int n = 0; + int ret = 0; + int read_again = 0; + int off = 0; + int pos = 0; + int preserve_pos = (action == RECORD_BB ? 0 : 1); + + if (lseek(fd, 0, SEEK_SET) == (off_t) -1) + return -1; + + do { + read_again = 0; + n = read(fd, buf + pos, sizeof(buf) - 1 - pos); + if (n < 0) + return -1; + n += pos; + + buf[n] = '\0'; + off = 0; + + while (off < n) { + unsigned long long sector; + int length; + char newline; + int consumed; + int matched; + int rc; + + /* kernel sysfs file format: "sector length\n" */ + matched = sscanf(buf + off, "%llu %d%c%n", &sector, + &length, &newline, &consumed); + if ((matched != 3) && (off > 0)) { + /* truncated entry, read again */ + if (preserve_pos) { + pos = sizeof(buf) - off - 1; + memmove(buf, buf + off, pos); + } else { + if (lseek(fd, 0, SEEK_SET) == + (off_t) -1) + return -1; + } + read_again = 1; + break; + } + if (matched != 3) + return -1; + if (newline != '\n') + return -1; + if (length <= 0) + return -1; + + if (action == RECORD_BB) + rc = process_ubb(a, mdi, sector, length, + buf + off, consumed); + else if (action == COMPARE_BB) + rc = compare_bb(a, mdi, sector, length, arg); + else + rc = -1; + + if (rc < 0) + return rc; + ret += rc; + off += consumed; + } + } while (read_again); + + return ret; +} + +static int process_dev_ubb(struct active_array *a, struct mdinfo *mdi) +{ + return read_bb_file(mdi->ubb_fd, a, mdi, RECORD_BB, 
NULL); +} + +static int check_for_cleared_bb(struct active_array *a, struct mdinfo *mdi) +{ + struct superswitch *ss = a->container->ss; + struct md_bb *bb; + int i; + + /* + * Get a list of bad blocks for an array, then read list of + * acknowledged bad blocks from kernel and compare it against metadata + * list, clear all bad blocks remaining in metadata list + */ + bb = ss->get_bad_blocks(a, mdi->disk.raid_disk); + if (!bb) + return -1; + + if (read_bb_file(mdi->bb_fd, a, mdi, COMPARE_BB, bb) < 0) + return -1; + + for (i = 0; i < bb->count; i++) { + unsigned long long sector = bb->entries[i].sector; + int length = bb->entries[i].length; + + ss->clear_bad_block(a, mdi->disk.raid_disk, sector, length); + } + + return 0; +} + +static void signal_manager(void) +{ + /* tgkill(getpid(), mon_tid, SIGUSR1); */ + int pid = getpid(); + syscall(SYS_tgkill, pid, mgr_tid, SIGUSR1); +} + +/* Monitor a set of active md arrays - all of which share the + * same metadata - and respond to events that require + * metadata update. + * + * New arrays are detected by another thread which allocates + * required memory and attaches the data structure to our list. + * + * Events: + * Array stops. + * This is detected by array_state going to 'clear' or 'inactive'. + * while we thought it was active. + * Response is to mark metadata as clean and 'clear' the array(??) + * write-pending + * array_state if 'write-pending' + * We mark metadata as 'dirty' then set array to 'active'. + * active_idle + * Either ignore, or mark clean, then mark metadata as clean. + * + * device fails + * detected by rd-N/state reporting "faulty" + * mark device as 'failed' in metadata, let the kernel release the + * device by writing '-blocked' to rd/state, and finally write 'remove' to + * rd/state. Before a disk can be replaced it must be failed and removed + * from all container members, this will be preemptive for the other + * arrays... safe? 
+ * + * sync completes + * sync_action was 'resync' and becomes 'idle' and resync_start becomes + * MaxSector + * Notify metadata that sync is complete. + * + * recovery completes + * sync_action changes from 'recover' to 'idle' + * Check each device state and mark metadata if 'faulty' or 'in_sync'. + * + * deal with resync + * This only happens on finding a new array... mdadm will have set + * 'resync_start' to the correct value. If 'resync_start' indicates that a + * resync needs to occur set the array to the 'active' state rather than the + * initial read-auto state. + * + * + * + * We wait for a change (poll/select) on array_state, sync_action, and + * each rd-X/state file. + * When we get any change, we check everything. So read each state file, + * then decide what to do. + * + * The core action is to write new metadata to all devices in the array. + * This is done at most once on any wakeup. + * After that we might: + * - update the array_state + * - set the role of some devices. 
+ * - request a sync_action + * + */ + +#define ARRAY_DIRTY 1 +#define ARRAY_BUSY 2 +static int read_and_act(struct active_array *a, fd_set *fds) +{ + unsigned long long sync_completed; + int check_degraded = 0; + int check_reshape = 0; + int deactivate = 0; + struct mdinfo *mdi; + int ret = 0; + int count = 0; + struct timeval tv; + + a->next_state = bad_word; + a->next_action = bad_action; + + a->curr_state = read_state(a->info.state_fd); + a->curr_action = read_action(a->action_fd); + if (a->curr_state != clear) + /* + * In "clear" state, resync_start may wrongly be set to "0" + * when the kernel called md_clean but didn't remove the + * sysfs attributes yet + */ + read_resync_start(a->resync_start_fd, &a->info.resync_start); + sync_completed = read_sync_completed(a->sync_completed_fd); + for (mdi = a->info.devs; mdi ; mdi = mdi->next) { + mdi->next_state = 0; + mdi->curr_state = 0; + if (mdi->state_fd >= 0) { + read_resync_start(mdi->recovery_fd, + &mdi->recovery_start); + mdi->curr_state = read_dev_state(mdi->state_fd); + } + /* + * If array is blocked and metadata handler is able to handle + * BB, check if you can acknowledge them to md driver. If + * successful, clear faulty state and unblock the array. 
+ */ + if ((mdi->curr_state & DS_BLOCKED) && + a->container->ss->record_bad_block && + (process_dev_ubb(a, mdi) > 0)) { + mdi->next_state |= DS_UNBLOCK; + } + if (FD_ISSET(mdi->bb_fd, fds)) + check_for_cleared_bb(a, mdi); + } + + gettimeofday(&tv, NULL); + dprintf("(%d): %ld.%06ld state:%s prev:%s action:%s prev: %s start:%llu\n", + a->info.container_member, + tv.tv_sec, tv.tv_usec, + array_states[a->curr_state], + array_states[a->prev_state], + sync_actions[a->curr_action], + sync_actions[a->prev_action], + a->info.resync_start + ); + + if ((a->curr_state == bad_word || a->curr_state <= inactive) && + a->prev_state > inactive) { + /* array has been stopped */ + a->container->ss->set_array_state(a, 1); + a->next_state = clear; + deactivate = 1; + } + if (a->curr_state == write_pending) { + a->container->ss->set_array_state(a, 0); + a->next_state = active; + ret |= ARRAY_DIRTY; + } + if (a->curr_state == active_idle) { + /* Set array to 'clean' FIRST, then mark clean + * in the metadata + */ + a->next_state = clean; + ret |= ARRAY_DIRTY; + } + if ((a->curr_state == clean) || (a->curr_state == broken)) { + a->container->ss->set_array_state(a, 1); + } + if (a->curr_state == active || + a->curr_state == suspended) + ret |= ARRAY_DIRTY; + if (a->curr_state == readonly) { + /* Well, I'm ready to handle things. If readonly + * wasn't requested, transition to read-auto. + */ + char buf[64]; + read_attr(buf, sizeof(buf), a->metadata_fd); + if (strncmp(buf, "external:-", 10) == 0) { + /* explicit request for readonly array. Leave it alone */ + ; + } else { + if (a->container->ss->set_array_state(a, 2)) + a->next_state = read_auto; /* array is clean */ + else { + a->next_state = active; /* Now active for recovery etc */ + ret |= ARRAY_DIRTY; + } + } + } + + if (!deactivate && + a->curr_action == idle && + a->prev_action == resync) { + /* A resync has finished. The endpoint is recorded in + * 'sync_start'. 
We don't update the metadata + * until the array goes inactive or readonly though. + * Just check if we need to fiddle spares. + */ + a->container->ss->set_array_state(a, a->curr_state <= clean); + check_degraded = 1; + } + + if (!deactivate && + a->curr_action == idle && + a->prev_action == recover) { + /* A recovery has finished. Some disks may be in sync now, + * and the array may no longer be degraded + */ + for (mdi = a->info.devs ; mdi ; mdi = mdi->next) { + a->container->ss->set_disk(a, mdi->disk.raid_disk, + mdi->curr_state); + if (! (mdi->curr_state & DS_INSYNC)) + check_degraded = 1; + count++; + } + if (count != a->info.array.raid_disks) + check_degraded = 1; + } + + if (!deactivate && + a->curr_action == reshape && + a->prev_action != reshape) + /* reshape was requested by mdadm. Need to see if + * new devices have been added. Manager does that + * when it sees check_reshape + */ + check_reshape = 1; + + /* Check for failures and if found: + * 1/ Record the failure in the metadata and unblock the device. + * FIXME update the kernel to stop notifying on failed drives when + * the array is readonly and we have cleared 'blocked' + * 2/ Try to remove the device if the array is writable, or can be + * made writable. + */ + for (mdi = a->info.devs ; mdi ; mdi = mdi->next) { + if (mdi->curr_state & DS_FAULTY) { + a->container->ss->set_disk(a, mdi->disk.raid_disk, + mdi->curr_state); + check_degraded = 1; + if (mdi->curr_state & DS_BLOCKED) + mdi->next_state |= DS_UNBLOCK; + if (a->curr_state == read_auto) { + a->container->ss->set_array_state(a, 0); + a->next_state = active; + } + if (a->curr_state > readonly) + mdi->next_state |= DS_REMOVE; + } + } + + /* Check for recovery checkpoint notifications. We need to be a + * minimum distance away from the last checkpoint to prevent + * over checkpointing. Note reshape checkpointing is handled + * in the second branch. 
+ */ + if (sync_completed > a->last_checkpoint && + sync_completed - a->last_checkpoint > a->info.component_size >> 4 && + a->curr_action > reshape) { + /* A (non-reshape) sync_action has reached a checkpoint. + * Record the updated position in the metadata + */ + a->last_checkpoint = sync_completed; + a->container->ss->set_array_state(a, a->curr_state <= clean); + } else if ((a->curr_action == idle && a->prev_action == reshape) || + (a->curr_action == reshape && + sync_completed > a->last_checkpoint)) { + /* Reshape has progressed or completed so we need to + * update the array state - and possibly the array size + */ + if (sync_completed != 0) + a->last_checkpoint = sync_completed; + /* We might need to update last_checkpoint depending on + * the reason that reshape finished. + * if array reshape is really finished: + * set check point to the end, this allows + * set_array_state() to finalize reshape in metadata + * if reshape is broken: do not set checkpoint to the end + * this allows for reshape restart from checkpoint + */ + if ((a->curr_action != reshape) && + (a->prev_action == reshape)) { + char buf[40]; + if ((sysfs_get_str(&a->info, NULL, + "reshape_position", + buf, + sizeof(buf)) >= 0) && + strncmp(buf, "none", 4) == 0) + a->last_checkpoint = a->info.component_size; + } + a->container->ss->set_array_state(a, a->curr_state <= clean); + a->last_checkpoint = sync_completed; + } + + if (sync_completed > a->last_checkpoint) + a->last_checkpoint = sync_completed; + + if (sync_completed >= a->info.component_size) + a->last_checkpoint = 0; + + a->container->ss->sync_metadata(a->container); + dprintf("(%d): state:%s action:%s next(", a->info.container_member, + array_states[a->curr_state], sync_actions[a->curr_action]); + + /* Effect state changes in the array */ + if (a->next_state != bad_word) { + dprintf_cont(" state:%s", array_states[a->next_state]); + write_attr(array_states[a->next_state], a->info.state_fd); + } + if (a->next_action != bad_action) { + 
write_attr(sync_actions[a->next_action], a->action_fd); + dprintf_cont(" action:%s", sync_actions[a->next_action]); + } + for (mdi = a->info.devs; mdi ; mdi = mdi->next) { + if (mdi->next_state & DS_UNBLOCK) { + dprintf_cont(" %d:-blocked", mdi->disk.raid_disk); + write_attr("-blocked", mdi->state_fd); + } + + if ((mdi->next_state & DS_REMOVE) && mdi->state_fd >= 0) { + int remove_result; + + /* The kernel may not be able to immediately remove the + * disk. In that case we wait a little while and + * try again. + */ + remove_result = write_attr("remove", mdi->state_fd); + if (remove_result > 0) { + dprintf_cont(" %d:removed", mdi->disk.raid_disk); + close(mdi->state_fd); + close(mdi->recovery_fd); + close(mdi->bb_fd); + close(mdi->ubb_fd); + mdi->state_fd = -1; + } else + ret |= ARRAY_BUSY; + } + if (mdi->next_state & DS_INSYNC) { + write_attr("+in_sync", mdi->state_fd); + dprintf_cont(" %d:+in_sync", mdi->disk.raid_disk); + } + } + dprintf_cont(" )\n"); + + /* move curr_ to prev_ */ + a->prev_state = a->curr_state; + + a->prev_action = a->curr_action; + + for (mdi = a->info.devs; mdi ; mdi = mdi->next) { + mdi->prev_state = mdi->curr_state; + mdi->next_state = 0; + } + + if (check_degraded || check_reshape) { + /* manager will do the actual check */ + if (check_degraded) + a->check_degraded = 1; + if (check_reshape) + a->check_reshape = 1; + signal_manager(); + } + + if (deactivate) + a->container = NULL; + + return ret; +} + +static struct mdinfo * +find_device(struct active_array *a, int major, int minor) +{ + struct mdinfo *mdi; + + for (mdi = a->info.devs ; mdi ; mdi = mdi->next) + if (mdi->disk.major == major && mdi->disk.minor == minor) + return mdi; + + return NULL; +} + +static void reconcile_failed(struct active_array *aa, struct mdinfo *failed) +{ + struct active_array *a; + struct mdinfo *victim; + + for (a = aa; a; a = a->next) { + if (!a->container || a->to_remove) + continue; + victim = find_device(a, failed->disk.major, failed->disk.minor); + if 
(!victim) + continue; + + if (!(victim->curr_state & DS_FAULTY)) + write_attr("faulty", victim->state_fd); + } +} + +#ifdef DEBUG +static void dprint_wake_reasons(fd_set *fds) +{ + int i; + char proc_path[256]; + char link[256]; + char *basename; + int rv; + + fprintf(stderr, "monitor: wake ( "); + for (i = 0; i < FD_SETSIZE; i++) { + if (FD_ISSET(i, fds)) { + sprintf(proc_path, "/proc/%d/fd/%d", + (int) getpid(), i); + + rv = readlink(proc_path, link, sizeof(link) - 1); + if (rv < 0) { + fprintf(stderr, "%d:unknown ", i); + continue; + } + link[rv] = '\0'; + basename = strrchr(link, '/'); + fprintf(stderr, "%d:%s ", + i, basename ? ++basename : link); + } + } + fprintf(stderr, ")\n"); +} +#endif + +int monitor_loop_cnt; + +static int wait_and_act(struct supertype *container, int nowait) +{ + fd_set rfds; + int maxfd = 0; + struct active_array **aap = &container->arrays; + struct active_array *a, **ap; + int rv; + struct mdinfo *mdi; + static unsigned int dirty_arrays = ~0; /* start at some non-zero value */ + + FD_ZERO(&rfds); + + for (ap = aap ; *ap ;) { + a = *ap; + /* once an array has been deactivated we want to + * ask the manager to discard it. + */ + if (!a->container || a->to_remove) { + if (discard_this) { + ap = &(*ap)->next; + continue; + } + *ap = a->next; + a->next = NULL; + discard_this = a; + signal_manager(); + continue; + } + + add_fd(&rfds, &maxfd, a->info.state_fd); + add_fd(&rfds, &maxfd, a->action_fd); + add_fd(&rfds, &maxfd, a->sync_completed_fd); + for (mdi = a->info.devs ; mdi ; mdi = mdi->next) { + add_fd(&rfds, &maxfd, mdi->state_fd); + add_fd(&rfds, &maxfd, mdi->bb_fd); + add_fd(&rfds, &maxfd, mdi->ubb_fd); + } + + ap = &(*ap)->next; + } + + if (manager_ready && (*aap == NULL || (sigterm && !dirty_arrays))) { + /* No interesting arrays, or we have been told to + * terminate and everything is clean. Lets see about + * exiting. 
Note that blocking at this point is not a + * problem as there are no active arrays, there is + * nothing that we need to be ready to do. + */ + int fd; + if (sigterm) + fd = open_dev_excl(container->devnm); + else + fd = open_dev_flags(container->devnm, O_RDONLY|O_EXCL); + if (fd >= 0 || errno != EBUSY) { + /* OK, we are safe to leave */ + if (sigterm && !dirty_arrays) + dprintf("caught sigterm, all clean... exiting\n"); + else + dprintf("no arrays to monitor... exiting\n"); + if (!sigterm) + /* On SIGTERM, someone (the take-over mdmon) will + * clean up + */ + remove_pidfile(container->devnm); + exit_now = 1; + signal_manager(); + close(fd); + exit(0); + } + } + + if (!nowait) { + sigset_t set; + struct timespec ts; + ts.tv_sec = 24*3600; + ts.tv_nsec = 0; + if (*aap == NULL || container->retry_soon) { + /* just waiting to get O_EXCL access */ + ts.tv_sec = 0; + ts.tv_nsec = 20000000ULL; + } + sigprocmask(SIG_UNBLOCK, NULL, &set); + sigdelset(&set, SIGUSR1); + monitor_loop_cnt |= 1; + rv = pselect(maxfd+1, NULL, NULL, &rfds, &ts, &set); + monitor_loop_cnt += 1; + if (rv == -1) { + if (errno == EINTR) { + rv = 0; + FD_ZERO(&rfds); + dprintf("monitor: caught signal\n"); + } else + dprintf("monitor: error %d in pselect\n", + errno); + } + #ifdef DEBUG + else + dprint_wake_reasons(&rfds); + #endif + container->retry_soon = 0; + } + + if (update_queue) { + struct metadata_update *this; + + for (this = update_queue; this ; this = this->next) + container->ss->process_update(container, this); + + update_queue_handled = update_queue; + update_queue = NULL; + signal_manager(); + container->ss->sync_metadata(container); + } + + rv = 0; + dirty_arrays = 0; + for (a = *aap; a ; a = a->next) { + + if (a->replaces && !discard_this) { + struct active_array **ap; + for (ap = &a->next; *ap && *ap != a->replaces; + ap = & (*ap)->next) + ; + if (*ap) + *ap = (*ap)->next; + discard_this = a->replaces; + a->replaces = NULL; + /* FIXME check if device->state_fd need to be cleared?*/ + 
signal_manager(); + } + if (a->container && !a->to_remove) { + int ret = read_and_act(a, &rfds); + rv |= 1; + dirty_arrays += !!(ret & ARRAY_DIRTY); + /* when terminating stop manipulating the array after it + * is clean, but make sure read_and_act() is given a + * chance to handle 'active_idle' + */ + if (sigterm && !(ret & ARRAY_DIRTY)) + a->container = NULL; /* stop touching this array */ + if (ret & ARRAY_BUSY) + container->retry_soon = 1; + } + } + + /* propagate failures across container members */ + for (a = *aap; a ; a = a->next) { + if (!a->container || a->to_remove) + continue; + for (mdi = a->info.devs ; mdi ; mdi = mdi->next) + if (mdi->curr_state & DS_FAULTY) + reconcile_failed(*aap, mdi); + } + + return rv; +} + +void do_monitor(struct supertype *container) +{ + int rv; + int first = 1; + do { + rv = wait_and_act(container, first); + first = 0; + } while (rv >= 0); +} @@ -0,0 +1,475 @@ +/* + * Copyright (C) 2008 Intel Corporation + * + * mdmon socket / message handling + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 
+ */ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif +#include <unistd.h> +#include <stdio.h> +#include <string.h> +#include <stdlib.h> +#include <errno.h> +#include <fcntl.h> +#include <sys/types.h> +#include <sys/socket.h> +#include <sys/un.h> +#include "mdadm.h" +#include "mdmon.h" + +static const __u32 start_magic = 0x5a5aa5a5; +static const __u32 end_magic = 0xa5a55a5a; + +static int send_buf(int fd, const void* buf, int len, int tmo) +{ + fd_set set; + int rv; + struct timeval timeout = {tmo, 0}; + struct timeval *ptmo = tmo ? &timeout : NULL; + + while (len) { + FD_ZERO(&set); + FD_SET(fd, &set); + rv = select(fd+1, NULL, &set, NULL, ptmo); + if (rv <= 0) + return -1; + rv = write(fd, buf, len); + if (rv <= 0) + return -1; + len -= rv; + buf += rv; + } + return 0; +} + +static int recv_buf(int fd, void* buf, int len, int tmo) +{ + fd_set set; + int rv; + struct timeval timeout = {tmo, 0}; + struct timeval *ptmo = tmo ? &timeout : NULL; + + while (len) { + FD_ZERO(&set); + FD_SET(fd, &set); + rv = select(fd+1, &set, NULL, NULL, ptmo); + if (rv <= 0) + return -1; + rv = read(fd, buf, len); + if (rv <= 0) + return -1; + len -= rv; + buf += rv; + } + return 0; +} + +int send_message(int fd, struct metadata_update *msg, int tmo) +{ + __s32 len = msg->len; + int rv; + + rv = send_buf(fd, &start_magic, 4, tmo); + rv = rv ?: send_buf(fd, &len, 4, tmo); + if (len > 0) + rv = rv ?: send_buf(fd, msg->buf, msg->len, tmo); + rv = send_buf(fd, &end_magic, 4, tmo); + + return rv; +} + +int receive_message(int fd, struct metadata_update *msg, int tmo) +{ + __u32 magic; + __s32 len; + int rv; + + rv = recv_buf(fd, &magic, 4, tmo); + if (rv < 0 || magic != start_magic) + return -1; + rv = recv_buf(fd, &len, 4, tmo); + if (rv < 0 || len > MSG_MAX_LEN) + return -1; + if (len > 0) { + msg->buf = xmalloc(len); + rv = recv_buf(fd, msg->buf, len, tmo); + if (rv < 0) { + free(msg->buf); + return -1; + } + } else + msg->buf = NULL; + rv = recv_buf(fd, &magic, 4, tmo); + if (rv < 0 
|| magic != end_magic) { + free(msg->buf); + return -1; + } + msg->len = len; + return 0; +} + +int ack(int fd, int tmo) +{ + struct metadata_update msg = { .len = 0 }; + + return send_message(fd, &msg, tmo); +} + +int wait_reply(int fd, int tmo) +{ + struct metadata_update msg; + int err = receive_message(fd, &msg, tmo); + + /* mdmon sent extra data, but caller only cares that we got a + * successful reply + */ + if (err == 0 && msg.len > 0) + free(msg.buf); + + return err; +} + +int connect_monitor(char *devname) +{ + char path[100]; + int sfd; + long fl; + struct sockaddr_un addr; + int pos; + char *c; + + pos = sprintf(path, "%s/", MDMON_DIR); + if (is_subarray(devname)) { + devname++; + c = strchr(devname, '/'); + if (!c) + return -1; + snprintf(&path[pos], c - devname + 1, "%s", devname); + pos += c - devname; + } else + pos += sprintf(&path[pos], "%s", devname); + sprintf(&path[pos], ".sock"); + + sfd = socket(PF_LOCAL, SOCK_STREAM, 0); + if (sfd < 0) + return -1; + + addr.sun_family = PF_LOCAL; + strcpy(addr.sun_path, path); + if (connect(sfd, (struct sockaddr*)&addr, sizeof(addr)) < 0) { + close(sfd); + return -1; + } + + fl = fcntl(sfd, F_GETFL, 0); + fl |= O_NONBLOCK; + fcntl(sfd, F_SETFL, fl); + + return sfd; +} + +int fping_monitor(int sfd) +{ + int err = 0; + + if (sfd < 0) + return sfd; + + /* try to ping existing socket */ + if (ack(sfd, 20) != 0) + err = -1; + + /* check the reply */ + if (!err && wait_reply(sfd, 20) != 0) + err = -1; + + return err; +} + +/* give the monitor a chance to update the metadata */ +int ping_monitor(char *devname) +{ + int sfd = connect_monitor(devname); + int err; + + if (sfd >= 0) { + err = fping_monitor(sfd); + close(sfd); + } else + err = -1; + + return err; +} + +static char *ping_monitor_version(char *devname) +{ + int sfd = connect_monitor(devname); + struct metadata_update msg; + int err = 0; + + if (sfd < 0) + return NULL; + + if (ack(sfd, 20) != 0) + err = -1; + + if (!err && receive_message(sfd, &msg, 20) != 
0) + err = -1; + + close(sfd); + + if (err || !msg.len || !msg.buf) + return NULL; + return msg.buf; +} + +int unblock_subarray(struct mdinfo *sra, const int unfreeze) +{ + char buf[64]; + int rc = 0; + + if (sra) { + sprintf(buf, "external:%s\n", sra->text_version); + buf[9] = '/'; + } else + buf[9] = '-'; + + if (buf[9] == '-' || + sysfs_set_str(sra, NULL, "metadata_version", buf) || + (unfreeze && + sysfs_attribute_available(sra, NULL, "sync_action") && + sysfs_set_str(sra, NULL, "sync_action", "idle"))) + rc = -1; + return rc; +} + +int block_subarray(struct mdinfo *sra) +{ + char buf[64]; + int rc = 0; + + sprintf(buf, "external:%s\n", sra->text_version); + buf[9] = '-'; + if (sysfs_set_str(sra, NULL, "metadata_version", buf)) + rc = -1; + + return rc; +} + +/* check mdmon version if it supports + * array blocking mechanism + */ +int check_mdmon_version(char *container) +{ + char *version = NULL; + + if (!mdmon_running(container)) { + /* if mdmon is not active we assume that any instance that is + * later started will match the current mdadm version, if this + * assumption is violated we may inadvertently rebuild an array + * that was meant for reshape, or start rebuild on a spare that + * was to be moved to another container + */ + /* pass */; + } else { + int ver; + + version = ping_monitor_version(container); + ver = version ? mdadm_version(version) : -1; + free(version); + if (ver < 3002000) { + pr_err("mdmon instance for %s cannot be disabled\n", + container); + return -1; + } + } + + return 0; +} + +/** + * block_monitor - prevent mdmon spare assignment + * @container - container to block + * @freeze - flag to additionally freeze sync_action + * + * This is used by the reshape code to freeze the container, and the + * auto-rebuild implementation to atomically move spares. + * In both cases we need to stop mdmon from assigning spares to replace + * failed devices as we might have other plans for the spare. 
+ * For the reshape case we also need to 'freeze' sync_action so that + * no recovery happens until we have fully prepared for the reshape. + * + * We tell mdmon that the array is frozen by marking the 'metadata' name + * with a leading '-'. The previously told mdmon "Don't make this array + * read/write, leave it readonly". Now it means a more general "Don't + * reconfigure this array at all". + * As older versions of mdmon (which might run from initrd) don't understand + * this, we first check that the running mdmon is new enough. + */ +int block_monitor(char *container, const int freeze) +{ + struct mdstat_ent *ent, *e, *e2; + struct mdinfo *sra = NULL; + char buf[64]; + int rv = 0; + + if (check_mdmon_version(container)) + return -1; + + ent = mdstat_read(0, 0); + if (!ent) { + pr_err("failed to read /proc/mdstat while disabling mdmon\n"); + return -1; + } + + /* freeze container contents */ + for (e = ent; e; e = e->next) { + if (!is_container_member(e, container)) + continue; + sysfs_free(sra); + sra = sysfs_read(-1, e->devnm, GET_VERSION); + if (!sra) { + pr_err("failed to read sysfs for subarray%s\n", + to_subarray(e, container)); + break; + } + /* can't reshape an array that we can't monitor */ + if (sra->text_version[0] == '-') + break; + + if (freeze && sysfs_freeze_array(sra) < 1) + break; + /* flag this array to not be modified by mdmon (close race with + * takeover in reshape case and spare reassignment in the + * auto-rebuild case) + */ + if (block_subarray(sra)) + break; + ping_monitor(container); + + /* check that we did not race with recovery */ + if ((freeze && + !sysfs_attribute_available(sra, NULL, "sync_action")) || + (freeze && + sysfs_attribute_available(sra, NULL, "sync_action") && + sysfs_get_str(sra, NULL, "sync_action", buf, 20) > 0 && + strcmp(buf, "frozen\n") == 0)) + /* pass */; + else { + unblock_subarray(sra, 0); + break; + } + /* Double check against races - there should be no spares + * or part-spares + */ + sysfs_free(sra); + sra 
= sysfs_read(-1, e->devnm, GET_DEVS | GET_STATE); + if (sra && sra->array.spare_disks > 0) { + unblock_subarray(sra, freeze); + break; + } + } + + if (e) { + pr_err("failed to freeze subarray%s\n", + to_subarray(e, container)); + + /* thaw the partially frozen container */ + for (e2 = ent; e2 && e2 != e; e2 = e2->next) { + if (!is_container_member(e2, container)) + continue; + sysfs_free(sra); + sra = sysfs_read(-1, e2->devnm, GET_VERSION); + if (unblock_subarray(sra, freeze)) + pr_err("Failed to unfreeze %s\n", e2->devnm); + } + + ping_monitor(container); /* cleared frozen */ + rv = -1; + } + + sysfs_free(sra); + free_mdstat(ent); + + return rv; +} + +void unblock_monitor(char *container, const int unfreeze) +{ + struct mdstat_ent *ent, *e; + struct mdinfo *sra = NULL; + int to_ping = 0; + + ent = mdstat_read(0, 0); + if (!ent) { + pr_err("failed to read /proc/mdstat while unblocking container\n"); + return; + } + + /* unfreeze container contents */ + for (e = ent; e; e = e->next) { + if (!is_container_member(e, container)) + continue; + sysfs_free(sra); + sra = sysfs_read(-1, e->devnm, GET_VERSION|GET_LEVEL); + if (!sra) + continue; + if (sra->array.level > 0) + to_ping++; + if (unblock_subarray(sra, unfreeze)) + pr_err("Failed to unfreeze %s\n", e->devnm); + } + if (to_ping) + ping_monitor(container); + + sysfs_free(sra); + free_mdstat(ent); +} + +/* give the manager a chance to view the updated container state. 
This + * would naturally happen due to the manager noticing a change in + * /proc/mdstat; however, pinging encourages this detection to happen + * while an exclusive open() on the container is active + */ +int ping_manager(char *devname) +{ + int sfd = connect_monitor(devname); + struct metadata_update msg = { .len = -1 }; + int err = 0; + + if (sfd < 0) + return sfd; + + err = send_message(sfd, &msg, 20); + + /* check the reply */ + if (!err && wait_reply(sfd, 20) != 0) + err = -1; + + close(sfd); + return err; +} + +/* using takeover operation for grow purposes, mdadm has to be sure + * that mdmon processes all updates, and if necessary it will be closed + * at takeover to raid0 operation + */ +void flush_mdmon(char *container) +{ + ping_manager(container); + ping_monitor(container); +} @@ -0,0 +1,37 @@ +/* + * Copyright (C) 2008 Intel Corporation + * + * mdmon socket / message handling + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 
+ */ + +struct mdinfo; +struct metadata_update; + +extern int receive_message(int fd, struct metadata_update *msg, int tmo); +extern int send_message(int fd, struct metadata_update *msg, int tmo); +extern int ack(int fd, int tmo); +extern int wait_reply(int fd, int tmo); +extern int connect_monitor(char *devname); +extern int ping_monitor(char *devname); +extern int block_subarray(struct mdinfo *sra); +extern int unblock_subarray(struct mdinfo *sra, const int unfreeze); +extern int block_monitor(char *container, const int freeze); +extern void unblock_monitor(char *container, const int unfreeze); +extern int fping_monitor(int sock); +extern int ping_manager(char *devname); +extern void flush_mdmon(char *container); + +#define MSG_MAX_LEN (4*1024*1024) @@ -0,0 +1,79 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2010 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neil@brown.name> + * + */ + +/* Structure definitions ext for MBR and GPT partition tables + */ + +#define MBR_SIGNATURE_MAGIC __cpu_to_le16(0xAA55) +#define MBR_PARTITIONS 4 + +struct MBR_part_record { + __u8 bootable; + __u8 first_head; + __u8 first_sector; + __u8 first_cyl; + __u8 part_type; + __u8 last_head; + __u8 last_sector; + __u8 last_cyl; + __u32 first_sect_lba; + __u32 blocks_num; +} __attribute__((packed)); + +struct MBR { + __u8 pad[446]; + struct MBR_part_record parts[MBR_PARTITIONS]; + __u16 magic; +} __attribute__((packed)); + +#define GPT_SIGNATURE_MAGIC __cpu_to_le64(0x5452415020494645ULL) +#define MBR_GPT_PARTITION_TYPE 0xEE + +struct GPT_part_entry { + unsigned char type_guid[16]; + unsigned char partition_guid[16]; + __u64 starting_lba; + __u64 ending_lba; + unsigned char attr_bits[8]; + unsigned char name[72]; +} __attribute__((packed)); + +struct GPT { + __u64 magic; + __u32 revision; + __u32 header_size; + __u32 crc; + __u32 pad1; + __u64 current_lba; + __u64 backup_lba; + __u64 first_lba; + __u64 last_lba; + __u8 guid[16]; + __u64 part_start; + __u32 part_cnt; + __u32 part_size; + __u32 part_crc; + __u8 pad2[420]; +} __attribute__((packed)); diff --git a/platform-intel.c b/platform-intel.c new file mode 100644 index 0000000..5a8729e --- /dev/null +++ b/platform-intel.c @@ -0,0 +1,969 @@ +/* + * Intel(R) Matrix Storage Manager hardware and firmware support routines + * + * Copyright (C) 2008 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ +#include "mdadm.h" +#include "platform-intel.h" +#include "probe_roms.h" +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <dirent.h> +#include <fcntl.h> +#include <sys/mman.h> +#include <sys/types.h> +#include <sys/stat.h> +#include <limits.h> + +#define NVME_SUBSYS_PATH "/sys/devices/virtual/nvme-subsystem/" + +static int devpath_to_ll(const char *dev_path, const char *entry, + unsigned long long *val); + +static void free_sys_dev(struct sys_dev **list) +{ + while (*list) { + struct sys_dev *next = (*list)->next; + + if ((*list)->path) + free((*list)->path); + free(*list); + *list = next; + } +} + +struct sys_dev *find_driver_devices(const char *bus, const char *driver) +{ + /* search sysfs for devices driven by 'driver' */ + char path[PATH_MAX]; + char link[PATH_MAX]; + char *c, *p; + DIR *driver_dir; + struct dirent *de; + struct sys_dev *head = NULL; + struct sys_dev *list = NULL; + struct sys_dev *vmd = NULL; + enum sys_dev_type type; + unsigned long long dev_id; + unsigned long long class; + + if (strcmp(driver, "isci") == 0) + type = SYS_DEV_SAS; + else if (strcmp(driver, "ahci") == 0) + type = SYS_DEV_SATA; + else if (strcmp(driver, "nvme") == 0) { + /* if looking for nvme devs, first look for vmd */ + vmd = find_driver_devices("pci", "vmd"); + type = SYS_DEV_NVME; + } else if (strcmp(driver, "vmd") == 0) + type = SYS_DEV_VMD; + else + type = SYS_DEV_UNKNOWN; + + sprintf(path, "/sys/bus/%s/drivers/%s", bus, driver); + driver_dir = opendir(path); + if 
(!driver_dir) { + if (vmd) + free_sys_dev(&vmd); + return NULL; + } + for (de = readdir(driver_dir); de; de = readdir(driver_dir)) { + int n; + int skip = 0; + + /* is 'de' a device? check that the 'subsystem' link exists and + * that its target matches 'bus' + */ + sprintf(path, "/sys/bus/%s/drivers/%s/%s/subsystem", + bus, driver, de->d_name); + n = readlink(path, link, sizeof(link)); + if (n < 0 || n >= (int)sizeof(link)) + continue; + link[n] = '\0'; + c = strrchr(link, '/'); + if (!c) + continue; + if (strncmp(bus, c+1, strlen(bus)) != 0) + continue; + + sprintf(path, "/sys/bus/%s/drivers/%s/%s", + bus, driver, de->d_name); + + /* if searching for nvme - skip vmd connected one */ + if (type == SYS_DEV_NVME) { + struct sys_dev *dev; + char *rp = realpath(path, NULL); + for (dev = vmd; dev; dev = dev->next) { + if ((strncmp(dev->path, rp, strlen(dev->path)) == 0)) + skip = 1; + } + free(rp); + } + + /* if it's not Intel device or mark as VMD connected - skip it. */ + if (devpath_to_vendor(path) != 0x8086 || skip == 1) + continue; + + if (devpath_to_ll(path, "device", &dev_id) != 0) + continue; + + if (devpath_to_ll(path, "class", &class) != 0) + continue; + + /* + * Each VMD device (domain) adds separate PCI bus, it is better + * to store path as a path to that bus (easier further + * determination which NVMe dev is connected to this particular + * VMD domain). 
+ */ + if (type == SYS_DEV_VMD) { + sprintf(path, "/sys/bus/%s/drivers/%s/%s/domain/device", + bus, driver, de->d_name); + } + p = realpath(path, NULL); + if (p == NULL) { + pr_err("Unable to get real path for '%s'\n", path); + continue; + } + + /* start / add list entry */ + if (!head) { + head = xmalloc(sizeof(*head)); + list = head; + } else { + list->next = xmalloc(sizeof(*head)); + list = list->next; + } + + if (!list) { + free_sys_dev(&head); + break; + } + + list->dev_id = (__u16) dev_id; + list->class = (__u32) class; + list->type = type; + list->next = NULL; + list->path = p; + + if ((list->pci_id = strrchr(list->path, '/')) != NULL) + list->pci_id++; + } + closedir(driver_dir); + + if (vmd) { + if (list) + list->next = vmd; + else + head = vmd; + } + + return head; +} + +static struct sys_dev *intel_devices=NULL; +static time_t valid_time = 0; + +struct sys_dev *device_by_id(__u16 device_id) +{ + struct sys_dev *iter; + + for (iter = intel_devices; iter != NULL; iter = iter->next) + if (iter->dev_id == device_id) + return iter; + return NULL; +} + +struct sys_dev *device_by_id_and_path(__u16 device_id, const char *path) +{ + struct sys_dev *iter; + + for (iter = intel_devices; iter != NULL; iter = iter->next) + if ((iter->dev_id == device_id) && strstr(iter->path, path)) + return iter; + return NULL; +} + +static int devpath_to_ll(const char *dev_path, const char *entry, unsigned long long *val) +{ + char path[strlen(dev_path) + strlen(entry) + 2]; + int fd; + int n; + + sprintf(path, "%s/%s", dev_path, entry); + + fd = open(path, O_RDONLY); + if (fd < 0) + return -1; + n = sysfs_fd_get_ll(fd, val); + close(fd); + return n; +} + +__u16 devpath_to_vendor(const char *dev_path) +{ + char path[strlen(dev_path) + strlen("/vendor") + 1]; + char vendor[7]; + int fd; + __u16 id = 0xffff; + int n; + + sprintf(path, "%s/vendor", dev_path); + + fd = open(path, O_RDONLY); + if (fd < 0) + return 0xffff; + + n = read(fd, vendor, sizeof(vendor)); + if (n == 
sizeof(vendor)) { + vendor[n - 1] = '\0'; + id = strtoul(vendor, NULL, 16); + } + close(fd); + + return id; +} + +/* Description: Read text value of dev_path/entry field + * Parameters: + * dev_path - sysfs path to the device + * entry - entry to be read + * buf - buffer for read value + * len - size of buf + * verbose - error logging level + */ +int devpath_to_char(const char *dev_path, const char *entry, char *buf, int len, + int verbose) +{ + char path[PATH_MAX]; + + snprintf(path, sizeof(path), "%s/%s", dev_path, entry); + if (load_sys(path, buf, len)) { + if (verbose) + pr_err("Cannot read %s, aborting\n", path); + return 1; + } + + return 0; +} + +struct sys_dev *find_intel_devices(void) +{ + struct sys_dev *ahci, *isci, *nvme; + + if (valid_time > time(0) - 10) + return intel_devices; + + if (intel_devices) + free_sys_dev(&intel_devices); + + isci = find_driver_devices("pci", "isci"); + ahci = find_driver_devices("pci", "ahci"); + /* Searching for NVMe will return list of NVMe and VMD controllers */ + nvme = find_driver_devices("pci", "nvme"); + + if (!isci && !ahci) { + ahci = nvme; + } else if (!ahci) { + ahci = isci; + struct sys_dev *elem = ahci; + while (elem->next) + elem = elem->next; + elem->next = nvme; + } else { + struct sys_dev *elem = ahci; + while (elem->next) + elem = elem->next; + elem->next = isci; + while (elem->next) + elem = elem->next; + elem->next = nvme; + } + intel_devices = ahci; + valid_time = time(0); + return intel_devices; +} + +/* + * PCI Expansion ROM Data Structure Format */ +struct pciExpDataStructFormat { + __u8 ver[4]; + __u16 vendorID; + __u16 deviceID; + __u16 devListOffset; + __u16 pciDataStructLen; + __u8 pciDataStructRev; +} __attribute__ ((packed)); + +struct orom_entry *orom_entries; + +const struct orom_entry *get_orom_entry_by_device_id(__u16 dev_id) +{ + struct orom_entry *entry; + struct devid_list *devid; + + for (entry = orom_entries; entry; entry = entry->next) { + for (devid = entry->devid_list; devid; devid 
= devid->next) { + if (devid->devid == dev_id) + return entry; + } + } + + return NULL; +} + +const struct imsm_orom *get_orom_by_device_id(__u16 dev_id) +{ + const struct orom_entry *entry = get_orom_entry_by_device_id(dev_id); + + if (entry) + return &entry->orom; + + return NULL; +} + +static struct orom_entry *add_orom(const struct imsm_orom *orom) +{ + struct orom_entry *list; + struct orom_entry *prev = NULL; + + for (list = orom_entries; list; prev = list, list = list->next) + ; + + list = xmalloc(sizeof(struct orom_entry)); + list->orom = *orom; + list->devid_list = NULL; + list->next = NULL; + + if (prev == NULL) + orom_entries = list; + else + prev->next = list; + + return list; +} + +static void add_orom_device_id(struct orom_entry *entry, __u16 dev_id) +{ + struct devid_list *list; + struct devid_list *prev = NULL; + + for (list = entry->devid_list; list; prev = list, list = list->next) { + if (list->devid == dev_id) + return; + } + list = xmalloc(sizeof(struct devid_list)); + list->devid = dev_id; + list->next = NULL; + + if (prev == NULL) + entry->devid_list = list; + else + prev->next = list; +} + +static int scan(const void *start, const void *end, const void *data) +{ + int offset; + const struct imsm_orom *imsm_mem = NULL; + int len = (end - start); + struct pciExpDataStructFormat *ptr= (struct pciExpDataStructFormat *)data; + + if (data + 0x18 > end) { + dprintf("cannot find pciExpDataStruct \n"); + return 0; + } + + dprintf("ptr->vendorID: %lx __le16_to_cpu(ptr->deviceID): %lx \n", + (ulong) __le16_to_cpu(ptr->vendorID), + (ulong) __le16_to_cpu(ptr->deviceID)); + + if (__le16_to_cpu(ptr->vendorID) != 0x8086) + return 0; + + if (get_orom_by_device_id(ptr->deviceID)) + return 0; + + for (offset = 0; offset < len; offset += 4) { + const void *mem = start + offset; + + if ((memcmp(mem, IMSM_OROM_SIGNATURE, 4) == 0)) { + imsm_mem = mem; + break; + } + } + + if (!imsm_mem) + return 0; + + struct orom_entry *orom = add_orom(imsm_mem); + + /* only 
PciDataStructure with revision 3 and above supports devices list. */ + if (ptr->pciDataStructRev >= 3 && ptr->devListOffset) { + const __u16 *dev_list = (void *)ptr + ptr->devListOffset; + int i; + + for (i = 0; dev_list[i] != 0; i++) + add_orom_device_id(orom, dev_list[i]); + } else { + add_orom_device_id(orom, __le16_to_cpu(ptr->deviceID)); + } + + return 0; +} + +const struct imsm_orom *imsm_platform_test(struct sys_dev *hba) +{ + struct imsm_orom orom = { + .signature = IMSM_OROM_SIGNATURE, + .rlc = IMSM_OROM_RLC_RAID0 | IMSM_OROM_RLC_RAID1 | + IMSM_OROM_RLC_RAID10 | IMSM_OROM_RLC_RAID5, + .sss = IMSM_OROM_SSS_4kB | IMSM_OROM_SSS_8kB | + IMSM_OROM_SSS_16kB | IMSM_OROM_SSS_32kB | + IMSM_OROM_SSS_64kB | IMSM_OROM_SSS_128kB | + IMSM_OROM_SSS_256kB | IMSM_OROM_SSS_512kB | + IMSM_OROM_SSS_1MB | IMSM_OROM_SSS_2MB, + .dpa = IMSM_OROM_DISKS_PER_ARRAY, + .tds = IMSM_OROM_TOTAL_DISKS, + .vpa = IMSM_OROM_VOLUMES_PER_ARRAY, + .vphba = IMSM_OROM_VOLUMES_PER_HBA + }; + orom.attr = orom.rlc | IMSM_OROM_ATTR_ChecksumVerify; + + if (check_env("IMSM_TEST_OROM_NORAID5")) { + orom.rlc = IMSM_OROM_RLC_RAID0 | IMSM_OROM_RLC_RAID1 | + IMSM_OROM_RLC_RAID10; + } + if (check_env("IMSM_TEST_AHCI_EFI_NORAID5") && (hba->type == SYS_DEV_SAS)) { + orom.rlc = IMSM_OROM_RLC_RAID0 | IMSM_OROM_RLC_RAID1 | + IMSM_OROM_RLC_RAID10; + } + if (check_env("IMSM_TEST_SCU_EFI_NORAID5") && (hba->type == SYS_DEV_SATA)) { + orom.rlc = IMSM_OROM_RLC_RAID0 | IMSM_OROM_RLC_RAID1 | + IMSM_OROM_RLC_RAID10; + } + + struct orom_entry *ret = add_orom(&orom); + + add_orom_device_id(ret, hba->dev_id); + + return &ret->orom; +} + +static const struct imsm_orom *find_imsm_hba_orom(struct sys_dev *hba) +{ + unsigned long align; + + if (check_env("IMSM_TEST_OROM")) + return imsm_platform_test(hba); + + /* return empty OROM capabilities in EFI test mode */ + if (check_env("IMSM_TEST_AHCI_EFI") || check_env("IMSM_TEST_SCU_EFI")) + return NULL; + + find_intel_devices(); + + if (intel_devices == NULL) + return NULL; + + /* 
scan option-rom memory looking for an imsm signature */ + if (check_env("IMSM_SAFE_OROM_SCAN")) + align = 2048; + else + align = 512; + if (probe_roms_init(align) != 0) + return NULL; + probe_roms(); + /* ignore return value - True is returned if both adapater roms are found */ + scan_adapter_roms(scan); + probe_roms_exit(); + + return get_orom_by_device_id(hba->dev_id); +} + +#define GUID_STR_MAX 37 /* according to GUID format: + * xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx" */ + +#define EFI_GUID(a, b, c, d0, d1, d2, d3, d4, d5, d6, d7) \ +((struct efi_guid) \ +{{ (a) & 0xff, ((a) >> 8) & 0xff, ((a) >> 16) & 0xff, ((a) >> 24) & 0xff, \ + (b) & 0xff, ((b) >> 8) & 0xff, \ + (c) & 0xff, ((c) >> 8) & 0xff, \ + (d0), (d1), (d2), (d3), (d4), (d5), (d6), (d7) }}) + +#define SYS_EFI_VAR_PATH "/sys/firmware/efi/vars" +#define SYS_EFIVARS_PATH "/sys/firmware/efi/efivars" +#define SCU_PROP "RstScuV" +#define AHCI_PROP "RstSataV" +#define AHCI_SSATA_PROP "RstsSatV" +#define AHCI_TSATA_PROP "RsttSatV" +#define VMD_PROP "RstUefiV" + +#define VENDOR_GUID \ + EFI_GUID(0x193dfefa, 0xa445, 0x4302, 0x99, 0xd8, 0xef, 0x3a, 0xad, 0x1a, 0x04, 0xc6) + +#define PCI_CLASS_RAID_CNTRL 0x010400 + +static int read_efi_var(void *buffer, ssize_t buf_size, + const char *variable_name, struct efi_guid guid) +{ + char path[PATH_MAX]; + char buf[GUID_STR_MAX]; + int fd; + ssize_t n; + + snprintf(path, PATH_MAX, "%s/%s-%s", SYS_EFIVARS_PATH, variable_name, guid_str(buf, guid)); + + fd = open(path, O_RDONLY); + if (fd < 0) + return 1; + + /* read the variable attributes and ignore it */ + n = read(fd, buf, sizeof(__u32)); + if (n < 0) { + close(fd); + return 1; + } + + /* read the variable data */ + n = read(fd, buffer, buf_size); + close(fd); + if (n < buf_size) + return 1; + + return 0; +} + +static int read_efi_variable(void *buffer, ssize_t buf_size, + const char *variable_name, struct efi_guid guid) +{ + char path[PATH_MAX]; + char buf[GUID_STR_MAX]; + int dfd; + ssize_t n, var_data_len; + + /* Try 
to read the variable using the new efivarfs interface first. + * If that fails, fall back to the old sysfs-efivars interface. */ + if (!read_efi_var(buffer, buf_size, variable_name, guid)) + return 0; + + snprintf(path, PATH_MAX, "%s/%s-%s/size", SYS_EFI_VAR_PATH, variable_name, guid_str(buf, guid)); + + dprintf("EFI VAR: path=%s\n", path); + /* get size of variable data */ + dfd = open(path, O_RDONLY); + if (dfd < 0) + return 1; + + n = read(dfd, &buf, sizeof(buf)); + close(dfd); + if (n < 0) + return 1; + buf[n] = '\0'; + + errno = 0; + var_data_len = strtoul(buf, NULL, 16); + if ((errno == ERANGE && (var_data_len == LONG_MAX)) || + (errno != 0 && var_data_len == 0)) + return 1; + + /* get data */ + snprintf(path, PATH_MAX, "%s/%s-%s/data", SYS_EFI_VAR_PATH, variable_name, guid_str(buf, guid)); + + dprintf("EFI VAR: path=%s\n", path); + dfd = open(path, O_RDONLY); + if (dfd < 0) + return 1; + + n = read(dfd, buffer, buf_size); + close(dfd); + if (n != var_data_len || n < buf_size) { + return 1; + } + + return 0; +} + +const struct imsm_orom *find_imsm_efi(struct sys_dev *hba) +{ + struct imsm_orom orom; + struct orom_entry *ret; + static const char * const sata_efivars[] = {AHCI_PROP, AHCI_SSATA_PROP, + AHCI_TSATA_PROP}; + unsigned long i; + + if (check_env("IMSM_TEST_AHCI_EFI") || check_env("IMSM_TEST_SCU_EFI")) + return imsm_platform_test(hba); + + /* OROM test is set, return that there is no EFI capabilities */ + if (check_env("IMSM_TEST_OROM")) + return NULL; + + switch (hba->type) { + case SYS_DEV_SAS: + if (!read_efi_variable(&orom, sizeof(orom), SCU_PROP, + VENDOR_GUID)) + break; + + return NULL; + case SYS_DEV_SATA: + if (hba->class != PCI_CLASS_RAID_CNTRL) + return NULL; + + for (i = 0; i < ARRAY_SIZE(sata_efivars); i++) { + if (!read_efi_variable(&orom, sizeof(orom), + sata_efivars[i], VENDOR_GUID)) + break; + + } + if (i == ARRAY_SIZE(sata_efivars)) + return NULL; + + break; + case SYS_DEV_VMD: + if (!read_efi_variable(&orom, sizeof(orom), VMD_PROP, + 
VENDOR_GUID)) + break; + return NULL; + default: + return NULL; + } + + ret = add_orom(&orom); + add_orom_device_id(ret, hba->dev_id); + ret->type = hba->type; + + return &ret->orom; +} + +const struct imsm_orom *find_imsm_nvme(struct sys_dev *hba) +{ + static struct orom_entry *nvme_orom; + + if (hba->type != SYS_DEV_NVME) + return NULL; + + if (!nvme_orom) { + struct imsm_orom nvme_orom_compat = { + .signature = IMSM_NVME_OROM_COMPAT_SIGNATURE, + .rlc = IMSM_OROM_RLC_RAID0 | IMSM_OROM_RLC_RAID1 | + IMSM_OROM_RLC_RAID10 | IMSM_OROM_RLC_RAID5, + .sss = IMSM_OROM_SSS_4kB | IMSM_OROM_SSS_8kB | + IMSM_OROM_SSS_16kB | IMSM_OROM_SSS_32kB | + IMSM_OROM_SSS_64kB | IMSM_OROM_SSS_128kB, + .dpa = IMSM_OROM_DISKS_PER_ARRAY_NVME, + .tds = IMSM_OROM_TOTAL_DISKS_NVME, + .vpa = IMSM_OROM_VOLUMES_PER_ARRAY, + .vphba = IMSM_OROM_TOTAL_DISKS_NVME / 2 * IMSM_OROM_VOLUMES_PER_ARRAY, + .attr = IMSM_OROM_ATTR_2TB | IMSM_OROM_ATTR_2TB_DISK, + .driver_features = IMSM_OROM_CAPABILITIES_EnterpriseSystem + }; + nvme_orom = add_orom(&nvme_orom_compat); + } + add_orom_device_id(nvme_orom, hba->dev_id); + nvme_orom->type = SYS_DEV_NVME; + return &nvme_orom->orom; +} + +const struct imsm_orom *find_imsm_capability(struct sys_dev *hba) +{ + const struct imsm_orom *cap = get_orom_by_device_id(hba->dev_id); + + if (cap) + return cap; + + if (hba->type == SYS_DEV_NVME) + return find_imsm_nvme(hba); + if ((cap = find_imsm_efi(hba)) != NULL) + return cap; + if ((cap = find_imsm_hba_orom(hba)) != NULL) + return cap; + + return NULL; +} + +/* Check whether the nvme device is represented by nvme subsytem, + * if yes virtual path should be changed to hardware device path, + * to allow IMSM capabilities detection. 
+ * Returns: + * hardware path to device - if the device is represented via + * nvme virtual subsytem + * NULL - if the device is not represented via nvme virtual subsytem + */ +char *get_nvme_multipath_dev_hw_path(const char *dev_path) +{ + DIR *dir; + struct dirent *ent; + char *rp = NULL; + + if (strncmp(dev_path, NVME_SUBSYS_PATH, strlen(NVME_SUBSYS_PATH)) != 0) + return NULL; + + dir = opendir(dev_path); + if (!dir) + return NULL; + + for (ent = readdir(dir); ent; ent = readdir(dir)) { + char buf[strlen(dev_path) + strlen(ent->d_name) + 1]; + + /* Check if dir is a controller, ignore namespaces*/ + if (!(strncmp(ent->d_name, "nvme", 4) == 0) || + (strrchr(ent->d_name, 'n') != &ent->d_name[0])) + continue; + + sprintf(buf, "%s/%s", dev_path, ent->d_name); + rp = realpath(buf, NULL); + break; + } + + closedir(dir); + return rp; +} + +/* Description: Return part or whole realpath for the dev + * Parameters: + * dev - the device to be quered + * dev_level - level of "/device" entries. It allows to caller to access + * virtual or physical devices which are on "path" to quered + * one. + * buf - optional, must be PATH_MAX size. If set, then will be used. + */ +char *devt_to_devpath(dev_t dev, int dev_level, char *buf) +{ + char device[PATH_MAX]; + char *hw_path; + int i; + unsigned long device_free_len = sizeof(device) - 1; + char dev_str[] = "/device"; + unsigned long dev_str_len = strlen(dev_str); + + snprintf(device, sizeof(device), "/sys/dev/block/%d:%d", major(dev), + minor(dev)); + + /* If caller wants block device, return path to it even if it is exposed + * via virtual layer. 
+ */ + if (dev_level == 0) + return realpath(device, buf); + + device_free_len -= strlen(device); + for (i = 0; i < dev_level; i++) { + if (device_free_len < dev_str_len) + return NULL; + + strncat(device, dev_str, device_free_len); + + /* Resolve nvme-subsystem abstraction if needed + */ + device_free_len -= dev_str_len; + if (i == 0) { + char rp[PATH_MAX]; + + if (!realpath(device, rp)) + return NULL; + hw_path = get_nvme_multipath_dev_hw_path(rp); + if (hw_path) { + strcpy(device, hw_path); + device_free_len = sizeof(device) - + strlen(device) - 1; + free(hw_path); + } + } + } + + return realpath(device, buf); +} + +char *diskfd_to_devpath(int fd, int dev_level, char *buf) +{ + /* return the device path for a disk, return NULL on error or fd + * refers to a partition + */ + struct stat st; + + if (fstat(fd, &st) != 0) + return NULL; + if (!S_ISBLK(st.st_mode)) + return NULL; + + return devt_to_devpath(st.st_rdev, dev_level, buf); +} + +int path_attached_to_hba(const char *disk_path, const char *hba_path) +{ + int rc; + + if (check_env("IMSM_TEST_AHCI_DEV") || + check_env("IMSM_TEST_SCU_DEV")) { + return 1; + } + + if (!disk_path || !hba_path) + return 0; + dprintf("hba: %s - disk: %s\n", hba_path, disk_path); + if (strncmp(disk_path, hba_path, strlen(hba_path)) == 0) + rc = 1; + else + rc = 0; + + return rc; +} + +int devt_attached_to_hba(dev_t dev, const char *hba_path) +{ + char *disk_path = devt_to_devpath(dev, 1, NULL); + int rc = path_attached_to_hba(disk_path, hba_path); + + if (disk_path) + free(disk_path); + + return rc; +} + +int disk_attached_to_hba(int fd, const char *hba_path) +{ + char *disk_path = diskfd_to_devpath(fd, 1, NULL); + int rc = path_attached_to_hba(disk_path, hba_path); + + if (disk_path) + free(disk_path); + + return rc; +} + +char *vmd_domain_to_controller(struct sys_dev *hba, char *buf) +{ + struct dirent *ent; + DIR *dir; + char path[PATH_MAX]; + + if (!hba) + return NULL; + + if (hba->type != SYS_DEV_VMD) + return NULL; + + dir = 
opendir("/sys/bus/pci/drivers/vmd"); + if (!dir) + return NULL; + + for (ent = readdir(dir); ent; ent = readdir(dir)) { + sprintf(path, "/sys/bus/pci/drivers/vmd/%s/domain/device", + ent->d_name); + + if (!realpath(path, buf)) + continue; + + if (strncmp(buf, hba->path, strlen(buf)) == 0) { + sprintf(path, "/sys/bus/pci/drivers/vmd/%s", ent->d_name); + closedir(dir); + return realpath(path, buf); + } + } + + closedir(dir); + return NULL; +} + +/* Scan over all controller's namespaces and compare nsid value to verify if + * current one is supported. The routine doesn't check IMSM capabilities for + * namespace. Only one nvme namespace is supported by IMSM. + * Paramteres: + * fd - open descriptor to the nvme namespace + * verbose - error logging level + * Returns: + * 1 - if namespace is supported + * 0 - otherwise + */ +int imsm_is_nvme_namespace_supported(int fd, int verbose) +{ + DIR *dir = NULL; + struct dirent *ent; + char cntrl_path[PATH_MAX]; + char ns_path[PATH_MAX]; + unsigned long long lowest_nsid = ULLONG_MAX; + unsigned long long this_nsid; + int rv = 0; + + + if (!diskfd_to_devpath(fd, 1, cntrl_path) || + !diskfd_to_devpath(fd, 0, ns_path)) { + if (verbose) + pr_err("Cannot get device paths\n"); + goto abort; + } + + + if (devpath_to_ll(ns_path, "nsid", &this_nsid)) { + if (verbose) + pr_err("Cannot read nsid value for %s", + basename(ns_path)); + goto abort; + } + + dir = opendir(cntrl_path); + if (!dir) + goto abort; + + /* The lowest nvme namespace is supported */ + for (ent = readdir(dir); ent; ent = readdir(dir)) { + unsigned long long curr_nsid; + char curr_ns_path[PATH_MAX + 256]; + + if (!strstr(ent->d_name, "nvme")) + continue; + + snprintf(curr_ns_path, sizeof(curr_ns_path), "%s/%s", + cntrl_path, ent->d_name); + + if (devpath_to_ll(curr_ns_path, "nsid", &curr_nsid)) + goto abort; + + if (lowest_nsid > curr_nsid) + lowest_nsid = curr_nsid; + } + + if (this_nsid == lowest_nsid) + rv = 1; + else if (verbose) + pr_err("IMSM is supported on the 
lowest NVMe namespace\n"); + +abort: + if (dir) + closedir(dir); + + return rv; +} + +/* Verify if multipath is supported by NVMe controller + * Returns: + * 0 - not supported + * 1 - supported + */ +int is_multipath_nvme(int disk_fd) +{ + char ns_path[PATH_MAX]; + + if (!diskfd_to_devpath(disk_fd, 0, ns_path)) + return 0; + + if (strncmp(ns_path, NVME_SUBSYS_PATH, strlen(NVME_SUBSYS_PATH)) == 0) + return 1; + + return 0; +} diff --git a/platform-intel.h b/platform-intel.h new file mode 100644 index 0000000..6238d23 --- /dev/null +++ b/platform-intel.h @@ -0,0 +1,259 @@ +/* + * Intel(R) Matrix Storage Manager hardware and firmware support routines + * + * Copyright (C) 2008 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 
+ */
#include <asm/types.h>
#include <strings.h>

/* The IMSM Capability (IMSM AHCI and ISCU OROM/EFI variable) Version Table definition */
struct imsm_orom {
	__u8 signature[4];
	#define IMSM_OROM_SIGNATURE "$VER"
	#define IMSM_NVME_OROM_COMPAT_SIGNATURE "$NVM"
	__u8 table_ver_major; /* Currently 2 (can change with future revs) */
	__u8 table_ver_minor; /* Currently 2 (can change with future revs) */
	__u16 major_ver; /* Example: 8 as in 8.6.0.1020 */
	__u16 minor_ver; /* Example: 6 as in 8.6.0.1020 */
	__u16 hotfix_ver; /* Example: 0 as in 8.6.0.1020 */
	__u16 build; /* Example: 1020 as in 8.6.0.1020 */
	__u8 len; /* number of bytes in this entire table */
	__u8 checksum; /* checksum of all the bytes in this table */
	__u16 rlc; /* RAID Level Capability */
	/* we assume the cpu is x86 as the orom should not be found
	 * anywhere else
	 */
	#define IMSM_OROM_RLC_RAID0 (1 << 0)
	#define IMSM_OROM_RLC_RAID1 (1 << 1)
	#define IMSM_OROM_RLC_RAID10 (1 << 2)
	#define IMSM_OROM_RLC_RAID1E (1 << 3)
	#define IMSM_OROM_RLC_RAID5 (1 << 4)
	#define IMSM_OROM_RLC_RAID_CNG (1 << 5)
	__u16 sss; /* Strip Size Supported */
	#define IMSM_OROM_SSS_2kB (1 << 0)
	#define IMSM_OROM_SSS_4kB (1 << 1)
	#define IMSM_OROM_SSS_8kB (1 << 2)
	#define IMSM_OROM_SSS_16kB (1 << 3)
	#define IMSM_OROM_SSS_32kB (1 << 4)
	#define IMSM_OROM_SSS_64kB (1 << 5)
	#define IMSM_OROM_SSS_128kB (1 << 6)
	#define IMSM_OROM_SSS_256kB (1 << 7)
	#define IMSM_OROM_SSS_512kB (1 << 8)
	#define IMSM_OROM_SSS_1MB (1 << 9)
	#define IMSM_OROM_SSS_2MB (1 << 10)
	#define IMSM_OROM_SSS_4MB (1 << 11)
	#define IMSM_OROM_SSS_8MB (1 << 12)
	#define IMSM_OROM_SSS_16MB (1 << 13)
	#define IMSM_OROM_SSS_32MB (1 << 14)
	#define IMSM_OROM_SSS_64MB (1 << 15)
	__u16 dpa; /* Disks Per Array supported */
	#define IMSM_OROM_DISKS_PER_ARRAY 6
	#define IMSM_OROM_DISKS_PER_ARRAY_NVME 12
	__u16 tds; /* Total Disks Supported */
	#define IMSM_OROM_TOTAL_DISKS 6
	#define IMSM_OROM_TOTAL_DISKS_NVME 12
	__u8 vpa; /* # Volumes Per Array supported */
	#define IMSM_OROM_VOLUMES_PER_ARRAY 2
	__u8 vphba; /* # Volumes Per Host Bus Adapter supported */
	#define IMSM_OROM_VOLUMES_PER_HBA 4
	#define IMSM_OROM_VOLUMES_PER_HBA_NVME 4
	/* Attributes supported. This should map to the
	 * attributes in the MPB. Also, lower 16 bits
	 * should match/duplicate RLC bits above.
	 */
	__u32 attr;
	#define IMSM_OROM_ATTR_RAID0 IMSM_OROM_RLC_RAID0
	#define IMSM_OROM_ATTR_RAID1 IMSM_OROM_RLC_RAID1
	#define IMSM_OROM_ATTR_RAID10 IMSM_OROM_RLC_RAID10
	#define IMSM_OROM_ATTR_RAID1E IMSM_OROM_RLC_RAID1E
	#define IMSM_OROM_ATTR_RAID5 IMSM_OROM_RLC_RAID5
	#define IMSM_OROM_ATTR_RAID_CNG IMSM_OROM_RLC_RAID_CNG
	#define IMSM_OROM_ATTR_2TB_DISK (1 << 26)
	#define IMSM_OROM_ATTR_2TB (1 << 29)
	#define IMSM_OROM_ATTR_PM (1 << 30)
	#define IMSM_OROM_ATTR_ChecksumVerify (1 << 31)
	__u32 capabilities;
	#define IMSM_OROM_CAPABILITIES_Ext_SATA (1 << 0)
	#define IMSM_OROM_CAPABILITIES_TurboMemory (1 << 1)
	#define IMSM_OROM_CAPABILITIES_HddPassword (1 << 2)
	#define IMSM_OROM_CAPABILITIES_DiskCoercion (1 << 3)
	__u32 driver_features;
	#define IMSM_OROM_CAPABILITIES_HDDUnlock (1 << 0)
	#define IMSM_OROM_CAPABILITIES_LEDLoc (1 << 1)
	#define IMSM_OROM_CAPABILITIES_EnterpriseSystem (1 << 2)
	#define IMSM_OROM_CAPABILITIES_Zpodd (1 << 3)
	#define IMSM_OROM_CAPABILITIES_LargeDramCache (1 << 4)
	#define IMSM_OROM_CAPABILITIES_Rohi (1 << 5)
	#define IMSM_OROM_CAPABILITIES_ReadPatrol (1 << 6)
	#define IMSM_OROM_CAPABILITIES_XorHw (1 << 7)
	#define IMSM_OROM_CAPABILITIES_SKUMode ((1 << 8)|(1 << 9))
	#define IMSM_OROM_CAPABILITIES_TPV (1 << 10)
} __attribute__((packed));

/* RAID level capability tests: each returns 1 if the matching bit is set
 * in orom->rlc, 0 otherwise.
 */
static inline int imsm_orom_has_raid0(const struct imsm_orom *orom)
{
	return !!(orom->rlc & IMSM_OROM_RLC_RAID0);
}
static inline int imsm_orom_has_raid1(const struct imsm_orom *orom)
{
	return !!(orom->rlc & IMSM_OROM_RLC_RAID1);
}
static inline int imsm_orom_has_raid1e(const struct imsm_orom *orom)
{
	return !!(orom->rlc & IMSM_OROM_RLC_RAID1E);
}
static inline int imsm_orom_has_raid10(const struct imsm_orom *orom)
{
	return !!(orom->rlc & IMSM_OROM_RLC_RAID10);
}
static inline int imsm_orom_has_raid5(const struct imsm_orom *orom)
{
	return !!(orom->rlc & IMSM_OROM_RLC_RAID5);
}

/**
 * imsm_orom_has_chunk - check if the orom supports the given chunk size
 * @orom: orom pointer from find_imsm_orom
 * @chunk: chunk size in kibibytes
 *
 * Returns 1 if @chunk is a power of two that has its bit set in orom->sss,
 * 0 otherwise.  Bit 0 of sss stands for 2KiB, so sizes below 2KiB (and
 * zero or negative values) are never supported; rejecting them up front
 * also avoids shifting by a negative count.
 */
static inline int imsm_orom_has_chunk(const struct imsm_orom *orom, int chunk)
{
	int bit;

	if (chunk < 2)
		return 0; /* nothing below IMSM_OROM_SSS_2kB exists */
	if (chunk & (chunk - 1))
		return 0; /* not a power of 2 */

	/* ffs() is 1-based: 2KiB -> ffs 2 -> bit 0, 4KiB -> bit 1, ... */
	bit = ffs(chunk) - 2;
	if (bit > 15)
		return 0; /* sss is only 16 bits wide */
	return !!(orom->sss & (1 << bit));
}

/**
 * fls - find last (most-significant) bit set
 * @x: the word to search
 * The function is borrowed from Linux kernel code
 * include/asm-generic/bitops/fls.h
 *
 * Returns the 1-based position of the highest set bit, or 0 if @x is 0.
 * The search is done on an unsigned copy so that the left shifts never
 * move a bit into the sign position of a signed int.
 */
static inline int fls(int x)
{
	unsigned int v = (unsigned int)x;
	int r = 32;

	if (!v)
		return 0;
	if (!(v & 0xffff0000u)) {
		v <<= 16;
		r -= 16;
	}
	if (!(v & 0xff000000u)) {
		v <<= 8;
		r -= 8;
	}
	if (!(v & 0xf0000000u)) {
		v <<= 4;
		r -= 4;
	}
	if (!(v & 0xc0000000u)) {
		v <<= 2;
		r -= 2;
	}
	if (!(v & 0x80000000u)) {
		r -= 1;
	}
	return r;
}

/* 1 if the EnterpriseSystem bit is set in orom->driver_features */
static inline int imsm_orom_is_enterprise(const struct imsm_orom *orom)
{
	return !!(orom->driver_features & IMSM_OROM_CAPABILITIES_EnterpriseSystem);
}

/* 1 if the table carries the "$NVM" compatibility signature */
static inline int imsm_orom_is_nvme(const struct imsm_orom *orom)
{
	return memcmp(orom->signature, IMSM_NVME_OROM_COMPAT_SIGNATURE,
		      sizeof(orom->signature)) == 0;
}

/* 1 if the TPV bit is set in orom->driver_features */
static inline int imsm_orom_has_tpv_support(const struct imsm_orom *orom)
{
	return !!(orom->driver_features & IMSM_OROM_CAPABILITIES_TPV);
}

enum sys_dev_type {
	SYS_DEV_UNKNOWN = 0,
	SYS_DEV_SAS,
	SYS_DEV_SATA,
	SYS_DEV_NVME,
	SYS_DEV_VMD,
	SYS_DEV_MAX
};

/* singly linked list node describing one controller device */
struct sys_dev {
	enum sys_dev_type type;
	char *path;
	char *pci_id;
	__u16 dev_id;
	__u32 class;
	struct sys_dev *next;
};

+struct efi_guid { + __u8 b[16]; +}; + +struct devid_list { + __u16 devid; + struct devid_list *next; +}; + +struct orom_entry { + struct imsm_orom orom; + struct devid_list *devid_list; + enum sys_dev_type type; + struct orom_entry *next; +}; + +extern struct orom_entry *orom_entries; + +static inline char *guid_str(char *buf, struct efi_guid guid) +{ + sprintf(buf, "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x", + guid.b[3], guid.b[2], guid.b[1], guid.b[0], + guid.b[5], guid.b[4], guid.b[7], guid.b[6], + guid.b[8], guid.b[9], guid.b[10], guid.b[11], + guid.b[12], guid.b[13], guid.b[14], guid.b[15]); + return buf; +} + +char *get_nvme_multipath_dev_hw_path(const char *dev_path); +char *diskfd_to_devpath(int fd, int dev_level, char *buf); +int devpath_to_char(const char *dev_path, const char *entry, char *buf, + int len, int verbose); +__u16 devpath_to_vendor(const char *dev_path); +struct sys_dev *find_driver_devices(const char *bus, const char *driver); +struct sys_dev *find_intel_devices(void); +const struct imsm_orom *find_imsm_capability(struct sys_dev *hba); +const struct imsm_orom *find_imsm_orom(void); +int disk_attached_to_hba(int fd, const char *hba_path); +int devt_attached_to_hba(dev_t dev, const char *hba_path); +char *devt_to_devpath(dev_t dev, int dev_level, char *buf); +int path_attached_to_hba(const char *disk_path, const char *hba_path); +const char *get_sys_dev_type(enum sys_dev_type); +const struct orom_entry *get_orom_entry_by_device_id(__u16 dev_id); +const struct imsm_orom *get_orom_by_device_id(__u16 device_id); +struct sys_dev *device_by_id(__u16 device_id); +struct sys_dev *device_by_id_and_path(__u16 device_id, const char *path); +int is_multipath_nvme(int disk_fd); +int imsm_is_nvme_namespace_supported(int disk_fd, int verbose); +char *vmd_domain_to_controller(struct sys_dev *hba, char *buf); diff --git a/policy.c b/policy.c new file mode 100644 index 0000000..eee9ef6 --- /dev/null +++ b/policy.c @@ -0,0 +1,931 @@ 
+/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +#include "mdadm.h" +#include <dirent.h> +#include <fnmatch.h> +#include <ctype.h> +#include "dlink.h" +/* + * Policy module for mdadm. + * A policy statement about a device lists a set of values for each + * of a set of names. Each value can have a metadata type as context. 
+ * + * names include: + * action - the actions that can be taken on hot-plug + * domain - the domain(s) that the device is part of + * + * Policy information is extracted from various sources, but + * particularly from a set of policy rules in mdadm.conf + */ + +static void pol_new(struct dev_policy **pol, char *name, const char *val, + const char *metadata) +{ + struct dev_policy *n = xmalloc(sizeof(*n)); + const char *real_metadata = NULL; + int i; + + n->name = name; + n->value = val; + + /* We need to normalise the metadata name */ + if (metadata) { + for (i = 0; superlist[i] ; i++) + if (strcmp(metadata, superlist[i]->name) == 0) { + real_metadata = superlist[i]->name; + break; + } + if (!real_metadata) { + if (strcmp(metadata, "1") == 0 || + strcmp(metadata, "1.0") == 0 || + strcmp(metadata, "1.1") == 0 || + strcmp(metadata, "1.2") == 0) + real_metadata = super1.name; + } + if (!real_metadata) { + static const char *prev = NULL; + if (prev != metadata) { + pr_err("metadata=%s unrecognised - ignoring rule\n", + metadata); + prev = metadata; + } + real_metadata = "unknown"; + } + } + + n->metadata = real_metadata; + n->next = *pol; + *pol = n; +} + +static int pol_lesseq(struct dev_policy *a, struct dev_policy *b) +{ + int cmp; + + if (a->name < b->name) + return 1; + if (a->name > b->name) + return 0; + + cmp = strcmp(a->value, b->value); + if (cmp < 0) + return 1; + if (cmp > 0) + return 0; + + return (a->metadata <= b->metadata); +} + +static void pol_sort(struct dev_policy **pol) +{ + /* sort policy list in *pol by name/metadata/value + * using merge sort + */ + + struct dev_policy *pl[2]; + pl[0] = *pol; + pl[1] = NULL; + + do { + struct dev_policy **plp[2], *p[2]; + int curr = 0; + struct dev_policy nul = { NULL, NULL, NULL, NULL }; + struct dev_policy *prev = &nul; + int next = 0; + + /* p[] are the two lists that we are merging. + * plp[] are the ends of the two lists we create + * from the merge. 
+ * 'curr' is which of plp[] that we are currently + * adding items to. + * 'next' is which if p[] we will take the next + * item from. + * 'prev' is that last value, which was placed in + * plp[curr]. + */ + plp[0] = &pl[0]; + plp[1] = &pl[1]; + p[0] = pl[0]; + p[1] = pl[1]; + + /* take least of p[0] and p[1] + * if it is larger than prev, add to + * plp[curr], else swap curr then add + */ + while (p[0] || p[1]) { + if (p[next] == NULL || + (p[1-next] != NULL && + !(pol_lesseq(prev, p[1-next]) + ^pol_lesseq(prev, p[next]) + ^pol_lesseq(p[next], p[1-next]))) + ) + next = 1 - next; + + if (!pol_lesseq(prev, p[next])) + curr = 1 - curr; + + *plp[curr] = prev = p[next]; + plp[curr] = &p[next]->next; + p[next] = p[next]->next; + } + *plp[0] = NULL; + *plp[1] = NULL; + } while (pl[0] && pl[1]); + if (pl[0]) + *pol = pl[0]; + else + *pol = pl[1]; +} + +static void pol_dedup(struct dev_policy *pol) +{ + /* This is a sorted list - remove duplicates. */ + while (pol && pol->next) { + if (pol_lesseq(pol->next, pol)) { + struct dev_policy *tmp = pol->next; + pol->next = tmp->next; + free(tmp); + } else + pol = pol->next; + } +} + +/* + * pol_find finds the first entry in the policy + * list to match name. + * If it returns non-NULL there is at least one + * value, but how many can only be found by + * iterating through the list. 
+ */ +struct dev_policy *pol_find(struct dev_policy *pol, char *name) +{ + while (pol && pol->name < name) + pol = pol->next; + + if (!pol || pol->name != name) + return NULL; + return pol; +} + +static char **disk_paths(struct mdinfo *disk) +{ + struct stat stb; + int prefix_len; + DIR *by_path; + char symlink[PATH_MAX] = "/dev/disk/by-path/"; + char **paths; + int cnt = 0; + struct dirent *ent; + + paths = xmalloc(sizeof(*paths) * (cnt+1)); + + by_path = opendir(symlink); + if (by_path) { + prefix_len = strlen(symlink); + while ((ent = readdir(by_path)) != NULL) { + if (ent->d_type != DT_LNK) + continue; + strncpy(symlink + prefix_len, + ent->d_name, + sizeof(symlink) - prefix_len); + if (stat(symlink, &stb) < 0) + continue; + if ((stb.st_mode & S_IFMT) != S_IFBLK) + continue; + if (stb.st_rdev != makedev(disk->disk.major, disk->disk.minor)) + continue; + paths[cnt++] = xstrdup(ent->d_name); + paths = xrealloc(paths, sizeof(*paths) * (cnt+1)); + } + closedir(by_path); + } + paths[cnt] = NULL; + return paths; +} + +char type_part[] = "part"; +char type_disk[] = "disk"; +static char *disk_type(struct mdinfo *disk) +{ + char buf[30+20+20]; + struct stat stb; + sprintf(buf, "/sys/dev/block/%d:%d/partition", + disk->disk.major, disk->disk.minor); + if (stat(buf, &stb) == 0) + return type_part; + else + return type_disk; +} + +static int path_has_part(char *path, char **part) +{ + /* check if path ends with "-partNN" and + * if it does, place a pointer to "-pathNN" + * in 'part'. + */ + int l; + if (!path) + return 0; + l = strlen(path); + while (l > 1 && isdigit(path[l-1])) + l--; + if (l < 5 || strncmp(path+l-5, "-part", 5) != 0) + return 0; + *part = path+l-5; + return 1; +} + +static int pol_match(struct rule *rule, char **paths, char *type, char **part) +{ + /* Check if this rule matches on any path and type. + * If 'part' is not NULL, then 'path' must end in -partN, which + * we ignore for matching, and return in *part on success. 
+ */ + int pathok = 0; /* 0 == no path, 1 == match, -1 == no match yet */ + int typeok = 0; + + for (; rule; rule = rule->next) { + if (rule->name == rule_path) { + char *p = NULL; + int i; + if (pathok == 0) + pathok = -1; + if (!paths) + continue; + for (i = 0; paths[i]; i++) { + if (part) { + if (!path_has_part(paths[i], &p)) + continue; + *p = '\0'; + *part = p+1; + } + if (fnmatch(rule->value, paths[i], 0) == 0) + pathok = 1; + if (part) + *p = '-'; + } + } + if (rule->name == rule_type) { + if (typeok == 0) + typeok = -1; + if (type && strcmp(rule->value, type) == 0) + typeok = 1; + } + } + return pathok >= 0 && typeok >= 0; +} + +static void pol_merge(struct dev_policy **pol, struct rule *rule) +{ + /* copy any name assignments from rule into pol */ + struct rule *r; + char *metadata = NULL; + for (r = rule; r ; r = r->next) + if (r->name == pol_metadata) + metadata = r->value; + + for (r = rule; r ; r = r->next) + if (r->name == pol_act || + r->name == pol_domain || + r->name == pol_auto) + pol_new(pol, r->name, r->value, metadata); +} + +static void pol_merge_part(struct dev_policy **pol, struct rule *rule, char *part) +{ + /* copy any name assignments from rule into pol, appending + * -part to any domain. The string with -part appended is + * stored with the rule so it has a lifetime to match + * the rule. 
+ */ + struct rule *r; + char *metadata = NULL; + for (r = rule; r ; r = r->next) + if (r->name == pol_metadata) + metadata = r->value; + + for (r = rule; r ; r = r->next) { + if (r->name == pol_act) + pol_new(pol, r->name, r->value, metadata); + else if (r->name == pol_domain) { + char *dom; + int len; + if (r->dups == NULL) + r->dups = dl_head(); + len = strlen(r->value); + for (dom = dl_next(r->dups); dom != r->dups; + dom = dl_next(dom)) + if (strcmp(dom+len+1, part)== 0) + break; + if (dom == r->dups) { + char *newdom = dl_strndup( + r->value, len + 1 + strlen(part)); + strcat(strcat(newdom, "-"), part); + dl_add(r->dups, newdom); + dom = newdom; + } + pol_new(pol, r->name, dom, metadata); + } + } +} + +static struct pol_rule *config_rules = NULL; +static struct pol_rule **config_rules_end = NULL; +static int config_rules_has_path = 0; + +/* + * most policy comes from a set policy rules that are + * read from the config file. + * path_policy() gathers policy information for the + * disk described in the given a 'path' and a 'type'. 
+ */ +struct dev_policy *path_policy(char **paths, char *type) +{ + struct pol_rule *rules; + struct dev_policy *pol = NULL; + int i; + + rules = config_rules; + + while (rules) { + char *part = NULL; + if (rules->type == rule_policy) + if (pol_match(rules->rule, paths, type, NULL)) + pol_merge(&pol, rules->rule); + if (rules->type == rule_part && strcmp(type, type_part) == 0) + if (pol_match(rules->rule, paths, type_disk, &part)) + pol_merge_part(&pol, rules->rule, part); + rules = rules->next; + } + + /* Now add any metadata-specific internal knowledge + * about this path + */ + for (i=0; paths && paths[0] && superlist[i]; i++) + if (superlist[i]->get_disk_controller_domain) { + const char *d = + superlist[i]->get_disk_controller_domain( + paths[0]); + if (d) + pol_new(&pol, pol_domain, d, superlist[i]->name); + } + + pol_sort(&pol); + pol_dedup(pol); + return pol; +} + +void pol_add(struct dev_policy **pol, + char *name, char *val, + char *metadata) +{ + pol_new(pol, name, val, metadata); + pol_sort(pol); + pol_dedup(*pol); +} + +static void free_paths(char **paths) +{ + int i; + + if (!paths) + return; + + for (i = 0; paths[i]; i++) + free(paths[i]); + free(paths); +} + +/* + * disk_policy() gathers policy information for the + * disk described in the given mdinfo (disk.{major,minor}). + */ +struct dev_policy *disk_policy(struct mdinfo *disk) +{ + char **paths = NULL; + char *type = disk_type(disk); + struct dev_policy *pol = NULL; + + if (config_rules_has_path) + paths = disk_paths(disk); + + pol = path_policy(paths, type); + + free_paths(paths); + return pol; +} + +struct dev_policy *devid_policy(int dev) +{ + struct mdinfo disk; + disk.disk.major = major(dev); + disk.disk.minor = minor(dev); + return disk_policy(&disk); +} + +/* + * process policy rules read from config file. 
+ */ + +char rule_path[] = "path"; +char rule_type[] = "type"; + +char rule_policy[] = "policy"; +char rule_part[] = "part-policy"; + +char pol_metadata[] = "metadata"; +char pol_act[] = "action"; +char pol_domain[] = "domain"; +char pol_auto[] = "auto"; + +static int try_rule(char *w, char *name, struct rule **rp) +{ + struct rule *r; + int len = strlen(name); + if (strncmp(w, name, len) != 0 || + w[len] != '=') + return 0; + r = xmalloc(sizeof(*r)); + r->next = *rp; + r->name = name; + r->value = xstrdup(w+len+1); + r->dups = NULL; + *rp = r; + return 1; +} + +void policyline(char *line, char *type) +{ + struct pol_rule *pr; + char *w; + + if (config_rules_end == NULL) + config_rules_end = &config_rules; + + pr = xmalloc(sizeof(*pr)); + pr->type = type; + pr->rule = NULL; + for (w = dl_next(line); w != line ; w = dl_next(w)) { + if (try_rule(w, rule_path, &pr->rule)) + config_rules_has_path = 1; + else if (! try_rule(w, rule_type, &pr->rule) && + ! try_rule(w, pol_metadata, &pr->rule) && + ! try_rule(w, pol_act, &pr->rule) && + ! try_rule(w, pol_domain, &pr->rule) && + ! try_rule(w, pol_auto, &pr->rule)) + pr_err("policy rule %s unrecognised and ignored\n", + w); + } + pr->next = config_rules; + config_rules = pr; +} + +void policy_add(char *type, ...) 
+{ + va_list ap; + struct pol_rule *pr; + char *name, *val; + + pr = xmalloc(sizeof(*pr)); + pr->type = type; + pr->rule = NULL; + + va_start(ap, type); + while ((name = va_arg(ap, char*)) != NULL) { + struct rule *r; + + val = va_arg(ap, char*); + r = xmalloc(sizeof(*r)); + r->next = pr->rule; + r->name = name; + r->value = xstrdup(val); + r->dups = NULL; + pr->rule = r; + } + pr->next = config_rules; + config_rules = pr; + va_end(ap); +} + +void policy_free(void) +{ + while (config_rules) { + struct pol_rule *pr = config_rules; + struct rule *r; + + config_rules = config_rules->next; + + for (r = pr->rule; r; ) { + struct rule *next = r->next; + free(r->value); + if (r->dups) + free_line(r->dups); + free(r); + r = next; + } + free(pr); + } + config_rules_end = NULL; + config_rules_has_path = 0; +} + +void dev_policy_free(struct dev_policy *p) +{ + struct dev_policy *t; + while (p) { + t = p; + p = p->next; + free(t); + } +} + +static enum policy_action map_act(const char *act) +{ + if (strcmp(act, "include") == 0) + return act_include; + if (strcmp(act, "re-add") == 0) + return act_re_add; + if (strcmp(act, "spare") == 0) + return act_spare; + if (strcmp(act, "spare-same-slot") == 0) + return act_spare_same_slot; + if (strcmp(act, "force-spare") == 0) + return act_force_spare; + return act_err; +} + +static enum policy_action policy_action(struct dev_policy *plist, const char *metadata) +{ + enum policy_action rv = act_default; + struct dev_policy *p; + + plist = pol_find(plist, pol_act); + pol_for_each(p, plist, metadata) { + enum policy_action a = map_act(p->value); + if (a > rv) + rv = a; + } + return rv; +} + +int policy_action_allows(struct dev_policy *plist, const char *metadata, enum policy_action want) +{ + enum policy_action act = policy_action(plist, metadata); + + if (act == act_err) + return 0; + return (act >= want); +} + +int disk_action_allows(struct mdinfo *disk, const char *metadata, enum policy_action want) +{ + struct dev_policy *pol = 
disk_policy(disk); + int rv = policy_action_allows(pol, metadata, want); + + dev_policy_free(pol); + return rv; +} + +/* Domain policy: + * Any device can have a list of domains asserted by different policy + * statements. + * An array also has a list of domains comprising all the domains of + * all the devices in an array. + * Where an array has a spare-group, that becomes an addition domain for + * every device in the array and thus for the array. + * + * We keep the list of domains in a sorted linked list + * As dev policies are already sorted, this is fairly easy to manage. + */ + +static struct domainlist **domain_merge_one(struct domainlist **domp, + const char *domain) +{ + /* merge a domain name into a sorted list and return the + * location of the insertion or match + */ + struct domainlist *dom = *domp; + + while (dom && strcmp(dom->dom, domain) < 0) { + domp = &dom->next; + dom = *domp; + } + if (dom == NULL || strcmp(dom->dom, domain) != 0) { + dom = xmalloc(sizeof(*dom)); + dom->next = *domp; + dom->dom = domain; + *domp = dom; + } + return domp; +} + +#if (DEBUG) +void dump_policy(struct dev_policy *policy) +{ + while (policy) { + dprintf("policy: %p name: %s value: %s metadata: %s\n", + policy, + policy->name, + policy->value, + policy->metadata); + policy = policy->next; + } +} +#endif + +void domain_merge(struct domainlist **domp, struct dev_policy *pollist, + const char *metadata) +{ + /* Add to 'domp' all the domains in pol that apply to 'metadata' + * which are not already in domp + */ + struct dev_policy *pol; + pollist = pol_find(pollist, pol_domain); + pol_for_each(pol, pollist, metadata) + domain_merge_one(domp, pol->value); +} + +int domain_test(struct domainlist *dom, struct dev_policy *pol, + const char *metadata) +{ + /* Check that all domains in pol (for metadata) are also in + * dom. Both lists are sorted. 
+ * If pol has no domains, we don't really know about this device + * so we allow caller to choose: + * -1: has no domains + * 0: has domains, not all match + * 1: has domains, all match + */ + int found_any = -1; + int has_one_domain = 1; + struct dev_policy *p; + + pol = pol_find(pol, pol_domain); + pol_for_each(p, pol, metadata) { + found_any = 1; + while (dom && strcmp(dom->dom, p->value) < 0) + dom = dom->next; + if (!dom || strcmp(dom->dom, p->value) != 0) + return 0; + if (has_one_domain && metadata && strcmp(metadata, "imsm") == 0) + found_any = -1; + has_one_domain = 0; + } + return found_any; +} + +void domainlist_add_dev(struct domainlist **dom, int devid, const char *metadata) +{ + struct dev_policy *pol = devid_policy(devid); + domain_merge(dom, pol, metadata); + dev_policy_free(pol); +} + +struct domainlist *domain_from_array(struct mdinfo *mdi, const char *metadata) +{ + struct domainlist *domlist = NULL; + + if (!mdi) + return NULL; + for (mdi = mdi->devs ; mdi ; mdi = mdi->next) + domainlist_add_dev(&domlist, makedev(mdi->disk.major, + mdi->disk.minor), + metadata); + + return domlist; +} + +void domain_add(struct domainlist **domp, char *domain) +{ + domain_merge_one(domp, domain); +} + +void domain_free(struct domainlist *dl) +{ + while (dl) { + struct domainlist *head = dl; + dl = dl->next; + free(head); + } +} + +/* + * same-path policy. + * Some policy decisions are guided by knowledge of which + * array previously owned the device at a given physical location (path). + * When removing a device from an array we might record the array against + * the path, and when finding a new device, we might look for which + * array previously used that path. + * + * The 'array' is described by a map_ent, and the path by a the disk in an + * mdinfo, or a string. 
+ */ + +void policy_save_path(char *id_path, struct map_ent *array) +{ + char path[PATH_MAX]; + FILE *f = NULL; + + if (mkdir(FAILED_SLOTS_DIR, S_IRWXU) < 0 && errno != EEXIST) { + pr_err("can't create file to save path to old disk: %s\n", strerror(errno)); + return; + } + + snprintf(path, PATH_MAX, FAILED_SLOTS_DIR "/%s", id_path); + f = fopen(path, "w"); + if (!f) { + pr_err("can't create file to save path to old disk: %s\n", + strerror(errno)); + return; + } + + if (fprintf(f, "%20s %08x:%08x:%08x:%08x\n", + array->metadata, + array->uuid[0], array->uuid[1], + array->uuid[2], array->uuid[3]) <= 0) + pr_err("Failed to write to <id_path> cookie\n"); + + fclose(f); +} + +int policy_check_path(struct mdinfo *disk, struct map_ent *array) +{ + char path[PATH_MAX]; + FILE *f = NULL; + char **id_paths = disk_paths(disk); + int i; + int rv = 0; + + for (i = 0; id_paths[i]; i++) { + snprintf(path, PATH_MAX, FAILED_SLOTS_DIR "/%s", id_paths[i]); + f = fopen(path, "r"); + if (!f) + continue; + + rv = fscanf(f, " %20s %x:%x:%x:%x\n", + array->metadata, + array->uuid, + array->uuid+1, + array->uuid+2, + array->uuid+3); + fclose(f); + break; + } + free_paths(id_paths); + return rv == 5; +} + +/* invocation of udev rule file */ +char udev_template_start[] = +"# do not edit this file, it is automatically generated by mdadm\n" +"\n"; + +/* find rule named rule_type and return its value */ +char *find_rule(struct rule *rule, char *rule_type) +{ + while (rule) { + if (rule->name == rule_type) + return rule->value; + + rule = rule->next; + } + return NULL; +} + +#define UDEV_RULE_FORMAT \ +"ACTION==\"add\", SUBSYSTEM==\"block\", " \ +"ENV{DEVTYPE}==\"%s\", ENV{ID_PATH}==\"%s\", " \ +"RUN+=\"" BINDIR "/mdadm --incremental $env{DEVNAME}\"\n" + +#define UDEV_RULE_FORMAT_NOTYPE \ +"ACTION==\"add\", SUBSYSTEM==\"block\", " \ +"ENV{ID_PATH}==\"%s\", " \ +"RUN+=\"" BINDIR "/mdadm --incremental $env{DEVNAME}\"\n" + +/* Write rule in the rule file. 
Use format from UDEV_RULE_FORMAT */ +int write_rule(struct rule *rule, int fd, int force_part) +{ + char line[1024]; + char *pth = find_rule(rule, rule_path); + char *typ = find_rule(rule, rule_type); + if (!pth) + return -1; + + if (force_part) + typ = type_part; + if (typ) + snprintf(line, sizeof(line) - 1, UDEV_RULE_FORMAT, typ, pth); + else + snprintf(line, sizeof(line) - 1, UDEV_RULE_FORMAT_NOTYPE, pth); + return write(fd, line, strlen(line)) == (int)strlen(line); +} + +/* Generate single entry in udev rule basing on POLICY line found in config + * file. Take only those with paths, only first occurrence if paths are equal + * and if actions supports handling of spares (>=act_spare_same_slot) + */ +int generate_entries(int fd) +{ + struct pol_rule *loop, *dup; + char *loop_value, *dup_value; + int duplicate; + + for (loop = config_rules; loop; loop = loop->next) { + if (loop->type != rule_policy && loop->type != rule_part) + continue; + duplicate = 0; + + /* only policies with paths and with actions supporting + * bare disks are considered */ + loop_value = find_rule(loop->rule, pol_act); + if (!loop_value || map_act(loop_value) < act_spare_same_slot) + continue; + loop_value = find_rule(loop->rule, rule_path); + if (!loop_value) + continue; + for (dup = config_rules; dup != loop; dup = dup->next) { + if (dup->type != rule_policy && loop->type != rule_part) + continue; + dup_value = find_rule(dup->rule, pol_act); + if (!dup_value || map_act(dup_value) < act_spare_same_slot) + continue; + dup_value = find_rule(dup->rule, rule_path); + if (!dup_value) + continue; + if (strcmp(loop_value, dup_value) == 0) { + duplicate = 1; + break; + } + } + + /* not a dup or first occurrence */ + if (!duplicate) + if (!write_rule(loop->rule, fd, loop->type == rule_part) ) + return 0; + } + return 1; +} + +/* Write_rules routine creates dynamic udev rules used to handle + * hot-plug events for bare devices (and making them spares) + */ +int Write_rules(char *rule_name) +{ + int 
fd; + char udev_rule_file[PATH_MAX]; + + if (rule_name) { + strncpy(udev_rule_file, rule_name, sizeof(udev_rule_file) - 6); + udev_rule_file[sizeof(udev_rule_file) - 6] = '\0'; + strcat(udev_rule_file, ".temp"); + fd = creat(udev_rule_file, + S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); + if (fd == -1) + return 1; + } else + fd = 1; + + /* write static invocation */ + if (write(fd, udev_template_start, sizeof(udev_template_start) - 1) != + (int)sizeof(udev_template_start) - 1) + goto abort; + + /* iterate, if none created or error occurred, remove file */ + if (generate_entries(fd) < 0) + goto abort; + + fsync(fd); + if (rule_name) { + close(fd); + rename(udev_rule_file, rule_name); + } + return 0; +abort: + if (rule_name) { + close(fd); + unlink(udev_rule_file); + } + return 1; +} diff --git a/probe_roms.c b/probe_roms.c new file mode 100644 index 0000000..7ea04c7 --- /dev/null +++ b/probe_roms.c @@ -0,0 +1,331 @@ +/* + * probe_roms - scan for Adapter ROMS + * + * (based on linux-2.6:arch/x86/kernel/probe_roms_32.c) + * + * Copyright (C) 2008 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 
+ */ + +#include "probe_roms.h" +#include "mdadm.h" +#include <unistd.h> +#include <signal.h> +#include <fcntl.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <sys/types.h> +#include <asm/types.h> + +static void *rom_mem = MAP_FAILED; +static int rom_fd = -1; +static const int rom_len = 0xf0000 - 0xc0000; /* option-rom memory region */ +static int _sigbus; +static unsigned long rom_align; + +static void roms_deinit(void); +static int roms_init(void); + +static void sigbus(int sig) +{ + _sigbus = 1; +} + +static int probe_address8(const __u8 *ptr, __u8 *val) +{ + int rc = 0; + + *val = *ptr; + if (_sigbus) + rc = -1; + _sigbus = 0; + + return rc; +} + +static int probe_address16(const __u16 *ptr, __u16 *val) +{ + int rc = 0; + + *val = *ptr; + if (_sigbus) + rc = -1; + _sigbus = 0; + + return rc; +} + +void probe_roms_exit(void) +{ + signal(SIGBUS, SIG_DFL); + if (rom_fd >= 0) { + close(rom_fd); + rom_fd = -1; + } + if (rom_mem != MAP_FAILED) { + munmap(rom_mem, rom_len); + rom_mem = MAP_FAILED; + } + roms_deinit(); +} + +int probe_roms_init(unsigned long align) +{ + int fd = -1; + int rc = 0; + + /* valid values are 2048 and 512. 512 is for PCI-3.0 compliant + * systems, or systems that do not have dangerous/legacy ISA + * devices. 
2048 should always be safe + */ + if (align == 512 || align == 2048) + rom_align = align; + else + return -1; + + if (roms_init()) + return -1; + + if (signal(SIGBUS, sigbus) == SIG_ERR) + rc = -1; + if (rc == 0) { + fd = open("/dev/mem", O_RDONLY); + if (fd < 0) + rc = -1; + } + if (rc == 0) { + rom_mem = mmap(NULL, rom_len, PROT_READ, MAP_PRIVATE, fd, 0xc0000); + if (rom_mem == MAP_FAILED) + rc = -1; + } + + if (rc == 0) + rom_fd = fd; + else { + if (fd >= 0) + close(fd); + probe_roms_exit(); + } + return rc; +} + +/** + * isa_bus_to_virt - convert physical address to mmap'd region + * @addr - address to convert + * + * Only valid between a successful call to probe_roms_init and the + * corresponding probe_roms_exit + */ +static void *isa_bus_to_virt(unsigned long addr) +{ + return rom_mem + (addr - 0xc0000); +} + +struct resource { + unsigned long start; + unsigned long end; + unsigned long data; + const char *name; + struct resource *next; +}; + +static struct resource system_rom_resource = { + .name = "System ROM", + .start = 0xf0000, + .data = 0, + .end = 0xfffff, +}; + +static struct resource extension_rom_resource = { + .name = "Extension ROM", + .start = 0xe0000, + .data = 0, + .end = 0xeffff, +}; + +static struct resource *adapter_rom_resources; + +static struct resource video_rom_resource = { + .name = "Video ROM", + .start = 0xc0000, + .data = 0, + .end = 0xc7fff, +}; + +static int roms_init(void) +{ + adapter_rom_resources = malloc(sizeof(struct resource)); + if (adapter_rom_resources == NULL) + return 1; + adapter_rom_resources->name = "Adapter ROM"; + adapter_rom_resources->start = 0xc8000; + adapter_rom_resources->data = 0; + adapter_rom_resources->end = 0; + adapter_rom_resources->next = NULL; + return 0; +} + +static void roms_deinit(void) +{ + struct resource *res; + + res = adapter_rom_resources; + while (res) { + struct resource *tmp = res; + + res = res->next; + free(tmp); + } +} + +#define ROMSIGNATURE 0xaa55 + + +static int 
romsignature(const unsigned char *rom) +{ + /* nonzero when the 16-bit word at 'rom' can be read and equals ROMSIGNATURE */ const unsigned short * const ptr = (const unsigned short *)rom; + unsigned short sig = 0; + + return probe_address16(ptr, &sig) == 0 && sig == ROMSIGNATURE; +} + +static int romchecksum(const unsigned char *rom, unsigned long length) +{ + /* nonzero only when all 'length' bytes were readable and their sum, truncated to 8 bits, is zero */ unsigned char sum, c; + + for (sum = 0; length && probe_address8(rom++, &c) == 0; length--) + sum += c; + return !length && !sum; +} + +int scan_adapter_roms(scan_fn fn) +{ + /* let scan_fn examine each of the adapter roms found by probe_roms; + * scanning stops at the first entry whose start is 0 or as soon as fn + * returns nonzero. Returns the last value fn returned, or 0 when + * nothing was scanned (including when rom_fd < 0). */ + struct resource *res = adapter_rom_resources; + int found; + + if (rom_fd < 0) + return 0; + + found = 0; + while (res) { + if (res->start) { + found = fn(isa_bus_to_virt(res->start), + isa_bus_to_virt(res->end), + isa_bus_to_virt(res->data)); + if (found) + break; + } else + break; + res = res->next; + } + + return found; +} + +static unsigned long align(unsigned long addr, unsigned long alignment) +{ + /* round addr up to a multiple of alignment; the mask form presumably relies on alignment being a power of two - TODO confirm */ return (addr + alignment - 1) & ~(alignment - 1); +} + +void probe_roms(void) +{ + const void *rom; + unsigned long start, length, upper; + unsigned char c; + struct resource *res = adapter_rom_resources; + __u16 val=0; + + if (rom_fd < 0) + return; + + /* video rom */ + upper = res->start; + for (start = video_rom_resource.start; start < upper; start += rom_align) { + rom = isa_bus_to_virt(start); + if (!romsignature(rom)) + continue; + + video_rom_resource.start = start; + + if (probe_address8(rom + 2, &c) != 0) + continue; + + /* 0 < length <= 0x7f * 512, historically */ + length = c * 512; + + /* if checksum okay, trust length byte */ + if (length && romchecksum(rom, length)) + video_rom_resource.end = start + length - 1; + break; + } + + start = align(video_rom_resource.end + 1, rom_align); + if (start < upper) + start = upper; + + /* system rom */ + upper = system_rom_resource.start; + + /* check for extension rom (ignore length byte!) 
*/ + rom = isa_bus_to_virt(extension_rom_resource.start); + if (romsignature(rom)) { + length = extension_rom_resource.end - extension_rom_resource.start + 1; + if (romchecksum(rom, length)) + upper = extension_rom_resource.start; + } + + struct resource *prev_res = res; + /* check for adapter roms on 2k boundaries */ + for (; start < upper; start += rom_align) { + rom = isa_bus_to_virt(start); + if (!romsignature(rom)) + continue; + + if (probe_address8(rom + 2, &c) != 0) + continue; + + /* 0 < length <= 0x7f * 512, historically */ + length = c * 512; + + /* Retrieve 16-bit pointer to PCI Data Structure (offset 18h-19h) + * The data can be within 64KB forward of the first location + * of this code image. The pointer is in little-endian order + */ + + if (probe_address16(rom + 0x18, &val) != 0) + continue; + val = __le16_to_cpu(val); + + /* but accept any length that fits if checksum okay */ + if (!length || start + length > upper || !romchecksum(rom, length)) + continue; + + if (res == NULL) { + res = calloc(1, sizeof(struct resource)); + if (res == NULL) + return; + prev_res->next = res; + } + + res->start = start; + res->data = start + (unsigned long)val; + res->end = start + length - 1; + + start = res->end & ~(rom_align - 1); + prev_res = res; + res = res->next; + } +} diff --git a/probe_roms.h b/probe_roms.h new file mode 100644 index 0000000..6d70411 --- /dev/null +++ b/probe_roms.h @@ -0,0 +1,24 @@ +/* + * probe_roms - scan for Adapter ROMS + * + * Copyright (C) 2008 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +void probe_roms_exit(void); +int probe_roms_init(unsigned long align); +typedef int (*scan_fn)(const void *start, const void *end, const void *data); +int scan_adapter_roms(scan_fn fn); +void probe_roms(void); @@ -0,0 +1,17 @@ + +/* + * We cannot link a static binary with passwd/group support, so + * just do without + */ +#include <stdlib.h> +#include <pwd.h> +#include <grp.h> + +struct passwd *getpwnam(const char *name) +{ + return NULL; +} +struct group *getgrnam(const char *name) +{ + return NULL; +} diff --git a/raid5extend.c b/raid5extend.c new file mode 100644 index 0000000..d8e62c2 --- /dev/null +++ b/raid5extend.c @@ -0,0 +1,80 @@ + +int phys2log(int phys, int stripe, int n, int layout) +{ + /* In an 'n' disk array using 'layout', + * in stripe 'stripe', the physical disc 'phys' + * stores what logical chunk? + * -1 mean parity. 
+ * + */ + switch(layout) { + case ALGORITHM_LEFT_ASYMMETRIC: + pd = (n-1) - (stripe % n); + if (phys < pd) + return phys; + else if (phys == pd) + return -1; + else return phys-1; + + case ALGORITHM_RIGHT_ASYMMETRIC: + pd = stripe % n; + if (phys < pd) + return phys; + else if (phys == pd) + return -1; + else return phys-1; + + case ALGORITHM_LEFT_SYMMETRIC: + pd = (n-1) - (stripe %n); + if (phys < pd) + return phys+ n-1-pd; + else if (phys == pd) + return -1; + else return phys-pd-1; + + case ALGORITHM_RIGHT_SYMMETRIC: + pd = stripe % n; + if (phys < pd) + return phys+ n-1-pd; + else if (phys == pd) + return -1; + else return phys-pd-1; + } + return -2; +} + +raid5_extend(unsigned long len, int chunksize, int layout, int n, int m, int rfds[], int wfds[]) +{ + + static char buf[4096]; + + unsigned long blocks = len/4; + unsigned int blocksperchunk= chunksize/4096; + + unsigned long b; + + for (b=0; b<blocks; b++) { + unsigned long stripe = b / blocksperchunk; + unsigned int offset = b - (stripe*blocksperchunk); + unsigned long chunk = stripe * (n-1); + int src; + for (src=0; src<n; src++) { + int dnum, snum; + if (read(rfds[src], buf, sizeof(buf)) != sizeof(buf)) { + error(); + return 0; + } + + snum = phys2log(src, stripe, n, layout); + + if (snum == -1) + continue; + chunk = stripe*(n-1)+snum; + + dstripe = chunk/(m-1); + dnum = log2phys(chunk-(stripe*(m-1)), dstripe, m, layout); + llseek(wfds[dnum], dstripe*chunksize+(offset*4096), 0); + write(wfds[dnum], buf, sizeof(buf)); + } + } +} diff --git a/raid6check.8 b/raid6check.8 new file mode 100644 index 0000000..8999ca8 --- /dev/null +++ b/raid6check.8 @@ -0,0 +1,96 @@ +.\" -*- nroff -*- +.\" Copyright Piergiorgio Sartor and others. +.\" This program is free software; you can redistribute it and/or modify +.\" it under the terms of the GNU General Public License as published by +.\" the Free Software Foundation; either version 2 of the License, or +.\" (at your option) any later version. 
+.\" See file COPYING in distribution for details. +.TH RAID6CHECK 8 "" v1.0.0 +.SH NAME +raid6check \- check MD RAID6 device for errors +.I aka +Linux Software RAID + +.SH SYNOPSIS + +.BI raid6check " <raid6 device> <start stripe> <number of stripes>" + +.SH DESCRIPTION +RAID6 devices in which one single component drive has errors can use +the double parity in order to find out which component drive is affected. +The "raid6check" tool checks, for each stripe, the double parity +consistency, reports mismatches and, if possible, which +component drive has the mismatch. +Since it works at stripe level, it can report different drives with +mismatches at different stripes. + +"raid6check" requires a non-degraded RAID6 MD device as first +parameter, a starting stripe (usually 0) and the number of stripes +to be checked. +If this third parameter is also 0, it will check the array up to +the end. + +"raid6check" will start printing information about the RAID6, then +for each stripe, it will report the parity rotation status. +In case of parity mismatches, "raid6check" reports, if possible, +which component drive could be responsible. Otherwise it reports +that it is not possible to find the component drive. + +If the given MD device is not a RAID6, "raid6check" will, of +course, not continue. + +If the RAID6 MD device is degraded, "raid6check" will report +an error and it will not proceed further. + +No write operations are performed on the array or the components. +Furthermore, the checked array can be online and in use during +the operation of "raid6check". + +.SH EXAMPLES + +.B " raid6check /dev/md0 0 0" +.br +This will check /dev/md0 from start to end. + +.B " raid6check /dev/md3 0 1" +.br +This will check the first stripe of /dev/md3. + +.B " raid6check /dev/md1 1000 0" +.br +This will check /dev/md1 from stripe 1000 up to the end. + +.B " raid6check /dev/md127 128 256" +.br +This will check 256 stripes of /dev/md127 starting from stripe 128. 
+ +.B " raid6check /dev/md0 0 0 | grep -i error > md0_err.log" +.br +This will check /dev/md0 completely and create a log file only +with errors, if any. + +.SH FILES + +"raid6check" directly uses the component drives as found in /dev. +Furthermore, the sysfs interface is needed in order to find out +the RAID6 parameters. + +.SH BUGS +Negative parameters can lead to unexpected results. + +It is not clear what will happen if the RAID6 MD device gets +degraded during the check. + +.PP +The latest version of +.I raid6check +should always be available from +.IP +.B https://www.kernel.org/pub/linux/utils/raid/mdadm/ +.PP +Related man pages: +.PP +.IR mdadm (8), +.IR mdmon (8), +.IR mdadm.conf (5), +.IR md (4). diff --git a/raid6check.c b/raid6check.c new file mode 100644 index 0000000..a8e6005 --- /dev/null +++ b/raid6check.c @@ -0,0 +1,714 @@ +/* + * raid6check - extended consistency check for RAID-6 + * + * Copyright (C) 2011 Piergiorgio Sartor + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Piergiorgio Sartor + * Based on "restripe.c" from "mdadm" codebase + */ + +#include "mdadm.h" +#include <stdint.h> +#include <signal.h> +#include <sys/mman.h> + +#define CHECK_PAGE_BITS (12) +#define CHECK_PAGE_SIZE (1 << CHECK_PAGE_BITS) + +char const Name[] = "raid6check"; + +enum repair { + NO_REPAIR = 0, + MANUAL_REPAIR, + AUTO_REPAIR +}; + +int geo_map(int block, unsigned long long stripe, int raid_disks, + int level, int layout); +int is_ddf(int layout); +void qsyndrome(uint8_t *p, uint8_t *q, uint8_t **sources, int disks, int size); +void make_tables(void); +void ensure_zero_has_size(int chunk_size); +void raid6_datap_recov(int disks, size_t bytes, int faila, uint8_t **ptrs, + int neg_offset); +void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, + uint8_t **ptrs, int neg_offset); +void xor_blocks(char *target, char **sources, int disks, int size); + +/* Collect per stripe consistency information */ +void raid6_collect(int chunk_size, uint8_t *p, uint8_t *q, + char *chunkP, char *chunkQ, int *results) +{ + int i; + int data_id; + uint8_t Px, Qx; + extern uint8_t raid6_gflog[]; + + for(i = 0; i < chunk_size; i++) { + Px = (uint8_t)chunkP[i] ^ (uint8_t)p[i]; + Qx = (uint8_t)chunkQ[i] ^ (uint8_t)q[i]; + + if((Px != 0) && (Qx == 0)) + results[i] = -1; + + if((Px == 0) && (Qx != 0)) + results[i] = -2; + + if((Px != 0) && (Qx != 0)) { + data_id = (raid6_gflog[Qx] - raid6_gflog[Px]); + if(data_id < 0) data_id += 255; + results[i] = data_id; + } + + if((Px == 0) && (Qx == 0)) + results[i] = -255; + } +} + +/* Try to find out if a specific disk has problems in a CHECK_PAGE_SIZE page size */ +int raid6_stats_blk(int *results, int raid_disks) +{ + int i; + int curr_broken_disk = -255; + int prev_broken_disk = -255; + int 
broken_status = 0; + + for(i = 0; i < CHECK_PAGE_SIZE; i++) { + + if(results[i] != -255) + curr_broken_disk = results[i]; + + if(curr_broken_disk >= raid_disks) + broken_status = 2; + + switch(broken_status) { + case 0: + if(curr_broken_disk != -255) { + prev_broken_disk = curr_broken_disk; + broken_status = 1; + } + break; + + case 1: + if(curr_broken_disk != prev_broken_disk) + broken_status = 2; + break; + + case 2: + default: + curr_broken_disk = prev_broken_disk = -65535; + break; + } + } + + return curr_broken_disk; +} + +/* Collect disks status for a strip in CHECK_PAGE_SIZE page size blocks */ +void raid6_stats(int *disk, int *results, int raid_disks, int chunk_size) +{ + int i, j; + + for(i = 0, j = 0; i < chunk_size; i += CHECK_PAGE_SIZE, j++) { + disk[j] = raid6_stats_blk(&results[i], raid_disks); + } +} + +int lock_stripe(struct mdinfo *info, unsigned long long start, + int chunk_size, int data_disks, sighandler_t *sig) { + int rv; + if(mlockall(MCL_CURRENT | MCL_FUTURE) != 0) { + return 2; + } + + sig[0] = signal(SIGTERM, SIG_IGN); + sig[1] = signal(SIGINT, SIG_IGN); + sig[2] = signal(SIGQUIT, SIG_IGN); + + rv = sysfs_set_num(info, NULL, "suspend_lo", start * chunk_size * data_disks); + rv |= sysfs_set_num(info, NULL, "suspend_hi", (start + 1) * chunk_size * data_disks); + return rv * 256; +} + +int unlock_all_stripes(struct mdinfo *info, sighandler_t *sig) { + int rv; + rv = sysfs_set_num(info, NULL, "suspend_lo", 0x7FFFFFFFFFFFFFFFULL); + rv |= sysfs_set_num(info, NULL, "suspend_hi", 0); + rv |= sysfs_set_num(info, NULL, "suspend_lo", 0); + + signal(SIGQUIT, sig[2]); + signal(SIGINT, sig[1]); + signal(SIGTERM, sig[0]); + + if(munlockall() != 0) + return 3; + return rv * 256; +} + +/* Autorepair */ +int autorepair(int *disk, unsigned long long start, int chunk_size, + char *name[], int raid_disks, int syndrome_disks, char **blocks_page, + char **blocks, uint8_t *p, int *block_index_for_slot, + int *source, unsigned long long *offsets) +{ + int i, j; + 
int pages_to_write_count = 0; + int page_to_write[chunk_size >> CHECK_PAGE_BITS]; + /* Pass 1: for every page whose failed slot is known (disk[j] >= -2) and + * maps to a real device, rebuild that page in memory: Q is recomputed with + * qsyndrome(), P or a data block is rebuilt by xor of the others. */ + for(j = 0; j < (chunk_size >> CHECK_PAGE_BITS); j++) { + if (disk[j] >= -2 && block_index_for_slot[disk[j]] >= 0) { + int slot = block_index_for_slot[disk[j]]; + printf("Auto-repairing slot %d (%s)\n", slot, name[slot]); + pages_to_write_count++; + page_to_write[j] = 1; + for(i = -2; i < syndrome_disks; i++) { + blocks_page[i] = blocks[i] + j * CHECK_PAGE_SIZE; + } + if (disk[j] == -2) { + qsyndrome(p, (uint8_t*)blocks_page[-2], + (uint8_t**)blocks_page, + syndrome_disks, CHECK_PAGE_SIZE); + } + else { + char *all_but_failed_blocks[syndrome_disks]; + for(i = 0; i < syndrome_disks; i++) { + if (i == disk[j]) + all_but_failed_blocks[i] = blocks_page[-1]; + else + all_but_failed_blocks[i] = blocks_page[i]; + } + xor_blocks(blocks_page[disk[j]], + all_but_failed_blocks, syndrome_disks, + CHECK_PAGE_SIZE); + } + } + else { + page_to_write[j] = 0; + } + } + + /* Pass 2: write the rebuilt pages back to their component devices */ + if(pages_to_write_count > 0) { + int write_res = 0; + for(j = 0; j < (chunk_size >> CHECK_PAGE_BITS); j++) { + if(page_to_write[j] == 1) { + int slot = block_index_for_slot[disk[j]]; + /* never write at an unknown file position: a failed seek + * would put the repaired page at the wrong offset */ + if (lseek64(source[slot], offsets[slot] + start * chunk_size + j * CHECK_PAGE_SIZE, SEEK_SET) < 0) { + fprintf(stderr, "lseek failed for slot %d\n", slot); + return -1; + } + write_res += write(source[slot], + blocks[disk[j]] + j * CHECK_PAGE_SIZE, + CHECK_PAGE_SIZE); + } + } + + if (write_res != (CHECK_PAGE_SIZE * pages_to_write_count)) { + fprintf(stderr, "Failed to write a full chunk.\n"); + return -1; + } + } + + return 0; +} + +/* Manual repair */ +int manual_repair(int chunk_size, int syndrome_disks, + int failed_slot1, int failed_slot2, + unsigned long long start, int *block_index_for_slot, + char *name[], char **stripes, char **blocks, uint8_t *p, + int *source, unsigned long long *offsets) +{ + int i; + int fd1 = block_index_for_slot[failed_slot1]; + int fd2 = block_index_for_slot[failed_slot2]; + printf("Repairing stripe %llu\n", start); + printf("Assuming slots %d (%s) and %d (%s) are incorrect\n", + fd1, name[fd1], + 
fd2, name[fd2]); + + if (failed_slot1 == -2 || failed_slot2 == -2) { + char *all_but_failed_blocks[syndrome_disks]; + int failed_data_or_p; + + if (failed_slot1 == -2) + failed_data_or_p = failed_slot2; + else + failed_data_or_p = failed_slot1; + + printf("Repairing D/P(%d) and Q\n", failed_data_or_p); + + for (i = 0; i < syndrome_disks; i++) { + if (i == failed_data_or_p) + all_but_failed_blocks[i] = blocks[-1]; + else + all_but_failed_blocks[i] = blocks[i]; + } + xor_blocks(blocks[failed_data_or_p], + all_but_failed_blocks, syndrome_disks, chunk_size); + qsyndrome(p, (uint8_t*)blocks[-2], (uint8_t**)blocks, + syndrome_disks, chunk_size); + } else { + ensure_zero_has_size(chunk_size); + if (failed_slot1 == -1 || failed_slot2 == -1) { + int failed_data; + if (failed_slot1 == -1) + failed_data = failed_slot2; + else + failed_data = failed_slot1; + + printf("Repairing D(%d) and P\n", failed_data); + raid6_datap_recov(syndrome_disks+2, chunk_size, + failed_data, (uint8_t**)blocks, 1); + } else { + printf("Repairing D and D\n"); + raid6_2data_recov(syndrome_disks+2, chunk_size, + failed_slot1, failed_slot2, + (uint8_t**)blocks, 1); + } + } + + int write_res1, write_res2; + off64_t seek_res; + + seek_res = lseek64(source[fd1], + offsets[fd1] + start * chunk_size, SEEK_SET); + if (seek_res < 0) { + fprintf(stderr, "lseek failed for failed_disk1\n"); + return -1; + } + write_res1 = write(source[fd1], blocks[failed_slot1], chunk_size); + + seek_res = lseek64(source[fd2], + offsets[fd2] + start * chunk_size, SEEK_SET); + if (seek_res < 0) { + fprintf(stderr, "lseek failed for failed_disk2\n"); + return -1; + } + write_res2 = write(source[fd2], blocks[failed_slot2], chunk_size); + + if (write_res1 != chunk_size || write_res2 != chunk_size) { + fprintf(stderr, "Failed to write a complete chunk.\n"); + return -2; + } + + return 0; +} + +int check_stripes(struct mdinfo *info, int *source, unsigned long long *offsets, + int raid_disks, int chunk_size, int level, int layout, + 
unsigned long long start, unsigned long long length, char *name[], + enum repair repair, int failed_disk1, int failed_disk2) +{ + /* read the data and p and q blocks, and check we got them right */ + int data_disks = raid_disks - 2; + int syndrome_disks = data_disks + is_ddf(layout) * 2; + char *stripe_buf; + + /* stripes[] is indexed by raid_disk and holds chunks from each device */ + char **stripes = xmalloc(raid_disks * sizeof(char*)); + + /* blocks[] is indexed by syndrome number and points to either one of the + * chunks from 'stripes[]', or to a chunk of zeros. -1 and -2 are + * P and Q */ + char **blocks = xmalloc((syndrome_disks + 2) * sizeof(char*)); + + /* blocks_page[] is a temporary index to just one page of the chunks + * that blocks[] points to. */ + char **blocks_page = xmalloc((syndrome_disks + 2) * sizeof(char*)); + + /* block_index_for_slot[] provides the reverse mapping from blocks to stripes. + * The index is a syndrome position, the content is a raid_disk number. + * indicies -1 and -2 work, and are P and Q disks */ + int *block_index_for_slot = xmalloc((syndrome_disks+2) * sizeof(int)); + + /* 'p' and 'q' contain calcualted P and Q, to be compared with + * blocks[-1] and blocks[-2]; + */ + uint8_t *p = xmalloc(chunk_size); + uint8_t *q = xmalloc(chunk_size); + char *zero = xmalloc(chunk_size); + int *results = xmalloc(chunk_size * sizeof(int)); + sighandler_t *sig = xmalloc(3 * sizeof(sighandler_t)); + + int i, j; + int diskP, diskQ, diskD; + int err = 0; + + extern int tables_ready; + + if (!tables_ready) + make_tables(); + + if (posix_memalign((void**)&stripe_buf, 4096, raid_disks * chunk_size) != 0) + exit(4); + block_index_for_slot += 2; + blocks += 2; + blocks_page += 2; + + memset(zero, 0, chunk_size); + for ( i = 0 ; i < raid_disks ; i++) + stripes[i] = stripe_buf + i * chunk_size; + + while (length > 0) { + /* The syndrome number of the broken disk is recorded + * in 'disk[]' which allows a different broken disk for + * each page. 
+ */ + int disk[chunk_size >> CHECK_PAGE_BITS]; + + err = lock_stripe(info, start, chunk_size, data_disks, sig); + if(err != 0) { + if (err != 2) + unlock_all_stripes(info, sig); + goto exitCheck; + } + for (i = 0 ; i < raid_disks ; i++) { + off64_t seek_res = lseek64(source[i], offsets[i] + start * chunk_size, + SEEK_SET); + if (seek_res < 0) { + fprintf(stderr, "lseek to source %d failed\n", i); + unlock_all_stripes(info, sig); + err = -1; + goto exitCheck; + } + int read_res = read(source[i], stripes[i], chunk_size); + if (read_res < chunk_size) { + fprintf(stderr, "Failed to read complete chunk disk %d, aborting\n", i); + unlock_all_stripes(info, sig); + err = -1; + goto exitCheck; + } + } + + diskP = geo_map(-1, start, raid_disks, level, layout); + block_index_for_slot[-1] = diskP; + blocks[-1] = stripes[diskP]; + + diskQ = geo_map(-2, start, raid_disks, level, layout); + block_index_for_slot[-2] = diskQ; + blocks[-2] = stripes[diskQ]; + + if (!is_ddf(layout)) { + /* The syndrome-order of disks starts immediately after 'Q', + * but skips P */ + diskD = diskQ; + for (i = 0 ; i < data_disks ; i++) { + diskD = diskD + 1; + if (diskD >= raid_disks) + diskD = 0; + if (diskD == diskP) + diskD += 1; + if (diskD >= raid_disks) + diskD = 0; + blocks[i] = stripes[diskD]; + block_index_for_slot[i] = diskD; + } + } else { + /* The syndrome-order exactly follows raid-disk + * numbers, with ZERO in place of P and Q + */ + for (i = 0 ; i < raid_disks; i++) { + if (i == diskP || i == diskQ) { + blocks[i] = zero; + block_index_for_slot[i] = -1; + } else { + blocks[i] = stripes[i]; + block_index_for_slot[i] = i; + } + } + } + + qsyndrome(p, q, (uint8_t**)blocks, syndrome_disks, chunk_size); + + raid6_collect(chunk_size, p, q, stripes[diskP], stripes[diskQ], results); + raid6_stats(disk, results, raid_disks, chunk_size); + + for(j = 0; j < (chunk_size >> CHECK_PAGE_BITS); j++) { + int role = disk[j]; + if (role >= -2) { + int slot = block_index_for_slot[role]; + if (slot >= 0) + 
printf("Error detected at stripe %llu, page %d: possible failed disk slot %d: %d --> %s\n", + start, j, role, slot, name[slot]); + else + printf("Error detected at stripe %llu, page %d: failed slot %d should be zeros\n", + start, j, role); + } else if(disk[j] == -65535) { + printf("Error detected at stripe %llu, page %d: disk slot unknown\n", start, j); + } + } + + if(repair == AUTO_REPAIR) { + err = autorepair(disk, start, chunk_size, + name, raid_disks, syndrome_disks, blocks_page, + blocks, p, block_index_for_slot, + source, offsets); + if(err != 0) { + unlock_all_stripes(info, sig); + goto exitCheck; + } + } + + if(repair == MANUAL_REPAIR) { + int failed_slot1 = -1, failed_slot2 = -1; + for (i = -2; i < syndrome_disks; i++) { + if (block_index_for_slot[i] == failed_disk1) + failed_slot1 = i; + if (block_index_for_slot[i] == failed_disk2) + failed_slot2 = i; + } + err = manual_repair(chunk_size, syndrome_disks, + failed_slot1, failed_slot2, + start, block_index_for_slot, + name, stripes, blocks, p, + source, offsets); + if(err == -1) { + unlock_all_stripes(info, sig); + goto exitCheck; + } + } + + err = unlock_all_stripes(info, sig); + if(err != 0) { + goto exitCheck; + } + + length--; + start++; + } + +exitCheck: + + free(stripe_buf); + free(stripes); + free(blocks-2); + free(blocks_page-2); + free(block_index_for_slot-2); + free(p); + free(q); + free(results); + free(sig); + + return err; +} + +unsigned long long getnum(char *str, char **err) +{ + char *e; + unsigned long long rv = strtoull(str, &e, 10); + if (e==str || *e) { + *err = str; + return 0; + } + return rv; +} + +int main(int argc, char *argv[]) +{ + /* md_device start length */ + int *fds = NULL; + char *buf = NULL; + char **disk_name = NULL; + unsigned long long *offsets = NULL; + int raid_disks = 0; + int active_disks; + int chunk_size = 0; + int layout = -1; + int level = 6; + enum repair repair = NO_REPAIR; + int failed_disk1 = -1; + int failed_disk2 = -1; + unsigned long long start, length; + 
int i; + int mdfd; + struct mdinfo *info = NULL, *comp = NULL; + char *err = NULL; + int exit_err = 0; + int close_flag = 0; + char *prg = strrchr(argv[0], '/'); + + if (prg == NULL) + prg = argv[0]; + else + prg++; + + if (argc < 4) { + fprintf(stderr, "Usage: %s md_device start_stripe length_stripes [autorepair]\n", prg); + fprintf(stderr, " or: %s md_device repair stripe failed_slot_1 failed_slot_2\n", prg); + exit_err = 1; + goto exitHere; + } + + mdfd = open(argv[1], O_RDONLY); + if(mdfd < 0) { + perror(argv[1]); + fprintf(stderr, "%s: cannot open %s\n", prg, argv[1]); + exit_err = 2; + goto exitHere; + } + + info = sysfs_read(mdfd, NULL, + GET_LEVEL| + GET_LAYOUT| + GET_DISKS| + GET_STATE | + GET_COMPONENT| + GET_CHUNK| + GET_DEVS| + GET_OFFSET| + GET_SIZE); + + if(info == NULL) { + fprintf(stderr, "%s: Error reading sysfs information of %s\n", prg, argv[1]); + exit_err = 9; + goto exitHere; + } + + if(info->array.level != level) { + fprintf(stderr, "%s: %s not a RAID-6\n", prg, argv[1]); + exit_err = 3; + goto exitHere; + } + + if(info->array.failed_disks > 0) { + fprintf(stderr, "%s: %s degraded array\n", prg, argv[1]); + exit_err = 8; + goto exitHere; + } + + printf("layout: %d\n", info->array.layout); + printf("disks: %d\n", info->array.raid_disks); + printf("component size: %llu\n", info->component_size * 512); + printf("total stripes: %llu\n", (info->component_size * 512) / info->array.chunk_size); + printf("chunk size: %d\n", info->array.chunk_size); + printf("\n"); + + comp = info->devs; + for(i = 0, active_disks = 0; active_disks < info->array.raid_disks; i++) { + printf("disk: %d - offset: %llu - size: %llu - name: %s - slot: %d\n", + i, comp->data_offset * 512, comp->component_size * 512, + map_dev(comp->disk.major, comp->disk.minor, 0), + comp->disk.raid_disk); + if(comp->disk.raid_disk >= 0) + active_disks++; + comp = comp->next; + } + printf("\n"); + + close(mdfd); + + raid_disks = info->array.raid_disks; + chunk_size = info->array.chunk_size; + 
layout = info->array.layout; + if (strcmp(argv[2], "repair")==0) { + if (argc < 6) { + fprintf(stderr, "For repair mode, call %s md_device repair stripe failed_slot_1 failed_slot_2\n", prg); + exit_err = 1; + goto exitHere; + } + repair = MANUAL_REPAIR; + start = getnum(argv[3], &err); + length = 1; + failed_disk1 = getnum(argv[4], &err); + failed_disk2 = getnum(argv[5], &err); + + if(failed_disk1 >= info->array.raid_disks) { + fprintf(stderr, "%s: failed_slot_1 index is higher than number of devices in raid\n", prg); + exit_err = 4; + goto exitHere; + } + if(failed_disk2 >= info->array.raid_disks) { + fprintf(stderr, "%s: failed_slot_2 index is higher than number of devices in raid\n", prg); + exit_err = 4; + goto exitHere; + } + if(failed_disk1 == failed_disk2) { + fprintf(stderr, "%s: failed_slot_1 and failed_slot_2 are the same\n", prg); + exit_err = 4; + goto exitHere; + } + } + else { + start = getnum(argv[2], &err); + length = getnum(argv[3], &err); + if (argc >= 5 && strcmp(argv[4], "autorepair")==0) + repair = AUTO_REPAIR; + } + + if (err) { + fprintf(stderr, "%s: Bad number: %s\n", prg, err); + exit_err = 4; + goto exitHere; + } + + if(start > ((info->component_size * 512) / chunk_size)) { + start = (info->component_size * 512) / chunk_size; + fprintf(stderr, "%s: start beyond disks size\n", prg); + } + + if((length == 0) || + ((length + start) > ((info->component_size * 512) / chunk_size))) { + length = (info->component_size * 512) / chunk_size - start; + } + + disk_name = xmalloc(raid_disks * sizeof(*disk_name)); + fds = xmalloc(raid_disks * sizeof(*fds)); + offsets = xcalloc(raid_disks, sizeof(*offsets)); + buf = xmalloc(raid_disks * chunk_size); + + for(i=0; i<raid_disks; i++) { + fds[i] = -1; + } + close_flag = 1; + + comp = info->devs; + for (i=0, active_disks=0; active_disks<raid_disks; i++) { + int disk_slot = comp->disk.raid_disk; + if(disk_slot >= 0) { + disk_name[disk_slot] = map_dev(comp->disk.major, comp->disk.minor, 0); + offsets[disk_slot] 
= comp->data_offset * 512; + fds[disk_slot] = open(disk_name[disk_slot], O_RDWR | O_DIRECT); + if (fds[disk_slot] < 0) { + perror(disk_name[disk_slot]); + fprintf(stderr,"%s: cannot open %s\n", prg, disk_name[disk_slot]); + exit_err = 6; + goto exitHere; + } + active_disks++; + } + comp = comp->next; + } + + int rv = check_stripes(info, fds, offsets, + raid_disks, chunk_size, level, layout, + start, length, disk_name, repair, failed_disk1, failed_disk2); + if (rv != 0) { + fprintf(stderr, "%s: check_stripes returned %d\n", prg, rv); + exit_err = 7; + goto exitHere; + } + +exitHere: + + if (close_flag) + for(i = 0; i < raid_disks; i++) + close(fds[i]); + + free(disk_name); + free(fds); + free(offsets); + free(buf); + + exit(exit_err); +} diff --git a/restripe.c b/restripe.c new file mode 100644 index 0000000..a7a7229 --- /dev/null +++ b/restripe.c @@ -0,0 +1,1038 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2006-2009 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +#include "mdadm.h" +#include <stdint.h> + +/* To restripe, we read from old geometry to a buffer, and + * read from buffer to new geometry. 
+ * When reading, we might have missing devices and so could need + * to reconstruct. + * When writing, we need to create correct parity and Q. + * + */ + +int geo_map(int block, unsigned long long stripe, int raid_disks, + int level, int layout) +{ + /* On the given stripe, find which disk in the array will have + * block numbered 'block'. + * '-1' means the parity block. + * '-2' means the Q syndrome. + */ + int pd; + + /* layout is not relevant for raid0 and raid4 */ + if ((level == 0) || + (level == 4)) + layout = 0; + + switch(level*100 + layout) { + case 000: + case 400: + case 500 + ALGORITHM_PARITY_N: + /* raid 4 isn't messed around by parity blocks */ + if (block == -1) + return raid_disks-1; /* parity block */ + return block; + case 500 + ALGORITHM_LEFT_ASYMMETRIC: + pd = (raid_disks-1) - stripe % raid_disks; + if (block == -1) + return pd; + if (block >= pd) + block++; + return block; + + case 500 + ALGORITHM_RIGHT_ASYMMETRIC: + pd = stripe % raid_disks; + if (block == -1) + return pd; + if (block >= pd) + block++; + return block; + + case 500 + ALGORITHM_LEFT_SYMMETRIC: + pd = (raid_disks - 1) - stripe % raid_disks; + if (block == -1) + return pd; + return (pd + 1 + block) % raid_disks; + + case 500 + ALGORITHM_RIGHT_SYMMETRIC: + pd = stripe % raid_disks; + if (block == -1) + return pd; + return (pd + 1 + block) % raid_disks; + + case 500 + ALGORITHM_PARITY_0: + return block + 1; + + case 600 + ALGORITHM_PARITY_N_6: + if (block == -2) + return raid_disks - 1; + if (block == -1) + return raid_disks - 2; /* parity block */ + return block; + case 600 + ALGORITHM_LEFT_ASYMMETRIC_6: + if (block == -2) + return raid_disks - 1; + raid_disks--; + pd = (raid_disks-1) - stripe % raid_disks; + if (block == -1) + return pd; + if (block >= pd) + block++; + return block; + + case 600 + ALGORITHM_RIGHT_ASYMMETRIC_6: + if (block == -2) + return raid_disks - 1; + raid_disks--; + pd = stripe % raid_disks; + if (block == -1) + return pd; + if (block >= pd) + block++; + 
return block; + + case 600 + ALGORITHM_LEFT_SYMMETRIC_6: + if (block == -2) + return raid_disks - 1; + raid_disks--; + pd = (raid_disks - 1) - stripe % raid_disks; + if (block == -1) + return pd; + return (pd + 1 + block) % raid_disks; + + case 600 + ALGORITHM_RIGHT_SYMMETRIC_6: + if (block == -2) + return raid_disks - 1; + raid_disks--; + pd = stripe % raid_disks; + if (block == -1) + return pd; + return (pd + 1 + block) % raid_disks; + + case 600 + ALGORITHM_PARITY_0_6: + if (block == -2) + return raid_disks - 1; + return block + 1; + + case 600 + ALGORITHM_PARITY_0: + if (block == -1) + return 0; + if (block == -2) + return 1; + return block + 2; + + case 600 + ALGORITHM_LEFT_ASYMMETRIC: + pd = raid_disks - 1 - (stripe % raid_disks); + if (block == -1) + return pd; + if (block == -2) + return (pd+1) % raid_disks; + if (pd == raid_disks - 1) + return block+1; + if (block >= pd) + return block+2; + return block; + + case 600 + ALGORITHM_ROTATING_ZERO_RESTART: + /* Different order for calculating Q, otherwize same as ... 
*/ + case 600 + ALGORITHM_RIGHT_ASYMMETRIC: + pd = stripe % raid_disks; + if (block == -1) + return pd; + if (block == -2) + return (pd+1) % raid_disks; + if (pd == raid_disks - 1) + return block+1; + if (block >= pd) + return block+2; + return block; + + case 600 + ALGORITHM_LEFT_SYMMETRIC: + pd = raid_disks - 1 - (stripe % raid_disks); + if (block == -1) + return pd; + if (block == -2) + return (pd+1) % raid_disks; + return (pd + 2 + block) % raid_disks; + + case 600 + ALGORITHM_RIGHT_SYMMETRIC: + pd = stripe % raid_disks; + if (block == -1) + return pd; + if (block == -2) + return (pd+1) % raid_disks; + return (pd + 2 + block) % raid_disks; + + case 600 + ALGORITHM_ROTATING_N_RESTART: + /* Same a left_asymmetric, by first stripe is + * D D D P Q rather than + * Q D D D P + */ + pd = raid_disks - 1 - ((stripe + 1) % raid_disks); + if (block == -1) + return pd; + if (block == -2) + return (pd+1) % raid_disks; + if (pd == raid_disks - 1) + return block+1; + if (block >= pd) + return block+2; + return block; + + case 600 + ALGORITHM_ROTATING_N_CONTINUE: + /* Same as left_symmetric but Q is before P */ + pd = raid_disks - 1 - (stripe % raid_disks); + if (block == -1) + return pd; + if (block == -2) + return (pd+raid_disks-1) % raid_disks; + return (pd + 1 + block) % raid_disks; + } + return -1; +} + +int is_ddf(int layout) +{ + switch (layout) + { + default: + return 0; + case ALGORITHM_ROTATING_N_CONTINUE: + case ALGORITHM_ROTATING_N_RESTART: + case ALGORITHM_ROTATING_ZERO_RESTART: + return 1; + } +} + +void xor_blocks(char *target, char **sources, int disks, int size) +{ + int i, j; + /* Amazingly inefficient... 
*/ + for (i=0; i<size; i++) { + char c = 0; + for (j=0 ; j<disks; j++) + c ^= sources[j][i]; + target[i] = c; + } +} + +void qsyndrome(uint8_t *p, uint8_t *q, uint8_t **sources, int disks, int size) +{ + int d, z; + uint8_t wq0, wp0, wd0, w10, w20; + for ( d = 0; d < size; d++) { + wq0 = wp0 = sources[disks-1][d]; + for ( z = disks-2 ; z >= 0 ; z-- ) { + wd0 = sources[z][d]; + wp0 ^= wd0; + w20 = (wq0&0x80) ? 0xff : 0x00; + w10 = (wq0 << 1) & 0xff; + w20 &= 0x1d; + w10 ^= w20; + wq0 = w10 ^ wd0; + } + p[d] = wp0; + q[d] = wq0; + } +} + +/* + * The following was taken from linux/drivers/md/mktables.c, and modified + * to create in-memory tables rather than C code + */ +static uint8_t gfmul(uint8_t a, uint8_t b) +{ + uint8_t v = 0; + + while (b) { + if (b & 1) + v ^= a; + a = (a << 1) ^ (a & 0x80 ? 0x1d : 0); + b >>= 1; + } + + return v; +} + +static uint8_t gfpow(uint8_t a, int b) +{ + uint8_t v = 1; + + b %= 255; + if (b < 0) + b += 255; + + while (b) { + if (b & 1) + v = gfmul(v, a); + a = gfmul(a, a); + b >>= 1; + } + + return v; +} + +int tables_ready = 0; +uint8_t raid6_gfmul[256][256]; +uint8_t raid6_gfexp[256]; +uint8_t raid6_gfinv[256]; +uint8_t raid6_gfexi[256]; +uint8_t raid6_gflog[256]; +uint8_t raid6_gfilog[256]; +void make_tables(void) +{ + int i, j; + uint8_t v; + uint32_t b, log; + + /* Compute multiplication table */ + for (i = 0; i < 256; i++) + for (j = 0; j < 256; j++) + raid6_gfmul[i][j] = gfmul(i, j); + + /* Compute power-of-2 table (exponent) */ + v = 1; + for (i = 0; i < 256; i++) { + raid6_gfexp[i] = v; + v = gfmul(v, 2); + if (v == 1) + v = 0; /* For entry 255, not a real entry */ + } + + /* Compute inverse table x^-1 == x^254 */ + for (i = 0; i < 256; i++) + raid6_gfinv[i] = gfpow(i, 254); + + /* Compute inv(2^x + 1) (exponent-xor-inverse) table */ + for (i = 0; i < 256; i ++) + raid6_gfexi[i] = raid6_gfinv[raid6_gfexp[i] ^ 1]; + + /* Compute log and inverse log */ + /* Modified code from: + * 
https://web.eecs.utk.edu/~plank/plank/papers/CS-96-332.html + */ + b = 1; + raid6_gflog[0] = 0; + raid6_gfilog[255] = 0; + + for (log = 0; log < 255; log++) { + raid6_gflog[b] = (uint8_t) log; + raid6_gfilog[log] = (uint8_t) b; + b = b << 1; + if (b & 256) b = b ^ 0435; + } + + tables_ready = 1; +} + +uint8_t *zero; +int zero_size; + +void ensure_zero_has_size(int chunk_size) +{ + if (zero == NULL || chunk_size > zero_size) { + if (zero) + free(zero); + zero = xcalloc(1, chunk_size); + zero_size = chunk_size; + } +} + +/* Following was taken from linux/drivers/md/raid6recov.c */ + +/* Recover two failed data blocks. */ + +void raid6_2data_recov(int disks, size_t bytes, int faila, int failb, + uint8_t **ptrs, int neg_offset) +{ + uint8_t *p, *q, *dp, *dq; + uint8_t px, qx, db; + const uint8_t *pbmul; /* P multiplier table for B data */ + const uint8_t *qmul; /* Q multiplier table (for both) */ + + if (faila > failb) { + int t = faila; + faila = failb; + failb = t; + } + + if (neg_offset) { + p = ptrs[-1]; + q = ptrs[-2]; + } else { + p = ptrs[disks-2]; + q = ptrs[disks-1]; + } + + /* Compute syndrome with zero for the missing data pages + Use the dead data pages as temporary storage for + delta p and delta q */ + dp = ptrs[faila]; + ptrs[faila] = zero; + dq = ptrs[failb]; + ptrs[failb] = zero; + + qsyndrome(dp, dq, ptrs, disks-2, bytes); + + /* Restore pointer table */ + ptrs[faila] = dp; + ptrs[failb] = dq; + + /* Now, pick the proper data tables */ + pbmul = raid6_gfmul[raid6_gfexi[failb-faila]]; + qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]^raid6_gfexp[failb]]]; + + /* Now do it... 
*/ + while ( bytes-- ) { + px = *p ^ *dp; + qx = qmul[*q ^ *dq]; + *dq++ = db = pbmul[px] ^ qx; /* Reconstructed B */ + *dp++ = db ^ px; /* Reconstructed A */ + p++; q++; + } +} + +/* Recover failure of one data block plus the P block */ +void raid6_datap_recov(int disks, size_t bytes, int faila, uint8_t **ptrs, + int neg_offset) +{ + uint8_t *p, *q, *dq; + const uint8_t *qmul; /* Q multiplier table */ + + if (neg_offset) { + p = ptrs[-1]; + q = ptrs[-2]; + } else { + p = ptrs[disks-2]; + q = ptrs[disks-1]; + } + + /* Compute syndrome with zero for the missing data page + Use the dead data page as temporary storage for delta q */ + dq = ptrs[faila]; + ptrs[faila] = zero; + + qsyndrome(p, dq, ptrs, disks-2, bytes); + + /* Restore pointer table */ + ptrs[faila] = dq; + + /* Now, pick the proper data tables */ + qmul = raid6_gfmul[raid6_gfinv[raid6_gfexp[faila]]]; + + /* Now do it... */ + while ( bytes-- ) { + *p++ ^= *dq = qmul[*q ^ *dq]; + q++; dq++; + } +} + +/* Try to find out if a specific disk has a problem */ +int raid6_check_disks(int data_disks, int start, int chunk_size, + int level, int layout, int diskP, int diskQ, + uint8_t *p, uint8_t *q, char **stripes) +{ + int i; + int data_id, diskD; + uint8_t Px, Qx; + int curr_broken_disk = -1; + int prev_broken_disk = -1; + int broken_status = 0; + + for(i = 0; i < chunk_size; i++) { + Px = (uint8_t)stripes[diskP][i] ^ (uint8_t)p[i]; + Qx = (uint8_t)stripes[diskQ][i] ^ (uint8_t)q[i]; + + if((Px != 0) && (Qx == 0)) + curr_broken_disk = diskP; + + if((Px == 0) && (Qx != 0)) + curr_broken_disk = diskQ; + + if((Px != 0) && (Qx != 0)) { + data_id = (raid6_gflog[Qx] - raid6_gflog[Px]); + if(data_id < 0) data_id += 255; + diskD = geo_map(data_id, start/chunk_size, + data_disks + 2, level, layout); + curr_broken_disk = diskD; + } + + if((Px == 0) && (Qx == 0)) + curr_broken_disk = prev_broken_disk; + + if(curr_broken_disk >= data_disks + 2) + broken_status = 2; + + switch(broken_status) { + case 0: + if(curr_broken_disk 
!= -1) { + prev_broken_disk = curr_broken_disk; + broken_status = 1; + } + break; + + case 1: + if(curr_broken_disk != prev_broken_disk) + broken_status = 2; + break; + + case 2: + default: + curr_broken_disk = prev_broken_disk = -2; + break; + } + } + + return curr_broken_disk; +} + +/******************************************************************************* + * Function: save_stripes + * Description: + * Function reads data (only data without P and Q) from array and writes + * it to buf and opcjonaly to backup files + * Parameters: + * source : A list of 'fds' of the active disks. + * Some may be absent + * offsets : A list of offsets on disk belonging + * to the array [bytes] + * raid_disks : geometry: number of disks in the array + * chunk_size : geometry: chunk size [bytes] + * level : geometry: RAID level + * layout : geometry: layout + * nwrites : number of backup files + * dest : A list of 'fds' for mirrored targets + * (e.g. backup files). They are already seeked to right + * (write) location. If NULL, data will be wrote + * to the buf only + * start : start address of data to read (must be stripe-aligned) + * [bytes] + * length - : length of data to read (must be stripe-aligned) + * [bytes] + * buf : buffer for data. It is large enough to hold + * one stripe. It is stripe aligned + * Returns: + * 0 : success + * -1 : fail + ******************************************************************************/ +int save_stripes(int *source, unsigned long long *offsets, + int raid_disks, int chunk_size, int level, int layout, + int nwrites, int *dest, + unsigned long long start, unsigned long long length, + char *buf) +{ + int len; + int data_disks = raid_disks - (level == 0 ? 0 : level <=5 ? 
1 : 2); + int disk; + int i; + unsigned long long length_test; + + if (!tables_ready) + make_tables(); + ensure_zero_has_size(chunk_size); + + len = data_disks * chunk_size; + length_test = length / len; + length_test *= len; + + if (length != length_test) { + dprintf("Error: save_stripes(): Data are not alligned. EXIT\n"); + dprintf("\tArea for saving stripes (length) = %llu\n", length); + dprintf("\tWork step (len) = %i\n", len); + dprintf("\tExpected save area (length_test) = %llu\n", + length_test); + abort(); + } + + while (length > 0) { + int failed = 0; + int fdisk[3], fblock[3]; + for (disk = 0; disk < raid_disks ; disk++) { + unsigned long long offset; + int dnum; + + offset = (start/chunk_size/data_disks)*chunk_size; + dnum = geo_map(disk < data_disks ? disk : data_disks - disk - 1, + start/chunk_size/data_disks, + raid_disks, level, layout); + if (dnum < 0) abort(); + if (source[dnum] < 0 || + lseek64(source[dnum], + offsets[dnum] + offset, 0) < 0 || + read(source[dnum], buf+disk * chunk_size, + chunk_size) != chunk_size) { + if (failed <= 2) { + fdisk[failed] = dnum; + fblock[failed] = disk; + failed++; + } + } + } + if (failed == 0 || fblock[0] >= data_disks) + /* all data disks are good */ + ; + else if (failed == 1 || fblock[1] >= data_disks+1) { + /* one failed data disk and good parity */ + char *bufs[data_disks]; + for (i=0; i < data_disks; i++) + if (fblock[0] == i) + bufs[i] = buf + data_disks*chunk_size; + else + bufs[i] = buf + i*chunk_size; + + xor_blocks(buf + fblock[0]*chunk_size, + bufs, data_disks, chunk_size); + } else if (failed > 2 || level != 6) + /* too much failure */ + return -1; + else { + /* RAID6 computations needed. */ + uint8_t *bufs[data_disks+4]; + int qdisk; + int syndrome_disks; + disk = geo_map(-1, start/chunk_size/data_disks, + raid_disks, level, layout); + qdisk = geo_map(-2, start/chunk_size/data_disks, + raid_disks, level, layout); + if (is_ddf(layout)) { + /* q over 'raid_disks' blocks, in device order. 
+ * 'p' and 'q' get to be all zero + */ + for (i = 0; i < raid_disks; i++) + bufs[i] = zero; + for (i = 0; i < data_disks; i++) { + int dnum = geo_map(i, + start/chunk_size/data_disks, + raid_disks, level, layout); + int snum; + /* i is the logical block number, so is index to 'buf'. + * dnum is physical disk number + * and thus the syndrome number. + */ + snum = dnum; + bufs[snum] = (uint8_t*)buf + chunk_size * i; + } + syndrome_disks = raid_disks; + } else { + /* for md, q is over 'data_disks' blocks, + * starting immediately after 'q' + * Note that for the '_6' variety, the p block + * makes a hole that we need to be careful of. + */ + int j; + int snum = 0; + for (j = 0; j < raid_disks; j++) { + int dnum = (qdisk + 1 + j) % raid_disks; + if (dnum == disk || dnum == qdisk) + continue; + for (i = 0; i < data_disks; i++) + if (geo_map(i, + start/chunk_size/data_disks, + raid_disks, level, layout) == dnum) + break; + /* i is the logical block number, so is index to 'buf'. + * dnum is physical disk number + * snum is syndrome disk for which 0 is immediately after Q + */ + bufs[snum] = (uint8_t*)buf + chunk_size * i; + + if (fblock[0] == i) + fdisk[0] = snum; + if (fblock[1] == i) + fdisk[1] = snum; + snum++; + } + + syndrome_disks = data_disks; + } + + /* Place P and Q blocks at end of bufs */ + bufs[syndrome_disks] = (uint8_t*)buf + chunk_size * data_disks; + bufs[syndrome_disks+1] = (uint8_t*)buf + chunk_size * (data_disks+1); + + if (fblock[1] == data_disks) + /* One data failed, and parity failed */ + raid6_datap_recov(syndrome_disks+2, chunk_size, + fdisk[0], bufs, 0); + else { + /* Two data blocks failed, P,Q OK */ + raid6_2data_recov(syndrome_disks+2, chunk_size, + fdisk[0], fdisk[1], bufs, 0); + } + } + if (dest) { + for (i = 0; i < nwrites; i++) + if (write(dest[i], buf, len) != len) + return -1; + } else { + /* build next stripe in buffer */ + buf += len; + } + length -= len; + start += len; + } + return 0; +} + +/* Restore data: + * We are given: + * A 
list of 'fds' of the active disks. Some may be '-1' for not-available. + * A geometry: raid_disks, chunk_size, level, layout + * An 'fd' to read from. It is already seeked to the right (Read) location. + * A start and length. + * The length must be a multiple of the stripe size. + * + * We build a full stripe in memory and then write it out. + * We assume that there are enough working devices. + */ +int restore_stripes(int *dest, unsigned long long *offsets, + int raid_disks, int chunk_size, int level, int layout, + int source, unsigned long long read_offset, + unsigned long long start, unsigned long long length, + char *src_buf) +{ + char *stripe_buf; + char **stripes = xmalloc(raid_disks * sizeof(char*)); + char **blocks = xmalloc(raid_disks * sizeof(char*)); + int i; + int rv; + + int data_disks = raid_disks - (level == 0 ? 0 : level <= 5 ? 1 : 2); + + if (posix_memalign((void**)&stripe_buf, 4096, raid_disks * chunk_size)) + stripe_buf = NULL; + + if (zero == NULL || chunk_size > zero_size) { + if (zero) + free(zero); + zero = xcalloc(1, chunk_size); + zero_size = chunk_size; + } + + if (stripe_buf == NULL || stripes == NULL || blocks == NULL || + zero == NULL) { + rv = -2; + goto abort; + } + for (i = 0; i < raid_disks; i++) + stripes[i] = stripe_buf + i * chunk_size; + while (length > 0) { + unsigned int len = data_disks * chunk_size; + unsigned long long offset; + int disk, qdisk; + int syndrome_disks; + if (length < len) { + rv = -3; + goto abort; + } + for (i = 0; i < data_disks; i++) { + int disk = geo_map(i, start/chunk_size/data_disks, + raid_disks, level, layout); + if (src_buf == NULL) { + /* read from file */ + if (lseek64(source, read_offset, 0) != + (off64_t)read_offset) { + rv = -1; + goto abort; + } + if (read(source, + stripes[disk], + chunk_size) != chunk_size) { + rv = -1; + goto abort; + } + } else { + /* read from input buffer */ + memcpy(stripes[disk], + src_buf + read_offset, + chunk_size); + } + read_offset += chunk_size; + } + /* We have 
the data, now do the parity */ + offset = (start/chunk_size/data_disks) * chunk_size; + switch (level) { + case 4: + case 5: + disk = geo_map(-1, start/chunk_size/data_disks, + raid_disks, level, layout); + for (i = 0; i < data_disks; i++) + blocks[i] = stripes[(disk+1+i) % raid_disks]; + xor_blocks(stripes[disk], blocks, data_disks, chunk_size); + break; + case 6: + disk = geo_map(-1, start/chunk_size/data_disks, + raid_disks, level, layout); + qdisk = geo_map(-2, start/chunk_size/data_disks, + raid_disks, level, layout); + if (is_ddf(layout)) { + /* q over 'raid_disks' blocks, in device order. + * 'p' and 'q' get to be all zero + */ + for (i = 0; i < raid_disks; i++) + if (i == disk || i == qdisk) + blocks[i] = (char*)zero; + else + blocks[i] = stripes[i]; + syndrome_disks = raid_disks; + } else { + /* for md, q is over 'data_disks' blocks, + * starting immediately after 'q' + */ + for (i = 0; i < data_disks; i++) + blocks[i] = stripes[(qdisk+1+i) % raid_disks]; + + syndrome_disks = data_disks; + } + qsyndrome((uint8_t*)stripes[disk], + (uint8_t*)stripes[qdisk], + (uint8_t**)blocks, + syndrome_disks, chunk_size); + break; + } + for (i=0; i < raid_disks ; i++) + if (dest[i] >= 0) { + if (lseek64(dest[i], + offsets[i]+offset, 0) < 0) { + rv = -1; + goto abort; + } + if (write(dest[i], stripes[i], + chunk_size) != chunk_size) { + rv = -1; + goto abort; + } + } + length -= len; + start += len; + } + rv = 0; + +abort: + free(stripe_buf); + free(stripes); + free(blocks); + return rv; +} + +#ifdef MAIN + +int test_stripes(int *source, unsigned long long *offsets, + int raid_disks, int chunk_size, int level, int layout, + unsigned long long start, unsigned long long length) +{ + /* ready the data and p (and q) blocks, and check we got them right */ + char *stripe_buf = xmalloc(raid_disks * chunk_size); + char **stripes = xmalloc(raid_disks * sizeof(char*)); + char **blocks = xmalloc(raid_disks * sizeof(char*)); + uint8_t *p = xmalloc(chunk_size); + uint8_t *q = 
xmalloc(chunk_size); + + int i; + int diskP, diskQ; + int data_disks = raid_disks - (level == 5 ? 1: 2); + + if (!tables_ready) + make_tables(); + + for ( i = 0 ; i < raid_disks ; i++) + stripes[i] = stripe_buf + i * chunk_size; + + while (length > 0) { + int disk; + + for (i = 0 ; i < raid_disks ; i++) { + if ((lseek64(source[i], offsets[i]+start, 0) < 0) || + (read(source[i], stripes[i], chunk_size) != + chunk_size)) { + free(q); + free(p); + free(blocks); + free(stripes); + free(stripe_buf); + return -1; + } + } + for (i = 0 ; i < data_disks ; i++) { + int disk = geo_map(i, start/chunk_size, raid_disks, + level, layout); + blocks[i] = stripes[disk]; + printf("%d->%d\n", i, disk); + } + switch(level) { + case 6: + qsyndrome(p, q, (uint8_t**)blocks, data_disks, chunk_size); + diskP = geo_map(-1, start/chunk_size, raid_disks, + level, layout); + if (memcmp(p, stripes[diskP], chunk_size) != 0) { + printf("P(%d) wrong at %llu\n", diskP, + start / chunk_size); + } + diskQ = geo_map(-2, start/chunk_size, raid_disks, + level, layout); + if (memcmp(q, stripes[diskQ], chunk_size) != 0) { + printf("Q(%d) wrong at %llu\n", diskQ, + start / chunk_size); + } + disk = raid6_check_disks(data_disks, start, chunk_size, + level, layout, diskP, diskQ, + p, q, stripes); + if(disk >= 0) { + printf("Possible failed disk: %d\n", disk); + } + if(disk == -2) { + printf("Failure detected, but disk unknown\n"); + } + break; + } + length -= chunk_size; + start += chunk_size; + } + return 0; +} + +unsigned long long getnum(char *str, char **err) +{ + char *e; + unsigned long long rv = strtoull(str, &e, 10); + if (e==str || *e) { + *err = str; + return 0; + } + return rv; +} + +char const Name[] = "test_restripe"; +int main(int argc, char *argv[]) +{ + /* save/restore file raid_disks chunk_size level layout start length devices... 
+ */ + int save; + int *fds; + char *file; + char *buf; + int storefd; + unsigned long long *offsets; + int raid_disks, chunk_size, level, layout; + unsigned long long start, length; + int i; + + char *err = NULL; + if (argc < 10) { + fprintf(stderr, "Usage: test_stripe save/restore file raid_disks chunk_size level layout start length devices...\n"); + exit(1); + } + if (strcmp(argv[1], "save")==0) + save = 1; + else if (strcmp(argv[1], "restore") == 0) + save = 0; + else if (strcmp(argv[1], "test") == 0) + save = 2; + else { + fprintf(stderr, "test_stripe: must give 'save' or 'restore'.\n"); + exit(2); + } + + file = argv[2]; + raid_disks = getnum(argv[3], &err); + chunk_size = getnum(argv[4], &err); + level = getnum(argv[5], &err); + layout = getnum(argv[6], &err); + start = getnum(argv[7], &err); + length = getnum(argv[8], &err); + if (err) { + fprintf(stderr, "test_stripe: Bad number: %s\n", err); + exit(2); + } + if (argc != raid_disks + 9) { + fprintf(stderr, "test_stripe: wrong number of devices: want %d found %d\n", + raid_disks, argc-9); + exit(2); + } + fds = xmalloc(raid_disks * sizeof(*fds)); + offsets = xcalloc(raid_disks, sizeof(*offsets)); + + storefd = open(file, O_RDWR); + if (storefd < 0) { + perror(file); + fprintf(stderr, "test_stripe: could not open %s.\n", file); + exit(3); + } + for (i=0; i<raid_disks; i++) { + char *p; + p = strchr(argv[9+i], ':'); + + if(p != NULL) { + *p++ = '\0'; + offsets[i] = atoll(p) * 512; + } + + fds[i] = open(argv[9+i], O_RDWR); + if (fds[i] < 0) { + perror(argv[9+i]); + fprintf(stderr,"test_stripe: cannot open %s.\n", argv[9+i]); + exit(3); + } + } + + buf = xmalloc(raid_disks * chunk_size); + + if (save == 1) { + int rv = save_stripes(fds, offsets, + raid_disks, chunk_size, level, layout, + 1, &storefd, + start, length, buf); + if (rv != 0) { + fprintf(stderr, + "test_stripe: save_stripes returned %d\n", rv); + exit(1); + } + } else if (save == 2) { + int rv = test_stripes(fds, offsets, + raid_disks, chunk_size, 
level, layout, + start, length); + if (rv != 0) { + fprintf(stderr, + "test_stripe: test_stripes returned %d\n", rv); + exit(1); + } + } else { + int rv = restore_stripes(fds, offsets, + raid_disks, chunk_size, level, layout, + storefd, 0ULL, + start, length, NULL); + if (rv != 0) { + fprintf(stderr, + "test_stripe: restore_stripes returned %d\n", + rv); + exit(1); + } + } + exit(0); +} + +#endif /* MAIN */ @@ -0,0 +1,60 @@ +/* + * Copyright (C) 2007-2008 Intel Corporation + * + * Retrieve drive serial numbers for scsi disks + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. 
+ */ +#include <string.h> +#include <scsi/scsi.h> +#include <scsi/sg.h> +#include <sys/ioctl.h> + +int scsi_get_serial(int fd, void *buf, size_t buf_len) +{ + unsigned char rsp_buf[255]; + unsigned char inq_cmd[] = {INQUIRY, 1, 0x80, 0, sizeof(rsp_buf), 0}; + unsigned char sense[32]; + struct sg_io_hdr io_hdr; + int rv; + unsigned int rsp_len; + + memset(&io_hdr, 0, sizeof(io_hdr)); + io_hdr.interface_id = 'S'; + io_hdr.cmdp = inq_cmd; + io_hdr.cmd_len = sizeof(inq_cmd); + io_hdr.dxferp = rsp_buf; + io_hdr.dxfer_len = sizeof(rsp_buf); + io_hdr.dxfer_direction = SG_DXFER_FROM_DEV; + io_hdr.sbp = sense; + io_hdr.mx_sb_len = sizeof(sense); + io_hdr.timeout = 5000; + + rv = ioctl(fd, SG_IO, &io_hdr); + + if (rv) + return rv; + + if ((io_hdr.info & SG_INFO_OK_MASK) != SG_INFO_OK) + return -1; + + rsp_len = rsp_buf[3]; + + if (!rsp_len || buf_len < rsp_len) + return -1; + + memcpy(buf, &rsp_buf[4], rsp_len); + + return 0; +} @@ -0,0 +1,415 @@ +/* sha1.c - Functions to compute SHA1 message digest of files or + memory blocks according to the NIST specification FIPS-180-1. + + Copyright (C) 2000, 2001, 2003, 2004, 2005, 2006, 2008 Free Software + Foundation, Inc. + + This program is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the + Free Software Foundation; either version 2, or (at your option) any + later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ + +/* Written by Scott G. 
Miller + Credits: + Robert Klep <robert@ilse.nl> -- Expansion function fix +*/ + +//#include <config.h> + +#include "sha1.h" + +#include <stddef.h> +#include <string.h> + +#if USE_UNLOCKED_IO +# include "unlocked-io.h" +#endif + +#ifdef WORDS_BIGENDIAN +# define SWAP(n) (n) +#else +# define SWAP(n) \ + (((n) << 24) | (((n) & 0xff00) << 8) | (((n) >> 8) & 0xff00) | ((n) >> 24)) +#endif + +#define BLOCKSIZE 4096 +#if BLOCKSIZE % 64 != 0 +# error "invalid BLOCKSIZE" +#endif + +/* This array contains the bytes used to pad the buffer to the next + 64-byte boundary. (RFC 1321, 3.1: Step 1) */ +static const unsigned char fillbuf[64] = { 0x80, 0 /* , 0, 0, ... */ }; + +/* Take a pointer to a 160 bit block of data (five 32 bit ints) and + initialize it to the start constants of the SHA1 algorithm. This + must be called before using hash in the call to sha1_hash. */ +void +sha1_init_ctx (struct sha1_ctx *ctx) +{ + ctx->A = 0x67452301; + ctx->B = 0xefcdab89; + ctx->C = 0x98badcfe; + ctx->D = 0x10325476; + ctx->E = 0xc3d2e1f0; + + ctx->total[0] = ctx->total[1] = 0; + ctx->buflen = 0; +} + +/* Put result from CTX in first 20 bytes following RESBUF. The result + must be in little endian byte order. + + IMPORTANT: On some systems it is required that RESBUF is correctly + aligned for a 32-bit value. */ +void * +sha1_read_ctx (const struct sha1_ctx *ctx, void *resbuf) +{ + ((sha1_uint32 *) resbuf)[0] = SWAP (ctx->A); + ((sha1_uint32 *) resbuf)[1] = SWAP (ctx->B); + ((sha1_uint32 *) resbuf)[2] = SWAP (ctx->C); + ((sha1_uint32 *) resbuf)[3] = SWAP (ctx->D); + ((sha1_uint32 *) resbuf)[4] = SWAP (ctx->E); + + return resbuf; +} + +/* Process the remaining bytes in the internal buffer and the usual + prolog according to the standard and write the result to RESBUF. + + IMPORTANT: On some systems it is required that RESBUF is correctly + aligned for a 32-bit value. */ +void * +sha1_finish_ctx (struct sha1_ctx *ctx, void *resbuf) +{ + /* Take yet unprocessed bytes into account. 
*/ + sha1_uint32 bytes = ctx->buflen; + size_t size = (bytes < 56) ? 64 / 4 : 64 * 2 / 4; + + /* Now count remaining bytes. */ + ctx->total[0] += bytes; + if (ctx->total[0] < bytes) + ++ctx->total[1]; + + /* Put the 64-bit file length in *bits* at the end of the buffer. */ + ctx->buffer[size - 2] = SWAP ((ctx->total[1] << 3) | (ctx->total[0] >> 29)); + ctx->buffer[size - 1] = SWAP (ctx->total[0] << 3); + + memcpy (&((char *) ctx->buffer)[bytes], fillbuf, (size - 2) * 4 - bytes); + + /* Process last bytes. */ + sha1_process_block (ctx->buffer, size * 4, ctx); + + return sha1_read_ctx (ctx, resbuf); +} + +/* Compute SHA1 message digest for bytes read from STREAM. The + resulting message digest number will be written into the 16 bytes + beginning at RESBLOCK. */ +int +sha1_stream (FILE *stream, void *resblock) +{ + struct sha1_ctx ctx; + char buffer[BLOCKSIZE + 72]; + size_t sum; + + /* Initialize the computation context. */ + sha1_init_ctx (&ctx); + + /* Iterate over full file contents. */ + while (1) + { + /* We read the file in blocks of BLOCKSIZE bytes. One call of the + computation function processes the whole buffer so that with the + next round of the loop another block can be read. */ + size_t n; + sum = 0; + + /* Read block. Take care for partial reads. */ + while (1) + { + n = fread (buffer + sum, 1, BLOCKSIZE - sum, stream); + + sum += n; + + if (sum == BLOCKSIZE) + break; + + if (n == 0) + { + /* Check for the error flag IFF N == 0, so that we don't + exit the loop after a partial read due to e.g., EAGAIN + or EWOULDBLOCK. */ + if (ferror (stream)) + return 1; + goto process_partial_block; + } + + /* We've read at least one byte, so ignore errors. But always + check for EOF, since feof may be true even though N > 0. + Otherwise, we could end up calling fread after EOF. */ + if (feof (stream)) + goto process_partial_block; + } + + /* Process buffer with BLOCKSIZE bytes. 
Note that + BLOCKSIZE % 64 == 0 + */ + sha1_process_block (buffer, BLOCKSIZE, &ctx); + } + + process_partial_block:; + + /* Process any remaining bytes. */ + if (sum > 0) + sha1_process_bytes (buffer, sum, &ctx); + + /* Construct result in desired memory. */ + sha1_finish_ctx (&ctx, resblock); + return 0; +} + +/* Compute SHA1 message digest for LEN bytes beginning at BUFFER. The + result is always in little endian byte order, so that a byte-wise + output yields to the wanted ASCII representation of the message + digest. */ +void * +sha1_buffer (const char *buffer, size_t len, void *resblock) +{ + struct sha1_ctx ctx; + + /* Initialize the computation context. */ + sha1_init_ctx (&ctx); + + /* Process whole buffer but last len % 64 bytes. */ + sha1_process_bytes (buffer, len, &ctx); + + /* Put result in desired memory area. */ + return sha1_finish_ctx (&ctx, resblock); +} + +void +sha1_process_bytes (const void *buffer, size_t len, struct sha1_ctx *ctx) +{ + /* When we already have some bits in our internal buffer concatenate + both inputs first. */ + if (ctx->buflen != 0) + { + size_t left_over = ctx->buflen; + size_t add = 128 - left_over > len ? len : 128 - left_over; + + memcpy (&((char *) ctx->buffer)[left_over], buffer, add); + ctx->buflen += add; + + if (ctx->buflen > 64) + { + sha1_process_block (ctx->buffer, ctx->buflen & ~63, ctx); + + ctx->buflen &= 63; + /* The regions in the following copy operation cannot overlap. */ + memcpy (ctx->buffer, + &((char *) ctx->buffer)[(left_over + add) & ~63], + ctx->buflen); + } + + buffer = (const char *) buffer + add; + len -= add; + } + + /* Process available complete blocks. 
/* --- Code below is the primary difference between md5.c and sha1.c --- */

/* SHA1 round constants: one per group of twenty rounds.  */
#define K1 0x5a827999
#define K2 0x6ed9eba1
#define K3 0x8f1bbcdc
#define K4 0xca62c1d6

/* Round functions, one per group of twenty rounds.
   Note that F2 is the same as F4.  */
#define F1(B,C,D) ( D ^ ( B & ( C ^ D ) ) )
#define F2(B,C,D) (B ^ C ^ D)
#define F3(B,C,D) ( ( B & C ) | ( D & ( B | C ) ) )
#define F4(B,C,D) (B ^ C ^ D)

/* Process LEN bytes of BUFFER, accumulating context into CTX.
   It is assumed that LEN % 64 == 0.
   Most of this code comes from GnuPG's cipher/sha1.c.

   BUFFER is read as an array of sha1_uint32 words, sixteen per 64-byte
   block.  For every block the five chaining values A..E of CTX are
   updated, and LEN is added to the running byte count in CTX->total.  */

void
sha1_process_block (const void *buffer, size_t len, struct sha1_ctx *ctx)
{
  const sha1_uint32 *words = (const sha1_uint32*) buffer;
  size_t nwords = len / sizeof (sha1_uint32);
  const sha1_uint32 *endp = words + nwords;
  /* Sixteen-word window of the message schedule, reused circularly.  */
  sha1_uint32 x[16];
  /* Working copies of the chaining values.  */
  sha1_uint32 a = ctx->A;
  sha1_uint32 b = ctx->B;
  sha1_uint32 c = ctx->C;
  sha1_uint32 d = ctx->D;
  sha1_uint32 e = ctx->E;

  /* First increment the byte count.  The count is kept as two 32-bit
     words, total[0] (low) and total[1] (high); a wrap of the low word
     carries into the high word.  Here we only count bytes; the
     conversion to bits happens in sha1_finish_ctx.
     NOTE(review): LEN is a size_t but total[0] is a 32-bit word, so a
     single call with LEN >= 2^32 would lose the upper bits of LEN here;
     confirm that no caller passes such lengths.  */
  ctx->total[0] += len;
  if (ctx->total[0] < len)
    ++ctx->total[1];

/* Rotate the 32-bit value X left by N bits.  */
#define rol(x, n) (((x) << (n)) | ((sha1_uint32) (x) >> (32 - (n))))

/* Produce schedule word I (I >= 16) from four earlier words, store it
   back into the circular window X, and yield it as the value of the
   expression.  Uses the local TM as scratch.  */
#define M(I) ( tm =   x[I&0x0f] ^ x[(I-14)&0x0f] \
		    ^ x[(I-8)&0x0f] ^ x[(I-3)&0x0f] \
	       , (x[I&0x0f] = rol(tm, 1)) )

/* One round.  Only E and B are modified; the invocations below permute
   the argument order from round to round instead of moving the values
   between variables.  */
#define R(A,B,C,D,E,F,K,M)  do { E += rol( A, 5 )     \
				      + F( B, C, D )  \
				      + K	      \
				      + M;	      \
				 B = rol( B, 30 );    \
			       } while(0)

  while (words < endp)
    {
      sha1_uint32 tm;
      int t;
      /* Load the sixteen words of this block, passing each through
	 SWAP (identity on WORDS_BIGENDIAN hosts).  */
      for (t = 0; t < 16; t++)
	{
	  x[t] = SWAP (*words);
	  words++;
	}

      /* Rounds 0..19: F1/K1.  The first sixteen use the block words
	 directly, the rest come from the schedule macro M.  */
      R( a, b, c, d, e, F1, K1, x[ 0] );
      R( e, a, b, c, d, F1, K1, x[ 1] );
      R( d, e, a, b, c, F1, K1, x[ 2] );
      R( c, d, e, a, b, F1, K1, x[ 3] );
      R( b, c, d, e, a, F1, K1, x[ 4] );
      R( a, b, c, d, e, F1, K1, x[ 5] );
      R( e, a, b, c, d, F1, K1, x[ 6] );
      R( d, e, a, b, c, F1, K1, x[ 7] );
      R( c, d, e, a, b, F1, K1, x[ 8] );
      R( b, c, d, e, a, F1, K1, x[ 9] );
      R( a, b, c, d, e, F1, K1, x[10] );
      R( e, a, b, c, d, F1, K1, x[11] );
      R( d, e, a, b, c, F1, K1, x[12] );
      R( c, d, e, a, b, F1, K1, x[13] );
      R( b, c, d, e, a, F1, K1, x[14] );
      R( a, b, c, d, e, F1, K1, x[15] );
      R( e, a, b, c, d, F1, K1, M(16) );
      R( d, e, a, b, c, F1, K1, M(17) );
      R( c, d, e, a, b, F1, K1, M(18) );
      R( b, c, d, e, a, F1, K1, M(19) );
      /* Rounds 20..39: F2/K2.  */
      R( a, b, c, d, e, F2, K2, M(20) );
      R( e, a, b, c, d, F2, K2, M(21) );
      R( d, e, a, b, c, F2, K2, M(22) );
      R( c, d, e, a, b, F2, K2, M(23) );
      R( b, c, d, e, a, F2, K2, M(24) );
      R( a, b, c, d, e, F2, K2, M(25) );
      R( e, a, b, c, d, F2, K2, M(26) );
      R( d, e, a, b, c, F2, K2, M(27) );
      R( c, d, e, a, b, F2, K2, M(28) );
      R( b, c, d, e, a, F2, K2, M(29) );
      R( a, b, c, d, e, F2, K2, M(30) );
      R( e, a, b, c, d, F2, K2, M(31) );
      R( d, e, a, b, c, F2, K2, M(32) );
      R( c, d, e, a, b, F2, K2, M(33) );
      R( b, c, d, e, a, F2, K2, M(34) );
      R( a, b, c, d, e, F2, K2, M(35) );
      R( e, a, b, c, d, F2, K2, M(36) );
      R( d, e, a, b, c, F2, K2, M(37) );
      R( c, d, e, a, b, F2, K2, M(38) );
      R( b, c, d, e, a, F2, K2, M(39) );
      /* Rounds 40..59: F3/K3.  */
      R( a, b, c, d, e, F3, K3, M(40) );
      R( e, a, b, c, d, F3, K3, M(41) );
      R( d, e, a, b, c, F3, K3, M(42) );
      R( c, d, e, a, b, F3, K3, M(43) );
      R( b, c, d, e, a, F3, K3, M(44) );
      R( a, b, c, d, e, F3, K3, M(45) );
      R( e, a, b, c, d, F3, K3, M(46) );
      R( d, e, a, b, c, F3, K3, M(47) );
      R( c, d, e, a, b, F3, K3, M(48) );
      R( b, c, d, e, a, F3, K3, M(49) );
      R( a, b, c, d, e, F3, K3, M(50) );
      R( e, a, b, c, d, F3, K3, M(51) );
      R( d, e, a, b, c, F3, K3, M(52) );
      R( c, d, e, a, b, F3, K3, M(53) );
      R( b, c, d, e, a, F3, K3, M(54) );
      R( a, b, c, d, e, F3, K3, M(55) );
      R( e, a, b, c, d, F3, K3, M(56) );
      R( d, e, a, b, c, F3, K3, M(57) );
      R( c, d, e, a, b, F3, K3, M(58) );
      R( b, c, d, e, a, F3, K3, M(59) );
      /* Rounds 60..79: F4/K4.  */
      R( a, b, c, d, e, F4, K4, M(60) );
      R( e, a, b, c, d, F4, K4, M(61) );
      R( d, e, a, b, c, F4, K4, M(62) );
      R( c, d, e, a, b, F4, K4, M(63) );
      R( b, c, d, e, a, F4, K4, M(64) );
      R( a, b, c, d, e, F4, K4, M(65) );
      R( e, a, b, c, d, F4, K4, M(66) );
      R( d, e, a, b, c, F4, K4, M(67) );
      R( c, d, e, a, b, F4, K4, M(68) );
      R( b, c, d, e, a, F4, K4, M(69) );
      R( a, b, c, d, e, F4, K4, M(70) );
      R( e, a, b, c, d, F4, K4, M(71) );
      R( d, e, a, b, c, F4, K4, M(72) );
      R( c, d, e, a, b, F4, K4, M(73) );
      R( b, c, d, e, a, F4, K4, M(74) );
      R( a, b, c, d, e, F4, K4, M(75) );
      R( e, a, b, c, d, F4, K4, M(76) );
      R( d, e, a, b, c, F4, K4, M(77) );
      R( c, d, e, a, b, F4, K4, M(78) );
      R( b, c, d, e, a, F4, K4, M(79) );

      /* Fold this block's result into the context and reload the
	 working copies for the next block.  */
      a = ctx->A += a;
      b = ctx->B += b;
      c = ctx->C += c;
      d = ctx->D += d;
      e = ctx->E += e;
    }
}
+ + This program is free software; you can redistribute it and/or modify it + under the terms of the GNU General Public License as published by the + Free Software Foundation; either version 2, or (at your option) any + later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software Foundation, + Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */ + +#ifndef SHA1_H +# define SHA1_H 1 + +#include <stdio.h> + +#if defined HAVE_LIMITS_H || _LIBC +# include <limits.h> +#endif + +/* The following contortions are an attempt to use the C preprocessor + to determine an unsigned integral type that is 32 bits wide. An + alternative approach is to use autoconf's AC_CHECK_SIZEOF macro, but + doing that would require that the configure script compile and *run* + the resulting executable. Locally running cross-compiled executables + is usually not possible. */ + +#ifdef _LIBC +# include <sys/types.h> +typedef u_int32_t sha1_uint32; +typedef uintptr_t sha1_uintptr; +#else +# define INT_MAX_32_BITS 2147483647 + +/* If UINT_MAX isn't defined, assume it's a 32-bit type. + This should be valid for all systems GNU cares about because + that doesn't include 16-bit systems, and only modern systems + (that certainly have <limits.h>) have 64+-bit integral types. */ + +# ifndef INT_MAX +# define INT_MAX INT_MAX_32_BITS +# endif + +# if INT_MAX == INT_MAX_32_BITS + typedef unsigned int sha1_uint32; +# else +# if SHRT_MAX == INT_MAX_32_BITS + typedef unsigned short sha1_uint32; +# else +# if LONG_MAX == INT_MAX_32_BITS + typedef unsigned long sha1_uint32; +# else + /* The following line is intended to evoke an error. 
/* Structure to save state of computation between the single steps.
   Set up by sha1_init_ctx, advanced by sha1_process_block and
   sha1_process_bytes, and read out by sha1_read_ctx/sha1_finish_ctx.  */
struct sha1_ctx
{
  /* The five 32-bit chaining values; together they form the 160-bit
     digest once all input has been processed.  */
  sha1_uint32 A;
  sha1_uint32 B;
  sha1_uint32 C;
  sha1_uint32 D;
  sha1_uint32 E;

  /* Number of bytes processed so far, kept as two 32-bit words:
     total[0] is the low word, total[1] receives the carry.  */
  sha1_uint32 total[2];
  /* Number of bytes currently waiting in BUFFER.  */
  sha1_uint32 buflen;
  /* Holding area (32 words = 128 bytes) for input that does not yet
     make up a complete 64-byte block, and for the final padding.  */
  sha1_uint32 buffer[32];
};
*/ +extern void *sha1_read_ctx (const struct sha1_ctx *ctx, void *resbuf); + +/* Compute SHA1 message digest for bytes read from STREAM. The + resulting message digest number will be written into the 20 bytes + beginning at RESBLOCK. */ +extern int sha1_stream (FILE *stream, void *resblock); + +/* Compute SHA1 message digest for LEN bytes beginning at BUFFER. The + result is always in little endian byte order, so that a byte-wise + output yields to the wanted ASCII representation of the message + digest. */ +extern void *sha1_buffer (const char *buffer, size_t len, void *resblock); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/super-ddf.c b/super-ddf.c new file mode 100644 index 0000000..3f304cd --- /dev/null +++ b/super-ddf.c @@ -0,0 +1,5244 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2006-2014 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neil@brown.name> + * + * Specifications for DDF taken from Common RAID DDF Specification Revision 1.2 + * (July 28 2006). Reused by permission of SNIA. 
+ */ + +#define HAVE_STDINT_H 1 +#include "mdadm.h" +#include "mdmon.h" +#include "sha1.h" +#include <values.h> +#include <stddef.h> + +/* a non-official T10 name for creation GUIDs */ +static char T10[] = "Linux-MD"; + +/* DDF timestamps are 1980 based, so we need to add + * second-in-decade-of-seventies to convert to linux timestamps. + * 10 years with 2 leap years. + */ +#define DECADE (3600*24*(365*10+2)) +unsigned long crc32( + unsigned long crc, + const unsigned char *buf, + unsigned len); + +#define DDF_NOTFOUND (~0U) +#define DDF_CONTAINER (DDF_NOTFOUND-1) + +/* Default for safe_mode_delay. Same value as for IMSM. + */ +static const int DDF_SAFE_MODE_DELAY = 4000; + +/* The DDF metadata handling. + * DDF metadata lives at the end of the device. + * The last 512 byte block provides an 'anchor' which is used to locate + * the rest of the metadata which usually lives immediately behind the anchor. + * + * Note: + * - all multibyte numeric fields are bigendian. + * - all strings are space padded. 
+ * + */ + +typedef struct __be16 { + __u16 _v16; +} be16; +#define be16_eq(x, y) ((x)._v16 == (y)._v16) +#define be16_and(x, y) ((x)._v16 & (y)._v16) +#define be16_or(x, y) ((x)._v16 | (y)._v16) +#define be16_clear(x, y) ((x)._v16 &= ~(y)._v16) +#define be16_set(x, y) ((x)._v16 |= (y)._v16) + +typedef struct __be32 { + __u32 _v32; +} be32; +#define be32_eq(x, y) ((x)._v32 == (y)._v32) + +typedef struct __be64 { + __u64 _v64; +} be64; +#define be64_eq(x, y) ((x)._v64 == (y)._v64) + +#define be16_to_cpu(be) __be16_to_cpu((be)._v16) +static inline be16 cpu_to_be16(__u16 x) +{ + be16 be = { ._v16 = __cpu_to_be16(x) }; + return be; +} + +#define be32_to_cpu(be) __be32_to_cpu((be)._v32) +static inline be32 cpu_to_be32(__u32 x) +{ + be32 be = { ._v32 = __cpu_to_be32(x) }; + return be; +} + +#define be64_to_cpu(be) __be64_to_cpu((be)._v64) +static inline be64 cpu_to_be64(__u64 x) +{ + be64 be = { ._v64 = __cpu_to_be64(x) }; + return be; +} + +/* Primary Raid Level (PRL) */ +#define DDF_RAID0 0x00 +#define DDF_RAID1 0x01 +#define DDF_RAID3 0x03 +#define DDF_RAID4 0x04 +#define DDF_RAID5 0x05 +#define DDF_RAID1E 0x11 +#define DDF_JBOD 0x0f +#define DDF_CONCAT 0x1f +#define DDF_RAID5E 0x15 +#define DDF_RAID5EE 0x25 +#define DDF_RAID6 0x06 + +/* Raid Level Qualifier (RLQ) */ +#define DDF_RAID0_SIMPLE 0x00 +#define DDF_RAID1_SIMPLE 0x00 /* just 2 devices in this plex */ +#define DDF_RAID1_MULTI 0x01 /* exactly 3 devices in this plex */ +#define DDF_RAID3_0 0x00 /* parity in first extent */ +#define DDF_RAID3_N 0x01 /* parity in last extent */ +#define DDF_RAID4_0 0x00 /* parity in first extent */ +#define DDF_RAID4_N 0x01 /* parity in last extent */ +/* these apply to raid5e and raid5ee as well */ +#define DDF_RAID5_0_RESTART 0x00 /* same as 'right asymmetric' - layout 1 */ +#define DDF_RAID6_0_RESTART 0x01 /* raid6 different from raid5 here!!! 
*/ +#define DDF_RAID5_N_RESTART 0x02 /* same as 'left asymmetric' - layout 0 */ +#define DDF_RAID5_N_CONTINUE 0x03 /* same as 'left symmetric' - layout 2 */ + +#define DDF_RAID1E_ADJACENT 0x00 /* raid10 nearcopies==2 */ +#define DDF_RAID1E_OFFSET 0x01 /* raid10 offsetcopies==2 */ + +/* Secondary RAID Level (SRL) */ +#define DDF_2STRIPED 0x00 /* This is weirder than RAID0 !! */ +#define DDF_2MIRRORED 0x01 +#define DDF_2CONCAT 0x02 +#define DDF_2SPANNED 0x03 /* This is also weird - be careful */ + +/* Magic numbers */ +#define DDF_HEADER_MAGIC cpu_to_be32(0xDE11DE11) +#define DDF_CONTROLLER_MAGIC cpu_to_be32(0xAD111111) +#define DDF_PHYS_RECORDS_MAGIC cpu_to_be32(0x22222222) +#define DDF_PHYS_DATA_MAGIC cpu_to_be32(0x33333333) +#define DDF_VIRT_RECORDS_MAGIC cpu_to_be32(0xDDDDDDDD) +#define DDF_VD_CONF_MAGIC cpu_to_be32(0xEEEEEEEE) +#define DDF_SPARE_ASSIGN_MAGIC cpu_to_be32(0x55555555) +#define DDF_VU_CONF_MAGIC cpu_to_be32(0x88888888) +#define DDF_VENDOR_LOG_MAGIC cpu_to_be32(0x01dBEEF0) +#define DDF_BBM_LOG_MAGIC cpu_to_be32(0xABADB10C) + +#define DDF_GUID_LEN 24 +#define DDF_REVISION_0 "01.00.00" +#define DDF_REVISION_2 "01.02.00" + +struct ddf_header { + be32 magic; /* DDF_HEADER_MAGIC */ + be32 crc; + char guid[DDF_GUID_LEN]; + char revision[8]; /* 01.02.00 */ + be32 seq; /* starts at '1' */ + be32 timestamp; + __u8 openflag; + __u8 foreignflag; + __u8 enforcegroups; + __u8 pad0; /* 0xff */ + __u8 pad1[12]; /* 12 * 0xff */ + /* 64 bytes so far */ + __u8 header_ext[32]; /* reserved: fill with 0xff */ + be64 primary_lba; + be64 secondary_lba; + __u8 type; + __u8 pad2[3]; /* 0xff */ + be32 workspace_len; /* sectors for vendor space - + * at least 32768(sectors) */ + be64 workspace_lba; + be16 max_pd_entries; /* one of 15, 63, 255, 1023, 4095 */ + be16 max_vd_entries; /* 2^(4,6,8,10,12)-1 : i.e. as above */ + be16 max_partitions; /* i.e. 
max num of configuration + record entries per disk */ + be16 config_record_len; /* 1 +ROUNDUP(max_primary_element_entries + *12/512) */ + be16 max_primary_element_entries; /* 16, 64, 256, 1024, or 4096 */ + __u8 pad3[54]; /* 0xff */ + /* 192 bytes so far */ + be32 controller_section_offset; + be32 controller_section_length; + be32 phys_section_offset; + be32 phys_section_length; + be32 virt_section_offset; + be32 virt_section_length; + be32 config_section_offset; + be32 config_section_length; + be32 data_section_offset; + be32 data_section_length; + be32 bbm_section_offset; + be32 bbm_section_length; + be32 diag_space_offset; + be32 diag_space_length; + be32 vendor_offset; + be32 vendor_length; + /* 256 bytes so far */ + __u8 pad4[256]; /* 0xff */ +}; + +/* type field */ +#define DDF_HEADER_ANCHOR 0x00 +#define DDF_HEADER_PRIMARY 0x01 +#define DDF_HEADER_SECONDARY 0x02 + +/* The content of the 'controller section' - global scope */ +struct ddf_controller_data { + be32 magic; /* DDF_CONTROLLER_MAGIC */ + be32 crc; + char guid[DDF_GUID_LEN]; + struct controller_type { + be16 vendor_id; + be16 device_id; + be16 sub_vendor_id; + be16 sub_device_id; + } type; + char product_id[16]; + __u8 pad[8]; /* 0xff */ + __u8 vendor_data[448]; +}; + +/* The content of phys_section - global scope */ +struct phys_disk { + be32 magic; /* DDF_PHYS_RECORDS_MAGIC */ + be32 crc; + be16 used_pdes; /* This is a counter, not a max - the list + * of used entries may not be dense */ + be16 max_pdes; + __u8 pad[52]; + struct phys_disk_entry { + char guid[DDF_GUID_LEN]; + be32 refnum; + be16 type; + be16 state; + be64 config_size; /* DDF structures must be after here */ + char path[18]; /* Another horrible structure really + * but is "used for information + * purposes only" */ + __u8 pad[6]; + } entries[0]; +}; + +/* phys_disk_entry.type is a bitmap - bigendian remember */ +#define DDF_Forced_PD_GUID 1 +#define DDF_Active_in_VD 2 +#define DDF_Global_Spare 4 /* VD_CONF records are ignored */ 
+#define DDF_Spare 8 /* overrides Global_spare */ +#define DDF_Foreign 16 +#define DDF_Legacy 32 /* no DDF on this device */ + +#define DDF_Interface_mask 0xf00 +#define DDF_Interface_SCSI 0x100 +#define DDF_Interface_SAS 0x200 +#define DDF_Interface_SATA 0x300 +#define DDF_Interface_FC 0x400 + +/* phys_disk_entry.state is a bigendian bitmap */ +#define DDF_Online 1 +#define DDF_Failed 2 /* overrides 1,4,8 */ +#define DDF_Rebuilding 4 +#define DDF_Transition 8 +#define DDF_SMART 16 +#define DDF_ReadErrors 32 +#define DDF_Missing 64 + +/* The content of the virt_section global scope */ +struct virtual_disk { + be32 magic; /* DDF_VIRT_RECORDS_MAGIC */ + be32 crc; + be16 populated_vdes; + be16 max_vdes; + __u8 pad[52]; + struct virtual_entry { + char guid[DDF_GUID_LEN]; + be16 unit; + __u16 pad0; /* 0xffff */ + be16 guid_crc; + be16 type; + __u8 state; + __u8 init_state; + __u8 pad1[14]; + char name[16]; + } entries[0]; +}; + +/* virtual_entry.type is a bitmap - bigendian */ +#define DDF_Shared 1 +#define DDF_Enforce_Groups 2 +#define DDF_Unicode 4 +#define DDF_Owner_Valid 8 + +/* virtual_entry.state is a bigendian bitmap */ +#define DDF_state_mask 0x7 +#define DDF_state_optimal 0x0 +#define DDF_state_degraded 0x1 +#define DDF_state_deleted 0x2 +#define DDF_state_missing 0x3 +#define DDF_state_failed 0x4 +#define DDF_state_part_optimal 0x5 + +#define DDF_state_morphing 0x8 +#define DDF_state_inconsistent 0x10 + +/* virtual_entry.init_state is a bigendian bitmap */ +#define DDF_initstate_mask 0x03 +#define DDF_init_not 0x00 +#define DDF_init_quick 0x01 /* initialisation is progress. + * i.e. 
'state_inconsistent' */ +#define DDF_init_full 0x02 + +#define DDF_access_mask 0xc0 +#define DDF_access_rw 0x00 +#define DDF_access_ro 0x80 +#define DDF_access_blocked 0xc0 + +/* The content of the config_section - local scope + * It has multiple records each config_record_len sectors + * They can be vd_config or spare_assign + */ + +struct vd_config { + be32 magic; /* DDF_VD_CONF_MAGIC */ + be32 crc; + char guid[DDF_GUID_LEN]; + be32 timestamp; + be32 seqnum; + __u8 pad0[24]; + be16 prim_elmnt_count; + __u8 chunk_shift; /* 0 == 512, 1==1024 etc */ + __u8 prl; + __u8 rlq; + __u8 sec_elmnt_count; + __u8 sec_elmnt_seq; + __u8 srl; + be64 blocks; /* blocks per component could be different + * on different component devices...(only + * for concat I hope) */ + be64 array_blocks; /* blocks in array */ + __u8 pad1[8]; + be32 spare_refs[8]; /* This is used to detect missing spares. + * As we don't have an interface for that + * the values are ignored. + */ + __u8 cache_pol[8]; + __u8 bg_rate; + __u8 pad2[3]; + __u8 pad3[52]; + __u8 pad4[192]; + __u8 v0[32]; /* reserved- 0xff */ + __u8 v1[32]; /* reserved- 0xff */ + __u8 v2[16]; /* reserved- 0xff */ + __u8 v3[16]; /* reserved- 0xff */ + __u8 vendor[32]; + be32 phys_refnum[0]; /* refnum of each disk in sequence */ + /*__u64 lba_offset[0]; LBA offset in each phys. 
Note extents in a + bvd are always the same size */ +}; +#define LBA_OFFSET(ddf, vd) ((be64 *) &(vd)->phys_refnum[(ddf)->mppe]) + +/* vd_config.cache_pol[7] is a bitmap */ +#define DDF_cache_writeback 1 /* else writethrough */ +#define DDF_cache_wadaptive 2 /* only applies if writeback */ +#define DDF_cache_readahead 4 +#define DDF_cache_radaptive 8 /* only if doing read-ahead */ +#define DDF_cache_ifnobatt 16 /* even to write cache if battery is poor */ +#define DDF_cache_wallowed 32 /* enable write caching */ +#define DDF_cache_rallowed 64 /* enable read caching */ + +struct spare_assign { + be32 magic; /* DDF_SPARE_ASSIGN_MAGIC */ + be32 crc; + be32 timestamp; + __u8 reserved[7]; + __u8 type; + be16 populated; /* SAEs used */ + be16 max; /* max SAEs */ + __u8 pad[8]; + struct spare_assign_entry { + char guid[DDF_GUID_LEN]; + be16 secondary_element; + __u8 pad[6]; + } spare_ents[0]; +}; +/* spare_assign.type is a bitmap */ +#define DDF_spare_dedicated 0x1 /* else global */ +#define DDF_spare_revertible 0x2 /* else committable */ +#define DDF_spare_active 0x4 /* else not active */ +#define DDF_spare_affinity 0x8 /* enclosure affinity */ + +/* The data_section contents - local scope */ +struct disk_data { + be32 magic; /* DDF_PHYS_DATA_MAGIC */ + be32 crc; + char guid[DDF_GUID_LEN]; + be32 refnum; /* crc of some magic drive data ... */ + __u8 forced_ref; /* set when above was not result of magic */ + __u8 forced_guid; /* set if guid was forced rather than magic */ + __u8 vendor[32]; + __u8 pad[442]; +}; + +/* bbm_section content */ +struct bad_block_log { + be32 magic; + be32 crc; + be16 entry_count; + be32 spare_count; + __u8 pad[10]; + be64 first_spare; + struct mapped_block { + be64 defective_start; + be32 replacement_start; + be16 remap_count; + __u8 pad[2]; + } entries[0]; +}; + +/* Struct for internally holding ddf structures */ +/* The DDF structure stored on each device is potentially + * quite different, as some data is global and some is local. 
/* Struct for internally holding ddf structures */
/* The DDF structure stored on each device is potentially
 * quite different, as some data is global and some is local.
 * The global data is:
 *   - ddf header
 *   - controller_data
 *   - Physical disk records
 *   - Virtual disk records
 * The local data is:
 *   - Configuration records
 *   - Physical Disk data section
 *  (  and Bad block and vendor which I don't care about yet).
 *
 * The global data is kept once in this structure, so a change only
 * needs to be made once and is automatically propagated to all
 * devices when they are written.
 * The local (config and disk data) records are each in a list
 * of separate data structures.  When writing we find the entry
 * or entries applicable to the particular device.
 */
struct ddf_super {
	/* The three header copies, see the DDF_HEADER_* type values. */
	struct ddf_header	anchor, primary, secondary;
	struct ddf_controller_data controller;
	/* The header currently in use; its 'seq' is bumped when updates
	 * become pending.
	 */
	struct ddf_header	*active;
	/* Global physical and virtual disk record tables. */
	struct phys_disk	*phys;
	struct virtual_disk	*virt;
	char			*conf;
	int			pdsize, vdsize;
	/* 'mppe' is the number of phys_refnum[] slots in a vd_config;
	 * LBA_OFFSET() uses it to locate the LBA table that follows.
	 * NOTE(review): max_part and conf_rec_len presumably mirror
	 * max_partitions and config_record_len of the header - confirm.
	 */
	unsigned int		max_part, mppe, conf_rec_len;
	int			currentdev;
	/* Non-zero once a change has been made that still has to be written. */
	int			updates_pending;
	/* List of configuration records, one entry per virtual disk. */
	struct vcl {
		union {
			/* Forces the bookkeeping part to occupy at least
			 * 512 bytes ahead of 'conf'.
			 * NOTE(review): presumably so the on-disk record
			 * stays sector aligned - confirm.
			 */
			char space[512];
			struct {
				struct vcl	*next;
				unsigned int	vcnum; /* index into ->virt */
				/* For an array with a secondary level there are
				 * multiple vd_config structures, all with the same
				 * guid but with different sec_elmnt_seq.
				 * One of these structures is in 'conf' below.
				 * The others are in other_bvds, not in any
				 * particular order.
				 */
				struct vd_config **other_bvds;
				__u64		*block_sizes; /* NULL if all the same */
			};
		};
		struct vd_config conf;
	} *conflist, *currentconf;
	/* List of member devices, each with its local disk data. */
	struct dl {
		union {
			/* Same 512-byte padding as in struct vcl above. */
			char space[512];
			struct {
				struct dl	*next;
				int major, minor;
				char *devname;
				int fd;
				unsigned long long size; /* sectors */
				be64 primary_lba; /* sectors */
				be64 secondary_lba; /* sectors */
				be64 workspace_lba; /* sectors */
				int pdnum;	/* index in ->phys */
				struct spare_assign *spare;
				void *mdupdate; /* hold metadata update */

				/* These fields used by auto-layout */
				int raiddisk; /* slot to fill in autolayout */
				__u64 esize;
				int displayed;
			};
		};
		struct disk_data disk;
		struct vcl *vlist[0]; /* max_part in size */
	} *dlist, *add_list;
};
struct mdinfo *info, char *map); +static int init_super_ddf_bvd(struct supertype *st, + mdu_array_info_t *info, + unsigned long long size, + char *name, char *homehost, + int *uuid, unsigned long long data_offset); + +#if DEBUG +static void pr_state(struct ddf_super *ddf, const char *msg) +{ + unsigned int i; + dprintf("%s: ", msg); + for (i = 0; i < be16_to_cpu(ddf->active->max_vd_entries); i++) { + if (all_ff(ddf->virt->entries[i].guid)) + continue; + dprintf_cont("%u(s=%02x i=%02x) ", i, + ddf->virt->entries[i].state, + ddf->virt->entries[i].init_state); + } + dprintf_cont("\n"); +} +#else +static void pr_state(const struct ddf_super *ddf, const char *msg) {} +#endif + +static void _ddf_set_updates_pending(struct ddf_super *ddf, struct vd_config *vc, + const char *func) +{ + if (vc) { + vc->timestamp = cpu_to_be32(time(0)-DECADE); + vc->seqnum = cpu_to_be32(be32_to_cpu(vc->seqnum) + 1); + } + if (ddf->updates_pending) + return; + ddf->updates_pending = 1; + ddf->active->seq = cpu_to_be32((be32_to_cpu(ddf->active->seq)+1)); + pr_state(ddf, func); +} + +#define ddf_set_updates_pending(x,v) _ddf_set_updates_pending((x), (v), __func__) + +static be32 calc_crc(void *buf, int len) +{ + /* crcs are always at the same place as in the ddf_header */ + struct ddf_header *ddf = buf; + be32 oldcrc = ddf->crc; + __u32 newcrc; + ddf->crc = cpu_to_be32(0xffffffff); + + newcrc = crc32(0, buf, len); + ddf->crc = oldcrc; + /* The crc is stored (like everything) bigendian, so convert + * here for simplicity + */ + return cpu_to_be32(newcrc); +} + +#define DDF_INVALID_LEVEL 0xff +#define DDF_NO_SECONDARY 0xff +static int err_bad_md_layout(const mdu_array_info_t *array) +{ + pr_err("RAID%d layout %x with %d disks is unsupported for DDF\n", + array->level, array->layout, array->raid_disks); + return -1; +} + +static int layout_md2ddf(const mdu_array_info_t *array, + struct vd_config *conf) +{ + be16 prim_elmnt_count = cpu_to_be16(array->raid_disks); + __u8 prl = DDF_INVALID_LEVEL, rlq 
= 0; + __u8 sec_elmnt_count = 1; + __u8 srl = DDF_NO_SECONDARY; + + switch (array->level) { + case LEVEL_LINEAR: + prl = DDF_CONCAT; + break; + case 0: + rlq = DDF_RAID0_SIMPLE; + prl = DDF_RAID0; + break; + case 1: + switch (array->raid_disks) { + case 2: + rlq = DDF_RAID1_SIMPLE; + break; + case 3: + rlq = DDF_RAID1_MULTI; + break; + default: + return err_bad_md_layout(array); + } + prl = DDF_RAID1; + break; + case 4: + if (array->layout != 0) + return err_bad_md_layout(array); + rlq = DDF_RAID4_N; + prl = DDF_RAID4; + break; + case 5: + switch (array->layout) { + case ALGORITHM_LEFT_ASYMMETRIC: + rlq = DDF_RAID5_N_RESTART; + break; + case ALGORITHM_RIGHT_ASYMMETRIC: + rlq = DDF_RAID5_0_RESTART; + break; + case ALGORITHM_LEFT_SYMMETRIC: + rlq = DDF_RAID5_N_CONTINUE; + break; + case ALGORITHM_RIGHT_SYMMETRIC: + /* not mentioned in standard */ + default: + return err_bad_md_layout(array); + } + prl = DDF_RAID5; + break; + case 6: + switch (array->layout) { + case ALGORITHM_ROTATING_N_RESTART: + rlq = DDF_RAID5_N_RESTART; + break; + case ALGORITHM_ROTATING_ZERO_RESTART: + rlq = DDF_RAID6_0_RESTART; + break; + case ALGORITHM_ROTATING_N_CONTINUE: + rlq = DDF_RAID5_N_CONTINUE; + break; + default: + return err_bad_md_layout(array); + } + prl = DDF_RAID6; + break; + case 10: + if (array->raid_disks % 2 == 0 && array->layout == 0x102) { + rlq = DDF_RAID1_SIMPLE; + prim_elmnt_count = cpu_to_be16(2); + sec_elmnt_count = array->raid_disks / 2; + srl = DDF_2SPANNED; + prl = DDF_RAID1; + } else if (array->raid_disks % 3 == 0 && + array->layout == 0x103) { + rlq = DDF_RAID1_MULTI; + prim_elmnt_count = cpu_to_be16(3); + sec_elmnt_count = array->raid_disks / 3; + srl = DDF_2SPANNED; + prl = DDF_RAID1; + } else if (array->layout == 0x201) { + prl = DDF_RAID1E; + rlq = DDF_RAID1E_OFFSET; + } else if (array->layout == 0x102) { + prl = DDF_RAID1E; + rlq = DDF_RAID1E_ADJACENT; + } else + return err_bad_md_layout(array); + break; + default: + return err_bad_md_layout(array); + } + 
conf->prl = prl; + conf->prim_elmnt_count = prim_elmnt_count; + conf->rlq = rlq; + conf->srl = srl; + conf->sec_elmnt_count = sec_elmnt_count; + return 0; +} + +static int err_bad_ddf_layout(const struct vd_config *conf) +{ + pr_err("DDF RAID %u qualifier %u with %u disks is unsupported\n", + conf->prl, conf->rlq, be16_to_cpu(conf->prim_elmnt_count)); + return -1; +} + +static int layout_ddf2md(const struct vd_config *conf, + mdu_array_info_t *array) +{ + int level = LEVEL_UNSUPPORTED; + int layout = 0; + int raiddisks = be16_to_cpu(conf->prim_elmnt_count); + + if (conf->sec_elmnt_count > 1) { + /* see also check_secondary() */ + if (conf->prl != DDF_RAID1 || + (conf->srl != DDF_2STRIPED && conf->srl != DDF_2SPANNED)) { + pr_err("Unsupported secondary RAID level %u/%u\n", + conf->prl, conf->srl); + return -1; + } + if (raiddisks == 2 && conf->rlq == DDF_RAID1_SIMPLE) + layout = 0x102; + else if (raiddisks == 3 && conf->rlq == DDF_RAID1_MULTI) + layout = 0x103; + else + return err_bad_ddf_layout(conf); + raiddisks *= conf->sec_elmnt_count; + level = 10; + goto good; + } + + switch (conf->prl) { + case DDF_CONCAT: + level = LEVEL_LINEAR; + break; + case DDF_RAID0: + if (conf->rlq != DDF_RAID0_SIMPLE) + return err_bad_ddf_layout(conf); + level = 0; + break; + case DDF_RAID1: + if (!((conf->rlq == DDF_RAID1_SIMPLE && raiddisks == 2) || + (conf->rlq == DDF_RAID1_MULTI && raiddisks == 3))) + return err_bad_ddf_layout(conf); + level = 1; + break; + case DDF_RAID1E: + if (conf->rlq == DDF_RAID1E_ADJACENT) + layout = 0x102; + else if (conf->rlq == DDF_RAID1E_OFFSET) + layout = 0x201; + else + return err_bad_ddf_layout(conf); + level = 10; + break; + case DDF_RAID4: + if (conf->rlq != DDF_RAID4_N) + return err_bad_ddf_layout(conf); + level = 4; + break; + case DDF_RAID5: + switch (conf->rlq) { + case DDF_RAID5_N_RESTART: + layout = ALGORITHM_LEFT_ASYMMETRIC; + break; + case DDF_RAID5_0_RESTART: + layout = ALGORITHM_RIGHT_ASYMMETRIC; + break; + case DDF_RAID5_N_CONTINUE: + 
layout = ALGORITHM_LEFT_SYMMETRIC; + break; + default: + return err_bad_ddf_layout(conf); + } + level = 5; + break; + case DDF_RAID6: + switch (conf->rlq) { + case DDF_RAID5_N_RESTART: + layout = ALGORITHM_ROTATING_N_RESTART; + break; + case DDF_RAID6_0_RESTART: + layout = ALGORITHM_ROTATING_ZERO_RESTART; + break; + case DDF_RAID5_N_CONTINUE: + layout = ALGORITHM_ROTATING_N_CONTINUE; + break; + default: + return err_bad_ddf_layout(conf); + } + level = 6; + break; + default: + return err_bad_ddf_layout(conf); + }; + +good: + array->level = level; + array->layout = layout; + array->raid_disks = raiddisks; + return 0; +} + +static int load_ddf_header(int fd, unsigned long long lba, + unsigned long long size, + int type, + struct ddf_header *hdr, struct ddf_header *anchor) +{ + /* read a ddf header (primary or secondary) from fd/lba + * and check that it is consistent with anchor + * Need to check: + * magic, crc, guid, rev, and LBA's header_type, and + * everything after header_type must be the same + */ + if (lba >= size-1) + return 0; + + if (lseek64(fd, lba<<9, 0) < 0) + return 0; + + if (read(fd, hdr, 512) != 512) + return 0; + + if (!be32_eq(hdr->magic, DDF_HEADER_MAGIC)) { + pr_err("bad header magic\n"); + return 0; + } + if (!be32_eq(calc_crc(hdr, 512), hdr->crc)) { + pr_err("bad CRC\n"); + return 0; + } + if (memcmp(anchor->guid, hdr->guid, DDF_GUID_LEN) != 0 || + memcmp(anchor->revision, hdr->revision, 8) != 0 || + !be64_eq(anchor->primary_lba, hdr->primary_lba) || + !be64_eq(anchor->secondary_lba, hdr->secondary_lba) || + hdr->type != type || + memcmp(anchor->pad2, hdr->pad2, 512 - + offsetof(struct ddf_header, pad2)) != 0) { + pr_err("header mismatch\n"); + return 0; + } + + /* Looks good enough to me... 
*/ + return 1; +} + +static void *load_section(int fd, struct ddf_super *super, void *buf, + be32 offset_be, be32 len_be, int check) +{ + unsigned long long offset = be32_to_cpu(offset_be); + unsigned long long len = be32_to_cpu(len_be); + int dofree = (buf == NULL); + + if (check) + if (len != 2 && len != 8 && len != 32 && + len != 128 && len != 512) + return NULL; + + if (len > 1024) + return NULL; + if (!buf && posix_memalign(&buf, 512, len<<9) != 0) + buf = NULL; + + if (!buf) + return NULL; + + if (super->active->type == 1) + offset += be64_to_cpu(super->active->primary_lba); + else + offset += be64_to_cpu(super->active->secondary_lba); + + if ((unsigned long long)lseek64(fd, offset<<9, 0) != (offset<<9)) { + if (dofree) + free(buf); + return NULL; + } + if ((unsigned long long)read(fd, buf, len<<9) != (len<<9)) { + if (dofree) + free(buf); + return NULL; + } + return buf; +} + +/* Read and validate the anchor block in the last sector of the device, + * then try the primary and secondary headers it points to and select + * one of them as super->active. + * Returns 0 on success, 1 if the device cannot be sized, seeked or read, + * and 2 if the anchor is not acceptable or no header could be loaded. + */ +static int load_ddf_headers(int fd, struct ddf_super *super, char *devname) +{ + unsigned long long dsize; + + /* get_dev_size() returns 0 on failure, in which case dsize must not + * be used; a device shorter than one sector cannot hold an anchor + * and would make dsize-512 wrap around. + */ + if (!get_dev_size(fd, NULL, &dsize) || dsize < 512) + return 1; + + if (lseek64(fd, dsize-512, 0) < 0) { + if (devname) + pr_err("Cannot seek to anchor block on %s: %s\n", + devname, strerror(errno)); + return 1; + } + if (read(fd, &super->anchor, 512) != 512) { + if (devname) + pr_err("Cannot read anchor block on %s: %s\n", + devname, strerror(errno)); + return 1; + } + if (!be32_eq(super->anchor.magic, DDF_HEADER_MAGIC)) { + if (devname) + pr_err("no DDF anchor found on %s\n", + devname); + return 2; + } + if (!be32_eq(calc_crc(&super->anchor, 512), super->anchor.crc)) { + if (devname) + pr_err("bad CRC on anchor on %s\n", + devname); + return 2; + } + if (memcmp(super->anchor.revision, DDF_REVISION_0, 8) != 0 && + memcmp(super->anchor.revision, DDF_REVISION_2, 8) != 0) { + if (devname) + pr_err("can only support super revision %.8s and earlier, not %.8s on %s\n", + DDF_REVISION_2, super->anchor.revision,devname); + return 2; + } + super->active = NULL; + if (load_ddf_header(fd,
be64_to_cpu(super->anchor.primary_lba), + dsize >> 9, 1, + &super->primary, &super->anchor) == 0) { + if (devname) + pr_err("Failed to load primary DDF header on %s\n", devname); + } else + super->active = &super->primary; + + if (load_ddf_header(fd, be64_to_cpu(super->anchor.secondary_lba), + dsize >> 9, 2, + &super->secondary, &super->anchor)) { + if (super->active == NULL || + (be32_to_cpu(super->primary.seq) + < be32_to_cpu(super->secondary.seq) && + !super->secondary.openflag) || + (be32_to_cpu(super->primary.seq) == + be32_to_cpu(super->secondary.seq) && + super->primary.openflag && !super->secondary.openflag)) + super->active = &super->secondary; + } else if (devname && + be64_to_cpu(super->anchor.secondary_lba) != ~(__u64)0) + pr_err("Failed to load secondary DDF header on %s\n", + devname); + if (super->active == NULL) + return 2; + return 0; +} + +static int load_ddf_global(int fd, struct ddf_super *super, char *devname) +{ + void *ok; + ok = load_section(fd, super, &super->controller, + super->active->controller_section_offset, + super->active->controller_section_length, + 0); + super->phys = load_section(fd, super, NULL, + super->active->phys_section_offset, + super->active->phys_section_length, + 1); + super->pdsize = be32_to_cpu(super->active->phys_section_length) * 512; + + super->virt = load_section(fd, super, NULL, + super->active->virt_section_offset, + super->active->virt_section_length, + 1); + super->vdsize = be32_to_cpu(super->active->virt_section_length) * 512; + if (!ok || + !super->phys || + !super->virt) { + free(super->phys); + free(super->virt); + super->phys = NULL; + super->virt = NULL; + return 2; + } + super->conflist = NULL; + super->dlist = NULL; + + super->max_part = be16_to_cpu(super->active->max_partitions); + super->mppe = be16_to_cpu(super->active->max_primary_element_entries); + super->conf_rec_len = be16_to_cpu(super->active->config_record_len); + return 0; +} + +#define DDF_UNUSED_BVD 0xff +static int alloc_other_bvds(const 
struct ddf_super *ddf, struct vcl *vcl) +{ + unsigned int n_vds = vcl->conf.sec_elmnt_count - 1; + unsigned int i, vdsize; + void *p; + if (n_vds == 0) { + vcl->other_bvds = NULL; + return 0; + } + vdsize = ddf->conf_rec_len * 512; + if (posix_memalign(&p, 512, n_vds * + (vdsize + sizeof(struct vd_config *))) != 0) + return -1; + vcl->other_bvds = (struct vd_config **) (p + n_vds * vdsize); + for (i = 0; i < n_vds; i++) { + vcl->other_bvds[i] = p + i * vdsize; + memset(vcl->other_bvds[i], 0, vdsize); + vcl->other_bvds[i]->sec_elmnt_seq = DDF_UNUSED_BVD; + } + return 0; +} + +static void add_other_bvd(struct vcl *vcl, struct vd_config *vd, + unsigned int len) +{ + int i; + for (i = 0; i < vcl->conf.sec_elmnt_count-1; i++) + if (vcl->other_bvds[i]->sec_elmnt_seq == vd->sec_elmnt_seq) + break; + + if (i < vcl->conf.sec_elmnt_count-1) { + if (be32_to_cpu(vd->seqnum) <= + be32_to_cpu(vcl->other_bvds[i]->seqnum)) + return; + } else { + for (i = 0; i < vcl->conf.sec_elmnt_count-1; i++) + if (vcl->other_bvds[i]->sec_elmnt_seq == DDF_UNUSED_BVD) + break; + if (i == vcl->conf.sec_elmnt_count-1) { + pr_err("no space for sec level config %u, count is %u\n", + vd->sec_elmnt_seq, vcl->conf.sec_elmnt_count); + return; + } + } + memcpy(vcl->other_bvds[i], vd, len); +} + +static int load_ddf_local(int fd, struct ddf_super *super, + char *devname, int keep) +{ + struct dl *dl; + struct stat stb; + char *conf; + unsigned int i; + unsigned int confsec; + int vnum; + unsigned int max_virt_disks = + be16_to_cpu(super->active->max_vd_entries); + unsigned long long dsize; + + /* First the local disk info */ + if (posix_memalign((void**)&dl, 512, + sizeof(*dl) + + (super->max_part) * sizeof(dl->vlist[0])) != 0) { + pr_err("could not allocate disk info buffer\n"); + return 1; + } + + load_section(fd, super, &dl->disk, + super->active->data_section_offset, + super->active->data_section_length, + 0); + dl->devname = devname ? 
xstrdup(devname) : NULL; + + fstat(fd, &stb); + dl->major = major(stb.st_rdev); + dl->minor = minor(stb.st_rdev); + dl->next = super->dlist; + dl->fd = keep ? fd : -1; + + dl->size = 0; + if (get_dev_size(fd, devname, &dsize)) + dl->size = dsize >> 9; + /* If the disks have different sizes, the LBAs will differ + * between phys disks. + * At this point here, the values in super->active must be valid + * for this phys disk. */ + dl->primary_lba = super->active->primary_lba; + dl->secondary_lba = super->active->secondary_lba; + dl->workspace_lba = super->active->workspace_lba; + dl->spare = NULL; + for (i = 0 ; i < super->max_part ; i++) + dl->vlist[i] = NULL; + super->dlist = dl; + dl->pdnum = -1; + for (i = 0; i < be16_to_cpu(super->active->max_pd_entries); i++) + if (memcmp(super->phys->entries[i].guid, + dl->disk.guid, DDF_GUID_LEN) == 0) + dl->pdnum = i; + + /* Now the config list. */ + /* 'conf' is an array of config entries, some of which are + * probably invalid. Those which are good need to be copied into + * the conflist + */ + + conf = load_section(fd, super, super->conf, + super->active->config_section_offset, + super->active->config_section_length, + 0); + super->conf = conf; + vnum = 0; + for (confsec = 0; + confsec < be32_to_cpu(super->active->config_section_length); + confsec += super->conf_rec_len) { + struct vd_config *vd = + (struct vd_config *)((char*)conf + confsec*512); + struct vcl *vcl; + + if (be32_eq(vd->magic, DDF_SPARE_ASSIGN_MAGIC)) { + if (dl->spare) + continue; + if (posix_memalign((void**)&dl->spare, 512, + super->conf_rec_len*512) != 0) { + pr_err("could not allocate spare info buf\n"); + return 1; + } + + memcpy(dl->spare, vd, super->conf_rec_len*512); + continue; + } + if (!be32_eq(vd->magic, DDF_VD_CONF_MAGIC)) + /* Must be vendor-unique - I cannot handle those */ + continue; + + for (vcl = super->conflist; vcl; vcl = vcl->next) { + if (memcmp(vcl->conf.guid, + vd->guid, DDF_GUID_LEN) == 0) + break; + } + + if (vcl) { + 
dl->vlist[vnum++] = vcl; + if (vcl->other_bvds != NULL && + vcl->conf.sec_elmnt_seq != vd->sec_elmnt_seq) { + add_other_bvd(vcl, vd, super->conf_rec_len*512); + continue; + } + if (be32_to_cpu(vd->seqnum) <= + be32_to_cpu(vcl->conf.seqnum)) + continue; + } else { + if (posix_memalign((void**)&vcl, 512, + (super->conf_rec_len*512 + + offsetof(struct vcl, conf))) != 0) { + pr_err("could not allocate vcl buf\n"); + return 1; + } + vcl->next = super->conflist; + vcl->block_sizes = NULL; /* FIXME not for CONCAT */ + vcl->conf.sec_elmnt_count = vd->sec_elmnt_count; + if (alloc_other_bvds(super, vcl) != 0) { + pr_err("could not allocate other bvds\n"); + free(vcl); + return 1; + }; + super->conflist = vcl; + dl->vlist[vnum++] = vcl; + } + memcpy(&vcl->conf, vd, super->conf_rec_len*512); + for (i=0; i < max_virt_disks ; i++) + if (memcmp(super->virt->entries[i].guid, + vcl->conf.guid, DDF_GUID_LEN)==0) + break; + if (i < max_virt_disks) + vcl->vcnum = i; + } + + return 0; +} + +static int load_super_ddf(struct supertype *st, int fd, + char *devname) +{ + unsigned long long dsize; + struct ddf_super *super; + int rv; + + if (get_dev_size(fd, devname, &dsize) == 0) + return 1; + + if (test_partition(fd)) + /* DDF is not allowed on partitions */ + return 1; + + /* 32M is a lower bound */ + if (dsize <= 32*1024*1024) { + if (devname) + pr_err("%s is too small for ddf: size is %llu sectors.\n", + devname, dsize>>9); + return 1; + } + if (dsize & 511) { + if (devname) + pr_err("%s is an odd size for ddf: size is %llu bytes.\n", + devname, dsize); + return 1; + } + + free_super_ddf(st); + + if (posix_memalign((void**)&super, 512, sizeof(*super))!= 0) { + pr_err("malloc of %zu failed.\n", + sizeof(*super)); + return 1; + } + memset(super, 0, sizeof(*super)); + + rv = load_ddf_headers(fd, super, devname); + if (rv) { + free(super); + return rv; + } + + /* Have valid headers and have chosen the best. 
Let's read in the rest*/ + + rv = load_ddf_global(fd, super, devname); + + if (rv) { + if (devname) + pr_err("Failed to load all information sections on %s\n", devname); + /* load_ddf_global() releases phys/virt itself on failure */ + free(super); + return rv; + } + + rv = load_ddf_local(fd, super, devname, 0); + + if (rv) { + if (devname) + pr_err("Failed to load all information sections on %s\n", devname); + /* The global sections and any disk/config entries linked so + * far hang off super; free_super_ddf() releases all of them + * together with super and resets st->sb to NULL. + */ + st->sb = super; + free_super_ddf(st); + return rv; + } + + /* Should possibly check the sections .... */ + + st->sb = super; + if (st->ss == NULL) { + st->ss = &super_ddf; + st->minor_version = 0; + st->max_devs = 512; + } + return 0; + +} + +static void free_super_ddf(struct supertype *st) +{ + struct ddf_super *ddf = st->sb; + if (ddf == NULL) + return; + free(ddf->phys); + free(ddf->virt); + free(ddf->conf); + while (ddf->conflist) { + struct vcl *v = ddf->conflist; + ddf->conflist = v->next; + if (v->block_sizes) + free(v->block_sizes); + if (v->other_bvds) + /* + v->other_bvds[0] points to beginning of buffer, + see alloc_other_bvds() + */ + free(v->other_bvds[0]); + free(v); + } + while (ddf->dlist) { + struct dl *d = ddf->dlist; + ddf->dlist = d->next; + if (d->fd >= 0) + close(d->fd); + if (d->spare) + free(d->spare); + free(d); + } + while (ddf->add_list) { + struct dl *d = ddf->add_list; + ddf->add_list = d->next; + if (d->fd >= 0) + close(d->fd); + if (d->spare) + free(d->spare); + free(d); + } + free(ddf); + st->sb = NULL; +} + +static struct supertype *match_metadata_desc_ddf(char *arg) +{ + /* 'ddf' only supports containers */ + struct supertype *st; + if (strcmp(arg, "ddf") != 0 && + strcmp(arg, "default") != 0 + ) + return NULL; + + st = xcalloc(1, sizeof(*st)); + st->ss = &super_ddf; + st->max_devs = 512; + st->minor_version = 0; + st->sb = NULL; + return st; +} + +static mapping_t ddf_state[] = { + { "Optimal", 0}, + { "Degraded", 1}, + { "Deleted", 2}, + { "Missing", 3}, + { "Failed", 4}, + { "Partially Optimal", 5}, + { "-reserved-", 6}, + { "-reserved-", 7}, + { NULL, 0} +}; + +static mapping_t ddf_init_state[] = { + { "Not
Initialised", 0}, + { "QuickInit in Progress", 1}, + { "Fully Initialised", 2}, + { "*UNKNOWN*", 3}, + { NULL, 0} +}; +static mapping_t ddf_access[] = { + { "Read/Write", 0}, + { "Reserved", 1}, + { "Read Only", 2}, + { "Blocked (no access)", 3}, + { NULL ,0} +}; + +static mapping_t ddf_level[] = { + { "RAID0", DDF_RAID0}, + { "RAID1", DDF_RAID1}, + { "RAID3", DDF_RAID3}, + { "RAID4", DDF_RAID4}, + { "RAID5", DDF_RAID5}, + { "RAID1E",DDF_RAID1E}, + { "JBOD", DDF_JBOD}, + { "CONCAT",DDF_CONCAT}, + { "RAID5E",DDF_RAID5E}, + { "RAID5EE",DDF_RAID5EE}, + { "RAID6", DDF_RAID6}, + { NULL, 0} +}; +static mapping_t ddf_sec_level[] = { + { "Striped", DDF_2STRIPED}, + { "Mirrored", DDF_2MIRRORED}, + { "Concat", DDF_2CONCAT}, + { "Spanned", DDF_2SPANNED}, + { NULL, 0} +}; + +static int all_ff(const char *guid) +{ + int i; + for (i = 0; i < DDF_GUID_LEN; i++) + if (guid[i] != (char)0xff) + return 0; + return 1; +} + +static const char *guid_str(const char *guid) +{ + static char buf[DDF_GUID_LEN*2+1]; + int i; + char *p = buf; + for (i = 0; i < DDF_GUID_LEN; i++) { + unsigned char c = guid[i]; + if (c >= 32 && c < 127) + p += sprintf(p, "%c", c); + else + p += sprintf(p, "%02x", c); + } + *p = '\0'; + return (const char *) buf; +} + +static void print_guid(char *guid, int tstamp) +{ + /* A GUIDs are part (or all) ASCII and part binary. + * They tend to be space padded. 
+ * We print the GUID in HEX, then in parentheses add + * any initial ASCII sequence, and a possible + * time stamp from bytes 16-19 + */ + int l = DDF_GUID_LEN; + int i; + + for (i=0 ; i<DDF_GUID_LEN ; i++) { + if ((i&3)==0 && i != 0) printf(":"); + printf("%02X", guid[i]&255); + } + + printf("\n ("); + while (l && guid[l-1] == ' ') + l--; + for (i=0 ; i<l ; i++) { + if (guid[i] >= 0x20 && guid[i] < 0x7f) + fputc(guid[i], stdout); + else + break; + } + if (tstamp) { + time_t then = __be32_to_cpu(*(__u32*)(guid+16)) + DECADE; + char tbuf[100]; + struct tm *tm; + tm = localtime(&then); + strftime(tbuf, 100, " %D %T",tm); + fputs(tbuf, stdout); + } + printf(")"); +} + +static void examine_vd(int n, struct ddf_super *sb, char *guid) +{ + int crl = sb->conf_rec_len; + struct vcl *vcl; + + for (vcl = sb->conflist ; vcl ; vcl = vcl->next) { + unsigned int i; + struct vd_config *vc = &vcl->conf; + + if (!be32_eq(calc_crc(vc, crl*512), vc->crc)) + continue; + if (memcmp(vc->guid, guid, DDF_GUID_LEN) != 0) + continue; + + /* Ok, we know about this VD, let's give more details */ + printf(" Raid Devices[%d] : %d (", n, + be16_to_cpu(vc->prim_elmnt_count)); + for (i = 0; i < be16_to_cpu(vc->prim_elmnt_count); i++) { + int j; + int cnt = be16_to_cpu(sb->phys->max_pdes); + for (j=0; j<cnt; j++) + if (be32_eq(vc->phys_refnum[i], + sb->phys->entries[j].refnum)) + break; + if (i) printf(" "); + if (j < cnt) + printf("%d", j); + else + printf("--"); + printf("@%lluK", (unsigned long long) be64_to_cpu(LBA_OFFSET(sb, vc)[i])/2); + } + printf(")\n"); + if (vc->chunk_shift != 255) + printf(" Chunk Size[%d] : %d sectors\n", n, + 1 << vc->chunk_shift); + printf(" Raid Level[%d] : %s\n", n, + map_num(ddf_level, vc->prl)?:"-unknown-"); + if (vc->sec_elmnt_count != 1) { + printf(" Secondary Position[%d] : %d of %d\n", n, + vc->sec_elmnt_seq, vc->sec_elmnt_count); + printf(" Secondary Level[%d] : %s\n", n, + map_num(ddf_sec_level, vc->srl) ?: "-unknown-"); + } + printf(" Device Size[%d] : 
%llu\n", n, + be64_to_cpu(vc->blocks)/2); + printf(" Array Size[%d] : %llu\n", n, + be64_to_cpu(vc->array_blocks)/2); + } +} + +static void examine_vds(struct ddf_super *sb) +{ + int cnt = be16_to_cpu(sb->virt->populated_vdes); + unsigned int i; + printf(" Virtual Disks : %d\n", cnt); + + for (i = 0; i < be16_to_cpu(sb->virt->max_vdes); i++) { + struct virtual_entry *ve = &sb->virt->entries[i]; + if (all_ff(ve->guid)) + continue; + printf("\n"); + printf(" VD GUID[%d] : ", i); print_guid(ve->guid, 1); + printf("\n"); + printf(" unit[%d] : %d\n", i, be16_to_cpu(ve->unit)); + printf(" state[%d] : %s, %s%s\n", i, + map_num(ddf_state, ve->state & 7), + (ve->state & DDF_state_morphing) ? "Morphing, ": "", + (ve->state & DDF_state_inconsistent)? "Not Consistent" : "Consistent"); + printf(" init state[%d] : %s\n", i, + map_num(ddf_init_state, ve->init_state&DDF_initstate_mask)); + printf(" access[%d] : %s\n", i, + map_num(ddf_access, (ve->init_state & DDF_access_mask) >> 6)); + printf(" Name[%d] : %.16s\n", i, ve->name); + examine_vd(i, sb, ve->guid); + } + if (cnt) printf("\n"); +} + +static void examine_pds(struct ddf_super *sb) +{ + int cnt = be16_to_cpu(sb->phys->max_pdes); + int i; + struct dl *dl; + int unlisted = 0; + printf(" Physical Disks : %d\n", cnt); + printf(" Number RefNo Size Device Type/State\n"); + + for (dl = sb->dlist; dl; dl = dl->next) + dl->displayed = 0; + + for (i=0 ; i<cnt ; i++) { + struct phys_disk_entry *pd = &sb->phys->entries[i]; + int type = be16_to_cpu(pd->type); + int state = be16_to_cpu(pd->state); + + if (be32_to_cpu(pd->refnum) == 0xffffffff) + /* Not in use */ + continue; + //printf(" PD GUID[%d] : ", i); print_guid(pd->guid, 0); + //printf("\n"); + printf(" %3d %08x ", i, + be32_to_cpu(pd->refnum)); + printf("%8lluK ", + be64_to_cpu(pd->config_size)>>1); + for (dl = sb->dlist; dl ; dl = dl->next) { + if (be32_eq(dl->disk.refnum, pd->refnum)) { + char *dv = map_dev(dl->major, dl->minor, 0); + if (dv) { + printf("%-15s", dv); + break; 
+ } + } + } + if (!dl) + printf("%15s",""); + else + dl->displayed = 1; + printf(" %s%s%s%s%s", + (type&2) ? "active":"", + (type&4) ? "Global-Spare":"", + (type&8) ? "spare" : "", + (type&16)? ", foreign" : "", + (type&32)? "pass-through" : ""); + if (state & DDF_Failed) + /* This over-rides these three */ + state &= ~(DDF_Online|DDF_Rebuilding|DDF_Transition); + printf("/%s%s%s%s%s%s%s", + (state&1)? "Online": "Offline", + (state&2)? ", Failed": "", + (state&4)? ", Rebuilding": "", + (state&8)? ", in-transition": "", + (state&16)? ", SMART-errors": "", + (state&32)? ", Unrecovered-Read-Errors": "", + (state&64)? ", Missing" : ""); + printf("\n"); + } + for (dl = sb->dlist; dl; dl = dl->next) { + char *dv; + if (dl->displayed) + continue; + if (!unlisted) + printf(" Physical disks not in metadata!:\n"); + unlisted = 1; + dv = map_dev(dl->major, dl->minor, 0); + printf(" %08x %s\n", be32_to_cpu(dl->disk.refnum), + dv ? dv : "-unknown-"); + } + if (unlisted) + printf("\n"); +} + +static void examine_super_ddf(struct supertype *st, char *homehost) +{ + struct ddf_super *sb = st->sb; + + printf(" Magic : %08x\n", be32_to_cpu(sb->anchor.magic)); + printf(" Version : %.8s\n", sb->anchor.revision); + printf("Controller GUID : "); print_guid(sb->controller.guid, 0); + printf("\n"); + printf(" Container GUID : "); print_guid(sb->anchor.guid, 1); + printf("\n"); + printf(" Seq : %08x\n", be32_to_cpu(sb->active->seq)); + printf(" Redundant hdr : %s\n", (be32_eq(sb->secondary.magic, + DDF_HEADER_MAGIC) + ?"yes" : "no")); + examine_vds(sb); + examine_pds(sb); +} + +static unsigned int get_vd_num_of_subarray(struct supertype *st) +{ + /* + * Figure out the VD number for this supertype. + * Returns DDF_CONTAINER for the container itself, + * and DDF_NOTFOUND on error. 
+ */ + struct ddf_super *ddf = st->sb; + struct mdinfo *sra; + char *sub, *end; + unsigned int vcnum = DDF_NOTFOUND; + + if (*st->container_devnm == '\0') + return DDF_CONTAINER; + + sra = sysfs_read(-1, st->devnm, GET_VERSION); + if (!sra || sra->array.major_version != -1 || + sra->array.minor_version != -2 || + !is_subarray(sra->text_version)) + goto out; + + /* the VD index is the decimal number after the second '/' */ + sub = strchr(sra->text_version + 1, '/'); + if (sub == NULL) + goto out; + vcnum = strtoul(sub + 1, &end, 10); + /* reject an empty index (no digits consumed), trailing junk, + * and an index beyond the virtual disk table + */ + if (end == sub + 1 || *end != '\0' || + vcnum >= be16_to_cpu(ddf->active->max_vd_entries)) + vcnum = DDF_NOTFOUND; +out: + /* the mdinfo from sysfs_read() is ours to release on every path */ + if (sra) + sysfs_free(sra); + return vcnum; +} + +static void brief_examine_super_ddf(struct supertype *st, int verbose) +{ + /* We just write a generic DDF ARRAY entry + */ + struct mdinfo info; + char nbuf[64]; + getinfo_super_ddf(st, &info, NULL); + fname_from_uuid(st, &info, nbuf, ':'); + + printf("ARRAY metadata=ddf UUID=%s\n", nbuf + 5); +} + +static void brief_examine_subarrays_ddf(struct supertype *st, int verbose) +{ + /* We write a DDF ARRAY member entry for each vd, identifying container + * by uuid and member by unit number and uuid. + */ + struct ddf_super *ddf = st->sb; + struct mdinfo info; + unsigned int i; + char nbuf[64]; + getinfo_super_ddf(st, &info, NULL); + fname_from_uuid(st, &info, nbuf, ':'); + + for (i = 0; i < be16_to_cpu(ddf->virt->max_vdes); i++) { + struct virtual_entry *ve = &ddf->virt->entries[i]; + struct vcl vcl; + char nbuf1[64]; + char namebuf[17]; + if (all_ff(ve->guid)) + continue; + memcpy(vcl.conf.guid, ve->guid, DDF_GUID_LEN); + ddf->currentconf =&vcl; + vcl.vcnum = i; + uuid_from_super_ddf(st, info.uuid); + fname_from_uuid(st, &info, nbuf1, ':'); + _ddf_array_name(namebuf, ddf, i); + printf("ARRAY%s%s container=%s member=%d UUID=%s\n", + namebuf[0] == '\0' ?
"" : " /dev/md/", namebuf, + nbuf+5, i, nbuf1+5); + } +} + +static void export_examine_super_ddf(struct supertype *st) +{ + struct mdinfo info; + char nbuf[64]; + getinfo_super_ddf(st, &info, NULL); + fname_from_uuid(st, &info, nbuf, ':'); + printf("MD_METADATA=ddf\n"); + printf("MD_LEVEL=container\n"); + printf("MD_UUID=%s\n", nbuf+5); + printf("MD_DEVICES=%u\n", + be16_to_cpu(((struct ddf_super *)st->sb)->phys->used_pdes)); +} + +static int copy_metadata_ddf(struct supertype *st, int from, int to) +{ + void *buf; + unsigned long long dsize, offset; + int bytes; + struct ddf_header *ddf; + int written = 0; + + /* The meta consists of an anchor, a primary, and a secondary. + * This all lives at the end of the device. + * So it is easiest to find the earliest of primary and + * secondary, and copy everything from there. + * + * Anchor is 512 from end. It contains primary_lba and secondary_lba + * we choose one of those + */ + + if (posix_memalign(&buf, 4096, 4096) != 0) + return 1; + + if (!get_dev_size(from, NULL, &dsize)) + goto err; + + if (lseek64(from, dsize-512, 0) < 0) + goto err; + if (read(from, buf, 512) != 512) + goto err; + ddf = buf; + if (!be32_eq(ddf->magic, DDF_HEADER_MAGIC) || + !be32_eq(calc_crc(ddf, 512), ddf->crc) || + (memcmp(ddf->revision, DDF_REVISION_0, 8) != 0 && + memcmp(ddf->revision, DDF_REVISION_2, 8) != 0)) + goto err; + + offset = dsize - 512; + if ((be64_to_cpu(ddf->primary_lba) << 9) < offset) + offset = be64_to_cpu(ddf->primary_lba) << 9; + if ((be64_to_cpu(ddf->secondary_lba) << 9) < offset) + offset = be64_to_cpu(ddf->secondary_lba) << 9; + + bytes = dsize - offset; + + if (lseek64(from, offset, 0) < 0 || + lseek64(to, offset, 0) < 0) + goto err; + while (written < bytes) { + int n = bytes - written; + if (n > 4096) + n = 4096; + if (read(from, buf, n) != n) + goto err; + if (write(to, buf, n) != n) + goto err; + written += n; + } + free(buf); + return 0; +err: + free(buf); + return 1; +} + +static void detail_super_ddf(struct 
supertype *st, char *homehost, + char *subarray) +{ + struct ddf_super *sb = st->sb; + int cnt = be16_to_cpu(sb->virt->populated_vdes); + + printf(" Container GUID : "); print_guid(sb->anchor.guid, 1); + printf("\n"); + printf(" Seq : %08x\n", be32_to_cpu(sb->active->seq)); + printf(" Virtual Disks : %d\n", cnt); + printf("\n"); +} + +static const char *vendors_with_variable_volume_UUID[] = { + "LSI ", +}; + +static int volume_id_is_reliable(const struct ddf_super *ddf) +{ + int n = ARRAY_SIZE(vendors_with_variable_volume_UUID); + int i; + for (i = 0; i < n; i++) + if (!memcmp(ddf->controller.guid, + vendors_with_variable_volume_UUID[i], 8)) + return 0; + return 1; +} + +static void uuid_of_ddf_subarray(const struct ddf_super *ddf, + unsigned int vcnum, int uuid[4]) +{ + char buf[DDF_GUID_LEN+18], sha[20], *p; + struct sha1_ctx ctx; + if (volume_id_is_reliable(ddf)) { + uuid_from_ddf_guid(ddf->virt->entries[vcnum].guid, uuid); + return; + } + /* + * Some fake RAID BIOSes (in particular, LSI ones) change the + * VD GUID at every boot. These GUIDs are not suitable for + * identifying an array. Luckily the header GUID appears to + * remain constant. + * We construct a pseudo-UUID from the header GUID and those + * properties of the subarray that we expect to remain constant. 
+ */ + memset(buf, 0, sizeof(buf)); + p = buf; + memcpy(p, ddf->anchor.guid, DDF_GUID_LEN); + p += DDF_GUID_LEN; + memcpy(p, ddf->virt->entries[vcnum].name, 16); + p += 16; + *((__u16 *) p) = vcnum; + sha1_init_ctx(&ctx); + sha1_process_bytes(buf, sizeof(buf), &ctx); + sha1_finish_ctx(&ctx, sha); + memcpy(uuid, sha, 4*4); +} + +static void brief_detail_super_ddf(struct supertype *st, char *subarray) +{ + struct mdinfo info; + char nbuf[64]; + struct ddf_super *ddf = st->sb; + unsigned int vcnum = get_vd_num_of_subarray(st); + if (vcnum == DDF_CONTAINER) + uuid_from_super_ddf(st, info.uuid); + else if (vcnum == DDF_NOTFOUND) + return; + else + uuid_of_ddf_subarray(ddf, vcnum, info.uuid); + fname_from_uuid(st, &info, nbuf,':'); + printf(" UUID=%s", nbuf + 5); +} + +static int match_home_ddf(struct supertype *st, char *homehost) +{ + /* It matches 'this' host if the controller is a + * Linux-MD controller with vendor_data matching + * the hostname. It would be nice if we could + * test against controller found in /sys or somewhere... + */ + struct ddf_super *ddf = st->sb; + unsigned int len; + + if (!homehost) + return 0; + len = strlen(homehost); + + return (memcmp(ddf->controller.guid, T10, 8) == 0 && + len < sizeof(ddf->controller.vendor_data) && + memcmp(ddf->controller.vendor_data, homehost,len) == 0 && + ddf->controller.vendor_data[len] == 0); +} + +static int find_index_in_bvd(const struct ddf_super *ddf, + const struct vd_config *conf, unsigned int n, + unsigned int *n_bvd) +{ + /* + * Find the index of the n-th valid physical disk in this BVD. + * Unused entries can be sprinkled in with the used entries, + * but don't count. 
+ */ + unsigned int i, j; + for (i = 0, j = 0; + i < ddf->mppe && j < be16_to_cpu(conf->prim_elmnt_count); + i++) { + if (be32_to_cpu(conf->phys_refnum[i]) != 0xffffffff) { + if (n == j) { + *n_bvd = i; + return 1; + } + j++; + } + } + dprintf("couldn't find BVD member %u (total %u)\n", + n, be16_to_cpu(conf->prim_elmnt_count)); + return 0; +} + +/* Given a member array instance number, and a raid disk within that instance, + * find the vd_config structure. The offset of the given disk in the phys_refnum + * table is returned in n_bvd. + * For two-level members with a secondary raid level the vd_config for + * the appropriate BVD is returned. + * The return value is always &vlc->conf, where vlc is returned in last pointer. + */ +static struct vd_config *find_vdcr(struct ddf_super *ddf, unsigned int inst, + unsigned int n, + unsigned int *n_bvd, struct vcl **vcl) +{ + struct vcl *v; + + for (v = ddf->conflist; v; v = v->next) { + unsigned int nsec, ibvd = 0; + struct vd_config *conf; + if (inst != v->vcnum) + continue; + conf = &v->conf; + if (conf->sec_elmnt_count == 1) { + if (find_index_in_bvd(ddf, conf, n, n_bvd)) { + *vcl = v; + return conf; + } else + goto bad; + } + if (v->other_bvds == NULL) { + pr_err("BUG: other_bvds is NULL, nsec=%u\n", + conf->sec_elmnt_count); + goto bad; + } + nsec = n / be16_to_cpu(conf->prim_elmnt_count); + if (conf->sec_elmnt_seq != nsec) { + for (ibvd = 1; ibvd < conf->sec_elmnt_count; ibvd++) { + if (v->other_bvds[ibvd-1]->sec_elmnt_seq == + nsec) + break; + } + if (ibvd == conf->sec_elmnt_count) + goto bad; + conf = v->other_bvds[ibvd-1]; + } + if (!find_index_in_bvd(ddf, conf, + n - nsec*conf->sec_elmnt_count, n_bvd)) + goto bad; + dprintf("found disk %u as member %u in bvd %d of array %u\n", + n, *n_bvd, ibvd, inst); + *vcl = v; + return conf; + } +bad: + pr_err("Couldn't find disk %d in array %u\n", n, inst); + return NULL; +} + +static int find_phys(const struct ddf_super *ddf, be32 phys_refnum) +{ + /* Find the entry in 
phys_disk which has the given refnum + * and return it's index + */ + unsigned int i; + for (i = 0; i < be16_to_cpu(ddf->phys->max_pdes); i++) + if (be32_eq(ddf->phys->entries[i].refnum, phys_refnum)) + return i; + return -1; +} + +static void uuid_from_ddf_guid(const char *guid, int uuid[4]) +{ + char buf[20]; + struct sha1_ctx ctx; + sha1_init_ctx(&ctx); + sha1_process_bytes(guid, DDF_GUID_LEN, &ctx); + sha1_finish_ctx(&ctx, buf); + memcpy(uuid, buf, 4*4); +} + +static void uuid_from_super_ddf(struct supertype *st, int uuid[4]) +{ + /* The uuid returned here is used for: + * uuid to put into bitmap file (Create, Grow) + * uuid for backup header when saving critical section (Grow) + * comparing uuids when re-adding a device into an array + * In these cases the uuid required is that of the data-array, + * not the device-set. + * uuid to recognise same set when adding a missing device back + * to an array. This is a uuid for the device-set. + * + * For each of these we can make do with a truncated + * or hashed uuid rather than the original, as long as + * everyone agrees. + * In the case of SVD we assume the BVD is of interest, + * though that might be the case if a bitmap were made for + * a mirrored SVD - worry about that later. + * So we need to find the VD configuration record for the + * relevant BVD and extract the GUID and Secondary_Element_Seq. + * The first 16 bytes of the sha1 of these is used. 
+ */ + struct ddf_super *ddf = st->sb; + struct vcl *vcl = ddf->currentconf; + + if (vcl) + uuid_of_ddf_subarray(ddf, vcl->vcnum, uuid); + else + uuid_from_ddf_guid(ddf->anchor.guid, uuid); +} + +static void getinfo_super_ddf(struct supertype *st, struct mdinfo *info, char *map) +{ + struct ddf_super *ddf = st->sb; + int map_disks = info->array.raid_disks; + __u32 *cptr; + + if (ddf->currentconf) { + getinfo_super_ddf_bvd(st, info, map); + return; + } + memset(info, 0, sizeof(*info)); + + info->array.raid_disks = be16_to_cpu(ddf->phys->used_pdes); + info->array.level = LEVEL_CONTAINER; + info->array.layout = 0; + info->array.md_minor = -1; + cptr = (__u32 *)(ddf->anchor.guid + 16); + info->array.ctime = DECADE + __be32_to_cpu(*cptr); + + info->array.chunk_size = 0; + info->container_enough = 1; + + info->disk.major = 0; + info->disk.minor = 0; + if (ddf->dlist) { + struct phys_disk_entry *pde = NULL; + info->disk.number = be32_to_cpu(ddf->dlist->disk.refnum); + info->disk.raid_disk = find_phys(ddf, ddf->dlist->disk.refnum); + + info->data_offset = be64_to_cpu(ddf->phys-> + entries[info->disk.raid_disk]. 
+ config_size); + info->component_size = ddf->dlist->size - info->data_offset; + if (info->disk.raid_disk >= 0) + pde = ddf->phys->entries + info->disk.raid_disk; + if (pde && + !(be16_to_cpu(pde->state) & DDF_Failed) && + !(be16_to_cpu(pde->state) & DDF_Missing)) + info->disk.state = (1 << MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE); + else + info->disk.state = 1 << MD_DISK_FAULTY; + + } else { + /* There should always be a dlist, but just in case...*/ + info->disk.number = -1; + info->disk.raid_disk = -1; + info->disk.state = (1 << MD_DISK_SYNC) | (1 << MD_DISK_ACTIVE); + } + info->events = be32_to_cpu(ddf->active->seq); + info->array.utime = DECADE + be32_to_cpu(ddf->active->timestamp); + + info->recovery_start = MaxSector; + info->reshape_active = 0; + info->recovery_blocked = 0; + info->name[0] = 0; + + info->array.major_version = -1; + info->array.minor_version = -2; + strcpy(info->text_version, "ddf"); + info->safe_mode_delay = 0; + + uuid_from_super_ddf(st, info->uuid); + + if (map) { + int i, e = 0; + int max = be16_to_cpu(ddf->phys->max_pdes); + for (i = e = 0 ; i < map_disks ; i++, e++) { + while (e < max && + be32_to_cpu(ddf->phys->entries[e].refnum) == 0xffffffff) + e++; + if (i < info->array.raid_disks && e < max && + !(be16_to_cpu(ddf->phys->entries[e].state) & + DDF_Failed)) + map[i] = 1; + else + map[i] = 0; + } + } +} + +/* size of name must be at least 17 bytes! 
*/
+static void _ddf_array_name(char *name, const struct ddf_super *ddf, int i)
+{
+	int j;
+	memcpy(name, ddf->virt->entries[i].name, 16);
+	name[16] = 0;
+	for(j = 0; j < 16; j++)
+		if (name[j] == ' ')
+			name[j] = 0;
+}
+
+/* Fill *info with a description of the member array selected by
+ * ddf->currentconf, as seen from the device ddf->currentdev.
+ *
+ * *info is zeroed first.  If layout_ddf2md() rejects the configuration
+ * the function returns at that point and nothing else is filled in.
+ *
+ * map may be NULL.  If it is not, it is treated as an array with as many
+ * entries as info->array.raid_disks held on entry (that value is saved
+ * before *info is cleared).  map[j] is set to 1 when the physical disk
+ * referenced by slot j is found and is DDF_Online and not DDF_Failed,
+ * and to 0 otherwise.
+ */
+static void getinfo_super_ddf_bvd(struct supertype *st, struct mdinfo *info, char *map)
+{
+	struct ddf_super *ddf = st->sb;
+	struct vcl *vc = ddf->currentconf;
+	int cd = ddf->currentdev;
+	int n_prim;
+	int j;
+	struct dl *dl = NULL;
+	int map_disks = info->array.raid_disks;
+	__u32 *cptr;
+	struct vd_config *conf;
+
+	memset(info, 0, sizeof(*info));
+	if (layout_ddf2md(&vc->conf, &info->array) == -1)
+		return;
+	info->array.md_minor = -1;
+	/* bytes 16..19 of the VD GUID hold its creation timestamp
+	 * (see make_header_guid)
+	 */
+	cptr = (__u32 *)(vc->conf.guid + 16);
+	info->array.ctime = DECADE + __be32_to_cpu(*cptr);
+	info->array.utime = DECADE + be32_to_cpu(vc->conf.timestamp);
+	info->array.chunk_size = 512 << vc->conf.chunk_shift;
+	info->custom_array_size = be64_to_cpu(vc->conf.array_blocks);
+
+	conf = &vc->conf;
+	n_prim = be16_to_cpu(conf->prim_elmnt_count);
+	if (conf->sec_elmnt_count > 1 && cd >= n_prim) {
+		/* the device lives in one of the other BVDs: switch to that
+		 * record and make cd relative to it
+		 */
+		int ibvd = cd / n_prim - 1;
+		cd %= n_prim;
+		conf = vc->other_bvds[ibvd];
+	}
+
+	if (cd >= 0 && (unsigned)cd < ddf->mppe) {
+		info->data_offset =
+			be64_to_cpu(LBA_OFFSET(ddf, conf)[cd]);
+		if (vc->block_sizes)
+			info->component_size = vc->block_sizes[cd];
+		else
+			info->component_size = be64_to_cpu(conf->blocks);
+
+		for (dl = ddf->dlist; dl ; dl = dl->next)
+			if (be32_eq(dl->disk.refnum, conf->phys_refnum[cd]))
+				break;
+	}
+
+	info->disk.major = 0;
+	info->disk.minor = 0;
+	info->disk.state = 0;
+	if (dl && dl->pdnum >= 0) {
+		info->disk.major = dl->major;
+		info->disk.minor = dl->minor;
+		info->disk.raid_disk = cd + conf->sec_elmnt_seq
+			* be16_to_cpu(conf->prim_elmnt_count);
+		info->disk.number = dl->pdnum;
+		info->disk.state = 0;
+		if (info->disk.number >= 0 &&
+		    (be16_to_cpu(ddf->phys->entries[info->disk.number].state) & DDF_Online) &&
+		    !(be16_to_cpu(ddf->phys->entries[info->disk.number].state) & DDF_Failed))
+			info->disk.state = (1<<MD_DISK_SYNC)|(1<<MD_DISK_ACTIVE);
+		info->events = be32_to_cpu(ddf->active->seq);
+	}
+
+	info->container_member = ddf->currentconf->vcnum;
+
+	info->recovery_start = MaxSector;
+	info->resync_start = 0;
+	info->reshape_active = 0;
+	info->recovery_blocked = 0;
+	if (!(ddf->virt->entries[info->container_member].state &
+	      DDF_state_inconsistent) &&
+	    (ddf->virt->entries[info->container_member].init_state &
+	     DDF_initstate_mask) == DDF_init_full)
+		info->resync_start = MaxSector;
+
+	uuid_from_super_ddf(st, info->uuid);
+
+	info->array.major_version = -1;
+	info->array.minor_version = -2;
+	sprintf(info->text_version, "/%s/%d",
+		st->container_devnm,
+		info->container_member);
+	info->safe_mode_delay = DDF_SAFE_MODE_DELAY;
+
+	_ddf_array_name(info->name, ddf, info->container_member);
+
+	if (map)
+		for (j = 0; j < map_disks; j++) {
+			map[j] = 0;
+			if (j < info->array.raid_disks) {
+				/* NOTE(review): only the phys_refnum table
+				 * of the first BVD (vc->conf) is consulted
+				 * here - confirm this is intended for
+				 * arrays with several BVDs.
+				 */
+				int i = find_phys(ddf, vc->conf.phys_refnum[j]);
+				/* i indexes the physical disk table; the
+				 * map itself is indexed by raid slot j.
+				 */
+				if (i >= 0 &&
+				    (be16_to_cpu(ddf->phys->entries[i].state)
+				     & DDF_Online) &&
+				    !(be16_to_cpu(ddf->phys->entries[i].state)
+				      & DDF_Failed))
+					map[j] = 1;
+			}
+		}
+}
+
+static int update_super_ddf(struct supertype *st, struct mdinfo *info,
+			    char *update,
+			    char *devname, int verbose,
+			    int uuid_set, char *homehost)
+{
+	/* For 'assemble' and 'force' we need to return non-zero if any
+	 * change was made. For others, the return value is ignored.
+	 * Update options are:
+	 * force-one : This device looks a bit old but needs to be included,
+	 * update age info appropriately.
+	 * assemble: clear any 'faulty' flag to allow this device to
+	 * be assembled.
+	 * force-array: Array is degraded but being forced, mark it clean
+	 * if that will be needed to assemble it.
+	 *
+	 * newdev: not used ????
+	 * grow: Array has gained a new device - this is currently for
+	 * linear only
+	 * resync: mark as dirty so a resync will happen.
+ * uuid: Change the uuid of the array to match what is given + * homehost: update the recorded homehost + * name: update the name - preserving the homehost + * _reshape_progress: record new reshape_progress position. + * + * Following are not relevant for this version: + * sparc2.2 : update from old dodgey metadata + * super-minor: change the preferred_minor number + * summaries: update redundant counters. + */ + int rv = 0; +// struct ddf_super *ddf = st->sb; +// struct vd_config *vd = find_vdcr(ddf, info->container_member); +// struct virtual_entry *ve = find_ve(ddf); + + /* we don't need to handle "force-*" or "assemble" as + * there is no need to 'trick' the kernel. When the metadata is + * first updated to activate the array, all the implied modifications + * will just happen. + */ + + if (strcmp(update, "grow") == 0) { + /* FIXME */ + } else if (strcmp(update, "resync") == 0) { +// info->resync_checkpoint = 0; + } else if (strcmp(update, "homehost") == 0) { + /* homehost is stored in controller->vendor_data, + * or it is when we are the vendor + */ +// if (info->vendor_is_local) +// strcpy(ddf->controller.vendor_data, homehost); + rv = -1; + } else if (strcmp(update, "name") == 0) { + /* name is stored in virtual_entry->name */ +// memset(ve->name, ' ', 16); +// strncpy(ve->name, info->name, 16); + rv = -1; + } else if (strcmp(update, "_reshape_progress") == 0) { + /* We don't support reshape yet */ + } else if (strcmp(update, "assemble") == 0 ) { + /* Do nothing, just succeed */ + rv = 0; + } else + rv = -1; + +// update_all_csum(ddf); + + return rv; +} + +static void make_header_guid(char *guid) +{ + be32 stamp; + /* Create a DDF Header of Virtual Disk GUID */ + + /* 24 bytes of fiction required. + * first 8 are a 'vendor-id' - "Linux-MD" + * next 8 are controller type.. 
how about 0X DEAD BEEF 0000 0000 + * Remaining 8 random number plus timestamp + */ + memcpy(guid, T10, sizeof(T10)); + stamp = cpu_to_be32(0xdeadbeef); + memcpy(guid+8, &stamp, 4); + stamp = cpu_to_be32(0); + memcpy(guid+12, &stamp, 4); + stamp = cpu_to_be32(time(0) - DECADE); + memcpy(guid+16, &stamp, 4); + stamp._v32 = random32(); + memcpy(guid+20, &stamp, 4); +} + +static unsigned int find_unused_vde(const struct ddf_super *ddf) +{ + unsigned int i; + for (i = 0; i < be16_to_cpu(ddf->virt->max_vdes); i++) { + if (all_ff(ddf->virt->entries[i].guid)) + return i; + } + return DDF_NOTFOUND; +} + +static unsigned int find_vde_by_name(const struct ddf_super *ddf, + const char *name) +{ + unsigned int i; + if (name == NULL) + return DDF_NOTFOUND; + for (i = 0; i < be16_to_cpu(ddf->virt->max_vdes); i++) { + if (all_ff(ddf->virt->entries[i].guid)) + continue; + if (!strncmp(name, ddf->virt->entries[i].name, + sizeof(ddf->virt->entries[i].name))) + return i; + } + return DDF_NOTFOUND; +} + +static unsigned int find_vde_by_guid(const struct ddf_super *ddf, + const char *guid) +{ + unsigned int i; + if (guid == NULL || all_ff(guid)) + return DDF_NOTFOUND; + for (i = 0; i < be16_to_cpu(ddf->virt->max_vdes); i++) + if (!memcmp(ddf->virt->entries[i].guid, guid, DDF_GUID_LEN)) + return i; + return DDF_NOTFOUND; +} + +static int init_super_ddf(struct supertype *st, + mdu_array_info_t *info, + struct shape *s, char *name, char *homehost, + int *uuid, unsigned long long data_offset) +{ + /* This is primarily called by Create when creating a new array. + * We will then get add_to_super called for each component, and then + * write_init_super called to write it out to each device. + * For DDF, Create can create on fresh devices or on a pre-existing + * array. + * To create on a pre-existing array a different method will be called. + * This one is just for fresh drives. + * + * We need to create the entire 'ddf' structure which includes: + * DDF headers - these are easy. 
+ * Controller data - a Sector describing this controller .. not that + * this is a controller exactly. + * Physical Disk Record - one entry per device, so + * leave plenty of space. + * Virtual Disk Records - again, just leave plenty of space. + * This just lists VDs, doesn't give details. + * Config records - describe the VDs that use this disk + * DiskData - describes 'this' device. + * BadBlockManagement - empty + * Diag Space - empty + * Vendor Logs - Could we put bitmaps here? + * + */ + struct ddf_super *ddf; + char hostname[17]; + int hostlen; + int max_phys_disks, max_virt_disks; + unsigned long long sector; + int clen; + int i; + int pdsize, vdsize; + struct phys_disk *pd; + struct virtual_disk *vd; + + if (st->sb) + return init_super_ddf_bvd(st, info, s->size, name, homehost, uuid, + data_offset); + + if (posix_memalign((void**)&ddf, 512, sizeof(*ddf)) != 0) { + pr_err("could not allocate superblock\n"); + return 0; + } + memset(ddf, 0, sizeof(*ddf)); + st->sb = ddf; + + if (info == NULL) { + /* zeroing superblock */ + return 0; + } + + /* At least 32MB *must* be reserved for the ddf. So let's just + * start 32MB from the end, and put the primary header there. + * Don't do secondary for now. + * We don't know exactly where that will be yet as it could be + * different on each device. So just set up the lengths. + */ + + ddf->anchor.magic = DDF_HEADER_MAGIC; + make_header_guid(ddf->anchor.guid); + + memcpy(ddf->anchor.revision, DDF_REVISION_2, 8); + ddf->anchor.seq = cpu_to_be32(1); + ddf->anchor.timestamp = cpu_to_be32(time(0) - DECADE); + ddf->anchor.openflag = 0xFF; + ddf->anchor.foreignflag = 0; + ddf->anchor.enforcegroups = 0; /* Is this best?? 
*/ + ddf->anchor.pad0 = 0xff; + memset(ddf->anchor.pad1, 0xff, 12); + memset(ddf->anchor.header_ext, 0xff, 32); + ddf->anchor.primary_lba = cpu_to_be64(~(__u64)0); + ddf->anchor.secondary_lba = cpu_to_be64(~(__u64)0); + ddf->anchor.type = DDF_HEADER_ANCHOR; + memset(ddf->anchor.pad2, 0xff, 3); + ddf->anchor.workspace_len = cpu_to_be32(32768); /* Must be reserved */ + /* Put this at bottom of 32M reserved.. */ + ddf->anchor.workspace_lba = cpu_to_be64(~(__u64)0); + max_phys_disks = 1023; /* Should be enough, 4095 is also allowed */ + ddf->anchor.max_pd_entries = cpu_to_be16(max_phys_disks); + max_virt_disks = 255; /* 15, 63, 255, 1024, 4095 are all allowed */ + ddf->anchor.max_vd_entries = cpu_to_be16(max_virt_disks); + ddf->max_part = 64; + ddf->anchor.max_partitions = cpu_to_be16(ddf->max_part); + ddf->mppe = 256; /* 16, 64, 256, 1024, 4096 are all allowed */ + ddf->conf_rec_len = 1 + ROUND_UP(ddf->mppe * (4+8), 512)/512; + ddf->anchor.config_record_len = cpu_to_be16(ddf->conf_rec_len); + ddf->anchor.max_primary_element_entries = cpu_to_be16(ddf->mppe); + memset(ddf->anchor.pad3, 0xff, 54); + /* Controller section is one sector long immediately + * after the ddf header */ + sector = 1; + ddf->anchor.controller_section_offset = cpu_to_be32(sector); + ddf->anchor.controller_section_length = cpu_to_be32(1); + sector += 1; + + /* phys is 8 sectors after that */ + pdsize = ROUND_UP(sizeof(struct phys_disk) + + sizeof(struct phys_disk_entry)*max_phys_disks, + 512); + switch(pdsize/512) { + case 2: case 8: case 32: case 128: case 512: break; + default: abort(); + } + ddf->anchor.phys_section_offset = cpu_to_be32(sector); + ddf->anchor.phys_section_length = + cpu_to_be32(pdsize/512); /* max_primary_element_entries/8 */ + sector += pdsize/512; + + /* virt is another 32 sectors */ + vdsize = ROUND_UP(sizeof(struct virtual_disk) + + sizeof(struct virtual_entry) * max_virt_disks, + 512); + switch(vdsize/512) { + case 2: case 8: case 32: case 128: case 512: break; + default: 
abort(); + } + ddf->anchor.virt_section_offset = cpu_to_be32(sector); + ddf->anchor.virt_section_length = + cpu_to_be32(vdsize/512); /* max_vd_entries/8 */ + sector += vdsize/512; + + clen = ddf->conf_rec_len * (ddf->max_part+1); + ddf->anchor.config_section_offset = cpu_to_be32(sector); + ddf->anchor.config_section_length = cpu_to_be32(clen); + sector += clen; + + ddf->anchor.data_section_offset = cpu_to_be32(sector); + ddf->anchor.data_section_length = cpu_to_be32(1); + sector += 1; + + ddf->anchor.bbm_section_length = cpu_to_be32(0); + ddf->anchor.bbm_section_offset = cpu_to_be32(0xFFFFFFFF); + ddf->anchor.diag_space_length = cpu_to_be32(0); + ddf->anchor.diag_space_offset = cpu_to_be32(0xFFFFFFFF); + ddf->anchor.vendor_length = cpu_to_be32(0); + ddf->anchor.vendor_offset = cpu_to_be32(0xFFFFFFFF); + + memset(ddf->anchor.pad4, 0xff, 256); + + memcpy(&ddf->primary, &ddf->anchor, 512); + memcpy(&ddf->secondary, &ddf->anchor, 512); + + ddf->primary.openflag = 1; /* I guess.. */ + ddf->primary.type = DDF_HEADER_PRIMARY; + + ddf->secondary.openflag = 1; /* I guess.. */ + ddf->secondary.type = DDF_HEADER_SECONDARY; + + ddf->active = &ddf->primary; + + ddf->controller.magic = DDF_CONTROLLER_MAGIC; + + /* 24 more bytes of fiction required. + * first 8 are a 'vendor-id' - "Linux-MD" + * Remaining 16 are serial number.... maybe a hostname would do? 
+ */ + memcpy(ddf->controller.guid, T10, sizeof(T10)); + gethostname(hostname, sizeof(hostname)); + hostname[sizeof(hostname) - 1] = 0; + hostlen = strlen(hostname); + memcpy(ddf->controller.guid + 24 - hostlen, hostname, hostlen); + for (i = strlen(T10) ; i+hostlen < 24; i++) + ddf->controller.guid[i] = ' '; + + ddf->controller.type.vendor_id = cpu_to_be16(0xDEAD); + ddf->controller.type.device_id = cpu_to_be16(0xBEEF); + ddf->controller.type.sub_vendor_id = cpu_to_be16(0); + ddf->controller.type.sub_device_id = cpu_to_be16(0); + memcpy(ddf->controller.product_id, "What Is My PID??", 16); + memset(ddf->controller.pad, 0xff, 8); + memset(ddf->controller.vendor_data, 0xff, 448); + if (homehost && strlen(homehost) < 440) + strcpy((char*)ddf->controller.vendor_data, homehost); + + if (posix_memalign((void**)&pd, 512, pdsize) != 0) { + pr_err("could not allocate pd\n"); + return 0; + } + ddf->phys = pd; + ddf->pdsize = pdsize; + + memset(pd, 0xff, pdsize); + memset(pd, 0, sizeof(*pd)); + pd->magic = DDF_PHYS_RECORDS_MAGIC; + pd->used_pdes = cpu_to_be16(0); + pd->max_pdes = cpu_to_be16(max_phys_disks); + memset(pd->pad, 0xff, 52); + for (i = 0; i < max_phys_disks; i++) + memset(pd->entries[i].guid, 0xff, DDF_GUID_LEN); + + if (posix_memalign((void**)&vd, 512, vdsize) != 0) { + pr_err("could not allocate vd\n"); + return 0; + } + ddf->virt = vd; + ddf->vdsize = vdsize; + memset(vd, 0, vdsize); + vd->magic = DDF_VIRT_RECORDS_MAGIC; + vd->populated_vdes = cpu_to_be16(0); + vd->max_vdes = cpu_to_be16(max_virt_disks); + memset(vd->pad, 0xff, 52); + + for (i=0; i<max_virt_disks; i++) + memset(&vd->entries[i], 0xff, sizeof(struct virtual_entry)); + + st->sb = ddf; + ddf_set_updates_pending(ddf, NULL); + return 1; +} + +static int chunk_to_shift(int chunksize) +{ + return ffs(chunksize/512)-1; +} + +struct extent { + unsigned long long start, size; +}; +static int cmp_extent(const void *av, const void *bv) +{ + const struct extent *a = av; + const struct extent *b = bv; + if 
(a->start < b->start) + return -1; + if (a->start > b->start) + return 1; + return 0; +} + +static struct extent *get_extents(struct ddf_super *ddf, struct dl *dl) +{ + /* Find a list of used extents on the given physical device + * (dnum) of the given ddf. + * Return a malloced array of 'struct extent' + */ + struct extent *rv; + int n = 0; + unsigned int i; + __u16 state; + + if (dl->pdnum < 0) + return NULL; + state = be16_to_cpu(ddf->phys->entries[dl->pdnum].state); + + if ((state & (DDF_Online|DDF_Failed|DDF_Missing)) != DDF_Online) + return NULL; + + rv = xmalloc(sizeof(struct extent) * (ddf->max_part + 2)); + + for (i = 0; i < ddf->max_part; i++) { + const struct vd_config *bvd; + unsigned int ibvd; + struct vcl *v = dl->vlist[i]; + if (v == NULL || + get_pd_index_from_refnum(v, dl->disk.refnum, ddf->mppe, + &bvd, &ibvd) == DDF_NOTFOUND) + continue; + rv[n].start = be64_to_cpu(LBA_OFFSET(ddf, bvd)[ibvd]); + rv[n].size = be64_to_cpu(bvd->blocks); + n++; + } + qsort(rv, n, sizeof(*rv), cmp_extent); + + rv[n].start = be64_to_cpu(ddf->phys->entries[dl->pdnum].config_size); + rv[n].size = 0; + return rv; +} + +static unsigned long long find_space( + struct ddf_super *ddf, struct dl *dl, + unsigned long long data_offset, + unsigned long long *size) +{ + /* Find if the requested amount of space is available. + * If it is, return start. + * If not, set *size to largest space. + * If data_offset != INVALID_SECTORS, then the space must start + * at this location. + */ + struct extent *e = get_extents(ddf, dl); + int i = 0; + unsigned long long pos = 0; + unsigned long long max_size = 0; + + if (!e) { + *size = 0; + return INVALID_SECTORS; + } + do { + unsigned long long esize = e[i].start - pos; + if (data_offset != INVALID_SECTORS && + pos <= data_offset && + e[i].start > data_offset) { + pos = data_offset; + esize = e[i].start - pos; + } + if (data_offset != INVALID_SECTORS && + pos != data_offset) { + i++; + continue; + } + if (esize >= *size) { + /* Found! 
*/ + free(e); + return pos; + } + if (esize > max_size) + max_size = esize; + pos = e[i].start + e[i].size; + i++; + } while (e[i-1].size); + *size = max_size; + free(e); + return INVALID_SECTORS; +} + +static int init_super_ddf_bvd(struct supertype *st, + mdu_array_info_t *info, + unsigned long long size, + char *name, char *homehost, + int *uuid, unsigned long long data_offset) +{ + /* We are creating a BVD inside a pre-existing container. + * so st->sb is already set. + * We need to create a new vd_config and a new virtual_entry + */ + struct ddf_super *ddf = st->sb; + unsigned int venum, i; + struct virtual_entry *ve; + struct vcl *vcl; + struct vd_config *vc; + + if (find_vde_by_name(ddf, name) != DDF_NOTFOUND) { + pr_err("This ddf already has an array called %s\n", name); + return 0; + } + venum = find_unused_vde(ddf); + if (venum == DDF_NOTFOUND) { + pr_err("Cannot find spare slot for virtual disk\n"); + return 0; + } + ve = &ddf->virt->entries[venum]; + + /* A Virtual Disk GUID contains the T10 Vendor ID, controller type, + * timestamp, random number + */ + make_header_guid(ve->guid); + ve->unit = cpu_to_be16(info->md_minor); + ve->pad0 = 0xFFFF; + ve->guid_crc._v16 = crc32(0, (unsigned char *)ddf->anchor.guid, + DDF_GUID_LEN); + ve->type = cpu_to_be16(0); + ve->state = DDF_state_degraded; /* Will be modified as devices are added */ + if (info->state & 1) /* clean */ + ve->init_state = DDF_init_full; + else + ve->init_state = DDF_init_not; + + memset(ve->pad1, 0xff, 14); + memset(ve->name, '\0', sizeof(ve->name)); + if (name) { + int l = strnlen(name, sizeof(ve->name)); + memcpy(ve->name, name, l); + } + ddf->virt->populated_vdes = + cpu_to_be16(be16_to_cpu(ddf->virt->populated_vdes)+1); + + /* Now create a new vd_config */ + if (posix_memalign((void**)&vcl, 512, + (offsetof(struct vcl, conf) + ddf->conf_rec_len * 512)) != 0) { + pr_err("could not allocate vd_config\n"); + return 0; + } + vcl->vcnum = venum; + vcl->block_sizes = NULL; /* FIXME not for 
CONCAT */ + vc = &vcl->conf; + + vc->magic = DDF_VD_CONF_MAGIC; + memcpy(vc->guid, ve->guid, DDF_GUID_LEN); + vc->timestamp = cpu_to_be32(time(0)-DECADE); + vc->seqnum = cpu_to_be32(1); + memset(vc->pad0, 0xff, 24); + vc->chunk_shift = chunk_to_shift(info->chunk_size); + if (layout_md2ddf(info, vc) == -1 || + be16_to_cpu(vc->prim_elmnt_count) > ddf->mppe) { + pr_err("unsupported RAID level/layout %d/%d with %d disks\n", + info->level, info->layout, info->raid_disks); + free(vcl); + return 0; + } + vc->sec_elmnt_seq = 0; + if (alloc_other_bvds(ddf, vcl) != 0) { + pr_err("could not allocate other bvds\n"); + free(vcl); + return 0; + } + vc->blocks = cpu_to_be64(size * 2); + vc->array_blocks = cpu_to_be64( + calc_array_size(info->level, info->raid_disks, info->layout, + info->chunk_size, size * 2)); + memset(vc->pad1, 0xff, 8); + vc->spare_refs[0] = cpu_to_be32(0xffffffff); + vc->spare_refs[1] = cpu_to_be32(0xffffffff); + vc->spare_refs[2] = cpu_to_be32(0xffffffff); + vc->spare_refs[3] = cpu_to_be32(0xffffffff); + vc->spare_refs[4] = cpu_to_be32(0xffffffff); + vc->spare_refs[5] = cpu_to_be32(0xffffffff); + vc->spare_refs[6] = cpu_to_be32(0xffffffff); + vc->spare_refs[7] = cpu_to_be32(0xffffffff); + memset(vc->cache_pol, 0, 8); + vc->bg_rate = 0x80; + memset(vc->pad2, 0xff, 3); + memset(vc->pad3, 0xff, 52); + memset(vc->pad4, 0xff, 192); + memset(vc->v0, 0xff, 32); + memset(vc->v1, 0xff, 32); + memset(vc->v2, 0xff, 16); + memset(vc->v3, 0xff, 16); + memset(vc->vendor, 0xff, 32); + + memset(vc->phys_refnum, 0xff, 4*ddf->mppe); + memset(vc->phys_refnum+ddf->mppe, 0x00, 8*ddf->mppe); + + for (i = 1; i < vc->sec_elmnt_count; i++) { + memcpy(vcl->other_bvds[i-1], vc, ddf->conf_rec_len * 512); + vcl->other_bvds[i-1]->sec_elmnt_seq = i; + } + + vcl->next = ddf->conflist; + ddf->conflist = vcl; + ddf->currentconf = vcl; + ddf_set_updates_pending(ddf, NULL); + return 1; +} + +static void add_to_super_ddf_bvd(struct supertype *st, + mdu_disk_info_t *dk, int fd, char *devname, + 
unsigned long long data_offset) +{ + /* fd and devname identify a device within the ddf container (st). + * dk identifies a location in the new BVD. + * We need to find suitable free space in that device and update + * the phys_refnum and lba_offset for the newly created vd_config. + * We might also want to update the type in the phys_disk + * section. + * + * Alternately: fd == -1 and we have already chosen which device to + * use and recorded in dlist->raid_disk; + */ + struct dl *dl; + struct ddf_super *ddf = st->sb; + struct vd_config *vc; + unsigned int i; + unsigned long long blocks, pos; + unsigned int raid_disk = dk->raid_disk; + + if (fd == -1) { + for (dl = ddf->dlist; dl ; dl = dl->next) + if (dl->raiddisk == dk->raid_disk) + break; + } else { + for (dl = ddf->dlist; dl ; dl = dl->next) + if (dl->major == dk->major && + dl->minor == dk->minor) + break; + } + if (!dl || dl->pdnum < 0 || ! (dk->state & (1<<MD_DISK_SYNC))) + return; + + vc = &ddf->currentconf->conf; + if (vc->sec_elmnt_count > 1) { + unsigned int n = be16_to_cpu(vc->prim_elmnt_count); + if (raid_disk >= n) + vc = ddf->currentconf->other_bvds[raid_disk / n - 1]; + raid_disk %= n; + } + + blocks = be64_to_cpu(vc->blocks); + if (ddf->currentconf->block_sizes) + blocks = ddf->currentconf->block_sizes[dk->raid_disk]; + + pos = find_space(ddf, dl, data_offset, &blocks); + if (pos == INVALID_SECTORS) + return; + + ddf->currentdev = dk->raid_disk; + vc->phys_refnum[raid_disk] = dl->disk.refnum; + LBA_OFFSET(ddf, vc)[raid_disk] = cpu_to_be64(pos); + + for (i = 0; i < ddf->max_part ; i++) + if (dl->vlist[i] == NULL) + break; + if (i == ddf->max_part) + return; + dl->vlist[i] = ddf->currentconf; + + if (fd >= 0) + dl->fd = fd; + if (devname) + dl->devname = devname; + + /* Check if we can mark array as optimal yet */ + i = ddf->currentconf->vcnum; + ddf->virt->entries[i].state = + (ddf->virt->entries[i].state & ~DDF_state_mask) + | get_svd_state(ddf, ddf->currentconf); + 
be16_clear(ddf->phys->entries[dl->pdnum].type, + cpu_to_be16(DDF_Global_Spare)); + be16_set(ddf->phys->entries[dl->pdnum].type, + cpu_to_be16(DDF_Active_in_VD)); + dprintf("added disk %d/%08x to VD %d/%s as disk %d\n", + dl->pdnum, be32_to_cpu(dl->disk.refnum), + ddf->currentconf->vcnum, guid_str(vc->guid), + dk->raid_disk); + ddf_set_updates_pending(ddf, vc); +} + +static unsigned int find_unused_pde(const struct ddf_super *ddf) +{ + unsigned int i; + for (i = 0; i < be16_to_cpu(ddf->phys->max_pdes); i++) { + if (all_ff(ddf->phys->entries[i].guid)) + return i; + } + return DDF_NOTFOUND; +} + +static void _set_config_size(struct phys_disk_entry *pde, const struct dl *dl) +{ + __u64 cfs, t; + cfs = min(dl->size - 32*1024*2ULL, be64_to_cpu(dl->primary_lba)); + t = be64_to_cpu(dl->secondary_lba); + if (t != ~(__u64)0) + cfs = min(cfs, t); + /* + * Some vendor DDF structures interpret workspace_lba + * very differently than we do: Make a sanity check on the value. + */ + t = be64_to_cpu(dl->workspace_lba); + if (t < cfs) { + __u64 wsp = cfs - t; + if (wsp > 1024*1024*2ULL && wsp > dl->size / 16) { + pr_err("%x:%x: workspace size 0x%llx too big, ignoring\n", + dl->major, dl->minor, (unsigned long long)wsp); + } else + cfs = t; + } + pde->config_size = cpu_to_be64(cfs); + dprintf("%x:%x config_size %llx, DDF structure is %llx blocks\n", + dl->major, dl->minor, + (unsigned long long)cfs, (unsigned long long)(dl->size-cfs)); +} + +/* Add a device to a container, either while creating it or while + * expanding a pre-existing container + */ +static int add_to_super_ddf(struct supertype *st, + mdu_disk_info_t *dk, int fd, char *devname, + unsigned long long data_offset) +{ + struct ddf_super *ddf = st->sb; + struct dl *dd; + time_t now; + struct tm *tm; + unsigned long long size; + struct phys_disk_entry *pde; + unsigned int n, i; + struct stat stb; + __u32 *tptr; + + if (ddf->currentconf) { + add_to_super_ddf_bvd(st, dk, fd, devname, data_offset); + return 0; + } + + /* This 
is device numbered dk->number. We need to create + * a phys_disk entry and a more detailed disk_data entry. + */ + fstat(fd, &stb); + n = find_unused_pde(ddf); + if (n == DDF_NOTFOUND) { + pr_err("No free slot in array, cannot add disk\n"); + return 1; + } + pde = &ddf->phys->entries[n]; + get_dev_size(fd, NULL, &size); + if (size <= 32*1024*1024) { + pr_err("device size must be at least 32MB\n"); + return 1; + } + size >>= 9; + + if (posix_memalign((void**)&dd, 512, + sizeof(*dd) + sizeof(dd->vlist[0]) * ddf->max_part) != 0) { + pr_err("could allocate buffer for new disk, aborting\n"); + return 1; + } + dd->major = major(stb.st_rdev); + dd->minor = minor(stb.st_rdev); + dd->devname = devname; + dd->fd = fd; + dd->spare = NULL; + + dd->disk.magic = DDF_PHYS_DATA_MAGIC; + now = time(0); + tm = localtime(&now); + sprintf(dd->disk.guid, "%8s%04d%02d%02d", T10, + (__u16)tm->tm_year+1900, + (__u8)tm->tm_mon+1, (__u8)tm->tm_mday); + tptr = (__u32 *)(dd->disk.guid + 16); + *tptr++ = random32(); + *tptr = random32(); + + do { + /* Cannot be bothered finding a CRC of some irrelevant details*/ + dd->disk.refnum._v32 = random32(); + for (i = be16_to_cpu(ddf->active->max_pd_entries); + i > 0; i--) + if (be32_eq(ddf->phys->entries[i-1].refnum, + dd->disk.refnum)) + break; + } while (i > 0); + + dd->disk.forced_ref = 1; + dd->disk.forced_guid = 1; + memset(dd->disk.vendor, ' ', 32); + memcpy(dd->disk.vendor, "Linux", 5); + memset(dd->disk.pad, 0xff, 442); + for (i = 0; i < ddf->max_part ; i++) + dd->vlist[i] = NULL; + + dd->pdnum = n; + + if (st->update_tail) { + int len = (sizeof(struct phys_disk) + + sizeof(struct phys_disk_entry)); + struct phys_disk *pd; + + pd = xmalloc(len); + pd->magic = DDF_PHYS_RECORDS_MAGIC; + pd->used_pdes = cpu_to_be16(n); + pde = &pd->entries[0]; + dd->mdupdate = pd; + } else + ddf->phys->used_pdes = cpu_to_be16( + 1 + be16_to_cpu(ddf->phys->used_pdes)); + + memcpy(pde->guid, dd->disk.guid, DDF_GUID_LEN); + pde->refnum = dd->disk.refnum; + pde->type 
= cpu_to_be16(DDF_Forced_PD_GUID | DDF_Global_Spare); + pde->state = cpu_to_be16(DDF_Online); + dd->size = size; + /* + * If there is already a device in dlist, try to reserve the same + * amount of workspace. Otherwise, use 32MB. + * We checked disk size above already. + */ +#define __calc_lba(new, old, lba, mb) do { \ + unsigned long long dif; \ + if ((old) != NULL) \ + dif = (old)->size - be64_to_cpu((old)->lba); \ + else \ + dif = (new)->size; \ + if ((new)->size > dif) \ + (new)->lba = cpu_to_be64((new)->size - dif); \ + else \ + (new)->lba = cpu_to_be64((new)->size - (mb*1024*2)); \ + } while (0) + __calc_lba(dd, ddf->dlist, workspace_lba, 32); + __calc_lba(dd, ddf->dlist, primary_lba, 16); + if (ddf->dlist == NULL || + be64_to_cpu(ddf->dlist->secondary_lba) != ~(__u64)0) + __calc_lba(dd, ddf->dlist, secondary_lba, 32); + _set_config_size(pde, dd); + + sprintf(pde->path, "%17.17s","Information: nil") ; + memset(pde->pad, 0xff, 6); + + if (st->update_tail) { + dd->next = ddf->add_list; + ddf->add_list = dd; + } else { + dd->next = ddf->dlist; + ddf->dlist = dd; + ddf_set_updates_pending(ddf, NULL); + } + + return 0; +} + +static int remove_from_super_ddf(struct supertype *st, mdu_disk_info_t *dk) +{ + struct ddf_super *ddf = st->sb; + struct dl *dl; + + /* mdmon has noticed that this disk (dk->major/dk->minor) has + * disappeared from the container. + * We need to arrange that it disappears from the metadata and + * internal data structures too. + * Most of the work is done by ddf_process_update which edits + * the metadata and closes the file handle and attaches the memory + * where free_updates will free it. 
+ */ + for (dl = ddf->dlist; dl ; dl = dl->next) + if (dl->major == dk->major && + dl->minor == dk->minor) + break; + if (!dl || dl->pdnum < 0) + return -1; + + if (st->update_tail) { + int len = (sizeof(struct phys_disk) + + sizeof(struct phys_disk_entry)); + struct phys_disk *pd; + + pd = xmalloc(len); + pd->magic = DDF_PHYS_RECORDS_MAGIC; + pd->used_pdes = cpu_to_be16(dl->pdnum); + pd->entries[0].state = cpu_to_be16(DDF_Missing); + append_metadata_update(st, pd, len); + } + return 0; +} + +/* + * This is the write_init_super method for a ddf container. It is + * called when creating a container or adding another device to a + * container. + */ + +static int __write_ddf_structure(struct dl *d, struct ddf_super *ddf, __u8 type) +{ + unsigned long long sector; + struct ddf_header *header; + int fd, i, n_config, conf_size, buf_size; + int ret = 0; + char *conf; + + fd = d->fd; + + switch (type) { + case DDF_HEADER_PRIMARY: + header = &ddf->primary; + sector = be64_to_cpu(header->primary_lba); + break; + case DDF_HEADER_SECONDARY: + header = &ddf->secondary; + sector = be64_to_cpu(header->secondary_lba); + break; + default: + return 0; + } + if (sector == ~(__u64)0) + return 0; + + header->type = type; + header->openflag = 1; + header->crc = calc_crc(header, 512); + + lseek64(fd, sector<<9, 0); + if (write(fd, header, 512) < 0) + goto out; + + ddf->controller.crc = calc_crc(&ddf->controller, 512); + if (write(fd, &ddf->controller, 512) < 0) + goto out; + + ddf->phys->crc = calc_crc(ddf->phys, ddf->pdsize); + if (write(fd, ddf->phys, ddf->pdsize) < 0) + goto out; + ddf->virt->crc = calc_crc(ddf->virt, ddf->vdsize); + if (write(fd, ddf->virt, ddf->vdsize) < 0) + goto out; + + /* Now write lots of config records. 
*/ + n_config = ddf->max_part; + conf_size = ddf->conf_rec_len * 512; + conf = ddf->conf; + buf_size = conf_size * (n_config + 1); + if (!conf) { + if (posix_memalign((void**)&conf, 512, buf_size) != 0) + goto out; + ddf->conf = conf; + } + for (i = 0 ; i <= n_config ; i++) { + struct vcl *c; + struct vd_config *vdc = NULL; + if (i == n_config) { + c = (struct vcl *)d->spare; + if (c) + vdc = &c->conf; + } else { + unsigned int dummy; + c = d->vlist[i]; + if (c) + get_pd_index_from_refnum( + c, d->disk.refnum, + ddf->mppe, + (const struct vd_config **)&vdc, + &dummy); + } + if (vdc) { + dprintf("writing conf record %i on disk %08x for %s/%u\n", + i, be32_to_cpu(d->disk.refnum), + guid_str(vdc->guid), + vdc->sec_elmnt_seq); + vdc->crc = calc_crc(vdc, conf_size); + memcpy(conf + i*conf_size, vdc, conf_size); + } else + memset(conf + i*conf_size, 0xff, conf_size); + } + if (write(fd, conf, buf_size) != buf_size) + goto out; + + d->disk.crc = calc_crc(&d->disk, 512); + if (write(fd, &d->disk, 512) < 0) + goto out; + + ret = 1; +out: + header->openflag = 0; + header->crc = calc_crc(header, 512); + + lseek64(fd, sector<<9, 0); + if (write(fd, header, 512) < 0) + ret = 0; + + return ret; +} + +static int _write_super_to_disk(struct ddf_super *ddf, struct dl *d) +{ + unsigned long long size; + int fd = d->fd; + if (fd < 0) + return 0; + + /* We need to fill in the primary, (secondary) and workspace + * lba's in the headers, set their checksums, + * Also checksum phys, virt.... + * + * Then write everything out, finally the anchor is written. 
+ */ + get_dev_size(fd, NULL, &size); + size /= 512; + memcpy(&ddf->anchor, ddf->active, 512); + if (be64_to_cpu(d->workspace_lba) != 0ULL) + ddf->anchor.workspace_lba = d->workspace_lba; + else + ddf->anchor.workspace_lba = + cpu_to_be64(size - 32*1024*2); + if (be64_to_cpu(d->primary_lba) != 0ULL) + ddf->anchor.primary_lba = d->primary_lba; + else + ddf->anchor.primary_lba = + cpu_to_be64(size - 16*1024*2); + if (be64_to_cpu(d->secondary_lba) != 0ULL) + ddf->anchor.secondary_lba = d->secondary_lba; + else + ddf->anchor.secondary_lba = + cpu_to_be64(size - 32*1024*2); + ddf->anchor.timestamp = cpu_to_be32(time(0) - DECADE); + memcpy(&ddf->primary, &ddf->anchor, 512); + memcpy(&ddf->secondary, &ddf->anchor, 512); + + ddf->anchor.type = DDF_HEADER_ANCHOR; + ddf->anchor.openflag = 0xFF; /* 'open' means nothing */ + ddf->anchor.seq = cpu_to_be32(0xFFFFFFFF); /* no sequencing in anchor */ + ddf->anchor.crc = calc_crc(&ddf->anchor, 512); + + if (!__write_ddf_structure(d, ddf, DDF_HEADER_PRIMARY)) + return 0; + + if (!__write_ddf_structure(d, ddf, DDF_HEADER_SECONDARY)) + return 0; + + lseek64(fd, (size-1)*512, SEEK_SET); + if (write(fd, &ddf->anchor, 512) < 0) + return 0; + + return 1; +} + +static int __write_init_super_ddf(struct supertype *st) +{ + struct ddf_super *ddf = st->sb; + struct dl *d; + int attempts = 0; + int successes = 0; + + pr_state(ddf, __func__); + + /* try to write updated metadata, + * if we catch a failure move on to the next disk + */ + for (d = ddf->dlist; d; d=d->next) { + attempts++; + successes += _write_super_to_disk(ddf, d); + } + + return attempts != successes; +} + +static int write_init_super_ddf(struct supertype *st) +{ + struct ddf_super *ddf = st->sb; + struct vcl *currentconf = ddf->currentconf; + + /* We are done with currentconf - reset it so st refers to the container */ + ddf->currentconf = NULL; + + if (st->update_tail) { + /* queue the virtual_disk and vd_config as metadata updates */ + struct virtual_disk *vd; + struct 
vd_config *vc; + int len, tlen; + unsigned int i; + + if (!currentconf) { + /* Must be adding a physical disk to the container */ + int len = (sizeof(struct phys_disk) + + sizeof(struct phys_disk_entry)); + + /* adding a disk to the container. */ + if (!ddf->add_list) + return 0; + + append_metadata_update(st, ddf->add_list->mdupdate, len); + ddf->add_list->mdupdate = NULL; + return 0; + } + + /* Newly created VD */ + + /* First the virtual disk. We have a slightly fake header */ + len = sizeof(struct virtual_disk) + sizeof(struct virtual_entry); + vd = xmalloc(len); + *vd = *ddf->virt; + vd->entries[0] = ddf->virt->entries[currentconf->vcnum]; + vd->populated_vdes = cpu_to_be16(currentconf->vcnum); + append_metadata_update(st, vd, len); + + /* Then the vd_config */ + len = ddf->conf_rec_len * 512; + tlen = len * currentconf->conf.sec_elmnt_count; + vc = xmalloc(tlen); + memcpy(vc, &currentconf->conf, len); + for (i = 1; i < currentconf->conf.sec_elmnt_count; i++) + memcpy((char *)vc + i*len, currentconf->other_bvds[i-1], + len); + append_metadata_update(st, vc, tlen); + + return 0; + } else { + struct dl *d; + if (!currentconf) + for (d = ddf->dlist; d; d=d->next) + while (Kill(d->devname, NULL, 0, -1, 1) == 0); + /* Note: we don't close the fd's now, but a subsequent + * ->free_super() will + */ + return __write_init_super_ddf(st); + } +} + +static __u64 avail_size_ddf(struct supertype *st, __u64 devsize, + unsigned long long data_offset) +{ + /* We must reserve the last 32Meg */ + if (devsize <= 32*1024*2) + return 0; + return devsize - 32*1024*2; +} + +static int reserve_space(struct supertype *st, int raiddisks, + unsigned long long size, int chunk, + unsigned long long data_offset, + unsigned long long *freesize) +{ + /* Find 'raiddisks' spare extents at least 'size' big (but + * only caring about multiples of 'chunk') and remember + * them. If size==0, find the largest size possible. + * Report available size in *freesize + * If space cannot be found, fail. 
+ */ + struct dl *dl; + struct ddf_super *ddf = st->sb; + int cnt = 0; + + for (dl = ddf->dlist; dl ; dl=dl->next) { + dl->raiddisk = -1; + dl->esize = 0; + } + /* Now find largest extent on each device */ + for (dl = ddf->dlist ; dl ; dl=dl->next) { + unsigned long long minsize = ULLONG_MAX; + + find_space(ddf, dl, data_offset, &minsize); + if (minsize >= size && minsize >= (unsigned)chunk) { + cnt++; + dl->esize = minsize; + } + } + if (cnt < raiddisks) { + pr_err("not enough devices with space to create array.\n"); + return 0; /* No enough free spaces large enough */ + } + if (size == 0) { + /* choose the largest size of which there are at least 'raiddisk' */ + for (dl = ddf->dlist ; dl ; dl=dl->next) { + struct dl *dl2; + if (dl->esize <= size) + continue; + /* This is bigger than 'size', see if there are enough */ + cnt = 0; + for (dl2 = ddf->dlist; dl2 ; dl2=dl2->next) + if (dl2->esize >= dl->esize) + cnt++; + if (cnt >= raiddisks) + size = dl->esize; + } + if (chunk) { + size = size / chunk; + size *= chunk; + } + *freesize = size; + if (size < 32) { + pr_err("not enough spare devices to create array.\n"); + return 0; + } + } + /* We have a 'size' of which there are enough spaces. + * We simply do a first-fit */ + cnt = 0; + for (dl = ddf->dlist ; dl && cnt < raiddisks ; dl=dl->next) { + if (dl->esize < size) + continue; + + dl->raiddisk = cnt; + cnt++; + } + return 1; +} + +static int validate_geometry_ddf(struct supertype *st, + int level, int layout, int raiddisks, + int *chunk, unsigned long long size, + unsigned long long data_offset, + char *dev, unsigned long long *freesize, + int consistency_policy, int verbose) +{ + int fd; + struct mdinfo *sra; + int cfd; + + /* ddf potentially supports lots of things, but it depends on + * what devices are offered (and maybe kernel version?) + * If given unused devices, we will make a container. + * If given devices in a container, we will make a BVD. 
+ * If given BVDs, we make an SVD, changing all the GUIDs in the process. + */ + + if (*chunk == UnSet) + *chunk = DEFAULT_CHUNK; + + if (level == LEVEL_NONE) + level = LEVEL_CONTAINER; + if (level == LEVEL_CONTAINER) { + /* Must be a fresh device to add to a container */ + return validate_geometry_ddf_container(st, level, layout, + raiddisks, *chunk, + size, data_offset, dev, + freesize, + verbose); + } + + if (!dev) { + mdu_array_info_t array = { + .level = level, + .layout = layout, + .raid_disks = raiddisks + }; + struct vd_config conf; + if (layout_md2ddf(&array, &conf) == -1) { + if (verbose) + pr_err("DDF does not support level %d /layout %d arrays with %d disks\n", + level, layout, raiddisks); + return 0; + } + /* Should check layout? etc */ + + if (st->sb && freesize) { + /* --create was given a container to create in. + * So we need to check that there are enough + * free spaces and return the amount of space. + * We may as well remember which drives were + * chosen so that add_to_super/getinfo_super + * can return them. + */ + return reserve_space(st, raiddisks, size, *chunk, + data_offset, freesize); + } + return 1; + } + + if (st->sb) { + /* A container has already been opened, so we are + * creating in there. Maybe a BVD, maybe an SVD. + * Should make a distinction one day. + */ + return validate_geometry_ddf_bvd(st, level, layout, raiddisks, + chunk, size, data_offset, dev, + freesize, + verbose); + } + /* This is the first device for the array. + * If it is a container, we read it in and do automagic allocations, + * no other devices should be given. + * Otherwise it must be a member device of a container, and we + * do manual allocation. + * Later we should check for a BVD and make an SVD. 
+ */ + fd = open(dev, O_RDONLY|O_EXCL, 0); + if (fd >= 0) { + close(fd); + /* Just a bare device, no good to us */ + if (verbose) + pr_err("ddf: Cannot create this array on device %s - a container is required.\n", + dev); + return 0; + } + if (errno != EBUSY || (fd = open(dev, O_RDONLY, 0)) < 0) { + if (verbose) + pr_err("ddf: Cannot open %s: %s\n", + dev, strerror(errno)); + return 0; + } + /* Well, it is in use by someone, maybe a 'ddf' container. */ + cfd = open_container(fd); + if (cfd < 0) { + close(fd); + if (verbose) + pr_err("ddf: Cannot use %s: %s\n", + dev, strerror(EBUSY)); + return 0; + } + sra = sysfs_read(cfd, NULL, GET_VERSION); + close(fd); + if (sra && sra->array.major_version == -1 && + strcmp(sra->text_version, "ddf") == 0) { + /* This is a member of a ddf container. Load the container + * and try to create a bvd + */ + struct ddf_super *ddf; + if (load_super_ddf_all(st, cfd, (void **)&ddf, NULL) == 0) { + st->sb = ddf; + strcpy(st->container_devnm, fd2devnm(cfd)); + close(cfd); + return validate_geometry_ddf_bvd(st, level, layout, + raiddisks, chunk, size, + data_offset, + dev, freesize, + verbose); + } + close(cfd); + } else /* device may belong to a different container */ + return 0; + + return 1; +} + +static int +validate_geometry_ddf_container(struct supertype *st, + int level, int layout, int raiddisks, + int chunk, unsigned long long size, + unsigned long long data_offset, + char *dev, unsigned long long *freesize, + int verbose) +{ + int fd; + unsigned long long ldsize; + + if (level != LEVEL_CONTAINER) + return 0; + if (!dev) + return 1; + + fd = dev_open(dev, O_RDONLY|O_EXCL); + if (fd < 0) { + if (verbose) + pr_err("ddf: Cannot open %s: %s\n", + dev, strerror(errno)); + return 0; + } + if (!get_dev_size(fd, dev, &ldsize)) { + close(fd); + return 0; + } + close(fd); + if (freesize) { + *freesize = avail_size_ddf(st, ldsize >> 9, INVALID_SECTORS); + if (*freesize == 0) + return 0; + } + + return 1; +} + +static int 
validate_geometry_ddf_bvd(struct supertype *st, + int level, int layout, int raiddisks, + int *chunk, unsigned long long size, + unsigned long long data_offset, + char *dev, unsigned long long *freesize, + int verbose) +{ + dev_t rdev; + struct ddf_super *ddf = st->sb; + struct dl *dl; + unsigned long long maxsize; + /* ddf/bvd supports lots of things, but not containers */ + if (level == LEVEL_CONTAINER) { + if (verbose) + pr_err("DDF cannot create a container within an container\n"); + return 0; + } + /* We must have the container info already read in. */ + if (!ddf) + return 0; + + if (!dev) { + /* General test: make sure there is space for + * 'raiddisks' device extents of size 'size'. + */ + unsigned long long minsize = size; + int dcnt = 0; + if (minsize == 0) + minsize = 8; + for (dl = ddf->dlist; dl ; dl = dl->next) { + if (find_space(ddf, dl, data_offset, &minsize) != + INVALID_SECTORS) + dcnt++; + } + if (dcnt < raiddisks) { + if (verbose) + pr_err("ddf: Not enough devices with space for this array (%d < %d)\n", + dcnt, raiddisks); + return 0; + } + return 1; + } + /* This device must be a member of the set */ + if (!stat_is_blkdev(dev, &rdev)) + return 0; + for (dl = ddf->dlist ; dl ; dl = dl->next) { + if (dl->major == (int)major(rdev) && + dl->minor == (int)minor(rdev)) + break; + } + if (!dl) { + if (verbose) + pr_err("ddf: %s is not in the same DDF set\n", + dev); + return 0; + } + maxsize = ULLONG_MAX; + find_space(ddf, dl, data_offset, &maxsize); + *freesize = maxsize; + + return 1; +} + +static int load_super_ddf_all(struct supertype *st, int fd, + void **sbp, char *devname) +{ + struct mdinfo *sra; + struct ddf_super *super; + struct mdinfo *sd, *best = NULL; + int bestseq = 0; + int seq; + char nm[20]; + int dfd; + + sra = sysfs_read(fd, NULL, GET_LEVEL|GET_VERSION|GET_DEVS|GET_STATE); + if (!sra) + return 1; + if (sra->array.major_version != -1 || + sra->array.minor_version != -2 || + strcmp(sra->text_version, "ddf") != 0) + return 1; + + if 
(posix_memalign((void**)&super, 512, sizeof(*super)) != 0) + return 1; + memset(super, 0, sizeof(*super)); + + /* first, try each device, and choose the best ddf */ + for (sd = sra->devs ; sd ; sd = sd->next) { + int rv; + sprintf(nm, "%d:%d", sd->disk.major, sd->disk.minor); + dfd = dev_open(nm, O_RDONLY); + if (dfd < 0) + return 2; + rv = load_ddf_headers(dfd, super, NULL); + close(dfd); + if (rv == 0) { + seq = be32_to_cpu(super->active->seq); + if (super->active->openflag) + seq--; + if (!best || seq > bestseq) { + bestseq = seq; + best = sd; + } + } + } + if (!best) + return 1; + /* OK, load this ddf */ + sprintf(nm, "%d:%d", best->disk.major, best->disk.minor); + dfd = dev_open(nm, O_RDONLY); + if (dfd < 0) + return 1; + load_ddf_headers(dfd, super, NULL); + load_ddf_global(dfd, super, NULL); + close(dfd); + /* Now we need the device-local bits */ + for (sd = sra->devs ; sd ; sd = sd->next) { + int rv; + + sprintf(nm, "%d:%d", sd->disk.major, sd->disk.minor); + dfd = dev_open(nm, O_RDWR); + if (dfd < 0) + return 2; + rv = load_ddf_headers(dfd, super, NULL); + if (rv == 0) + rv = load_ddf_local(dfd, super, NULL, 1); + if (rv) + return 1; + } + + *sbp = super; + if (st->ss == NULL) { + st->ss = &super_ddf; + st->minor_version = 0; + st->max_devs = 512; + } + strcpy(st->container_devnm, fd2devnm(fd)); + return 0; +} + +static int load_container_ddf(struct supertype *st, int fd, + char *devname) +{ + return load_super_ddf_all(st, fd, &st->sb, devname); +} + +static int check_secondary(const struct vcl *vc) +{ + const struct vd_config *conf = &vc->conf; + int i; + + /* The only DDF secondary RAID level md can support is + * RAID 10, if the stripe sizes and Basic volume sizes + * are all equal. + * Other configurations could in theory be supported by exposing + * the BVDs to user space and using device mapper for the secondary + * mapping. So far we don't support that. 
+ */ + + __u64 sec_elements[4] = {0, 0, 0, 0}; +#define __set_sec_seen(n) (sec_elements[(n)>>6] |= (1<<((n)&63))) +#define __was_sec_seen(n) ((sec_elements[(n)>>6] & (1<<((n)&63))) != 0) + + if (vc->other_bvds == NULL) { + pr_err("No BVDs for secondary RAID found\n"); + return -1; + } + if (conf->prl != DDF_RAID1) { + pr_err("Secondary RAID level only supported for mirrored BVD\n"); + return -1; + } + if (conf->srl != DDF_2STRIPED && conf->srl != DDF_2SPANNED) { + pr_err("Secondary RAID level %d is unsupported\n", + conf->srl); + return -1; + } + __set_sec_seen(conf->sec_elmnt_seq); + for (i = 0; i < conf->sec_elmnt_count-1; i++) { + const struct vd_config *bvd = vc->other_bvds[i]; + if (bvd->sec_elmnt_seq == DDF_UNUSED_BVD) + continue; + if (bvd->srl != conf->srl) { + pr_err("Inconsistent secondary RAID level across BVDs\n"); + return -1; + } + if (bvd->prl != conf->prl) { + pr_err("Different RAID levels for BVDs are unsupported\n"); + return -1; + } + if (!be16_eq(bvd->prim_elmnt_count, conf->prim_elmnt_count)) { + pr_err("All BVDs must have the same number of primary elements\n"); + return -1; + } + if (bvd->chunk_shift != conf->chunk_shift) { + pr_err("Different strip sizes for BVDs are unsupported\n"); + return -1; + } + if (!be64_eq(bvd->array_blocks, conf->array_blocks)) { + pr_err("Different BVD sizes are unsupported\n"); + return -1; + } + __set_sec_seen(bvd->sec_elmnt_seq); + } + for (i = 0; i < conf->sec_elmnt_count; i++) { + if (!__was_sec_seen(i)) { + /* pr_err("BVD %d is missing\n", i); */ + return -1; + } + } + return 0; +} + +static unsigned int get_pd_index_from_refnum(const struct vcl *vc, + be32 refnum, unsigned int nmax, + const struct vd_config **bvd, + unsigned int *idx) +{ + unsigned int i, j, n, sec, cnt; + + cnt = be16_to_cpu(vc->conf.prim_elmnt_count); + sec = (vc->conf.sec_elmnt_count == 1 ? 
0 : vc->conf.sec_elmnt_seq); + + for (i = 0, j = 0 ; i < nmax ; i++) { + /* j counts valid entries for this BVD */ + if (be32_eq(vc->conf.phys_refnum[i], refnum)) { + *bvd = &vc->conf; + *idx = i; + return sec * cnt + j; + } + if (be32_to_cpu(vc->conf.phys_refnum[i]) != 0xffffffff) + j++; + } + if (vc->other_bvds == NULL) + goto bad; + + for (n = 1; n < vc->conf.sec_elmnt_count; n++) { + struct vd_config *vd = vc->other_bvds[n-1]; + sec = vd->sec_elmnt_seq; + if (sec == DDF_UNUSED_BVD) + continue; + for (i = 0, j = 0 ; i < nmax ; i++) { + if (be32_eq(vd->phys_refnum[i], refnum)) { + *bvd = vd; + *idx = i; + return sec * cnt + j; + } + if (be32_to_cpu(vd->phys_refnum[i]) != 0xffffffff) + j++; + } + } +bad: + *bvd = NULL; + return DDF_NOTFOUND; +} + +static struct mdinfo *container_content_ddf(struct supertype *st, char *subarray) +{ + /* Given a container loaded by load_super_ddf_all, + * extract information about all the arrays into + * an mdinfo tree. + * + * For each vcl in conflist: create an mdinfo, fill it in, + * then look for matching devices (phys_refnum) in dlist + * and create appropriate device mdinfo. 
+ */ + struct ddf_super *ddf = st->sb; + struct mdinfo *rest = NULL; + struct vcl *vc; + + for (vc = ddf->conflist ; vc ; vc=vc->next) { + unsigned int i; + struct mdinfo *this; + char *ep; + __u32 *cptr; + unsigned int pd; + + if (subarray && + (strtoul(subarray, &ep, 10) != vc->vcnum || + *ep != '\0')) + continue; + + if (vc->conf.sec_elmnt_count > 1) { + if (check_secondary(vc) != 0) + continue; + } + + this = xcalloc(1, sizeof(*this)); + this->next = rest; + rest = this; + + if (layout_ddf2md(&vc->conf, &this->array)) + continue; + this->array.md_minor = -1; + this->array.major_version = -1; + this->array.minor_version = -2; + this->safe_mode_delay = DDF_SAFE_MODE_DELAY; + cptr = (__u32 *)(vc->conf.guid + 16); + this->array.ctime = DECADE + __be32_to_cpu(*cptr); + this->array.utime = DECADE + + be32_to_cpu(vc->conf.timestamp); + this->array.chunk_size = 512 << vc->conf.chunk_shift; + + i = vc->vcnum; + if ((ddf->virt->entries[i].state & DDF_state_inconsistent) || + (ddf->virt->entries[i].init_state & DDF_initstate_mask) != + DDF_init_full) { + this->array.state = 0; + this->resync_start = 0; + } else { + this->array.state = 1; + this->resync_start = MaxSector; + } + _ddf_array_name(this->name, ddf, i); + memset(this->uuid, 0, sizeof(this->uuid)); + this->component_size = be64_to_cpu(vc->conf.blocks); + this->array.size = this->component_size / 2; + this->container_member = i; + + ddf->currentconf = vc; + uuid_from_super_ddf(st, this->uuid); + if (!subarray) + ddf->currentconf = NULL; + + sprintf(this->text_version, "/%s/%d", + st->container_devnm, this->container_member); + + for (pd = 0; pd < be16_to_cpu(ddf->phys->max_pdes); pd++) { + struct mdinfo *dev; + struct dl *d; + const struct vd_config *bvd; + unsigned int iphys; + int stt; + + if (be32_to_cpu(ddf->phys->entries[pd].refnum) == + 0xffffffff) + continue; + + stt = be16_to_cpu(ddf->phys->entries[pd].state); + if ((stt & (DDF_Online|DDF_Failed|DDF_Rebuilding)) != + DDF_Online) + continue; + + i = 
get_pd_index_from_refnum( + vc, ddf->phys->entries[pd].refnum, + ddf->mppe, &bvd, &iphys); + if (i == DDF_NOTFOUND) + continue; + + this->array.working_disks++; + + for (d = ddf->dlist; d ; d=d->next) + if (be32_eq(d->disk.refnum, + ddf->phys->entries[pd].refnum)) + break; + if (d == NULL) + /* Haven't found that one yet, maybe there are others */ + continue; + + dev = xcalloc(1, sizeof(*dev)); + dev->next = this->devs; + this->devs = dev; + + dev->disk.number = be32_to_cpu(d->disk.refnum); + dev->disk.major = d->major; + dev->disk.minor = d->minor; + dev->disk.raid_disk = i; + dev->disk.state = (1<<MD_DISK_SYNC)|(1<<MD_DISK_ACTIVE); + dev->recovery_start = MaxSector; + + dev->events = be32_to_cpu(ddf->active->seq); + dev->data_offset = + be64_to_cpu(LBA_OFFSET(ddf, bvd)[iphys]); + dev->component_size = be64_to_cpu(bvd->blocks); + if (d->devname) + strcpy(dev->name, d->devname); + } + } + return rest; +} + +static int store_super_ddf(struct supertype *st, int fd) +{ + struct ddf_super *ddf = st->sb; + unsigned long long dsize; + void *buf; + int rc; + + if (!ddf) + return 1; + + if (!get_dev_size(fd, NULL, &dsize)) + return 1; + + if (ddf->dlist || ddf->conflist) { + struct stat sta; + struct dl *dl; + int ofd, ret; + + if (fstat(fd, &sta) == -1 || !S_ISBLK(sta.st_mode)) { + pr_err("file descriptor for invalid device\n"); + return 1; + } + for (dl = ddf->dlist; dl; dl = dl->next) + if (dl->major == (int)major(sta.st_rdev) && + dl->minor == (int)minor(sta.st_rdev)) + break; + if (!dl) { + pr_err("couldn't find disk %d/%d\n", + (int)major(sta.st_rdev), + (int)minor(sta.st_rdev)); + return 1; + } + ofd = dl->fd; + dl->fd = fd; + ret = (_write_super_to_disk(ddf, dl) != 1); + dl->fd = ofd; + return ret; + } + + if (posix_memalign(&buf, 512, 512) != 0) + return 1; + memset(buf, 0, 512); + + lseek64(fd, dsize-512, 0); + rc = write(fd, buf, 512); + free(buf); + if (rc < 0) + return 1; + return 0; +} + +static int compare_super_ddf(struct supertype *st, struct supertype 
*tst, + int verbose) +{ + /* + * return: + * 0 same, or first was empty, and second was copied + * 1 second had wrong magic number - but that isn't possible + * 2 wrong uuid + * 3 wrong other info + */ + struct ddf_super *first = st->sb; + struct ddf_super *second = tst->sb; + struct dl *dl1, *dl2; + struct vcl *vl1, *vl2; + unsigned int max_vds, max_pds, pd, vd; + + if (!first) { + st->sb = tst->sb; + tst->sb = NULL; + return 0; + } + + if (memcmp(first->anchor.guid, second->anchor.guid, DDF_GUID_LEN) != 0) + return 2; + + /* It is only OK to compare info in the anchor. Anything else + * could be changing due to a reconfig so must be ignored. + * guid really should be enough anyway. + */ + + if (!be32_eq(first->active->seq, second->active->seq)) { + dprintf("sequence number mismatch %u<->%u\n", + be32_to_cpu(first->active->seq), + be32_to_cpu(second->active->seq)); + return 0; + } + + /* + * At this point we are fairly sure that the meta data matches. + * But the new disk may contain additional local data. + * Add it to the super block. 
+ */ + max_vds = be16_to_cpu(first->active->max_vd_entries); + max_pds = be16_to_cpu(first->phys->max_pdes); + for (vl2 = second->conflist; vl2; vl2 = vl2->next) { + for (vl1 = first->conflist; vl1; vl1 = vl1->next) + if (!memcmp(vl1->conf.guid, vl2->conf.guid, + DDF_GUID_LEN)) + break; + if (vl1) { + if (vl1->other_bvds != NULL && + vl1->conf.sec_elmnt_seq != + vl2->conf.sec_elmnt_seq) { + dprintf("adding BVD %u\n", + vl2->conf.sec_elmnt_seq); + add_other_bvd(vl1, &vl2->conf, + first->conf_rec_len*512); + } + continue; + } + + if (posix_memalign((void **)&vl1, 512, + (first->conf_rec_len*512 + + offsetof(struct vcl, conf))) != 0) { + pr_err("could not allocate vcl buf\n"); + return 3; + } + + vl1->next = first->conflist; + vl1->block_sizes = NULL; + memcpy(&vl1->conf, &vl2->conf, first->conf_rec_len*512); + if (alloc_other_bvds(first, vl1) != 0) { + pr_err("could not allocate other bvds\n"); + free(vl1); + return 3; + } + for (vd = 0; vd < max_vds; vd++) + if (!memcmp(first->virt->entries[vd].guid, + vl1->conf.guid, DDF_GUID_LEN)) + break; + vl1->vcnum = vd; + dprintf("added config for VD %u\n", vl1->vcnum); + first->conflist = vl1; + } + + for (dl2 = second->dlist; dl2; dl2 = dl2->next) { + for (dl1 = first->dlist; dl1; dl1 = dl1->next) + if (be32_eq(dl1->disk.refnum, dl2->disk.refnum)) + break; + if (dl1) + continue; + + if (posix_memalign((void **)&dl1, 512, + sizeof(*dl1) + (first->max_part) * + sizeof(dl1->vlist[0])) != 0) { + pr_err("could not allocate disk info buffer\n"); + return 3; + } + memcpy(dl1, dl2, sizeof(*dl1)); + dl1->mdupdate = NULL; + dl1->next = first->dlist; + dl1->fd = -1; + for (pd = 0; pd < max_pds; pd++) + if (be32_eq(first->phys->entries[pd].refnum, + dl1->disk.refnum)) + break; + dl1->pdnum = pd < max_pds ? 
(int)pd : -1; + if (dl2->spare) { + if (posix_memalign((void **)&dl1->spare, 512, + first->conf_rec_len*512) != 0) { + pr_err("could not allocate spare info buf\n"); + return 3; + } + memcpy(dl1->spare, dl2->spare, first->conf_rec_len*512); + } + for (vd = 0 ; vd < first->max_part ; vd++) { + if (!dl2->vlist[vd]) { + dl1->vlist[vd] = NULL; + continue; + } + for (vl1 = first->conflist; vl1; vl1 = vl1->next) { + if (!memcmp(vl1->conf.guid, + dl2->vlist[vd]->conf.guid, + DDF_GUID_LEN)) + break; + dl1->vlist[vd] = vl1; + } + } + first->dlist = dl1; + dprintf("added disk %d: %08x\n", dl1->pdnum, + be32_to_cpu(dl1->disk.refnum)); + } + + return 0; +} + +/* + * A new array 'a' has been started which claims to be instance 'inst' + * within container 'c'. + * We need to confirm that the array matches the metadata in 'c' so + * that we don't corrupt any metadata. + */ +static int ddf_open_new(struct supertype *c, struct active_array *a, int inst) +{ + struct ddf_super *ddf = c->sb; + struct mdinfo *dev; + struct dl *dl; + static const char faulty[] = "faulty"; + + if (all_ff(ddf->virt->entries[inst].guid)) { + pr_err("subarray %d doesn't exist\n", inst); + return -ENODEV; + } + dprintf("new subarray %d, GUID: %s\n", inst, + guid_str(ddf->virt->entries[inst].guid)); + for (dev = a->info.devs; dev; dev = dev->next) { + for (dl = ddf->dlist; dl; dl = dl->next) + if (dl->major == dev->disk.major && + dl->minor == dev->disk.minor) + break; + if (!dl || dl->pdnum < 0) { + pr_err("device %d/%d of subarray %d not found in meta data\n", + dev->disk.major, dev->disk.minor, inst); + return -1; + } + if ((be16_to_cpu(ddf->phys->entries[dl->pdnum].state) & + (DDF_Online|DDF_Missing|DDF_Failed)) != DDF_Online) { + pr_err("new subarray %d contains broken device %d/%d (%02x)\n", + inst, dl->major, dl->minor, + be16_to_cpu(ddf->phys->entries[dl->pdnum].state)); + if (write(dev->state_fd, faulty, sizeof(faulty)-1) != + sizeof(faulty) - 1) + pr_err("Write to state_fd failed\n"); + 
dev->curr_state = DS_FAULTY; + } + } + a->info.container_member = inst; + return 0; +} + +static void handle_missing(struct ddf_super *ddf, struct active_array *a, int inst) +{ + /* This member array is being activated. If any devices + * are missing they must now be marked as failed. + */ + struct vd_config *vc; + unsigned int n_bvd; + struct vcl *vcl; + struct dl *dl; + int pd; + int n; + int state; + + for (n = 0; ; n++) { + vc = find_vdcr(ddf, inst, n, &n_bvd, &vcl); + if (!vc) + break; + for (dl = ddf->dlist; dl; dl = dl->next) + if (be32_eq(dl->disk.refnum, vc->phys_refnum[n_bvd])) + break; + if (dl) + /* Found this disk, so not missing */ + continue; + + /* Mark the device as failed/missing. */ + pd = find_phys(ddf, vc->phys_refnum[n_bvd]); + if (pd >= 0 && be16_and(ddf->phys->entries[pd].state, + cpu_to_be16(DDF_Online))) { + be16_clear(ddf->phys->entries[pd].state, + cpu_to_be16(DDF_Online)); + be16_set(ddf->phys->entries[pd].state, + cpu_to_be16(DDF_Failed|DDF_Missing)); + vc->phys_refnum[n_bvd] = cpu_to_be32(0); + ddf_set_updates_pending(ddf, vc); + } + + /* Mark the array as Degraded */ + state = get_svd_state(ddf, vcl); + if (ddf->virt->entries[inst].state != + ((ddf->virt->entries[inst].state & ~DDF_state_mask) + | state)) { + ddf->virt->entries[inst].state = + (ddf->virt->entries[inst].state & ~DDF_state_mask) + | state; + a->check_degraded = 1; + ddf_set_updates_pending(ddf, vc); + } + } +} + +/* + * The array 'a' is to be marked clean in the metadata. + * If '->resync_start' is not ~(unsigned long long)0, then the array is only + * clean up to the point (in sectors). If that cannot be recorded in the + * metadata, then leave it as dirty. + * + * For DDF, we need to clear the DDF_state_inconsistent bit in the + * !global! virtual_disk.virtual_entry structure. 
+ */ +static int ddf_set_array_state(struct active_array *a, int consistent) +{ + struct ddf_super *ddf = a->container->sb; + int inst = a->info.container_member; + int old = ddf->virt->entries[inst].state; + if (consistent == 2) { + handle_missing(ddf, a, inst); + consistent = 1; + if (!is_resync_complete(&a->info)) + consistent = 0; + } + if (consistent) + ddf->virt->entries[inst].state &= ~DDF_state_inconsistent; + else + ddf->virt->entries[inst].state |= DDF_state_inconsistent; + if (old != ddf->virt->entries[inst].state) + ddf_set_updates_pending(ddf, NULL); + + old = ddf->virt->entries[inst].init_state; + ddf->virt->entries[inst].init_state &= ~DDF_initstate_mask; + if (is_resync_complete(&a->info)) + ddf->virt->entries[inst].init_state |= DDF_init_full; + else if (a->info.resync_start == 0) + ddf->virt->entries[inst].init_state |= DDF_init_not; + else + ddf->virt->entries[inst].init_state |= DDF_init_quick; + if (old != ddf->virt->entries[inst].init_state) + ddf_set_updates_pending(ddf, NULL); + + dprintf("ddf mark %d/%s (%d) %s %llu\n", inst, + guid_str(ddf->virt->entries[inst].guid), a->curr_state, + consistent?"clean":"dirty", + a->info.resync_start); + return consistent; +} + +static int get_bvd_state(const struct ddf_super *ddf, + const struct vd_config *vc) +{ + unsigned int i, n_bvd, working = 0; + unsigned int n_prim = be16_to_cpu(vc->prim_elmnt_count); + int pd, st, state; + char *avail = xcalloc(1, n_prim); + mdu_array_info_t array; + + layout_ddf2md(vc, &array); + + for (i = 0; i < n_prim; i++) { + if (!find_index_in_bvd(ddf, vc, i, &n_bvd)) + continue; + pd = find_phys(ddf, vc->phys_refnum[n_bvd]); + if (pd < 0) + continue; + st = be16_to_cpu(ddf->phys->entries[pd].state); + if ((st & (DDF_Online|DDF_Failed|DDF_Rebuilding)) == + DDF_Online) { + working++; + avail[i] = 1; + } + } + + state = DDF_state_degraded; + if (working == n_prim) + state = DDF_state_optimal; + else + switch (vc->prl) { + case DDF_RAID0: + case DDF_CONCAT: + case DDF_JBOD: + 
state = DDF_state_failed; + break; + case DDF_RAID1: + if (working == 0) + state = DDF_state_failed; + else if (working >= 2) + state = DDF_state_part_optimal; + break; + case DDF_RAID1E: + if (!enough(10, n_prim, array.layout, 1, avail)) + state = DDF_state_failed; + break; + case DDF_RAID4: + case DDF_RAID5: + if (working < n_prim - 1) + state = DDF_state_failed; + break; + case DDF_RAID6: + if (working < n_prim - 2) + state = DDF_state_failed; + else if (working == n_prim - 1) + state = DDF_state_part_optimal; + break; + } + return state; +} + +static int secondary_state(int state, int other, int seclevel) +{ + if (state == DDF_state_optimal && other == DDF_state_optimal) + return DDF_state_optimal; + if (seclevel == DDF_2MIRRORED) { + if (state == DDF_state_optimal || other == DDF_state_optimal) + return DDF_state_part_optimal; + if (state == DDF_state_failed && other == DDF_state_failed) + return DDF_state_failed; + return DDF_state_degraded; + } else { + if (state == DDF_state_failed || other == DDF_state_failed) + return DDF_state_failed; + if (state == DDF_state_degraded || other == DDF_state_degraded) + return DDF_state_degraded; + return DDF_state_part_optimal; + } +} + +static int get_svd_state(const struct ddf_super *ddf, const struct vcl *vcl) +{ + int state = get_bvd_state(ddf, &vcl->conf); + unsigned int i; + for (i = 1; i < vcl->conf.sec_elmnt_count; i++) { + state = secondary_state( + state, + get_bvd_state(ddf, vcl->other_bvds[i-1]), + vcl->conf.srl); + } + return state; +} + +/* + * The state of each disk is stored in the global phys_disk structure + * in phys_disk.entries[n].state. + * This makes various combinations awkward. + * - When a device fails in any array, it must be failed in all arrays + * that include a part of this device. + * - When a component is rebuilding, we cannot include it officially in the + * array unless this is the only array that uses the device. 
+ * + * So: when transitioning: + * Online -> failed, just set failed flag. monitor will propagate + * spare -> online, the device might need to be added to the array. + * spare -> failed, just set failed. Don't worry if in array or not. + */ +static void ddf_set_disk(struct active_array *a, int n, int state) +{ + struct ddf_super *ddf = a->container->sb; + unsigned int inst = a->info.container_member, n_bvd; + struct vcl *vcl; + struct vd_config *vc = find_vdcr(ddf, inst, (unsigned int)n, + &n_bvd, &vcl); + int pd; + struct mdinfo *mdi; + struct dl *dl; + int update = 0; + + dprintf("%d to %x\n", n, state); + if (vc == NULL) { + dprintf("ddf: cannot find instance %d!!\n", inst); + return; + } + /* Find the matching slot in 'info'. */ + for (mdi = a->info.devs; mdi; mdi = mdi->next) + if (mdi->disk.raid_disk == n) + break; + if (!mdi) { + pr_err("cannot find raid disk %d\n", n); + return; + } + + /* and find the 'dl' entry corresponding to that. */ + for (dl = ddf->dlist; dl; dl = dl->next) + if (mdi->state_fd >= 0 && + mdi->disk.major == dl->major && + mdi->disk.minor == dl->minor) + break; + if (!dl) { + pr_err("cannot find raid disk %d (%d/%d)\n", + n, mdi->disk.major, mdi->disk.minor); + return; + } + + pd = find_phys(ddf, vc->phys_refnum[n_bvd]); + if (pd < 0 || pd != dl->pdnum) { + /* disk doesn't currently exist or has changed. + * If it is now in_sync, insert it. */ + dprintf("phys disk not found for %d: %d/%d ref %08x\n", + dl->pdnum, dl->major, dl->minor, + be32_to_cpu(dl->disk.refnum)); + dprintf("array %u disk %u ref %08x pd %d\n", + inst, n_bvd, + be32_to_cpu(vc->phys_refnum[n_bvd]), pd); + if ((state & DS_INSYNC) && ! 
(state & DS_FAULTY) && + dl->pdnum >= 0) { + pd = dl->pdnum; + vc->phys_refnum[n_bvd] = dl->disk.refnum; + LBA_OFFSET(ddf, vc)[n_bvd] = + cpu_to_be64(mdi->data_offset); + be16_clear(ddf->phys->entries[pd].type, + cpu_to_be16(DDF_Global_Spare)); + be16_set(ddf->phys->entries[pd].type, + cpu_to_be16(DDF_Active_in_VD)); + update = 1; + } + } else { + be16 old = ddf->phys->entries[pd].state; + if (state & DS_FAULTY) + be16_set(ddf->phys->entries[pd].state, + cpu_to_be16(DDF_Failed)); + if (state & DS_INSYNC) { + be16_set(ddf->phys->entries[pd].state, + cpu_to_be16(DDF_Online)); + be16_clear(ddf->phys->entries[pd].state, + cpu_to_be16(DDF_Rebuilding)); + } + if (!be16_eq(old, ddf->phys->entries[pd].state)) + update = 1; + } + + dprintf("ddf: set_disk %d (%08x) to %x->%02x\n", n, + be32_to_cpu(dl->disk.refnum), state, + be16_to_cpu(ddf->phys->entries[pd].state)); + + /* Now we need to check the state of the array and update + * virtual_disk.entries[n].state. + * It needs to be one of "optimal", "degraded", "failed". + * I don't understand 'deleted' or 'missing'. + */ + state = get_svd_state(ddf, vcl); + + if (ddf->virt->entries[inst].state != + ((ddf->virt->entries[inst].state & ~DDF_state_mask) + | state)) { + ddf->virt->entries[inst].state = + (ddf->virt->entries[inst].state & ~DDF_state_mask) + | state; + update = 1; + } + if (update) + ddf_set_updates_pending(ddf, vc); +} + +static void ddf_sync_metadata(struct supertype *st) +{ + /* + * Write all data to all devices. + * Later, we might be able to track whether only local changes + * have been made, or whether any global data has been changed, + * but ddf is sufficiently weird that it probably always + * changes global data .... 
+ */ + struct ddf_super *ddf = st->sb; + if (!ddf->updates_pending) + return; + ddf->updates_pending = 0; + __write_init_super_ddf(st); + dprintf("ddf: sync_metadata\n"); +} + +static int del_from_conflist(struct vcl **list, const char *guid) +{ + struct vcl **p; + int found = 0; + for (p = list; p && *p; p = &((*p)->next)) + if (!memcmp((*p)->conf.guid, guid, DDF_GUID_LEN)) { + found = 1; + *p = (*p)->next; + } + return found; +} + +static int _kill_subarray_ddf(struct ddf_super *ddf, const char *guid) +{ + struct dl *dl; + unsigned int vdnum, i; + vdnum = find_vde_by_guid(ddf, guid); + if (vdnum == DDF_NOTFOUND) { + pr_err("could not find VD %s\n", guid_str(guid)); + return -1; + } + if (del_from_conflist(&ddf->conflist, guid) == 0) { + pr_err("could not find conf %s\n", guid_str(guid)); + return -1; + } + for (dl = ddf->dlist; dl; dl = dl->next) + for (i = 0; i < ddf->max_part; i++) + if (dl->vlist[i] != NULL && + !memcmp(dl->vlist[i]->conf.guid, guid, + DDF_GUID_LEN)) + dl->vlist[i] = NULL; + memset(ddf->virt->entries[vdnum].guid, 0xff, DDF_GUID_LEN); + dprintf("deleted %s\n", guid_str(guid)); + return 0; +} + +static int kill_subarray_ddf(struct supertype *st, char *subarray_id) +{ + struct ddf_super *ddf = st->sb; + /* + * currentconf is set in container_content_ddf, + * called with subarray arg + */ + struct vcl *victim = ddf->currentconf; + struct vd_config *conf; + unsigned int vdnum; + + ddf->currentconf = NULL; + if (!victim) { + pr_err("nothing to kill\n"); + return -1; + } + conf = &victim->conf; + vdnum = find_vde_by_guid(ddf, conf->guid); + if (vdnum == DDF_NOTFOUND) { + pr_err("could not find VD %s\n", guid_str(conf->guid)); + return -1; + } + if (st->update_tail) { + struct virtual_disk *vd; + int len = sizeof(struct virtual_disk) + + sizeof(struct virtual_entry); + vd = xmalloc(len); + if (vd == NULL) { + pr_err("failed to allocate %d bytes\n", len); + return -1; + } + memset(vd, 0 , len); + vd->magic = DDF_VIRT_RECORDS_MAGIC; + vd->populated_vdes 
= cpu_to_be16(0); + memcpy(vd->entries[0].guid, conf->guid, DDF_GUID_LEN); + /* we use DDF_state_deleted as marker */ + vd->entries[0].state = DDF_state_deleted; + append_metadata_update(st, vd, len); + } else { + _kill_subarray_ddf(ddf, conf->guid); + ddf_set_updates_pending(ddf, NULL); + ddf_sync_metadata(st); + } + return 0; +} + +static void copy_matching_bvd(struct ddf_super *ddf, + struct vd_config *conf, + const struct metadata_update *update) +{ + unsigned int mppe = + be16_to_cpu(ddf->anchor.max_primary_element_entries); + unsigned int len = ddf->conf_rec_len * 512; + char *p; + struct vd_config *vc; + for (p = update->buf; p < update->buf + update->len; p += len) { + vc = (struct vd_config *) p; + if (vc->sec_elmnt_seq == conf->sec_elmnt_seq) { + memcpy(conf->phys_refnum, vc->phys_refnum, + mppe * (sizeof(__u32) + sizeof(__u64))); + return; + } + } + pr_err("no match for BVD %d of %s in update\n", + conf->sec_elmnt_seq, guid_str(conf->guid)); +} + +static void ddf_process_phys_update(struct supertype *st, + struct metadata_update *update) +{ + struct ddf_super *ddf = st->sb; + struct phys_disk *pd; + unsigned int ent; + + pd = (struct phys_disk*)update->buf; + ent = be16_to_cpu(pd->used_pdes); + if (ent >= be16_to_cpu(ddf->phys->max_pdes)) + return; + if (be16_and(pd->entries[0].state, cpu_to_be16(DDF_Missing))) { + struct dl **dlp; + /* removing this disk. 
*/ + be16_set(ddf->phys->entries[ent].state, + cpu_to_be16(DDF_Missing)); + for (dlp = &ddf->dlist; *dlp; dlp = &(*dlp)->next) { + struct dl *dl = *dlp; + if (dl->pdnum == (signed)ent) { + close(dl->fd); + dl->fd = -1; + *dlp = dl->next; + update->space = dl->devname; + *(void**)dl = update->space_list; + update->space_list = (void**)dl; + break; + } + } + ddf_set_updates_pending(ddf, NULL); + return; + } + if (!all_ff(ddf->phys->entries[ent].guid)) + return; + ddf->phys->entries[ent] = pd->entries[0]; + ddf->phys->used_pdes = cpu_to_be16 + (1 + be16_to_cpu(ddf->phys->used_pdes)); + ddf_set_updates_pending(ddf, NULL); + if (ddf->add_list) { + struct active_array *a; + struct dl *al = ddf->add_list; + ddf->add_list = al->next; + + al->next = ddf->dlist; + ddf->dlist = al; + + /* As a device has been added, we should check + * for any degraded devices that might make + * use of this spare */ + for (a = st->arrays ; a; a=a->next) + a->check_degraded = 1; + } +} + +static void ddf_process_virt_update(struct supertype *st, + struct metadata_update *update) +{ + struct ddf_super *ddf = st->sb; + struct virtual_disk *vd; + unsigned int ent; + + vd = (struct virtual_disk*)update->buf; + + if (vd->entries[0].state == DDF_state_deleted) { + if (_kill_subarray_ddf(ddf, vd->entries[0].guid)) + return; + } else { + ent = find_vde_by_guid(ddf, vd->entries[0].guid); + if (ent != DDF_NOTFOUND) { + dprintf("VD %s exists already in slot %d\n", + guid_str(vd->entries[0].guid), + ent); + return; + } + ent = find_unused_vde(ddf); + if (ent == DDF_NOTFOUND) + return; + ddf->virt->entries[ent] = vd->entries[0]; + ddf->virt->populated_vdes = + cpu_to_be16( + 1 + be16_to_cpu( + ddf->virt->populated_vdes)); + dprintf("added VD %s in slot %d(s=%02x i=%02x)\n", + guid_str(vd->entries[0].guid), ent, + ddf->virt->entries[ent].state, + ddf->virt->entries[ent].init_state); + } + ddf_set_updates_pending(ddf, NULL); +} + +static void ddf_remove_failed(struct ddf_super *ddf) +{ + /* Now remove any 
'Failed' devices that are not part + * of any VD. They will have the Transition flag set. + * Once done, we need to update all dl->pdnum numbers. + */ + unsigned int pdnum; + unsigned int pd2 = 0; + struct dl *dl; + + for (pdnum = 0; pdnum < be16_to_cpu(ddf->phys->max_pdes); + pdnum++) { + if (be32_to_cpu(ddf->phys->entries[pdnum].refnum) == + 0xFFFFFFFF) + continue; + if (be16_and(ddf->phys->entries[pdnum].state, + cpu_to_be16(DDF_Failed)) && + be16_and(ddf->phys->entries[pdnum].state, + cpu_to_be16(DDF_Transition))) { + /* skip this one unless in dlist*/ + for (dl = ddf->dlist; dl; dl = dl->next) + if (dl->pdnum == (int)pdnum) + break; + if (!dl) + continue; + } + if (pdnum == pd2) + pd2++; + else { + ddf->phys->entries[pd2] = + ddf->phys->entries[pdnum]; + for (dl = ddf->dlist; dl; dl = dl->next) + if (dl->pdnum == (int)pdnum) + dl->pdnum = pd2; + pd2++; + } + } + ddf->phys->used_pdes = cpu_to_be16(pd2); + while (pd2 < pdnum) { + memset(ddf->phys->entries[pd2].guid, 0xff, + DDF_GUID_LEN); + pd2++; + } +} + +static void ddf_update_vlist(struct ddf_super *ddf, struct dl *dl) +{ + struct vcl *vcl; + unsigned int vn = 0; + int in_degraded = 0; + + if (dl->pdnum < 0) + return; + for (vcl = ddf->conflist; vcl ; vcl = vcl->next) { + unsigned int dn, ibvd; + const struct vd_config *conf; + int vstate; + dn = get_pd_index_from_refnum(vcl, + dl->disk.refnum, + ddf->mppe, + &conf, &ibvd); + if (dn == DDF_NOTFOUND) + continue; + dprintf("dev %d/%08x has %s (sec=%u) at %d\n", + dl->pdnum, + be32_to_cpu(dl->disk.refnum), + guid_str(conf->guid), + conf->sec_elmnt_seq, vn); + /* Clear the Transition flag */ + if (be16_and + (ddf->phys->entries[dl->pdnum].state, + cpu_to_be16(DDF_Failed))) + be16_clear(ddf->phys + ->entries[dl->pdnum].state, + cpu_to_be16(DDF_Transition)); + dl->vlist[vn++] = vcl; + vstate = ddf->virt->entries[vcl->vcnum].state + & DDF_state_mask; + if (vstate == DDF_state_degraded || + vstate == DDF_state_part_optimal) + in_degraded = 1; + } + while (vn < 
ddf->max_part) + dl->vlist[vn++] = NULL; + if (dl->vlist[0]) { + be16_clear(ddf->phys->entries[dl->pdnum].type, + cpu_to_be16(DDF_Global_Spare)); + if (!be16_and(ddf->phys + ->entries[dl->pdnum].type, + cpu_to_be16(DDF_Active_in_VD))) { + be16_set(ddf->phys + ->entries[dl->pdnum].type, + cpu_to_be16(DDF_Active_in_VD)); + if (in_degraded) + be16_set(ddf->phys + ->entries[dl->pdnum] + .state, + cpu_to_be16 + (DDF_Rebuilding)); + } + } + if (dl->spare) { + be16_clear(ddf->phys->entries[dl->pdnum].type, + cpu_to_be16(DDF_Global_Spare)); + be16_set(ddf->phys->entries[dl->pdnum].type, + cpu_to_be16(DDF_Spare)); + } + if (!dl->vlist[0] && !dl->spare) { + be16_set(ddf->phys->entries[dl->pdnum].type, + cpu_to_be16(DDF_Global_Spare)); + be16_clear(ddf->phys->entries[dl->pdnum].type, + cpu_to_be16(DDF_Spare)); + be16_clear(ddf->phys->entries[dl->pdnum].type, + cpu_to_be16(DDF_Active_in_VD)); + } +} + +static void ddf_process_conf_update(struct supertype *st, + struct metadata_update *update) +{ + struct ddf_super *ddf = st->sb; + struct vd_config *vc; + struct vcl *vcl; + struct dl *dl; + unsigned int ent; + unsigned int pdnum, len; + + vc = (struct vd_config*)update->buf; + len = ddf->conf_rec_len * 512; + if ((unsigned int)update->len != len * vc->sec_elmnt_count) { + pr_err("%s: insufficient data (%d) for %u BVDs\n", + guid_str(vc->guid), update->len, + vc->sec_elmnt_count); + return; + } + for (vcl = ddf->conflist; vcl ; vcl = vcl->next) + if (memcmp(vcl->conf.guid, vc->guid, DDF_GUID_LEN) == 0) + break; + dprintf("conf update for %s (%s)\n", + guid_str(vc->guid), (vcl ? 
"old" : "new")); + if (vcl) { + /* An update, just copy the phys_refnum and lba_offset + * fields + */ + unsigned int i; + unsigned int k; + copy_matching_bvd(ddf, &vcl->conf, update); + for (k = 0; k < be16_to_cpu(vc->prim_elmnt_count); k++) + dprintf("BVD %u has %08x at %llu\n", 0, + be32_to_cpu(vcl->conf.phys_refnum[k]), + be64_to_cpu(LBA_OFFSET(ddf, + &vcl->conf)[k])); + for (i = 1; i < vc->sec_elmnt_count; i++) { + copy_matching_bvd(ddf, vcl->other_bvds[i-1], + update); + for (k = 0; k < be16_to_cpu( + vc->prim_elmnt_count); k++) + dprintf("BVD %u has %08x at %llu\n", i, + be32_to_cpu + (vcl->other_bvds[i-1]-> + phys_refnum[k]), + be64_to_cpu + (LBA_OFFSET + (ddf, + vcl->other_bvds[i-1])[k])); + } + } else { + /* A new VD_CONF */ + unsigned int i; + if (!update->space) + return; + vcl = update->space; + update->space = NULL; + vcl->next = ddf->conflist; + memcpy(&vcl->conf, vc, len); + ent = find_vde_by_guid(ddf, vc->guid); + if (ent == DDF_NOTFOUND) + return; + vcl->vcnum = ent; + ddf->conflist = vcl; + for (i = 1; i < vc->sec_elmnt_count; i++) + memcpy(vcl->other_bvds[i-1], + update->buf + len * i, len); + } + /* Set DDF_Transition on all Failed devices - to help + * us detect those that are no longer in use + */ + for (pdnum = 0; pdnum < be16_to_cpu(ddf->phys->max_pdes); + pdnum++) + if (be16_and(ddf->phys->entries[pdnum].state, + cpu_to_be16(DDF_Failed))) + be16_set(ddf->phys->entries[pdnum].state, + cpu_to_be16(DDF_Transition)); + + /* Now make sure vlist is correct for each dl. */ + for (dl = ddf->dlist; dl; dl = dl->next) + ddf_update_vlist(ddf, dl); + ddf_remove_failed(ddf); + + ddf_set_updates_pending(ddf, vc); +} + +static void ddf_process_update(struct supertype *st, + struct metadata_update *update) +{ + /* Apply this update to the metadata. + * The first 4 bytes are a DDF_*_MAGIC which guides + * our actions. + * Possible update are: + * DDF_PHYS_RECORDS_MAGIC + * Add a new physical device or remove an old one. 
+ * Changes to this record only happen implicitly. + * used_pdes is the device number. + * DDF_VIRT_RECORDS_MAGIC + * Add a new VD. Possibly also change the 'access' bits. + * populated_vdes is the entry number. + * DDF_VD_CONF_MAGIC + * New or updated VD. the VIRT_RECORD must already + * exist. For an update, phys_refnum and lba_offset + * (at least) are updated, and the VD_CONF must + * be written to precisely those devices listed with + * a phys_refnum. + * DDF_SPARE_ASSIGN_MAGIC + * replacement Spare Assignment Record... but for which device? + * + * So, e.g.: + * - to create a new array, we send a VIRT_RECORD and + * a VD_CONF. Then assemble and start the array. + * - to activate a spare we send a VD_CONF to add the phys_refnum + * and offset. This will also mark the spare as active with + * a spare-assignment record. + */ + be32 *magic = (be32 *)update->buf; + + dprintf("Process update %x\n", be32_to_cpu(*magic)); + + if (be32_eq(*magic, DDF_PHYS_RECORDS_MAGIC)) { + if (update->len == (sizeof(struct phys_disk) + + sizeof(struct phys_disk_entry))) + ddf_process_phys_update(st, update); + } else if (be32_eq(*magic, DDF_VIRT_RECORDS_MAGIC)) { + if (update->len == (sizeof(struct virtual_disk) + + sizeof(struct virtual_entry))) + ddf_process_virt_update(st, update); + } else if (be32_eq(*magic, DDF_VD_CONF_MAGIC)) { + ddf_process_conf_update(st, update); + } + /* case DDF_SPARE_ASSIGN_MAGIC */ +} + +static int ddf_prepare_update(struct supertype *st, + struct metadata_update *update) +{ + /* This update arrived at managemon. + * We are about to pass it to monitor. + * If a malloc is needed, do it here. 
+ */ + struct ddf_super *ddf = st->sb; + be32 *magic; + if (update->len < 4) + return 0; + magic = (be32 *)update->buf; + if (be32_eq(*magic, DDF_VD_CONF_MAGIC)) { + struct vcl *vcl; + struct vd_config *conf; + if (update->len < (int)sizeof(*conf)) + return 0; + conf = (struct vd_config *) update->buf; + if (posix_memalign(&update->space, 512, + offsetof(struct vcl, conf) + + ddf->conf_rec_len * 512) != 0) { + update->space = NULL; + return 0; + } + vcl = update->space; + vcl->conf.sec_elmnt_count = conf->sec_elmnt_count; + if (alloc_other_bvds(ddf, vcl) != 0) { + free(update->space); + update->space = NULL; + return 0; + } + } + return 1; +} + +/* + * Check degraded state of a RAID10. + * returns 2 for good, 1 for degraded, 0 for failed, and -1 for error + */ +static int raid10_degraded(struct mdinfo *info) +{ + int n_prim, n_bvds; + int i; + struct mdinfo *d; + char *found; + int ret = -1; + + n_prim = info->array.layout & ~0x100; + n_bvds = info->array.raid_disks / n_prim; + found = xmalloc(n_bvds); + if (found == NULL) + return ret; + memset(found, 0, n_bvds); + for (d = info->devs; d; d = d->next) { + i = d->disk.raid_disk / n_prim; + if (i >= n_bvds) { + pr_err("BUG: invalid raid disk\n"); + goto out; + } + if (is_fd_valid(d->state_fd)) + found[i]++; + } + ret = 2; + for (i = 0; i < n_bvds; i++) + if (!found[i]) { + dprintf("BVD %d/%d failed\n", i, n_bvds); + ret = 0; + goto out; + } else if (found[i] < n_prim) { + dprintf("BVD %d/%d degraded\n", i, n_bvds); + ret = 1; + } +out: + free(found); + return ret; +} + +/* + * Check if the array 'a' is degraded but not failed. + * If it is, find as many spares as are available and needed and + * arrange for their inclusion. + * We only choose devices which are not already in the array, + * and prefer those with a spare-assignment to this array. + * Otherwise we choose global spares - assuming always that + * there is enough room. 
+ * For each spare that we assign, we return an 'mdinfo' which + * describes the position for the device in the array. + * We also add to 'updates' a DDF_VD_CONF_MAGIC update with + * the new phys_refnum and lba_offset values. + * + * Only worry about BVDs at the moment. + */ +static struct mdinfo *ddf_activate_spare(struct active_array *a, + struct metadata_update **updates) +{ + int working = 0; + struct mdinfo *d; + struct ddf_super *ddf = a->container->sb; + int global_ok = 0; + struct mdinfo *rv = NULL; + struct mdinfo *di; + struct metadata_update *mu; + struct dl *dl; + int i; + unsigned int j; + struct vcl *vcl; + struct vd_config *vc; + unsigned int n_bvd; + + for (d = a->info.devs ; d ; d = d->next) { + if ((d->curr_state & DS_FAULTY) && + d->state_fd >= 0) + /* wait for Removal to happen */ + return NULL; + if (d->state_fd >= 0) + working ++; + } + + dprintf("working=%d (%d) level=%d\n", working, + a->info.array.raid_disks, + a->info.array.level); + if (working == a->info.array.raid_disks) + return NULL; /* array not degraded */ + switch (a->info.array.level) { + case 1: + if (working == 0) + return NULL; /* failed */ + break; + case 4: + case 5: + if (working < a->info.array.raid_disks - 1) + return NULL; /* failed */ + break; + case 6: + if (working < a->info.array.raid_disks - 2) + return NULL; /* failed */ + break; + case 10: + if (raid10_degraded(&a->info) < 1) + return NULL; + break; + default: /* concat or stripe */ + return NULL; /* failed */ + } + + /* For each slot, if it is not working, find a spare */ + dl = ddf->dlist; + for (i = 0; i < a->info.array.raid_disks; i++) { + for (d = a->info.devs ; d ; d = d->next) + if (d->disk.raid_disk == i) + break; + dprintf("found %d: %p %x\n", i, d, d?d->curr_state:0); + if (d && (d->state_fd >= 0)) + continue; + + /* OK, this device needs recovery. 
Find a spare */ + again: + for ( ; dl ; dl = dl->next) { + unsigned long long esize; + unsigned long long pos; + struct mdinfo *d2; + int is_global = 0; + int is_dedicated = 0; + be16 state; + + if (dl->pdnum < 0) + continue; + state = ddf->phys->entries[dl->pdnum].state; + if (be16_and(state, + cpu_to_be16(DDF_Failed|DDF_Missing)) || + !be16_and(state, + cpu_to_be16(DDF_Online))) + continue; + + /* If in this array, skip */ + for (d2 = a->info.devs ; d2 ; d2 = d2->next) + if (d2->state_fd >= 0 && + d2->disk.major == dl->major && + d2->disk.minor == dl->minor) { + dprintf("%x:%x (%08x) already in array\n", + dl->major, dl->minor, + be32_to_cpu(dl->disk.refnum)); + break; + } + if (d2) + continue; + if (be16_and(ddf->phys->entries[dl->pdnum].type, + cpu_to_be16(DDF_Spare))) { + /* Check spare assign record */ + if (dl->spare) { + if (dl->spare->type & DDF_spare_dedicated) { + /* check spare_ents for guid */ + unsigned int j; + for (j = 0 ; + j < be16_to_cpu + (dl->spare + ->populated); + j++) { + if (memcmp(dl->spare->spare_ents[j].guid, + ddf->virt->entries[a->info.container_member].guid, + DDF_GUID_LEN) == 0) + is_dedicated = 1; + } + } else + is_global = 1; + } + } else if (be16_and(ddf->phys->entries[dl->pdnum].type, + cpu_to_be16(DDF_Global_Spare))) { + is_global = 1; + } else if (!be16_and(ddf->phys + ->entries[dl->pdnum].state, + cpu_to_be16(DDF_Failed))) { + /* we can possibly use some of this */ + is_global = 1; + } + if ( ! (is_dedicated || + (is_global && global_ok))) { + dprintf("%x:%x not suitable: %d %d\n", dl->major, dl->minor, + is_dedicated, is_global); + continue; + } + + /* We are allowed to use this device - is there space? 
+ * We need a->info.component_size sectors */ + esize = a->info.component_size; + pos = find_space(ddf, dl, INVALID_SECTORS, &esize); + + if (esize < a->info.component_size) { + dprintf("%x:%x has no room: %llu %llu\n", + dl->major, dl->minor, + esize, a->info.component_size); + /* No room */ + continue; + } + + /* Cool, we have a device with some space at pos */ + di = xcalloc(1, sizeof(*di)); + di->disk.number = i; + di->disk.raid_disk = i; + di->disk.major = dl->major; + di->disk.minor = dl->minor; + di->disk.state = 0; + di->recovery_start = 0; + di->data_offset = pos; + di->component_size = a->info.component_size; + di->next = rv; + rv = di; + dprintf("%x:%x (%08x) to be %d at %llu\n", + dl->major, dl->minor, + be32_to_cpu(dl->disk.refnum), i, pos); + + break; + } + if (!dl && ! global_ok) { + /* not enough dedicated spares, try global */ + global_ok = 1; + dl = ddf->dlist; + goto again; + } + } + + if (!rv) + /* No spares found */ + return rv; + /* Now 'rv' has a list of devices to return. 
+ * Create a metadata_update record to update the + * phys_refnum and lba_offset values + */ + vc = find_vdcr(ddf, a->info.container_member, rv->disk.raid_disk, + &n_bvd, &vcl); + if (vc == NULL) + return NULL; + + mu = xmalloc(sizeof(*mu)); + if (posix_memalign(&mu->space, 512, sizeof(struct vcl)) != 0) { + free(mu); + mu = NULL; + } + + mu->len = ddf->conf_rec_len * 512 * vcl->conf.sec_elmnt_count; + mu->buf = xmalloc(mu->len); + mu->space = NULL; + mu->space_list = NULL; + mu->next = *updates; + memcpy(mu->buf, &vcl->conf, ddf->conf_rec_len * 512); + for (j = 1; j < vcl->conf.sec_elmnt_count; j++) + memcpy(mu->buf + j * ddf->conf_rec_len * 512, + vcl->other_bvds[j-1], ddf->conf_rec_len * 512); + + vc = (struct vd_config*)mu->buf; + for (di = rv ; di ; di = di->next) { + unsigned int i_sec, i_prim; + i_sec = di->disk.raid_disk + / be16_to_cpu(vcl->conf.prim_elmnt_count); + i_prim = di->disk.raid_disk + % be16_to_cpu(vcl->conf.prim_elmnt_count); + vc = (struct vd_config *)(mu->buf + + i_sec * ddf->conf_rec_len * 512); + for (dl = ddf->dlist; dl; dl = dl->next) + if (dl->major == di->disk.major && + dl->minor == di->disk.minor) + break; + if (!dl || dl->pdnum < 0) { + pr_err("BUG: can't find disk %d (%d/%d)\n", + di->disk.raid_disk, + di->disk.major, di->disk.minor); + return NULL; + } + vc->phys_refnum[i_prim] = ddf->phys->entries[dl->pdnum].refnum; + LBA_OFFSET(ddf, vc)[i_prim] = cpu_to_be64(di->data_offset); + dprintf("BVD %u gets %u: %08x at %llu\n", i_sec, i_prim, + be32_to_cpu(vc->phys_refnum[i_prim]), + be64_to_cpu(LBA_OFFSET(ddf, vc)[i_prim])); + } + *updates = mu; + return rv; +} + +static int ddf_level_to_layout(int level) +{ + switch(level) { + case 0: + case 1: + return 0; + case 5: + return ALGORITHM_LEFT_SYMMETRIC; + case 6: + return ALGORITHM_ROTATING_N_CONTINUE; + case 10: + return 0x102; + default: + return UnSet; + } +} + +static void default_geometry_ddf(struct supertype *st, int *level, int *layout, int *chunk) +{ + if (level && *level == UnSet) 
+ *level = LEVEL_CONTAINER; + + if (level && layout && *layout == UnSet) + *layout = ddf_level_to_layout(*level); +} + +struct superswitch super_ddf = { + .examine_super = examine_super_ddf, + .brief_examine_super = brief_examine_super_ddf, + .brief_examine_subarrays = brief_examine_subarrays_ddf, + .export_examine_super = export_examine_super_ddf, + .detail_super = detail_super_ddf, + .brief_detail_super = brief_detail_super_ddf, + .validate_geometry = validate_geometry_ddf, + .write_init_super = write_init_super_ddf, + .add_to_super = add_to_super_ddf, + .remove_from_super = remove_from_super_ddf, + .load_container = load_container_ddf, + .copy_metadata = copy_metadata_ddf, + .kill_subarray = kill_subarray_ddf, + .match_home = match_home_ddf, + .uuid_from_super= uuid_from_super_ddf, + .getinfo_super = getinfo_super_ddf, + .update_super = update_super_ddf, + + .avail_size = avail_size_ddf, + + .compare_super = compare_super_ddf, + + .load_super = load_super_ddf, + .init_super = init_super_ddf, + .store_super = store_super_ddf, + .free_super = free_super_ddf, + .match_metadata_desc = match_metadata_desc_ddf, + .container_content = container_content_ddf, + .default_geometry = default_geometry_ddf, + + .external = 1, + +/* for mdmon */ + .open_new = ddf_open_new, + .set_array_state= ddf_set_array_state, + .set_disk = ddf_set_disk, + .sync_metadata = ddf_sync_metadata, + .process_update = ddf_process_update, + .prepare_update = ddf_prepare_update, + .activate_spare = ddf_activate_spare, + .name = "ddf", +}; diff --git a/super-gpt.c b/super-gpt.c new file mode 100644 index 0000000..a1e9aa9 --- /dev/null +++ b/super-gpt.c @@ -0,0 +1,220 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. 
+ * + * Copyright (C) 2010 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neil@brown.name> + * + */ + +/* + * 'gpt' is a pseudo metadata type for devices which have a + * GPT partition table. + * + * Obviously arrays cannot be created or assembled for this type. + * It is used to allow a new bare device to have a partition table + * added so the member partitions can then be included in other + * arrays as relevant.
+ * + * The meaningful operations are: + * examine_super, but not brief_examine_super or export_examine + * load_super + * store_super + */ + +#include "mdadm.h" +#include "part.h" + +static void free_gpt(struct supertype *st) +{ + free(st->sb); + st->sb = NULL; +} + +/* + * Print the GPT magic and revision, then one line per partition entry + * giving its length in sectors followed by its starting LBA. + * The header is read from st->sb + 512 and the entries from st->sb + 1024. + */ +static void examine_gpt(struct supertype *st, char *homehost) +{ + struct GPT *gpt = st->sb + 512; + struct GPT_part_entry *gpe = st->sb + 1024; + unsigned int i; + + printf(" GPT Magic : %llx\n", (unsigned long long)__le64_to_cpu(gpt->magic)); + printf(" GPT Revision : %ld\n", (long)__le32_to_cpu(gpt->revision)); + for (i = 0; i < __le32_to_cpu(gpt->part_cnt); i++) { + /* argument order must follow the format: length first, then start */ + printf(" Partition[%02d] : %12llu sectors at %12llu\n", + i, + (unsigned long long)__le64_to_cpu(gpe[i].ending_lba)- + (unsigned long long)__le64_to_cpu(gpe[i].starting_lba) + +1, + (unsigned long long)__le64_to_cpu(gpe[i].starting_lba) + ); + } +} + +static int load_gpt(struct supertype *st, int fd, char *devname) +{ + struct MBR *super; + struct GPT *gpt_head; + int to_read; + unsigned int sector_size; + + free_gpt(st); + + if (posix_memalign((void**)&super, 4096, 32*512) != 0) { + pr_err("could not allocate superblock\n"); + return 1; + } + + if (!get_dev_sector_size(fd, devname, &sector_size)) { + free(super); + return 1; + } + + lseek(fd, 0, 0); + if (read(fd, super, sizeof(*super)) != sizeof(*super)) { + no_read: + if (devname) + pr_err("Cannot read partition table on %s\n", + devname); + free(super); + return 1; + } + + if (super->magic != MBR_SIGNATURE_MAGIC || + super->parts[0].part_type != MBR_GPT_PARTITION_TYPE) { + not_found: + if (devname) + pr_err("No partition table found on %s\n", + devname); + free(super); + return 1; + } + /* Set offset to second block (GPT header) */ + lseek(fd, sector_size, SEEK_SET); + /* Seem to have GPT, load the header */ + gpt_head = (struct GPT*)(super+1); + if (read(fd, gpt_head, sizeof(*gpt_head)) != sizeof(*gpt_head)) + goto no_read; + if (gpt_head->magic != GPT_SIGNATURE_MAGIC) + goto not_found;
+ if (__le32_to_cpu(gpt_head->part_cnt) >= 128) + goto not_found; + + to_read = __le32_to_cpu(gpt_head->part_cnt) * sizeof(struct GPT_part_entry); + to_read = ((to_read+511)/512) * 512; + /* Set offset to third block (GPT entries) */ + lseek(fd, sector_size*2, SEEK_SET); + if (read(fd, gpt_head+1, to_read) != to_read) + goto no_read; + + st->sb = super; + + if (st->ss == NULL) { + st->ss = &gpt; + st->minor_version = 0; + st->max_devs = 1; + st->info = NULL; + } + return 0; +} + +static int store_gpt(struct supertype *st, int fd) +{ + /* FIXME should I save the boot loader */ + /* need to write two copies! */ + /* FIXME allow for blocks != 512 bytes + *etc + */ + struct MBR *super = st->sb; + struct GPT *gpt; + int to_write; + + gpt = (struct GPT*)(super+1); + + to_write = __le32_to_cpu(gpt->part_cnt) * sizeof(struct GPT_part_entry); + to_write = ((to_write+511)/512) * 512; + + lseek(fd, 0, 0); + if (write(fd, st->sb, to_write) != to_write) + return 4; + + fsync(fd); + ioctl(fd, BLKRRPART, 0); + return 0; +} + +static void getinfo_gpt(struct supertype *st, struct mdinfo *info, char *map) +{ + struct GPT *gpt = st->sb + 512; + struct GPT_part_entry *gpe = st->sb + 1024; + unsigned int i; + + memset(&info->array, 0, sizeof(info->array)); + memset(&info->disk, 0, sizeof(info->disk)); + strcpy(info->text_version, "gpt"); + strcpy(info->name, "gpt"); + info->component_size = 0; + + for (i = 0; i < __le32_to_cpu(gpt->part_cnt); i++) { + unsigned long long last = + (unsigned long long)__le64_to_cpu(gpe[i].ending_lba); + if (last > info->component_size) + info->component_size = last; + } +} + +static struct supertype *match_metadata_desc(char *arg) +{ + struct supertype *st = xmalloc(sizeof(*st)); + + if (!st) + return st; + if (strcmp(arg, "gpt") != 0) { + free(st); + return NULL; + } + + st->ss = &gpt; + st->info = NULL; + st->minor_version = 0; + st->max_devs = 1; + st->sb = NULL; + return st; +} + +static int validate_geometry(struct supertype *st, int level, + int 
layout, int raiddisks, + int *chunk, unsigned long long size, + unsigned long long data_offset, + char *subdev, unsigned long long *freesize, + int consistency_policy, int verbose) +{ + pr_err("gpt metadata cannot be used this way\n"); + return 0; +} + +struct superswitch gpt = { + .examine_super = examine_gpt, + .validate_geometry = validate_geometry, + .match_metadata_desc = match_metadata_desc, + .load_super = load_gpt, + .store_super = store_gpt, + .getinfo_super = getinfo_gpt, + .free_super = free_gpt, + .name = "gpt", +}; diff --git a/super-intel.c b/super-intel.c new file mode 100644 index 0000000..d5fad10 --- /dev/null +++ b/super-intel.c @@ -0,0 +1,12894 @@ +/* + * mdadm - Intel(R) Matrix Storage Manager Support + * + * Copyright (C) 2002-2008 Intel Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#define HAVE_STDINT_H 1 +#include "mdadm.h" +#include "mdmon.h" +#include "sha1.h" +#include "platform-intel.h" +#include <values.h> +#include <scsi/sg.h> +#include <ctype.h> +#include <dirent.h> + +/* MPB == Metadata Parameter Block */ +#define MPB_SIGNATURE "Intel Raid ISM Cfg Sig. 
" +#define MPB_SIG_LEN (strlen(MPB_SIGNATURE)) +#define MPB_VERSION_RAID0 "1.0.00" +#define MPB_VERSION_RAID1 "1.1.00" +#define MPB_VERSION_MANY_VOLUMES_PER_ARRAY "1.2.00" +#define MPB_VERSION_3OR4_DISK_ARRAY "1.2.01" +#define MPB_VERSION_RAID5 "1.2.02" +#define MPB_VERSION_5OR6_DISK_ARRAY "1.2.04" +#define MPB_VERSION_CNG "1.2.06" +#define MPB_VERSION_ATTRIBS "1.3.00" +#define MAX_SIGNATURE_LENGTH 32 +#define MAX_RAID_SERIAL_LEN 16 + +/* supports RAID0 */ +#define MPB_ATTRIB_RAID0 __cpu_to_le32(0x00000001) +/* supports RAID1 */ +#define MPB_ATTRIB_RAID1 __cpu_to_le32(0x00000002) +/* supports RAID10 */ +#define MPB_ATTRIB_RAID10 __cpu_to_le32(0x00000004) +/* supports RAID1E */ +#define MPB_ATTRIB_RAID1E __cpu_to_le32(0x00000008) +/* supports RAID5 */ +#define MPB_ATTRIB_RAID5 __cpu_to_le32(0x00000010) +/* supports RAID CNG */ +#define MPB_ATTRIB_RAIDCNG __cpu_to_le32(0x00000020) +/* supports expanded stripe sizes of 256K, 512K and 1MB */ +#define MPB_ATTRIB_EXP_STRIPE_SIZE __cpu_to_le32(0x00000040) + +/* The OROM Support RST Caching of Volumes */ +#define MPB_ATTRIB_NVM __cpu_to_le32(0x02000000) +/* The OROM supports creating disks greater than 2TB */ +#define MPB_ATTRIB_2TB_DISK __cpu_to_le32(0x04000000) +/* The OROM supports Bad Block Management */ +#define MPB_ATTRIB_BBM __cpu_to_le32(0x08000000) + +/* THe OROM Supports NVM Caching of Volumes */ +#define MPB_ATTRIB_NEVER_USE2 __cpu_to_le32(0x10000000) +/* The OROM supports creating volumes greater than 2TB */ +#define MPB_ATTRIB_2TB __cpu_to_le32(0x20000000) +/* originally for PMP, now it's wasted b/c. Never use this bit! 
*/ +#define MPB_ATTRIB_NEVER_USE __cpu_to_le32(0x40000000) +/* Verify MPB contents against checksum after reading MPB */ +#define MPB_ATTRIB_CHECKSUM_VERIFY __cpu_to_le32(0x80000000) + +/* Define all supported attributes that have to be accepted by mdadm + */ +#define MPB_ATTRIB_SUPPORTED (MPB_ATTRIB_CHECKSUM_VERIFY | \ + MPB_ATTRIB_2TB | \ + MPB_ATTRIB_2TB_DISK | \ + MPB_ATTRIB_RAID0 | \ + MPB_ATTRIB_RAID1 | \ + MPB_ATTRIB_RAID10 | \ + MPB_ATTRIB_RAID5 | \ + MPB_ATTRIB_EXP_STRIPE_SIZE | \ + MPB_ATTRIB_BBM) + +/* Define attributes that are unused but not harmful */ +#define MPB_ATTRIB_IGNORED (MPB_ATTRIB_NEVER_USE) + +#define MPB_SECTOR_CNT 2210 +#define IMSM_RESERVED_SECTORS 8192 +#define NUM_BLOCKS_DIRTY_STRIPE_REGION 2048 +#define SECT_PER_MB_SHIFT 11 +#define MAX_SECTOR_SIZE 4096 +#define MULTIPLE_PPL_AREA_SIZE_IMSM (1024 * 1024) /* Size of the whole + * multiple PPL area + */ + +/* + * Internal Write-intent bitmap is stored in the same area as the PPL. + * Both features are mutually exclusive, so it is not an issue. + * The first 8KiB of the area are reserved and shall not be used. + */ +#define IMSM_BITMAP_AREA_RESERVED_SIZE 8192 + +#define IMSM_BITMAP_HEADER_OFFSET (IMSM_BITMAP_AREA_RESERVED_SIZE) +#define IMSM_BITMAP_HEADER_SIZE MAX_SECTOR_SIZE + +#define IMSM_BITMAP_START_OFFSET (IMSM_BITMAP_HEADER_OFFSET + IMSM_BITMAP_HEADER_SIZE) +#define IMSM_BITMAP_AREA_SIZE (MULTIPLE_PPL_AREA_SIZE_IMSM - IMSM_BITMAP_START_OFFSET) +#define IMSM_BITMAP_AND_HEADER_SIZE (IMSM_BITMAP_AREA_SIZE + IMSM_BITMAP_HEADER_SIZE) + +#define IMSM_DEFAULT_BITMAP_CHUNKSIZE (64 * 1024 * 1024) +#define IMSM_DEFAULT_BITMAP_DAEMON_SLEEP 5 + +/* + * This macro lets us ensure that no-one accidentally + * changes the size of a struct + */ +#define ASSERT_SIZE(_struct, size) \ +static inline void __assert_size_##_struct(void) \ +{ \ + switch (0) { \ + case 0: break; \ + case (sizeof(struct _struct) == size): break; \ + } \ +} + +/* Disk configuration info.
*/ +#define IMSM_MAX_DEVICES 255 +struct imsm_disk { + __u8 serial[MAX_RAID_SERIAL_LEN];/* 0xD8 - 0xE7 ascii serial number */ + __u32 total_blocks_lo; /* 0xE8 - 0xEB total blocks lo */ + __u32 scsi_id; /* 0xEC - 0xEF scsi ID */ +#define SPARE_DISK __cpu_to_le32(0x01) /* Spare */ +#define CONFIGURED_DISK __cpu_to_le32(0x02) /* Member of some RaidDev */ +#define FAILED_DISK __cpu_to_le32(0x04) /* Permanent failure */ +#define JOURNAL_DISK __cpu_to_le32(0x2000000) /* Device marked as Journaling Drive */ + __u32 status; /* 0xF0 - 0xF3 */ + __u32 owner_cfg_num; /* which config 0,1,2... owns this disk */ + __u32 total_blocks_hi; /* 0xF4 - 0xF5 total blocks hi */ +#define IMSM_DISK_FILLERS 3 + __u32 filler[IMSM_DISK_FILLERS]; /* 0xF5 - 0x107 MPB_DISK_FILLERS for future expansion */ +}; +ASSERT_SIZE(imsm_disk, 48) + +/* map selector for map managment + */ +#define MAP_0 0 +#define MAP_1 1 +#define MAP_X -1 + +/* RAID map configuration infos. */ +struct imsm_map { + __u32 pba_of_lba0_lo; /* start address of partition */ + __u32 blocks_per_member_lo;/* blocks per member */ + __u32 num_data_stripes_lo; /* number of data stripes */ + __u16 blocks_per_strip; + __u8 map_state; /* Normal, Uninitialized, Degraded, Failed */ +#define IMSM_T_STATE_NORMAL 0 +#define IMSM_T_STATE_UNINITIALIZED 1 +#define IMSM_T_STATE_DEGRADED 2 +#define IMSM_T_STATE_FAILED 3 + __u8 raid_level; +#define IMSM_T_RAID0 0 +#define IMSM_T_RAID1 1 +#define IMSM_T_RAID5 5 /* since metadata version 1.2.02 ? 
*/ + __u8 num_members; /* number of member disks */ + __u8 num_domains; /* number of parity domains */ + __u8 failed_disk_num; /* valid only when state is degraded */ + __u8 ddf; + __u32 pba_of_lba0_hi; + __u32 blocks_per_member_hi; + __u32 num_data_stripes_hi; + __u32 filler[4]; /* expansion area */ +#define IMSM_ORD_REBUILD (1 << 24) + __u32 disk_ord_tbl[1]; /* disk_ord_tbl[num_members], + * top byte contains some flags + */ +}; +ASSERT_SIZE(imsm_map, 52) + +struct imsm_vol { + __u32 curr_migr_unit_lo; + __u32 checkpoint_id; /* id to access curr_migr_unit */ + __u8 migr_state; /* Normal or Migrating */ +#define MIGR_INIT 0 +#define MIGR_REBUILD 1 +#define MIGR_VERIFY 2 /* analagous to echo check > sync_action */ +#define MIGR_GEN_MIGR 3 +#define MIGR_STATE_CHANGE 4 +#define MIGR_REPAIR 5 + __u8 migr_type; /* Initializing, Rebuilding, ... */ +#define RAIDVOL_CLEAN 0 +#define RAIDVOL_DIRTY 1 +#define RAIDVOL_DSRECORD_VALID 2 + __u8 dirty; + __u8 fs_state; /* fast-sync state for CnG (0xff == disabled) */ + __u16 verify_errors; /* number of mismatches */ + __u16 bad_blocks; /* number of bad blocks during verify */ + __u32 curr_migr_unit_hi; + __u32 filler[3]; + struct imsm_map map[1]; + /* here comes another one if migr_state */ +}; +ASSERT_SIZE(imsm_vol, 84) + +struct imsm_dev { + __u8 volume[MAX_RAID_SERIAL_LEN]; + __u32 size_low; + __u32 size_high; +#define DEV_BOOTABLE __cpu_to_le32(0x01) +#define DEV_BOOT_DEVICE __cpu_to_le32(0x02) +#define DEV_READ_COALESCING __cpu_to_le32(0x04) +#define DEV_WRITE_COALESCING __cpu_to_le32(0x08) +#define DEV_LAST_SHUTDOWN_DIRTY __cpu_to_le32(0x10) +#define DEV_HIDDEN_AT_BOOT __cpu_to_le32(0x20) +#define DEV_CURRENTLY_HIDDEN __cpu_to_le32(0x40) +#define DEV_VERIFY_AND_FIX __cpu_to_le32(0x80) +#define DEV_MAP_STATE_UNINIT __cpu_to_le32(0x100) +#define DEV_NO_AUTO_RECOVERY __cpu_to_le32(0x200) +#define DEV_CLONE_N_GO __cpu_to_le32(0x400) +#define DEV_CLONE_MAN_SYNC __cpu_to_le32(0x800) +#define DEV_CNG_MASTER_DISK_NUM 
__cpu_to_le32(0x1000) + __u32 status; /* Persistent RaidDev status */ + __u32 reserved_blocks; /* Reserved blocks at beginning of volume */ + __u8 migr_priority; + __u8 num_sub_vols; + __u8 tid; + __u8 cng_master_disk; + __u16 cache_policy; + __u8 cng_state; + __u8 cng_sub_state; + __u16 my_vol_raid_dev_num; /* Used in Unique volume Id for this RaidDev */ + + /* NVM_EN */ + __u8 nv_cache_mode; + __u8 nv_cache_flags; + + /* Unique Volume Id of the NvCache Volume associated with this volume */ + __u32 nvc_vol_orig_family_num; + __u16 nvc_vol_raid_dev_num; + +#define RWH_OFF 0 +#define RWH_DISTRIBUTED 1 +#define RWH_JOURNALING_DRIVE 2 +#define RWH_MULTIPLE_DISTRIBUTED 3 +#define RWH_MULTIPLE_PPLS_JOURNALING_DRIVE 4 +#define RWH_MULTIPLE_OFF 5 +#define RWH_BITMAP 6 + __u8 rwh_policy; /* Raid Write Hole Policy */ + __u8 jd_serial[MAX_RAID_SERIAL_LEN]; /* Journal Drive serial number */ + __u8 filler1; + +#define IMSM_DEV_FILLERS 3 + __u32 filler[IMSM_DEV_FILLERS]; + struct imsm_vol vol; +}; +ASSERT_SIZE(imsm_dev, 164) + +struct imsm_super { + __u8 sig[MAX_SIGNATURE_LENGTH]; /* 0x00 - 0x1F */ + __u32 check_sum; /* 0x20 - 0x23 MPB Checksum */ + __u32 mpb_size; /* 0x24 - 0x27 Size of MPB */ + __u32 family_num; /* 0x28 - 0x2B Checksum from first time this config was written */ + __u32 generation_num; /* 0x2C - 0x2F Incremented each time this array's MPB is written */ + __u32 error_log_size; /* 0x30 - 0x33 in bytes */ + __u32 attributes; /* 0x34 - 0x37 */ + __u8 num_disks; /* 0x38 Number of configured disks */ + __u8 num_raid_devs; /* 0x39 Number of configured volumes */ + __u8 error_log_pos; /* 0x3A */ + __u8 fill[1]; /* 0x3B */ + __u32 cache_size; /* 0x3c - 0x40 in mb */ + __u32 orig_family_num; /* 0x40 - 0x43 original family num */ + __u32 pwr_cycle_count; /* 0x44 - 0x47 simulated power cycle count for array */ + __u32 bbm_log_size; /* 0x48 - 0x4B - size of bad Block Mgmt Log in bytes */ + __u16 num_raid_devs_created; /* 0x4C - 0x4D Used for generating unique + * volume 
IDs for raid_dev created in this array + * (starts at 1) + */ + __u16 filler1; /* 0x4E - 0x4F */ + __u64 creation_time; /* 0x50 - 0x57 Array creation time */ +#define IMSM_FILLERS 32 + __u32 filler[IMSM_FILLERS]; /* 0x58 - 0xD7 RAID_MPB_FILLERS */ + struct imsm_disk disk[1]; /* 0xD8 diskTbl[numDisks] */ + /* here comes imsm_dev[num_raid_devs] */ + /* here comes BBM logs */ +}; +ASSERT_SIZE(imsm_super, 264) + +#define BBM_LOG_MAX_ENTRIES 254 +#define BBM_LOG_MAX_LBA_ENTRY_VAL 256 /* Represents 256 LBAs */ +#define BBM_LOG_SIGNATURE 0xabadb10c + +struct bbm_log_block_addr { + __u16 w1; + __u32 dw1; +} __attribute__ ((__packed__)); + +struct bbm_log_entry { + __u8 marked_count; /* Number of blocks marked - 1 */ + __u8 disk_ordinal; /* Disk entry within the imsm_super */ + struct bbm_log_block_addr defective_block_start; +} __attribute__ ((__packed__)); + +struct bbm_log { + __u32 signature; /* 0xABADB10C */ + __u32 entry_count; + struct bbm_log_entry marked_block_entries[BBM_LOG_MAX_ENTRIES]; +}; +ASSERT_SIZE(bbm_log, 2040) + +static char *map_state_str[] = { "normal", "uninitialized", "degraded", "failed" }; + +#define BLOCKS_PER_KB (1024/512) + +#define RAID_DISK_RESERVED_BLOCKS_IMSM_HI 2209 + +#define GEN_MIGR_AREA_SIZE 2048 /* General Migration Copy Area size in blocks */ + +#define MIGR_REC_BUF_SECTORS 1 /* size of migr_record i/o buffer in sectors */ +#define MIGR_REC_SECTOR_POSITION 1 /* migr_record position offset on disk, + * MIGR_REC_BUF_SECTORS <= MIGR_REC_SECTOR_POS + */ + +#define UNIT_SRC_NORMAL 0 /* Source data for curr_migr_unit must + * be recovered using srcMap */ +#define UNIT_SRC_IN_CP_AREA 1 /* Source data for curr_migr_unit has + * already been migrated and must + * be recovered from checkpoint area */ + +#define PPL_ENTRY_SPACE (128 * 1024) /* Size of single PPL, without the header */ + +struct migr_record { + __u32 rec_status; /* Status used to determine how to restart + * migration in case it aborts + * in some fashion */ + __u32 
curr_migr_unit_lo; /* 0..numMigrUnits-1 */ + __u32 family_num; /* Family number of MPB + * containing the RaidDev + * that is migrating */ + __u32 ascending_migr; /* True if migrating in increasing + * order of lbas */ + __u32 blocks_per_unit; /* Num disk blocks per unit of operation */ + __u32 dest_depth_per_unit; /* Num member blocks each destMap + * member disk + * advances per unit-of-operation */ + __u32 ckpt_area_pba_lo; /* Pba of first block of ckpt copy area */ + __u32 dest_1st_member_lba_lo; /* First member lba on first + * stripe of destination */ + __u32 num_migr_units_lo; /* Total num migration units-of-op */ + __u32 post_migr_vol_cap; /* Size of volume after + * migration completes */ + __u32 post_migr_vol_cap_hi; /* Expansion space for LBA64 */ + __u32 ckpt_read_disk_num; /* Which member disk in destSubMap[0] the + * migration ckpt record was read from + * (for recovered migrations) */ + __u32 curr_migr_unit_hi; /* 0..numMigrUnits-1 high order 32 bits */ + __u32 ckpt_area_pba_hi; /* Pba of first block of ckpt copy area + * high order 32 bits */ + __u32 dest_1st_member_lba_hi; /* First member lba on first stripe of + * destination - high order 32 bits */ + __u32 num_migr_units_hi; /* Total num migration units-of-op + * high order 32 bits */ + __u32 filler[16]; +}; +ASSERT_SIZE(migr_record, 128) + +struct md_list { + /* usage marker: + * 1: load metadata + * 2: metadata does not match + * 4: already checked + */ + int used; + char *devname; + int found; + int container; + dev_t st_rdev; + struct md_list *next; +}; + +#define pr_vrb(fmt, arg...) 
(void) (verbose && pr_err(fmt, ##arg)) + +static __u8 migr_type(struct imsm_dev *dev) +{ + if (dev->vol.migr_type == MIGR_VERIFY && + dev->status & DEV_VERIFY_AND_FIX) + return MIGR_REPAIR; + else + return dev->vol.migr_type; +} + +static void set_migr_type(struct imsm_dev *dev, __u8 migr_type) +{ + /* for compatibility with older oroms convert MIGR_REPAIR, into + * MIGR_VERIFY w/ DEV_VERIFY_AND_FIX status + */ + if (migr_type == MIGR_REPAIR) { + dev->vol.migr_type = MIGR_VERIFY; + dev->status |= DEV_VERIFY_AND_FIX; + } else { + dev->vol.migr_type = migr_type; + dev->status &= ~DEV_VERIFY_AND_FIX; + } +} + +static unsigned int sector_count(__u32 bytes, unsigned int sector_size) +{ + return ROUND_UP(bytes, sector_size) / sector_size; +} + +static unsigned int mpb_sectors(struct imsm_super *mpb, + unsigned int sector_size) +{ + return sector_count(__le32_to_cpu(mpb->mpb_size), sector_size); +} + +struct intel_dev { + struct imsm_dev *dev; + struct intel_dev *next; + unsigned index; +}; + +struct intel_hba { + enum sys_dev_type type; + char *path; + char *pci_id; + struct intel_hba *next; +}; + +enum action { + DISK_REMOVE = 1, + DISK_ADD +}; +/* internal representation of IMSM metadata */ +struct intel_super { + union { + void *buf; /* O_DIRECT buffer for reading/writing metadata */ + struct imsm_super *anchor; /* immovable parameters */ + }; + union { + void *migr_rec_buf; /* buffer for I/O operations */ + struct migr_record *migr_rec; /* migration record */ + }; + int clean_migration_record_by_mdmon; /* when reshape is switched to next + array, it indicates that mdmon is allowed to clean migration + record */ + size_t len; /* size of the 'buf' allocation */ + size_t extra_space; /* extra space in 'buf' that is not used yet */ + void *next_buf; /* for realloc'ing buf from the manager */ + size_t next_len; + int updates_pending; /* count of pending updates for mdmon */ + int current_vol; /* index of raid device undergoing creation */ + unsigned long long 
create_offset; /* common start for 'current_vol' */ + __u32 random; /* random data for seeding new family numbers */ + struct intel_dev *devlist; + unsigned int sector_size; /* sector size of used member drives */ + struct dl { + struct dl *next; + int index; + __u8 serial[MAX_RAID_SERIAL_LEN]; + int major, minor; + char *devname; + struct imsm_disk disk; + int fd; + int extent_cnt; + struct extent *e; /* for determining freespace @ create */ + int raiddisk; /* slot to fill in autolayout */ + enum action action; + } *disks, *current_disk; + struct dl *disk_mgmt_list; /* list of disks to add/remove while mdmon + active */ + struct dl *missing; /* disks removed while we weren't looking */ + struct bbm_log *bbm_log; + struct intel_hba *hba; /* device path of the raid controller for this metadata */ + const struct imsm_orom *orom; /* platform firmware support */ + struct intel_super *next; /* (temp) list for disambiguating family_num */ + struct md_bb bb; /* memory for get_bad_blocks call */ +}; + +struct intel_disk { + struct imsm_disk disk; + #define IMSM_UNKNOWN_OWNER (-1) + int owner; + struct intel_disk *next; +}; + +struct extent { + unsigned long long start, size; +}; + +/* definitions of reshape process types */ +enum imsm_reshape_type { + CH_TAKEOVER, + CH_MIGRATION, + CH_ARRAY_SIZE, +}; + +/* definition of messages passed to imsm_process_update */ +enum imsm_update_type { + update_activate_spare, + update_create_array, + update_kill_array, + update_rename_array, + update_add_remove_disk, + update_reshape_container_disks, + update_reshape_migration, + update_takeover, + update_general_migration_checkpoint, + update_size_change, + update_prealloc_badblocks_mem, + update_rwh_policy, +}; + +struct imsm_update_activate_spare { + enum imsm_update_type type; + struct dl *dl; + int slot; + int array; + struct imsm_update_activate_spare *next; +}; + +struct geo_params { + char devnm[32]; + char *dev_name; + unsigned long long size; + int level; + int layout; + int 
chunksize; + int raid_disks; +}; + +enum takeover_direction { + R10_TO_R0, + R0_TO_R10 +}; +struct imsm_update_takeover { + enum imsm_update_type type; + int subarray; + enum takeover_direction direction; +}; + +struct imsm_update_reshape { + enum imsm_update_type type; + int old_raid_disks; + int new_raid_disks; + + int new_disks[1]; /* new_raid_disks - old_raid_disks makedev number */ +}; + +struct imsm_update_reshape_migration { + enum imsm_update_type type; + int old_raid_disks; + int new_raid_disks; + /* fields for array migration changes + */ + int subdev; + int new_level; + int new_layout; + int new_chunksize; + + int new_disks[1]; /* new_raid_disks - old_raid_disks makedev number */ +}; + +struct imsm_update_size_change { + enum imsm_update_type type; + int subdev; + long long new_size; +}; + +struct imsm_update_general_migration_checkpoint { + enum imsm_update_type type; + __u64 curr_migr_unit; +}; + +struct disk_info { + __u8 serial[MAX_RAID_SERIAL_LEN]; +}; + +struct imsm_update_create_array { + enum imsm_update_type type; + int dev_idx; + struct imsm_dev dev; +}; + +struct imsm_update_kill_array { + enum imsm_update_type type; + int dev_idx; +}; + +struct imsm_update_rename_array { + enum imsm_update_type type; + __u8 name[MAX_RAID_SERIAL_LEN]; + int dev_idx; +}; + +struct imsm_update_add_remove_disk { + enum imsm_update_type type; +}; + +struct imsm_update_prealloc_bb_mem { + enum imsm_update_type type; +}; + +struct imsm_update_rwh_policy { + enum imsm_update_type type; + int new_policy; + int dev_idx; +}; + +static const char *_sys_dev_type[] = { + [SYS_DEV_UNKNOWN] = "Unknown", + [SYS_DEV_SAS] = "SAS", + [SYS_DEV_SATA] = "SATA", + [SYS_DEV_NVME] = "NVMe", + [SYS_DEV_VMD] = "VMD" +}; + +const char *get_sys_dev_type(enum sys_dev_type type) +{ + if (type >= SYS_DEV_MAX) + type = SYS_DEV_UNKNOWN; + + return _sys_dev_type[type]; +} + +static struct intel_hba * alloc_intel_hba(struct sys_dev *device) +{ + struct intel_hba *result = 
xmalloc(sizeof(*result)); + + result->type = device->type; + result->path = xstrdup(device->path); + result->next = NULL; + if (result->path && (result->pci_id = strrchr(result->path, '/')) != NULL) + result->pci_id++; + + return result; +} + +static struct intel_hba * find_intel_hba(struct intel_hba *hba, struct sys_dev *device) +{ + struct intel_hba *result; + + for (result = hba; result; result = result->next) { + if (result->type == device->type && strcmp(result->path, device->path) == 0) + break; + } + return result; +} + +static int attach_hba_to_super(struct intel_super *super, struct sys_dev *device) +{ + struct intel_hba *hba; + + /* check if disk attached to Intel HBA */ + hba = find_intel_hba(super->hba, device); + if (hba != NULL) + return 1; + /* Check if HBA is already attached to super */ + if (super->hba == NULL) { + super->hba = alloc_intel_hba(device); + return 1; + } + + hba = super->hba; + /* Intel metadata allows for all disks attached to the same type HBA. + * Do not support HBA types mixing + */ + if (device->type != hba->type) + return 2; + + /* Multiple same type HBAs can be used if they share the same OROM */ + const struct imsm_orom *device_orom = get_orom_by_device_id(device->dev_id); + + if (device_orom != super->orom) + return 2; + + while (hba->next) + hba = hba->next; + + hba->next = alloc_intel_hba(device); + return 1; +} + +/* + * Find the entry of the find_intel_devices() list that the disk is + * attached to. The disk path is derived from fd when fd is valid, + * otherwise devname is used as the path. + * Returns the matching entry, or NULL when there is no device list, + * no usable path, or no match. + */ +static struct sys_dev* find_disk_attached_hba(int fd, const char *devname) +{ + struct sys_dev *list, *elem; + char *disk_path; + + if ((list = find_intel_devices()) == NULL) + return NULL; + + if (!is_fd_valid(fd)) + disk_path = (char *) devname; + else + disk_path = diskfd_to_devpath(fd, 1, NULL); + + if (!disk_path) + return NULL; + + /* stop at the first match; elem is NULL when the list is exhausted */ + for (elem = list; elem; elem = elem->next) + if (path_attached_to_hba(disk_path, elem->path)) + break; + + /* single exit so a path obtained from fd is released on a match too */ + if (disk_path != devname) + free(disk_path); + + return elem; +} + +static int find_intel_hba_capability(int fd, struct intel_super *super, + char *devname); + +static
struct supertype *match_metadata_desc_imsm(char *arg) +{ + struct supertype *st; + + if (strcmp(arg, "imsm") != 0 && + strcmp(arg, "default") != 0 + ) + return NULL; + + st = xcalloc(1, sizeof(*st)); + st->ss = &super_imsm; + st->max_devs = IMSM_MAX_DEVICES; + st->minor_version = 0; + st->sb = NULL; + return st; +} + +static __u8 *get_imsm_version(struct imsm_super *mpb) +{ + return &mpb->sig[MPB_SIG_LEN]; +} + +/* retrieve a disk directly from the anchor when the anchor is known to be + * up-to-date, currently only at load time + */ +static struct imsm_disk *__get_imsm_disk(struct imsm_super *mpb, __u8 index) +{ + if (index >= mpb->num_disks) + return NULL; + return &mpb->disk[index]; +} + +/* retrieve the disk description based on a index of the disk + * in the sub-array + */ +static struct dl *get_imsm_dl_disk(struct intel_super *super, __u8 index) +{ + struct dl *d; + + for (d = super->disks; d; d = d->next) + if (d->index == index) + return d; + + return NULL; +} +/* retrieve a disk from the parsed metadata */ +static struct imsm_disk *get_imsm_disk(struct intel_super *super, __u8 index) +{ + struct dl *dl; + + dl = get_imsm_dl_disk(super, index); + if (dl) + return &dl->disk; + + return NULL; +} + +/* generate a checksum directly from the anchor when the anchor is known to be + * up-to-date, currently only at load or write_super after coalescing + */ +static __u32 __gen_imsm_checksum(struct imsm_super *mpb) +{ + __u32 end = mpb->mpb_size / sizeof(end); + __u32 *p = (__u32 *) mpb; + __u32 sum = 0; + + while (end--) { + sum += __le32_to_cpu(*p); + p++; + } + + return sum - __le32_to_cpu(mpb->check_sum); +} + +static size_t sizeof_imsm_map(struct imsm_map *map) +{ + return sizeof(struct imsm_map) + sizeof(__u32) * (map->num_members - 1); +} + +struct imsm_map *get_imsm_map(struct imsm_dev *dev, int second_map) +{ + /* A device can have 2 maps if it is in the middle of a migration. 
+ * If second_map is: + * MAP_0 - we return the first map + * MAP_1 - we return the second map if it exists, else NULL + * MAP_X - we return the second map if it exists, else the first + */ + struct imsm_map *map = &dev->vol.map[0]; + struct imsm_map *map2 = NULL; + + if (dev->vol.migr_state) + map2 = (void *)map + sizeof_imsm_map(map); + + switch (second_map) { + case MAP_0: + break; + case MAP_1: + map = map2; + break; + case MAP_X: + if (map2) + map = map2; + break; + default: + map = NULL; + } + return map; + +} + +/* return the size of the device. + * migr_state increases the returned size if map[0] were to be duplicated + */ +static size_t sizeof_imsm_dev(struct imsm_dev *dev, int migr_state) +{ + size_t size = sizeof(*dev) - sizeof(struct imsm_map) + + sizeof_imsm_map(get_imsm_map(dev, MAP_0)); + + /* migrating means an additional map */ + if (dev->vol.migr_state) + size += sizeof_imsm_map(get_imsm_map(dev, MAP_1)); + else if (migr_state) + size += sizeof_imsm_map(get_imsm_map(dev, MAP_0)); + + return size; +} + +/* retrieve disk serial number list from a metadata update */ +static struct disk_info *get_disk_info(struct imsm_update_create_array *update) +{ + void *u = update; + struct disk_info *inf; + + inf = u + sizeof(*update) - sizeof(struct imsm_dev) + + sizeof_imsm_dev(&update->dev, 0); + + return inf; +} + +static struct imsm_dev *__get_imsm_dev(struct imsm_super *mpb, __u8 index) +{ + int offset; + int i; + void *_mpb = mpb; + + if (index >= mpb->num_raid_devs) + return NULL; + + /* devices start after all disks */ + offset = ((void *) &mpb->disk[mpb->num_disks]) - _mpb; + + for (i = 0; i <= index; i++) + if (i == index) + return _mpb + offset; + else + offset += sizeof_imsm_dev(_mpb + offset, 0); + + return NULL; +} + +static struct imsm_dev *get_imsm_dev(struct intel_super *super, __u8 index) +{ + struct intel_dev *dv; + + if (index >= super->anchor->num_raid_devs) + return NULL; + for (dv = super->devlist; dv; dv = dv->next) + if (dv->index == 
index) + return dv->dev; + return NULL; +} + +static inline unsigned long long __le48_to_cpu(const struct bbm_log_block_addr + *addr) +{ + return ((((__u64)__le32_to_cpu(addr->dw1)) << 16) | + __le16_to_cpu(addr->w1)); +} + +static inline struct bbm_log_block_addr __cpu_to_le48(unsigned long long sec) +{ + struct bbm_log_block_addr addr; + + addr.w1 = __cpu_to_le16((__u16)(sec & 0xffff)); + addr.dw1 = __cpu_to_le32((__u32)(sec >> 16) & 0xffffffff); + return addr; +} + +/* get size of the bbm log */ +static __u32 get_imsm_bbm_log_size(struct bbm_log *log) +{ + if (!log || log->entry_count == 0) + return 0; + + return sizeof(log->signature) + + sizeof(log->entry_count) + + log->entry_count * sizeof(struct bbm_log_entry); +} + +/* check if bad block is not partially stored in bbm log */ +static int is_stored_in_bbm(struct bbm_log *log, const __u8 idx, const unsigned + long long sector, const int length, __u32 *pos) +{ + __u32 i; + + for (i = *pos; i < log->entry_count; i++) { + struct bbm_log_entry *entry = &log->marked_block_entries[i]; + unsigned long long bb_start; + unsigned long long bb_end; + + bb_start = __le48_to_cpu(&entry->defective_block_start); + bb_end = bb_start + (entry->marked_count + 1); + + if ((entry->disk_ordinal == idx) && (bb_start >= sector) && + (bb_end <= sector + length)) { + *pos = i; + return 1; + } + } + return 0; +} + +/* record new bad block in bbm log */ +static int record_new_badblock(struct bbm_log *log, const __u8 idx, unsigned + long long sector, int length) +{ + int new_bb = 0; + __u32 pos = 0; + struct bbm_log_entry *entry = NULL; + + while (is_stored_in_bbm(log, idx, sector, length, &pos)) { + struct bbm_log_entry *e = &log->marked_block_entries[pos]; + + if ((e->marked_count + 1 == BBM_LOG_MAX_LBA_ENTRY_VAL) && + (__le48_to_cpu(&e->defective_block_start) == sector)) { + sector += BBM_LOG_MAX_LBA_ENTRY_VAL; + length -= BBM_LOG_MAX_LBA_ENTRY_VAL; + pos = pos + 1; + continue; + } + entry = e; + break; + } + + if (entry) { + int cnt 
= (length <= BBM_LOG_MAX_LBA_ENTRY_VAL) ? length : + BBM_LOG_MAX_LBA_ENTRY_VAL; + entry->defective_block_start = __cpu_to_le48(sector); + entry->marked_count = cnt - 1; + if (cnt == length) + return 1; + sector += cnt; + length -= cnt; + } + + new_bb = ROUND_UP(length, BBM_LOG_MAX_LBA_ENTRY_VAL) / + BBM_LOG_MAX_LBA_ENTRY_VAL; + if (log->entry_count + new_bb > BBM_LOG_MAX_ENTRIES) + return 0; + + while (length > 0) { + int cnt = (length <= BBM_LOG_MAX_LBA_ENTRY_VAL) ? length : + BBM_LOG_MAX_LBA_ENTRY_VAL; + struct bbm_log_entry *entry = + &log->marked_block_entries[log->entry_count]; + + entry->defective_block_start = __cpu_to_le48(sector); + entry->marked_count = cnt - 1; + entry->disk_ordinal = idx; + + sector += cnt; + length -= cnt; + + log->entry_count++; + } + + return new_bb; +} + +/* clear all bad blocks for given disk */ +static void clear_disk_badblocks(struct bbm_log *log, const __u8 idx) +{ + __u32 i = 0; + + while (i < log->entry_count) { + struct bbm_log_entry *entries = log->marked_block_entries; + + if (entries[i].disk_ordinal == idx) { + if (i < log->entry_count - 1) + entries[i] = entries[log->entry_count - 1]; + log->entry_count--; + } else { + i++; + } + } +} + +/* clear given bad block */ +static int clear_badblock(struct bbm_log *log, const __u8 idx, const unsigned + long long sector, const int length) { + __u32 i = 0; + + while (i < log->entry_count) { + struct bbm_log_entry *entries = log->marked_block_entries; + + if ((entries[i].disk_ordinal == idx) && + (__le48_to_cpu(&entries[i].defective_block_start) == + sector) && (entries[i].marked_count + 1 == length)) { + if (i < log->entry_count - 1) + entries[i] = entries[log->entry_count - 1]; + log->entry_count--; + break; + } + i++; + } + + return 1; +} + +/* allocate and load BBM log from metadata */ +static int load_bbm_log(struct intel_super *super) +{ + struct imsm_super *mpb = super->anchor; + __u32 bbm_log_size = __le32_to_cpu(mpb->bbm_log_size); + + super->bbm_log = xcalloc(1, 
sizeof(struct bbm_log)); + if (!super->bbm_log) + return 1; + + if (bbm_log_size) { + struct bbm_log *log = (void *)mpb + + __le32_to_cpu(mpb->mpb_size) - bbm_log_size; + + __u32 entry_count; + + if (bbm_log_size < sizeof(log->signature) + + sizeof(log->entry_count)) + return 2; + + entry_count = __le32_to_cpu(log->entry_count); + if ((__le32_to_cpu(log->signature) != BBM_LOG_SIGNATURE) || + (entry_count > BBM_LOG_MAX_ENTRIES)) + return 3; + + if (bbm_log_size != + sizeof(log->signature) + sizeof(log->entry_count) + + entry_count * sizeof(struct bbm_log_entry)) + return 4; + + memcpy(super->bbm_log, log, bbm_log_size); + } else { + super->bbm_log->signature = __cpu_to_le32(BBM_LOG_SIGNATURE); + super->bbm_log->entry_count = 0; + } + + return 0; +} + +/* checks if bad block is within volume boundaries */ +static int is_bad_block_in_volume(const struct bbm_log_entry *entry, + const unsigned long long start_sector, + const unsigned long long size) +{ + unsigned long long bb_start; + unsigned long long bb_end; + + bb_start = __le48_to_cpu(&entry->defective_block_start); + bb_end = bb_start + (entry->marked_count + 1); + + if (((bb_start >= start_sector) && (bb_start < start_sector + size)) || + ((bb_end >= start_sector) && (bb_end <= start_sector + size))) + return 1; + + return 0; +} + +/* get list of bad blocks on a drive for a volume */ +static void get_volume_badblocks(const struct bbm_log *log, const __u8 idx, + const unsigned long long start_sector, + const unsigned long long size, + struct md_bb *bbs) +{ + __u32 count = 0; + __u32 i; + + for (i = 0; i < log->entry_count; i++) { + const struct bbm_log_entry *ent = + &log->marked_block_entries[i]; + struct md_bb_entry *bb; + + if ((ent->disk_ordinal == idx) && + is_bad_block_in_volume(ent, start_sector, size)) { + + if (!bbs->entries) { + bbs->entries = xmalloc(BBM_LOG_MAX_ENTRIES * + sizeof(*bb)); + if (!bbs->entries) + break; + } + + bb = &bbs->entries[count++]; + bb->sector = 
__le48_to_cpu(&ent->defective_block_start); + bb->length = ent->marked_count + 1; + } + } + bbs->count = count; +} + +/* + * for second_map: + * == MAP_0 get first map + * == MAP_1 get second map + * == MAP_X than get map according to the current migr_state + */ +static __u32 get_imsm_ord_tbl_ent(struct imsm_dev *dev, + int slot, + int second_map) +{ + struct imsm_map *map; + + map = get_imsm_map(dev, second_map); + + /* top byte identifies disk under rebuild */ + return __le32_to_cpu(map->disk_ord_tbl[slot]); +} + +#define ord_to_idx(ord) (((ord) << 8) >> 8) +static __u32 get_imsm_disk_idx(struct imsm_dev *dev, int slot, int second_map) +{ + __u32 ord = get_imsm_ord_tbl_ent(dev, slot, second_map); + + return ord_to_idx(ord); +} + +static void set_imsm_ord_tbl_ent(struct imsm_map *map, int slot, __u32 ord) +{ + map->disk_ord_tbl[slot] = __cpu_to_le32(ord); +} + +static int get_imsm_disk_slot(struct imsm_map *map, unsigned idx) +{ + int slot; + __u32 ord; + + for (slot = 0; slot < map->num_members; slot++) { + ord = __le32_to_cpu(map->disk_ord_tbl[slot]); + if (ord_to_idx(ord) == idx) + return slot; + } + + return -1; +} + +static int get_imsm_raid_level(struct imsm_map *map) +{ + if (map->raid_level == 1) { + if (map->num_members == 2) + return 1; + else + return 10; + } + + return map->raid_level; +} + +static int cmp_extent(const void *av, const void *bv) +{ + const struct extent *a = av; + const struct extent *b = bv; + if (a->start < b->start) + return -1; + if (a->start > b->start) + return 1; + return 0; +} + +static int count_memberships(struct dl *dl, struct intel_super *super) +{ + int memberships = 0; + int i; + + for (i = 0; i < super->anchor->num_raid_devs; i++) { + struct imsm_dev *dev = get_imsm_dev(super, i); + struct imsm_map *map = get_imsm_map(dev, MAP_0); + + if (get_imsm_disk_slot(map, dl->index) >= 0) + memberships++; + } + + return memberships; +} + +static __u32 imsm_min_reserved_sectors(struct intel_super *super); + +static int 
split_ull(unsigned long long n, void *lo, void *hi) +{ + if (lo == 0 || hi == 0) + return 1; + __put_unaligned32(__cpu_to_le32((__u32)n), lo); + __put_unaligned32(__cpu_to_le32((n >> 32)), hi); + return 0; +} + +static unsigned long long join_u32(__u32 lo, __u32 hi) +{ + return (unsigned long long)__le32_to_cpu(lo) | + (((unsigned long long)__le32_to_cpu(hi)) << 32); +} + +static unsigned long long total_blocks(struct imsm_disk *disk) +{ + if (disk == NULL) + return 0; + return join_u32(disk->total_blocks_lo, disk->total_blocks_hi); +} + +/** + * imsm_num_data_members() - get data drives count for an array. + * @map: Map to analyze. + * + * num_data_members value represents minimal count of drives for level. + * The name of the property could be misleading for RAID5 with asymmetric layout + * because some data required to be calculated from parity. + * The property is extracted from level and num_members value. + * + * Return: num_data_members value on success, zero otherwise. + */ +static __u8 imsm_num_data_members(struct imsm_map *map) +{ + switch (get_imsm_raid_level(map)) { + case 0: + return map->num_members; + case 1: + case 10: + return map->num_members / 2; + case 5: + return map->num_members - 1; + default: + dprintf("unsupported raid level\n"); + return 0; + } +} + +static unsigned long long pba_of_lba0(struct imsm_map *map) +{ + if (map == NULL) + return 0; + return join_u32(map->pba_of_lba0_lo, map->pba_of_lba0_hi); +} + +static unsigned long long blocks_per_member(struct imsm_map *map) +{ + if (map == NULL) + return 0; + return join_u32(map->blocks_per_member_lo, map->blocks_per_member_hi); +} + +static unsigned long long num_data_stripes(struct imsm_map *map) +{ + if (map == NULL) + return 0; + return join_u32(map->num_data_stripes_lo, map->num_data_stripes_hi); +} + +static unsigned long long vol_curr_migr_unit(struct imsm_dev *dev) +{ + if (dev == NULL) + return 0; + + return join_u32(dev->vol.curr_migr_unit_lo, dev->vol.curr_migr_unit_hi); +} + 
+static unsigned long long imsm_dev_size(struct imsm_dev *dev) +{ + if (dev == NULL) + return 0; + return join_u32(dev->size_low, dev->size_high); +} + +static unsigned long long migr_chkp_area_pba(struct migr_record *migr_rec) +{ + if (migr_rec == NULL) + return 0; + return join_u32(migr_rec->ckpt_area_pba_lo, + migr_rec->ckpt_area_pba_hi); +} + +static unsigned long long current_migr_unit(struct migr_record *migr_rec) +{ + if (migr_rec == NULL) + return 0; + return join_u32(migr_rec->curr_migr_unit_lo, + migr_rec->curr_migr_unit_hi); +} + +static unsigned long long migr_dest_1st_member_lba(struct migr_record *migr_rec) +{ + if (migr_rec == NULL) + return 0; + return join_u32(migr_rec->dest_1st_member_lba_lo, + migr_rec->dest_1st_member_lba_hi); +} + +static unsigned long long get_num_migr_units(struct migr_record *migr_rec) +{ + if (migr_rec == NULL) + return 0; + return join_u32(migr_rec->num_migr_units_lo, + migr_rec->num_migr_units_hi); +} + +static void set_total_blocks(struct imsm_disk *disk, unsigned long long n) +{ + split_ull(n, &disk->total_blocks_lo, &disk->total_blocks_hi); +} + +/** + * set_num_domains() - Set number of domains for an array. + * @map: Map to be updated. + * + * num_domains property represents copies count of each data drive, thus make + * it meaningful only for RAID1 and RAID10. IMSM supports two domains for + * raid1 and raid10. 
+ */ +static void set_num_domains(struct imsm_map *map) +{ + int level = get_imsm_raid_level(map); + + if (level == 1 || level == 10) + map->num_domains = 2; + else + map->num_domains = 1; +} + +static void set_pba_of_lba0(struct imsm_map *map, unsigned long long n) +{ + split_ull(n, &map->pba_of_lba0_lo, &map->pba_of_lba0_hi); +} + +static void set_blocks_per_member(struct imsm_map *map, unsigned long long n) +{ + split_ull(n, &map->blocks_per_member_lo, &map->blocks_per_member_hi); +} + +static void set_num_data_stripes(struct imsm_map *map, unsigned long long n) +{ + split_ull(n, &map->num_data_stripes_lo, &map->num_data_stripes_hi); +} + +/** + * update_num_data_stripes() - Calculate and update num_data_stripes value. + * @map: map to be updated. + * @dev_size: size of volume. + * + * num_data_stripes value is additionally divided by num_domains, therefore for + * levels where num_domains is not 1, nds is only a part of the real value. + * + * NOTE(review): none of the three divisors is checked for zero here; + * imsm_num_data_members() returns 0 for an unsupported raid level, so callers + * are presumably expected to pass a fully initialised map - confirm. + */ +static void update_num_data_stripes(struct imsm_map *map, + unsigned long long dev_size) +{ + unsigned long long nds = dev_size / imsm_num_data_members(map); + + nds /= map->num_domains; + nds /= map->blocks_per_strip; + set_num_data_stripes(map, nds); +} + +static void set_vol_curr_migr_unit(struct imsm_dev *dev, unsigned long long n) +{ + if (dev == NULL) + return; + + split_ull(n, &dev->vol.curr_migr_unit_lo, &dev->vol.curr_migr_unit_hi); +} + +static void set_imsm_dev_size(struct imsm_dev *dev, unsigned long long n) +{ + split_ull(n, &dev->size_low, &dev->size_high); +} + +static void set_migr_chkp_area_pba(struct migr_record *migr_rec, + unsigned long long n) +{ + split_ull(n, &migr_rec->ckpt_area_pba_lo, &migr_rec->ckpt_area_pba_hi); +} + +static void set_current_migr_unit(struct migr_record *migr_rec, + unsigned long long n) +{ + split_ull(n, &migr_rec->curr_migr_unit_lo, + &migr_rec->curr_migr_unit_hi); +} + +static void set_migr_dest_1st_member_lba(struct migr_record *migr_rec, + unsigned long long n) +{ + split_ull(n, 
&migr_rec->dest_1st_member_lba_lo, + &migr_rec->dest_1st_member_lba_hi); +} + +static void set_num_migr_units(struct migr_record *migr_rec, + unsigned long long n) +{ + split_ull(n, &migr_rec->num_migr_units_lo, + &migr_rec->num_migr_units_hi); +} + +static unsigned long long per_dev_array_size(struct imsm_map *map) +{ + unsigned long long array_size = 0; + + if (map == NULL) + return array_size; + + array_size = num_data_stripes(map) * map->blocks_per_strip; + if (get_imsm_raid_level(map) == 1 || get_imsm_raid_level(map) == 10) + array_size *= 2; + + return array_size; +} + +static struct extent *get_extents(struct intel_super *super, struct dl *dl, + int get_minimal_reservation) +{ + /* find a list of used extents on the given physical device */ + struct extent *rv, *e; + int i; + int memberships = count_memberships(dl, super); + __u32 reservation; + + /* trim the reserved area for spares, so they can join any array + * regardless of whether the OROM has assigned sectors from the + * IMSM_RESERVED_SECTORS region + */ + if (dl->index == -1 || get_minimal_reservation) + reservation = imsm_min_reserved_sectors(super); + else + reservation = MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS; + + rv = xcalloc(sizeof(struct extent), (memberships + 1)); + e = rv; + + for (i = 0; i < super->anchor->num_raid_devs; i++) { + struct imsm_dev *dev = get_imsm_dev(super, i); + struct imsm_map *map = get_imsm_map(dev, MAP_0); + + if (get_imsm_disk_slot(map, dl->index) >= 0) { + e->start = pba_of_lba0(map); + e->size = per_dev_array_size(map); + e++; + } + } + qsort(rv, memberships, sizeof(*rv), cmp_extent); + + /* determine the start of the metadata + * when no raid devices are defined use the default + * ...otherwise allow the metadata to truncate the value + * as is the case with older versions of imsm + */ + if (memberships) { + struct extent *last = &rv[memberships - 1]; + unsigned long long remainder; + + remainder = total_blocks(&dl->disk) - (last->start + last->size); + /* round 
down to 1k block to satisfy precision of the kernel + * 'size' interface + */ + /* 64-bit mask: remainder is unsigned long long, and ~1UL would be + * zero-extended and clear its upper half where unsigned long is + * 32 bits wide + */ + remainder &= ~1ULL; + /* make sure remainder is still sane */ + if (remainder < (unsigned)ROUND_UP(super->len, 512) >> 9) + remainder = ROUND_UP(super->len, 512) >> 9; + if (reservation > remainder) + reservation = remainder; + } + e->start = total_blocks(&dl->disk) - reservation; + e->size = 0; + return rv; +} + +/* try to determine how much space is reserved for metadata from + * the last get_extents() entry, otherwise fallback to the + * default + */ +static __u32 imsm_reserved_sectors(struct intel_super *super, struct dl *dl) +{ + struct extent *e; + int i; + __u32 rv; + + /* for spares just return a minimal reservation which will grow + * once the spare is picked up by an array + */ + if (dl->index == -1) + return MPB_SECTOR_CNT; + + e = get_extents(super, dl, 0); + if (!e) + return MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS; + + /* scroll to last entry */ + for (i = 0; e[i].size; i++) + continue; + + rv = total_blocks(&dl->disk) - e[i].start; + + free(e); + + return rv; +} + +static int is_spare(struct imsm_disk *disk) +{ + return (disk->status & SPARE_DISK) == SPARE_DISK; +} + +static int is_configured(struct imsm_disk *disk) +{ + return (disk->status & CONFIGURED_DISK) == CONFIGURED_DISK; +} + +static int is_failed(struct imsm_disk *disk) +{ + return (disk->status & FAILED_DISK) == FAILED_DISK; +} + +static int is_journal(struct imsm_disk *disk) +{ + return (disk->status & JOURNAL_DISK) == JOURNAL_DISK; +} + +/* round array size down to closest MB and ensure it splits evenly + * between members + */ +static unsigned long long round_size_to_mb(unsigned long long size, unsigned int + disk_count) +{ + size /= disk_count; + size = (size >> SECT_PER_MB_SHIFT) << SECT_PER_MB_SHIFT; + size *= disk_count; + + return size; +} + +static int able_to_resync(int raid_level, int missing_disks) +{ + int max_missing_disks = 0; + + switch (raid_level) { + case 10: + max_missing_disks = 1; + break; + 
default: + max_missing_disks = 0; + } + return missing_disks <= max_missing_disks; +} + +/* try to determine how much space is reserved for metadata from + * the last get_extents() entry on the smallest active disk, + * otherwise fallback to the default + */ +static __u32 imsm_min_reserved_sectors(struct intel_super *super) +{ + struct extent *e; + int i; + unsigned long long min_active; + __u32 remainder; + __u32 rv = MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS; + struct dl *dl, *dl_min = NULL; + + if (!super) + return rv; + + min_active = 0; + for (dl = super->disks; dl; dl = dl->next) { + if (dl->index < 0) + continue; + unsigned long long blocks = total_blocks(&dl->disk); + if (blocks < min_active || min_active == 0) { + dl_min = dl; + min_active = blocks; + } + } + if (!dl_min) + return rv; + + /* find last lba used by subarrays on the smallest active disk */ + e = get_extents(super, dl_min, 0); + if (!e) + return rv; + for (i = 0; e[i].size; i++) + continue; + + remainder = min_active - e[i].start; + free(e); + + /* to give priority to recovery we should not require full + IMSM_RESERVED_SECTORS from the spare */ + rv = MPB_SECTOR_CNT + NUM_BLOCKS_DIRTY_STRIPE_REGION; + + /* if real reservation is smaller use that value */ + return (remainder < rv) ? 
remainder : rv; +} + +/* + * Return minimum size of a spare and sector size + * that can be used in this array + */ +int get_spare_criteria_imsm(struct supertype *st, struct spare_criteria *c) +{ + struct intel_super *super = st->sb; + struct dl *dl; + struct extent *e; + int i; + unsigned long long size = 0; + + c->min_size = 0; + c->sector_size = 0; + + if (!super) + return -EINVAL; + /* find first active disk in array */ + dl = super->disks; + while (dl && (is_failed(&dl->disk) || dl->index == -1)) + dl = dl->next; + if (!dl) + return -EINVAL; + /* find last lba used by subarrays */ + e = get_extents(super, dl, 0); + if (!e) + return -EINVAL; + for (i = 0; e[i].size; i++) + continue; + if (i > 0) + size = e[i-1].start + e[i-1].size; + free(e); + + /* add the amount of space needed for metadata */ + size += imsm_min_reserved_sectors(super); + + c->min_size = size * 512; + c->sector_size = super->sector_size; + + return 0; +} + +static bool is_gen_migration(struct imsm_dev *dev); + +#define IMSM_4K_DIV 8 + +static __u64 blocks_per_migr_unit(struct intel_super *super, + struct imsm_dev *dev); + +static void print_imsm_dev(struct intel_super *super, + struct imsm_dev *dev, + char *uuid, + int disk_idx) +{ + __u64 sz; + int slot, i; + struct imsm_map *map = get_imsm_map(dev, MAP_0); + struct imsm_map *map2 = get_imsm_map(dev, MAP_1); + __u32 ord; + + printf("\n"); + printf("[%.16s]:\n", dev->volume); + printf(" Subarray : %d\n", super->current_vol); + printf(" UUID : %s\n", uuid); + printf(" RAID Level : %d", get_imsm_raid_level(map)); + if (map2) + printf(" <-- %d", get_imsm_raid_level(map2)); + printf("\n"); + printf(" Members : %d", map->num_members); + if (map2) + printf(" <-- %d", map2->num_members); + printf("\n"); + printf(" Slots : ["); + for (i = 0; i < map->num_members; i++) { + ord = get_imsm_ord_tbl_ent(dev, i, MAP_0); + printf("%s", ord & IMSM_ORD_REBUILD ? 
"_" : "U"); + } + printf("]"); + if (map2) { + printf(" <-- ["); + for (i = 0; i < map2->num_members; i++) { + ord = get_imsm_ord_tbl_ent(dev, i, MAP_1); + printf("%s", ord & IMSM_ORD_REBUILD ? "_" : "U"); + } + printf("]"); + } + printf("\n"); + printf(" Failed disk : "); + if (map->failed_disk_num == 0xff) + printf("none"); + else + printf("%i", map->failed_disk_num); + printf("\n"); + slot = get_imsm_disk_slot(map, disk_idx); + if (slot >= 0) { + ord = get_imsm_ord_tbl_ent(dev, slot, MAP_X); + printf(" This Slot : %d%s\n", slot, + ord & IMSM_ORD_REBUILD ? " (out-of-sync)" : ""); + } else + printf(" This Slot : ?\n"); + printf(" Sector Size : %u\n", super->sector_size); + sz = imsm_dev_size(dev); + printf(" Array Size : %llu%s\n", + (unsigned long long)sz * 512 / super->sector_size, + human_size(sz * 512)); + sz = blocks_per_member(map); + printf(" Per Dev Size : %llu%s\n", + (unsigned long long)sz * 512 / super->sector_size, + human_size(sz * 512)); + printf(" Sector Offset : %llu\n", + pba_of_lba0(map) * 512 / super->sector_size); + printf(" Num Stripes : %llu\n", + num_data_stripes(map)); + printf(" Chunk Size : %u KiB", + __le16_to_cpu(map->blocks_per_strip) / 2); + if (map2) + printf(" <-- %u KiB", + __le16_to_cpu(map2->blocks_per_strip) / 2); + printf("\n"); + printf(" Reserved : %d\n", __le32_to_cpu(dev->reserved_blocks)); + printf(" Migrate State : "); + if (dev->vol.migr_state) { + if (migr_type(dev) == MIGR_INIT) + printf("initialize\n"); + else if (migr_type(dev) == MIGR_REBUILD) + printf("rebuild\n"); + else if (migr_type(dev) == MIGR_VERIFY) + printf("check\n"); + else if (migr_type(dev) == MIGR_GEN_MIGR) + printf("general migration\n"); + else if (migr_type(dev) == MIGR_STATE_CHANGE) + printf("state change\n"); + else if (migr_type(dev) == MIGR_REPAIR) + printf("repair\n"); + else + printf("<unknown:%d>\n", migr_type(dev)); + } else + printf("idle\n"); + printf(" Map State : %s", map_state_str[map->map_state]); + if (dev->vol.migr_state) { + struct 
imsm_map *map = get_imsm_map(dev, MAP_1); + + printf(" <-- %s", map_state_str[map->map_state]); + printf("\n Checkpoint : %llu ", vol_curr_migr_unit(dev)); + if (is_gen_migration(dev) && (slot > 1 || slot < 0)) + printf("(N/A)"); + else + printf("(%llu)", (unsigned long long) + blocks_per_migr_unit(super, dev)); + } + printf("\n"); + printf(" Dirty State : %s\n", (dev->vol.dirty & RAIDVOL_DIRTY) ? + "dirty" : "clean"); + printf(" RWH Policy : "); + if (dev->rwh_policy == RWH_OFF || dev->rwh_policy == RWH_MULTIPLE_OFF) + printf("off\n"); + else if (dev->rwh_policy == RWH_DISTRIBUTED) + printf("PPL distributed\n"); + else if (dev->rwh_policy == RWH_JOURNALING_DRIVE) + printf("PPL journaling drive\n"); + else if (dev->rwh_policy == RWH_MULTIPLE_DISTRIBUTED) + printf("Multiple distributed PPLs\n"); + else if (dev->rwh_policy == RWH_MULTIPLE_PPLS_JOURNALING_DRIVE) + printf("Multiple PPLs on journaling drive\n"); + else if (dev->rwh_policy == RWH_BITMAP) + printf("Write-intent bitmap\n"); + else + printf("<unknown:%d>\n", dev->rwh_policy); + + printf(" Volume ID : %u\n", dev->my_vol_raid_dev_num); +} + +static void print_imsm_disk(struct imsm_disk *disk, + int index, + __u32 reserved, + unsigned int sector_size) { + char str[MAX_RAID_SERIAL_LEN + 1]; + __u64 sz; + + if (index < -1 || !disk) + return; + + printf("\n"); + snprintf(str, MAX_RAID_SERIAL_LEN + 1, "%s", disk->serial); + if (index >= 0) + printf(" Disk%02d Serial : %s\n", index, str); + else + printf(" Disk Serial : %s\n", str); + printf(" State :%s%s%s%s\n", is_spare(disk) ? " spare" : "", + is_configured(disk) ? " active" : "", + is_failed(disk) ? " failed" : "", + is_journal(disk) ? 
" journal" : ""); + printf(" Id : %08x\n", __le32_to_cpu(disk->scsi_id)); + sz = total_blocks(disk) - reserved; + printf(" Usable Size : %llu%s\n", + (unsigned long long)sz * 512 / sector_size, + human_size(sz * 512)); +} + +void convert_to_4k_imsm_migr_rec(struct intel_super *super) +{ + struct migr_record *migr_rec = super->migr_rec; + + migr_rec->blocks_per_unit /= IMSM_4K_DIV; + migr_rec->dest_depth_per_unit /= IMSM_4K_DIV; + split_ull((join_u32(migr_rec->post_migr_vol_cap, + migr_rec->post_migr_vol_cap_hi) / IMSM_4K_DIV), + &migr_rec->post_migr_vol_cap, &migr_rec->post_migr_vol_cap_hi); + set_migr_chkp_area_pba(migr_rec, + migr_chkp_area_pba(migr_rec) / IMSM_4K_DIV); + set_migr_dest_1st_member_lba(migr_rec, + migr_dest_1st_member_lba(migr_rec) / IMSM_4K_DIV); +} + +void convert_to_4k_imsm_disk(struct imsm_disk *disk) +{ + set_total_blocks(disk, (total_blocks(disk)/IMSM_4K_DIV)); +} + +void convert_to_4k(struct intel_super *super) +{ + struct imsm_super *mpb = super->anchor; + struct imsm_disk *disk; + int i; + __u32 bbm_log_size = __le32_to_cpu(mpb->bbm_log_size); + + for (i = 0; i < mpb->num_disks ; i++) { + disk = __get_imsm_disk(mpb, i); + /* disk */ + convert_to_4k_imsm_disk(disk); + } + for (i = 0; i < mpb->num_raid_devs; i++) { + struct imsm_dev *dev = __get_imsm_dev(mpb, i); + struct imsm_map *map = get_imsm_map(dev, MAP_0); + /* dev */ + set_imsm_dev_size(dev, imsm_dev_size(dev)/IMSM_4K_DIV); + set_vol_curr_migr_unit(dev, + vol_curr_migr_unit(dev) / IMSM_4K_DIV); + + /* map0 */ + set_blocks_per_member(map, blocks_per_member(map)/IMSM_4K_DIV); + map->blocks_per_strip /= IMSM_4K_DIV; + set_pba_of_lba0(map, pba_of_lba0(map)/IMSM_4K_DIV); + + if (dev->vol.migr_state) { + /* map1 */ + map = get_imsm_map(dev, MAP_1); + set_blocks_per_member(map, + blocks_per_member(map)/IMSM_4K_DIV); + map->blocks_per_strip /= IMSM_4K_DIV; + set_pba_of_lba0(map, pba_of_lba0(map)/IMSM_4K_DIV); + } + } + if (bbm_log_size) { + struct bbm_log *log = (void *)mpb + + 
__le32_to_cpu(mpb->mpb_size) - bbm_log_size; + __u32 i; + + for (i = 0; i < log->entry_count; i++) { + struct bbm_log_entry *entry = + &log->marked_block_entries[i]; + + /* int, not __u8: a __u8 would wrap to 0 if marked_count + * held 255, and max() below would then store 0 instead + * of the scaled-down count + */ + int count = entry->marked_count + 1; + unsigned long long sector = + __le48_to_cpu(&entry->defective_block_start); + + entry->defective_block_start = + __cpu_to_le48(sector/IMSM_4K_DIV); + entry->marked_count = max(count/IMSM_4K_DIV, 1) - 1; + } + } + + mpb->check_sum = __gen_imsm_checksum(mpb); +} + +void examine_migr_rec_imsm(struct intel_super *super) +{ + struct migr_record *migr_rec = super->migr_rec; + struct imsm_super *mpb = super->anchor; + int i; + + for (i = 0; i < mpb->num_raid_devs; i++) { + struct imsm_dev *dev = __get_imsm_dev(mpb, i); + struct imsm_map *map; + int slot = -1; + + if (is_gen_migration(dev) == false) + continue; + + printf("\nMigration Record Information:"); + + /* first map under migration */ + map = get_imsm_map(dev, MAP_0); + if (map) + slot = get_imsm_disk_slot(map, super->disks->index); + if (map == NULL || slot > 1 || slot < 0) { + printf(" Empty\n "); + printf("Examine one of first two disks in array\n"); + break; + } + printf("\n Status : "); + if (__le32_to_cpu(migr_rec->rec_status) == UNIT_SRC_NORMAL) + printf("Normal\n"); + else + printf("Contains Data\n"); + printf(" Current Unit : %llu\n", + current_migr_unit(migr_rec)); + printf(" Family : %u\n", + __le32_to_cpu(migr_rec->family_num)); + printf(" Ascending : %u\n", + __le32_to_cpu(migr_rec->ascending_migr)); + printf(" Blocks Per Unit : %u\n", + __le32_to_cpu(migr_rec->blocks_per_unit)); + printf(" Dest. 
Depth Per Unit : %u\n", + __le32_to_cpu(migr_rec->dest_depth_per_unit)); + printf(" Checkpoint Area pba : %llu\n", + migr_chkp_area_pba(migr_rec)); + printf(" First member lba : %llu\n", + migr_dest_1st_member_lba(migr_rec)); + printf(" Total Number of Units : %llu\n", + get_num_migr_units(migr_rec)); + printf(" Size of volume : %llu\n", + join_u32(migr_rec->post_migr_vol_cap, + migr_rec->post_migr_vol_cap_hi)); + printf(" Record was read from : %u\n", + __le32_to_cpu(migr_rec->ckpt_read_disk_num)); + + break; + } +} + +void convert_from_4k_imsm_migr_rec(struct intel_super *super) +{ + struct migr_record *migr_rec = super->migr_rec; + + migr_rec->blocks_per_unit *= IMSM_4K_DIV; + migr_rec->dest_depth_per_unit *= IMSM_4K_DIV; + split_ull((join_u32(migr_rec->post_migr_vol_cap, + migr_rec->post_migr_vol_cap_hi) * IMSM_4K_DIV), + &migr_rec->post_migr_vol_cap, + &migr_rec->post_migr_vol_cap_hi); + set_migr_chkp_area_pba(migr_rec, + migr_chkp_area_pba(migr_rec) * IMSM_4K_DIV); + set_migr_dest_1st_member_lba(migr_rec, + migr_dest_1st_member_lba(migr_rec) * IMSM_4K_DIV); +} + +void convert_from_4k(struct intel_super *super) +{ + struct imsm_super *mpb = super->anchor; + struct imsm_disk *disk; + int i; + __u32 bbm_log_size = __le32_to_cpu(mpb->bbm_log_size); + + for (i = 0; i < mpb->num_disks ; i++) { + disk = __get_imsm_disk(mpb, i); + /* disk */ + set_total_blocks(disk, (total_blocks(disk)*IMSM_4K_DIV)); + } + + for (i = 0; i < mpb->num_raid_devs; i++) { + struct imsm_dev *dev = __get_imsm_dev(mpb, i); + struct imsm_map *map = get_imsm_map(dev, MAP_0); + /* dev */ + set_imsm_dev_size(dev, imsm_dev_size(dev)*IMSM_4K_DIV); + set_vol_curr_migr_unit(dev, + vol_curr_migr_unit(dev) * IMSM_4K_DIV); + + /* map0 */ + set_blocks_per_member(map, blocks_per_member(map)*IMSM_4K_DIV); + map->blocks_per_strip *= IMSM_4K_DIV; + set_pba_of_lba0(map, pba_of_lba0(map)*IMSM_4K_DIV); + + if (dev->vol.migr_state) { + /* map1 */ + map = get_imsm_map(dev, MAP_1); + set_blocks_per_member(map, + 
blocks_per_member(map)*IMSM_4K_DIV); + map->blocks_per_strip *= IMSM_4K_DIV; + set_pba_of_lba0(map, pba_of_lba0(map)*IMSM_4K_DIV); + } + } + if (bbm_log_size) { + struct bbm_log *log = (void *)mpb + + __le32_to_cpu(mpb->mpb_size) - bbm_log_size; + __u32 i; + + for (i = 0; i < log->entry_count; i++) { + struct bbm_log_entry *entry = + &log->marked_block_entries[i]; + + __u8 count = entry->marked_count + 1; + unsigned long long sector = + __le48_to_cpu(&entry->defective_block_start); + + entry->defective_block_start = + __cpu_to_le48(sector*IMSM_4K_DIV); + entry->marked_count = count*IMSM_4K_DIV - 1; + } + } + + mpb->check_sum = __gen_imsm_checksum(mpb); +} + +/******************************************************************************* + * function: imsm_check_attributes + * Description: Function checks if features represented by attributes flags + * are supported by mdadm. + * Parameters: + * attributes - Attributes read from metadata + * Returns: + * 0 - passed attributes contains unsupported features flags + * 1 - all features are supported + ******************************************************************************/ +static int imsm_check_attributes(__u32 attributes) +{ + int ret_val = 1; + __u32 not_supported = MPB_ATTRIB_SUPPORTED^0xffffffff; + + not_supported &= ~MPB_ATTRIB_IGNORED; + + not_supported &= attributes; + if (not_supported) { + pr_err("(IMSM): Unsupported attributes : %x\n", + (unsigned)__le32_to_cpu(not_supported)); + if (not_supported & MPB_ATTRIB_CHECKSUM_VERIFY) { + dprintf("\t\tMPB_ATTRIB_CHECKSUM_VERIFY \n"); + not_supported ^= MPB_ATTRIB_CHECKSUM_VERIFY; + } + if (not_supported & MPB_ATTRIB_2TB) { + dprintf("\t\tMPB_ATTRIB_2TB\n"); + not_supported ^= MPB_ATTRIB_2TB; + } + if (not_supported & MPB_ATTRIB_RAID0) { + dprintf("\t\tMPB_ATTRIB_RAID0\n"); + not_supported ^= MPB_ATTRIB_RAID0; + } + if (not_supported & MPB_ATTRIB_RAID1) { + dprintf("\t\tMPB_ATTRIB_RAID1\n"); + not_supported ^= MPB_ATTRIB_RAID1; + } + if (not_supported & 
MPB_ATTRIB_RAID10) { + dprintf("\t\tMPB_ATTRIB_RAID10\n"); + not_supported ^= MPB_ATTRIB_RAID10; + } + if (not_supported & MPB_ATTRIB_RAID1E) { + dprintf("\t\tMPB_ATTRIB_RAID1E\n"); + not_supported ^= MPB_ATTRIB_RAID1E; + } + if (not_supported & MPB_ATTRIB_RAID5) { + dprintf("\t\tMPB_ATTRIB_RAID5\n"); + not_supported ^= MPB_ATTRIB_RAID5; + } + if (not_supported & MPB_ATTRIB_RAIDCNG) { + dprintf("\t\tMPB_ATTRIB_RAIDCNG\n"); + not_supported ^= MPB_ATTRIB_RAIDCNG; + } + if (not_supported & MPB_ATTRIB_BBM) { + dprintf("\t\tMPB_ATTRIB_BBM\n"); + not_supported ^= MPB_ATTRIB_BBM; + } + if (not_supported & MPB_ATTRIB_CHECKSUM_VERIFY) { + dprintf("\t\tMPB_ATTRIB_CHECKSUM_VERIFY (== MPB_ATTRIB_LEGACY)\n"); + not_supported ^= MPB_ATTRIB_CHECKSUM_VERIFY; + } + if (not_supported & MPB_ATTRIB_EXP_STRIPE_SIZE) { + dprintf("\t\tMPB_ATTRIB_EXP_STRIP_SIZE\n"); + not_supported ^= MPB_ATTRIB_EXP_STRIPE_SIZE; + } + if (not_supported & MPB_ATTRIB_2TB_DISK) { + dprintf("\t\tMPB_ATTRIB_2TB_DISK\n"); + not_supported ^= MPB_ATTRIB_2TB_DISK; + } + if (not_supported & MPB_ATTRIB_NEVER_USE2) { + dprintf("\t\tMPB_ATTRIB_NEVER_USE2\n"); + not_supported ^= MPB_ATTRIB_NEVER_USE2; + } + if (not_supported & MPB_ATTRIB_NEVER_USE) { + dprintf("\t\tMPB_ATTRIB_NEVER_USE\n"); + not_supported ^= MPB_ATTRIB_NEVER_USE; + } + + if (not_supported) + dprintf("(IMSM): Unknown attributes : %x\n", not_supported); + + ret_val = 0; + } + + return ret_val; +} + +static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info, char *map); + +static void examine_super_imsm(struct supertype *st, char *homehost) +{ + struct intel_super *super = st->sb; + struct imsm_super *mpb = super->anchor; + char str[MAX_SIGNATURE_LENGTH]; + int i; + struct mdinfo info; + char nbuf[64]; + __u32 sum; + __u32 reserved = imsm_reserved_sectors(super, super->disks); + struct dl *dl; + time_t creation_time; + + strncpy(str, (char *)mpb->sig, MPB_SIG_LEN); + str[MPB_SIG_LEN-1] = '\0'; + printf(" Magic : %s\n", str); + printf(" 
Version : %s\n", get_imsm_version(mpb)); + printf(" Orig Family : %08x\n", __le32_to_cpu(mpb->orig_family_num)); + printf(" Family : %08x\n", __le32_to_cpu(mpb->family_num)); + printf(" Generation : %08x\n", __le32_to_cpu(mpb->generation_num)); + creation_time = __le64_to_cpu(mpb->creation_time); + printf(" Creation Time : %.24s\n", + creation_time ? ctime(&creation_time) : "Unknown"); + printf(" Attributes : "); + if (imsm_check_attributes(mpb->attributes)) + printf("All supported\n"); + else + printf("not supported\n"); + getinfo_super_imsm(st, &info, NULL); + fname_from_uuid(st, &info, nbuf, ':'); + printf(" UUID : %s\n", nbuf + 5); + sum = __le32_to_cpu(mpb->check_sum); + printf(" Checksum : %08x %s\n", sum, + __gen_imsm_checksum(mpb) == sum ? "correct" : "incorrect"); + printf(" MPB Sectors : %d\n", mpb_sectors(mpb, super->sector_size)); + printf(" Disks : %d\n", mpb->num_disks); + printf(" RAID Devices : %d\n", mpb->num_raid_devs); + print_imsm_disk(__get_imsm_disk(mpb, super->disks->index), + super->disks->index, reserved, super->sector_size); + if (get_imsm_bbm_log_size(super->bbm_log)) { + struct bbm_log *log = super->bbm_log; + + printf("\n"); + printf("Bad Block Management Log:\n"); + printf(" Log Size : %d\n", __le32_to_cpu(mpb->bbm_log_size)); + printf(" Signature : %x\n", __le32_to_cpu(log->signature)); + printf(" Entry Count : %d\n", __le32_to_cpu(log->entry_count)); + } + for (i = 0; i < mpb->num_raid_devs; i++) { + struct mdinfo info; + struct imsm_dev *dev = __get_imsm_dev(mpb, i); + + super->current_vol = i; + getinfo_super_imsm(st, &info, NULL); + fname_from_uuid(st, &info, nbuf, ':'); + print_imsm_dev(super, dev, nbuf + 5, super->disks->index); + } + for (i = 0; i < mpb->num_disks; i++) { + if (i == super->disks->index) + continue; + print_imsm_disk(__get_imsm_disk(mpb, i), i, reserved, + super->sector_size); + } + + for (dl = super->disks; dl; dl = dl->next) + if (dl->index == -1) + print_imsm_disk(&dl->disk, -1, reserved, + 
super->sector_size); + + examine_migr_rec_imsm(super); +} + +static void brief_examine_super_imsm(struct supertype *st, int verbose) +{ + /* We just write a generic IMSM ARRAY entry */ + struct mdinfo info; + char nbuf[64]; + + getinfo_super_imsm(st, &info, NULL); + fname_from_uuid(st, &info, nbuf, ':'); + printf("ARRAY metadata=imsm UUID=%s\n", nbuf + 5); +} + +static void brief_examine_subarrays_imsm(struct supertype *st, int verbose) +{ + /* We just write a generic IMSM ARRAY entry */ + struct mdinfo info; + char nbuf[64]; + char nbuf1[64]; + struct intel_super *super = st->sb; + int i; + + if (!super->anchor->num_raid_devs) + return; + + getinfo_super_imsm(st, &info, NULL); + fname_from_uuid(st, &info, nbuf, ':'); + for (i = 0; i < super->anchor->num_raid_devs; i++) { + struct imsm_dev *dev = get_imsm_dev(super, i); + + super->current_vol = i; + getinfo_super_imsm(st, &info, NULL); + fname_from_uuid(st, &info, nbuf1, ':'); + printf("ARRAY /dev/md/%.16s container=%s member=%d UUID=%s\n", + dev->volume, nbuf + 5, i, nbuf1 + 5); + } +} + +static void export_examine_super_imsm(struct supertype *st) +{ + struct intel_super *super = st->sb; + struct imsm_super *mpb = super->anchor; + struct mdinfo info; + char nbuf[64]; + + getinfo_super_imsm(st, &info, NULL); + fname_from_uuid(st, &info, nbuf, ':'); + printf("MD_METADATA=imsm\n"); + printf("MD_LEVEL=container\n"); + printf("MD_UUID=%s\n", nbuf+5); + printf("MD_DEVICES=%u\n", mpb->num_disks); + printf("MD_CREATION_TIME=%llu\n", __le64_to_cpu(mpb->creation_time)); +} + +static void detail_super_imsm(struct supertype *st, char *homehost, + char *subarray) +{ + struct mdinfo info; + char nbuf[64]; + struct intel_super *super = st->sb; + int temp_vol = super->current_vol; + + if (subarray) + super->current_vol = strtoul(subarray, NULL, 10); + + getinfo_super_imsm(st, &info, NULL); + fname_from_uuid(st, &info, nbuf, ':'); + printf("\n UUID : %s\n", nbuf + 5); + + super->current_vol = temp_vol; +} + +static void 
brief_detail_super_imsm(struct supertype *st, char *subarray) +{ + struct mdinfo info; + char nbuf[64]; + struct intel_super *super = st->sb; + int temp_vol = super->current_vol; + + if (subarray) + super->current_vol = strtoul(subarray, NULL, 10); + + getinfo_super_imsm(st, &info, NULL); + fname_from_uuid(st, &info, nbuf, ':'); + printf(" UUID=%s", nbuf + 5); + + super->current_vol = temp_vol; +} + +static int imsm_read_serial(int fd, char *devname, __u8 *serial, + size_t serial_buf_len); +static void fd2devname(int fd, char *name); + +static int ahci_enumerate_ports(const char *hba_path, int port_count, int host_base, int verbose) +{ + /* dump an unsorted list of devices attached to AHCI Intel storage + * controller, as well as non-connected ports + */ + int hba_len = strlen(hba_path) + 1; + struct dirent *ent; + DIR *dir; + char *path = NULL; + int err = 0; + unsigned long port_mask = (1 << port_count) - 1; + + if (port_count > (int)sizeof(port_mask) * 8) { + if (verbose > 0) + pr_err("port_count %d out of range\n", port_count); + return 2; + } + + /* scroll through /sys/dev/block looking for devices attached to + * this hba + */ + dir = opendir("/sys/dev/block"); + if (!dir) + return 1; + + for (ent = readdir(dir); ent; ent = readdir(dir)) { + int fd; + char model[64]; + char vendor[64]; + char buf[1024]; + int major, minor; + char device[PATH_MAX]; + char *c; + int port; + int type; + + if (sscanf(ent->d_name, "%d:%d", &major, &minor) != 2) + continue; + path = devt_to_devpath(makedev(major, minor), 1, NULL); + if (!path) + continue; + if (!path_attached_to_hba(path, hba_path)) { + free(path); + path = NULL; + continue; + } + + /* retrieve the scsi device */ + if (!devt_to_devpath(makedev(major, minor), 1, device)) { + if (verbose > 0) + pr_err("failed to get device\n"); + err = 2; + break; + } + if (devpath_to_char(device, "type", buf, sizeof(buf), 0)) { + err = 2; + break; + } + type = strtoul(buf, NULL, 10); + + /* if it's not a disk print the vendor and 
model */ + if (!(type == 0 || type == 7 || type == 14)) { + vendor[0] = '\0'; + model[0] = '\0'; + + if (devpath_to_char(device, "vendor", buf, + sizeof(buf), 0) == 0) { + strncpy(vendor, buf, sizeof(vendor)); + vendor[sizeof(vendor) - 1] = '\0'; + c = (char *) &vendor[sizeof(vendor) - 1]; + while (isspace(*c) || *c == '\0') + *c-- = '\0'; + + } + + if (devpath_to_char(device, "model", buf, + sizeof(buf), 0) == 0) { + strncpy(model, buf, sizeof(model)); + model[sizeof(model) - 1] = '\0'; + c = (char *) &model[sizeof(model) - 1]; + while (isspace(*c) || *c == '\0') + *c-- = '\0'; + } + + if (vendor[0] && model[0]) + sprintf(buf, "%.64s %.64s", vendor, model); + else + switch (type) { /* numbers from hald/linux/device.c */ + case 1: sprintf(buf, "tape"); break; + case 2: sprintf(buf, "printer"); break; + case 3: sprintf(buf, "processor"); break; + case 4: + case 5: sprintf(buf, "cdrom"); break; + case 6: sprintf(buf, "scanner"); break; + case 8: sprintf(buf, "media_changer"); break; + case 9: sprintf(buf, "comm"); break; + case 12: sprintf(buf, "raid"); break; + default: sprintf(buf, "unknown"); + } + } else + buf[0] = '\0'; + + /* chop device path to 'host%d' and calculate the port number */ + c = strchr(&path[hba_len], '/'); + if (!c) { + if (verbose > 0) + pr_err("%s - invalid path name\n", path + hba_len); + err = 2; + break; + } + *c = '\0'; + if ((sscanf(&path[hba_len], "ata%d", &port) == 1) || + ((sscanf(&path[hba_len], "host%d", &port) == 1))) + port -= host_base; + else { + if (verbose > 0) { + *c = '/'; /* repair the full string */ + pr_err("failed to determine port number for %s\n", + path); + } + err = 2; + break; + } + + /* mark this port as used */ + port_mask &= ~(1 << port); + + /* print out the device information */ + if (buf[0]) { + printf(" Port%d : - non-disk device (%s) -\n", port, buf); + continue; + } + + fd = dev_open(ent->d_name, O_RDONLY); + if (!is_fd_valid(fd)) + printf(" Port%d : - disk info unavailable -\n", port); + else { + 
fd2devname(fd, buf); + printf(" Port%d : %s", port, buf); + if (imsm_read_serial(fd, NULL, (__u8 *)buf, + sizeof(buf)) == 0) + printf(" (%s)\n", buf); + else + printf(" ()\n"); + close(fd); + } + free(path); + path = NULL; + } + if (path) + free(path); + if (dir) + closedir(dir); + if (err == 0) { + int i; + + for (i = 0; i < port_count; i++) + if (port_mask & (1 << i)) + printf(" Port%d : - no device attached -\n", i); + } + + return err; +} + +static int print_nvme_info(struct sys_dev *hba) +{ + struct dirent *ent; + DIR *dir; + + dir = opendir("/sys/block/"); + if (!dir) + return 1; + + for (ent = readdir(dir); ent; ent = readdir(dir)) { + char ns_path[PATH_MAX]; + char cntrl_path[PATH_MAX]; + char buf[PATH_MAX]; + int fd = -1; + + if (!strstr(ent->d_name, "nvme")) + goto skip; + + fd = open_dev(ent->d_name); + if (!is_fd_valid(fd)) + goto skip; + + if (!diskfd_to_devpath(fd, 0, ns_path) || + !diskfd_to_devpath(fd, 1, cntrl_path)) + goto skip; + + if (!path_attached_to_hba(cntrl_path, hba->path)) + goto skip; + + if (!imsm_is_nvme_namespace_supported(fd, 0)) + goto skip; + + fd2devname(fd, buf); + if (hba->type == SYS_DEV_VMD) + printf(" NVMe under VMD : %s", buf); + else if (hba->type == SYS_DEV_NVME) + printf(" NVMe Device : %s", buf); + + if (!imsm_read_serial(fd, NULL, (__u8 *)buf, + sizeof(buf))) + printf(" (%s)\n", buf); + else + printf("()\n"); + +skip: + close_fd(&fd); + } + + closedir(dir); + return 0; +} + +static void print_found_intel_controllers(struct sys_dev *elem) +{ + for (; elem; elem = elem->next) { + pr_err("found Intel(R) "); + if (elem->type == SYS_DEV_SATA) + fprintf(stderr, "SATA "); + else if (elem->type == SYS_DEV_SAS) + fprintf(stderr, "SAS "); + else if (elem->type == SYS_DEV_NVME) + fprintf(stderr, "NVMe "); + + if (elem->type == SYS_DEV_VMD) + fprintf(stderr, "VMD domain"); + else + fprintf(stderr, "RAID controller"); + + if (elem->pci_id) + fprintf(stderr, " at %s", elem->pci_id); + fprintf(stderr, ".\n"); + } + fflush(stderr); +} 
+ +static int ahci_get_port_count(const char *hba_path, int *port_count) +{ + struct dirent *ent; + DIR *dir; + int host_base = -1; + + *port_count = 0; + if ((dir = opendir(hba_path)) == NULL) + return -1; + + for (ent = readdir(dir); ent; ent = readdir(dir)) { + int host; + + if ((sscanf(ent->d_name, "ata%d", &host) != 1) && + ((sscanf(ent->d_name, "host%d", &host) != 1))) + continue; + if (*port_count == 0) + host_base = host; + else if (host < host_base) + host_base = host; + + if (host + 1 > *port_count + host_base) + *port_count = host + 1 - host_base; + } + closedir(dir); + return host_base; +} + +static void print_imsm_capability(const struct imsm_orom *orom) +{ + printf(" Platform : Intel(R) "); + if (orom->capabilities == 0 && orom->driver_features == 0) + printf("Matrix Storage Manager\n"); + else if (imsm_orom_is_enterprise(orom) && orom->major_ver >= 6) + printf("Virtual RAID on CPU\n"); + else + printf("Rapid Storage Technology%s\n", + imsm_orom_is_enterprise(orom) ? " enterprise" : ""); + if (orom->major_ver || orom->minor_ver || orom->hotfix_ver || orom->build) + printf(" Version : %d.%d.%d.%d\n", orom->major_ver, + orom->minor_ver, orom->hotfix_ver, orom->build); + printf(" RAID Levels :%s%s%s%s%s\n", + imsm_orom_has_raid0(orom) ? " raid0" : "", + imsm_orom_has_raid1(orom) ? " raid1" : "", + imsm_orom_has_raid1e(orom) ? " raid1e" : "", + imsm_orom_has_raid10(orom) ? " raid10" : "", + imsm_orom_has_raid5(orom) ? " raid5" : ""); + printf(" Chunk Sizes :%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", + imsm_orom_has_chunk(orom, 2) ? " 2k" : "", + imsm_orom_has_chunk(orom, 4) ? " 4k" : "", + imsm_orom_has_chunk(orom, 8) ? " 8k" : "", + imsm_orom_has_chunk(orom, 16) ? " 16k" : "", + imsm_orom_has_chunk(orom, 32) ? " 32k" : "", + imsm_orom_has_chunk(orom, 64) ? " 64k" : "", + imsm_orom_has_chunk(orom, 128) ? " 128k" : "", + imsm_orom_has_chunk(orom, 256) ? " 256k" : "", + imsm_orom_has_chunk(orom, 512) ? " 512k" : "", + imsm_orom_has_chunk(orom, 1024*1) ? 
" 1M" : "", + imsm_orom_has_chunk(orom, 1024*2) ? " 2M" : "", + imsm_orom_has_chunk(orom, 1024*4) ? " 4M" : "", + imsm_orom_has_chunk(orom, 1024*8) ? " 8M" : "", + imsm_orom_has_chunk(orom, 1024*16) ? " 16M" : "", + imsm_orom_has_chunk(orom, 1024*32) ? " 32M" : "", + imsm_orom_has_chunk(orom, 1024*64) ? " 64M" : ""); + printf(" 2TB volumes :%s supported\n", + (orom->attr & IMSM_OROM_ATTR_2TB)?"":" not"); + printf(" 2TB disks :%s supported\n", + (orom->attr & IMSM_OROM_ATTR_2TB_DISK)?"":" not"); + printf(" Max Disks : %d\n", orom->tds); + printf(" Max Volumes : %d per array, %d per %s\n", + orom->vpa, orom->vphba, + imsm_orom_is_nvme(orom) ? "platform" : "controller"); + return; +} + +static void print_imsm_capability_export(const struct imsm_orom *orom) +{ + printf("MD_FIRMWARE_TYPE=imsm\n"); + if (orom->major_ver || orom->minor_ver || orom->hotfix_ver || orom->build) + printf("IMSM_VERSION=%d.%d.%d.%d\n", orom->major_ver, orom->minor_ver, + orom->hotfix_ver, orom->build); + printf("IMSM_SUPPORTED_RAID_LEVELS=%s%s%s%s%s\n", + imsm_orom_has_raid0(orom) ? "raid0 " : "", + imsm_orom_has_raid1(orom) ? "raid1 " : "", + imsm_orom_has_raid1e(orom) ? "raid1e " : "", + imsm_orom_has_raid5(orom) ? "raid10 " : "", + imsm_orom_has_raid10(orom) ? "raid5 " : ""); + printf("IMSM_SUPPORTED_CHUNK_SIZES=%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s\n", + imsm_orom_has_chunk(orom, 2) ? "2k " : "", + imsm_orom_has_chunk(orom, 4) ? "4k " : "", + imsm_orom_has_chunk(orom, 8) ? "8k " : "", + imsm_orom_has_chunk(orom, 16) ? "16k " : "", + imsm_orom_has_chunk(orom, 32) ? "32k " : "", + imsm_orom_has_chunk(orom, 64) ? "64k " : "", + imsm_orom_has_chunk(orom, 128) ? "128k " : "", + imsm_orom_has_chunk(orom, 256) ? "256k " : "", + imsm_orom_has_chunk(orom, 512) ? "512k " : "", + imsm_orom_has_chunk(orom, 1024*1) ? "1M " : "", + imsm_orom_has_chunk(orom, 1024*2) ? "2M " : "", + imsm_orom_has_chunk(orom, 1024*4) ? "4M " : "", + imsm_orom_has_chunk(orom, 1024*8) ? 
"8M " : "", + imsm_orom_has_chunk(orom, 1024*16) ? "16M " : "", + imsm_orom_has_chunk(orom, 1024*32) ? "32M " : "", + imsm_orom_has_chunk(orom, 1024*64) ? "64M " : ""); + printf("IMSM_2TB_VOLUMES=%s\n",(orom->attr & IMSM_OROM_ATTR_2TB) ? "yes" : "no"); + printf("IMSM_2TB_DISKS=%s\n",(orom->attr & IMSM_OROM_ATTR_2TB_DISK) ? "yes" : "no"); + printf("IMSM_MAX_DISKS=%d\n",orom->tds); + printf("IMSM_MAX_VOLUMES_PER_ARRAY=%d\n",orom->vpa); + printf("IMSM_MAX_VOLUMES_PER_CONTROLLER=%d\n",orom->vphba); +} + +static int detail_platform_imsm(int verbose, int enumerate_only, char *controller_path) +{ + /* There are two components to imsm platform support, the ahci SATA + * controller and the option-rom. To find the SATA controller we + * simply look in /sys/bus/pci/drivers/ahci to see if an ahci + * controller with the Intel vendor id is present. This approach + * allows mdadm to leverage the kernel's ahci detection logic, with the + * caveat that if ahci.ko is not loaded mdadm will not be able to + * detect platform raid capabilities. The option-rom resides in a + * platform "Adapter ROM". We scan for its signature to retrieve the + * platform capabilities. If raid support is disabled in the BIOS the + * option-rom capability structure will not be available. 
+ */ + struct sys_dev *list, *hba; + int host_base = 0; + int port_count = 0; + int result=1; + + if (enumerate_only) { + if (check_env("IMSM_NO_PLATFORM")) + return 0; + list = find_intel_devices(); + if (!list) + return 2; + for (hba = list; hba; hba = hba->next) { + if (find_imsm_capability(hba)) { + result = 0; + break; + } + else + result = 2; + } + return result; + } + + list = find_intel_devices(); + if (!list) { + if (verbose > 0) + pr_err("no active Intel(R) RAID controller found.\n"); + return 2; + } else if (verbose > 0) + print_found_intel_controllers(list); + + for (hba = list; hba; hba = hba->next) { + if (controller_path && (compare_paths(hba->path, controller_path) != 0)) + continue; + if (!find_imsm_capability(hba)) { + char buf[PATH_MAX]; + pr_err("imsm capabilities not found for controller: %s (type %s)\n", + hba->type == SYS_DEV_VMD ? vmd_domain_to_controller(hba, buf) : hba->path, + get_sys_dev_type(hba->type)); + continue; + } + result = 0; + } + + if (controller_path && result == 1) { + pr_err("no active Intel(R) RAID controller found under %s\n", + controller_path); + return result; + } + + const struct orom_entry *entry; + + for (entry = orom_entries; entry; entry = entry->next) { + if (entry->type == SYS_DEV_VMD) { + print_imsm_capability(&entry->orom); + printf(" 3rd party NVMe :%s supported\n", + imsm_orom_has_tpv_support(&entry->orom)?"":" not"); + for (hba = list; hba; hba = hba->next) { + if (hba->type == SYS_DEV_VMD) { + char buf[PATH_MAX]; + printf(" I/O Controller : %s (%s)\n", + vmd_domain_to_controller(hba, buf), get_sys_dev_type(hba->type)); + if (print_nvme_info(hba)) { + if (verbose > 0) + pr_err("failed to get devices attached to VMD domain.\n"); + result |= 2; + } + } + } + printf("\n"); + continue; + } + + print_imsm_capability(&entry->orom); + if (entry->type == SYS_DEV_NVME) { + for (hba = list; hba; hba = hba->next) { + if (hba->type == SYS_DEV_NVME) + print_nvme_info(hba); + } + printf("\n"); + continue; + } + + struct 
devid_list *devid; + for (devid = entry->devid_list; devid; devid = devid->next) { + hba = device_by_id(devid->devid); + if (!hba) + continue; + + printf(" I/O Controller : %s (%s)\n", + hba->path, get_sys_dev_type(hba->type)); + if (hba->type == SYS_DEV_SATA) { + host_base = ahci_get_port_count(hba->path, &port_count); + if (ahci_enumerate_ports(hba->path, port_count, host_base, verbose)) { + if (verbose > 0) + pr_err("failed to enumerate ports on SATA controller at %s.\n", hba->pci_id); + result |= 2; + } + } + } + printf("\n"); + } + + return result; +} + +static int export_detail_platform_imsm(int verbose, char *controller_path) +{ + struct sys_dev *list, *hba; + int result=1; + + list = find_intel_devices(); + if (!list) { + if (verbose > 0) + pr_err("IMSM_DETAIL_PLATFORM_ERROR=NO_INTEL_DEVICES\n"); + result = 2; + return result; + } + + for (hba = list; hba; hba = hba->next) { + if (controller_path && (compare_paths(hba->path,controller_path) != 0)) + continue; + if (!find_imsm_capability(hba) && verbose > 0) { + char buf[PATH_MAX]; + pr_err("IMSM_DETAIL_PLATFORM_ERROR=NO_IMSM_CAPABLE_DEVICE_UNDER_%s\n", + hba->type == SYS_DEV_VMD ? vmd_domain_to_controller(hba, buf) : hba->path); + } + else + result = 0; + } + + const struct orom_entry *entry; + + for (entry = orom_entries; entry; entry = entry->next) { + if (entry->type == SYS_DEV_VMD) { + for (hba = list; hba; hba = hba->next) + print_imsm_capability_export(&entry->orom); + continue; + } + print_imsm_capability_export(&entry->orom); + } + + return result; +} + +static int match_home_imsm(struct supertype *st, char *homehost) +{ + /* the imsm metadata format does not specify any host + * identification information. We return -1 since we can never + * confirm nor deny whether a given array is "meant" for this + * host. We rely on compare_super and the 'family_num' fields to + * exclude member disks that do not belong, and we rely on + * mdadm.conf to specify the arrays that should be assembled. 
+ * Auto-assembly may still pick up "foreign" arrays. + */ + + return -1; +} + +static void uuid_from_super_imsm(struct supertype *st, int uuid[4]) +{ + /* The uuid returned here is used for: + * uuid to put into bitmap file (Create, Grow) + * uuid for backup header when saving critical section (Grow) + * comparing uuids when re-adding a device into an array + * In these cases the uuid required is that of the data-array, + * not the device-set. + * uuid to recognise same set when adding a missing device back + * to an array. This is a uuid for the device-set. + * + * For each of these we can make do with a truncated + * or hashed uuid rather than the original, as long as + * everyone agrees. + * In each case the uuid required is that of the data-array, + * not the device-set. + */ + /* imsm does not track uuid's so we synthesis one using sha1 on + * - The signature (Which is constant for all imsm array, but no matter) + * - the orig_family_num of the container + * - the index number of the volume + * - the 'serial' number of the volume. + * Hopefully these are all constant. + */ + struct intel_super *super = st->sb; + + char buf[20]; + struct sha1_ctx ctx; + struct imsm_dev *dev = NULL; + __u32 family_num; + + /* some mdadm versions failed to set ->orig_family_num, in which + * case fall back to ->family_num. orig_family_num will be + * fixed up with the first metadata update. 
+ */ + family_num = super->anchor->orig_family_num; + if (family_num == 0) + family_num = super->anchor->family_num; + sha1_init_ctx(&ctx); + sha1_process_bytes(super->anchor->sig, MPB_SIG_LEN, &ctx); + sha1_process_bytes(&family_num, sizeof(__u32), &ctx); + if (super->current_vol >= 0) + dev = get_imsm_dev(super, super->current_vol); + if (dev) { + __u32 vol = super->current_vol; + sha1_process_bytes(&vol, sizeof(vol), &ctx); + sha1_process_bytes(dev->volume, MAX_RAID_SERIAL_LEN, &ctx); + } + sha1_finish_ctx(&ctx, buf); + memcpy(uuid, buf, 4*4); +} + +#if 0 +static void +get_imsm_numerical_version(struct imsm_super *mpb, int *m, int *p) +{ + __u8 *v = get_imsm_version(mpb); + __u8 *end = mpb->sig + MAX_SIGNATURE_LENGTH; + char major[] = { 0, 0, 0 }; + char minor[] = { 0 ,0, 0 }; + char patch[] = { 0, 0, 0 }; + char *ver_parse[] = { major, minor, patch }; + int i, j; + + i = j = 0; + while (*v != '\0' && v < end) { + if (*v != '.' && j < 2) + ver_parse[i][j++] = *v; + else { + i++; + j = 0; + } + v++; + } + + *m = strtol(minor, NULL, 0); + *p = strtol(patch, NULL, 0); +} +#endif + +static __u32 migr_strip_blocks_resync(struct imsm_dev *dev) +{ + /* migr_strip_size when repairing or initializing parity */ + struct imsm_map *map = get_imsm_map(dev, MAP_0); + __u32 chunk = __le32_to_cpu(map->blocks_per_strip); + + switch (get_imsm_raid_level(map)) { + case 5: + case 10: + return chunk; + default: + return 128*1024 >> 9; + } +} + +static __u32 migr_strip_blocks_rebuild(struct imsm_dev *dev) +{ + /* migr_strip_size when rebuilding a degraded disk, no idea why + * this is different than migr_strip_size_resync(), but it's good + * to be compatible + */ + struct imsm_map *map = get_imsm_map(dev, MAP_1); + __u32 chunk = __le32_to_cpu(map->blocks_per_strip); + + switch (get_imsm_raid_level(map)) { + case 1: + case 10: + if (map->num_members % map->num_domains == 0) + return 128*1024 >> 9; + else + return chunk; + case 5: + return max((__u32) 64*1024 >> 9, chunk); + default: 
+ return 128*1024 >> 9; + } +} + +static __u32 num_stripes_per_unit_resync(struct imsm_dev *dev) +{ + struct imsm_map *lo = get_imsm_map(dev, MAP_0); + struct imsm_map *hi = get_imsm_map(dev, MAP_1); + __u32 lo_chunk = __le32_to_cpu(lo->blocks_per_strip); + __u32 hi_chunk = __le32_to_cpu(hi->blocks_per_strip); + + return max((__u32) 1, hi_chunk / lo_chunk); +} + +static __u32 num_stripes_per_unit_rebuild(struct imsm_dev *dev) +{ + struct imsm_map *lo = get_imsm_map(dev, MAP_0); + int level = get_imsm_raid_level(lo); + + if (level == 1 || level == 10) { + struct imsm_map *hi = get_imsm_map(dev, MAP_1); + + return hi->num_domains; + } else + return num_stripes_per_unit_resync(dev); +} + +static unsigned long long calc_component_size(struct imsm_map *map, + struct imsm_dev *dev) +{ + unsigned long long component_size; + unsigned long long dev_size = imsm_dev_size(dev); + long long calc_dev_size = 0; + unsigned int member_disks = imsm_num_data_members(map); + + if (member_disks == 0) + return 0; + + component_size = per_dev_array_size(map); + calc_dev_size = component_size * member_disks; + + /* Component size is rounded to 1MB so difference between size from + * metadata and size calculated from num_data_stripes equals up to + * 2048 blocks per each device. If the difference is higher it means + * that array size was expanded and num_data_stripes was not updated. 
+ */ + if (llabs(calc_dev_size - (long long)dev_size) > + (1 << SECT_PER_MB_SHIFT) * member_disks) { + component_size = dev_size / member_disks; + dprintf("Invalid num_data_stripes in metadata; expected=%llu, found=%llu\n", + component_size / map->blocks_per_strip, + num_data_stripes(map)); + } + + return component_size; +} + +static __u32 parity_segment_depth(struct imsm_dev *dev) +{ + struct imsm_map *map = get_imsm_map(dev, MAP_0); + __u32 chunk = __le32_to_cpu(map->blocks_per_strip); + + switch(get_imsm_raid_level(map)) { + case 1: + case 10: + return chunk * map->num_domains; + case 5: + return chunk * map->num_members; + default: + return chunk; + } +} + +static __u32 map_migr_block(struct imsm_dev *dev, __u32 block) +{ + struct imsm_map *map = get_imsm_map(dev, MAP_1); + __u32 chunk = __le32_to_cpu(map->blocks_per_strip); + __u32 strip = block / chunk; + + switch (get_imsm_raid_level(map)) { + case 1: + case 10: { + __u32 vol_strip = (strip * map->num_domains) + 1; + __u32 vol_stripe = vol_strip / map->num_members; + + return vol_stripe * chunk + block % chunk; + } case 5: { + __u32 stripe = strip / (map->num_members - 1); + + return stripe * chunk + block % chunk; + } + default: + return 0; + } +} + +static __u64 blocks_per_migr_unit(struct intel_super *super, + struct imsm_dev *dev) +{ + /* calculate the conversion factor between per member 'blocks' + * (md/{resync,rebuild}_start) and imsm migration units, return + * 0 for the 'not migrating' and 'unsupported migration' cases + */ + if (!dev->vol.migr_state) + return 0; + + switch (migr_type(dev)) { + case MIGR_GEN_MIGR: { + struct migr_record *migr_rec = super->migr_rec; + return __le32_to_cpu(migr_rec->blocks_per_unit); + } + case MIGR_VERIFY: + case MIGR_REPAIR: + case MIGR_INIT: { + struct imsm_map *map = get_imsm_map(dev, MAP_0); + __u32 stripes_per_unit; + __u32 blocks_per_unit; + __u32 parity_depth; + __u32 migr_chunk; + __u32 block_map; + __u32 block_rel; + __u32 segment; + __u32 stripe; + __u8 
disks; + + /* yes, this is really the translation of migr_units to + * per-member blocks in the 'resync' case + */ + stripes_per_unit = num_stripes_per_unit_resync(dev); + migr_chunk = migr_strip_blocks_resync(dev); + disks = imsm_num_data_members(map); + blocks_per_unit = stripes_per_unit * migr_chunk * disks; + stripe = __le16_to_cpu(map->blocks_per_strip) * disks; + segment = blocks_per_unit / stripe; + block_rel = blocks_per_unit - segment * stripe; + parity_depth = parity_segment_depth(dev); + block_map = map_migr_block(dev, block_rel); + return block_map + parity_depth * segment; + } + case MIGR_REBUILD: { + __u32 stripes_per_unit; + __u32 migr_chunk; + + stripes_per_unit = num_stripes_per_unit_rebuild(dev); + migr_chunk = migr_strip_blocks_rebuild(dev); + return migr_chunk * stripes_per_unit; + } + case MIGR_STATE_CHANGE: + default: + return 0; + } +} + +static int imsm_level_to_layout(int level) +{ + switch (level) { + case 0: + case 1: + return 0; + case 5: + case 6: + return ALGORITHM_LEFT_ASYMMETRIC; + case 10: + return 0x102; + } + return UnSet; +} + +/******************************************************************************* + * Function: read_imsm_migr_rec + * Description: Function reads imsm migration record from last sector of disk + * Parameters: + * fd : disk descriptor + * super : metadata info + * Returns: + * 0 : success, + * -1 : fail + ******************************************************************************/ +static int read_imsm_migr_rec(int fd, struct intel_super *super) +{ + int ret_val = -1; + unsigned int sector_size = super->sector_size; + unsigned long long dsize; + + get_dev_size(fd, NULL, &dsize); + if (lseek64(fd, dsize - (sector_size*MIGR_REC_SECTOR_POSITION), + SEEK_SET) < 0) { + pr_err("Cannot seek to anchor block: %s\n", + strerror(errno)); + goto out; + } + if ((unsigned int)read(fd, super->migr_rec_buf, + MIGR_REC_BUF_SECTORS*sector_size) != + MIGR_REC_BUF_SECTORS*sector_size) { + pr_err("Cannot read migr record 
block: %s\n", + strerror(errno)); + goto out; + } + ret_val = 0; + if (sector_size == 4096) + convert_from_4k_imsm_migr_rec(super); + +out: + return ret_val; +} + +static struct imsm_dev *imsm_get_device_during_migration( + struct intel_super *super) +{ + + struct intel_dev *dv; + + for (dv = super->devlist; dv; dv = dv->next) { + if (is_gen_migration(dv->dev)) + return dv->dev; + } + return NULL; +} + +/******************************************************************************* + * Function: load_imsm_migr_rec + * Description: Function reads imsm migration record (it is stored at the last + * sector of disk) + * Parameters: + * super : imsm internal array info + * Returns: + * 0 : success + * -1 : fail + * -2 : no migration in progress + ******************************************************************************/ +static int load_imsm_migr_rec(struct intel_super *super) +{ + struct dl *dl; + char nm[30]; + int retval = -1; + int fd = -1; + struct imsm_dev *dev; + struct imsm_map *map; + int slot = -1; + int keep_fd = 1; + + /* find map under migration */ + dev = imsm_get_device_during_migration(super); + /* nothing to load,no migration in progress? 
+ */ + if (dev == NULL) + return -2; + + map = get_imsm_map(dev, MAP_0); + if (!map) + return -1; + + for (dl = super->disks; dl; dl = dl->next) { + /* skip spare and failed disks + */ + if (dl->index < 0) + continue; + /* read only from one of the first two slots + */ + slot = get_imsm_disk_slot(map, dl->index); + if (slot > 1 || slot < 0) + continue; + + if (!is_fd_valid(dl->fd)) { + sprintf(nm, "%d:%d", dl->major, dl->minor); + fd = dev_open(nm, O_RDONLY); + + if (is_fd_valid(fd)) { + keep_fd = 0; + break; + } + } else { + fd = dl->fd; + break; + } + } + + if (!is_fd_valid(fd)) + return retval; + retval = read_imsm_migr_rec(fd, super); + if (!keep_fd) + close(fd); + + return retval; +} + +/******************************************************************************* + * function: imsm_create_metadata_checkpoint_update + * Description: It creates update for checkpoint change. + * Parameters: + * super : imsm internal array info + * u : pointer to prepared update + * Returns: + * Uptate length. 
+ * If length is equal to 0, input pointer u contains no update + ******************************************************************************/ +static int imsm_create_metadata_checkpoint_update( + struct intel_super *super, + struct imsm_update_general_migration_checkpoint **u) +{ + + int update_memory_size = 0; + + dprintf("(enter)\n"); + + if (u == NULL) + return 0; + *u = NULL; + + /* size of all update data without anchor */ + update_memory_size = + sizeof(struct imsm_update_general_migration_checkpoint); + + *u = xcalloc(1, update_memory_size); + if (*u == NULL) { + dprintf("error: cannot get memory\n"); + return 0; + } + (*u)->type = update_general_migration_checkpoint; + (*u)->curr_migr_unit = current_migr_unit(super->migr_rec); + dprintf("prepared for %llu\n", (unsigned long long)(*u)->curr_migr_unit); + + return update_memory_size; +} + +static void imsm_update_metadata_locally(struct supertype *st, + void *buf, int len); + +/******************************************************************************* + * Function: write_imsm_migr_rec + * Description: Function writes imsm migration record + * (at the last sector of disk) + * Parameters: + * super : imsm internal array info + * Returns: + * 0 : success + * -1 : if fail + ******************************************************************************/ +static int write_imsm_migr_rec(struct supertype *st) +{ + struct intel_super *super = st->sb; + unsigned int sector_size = super->sector_size; + unsigned long long dsize; + int retval = -1; + struct dl *sd; + int len; + struct imsm_update_general_migration_checkpoint *u; + struct imsm_dev *dev; + struct imsm_map *map; + + /* find map under migration */ + dev = imsm_get_device_during_migration(super); + /* if no migration, write buffer anyway to clear migr_record + * on disk based on first available device + */ + if (dev == NULL) + dev = get_imsm_dev(super, super->current_vol < 0 ? 
0 : + super->current_vol); + + map = get_imsm_map(dev, MAP_0); + + if (sector_size == 4096) + convert_to_4k_imsm_migr_rec(super); + for (sd = super->disks ; sd ; sd = sd->next) { + int slot = -1; + + /* skip failed and spare devices */ + if (sd->index < 0) + continue; + /* write to 2 first slots only */ + if (map) + slot = get_imsm_disk_slot(map, sd->index); + if (map == NULL || slot > 1 || slot < 0) + continue; + + get_dev_size(sd->fd, NULL, &dsize); + if (lseek64(sd->fd, dsize - (MIGR_REC_SECTOR_POSITION * + sector_size), + SEEK_SET) < 0) { + pr_err("Cannot seek to anchor block: %s\n", + strerror(errno)); + goto out; + } + if ((unsigned int)write(sd->fd, super->migr_rec_buf, + MIGR_REC_BUF_SECTORS*sector_size) != + MIGR_REC_BUF_SECTORS*sector_size) { + pr_err("Cannot write migr record block: %s\n", + strerror(errno)); + goto out; + } + } + if (sector_size == 4096) + convert_from_4k_imsm_migr_rec(super); + /* update checkpoint information in metadata */ + len = imsm_create_metadata_checkpoint_update(super, &u); + if (len <= 0) { + dprintf("imsm: Cannot prepare update\n"); + goto out; + } + /* update metadata locally */ + imsm_update_metadata_locally(st, u, len); + /* and possibly remotely */ + if (st->update_tail) { + append_metadata_update(st, u, len); + /* during reshape we do all work inside metadata handler + * manage_reshape(), so metadata update has to be triggered + * insida it + */ + flush_metadata_updates(st); + st->update_tail = &st->updates; + } else + free(u); + + retval = 0; + out: + return retval; +} + +/* spare/missing disks activations are not allowe when + * array/container performs reshape operation, because + * all arrays in container works on the same disks set + */ +int imsm_reshape_blocks_arrays_changes(struct intel_super *super) +{ + int rv = 0; + struct intel_dev *i_dev; + struct imsm_dev *dev; + + /* check whole container + */ + for (i_dev = super->devlist; i_dev; i_dev = i_dev->next) { + dev = i_dev->dev; + if (is_gen_migration(dev)) { + 
/* No repair during any migration in container + */ + rv = 1; + break; + } + } + return rv; +} +static unsigned long long imsm_component_size_alignment_check(int level, + int chunk_size, + unsigned int sector_size, + unsigned long long component_size) +{ + unsigned int component_size_alignment; + + /* check component size alignment + */ + component_size_alignment = component_size % (chunk_size/sector_size); + + dprintf("(Level: %i, chunk_size = %i, component_size = %llu), component_size_alignment = %u\n", + level, chunk_size, component_size, + component_size_alignment); + + if (component_size_alignment && (level != 1) && (level != UnSet)) { + dprintf("imsm: reported component size aligned from %llu ", + component_size); + component_size -= component_size_alignment; + dprintf_cont("to %llu (%i).\n", + component_size, component_size_alignment); + } + + return component_size; +} + +/******************************************************************************* + * Function: get_bitmap_header_sector + * Description: Returns the sector where the bitmap header is placed. + * Parameters: + * st : supertype information + * dev_idx : index of the device with bitmap + * + * Returns: + * The sector where the bitmap header is placed + ******************************************************************************/ +static unsigned long long get_bitmap_header_sector(struct intel_super *super, + int dev_idx) +{ + struct imsm_dev *dev = get_imsm_dev(super, dev_idx); + struct imsm_map *map = get_imsm_map(dev, MAP_0); + + if (!super->sector_size) { + dprintf("sector size is not set\n"); + return 0; + } + + return pba_of_lba0(map) + calc_component_size(map, dev) + + (IMSM_BITMAP_HEADER_OFFSET / super->sector_size); +} + +/******************************************************************************* + * Function: get_bitmap_sector + * Description: Returns the sector where the bitmap is placed. 
+ * Parameters: + * st : supertype information + * dev_idx : index of the device with bitmap + * + * Returns: + * The sector where the bitmap is placed + ******************************************************************************/ +static unsigned long long get_bitmap_sector(struct intel_super *super, + int dev_idx) +{ + if (!super->sector_size) { + dprintf("sector size is not set\n"); + return 0; + } + + return get_bitmap_header_sector(super, dev_idx) + + (IMSM_BITMAP_HEADER_SIZE / super->sector_size); +} + +static unsigned long long get_ppl_sector(struct intel_super *super, int dev_idx) +{ + struct imsm_dev *dev = get_imsm_dev(super, dev_idx); + struct imsm_map *map = get_imsm_map(dev, MAP_0); + + return pba_of_lba0(map) + + (num_data_stripes(map) * map->blocks_per_strip); +} + +static void getinfo_super_imsm_volume(struct supertype *st, struct mdinfo *info, char *dmap) +{ + struct intel_super *super = st->sb; + struct migr_record *migr_rec = super->migr_rec; + struct imsm_dev *dev = get_imsm_dev(super, super->current_vol); + struct imsm_map *map = get_imsm_map(dev, MAP_0); + struct imsm_map *prev_map = get_imsm_map(dev, MAP_1); + struct imsm_map *map_to_analyse = map; + struct dl *dl; + int map_disks = info->array.raid_disks; + + memset(info, 0, sizeof(*info)); + if (prev_map) + map_to_analyse = prev_map; + + dl = super->current_disk; + + info->container_member = super->current_vol; + info->array.raid_disks = map->num_members; + info->array.level = get_imsm_raid_level(map_to_analyse); + info->array.layout = imsm_level_to_layout(info->array.level); + info->array.md_minor = -1; + info->array.ctime = 0; + info->array.utime = 0; + info->array.chunk_size = + __le16_to_cpu(map_to_analyse->blocks_per_strip) << 9; + info->array.state = !(dev->vol.dirty & RAIDVOL_DIRTY); + info->custom_array_size = imsm_dev_size(dev); + info->recovery_blocked = imsm_reshape_blocks_arrays_changes(st->sb); + + if (is_gen_migration(dev)) { + /* + * device prev_map should be added if it 
is in the middle + * of migration + */ + assert(prev_map); + + info->reshape_active = 1; + info->new_level = get_imsm_raid_level(map); + info->new_layout = imsm_level_to_layout(info->new_level); + info->new_chunk = __le16_to_cpu(map->blocks_per_strip) << 9; + info->delta_disks = map->num_members - prev_map->num_members; + if (info->delta_disks) { + /* this needs to be applied to every array + * in the container. + */ + info->reshape_active = CONTAINER_RESHAPE; + } + /* We shape information that we give to md might have to be + * modify to cope with md's requirement for reshaping arrays. + * For example, when reshaping a RAID0, md requires it to be + * presented as a degraded RAID4. + * Also if a RAID0 is migrating to a RAID5 we need to specify + * the array as already being RAID5, but the 'before' layout + * is a RAID4-like layout. + */ + switch (info->array.level) { + case 0: + switch(info->new_level) { + case 0: + /* conversion is happening as RAID4 */ + info->array.level = 4; + info->array.raid_disks += 1; + break; + case 5: + /* conversion is happening as RAID5 */ + info->array.level = 5; + info->array.layout = ALGORITHM_PARITY_N; + info->delta_disks -= 1; + break; + default: + /* FIXME error message */ + info->array.level = UnSet; + break; + } + break; + } + } else { + info->new_level = UnSet; + info->new_layout = UnSet; + info->new_chunk = info->array.chunk_size; + info->delta_disks = 0; + } + + if (dl) { + info->disk.major = dl->major; + info->disk.minor = dl->minor; + info->disk.number = dl->index; + info->disk.raid_disk = get_imsm_disk_slot(map_to_analyse, + dl->index); + } + + info->data_offset = pba_of_lba0(map_to_analyse); + info->component_size = calc_component_size(map, dev); + info->component_size = imsm_component_size_alignment_check( + info->array.level, + info->array.chunk_size, + super->sector_size, + info->component_size); + info->bb.supported = 1; + + memset(info->uuid, 0, sizeof(info->uuid)); + info->recovery_start = MaxSector; + + if 
(info->array.level == 5 && + (dev->rwh_policy == RWH_DISTRIBUTED || + dev->rwh_policy == RWH_MULTIPLE_DISTRIBUTED)) { + info->consistency_policy = CONSISTENCY_POLICY_PPL; + info->ppl_sector = get_ppl_sector(super, super->current_vol); + if (dev->rwh_policy == RWH_MULTIPLE_DISTRIBUTED) + info->ppl_size = MULTIPLE_PPL_AREA_SIZE_IMSM >> 9; + else + info->ppl_size = (PPL_HEADER_SIZE + PPL_ENTRY_SPACE) + >> 9; + } else if (info->array.level <= 0) { + info->consistency_policy = CONSISTENCY_POLICY_NONE; + } else { + if (dev->rwh_policy == RWH_BITMAP) { + info->bitmap_offset = get_bitmap_sector(super, super->current_vol); + info->consistency_policy = CONSISTENCY_POLICY_BITMAP; + } else { + info->consistency_policy = CONSISTENCY_POLICY_RESYNC; + } + } + + info->reshape_progress = 0; + info->resync_start = MaxSector; + if ((map_to_analyse->map_state == IMSM_T_STATE_UNINITIALIZED || + !(info->array.state & 1)) && + imsm_reshape_blocks_arrays_changes(super) == 0) { + info->resync_start = 0; + } + if (dev->vol.migr_state) { + switch (migr_type(dev)) { + case MIGR_REPAIR: + case MIGR_INIT: { + __u64 blocks_per_unit = blocks_per_migr_unit(super, + dev); + __u64 units = vol_curr_migr_unit(dev); + + info->resync_start = blocks_per_unit * units; + break; + } + case MIGR_GEN_MIGR: { + __u64 blocks_per_unit = blocks_per_migr_unit(super, + dev); + __u64 units = current_migr_unit(migr_rec); + int used_disks; + + if (__le32_to_cpu(migr_rec->ascending_migr) && + (units < + (get_num_migr_units(migr_rec)-1)) && + (super->migr_rec->rec_status == + __cpu_to_le32(UNIT_SRC_IN_CP_AREA))) + units++; + + info->reshape_progress = blocks_per_unit * units; + + dprintf("IMSM: General Migration checkpoint : %llu (%llu) -> read reshape progress : %llu\n", + (unsigned long long)units, + (unsigned long long)blocks_per_unit, + info->reshape_progress); + + used_disks = imsm_num_data_members(prev_map); + if (used_disks > 0) { + info->custom_array_size = per_dev_array_size(map) * + used_disks; + } + } + case 
MIGR_VERIFY: + /* we could emulate the checkpointing of + * 'sync_action=check' migrations, but for now + * we just immediately complete them + */ + case MIGR_REBUILD: + /* this is handled by container_content_imsm() */ + case MIGR_STATE_CHANGE: + /* FIXME handle other migrations */ + default: + /* we are not dirty, so... */ + info->resync_start = MaxSector; + } + } + + strncpy(info->name, (char *) dev->volume, MAX_RAID_SERIAL_LEN); + info->name[MAX_RAID_SERIAL_LEN] = 0; + + info->array.major_version = -1; + info->array.minor_version = -2; + sprintf(info->text_version, "/%s/%d", st->container_devnm, info->container_member); + info->safe_mode_delay = 4000; /* 4 secs like the Matrix driver */ + uuid_from_super_imsm(st, info->uuid); + + if (dmap) { + int i, j; + for (i=0; i<map_disks; i++) { + dmap[i] = 0; + if (i < info->array.raid_disks) { + struct imsm_disk *dsk; + j = get_imsm_disk_idx(dev, i, MAP_X); + dsk = get_imsm_disk(super, j); + if (dsk && (dsk->status & CONFIGURED_DISK)) + dmap[i] = 1; + } + } + } +} + +static __u8 imsm_check_degraded(struct intel_super *super, struct imsm_dev *dev, + int failed, int look_in_map); + +static int imsm_count_failed(struct intel_super *super, struct imsm_dev *dev, + int look_in_map); + +static void manage_second_map(struct intel_super *super, struct imsm_dev *dev) +{ + if (is_gen_migration(dev)) { + int failed; + __u8 map_state; + struct imsm_map *map2 = get_imsm_map(dev, MAP_1); + + failed = imsm_count_failed(super, dev, MAP_1); + map_state = imsm_check_degraded(super, dev, failed, MAP_1); + if (map2->map_state != map_state) { + map2->map_state = map_state; + super->updates_pending++; + } + } +} + +static struct imsm_disk *get_imsm_missing(struct intel_super *super, __u8 index) +{ + struct dl *d; + + for (d = super->missing; d; d = d->next) + if (d->index == index) + return &d->disk; + return NULL; +} + +static void getinfo_super_imsm(struct supertype *st, struct mdinfo *info, char *map) +{ + struct intel_super *super = 
st->sb; + struct imsm_disk *disk; + int map_disks = info->array.raid_disks; + int max_enough = -1; + int i; + struct imsm_super *mpb; + + if (super->current_vol >= 0) { + getinfo_super_imsm_volume(st, info, map); + return; + } + memset(info, 0, sizeof(*info)); + + /* Set raid_disks to zero so that Assemble will always pull in valid + * spares + */ + info->array.raid_disks = 0; + info->array.level = LEVEL_CONTAINER; + info->array.layout = 0; + info->array.md_minor = -1; + info->array.ctime = 0; /* N/A for imsm */ + info->array.utime = 0; + info->array.chunk_size = 0; + + info->disk.major = 0; + info->disk.minor = 0; + info->disk.raid_disk = -1; + info->reshape_active = 0; + info->array.major_version = -1; + info->array.minor_version = -2; + strcpy(info->text_version, "imsm"); + info->safe_mode_delay = 0; + info->disk.number = -1; + info->disk.state = 0; + info->name[0] = 0; + info->recovery_start = MaxSector; + info->recovery_blocked = imsm_reshape_blocks_arrays_changes(st->sb); + info->bb.supported = 1; + + /* do we have the all the insync disks that we expect? */ + mpb = super->anchor; + info->events = __le32_to_cpu(mpb->generation_num); + + for (i = 0; i < mpb->num_raid_devs; i++) { + struct imsm_dev *dev = get_imsm_dev(super, i); + int failed, enough, j, missing = 0; + struct imsm_map *map; + __u8 state; + + failed = imsm_count_failed(super, dev, MAP_0); + state = imsm_check_degraded(super, dev, failed, MAP_0); + map = get_imsm_map(dev, MAP_0); + + /* any newly missing disks? 
+ * (catches single-degraded vs double-degraded) + */ + for (j = 0; j < map->num_members; j++) { + __u32 ord = get_imsm_ord_tbl_ent(dev, j, MAP_0); + __u32 idx = ord_to_idx(ord); + + if (super->disks && super->disks->index == (int)idx) + info->disk.raid_disk = j; + + if (!(ord & IMSM_ORD_REBUILD) && + get_imsm_missing(super, idx)) { + missing = 1; + break; + } + } + + if (state == IMSM_T_STATE_FAILED) + enough = -1; + else if (state == IMSM_T_STATE_DEGRADED && + (state != map->map_state || missing)) + enough = 0; + else /* we're normal, or already degraded */ + enough = 1; + if (is_gen_migration(dev) && missing) { + /* during general migration we need all disks + * that process is running on. + * No new missing disk is allowed. + */ + max_enough = -1; + enough = -1; + /* no more checks necessary + */ + break; + } + /* in the missing/failed disk case check to see + * if at least one array is runnable + */ + max_enough = max(max_enough, enough); + } + dprintf("enough: %d\n", max_enough); + info->container_enough = max_enough; + + if (super->disks) { + __u32 reserved = imsm_reserved_sectors(super, super->disks); + + disk = &super->disks->disk; + info->data_offset = total_blocks(&super->disks->disk) - reserved; + info->component_size = reserved; + info->disk.state = is_configured(disk) ? (1 << MD_DISK_ACTIVE) : 0; + /* we don't change info->disk.raid_disk here because + * this state will be finalized in mdmon after we have + * found the 'most fresh' version of the metadata + */ + info->disk.state |= is_failed(disk) ? (1 << MD_DISK_FAULTY) : 0; + info->disk.state |= (is_spare(disk) || is_journal(disk)) ? 
+ 0 : (1 << MD_DISK_SYNC); + } + + /* only call uuid_from_super_imsm when this disk is part of a populated container, + * ->compare_super may have updated the 'num_raid_devs' field for spares + */ + if (info->disk.state & (1 << MD_DISK_SYNC) || super->anchor->num_raid_devs) + uuid_from_super_imsm(st, info->uuid); + else + memcpy(info->uuid, uuid_zero, sizeof(uuid_zero)); + + /* I don't know how to compute 'map' on imsm, so use safe default */ + if (map) { + int i; + for (i = 0; i < map_disks; i++) + map[i] = 1; + } + +} + +/* allocates memory and fills disk in mdinfo structure + * for each disk in array */ +struct mdinfo *getinfo_super_disks_imsm(struct supertype *st) +{ + struct mdinfo *mddev; + struct intel_super *super = st->sb; + struct imsm_disk *disk; + int count = 0; + struct dl *dl; + if (!super || !super->disks) + return NULL; + dl = super->disks; + mddev = xcalloc(1, sizeof(*mddev)); + while (dl) { + struct mdinfo *tmp; + disk = &dl->disk; + tmp = xcalloc(1, sizeof(*tmp)); + if (mddev->devs) + tmp->next = mddev->devs; + mddev->devs = tmp; + tmp->disk.number = count++; + tmp->disk.major = dl->major; + tmp->disk.minor = dl->minor; + tmp->disk.state = is_configured(disk) ? + (1 << MD_DISK_ACTIVE) : 0; + tmp->disk.state |= is_failed(disk) ? (1 << MD_DISK_FAULTY) : 0; + tmp->disk.state |= is_spare(disk) ? 0 : (1 << MD_DISK_SYNC); + tmp->disk.raid_disk = -1; + dl = dl->next; + } + return mddev; +} + +static int update_super_imsm(struct supertype *st, struct mdinfo *info, + char *update, char *devname, int verbose, + int uuid_set, char *homehost) +{ + /* For 'assemble' and 'force' we need to return non-zero if any + * change was made. For others, the return value is ignored. + * Update options are: + * force-one : This device looks a bit old but needs to be included, + * update age info appropriately. + * assemble: clear any 'faulty' flag to allow this device to + * be assembled. 
+ * force-array: Array is degraded but being forced, mark it clean + * if that will be needed to assemble it. + * + * newdev: not used ???? + * grow: Array has gained a new device - this is currently for + * linear only + * resync: mark as dirty so a resync will happen. + * name: update the name - preserving the homehost + * uuid: Change the uuid of the array to match watch is given + * + * Following are not relevant for this imsm: + * sparc2.2 : update from old dodgey metadata + * super-minor: change the preferred_minor number + * summaries: update redundant counters. + * homehost: update the recorded homehost + * _reshape_progress: record new reshape_progress position. + */ + int rv = 1; + struct intel_super *super = st->sb; + struct imsm_super *mpb; + + /* we can only update container info */ + if (!super || super->current_vol >= 0 || !super->anchor) + return 1; + + mpb = super->anchor; + + if (strcmp(update, "uuid") == 0) { + /* We take this to mean that the family_num should be updated. + * However that is much smaller than the uuid so we cannot really + * allow an explicit uuid to be given. And it is hard to reliably + * know if one was. + * So if !uuid_set we know the current uuid is random and just used + * the first 'int' and copy it to the other 3 positions. + * Otherwise we require the 4 'int's to be the same as would be the + * case if we are using a random uuid. So an explicit uuid will be + * accepted as long as all for ints are the same... which shouldn't hurt + */ + if (!uuid_set) { + info->uuid[1] = info->uuid[2] = info->uuid[3] = info->uuid[0]; + rv = 0; + } else { + if (info->uuid[0] != info->uuid[1] || + info->uuid[1] != info->uuid[2] || + info->uuid[2] != info->uuid[3]) + rv = -1; + else + rv = 0; + } + if (rv == 0) + mpb->orig_family_num = info->uuid[0]; + } else if (strcmp(update, "assemble") == 0) + rv = 0; + else + rv = -1; + + /* successful update? 
recompute checksum */ + if (rv == 0) + mpb->check_sum = __le32_to_cpu(__gen_imsm_checksum(mpb)); + + return rv; +} + +static size_t disks_to_mpb_size(int disks) +{ + size_t size; + + size = sizeof(struct imsm_super); + size += (disks - 1) * sizeof(struct imsm_disk); + size += 2 * sizeof(struct imsm_dev); + /* up to 2 maps per raid device (-2 for imsm_maps in imsm_dev */ + size += (4 - 2) * sizeof(struct imsm_map); + /* 4 possible disk_ord_tbl's */ + size += 4 * (disks - 1) * sizeof(__u32); + /* maximum bbm log */ + size += sizeof(struct bbm_log); + + return size; +} + +static __u64 avail_size_imsm(struct supertype *st, __u64 devsize, + unsigned long long data_offset) +{ + if (devsize < (MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS)) + return 0; + + return devsize - (MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS); +} + +static void free_devlist(struct intel_super *super) +{ + struct intel_dev *dv; + + while (super->devlist) { + dv = super->devlist->next; + free(super->devlist->dev); + free(super->devlist); + super->devlist = dv; + } +} + +static void imsm_copy_dev(struct imsm_dev *dest, struct imsm_dev *src) +{ + memcpy(dest, src, sizeof_imsm_dev(src, 0)); +} + +static int compare_super_imsm(struct supertype *st, struct supertype *tst, + int verbose) +{ + /* return: + * 0 same, or first was empty, and second was copied + * 1 sb are different + */ + struct intel_super *first = st->sb; + struct intel_super *sec = tst->sb; + + if (!first) { + st->sb = tst->sb; + tst->sb = NULL; + return 0; + } + + /* in platform dependent environment test if the disks + * use the same Intel hba + * if not on Intel hba at all, allow anything. 
+ * doesn't check HBAs if num_raid_devs is not set, as it means + * it is a free floating spare, and all spares regardless of HBA type + * will fall into separate container during the assembly + */ + if (first->hba && sec->hba && first->anchor->num_raid_devs != 0) { + if (first->hba->type != sec->hba->type) { + if (verbose) + pr_err("HBAs of devices do not match %s != %s\n", + get_sys_dev_type(first->hba->type), + get_sys_dev_type(sec->hba->type)); + return 1; + } + if (first->orom != sec->orom) { + if (verbose) + pr_err("HBAs of devices do not match %s != %s\n", + first->hba->pci_id, sec->hba->pci_id); + return 1; + } + } + + if (first->anchor->num_raid_devs > 0 && + sec->anchor->num_raid_devs > 0) { + /* Determine if these disks might ever have been + * related. Further disambiguation can only take place + * in load_super_imsm_all + */ + __u32 first_family = first->anchor->orig_family_num; + __u32 sec_family = sec->anchor->orig_family_num; + + if (memcmp(first->anchor->sig, sec->anchor->sig, + MAX_SIGNATURE_LENGTH) != 0) + return 1; + + if (first_family == 0) + first_family = first->anchor->family_num; + if (sec_family == 0) + sec_family = sec->anchor->family_num; + + if (first_family != sec_family) + return 1; + + } + + /* if an anchor does not have num_raid_devs set then it is a free + * floating spare. 
don't assosiate spare with any array, as during assembly + * spares shall fall into separate container, from which they can be moved + * when necessary + */ + if (first->anchor->num_raid_devs ^ sec->anchor->num_raid_devs) + return 1; + + return 0; +} + +static void fd2devname(int fd, char *name) +{ + char *nm; + + nm = fd2kname(fd); + if (!nm) + return; + + snprintf(name, MAX_RAID_SERIAL_LEN, "/dev/%s", nm); +} + +static int nvme_get_serial(int fd, void *buf, size_t buf_len) +{ + char path[PATH_MAX]; + char *name = fd2kname(fd); + + if (!name) + return 1; + + if (strncmp(name, "nvme", 4) != 0) + return 1; + + if (!diskfd_to_devpath(fd, 1, path)) + return 1; + + return devpath_to_char(path, "serial", buf, buf_len, 0); +} + +extern int scsi_get_serial(int fd, void *buf, size_t buf_len); + +static int imsm_read_serial(int fd, char *devname, + __u8 *serial, size_t serial_buf_len) +{ + char buf[50]; + int rv; + size_t len; + char *dest; + char *src; + unsigned int i; + + memset(buf, 0, sizeof(buf)); + + rv = nvme_get_serial(fd, buf, sizeof(buf)); + + if (rv) + rv = scsi_get_serial(fd, buf, sizeof(buf)); + + if (rv && check_env("IMSM_DEVNAME_AS_SERIAL")) { + memset(serial, 0, MAX_RAID_SERIAL_LEN); + fd2devname(fd, (char *) serial); + return 0; + } + + if (rv != 0) { + if (devname) + pr_err("Failed to retrieve serial for %s\n", + devname); + return rv; + } + + /* trim all whitespace and non-printable characters and convert + * ':' to ';' + */ + for (i = 0, dest = buf; i < sizeof(buf) && buf[i]; i++) { + src = &buf[i]; + if (*src > 0x20) { + /* ':' is reserved for use in placeholder serial + * numbers for missing disks + */ + if (*src == ':') + *dest++ = ';'; + else + *dest++ = *src; + } + } + len = dest - buf; + dest = buf; + + if (len > serial_buf_len) { + /* truncate leading characters */ + dest += len - serial_buf_len; + len = serial_buf_len; + } + + memset(serial, 0, serial_buf_len); + memcpy(serial, dest, len); + + return 0; +} + +static int serialcmp(__u8 *s1, __u8 
*s2) +{ + return strncmp((char *) s1, (char *) s2, MAX_RAID_SERIAL_LEN); +} + +static void serialcpy(__u8 *dest, __u8 *src) +{ + strncpy((char *) dest, (char *) src, MAX_RAID_SERIAL_LEN); +} + +static struct dl *serial_to_dl(__u8 *serial, struct intel_super *super) +{ + struct dl *dl; + + for (dl = super->disks; dl; dl = dl->next) + if (serialcmp(dl->serial, serial) == 0) + break; + + return dl; +} + +static struct imsm_disk * +__serial_to_disk(__u8 *serial, struct imsm_super *mpb, int *idx) +{ + int i; + + for (i = 0; i < mpb->num_disks; i++) { + struct imsm_disk *disk = __get_imsm_disk(mpb, i); + + if (serialcmp(disk->serial, serial) == 0) { + if (idx) + *idx = i; + return disk; + } + } + + return NULL; +} + +static int +load_imsm_disk(int fd, struct intel_super *super, char *devname, int keep_fd) +{ + struct imsm_disk *disk; + struct dl *dl; + struct stat stb; + int rv; + char name[40]; + __u8 serial[MAX_RAID_SERIAL_LEN]; + + rv = imsm_read_serial(fd, devname, serial, MAX_RAID_SERIAL_LEN); + + if (rv != 0) + return 2; + + dl = xcalloc(1, sizeof(*dl)); + + fstat(fd, &stb); + dl->major = major(stb.st_rdev); + dl->minor = minor(stb.st_rdev); + dl->next = super->disks; + dl->fd = keep_fd ? fd : -1; + assert(super->disks == NULL); + super->disks = dl; + serialcpy(dl->serial, serial); + dl->index = -2; + dl->e = NULL; + fd2devname(fd, name); + if (devname) + dl->devname = xstrdup(devname); + else + dl->devname = xstrdup(name); + + /* look up this disk's index in the current anchor */ + disk = __serial_to_disk(dl->serial, super->anchor, &dl->index); + if (disk) { + dl->disk = *disk; + /* only set index on disks that are a member of a + * populated contianer, i.e. one with raid_devs + */ + if (is_failed(&dl->disk)) + dl->index = -2; + else if (is_spare(&dl->disk) || is_journal(&dl->disk)) + dl->index = -1; + } + + return 0; +} + +/* When migrating map0 contains the 'destination' state while map1 + * contains the current state. 
When not migrating map0 contains the + * current state. This routine assumes that map[0].map_state is set to + * the current array state before being called. + * + * Migration is indicated by one of the following states + * 1/ Idle (migr_state=0 map0state=normal||unitialized||degraded||failed) + * 2/ Initialize (migr_state=1 migr_type=MIGR_INIT map0state=normal + * map1state=unitialized) + * 3/ Repair (Resync) (migr_state=1 migr_type=MIGR_REPAIR map0state=normal + * map1state=normal) + * 4/ Rebuild (migr_state=1 migr_type=MIGR_REBUILD map0state=normal + * map1state=degraded) + * 5/ Migration (mig_state=1 migr_type=MIGR_GEN_MIGR map0state=normal + * map1state=normal) + */ +static void migrate(struct imsm_dev *dev, struct intel_super *super, + __u8 to_state, int migr_type) +{ + struct imsm_map *dest; + struct imsm_map *src = get_imsm_map(dev, MAP_0); + + dev->vol.migr_state = 1; + set_migr_type(dev, migr_type); + set_vol_curr_migr_unit(dev, 0); + dest = get_imsm_map(dev, MAP_1); + + /* duplicate and then set the target end state in map[0] */ + memcpy(dest, src, sizeof_imsm_map(src)); + if (migr_type == MIGR_GEN_MIGR) { + __u32 ord; + int i; + + for (i = 0; i < src->num_members; i++) { + ord = __le32_to_cpu(src->disk_ord_tbl[i]); + set_imsm_ord_tbl_ent(src, i, ord_to_idx(ord)); + } + } + + if (migr_type == MIGR_GEN_MIGR) + /* Clear migration record */ + memset(super->migr_rec, 0, sizeof(struct migr_record)); + + src->map_state = to_state; +} + +static void end_migration(struct imsm_dev *dev, struct intel_super *super, + __u8 map_state) +{ + struct imsm_map *map = get_imsm_map(dev, MAP_0); + struct imsm_map *prev = get_imsm_map(dev, dev->vol.migr_state == 0 ? + MAP_0 : MAP_1); + int i, j; + + /* merge any IMSM_ORD_REBUILD bits that were not successfully + * completed in the last migration. 
+ * + * FIXME add support for raid-level-migration + */ + if (map_state != map->map_state && (is_gen_migration(dev) == false) && + prev->map_state != IMSM_T_STATE_UNINITIALIZED) { + /* when final map state is other than expected + * merge maps (not for migration) + */ + int failed; + + for (i = 0; i < prev->num_members; i++) + for (j = 0; j < map->num_members; j++) + /* during online capacity expansion + * disks position can be changed + * if takeover is used + */ + if (ord_to_idx(map->disk_ord_tbl[j]) == + ord_to_idx(prev->disk_ord_tbl[i])) { + map->disk_ord_tbl[j] |= + prev->disk_ord_tbl[i]; + break; + } + failed = imsm_count_failed(super, dev, MAP_0); + map_state = imsm_check_degraded(super, dev, failed, MAP_0); + } + + dev->vol.migr_state = 0; + set_migr_type(dev, 0); + set_vol_curr_migr_unit(dev, 0); + map->map_state = map_state; +} + +static int parse_raid_devices(struct intel_super *super) +{ + int i; + struct imsm_dev *dev_new; + size_t len, len_migr; + size_t max_len = 0; + size_t space_needed = 0; + struct imsm_super *mpb = super->anchor; + + for (i = 0; i < super->anchor->num_raid_devs; i++) { + struct imsm_dev *dev_iter = __get_imsm_dev(super->anchor, i); + struct intel_dev *dv; + + len = sizeof_imsm_dev(dev_iter, 0); + len_migr = sizeof_imsm_dev(dev_iter, 1); + if (len_migr > len) + space_needed += len_migr - len; + + dv = xmalloc(sizeof(*dv)); + if (max_len < len_migr) + max_len = len_migr; + if (max_len > len_migr) + space_needed += max_len - len_migr; + dev_new = xmalloc(max_len); + imsm_copy_dev(dev_new, dev_iter); + dv->dev = dev_new; + dv->index = i; + dv->next = super->devlist; + super->devlist = dv; + } + + /* ensure that super->buf is large enough when all raid devices + * are migrating + */ + if (__le32_to_cpu(mpb->mpb_size) + space_needed > super->len) { + void *buf; + + len = ROUND_UP(__le32_to_cpu(mpb->mpb_size) + space_needed, + super->sector_size); + if (posix_memalign(&buf, MAX_SECTOR_SIZE, len) != 0) + return 1; + + memcpy(buf, 
super->buf, super->len); + memset(buf + super->len, 0, len - super->len); + free(super->buf); + super->buf = buf; + super->len = len; + } + + super->extra_space += space_needed; + + return 0; +} + +/******************************************************************************* + * Function: check_mpb_migr_compatibility + * Description: Function checks for unsupported migration features: + * - migration optimization area (pba_of_lba0) + * - descending reshape (ascending_migr) + * Parameters: + * super : imsm metadata information + * Returns: + * 0 : migration is compatible + * -1 : migration is not compatible + ******************************************************************************/ +int check_mpb_migr_compatibility(struct intel_super *super) +{ + struct imsm_map *map0, *map1; + struct migr_record *migr_rec = super->migr_rec; + int i; + + for (i = 0; i < super->anchor->num_raid_devs; i++) { + struct imsm_dev *dev_iter = __get_imsm_dev(super->anchor, i); + + if (dev_iter && + dev_iter->vol.migr_state == 1 && + dev_iter->vol.migr_type == MIGR_GEN_MIGR) { + /* This device is migrating */ + map0 = get_imsm_map(dev_iter, MAP_0); + map1 = get_imsm_map(dev_iter, MAP_1); + if (pba_of_lba0(map0) != pba_of_lba0(map1)) + /* migration optimization area was used */ + return -1; + if (migr_rec->ascending_migr == 0 && + migr_rec->dest_depth_per_unit > 0) + /* descending reshape not supported yet */ + return -1; + } + } + return 0; +} + +static void __free_imsm(struct intel_super *super, int free_disks); + +/* load_imsm_mpb - read matrix metadata + * allocates super->mpb to be freed by free_imsm + */ +static int load_imsm_mpb(int fd, struct intel_super *super, char *devname) +{ + unsigned long long dsize; + unsigned long long sectors; + unsigned int sector_size = super->sector_size; + struct stat; + struct imsm_super *anchor; + __u32 check_sum; + + get_dev_size(fd, NULL, &dsize); + if (dsize < 2*sector_size) { + if (devname) + pr_err("%s: device to small for imsm\n", + 
devname); + return 1; + } + + if (lseek64(fd, dsize - (sector_size * 2), SEEK_SET) < 0) { + if (devname) + pr_err("Cannot seek to anchor block on %s: %s\n", + devname, strerror(errno)); + return 1; + } + + if (posix_memalign((void **)&anchor, sector_size, sector_size) != 0) { + if (devname) + pr_err("Failed to allocate imsm anchor buffer on %s\n", devname); + return 1; + } + if ((unsigned int)read(fd, anchor, sector_size) != sector_size) { + if (devname) + pr_err("Cannot read anchor block on %s: %s\n", + devname, strerror(errno)); + free(anchor); + return 1; + } + + if (strncmp((char *) anchor->sig, MPB_SIGNATURE, MPB_SIG_LEN) != 0) { + if (devname) + pr_err("no IMSM anchor on %s\n", devname); + free(anchor); + return 2; + } + + __free_imsm(super, 0); + /* reload capability and hba */ + + /* capability and hba must be updated with new super allocation */ + find_intel_hba_capability(fd, super, devname); + super->len = ROUND_UP(anchor->mpb_size, sector_size); + if (posix_memalign(&super->buf, MAX_SECTOR_SIZE, super->len) != 0) { + if (devname) + pr_err("unable to allocate %zu byte mpb buffer\n", + super->len); + free(anchor); + return 2; + } + memcpy(super->buf, anchor, sector_size); + + sectors = mpb_sectors(anchor, sector_size) - 1; + free(anchor); + + if (posix_memalign(&super->migr_rec_buf, MAX_SECTOR_SIZE, + MIGR_REC_BUF_SECTORS*MAX_SECTOR_SIZE) != 0) { + pr_err("could not allocate migr_rec buffer\n"); + free(super->buf); + return 2; + } + super->clean_migration_record_by_mdmon = 0; + + if (!sectors) { + check_sum = __gen_imsm_checksum(super->anchor); + if (check_sum != __le32_to_cpu(super->anchor->check_sum)) { + if (devname) + pr_err("IMSM checksum %x != %x on %s\n", + check_sum, + __le32_to_cpu(super->anchor->check_sum), + devname); + return 2; + } + + return 0; + } + + /* read the extended mpb */ + if (lseek64(fd, dsize - (sector_size * (2 + sectors)), SEEK_SET) < 0) { + if (devname) + pr_err("Cannot seek to extended mpb on %s: %s\n", + devname, 
strerror(errno)); + return 1; + } + + if ((unsigned int)read(fd, super->buf + sector_size, + super->len - sector_size) != super->len - sector_size) { + if (devname) + pr_err("Cannot read extended mpb on %s: %s\n", + devname, strerror(errno)); + return 2; + } + + check_sum = __gen_imsm_checksum(super->anchor); + if (check_sum != __le32_to_cpu(super->anchor->check_sum)) { + if (devname) + pr_err("IMSM checksum %x != %x on %s\n", + check_sum, __le32_to_cpu(super->anchor->check_sum), + devname); + return 3; + } + + return 0; +} + +static int read_imsm_migr_rec(int fd, struct intel_super *super); + +/* clears hi bits in metadata if MPB_ATTRIB_2TB_DISK not set */ +static void clear_hi(struct intel_super *super) +{ + struct imsm_super *mpb = super->anchor; + int i, n; + if (mpb->attributes & MPB_ATTRIB_2TB_DISK) + return; + for (i = 0; i < mpb->num_disks; ++i) { + struct imsm_disk *disk = &mpb->disk[i]; + disk->total_blocks_hi = 0; + } + for (i = 0; i < mpb->num_raid_devs; ++i) { + struct imsm_dev *dev = get_imsm_dev(super, i); + if (!dev) + return; + for (n = 0; n < 2; ++n) { + struct imsm_map *map = get_imsm_map(dev, n); + if (!map) + continue; + map->pba_of_lba0_hi = 0; + map->blocks_per_member_hi = 0; + map->num_data_stripes_hi = 0; + } + } +} + +static int +load_and_parse_mpb(int fd, struct intel_super *super, char *devname, int keep_fd) +{ + int err; + + err = load_imsm_mpb(fd, super, devname); + if (err) + return err; + if (super->sector_size == 4096) + convert_from_4k(super); + err = load_imsm_disk(fd, super, devname, keep_fd); + if (err) + return err; + err = parse_raid_devices(super); + if (err) + return err; + err = load_bbm_log(super); + clear_hi(super); + return err; +} + +static void __free_imsm_disk(struct dl *d, int do_close) +{ + if (do_close) + close_fd(&d->fd); + if (d->devname) + free(d->devname); + if (d->e) + free(d->e); + free(d); + +} + +static void free_imsm_disks(struct intel_super *super) +{ + struct dl *d; + + while (super->disks) { + d = 
super->disks; + super->disks = d->next; + __free_imsm_disk(d, 1); + } + while (super->disk_mgmt_list) { + d = super->disk_mgmt_list; + super->disk_mgmt_list = d->next; + __free_imsm_disk(d, 1); + } + while (super->missing) { + d = super->missing; + super->missing = d->next; + __free_imsm_disk(d, 1); + } + +} + +/* free all the pieces hanging off of a super pointer */ +static void __free_imsm(struct intel_super *super, int free_disks) +{ + struct intel_hba *elem, *next; + + if (super->buf) { + free(super->buf); + super->buf = NULL; + } + /* unlink capability description */ + super->orom = NULL; + if (super->migr_rec_buf) { + free(super->migr_rec_buf); + super->migr_rec_buf = NULL; + } + if (free_disks) + free_imsm_disks(super); + free_devlist(super); + elem = super->hba; + while (elem) { + if (elem->path) + free((void *)elem->path); + next = elem->next; + free(elem); + elem = next; + } + if (super->bbm_log) + free(super->bbm_log); + super->hba = NULL; +} + +static void free_imsm(struct intel_super *super) +{ + __free_imsm(super, 1); + free(super->bb.entries); + free(super); +} + +static void free_super_imsm(struct supertype *st) +{ + struct intel_super *super = st->sb; + + if (!super) + return; + + free_imsm(super); + st->sb = NULL; +} + +static struct intel_super *alloc_super(void) +{ + struct intel_super *super = xcalloc(1, sizeof(*super)); + + super->current_vol = -1; + super->create_offset = ~((unsigned long long) 0); + + super->bb.entries = xmalloc(BBM_LOG_MAX_ENTRIES * + sizeof(struct md_bb_entry)); + if (!super->bb.entries) { + free(super); + return NULL; + } + + return super; +} + +/* + * find and allocate hba and OROM/EFI based on valid fd of RAID component device + */ +static int find_intel_hba_capability(int fd, struct intel_super *super, char *devname) +{ + struct sys_dev *hba_name; + int rv = 0; + + if (is_fd_valid(fd) && test_partition(fd)) { + pr_err("imsm: %s is a partition, cannot be used in IMSM\n", + devname); + return 1; + } + if 
(!is_fd_valid(fd) || check_env("IMSM_NO_PLATFORM")) { + super->orom = NULL; + super->hba = NULL; + return 0; + } + hba_name = find_disk_attached_hba(fd, NULL); + if (!hba_name) { + if (devname) + pr_err("%s is not attached to Intel(R) RAID controller.\n", + devname); + return 1; + } + rv = attach_hba_to_super(super, hba_name); + if (rv == 2) { + if (devname) { + struct intel_hba *hba = super->hba; + + pr_err("%s is attached to Intel(R) %s %s (%s),\n" + " but the container is assigned to Intel(R) %s %s (", + devname, + get_sys_dev_type(hba_name->type), + hba_name->type == SYS_DEV_VMD ? "domain" : "RAID controller", + hba_name->pci_id ? : "Err!", + get_sys_dev_type(super->hba->type), + hba->type == SYS_DEV_VMD ? "domain" : "RAID controller"); + + while (hba) { + fprintf(stderr, "%s", hba->pci_id ? : "Err!"); + if (hba->next) + fprintf(stderr, ", "); + hba = hba->next; + } + fprintf(stderr, ").\n" + " Mixing devices attached to different controllers is not allowed.\n"); + } + return 2; + } + super->orom = find_imsm_capability(hba_name); + if (!super->orom) + return 3; + + return 0; +} + +/* find_missing - helper routine for load_super_imsm_all that identifies + * disks that have disappeared from the system. This routine relies on + * the mpb being uptodate, which it is at load time. 
+ */ +static int find_missing(struct intel_super *super) +{ + int i; + struct imsm_super *mpb = super->anchor; + struct dl *dl; + struct imsm_disk *disk; + + for (i = 0; i < mpb->num_disks; i++) { + disk = __get_imsm_disk(mpb, i); + dl = serial_to_dl(disk->serial, super); + if (dl) + continue; + + dl = xmalloc(sizeof(*dl)); + dl->major = 0; + dl->minor = 0; + dl->fd = -1; + dl->devname = xstrdup("missing"); + dl->index = i; + serialcpy(dl->serial, disk->serial); + dl->disk = *disk; + dl->e = NULL; + dl->next = super->missing; + super->missing = dl; + } + + return 0; +} + +static struct intel_disk *disk_list_get(__u8 *serial, struct intel_disk *disk_list) +{ + struct intel_disk *idisk = disk_list; + + while (idisk) { + if (serialcmp(idisk->disk.serial, serial) == 0) + break; + idisk = idisk->next; + } + + return idisk; +} + +static int __prep_thunderdome(struct intel_super **table, int tbl_size, + struct intel_super *super, + struct intel_disk **disk_list) +{ + struct imsm_disk *d = &super->disks->disk; + struct imsm_super *mpb = super->anchor; + int i, j; + + for (i = 0; i < tbl_size; i++) { + struct imsm_super *tbl_mpb = table[i]->anchor; + struct imsm_disk *tbl_d = &table[i]->disks->disk; + + if (tbl_mpb->family_num == mpb->family_num) { + if (tbl_mpb->check_sum == mpb->check_sum) { + dprintf("mpb from %d:%d matches %d:%d\n", + super->disks->major, + super->disks->minor, + table[i]->disks->major, + table[i]->disks->minor); + break; + } + + if (((is_configured(d) && !is_configured(tbl_d)) || + is_configured(d) == is_configured(tbl_d)) && + tbl_mpb->generation_num < mpb->generation_num) { + /* current version of the mpb is a + * better candidate than the one in + * super_table, but copy over "cross + * generational" status + */ + struct intel_disk *idisk; + + dprintf("mpb from %d:%d replaces %d:%d\n", + super->disks->major, + super->disks->minor, + table[i]->disks->major, + table[i]->disks->minor); + + idisk = disk_list_get(tbl_d->serial, *disk_list); + if (idisk 
&& is_failed(&idisk->disk)) + tbl_d->status |= FAILED_DISK; + break; + } else { + struct intel_disk *idisk; + struct imsm_disk *disk; + + /* tbl_mpb is more up to date, but copy + * over cross generational status before + * returning + */ + disk = __serial_to_disk(d->serial, mpb, NULL); + if (disk && is_failed(disk)) + d->status |= FAILED_DISK; + + idisk = disk_list_get(d->serial, *disk_list); + if (idisk) { + idisk->owner = i; + if (disk && is_configured(disk)) + idisk->disk.status |= CONFIGURED_DISK; + } + + dprintf("mpb from %d:%d prefer %d:%d\n", + super->disks->major, + super->disks->minor, + table[i]->disks->major, + table[i]->disks->minor); + + return tbl_size; + } + } + } + + if (i >= tbl_size) + table[tbl_size++] = super; + else + table[i] = super; + + /* update/extend the merged list of imsm_disk records */ + for (j = 0; j < mpb->num_disks; j++) { + struct imsm_disk *disk = __get_imsm_disk(mpb, j); + struct intel_disk *idisk; + + idisk = disk_list_get(disk->serial, *disk_list); + if (idisk) { + idisk->disk.status |= disk->status; + if (is_configured(&idisk->disk) || + is_failed(&idisk->disk)) + idisk->disk.status &= ~(SPARE_DISK); + } else { + idisk = xcalloc(1, sizeof(*idisk)); + idisk->owner = IMSM_UNKNOWN_OWNER; + idisk->disk = *disk; + idisk->next = *disk_list; + *disk_list = idisk; + } + + if (serialcmp(idisk->disk.serial, d->serial) == 0) + idisk->owner = i; + } + + return tbl_size; +} + +static struct intel_super * +validate_members(struct intel_super *super, struct intel_disk *disk_list, + const int owner) +{ + struct imsm_super *mpb = super->anchor; + int ok_count = 0; + int i; + + for (i = 0; i < mpb->num_disks; i++) { + struct imsm_disk *disk = __get_imsm_disk(mpb, i); + struct intel_disk *idisk; + + idisk = disk_list_get(disk->serial, disk_list); + if (idisk) { + if (idisk->owner == owner || + idisk->owner == IMSM_UNKNOWN_OWNER) + ok_count++; + else + dprintf("'%.16s' owner %d != %d\n", + disk->serial, idisk->owner, + owner); + } else { + 
dprintf("unknown disk %x [%d]: %.16s\n", + __le32_to_cpu(mpb->family_num), i, + disk->serial); + break; + } + } + + if (ok_count == mpb->num_disks) + return super; + return NULL; +} + +static void show_conflicts(__u32 family_num, struct intel_super *super_list) +{ + struct intel_super *s; + + for (s = super_list; s; s = s->next) { + if (family_num != s->anchor->family_num) + continue; + pr_err("Conflict, offlining family %#x on '%s'\n", + __le32_to_cpu(family_num), s->disks->devname); + } +} + +static struct intel_super * +imsm_thunderdome(struct intel_super **super_list, int len) +{ + struct intel_super *super_table[len]; + struct intel_disk *disk_list = NULL; + struct intel_super *champion, *spare; + struct intel_super *s, **del; + int tbl_size = 0; + int conflict; + int i; + + memset(super_table, 0, sizeof(super_table)); + for (s = *super_list; s; s = s->next) + tbl_size = __prep_thunderdome(super_table, tbl_size, s, &disk_list); + + for (i = 0; i < tbl_size; i++) { + struct imsm_disk *d; + struct intel_disk *idisk; + struct imsm_super *mpb = super_table[i]->anchor; + + s = super_table[i]; + d = &s->disks->disk; + + /* 'd' must appear in merged disk list for its + * configuration to be valid + */ + idisk = disk_list_get(d->serial, disk_list); + if (idisk && idisk->owner == i) + s = validate_members(s, disk_list, i); + else + s = NULL; + + if (!s) + dprintf("marking family: %#x from %d:%d offline\n", + mpb->family_num, + super_table[i]->disks->major, + super_table[i]->disks->minor); + super_table[i] = s; + } + + /* This is where the mdadm implementation differs from the Windows + * driver which has no strict concept of a container. We can only + * assemble one family from a container, so when returning a prodigal + * array member to this system the code will not be able to disambiguate + * the container contents that should be assembled ("foreign" versus + * "local"). 
It requires user intervention to set the orig_family_num + * to a new value to establish a new container. The Windows driver in + * this situation fixes up the volume name in place and manages the + * foreign array as an independent entity. + */ + s = NULL; + spare = NULL; + conflict = 0; + for (i = 0; i < tbl_size; i++) { + struct intel_super *tbl_ent = super_table[i]; + int is_spare = 0; + + if (!tbl_ent) + continue; + + if (tbl_ent->anchor->num_raid_devs == 0) { + spare = tbl_ent; + is_spare = 1; + } + + if (s && !is_spare) { + show_conflicts(tbl_ent->anchor->family_num, *super_list); + conflict++; + } else if (!s && !is_spare) + s = tbl_ent; + } + + if (!s) + s = spare; + if (!s) { + champion = NULL; + goto out; + } + champion = s; + + if (conflict) + pr_err("Chose family %#x on '%s', assemble conflicts to new container with '--update=uuid'\n", + __le32_to_cpu(s->anchor->family_num), s->disks->devname); + + /* collect all dl's onto 'champion', and update them to + * champion's version of the status + */ + for (s = *super_list; s; s = s->next) { + struct imsm_super *mpb = champion->anchor; + struct dl *dl = s->disks; + + if (s == champion) + continue; + + mpb->attributes |= s->anchor->attributes & MPB_ATTRIB_2TB_DISK; + + for (i = 0; i < mpb->num_disks; i++) { + struct imsm_disk *disk; + + disk = __serial_to_disk(dl->serial, mpb, &dl->index); + if (disk) { + dl->disk = *disk; + /* only set index on disks that are a member of + * a populated contianer, i.e. 
one with + * raid_devs + */ + if (is_failed(&dl->disk)) + dl->index = -2; + else if (is_spare(&dl->disk)) + dl->index = -1; + break; + } + } + + if (i >= mpb->num_disks) { + struct intel_disk *idisk; + + idisk = disk_list_get(dl->serial, disk_list); + if (idisk && is_spare(&idisk->disk) && + !is_failed(&idisk->disk) && !is_configured(&idisk->disk)) + dl->index = -1; + else { + dl->index = -2; + continue; + } + } + + dl->next = champion->disks; + champion->disks = dl; + s->disks = NULL; + } + + /* delete 'champion' from super_list */ + for (del = super_list; *del; ) { + if (*del == champion) { + *del = (*del)->next; + break; + } else + del = &(*del)->next; + } + champion->next = NULL; + + out: + while (disk_list) { + struct intel_disk *idisk = disk_list; + + disk_list = disk_list->next; + free(idisk); + } + + return champion; +} + +static int +get_sra_super_block(int fd, struct intel_super **super_list, char *devname, int *max, int keep_fd); +static int get_super_block(struct intel_super **super_list, char *devnm, char *devname, + int major, int minor, int keep_fd); +static int +get_devlist_super_block(struct md_list *devlist, struct intel_super **super_list, + int *max, int keep_fd); + +static int load_super_imsm_all(struct supertype *st, int fd, void **sbp, + char *devname, struct md_list *devlist, + int keep_fd) +{ + struct intel_super *super_list = NULL; + struct intel_super *super = NULL; + int err = 0; + int i = 0; + + if (is_fd_valid(fd)) + /* 'fd' is an opened container */ + err = get_sra_super_block(fd, &super_list, devname, &i, keep_fd); + else + /* get super block from devlist devices */ + err = get_devlist_super_block(devlist, &super_list, &i, keep_fd); + if (err) + goto error; + /* all mpbs enter, maybe one leaves */ + super = imsm_thunderdome(&super_list, i); + if (!super) { + err = 1; + goto error; + } + + if (find_missing(super) != 0) { + free_imsm(super); + err = 2; + goto error; + } + + /* load migration record */ + err = load_imsm_migr_rec(super); 
+ if (err == -1) { + /* migration is in progress, + * but migr_rec cannot be loaded, + */ + err = 4; + goto error; + } + + /* Check migration compatibility */ + if (err == 0 && check_mpb_migr_compatibility(super) != 0) { + pr_err("Unsupported migration detected"); + if (devname) + fprintf(stderr, " on %s\n", devname); + else + fprintf(stderr, " (IMSM).\n"); + + err = 5; + goto error; + } + + err = 0; + + error: + while (super_list) { + struct intel_super *s = super_list; + + super_list = super_list->next; + free_imsm(s); + } + + if (err) + return err; + + *sbp = super; + if (is_fd_valid(fd)) + strcpy(st->container_devnm, fd2devnm(fd)); + else + st->container_devnm[0] = 0; + if (err == 0 && st->ss == NULL) { + st->ss = &super_imsm; + st->minor_version = 0; + st->max_devs = IMSM_MAX_DEVICES; + } + return 0; +} + +static int +get_devlist_super_block(struct md_list *devlist, struct intel_super **super_list, + int *max, int keep_fd) +{ + struct md_list *tmpdev; + int err = 0; + int i = 0; + + for (i = 0, tmpdev = devlist; tmpdev; tmpdev = tmpdev->next) { + if (tmpdev->used != 1) + continue; + if (tmpdev->container == 1) { + int lmax = 0; + int fd = dev_open(tmpdev->devname, O_RDONLY|O_EXCL); + if (!is_fd_valid(fd)) { + pr_err("cannot open device %s: %s\n", + tmpdev->devname, strerror(errno)); + err = 8; + goto error; + } + err = get_sra_super_block(fd, super_list, + tmpdev->devname, &lmax, + keep_fd); + i += lmax; + close(fd); + if (err) { + err = 7; + goto error; + } + } else { + int major = major(tmpdev->st_rdev); + int minor = minor(tmpdev->st_rdev); + err = get_super_block(super_list, + NULL, + tmpdev->devname, + major, minor, + keep_fd); + i++; + if (err) { + err = 6; + goto error; + } + } + } + error: + *max = i; + return err; +} + +static int get_super_block(struct intel_super **super_list, char *devnm, char *devname, + int major, int minor, int keep_fd) +{ + struct intel_super *s; + char nm[32]; + int dfd = -1; + int err = 0; + int retry; + + s = alloc_super(); 
+ if (!s) { + err = 1; + goto error; + } + + sprintf(nm, "%d:%d", major, minor); + dfd = dev_open(nm, O_RDWR); + if (!is_fd_valid(dfd)) { + err = 2; + goto error; + } + + if (!get_dev_sector_size(dfd, NULL, &s->sector_size)) { + err = 2; + goto error; + } + find_intel_hba_capability(dfd, s, devname); + err = load_and_parse_mpb(dfd, s, NULL, keep_fd); + + /* retry the load if we might have raced against mdmon */ + if (err == 3 && devnm && mdmon_running(devnm)) + for (retry = 0; retry < 3; retry++) { + usleep(3000); + err = load_and_parse_mpb(dfd, s, NULL, keep_fd); + if (err != 3) + break; + } + error: + if (!err) { + s->next = *super_list; + *super_list = s; + } else { + if (s) + free_imsm(s); + close_fd(&dfd); + } + if (!keep_fd) + close_fd(&dfd); + return err; + +} + +static int +get_sra_super_block(int fd, struct intel_super **super_list, char *devname, int *max, int keep_fd) +{ + struct mdinfo *sra; + char *devnm; + struct mdinfo *sd; + int err = 0; + int i = 0; + sra = sysfs_read(fd, NULL, GET_LEVEL|GET_VERSION|GET_DEVS|GET_STATE); + if (!sra) + return 1; + + if (sra->array.major_version != -1 || + sra->array.minor_version != -2 || + strcmp(sra->text_version, "imsm") != 0) { + err = 1; + goto error; + } + /* load all mpbs */ + devnm = fd2devnm(fd); + for (sd = sra->devs, i = 0; sd; sd = sd->next, i++) { + if (get_super_block(super_list, devnm, devname, + sd->disk.major, sd->disk.minor, keep_fd) != 0) { + err = 7; + goto error; + } + } + error: + sysfs_free(sra); + *max = i; + return err; +} + +static int load_container_imsm(struct supertype *st, int fd, char *devname) +{ + return load_super_imsm_all(st, fd, &st->sb, devname, NULL, 1); +} + +static int load_super_imsm(struct supertype *st, int fd, char *devname) +{ + struct intel_super *super; + int rv; + int retry; + + if (test_partition(fd)) + /* IMSM not allowed on partitions */ + return 1; + + free_super_imsm(st); + + super = alloc_super(); + if (!super) + return 1; + + if (!get_dev_sector_size(fd, NULL, 
&super->sector_size)) { + free_imsm(super); + return 1; + } + /* Load hba and capabilities if they exist. + * But do not preclude loading metadata in case capabilities or hba are + * non-compliant and ignore_hw_compat is set. + */ + rv = find_intel_hba_capability(fd, super, devname); + /* no orom/efi or non-intel hba of the disk */ + if (rv != 0 && st->ignore_hw_compat == 0) { + if (devname) + pr_err("No OROM/EFI properties for %s\n", devname); + free_imsm(super); + return 2; + } + rv = load_and_parse_mpb(fd, super, devname, 0); + + /* retry the load if we might have raced against mdmon */ + if (rv == 3) { + struct mdstat_ent *mdstat = NULL; + char *name = fd2kname(fd); + + if (name) + mdstat = mdstat_by_component(name); + + if (mdstat && mdmon_running(mdstat->devnm) && getpid() != mdmon_pid(mdstat->devnm)) { + for (retry = 0; retry < 3; retry++) { + usleep(3000); + rv = load_and_parse_mpb(fd, super, devname, 0); + if (rv != 3) + break; + } + } + + free_mdstat(mdstat); + } + + if (rv) { + if (devname) + pr_err("Failed to load all information sections on %s\n", devname); + free_imsm(super); + return rv; + } + + st->sb = super; + if (st->ss == NULL) { + st->ss = &super_imsm; + st->minor_version = 0; + st->max_devs = IMSM_MAX_DEVICES; + } + + /* load migration record */ + if (load_imsm_migr_rec(super) == 0) { + /* Check for unsupported migration features */ + if (check_mpb_migr_compatibility(super) != 0) { + pr_err("Unsupported migration detected"); + if (devname) + fprintf(stderr, " on %s\n", devname); + else + fprintf(stderr, " (IMSM).\n"); + return 3; + } + } + + return 0; +} + +static __u16 info_to_blocks_per_strip(mdu_array_info_t *info) +{ + if (info->level == 1) + return 128; + return info->chunk_size >> 9; +} + +static unsigned long long info_to_blocks_per_member(mdu_array_info_t *info, + unsigned long long size) +{ + if (info->level == 1) + return size * 2; + else + return (size * 2) & ~(info_to_blocks_per_strip(info) - 1); +} + +static void 
imsm_update_version_info(struct intel_super *super) +{ + /* update the version and attributes */ + struct imsm_super *mpb = super->anchor; + char *version; + struct imsm_dev *dev; + struct imsm_map *map; + int i; + + for (i = 0; i < mpb->num_raid_devs; i++) { + dev = get_imsm_dev(super, i); + map = get_imsm_map(dev, MAP_0); + if (__le32_to_cpu(dev->size_high) > 0) + mpb->attributes |= MPB_ATTRIB_2TB; + + /* FIXME detect when an array spans a port multiplier */ + #if 0 + mpb->attributes |= MPB_ATTRIB_PM; + #endif + + if (mpb->num_raid_devs > 1 || + mpb->attributes != MPB_ATTRIB_CHECKSUM_VERIFY) { + version = MPB_VERSION_ATTRIBS; + switch (get_imsm_raid_level(map)) { + case 0: mpb->attributes |= MPB_ATTRIB_RAID0; break; + case 1: mpb->attributes |= MPB_ATTRIB_RAID1; break; + case 10: mpb->attributes |= MPB_ATTRIB_RAID10; break; + case 5: mpb->attributes |= MPB_ATTRIB_RAID5; break; + } + } else { + if (map->num_members >= 5) + version = MPB_VERSION_5OR6_DISK_ARRAY; + else if (dev->status == DEV_CLONE_N_GO) + version = MPB_VERSION_CNG; + else if (get_imsm_raid_level(map) == 5) + version = MPB_VERSION_RAID5; + else if (map->num_members >= 3) + version = MPB_VERSION_3OR4_DISK_ARRAY; + else if (get_imsm_raid_level(map) == 1) + version = MPB_VERSION_RAID1; + else + version = MPB_VERSION_RAID0; + } + strcpy(((char *) mpb->sig) + strlen(MPB_SIGNATURE), version); + } +} + +static int check_name(struct intel_super *super, char *name, int quiet) +{ + struct imsm_super *mpb = super->anchor; + char *reason = NULL; + char *start = name; + size_t len = strlen(name); + int i; + + if (len > 0) { + while (isspace(start[len - 1])) + start[--len] = 0; + while (*start && isspace(*start)) + ++start, --len; + memmove(name, start, len + 1); + } + + if (len > MAX_RAID_SERIAL_LEN) + reason = "must be 16 characters or less"; + else if (len == 0) + reason = "must be a non-empty string"; + + for (i = 0; i < mpb->num_raid_devs; i++) { + struct imsm_dev *dev = get_imsm_dev(super, i); + + if 
(strncmp((char *) dev->volume, name, MAX_RAID_SERIAL_LEN) == 0) { + reason = "already exists"; + break; + } + } + + if (reason && !quiet) + pr_err("imsm volume name %s\n", reason); + + return !reason; +} + +static int init_super_imsm_volume(struct supertype *st, mdu_array_info_t *info, + struct shape *s, char *name, + char *homehost, int *uuid, + long long data_offset) +{ + /* We are creating a volume inside a pre-existing container. + * so st->sb is already set. + */ + struct intel_super *super = st->sb; + unsigned int sector_size = super->sector_size; + struct imsm_super *mpb = super->anchor; + struct intel_dev *dv; + struct imsm_dev *dev; + struct imsm_vol *vol; + struct imsm_map *map; + int idx = mpb->num_raid_devs; + int i; + int namelen; + unsigned long long array_blocks; + size_t size_old, size_new; + unsigned int data_disks; + unsigned long long size_per_member; + + if (super->orom && mpb->num_raid_devs >= super->orom->vpa) { + pr_err("This imsm-container already has the maximum of %d volumes\n", super->orom->vpa); + return 0; + } + + /* ensure the mpb is large enough for the new data */ + size_old = __le32_to_cpu(mpb->mpb_size); + size_new = disks_to_mpb_size(info->nr_disks); + if (size_new > size_old) { + void *mpb_new; + size_t size_round = ROUND_UP(size_new, sector_size); + + if (posix_memalign(&mpb_new, sector_size, size_round) != 0) { + pr_err("could not allocate new mpb\n"); + return 0; + } + if (posix_memalign(&super->migr_rec_buf, MAX_SECTOR_SIZE, + MIGR_REC_BUF_SECTORS* + MAX_SECTOR_SIZE) != 0) { + pr_err("could not allocate migr_rec buffer\n"); + free(super->buf); + free(super); + free(mpb_new); + return 0; + } + memcpy(mpb_new, mpb, size_old); + free(mpb); + mpb = mpb_new; + super->anchor = mpb_new; + mpb->mpb_size = __cpu_to_le32(size_new); + memset(mpb_new + size_old, 0, size_round - size_old); + super->len = size_round; + } + super->current_vol = idx; + + /* handle 'failed_disks' by either: + * a) create dummy disk entries in the table if 
this the first + * volume in the array. We add them here as this is the only + * opportunity to add them. add_to_super_imsm_volume() + * handles the non-failed disks and continues incrementing + * mpb->num_disks. + * b) validate that 'failed_disks' matches the current number + * of missing disks if the container is populated + */ + if (super->current_vol == 0) { + mpb->num_disks = 0; + for (i = 0; i < info->failed_disks; i++) { + struct imsm_disk *disk; + + mpb->num_disks++; + disk = __get_imsm_disk(mpb, i); + disk->status = CONFIGURED_DISK | FAILED_DISK; + disk->scsi_id = __cpu_to_le32(~(__u32)0); + snprintf((char *) disk->serial, MAX_RAID_SERIAL_LEN, + "missing:%d", (__u8)i); + } + find_missing(super); + } else { + int missing = 0; + struct dl *d; + + for (d = super->missing; d; d = d->next) + missing++; + if (info->failed_disks > missing) { + pr_err("unable to add 'missing' disk to container\n"); + return 0; + } + } + + if (!check_name(super, name, 0)) + return 0; + dv = xmalloc(sizeof(*dv)); + dev = xcalloc(1, sizeof(*dev) + sizeof(__u32) * (info->raid_disks - 1)); + /* + * Explicitly allow truncating to not confuse gcc's + * -Werror=stringop-truncation + */ + namelen = min((int) strlen(name), MAX_RAID_SERIAL_LEN); + memcpy(dev->volume, name, namelen); + array_blocks = calc_array_size(info->level, info->raid_disks, + info->layout, info->chunk_size, + s->size * BLOCKS_PER_KB); + data_disks = get_data_disks(info->level, info->layout, + info->raid_disks); + array_blocks = round_size_to_mb(array_blocks, data_disks); + size_per_member = array_blocks / data_disks; + + set_imsm_dev_size(dev, array_blocks); + dev->status = (DEV_READ_COALESCING | DEV_WRITE_COALESCING); + vol = &dev->vol; + vol->migr_state = 0; + set_migr_type(dev, MIGR_INIT); + vol->dirty = !info->state; + set_vol_curr_migr_unit(dev, 0); + map = get_imsm_map(dev, MAP_0); + set_pba_of_lba0(map, super->create_offset); + map->blocks_per_strip = __cpu_to_le16(info_to_blocks_per_strip(info)); + 
map->failed_disk_num = ~0; + if (info->level > 0) + map->map_state = (info->state ? IMSM_T_STATE_NORMAL + : IMSM_T_STATE_UNINITIALIZED); + else + map->map_state = info->failed_disks ? IMSM_T_STATE_FAILED : + IMSM_T_STATE_NORMAL; + map->ddf = 1; + + if (info->level == 1 && info->raid_disks > 2) { + free(dev); + free(dv); + pr_err("imsm does not support more than 2 disksin a raid1 volume\n"); + return 0; + } + + map->raid_level = info->level; + if (info->level == 10) + map->raid_level = 1; + set_num_domains(map); + + size_per_member += NUM_BLOCKS_DIRTY_STRIPE_REGION; + set_blocks_per_member(map, info_to_blocks_per_member(info, + size_per_member / + BLOCKS_PER_KB)); + + map->num_members = info->raid_disks; + update_num_data_stripes(map, array_blocks); + for (i = 0; i < map->num_members; i++) { + /* initialized in add_to_super */ + set_imsm_ord_tbl_ent(map, i, IMSM_ORD_REBUILD); + } + mpb->num_raid_devs++; + mpb->num_raid_devs_created++; + dev->my_vol_raid_dev_num = mpb->num_raid_devs_created; + + if (s->consistency_policy <= CONSISTENCY_POLICY_RESYNC) { + dev->rwh_policy = RWH_MULTIPLE_OFF; + } else if (s->consistency_policy == CONSISTENCY_POLICY_PPL) { + dev->rwh_policy = RWH_MULTIPLE_DISTRIBUTED; + } else { + free(dev); + free(dv); + pr_err("imsm does not support consistency policy %s\n", + map_num(consistency_policies, s->consistency_policy)); + return 0; + } + + dv->dev = dev; + dv->index = super->current_vol; + dv->next = super->devlist; + super->devlist = dv; + + imsm_update_version_info(super); + + return 1; +} + +static int init_super_imsm(struct supertype *st, mdu_array_info_t *info, + struct shape *s, char *name, + char *homehost, int *uuid, + unsigned long long data_offset) +{ + /* This is primarily called by Create when creating a new array. + * We will then get add_to_super called for each component, and then + * write_init_super called to write it out to each device. + * For IMSM, Create can create on fresh devices or on a pre-existing + * array. 
+ * To create on a pre-existing array a different method will be called. + * This one is just for fresh drives. + */ + struct intel_super *super; + struct imsm_super *mpb; + size_t mpb_size; + char *version; + + if (data_offset != INVALID_SECTORS) { + pr_err("data-offset not supported by imsm\n"); + return 0; + } + + if (st->sb) + return init_super_imsm_volume(st, info, s, name, homehost, uuid, + data_offset); + + if (info) + mpb_size = disks_to_mpb_size(info->nr_disks); + else + mpb_size = MAX_SECTOR_SIZE; + + super = alloc_super(); + if (super && + posix_memalign(&super->buf, MAX_SECTOR_SIZE, mpb_size) != 0) { + free_imsm(super); + super = NULL; + } + if (!super) { + pr_err("could not allocate superblock\n"); + return 0; + } + if (posix_memalign(&super->migr_rec_buf, MAX_SECTOR_SIZE, + MIGR_REC_BUF_SECTORS*MAX_SECTOR_SIZE) != 0) { + pr_err("could not allocate migr_rec buffer\n"); + free(super->buf); + free_imsm(super); + return 0; + } + memset(super->buf, 0, mpb_size); + mpb = super->buf; + mpb->mpb_size = __cpu_to_le32(mpb_size); + st->sb = super; + + if (info == NULL) { + /* zeroing superblock */ + return 0; + } + + mpb->attributes = MPB_ATTRIB_CHECKSUM_VERIFY; + + version = (char *) mpb->sig; + strcpy(version, MPB_SIGNATURE); + version += strlen(MPB_SIGNATURE); + strcpy(version, MPB_VERSION_RAID0); + + return 1; +} + +static int drive_validate_sector_size(struct intel_super *super, struct dl *dl) +{ + unsigned int member_sector_size; + + if (!is_fd_valid(dl->fd)) { + pr_err("Invalid file descriptor for %s\n", dl->devname); + return 0; + } + + if (!get_dev_sector_size(dl->fd, dl->devname, &member_sector_size)) + return 0; + if (member_sector_size != super->sector_size) + return 0; + return 1; +} + +static int add_to_super_imsm_volume(struct supertype *st, mdu_disk_info_t *dk, + int fd, char *devname) +{ + struct intel_super *super = st->sb; + struct imsm_super *mpb = super->anchor; + struct imsm_disk *_disk; + struct imsm_dev *dev; + struct imsm_map *map; + 
struct dl *dl, *df; + int slot; + + dev = get_imsm_dev(super, super->current_vol); + map = get_imsm_map(dev, MAP_0); + + if (! (dk->state & (1<<MD_DISK_SYNC))) { + pr_err("%s: Cannot add spare devices to IMSM volume\n", + devname); + return 1; + } + + if (!is_fd_valid(fd)) { + /* we're doing autolayout so grab the pre-marked (in + * validate_geometry) raid_disk + */ + for (dl = super->disks; dl; dl = dl->next) + if (dl->raiddisk == dk->raid_disk) + break; + } else { + for (dl = super->disks; dl ; dl = dl->next) + if (dl->major == dk->major && + dl->minor == dk->minor) + break; + } + + if (!dl) { + pr_err("%s is not a member of the same container\n", devname); + return 1; + } + + if (mpb->num_disks == 0) + if (!get_dev_sector_size(dl->fd, dl->devname, + &super->sector_size)) + return 1; + + if (!drive_validate_sector_size(super, dl)) { + pr_err("Combining drives of different sector size in one volume is not allowed\n"); + return 1; + } + + /* add a pristine spare to the metadata */ + if (dl->index < 0) { + dl->index = super->anchor->num_disks; + super->anchor->num_disks++; + } + /* Check the device has not already been added */ + slot = get_imsm_disk_slot(map, dl->index); + if (slot >= 0 && + (get_imsm_ord_tbl_ent(dev, slot, MAP_X) & IMSM_ORD_REBUILD) == 0) { + pr_err("%s has been included in this array twice\n", + devname); + return 1; + } + set_imsm_ord_tbl_ent(map, dk->raid_disk, dl->index); + dl->disk.status = CONFIGURED_DISK; + + /* update size of 'missing' disks to be at least as large as the + * largest acitve member (we only have dummy missing disks when + * creating the first volume) + */ + if (super->current_vol == 0) { + for (df = super->missing; df; df = df->next) { + if (total_blocks(&dl->disk) > total_blocks(&df->disk)) + set_total_blocks(&df->disk, total_blocks(&dl->disk)); + _disk = __get_imsm_disk(mpb, df->index); + *_disk = df->disk; + } + } + + /* refresh unset/failed slots to point to valid 'missing' entries */ + for (df = super->missing; df; df 
= df->next) + for (slot = 0; slot < mpb->num_disks; slot++) { + __u32 ord = get_imsm_ord_tbl_ent(dev, slot, MAP_X); + + if ((ord & IMSM_ORD_REBUILD) == 0) + continue; + set_imsm_ord_tbl_ent(map, slot, df->index | IMSM_ORD_REBUILD); + if (is_gen_migration(dev)) { + struct imsm_map *map2 = get_imsm_map(dev, + MAP_1); + int slot2 = get_imsm_disk_slot(map2, df->index); + if (slot2 < map2->num_members && slot2 >= 0) { + __u32 ord2 = get_imsm_ord_tbl_ent(dev, + slot2, + MAP_1); + if ((unsigned)df->index == + ord_to_idx(ord2)) + set_imsm_ord_tbl_ent(map2, + slot2, + df->index | + IMSM_ORD_REBUILD); + } + } + dprintf("set slot:%d to missing disk:%d\n", slot, df->index); + break; + } + + /* if we are creating the first raid device update the family number */ + if (super->current_vol == 0) { + __u32 sum; + struct imsm_dev *_dev = __get_imsm_dev(mpb, 0); + + _disk = __get_imsm_disk(mpb, dl->index); + if (!_dev || !_disk) { + pr_err("BUG mpb setup error\n"); + return 1; + } + *_dev = *dev; + *_disk = dl->disk; + sum = random32(); + sum += __gen_imsm_checksum(mpb); + mpb->family_num = __cpu_to_le32(sum); + mpb->orig_family_num = mpb->family_num; + mpb->creation_time = __cpu_to_le64((__u64)time(NULL)); + } + super->current_disk = dl; + return 0; +} + +/* mark_spare() + * Function marks disk as spare and restores disk serial + * in case it was previously marked as failed by takeover operation + * reruns: + * -1 : critical error + * 0 : disk is marked as spare but serial is not set + * 1 : success + */ +int mark_spare(struct dl *disk) +{ + __u8 serial[MAX_RAID_SERIAL_LEN]; + int ret_val = -1; + + if (!disk) + return ret_val; + + ret_val = 0; + if (!imsm_read_serial(disk->fd, NULL, serial, MAX_RAID_SERIAL_LEN)) { + /* Restore disk serial number, because takeover marks disk + * as failed and adds to serial ':0' before it becomes + * a spare disk. 
+ */ + serialcpy(disk->serial, serial); + serialcpy(disk->disk.serial, serial); + ret_val = 1; + } + disk->disk.status = SPARE_DISK; + disk->index = -1; + + return ret_val; +} + + +static int write_super_imsm_spare(struct intel_super *super, struct dl *d); + +static int add_to_super_imsm(struct supertype *st, mdu_disk_info_t *dk, + int fd, char *devname, + unsigned long long data_offset) +{ + struct intel_super *super = st->sb; + struct dl *dd; + unsigned long long size; + unsigned int member_sector_size; + __u32 id; + int rv; + struct stat stb; + + /* If we are on an RAID enabled platform check that the disk is + * attached to the raid controller. + * We do not need to test disks attachment for container based additions, + * they shall be already tested when container was created/assembled. + */ + rv = find_intel_hba_capability(fd, super, devname); + /* no orom/efi or non-intel hba of the disk */ + if (rv != 0) { + dprintf("capability: %p fd: %d ret: %d\n", + super->orom, fd, rv); + return 1; + } + + if (super->current_vol >= 0) + return add_to_super_imsm_volume(st, dk, fd, devname); + + fstat(fd, &stb); + dd = xcalloc(sizeof(*dd), 1); + dd->major = major(stb.st_rdev); + dd->minor = minor(stb.st_rdev); + dd->devname = devname ? 
xstrdup(devname) : NULL; + dd->fd = fd; + dd->e = NULL; + dd->action = DISK_ADD; + rv = imsm_read_serial(fd, devname, dd->serial, MAX_RAID_SERIAL_LEN); + if (rv) { + pr_err("failed to retrieve scsi serial, aborting\n"); + __free_imsm_disk(dd, 0); + abort(); + } + + if (super->hba && ((super->hba->type == SYS_DEV_NVME) || + (super->hba->type == SYS_DEV_VMD))) { + int i; + char cntrl_path[PATH_MAX]; + char *cntrl_name; + char pci_dev_path[PATH_MAX]; + + if (!diskfd_to_devpath(fd, 2, pci_dev_path) || + !diskfd_to_devpath(fd, 1, cntrl_path)) { + pr_err("failed to get dev paths, aborting\n"); + __free_imsm_disk(dd, 0); + return 1; + } + + cntrl_name = basename(cntrl_path); + if (is_multipath_nvme(fd)) + pr_err("%s controller supports Multi-Path I/O, Intel (R) VROC does not support multipathing\n", + cntrl_name); + + if (devpath_to_vendor(pci_dev_path) == 0x8086) { + /* + * If Intel's NVMe drive has serial ended with + * "-A","-B","-1" or "-2" it means that this is "x8" + * device (double drive on single PCIe card). + * User should be warned about potential data loss. 
+ */ + for (i = MAX_RAID_SERIAL_LEN-1; i > 0; i--) { + /* Skip empty character at the end */ + if (dd->serial[i] == 0) + continue; + + if (((dd->serial[i] == 'A') || + (dd->serial[i] == 'B') || + (dd->serial[i] == '1') || + (dd->serial[i] == '2')) && + (dd->serial[i-1] == '-')) + pr_err("\tThe action you are about to take may put your data at risk.\n" + "\tPlease note that x8 devices may consist of two separate x4 devices " + "located on a single PCIe port.\n" + "\tRAID 0 is the only supported configuration for this type of x8 device.\n"); + break; + } + } else if (super->hba->type == SYS_DEV_VMD && super->orom && + !imsm_orom_has_tpv_support(super->orom)) { + pr_err("\tPlatform configuration does not support non-Intel NVMe drives.\n" + "\tPlease refer to Intel(R) RSTe/VROC user guide.\n"); + __free_imsm_disk(dd, 0); + return 1; + } + } + + get_dev_size(fd, NULL, &size); + if (!get_dev_sector_size(fd, NULL, &member_sector_size)) { + __free_imsm_disk(dd, 0); + return 1; + } + + if (super->sector_size == 0) { + /* this a first device, so sector_size is not set yet */ + super->sector_size = member_sector_size; + } + + /* clear migr_rec when adding disk to container */ + memset(super->migr_rec_buf, 0, MIGR_REC_BUF_SECTORS*MAX_SECTOR_SIZE); + if (lseek64(fd, size - MIGR_REC_SECTOR_POSITION*member_sector_size, + SEEK_SET) >= 0) { + if ((unsigned int)write(fd, super->migr_rec_buf, + MIGR_REC_BUF_SECTORS*member_sector_size) != + MIGR_REC_BUF_SECTORS*member_sector_size) + perror("Write migr_rec failed"); + } + + size /= 512; + serialcpy(dd->disk.serial, dd->serial); + set_total_blocks(&dd->disk, size); + if (__le32_to_cpu(dd->disk.total_blocks_hi) > 0) { + struct imsm_super *mpb = super->anchor; + mpb->attributes |= MPB_ATTRIB_2TB_DISK; + } + mark_spare(dd); + if (sysfs_disk_to_scsi_id(fd, &id) == 0) + dd->disk.scsi_id = __cpu_to_le32(id); + else + dd->disk.scsi_id = __cpu_to_le32(0); + + if (st->update_tail) { + dd->next = super->disk_mgmt_list; + super->disk_mgmt_list = 
dd; + } else { + /* this is called outside of mdmon + * write initial spare metadata + * mdmon will overwrite it. + */ + dd->next = super->disks; + super->disks = dd; + write_super_imsm_spare(super, dd); + } + + return 0; +} + +static int remove_from_super_imsm(struct supertype *st, mdu_disk_info_t *dk) +{ + struct intel_super *super = st->sb; + struct dl *dd; + + /* remove from super works only in mdmon - for communication + * manager - monitor. Check if communication memory buffer + * is prepared. + */ + if (!st->update_tail) { + pr_err("shall be used in mdmon context only\n"); + return 1; + } + dd = xcalloc(1, sizeof(*dd)); + dd->major = dk->major; + dd->minor = dk->minor; + dd->fd = -1; + mark_spare(dd); + dd->action = DISK_REMOVE; + + dd->next = super->disk_mgmt_list; + super->disk_mgmt_list = dd; + + return 0; +} + +static int store_imsm_mpb(int fd, struct imsm_super *mpb); + +static union { + char buf[MAX_SECTOR_SIZE]; + struct imsm_super anchor; +} spare_record __attribute__ ((aligned(MAX_SECTOR_SIZE))); + + +static int write_super_imsm_spare(struct intel_super *super, struct dl *d) +{ + struct imsm_super *mpb = super->anchor; + struct imsm_super *spare = &spare_record.anchor; + __u32 sum; + + if (d->index != -1) + return 1; + + spare->mpb_size = __cpu_to_le32(sizeof(struct imsm_super)); + spare->generation_num = __cpu_to_le32(1UL); + spare->attributes = MPB_ATTRIB_CHECKSUM_VERIFY; + spare->num_disks = 1; + spare->num_raid_devs = 0; + spare->cache_size = mpb->cache_size; + spare->pwr_cycle_count = __cpu_to_le32(1); + + snprintf((char *) spare->sig, MAX_SIGNATURE_LENGTH, + MPB_SIGNATURE MPB_VERSION_RAID0); + + spare->disk[0] = d->disk; + if (__le32_to_cpu(d->disk.total_blocks_hi) > 0) + spare->attributes |= MPB_ATTRIB_2TB_DISK; + + if (super->sector_size == 4096) + convert_to_4k_imsm_disk(&spare->disk[0]); + + sum = __gen_imsm_checksum(spare); + spare->family_num = __cpu_to_le32(sum); + spare->orig_family_num = 0; + sum = __gen_imsm_checksum(spare); + 
spare->check_sum = __cpu_to_le32(sum); + + if (store_imsm_mpb(d->fd, spare)) { + pr_err("failed for device %d:%d %s\n", + d->major, d->minor, strerror(errno)); + return 1; + } + + return 0; +} +/* spare records have their own family number and do not have any defined raid + * devices + */ +static int write_super_imsm_spares(struct intel_super *super, int doclose) +{ + struct dl *d; + + for (d = super->disks; d; d = d->next) { + if (d->index != -1) + continue; + + if (write_super_imsm_spare(super, d)) + return 1; + + if (doclose) + close_fd(&d->fd); + } + + return 0; +} + +static int write_super_imsm(struct supertype *st, int doclose) +{ + struct intel_super *super = st->sb; + unsigned int sector_size = super->sector_size; + struct imsm_super *mpb = super->anchor; + struct dl *d; + __u32 generation; + __u32 sum; + int spares = 0; + int i; + __u32 mpb_size = sizeof(struct imsm_super) - sizeof(struct imsm_disk); + int num_disks = 0; + int clear_migration_record = 1; + __u32 bbm_log_size; + + /* 'generation' is incremented everytime the metadata is written */ + generation = __le32_to_cpu(mpb->generation_num); + generation++; + mpb->generation_num = __cpu_to_le32(generation); + + /* fix up cases where previous mdadm releases failed to set + * orig_family_num + */ + if (mpb->orig_family_num == 0) + mpb->orig_family_num = mpb->family_num; + + for (d = super->disks; d; d = d->next) { + if (d->index == -1) + spares++; + else { + mpb->disk[d->index] = d->disk; + num_disks++; + } + } + for (d = super->missing; d; d = d->next) { + mpb->disk[d->index] = d->disk; + num_disks++; + } + mpb->num_disks = num_disks; + mpb_size += sizeof(struct imsm_disk) * mpb->num_disks; + + for (i = 0; i < mpb->num_raid_devs; i++) { + struct imsm_dev *dev = __get_imsm_dev(mpb, i); + struct imsm_dev *dev2 = get_imsm_dev(super, i); + if (dev && dev2) { + imsm_copy_dev(dev, dev2); + mpb_size += sizeof_imsm_dev(dev, 0); + } + if (is_gen_migration(dev2)) + clear_migration_record = 0; + } + + 
bbm_log_size = get_imsm_bbm_log_size(super->bbm_log); + + if (bbm_log_size) { + memcpy((void *)mpb + mpb_size, super->bbm_log, bbm_log_size); + mpb->attributes |= MPB_ATTRIB_BBM; + } else + mpb->attributes &= ~MPB_ATTRIB_BBM; + + super->anchor->bbm_log_size = __cpu_to_le32(bbm_log_size); + mpb_size += bbm_log_size; + mpb->mpb_size = __cpu_to_le32(mpb_size); + +#ifdef DEBUG + assert(super->len == 0 || mpb_size <= super->len); +#endif + + /* recalculate checksum */ + sum = __gen_imsm_checksum(mpb); + mpb->check_sum = __cpu_to_le32(sum); + + if (super->clean_migration_record_by_mdmon) { + clear_migration_record = 1; + super->clean_migration_record_by_mdmon = 0; + } + if (clear_migration_record) + memset(super->migr_rec_buf, 0, + MIGR_REC_BUF_SECTORS*MAX_SECTOR_SIZE); + + if (sector_size == 4096) + convert_to_4k(super); + + /* write the mpb for disks that compose raid devices */ + for (d = super->disks; d ; d = d->next) { + if (d->index < 0 || is_failed(&d->disk)) + continue; + + if (clear_migration_record) { + unsigned long long dsize; + + get_dev_size(d->fd, NULL, &dsize); + if (lseek64(d->fd, dsize - sector_size, + SEEK_SET) >= 0) { + if ((unsigned int)write(d->fd, + super->migr_rec_buf, + MIGR_REC_BUF_SECTORS*sector_size) != + MIGR_REC_BUF_SECTORS*sector_size) + perror("Write migr_rec failed"); + } + } + + if (store_imsm_mpb(d->fd, mpb)) + fprintf(stderr, + "failed for device %d:%d (fd: %d)%s\n", + d->major, d->minor, + d->fd, strerror(errno)); + + if (doclose) + close_fd(&d->fd); + } + + if (spares) + return write_super_imsm_spares(super, doclose); + + return 0; +} + +static int create_array(struct supertype *st, int dev_idx) +{ + size_t len; + struct imsm_update_create_array *u; + struct intel_super *super = st->sb; + struct imsm_dev *dev = get_imsm_dev(super, dev_idx); + struct imsm_map *map = get_imsm_map(dev, MAP_0); + struct disk_info *inf; + struct imsm_disk *disk; + int i; + + len = sizeof(*u) - sizeof(*dev) + sizeof_imsm_dev(dev, 0) + + sizeof(*inf) * 
map->num_members; + u = xmalloc(len); + u->type = update_create_array; + u->dev_idx = dev_idx; + imsm_copy_dev(&u->dev, dev); + inf = get_disk_info(u); + for (i = 0; i < map->num_members; i++) { + int idx = get_imsm_disk_idx(dev, i, MAP_X); + + disk = get_imsm_disk(super, idx); + if (!disk) + disk = get_imsm_missing(super, idx); + serialcpy(inf[i].serial, disk->serial); + } + append_metadata_update(st, u, len); + + return 0; +} + +static int mgmt_disk(struct supertype *st) +{ + struct intel_super *super = st->sb; + size_t len; + struct imsm_update_add_remove_disk *u; + + if (!super->disk_mgmt_list) + return 0; + + len = sizeof(*u); + u = xmalloc(len); + u->type = update_add_remove_disk; + append_metadata_update(st, u, len); + + return 0; +} + +__u32 crc32c_le(__u32 crc, unsigned char const *p, size_t len); + +static int write_ppl_header(unsigned long long ppl_sector, int fd, void *buf) +{ + struct ppl_header *ppl_hdr = buf; + int ret; + + ppl_hdr->checksum = __cpu_to_le32(~crc32c_le(~0, buf, PPL_HEADER_SIZE)); + + if (lseek64(fd, ppl_sector * 512, SEEK_SET) < 0) { + ret = -errno; + perror("Failed to seek to PPL header location"); + return ret; + } + + if (write(fd, buf, PPL_HEADER_SIZE) != PPL_HEADER_SIZE) { + ret = -errno; + perror("Write PPL header failed"); + return ret; + } + + fsync(fd); + + return 0; +} + +static int write_init_ppl_imsm(struct supertype *st, struct mdinfo *info, int fd) +{ + struct intel_super *super = st->sb; + void *buf; + struct ppl_header *ppl_hdr; + int ret; + + /* first clear entire ppl space */ + ret = zero_disk_range(fd, info->ppl_sector, info->ppl_size); + if (ret) + return ret; + + ret = posix_memalign(&buf, MAX_SECTOR_SIZE, PPL_HEADER_SIZE); + if (ret) { + pr_err("Failed to allocate PPL header buffer\n"); + return -ret; + } + + memset(buf, 0, PPL_HEADER_SIZE); + ppl_hdr = buf; + memset(ppl_hdr->reserved, 0xff, PPL_HDR_RESERVED); + ppl_hdr->signature = __cpu_to_le32(super->anchor->orig_family_num); + + if (info->mismatch_cnt) { + /* 
+ * We are overwriting an invalid ppl. Make one entry with wrong + * checksum to prevent the kernel from skipping resync. + */ + ppl_hdr->entries_count = __cpu_to_le32(1); + ppl_hdr->entries[0].checksum = ~0; + } + + ret = write_ppl_header(info->ppl_sector, fd, buf); + + free(buf); + return ret; +} + +static int is_rebuilding(struct imsm_dev *dev); + +static int validate_ppl_imsm(struct supertype *st, struct mdinfo *info, + struct mdinfo *disk) +{ + struct intel_super *super = st->sb; + struct dl *d; + void *buf_orig, *buf, *buf_prev = NULL; + int ret = 0; + struct ppl_header *ppl_hdr = NULL; + __u32 crc; + struct imsm_dev *dev; + __u32 idx; + unsigned int i; + unsigned long long ppl_offset = 0; + unsigned long long prev_gen_num = 0; + + if (disk->disk.raid_disk < 0) + return 0; + + dev = get_imsm_dev(super, info->container_member); + idx = get_imsm_disk_idx(dev, disk->disk.raid_disk, MAP_0); + d = get_imsm_dl_disk(super, idx); + + if (!d || d->index < 0 || is_failed(&d->disk)) + return 0; + + if (posix_memalign(&buf_orig, MAX_SECTOR_SIZE, PPL_HEADER_SIZE * 2)) { + pr_err("Failed to allocate PPL header buffer\n"); + return -1; + } + buf = buf_orig; + + ret = 1; + while (ppl_offset < MULTIPLE_PPL_AREA_SIZE_IMSM) { + void *tmp; + + dprintf("Checking potential PPL at offset: %llu\n", ppl_offset); + + if (lseek64(d->fd, info->ppl_sector * 512 + ppl_offset, + SEEK_SET) < 0) { + perror("Failed to seek to PPL header location"); + ret = -1; + break; + } + + if (read(d->fd, buf, PPL_HEADER_SIZE) != PPL_HEADER_SIZE) { + perror("Read PPL header failed"); + ret = -1; + break; + } + + ppl_hdr = buf; + + crc = __le32_to_cpu(ppl_hdr->checksum); + ppl_hdr->checksum = 0; + + if (crc != ~crc32c_le(~0, buf, PPL_HEADER_SIZE)) { + dprintf("Wrong PPL header checksum on %s\n", + d->devname); + break; + } + + if (prev_gen_num > __le64_to_cpu(ppl_hdr->generation)) { + /* previous was newest, it was already checked */ + break; + } + + if ((__le32_to_cpu(ppl_hdr->signature) != + 
super->anchor->orig_family_num)) { + dprintf("Wrong PPL header signature on %s\n", + d->devname); + ret = 1; + break; + } + + ret = 0; + prev_gen_num = __le64_to_cpu(ppl_hdr->generation); + + ppl_offset += PPL_HEADER_SIZE; + for (i = 0; i < __le32_to_cpu(ppl_hdr->entries_count); i++) + ppl_offset += + __le32_to_cpu(ppl_hdr->entries[i].pp_size); + + if (!buf_prev) + buf_prev = buf + PPL_HEADER_SIZE; + tmp = buf_prev; + buf_prev = buf; + buf = tmp; + } + + if (buf_prev) { + buf = buf_prev; + ppl_hdr = buf_prev; + } + + /* + * Update metadata to use mutliple PPLs area (1MB). + * This is done once for all RAID members + */ + if (info->consistency_policy == CONSISTENCY_POLICY_PPL && + info->ppl_size != (MULTIPLE_PPL_AREA_SIZE_IMSM >> 9)) { + char subarray[20]; + struct mdinfo *member_dev; + + sprintf(subarray, "%d", info->container_member); + + if (mdmon_running(st->container_devnm)) + st->update_tail = &st->updates; + + if (st->ss->update_subarray(st, subarray, "ppl", NULL)) { + pr_err("Failed to update subarray %s\n", + subarray); + } else { + if (st->update_tail) + flush_metadata_updates(st); + else + st->ss->sync_metadata(st); + info->ppl_size = (MULTIPLE_PPL_AREA_SIZE_IMSM >> 9); + for (member_dev = info->devs; member_dev; + member_dev = member_dev->next) + member_dev->ppl_size = + (MULTIPLE_PPL_AREA_SIZE_IMSM >> 9); + } + } + + if (ret == 1) { + struct imsm_map *map = get_imsm_map(dev, MAP_X); + + if (map->map_state == IMSM_T_STATE_UNINITIALIZED || + (map->map_state == IMSM_T_STATE_NORMAL && + !(dev->vol.dirty & RAIDVOL_DIRTY)) || + (is_rebuilding(dev) && + vol_curr_migr_unit(dev) == 0 && + get_imsm_disk_idx(dev, disk->disk.raid_disk, MAP_1) != idx)) + ret = st->ss->write_init_ppl(st, info, d->fd); + else + info->mismatch_cnt++; + } else if (ret == 0 && + ppl_hdr->entries_count == 0 && + is_rebuilding(dev) && + info->resync_start == 0) { + /* + * The header has no entries - add a single empty entry and + * rewrite the header to prevent the kernel from going into + 
* resync after an interrupted rebuild. + */ + ppl_hdr->entries_count = __cpu_to_le32(1); + ret = write_ppl_header(info->ppl_sector, d->fd, buf); + } + + free(buf_orig); + + return ret; +} + +static int write_init_ppl_imsm_all(struct supertype *st, struct mdinfo *info) +{ + struct intel_super *super = st->sb; + struct dl *d; + int ret = 0; + + if (info->consistency_policy != CONSISTENCY_POLICY_PPL || + info->array.level != 5) + return 0; + + for (d = super->disks; d ; d = d->next) { + if (d->index < 0 || is_failed(&d->disk)) + continue; + + ret = st->ss->write_init_ppl(st, info, d->fd); + if (ret) + break; + } + + return ret; +} + +/******************************************************************************* + * Function: write_init_bitmap_imsm_vol + * Description: Write a bitmap header and prepares the area for the bitmap. + * Parameters: + * st : supertype information + * vol_idx : the volume index to use + * + * Returns: + * 0 : success + * -1 : fail + ******************************************************************************/ +static int write_init_bitmap_imsm_vol(struct supertype *st, int vol_idx) +{ + struct intel_super *super = st->sb; + int prev_current_vol = super->current_vol; + struct dl *d; + int ret = 0; + + super->current_vol = vol_idx; + for (d = super->disks; d; d = d->next) { + if (d->index < 0 || is_failed(&d->disk)) + continue; + ret = st->ss->write_bitmap(st, d->fd, NoUpdate); + if (ret) + break; + } + super->current_vol = prev_current_vol; + return ret; +} + +/******************************************************************************* + * Function: write_init_bitmap_imsm_all + * Description: Write a bitmap header and prepares the area for the bitmap. + * Operation is executed for volumes with CONSISTENCY_POLICY_BITMAP. 
+ * Parameters: + * st : supertype information + * info : info about the volume where the bitmap should be written + * vol_idx : the volume index to use + * + * Returns: + * 0 : success + * -1 : fail + ******************************************************************************/ +static int write_init_bitmap_imsm_all(struct supertype *st, struct mdinfo *info, + int vol_idx) +{ + int ret = 0; + + if (info && (info->consistency_policy == CONSISTENCY_POLICY_BITMAP)) + ret = write_init_bitmap_imsm_vol(st, vol_idx); + + return ret; +} + +static int write_init_super_imsm(struct supertype *st) +{ + struct intel_super *super = st->sb; + int current_vol = super->current_vol; + int rv = 0; + struct mdinfo info; + + getinfo_super_imsm(st, &info, NULL); + + /* we are done with current_vol reset it to point st at the container */ + super->current_vol = -1; + + if (st->update_tail) { + /* queue the recently created array / added disk + * as a metadata update */ + + /* determine if we are creating a volume or adding a disk */ + if (current_vol < 0) { + /* in the mgmt (add/remove) disk case we are running + * in mdmon context, so don't close fd's + */ + rv = mgmt_disk(st); + } else { + /* adding the second volume to the array */ + rv = write_init_ppl_imsm_all(st, &info); + if (!rv) + rv = write_init_bitmap_imsm_all(st, &info, current_vol); + if (!rv) + rv = create_array(st, current_vol); + } + } else { + struct dl *d; + for (d = super->disks; d; d = d->next) + Kill(d->devname, NULL, 0, -1, 1); + if (current_vol >= 0) { + rv = write_init_ppl_imsm_all(st, &info); + if (!rv) + rv = write_init_bitmap_imsm_all(st, &info, current_vol); + } + + if (!rv) + rv = write_super_imsm(st, 1); + } + + return rv; +} + +static int store_super_imsm(struct supertype *st, int fd) +{ + struct intel_super *super = st->sb; + struct imsm_super *mpb = super ? 
super->anchor : NULL; + + if (!mpb) + return 1; + + if (super->sector_size == 4096) + convert_to_4k(super); + return store_imsm_mpb(fd, mpb); +} + +static int validate_geometry_imsm_container(struct supertype *st, int level, + int raiddisks, + unsigned long long data_offset, + char *dev, + unsigned long long *freesize, + int verbose) +{ + int fd; + unsigned long long ldsize; + struct intel_super *super = NULL; + int rv = 0; + + if (level != LEVEL_CONTAINER) + return 0; + if (!dev) + return 1; + + fd = dev_open(dev, O_RDONLY|O_EXCL); + if (!is_fd_valid(fd)) { + pr_vrb("imsm: Cannot open %s: %s\n", dev, strerror(errno)); + return 0; + } + if (!get_dev_size(fd, dev, &ldsize)) + goto exit; + + /* capabilities retrieve could be possible + * note that there is no fd for the disks in array. + */ + super = alloc_super(); + if (!super) + goto exit; + + if (!get_dev_sector_size(fd, NULL, &super->sector_size)) + goto exit; + + rv = find_intel_hba_capability(fd, super, verbose > 0 ? dev : NULL); + if (rv != 0) { +#if DEBUG + char str[256]; + fd2devname(fd, str); + dprintf("fd: %d %s orom: %p rv: %d raiddisk: %d\n", + fd, str, super->orom, rv, raiddisks); +#endif + /* no orom/efi or non-intel hba of the disk */ + rv = 0; + goto exit; + } + if (super->orom) { + if (raiddisks > super->orom->tds) { + if (verbose) + pr_err("%d exceeds maximum number of platform supported disks: %d\n", + raiddisks, super->orom->tds); + goto exit; + } + if ((super->orom->attr & IMSM_OROM_ATTR_2TB_DISK) == 0 && + (ldsize >> 9) >> 32 > 0) { + if (verbose) + pr_err("%s exceeds maximum platform supported size\n", dev); + goto exit; + } + + if (super->hba->type == SYS_DEV_VMD || + super->hba->type == SYS_DEV_NVME) { + if (!imsm_is_nvme_namespace_supported(fd, 1)) { + if (verbose) + pr_err("NVMe namespace %s is not supported by IMSM\n", + basename(dev)); + goto exit; + } + } + } + if (freesize) + *freesize = avail_size_imsm(st, ldsize >> 9, data_offset); + rv = 1; +exit: + if (super) + free_imsm(super); + 
close(fd); + + return rv; +} + +static unsigned long long find_size(struct extent *e, int *idx, int num_extents) +{ + const unsigned long long base_start = e[*idx].start; + unsigned long long end = base_start + e[*idx].size; + int i; + + if (base_start == end) + return 0; + + *idx = *idx + 1; + for (i = *idx; i < num_extents; i++) { + /* extend overlapping extents */ + if (e[i].start >= base_start && + e[i].start <= end) { + if (e[i].size == 0) + return 0; + if (e[i].start + e[i].size > end) + end = e[i].start + e[i].size; + } else if (e[i].start > end) { + *idx = i; + break; + } + } + + return end - base_start; +} + +static unsigned long long merge_extents(struct intel_super *super, int sum_extents) +{ + /* build a composite disk with all known extents and generate a new + * 'maxsize' given the "all disks in an array must share a common start + * offset" constraint + */ + struct extent *e = xcalloc(sum_extents, sizeof(*e)); + struct dl *dl; + int i, j; + int start_extent; + unsigned long long pos; + unsigned long long start = 0; + unsigned long long maxsize; + unsigned long reserve; + + /* coalesce and sort all extents. 
also, check to see if we need to + * reserve space between member arrays + */ + j = 0; + for (dl = super->disks; dl; dl = dl->next) { + if (!dl->e) + continue; + for (i = 0; i < dl->extent_cnt; i++) + e[j++] = dl->e[i]; + } + qsort(e, sum_extents, sizeof(*e), cmp_extent); + + /* merge extents */ + i = 0; + j = 0; + while (i < sum_extents) { + e[j].start = e[i].start; + e[j].size = find_size(e, &i, sum_extents); + j++; + if (e[j-1].size == 0) + break; + } + + pos = 0; + maxsize = 0; + start_extent = 0; + i = 0; + do { + unsigned long long esize; + + esize = e[i].start - pos; + if (esize >= maxsize) { + maxsize = esize; + start = pos; + start_extent = i; + } + pos = e[i].start + e[i].size; + i++; + } while (e[i-1].size); + free(e); + + if (maxsize == 0) + return 0; + + /* FIXME assumes volume at offset 0 is the first volume in a + * container + */ + if (start_extent > 0) + reserve = IMSM_RESERVED_SECTORS; /* gap between raid regions */ + else + reserve = 0; + + if (maxsize < reserve) + return 0; + + super->create_offset = ~((unsigned long long) 0); + if (start + reserve > super->create_offset) + return 0; /* start overflows create_offset */ + super->create_offset = start + reserve; + + return maxsize - reserve; +} + +static int is_raid_level_supported(const struct imsm_orom *orom, int level, int raiddisks) +{ + if (level < 0 || level == 6 || level == 4) + return 0; + + /* if we have an orom prevent invalid raid levels */ + if (orom) + switch (level) { + case 0: return imsm_orom_has_raid0(orom); + case 1: + if (raiddisks > 2) + return imsm_orom_has_raid1e(orom); + return imsm_orom_has_raid1(orom) && raiddisks == 2; + case 10: return imsm_orom_has_raid10(orom) && raiddisks == 4; + case 5: return imsm_orom_has_raid5(orom) && raiddisks > 2; + } + else + return 1; /* not on an Intel RAID platform so anything goes */ + + return 0; +} + +static int +active_arrays_by_format(char *name, char* hba, struct md_list **devlist, + int dpa, int verbose) +{ + struct mdstat_ent 
*mdstat = mdstat_read(0, 0); + struct mdstat_ent *memb; + int count = 0; + int num = 0; + struct md_list *dv; + int found; + + for (memb = mdstat ; memb ; memb = memb->next) { + if (memb->metadata_version && + (strncmp(memb->metadata_version, "external:", 9) == 0) && + (strcmp(&memb->metadata_version[9], name) == 0) && + !is_subarray(memb->metadata_version+9) && + memb->members) { + struct dev_member *dev = memb->members; + int fd = -1; + while (dev && !is_fd_valid(fd)) { + char *path = xmalloc(strlen(dev->name) + strlen("/dev/") + 1); + num = sprintf(path, "%s%s", "/dev/", dev->name); + if (num > 0) + fd = open(path, O_RDONLY, 0); + if (num <= 0 || !is_fd_valid(fd)) { + pr_vrb("Cannot open %s: %s\n", + dev->name, strerror(errno)); + } + free(path); + dev = dev->next; + } + found = 0; + if (is_fd_valid(fd) && disk_attached_to_hba(fd, hba)) { + struct mdstat_ent *vol; + for (vol = mdstat ; vol ; vol = vol->next) { + if (vol->active > 0 && + vol->metadata_version && + is_container_member(vol, memb->devnm)) { + found++; + count++; + } + } + if (*devlist && (found < dpa)) { + dv = xcalloc(1, sizeof(*dv)); + dv->devname = xmalloc(strlen(memb->devnm) + strlen("/dev/") + 1); + sprintf(dv->devname, "%s%s", "/dev/", memb->devnm); + dv->found = found; + dv->used = 0; + dv->next = *devlist; + *devlist = dv; + } + } + close_fd(&fd); + } + } + free_mdstat(mdstat); + return count; +} + +#ifdef DEBUG_LOOP +static struct md_list* +get_loop_devices(void) +{ + int i; + struct md_list *devlist = NULL; + struct md_list *dv; + + for(i = 0; i < 12; i++) { + dv = xcalloc(1, sizeof(*dv)); + dv->devname = xmalloc(40); + sprintf(dv->devname, "/dev/loop%d", i); + dv->next = devlist; + devlist = dv; + } + return devlist; +} +#endif + +static struct md_list* +get_devices(const char *hba_path) +{ + struct md_list *devlist = NULL; + struct md_list *dv; + struct dirent *ent; + DIR *dir; + int err = 0; + +#if DEBUG_LOOP + devlist = get_loop_devices(); + return devlist; +#endif + /* scroll through 
/sys/dev/block looking for devices attached to + * this hba + */ + dir = opendir("/sys/dev/block"); + for (ent = dir ? readdir(dir) : NULL; ent; ent = readdir(dir)) { + int fd; + char buf[1024]; + int major, minor; + char *path = NULL; + if (sscanf(ent->d_name, "%d:%d", &major, &minor) != 2) + continue; + path = devt_to_devpath(makedev(major, minor), 1, NULL); + if (!path) + continue; + if (!path_attached_to_hba(path, hba_path)) { + free(path); + path = NULL; + continue; + } + free(path); + path = NULL; + fd = dev_open(ent->d_name, O_RDONLY); + if (is_fd_valid(fd)) { + fd2devname(fd, buf); + close(fd); + } else { + pr_err("cannot open device: %s\n", + ent->d_name); + continue; + } + + dv = xcalloc(1, sizeof(*dv)); + dv->devname = xstrdup(buf); + dv->next = devlist; + devlist = dv; + } + if (err) { + while(devlist) { + dv = devlist; + devlist = devlist->next; + free(dv->devname); + free(dv); + } + } + closedir(dir); + return devlist; +} + +static int +count_volumes_list(struct md_list *devlist, char *homehost, + int verbose, int *found) +{ + struct md_list *tmpdev; + int count = 0; + struct supertype *st; + + /* first walk the list of devices to find a consistent set + * that match the criterea, if that is possible. + * We flag the ones we like with 'used'. 
+ */ + *found = 0; + st = match_metadata_desc_imsm("imsm"); + if (st == NULL) { + pr_vrb("cannot allocate memory for imsm supertype\n"); + return 0; + } + + for (tmpdev = devlist; tmpdev; tmpdev = tmpdev->next) { + char *devname = tmpdev->devname; + dev_t rdev; + struct supertype *tst; + int dfd; + if (tmpdev->used > 1) + continue; + tst = dup_super(st); + if (tst == NULL) { + pr_vrb("cannot allocate memory for imsm supertype\n"); + goto err_1; + } + tmpdev->container = 0; + dfd = dev_open(devname, O_RDONLY|O_EXCL); + if (!is_fd_valid(dfd)) { + dprintf("cannot open device %s: %s\n", + devname, strerror(errno)); + tmpdev->used = 2; + } else if (!fstat_is_blkdev(dfd, devname, &rdev)) { + tmpdev->used = 2; + } else if (must_be_container(dfd)) { + struct supertype *cst; + cst = super_by_fd(dfd, NULL); + if (cst == NULL) { + dprintf("cannot recognize container type %s\n", + devname); + tmpdev->used = 2; + } else if (tst->ss != st->ss) { + dprintf("non-imsm container - ignore it: %s\n", + devname); + tmpdev->used = 2; + } else if (!tst->ss->load_container || + tst->ss->load_container(tst, dfd, NULL)) + tmpdev->used = 2; + else { + tmpdev->container = 1; + } + if (cst) + cst->ss->free_super(cst); + } else { + tmpdev->st_rdev = rdev; + if (tst->ss->load_super(tst,dfd, NULL)) { + dprintf("no RAID superblock on %s\n", + devname); + tmpdev->used = 2; + } else if (tst->ss->compare_super == NULL) { + dprintf("Cannot assemble %s metadata on %s\n", + tst->ss->name, devname); + tmpdev->used = 2; + } + } + close_fd(&dfd); + + if (tmpdev->used == 2 || tmpdev->used == 4) { + /* Ignore unrecognised devices during auto-assembly */ + goto loop; + } + else { + struct mdinfo info; + tst->ss->getinfo_super(tst, &info, NULL); + + if (st->minor_version == -1) + st->minor_version = tst->minor_version; + + if (memcmp(info.uuid, uuid_zero, + sizeof(int[4])) == 0) { + /* this is a floating spare. It cannot define + * an array unless there are no more arrays of + * this type to be found. 
It can be included + * in an array of this type though. + */ + tmpdev->used = 3; + goto loop; + } + + if (st->ss != tst->ss || + st->minor_version != tst->minor_version || + st->ss->compare_super(st, tst, 1) != 0) { + /* Some mismatch. If exactly one array matches this host, + * we can resolve on that one. + * Or, if we are auto assembling, we just ignore the second + * for now. + */ + dprintf("superblock on %s doesn't match others - assembly aborted\n", + devname); + goto loop; + } + tmpdev->used = 1; + *found = 1; + dprintf("found: devname: %s\n", devname); + } + loop: + if (tst) + tst->ss->free_super(tst); + } + if (*found != 0) { + int err; + if ((err = load_super_imsm_all(st, -1, &st->sb, NULL, devlist, 0)) == 0) { + struct mdinfo *iter, *head = st->ss->container_content(st, NULL); + for (iter = head; iter; iter = iter->next) { + dprintf("content->text_version: %s vol\n", + iter->text_version); + if (iter->array.state & (1<<MD_SB_BLOCK_VOLUME)) { + /* do not assemble arrays with unsupported + configurations */ + dprintf("Cannot activate member %s.\n", + iter->text_version); + } else + count++; + } + sysfs_free(head); + + } else { + dprintf("No valid super block on device list: err: %d %p\n", + err, st->sb); + } + } else { + dprintf("no more devices to examine\n"); + } + + for (tmpdev = devlist; tmpdev; tmpdev = tmpdev->next) { + if (tmpdev->used == 1 && tmpdev->found) { + if (count) { + if (count < tmpdev->found) + count = 0; + else + count -= tmpdev->found; + } + } + if (tmpdev->used == 1) + tmpdev->used = 4; + } + err_1: + if (st) + st->ss->free_super(st); + return count; +} + +static int __count_volumes(char *hba_path, int dpa, int verbose, + int cmp_hba_path) +{ + struct sys_dev *idev, *intel_devices = find_intel_devices(); + int count = 0; + const struct orom_entry *entry; + struct devid_list *dv, *devid_list; + + if (!hba_path) + return 0; + + for (idev = intel_devices; idev; idev = idev->next) { + if (strstr(idev->path, hba_path)) + break; + } + + if 
(!idev || !idev->dev_id) + return 0; + + entry = get_orom_entry_by_device_id(idev->dev_id); + + if (!entry || !entry->devid_list) + return 0; + + devid_list = entry->devid_list; + for (dv = devid_list; dv; dv = dv->next) { + struct md_list *devlist; + struct sys_dev *device = NULL; + char *hpath; + int found = 0; + + if (cmp_hba_path) + device = device_by_id_and_path(dv->devid, hba_path); + else + device = device_by_id(dv->devid); + + if (device) + hpath = device->path; + else + return 0; + + devlist = get_devices(hpath); + /* if no intel devices return zero volumes */ + if (devlist == NULL) + return 0; + + count += active_arrays_by_format("imsm", hpath, &devlist, dpa, + verbose); + dprintf("path: %s active arrays: %d\n", hpath, count); + if (devlist == NULL) + return 0; + do { + found = 0; + count += count_volumes_list(devlist, + NULL, + verbose, + &found); + dprintf("found %d count: %d\n", found, count); + } while (found); + + dprintf("path: %s total number of volumes: %d\n", hpath, count); + + while (devlist) { + struct md_list *dv = devlist; + devlist = devlist->next; + free(dv->devname); + free(dv); + } + } + return count; +} + +static int count_volumes(struct intel_hba *hba, int dpa, int verbose) +{ + if (!hba) + return 0; + if (hba->type == SYS_DEV_VMD) { + struct sys_dev *dev; + int count = 0; + + for (dev = find_intel_devices(); dev; dev = dev->next) { + if (dev->type == SYS_DEV_VMD) + count += __count_volumes(dev->path, dpa, + verbose, 1); + } + return count; + } + return __count_volumes(hba->path, dpa, verbose, 0); +} + +static int imsm_default_chunk(const struct imsm_orom *orom) +{ + /* up to 512 if the plaform supports it, otherwise the platform max. + * 128 if no platform detected + */ + int fs = max(7, orom ? 
fls(orom->sss) : 0); + + return min(512, (1 << fs)); +} + +static int +validate_geometry_imsm_orom(struct intel_super *super, int level, int layout, + int raiddisks, int *chunk, unsigned long long size, int verbose) +{ + /* check/set platform and metadata limits/defaults */ + if (super->orom && raiddisks > super->orom->dpa) { + pr_vrb("platform supports a maximum of %d disks per array\n", + super->orom->dpa); + return 0; + } + + /* capabilities of OROM tested - copied from validate_geometry_imsm_volume */ + if (!is_raid_level_supported(super->orom, level, raiddisks)) { + pr_vrb("platform does not support raid%d with %d disk%s\n", + level, raiddisks, raiddisks > 1 ? "s" : ""); + return 0; + } + + if (*chunk == 0 || *chunk == UnSet) + *chunk = imsm_default_chunk(super->orom); + + if (super->orom && !imsm_orom_has_chunk(super->orom, *chunk)) { + pr_vrb("platform does not support a chunk size of: %d\n", *chunk); + return 0; + } + + if (layout != imsm_level_to_layout(level)) { + if (level == 5) + pr_vrb("imsm raid 5 only supports the left-asymmetric layout\n"); + else if (level == 10) + pr_vrb("imsm raid 10 only supports the n2 layout\n"); + else + pr_vrb("imsm unknown layout %#x for this raid level %d\n", + layout, level); + return 0; + } + + if (super->orom && (super->orom->attr & IMSM_OROM_ATTR_2TB) == 0 && + (calc_array_size(level, raiddisks, layout, *chunk, size) >> 32) > 0) { + pr_vrb("platform does not support a volume size over 2TB\n"); + return 0; + } + + return 1; +} + +/* validate_geometry_imsm_volume - lifted from validate_geometry_ddf_bvd + * FIX ME add ahci details + */ +static int validate_geometry_imsm_volume(struct supertype *st, int level, + int layout, int raiddisks, int *chunk, + unsigned long long size, + unsigned long long data_offset, + char *dev, + unsigned long long *freesize, + int verbose) +{ + dev_t rdev; + struct intel_super *super = st->sb; + struct imsm_super *mpb; + struct dl *dl; + unsigned long long pos = 0; + unsigned long long 
maxsize; + struct extent *e; + int i; + + /* We must have the container info already read in. */ + if (!super) + return 0; + + mpb = super->anchor; + + if (!validate_geometry_imsm_orom(super, level, layout, raiddisks, chunk, size, verbose)) { + pr_err("RAID geometry validation failed. Cannot proceed with the action(s).\n"); + return 0; + } + if (!dev) { + /* General test: make sure there is space for + * 'raiddisks' device extents of size 'size' at a given + * offset + */ + unsigned long long minsize = size; + unsigned long long start_offset = MaxSector; + int dcnt = 0; + if (minsize == 0) + minsize = MPB_SECTOR_CNT + IMSM_RESERVED_SECTORS; + for (dl = super->disks; dl ; dl = dl->next) { + int found = 0; + + pos = 0; + i = 0; + e = get_extents(super, dl, 0); + if (!e) continue; + do { + unsigned long long esize; + esize = e[i].start - pos; + if (esize >= minsize) + found = 1; + if (found && start_offset == MaxSector) { + start_offset = pos; + break; + } else if (found && pos != start_offset) { + found = 0; + break; + } + pos = e[i].start + e[i].size; + i++; + } while (e[i-1].size); + if (found) + dcnt++; + free(e); + } + if (dcnt < raiddisks) { + if (verbose) + pr_err("imsm: Not enough devices with space for this array (%d < %d)\n", + dcnt, raiddisks); + return 0; + } + return 1; + } + + /* This device must be a member of the set */ + if (!stat_is_blkdev(dev, &rdev)) + return 0; + for (dl = super->disks ; dl ; dl = dl->next) { + if (dl->major == (int)major(rdev) && + dl->minor == (int)minor(rdev)) + break; + } + if (!dl) { + if (verbose) + pr_err("%s is not in the same imsm set\n", dev); + return 0; + } else if (super->orom && dl->index < 0 && mpb->num_raid_devs) { + /* If a volume is present then the current creation attempt + * cannot incorporate new spares because the orom may not + * understand this configuration (all member disks must be + * members of each array in the container). 
+ */ + pr_err("%s is a spare and a volume is already defined for this container\n", dev); + pr_err("The option-rom requires all member disks to be a member of all volumes\n"); + return 0; + } else if (super->orom && mpb->num_raid_devs > 0 && + mpb->num_disks != raiddisks) { + pr_err("The option-rom requires all member disks to be a member of all volumes\n"); + return 0; + } + + /* retrieve the largest free space block */ + e = get_extents(super, dl, 0); + maxsize = 0; + i = 0; + if (e) { + do { + unsigned long long esize; + + esize = e[i].start - pos; + if (esize >= maxsize) + maxsize = esize; + pos = e[i].start + e[i].size; + i++; + } while (e[i-1].size); + dl->e = e; + dl->extent_cnt = i; + } else { + if (verbose) + pr_err("unable to determine free space for: %s\n", + dev); + return 0; + } + if (maxsize < size) { + if (verbose) + pr_err("%s not enough space (%llu < %llu)\n", + dev, maxsize, size); + return 0; + } + + /* count total number of extents for merge */ + i = 0; + for (dl = super->disks; dl; dl = dl->next) + if (dl->e) + i += dl->extent_cnt; + + maxsize = merge_extents(super, i); + + if (mpb->num_raid_devs > 0 && size && size != maxsize) + pr_err("attempting to create a second volume with size less then remaining space.\n"); + + if (maxsize < size || maxsize == 0) { + if (verbose) { + if (maxsize == 0) + pr_err("no free space left on device. Aborting...\n"); + else + pr_err("not enough space to create volume of given size (%llu < %llu). 
Aborting...\n", + maxsize, size); + } + return 0; + } + + *freesize = maxsize; + + if (super->orom) { + int count = count_volumes(super->hba, + super->orom->dpa, verbose); + if (super->orom->vphba <= count) { + pr_vrb("platform does not support more than %d raid volumes.\n", + super->orom->vphba); + return 0; + } + } + return 1; +} + +static int imsm_get_free_size(struct supertype *st, int raiddisks, + unsigned long long size, int chunk, + unsigned long long *freesize) +{ + struct intel_super *super = st->sb; + struct imsm_super *mpb = super->anchor; + struct dl *dl; + int i; + int extent_cnt; + struct extent *e; + unsigned long long maxsize; + unsigned long long minsize; + int cnt; + int used; + + /* find the largest common start free region of the possible disks */ + used = 0; + extent_cnt = 0; + cnt = 0; + for (dl = super->disks; dl; dl = dl->next) { + dl->raiddisk = -1; + + if (dl->index >= 0) + used++; + + /* don't activate new spares if we are orom constrained + * and there is already a volume active in the container + */ + if (super->orom && dl->index < 0 && mpb->num_raid_devs) + continue; + + e = get_extents(super, dl, 0); + if (!e) + continue; + for (i = 1; e[i-1].size; i++) + ; + dl->e = e; + dl->extent_cnt = i; + extent_cnt += i; + cnt++; + } + + maxsize = merge_extents(super, extent_cnt); + minsize = size; + if (size == 0) + /* chunk is in K */ + minsize = chunk * 2; + + if (cnt < raiddisks || + (super->orom && used && used != raiddisks) || + maxsize < minsize || + maxsize == 0) { + pr_err("not enough devices with space to create array.\n"); + return 0; /* No enough free spaces large enough */ + } + + if (size == 0) { + size = maxsize; + if (chunk) { + size /= 2 * chunk; + size *= 2 * chunk; + } + maxsize = size; + } + if (mpb->num_raid_devs > 0 && size && size != maxsize) + pr_err("attempting to create a second volume with size less then remaining space.\n"); + cnt = 0; + for (dl = super->disks; dl; dl = dl->next) + if (dl->e) + dl->raiddisk = cnt++; + 
+ *freesize = size; + + dprintf("imsm: imsm_get_free_size() returns : %llu\n", size); + + return 1; +} + +static int reserve_space(struct supertype *st, int raiddisks, + unsigned long long size, int chunk, + unsigned long long *freesize) +{ + struct intel_super *super = st->sb; + struct dl *dl; + int cnt; + int rv = 0; + + rv = imsm_get_free_size(st, raiddisks, size, chunk, freesize); + if (rv) { + cnt = 0; + for (dl = super->disks; dl; dl = dl->next) + if (dl->e) + dl->raiddisk = cnt++; + rv = 1; + } + + return rv; +} + +static int validate_geometry_imsm(struct supertype *st, int level, int layout, + int raiddisks, int *chunk, unsigned long long size, + unsigned long long data_offset, + char *dev, unsigned long long *freesize, + int consistency_policy, int verbose) +{ + int fd, cfd; + struct mdinfo *sra; + int is_member = 0; + + /* load capability + * if given unused devices create a container + * if given given devices in a container create a member volume + */ + if (level == LEVEL_CONTAINER) + /* Must be a fresh device to add to a container */ + return validate_geometry_imsm_container(st, level, raiddisks, + data_offset, dev, + freesize, verbose); + + /* + * Size is given in sectors. + */ + if (size && (size < 2048)) { + pr_err("Given size must be greater than 1M.\n"); + /* Depends on algorithm in Create.c : + * if container was given (dev == NULL) return -1, + * if block device was given ( dev != NULL) return 0. + */ + return dev ? -1 : 0; + } + + if (!dev) { + if (st->sb) { + struct intel_super *super = st->sb; + if (!validate_geometry_imsm_orom(st->sb, level, layout, + raiddisks, chunk, size, + verbose)) + return 0; + /* we are being asked to automatically layout a + * new volume based on the current contents of + * the container. If the the parameters can be + * satisfied reserve_space will record the disks, + * start offset, and size of the volume to be + * created. add_to_super and getinfo_super + * detect when autolayout is in progress. 
+ */ + /* assuming that freesize is always given when array is + created */ + if (super->orom && freesize) { + int count; + count = count_volumes(super->hba, + super->orom->dpa, verbose); + if (super->orom->vphba <= count) { + pr_vrb("platform does not support more than %d raid volumes.\n", + super->orom->vphba); + return 0; + } + } + if (freesize) + return reserve_space(st, raiddisks, size, + *chunk, freesize); + } + return 1; + } + if (st->sb) { + /* creating in a given container */ + return validate_geometry_imsm_volume(st, level, layout, + raiddisks, chunk, size, + data_offset, + dev, freesize, verbose); + } + + /* This device needs to be a device in an 'imsm' container */ + fd = open(dev, O_RDONLY|O_EXCL, 0); + + if (is_fd_valid(fd)) { + pr_vrb("Cannot create this array on device %s\n", dev); + close(fd); + return 0; + } + if (errno == EBUSY) + fd = open(dev, O_RDONLY, 0); + + if (!is_fd_valid(fd)) { + pr_vrb("Cannot open %s: %s\n", dev, strerror(errno)); + return 0; + } + + /* Well, it is in use by someone, maybe an 'imsm' container. */ + cfd = open_container(fd); + close_fd(&fd); + + if (!is_fd_valid(cfd)) { + pr_vrb("Cannot use %s: It is busy\n", dev); + return 0; + } + sra = sysfs_read(cfd, NULL, GET_VERSION); + if (sra && sra->array.major_version == -1 && + strcmp(sra->text_version, "imsm") == 0) + is_member = 1; + sysfs_free(sra); + if (is_member) { + /* This is a member of a imsm container. Load the container + * and try to create a volume + */ + struct intel_super *super; + + if (load_super_imsm_all(st, cfd, (void **) &super, NULL, NULL, 1) == 0) { + st->sb = super; + strcpy(st->container_devnm, fd2devnm(cfd)); + close(cfd); + return validate_geometry_imsm_volume(st, level, layout, + raiddisks, chunk, + size, data_offset, dev, + freesize, 1) + ? 
1 : -1; + } + } + + if (verbose) + pr_err("failed container membership check\n"); + + close(cfd); + return 0; +} + +static void default_geometry_imsm(struct supertype *st, int *level, int *layout, int *chunk) +{ + struct intel_super *super = st->sb; + + if (level && *level == UnSet) + *level = LEVEL_CONTAINER; + + if (level && layout && *layout == UnSet) + *layout = imsm_level_to_layout(*level); + + if (chunk && (*chunk == UnSet || *chunk == 0)) + *chunk = imsm_default_chunk(super->orom); +} + +static void handle_missing(struct intel_super *super, struct imsm_dev *dev); + +static int kill_subarray_imsm(struct supertype *st, char *subarray_id) +{ + /* remove the subarray currently referenced by subarray_id */ + __u8 i; + struct intel_dev **dp; + struct intel_super *super = st->sb; + __u8 current_vol = strtoul(subarray_id, NULL, 10); + struct imsm_super *mpb = super->anchor; + + if (mpb->num_raid_devs == 0) + return 2; + + /* block deletions that would change the uuid of active subarrays + * + * FIXME when immutable ids are available, but note that we'll + * also need to fixup the invalidated/active subarray indexes in + * mdstat + */ + for (i = 0; i < mpb->num_raid_devs; i++) { + char subarray[4]; + + if (i < current_vol) + continue; + sprintf(subarray, "%u", i); + if (is_subarray_active(subarray, st->devnm)) { + pr_err("deleting subarray-%d would change the UUID of active subarray-%d, aborting\n", + current_vol, i); + + return 2; + } + } + + if (st->update_tail) { + struct imsm_update_kill_array *u = xmalloc(sizeof(*u)); + + u->type = update_kill_array; + u->dev_idx = current_vol; + append_metadata_update(st, u, sizeof(*u)); + + return 0; + } + + for (dp = &super->devlist; *dp;) + if ((*dp)->index == current_vol) { + *dp = (*dp)->next; + } else { + handle_missing(super, (*dp)->dev); + if ((*dp)->index > current_vol) + (*dp)->index--; + dp = &(*dp)->next; + } + + /* no more raid devices, all active components are now spares, + * but of course failed are still failed 
+ */ + if (--mpb->num_raid_devs == 0) { + struct dl *d; + + for (d = super->disks; d; d = d->next) + if (d->index > -2) + mark_spare(d); + } + + super->updates_pending++; + + return 0; +} + +static int get_rwh_policy_from_update(char *update) +{ + if (strcmp(update, "ppl") == 0) + return RWH_MULTIPLE_DISTRIBUTED; + else if (strcmp(update, "no-ppl") == 0) + return RWH_MULTIPLE_OFF; + else if (strcmp(update, "bitmap") == 0) + return RWH_BITMAP; + else if (strcmp(update, "no-bitmap") == 0) + return RWH_OFF; + return -1; +} + +static int update_subarray_imsm(struct supertype *st, char *subarray, + char *update, struct mddev_ident *ident) +{ + /* update the subarray currently referenced by ->current_vol */ + struct intel_super *super = st->sb; + struct imsm_super *mpb = super->anchor; + + if (strcmp(update, "name") == 0) { + char *name = ident->name; + char *ep; + int vol; + + if (is_subarray_active(subarray, st->devnm)) { + pr_err("Unable to update name of active subarray\n"); + return 2; + } + + if (!check_name(super, name, 0)) + return 2; + + vol = strtoul(subarray, &ep, 10); + if (*ep != '\0' || vol >= super->anchor->num_raid_devs) + return 2; + + if (st->update_tail) { + struct imsm_update_rename_array *u = xmalloc(sizeof(*u)); + + u->type = update_rename_array; + u->dev_idx = vol; + strncpy((char *) u->name, name, MAX_RAID_SERIAL_LEN); + u->name[MAX_RAID_SERIAL_LEN-1] = '\0'; + append_metadata_update(st, u, sizeof(*u)); + } else { + struct imsm_dev *dev; + int i, namelen; + + dev = get_imsm_dev(super, vol); + memset(dev->volume, '\0', MAX_RAID_SERIAL_LEN); + namelen = min((int)strlen(name), MAX_RAID_SERIAL_LEN); + memcpy(dev->volume, name, namelen); + for (i = 0; i < mpb->num_raid_devs; i++) { + dev = get_imsm_dev(super, i); + handle_missing(super, dev); + } + super->updates_pending++; + } + } else if (get_rwh_policy_from_update(update) != -1) { + int new_policy; + char *ep; + int vol = strtoul(subarray, &ep, 10); + + if (*ep != '\0' || vol >= 
super->anchor->num_raid_devs) + return 2; + + new_policy = get_rwh_policy_from_update(update); + + if (st->update_tail) { + struct imsm_update_rwh_policy *u = xmalloc(sizeof(*u)); + + u->type = update_rwh_policy; + u->dev_idx = vol; + u->new_policy = new_policy; + append_metadata_update(st, u, sizeof(*u)); + } else { + struct imsm_dev *dev; + + dev = get_imsm_dev(super, vol); + dev->rwh_policy = new_policy; + super->updates_pending++; + } + if (new_policy == RWH_BITMAP) + return write_init_bitmap_imsm_vol(st, vol); + } else + return 2; + + return 0; +} + +static bool is_gen_migration(struct imsm_dev *dev) +{ + if (dev && dev->vol.migr_state && + migr_type(dev) == MIGR_GEN_MIGR) + return true; + + return false; +} + +static int is_rebuilding(struct imsm_dev *dev) +{ + struct imsm_map *migr_map; + + if (!dev->vol.migr_state) + return 0; + + if (migr_type(dev) != MIGR_REBUILD) + return 0; + + migr_map = get_imsm_map(dev, MAP_1); + + if (migr_map->map_state == IMSM_T_STATE_DEGRADED) + return 1; + else + return 0; +} + +static int is_initializing(struct imsm_dev *dev) +{ + struct imsm_map *migr_map; + + if (!dev->vol.migr_state) + return 0; + + if (migr_type(dev) != MIGR_INIT) + return 0; + + migr_map = get_imsm_map(dev, MAP_1); + + if (migr_map->map_state == IMSM_T_STATE_UNINITIALIZED) + return 1; + + return 0; +} + +static void update_recovery_start(struct intel_super *super, + struct imsm_dev *dev, + struct mdinfo *array) +{ + struct mdinfo *rebuild = NULL; + struct mdinfo *d; + __u32 units; + + if (!is_rebuilding(dev)) + return; + + /* Find the rebuild target, but punt on the dual rebuild case */ + for (d = array->devs; d; d = d->next) + if (d->recovery_start == 0) { + if (rebuild) + return; + rebuild = d; + } + + if (!rebuild) { + /* (?) 
none of the disks are marked with + * IMSM_ORD_REBUILD, so assume they are missing and the + * disk_ord_tbl was not correctly updated + */ + dprintf("failed to locate out-of-sync disk\n"); + return; + } + + units = vol_curr_migr_unit(dev); + rebuild->recovery_start = units * blocks_per_migr_unit(super, dev); +} + +static int recover_backup_imsm(struct supertype *st, struct mdinfo *info); + +static struct mdinfo *container_content_imsm(struct supertype *st, char *subarray) +{ + /* Given a container loaded by load_super_imsm_all, + * extract information about all the arrays into + * an mdinfo tree. + * If 'subarray' is given, just extract info about that array. + * + * For each imsm_dev create an mdinfo, fill it in, + * then look for matching devices in super->disks + * and create appropriate device mdinfo. + */ + struct intel_super *super = st->sb; + struct imsm_super *mpb = super->anchor; + struct mdinfo *rest = NULL; + unsigned int i; + int sb_errors = 0; + struct dl *d; + int spare_disks = 0; + int current_vol = super->current_vol; + + /* do not assemble arrays when not all attributes are supported */ + if (imsm_check_attributes(mpb->attributes) == 0) { + sb_errors = 1; + pr_err("Unsupported attributes in IMSM metadata.Arrays activation is blocked.\n"); + } + + /* count spare devices, not used in maps + */ + for (d = super->disks; d; d = d->next) + if (d->index == -1) + spare_disks++; + + for (i = 0; i < mpb->num_raid_devs; i++) { + struct imsm_dev *dev; + struct imsm_map *map; + struct imsm_map *map2; + struct mdinfo *this; + int slot; + int chunk; + char *ep; + int level; + + if (subarray && + (i != strtoul(subarray, &ep, 10) || *ep != '\0')) + continue; + + dev = get_imsm_dev(super, i); + map = get_imsm_map(dev, MAP_0); + map2 = get_imsm_map(dev, MAP_1); + level = get_imsm_raid_level(map); + + /* do not publish arrays that are in the middle of an + * unsupported migration + */ + if (dev->vol.migr_state && + (migr_type(dev) == MIGR_STATE_CHANGE)) { + 
pr_err("cannot assemble volume '%.16s': unsupported migration in progress\n", + dev->volume); + continue; + } + /* do not publish arrays that are not support by controller's + * OROM/EFI + */ + + this = xmalloc(sizeof(*this)); + + super->current_vol = i; + getinfo_super_imsm_volume(st, this, NULL); + this->next = rest; + chunk = __le16_to_cpu(map->blocks_per_strip) >> 1; + /* mdadm does not support all metadata features- set the bit in all arrays state */ + if (!validate_geometry_imsm_orom(super, + level, /* RAID level */ + imsm_level_to_layout(level), + map->num_members, /* raid disks */ + &chunk, imsm_dev_size(dev), + 1 /* verbose */)) { + pr_err("IMSM RAID geometry validation failed. Array %s activation is blocked.\n", + dev->volume); + this->array.state |= + (1<<MD_SB_BLOCK_CONTAINER_RESHAPE) | + (1<<MD_SB_BLOCK_VOLUME); + } + + /* if array has bad blocks, set suitable bit in all arrays state */ + if (sb_errors) + this->array.state |= + (1<<MD_SB_BLOCK_CONTAINER_RESHAPE) | + (1<<MD_SB_BLOCK_VOLUME); + + for (slot = 0 ; slot < map->num_members; slot++) { + unsigned long long recovery_start; + struct mdinfo *info_d; + struct dl *d; + int idx; + int skip; + __u32 ord; + int missing = 0; + + skip = 0; + idx = get_imsm_disk_idx(dev, slot, MAP_0); + ord = get_imsm_ord_tbl_ent(dev, slot, MAP_X); + for (d = super->disks; d ; d = d->next) + if (d->index == idx) + break; + + recovery_start = MaxSector; + if (d == NULL) + skip = 1; + if (d && is_failed(&d->disk)) + skip = 1; + if (!skip && (ord & IMSM_ORD_REBUILD)) + recovery_start = 0; + if (!(ord & IMSM_ORD_REBUILD)) + this->array.working_disks++; + /* + * if we skip some disks the array will be assmebled degraded; + * reset resync start to avoid a dirty-degraded + * situation when performing the intial sync + */ + if (skip) + missing++; + + if (!(dev->vol.dirty & RAIDVOL_DIRTY)) { + if ((!able_to_resync(level, missing) || + recovery_start == 0)) + this->resync_start = MaxSector; + } + + if (skip) + continue; + + info_d 
= xcalloc(1, sizeof(*info_d)); + info_d->next = this->devs; + this->devs = info_d; + + info_d->disk.number = d->index; + info_d->disk.major = d->major; + info_d->disk.minor = d->minor; + info_d->disk.raid_disk = slot; + info_d->recovery_start = recovery_start; + if (map2) { + if (slot < map2->num_members) + info_d->disk.state = (1 << MD_DISK_ACTIVE); + else + this->array.spare_disks++; + } else { + if (slot < map->num_members) + info_d->disk.state = (1 << MD_DISK_ACTIVE); + else + this->array.spare_disks++; + } + + info_d->events = __le32_to_cpu(mpb->generation_num); + info_d->data_offset = pba_of_lba0(map); + info_d->component_size = calc_component_size(map, dev); + + if (map->raid_level == 5) { + info_d->ppl_sector = this->ppl_sector; + info_d->ppl_size = this->ppl_size; + if (this->consistency_policy == CONSISTENCY_POLICY_PPL && + recovery_start == 0) + this->resync_start = 0; + } + + info_d->bb.supported = 1; + get_volume_badblocks(super->bbm_log, ord_to_idx(ord), + info_d->data_offset, + info_d->component_size, + &info_d->bb); + } + /* now that the disk list is up-to-date fixup recovery_start */ + update_recovery_start(super, dev, this); + this->array.spare_disks += spare_disks; + + /* check for reshape */ + if (this->reshape_active == 1) + recover_backup_imsm(st, this); + rest = this; + } + + super->current_vol = current_vol; + return rest; +} + +static __u8 imsm_check_degraded(struct intel_super *super, struct imsm_dev *dev, + int failed, int look_in_map) +{ + struct imsm_map *map; + + map = get_imsm_map(dev, look_in_map); + + if (!failed) + return map->map_state == IMSM_T_STATE_UNINITIALIZED ? + IMSM_T_STATE_UNINITIALIZED : IMSM_T_STATE_NORMAL; + + switch (get_imsm_raid_level(map)) { + case 0: + return IMSM_T_STATE_FAILED; + break; + case 1: + if (failed < map->num_members) + return IMSM_T_STATE_DEGRADED; + else + return IMSM_T_STATE_FAILED; + break; + case 10: + { + /** + * check to see if any mirrors have failed, otherwise we + * are degraded. 
Even numbered slots are mirrored on + * slot+1 + */ + int i; + /* gcc -Os complains that this is unused */ + int insync = insync; + + for (i = 0; i < map->num_members; i++) { + __u32 ord = get_imsm_ord_tbl_ent(dev, i, MAP_X); + int idx = ord_to_idx(ord); + struct imsm_disk *disk; + + /* reset the potential in-sync count on even-numbered + * slots. num_copies is always 2 for imsm raid10 + */ + if ((i & 1) == 0) + insync = 2; + + disk = get_imsm_disk(super, idx); + if (!disk || is_failed(disk) || ord & IMSM_ORD_REBUILD) + insync--; + + /* no in-sync disks left in this mirror the + * array has failed + */ + if (insync == 0) + return IMSM_T_STATE_FAILED; + } + + return IMSM_T_STATE_DEGRADED; + } + case 5: + if (failed < 2) + return IMSM_T_STATE_DEGRADED; + else + return IMSM_T_STATE_FAILED; + break; + default: + break; + } + + return map->map_state; +} + +static int imsm_count_failed(struct intel_super *super, struct imsm_dev *dev, + int look_in_map) +{ + int i; + int failed = 0; + struct imsm_disk *disk; + struct imsm_map *map = get_imsm_map(dev, MAP_0); + struct imsm_map *prev = get_imsm_map(dev, MAP_1); + struct imsm_map *map_for_loop; + __u32 ord; + int idx; + int idx_1; + + /* at the beginning of migration we set IMSM_ORD_REBUILD on + * disks that are being rebuilt. New failures are recorded to + * map[0]. 
So we look through all the disks we started with and + * see if any failures are still present, or if any new ones + * have arrived + */ + map_for_loop = map; + if (prev && (map->num_members < prev->num_members)) + map_for_loop = prev; + + for (i = 0; i < map_for_loop->num_members; i++) { + idx_1 = -255; + /* when MAP_X is passed both maps failures are counted + */ + if (prev && + (look_in_map == MAP_1 || look_in_map == MAP_X) && + i < prev->num_members) { + ord = __le32_to_cpu(prev->disk_ord_tbl[i]); + idx_1 = ord_to_idx(ord); + + disk = get_imsm_disk(super, idx_1); + if (!disk || is_failed(disk) || ord & IMSM_ORD_REBUILD) + failed++; + } + if ((look_in_map == MAP_0 || look_in_map == MAP_X) && + i < map->num_members) { + ord = __le32_to_cpu(map->disk_ord_tbl[i]); + idx = ord_to_idx(ord); + + if (idx != idx_1) { + disk = get_imsm_disk(super, idx); + if (!disk || is_failed(disk) || + ord & IMSM_ORD_REBUILD) + failed++; + } + } + } + + return failed; +} + +static int imsm_open_new(struct supertype *c, struct active_array *a, + int inst) +{ + struct intel_super *super = c->sb; + struct imsm_super *mpb = super->anchor; + struct imsm_update_prealloc_bb_mem u; + + if (inst >= mpb->num_raid_devs) { + pr_err("subarry index %d, out of range\n", inst); + return -ENODEV; + } + + dprintf("imsm: open_new %d\n", inst); + a->info.container_member = inst; + + u.type = update_prealloc_badblocks_mem; + imsm_update_metadata_locally(c, &u, sizeof(u)); + + return 0; +} + +static int is_resyncing(struct imsm_dev *dev) +{ + struct imsm_map *migr_map; + + if (!dev->vol.migr_state) + return 0; + + if (migr_type(dev) == MIGR_INIT || + migr_type(dev) == MIGR_REPAIR) + return 1; + + if (migr_type(dev) == MIGR_GEN_MIGR) + return 0; + + migr_map = get_imsm_map(dev, MAP_1); + + if (migr_map->map_state == IMSM_T_STATE_NORMAL && + dev->vol.migr_type != MIGR_GEN_MIGR) + return 1; + else + return 0; +} + +/* return true if we recorded new information */ +static int mark_failure(struct intel_super 
*super, + struct imsm_dev *dev, struct imsm_disk *disk, int idx) +{ + __u32 ord; + int slot; + struct imsm_map *map; + char buf[MAX_RAID_SERIAL_LEN+3]; + unsigned int len, shift = 0; + + /* new failures are always set in map[0] */ + map = get_imsm_map(dev, MAP_0); + + slot = get_imsm_disk_slot(map, idx); + if (slot < 0) + return 0; + + ord = __le32_to_cpu(map->disk_ord_tbl[slot]); + if (is_failed(disk) && (ord & IMSM_ORD_REBUILD)) + return 0; + + memcpy(buf, disk->serial, MAX_RAID_SERIAL_LEN); + buf[MAX_RAID_SERIAL_LEN] = '\000'; + strcat(buf, ":0"); + if ((len = strlen(buf)) >= MAX_RAID_SERIAL_LEN) + shift = len - MAX_RAID_SERIAL_LEN + 1; + memcpy(disk->serial, &buf[shift], len + 1 - shift); + + disk->status |= FAILED_DISK; + set_imsm_ord_tbl_ent(map, slot, idx | IMSM_ORD_REBUILD); + /* mark failures in second map if second map exists and this disk + * in this slot. + * This is valid for migration, initialization and rebuild + */ + if (dev->vol.migr_state) { + struct imsm_map *map2 = get_imsm_map(dev, MAP_1); + int slot2 = get_imsm_disk_slot(map2, idx); + + if (slot2 < map2->num_members && slot2 >= 0) + set_imsm_ord_tbl_ent(map2, slot2, + idx | IMSM_ORD_REBUILD); + } + if (map->failed_disk_num == 0xff || + (!is_rebuilding(dev) && map->failed_disk_num > slot)) + map->failed_disk_num = slot; + + clear_disk_badblocks(super->bbm_log, ord_to_idx(ord)); + + return 1; +} + +static void mark_missing(struct intel_super *super, + struct imsm_dev *dev, struct imsm_disk *disk, int idx) +{ + mark_failure(super, dev, disk, idx); + + if (disk->scsi_id == __cpu_to_le32(~(__u32)0)) + return; + + disk->scsi_id = __cpu_to_le32(~(__u32)0); + memmove(&disk->serial[0], &disk->serial[1], MAX_RAID_SERIAL_LEN - 1); +} + +static void handle_missing(struct intel_super *super, struct imsm_dev *dev) +{ + struct dl *dl; + + if (!super->missing) + return; + + /* When orom adds replacement for missing disk it does + * not remove entry of missing disk, but just updates map with + * new added 
disk. So it is not enough just to test if there is + * any missing disk, we have to look if there are any failed disks + * in map to stop migration */ + + dprintf("imsm: mark missing\n"); + /* end process for initialization and rebuild only + */ + if (is_gen_migration(dev) == false) { + int failed = imsm_count_failed(super, dev, MAP_0); + + if (failed) { + __u8 map_state; + struct imsm_map *map = get_imsm_map(dev, MAP_0); + struct imsm_map *map1; + int i, ord, ord_map1; + int rebuilt = 1; + + for (i = 0; i < map->num_members; i++) { + ord = get_imsm_ord_tbl_ent(dev, i, MAP_0); + if (!(ord & IMSM_ORD_REBUILD)) + continue; + + map1 = get_imsm_map(dev, MAP_1); + if (!map1) + continue; + + ord_map1 = __le32_to_cpu(map1->disk_ord_tbl[i]); + if (ord_map1 & IMSM_ORD_REBUILD) + rebuilt = 0; + } + + if (rebuilt) { + map_state = imsm_check_degraded(super, dev, + failed, MAP_0); + end_migration(dev, super, map_state); + } + } + } + for (dl = super->missing; dl; dl = dl->next) + mark_missing(super, dev, &dl->disk, dl->index); + super->updates_pending++; +} + +static unsigned long long imsm_set_array_size(struct imsm_dev *dev, + long long new_size) +{ + unsigned long long array_blocks; + struct imsm_map *map = get_imsm_map(dev, MAP_0); + int used_disks = imsm_num_data_members(map); + + if (used_disks == 0) { + /* when problems occures + * return current array_blocks value + */ + array_blocks = imsm_dev_size(dev); + + return array_blocks; + } + + /* set array size in metadata + */ + if (new_size <= 0) + /* OLCE size change is caused by added disks + */ + array_blocks = per_dev_array_size(map) * used_disks; + else + /* Online Volume Size Change + * Using available free space + */ + array_blocks = new_size; + + array_blocks = round_size_to_mb(array_blocks, used_disks); + set_imsm_dev_size(dev, array_blocks); + + return array_blocks; +} + +static void imsm_set_disk(struct active_array *a, int n, int state); + +static void imsm_progress_container_reshape(struct intel_super *super) 
+{ + /* if no device has a migr_state, but some device has a + * different number of members than the previous device, start + * changing the number of devices in this device to match + * previous. + */ + struct imsm_super *mpb = super->anchor; + int prev_disks = -1; + int i; + int copy_map_size; + + for (i = 0; i < mpb->num_raid_devs; i++) { + struct imsm_dev *dev = get_imsm_dev(super, i); + struct imsm_map *map = get_imsm_map(dev, MAP_0); + struct imsm_map *map2; + int prev_num_members; + + if (dev->vol.migr_state) + return; + + if (prev_disks == -1) + prev_disks = map->num_members; + if (prev_disks == map->num_members) + continue; + + /* OK, this array needs to enter reshape mode. + * i.e it needs a migr_state + */ + + copy_map_size = sizeof_imsm_map(map); + prev_num_members = map->num_members; + map->num_members = prev_disks; + dev->vol.migr_state = 1; + set_vol_curr_migr_unit(dev, 0); + set_migr_type(dev, MIGR_GEN_MIGR); + for (i = prev_num_members; + i < map->num_members; i++) + set_imsm_ord_tbl_ent(map, i, i); + map2 = get_imsm_map(dev, MAP_1); + /* Copy the current map */ + memcpy(map2, map, copy_map_size); + map2->num_members = prev_num_members; + + imsm_set_array_size(dev, -1); + super->clean_migration_record_by_mdmon = 1; + super->updates_pending++; + } +} + +/* Handle dirty -> clean transititions, resync and reshape. Degraded and rebuild + * states are handled in imsm_set_disk() with one exception, when a + * resync is stopped due to a new failure this routine will set the + * 'degraded' state for the array. 
+ */ +static int imsm_set_array_state(struct active_array *a, int consistent) +{ + int inst = a->info.container_member; + struct intel_super *super = a->container->sb; + struct imsm_dev *dev = get_imsm_dev(super, inst); + struct imsm_map *map = get_imsm_map(dev, MAP_0); + int failed = imsm_count_failed(super, dev, MAP_0); + __u8 map_state = imsm_check_degraded(super, dev, failed, MAP_0); + __u32 blocks_per_unit; + + if (dev->vol.migr_state && + dev->vol.migr_type == MIGR_GEN_MIGR) { + /* array state change is blocked due to reshape action + * We might need to + * - abort the reshape (if last_checkpoint is 0 and action!= reshape) + * - finish the reshape (if last_checkpoint is big and action != reshape) + * - update vol_curr_migr_unit + */ + if (a->curr_action == reshape) { + /* still reshaping, maybe update vol_curr_migr_unit */ + goto mark_checkpoint; + } else { + if (a->last_checkpoint == 0 && a->prev_action == reshape) { + /* for some reason we aborted the reshape. + * + * disable automatic metadata rollback + * user action is required to recover process + */ + if (0) { + struct imsm_map *map2 = + get_imsm_map(dev, MAP_1); + dev->vol.migr_state = 0; + set_migr_type(dev, 0); + set_vol_curr_migr_unit(dev, 0); + memcpy(map, map2, + sizeof_imsm_map(map2)); + super->updates_pending++; + } + } + if (a->last_checkpoint >= a->info.component_size) { + unsigned long long array_blocks; + int used_disks; + struct mdinfo *mdi; + + used_disks = imsm_num_data_members(map); + if (used_disks > 0) { + array_blocks = + per_dev_array_size(map) * + used_disks; + array_blocks = + round_size_to_mb(array_blocks, + used_disks); + a->info.custom_array_size = array_blocks; + /* encourage manager to update array + * size + */ + + a->check_reshape = 1; + } + /* finalize online capacity expansion/reshape */ + for (mdi = a->info.devs; mdi; mdi = mdi->next) + imsm_set_disk(a, + mdi->disk.raid_disk, + mdi->curr_state); + + imsm_progress_container_reshape(super); + } + } + } + + /* before we 
activate this array handle any missing disks */ + if (consistent == 2) + handle_missing(super, dev); + + if (consistent == 2 && + (!is_resync_complete(&a->info) || + map_state != IMSM_T_STATE_NORMAL || + dev->vol.migr_state)) + consistent = 0; + + if (is_resync_complete(&a->info)) { + /* complete intialization / resync, + * recovery and interrupted recovery is completed in + * ->set_disk + */ + if (is_resyncing(dev)) { + dprintf("imsm: mark resync done\n"); + end_migration(dev, super, map_state); + super->updates_pending++; + a->last_checkpoint = 0; + } + } else if ((!is_resyncing(dev) && !failed) && + (imsm_reshape_blocks_arrays_changes(super) == 0)) { + /* mark the start of the init process if nothing is failed */ + dprintf("imsm: mark resync start\n"); + if (map->map_state == IMSM_T_STATE_UNINITIALIZED) + migrate(dev, super, IMSM_T_STATE_NORMAL, MIGR_INIT); + else + migrate(dev, super, IMSM_T_STATE_NORMAL, MIGR_REPAIR); + super->updates_pending++; + } + +mark_checkpoint: + /* skip checkpointing for general migration, + * it is controlled in mdadm + */ + if (is_gen_migration(dev)) + goto skip_mark_checkpoint; + + /* check if we can update vol_curr_migr_unit from resync_start, + * recovery_start + */ + blocks_per_unit = blocks_per_migr_unit(super, dev); + if (blocks_per_unit) { + set_vol_curr_migr_unit(dev, + a->last_checkpoint / blocks_per_unit); + dprintf("imsm: mark checkpoint (%llu)\n", + vol_curr_migr_unit(dev)); + super->updates_pending++; + } + +skip_mark_checkpoint: + /* mark dirty / clean */ + if (((dev->vol.dirty & RAIDVOL_DIRTY) && consistent) || + (!(dev->vol.dirty & RAIDVOL_DIRTY) && !consistent)) { + dprintf("imsm: mark '%s'\n", consistent ? 
"clean" : "dirty"); + if (consistent) { + dev->vol.dirty = RAIDVOL_CLEAN; + } else { + dev->vol.dirty = RAIDVOL_DIRTY; + if (dev->rwh_policy == RWH_DISTRIBUTED || + dev->rwh_policy == RWH_MULTIPLE_DISTRIBUTED) + dev->vol.dirty |= RAIDVOL_DSRECORD_VALID; + } + super->updates_pending++; + } + + return consistent; +} + +static int imsm_disk_slot_to_ord(struct active_array *a, int slot) +{ + int inst = a->info.container_member; + struct intel_super *super = a->container->sb; + struct imsm_dev *dev = get_imsm_dev(super, inst); + struct imsm_map *map = get_imsm_map(dev, MAP_0); + + if (slot > map->num_members) { + pr_err("imsm: imsm_disk_slot_to_ord %d out of range 0..%d\n", + slot, map->num_members - 1); + return -1; + } + + if (slot < 0) + return -1; + + return get_imsm_ord_tbl_ent(dev, slot, MAP_0); +} + +static void imsm_set_disk(struct active_array *a, int n, int state) +{ + int inst = a->info.container_member; + struct intel_super *super = a->container->sb; + struct imsm_dev *dev = get_imsm_dev(super, inst); + struct imsm_map *map = get_imsm_map(dev, MAP_0); + struct imsm_disk *disk; + struct mdinfo *mdi; + int recovery_not_finished = 0; + int failed; + int ord; + __u8 map_state; + int rebuild_done = 0; + int i; + + ord = get_imsm_ord_tbl_ent(dev, n, MAP_X); + if (ord < 0) + return; + + dprintf("imsm: set_disk %d:%x\n", n, state); + disk = get_imsm_disk(super, ord_to_idx(ord)); + + /* check for new failures */ + if (disk && (state & DS_FAULTY)) { + if (mark_failure(super, dev, disk, ord_to_idx(ord))) + super->updates_pending++; + } + + /* check if in_sync */ + if (state & DS_INSYNC && ord & IMSM_ORD_REBUILD && is_rebuilding(dev)) { + struct imsm_map *migr_map = get_imsm_map(dev, MAP_1); + + set_imsm_ord_tbl_ent(migr_map, n, ord_to_idx(ord)); + rebuild_done = 1; + super->updates_pending++; + } + + failed = imsm_count_failed(super, dev, MAP_0); + map_state = imsm_check_degraded(super, dev, failed, MAP_0); + + /* check if recovery complete, newly degraded, or failed 
*/ + dprintf("imsm: Detected transition to state "); + switch (map_state) { + case IMSM_T_STATE_NORMAL: /* transition to normal state */ + dprintf("normal: "); + if (is_rebuilding(dev)) { + dprintf_cont("while rebuilding"); + /* check if recovery is really finished */ + for (mdi = a->info.devs; mdi ; mdi = mdi->next) + if (mdi->recovery_start != MaxSector) { + recovery_not_finished = 1; + break; + } + if (recovery_not_finished) { + dprintf_cont("\n"); + dprintf("Rebuild has not finished yet, state not changed"); + if (a->last_checkpoint < mdi->recovery_start) { + a->last_checkpoint = mdi->recovery_start; + super->updates_pending++; + } + break; + } + end_migration(dev, super, map_state); + map->failed_disk_num = ~0; + super->updates_pending++; + a->last_checkpoint = 0; + break; + } + if (is_gen_migration(dev)) { + dprintf_cont("while general migration"); + if (a->last_checkpoint >= a->info.component_size) + end_migration(dev, super, map_state); + else + map->map_state = map_state; + map->failed_disk_num = ~0; + super->updates_pending++; + break; + } + break; + case IMSM_T_STATE_DEGRADED: /* transition to degraded state */ + dprintf_cont("degraded: "); + if (map->map_state != map_state && !dev->vol.migr_state) { + dprintf_cont("mark degraded"); + map->map_state = map_state; + super->updates_pending++; + a->last_checkpoint = 0; + break; + } + if (is_rebuilding(dev)) { + dprintf_cont("while rebuilding "); + if (state & DS_FAULTY) { + dprintf_cont("removing failed drive "); + if (n == map->failed_disk_num) { + dprintf_cont("end migration"); + end_migration(dev, super, map_state); + a->last_checkpoint = 0; + } else { + dprintf_cont("fail detected during rebuild, changing map state"); + map->map_state = map_state; + } + super->updates_pending++; + } + + if (!rebuild_done) + break; + + /* check if recovery is really finished */ + for (mdi = a->info.devs; mdi ; mdi = mdi->next) + if (mdi->recovery_start != MaxSector) { + recovery_not_finished = 1; + break; + } + if 
(recovery_not_finished) { + dprintf_cont("\n"); + dprintf_cont("Rebuild has not finished yet"); + if (a->last_checkpoint < mdi->recovery_start) { + a->last_checkpoint = + mdi->recovery_start; + super->updates_pending++; + } + break; + } + + dprintf_cont(" Rebuild done, still degraded"); + end_migration(dev, super, map_state); + a->last_checkpoint = 0; + super->updates_pending++; + + for (i = 0; i < map->num_members; i++) { + int idx = get_imsm_ord_tbl_ent(dev, i, MAP_0); + + if (idx & IMSM_ORD_REBUILD) + map->failed_disk_num = i; + } + super->updates_pending++; + break; + } + if (is_gen_migration(dev)) { + dprintf_cont("while general migration"); + if (a->last_checkpoint >= a->info.component_size) + end_migration(dev, super, map_state); + else { + map->map_state = map_state; + manage_second_map(super, dev); + } + super->updates_pending++; + break; + } + if (is_initializing(dev)) { + dprintf_cont("while initialization."); + map->map_state = map_state; + super->updates_pending++; + break; + } + break; + case IMSM_T_STATE_FAILED: /* transition to failed state */ + dprintf_cont("failed: "); + if (is_gen_migration(dev)) { + dprintf_cont("while general migration"); + map->map_state = map_state; + super->updates_pending++; + break; + } + if (map->map_state != map_state) { + dprintf_cont("mark failed"); + end_migration(dev, super, map_state); + super->updates_pending++; + a->last_checkpoint = 0; + break; + } + break; + default: + dprintf_cont("state %i\n", map_state); + } + dprintf_cont("\n"); +} + +static int store_imsm_mpb(int fd, struct imsm_super *mpb) +{ + void *buf = mpb; + __u32 mpb_size = __le32_to_cpu(mpb->mpb_size); + unsigned long long dsize; + unsigned long long sectors; + unsigned int sector_size; + + if (!get_dev_sector_size(fd, NULL, §or_size)) + return 1; + get_dev_size(fd, NULL, &dsize); + + if (mpb_size > sector_size) { + /* -1 to account for anchor */ + sectors = mpb_sectors(mpb, sector_size) - 1; + + /* write the extended mpb to the sectors preceeding 
the anchor */ + if (lseek64(fd, dsize - (sector_size * (2 + sectors)), + SEEK_SET) < 0) + return 1; + + if ((unsigned long long)write(fd, buf + sector_size, + sector_size * sectors) != sector_size * sectors) + return 1; + } + + /* first block is stored on second to last sector of the disk */ + if (lseek64(fd, dsize - (sector_size * 2), SEEK_SET) < 0) + return 1; + + if ((unsigned int)write(fd, buf, sector_size) != sector_size) + return 1; + + return 0; +} + +static void imsm_sync_metadata(struct supertype *container) +{ + struct intel_super *super = container->sb; + + dprintf("sync metadata: %d\n", super->updates_pending); + if (!super->updates_pending) + return; + + write_super_imsm(container, 0); + + super->updates_pending = 0; +} + +static struct dl *imsm_readd(struct intel_super *super, int idx, struct active_array *a) +{ + struct imsm_dev *dev = get_imsm_dev(super, a->info.container_member); + int i = get_imsm_disk_idx(dev, idx, MAP_X); + struct dl *dl; + + for (dl = super->disks; dl; dl = dl->next) + if (dl->index == i) + break; + + if (dl && is_failed(&dl->disk)) + dl = NULL; + + if (dl) + dprintf("found %x:%x\n", dl->major, dl->minor); + + return dl; +} + +static struct dl *imsm_add_spare(struct intel_super *super, int slot, + struct active_array *a, int activate_new, + struct mdinfo *additional_test_list) +{ + struct imsm_dev *dev = get_imsm_dev(super, a->info.container_member); + int idx = get_imsm_disk_idx(dev, slot, MAP_X); + struct imsm_super *mpb = super->anchor; + struct imsm_map *map; + unsigned long long pos; + struct mdinfo *d; + struct extent *ex; + int i, j; + int found; + __u32 array_start = 0; + __u32 array_end = 0; + struct dl *dl; + struct mdinfo *test_list; + + for (dl = super->disks; dl; dl = dl->next) { + /* If in this array, skip */ + for (d = a->info.devs ; d ; d = d->next) + if (is_fd_valid(d->state_fd) && + d->disk.major == dl->major && + d->disk.minor == dl->minor) { + dprintf("%x:%x already in array\n", + dl->major, dl->minor); + 
break; + } + if (d) + continue; + test_list = additional_test_list; + while (test_list) { + if (test_list->disk.major == dl->major && + test_list->disk.minor == dl->minor) { + dprintf("%x:%x already in additional test list\n", + dl->major, dl->minor); + break; + } + test_list = test_list->next; + } + if (test_list) + continue; + + /* skip in use or failed drives */ + if (is_failed(&dl->disk) || idx == dl->index || + dl->index == -2) { + dprintf("%x:%x status (failed: %d index: %d)\n", + dl->major, dl->minor, is_failed(&dl->disk), idx); + continue; + } + + /* skip pure spares when we are looking for partially + * assimilated drives + */ + if (dl->index == -1 && !activate_new) + continue; + + if (!drive_validate_sector_size(super, dl)) + continue; + + /* Does this unused device have the requisite free space? + * It needs to be able to cover all member volumes + */ + ex = get_extents(super, dl, 1); + if (!ex) { + dprintf("cannot get extents\n"); + continue; + } + for (i = 0; i < mpb->num_raid_devs; i++) { + dev = get_imsm_dev(super, i); + map = get_imsm_map(dev, MAP_0); + + /* check if this disk is already a member of + * this array + */ + if (get_imsm_disk_slot(map, dl->index) >= 0) + continue; + + found = 0; + j = 0; + pos = 0; + array_start = pba_of_lba0(map); + array_end = array_start + + per_dev_array_size(map) - 1; + + do { + /* check that we can start at pba_of_lba0 with + * num_data_stripes*blocks_per_stripe of space + */ + if (array_start >= pos && array_end < ex[j].start) { + found = 1; + break; + } + pos = ex[j].start + ex[j].size; + j++; + } while (ex[j-1].size); + + if (!found) + break; + } + + free(ex); + if (i < mpb->num_raid_devs) { + dprintf("%x:%x does not have %u to %u available\n", + dl->major, dl->minor, array_start, array_end); + /* No room */ + continue; + } + return dl; + } + + return dl; +} + +static int imsm_rebuild_allowed(struct supertype *cont, int dev_idx, int failed) +{ + struct imsm_dev *dev2; + struct imsm_map *map; + struct dl *idisk; 
	int slot;
	int idx;
	__u8 state;

	/* Returns 0 only when volume 'dev_idx' is in the FAILED state and
	 * one of its member disks is failed without being marked
	 * DISK_REMOVE; returns 1 in every other case.
	 */
	dev2 = get_imsm_dev(cont->sb, dev_idx);
	if (dev2) {
		state = imsm_check_degraded(cont->sb, dev2, failed, MAP_0);
		if (state == IMSM_T_STATE_FAILED) {
			map = get_imsm_map(dev2, MAP_0);
			if (!map)
				return 1;
			for (slot = 0; slot < map->num_members; slot++) {
				/*
				 * Check if failed disks are deleted from intel
				 * disk list or are marked to be deleted
				 */
				idx = get_imsm_disk_idx(dev2, slot, MAP_X);
				idisk = get_imsm_dl_disk(cont->sb, idx);
				/*
				 * Do not rebuild the array if failed disks
				 * from failed sub-array are not removed from
				 * container.
				 */
				if (idisk &&
				    is_failed(&idisk->disk) &&
				    (idisk->action != DISK_REMOVE))
					return 0;
			}
		}
	}
	return 1;
}

/* Returns a list of mdinfo entries, one per slot for which a replacement
 * disk was found, and prepends a matching update_activate_spare record to
 * '*updates'.  Returns NULL when nothing can or should be activated.
 */
static struct mdinfo *imsm_activate_spare(struct active_array *a,
					  struct metadata_update **updates)
{
	/**
	 * Find a device with unused free space and use it to replace a
	 * failed/vacant region in an array.  We replace failed regions one a
	 * array at a time.  The result is that a new spare disk will be added
	 * to the first failed array and after the monitor has finished
	 * propagating failures the remainder will be consumed.
	 *
	 * FIXME add a capability for mdmon to request spares from another
	 * container.
	 */

	struct intel_super *super = a->container->sb;
	int inst = a->info.container_member;
	struct imsm_dev *dev = get_imsm_dev(super, inst);
	struct imsm_map *map = get_imsm_map(dev, MAP_0);
	int failed = a->info.array.raid_disks;
	struct mdinfo *rv = NULL;
	struct mdinfo *d;
	struct mdinfo *di;
	struct metadata_update *mu;
	struct dl *dl;
	struct imsm_update_activate_spare *u;
	int num_spares = 0;
	int i;
	int allowed;

	/* 'failed' starts at raid_disks and is decremented once per device
	 * with a valid state_fd, leaving the number of absent slots
	 */
	for (d = a->info.devs ; d; d = d->next) {
		if (!is_fd_valid(d->state_fd))
			continue;

		if (d->curr_state & DS_FAULTY)
			/* wait for Removal to happen */
			return NULL;

		failed--;
	}

	dprintf("imsm: activate spare: inst=%d failed=%d (%d) level=%d\n",
		inst, failed, a->info.array.raid_disks, a->info.array.level);

	if (imsm_reshape_blocks_arrays_changes(super))
		return NULL;

	/* Cannot activate another spare if rebuild is in progress already
	 */
	if (is_rebuilding(dev)) {
		dprintf("imsm: No spare activation allowed. Rebuild in progress already.\n");
		return NULL;
	}

	if (a->info.array.level == 4)
		/* No repair for takeovered array
		 * imsm doesn't support raid4
		 */
		return NULL;

	if (imsm_check_degraded(super, dev, failed, MAP_0) !=
			IMSM_T_STATE_DEGRADED)
		return NULL;

	if (get_imsm_map(dev, MAP_0)->map_state == IMSM_T_STATE_UNINITIALIZED) {
		dprintf("imsm: No spare activation allowed. Volume is not initialized.\n");
		return NULL;
	}

	/*
	 * If there are any failed disks check state of the other volume.
	 * Block rebuild if the another one is failed until failed disks
	 * are removed from container.
	 */
	if (failed) {
		dprintf("found failed disks in %.*s, check if there anotherfailed sub-array.\n",
			MAX_RAID_SERIAL_LEN, dev->volume);
		/* check if states of the other volumes allow for rebuild */
		for (i = 0; i < super->anchor->num_raid_devs; i++) {
			if (i != inst) {
				allowed = imsm_rebuild_allowed(a->container,
							       i, failed);
				if (!allowed)
					return NULL;
			}
		}
	}

	/* For each slot, if it is not working, find a spare */
	for (i = 0; i < a->info.array.raid_disks; i++) {
		for (d = a->info.devs ; d ; d = d->next)
			if (d->disk.raid_disk == i)
				break;
		dprintf("found %d: %p %x\n", i, d, d?d->curr_state:0);
		if (d && is_fd_valid(d->state_fd))
			continue;

		/*
		 * OK, this device needs recovery.  Try to re-add the
		 * previous occupant of this slot, if this fails see if
		 * we can continue the assimilation of a spare that was
		 * partially assimilated, finally try to activate a new
		 * spare.
		 */
		dl = imsm_readd(super, i, a);
		if (!dl)
			dl = imsm_add_spare(super, i, a, 0, rv);
		if (!dl)
			dl = imsm_add_spare(super, i, a, 1, rv);
		if (!dl)
			continue;

		/* found a usable disk with enough space */
		di = xcalloc(1, sizeof(*di));

		/* dl->index will be -1 in the case we are activating a
		 * pristine spare.  imsm_process_update() will create a
		 * new index in this case.  Once a disk is found to be
		 * failed in all member arrays it is kicked from the
		 * metadata
		 */
		di->disk.number = dl->index;

		/* (ab)use di->devs to store a pointer to the device
		 * we chose
		 */
		di->devs = (struct mdinfo *) dl;

		di->disk.raid_disk = i;
		di->disk.major = dl->major;
		di->disk.minor = dl->minor;
		di->disk.state = 0;
		di->recovery_start = 0;
		di->data_offset = pba_of_lba0(map);
		di->component_size = a->info.component_size;
		di->container_member = inst;
		di->bb.supported = 1;
		if (a->info.consistency_policy == CONSISTENCY_POLICY_PPL) {
			di->ppl_sector = get_ppl_sector(super, inst);
			di->ppl_size = MULTIPLE_PPL_AREA_SIZE_IMSM >> 9;
		}
		super->random = random32();
		/* push onto the result list; it is also handed to the next
		 * imsm_add_spare() call so the same disk is not picked twice
		 */
		di->next = rv;
		rv = di;
		num_spares++;
		dprintf("%x:%x to be %d at %llu\n", dl->major, dl->minor,
			i, di->data_offset);
	}

	if (!rv)
		/* No spares found */
		return rv;
	/* Now 'rv' has a list of devices to return.
	 * Create a metadata_update record to update the
	 * disk_ord_tbl for the array
	 */
	mu = xmalloc(sizeof(*mu));
	mu->buf = xcalloc(num_spares,
			  sizeof(struct imsm_update_activate_spare));
	mu->space = NULL;
	mu->space_list = NULL;
	mu->len = sizeof(struct imsm_update_activate_spare) * num_spares;
	mu->next = *updates;
	u = (struct imsm_update_activate_spare *) mu->buf;

	/* one update entry per chosen disk, chained through ->next inside
	 * the single buffer; the borrowed di->devs pointer is cleared again
	 */
	for (di = rv ; di ; di = di->next) {
		u->type = update_activate_spare;
		u->dl = (struct dl *) di->devs;
		di->devs = NULL;
		u->slot = di->disk.raid_disk;
		u->array = inst;
		u->next = u + 1;
		u++;
	}
	(u-1)->next = NULL;
	*updates = mu;

	return rv;
}

/* Returns 1 when any member disk of existing volume 'idx' has the same
 * serial as one of the disks listed in create-array update 'u', else 0.
 */
static int disks_overlap(struct intel_super *super, int idx, struct imsm_update_create_array *u)
{
	struct imsm_dev *dev = get_imsm_dev(super, idx);
	struct imsm_map *map = get_imsm_map(dev, MAP_0);
	struct imsm_map *new_map = get_imsm_map(&u->dev, MAP_0);
	struct disk_info *inf = get_disk_info(u);
	struct imsm_disk *disk;
	int i;
	int j;

	for (i = 0; i < map->num_members; i++) {
		disk =
get_imsm_disk(super, get_imsm_disk_idx(dev, i, MAP_X)); + for (j = 0; j < new_map->num_members; j++) + if (serialcmp(disk->serial, inf[j].serial) == 0) + return 1; + } + + return 0; +} + +static struct dl *get_disk_super(struct intel_super *super, int major, int minor) +{ + struct dl *dl; + + for (dl = super->disks; dl; dl = dl->next) + if (dl->major == major && dl->minor == minor) + return dl; + return NULL; +} + +static int remove_disk_super(struct intel_super *super, int major, int minor) +{ + struct dl *prev; + struct dl *dl; + + prev = NULL; + for (dl = super->disks; dl; dl = dl->next) { + if (dl->major == major && dl->minor == minor) { + /* remove */ + if (prev) + prev->next = dl->next; + else + super->disks = dl->next; + dl->next = NULL; + __free_imsm_disk(dl, 1); + dprintf("removed %x:%x\n", major, minor); + break; + } + prev = dl; + } + return 0; +} + +static void imsm_delete(struct intel_super *super, struct dl **dlp, unsigned index); + +static int add_remove_disk_update(struct intel_super *super) +{ + int check_degraded = 0; + struct dl *disk; + + /* add/remove some spares to/from the metadata/contrainer */ + while (super->disk_mgmt_list) { + struct dl *disk_cfg; + + disk_cfg = super->disk_mgmt_list; + super->disk_mgmt_list = disk_cfg->next; + disk_cfg->next = NULL; + + if (disk_cfg->action == DISK_ADD) { + disk_cfg->next = super->disks; + super->disks = disk_cfg; + check_degraded = 1; + dprintf("added %x:%x\n", + disk_cfg->major, disk_cfg->minor); + } else if (disk_cfg->action == DISK_REMOVE) { + dprintf("Disk remove action processed: %x.%x\n", + disk_cfg->major, disk_cfg->minor); + disk = get_disk_super(super, + disk_cfg->major, + disk_cfg->minor); + if (disk) { + /* store action status */ + disk->action = DISK_REMOVE; + /* remove spare disks only */ + if (disk->index == -1) { + remove_disk_super(super, + disk_cfg->major, + disk_cfg->minor); + } else { + disk_cfg->fd = disk->fd; + disk->fd = -1; + } + } + /* release allocate disk structure */ + 
__free_imsm_disk(disk_cfg, 1); + } + } + return check_degraded; +} + +static int apply_reshape_migration_update(struct imsm_update_reshape_migration *u, + struct intel_super *super, + void ***space_list) +{ + struct intel_dev *id; + void **tofree = NULL; + int ret_val = 0; + + dprintf("(enter)\n"); + if (u->subdev < 0 || u->subdev > 1) { + dprintf("imsm: Error: Wrong subdev: %i\n", u->subdev); + return ret_val; + } + if (space_list == NULL || *space_list == NULL) { + dprintf("imsm: Error: Memory is not allocated\n"); + return ret_val; + } + + for (id = super->devlist ; id; id = id->next) { + if (id->index == (unsigned)u->subdev) { + struct imsm_dev *dev = get_imsm_dev(super, u->subdev); + struct imsm_map *map; + struct imsm_dev *new_dev = + (struct imsm_dev *)*space_list; + struct imsm_map *migr_map = get_imsm_map(dev, MAP_1); + int to_state; + struct dl *new_disk; + + if (new_dev == NULL) + return ret_val; + *space_list = **space_list; + memcpy(new_dev, dev, sizeof_imsm_dev(dev, 0)); + map = get_imsm_map(new_dev, MAP_0); + if (migr_map) { + dprintf("imsm: Error: migration in progress"); + return ret_val; + } + + to_state = map->map_state; + if ((u->new_level == 5) && (map->raid_level == 0)) { + map->num_members++; + /* this should not happen */ + if (u->new_disks[0] < 0) { + map->failed_disk_num = + map->num_members - 1; + to_state = IMSM_T_STATE_DEGRADED; + } else + to_state = IMSM_T_STATE_NORMAL; + } + migrate(new_dev, super, to_state, MIGR_GEN_MIGR); + if (u->new_level > -1) + map->raid_level = u->new_level; + migr_map = get_imsm_map(new_dev, MAP_1); + if ((u->new_level == 5) && + (migr_map->raid_level == 0)) { + int ord = map->num_members - 1; + migr_map->num_members--; + if (u->new_disks[0] < 0) + ord |= IMSM_ORD_REBUILD; + set_imsm_ord_tbl_ent(map, + map->num_members - 1, + ord); + } + id->dev = new_dev; + tofree = (void **)dev; + + /* update chunk size + */ + if (u->new_chunksize > 0) { + struct imsm_map *dest_map = + get_imsm_map(dev, MAP_0); + int 
used_disks = + imsm_num_data_members(dest_map); + + if (used_disks == 0) + return ret_val; + + map->blocks_per_strip = + __cpu_to_le16(u->new_chunksize * 2); + update_num_data_stripes(map, imsm_dev_size(dev)); + } + + /* ensure blocks_per_member has valid value + */ + set_blocks_per_member(map, + per_dev_array_size(map) + + NUM_BLOCKS_DIRTY_STRIPE_REGION); + + /* add disk + */ + if (u->new_level != 5 || migr_map->raid_level != 0 || + migr_map->raid_level == map->raid_level) + goto skip_disk_add; + + if (u->new_disks[0] >= 0) { + /* use passes spare + */ + new_disk = get_disk_super(super, + major(u->new_disks[0]), + minor(u->new_disks[0])); + dprintf("imsm: new disk for reshape is: %i:%i (%p, index = %i)\n", + major(u->new_disks[0]), + minor(u->new_disks[0]), + new_disk, new_disk->index); + if (new_disk == NULL) + goto error_disk_add; + + new_disk->index = map->num_members - 1; + /* slot to fill in autolayout + */ + new_disk->raiddisk = new_disk->index; + new_disk->disk.status |= CONFIGURED_DISK; + new_disk->disk.status &= ~SPARE_DISK; + } else + goto error_disk_add; + +skip_disk_add: + *tofree = *space_list; + /* calculate new size + */ + imsm_set_array_size(new_dev, -1); + + ret_val = 1; + } + } + + if (tofree) + *space_list = tofree; + return ret_val; + +error_disk_add: + dprintf("Error: imsm: Cannot find disk.\n"); + return ret_val; +} + +static int apply_size_change_update(struct imsm_update_size_change *u, + struct intel_super *super) +{ + struct intel_dev *id; + int ret_val = 0; + + dprintf("(enter)\n"); + if (u->subdev < 0 || u->subdev > 1) { + dprintf("imsm: Error: Wrong subdev: %i\n", u->subdev); + return ret_val; + } + + for (id = super->devlist ; id; id = id->next) { + if (id->index == (unsigned)u->subdev) { + struct imsm_dev *dev = get_imsm_dev(super, u->subdev); + struct imsm_map *map = get_imsm_map(dev, MAP_0); + int used_disks = imsm_num_data_members(map); + unsigned long long blocks_per_member; + unsigned long long new_size_per_disk; + + if 
(used_disks == 0)
				return 0;

			/* calculate new size
			 */
			new_size_per_disk = u->new_size / used_disks;
			blocks_per_member = new_size_per_disk +
					    NUM_BLOCKS_DIRTY_STRIPE_REGION;

			imsm_set_array_size(dev, u->new_size);
			set_blocks_per_member(map, blocks_per_member);
			update_num_data_stripes(map, u->new_size);
			ret_val = 1;
			break;
		}
	}

	return ret_val;
}

/* For the array named by u->array, when it uses a write-intent bitmap,
 * write the bitmap to the spare u->dl before it is activated.
 * Returns 1 on success (or when nothing had to be done), 0 when
 * write_bitmap() reported failure.
 */
static int prepare_spare_to_activate(struct supertype *st,
				     struct imsm_update_activate_spare *u)
{
	struct intel_super *super = st->sb;
	int prev_current_vol = super->current_vol;
	struct active_array *a;
	int ret = 1;

	for (a = st->arrays; a; a = a->next)
		/*
		 * Additional initialization (adding bitmap header, filling
		 * the bitmap area with '1's to force initial rebuild for a whole
		 * data-area) is required when adding the spare to the volume
		 * with write-intent bitmap.
		 */
		if (a->info.container_member == u->array &&
		    a->info.consistency_policy == CONSISTENCY_POLICY_BITMAP) {
			struct dl *dl;

			for (dl = super->disks; dl; dl = dl->next)
				if (dl == u->dl)
					break;
			if (!dl)
				break;

			/* current_vol is switched to the target array only
			 * for the duration of the write_bitmap() call
			 */
			super->current_vol = u->array;
			if (st->ss->write_bitmap(st, dl->fd, NoUpdate))
				ret = 0;
			super->current_vol = prev_current_vol;
		}
	return ret;
}

/* Apply a chain of activate-spare updates: place each u->dl into slot
 * u->slot of volume u->array, start (or extend) a rebuild migration,
 * bump the family number, and delete the replaced disk once no active
 * array references it any more.
 * Returns 1 on success, 0 when the slot has no disk index or u->dl is not
 * on super->disks.
 */
static int apply_update_activate_spare(struct imsm_update_activate_spare *u,
				       struct intel_super *super,
				       struct active_array *active_array)
{
	struct imsm_super *mpb = super->anchor;
	struct imsm_dev *dev = get_imsm_dev(super, u->array);
	struct imsm_map *map = get_imsm_map(dev, MAP_0);
	struct imsm_map *migr_map;
	struct active_array *a;
	struct imsm_disk *disk;
	__u8 to_state;
	struct dl *dl;
	unsigned int found;
	int failed;
	int victim;
	int i;
	int second_map_created = 0;

	for (; u; u = u->next) {
		/* NOTE(review): 'dev' and 'map' are reassigned by the
		 * "count arrays" loop below, so for the second and later
		 * updates they refer to the last active array visited -
		 * confirm this is intended.
		 */
		victim = get_imsm_disk_idx(dev, u->slot, MAP_X);

		if (victim < 0)
			return 0;

		for (dl = super->disks; dl; dl = dl->next)
			if (dl == u->dl)
				break;

		if (!dl) {
			pr_err("error: imsm_activate_spare passed an unknown disk (index: %d)\n",
				u->dl->index);
			return 0;
		}

		/* count failures (excluding rebuilds and the victim)
		 * to determine map[0] state
		 */
		failed = 0;
		for (i = 0; i < map->num_members; i++) {
			if (i == u->slot)
				continue;
			disk = get_imsm_disk(super,
					     get_imsm_disk_idx(dev, i, MAP_X));
			if (!disk || is_failed(disk))
				failed++;
		}

		/* adding a pristine spare, assign a new index */
		if (dl->index < 0) {
			dl->index = super->anchor->num_disks;
			super->anchor->num_disks++;
		}
		disk = &dl->disk;
		disk->status |= CONFIGURED_DISK;
		disk->status &= ~SPARE_DISK;

		/* mark rebuild */
		to_state = imsm_check_degraded(super, dev, failed, MAP_0);
		/* migrate() is called only for the first update in the
		 * chain; later updates just refresh the map state
		 */
		if (!second_map_created) {
			second_map_created = 1;
			map->map_state = IMSM_T_STATE_DEGRADED;
			migrate(dev, super, to_state, MIGR_REBUILD);
		} else
			map->map_state = to_state;
		migr_map = get_imsm_map(dev, MAP_1);
		set_imsm_ord_tbl_ent(map, u->slot, dl->index);
		set_imsm_ord_tbl_ent(migr_map, u->slot,
				     dl->index | IMSM_ORD_REBUILD);

		/* update the family_num to mark a new container
		 * generation, being careful to record the existing
		 * family_num in orig_family_num to clean up after
		 * earlier mdadm versions that neglected to set it.
		 */
		if (mpb->orig_family_num == 0)
			mpb->orig_family_num = mpb->family_num;
		mpb->family_num += super->random;

		/* count arrays using the victim in the metadata */
		found = 0;
		for (a = active_array; a ; a = a->next) {
			dev = get_imsm_dev(super, a->info.container_member);
			map = get_imsm_map(dev, MAP_0);

			if (get_imsm_disk_slot(map, victim) >= 0)
				found++;
		}

		/* delete the victim if it is no longer being
		 * utilized anywhere
		 */
		if (!found) {
			struct dl **dlp;

			/* We know that 'manager' isn't touching anything,
			 * so it is safe to delete
			 */
			for (dlp = &super->disks; *dlp; dlp = &(*dlp)->next)
				if ((*dlp)->index == victim)
					break;

			/* victim may be on the missing list */
			if (!*dlp)
				for (dlp = &super->missing; *dlp;
				     dlp = &(*dlp)->next)
					if ((*dlp)->index == victim)
						break;
			imsm_delete(super, dlp, victim);
		}
	}

	return 1;
}

static int apply_reshape_container_disks_update(struct imsm_update_reshape *u,
						struct intel_super *super,
						void ***space_list)
{
	struct dl *new_disk;
	struct intel_dev *id;
	int i;
	int delta_disks = u->new_raid_disks - u->old_raid_disks;
	int disk_count = u->old_raid_disks;
	void **tofree = NULL;
	int devices_to_reshape = 1;
	struct imsm_super *mpb = super->anchor;
	int ret_val = 0;
	unsigned int dev_id;

	dprintf("(enter)\n");

	/* enable spares to use in array */
	for (i = 0; i < delta_disks; i++) {
		new_disk = get_disk_super(super,
					  major(u->new_disks[i]),
					  minor(u->new_disks[i]));
		/* NOTE(review): this debug print dereferences new_disk
		 * before the NULL check that follows it.
		 */
		dprintf("imsm: new disk for reshape is: %i:%i (%p, index = %i)\n",
			major(u->new_disks[i]), minor(u->new_disks[i]),
			new_disk, new_disk->index);
		if (new_disk == NULL ||
		    (new_disk->index >= 0 &&
		     new_disk->index < u->old_raid_disks))
			goto update_reshape_exit;
		new_disk->index = disk_count++;
		/* slot to fill in autolayout
		 */
		new_disk->raiddisk = new_disk->index;
		new_disk->disk.status |=
			CONFIGURED_DISK;
		new_disk->disk.status &= ~SPARE_DISK;
	}

	dprintf("imsm: 
volume set mpb->num_raid_devs = %i\n", + mpb->num_raid_devs); + /* manage changes in volume + */ + for (dev_id = 0; dev_id < mpb->num_raid_devs; dev_id++) { + void **sp = *space_list; + struct imsm_dev *newdev; + struct imsm_map *newmap, *oldmap; + + for (id = super->devlist ; id; id = id->next) { + if (id->index == dev_id) + break; + } + if (id == NULL) + break; + if (!sp) + continue; + *space_list = *sp; + newdev = (void*)sp; + /* Copy the dev, but not (all of) the map */ + memcpy(newdev, id->dev, sizeof(*newdev)); + oldmap = get_imsm_map(id->dev, MAP_0); + newmap = get_imsm_map(newdev, MAP_0); + /* Copy the current map */ + memcpy(newmap, oldmap, sizeof_imsm_map(oldmap)); + /* update one device only + */ + if (devices_to_reshape) { + dprintf("imsm: modifying subdev: %i\n", + id->index); + devices_to_reshape--; + newdev->vol.migr_state = 1; + set_vol_curr_migr_unit(newdev, 0); + set_migr_type(newdev, MIGR_GEN_MIGR); + newmap->num_members = u->new_raid_disks; + for (i = 0; i < delta_disks; i++) { + set_imsm_ord_tbl_ent(newmap, + u->old_raid_disks + i, + u->old_raid_disks + i); + } + /* New map is correct, now need to save old map + */ + newmap = get_imsm_map(newdev, MAP_1); + memcpy(newmap, oldmap, sizeof_imsm_map(oldmap)); + + imsm_set_array_size(newdev, -1); + } + + sp = (void **)id->dev; + id->dev = newdev; + *sp = tofree; + tofree = sp; + + /* Clear migration record */ + memset(super->migr_rec, 0, sizeof(struct migr_record)); + } + if (tofree) + *space_list = tofree; + ret_val = 1; + +update_reshape_exit: + + return ret_val; +} + +static int apply_takeover_update(struct imsm_update_takeover *u, + struct intel_super *super, + void ***space_list) +{ + struct imsm_dev *dev = NULL; + struct intel_dev *dv; + struct imsm_dev *dev_new; + struct imsm_map *map; + struct dl *dm, *du; + int i; + + for (dv = super->devlist; dv; dv = dv->next) + if (dv->index == (unsigned int)u->subarray) { + dev = dv->dev; + break; + } + + if (dev == NULL) + return 0; + + map = 
get_imsm_map(dev, MAP_0); + + if (u->direction == R10_TO_R0) { + /* Number of failed disks must be half of initial disk number */ + if (imsm_count_failed(super, dev, MAP_0) != + (map->num_members / 2)) + return 0; + + /* iterate through devices to mark removed disks as spare */ + for (dm = super->disks; dm; dm = dm->next) { + if (dm->disk.status & FAILED_DISK) { + int idx = dm->index; + /* update indexes on the disk list */ +/* FIXME this loop-with-the-loop looks wrong, I'm not convinced + the index values will end up being correct.... NB */ + for (du = super->disks; du; du = du->next) + if (du->index > idx) + du->index--; + /* mark as spare disk */ + mark_spare(dm); + } + } + /* update map */ + map->num_members /= map->num_domains; + map->map_state = IMSM_T_STATE_NORMAL; + map->raid_level = 0; + set_num_domains(map); + update_num_data_stripes(map, imsm_dev_size(dev)); + map->failed_disk_num = -1; + } + + if (u->direction == R0_TO_R10) { + void **space; + + /* update slots in current disk list */ + for (dm = super->disks; dm; dm = dm->next) { + if (dm->index >= 0) + dm->index *= 2; + } + /* create new *missing* disks */ + for (i = 0; i < map->num_members; i++) { + space = *space_list; + if (!space) + continue; + *space_list = *space; + du = (void *)space; + memcpy(du, super->disks, sizeof(*du)); + du->fd = -1; + du->minor = 0; + du->major = 0; + du->index = (i * 2) + 1; + sprintf((char *)du->disk.serial, + " MISSING_%d", du->index); + sprintf((char *)du->serial, + "MISSING_%d", du->index); + du->next = super->missing; + super->missing = du; + } + /* create new dev and map */ + space = *space_list; + if (!space) + return 0; + *space_list = *space; + dev_new = (void *)space; + memcpy(dev_new, dev, sizeof(*dev)); + /* update new map */ + map = get_imsm_map(dev_new, MAP_0); + + map->map_state = IMSM_T_STATE_DEGRADED; + map->raid_level = 1; + set_num_domains(map); + map->num_members = map->num_members * map->num_domains; + update_num_data_stripes(map, 
imsm_dev_size(dev)); + + /* replace dev<->dev_new */ + dv->dev = dev_new; + } + /* update disk order table */ + for (du = super->disks; du; du = du->next) + if (du->index >= 0) + set_imsm_ord_tbl_ent(map, du->index, du->index); + for (du = super->missing; du; du = du->next) + if (du->index >= 0) { + set_imsm_ord_tbl_ent(map, du->index, du->index); + mark_missing(super, dv->dev, &du->disk, du->index); + } + + return 1; +} + +static void imsm_process_update(struct supertype *st, + struct metadata_update *update) +{ + /** + * crack open the metadata_update envelope to find the update record + * update can be one of: + * update_reshape_container_disks - all the arrays in the container + * are being reshaped to have more devices. We need to mark + * the arrays for general migration and convert selected spares + * into active devices. + * update_activate_spare - a spare device has replaced a failed + * device in an array, update the disk_ord_tbl. If this disk is + * present in all member arrays then also clear the SPARE_DISK + * flag + * update_create_array + * update_kill_array + * update_rename_array + * update_add_remove_disk + */ + struct intel_super *super = st->sb; + struct imsm_super *mpb; + enum imsm_update_type type = *(enum imsm_update_type *) update->buf; + + /* update requires a larger buf but the allocation failed */ + if (super->next_len && !super->next_buf) { + super->next_len = 0; + return; + } + + if (super->next_buf) { + memcpy(super->next_buf, super->buf, super->len); + free(super->buf); + super->len = super->next_len; + super->buf = super->next_buf; + + super->next_len = 0; + super->next_buf = NULL; + } + + mpb = super->anchor; + + switch (type) { + case update_general_migration_checkpoint: { + struct intel_dev *id; + struct imsm_update_general_migration_checkpoint *u = + (void *)update->buf; + + dprintf("called for update_general_migration_checkpoint\n"); + + /* find device under general migration */ + for (id = super->devlist ; id; id = id->next) { 
+ if (is_gen_migration(id->dev)) { + set_vol_curr_migr_unit(id->dev, + u->curr_migr_unit); + super->updates_pending++; + } + } + break; + } + case update_takeover: { + struct imsm_update_takeover *u = (void *)update->buf; + if (apply_takeover_update(u, super, &update->space_list)) { + imsm_update_version_info(super); + super->updates_pending++; + } + break; + } + + case update_reshape_container_disks: { + struct imsm_update_reshape *u = (void *)update->buf; + if (apply_reshape_container_disks_update( + u, super, &update->space_list)) + super->updates_pending++; + break; + } + case update_reshape_migration: { + struct imsm_update_reshape_migration *u = (void *)update->buf; + if (apply_reshape_migration_update( + u, super, &update->space_list)) + super->updates_pending++; + break; + } + case update_size_change: { + struct imsm_update_size_change *u = (void *)update->buf; + if (apply_size_change_update(u, super)) + super->updates_pending++; + break; + } + case update_activate_spare: { + struct imsm_update_activate_spare *u = (void *) update->buf; + + if (prepare_spare_to_activate(st, u) && + apply_update_activate_spare(u, super, st->arrays)) + super->updates_pending++; + break; + } + case update_create_array: { + /* someone wants to create a new array, we need to be aware of + * a few races/collisions: + * 1/ 'Create' called by two separate instances of mdadm + * 2/ 'Create' versus 'activate_spare': mdadm has chosen + * devices that have since been assimilated via + * activate_spare. + * In the event this update can not be carried out mdadm will + * (FIX ME) notice that its update did not take hold. 
+ */ + struct imsm_update_create_array *u = (void *) update->buf; + struct intel_dev *dv; + struct imsm_dev *dev; + struct imsm_map *map, *new_map; + unsigned long long start, end; + unsigned long long new_start, new_end; + int i; + struct disk_info *inf; + struct dl *dl; + + /* handle racing creates: first come first serve */ + if (u->dev_idx < mpb->num_raid_devs) { + dprintf("subarray %d already defined\n", u->dev_idx); + goto create_error; + } + + /* check update is next in sequence */ + if (u->dev_idx != mpb->num_raid_devs) { + dprintf("can not create array %d expected index %d\n", + u->dev_idx, mpb->num_raid_devs); + goto create_error; + } + + new_map = get_imsm_map(&u->dev, MAP_0); + new_start = pba_of_lba0(new_map); + new_end = new_start + per_dev_array_size(new_map); + inf = get_disk_info(u); + + /* handle activate_spare versus create race: + * check to make sure that overlapping arrays do not include + * overalpping disks + */ + for (i = 0; i < mpb->num_raid_devs; i++) { + dev = get_imsm_dev(super, i); + map = get_imsm_map(dev, MAP_0); + start = pba_of_lba0(map); + end = start + per_dev_array_size(map); + if ((new_start >= start && new_start <= end) || + (start >= new_start && start <= new_end)) + /* overlap */; + else + continue; + + if (disks_overlap(super, i, u)) { + dprintf("arrays overlap\n"); + goto create_error; + } + } + + /* check that prepare update was successful */ + if (!update->space) { + dprintf("prepare update failed\n"); + goto create_error; + } + + /* check that all disks are still active before committing + * changes. FIXME: could we instead handle this by creating a + * degraded array? That's probably not what the user expects, + * so better to drop this update on the floor. 
+ */ + for (i = 0; i < new_map->num_members; i++) { + dl = serial_to_dl(inf[i].serial, super); + if (!dl) { + dprintf("disk disappeared\n"); + goto create_error; + } + } + + super->updates_pending++; + + /* convert spares to members and fixup ord_tbl */ + for (i = 0; i < new_map->num_members; i++) { + dl = serial_to_dl(inf[i].serial, super); + if (dl->index == -1) { + dl->index = mpb->num_disks; + mpb->num_disks++; + dl->disk.status |= CONFIGURED_DISK; + dl->disk.status &= ~SPARE_DISK; + } + set_imsm_ord_tbl_ent(new_map, i, dl->index); + } + + dv = update->space; + dev = dv->dev; + update->space = NULL; + imsm_copy_dev(dev, &u->dev); + dv->index = u->dev_idx; + dv->next = super->devlist; + super->devlist = dv; + mpb->num_raid_devs++; + + imsm_update_version_info(super); + break; + create_error: + /* mdmon knows how to release update->space, but not + * ((struct intel_dev *) update->space)->dev + */ + if (update->space) { + dv = update->space; + free(dv->dev); + } + break; + } + case update_kill_array: { + struct imsm_update_kill_array *u = (void *) update->buf; + int victim = u->dev_idx; + struct active_array *a; + struct intel_dev **dp; + struct imsm_dev *dev; + + /* sanity check that we are not affecting the uuid of + * active arrays, or deleting an active array + * + * FIXME when immutable ids are available, but note that + * we'll also need to fixup the invalidated/active + * subarray indexes in mdstat + */ + for (a = st->arrays; a; a = a->next) + if (a->info.container_member >= victim) + break; + /* by definition if mdmon is running at least one array + * is active in the container, so checking + * mpb->num_raid_devs is just extra paranoia + */ + dev = get_imsm_dev(super, victim); + if (a || !dev || mpb->num_raid_devs == 1) { + dprintf("failed to delete subarray-%d\n", victim); + break; + } + + for (dp = &super->devlist; *dp;) + if ((*dp)->index == (unsigned)super->current_vol) { + *dp = (*dp)->next; + } else { + if ((*dp)->index > (unsigned)victim) + 
(*dp)->index--; + dp = &(*dp)->next; + } + mpb->num_raid_devs--; + super->updates_pending++; + break; + } + case update_rename_array: { + struct imsm_update_rename_array *u = (void *) update->buf; + char name[MAX_RAID_SERIAL_LEN+1]; + int target = u->dev_idx; + struct active_array *a; + struct imsm_dev *dev; + + /* sanity check that we are not affecting the uuid of + * an active array + */ + memset(name, 0, sizeof(name)); + snprintf(name, MAX_RAID_SERIAL_LEN, "%s", (char *) u->name); + name[MAX_RAID_SERIAL_LEN] = '\0'; + for (a = st->arrays; a; a = a->next) + if (a->info.container_member == target) + break; + dev = get_imsm_dev(super, u->dev_idx); + if (a || !dev || !check_name(super, name, 1)) { + dprintf("failed to rename subarray-%d\n", target); + break; + } + + memcpy(dev->volume, name, MAX_RAID_SERIAL_LEN); + super->updates_pending++; + break; + } + case update_add_remove_disk: { + /* we may be able to repair some arrays if disks are + * being added, check the status of add_remove_disk + * if discs has been added. + */ + if (add_remove_disk_update(super)) { + struct active_array *a; + + super->updates_pending++; + for (a = st->arrays; a; a = a->next) + a->check_degraded = 1; + } + break; + } + case update_prealloc_badblocks_mem: + break; + case update_rwh_policy: { + struct imsm_update_rwh_policy *u = (void *)update->buf; + int target = u->dev_idx; + struct imsm_dev *dev = get_imsm_dev(super, target); + if (!dev) { + dprintf("could not find subarray-%d\n", target); + break; + } + + if (dev->rwh_policy != u->new_policy) { + dev->rwh_policy = u->new_policy; + super->updates_pending++; + } + break; + } + default: + pr_err("error: unsupported process update type:(type: %d)\n", type); + } +} + +static struct mdinfo *get_spares_for_grow(struct supertype *st); + +static int imsm_prepare_update(struct supertype *st, + struct metadata_update *update) +{ + /** + * Allocate space to hold new disk entries, raid-device entries or a new + * mpb if necessary. 
The manager synchronously waits for updates to + * complete in the monitor, so new mpb buffers allocated here can be + * integrated by the monitor thread without worrying about live pointers + * in the manager thread. + */ + enum imsm_update_type type; + struct intel_super *super = st->sb; + unsigned int sector_size = super->sector_size; + struct imsm_super *mpb = super->anchor; + size_t buf_len; + size_t len = 0; + + if (update->len < (int)sizeof(type)) + return 0; + + type = *(enum imsm_update_type *) update->buf; + + switch (type) { + case update_general_migration_checkpoint: + if (update->len < (int)sizeof(struct imsm_update_general_migration_checkpoint)) + return 0; + dprintf("called for update_general_migration_checkpoint\n"); + break; + case update_takeover: { + struct imsm_update_takeover *u = (void *)update->buf; + if (update->len < (int)sizeof(*u)) + return 0; + if (u->direction == R0_TO_R10) { + void **tail = (void **)&update->space_list; + struct imsm_dev *dev = get_imsm_dev(super, u->subarray); + struct imsm_map *map = get_imsm_map(dev, MAP_0); + int num_members = map->num_members; + void *space; + int size, i; + /* allocate memory for added disks */ + for (i = 0; i < num_members; i++) { + size = sizeof(struct dl); + space = xmalloc(size); + *tail = space; + tail = space; + *tail = NULL; + } + /* allocate memory for new device */ + size = sizeof_imsm_dev(super->devlist->dev, 0) + + (num_members * sizeof(__u32)); + space = xmalloc(size); + *tail = space; + tail = space; + *tail = NULL; + len = disks_to_mpb_size(num_members * 2); + } + + break; + } + case update_reshape_container_disks: { + /* Every raid device in the container is about to + * gain some more devices, and we will enter a + * reconfiguration. + * So each 'imsm_map' will be bigger, and the imsm_vol + * will now hold 2 of them. + * Thus we need new 'struct imsm_dev' allocations sized + * as sizeof_imsm_dev but with more devices in both maps. 
+ */ + struct imsm_update_reshape *u = (void *)update->buf; + struct intel_dev *dl; + void **space_tail = (void**)&update->space_list; + + if (update->len < (int)sizeof(*u)) + return 0; + + dprintf("for update_reshape\n"); + + for (dl = super->devlist; dl; dl = dl->next) { + int size = sizeof_imsm_dev(dl->dev, 1); + void *s; + if (u->new_raid_disks > u->old_raid_disks) + size += sizeof(__u32)*2* + (u->new_raid_disks - u->old_raid_disks); + s = xmalloc(size); + *space_tail = s; + space_tail = s; + *space_tail = NULL; + } + + len = disks_to_mpb_size(u->new_raid_disks); + dprintf("New anchor length is %llu\n", (unsigned long long)len); + break; + } + case update_reshape_migration: { + /* for migration level 0->5 we need to add disks + * so the same as for container operation we will copy + * device to the bigger location. + * in memory prepared device and new disk area are prepared + * for usage in process update + */ + struct imsm_update_reshape_migration *u = (void *)update->buf; + struct intel_dev *id; + void **space_tail = (void **)&update->space_list; + int size; + void *s; + int current_level = -1; + + if (update->len < (int)sizeof(*u)) + return 0; + + dprintf("for update_reshape\n"); + + /* add space for bigger array in update + */ + for (id = super->devlist; id; id = id->next) { + if (id->index == (unsigned)u->subdev) { + size = sizeof_imsm_dev(id->dev, 1); + if (u->new_raid_disks > u->old_raid_disks) + size += sizeof(__u32)*2* + (u->new_raid_disks - u->old_raid_disks); + s = xmalloc(size); + *space_tail = s; + space_tail = s; + *space_tail = NULL; + break; + } + } + if (update->space_list == NULL) + break; + + /* add space for disk in update + */ + size = sizeof(struct dl); + s = xmalloc(size); + *space_tail = s; + space_tail = s; + *space_tail = NULL; + + /* add spare device to update + */ + for (id = super->devlist ; id; id = id->next) + if (id->index == (unsigned)u->subdev) { + struct imsm_dev *dev; + struct imsm_map *map; + + dev = get_imsm_dev(super, 
u->subdev); + map = get_imsm_map(dev, MAP_0); + current_level = map->raid_level; + break; + } + if (u->new_level == 5 && u->new_level != current_level) { + struct mdinfo *spares; + + spares = get_spares_for_grow(st); + if (spares) { + struct dl *dl; + struct mdinfo *dev; + + dev = spares->devs; + if (dev) { + u->new_disks[0] = + makedev(dev->disk.major, + dev->disk.minor); + dl = get_disk_super(super, + dev->disk.major, + dev->disk.minor); + dl->index = u->old_raid_disks; + dev = dev->next; + } + sysfs_free(spares); + } + } + len = disks_to_mpb_size(u->new_raid_disks); + dprintf("New anchor length is %llu\n", (unsigned long long)len); + break; + } + case update_size_change: { + if (update->len < (int)sizeof(struct imsm_update_size_change)) + return 0; + break; + } + case update_activate_spare: { + if (update->len < (int)sizeof(struct imsm_update_activate_spare)) + return 0; + break; + } + case update_create_array: { + struct imsm_update_create_array *u = (void *) update->buf; + struct intel_dev *dv; + struct imsm_dev *dev = &u->dev; + struct imsm_map *map = get_imsm_map(dev, MAP_0); + struct dl *dl; + struct disk_info *inf; + int i; + int activate = 0; + + if (update->len < (int)sizeof(*u)) + return 0; + + inf = get_disk_info(u); + len = sizeof_imsm_dev(dev, 1); + /* allocate a new super->devlist entry */ + dv = xmalloc(sizeof(*dv)); + dv->dev = xmalloc(len); + update->space = dv; + + /* count how many spares will be converted to members */ + for (i = 0; i < map->num_members; i++) { + dl = serial_to_dl(inf[i].serial, super); + if (!dl) { + /* hmm maybe it failed?, nothing we can do about + * it here + */ + continue; + } + if (count_memberships(dl, super) == 0) + activate++; + } + len += activate * sizeof(struct imsm_disk); + break; + } + case update_kill_array: { + if (update->len < (int)sizeof(struct imsm_update_kill_array)) + return 0; + break; + } + case update_rename_array: { + if (update->len < (int)sizeof(struct imsm_update_rename_array)) + return 0; + break; 
+ } + case update_add_remove_disk: + /* no update->len needed */ + break; + case update_prealloc_badblocks_mem: + super->extra_space += sizeof(struct bbm_log) - + get_imsm_bbm_log_size(super->bbm_log); + break; + case update_rwh_policy: { + if (update->len < (int)sizeof(struct imsm_update_rwh_policy)) + return 0; + break; + } + default: + return 0; + } + + /* check if we need a larger metadata buffer */ + if (super->next_buf) + buf_len = super->next_len; + else + buf_len = super->len; + + if (__le32_to_cpu(mpb->mpb_size) + super->extra_space + len > buf_len) { + /* ok we need a larger buf than what is currently allocated + * if this allocation fails process_update will notice that + * ->next_len is set and ->next_buf is NULL + */ + buf_len = ROUND_UP(__le32_to_cpu(mpb->mpb_size) + + super->extra_space + len, sector_size); + if (super->next_buf) + free(super->next_buf); + + super->next_len = buf_len; + if (posix_memalign(&super->next_buf, sector_size, buf_len) == 0) + memset(super->next_buf, 0, buf_len); + else + super->next_buf = NULL; + } + return 1; +} +

/*
 * imsm_delete - drop disk 'index' from the container metadata.
 *
 * @super: container metadata
 * @dlp:   address of the list link holding the disk to free; *dlp may be
 *         NULL, in which case only the index bookkeeping is done
 * @index: metadata index of the disk being removed
 *
 * Every reference to an index above 'index' is moved down by one: the
 * disks and missing lists, the ord tables of both maps of every raid
 * device, and the disk ordinals in the bad block log.  mpb->num_disks is
 * decremented, an update is flagged as pending, and the disk at *dlp is
 * unlinked and released with __free_imsm_disk().
 *
 * must be called while manager is quiesced
 */
static void imsm_delete(struct intel_super *super, struct dl **dlp, unsigned index)
{
	struct imsm_super *mpb = super->anchor;
	struct dl *iter;
	struct imsm_dev *dev;
	struct imsm_map *map;
	unsigned int i, j, num_members;
	__u32 ord, ord_map0;
	struct bbm_log *log = super->bbm_log;

	dprintf("deleting device[%d] from imsm_super\n", index);

	/* shift all indexes down one */
	for (iter = super->disks; iter; iter = iter->next)
		if (iter->index > (int)index)
			iter->index--;
	for (iter = super->missing; iter; iter = iter->next)
		if (iter->index > (int)index)
			iter->index--;

	for (i = 0; i < mpb->num_raid_devs; i++) {
		dev = get_imsm_dev(super, i);
		map = get_imsm_map(dev, MAP_0);
		num_members = map->num_members;
		for (j = 0; j < num_members; j++) {
			/* update ord entries being careful not to propagate
			 * ord-flags to the first map
			 */
			ord = get_imsm_ord_tbl_ent(dev, j, MAP_X);
			ord_map0 = get_imsm_ord_tbl_ent(dev, j, MAP_0);

			if (ord_to_idx(ord) <= index)
				continue;

			map = get_imsm_map(dev, MAP_0);
			set_imsm_ord_tbl_ent(map, j, ord_map0 - 1);
			map = get_imsm_map(dev, MAP_1);
			if (map)
				set_imsm_ord_tbl_ent(map, j, ord - 1);
		}
	}

	/* other code in this file tests bbm_log against NULL before use,
	 * so do not assume a log is attached here either
	 */
	if (log) {
		for (i = 0; i < log->entry_count; i++) {
			struct bbm_log_entry *entry =
				&log->marked_block_entries[i];

			if (entry->disk_ordinal <= index)
				continue;
			entry->disk_ordinal--;
		}
	}

	mpb->num_disks--;
	super->updates_pending++;
	if (*dlp) {
		struct dl *dl = *dlp;

		*dlp = (*dlp)->next;
		__free_imsm_disk(dl, 1);
	}
}

/*
 * imsm_get_allowed_degradation - number of disks a volume may lose.
 *
 * Returns 1 for level 5, 2 for level 6 and 0 for any level other than
 * 1 and 10.  For levels 1 and 10 the result is raid_disks / 2, or 0 when
 * one of the probed disk pairs is missing entirely.  The pairs are probed
 * once after fetching MAP_0 and, if a second map (MAP_1) exists, a second
 * time.
 *
 * NOTE(review): the probe asks get_imsm_disk() for container disks i and
 * i + 1 and never reads the map it has just fetched, so both passes test
 * the same thing and neighbouring pairs overlap - confirm whether the
 * pairs were meant to come from the map's ord table.
 */
static int imsm_get_allowed_degradation(int level, int raid_disks,
					struct intel_super *super,
					struct imsm_dev *dev)
{
	int allowed;
	int pass;
	int i;

	if (level == 5)
		return 1;
	if (level == 6)
		return 2;
	if (level != 1 && level != 10)
		return 0;

	allowed = raid_disks / 2;

	/* pass 0 goes with the first map, pass 1 with the second one */
	for (pass = 0; pass < 2; pass++) {
		struct imsm_map *map = get_imsm_map(dev, pass ? MAP_1 : MAP_0);

		/* if there is no second map
		 * result can be returned
		 */
		if (pass && map == NULL)
			return allowed;

		for (i = 0; i < allowed; i++) {
			int missing = 0;

			missing += (get_imsm_disk(super, i) == NULL);
			missing += (get_imsm_disk(super, i + 1) == NULL);
			if (missing == 2)
				return 0;
		}
	}

	return allowed;
}

+/******************************************************************************* + * Function: validate_container_imsm + * Description: This routine validates container after assemble, + * eg. if devices in container are under the same controller. 
+ * + * Parameters: + * info : linked list with info about devices used in array + * Returns: + * 1 : HBA mismatch + * 0 : Success + ******************************************************************************/ +int validate_container_imsm(struct mdinfo *info) +{ + if (check_env("IMSM_NO_PLATFORM")) + return 0; + + struct sys_dev *idev; + struct sys_dev *hba = NULL; + struct sys_dev *intel_devices = find_intel_devices(); + char *dev_path = devt_to_devpath(makedev(info->disk.major, + info->disk.minor), 1, NULL); + + for (idev = intel_devices; idev; idev = idev->next) { + if (dev_path && strstr(dev_path, idev->path)) { + hba = idev; + break; + } + } + if (dev_path) + free(dev_path); + + if (!hba) { + pr_err("WARNING - Cannot detect HBA for device %s!\n", + devid2kname(makedev(info->disk.major, info->disk.minor))); + return 1; + } + + const struct imsm_orom *orom = get_orom_by_device_id(hba->dev_id); + struct mdinfo *dev; + + for (dev = info->next; dev; dev = dev->next) { + dev_path = devt_to_devpath(makedev(dev->disk.major, + dev->disk.minor), 1, NULL); + + struct sys_dev *hba2 = NULL; + for (idev = intel_devices; idev; idev = idev->next) { + if (dev_path && strstr(dev_path, idev->path)) { + hba2 = idev; + break; + } + } + if (dev_path) + free(dev_path); + + const struct imsm_orom *orom2 = hba2 == NULL ? 
NULL : + get_orom_by_device_id(hba2->dev_id); + + if (hba2 && hba->type != hba2->type) { + pr_err("WARNING - HBAs of devices do not match %s != %s\n", + get_sys_dev_type(hba->type), get_sys_dev_type(hba2->type)); + return 1; + } + + if (orom != orom2) { + pr_err("WARNING - IMSM container assembled with disks under different HBAs!\n" + " This operation is not supported and can lead to data loss.\n"); + return 1; + } + + if (!orom) { + pr_err("WARNING - IMSM container assembled with disks under HBAs without IMSM platform support!\n" + " This operation is not supported and can lead to data loss.\n"); + return 1; + } + } + + return 0; +} +

/*******************************************************************************
* Function:	imsm_record_badblock
* Description:	Adds a bad block range to the container's BBM log for the
*		disk found in the given slot of the array
*
* Parameters:
*	a	: array containing a bad block
*	slot	: disk number containing a bad block
*	sector	: bad block sector
*	length	: bad block sectors range
* Returns:
*	result of record_new_badblock(); when it is non-zero a metadata
*	update is flagged as pending
*	0 : slot could not be translated to a disk ord
******************************************************************************/
static int imsm_record_badblock(struct active_array *a, int slot,
			  unsigned long long sector, int length)
{
	struct intel_super *sb = a->container->sb;
	const int disk_ord = imsm_disk_slot_to_ord(a, slot);
	int stored;

	if (disk_ord < 0)
		return 0;

	stored = record_new_badblock(sb->bbm_log, ord_to_idx(disk_ord), sector,
				     length);
	if (!stored)
		return 0;

	sb->updates_pending++;
	return stored;
}

/*******************************************************************************
* Function:	imsm_clear_badblock
* Description:	Removes a bad block range from the container's BBM log for
*		the disk found in the given slot of the array
*
* Parameters:
*	a	: array containing a bad block
*	slot	: disk number containing a bad block
*	sector	: bad block sector
*	length	: bad block sectors range
* Returns:
*	result of clear_badblock(); when it is non-zero a metadata update
*	is flagged as pending
*	0 : slot could not be translated to a disk ord
******************************************************************************/
static int imsm_clear_badblock(struct active_array *a, int slot,
			unsigned long long sector, int length)
{
	struct intel_super *sb = a->container->sb;
	const int disk_ord = imsm_disk_slot_to_ord(a, slot);
	int cleared;

	if (disk_ord < 0)
		return 0;

	cleared = clear_badblock(sb->bbm_log, ord_to_idx(disk_ord), sector,
				 length);
	if (!cleared)
		return 0;

	sb->updates_pending++;
	return cleared;
}

/*******************************************************************************
* Function:	imsm_get_badblocks
* Description:	Fills super->bb with the bad blocks that the BBM log holds
*		for one disk of the array, limited to the range starting at
*		pba_of_lba0() of the array's first map and spanning
*		per_dev_array_size() of that map
*
* Parameters:
*	a	: array
*	slot	: disk number
* Returns:
*	address of super->bb, which is overwritten by every call
*	NULL : slot could not be translated to a disk ord
******************************************************************************/
static struct md_bb *imsm_get_badblocks(struct active_array *a, int slot)
{
	int member = a->info.container_member;
	struct intel_super *sb = a->container->sb;
	struct imsm_dev *vol = get_imsm_dev(sb, member);
	struct imsm_map *first_map = get_imsm_map(vol, MAP_0);
	const int disk_ord = imsm_disk_slot_to_ord(a, slot);

	if (disk_ord < 0)
		return NULL;

	get_volume_badblocks(sb->bbm_log, ord_to_idx(disk_ord),
			     pba_of_lba0(first_map),
			     per_dev_array_size(first_map), &sb->bb);

	return &sb->bb;
}
+/******************************************************************************* +* Function: examine_badblocks_imsm +* Description: Prints list of bad blocks on a disk to the standard output +* +* Parameters: +* st : metadata handler +* fd : open file descriptor for device +* devname : device name +* Returns: +* 0 : Success +* 1 : Error +******************************************************************************/ +static int examine_badblocks_imsm(struct supertype *st, int fd, char *devname) +{ + struct intel_super *super = st->sb; + struct bbm_log *log = super->bbm_log; + struct dl *d = NULL; + int any = 0; + + for (d = super->disks; d ; d = d->next) { + if (strcmp(d->devname, devname) == 0) + break; + } + + if ((d == NULL) || (d->index < 
0)) { /* serial mismatch probably */ + pr_err("%s doesn't appear to be part of a raid array\n", + devname); + return 1; + } + + if (log != NULL) { + unsigned int i; + struct bbm_log_entry *entry = &log->marked_block_entries[0]; + + for (i = 0; i < log->entry_count; i++) { + if (entry[i].disk_ordinal == d->index) { + unsigned long long sector = __le48_to_cpu( + &entry[i].defective_block_start); + int cnt = entry[i].marked_count + 1; + + if (!any) { + printf("Bad-blocks on %s:\n", devname); + any = 1; + } + + printf("%20llu for %d sectors\n", sector, cnt); + } + } + } + + if (!any) + printf("No bad-blocks list configured on %s\n", devname); + + return 0; +} +/******************************************************************************* + * Function: init_migr_record_imsm + * Description: Function inits imsm migration record + * Parameters: + * super : imsm internal array info + * dev : device under migration + * info : general array info to find the smallest device + * Returns: + * none + ******************************************************************************/ +void init_migr_record_imsm(struct supertype *st, struct imsm_dev *dev, + struct mdinfo *info) +{ + struct intel_super *super = st->sb; + struct migr_record *migr_rec = super->migr_rec; + int new_data_disks; + unsigned long long dsize, dev_sectors; + long long unsigned min_dev_sectors = -1LLU; + struct imsm_map *map_dest = get_imsm_map(dev, MAP_0); + struct imsm_map *map_src = get_imsm_map(dev, MAP_1); + unsigned long long num_migr_units; + unsigned long long array_blocks; + struct dl *dl_disk = NULL; + + memset(migr_rec, 0, sizeof(struct migr_record)); + migr_rec->family_num = __cpu_to_le32(super->anchor->family_num); + + /* only ascending reshape supported now */ + migr_rec->ascending_migr = __cpu_to_le32(1); + + migr_rec->dest_depth_per_unit = GEN_MIGR_AREA_SIZE / + max(map_dest->blocks_per_strip, map_src->blocks_per_strip); + migr_rec->dest_depth_per_unit *= + max(map_dest->blocks_per_strip, 
map_src->blocks_per_strip); + new_data_disks = imsm_num_data_members(map_dest); + migr_rec->blocks_per_unit = + __cpu_to_le32(migr_rec->dest_depth_per_unit * new_data_disks); + migr_rec->dest_depth_per_unit = + __cpu_to_le32(migr_rec->dest_depth_per_unit); + array_blocks = info->component_size * new_data_disks; + num_migr_units = + array_blocks / __le32_to_cpu(migr_rec->blocks_per_unit); + + if (array_blocks % __le32_to_cpu(migr_rec->blocks_per_unit)) + num_migr_units++; + set_num_migr_units(migr_rec, num_migr_units); + + migr_rec->post_migr_vol_cap = dev->size_low; + migr_rec->post_migr_vol_cap_hi = dev->size_high; + + /* Find the smallest dev */ + for (dl_disk = super->disks; dl_disk ; dl_disk = dl_disk->next) { + /* ignore spares in container */ + if (dl_disk->index < 0) + continue; + get_dev_size(dl_disk->fd, NULL, &dsize); + dev_sectors = dsize / 512; + if (dev_sectors < min_dev_sectors) + min_dev_sectors = dev_sectors; + } + set_migr_chkp_area_pba(migr_rec, min_dev_sectors - + RAID_DISK_RESERVED_BLOCKS_IMSM_HI); + + write_imsm_migr_rec(st); + + return; +} + +/******************************************************************************* + * Function: save_backup_imsm + * Description: Function saves critical data stripes to Migration Copy Area + * and updates the current migration unit status. + * Use restore_stripes() to form a destination stripe, + * and to write it to the Copy Area. 
+ * Parameters: + * st : supertype information + * dev : imsm device that backup is saved for + * info : general array info + * buf : input buffer + * length : length of data to backup (blocks_per_unit) + * Returns: + * 0 : success + *, -1 : fail + ******************************************************************************/ +int save_backup_imsm(struct supertype *st, + struct imsm_dev *dev, + struct mdinfo *info, + void *buf, + int length) +{ + int rv = -1; + struct intel_super *super = st->sb; + int i; + struct imsm_map *map_dest = get_imsm_map(dev, MAP_0); + int new_disks = map_dest->num_members; + int dest_layout = 0; + int dest_chunk, targets[new_disks]; + unsigned long long start, target_offsets[new_disks]; + int data_disks = imsm_num_data_members(map_dest); + + for (i = 0; i < new_disks; i++) { + struct dl *dl_disk = get_imsm_dl_disk(super, i); + if (dl_disk && is_fd_valid(dl_disk->fd)) + targets[i] = dl_disk->fd; + else + goto abort; + } + + start = info->reshape_progress * 512; + for (i = 0; i < new_disks; i++) { + target_offsets[i] = migr_chkp_area_pba(super->migr_rec) * 512; + /* move back copy area adderss, it will be moved forward + * in restore_stripes() using start input variable + */ + target_offsets[i] -= start/data_disks; + } + + dest_layout = imsm_level_to_layout(map_dest->raid_level); + dest_chunk = __le16_to_cpu(map_dest->blocks_per_strip) * 512; + + if (restore_stripes(targets, /* list of dest devices */ + target_offsets, /* migration record offsets */ + new_disks, + dest_chunk, + map_dest->raid_level, + dest_layout, + -1, /* source backup file descriptor */ + 0, /* input buf offset + * always 0 buf is already offseted */ + start, + length, + buf) != 0) { + pr_err("Error restoring stripes\n"); + goto abort; + } + + rv = 0; + +abort: + return rv; +} + +/******************************************************************************* + * Function: save_checkpoint_imsm + * Description: Function called for current unit status update + * in the 
migration record. It writes it to disk. + * Parameters: + * super : imsm internal array info + * info : general array info + * Returns: + * 0: success + * 1: failure + * 2: failure, means no valid migration record + * / no general migration in progress / + ******************************************************************************/ +int save_checkpoint_imsm(struct supertype *st, struct mdinfo *info, int state) +{ + struct intel_super *super = st->sb; + unsigned long long blocks_per_unit; + unsigned long long curr_migr_unit; + + if (load_imsm_migr_rec(super) != 0) { + dprintf("imsm: ERROR: Cannot read migration record for checkpoint save.\n"); + return 1; + } + + blocks_per_unit = __le32_to_cpu(super->migr_rec->blocks_per_unit); + if (blocks_per_unit == 0) { + dprintf("imsm: no migration in progress.\n"); + return 2; + } + curr_migr_unit = info->reshape_progress / blocks_per_unit; + /* check if array is alligned to copy area + * if it is not alligned, add one to current migration unit value + * this can happend on array reshape finish only + */ + if (info->reshape_progress % blocks_per_unit) + curr_migr_unit++; + + set_current_migr_unit(super->migr_rec, curr_migr_unit); + super->migr_rec->rec_status = __cpu_to_le32(state); + set_migr_dest_1st_member_lba(super->migr_rec, + super->migr_rec->dest_depth_per_unit * curr_migr_unit); + + if (write_imsm_migr_rec(st) < 0) { + dprintf("imsm: Cannot write migration record outside backup area\n"); + return 1; + } + + return 0; +} + +/******************************************************************************* + * Function: recover_backup_imsm + * Description: Function recovers critical data from the Migration Copy Area + * while assembling an array. 
+ * Parameters: + * super : imsm internal array info + * info : general array info + * Returns: + * 0 : success (or there is no data to recover) + * 1 : fail + ******************************************************************************/ +int recover_backup_imsm(struct supertype *st, struct mdinfo *info) +{ + struct intel_super *super = st->sb; + struct migr_record *migr_rec = super->migr_rec; + struct imsm_map *map_dest; + struct intel_dev *id = NULL; + unsigned long long read_offset; + unsigned long long write_offset; + unsigned unit_len; + int new_disks, err; + char *buf = NULL; + int retval = 1; + unsigned int sector_size = super->sector_size; + unsigned long long curr_migr_unit = current_migr_unit(migr_rec); + unsigned long long num_migr_units = get_num_migr_units(migr_rec); + char buffer[20]; + int skipped_disks = 0; + struct dl *dl_disk; + + err = sysfs_get_str(info, NULL, "array_state", (char *)buffer, 20); + if (err < 1) + return 1; + + /* recover data only during assemblation */ + if (strncmp(buffer, "inactive", 8) != 0) + return 0; + /* no data to recover */ + if (__le32_to_cpu(migr_rec->rec_status) == UNIT_SRC_NORMAL) + return 0; + if (curr_migr_unit >= num_migr_units) + return 1; + + /* find device during reshape */ + for (id = super->devlist; id; id = id->next) + if (is_gen_migration(id->dev)) + break; + if (id == NULL) + return 1; + + map_dest = get_imsm_map(id->dev, MAP_0); + new_disks = map_dest->num_members; + + read_offset = migr_chkp_area_pba(migr_rec) * 512; + + write_offset = (migr_dest_1st_member_lba(migr_rec) + + pba_of_lba0(map_dest)) * 512; + + unit_len = __le32_to_cpu(migr_rec->dest_depth_per_unit) * 512; + if (posix_memalign((void **)&buf, sector_size, unit_len) != 0) + goto abort; + + for (dl_disk = super->disks; dl_disk; dl_disk = dl_disk->next) { + if (dl_disk->index < 0) + continue; + + if (!is_fd_valid(dl_disk->fd)) { + skipped_disks++; + continue; + } + if (lseek64(dl_disk->fd, read_offset, SEEK_SET) < 0) { + pr_err("Cannot seek 
to block: %s\n", + strerror(errno)); + skipped_disks++; + continue; + } + if (read(dl_disk->fd, buf, unit_len) != (ssize_t)unit_len) { + pr_err("Cannot read copy area block: %s\n", + strerror(errno)); + skipped_disks++; + continue; + } + if (lseek64(dl_disk->fd, write_offset, SEEK_SET) < 0) { + pr_err("Cannot seek to block: %s\n", + strerror(errno)); + skipped_disks++; + continue; + } + if (write(dl_disk->fd, buf, unit_len) != (ssize_t)unit_len) { + pr_err("Cannot restore block: %s\n", + strerror(errno)); + skipped_disks++; + continue; + } + } + + if (skipped_disks > imsm_get_allowed_degradation(info->new_level, + new_disks, + super, + id->dev)) { + pr_err("Cannot restore data from backup. Too many failed disks\n"); + goto abort; + } + + if (save_checkpoint_imsm(st, info, UNIT_SRC_NORMAL)) { + /* ignore error == 2, this can mean end of reshape here + */ + dprintf("imsm: Cannot write checkpoint to migration record (UNIT_SRC_NORMAL) during restart\n"); + } else + retval = 0; + +abort: + free(buf); + return retval; +} + +static char disk_by_path[] = "/dev/disk/by-path/"; + +static const char *imsm_get_disk_controller_domain(const char *path) +{ + char disk_path[PATH_MAX]; + char *drv=NULL; + struct stat st; + + strcpy(disk_path, disk_by_path); + strncat(disk_path, path, PATH_MAX - strlen(disk_path) - 1); + if (stat(disk_path, &st) == 0) { + struct sys_dev* hba; + char *path; + + path = devt_to_devpath(st.st_rdev, 1, NULL); + if (path == NULL) + return "unknown"; + hba = find_disk_attached_hba(-1, path); + if (hba && hba->type == SYS_DEV_SAS) + drv = "isci"; + else if (hba && hba->type == SYS_DEV_SATA) + drv = "ahci"; + else if (hba && hba->type == SYS_DEV_VMD) + drv = "vmd"; + else if (hba && hba->type == SYS_DEV_NVME) + drv = "nvme"; + else + drv = "unknown"; + dprintf("path: %s hba: %s attached: %s\n", + path, (hba) ? 
hba->path : "NULL", drv); + free(path); + } + return drv; +} + +static char *imsm_find_array_devnm_by_subdev(int subdev, char *container) +{ + static char devnm[32]; + char subdev_name[20]; + struct mdstat_ent *mdstat; + + sprintf(subdev_name, "%d", subdev); + mdstat = mdstat_by_subdev(subdev_name, container); + if (!mdstat) + return NULL; + + strcpy(devnm, mdstat->devnm); + free_mdstat(mdstat); + return devnm; +} + +static int imsm_reshape_is_allowed_on_container(struct supertype *st, + struct geo_params *geo, + int *old_raid_disks, + int direction) +{ + /* currently we only support increasing the number of devices + * for a container. This increases the number of device for each + * member array. They must all be RAID0 or RAID5. + */ + int ret_val = 0; + struct mdinfo *info, *member; + int devices_that_can_grow = 0; + + dprintf("imsm: imsm_reshape_is_allowed_on_container(ENTER): st->devnm = (%s)\n", st->devnm); + + if (geo->size > 0 || + geo->level != UnSet || + geo->layout != UnSet || + geo->chunksize != 0 || + geo->raid_disks == UnSet) { + dprintf("imsm: Container operation is allowed for raid disks number change only.\n"); + return ret_val; + } + + if (direction == ROLLBACK_METADATA_CHANGES) { + dprintf("imsm: Metadata changes rollback is not supported for container operation.\n"); + return ret_val; + } + + info = container_content_imsm(st, NULL); + for (member = info; member; member = member->next) { + char *result; + + dprintf("imsm: checking device_num: %i\n", + member->container_member); + + if (geo->raid_disks <= member->array.raid_disks) { + /* we work on container for Online Capacity Expansion + * only so raid_disks has to grow + */ + dprintf("imsm: for container operation raid disks increase is required\n"); + break; + } + + if (info->array.level != 0 && info->array.level != 5) { + /* we cannot use this container with other raid level + */ + dprintf("imsm: for container operation wrong raid level (%i) detected\n", + info->array.level); + break; + } 
else { + /* check for platform support + * for this raid level configuration + */ + struct intel_super *super = st->sb; + if (!is_raid_level_supported(super->orom, + member->array.level, + geo->raid_disks)) { + dprintf("platform does not support raid%d with %d disk%s\n", + info->array.level, + geo->raid_disks, + geo->raid_disks > 1 ? "s" : ""); + break; + } + /* check if component size is aligned to chunk size + */ + if (info->component_size % + (info->array.chunk_size/512)) { + dprintf("Component size is not aligned to chunk size\n"); + break; + } + } + + if (*old_raid_disks && + info->array.raid_disks != *old_raid_disks) + break; + *old_raid_disks = info->array.raid_disks; + + /* All raid5 and raid0 volumes in container + * have to be ready for Online Capacity Expansion + * so they need to be assembled. We have already + * checked that no recovery etc is happening. + */ + result = imsm_find_array_devnm_by_subdev(member->container_member, + st->container_devnm); + if (result == NULL) { + dprintf("imsm: cannot find array\n"); + break; + } + devices_that_can_grow++; + } + sysfs_free(info); + if (!member && devices_that_can_grow) + ret_val = 1; + + if (ret_val) + dprintf("Container operation allowed\n"); + else + dprintf("Error: %i\n", ret_val); + + return ret_val; +} + +/* Function: get_spares_for_grow + * Description: Allocates memory and creates list of spare devices + * avaliable in container. Checks if spare drive size is acceptable. 
+ * Parameters: Pointer to the supertype structure + * Returns: Pointer to the list of spare devices (mdinfo structure) on success, + * NULL if fail + */ +static struct mdinfo *get_spares_for_grow(struct supertype *st) +{ + struct spare_criteria sc; + + get_spare_criteria_imsm(st, &sc); + return container_choose_spares(st, &sc, NULL, NULL, NULL, 0); +} + +/****************************************************************************** + * function: imsm_create_metadata_update_for_reshape + * Function creates update for whole IMSM container. + * + ******************************************************************************/ +static int imsm_create_metadata_update_for_reshape( + struct supertype *st, + struct geo_params *geo, + int old_raid_disks, + struct imsm_update_reshape **updatep) +{ + struct intel_super *super = st->sb; + struct imsm_super *mpb = super->anchor; + int update_memory_size; + struct imsm_update_reshape *u; + struct mdinfo *spares; + int i; + int delta_disks; + struct mdinfo *dev; + + dprintf("(enter) raid_disks = %i\n", geo->raid_disks); + + delta_disks = geo->raid_disks - old_raid_disks; + + /* size of all update data without anchor */ + update_memory_size = sizeof(struct imsm_update_reshape); + + /* now add space for spare disks that we need to add. 
*/ + update_memory_size += sizeof(u->new_disks[0]) * (delta_disks - 1); + + u = xcalloc(1, update_memory_size); + u->type = update_reshape_container_disks; + u->old_raid_disks = old_raid_disks; + u->new_raid_disks = geo->raid_disks; + + /* now get spare disks list + */ + spares = get_spares_for_grow(st); + + if (spares == NULL || delta_disks > spares->array.spare_disks) { + pr_err("imsm: ERROR: Cannot get spare devices for %s.\n", geo->dev_name); + i = -1; + goto abort; + } + + /* we have got spares + * update disk list in imsm_disk list table in anchor + */ + dprintf("imsm: %i spares are available.\n\n", + spares->array.spare_disks); + + dev = spares->devs; + for (i = 0; i < delta_disks; i++) { + struct dl *dl; + + if (dev == NULL) + break; + u->new_disks[i] = makedev(dev->disk.major, + dev->disk.minor); + dl = get_disk_super(super, dev->disk.major, dev->disk.minor); + dl->index = mpb->num_disks; + mpb->num_disks++; + dev = dev->next; + } + +abort: + /* free spares + */ + sysfs_free(spares); + + dprintf("imsm: reshape update preparation :"); + if (i == delta_disks) { + dprintf_cont(" OK\n"); + *updatep = u; + return update_memory_size; + } + free(u); + dprintf_cont(" Error\n"); + + return 0; +} + +/****************************************************************************** + * function: imsm_create_metadata_update_for_size_change() + * Creates update for IMSM array for array size change. 
+ * + ******************************************************************************/ +static int imsm_create_metadata_update_for_size_change( + struct supertype *st, + struct geo_params *geo, + struct imsm_update_size_change **updatep) +{ + struct intel_super *super = st->sb; + int update_memory_size; + struct imsm_update_size_change *u; + + dprintf("(enter) New size = %llu\n", geo->size); + + /* size of all update data without anchor */ + update_memory_size = sizeof(struct imsm_update_size_change); + + u = xcalloc(1, update_memory_size); + u->type = update_size_change; + u->subdev = super->current_vol; + u->new_size = geo->size; + + dprintf("imsm: reshape update preparation : OK\n"); + *updatep = u; + + return update_memory_size; +} + +/****************************************************************************** + * function: imsm_create_metadata_update_for_migration() + * Creates update for IMSM array. + * + ******************************************************************************/ +static int imsm_create_metadata_update_for_migration( + struct supertype *st, + struct geo_params *geo, + struct imsm_update_reshape_migration **updatep) +{ + struct intel_super *super = st->sb; + int update_memory_size; + struct imsm_update_reshape_migration *u; + struct imsm_dev *dev; + int previous_level = -1; + + dprintf("(enter) New Level = %i\n", geo->level); + + /* size of all update data without anchor */ + update_memory_size = sizeof(struct imsm_update_reshape_migration); + + u = xcalloc(1, update_memory_size); + u->type = update_reshape_migration; + u->subdev = super->current_vol; + u->new_level = geo->level; + u->new_layout = geo->layout; + u->new_raid_disks = u->old_raid_disks = geo->raid_disks; + u->new_disks[0] = -1; + u->new_chunksize = -1; + + dev = get_imsm_dev(super, u->subdev); + if (dev) { + struct imsm_map *map; + + map = get_imsm_map(dev, MAP_0); + if (map) { + int current_chunk_size = + __le16_to_cpu(map->blocks_per_strip) / 2; + + if (geo->chunksize != 
current_chunk_size) { + u->new_chunksize = geo->chunksize / 1024; + dprintf("imsm: chunk size change from %i to %i\n", + current_chunk_size, u->new_chunksize); + } + previous_level = map->raid_level; + } + } + if (geo->level == 5 && previous_level == 0) { + struct mdinfo *spares = NULL; + + u->new_raid_disks++; + spares = get_spares_for_grow(st); + if (spares == NULL || spares->array.spare_disks < 1) { + free(u); + sysfs_free(spares); + update_memory_size = 0; + pr_err("cannot get spare device for requested migration\n"); + return 0; + } + sysfs_free(spares); + } + dprintf("imsm: reshape update preparation : OK\n"); + *updatep = u; + + return update_memory_size; +} + +static void imsm_update_metadata_locally(struct supertype *st, + void *buf, int len) +{ + struct metadata_update mu; + + mu.buf = buf; + mu.len = len; + mu.space = NULL; + mu.space_list = NULL; + mu.next = NULL; + if (imsm_prepare_update(st, &mu)) + imsm_process_update(st, &mu); + + while (mu.space_list) { + void **space = mu.space_list; + mu.space_list = *space; + free(space); + } +} + +/*************************************************************************** +* Function: imsm_analyze_change +* Description: Function analyze change for single volume +* and validate if transition is supported +* Parameters: Geometry parameters, supertype structure, +* metadata change direction (apply/rollback) +* Returns: Operation type code on success, -1 if fail +****************************************************************************/ +enum imsm_reshape_type imsm_analyze_change(struct supertype *st, + struct geo_params *geo, + int direction) +{ + struct mdinfo info; + int change = -1; + int check_devs = 0; + int chunk; + /* number of added/removed disks in operation result */ + int devNumChange = 0; + /* imsm compatible layout value for array geometry verification */ + int imsm_layout = -1; + int data_disks; + struct imsm_dev *dev; + struct imsm_map *map; + struct intel_super *super; + unsigned long long 
current_size; + unsigned long long free_size; + unsigned long long max_size; + int rv; + + getinfo_super_imsm_volume(st, &info, NULL); + if (geo->level != info.array.level && geo->level >= 0 && + geo->level != UnSet) { + switch (info.array.level) { + case 0: + if (geo->level == 5) { + change = CH_MIGRATION; + if (geo->layout != ALGORITHM_LEFT_ASYMMETRIC) { + pr_err("Error. Requested Layout not supported (left-asymmetric layout is supported only)!\n"); + change = -1; + goto analyse_change_exit; + } + imsm_layout = geo->layout; + check_devs = 1; + devNumChange = 1; /* parity disk added */ + } else if (geo->level == 10) { + change = CH_TAKEOVER; + check_devs = 1; + devNumChange = 2; /* two mirrors added */ + imsm_layout = 0x102; /* imsm supported layout */ + } + break; + case 1: + case 10: + if (geo->level == 0) { + change = CH_TAKEOVER; + check_devs = 1; + devNumChange = -(geo->raid_disks/2); + imsm_layout = 0; /* imsm raid0 layout */ + } + break; + } + if (change == -1) { + pr_err("Error. Level Migration from %d to %d not supported!\n", + info.array.level, geo->level); + goto analyse_change_exit; + } + } else + geo->level = info.array.level; + + if (geo->layout != info.array.layout && + (geo->layout != UnSet && geo->layout != -1)) { + change = CH_MIGRATION; + if (info.array.layout == 0 && info.array.level == 5 && + geo->layout == 5) { + /* reshape 5 -> 4 */ + } else if (info.array.layout == 5 && info.array.level == 5 && + geo->layout == 0) { + /* reshape 4 -> 5 */ + geo->layout = 0; + geo->level = 5; + } else { + pr_err("Error. Layout Migration from %d to %d not supported!\n", + info.array.layout, geo->layout); + change = -1; + goto analyse_change_exit; + } + } else { + geo->layout = info.array.layout; + if (imsm_layout == -1) + imsm_layout = info.array.layout; + } + + if (geo->chunksize > 0 && geo->chunksize != UnSet && + geo->chunksize != info.array.chunk_size) { + if (info.array.level == 10) { + pr_err("Error. 
Chunk size change for RAID 10 is not supported.\n"); + change = -1; + goto analyse_change_exit; + } else if (info.component_size % (geo->chunksize/512)) { + pr_err("New chunk size (%dK) does not evenly divide device size (%lluk). Aborting...\n", + geo->chunksize/1024, info.component_size/2); + change = -1; + goto analyse_change_exit; + } + change = CH_MIGRATION; + } else { + geo->chunksize = info.array.chunk_size; + } + + chunk = geo->chunksize / 1024; + + super = st->sb; + dev = get_imsm_dev(super, super->current_vol); + map = get_imsm_map(dev, MAP_0); + data_disks = imsm_num_data_members(map); + /* compute current size per disk member + */ + current_size = info.custom_array_size / data_disks; + + if (geo->size > 0 && geo->size != MAX_SIZE) { + /* align component size + */ + geo->size = imsm_component_size_alignment_check( + get_imsm_raid_level(dev->vol.map), + chunk * 1024, super->sector_size, + geo->size * 2); + if (geo->size == 0) { + pr_err("Error. Size expansion is supported only (current size is %llu, requested size /rounded/ is 0).\n", + current_size); + goto analyse_change_exit; + } + } + + if (current_size != geo->size && geo->size > 0) { + if (change != -1) { + pr_err("Error. Size change should be the only one at a time.\n"); + change = -1; + goto analyse_change_exit; + } + if ((super->current_vol + 1) != super->anchor->num_raid_devs) { + pr_err("Error. 
The last volume in container can be expanded only (%i/%s).\n", + super->current_vol, st->devnm); + goto analyse_change_exit; + } + /* check the maximum available size + */ + rv = imsm_get_free_size(st, dev->vol.map->num_members, + 0, chunk, &free_size); + if (rv == 0) + /* Cannot find maximum available space + */ + max_size = 0; + else { + max_size = free_size + current_size; + /* align component size + */ + max_size = imsm_component_size_alignment_check( + get_imsm_raid_level(dev->vol.map), + chunk * 1024, super->sector_size, + max_size); + } + if (geo->size == MAX_SIZE) { + /* requested size change to the maximum available size + */ + if (max_size == 0) { + pr_err("Error. Cannot find maximum available space.\n"); + change = -1; + goto analyse_change_exit; + } else + geo->size = max_size; + } + + if (direction == ROLLBACK_METADATA_CHANGES) { + /* accept size for rollback only + */ + } else { + /* round size due to metadata compatibility + */ + geo->size = (geo->size >> SECT_PER_MB_SHIFT) + << SECT_PER_MB_SHIFT; + dprintf("Prepare update for size change to %llu\n", + geo->size ); + if (current_size >= geo->size) { + pr_err("Error. Size expansion is supported only (current size is %llu, requested size /rounded/ is %llu).\n", + current_size, geo->size); + goto analyse_change_exit; + } + if (max_size && geo->size > max_size) { + pr_err("Error. Requested size is larger than maximum available size (maximum available size is %llu, requested size /rounded/ is %llu).\n", + max_size, geo->size); + goto analyse_change_exit; + } + } + geo->size *= data_disks; + geo->raid_disks = dev->vol.map->num_members; + change = CH_ARRAY_SIZE; + } + if (!validate_geometry_imsm(st, + geo->level, + imsm_layout, + geo->raid_disks + devNumChange, + &chunk, + geo->size, INVALID_SECTORS, + 0, 0, info.consistency_policy, 1)) + change = -1; + + if (check_devs) { + struct intel_super *super = st->sb; + struct imsm_super *mpb = super->anchor; + + if (mpb->num_raid_devs > 1) { + pr_err("Error. 
Cannot perform operation on %s- for this operation it MUST be single array in container\n", + geo->dev_name); + change = -1; + } + } + +analyse_change_exit: + if (direction == ROLLBACK_METADATA_CHANGES && + (change == CH_MIGRATION || change == CH_TAKEOVER)) { + dprintf("imsm: Metadata changes rollback is not supported for migration and takeover operations.\n"); + change = -1; + } + return change; +} + +int imsm_takeover(struct supertype *st, struct geo_params *geo) +{ + struct intel_super *super = st->sb; + struct imsm_update_takeover *u; + + u = xmalloc(sizeof(struct imsm_update_takeover)); + + u->type = update_takeover; + u->subarray = super->current_vol; + + /* 10->0 transition */ + if (geo->level == 0) + u->direction = R10_TO_R0; + + /* 0->10 transition */ + if (geo->level == 10) + u->direction = R0_TO_R10; + + /* update metadata locally */ + imsm_update_metadata_locally(st, u, + sizeof(struct imsm_update_takeover)); + /* and possibly remotely */ + if (st->update_tail) + append_metadata_update(st, u, + sizeof(struct imsm_update_takeover)); + else + free(u); + + return 0; +} + +/* Flush size update if size calculated by num_data_stripes is higher than + * imsm_dev_size to eliminate differences during reshape. + * Mdmon will recalculate them correctly. + * If subarray index is not set then check whole container. 
+ * Returns: + * 0 - no error occurred + * 1 - error detected + */ +static int imsm_fix_size_mismatch(struct supertype *st, int subarray_index) +{ + struct intel_super *super = st->sb; + int tmp = super->current_vol; + int ret_val = 1; + int i; + + for (i = 0; i < super->anchor->num_raid_devs; i++) { + if (subarray_index >= 0 && i != subarray_index) + continue; + super->current_vol = i; + struct imsm_dev *dev = get_imsm_dev(super, super->current_vol); + struct imsm_map *map = get_imsm_map(dev, MAP_0); + unsigned int disc_count = imsm_num_data_members(map); + struct geo_params geo; + struct imsm_update_size_change *update; + unsigned long long calc_size = per_dev_array_size(map) * disc_count; + unsigned long long d_size = imsm_dev_size(dev); + int u_size; + + if (calc_size == d_size || dev->vol.migr_type == MIGR_GEN_MIGR) + continue; + + /* There is a difference, confirm that imsm_dev_size is + * smaller and push update. + */ + if (d_size > calc_size) { + pr_err("imsm: dev size of subarray %d is incorrect\n", + i); + goto exit; + } + memset(&geo, 0, sizeof(struct geo_params)); + geo.size = d_size; + u_size = imsm_create_metadata_update_for_size_change(st, &geo, + &update); + if (u_size < 1) { + dprintf("imsm: Cannot prepare size change update\n"); + goto exit; + } + imsm_update_metadata_locally(st, update, u_size); + if (st->update_tail) { + append_metadata_update(st, update, u_size); + flush_metadata_updates(st); + st->update_tail = &st->updates; + } else { + imsm_sync_metadata(st); + } + + free(update); + } + ret_val = 0; +exit: + super->current_vol = tmp; + return ret_val; +} + +static int imsm_reshape_super(struct supertype *st, unsigned long long size, + int level, + int layout, int chunksize, int raid_disks, + int delta_disks, char *backup, char *dev, + int direction, int verbose) +{ + int ret_val = 1; + struct geo_params geo; + + dprintf("(enter)\n"); + + memset(&geo, 0, sizeof(struct geo_params)); + + geo.dev_name = dev; + strcpy(geo.devnm, st->devnm); + 
geo.size = size; + geo.level = level; + geo.layout = layout; + geo.chunksize = chunksize; + geo.raid_disks = raid_disks; + if (delta_disks != UnSet) + geo.raid_disks += delta_disks; + + dprintf("for level : %i\n", geo.level); + dprintf("for raid_disks : %i\n", geo.raid_disks); + + if (strcmp(st->container_devnm, st->devnm) == 0) { + /* On container level we can only increase number of devices. */ + dprintf("imsm: info: Container operation\n"); + int old_raid_disks = 0; + + if (imsm_reshape_is_allowed_on_container( + st, &geo, &old_raid_disks, direction)) { + struct imsm_update_reshape *u = NULL; + int len; + + if (imsm_fix_size_mismatch(st, -1)) { + dprintf("imsm: Cannot fix size mismatch\n"); + goto exit_imsm_reshape_super; + } + + len = imsm_create_metadata_update_for_reshape( + st, &geo, old_raid_disks, &u); + + if (len <= 0) { + dprintf("imsm: Cannot prepare update\n"); + goto exit_imsm_reshape_super; + } + + ret_val = 0; + /* update metadata locally */ + imsm_update_metadata_locally(st, u, len); + /* and possibly remotely */ + if (st->update_tail) + append_metadata_update(st, u, len); + else + free(u); + + } else { + pr_err("(imsm) Operation is not allowed on this container\n"); + } + } else { + /* On volume level we support following operations + * - takeover: raid10 -> raid0; raid0 -> raid10 + * - chunk size migration + * - migration: raid5 -> raid0; raid0 -> raid5 + */ + struct intel_super *super = st->sb; + struct intel_dev *dev = super->devlist; + int change; + dprintf("imsm: info: Volume operation\n"); + /* find requested device */ + while (dev) { + char *devnm = + imsm_find_array_devnm_by_subdev( + dev->index, st->container_devnm); + if (devnm && strcmp(devnm, geo.devnm) == 0) + break; + dev = dev->next; + } + if (dev == NULL) { + pr_err("Cannot find %s (%s) subarray\n", + geo.dev_name, geo.devnm); + goto exit_imsm_reshape_super; + } + super->current_vol = dev->index; + change = imsm_analyze_change(st, &geo, direction); + switch (change) { + case 
CH_TAKEOVER: + ret_val = imsm_takeover(st, &geo); + break; + case CH_MIGRATION: { + struct imsm_update_reshape_migration *u = NULL; + int len = + imsm_create_metadata_update_for_migration( + st, &geo, &u); + if (len < 1) { + dprintf("imsm: Cannot prepare update\n"); + break; + } + ret_val = 0; + /* update metadata locally */ + imsm_update_metadata_locally(st, u, len); + /* and possibly remotely */ + if (st->update_tail) + append_metadata_update(st, u, len); + else + free(u); + } + break; + case CH_ARRAY_SIZE: { + struct imsm_update_size_change *u = NULL; + int len = + imsm_create_metadata_update_for_size_change( + st, &geo, &u); + if (len < 1) { + dprintf("imsm: Cannot prepare update\n"); + break; + } + ret_val = 0; + /* update metadata locally */ + imsm_update_metadata_locally(st, u, len); + /* and possibly remotely */ + if (st->update_tail) + append_metadata_update(st, u, len); + else + free(u); + } + break; + default: + ret_val = 1; + } + } + +exit_imsm_reshape_super: + dprintf("imsm: reshape_super Exit code = %i\n", ret_val); + return ret_val; +} + +#define COMPLETED_OK 0 +#define COMPLETED_NONE 1 +#define COMPLETED_DELAYED 2 + +static int read_completed(int fd, unsigned long long *val) +{ + int ret; + char buf[50]; + + ret = sysfs_fd_get_str(fd, buf, 50); + if (ret < 0) + return ret; + + ret = COMPLETED_OK; + if (strncmp(buf, "none", 4) == 0) { + ret = COMPLETED_NONE; + } else if (strncmp(buf, "delayed", 7) == 0) { + ret = COMPLETED_DELAYED; + } else { + char *ep; + *val = strtoull(buf, &ep, 0); + if (ep == buf || (*ep != 0 && *ep != '\n' && *ep != ' ')) + ret = -1; + } + return ret; +} + +/******************************************************************************* + * Function: wait_for_reshape_imsm + * Description: Function writes new sync_max value and waits until + * reshape process reach new position + * Parameters: + * sra : general array info + * ndata : number of disks in new array's layout + * Returns: + * 0 : success, + * 1 : there is no reshape 
in progress, + * -1 : fail + ******************************************************************************/ +int wait_for_reshape_imsm(struct mdinfo *sra, int ndata) +{ + int fd = sysfs_get_fd(sra, NULL, "sync_completed"); + int retry = 3; + unsigned long long completed; + /* to_complete : new sync_max position */ + unsigned long long to_complete = sra->reshape_progress; + unsigned long long position_to_set = to_complete / ndata; + + if (!is_fd_valid(fd)) { + dprintf("cannot open reshape_position\n"); + return 1; + } + + do { + if (sysfs_fd_get_ll(fd, &completed) < 0) { + if (!retry) { + dprintf("cannot read reshape_position (no reshape in progres)\n"); + close(fd); + return 1; + } + usleep(30000); + } else + break; + } while (retry--); + + if (completed > position_to_set) { + dprintf("wrong next position to set %llu (%llu)\n", + to_complete, position_to_set); + close(fd); + return -1; + } + dprintf("Position set: %llu\n", position_to_set); + if (sysfs_set_num(sra, NULL, "sync_max", + position_to_set) != 0) { + dprintf("cannot set reshape position to %llu\n", + position_to_set); + close(fd); + return -1; + } + + do { + int rc; + char action[20]; + int timeout = 3000; + + sysfs_wait(fd, &timeout); + if (sysfs_get_str(sra, NULL, "sync_action", + action, 20) > 0 && + strncmp(action, "reshape", 7) != 0) { + if (strncmp(action, "idle", 4) == 0) + break; + close(fd); + return -1; + } + + rc = read_completed(fd, &completed); + if (rc < 0) { + dprintf("cannot read reshape_position (in loop)\n"); + close(fd); + return 1; + } else if (rc == COMPLETED_NONE) + break; + } while (completed < position_to_set); + + close(fd); + return 0; +} + +/******************************************************************************* + * Function: check_degradation_change + * Description: Check that array hasn't become failed. 
+ * Parameters: + * info : for sysfs access + * sources : source disks descriptors + * degraded: previous degradation level + * Returns: + * degradation level + ******************************************************************************/ +int check_degradation_change(struct mdinfo *info, + int *sources, + int degraded) +{ + unsigned long long new_degraded; + int rv; + + rv = sysfs_get_ll(info, NULL, "degraded", &new_degraded); + if (rv == -1 || (new_degraded != (unsigned long long)degraded)) { + /* check each device to ensure it is still working */ + struct mdinfo *sd; + new_degraded = 0; + for (sd = info->devs ; sd ; sd = sd->next) { + if (sd->disk.state & (1<<MD_DISK_FAULTY)) + continue; + if (sd->disk.state & (1<<MD_DISK_SYNC)) { + char sbuf[100]; + int raid_disk = sd->disk.raid_disk; + + if (sysfs_get_str(info, + sd, "state", sbuf, sizeof(sbuf)) < 0 || + strstr(sbuf, "faulty") || + strstr(sbuf, "in_sync") == NULL) { + /* this device is dead */ + sd->disk.state = (1<<MD_DISK_FAULTY); + if (raid_disk >= 0) + close_fd(&sources[raid_disk]); + new_degraded++; + } + } + } + } + + return new_degraded; +} + +/******************************************************************************* + * Function: imsm_manage_reshape + * Description: Function finds array under reshape and it manages reshape + * process. It creates stripes backups (if required) and sets + * checkpoints. 
+ * Parameters: + * afd : Backup handle (nattive) - not used + * sra : general array info + * reshape : reshape parameters - not used + * st : supertype structure + * blocks : size of critical section [blocks] + * fds : table of source device descriptor + * offsets : start of array (offest per devices) + * dests : not used + * destfd : table of destination device descriptor + * destoffsets : table of destination offsets (per device) + * Returns: + * 1 : success, reshape is done + * 0 : fail + ******************************************************************************/ +static int imsm_manage_reshape( + int afd, struct mdinfo *sra, struct reshape *reshape, + struct supertype *st, unsigned long backup_blocks, + int *fds, unsigned long long *offsets, + int dests, int *destfd, unsigned long long *destoffsets) +{ + int ret_val = 0; + struct intel_super *super = st->sb; + struct intel_dev *dv; + unsigned int sector_size = super->sector_size; + struct imsm_dev *dev = NULL; + struct imsm_map *map_src, *map_dest; + int migr_vol_qan = 0; + int ndata, odata; /* [bytes] */ + int chunk; /* [bytes] */ + struct migr_record *migr_rec; + char *buf = NULL; + unsigned int buf_size; /* [bytes] */ + unsigned long long max_position; /* array size [bytes] */ + unsigned long long next_step; /* [blocks]/[bytes] */ + unsigned long long old_data_stripe_length; + unsigned long long start_src; /* [bytes] */ + unsigned long long start; /* [bytes] */ + unsigned long long start_buf_shift; /* [bytes] */ + int degraded = 0; + int source_layout = 0; + int subarray_index = -1; + + if (!sra) + return ret_val; + + if (!fds || !offsets) + goto abort; + + /* Find volume during the reshape */ + for (dv = super->devlist; dv; dv = dv->next) { + if (dv->dev->vol.migr_type == MIGR_GEN_MIGR && + dv->dev->vol.migr_state == 1) { + dev = dv->dev; + migr_vol_qan++; + subarray_index = dv->index; + } + } + /* Only one volume can migrate at the same time */ + if (migr_vol_qan != 1) { + pr_err("%s", migr_vol_qan ? 
+ "Number of migrating volumes greater than 1\n" : + "There is no volume during migrationg\n"); + goto abort; + } + + map_dest = get_imsm_map(dev, MAP_0); + map_src = get_imsm_map(dev, MAP_1); + if (map_src == NULL) + goto abort; + + ndata = imsm_num_data_members(map_dest); + odata = imsm_num_data_members(map_src); + + chunk = __le16_to_cpu(map_src->blocks_per_strip) * 512; + old_data_stripe_length = odata * chunk; + + migr_rec = super->migr_rec; + + /* initialize migration record for start condition */ + if (sra->reshape_progress == 0) + init_migr_record_imsm(st, dev, sra); + else { + if (__le32_to_cpu(migr_rec->rec_status) != UNIT_SRC_NORMAL) { + dprintf("imsm: cannot restart migration when data are present in copy area.\n"); + goto abort; + } + /* Save checkpoint to update migration record for current + * reshape position (in md). It can be farther than current + * reshape position in metadata. + */ + if (save_checkpoint_imsm(st, sra, UNIT_SRC_NORMAL) == 1) { + /* ignore error == 2, this can mean end of reshape here + */ + dprintf("imsm: Cannot write checkpoint to migration record (UNIT_SRC_NORMAL, initial save)\n"); + goto abort; + } + } + + /* size for data */ + buf_size = __le32_to_cpu(migr_rec->blocks_per_unit) * 512; + /* extend buffer size for parity disk */ + buf_size += __le32_to_cpu(migr_rec->dest_depth_per_unit) * 512; + /* add space for stripe alignment */ + buf_size += old_data_stripe_length; + if (posix_memalign((void **)&buf, MAX_SECTOR_SIZE, buf_size)) { + dprintf("imsm: Cannot allocate checkpoint buffer\n"); + goto abort; + } + + max_position = sra->component_size * ndata; + source_layout = imsm_level_to_layout(map_src->raid_level); + + while (current_migr_unit(migr_rec) < + get_num_migr_units(migr_rec)) { + /* current reshape position [blocks] */ + unsigned long long current_position = + __le32_to_cpu(migr_rec->blocks_per_unit) + * current_migr_unit(migr_rec); + unsigned long long border; + + /* Check that array hasn't become failed. 
+ */ + degraded = check_degradation_change(sra, fds, degraded); + if (degraded > 1) { + dprintf("imsm: Abort reshape due to degradation level (%i)\n", degraded); + goto abort; + } + + next_step = __le32_to_cpu(migr_rec->blocks_per_unit); + + if ((current_position + next_step) > max_position) + next_step = max_position - current_position; + + start = current_position * 512; + + /* align reading start to old geometry */ + start_buf_shift = start % old_data_stripe_length; + start_src = start - start_buf_shift; + + border = (start_src / odata) - (start / ndata); + border /= 512; + if (border <= __le32_to_cpu(migr_rec->dest_depth_per_unit)) { + /* save critical stripes to buf + * start - start address of current unit + * to backup [bytes] + * start_src - start address of current unit + * to backup alligned to source array + * [bytes] + */ + unsigned long long next_step_filler; + unsigned long long copy_length = next_step * 512; + + /* allign copy area length to stripe in old geometry */ + next_step_filler = ((copy_length + start_buf_shift) + % old_data_stripe_length); + if (next_step_filler) + next_step_filler = (old_data_stripe_length + - next_step_filler); + dprintf("save_stripes() parameters: start = %llu,\tstart_src = %llu,\tnext_step*512 = %llu,\tstart_in_buf_shift = %llu,\tnext_step_filler = %llu\n", + start, start_src, copy_length, + start_buf_shift, next_step_filler); + + if (save_stripes(fds, offsets, map_src->num_members, + chunk, map_src->raid_level, + source_layout, 0, NULL, start_src, + copy_length + + next_step_filler + start_buf_shift, + buf)) { + dprintf("imsm: Cannot save stripes to buffer\n"); + goto abort; + } + /* Convert data to destination format and store it + * in backup general migration area + */ + if (save_backup_imsm(st, dev, sra, + buf + start_buf_shift, copy_length)) { + dprintf("imsm: Cannot save stripes to target devices\n"); + goto abort; + } + if (save_checkpoint_imsm(st, sra, + UNIT_SRC_IN_CP_AREA)) { + dprintf("imsm: Cannot write 
checkpoint to migration record (UNIT_SRC_IN_CP_AREA)\n"); + goto abort; + } + } else { + /* set next step to use whole border area */ + border /= next_step; + if (border > 1) + next_step *= border; + } + /* When data backed up, checkpoint stored, + * kick the kernel to reshape unit of data + */ + next_step = next_step + sra->reshape_progress; + /* limit next step to array max position */ + if (next_step > max_position) + next_step = max_position; + sysfs_set_num(sra, NULL, "suspend_lo", sra->reshape_progress); + sysfs_set_num(sra, NULL, "suspend_hi", next_step); + sra->reshape_progress = next_step; + + /* wait until reshape finish */ + if (wait_for_reshape_imsm(sra, ndata)) { + dprintf("wait_for_reshape_imsm returned error!\n"); + goto abort; + } + if (sigterm) + goto abort; + + if (save_checkpoint_imsm(st, sra, UNIT_SRC_NORMAL) == 1) { + /* ignore error == 2, this can mean end of reshape here + */ + dprintf("imsm: Cannot write checkpoint to migration record (UNIT_SRC_NORMAL)\n"); + goto abort; + } + + } + + /* clear migr_rec on disks after successful migration */ + struct dl *d; + + memset(super->migr_rec_buf, 0, MIGR_REC_BUF_SECTORS*MAX_SECTOR_SIZE); + for (d = super->disks; d; d = d->next) { + if (d->index < 0 || is_failed(&d->disk)) + continue; + unsigned long long dsize; + + get_dev_size(d->fd, NULL, &dsize); + if (lseek64(d->fd, dsize - MIGR_REC_SECTOR_POSITION*sector_size, + SEEK_SET) >= 0) { + if ((unsigned int)write(d->fd, super->migr_rec_buf, + MIGR_REC_BUF_SECTORS*sector_size) != + MIGR_REC_BUF_SECTORS*sector_size) + perror("Write migr_rec failed"); + } + } + + /* return '1' if done */ + ret_val = 1; + + /* After the reshape eliminate size mismatch in metadata. + * Don't update md/component_size here, volume hasn't + * to take whole space. It is allowed by kernel. + * md/component_size will be set propoperly after next assembly. 
/*******************************************************************************
 * Function:	calculate_bitmap_min_chunksize
 * Description:	Calculates the minimal valid bitmap chunk size
 * Parameters:
 *	max_bits	: indicate how many bits can be used for the bitmap;
 *			  0 is treated as 1 (a bitmap needs at least one bit)
 *	data_area_size	: the size of the data area covered by the bitmap
 *
 * Returns:
 *	The bitmap chunk size: 4096 doubled until the number of chunks needed
 *	to cover data_area_size fits in max_bits.
 ******************************************************************************/
static unsigned long long
calculate_bitmap_min_chunksize(unsigned long long max_bits,
			       unsigned long long data_area_size)
{
	unsigned long long min_chunk =
		4096; /* sub-page chunks don't work yet.. */
	unsigned long long bits = data_area_size / min_chunk + 1;

	/* 'bits' never drops below 1, so max_bits == 0 would loop forever. */
	if (max_bits == 0)
		max_bits = 1;

	/* Stop doubling before min_chunk could wrap around to 0. */
	while (bits > max_bits && min_chunk <= ~0ULL / 2) {
		min_chunk *= 2;
		bits = (bits + 1) / 2;
	}
	return min_chunk;
}
return result; +} + +/******************************************************************************* + * Function: init_bitmap_header + * Description: Initialize the bitmap header structure + * Parameters: + * st : supertype information + * bms : bitmap header struct to initialize + * dev : device for the bitmap + * + * Returns: + * 0 : success + * -1 : fail + ******************************************************************************/ +static int init_bitmap_header(struct supertype *st, struct bitmap_super_s *bms, + struct imsm_dev *dev) +{ + int vol_uuid[4]; + + if (!bms || !dev) + return -1; + + bms->magic = __cpu_to_le32(BITMAP_MAGIC); + bms->version = __cpu_to_le32(BITMAP_MAJOR_HI); + bms->daemon_sleep = __cpu_to_le32(IMSM_DEFAULT_BITMAP_DAEMON_SLEEP); + bms->sync_size = __cpu_to_le64(IMSM_BITMAP_AREA_SIZE); + bms->write_behind = __cpu_to_le32(0); + + uuid_from_super_imsm(st, vol_uuid); + memcpy(bms->uuid, vol_uuid, 16); + + bms->chunksize = calculate_bitmap_chunksize(st, dev); + + return 0; +} + +/******************************************************************************* + * Function: validate_internal_bitmap_for_drive + * Description: Verify if the bitmap header for a given drive. + * Parameters: + * st : supertype information + * offset : The offset from the beginning of the drive where to look for + * the bitmap header. 
+ * d : the drive info + * + * Returns: + * 0 : success + * -1 : fail + ******************************************************************************/ +static int validate_internal_bitmap_for_drive(struct supertype *st, + unsigned long long offset, + struct dl *d) +{ + struct intel_super *super = st->sb; + int ret = -1; + int vol_uuid[4]; + bitmap_super_t *bms; + int fd; + + if (!d) + return -1; + + void *read_buf; + + if (posix_memalign(&read_buf, MAX_SECTOR_SIZE, IMSM_BITMAP_HEADER_SIZE)) + return -1; + + fd = d->fd; + if (!is_fd_valid(fd)) { + fd = open(d->devname, O_RDONLY, 0); + + if (!is_fd_valid(fd)) { + dprintf("cannot open the device %s\n", d->devname); + goto abort; + } + } + + if (lseek64(fd, offset * super->sector_size, SEEK_SET) < 0) + goto abort; + if (read(fd, read_buf, IMSM_BITMAP_HEADER_SIZE) != + IMSM_BITMAP_HEADER_SIZE) + goto abort; + + uuid_from_super_imsm(st, vol_uuid); + + bms = read_buf; + if ((bms->magic != __cpu_to_le32(BITMAP_MAGIC)) || + (bms->version != __cpu_to_le32(BITMAP_MAJOR_HI)) || + (!same_uuid((int *)bms->uuid, vol_uuid, st->ss->swapuuid))) { + dprintf("wrong bitmap header detected\n"); + goto abort; + } + + ret = 0; +abort: + if (!is_fd_valid(d->fd)) + close_fd(&fd); + + if (read_buf) + free(read_buf); + + return ret; +} + +/******************************************************************************* + * Function: validate_internal_bitmap_imsm + * Description: Verify if the bitmap header is in place and with proper data. 
+ * Parameters: + * st : supertype information + * + * Returns: + * 0 : success or device w/o RWH_BITMAP + * -1 : fail + ******************************************************************************/ +static int validate_internal_bitmap_imsm(struct supertype *st) +{ + struct intel_super *super = st->sb; + struct imsm_dev *dev = get_imsm_dev(super, super->current_vol); + unsigned long long offset; + struct dl *d; + + if (!dev) + return -1; + + if (dev->rwh_policy != RWH_BITMAP) + return 0; + + offset = get_bitmap_header_sector(super, super->current_vol); + for (d = super->disks; d; d = d->next) { + if (d->index < 0 || is_failed(&d->disk)) + continue; + + if (validate_internal_bitmap_for_drive(st, offset, d)) { + pr_err("imsm: bitmap validation failed\n"); + return -1; + } + } + return 0; +} + +/******************************************************************************* + * Function: add_internal_bitmap_imsm + * Description: Mark the volume to use the bitmap and updates the chunk size value. 
+ * Parameters: + * st : supertype information + * chunkp : bitmap chunk size + * delay : not used for imsm + * write_behind : not used for imsm + * size : not used for imsm + * may_change : not used for imsm + * amajor : not used for imsm + * + * Returns: + * 0 : success + * -1 : fail + ******************************************************************************/ +static int add_internal_bitmap_imsm(struct supertype *st, int *chunkp, + int delay, int write_behind, + unsigned long long size, int may_change, + int amajor) +{ + struct intel_super *super = st->sb; + int vol_idx = super->current_vol; + struct imsm_dev *dev; + + if (!super->devlist || vol_idx == -1 || !chunkp) + return -1; + + dev = get_imsm_dev(super, vol_idx); + + if (!dev) { + dprintf("cannot find the device for volume index %d\n", + vol_idx); + return -1; + } + dev->rwh_policy = RWH_BITMAP; + + *chunkp = calculate_bitmap_chunksize(st, dev); + + return 0; +} + +/******************************************************************************* + * Function: locate_bitmap_imsm + * Description: Seek 'fd' to start of write-intent-bitmap. + * Parameters: + * st : supertype information + * fd : file descriptor for the device + * node_num : not used for imsm + * + * Returns: + * 0 : success + * -1 : fail + ******************************************************************************/ +static int locate_bitmap_imsm(struct supertype *st, int fd, int node_num) +{ + struct intel_super *super = st->sb; + unsigned long long offset; + int vol_idx = super->current_vol; + + if (!super->devlist || vol_idx == -1) + return -1; + + offset = get_bitmap_header_sector(super, super->current_vol); + dprintf("bitmap header offset is %llu\n", offset); + + lseek64(fd, offset << 9, 0); + + return 0; +} + +/******************************************************************************* + * Function: write_init_bitmap_imsm + * Description: Write a bitmap header and prepares the area for the bitmap. 
+ * Parameters: + * st : supertype information + * fd : file descriptor for the device + * update : not used for imsm + * + * Returns: + * 0 : success + * -1 : fail + ******************************************************************************/ +static int write_init_bitmap_imsm(struct supertype *st, int fd, + enum bitmap_update update) +{ + struct intel_super *super = st->sb; + int vol_idx = super->current_vol; + int ret = 0; + unsigned long long offset; + bitmap_super_t bms = { 0 }; + size_t written = 0; + size_t to_write; + ssize_t rv_num; + void *buf; + + if (!super->devlist || !super->sector_size || vol_idx == -1) + return -1; + + struct imsm_dev *dev = get_imsm_dev(super, vol_idx); + + /* first clear the space for bitmap header */ + unsigned long long bitmap_area_start = + get_bitmap_header_sector(super, vol_idx); + + dprintf("zeroing area start (%llu) and size (%u)\n", bitmap_area_start, + IMSM_BITMAP_AND_HEADER_SIZE / super->sector_size); + if (zero_disk_range(fd, bitmap_area_start, + IMSM_BITMAP_HEADER_SIZE / super->sector_size)) { + pr_err("imsm: cannot zeroing the space for the bitmap\n"); + return -1; + } + + /* The bitmap area should be filled with "1"s to perform initial + * synchronization. 
/*******************************************************************************
 * Function:	is_vol_to_setup_bitmap
 * Description:	Checks if a bitmap should be activated on the dev.
 * Parameters:
 *	info	: info about the volume to setup the bitmap
 *	dev	: the device to check against bitmap creation
 *
 * Returns:
 *	-1 : dev has the same volume name as info and uses RWH_BITMAP
 *	     (the caller treats any non-zero result as "set the bitmap up"),
 *	     or either argument is NULL
 *	 0 : otherwise
 *
 * NOTE(review): NULL arguments yield the same value as a match, so the
 * caller proceeds with them; confirm whether that is intended.
 ******************************************************************************/
static int is_vol_to_setup_bitmap(struct mdinfo *info, struct imsm_dev *dev)
{
	if (!dev || !info)
		return -1;

	/* Same volume name and bitmap policy selected: non-zero result. */
	if ((strcmp((char *)dev->volume, info->name) == 0) &&
	    (dev->rwh_policy == RWH_BITMAP))
		return -1;

	return 0;
}
+ * Parameters: + * info : info about the volume where the bitmap should be setup + * chunksize : bitmap chunk size + * location : location of the bitmap + * + * Returns: + * 0 : success + * -1 : fail + ******************************************************************************/ +static int set_bitmap_sysfs(struct mdinfo *info, unsigned long long chunksize, + char *location) +{ + /* The bitmap/metadata is set to external to allow changing of value for + * bitmap/location. When external is used, the kernel will treat an offset + * related to the device's first lba (in opposition to the "internal" case + * when this value is related to the beginning of the superblock). + */ + if (sysfs_set_str(info, NULL, "bitmap/metadata", "external")) { + dprintf("failed to set bitmap/metadata\n"); + return -1; + } + + /* It can only be changed when no bitmap is active. + * Should be bigger than 512 and must be power of 2. + * It is expecting the value in bytes. + */ + if (sysfs_set_num(info, NULL, "bitmap/chunksize", + __cpu_to_le32(chunksize))) { + dprintf("failed to set bitmap/chunksize\n"); + return -1; + } + + /* It is expecting the value in sectors. */ + if (sysfs_set_num(info, NULL, "bitmap/space", + __cpu_to_le64(IMSM_BITMAP_AREA_SIZE))) { + dprintf("failed to set bitmap/space\n"); + return -1; + } + + /* Determines the delay between the bitmap updates. + * It is expecting the value in seconds. + */ + if (sysfs_set_num(info, NULL, "bitmap/time_base", + __cpu_to_le64(IMSM_DEFAULT_BITMAP_DAEMON_SLEEP))) { + dprintf("failed to set bitmap/time_base\n"); + return -1; + } + + /* It is expecting the value in sectors with a sign at the beginning. 
*/ + if (sysfs_set_str(info, NULL, "bitmap/location", location)) { + dprintf("failed to set bitmap/location\n"); + return -1; + } + + return 0; +} + +/******************************************************************************* + * Function: set_bitmap_imsm + * Description: Setup the bitmap for the given volume + * Parameters: + * st : supertype information + * info : info about the volume where the bitmap should be setup + * + * Returns: + * 0 : success + * -1 : fail + ******************************************************************************/ +static int set_bitmap_imsm(struct supertype *st, struct mdinfo *info) +{ + struct intel_super *super = st->sb; + int prev_current_vol = super->current_vol; + struct imsm_dev *dev; + int ret = -1; + char location[16] = ""; + unsigned long long chunksize; + struct intel_dev *dev_it; + + for (dev_it = super->devlist; dev_it; dev_it = dev_it->next) { + super->current_vol = dev_it->index; + dev = get_imsm_dev(super, super->current_vol); + + if (is_vol_to_setup_bitmap(info, dev)) { + if (validate_internal_bitmap_imsm(st)) { + dprintf("bitmap header validation failed\n"); + goto abort; + } + + chunksize = calculate_bitmap_chunksize(st, dev); + dprintf("chunk size is %llu\n", chunksize); + + snprintf(location, sizeof(location), "+%llu", + get_bitmap_sector(super, super->current_vol)); + dprintf("bitmap offset is %s\n", location); + + if (set_bitmap_sysfs(info, chunksize, location)) { + dprintf("cannot setup the bitmap\n"); + goto abort; + } + } + } + ret = 0; +abort: + super->current_vol = prev_current_vol; + return ret; +} + +struct superswitch super_imsm = { + .examine_super = examine_super_imsm, + .brief_examine_super = brief_examine_super_imsm, + .brief_examine_subarrays = brief_examine_subarrays_imsm, + .export_examine_super = export_examine_super_imsm, + .detail_super = detail_super_imsm, + .brief_detail_super = brief_detail_super_imsm, + .write_init_super = write_init_super_imsm, + .validate_geometry = 
/* Method table binding the generic superswitch operations to their IMSM
 * implementations.
 */
struct superswitch super_imsm = {
	.examine_super	= examine_super_imsm,
	.brief_examine_super = brief_examine_super_imsm,
	.brief_examine_subarrays = brief_examine_subarrays_imsm,
	.export_examine_super = export_examine_super_imsm,
	.detail_super	= detail_super_imsm,
	.brief_detail_super = brief_detail_super_imsm,
	.write_init_super = write_init_super_imsm,
	.validate_geometry = validate_geometry_imsm,
	.add_to_super	= add_to_super_imsm,
	.remove_from_super = remove_from_super_imsm,
	.detail_platform = detail_platform_imsm,
	.export_detail_platform = export_detail_platform_imsm,
	.kill_subarray = kill_subarray_imsm,
	.update_subarray = update_subarray_imsm,
	.load_container	= load_container_imsm,
	.default_geometry = default_geometry_imsm,
	.get_disk_controller_domain = imsm_get_disk_controller_domain,
	/* reshape support */
	.reshape_super  = imsm_reshape_super,
	.manage_reshape = imsm_manage_reshape,
	.recover_backup = recover_backup_imsm,
	.examine_badblocks = examine_badblocks_imsm,
	.match_home	= match_home_imsm,
	.uuid_from_super= uuid_from_super_imsm,
	.getinfo_super  = getinfo_super_imsm,
	.getinfo_super_disks = getinfo_super_disks_imsm,
	.update_super	= update_super_imsm,

	.avail_size	= avail_size_imsm,
	.get_spare_criteria = get_spare_criteria_imsm,

	.compare_super	= compare_super_imsm,

	.load_super	= load_super_imsm,
	.init_super	= init_super_imsm,
	.store_super	= store_super_imsm,
	.free_super	= free_super_imsm,
	.match_metadata_desc = match_metadata_desc_imsm,
	.container_content = container_content_imsm,
	.validate_container = validate_container_imsm,

	/* write-intent bitmap support */
	.add_internal_bitmap = add_internal_bitmap_imsm,
	.locate_bitmap = locate_bitmap_imsm,
	.write_bitmap = write_init_bitmap_imsm,
	.set_bitmap = set_bitmap_imsm,

	.write_init_ppl	= write_init_ppl_imsm,
	.validate_ppl	= validate_ppl_imsm,

	.external	= 1,
	.name = "imsm",

/* for mdmon */
	.open_new	= imsm_open_new,
	.set_array_state= imsm_set_array_state,
	.set_disk	= imsm_set_disk,
	.sync_metadata	= imsm_sync_metadata,
	.activate_spare = imsm_activate_spare,
	.process_update = imsm_process_update,
	.prepare_update = imsm_prepare_update,
	.record_bad_block = imsm_record_badblock,
	.clear_bad_block  = imsm_clear_badblock,
	.get_bad_blocks   = imsm_get_badblocks,
};
--- /dev/null +++ b/super-mbr.c @@ -0,0 +1,206 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2010 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neil@brown.name> + * + */ + +/* + * 'mbr' is a pseudo metadata type for devices which have a + * partition table in the Master Boot Record (mbr) also known + * as a dos partition table. + * + * Obviously arrays cannot be created or assembled for this type. + * It is used to allow a new bare device to have an partition table + * added so the member partitions can then be included in other + * arrays as relevant. + * + * The meaning operations are: + * examine_super, but not brief_examine_super or export_examine + * load_super + * store_super + */ + +#include "mdadm.h" +#include "part.h" + +static void free_mbr(struct supertype *st) +{ + free(st->sb); + st->sb = NULL; +} + +static void examine_mbr(struct supertype *st, char *homehost) +{ + struct MBR *sb = st->sb; + int i; + + printf(" MBR Magic : %04x\n", sb->magic); + for (i = 0; i < MBR_PARTITIONS; i++) + /* + * Have to make every access through sb rather than using a + * pointer to the partition table (or an entry), since the + * entries are not properly aligned. 
+ */ + if (sb->parts[i].blocks_num) + printf("Partition[%d] : %12lu sectors at %12lu (type %02x)\n", + i, + (unsigned long)__le32_to_cpu(sb->parts[i].blocks_num), + (unsigned long)__le32_to_cpu(sb->parts[i].first_sect_lba), + sb->parts[i].part_type); + +} + +static int load_super_mbr(struct supertype *st, int fd, char *devname) +{ + /* try to read an mbr + * Return + * 0 on success + * 1 cannot get record + * 2 record is meaningless + */ + struct MBR *super; + + free_mbr(st); + + if (posix_memalign((void**)&super, 512, 512) != 0) { + pr_err("could not allocate superblock\n"); + return 1; + } + + lseek(fd, 0, 0); + if (read(fd, super, sizeof(*super)) != sizeof(*super)) { + if (devname) + pr_err("Cannot read partition table on %s\n", + devname); + free(super); + return 1; + } + + if (super->magic != MBR_SIGNATURE_MAGIC) { + if (devname) + pr_err("No partition table found on %s\n", + devname); + free(super); + return 1; + } + + st->sb = super; + + if (st->ss == NULL) { + st->ss = &mbr; + st->minor_version = 0; + st->max_devs = 1; + st->info = NULL; + } + return 0; +} + +static int store_mbr(struct supertype *st, int fd) +{ + struct MBR *old, *super; + + if (posix_memalign((void**)&old, 512, 512) != 0) { + pr_err("could not allocate superblock\n"); + return 1; + } + + lseek(fd, 0, 0); + if (read(fd, old, sizeof(*old)) != sizeof(*old)) { + free(old); + return 1; + } + + super = st->sb; + memcpy(super->pad, old->pad, sizeof(super->pad)); + free(old); + lseek(fd, 0, 0); + if (write(fd, super, sizeof(*super)) != sizeof(*super)) + return 4; + fsync(fd); + ioctl(fd, BLKRRPART, 0); + return 0; +} + +static void getinfo_mbr(struct supertype *st, struct mdinfo *info, char *map) +{ + struct MBR *sb = st->sb; + int i; + + memset(&info->array, 0, sizeof(info->array)); + memset(&info->disk, 0, sizeof(info->disk)); + strcpy(info->text_version, "mbr"); + strcpy(info->name, "mbr"); + info->component_size = 0; + + for (i = 0; i < MBR_PARTITIONS ; i++) + /* + * Have to make every access 
through sb rather than using a + * pointer to the partition table (or an entry), since the + * entries are not properly aligned. + */ + if (sb->parts[i].blocks_num) { + unsigned long last = + (unsigned long)__le32_to_cpu(sb->parts[i].blocks_num) + + (unsigned long)__le32_to_cpu(sb->parts[i].first_sect_lba); + if (last > info->component_size) + info->component_size = last; + } + +} + +static struct supertype *match_metadata_desc(char *arg) +{ + struct supertype *st; + + if (strcmp(arg, "mbr") != 0) + return NULL; + + st = xmalloc(sizeof(*st)); + st->ss = &mbr; + st->info = NULL; + st->minor_version = 0; + st->max_devs = 1; + st->sb = NULL; + return st; +} + +static int validate_geometry(struct supertype *st, int level, + int layout, int raiddisks, + int *chunk, unsigned long long size, + unsigned long long data_offset, + char *subdev, unsigned long long *freesize, + int consistency_policy, int verbose) +{ + pr_err("mbr metadata cannot be used this way\n"); + return 0; +} + +struct superswitch mbr = { + .examine_super = examine_mbr, + .validate_geometry = validate_geometry, + .match_metadata_desc = match_metadata_desc, + .load_super = load_super_mbr, + .store_super = store_mbr, + .getinfo_super = getinfo_mbr, + .free_super = free_mbr, + .name = "mbr", +}; diff --git a/super0.c b/super0.c new file mode 100644 index 0000000..b79b97a --- /dev/null +++ b/super0.c @@ -0,0 +1,1350 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +#define HAVE_STDINT_H 1 +#include "mdadm.h" +#include "sha1.h" +/* + * All handling for the 0.90.0 version superblock is in + * this file. + * This includes: + * - finding, loading, and writing the superblock. + * - initialising a new superblock + * - printing the superblock for --examine + * - printing part of the superblock for --detail + * .. other stuff + */ + +static unsigned long calc_sb0_csum(mdp_super_t *super) +{ + unsigned long csum = super->sb_csum; + unsigned long newcsum; + super->sb_csum= 0 ; + newcsum = calc_csum(super, MD_SB_BYTES); + super->sb_csum = csum; + return newcsum; +} + +static void super0_swap_endian(struct mdp_superblock_s *sb) +{ + /* as super0 superblocks are host-endian, it is sometimes + * useful to be able to swap the endianness + * as (almost) everything is u32's we byte-swap every 4byte + * number. 
+ * We then also have to swap the events_hi and events_lo + */ + char *sbc = (char *)sb; + __u32 t32; + int i; + + for (i=0; i < MD_SB_BYTES ; i+=4) { + char t = sbc[i]; + sbc[i] = sbc[i+3]; + sbc[i+3] = t; + t=sbc[i+1]; + sbc[i+1]=sbc[i+2]; + sbc[i+2]=t; + } + t32 = sb->events_hi; + sb->events_hi = sb->events_lo; + sb->events_lo = t32; + + t32 = sb->cp_events_hi; + sb->cp_events_hi = sb->cp_events_lo; + sb->cp_events_lo = t32; + +} + +static void examine_super0(struct supertype *st, char *homehost) +{ + mdp_super_t *sb = st->sb; + time_t atime; + int d; + int delta_extra = 0; + char *c; + + printf(" Magic : %08x\n", sb->md_magic); + printf(" Version : %d.%02d.%02d\n", + sb->major_version, sb->minor_version, sb->patch_version); + if (sb->minor_version >= 90) { + printf(" UUID : %08x:%08x:%08x:%08x", sb->set_uuid0, + sb->set_uuid1, sb->set_uuid2, sb->set_uuid3); + if (homehost) { + char buf[20]; + void *hash; + + hash = sha1_buffer(homehost, strlen(homehost), buf); + if (memcmp(&sb->set_uuid2, hash, 8) == 0) + printf(" (local to host %s)", homehost); + } + printf("\n"); + } else + printf(" UUID : %08x\n", sb->set_uuid0); + + if (sb->not_persistent) + printf(" Eedk : not persistent\n"); + + atime = sb->ctime; + printf(" Creation Time : %.24s\n", ctime(&atime)); + c = map_num(pers, sb->level); + printf(" Raid Level : %s\n", c?c:"-unknown-"); + if ((int)sb->level > 0) { + int ddsks = 0, ddsks_denom = 1; + printf(" Used Dev Size : %d%s\n", sb->size, + human_size((long long)sb->size<<10)); + switch(sb->level) { + case 1: + ddsks=1; + break; + case 4: + case 5: + ddsks = sb->raid_disks - 1; + break; + case 6: + ddsks = sb->raid_disks - 2; + break; + case 10: + ddsks = sb->raid_disks; + ddsks_denom = + (sb->layout & 255) * ((sb->layout >> 8) & 255); + } + if (ddsks) { + long long asize = sb->size; + asize = (asize << 10) * ddsks / ddsks_denom; + printf(" Array Size : %llu%s\n", + asize >> 10, human_size(asize)); + } + } + printf(" Raid Devices : %d\n", sb->raid_disks); + 
printf(" Total Devices : %d\n", sb->nr_disks); + printf("Preferred Minor : %d\n", sb->md_minor); + printf("\n"); + if (sb->minor_version > 90 && (sb->reshape_position + 1) != 0) { + printf(" Reshape pos'n : %llu%s\n", + (unsigned long long)sb->reshape_position / 2, + human_size((long long)sb->reshape_position << 9)); + if (sb->delta_disks) { + printf(" Delta Devices : %d", sb->delta_disks); + printf(" (%d->%d)\n", sb->raid_disks-sb->delta_disks, + sb->raid_disks); + if (((int)sb->delta_disks) < 0) + delta_extra = - sb->delta_disks; + } + if (sb->new_level != sb->level) { + c = map_num(pers, sb->new_level); + printf(" New Level : %s\n", c?c:"-unknown-"); + } + if (sb->new_layout != sb->layout) { + if (sb->level == 5) { + c = map_num(r5layout, sb->new_layout); + printf(" New Layout : %s\n", + c?c:"-unknown-"); + } + if (sb->level == 6) { + c = map_num(r6layout, sb->new_layout); + printf(" New Layout : %s\n", + c?c:"-unknown-"); + } + if (sb->level == 10) { + printf(" New Layout : near=%d, %s=%d\n", + sb->new_layout&255, + (sb->new_layout&0x10000)?"offset":"far", + (sb->new_layout>>8)&255); + } + } + if (sb->new_chunk != sb->chunk_size) + printf(" New Chunksize : %d\n", sb->new_chunk); + printf("\n"); + } + atime = sb->utime; + printf(" Update Time : %.24s\n", ctime(&atime)); + printf(" State : %s\n", + (sb->state&(1 << MD_SB_CLEAN)) ? 
"clean":"active"); + if (sb->state & (1 << MD_SB_BITMAP_PRESENT)) + printf("Internal Bitmap : present\n"); + printf(" Active Devices : %d\n", sb->active_disks); + printf("Working Devices : %d\n", sb->working_disks); + printf(" Failed Devices : %d\n", sb->failed_disks); + printf(" Spare Devices : %d\n", sb->spare_disks); + if (calc_sb0_csum(sb) == sb->sb_csum) + printf(" Checksum : %x - correct\n", sb->sb_csum); + else + printf(" Checksum : %x - expected %lx\n", + sb->sb_csum, calc_sb0_csum(sb)); + printf(" Events : %llu\n", + ((unsigned long long)sb->events_hi << 32) + sb->events_lo); + printf("\n"); + if (sb->level == 5) { + c = map_num(r5layout, sb->layout); + printf(" Layout : %s\n", c?c:"-unknown-"); + } + if (sb->level == 6) { + c = map_num(r6layout, sb->layout); + printf(" Layout : %s\n", c?c:"-unknown-"); + } + if (sb->level == 10) { + printf(" Layout :"); + print_r10_layout(sb->layout); + printf("\n"); + } + switch(sb->level) { + case 0: + case 4: + case 5: + case 6: + case 10: + printf(" Chunk Size : %dK\n", sb->chunk_size / 1024); + break; + case -1: + printf(" Rounding : %dK\n", sb->chunk_size / 1024); + break; + default: + break; + } + printf("\n"); + printf(" Number Major Minor RaidDevice State\n"); + for (d = -1; + d < (signed int)(sb->raid_disks + delta_extra + sb->spare_disks); + d++) { + mdp_disk_t *dp; + char *dv; + char nb[11]; + int wonly, failfast; + if (d>=0) dp = &sb->disks[d]; + else dp = &sb->this_disk; + snprintf(nb, sizeof(nb), "%4d", d); + printf("%4s %5d %5d %5d %5d ", d < 0 ? 
"this" : nb, + dp->number, dp->major, dp->minor, dp->raid_disk); + wonly = dp->state & (1 << MD_DISK_WRITEMOSTLY); + failfast = dp->state & (1<<MD_DISK_FAILFAST); + dp->state &= ~(wonly | failfast); + if (dp->state & (1 << MD_DISK_FAULTY)) + printf(" faulty"); + if (dp->state & (1 << MD_DISK_ACTIVE)) + printf(" active"); + if (dp->state & (1 << MD_DISK_SYNC)) + printf(" sync"); + if (dp->state & (1 << MD_DISK_REMOVED)) + printf(" removed"); + if (wonly) + printf(" write-mostly"); + if (failfast) + printf(" failfast"); + if (dp->state == 0) + printf(" spare"); + if ((dv = map_dev(dp->major, dp->minor, 0))) + printf(" %s", dv); + printf("\n"); + if (d == -1) + printf("\n"); + } +} + +static void brief_examine_super0(struct supertype *st, int verbose) +{ + mdp_super_t *sb = st->sb; + char *c=map_num(pers, sb->level); + char devname[20]; + + sprintf(devname, "/dev/md%d", sb->md_minor); + + if (verbose) { + printf("ARRAY %s level=%s num-devices=%d", + devname, + c?c:"-unknown-", sb->raid_disks); + } else + printf("ARRAY %s", devname); + + if (sb->minor_version >= 90) + printf(" UUID=%08x:%08x:%08x:%08x", sb->set_uuid0, sb->set_uuid1, + sb->set_uuid2, sb->set_uuid3); + else + printf(" UUID=%08x", sb->set_uuid0); + printf("\n"); +} + +static void export_examine_super0(struct supertype *st) +{ + mdp_super_t *sb = st->sb; + + printf("MD_LEVEL=%s\n", map_num(pers, sb->level)); + printf("MD_DEVICES=%d\n", sb->raid_disks); + if (sb->minor_version >= 90) + printf("MD_UUID=%08x:%08x:%08x:%08x\n", + sb->set_uuid0, sb->set_uuid1, + sb->set_uuid2, sb->set_uuid3); + else + printf("MD_UUID=%08x\n", sb->set_uuid0); + printf("MD_UPDATE_TIME=%llu\n", + __le64_to_cpu(sb->ctime) & 0xFFFFFFFFFFULL); + printf("MD_EVENTS=%llu\n", + ((unsigned long long)sb->events_hi << 32) + + sb->events_lo); +} + +static int copy_metadata0(struct supertype *st, int from, int to) +{ + /* Read 64K from the appropriate offset of 'from' + * and if it looks a little like a 0.90 superblock, + * write it to the 
same offset of 'to' + */ + void *buf; + unsigned long long dsize, offset; + const int bufsize = 64*1024; + mdp_super_t *super; + + if (posix_memalign(&buf, 4096, bufsize) != 0) + return 1; + + if (!get_dev_size(from, NULL, &dsize)) + goto err; + + if (dsize < MD_RESERVED_SECTORS*512) + goto err; + + offset = MD_NEW_SIZE_SECTORS(dsize>>9); + + offset *= 512; + + if (lseek64(from, offset, 0) < 0LL) + goto err; + if (read(from, buf, bufsize) != bufsize) + goto err; + + if (lseek64(to, offset, 0) < 0LL) + goto err; + super = buf; + if (super->md_magic != MD_SB_MAGIC || + super->major_version != 0 || + calc_sb0_csum(super) != super->sb_csum) + goto err; + if (write(to, buf, bufsize) != bufsize) + goto err; + free(buf); + return 0; +err: + free(buf); + return 1; +} + +static void detail_super0(struct supertype *st, char *homehost, char *subarray) +{ + mdp_super_t *sb = st->sb; + printf(" UUID : "); + if (sb->minor_version >= 90) + printf("%08x:%08x:%08x:%08x", sb->set_uuid0, sb->set_uuid1, + sb->set_uuid2, sb->set_uuid3); + else + printf("%08x", sb->set_uuid0); + if (homehost) { + char buf[20]; + void *hash = sha1_buffer(homehost, + strlen(homehost), + buf); + if (memcmp(&sb->set_uuid2, hash, 8)==0) + printf(" (local to host %s)", homehost); + } + printf("\n Events : %d.%d\n\n", sb->events_hi, sb->events_lo); +} + +static void brief_detail_super0(struct supertype *st, char *subarray) +{ + mdp_super_t *sb = st->sb; + printf(" UUID="); + if (sb->minor_version >= 90) + printf("%08x:%08x:%08x:%08x", sb->set_uuid0, sb->set_uuid1, + sb->set_uuid2, sb->set_uuid3); + else + printf("%08x", sb->set_uuid0); +} + +static int match_home0(struct supertype *st, char *homehost) +{ + mdp_super_t *sb = st->sb; + char buf[20]; + char *hash; + + if (!homehost) + return 0; + hash = sha1_buffer(homehost, + strlen(homehost), + buf); + + return (memcmp(&sb->set_uuid2, hash, 8)==0); +} + +static void uuid_from_super0(struct supertype *st, int uuid[4]) +{ + mdp_super_t *super = st->sb; + uuid[0] 
= super->set_uuid0; + if (super->minor_version >= 90) { + uuid[1] = super->set_uuid1; + uuid[2] = super->set_uuid2; + uuid[3] = super->set_uuid3; + } else { + uuid[1] = 0; + uuid[2] = 0; + uuid[3] = 0; + } +} + +static void getinfo_super0(struct supertype *st, struct mdinfo *info, char *map) +{ + mdp_super_t *sb = st->sb; + int working = 0; + int i; + int map_disks = info->array.raid_disks; + + memset(info, 0, sizeof(*info)); + info->array.major_version = sb->major_version; + info->array.minor_version = sb->minor_version; + info->array.patch_version = sb->patch_version; + info->array.raid_disks = sb->raid_disks; + info->array.level = sb->level; + info->array.layout = sb->layout; + info->array.md_minor = sb->md_minor; + info->array.ctime = sb->ctime; + info->array.utime = sb->utime; + info->array.chunk_size = sb->chunk_size; + info->array.state = sb->state; + info->component_size = sb->size; + info->component_size *= 2; + + if (sb->state & (1<<MD_SB_BITMAP_PRESENT)) + info->bitmap_offset = 8; + + info->disk.state = sb->this_disk.state; + info->disk.major = sb->this_disk.major; + info->disk.minor = sb->this_disk.minor; + info->disk.raid_disk = sb->this_disk.raid_disk; + info->disk.number = sb->this_disk.number; + + info->events = md_event(sb); + info->data_offset = 0; + + sprintf(info->text_version, "0.%d", sb->minor_version); + info->safe_mode_delay = 200; + + uuid_from_super0(st, info->uuid); + + info->recovery_start = MaxSector; + if (sb->minor_version > 90 && (sb->reshape_position+1) != 0) { + info->reshape_active = 1; + info->reshape_progress = sb->reshape_position; + info->new_level = sb->new_level; + info->delta_disks = sb->delta_disks; + info->new_layout = sb->new_layout; + info->new_chunk = sb->new_chunk; + if (info->delta_disks < 0) + info->array.raid_disks -= info->delta_disks; + } else + info->reshape_active = 0; + + info->recovery_blocked = info->reshape_active; + + sprintf(info->name, "%d", sb->md_minor); + /* work_disks is calculated rather than read 
directly */ + for (i=0; i < MD_SB_DISKS; i++) + if ((sb->disks[i].state & (1<<MD_DISK_SYNC)) && + (sb->disks[i].raid_disk < (unsigned)info->array.raid_disks) && + (sb->disks[i].state & (1<<MD_DISK_ACTIVE)) && + !(sb->disks[i].state & (1<<MD_DISK_FAULTY))) { + working ++; + if (map && i < map_disks) + map[i] = 1; + } else if (map && i < map_disks) + map[i] = 0; + info->array.working_disks = working; +} + +static struct mdinfo *container_content0(struct supertype *st, char *subarray) +{ + struct mdinfo *info; + + if (subarray) + return NULL; + + info = xmalloc(sizeof(*info)); + getinfo_super0(st, info, NULL); + return info; +} + +static int update_super0(struct supertype *st, struct mdinfo *info, + char *update, + char *devname, int verbose, + int uuid_set, char *homehost) +{ + /* NOTE: for 'assemble' and 'force' we need to return non-zero + * if any change was made. For others, the return value is + * ignored. + */ + int rv = 0; + int uuid[4]; + mdp_super_t *sb = st->sb; + + if (strcmp(update, "homehost") == 0 && + homehost) { + /* note that 'homehost' is special as it is really + * a "uuid" update. 
+ */ + uuid_set = 0; + update = "uuid"; + info->uuid[0] = sb->set_uuid0; + info->uuid[1] = sb->set_uuid1; + } + + if (strcmp(update, "sparc2.2")==0 ) { + /* 2.2 sparc put the events in the wrong place + * So we copy the tail of the superblock + * up 4 bytes before continuing + */ + __u32 *sb32 = (__u32*)sb; + + memmove(sb32+MD_SB_GENERIC_CONSTANT_WORDS+7, + sb32+MD_SB_GENERIC_CONSTANT_WORDS+7+1, + (MD_SB_WORDS - (MD_SB_GENERIC_CONSTANT_WORDS+7+1))*4); + if (verbose >= 0) + pr_err("adjusting superblock of %s for 2.2/sparc compatibility.\n", + devname); + } else if (strcmp(update, "super-minor") ==0) { + sb->md_minor = info->array.md_minor; + if (verbose > 0) + pr_err("updating superblock of %s with minor number %d\n", + devname, info->array.md_minor); + } else if (strcmp(update, "summaries") == 0) { + unsigned int i; + /* set nr_disks, active_disks, working_disks, + * failed_disks, spare_disks based on disks[] + * array in superblock. + * Also make sure extra slots aren't 'failed' + */ + sb->nr_disks = sb->active_disks = + sb->working_disks = sb->failed_disks = + sb->spare_disks = 0; + for (i=0; i < MD_SB_DISKS ; i++) + if (sb->disks[i].major || + sb->disks[i].minor) { + int state = sb->disks[i].state; + if (state & (1<<MD_DISK_REMOVED)) + continue; + sb->nr_disks++; + if (state & (1<<MD_DISK_ACTIVE)) + sb->active_disks++; + if (state & (1<<MD_DISK_FAULTY)) + sb->failed_disks++; + else + sb->working_disks++; + if (state == 0) + sb->spare_disks++; + } else if (i >= sb->raid_disks && sb->disks[i].number == 0) + sb->disks[i].state = 0; + } else if (strcmp(update, "force-one")==0) { + /* Not enough devices for a working array, so + * bring this one up-to-date. 
+ */ + __u32 ehi = sb->events_hi, elo = sb->events_lo; + sb->events_hi = (info->events>>32) & 0xFFFFFFFF; + sb->events_lo = (info->events) & 0xFFFFFFFF; + if (sb->events_hi != ehi || + sb->events_lo != elo) + rv = 1; + } else if (strcmp(update, "force-array")==0) { + /* degraded array and 'force' requested, so + * maybe need to mark it 'clean' + */ + if ((sb->level == 5 || sb->level == 4 || sb->level == 6) && + (sb->state & (1 << MD_SB_CLEAN)) == 0) { + /* need to force clean */ + sb->state |= (1 << MD_SB_CLEAN); + rv = 1; + } + } else if (strcmp(update, "assemble")==0) { + int d = info->disk.number; + int wonly = sb->disks[d].state & (1<<MD_DISK_WRITEMOSTLY); + int failfast = sb->disks[d].state & (1<<MD_DISK_FAILFAST); + int mask = (1<<MD_DISK_WRITEMOSTLY)|(1<<MD_DISK_FAILFAST); + int add = 0; + if (sb->minor_version >= 91) + /* During reshape we don't insist on everything + * being marked 'sync' + */ + add = (1<<MD_DISK_SYNC); + if (((sb->disks[d].state & ~mask) | add) != + (unsigned)info->disk.state) { + sb->disks[d].state = info->disk.state | wonly |failfast; + rv = 1; + } + if (info->reshape_active && + sb->minor_version > 90 && (sb->reshape_position+1) != 0 && + info->delta_disks >= 0 && + info->reshape_progress < sb->reshape_position) { + sb->reshape_position = info->reshape_progress; + rv = 1; + } + if (info->reshape_active && + sb->minor_version > 90 && (sb->reshape_position+1) != 0 && + info->delta_disks < 0 && + info->reshape_progress > sb->reshape_position) { + sb->reshape_position = info->reshape_progress; + rv = 1; + } + } else if (strcmp(update, "linear-grow-new") == 0) { + memset(&sb->disks[info->disk.number], 0, sizeof(sb->disks[0])); + sb->disks[info->disk.number].number = info->disk.number; + sb->disks[info->disk.number].major = info->disk.major; + sb->disks[info->disk.number].minor = info->disk.minor; + sb->disks[info->disk.number].raid_disk = info->disk.raid_disk; + sb->disks[info->disk.number].state = info->disk.state; + sb->this_disk = 
sb->disks[info->disk.number]; + } else if (strcmp(update, "linear-grow-update") == 0) { + sb->raid_disks = info->array.raid_disks; + sb->nr_disks = info->array.nr_disks; + sb->active_disks = info->array.active_disks; + sb->working_disks = info->array.working_disks; + memset(&sb->disks[info->disk.number], 0, sizeof(sb->disks[0])); + sb->disks[info->disk.number].number = info->disk.number; + sb->disks[info->disk.number].major = info->disk.major; + sb->disks[info->disk.number].minor = info->disk.minor; + sb->disks[info->disk.number].raid_disk = info->disk.raid_disk; + sb->disks[info->disk.number].state = info->disk.state; + } else if (strcmp(update, "resync") == 0) { + /* make sure resync happens */ + sb->state &= ~(1<<MD_SB_CLEAN); + sb->recovery_cp = 0; + } else if (strcmp(update, "uuid") == 0) { + if (!uuid_set && homehost) { + char buf[20]; + char *hash = sha1_buffer(homehost, + strlen(homehost), + buf); + memcpy(info->uuid+2, hash, 8); + } + sb->set_uuid0 = info->uuid[0]; + sb->set_uuid1 = info->uuid[1]; + sb->set_uuid2 = info->uuid[2]; + sb->set_uuid3 = info->uuid[3]; + if (sb->state & (1<<MD_SB_BITMAP_PRESENT)) { + struct bitmap_super_s *bm; + bm = (struct bitmap_super_s*)(sb+1); + uuid_from_super0(st, uuid); + memcpy(bm->uuid, uuid, 16); + } + } else if (strcmp(update, "metadata") == 0) { + /* Create some v1.0 metadata to match ours but make the + * ctime bigger. Also update info->array.*_version. + * We need to arrange that store_super writes out + * the v1.0 metadata. + * Not permitted for unclean array, or array with + * bitmap. 
+ */ + if (info->bitmap_offset) { + pr_err("Cannot update metadata when bitmap is present\n"); + rv = -2; + } else if (info->array.state != 1) { + pr_err("Cannot update metadata on unclean array\n"); + rv = -2; + } else { + info->array.major_version = 1; + info->array.minor_version = 0; + uuid_from_super0(st, info->uuid); + st->other = super1_make_v0(st, info, st->sb); + } + } else if (strcmp(update, "revert-reshape") == 0) { + rv = -2; + if (sb->minor_version <= 90) + pr_err("No active reshape to revert on %s\n", + devname); + else if (sb->delta_disks == 0) + pr_err("%s: Can only revert reshape which changes number of devices\n", + devname); + else { + int tmp; + int parity = sb->level == 6 ? 2 : 1; + rv = 0; + + if (sb->level >= 4 && sb->level <= 6 && + sb->reshape_position % ( + sb->new_chunk/512 * + (sb->raid_disks - sb->delta_disks - parity))) { + pr_err("Reshape position is not suitably aligned.\n"); + pr_err("Try normal assembly and stop again\n"); + return -2; + } + sb->raid_disks -= sb->delta_disks; + sb->delta_disks = -sb->delta_disks; + + tmp = sb->new_layout; + sb->new_layout = sb->layout; + sb->layout = tmp; + + tmp = sb->new_chunk; + sb->new_chunk = sb->chunk_size; + sb->chunk_size = tmp; + } + } else if (strcmp(update, "no-bitmap") == 0) { + sb->state &= ~(1<<MD_SB_BITMAP_PRESENT); + } else if (strcmp(update, "_reshape_progress")==0) + sb->reshape_position = info->reshape_progress; + else if (strcmp(update, "writemostly")==0) + sb->state |= (1<<MD_DISK_WRITEMOSTLY); + else if (strcmp(update, "readwrite")==0) + sb->state &= ~(1<<MD_DISK_WRITEMOSTLY); + else + rv = -1; + + sb->sb_csum = calc_sb0_csum(sb); + return rv; +} + +/* + * For version-0 superblock, the homehost is 'stored' in the uuid. + * 8 bytes for a hash of the host leaving 8 bytes of random material. 
+ * We use the first 8 bytes (64bits) of the sha1 of the host name + */ +static int init_super0(struct supertype *st, mdu_array_info_t *info, + struct shape *s, char *ignored_name, + char *homehost, int *uuid, + unsigned long long data_offset) +{ + mdp_super_t *sb; + int spares; + + if (data_offset != INVALID_SECTORS) { + pr_err("data-offset not support for 0.90\n"); + return 0; + } + + if (posix_memalign((void**)&sb, 4096, + MD_SB_BYTES + ROUND_UP(sizeof(bitmap_super_t), 4096)) != 0) { + pr_err("could not allocate superblock\n"); + return 0; + } + memset(sb, 0, MD_SB_BYTES + sizeof(bitmap_super_t)); + + st->sb = sb; + if (info == NULL) { + /* zeroing the superblock */ + return 0; + } + + spares = info->working_disks - info->active_disks; + if (info->raid_disks + spares > MD_SB_DISKS) { + pr_err("too many devices requested: %d+%d > %d\n", + info->raid_disks , spares, MD_SB_DISKS); + return 0; + } + + sb->md_magic = MD_SB_MAGIC; + sb->major_version = 0; + sb->minor_version = 90; + sb->patch_version = 0; + sb->gvalid_words = 0; /* ignored */ + sb->ctime = time(0); + sb->level = info->level; + sb->size = s->size; + if (s->size != (unsigned long long)sb->size) + return 0; + sb->nr_disks = info->nr_disks; + sb->raid_disks = info->raid_disks; + sb->md_minor = info->md_minor; + sb->not_persistent = 0; + if (uuid) { + sb->set_uuid0 = uuid[0]; + sb->set_uuid1 = uuid[1]; + sb->set_uuid2 = uuid[2]; + sb->set_uuid3 = uuid[3]; + } else { + __u32 r[4]; + random_uuid((__u8 *)r); + sb->set_uuid0 = r[0]; + sb->set_uuid1 = r[1]; + sb->set_uuid2 = r[2]; + sb->set_uuid3 = r[3]; + } + if (homehost && !uuid) { + char buf[20]; + char *hash = sha1_buffer(homehost, + strlen(homehost), + buf); + memcpy(&sb->set_uuid2, hash, 8); + } + + sb->utime = sb->ctime; + sb->state = info->state; + sb->active_disks = info->active_disks; + sb->working_disks = info->working_disks; + sb->failed_disks = info->failed_disks; + sb->spare_disks = info->spare_disks; + sb->events_hi = 0; + sb->events_lo = 1; + + 
sb->layout = info->layout; + sb->chunk_size = info->chunk_size; + + return 1; +} + +struct devinfo { + int fd; + char *devname; + mdu_disk_info_t disk; + struct devinfo *next; +}; + +/* Add a device to the superblock being created */ +static int add_to_super0(struct supertype *st, mdu_disk_info_t *dinfo, + int fd, char *devname, unsigned long long data_offset) +{ + mdp_super_t *sb = st->sb; + mdp_disk_t *dk = &sb->disks[dinfo->number]; + struct devinfo *di, **dip; + + dk->number = dinfo->number; + dk->major = dinfo->major; + dk->minor = dinfo->minor; + dk->raid_disk = dinfo->raid_disk; + dk->state = dinfo->state & ((1<<MD_DISK_ACTIVE) | + (1<<MD_DISK_SYNC)); + + sb->this_disk = sb->disks[dinfo->number]; + sb->sb_csum = calc_sb0_csum(sb); + + dip = (struct devinfo **)&st->info; + while (*dip) + dip = &(*dip)->next; + di = xmalloc(sizeof(struct devinfo)); + di->fd = fd; + di->devname = devname; + di->disk = *dinfo; + di->next = NULL; + *dip = di; + + return 0; +} + +static int store_super0(struct supertype *st, int fd) +{ + unsigned long long dsize; + unsigned long long offset; + mdp_super_t *super = st->sb; + + if (!get_dev_size(fd, NULL, &dsize)) + return 1; + + if (dsize < MD_RESERVED_SECTORS*512) + return 2; + + if (st->other) { + /* Writing out v1.0 metadata for --update=metadata */ + int ret = 0; + + offset = dsize/512 - 8*2; + offset &= ~(4*2-1); + offset *= 512; + if (lseek64(fd, offset, 0)< 0LL) + ret = 3; + else if (write(fd, st->other, 1024) != 1024) + ret = 4; + else + fsync(fd); + free(st->other); + st->other = NULL; + return ret; + } + + offset = MD_NEW_SIZE_SECTORS(dsize>>9); + + offset *= 512; + + if (lseek64(fd, offset, 0)< 0LL) + return 3; + + if (write(fd, super, sizeof(*super)) != sizeof(*super)) + return 4; + + if (super->state & (1<<MD_SB_BITMAP_PRESENT)) { + struct bitmap_super_s * bm = (struct bitmap_super_s*)(super+1); + if (__le32_to_cpu(bm->magic) == BITMAP_MAGIC) + if (write(fd, bm, ROUND_UP(sizeof(*bm),4096)) != + 
ROUND_UP(sizeof(*bm),4096)) + return 5; + } + + fsync(fd); + return 0; +} + +static int write_init_super0(struct supertype *st) +{ + mdp_super_t *sb = st->sb; + int rv = 0; + struct devinfo *di; + + for (di = st->info ; di && ! rv ; di = di->next) { + + if (di->disk.state & (1 << MD_DISK_FAULTY)) + continue; + if (di->fd == -1) + continue; + while (Kill(di->devname, NULL, 0, -1, 1) == 0) + ; + + sb->disks[di->disk.number].state &= ~(1<<MD_DISK_FAULTY); + + sb->this_disk = sb->disks[di->disk.number]; + sb->sb_csum = calc_sb0_csum(sb); + rv = store_super0(st, di->fd); + + if (rv == 0 && (sb->state & (1<<MD_SB_BITMAP_PRESENT))) + rv = st->ss->write_bitmap(st, di->fd, NoUpdate); + + if (rv) + pr_err("failed to write superblock to %s\n", + di->devname); + } + return rv; +} + +static int compare_super0(struct supertype *st, struct supertype *tst, + int verbose) +{ + /* + * return: + * 0 same, or first was empty, and second was copied + * 1 second had wrong number + * 2 wrong uuid + * 3 wrong other info + */ + mdp_super_t *first = st->sb; + mdp_super_t *second = tst->sb; + int uuid1[4], uuid2[4]; + + if (second->md_magic != MD_SB_MAGIC) + return 1; + if (!first) { + if (posix_memalign((void**)&first, 4096, + MD_SB_BYTES + + ROUND_UP(sizeof(struct bitmap_super_s), 4096)) != 0) { + pr_err("could not allocate superblock\n"); + return 1; + } + memcpy(first, second, MD_SB_BYTES + sizeof(struct bitmap_super_s)); + st->sb = first; + return 0; + } + + uuid_from_super0(st, uuid1); + uuid_from_super0(tst, uuid2); + if (!same_uuid(uuid1, uuid2, 0)) + return 2; + if (first->major_version != second->major_version || + first->minor_version != second->minor_version || + first->patch_version != second->patch_version || + first->gvalid_words != second->gvalid_words || + first->ctime != second->ctime || + first->level != second->level || + first->size != second->size || + first->raid_disks != second->raid_disks ) + return 3; + + return 0; +} + +static void free_super0(struct supertype 
*st); + +static int load_super0(struct supertype *st, int fd, char *devname) +{ + /* try to read in the superblock + * Return: + * 0 on success + * 1 on cannot get superblock + * 2 on superblock meaningless + */ + unsigned long long dsize; + unsigned long long offset; + mdp_super_t *super; + int uuid[4]; + struct bitmap_super_s *bsb; + + free_super0(st); + + if (!get_dev_size(fd, devname, &dsize)) + return 1; + + if (dsize < MD_RESERVED_SECTORS*512) { + if (devname) + pr_err("%s is too small for md: size is %llu sectors.\n", + devname, dsize); + return 1; + } + st->devsize = dsize; + + offset = MD_NEW_SIZE_SECTORS(dsize>>9); + + offset *= 512; + + if (lseek64(fd, offset, 0)< 0LL) { + if (devname) + pr_err("Cannot seek to superblock on %s: %s\n", + devname, strerror(errno)); + return 1; + } + + if (posix_memalign((void**)&super, 4096, + MD_SB_BYTES + + ROUND_UP(sizeof(bitmap_super_t), 4096)) != 0) { + pr_err("could not allocate superblock\n"); + return 1; + } + + if (read(fd, super, sizeof(*super)) != MD_SB_BYTES) { + if (devname) + pr_err("Cannot read superblock on %s\n", + devname); + free(super); + return 1; + } + + if (st->ss && st->minor_version == 9) + super0_swap_endian(super); + + if (super->md_magic != MD_SB_MAGIC) { + if (devname) + pr_err("No super block found on %s (Expected magic %08x, got %08x)\n", + devname, MD_SB_MAGIC, super->md_magic); + free(super); + return 2; + } + + if (super->major_version != 0) { + if (devname) + pr_err("Cannot interpret superblock on %s - version is %d\n", + devname, super->major_version); + free(super); + return 2; + } + st->sb = super; + + if (st->ss == NULL) { + st->ss = &super0; + st->minor_version = super->minor_version; + st->max_devs = MD_SB_DISKS; + st->info = NULL; + } + + /* Now check on the bitmap superblock */ + if ((super->state & (1<<MD_SB_BITMAP_PRESENT)) == 0) + return 0; + /* Read the bitmap superblock and make sure it looks + * valid. If it doesn't clear the bit. 
An --assemble --force + * should get that written out. + */ + if (read(fd, super+1, ROUND_UP(sizeof(struct bitmap_super_s),4096)) != + ROUND_UP(sizeof(struct bitmap_super_s), 4096)) + goto no_bitmap; + + uuid_from_super0(st, uuid); + bsb = (struct bitmap_super_s *)(super+1); + if (__le32_to_cpu(bsb->magic) != BITMAP_MAGIC || + memcmp(bsb->uuid, uuid, 16) != 0) + goto no_bitmap; + return 0; + + no_bitmap: + super->state &= ~(1<<MD_SB_BITMAP_PRESENT); + + return 0; +} + +static struct supertype *match_metadata_desc0(char *arg) +{ + struct supertype *st = xcalloc(1, sizeof(*st)); + + st->container_devnm[0] = 0; + st->ss = &super0; + st->info = NULL; + st->minor_version = 90; + st->max_devs = MD_SB_DISKS; + st->sb = NULL; + /* we sometimes get 00.90 */ + while (arg[0] == '0' && arg[1] == '0') + arg++; + if (strcmp(arg, "0") == 0 || +#ifdef DEFAULT_OLD_METADATA /* ifndef in super1.c */ + strcmp(arg, "default") == 0 || +#endif /* DEFAULT_OLD_METADATA */ + strcmp(arg, "0.90") == 0 || + strcmp(arg, "") == 0 /* no metadata - i.e. non_persistent */ + ) + return st; + + st->minor_version = 91; /* reshape in progress */ + if (strcmp(arg, "0.91") == 0) /* For dup_super support */ + return st; + + st->minor_version = 9; /* flag for 'byte-swapped' */ + if (strcmp(arg, "0.swap")==0 || + strcmp(arg, "0.9") == 0) /* For dup_super support */ + return st; + + free(st); + return NULL; +} + +static __u64 avail_size0(struct supertype *st, __u64 devsize, + unsigned long long data_offset) +{ + if (data_offset != 0 && data_offset != INVALID_SECTORS) + return 0ULL; + if (devsize < MD_RESERVED_SECTORS) + return 0ULL; + return MD_NEW_SIZE_SECTORS(devsize); +} + +static int add_internal_bitmap0(struct supertype *st, int *chunkp, + int delay, int write_behind, + unsigned long long size, int may_change, + int major) +{ + /* + * The bitmap comes immediately after the superblock and must be 60K in size + * at most. 
The default size is between 30K and 60K + * + * size is in sectors, chunk is in bytes !!! + */ + unsigned long long bits; + unsigned long long max_bits = (60*1024 - sizeof(bitmap_super_t))*8; + unsigned long long min_chunk; + int chunk = *chunkp; + mdp_super_t *sb = st->sb; + bitmap_super_t *bms = (bitmap_super_t*)(((char*)sb) + MD_SB_BYTES); + int uuid[4]; + + min_chunk = 4096; /* sub-page chunks don't work yet.. */ + bits = (size * 512) / min_chunk + 1; + while (bits > max_bits) { + min_chunk *= 2; + bits = (bits+1)/2; + } + if (chunk == UnSet) { + /* A chunk size less than a few Megabytes gives poor + * performance without increasing resync noticeably + */ + chunk = min_chunk; + if (chunk < 64*1024*1024) + chunk = 64*1024*1024; + } else if ((unsigned long long)chunk < min_chunk) + return -EINVAL; /* chunk size too small */ + + sb->state |= (1<<MD_SB_BITMAP_PRESENT); + + memset(bms, 0, sizeof(*bms)); + bms->magic = __cpu_to_le32(BITMAP_MAGIC); + bms->version = __cpu_to_le32(major); + uuid_from_super0(st, uuid); + memcpy(bms->uuid, uuid, 16); + bms->chunksize = __cpu_to_le32(chunk); + bms->daemon_sleep = __cpu_to_le32(delay); + bms->sync_size = __cpu_to_le64(size); + bms->write_behind = __cpu_to_le32(write_behind); + *chunkp = chunk; + return 0; +} + +static int locate_bitmap0(struct supertype *st, int fd, int node_num) +{ + unsigned long long dsize; + unsigned long long offset; + + if (!get_dev_size(fd, NULL, &dsize)) + return -1; + + if (dsize < MD_RESERVED_SECTORS*512) + return -1; + + offset = MD_NEW_SIZE_SECTORS(dsize>>9); + + offset *= 512; + + offset += MD_SB_BYTES; + + lseek64(fd, offset, 0); + return 0; +} + +static int write_bitmap0(struct supertype *st, int fd, enum bitmap_update update) +{ + unsigned long long dsize; + unsigned long long offset; + mdp_super_t *sb = st->sb; + + int rv = 0; + + int towrite, n; + void *buf; + + if (!get_dev_size(fd, NULL, &dsize)) + return 1; + + if (dsize < MD_RESERVED_SECTORS*512) + return -1; + + offset = 
MD_NEW_SIZE_SECTORS(dsize>>9); + + offset *= 512; + + if (lseek64(fd, offset + 4096, 0)< 0LL) + return 3; + + if (posix_memalign(&buf, 4096, 4096)) + return -ENOMEM; + + memset(buf, 0xff, 4096); + memcpy(buf, ((char*)sb)+MD_SB_BYTES, sizeof(bitmap_super_t)); + towrite = 60*1024; + while (towrite > 0) { + n = towrite; + if (n > 4096) + n = 4096; + n = write(fd, buf, n); + if (n > 0) + towrite -= n; + else + break; + memset(buf, 0xff, 4096); + } + fsync(fd); + if (towrite) + rv = -2; + + free(buf); + return rv; +} + +static void free_super0(struct supertype *st) +{ + if (st->sb) + free(st->sb); + while (st->info) { + struct devinfo *di = st->info; + st->info = di->next; + if (di->fd >= 0) + close(di->fd); + free(di); + } + st->sb = NULL; +} + +static int validate_geometry0(struct supertype *st, int level, + int layout, int raiddisks, + int *chunk, unsigned long long size, + unsigned long long data_offset, + char *subdev, unsigned long long *freesize, + int consistency_policy, int verbose) +{ + unsigned long long ldsize; + int fd; + unsigned int tbmax = 4; + + /* prior to linux 3.1, a bug limits usable device size to 2TB. 
+ * It was introduced in 2.6.29, but we won't worry about that detail + */ + if (get_linux_version() < 3001000) + tbmax = 2; + + if (level == LEVEL_CONTAINER) { + if (verbose) + pr_err("0.90 metadata does not support containers\n"); + return 0; + } + if (raiddisks > MD_SB_DISKS) { + if (verbose) + pr_err("0.90 metadata supports at most %d devices per array\n", + MD_SB_DISKS); + return 0; + } + if (size >= tbmax * 2ULL*1024*1024*1024) { + if (verbose) + pr_err("0.90 metadata supports at most %d terabytes per device\n", tbmax); + return 0; + } + if (*chunk == UnSet) + *chunk = DEFAULT_CHUNK; + + if (level == 0 && layout != UnSet) { + if (verbose) + pr_err("0.90 metadata does not support layouts for RAID0\n"); + return 0; + } + + if (!subdev) + return 1; + + fd = open(subdev, O_RDONLY|O_EXCL, 0); + if (fd < 0) { + if (verbose) + pr_err("super0.90 cannot open %s: %s\n", + subdev, strerror(errno)); + return 0; + } + + if (!get_dev_size(fd, subdev, &ldsize)) { + close(fd); + return 0; + } + close(fd); + + if (ldsize < MD_RESERVED_SECTORS * 512) + return 0; + *freesize = MD_NEW_SIZE_SECTORS(ldsize >> 9); + return 1; +} + +struct superswitch super0 = { + .examine_super = examine_super0, + .brief_examine_super = brief_examine_super0, + .export_examine_super = export_examine_super0, + .detail_super = detail_super0, + .brief_detail_super = brief_detail_super0, + .write_init_super = write_init_super0, + .validate_geometry = validate_geometry0, + .add_to_super = add_to_super0, + .copy_metadata = copy_metadata0, + .match_home = match_home0, + .uuid_from_super = uuid_from_super0, + .getinfo_super = getinfo_super0, + .container_content = container_content0, + .update_super = update_super0, + .init_super = init_super0, + .store_super = store_super0, + .compare_super = compare_super0, + .load_super = load_super0, + .match_metadata_desc = match_metadata_desc0, + .avail_size = avail_size0, + .add_internal_bitmap = add_internal_bitmap0, + .locate_bitmap = locate_bitmap0, + 
.write_bitmap = write_bitmap0, + .free_super = free_super0, + .name = "0.90", +}; diff --git a/super1.c b/super1.c new file mode 100644 index 0000000..a12a5bc --- /dev/null +++ b/super1.c @@ -0,0 +1,2980 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2001-2016 Neil Brown <neilb@suse.com> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +#include <stddef.h> +#include "mdadm.h" +/* + * The version-1 superblock : + * All numeric fields are little-endian. + * + * total size: 256 bytes plus 2 per device. + * 1K allows 384 devices. + */ +struct mdp_superblock_1 { + /* constant array information - 128 bytes */ + __u32 magic; /* MD_SB_MAGIC: 0xa92b4efc - little endian */ + __u32 major_version; /* 1 */ + __u32 feature_map; /* 0 for now */ + __u32 pad0; /* always set to 0 when writing */ + + __u8 set_uuid[16]; /* user-space generated. 
*/ + char set_name[32]; /* set and interpreted by user-space */ + + __u64 ctime; /* lo 40 bits are seconds, top 24 are microseconds or 0*/ + __u32 level; /* -4 (multipath), -1 (linear), 0,1,4,5 */ + __u32 layout; /* used for raid5, raid6, raid10, and raid0 */ + __u64 size; /* used size of component devices, in 512byte sectors */ + + __u32 chunksize; /* in 512byte sectors */ + __u32 raid_disks; + union { + __u32 bitmap_offset; /* sectors after start of superblock that bitmap starts + * NOTE: signed, so bitmap can be before superblock + * only meaningful of feature_map[0] is set. + */ + + /* only meaningful when feature_map[MD_FEATURE_PPL] is set */ + struct { + __s16 offset; /* sectors from start of superblock that ppl starts */ + __u16 size; /* ppl size in sectors */ + } ppl; + }; + + /* These are only valid with feature bit '4' */ + __u32 new_level; /* new level we are reshaping to */ + __u64 reshape_position; /* next address in array-space for reshape */ + __u32 delta_disks; /* change in number of raid_disks */ + __u32 new_layout; /* new layout */ + __u32 new_chunk; /* new chunk size (sectors) */ + __u32 new_offset; /* signed number to add to data_offset in new + * layout. 0 == no-change. This can be + * different on each device in the array. + */ + + /* constant this-device information - 64 bytes */ + __u64 data_offset; /* sector start of data, often 0 */ + __u64 data_size; /* sectors in this device that can be used for data */ + __u64 super_offset; /* sector start of this superblock */ + union { + __u64 recovery_offset;/* sectors before this offset (from data_offset) have been recovered */ + __u64 journal_tail;/* journal tail of journal device (from data_offset) */ + }; + __u32 dev_number; /* permanent identifier of this device - not role in raid */ + __u32 cnt_corrected_read; /* number of read errors that were corrected by re-writing */ + __u8 device_uuid[16]; /* user-space setable, ignored by kernel */ + __u8 devflags; /* per-device flags. 
Only one defined...*/ +#define WriteMostly1 1 /* mask for writemostly flag in above */ +#define FailFast1 2 /* Device should get FailFast requests */ + /* bad block log. If there are any bad blocks the feature flag is set. + * if offset and size are non-zero, that space is reserved and available. + */ + __u8 bblog_shift; /* shift from sectors to block size for badblock list */ + __u16 bblog_size; /* number of sectors reserved for badblock list */ + __u32 bblog_offset; /* sector offset from superblock to bblog, signed */ + + /* array state information - 64 bytes */ + __u64 utime; /* 40 bits second, 24 bits microseconds */ + __u64 events; /* incremented when superblock updated */ + __u64 resync_offset; /* data before this offset (from data_offset) known to be in sync */ + __u32 sb_csum; /* checksum upto dev_roles[max_dev] */ + __u32 max_dev; /* size of dev_roles[] array to consider */ + __u8 pad3[64-32]; /* set to 0 when writing */ + + /* device state information. Indexed by dev_number. + * 2 bytes per device + * Note there are no per-device state flags. State information is rolled + * into the 'roles' value. If a device is spare or faulty, then it doesn't + * have a meaningful role. 
+ */ + __u16 dev_roles[0]; /* role in array, or 0xffff for a spare, or 0xfffe for faulty */ +}; + +#define MAX_SB_SIZE 4096 +/* bitmap super size is 256, but we round up to a sector for alignment */ +#define BM_SUPER_SIZE 512 +#define MAX_DEVS ((int)(MAX_SB_SIZE - sizeof(struct mdp_superblock_1)) / 2) +#define SUPER1_SIZE (MAX_SB_SIZE + BM_SUPER_SIZE \ + + sizeof(struct misc_dev_info)) + +struct misc_dev_info { + __u64 device_size; +}; + +#define MULTIPLE_PPL_AREA_SIZE_SUPER1 (1024 * 1024) /* Size of the whole + * mutliple PPL area + */ +/* feature_map bits */ +#define MD_FEATURE_BITMAP_OFFSET 1 +#define MD_FEATURE_RECOVERY_OFFSET 2 /* recovery_offset is present and + * must be honoured + */ +#define MD_FEATURE_RESHAPE_ACTIVE 4 +#define MD_FEATURE_BAD_BLOCKS 8 /* badblock list is not empty */ +#define MD_FEATURE_REPLACEMENT 16 /* This device is replacing an + * active device with same 'role'. + * 'recovery_offset' is also set. + */ +#define MD_FEATURE_RESHAPE_BACKWARDS 32 /* Reshape doesn't change number + * of devices, but is going + * backwards anyway. 
+ */ +#define MD_FEATURE_NEW_OFFSET 64 /* new_offset must be honoured */ +#define MD_FEATURE_BITMAP_VERSIONED 256 /* bitmap version number checked properly */ +#define MD_FEATURE_JOURNAL 512 /* support write journal */ +#define MD_FEATURE_PPL 1024 /* support PPL */ +#define MD_FEATURE_MUTLIPLE_PPLS 2048 /* support for multiple PPLs */ +#define MD_FEATURE_RAID0_LAYOUT 4096 /* layout is meaningful in RAID0 */ +#define MD_FEATURE_ALL (MD_FEATURE_BITMAP_OFFSET \ + |MD_FEATURE_RECOVERY_OFFSET \ + |MD_FEATURE_RESHAPE_ACTIVE \ + |MD_FEATURE_BAD_BLOCKS \ + |MD_FEATURE_REPLACEMENT \ + |MD_FEATURE_RESHAPE_BACKWARDS \ + |MD_FEATURE_NEW_OFFSET \ + |MD_FEATURE_BITMAP_VERSIONED \ + |MD_FEATURE_JOURNAL \ + |MD_FEATURE_PPL \ + |MD_FEATURE_MULTIPLE_PPLS \ + |MD_FEATURE_RAID0_LAYOUT \ + ) + +static int role_from_sb(struct mdp_superblock_1 *sb) +{ + unsigned int d; + int role; + + d = __le32_to_cpu(sb->dev_number); + if (d < __le32_to_cpu(sb->max_dev)) + role = __le16_to_cpu(sb->dev_roles[d]); + else + role = MD_DISK_ROLE_SPARE; + return role; +} + +/* return how many bytes are needed for bitmap, for cluster-md each node + * should have it's own bitmap */ +static unsigned int calc_bitmap_size(bitmap_super_t *bms, unsigned int boundary) +{ + unsigned long long bits, bytes; + + bits = bitmap_bits(__le64_to_cpu(bms->sync_size), + __le32_to_cpu(bms->chunksize)); + bytes = (bits+7) >> 3; + bytes += sizeof(bitmap_super_t); + bytes = ROUND_UP(bytes, boundary); + + return bytes; +} + +static unsigned int calc_sb_1_csum(struct mdp_superblock_1 * sb) +{ + unsigned int disk_csum, csum; + unsigned long long newcsum; + int size = sizeof(*sb) + __le32_to_cpu(sb->max_dev)*2; + unsigned int *isuper = (unsigned int*)sb; + +/* make sure I can count... 
*/ + if (offsetof(struct mdp_superblock_1,data_offset) != 128 || + offsetof(struct mdp_superblock_1, utime) != 192 || + sizeof(struct mdp_superblock_1) != 256) { + fprintf(stderr, "WARNING - superblock isn't sized correctly\n"); + } + + disk_csum = sb->sb_csum; + sb->sb_csum = 0; + newcsum = 0; + for (; size>=4; size -= 4 ) { + newcsum += __le32_to_cpu(*isuper); + isuper++; + } + + if (size == 2) + newcsum += __le16_to_cpu(*(unsigned short*) isuper); + + csum = (newcsum & 0xffffffff) + (newcsum >> 32); + sb->sb_csum = disk_csum; + return __cpu_to_le32(csum); +} + +/* + * Information related to file descriptor used for aligned reads/writes. + * Cache the block size. + */ +struct align_fd { + int fd; + int blk_sz; +}; + +static void init_afd(struct align_fd *afd, int fd) +{ + afd->fd = fd; + if (!get_dev_sector_size(afd->fd, NULL, (unsigned int *)&afd->blk_sz)) + afd->blk_sz = 512; +} + +static char abuf[4096+4096]; + +static int aread(struct align_fd *afd, void *buf, int len) +{ + /* aligned read. + * On devices with a 4K sector size, we need to read + * the full sector and copy relevant bits into + * the buffer + */ + int bsize, iosize; + char *b; + int n; + + bsize = afd->blk_sz; + + if (!bsize || bsize > 4096 || len > 4096) { + if (!bsize) + fprintf(stderr, "WARNING - aread() called with invalid block size\n"); + return -1; + } + b = ROUND_UP_PTR((char *)abuf, 4096); + + for (iosize = 0; iosize < len; iosize += bsize) + ; + n = read(afd->fd, b, iosize); + if (n <= 0) + return n; + lseek(afd->fd, len - n, 1); + if (n > len) + n = len; + memcpy(buf, b, n); + return n; +} + +static int awrite(struct align_fd *afd, void *buf, int len) +{ + /* aligned write. + * On devices with a 4K sector size, we need to write + * the full sector. We pre-read if the sector is larger + * than the write. + * The address must be sector-aligned. 
+ */ + int bsize, iosize; + char *b; + int n; + + bsize = afd->blk_sz; + if (!bsize || bsize > 4096 || len > 4096) { + if (!bsize) + fprintf(stderr, "WARNING - awrite() called with invalid block size\n"); + return -1; + } + b = ROUND_UP_PTR((char *)abuf, 4096); + + for (iosize = 0; iosize < len ; iosize += bsize) + ; + + if (len != iosize) { + n = read(afd->fd, b, iosize); + if (n <= 0) + return n; + lseek(afd->fd, -n, 1); + } + + memcpy(b, buf, len); + n = write(afd->fd, b, iosize); + if (n <= 0) + return n; + lseek(afd->fd, len - n, 1); + return len; +} + +static inline unsigned int md_feature_any_ppl_on(__u32 feature_map) +{ + return ((__cpu_to_le32(feature_map) & + (MD_FEATURE_PPL | MD_FEATURE_MUTLIPLE_PPLS))); +} + +static inline unsigned int choose_ppl_space(int chunk) +{ + return (PPL_HEADER_SIZE >> 9) + (chunk > 128*2 ? chunk : 128*2); +} + +static void examine_super1(struct supertype *st, char *homehost) +{ + struct mdp_superblock_1 *sb = st->sb; + bitmap_super_t *bms = (bitmap_super_t*)(((char*)sb)+MAX_SB_SIZE); + time_t atime; + unsigned int d; + int role; + int delta_extra = 0; + int i; + char *c; + int l = homehost ? 
strlen(homehost) : 0; + int layout; + unsigned long long sb_offset; + struct mdinfo info; + int inconsistent = 0; + + printf(" Magic : %08x\n", __le32_to_cpu(sb->magic)); + printf(" Version : 1"); + sb_offset = __le64_to_cpu(sb->super_offset); + if (sb_offset <= 4) + printf(".1\n"); + else if (sb_offset <= 8) + printf(".2\n"); + else + printf(".0\n"); + printf(" Feature Map : 0x%x\n", __le32_to_cpu(sb->feature_map)); + printf(" Array UUID : "); + for (i=0; i<16; i++) { + if ((i&3)==0 && i != 0) printf(":"); + printf("%02x", sb->set_uuid[i]); + } + printf("\n"); + printf(" Name : %.32s", sb->set_name); + if (l > 0 && l < 32 && + sb->set_name[l] == ':' && + strncmp(sb->set_name, homehost, l) == 0) + printf(" (local to host %s)", homehost); + printf("\n"); + if (bms->nodes > 0 && + (__le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET)) + printf(" Cluster Name : %-64s\n", bms->cluster_name); + atime = __le64_to_cpu(sb->ctime) & 0xFFFFFFFFFFULL; + printf(" Creation Time : %.24s\n", ctime(&atime)); + c=map_num(pers, __le32_to_cpu(sb->level)); + printf(" Raid Level : %s\n", c?c:"-unknown-"); + printf(" Raid Devices : %d\n", __le32_to_cpu(sb->raid_disks)); + printf("\n"); + printf(" Avail Dev Size : %llu sectors%s\n", + (unsigned long long)__le64_to_cpu(sb->data_size), + human_size(__le64_to_cpu(sb->data_size)<<9)); + if (__le32_to_cpu(sb->level) > 0) { + int ddsks = 0, ddsks_denom = 1; + switch(__le32_to_cpu(sb->level)) { + case 1: ddsks=1;break; + case 4: + case 5: ddsks = __le32_to_cpu(sb->raid_disks)-1; break; + case 6: ddsks = __le32_to_cpu(sb->raid_disks)-2; break; + case 10: + layout = __le32_to_cpu(sb->layout); + ddsks = __le32_to_cpu(sb->raid_disks); + ddsks_denom = (layout&255) * ((layout>>8)&255); + } + if (ddsks) { + long long asize = __le64_to_cpu(sb->size); + asize = (asize << 9) * ddsks / ddsks_denom; + printf(" Array Size : %llu KiB%s\n", + asize >> 10, human_size(asize)); + } + if (sb->size != sb->data_size) + printf(" Used Dev Size : %llu 
sectors%s\n", + (unsigned long long)__le64_to_cpu(sb->size), + human_size(__le64_to_cpu(sb->size)<<9)); + } + if (sb->data_offset) + printf(" Data Offset : %llu sectors\n", + (unsigned long long)__le64_to_cpu(sb->data_offset)); + if (sb->new_offset && + (__le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET)) { + unsigned long long offset = __le64_to_cpu(sb->data_offset); + offset += (signed)(int32_t)__le32_to_cpu(sb->new_offset); + printf(" New Offset : %llu sectors\n", offset); + } + printf(" Super Offset : %llu sectors\n", + (unsigned long long)__le64_to_cpu(sb->super_offset)); + if (__le32_to_cpu(sb->feature_map) & MD_FEATURE_RECOVERY_OFFSET) + printf("Recovery Offset : %llu sectors\n", + (unsigned long long)__le64_to_cpu(sb->recovery_offset)); + + st->ss->getinfo_super(st, &info, NULL); + if (info.space_after != 1 && + !(__le32_to_cpu(sb->feature_map) & MD_FEATURE_NEW_OFFSET)) + printf(" Unused Space : before=%llu sectors, after=%llu sectors\n", + info.space_before, info.space_after); + + printf(" State : %s\n", + (__le64_to_cpu(sb->resync_offset)+1)? 
"active":"clean"); + printf(" Device UUID : "); + for (i=0; i<16; i++) { + if ((i&3)==0 && i != 0) + printf(":"); + printf("%02x", sb->device_uuid[i]); + } + printf("\n"); + printf("\n"); + if (sb->feature_map & __cpu_to_le32(MD_FEATURE_BITMAP_OFFSET)) { + printf("Internal Bitmap : %ld sectors from superblock\n", + (long)(int32_t)__le32_to_cpu(sb->bitmap_offset)); + } else if (md_feature_any_ppl_on(sb->feature_map)) { + printf(" PPL : %u sectors at offset %d sectors from superblock\n", + __le16_to_cpu(sb->ppl.size), + __le16_to_cpu(sb->ppl.offset)); + } + if (sb->feature_map & __cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE)) { + printf(" Reshape pos'n : %llu%s\n", (unsigned long long) + __le64_to_cpu(sb->reshape_position)/2, + human_size(__le64_to_cpu(sb->reshape_position)<<9)); + if (__le32_to_cpu(sb->delta_disks)) { + printf(" Delta Devices : %d", + __le32_to_cpu(sb->delta_disks)); + printf(" (%d->%d)\n", + __le32_to_cpu(sb->raid_disks) - + __le32_to_cpu(sb->delta_disks), + __le32_to_cpu(sb->raid_disks)); + if ((int)__le32_to_cpu(sb->delta_disks) < 0) + delta_extra = -__le32_to_cpu(sb->delta_disks); + } + if (__le32_to_cpu(sb->new_level) != __le32_to_cpu(sb->level)) { + c = map_num(pers, __le32_to_cpu(sb->new_level)); + printf(" New Level : %s\n", c?c:"-unknown-"); + } + if (__le32_to_cpu(sb->new_layout) != + __le32_to_cpu(sb->layout)) { + if (__le32_to_cpu(sb->level) == 5) { + c = map_num(r5layout, + __le32_to_cpu(sb->new_layout)); + printf(" New Layout : %s\n", c?c:"-unknown-"); + } + if (__le32_to_cpu(sb->level) == 6) { + c = map_num(r6layout, + __le32_to_cpu(sb->new_layout)); + printf(" New Layout : %s\n", c?c:"-unknown-"); + } + if (__le32_to_cpu(sb->level) == 10) { + printf(" New Layout :"); + print_r10_layout(__le32_to_cpu(sb->new_layout)); + printf("\n"); + } + } + if (__le32_to_cpu(sb->new_chunk) != + __le32_to_cpu(sb->chunksize)) + printf(" New Chunksize : %dK\n", + __le32_to_cpu(sb->new_chunk)/2); + printf("\n"); + } + if (sb->devflags) { + printf(" Flags 
:"); + if (sb->devflags & WriteMostly1) + printf(" write-mostly"); + if (sb->devflags & FailFast1) + printf(" failfast"); + printf("\n"); + } + + atime = __le64_to_cpu(sb->utime) & 0xFFFFFFFFFFULL; + printf(" Update Time : %.24s\n", ctime(&atime)); + + if (sb->bblog_size && sb->bblog_offset) { + printf(" Bad Block Log : %d entries available at offset %ld sectors", + __le16_to_cpu(sb->bblog_size)*512/8, + (long)(int32_t)__le32_to_cpu(sb->bblog_offset)); + if (sb->feature_map & __cpu_to_le32(MD_FEATURE_BAD_BLOCKS)) + printf(" - bad blocks present."); + printf("\n"); + } + + if (calc_sb_1_csum(sb) == sb->sb_csum) + printf(" Checksum : %x - correct\n", + __le32_to_cpu(sb->sb_csum)); + else + printf(" Checksum : %x - expected %x\n", + __le32_to_cpu(sb->sb_csum), + __le32_to_cpu(calc_sb_1_csum(sb))); + printf(" Events : %llu\n", + (unsigned long long)__le64_to_cpu(sb->events)); + printf("\n"); + if (__le32_to_cpu(sb->level) == 0 && + (sb->feature_map & __cpu_to_le32(MD_FEATURE_RAID0_LAYOUT))) { + c = map_num(r0layout, __le32_to_cpu(sb->layout)); + printf(" Layout : %s\n", c?c:"-unknown-"); + } + if (__le32_to_cpu(sb->level) == 5) { + c = map_num(r5layout, __le32_to_cpu(sb->layout)); + printf(" Layout : %s\n", c?c:"-unknown-"); + } + if (__le32_to_cpu(sb->level) == 6) { + c = map_num(r6layout, __le32_to_cpu(sb->layout)); + printf(" Layout : %s\n", c?c:"-unknown-"); + } + if (__le32_to_cpu(sb->level) == 10) { + int lo = __le32_to_cpu(sb->layout); + printf(" Layout :"); + print_r10_layout(lo); + printf("\n"); + } + switch(__le32_to_cpu(sb->level)) { + case 0: + case 4: + case 5: + case 6: + case 10: + printf(" Chunk Size : %dK\n", + __le32_to_cpu(sb->chunksize)/2); + break; + case -1: + printf(" Rounding : %dK\n", + __le32_to_cpu(sb->chunksize)/2); + break; + default: + break; + } + printf("\n"); +#if 0 + /* This turns out to just be confusing */ + printf(" Array Slot : %d (", __le32_to_cpu(sb->dev_number)); + for (i = __le32_to_cpu(sb->max_dev); i> 0 ; i--) + if 
(__le16_to_cpu(sb->dev_roles[i-1]) != MD_DISK_ROLE_SPARE) + break; + for (d = 0; d < i; d++) { + int role = __le16_to_cpu(sb->dev_roles[d]); + if (d) + printf(", "); + if (role == MD_DISK_ROLE_SPARE) + printf("empty"); + else + if(role == MD_DISK_ROLE_FAULTY) + printf("failed"); + else + printf("%d", role); + } + printf(")\n"); +#endif + printf(" Device Role : "); + role = role_from_sb(sb); + if (role >= MD_DISK_ROLE_FAULTY) + printf("spare\n"); + else if (role == MD_DISK_ROLE_JOURNAL) + printf("Journal\n"); + else if (sb->feature_map & __cpu_to_le32(MD_FEATURE_REPLACEMENT)) + printf("Replacement device %d\n", role); + else + printf("Active device %d\n", role); + + printf(" Array State : "); + for (d = 0; d < __le32_to_cpu(sb->raid_disks) + delta_extra; d++) { + int cnt = 0; + unsigned int i; + for (i = 0; i < __le32_to_cpu(sb->max_dev); i++) { + unsigned int role = __le16_to_cpu(sb->dev_roles[i]); + if (role == d) + cnt++; + } + if (cnt == 2 && __le32_to_cpu(sb->level) > 0) + printf("R"); + else if (cnt == 1) + printf("A"); + else if (cnt == 0) + printf("."); + else { + printf("?"); + inconsistent = 1; + } + } +#if 0 + /* This is confusing too */ + faulty = 0; + for (i = 0; i< __le32_to_cpu(sb->max_dev); i++) { + int role = __le16_to_cpu(sb->dev_roles[i]); + if (role == MD_DISK_ROLE_FAULTY) + faulty++; + } + if (faulty) + printf(" %d failed", faulty); +#endif + printf(" ('A' == active, '.' 
== missing, 'R' == replacing)"); + printf("\n"); + for (d = 0; d < __le32_to_cpu(sb->max_dev); d++) { + unsigned int r = __le16_to_cpu(sb->dev_roles[d]); + if (r <= MD_DISK_ROLE_MAX && + r > __le32_to_cpu(sb->raid_disks) + delta_extra) + inconsistent = 1; + } + if (inconsistent) { + printf("WARNING Array state is inconsistent - each number should appear only once\n"); + for (d = 0; d < __le32_to_cpu(sb->max_dev); d++) + if (__le16_to_cpu(sb->dev_roles[d]) >= MD_DISK_ROLE_FAULTY) + printf(" %d:-", d); + else + printf(" %d:%d", d, __le16_to_cpu(sb->dev_roles[d])); + printf("\n"); + } +} + +static void brief_examine_super1(struct supertype *st, int verbose) +{ + struct mdp_superblock_1 *sb = st->sb; + int i; + unsigned long long sb_offset; + char *nm; + char *c = map_num(pers, __le32_to_cpu(sb->level)); + + nm = strchr(sb->set_name, ':'); + if (nm) + nm++; + else if (sb->set_name[0]) + nm = sb->set_name; + else + nm = NULL; + + printf("ARRAY "); + if (nm) { + printf("/dev/md/"); + print_escape(nm); + putchar(' '); + } + if (verbose && c) + printf(" level=%s", c); + sb_offset = __le64_to_cpu(sb->super_offset); + if (sb_offset <= 4) + printf(" metadata=1.1 "); + else if (sb_offset <= 8) + printf(" metadata=1.2 "); + else + printf(" metadata=1.0 "); + if (verbose) + printf("num-devices=%d ", __le32_to_cpu(sb->raid_disks)); + printf("UUID="); + for (i = 0; i < 16; i++) { + if ((i&3)==0 && i != 0) + printf(":"); + printf("%02x", sb->set_uuid[i]); + } + if (sb->set_name[0]) { + printf(" name="); + print_quoted(sb->set_name); + } + printf("\n"); +} + +static void export_examine_super1(struct supertype *st) +{ + struct mdp_superblock_1 *sb = st->sb; + int i; + int len = 32; + int layout; + + printf("MD_LEVEL=%s\n", map_num(pers, __le32_to_cpu(sb->level))); + printf("MD_DEVICES=%d\n", __le32_to_cpu(sb->raid_disks)); + for (i = 0; i < 32; i++) + if (sb->set_name[i] == '\n' || sb->set_name[i] == '\0') { + len = i; + break; + } + if (len) + printf("MD_NAME=%.*s\n", len, 
sb->set_name); + if (__le32_to_cpu(sb->level) > 0) { + int ddsks = 0, ddsks_denom = 1; + switch(__le32_to_cpu(sb->level)) { + case 1: + ddsks = 1; + break; + case 4: + case 5: + ddsks = __le32_to_cpu(sb->raid_disks)-1; + break; + case 6: + ddsks = __le32_to_cpu(sb->raid_disks)-2; + break; + case 10: + layout = __le32_to_cpu(sb->layout); + ddsks = __le32_to_cpu(sb->raid_disks); + ddsks_denom = (layout&255) * ((layout>>8)&255); + } + if (ddsks) { + long long asize = __le64_to_cpu(sb->size); + asize = (asize << 9) * ddsks / ddsks_denom; + printf("MD_ARRAY_SIZE=%s\n", + human_size_brief(asize, JEDEC)); + } + } + printf("MD_UUID="); + for (i = 0; i < 16; i++) { + if ((i&3) == 0 && i != 0) + printf(":"); + printf("%02x", sb->set_uuid[i]); + } + printf("\n"); + printf("MD_UPDATE_TIME=%llu\n", + __le64_to_cpu(sb->utime) & 0xFFFFFFFFFFULL); + printf("MD_DEV_UUID="); + for (i = 0; i < 16; i++) { + if ((i&3) == 0 && i != 0) + printf(":"); + printf("%02x", sb->device_uuid[i]); + } + printf("\n"); + printf("MD_EVENTS=%llu\n", + (unsigned long long)__le64_to_cpu(sb->events)); +} + +static int copy_metadata1(struct supertype *st, int from, int to) +{ + /* Read superblock. If it looks good, write it out. + * Then if a bitmap is present, copy that. + * And if a bad-block-list is present, copy that too. 
+ */ + void *buf; + unsigned long long dsize, sb_offset; + const int bufsize = 4*1024; + struct mdp_superblock_1 super, *sb; + + if (posix_memalign(&buf, 4096, bufsize) != 0) + return 1; + + if (!get_dev_size(from, NULL, &dsize)) + goto err; + + dsize >>= 9; + if (dsize < 24) + goto err; + switch(st->minor_version) { + case 0: + sb_offset = dsize; + sb_offset -= 8*2; + sb_offset &= ~(4*2-1); + break; + case 1: + sb_offset = 0; + break; + case 2: + sb_offset = 4*2; + break; + default: + goto err; + } + + if (lseek64(from, sb_offset << 9, 0) < 0LL) + goto err; + if (read(from, buf, bufsize) != bufsize) + goto err; + + sb = buf; + super = *sb; // save most of sb for when we reuse buf + + if (__le32_to_cpu(super.magic) != MD_SB_MAGIC || + __le32_to_cpu(super.major_version) != 1 || + __le64_to_cpu(super.super_offset) != sb_offset || + calc_sb_1_csum(sb) != super.sb_csum) + goto err; + + if (lseek64(to, sb_offset << 9, 0) < 0LL) + goto err; + if (write(to, buf, bufsize) != bufsize) + goto err; + + if (super.feature_map & __le32_to_cpu(MD_FEATURE_BITMAP_OFFSET)) { + unsigned long long bitmap_offset = sb_offset; + int bytes = 4096; // just an estimate. 
+ int written = 0; + struct align_fd afrom, ato; + + init_afd(&afrom, from); + init_afd(&ato, to); + + bitmap_offset += (int32_t)__le32_to_cpu(super.bitmap_offset); + + if (lseek64(from, bitmap_offset<<9, 0) < 0) + goto err; + if (lseek64(to, bitmap_offset<<9, 0) < 0) + goto err; + + for (written = 0; written < bytes ; ) { + int n = bytes - written; + if (n > 4096) + n = 4096; + if (aread(&afrom, buf, n) != n) + goto err; + if (written == 0) { + /* have the header, can calculate + * correct bitmap bytes */ + bitmap_super_t *bms; + bms = (void*)buf; + bytes = calc_bitmap_size(bms, 512); + if (n > bytes) + n = bytes; + } + if (awrite(&ato, buf, n) != n) + goto err; + written += n; + } + } + + if (super.bblog_size != 0 && + __le16_to_cpu(super.bblog_size) <= 100 && + super.bblog_offset != 0 && + (super.feature_map & __le32_to_cpu(MD_FEATURE_BAD_BLOCKS))) { + /* There is a bad block log */ + unsigned long long bb_offset = sb_offset; + int bytes = __le16_to_cpu(super.bblog_size) * 512; + int written = 0; + struct align_fd afrom, ato; + + init_afd(&afrom, from); + init_afd(&ato, to); + + bb_offset += (int32_t)__le32_to_cpu(super.bblog_offset); + + if (lseek64(from, bb_offset<<9, 0) < 0) + goto err; + if (lseek64(to, bb_offset<<9, 0) < 0) + goto err; + + for (written = 0; written < bytes ; ) { + int n = bytes - written; + if (n > 4096) + n = 4096; + if (aread(&afrom, buf, n) != n) + goto err; + + if (awrite(&ato, buf, n) != n) + goto err; + written += n; + } + } + + free(buf); + return 0; + +err: + free(buf); + return 1; +} + +static void detail_super1(struct supertype *st, char *homehost, char *subarray) +{ + struct mdp_superblock_1 *sb = st->sb; + bitmap_super_t *bms = (bitmap_super_t*)(((char*)sb) + MAX_SB_SIZE); + int i; + int l = homehost ? 
strlen(homehost) : 0; + + printf(" Name : %.32s", sb->set_name); + if (l > 0 && l < 32 && sb->set_name[l] == ':' && + strncmp(sb->set_name, homehost, l) == 0) + printf(" (local to host %s)", homehost); + if (bms->nodes > 0 && + (__le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET)) + printf("\n Cluster Name : %-64s", bms->cluster_name); + printf("\n UUID : "); + for (i = 0; i < 16; i++) { + if ((i&3) == 0 && i != 0) + printf(":"); + printf("%02x", sb->set_uuid[i]); + } + printf("\n Events : %llu\n\n", + (unsigned long long)__le64_to_cpu(sb->events)); +} + +static void brief_detail_super1(struct supertype *st, char *subarray) +{ + struct mdp_superblock_1 *sb = st->sb; + int i; + + if (sb->set_name[0]) { + printf(" name="); + print_quoted(sb->set_name); + } + printf(" UUID="); + for (i = 0; i < 16; i++) { + if ((i & 3) == 0 && i != 0) + printf(":"); + printf("%02x", sb->set_uuid[i]); + } +} + +static void export_detail_super1(struct supertype *st) +{ + struct mdp_superblock_1 *sb = st->sb; + int i; + int len = 32; + + for (i = 0; i < 32; i++) + if (sb->set_name[i] == '\n' || sb->set_name[i] == '\0') { + len = i; + break; + } + if (len) + printf("MD_NAME=%.*s\n", len, sb->set_name); +} + +static int examine_badblocks_super1(struct supertype *st, int fd, char *devname) +{ + struct mdp_superblock_1 *sb = st->sb; + unsigned long long offset; + int size; + __u64 *bbl, *bbp; + int i; + + if (!sb->bblog_size || __le16_to_cpu(sb->bblog_size) > 100 || + !sb->bblog_offset){ + printf("No bad-blocks list configured on %s\n", devname); + return 0; + } + if ((sb->feature_map & __cpu_to_le32(MD_FEATURE_BAD_BLOCKS)) == 0) { + printf("Bad-blocks list is empty in %s\n", devname); + return 0; + } + + size = __le16_to_cpu(sb->bblog_size)* 512; + if (posix_memalign((void**)&bbl, 4096, size) != 0) { + pr_err("could not allocate badblocks list\n"); + return 0; + } + offset = __le64_to_cpu(sb->super_offset) + + (int)__le32_to_cpu(sb->bblog_offset); + offset <<= 9; + if (lseek64(fd, 
offset, 0) < 0) { + pr_err("Cannot seek to bad-blocks list\n"); + return 1; + } + if (read(fd, bbl, size) != size) { + pr_err("Cannot read bad-blocks list\n"); + return 1; + } + /* 64bits per entry. 10 bits is block-count, 54 bits is block + * offset. Blocks are sectors unless bblog->shift makes them bigger + */ + bbp = (__u64*)bbl; + printf("Bad-blocks on %s:\n", devname); + for (i = 0; i < size/8; i++, bbp++) { + __u64 bb = __le64_to_cpu(*bbp); + int count = bb & 0x3ff; + unsigned long long sector = bb >> 10; + + if (bb + 1 == 0) + break; + + sector <<= sb->bblog_shift; + count <<= sb->bblog_shift; + + printf("%20llu for %d sectors\n", sector, count); + } + return 0; +} + +static int match_home1(struct supertype *st, char *homehost) +{ + struct mdp_superblock_1 *sb = st->sb; + int l = homehost ? strlen(homehost) : 0; + + return (l > 0 && l < 32 && sb->set_name[l] == ':' && + strncmp(sb->set_name, homehost, l) == 0); +} + +static void uuid_from_super1(struct supertype *st, int uuid[4]) +{ + struct mdp_superblock_1 *super = st->sb; + char *cuuid = (char*)uuid; + int i; + for (i = 0; i < 16; i++) + cuuid[i] = super->set_uuid[i]; +} + +static void getinfo_super1(struct supertype *st, struct mdinfo *info, char *map) +{ + struct mdp_superblock_1 *sb = st->sb; + struct bitmap_super_s *bsb = (void*)(((char*)sb)+MAX_SB_SIZE); + struct misc_dev_info *misc = + (void*)(((char*)sb)+MAX_SB_SIZE+BM_SUPER_SIZE); + int working = 0; + unsigned int i; + unsigned int role; + unsigned int map_disks = info->array.raid_disks; + unsigned long long super_offset; + unsigned long long data_size; + + memset(info, 0, sizeof(*info)); + info->array.major_version = 1; + info->array.minor_version = st->minor_version; + info->array.patch_version = 0; + info->array.raid_disks = __le32_to_cpu(sb->raid_disks); + info->array.level = __le32_to_cpu(sb->level); + info->array.layout = __le32_to_cpu(sb->layout); + info->array.md_minor = -1; + info->array.ctime = __le64_to_cpu(sb->ctime); + 
info->array.utime = __le64_to_cpu(sb->utime); + info->array.chunk_size = __le32_to_cpu(sb->chunksize)*512; + info->array.state = + (__le64_to_cpu(sb->resync_offset) == MaxSector) ? 1 : 0; + + super_offset = __le64_to_cpu(sb->super_offset); + info->data_offset = __le64_to_cpu(sb->data_offset); + info->component_size = __le64_to_cpu(sb->size); + if (sb->feature_map & __le32_to_cpu(MD_FEATURE_BITMAP_OFFSET)) { + info->bitmap_offset = (int32_t)__le32_to_cpu(sb->bitmap_offset); + if (__le32_to_cpu(bsb->nodes) > 1) + info->array.state |= (1 << MD_SB_CLUSTERED); + } else if (md_feature_any_ppl_on(sb->feature_map)) { + info->ppl_offset = __le16_to_cpu(sb->ppl.offset); + info->ppl_size = __le16_to_cpu(sb->ppl.size); + info->ppl_sector = super_offset + info->ppl_offset; + } + + info->disk.major = 0; + info->disk.minor = 0; + info->disk.number = __le32_to_cpu(sb->dev_number); + if (__le32_to_cpu(sb->dev_number) >= __le32_to_cpu(sb->max_dev) || + __le32_to_cpu(sb->dev_number) >= MAX_DEVS) + role = MD_DISK_ROLE_FAULTY; + else + role = __le16_to_cpu(sb->dev_roles[__le32_to_cpu(sb->dev_number)]); + + if (info->array.level <= 0) + data_size = __le64_to_cpu(sb->data_size); + else + data_size = __le64_to_cpu(sb->size); + if (info->data_offset < super_offset) { + unsigned long long end; + info->space_before = info->data_offset; + end = super_offset; + + if (sb->bblog_offset && sb->bblog_size) { + unsigned long long bboffset = super_offset; + bboffset += (int32_t)__le32_to_cpu(sb->bblog_offset); + if (bboffset < end) + end = bboffset; + } + + if (super_offset + info->bitmap_offset + info->ppl_offset < end) + end = super_offset + info->bitmap_offset + + info->ppl_offset; + + if (info->data_offset + data_size < end) + info->space_after = end - data_size - info->data_offset; + else + info->space_after = 0; + } else { + unsigned long long earliest; + earliest = super_offset + (32+4)*2; /* match kernel */ + if (info->bitmap_offset > 0) { + unsigned long long bmend = info->bitmap_offset; + 
unsigned long long size = calc_bitmap_size(bsb, 4096); + size /= 512; + bmend += size; + if (bmend > earliest) + earliest = bmend; + } else if (info->ppl_offset > 0) { + unsigned long long pplend; + + pplend = info->ppl_offset + info->ppl_size; + if (pplend > earliest) + earliest = pplend; + } + if (sb->bblog_offset && sb->bblog_size) { + unsigned long long bbend = super_offset; + bbend += (int32_t)__le32_to_cpu(sb->bblog_offset); + bbend += __le16_to_cpu(sb->bblog_size); + if (bbend > earliest) + earliest = bbend; + } + if (earliest < info->data_offset) + info->space_before = info->data_offset - earliest; + else + info->space_before = 0; + info->space_after = misc->device_size - data_size - + info->data_offset; + } + if (info->space_before == 0 && info->space_after == 0) { + /* It will look like we don't support data_offset changes, + * be we do - it's just that there is no room. + * A change that reduced the number of devices should + * still be allowed, so set the otherwise useless value of '1' + */ + info->space_after = 1; + } + + info->disk.raid_disk = -1; + switch(role) { + case MD_DISK_ROLE_SPARE: + /* spare: not active, not sync, not faulty */ + info->disk.state = 0; + break; + case MD_DISK_ROLE_FAULTY: + info->disk.state = (1 << MD_DISK_FAULTY); /* faulty */ + break; + case MD_DISK_ROLE_JOURNAL: + info->disk.state = (1 << MD_DISK_JOURNAL); + info->disk.raid_disk = role; + /* journal uses all 4kB blocks*/ + info->space_after = (misc->device_size - info->data_offset) % 8; + break; + default: + info->disk.state = 6; /* active and in sync */ + info->disk.raid_disk = role; + } + if (sb->devflags & WriteMostly1) + info->disk.state |= (1 << MD_DISK_WRITEMOSTLY); + if (sb->devflags & FailFast1) + info->disk.state |= (1 << MD_DISK_FAILFAST); + info->events = __le64_to_cpu(sb->events); + sprintf(info->text_version, "1.%d", st->minor_version); + info->safe_mode_delay = 200; + + memcpy(info->uuid, sb->set_uuid, 16); + + strncpy(info->name, sb->set_name, 32); + 
info->name[32] = 0; + + if ((__le32_to_cpu(sb->feature_map)&MD_FEATURE_REPLACEMENT)) { + info->disk.state &= ~(1 << MD_DISK_SYNC); + info->disk.state |= 1 << MD_DISK_REPLACEMENT; + } + + if (sb->feature_map & __le32_to_cpu(MD_FEATURE_RECOVERY_OFFSET)) + info->recovery_start = __le32_to_cpu(sb->recovery_offset); + else + info->recovery_start = MaxSector; + + if (sb->feature_map & __le32_to_cpu(MD_FEATURE_RESHAPE_ACTIVE)) { + info->reshape_active = 1; + if ((sb->feature_map & __le32_to_cpu(MD_FEATURE_NEW_OFFSET)) && + sb->new_offset != 0) + info->reshape_active |= RESHAPE_NO_BACKUP; + info->reshape_progress = __le64_to_cpu(sb->reshape_position); + info->new_level = __le32_to_cpu(sb->new_level); + info->delta_disks = __le32_to_cpu(sb->delta_disks); + info->new_layout = __le32_to_cpu(sb->new_layout); + info->new_chunk = __le32_to_cpu(sb->new_chunk)<<9; + if (info->delta_disks < 0) + info->array.raid_disks -= info->delta_disks; + } else + info->reshape_active = 0; + + info->recovery_blocked = info->reshape_active; + + if (map) + for (i=0; i<map_disks; i++) + map[i] = 0; + for (i = 0; i < __le32_to_cpu(sb->max_dev); i++) { + role = __le16_to_cpu(sb->dev_roles[i]); + if (/*role == MD_DISK_ROLE_SPARE || */role < (unsigned) info->array.raid_disks) { + working++; + if (map && role < map_disks) + map[role] = 1; + } + } + + info->array.working_disks = working; + + if (sb->feature_map & __le32_to_cpu(MD_FEATURE_JOURNAL)) { + info->journal_device_required = 1; + info->consistency_policy = CONSISTENCY_POLICY_JOURNAL; + } else if (md_feature_any_ppl_on(sb->feature_map)) { + info->consistency_policy = CONSISTENCY_POLICY_PPL; + } else if (sb->feature_map & __le32_to_cpu(MD_FEATURE_BITMAP_OFFSET)) { + info->consistency_policy = CONSISTENCY_POLICY_BITMAP; + } else if (info->array.level <= 0) { + info->consistency_policy = CONSISTENCY_POLICY_NONE; + } else { + info->consistency_policy = CONSISTENCY_POLICY_RESYNC; + } + + info->journal_clean = 0; +} + +static struct mdinfo 
*container_content1(struct supertype *st, char *subarray) +{ + struct mdinfo *info; + + if (subarray) + return NULL; + + info = xmalloc(sizeof(*info)); + getinfo_super1(st, info, NULL); + return info; +} + +static int update_super1(struct supertype *st, struct mdinfo *info, + char *update, char *devname, int verbose, + int uuid_set, char *homehost) +{ + /* NOTE: for 'assemble' and 'force' we need to return non-zero + * if any change was made. For others, the return value is + * ignored. + */ + int rv = 0; + struct mdp_superblock_1 *sb = st->sb; + bitmap_super_t *bms = (bitmap_super_t*)(((char*)sb) + MAX_SB_SIZE); + + if (strcmp(update, "homehost") == 0 && + homehost) { + /* Note that 'homehost' is special as it is really + * a "name" update. + */ + char *c; + update = "name"; + c = strchr(sb->set_name, ':'); + if (c) + strncpy(info->name, c+1, 31 - (c-sb->set_name)); + else + strncpy(info->name, sb->set_name, 32); + info->name[32] = 0; + } + + if (strcmp(update, "force-one")==0) { + /* Not enough devices for a working array, + * so bring this one up-to-date + */ + if (sb->events != __cpu_to_le64(info->events)) + rv = 1; + sb->events = __cpu_to_le64(info->events); + } else if (strcmp(update, "force-array")==0) { + /* Degraded array and 'force' requests to + * maybe need to mark it 'clean'. 
+ */ + switch(__le32_to_cpu(sb->level)) { + case 4: + case 5: + case 6: + /* need to force clean */ + if (sb->resync_offset != MaxSector) + rv = 1; + sb->resync_offset = MaxSector; + } + } else if (strcmp(update, "assemble")==0) { + int d = info->disk.number; + int want; + if (info->disk.state & (1<<MD_DISK_ACTIVE)) + want = info->disk.raid_disk; + else if (info->disk.state & (1<<MD_DISK_JOURNAL)) + want = MD_DISK_ROLE_JOURNAL; + else + want = MD_DISK_ROLE_SPARE; + if (sb->dev_roles[d] != __cpu_to_le16(want)) { + sb->dev_roles[d] = __cpu_to_le16(want); + rv = 1; + } + if (info->reshape_active && + sb->feature_map & + __le32_to_cpu(MD_FEATURE_RESHAPE_ACTIVE) && + info->delta_disks >= 0 && + info->reshape_progress < + __le64_to_cpu(sb->reshape_position)) { + sb->reshape_position = + __cpu_to_le64(info->reshape_progress); + rv = 1; + } + if (info->reshape_active && + sb->feature_map & + __le32_to_cpu(MD_FEATURE_RESHAPE_ACTIVE) && + info->delta_disks < 0 && + info->reshape_progress > + __le64_to_cpu(sb->reshape_position)) { + sb->reshape_position = + __cpu_to_le64(info->reshape_progress); + rv = 1; + } + } else if (strcmp(update, "linear-grow-new") == 0) { + int i; + int fd; + int max = __le32_to_cpu(sb->max_dev); + + if (max > MAX_DEVS) + return -2; + + for (i = 0; i < max; i++) + if (__le16_to_cpu(sb->dev_roles[i]) >= + MD_DISK_ROLE_FAULTY) + break; + if (i != info->disk.number) + return -2; + sb->dev_number = __cpu_to_le32(i); + + if (i == max) + sb->max_dev = __cpu_to_le32(max+1); + if (i > max) + return -2; + + random_uuid(sb->device_uuid); + + sb->dev_roles[i] = __cpu_to_le16(info->disk.raid_disk); + + fd = open(devname, O_RDONLY); + if (fd >= 0) { + unsigned long long ds; + get_dev_size(fd, devname, &ds); + close(fd); + ds >>= 9; + if (__le64_to_cpu(sb->super_offset) < + __le64_to_cpu(sb->data_offset)) { + sb->data_size = __cpu_to_le64( + ds - __le64_to_cpu(sb->data_offset)); + } else { + ds -= 8*2; + ds &= ~(unsigned long long)(4*2-1); + sb->super_offset = 
__cpu_to_le64(ds); + sb->data_size = __cpu_to_le64( + ds - __le64_to_cpu(sb->data_offset)); + } + } + } else if (strcmp(update, "linear-grow-update") == 0) { + int max = __le32_to_cpu(sb->max_dev); + int i = info->disk.number; + if (max > MAX_DEVS || i > MAX_DEVS) + return -2; + if (i > max) + return -2; + if (i == max) + sb->max_dev = __cpu_to_le32(max+1); + sb->raid_disks = __cpu_to_le32(info->array.raid_disks); + sb->dev_roles[info->disk.number] = + __cpu_to_le16(info->disk.raid_disk); + } else if (strcmp(update, "resync") == 0) { + /* make sure resync happens */ + sb->resync_offset = 0ULL; + } else if (strcmp(update, "uuid") == 0) { + copy_uuid(sb->set_uuid, info->uuid, super1.swapuuid); + + if (__le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET) + memcpy(bms->uuid, sb->set_uuid, 16); + } else if (strcmp(update, "no-bitmap") == 0) { + sb->feature_map &= ~__cpu_to_le32(MD_FEATURE_BITMAP_OFFSET); + if (bms->version == BITMAP_MAJOR_CLUSTERED && !IsBitmapDirty(devname)) + sb->resync_offset = MaxSector; + } else if (strcmp(update, "bbl") == 0) { + /* only possible if there is room after the bitmap, or if + * there is no bitmap + */ + unsigned long long sb_offset = __le64_to_cpu(sb->super_offset); + unsigned long long data_offset = __le64_to_cpu(sb->data_offset); + long bitmap_offset = 0; + long bm_sectors = 0; + long space; + + if (sb->feature_map & __cpu_to_le32(MD_FEATURE_BITMAP_OFFSET)) { + bitmap_offset = (long)__le32_to_cpu(sb->bitmap_offset); + bm_sectors = calc_bitmap_size(bms, 4096) >> 9; + } else if (md_feature_any_ppl_on(sb->feature_map)) { + bitmap_offset = (long)__le16_to_cpu(sb->ppl.offset); + bm_sectors = (long)__le16_to_cpu(sb->ppl.size); + } + + if (sb_offset < data_offset) { + /* + * 1.1 or 1.2. 
Put bbl after bitmap leaving + * at least 32K + */ + long bb_offset; + bb_offset = sb_offset + 8; + if (bm_sectors && bitmap_offset > 0) + bb_offset = bitmap_offset + bm_sectors; + while (bb_offset < (long)sb_offset + 8 + 32*2 && + bb_offset + 8+8 <= (long)data_offset) + /* too close to bitmap, and room to grow */ + bb_offset += 8; + if (bb_offset + 8 <= (long)data_offset) { + sb->bblog_size = __cpu_to_le16(8); + sb->bblog_offset = __cpu_to_le32(bb_offset); + } + } else { + /* 1.0 - Put bbl just before super block */ + if (bm_sectors && bitmap_offset < 0) + space = -bitmap_offset - bm_sectors; + else + space = sb_offset - data_offset - + __le64_to_cpu(sb->data_size); + if (space >= 8) { + sb->bblog_size = __cpu_to_le16(8); + sb->bblog_offset = __cpu_to_le32((unsigned)-8); + } + } + } else if (strcmp(update, "no-bbl") == 0) { + if (sb->feature_map & __cpu_to_le32(MD_FEATURE_BAD_BLOCKS)) + pr_err("Cannot remove active bbl from %s\n",devname); + else { + sb->bblog_size = 0; + sb->bblog_shift = 0; + sb->bblog_offset = 0; + } + } else if (strcmp(update, "force-no-bbl") == 0) { + sb->feature_map &= ~ __cpu_to_le32(MD_FEATURE_BAD_BLOCKS); + sb->bblog_size = 0; + sb->bblog_shift = 0; + sb->bblog_offset = 0; + } else if (strcmp(update, "ppl") == 0) { + unsigned long long sb_offset = __le64_to_cpu(sb->super_offset); + unsigned long long data_offset = __le64_to_cpu(sb->data_offset); + unsigned long long data_size = __le64_to_cpu(sb->data_size); + long bb_offset = __le32_to_cpu(sb->bblog_offset); + int space; + int offset; + + if (sb->feature_map & __cpu_to_le32(MD_FEATURE_BITMAP_OFFSET)) { + pr_err("Cannot add PPL to array with bitmap\n"); + return -2; + } + + if (sb->feature_map & __cpu_to_le32(MD_FEATURE_JOURNAL)) { + pr_err("Cannot add PPL to array with journal\n"); + return -2; + } + + if (sb_offset < data_offset) { + if (bb_offset) + space = bb_offset - 8; + else + space = data_offset - sb_offset - 8; + offset = 8; + } else { + offset = -(sb_offset - data_offset - 
data_size); + if (offset < INT16_MIN) + offset = INT16_MIN; + space = -(offset - bb_offset); + } + + if (space < (PPL_HEADER_SIZE >> 9) + 8) { + pr_err("Not enough space to add ppl\n"); + return -2; + } + + if (space >= (MULTIPLE_PPL_AREA_SIZE_SUPER1 >> 9)) { + space = (MULTIPLE_PPL_AREA_SIZE_SUPER1 >> 9); + } else { + int optimal_space = choose_ppl_space( + __le32_to_cpu(sb->chunksize)); + if (space > optimal_space) + space = optimal_space; + if (space > UINT16_MAX) + space = UINT16_MAX; + } + + sb->ppl.offset = __cpu_to_le16(offset); + sb->ppl.size = __cpu_to_le16(space); + sb->feature_map |= __cpu_to_le32(MD_FEATURE_PPL); + } else if (strcmp(update, "no-ppl") == 0) { + sb->feature_map &= ~__cpu_to_le32(MD_FEATURE_PPL | + MD_FEATURE_MUTLIPLE_PPLS); + } else if (strcmp(update, "name") == 0) { + if (info->name[0] == 0) + sprintf(info->name, "%d", info->array.md_minor); + memset(sb->set_name, 0, sizeof(sb->set_name)); + if (homehost && + strchr(info->name, ':') == NULL && + strlen(homehost)+1+strlen(info->name) < 32) { + strcpy(sb->set_name, homehost); + strcat(sb->set_name, ":"); + strcat(sb->set_name, info->name); + } else { + int namelen; + + namelen = min((int)strlen(info->name), + (int)sizeof(sb->set_name) - 1); + memcpy(sb->set_name, info->name, namelen); + memset(&sb->set_name[namelen], '\0', + sizeof(sb->set_name) - namelen); + } + } else if (strcmp(update, "devicesize") == 0 && + __le64_to_cpu(sb->super_offset) < + __le64_to_cpu(sb->data_offset)) { + /* set data_size to device size less data_offset */ + struct misc_dev_info *misc = (struct misc_dev_info*) + (st->sb + MAX_SB_SIZE + BM_SUPER_SIZE); + sb->data_size = __cpu_to_le64( + misc->device_size - __le64_to_cpu(sb->data_offset)); + } else if (strncmp(update, "revert-reshape", 14) == 0) { + rv = -2; + if (!(sb->feature_map & + __cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE))) + pr_err("No active reshape to revert on %s\n", + devname); + else { + __u32 temp; + unsigned long long reshape_sectors; + long 
reshape_chunk; + rv = 0; + /* If the reshape hasn't started, just stop it. + * It is conceivable that a stripe was modified but + * the metadata not updated. In that case the backup + * should have been used to get passed the critical stage. + * If that couldn't happen, the "-nobackup" version + * will be used. + */ + if (strcmp(update, "revert-reshape-nobackup") == 0 && + sb->reshape_position == 0 && + (__le32_to_cpu(sb->delta_disks) > 0 || + (__le32_to_cpu(sb->delta_disks) == 0 && + !(sb->feature_map & __cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS))))) { + sb->feature_map &= ~__cpu_to_le32(MD_FEATURE_RESHAPE_ACTIVE); + sb->raid_disks = __cpu_to_le32(__le32_to_cpu(sb->raid_disks) - + __le32_to_cpu(sb->delta_disks)); + sb->delta_disks = 0; + goto done; + } + /* reshape_position is a little messy. + * Its value must be a multiple of the larger + * chunk size, and of the "after" data disks. + * So when reverting we need to change it to + * be a multiple of the new "after" data disks, + * which is the old "before". + * If it isn't already a multiple of 'before', + * the only thing we could do would be + * copy some block around on the disks, which + * is easy to get wrong. + * So we reject a revert-reshape unless the + * alignment is good. + */ + if (__le32_to_cpu(sb->level) >= 4 && + __le32_to_cpu(sb->level) <= 6) { + reshape_sectors = + __le64_to_cpu(sb->reshape_position); + reshape_chunk = __le32_to_cpu(sb->new_chunk); + reshape_chunk *= __le32_to_cpu(sb->raid_disks) - + __le32_to_cpu(sb->delta_disks) - + (__le32_to_cpu(sb->level)==6 ? 
2 : 1); + if (reshape_sectors % reshape_chunk) { + pr_err("Reshape position is not suitably aligned.\n"); + pr_err("Try normal assembly and stop again\n"); + return -2; + } + } + sb->raid_disks = + __cpu_to_le32(__le32_to_cpu(sb->raid_disks) - + __le32_to_cpu(sb->delta_disks)); + if (sb->delta_disks == 0) + sb->feature_map ^= __cpu_to_le32(MD_FEATURE_RESHAPE_BACKWARDS); + else + sb->delta_disks = __cpu_to_le32(-__le32_to_cpu(sb->delta_disks)); + + temp = sb->new_layout; + sb->new_layout = sb->layout; + sb->layout = temp; + + temp = sb->new_chunk; + sb->new_chunk = sb->chunksize; + sb->chunksize = temp; + + if (sb->feature_map & + __cpu_to_le32(MD_FEATURE_NEW_OFFSET)) { + long offset_delta = + (int32_t)__le32_to_cpu(sb->new_offset); + sb->data_offset = __cpu_to_le64(__le64_to_cpu(sb->data_offset) + offset_delta); + sb->new_offset = __cpu_to_le32(-offset_delta); + sb->data_size = __cpu_to_le64(__le64_to_cpu(sb->data_size) - offset_delta); + } + done:; + } + } else if (strcmp(update, "_reshape_progress") == 0) + sb->reshape_position = __cpu_to_le64(info->reshape_progress); + else if (strcmp(update, "writemostly") == 0) + sb->devflags |= WriteMostly1; + else if (strcmp(update, "readwrite") == 0) + sb->devflags &= ~WriteMostly1; + else if (strcmp(update, "failfast") == 0) + sb->devflags |= FailFast1; + else if (strcmp(update, "nofailfast") == 0) + sb->devflags &= ~FailFast1; + else if (strcmp(update, "layout-original") == 0 || + strcmp(update, "layout-alternate") == 0 || + strcmp(update, "layout-unspecified") == 0) { + if (__le32_to_cpu(sb->level) != 0) { + pr_err("%s: %s only supported for RAID0\n", + devname?:"", update); + rv = -1; + } else if (strcmp(update, "layout-unspecified") == 0) { + sb->feature_map &= ~__cpu_to_le32(MD_FEATURE_RAID0_LAYOUT); + sb->layout = 0; + } else { + sb->feature_map |= __cpu_to_le32(MD_FEATURE_RAID0_LAYOUT); + sb->layout = __cpu_to_le32(update[7] == 'o' ? 
1 : 2); + } + } else + rv = -1; + + sb->sb_csum = calc_sb_1_csum(sb); + + return rv; +} + +static int init_super1(struct supertype *st, mdu_array_info_t *info, + struct shape *s, char *name, char *homehost, + int *uuid, unsigned long long data_offset) +{ + struct mdp_superblock_1 *sb; + int spares; + char defname[10]; + int sbsize; + + if (posix_memalign((void**)&sb, 4096, SUPER1_SIZE) != 0) { + pr_err("could not allocate superblock\n"); + return 0; + } + memset(sb, 0, SUPER1_SIZE); + + st->sb = sb; + if (info == NULL) { + /* zeroing superblock */ + return 0; + } + + spares = info->working_disks - info->active_disks; + if (info->raid_disks + spares > MAX_DEVS) { + pr_err("too many devices requested: %d+%d > %d\n", + info->raid_disks , spares, MAX_DEVS); + return 0; + } + + sb->magic = __cpu_to_le32(MD_SB_MAGIC); + sb->major_version = __cpu_to_le32(1); + sb->feature_map = 0; + sb->pad0 = 0; + + if (uuid) + copy_uuid(sb->set_uuid, uuid, super1.swapuuid); + else + random_uuid(sb->set_uuid);; + + if (name == NULL || *name == 0) { + sprintf(defname, "%d", info->md_minor); + name = defname; + } + if (homehost && + strchr(name, ':')== NULL && + strlen(homehost)+1+strlen(name) < 32) { + strcpy(sb->set_name, homehost); + strcat(sb->set_name, ":"); + strcat(sb->set_name, name); + } else { + int namelen; + + namelen = min((int)strlen(name), + (int)sizeof(sb->set_name) - 1); + memcpy(sb->set_name, name, namelen); + memset(&sb->set_name[namelen], '\0', + sizeof(sb->set_name) - namelen); + } + + sb->ctime = __cpu_to_le64((unsigned long long)time(0)); + sb->level = __cpu_to_le32(info->level); + sb->layout = __cpu_to_le32(info->layout); + sb->size = __cpu_to_le64(s->size*2ULL); + sb->chunksize = __cpu_to_le32(info->chunk_size>>9); + sb->raid_disks = __cpu_to_le32(info->raid_disks); + + sb->data_offset = __cpu_to_le64(data_offset); + sb->data_size = __cpu_to_le64(0); + sb->super_offset = __cpu_to_le64(0); + sb->recovery_offset = __cpu_to_le64(0); + + sb->utime = sb->ctime; + 
sb->events = __cpu_to_le64(1); + if (info->state & (1<<MD_SB_CLEAN)) + sb->resync_offset = MaxSector; + else + sb->resync_offset = 0; + sbsize = sizeof(struct mdp_superblock_1) + + 2 * (info->raid_disks + spares); + sbsize = ROUND_UP(sbsize, 512); + sb->max_dev = + __cpu_to_le32((sbsize - sizeof(struct mdp_superblock_1)) / 2); + + memset(sb->dev_roles, 0xff, + MAX_SB_SIZE - sizeof(struct mdp_superblock_1)); + + if (s->consistency_policy == CONSISTENCY_POLICY_PPL) + sb->feature_map |= __cpu_to_le32(MD_FEATURE_PPL); + + return 1; +} + +struct devinfo { + int fd; + char *devname; + long long data_offset; + unsigned long long dev_size; + mdu_disk_info_t disk; + struct devinfo *next; +}; + +/* Add a device to the superblock being created */ +static int add_to_super1(struct supertype *st, mdu_disk_info_t *dk, + int fd, char *devname, unsigned long long data_offset) +{ + struct mdp_superblock_1 *sb = st->sb; + __u16 *rp = sb->dev_roles + dk->number; + struct devinfo *di, **dip; + int dk_state; + + dk_state = dk->state & ~(1<<MD_DISK_FAILFAST); + if ((dk_state & (1<<MD_DISK_ACTIVE)) && + (dk_state & (1<<MD_DISK_SYNC)))/* active, sync */ + *rp = __cpu_to_le16(dk->raid_disk); + else if (dk_state & (1<<MD_DISK_JOURNAL)) + *rp = MD_DISK_ROLE_JOURNAL; + else if ((dk_state & ~(1<<MD_DISK_ACTIVE)) == 0) + /* active or idle -> spare */ + *rp = MD_DISK_ROLE_SPARE; + else + *rp = MD_DISK_ROLE_FAULTY; + + if (dk->number >= (int)__le32_to_cpu(sb->max_dev) && + __le32_to_cpu(sb->max_dev) < MAX_DEVS) + sb->max_dev = __cpu_to_le32(dk->number+1); + + sb->dev_number = __cpu_to_le32(dk->number); + sb->devflags = 0; /* don't copy another disks flags */ + sb->sb_csum = calc_sb_1_csum(sb); + + dip = (struct devinfo **)&st->info; + while (*dip) + dip = &(*dip)->next; + di = xmalloc(sizeof(struct devinfo)); + di->fd = fd; + di->devname = devname; + di->disk = *dk; + di->data_offset = data_offset; + get_dev_size(fd, NULL, &di->dev_size); + di->next = NULL; + *dip = di; + + return 0; +} + +static 
int locate_bitmap1(struct supertype *st, int fd, int node_num); + +static int store_super1(struct supertype *st, int fd) +{ + struct mdp_superblock_1 *sb = st->sb; + unsigned long long sb_offset; + struct align_fd afd; + int sbsize; + unsigned long long dsize; + + if (!get_dev_size(fd, NULL, &dsize)) + return 1; + + dsize >>= 9; + + if (dsize < 24) + return 2; + + init_afd(&afd, fd); + + /* + * Calculate the position of the superblock. + * It is always aligned to a 4K boundary and + * depending on minor_version, it can be: + * 0: At least 8K, but less than 12K, from end of device + * 1: At start of device + * 2: 4K from start of device. + */ + switch(st->minor_version) { + case 0: + sb_offset = dsize; + sb_offset -= 8*2; + sb_offset &= ~(4*2-1); + break; + case 1: + sb_offset = 0; + break; + case 2: + sb_offset = 4*2; + break; + default: + return -EINVAL; + } + + if (sb_offset != __le64_to_cpu(sb->super_offset) && + 0 != __le64_to_cpu(sb->super_offset) + ) { + pr_err("internal error - sb_offset is wrong\n"); + abort(); + } + + if (lseek64(fd, sb_offset << 9, 0)< 0LL) + return 3; + + sbsize = ROUND_UP(sizeof(*sb) + 2 * __le32_to_cpu(sb->max_dev), 512); + + if (awrite(&afd, sb, sbsize) != sbsize) + return 4; + + if (sb->feature_map & __cpu_to_le32(MD_FEATURE_BITMAP_OFFSET)) { + struct bitmap_super_s *bm = (struct bitmap_super_s*) + (((char*)sb)+MAX_SB_SIZE); + if (__le32_to_cpu(bm->magic) == BITMAP_MAGIC) { + locate_bitmap1(st, fd, 0); + if (awrite(&afd, bm, sizeof(*bm)) != sizeof(*bm)) + return 5; + } + } + fsync(fd); + + return 0; +} + +static int load_super1(struct supertype *st, int fd, char *devname); + +static unsigned long choose_bm_space(unsigned long devsize) +{ + /* if the device is bigger than 8Gig, save 64k for bitmap usage, + * if bigger than 200Gig, save 128k + * NOTE: result must be multiple of 4K else bad things happen + * on 4K-sector devices. 
+ */ + if (devsize < 64*2) + return 0; + if (devsize - 64*2 >= 200*1024*1024*2) + return 128*2; + if (devsize - 4*2 > 8*1024*1024*2) + return 64*2; + return 4*2; +} + +static void free_super1(struct supertype *st); + +__u32 crc32c_le(__u32 crc, unsigned char const *p, size_t len); + +static int write_init_ppl1(struct supertype *st, struct mdinfo *info, int fd) +{ + struct mdp_superblock_1 *sb = st->sb; + void *buf; + struct ppl_header *ppl_hdr; + int ret; + + /* first clear entire ppl space */ + ret = zero_disk_range(fd, info->ppl_sector, info->ppl_size); + if (ret) + return ret; + + ret = posix_memalign(&buf, 4096, PPL_HEADER_SIZE); + if (ret) { + pr_err("Failed to allocate PPL header buffer\n"); + return ret; + } + + memset(buf, 0, PPL_HEADER_SIZE); + ppl_hdr = buf; + memset(ppl_hdr->reserved, 0xff, PPL_HDR_RESERVED); + ppl_hdr->signature = __cpu_to_le32(~crc32c_le(~0, sb->set_uuid, + sizeof(sb->set_uuid))); + ppl_hdr->checksum = __cpu_to_le32(~crc32c_le(~0, buf, PPL_HEADER_SIZE)); + + if (lseek64(fd, info->ppl_sector * 512, SEEK_SET) < 0) { + ret = errno; + perror("Failed to seek to PPL header location"); + } + + if (!ret && write(fd, buf, PPL_HEADER_SIZE) != PPL_HEADER_SIZE) { + ret = errno; + perror("Write PPL header failed"); + } + + if (!ret) + fsync(fd); + + free(buf); + return ret; +} + +#define META_BLOCK_SIZE 4096 + +static int write_empty_r5l_meta_block(struct supertype *st, int fd) +{ + struct r5l_meta_block *mb; + struct mdp_superblock_1 *sb = st->sb; + struct align_fd afd; + __u32 crc; + + init_afd(&afd, fd); + + if (posix_memalign((void**)&mb, 4096, META_BLOCK_SIZE) != 0) { + pr_err("Could not allocate memory for the meta block.\n"); + return 1; + } + + memset(mb, 0, META_BLOCK_SIZE); + + mb->magic = __cpu_to_le32(R5LOG_MAGIC); + mb->version = R5LOG_VERSION; + mb->meta_size = __cpu_to_le32(sizeof(struct r5l_meta_block)); + mb->seq = __cpu_to_le64(random32()); + mb->position = __cpu_to_le64(0); + + crc = crc32c_le(0xffffffff, sb->set_uuid, 
sizeof(sb->set_uuid)); + crc = crc32c_le(crc, (void *)mb, META_BLOCK_SIZE); + mb->checksum = crc; + + if (lseek64(fd, __le64_to_cpu(sb->data_offset) * 512, 0) < 0LL) { + pr_err("cannot seek to offset of the meta block\n"); + goto fail_to_write; + } + + if (awrite(&afd, mb, META_BLOCK_SIZE) != META_BLOCK_SIZE) { + pr_err("failed to store write the meta block \n"); + goto fail_to_write; + } + fsync(fd); + + free(mb); + return 0; + +fail_to_write: + free(mb); + return 1; +} + +static int write_init_super1(struct supertype *st) +{ + struct mdp_superblock_1 *sb = st->sb; + struct supertype *refst; + int rv = 0; + unsigned long long bm_space; + struct devinfo *di; + unsigned long long dsize, array_size; + unsigned long long sb_offset; + unsigned long long data_offset; + long bm_offset; + int raid0_need_layout = 0; + + for (di = st->info; di; di = di->next) { + if (di->disk.state & (1 << MD_DISK_JOURNAL)) + sb->feature_map |= __cpu_to_le32(MD_FEATURE_JOURNAL); + if (sb->level == 0 && sb->layout != 0) { + struct devinfo *di2 = st->info; + unsigned long long s1, s2; + s1 = di->dev_size; + if (di->data_offset != INVALID_SECTORS) + s1 -= di->data_offset; + s1 /= __le32_to_cpu(sb->chunksize); + s2 = di2->dev_size; + if (di2->data_offset != INVALID_SECTORS) + s2 -= di2->data_offset; + s2 /= __le32_to_cpu(sb->chunksize); + if (s1 != s2) + raid0_need_layout = 1; + } + } + + for (di = st->info; di; di = di->next) { + if (di->disk.state & (1 << MD_DISK_FAULTY)) + continue; + if (di->fd < 0) + continue; + + while (Kill(di->devname, NULL, 0, -1, 1) == 0) + ; + + sb->dev_number = __cpu_to_le32(di->disk.number); + if (di->disk.state & (1<<MD_DISK_WRITEMOSTLY)) + sb->devflags |= WriteMostly1; + else + sb->devflags &= ~WriteMostly1; + if (di->disk.state & (1<<MD_DISK_FAILFAST)) + sb->devflags |= FailFast1; + else + sb->devflags &= ~FailFast1; + + random_uuid(sb->device_uuid); + + if (!(di->disk.state & (1<<MD_DISK_JOURNAL))) + sb->events = 0; + + refst = dup_super(st); + if 
(load_super1(refst, di->fd, NULL)==0) { + struct mdp_superblock_1 *refsb = refst->sb; + + memcpy(sb->device_uuid, refsb->device_uuid, 16); + if (memcmp(sb->set_uuid, refsb->set_uuid, 16)==0) { + /* same array, so preserve events and + * dev_number */ + sb->events = refsb->events; + /* bugs in 2.6.17 and earlier mean the + * dev_number chosen in Manage must be preserved + */ + if (get_linux_version() >= 2006018) + sb->dev_number = refsb->dev_number; + } + free_super1(refst); + } + free(refst); + + if (!get_dev_size(di->fd, NULL, &dsize)) { + rv = 1; + goto error_out; + } + dsize >>= 9; + + if (dsize < 24) { + close(di->fd); + rv = 2; + goto error_out; + } + + /* + * Calculate the position of the superblock. + * It is always aligned to a 4K boundary and + * depending on minor_version, it can be: + * 0: At least 8K, but less than 12K, from end of device + * 1: At start of device + * 2: 4K from start of device. + * data_offset has already been set. + */ + array_size = __le64_to_cpu(sb->size); + + /* work out how much space we left for a bitmap */ + if (sb->feature_map & __cpu_to_le32(MD_FEATURE_BITMAP_OFFSET)) { + bitmap_super_t *bms = (bitmap_super_t *) + (((char *)sb) + MAX_SB_SIZE); + bm_space = calc_bitmap_size(bms, 4096) >> 9; + bm_offset = (long)__le32_to_cpu(sb->bitmap_offset); + } else if (md_feature_any_ppl_on(sb->feature_map)) { + bm_space = MULTIPLE_PPL_AREA_SIZE_SUPER1 >> 9; + if (st->minor_version == 0) + bm_offset = -bm_space - 8; + else + bm_offset = 8; + sb->ppl.offset = __cpu_to_le16(bm_offset); + sb->ppl.size = __cpu_to_le16(bm_space); + } else { + bm_space = choose_bm_space(array_size); + bm_offset = 8; + } + + data_offset = di->data_offset; + if (data_offset == INVALID_SECTORS) + data_offset = st->data_offset; + switch(st->minor_version) { + case 0: + /* Add 8 sectors for bad block log */ + bm_space += 8; + if (data_offset == INVALID_SECTORS) + data_offset = 0; + sb_offset = dsize; + sb_offset -= 8*2; + sb_offset &= ~(4*2-1); + sb->data_offset = 
__cpu_to_le64(data_offset); + sb->super_offset = __cpu_to_le64(sb_offset); + if (sb_offset < array_size + bm_space) + bm_space = sb_offset - array_size; + sb->data_size = __cpu_to_le64(sb_offset - bm_space); + if (bm_space >= 8) { + sb->bblog_size = __cpu_to_le16(8); + sb->bblog_offset = __cpu_to_le32((unsigned)-8); + } + break; + case 1: + case 2: + sb_offset = st->minor_version == 2 ? 8 : 0; + sb->super_offset = __cpu_to_le64(sb_offset); + if (data_offset == INVALID_SECTORS) + data_offset = sb_offset + 16; + + sb->data_offset = __cpu_to_le64(data_offset); + sb->data_size = __cpu_to_le64(dsize - data_offset); + if (data_offset >= sb_offset+bm_offset+bm_space+8) { + sb->bblog_size = __cpu_to_le16(8); + sb->bblog_offset = __cpu_to_le32(bm_offset + + bm_space); + } else if (data_offset >= sb_offset + 16) { + sb->bblog_size = __cpu_to_le16(8); + /* '8' sectors for the bblog, and 'sb_offset' + * because we want offset from superblock, not + * start of device. + */ + sb->bblog_offset = __cpu_to_le32(data_offset - + 8 - sb_offset); + } + break; + default: + pr_err("Failed to write invalid metadata format 1.%i to %s\n", + st->minor_version, di->devname); + rv = -EINVAL; + goto out; + } + /* + * Disable badblock log on clusters, or when + * explicitly requested + */ + if (st->nodes > 0 || conf_get_create_info()->bblist == 0) { + sb->bblog_size = 0; + sb->bblog_offset = 0; + } + + /* RAID0 needs a layout if devices aren't all the same size */ + if (raid0_need_layout) + sb->feature_map |= __cpu_to_le32(MD_FEATURE_RAID0_LAYOUT); + + sb->sb_csum = calc_sb_1_csum(sb); + rv = store_super1(st, di->fd); + + if (rv == 0 && (di->disk.state & (1 << MD_DISK_JOURNAL))) { + rv = write_empty_r5l_meta_block(st, di->fd); + if (rv) + goto error_out; + } + + if (rv == 0 && + (__le32_to_cpu(sb->feature_map) & + MD_FEATURE_BITMAP_OFFSET)) { + rv = st->ss->write_bitmap(st, di->fd, NodeNumUpdate); + } else if (rv == 0 && + md_feature_any_ppl_on(sb->feature_map)) { + struct mdinfo info; + + 
st->ss->getinfo_super(st, &info, NULL); + rv = st->ss->write_init_ppl(st, &info, di->fd); + } + + close(di->fd); + di->fd = -1; + if (rv) + goto error_out; + } +error_out: + if (rv) + pr_err("Failed to write metadata to %s\n", di->devname); +out: + return rv; +} + +static int compare_super1(struct supertype *st, struct supertype *tst, + int verbose) +{ + /* + * return: + * 0 same, or first was empty, and second was copied + * 1 second had wrong number + * 2 wrong uuid + * 3 wrong other info + */ + struct mdp_superblock_1 *first = st->sb; + struct mdp_superblock_1 *second = tst->sb; + + if (second->magic != __cpu_to_le32(MD_SB_MAGIC)) + return 1; + if (second->major_version != __cpu_to_le32(1)) + return 1; + + if (!first) { + if (posix_memalign((void**)&first, 4096, SUPER1_SIZE) != 0) { + pr_err("could not allocate superblock\n"); + return 1; + } + memcpy(first, second, SUPER1_SIZE); + st->sb = first; + return 0; + } + if (memcmp(first->set_uuid, second->set_uuid, 16)!= 0) + return 2; + + if (first->ctime != second->ctime || + first->level != second->level || + first->layout != second->layout || + first->size != second->size || + first->chunksize != second->chunksize || + first->raid_disks != second->raid_disks) + return 3; + return 0; +} + +static int load_super1(struct supertype *st, int fd, char *devname) +{ + unsigned long long dsize; + unsigned long long sb_offset; + struct mdp_superblock_1 *super; + int uuid[4]; + struct bitmap_super_s *bsb; + struct misc_dev_info *misc; + struct align_fd afd; + + free_super1(st); + + init_afd(&afd, fd); + + if (st->ss == NULL || st->minor_version == -1) { + int bestvers = -1; + struct supertype tst; + __u64 bestctime = 0; + /* guess... 
choose latest ctime */ + memset(&tst, 0, sizeof(tst)); + tst.ss = &super1; + for (tst.minor_version = 0; tst.minor_version <= 2; + tst.minor_version++) { + switch(load_super1(&tst, fd, devname)) { + case 0: super = tst.sb; + if (bestvers == -1 || + bestctime < __le64_to_cpu(super->ctime)) { + bestvers = tst.minor_version; + bestctime = __le64_to_cpu(super->ctime); + } + free(super); + tst.sb = NULL; + break; + case 1: return 1; /*bad device */ + case 2: break; /* bad, try next */ + } + } + if (bestvers != -1) { + int rv; + tst.minor_version = bestvers; + tst.ss = &super1; + tst.max_devs = MAX_DEVS; + rv = load_super1(&tst, fd, devname); + if (rv == 0) + *st = tst; + return rv; + } + return 2; + } + if (!get_dev_size(fd, devname, &dsize)) + return 1; + dsize >>= 9; + + if (dsize < 24) { + if (devname) + pr_err("%s is too small for md: size is %llu sectors.\n", + devname, dsize); + return 1; + } + + /* + * Calculate the position of the superblock. + * It is always aligned to a 4K boundary and + * depending on minor_version, it can be: + * 0: At least 8K, but less than 12K, from end of device + * 1: At start of device + * 2: 4K from start of device. 
+ */ + switch(st->minor_version) { + case 0: + sb_offset = dsize; + sb_offset -= 8*2; + sb_offset &= ~(4*2-1); + break; + case 1: + sb_offset = 0; + break; + case 2: + sb_offset = 4*2; + break; + default: + return -EINVAL; + } + + if (lseek64(fd, sb_offset << 9, 0)< 0LL) { + if (devname) + pr_err("Cannot seek to superblock on %s: %s\n", + devname, strerror(errno)); + return 1; + } + + if (posix_memalign((void**)&super, 4096, SUPER1_SIZE) != 0) { + pr_err("could not allocate superblock\n"); + return 1; + } + + memset(super, 0, SUPER1_SIZE); + + if (aread(&afd, super, MAX_SB_SIZE) != MAX_SB_SIZE) { + if (devname) + pr_err("Cannot read superblock on %s\n", + devname); + free(super); + return 1; + } + + if (__le32_to_cpu(super->magic) != MD_SB_MAGIC) { + if (devname) + pr_err("No super block found on %s (Expected magic %08x, got %08x)\n", + devname, MD_SB_MAGIC, + __le32_to_cpu(super->magic)); + free(super); + return 2; + } + + if (__le32_to_cpu(super->major_version) != 1) { + if (devname) + pr_err("Cannot interpret superblock on %s - version is %d\n", + devname, __le32_to_cpu(super->major_version)); + free(super); + return 2; + } + if (__le64_to_cpu(super->super_offset) != sb_offset) { + if (devname) + pr_err("No superblock found on %s (super_offset is wrong)\n", + devname); + free(super); + return 2; + } + st->sb = super; + + bsb = (struct bitmap_super_s *)(((char*)super)+MAX_SB_SIZE); + + misc = (struct misc_dev_info*) + (((char*)super)+MAX_SB_SIZE+BM_SUPER_SIZE); + misc->device_size = dsize; + if (st->data_offset == INVALID_SECTORS) + st->data_offset = __le64_to_cpu(super->data_offset); + + /* Now check on the bitmap superblock */ + if ((__le32_to_cpu(super->feature_map)&MD_FEATURE_BITMAP_OFFSET) == 0) + return 0; + /* Read the bitmap superblock and make sure it looks + * valid. If it doesn't clear the bit. An --assemble --force + * should get that written out. 
+ */ + locate_bitmap1(st, fd, 0); + if (aread(&afd, bsb, 512) != 512) + goto no_bitmap; + + uuid_from_super1(st, uuid); + if (__le32_to_cpu(bsb->magic) != BITMAP_MAGIC || + memcmp(bsb->uuid, uuid, 16) != 0) + goto no_bitmap; + return 0; + + no_bitmap: + super->feature_map = __cpu_to_le32(__le32_to_cpu(super->feature_map) + & ~MD_FEATURE_BITMAP_OFFSET); + return 0; +} + +static struct supertype *match_metadata_desc1(char *arg) +{ + struct supertype *st = xcalloc(1, sizeof(*st)); + + st->container_devnm[0] = 0; + st->ss = &super1; + st->max_devs = MAX_DEVS; + st->sb = NULL; + st->data_offset = INVALID_SECTORS; + /* leading zeros can be safely ignored. --detail generates them. */ + while (*arg == '0') + arg++; + if (strcmp(arg, "1.0") == 0 || strcmp(arg, "1.00") == 0) { + st->minor_version = 0; + return st; + } + if (strcmp(arg, "1.1") == 0 || strcmp(arg, "1.01") == 0 + ) { + st->minor_version = 1; + return st; + } + if (strcmp(arg, "1.2") == 0 || +#ifndef DEFAULT_OLD_METADATA /* ifdef in super0.c */ + strcmp(arg, "default") == 0 || +#endif /* DEFAULT_OLD_METADATA */ + strcmp(arg, "1.02") == 0) { + st->minor_version = 2; + return st; + } + if (strcmp(arg, "1") == 0 || strcmp(arg, "default") == 0) { + st->minor_version = -1; + return st; + } + + free(st); + return NULL; +} + +/* find available size on device with this devsize, using + * superblock type st, and reserving 'reserve' sectors for + * a possible bitmap + */ +static __u64 avail_size1(struct supertype *st, __u64 devsize, + unsigned long long data_offset) +{ + struct mdp_superblock_1 *super = st->sb; + int bmspace = 0; + int bbspace = 0; + if (devsize < 24) + return 0; + + if (__le32_to_cpu(super->feature_map) & MD_FEATURE_BITMAP_OFFSET) { + /* hot-add. 
allow for actual size of bitmap */ + struct bitmap_super_s *bsb; + bsb = (struct bitmap_super_s *)(((char*)super)+MAX_SB_SIZE); + bmspace = calc_bitmap_size(bsb, 4096) >> 9; + } else if (md_feature_any_ppl_on(super->feature_map)) { + bmspace = __le16_to_cpu(super->ppl.size); + } + + /* Allow space for bad block log */ + if (super->bblog_size) + bbspace = __le16_to_cpu(super->bblog_size); + + if (st->minor_version < 0) + /* not specified, so time to set default */ + st->minor_version = 2; + + if (data_offset == INVALID_SECTORS) + data_offset = st->data_offset; + + if (data_offset != INVALID_SECTORS) + switch(st->minor_version) { + case 0: + return devsize - data_offset - 8*2 - bbspace; + case 1: + case 2: + return devsize - data_offset; + default: + return 0; + } + + devsize -= bmspace; + + switch(st->minor_version) { + case 0: + /* at end */ + return ((devsize - 8*2 - bbspace ) & ~(4*2-1)); + case 1: + /* at start, 4K for superblock and possible bitmap */ + return devsize - 4*2 - bbspace; + case 2: + /* 4k from start, 4K for superblock and possible bitmap */ + return devsize - (4+4)*2 - bbspace; + } + return 0; +} + +static int +add_internal_bitmap1(struct supertype *st, + int *chunkp, int delay, int write_behind, + unsigned long long size, + int may_change, int major) +{ + /* + * If not may_change, then this is a 'Grow' without sysfs support for + * bitmaps, and the bitmap must fit after the superblock at 1K offset. + * If may_change, then this is create or a Grow with sysfs support, + * and we can put the bitmap wherever we like. + * + * size is in sectors, chunk is in bytes !!! 
+ */ + + unsigned long long bits; + unsigned long long max_bits; + unsigned long long min_chunk; + long offset; + long bbl_offset, bbl_size; + unsigned long long chunk = *chunkp; + int room = 0; + int creating = 0; + int len; + struct mdp_superblock_1 *sb = st->sb; + bitmap_super_t *bms = (bitmap_super_t*)(((char*)sb) + MAX_SB_SIZE); + int uuid[4]; + + if (__le64_to_cpu(sb->data_size) == 0) + /* + * Must be creating the array, else data_size + * would be non-zero + */ + creating = 1; + switch(st->minor_version) { + case 0: + /* + * either 3K after the superblock (when hot-add), + * or some amount of space before. + */ + if (creating) { + /* + * We are creating array, so we *know* how much room has + * been left. + */ + offset = 0; + bbl_size = 8; + room = + choose_bm_space(__le64_to_cpu(sb->size)) + bbl_size; + } else { + room = __le64_to_cpu(sb->super_offset) + - __le64_to_cpu(sb->data_offset) + - __le64_to_cpu(sb->data_size); + bbl_size = __le16_to_cpu(sb->bblog_size); + if (bbl_size < 8) + bbl_size = 8; + bbl_offset = (__s32)__le32_to_cpu(sb->bblog_offset); + if (bbl_size < -bbl_offset) + bbl_size = -bbl_offset; + + if (!may_change || + (room < 3*2 && __le32_to_cpu(sb->max_dev) <= 384)) { + room = 3*2; + offset = 1*2; + bbl_size = 0; + } else { + offset = 0; /* means movable offset */ + } + } + break; + case 1: + case 2: /* between superblock and data */ + if (creating) { + offset = 4*2; + bbl_size = 8; + room = + choose_bm_space(__le64_to_cpu(sb->size)) + bbl_size; + } else { + room = __le64_to_cpu(sb->data_offset) + - __le64_to_cpu(sb->super_offset); + bbl_size = __le16_to_cpu(sb->bblog_size); + if (bbl_size) + room = + __le32_to_cpu(sb->bblog_offset) + bbl_size; + else + bbl_size = 8; + + if (!may_change) { + room -= 2; /* Leave 1K for superblock */ + offset = 2; + bbl_size = 0; + } else { + room -= 4*2; /* leave 4K for superblock */ + offset = 4*2; + } + } + break; + default: + return -ENOSPC; + } + + room -= bbl_size; + if (chunk == UnSet && room > 128*2) + 
/* Limit to 128K of bitmap when chunk size not requested */ + room = 128*2; + + if (room <= 1) + /* No room for a bitmap */ + return -ENOSPC; + + max_bits = (room * 512 - sizeof(bitmap_super_t)) * 8; + + min_chunk = 4096; /* sub-page chunks don't work yet.. */ + bits = (size*512)/min_chunk +1; + while (bits > max_bits) { + min_chunk *= 2; + bits = (bits+1)/2; + } + if (chunk == UnSet) { + /* For practical purpose, 64Meg is a good + * default chunk size for internal bitmaps. + */ + chunk = min_chunk; + if (chunk < 64*1024*1024) + chunk = 64*1024*1024; + } else if (chunk < min_chunk) + return -EINVAL; /* chunk size too small */ + if (chunk == 0) /* rounding problem */ + return -EINVAL; + + if (offset == 0) { + /* start bitmap on a 4K boundary with enough space for + * the bitmap + */ + bits = (size*512) / chunk + 1; + room = ((bits+7)/8 + sizeof(bitmap_super_t) +4095)/4096; + room *= 8; /* convert 4K blocks to sectors */ + offset = -room - bbl_size; + } + + sb->bitmap_offset = (int32_t)__cpu_to_le32(offset); + + sb->feature_map = __cpu_to_le32(__le32_to_cpu(sb->feature_map) | + MD_FEATURE_BITMAP_OFFSET); + memset(bms, 0, sizeof(*bms)); + bms->magic = __cpu_to_le32(BITMAP_MAGIC); + bms->version = __cpu_to_le32(major); + uuid_from_super1(st, uuid); + memcpy(bms->uuid, uuid, 16); + bms->chunksize = __cpu_to_le32(chunk); + bms->daemon_sleep = __cpu_to_le32(delay); + bms->sync_size = __cpu_to_le64(size); + bms->write_behind = __cpu_to_le32(write_behind); + bms->nodes = __cpu_to_le32(st->nodes); + if (st->nodes) + sb->feature_map = __cpu_to_le32(__le32_to_cpu(sb->feature_map) | + MD_FEATURE_BITMAP_VERSIONED); + if (st->cluster_name) { + len = sizeof(bms->cluster_name); + strncpy((char *)bms->cluster_name, st->cluster_name, len); + bms->cluster_name[len - 1] = '\0'; + } + + *chunkp = chunk; + return 0; +} + +static int locate_bitmap1(struct supertype *st, int fd, int node_num) +{ + unsigned long long offset, bm_sectors_per_node; + struct mdp_superblock_1 *sb; + 
bitmap_super_t *bms; + int mustfree = 0; + int ret; + + if (!st->sb) { + if (st->ss->load_super(st, fd, NULL)) + return -1; /* no error I hope... */ + mustfree = 1; + } + sb = st->sb; + + if ((__le32_to_cpu(sb->feature_map) & MD_FEATURE_BITMAP_OFFSET)) + ret = 0; + else + ret = -1; + + offset = __le64_to_cpu(sb->super_offset) + (int32_t)__le32_to_cpu(sb->bitmap_offset); + if (node_num) { + bms = (bitmap_super_t*)(((char*)sb)+MAX_SB_SIZE); + bm_sectors_per_node = calc_bitmap_size(bms, 4096) >> 9; + offset += bm_sectors_per_node * node_num; + } + if (mustfree) + free(sb); + lseek64(fd, offset<<9, 0); + return ret; +} + +static int write_bitmap1(struct supertype *st, int fd, enum bitmap_update update) +{ + struct mdp_superblock_1 *sb = st->sb; + bitmap_super_t *bms = (bitmap_super_t*)(((char*)sb)+MAX_SB_SIZE); + int rv = 0; + void *buf; + int towrite, n, len; + struct align_fd afd; + unsigned int i = 0; + unsigned long long total_bm_space, bm_space_per_node; + + switch (update) { + case NameUpdate: + /* update cluster name */ + if (st->cluster_name) { + len = sizeof(bms->cluster_name); + memset((char *)bms->cluster_name, 0, len); + strncpy((char *)bms->cluster_name, + st->cluster_name, len); + bms->cluster_name[len - 1] = '\0'; + } + break; + case NodeNumUpdate: + /* cluster md only supports superblock 1.2 now */ + if (st->minor_version != 2 && + bms->version == BITMAP_MAJOR_CLUSTERED) { + pr_err("Warning: cluster md only works with superblock 1.2\n"); + return -EINVAL; + } + + if (bms->version == BITMAP_MAJOR_CLUSTERED) { + if (__cpu_to_le32(st->nodes) < bms->nodes) { + /* + * Since the nodes num is not increased, no + * need to check the space enough or not, + * just update bms->nodes + */ + bms->nodes = __cpu_to_le32(st->nodes); + break; + } + } else { + /* + * no need to change bms->nodes for other + * bitmap types + */ + if (st->nodes) + pr_err("Warning: --nodes option is only suitable for clustered bitmap\n"); + break; + } + + /* + * Each node has an independent 
bitmap, it is necessary to + * calculate the space is enough or not, first get how many + * bytes for the total bitmap + */ + bm_space_per_node = calc_bitmap_size(bms, 4096); + + total_bm_space = 512 * (__le64_to_cpu(sb->data_offset) - + __le64_to_cpu(sb->super_offset)); + /* leave another 4k for superblock */ + total_bm_space = total_bm_space - 4096; + + if (bm_space_per_node * st->nodes > total_bm_space) { + pr_err("Warning: The max num of nodes can't exceed %llu\n", + total_bm_space / bm_space_per_node); + return -ENOMEM; + } + + bms->nodes = __cpu_to_le32(st->nodes); + break; + case NoUpdate: + default: + break; + } + + init_afd(&afd, fd); + + if (locate_bitmap1(st, fd, 0) < 0) { + pr_err("Error: Invalid bitmap\n"); + return -EINVAL; + } + + if (posix_memalign(&buf, 4096, 4096)) + return -ENOMEM; + + do { + /* Only the bitmap[0] should resync + * whole device on initial assembly + */ + if (i) + memset(buf, 0x00, 4096); + else + memset(buf, 0xff, 4096); + memcpy(buf, (char *)bms, sizeof(bitmap_super_t)); + + /* + * use 4096 boundary if bitmap_offset is aligned + * with 8 sectors, then it should compatible with + * older mdadm. 
+ */ + if (__le32_to_cpu(sb->bitmap_offset) & 7) + towrite = calc_bitmap_size(bms, 512); + else + towrite = calc_bitmap_size(bms, 4096); + while (towrite > 0) { + n = towrite; + if (n > 4096) + n = 4096; + n = awrite(&afd, buf, n); + if (n > 0) + towrite -= n; + else + break; + if (i) + memset(buf, 0x00, 4096); + else + memset(buf, 0xff, 4096); + } + fsync(fd); + if (towrite) { + rv = -2; + break; + } + } while (++i < __le32_to_cpu(bms->nodes)); + + free(buf); + return rv; +} + +static void free_super1(struct supertype *st) +{ + + if (st->sb) + free(st->sb); + while (st->info) { + struct devinfo *di = st->info; + st->info = di->next; + if (di->fd >= 0) + close(di->fd); + free(di); + } + st->sb = NULL; +} + +static int validate_geometry1(struct supertype *st, int level, + int layout, int raiddisks, + int *chunk, unsigned long long size, + unsigned long long data_offset, + char *subdev, unsigned long long *freesize, + int consistency_policy, int verbose) +{ + unsigned long long ldsize, devsize; + int bmspace; + unsigned long long headroom; + unsigned long long overhead; + int fd; + + if (level == LEVEL_CONTAINER) { + if (verbose) + pr_err("1.x metadata does not support containers\n"); + return 0; + } + if (*chunk == UnSet) + *chunk = DEFAULT_CHUNK; + + if (!subdev) + return 1; + + if (st->minor_version < 0) + /* not specified, so time to set default */ + st->minor_version = 2; + + fd = open(subdev, O_RDONLY|O_EXCL, 0); + if (fd < 0) { + if (verbose) + pr_err("super1.x cannot open %s: %s\n", + subdev, strerror(errno)); + return 0; + } + + if (!get_dev_size(fd, subdev, &ldsize)) { + close(fd); + return 0; + } + close(fd); + + devsize = ldsize >> 9; + + /* creating: allow suitable space for bitmap or PPL */ + if (consistency_policy == CONSISTENCY_POLICY_PPL) + bmspace = MULTIPLE_PPL_AREA_SIZE_SUPER1 >> 9; + else + bmspace = choose_bm_space(devsize); + + if (data_offset == INVALID_SECTORS) + data_offset = st->data_offset; + if (data_offset == INVALID_SECTORS) + switch 
(st->minor_version) { + case 0: + data_offset = 0; + break; + case 1: + case 2: + /* Choose data offset appropriate for this device + * and use as default for whole array. + * The data_offset must allow for bitmap space + * and base metadata, should allow for some headroom + * for reshape, and should be rounded to multiple + * of 1M. + * Headroom is limited to 128M, but aim for about 0.1% + */ + headroom = 128*1024*2; + while ((headroom << 10) > devsize && + (*chunk == 0 || + headroom / 2 >= ((unsigned)(*chunk)*2)*2)) + headroom >>= 1; + data_offset = 12*2 + bmspace + headroom; + #define ONE_MEG (2*1024) + data_offset = ROUND_UP(data_offset, ONE_MEG); + break; + } + if (st->data_offset == INVALID_SECTORS) + st->data_offset = data_offset; + switch(st->minor_version) { + case 0: /* metadata at end. Round down and subtract space to reserve */ + devsize = (devsize & ~(4ULL*2-1)); + /* space for metadata, bblog, bitmap/ppl */ + overhead = 8*2 + 8 + bmspace; + if (devsize < overhead) /* detect underflow */ + goto dev_too_small_err; + devsize -= overhead; + break; + case 1: + case 2: + if (devsize < data_offset) /* detect underflow */ + goto dev_too_small_err; + devsize -= data_offset; + break; + } + *freesize = devsize; + return 1; + +/* Error condition, device cannot even hold the overhead. 
*/ +dev_too_small_err: + fprintf(stderr, "device %s is too small (%lluK) for " + "required metadata!\n", subdev, devsize>>1); + *freesize = 0; + return 0; +} + +void *super1_make_v0(struct supertype *st, struct mdinfo *info, mdp_super_t *sb0) +{ + /* Create a v1.0 superblock based on 'info'*/ + void *ret; + struct mdp_superblock_1 *sb; + int i; + unsigned long long offset; + + if (posix_memalign(&ret, 4096, 1024) != 0) + return NULL; + sb = ret; + memset(ret, 0, 1024); + sb->magic = __cpu_to_le32(MD_SB_MAGIC); + sb->major_version = __cpu_to_le32(1); + + copy_uuid(sb->set_uuid, info->uuid, super1.swapuuid); + sprintf(sb->set_name, "%d", sb0->md_minor); + /* ctime is read back with __le64_to_cpu(), so store it as 64-bit LE */ + sb->ctime = __cpu_to_le64(info->array.ctime+1); + sb->level = __cpu_to_le32(info->array.level); + sb->layout = __cpu_to_le32(info->array.layout); + sb->size = __cpu_to_le64(info->component_size); + sb->chunksize = __cpu_to_le32(info->array.chunk_size/512); + sb->raid_disks = __cpu_to_le32(info->array.raid_disks); + if (info->array.level > 0) + sb->data_size = sb->size; + else + /* avail_size() returns a host-order value; the field is little-endian */ + sb->data_size = __cpu_to_le64(st->ss->avail_size(st, st->devsize/512, 0)); + sb->resync_offset = MaxSector; + sb->max_dev = __cpu_to_le32(MD_SB_DISKS); + sb->dev_number = __cpu_to_le32(info->disk.number); + sb->utime = __cpu_to_le64(info->array.utime); + + offset = st->devsize/512 - 8*2; + offset &= ~(4*2-1); + sb->super_offset = __cpu_to_le64(offset); + //*(__u64*)(st->other + 128 + 8 + 8) = __cpu_to_le64(offset); + + random_uuid(sb->device_uuid); + + for (i = 0; i < MD_SB_DISKS; i++) { + int state = sb0->disks[i].state; + sb->dev_roles[i] = MD_DISK_ROLE_SPARE; + if ((state & (1<<MD_DISK_SYNC)) && + !(state & (1<<MD_DISK_FAULTY))) + sb->dev_roles[i] = __cpu_to_le16(sb0->disks[i].raid_disk); + } + sb->sb_csum = calc_sb_1_csum(sb); + return ret; +} + +struct superswitch super1 = { + .examine_super = examine_super1, + .brief_examine_super = brief_examine_super1, + .export_examine_super = export_examine_super1, + .detail_super = detail_super1, + 
.brief_detail_super = brief_detail_super1, + .export_detail_super = export_detail_super1, + .write_init_super = write_init_super1, + .validate_geometry = validate_geometry1, + .add_to_super = add_to_super1, + .examine_badblocks = examine_badblocks_super1, + .copy_metadata = copy_metadata1, + .write_init_ppl = write_init_ppl1, + .match_home = match_home1, + .uuid_from_super = uuid_from_super1, + .getinfo_super = getinfo_super1, + .container_content = container_content1, + .update_super = update_super1, + .init_super = init_super1, + .store_super = store_super1, + .compare_super = compare_super1, + .load_super = load_super1, + .match_metadata_desc = match_metadata_desc1, + .avail_size = avail_size1, + .add_internal_bitmap = add_internal_bitmap1, + .locate_bitmap = locate_bitmap1, + .write_bitmap = write_bitmap1, + .free_super = free_super1, +#if __BYTE_ORDER == BIG_ENDIAN + .swapuuid = 0, +#else + .swapuuid = 1, +#endif + .name = "1.x", +}; diff --git a/swap_super.c b/swap_super.c new file mode 100644 index 0000000..b6db574 --- /dev/null +++ b/swap_super.c @@ -0,0 +1,81 @@ +#include <unistd.h> +#include <stdlib.h> +#include <fcntl.h> +#include <stdio.h> +#include <sys/mount.h> +/* + * This is a tiny test program to endian-swap + * the superblock on a given device. + * We simply read 4k from where the superblock should be + * do the swap, and write it back + * Don't use this on a real array, use mdadm. 
+ */ + +#define MD_RESERVED_BYTES (64 * 1024) +#define MD_RESERVED_SECTORS (MD_RESERVED_BYTES / 512) + +#define MD_NEW_SIZE_SECTORS(x) ((x & ~(MD_RESERVED_SECTORS - 1)) - MD_RESERVED_SECTORS) + +extern long long lseek64(int, long long, int); + +int main(int argc, char *argv[]) +{ + int fd, i; + unsigned long size; + unsigned long long offset; + char super[4096]; + if (argc != 2) { + fprintf(stderr, "Usage: swap_super device\n"); + exit(1); + } + fd = open(argv[1], O_RDWR); + if (fd<0) { + perror(argv[1]); + exit(1); + } + if (ioctl(fd, BLKGETSIZE, &size)) { + perror("BLKGETSIZE"); + exit(1); + } + offset = MD_NEW_SIZE_SECTORS(size) * 512LL; + if (lseek64(fd, offset, 0) < 0LL) { + perror("lseek64"); + exit(1); + } + if (read(fd, super, 4096) != 4096) { + perror("read"); + exit(1); + } + + for (i=0; i < 4096 ; i+=4) { + char t = super[i]; + super[i] = super[i+3]; + super[i+3] = t; + t=super[i+1]; + super[i+1]=super[i+2]; + super[i+2]=t; + } + /* swap the u64 events counters */ + for (i=0; i<4; i++) { + /* events_hi and events_lo */ + char t=super[32*4+7*4 +i]; + super[32*4+7*4 +i] = super[32*4+8*4 +i]; + super[32*4+8*4 +i] = t; + + /* cp_events_hi and cp_events_lo */ + t=super[32*4+9*4 +i]; + super[32*4+9*4 +i] = super[32*4+10*4 +i]; + super[32*4+10*4 +i] = t; + } + + if (lseek64(fd, offset, 0) < 0LL) { + perror("lseek64"); + exit(1); + } + if (write(fd, super, 4096) != 4096) { + perror("write"); + exit(1); + } + exit(0); + +} @@ -0,0 +1,1167 @@ +/* + * sysfs - extract md related information from sysfs. Part of: + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2006-2009 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +#include "mdadm.h" +#include <dirent.h> +#include <ctype.h> +#include "dlink.h" + +#define MAX_SYSFS_PATH_LEN 120 + +struct dev_sysfs_rule { + struct dev_sysfs_rule *next; + char *devname; + int uuid[4]; + int uuid_set; + struct sysfs_entry { + struct sysfs_entry *next; + char *name; + char *value; + } *entry; +}; + +int load_sys(char *path, char *buf, int len) +{ + int fd = open(path, O_RDONLY); + int n; + if (fd < 0) + return -1; + n = read(fd, buf, len); + close(fd); + if (n <0 || n >= len) + return -1; + buf[n] = 0; + if (n && buf[n-1] == '\n') + buf[n-1] = 0; + return 0; +} + +void sysfs_free(struct mdinfo *sra) +{ + while (sra) { + struct mdinfo *sra2 = sra->next; + while (sra->devs) { + struct mdinfo *d = sra->devs; + sra->devs = d->next; + free(d->bb.entries); + free(d); + } + free(sra->bb.entries); + free(sra); + sra = sra2; + } +} + +int sysfs_open(char *devnm, char *devname, char *attr) +{ + /* Open /sys/block/<devnm>/md/[<devname>/]<attr>, read-write if + * possible, falling back to read-only on EACCES. + * Returns the file descriptor, or a negative value on failure. + */ + char fname[MAX_SYSFS_PATH_LEN]; + int fd; + + snprintf(fname, MAX_SYSFS_PATH_LEN, "/sys/block/%s/md/", devnm); + /* strncat() appends up to n bytes and then a NUL, so one byte + * must be held back for the terminator. + */ + if (devname) { + strncat(fname, devname, MAX_SYSFS_PATH_LEN - strlen(fname) - 1); + strncat(fname, "/", MAX_SYSFS_PATH_LEN - strlen(fname) - 1); + } + strncat(fname, attr, MAX_SYSFS_PATH_LEN - strlen(fname) - 1); + fd = open(fname, O_RDWR); + if (fd < 0 && errno == EACCES) + fd = open(fname, O_RDONLY); + return fd; +} + +void sysfs_init_dev(struct mdinfo *mdi, dev_t devid) +{ + snprintf(mdi->sys_name, + sizeof(mdi->sys_name), "dev-%s", 
devid2kname(devid)); +} + +int sysfs_init(struct mdinfo *mdi, int fd, char *devnm) +{ + struct stat stb; + char fname[MAX_SYSFS_PATH_LEN]; + int retval = -ENODEV; + + mdi->sys_name[0] = 0; + if (fd >= 0) + devnm = fd2devnm(fd); + + if (devnm == NULL) + goto out; + + snprintf(fname, MAX_SYSFS_PATH_LEN, "/sys/block/%s/md", devnm); + + if (stat(fname, &stb)) + goto out; + if (!S_ISDIR(stb.st_mode)) + goto out; + strcpy(mdi->sys_name, devnm); + + retval = 0; +out: + return retval; +} + +struct mdinfo *sysfs_read(int fd, char *devnm, unsigned long options) +{ + char fname[PATH_MAX]; + char buf[PATH_MAX]; + char *base; + char *dbase; + struct mdinfo *sra; + struct mdinfo *dev, **devp; + DIR *dir = NULL; + struct dirent *de; + + sra = xcalloc(1, sizeof(*sra)); + if (sysfs_init(sra, fd, devnm)) { + free(sra); + return NULL; + } + + sprintf(fname, "/sys/block/%s/md/", sra->sys_name); + base = fname + strlen(fname); + + sra->devs = NULL; + if (options & GET_VERSION) { + strcpy(base, "metadata_version"); + if (load_sys(fname, buf, sizeof(buf))) + goto abort; + if (strncmp(buf, "none", 4) == 0) { + sra->array.major_version = + sra->array.minor_version = -1; + strcpy(sra->text_version, ""); + } else if (strncmp(buf, "external:", 9) == 0) { + sra->array.major_version = -1; + sra->array.minor_version = -2; + strcpy(sra->text_version, buf+9); + } else { + sscanf(buf, "%d.%d", + &sra->array.major_version, + &sra->array.minor_version); + strcpy(sra->text_version, buf); + } + } + if (options & GET_LEVEL) { + strcpy(base, "level"); + if (load_sys(fname, buf, sizeof(buf))) + goto abort; + sra->array.level = map_name(pers, buf); + } + if (options & GET_LAYOUT) { + strcpy(base, "layout"); + if (load_sys(fname, buf, sizeof(buf))) + goto abort; + sra->array.layout = strtoul(buf, NULL, 0); + } + if (options & (GET_DISKS|GET_STATE)) { + strcpy(base, "raid_disks"); + if (load_sys(fname, buf, sizeof(buf))) + goto abort; + sra->array.raid_disks = strtoul(buf, NULL, 0); + } + if (options & 
GET_COMPONENT) { + strcpy(base, "component_size"); + if (load_sys(fname, buf, sizeof(buf))) + goto abort; + sra->component_size = strtoull(buf, NULL, 0); + /* sysfs reports "K", but we want sectors */ + sra->component_size *= 2; + } + if (options & GET_CHUNK) { + strcpy(base, "chunk_size"); + if (load_sys(fname, buf, sizeof(buf))) + goto abort; + sra->array.chunk_size = strtoul(buf, NULL, 0); + } + if (options & GET_CACHE) { + strcpy(base, "stripe_cache_size"); + if (load_sys(fname, buf, sizeof(buf))) + /* Probably level doesn't support it */ + sra->cache_size = 0; + else + sra->cache_size = strtoul(buf, NULL, 0); + } + if (options & GET_MISMATCH) { + strcpy(base, "mismatch_cnt"); + if (load_sys(fname, buf, sizeof(buf))) + goto abort; + sra->mismatch_cnt = strtoul(buf, NULL, 0); + } + if (options & GET_SAFEMODE) { + int scale = 1; + int dot = 0; + unsigned i; + unsigned long msec; + size_t len; + + strcpy(base, "safe_mode_delay"); + if (load_sys(fname, buf, sizeof(buf))) + goto abort; + + /* remove a period, and count digits after it */ + len = strlen(buf); + for (i = 0; i < len; i++) { + if (dot) { + if (isdigit(buf[i])) { + buf[i-1] = buf[i]; + scale *= 10; + } + buf[i] = 0; + } else if (buf[i] == '.') { + dot=1; + buf[i] = 0; + } + } + msec = strtoul(buf, NULL, 10); + msec = (msec * 1000) / scale; + sra->safe_mode_delay = msec; + } + if (options & GET_BITMAP_LOCATION) { + strcpy(base, "bitmap/location"); + if (load_sys(fname, buf, sizeof(buf))) + goto abort; + if (strncmp(buf, "file", 4) == 0) + sra->bitmap_offset = 1; + else if (strncmp(buf, "none", 4) == 0) + sra->bitmap_offset = 0; + else if (buf[0] == '+') + sra->bitmap_offset = strtol(buf+1, NULL, 10); + else + goto abort; + } + + if (options & GET_ARRAY_STATE) { + strcpy(base, "array_state"); + if (load_sys(fname, buf, sizeof(buf))) + goto abort; + sra->array_state = map_name(sysfs_array_states, buf); + } + + if (options & GET_CONSISTENCY_POLICY) { + strcpy(base, "consistency_policy"); + if 
(load_sys(fname, buf, sizeof(buf))) + sra->consistency_policy = CONSISTENCY_POLICY_UNKNOWN; + else + sra->consistency_policy = map_name(consistency_policies, + buf); + } + + if (! (options & GET_DEVS)) + return sra; + + /* Get all the devices as well */ + *base = 0; + dir = opendir(fname); + if (!dir) + goto abort; + sra->array.spare_disks = 0; + sra->array.active_disks = 0; + sra->array.failed_disks = 0; + sra->array.working_disks = 0; + + devp = &sra->devs; + sra->devs = NULL; + while ((de = readdir(dir)) != NULL) { + char *ep; + if (de->d_ino == 0 || + strncmp(de->d_name, "dev-", 4) != 0) + continue; + strcpy(base, de->d_name); + dbase = base + strlen(base); + *dbase++ = '/'; + + dev = xcalloc(1, sizeof(*dev)); + + /* Always get slot, major, minor */ + strcpy(dbase, "slot"); + if (load_sys(fname, buf, sizeof(buf))) { + /* hmm... unable to read 'slot' maybe the device + * is going away? + */ + strcpy(dbase, "block"); + if (readlink(fname, buf, sizeof(buf)) < 0 && + errno != ENAMETOOLONG) { + /* ...yup device is gone */ + free(dev); + continue; + } else { + /* slot is unreadable but 'block' link + * still intact... 
something bad is happening + * so abort + */ + free(dev); + goto abort; + } + + } + strcpy(dev->sys_name, de->d_name); + dev->disk.raid_disk = strtoul(buf, &ep, 10); + if (*ep) dev->disk.raid_disk = -1; + + sra->array.nr_disks++; + strcpy(dbase, "block/dev"); + if (load_sys(fname, buf, sizeof(buf))) { + /* assume this is a stale reference to a hot + * removed device + */ + if (!(options & GET_DEVS_ALL)) { + free(dev); + continue; + } + } else { + sscanf(buf, "%d:%d", &dev->disk.major, &dev->disk.minor); + } + + if (!(options & GET_DEVS_ALL)) { + /* special case check for block devices that can go 'offline' */ + strcpy(dbase, "block/device/state"); + if (load_sys(fname, buf, sizeof(buf)) == 0 && + strncmp(buf, "offline", 7) == 0) { + free(dev); + continue; + } + } + + /* finally add this disk to the array */ + *devp = dev; + devp = & dev->next; + dev->next = NULL; + + if (options & GET_OFFSET) { + strcpy(dbase, "offset"); + if (load_sys(fname, buf, sizeof(buf))) + goto abort; + dev->data_offset = strtoull(buf, NULL, 0); + strcpy(dbase, "new_offset"); + if (load_sys(fname, buf, sizeof(buf)) == 0) + dev->new_data_offset = strtoull(buf, NULL, 0); + else + dev->new_data_offset = dev->data_offset; + } + if (options & GET_SIZE) { + strcpy(dbase, "size"); + if (load_sys(fname, buf, sizeof(buf))) + goto abort; + dev->component_size = strtoull(buf, NULL, 0) * 2; + } + if (options & GET_STATE) { + dev->disk.state = 0; + strcpy(dbase, "state"); + if (load_sys(fname, buf, sizeof(buf))) + goto abort; + if (strstr(buf, "faulty")) + dev->disk.state |= (1<<MD_DISK_FAULTY); + else { + sra->array.working_disks++; + if (strstr(buf, "in_sync")) { + dev->disk.state |= (1<<MD_DISK_SYNC); + sra->array.active_disks++; + } + if (dev->disk.state == 0) + sra->array.spare_disks++; + } + } + if (options & GET_ERROR) { + /* the attribute name goes into the path (dbase), not into + * the read buffer, so that load_sys() reads 'errors' + */ + strcpy(dbase, "errors"); + if (load_sys(fname, buf, sizeof(buf))) + goto abort; + dev->errors = strtoul(buf, NULL, 0); + } + } + + if ((options & GET_STATE) && 
sra->array.raid_disks) + sra->array.failed_disks = sra->array.raid_disks - + sra->array.active_disks - sra->array.spare_disks; + + closedir(dir); + return sra; + + abort: + if (dir) + closedir(dir); + sysfs_free(sra); + return NULL; +} + +int sysfs_attr_match(const char *attr, const char *str) +{ + /* See if attr, read from a sysfs file, matches + * str. They must either be the same, or attr can + * have a trailing newline or comma + */ + while (*attr && *str && *attr == *str) { + attr++; + str++; + } + + if (*str || (*attr && *attr != ',' && *attr != '\n')) + return 0; + return 1; +} + +int sysfs_match_word(const char *word, char **list) +{ + int n; + for (n=0; list[n]; n++) + if (sysfs_attr_match(word, list[n])) + break; + return n; +} + +unsigned long long get_component_size(int fd) +{ + /* Find out the component size of the array. + * We cannot trust GET_ARRAY_INFO ioctl as it's + * size field is only 32bits. + * So look in /sys/block/mdXXX/md/component_size + * + * This returns in units of sectors. 
+ */ + struct stat stb; + char fname[MAX_SYSFS_PATH_LEN]; + int n; + if (fstat(fd, &stb)) + return 0; + snprintf(fname, MAX_SYSFS_PATH_LEN, + "/sys/block/%s/md/component_size", stat2devnm(&stb)); + fd = open(fname, O_RDONLY); + if (fd < 0) + return 0; + n = read(fd, fname, sizeof(fname)); + close(fd); + if (n < 0 || n == sizeof(fname)) + return 0; + fname[n] = 0; + return strtoull(fname, NULL, 10) * 2; +} + +int sysfs_set_str(struct mdinfo *sra, struct mdinfo *dev, + char *name, char *val) +{ + char fname[MAX_SYSFS_PATH_LEN]; + unsigned int n; + int fd; + + snprintf(fname, MAX_SYSFS_PATH_LEN, "/sys/block/%s/md/%s/%s", + sra->sys_name, dev?dev->sys_name:"", name); + fd = open(fname, O_WRONLY); + if (fd < 0) + return -1; + n = write(fd, val, strlen(val)); + close(fd); + if (n != strlen(val)) { + dprintf("failed to write '%s' to '%s' (%s)\n", + val, fname, strerror(errno)); + return -1; + } + return 0; +} + +int sysfs_set_num(struct mdinfo *sra, struct mdinfo *dev, + char *name, unsigned long long val) +{ + char valstr[50]; + sprintf(valstr, "%llu", val); + return sysfs_set_str(sra, dev, name, valstr); +} + +int sysfs_set_num_signed(struct mdinfo *sra, struct mdinfo *dev, + char *name, long long val) +{ + char valstr[50]; + sprintf(valstr, "%lli", val); + return sysfs_set_str(sra, dev, name, valstr); +} + +int sysfs_uevent(struct mdinfo *sra, char *event) +{ + char fname[MAX_SYSFS_PATH_LEN]; + int n; + int fd; + + snprintf(fname, MAX_SYSFS_PATH_LEN, "/sys/block/%s/uevent", + sra->sys_name); + fd = open(fname, O_WRONLY); + if (fd < 0) + return -1; + n = write(fd, event, strlen(event)); + close(fd); + if (n != (int)strlen(event)) { + dprintf("failed to write '%s' to '%s' (%s)\n", + event, fname, strerror(errno)); + return -1; + } + return 0; +} + +int sysfs_attribute_available(struct mdinfo *sra, struct mdinfo *dev, char *name) +{ + char fname[MAX_SYSFS_PATH_LEN]; + struct stat st; + + snprintf(fname, MAX_SYSFS_PATH_LEN, "/sys/block/%s/md/%s/%s", + sra->sys_name, 
dev?dev->sys_name:"", name); + + return stat(fname, &st) == 0; +} + +int sysfs_get_fd(struct mdinfo *sra, struct mdinfo *dev, + char *name) +{ + char fname[MAX_SYSFS_PATH_LEN]; + int fd; + + snprintf(fname, MAX_SYSFS_PATH_LEN, "/sys/block/%s/md/%s/%s", + sra->sys_name, dev?dev->sys_name:"", name); + fd = open(fname, O_RDWR); + if (fd < 0) + fd = open(fname, O_RDONLY); + return fd; +} + +int sysfs_fd_get_ll(int fd, unsigned long long *val) +{ + char buf[50]; + int n; + char *ep; + + lseek(fd, 0, 0); + n = read(fd, buf, sizeof(buf)); + if (n <= 0 || n == sizeof(buf)) + return -2; + buf[n] = 0; + *val = strtoull(buf, &ep, 0); + if (ep == buf || (*ep != 0 && *ep != '\n' && *ep != ' ')) + return -1; + return 0; +} + +int sysfs_get_ll(struct mdinfo *sra, struct mdinfo *dev, + char *name, unsigned long long *val) +{ + int n; + int fd; + + fd = sysfs_get_fd(sra, dev, name); + if (fd < 0) + return -1; + n = sysfs_fd_get_ll(fd, val); + close(fd); + return n; +} + +int sysfs_fd_get_two(int fd, unsigned long long *v1, unsigned long long *v2) +{ + /* two numbers in this sysfs file, either + * NNN (NNN) + * or + * NNN / NNN + */ + char buf[80]; + int n; + char *ep, *ep2; + + lseek(fd, 0, 0); + n = read(fd, buf, sizeof(buf)); + if (n <= 0 || n == sizeof(buf)) + return -2; + buf[n] = 0; + *v1 = strtoull(buf, &ep, 0); + if (ep == buf || (*ep != 0 && *ep != '\n' && *ep != ' ')) + return -1; + while (*ep == ' ' || *ep == '/' || *ep == '(') + ep++; + *v2 = strtoull(ep, &ep2, 0); + if (ep2 == ep || (*ep2 != 0 && *ep2 != '\n' && *ep2 != ' ' && *ep2 != ')')) { + *v2 = *v1; + return 1; + } + return 2; +} + +int sysfs_get_two(struct mdinfo *sra, struct mdinfo *dev, + char *name, unsigned long long *v1, unsigned long long *v2) +{ + int n; + int fd; + + fd = sysfs_get_fd(sra, dev, name); + if (fd < 0) + return -1; + n = sysfs_fd_get_two(fd, v1, v2); + close(fd); + return n; +} + +int sysfs_fd_get_str(int fd, char *val, int size) +{ + int n; + + lseek(fd, 0, 0); + n = read(fd, val, size); + 
if (n <= 0 || n == size) + return -1; + val[n] = 0; + return n; +} + +int sysfs_get_str(struct mdinfo *sra, struct mdinfo *dev, + char *name, char *val, int size) +{ + int n; + int fd; + + fd = sysfs_get_fd(sra, dev, name); + if (fd < 0) + return -1; + n = sysfs_fd_get_str(fd, val, size); + close(fd); + return n; +} + +int sysfs_set_safemode(struct mdinfo *sra, unsigned long ms) +{ + unsigned long sec; + unsigned long msec; + char delay[30]; + + sec = ms / 1000; + msec = ms % 1000; + + sprintf(delay, "%ld.%03ld\n", sec, msec); + /* this '\n' ^ needed for kernels older than 2.6.28 */ + return sysfs_set_str(sra, NULL, "safe_mode_delay", delay); +} + +int sysfs_set_array(struct mdinfo *info, int vers) +{ + int rv = 0; + char ver[100]; + int raid_disks = info->array.raid_disks; + + ver[0] = 0; + if (info->array.major_version == -1 && + info->array.minor_version == -2) { + char buf[1024]; + + strcat(strcpy(ver, "external:"), info->text_version); + + /* meta version might already be set if we are setting + * new geometry for a reshape. In that case we don't + * want to over-write the 'readonly' flag that is + * stored in the metadata version. 
So read the current + * version first, and preserve the flag + */ + if (sysfs_get_str(info, NULL, "metadata_version", + buf, 1024) > 0) + if (strlen(buf) >= 9 && buf[9] == '-') + ver[9] = '-'; + + if ((vers % 100) < 2 || + sysfs_set_str(info, NULL, "metadata_version", + ver) < 0) { + pr_err("This kernel does not support external metadata.\n"); + return 1; + } + } + if (info->array.level < 0) + return 0; /* FIXME */ + rv |= sysfs_set_str(info, NULL, "level", + map_num(pers, info->array.level)); + if (info->reshape_active && info->delta_disks != UnSet) + raid_disks -= info->delta_disks; + rv |= sysfs_set_num(info, NULL, "raid_disks", raid_disks); + rv |= sysfs_set_num(info, NULL, "chunk_size", info->array.chunk_size); + rv |= sysfs_set_num(info, NULL, "layout", info->array.layout); + rv |= sysfs_set_num(info, NULL, "component_size", info->component_size/2); + if (info->custom_array_size) { + int rc; + + rc = sysfs_set_num(info, NULL, "array_size", + info->custom_array_size/2); + if (rc && errno == ENOENT) { + pr_err("This kernel does not have the md/array_size attribute, the array may be larger than expected\n"); + rc = 0; + } + rv |= rc; + } + + if (info->array.level > 0) + rv |= sysfs_set_num(info, NULL, "resync_start", info->resync_start); + + if (info->reshape_active) { + rv |= sysfs_set_num(info, NULL, "reshape_position", + info->reshape_progress); + rv |= sysfs_set_num(info, NULL, "chunk_size", info->new_chunk); + rv |= sysfs_set_num(info, NULL, "layout", info->new_layout); + rv |= sysfs_set_num(info, NULL, "raid_disks", + info->array.raid_disks); + /* We don't set 'new_level' here. That can only happen + * once the reshape completes. + */ + } + + if (info->consistency_policy == CONSISTENCY_POLICY_PPL) { + if (sysfs_set_str(info, NULL, "consistency_policy", + map_num(consistency_policies, + info->consistency_policy))) { + pr_err("This kernel does not support PPL. 
Falling back to consistency-policy=resync.\n"); + info->consistency_policy = CONSISTENCY_POLICY_RESYNC; + } + } + + return rv; +} + +int sysfs_add_disk(struct mdinfo *sra, struct mdinfo *sd, int resume) +{ + char dv[PATH_MAX]; + char nm[PATH_MAX]; + char *dname; + int rv; + int i; + + sprintf(dv, "%d:%d", sd->disk.major, sd->disk.minor); + rv = sysfs_set_str(sra, NULL, "new_dev", dv); + if (rv) + return rv; + + memset(nm, 0, sizeof(nm)); + dname = devid2kname(makedev(sd->disk.major, sd->disk.minor)); + strcpy(sd->sys_name, "dev-"); + strcpy(sd->sys_name+4, dname); + + /* test write to see if 'recovery_start' is available */ + if (resume && sd->recovery_start < MaxSector && + sysfs_set_num(sra, sd, "recovery_start", 0)) { + sysfs_set_str(sra, sd, "state", "remove"); + return -1; + } + + rv = sysfs_set_num(sra, sd, "offset", sd->data_offset); + rv |= sysfs_set_num(sra, sd, "size", (sd->component_size+1) / 2); + if (sra->array.level != LEVEL_CONTAINER) { + if (sra->consistency_policy == CONSISTENCY_POLICY_PPL) { + rv |= sysfs_set_num(sra, sd, "ppl_sector", sd->ppl_sector); + rv |= sysfs_set_num(sra, sd, "ppl_size", sd->ppl_size); + } + if (sd->recovery_start == MaxSector) + /* This can correctly fail if array isn't started, + * yet, so just ignore status for now. 
+ */ + sysfs_set_str(sra, sd, "state", "insync"); + if (sd->disk.raid_disk >= 0) + rv |= sysfs_set_num(sra, sd, "slot", sd->disk.raid_disk); + if (resume) + sysfs_set_num(sra, sd, "recovery_start", sd->recovery_start); + } + if (sd->bb.supported) { + if (sysfs_set_str(sra, sd, "state", "external_bbl")) { + /* + * backward compatibility - if kernel doesn't support + * bad blocks for external metadata, let it continue + * as long as there are none known so far + */ + if (sd->bb.count) { + pr_err("The kernel has no support for bad blocks in external metadata\n"); + return -1; + } + } + + for (i = 0; i < sd->bb.count; i++) { + char s[30]; + const struct md_bb_entry *entry = &sd->bb.entries[i]; + + snprintf(s, sizeof(s) - 1, "%llu %d\n", entry->sector, + entry->length); + rv |= sysfs_set_str(sra, sd, "bad_blocks", s); + } + } + return rv; +} + +#if 0 +int sysfs_disk_to_sg(int fd) +{ + /* from an open block device, try find and open its corresponding + * scsi_generic interface + */ + struct stat st; + char path[256]; + char sg_path[256]; + char sg_major_minor[10]; + char *c; + DIR *dir; + struct dirent *de; + int major, minor, rv; + + if (fstat(fd, &st)) + return -1; + + snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/device", + major(st.st_rdev), minor(st.st_rdev)); + + dir = opendir(path); + if (!dir) + return -1; + + de = readdir(dir); + while (de) { + if (strncmp("scsi_generic:", de->d_name, + strlen("scsi_generic:")) == 0) + break; + de = readdir(dir); + } + closedir(dir); + + if (!de) + return -1; + + snprintf(sg_path, sizeof(sg_path), "%s/%s/dev", path, de->d_name); + fd = open(sg_path, O_RDONLY); + if (fd < 0) + return fd; + + rv = read(fd, sg_major_minor, sizeof(sg_major_minor)); + close(fd); + if (rv < 0 || rv == sizeof(sg_major_minor)) + return -1; + else + sg_major_minor[rv - 1] = '\0'; + + c = strchr(sg_major_minor, ':'); + *c = '\0'; + c++; + major = strtol(sg_major_minor, NULL, 10); + minor = strtol(c, NULL, 10); + snprintf(path, sizeof(path), 
"/dev/.tmp.md.%d:%d:%d", + (int) getpid(), major, minor); + if (mknod(path, S_IFCHR|0600, makedev(major, minor))==0) { + fd = open(path, O_RDONLY); + unlink(path); + return fd; + } + + return -1; +} +#endif + +int sysfs_disk_to_scsi_id(int fd, __u32 *id) +{ + /* from an open block device, try to retrieve it scsi_id */ + struct stat st; + char path[256]; + DIR *dir; + struct dirent *de; + int host, bus, target, lun; + + if (fstat(fd, &st)) + return 1; + + snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/device/scsi_device", + major(st.st_rdev), minor(st.st_rdev)); + + dir = opendir(path); + if (!dir) + return 1; + + for (de = readdir(dir); de; de = readdir(dir)) { + int count; + + if (de->d_type != DT_DIR) + continue; + + count = sscanf(de->d_name, "%d:%d:%d:%d", &host, &bus, &target, &lun); + if (count == 4) + break; + } + closedir(dir); + + if (!de) + return 1; + + *id = (host << 24) | (bus << 16) | (target << 8) | (lun << 0); + return 0; +} + +int sysfs_unique_holder(char *devnm, long rdev) +{ + /* Check that devnm is a holder of rdev, + * and is the only holder. + * we should be locked against races by + * an O_EXCL on devnm + * Return values: + * 0 - not unique, not even a holder + * 1 - unique, this is the only holder. 
+ * 2/3 - not unique, there is another holder + * -1 - error, cannot find the holders + */ + DIR *dir; + struct dirent *de; + char dirname[100]; + char l; + int ret = 0; + sprintf(dirname, "/sys/dev/block/%d:%d/holders", + major(rdev), minor(rdev)); + dir = opendir(dirname); + if (!dir) + return -1; + l = strlen(dirname); + while ((de = readdir(dir)) != NULL) { + char buf[100]; + char *sl; + int n; + + if (de->d_ino == 0) + continue; + if (de->d_name[0] == '.') + continue; + strcpy(dirname+l, "/"); + strcat(dirname+l, de->d_name); + n = readlink(dirname, buf, sizeof(buf)-1); + if (n <= 0) + continue; + buf[n] = 0; + sl = strrchr(buf, '/'); + if (!sl) + continue; + sl++; + + if (strcmp(devnm, sl) == 0) + ret |= 1; + else + ret |= 2; + } + closedir(dir); + return ret; +} + +int sysfs_freeze_array(struct mdinfo *sra) +{ + /* Try to freeze resync/rebuild on this array/container. + * Return -1 if the array is busy, + * return 0 if this kernel doesn't support 'frozen' + * return 1 if it worked. + */ + char buf[20]; + + if (!sysfs_attribute_available(sra, NULL, "sync_action")) + return 1; /* no sync_action == frozen */ + if (sysfs_get_str(sra, NULL, "sync_action", buf, 20) <= 0) + return 0; + if (strcmp(buf, "frozen\n") == 0) + /* Already frozen */ + return 0; + if (strcmp(buf, "idle\n") != 0 && strcmp(buf, "recover\n") != 0) + return -1; + if (sysfs_set_str(sra, NULL, "sync_action", "frozen") < 0) + return 0; + return 1; +} + +int sysfs_wait(int fd, int *msec) +{ + /* Wait up to '*msec' for fd to have an exception condition. + * if msec == NULL, wait indefinitely. 
+ */ + fd_set fds; + int n; + FD_ZERO(&fds); + FD_SET(fd, &fds); + if (msec == NULL) + n = select(fd+1, NULL, NULL, &fds, NULL); + else if (*msec < 0) + n = 0; + else { + struct timeval start, end, tv; + gettimeofday(&start, NULL); + if (*msec < 1000) { + tv.tv_sec = 0; + tv.tv_usec = (*msec)*1000; + } else { + tv.tv_sec = (*msec)/1000; + tv.tv_usec = 0; + } + n = select(fd+1, NULL, NULL, &fds, &tv); + gettimeofday(&end, NULL); + end.tv_sec -= start.tv_sec; + *msec -= (end.tv_sec * 1000 + end.tv_usec/1000 + - start.tv_usec/1000) + 1; + } + return n; +} + +int sysfs_rules_apply_check(const struct mdinfo *sra, + const struct sysfs_entry *ent) +{ + /* Check whether parameter is regular file, + * exists and is under specified directory. + */ + char fname[MAX_SYSFS_PATH_LEN]; + char dname[MAX_SYSFS_PATH_LEN]; + char resolved_path[PATH_MAX]; + char resolved_dir[PATH_MAX]; + int result; + + if (sra == NULL || ent == NULL) + return -1; + + result = snprintf(dname, MAX_SYSFS_PATH_LEN, + "/sys/block/%s/md/", sra->sys_name); + if (result < 0 || result >= MAX_SYSFS_PATH_LEN) + return -1; + + result = snprintf(fname, MAX_SYSFS_PATH_LEN, + "%s/%s", dname, ent->name); + if (result < 0 || result >= MAX_SYSFS_PATH_LEN) + return -1; + + if (realpath(fname, resolved_path) == NULL || + realpath(dname, resolved_dir) == NULL) + return -1; + + if (strncmp(resolved_dir, resolved_path, + strnlen(resolved_dir, PATH_MAX)) != 0) + return -1; + + return 0; +} + +static struct dev_sysfs_rule *sysfs_rules; + +void sysfs_rules_apply(char *devnm, struct mdinfo *dev) +{ + struct dev_sysfs_rule *rules = sysfs_rules; + + while (rules) { + struct sysfs_entry *ent = rules->entry; + int match = 0; + + if (!rules->uuid_set) { + if (rules->devname) + match = strcmp(devnm, rules->devname) == 0; + } else { + match = memcmp(dev->uuid, rules->uuid, + sizeof(int[4])) == 0; + } + + while (match && ent) { + if (sysfs_rules_apply_check(dev, ent) < 0) + pr_err("SYSFS: failed to write '%s' to '%s'\n", + ent->value, 
ent->name); + else + sysfs_set_str(dev, NULL, ent->name, ent->value); + ent = ent->next; + } + rules = rules->next; + } +} + +static void sysfs_rule_free(struct dev_sysfs_rule *rule) +{ + struct sysfs_entry *entry; + + while (rule) { + struct dev_sysfs_rule *tmp = rule->next; + + entry = rule->entry; + while (entry) { + struct sysfs_entry *tmp = entry->next; + + free(entry->name); + free(entry->value); + free(entry); + entry = tmp; + } + + if (rule->devname) + free(rule->devname); + free(rule); + rule = tmp; + } +} + +void sysfsline(char *line) +{ + struct dev_sysfs_rule *sr; + char *w; + + sr = xcalloc(1, sizeof(*sr)); + for (w = dl_next(line); w != line ; w = dl_next(w)) { + if (strncasecmp(w, "name=", 5) == 0) { + char *devname = w + 5; + + if (strncmp(devname, "/dev/md/", 8) == 0) { + if (sr->devname) + pr_err("Only give one device per SYSFS line: %s\n", + devname); + else + sr->devname = xstrdup(devname); + } else { + pr_err("%s is an invalid name for an md device - ignored.\n", + devname); + } + } else if (strncasecmp(w, "uuid=", 5) == 0) { + char *uuid = w + 5; + + if (sr->uuid_set) { + pr_err("Only give one uuid per SYSFS line: %s\n", + uuid); + } else { + if (parse_uuid(w + 5, sr->uuid) && + memcmp(sr->uuid, uuid_zero, + sizeof(int[4])) != 0) + sr->uuid_set = 1; + else + pr_err("Invalid uuid: %s\n", uuid); + } + } else { + struct sysfs_entry *prop; + + char *sep = strchr(w, '='); + + if (sep == NULL || *(sep + 1) == 0) { + pr_err("Cannot parse \"%s\" - ignoring.\n", w); + continue; + } + + prop = xmalloc(sizeof(*prop)); + prop->value = xstrdup(sep + 1); + *sep = 0; + prop->name = xstrdup(w); + prop->next = sr->entry; + sr->entry = prop; + } + } + + if (!sr->devname && !sr->uuid_set) { + pr_err("Device name not found in sysfs config entry - ignoring.\n"); + sysfs_rule_free(sr); + return; + } + + sr->next = sysfs_rules; + sysfs_rules = sr; +} diff --git a/systemd/SUSE-mdadm_env.sh b/systemd/SUSE-mdadm_env.sh new file mode 100644 index 0000000..c13b48a --- 
/dev/null +++ b/systemd/SUSE-mdadm_env.sh @@ -0,0 +1,48 @@ +#!/bin/sh + +# extract configuration from /etc/sysconfig/mdadm and write +# environment to /run/sysconfig/mdadm to be used by +# systemd unit files. + +MDADM_SCAN="yes" + +# Following adapted from /etc/init.d/mdadmd on openSUSE + +mdadmd_CONFIG=/etc/sysconfig/mdadm +if test -r $mdadmd_CONFIG; then + . $mdadmd_CONFIG +fi + +if [ x$MDADM_DELAY != x"" ]; then + MDADM_DELAY="-d "$MDADM_DELAY; +fi + +if [ x$MDADM_MAIL != x"" ]; then + MDADM_MAIL="-m \"$MDADM_MAIL\"" +fi + +if [ x$MDADM_PROGRAM != x"" ]; then + MDADM_PROGRAM="-p \"$MDADM_PROGRAM\"" +fi + +if [ x$MDADM_SCAN = x"yes" ]; then + MDADM_SCAN="--scan" +else + MDADM_SCAN="" +fi + +if [ x$MDADM_SEND_MAIL_ON_START = x"yes" ]; then + MDADM_SEND_MAIL="-t" +else + MDADM_SEND_MAIL="" +fi + +if [ x$MDADM_CONFIG != x"" ]; then + MDADM_CONFIG="-c \"$MDADM_CONFIG\"" +fi + +mkdir -p /run/sysconfig +echo "MDADM_MONITOR_ARGS=$MDADM_RAIDDEVICES $MDADM_DELAY $MDADM_MAIL $MDADM_PROGRAM $MDADM_SCAN $MDADM_SEND_MAIL $MDADM_CONFIG" > /run/sysconfig/mdadm +if [ -n "$MDADM_CHECK_DURATION" ]; then + echo "MDADM_CHECK_DURATION=$MDADM_CHECK_DURATION" >> /run/sysconfig/mdadm +fi diff --git a/systemd/mdadm-grow-continue@.service b/systemd/mdadm-grow-continue@.service new file mode 100644 index 0000000..5c667d2 --- /dev/null +++ b/systemd/mdadm-grow-continue@.service @@ -0,0 +1,17 @@ +# This file is part of mdadm. +# +# mdadm is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. 
+ +[Unit] +Description=Manage MD Reshape on /dev/%I +DefaultDependencies=no + +[Service] +ExecStart=BINDIR/mdadm --grow --continue /dev/%I +StandardInput=null +StandardOutput=null +StandardError=null +KillMode=none diff --git a/systemd/mdadm-last-resort@.service b/systemd/mdadm-last-resort@.service new file mode 100644 index 0000000..efeb3f6 --- /dev/null +++ b/systemd/mdadm-last-resort@.service @@ -0,0 +1,8 @@ +[Unit] +Description=Activate md array %I even though degraded +DefaultDependencies=no +ConditionPathExists=!/sys/devices/virtual/block/%i/md/sync_action + +[Service] +Type=oneshot +ExecStart=BINDIR/mdadm --run /dev/%i diff --git a/systemd/mdadm-last-resort@.timer b/systemd/mdadm-last-resort@.timer new file mode 100644 index 0000000..45ad223 --- /dev/null +++ b/systemd/mdadm-last-resort@.timer @@ -0,0 +1,7 @@ +[Unit] +Description=Timer to wait for more drives before activating degraded array %I. +DefaultDependencies=no +Conflicts=sys-devices-virtual-block-%i.device + +[Timer] +OnActiveSec=30 diff --git a/systemd/mdadm.shutdown b/systemd/mdadm.shutdown new file mode 100644 index 0000000..33f2778 --- /dev/null +++ b/systemd/mdadm.shutdown @@ -0,0 +1,4 @@ +#!/bin/sh +# We need to ensure all md arrays with external metadata +# (e.g. IMSM, DDF) are clean before completing the shutdown. +BINDIR/mdadm --wait-clean --scan diff --git a/systemd/mdcheck_continue.service b/systemd/mdcheck_continue.service new file mode 100644 index 0000000..854317f --- /dev/null +++ b/systemd/mdcheck_continue.service @@ -0,0 +1,17 @@ +# This file is part of mdadm. +# +# mdadm is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. 
+ +[Unit] +Description=MD array scrubbing - continuation +ConditionPathExistsGlob = /var/lib/mdcheck/MD_UUID_* + +[Service] +Type=oneshot +Environment="MDADM_CHECK_DURATION=6 hours" +EnvironmentFile=-/run/sysconfig/mdadm +ExecStartPre=-/usr/lib/mdadm/mdadm_env.sh +ExecStart=/usr/share/mdadm/mdcheck --continue --duration ${MDADM_CHECK_DURATION} diff --git a/systemd/mdcheck_continue.timer b/systemd/mdcheck_continue.timer new file mode 100644 index 0000000..dba1074 --- /dev/null +++ b/systemd/mdcheck_continue.timer @@ -0,0 +1,15 @@ +# This file is part of mdadm. +# +# mdadm is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. + +[Unit] +Description=MD array scrubbing - continuation + +[Timer] +OnCalendar= 1:05:00 + +[Install] +WantedBy= mdmonitor.service diff --git a/systemd/mdcheck_start.service b/systemd/mdcheck_start.service new file mode 100644 index 0000000..3bb3d13 --- /dev/null +++ b/systemd/mdcheck_start.service @@ -0,0 +1,17 @@ +# This file is part of mdadm. +# +# mdadm is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. + +[Unit] +Description=MD array scrubbing +Wants=mdcheck_continue.timer + +[Service] +Type=oneshot +Environment="MDADM_CHECK_DURATION=6 hours" +EnvironmentFile=-/run/sysconfig/mdadm +ExecStartPre=-/usr/lib/mdadm/mdadm_env.sh +ExecStart=/usr/share/mdadm/mdcheck --duration ${MDADM_CHECK_DURATION} diff --git a/systemd/mdcheck_start.timer b/systemd/mdcheck_start.timer new file mode 100644 index 0000000..9e7e02a --- /dev/null +++ b/systemd/mdcheck_start.timer @@ -0,0 +1,16 @@ +# This file is part of mdadm. 
+# +# mdadm is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. + +[Unit] +Description=MD array scrubbing + +[Timer] +OnCalendar=Sun *-*-1..7 1:00:00 + +[Install] +WantedBy= mdmonitor.service +Also= mdcheck_continue.timer diff --git a/systemd/mdmon@.service b/systemd/mdmon@.service new file mode 100644 index 0000000..85a3a7c --- /dev/null +++ b/systemd/mdmon@.service @@ -0,0 +1,28 @@ +# This file is part of mdadm. +# +# mdadm is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. + +[Unit] +Description=MD Metadata Monitor on /dev/%I +DefaultDependencies=no +Before=initrd-switch-root.target + +[Service] +# mdmon should never complain due to lack of a platform, +# that is mdadm's job if at all. +Environment=IMSM_NO_PLATFORM=1 +# The mdmon starting in the initramfs (with dracut at least) +# cannot see sysfs after root is mounted, so we will have to +# 'takeover'. As the '--offroot --takeover' don't hurt when +# not necessary, are are useful with root-on-md in dracut, +# have them always present. +ExecStart=BINDIR/mdmon --offroot --takeover %I +Type=forking +# Don't set the PIDFile. It isn't necessary (systemd can work +# it out) and systemd will remove it when transitioning from +# initramfs to rootfs. +#PIDFile=/run/mdadm/%I.pid +KillMode=none diff --git a/systemd/mdmonitor-oneshot.service b/systemd/mdmonitor-oneshot.service new file mode 100644 index 0000000..373955a --- /dev/null +++ b/systemd/mdmonitor-oneshot.service @@ -0,0 +1,15 @@ +# This file is part of mdadm. 
+# +# mdadm is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. + +[Unit] +Description=Reminder for degraded MD arrays + +[Service] +Environment=MDADM_MONITOR_ARGS=--scan +EnvironmentFile=-/run/sysconfig/mdadm +ExecStartPre=-/usr/lib/mdadm/mdadm_env.sh +ExecStart=BINDIR/mdadm --monitor --oneshot $MDADM_MONITOR_ARGS diff --git a/systemd/mdmonitor-oneshot.timer b/systemd/mdmonitor-oneshot.timer new file mode 100644 index 0000000..cb54bda --- /dev/null +++ b/systemd/mdmonitor-oneshot.timer @@ -0,0 +1,15 @@ +# This file is part of mdadm. +# +# mdadm is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. + +[Unit] +Description=Reminder for degraded MD arrays + +[Timer] +OnCalendar= 2:00:00 + +[Install] +WantedBy= mdmonitor.service diff --git a/systemd/mdmonitor.service b/systemd/mdmonitor.service new file mode 100644 index 0000000..46f7b88 --- /dev/null +++ b/systemd/mdmonitor.service @@ -0,0 +1,16 @@ +# This file is part of mdadm. +# +# mdadm is free software; you can redistribute it and/or modify it +# under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. 
+ +[Unit] +Description=MD array monitor +DefaultDependencies=no + +[Service] +Environment= MDADM_MONITOR_ARGS=--scan +EnvironmentFile=-/run/sysconfig/mdadm +ExecStartPre=-/usr/lib/mdadm/mdadm_env.sh +ExecStart=BINDIR/mdadm --monitor $MDADM_MONITOR_ARGS @@ -0,0 +1,283 @@ +#!/bin/bash +# +# run test suite for mdadm +mdadm=$PWD/mdadm +targetdir="/var/tmp" +logdir="$targetdir" +config=/tmp/mdadm.conf +testdir=$PWD/tests +devlist= + +savelogs=0 +exitonerror=1 +prefix='[0-9][0-9]' + +# use loop devices by default if doesn't specify --dev +DEVTYPE=loop +INTEGRITY=yes +LVM_VOLGROUP=mdtest + +# make sure to test local mdmon, not system one +export MDADM_NO_SYSTEMCTL=1 + +# assume md0, md1, md2 exist in /dev +md0=/dev/md0 +md1=/dev/md1 +md2=/dev/md2 +mdp0=/dev/md_d0 +mdp1=/dev/md_d1 + +die() { + echo -e "\n\tERROR: $* \n" + save_log fail + exit 2 +} + +ctrl_c() { + exitonerror=1 +} + +# mdadm always adds --quiet, and we want to see any unexpected messages +mdadm() { + rm -f $targetdir/stderr + case $* in + *-S* ) + udevadm settle + p=`cat /proc/sys/dev/raid/speed_limit_max` + echo 20000 > /proc/sys/dev/raid/speed_limit_max + ;; + esac + case $* in + *-C* | *--create* | *-B* | *--build* ) + # clear superblock every time once creating or + # building arrays, because it's always creating + # and building array many times in a test case. + for args in $* + do + [[ $args =~ "/dev/" ]] && { + [[ $args =~ "md" ]] || + $mdadm --zero $args > /dev/null + } + done + $mdadm 2> $targetdir/stderr --quiet "$@" --auto=yes + ;; + * ) + $mdadm 2> $targetdir/stderr --quiet "$@" + ;; + esac + rv=$? + case $* in + *-S* ) + udevadm settle + echo $p > /proc/sys/dev/raid/speed_limit_max + ;; + esac + cat >&2 $targetdir/stderr + return $rv +} + +do_test() { + _script=$1 + _basename=`basename $_script` + if [ -f "$_script" ] + then + rm -f $targetdir/stderr + # this might have been reset: restore the default. 
+ echo 2000 > /proc/sys/dev/raid/speed_limit_max + do_clean + # source script in a subshell, so it has access to our + # namespace, but cannot change it. + echo -ne "$_script... " + if ( set -ex ; . $_script ) &> $targetdir/log + then + dmesg | grep -iq "error\|call trace\|segfault" && + die "dmesg prints errors when testing $_basename!" + echo "succeeded" + _fail=0 + else + save_log fail + _fail=1 + fi + [ "$savelogs" == "1" ] && + mv -f $targetdir/log $logdir/$_basename.log + [ "$_fail" == "1" -a "$exitonerror" == "1" ] && exit 1 + fi +} + +do_help() { + cat <<-EOF + Usage: $0 [options] + Example for disk mode: ./test --dev=disk --disks=/dev/sda{2..15} + Options: + --tests=test1,test2,... Comma separated list of tests to run + --testdir= Specify testdir as tests|clustermd_tests + --raidtype= raid0|linear|raid1|raid456|raid10|ddf|imsm + --disable-multipath Disable any tests involving multipath + --disable-integrity Disable slow tests of RAID[56] consistency + --logdir=directory Directory to save all logfiles in + --save-logs Usually use with --logdir together + --keep-going | --no-error Don't stop on error, ie. run all tests + --dev=loop|lvm|ram|disk Use loop devices (default), LVM, RAM or disk + --disks= Provide a bunch of physical devices for test + --volgroup=name LVM volume group for LVM test + setup Setup test environment and exit + cleanup Cleanup test environment + prefix Run tests with <prefix> + --help | -h Print this usage + EOF +} + +parse_args() { + for i in $* + do + case $i in + --testdir=* ) + case ${i##*=} in + tests ) + testdir=tests + ;; + clustermd_tests ) + testdir=clustermd_tests + CLUSTER_CONF="$PWD/$testdir/cluster_conf" + ;; + * ) + echo "Unknown argument: $i" + do_help + exit 1 + ;; + esac + ;; + esac + done + [ -z "$testdir" ] && testdir=tests + . 
$testdir/func.sh + for i in $* + do + case $i in + [0-9][0-9] ) + prefix=$i + ;; + setup ) + echo "mdadm test environment setup" + do_setup + trap 0 + exit 0 + ;; + cleanup ) + cleanup + exit 0 + ;; + --testdir=* ) + ;; + --tests=* ) + TESTLIST=($(echo ${i##*=} | sed -e 's/,/ /g')) + ;; + --raidtype=* ) + case ${i##*=} in + raid0 ) + TESTLIST=($(ls $testdir | grep "[0-9][0-9]r0\|raid0")) + ;; + linear ) + TESTLIST=($(ls $testdir | grep "linear")) + ;; + raid1 ) + TESTLIST=($(ls $testdir | grep "[0-9][0-9]r1\|raid1" | grep -vi "r10\|raid10")) + ;; + raid456 ) + TESTLIST=($(ls $testdir | grep "[0-9][0-9]r[4-6]\|raid[4-6]")) + ;; + raid10 ) + TESTLIST=($(ls $testdir | grep "[0-9][0-9]r10\|raid10")) + ;; + ddf ) + TESTLIST=($(ls $testdir | grep "[0-9][0-9]ddf")) + ;; + imsm ) + TESTLIST=($(ls $testdir | grep "[0-9][0-9]imsm")) + ;; + * ) + echo "Unknown argument: $i" + do_help + exit 1 + ;; + esac + ;; + --logdir=* ) + logdir="${i##*=}" + ;; + --save-logs ) + savelogs=1 + ;; + --keep-going | --no-error ) + exitonerror=0 + ;; + --disable-multipath ) + unset MULTIPATH + ;; + --disable-integrity ) + unset INTEGRITY + ;; + --dev=* ) + case ${i##*=} in + loop ) + DEVTYPE=loop + ;; + lvm ) + DEVTYPE=lvm + ;; + ram ) + DEVTYPE=ram + ;; + disk ) + DEVTYPE=disk + ;; + * ) + echo "Unknown argument: $i" + do_help + exit 1 + ;; + esac + ;; + --disks=* ) + disks=(${disks[*]} ${i##*=}) + ;; + --volgroup=* ) + LVM_VOLGROUP=`expr "x$i" : 'x[^=]*=\(.*\)'` + ;; + --help | -h ) + do_help + exit 0 + ;; + * ) + echo " $0: Unknown argument: $i" + do_help + exit 1 + ;; + esac + done +} + +main() { + do_setup + + echo "Testing on linux-$(uname -r) kernel" + [ "$savelogs" == "1" ] && + echo "Saving logs to $logdir" + if [ "x$TESTLIST" != "x" ] + then + for script in ${TESTLIST[@]} + do + do_test $testdir/$script + done + else + for script in $testdir/$prefix $testdir/$prefix*[^~] + do + do_test $script + done + fi + + exit 0 +} + +parse_args $@ +main diff --git a/tests/00linear 
b/tests/00linear new file mode 100644 index 0000000..e3ac655 --- /dev/null +++ b/tests/00linear @@ -0,0 +1,25 @@ + +# create a simple linear + +mdadm -CR $md0 -l linear -n3 $dev0 $dev1 $dev2 +check linear +testdev $md0 3 $mdsize2_l 1 +mdadm -S $md0 + +# now with version-0.90 superblock +mdadm -CR $md0 -e0.90 --level=linear -n4 $dev0 $dev1 $dev2 $dev3 +check linear +testdev $md0 4 $mdsize0 1 +mdadm -S $md0 + +# now with version-1.0 superblock +mdadm -CR $md0 -e1.0 --level=linear -n4 $dev0 $dev1 $dev2 $dev3 +check linear +testdev $md0 4 $mdsize1 1 +mdadm -S $md0 + +# now with no superblock +mdadm -B $md0 -l linear -n5 $dev0 $dev1 $dev2 $dev3 $dev4 +check linear +testdev $md0 5 $size 64 +mdadm -S $md0 diff --git a/tests/00multipath b/tests/00multipath new file mode 100644 index 0000000..84e4d69 --- /dev/null +++ b/tests/00multipath @@ -0,0 +1,29 @@ + +# +# create a multipath, and fail and stuff + +if [ "$MULTIPATH" != "yes" ]; then + echo -ne 'skipping... ' + exit 0 +fi + +mdadm -CR $md1 -l multipath -n2 $path0 $path1 + +testdev $md1 1 $mdsize12 1 + +mdadm $md1 -f $path0 +rotest $md1 +testdev $md1 1 $mdsize12 1 + +mdadm $md1 -r $path0 +mdadm $md1 -a $path0 + +rotest $md1 +testdev $md1 1 $mdsize12 1 + +mdadm $md1 -f $path1 +mdadm $md1 -r $path1 +rotest $md1 +testdev $md1 1 $mdsize12 1 + +mdadm -S $md1 diff --git a/tests/00names b/tests/00names new file mode 100644 index 0000000..7a066d8 --- /dev/null +++ b/tests/00names @@ -0,0 +1,13 @@ +set -x -e + +# create arrays with non-numeric names +conf=$targetdir/mdadm.conf +echo "CREATE names=yes" > $conf + +for i in linear raid0 raid1 raid4 raid5 raid6 +do + mdadm -CR --config $conf /dev/md/$i -l $i -n 4 $dev4 $dev3 $dev2 $dev1 + check $i + [ -d /sys/class/block/md_$i/md ] + mdadm -S md_$i +done diff --git a/tests/00raid0 b/tests/00raid0 new file mode 100644 index 0000000..8bc1898 --- /dev/null +++ b/tests/00raid0 @@ -0,0 +1,43 @@ + +# create a simple raid0 + +mdadm -CR $md0 -l raid0 -n3 $dev0 $dev1 $dev2 +check raid0 
+testdev $md0 3 $mdsize2_l 512 +mdadm -S $md0 + +# now with version-0.90 superblock +mdadm -CR $md0 -e0.90 -l0 -n4 $dev0 $dev1 $dev2 $dev3 +check raid0 +testdev $md0 4 $mdsize0 512 +mdadm -S $md0 + +# now with no superblock +mdadm -B $md0 -l0 -n5 $dev0 $dev1 $dev2 $dev3 $dev4 +check raid0 +testdev $md0 5 $size 512 +mdadm -S $md0 + + +# now same again with different chunk size +for chunk in 4 32 256 +do + mdadm -CR $md0 -e0.90 -l raid0 --chunk $chunk -n3 $dev0 $dev1 $dev2 + check raid0 + testdev $md0 3 $mdsize0 $chunk + mdadm -S $md0 + + # now with version-1 superblock + mdadm -CR $md0 -e1.0 -l0 -c $chunk -n4 $dev0 $dev1 $dev2 $dev3 + check raid0 + testdev $md0 4 $mdsize1 $chunk + mdadm -S $md0 + + # now with no superblock + mdadm -B $md0 -l0 -n5 --chun=$chunk $dev0 $dev1 $dev2 $dev3 $dev4 + check raid0 + testdev $md0 5 $size $chunk + mdadm -S $md0 + +done +exit 0 diff --git a/tests/00raid1 b/tests/00raid1 new file mode 100644 index 0000000..f6b8be1 --- /dev/null +++ b/tests/00raid1 @@ -0,0 +1,38 @@ + +# create a simple mirror +# test version0, version1, and no super +# test resync and recovery. + +# It's just a sanity check. 
This command shouldn't run successfully +mdadm -CR $md0 -l 1 -n2 missing missing +check opposite_result + +mdadm -CR $md0 -l 1 -n2 $dev0 $dev1 +check resync +check raid1 +testdev $md0 1 $mdsize1a 64 +mdadm -S $md0 + +# now with version-0.90 superblock, spare +mdadm -CR $md0 -e0.90 --level=raid1 -n3 -x2 $dev0 missing missing $dev1 $dev2 +check recovery +check raid1 +testdev $md0 1 $mdsize0 64 +mdadm -S $md0 + +# now with no superblock +mdadm -B $md0 -l mirror -n2 $dev0 $dev1 +check resync +check raid1 +testdev $md0 1 $size 1 +mdadm -S $md0 + +# again, but with no resync +mdadm -B $md0 -l 1 --assume-clean -n2 $dev0 $dev1 +check raid1 +check nosync +testdev $md0 1 $size 1 +mdadm -S $md0 + + +exit 0 diff --git a/tests/00raid10 b/tests/00raid10 new file mode 100644 index 0000000..796b970 --- /dev/null +++ b/tests/00raid10 @@ -0,0 +1,18 @@ + +# Create some raid10 arrays, all with 6 devices and one spare +devs="$dev0 $dev1 $dev2 $dev3 $dev4 $dev5 $dev6" + +for lo in n2 n3 f2 f3 +do + cm=1 + case $lo in + f2 ) m=3 cm=2;; + f3 ) m=2 cm=3;; + n2 ) m=3;; + n3 ) m=2;; + esac + mdadm --create --run --level=raid10 --layout $lo --raid-disks 6 -x 1 $md0 $devs + check resync ; check raid10 + testdev $md0 $m $mdsize1 $[512*cm] + mdadm -S $md0 +done diff --git a/tests/00raid4 b/tests/00raid4 new file mode 100644 index 0000000..00a14f2 --- /dev/null +++ b/tests/00raid4 @@ -0,0 +1,16 @@ + +# create a simple raid4 set + +mdadm -CfR $md0 -l 4 -n3 $dev0 $dev1 $dev2 +check resync ; check raid[45] +testdev $md0 2 $mdsize1 512 +mdadm -S $md0 + +# now with version-1 superblock +mdadm -CR $md0 -e1 --level=raid4 -n4 $dev0 $dev1 $dev2 $dev3 +check recovery; check raid[45] +testdev $md0 3 $mdsize1 512 +mdadm -S $md0 + + +exit 0 diff --git a/tests/00raid5 b/tests/00raid5 new file mode 100644 index 0000000..b2b7a97 --- /dev/null +++ b/tests/00raid5 @@ -0,0 +1,33 @@ + +# create a simple raid5 set + +mdadm -CfR $md0 -e 0.90 -l 5 -n3 $dev0 $dev1 $dev2 +check resync +testdev $md0 2 $mdsize0 512 +mdadm 
-S $md0 + +# now with version-1 superblock +mdadm -CR $md0 -e1 --level=raid5 -n4 $dev0 $dev1 $dev2 $dev3 +check recovery +testdev $md0 3 $mdsize1 512 +mdadm -S $md0 + +# now same again with explicit layout + +for lo in la ra left-symmetric right-symmetric +do + + mdadm -CfR $md0 -l 5 -p $lo -n3 $dev0 $dev1 $dev2 + check resync ; check raid5 + testdev $md0 2 $mdsize1 512 + mdadm -S $md0 + + # now with version-1 superblock + mdadm -CR $md0 -e1 --level=raid5 --layout $lo -n4 $dev0 $dev1 $dev2 $dev3 + check recovery ; check raid5 + testdev $md0 3 $mdsize1 512 + mdadm -S $md0 + +done + +exit 0 diff --git a/tests/00raid6 b/tests/00raid6 new file mode 100644 index 0000000..6977af9 --- /dev/null +++ b/tests/00raid6 @@ -0,0 +1,16 @@ + +# create a simple raid6 set + +mdadm -CfR $md0 -e0.90 -l 6 -n4 $dev0 $dev1 $dev2 $dev3 +check resync ; check raid6 +testdev $md0 2 $mdsize0 512 +mdadm -S $md0 + +# now with version-1 superblock +mdadm -CR $md0 -e1 --level=raid6 -n5 $dev0 $dev1 $dev2 $dev3 $dev4 +check resync ; check raid6 +testdev $md0 3 $mdsize1 512 +mdadm -S $md0 + + +exit 0 diff --git a/tests/00readonly b/tests/00readonly new file mode 100644 index 0000000..28b0fa1 --- /dev/null +++ b/tests/00readonly @@ -0,0 +1,22 @@ +#!/bin/bash + +for metadata in 0.9 1.0 1.1 1.2 +do + for level in linear raid0 raid1 raid4 raid5 raid6 raid10 + do + mdadm -CR $md0 -l $level -n 4 --metadata=$metadata \ + $dev1 $dev2 $dev3 $dev4 --assume-clean + check nosync + check $level + mdadm -ro $md0 + check readonly + state=$(cat /sys/block/md0/md/array_state) + [ "$state" == "readonly" ] || + die "array_state should be 'readonly', but is $state" + mdadm -w $md0 + check $level + mdadm -S $md0 + done +done + +exit 0 diff --git a/tests/01r1fail b/tests/01r1fail new file mode 100644 index 0000000..389b813 --- /dev/null +++ b/tests/01r1fail @@ -0,0 +1,29 @@ + +# create a raid1, fail and remove a drive during initial sync +# Add two more, fail and remove one +# wait for sync to complete, fail, remove, 
re-add + +mdadm -CR $md0 -l1 -n4 $dev0 $dev1 $dev2 missing +check resync +mdadm $md0 --fail $dev2 +check resync +mdadm $md0 --fail $dev1 +sleep 1 +check nosync +check state U___ +mdadm $md0 --add $dev4 $dev3 +check recovery +# there could be two separate recoveries, one for each dev +check wait +check wait +mdadm $md0 --remove $dev2 $dev1 +check nosync +check state UUU_ + +mdadm --zero-superblock $dev2 +mdadm $md0 -a $dev2 +check recovery +check wait +check state UUUU + +mdadm -S $md0 diff --git a/tests/01r5fail b/tests/01r5fail new file mode 100644 index 0000000..873dba5 --- /dev/null +++ b/tests/01r5fail @@ -0,0 +1,27 @@ + + +# create a raid5, fail and remove a drive during initial sync +# Add two more, fail and remove one +# wait for sync to complete, fail, remove, re-add + +mdadm -CR $md0 -l5 -n4 $dev0 $dev1 $dev2 $dev3 +check recovery +mdadm $md0 --fail $dev3 +sleep 1 +check nosync +check state UUU_ + +mdadm $md0 --add $dev4 $dev5 +check recovery +check wait +mdadm $md0 --fail $dev0 +mdadm $md0 --remove $dev3 $dev0 +check recovery +check state _UUU + +mdadm $md0 -a $dev3 +check recovery +check wait +check state UUUU + +mdadm -S $md0
\ No newline at end of file diff --git a/tests/01r5integ b/tests/01r5integ new file mode 100644 index 0000000..48676a2 --- /dev/null +++ b/tests/01r5integ @@ -0,0 +1,33 @@ + +# Check integrity of raid5 in degraded mode +# Create a 4 disk raid5, create a filesystem and +# sha1sum it with each device failed + +if [ "$INTEGRITY" != "yes" ]; then + echo -ne 'skipping... ' + exit 0 +fi + +for layout in ls rs la ra +do + mdadm -CR $md0 -l5 --layout $layout -n4 $dev0 $dev1 $dev2 $dev3 + check wait + tar cf - /etc > $md0 + sum=`sha1sum $md0` + + for i in $dev0 $dev1 $dev2 $dev3 + do + mdadm $md0 -f $i + mdadm $md0 -r $i + blockdev --flushbufs $md0 + sum1=`sha1sum $md0` + if [ "$sum" != "$sum1" ] + then + echo $sum does not match $sum1 with $i missing + exit 1 + fi + mdadm $md0 -a $i + while ! (check state 'U*'); do check wait; sleep 0.2; done + done + mdadm -S $md0 +done diff --git a/tests/01raid6integ b/tests/01raid6integ new file mode 100644 index 0000000..12f4d81 --- /dev/null +++ b/tests/01raid6integ @@ -0,0 +1,57 @@ + +# Check integrity of raid6 in degraded modes +# Create a 5 disk raid6, dump some data to it, then +# sha1sum it with different pairs of devices failed + +if [ "$INTEGRITY" != "yes" ]; then + echo -ne 'skipping... 
' + exit 0 +fi + +layouts='ls rs la ra' +lv=`uname -r` +if expr $lv '>=' 2.6.30 > /dev/null +then + layouts="$layouts parity-first ddf-zero-restart ddf-N-restart ddf-N-continue \ + left-asymmetric-6 right-asymmetric-6 left-symmetric-6 right-symmetric-6 parity-first-6" +fi + +for layout in $layouts +do + mdadm -CR $md0 -l6 --layout $layout -n5 $dev0 $dev1 $dev2 $dev3 $dev4 + check wait + tar cf - /etc > $md0 + sum=`sha1sum $md0` + + totest= + for second in $dev0 $dev1 $dev2 $dev3 $dev4 + do + mdadm $md0 -f $second + mdadm $md0 -r $second + blockdev --flushbufs $md0 + sum1=`sha1sum $md0` + if [ "$sum" != "$sum1" ] + then + echo $sum does not match $sum1 with $second missing + exit 1 + fi + for first in $totest + do + mdadm $md0 -f $first + mdadm $md0 -r $first + blockdev --flushbufs $md0 + sum1=`sha1sum $md0` + if [ "$sum" != "$sum1" ] + then + echo $sum does not match $sum1 with $first and $second missing + exit 1 + fi + mdadm $md0 -a $first + while ! (check state 'U*_U*'); do check wait; sleep 0.2; done + done + mdadm $md0 -a $second + while ! 
(check state 'U*'); do check wait; sleep 0.2; done + totest="$totest $second" + done + mdadm -S $md0 +done diff --git a/tests/01replace b/tests/01replace new file mode 100644 index 0000000..6223a22 --- /dev/null +++ b/tests/01replace @@ -0,0 +1,52 @@ +set -x -e + +## test --replace for raid5 raid6 raid1 and raid10 +#1/ after replace, can remove replaced device +#2/ after --replace-with cannot remove the 'with' device +#3/ preserve integrity with concurrent failure + +for level in 1 5 6 10 +do + dd if=/dev/zero of=$dev4 bs=1M || true + dd if=/dev/zero of=$dev5 bs=1M || true + mdadm -CR $md0 -l $level -n4 -x2 $devlist5 + dd if=/dev/urandom of=$md0 bs=1M || true + sum=`sha1sum < $md0` + check wait + mdadm $md0 --replace $dev1 + check wait + mdadm $md0 --remove $dev1 + mdadm $md0 --remove $dev5 && exit 1 + mdadm -S $md0 + dd if=/dev/zero of=$dev4 bs=1M || true + dd if=/dev/zero of=$dev5 bs=1M || true + mdadm -CR $md0 -l $level -n4 -x2 $devlist5 + check wait + sum1=`sha1sum < $md0` + [ "$sum" == "$sum1" ] + + mdadm $md0 --replace $dev1 --with $dev4 + check wait + mdadm $md0 --remove $dev1 + mdadm $md0 --remove $dev5 + mdadm $md0 --remove $dev4 && exit 1 + + mdadm $md0 --add $dev1 $dev5 + mdadm $md0 --replace $dev0 + sleep 1 + mdadm $md0 --fail $dev2 + check wait + sum2=`sha1sum < $md0` + [ "$sum" == "$sum2" ] + + mdadm $md0 --remove $dev0 $dev2 + mdadm $md0 --add $dev0 $dev2 + mdadm $md0 --replace $dev3 + sleep 1 + mdadm $md0 --fail $dev0 $dev2 + check wait + sum3=`sha1sum < $md0` + [ "$sum" == "$sum3" ] + + mdadm -S $md0 +done diff --git a/tests/02lineargrow b/tests/02lineargrow new file mode 100644 index 0000000..e05c219 --- /dev/null +++ b/tests/02lineargrow @@ -0,0 +1,23 @@ + +# create a linear array, and add more drives to it.
+ +for e in 0.90 1 1.1 1.2 +do + case $e in + 0.90 ) sz=$mdsize0 ;; + 1 ) sz=$mdsize2_l ;; + 1.0 ) sz=$mdsize1 ;; + 1.1 ) sz=$mdsize1_l ;; + 1.2 ) sz=$mdsize2_l ;; + esac + mdadm -CRf $md0 --level linear -e $e --raid-disks=1 $dev1 + testdev $md0 1 $sz 1 + + mdadm --grow $md0 --add $dev2 + testdev $md0 2 $sz 1 + + mdadm --grow $md0 --add $dev3 + testdev $md0 3 $sz 1 + + mdadm -S $md0 +done diff --git a/tests/02r1add b/tests/02r1add new file mode 100644 index 0000000..757f696 --- /dev/null +++ b/tests/02r1add @@ -0,0 +1,40 @@ + +# Make a raid1, add a device, then remove it again. + +mdadm -CR $md0 -l1 -n2 -x1 $dev0 $dev1 $dev2 +check resync +check wait +check state UU + +mdadm --grow $md0 -n 3 +check recovery +check wait +check state UUU + +mdadm $md0 --fail $dev0 +check state _UU + +mdadm --grow $md0 -n 2 +check state UU + +mdadm -S $md0 +# same again for version-1 + + +mdadm -CR $md0 -l1 -n2 -e1.2 -x1 $dev0 $dev1 $dev2 +check resync +check wait +check state UU + +mdadm --grow $md0 -n 3 +check recovery +check wait +check state UUU + +mdadm $md0 --fail $dev0 +check state _UU + +mdadm --grow $md0 -n 2 +check state UU + +mdadm -S $md0 diff --git a/tests/02r1grow b/tests/02r1grow new file mode 100644 index 0000000..5754c88 --- /dev/null +++ b/tests/02r1grow @@ -0,0 +1,36 @@ + + +# create a small raid1 array, make it larger. 
Then make it smaller + +mdadm -CR $md0 -e 0.90 --level raid1 --raid-disks 3 --size $[size/2] $dev1 $dev2 $dev3 +check wait +check state UUU +testdev $md0 1 $[size/2] 1 + +mdadm --grow $md0 --size max +check resync +check wait +testdev $md0 1 $mdsize0 1 + +mdadm --grow $md0 --size $[size/2] +check nosync +testdev $md0 1 $[size/2] 1 + +mdadm -S $md0 + +# same again with version 1.1 superblock +mdadm -CR $md0 --level raid1 --metadata=1.1 --raid-disks 3 --size $[size/2] $dev1 $dev2 $dev3 +check wait +check state UUU +testdev $md0 1 $[size/2] 1 + +mdadm --grow $md0 --size max +check resync +check wait +testdev $md0 1 $mdsize1_l 1 + +mdadm --grow $md0 --size $[size/2] +check nosync +testdev $md0 1 $[size/2] 1 + +mdadm -S $md0 diff --git a/tests/02r5grow b/tests/02r5grow new file mode 100644 index 0000000..2da78ee --- /dev/null +++ b/tests/02r5grow @@ -0,0 +1,53 @@ + + +# create a small raid5 array, make it larger. Then make it smaller + +mdadm -CR $md0 -e0.90 --level raid5 --chunk=64 --raid-disks 3 --size $[size/2] $dev1 $dev2 $dev3 +check wait +check state UUU +testdev $md0 2 $[size/2] 32 + +mdadm --grow $md0 --size max +check resync +check wait +testdev $md0 2 $mdsize0 32 + +mdadm --grow $md0 --size $[size/2] +check nosync +testdev $md0 2 $[size/2] 32 + +mdadm -S $md0 + +# same again with version 1.1 superblock +mdadm -CR $md0 --level raid5 --metadata=1.1 --chunk=128 --raid-disks 4 --size $[size/2] $dev1 $dev2 $dev3 $dev4 +check wait +check state UUUU +testdev $md0 3 $[size/2] 128 + +mdadm --grow $md0 --size max +check resync +check wait +testdev $md0 3 $[mdsize1_l] 128 + +mdadm --grow $md0 --size $[size/2] +check nosync +testdev $md0 3 $[size/2] 128 + +mdadm -S $md0 + +# create a raid5 array and change the chunk +mdadm -CR $md0 --level raid5 --metadata=1.1 --chunk=32 --raid-disks 3 --size $[size/2] $dev1 $dev2 $dev3 +check wait +check state UUU +check chunk 32 + +mdadm $md0 --grow --chunk=64 +check reshape +check wait +check chunk 64 + +mdadm -S $md0 +mdadm -A $md0 
$dev1 $dev2 $dev3 +check state UUU +check chunk 64 +mdadm -S $md0 diff --git a/tests/02r6grow b/tests/02r6grow new file mode 100644 index 0000000..759e627 --- /dev/null +++ b/tests/02r6grow @@ -0,0 +1,36 @@ + + +# create a small raid6 array, make it larger. Then make it smaller + +mdadm -CR $md0 -e 0.90 --level raid6 --chunk=64 --raid-disks 4 --size $[size/2] $dev1 $dev2 $dev3 $dev4 +check wait +check state UUUU +testdev $md0 2 $[size/2] 32 + +mdadm --grow $md0 --size max +check resync +check wait +testdev $md0 2 $mdsize0 32 + +mdadm --grow $md0 --size $[size/2] +check nosync +testdev $md0 2 $[size/2] 32 + +mdadm -S $md0 + +# same again with version 1.1 superblock +mdadm -CR $md0 --level raid6 --metadata=1.1 --chunk=128 --raid-disks 4 --size $[size/2] $dev1 $dev2 $dev3 $dev4 +check wait +check state UUUU +testdev $md0 2 $[size/2] 128 + +mdadm --grow $md0 --size max +check resync +check wait +testdev $md0 2 $[mdsize1_l] 128 + +mdadm --grow $md0 --size $[size/2] +check nosync +testdev $md0 2 $[size/2] 128 + +mdadm -S $md0 diff --git a/tests/03assem-incr b/tests/03assem-incr new file mode 100644 index 0000000..f10a1a4 --- /dev/null +++ b/tests/03assem-incr @@ -0,0 +1,17 @@ +set -x -e + +# Test interaction between -I and -A +# there are locking issues too, but those are hard to test for. +# +# Here just test that a partly "-I" assembled array can +# be completed with "-A" + +for l in 0 1 5 linear +do + mdadm -CR $md0 -l $l -n5 $dev0 $dev1 $dev2 $dev3 $dev4 --assume-clean + mdadm -S md0 + mdadm -I $dev1 + mdadm -I $dev3 + mdadm -A /dev/md0 $dev0 $dev1 $dev2 $dev3 $dev4 + mdadm -S /dev/md0 +done diff --git a/tests/03r0assem b/tests/03r0assem new file mode 100644 index 0000000..6744e32 --- /dev/null +++ b/tests/03r0assem @@ -0,0 +1,137 @@ + +# create a raid0 array from 3 devices, and assemble it in a multitude of ways.
+# explicitly list devices +# uuid, md-minor on command line with wildcard devices +# mdadm.conf file + +mdadm -CR $md2 -l0 -n3 $dev0 $dev1 $dev2 +check raid0 +tst="testdev $md2 3 $mdsize1_l 512" +$tst +uuid=`mdadm -Db $md2 | sed 's/.*UUID=//'` +mdadm -S $md2 + +mdadm -A $md2 $dev0 $dev1 $dev2 +$tst +mdadm -S $md2 + +mdadm -A $md2 -u $uuid $devlist +$tst +mdadm -S $md2 + +mdadm --assemble $md2 --name=2 $devlist +$tst +mdadm -S $md2 + +conf=$targetdir/mdadm.conf +{ + echo DEVICE $devlist + echo array $md2 UUID=$uuid +} > $conf + +mdadm -As -c $conf $md2 +$tst +mdadm -S $md2 + +{ + echo DEVICE $devlist + echo array $md2 name=2 +} > $conf + +mdadm -As -c $conf $md2 +$tst +mdadm -S $md2 + + +{ + echo DEVICE $devlist + echo array $md2 devices=$dev0,$dev1,$dev2 +} > $conf + +mdadm -As -c $conf $md2 +$tst + +echo "DEVICE $devlist" > $conf +mdadm -Db $md2 >> $conf +mdadm -S $md2 + +mdadm --assemble --scan --config=$conf $md2 +$tst +mdadm -S $md2 + +echo " metadata=0.90 devices=$dev0,$dev1,$dev2" >> $conf +mdadm --assemble --scan --config=$conf $md2 +$tst +mdadm -S $md2 + + +### Now for version 0... 
+ +mdadm --zero-superblock $dev0 $dev1 $dev2 +mdadm -CR $md2 -l0 --metadata=0.90 -n3 $dev0 $dev1 $dev2 +check raid0 +tst="testdev $md2 3 $mdsize0 512" +$tst + +uuid=`mdadm -Db $md2 | sed 's/.*UUID=//'` +mdadm -S $md2 + +mdadm -A $md2 $dev0 $dev1 $dev2 +$tst +mdadm -S $md2 + +mdadm -A $md2 -u $uuid $devlist +$tst +mdadm -S $md2 + +mdadm --assemble $md2 --super-minor=2 $devlist # +$tst +mdadm -S $md2 + +conf=$targetdir/mdadm.conf +{ + echo DEVICE $devlist + echo array $md2 UUID=$uuid +} > $conf + +mdadm -As -c $conf $md2 +$tst +mdadm -S $md2 + +{ + echo DEVICE $devlist + echo array $md2 super-minor=2 +} > $conf + +mdadm -As -c $conf $md2 +$tst +mdadm -S $md2 + + +{ + echo DEVICE $devlist + echo array $md2 devices=$dev0,$dev1,$dev2 +} > $conf + +mdadm -As -c $conf $md2 +$tst + +echo "DEVICE $devlist" > $conf +mdadm -Db $md2 >> $conf +mdadm -S $md2 + +mdadm --assemble --scan --config=$conf $md2 +$tst +mdadm -S $md2 + +echo " metadata=1 devices=$dev0,$dev1,$dev2" >> $conf +mdadm --assemble --scan --config=$conf $md2 +$tst +mdadm -S $md2 + +# Now use incremental assembly. +mdadm -I --config=$conf $dev0 +mdadm -I --config=$conf $dev1 +mdadm -I --config=$conf $dev2 +$tst +mdadm -S $md2 diff --git a/tests/03r5assem b/tests/03r5assem new file mode 100644 index 0000000..0c7fb8c --- /dev/null +++ b/tests/03r5assem @@ -0,0 +1,109 @@ + +# create a raid5 array and assemble it in various ways, +# including with missing devices. 
+ +mdadm -CR -e 0.90 $md1 -l5 -n3 $dev0 $dev1 $dev2 +tst="check raid5 ;testdev $md1 2 $mdsize0 512 ; mdadm -S $md1" +uuid=`mdadm -Db $md1 | sed 's/.*UUID=//'` +check wait +eval $tst + +mdadm -A $md1 $dev0 $dev1 $dev2 +eval $tst + +mdadm -A $md1 -u $uuid $devlist +eval $tst + +mdadm -A $md1 -m 1 $devlist +eval $tst + + +conf=$targetdir/mdadm.conf +{ + echo DEVICE $devlist + echo array $md1 UUID=$uuid +} > $conf + +mdadm -As -c $conf $md1 +eval $tst + +{ + echo DEVICE $devlist + echo array $md1 super-minor=1 +} > $conf + +mdadm -As -c $conf +eval $tst + +{ + echo DEVICE $devlist + echo array $md1 devices=$dev0,$dev1,$dev2 +} > $conf + +mdadm -As -c $conf + +echo "DEVICE $devlist" > $conf +mdadm -Db $md1 >> $conf +eval $tst + +mdadm --assemble --scan --config=$conf $md1 +eval $tst + +echo " metadata=0.90 devices=$dev0,$dev1,$dev2" >> $conf +mdadm --assemble --scan --config=$conf $md1 +eval $tst + +### Now with a missing device + +mdadm -AR $md1 $dev0 $dev2 # +check state U_U +eval $tst + +mdadm -A $md1 -u $uuid $devlist +check state U_U +eval $tst + +mdadm -A $md1 -m 1 $devlist +check state U_U +eval $tst + + +conf=$targetdir/mdadm.conf +{ + echo DEVICE $devlist + echo array $md1 UUID=$uuid +} > $conf + +mdadm -As -c $conf $md1 +check state U_U +eval $tst + +{ + echo DEVICE $devlist + echo array $md1 super-minor=1 +} > $conf + +mdadm -As -c $conf +check state U_U +eval $tst + +{ + echo DEVICE $devlist + echo array $md1 devices=$dev0,$dev1,$dev2 +} > $conf + +mdadm -As -c $conf + +echo "DEVICE $devlist" > $conf +mdadm -Db $md1 >> $conf +check state U_U +eval $tst + +mdadm --assemble --scan --config=$conf $md1 +check state U_U +eval $tst + +echo " metadata=0.90 devices=$dev0,$dev1,$dev2" >> $conf +mdadm --assemble --scan --config=$conf $md1 +check state U_U +eval $tst diff --git a/tests/03r5assem-failed b/tests/03r5assem-failed new file mode 100644 index 0000000..d38241d --- /dev/null +++ b/tests/03r5assem-failed @@ -0,0 +1,12 @@ + +# Create an array, fail one device 
while array is active, stop array, +# then re-assemble listing the failed device first. + +mdadm -CR $md1 -l5 -n4 $dev0 $dev1 $dev2 $dev3 +check wait + +echo 2000 > /sys/block/md1/md/safe_mode_delay +mkfs $md1 +mdadm $md1 -f $dev0 +mdadm -S $md1 +mdadm -A $md1 $dev0 $dev1 $dev2 $dev3 || exit 1 diff --git a/tests/03r5assemV1 b/tests/03r5assemV1 new file mode 100644 index 0000000..bca0c58 --- /dev/null +++ b/tests/03r5assemV1 @@ -0,0 +1,128 @@ + +# create a v-1 raid5 array and assemble in various ways + +mdadm -CR -e1 --name one $md1 -l5 -n3 -x2 $dev0 $dev1 $dev2 $dev3 $dev4 +tst="check raid5 ;testdev $md1 2 $mdsize1 512 ; mdadm -S $md1" +uuid=`mdadm -Db $md1 | sed 's/.*UUID=//'` +check wait + +eval $tst + +mdadm -A $md1 $dev0 $dev1 $dev2 +mdadm $md1 --add $dev3 $dev4 +check spares 2 +eval $tst + +mdadm -A $md1 -u $uuid $devlist +check spares 2 +eval $tst + +mdadm -A $md1 --name one $devlist +check spares 2 +eval $tst + + +conf=$targetdir/mdadm.conf +{ + echo DEVICE $devlist + echo array $md1 UUID=$uuid +} > $conf + +mdadm -As -c $conf $md1 +eval $tst + +{ + echo DEVICE $devlist + echo array $md1 name=one +} > $conf + +mdadm -As -c $conf +eval $tst + +{ + echo DEVICE $devlist + echo array $md1 devices=$dev0,$dev1,$dev2,$dev3,$dev4 +} > $conf + +mdadm -As -c $conf + +echo "DEVICE $devlist" > $conf +mdadm -Db $md1 >> $conf +eval $tst +mdadm --assemble --scan --config=$conf $md1 +eval $tst +echo PING >&2 + +echo " metadata=1.0 devices=$dev0,$dev1,$dev2,$dev3,$dev4" >> $conf +mdadm --assemble --scan --config=$conf $md1 +eval $tst + +### Now with a missing device +# We don't want the recovery to complete while we are +# messing about here. 
+echo 100 > /proc/sys/dev/raid/speed_limit_max +echo 100 > /proc/sys/dev/raid/speed_limit_min + +mdadm -AR $md1 $dev0 $dev2 $dev3 $dev4 # +check state U_U +check spares 1 +eval $tst + +mdadm -A $md1 -u $uuid $devlist +check state U_U +eval $tst + +mdadm -A $md1 --name=one $devlist +check state U_U +check spares 1 +eval $tst + + +conf=$targetdir/mdadm.conf +{ + echo DEVICE $devlist + echo array $md1 UUID=$uuid +} > $conf + +mdadm -As -c $conf $md1 +check state U_U +eval $tst + +{ + echo DEVICE $devlist + echo array $md1 name=one +} > $conf + +mdadm -As -c $conf +check state U_U +eval $tst + +{ + echo DEVICE $devlist + echo array $md1 devices=$dev0,$dev1,$dev2 +} > $conf + +mdadm -As -c $conf + +echo "DEVICE $devlist" > $conf +mdadm -Db $md1 >> $conf +check state U_U +eval $tst + +mdadm --assemble --scan --config=$conf $md1 +check state U_U +eval $tst + +echo " metadata=1.0 devices=$dev0,$dev1,$dev2" >> $conf +mdadm --assemble --scan --config=$conf $md1 +check state U_U +eval $tst + +# And now assemble with -I +mdadm -Ss +mdadm -I -c $conf $dev0 +mdadm -I -c $conf $dev1 +mdadm -I -c $conf $dev2 +eval $tst +echo 2000 > /proc/sys/dev/raid/speed_limit_max +echo 1000 > /proc/sys/dev/raid/speed_limit_min diff --git a/tests/04r0update b/tests/04r0update new file mode 100644 index 0000000..73ee3b9 --- /dev/null +++ b/tests/04r0update @@ -0,0 +1,20 @@ + +# create a raid0, re-assemble with a different super-minor +mdadm -CR -e 0.90 $md0 -l0 -n3 $dev0 $dev1 $dev2 +testdev $md0 3 $mdsize0 512 +minor1=`mdadm -E $dev0 | sed -n -e 's/.*Preferred Minor : //p'` +mdadm -S /dev/md0 + +mdadm -A $md1 $dev0 $dev1 $dev2 +minor2=`mdadm -E $dev0 | sed -n -e 's/.*Preferred Minor : //p'` +mdadm -S /dev/md1 + +mdadm -A $md1 --update=super-minor $dev0 $dev1 $dev2 +minor3=`mdadm -E $dev0 | sed -n -e 's/.*Preferred Minor : //p'` +mdadm -S /dev/md1 + +case "$minor1 $minor2 $minor3" in + "0 0 1" ) ;; + * ) echo >&2 "ERROR minors should be '0 0 1' but are '$minor1 $minor2 $minor3'" + exit 1 +esac 
diff --git a/tests/04r1update b/tests/04r1update new file mode 100644 index 0000000..e22965b --- /dev/null +++ b/tests/04r1update @@ -0,0 +1,15 @@ +set -i + +# create a raid1 array, let it sync, then re-assemble with a force-sync + +mdadm -CR $md0 -l1 -n2 $dev0 $dev1 +check wait +mdadm -S $md0 + +mdadm -A $md0 $dev0 $dev1 +check nosync +mdadm -S $md0 + +mdadm -A $md0 -U resync $dev0 $dev1 +check resync +mdadm -S $md0 diff --git a/tests/04r5swap b/tests/04r5swap new file mode 100644 index 0000000..5373a60 --- /dev/null +++ b/tests/04r5swap @@ -0,0 +1,18 @@ + +# make a raid5 array, byte swap the superblocks, then assemble... + +mdadm -CR $md0 -e 0.90 -l5 -n4 $dev0 $dev1 $dev2 $dev3 +sleep 4 +mdadm -S $md0 + +mdadm -E --metadata=0 $dev1 > $targetdir/d1 +for d in $dev0 $dev1 $dev2 $dev3 +do $dir/swap_super $d +done +mdadm -E --metadata=0.swap $dev1 > $targetdir/d1s +diff -u $targetdir/d1 $targetdir/d1s + +mdadm --assemble --update=byteorder $md0 $dev0 $dev1 $dev2 $dev3 +sleep 3 +check recovery +mdadm -S $md0 diff --git a/tests/04update-metadata b/tests/04update-metadata new file mode 100644 index 0000000..232fc1f --- /dev/null +++ b/tests/04update-metadata @@ -0,0 +1,48 @@ +set -xe + +# test converting v0.90 to v1.0 +# check for different levels +# check it fails for non-v0.90 +# check it fails during reshape or recovery +# check it fails when bitmap is present + +dlist="$dev0 $dev1 $dev2 $dev3" + +for ls in raid0/4 linear/4 raid1/1 raid5/3 raid6/2 +do + s=${ls#*/} l=${ls%/*} + mdadm -CR --assume-clean -e 0.90 $md0 --level $l -n 4 -c 64 $dlist + testdev $md0 $s 19904 64 + mdadm -S $md0 + mdadm -A $md0 --update=metadata $dlist + testdev $md0 $s 19904 64 check + mdadm -S $md0 +done + +if mdadm -A $md0 --update=metadata $dlist +then echo >&2 should fail with v1.0 metadata + exit 1 +fi + +mdadm -CR -e 0.90 $md0 --level=6 -n4 -c32 $dlist +mdadm -S $md0 + +if mdadm -A $md0 --update=metadata $dlist +then echo >&2 should fail during resync + exit 1 +fi +mdadm -A $md0 $dlist 
+mdadm --wait $md0 || true +mdadm -S $md0 + +# should succeed now +mdadm -A $md0 --update=metadata $dlist + +mdadm -S /dev/md0 +mdadm -CR --assume-clean -e 0.90 $md0 --level=6 -n4 -c32 $dlist --bitmap=internal +mdadm -S $md0 + +if mdadm -A $md0 --update=metadata $dlist +then echo >&2 should fail when bitmap present + exit 1 +fi diff --git a/tests/04update-uuid b/tests/04update-uuid new file mode 100644 index 0000000..a4409e7 --- /dev/null +++ b/tests/04update-uuid @@ -0,0 +1,82 @@ +set -x + +# create an array, then change the uuid. + +mdadm -CR --assume-clean $md0 -l5 -n3 $dev0 $dev1 $dev2 +mdadm -S /dev/md0 +mdadm -A /dev/md0 --update=uuid --uuid=0123456789abcdef:fedcba9876543210 $dev0 $dev1 $dev2 +no_errors +mdadm -D /dev/md0 | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || { + echo Wrong uuid; mdadm -D /dev/md0 ; exit 2; +} +mdadm -S /dev/md0 + +# try v1 superblock + +mdadm -CR --assume-clean -e1 $md0 -l5 -n3 $dev0 $dev1 $dev2 +mdadm -S /dev/md0 +mdadm -A /dev/md0 --update=uuid --uuid=0123456789abcdef:fedcba9876543210 $dev0 $dev1 $dev2 +no_errors +mdadm -D /dev/md0 | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || { + echo Wrong uuid; mdadm -D /dev/md0 ; exit 2; +} +mdadm -S /dev/md0 + + +# now if we have a bitmap, that needs updating too. 
+rm -f $targetdir/bitmap +mdadm -CR --assume-clean -b $targetdir/bitmap $md0 -l5 -n3 $dev0 $dev1 $dev2 +mdadm -S /dev/md0 +mdadm -A /dev/md0 -b $targetdir/bitmap --update=uuid --uuid=0123456789abcdef:fedcba9876543210 $dev0 $dev1 $dev2 +no_errors +mdadm -D /dev/md0 | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || { + echo Wrong uuid; mdadm -D /dev/md0 ; exit 2; +} +if mdadm -X $targetdir/bitmap | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || + mdadm -X $targetdir/bitmap | grep -s > /dev/null 67452301:efcdab89:98badcfe:10325476 +then : ; else + echo Wrong uuid; mdadm -X $targetdir/bitmap ; exit 2; +fi +mdadm -S /dev/md0 + +# and bitmap for version1 +rm -f $targetdir/bitmap +mdadm -CR --assume-clean -e1.1 -b $targetdir/bitmap $md0 -l5 -n3 $dev0 $dev1 $dev2 +mdadm -S /dev/md0 +mdadm -A /dev/md0 -b $targetdir/bitmap --update=uuid --uuid=0123456789abcdef:fedcba9876543210 $dev0 $dev1 $dev2 +no_errors +mdadm -D /dev/md0 | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || { + echo Wrong uuid; mdadm -D /dev/md0 ; exit 2; +} +# -X cannot tell which byteorder to use for the UUID, so allow both. +if mdadm -X $targetdir/bitmap | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || + mdadm -X $targetdir/bitmap | grep -s > /dev/null 67452301:efcdab89:98badcfe:10325476 +then : ; else + echo Wrong uuid; mdadm -X $targetdir/bitmap ; exit 2; +fi +mdadm -S /dev/md0 + +# Internal bitmaps too. 
+mdadm -CR --assume-clean -b internal --bitmap-chunk 4 $md0 -l5 -n3 $dev0 $dev1 $dev2 +mdadm -S /dev/md0 +mdadm -A /dev/md0 --update=uuid --uuid=0123456789abcdef:fedcba9876543210 $dev0 $dev1 $dev2 +no_errors +mdadm -D /dev/md0 | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || { + echo Wrong uuid; mdadm -D /dev/md0 ; exit 2; +} +mdadm -X $dev0 | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || { + echo Wrong uuid; mdadm -X $dev0; exit 2; +} +mdadm -S /dev/md0 + +mdadm -CR --assume-clean -e1.2 -b internal --bitmap-chunk=4 $md0 -l5 -n3 $dev0 $dev1 $dev2 +mdadm -S /dev/md0 +mdadm -A /dev/md0 --update=uuid --uuid=0123456789abcdef:fedcba9876543210 $dev0 $dev1 $dev2 +no_errors +mdadm -D /dev/md0 | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || { + echo Wrong uuid; mdadm -D /dev/md0 ; exit 2; +} +mdadm -X $dev0 | grep -s > /dev/null 01234567:89abcdef:fedcba98:76543210 || { + echo Wrong uuid; mdadm -X $dev0; exit 2; +} +mdadm -S /dev/md0 diff --git a/tests/05r1-add-internalbitmap b/tests/05r1-add-internalbitmap new file mode 100644 index 0000000..4e20305 --- /dev/null +++ b/tests/05r1-add-internalbitmap @@ -0,0 +1,20 @@ +# +# create a raid1 without any bitmap, add the bitmap and then write to +# the device. 
This should catch the case where the bitmap is created +# but not reloaded correctly, such as the case fixed by +# 4474ca42e2577563a919fd3ed782e2ec55bf11a2 +# +mdadm --create --run $md0 --metadata=0.9 --level=1 -n2 --delay=1 $dev1 $dev2 +check wait +check nobitmap +testdev $md0 1 $mdsize1b 64 +mdadm -Gb internal --bitmap-chunk=4 $md0 +check bitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 + +# Re-assemble the array and verify the bitmap is still present +mdadm --assemble $md0 $dev1 $dev2 +check bitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 diff --git a/tests/05r1-add-internalbitmap-v1a b/tests/05r1-add-internalbitmap-v1a new file mode 100644 index 0000000..721a41c --- /dev/null +++ b/tests/05r1-add-internalbitmap-v1a @@ -0,0 +1,20 @@ +# +# create a raid1 without any bitmap, add the bitmap and then write to +# the device. This should catch the case where the bitmap is created +# but not reloaded correctly, such as the case fixed by +# 4474ca42e2577563a919fd3ed782e2ec55bf11a2 +# +mdadm --create --run $md0 --metadata=1.0 --level=1 -n2 --delay=1 $dev1 $dev2 +check wait +check nobitmap +testdev $md0 1 $mdsize1b 64 +mdadm -Gb internal --bitmap-chunk=4 $md0 +check bitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 + +# Re-assemble the array and verify the bitmap is still present +mdadm --assemble $md0 $dev1 $dev2 +check bitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 diff --git a/tests/05r1-add-internalbitmap-v1b b/tests/05r1-add-internalbitmap-v1b new file mode 100644 index 0000000..da78fd6 --- /dev/null +++ b/tests/05r1-add-internalbitmap-v1b @@ -0,0 +1,20 @@ +# +# create a raid1 without any bitmap, add the bitmap and then write to +# the device. 
This should catch the case where the bitmap is created +# but not reloaded correctly, such as the case fixed by +# 4474ca42e2577563a919fd3ed782e2ec55bf11a2 +# +mdadm --create --run $md0 --metadata=1.1 --level=1 -n2 --delay=1 $dev1 $dev2 +check wait +check nobitmap +testdev $md0 1 $mdsize1b 64 +mdadm -Gb internal --bitmap-chunk=4 $md0 +check bitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 + +# Re-assemble the array and verify the bitmap is still present +mdadm --assemble $md0 $dev1 $dev2 +check bitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 diff --git a/tests/05r1-add-internalbitmap-v1c b/tests/05r1-add-internalbitmap-v1c new file mode 100644 index 0000000..9f2f128 --- /dev/null +++ b/tests/05r1-add-internalbitmap-v1c @@ -0,0 +1,20 @@ +# +# create a raid1 without any bitmap, add the bitmap and then write to +# the device. This should catch the case where the bitmap is created +# but not reloaded correctly, such as the case fixed by +# 4474ca42e2577563a919fd3ed782e2ec55bf11a2 +# +mdadm --create --run $md0 --metadata=1.2 --level=1 -n2 --delay=1 $dev1 $dev2 +check wait +check nobitmap +testdev $md0 1 $mdsize1b 64 +mdadm -Gb internal --bitmap-chunk=4 $md0 +check bitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 + +# Re-assemble the array and verify the bitmap is still present +mdadm --assemble $md0 $dev1 $dev2 +check bitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 diff --git a/tests/05r1-bitmapfile b/tests/05r1-bitmapfile new file mode 100644 index 0000000..f384f0e --- /dev/null +++ b/tests/05r1-bitmapfile @@ -0,0 +1,49 @@ + +# +# create a raid1 with a bitmap file +# +bmf=$targetdir/bitmap +rm -f $bmf +mdadm --create --run $md0 --level=1 -n2 --delay=1 --bitmap $bmf $dev1 $dev2 +check wait +testdev $md0 1 $mdsize1a 64 +mdadm -S $md0 + +mdadm --assemble $md0 --bitmap=$bmf $dev1 $dev2 +testdev $md0 1 $mdsize1a 64 +dirty1=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +sleep 4 +dirty2=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) 
dirty.*/\1/p'` + +if [ $dirty1 -lt 400 -o $dirty2 -ne 0 ] +then echo >&2 "ERROR bad 'dirty' counts: $dirty1 and $dirty2" + exit 1 +fi +mdadm $md0 -f $dev1 +testdev $md0 1 $mdsize1a 64 +sleep 4 +dirty3=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +if [ $dirty3 -lt 400 ] +then + echo >&2 "ERROR dirty count $dirty3 is too small" + exit 2 +fi + +mdadm -S $md0 + +mdadm --assemble -R $md0 --bitmap=$bmf $dev2 +dirty4=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +mdadm --zero $dev1 # force --add, not --re-add +mdadm $md0 --add $dev1 +#it is too fast# check recovery + +check wait +sleep 4 +dirty5=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +if [ $dirty4 -lt 400 -o $dirty5 -ne 0 ] +then echo echo >&2 "ERROR bad 'dirty' counts at end: $dirty4 $dirty5" + exit 1 +fi + +mdadm -S $md0 diff --git a/tests/05r1-failfast b/tests/05r1-failfast new file mode 100644 index 0000000..823dd6f --- /dev/null +++ b/tests/05r1-failfast @@ -0,0 +1,74 @@ + +# create a simple mirror and check failfast flag works +mdadm -CR $md0 -e1.2 --level=raid1 --failfast -n2 $dev0 $dev1 +check raid1 +if grep -v failfast /sys/block/md0/md/rd*/state > /dev/null +then + die "failfast missing" +fi + +# Removing works with the failfast flag +mdadm $md0 -f $dev0 +mdadm $md0 -r $dev0 +if grep -v failfast /sys/block/md0/md/rd1/state > /dev/null +then + die "failfast missing" +fi + +# Adding works with the failfast flag +mdadm $md0 -a --failfast $dev0 +check wait +if grep -v failfast /sys/block/md0/md/rd0/state > /dev/null +then + die "failfast missing" +fi + +mdadm -S $md0 + +# Assembling works with the failfast flag +mdadm -A $md0 $dev0 $dev1 +check raid1 +if grep -v failfast /sys/block/md0/md/rd*/state > /dev/null +then + die "failfast missing" +fi + +# Adding works with the nofailfast flag +mdadm $md0 -f $dev0 +mdadm $md0 -r $dev0 +mdadm $md0 -a --nofailfast $dev0 +check wait +if grep failfast /sys/block/md0/md/rd0/state > /dev/null +then + die 
"failfast should be missing" +fi + +# Assembling with one faulty slave works with the failfast flag +mdadm $md0 -f $dev0 +mdadm $md0 -r $dev0 +mdadm -S $md0 +mdadm -A $md0 $dev0 $dev1 +check raid1 +mdadm -S $md0 + +# Spare works with the failfast flag +mdadm -CR $md0 -e1.2 --level=raid1 --failfast -n2 $dev0 $dev1 +check raid1 +mdadm $md0 -a --failfast $dev2 +check wait +check spares 1 +if grep -v failfast /sys/block/md0/md/rd*/state > /dev/null +then + die "failfast missing" +fi + +# Grow works with the failfast flag +mdadm -G $md0 --raid-devices=3 +check wait +if grep -v failfast /sys/block/md0/md/rd*/state > /dev/null +then + die "failfast missing" +fi +mdadm -S $md0 + +exit 0 diff --git a/tests/05r1-grow-external b/tests/05r1-grow-external new file mode 100644 index 0000000..69da3e9 --- /dev/null +++ b/tests/05r1-grow-external @@ -0,0 +1,33 @@ + +# +# create a raid1 array, add an external bitmap +# +mdadm --create --run $md0 -l 1 -n 2 $dev1 $dev2 +check wait +testdev $md0 1 $mdsize1a 64 + +bmf=$targetdir/bm +rm -f $bmf +#mdadm -E $dev1 +mdadm --grow $md0 --bitmap=$bmf --delay=1 || { mdadm -X $bmf ; exit 1; } +dirty1=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +sleep 4 +dirty2=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +testdev $md0 1 $mdsize1a 64 +dirty3=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +sleep 4 +dirty4=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +#echo $dirty1 $dirty2 $dirty3 $dirty4 +if [ $dirty2 -ne 0 -o $dirty4 -ne 0 -o $dirty3 -lt 400 ] +then + echo bad dirty counts + exit 1 +fi + +# now to remove the bitmap +check bitmap +mdadm --grow $md0 --bitmap=none +check nobitmap +mdadm -S $md0 diff --git a/tests/05r1-grow-internal b/tests/05r1-grow-internal new file mode 100644 index 0000000..24b3aec --- /dev/null +++ b/tests/05r1-grow-internal @@ -0,0 +1,31 @@ + +# +# create a raid1 array, add an internal bitmap +# +mdadm --create --run $md0 -l 1 -n 2 $dev1 
$dev2 +check wait +testdev $md0 1 $mdsize1a 64 + +#mdadm -E $dev1 +mdadm --grow $md0 --bitmap=internal --bitmap-chunk=4 --delay=1 || { mdadm -X $dev2 ; exit 1; } +dirty1=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +sleep 4 +dirty2=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +testdev $md0 1 $mdsize1a 64 +dirty3=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +sleep 4 +dirty4=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +#echo $dirty1 $dirty2 $dirty3 $dirty4 +if [ $dirty2 -ne 0 -o $dirty4 -ne 0 -o $dirty3 -lt 400 ] +then + echo bad dirty counts + exit 1 +fi + +# now to remove the bitmap +check bitmap +mdadm --grow $md0 --bitmap=none +check nobitmap +mdadm -S $md0 diff --git a/tests/05r1-grow-internal-1 b/tests/05r1-grow-internal-1 new file mode 100644 index 0000000..2f0d823 --- /dev/null +++ b/tests/05r1-grow-internal-1 @@ -0,0 +1,31 @@ + +# +# create a raid1 array, version 1 superblock, add an internal bitmap +# +mdadm --create --run $md0 -e1 -l 1 -n 2 $dev1 $dev2 +check wait +testdev $md0 1 $mdsize1b 64 + +#mdadm -E $dev1 +mdadm --grow $md0 --bitmap=internal --bitmap-chunk=4 --delay=1 +dirty1=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +sleep 4 +dirty2=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +testdev $md0 1 $mdsize1b 64 +dirty3=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +sleep 4 +dirty4=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +#echo $dirty1 $dirty2 $dirty3 $dirty4 +if [ $dirty2 -ne 0 -o $dirty4 -ne 0 -o $dirty3 -lt 400 ] +then + echo bad dirty counts + exit 1 +fi + +# now to remove the bitmap +check bitmap +mdadm --grow $md0 --bitmap=none +check nobitmap +mdadm -S $md0 diff --git a/tests/05r1-internalbitmap b/tests/05r1-internalbitmap new file mode 100644 index 0000000..dd7232a --- /dev/null +++ b/tests/05r1-internalbitmap @@ -0,0 +1,47 @@ + +# +# create a raid1 
with an internal bitmap +# +mdadm --create -e0.90 --run $md0 --level=1 -n2 --delay=1 --bitmap internal --bitmap-chunk=4 $dev1 $dev2 +check wait +testdev $md0 1 $mdsize0 64 +mdadm -S $md0 + +mdadm --assemble $md0 $dev1 $dev2 +testdev $md0 1 $mdsize0 64 +dirty1=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +sleep 4 +dirty2=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +if [ $dirty1 -lt 400 -o $dirty2 -ne 0 ] +then echo >&2 "ERROR bad 'dirty' counts: $dirty1 and $dirty2" + exit 1 +fi +mdadm $md0 -f $dev1 +testdev $md0 1 $mdsize0 64 +sleep 4 +dirty3=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +if [ $dirty3 -lt 400 ] +then + echo >&2 "ERROR dirty count $dirty3 is too small" + exit 2 +fi + +mdadm -S $md0 + +mdadm --assemble -R $md0 $dev2 +mdadm --zero-superblock $dev1 +mdadm $md0 --add $dev1 +check recovery + +dirty4=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +check wait +sleep 4 +dirty5=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +if [ $dirty4 -lt 400 -o $dirty5 -ne 0 ] +then echo echo >&2 "ERROR bad 'dirty' counts at end: $dirty4 $dirty5" + exit 1 +fi + +mdadm -S $md0 diff --git a/tests/05r1-internalbitmap-v1a b/tests/05r1-internalbitmap-v1a new file mode 100644 index 0000000..3ddc082 --- /dev/null +++ b/tests/05r1-internalbitmap-v1a @@ -0,0 +1,48 @@ + +# +# create a raid1 with an internal bitmap +# +mdadm --create --run $md0 --metadata=1.0 --level=1 -n2 --delay=1 --bitmap internal --bitmap-chunk=4 $dev1 $dev2 +check wait +check bitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 + +mdadm --assemble $md0 $dev1 $dev2 +testdev $md0 1 $mdsize1b 64 +dirty1=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +sleep 4 +dirty2=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +if [ $dirty1 -lt 400 -o $dirty2 -ne 0 ] +then echo >&2 "ERROR bad 'dirty' counts: $dirty1 and $dirty2" + exit 1 +fi +mdadm $md0 -f $dev1 +testdev 
$md0 1 $mdsize1b 64 +sleep 4 +dirty3=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +if [ $dirty3 -lt 400 ] +then + echo >&2 "ERROR dirty count $dirty3 is too small" + exit 2 +fi + +mdadm -S $md0 + +mdadm --zero-superblock $dev1 +mdadm --assemble -R $md0 $dev2 +mdadm $md0 --add $dev1 +check recovery + +dirty4=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +check wait +sleep 4 +dirty5=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +if [ $dirty4 -lt 400 -o $dirty5 -ne 0 ] +then echo echo >&2 "ERROR bad 'dirty' counts at end: $dirty4 $dirty5" + exit 1 +fi + +mdadm -S $md0 diff --git a/tests/05r1-internalbitmap-v1b b/tests/05r1-internalbitmap-v1b new file mode 100644 index 0000000..40f7abe --- /dev/null +++ b/tests/05r1-internalbitmap-v1b @@ -0,0 +1,49 @@ + +# +# create a raid1 with an internal bitmap +# +mdadm --create --run $md0 --metadata=1.1 --level=1 -n2 --delay=1 --bitmap internal --bitmap-chunk=4 $dev1 $dev2 +check wait +check bitmap +testdev $md0 1 $mdsize11 64 +mdadm -S $md0 + +mdadm --assemble $md0 $dev1 $dev2 +check bitmap +testdev $md0 1 $mdsize11 64 +dirty1=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +sleep 4 +dirty2=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +if [ $dirty1 -lt 400 -o $dirty2 -ne 0 ] +then echo >&2 "ERROR bad 'dirty' counts: $dirty1 and $dirty2" + exit 1 +fi +mdadm $md0 -f $dev1 +testdev $md0 1 $mdsize11 64 +sleep 4 +dirty3=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +if [ $dirty3 -lt 400 ] +then + echo >&2 "ERROR dirty count $dirty3 is too small" + exit 2 +fi + +mdadm -S $md0 + +mdadm --zero-superblock $dev1 +mdadm --assemble -R $md0 $dev2 +mdadm $md0 --add $dev1 +check recovery + +dirty4=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +check wait +sleep 4 +dirty5=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +if [ $dirty4 -lt 400 -o $dirty5 -ne 0 ] +then 
echo echo >&2 "ERROR bad 'dirty' counts at end: $dirty4 $dirty5" + exit 1 +fi + +mdadm -S $md0 diff --git a/tests/05r1-internalbitmap-v1c b/tests/05r1-internalbitmap-v1c new file mode 100644 index 0000000..2eaea59 --- /dev/null +++ b/tests/05r1-internalbitmap-v1c @@ -0,0 +1,48 @@ + +# +# create a raid1 with an internal bitmap +# +mdadm --create --run $md0 --metadata=1.2 --level=1 -n2 --delay=1 --bitmap internal --bitmap-chunk 4 $dev1 $dev2 +check wait +check bitmap +testdev $md0 1 $mdsize12 64 +mdadm -S $md0 + +mdadm --assemble $md0 $dev1 $dev2 +testdev $md0 1 $mdsize12 64 +dirty1=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +sleep 4 +dirty2=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +if [ $dirty1 -lt 400 -o $dirty2 -ne 0 ] +then echo >&2 "ERROR bad 'dirty' counts: $dirty1 and $dirty2" + exit 1 +fi +mdadm $md0 -f $dev1 +testdev $md0 1 $mdsize12 64 +sleep 4 +dirty3=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +if [ $dirty3 -lt 400 ] +then + echo >&2 "ERROR dirty count $dirty3 is too small" + exit 2 +fi + +mdadm -S $md0 + +mdadm --zero-superblock $dev1 +mdadm --assemble -R $md0 $dev2 +mdadm $md0 --add $dev1 +check recovery + +dirty4=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +check wait +sleep 4 +dirty5=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +if [ $dirty4 -lt 400 -o $dirty5 -ne 0 ] +then echo echo >&2 "ERROR bad 'dirty' counts at end: $dirty4 $dirty5" + exit 1 +fi + +mdadm -S $md0 diff --git a/tests/05r1-n3-bitmapfile b/tests/05r1-n3-bitmapfile new file mode 100644 index 0000000..f1c3f1e --- /dev/null +++ b/tests/05r1-n3-bitmapfile @@ -0,0 +1,53 @@ + +# +# create a raid1 with 3 devices and a bitmap file +# make sure resync does right thing. 
+# +# +bmf=$targetdir/bitmap +rm -f $bmf +mdadm --create -e0.90 --run $md0 --level=1 -n3 --delay=1 --bitmap $bmf $dev1 $dev2 $dev3 +check wait +testdev $md0 1 $mdsize0 64 +mdadm -S $md0 + +mdadm --assemble $md0 --bitmap=$bmf $dev1 $dev2 $dev3 +testdev $md0 1 $mdsize0 64 +dirty1=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +sleep 4 +dirty2=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +if [ $dirty1 -lt 400 -o $dirty2 -ne 0 ] +then echo >&2 "ERROR bad 'dirty' counts: $dirty1 and $dirty2" + exit 1 +fi +mdadm $md0 -f $dev2 +testdev $md0 1 $mdsize0 64 +sleep 4 +dirty3=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +if [ $dirty3 -lt 400 ] +then + echo >&2 "ERROR dirty count $dirty3 is too small" + exit 2 +fi + +mdadm -S $md0 + +mdadm --assemble -R $md0 --bitmap=$bmf $dev1 $dev3 +check nosync +mdadm --zero-superblock $dev2 +mdadm $md0 --add $dev2 +check recovery + +dirty4=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +check wait +sleep 4 +dirty5=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +if [ $dirty4 -lt 400 -o $dirty5 -ne 0 ] +then echo echo >&2 "ERROR bad 'dirty' counts at end: $dirty4 $dirty5" + exit 1 +fi + +mdadm -S $md0 +exit 0 diff --git a/tests/05r1-re-add b/tests/05r1-re-add new file mode 100644 index 0000000..fa6bbcb --- /dev/null +++ b/tests/05r1-re-add @@ -0,0 +1,39 @@ + +# +# create a raid1, remove a drive, and readd it. +# resync should be instant. +# Then do some IO first. 
Resync should still be very fast +# + +mdadm -CR $md0 -l1 -n2 -binternal --bitmap-chunk=4 -d1 $dev1 $dev2 +check resync +check wait +testdev $md0 1 $mdsize1a 64 +sleep 4 + +mdadm $md0 -f $dev2 +sleep 1 +mdadm $md0 -r $dev2 +mdadm $md0 -a $dev2 +#cat /proc/mdstat +check nosync + +mdadm $md0 -f $dev2 +sleep 1 +mdadm $md0 -r $dev2 +testdev $md0 1 $mdsize1a 64 +mdadm $md0 -a $dev2 +check wait +blockdev --flushbufs $dev1 $dev2 +cmp --ignore-initial=$[64*512] --bytes=$[$mdsize0*1024] $dev1 $dev2 + +mdadm $md0 -f $dev2; sleep 1 +mdadm $md0 -r $dev2 +if dd if=/dev/zero of=$md0 ; then : ; fi +blockdev --flushbufs $md0 # ensure writes have been sent. +mdadm $md0 -a $dev2 +check recovery +check wait +blockdev --flushbufs $dev1 $dev2 +cmp --ignore-initial=$[64*512] --bytes=$[$mdsize0*1024] $dev1 $dev2 +mdadm -S $md0 diff --git a/tests/05r1-re-add-nosuper b/tests/05r1-re-add-nosuper new file mode 100644 index 0000000..058d602 --- /dev/null +++ b/tests/05r1-re-add-nosuper @@ -0,0 +1,38 @@ + +# +# create a raid1, remove a drive, and readd it. +# resync should be instant. +# Then do some IO first. Resync should still be very fast +# +bmf=$targetdir/bitmap2 +rm -f $bmf +mdadm -B $md0 -l1 -n2 -b$bmf -d1 $dev1 $dev2 +check resync +check wait +testdev $md0 1 $size 1 +sleep 4 + +mdadm $md0 -f $dev2 +sleep 1 +mdadm $md0 -r $dev2 +mdadm $md0 --re-add $dev2 +check nosync + +mdadm $md0 -f $dev2 +sleep 1 +mdadm $md0 -r $dev2 +testdev $md0 1 $size 1 +mdadm $md0 --re-add $dev2 +check wait +cmp --bytes=$[$mdsize0*1024] $dev1 $dev2 + +mdadm $md0 -f $dev2; sleep 1 +mdadm $md0 -r $dev2 +if dd if=/dev/zero of=$md0 ; then : ; fi +blockdev --flushbufs $md0 # make sure writes have been sent +mdadm $md0 --re-add $dev2 +check recovery +check wait +# should BLKFLSBUF and then read $dev1/$dev2... 
+cmp --bytes=$[$mdsize0*1024] $file1 $file2 +mdadm -S $md0 diff --git a/tests/05r1-remove-internalbitmap b/tests/05r1-remove-internalbitmap new file mode 100644 index 0000000..712fd56 --- /dev/null +++ b/tests/05r1-remove-internalbitmap @@ -0,0 +1,18 @@ +# +# create a raid1 with bitmap, remove the bitmap and verify it is still +# gone when re-assembling the array +# +mdadm --create --run $md0 --metadata=0.9 --level=1 -n2 --bitmap internal --bitmap-chunk=4 --delay=1 $dev1 $dev2 +check wait +check bitmap +testdev $md0 1 $mdsize1b 64 +mdadm -Gb none $md0 +check nobitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 + +# Re-assemble the array and verify the bitmap is still present +mdadm --assemble $md0 $dev1 $dev2 +check nobitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 diff --git a/tests/05r1-remove-internalbitmap-v1a b/tests/05r1-remove-internalbitmap-v1a new file mode 100644 index 0000000..a4a9aaf --- /dev/null +++ b/tests/05r1-remove-internalbitmap-v1a @@ -0,0 +1,18 @@ +# +# create a raid1 with bitmap, remove the bitmap and verify it is still +# gone when re-assembling the array +# +mdadm --create --run $md0 --metadata=1.0 --level=1 -n2 --bitmap internal --bitmap-chunk=4 --delay=1 $dev1 $dev2 +check wait +check bitmap +testdev $md0 1 $mdsize1b 64 +mdadm -Gb none $md0 +check nobitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 + +# Re-assemble the array and verify the bitmap is still present +mdadm --assemble $md0 $dev1 $dev2 +check nobitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 diff --git a/tests/05r1-remove-internalbitmap-v1b b/tests/05r1-remove-internalbitmap-v1b new file mode 100644 index 0000000..c0918eb --- /dev/null +++ b/tests/05r1-remove-internalbitmap-v1b @@ -0,0 +1,18 @@ +# +# create a raid1 with bitmap, remove the bitmap and verify it is still +# gone when re-assembling the array +# +mdadm --create --run $md0 --metadata=1.1 --level=1 -n2 --bitmap internal --bitmap-chunk=4 --delay=1 $dev1 $dev2 +check wait +check bitmap +testdev $md0 1 $mdsize1b 
64 +mdadm -Gb none $md0 +check nobitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 + +# Re-assemble the array and verify the bitmap is still present +mdadm --assemble $md0 $dev1 $dev2 +check nobitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 diff --git a/tests/05r1-remove-internalbitmap-v1c b/tests/05r1-remove-internalbitmap-v1c new file mode 100644 index 0000000..15f1fbb --- /dev/null +++ b/tests/05r1-remove-internalbitmap-v1c @@ -0,0 +1,18 @@ +# +# create a raid1 with bitmap, remove the bitmap and verify it is still +# gone when re-assembling the array +# +mdadm --create --run $md0 --metadata=1.2 --level=1 -n2 --bitmap internal --bitmap-chunk=4 --delay=1 $dev1 $dev2 +check wait +check bitmap +testdev $md0 1 $mdsize1b 64 +mdadm -Gb none $md0 +check nobitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 + +# Re-assemble the array and verify the bitmap is still present +mdadm --assemble $md0 $dev1 $dev2 +check nobitmap +testdev $md0 1 $mdsize1b 64 +mdadm -S $md0 diff --git a/tests/05r5-bitmapfile b/tests/05r5-bitmapfile new file mode 100644 index 0000000..6d173d8 --- /dev/null +++ b/tests/05r5-bitmapfile @@ -0,0 +1,49 @@ + +# +# create a raid1 with a bitmap file +# +bmf=$targetdir/bitmap +rm -f $bmf +mdadm --create --run $md0 --level=5 -n3 --delay=1 --bitmap $bmf $dev1 $dev2 $dev3 +check wait +testdev $md0 2 $mdsize1 512 +mdadm -S $md0 + +mdadm --assemble $md0 --bitmap=$bmf $dev1 $dev2 $dev3 +testdev $md0 2 $mdsize1 512 +dirty1=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +sleep 4 +dirty2=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +if [ $dirty1 -lt 400 -o $dirty2 -ne 0 ] +then echo >&2 "ERROR bad 'dirty' counts: $dirty1 and $dirty2" + exit 1 +fi +mdadm $md0 -f $dev1 +testdev $md0 2 $mdsize1 512 +sleep 4 +dirty3=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +if [ $dirty3 -lt 400 ] +then + echo >&2 "ERROR dirty count $dirty3 is too small" + exit 2 +fi + +mdadm -S $md0 + +mdadm --assemble -R $md0 
--bitmap=$bmf $dev2 $dev3 +mdadm --zero $dev1 # force add, not re-add +mdadm $md0 --add $dev1 +check recovery + +dirty4=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +check wait +sleep 4 +dirty5=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +if [ $dirty4 -lt 400 -o $dirty5 -ne 0 ] +then echo echo >&2 "ERROR bad 'dirty' counts at end: $dirty4 $dirty5" + exit 1 +fi + +mdadm -S $md0 diff --git a/tests/05r5-internalbitmap b/tests/05r5-internalbitmap new file mode 100644 index 0000000..13dc592 --- /dev/null +++ b/tests/05r5-internalbitmap @@ -0,0 +1,47 @@ + +# +# create a raid1 with an internal bitmap +# +mdadm --create --run $md0 --level=5 -n3 --delay=1 --bitmap internal --bitmap-chunk=4 $dev1 $dev2 $dev3 +check wait +testdev $md0 2 $mdsize1 512 +mdadm -S $md0 + +mdadm --assemble $md0 $dev1 $dev2 $dev3 +testdev $md0 2 $mdsize1 512 +dirty1=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +sleep 4 +dirty2=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +if [ $dirty1 -lt 400 -o $dirty2 -ne 0 ] +then echo >&2 "ERROR bad 'dirty' counts: $dirty1 and $dirty2" + exit 1 +fi +mdadm $md0 -f $dev1 +testdev $md0 2 $mdsize1 512 +sleep 4 +dirty3=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +if [ $dirty3 -lt 400 ] +then + echo >&2 "ERROR dirty count $dirty3 is too small" + exit 2 +fi + +mdadm -S $md0 + +mdadm --assemble -R $md0 $dev2 $dev3 +mdadm --zero $dev1 # force --add, not --re-add +mdadm $md0 --add $dev1 +check recovery + +dirty4=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +check wait +sleep 4 +dirty5=`mdadm -X $dev2 | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +if [ $dirty4 -lt 400 -o $dirty5 -ne 0 ] +then echo echo >&2 "ERROR bad 'dirty' counts at end: $dirty4 $dirty5" + exit 1 +fi + +mdadm -S $md0 diff --git a/tests/05r6-bitmapfile b/tests/05r6-bitmapfile new file mode 100644 index 0000000..d11896d --- /dev/null +++ 
b/tests/05r6-bitmapfile @@ -0,0 +1,49 @@ + +# +# create a raid1 with a bitmap file +# +bmf=$targetdir/bitmap +rm -f $bmf +mdadm --create --run $md0 --level=6 -n4 --delay=1 --bitmap $bmf $dev1 $dev2 $dev3 $dev4 +check wait +testdev $md0 2 $mdsize1 512 +mdadm -S $md0 + +mdadm --assemble $md0 --bitmap=$bmf $dev1 $dev2 $dev3 $dev4 +testdev $md0 2 $mdsize1 512 +dirty1=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +sleep 4 +dirty2=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +if [ $dirty1 -lt 400 -o $dirty2 -ne 0 ] +then echo >&2 "ERROR bad 'dirty' counts: $dirty1 and $dirty2" + exit 1 +fi +mdadm $md0 -f $dev3 +testdev $md0 2 $mdsize1 512 +sleep 4 +dirty3=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +if [ $dirty3 -lt 400 ] +then + echo >&2 "ERROR dirty count $dirty3 is too small" + exit 2 +fi + +mdadm -S $md0 + +mdadm --assemble -R $md0 --bitmap=$bmf $dev1 $dev2 $dev4 +mdadm --zero $dev3 # force --add, not --re-add +mdadm $md0 --add $dev3 +check recovery + +dirty4=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` +check wait +sleep 4 +dirty5=`mdadm -X $bmf | sed -n -e 's/.*Bitmap.* \([0-9]*\) dirty.*/\1/p'` + +if [ $dirty4 -lt 400 -o $dirty5 -ne 0 ] +then echo echo >&2 "ERROR bad 'dirty' counts at end: $dirty4 $dirty5" + exit 1 +fi + +mdadm -S $md0 diff --git a/tests/05r6tor0 b/tests/05r6tor0 new file mode 100644 index 0000000..2fd51f2 --- /dev/null +++ b/tests/05r6tor0 @@ -0,0 +1,27 @@ +set -x -e + +# reshape a RAID6 to RAID5 and then RAID0. 
+# then reshape back up to RAID5 and RAID5 + +mdadm -CR $md0 -l6 -n5 $dev0 $dev1 $dev2 $dev3 $dev4 +check wait; sleep 1 +check raid6 +testdev $md0 3 19456 512 +mdadm -G $md0 -l5 +check wait; sleep 1 +check raid5 +testdev $md0 3 19456 512 +mdadm -G $md0 -l0 +check wait; sleep 1 +check raid0 +testdev $md0 3 19456 512 +mdadm -G $md0 -l5 --add $dev3 $dev4 +check wait; sleep 1 +check raid5 +check algorithm 2 +testdev $md0 3 19456 512 +mdadm -G $md0 -l 6 +check wait; sleep 1 +check raid6 +check algorithm 2 +testdev $md0 3 19456 512 diff --git a/tests/06name b/tests/06name new file mode 100644 index 0000000..4d5e824 --- /dev/null +++ b/tests/06name @@ -0,0 +1,12 @@ +set -x + +# create an array with a name + +mdadm -CR $md0 -l0 -n2 --metadata=1 --name="Fred" $dev0 $dev1 +mdadm -E $dev0 | grep 'Name : [^:]*:Fred ' > /dev/null || exit 1 +mdadm -D $md0 | grep 'Name : [^:]*:Fred ' > /dev/null || exit 1 +mdadm -S $md0 + +mdadm -A $md0 --name="Fred" $devlist +#mdadm -Db $md0 +mdadm -S $md0 diff --git a/tests/06sysfs b/tests/06sysfs new file mode 100644 index 0000000..af63ef4 --- /dev/null +++ b/tests/06sysfs @@ -0,0 +1,11 @@ +exit 0 +mdadm -CR $md0 -l1 -n3 $dev1 $dev2 $dev3 + +ls -Rl /sys/block/md0 + +cat /sys/block/md0/md/level +cat /sys/block/md0/md/raid_disks + +mdadm -S $md0 + +exit 1 diff --git a/tests/06wrmostly b/tests/06wrmostly new file mode 100644 index 0000000..968c197 --- /dev/null +++ b/tests/06wrmostly @@ -0,0 +1,13 @@ + +# create a raid1 array with a wrmostly device + +mdadm -CR $md0 -l1 -n3 $dev0 $dev1 --write-mostly $dev2 +testdev $md0 1 $mdsize1a 64 + +# unfortunately, we cannot measure if any read requests are going to $dev2 + +mdadm -S $md0 + +mdadm -CR $md0 -l1 -n3 --write-behind --bitmap=internal --bitmap-chunk=4 $dev0 $dev1 --write-mostly $dev2 +testdev $md0 1 $mdsize1a 64 +mdadm -S $md0 diff --git a/tests/07autoassemble b/tests/07autoassemble new file mode 100644 index 0000000..e689be7 --- /dev/null +++ b/tests/07autoassemble @@ -0,0 +1,24 @@ + +# create 
two raid1s, build a raid0 on top, then +# tear it down and get auto-assemble to rebuild it. + +mdadm -CR $md1 -l1 -n2 $dev0 $dev1 --homehost=testing +mdadm -CR $md2 -l1 -n2 $dev2 $dev3 --homehost=testing +mdadm -CR $md0 -l0 -n2 $md1 $md2 --homehost=testing + +mdadm -Ss +mdadm -As -c /dev/null --homehost=testing -vvv +testdev $md1 1 $mdsize1a 64 +testdev $md2 1 $mdsize1a 64 +testdev $md0 2 $mdsize11a 512 +mdadm -Ss + +mdadm --zero-superblock $dev0 $dev1 $dev2 $dev3 +## Now the raid0 uses one stacked and one not +mdadm -CR $md1 -l1 -n2 $dev0 $dev1 --homehost=testing +mdadm -CR $md0 -l0 -n2 $md1 $dev2 --homehost=testing +mdadm -Ss +mdadm -As -c /dev/null --homehost=testing -vvv +testdev $md1 1 $mdsize1a 64 +testdev $md0 1 $[mdsize1a+mdsize11a] 512 +mdadm -Ss diff --git a/tests/07autodetect b/tests/07autodetect new file mode 100644 index 0000000..917e0d6 --- /dev/null +++ b/tests/07autodetect @@ -0,0 +1,34 @@ + +# +# Test in-kernel autodetect. +# Create a partitionable array on each of two devices, +# put a partition on each, create an array, and see if we can +# use autodetect to restart the array. + +if lsmod | grep md_mod > /dev/null 2>&1 +then + echo md is a module - cannot test autodetect + exit 0 +fi + + +mdadm -CR -e 0 $mdp0 -l0 -f -n1 $dev0 +mdadm -CR -e 0 $mdp1 -l0 -f -n1 $dev1 +udevadm settle +sfdisk $mdp0 >&2 << END +,,FD +END +sfdisk $mdp1 >&2 << END +,,FD +END +udevadm settle +mdadm -CR -e 0 $md0 -l1 -n2 ${mdp0}p1 ${mdp1}p1 +check resync +check raid1 +check wait +mdadm -S $md0 +mdadm --auto-detect +check raid1 + +mdadm -Ss +exit 0 diff --git a/tests/07changelevelintr b/tests/07changelevelintr new file mode 100644 index 0000000..18c6309 --- /dev/null +++ b/tests/07changelevelintr @@ -0,0 +1,61 @@ + +# +# test that we can stop and restart a level change. +# just test a few in-place changes, and a few +# size-reducing changes. 
+ + +checkgeo() { + # check the geometry of an array + # level raid_disks chunk_size layout + dev=$1 + shift + sleep 0.5 + check wait + sleep 1 + for attr in level raid_disks chunk_size layout + do + if [ $# -gt 0 ] ; then + val=$1 + shift + if [ " `cat /sys/block/$dev/md/$attr`" != " $val" ] + then echo "$attr doesn't match for $dev" + exit 1 + fi + fi + done +} + +restart() { + sleep 0.5 + check reshape + mdadm -S $md0 + mdadm -A $md0 $devs --backup-file=$bu + sleep 0.5 + check reshape +} + +bu=/tmp/md-backup +rm -f $bu +devs="$dev0 $dev1 $dev2 $dev3 $dev4" +mdadm -CR $md0 -l5 -n5 -c 256 $devs +checkgeo md0 raid5 5 $[256*1024] 2 + +mdadm -G $md0 -c 128 --backup-file=$bu +restart +checkgeo md0 raid5 5 $[128*1024] 2 + +mdadm -G $md0 --layout rs --backup-file=$bu +restart +checkgeo md0 raid5 5 $[128*1024] 3 + +mdadm -G $md0 --array-size 58368 +mdadm -G $md0 --raid-disks 4 -c 64 --backup-file=$bu +restart +checkgeo md0 raid5 4 $[64*1024] 3 + +devs="$dev0 $dev1 $dev2 $dev3" +mdadm -G $md0 --array-size 19456 +mdadm -G $md0 -n 2 -c 256 --backup-file=$bu +restart +checkgeo md0 raid5 2 $[256*1024] 3 diff --git a/tests/07changelevels b/tests/07changelevels new file mode 100644 index 0000000..a328874 --- /dev/null +++ b/tests/07changelevels @@ -0,0 +1,114 @@ + +# Test changing of level, chunksize etc. 
+# Create a RAID1, convert to RAID5, add a disk, add another disk +# convert to RAID6, back to RAID5 and ultimately to RAID1 + +testK=$[64*3*6] +dd if=/dev/urandom of=/tmp/RandFile bs=1024 count=$testK +export MDADM_GROW_VERIFY=1 + +dotest() { + sleep 2 + check wait + testdev $md0 $1 19968 64 nd + blockdev --flushbufs $md0 + cmp -s -n $[textK*1024] $md0 /tmp/RandFile || { echo cmp failed; exit 2; } + # write something new - shift chars 4 space + tr ' -~' '$-~ -#' < /tmp/RandFile > /tmp/RandFile2 + mv /tmp/RandFile2 /tmp/RandFile + dd if=/tmp/RandFile of=$md0 +} + +checkgeo() { + # check the geometry of an array + # level raid_disks chunk_size layout + dev=$1 + shift + sleep 0.5 + check wait + sleep 1 + for attr in level raid_disks chunk_size layout + do + if [ $# -gt 0 ] ; then + val=$1 + shift + if [ " `cat /sys/block/$dev/md/$attr`" != " $val" ] + then echo "$attr doesn't match for $dev" + exit 1 + fi + fi + done +} + + +bu=/tmp/md-test-backup +rm -f $bu +mdadm -CR $md0 -l1 -n2 -x1 $dev0 $dev1 $dev2 -z 19968 +testdev $md0 1 $mdsize1a 64 +dd if=/tmp/RandFile of=$md0 +dotest 1 + +mdadm --grow $md0 -l5 -n3 --chunk 64 +dotest 2 + +mdadm $md0 --add $dev3 $dev4 +mdadm --grow $md0 -n4 --chunk 32 +dotest 3 + +mdadm -G $md0 -l6 --backup-file $bu +dotest 3 + +mdadm -G /dev/md0 --array-size 39936 +mdadm -G $md0 -n4 --backup-file $bu +checkgeo md0 raid6 4 $[32*1024] +dotest 2 + +mdadm -G $md0 -l5 --backup-file $bu +checkgeo md0 raid5 3 $[32*1024] +dotest 2 + +mdadm -G /dev/md0 --array-size 19968 +mdadm -G $md0 -n2 --backup-file $bu +checkgeo md0 raid5 2 $[32*1024] +dotest 1 + +mdadm -G --level=1 $md0 +dotest 1 + +# now repeat that last few steps only with a degraded array. 
+mdadm -S $md0 +mdadm -CR $md0 -l6 -n5 $dev0 $dev1 $dev2 $dev3 $dev4 +dd if=/tmp/RandFile of=$md0 +dotest 3 + +mdadm $md0 --fail $dev0 + +mdadm -G /dev/md0 --array-size 37888 +mdadm -G $md0 -n4 --backup-file $bu +dotest 2 +checkgeo md0 raid6 4 $[512*1024] +mdadm $md0 --fail $dev4 + +mdadm $md0 --fail $dev3 +# now double-degraded. +# switch layout to a DDF layout and back to make sure that works. + +mdadm -G /dev/md0 --layout=ddf-N-continue --backup-file $bu +checkgeo md0 raid6 4 $[512*1024] 10 +dotest 2 +mdadm -G /dev/md0 --layout=ra --backup-file $bu +checkgeo md0 raid6 4 $[512*1024] 1 +dotest 2 + +mdadm -G $md0 -l5 --backup-file $bu +dotest 2 + +mdadm -G /dev/md0 --array-size 18944 +mdadm -G $md0 -n2 --backup-file $bu +dotest 1 +checkgeo md0 raid5 2 $[512*1024] +mdadm $md0 --fail $dev2 + +mdadm -G --level=1 $md0 +dotest 1 +checkgeo md0 raid1 2 diff --git a/tests/07layouts b/tests/07layouts new file mode 100644 index 0000000..acd1a80 --- /dev/null +++ b/tests/07layouts @@ -0,0 +1,91 @@ + +# check that kernel an restripe interpret all the different layouts +# the same +# This involves changing the layout to each different possibility +# while MDADM_GROW_VERIFY is set. 
+ +testK=$[64*3*6] +dd if=/dev/urandom of=/tmp/RandFile bs=1024 count=$testK +export MDADM_GROW_VERITY=1 + + +dotest() { + sleep 0.5 + check wait + testdev $md0 $1 $mdsize1 512 nd + blockdev --flushbufs $md0 + cmp -s -n $[textK*1024] $md0 /tmp/RandFile || { echo cmp failed; exit 2; } + # write something new - shift chars 4 space + tr ' -~' '$-~ -#' < /tmp/RandFile > /tmp/RandFile2 + mv /tmp/RandFile2 /tmp/RandFile + dd if=/tmp/RandFile of=$md0 +} + +checkgeo() { + # check the geometry of an array + # level raid_disks chunk_size layout + dev=$1 + shift + sleep 0.5 + check wait + for attr in level raid_disks chunk_size layout + do + if [ $# -gt 0 ] ; then + val=$1 + shift + if [ " `sed 's/ .*//' /sys/block/$dev/md/$attr`" != " $val" ] + then echo "$attr doesn't match for $dev" + exit 1 + fi + fi + done +} + + +bu=/tmp/md-test-backup +rm -f $bu + +# first a degraded 5 device raid5 +mdadm -CR $md0 -l5 -n5 $dev0 $dev1 missing $dev2 $dev3 +dd if=/tmp/RandFile of=$md0 +dotest 4 + +l5[0]=la +l5[1]=ra +l5[2]=ls +l5[3]=rs +l5[4]=parity-first +l5[5]=parity-last +for layout in 0 1 2 3 4 5 0 +do + mdadm -G $md0 --layout=${l5[$layout]} --backup-file $bu + checkgeo md0 raid5 5 $[512*1024] $layout + dotest 4 +done + +mdadm -S $md0 +# now a doubly degraded raid6 +mdadm -CR $md0 -l6 -n5 $dev0 missing $dev2 missing $dev4 +dd if=/tmp/RandFile of=$md0 +dotest 3 + +l6[0]=la +l6[1]=ra +l6[2]=ls +l6[3]=rs +l6[4]=parity-first +l6[5]=parity-last +l6[8]=ddf-zero-restart +l6[9]=ddf-N-restart +l6[10]=ddf-N-continue +l6[16]=left-asymmetric-6 +l6[17]=right-asymmetric-6 +l6[18]=left-symmetric-6 +l6[19]=right-symmetric-6 +l6[20]=parity-first-6 +for layout in 0 1 2 3 4 5 8 9 10 16 17 18 19 20 0 +do + mdadm -G $md0 --layout=${l6[$layout]} --backup-file $bu + checkgeo md0 raid6 5 $[512*1024] $layout + dotest 3 +done diff --git a/tests/07reshape5intr b/tests/07reshape5intr new file mode 100644 index 0000000..0f4803a --- /dev/null +++ b/tests/07reshape5intr @@ -0,0 +1,41 @@ + +# +# test interrupting 
and restarting raid5 reshape. +set -x +devs="$dev1" +st=UU +for disks in 2 3 4 5 +do + eval devs=\"$devs \$dev$disks\" + st=U$st + for d in $devs + do dd if=/dev/urandom of=$d bs=1024 || true + done + + case $disks in + 2 | 3) chunk=1024;; + 4 ) chunk=512;; + 5 ) chunk=256;; + esac + + mdadm -CR $md0 -amd -l5 -c $chunk -n$disks --assume-clean $devs + mdadm $md0 --add $dev6 + echo 20 > /proc/sys/dev/raid/speed_limit_min + echo 20 > /proc/sys/dev/raid/speed_limit_max + mdadm --grow $md0 -n $[disks+1] + check reshape + check state $st + mdadm --stop $md0 + mdadm --assemble $md0 $devs $dev6 + check reshape + echo 1000 > /proc/sys/dev/raid/speed_limit_min + echo 2000 > /proc/sys/dev/raid/speed_limit_max + check wait + while ! echo check > /sys/block/md0/md/sync_action; do sleep 0.1; done + check wait + mm=`cat /sys/block/md0/md/mismatch_cnt` + if [ $mm -gt 0 ] + then echo >&2 "ERROR mismatch_cnt non-zero : $mm" ; exit 1 + fi + mdadm -S $md0 +done diff --git a/tests/07revert-grow b/tests/07revert-grow new file mode 100644 index 0000000..c8c4e85 --- /dev/null +++ b/tests/07revert-grow @@ -0,0 +1,52 @@ +set -e -x + +# revert a reshape that is increasing the number of devices, +# raid5, raid6, and raid10 + +# metadata 0.90 cannot handle RAID10 growth +# metadata 1.0 doesn't get a default headspace, so don't try it either. 
+ +for metadata in 0.90 1.1 1.2 +do +# RAID5 +mdadm -CR --assume-clean $md0 -l5 -n4 -x1 $devlist4 --metadata=$metadata +check raid5 +testdev $md0 3 $mdsize1 512 +mdadm -G $md0 -n 5 +sleep 3 +mdadm -S $md0 +mdadm -A $md0 --update=revert-reshape $devlist4 --backup-file=/tmp/md-backup +check wait +check raid5 +testdev $md0 3 $mdsize1 512 +mdadm -S $md0 + +# RAID6 +mdadm -CR --assume-clean $md0 -l6 -n4 -x1 $devlist4 --metadata=$metadata +check raid6 +testdev $md0 2 $mdsize1 512 +mdadm -G $md0 -n 5 +sleep 3 +mdadm -S $md0 +mdadm -A $md0 --update=revert-reshape $devlist4 --backup-file=/tmp/md-backup +check wait +check raid6 +testdev $md0 2 $mdsize1 512 +mdadm -S $md0 + +if [ $metadata = 0.90 ]; then continue; fi + +# RAID10 +mdadm -CR --assume-clean $md0 -l10 -n4 -x1 $devlist4 --metadata=$metadata +check raid10 +testdev $md0 2 $mdsize1 512 +mdadm -G $md0 -n 5 +sleep 3 +mdadm -S $md0 +strace -o /tmp/str ./mdadm -A $md0 --update=revert-reshape $devlist4 +check wait +check raid10 +testdev $md0 2 $mdsize1 512 +mdadm -S $md0 + +done diff --git a/tests/07revert-inplace b/tests/07revert-inplace new file mode 100644 index 0000000..a73eb97 --- /dev/null +++ b/tests/07revert-inplace @@ -0,0 +1,44 @@ +set -e -x + +# revert a reshape that is not changing the number of data devices, +# raid5, raid6, and raid10 + +# RAID5 -> RAID6 +mdadm -CR --assume-clean $md0 -l5 -n4 -x1 $devlist4 +check raid5 +testdev $md0 3 $mdsize1 512 +mdadm -G $md0 -l 6 +sleep 2 +mdadm -S $md0 +mdadm -A $md0 --update=revert-reshape $devlist4 --backup-file=/tmp/md-backup +check wait +check raid6 +check algorithm 18 +testdev $md0 3 $mdsize1 512 +mdadm -S $md0 + +# RAID6 -> RAID5 +mdadm -CR --assume-clean $md0 -l6 -n5 $devlist4 +check raid6 +testdev $md0 3 $mdsize1 512 +mdadm -G $md0 -l 5 +sleep 2 +mdadm -S $md0 +mdadm -A $md0 --update=revert-reshape $devlist4 --backup-file=/tmp/md-backup +check wait +check raid6 +testdev $md0 3 $mdsize1 512 +mdadm -S $md0 + +# RAID10 - decrease chunk size +mdadm -CR 
--assume-clean $md0 -l10 -n6 -c 64 $devlist5 +check raid10 +testdev $md0 3 $mdsize1 64 +mdadm -G $md0 -c 32 +sleep 2 +mdadm -S $md0 +strace -o /tmp/str ./mdadm -A $md0 --update=revert-reshape $devlist5 +check wait +check raid10 +testdev $md0 3 $mdsize1 64 +mdadm -S $md0 diff --git a/tests/07revert-shrink b/tests/07revert-shrink new file mode 100644 index 0000000..62b5ae0 --- /dev/null +++ b/tests/07revert-shrink @@ -0,0 +1,56 @@ +set -e -x + +# revert a reshape that is decreasing the number of devices, +# raid5, raid6, and raid10 + +bu=$targetdir/md-backup +rm -f $bu +# RAID5 +mdadm -CR --assume-clean $md0 -l5 -n5 $devlist4 +check raid5 +testdev $md0 4 $mdsize1 512 +mdadm --grow $md0 --array-size 56832 +testdev $md0 3 $mdsize1 512 +mdadm -G $md0 -n 4 --backup=$bu +sleep 3 +mdadm -S $md0 +mdadm -A $md0 --update=revert-reshape $devlist4 --backup-file=$bu +check wait +check raid5 +fsck -f -n $md0 +testdev $md0 4 $mdsize1 512 +mdadm -S $md0 + +#FIXME +rm -f $bu +# RAID6 +mdadm -CR --assume-clean $md0 -l6 -n5 $devlist4 +check raid6 +testdev $md0 3 $mdsize1 512 +mdadm --grow $md0 --array-size 37888 +testdev $md0 2 $mdsize1 512 +mdadm -G $md0 -n 4 --backup=$bu +sleep 2 +mdadm -S $md0 +mdadm -A $md0 --update=revert-reshape $devlist4 --backup-file=$bu +check wait +check raid6 +fsck -f -n $md0 +testdev $md0 3 $mdsize1 512 +mdadm -S $md0 + +# RAID10 +mdadm -CR --assume-clean $md0 -l10 -n6 $devlist5 +check raid10 +testdev $md0 3 $mdsize1 512 +mdadm --grow $md0 --array-size 36864 +testdev $md0 2 $mdsize1 512 +mdadm -G $md0 -n 4 +sleep 3 +mdadm -S $md0 +mdadm -A $md0 --update=revert-reshape $devlist5 +check wait +check raid10 +fsck -f -n $md0 +testdev $md0 3 $mdsize1 512 +mdadm -S $md0 diff --git a/tests/07testreshape5 b/tests/07testreshape5 new file mode 100644 index 0000000..0e1f25f --- /dev/null +++ b/tests/07testreshape5 @@ -0,0 +1,45 @@ + +# +# test the reshape code by using test_reshape and the +# kernel md code to move data into and out of variously +# shaped md arrays. 
+set -x +layouts=(la ra ls rs) +for level in 5 6 +do +for chunk in 4 8 16 32 64 128 +do + devs="$dev1" + for disks in 2 3 4 5 6 + do + eval devs=\"$devs \$dev$disks\" + if [ " $level $disks" = " 6 3" -o " $level $disks" = " 6 2" ] + then continue + fi + for nlayout in 0 1 2 3 + do + layout=${layouts[$nlayout]} + + size=$[chunk*(disks-(level-4))*disks] + + # test restore: make a raid5 from a file, then do a compare + dd if=/dev/urandom of=/tmp/RandFile bs=1024 count=$size + $dir/test_stripe restore /tmp/RandFile $disks $[chunk*1024] $level $nlayout 0 $[size*1024] $devs + mdadm -CR -e 1.0 $md0 -amd -l$level -n$disks --assume-clean -c $chunk -p $layout $devs + cmp -s -n $[size*1024] $md0 /tmp/RandFile || { echo cmp failed ; exit 2; } + + # FIXME check parity + + # test save + dd if=/dev/urandom of=$md0 bs=1024 count=$size + blockdev --flushbufs $md0 $devs; sync + > /tmp/NewRand + $dir/test_stripe save /tmp/NewRand $disks $[chunk*1024] $level $nlayout 0 $[size*1024] $devs + cmp -s -n $[size*1024] $md0 /tmp/NewRand || { echo cmp failed ; exit 2; } + mdadm -S $md0 + udevadm settle + done + done +done +done +exit 0 diff --git a/tests/09imsm-assemble b/tests/09imsm-assemble new file mode 100644 index 0000000..d7028c6 --- /dev/null +++ b/tests/09imsm-assemble @@ -0,0 +1,73 @@ +# validate the prodigal member disk scenario i.e. a former container +# member is returned after having been rebuilt on another system + + +imsm_check_hold() { + if mdadm --remove $1 $2; then + echo "$2 removal from $1 should have been blocked" >&2 + cat /proc/mdstat >&2 + mdadm -E $2 + exit 1 + fi +} + +imsm_check_removal() { + if ! 
mdadm --remove $1 $2 ; then + echo "$2 removal from $1 should have succeeded" >&2 + cat /proc/mdstat >&2 + mdadm -E $2 + exit 1 + fi +} + +export IMSM_DEVNAME_AS_SERIAL=1 +export IMSM_TEST_OROM=1 +export IMSM_NO_PLATFORM=1 +container=/dev/md/container +member=/dev/md/vol0 + + +num_disks=4 +size=$((10*1024)) +mdadm -CR $container -e imsm -n $num_disks $dev0 $dev1 $dev2 $dev3 +mdadm -CR $member $dev0 $dev2 -n 2 -l 1 -z $size +mdadm --wait $member || true +mdadm -Ss + +# make dev0 and dev1 a new rebuild family +mdadm -A $container $dev0 $dev1 +mdadm -IR $container +mdadm --wait ${member}_0 || true +mdadm -Ss + +# make dev2 and dev3 a new rebuild family +mdadm -A $container $dev2 $dev3 +mdadm -IR $container +mdadm --wait ${member}_0 || true +mdadm -Ss + +# reassemble and make sure one of the families falls out +mdadm -A $container $dev0 $dev1 $dev2 $dev3 +mdadm -IR $container +testdev ${member}_0 1 $size 64 +if mdadm --remove $container $dev0 ; then + # the dev[23] family won + imsm_check_removal $container $dev1 + imsm_check_hold $container $dev2 + imsm_check_hold $container $dev3 +else + # the dev[01] family won + imsm_check_hold $container $dev1 + imsm_check_removal $container $dev2 + imsm_check_removal $container $dev3 +fi +mdadm -Ss + +# reassemble with a new id for the dev[23] family +mdadm -A $container $dev0 $dev1 +mdadm -IR $container +mdadm -A ${container}2 $dev2 $dev3 --update=uuid +mdadm -IR ${container}2 + +testdev ${member}_0 1 $size 64 +testdev ${member}_1 1 $size 64 diff --git a/tests/09imsm-create-fail-rebuild b/tests/09imsm-create-fail-rebuild new file mode 100644 index 0000000..f09b437 --- /dev/null +++ b/tests/09imsm-create-fail-rebuild @@ -0,0 +1,78 @@ +# sanity check array creation + +imsm_check_hold() { + if mdadm --remove $1 $2; then + echo "$2 removal from $1 should have been blocked" >&2 + cat /proc/mdstat >&2 + mdadm -E $2 + exit 1 + fi +} + +imsm_check_removal() { + if ! 
mdadm --remove $1 $2 ; then + echo "$2 removal from $1 should have succeeded" >&2 + cat /proc/mdstat >&2 + mdadm -E $2 + exit 1 + fi +} + +. tests/env-imsm-template + +# IMSM rounds to multiples of one mebibyte - 1024K +DEV_ROUND_K=1024 + +num_disks=2 +mdadm -CR $container -e imsm -n $num_disks $dev0 $dev1 +imsm_check container $num_disks + +# RAID0 + RAID1 +size=9000 +level=0 +chunk=64 +offset=0 +mdadm -CR $member0 $dev0 $dev1 -n $num_disks -l $level -z $size -c $chunk +imsm_check member $member0 $num_disks $level $size $((size*2)) $offset $chunk +testdev $member0 $num_disks $size $chunk + +offset=$(((size & ~(1024 - 1)) + 4096)) +size=4000 +level=1 +chunk=0 +mdadm -CR $member1 $dev0 $dev1 -n $num_disks -l $level -z $size +imsm_check member $member1 $num_disks $level $size $size $offset $chunk +testdev $member1 1 $size 64 +check wait + +mdadm -Ss + +# RAID10 + RAID5 +num_disks=4 +mdadm -CR $container -e imsm -n $num_disks $dev0 $dev1 $dev2 $dev3 +imsm_check container $num_disks + +size=9000 +level=10 +chunk=64 +offset=0 +mdadm -CR $member0 $dev0 $dev1 $dev2 $dev3 -n $num_disks -l $level -z $size -c $chunk +imsm_check member $member0 $num_disks $level $size $((size*2)) $offset $chunk +testdev $member0 $((num_disks-2)) $size $chunk + +offset=$(((size & ~(1024 - 1)) + 4096)) +size=4000 +level=5 +mdadm -CR $member1 $dev0 $dev1 $dev2 $dev3 -n $num_disks -l $level -z $size -c $chunk +imsm_check member $member1 $num_disks $level $size $((size*3)) $offset $chunk +testdev $member1 $((num_disks-1)) $size $chunk +check wait + +# FAIL / REBUILD +imsm_check_hold $container $dev0 +mdadm --fail $member0 $dev0 +mdadm --wait-clean --scan || true +imsm_check_removal $container $dev0 +mdadm --add $container $dev4 +check wait +imsm_check_hold $container $dev4 diff --git a/tests/09imsm-overlap b/tests/09imsm-overlap new file mode 100644 index 0000000..ff5d209 --- /dev/null +++ b/tests/09imsm-overlap @@ -0,0 +1,28 @@ + +. 
tests/env-imsm-template + +# create raid arrays with varying degrees of overlap +mdadm -CR $container -e imsm -n 6 $dev0 $dev1 $dev2 $dev3 $dev4 $dev5 +imsm_check container 6 + +size=1024 +level=1 +num_disks=2 +mdadm -CR $member0 $dev0 $dev1 -n $num_disks -l $level -z $size +mdadm -CR $member1 $dev1 $dev2 -n $num_disks -l $level -z $size +mdadm -CR $member2 $dev2 $dev3 -n $num_disks -l $level -z $size +mdadm -CR $member3 $dev3 $dev4 -n $num_disks -l $level -z $size +mdadm -CR $member4 $dev4 $dev5 -n $num_disks -l $level -z $size + +udevadm settle + +offset=0 +imsm_check member $member0 $num_disks $level $size 1024 $offset +offset=$((offset+size+4096)) +imsm_check member $member1 $num_disks $level $size 1024 $offset +offset=$((offset+size+4096)) +imsm_check member $member2 $num_disks $level $size 1024 $offset +offset=$((offset+size+4096)) +imsm_check member $member3 $num_disks $level $size 1024 $offset +offset=$((offset+size+4096)) +imsm_check member $member4 $num_disks $level $size 1024 $offset diff --git a/tests/10ddf-assemble-missing b/tests/10ddf-assemble-missing new file mode 100644 index 0000000..4bf21b2 --- /dev/null +++ b/tests/10ddf-assemble-missing @@ -0,0 +1,61 @@ +# An array is assembled incompletely. +# Re missing disks get marked as missing and are not allowed back in + +. 
tests/env-ddf-template +tmp=$(mktemp /tmp/mdtest-XXXXXX) +rm -f $tmp /var/tmp/mdmon.log +ret=0 + +mdadm -CR $container -e ddf -n 4 $dev8 $dev9 $dev10 $dev11 +ddf_check container 4 + +mdadm -CR $member1 -n 4 -l 10 $dev8 $dev10 $dev9 $dev11 -z 10000 +mdadm -CR $member0 -n 2 -l 1 $dev8 $dev9 -z 10000 + +mdadm --wait $member0 || true +mdadm --wait $member1 || true + +mdadm -Ss +sleep 1 + +# Add all devices except those for $member0 +mdadm -I $dev10 +mdadm -I $dev11 + +# Start runnable members +mdadm -IRs || true +mdadm -Ss + +#[ -f /var/tmp/mdmon.log ] && cat /var/tmp/mdmon.log + +# Now reassemble +# This should work because BVDs weren't written to +for d in $dev8 $dev9 $dev10 $dev11; do + mdadm -I $d +done +mdadm -Ss + +# Expect consistent state +for d in $dev10 $dev11; do + mdadm -E $d>$tmp + egrep 'state\[0\] : Degraded, Consistent' $tmp || { + ret=1 + echo ERROR: $member0 has unexpected state on $d + } + egrep 'state\[1\] : Optimal, Consistent' $tmp || { + ret=1 + echo ERROR: $member1 has unexpected state on $d + } + + if [ x$(egrep -c 'active/Online$' $tmp) != x2 ]; then + ret=1 + echo ERROR: unexpected number of online disks on $d + fi +done + +if [ $ret -ne 0 ]; then + mdadm -E $dev10 + mdadm -E $dev8 +fi +rm -f $tmp /var/tmp/mdmon.log +[ $ret -eq 0 ] diff --git a/tests/10ddf-create b/tests/10ddf-create new file mode 100644 index 0000000..44e9544 --- /dev/null +++ b/tests/10ddf-create @@ -0,0 +1,89 @@ +# +# Test basic DDF functionality. +# +# Create a container with 5 drives +# create a small raid0 across them all, +# then a small raid10 using 4 drives, then a 2disk raid1 +# and a 3disk raid5 using the remaining space +# +# add some data, tear down the array, reassemble +# and make sure it is still there. +set -e +. 
tests/env-ddf-template +sda=$(get_rootdev) || exit 1 + +mdadm -CR /dev/md/ddf0 -e ddf -n 5 $dev8 $dev9 $dev10 $dev11 $dev12 +mdadm -CR r5 -l5 -n5 /dev/md/ddf0 -z 5000 +if mdadm -CR r5 -l1 -n2 /dev/md/ddf0 -z 5000 +then echo >&2 create with same name should fail ; exit 1 +fi +mdadm -CR r10 -l10 -n4 -pn2 /dev/md/ddf0 -z 5000 +mdadm -CR r1 -l1 -n2 /dev/md/ddf0 +mdadm -CR r0 -l0 -n3 /dev/md/ddf0 +testdev /dev/md/r5 4 5000 512 +testdev /dev/md/r10 2 5000 512 +# r0/r10 will use 4608 due to chunk size, so that leaves 23552 for the rest +testdev /dev/md/r1 1 23552 64 +testdev /dev/md/r0 3 23552 512 +dd if=$sda of=/dev/md/r0 || true +dd if=$sda of=/dev/md/r10 || true +dd if=$sda of=/dev/md/r1 || true +dd if=$sda of=/dev/md/r5 || true + +s0=`sha1sum /dev/md/r0` +s10=`sha1sum /dev/md/r10` +s1=`sha1sum /dev/md/r1` +s5=`sha1sum /dev/md/r5` + + +mdadm -Ss +mdadm -A /dev/md/ddf0 $dev8 $dev9 $dev10 $dev11 $dev12 +mdadm -I /dev/md/ddf0 + +udevadm settle +s0a=`sha1sum /dev/md/r0` +s10a=`sha1sum /dev/md/r10` +s1a=`sha1sum /dev/md/r1` +s5a=`sha1sum /dev/md/r5` + +if [ "$s0" != "$s0a" ]; then + echo r0 did not match ; exit 1; +fi +if [ "$s10" != "$s10a" ]; then + echo r10 did not match ; exit 1; +fi +if [ "$s1" != "$s1a" ]; then + echo r1 did not match ; exit 1; +fi +if [ "$s5" != "$s5a" ]; then + echo r5 did not match ; exit 1; +fi + +# failure status just means it has completed already, so ignore it. +mdadm --wait /dev/md/r1 || true +mdadm --wait /dev/md/r10 || true +mdadm --wait /dev/md/r5 || true + +mdadm -Dbs > /var/tmp/mdadm.conf + +mdadm -Ss + +# Now try to assemble using mdadm.conf +mdadm -Asc /var/tmp/mdadm.conf +check nosync # This failed once. The raid5 was resyncing. +udevadm settle +mdadm -Dbs | sort > /tmp/mdadm.conf +sort /var/tmp/mdadm.conf | diff /tmp/mdadm.conf - +mdadm -Ss + +# and now assemble fully incrementally. 
+for i in $dev8 $dev9 $dev10 $dev11 $dev12 +do + mdadm -I $i -c /var/tmp/mdadm.conf +done +check nosync +udevadm settle +mdadm -Dbs | sort > /tmp/mdadm.conf +sort /var/tmp/mdadm.conf | diff /tmp/mdadm.conf - +mdadm -Ss +rm /tmp/mdadm.conf /var/tmp/mdadm.conf diff --git a/tests/10ddf-create-fail-rebuild b/tests/10ddf-create-fail-rebuild new file mode 100644 index 0000000..a8e8ced --- /dev/null +++ b/tests/10ddf-create-fail-rebuild @@ -0,0 +1,77 @@ +# sanity check array creation + +ddf_check_hold() { + if mdadm --remove $1 $2; then + echo "$2 removal from $1 should have been blocked" >&2 + cat /proc/mdstat >&2 + mdadm -E $2 + exit 1 + fi +} + +ddf_check_removal() { + if ! mdadm --remove $1 $2 ; then + echo "$2 removal from $1 should have succeeded" >&2 + cat /proc/mdstat >&2 + mdadm -E $2 + exit 1 + fi +} + +. tests/env-ddf-template + +num_disks=2 +mdadm -CR $container -e ddf -n $num_disks $dev8 $dev9 +ddf_check container $num_disks + +# RAID0 + RAID1 +size=9000 +level=0 +chunk=64 +offset=0 +layout=0 +mdadm -CR $member0 $dev8 $dev9 -n $num_disks -l $level -z $size -c $chunk +ddf_check member $member0 $num_disks $level $size $((size*2)) $offset $chunk $layout +testdev $member0 $num_disks $size $chunk + +offset=$(((size & ~(chunk - 1)))) +size=4000 +level=1 +chunk=0 +mdadm -CR $member1 $dev8 $dev9 -n $num_disks -l $level -z $size +ddf_check member $member1 $num_disks $level $size $size $offset $chunk $layout +testdev $member1 1 $size 1 +check wait + +mdadm -Ss + +# RAID10 + RAID5 +num_disks=4 +mdadm -CR $container -e ddf -n $num_disks $dev8 $dev9 $dev10 $dev11 +ddf_check container $num_disks + +size=9000 +level=10 +chunk=64 +offset=0 +layout=2 +mdadm -CR $member0 $dev8 $dev9 $dev10 $dev11 -n $num_disks -l $level -z $size -c $chunk +ddf_check member $member0 $num_disks $level $size $((size*2)) $offset $chunk $layout +testdev $member0 $((num_disks-2)) $size $chunk + +offset=$(((size & ~(chunk - 1)))) +size=4000 +level=5 +mdadm -CR $member1 $dev8 $dev9 $dev10 $dev11 -n 
$num_disks -l $level -z $size -c $chunk +ddf_check member $member1 $num_disks $level $size $((size*3)) $offset $chunk $layout +testdev $member1 $((num_disks-1)) $size $chunk +check wait + +# FAIL / REBUILD +ddf_check_hold $container $dev8 +mdadm --fail $member0 $dev8 +mdadm --wait-clean --scan || true +ddf_check_removal $container $dev8 +mdadm --add $container $dev12 +check wait +ddf_check_hold $container $dev12 diff --git a/tests/10ddf-fail-create-race b/tests/10ddf-fail-create-race new file mode 100644 index 0000000..bd5dfb5 --- /dev/null +++ b/tests/10ddf-fail-create-race @@ -0,0 +1,66 @@ +# This test creates a RAID1, fails a disk, and immediately +# (simultaneously) creates a new array. This tests for a possible +# race where the meta data reflecting the disk failure may not +# be written when the 2nd array is created. +. tests/env-ddf-template + +mdadm --zero-superblock $dev8 $dev9 $dev10 $dev11 $dev12 $dev13 + +mdadm -CR $container -e ddf -l container -n 2 $dev11 $dev12 +#$dir/mdadm -CR $member0 -l raid1 -n 2 $container -z 10000 >/tmp/mdmon.txt 2>&1 +mdadm -CR $member0 -l raid1 -n 2 $container -z 10000 +check wait +fail0=$dev11 +mdadm --fail $member0 $fail0 & + +# The test can succeed two ways: +# 1) mdadm -C member1 fails - in this case the meta data +# was already on disk when the create attempt was made +# 2) mdadm -C succeeds in the first place (meta data not on disk yet), +# but mdmon detects the problem and sets the disk faulty. + +if mdadm -CR $member1 -l raid1 -n 2 $container; then + + echo create should have failed / race condition? 
+ + check wait + set -- $(get_raiddisks $member0) + d0=$1 + ret=0 + if [ $1 = $fail0 -o $2 = $fail0 ]; then + ret=1 + else + set -- $(get_raiddisks $member1) + if [ $1 = $fail0 -o $2 = $fail0 ]; then + ret=1 + fi + fi + if [ $ret -eq 1 ]; then + echo ERROR: failed disk $fail0 is still a RAID member + echo $member0: $(get_raiddisks $member0) + echo $member1: $(get_raiddisks $member1) + fi + tmp=$(mktemp /tmp/mdest-XXXXXX) + mdadm -E $d0 >$tmp + if [ x$(grep -c 'state\[[01]\] : Degraded' $tmp) != x2 ]; then + echo ERROR: non-degraded array found + mdadm -E $d0 + ret=1 + fi + if ! grep -q '^ *0 *[0-9a-f]\{8\} .*Offline, Failed' $tmp; then + echo ERROR: disk 0 not marked as failed in meta data + mdadm -E $d0 + ret=1 + fi + rm -f $tmp +else + ret=0 +fi + +[ -f /tmp/mdmon.txt ] && { + cat /tmp/mdmon.txt + rm -f /tmp/mdmon.txt +} + +[ $ret -eq 0 ] + diff --git a/tests/10ddf-fail-readd b/tests/10ddf-fail-readd new file mode 100644 index 0000000..9cd7893 --- /dev/null +++ b/tests/10ddf-fail-readd @@ -0,0 +1,55 @@ +# Simple fail / re-add test +. tests/env-ddf-template + +tmp=$(mktemp /tmp/mdtest-XXXXXX) +rm -f $tmp + +mdadm --zero-superblock $dev8 $dev9 +mdadm -CR $container -e ddf -l container -n 2 $dev8 $dev9 + +mdadm -CR $member0 -l raid1 -n 2 $container +#$dir/mdadm -CR $member0 -l raid1 -n 2 $container >/tmp/mdmon.txt 2>&1 + +mke2fs -F $member0 +check wait + +set -- $(get_raiddisks $member0) +fail0=$1 +mdadm $member0 --fail $fail0 + +sleep 1 +mdadm $container --remove $fail0 + +set -- $(get_raiddisks $member0) +case $1 in MISSING) shift;; esac +good0=$1 + +# We re-add the disk now +mdadm $container --add $fail0 + +sleep 1 +mdadm --wait $member0 || true + +ret=0 +set -- $(get_raiddisks $member0) +case $1:$2 in + $dev8:$dev9|$dev9:$dev8);; + *) echo ERROR: bad raid disks "$@"; ret=1;; +esac + +mdadm -Ss +for x in $@; do + mdadm -E $x >$tmp + if ! 
grep -q 'state\[0\] : Optimal, Consistent' $tmp; then + echo ERROR: member 0 should be optimal in meta data on $x + ret=1 + fi +done + +rm -f $tmp +if [ $ret -ne 0 ]; then + mdadm -E $dev8 + mdadm -E $dev9 +fi + +[ $ret -eq 0 ] diff --git a/tests/10ddf-fail-readd-readonly b/tests/10ddf-fail-readd-readonly new file mode 100644 index 0000000..6a74d9c --- /dev/null +++ b/tests/10ddf-fail-readd-readonly @@ -0,0 +1,71 @@ +# Simple fail / re-add test +. tests/env-ddf-template + +tmp=$(mktemp /tmp/mdtest-XXXXXX) +rm -f $tmp + +mdadm --zero-superblock $dev8 $dev9 +mdadm -CR $container -e ddf -l container -n 2 $dev8 $dev9 + +mdadm -CR $member0 -l raid1 -n 2 $container +#$dir/mdadm -CR $member0 -l raid1 -n 2 $container >/tmp/mdmon.txt 2>&1 + +check wait + +set -- $(get_raiddisks $member0) +fail0=$1 +mdadm $member0 --fail $fail0 + +sleep 1 +set -- $(get_raiddisks $member0) +case $1 in MISSING) shift;; esac +good0=$1 + +# Check that the meta data now show one disk as failed +ret=0 +for x in $@; do + mdadm -E $x >$tmp + if ! grep -q 'state\[0\] : Degraded, Consistent' $tmp; then + echo ERROR: member 0 should be degraded in meta data on $x + ret=1 + fi + phys=$(grep $x $tmp) + case $x:$phys in + $fail0:*active/Offline,\ Failed);; + $good0:*active/Online);; + *) echo ERROR: wrong phys disk state for $x + ret=1 + ;; + esac +done + +mdadm $container --remove $fail0 + +# We re-add the disk now +mdadm $container --add $fail0 + +sleep 1 +mdadm --wait $member0 || true + +set -- $(get_raiddisks $member0) +case $1:$2 in + $dev8:$dev9|$dev9:$dev8);; + *) echo ERROR: bad raid disks "$@"; ret=1;; +esac + +mdadm -Ss +for x in $@; do + mdadm -E $x >$tmp + if ! 
grep -q 'state\[0\] : Optimal, Consistent' $tmp; then + echo ERROR: member 0 should be optimal in meta data on $x + ret=1 + fi +done + +rm -f $tmp +if [ $ret -ne 0 ]; then + mdadm -E $dev8 + mdadm -E $dev9 +fi + +[ $ret -eq 0 ] diff --git a/tests/10ddf-fail-spare b/tests/10ddf-fail-spare new file mode 100644 index 0000000..ab737ca --- /dev/null +++ b/tests/10ddf-fail-spare @@ -0,0 +1,86 @@ +# Test suggested by Albert Pauw: Create, fail one disk, have mdmon +# activate the spare, +# then run create again. Shouldn't use the failed disk for Create, +. tests/env-ddf-template + +tmp=$(mktemp /tmp/mdtest-XXXXXX) +rm -f $tmp + +mdadm --zero-superblock $dev8 $dev9 $dev10 $dev11 $dev12 $dev13 +mdadm -CR $container -e ddf -l container -n 5 $dev8 $dev9 $dev10 $dev11 $dev12 + +mdadm -CR $member0 -l raid1 -n 2 $container +#$dir/mdadm -CR $member0 -l raid1 -n 2 $container >/tmp/mdmon.txt 2>&1 + +check wait + +set -- $(get_raiddisks $member0) +fail0=$1 +mdadm --fail $member0 $fail0 + +# To make sure the spare is activated, we may have to sleep +# 2s has always been enough for me +sleep 2 +check wait + +# This test can succeed both ways - if spare was activated +# before new array was created, we see only member 0. +# otherwise, we see both, and member0 is degraded because the +# new array grabbed the spare +# which case occurs depends on the sleep time above. +ret=0 +if mdadm -CR $member1 -l raid5 -n 3 $container; then + # Creation successful - must have been quicker than spare activation + + check wait + set -- $(get_raiddisks $member1) + if [ $1 = $fail0 -o $2 = $fail0 -o $3 = $fail0 ]; then + echo ERROR: $member1 must not contain $fail0: $@ + ret=1 + fi + d1=$1 + mdadm -E $d1 >$tmp + if ! 
grep -q 'state\[1\] : Optimal, Consistent' $tmp; then + echo ERROR: member 1 should be optimal in meta data + ret=1 + fi + state0=Degraded +else + # Creation unsuccessful - spare was used for member 0 + state0=Optimal +fi + +# need to delay a little bit, sometimes the meta data aren't +# up-to-date yet +sleep 0.5 +set -- $(get_raiddisks $member0) +if [ $1 = $fail0 -o $2 = $fail0 ]; then + echo ERROR: $member0 must not contain $fail0: $@ + ret=1 +fi +d0=$1 + +[ -f $tmp ] || mdadm -E $d0 >$tmp + +if ! grep -q 'state\[0\] : '$state0', Consistent' $tmp; then + echo ERROR: member 0 should be $state0 in meta data + ret=1 +fi +if ! grep -q 'Offline, Failed' $tmp; then + echo ERROR: Failed disk expected in meta data + ret=1 +fi +if [ $ret -eq 1 ]; then + cat /proc/mdstat + mdadm -E $d0 + mdadm -E $d1 + mdadm -E $fail0 +fi + +[ -f /tmp/mdmon.txt ] && { + cat /tmp/mdmon.txt + rm -f /tmp/mdmon.txt +} + +rm -f $tmp +[ $ret -eq 0 ] diff --git a/tests/10ddf-fail-stop-readd b/tests/10ddf-fail-stop-readd new file mode 100644 index 0000000..f8ebe17 --- /dev/null +++ b/tests/10ddf-fail-stop-readd @@ -0,0 +1,66 @@ +# Simple fail / re-add test +. 
tests/env-ddf-template + +tmp=$(mktemp /tmp/mdtest-XXXXXX) +rm -f $tmp + +mdadm --zero-superblock $dev8 $dev9 +mdadm -CR $container -e ddf -l container -n 2 $dev8 $dev9 + +mdadm -CR $member0 -l raid1 -n 2 $container +#$dir/mdadm -CR $member0 -l raid1 -n 2 $container >/tmp/mdmon.txt 2>&1 + +# Write to the array +mke2fs -F $member0 +check wait + +set -- $(get_raiddisks $member0) +fail0=$1 +mdadm $member0 --fail $fail0 + +sleep 1 +mdadm $container --remove $fail0 + +set -- $(get_raiddisks $member0) +case $1 in MISSING) shift;; esac +good0=$1 + +mdadm -Ss + +sleep 1 +# Now simulate incremental assembly +mdadm -I $good0 +mdadm -IRs || true + +# Write to the array +mke2fs -F $member0 + +# We re-add the disk now +mdadm $container --add $fail0 + +sleep 1 +mdadm --wait $member0 || true + +ret=0 +set -- $(get_raiddisks $member0) +case $1:$2 in + $dev8:$dev9|$dev9:$dev8);; + *) echo ERROR: bad raid disks "$@"; ret=1;; +esac + +mdadm -Ss +for x in $@; do + mdadm -E $x >$tmp + if ! grep -q 'state\[0\] : Optimal, Consistent' $tmp; then + echo ERROR: member 0 should be optimal in meta data on $x + ret=1 + fi +done + +rm -f $tmp +if [ $ret -ne 0 ]; then + mdadm -E $dev8 + mdadm -E $dev9 +fi + +[ $ret -eq 0 ] diff --git a/tests/10ddf-fail-twice b/tests/10ddf-fail-twice new file mode 100644 index 0000000..6af1943 --- /dev/null +++ b/tests/10ddf-fail-twice @@ -0,0 +1,59 @@ +. 
tests/env-ddf-template + +num_disks=5 +mdadm -CR $container -e ddf -n $num_disks $dev8 $dev9 $dev10 $dev11 $dev12 +ddf_check container $num_disks + +mdadm -CR $member0 -n 2 -l 1 $container +mdadm -CR $member1 -n 3 -l 5 $container + +mdadm --wait $member1 $member0 || mdadm --wait $member1 $member0 || true + +set -- $(get_raiddisks $member0) +fail0=$1 +mdadm $member0 --fail $fail0 +set -- $(get_raiddisks $member1) +fail1=$1 +mdadm $member1 --fail $fail1 + +mdadm $container --add $dev13 + +mdadm --wait $member1 $member0 || mdadm --wait $member1 $member0 || true + + +devs0="$(get_raiddisks $member0)" +devs1="$(get_raiddisks $member1)" + +present=$(($(get_present $member0) + $(get_present $member1))) +[ $present -eq 4 ] || { + echo expected 4 present disks, got $present + echo devices for $member0: $devs0 + echo devices for $member1: $devs1 + exit 1 +} + +if echo "$devs0" | grep -q MISSING; then + good=1 + bad=0 +else + good=0 + bad=1 +fi + +# find a good device +eval "set -- \$devs$good" +check=$1 + +tmp=$(mktemp /tmp/mdtest-XXXXXX) +mdadm -E $check >$tmp + +{ grep -q 'state\['$bad'\] : Degraded, Consistent' $tmp && + grep -q 'state\['$good'\] : Optimal, Consistent' $tmp; } || { + echo unexpected meta data state on $check + mdadm -E $check + rm -f $tmp + exit 1 +} + +rm -f $tmp +exit 0 diff --git a/tests/10ddf-fail-two-spares b/tests/10ddf-fail-two-spares new file mode 100644 index 0000000..e00810d --- /dev/null +++ b/tests/10ddf-fail-two-spares @@ -0,0 +1,86 @@ +# Simulate two disks failing shortly after each other +. 
tests/env-ddf-template +sda=$(get_rootdev) || exit 1 +tmp=$(mktemp /tmp/mdtest-XXXXXX) + +mdadm --zero-superblock $dev8 $dev9 $dev10 $dev11 $dev12 $dev13 +mdadm -CR $container -e ddf -l container -n 6 \ + $dev8 $dev9 $dev10 $dev11 $dev12 $dev13 + +#fast_sync + +mdadm -CR $member0 -l raid6 -n 4 $dev10 $dev11 $dev12 $dev13 -z 16384 +#$dir/mdadm -CR $member0 -l raid6 -n 4 $dev10 $dev11 $dev12 $dev13 -z 16384 \ +# >/tmp/mdmon.txt 2>&1 +mdadm -CR $member1 -l raid10 -n 4 $dev10 $dev11 $dev12 $dev13 -z 16384 + +dd if=$sda of=$member0 bs=1M count=32 +dd if=$sda of=$member1 bs=1M skip=16 count=16 + +check wait + +sum0=$(sha1sum $member0) +sum1=$(sha1sum $member1) + +mdadm --fail $member1 $dev11 +sleep 1 +mdadm --fail $member1 $dev12 + +# We will have 4 resync procedures, 2 spares for 2 arrays. +mdadm --wait $member1 $member0 || true +mdadm --wait $member1 $member0 || true + +devs0="$(get_raiddisks $member0)" +devs1="$(get_raiddisks $member1)" +expected="$dev10 +$dev13 +$dev8 +$dev9" + +ret=0 +if [ "$(echo "$devs0" | sort)" != "$expected" \ + -o "$(echo "$devs1" | sort)" != "$expected" ]; then + echo ERROR: unexpected members + echo $member0: $devs0 + echo $member1: $devs1 + ret=1 +fi + +mdadm -E $dev10 >$tmp +if ! grep -q 'state\[0\] : Optimal, Consistent' $tmp; then + echo ERROR: $member0 should be optimal in meta data + ret=1 +fi +if ! 
grep -q 'state\[1\] : Optimal, Consistent' $tmp; then + echo ERROR: $member1 should be optimal in meta data + ret=1 +fi +if [ x"$(grep -c active/Online $tmp)" != x4 ]; then + echo ERROR: expected 4 online disks + ret=1 +fi +if [ x"$(grep -c "Offline, Failed" $tmp)" != x2 ]; then + echo ERROR: expected 2 failed disks + ret=1 +fi + +sum0a=$(sha1sum $member0) +sum1a=$(sha1sum $member1) + +if [ "$sum0" != "$sum0a" -o "$sum1" != "$sum1a" ]; then + echo ERROR: checksum mismatch + ret=1 +fi + +if [ $ret -eq 1 ]; then + cat /proc/mdstat + cat $tmp +fi + +[ -f /tmp/mdmon.txt ] && { + cat /tmp/mdmon.txt + rm -f /tmp/mdmon.txt +} +rm -f $tmp + +[ $ret -eq 0 ] diff --git a/tests/10ddf-geometry b/tests/10ddf-geometry new file mode 100644 index 0000000..b0cce2f --- /dev/null +++ b/tests/10ddf-geometry @@ -0,0 +1,82 @@ +# +# Test various RAID geometries, creation and deletion of subarrays +# + +assert_fail() { + if mdadm "$@"; then + echo mdadm "$@" must fail + return 1 + else + return 0 + fi +} + +assert_kill() { + local dev=$1 n=$2 + mdadm -S $dev + mdadm --kill-subarray=$n /dev/md/ddf0 + if mdadm -Dbs | grep -q $dev; then + echo >&2 $dev should be deleted + return 1 + fi + return 0 +} + +set -e +mdadm -CR /dev/md/ddf0 -e ddf -n 6 $dev8 $dev9 $dev10 $dev11 $dev12 $dev13 + +# RAID1 geometries +# Use different sizes to make offset calculation harder +mdadm -CR l1s -l1 -n2 /dev/md/ddf0 -z 8000 +mdadm -CR l1m -l1 -n3 $dev8 $dev9 $dev10 -z 10000 +assert_fail -CR badl1 -l1 -n4 /dev/md/ddf0 + +# RAID10 geometries +mdadm -CR l10_0 -l10 -n3 /dev/md/ddf0 -z 1000 +mdadm -CR l10_1 -l10 -n5 /dev/md/ddf0 -z 1000 +assert_fail mdadm -CR badl10 -l10 -n4 -pn3 /dev/md/ddf0 +mdadm -CR l10_2 -l10 -n6 -pn2 /dev/md/ddf0 -z 4000 +mdadm -CR l10_3 -l10 -n6 -pn3 /dev/md/ddf0 -z 4000 + +assert_fail -CR l10_2 -l10 -n6 -pn2 /dev/md/ddf0 -z 5000 +assert_kill /dev/md/l10_2 4 +# gone now, must be able to create it again +mdadm -CR l10_2 -l10 -n6 -pn2 /dev/md/ddf0 -z 5000 + +# Now stop and reassemble +mdadm -Ss 
+mdadm -A /dev/md/ddf0 $dev8 $dev9 $dev10 $dev11 $dev12 $dev13 + +# Same as above, on inactive container +assert_fail -CR l10_3 -l10 -n6 -pn2 /dev/md/ddf0 -z 5000 +# Kill subarray without having started anything (no mdmon) +mdadm --kill-subarray=5 /dev/md/ddf0 +mdadm -I /dev/md/ddf0 +mdadm -CR l10_3 -l10 -n6 -pn3 /dev/md/ddf0 -z 5000 + +assert_kill /dev/md/l10_2 4 +assert_kill /dev/md/l10_3 5 + +# RAID5 geometries +mdadm -CR l5la -l5 -n3 --layout=ddf-N-restart /dev/md/ddf0 -z 5000 +mdadm -CR l5ra -l5 -n3 --layout=ddf-zero-restart /dev/md/ddf0 -z 5000 +mdadm -CR l5ls -l5 -n3 --layout=ddf-N-continue /dev/md/ddf0 -z 5000 +assert_fail -CR l5rs -l5 -n3 -prs /dev/md/ddf0 -z 5000 + +# Stop and reassemble +mdadm -Ss +mdadm -A /dev/md/ddf0 $dev8 $dev9 $dev10 $dev11 $dev12 $dev13 +mdadm -I /dev/md/ddf0 + +assert_kill /dev/md/l5la 4 +assert_kill /dev/md/l5ls 6 +assert_kill /dev/md/l5ra 5 + +# RAID6 geometries +assert_fail -CR l6la -l6 -n3 -pla /dev/md/ddf0 -z 5000 +assert_fail -CR l6rs -l5 -n4 -prs /dev/md/ddf0 -z 5000 +mdadm -CR l6la -l6 -n4 --layout=ddf-N-restart /dev/md/ddf0 -z 5000 +mdadm -CR l6ra -l6 -n4 --layout=ddf-zero-restart $dev8 $dev9 $dev10 $dev11 -z 5000 +mdadm -CR l6ls -l6 -n4 --layout=ddf-N-continue $dev13 $dev8 $dev9 $dev12 -z 5000 + +mdadm -Ss diff --git a/tests/10ddf-incremental-wrong-order b/tests/10ddf-incremental-wrong-order new file mode 100644 index 0000000..9ecf6bc --- /dev/null +++ b/tests/10ddf-incremental-wrong-order @@ -0,0 +1,131 @@ +# An array is assembled incompletely. Some disks will +# have later metadata than others. +# The array is then reassembled in the "wrong" order - +# older meta data first. +# This FAILS with mdadm 3.3 +. 
tests/env-ddf-template +tmp=$(mktemp /tmp/mdtest-XXXXXX) +rm -f $tmp /var/tmp/mdmon.log +ret=0 + +mdadm -CR $container -e ddf -n 4 $dev8 $dev9 $dev10 $dev11 +ddf_check container 4 + +mdadm -CR $member1 -n 4 -l 10 $dev8 $dev10 $dev9 $dev11 -z 10000 +mdadm -CR $member0 -n 2 -l 1 $dev8 $dev9 -z 10000 + +mdadm --wait $member0 || true +mdadm --wait $member1 || true + +mke2fs -F $member0 +mke2fs -F $member1 +sha_0a=$(sha1_sum $member0) +sha_1a=$(sha1_sum $member1) + +mdadm -Ss +sleep 1 + +# Add all devices except those for $member0 +mdadm -I $dev10 +mdadm -I $dev11 + +# Start runnable members ($member1) and write +mdadm -IRs || true +e2fsck -fy $member1 +sha_1b=$(sha1_sum $member1) + +mdadm -Ss +sleep 1 + +# Seq number should be different now +seq8a=$(mdadm -E $dev8 | sed -n 's/^ *Seq : //p') +seq10a=$(mdadm -E $dev10 | sed -n 's/^ *Seq : //p') + +if [ $seq8a -ge $seq10a ]; then + ret=1 + echo ERROR: sequential number of $dev10 not bigger than $dev8 +fi +if [ x$sha_1a = x$sha_1b ]; then + ret=1 + echo ERROR: sha1sums equal after write +fi + +#[ -f /var/tmp/mdmon.log ] && cat /var/tmp/mdmon.log + +# Now reassemble +# Note that we add the previously missing disks first. 
+# $dev10 should have a higher seq number than $dev8 +for d in $dev8 $dev9 $dev10 $dev11; do + mdadm -I $d +done + +mdadm -IRs || true +sha_0c=$(sha1_sum $member0) +sha_1c=$(sha1_sum $member1) + +mdadm -Ss +sleep 1 + +seq8c=$(mdadm -E $dev8 | sed -n 's/^ *Seq : //p') +seq10c=$(mdadm -E $dev10 | sed -n 's/^ *Seq : //p') + +if [ x$sha_0a != x$sha_0c ]; then + ret=1 + echo ERROR: sha1sum of $member0 has changed +fi +if [ x$sha_1b != x$sha_1c ]; then + ret=1 + echo ERROR: sha1sum of $member1 has changed +fi +if [ \( $seq10a -ge $seq10c \) -o \( $seq8c -ne $seq10c \) ]; then + ret=1 + echo ERROR: sequential numbers are wrong +fi + +# Expect consistent state +for d in $dev10 $dev8; do + mdadm -E $d>$tmp + for x in 0 1; do + egrep 'state\['$x'\] : Optimal, Consistent' $tmp || { + ret=1 + echo ERROR: $member0 has unexpected state on $d + } + done + if [ x$(egrep -c 'active/Online$' $tmp) != x4 ]; then + ret=1 + echo ERROR: unexpected number of online disks on $d + fi +done + +# Now try assembly +if mdadm -A $container $dev8 $dev9 $dev10 $dev11; then + mdadm -IR $container + sha_0d=$(sha1_sum $member0) + sha_1d=$(sha1_sum $member1) + mdadm -Ss + sleep 1 + seq8d=$(mdadm -E $dev8 | sed -n 's/^ *Seq : //p') + seq10d=$(mdadm -E $dev10 | sed -n 's/^ *Seq : //p') + if [ x$sha_0a != x$sha_0d ]; then + ret=1 + echo ERROR: sha1sum of $member0 has changed + fi + if [ x$sha_1b != x$sha_1d ]; then + ret=1 + echo ERROR: sha1sum of $member1 has changed + fi + if [ \( $seq10a -ge $seq10d \) -o \( $seq8d -ne $seq10d \) ]; then + ret=1 + echo ERROR: sequential numbers are wrong + fi +else + ret=1 + echo ERROR: assembly failed +fi + +if [ $ret -ne 0 ]; then + mdadm -E $dev10 + mdadm -E $dev8 +fi +rm -f $tmp /var/tmp/mdmon.log +[ $ret -eq 0 ] diff --git a/tests/10ddf-sudden-degraded b/tests/10ddf-sudden-degraded new file mode 100644 index 0000000..dc692ae --- /dev/null +++ b/tests/10ddf-sudden-degraded @@ -0,0 +1,18 @@ +# +# An array is assembled with one device missing. 
+# The other device must be marked as Failed in metadata + +. tests/env-ddf-template + +mdadm -CR $container -e ddf -n 2 $dev8 $dev9 +ddf_check container 2 + +mdadm -CR $member1 -n 2 -l1 $dev8 $dev9 +mdadm --wait $member1 || true +mdadm -Ss + +mdadm -I $dev8 +mdadm -R $container +mkfs $member1 +# There must be a missing device recorded +mdadm --examine $dev8 | grep 'Raid Devices.*--' || exit 1 diff --git a/tests/11spare-migration b/tests/11spare-migration new file mode 100644 index 0000000..24b6ec6 --- /dev/null +++ b/tests/11spare-migration @@ -0,0 +1,454 @@ +# Set of tests for autorebuild functionality using mdadm -F +# To be able to test ddf one must have all loop devices of bigger size, with the ones +# above number 7 bigger again by any amount (this is not changed for now as it +# could affect other tests) + +export IMSM_DEVNAME_AS_SERIAL=1 +export IMSM_TEST_OROM=1 +export IMSM_NO_PLATFORM=1 + +. tests/utils +set -ex +verbose="yes" +sleeptime=10 + +# if listfailed=yes then don't exit if test failed due to wrong +# spare-migration and just print a list at the end. Other errors still +# stop the test. +# if listfailed=no then exit on first failure +listfailed="yes" + +# start Monitor, set monitorpid +# uses global scan variable +# all parameters are numbers of devices to be monitored. only used when $scan="no" +# eg. monitor 0 1 will start monitoring of containers c0, c1 and subarrays v0, v1 +monitor(){ + [ -z $monitorpid ] || return + if [ "$scan" == "yes" ]; then + $mdadm -F -d 1 --scan --mail root@localhost -c $config & + monitorpid=$! + return + fi + unset mddevs + while [ -n "$1" ] + do + eval container=\$c$1 + eval volumes=\$v$1 + mddevs="$mddevs /dev/$container" + if [ "$container" != "$volumes" ]; then + for vol in $volumes; do + mddevs="$mddevs /dev/$vol" + done + fi + shift + done + if [ -n "$mddevs" ]; then + if [ "$verbose" != "yes" ]; then + $mdadm -F -d 1 $mddevs -c $config >&2 & + monitorpid=$! 
+ else + $mdadm -F -t -d 1 $mddevs -c $config & + monitorpid=$! + fi + fi + [ "$verbose" != "yes" ] || echo $mddevs $monitorpid +} + +test0() +{ +dsc "Test 0: No config file, no spare should be moved" +> $config +setupdevs 0 0 1 $platform +setupdevs 1 3 4 $platform +monitor 0 1 +mdadm -a /dev/$c1 $dev2 +mdadm --fail /dev/$v0 $dev0 +# check that spare loop2 was not moved from container c1 to container c0 +chksparemoved $c1 $c0 $dev2 n +tidyup +} + +test0a() +{ +dsc "Test 0a: No domains in config file, no spare should be moved" +setupdevs 0 0 1 $platform +setupdevs 1 3 4 $platform +createconfig a +monitor 0 1 +mdadm -a /dev/$c1 $dev2 +mdadm --fail /dev/$v0 $dev0 +# check that spare loop2 was not moved from container c1 to container c0 +chksparemoved $c1 $c0 $dev2 n +tidyup +} + +test1() +{ +dsc "Test 1: Common domain, add disk to one container and fail first one in another container, spare should be moved" +setupdevs 0 0 1 $platform +setupdevs 1 3 4 $platform +# create config file with arrays and common domain +createconfig a +createconfig domain-$platform $platform spare 0 1 2 3 4 +monitor 0 1 +mdadm -a /dev/$c1 $dev2 +mdadm --fail /dev/$v0 $dev0 +# check that spare loop2 was moved from container c1 to container c0 +chksparemoved $c1 $c0 $dev2 +tidyup +} + +test1a() +{ +dsc "Test 1a: Common domain, add disk to one container and fail second one in another container, spare should be moved" +setupdevs 0 0 1 $platform +setupdevs 1 3 4 $platform +createconfig a +createconfig domain-$platform $platform spare 0 1 2 3 4 +monitor 0 1 +mdadm -a /dev/$c1 $dev2 +mdadm --fail /dev/$v0 $dev1 +# check that spare loop2 was moved from container c1 to container c0 +chksparemoved $c1 $c0 $dev2 +tidyup +} + +test2() +{ +dsc "Test 2: Common domain, fail disk in one container and add one to another container, spare should be moved" +setupdevs 0 0 1 $platform +setupdevs 1 3 4 $platform +createconfig a +createconfig domain-$platform $platform spare 0 1 2 3 4 +monitor 0 1 +mdadm --fail 
/dev/$v0 $dev1 +mdadm -a /dev/$c1 $dev2 +chksparemoved $c1 $c0 $dev2 +tidyup +} + +test3() +{ +dsc "Test 3: Two domains, fail a disk in one domain, add a disk to another domain, the spare should not be moved" +setupdevs 0 0 1 $platform +setupdevs 1 3 4 $platform +# create config file with 2 domains +createconfig a +createconfig domain-$platform"1" $platform spare 0 1 2 +createconfig domain-$platform"2" $platform spare 3 4 5 +monitor 0 1 +mdadm --fail /dev/$v0 $dev1 +mdadm -a /dev/$c1 $dev5 +chksparemoved $c1 $c0 $dev5 n +tidyup +} + +test4() +{ +dsc "Test 4: One domain holds one container, fail a disk in domain, and add disk to a container not described by domain, move if metadata allows" +setupdevs 0 0 1 $platform +setupdevs 1 3 4 $platform +createconfig a +createconfig domain-$platform $platform spare 0 1 2 +monitor 0 1 +mdadm --fail /dev/$v0 $dev1 +mdadm -a /dev/$c1 $dev5 +unset shouldmove +[ "$platform" == "imsm" ] || shouldmove="n" +chksparemoved $c1 $c0 $dev5 $shouldmove +tidyup +} + +test5() +{ +dsc "Test 5: Two domains, two containers in each domain" +setupdevs 0 0 1 $platform +setupdevs 1 3 4 $platform +setupdevs 2 5 6 $platform +setupdevs 3 8 10 $platform +# 2 and 9 for spares +createconfig a +createconfig domain-$platform"1" $platform spare 0 1 2 3 4 +createconfig domain-$platform"2" $platform spare 5 6 8 9 10 +monitor 0 1 2 3 +test5a +test5b +test5c +tidyup +} + +test5a() +{ +dsc "Test 5a: Two containers in each domain, add spare loop2 to domain1 and fail disk in the other domain, the spare should not be moved" +mdadm -a /dev/$c0 $dev2 +mdadm --fail /dev/$v2 $dev5 +chksparemoved $c0 $c2 $dev2 n +} + +test5b() +{ +dsc "Test 5b: Fail disk in the same domain but different container, spare loop2 should be moved" +mdadm --fail /dev/$v1 $dev3 +chksparemoved $c0 $c1 $dev2 +} + +test5c() +{ +dsc "Test 5c: Add spare loop9 to different container in domain with degraded array, spare should be moved" +mdadm -a /dev/$c3 $dev9 +chksparemoved $c3 $c2 $dev9 +} + 
+test6() +{ +dsc "Test 6: One domain has two containers, fail a disk in one container, there is a spare in other container too small to use for rebuild" +setupdevs 0 0 1 $platform +setupdevs 1 8 9 $platform +# all devices in one domain +createconfig a +createconfig domain-$platform $platform spare 0 1 2 8 9 +monitor 0 1 +mdadm -a /dev/$c0 $dev2 +mdadm --fail /dev/$v1 $dev8 +chksparemoved $c0 $c1 $dev2 n +tidyup +} + +test7() +{ +dsc "Test 7: One domain, add small spare to container, fail disk in array, spare not used, add suitable spare to other container, spare should be moved" +setupdevs 0 0 1 $platform +setupdevs 1 8 9 $platform +createconfig a +createconfig domain-$platform $platform spare 0 1 2 8 9 10 +monitor 0 1 +mdadm -a /dev/$c1 $dev2 +mdadm --fail /dev/$v1 $dev8 +mdadm -a /dev/$c0 $dev10 +chksparemoved $c0 $c1 $dev10 +tidyup +} + + +test7a() +{ +dsc "Test 7a: Small spare in parent, suitable one in other container, $dev2 in $c1 is not in common domain" +setupdevs 0 0 1 $platform +setupdevs 1 8 9 $platform +#all $platform devices in one domain +createconfig a +createconfig domain-$platform"1" $platform spare 0 1 8 9 10 +createconfig domain-$platform"2" $platform spare 2 +monitor 0 1 +mdadm -a /dev/$c1 $dev2 +chkspare $c1 $dev2 +mdadm --fail /dev/$v1 $dev8 +mdadm -a /dev/$c0 $dev10 +chksparemoved $c0 $c1 $dev10 +tidyup +} + +test8() +{ +# ddf does not have getinfo_super_disks implemented so skip this test +return +dsc "Test 8: imsm and ddf - spare should not be migrated" +setupdevs 0 10 11 imsm +setupdevs 1 8 9 ddf +createconfig a +createconfig domain0 noplatform spare 8 9 10 11 12 +monitor 0 1 +mdadm -a /dev/$c1 $dev12 +mdadm --fail /dev/$v0 $dev10 +chksparemoved $c1 $c0 $dev12 n +tidyup +} + +test9() +{ +dsc "Test 9: imsm and native 1.2 - one domain, no metadata specified, spare should be moved" +setupdevs 0 10 11 imsm +setupdevs 1 8 9 1.2 +createconfig a +createconfig domain0 noplatform spare 8 9 10 11 12 +monitor 0 1 +mdadm -a /dev/$c1 $dev12 +mdadm 
--fail /dev/$v0 $dev10 +chksparemoved $c1 $c0 $dev12 +tidyup +} + +test9a() +{ +dsc "Test 9a: imsm and native 1.2 - spare in global domain, should be moved" +setupdevs 0 10 11 imsm +setupdevs 1 8 9 1.2 +createconfig a +createconfig domain-global noplatform spare 8 9 10 11 12 +createconfig domain-1.2 1.2 spare 8 9 +createconfig domain-imsm imsm spare 10 11 +monitor 0 1 +mdadm -a /dev/$c1 $dev12 +mdadm --fail /dev/$v0 $dev10 +chksparemoved $c1 $c0 $dev12 +tidyup +} + +test10() +{ +dsc "Test 10: Two arrays on the same devices in container" +setupdevs 0 0 1 $platform 10000 +setupdevs 1 3 4 $platform +createconfig a +createconfig domain-$platform $platform spare 0 1 2 3 4 5 +monitor 0 1 +mdadm -a /dev/$c1 $dev2 +mdadm --fail /dev/md/sub0_ $dev0 +chksparemoved $c1 $c0 $dev2 +if [ $failed -eq 0 ]; then +# now fail the spare and see if we get another one + mdadm --fail /dev/md/sub0_ $dev2 + mdadm -a /dev/$c1 $dev5 + chksparemoved $c1 $c0 $dev5 +fi +tidyup +} + +test11() +{ +dsc "Test 11: Failed spare from other container should not be used" +setupdevs 0 0 1 $platform +setupdevs 1 3 4 $platform +createconfig a +createconfig domain-$platform $platform spare 0 1 2 3 4 +monitor 0 1 +mdadm -a /dev/$c1 $dev2 +mdadm --fail /dev/$v1 $dev3 +#wait until recovery finishes so no degraded array in c1 +check wait +mdadm --fail /dev/$v0 $dev0 +chksparemoved $c1 $c0 $dev3 n +tidyup +} + +test12() +{ +dsc "Test 12: Only one spare should be taken for rebuild, second not needed" +setupdevs 0 0 1 $platform +setupdevs 1 3 4 $platform +createconfig a +createconfig domain-$platform $platform spare 0 1 2 3 4 5 +monitor 0 1 +mdadm -a /dev/$c1 $dev2 +mdadm -a /dev/$c1 $dev5 +mdadm --fail /dev/$v0 $dev0 +sleep $sleeptime +chkarray $dev2 n +sc1=$c +chkarray $dev5 n +sc2=$c +[ "$sc1" != "$sc2" ] || err "both spares in the same container $sc1" +tidyup +} + +test13() +{ +dsc "Test 13: Common domain, two containers, fail a disk in container, action is below spare, the spare should be moved regadless of 
action" +setupdevs 0 0 1 $platform +setupdevs 1 4 5 $platform +# same domain but different action on 4 5 6; $dev6 is the spare expected to move +createconfig a +createconfig domain-$platform $platform spare 0 1 +createconfig domain-$platform $platform include 4 5 6 +monitor 0 1 +mdadm -a /dev/$c1 $dev6 +mdadm --fail /dev/$v0 $dev0 +chksparemoved $c1 $c0 $dev6 +tidyup +} + +test14() +{ +dsc "Test 14: One domain, small array on big disks, check if small spare is accepted" +setupdevs 0 8 9 $platform 10000 1 +setupdevs 1 0 1 $platform +createconfig a +createconfig domain-$platform $platform spare 0 1 2 8 9 +monitor 0 1 +mdadm -a /dev/$c1 $dev2 +mdadm --fail /dev/$v0 $dev9 +chksparemoved $c1 $c0 $dev2 +tidyup +} + +test15() +{ +dsc "Test 15: spare in global domain for $platform metadata, should be moved" +# this is like 9a but only one metadata used +setupdevs 0 10 11 $platform +setupdevs 1 8 9 $platform +createconfig a +createconfig domain-global $platform spare 8 9 10 11 12 +createconfig domain-1 $platform spare 8 9 +createconfig domain-2 $platform spare 10 11 +monitor 0 1 +mdadm -a /dev/$c1 $dev12 +mdadm --fail /dev/$v0 $dev10 +chksparemoved $c1 $c0 $dev12 +tidyup +} + +try() +{ +test0 +test0a +test1 +test1a +test2 +test3 +test4 +test5 +test6 +if [ "$platform" != "1.2" ]; then +# this is because we can't have a small spare added to native array + test7 + test7a +fi +test8 +test9 +test9a +if [ "$platform" != "1.2" ]; then +# we can't create two subarrays on the same devices for native (without +# partitions) + test10 +fi +test11 +test12 +test13 +test14 +test15 +} + +try_failed() +{ +platform="1.2" +scan="no" +test5 +test9 +test13 +scan="yes" +test9 +} + +#try_failed + +for scan in no yes; do + for platform in 1.2 imsm; do + try + done +done + +[ $listfailed == "no" ] || [ -z "$flist" ] || echo -e "\n FAILED TESTS: $flist" + +#cat $targetdir/log +rm -f /dev/disk/by-path/loop* diff --git a/tests/12imsm-r0_2d-grow-r0_3d b/tests/12imsm-r0_2d-grow-r0_3d new file mode 100644 index 0000000..3c6cf74 --- /dev/null +++ 
b/tests/12imsm-r0_2d-grow-r0_3d @@ -0,0 +1,20 @@ +. tests/env-imsm-template + +# RAID 0 volume, 2 disks grow to RAID 0 volume, 3 disks +# POSITIVE test + +num_disks=2 +device_list="$dev0 $dev1" +spare_list="$dev2" + +# Before: RAID 0 volume, 2 disks, 64k chunk size +vol0_level=0 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$num_disks +vol0_offset=0 + +# After: RAID 0 volume, 3 disks, 64k chunk size +vol0_new_num_comps=$((num_disks + 1)) + +. tests/imsm-grow-template 0 0 diff --git a/tests/12imsm-r0_2d-grow-r0_4d b/tests/12imsm-r0_2d-grow-r0_4d new file mode 100644 index 0000000..e4fccda --- /dev/null +++ b/tests/12imsm-r0_2d-grow-r0_4d @@ -0,0 +1,20 @@ +. tests/env-imsm-template + +# RAID 0 volume, 2 disks grow to RAID 0 volume, 4 disks +# POSITIVE test + +num_disks=2 +device_list="$dev0 $dev1" +spare_list="$dev2 $dev3" + +# Before: RAID 0 volume, 2 disks, 64k chunk size +vol0_level=0 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$num_disks +vol0_offset=0 + +# After: RAID 0 volume, 4 disks, 64k chunk size +vol0_new_num_comps=$((num_disks + 2)) + +. tests/imsm-grow-template 0 0 diff --git a/tests/12imsm-r0_2d-grow-r0_5d b/tests/12imsm-r0_2d-grow-r0_5d new file mode 100644 index 0000000..388a5bb --- /dev/null +++ b/tests/12imsm-r0_2d-grow-r0_5d @@ -0,0 +1,20 @@ +. tests/env-imsm-template + +# RAID 0 volume, 2 disks grow to RAID 0 volume, 5 disks +# POSITIVE test + +num_disks=2 +device_list="$dev0 $dev1" +spare_list="$dev2 $dev3 $dev4" + +# Before: RAID 0 volume, 2 disks, 64k chunk size +vol0_level=0 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$num_disks +vol0_offset=0 + +# After: RAID 0 volume, 5 disks, 64k chunk size +vol0_new_num_comps=$((num_disks + 3)) + +. tests/imsm-grow-template 0 0 diff --git a/tests/12imsm-r0_3d-grow-r0_4d b/tests/12imsm-r0_3d-grow-r0_4d new file mode 100644 index 0000000..7065f07 --- /dev/null +++ b/tests/12imsm-r0_3d-grow-r0_4d @@ -0,0 +1,20 @@ +. 
tests/env-imsm-template + +# RAID 0 volume, 3 disks grow to RAID 0 volume, 4 disks +# POSITIVE test + +num_disks=3 +device_list="$dev0 $dev1 $dev2" +spare_list="$dev3" + +# Before: RAID 0 volume, 3 disks, 64k chunk size +vol0_level=0 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$num_disks +vol0_offset=0 + +# After: RAID 0 volume, 4 disks, 64k chunk size +vol0_new_num_comps=$((num_disks + 1)) + +. tests/imsm-grow-template 0 0 diff --git a/tests/12imsm-r5_3d-grow-r5_4d b/tests/12imsm-r5_3d-grow-r5_4d new file mode 100644 index 0000000..097da0a --- /dev/null +++ b/tests/12imsm-r5_3d-grow-r5_4d @@ -0,0 +1,20 @@ +. tests/env-imsm-template + +# RAID 5 volume, 3 disks grow to RAID 5 volume, 4 disks +# POSITIVE test + +num_disks=3 +device_list="$dev0 $dev1 $dev2" +spare_list="$dev3" + +# Before: RAID 5 volume, 3 disks, 64k chunk size +vol0_level=5 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$((num_disks - 1)) +vol0_offset=0 + +# After: RAID 5 volume, 4 disks, 64k chunk size +vol0_new_num_comps=$num_disks + +. tests/imsm-grow-template 0 0 diff --git a/tests/12imsm-r5_3d-grow-r5_5d b/tests/12imsm-r5_3d-grow-r5_5d new file mode 100644 index 0000000..2e5c7d2 --- /dev/null +++ b/tests/12imsm-r5_3d-grow-r5_5d @@ -0,0 +1,20 @@ +. tests/env-imsm-template + +# RAID 5 volume, 3 disks grow to RAID 5 volume, 5 disks +# POSITIVE test + +num_disks=3 +device_list="$dev0 $dev1 $dev2" +spare_list="$dev3 $dev4" + +# Before: RAID 5 volume, 3 disks, 64k chunk size +vol0_level=5 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$((num_disks - 1)) +vol0_offset=0 + +# After: RAID 5 volume, 5 disks, 64k chunk size +vol0_new_num_comps=$((num_disks + 1)) + +. tests/imsm-grow-template 0 0 diff --git a/tests/13imsm-r0_r0_2d-grow-r0_r0_4d b/tests/13imsm-r0_r0_2d-grow-r0_r0_4d new file mode 100644 index 0000000..66ceeb3 --- /dev/null +++ b/tests/13imsm-r0_r0_2d-grow-r0_r0_4d @@ -0,0 +1,29 @@ +. 
tests/env-imsm-template + +# Grow the container (arrays inside) from 2 disks to 4 disks +# POSITIVE test + +num_disks=2 +device_list="$dev0 $dev1" +spare_list="$dev2 $dev3" + +# Before: RAID 0 volume in slot #0, 2 disks, 128k chunk size +# RAID 0 volume in slot #1, 2 disks, 64k chunk size +vol0_level=0 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=128 +vol0_num_comps=$num_disks +vol0_offset=0 + +vol1_level=0 +vol1_comp_size=$((5 * 1024)) +vol1_chunk=64 +vol1_num_comps=$num_disks +vol1_offset=$((vol0_comp_size + 4096)) + +# After: RAID 0 volume in slot #0, 4 disks, 128k chunk size +# RAID 0 volume in slot #1, 4 disks, 64k chunk size +vol0_new_num_comps=$((num_disks + 2)) +vol1_new_num_comps=$vol0_new_num_comps + +. tests/imsm-grow-template 0 0 diff --git a/tests/13imsm-r0_r0_2d-grow-r0_r0_5d b/tests/13imsm-r0_r0_2d-grow-r0_r0_5d new file mode 100644 index 0000000..0da9ef3 --- /dev/null +++ b/tests/13imsm-r0_r0_2d-grow-r0_r0_5d @@ -0,0 +1,29 @@ +. tests/env-imsm-template + +# Grow both members from 2 disks to 5 disks +# POSITIVE test + +num_disks=2 +device_list="$dev0 $dev1" +spare_list="$dev2 $dev3 $dev4" + +# Before: RAID 0 volume in slot #0, 2 disks, 64k chunk size +# RAID 0 volume in slot #1, 2 disks, 256k chunk size +vol0_level=0 +vol0_comp_size=$((4 * 1024)) +vol0_chunk=64 +vol0_num_comps=$num_disks +vol0_offset=0 + +vol1_level=0 +vol1_comp_size=$((6 * 1024)) +vol1_chunk=256 +vol1_num_comps=$num_disks +vol1_offset=$((vol0_comp_size + 4096)) + +# After: RAID 0 volume in slot #0, 5 disks, 64k chunk size +# RAID 0 volume in slot #1, 5 disks, 256k chunk size +vol0_new_num_comps=$((num_disks + 3)) +vol1_new_num_comps=$vol0_new_num_comps + +. tests/imsm-grow-template 0 0 diff --git a/tests/13imsm-r0_r0_3d-grow-r0_r0_4d b/tests/13imsm-r0_r0_3d-grow-r0_r0_4d new file mode 100644 index 0000000..1ff6025 --- /dev/null +++ b/tests/13imsm-r0_r0_3d-grow-r0_r0_4d @@ -0,0 +1,29 @@ +. 
tests/env-imsm-template + +# Grow a container (arrays inside) from 3 disks to 4 disks +# POSITIVE test + +num_disks=3 +device_list="$dev0 $dev1 $dev2" +spare_list="$dev3" + +# Before: RAID 0 volume in slot #0, 3 disks, 128k chunk size +# RAID 0 volume in slot #1, 3 disks, 512k chunk size +vol0_level=0 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=128 +vol0_num_comps=$num_disks +vol0_offset=0 + +vol1_level=0 +vol1_comp_size=$((5 * 1024)) +vol1_chunk=128 +vol1_num_comps=$num_disks +vol1_offset=$((vol0_comp_size + 4096)) + +# After: RAID0 volume in slot #0, 4 disks, 128k chunk size +# RAID0 volume in slot #1, 4 disks, 512k chunk size +vol0_new_num_comps=$((num_disks + 1)) +vol1_new_num_comps=$vol0_new_num_comps + +. tests/imsm-grow-template 0 0 diff --git a/tests/13imsm-r0_r5_3d-grow-r0_r5_4d b/tests/13imsm-r0_r5_3d-grow-r0_r5_4d new file mode 100644 index 0000000..2977f36 --- /dev/null +++ b/tests/13imsm-r0_r5_3d-grow-r0_r5_4d @@ -0,0 +1,29 @@ +. tests/env-imsm-template + +# Grow the container (arrays inside) from 3 disks to 4 disks +# POSITIVE test + +num_disks=3 +device_list="$dev0 $dev1 $dev2" +spare_list="$dev3" + +# Before: RAID 0 volume in slot #0, 3 disks, 64k chunk size +# RAID 5 volume in slot #1, 3 disks, 128k chunk size +vol0_level=0 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$num_disks +vol0_offset=0 + +vol1_level=5 +vol1_comp_size=$((5 * 1024)) +vol1_chunk=128 +vol1_num_comps=$((num_disks - 1)) +vol1_offset=$((vol0_comp_size + 4096)) + +# After: RAID 0 volume in slot #0, 4 disks, 64k chunk size +# RAID 5 volume in slot #1, 4 disks, 128k chunk size +vol1_new_num_comps=$num_disks +vol0_new_num_comps=$((num_disks + 1)) + +. tests/imsm-grow-template 0 0 diff --git a/tests/13imsm-r0_r5_3d-grow-r0_r5_5d b/tests/13imsm-r0_r5_3d-grow-r0_r5_5d new file mode 100644 index 0000000..ff15ad0 --- /dev/null +++ b/tests/13imsm-r0_r5_3d-grow-r0_r5_5d @@ -0,0 +1,29 @@ +. 
tests/env-imsm-template + +# Grow the container (arrays inside) from 3 disks to 5 disks +# POSITIVE test + +num_disks=3 +device_list="$dev0 $dev1 $dev2" +spare_list="$dev3 $dev4" + +# Before: RAID 0 volume in slot #0, 3 disks, 256k chunk size +# RAID 5 volume in slot #1, 3 disks, 512k chunk size +vol0_level=0 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=128 +vol0_num_comps=$num_disks +vol0_offset=0 + +vol1_level=5 +vol1_comp_size=$((5 * 1024)) +vol1_chunk=128 +vol1_num_comps=$((num_disks - 1)) +vol1_offset=$((vol0_comp_size + 4096)) + +# After: RAID 0 volume in slot #0, 5 disks, 256k chunk size +# RAID 5 volume in slot #1, 5 disks, 512k chunk size +vol0_new_num_comps=$((num_disks + 2)) +vol1_new_num_comps=$((num_disks + 1)) + +. tests/imsm-grow-template 0 0 diff --git a/tests/13imsm-r5_r0_3d-grow-r5_r0_4d b/tests/13imsm-r5_r0_3d-grow-r5_r0_4d new file mode 100644 index 0000000..9fed88a --- /dev/null +++ b/tests/13imsm-r5_r0_3d-grow-r5_r0_4d @@ -0,0 +1,29 @@ +. tests/env-imsm-template + +# Grow the container (arrays inside) from 3 disks to 4 disks +# POSITIVE test + +num_disks=3 +device_list="$dev0 $dev1 $dev2" +spare_list="$dev3" + +# Before: RAID 5 volume in slot #0, 3 disks, 64k chunk size +# RAID 0 volume in slot #1, 3 disks, 64k chunk size +vol0_level=5 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$((num_disks - 1)) +vol0_offset=0 + +vol1_level=0 +vol1_comp_size=$((5 * 1024)) +vol1_chunk=64 +vol1_offset=$((vol0_comp_size + 4096)) +vol1_num_comps=$num_disks + +# After: RAID 5 volume in slot #0, 4 disks, 64k chunk size +# RAID 0 volume in slot #1, 4 disks, 64k chunk size +vol0_new_num_comps=$num_disks +vol1_new_num_comps=$((num_disks + 1)) + +. tests/imsm-grow-template 0 0 diff --git a/tests/13imsm-r5_r0_3d-grow-r5_r0_5d b/tests/13imsm-r5_r0_3d-grow-r5_r0_5d new file mode 100644 index 0000000..e8beddc --- /dev/null +++ b/tests/13imsm-r5_r0_3d-grow-r5_r0_5d @@ -0,0 +1,29 @@ +. 
tests/env-imsm-template + +# Grow the container (arrays inside) from 3 disks to 5 disks +# POSITIVE test + +num_disks=3 +device_list="$dev0 $dev1 $dev2" +spare_list="$dev3 $dev4" + +# Before: RAID 5 volume in slot #0, 3 disks, 64k chunk size +# RAID 0 volume in slot #1, 3 disks, 64k chunk size +vol0_level=5 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$((num_disks - 1)) +vol0_offset=0 + +vol1_level=0 +vol1_comp_size=$((5 * 1024)) +vol1_chunk=64 +vol1_offset=$((vol0_comp_size + 4096)) +vol1_num_comps=$num_disks + +# After: RAID 5 volume in slot #0, 5 disks, 64k chunk size +# RAID 0 volume in slot #1, 5 disks, 64k chunk size +vol0_new_num_comps=$((num_disks + 1)) +vol1_new_num_comps=$((num_disks + 2)) + +. tests/imsm-grow-template 0 0 diff --git a/tests/14imsm-r0_3d-r5_3d-migrate-r5_4d-r5_4d b/tests/14imsm-r0_3d-r5_3d-migrate-r5_4d-r5_4d new file mode 100644 index 0000000..cb7328a --- /dev/null +++ b/tests/14imsm-r0_3d-r5_3d-migrate-r5_4d-r5_4d @@ -0,0 +1,29 @@ +. tests/env-imsm-template + +# RAID 0 and RAID 5 volumes (3 disks) migrate to RAID 5 and RAID 5 volumes (4 disks) +# NEGATIVE test - migration is not allowed if there is more than one array in a container + +num_disks=3 +device_list="$dev0 $dev1 $dev2" +spare_list="$dev3" + +# Before: RAID 0 volume, 3 disks, 64k chunk size, as member #0 +vol0_level=0 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$num_disks +vol0_offset=0 + +# Extra: RAID 5 volume, 3 disks, 64k chunk size, as member #1 +vol1_level=5 +vol1_comp_size=$((5 * 1024)) +vol1_chunk=64 +vol1_num_comps=$((num_disks - 1)) +vol1_offset=$((vol0_comp_size + 4096)) + +# After: RAID 5 volume, 4 disks, 64k chunk size (only member #0) +vol0_new_level=5 +vol0_new_num_comps=$num_disks +vol0_new_chunk=64 + +. 
tests/imsm-grow-template 1 1 diff --git a/tests/14imsm-r0_3d_no_spares-migrate-r5_3d b/tests/14imsm-r0_3d_no_spares-migrate-r5_3d new file mode 100644 index 0000000..10bbab6 --- /dev/null +++ b/tests/14imsm-r0_3d_no_spares-migrate-r5_3d @@ -0,0 +1,21 @@ +. tests/env-imsm-template + +# RAID 0 volume (3 disks, no spares) migrate to RAID 5 volume (3 disks) +# NEGATIVE test + +num_disks=3 +device_list="$dev0 $dev1 $dev2" + +# Before: RAID 0 volume, 3 disks, 64k chunk size +vol0_level=0 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$num_disks +vol0_offset=0 + +# After: RAID 5, 3 disks, 64k chunk size +vol0_new_level=5 +vol0_new_num_comps=$((num_disks - 1)) +vol0_new_chunk=64 + +. tests/imsm-grow-template 1 diff --git a/tests/14imsm-r0_r0_2d-takeover-r10_4d b/tests/14imsm-r0_r0_2d-takeover-r10_4d new file mode 100644 index 0000000..d068abb --- /dev/null +++ b/tests/14imsm-r0_r0_2d-takeover-r10_4d @@ -0,0 +1,30 @@ +. tests/env-imsm-template + + +# Two RAID 0 volumes (2 disks) migrate to RAID 10 volume (4 disks) +# NEGATIVE test + +num_disks=2 +device_list="$dev0 $dev1" + +# Before: RAID 0 volume in slot #0, 2 disks, 64k chunk size +# RAID 0 volume in slot #1, 2 disks, 64k chunk size +vol0_level=0 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$num_disks +vol0_offset=0 + +# Member #1: RAID 0 volume, 2 disks, 64k chunk size +vol1_level=0 +vol1_comp_size=$((5 * 1024)) +vol1_chunk=64 +vol1_num_comps=$num_disks +vol1_offset=$(( $vol0_comp_size + 4096 )) + +# After: RAID 10, 4 disks, 64k chunk size +vol0_new_level=10 +vol0_new_num_comps=$((num_disks - 1)) +vol0_new_chunk=64 + +. tests/imsm-grow-template 1 1 diff --git a/tests/14imsm-r10_4d-grow-r10_5d b/tests/14imsm-r10_4d-grow-r10_5d new file mode 100644 index 0000000..bcbe147 --- /dev/null +++ b/tests/14imsm-r10_4d-grow-r10_5d @@ -0,0 +1,20 @@ +. 
tests/env-imsm-template + +# RAID 10 volume, 4 disks grow to RAID 10 volume, 5 disks +# NEGATIVE test + +num_disks=4 +device_list="$dev0 $dev1 $dev2 $dev3" +spare_list="$dev4" + +# Before: RAID 10 volume, 4 disks, 128k chunk size +vol0_level=10 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=128 +vol0_num_comps=$((num_disks - 2)) +vol0_offset=0 + +# After: RAID 10 volume, 5 disks, 128k chunks size (test should fail) +vol0_new_num_comps=$((num_disks + 1)) + +. tests/imsm-grow-template 1 0 diff --git a/tests/14imsm-r10_r5_4d-takeover-r0_2d b/tests/14imsm-r10_r5_4d-takeover-r0_2d new file mode 100644 index 0000000..720e575 --- /dev/null +++ b/tests/14imsm-r10_r5_4d-takeover-r0_2d @@ -0,0 +1,30 @@ +. tests/env-imsm-template + + +# Two RAID volumes: RAID10 and RAID5 (4 disks) migrate to RAID 0 volume (2 disks) +# NEGATIVE test + +num_disks=4 +device_list="$dev0 $dev1 $dev2 $dev3" + +# Before: RAID 10 volume in slot #0, 4 disks, 64k chunk size +# RAID 5 volume in slot #1, 4 disks, 64k chunk size +vol0_level=10 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$(( $num_disks - 2 )) +vol0_offset=0 + +# Before: RAID 0 volume, disks, 64k chunk size +vol1_level=5 +vol1_comp_size=$((5 * 1024)) +vol1_chunk=64 +vol1_num_comps=$(( $num_disks - 1 )) +vol1_offset=$(( $vol0_comp_size + 4096 )) + +# After: RAID 10, 4 disks, 64k chunk size +vol0_new_level=0 +vol0_new_num_comps=2 +vol0_new_chunk=64 + +. tests/imsm-grow-template 1 1 diff --git a/tests/14imsm-r1_2d-grow-r1_3d b/tests/14imsm-r1_2d-grow-r1_3d new file mode 100644 index 0000000..be20ab8 --- /dev/null +++ b/tests/14imsm-r1_2d-grow-r1_3d @@ -0,0 +1,19 @@ +. 
tests/env-imsm-template + +# RAID 1 volume, 2 disks grow to RAID 1 volume, 3 disks +# NEGATIVE test + +num_disks=2 +device_list="$dev0 $dev1" +spare_list="$dev4" + +# Before: RAID 1 volume, 2 disks, 64k chunk size +vol0_level=1 +vol0_comp_size=$((5 * 1024)) +vol0_num_comps=$((num_disks - 1)) +vol0_offset=0 + +# After: RAID 1 volume, 3 disks, 64k chunks size (test should fail) +vol0_new_num_comps=$num_disks + +. tests/imsm-grow-template 1 0 diff --git a/tests/14imsm-r1_2d-takeover-r0_2d b/tests/14imsm-r1_2d-takeover-r0_2d new file mode 100644 index 0000000..27002e1 --- /dev/null +++ b/tests/14imsm-r1_2d-takeover-r0_2d @@ -0,0 +1,21 @@ +. tests/env-imsm-template + +# RAID 1 volume, 2 disks change to RAID 0 volume, 2 disks +# +#NEGATIVE test + +num_disks=2 +device_list="$dev0 $dev1" + +# Before: RAID 1 volume, 2 disks, 64k chunk size +vol0_level=1 +vol0_comp_size=$((5 * 1024)) +vol0_num_comps=$((num_disks - 1)) +vol0_offset=0 + +# After: RAID 0 volume, 2 disks, 64k chunk size +vol0_new_level=0 +vol0_new_num_comps=$num_disks +vol0_new_chunk=64 + +. tests/imsm-grow-template 1 diff --git a/tests/14imsm-r5_3d-grow-r5_5d-no-spares b/tests/14imsm-r5_3d-grow-r5_5d-no-spares new file mode 100644 index 0000000..ed18e72 --- /dev/null +++ b/tests/14imsm-r5_3d-grow-r5_5d-no-spares @@ -0,0 +1,20 @@ +. tests/env-imsm-template + +# RAID 5 volume, 3 disks grow to RAID 5 volume, 4 disks +# NEGATIVE test + +num_disks=3 +device_list="$dev0 $dev1 $dev2" + +# Before: RAID 5 volume, 3 disks, 64k chunk size +vol0_level=5 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$((num_disks - 1)) +vol0_offset=0 + +# After: RAID 5 volume, 4 disks, 64k chunks size +add_to_num_disks=2 +vol0_new_num_comps=$((num_disks + 2)) + +. tests/imsm-grow-template 1 0 diff --git a/tests/14imsm-r5_3d-migrate-r4_3d b/tests/14imsm-r5_3d-migrate-r4_3d new file mode 100644 index 0000000..e3b971c --- /dev/null +++ b/tests/14imsm-r5_3d-migrate-r4_3d @@ -0,0 +1,21 @@ +. 
tests/env-imsm-template + +# RAID 5 volume (3 disks) migrate to RAID 4 volume (3 disks) +# NEGATIVE test + +num_disks=3 +device_list="$dev0 $dev1 $dev2" + +# Before: RAID 5 volume, 3 disks, 64k chunk size +vol0_level=5 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$((num_disks - 1)) +vol0_offset=0 + +# After: RAID 4, 3 disks, 64k chunk size +vol0_new_level=4 +vol0_new_num_comps=$((num_disks - 1)) +vol0_new_chunk=64 + +. tests/imsm-grow-template 1 diff --git a/tests/15imsm-r0_3d_64k-migrate-r0_3d_256k b/tests/15imsm-r0_3d_64k-migrate-r0_3d_256k new file mode 100644 index 0000000..4fe3807 --- /dev/null +++ b/tests/15imsm-r0_3d_64k-migrate-r0_3d_256k @@ -0,0 +1,21 @@ +. tests/env-imsm-template + +# RAID 0 volume, Migration from 64k to 256k chunk size. +# POSITIVE test + +num_disks=2 +device_list="$dev0 $dev1" + +# RAID 0, 2 disks, 64k chunk size +vol0_level=0 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$num_disks +vol0_offset=0 + +# RAID 0, 2 disks, 256k chunk size +vol0_new_level=0 +vol0_new_num_comps=$vol0_num_comps +vol0_new_chunk=256 + +. tests/imsm-grow-template 0 1 diff --git a/tests/15imsm-r5_3d_4k-migrate-r5_3d_256k b/tests/15imsm-r5_3d_4k-migrate-r5_3d_256k new file mode 100644 index 0000000..025e9ef --- /dev/null +++ b/tests/15imsm-r5_3d_4k-migrate-r5_3d_256k @@ -0,0 +1,21 @@ +. tests/env-imsm-template + +# RAID 5 volume, Migration from 4k to 256 chunk size. +# POSITIVE test + +num_disks=3 +device_list="$dev0 $dev1 $dev2" + +# RAID 5, 3 disks, 4k chunk size +vol0_level=5 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=4 +vol0_num_comps=$((num_disks - 1)) +vol0_offset=0 + +# RAID 5, 3 disks, 256k chunk size +vol0_new_level=5 +vol0_new_num_comps=$vol0_num_comps +vol0_new_chunk=256 + +. tests/imsm-grow-template 0 1 diff --git a/tests/15imsm-r5_3d_64k-migrate-r5_3d_256k b/tests/15imsm-r5_3d_64k-migrate-r5_3d_256k new file mode 100644 index 0000000..37547b7 --- /dev/null +++ b/tests/15imsm-r5_3d_64k-migrate-r5_3d_256k @@ -0,0 +1,21 @@ +. 
tests/env-imsm-template + +# RAID 5 volume, Migration from 64k to 256k chunk size. +# POSITIVE test + +num_disks=3 +device_list="$dev0 $dev1 $dev2" + +# RAID 5, 3 disks, 64k chunk size +vol0_level=5 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$((num_disks - 1)) +vol0_offset=0 + +# RAID 5, 3 disks, 256k chunk size +vol0_new_level=5 +vol0_new_num_comps=$vol0_num_comps +vol0_new_chunk=256 + +. tests/imsm-grow-template 0 1 diff --git a/tests/15imsm-r5_6d_4k-migrate-r5_6d_256k b/tests/15imsm-r5_6d_4k-migrate-r5_6d_256k new file mode 100644 index 0000000..d2f6c70 --- /dev/null +++ b/tests/15imsm-r5_6d_4k-migrate-r5_6d_256k @@ -0,0 +1,21 @@ +. tests/env-imsm-template + +# RAID 5 volume, Migration from 4k to 256k chunk size. +# POSITIVE test + +num_disks=6 +device_list="$dev0 $dev1 $dev2 $dev3 $dev4 $dev5" + +# RAID 5, 6 disks, 4k chunk size +vol0_level=5 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=4 +vol0_num_comps=$((num_disks - 1)) +vol0_offset=0 + +# RAID 5, 6 disks, 256k chunk size +vol0_new_level=5 +vol0_new_num_comps=$vol0_num_comps +vol0_new_chunk=256 + +. tests/imsm-grow-template 0 1 diff --git a/tests/15imsm-r5_r0_3d_64k-migrate-r5_r0_3d_256k b/tests/15imsm-r5_r0_3d_64k-migrate-r5_r0_3d_256k new file mode 100644 index 0000000..f9369d5 --- /dev/null +++ b/tests/15imsm-r5_r0_3d_64k-migrate-r5_r0_3d_256k @@ -0,0 +1,34 @@ +. 
tests/env-imsm-template + +# Member 0: RAID 5 volume, Member 1: RAID 0 volume +# Migration from 64k to 256k chunk size (both members) +# POSITIVE test + +num_disks=3 +device_list="$dev0 $dev1 $dev2" + +# RAID 5, 3 disks, 64k chunk size +vol0_level=5 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$((num_disks - 1)) +vol0_offset=0 + +# After migration parameters +vol0_new_level=5 +vol0_new_num_comps=$vol0_num_comps +vol0_new_chunk=256 + +# RAID 0, 3 disks, 64k chunk size +vol1_level=0 +vol1_comp_size=$((5 * 1024)) +vol1_chunk=64 +vol1_num_comps=$num_disks +vol1_offset=$((vol0_comp_size + 4096)) + +# After migration parameters +vol1_new_level=0 +vol1_new_num_comps=$vol1_num_comps +vol1_new_chunk=256 + +. tests/imsm-grow-template 0 1 diff --git a/tests/16imsm-r0_3d-migrate-r5_4d b/tests/16imsm-r0_3d-migrate-r5_4d new file mode 100644 index 0000000..265adf9 --- /dev/null +++ b/tests/16imsm-r0_3d-migrate-r5_4d @@ -0,0 +1,22 @@ +. tests/env-imsm-template + +# RAID 0 volume (3 disks) migrate to RAID 5 volume (4 disks) +# POSITIVE test + +num_disks=3 +device_list="$dev0 $dev1 $dev2" + +# Before: RAID 0, 3 disks, 64k chunk size +vol0_level=0 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$num_disks +vol0_offset=0 + +# After: RAID 5, 4 disks, 64k chunk size +vol0_new_level=5 +new_num_disks=4 +vol0_new_num_comps=$num_disks +vol0_new_chunk=64 + +. tests/imsm-grow-template 0 1 diff --git a/tests/16imsm-r0_5d-migrate-r5_6d b/tests/16imsm-r0_5d-migrate-r5_6d new file mode 100644 index 0000000..535b609 --- /dev/null +++ b/tests/16imsm-r0_5d-migrate-r5_6d @@ -0,0 +1,22 @@ +. 
tests/env-imsm-template + +# RAID 0 volume (5 disks) migrate to RAID 5 volume (6 disks) +# POSITIVE test + +num_disks=5 +device_list="$dev0 $dev1 $dev2 $dev3 $dev4" + +# Before: RAID 0, 5 disks, 64k chunk size +vol0_level=0 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$num_disks +vol0_offset=0 + +# After: RAID 5, 6 disks, 64k chunk size +vol0_new_level=5 +vol0_new_num_comps=$num_disks +vol0_new_chunk=64 +new_num_disks=6 + +. tests/imsm-grow-template 0 1 diff --git a/tests/16imsm-r5_3d-migrate-r0_3d b/tests/16imsm-r5_3d-migrate-r0_3d new file mode 100644 index 0000000..bcb5709 --- /dev/null +++ b/tests/16imsm-r5_3d-migrate-r0_3d @@ -0,0 +1,21 @@ +. tests/env-imsm-template + +# RAID 5 volume (3 disks) migrate to RAID 0 volume (2 disks) +# NEGATIVE test + +num_disks=3 +device_list="$dev0 $dev1 $dev2" + +# Before: RAID 5, 3 disks, 64k chunk size +vol0_level=5 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$((num_disks - 1)) +vol0_offset=0 + +# After: RAID 0, 3 disks, 64k chunk size +vol0_new_level=0 +vol0_new_num_comps=$((num_disks-1)) +vol0_new_chunk=64 + +. tests/imsm-grow-template 1 1 diff --git a/tests/16imsm-r5_5d-migrate-r0_5d b/tests/16imsm-r5_5d-migrate-r0_5d new file mode 100644 index 0000000..ca77435 --- /dev/null +++ b/tests/16imsm-r5_5d-migrate-r0_5d @@ -0,0 +1,21 @@ +. tests/env-imsm-template + +# RAID 5 volume (5 disks) migration to RAID 0 volume (4 disks) +# NEGATIVE test + +num_disks=5 +device_list="$dev0 $dev1 $dev2 $dev3 $dev4" + +# Before: RAID 5 volume, 5 disks, 64k chunk size +vol0_level=5 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=64 +vol0_num_comps=$((num_disks - 1)) +vol0_offset=0 + +# After: RAID 0 volume, 5 disks, 64k chunk size +vol0_new_level=0 +vol0_new_num_comps=$((num_disks - 1)) +vol0_new_chunk=64 + +. 
tests/imsm-grow-template 1 1 diff --git a/tests/18imsm-1d-takeover-r0_1d b/tests/18imsm-1d-takeover-r0_1d new file mode 100644 index 0000000..6f5cf5a --- /dev/null +++ b/tests/18imsm-1d-takeover-r0_1d @@ -0,0 +1,22 @@ +. tests/env-imsm-template + +# Create RAID 0 from a single disk. +# POSITIVE test + +vol0_num_comps=1 +vol0_comp_size=$((10 * 1024)) + +# Create container +mdadm --create --run $container --auto=md --metadata=imsm --force --raid-disks=$vol0_num_comps $dev0 +check wait +imsm_check container $vol0_num_comps + +# Create RAID 0 volume +mdadm --create --run $member0 --auto=md --level=0 --size=$vol0_comp_size --chunk=64 --force --raid-disks=$vol0_num_comps $dev0 +check wait + +# Test the member +imsm_check member $member0 $vol0_num_comps 0 $vol0_comp_size $((vol0_num_comps * vol0_comp_size)) 0 64 +testdev $member0 $vol0_num_comps $vol0_comp_size 64 + +exit 0 diff --git a/tests/18imsm-1d-takeover-r1_2d b/tests/18imsm-1d-takeover-r1_2d new file mode 100644 index 0000000..e38ed89 --- /dev/null +++ b/tests/18imsm-1d-takeover-r1_2d @@ -0,0 +1,20 @@ +. tests/env-imsm-template + +# Create RAID 1 from a single disk +# POSITIVE test + +vol0_num_comps=1 +vol0_comp_size=$((10 * 1024)) + +# Create container +mdadm --create --run $container --auto=md --metadata=imsm --force --raid-disks=$vol0_num_comps $dev0 +check wait +imsm_check container $vol0_num_comps + +# Create RAID 1 volume +mdadm --create --run $member0 --auto=md --level=1 --size=$vol0_comp_size --raid-disks=$((vol0_num_comps + 1)) $dev0 missing +check wait + +# Test the member0 +imsm_check member $member0 $((vol_num_comps + 1)) 1 $vol0_comp_size $((vol0_num_comps * vol0_comp_size)) 0 64 +testdev $member0 $vol0_num_comps $vol0_comp_size 64 diff --git a/tests/18imsm-r0_2d-takeover-r10_4d b/tests/18imsm-r0_2d-takeover-r10_4d new file mode 100644 index 0000000..0e77e5d --- /dev/null +++ b/tests/18imsm-r0_2d-takeover-r10_4d @@ -0,0 +1,22 @@ +. 
tests/env-imsm-template + +# RAID 0 volume, 2 disks change to RAID 10 volume, 4 disks +# POSITIVE test + +num_disks=2 +device_list="$dev0 $dev1" +spare_list="$dev2 $dev3" + +# Before: RAID 0 volume, 2 disks, 128k chunk size +vol0_level=0 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=128 +vol0_num_comps=$num_disks +vol0_offset=0 + +# After: RAID 10 volume, 4 disks, 128k chunk size +vol0_new_level=10 +vol0_new_num_comps=$vol0_num_comps +vol0_new_chunk=128 + +. tests/imsm-grow-template 0 1 diff --git a/tests/18imsm-r10_4d-takeover-r0_2d b/tests/18imsm-r10_4d-takeover-r0_2d new file mode 100644 index 0000000..8a9606b --- /dev/null +++ b/tests/18imsm-r10_4d-takeover-r0_2d @@ -0,0 +1,22 @@ +. tests/env-imsm-template + +# RAID 10 volume, 4 disks change to RAID 0 volume, 2 disks +# POSITIVE test + +num_disks=4 +device_list="$dev0 $dev1 $dev2 $dev3" + +# Before: RAID 10 volume, 4 disks, 128k chunk size +vol0_level=10 +vol0_comp_size=$((5 * 1024)) +vol0_chunk=128 +vol0_num_comps=$((num_disks - 2)) +vol0_offset=0 + +# After: RAID 0 volume, 2 disks, 128k chunk size +vol0_new_level=0 +vol0_new_num_comps=2 +vol0_new_chunk=128 +new_num_disks=2 + +. tests/imsm-grow-template 0 1 diff --git a/tests/18imsm-r1_2d-takeover-r0_1d b/tests/18imsm-r1_2d-takeover-r0_1d new file mode 100644 index 0000000..049f19c --- /dev/null +++ b/tests/18imsm-r1_2d-takeover-r0_1d @@ -0,0 +1,21 @@ +. tests/env-imsm-template + +# RAID 1 volume, 2 disks change to RAID 0 volume, 1 disks +# POSITIVE test + +num_disks=2 +device_list="$dev0 $dev1" + +# Before: RAID 1 volume, 2 disks +vol0_level=1 +vol0_comp_size=$((5 * 1024)) +vol0_num_comps=$(( $num_disks - 1 )) +vol0_offset=0 + +# After: RAID 0 volume, 1 disks, 64k chunk size +vol0_new_level=0 +vol0_new_num_comps=1 +vol0_new_chunk=64 +new_num_disks=1 + +. 
tests/imsm-grow-template 0 1 diff --git a/tests/19raid6auto-repair b/tests/19raid6auto-repair new file mode 100644 index 0000000..ce4a7c0 --- /dev/null +++ b/tests/19raid6auto-repair @@ -0,0 +1,49 @@ +number_of_disks=5 +chunksize_in_kib=512 +chunksize_in_b=$[chunksize_in_kib*1024] +array_data_size_in_kib=$[chunksize_in_kib*(number_of_disks-2)*number_of_disks] +array_data_size_in_b=$[array_data_size_in_kib*1024] +devs="$dev0 $dev1 $dev2 $dev3 $dev4" + +# default 2048 sectors +data_offset_in_kib=$[2048/2] + +# make a raid5 from a file +dd if=/dev/urandom of=/tmp/RandFile bs=1024 count=$array_data_size_in_kib + +# perform test for every layout +layouts="ls rs la ra parity-first ddf-zero-restart ddf-N-restart ddf-N-continue \ + left-asymmetric-6 right-asymmetric-6 left-symmetric-6 \ + right-symmetric-6 parity-first-6" + +for layout in $layouts +do + mdadm -CR $md0 -l6 --layout=$layout -n$number_of_disks -c $chunksize_in_kib $devs + dd if=/tmp/RandFile of=$md0 bs=1024 count=$array_data_size_in_kib + blockdev --flushbufs $md0; sync + check wait + blockdev --flushbufs $devs; sync + echo 3 > /proc/sys/vm/drop_caches + cmp -s -n $array_data_size_in_b $md0 /tmp/RandFile || { echo sanity cmp failed ; exit 2; } + + # wipe out 5 chunks on each device + dd if=/dev/urandom of=$dev0 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*0] + dd if=/dev/urandom of=$dev1 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*5] + dd if=/dev/urandom of=$dev2 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*10] + dd if=/dev/urandom of=$dev3 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*15] + dd if=/dev/urandom of=$dev4 bs=1024 count=$[5*chunksize_in_kib] seek=$[data_offset_in_kib+chunksize_in_kib*20] + + blockdev --flushbufs $devs; sync + echo 3 > /proc/sys/vm/drop_caches + + $dir/raid6check $md0 0 0 2>&1 | grep -qs "Error" || { echo should detect errors; exit 2; } + + 
$dir/raid6check $md0 0 0 autorepair > /dev/null || { echo repair failed; exit 2; } + blockdev --flushbufs $md0 $devs; sync + echo 3 > /proc/sys/vm/drop_caches + + $dir/raid6check $md0 0 0 2>&1 | grep -qs "Error" && { echo errors detected; exit 2; } + cmp -s -n $array_data_size_in_b $md0 /tmp/RandFile || { echo cmp failed ; exit 2; } + + mdadm -S $md0 +done diff --git a/tests/19raid6check b/tests/19raid6check new file mode 100644 index 0000000..67958c6 --- /dev/null +++ b/tests/19raid6check @@ -0,0 +1,27 @@ +# +# Confirm that raid6check handles all RAID6 layouts. +# Try both 4 and 5 devices. + +layouts='ls rs la ra' +lv=`uname -r` +if expr $lv '>=' 2.6.30 > /dev/null +then + layouts="$layouts parity-first ddf-zero-restart ddf-N-restart ddf-N-continue \ + left-asymmetric-6 right-asymmetric-6 left-symmetric-6 right-symmetric-6 parity-first-6" +fi + +for layout in $layouts +do + for devs in 4 5 + do + dl="$dev0 $dev1 $dev2 $dev3" + if [ $devs = 5 ]; then dl="$dl $dev4"; fi + + mdadm -CR $md0 -l6 --layout $layout -n$devs $dl + check wait + tar cf - /etc > $md0 + ./raid6check $md0 0 0 | grep 'Error detected' && exit 1 + mdadm -S $md0 + done +done + diff --git a/tests/19raid6repair b/tests/19raid6repair new file mode 100644 index 0000000..26846cc --- /dev/null +++ b/tests/19raid6repair @@ -0,0 +1,56 @@ +number_of_disks=4 +chunksize_in_kib=512 +chunksize_in_b=$[chunksize_in_kib*1024] +array_data_size_in_kib=$[chunksize_in_kib*(number_of_disks-2)*number_of_disks] +array_data_size_in_b=$[array_data_size_in_kib*1024] +devs="$dev1 $dev2 $dev3 $dev4" + +# default 2048 sectors +data_offset_in_kib=$[2048/2] + +layouts="ls rs la ra parity-first ddf-zero-restart ddf-N-restart ddf-N-continue \ + left-asymmetric-6 right-asymmetric-6 left-symmetric-6 \ + right-symmetric-6 parity-first-6" + +for layout in $layouts +do + for failure in "$dev3 3 3 2" "$dev3 3 2 3" "$dev3 3 2 1" "$dev3 3 2 0" \ + "$dev4 3 3 0" "$dev4 3 3 1" "$dev4 3 3 2" \ + "$dev1 3 0 1" "$dev1 3 0 2" "$dev1 3 0 3" \ + 
"$dev2 3 1 0" "$dev2 3 1 2" "$dev2 3 1 3" ; do + failure_split=( $failure ) + device_with_error=${failure_split[0]} + stripe_with_error=${failure_split[1]} + repair_params="$stripe_with_error ${failure_split[2]} ${failure_split[3]}" + start_of_errors_in_kib=$[data_offset_in_kib+chunksize_in_kib*stripe_with_error] + + # make a raid5 from a file + dd if=/dev/urandom of=/tmp/RandFile bs=1024 count=$array_data_size_in_kib + mdadm -CR $md0 -l6 --layout=$layout -n$number_of_disks -c $chunksize_in_kib $devs + dd if=/tmp/RandFile of=$md0 bs=1024 count=$array_data_size_in_kib + blockdev --flushbufs $md0; sync + + check wait + blockdev --flushbufs $devs; sync + echo 3 > /proc/sys/vm/drop_caches + cmp -s -n $array_data_size_in_b $md0 /tmp/RandFile || { echo sanity cmp failed ; exit 2; } + + dd if=/dev/urandom of=$device_with_error bs=1024 count=$chunksize_in_kib seek=$start_of_errors_in_kib + blockdev --flushbufs $device_with_error; sync + echo 3 > /proc/sys/vm/drop_caches + + $dir/raid6check $md0 0 0 2>&1 | grep -qs "Error" || { echo should detect errors; exit 2; } + + $dir/raid6check $md0 repair $repair_params > /dev/null || { echo repair failed; exit 2; } + blockdev --flushbufs $md0 $devs; sync + echo 3 > /proc/sys/vm/drop_caches + + $dir/raid6check $md0 0 0 2>&1 | grep -qs "Error" && { echo errors detected; exit 2; } + cmp -s -n $array_data_size_in_b $md0 /tmp/RandFile || { echo cmp failed ; exit 2; } + + mdadm -S $md0 + udevadm settle + sync + echo 3 > /proc/sys/vm/drop_caches + done +done diff --git a/tests/19repair-does-not-destroy b/tests/19repair-does-not-destroy new file mode 100644 index 0000000..a92883f --- /dev/null +++ b/tests/19repair-does-not-destroy @@ -0,0 +1,28 @@ +number_of_disks=7 +chunksize_in_kib=512 +array_data_size_in_kib=$[chunksize_in_kib*(number_of_disks-2)*number_of_disks] +array_data_size_in_b=$[array_data_size_in_kib*1024] +devs="$dev0 $dev1 $dev2 $dev3 $dev4 $dev5 $dev6" + +dd if=/dev/urandom of=/tmp/RandFile bs=1024 
count=$array_data_size_in_kib +mdadm -CR $md0 -l6 -n$number_of_disks -c $chunksize_in_kib $devs +dd if=/tmp/RandFile of=$md0 bs=1024 count=$array_data_size_in_kib +blockdev --flushbufs $md0; sync +check wait +blockdev --flushbufs $devs; sync +echo 3 > /proc/sys/vm/drop_caches +$dir/raid6check $md0 repair 1 2 3 > /dev/null # D D +$dir/raid6check $md0 repair 8 2 5 > /dev/null # D P +$dir/raid6check $md0 repair 15 4 6 > /dev/null # D Q +$dir/raid6check $md0 repair 22 5 6 > /dev/null # P Q +$dir/raid6check $md0 repair 3 4 0 > /dev/null # Q D +$dir/raid6check $md0 repair 3 3 1 > /dev/null # P D +$dir/raid6check $md0 repair 6 4 5 > /dev/null # D<D +$dir/raid6check $md0 repair 13 5 4 > /dev/null # D>D +blockdev --flushbufs $devs; sync +echo 3 > /proc/sys/vm/drop_caches +$dir/raid6check $md0 0 0 2>&1 | grep -qs "Error" && { echo errors detected; exit 2; } +cmp -s -n $array_data_size_in_b $md0 /tmp/RandFile || { echo should not mess up correct stripe ; exit 2; } + +mdadm -S $md0 +udevadm settle diff --git a/tests/20raid5journal b/tests/20raid5journal new file mode 100644 index 0000000..f751ace --- /dev/null +++ b/tests/20raid5journal @@ -0,0 +1,64 @@ +# check write journal of raid456 + +# test --detail +test_detail_shows_journal() { + mdadm -D $1 | grep journal || { + echo >&2 "ERROR --detail does show journal device!"; mdadm -D $1 ; exit 1; } +} + +# test --examine +test_examine_shows_journal() { + mdadm -E $1 | grep Journal || { + echo >&2 "ERROR --examine does show Journal device!"; mdadm -E $1 ; exit 1; } +} + +# test --create +create_with_journal_and_stop() { + mdadm -CR $md0 -l5 -n4 $dev0 $dev1 $dev2 $dev3 --write-journal $dev4 + check wait + tar cf - /etc > $md0 + ./raid6check $md0 0 0 | grep 'Error detected' && exit 1 + test_detail_shows_journal $md0 + test_examine_shows_journal $dev4 + mdadm -S $md0 +} + +# test --assemble +test_assemble() { + create_with_journal_and_stop + if mdadm -A $md0 $dev0 $dev1 $dev2 $dev3 + then + echo >&2 "ERROR should return 1 when 
journal is missing!"; cat /proc/mdstat ; exit 1; + fi + mdadm -S $md0 + + mdadm -A $md0 $dev0 $dev1 $dev2 $dev3 --force + check readonly + mdadm -S $md0 +} + +# test --incremental +test_incremental() { + create_with_journal_and_stop + for d in $dev0 $dev1 $dev2 $dev3 + do + mdadm -I $d + done + check inactive + mdadm -I $dev4 + check raid5 + mdadm -S $md0 + + # test --incremental with journal missing + for d in $dev0 $dev1 $dev2 $dev3 + do + mdadm -I $d + done + mdadm -R $md0 + check readonly + mdadm -S $md0 +} + +create_with_journal_and_stop +test_assemble +test_incremental diff --git a/tests/21raid5cache b/tests/21raid5cache new file mode 100644 index 0000000..0dd97bf --- /dev/null +++ b/tests/21raid5cache @@ -0,0 +1,87 @@ +# check data integrity with raid5 write back cache + +# create a 4kB random file and 4 files each with a 1kB chunk of the random file: +# randfile: ABCD randchunk[0-3]: A B C D +# +# then create another random 1kB chunk E, and a new random page with A, B, E, D: +# randchunk4: E newrandfile: ABED +create_random_data() { + dd if=/dev/urandom of=/tmp/randfile bs=4k count=1 + for x in {0..3} + do + dd if=/tmp/randfile of=/tmp/randchunk$x bs=1k count=1 skip=$x count=1 + done + + dd if=/dev/urandom of=/tmp/randchunk4 bs=1k count=1 + + rm /tmp/newrandfile + for x in 0 1 4 3 + do + cat /tmp/randchunk$x >> /tmp/newrandfile + done +} + +# create array, $1 could be 5 for raid5 and 6 for raid6 +create_array() { + if [ $1 -lt 5 -o $1 -gt 6 ] + then + echo wrong array type $1 + exit 2 + fi + + mdadm -CR $md0 -c4 -l5 -n10 $dev0 $dev1 $dev2 $dev3 $dev4 $dev5 $dev6 $dev11 $dev8 $dev9 --write-journal $dev10 + check wait + echo write-back > /sys/block/md0/md/journal_mode +} + +restart_array_write_back() { + mdadm -S $md0 + mdadm -A $md0 $dev0 $dev1 $dev2 $dev3 $dev4 $dev5 $dev6 $dev11 $dev8 $dev9 $dev10 + echo write-back > /sys/block/md0/md/journal_mode +} + +# compare the first page of md0 with file in $1 +cmp_first_page() { + cmp -n 4096 $1 $md0 || { echo cmp 
failed ; exit 2 ; } +} + +# write 3 pages after the first page of md0 +write_three_pages() { + for x in {1..3} + do + dd if=/dev/urandom of=$md0 bs=4k count=1 seek=$x count=1 + done +} + +# run_test <array_type:5/6> <degraded_or_not:yes/no> +run_test() { + create_random_data + create_array $1 + + if [ $2 == yes ] + then + mdadm --fail $md0 $dev0 + fi + + dd if=/tmp/randfile of=$md0 bs=4k count=1 + restart_array_write_back + cmp_first_page /tmp/randfile + restart_array_write_back + write_three_pages + cmp_first_page /tmp/randfile + + + dd if=/tmp/randchunk4 of=/dev/md0 bs=1k count=1 seek=2 + restart_array_write_back + cmp_first_page /tmp/newrandfile + restart_array_write_back + write_three_pages + cmp_first_page /tmp/newrandfile + + mdadm -S $md0 +} + +run_test 5 no +run_test 5 yes +run_test 6 no +run_test 6 yes diff --git a/tests/ToTest b/tests/ToTest new file mode 100644 index 0000000..b98e266 --- /dev/null +++ b/tests/ToTest @@ -0,0 +1,44 @@ + +multipath!! + +add/remove/fail + raid1 DONE + raid5 DONE + raid6/10 needed?? + +assemble + by devices DONE + by uuid DONE + by superminor DONE + by config file DONE + + various --updates DONE (not sparc2.2 or summaries) + +stop + --scan + +readonly/readwrite + +bitmap + separate file + internal + filename in config file + +examine + --scan + --brief + +detail + +grow: + size + raid1/5/6 DONE + devices + raid1 add DONE + raid1 shrink DONE + +'--quiet' option, and remove "" +'--name' option fo v1, and configfile etc... 
+ +faulty + errors in raid1/5/6 diff --git a/tests/env-ddf-template b/tests/env-ddf-template new file mode 100644 index 0000000..90d7272 --- /dev/null +++ b/tests/env-ddf-template @@ -0,0 +1,113 @@ +sha1_sum() { + sha1sum "$1" | cut -c 1-40 +} + +get_rootdev() { + local dev=$(stat -c %D /) + local maj=$(expr $dev : '\(..*\)..') + local min=${dev#$maj} + local bd=/dev/$(basename $(readlink /sys/dev/block/$((0x$maj)):$((0x$min)))) + [ -b $bd ] || exit 1 + echo $bd +} + +get_sysdir() { + local mddev=$1 + [ -L $mddev ] && mddev=$(readlink -f $mddev) + echo "/sys/class/block/$(basename $mddev)/md" +} + +get_raiddisks() { + sysdir=$(get_sysdir "$1") + for i in $(seq 0 $(($(cat $sysdir/raid_disks)-1))); do + if [ -d $sysdir/rd$i ]; then + readlink -f /dev/block/$(cat $sysdir/rd$i/block/dev) + else + echo MISSING + fi + done +} + +get_present() { + get_raiddisks $1 | grep -vc MISSING +} + +ddf_check() { + udevadm settle + case $1 in + container ) + grep -s "blocks super external:ddf" /proc/mdstat > /dev/null || { + echo >&2 "**Fatal** Correctly formed container not found"; cat /proc/mdstat; exit 1; } + ;; + member ) + t_member=$2 + t_num_disks=$3 + t_level=$4 + t_rd_size=$5 + t_size=$6 + t_offset=$7 + t_chunk=$8 + t_layout=$9 + + if [ $t_chunk -ne 0 ]; then + t_rd_size=$((t_rd_size & ~(t_chunk - 1))) + fi + case $t_level in + 0) t_size=$((t_num_disks*$t_rd_size));; + 1) t_size=$t_rd_size;; + 4|5) t_size=$(((t_num_disks-1)*$t_rd_size));; + 6) t_size=$(((t_num_disks-2)*$t_rd_size));; + 10) t_size=$((t_num_disks*$t_rd_size/t_layout));; + esac + + err=0 + + eval `stat -L -c "let major=0x%t; let minor=0x%T;" $t_member` + sysfs=/sys/dev/block/${major}:${minor} + if [ ! 
-f ${sysfs}/md/array_state ]; then + echo "**Fatal**: Array member $t_member not found" >&2; cat /proc/mdstat >&2; exit 1 + fi + _chunk=`cat ${sysfs}/md/chunk_size` + if [ $t_chunk -ne $((_chunk/1024)) ]; then + echo "**Error**: Chunk size mismatch - expected $t_chunk, actual $_chunk" >&2 + err=$((err + 1)) + fi + for i in `seq 0 $((t_num_disks - 1))`; do + _offset=`cat ${sysfs}/md/rd${i}/offset` + if [ $t_offset -ne $((_offset / 2)) ]; then + echo "**Error**: Offset mismatch - expected $t_offset, actual $((_offset/2))" >&2 + err=$((err + 1)) + fi + _rd_size=`cat ${sysfs}/md/rd${i}/size` + if [ $t_rd_size -ne $_rd_size ]; then + echo "**Error**: Component size mismatch - expected $t_rd_size, actual $_rd_size" >&2 + err=$((err + 1)) + fi + done + _size=`cat ${sysfs}/md/array_size` + [ o$_size = odefault ] && _size=$(($(cat ${sysfs}/size)/2)) + if [ $t_size -ne $_size ]; then + echo "**Error**: Array size mismatch - expected $t_size, actual $_size" >&2 + err=$((err + 1)) + fi + if [ $err -gt 0 ]; then + echo "$t_member failed check" >&2 + cat /proc/mdstat >&2 + mdadm -E /dev/loop8 >&2 + exit 1 + fi + ;; + * ) + echo >&2 "**Error** unknown check $1"; exit 1; + esac +} + +container=/dev/md/ddf0 +member0=/dev/md/vol0 +member1=/dev/md/vol1 +member2=/dev/md/vol2 +member3=/dev/md/vol3 +member4=/dev/md/vol4 + +# We don't want systemd to start system mdmon; start our own +export MDADM_NO_SYSTEMCTL=1 diff --git a/tests/env-imsm-template b/tests/env-imsm-template new file mode 100644 index 0000000..d524771 --- /dev/null +++ b/tests/env-imsm-template @@ -0,0 +1,91 @@ +imsm_check() { + udevadm settle + case $1 in + container ) + grep -s "blocks super external:imsm" /proc/mdstat > /dev/null || { + echo >&2 "**Fatal** Correctly formed container not found"; cat /proc/mdstat; exit 1; } + ;; + member ) + t_member=$2 + t_num_disks=$3 + t_level=$4 + t_rd_size=$5 + t_size=$6 + t_offset=$7 + t_chunk=$8 + + t_rd_size=$((t_rd_size & ~(1024 - 1))) + + if [ $t_level -eq 1 ]; then + 
t_chunk=64 + fi + + t_num_data_disks=0 + + case $t_level in + 0) + t_num_data_disks=$t_num_disks + ;; + 1) + t_num_data_disks=1 + ;; + 5) + t_num_data_disks=$((t_num_disks-1)) + ;; + 10) + t_num_data_disks=$((t_num_disks/2)) + ;; + esac + + t_size=$((t_rd_size*t_num_data_disks)) + + err=0 + + eval `stat -L -c "let major=0x%t; let minor=0x%T;" $t_member` + sysfs=/sys/dev/block/${major}:${minor} + if [ ! -f ${sysfs}/md/array_state ]; then + echo "**Fatal**: Array member $t_member not found" >&2; cat /proc/mdstat >&2; exit 1 + fi + _chunk=`cat ${sysfs}/md/chunk_size` + if [ $t_chunk -ne $((_chunk/1024)) ]; then + echo "**Error**: Chunk size mismatch - expected $t_chunk, actual $(($_chunk/1024))" >&2 + err=$((err + 1)) + fi + for i in `seq 0 $((t_num_disks - 1))`; do + _offset=`cat ${sysfs}/md/rd${i}/offset` + if [ $t_offset -ne $((_offset / 2)) ]; then + echo "**Error**: Offset mismatch - expected $t_offset, actual $_offset" >&2 + err=$((err + 1)) + fi + _rd_size=`cat ${sysfs}/md/rd${i}/size` + if [ $t_rd_size -ne $_rd_size ]; then + echo "**Error**: Component size mismatch - expected $t_rd_size, actual $_rd_size" >&2 + err=$((err + 1)) + fi + done + _size=`cat ${sysfs}/md/array_size` + if [ $t_size -ne $_size ]; then + echo "**Error**: Array size mismatch - expected $t_size, actual $_size" >&2 + err=$((err + 1)) + fi + if [ $err -gt 0 ]; then + echo "$t_member failed check" >&2 + cat /proc/mdstat >&2 + mdadm -E /dev/loop0 >&2 + exit 1 + fi + ;; + * ) + echo >&2 "**Error** unknown check $1"; exit 1; + esac +} + +export IMSM_NO_PLATFORM=1 +export IMSM_DEVNAME_AS_SERIAL=1 +export IMSM_TEST_OROM=1 +container=/dev/md/container +member0=/dev/md/vol0 +member1=/dev/md/vol1 +member2=/dev/md/vol2 +member3=/dev/md/vol3 +member4=/dev/md/vol4 diff --git a/tests/func.sh b/tests/func.sh new file mode 100644 index 0000000..9710a53 --- /dev/null +++ b/tests/func.sh @@ -0,0 +1,344 @@ +#!/bin/bash + +# We test mdadm on loop-back block devices. 
+# dir for storing files should be settable by command line maybe +size=20000 +# super0, round down to multiple of 64 and substract 64 +mdsize0=19904 +# super00 is nested, subtract 128 +mdsize00=19840 +# super1.0 round down to multiple of 2, subtract 8 +mdsize1=19992 +mdsize1a=19988 +mdsize12=19988 +# super1.2 for linear: round to multiple of 2, subtract 4 +mdsize1_l=19996 +mdsize2_l=19996 +# subtract another 4 for bitmaps +mdsize1b=19988 +mdsize11=19992 +mdsize11a=19456 +mdsize12=19988 + +# ddf needs bigger devices as 32Meg is reserved! +ddfsize=65536 + +# $1 is optional parameter, it shows why to save log +save_log() { + status=$1 + logfile="$status""$_basename".log + + cat $targetdir/stderr >> $targetdir/log + cp $targetdir/log $logdir/$_basename.log + echo "## $HOSTNAME: saving dmesg." >> $logdir/$logfile + dmesg -c >> $logdir/$logfile + echo "## $HOSTNAME: saving proc mdstat." >> $logdir/$logfile + cat /proc/mdstat >> $logdir/$logfile + array=($(mdadm -Ds | cut -d' ' -f2)) + [ "$1" == "fail" ] && + echo "FAILED - see $logdir/$_basename.log and $logdir/$logfile for details" + if [ $DEVTYPE == 'lvm' ] + then + # not supported lvm type yet + echo + elif [ "$DEVTYPE" == 'loop' -o "$DEVTYPE" == 'disk' ] + then + if [ ! -z "$array" -a ${#array[@]} -ge 1 ] + then + echo "## $HOSTNAME: mdadm -D ${array[@]}" >> $logdir/$logfile + $mdadm -D ${array[@]} >> $logdir/$logfile + # ignore saving external(external file, imsm...) bitmap + cat /proc/mdstat | grep -q "linear\|external" && return 0 + md_disks=($($mdadm -D -Y ${array[@]} | grep "/dev/" | cut -d'=' -f2)) + cat /proc/mdstat | grep -q "bitmap" + if [ $? -eq 0 ] + then + echo "## $HOSTNAME: mdadm -X ${md_disks[@]}" >> $logdir/$logfile + $mdadm -X ${md_disks[@]} >> $logdir/$logfile + echo "## $HOSTNAME: mdadm -E ${md_disks[@]}" >> $logdir/$logfile + $mdadm -E ${md_disks[@]} >> $logdir/$logfile + fi + else + echo "## $HOSTNAME: no array assembled!" 
>> $logdir/$logfile + fi + fi +} + +cleanup() { + udevadm settle + $mdadm -Ssq 2> /dev/null + case $DEVTYPE in + loop ) + for d in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 + do + losetup -d /dev/loop$d &> /dev/null + rm -f /dev/disk/by-path/loop* + rm -f /var/tmp/mdtest$d + done + ;; + lvm ) + for d in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 + do + eval "lvremove --quiet -f \$dev$d" + done + ;; + disk ) + $mdadm --zero ${disks[@]} &> /dev/null + ;; + esac +} + +do_clean() +{ + mdadm -Ss > /dev/null + mdadm --zero $devlist 2> /dev/null + dmesg -c > /dev/null +} + +check_env() { + user=$(id -un) + [ "X$user" != "Xroot" ] && { + echo "test: testing can only be done as 'root'." + exit 1 + } + [ \! -x $mdadm ] && { + echo "test: please run make everything before perform testing." + exit 1 + } + cmds=(mdadm lsblk df udevadm losetup mkfs.ext3 fsck seq) + for cmd in ${cmds[@]} + do + which $cmd > /dev/null || { + echo "$cmd command not found!" + exit 1 + } + done + if $(lsblk -a | grep -iq raid) + then + # donot run mdadm -Ss directly if there are RAIDs working. + echo "test: please run test suite without running RAIDs environment." + exit 1 + fi + # Check whether to run multipath tests + modprobe multipath 2> /dev/null + grep -sq 'Personalities : .*multipath' /proc/mdstat && + MULTIPATH="yes" +} + +do_setup() { + trap cleanup 0 1 3 15 + trap ctrl_c 2 + + check_env + [ -d $logdir ] || mkdir -p $logdir + + devlist= + if [ "$DEVTYPE" == "loop" ] + then + # make sure there are no loop devices remaining. + # udev started things can sometimes prevent them being stopped + # immediately + while grep loop /proc/partitions > /dev/null 2>&1 + do + $mdadm -Ssq + losetup -d /dev/loop[0-9]* 2> /dev/null + sleep 0.2 + done + elif [ "$DEVTYPE" == "disk" ] + then + if [ ! 
-z "$disks" ] + then + for d in $(seq 0 ${#disks[@]}) + do + eval "dev$d=${disks[$d]}" + eval devlist=\"\$devlist \$dev$d\" + eval devlist$d=\"\$devlist\" + done + $mdadm --zero ${disks[@]} &> /dev/null + else + echo "Forget to provide physical devices for disk mode." + exit 1 + fi + fi + for d in 0 1 2 3 4 5 6 7 8 9 10 11 12 13 + do + sz=$size + [ $d -gt 7 ] && sz=$ddfsize + case $DEVTYPE in + loop) + [ -f $targetdir/mdtest$d ] || + dd if=/dev/zero of=$targetdir/mdtest$d count=$sz bs=1K > /dev/null 2>&1 + # make sure udev doesn't touch + mdadm --zero $targetdir/mdtest$d 2> /dev/null + [ -b /dev/loop$d ] || mknod /dev/loop$d b 7 $d + if [ $d -eq 7 ] + then + losetup /dev/loop$d $targetdir/mdtest6 # for multipath use + else + losetup /dev/loop$d $targetdir/mdtest$d + fi + eval dev$d=/dev/loop$d + eval file$d=$targetdir/mdtest$d + ;; + lvm) + unset MULTIPATH + eval dev$d=/dev/mapper/${LVM_VOLGROUP}-mdtest$d + if ! lvcreate --quiet -L ${sz}K -n mdtest$d $LVM_VOLGROUP + then + trap '' 0 # make sure lvremove is not called + eval echo error creating \$dev$d + exit 129 + fi + ;; + ram) + unset MULTIPATH + eval dev$d=/dev/ram$d + ;; + esac + eval devlist=\"\$devlist \$dev$d\" + eval devlist$d=\"\$devlist\" + #" <-- add this quote to un-confuse vim syntax highlighting + done + path0=$dev6 + path1=$dev7 + ulimit -c unlimited + [ -f /proc/mdstat ] || modprobe md_mod + echo 2000 > /proc/sys/dev/raid/speed_limit_max + echo 0 > /sys/module/md_mod/parameters/start_ro +} + +# check various things +check() { + case $1 in + opposite_result ) + if [ $? 
-eq 0 ]; then + die "This command shouldn't run successfully" + fi + ;; + spares ) + spares=$(tr '] ' '\012\012' < /proc/mdstat | grep -c '(S)' || exit 0) + [ $spares -ne $2 ] && + die "expected $2 spares, found $spares" + ;; + raid* | linear ) + grep -sq "active $1 " /proc/mdstat || + die "active $1 not found" + ;; + algorithm ) + grep -sq " algorithm $2 " /proc/mdstat || + die "algorithm $2 not found" + ;; + resync | recovery | reshape ) + cnt=5 + while ! grep -sq $1 /proc/mdstat + do + if [ $cnt -gt 0 ] && grep -v idle /sys/block/md*/md/sync_action > /dev/null + then # Something isn't idle - wait a bit + sleep 0.5 + cnt=$[cnt-1] + else + die "no $1 happening" + fi + done + ;; + nosync ) + sleep 0.5 + # Since 4.2 we delay the close of recovery until there has been a chance for + # spares to be activated. That means that a recovery that finds nothing + # to do can still take a little longer than expected. + # add an extra check: is sync_completed shows the end is reached, assume + # there is no recovery. + if grep -sq -E '(resync|recovery|reshape) *=' /proc/mdstat + then + incomplete=`grep / /sys/block/md*/md/sync_completed 2> /dev/null | sed '/^ *\([0-9]*\) \/ \1/d'` + [ -n "$incomplete" ] && + die "resync or recovery is happening!" + fi + ;; + wait ) + p=`cat /proc/sys/dev/raid/speed_limit_max` + echo 2000000 > /proc/sys/dev/raid/speed_limit_max + sleep 0.1 + while grep -Eq '(resync|recovery|reshape|check|repair) *=' /proc/mdstat || + grep -v idle > /dev/null /sys/block/md*/md/sync_action + do + sleep 0.5 + done + echo $p > /proc/sys/dev/raid/speed_limit_max + ;; + state ) + grep -sq "blocks.*\[$2\]\$" /proc/mdstat || + die "state $2 not found!" + sleep 0.5 + ;; + bitmap ) + grep -sq bitmap /proc/mdstat || + die "no bitmap" + ;; + nobitmap ) + grep -sq "bitmap" /proc/mdstat && + die "bitmap present" + ;; + readonly ) + grep -sq "read-only" /proc/mdstat || + die "array is not read-only!" 
+ ;; + inactive ) + grep -sq "inactive" /proc/mdstat || + die "array is not inactive!" + ;; + # It only can be used when there is only one raid + chunk ) + chunk_size=`awk -F',' '/chunk/{print $2}' /proc/mdstat | awk -F'[a-z]' '{print $1}'` + if [ "$chunk_size" -ne "$2" ] ; then + die "chunksize should be $2, but it's $chunk_size" + fi + ;; + * ) + die "unknown check $1" + ;; + esac +} + +no_errors() { + if [ -s $targetdir/stderr ] + then + echo Bad errors from mdadm: + cat $targetdir/stderr + exit 2 + fi +} + +# basic device test +testdev() { + [ -b $1 ] || die "$1 isn't a block device." + [ "$DEVTYPE" == "disk" ] && return 0 + udevadm settle + dev=$1 + cnt=$2 + dvsize=$3 + chunk=$4 + if [ -z "$5" ] + then + mkfs.ext3 -F -j $dev > /dev/null 2>&1 && fsck -fn $dev >&2 + fi + dsize=$[dvsize/chunk] + dsize=$[dsize*chunk] + rasize=$[dsize*2*cnt] + # rasize is in sectors + if [ -n "$DEV_ROUND_K" ] + then + rasize=$[rasize/DEV_ROUND_K/2] + rasize=$[rasize*DEV_ROUND_K*2] + fi + [ `/sbin/blockdev --getsize $dev` -eq 0 ] && sleep 2 + _sz=`/sbin/blockdev --getsize $dev` + [ $rasize -lt $_sz -o $[rasize*4/5] -gt $_sz ] && + die "size is wrong for $dev: $cnt * $dvsize (chunk=$chunk) = $rasize, not $_sz" + return 0 +} + +rotest() { + dev=$1 + fsck -fn $dev >&2 +} diff --git a/tests/imsm-grow-template b/tests/imsm-grow-template new file mode 100644 index 0000000..1a8676e --- /dev/null +++ b/tests/imsm-grow-template @@ -0,0 +1,119 @@ + +# 0 - POSITIVE test, otherwise NEGATIVE test +negative_test=$1 + +# 0 - On-line Capacity Expansion test, otherwise LEVEL migration or CHUNK size migration test +migration_test=$2 + +function grow_member() { + local member=$1 + local disks=$2 + local comps=$3 + local level=$4 + local size=$5 + local offset=$6 + local chunk=$7 + local old_chunk=$8 + local array_size=$((comps * size)) + + rm -f $backup_imsm + if [ $chunk -eq $old_chunk ]; then + ( set -ex; mdadm --grow $member --level=$level ) + else + ( set -ex; mdadm --grow $member --chunk=$chunk ) 
+ fi + local status=$? + if [ $negative_test -ne 0 ]; then + if [ $status -eq 0 ]; then + echo >&2 "**Error**: $member: --grow should failed, but it completed successfuly" + exit 1 + fi + return + fi + check wait + sleep 5 + imsm_check member $member $disks $level $size $array_size $offset $chunk + testdev $member $comps $size $chunk +} + +# Create container +mdadm --create --run $container --auto=md --metadata=imsm --raid-disks=$num_disks $device_list +check wait +imsm_check container $num_disks + +# Create first volume inside the container +if [[ ! -z $vol0_chunk ]]; then + mdadm --create --run $member0 --auto=md --level=$vol0_level --size=$vol0_comp_size --chunk=$vol0_chunk --raid-disks=$num_disks $device_list +else + mdadm --create --run $member0 --auto=md --level=$vol0_level --size=$vol0_comp_size --raid-disks=$num_disks $device_list +fi +check wait + +# Create second volume inside the container (if defined) +if [ ! -z $vol1_level ]; then + if [ ! -z $vol1_chunk ]; then + mdadm --create --run $member1 --auto=md --level=$vol1_level --size=$vol1_comp_size --chunk=$vol1_chunk --raid-disks=$num_disks $device_list + else + mdadm --create --run $member1 --auto=md --level=$vol1_level --size=$vol1_comp_size --raid-disks=$num_disks $device_list + fi + check wait +fi + +# Wait for any RESYNC to complete +check wait + +# Test first volume +imsm_check member $member0 $num_disks $vol0_level $vol0_comp_size $((vol0_comp_size * vol0_num_comps)) $vol0_offset $vol0_chunk +testdev $member0 $vol0_num_comps $vol0_comp_size $vol0_chunk + +# Test second volume (if defined) +if [ ! -z $vol1_level ]; then + imsm_check member $member1 $num_disks $vol1_level $vol1_comp_size $((vol1_comp_size * vol1_num_comps)) $vol1_offset $vol1_chunk + testdev $member1 $vol1_num_comps $vol1_comp_size $vol1_chunk +fi + +# Add extra disks to container if operation requires spares in container. 
+for i in $spare_list +do + mdadm --add $container $i + check wait + num_disks=$((num_disks + 1)) +done + +imsm_check container $num_disks +num_disks=$((num_disks + add_to_num_disks)) +backup_imsm=/tmp/backup_imsm + +# Grow each member or a container depending on the type of an operation +if [ $migration_test -ne 0 ]; then + if [ -z $new_num_disks ]; then + new_num_disks=$num_disks + fi + grow_member $member0 $new_num_disks $vol0_new_num_comps $vol0_new_level $vol0_comp_size $vol0_offset $vol0_new_chunk $vol0_chunk + if [[ $vol1_new_chunk -ne 0 ]] ; then + grow_member $member1 $new_num_disks $vol1_new_num_comps $vol1_new_level $vol1_comp_size $vol1_offset $vol1_new_chunk $vol1_chunk + fi +else + rm -f $backup_imsm + ( set -x; mdadm --grow $container --raid-disks=$num_disks ) + grow_status=$? + if [ $negative_test -ne 0 ]; then + if [ $grow_status -eq 0 ]; then + echo >&2 "**Error**: $container: --grow should failed, but it completed successfuly" + exit 1 + fi + else + sleep 5 + check wait + sleep 5 + check wait + imsm_check member $member0 $num_disks $vol0_level $vol0_comp_size $((vol0_comp_size * vol0_new_num_comps)) $vol0_offset $vol0_chunk + testdev $member0 $vol0_new_num_comps $vol0_comp_size $vol0_chunk + if [ $vol1_new_num_comps -ne 0 ]; then + imsm_check member $member1 $num_disks $vol1_level $vol1_comp_size $((vol1_comp_size * vol1_new_num_comps)) $vol1_offset $vol1_chunk + testdev $member1 $vol1_new_num_comps $vol1_comp_size $vol1_chunk + fi + fi +fi + +exit 0 diff --git a/tests/utils b/tests/utils new file mode 100644 index 0000000..3acebd7 --- /dev/null +++ b/tests/utils @@ -0,0 +1,191 @@ +# set of functions used to test policy framework with assemble, incremental and Monitor + +set +e +#create links to be able to use domains +for d in 0 1 2 3 4 5 6 7 8 9 10 11 12 +do + eval ln -s \$dev$d /dev/disk/by-path/loop$d + eval d$d="loop$d" + eval mdadm --zero-superblock \$dev$d +done + +devices="/dev/loop[0-9] /dev/loop10 /dev/loop11 /dev/loop12" + +# on 
failure print out few things before exit +# uses testdsc and platform global variables +err(){ + echo >&2 "ERROR: $*" + cat $config >&2 || true + cat /proc/mdstat >&2 + [ -z "$testdsc" ] || { echo >&2 $platform: $testdsc "- failed"; } + ps -e | grep mdadm >&2 || true + if [ $listfailed == "yes" ]; then + [ "$verbose" != "yes" ] || echo ---FAILED--- + flist="$flist \n $platform $testdsc" + failed=1 + else + exit 1 + fi +} + +# set test description +dsc(){ + failed=0 + testdsc="$*" + [ "$verbose" != "yes" ] || echo $testdsc +} + +killmonitor(){ + [ -z "$monitorpid" ] || { kill -9 $monitorpid; unset monitorpid; } +} + +tidyup(){ + killmonitor + mdadm -Ss || true + mdadm -Ss + mdadm --zero-superblock $devices || true + udevadm settle + rm -f $config +} + +trap tidyup 0 1 2 3 15 + +# create a RAID 1 array or container and subarray(s) on 2 disks +# if platform not specified imsm is used +# if subsize is given, first subarray is created with given size and second one on remaining space +ccv(){ + # mddevno used to name created array + local mddevno="$1" + # numbers of devices to be used in array + local devno1="$2" + local devno2="$3" + local platform="$4" + local subsize="$5" + local onearray="$6" + [ -n "$platform" ] || platform="imsm" + if [ "$platform" == "imsm" ] || [ "$platform" == "ddf" ]; then + eval mdadm -CR /dev/md/con$mddevno -e $platform -n 2 \$dev$devno1 \$dev$devno2 + udevadm settle + [ -z "$subsize" ] || eval mdadm -CR sub$mddevno"_" -l 1 -n 2 /dev/md/con$mddevno -z $subsize + [ -n "$onearray" ] || eval mdadm -CR sub$mddevno -l 1 -n 2 /dev/md/con$mddevno + else + [ -z "$subsize" ] || sizepar="-z $subsize" + eval mdadm -CR arr$mddevno -e $platform -l 1 -n 2 \$dev$devno1 \$dev$devno2 $sizepar + unset sizepar + fi +} + +# get container and subarray using given device from mdstat +# sets global variables c and v +getarray(){ + local devname=`basename $1` + local platformtype=`grep -A 1 $devname /proc/mdstat | awk '/active/ {getline; print $4 }' | awk -F ":" 
'END {print $1}'` + c=`grep "inactive.*$devname" /proc/mdstat | awk -F " " '{print $1}'` + v=`grep " active.*$devname" /proc/mdstat | awk -F " " '{print $1}'` + [ "$platformtype" == "external" ] || c=$v +} + +# check if given device belongs to any container and subarray +# if $2 given then only container checked +chkarray(){ + local devname="$1" + local subcheck="$2" + getarray $devname + [ -n "$c" ] || err "$devname not in any container" + [ -n "$subcheck" ] || [ -n "$v" ] || err " $devname not in subarray" +} + +# test if two devices in the same container/subarray +# $1 $2 - devices +# $3 don't check subarrays, only containers +tst(){ + local device1=`basename $1` + local device2=`basename $2` + local subcheck="$3" + chkarray $device1 $subcheck + local x="$c" + local y="$v" + chkarray $device2 $subcheck + [ "$c" == "$x" ] || err "$device1 and $device2 not in the same container" + [ -n "$subcheck" ] || [ "$v" == "$y" ] || err "$device1 and $device2 not in the same subarray" +} + +# same as tst, just use numbers of devices instead of names as parameters +dtst(){ + local devno1="$1" + local devno2="$2" + local subcheck="$3" + eval tst \$dev$devno1 \$dev$devno2 $subcheck +} + +# create containers/subarrays, check if created properly, +# set global variables c$mddevno v$mddevno, usually c0=md127, v0=md126 , etc. 
+setupdevs(){ + local mddevno="$1" + local devno1="$2" + local devno2="$3" + local p="$4" + local subsize="$5" + local onearray="$6" + [ -n "$p" ] || p=$platform + ccv $mddevno $devno1 $devno2 $p $subsize $onearray + dtst $devno1 $devno2 + eval c$mddevno=\"$c\" + eval v$mddevno=\"$v\" +} + +# check if given spare in container +# usage: chkspare container spare [n] (n if spare shouldn't be in container) +chkspare(){ + local container=`basename $1` + local spare=$2 + local expected=$3 + getarray $spare + [ -n "$expected" ] || expected="y" + if [ "$expected" == "y" ]; then + [ "$c" == "$container" ] || err "$spare not in container $container" + else + [ "$c" != "$container" ] || err "$spare in container $container" + fi +} + +#check if spare was moved from one container to another +# args: from_container to_container spare [yn] +# n when spare should remain in original container +chksparemoved(){ + sleep $sleeptime + from_container="$1" + to_container="$2" + spare="$3" + expected="$4" + [ -n "$expected" ] || expected="y" + notexpected="n"; [ "$expected" == "y" ] || notexpected="y" + chkspare $from_container $spare $notexpected + [ $failed -eq 1 ] || chkspare $to_container $spare $expected +} + + +# for domains defined through policy +createconfig(){ +if [ "$1" != "a" ]; then +{ + domain=$1 + metadata=$2 + action=$3 + while [ -n "$4" ]; do + echo="policy domain=$domain" + [ "$metadata" == "noplatform" ] || echo="$echo metadata=$metadata" + echo="$echo path=loop$4" + echo="$echo action=$action" + echo "$echo" + shift + done +} >> $config +else +{ + echo "DEVICES $devlist /dev/md1*" + mdadm -Ebs +} > $config +fi +#[ "$verbose" != "yes" ] || cat $config | grep policy || true +} diff --git a/udev-md-clustered-confirm-device.rules b/udev-md-clustered-confirm-device.rules new file mode 100644 index 0000000..3e5381e --- /dev/null +++ b/udev-md-clustered-confirm-device.rules @@ -0,0 +1,21 @@ +# do not edit this file, it will be overwritten on update + +SUBSYSTEM!="block", 
GOTO="clustermd_end" + +# handle md arrays +KERNEL!="md*", GOTO="clustermd_end" +ENV{DEVTYPE}!="disk", GOTO="clustermd_end" +ACTION!="change", GOTO="clustermd_end" +ENV{EVENT}!="ADD_DEVICE", GOTO="clustermd_end" +ENV{DEVICE_UUID}!="?*", GOTO="clustermd_end" +ENV{RAID_DISK}!="?*", GOTO="clustermd_end" + +# Based on the received UUID, node confirms the device if +# it is found by blkid, otherwise the node reports it is +# missing. +PROGRAM="BINDIR/blkid -o device -t UUID_SUB=$env{DEVICE_UUID}", ENV{.md.newdevice} = "$result" + +ENV{.md.newdevice}!="", RUN+="BINDIR/mdadm --manage $env{DEVNAME} --cluster-confirm $env{RAID_DISK}:$env{.md.newdevice}" +ENV{.md.newdevice}=="", RUN+="BINDIR/mdadm --manage $env{DEVNAME} --cluster-confirm $env{RAID_DISK}:missing" + +LABEL="clustermd_end" diff --git a/udev-md-raid-arrays.rules b/udev-md-raid-arrays.rules new file mode 100644 index 0000000..13c9076 --- /dev/null +++ b/udev-md-raid-arrays.rules @@ -0,0 +1,44 @@ +# do not edit this file, it will be overwritten on update + +SUBSYSTEM!="block", GOTO="md_end" + +# handle md arrays +ACTION!="add|change", GOTO="md_end" +KERNEL!="md*", GOTO="md_end" + +# partitions have no md/{array_state,metadata_version}, but should not +# for that reason be ignored. +ENV{DEVTYPE}=="partition", GOTO="md_ignore_state" + +# container devices have a metadata version of e.g. 
'external:ddf' and +# never leave state 'inactive' +ATTR{md/metadata_version}=="external:[A-Za-z]*", ATTR{md/array_state}=="inactive", GOTO="md_ignore_state" +TEST!="md/array_state", ENV{SYSTEMD_READY}="0", GOTO="md_end" +ATTR{md/array_state}=="clear*|inactive", ENV{SYSTEMD_READY}="0", GOTO="md_end" +ATTR{md/sync_action}=="reshape", ENV{RESHAPE_ACTIVE}="yes" +LABEL="md_ignore_state" + +IMPORT{program}="BINDIR/mdadm --detail --no-devices --export $devnode" +ENV{DEVTYPE}=="disk", ENV{MD_NAME}=="?*", SYMLINK+="disk/by-id/md-name-$env{MD_NAME}", OPTIONS+="string_escape=replace" +ENV{DEVTYPE}=="disk", ENV{MD_UUID}=="?*", SYMLINK+="disk/by-id/md-uuid-$env{MD_UUID}" +ENV{DEVTYPE}=="disk", ENV{MD_DEVNAME}=="?*", SYMLINK+="md/$env{MD_DEVNAME}" +ENV{DEVTYPE}=="partition", ENV{MD_NAME}=="?*", SYMLINK+="disk/by-id/md-name-$env{MD_NAME}-part%n", OPTIONS+="string_escape=replace" +ENV{DEVTYPE}=="partition", ENV{MD_UUID}=="?*", SYMLINK+="disk/by-id/md-uuid-$env{MD_UUID}-part%n" +ENV{DEVTYPE}=="partition", ENV{MD_DEVNAME}=="*[^0-9]", SYMLINK+="md/$env{MD_DEVNAME}%n" +ENV{DEVTYPE}=="partition", ENV{MD_DEVNAME}=="*[0-9]", SYMLINK+="md/$env{MD_DEVNAME}p%n" + +IMPORT{builtin}="blkid" +OPTIONS+="link_priority=100" +OPTIONS+="watch" +ENV{ID_FS_USAGE}=="filesystem|other|crypto", ENV{ID_FS_UUID_ENC}=="?*", SYMLINK+="disk/by-uuid/$env{ID_FS_UUID_ENC}" +ENV{ID_FS_USAGE}=="filesystem|other", ENV{ID_PART_ENTRY_UUID}=="?*", SYMLINK+="disk/by-partuuid/$env{ID_PART_ENTRY_UUID}" +ENV{ID_FS_USAGE}=="filesystem|other", ENV{ID_FS_LABEL_ENC}=="?*", SYMLINK+="disk/by-label/$env{ID_FS_LABEL_ENC}" + +ENV{MD_LEVEL}=="raid[1-9]*", ENV{SYSTEMD_WANTS}+="mdmonitor.service" + +# Tell systemd to run mdmon for our container, if we need it. 
+ENV{MD_LEVEL}=="raid[1-9]*", ENV{MD_CONTAINER}=="?*", PROGRAM="/usr/bin/readlink $env{MD_CONTAINER}", ENV{MD_MON_THIS}="%c" +ENV{MD_MON_THIS}=="?*", PROGRAM="/usr/bin/basename $env{MD_MON_THIS}", ENV{SYSTEMD_WANTS}+="mdmon@%c.service" +ENV{RESHAPE_ACTIVE}=="yes", PROGRAM="/usr/bin/basename $env{MD_MON_THIS}", ENV{SYSTEMD_WANTS}+="mdadm-grow-continue@%c.service" + +LABEL="md_end" diff --git a/udev-md-raid-assembly.rules b/udev-md-raid-assembly.rules new file mode 100644 index 0000000..d668cdd --- /dev/null +++ b/udev-md-raid-assembly.rules @@ -0,0 +1,38 @@ +# do not edit this file, it will be overwritten on update + +# Don't process any events if anaconda is running as anaconda brings up +# raid devices manually +ENV{ANACONDA}=="?*", GOTO="md_inc_end" +# assemble md arrays + +SUBSYSTEM!="block", GOTO="md_inc_end" + +# skip non-initialized devices +ENV{SYSTEMD_READY}=="0", GOTO="md_inc_end" + +# handle potential components of arrays (the ones supported by md) +ENV{ID_FS_TYPE}=="linux_raid_member", GOTO="md_inc" + +# "noiswmd" on kernel command line stops mdadm from handling +# "isw" (aka IMSM - Intel RAID). +# "nodmraid" on kernel command line stops mdadm from handling +# "isw" or "ddf". 
+IMPORT{cmdline}="noiswmd" +IMPORT{cmdline}="nodmraid" + +ENV{nodmraid}=="?*", GOTO="md_inc_end" +ENV{ID_FS_TYPE}=="ddf_raid_member", GOTO="md_inc" +ENV{noiswmd}=="?*", GOTO="md_inc_end" +ENV{ID_FS_TYPE}=="isw_raid_member", ACTION!="change", GOTO="md_inc" +GOTO="md_inc_end" + +LABEL="md_inc" + +# remember you can limit what gets auto/incrementally assembled by +# mdadm.conf(5)'s 'AUTO' and selectively whitelist using 'ARRAY' +ACTION=="add|change", IMPORT{program}="BINDIR/mdadm --incremental --export $devnode --offroot $env{DEVLINKS}" +ACTION=="add|change", ENV{MD_STARTED}=="*unsafe*", ENV{MD_FOREIGN}=="no", ENV{SYSTEMD_WANTS}+="mdadm-last-resort@$env{MD_DEVICE}.timer" +ACTION=="remove", ENV{ID_PATH}=="?*", RUN+="BINDIR/mdadm -If $name --path $env{ID_PATH}" +ACTION=="remove", ENV{ID_PATH}!="?*", RUN+="BINDIR/mdadm -If $name" + +LABEL="md_inc_end" diff --git a/udev-md-raid-creating.rules b/udev-md-raid-creating.rules new file mode 100644 index 0000000..9bef8d1 --- /dev/null +++ b/udev-md-raid-creating.rules @@ -0,0 +1,7 @@ +# do not edit this file, it will be overwritten on update +# While mdadm is creating an array, it creates a file +# /run/mdadm/creating-mdXXX. If that file exists, then +# the array is not "ready" and we should make sure the +# content is ignored. + +KERNEL=="md*", TEST=="/run/mdadm/creating-$kernel", ENV{SYSTEMD_READY}="0" diff --git a/udev-md-raid-safe-timeouts.rules b/udev-md-raid-safe-timeouts.rules new file mode 100644 index 0000000..12bdcaa --- /dev/null +++ b/udev-md-raid-safe-timeouts.rules @@ -0,0 +1,61 @@ +# Copyright (C) 2017 by Jonathan G. Underwood +# This file is part of mdraid-safe-timeouts. +# +# mdraid-safe-timeouts is free software: you can redistribute it +# and/or modify it under the terms of the GNU General Public License +# as published by the Free Software Foundation, either version 3 of +# the License, or (at your option) any later version. 
+# +# Foobar is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with mdraid-safe-timeouts. If not, see +# <https://www.gnu.org/licenses/>. + +# This file causes block devices with Linux RAID (mdadm) signatures to +# attempt to set safe timeouts for the drives involved +# See udev(8) for syntax + +# Don't process any events if anaconda is running as anaconda brings up +# raid devices manually +ENV{ANACONDA}=="?*", GOTO="md_timeouts_end" + +SUBSYSTEM!="block|machinecheck", GOTO="md_timeouts_end" + +# "noiswmd" on kernel command line stops mdadm from handling +# "isw" (aka IMSM - Intel RAID). +# "nodmraid" on kernel command line stops mdadm from handling +# "isw" or "ddf". +IMPORT{cmdline}="nodmraid" +ENV{nodmraid}=="?*", GOTO="md_timeouts_end" +IMPORT{cmdline}="noiswmd" +ENV{noiswmd}=="?*", GOTO="md_timeouts_end" + +# Set controller timeout for parent disk of each partition if the +# partition is a mdraid partition of higher than raid 0, and the disk +# doesn't have scterc turned on (i.e. if it's disabled or the disk +# doesn't support it). We determine if the disk has SCTERC turned on +# by examining the output of smartctl and seeing if it contains the +# word "seconds". If the word "seconds" is found we take this to imply +# STCERC is turned on, and take no action. Otherwise we set the drive +# controller timeout to 180 seconds. It would be better to check the +# exit status code of smartctl rather than grepping for "seconds", but +# it's not clear what that will be in the three cases (supported and +# turned on, supported but disabled, not supported). 
+ +ENV{DEVTYPE}!="partition", GOTO="md_timeouts_end" + +IMPORT{program}="/sbin/mdadm --examine --export $devnode" + +ACTION=="add|change", \ + ENV{ID_FS_TYPE}=="linux_raid_member", \ + ENV{MD_LEVEL}=="raid[1-9]*", \ + TEST=="/sys/block/$parent/device/timeout", \ + TEST=="/usr/sbin/smartctl", \ + PROGRAM!="/bin/sh -c '/usr/sbin/smartctl -l scterc /dev/$parent | grep -q seconds && exit 0 || exit 1'", \ + RUN+="/bin/sh -c 'echo 180 > /sys/block/$parent/device/timeout && /usr/bin/logger timeout for /dev/$parent set to 180 secs'" + +LABEL="md_timeouts_end" @@ -0,0 +1,2378 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2001-2013 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +#include "mdadm.h" +#include "md_p.h" +#include <sys/socket.h> +#include <sys/utsname.h> +#include <sys/wait.h> +#include <sys/un.h> +#include <sys/resource.h> +#include <sys/vfs.h> +#include <sys/mman.h> +#include <linux/magic.h> +#include <poll.h> +#include <ctype.h> +#include <dirent.h> +#include <signal.h> +#include <dlfcn.h> + + +/* + * following taken from linux/blkpg.h because they aren't + * anywhere else and it isn't safe to #include linux/ * stuff. 
+ */ + +#define BLKPG _IO(0x12,105) + +/* The argument structure */ +struct blkpg_ioctl_arg { + int op; + int flags; + int datalen; + void *data; +}; + +/* The subfunctions (for the op field) */ +#define BLKPG_ADD_PARTITION 1 +#define BLKPG_DEL_PARTITION 2 + +/* Sizes of name fields. Unused at present. */ +#define BLKPG_DEVNAMELTH 64 +#define BLKPG_VOLNAMELTH 64 + +/* The data structure for ADD_PARTITION and DEL_PARTITION */ +struct blkpg_partition { + long long start; /* starting offset in bytes */ + long long length; /* length in bytes */ + int pno; /* partition number */ + char devname[BLKPG_DEVNAMELTH]; /* partition name, like sda5 or c0d1p2, + to be used in kernel messages */ + char volname[BLKPG_VOLNAMELTH]; /* volume label */ +}; + +#include "part.h" + +/* Force a compilation error if condition is true */ +#define BUILD_BUG_ON(condition) ((void)BUILD_BUG_ON_ZERO(condition)) + +/* Force a compilation error if condition is true, but also produce a + result (of value 0 and type size_t), so the expression can be used + e.g. in a structure initializer (or where-ever else comma expressions + aren't permitted). */ +#define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); })) + +static int is_dlm_hooks_ready = 0; + +int dlm_funs_ready(void) +{ + return is_dlm_hooks_ready ? 
1 : 0; +} + +static struct dlm_hooks *dlm_hooks = NULL; +struct dlm_lock_resource *dlm_lock_res = NULL; +static int ast_called = 0; + +struct dlm_lock_resource { + dlm_lshandle_t *ls; + struct dlm_lksb lksb; +}; + +/* Using poll(2) to wait for and dispatch ASTs */ +static int poll_for_ast(dlm_lshandle_t ls) +{ + struct pollfd pfd; + + pfd.fd = dlm_hooks->ls_get_fd(ls); + pfd.events = POLLIN; + + while (!ast_called) + { + if (poll(&pfd, 1, 0) < 0) + { + perror("poll"); + return -1; + } + dlm_hooks->dispatch(dlm_hooks->ls_get_fd(ls)); + } + ast_called = 0; + + return 0; +} + +static void dlm_ast(void *arg) +{ + ast_called = 1; +} + +static char *cluster_name = NULL; +/* Create the lockspace, take bitmapXXX locks on all the bitmaps. */ +int cluster_get_dlmlock(void) +{ + int ret = -1; + char str[64]; + int flags = LKF_NOQUEUE; + int retry_count = 0; + + if (!dlm_funs_ready()) { + pr_err("Something wrong with dlm library\n"); + return -1; + } + + ret = get_cluster_name(&cluster_name); + if (ret) { + pr_err("The md can't get cluster name\n"); + return -1; + } + + dlm_lock_res = xmalloc(sizeof(struct dlm_lock_resource)); + dlm_lock_res->ls = dlm_hooks->open_lockspace(cluster_name); + if (!dlm_lock_res->ls) { + dlm_lock_res->ls = dlm_hooks->create_lockspace(cluster_name, O_RDWR); + if (!dlm_lock_res->ls) { + pr_err("%s failed to create lockspace\n", cluster_name); + return -ENOMEM; + } + } else { + pr_err("open existed %s lockspace\n", cluster_name); + } + + snprintf(str, 64, "bitmap%s", cluster_name); +retry: + ret = dlm_hooks->ls_lock(dlm_lock_res->ls, LKM_PWMODE, + &dlm_lock_res->lksb, flags, str, strlen(str), + 0, dlm_ast, dlm_lock_res, NULL, NULL); + if (ret) { + pr_err("error %d when get PW mode on lock %s\n", errno, str); + /* let's try several times if EAGAIN happened */ + if (dlm_lock_res->lksb.sb_status == EAGAIN && retry_count < 10) { + sleep(10); + retry_count++; + goto retry; + } + dlm_hooks->release_lockspace(cluster_name, dlm_lock_res->ls, 1); + return ret; 
+ } + + /* Wait for it to complete */ + poll_for_ast(dlm_lock_res->ls); + + if (dlm_lock_res->lksb.sb_status) { + pr_err("failed to lock cluster\n"); + return -1; + } + return 1; +} + +int cluster_release_dlmlock(void) +{ + int ret = -1; + + if (!cluster_name) + goto out; + + if (!dlm_lock_res->lksb.sb_lkid) + goto out; + + ret = dlm_hooks->ls_unlock_wait(dlm_lock_res->ls, + dlm_lock_res->lksb.sb_lkid, 0, + &dlm_lock_res->lksb); + if (ret) { + pr_err("error %d happened when unlock\n", errno); + /* XXX make sure the lock is unlocked eventually */ + goto out; + } + + /* Wait for it to complete */ + poll_for_ast(dlm_lock_res->ls); + + errno = dlm_lock_res->lksb.sb_status; + if (errno != EUNLOCK) { + pr_err("error %d happened in ast when unlock lockspace\n", + errno); + /* XXX make sure the lockspace is unlocked eventually */ + goto out; + } + + ret = dlm_hooks->release_lockspace(cluster_name, dlm_lock_res->ls, 1); + if (ret) { + pr_err("error %d happened when release lockspace\n", errno); + /* XXX make sure the lockspace is released eventually */ + goto out; + } + free(dlm_lock_res); + +out: + return ret; +} + +int md_array_valid(int fd) +{ + struct mdinfo *sra; + int ret; + + sra = sysfs_read(fd, NULL, GET_ARRAY_STATE); + if (sra) { + if (sra->array_state != ARRAY_UNKNOWN_STATE) + ret = 0; + else + ret = -ENODEV; + + free(sra); + } else { + /* + * GET_ARRAY_INFO doesn't provide access to the proper state + * information, so fallback to a basic check for raid_disks != 0 + */ + ret = ioctl(fd, RAID_VERSION); + } + + return !ret; +} + +int md_array_active(int fd) +{ + struct mdinfo *sra; + struct mdu_array_info_s array; + int ret = 0; + + sra = sysfs_read(fd, NULL, GET_ARRAY_STATE); + if (sra) { + if (!md_array_is_active(sra)) + ret = -ENODEV; + + free(sra); + } else { + /* + * GET_ARRAY_INFO doesn't provide access to the proper state + * information, so fallback to a basic check for raid_disks != 0 + */ + ret = ioctl(fd, GET_ARRAY_INFO, &array); + } + + return !ret; +} 
+ +int md_array_is_active(struct mdinfo *info) +{ + return (info->array_state != ARRAY_CLEAR && + info->array_state != ARRAY_INACTIVE && + info->array_state != ARRAY_UNKNOWN_STATE); +} + +/* + * Get array info from the kernel. Longer term we want to deprecate the + * ioctl and get it from sysfs. + */ +int md_get_array_info(int fd, struct mdu_array_info_s *array) +{ + return ioctl(fd, GET_ARRAY_INFO, array); +} + +/* + * Set array info + */ +int md_set_array_info(int fd, struct mdu_array_info_s *array) +{ + return ioctl(fd, SET_ARRAY_INFO, array); +} + +/* + * Get disk info from the kernel. + */ +int md_get_disk_info(int fd, struct mdu_disk_info_s *disk) +{ + return ioctl(fd, GET_DISK_INFO, disk); +} + +int get_linux_version() +{ + struct utsname name; + char *cp; + int a = 0, b = 0,c = 0; + if (uname(&name) <0) + return -1; + + cp = name.release; + a = strtoul(cp, &cp, 10); + if (*cp == '.') + b = strtoul(cp+1, &cp, 10); + if (*cp == '.') + c = strtoul(cp+1, &cp, 10); + + return (a*1000000)+(b*1000)+c; +} + +int mdadm_version(char *version) +{ + int a, b, c; + char *cp; + + if (!version) + version = Version; + + cp = strchr(version, '-'); + if (!cp || *(cp+1) != ' ' || *(cp+2) != 'v') + return -1; + cp += 3; + a = strtoul(cp, &cp, 10); + if (*cp != '.') + return -1; + b = strtoul(cp+1, &cp, 10); + if (*cp == '.') + c = strtoul(cp+1, &cp, 10); + else + c = 0; + if (*cp != ' ' && *cp != '-') + return -1; + return (a*1000000)+(b*1000)+c; +} + +unsigned long long parse_size(char *size) +{ + /* parse 'size' which should be a number optionally + * followed by 'K', 'M'. 'G' or 'T'. + * Without a suffix, K is assumed. + * Number returned is in sectors (half-K) + * INVALID_SECTORS returned on error. 
+ */ + char *c; + long long s = strtoll(size, &c, 10); + if (s > 0) { + switch (*c) { + case 'K': + c++; + default: + s *= 2; + break; + case 'M': + c++; + s *= 1024 * 2; + break; + case 'G': + c++; + s *= 1024 * 1024 * 2; + break; + case 'T': + c++; + s *= 1024 * 1024 * 1024 * 2LL; + break; + case 's': /* sectors */ + c++; + break; + } + } else + s = INVALID_SECTORS; + if (*c) + s = INVALID_SECTORS; + return s; +} + +int is_near_layout_10(int layout) +{ + int fc, fo; + + fc = (layout >> 8) & 255; + fo = layout & (1 << 16); + if (fc > 1 || fo > 0) + return 0; + return 1; +} + +int parse_layout_10(char *layout) +{ + int copies, rv; + char *cp; + /* Parse the layout string for raid10 */ + /* 'f', 'o' or 'n' followed by a number <= raid_disks */ + if ((layout[0] != 'n' && layout[0] != 'f' && layout[0] != 'o') || + (copies = strtoul(layout+1, &cp, 10)) < 1 || + copies > 200 || + *cp) + return -1; + if (layout[0] == 'n') + rv = 256 + copies; + else if (layout[0] == 'o') + rv = 0x10000 + (copies<<8) + 1; + else + rv = 1 + (copies<<8); + return rv; +} + +int parse_layout_faulty(char *layout) +{ + if (!layout) + return -1; + /* Parse the layout string for 'faulty' */ + int ln = strcspn(layout, "0123456789"); + char *m = xstrdup(layout); + int mode; + m[ln] = 0; + mode = map_name(faultylayout, m); + if (mode == UnSet) + return -1; + + return mode | (atoi(layout+ln)<< ModeShift); +} + +int parse_cluster_confirm_arg(char *input, char **devname, int *slot) +{ + char *dev; + *slot = strtoul(input, &dev, 10); + if (dev == input || dev[0] != ':') + return -1; + *devname = dev+1; + return 0; +} + +void remove_partitions(int fd) +{ + /* remove partitions from this block devices. 
+ * This is used for components added to an array + */ +#ifdef BLKPG_DEL_PARTITION + struct blkpg_ioctl_arg a; + struct blkpg_partition p; + + a.op = BLKPG_DEL_PARTITION; + a.data = (void*)&p; + a.datalen = sizeof(p); + a.flags = 0; + memset(a.data, 0, a.datalen); + for (p.pno = 0; p.pno < 16; p.pno++) + ioctl(fd, BLKPG, &a); +#endif +} + +int test_partition(int fd) +{ + /* Check if fd is a whole-disk or a partition. + * BLKPG will return EINVAL on a partition, and BLKPG_DEL_PARTITION + * will return ENXIO on an invalid partition number. + */ + struct blkpg_ioctl_arg a; + struct blkpg_partition p; + a.op = BLKPG_DEL_PARTITION; + a.data = (void*)&p; + a.datalen = sizeof(p); + a.flags = 0; + memset(a.data, 0, a.datalen); + p.pno = 1<<30; + if (ioctl(fd, BLKPG, &a) == 0) + /* Very unlikely, but not a partition */ + return 0; + if (errno == ENXIO || errno == ENOTTY) + /* not a partition */ + return 0; + + return 1; +} + +int test_partition_from_id(dev_t id) +{ + char buf[20]; + int fd, rv; + + sprintf(buf, "%d:%d", major(id), minor(id)); + fd = dev_open(buf, O_RDONLY); + if (fd < 0) + return -1; + rv = test_partition(fd); + close(fd); + return rv; +} + +int enough(int level, int raid_disks, int layout, int clean, char *avail) +{ + int copies, first; + int i; + int avail_disks = 0; + + for (i = 0; i < raid_disks; i++) + avail_disks += !!avail[i]; + + switch (level) { + case 10: + /* This is the tricky one - we need to check + * which actual disks are present. 
+ */ + copies = (layout&255)* ((layout>>8) & 255); + first = 0; + do { + /* there must be one of the 'copies' form 'first' */ + int n = copies; + int cnt = 0; + int this = first; + while (n--) { + if (avail[this]) + cnt++; + this = (this+1) % raid_disks; + } + if (cnt == 0) + return 0; + first = (first+(layout&255)) % raid_disks; + } while (first != 0); + return 1; + + case LEVEL_MULTIPATH: + return avail_disks>= 1; + case LEVEL_LINEAR: + case 0: + return avail_disks == raid_disks; + case 1: + return avail_disks >= 1; + case 4: + if (avail_disks == raid_disks - 1 && + !avail[raid_disks - 1]) + /* If just the parity device is missing, then we + * have enough, even if not clean + */ + return 1; + /* FALL THROUGH */ + case 5: + if (clean) + return avail_disks >= raid_disks-1; + else + return avail_disks >= raid_disks; + case 6: + if (clean) + return avail_disks >= raid_disks-2; + else + return avail_disks >= raid_disks; + default: + return 0; + } +} + +char *__fname_from_uuid(int id[4], int swap, char *buf, char sep) +{ + int i, j; + char uuid[16]; + char *c = buf; + strcpy(c, "UUID-"); + c += strlen(c); + copy_uuid(uuid, id, swap); + for (i = 0; i < 4; i++) { + if (i) + *c++ = sep; + for (j = 3; j >= 0; j--) { + sprintf(c,"%02x", (unsigned char) uuid[j+4*i]); + c+= 2; + } + } + return buf; + +} + +char *fname_from_uuid(struct supertype *st, struct mdinfo *info, + char *buf, char sep) +{ + // dirty hack to work around an issue with super1 superblocks... + // super1 superblocks need swapuuid set in order for assembly to + // work, but can't have it set if we want this printout to match + // all the other uuid printouts in super1.c, so we force swapuuid + // to 1 to make our printout match the rest of super1 +#if __BYTE_ORDER == BIG_ENDIAN + return __fname_from_uuid(info->uuid, 1, buf, sep); +#else + return __fname_from_uuid(info->uuid, (st->ss == &super1) ? 
1 : + st->ss->swapuuid, buf, sep); +#endif +} + +int check_ext2(int fd, char *name) +{ + /* + * Check for an ext2fs file system. + * Superblock is always 1K at 1K offset + * + * s_magic is le16 at 56 == 0xEF53 + * report mtime - le32 at 44 + * blocks - le32 at 4 + * logblksize - le32 at 24 + */ + unsigned char sb[1024]; + time_t mtime; + unsigned long long size; + int bsize; + if (lseek(fd, 1024,0)!= 1024) + return 0; + if (read(fd, sb, 1024)!= 1024) + return 0; + if (sb[56] != 0x53 || sb[57] != 0xef) + return 0; + + mtime = sb[44]|(sb[45]|(sb[46]|sb[47]<<8)<<8)<<8; + bsize = sb[24]|(sb[25]|(sb[26]|sb[27]<<8)<<8)<<8; + size = sb[4]|(sb[5]|(sb[6]|sb[7]<<8)<<8)<<8; + size <<= bsize; + pr_err("%s appears to contain an ext2fs file system\n", + name); + cont_err("size=%lluK mtime=%s", size, ctime(&mtime)); + return 1; +} + +int check_reiser(int fd, char *name) +{ + /* + * superblock is at 64K + * size is 1024; + * Magic string "ReIsErFs" or "ReIsEr2Fs" at 52 + * + */ + unsigned char sb[1024]; + unsigned long long size; + if (lseek(fd, 64*1024, 0) != 64*1024) + return 0; + if (read(fd, sb, 1024) != 1024) + return 0; + if (strncmp((char*)sb+52, "ReIsErFs",8) != 0 && + strncmp((char*)sb+52, "ReIsEr2Fs",9) != 0) + return 0; + pr_err("%s appears to contain a reiserfs file system\n",name); + size = sb[0]|(sb[1]|(sb[2]|sb[3]<<8)<<8)<<8; + cont_err("size = %lluK\n", size*4); + + return 1; +} + +int check_raid(int fd, char *name) +{ + struct mdinfo info; + time_t crtime; + char *level; + struct supertype *st = guess_super(fd); + + if (!st) + return 0; + if (st->ss->add_to_super != NULL) { + st->ss->load_super(st, fd, name); + /* Looks like a raid array .. 
*/ + pr_err("%s appears to be part of a raid array:\n", name); + st->ss->getinfo_super(st, &info, NULL); + st->ss->free_super(st); + crtime = info.array.ctime; + level = map_num(pers, info.array.level); + if (!level) + level = "-unknown-"; + cont_err("level=%s devices=%d ctime=%s", + level, info.array.raid_disks, ctime(&crtime)); + } else { + /* Looks like GPT or MBR */ + pr_err("partition table exists on %s\n", name); + } + return 1; +} + +int fstat_is_blkdev(int fd, char *devname, dev_t *rdev) +{ + struct stat stb; + + if (fstat(fd, &stb) != 0) { + pr_err("fstat failed for %s: %s\n", devname, strerror(errno)); + return 0; + } + if ((S_IFMT & stb.st_mode) != S_IFBLK) { + pr_err("%s is not a block device.\n", devname); + return 0; + } + if (rdev) + *rdev = stb.st_rdev; + return 1; +} + +int stat_is_blkdev(char *devname, dev_t *rdev) +{ + struct stat stb; + + if (stat(devname, &stb) != 0) { + pr_err("stat failed for %s: %s\n", devname, strerror(errno)); + return 0; + } + if ((S_IFMT & stb.st_mode) != S_IFBLK) { + pr_err("%s is not a block device.\n", devname); + return 0; + } + if (rdev) + *rdev = stb.st_rdev; + return 1; +} + +int ask(char *mesg) +{ + char *add = ""; + int i; + for (i = 0; i < 5; i++) { + char buf[100]; + fprintf(stderr, "%s%s", mesg, add); + fflush(stderr); + if (fgets(buf, 100, stdin)==NULL) + return 0; + if (buf[0]=='y' || buf[0]=='Y') + return 1; + if (buf[0]=='n' || buf[0]=='N') + return 0; + add = "(y/n) "; + } + pr_err("assuming 'no'\n"); + return 0; +} + +int is_standard(char *dev, int *nump) +{ + /* tests if dev is a "standard" md dev name. + * i.e if the last component is "/dNN" or "/mdNN", + * where NN is a string of digits + * Returns 1 if a partitionable standard, + * -1 if non-partitonable, + * 0 if not a standard name. 
+ */ + char *d = strrchr(dev, '/'); + int type = 0; + int num; + if (!d) + return 0; + if (strncmp(d, "/d",2) == 0) + d += 2, type = 1; /* /dev/md/dN{pM} */ + else if (strncmp(d, "/md_d", 5) == 0) + d += 5, type = 1; /* /dev/md_dN{pM} */ + else if (strncmp(d, "/md", 3) == 0) + d += 3, type = -1; /* /dev/mdN */ + else if (d-dev > 3 && strncmp(d-2, "md/", 3) == 0) + d += 1, type = -1; /* /dev/md/N */ + else + return 0; + if (!*d) + return 0; + num = atoi(d); + while (isdigit(*d)) + d++; + if (*d) + return 0; + if (nump) *nump = num; + + return type; +} + +unsigned long calc_csum(void *super, int bytes) +{ + unsigned long long newcsum = 0; + int i; + unsigned int csum; + unsigned int *superc = (unsigned int*) super; + + for(i = 0; i < bytes/4; i++) + newcsum += superc[i]; + csum = (newcsum& 0xffffffff) + (newcsum>>32); +#ifdef __alpha__ +/* The in-kernel checksum calculation is always 16bit on + * the alpha, though it is 32 bit on i386... + * I wonder what it is elsewhere... (it uses an API in + * a way that it shouldn't). + */ + csum = (csum & 0xffff) + (csum >> 16); + csum = (csum & 0xffff) + (csum >> 16); +#endif + return csum; +} + +char *human_size(long long bytes) +{ + static char buf[47]; + + /* We convert bytes to either centi-M{ega,ibi}bytes, + * centi-G{igi,ibi}bytes or centi-T{era,ebi}bytes + * with appropriate rounding, and then print + * 1/100th of those as a decimal. + * We allow upto 2048Megabytes before converting to + * gigabytes and 2048Gigabytes before converting to + * terabytes, as that shows more precision and isn't + * too large a number. 
+ */ + + if (bytes < 5000*1024) + buf[0] = 0; + else if (bytes < 2*1024LL*1024LL*1024LL) { + long cMiB = (bytes * 200LL / (1LL<<20) + 1) / 2; + long cMB = (bytes / ( 1000000LL / 200LL ) +1) /2; + snprintf(buf, sizeof(buf), " (%ld.%02ld MiB %ld.%02ld MB)", + cMiB/100, cMiB % 100, cMB/100, cMB % 100); + } else if (bytes < 2*1024LL*1024LL*1024LL*1024LL) { + long cGiB = (bytes * 200LL / (1LL<<30) +1) / 2; + long cGB = (bytes / (1000000000LL/200LL ) +1) /2; + snprintf(buf, sizeof(buf), " (%ld.%02ld GiB %ld.%02ld GB)", + cGiB/100, cGiB % 100, cGB/100, cGB % 100); + } else { + long cTiB = (bytes * 200LL / (1LL<<40) + 1) / 2; + long cTB = (bytes / (1000000000000LL / 200LL) + 1) / 2; + snprintf(buf, sizeof(buf), " (%ld.%02ld TiB %ld.%02ld TB)", + cTiB/100, cTiB % 100, cTB/100, cTB % 100); + } + return buf; +} + +char *human_size_brief(long long bytes, int prefix) +{ + static char buf[30]; + + /* We convert bytes to either centi-M{ega,ibi}bytes, + * centi-G{igi,ibi}bytes or centi-T{era,ebi}bytes + * with appropriate rounding, and then print + * 1/100th of those as a decimal. + * We allow upto 2048Megabytes before converting to + * gigabytes and 2048Gigabytes before converting to + * terabytes, as that shows more precision and isn't + * too large a number. + * + * If prefix == IEC, we mean prefixes like kibi,mebi,gibi etc. + * If prefix == JEDEC, we mean prefixes like kilo,mega,giga etc. 
+ */ + + if (bytes < 5000*1024) + buf[0] = 0; + else if (prefix == IEC) { + if (bytes < 2*1024LL*1024LL*1024LL) { + long cMiB = (bytes * 200LL / (1LL<<20) +1) /2; + snprintf(buf, sizeof(buf), "%ld.%02ldMiB", + cMiB/100, cMiB % 100); + } else if (bytes < 2*1024LL*1024LL*1024LL*1024LL) { + long cGiB = (bytes * 200LL / (1LL<<30) +1) /2; + snprintf(buf, sizeof(buf), "%ld.%02ldGiB", + cGiB/100, cGiB % 100); + } else { + long cTiB = (bytes * 200LL / (1LL<<40) + 1) / 2; + snprintf(buf, sizeof(buf), "%ld.%02ldTiB", + cTiB/100, cTiB % 100); + } + } + else if (prefix == JEDEC) { + if (bytes < 2*1024LL*1024LL*1024LL) { + long cMB = (bytes / ( 1000000LL / 200LL ) +1) /2; + snprintf(buf, sizeof(buf), "%ld.%02ldMB", + cMB/100, cMB % 100); + } else if (bytes < 2*1024LL*1024LL*1024LL*1024LL) { + long cGB = (bytes / (1000000000LL/200LL ) +1) /2; + snprintf(buf, sizeof(buf), "%ld.%02ldGB", + cGB/100, cGB % 100); + } else { + long cTB = (bytes / (1000000000000LL / 200LL) + 1) / 2; + snprintf(buf, sizeof(buf), "%ld.%02ldTB", + cTB/100, cTB % 100); + } + } + else + buf[0] = 0; + + return buf; +} + +void print_r10_layout(int layout) +{ + int near = layout & 255; + int far = (layout >> 8) & 255; + int offset = (layout&0x10000); + char *sep = ""; + + if (near != 1) { + printf("%s near=%d", sep, near); + sep = ","; + } + if (far != 1) + printf("%s %s=%d", sep, offset?"offset":"far", far); + if (near*far == 1) + printf("NO REDUNDANCY"); +} + +unsigned long long calc_array_size(int level, int raid_disks, int layout, + int chunksize, unsigned long long devsize) +{ + if (level == 1) + return devsize; + devsize &= ~(unsigned long long)((chunksize>>9)-1); + return get_data_disks(level, layout, raid_disks) * devsize; +} + +int get_data_disks(int level, int layout, int raid_disks) +{ + int data_disks = 0; + switch (level) { + case 0: data_disks = raid_disks; + break; + case 1: data_disks = 1; + break; + case 4: + case 5: data_disks = raid_disks - 1; + break; + case 6: data_disks = raid_disks - 2; 
+ break; + case 10: data_disks = raid_disks / (layout & 255) / ((layout>>8)&255); + break; + } + + return data_disks; +} + +dev_t devnm2devid(char *devnm) +{ + /* First look in /sys/block/$DEVNM/dev for %d:%d + * If that fails, try parsing out a number + */ + char path[PATH_MAX]; + char *ep; + int fd; + int mjr,mnr; + + snprintf(path, sizeof(path), "/sys/block/%s/dev", devnm); + fd = open(path, O_RDONLY); + if (fd >= 0) { + char buf[20]; + int n = read(fd, buf, sizeof(buf)); + close(fd); + if (n > 0) + buf[n] = 0; + if (n > 0 && sscanf(buf, "%d:%d\n", &mjr, &mnr) == 2) + return makedev(mjr, mnr); + } + if (strncmp(devnm, "md_d", 4) == 0 && + isdigit(devnm[4]) && + (mnr = strtoul(devnm+4, &ep, 10)) >= 0 && + ep > devnm && *ep == 0) + return makedev(get_mdp_major(), mnr << MdpMinorShift); + + if (strncmp(devnm, "md", 2) == 0 && + isdigit(devnm[2]) && + (mnr = strtoul(devnm+2, &ep, 10)) >= 0 && + ep > devnm && *ep == 0) + return makedev(MD_MAJOR, mnr); + + return 0; +} + +char *get_md_name(char *devnm) +{ + /* find /dev/md%d or /dev/md/%d or make a device /dev/.tmp.md%d */ + /* if dev < 0, want /dev/md/d%d or find mdp in /proc/devices ... 
*/ + + static char devname[50]; + struct stat stb; + dev_t rdev = devnm2devid(devnm); + char *dn; + + if (rdev == 0) + return 0; + if (strncmp(devnm, "md_", 3) == 0) { + snprintf(devname, sizeof(devname), "/dev/md/%s", + devnm + 3); + if (stat(devname, &stb) == 0 && + (S_IFMT&stb.st_mode) == S_IFBLK && (stb.st_rdev == rdev)) + return devname; + } + snprintf(devname, sizeof(devname), "/dev/%s", devnm); + if (stat(devname, &stb) == 0 && (S_IFMT&stb.st_mode) == S_IFBLK && + (stb.st_rdev == rdev)) + return devname; + + snprintf(devname, sizeof(devname), "/dev/md/%s", devnm+2); + if (stat(devname, &stb) == 0 && (S_IFMT&stb.st_mode) == S_IFBLK && + (stb.st_rdev == rdev)) + return devname; + + dn = map_dev(major(rdev), minor(rdev), 0); + if (dn) + return dn; + snprintf(devname, sizeof(devname), "/dev/.tmp.%s", devnm); + if (mknod(devname, S_IFBLK | 0600, rdev) == -1) + if (errno != EEXIST) + return NULL; + + if (stat(devname, &stb) == 0 && (S_IFMT&stb.st_mode) == S_IFBLK && + (stb.st_rdev == rdev)) + return devname; + unlink(devname); + return NULL; +} + +void put_md_name(char *name) +{ + if (strncmp(name, "/dev/.tmp.md", 12) == 0) + unlink(name); +} + +int get_maj_min(char *dev, int *major, int *minor) +{ + char *e; + *major = strtoul(dev, &e, 0); + return (e > dev && *e == ':' && e[1] && + (*minor = strtoul(e+1, &e, 0)) >= 0 && + *e == 0); +} + +int dev_open(char *dev, int flags) +{ + /* like 'open', but if 'dev' matches %d:%d, create a temp + * block device and open that + */ + int fd = -1; + char devname[32]; + int major; + int minor; + + if (!dev) + return -1; + flags |= O_DIRECT; + + if (get_maj_min(dev, &major, &minor)) { + snprintf(devname, sizeof(devname), "/dev/.tmp.md.%d:%d:%d", + (int)getpid(), major, minor); + if (mknod(devname, S_IFBLK|0600, makedev(major, minor)) == 0) { + fd = open(devname, flags); + unlink(devname); + } + if (fd < 0) { + /* Try /tmp as /dev appear to be read-only */ + snprintf(devname, sizeof(devname), + "/tmp/.tmp.md.%d:%d:%d", + 
(int)getpid(), major, minor); + if (mknod(devname, S_IFBLK|0600, + makedev(major, minor)) == 0) { + fd = open(devname, flags); + unlink(devname); + } + } + } else + fd = open(dev, flags); + return fd; +} + +int open_dev_flags(char *devnm, int flags) +{ + dev_t devid; + char buf[20]; + + devid = devnm2devid(devnm); + sprintf(buf, "%d:%d", major(devid), minor(devid)); + return dev_open(buf, flags); +} + +int open_dev(char *devnm) +{ + return open_dev_flags(devnm, O_RDONLY); +} + +int open_dev_excl(char *devnm) +{ + char buf[20]; + int i; + int flags = O_RDWR; + dev_t devid = devnm2devid(devnm); + long delay = 1000; + + sprintf(buf, "%d:%d", major(devid), minor(devid)); + for (i = 0; i < 25; i++) { + int fd = dev_open(buf, flags|O_EXCL); + if (fd >= 0) + return fd; + if (errno == EACCES && flags == O_RDWR) { + flags = O_RDONLY; + continue; + } + if (errno != EBUSY) + return fd; + usleep(delay); + if (delay < 200000) + delay *= 2; + } + return -1; +} + +int same_dev(char *one, char *two) +{ + struct stat st1, st2; + if (stat(one, &st1) != 0) + return 0; + if (stat(two, &st2) != 0) + return 0; + if ((st1.st_mode & S_IFMT) != S_IFBLK) + return 0; + if ((st2.st_mode & S_IFMT) != S_IFBLK) + return 0; + return st1.st_rdev == st2.st_rdev; +} + +void wait_for(char *dev, int fd) +{ + int i; + struct stat stb_want; + long delay = 1000; + + if (fstat(fd, &stb_want) != 0 || + (stb_want.st_mode & S_IFMT) != S_IFBLK) + return; + + for (i = 0; i < 25; i++) { + struct stat stb; + if (stat(dev, &stb) == 0 && + (stb.st_mode & S_IFMT) == S_IFBLK && + (stb.st_rdev == stb_want.st_rdev)) + return; + usleep(delay); + if (delay < 200000) + delay *= 2; + } + if (i == 25) + pr_err("timeout waiting for %s\n", dev); +} + +struct superswitch *superlist[] = +{ + &super0, &super1, + &super_ddf, &super_imsm, + &mbr, &gpt, + NULL +}; + +struct supertype *super_by_fd(int fd, char **subarrayp) +{ + mdu_array_info_t array; + int vers; + int minor; + struct supertype *st = NULL; + struct mdinfo *sra; + 
char *verstr; + char version[20]; + int i; + char *subarray = NULL; + char container[32] = ""; + + sra = sysfs_read(fd, NULL, GET_VERSION); + + if (sra) { + vers = sra->array.major_version; + minor = sra->array.minor_version; + verstr = sra->text_version; + } else { + if (md_get_array_info(fd, &array)) + array.major_version = array.minor_version = 0; + vers = array.major_version; + minor = array.minor_version; + verstr = ""; + } + + if (vers != -1) { + sprintf(version, "%d.%d", vers, minor); + verstr = version; + } + if (minor == -2 && is_subarray(verstr)) { + char *dev = verstr+1; + + subarray = strchr(dev, '/'); + if (subarray) { + *subarray++ = '\0'; + subarray = xstrdup(subarray); + } + strcpy(container, dev); + sysfs_free(sra); + sra = sysfs_read(-1, container, GET_VERSION); + if (sra && sra->text_version[0]) + verstr = sra->text_version; + else + verstr = "-no-metadata-"; + } + + for (i = 0; st == NULL && superlist[i]; i++) + st = superlist[i]->match_metadata_desc(verstr); + + sysfs_free(sra); + if (st) { + st->sb = NULL; + if (subarrayp) + *subarrayp = subarray; + strcpy(st->container_devnm, container); + strcpy(st->devnm, fd2devnm(fd)); + } else + free(subarray); + + return st; +} + +int dev_size_from_id(dev_t id, unsigned long long *size) +{ + char buf[20]; + int fd; + + sprintf(buf, "%d:%d", major(id), minor(id)); + fd = dev_open(buf, O_RDONLY); + if (fd < 0) + return 0; + if (get_dev_size(fd, NULL, size)) { + close(fd); + return 1; + } + close(fd); + return 0; +} + +int dev_sector_size_from_id(dev_t id, unsigned int *size) +{ + char buf[20]; + int fd; + + sprintf(buf, "%d:%d", major(id), minor(id)); + fd = dev_open(buf, O_RDONLY); + if (fd < 0) + return 0; + if (get_dev_sector_size(fd, NULL, size)) { + close(fd); + return 1; + } + close(fd); + return 0; +} + +struct supertype *dup_super(struct supertype *orig) +{ + struct supertype *st; + + if (!orig) + return orig; + st = xcalloc(1, sizeof(*st)); + st->ss = orig->ss; + st->max_devs = orig->max_devs; + 
st->minor_version = orig->minor_version; + st->ignore_hw_compat = orig->ignore_hw_compat; + st->data_offset = orig->data_offset; + st->sb = NULL; + st->info = NULL; + return st; +} + +struct supertype *guess_super_type(int fd, enum guess_types guess_type) +{ + /* try each load_super to find the best match, + * and return the best superswitch + */ + struct superswitch *ss; + struct supertype *st; + unsigned int besttime = 0; + int bestsuper = -1; + int i; + + st = xcalloc(1, sizeof(*st)); + st->container_devnm[0] = 0; + + for (i = 0; superlist[i]; i++) { + int rv; + ss = superlist[i]; + if (guess_type == guess_array && ss->add_to_super == NULL) + continue; + if (guess_type == guess_partitions && ss->add_to_super != NULL) + continue; + memset(st, 0, sizeof(*st)); + st->ignore_hw_compat = 1; + rv = ss->load_super(st, fd, NULL); + if (rv == 0) { + struct mdinfo info; + st->ss->getinfo_super(st, &info, NULL); + if (bestsuper == -1 || + besttime < info.array.ctime) { + bestsuper = i; + besttime = info.array.ctime; + } + ss->free_super(st); + } + } + if (bestsuper != -1) { + int rv; + memset(st, 0, sizeof(*st)); + st->ignore_hw_compat = 1; + rv = superlist[bestsuper]->load_super(st, fd, NULL); + if (rv == 0) { + superlist[bestsuper]->free_super(st); + return st; + } + } + free(st); + return NULL; +} + +/* Return size of device in bytes */ +int get_dev_size(int fd, char *dname, unsigned long long *sizep) +{ + unsigned long long ldsize; + struct stat st; + + if (fstat(fd, &st) != -1 && S_ISREG(st.st_mode)) + ldsize = (unsigned long long)st.st_size; + else +#ifdef BLKGETSIZE64 + if (ioctl(fd, BLKGETSIZE64, &ldsize) != 0) +#endif + { + unsigned long dsize; + if (ioctl(fd, BLKGETSIZE, &dsize) == 0) { + ldsize = dsize; + ldsize <<= 9; + } else { + if (dname) + pr_err("Cannot get size of %s: %s\n", + dname, strerror(errno)); + return 0; + } + } + *sizep = ldsize; + return 1; +} + +/* Return sector size of device in bytes */ +int get_dev_sector_size(int fd, char *dname, unsigned 
int *sectsizep) +{ + unsigned int sectsize; + + if (ioctl(fd, BLKSSZGET, &sectsize) != 0) { + if (dname) + pr_err("Cannot get sector size of %s: %s\n", + dname, strerror(errno)); + return 0; + } + + *sectsizep = sectsize; + return 1; +} + +/* Return true if this can only be a container, not a member device. + * i.e. is and md device and size is zero + */ +int must_be_container(int fd) +{ + struct mdinfo *mdi; + unsigned long long size; + + mdi = sysfs_read(fd, NULL, GET_VERSION); + if (!mdi) + return 0; + sysfs_free(mdi); + + if (get_dev_size(fd, NULL, &size) == 0) + return 1; + if (size == 0) + return 1; + return 0; +} + +/* Sets endofpart parameter to the last block used by the last GPT partition on the device. + * Returns: 1 if successful + * -1 for unknown partition type + * 0 for other errors + */ +static int get_gpt_last_partition_end(int fd, unsigned long long *endofpart) +{ + struct GPT gpt; + unsigned char empty_gpt_entry[16]= {0}; + struct GPT_part_entry *part; + char buf[512]; + unsigned long long curr_part_end; + unsigned all_partitions, entry_size; + unsigned part_nr; + unsigned int sector_size = 0; + + *endofpart = 0; + + BUILD_BUG_ON(sizeof(gpt) != 512); + /* skip protective MBR */ + if (!get_dev_sector_size(fd, NULL, &sector_size)) + return 0; + lseek(fd, sector_size, SEEK_SET); + /* read GPT header */ + if (read(fd, &gpt, 512) != 512) + return 0; + + /* get the number of partition entries and the entry size */ + all_partitions = __le32_to_cpu(gpt.part_cnt); + entry_size = __le32_to_cpu(gpt.part_size); + + /* Check GPT signature*/ + if (gpt.magic != GPT_SIGNATURE_MAGIC) + return -1; + + /* sanity checks */ + if (all_partitions > 1024 || + entry_size > sizeof(buf)) + return -1; + + part = (struct GPT_part_entry *)buf; + + /* set offset to third block (GPT entries) */ + lseek(fd, sector_size*2, SEEK_SET); + for (part_nr = 0; part_nr < all_partitions; part_nr++) { + /* read partition entry */ + if (read(fd, buf, entry_size) != (ssize_t)entry_size) + return 0;
+ + /* is this valid partition? */ + if (memcmp(part->type_guid, empty_gpt_entry, 16) != 0) { + /* check the last lba for the current partition */ + curr_part_end = __le64_to_cpu(part->ending_lba); + if (curr_part_end > *endofpart) + *endofpart = curr_part_end; + } + + } + return 1; +} + +/* Sets endofpart parameter to the last block used by the last partition on the device. + * Returns: 1 if successful + * -1 for unknown partition type + * 0 for other errors + */ +static int get_last_partition_end(int fd, unsigned long long *endofpart) +{ + struct MBR boot_sect; + unsigned long long curr_part_end; + unsigned part_nr; + unsigned int sector_size; + int retval = 0; + + *endofpart = 0; + + BUILD_BUG_ON(sizeof(boot_sect) != 512); + /* read MBR */ + lseek(fd, 0, 0); + if (read(fd, &boot_sect, 512) != 512) + goto abort; + + /* check MBP signature */ + if (boot_sect.magic == MBR_SIGNATURE_MAGIC) { + retval = 1; + /* found the correct signature */ + + for (part_nr = 0; part_nr < MBR_PARTITIONS; part_nr++) { + /* + * Have to make every access through boot_sect rather + * than using a pointer to the partition table (or an + * entry), since the entries are not properly aligned. 
+ */ + + /* check for GPT type */ + if (boot_sect.parts[part_nr].part_type == + MBR_GPT_PARTITION_TYPE) { + retval = get_gpt_last_partition_end(fd, endofpart); + break; + } + /* check the last used lba for the current partition */ + curr_part_end = + __le32_to_cpu(boot_sect.parts[part_nr].first_sect_lba) + + __le32_to_cpu(boot_sect.parts[part_nr].blocks_num); + if (curr_part_end > *endofpart) + *endofpart = curr_part_end; + } + } else { + /* Unknown partition table */ + retval = -1; + } + /* calculate number of 512-byte blocks */ + if (get_dev_sector_size(fd, NULL, &sector_size)) + *endofpart *= (sector_size / 512); + abort: + return retval; +} + +int check_partitions(int fd, char *dname, unsigned long long freesize, + unsigned long long size) +{ + /* + * Check where the last partition ends + */ + unsigned long long endofpart; + + if (get_last_partition_end(fd, &endofpart) > 0) { + /* There appears to be a partition table here */ + if (freesize == 0) { + /* partitions will not be visible in new device */ + pr_err("partition table exists on %s but will be lost or\n" + " meaningless after creating array\n", + dname); + return 1; + } else if (endofpart > freesize) { + /* last partition overlaps metadata */ + pr_err("metadata will over-write last partition on %s.\n", + dname); + return 1; + } else if (size && endofpart > size) { + /* partitions will be truncated in new device */ + pr_err("array size is too small to cover all partitions on %s.\n", + dname); + return 1; + } + } + return 0; +} + +int open_container(int fd) +{ + /* 'fd' is a block device. Find out if it is in use + * by a container, and return an open fd on that container.
+ */ + char path[288]; + char *e; + DIR *dir; + struct dirent *de; + int dfd, n; + char buf[200]; + int major, minor; + struct stat st; + + if (fstat(fd, &st) != 0) + return -1; + sprintf(path, "/sys/dev/block/%d:%d/holders", + (int)major(st.st_rdev), (int)minor(st.st_rdev)); + e = path + strlen(path); + + dir = opendir(path); + if (!dir) + return -1; + while ((de = readdir(dir))) { + if (de->d_ino == 0) + continue; + if (de->d_name[0] == '.') + continue; + /* Need to make sure it is a container and not a volume */ + sprintf(e, "/%s/md/metadata_version", de->d_name); + dfd = open(path, O_RDONLY); + if (dfd < 0) + continue; + n = read(dfd, buf, sizeof(buf)); + close(dfd); + if (n <= 0 || (unsigned)n >= sizeof(buf)) + continue; + buf[n] = 0; + if (strncmp(buf, "external", 8) != 0 || + n < 10 || + buf[9] == '/') + continue; + sprintf(e, "/%s/dev", de->d_name); + dfd = open(path, O_RDONLY); + if (dfd < 0) + continue; + n = read(dfd, buf, sizeof(buf)); + close(dfd); + if (n <= 0 || (unsigned)n >= sizeof(buf)) + continue; + buf[n] = 0; + if (sscanf(buf, "%d:%d", &major, &minor) != 2) + continue; + sprintf(buf, "%d:%d", major, minor); + dfd = dev_open(buf, O_RDONLY); + if (dfd >= 0) { + closedir(dir); + return dfd; + } + } + closedir(dir); + return -1; +} + +struct superswitch *version_to_superswitch(char *vers) +{ + int i; + + for (i = 0; superlist[i]; i++) { + struct superswitch *ss = superlist[i]; + + if (strcmp(vers, ss->name) == 0) + return ss; + } + + return NULL; +} + +int metadata_container_matches(char *metadata, char *devnm) +{ + /* Check if 'devnm' is the container named in 'metadata' + * which is + * /containername/componentname or + * -containername/componentname + */ + int l; + if (*metadata != '/' && *metadata != '-') + return 0; + l = strlen(devnm); + if (strncmp(metadata+1, devnm, l) != 0) + return 0; + if (metadata[l+1] != '/') + return 0; + return 1; +} + +int metadata_subdev_matches(char *metadata, char *devnm) +{ + /* Check if 'devnm' is the subdev 
named in 'metadata' + * which is + * /containername/subdev or + * -containername/subdev + */ + char *sl; + if (*metadata != '/' && *metadata != '-') + return 0; + sl = strchr(metadata+1, '/'); + if (!sl) + return 0; + if (strcmp(sl+1, devnm) == 0) + return 1; + return 0; +} + +int is_container_member(struct mdstat_ent *mdstat, char *container) +{ + if (mdstat->metadata_version == NULL || + strncmp(mdstat->metadata_version, "external:", 9) != 0 || + !metadata_container_matches(mdstat->metadata_version+9, container)) + return 0; + + return 1; +} + +int is_subarray_active(char *subarray, char *container) +{ + struct mdstat_ent *mdstat = mdstat_read(0, 0); + struct mdstat_ent *ent; + + for (ent = mdstat; ent; ent = ent->next) + if (is_container_member(ent, container)) + if (strcmp(to_subarray(ent, container), subarray) == 0) + break; + + free_mdstat(mdstat); + + return ent != NULL; +} + +/* open_subarray - opens a subarray in a container + * @dev: container device name + * @st: empty supertype + * @quiet: block reporting errors flag + * + * On success returns an fd to a container and fills in *st + */ +int open_subarray(char *dev, char *subarray, struct supertype *st, int quiet) +{ + struct mdinfo *mdi; + struct mdinfo *info; + int fd, err = 1; + char *_devnm; + + fd = open(dev, O_RDWR|O_EXCL); + if (fd < 0) { + if (!quiet) + pr_err("Couldn't open %s, aborting\n", + dev); + return -1; + } + + _devnm = fd2devnm(fd); + if (_devnm == NULL) { + if (!quiet) + pr_err("Failed to determine device number for %s\n", + dev); + goto close_fd; + } + strcpy(st->devnm, _devnm); + + mdi = sysfs_read(fd, st->devnm, GET_VERSION|GET_LEVEL); + if (!mdi) { + if (!quiet) + pr_err("Failed to read sysfs for %s\n", + dev); + goto close_fd; + } + + if (mdi->array.level != UnSet) { + if (!quiet) + pr_err("%s is not a container\n", dev); + goto free_sysfs; + } + + st->ss = version_to_superswitch(mdi->text_version); + if (!st->ss) { + if (!quiet) + pr_err("Operation not supported for %s 
metadata\n", + mdi->text_version); + goto free_sysfs; + } + + if (st->devnm[0] == 0) { + if (!quiet) + pr_err("Failed to allocate device name\n"); + goto free_sysfs; + } + + if (!st->ss->load_container) { + if (!quiet) + pr_err("%s is not a container\n", dev); + goto free_sysfs; + } + + if (st->ss->load_container(st, fd, NULL)) { + if (!quiet) + pr_err("Failed to load metadata for %s\n", + dev); + goto free_sysfs; + } + + info = st->ss->container_content(st, subarray); + if (!info) { + if (!quiet) + pr_err("Failed to find subarray-%s in %s\n", + subarray, dev); + goto free_super; + } + free(info); + + err = 0; + + free_super: + if (err) + st->ss->free_super(st); + free_sysfs: + sysfs_free(mdi); + close_fd: + if (err) + close(fd); + + if (err) + return -1; + else + return fd; +} + +int add_disk(int mdfd, struct supertype *st, + struct mdinfo *sra, struct mdinfo *info) +{ + /* Add a device to an array, in one of 2 ways. */ + int rv; + + if (st->ss->external) { + if (info->disk.state & (1<<MD_DISK_SYNC)) + info->recovery_start = MaxSector; + else + info->recovery_start = 0; + rv = sysfs_add_disk(sra, info, 0); + if (! rv) { + struct mdinfo *sd2; + for (sd2 = sra->devs; sd2; sd2=sd2->next) + if (sd2 == info) + break; + if (sd2 == NULL) { + sd2 = xmalloc(sizeof(*sd2)); + *sd2 = *info; + sd2->next = sra->devs; + sra->devs = sd2; + } + } + } else + rv = ioctl(mdfd, ADD_NEW_DISK, &info->disk); + return rv; +} + +int remove_disk(int mdfd, struct supertype *st, + struct mdinfo *sra, struct mdinfo *info) +{ + int rv; + + /* Remove the disk given by 'info' from the array */ + if (st->ss->external) + rv = sysfs_set_str(sra, info, "slot", "none"); + else + rv = ioctl(mdfd, HOT_REMOVE_DISK, makedev(info->disk.major, + info->disk.minor)); + return rv; +} + +int hot_remove_disk(int mdfd, unsigned long dev, int force) +{ + int cnt = force ? 500 : 5; + int ret; + + /* HOT_REMOVE_DISK can fail with EBUSY if there are + * outstanding IO requests to the device. 
+ * In this case, it can be helpful to wait a little while, + * up to 5 seconds if 'force' is set, or 50 msec if not. + */ + while ((ret = ioctl(mdfd, HOT_REMOVE_DISK, dev)) == -1 && + errno == EBUSY && + cnt-- > 0) + usleep(10000); + + return ret; +} + +int sys_hot_remove_disk(int statefd, int force) +{ + int cnt = force ? 500 : 5; + int ret; + + while ((ret = write(statefd, "remove", 6)) == -1 && + errno == EBUSY && + cnt-- > 0) + usleep(10000); + return ret == 6 ? 0 : -1; +} + +int set_array_info(int mdfd, struct supertype *st, struct mdinfo *info) +{ + /* Initialise kernel's knowledge of array. + * This varies between externally managed arrays + * and older kernels + */ + mdu_array_info_t inf; + int rv; + + if (st->ss->external) + return sysfs_set_array(info, 9003); + + memset(&inf, 0, sizeof(inf)); + inf.major_version = info->array.major_version; + inf.minor_version = info->array.minor_version; + rv = md_set_array_info(mdfd, &inf); + + return rv; +} + +unsigned long long min_recovery_start(struct mdinfo *array) +{ + /* find the minimum recovery_start in an array for metadata + * formats that only record per-array recovery progress instead + * of per-device + */ + unsigned long long recovery_start = MaxSector; + struct mdinfo *d; + + for (d = array->devs; d; d = d->next) + recovery_start = min(recovery_start, d->recovery_start); + + return recovery_start; +} + +int mdmon_pid(char *devnm) +{ + char path[100]; + char pid[10]; + int fd; + int n; + + sprintf(path, "%s/%s.pid", MDMON_DIR, devnm); + + fd = open(path, O_RDONLY | O_NOATIME, 0); + + if (fd < 0) + return -1; + n = read(fd, pid, 9); + close(fd); + if (n <= 0) + return -1; + return atoi(pid); +} + +int mdmon_running(char *devnm) +{ + int pid = mdmon_pid(devnm); + if (pid <= 0) + return 0; + if (kill(pid, 0) == 0) + return 1; + return 0; +} + +int start_mdmon(char *devnm) +{ + int i; + int len; + pid_t pid; + int status; + char pathbuf[1024]; + char *paths[4] = { + pathbuf, + BINDIR "/mdmon", + "./mdmon", + 
NULL + }; + + if (check_env("MDADM_NO_MDMON")) + return 0; + if (continue_via_systemd(devnm, MDMON_SERVICE)) + return 0; + + /* That failed, try running mdmon directly */ + len = readlink("/proc/self/exe", pathbuf, sizeof(pathbuf)-1); + if (len > 0) { + char *sl; + pathbuf[len] = 0; + sl = strrchr(pathbuf, '/'); + if (sl) + sl++; + else + sl = pathbuf; + strcpy(sl, "mdmon"); + } else + pathbuf[0] = '\0'; + + switch(fork()) { + case 0: + manage_fork_fds(1); + for (i = 0; paths[i]; i++) + if (paths[i][0]) { + execl(paths[i], paths[i], + devnm, NULL); + } + exit(1); + case -1: pr_err("cannot run mdmon. Array remains readonly\n"); + return -1; + default: /* parent - good */ + pid = wait(&status); + if (pid < 0 || status != 0) { + pr_err("failed to launch mdmon. Array remains readonly\n"); + return -1; + } + } + return 0; +} + +__u32 random32(void) +{ + __u32 rv; + int rfd = open("/dev/urandom", O_RDONLY); + if (rfd < 0 || read(rfd, &rv, 4) != 4) + rv = random(); + if (rfd >= 0) + close(rfd); + return rv; +} + +void random_uuid(__u8 *buf) +{ + int fd, i, len; + __u32 r[4]; + + fd = open("/dev/urandom", O_RDONLY); + if (fd < 0) + goto use_random; + len = read(fd, buf, 16); + close(fd); + if (len != 16) + goto use_random; + + return; + +use_random: + for (i = 0; i < 4; i++) + r[i] = random(); + memcpy(buf, r, 16); +} + +int flush_metadata_updates(struct supertype *st) +{ + int sfd; + if (!st->updates) { + st->update_tail = NULL; + return -1; + } + + sfd = connect_monitor(st->container_devnm); + if (sfd < 0) + return -1; + + while (st->updates) { + struct metadata_update *mu = st->updates; + st->updates = mu->next; + + send_message(sfd, mu, 0); + wait_reply(sfd, 0); + free(mu->buf); + free(mu); + } + ack(sfd, 0); + wait_reply(sfd, 0); + close(sfd); + st->update_tail = NULL; + return 0; +} + +void append_metadata_update(struct supertype *st, void *buf, int len) +{ + + struct metadata_update *mu = xmalloc(sizeof(*mu)); + + mu->buf = buf; + mu->len = len; + mu->space = NULL; 
+ mu->space_list = NULL; + mu->next = NULL; + *st->update_tail = mu; + st->update_tail = &mu->next; +} + +#ifdef __TINYC__ +/* tinyc doesn't optimize this check in ioctl.h out ... */ +unsigned int __invalid_size_argument_for_IOC = 0; +#endif + +/* Pick all spares matching given criteria from a container + * if min_size == 0 do not check size + * if domlist == NULL do not check domains + * if spare_group given add it to domains of each spare + * metadata allows to test domains using metadata of destination array */ +struct mdinfo *container_choose_spares(struct supertype *st, + struct spare_criteria *criteria, + struct domainlist *domlist, + char *spare_group, + const char *metadata, int get_one) +{ + struct mdinfo *d, **dp, *disks = NULL; + + /* get list of all disks in container */ + if (st->ss->getinfo_super_disks) + disks = st->ss->getinfo_super_disks(st); + + if (!disks) + return disks; + /* find spare devices on the list */ + dp = &disks->devs; + disks->array.spare_disks = 0; + while (*dp) { + int found = 0; + d = *dp; + if (d->disk.state == 0) { + /* check if size is acceptable */ + unsigned long long dev_size; + unsigned int dev_sector_size; + int size_valid = 0; + int sector_size_valid = 0; + + dev_t dev = makedev(d->disk.major,d->disk.minor); + + if (!criteria->min_size || + (dev_size_from_id(dev, &dev_size) && + dev_size >= criteria->min_size)) + size_valid = 1; + + if (!criteria->sector_size || + (dev_sector_size_from_id(dev, &dev_sector_size) && + criteria->sector_size == dev_sector_size)) + sector_size_valid = 1; + + found = size_valid && sector_size_valid; + + /* check if domain matches */ + if (found && domlist) { + struct dev_policy *pol = devid_policy(dev); + if (spare_group) + pol_add(&pol, pol_domain, + spare_group, NULL); + if (domain_test(domlist, pol, metadata) != 1) + found = 0; + dev_policy_free(pol); + } + } + if (found) { + dp = &d->next; + disks->array.spare_disks++; + if (get_one) { + sysfs_free(*dp); + d->next = NULL; + } + } else { + 
*dp = d->next; + d->next = NULL; + sysfs_free(d); + } + } + return disks; +} + +/* Checks if paths point to the same device + * Returns 0 if they do. + * Returns 1 if they don't. + * Returns -1 if something went wrong, + * e.g. paths are empty or the files + * they point to don't exist */ +int compare_paths (char* path1, char* path2) +{ + struct stat st1,st2; + + if (path1 == NULL || path2 == NULL) + return -1; + if (stat(path1,&st1) != 0) + return -1; + if (stat(path2,&st2) != 0) + return -1; + if ((st1.st_ino == st2.st_ino) && (st1.st_dev == st2.st_dev)) + return 0; + return 1; +} + +/* Make sure we can open as many devices as needed */ +void enable_fds(int devices) +{ + unsigned int fds = 20 + devices; + struct rlimit lim; + if (getrlimit(RLIMIT_NOFILE, &lim) != 0 || lim.rlim_cur >= fds) + return; + if (lim.rlim_max < fds) + lim.rlim_max = fds; + lim.rlim_cur = fds; + setrlimit(RLIMIT_NOFILE, &lim); +} + +/* Close all opened descriptors if needed and redirect + * streams to /dev/null. + * For debug purposed, leave STDOUT and STDERR untouched + * Returns: + * 1- if any error occurred + * 0- otherwise + */ +void manage_fork_fds(int close_all) +{ + DIR *dir; + struct dirent *dirent; + + close(0); + open("/dev/null", O_RDWR); + +#ifndef DEBUG + dup2(0, 1); + dup2(0, 2); +#endif + + if (close_all == 0) + return; + + dir = opendir("/proc/self/fd"); + if (!dir) { + pr_err("Cannot open /proc/self/fd directory.\n"); + return; + } + for (dirent = readdir(dir); dirent; dirent = readdir(dir)) { + int fd = -1; + + if ((strcmp(dirent->d_name, ".") == 0) || + (strcmp(dirent->d_name, "..")) == 0) + continue; + + fd = strtol(dirent->d_name, NULL, 10); + if (fd > 2) + close(fd); + } +} + +/* In a systemd/udev world, it is best to get systemd to + * run daemon rather than running in the background. 
+ * Returns: + * 1- if systemd service has been started + * 0- otherwise + */ +int continue_via_systemd(char *devnm, char *service_name) +{ + int pid, status; + char pathbuf[1024]; + + /* Simply return that service cannot be started */ + if (check_env("MDADM_NO_SYSTEMCTL")) + return 0; + switch (fork()) { + case 0: + manage_fork_fds(1); + snprintf(pathbuf, sizeof(pathbuf), + "%s@%s.service", service_name, devnm); + status = execl("/usr/bin/systemctl", "systemctl", "restart", + pathbuf, NULL); + status = execl("/bin/systemctl", "systemctl", "restart", + pathbuf, NULL); + exit(1); + case -1: /* Just do it ourselves. */ + break; + default: /* parent - good */ + pid = wait(&status); + if (pid >= 0 && status == 0) + return 1; + } + return 0; +} + +int in_initrd(void) +{ + /* This is based on similar function in systemd. */ + struct statfs s; + /* statfs.f_type is signed long on s390x and MIPS, causing all + sorts of sign extension problems with RAMFS_MAGIC being + defined as 0x858458f6 */ + return statfs("/", &s) >= 0 && + ((unsigned long)s.f_type == TMPFS_MAGIC || + ((unsigned long)s.f_type & 0xFFFFFFFFUL) == + ((unsigned long)RAMFS_MAGIC & 0xFFFFFFFFUL)); +} + +void reopen_mddev(int mdfd) +{ + /* Re-open without any O_EXCL, but keep + * the same fd + */ + char *devnm; + int fd; + devnm = fd2devnm(mdfd); + close(mdfd); + fd = open_dev(devnm); + if (fd >= 0 && fd != mdfd) + dup2(fd, mdfd); +} + +static struct cmap_hooks *cmap_hooks = NULL; +static int is_cmap_hooks_ready = 0; + +void set_cmap_hooks(void) +{ + cmap_hooks = xmalloc(sizeof(struct cmap_hooks)); + cmap_hooks->cmap_handle = dlopen("libcmap.so.4", RTLD_NOW | RTLD_LOCAL); + if (!cmap_hooks->cmap_handle) + return; + + cmap_hooks->initialize = + dlsym(cmap_hooks->cmap_handle, "cmap_initialize"); + cmap_hooks->get_string = + dlsym(cmap_hooks->cmap_handle, "cmap_get_string"); + cmap_hooks->finalize = dlsym(cmap_hooks->cmap_handle, "cmap_finalize"); + + if (!cmap_hooks->initialize || !cmap_hooks->get_string || + 
!cmap_hooks->finalize) + dlclose(cmap_hooks->cmap_handle); + else + is_cmap_hooks_ready = 1; +} + +int get_cluster_name(char **cluster_name) +{ + int rv = -1; + cmap_handle_t handle; + + if (!is_cmap_hooks_ready) + return rv; + + rv = cmap_hooks->initialize(&handle); + if (rv != CS_OK) + goto out; + + rv = cmap_hooks->get_string(handle, "totem.cluster_name", cluster_name); + if (rv != CS_OK) { + free(*cluster_name); + rv = -1; + goto name_err; + } + + rv = 0; +name_err: + cmap_hooks->finalize(handle); +out: + return rv; +} + +void set_dlm_hooks(void) +{ + dlm_hooks = xmalloc(sizeof(struct dlm_hooks)); + dlm_hooks->dlm_handle = dlopen("libdlm_lt.so.3", RTLD_NOW | RTLD_LOCAL); + if (!dlm_hooks->dlm_handle) + return; + + dlm_hooks->open_lockspace = + dlsym(dlm_hooks->dlm_handle, "dlm_open_lockspace"); + dlm_hooks->create_lockspace = + dlsym(dlm_hooks->dlm_handle, "dlm_create_lockspace"); + dlm_hooks->release_lockspace = + dlsym(dlm_hooks->dlm_handle, "dlm_release_lockspace"); + dlm_hooks->ls_lock = dlsym(dlm_hooks->dlm_handle, "dlm_ls_lock"); + dlm_hooks->ls_unlock_wait = + dlsym(dlm_hooks->dlm_handle, "dlm_ls_unlock_wait"); + dlm_hooks->ls_get_fd = dlsym(dlm_hooks->dlm_handle, "dlm_ls_get_fd"); + dlm_hooks->dispatch = dlsym(dlm_hooks->dlm_handle, "dlm_dispatch"); + + if (!dlm_hooks->open_lockspace || !dlm_hooks->create_lockspace || + !dlm_hooks->ls_lock || !dlm_hooks->ls_unlock_wait || + !dlm_hooks->release_lockspace || !dlm_hooks->ls_get_fd || + !dlm_hooks->dispatch) + dlclose(dlm_hooks->dlm_handle); + else + is_dlm_hooks_ready = 1; +} + +void set_hooks(void) +{ + set_dlm_hooks(); + set_cmap_hooks(); +} + +int zero_disk_range(int fd, unsigned long long sector, size_t count) +{ + int ret = 0; + int fd_zero; + void *addr = NULL; + size_t written = 0; + size_t len = count * 512; + ssize_t n; + + fd_zero = open("/dev/zero", O_RDONLY); + if (fd_zero < 0) { + pr_err("Cannot open /dev/zero\n"); + return -1; + } + + if (lseek64(fd, sector * 512, SEEK_SET) < 0) { + ret = 
-errno; + pr_err("Failed to seek offset for zeroing\n"); + goto out; + } + + addr = mmap(NULL, len, PROT_READ, MAP_PRIVATE, fd_zero, 0); + + if (addr == MAP_FAILED) { + ret = -errno; + pr_err("Mapping /dev/zero failed\n"); + goto out; + } + + do { + n = write(fd, addr + written, len - written); + if (n < 0) { + if (errno == EINTR) + continue; + ret = -errno; + pr_err("Zeroing disk range failed\n"); + break; + } + written += n; + } while (written != len); + + munmap(addr, len); + +out: + close(fd_zero); + return ret; +} @@ -0,0 +1,112 @@ +/* + * mdadm - manage Linux "md" devices aka RAID arrays. + * + * Copyright (C) 2001-2013 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +#include <string.h> + +const int uuid_zero[4] = { 0, 0, 0, 0 }; + +int same_uuid(int a[4], int b[4], int swapuuid) +{ + if (swapuuid) { + /* parse uuids are hostendian. + * uuid's from some superblocks are big-ending + * if there is a difference, we need to swap.. 
+ */ + unsigned char *ac = (unsigned char *)a; + unsigned char *bc = (unsigned char *)b; + int i; + for (i = 0; i < 16; i += 4) { + if (ac[i+0] != bc[i+3] || + ac[i+1] != bc[i+2] || + ac[i+2] != bc[i+1] || + ac[i+3] != bc[i+0]) + return 0; + } + return 1; + } else { + if (a[0]==b[0] && + a[1]==b[1] && + a[2]==b[2] && + a[3]==b[3]) + return 1; + return 0; + } +} + +void copy_uuid(void *a, int b[4], int swapuuid) +{ + if (swapuuid) { + /* parse uuids are hostendian. + * uuid's from some superblocks are big-ending + * if there is a difference, we need to swap.. + */ + unsigned char *ac = (unsigned char *)a; + unsigned char *bc = (unsigned char *)b; + int i; + for (i = 0; i < 16; i += 4) { + ac[i+0] = bc[i+3]; + ac[i+1] = bc[i+2]; + ac[i+2] = bc[i+1]; + ac[i+3] = bc[i+0]; + } + } else + memcpy(a, b, 16); +} + +/* + * Parse a 128 bit uuid in 4 integers + * format is 32 hexx nibbles with options :.<space> separator + * If not exactly 32 hex digits are found, return 0 + * else return 1 + */ +int parse_uuid(char *str, int uuid[4]) +{ + int hit = 0; /* number of Hex digIT */ + int i; + char c; + for (i = 0; i < 4; i++) + uuid[i] = 0; + + while ((c = *str++) != 0) { + int n; + if (c >= '0' && c <= '9') + n = c-'0'; + else if (c >= 'a' && c <= 'f') + n = 10 + c - 'a'; + else if (c >= 'A' && c <= 'F') + n = 10 + c - 'A'; + else if (strchr(":. -", c)) + continue; + else return 0; + + if (hit<32) { + uuid[hit/8] <<= 4; + uuid[hit/8] += n; + } + hit++; + } + if (hit == 32) + return 1; + return 0; +} diff --git a/xmalloc.c b/xmalloc.c new file mode 100644 index 0000000..8b3f78a --- /dev/null +++ b/xmalloc.c @@ -0,0 +1,84 @@ +/* mdadm - manage Linux "md" devices aka RAID arrays. 
+ * + * Copyright (C) 2001-2009 Neil Brown <neilb@suse.de> + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Author: Neil Brown + * Email: <neilb@suse.de> + */ + +#include "mdadm.h" +/*#include <sys/socket.h> +#include <sys/utsname.h> +#include <sys/wait.h> +#include <sys/un.h> +#include <ctype.h> +#include <dirent.h> +#include <signal.h> +*/ + +void *xmalloc(size_t len) +{ + void *rv = malloc(len); + char *msg; + int n; + if (rv) + return rv; + msg = ": memory allocation failure - aborting\n"; + n = write(2, Name, strlen(Name)); + n += write(2, msg, strlen(msg)); + exit(4+!!n); +} + +void *xrealloc(void *ptr, size_t len) +{ + void *rv = realloc(ptr, len); + char *msg; + int n; + if (rv) + return rv; + msg = ": memory allocation failure - aborting\n"; + n = write(2, Name, strlen(Name)); + n += write(2, msg, strlen(msg)); + exit(4+!!n); +} + +void *xcalloc(size_t num, size_t size) +{ + void *rv = calloc(num, size); + char *msg; + int n; + if (rv) + return rv; + msg = ": memory allocation failure - aborting\n"; + n = write(2, Name, strlen(Name)); + n += write(2, msg, strlen(msg)); + exit(4+!!n); +} + +char *xstrdup(const char *str) +{ + char *rv = strdup(str); + char *msg; + int n; + if (rv) + return rv; + msg = ": memory allocation failure - aborting\n"; + n = write(2, Name, strlen(Name)); 
+ n += write(2, msg, strlen(msg)); + exit(4+!!n); +} |