summaryrefslogtreecommitdiffstats
path: root/block
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--block/Kconfig21
-rw-r--r--block/badblocks.c1620
-rw-r--r--block/bdev.c278
-rw-r--r--block/bio-integrity.c218
-rw-r--r--block/bio.c14
-rw-r--r--block/blk-cgroup.c18
-rw-r--r--block/blk-cgroup.h2
-rw-r--r--block/blk-core.c38
-rw-r--r--block/blk-flush.c11
-rw-r--r--block/blk-iocost.c9
-rw-r--r--block/blk-map.c13
-rw-r--r--block/blk-merge.c6
-rw-r--r--block/blk-mq-debugfs.c19
-rw-r--r--block/blk-mq-sched.c2
-rw-r--r--block/blk-mq.c83
-rw-r--r--block/blk-mq.h57
-rw-r--r--block/blk-pm.c33
-rw-r--r--block/blk-rq-qos.h2
-rw-r--r--block/blk-settings.c111
-rw-r--r--block/blk-stat.c2
-rw-r--r--block/blk-sysfs.c13
-rw-r--r--block/blk-wbt.c17
-rw-r--r--block/blk-wbt.h5
-rw-r--r--block/blk-zoned.c21
-rw-r--r--block/blk.h2
-rw-r--r--block/disk-events.c18
-rw-r--r--block/fops.c67
-rw-r--r--block/genhd.c19
-rw-r--r--block/holder.c12
-rw-r--r--block/ioctl.c14
-rw-r--r--block/ioprio.c26
-rw-r--r--block/mq-deadline.c3
-rw-r--r--block/opal_proto.h1
-rw-r--r--block/partitions/core.c59
-rw-r--r--block/partitions/ibm.c98
-rw-r--r--block/sed-opal.c24
36 files changed, 2161 insertions, 795 deletions
diff --git a/block/Kconfig b/block/Kconfig
index f1364d1c0..1de4682d4 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -78,6 +78,26 @@ config BLK_DEV_INTEGRITY_T10
select CRC_T10DIF
select CRC64_ROCKSOFT
+config BLK_DEV_WRITE_MOUNTED
+ bool "Allow writing to mounted block devices"
+ default y
+ help
+ When a block device is mounted, writing to its buffer cache is very
+ likely going to cause filesystem corruption. It is also rather easy to
+ crash the kernel in this way since the filesystem has no practical way
+ of detecting these writes to buffer cache and verifying its metadata
+ integrity. However there are some setups that need this capability
+ like running fsck on read-only mounted root device, modifying some
+ features on mounted ext4 filesystem, and similar. If you say N, the
+ kernel will prevent processes from writing to block devices that are
+ mounted by filesystems which provides some more protection from runaway
+ privileged processes and generally makes it much harder to crash
+ filesystem drivers. Note however that this does not prevent
+ underlying device(s) from being modified by other means, e.g. by
+ directly submitting SCSI commands or through access to lower layers of
+ storage stack. If in doubt, say Y. The configuration can be overridden
+ with the bdev_allow_write_mounted boot option.
+
config BLK_DEV_ZONED
bool "Zoned block device support"
select MQ_IOSCHED_DEADLINE
@@ -186,6 +206,7 @@ config BLK_SED_OPAL
bool "Logic for interfacing with Opal enabled SEDs"
depends on KEYS
select PSERIES_PLPKS if PPC_PSERIES
+ select PSERIES_PLPKS_SED if PPC_PSERIES
help
Builds Logic for interfacing with Opal enabled controllers.
Enabling this option enables users to setup/unlock/lock
diff --git a/block/badblocks.c b/block/badblocks.c
index 3afb550c0..db4ec8b9b 100644
--- a/block/badblocks.c
+++ b/block/badblocks.c
@@ -16,119 +16,830 @@
#include <linux/types.h>
#include <linux/slab.h>
-/**
- * badblocks_check() - check a given range for bad sectors
- * @bb: the badblocks structure that holds all badblock information
- * @s: sector (start) at which to check for badblocks
- * @sectors: number of sectors to check for badblocks
- * @first_bad: pointer to store location of the first badblock
- * @bad_sectors: pointer to store number of badblocks after @first_bad
+/*
+ * The purpose of badblocks set/clear is to manage bad blocks ranges which are
+ * identified by LBA addresses.
*
- * We can record which blocks on each device are 'bad' and so just
- * fail those blocks, or that stripe, rather than the whole device.
- * Entries in the bad-block table are 64bits wide. This comprises:
- * Length of bad-range, in sectors: 0-511 for lengths 1-512
- * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes)
- * A 'shift' can be set so that larger blocks are tracked and
- * consequently larger devices can be covered.
- * 'Acknowledged' flag - 1 bit. - the most significant bit.
+ * When the caller of badblocks_set() wants to set a range of bad blocks, the
+ * setting range can be acked or unacked. And the setting range may merge,
+ * overwrite, skip the overlapped already set range, depends on who they are
+ * overlapped or adjacent, and the acknowledgment type of the ranges. It can be
+ * more complicated when the setting range covers multiple already set bad block
+ * ranges, with restrictions of maximum length of each bad range and the bad
+ * table space limitation.
*
- * Locking of the bad-block table uses a seqlock so badblocks_check
- * might need to retry if it is very unlucky.
- * We will sometimes want to check for bad blocks in a bi_end_io function,
- * so we use the write_seqlock_irq variant.
+ * It is difficult and unnecessary to take care of all the possible situations,
+ * for setting a large range of bad blocks, we can handle it by dividing the
+ * large range into smaller ones when encounter overlap, max range length or
+ * bad table full conditions. Every time only a smaller piece of the bad range
+ * is handled with a limited number of conditions how it is interacted with
+ * possible overlapped or adjacent already set bad block ranges. Then the hard
+ * complicated problem can be much simpler to handle in proper way.
*
- * When looking for a bad block we specify a range and want to
- * know if any block in the range is bad. So we binary-search
- * to the last range that starts at-or-before the given endpoint,
- * (or "before the sector after the target range")
- * then see if it ends after the given start.
+ * When setting a range of bad blocks to the bad table, the simplified situations
+ * to be considered are, (The already set bad blocks ranges are naming with
+ * prefix E, and the setting bad blocks range is naming with prefix S)
*
- * Return:
- * 0: there are no known bad blocks in the range
- * 1: there are known bad block which are all acknowledged
- * -1: there are bad blocks which have not yet been acknowledged in metadata.
- * plus the start/length of the first bad section we overlap.
+ * 1) A setting range is not overlapped or adjacent to any other already set bad
+ * block range.
+ * +--------+
+ * | S |
+ * +--------+
+ * +-------------+ +-------------+
+ * | E1 | | E2 |
+ * +-------------+ +-------------+
+ * For this situation if the bad blocks table is not full, just allocate a
+ * free slot from the bad blocks table to mark the setting range S. The
+ * result is,
+ * +-------------+ +--------+ +-------------+
+ * | E1 | | S | | E2 |
+ * +-------------+ +--------+ +-------------+
+ * 2) A setting range starts exactly at a start LBA of an already set bad blocks
+ * range.
+ * 2.1) The setting range size < already set range size
+ * +--------+
+ * | S |
+ * +--------+
+ * +-------------+
+ * | E |
+ * +-------------+
+ * 2.1.1) If S and E are both acked or unacked range, the setting range S can
+ * be merged into existing bad range E. The result is,
+ * +-------------+
+ * | S |
+ * +-------------+
+ * 2.1.2) If S is unacked setting and E is acked, the setting will be denied, and
+ * the result is,
+ * +-------------+
+ * | E |
+ * +-------------+
+ * 2.1.3) If S is acked setting and E is unacked, range S can overwrite on E.
+ * An extra slot from the bad blocks table will be allocated for S, and head
+ * of E will move to end of the inserted range S. The result is,
+ * +--------+----+
+ * | S | E |
+ * +--------+----+
+ * 2.2) The setting range size == already set range size
+ * 2.2.1) If S and E are both acked or unacked range, the setting range S can
+ * be merged into existing bad range E. The result is,
+ * +-------------+
+ * | S |
+ * +-------------+
+ * 2.2.2) If S is unacked setting and E is acked, the setting will be denied, and
+ * the result is,
+ * +-------------+
+ * | E |
+ * +-------------+
+ * 2.2.3) If S is acked setting and E is unacked, range S can overwrite all of
+ bad blocks range E. The result is,
+ * +-------------+
+ * | S |
+ * +-------------+
+ * 2.3) The setting range size > already set range size
+ * +-------------------+
+ * | S |
+ * +-------------------+
+ * +-------------+
+ * | E |
+ * +-------------+
+ * For such situation, the setting range S can be treated as two parts, the
+ * first part (S1) is as same size as the already set range E, the second
+ * part (S2) is the rest of setting range.
+ * +-------------+-----+ +-------------+ +-----+
+ * | S1 | S2 | | S1 | | S2 |
+ * +-------------+-----+ ===> +-------------+ +-----+
+ * +-------------+ +-------------+
+ * | E | | E |
+ * +-------------+ +-------------+
+ * Now we only focus on how to handle the setting range S1 and already set
+ * range E, which are already explained in 2.2), for the rest S2 it will be
+ * handled later in next loop.
+ * 3) A setting range starts before the start LBA of an already set bad blocks
+ * range.
+ * +-------------+
+ * | S |
+ * +-------------+
+ * +-------------+
+ * | E |
+ * +-------------+
+ * For this situation, the setting range S can be divided into two parts, the
+ * first (S1) ends at the start LBA of already set range E, the second part
+ * (S2) starts exactly at a start LBA of the already set range E.
+ * +----+---------+ +----+ +---------+
+ * | S1 | S2 | | S1 | | S2 |
+ * +----+---------+ ===> +----+ +---------+
+ * +-------------+ +-------------+
+ * | E | | E |
+ * +-------------+ +-------------+
+ * Now only the first part S1 should be handled in this loop, which is in
+ * similar condition as 1). The rest part S2 has exact same start LBA address
+ * of the already set range E, they will be handled in next loop in one of
+ * situations in 2).
+ * 4) A setting range starts after the start LBA of an already set bad blocks
+ * range.
+ * 4.1) If the setting range S exactly matches the tail part of already set bad
+ * blocks range E, like the following chart shows,
+ * +---------+
+ * | S |
+ * +---------+
+ * +-------------+
+ * | E |
+ * +-------------+
+ * 4.1.1) If range S and E have same acknowledge value (both acked or unacked),
+ * they will be merged into one, the result is,
+ * +-------------+
+ * | S |
+ * +-------------+
+ * 4.1.2) If range E is acked and the setting range S is unacked, the setting
+ * request of S will be rejected, the result is,
+ * +-------------+
+ * | E |
+ * +-------------+
+ * 4.1.3) If range E is unacked, and the setting range S is acked, then S may
+ * overwrite the overlapped range of E, the result is,
+ * +---+---------+
+ * | E | S |
+ * +---+---------+
+ * 4.2) If the setting range S stays in middle of an already set range E, like
+ * the following chart shows,
+ * +----+
+ * | S |
+ * +----+
+ * +--------------+
+ * | E |
+ * +--------------+
+ * 4.2.1) If range S and E have same acknowledge value (both acked or unacked),
+ * they will be merged into one, the result is,
+ * +--------------+
+ * | S |
+ * +--------------+
+ * 4.2.2) If range E is acked and the setting range S is unacked, the setting
+ * request of S will be rejected, the result is also,
+ * +--------------+
+ * | E |
+ * +--------------+
+ * 4.2.3) If range E is unacked, and the setting range S is acked, then S will
+ * inserted into middle of E and split previous range E into two parts (E1
+ * and E2), the result is,
+ * +----+----+----+
+ * | E1 | S | E2 |
+ * +----+----+----+
+ * 4.3) If the setting bad blocks range S is overlapped with an already set bad
+ * blocks range E. The range S starts after the start LBA of range E, and
+ * ends after the end LBA of range E, as the following chart shows,
+ * +-------------------+
+ * | S |
+ * +-------------------+
+ * +-------------+
+ * | E |
+ * +-------------+
+ * For this situation the range S can be divided into two parts, the first
+ * part (S1) ends at end range E, and the second part (S2) has rest range of
+ * origin S.
+ * +---------+---------+ +---------+ +---------+
+ * | S1 | S2 | | S1 | | S2 |
+ * +---------+---------+ ===> +---------+ +---------+
+ * +-------------+ +-------------+
+ * | E | | E |
+ * +-------------+ +-------------+
+ * Now in this loop the setting range S1 and already set range E can be
+ * handled as the situations 4.1), the rest range S2 will be handled in next
+ * loop and ignored in this loop.
+ * 5) A setting bad blocks range S is adjacent to one or more already set bad
+ * blocks range(s), and they are all acked or unacked range.
+ * 5.1) Front merge: If the already set bad blocks range E is before setting
+ * range S and they are adjacent,
+ * +------+
+ * | S |
+ * +------+
+ * +-------+
+ * | E |
+ * +-------+
+ * 5.1.1) When total size of range S and E <= BB_MAX_LEN, and their acknowledge
+ * values are same, the setting range S can front merges into range E. The
+ * result is,
+ * +--------------+
+ * | S |
+ * +--------------+
+ * 5.1.2) Otherwise these two ranges cannot merge, just insert the setting
+ * range S right after already set range E into the bad blocks table. The
+ * result is,
+ * +--------+------+
+ * | E | S |
+ * +--------+------+
+ * 6) Special cases which above conditions cannot handle
+ * 6.1) Multiple already set ranges may merge into less ones in a full bad table
+ * +-------------------------------------------------------+
+ * | S |
+ * +-------------------------------------------------------+
+ * |<----- BB_MAX_LEN ----->|
+ * +-----+ +-----+ +-----+
+ * | E1 | | E2 | | E3 |
+ * +-----+ +-----+ +-----+
+ * In the above example, when the bad blocks table is full, inserting the
+ * first part of setting range S will fail because no more available slot
+ * can be allocated from bad blocks table. In this situation a proper
+ * setting method should be go though all the setting bad blocks range and
+ * look for chance to merge already set ranges into less ones. When there
+ * is available slot from bad blocks table, re-try again to handle more
+ * setting bad blocks ranges as many as possible.
+ * +------------------------+
+ * | S3 |
+ * +------------------------+
+ * |<----- BB_MAX_LEN ----->|
+ * +-----+-----+-----+---+-----+--+
+ * | S1 | S2 |
+ * +-----+-----+-----+---+-----+--+
+ * The above chart shows although the first part (S3) cannot be inserted due
+ * to no-space in bad blocks table, but the following E1, E2 and E3 ranges
+ * can be merged with rest part of S into less range S1 and S2. Now there is
+ * 1 free slot in bad blocks table.
+ * +------------------------+-----+-----+-----+---+-----+--+
+ * | S3 | S1 | S2 |
+ * +------------------------+-----+-----+-----+---+-----+--+
+ * Since the bad blocks table is not full anymore, re-try again for the
+ * origin setting range S. Now the setting range S3 can be inserted into the
+ * bad blocks table with previous freed slot from multiple ranges merge.
+ * 6.2) Front merge after overwrite
+ * In the following example, in bad blocks table, E1 is an acked bad blocks
+ * range and E2 is an unacked bad blocks range, therefore they are not able
+ * to merge into a larger range. The setting bad blocks range S is acked,
+ * therefore part of E2 can be overwritten by S.
+ * +--------+
+ * | S | acknowledged
+ * +--------+ S: 1
+ * +-------+-------------+ E1: 1
+ * | E1 | E2 | E2: 0
+ * +-------+-------------+
+ * With previous simplified routines, after overwriting part of E2 with S,
+ * the bad blocks table should be (E3 is remaining part of E2 which is not
+ * overwritten by S),
+ * acknowledged
+ * +-------+--------+----+ S: 1
+ * | E1 | S | E3 | E1: 1
+ * +-------+--------+----+ E3: 0
+ * The above result is correct but not perfect. Range E1 and S in the bad
+ * blocks table are all acked, merging them into a larger one range may
+ * occupy less bad blocks table space and make badblocks_check() faster.
+ * Therefore in such situation, after overwriting range S, the previous range
+ * E1 should be checked for possible front combination. Then the ideal
+ * result can be,
+ * +----------------+----+ acknowledged
+ * | E1 | E3 | E1: 1
+ * +----------------+----+ E3: 0
+ * 6.3) Behind merge: If the already set bad blocks range E is behind the setting
+ * range S and they are adjacent. Normally we don't need to care about this
+ * because front merge handles this while going though range S from head to
+ * tail, except for the tail part of range S. When the setting range S are
+ * fully handled, all the above simplified routine doesn't check whether the
+ * tail LBA of range S is adjacent to the next already set range and not
+ * merge them even it is possible.
+ * +------+
+ * | S |
+ * +------+
+ * +-------+
+ * | E |
+ * +-------+
+ * For the above special situation, when the setting range S are all handled
+ * and the loop ends, an extra check is necessary for whether next already
+ * set range E is right after S and mergeable.
+ * 6.3.1) When total size of range E and S <= BB_MAX_LEN, and their acknowledge
+ * values are same, the setting range S can behind merges into range E. The
+ * result is,
+ * +--------------+
+ * | S |
+ * +--------------+
+ * 6.3.2) Otherwise these two ranges cannot merge, just insert the setting range
+ * S in front of the already set range E in the bad blocks table. The result
+ * is,
+ * +------+-------+
+ * | S | E |
+ * +------+-------+
+ *
+ * All the above 5 simplified situations and 3 special cases may cover 99%+ of
+ * the bad block range setting conditions. Maybe there is some rare corner case
+ * is not considered and optimized, it won't hurt if badblocks_set() fails due
+ * to no space, or some ranges are not merged to save bad blocks table space.
+ *
+ * Inside badblocks_set() each loop starts by jumping to re_insert label, every
+ * time for the new loop prev_badblocks() is called to find an already set range
+ * which starts before or at current setting range. Since the setting bad blocks
+ * range is handled from head to tail, most of the cases it is unnecessary to do
+ * the binary search inside prev_badblocks(), it is possible to provide a hint
+ * to prev_badblocks() for a fast path, then the expensive binary search can be
+ * avoided. In my test with the hint to prev_badblocks(), except for the first
+ * loop, all rested calls to prev_badblocks() can go into the fast path and
+ * return correct bad blocks table index immediately.
+ *
+ *
+ * Clearing a bad blocks range from the bad block table has similar idea as
+ * setting does, but much more simpler. The only thing needs to be noticed is
+ * when the clearing range hits middle of a bad block range, the existing bad
+ * block range will split into two, and one more item should be added into the
+ * bad block table. The simplified situations to be considered are, (The already
+ * set bad blocks ranges in bad block table are naming with prefix E, and the
+ * clearing bad blocks range is naming with prefix C)
+ *
+ * 1) A clearing range is not overlapped to any already set ranges in bad block
+ * table.
+ * +-----+ | +-----+ | +-----+
+ * | C | | | C | | | C |
+ * +-----+ or +-----+ or +-----+
+ * +---+ | +----+ +----+ | +---+
+ * | E | | | E1 | | E2 | | | E |
+ * +---+ | +----+ +----+ | +---+
+ * For the above situations, no bad block to be cleared and no failure
+ * happens, simply returns 0.
+ * 2) The clearing range hits middle of an already setting bad blocks range in
+ * the bad block table.
+ * +---+
+ * | C |
+ * +---+
+ * +-----------------+
+ * | E |
+ * +-----------------+
+ * In this situation if the bad block table is not full, the range E will be
+ * split into two ranges E1 and E2. The result is,
+ * +------+ +------+
+ * | E1 | | E2 |
+ * +------+ +------+
+ * 3) The clearing range starts exactly at same LBA as an already set bad block range
+ * from the bad block table.
+ * 3.1) Partially covered at head part
+ * +------------+
+ * | C |
+ * +------------+
+ * +-----------------+
+ * | E |
+ * +-----------------+
+ * For this situation, the overlapped already set range will update the
+ * start LBA to end of C and shrink the range to BB_LEN(E) - BB_LEN(C). No
+ * item deleted from bad block table. The result is,
+ * +----+
+ * | E1 |
+ * +----+
+ * 3.2) Exact fully covered
+ * +-----------------+
+ * | C |
+ * +-----------------+
+ * +-----------------+
+ * | E |
+ * +-----------------+
+ * For this situation the whole bad blocks range E will be cleared and its
+ * corresponded item is deleted from the bad block table.
+ * 4) The clearing range exactly ends at same LBA as an already set bad block
+ * range.
+ * +-------+
+ * | C |
+ * +-------+
+ * +-----------------+
+ * | E |
+ * +-----------------+
+ * For the above situation, the already set range E is updated to shrink its
+ * end to the start of C, and reduce its length to BB_LEN(E) - BB_LEN(C).
+ * The result is,
+ * +---------+
+ * | E |
+ * +---------+
+ * 5) The clearing range is partially overlapped with an already set bad block
+ * range from the bad block table.
+ * 5.1) The already set bad block range is front overlapped with the clearing
+ * range.
+ * +----------+
+ * | C |
+ * +----------+
+ * +------------+
+ * | E |
+ * +------------+
+ * For such situation, the clearing range C can be treated as two parts. The
+ * first part ends at the start LBA of range E, and the second part starts at
+ * same LBA of range E.
+ * +----+-----+ +----+ +-----+
+ * | C1 | C2 | | C1 | | C2 |
+ * +----+-----+ ===> +----+ +-----+
+ * +------------+ +------------+
+ * | E | | E |
+ * +------------+ +------------+
+ * Now the first part C1 can be handled as condition 1), and the second part C2 can be
+ * handled as condition 3.1) in next loop.
+ * 5.2) The already set bad block range is behind overlaopped with the clearing
+ * range.
+ * +----------+
+ * | C |
+ * +----------+
+ * +------------+
+ * | E |
+ * +------------+
+ * For such situation, the clearing range C can be treated as two parts. The
+ * first part C1 ends at same end LBA of range E, and the second part starts
+ * at end LBA of range E.
+ * +----+-----+ +----+ +-----+
+ * | C1 | C2 | | C1 | | C2 |
+ * +----+-----+ ===> +----+ +-----+
+ * +------------+ +------------+
+ * | E | | E |
+ * +------------+ +------------+
+ * Now the first part clearing range C1 can be handled as condition 4), and
+ * the second part clearing range C2 can be handled as condition 1) in next
+ * loop.
+ *
+ * All bad blocks range clearing can be simplified into the above 5 situations
+ * by only handling the head part of the clearing range in each run of the
+ * while-loop. The idea is similar to bad blocks range setting but much
+ * simpler.
*/
-int badblocks_check(struct badblocks *bb, sector_t s, int sectors,
- sector_t *first_bad, int *bad_sectors)
+
+/*
+ * Find the range starts at-or-before 's' from bad table. The search
+ * starts from index 'hint' and stops at index 'hint_end' from the bad
+ * table.
+ */
+static int prev_by_hint(struct badblocks *bb, sector_t s, int hint)
{
- int hi;
- int lo;
+ int hint_end = hint + 2;
u64 *p = bb->page;
- int rv;
- sector_t target = s + sectors;
- unsigned seq;
+ int ret = -1;
- if (bb->shift > 0) {
- /* round the start down, and the end up */
- s >>= bb->shift;
- target += (1<<bb->shift) - 1;
- target >>= bb->shift;
+ while ((hint < hint_end) && ((hint + 1) <= bb->count) &&
+ (BB_OFFSET(p[hint]) <= s)) {
+ if ((hint + 1) == bb->count || BB_OFFSET(p[hint + 1]) > s) {
+ ret = hint;
+ break;
+ }
+ hint++;
+ }
+
+ return ret;
+}
+
+/*
+ * Find the range starts at-or-before bad->start. If 'hint' is provided
+ * (hint >= 0) then search in the bad table from hint firstly. It is
+ * very probably the wanted bad range can be found from the hint index,
+ * then the unnecessary while-loop iteration can be avoided.
+ */
+static int prev_badblocks(struct badblocks *bb, struct badblocks_context *bad,
+ int hint)
+{
+ sector_t s = bad->start;
+ int ret = -1;
+ int lo, hi;
+ u64 *p;
+
+ if (!bb->count)
+ goto out;
+
+ if (hint >= 0) {
+ ret = prev_by_hint(bb, s, hint);
+ if (ret >= 0)
+ goto out;
}
- /* 'target' is now the first block after the bad range */
-retry:
- seq = read_seqbegin(&bb->lock);
lo = 0;
- rv = 0;
hi = bb->count;
+ p = bb->page;
- /* Binary search between lo and hi for 'target'
- * i.e. for the last range that starts before 'target'
- */
- /* INVARIANT: ranges before 'lo' and at-or-after 'hi'
- * are known not to be the last range before target.
- * VARIANT: hi-lo is the number of possible
- * ranges, and decreases until it reaches 1
- */
+ /* The following bisect search might be unnecessary */
+ if (BB_OFFSET(p[lo]) > s)
+ return -1;
+ if (BB_OFFSET(p[hi - 1]) <= s)
+ return hi - 1;
+
+ /* Do bisect search in bad table */
while (hi - lo > 1) {
- int mid = (lo + hi) / 2;
+ int mid = (lo + hi)/2;
sector_t a = BB_OFFSET(p[mid]);
- if (a < target)
- /* This could still be the one, earlier ranges
- * could not.
- */
+ if (a == s) {
+ ret = mid;
+ goto out;
+ }
+
+ if (a < s)
lo = mid;
else
- /* This and later ranges are definitely out. */
hi = mid;
}
- /* 'lo' might be the last that started before target, but 'hi' isn't */
- if (hi > lo) {
- /* need to check all range that end after 's' to see if
- * any are unacknowledged.
+
+ if (BB_OFFSET(p[lo]) <= s)
+ ret = lo;
+out:
+ return ret;
+}
+
+/*
+ * Return 'true' if the range indicated by 'bad' can be backward merged
+ * with the bad range (from the bad table) index by 'behind'.
+ */
+static bool can_merge_behind(struct badblocks *bb,
+ struct badblocks_context *bad, int behind)
+{
+ sector_t sectors = bad->len;
+ sector_t s = bad->start;
+ u64 *p = bb->page;
+
+ if ((s < BB_OFFSET(p[behind])) &&
+ ((s + sectors) >= BB_OFFSET(p[behind])) &&
+ ((BB_END(p[behind]) - s) <= BB_MAX_LEN) &&
+ BB_ACK(p[behind]) == bad->ack)
+ return true;
+ return false;
+}
+
+/*
+ * Do backward merge for range indicated by 'bad' and the bad range
+ * (from the bad table) indexed by 'behind'. The return value is merged
+ * sectors from bad->len.
+ */
+static int behind_merge(struct badblocks *bb, struct badblocks_context *bad,
+ int behind)
+{
+ sector_t sectors = bad->len;
+ sector_t s = bad->start;
+ u64 *p = bb->page;
+ int merged = 0;
+
+ WARN_ON(s >= BB_OFFSET(p[behind]));
+ WARN_ON((s + sectors) < BB_OFFSET(p[behind]));
+
+ if (s < BB_OFFSET(p[behind])) {
+ merged = BB_OFFSET(p[behind]) - s;
+ p[behind] = BB_MAKE(s, BB_LEN(p[behind]) + merged, bad->ack);
+
+ WARN_ON((BB_LEN(p[behind]) + merged) >= BB_MAX_LEN);
+ }
+
+ return merged;
+}
+
+/*
+ * Return 'true' if the range indicated by 'bad' can be forward
+ * merged with the bad range (from the bad table) indexed by 'prev'.
+ */
+static bool can_merge_front(struct badblocks *bb, int prev,
+ struct badblocks_context *bad)
+{
+ sector_t s = bad->start;
+ u64 *p = bb->page;
+
+ if (BB_ACK(p[prev]) == bad->ack &&
+ (s < BB_END(p[prev]) ||
+ (s == BB_END(p[prev]) && (BB_LEN(p[prev]) < BB_MAX_LEN))))
+ return true;
+ return false;
+}
+
+/*
+ * Do forward merge for range indicated by 'bad' and the bad range
+ * (from bad table) indexed by 'prev'. The return value is sectors
+ * merged from bad->len.
+ */
+static int front_merge(struct badblocks *bb, int prev, struct badblocks_context *bad)
+{
+ sector_t sectors = bad->len;
+ sector_t s = bad->start;
+ u64 *p = bb->page;
+ int merged = 0;
+
+ WARN_ON(s > BB_END(p[prev]));
+
+ if (s < BB_END(p[prev])) {
+ merged = min_t(sector_t, sectors, BB_END(p[prev]) - s);
+ } else {
+ merged = min_t(sector_t, sectors, BB_MAX_LEN - BB_LEN(p[prev]));
+ if ((prev + 1) < bb->count &&
+ merged > (BB_OFFSET(p[prev + 1]) - BB_END(p[prev]))) {
+ merged = BB_OFFSET(p[prev + 1]) - BB_END(p[prev]);
+ }
+
+ p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
+ BB_LEN(p[prev]) + merged, bad->ack);
+ }
+
+ return merged;
+}
+
+/*
+ * 'Combine' is a special case which can_merge_front() is not able to
+ * handle: If a bad range (indexed by 'prev' from bad table) exactly
+ * starts as bad->start, and the bad range ahead of 'prev' (indexed by
+ * 'prev - 1' from bad table) exactly ends at where 'prev' starts, and
+ * the sum of their lengths does not exceed BB_MAX_LEN limitation, then
+ * these two bad range (from bad table) can be combined.
+ *
+ * Return 'true' if bad ranges indexed by 'prev' and 'prev - 1' from bad
+ * table can be combined.
+ */
+static bool can_combine_front(struct badblocks *bb, int prev,
+ struct badblocks_context *bad)
+{
+ u64 *p = bb->page;
+
+ if ((prev > 0) &&
+ (BB_OFFSET(p[prev]) == bad->start) &&
+ (BB_END(p[prev - 1]) == BB_OFFSET(p[prev])) &&
+ (BB_LEN(p[prev - 1]) + BB_LEN(p[prev]) <= BB_MAX_LEN) &&
+ (BB_ACK(p[prev - 1]) == BB_ACK(p[prev])))
+ return true;
+ return false;
+}
+
+/*
+ * Combine the bad ranges indexed by 'prev' and 'prev - 1' (from bad
+ * table) into one larger bad range, and the new range is indexed by
+ * 'prev - 1'.
+ * The caller of front_combine() will decrease bb->count, therefore
+ * it is unnecessary to clear p[perv] after front merge.
+ */
+static void front_combine(struct badblocks *bb, int prev)
+{
+ u64 *p = bb->page;
+
+ p[prev - 1] = BB_MAKE(BB_OFFSET(p[prev - 1]),
+ BB_LEN(p[prev - 1]) + BB_LEN(p[prev]),
+ BB_ACK(p[prev]));
+ if ((prev + 1) < bb->count)
+ memmove(p + prev, p + prev + 1, (bb->count - prev - 1) * 8);
+}
+
+/*
+ * Return 'true' if the range indicated by 'bad' is exactly forward
+ * overlapped with the bad range (from bad table) indexed by 'front'.
+ * Exactly forward overlap means the bad range (from bad table) indexed
+ * by 'prev' does not cover the whole range indicated by 'bad'.
+ */
+static bool overlap_front(struct badblocks *bb, int front,
+ struct badblocks_context *bad)
+{
+ u64 *p = bb->page;
+
+ if (bad->start >= BB_OFFSET(p[front]) &&
+ bad->start < BB_END(p[front]))
+ return true;
+ return false;
+}
+
+/*
+ * Return 'true' if the range indicated by 'bad' is exactly backward
+ * overlapped with the bad range (from bad table) indexed by 'behind'.
+ */
+static bool overlap_behind(struct badblocks *bb, struct badblocks_context *bad,
+ int behind)
+{
+ u64 *p = bb->page;
+
+ if (bad->start < BB_OFFSET(p[behind]) &&
+ (bad->start + bad->len) > BB_OFFSET(p[behind]))
+ return true;
+ return false;
+}
+
+/*
+ * Return 'true' if the range indicated by 'bad' can overwrite the bad
+ * range (from bad table) indexed by 'prev'.
+ *
+ * The range indicated by 'bad' can overwrite the bad range indexed by
+ * 'prev' when,
+ * 1) The whole range indicated by 'bad' can cover partial or whole bad
+ * range (from bad table) indexed by 'prev'.
+ * 2) The ack value of 'bad' is larger or equal to the ack value of bad
+ * range 'prev'.
+ *
+ * If the overwriting doesn't cover the whole bad range (from bad table)
+ * indexed by 'prev', new range might be split from existing bad range,
+ * 1) The overwrite covers head or tail part of existing bad range, 1
+ * extra bad range will be split and added into the bad table.
+ * 2) The overwrite covers middle of existing bad range, 2 extra bad
+ * ranges will be split (ahead and after the overwritten range) and
+ * added into the bad table.
+ * The number of extra split ranges of the overwriting is stored in
+ * 'extra' and returned for the caller.
+ */
+static bool can_front_overwrite(struct badblocks *bb, int prev,
+ struct badblocks_context *bad, int *extra)
+{
+ u64 *p = bb->page;
+ int len;
+
+ WARN_ON(!overlap_front(bb, prev, bad));
+
+ if (BB_ACK(p[prev]) >= bad->ack)
+ return false;
+
+ if (BB_END(p[prev]) <= (bad->start + bad->len)) {
+ len = BB_END(p[prev]) - bad->start;
+ if (BB_OFFSET(p[prev]) == bad->start)
+ *extra = 0;
+ else
+ *extra = 1;
+
+ bad->len = len;
+ } else {
+ if (BB_OFFSET(p[prev]) == bad->start)
+ *extra = 1;
+ else
+ /*
+ * prev range will be split into two, beside the overwritten
+ * one, an extra slot needed from bad table.
*/
- while (lo >= 0 &&
- BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
- if (BB_OFFSET(p[lo]) < target) {
- /* starts before the end, and finishes after
- * the start, so they must overlap
- */
- if (rv != -1 && BB_ACK(p[lo]))
- rv = 1;
- else
- rv = -1;
- *first_bad = BB_OFFSET(p[lo]);
- *bad_sectors = BB_LEN(p[lo]);
- }
- lo--;
+ *extra = 2;
+ }
+
+ if ((bb->count + (*extra)) >= MAX_BADBLOCKS)
+ return false;
+
+ return true;
+}
+
+/*
+ * Do the overwrite from the range indicated by 'bad' to the bad range
+ * (from bad table) indexed by 'prev'.
+ * The previously called can_front_overwrite() will provide how many
+ * extra bad range(s) might be split and added into the bad table. All
+ * the splitting cases in the bad table will be handled here.
+ */
+static int front_overwrite(struct badblocks *bb, int prev,
+ struct badblocks_context *bad, int extra)
+{
+ u64 *p = bb->page;
+ sector_t orig_end = BB_END(p[prev]);
+ int orig_ack = BB_ACK(p[prev]);
+
+ switch (extra) {
+ case 0:
+ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), BB_LEN(p[prev]),
+ bad->ack);
+ break;
+ case 1:
+ if (BB_OFFSET(p[prev]) == bad->start) {
+ p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
+ bad->len, bad->ack);
+ memmove(p + prev + 2, p + prev + 1,
+ (bb->count - prev - 1) * 8);
+ p[prev + 1] = BB_MAKE(bad->start + bad->len,
+ orig_end - BB_END(p[prev]),
+ orig_ack);
+ } else {
+ p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
+ bad->start - BB_OFFSET(p[prev]),
+ orig_ack);
+ /*
+ * prev +2 -> prev + 1 + 1, which is for,
+ * 1) prev + 1: the slot index of the previous one
+ * 2) + 1: one more slot for extra being 1.
+ */
+ memmove(p + prev + 2, p + prev + 1,
+ (bb->count - prev - 1) * 8);
+ p[prev + 1] = BB_MAKE(bad->start, bad->len, bad->ack);
}
+ break;
+ case 2:
+ p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
+ bad->start - BB_OFFSET(p[prev]),
+ orig_ack);
+ /*
+ * prev + 3 -> prev + 1 + 2, which is for,
+ * 1) prev + 1: the slot index of the previous one
+ * 2) + 2: two more slots for extra being 2.
+ */
+ memmove(p + prev + 3, p + prev + 1,
+ (bb->count - prev - 1) * 8);
+ p[prev + 1] = BB_MAKE(bad->start, bad->len, bad->ack);
+ p[prev + 2] = BB_MAKE(BB_END(p[prev + 1]),
+ orig_end - BB_END(p[prev + 1]),
+ orig_ack);
+ break;
+ default:
+ break;
}
- if (read_seqretry(&bb->lock, seq))
- goto retry;
+ return bad->len;
+}
- return rv;
+/*
+ * Explicitly insert a range indicated by 'bad' to the bad table, where
+ * the location is indexed by 'at'.
+ */
+static int insert_at(struct badblocks *bb, int at, struct badblocks_context *bad)
+{
+ u64 *p = bb->page;
+ int len;
+
+ WARN_ON(badblocks_full(bb));
+
+ len = min_t(sector_t, bad->len, BB_MAX_LEN);
+ if (at < bb->count)
+ memmove(p + at + 1, p + at, (bb->count - at) * 8);
+ p[at] = BB_MAKE(bad->start, len, bad->ack);
+
+ return len;
}
-EXPORT_SYMBOL_GPL(badblocks_check);
static void badblocks_update_acked(struct badblocks *bb)
{
+ bool unacked = false;
u64 *p = bb->page;
int i;
- bool unacked = false;
if (!bb->unacked_exist)
return;
@@ -144,281 +855,602 @@ static void badblocks_update_acked(struct badblocks *bb)
bb->unacked_exist = 0;
}
-/**
- * badblocks_set() - Add a range of bad blocks to the table.
- * @bb: the badblocks structure that holds all badblock information
- * @s: first sector to mark as bad
- * @sectors: number of sectors to mark as bad
- * @acknowledged: weather to mark the bad sectors as acknowledged
- *
- * This might extend the table, or might contract it if two adjacent ranges
- * can be merged. We binary-search to find the 'insertion' point, then
- * decide how best to handle it.
- *
- * Return:
- * 0: success
- * 1: failed to set badblocks (out of space)
- */
-int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
- int acknowledged)
+/* Do exact work to set bad block range into the bad block table */
+static int _badblocks_set(struct badblocks *bb, sector_t s, int sectors,
+ int acknowledged)
{
- u64 *p;
- int lo, hi;
- int rv = 0;
+ int retried = 0, space_desired = 0;
+ int orig_len, len = 0, added = 0;
+ struct badblocks_context bad;
+ int prev = -1, hint = -1;
+ sector_t orig_start;
unsigned long flags;
+ int rv = 0;
+ u64 *p;
if (bb->shift < 0)
/* badblocks are disabled */
return 1;
+ if (sectors == 0)
+ /* Invalid sectors number */
+ return 1;
+
if (bb->shift) {
/* round the start down, and the end up */
sector_t next = s + sectors;
- s >>= bb->shift;
- next += (1<<bb->shift) - 1;
- next >>= bb->shift;
+ rounddown(s, bb->shift);
+ roundup(next, bb->shift);
sectors = next - s;
}
write_seqlock_irqsave(&bb->lock, flags);
+ orig_start = s;
+ orig_len = sectors;
+ bad.ack = acknowledged;
p = bb->page;
- lo = 0;
- hi = bb->count;
- /* Find the last range that starts at-or-before 's' */
- while (hi - lo > 1) {
- int mid = (lo + hi) / 2;
- sector_t a = BB_OFFSET(p[mid]);
- if (a <= s)
- lo = mid;
- else
- hi = mid;
+re_insert:
+ bad.start = s;
+ bad.len = sectors;
+ len = 0;
+
+ if (badblocks_empty(bb)) {
+ len = insert_at(bb, 0, &bad);
+ bb->count++;
+ added++;
+ goto update_sectors;
}
- if (hi > lo && BB_OFFSET(p[lo]) > s)
- hi = lo;
- if (hi > lo) {
- /* we found a range that might merge with the start
- * of our new range
- */
- sector_t a = BB_OFFSET(p[lo]);
- sector_t e = a + BB_LEN(p[lo]);
- int ack = BB_ACK(p[lo]);
-
- if (e >= s) {
- /* Yes, we can merge with a previous range */
- if (s == a && s + sectors >= e)
- /* new range covers old */
- ack = acknowledged;
- else
- ack = ack && acknowledged;
-
- if (e < s + sectors)
- e = s + sectors;
- if (e - a <= BB_MAX_LEN) {
- p[lo] = BB_MAKE(a, e-a, ack);
- s = e;
+ prev = prev_badblocks(bb, &bad, hint);
+
+ /* start before all badblocks */
+ if (prev < 0) {
+ if (!badblocks_full(bb)) {
+ /* insert on the first */
+ if (bad.len > (BB_OFFSET(p[0]) - bad.start))
+ bad.len = BB_OFFSET(p[0]) - bad.start;
+ len = insert_at(bb, 0, &bad);
+ bb->count++;
+ added++;
+ hint = 0;
+ goto update_sectors;
+ }
+
+ /* No sapce, try to merge */
+ if (overlap_behind(bb, &bad, 0)) {
+ if (can_merge_behind(bb, &bad, 0)) {
+ len = behind_merge(bb, &bad, 0);
+ added++;
} else {
- /* does not all fit in one range,
- * make p[lo] maximal
- */
- if (BB_LEN(p[lo]) != BB_MAX_LEN)
- p[lo] = BB_MAKE(a, BB_MAX_LEN, ack);
- s = a + BB_MAX_LEN;
+ len = BB_OFFSET(p[0]) - s;
+ space_desired = 1;
}
- sectors = e - s;
+ hint = 0;
+ goto update_sectors;
}
+
+ /* no table space and give up */
+ goto out;
}
- if (sectors && hi < bb->count) {
- /* 'hi' points to the first range that starts after 's'.
- * Maybe we can merge with the start of that range
- */
- sector_t a = BB_OFFSET(p[hi]);
- sector_t e = a + BB_LEN(p[hi]);
- int ack = BB_ACK(p[hi]);
-
- if (a <= s + sectors) {
- /* merging is possible */
- if (e <= s + sectors) {
- /* full overlap */
- e = s + sectors;
- ack = acknowledged;
- } else
- ack = ack && acknowledged;
-
- a = s;
- if (e - a <= BB_MAX_LEN) {
- p[hi] = BB_MAKE(a, e-a, ack);
- s = e;
- } else {
- p[hi] = BB_MAKE(a, BB_MAX_LEN, ack);
- s = a + BB_MAX_LEN;
+
+ /* in case p[prev-1] can be merged with p[prev] */
+ if (can_combine_front(bb, prev, &bad)) {
+ front_combine(bb, prev);
+ bb->count--;
+ added++;
+ hint = prev;
+ goto update_sectors;
+ }
+
+ if (overlap_front(bb, prev, &bad)) {
+ if (can_merge_front(bb, prev, &bad)) {
+ len = front_merge(bb, prev, &bad);
+ added++;
+ } else {
+ int extra = 0;
+
+ if (!can_front_overwrite(bb, prev, &bad, &extra)) {
+ len = min_t(sector_t,
+ BB_END(p[prev]) - s, sectors);
+ hint = prev;
+ goto update_sectors;
+ }
+
+ len = front_overwrite(bb, prev, &bad, extra);
+ added++;
+ bb->count += extra;
+
+ if (can_combine_front(bb, prev, &bad)) {
+ front_combine(bb, prev);
+ bb->count--;
}
- sectors = e - s;
- lo = hi;
- hi++;
}
+ hint = prev;
+ goto update_sectors;
+ }
+
+ if (can_merge_front(bb, prev, &bad)) {
+ len = front_merge(bb, prev, &bad);
+ added++;
+ hint = prev;
+ goto update_sectors;
}
- if (sectors == 0 && hi < bb->count) {
- /* we might be able to combine lo and hi */
- /* Note: 's' is at the end of 'lo' */
- sector_t a = BB_OFFSET(p[hi]);
- int lolen = BB_LEN(p[lo]);
- int hilen = BB_LEN(p[hi]);
- int newlen = lolen + hilen - (s - a);
-
- if (s >= a && newlen < BB_MAX_LEN) {
- /* yes, we can combine them */
- int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]);
-
- p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack);
- memmove(p + hi, p + hi + 1,
- (bb->count - hi - 1) * 8);
- bb->count--;
+
+ /* if no space in table, still try to merge in the covered range */
+ if (badblocks_full(bb)) {
+ /* skip the cannot-merge range */
+ if (((prev + 1) < bb->count) &&
+ overlap_behind(bb, &bad, prev + 1) &&
+ ((s + sectors) >= BB_END(p[prev + 1]))) {
+ len = BB_END(p[prev + 1]) - s;
+ hint = prev + 1;
+ goto update_sectors;
}
+
+ /* no retry any more */
+ len = sectors;
+ space_desired = 1;
+ hint = -1;
+ goto update_sectors;
}
- while (sectors) {
- /* didn't merge (it all).
- * Need to add a range just before 'hi'
- */
- if (bb->count >= MAX_BADBLOCKS) {
- /* No room for more */
- rv = 1;
- break;
- } else {
- int this_sectors = sectors;
- memmove(p + hi + 1, p + hi,
- (bb->count - hi) * 8);
- bb->count++;
+ /* cannot merge and there is space in bad table */
+ if ((prev + 1) < bb->count &&
+ overlap_behind(bb, &bad, prev + 1))
+ bad.len = min_t(sector_t,
+ bad.len, BB_OFFSET(p[prev + 1]) - bad.start);
- if (this_sectors > BB_MAX_LEN)
- this_sectors = BB_MAX_LEN;
- p[hi] = BB_MAKE(s, this_sectors, acknowledged);
- sectors -= this_sectors;
- s += this_sectors;
- }
+ len = insert_at(bb, prev + 1, &bad);
+ bb->count++;
+ added++;
+ hint = prev + 1;
+
+update_sectors:
+ s += len;
+ sectors -= len;
+
+ if (sectors > 0)
+ goto re_insert;
+
+ WARN_ON(sectors < 0);
+
+ /*
+ * Check whether the following already set range can be
+ * merged. (prev < 0) condition is not handled here,
+ * because it's already complicated enough.
+ */
+ if (prev >= 0 &&
+ (prev + 1) < bb->count &&
+ BB_END(p[prev]) == BB_OFFSET(p[prev + 1]) &&
+ (BB_LEN(p[prev]) + BB_LEN(p[prev + 1])) <= BB_MAX_LEN &&
+ BB_ACK(p[prev]) == BB_ACK(p[prev + 1])) {
+ p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
+ BB_LEN(p[prev]) + BB_LEN(p[prev + 1]),
+ BB_ACK(p[prev]));
+
+ if ((prev + 2) < bb->count)
+ memmove(p + prev + 1, p + prev + 2,
+ (bb->count - (prev + 2)) * 8);
+ bb->count--;
+ }
+
+ if (space_desired && !badblocks_full(bb)) {
+ s = orig_start;
+ sectors = orig_len;
+ space_desired = 0;
+ if (retried++ < 3)
+ goto re_insert;
+ }
+
+out:
+ if (added) {
+ set_changed(bb);
+
+ if (!acknowledged)
+ bb->unacked_exist = 1;
+ else
+ badblocks_update_acked(bb);
}
- bb->changed = 1;
- if (!acknowledged)
- bb->unacked_exist = 1;
- else
- badblocks_update_acked(bb);
write_sequnlock_irqrestore(&bb->lock, flags);
+ if (!added)
+ rv = 1;
+
return rv;
}
-EXPORT_SYMBOL_GPL(badblocks_set);
-/**
- * badblocks_clear() - Remove a range of bad blocks to the table.
- * @bb: the badblocks structure that holds all badblock information
- * @s: first sector to mark as bad
- * @sectors: number of sectors to mark as bad
- *
- * This may involve extending the table if we spilt a region,
- * but it must not fail. So if the table becomes full, we just
- * drop the remove request.
- *
- * Return:
- * 0: success
- * 1: failed to clear badblocks
+/*
+ * Clear the bad block range from bad block table which is front overlapped
+ * with the clearing range. The return value is how many sectors from an
+ * already set bad block range are cleared. If the whole bad block range is
+ * covered by the clearing range and fully cleared, 'delete' is set as 1 for
+ * the caller to reduce bb->count.
*/
-int badblocks_clear(struct badblocks *bb, sector_t s, int sectors)
+static int front_clear(struct badblocks *bb, int prev,
+ struct badblocks_context *bad, int *deleted)
{
- u64 *p;
- int lo, hi;
- sector_t target = s + sectors;
+ sector_t sectors = bad->len;
+ sector_t s = bad->start;
+ u64 *p = bb->page;
+ int cleared = 0;
+
+ *deleted = 0;
+ if (s == BB_OFFSET(p[prev])) {
+ if (BB_LEN(p[prev]) > sectors) {
+ p[prev] = BB_MAKE(BB_OFFSET(p[prev]) + sectors,
+ BB_LEN(p[prev]) - sectors,
+ BB_ACK(p[prev]));
+ cleared = sectors;
+ } else {
+ /* BB_LEN(p[prev]) <= sectors */
+ cleared = BB_LEN(p[prev]);
+ if ((prev + 1) < bb->count)
+ memmove(p + prev, p + prev + 1,
+ (bb->count - prev - 1) * 8);
+ *deleted = 1;
+ }
+ } else if (s > BB_OFFSET(p[prev])) {
+ if (BB_END(p[prev]) <= (s + sectors)) {
+ cleared = BB_END(p[prev]) - s;
+ p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
+ s - BB_OFFSET(p[prev]),
+ BB_ACK(p[prev]));
+ } else {
+ /* Splitting is handled in front_splitting_clear() */
+ BUG();
+ }
+ }
+
+ return cleared;
+}
+
+/*
+ * Handle the condition that the clearing range hits middle of an already set
+ * bad block range from bad block table. In this condition the existing bad
+ * block range is split into two after the middle part is cleared.
+ */
+static int front_splitting_clear(struct badblocks *bb, int prev,
+ struct badblocks_context *bad)
+{
+ u64 *p = bb->page;
+ u64 end = BB_END(p[prev]);
+ int ack = BB_ACK(p[prev]);
+ sector_t sectors = bad->len;
+ sector_t s = bad->start;
+
+ p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
+ s - BB_OFFSET(p[prev]),
+ ack);
+ memmove(p + prev + 2, p + prev + 1, (bb->count - prev - 1) * 8);
+ p[prev + 1] = BB_MAKE(s + sectors, end - s - sectors, ack);
+ return sectors;
+}
+
+/* Do the exact work to clear bad block range from the bad block table */
+static int _badblocks_clear(struct badblocks *bb, sector_t s, int sectors)
+{
+ struct badblocks_context bad;
+ int prev = -1, hint = -1;
+ int len = 0, cleared = 0;
int rv = 0;
+ u64 *p;
+
+ if (bb->shift < 0)
+ /* badblocks are disabled */
+ return 1;
+
+ if (sectors == 0)
+ /* Invalid sectors number */
+ return 1;
+
+ if (bb->shift) {
+ sector_t target;
- if (bb->shift > 0) {
/* When clearing we round the start up and the end down.
* This should not matter as the shift should align with
* the block size and no rounding should ever be needed.
* However it is better the think a block is bad when it
* isn't than to think a block is not bad when it is.
*/
- s += (1<<bb->shift) - 1;
- s >>= bb->shift;
- target >>= bb->shift;
+ target = s + sectors;
+ roundup(s, bb->shift);
+ rounddown(target, bb->shift);
+ sectors = target - s;
}
write_seqlock_irq(&bb->lock);
+ bad.ack = true;
p = bb->page;
- lo = 0;
- hi = bb->count;
- /* Find the last range that starts before 'target' */
- while (hi - lo > 1) {
- int mid = (lo + hi) / 2;
- sector_t a = BB_OFFSET(p[mid]);
- if (a < target)
- lo = mid;
- else
- hi = mid;
+re_clear:
+ bad.start = s;
+ bad.len = sectors;
+
+ if (badblocks_empty(bb)) {
+ len = sectors;
+ cleared++;
+ goto update_sectors;
}
- if (hi > lo) {
- /* p[lo] is the last range that could overlap the
- * current range. Earlier ranges could also overlap,
- * but only this one can overlap the end of the range.
- */
- if ((BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) &&
- (BB_OFFSET(p[lo]) < target)) {
- /* Partial overlap, leave the tail of this range */
- int ack = BB_ACK(p[lo]);
- sector_t a = BB_OFFSET(p[lo]);
- sector_t end = a + BB_LEN(p[lo]);
-
- if (a < s) {
- /* we need to split this range */
- if (bb->count >= MAX_BADBLOCKS) {
- rv = -ENOSPC;
- goto out;
- }
- memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
- bb->count++;
- p[lo] = BB_MAKE(a, s-a, ack);
- lo++;
- }
- p[lo] = BB_MAKE(target, end - target, ack);
- /* there is no longer an overlap */
- hi = lo;
- lo--;
- }
- while (lo >= 0 &&
- (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) &&
- (BB_OFFSET(p[lo]) < target)) {
- /* This range does overlap */
- if (BB_OFFSET(p[lo]) < s) {
- /* Keep the early parts of this range. */
- int ack = BB_ACK(p[lo]);
- sector_t start = BB_OFFSET(p[lo]);
-
- p[lo] = BB_MAKE(start, s - start, ack);
- /* now low doesn't overlap, so.. */
- break;
- }
- lo--;
+
+
+ prev = prev_badblocks(bb, &bad, hint);
+
+ /* Start before all badblocks */
+ if (prev < 0) {
+ if (overlap_behind(bb, &bad, 0)) {
+ len = BB_OFFSET(p[0]) - s;
+ hint = 0;
+ } else {
+ len = sectors;
}
- /* 'lo' is strictly before, 'hi' is strictly after,
- * anything between needs to be discarded
+ /*
+ * Both situations are to clear non-bad range,
+ * should be treated as successful
*/
- if (hi - lo > 1) {
- memmove(p+lo+1, p+hi, (bb->count - hi) * 8);
- bb->count -= (hi - lo - 1);
+ cleared++;
+ goto update_sectors;
+ }
+
+ /* Start after all badblocks */
+ if ((prev + 1) >= bb->count && !overlap_front(bb, prev, &bad)) {
+ len = sectors;
+ cleared++;
+ goto update_sectors;
+ }
+
+ /* Clear will split a bad record but the table is full */
+ if (badblocks_full(bb) && (BB_OFFSET(p[prev]) < bad.start) &&
+ (BB_END(p[prev]) > (bad.start + sectors))) {
+ len = sectors;
+ goto update_sectors;
+ }
+
+ if (overlap_front(bb, prev, &bad)) {
+ if ((BB_OFFSET(p[prev]) < bad.start) &&
+ (BB_END(p[prev]) > (bad.start + bad.len))) {
+ /* Splitting */
+ if ((bb->count + 1) < MAX_BADBLOCKS) {
+ len = front_splitting_clear(bb, prev, &bad);
+ bb->count += 1;
+ cleared++;
+ } else {
+ /* No space to split, give up */
+ len = sectors;
+ }
+ } else {
+ int deleted = 0;
+
+ len = front_clear(bb, prev, &bad, &deleted);
+ bb->count -= deleted;
+ cleared++;
+ hint = prev;
}
+
+ goto update_sectors;
+ }
+
+ /* Not front overlap, but behind overlap */
+ if ((prev + 1) < bb->count && overlap_behind(bb, &bad, prev + 1)) {
+ len = BB_OFFSET(p[prev + 1]) - bad.start;
+ hint = prev + 1;
+ /* Clear non-bad range should be treated as successful */
+ cleared++;
+ goto update_sectors;
+ }
+
+ /* Not cover any badblocks range in the table */
+ len = sectors;
+ /* Clear non-bad range should be treated as successful */
+ cleared++;
+
+update_sectors:
+ s += len;
+ sectors -= len;
+
+ if (sectors > 0)
+ goto re_clear;
+
+ WARN_ON(sectors < 0);
+
+ if (cleared) {
+ badblocks_update_acked(bb);
+ set_changed(bb);
}
- badblocks_update_acked(bb);
- bb->changed = 1;
-out:
write_sequnlock_irq(&bb->lock);
+
+ if (!cleared)
+ rv = 1;
+
return rv;
}
+
+/* Do the exact work to check bad blocks range from the bad block table */
+static int _badblocks_check(struct badblocks *bb, sector_t s, int sectors,
+ sector_t *first_bad, int *bad_sectors)
+{
+ int unacked_badblocks, acked_badblocks;
+ int prev = -1, hint = -1, set = 0;
+ struct badblocks_context bad;
+ unsigned int seq;
+ int len, rv;
+ u64 *p;
+
+ WARN_ON(bb->shift < 0 || sectors == 0);
+
+ if (bb->shift > 0) {
+ sector_t target;
+
+ /* round the start down, and the end up */
+ target = s + sectors;
+ rounddown(s, bb->shift);
+ roundup(target, bb->shift);
+ sectors = target - s;
+ }
+
+retry:
+ seq = read_seqbegin(&bb->lock);
+
+ p = bb->page;
+ unacked_badblocks = 0;
+ acked_badblocks = 0;
+
+re_check:
+ bad.start = s;
+ bad.len = sectors;
+
+ if (badblocks_empty(bb)) {
+ len = sectors;
+ goto update_sectors;
+ }
+
+ prev = prev_badblocks(bb, &bad, hint);
+
+ /* start after all badblocks */
+ if ((prev >= 0) &&
+ ((prev + 1) >= bb->count) && !overlap_front(bb, prev, &bad)) {
+ len = sectors;
+ goto update_sectors;
+ }
+
+ /* Overlapped with front badblocks record */
+ if ((prev >= 0) && overlap_front(bb, prev, &bad)) {
+ if (BB_ACK(p[prev]))
+ acked_badblocks++;
+ else
+ unacked_badblocks++;
+
+ if (BB_END(p[prev]) >= (s + sectors))
+ len = sectors;
+ else
+ len = BB_END(p[prev]) - s;
+
+ if (set == 0) {
+ *first_bad = BB_OFFSET(p[prev]);
+ *bad_sectors = BB_LEN(p[prev]);
+ set = 1;
+ }
+ goto update_sectors;
+ }
+
+ /* Not front overlap, but behind overlap */
+ if ((prev + 1) < bb->count && overlap_behind(bb, &bad, prev + 1)) {
+ len = BB_OFFSET(p[prev + 1]) - bad.start;
+ hint = prev + 1;
+ goto update_sectors;
+ }
+
+ /* not cover any badblocks range in the table */
+ len = sectors;
+
+update_sectors:
+ s += len;
+ sectors -= len;
+
+ if (sectors > 0)
+ goto re_check;
+
+ WARN_ON(sectors < 0);
+
+ if (unacked_badblocks > 0)
+ rv = -1;
+ else if (acked_badblocks > 0)
+ rv = 1;
+ else
+ rv = 0;
+
+ if (read_seqretry(&bb->lock, seq))
+ goto retry;
+
+ return rv;
+}
+
+/**
+ * badblocks_check() - check a given range for bad sectors
+ * @bb: the badblocks structure that holds all badblock information
+ * @s: sector (start) at which to check for badblocks
+ * @sectors: number of sectors to check for badblocks
+ * @first_bad: pointer to store location of the first badblock
+ * @bad_sectors: pointer to store number of badblocks after @first_bad
+ *
+ * We can record which blocks on each device are 'bad' and so just
+ * fail those blocks, or that stripe, rather than the whole device.
+ * Entries in the bad-block table are 64bits wide. This comprises:
+ * Length of bad-range, in sectors: 0-511 for lengths 1-512
+ * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes)
+ * A 'shift' can be set so that larger blocks are tracked and
+ * consequently larger devices can be covered.
+ * 'Acknowledged' flag - 1 bit. - the most significant bit.
+ *
+ * Locking of the bad-block table uses a seqlock so badblocks_check
+ * might need to retry if it is very unlucky.
+ * We will sometimes want to check for bad blocks in a bi_end_io function,
+ * so we use the write_seqlock_irq variant.
+ *
+ * When looking for a bad block we specify a range and want to
+ * know if any block in the range is bad. So we binary-search
+ * to the last range that starts at-or-before the given endpoint,
+ * (or "before the sector after the target range")
+ * then see if it ends after the given start.
+ *
+ * Return:
+ * 0: there are no known bad blocks in the range
+ * 1: there are known bad block which are all acknowledged
+ * -1: there are bad blocks which have not yet been acknowledged in metadata.
+ * plus the start/length of the first bad section we overlap.
+ */
+int badblocks_check(struct badblocks *bb, sector_t s, int sectors,
+ sector_t *first_bad, int *bad_sectors)
+{
+ return _badblocks_check(bb, s, sectors, first_bad, bad_sectors);
+}
+EXPORT_SYMBOL_GPL(badblocks_check);
+
+/**
+ * badblocks_set() - Add a range of bad blocks to the table.
+ * @bb: the badblocks structure that holds all badblock information
+ * @s: first sector to mark as bad
+ * @sectors: number of sectors to mark as bad
+ * @acknowledged: weather to mark the bad sectors as acknowledged
+ *
+ * This might extend the table, or might contract it if two adjacent ranges
+ * can be merged. We binary-search to find the 'insertion' point, then
+ * decide how best to handle it.
+ *
+ * Return:
+ * 0: success
+ * 1: failed to set badblocks (out of space)
+ */
+int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
+ int acknowledged)
+{
+ return _badblocks_set(bb, s, sectors, acknowledged);
+}
+EXPORT_SYMBOL_GPL(badblocks_set);
+
+/**
+ * badblocks_clear() - Remove a range of bad blocks to the table.
+ * @bb: the badblocks structure that holds all badblock information
+ * @s: first sector to mark as bad
+ * @sectors: number of sectors to mark as bad
+ *
+ * This may involve extending the table if we spilt a region,
+ * but it must not fail. So if the table becomes full, we just
+ * drop the remove request.
+ *
+ * Return:
+ * 0: success
+ * 1: failed to clear badblocks
+ */
+int badblocks_clear(struct badblocks *bb, sector_t s, int sectors)
+{
+ return _badblocks_clear(bb, s, sectors);
+}
EXPORT_SYMBOL_GPL(badblocks_clear);
/**
diff --git a/block/bdev.c b/block/bdev.c
index 04dba25b0..812872cdc 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -30,6 +30,9 @@
#include "../fs/internal.h"
#include "blk.h"
+/* Should we allow writing to mounted block devices? */
+static bool bdev_allow_write_mounted = IS_ENABLED(CONFIG_BLK_DEV_WRITE_MOUNTED);
+
struct bdev_inode {
struct block_device bdev;
struct inode vfs_inode;
@@ -207,92 +210,95 @@ int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend)
EXPORT_SYMBOL(sync_blockdev_range);
/**
- * freeze_bdev - lock a filesystem and force it into a consistent state
+ * bdev_freeze - lock a filesystem and force it into a consistent state
* @bdev: blockdevice to lock
*
* If a superblock is found on this device, we take the s_umount semaphore
* on it to make sure nobody unmounts until the snapshot creation is done.
* The reference counter (bd_fsfreeze_count) guarantees that only the last
* unfreeze process can unfreeze the frozen filesystem actually when multiple
- * freeze requests arrive simultaneously. It counts up in freeze_bdev() and
- * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze
+ * freeze requests arrive simultaneously. It counts up in bdev_freeze() and
+ * count down in bdev_thaw(). When it becomes 0, thaw_bdev() will unfreeze
* actually.
+ *
+ * Return: On success zero is returned, negative error code on failure.
*/
-int freeze_bdev(struct block_device *bdev)
+int bdev_freeze(struct block_device *bdev)
{
- struct super_block *sb;
int error = 0;
mutex_lock(&bdev->bd_fsfreeze_mutex);
- if (++bdev->bd_fsfreeze_count > 1)
- goto done;
-
- sb = get_active_super(bdev);
- if (!sb)
- goto sync;
- if (sb->s_op->freeze_super)
- error = sb->s_op->freeze_super(sb, FREEZE_HOLDER_USERSPACE);
- else
- error = freeze_super(sb, FREEZE_HOLDER_USERSPACE);
- deactivate_super(sb);
- if (error) {
- bdev->bd_fsfreeze_count--;
- goto done;
+ if (atomic_inc_return(&bdev->bd_fsfreeze_count) > 1) {
+ mutex_unlock(&bdev->bd_fsfreeze_mutex);
+ return 0;
}
- bdev->bd_fsfreeze_sb = sb;
-sync:
- sync_blockdev(bdev);
-done:
+ mutex_lock(&bdev->bd_holder_lock);
+ if (bdev->bd_holder_ops && bdev->bd_holder_ops->freeze) {
+ error = bdev->bd_holder_ops->freeze(bdev);
+ lockdep_assert_not_held(&bdev->bd_holder_lock);
+ } else {
+ mutex_unlock(&bdev->bd_holder_lock);
+ error = sync_blockdev(bdev);
+ }
+
+ if (error)
+ atomic_dec(&bdev->bd_fsfreeze_count);
+
mutex_unlock(&bdev->bd_fsfreeze_mutex);
return error;
}
-EXPORT_SYMBOL(freeze_bdev);
+EXPORT_SYMBOL(bdev_freeze);
/**
- * thaw_bdev - unlock filesystem
+ * bdev_thaw - unlock filesystem
* @bdev: blockdevice to unlock
*
- * Unlocks the filesystem and marks it writeable again after freeze_bdev().
+ * Unlocks the filesystem and marks it writeable again after bdev_freeze().
+ *
+ * Return: On success zero is returned, negative error code on failure.
*/
-int thaw_bdev(struct block_device *bdev)
+int bdev_thaw(struct block_device *bdev)
{
- struct super_block *sb;
- int error = -EINVAL;
+ int error = -EINVAL, nr_freeze;
mutex_lock(&bdev->bd_fsfreeze_mutex);
- if (!bdev->bd_fsfreeze_count)
+
+ /*
+ * If this returns < 0 it means that @bd_fsfreeze_count was
+ * already 0 and no decrement was performed.
+ */
+ nr_freeze = atomic_dec_if_positive(&bdev->bd_fsfreeze_count);
+ if (nr_freeze < 0)
goto out;
error = 0;
- if (--bdev->bd_fsfreeze_count > 0)
+ if (nr_freeze > 0)
goto out;
- sb = bdev->bd_fsfreeze_sb;
- if (!sb)
- goto out;
+ mutex_lock(&bdev->bd_holder_lock);
+ if (bdev->bd_holder_ops && bdev->bd_holder_ops->thaw) {
+ error = bdev->bd_holder_ops->thaw(bdev);
+ lockdep_assert_not_held(&bdev->bd_holder_lock);
+ } else {
+ mutex_unlock(&bdev->bd_holder_lock);
+ }
- if (sb->s_op->thaw_super)
- error = sb->s_op->thaw_super(sb, FREEZE_HOLDER_USERSPACE);
- else
- error = thaw_super(sb, FREEZE_HOLDER_USERSPACE);
if (error)
- bdev->bd_fsfreeze_count++;
- else
- bdev->bd_fsfreeze_sb = NULL;
+ atomic_inc(&bdev->bd_fsfreeze_count);
out:
mutex_unlock(&bdev->bd_fsfreeze_mutex);
return error;
}
-EXPORT_SYMBOL(thaw_bdev);
+EXPORT_SYMBOL(bdev_thaw);
/*
* pseudo-fs
*/
static __cacheline_aligned_in_smp DEFINE_MUTEX(bdev_lock);
-static struct kmem_cache * bdev_cachep __read_mostly;
+static struct kmem_cache *bdev_cachep __ro_after_init;
static struct inode *bdev_alloc_inode(struct super_block *sb)
{
@@ -361,13 +367,13 @@ static struct file_system_type bd_type = {
.kill_sb = kill_anon_super,
};
-struct super_block *blockdev_superblock __read_mostly;
+struct super_block *blockdev_superblock __ro_after_init;
EXPORT_SYMBOL_GPL(blockdev_superblock);
void __init bdev_cache_init(void)
{
int err;
- static struct vfsmount *bd_mnt;
+ static struct vfsmount *bd_mnt __ro_after_init;
bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
@@ -633,6 +639,14 @@ static void blkdev_flush_mapping(struct block_device *bdev)
bdev_write_inode(bdev);
}
+static void blkdev_put_whole(struct block_device *bdev)
+{
+ if (atomic_dec_and_test(&bdev->bd_openers))
+ blkdev_flush_mapping(bdev);
+ if (bdev->bd_disk->fops->release)
+ bdev->bd_disk->fops->release(bdev->bd_disk);
+}
+
static int blkdev_get_whole(struct block_device *bdev, blk_mode_t mode)
{
struct gendisk *disk = bdev->bd_disk;
@@ -651,20 +665,21 @@ static int blkdev_get_whole(struct block_device *bdev, blk_mode_t mode)
if (!atomic_read(&bdev->bd_openers))
set_init_blocksize(bdev);
- if (test_bit(GD_NEED_PART_SCAN, &disk->state))
- bdev_disk_changed(disk, false);
atomic_inc(&bdev->bd_openers);
+ if (test_bit(GD_NEED_PART_SCAN, &disk->state)) {
+ /*
+ * Only return scanning errors if we are called from contexts
+ * that explicitly want them, e.g. the BLKRRPART ioctl.
+ */
+ ret = bdev_disk_changed(disk, false);
+ if (ret && (mode & BLK_OPEN_STRICT_SCAN)) {
+ blkdev_put_whole(bdev);
+ return ret;
+ }
+ }
return 0;
}
-static void blkdev_put_whole(struct block_device *bdev)
-{
- if (atomic_dec_and_test(&bdev->bd_openers))
- blkdev_flush_mapping(bdev);
- if (bdev->bd_disk->fops->release)
- bdev->bd_disk->fops->release(bdev->bd_disk);
-}
-
static int blkdev_get_part(struct block_device *part, blk_mode_t mode)
{
struct gendisk *disk = part->bd_disk;
@@ -729,9 +744,60 @@ void blkdev_put_no_open(struct block_device *bdev)
{
put_device(&bdev->bd_device);
}
-
+
+static bool bdev_writes_blocked(struct block_device *bdev)
+{
+ return bdev->bd_writers < 0;
+}
+
+static void bdev_block_writes(struct block_device *bdev)
+{
+ bdev->bd_writers--;
+}
+
+static void bdev_unblock_writes(struct block_device *bdev)
+{
+ bdev->bd_writers++;
+}
+
+static bool bdev_may_open(struct block_device *bdev, blk_mode_t mode)
+{
+ if (bdev_allow_write_mounted)
+ return true;
+ /* Writes blocked? */
+ if (mode & BLK_OPEN_WRITE && bdev_writes_blocked(bdev))
+ return false;
+ if (mode & BLK_OPEN_RESTRICT_WRITES && bdev->bd_writers > 0)
+ return false;
+ return true;
+}
+
+static void bdev_claim_write_access(struct block_device *bdev, blk_mode_t mode)
+{
+ if (bdev_allow_write_mounted)
+ return;
+
+ /* Claim exclusive or shared write access. */
+ if (mode & BLK_OPEN_RESTRICT_WRITES)
+ bdev_block_writes(bdev);
+ else if (mode & BLK_OPEN_WRITE)
+ bdev->bd_writers++;
+}
+
+static void bdev_yield_write_access(struct block_device *bdev, blk_mode_t mode)
+{
+ if (bdev_allow_write_mounted)
+ return;
+
+ /* Yield exclusive or shared write access. */
+ if (mode & BLK_OPEN_RESTRICT_WRITES)
+ bdev_unblock_writes(bdev);
+ else if (mode & BLK_OPEN_WRITE)
+ bdev->bd_writers--;
+}
+
/**
- * blkdev_get_by_dev - open a block device by device number
+ * bdev_open_by_dev - open a block device by device number
* @dev: device number of block device to open
* @mode: open mode (BLK_OPEN_*)
* @holder: exclusive holder identifier
@@ -743,32 +809,46 @@ void blkdev_put_no_open(struct block_device *bdev)
*
* Use this interface ONLY if you really do not have anything better - i.e. when
* you are behind a truly sucky interface and all you are given is a device
- * number. Everything else should use blkdev_get_by_path().
+ * number. Everything else should use bdev_open_by_path().
*
* CONTEXT:
* Might sleep.
*
* RETURNS:
- * Reference to the block_device on success, ERR_PTR(-errno) on failure.
+ * Handle with a reference to the block_device on success, ERR_PTR(-errno) on
+ * failure.
*/
-struct block_device *blkdev_get_by_dev(dev_t dev, blk_mode_t mode, void *holder,
- const struct blk_holder_ops *hops)
+struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
+ const struct blk_holder_ops *hops)
{
- bool unblock_events = true;
+ struct bdev_handle *handle = kmalloc(sizeof(struct bdev_handle),
+ GFP_KERNEL);
struct block_device *bdev;
+ bool unblock_events = true;
struct gendisk *disk;
int ret;
+ if (!handle)
+ return ERR_PTR(-ENOMEM);
+
ret = devcgroup_check_permission(DEVCG_DEV_BLOCK,
MAJOR(dev), MINOR(dev),
((mode & BLK_OPEN_READ) ? DEVCG_ACC_READ : 0) |
((mode & BLK_OPEN_WRITE) ? DEVCG_ACC_WRITE : 0));
if (ret)
- return ERR_PTR(ret);
+ goto free_handle;
+
+ /* Blocking writes requires exclusive opener */
+ if (mode & BLK_OPEN_RESTRICT_WRITES && !holder) {
+ ret = -EINVAL;
+ goto free_handle;
+ }
bdev = blkdev_get_no_open(dev);
- if (!bdev)
- return ERR_PTR(-ENXIO);
+ if (!bdev) {
+ ret = -ENXIO;
+ goto free_handle;
+ }
disk = bdev->bd_disk;
if (holder) {
@@ -791,12 +871,16 @@ struct block_device *blkdev_get_by_dev(dev_t dev, blk_mode_t mode, void *holder,
goto abort_claiming;
if (!try_module_get(disk->fops->owner))
goto abort_claiming;
+ ret = -EBUSY;
+ if (!bdev_may_open(bdev, mode))
+ goto put_module;
if (bdev_is_partition(bdev))
ret = blkdev_get_part(bdev, mode);
else
ret = blkdev_get_whole(bdev, mode);
if (ret)
goto put_module;
+ bdev_claim_write_access(bdev, mode);
if (holder) {
bd_finish_claiming(bdev, holder, hops);
@@ -817,7 +901,10 @@ struct block_device *blkdev_get_by_dev(dev_t dev, blk_mode_t mode, void *holder,
if (unblock_events)
disk_unblock_events(disk);
- return bdev;
+ handle->bdev = bdev;
+ handle->holder = holder;
+ handle->mode = mode;
+ return handle;
put_module:
module_put(disk->fops->owner);
abort_claiming:
@@ -827,12 +914,14 @@ abort_claiming:
disk_unblock_events(disk);
put_blkdev:
blkdev_put_no_open(bdev);
+free_handle:
+ kfree(handle);
return ERR_PTR(ret);
}
-EXPORT_SYMBOL(blkdev_get_by_dev);
+EXPORT_SYMBOL(bdev_open_by_dev);
/**
- * blkdev_get_by_path - open a block device by name
+ * bdev_open_by_path - open a block device by name
* @path: path to the block device to open
* @mode: open mode (BLK_OPEN_*)
* @holder: exclusive holder identifier
@@ -846,12 +935,13 @@ EXPORT_SYMBOL(blkdev_get_by_dev);
* Might sleep.
*
* RETURNS:
- * Reference to the block_device on success, ERR_PTR(-errno) on failure.
+ * Handle with a reference to the block_device on success, ERR_PTR(-errno) on
+ * failure.
*/
-struct block_device *blkdev_get_by_path(const char *path, blk_mode_t mode,
+struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode,
void *holder, const struct blk_holder_ops *hops)
{
- struct block_device *bdev;
+ struct bdev_handle *handle;
dev_t dev;
int error;
@@ -859,18 +949,20 @@ struct block_device *blkdev_get_by_path(const char *path, blk_mode_t mode,
if (error)
return ERR_PTR(error);
- bdev = blkdev_get_by_dev(dev, mode, holder, hops);
- if (!IS_ERR(bdev) && (mode & BLK_OPEN_WRITE) && bdev_read_only(bdev)) {
- blkdev_put(bdev, holder);
+ handle = bdev_open_by_dev(dev, mode, holder, hops);
+ if (!IS_ERR(handle) && (mode & BLK_OPEN_WRITE) &&
+ bdev_read_only(handle->bdev)) {
+ bdev_release(handle);
return ERR_PTR(-EACCES);
}
- return bdev;
+ return handle;
}
-EXPORT_SYMBOL(blkdev_get_by_path);
+EXPORT_SYMBOL(bdev_open_by_path);
-void blkdev_put(struct block_device *bdev, void *holder)
+void bdev_release(struct bdev_handle *handle)
{
+ struct block_device *bdev = handle->bdev;
struct gendisk *disk = bdev->bd_disk;
/*
@@ -884,8 +976,10 @@ void blkdev_put(struct block_device *bdev, void *holder)
sync_blockdev(bdev);
mutex_lock(&disk->open_mutex);
- if (holder)
- bd_end_claim(bdev, holder);
+ bdev_yield_write_access(bdev, handle->mode);
+
+ if (handle->holder)
+ bd_end_claim(bdev, handle->holder);
/*
* Trigger event checking and tell drivers to flush MEDIA_CHANGE
@@ -902,8 +996,9 @@ void blkdev_put(struct block_device *bdev, void *holder)
module_put(disk->fops->owner);
blkdev_put_no_open(bdev);
+ kfree(handle);
}
-EXPORT_SYMBOL(blkdev_put);
+EXPORT_SYMBOL(bdev_release);
/**
* lookup_bdev() - Look up a struct block_device by name.
@@ -963,20 +1058,20 @@ void bdev_mark_dead(struct block_device *bdev, bool surprise)
mutex_lock(&bdev->bd_holder_lock);
if (bdev->bd_holder_ops && bdev->bd_holder_ops->mark_dead)
bdev->bd_holder_ops->mark_dead(bdev, surprise);
- else
+ else {
+ mutex_unlock(&bdev->bd_holder_lock);
sync_blockdev(bdev);
- mutex_unlock(&bdev->bd_holder_lock);
+ }
invalidate_bdev(bdev);
}
-#ifdef CONFIG_DASD_MODULE
/*
- * Drivers should not use this directly, but the DASD driver has historically
- * had a shutdown to offline mode that doesn't actually remove the gendisk
- * that otherwise looks a lot like a safe device removal.
+ * New drivers should not use this directly. There are some drivers however
+ * that needs this for historical reasons. For example, the DASD driver has
+ * historically had a shutdown to offline mode that doesn't actually remove the
+ * gendisk that otherwise looks a lot like a safe device removal.
*/
EXPORT_SYMBOL_GPL(bdev_mark_dead);
-#endif
void sync_bdevs(bool wait)
{
@@ -1051,3 +1146,12 @@ void bdev_statx_dioalign(struct inode *inode, struct kstat *stat)
blkdev_put_no_open(bdev);
}
+
+static int __init setup_bdev_allow_write_mounted(char *str)
+{
+ if (kstrtobool(str, &bdev_allow_write_mounted))
+ pr_warn("Invalid option string for bdev_allow_write_mounted:"
+ " '%s'\n", str);
+ return 1;
+}
+__setup("bdev_allow_write_mounted=", setup_bdev_allow_write_mounted);
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index ec8ac8cf6..c9a16fba5 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -69,15 +69,15 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
memset(bip, 0, sizeof(*bip));
+ /* always report as many vecs as asked explicitly, not inline vecs */
+ bip->bip_max_vcnt = nr_vecs;
if (nr_vecs > inline_vecs) {
- bip->bip_max_vcnt = nr_vecs;
bip->bip_vec = bvec_alloc(&bs->bvec_integrity_pool,
&bip->bip_max_vcnt, gfp_mask);
if (!bip->bip_vec)
goto err;
} else {
bip->bip_vec = bip->bip_inline_vecs;
- bip->bip_max_vcnt = inline_vecs;
}
bip->bip_bio = bio;
@@ -91,6 +91,47 @@ err:
}
EXPORT_SYMBOL(bio_integrity_alloc);
+static void bio_integrity_unpin_bvec(struct bio_vec *bv, int nr_vecs,
+ bool dirty)
+{
+ int i;
+
+ for (i = 0; i < nr_vecs; i++) {
+ if (dirty && !PageCompound(bv[i].bv_page))
+ set_page_dirty_lock(bv[i].bv_page);
+ unpin_user_page(bv[i].bv_page);
+ }
+}
+
+static void bio_integrity_uncopy_user(struct bio_integrity_payload *bip)
+{
+ unsigned short nr_vecs = bip->bip_max_vcnt - 1;
+ struct bio_vec *copy = &bip->bip_vec[1];
+ size_t bytes = bip->bip_iter.bi_size;
+ struct iov_iter iter;
+ int ret;
+
+ iov_iter_bvec(&iter, ITER_DEST, copy, nr_vecs, bytes);
+ ret = copy_to_iter(bvec_virt(bip->bip_vec), bytes, &iter);
+ WARN_ON_ONCE(ret != bytes);
+
+ bio_integrity_unpin_bvec(copy, nr_vecs, true);
+}
+
+static void bio_integrity_unmap_user(struct bio_integrity_payload *bip)
+{
+ bool dirty = bio_data_dir(bip->bip_bio) == READ;
+
+ if (bip->bip_flags & BIP_COPY_USER) {
+ if (dirty)
+ bio_integrity_uncopy_user(bip);
+ kfree(bvec_virt(bip->bip_vec));
+ return;
+ }
+
+ bio_integrity_unpin_bvec(bip->bip_vec, bip->bip_max_vcnt, dirty);
+}
+
/**
* bio_integrity_free - Free bio integrity payload
* @bio: bio containing bip to be freed
@@ -105,6 +146,8 @@ void bio_integrity_free(struct bio *bio)
if (bip->bip_flags & BIP_BLOCK_INTEGRITY)
kfree(bvec_virt(bip->bip_vec));
+ else if (bip->bip_flags & BIP_INTEGRITY_USER)
+ bio_integrity_unmap_user(bip);
__bio_integrity_free(bs, bip);
bio->bi_integrity = NULL;
@@ -160,6 +203,177 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
}
EXPORT_SYMBOL(bio_integrity_add_page);
+static int bio_integrity_copy_user(struct bio *bio, struct bio_vec *bvec,
+ int nr_vecs, unsigned int len,
+ unsigned int direction, u32 seed)
+{
+ bool write = direction == ITER_SOURCE;
+ struct bio_integrity_payload *bip;
+ struct iov_iter iter;
+ void *buf;
+ int ret;
+
+ buf = kmalloc(len, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ if (write) {
+ iov_iter_bvec(&iter, direction, bvec, nr_vecs, len);
+ if (!copy_from_iter_full(buf, len, &iter)) {
+ ret = -EFAULT;
+ goto free_buf;
+ }
+
+ bip = bio_integrity_alloc(bio, GFP_KERNEL, 1);
+ } else {
+ memset(buf, 0, len);
+
+ /*
+ * We need to preserve the original bvec and the number of vecs
+ * in it for completion handling
+ */
+ bip = bio_integrity_alloc(bio, GFP_KERNEL, nr_vecs + 1);
+ }
+
+ if (IS_ERR(bip)) {
+ ret = PTR_ERR(bip);
+ goto free_buf;
+ }
+
+ if (write)
+ bio_integrity_unpin_bvec(bvec, nr_vecs, false);
+ else
+ memcpy(&bip->bip_vec[1], bvec, nr_vecs * sizeof(*bvec));
+
+ ret = bio_integrity_add_page(bio, virt_to_page(buf), len,
+ offset_in_page(buf));
+ if (ret != len) {
+ ret = -ENOMEM;
+ goto free_bip;
+ }
+
+ bip->bip_flags |= BIP_INTEGRITY_USER | BIP_COPY_USER;
+ bip->bip_iter.bi_sector = seed;
+ return 0;
+free_bip:
+ bio_integrity_free(bio);
+free_buf:
+ kfree(buf);
+ return ret;
+}
+
+static int bio_integrity_init_user(struct bio *bio, struct bio_vec *bvec,
+ int nr_vecs, unsigned int len, u32 seed)
+{
+ struct bio_integrity_payload *bip;
+
+ bip = bio_integrity_alloc(bio, GFP_KERNEL, nr_vecs);
+ if (IS_ERR(bip))
+ return PTR_ERR(bip);
+
+ memcpy(bip->bip_vec, bvec, nr_vecs * sizeof(*bvec));
+ bip->bip_flags |= BIP_INTEGRITY_USER;
+ bip->bip_iter.bi_sector = seed;
+ bip->bip_iter.bi_size = len;
+ return 0;
+}
+
+static unsigned int bvec_from_pages(struct bio_vec *bvec, struct page **pages,
+ int nr_vecs, ssize_t bytes, ssize_t offset)
+{
+ unsigned int nr_bvecs = 0;
+ int i, j;
+
+ for (i = 0; i < nr_vecs; i = j) {
+ size_t size = min_t(size_t, bytes, PAGE_SIZE - offset);
+ struct folio *folio = page_folio(pages[i]);
+
+ bytes -= size;
+ for (j = i + 1; j < nr_vecs; j++) {
+ size_t next = min_t(size_t, PAGE_SIZE, bytes);
+
+ if (page_folio(pages[j]) != folio ||
+ pages[j] != pages[j - 1] + 1)
+ break;
+ unpin_user_page(pages[j]);
+ size += next;
+ bytes -= next;
+ }
+
+ bvec_set_page(&bvec[nr_bvecs], pages[i], size, offset);
+ offset = 0;
+ nr_bvecs++;
+ }
+
+ return nr_bvecs;
+}
+
+int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes,
+ u32 seed)
+{
+ struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+ unsigned int align = q->dma_pad_mask | queue_dma_alignment(q);
+ struct page *stack_pages[UIO_FASTIOV], **pages = stack_pages;
+ struct bio_vec stack_vec[UIO_FASTIOV], *bvec = stack_vec;
+ unsigned int direction, nr_bvecs;
+ struct iov_iter iter;
+ int ret, nr_vecs;
+ size_t offset;
+ bool copy;
+
+ if (bio_integrity(bio))
+ return -EINVAL;
+ if (bytes >> SECTOR_SHIFT > queue_max_hw_sectors(q))
+ return -E2BIG;
+
+ if (bio_data_dir(bio) == READ)
+ direction = ITER_DEST;
+ else
+ direction = ITER_SOURCE;
+
+ iov_iter_ubuf(&iter, direction, ubuf, bytes);
+ nr_vecs = iov_iter_npages(&iter, BIO_MAX_VECS + 1);
+ if (nr_vecs > BIO_MAX_VECS)
+ return -E2BIG;
+ if (nr_vecs > UIO_FASTIOV) {
+ bvec = kcalloc(nr_vecs, sizeof(*bvec), GFP_KERNEL);
+ if (!bvec)
+ return -ENOMEM;
+ pages = NULL;
+ }
+
+ copy = !iov_iter_is_aligned(&iter, align, align);
+ ret = iov_iter_extract_pages(&iter, &pages, bytes, nr_vecs, 0, &offset);
+ if (unlikely(ret < 0))
+ goto free_bvec;
+
+ nr_bvecs = bvec_from_pages(bvec, pages, nr_vecs, bytes, offset);
+ if (pages != stack_pages)
+ kvfree(pages);
+ if (nr_bvecs > queue_max_integrity_segments(q))
+ copy = true;
+
+ if (copy)
+ ret = bio_integrity_copy_user(bio, bvec, nr_bvecs, bytes,
+ direction, seed);
+ else
+ ret = bio_integrity_init_user(bio, bvec, nr_bvecs, bytes, seed);
+ if (ret)
+ goto release_pages;
+ if (bvec != stack_vec)
+ kfree(bvec);
+
+ return 0;
+
+release_pages:
+ bio_integrity_unpin_bvec(bvec, nr_bvecs, false);
+free_bvec:
+ if (bvec != stack_vec)
+ kfree(bvec);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(bio_integrity_map_user);
+
/**
* bio_integrity_process - Process integrity metadata for a bio
* @bio: bio to generate/verify integrity metadata for
diff --git a/block/bio.c b/block/bio.c
index 5eba53ca9..b52b56067 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -944,7 +944,7 @@ bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv,
if ((addr1 | mask) != (addr2 | mask))
return false;
- if (bv->bv_len + len > queue_max_segment_size(q))
+ if (len > queue_max_segment_size(q) - bv->bv_len)
return false;
return bvec_try_merge_page(bv, page, len, offset, same_page);
}
@@ -966,10 +966,13 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio,
struct page *page, unsigned int len, unsigned int offset,
unsigned int max_sectors, bool *same_page)
{
+ unsigned int max_size = max_sectors << SECTOR_SHIFT;
+
if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
return 0;
- if (((bio->bi_iter.bi_size + len) >> SECTOR_SHIFT) > max_sectors)
+ len = min3(len, max_size, queue_max_segment_size(q));
+ if (len > max_size - bio->bi_iter.bi_size)
return 0;
if (bio->bi_vcnt > 0) {
@@ -1149,7 +1152,7 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty)
bio_for_each_folio_all(fi, bio) {
struct page *page;
- size_t done = 0;
+ size_t nr_pages;
if (mark_dirty) {
folio_lock(fi.folio);
@@ -1157,10 +1160,11 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty)
folio_unlock(fi.folio);
}
page = folio_page(fi.folio, fi.offset / PAGE_SIZE);
+ nr_pages = (fi.offset + fi.length - 1) / PAGE_SIZE -
+ fi.offset / PAGE_SIZE + 1;
do {
bio_release_page(bio, page++);
- done += PAGE_SIZE;
- } while (done < fi.length);
+ } while (--nr_pages != 0);
}
}
EXPORT_SYMBOL_GPL(__bio_release_pages);
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 4b48c2c44..4529122e0 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -300,7 +300,7 @@ static inline struct blkcg *blkcg_parent(struct blkcg *blkcg)
* @disk: gendisk the new blkg is associated with
* @gfp_mask: allocation mask to use
*
- * Allocate a new blkg assocating @blkcg and @q.
+ * Allocate a new blkg associating @blkcg and @disk.
*/
static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk,
gfp_t gfp_mask)
@@ -575,13 +575,13 @@ static void blkg_destroy(struct blkcg_gq *blkg)
static void blkg_destroy_all(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
- struct blkcg_gq *blkg, *n;
+ struct blkcg_gq *blkg;
int count = BLKG_DESTROY_BATCH_SIZE;
int i;
restart:
spin_lock_irq(&q->queue_lock);
- list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
+ list_for_each_entry(blkg, &q->blkg_list, q_node) {
struct blkcg *blkcg = blkg->blkcg;
if (hlist_unhashed(&blkg->blkcg_node))
@@ -1409,6 +1409,12 @@ static int blkcg_css_online(struct cgroup_subsys_state *css)
return 0;
}
+void blkg_init_queue(struct request_queue *q)
+{
+ INIT_LIST_HEAD(&q->blkg_list);
+ mutex_init(&q->blkcg_mutex);
+}
+
int blkcg_init_disk(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
@@ -1416,9 +1422,6 @@ int blkcg_init_disk(struct gendisk *disk)
bool preloaded;
int ret;
- INIT_LIST_HEAD(&q->blkg_list);
- mutex_init(&q->blkcg_mutex);
-
new_blkg = blkg_alloc(&blkcg_root, disk, GFP_KERNEL);
if (!new_blkg)
return -ENOMEM;
@@ -2064,6 +2067,9 @@ void bio_associate_blkg(struct bio *bio)
{
struct cgroup_subsys_state *css;
+ if (blk_op_is_passthrough(bio->bi_opf))
+ return;
+
rcu_read_lock();
if (bio->bi_blkg)
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index b927a4a0a..5b0bdc268 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -188,6 +188,7 @@ struct blkcg_policy {
extern struct blkcg blkcg_root;
extern bool blkcg_debug_stats;
+void blkg_init_queue(struct request_queue *q);
int blkcg_init_disk(struct gendisk *disk);
void blkcg_exit_disk(struct gendisk *disk);
@@ -481,6 +482,7 @@ struct blkcg {
};
static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; }
+static inline void blkg_init_queue(struct request_queue *q) { }
static inline int blkcg_init_disk(struct gendisk *disk) { return 0; }
static inline void blkcg_exit_disk(struct gendisk *disk) { }
static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; }
diff --git a/block/blk-core.c b/block/blk-core.c
index 2eca76ccf..99d684085 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -49,6 +49,7 @@
#include "blk-pm.h"
#include "blk-cgroup.h"
#include "blk-throttle.h"
+#include "blk-ioprio.h"
struct dentry *blk_debugfs_root;
@@ -430,6 +431,8 @@ struct request_queue *blk_alloc_queue(int node_id)
init_waitqueue_head(&q->mq_freeze_wq);
mutex_init(&q->mq_freeze_lock);
+ blkg_init_queue(q);
+
/*
* Init percpu_ref in atomic mode so that it's faster to shutdown.
* See blk_register_queue() for details.
@@ -772,6 +775,15 @@ void submit_bio_noacct(struct bio *bio)
bio_clear_polled(bio);
switch (bio_op(bio)) {
+ case REQ_OP_READ:
+ case REQ_OP_WRITE:
+ break;
+ case REQ_OP_FLUSH:
+ /*
+ * REQ_OP_FLUSH can't be submitted through bios, it is only
+ * synthetized in struct request by the flush state machine.
+ */
+ goto not_supported;
case REQ_OP_DISCARD:
if (!bdev_max_discard_sectors(bdev))
goto not_supported;
@@ -785,6 +797,10 @@ void submit_bio_noacct(struct bio *bio)
if (status != BLK_STS_OK)
goto end_io;
break;
+ case REQ_OP_WRITE_ZEROES:
+ if (!q->limits.max_write_zeroes_sectors)
+ goto not_supported;
+ break;
case REQ_OP_ZONE_RESET:
case REQ_OP_ZONE_OPEN:
case REQ_OP_ZONE_CLOSE:
@@ -796,12 +812,15 @@ void submit_bio_noacct(struct bio *bio)
if (!bdev_is_zoned(bio->bi_bdev) || !blk_queue_zone_resetall(q))
goto not_supported;
break;
- case REQ_OP_WRITE_ZEROES:
- if (!q->limits.max_write_zeroes_sectors)
- goto not_supported;
- break;
+ case REQ_OP_DRV_IN:
+ case REQ_OP_DRV_OUT:
+ /*
+ * Driver private operations are only used with passthrough
+ * requests.
+ */
+ fallthrough;
default:
- break;
+ goto not_supported;
}
if (blk_throtl_bio(bio))
@@ -817,6 +836,14 @@ end_io:
}
EXPORT_SYMBOL(submit_bio_noacct);
+static void bio_set_ioprio(struct bio *bio)
+{
+ /* Nobody set ioprio so far? Initialize it based on task's nice value */
+ if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) == IOPRIO_CLASS_NONE)
+ bio->bi_ioprio = get_current_ioprio();
+ blkcg_set_ioprio(bio);
+}
+
/**
* submit_bio - submit a bio to the block device layer for I/O
* @bio: The &struct bio which describes the I/O
@@ -839,6 +866,7 @@ void submit_bio(struct bio *bio)
count_vm_events(PGPGOUT, bio_sectors(bio));
}
+ bio_set_ioprio(bio);
submit_bio_noacct(bio);
}
EXPORT_SYMBOL(submit_bio);
diff --git a/block/blk-flush.c b/block/blk-flush.c
index e73dc22d0..3f4d41952 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -323,16 +323,9 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
flush_rq->mq_ctx = first_rq->mq_ctx;
flush_rq->mq_hctx = first_rq->mq_hctx;
- if (!q->elevator) {
+ if (!q->elevator)
flush_rq->tag = first_rq->tag;
-
- /*
- * We borrow data request's driver tag, so have to mark
- * this flush request as INFLIGHT for avoiding double
- * account of this driver tag
- */
- flush_rq->rq_flags |= RQF_MQ_INFLIGHT;
- } else
+ else
flush_rq->internal_tag = first_rq->internal_tag;
flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH;
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index 089fcb9cf..04d44f0bc 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -1261,7 +1261,7 @@ static void weight_updated(struct ioc_gq *iocg, struct ioc_now *now)
static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now)
{
struct ioc *ioc = iocg->ioc;
- u64 last_period, cur_period;
+ u64 __maybe_unused last_period, cur_period;
u64 vtime, vtarget;
int i;
@@ -1353,6 +1353,13 @@ static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now)
lockdep_assert_held(&iocg->waitq.lock);
+ /*
+ * If the delay is set by another CPU, we may be in the past. No need to
+ * change anything if so. This avoids decay calculation underflow.
+ */
+ if (time_before64(now->now, iocg->delay_at))
+ return false;
+
/* calculate the current delay in effect - 1/2 every second */
tdelta = now->now - iocg->delay_at;
if (iocg->delay)
diff --git a/block/blk-map.c b/block/blk-map.c
index 8584babf3..71210cdb3 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -205,12 +205,19 @@ static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data,
/*
* success
*/
- if ((iov_iter_rw(iter) == WRITE &&
- (!map_data || !map_data->null_mapped)) ||
- (map_data && map_data->from_user)) {
+ if (iov_iter_rw(iter) == WRITE &&
+ (!map_data || !map_data->null_mapped)) {
ret = bio_copy_from_iter(bio, iter);
if (ret)
goto cleanup;
+ } else if (map_data && map_data->from_user) {
+ struct iov_iter iter2 = *iter;
+
+ /* This is the copy-in part of SG_DXFER_TO_FROM_DEV. */
+ iter2.data_source = ITER_SOURCE;
+ ret = bio_copy_from_iter(bio, &iter2);
+ if (ret)
+ goto cleanup;
} else {
if (bmd->is_our_pages)
zero_fill_bio(bio);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 65e75efa9..2d470cf21 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -115,17 +115,13 @@ static struct bio *bio_split_discard(struct bio *bio,
*nsegs = 1;
- /* Zero-sector (unknown) and one-sector granularities are the same. */
granularity = max(lim->discard_granularity >> 9, 1U);
max_discard_sectors =
min(lim->max_discard_sectors, bio_allowed_max_sectors(lim));
max_discard_sectors -= max_discard_sectors % granularity;
-
- if (unlikely(!max_discard_sectors)) {
- /* XXX: warn */
+ if (unlikely(!max_discard_sectors))
return NULL;
- }
if (bio_sectors(bio) <= max_discard_sectors)
return NULL;
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index c3b593010..94668e72a 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -246,7 +246,6 @@ static const char *const rqf_name[] = {
RQF_NAME(STARTED),
RQF_NAME(FLUSH_SEQ),
RQF_NAME(MIXED_MERGE),
- RQF_NAME(MQ_INFLIGHT),
RQF_NAME(DONTPREP),
RQF_NAME(SCHED_TAGS),
RQF_NAME(USE_SCHED),
@@ -480,23 +479,6 @@ out:
return res;
}
-static int hctx_run_show(void *data, struct seq_file *m)
-{
- struct blk_mq_hw_ctx *hctx = data;
-
- seq_printf(m, "%lu\n", hctx->run);
- return 0;
-}
-
-static ssize_t hctx_run_write(void *data, const char __user *buf, size_t count,
- loff_t *ppos)
-{
- struct blk_mq_hw_ctx *hctx = data;
-
- hctx->run = 0;
- return count;
-}
-
static int hctx_active_show(void *data, struct seq_file *m)
{
struct blk_mq_hw_ctx *hctx = data;
@@ -625,7 +607,6 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = {
{"tags_bitmap", 0400, hctx_tags_bitmap_show},
{"sched_tags", 0400, hctx_sched_tags_show},
{"sched_tags_bitmap", 0400, hctx_sched_tags_bitmap_show},
- {"run", 0600, hctx_run_show, hctx_run_write},
{"active", 0400, hctx_active_show},
{"dispatch_busy", 0400, hctx_dispatch_busy_show},
{"type", 0400, hctx_type_show},
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 67c95f31b..451a2c1f1 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -324,8 +324,6 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
return;
- hctx->run++;
-
/*
* A return of -EAGAIN is an indication that hctx->dispatch is not
* empty and we must run again in order to avoid starving flushes.
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 6041e1749..25d2f3239 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -40,7 +40,6 @@
#include "blk-stat.h"
#include "blk-mq-sched.h"
#include "blk-rq-qos.h"
-#include "blk-ioprio.h"
static DEFINE_PER_CPU(struct llist_head, blk_cpu_done);
static DEFINE_PER_CPU(call_single_data_t, blk_cpu_csd);
@@ -426,6 +425,8 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data)
rq_list_add(data->cached_rq, rq);
nr++;
}
+ if (!(data->rq_flags & RQF_SCHED_TAGS))
+ blk_mq_add_active_requests(data->hctx, nr);
/* caller already holds a reference, add for remainder */
percpu_ref_get_many(&data->q->q_usage_counter, nr - 1);
data->nr_tags -= nr;
@@ -510,6 +511,8 @@ retry:
goto retry;
}
+ if (!(data->rq_flags & RQF_SCHED_TAGS))
+ blk_mq_inc_active_requests(data->hctx);
rq = blk_mq_rq_ctx_init(data, blk_mq_tags_from_data(data), tag);
blk_mq_rq_time_init(rq, alloc_time_ns);
return rq;
@@ -669,6 +672,8 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
tag = blk_mq_get_tag(&data);
if (tag == BLK_MQ_NO_TAG)
goto out_queue_exit;
+ if (!(data.rq_flags & RQF_SCHED_TAGS))
+ blk_mq_inc_active_requests(data.hctx);
rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag);
blk_mq_rq_time_init(rq, alloc_time_ns);
rq->__data_len = 0;
@@ -708,11 +713,10 @@ static void __blk_mq_free_request(struct request *rq)
blk_pm_mark_last_busy(rq);
rq->mq_hctx = NULL;
- if (rq->rq_flags & RQF_MQ_INFLIGHT)
- __blk_mq_dec_active_requests(hctx);
-
- if (rq->tag != BLK_MQ_NO_TAG)
+ if (rq->tag != BLK_MQ_NO_TAG) {
+ blk_mq_dec_active_requests(hctx);
blk_mq_put_tag(hctx->tags, ctx, rq->tag);
+ }
if (sched_tag != BLK_MQ_NO_TAG)
blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag);
blk_mq_sched_restart(hctx);
@@ -1061,12 +1065,7 @@ static inline void blk_mq_flush_tag_batch(struct blk_mq_hw_ctx *hctx,
{
struct request_queue *q = hctx->queue;
- /*
- * All requests should have been marked as RQF_MQ_INFLIGHT, so
- * update hctx->nr_active in batch
- */
- if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
- __blk_mq_sub_active_requests(hctx, nr_tags);
+ blk_mq_sub_active_requests(hctx, nr_tags);
blk_mq_put_tags(hctx->tags, tag_array, nr_tags);
percpu_ref_put_many(&q->q_usage_counter, nr_tags);
@@ -1248,7 +1247,8 @@ void blk_mq_start_request(struct request *rq)
trace_block_rq_issue(rq);
- if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
+ if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags) &&
+ !blk_rq_is_passthrough(rq)) {
rq->io_start_time_ns = ktime_get_ns();
rq->stats_sectors = blk_rq_sectors(rq);
rq->rq_flags |= RQF_STATS;
@@ -1259,6 +1259,7 @@ void blk_mq_start_request(struct request *rq)
blk_add_timer(rq);
WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT);
+ rq->mq_hctx->tags->rqs[rq->tag] = rq;
#ifdef CONFIG_BLK_DEV_INTEGRITY
if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE)
@@ -1760,7 +1761,7 @@ struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
return data.rq;
}
-static bool __blk_mq_alloc_driver_tag(struct request *rq)
+bool __blk_mq_alloc_driver_tag(struct request *rq)
{
struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags;
unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
@@ -1781,20 +1782,7 @@ static bool __blk_mq_alloc_driver_tag(struct request *rq)
return false;
rq->tag = tag + tag_offset;
- return true;
-}
-
-bool __blk_mq_get_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq)
-{
- if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_alloc_driver_tag(rq))
- return false;
-
- if ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) &&
- !(rq->rq_flags & RQF_MQ_INFLIGHT)) {
- rq->rq_flags |= RQF_MQ_INFLIGHT;
- __blk_mq_inc_active_requests(hctx);
- }
- hctx->tags->rqs[rq->tag] = rq;
+ blk_mq_inc_active_requests(rq->mq_hctx);
return true;
}
@@ -1871,6 +1859,22 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
__add_wait_queue(wq, wait);
/*
+ * Add one explicit barrier since blk_mq_get_driver_tag() may
+ * not imply barrier in case of failure.
+ *
+ * Order adding us to wait queue and allocating driver tag.
+ *
+ * The pair is the one implied in sbitmap_queue_wake_up() which
+ * orders clearing sbitmap tag bits and waitqueue_active() in
+ * __sbitmap_queue_wake_up(), since waitqueue_active() is lockless
+ *
+ * Otherwise, re-order of adding wait queue and getting driver tag
+ * may cause __sbitmap_queue_wake_up() to wake up nothing because
+ * the waitqueue_active() may not observe us in wait queue.
+ */
+ smp_mb();
+
+ /*
* It's possible that a tag was freed in the window between the
* allocation failure and adding the hardware queue to the wait
* queue.
@@ -2806,13 +2810,8 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
* If we do, we can dispatch the whole plug list in one go. We
* already know at this point that all requests belong to the
* same queue, caller must ensure that's the case.
- *
- * Since we pass off the full list to the driver at this point,
- * we do not increment the active request count for the queue.
- * Bypass shared tags for now because of that.
*/
- if (q->mq_ops->queue_rqs &&
- !(rq->mq_hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
+ if (q->mq_ops->queue_rqs) {
blk_mq_run_dispatch_ops(q,
__blk_mq_flush_plug_list(q, plug));
if (rq_list_empty(plug->mq_list))
@@ -2907,8 +2906,11 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q,
return NULL;
}
-/* return true if this @rq can be used for @bio */
-static bool blk_mq_can_use_cached_rq(struct request *rq, struct blk_plug *plug,
+/*
+ * Check if we can use the passed on request for submitting the passed in bio,
+ * and remove it from the request list if it can be used.
+ */
+static bool blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug,
struct bio *bio)
{
enum hctx_type type = blk_mq_get_hctx_type(bio->bi_opf);
@@ -2936,14 +2938,6 @@ static bool blk_mq_can_use_cached_rq(struct request *rq, struct blk_plug *plug,
return true;
}
-static void bio_set_ioprio(struct bio *bio)
-{
- /* Nobody set ioprio so far? Initialize it based on task's nice value */
- if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) == IOPRIO_CLASS_NONE)
- bio->bi_ioprio = get_current_ioprio();
- blkcg_set_ioprio(bio);
-}
-
/**
* blk_mq_submit_bio - Create and send a request to block device.
* @bio: Bio pointer.
@@ -2968,7 +2962,6 @@ void blk_mq_submit_bio(struct bio *bio)
blk_status_t ret;
bio = blk_queue_bounce(bio, q);
- bio_set_ioprio(bio);
if (plug) {
rq = rq_list_peek(&plug->cached_rq);
@@ -2985,7 +2978,7 @@ void blk_mq_submit_bio(struct bio *bio)
return;
if (blk_mq_attempt_bio_merge(q, bio, nr_segs))
return;
- if (blk_mq_can_use_cached_rq(rq, plug, bio))
+ if (blk_mq_use_cached_rq(rq, plug, bio))
goto done;
percpu_ref_get(&q->q_usage_counter);
} else {
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 1743857e0..f75a9ecfe 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -271,12 +271,18 @@ static inline int blk_mq_get_rq_budget_token(struct request *rq)
return -1;
}
-static inline void __blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx)
+static inline void __blk_mq_add_active_requests(struct blk_mq_hw_ctx *hctx,
+ int val)
{
if (blk_mq_is_shared_tags(hctx->flags))
- atomic_inc(&hctx->queue->nr_active_requests_shared_tags);
+ atomic_add(val, &hctx->queue->nr_active_requests_shared_tags);
else
- atomic_inc(&hctx->nr_active);
+ atomic_add(val, &hctx->nr_active);
+}
+
+static inline void __blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx)
+{
+ __blk_mq_add_active_requests(hctx, 1);
}
static inline void __blk_mq_sub_active_requests(struct blk_mq_hw_ctx *hctx,
@@ -293,6 +299,32 @@ static inline void __blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx)
__blk_mq_sub_active_requests(hctx, 1);
}
+static inline void blk_mq_add_active_requests(struct blk_mq_hw_ctx *hctx,
+ int val)
+{
+ if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
+ __blk_mq_add_active_requests(hctx, val);
+}
+
+static inline void blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx)
+{
+ if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
+ __blk_mq_inc_active_requests(hctx);
+}
+
+static inline void blk_mq_sub_active_requests(struct blk_mq_hw_ctx *hctx,
+ int val)
+{
+ if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
+ __blk_mq_sub_active_requests(hctx, val);
+}
+
+static inline void blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx)
+{
+ if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
+ __blk_mq_dec_active_requests(hctx);
+}
+
static inline int __blk_mq_active_requests(struct blk_mq_hw_ctx *hctx)
{
if (blk_mq_is_shared_tags(hctx->flags))
@@ -302,13 +334,9 @@ static inline int __blk_mq_active_requests(struct blk_mq_hw_ctx *hctx)
static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
struct request *rq)
{
+ blk_mq_dec_active_requests(hctx);
blk_mq_put_tag(hctx->tags, rq->mq_ctx, rq->tag);
rq->tag = BLK_MQ_NO_TAG;
-
- if (rq->rq_flags & RQF_MQ_INFLIGHT) {
- rq->rq_flags &= ~RQF_MQ_INFLIGHT;
- __blk_mq_dec_active_requests(hctx);
- }
}
static inline void blk_mq_put_driver_tag(struct request *rq)
@@ -319,19 +347,14 @@ static inline void blk_mq_put_driver_tag(struct request *rq)
__blk_mq_put_driver_tag(rq->mq_hctx, rq);
}
-bool __blk_mq_get_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq);
+bool __blk_mq_alloc_driver_tag(struct request *rq);
static inline bool blk_mq_get_driver_tag(struct request *rq)
{
- struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
-
- if (rq->tag != BLK_MQ_NO_TAG &&
- !(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
- hctx->tags->rqs[rq->tag] = rq;
- return true;
- }
+ if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_alloc_driver_tag(rq))
+ return false;
- return __blk_mq_get_driver_tag(hctx, rq);
+ return true;
}
static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap)
diff --git a/block/blk-pm.c b/block/blk-pm.c
index 6b72b2e03..42e842074 100644
--- a/block/blk-pm.c
+++ b/block/blk-pm.c
@@ -163,39 +163,16 @@ EXPORT_SYMBOL(blk_pre_runtime_resume);
* @q: the queue of the device
*
* Description:
- * For historical reasons, this routine merely calls blk_set_runtime_active()
- * to do the real work of restarting the queue. It does this regardless of
- * whether the device's runtime-resume succeeded; even if it failed the
+ * Restart the queue of a runtime suspended device. It does this regardless
+ * of whether the device's runtime-resume succeeded; even if it failed the
* driver or error handler will need to communicate with the device.
*
* This function should be called near the end of the device's
- * runtime_resume callback.
+ * runtime_resume callback to correct queue runtime PM status and re-enable
+ * peeking requests from the queue.
*/
void blk_post_runtime_resume(struct request_queue *q)
{
- blk_set_runtime_active(q);
-}
-EXPORT_SYMBOL(blk_post_runtime_resume);
-
-/**
- * blk_set_runtime_active - Force runtime status of the queue to be active
- * @q: the queue of the device
- *
- * If the device is left runtime suspended during system suspend the resume
- * hook typically resumes the device and corrects runtime status
- * accordingly. However, that does not affect the queue runtime PM status
- * which is still "suspended". This prevents processing requests from the
- * queue.
- *
- * This function can be used in driver's resume hook to correct queue
- * runtime PM status and re-enable peeking requests from the queue. It
- * should be called before first request is added to the queue.
- *
- * This function is also called by blk_post_runtime_resume() for
- * runtime resumes. It does everything necessary to restart the queue.
- */
-void blk_set_runtime_active(struct request_queue *q)
-{
int old_status;
if (!q->dev)
@@ -211,4 +188,4 @@ void blk_set_runtime_active(struct request_queue *q)
if (old_status != RPM_ACTIVE)
blk_clear_pm_only(q);
}
-EXPORT_SYMBOL(blk_set_runtime_active);
+EXPORT_SYMBOL(blk_post_runtime_resume);
diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
index f48ee150d..37245c97e 100644
--- a/block/blk-rq-qos.h
+++ b/block/blk-rq-qos.h
@@ -118,7 +118,7 @@ static inline void rq_qos_cleanup(struct request_queue *q, struct bio *bio)
static inline void rq_qos_done(struct request_queue *q, struct request *rq)
{
- if (q->rq_qos)
+ if (q->rq_qos && !blk_rq_is_passthrough(rq))
__rq_qos_done(q->rq_qos, rq);
}
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 0046b4472..5adadce08 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -48,7 +48,7 @@ void blk_set_default_limits(struct queue_limits *lim)
lim->max_discard_sectors = 0;
lim->max_hw_discard_sectors = 0;
lim->max_secure_erase_sectors = 0;
- lim->discard_granularity = 0;
+ lim->discard_granularity = 512;
lim->discard_alignment = 0;
lim->discard_misaligned = 0;
lim->logical_block_size = lim->physical_block_size = lim->io_min = 512;
@@ -56,7 +56,7 @@ void blk_set_default_limits(struct queue_limits *lim)
lim->alignment_offset = 0;
lim->io_opt = 0;
lim->misaligned = 0;
- lim->zoned = BLK_ZONED_NONE;
+ lim->zoned = false;
lim->zone_write_granularity = 0;
lim->dma_alignment = 511;
}
@@ -127,8 +127,7 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto
if ((max_hw_sectors << 9) < PAGE_SIZE) {
max_hw_sectors = 1 << (PAGE_SHIFT - 9);
- printk(KERN_INFO "%s: set to minimum %d\n",
- __func__, max_hw_sectors);
+ pr_info("%s: set to minimum %u\n", __func__, max_hw_sectors);
}
max_hw_sectors = round_down(max_hw_sectors,
@@ -140,7 +139,7 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto
if (limits->max_user_sectors)
max_sectors = min(max_sectors, limits->max_user_sectors);
else
- max_sectors = min(max_sectors, BLK_DEF_MAX_SECTORS);
+ max_sectors = min(max_sectors, BLK_DEF_MAX_SECTORS_CAP);
max_sectors = round_down(max_sectors,
limits->logical_block_size >> SECTOR_SHIFT);
@@ -248,8 +247,7 @@ void blk_queue_max_segments(struct request_queue *q, unsigned short max_segments
{
if (!max_segments) {
max_segments = 1;
- printk(KERN_INFO "%s: set to minimum %d\n",
- __func__, max_segments);
+ pr_info("%s: set to minimum %u\n", __func__, max_segments);
}
q->limits.max_segments = max_segments;
@@ -285,8 +283,7 @@ void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size)
{
if (max_size < PAGE_SIZE) {
max_size = PAGE_SIZE;
- printk(KERN_INFO "%s: set to minimum %d\n",
- __func__, max_size);
+ pr_info("%s: set to minimum %u\n", __func__, max_size);
}
/* see blk_queue_virt_boundary() for the explanation */
@@ -312,6 +309,9 @@ void blk_queue_logical_block_size(struct request_queue *q, unsigned int size)
limits->logical_block_size = size;
+ if (limits->discard_granularity < limits->logical_block_size)
+ limits->discard_granularity = limits->logical_block_size;
+
if (limits->physical_block_size < size)
limits->physical_block_size = size;
@@ -342,6 +342,9 @@ void blk_queue_physical_block_size(struct request_queue *q, unsigned int size)
if (q->limits.physical_block_size < q->limits.logical_block_size)
q->limits.physical_block_size = q->limits.logical_block_size;
+ if (q->limits.discard_granularity < q->limits.physical_block_size)
+ q->limits.discard_granularity = q->limits.physical_block_size;
+
if (q->limits.io_min < q->limits.physical_block_size)
q->limits.io_min = q->limits.physical_block_size;
}
@@ -686,6 +689,10 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
t->zone_write_granularity = max(t->zone_write_granularity,
b->zone_write_granularity);
t->zoned = max(t->zoned, b->zoned);
+ if (!t->zoned) {
+ t->zone_write_granularity = 0;
+ t->max_zone_append_sectors = 0;
+ }
return ret;
}
EXPORT_SYMBOL(blk_stack_limits);
@@ -740,8 +747,7 @@ void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask)
{
if (mask < PAGE_SIZE - 1) {
mask = PAGE_SIZE - 1;
- printk(KERN_INFO "%s: set to minimum %lx\n",
- __func__, mask);
+ pr_info("%s: set to minimum %lx\n", __func__, mask);
}
q->limits.seg_boundary_mask = mask;
@@ -841,8 +847,6 @@ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua)
blk_queue_flag_set(QUEUE_FLAG_FUA, q);
else
blk_queue_flag_clear(QUEUE_FLAG_FUA, q);
-
- wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
}
EXPORT_SYMBOL_GPL(blk_queue_write_cache);
@@ -884,81 +888,22 @@ bool blk_queue_can_use_dma_map_merging(struct request_queue *q,
}
EXPORT_SYMBOL_GPL(blk_queue_can_use_dma_map_merging);
-static bool disk_has_partitions(struct gendisk *disk)
-{
- unsigned long idx;
- struct block_device *part;
- bool ret = false;
-
- rcu_read_lock();
- xa_for_each(&disk->part_tbl, idx, part) {
- if (bdev_is_partition(part)) {
- ret = true;
- break;
- }
- }
- rcu_read_unlock();
-
- return ret;
-}
-
/**
- * disk_set_zoned - configure the zoned model for a disk
- * @disk: the gendisk of the queue to configure
- * @model: the zoned model to set
- *
- * Set the zoned model of @disk to @model.
- *
- * When @model is BLK_ZONED_HM (host managed), this should be called only
- * if zoned block device support is enabled (CONFIG_BLK_DEV_ZONED option).
- * If @model specifies BLK_ZONED_HA (host aware), the effective model used
- * depends on CONFIG_BLK_DEV_ZONED settings and on the existence of partitions
- * on the disk.
+ * disk_set_zoned - inidicate a zoned device
+ * @disk: gendisk to configure
*/
-void disk_set_zoned(struct gendisk *disk, enum blk_zoned_model model)
+void disk_set_zoned(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
- unsigned int old_model = q->limits.zoned;
-
- switch (model) {
- case BLK_ZONED_HM:
- /*
- * Host managed devices are supported only if
- * CONFIG_BLK_DEV_ZONED is enabled.
- */
- WARN_ON_ONCE(!IS_ENABLED(CONFIG_BLK_DEV_ZONED));
- break;
- case BLK_ZONED_HA:
- /*
- * Host aware devices can be treated either as regular block
- * devices (similar to drive managed devices) or as zoned block
- * devices to take advantage of the zone command set, similarly
- * to host managed devices. We try the latter if there are no
- * partitions and zoned block device support is enabled, else
- * we do nothing special as far as the block layer is concerned.
- */
- if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) ||
- disk_has_partitions(disk))
- model = BLK_ZONED_NONE;
- break;
- case BLK_ZONED_NONE:
- default:
- if (WARN_ON_ONCE(model != BLK_ZONED_NONE))
- model = BLK_ZONED_NONE;
- break;
- }
- q->limits.zoned = model;
- if (model != BLK_ZONED_NONE) {
- /*
- * Set the zone write granularity to the device logical block
- * size by default. The driver can change this value if needed.
- */
- blk_queue_zone_write_granularity(q,
- queue_logical_block_size(q));
- } else if (old_model != BLK_ZONED_NONE) {
- disk_clear_zone_settings(disk);
- }
+ WARN_ON_ONCE(!IS_ENABLED(CONFIG_BLK_DEV_ZONED));
+
+ /*
+ * Set the zone write granularity to the device logical block
+ * size by default. The driver can change this value if needed.
+ */
+ q->limits.zoned = true;
+ blk_queue_zone_write_granularity(q, queue_logical_block_size(q));
}
EXPORT_SYMBOL_GPL(disk_set_zoned);
diff --git a/block/blk-stat.c b/block/blk-stat.c
index 7ff76ae6c..e42c263e5 100644
--- a/block/blk-stat.c
+++ b/block/blk-stat.c
@@ -27,7 +27,7 @@ void blk_rq_stat_init(struct blk_rq_stat *stat)
/* src is a per-cpu stat, mean isn't initialized */
void blk_rq_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src)
{
- if (!src->nr_samples)
+ if (dst->nr_samples + src->nr_samples <= dst->nr_samples)
return;
dst->min = min(dst->min, src->min);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 63e481262..6b2429cad 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -241,7 +241,7 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
if (max_sectors_kb == 0) {
q->limits.max_user_sectors = 0;
max_sectors_kb = min(max_hw_sectors_kb,
- BLK_DEF_MAX_SECTORS >> 1);
+ BLK_DEF_MAX_SECTORS_CAP >> 1);
} else {
if (max_sectors_kb > max_hw_sectors_kb ||
max_sectors_kb < page_kb)
@@ -309,14 +309,9 @@ QUEUE_SYSFS_BIT_FNS(stable_writes, STABLE_WRITES, 0);
static ssize_t queue_zoned_show(struct request_queue *q, char *page)
{
- switch (blk_queue_zoned_model(q)) {
- case BLK_ZONED_HA:
- return sprintf(page, "host-aware\n");
- case BLK_ZONED_HM:
+ if (blk_queue_is_zoned(q))
return sprintf(page, "host-managed\n");
- default:
- return sprintf(page, "none\n");
- }
+ return sprintf(page, "none\n");
}
static ssize_t queue_nr_zones_show(struct request_queue *q, char *page)
@@ -615,6 +610,7 @@ static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page,
QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec");
#endif
+/* Common attributes for bio-based and request-based queues. */
static struct attribute *queue_attrs[] = {
&queue_ra_entry.attr,
&queue_max_hw_sectors_entry.attr,
@@ -659,6 +655,7 @@ static struct attribute *queue_attrs[] = {
NULL,
};
+/* Request-based queue attributes that are not relevant for bio-based queues. */
static struct attribute *blk_mq_queue_attrs[] = {
&queue_requests_entry.attr,
&elv_iosched_entry.attr,
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 0bb613139..0c0e270a8 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -84,8 +84,6 @@ struct rq_wb {
u64 sync_issue;
void *sync_cookie;
- unsigned int wc;
-
unsigned long last_issue; /* last non-throttled issue */
unsigned long last_comp; /* last non-throttled comp */
unsigned long min_lat_nsec;
@@ -165,9 +163,9 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
*/
static bool wb_recent_wait(struct rq_wb *rwb)
{
- struct bdi_writeback *wb = &rwb->rqos.disk->bdi->wb;
+ struct backing_dev_info *bdi = rwb->rqos.disk->bdi;
- return time_before(jiffies, wb->dirty_sleep + HZ);
+ return time_before(jiffies, bdi->last_bdp_sleep + HZ);
}
static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb,
@@ -207,7 +205,8 @@ static void wbt_rqw_done(struct rq_wb *rwb, struct rq_wait *rqw,
*/
if (wb_acct & WBT_DISCARD)
limit = rwb->wb_background;
- else if (rwb->wc && !wb_recent_wait(rwb))
+ else if (test_bit(QUEUE_FLAG_WC, &rwb->rqos.disk->queue->queue_flags) &&
+ !wb_recent_wait(rwb))
limit = 0;
else
limit = rwb->wb_normal;
@@ -699,13 +698,6 @@ static void wbt_requeue(struct rq_qos *rqos, struct request *rq)
}
}
-void wbt_set_write_cache(struct request_queue *q, bool write_cache_on)
-{
- struct rq_qos *rqos = wbt_rq_qos(q);
- if (rqos)
- RQWB(rqos)->wc = write_cache_on;
-}
-
/*
* Enable wbt if defaults are configured that way
*/
@@ -918,7 +910,6 @@ int wbt_init(struct gendisk *disk)
rwb->last_comp = rwb->last_issue = jiffies;
rwb->win_nsec = RWB_WINDOW_NSEC;
rwb->enable_state = WBT_STATE_ON_DEFAULT;
- rwb->wc = test_bit(QUEUE_FLAG_WC, &q->queue_flags);
rwb->rq_depth.default_depth = RWB_DEF_DEPTH;
rwb->min_lat_nsec = wbt_default_latency_nsec(q);
rwb->rq_depth.queue_depth = blk_queue_depth(q);
diff --git a/block/blk-wbt.h b/block/blk-wbt.h
index 8a029e138..e5fc653b9 100644
--- a/block/blk-wbt.h
+++ b/block/blk-wbt.h
@@ -12,8 +12,6 @@ u64 wbt_get_min_lat(struct request_queue *q);
void wbt_set_min_lat(struct request_queue *q, u64 val);
bool wbt_disabled(struct request_queue *);
-void wbt_set_write_cache(struct request_queue *, bool);
-
u64 wbt_default_latency_nsec(struct request_queue *);
#else
@@ -24,9 +22,6 @@ static inline void wbt_disable_default(struct gendisk *disk)
static inline void wbt_enable_default(struct gendisk *disk)
{
}
-static inline void wbt_set_write_cache(struct request_queue *q, bool wc)
-{
-}
#endif /* CONFIG_BLK_WBT */
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 619ee41a5..d343e5756 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -498,7 +498,6 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
set_bit(idx, args->conv_zones_bitmap);
break;
case BLK_ZONE_TYPE_SEQWRITE_REQ:
- case BLK_ZONE_TYPE_SEQWRITE_PREF:
if (!args->seq_zones_wlock) {
args->seq_zones_wlock =
blk_alloc_zone_bitmap(q->node, args->nr_zones);
@@ -506,6 +505,7 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
return -ENOMEM;
}
break;
+ case BLK_ZONE_TYPE_SEQWRITE_PREF:
default:
pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n",
disk->disk_name, (int)zone->type, zone->start);
@@ -615,22 +615,3 @@ int blk_revalidate_disk_zones(struct gendisk *disk,
return ret;
}
EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
-
-void disk_clear_zone_settings(struct gendisk *disk)
-{
- struct request_queue *q = disk->queue;
-
- blk_mq_freeze_queue(q);
-
- disk_free_zone_bitmaps(disk);
- blk_queue_flag_clear(QUEUE_FLAG_ZONE_RESETALL, q);
- q->required_elevator_features &= ~ELEVATOR_F_ZBD_SEQ_WRITE;
- disk->nr_zones = 0;
- disk->max_open_zones = 0;
- disk->max_active_zones = 0;
- q->limits.chunk_sectors = 0;
- q->limits.zone_write_granularity = 0;
- q->limits.max_zone_append_sectors = 0;
-
- blk_mq_unfreeze_queue(q);
-}
diff --git a/block/blk.h b/block/blk.h
index 08a358bc0..1ef920f72 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -395,14 +395,12 @@ static inline struct bio *blk_queue_bounce(struct bio *bio,
#ifdef CONFIG_BLK_DEV_ZONED
void disk_free_zone_bitmaps(struct gendisk *disk);
-void disk_clear_zone_settings(struct gendisk *disk);
int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd,
unsigned long arg);
int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
unsigned int cmd, unsigned long arg);
#else /* CONFIG_BLK_DEV_ZONED */
static inline void disk_free_zone_bitmaps(struct gendisk *disk) {}
-static inline void disk_clear_zone_settings(struct gendisk *disk) {}
static inline int blkdev_report_zones_ioctl(struct block_device *bdev,
unsigned int cmd, unsigned long arg)
{
diff --git a/block/disk-events.c b/block/disk-events.c
index 13c3372c4..2f6972243 100644
--- a/block/disk-events.c
+++ b/block/disk-events.c
@@ -266,11 +266,8 @@ static unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask)
* disk_check_media_change - check if a removable media has been changed
* @disk: gendisk to check
*
- * Check whether a removable media has been changed, and attempt to free all
- * dentries and inodes and invalidates all block device page cache entries in
- * that case.
- *
- * Returns %true if the media has changed, or %false if not.
+ * Returns %true and marks the disk for a partition rescan whether a removable
+ * media has been changed, and %false if the media did not change.
*/
bool disk_check_media_change(struct gendisk *disk)
{
@@ -278,12 +275,11 @@ bool disk_check_media_change(struct gendisk *disk)
events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE |
DISK_EVENT_EJECT_REQUEST);
- if (!(events & DISK_EVENT_MEDIA_CHANGE))
- return false;
-
- bdev_mark_dead(disk->part0, true);
- set_bit(GD_NEED_PART_SCAN, &disk->state);
- return true;
+ if (events & DISK_EVENT_MEDIA_CHANGE) {
+ set_bit(GD_NEED_PART_SCAN, &disk->state);
+ return true;
+ }
+ return false;
}
EXPORT_SYMBOL(disk_check_media_change);
diff --git a/block/fops.c b/block/fops.c
index 73e427425..0cf8cf72c 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -410,9 +410,24 @@ static int blkdev_get_block(struct inode *inode, sector_t iblock,
return 0;
}
-static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
+/*
+ * We cannot call mpage_writepages() as it does not take the buffer lock.
+ * We must use block_write_full_folio() directly which holds the buffer
+ * lock. The buffer lock provides the synchronisation with writeback
+ * that filesystems rely on when they use the blockdev's mapping.
+ */
+static int blkdev_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
{
- return block_write_full_page(page, blkdev_get_block, wbc);
+ struct blk_plug plug;
+ int err;
+
+ blk_start_plug(&plug);
+ err = write_cache_pages(mapping, wbc, block_write_full_folio,
+ blkdev_get_block);
+ blk_finish_plug(&plug);
+
+ return err;
}
static int blkdev_read_folio(struct file *file, struct folio *folio)
@@ -449,7 +464,7 @@ const struct address_space_operations def_blk_aops = {
.invalidate_folio = block_invalidate_folio,
.read_folio = blkdev_read_folio,
.readahead = blkdev_readahead,
- .writepage = blkdev_writepage,
+ .writepages = blkdev_writepages,
.write_begin = blkdev_write_begin,
.write_end = blkdev_write_end,
.migrate_folio = buffer_migrate_folio_norefs,
@@ -500,7 +515,7 @@ const struct address_space_operations def_blk_aops = {
.readahead = blkdev_readahead,
.writepages = blkdev_writepages,
.is_partially_uptodate = iomap_is_partially_uptodate,
- .error_remove_page = generic_error_remove_page,
+ .error_remove_folio = generic_error_remove_folio,
.migrate_folio = filemap_migrate_folio,
};
#endif /* CONFIG_BUFFER_HEAD */
@@ -542,15 +557,31 @@ static int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
return error;
}
+/**
+ * file_to_blk_mode - get block open flags from file flags
+ * @file: file whose open flags should be converted
+ *
+ * Look at file open flags and generate corresponding block open flags from
+ * them. The function works both for file just being open (e.g. during ->open
+ * callback) and for file that is already open. This is actually non-trivial
+ * (see comment in the function).
+ */
blk_mode_t file_to_blk_mode(struct file *file)
{
blk_mode_t mode = 0;
+ struct bdev_handle *handle = file->private_data;
if (file->f_mode & FMODE_READ)
mode |= BLK_OPEN_READ;
if (file->f_mode & FMODE_WRITE)
mode |= BLK_OPEN_WRITE;
- if (file->private_data)
+ /*
+ * do_dentry_open() clears O_EXCL from f_flags, use handle->mode to
+ * determine whether the open was exclusive for already open files.
+ */
+ if (handle)
+ mode |= handle->mode & BLK_OPEN_EXCL;
+ else if (file->f_flags & O_EXCL)
mode |= BLK_OPEN_EXCL;
if (file->f_flags & O_NDELAY)
mode |= BLK_OPEN_NDELAY;
@@ -568,7 +599,8 @@ blk_mode_t file_to_blk_mode(struct file *file)
static int blkdev_open(struct inode *inode, struct file *filp)
{
- struct block_device *bdev;
+ struct bdev_handle *handle;
+ blk_mode_t mode;
/*
* Preserve backwards compatibility and allow large file access
@@ -579,29 +611,24 @@ static int blkdev_open(struct inode *inode, struct file *filp)
filp->f_flags |= O_LARGEFILE;
filp->f_mode |= FMODE_BUF_RASYNC | FMODE_CAN_ODIRECT;
- /*
- * Use the file private data to store the holder for exclusive openes.
- * file_to_blk_mode relies on it being present to set BLK_OPEN_EXCL.
- */
- if (filp->f_flags & O_EXCL)
- filp->private_data = filp;
-
- bdev = blkdev_get_by_dev(inode->i_rdev, file_to_blk_mode(filp),
- filp->private_data, NULL);
- if (IS_ERR(bdev))
- return PTR_ERR(bdev);
+ mode = file_to_blk_mode(filp);
+ handle = bdev_open_by_dev(inode->i_rdev, mode,
+ mode & BLK_OPEN_EXCL ? filp : NULL, NULL);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
- if (bdev_nowait(bdev))
+ if (bdev_nowait(handle->bdev))
filp->f_mode |= FMODE_NOWAIT;
- filp->f_mapping = bdev->bd_inode->i_mapping;
+ filp->f_mapping = handle->bdev->bd_inode->i_mapping;
filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping);
+ filp->private_data = handle;
return 0;
}
static int blkdev_release(struct inode *inode, struct file *filp)
{
- blkdev_put(I_BDEV(filp->f_mapping->host), filp->private_data);
+ bdev_release(filp->private_data);
return 0;
}
diff --git a/block/genhd.c b/block/genhd.c
index f9b81be6c..d74fb5b4a 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -342,7 +342,7 @@ EXPORT_SYMBOL_GPL(disk_uevent);
int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode)
{
- struct block_device *bdev;
+ struct bdev_handle *handle;
int ret = 0;
if (disk->flags & (GENHD_FL_NO_PART | GENHD_FL_HIDDEN))
@@ -366,12 +366,12 @@ int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode)
}
set_bit(GD_NEED_PART_SCAN, &disk->state);
- bdev = blkdev_get_by_dev(disk_devt(disk), mode & ~BLK_OPEN_EXCL, NULL,
- NULL);
- if (IS_ERR(bdev))
- ret = PTR_ERR(bdev);
+ handle = bdev_open_by_dev(disk_devt(disk), mode & ~BLK_OPEN_EXCL, NULL,
+ NULL);
+ if (IS_ERR(handle))
+ ret = PTR_ERR(handle);
else
- blkdev_put(bdev, NULL);
+ bdev_release(handle);
/*
* If blkdev_get_by_dev() failed early, GD_NEED_PART_SCAN is still set,
@@ -562,6 +562,13 @@ static void blk_report_disk_dead(struct gendisk *disk, bool surprise)
struct block_device *bdev;
unsigned long idx;
+ /*
+ * On surprise disk removal, bdev_mark_dead() may call into file
+ * systems below. Make it clear that we're expecting to not hold
+ * disk->open_mutex.
+ */
+ lockdep_assert_not_held(&disk->open_mutex);
+
rcu_read_lock();
xa_for_each(&disk->part_tbl, idx, bdev) {
if (!kobject_get_unless_zero(&bdev->bd_device.kobj))
diff --git a/block/holder.c b/block/holder.c
index 37d18c13d..791091a7e 100644
--- a/block/holder.c
+++ b/block/holder.c
@@ -8,6 +8,8 @@ struct bd_holder_disk {
int refcnt;
};
+static DEFINE_MUTEX(blk_holder_mutex);
+
static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev,
struct gendisk *disk)
{
@@ -80,7 +82,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
kobject_get(bdev->bd_holder_dir);
mutex_unlock(&bdev->bd_disk->open_mutex);
- mutex_lock(&disk->open_mutex);
+ mutex_lock(&blk_holder_mutex);
WARN_ON_ONCE(!bdev->bd_holder);
holder = bd_find_holder_disk(bdev, disk);
@@ -108,7 +110,7 @@ int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
goto out_del_symlink;
list_add(&holder->list, &disk->slave_bdevs);
- mutex_unlock(&disk->open_mutex);
+ mutex_unlock(&blk_holder_mutex);
return 0;
out_del_symlink:
@@ -116,7 +118,7 @@ out_del_symlink:
out_free_holder:
kfree(holder);
out_unlock:
- mutex_unlock(&disk->open_mutex);
+ mutex_unlock(&blk_holder_mutex);
if (ret)
kobject_put(bdev->bd_holder_dir);
return ret;
@@ -140,7 +142,7 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
if (WARN_ON_ONCE(!disk->slave_dir))
return;
- mutex_lock(&disk->open_mutex);
+ mutex_lock(&blk_holder_mutex);
holder = bd_find_holder_disk(bdev, disk);
if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) {
del_symlink(disk->slave_dir, bdev_kobj(bdev));
@@ -149,6 +151,6 @@ void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
list_del_init(&holder->list);
kfree(holder);
}
- mutex_unlock(&disk->open_mutex);
+ mutex_unlock(&blk_holder_mutex);
}
EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
diff --git a/block/ioctl.c b/block/ioctl.c
index d1d8e8391..5f8c98823 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -371,9 +371,10 @@ static int blkdev_flushbuf(struct block_device *bdev, unsigned cmd,
mutex_lock(&bdev->bd_holder_lock);
if (bdev->bd_holder_ops && bdev->bd_holder_ops->sync)
bdev->bd_holder_ops->sync(bdev);
- else
+ else {
+ mutex_unlock(&bdev->bd_holder_lock);
sync_blockdev(bdev);
- mutex_unlock(&bdev->bd_holder_lock);
+ }
invalidate_bdev(bdev);
return 0;
@@ -468,6 +469,7 @@ static int blkdev_bszset(struct block_device *bdev, blk_mode_t mode,
int __user *argp)
{
int ret, n;
+ struct bdev_handle *handle;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
@@ -479,10 +481,11 @@ static int blkdev_bszset(struct block_device *bdev, blk_mode_t mode,
if (mode & BLK_OPEN_EXCL)
return set_blocksize(bdev, n);
- if (IS_ERR(blkdev_get_by_dev(bdev->bd_dev, mode, &bdev, NULL)))
+ handle = bdev_open_by_dev(bdev->bd_dev, mode, &bdev, NULL);
+ if (IS_ERR(handle))
return -EBUSY;
ret = set_blocksize(bdev, n);
- blkdev_put(bdev, &bdev);
+ bdev_release(handle);
return ret;
}
@@ -553,7 +556,8 @@ static int blkdev_common_ioctl(struct block_device *bdev, blk_mode_t mode,
return -EACCES;
if (bdev_is_partition(bdev))
return -EINVAL;
- return disk_scan_partitions(bdev->bd_disk, mode);
+ return disk_scan_partitions(bdev->bd_disk,
+ mode | BLK_OPEN_STRICT_SCAN);
case BLKTRACESTART:
case BLKTRACESTOP:
case BLKTRACETEARDOWN:
diff --git a/block/ioprio.c b/block/ioprio.c
index b5a942519..73301a261 100644
--- a/block/ioprio.c
+++ b/block/ioprio.c
@@ -139,32 +139,6 @@ out:
return ret;
}
-/*
- * If the task has set an I/O priority, use that. Otherwise, return
- * the default I/O priority.
- *
- * Expected to be called for current task or with task_lock() held to keep
- * io_context stable.
- */
-int __get_task_ioprio(struct task_struct *p)
-{
- struct io_context *ioc = p->io_context;
- int prio;
-
- if (p != current)
- lockdep_assert_held(&p->alloc_lock);
- if (ioc)
- prio = ioc->ioprio;
- else
- prio = IOPRIO_DEFAULT;
-
- if (IOPRIO_PRIO_CLASS(prio) == IOPRIO_CLASS_NONE)
- prio = IOPRIO_PRIO_VALUE(task_nice_ioclass(p),
- task_nice_ioprio(p));
- return prio;
-}
-EXPORT_SYMBOL_GPL(__get_task_ioprio);
-
static int get_task_ioprio(struct task_struct *p)
{
int ret;
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index f958e7927..02a916ba6 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -646,9 +646,8 @@ static void dd_depth_updated(struct blk_mq_hw_ctx *hctx)
struct request_queue *q = hctx->queue;
struct deadline_data *dd = q->elevator->elevator_data;
struct blk_mq_tags *tags = hctx->sched_tags;
- unsigned int shift = tags->bitmap_tags.sb.shift;
- dd->async_depth = max(1U, 3 * (1U << shift) / 4);
+ dd->async_depth = max(1UL, 3 * q->nr_requests / 4);
sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, dd->async_depth);
}
diff --git a/block/opal_proto.h b/block/opal_proto.h
index dec7ce3a3..d247a457b 100644
--- a/block/opal_proto.h
+++ b/block/opal_proto.h
@@ -71,6 +71,7 @@ enum opal_response_token {
#define SHORT_ATOM_BYTE 0xBF
#define MEDIUM_ATOM_BYTE 0xDF
#define LONG_ATOM_BYTE 0xE3
+#define EMPTY_ATOM_BYTE 0xFF
#define OPAL_INVAL_PARAM 12
#define OPAL_MANUFACTURED_INACTIVE 0x08
diff --git a/block/partitions/core.c b/block/partitions/core.c
index e58c8b503..5f5ed5c75 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -274,17 +274,6 @@ void drop_partition(struct block_device *part)
put_device(&part->bd_device);
}
-static void delete_partition(struct block_device *part)
-{
- /*
- * Remove the block device from the inode hash, so that it cannot be
- * looked up any more even when openers still hold references.
- */
- remove_inode_hash(part->bd_inode);
- bdev_mark_dead(part, false);
- drop_partition(part);
-}
-
static ssize_t whole_disk_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
@@ -316,18 +305,10 @@ static struct block_device *add_partition(struct gendisk *disk, int partno,
* Partitions are not supported on zoned block devices that are used as
* such.
*/
- switch (disk->queue->limits.zoned) {
- case BLK_ZONED_HM:
+ if (bdev_is_zoned(disk->part0)) {
pr_warn("%s: partitions not supported on host managed zoned block device\n",
disk->disk_name);
return ERR_PTR(-ENXIO);
- case BLK_ZONED_HA:
- pr_info("%s: disabling host aware zoned block device support due to partitions\n",
- disk->disk_name);
- disk_set_zoned(disk, BLK_ZONED_NONE);
- break;
- case BLK_ZONED_NONE:
- break;
}
if (xa_load(&disk->part_tbl, partno))
@@ -490,7 +471,18 @@ int bdev_del_partition(struct gendisk *disk, int partno)
if (atomic_read(&part->bd_openers))
goto out_unlock;
- delete_partition(part);
+ /*
+ * We verified that @part->bd_openers is zero above and so
+ * @part->bd_holder{_ops} can't be set. And since we hold
+ * @disk->open_mutex the device can't be claimed by anyone.
+ *
+ * So no need to call @part->bd_holder_ops->mark_dead() here.
+ * Just delete the partition and invalidate it.
+ */
+
+ remove_inode_hash(part->bd_inode);
+ invalidate_bdev(part);
+ drop_partition(part);
ret = 0;
out_unlock:
mutex_unlock(&disk->open_mutex);
@@ -575,8 +567,8 @@ static bool blk_add_partition(struct gendisk *disk,
part = add_partition(disk, p, from, size, state->parts[p].flags,
&state->parts[p].info);
if (IS_ERR(part) && PTR_ERR(part) != -ENXIO) {
- printk(KERN_ERR " %s: p%d could not be added: %ld\n",
- disk->disk_name, p, -PTR_ERR(part));
+ printk(KERN_ERR " %s: p%d could not be added: %pe\n",
+ disk->disk_name, p, part);
return true;
}
@@ -618,7 +610,7 @@ static int blk_add_partitions(struct gendisk *disk)
/*
* Partitions are not supported on host managed zoned block devices.
*/
- if (disk->queue->limits.zoned == BLK_ZONED_HM) {
+ if (bdev_is_zoned(disk->part0)) {
pr_warn("%s: ignoring partition table on host managed zoned block device\n",
disk->disk_name);
ret = 0;
@@ -668,8 +660,23 @@ rescan:
sync_blockdev(disk->part0);
invalidate_bdev(disk->part0);
- xa_for_each_start(&disk->part_tbl, idx, part, 1)
- delete_partition(part);
+ xa_for_each_start(&disk->part_tbl, idx, part, 1) {
+ /*
+ * Remove the block device from the inode hash, so that
+ * it cannot be looked up any more even when openers
+ * still hold references.
+ */
+ remove_inode_hash(part->bd_inode);
+
+ /*
+ * If @disk->open_partitions isn't elevated but there's
+ * still an active holder of that block device things
+ * are broken.
+ */
+ WARN_ON_ONCE(atomic_read(&part->bd_openers));
+ invalidate_bdev(part);
+ drop_partition(part);
+ }
clear_bit(GD_NEED_PART_SCAN, &disk->state);
/*
diff --git a/block/partitions/ibm.c b/block/partitions/ibm.c
index 403756dbd..82d9c4c3f 100644
--- a/block/partitions/ibm.c
+++ b/block/partitions/ibm.c
@@ -61,6 +61,47 @@ static sector_t cchhb2blk(struct vtoc_cchhb *ptr, struct hd_geometry *geo)
ptr->b;
}
+/* Volume Label Type/ID Length */
+#define DASD_VOL_TYPE_LEN 4
+#define DASD_VOL_ID_LEN 6
+
+/* Volume Label Types */
+#define DASD_VOLLBL_TYPE_VOL1 0
+#define DASD_VOLLBL_TYPE_LNX1 1
+#define DASD_VOLLBL_TYPE_CMS1 2
+
+struct dasd_vollabel {
+ char *type;
+ int idx;
+};
+
+static struct dasd_vollabel dasd_vollabels[] = {
+ [DASD_VOLLBL_TYPE_VOL1] = {
+ .type = "VOL1",
+ .idx = DASD_VOLLBL_TYPE_VOL1,
+ },
+ [DASD_VOLLBL_TYPE_LNX1] = {
+ .type = "LNX1",
+ .idx = DASD_VOLLBL_TYPE_LNX1,
+ },
+ [DASD_VOLLBL_TYPE_CMS1] = {
+ .type = "CMS1",
+ .idx = DASD_VOLLBL_TYPE_CMS1,
+ },
+};
+
+static int get_label_by_type(const char *type)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(dasd_vollabels); i++) {
+ if (!memcmp(type, dasd_vollabels[i].type, DASD_VOL_TYPE_LEN))
+ return dasd_vollabels[i].idx;
+ }
+
+ return -1;
+}
+
static int find_label(struct parsed_partitions *state,
dasd_information2_t *info,
struct hd_geometry *geo,
@@ -70,12 +111,10 @@ static int find_label(struct parsed_partitions *state,
char type[],
union label_t *label)
{
- Sector sect;
- unsigned char *data;
sector_t testsect[3];
- unsigned char temp[5];
- int found = 0;
int i, testcount;
+ Sector sect;
+ void *data;
/* There a three places where we may find a valid label:
* - on an ECKD disk it's block 2
@@ -103,31 +142,27 @@ static int find_label(struct parsed_partitions *state,
if (data == NULL)
continue;
memcpy(label, data, sizeof(*label));
- memcpy(temp, data, 4);
- temp[4] = 0;
- EBCASC(temp, 4);
+ memcpy(type, data, DASD_VOL_TYPE_LEN);
+ EBCASC(type, DASD_VOL_TYPE_LEN);
put_dev_sector(sect);
- if (!strcmp(temp, "VOL1") ||
- !strcmp(temp, "LNX1") ||
- !strcmp(temp, "CMS1")) {
- if (!strcmp(temp, "VOL1")) {
- strncpy(type, label->vol.vollbl, 4);
- strncpy(name, label->vol.volid, 6);
- } else {
- strncpy(type, label->lnx.vollbl, 4);
- strncpy(name, label->lnx.volid, 6);
- }
- EBCASC(type, 4);
- EBCASC(name, 6);
+ switch (get_label_by_type(type)) {
+ case DASD_VOLLBL_TYPE_VOL1:
+ memcpy(name, label->vol.volid, DASD_VOL_ID_LEN);
+ EBCASC(name, DASD_VOL_ID_LEN);
+ *labelsect = testsect[i];
+ return 1;
+ case DASD_VOLLBL_TYPE_LNX1:
+ case DASD_VOLLBL_TYPE_CMS1:
+ memcpy(name, label->lnx.volid, DASD_VOL_ID_LEN);
+ EBCASC(name, DASD_VOL_ID_LEN);
*labelsect = testsect[i];
- found = 1;
+ return 1;
+ default:
break;
}
}
- if (!found)
- memset(label, 0, sizeof(*label));
- return found;
+ return 0;
}
static int find_vol1_partitions(struct parsed_partitions *state,
@@ -297,8 +332,8 @@ int ibm_partition(struct parsed_partitions *state)
sector_t nr_sectors;
dasd_information2_t *info;
struct hd_geometry *geo;
- char type[5] = {0,};
- char name[7] = {0,};
+ char type[DASD_VOL_TYPE_LEN + 1] = "";
+ char name[DASD_VOL_ID_LEN + 1] = "";
sector_t labelsect;
union label_t *label;
@@ -330,18 +365,21 @@ int ibm_partition(struct parsed_partitions *state)
info = NULL;
}
- if (find_label(state, info, geo, blocksize, &labelsect, name, type,
- label)) {
- if (!strncmp(type, "VOL1", 4)) {
+ if (find_label(state, info, geo, blocksize, &labelsect, name, type, label)) {
+ switch (get_label_by_type(type)) {
+ case DASD_VOLLBL_TYPE_VOL1:
res = find_vol1_partitions(state, geo, blocksize, name,
label);
- } else if (!strncmp(type, "LNX1", 4)) {
+ break;
+ case DASD_VOLLBL_TYPE_LNX1:
res = find_lnx1_partitions(state, geo, blocksize, name,
label, labelsect, nr_sectors,
info);
- } else if (!strncmp(type, "CMS1", 4)) {
+ break;
+ case DASD_VOLLBL_TYPE_CMS1:
res = find_cms1_partitions(state, geo, blocksize, name,
label, labelsect);
+ break;
}
} else if (info) {
/*
diff --git a/block/sed-opal.c b/block/sed-opal.c
index 04f38a3f5..fa4dba5d8 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -18,6 +18,7 @@
#include <linux/uaccess.h>
#include <uapi/linux/sed-opal.h>
#include <linux/sed-opal.h>
+#include <linux/sed-opal-key.h>
#include <linux/string.h>
#include <linux/kdev_t.h>
#include <linux/key.h>
@@ -1055,16 +1056,20 @@ static int response_parse(const u8 *buf, size_t length,
token_length = response_parse_medium(iter, pos);
else if (pos[0] <= LONG_ATOM_BYTE) /* long atom */
token_length = response_parse_long(iter, pos);
+ else if (pos[0] == EMPTY_ATOM_BYTE) /* empty atom */
+ token_length = 1;
else /* TOKEN */
token_length = response_parse_token(iter, pos);
if (token_length < 0)
return token_length;
+ if (pos[0] != EMPTY_ATOM_BYTE)
+ num_entries++;
+
pos += token_length;
total -= token_length;
iter++;
- num_entries++;
}
resp->num = num_entries;
@@ -3018,7 +3023,13 @@ static int opal_set_new_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw)
if (ret)
return ret;
- /* update keyring with new password */
+ /* update keyring and key store with new password */
+ ret = sed_write_key(OPAL_AUTH_KEY,
+ opal_pw->new_user_pw.opal_key.key,
+ opal_pw->new_user_pw.opal_key.key_len);
+ if (ret != -EOPNOTSUPP)
+ pr_warn("error updating SED key: %d\n", ret);
+
ret = update_sed_opal_key(OPAL_AUTH_KEY,
opal_pw->new_user_pw.opal_key.key,
opal_pw->new_user_pw.opal_key.key_len);
@@ -3291,6 +3302,8 @@ EXPORT_SYMBOL_GPL(sed_ioctl);
static int __init sed_opal_init(void)
{
struct key *kr;
+ char init_sed_key[OPAL_KEY_MAX];
+ int keylen = OPAL_KEY_MAX - 1;
kr = keyring_alloc(".sed_opal",
GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(),
@@ -3303,6 +3316,11 @@ static int __init sed_opal_init(void)
sed_opal_keyring = kr;
- return 0;
+ if (sed_read_key(OPAL_AUTH_KEY, init_sed_key, &keylen) < 0) {
+ memset(init_sed_key, '\0', sizeof(init_sed_key));
+ keylen = OPAL_KEY_MAX - 1;
+ }
+
+ return update_sed_opal_key(OPAL_AUTH_KEY, init_sed_key, keylen);
}
late_initcall(sed_opal_init);