From dc50eab76b709d68175a358d6e23a5a3890764d3 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sat, 18 May 2024 19:39:57 +0200 Subject: Merging upstream version 6.7.7. Signed-off-by: Daniel Baumann --- block/Kconfig | 1 + block/badblocks.c | 1620 ++++++++++++++++++++++++++++++++++++++--------- block/bdev.c | 71 ++- block/bio.c | 2 +- block/blk-flush.c | 11 +- block/blk-iocost.c | 7 + block/blk-map.c | 13 +- block/blk-mq-debugfs.c | 1 - block/blk-mq.c | 70 +- block/blk-mq.h | 57 +- block/blk-pm.c | 33 +- block/blk-sysfs.c | 2 + block/blk-wbt.c | 4 +- block/disk-events.c | 18 +- block/fops.c | 44 +- block/genhd.c | 19 +- block/ioctl.c | 11 +- block/partitions/core.c | 43 +- block/partitions/ibm.c | 98 ++- block/sed-opal.c | 18 +- 20 files changed, 1662 insertions(+), 481 deletions(-) (limited to 'block') diff --git a/block/Kconfig b/block/Kconfig index f1364d1c0d..55ae2286a4 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -186,6 +186,7 @@ config BLK_SED_OPAL bool "Logic for interfacing with Opal enabled SEDs" depends on KEYS select PSERIES_PLPKS if PPC_PSERIES + select PSERIES_PLPKS_SED if PPC_PSERIES help Builds Logic for interfacing with Opal enabled controllers. Enabling this option enables users to setup/unlock/lock diff --git a/block/badblocks.c b/block/badblocks.c index 3afb550c0f..db4ec8b9b2 100644 --- a/block/badblocks.c +++ b/block/badblocks.c @@ -16,119 +16,830 @@ #include #include -/** - * badblocks_check() - check a given range for bad sectors - * @bb: the badblocks structure that holds all badblock information - * @s: sector (start) at which to check for badblocks - * @sectors: number of sectors to check for badblocks - * @first_bad: pointer to store location of the first badblock - * @bad_sectors: pointer to store number of badblocks after @first_bad +/* + * The purpose of badblocks set/clear is to manage bad blocks ranges which are + * identified by LBA addresses. * - * We can record which blocks on each device are 'bad' and so just - * fail those blocks, or that stripe, rather than the whole device. - * Entries in the bad-block table are 64bits wide. This comprises: - * Length of bad-range, in sectors: 0-511 for lengths 1-512 - * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes) - * A 'shift' can be set so that larger blocks are tracked and - * consequently larger devices can be covered. - * 'Acknowledged' flag - 1 bit. - the most significant bit. + * When the caller of badblocks_set() wants to set a range of bad blocks, the + * setting range can be acked or unacked. And the setting range may merge, + * overwrite, skip the overlapped already set range, depends on who they are + * overlapped or adjacent, and the acknowledgment type of the ranges. It can be + * more complicated when the setting range covers multiple already set bad block + * ranges, with restrictions of maximum length of each bad range and the bad + * table space limitation. * - * Locking of the bad-block table uses a seqlock so badblocks_check - * might need to retry if it is very unlucky. - * We will sometimes want to check for bad blocks in a bi_end_io function, - * so we use the write_seqlock_irq variant. + * It is difficult and unnecessary to take care of all the possible situations, + * for setting a large range of bad blocks, we can handle it by dividing the + * large range into smaller ones when encounter overlap, max range length or + * bad table full conditions. 
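For reference, the 64-bit entry layout described above (sector length in the
low 9 bits, 54-bit start offset, 'acknowledged' flag in the most significant
bit) can be sketched in plain C as follows. This is an illustration only, not
code from this patch; the kernel's real helpers are the
BB_MAKE/BB_OFFSET/BB_LEN/BB_ACK macros in include/linux/badblocks.h.

	#include <assert.h>
	#include <stdint.h>

	/* Illustrative packing of one bad-range entry:
	 *   bits 0-8  : length - 1  (ranges of 1..512 sectors)
	 *   bits 9-62 : start sector (54 bits)
	 *   bit  63   : 'acknowledged' flag
	 */
	static inline uint64_t bb_make(uint64_t start, unsigned int len, int ack)
	{
		return (start << 9) | (len - 1) | ((uint64_t)!!ack << 63);
	}

	static inline uint64_t bb_offset(uint64_t e)
	{
		return (e >> 9) & ((1ULL << 54) - 1);
	}

	static inline unsigned int bb_len(uint64_t e)
	{
		return (unsigned int)(e & 0x1ff) + 1;
	}

	static inline int bb_ack(uint64_t e)
	{
		return (int)(e >> 63);
	}

	int main(void)
	{
		/* 512 bad sectors starting at LBA 4096, acknowledged */
		uint64_t e = bb_make(4096, 512, 1);

		assert(bb_offset(e) == 4096);
		assert(bb_len(e) == 512);
		assert(bb_ack(e) == 1);
		return 0;
	}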
Every time only a smaller piece of the bad range + * is handled with a limited number of conditions how it is interacted with + * possible overlapped or adjacent already set bad block ranges. Then the hard + * complicated problem can be much simpler to handle in proper way. * - * When looking for a bad block we specify a range and want to - * know if any block in the range is bad. So we binary-search - * to the last range that starts at-or-before the given endpoint, - * (or "before the sector after the target range") - * then see if it ends after the given start. + * When setting a range of bad blocks to the bad table, the simplified situations + * to be considered are, (The already set bad blocks ranges are naming with + * prefix E, and the setting bad blocks range is naming with prefix S) * - * Return: - * 0: there are no known bad blocks in the range - * 1: there are known bad block which are all acknowledged - * -1: there are bad blocks which have not yet been acknowledged in metadata. - * plus the start/length of the first bad section we overlap. + * 1) A setting range is not overlapped or adjacent to any other already set bad + * block range. + * +--------+ + * | S | + * +--------+ + * +-------------+ +-------------+ + * | E1 | | E2 | + * +-------------+ +-------------+ + * For this situation if the bad blocks table is not full, just allocate a + * free slot from the bad blocks table to mark the setting range S. The + * result is, + * +-------------+ +--------+ +-------------+ + * | E1 | | S | | E2 | + * +-------------+ +--------+ +-------------+ + * 2) A setting range starts exactly at a start LBA of an already set bad blocks + * range. + * 2.1) The setting range size < already set range size + * +--------+ + * | S | + * +--------+ + * +-------------+ + * | E | + * +-------------+ + * 2.1.1) If S and E are both acked or unacked range, the setting range S can + * be merged into existing bad range E. The result is, + * +-------------+ + * | S | + * +-------------+ + * 2.1.2) If S is unacked setting and E is acked, the setting will be denied, and + * the result is, + * +-------------+ + * | E | + * +-------------+ + * 2.1.3) If S is acked setting and E is unacked, range S can overwrite on E. + * An extra slot from the bad blocks table will be allocated for S, and head + * of E will move to end of the inserted range S. The result is, + * +--------+----+ + * | S | E | + * +--------+----+ + * 2.2) The setting range size == already set range size + * 2.2.1) If S and E are both acked or unacked range, the setting range S can + * be merged into existing bad range E. The result is, + * +-------------+ + * | S | + * +-------------+ + * 2.2.2) If S is unacked setting and E is acked, the setting will be denied, and + * the result is, + * +-------------+ + * | E | + * +-------------+ + * 2.2.3) If S is acked setting and E is unacked, range S can overwrite all of + bad blocks range E. The result is, + * +-------------+ + * | S | + * +-------------+ + * 2.3) The setting range size > already set range size + * +-------------------+ + * | S | + * +-------------------+ + * +-------------+ + * | E | + * +-------------+ + * For such situation, the setting range S can be treated as two parts, the + * first part (S1) is as same size as the already set range E, the second + * part (S2) is the rest of setting range. 
+ * +-------------+-----+ +-------------+ +-----+ + * | S1 | S2 | | S1 | | S2 | + * +-------------+-----+ ===> +-------------+ +-----+ + * +-------------+ +-------------+ + * | E | | E | + * +-------------+ +-------------+ + * Now we only focus on how to handle the setting range S1 and already set + * range E, which are already explained in 2.2), for the rest S2 it will be + * handled later in next loop. + * 3) A setting range starts before the start LBA of an already set bad blocks + * range. + * +-------------+ + * | S | + * +-------------+ + * +-------------+ + * | E | + * +-------------+ + * For this situation, the setting range S can be divided into two parts, the + * first (S1) ends at the start LBA of already set range E, the second part + * (S2) starts exactly at a start LBA of the already set range E. + * +----+---------+ +----+ +---------+ + * | S1 | S2 | | S1 | | S2 | + * +----+---------+ ===> +----+ +---------+ + * +-------------+ +-------------+ + * | E | | E | + * +-------------+ +-------------+ + * Now only the first part S1 should be handled in this loop, which is in + * similar condition as 1). The rest part S2 has exact same start LBA address + * of the already set range E, they will be handled in next loop in one of + * situations in 2). + * 4) A setting range starts after the start LBA of an already set bad blocks + * range. + * 4.1) If the setting range S exactly matches the tail part of already set bad + * blocks range E, like the following chart shows, + * +---------+ + * | S | + * +---------+ + * +-------------+ + * | E | + * +-------------+ + * 4.1.1) If range S and E have same acknowledge value (both acked or unacked), + * they will be merged into one, the result is, + * +-------------+ + * | S | + * +-------------+ + * 4.1.2) If range E is acked and the setting range S is unacked, the setting + * request of S will be rejected, the result is, + * +-------------+ + * | E | + * +-------------+ + * 4.1.3) If range E is unacked, and the setting range S is acked, then S may + * overwrite the overlapped range of E, the result is, + * +---+---------+ + * | E | S | + * +---+---------+ + * 4.2) If the setting range S stays in middle of an already set range E, like + * the following chart shows, + * +----+ + * | S | + * +----+ + * +--------------+ + * | E | + * +--------------+ + * 4.2.1) If range S and E have same acknowledge value (both acked or unacked), + * they will be merged into one, the result is, + * +--------------+ + * | S | + * +--------------+ + * 4.2.2) If range E is acked and the setting range S is unacked, the setting + * request of S will be rejected, the result is also, + * +--------------+ + * | E | + * +--------------+ + * 4.2.3) If range E is unacked, and the setting range S is acked, then S will + * inserted into middle of E and split previous range E into two parts (E1 + * and E2), the result is, + * +----+----+----+ + * | E1 | S | E2 | + * +----+----+----+ + * 4.3) If the setting bad blocks range S is overlapped with an already set bad + * blocks range E. The range S starts after the start LBA of range E, and + * ends after the end LBA of range E, as the following chart shows, + * +-------------------+ + * | S | + * +-------------------+ + * +-------------+ + * | E | + * +-------------+ + * For this situation the range S can be divided into two parts, the first + * part (S1) ends at end range E, and the second part (S2) has rest range of + * origin S. 
+ * +---------+---------+ +---------+ +---------+ + * | S1 | S2 | | S1 | | S2 | + * +---------+---------+ ===> +---------+ +---------+ + * +-------------+ +-------------+ + * | E | | E | + * +-------------+ +-------------+ + * Now in this loop the setting range S1 and already set range E can be + * handled as the situations 4.1), the rest range S2 will be handled in next + * loop and ignored in this loop. + * 5) A setting bad blocks range S is adjacent to one or more already set bad + * blocks range(s), and they are all acked or unacked range. + * 5.1) Front merge: If the already set bad blocks range E is before setting + * range S and they are adjacent, + * +------+ + * | S | + * +------+ + * +-------+ + * | E | + * +-------+ + * 5.1.1) When total size of range S and E <= BB_MAX_LEN, and their acknowledge + * values are same, the setting range S can front merges into range E. The + * result is, + * +--------------+ + * | S | + * +--------------+ + * 5.1.2) Otherwise these two ranges cannot merge, just insert the setting + * range S right after already set range E into the bad blocks table. The + * result is, + * +--------+------+ + * | E | S | + * +--------+------+ + * 6) Special cases which above conditions cannot handle + * 6.1) Multiple already set ranges may merge into less ones in a full bad table + * +-------------------------------------------------------+ + * | S | + * +-------------------------------------------------------+ + * |<----- BB_MAX_LEN ----->| + * +-----+ +-----+ +-----+ + * | E1 | | E2 | | E3 | + * +-----+ +-----+ +-----+ + * In the above example, when the bad blocks table is full, inserting the + * first part of setting range S will fail because no more available slot + * can be allocated from bad blocks table. In this situation a proper + * setting method should be go though all the setting bad blocks range and + * look for chance to merge already set ranges into less ones. When there + * is available slot from bad blocks table, re-try again to handle more + * setting bad blocks ranges as many as possible. + * +------------------------+ + * | S3 | + * +------------------------+ + * |<----- BB_MAX_LEN ----->| + * +-----+-----+-----+---+-----+--+ + * | S1 | S2 | + * +-----+-----+-----+---+-----+--+ + * The above chart shows although the first part (S3) cannot be inserted due + * to no-space in bad blocks table, but the following E1, E2 and E3 ranges + * can be merged with rest part of S into less range S1 and S2. Now there is + * 1 free slot in bad blocks table. + * +------------------------+-----+-----+-----+---+-----+--+ + * | S3 | S1 | S2 | + * +------------------------+-----+-----+-----+---+-----+--+ + * Since the bad blocks table is not full anymore, re-try again for the + * origin setting range S. Now the setting range S3 can be inserted into the + * bad blocks table with previous freed slot from multiple ranges merge. + * 6.2) Front merge after overwrite + * In the following example, in bad blocks table, E1 is an acked bad blocks + * range and E2 is an unacked bad blocks range, therefore they are not able + * to merge into a larger range. The setting bad blocks range S is acked, + * therefore part of E2 can be overwritten by S. 
+ * +--------+ + * | S | acknowledged + * +--------+ S: 1 + * +-------+-------------+ E1: 1 + * | E1 | E2 | E2: 0 + * +-------+-------------+ + * With previous simplified routines, after overwriting part of E2 with S, + * the bad blocks table should be (E3 is remaining part of E2 which is not + * overwritten by S), + * acknowledged + * +-------+--------+----+ S: 1 + * | E1 | S | E3 | E1: 1 + * +-------+--------+----+ E3: 0 + * The above result is correct but not perfect. Range E1 and S in the bad + * blocks table are all acked, merging them into a larger one range may + * occupy less bad blocks table space and make badblocks_check() faster. + * Therefore in such situation, after overwriting range S, the previous range + * E1 should be checked for possible front combination. Then the ideal + * result can be, + * +----------------+----+ acknowledged + * | E1 | E3 | E1: 1 + * +----------------+----+ E3: 0 + * 6.3) Behind merge: If the already set bad blocks range E is behind the setting + * range S and they are adjacent. Normally we don't need to care about this + * because front merge handles this while going though range S from head to + * tail, except for the tail part of range S. When the setting range S are + * fully handled, all the above simplified routine doesn't check whether the + * tail LBA of range S is adjacent to the next already set range and not + * merge them even it is possible. + * +------+ + * | S | + * +------+ + * +-------+ + * | E | + * +-------+ + * For the above special situation, when the setting range S are all handled + * and the loop ends, an extra check is necessary for whether next already + * set range E is right after S and mergeable. + * 6.3.1) When total size of range E and S <= BB_MAX_LEN, and their acknowledge + * values are same, the setting range S can behind merges into range E. The + * result is, + * +--------------+ + * | S | + * +--------------+ + * 6.3.2) Otherwise these two ranges cannot merge, just insert the setting range + * S in front of the already set range E in the bad blocks table. The result + * is, + * +------+-------+ + * | S | E | + * +------+-------+ + * + * All the above 5 simplified situations and 3 special cases may cover 99%+ of + * the bad block range setting conditions. Maybe there is some rare corner case + * is not considered and optimized, it won't hurt if badblocks_set() fails due + * to no space, or some ranges are not merged to save bad blocks table space. + * + * Inside badblocks_set() each loop starts by jumping to re_insert label, every + * time for the new loop prev_badblocks() is called to find an already set range + * which starts before or at current setting range. Since the setting bad blocks + * range is handled from head to tail, most of the cases it is unnecessary to do + * the binary search inside prev_badblocks(), it is possible to provide a hint + * to prev_badblocks() for a fast path, then the expensive binary search can be + * avoided. In my test with the hint to prev_badblocks(), except for the first + * loop, all rested calls to prev_badblocks() can go into the fast path and + * return correct bad blocks table index immediately. + * + * + * Clearing a bad blocks range from the bad block table has similar idea as + * setting does, but much more simpler. The only thing needs to be noticed is + * when the clearing range hits middle of a bad block range, the existing bad + * block range will split into two, and one more item should be added into the + * bad block table. 
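As a minimal illustration of that split (a sketch, not code from this patch;
the in-kernel logic is front_splitting_clear() further below): clearing
[s, s + n) strictly inside an existing range [start, end) leaves two ranges,
[start, s) and [s + n, end), and therefore needs one extra slot in the table.

	/* Sketch only: half-open ranges, no table bookkeeping. */
	struct range { unsigned long long start, end; };

	static void split_on_clear(struct range e, unsigned long long s,
				   unsigned long long n,
				   struct range *left, struct range *right)
	{
		/* assumes e.start < s and s + n < e.end */
		left->start  = e.start;
		left->end    = s;
		right->start = s + n;
		right->end   = e.end;
	}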
The simplified situations to be considered are, (The already + * set bad blocks ranges in bad block table are naming with prefix E, and the + * clearing bad blocks range is naming with prefix C) + * + * 1) A clearing range is not overlapped to any already set ranges in bad block + * table. + * +-----+ | +-----+ | +-----+ + * | C | | | C | | | C | + * +-----+ or +-----+ or +-----+ + * +---+ | +----+ +----+ | +---+ + * | E | | | E1 | | E2 | | | E | + * +---+ | +----+ +----+ | +---+ + * For the above situations, no bad block to be cleared and no failure + * happens, simply returns 0. + * 2) The clearing range hits middle of an already setting bad blocks range in + * the bad block table. + * +---+ + * | C | + * +---+ + * +-----------------+ + * | E | + * +-----------------+ + * In this situation if the bad block table is not full, the range E will be + * split into two ranges E1 and E2. The result is, + * +------+ +------+ + * | E1 | | E2 | + * +------+ +------+ + * 3) The clearing range starts exactly at same LBA as an already set bad block range + * from the bad block table. + * 3.1) Partially covered at head part + * +------------+ + * | C | + * +------------+ + * +-----------------+ + * | E | + * +-----------------+ + * For this situation, the overlapped already set range will update the + * start LBA to end of C and shrink the range to BB_LEN(E) - BB_LEN(C). No + * item deleted from bad block table. The result is, + * +----+ + * | E1 | + * +----+ + * 3.2) Exact fully covered + * +-----------------+ + * | C | + * +-----------------+ + * +-----------------+ + * | E | + * +-----------------+ + * For this situation the whole bad blocks range E will be cleared and its + * corresponded item is deleted from the bad block table. + * 4) The clearing range exactly ends at same LBA as an already set bad block + * range. + * +-------+ + * | C | + * +-------+ + * +-----------------+ + * | E | + * +-----------------+ + * For the above situation, the already set range E is updated to shrink its + * end to the start of C, and reduce its length to BB_LEN(E) - BB_LEN(C). + * The result is, + * +---------+ + * | E | + * +---------+ + * 5) The clearing range is partially overlapped with an already set bad block + * range from the bad block table. + * 5.1) The already set bad block range is front overlapped with the clearing + * range. + * +----------+ + * | C | + * +----------+ + * +------------+ + * | E | + * +------------+ + * For such situation, the clearing range C can be treated as two parts. The + * first part ends at the start LBA of range E, and the second part starts at + * same LBA of range E. + * +----+-----+ +----+ +-----+ + * | C1 | C2 | | C1 | | C2 | + * +----+-----+ ===> +----+ +-----+ + * +------------+ +------------+ + * | E | | E | + * +------------+ +------------+ + * Now the first part C1 can be handled as condition 1), and the second part C2 can be + * handled as condition 3.1) in next loop. + * 5.2) The already set bad block range is behind overlaopped with the clearing + * range. + * +----------+ + * | C | + * +----------+ + * +------------+ + * | E | + * +------------+ + * For such situation, the clearing range C can be treated as two parts. The + * first part C1 ends at same end LBA of range E, and the second part starts + * at end LBA of range E. 
+ * +----+-----+ +----+ +-----+ + * | C1 | C2 | | C1 | | C2 | + * +----+-----+ ===> +----+ +-----+ + * +------------+ +------------+ + * | E | | E | + * +------------+ +------------+ + * Now the first part clearing range C1 can be handled as condition 4), and + * the second part clearing range C2 can be handled as condition 1) in next + * loop. + * + * All bad blocks range clearing can be simplified into the above 5 situations + * by only handling the head part of the clearing range in each run of the + * while-loop. The idea is similar to bad blocks range setting but much + * simpler. */ -int badblocks_check(struct badblocks *bb, sector_t s, int sectors, - sector_t *first_bad, int *bad_sectors) + +/* + * Find the range starts at-or-before 's' from bad table. The search + * starts from index 'hint' and stops at index 'hint_end' from the bad + * table. + */ +static int prev_by_hint(struct badblocks *bb, sector_t s, int hint) { - int hi; - int lo; + int hint_end = hint + 2; u64 *p = bb->page; - int rv; - sector_t target = s + sectors; - unsigned seq; + int ret = -1; - if (bb->shift > 0) { - /* round the start down, and the end up */ - s >>= bb->shift; - target += (1<shift) - 1; - target >>= bb->shift; + while ((hint < hint_end) && ((hint + 1) <= bb->count) && + (BB_OFFSET(p[hint]) <= s)) { + if ((hint + 1) == bb->count || BB_OFFSET(p[hint + 1]) > s) { + ret = hint; + break; + } + hint++; + } + + return ret; +} + +/* + * Find the range starts at-or-before bad->start. If 'hint' is provided + * (hint >= 0) then search in the bad table from hint firstly. It is + * very probably the wanted bad range can be found from the hint index, + * then the unnecessary while-loop iteration can be avoided. + */ +static int prev_badblocks(struct badblocks *bb, struct badblocks_context *bad, + int hint) +{ + sector_t s = bad->start; + int ret = -1; + int lo, hi; + u64 *p; + + if (!bb->count) + goto out; + + if (hint >= 0) { + ret = prev_by_hint(bb, s, hint); + if (ret >= 0) + goto out; } - /* 'target' is now the first block after the bad range */ -retry: - seq = read_seqbegin(&bb->lock); lo = 0; - rv = 0; hi = bb->count; + p = bb->page; - /* Binary search between lo and hi for 'target' - * i.e. for the last range that starts before 'target' - */ - /* INVARIANT: ranges before 'lo' and at-or-after 'hi' - * are known not to be the last range before target. - * VARIANT: hi-lo is the number of possible - * ranges, and decreases until it reaches 1 - */ + /* The following bisect search might be unnecessary */ + if (BB_OFFSET(p[lo]) > s) + return -1; + if (BB_OFFSET(p[hi - 1]) <= s) + return hi - 1; + + /* Do bisect search in bad table */ while (hi - lo > 1) { - int mid = (lo + hi) / 2; + int mid = (lo + hi)/2; sector_t a = BB_OFFSET(p[mid]); - if (a < target) - /* This could still be the one, earlier ranges - * could not. - */ + if (a == s) { + ret = mid; + goto out; + } + + if (a < s) lo = mid; else - /* This and later ranges are definitely out. */ hi = mid; } - /* 'lo' might be the last that started before target, but 'hi' isn't */ - if (hi > lo) { - /* need to check all range that end after 's' to see if - * any are unacknowledged. + + if (BB_OFFSET(p[lo]) <= s) + ret = lo; +out: + return ret; +} + +/* + * Return 'true' if the range indicated by 'bad' can be backward merged + * with the bad range (from the bad table) index by 'behind'. 
+ */ +static bool can_merge_behind(struct badblocks *bb, + struct badblocks_context *bad, int behind) +{ + sector_t sectors = bad->len; + sector_t s = bad->start; + u64 *p = bb->page; + + if ((s < BB_OFFSET(p[behind])) && + ((s + sectors) >= BB_OFFSET(p[behind])) && + ((BB_END(p[behind]) - s) <= BB_MAX_LEN) && + BB_ACK(p[behind]) == bad->ack) + return true; + return false; +} + +/* + * Do backward merge for range indicated by 'bad' and the bad range + * (from the bad table) indexed by 'behind'. The return value is merged + * sectors from bad->len. + */ +static int behind_merge(struct badblocks *bb, struct badblocks_context *bad, + int behind) +{ + sector_t sectors = bad->len; + sector_t s = bad->start; + u64 *p = bb->page; + int merged = 0; + + WARN_ON(s >= BB_OFFSET(p[behind])); + WARN_ON((s + sectors) < BB_OFFSET(p[behind])); + + if (s < BB_OFFSET(p[behind])) { + merged = BB_OFFSET(p[behind]) - s; + p[behind] = BB_MAKE(s, BB_LEN(p[behind]) + merged, bad->ack); + + WARN_ON((BB_LEN(p[behind]) + merged) >= BB_MAX_LEN); + } + + return merged; +} + +/* + * Return 'true' if the range indicated by 'bad' can be forward + * merged with the bad range (from the bad table) indexed by 'prev'. + */ +static bool can_merge_front(struct badblocks *bb, int prev, + struct badblocks_context *bad) +{ + sector_t s = bad->start; + u64 *p = bb->page; + + if (BB_ACK(p[prev]) == bad->ack && + (s < BB_END(p[prev]) || + (s == BB_END(p[prev]) && (BB_LEN(p[prev]) < BB_MAX_LEN)))) + return true; + return false; +} + +/* + * Do forward merge for range indicated by 'bad' and the bad range + * (from bad table) indexed by 'prev'. The return value is sectors + * merged from bad->len. + */ +static int front_merge(struct badblocks *bb, int prev, struct badblocks_context *bad) +{ + sector_t sectors = bad->len; + sector_t s = bad->start; + u64 *p = bb->page; + int merged = 0; + + WARN_ON(s > BB_END(p[prev])); + + if (s < BB_END(p[prev])) { + merged = min_t(sector_t, sectors, BB_END(p[prev]) - s); + } else { + merged = min_t(sector_t, sectors, BB_MAX_LEN - BB_LEN(p[prev])); + if ((prev + 1) < bb->count && + merged > (BB_OFFSET(p[prev + 1]) - BB_END(p[prev]))) { + merged = BB_OFFSET(p[prev + 1]) - BB_END(p[prev]); + } + + p[prev] = BB_MAKE(BB_OFFSET(p[prev]), + BB_LEN(p[prev]) + merged, bad->ack); + } + + return merged; +} + +/* + * 'Combine' is a special case which can_merge_front() is not able to + * handle: If a bad range (indexed by 'prev' from bad table) exactly + * starts as bad->start, and the bad range ahead of 'prev' (indexed by + * 'prev - 1' from bad table) exactly ends at where 'prev' starts, and + * the sum of their lengths does not exceed BB_MAX_LEN limitation, then + * these two bad range (from bad table) can be combined. + * + * Return 'true' if bad ranges indexed by 'prev' and 'prev - 1' from bad + * table can be combined. + */ +static bool can_combine_front(struct badblocks *bb, int prev, + struct badblocks_context *bad) +{ + u64 *p = bb->page; + + if ((prev > 0) && + (BB_OFFSET(p[prev]) == bad->start) && + (BB_END(p[prev - 1]) == BB_OFFSET(p[prev])) && + (BB_LEN(p[prev - 1]) + BB_LEN(p[prev]) <= BB_MAX_LEN) && + (BB_ACK(p[prev - 1]) == BB_ACK(p[prev]))) + return true; + return false; +} + +/* + * Combine the bad ranges indexed by 'prev' and 'prev - 1' (from bad + * table) into one larger bad range, and the new range is indexed by + * 'prev - 1'. + * The caller of front_combine() will decrease bb->count, therefore + * it is unnecessary to clear p[perv] after front merge. 
+ */ +static void front_combine(struct badblocks *bb, int prev) +{ + u64 *p = bb->page; + + p[prev - 1] = BB_MAKE(BB_OFFSET(p[prev - 1]), + BB_LEN(p[prev - 1]) + BB_LEN(p[prev]), + BB_ACK(p[prev])); + if ((prev + 1) < bb->count) + memmove(p + prev, p + prev + 1, (bb->count - prev - 1) * 8); +} + +/* + * Return 'true' if the range indicated by 'bad' is exactly forward + * overlapped with the bad range (from bad table) indexed by 'front'. + * Exactly forward overlap means the bad range (from bad table) indexed + * by 'prev' does not cover the whole range indicated by 'bad'. + */ +static bool overlap_front(struct badblocks *bb, int front, + struct badblocks_context *bad) +{ + u64 *p = bb->page; + + if (bad->start >= BB_OFFSET(p[front]) && + bad->start < BB_END(p[front])) + return true; + return false; +} + +/* + * Return 'true' if the range indicated by 'bad' is exactly backward + * overlapped with the bad range (from bad table) indexed by 'behind'. + */ +static bool overlap_behind(struct badblocks *bb, struct badblocks_context *bad, + int behind) +{ + u64 *p = bb->page; + + if (bad->start < BB_OFFSET(p[behind]) && + (bad->start + bad->len) > BB_OFFSET(p[behind])) + return true; + return false; +} + +/* + * Return 'true' if the range indicated by 'bad' can overwrite the bad + * range (from bad table) indexed by 'prev'. + * + * The range indicated by 'bad' can overwrite the bad range indexed by + * 'prev' when, + * 1) The whole range indicated by 'bad' can cover partial or whole bad + * range (from bad table) indexed by 'prev'. + * 2) The ack value of 'bad' is larger or equal to the ack value of bad + * range 'prev'. + * + * If the overwriting doesn't cover the whole bad range (from bad table) + * indexed by 'prev', new range might be split from existing bad range, + * 1) The overwrite covers head or tail part of existing bad range, 1 + * extra bad range will be split and added into the bad table. + * 2) The overwrite covers middle of existing bad range, 2 extra bad + * ranges will be split (ahead and after the overwritten range) and + * added into the bad table. + * The number of extra split ranges of the overwriting is stored in + * 'extra' and returned for the caller. + */ +static bool can_front_overwrite(struct badblocks *bb, int prev, + struct badblocks_context *bad, int *extra) +{ + u64 *p = bb->page; + int len; + + WARN_ON(!overlap_front(bb, prev, bad)); + + if (BB_ACK(p[prev]) >= bad->ack) + return false; + + if (BB_END(p[prev]) <= (bad->start + bad->len)) { + len = BB_END(p[prev]) - bad->start; + if (BB_OFFSET(p[prev]) == bad->start) + *extra = 0; + else + *extra = 1; + + bad->len = len; + } else { + if (BB_OFFSET(p[prev]) == bad->start) + *extra = 1; + else + /* + * prev range will be split into two, beside the overwritten + * one, an extra slot needed from bad table. */ - while (lo >= 0 && - BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { - if (BB_OFFSET(p[lo]) < target) { - /* starts before the end, and finishes after - * the start, so they must overlap - */ - if (rv != -1 && BB_ACK(p[lo])) - rv = 1; - else - rv = -1; - *first_bad = BB_OFFSET(p[lo]); - *bad_sectors = BB_LEN(p[lo]); - } - lo--; + *extra = 2; + } + + if ((bb->count + (*extra)) >= MAX_BADBLOCKS) + return false; + + return true; +} + +/* + * Do the overwrite from the range indicated by 'bad' to the bad range + * (from bad table) indexed by 'prev'. + * The previously called can_front_overwrite() will provide how many + * extra bad range(s) might be split and added into the bad table. 
All + * the splitting cases in the bad table will be handled here. + */ +static int front_overwrite(struct badblocks *bb, int prev, + struct badblocks_context *bad, int extra) +{ + u64 *p = bb->page; + sector_t orig_end = BB_END(p[prev]); + int orig_ack = BB_ACK(p[prev]); + + switch (extra) { + case 0: + p[prev] = BB_MAKE(BB_OFFSET(p[prev]), BB_LEN(p[prev]), + bad->ack); + break; + case 1: + if (BB_OFFSET(p[prev]) == bad->start) { + p[prev] = BB_MAKE(BB_OFFSET(p[prev]), + bad->len, bad->ack); + memmove(p + prev + 2, p + prev + 1, + (bb->count - prev - 1) * 8); + p[prev + 1] = BB_MAKE(bad->start + bad->len, + orig_end - BB_END(p[prev]), + orig_ack); + } else { + p[prev] = BB_MAKE(BB_OFFSET(p[prev]), + bad->start - BB_OFFSET(p[prev]), + orig_ack); + /* + * prev +2 -> prev + 1 + 1, which is for, + * 1) prev + 1: the slot index of the previous one + * 2) + 1: one more slot for extra being 1. + */ + memmove(p + prev + 2, p + prev + 1, + (bb->count - prev - 1) * 8); + p[prev + 1] = BB_MAKE(bad->start, bad->len, bad->ack); } + break; + case 2: + p[prev] = BB_MAKE(BB_OFFSET(p[prev]), + bad->start - BB_OFFSET(p[prev]), + orig_ack); + /* + * prev + 3 -> prev + 1 + 2, which is for, + * 1) prev + 1: the slot index of the previous one + * 2) + 2: two more slots for extra being 2. + */ + memmove(p + prev + 3, p + prev + 1, + (bb->count - prev - 1) * 8); + p[prev + 1] = BB_MAKE(bad->start, bad->len, bad->ack); + p[prev + 2] = BB_MAKE(BB_END(p[prev + 1]), + orig_end - BB_END(p[prev + 1]), + orig_ack); + break; + default: + break; } - if (read_seqretry(&bb->lock, seq)) - goto retry; + return bad->len; +} - return rv; +/* + * Explicitly insert a range indicated by 'bad' to the bad table, where + * the location is indexed by 'at'. + */ +static int insert_at(struct badblocks *bb, int at, struct badblocks_context *bad) +{ + u64 *p = bb->page; + int len; + + WARN_ON(badblocks_full(bb)); + + len = min_t(sector_t, bad->len, BB_MAX_LEN); + if (at < bb->count) + memmove(p + at + 1, p + at, (bb->count - at) * 8); + p[at] = BB_MAKE(bad->start, len, bad->ack); + + return len; } -EXPORT_SYMBOL_GPL(badblocks_check); static void badblocks_update_acked(struct badblocks *bb) { + bool unacked = false; u64 *p = bb->page; int i; - bool unacked = false; if (!bb->unacked_exist) return; @@ -144,281 +855,602 @@ static void badblocks_update_acked(struct badblocks *bb) bb->unacked_exist = 0; } -/** - * badblocks_set() - Add a range of bad blocks to the table. - * @bb: the badblocks structure that holds all badblock information - * @s: first sector to mark as bad - * @sectors: number of sectors to mark as bad - * @acknowledged: weather to mark the bad sectors as acknowledged - * - * This might extend the table, or might contract it if two adjacent ranges - * can be merged. We binary-search to find the 'insertion' point, then - * decide how best to handle it. 
- * - * Return: - * 0: success - * 1: failed to set badblocks (out of space) - */ -int badblocks_set(struct badblocks *bb, sector_t s, int sectors, - int acknowledged) +/* Do exact work to set bad block range into the bad block table */ +static int _badblocks_set(struct badblocks *bb, sector_t s, int sectors, + int acknowledged) { - u64 *p; - int lo, hi; - int rv = 0; + int retried = 0, space_desired = 0; + int orig_len, len = 0, added = 0; + struct badblocks_context bad; + int prev = -1, hint = -1; + sector_t orig_start; unsigned long flags; + int rv = 0; + u64 *p; if (bb->shift < 0) /* badblocks are disabled */ return 1; + if (sectors == 0) + /* Invalid sectors number */ + return 1; + if (bb->shift) { /* round the start down, and the end up */ sector_t next = s + sectors; - s >>= bb->shift; - next += (1<shift) - 1; - next >>= bb->shift; + rounddown(s, bb->shift); + roundup(next, bb->shift); sectors = next - s; } write_seqlock_irqsave(&bb->lock, flags); + orig_start = s; + orig_len = sectors; + bad.ack = acknowledged; p = bb->page; - lo = 0; - hi = bb->count; - /* Find the last range that starts at-or-before 's' */ - while (hi - lo > 1) { - int mid = (lo + hi) / 2; - sector_t a = BB_OFFSET(p[mid]); - if (a <= s) - lo = mid; - else - hi = mid; +re_insert: + bad.start = s; + bad.len = sectors; + len = 0; + + if (badblocks_empty(bb)) { + len = insert_at(bb, 0, &bad); + bb->count++; + added++; + goto update_sectors; } - if (hi > lo && BB_OFFSET(p[lo]) > s) - hi = lo; - if (hi > lo) { - /* we found a range that might merge with the start - * of our new range - */ - sector_t a = BB_OFFSET(p[lo]); - sector_t e = a + BB_LEN(p[lo]); - int ack = BB_ACK(p[lo]); - - if (e >= s) { - /* Yes, we can merge with a previous range */ - if (s == a && s + sectors >= e) - /* new range covers old */ - ack = acknowledged; - else - ack = ack && acknowledged; - - if (e < s + sectors) - e = s + sectors; - if (e - a <= BB_MAX_LEN) { - p[lo] = BB_MAKE(a, e-a, ack); - s = e; + prev = prev_badblocks(bb, &bad, hint); + + /* start before all badblocks */ + if (prev < 0) { + if (!badblocks_full(bb)) { + /* insert on the first */ + if (bad.len > (BB_OFFSET(p[0]) - bad.start)) + bad.len = BB_OFFSET(p[0]) - bad.start; + len = insert_at(bb, 0, &bad); + bb->count++; + added++; + hint = 0; + goto update_sectors; + } + + /* No sapce, try to merge */ + if (overlap_behind(bb, &bad, 0)) { + if (can_merge_behind(bb, &bad, 0)) { + len = behind_merge(bb, &bad, 0); + added++; } else { - /* does not all fit in one range, - * make p[lo] maximal - */ - if (BB_LEN(p[lo]) != BB_MAX_LEN) - p[lo] = BB_MAKE(a, BB_MAX_LEN, ack); - s = a + BB_MAX_LEN; + len = BB_OFFSET(p[0]) - s; + space_desired = 1; } - sectors = e - s; + hint = 0; + goto update_sectors; } + + /* no table space and give up */ + goto out; } - if (sectors && hi < bb->count) { - /* 'hi' points to the first range that starts after 's'. 
- * Maybe we can merge with the start of that range - */ - sector_t a = BB_OFFSET(p[hi]); - sector_t e = a + BB_LEN(p[hi]); - int ack = BB_ACK(p[hi]); - - if (a <= s + sectors) { - /* merging is possible */ - if (e <= s + sectors) { - /* full overlap */ - e = s + sectors; - ack = acknowledged; - } else - ack = ack && acknowledged; - - a = s; - if (e - a <= BB_MAX_LEN) { - p[hi] = BB_MAKE(a, e-a, ack); - s = e; - } else { - p[hi] = BB_MAKE(a, BB_MAX_LEN, ack); - s = a + BB_MAX_LEN; + + /* in case p[prev-1] can be merged with p[prev] */ + if (can_combine_front(bb, prev, &bad)) { + front_combine(bb, prev); + bb->count--; + added++; + hint = prev; + goto update_sectors; + } + + if (overlap_front(bb, prev, &bad)) { + if (can_merge_front(bb, prev, &bad)) { + len = front_merge(bb, prev, &bad); + added++; + } else { + int extra = 0; + + if (!can_front_overwrite(bb, prev, &bad, &extra)) { + len = min_t(sector_t, + BB_END(p[prev]) - s, sectors); + hint = prev; + goto update_sectors; + } + + len = front_overwrite(bb, prev, &bad, extra); + added++; + bb->count += extra; + + if (can_combine_front(bb, prev, &bad)) { + front_combine(bb, prev); + bb->count--; } - sectors = e - s; - lo = hi; - hi++; } + hint = prev; + goto update_sectors; + } + + if (can_merge_front(bb, prev, &bad)) { + len = front_merge(bb, prev, &bad); + added++; + hint = prev; + goto update_sectors; } - if (sectors == 0 && hi < bb->count) { - /* we might be able to combine lo and hi */ - /* Note: 's' is at the end of 'lo' */ - sector_t a = BB_OFFSET(p[hi]); - int lolen = BB_LEN(p[lo]); - int hilen = BB_LEN(p[hi]); - int newlen = lolen + hilen - (s - a); - - if (s >= a && newlen < BB_MAX_LEN) { - /* yes, we can combine them */ - int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]); - - p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack); - memmove(p + hi, p + hi + 1, - (bb->count - hi - 1) * 8); - bb->count--; + + /* if no space in table, still try to merge in the covered range */ + if (badblocks_full(bb)) { + /* skip the cannot-merge range */ + if (((prev + 1) < bb->count) && + overlap_behind(bb, &bad, prev + 1) && + ((s + sectors) >= BB_END(p[prev + 1]))) { + len = BB_END(p[prev + 1]) - s; + hint = prev + 1; + goto update_sectors; } + + /* no retry any more */ + len = sectors; + space_desired = 1; + hint = -1; + goto update_sectors; } - while (sectors) { - /* didn't merge (it all). - * Need to add a range just before 'hi' - */ - if (bb->count >= MAX_BADBLOCKS) { - /* No room for more */ - rv = 1; - break; - } else { - int this_sectors = sectors; - memmove(p + hi + 1, p + hi, - (bb->count - hi) * 8); - bb->count++; + /* cannot merge and there is space in bad table */ + if ((prev + 1) < bb->count && + overlap_behind(bb, &bad, prev + 1)) + bad.len = min_t(sector_t, + bad.len, BB_OFFSET(p[prev + 1]) - bad.start); - if (this_sectors > BB_MAX_LEN) - this_sectors = BB_MAX_LEN; - p[hi] = BB_MAKE(s, this_sectors, acknowledged); - sectors -= this_sectors; - s += this_sectors; - } + len = insert_at(bb, prev + 1, &bad); + bb->count++; + added++; + hint = prev + 1; + +update_sectors: + s += len; + sectors -= len; + + if (sectors > 0) + goto re_insert; + + WARN_ON(sectors < 0); + + /* + * Check whether the following already set range can be + * merged. (prev < 0) condition is not handled here, + * because it's already complicated enough. 
+ */ + if (prev >= 0 && + (prev + 1) < bb->count && + BB_END(p[prev]) == BB_OFFSET(p[prev + 1]) && + (BB_LEN(p[prev]) + BB_LEN(p[prev + 1])) <= BB_MAX_LEN && + BB_ACK(p[prev]) == BB_ACK(p[prev + 1])) { + p[prev] = BB_MAKE(BB_OFFSET(p[prev]), + BB_LEN(p[prev]) + BB_LEN(p[prev + 1]), + BB_ACK(p[prev])); + + if ((prev + 2) < bb->count) + memmove(p + prev + 1, p + prev + 2, + (bb->count - (prev + 2)) * 8); + bb->count--; + } + + if (space_desired && !badblocks_full(bb)) { + s = orig_start; + sectors = orig_len; + space_desired = 0; + if (retried++ < 3) + goto re_insert; + } + +out: + if (added) { + set_changed(bb); + + if (!acknowledged) + bb->unacked_exist = 1; + else + badblocks_update_acked(bb); } - bb->changed = 1; - if (!acknowledged) - bb->unacked_exist = 1; - else - badblocks_update_acked(bb); write_sequnlock_irqrestore(&bb->lock, flags); + if (!added) + rv = 1; + return rv; } -EXPORT_SYMBOL_GPL(badblocks_set); -/** - * badblocks_clear() - Remove a range of bad blocks to the table. - * @bb: the badblocks structure that holds all badblock information - * @s: first sector to mark as bad - * @sectors: number of sectors to mark as bad - * - * This may involve extending the table if we spilt a region, - * but it must not fail. So if the table becomes full, we just - * drop the remove request. - * - * Return: - * 0: success - * 1: failed to clear badblocks +/* + * Clear the bad block range from bad block table which is front overlapped + * with the clearing range. The return value is how many sectors from an + * already set bad block range are cleared. If the whole bad block range is + * covered by the clearing range and fully cleared, 'delete' is set as 1 for + * the caller to reduce bb->count. */ -int badblocks_clear(struct badblocks *bb, sector_t s, int sectors) +static int front_clear(struct badblocks *bb, int prev, + struct badblocks_context *bad, int *deleted) { - u64 *p; - int lo, hi; - sector_t target = s + sectors; + sector_t sectors = bad->len; + sector_t s = bad->start; + u64 *p = bb->page; + int cleared = 0; + + *deleted = 0; + if (s == BB_OFFSET(p[prev])) { + if (BB_LEN(p[prev]) > sectors) { + p[prev] = BB_MAKE(BB_OFFSET(p[prev]) + sectors, + BB_LEN(p[prev]) - sectors, + BB_ACK(p[prev])); + cleared = sectors; + } else { + /* BB_LEN(p[prev]) <= sectors */ + cleared = BB_LEN(p[prev]); + if ((prev + 1) < bb->count) + memmove(p + prev, p + prev + 1, + (bb->count - prev - 1) * 8); + *deleted = 1; + } + } else if (s > BB_OFFSET(p[prev])) { + if (BB_END(p[prev]) <= (s + sectors)) { + cleared = BB_END(p[prev]) - s; + p[prev] = BB_MAKE(BB_OFFSET(p[prev]), + s - BB_OFFSET(p[prev]), + BB_ACK(p[prev])); + } else { + /* Splitting is handled in front_splitting_clear() */ + BUG(); + } + } + + return cleared; +} + +/* + * Handle the condition that the clearing range hits middle of an already set + * bad block range from bad block table. In this condition the existing bad + * block range is split into two after the middle part is cleared. 
+ */ +static int front_splitting_clear(struct badblocks *bb, int prev, + struct badblocks_context *bad) +{ + u64 *p = bb->page; + u64 end = BB_END(p[prev]); + int ack = BB_ACK(p[prev]); + sector_t sectors = bad->len; + sector_t s = bad->start; + + p[prev] = BB_MAKE(BB_OFFSET(p[prev]), + s - BB_OFFSET(p[prev]), + ack); + memmove(p + prev + 2, p + prev + 1, (bb->count - prev - 1) * 8); + p[prev + 1] = BB_MAKE(s + sectors, end - s - sectors, ack); + return sectors; +} + +/* Do the exact work to clear bad block range from the bad block table */ +static int _badblocks_clear(struct badblocks *bb, sector_t s, int sectors) +{ + struct badblocks_context bad; + int prev = -1, hint = -1; + int len = 0, cleared = 0; int rv = 0; + u64 *p; + + if (bb->shift < 0) + /* badblocks are disabled */ + return 1; + + if (sectors == 0) + /* Invalid sectors number */ + return 1; + + if (bb->shift) { + sector_t target; - if (bb->shift > 0) { /* When clearing we round the start up and the end down. * This should not matter as the shift should align with * the block size and no rounding should ever be needed. * However it is better the think a block is bad when it * isn't than to think a block is not bad when it is. */ - s += (1<shift) - 1; - s >>= bb->shift; - target >>= bb->shift; + target = s + sectors; + roundup(s, bb->shift); + rounddown(target, bb->shift); + sectors = target - s; } write_seqlock_irq(&bb->lock); + bad.ack = true; p = bb->page; - lo = 0; - hi = bb->count; - /* Find the last range that starts before 'target' */ - while (hi - lo > 1) { - int mid = (lo + hi) / 2; - sector_t a = BB_OFFSET(p[mid]); - if (a < target) - lo = mid; - else - hi = mid; +re_clear: + bad.start = s; + bad.len = sectors; + + if (badblocks_empty(bb)) { + len = sectors; + cleared++; + goto update_sectors; } - if (hi > lo) { - /* p[lo] is the last range that could overlap the - * current range. Earlier ranges could also overlap, - * but only this one can overlap the end of the range. - */ - if ((BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) && - (BB_OFFSET(p[lo]) < target)) { - /* Partial overlap, leave the tail of this range */ - int ack = BB_ACK(p[lo]); - sector_t a = BB_OFFSET(p[lo]); - sector_t end = a + BB_LEN(p[lo]); - - if (a < s) { - /* we need to split this range */ - if (bb->count >= MAX_BADBLOCKS) { - rv = -ENOSPC; - goto out; - } - memmove(p+lo+1, p+lo, (bb->count - lo) * 8); - bb->count++; - p[lo] = BB_MAKE(a, s-a, ack); - lo++; - } - p[lo] = BB_MAKE(target, end - target, ack); - /* there is no longer an overlap */ - hi = lo; - lo--; - } - while (lo >= 0 && - (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) && - (BB_OFFSET(p[lo]) < target)) { - /* This range does overlap */ - if (BB_OFFSET(p[lo]) < s) { - /* Keep the early parts of this range. */ - int ack = BB_ACK(p[lo]); - sector_t start = BB_OFFSET(p[lo]); - - p[lo] = BB_MAKE(start, s - start, ack); - /* now low doesn't overlap, so.. 
*/ - break; - } - lo--; + + + prev = prev_badblocks(bb, &bad, hint); + + /* Start before all badblocks */ + if (prev < 0) { + if (overlap_behind(bb, &bad, 0)) { + len = BB_OFFSET(p[0]) - s; + hint = 0; + } else { + len = sectors; } - /* 'lo' is strictly before, 'hi' is strictly after, - * anything between needs to be discarded + /* + * Both situations are to clear non-bad range, + * should be treated as successful */ - if (hi - lo > 1) { - memmove(p+lo+1, p+hi, (bb->count - hi) * 8); - bb->count -= (hi - lo - 1); + cleared++; + goto update_sectors; + } + + /* Start after all badblocks */ + if ((prev + 1) >= bb->count && !overlap_front(bb, prev, &bad)) { + len = sectors; + cleared++; + goto update_sectors; + } + + /* Clear will split a bad record but the table is full */ + if (badblocks_full(bb) && (BB_OFFSET(p[prev]) < bad.start) && + (BB_END(p[prev]) > (bad.start + sectors))) { + len = sectors; + goto update_sectors; + } + + if (overlap_front(bb, prev, &bad)) { + if ((BB_OFFSET(p[prev]) < bad.start) && + (BB_END(p[prev]) > (bad.start + bad.len))) { + /* Splitting */ + if ((bb->count + 1) < MAX_BADBLOCKS) { + len = front_splitting_clear(bb, prev, &bad); + bb->count += 1; + cleared++; + } else { + /* No space to split, give up */ + len = sectors; + } + } else { + int deleted = 0; + + len = front_clear(bb, prev, &bad, &deleted); + bb->count -= deleted; + cleared++; + hint = prev; } + + goto update_sectors; + } + + /* Not front overlap, but behind overlap */ + if ((prev + 1) < bb->count && overlap_behind(bb, &bad, prev + 1)) { + len = BB_OFFSET(p[prev + 1]) - bad.start; + hint = prev + 1; + /* Clear non-bad range should be treated as successful */ + cleared++; + goto update_sectors; + } + + /* Not cover any badblocks range in the table */ + len = sectors; + /* Clear non-bad range should be treated as successful */ + cleared++; + +update_sectors: + s += len; + sectors -= len; + + if (sectors > 0) + goto re_clear; + + WARN_ON(sectors < 0); + + if (cleared) { + badblocks_update_acked(bb); + set_changed(bb); } - badblocks_update_acked(bb); - bb->changed = 1; -out: write_sequnlock_irq(&bb->lock); + + if (!cleared) + rv = 1; + return rv; } + +/* Do the exact work to check bad blocks range from the bad block table */ +static int _badblocks_check(struct badblocks *bb, sector_t s, int sectors, + sector_t *first_bad, int *bad_sectors) +{ + int unacked_badblocks, acked_badblocks; + int prev = -1, hint = -1, set = 0; + struct badblocks_context bad; + unsigned int seq; + int len, rv; + u64 *p; + + WARN_ON(bb->shift < 0 || sectors == 0); + + if (bb->shift > 0) { + sector_t target; + + /* round the start down, and the end up */ + target = s + sectors; + rounddown(s, bb->shift); + roundup(target, bb->shift); + sectors = target - s; + } + +retry: + seq = read_seqbegin(&bb->lock); + + p = bb->page; + unacked_badblocks = 0; + acked_badblocks = 0; + +re_check: + bad.start = s; + bad.len = sectors; + + if (badblocks_empty(bb)) { + len = sectors; + goto update_sectors; + } + + prev = prev_badblocks(bb, &bad, hint); + + /* start after all badblocks */ + if ((prev >= 0) && + ((prev + 1) >= bb->count) && !overlap_front(bb, prev, &bad)) { + len = sectors; + goto update_sectors; + } + + /* Overlapped with front badblocks record */ + if ((prev >= 0) && overlap_front(bb, prev, &bad)) { + if (BB_ACK(p[prev])) + acked_badblocks++; + else + unacked_badblocks++; + + if (BB_END(p[prev]) >= (s + sectors)) + len = sectors; + else + len = BB_END(p[prev]) - s; + + if (set == 0) { + *first_bad = BB_OFFSET(p[prev]); + *bad_sectors 
= BB_LEN(p[prev]); + set = 1; + } + goto update_sectors; + } + + /* Not front overlap, but behind overlap */ + if ((prev + 1) < bb->count && overlap_behind(bb, &bad, prev + 1)) { + len = BB_OFFSET(p[prev + 1]) - bad.start; + hint = prev + 1; + goto update_sectors; + } + + /* not cover any badblocks range in the table */ + len = sectors; + +update_sectors: + s += len; + sectors -= len; + + if (sectors > 0) + goto re_check; + + WARN_ON(sectors < 0); + + if (unacked_badblocks > 0) + rv = -1; + else if (acked_badblocks > 0) + rv = 1; + else + rv = 0; + + if (read_seqretry(&bb->lock, seq)) + goto retry; + + return rv; +} + +/** + * badblocks_check() - check a given range for bad sectors + * @bb: the badblocks structure that holds all badblock information + * @s: sector (start) at which to check for badblocks + * @sectors: number of sectors to check for badblocks + * @first_bad: pointer to store location of the first badblock + * @bad_sectors: pointer to store number of badblocks after @first_bad + * + * We can record which blocks on each device are 'bad' and so just + * fail those blocks, or that stripe, rather than the whole device. + * Entries in the bad-block table are 64bits wide. This comprises: + * Length of bad-range, in sectors: 0-511 for lengths 1-512 + * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes) + * A 'shift' can be set so that larger blocks are tracked and + * consequently larger devices can be covered. + * 'Acknowledged' flag - 1 bit. - the most significant bit. + * + * Locking of the bad-block table uses a seqlock so badblocks_check + * might need to retry if it is very unlucky. + * We will sometimes want to check for bad blocks in a bi_end_io function, + * so we use the write_seqlock_irq variant. + * + * When looking for a bad block we specify a range and want to + * know if any block in the range is bad. So we binary-search + * to the last range that starts at-or-before the given endpoint, + * (or "before the sector after the target range") + * then see if it ends after the given start. + * + * Return: + * 0: there are no known bad blocks in the range + * 1: there are known bad block which are all acknowledged + * -1: there are bad blocks which have not yet been acknowledged in metadata. + * plus the start/length of the first bad section we overlap. + */ +int badblocks_check(struct badblocks *bb, sector_t s, int sectors, + sector_t *first_bad, int *bad_sectors) +{ + return _badblocks_check(bb, s, sectors, first_bad, bad_sectors); +} +EXPORT_SYMBOL_GPL(badblocks_check); + +/** + * badblocks_set() - Add a range of bad blocks to the table. + * @bb: the badblocks structure that holds all badblock information + * @s: first sector to mark as bad + * @sectors: number of sectors to mark as bad + * @acknowledged: weather to mark the bad sectors as acknowledged + * + * This might extend the table, or might contract it if two adjacent ranges + * can be merged. We binary-search to find the 'insertion' point, then + * decide how best to handle it. + * + * Return: + * 0: success + * 1: failed to set badblocks (out of space) + */ +int badblocks_set(struct badblocks *bb, sector_t s, int sectors, + int acknowledged) +{ + return _badblocks_set(bb, s, sectors, acknowledged); +} +EXPORT_SYMBOL_GPL(badblocks_set); + +/** + * badblocks_clear() - Remove a range of bad blocks to the table. 
+ * @bb: the badblocks structure that holds all badblock information + * @s: first sector to mark as bad + * @sectors: number of sectors to mark as bad + * + * This may involve extending the table if we spilt a region, + * but it must not fail. So if the table becomes full, we just + * drop the remove request. + * + * Return: + * 0: success + * 1: failed to clear badblocks + */ +int badblocks_clear(struct badblocks *bb, sector_t s, int sectors) +{ + return _badblocks_clear(bb, s, sectors); +} EXPORT_SYMBOL_GPL(badblocks_clear); /** diff --git a/block/bdev.c b/block/bdev.c index 04dba25b00..750aec178b 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -292,7 +292,7 @@ EXPORT_SYMBOL(thaw_bdev); */ static __cacheline_aligned_in_smp DEFINE_MUTEX(bdev_lock); -static struct kmem_cache * bdev_cachep __read_mostly; +static struct kmem_cache *bdev_cachep __ro_after_init; static struct inode *bdev_alloc_inode(struct super_block *sb) { @@ -361,13 +361,13 @@ static struct file_system_type bd_type = { .kill_sb = kill_anon_super, }; -struct super_block *blockdev_superblock __read_mostly; +struct super_block *blockdev_superblock __ro_after_init; EXPORT_SYMBOL_GPL(blockdev_superblock); void __init bdev_cache_init(void) { int err; - static struct vfsmount *bd_mnt; + static struct vfsmount *bd_mnt __ro_after_init; bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| @@ -831,6 +831,28 @@ put_blkdev: } EXPORT_SYMBOL(blkdev_get_by_dev); +struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, + const struct blk_holder_ops *hops) +{ + struct bdev_handle *handle = kmalloc(sizeof(*handle), GFP_KERNEL); + struct block_device *bdev; + + if (!handle) + return ERR_PTR(-ENOMEM); + bdev = blkdev_get_by_dev(dev, mode, holder, hops); + if (IS_ERR(bdev)) { + kfree(handle); + return ERR_CAST(bdev); + } + handle->bdev = bdev; + handle->holder = holder; + if (holder) + mode |= BLK_OPEN_EXCL; + handle->mode = mode; + return handle; +} +EXPORT_SYMBOL(bdev_open_by_dev); + /** * blkdev_get_by_path - open a block device by name * @path: path to the block device to open @@ -869,6 +891,28 @@ struct block_device *blkdev_get_by_path(const char *path, blk_mode_t mode, } EXPORT_SYMBOL(blkdev_get_by_path); +struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode, + void *holder, const struct blk_holder_ops *hops) +{ + struct bdev_handle *handle; + dev_t dev; + int error; + + error = lookup_bdev(path, &dev); + if (error) + return ERR_PTR(error); + + handle = bdev_open_by_dev(dev, mode, holder, hops); + if (!IS_ERR(handle) && (mode & BLK_OPEN_WRITE) && + bdev_read_only(handle->bdev)) { + bdev_release(handle); + return ERR_PTR(-EACCES); + } + + return handle; +} +EXPORT_SYMBOL(bdev_open_by_path); + void blkdev_put(struct block_device *bdev, void *holder) { struct gendisk *disk = bdev->bd_disk; @@ -905,6 +949,13 @@ void blkdev_put(struct block_device *bdev, void *holder) } EXPORT_SYMBOL(blkdev_put); +void bdev_release(struct bdev_handle *handle) +{ + blkdev_put(handle->bdev, handle->holder); + kfree(handle); +} +EXPORT_SYMBOL(bdev_release); + /** * lookup_bdev() - Look up a struct block_device by name. * @pathname: Name of the block device in the filesystem. 
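A minimal usage sketch of the bdev_open_by_dev()/bdev_release() pair added
above (illustration only, not part of this patch; the dev_t, holder pointer
and holder ops are caller-supplied placeholders):

	/* Assumes <linux/blkdev.h>; error handling trimmed to the essentials. */
	static int example_open_bdev(dev_t dev, void *holder,
				     const struct blk_holder_ops *hops)
	{
		struct bdev_handle *handle;

		handle = bdev_open_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_WRITE,
					  holder, hops);
		if (IS_ERR(handle))
			return PTR_ERR(handle);

		/* ... submit I/O against handle->bdev ... */

		bdev_release(handle);	/* drops the reference and frees the handle */
		return 0;
	}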
@@ -963,20 +1014,20 @@ void bdev_mark_dead(struct block_device *bdev, bool surprise) mutex_lock(&bdev->bd_holder_lock); if (bdev->bd_holder_ops && bdev->bd_holder_ops->mark_dead) bdev->bd_holder_ops->mark_dead(bdev, surprise); - else + else { + mutex_unlock(&bdev->bd_holder_lock); sync_blockdev(bdev); - mutex_unlock(&bdev->bd_holder_lock); + } invalidate_bdev(bdev); } -#ifdef CONFIG_DASD_MODULE /* - * Drivers should not use this directly, but the DASD driver has historically - * had a shutdown to offline mode that doesn't actually remove the gendisk - * that otherwise looks a lot like a safe device removal. + * New drivers should not use this directly. There are some drivers however + * that needs this for historical reasons. For example, the DASD driver has + * historically had a shutdown to offline mode that doesn't actually remove the + * gendisk that otherwise looks a lot like a safe device removal. */ EXPORT_SYMBOL_GPL(bdev_mark_dead); -#endif void sync_bdevs(bool wait) { diff --git a/block/bio.c b/block/bio.c index 5eba53ca95..270f6b9992 100644 --- a/block/bio.c +++ b/block/bio.c @@ -944,7 +944,7 @@ bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv, if ((addr1 | mask) != (addr2 | mask)) return false; - if (bv->bv_len + len > queue_max_segment_size(q)) + if (len > queue_max_segment_size(q) - bv->bv_len) return false; return bvec_try_merge_page(bv, page, len, offset, same_page); } diff --git a/block/blk-flush.c b/block/blk-flush.c index e73dc22d05..3f4d41952e 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -323,16 +323,9 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, flush_rq->mq_ctx = first_rq->mq_ctx; flush_rq->mq_hctx = first_rq->mq_hctx; - if (!q->elevator) { + if (!q->elevator) flush_rq->tag = first_rq->tag; - - /* - * We borrow data request's driver tag, so have to mark - * this flush request as INFLIGHT for avoiding double - * account of this driver tag - */ - flush_rq->rq_flags |= RQF_MQ_INFLIGHT; - } else + else flush_rq->internal_tag = first_rq->internal_tag; flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH; diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 089fcb9cfc..7ee8d85c2c 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -1353,6 +1353,13 @@ static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now) lockdep_assert_held(&iocg->waitq.lock); + /* + * If the delay is set by another CPU, we may be in the past. No need to + * change anything if so. This avoids decay calculation underflow. + */ + if (time_before64(now->now, iocg->delay_at)) + return false; + /* calculate the current delay in effect - 1/2 every second */ tdelta = now->now - iocg->delay_at; if (iocg->delay) diff --git a/block/blk-map.c b/block/blk-map.c index 8584babf3e..71210cdb34 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -205,12 +205,19 @@ static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data, /* * success */ - if ((iov_iter_rw(iter) == WRITE && - (!map_data || !map_data->null_mapped)) || - (map_data && map_data->from_user)) { + if (iov_iter_rw(iter) == WRITE && + (!map_data || !map_data->null_mapped)) { ret = bio_copy_from_iter(bio, iter); if (ret) goto cleanup; + } else if (map_data && map_data->from_user) { + struct iov_iter iter2 = *iter; + + /* This is the copy-in part of SG_DXFER_TO_FROM_DEV. 
*/ + iter2.data_source = ITER_SOURCE; + ret = bio_copy_from_iter(bio, &iter2); + if (ret) + goto cleanup; } else { if (bmd->is_our_pages) zero_fill_bio(bio); diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index c3b5930106..5cbeb9344f 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -246,7 +246,6 @@ static const char *const rqf_name[] = { RQF_NAME(STARTED), RQF_NAME(FLUSH_SEQ), RQF_NAME(MIXED_MERGE), - RQF_NAME(MQ_INFLIGHT), RQF_NAME(DONTPREP), RQF_NAME(SCHED_TAGS), RQF_NAME(USE_SCHED), diff --git a/block/blk-mq.c b/block/blk-mq.c index 6041e17492..a02d3d922c 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -426,6 +426,8 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data) rq_list_add(data->cached_rq, rq); nr++; } + if (!(data->rq_flags & RQF_SCHED_TAGS)) + blk_mq_add_active_requests(data->hctx, nr); /* caller already holds a reference, add for remainder */ percpu_ref_get_many(&data->q->q_usage_counter, nr - 1); data->nr_tags -= nr; @@ -510,6 +512,8 @@ retry: goto retry; } + if (!(data->rq_flags & RQF_SCHED_TAGS)) + blk_mq_inc_active_requests(data->hctx); rq = blk_mq_rq_ctx_init(data, blk_mq_tags_from_data(data), tag); blk_mq_rq_time_init(rq, alloc_time_ns); return rq; @@ -669,6 +673,8 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, tag = blk_mq_get_tag(&data); if (tag == BLK_MQ_NO_TAG) goto out_queue_exit; + if (!(data.rq_flags & RQF_SCHED_TAGS)) + blk_mq_inc_active_requests(data.hctx); rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag); blk_mq_rq_time_init(rq, alloc_time_ns); rq->__data_len = 0; @@ -708,11 +714,10 @@ static void __blk_mq_free_request(struct request *rq) blk_pm_mark_last_busy(rq); rq->mq_hctx = NULL; - if (rq->rq_flags & RQF_MQ_INFLIGHT) - __blk_mq_dec_active_requests(hctx); - - if (rq->tag != BLK_MQ_NO_TAG) + if (rq->tag != BLK_MQ_NO_TAG) { + blk_mq_dec_active_requests(hctx); blk_mq_put_tag(hctx->tags, ctx, rq->tag); + } if (sched_tag != BLK_MQ_NO_TAG) blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag); blk_mq_sched_restart(hctx); @@ -767,11 +772,16 @@ static void req_bio_endio(struct request *rq, struct bio *bio, /* * Partial zone append completions cannot be supported as the * BIO fragments may end up not being written sequentially. + * For such case, force the completed nbytes to be equal to + * the BIO size so that bio_advance() sets the BIO remaining + * size to 0 and we end up calling bio_endio() before returning. 
*/ - if (bio->bi_iter.bi_size != nbytes) + if (bio->bi_iter.bi_size != nbytes) { bio->bi_status = BLK_STS_IOERR; - else + nbytes = bio->bi_iter.bi_size; + } else { bio->bi_iter.bi_sector = rq->__sector; + } } bio_advance(bio, nbytes); @@ -1061,12 +1071,7 @@ static inline void blk_mq_flush_tag_batch(struct blk_mq_hw_ctx *hctx, { struct request_queue *q = hctx->queue; - /* - * All requests should have been marked as RQF_MQ_INFLIGHT, so - * update hctx->nr_active in batch - */ - if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) - __blk_mq_sub_active_requests(hctx, nr_tags); + blk_mq_sub_active_requests(hctx, nr_tags); blk_mq_put_tags(hctx->tags, tag_array, nr_tags); percpu_ref_put_many(&q->q_usage_counter, nr_tags); @@ -1259,6 +1264,7 @@ void blk_mq_start_request(struct request *rq) blk_add_timer(rq); WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT); + rq->mq_hctx->tags->rqs[rq->tag] = rq; #ifdef CONFIG_BLK_DEV_INTEGRITY if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE) @@ -1760,7 +1766,7 @@ struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, return data.rq; } -static bool __blk_mq_alloc_driver_tag(struct request *rq) +bool __blk_mq_alloc_driver_tag(struct request *rq) { struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags; unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags; @@ -1781,20 +1787,7 @@ static bool __blk_mq_alloc_driver_tag(struct request *rq) return false; rq->tag = tag + tag_offset; - return true; -} - -bool __blk_mq_get_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq) -{ - if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_alloc_driver_tag(rq)) - return false; - - if ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) && - !(rq->rq_flags & RQF_MQ_INFLIGHT)) { - rq->rq_flags |= RQF_MQ_INFLIGHT; - __blk_mq_inc_active_requests(hctx); - } - hctx->tags->rqs[rq->tag] = rq; + blk_mq_inc_active_requests(rq->mq_hctx); return true; } @@ -1870,6 +1863,22 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, wait->flags &= ~WQ_FLAG_EXCLUSIVE; __add_wait_queue(wq, wait); + /* + * Add one explicit barrier since blk_mq_get_driver_tag() may + * not imply barrier in case of failure. + * + * Order adding us to wait queue and allocating driver tag. + * + * The pair is the one implied in sbitmap_queue_wake_up() which + * orders clearing sbitmap tag bits and waitqueue_active() in + * __sbitmap_queue_wake_up(), since waitqueue_active() is lockless + * + * Otherwise, re-order of adding wait queue and getting driver tag + * may cause __sbitmap_queue_wake_up() to wake up nothing because + * the waitqueue_active() may not observe us in wait queue. + */ + smp_mb(); + /* * It's possible that a tag was freed in the window between the * allocation failure and adding the hardware queue to the wait @@ -2806,13 +2815,8 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) * If we do, we can dispatch the whole plug list in one go. We * already know at this point that all requests belong to the * same queue, caller must ensure that's the case. - * - * Since we pass off the full list to the driver at this point, - * we do not increment the active request count for the queue. - * Bypass shared tags for now because of that. 
*/ - if (q->mq_ops->queue_rqs && - !(rq->mq_hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) { + if (q->mq_ops->queue_rqs) { blk_mq_run_dispatch_ops(q, __blk_mq_flush_plug_list(q, plug)); if (rq_list_empty(plug->mq_list)) diff --git a/block/blk-mq.h b/block/blk-mq.h index 1743857e0b..f75a9ecfeb 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -271,12 +271,18 @@ static inline int blk_mq_get_rq_budget_token(struct request *rq) return -1; } -static inline void __blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx) +static inline void __blk_mq_add_active_requests(struct blk_mq_hw_ctx *hctx, + int val) { if (blk_mq_is_shared_tags(hctx->flags)) - atomic_inc(&hctx->queue->nr_active_requests_shared_tags); + atomic_add(val, &hctx->queue->nr_active_requests_shared_tags); else - atomic_inc(&hctx->nr_active); + atomic_add(val, &hctx->nr_active); +} + +static inline void __blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx) +{ + __blk_mq_add_active_requests(hctx, 1); } static inline void __blk_mq_sub_active_requests(struct blk_mq_hw_ctx *hctx, @@ -293,6 +299,32 @@ static inline void __blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx) __blk_mq_sub_active_requests(hctx, 1); } +static inline void blk_mq_add_active_requests(struct blk_mq_hw_ctx *hctx, + int val) +{ + if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) + __blk_mq_add_active_requests(hctx, val); +} + +static inline void blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx) +{ + if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) + __blk_mq_inc_active_requests(hctx); +} + +static inline void blk_mq_sub_active_requests(struct blk_mq_hw_ctx *hctx, + int val) +{ + if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) + __blk_mq_sub_active_requests(hctx, val); +} + +static inline void blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx) +{ + if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) + __blk_mq_dec_active_requests(hctx); +} + static inline int __blk_mq_active_requests(struct blk_mq_hw_ctx *hctx) { if (blk_mq_is_shared_tags(hctx->flags)) @@ -302,13 +334,9 @@ static inline int __blk_mq_active_requests(struct blk_mq_hw_ctx *hctx) static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq) { + blk_mq_dec_active_requests(hctx); blk_mq_put_tag(hctx->tags, rq->mq_ctx, rq->tag); rq->tag = BLK_MQ_NO_TAG; - - if (rq->rq_flags & RQF_MQ_INFLIGHT) { - rq->rq_flags &= ~RQF_MQ_INFLIGHT; - __blk_mq_dec_active_requests(hctx); - } } static inline void blk_mq_put_driver_tag(struct request *rq) @@ -319,19 +347,14 @@ static inline void blk_mq_put_driver_tag(struct request *rq) __blk_mq_put_driver_tag(rq->mq_hctx, rq); } -bool __blk_mq_get_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq); +bool __blk_mq_alloc_driver_tag(struct request *rq); static inline bool blk_mq_get_driver_tag(struct request *rq) { - struct blk_mq_hw_ctx *hctx = rq->mq_hctx; - - if (rq->tag != BLK_MQ_NO_TAG && - !(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) { - hctx->tags->rqs[rq->tag] = rq; - return true; - } + if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_alloc_driver_tag(rq)) + return false; - return __blk_mq_get_driver_tag(hctx, rq); + return true; } static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap) diff --git a/block/blk-pm.c b/block/blk-pm.c index 6b72b2e03f..42e8420747 100644 --- a/block/blk-pm.c +++ b/block/blk-pm.c @@ -163,38 +163,15 @@ EXPORT_SYMBOL(blk_pre_runtime_resume); * @q: the queue of the device * * Description: - * For historical reasons, this routine merely calls blk_set_runtime_active() - * to do the real work of restarting the queue. 
It does this regardless of - * whether the device's runtime-resume succeeded; even if it failed the + * Restart the queue of a runtime suspended device. It does this regardless + * of whether the device's runtime-resume succeeded; even if it failed the * driver or error handler will need to communicate with the device. * * This function should be called near the end of the device's - * runtime_resume callback. + * runtime_resume callback to correct queue runtime PM status and re-enable + * peeking requests from the queue. */ void blk_post_runtime_resume(struct request_queue *q) -{ - blk_set_runtime_active(q); -} -EXPORT_SYMBOL(blk_post_runtime_resume); - -/** - * blk_set_runtime_active - Force runtime status of the queue to be active - * @q: the queue of the device - * - * If the device is left runtime suspended during system suspend the resume - * hook typically resumes the device and corrects runtime status - * accordingly. However, that does not affect the queue runtime PM status - * which is still "suspended". This prevents processing requests from the - * queue. - * - * This function can be used in driver's resume hook to correct queue - * runtime PM status and re-enable peeking requests from the queue. It - * should be called before first request is added to the queue. - * - * This function is also called by blk_post_runtime_resume() for - * runtime resumes. It does everything necessary to restart the queue. - */ -void blk_set_runtime_active(struct request_queue *q) { int old_status; @@ -211,4 +188,4 @@ void blk_set_runtime_active(struct request_queue *q) if (old_status != RPM_ACTIVE) blk_clear_pm_only(q); } -EXPORT_SYMBOL(blk_set_runtime_active); +EXPORT_SYMBOL(blk_post_runtime_resume); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 63e4812623..0b2d047663 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -615,6 +615,7 @@ static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page, QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec"); #endif +/* Common attributes for bio-based and request-based queues. */ static struct attribute *queue_attrs[] = { &queue_ra_entry.attr, &queue_max_hw_sectors_entry.attr, @@ -659,6 +660,7 @@ static struct attribute *queue_attrs[] = { NULL, }; +/* Request-based queue attributes that are not relevant for bio-based queues. */ static struct attribute *blk_mq_queue_attrs[] = { &queue_requests_entry.attr, &elv_iosched_entry.attr, diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 0bb613139b..f8fda9cf58 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -165,9 +165,9 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var) */ static bool wb_recent_wait(struct rq_wb *rwb) { - struct bdi_writeback *wb = &rwb->rqos.disk->bdi->wb; + struct backing_dev_info *bdi = rwb->rqos.disk->bdi; - return time_before(jiffies, wb->dirty_sleep + HZ); + return time_before(jiffies, bdi->last_bdp_sleep + HZ); } static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb, diff --git a/block/disk-events.c b/block/disk-events.c index 13c3372c46..2f69722438 100644 --- a/block/disk-events.c +++ b/block/disk-events.c @@ -266,11 +266,8 @@ static unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask) * disk_check_media_change - check if a removable media has been changed * @disk: gendisk to check * - * Check whether a removable media has been changed, and attempt to free all - * dentries and inodes and invalidates all block device page cache entries in - * that case. - * - * Returns %true if the media has changed, or %false if not. 
+ * Returns %true and marks the disk for a partition rescan whether a removable + * media has been changed, and %false if the media did not change. */ bool disk_check_media_change(struct gendisk *disk) { @@ -278,12 +275,11 @@ bool disk_check_media_change(struct gendisk *disk) events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE | DISK_EVENT_EJECT_REQUEST); - if (!(events & DISK_EVENT_MEDIA_CHANGE)) - return false; - - bdev_mark_dead(disk->part0, true); - set_bit(GD_NEED_PART_SCAN, &disk->state); - return true; + if (events & DISK_EVENT_MEDIA_CHANGE) { + set_bit(GD_NEED_PART_SCAN, &disk->state); + return true; + } + return false; } EXPORT_SYMBOL(disk_check_media_change); diff --git a/block/fops.c b/block/fops.c index 73e4274254..0abaac705d 100644 --- a/block/fops.c +++ b/block/fops.c @@ -542,15 +542,31 @@ static int blkdev_fsync(struct file *filp, loff_t start, loff_t end, return error; } +/** + * file_to_blk_mode - get block open flags from file flags + * @file: file whose open flags should be converted + * + * Look at file open flags and generate corresponding block open flags from + * them. The function works both for file just being open (e.g. during ->open + * callback) and for file that is already open. This is actually non-trivial + * (see comment in the function). + */ blk_mode_t file_to_blk_mode(struct file *file) { blk_mode_t mode = 0; + struct bdev_handle *handle = file->private_data; if (file->f_mode & FMODE_READ) mode |= BLK_OPEN_READ; if (file->f_mode & FMODE_WRITE) mode |= BLK_OPEN_WRITE; - if (file->private_data) + /* + * do_dentry_open() clears O_EXCL from f_flags, use handle->mode to + * determine whether the open was exclusive for already open files. + */ + if (handle) + mode |= handle->mode & BLK_OPEN_EXCL; + else if (file->f_flags & O_EXCL) mode |= BLK_OPEN_EXCL; if (file->f_flags & O_NDELAY) mode |= BLK_OPEN_NDELAY; @@ -568,7 +584,8 @@ blk_mode_t file_to_blk_mode(struct file *file) static int blkdev_open(struct inode *inode, struct file *filp) { - struct block_device *bdev; + struct bdev_handle *handle; + blk_mode_t mode; /* * Preserve backwards compatibility and allow large file access @@ -579,29 +596,24 @@ static int blkdev_open(struct inode *inode, struct file *filp) filp->f_flags |= O_LARGEFILE; filp->f_mode |= FMODE_BUF_RASYNC | FMODE_CAN_ODIRECT; - /* - * Use the file private data to store the holder for exclusive openes. - * file_to_blk_mode relies on it being present to set BLK_OPEN_EXCL. - */ - if (filp->f_flags & O_EXCL) - filp->private_data = filp; - - bdev = blkdev_get_by_dev(inode->i_rdev, file_to_blk_mode(filp), - filp->private_data, NULL); - if (IS_ERR(bdev)) - return PTR_ERR(bdev); + mode = file_to_blk_mode(filp); + handle = bdev_open_by_dev(inode->i_rdev, mode, + mode & BLK_OPEN_EXCL ? 
filp : NULL, NULL); + if (IS_ERR(handle)) + return PTR_ERR(handle); - if (bdev_nowait(bdev)) + if (bdev_nowait(handle->bdev)) filp->f_mode |= FMODE_NOWAIT; - filp->f_mapping = bdev->bd_inode->i_mapping; + filp->f_mapping = handle->bdev->bd_inode->i_mapping; filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping); + filp->private_data = handle; return 0; } static int blkdev_release(struct inode *inode, struct file *filp) { - blkdev_put(I_BDEV(filp->f_mapping->host), filp->private_data); + bdev_release(filp->private_data); return 0; } diff --git a/block/genhd.c b/block/genhd.c index f9b81be6c7..d74fb5b4ae 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -342,7 +342,7 @@ EXPORT_SYMBOL_GPL(disk_uevent); int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode) { - struct block_device *bdev; + struct bdev_handle *handle; int ret = 0; if (disk->flags & (GENHD_FL_NO_PART | GENHD_FL_HIDDEN)) @@ -366,12 +366,12 @@ int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode) } set_bit(GD_NEED_PART_SCAN, &disk->state); - bdev = blkdev_get_by_dev(disk_devt(disk), mode & ~BLK_OPEN_EXCL, NULL, - NULL); - if (IS_ERR(bdev)) - ret = PTR_ERR(bdev); + handle = bdev_open_by_dev(disk_devt(disk), mode & ~BLK_OPEN_EXCL, NULL, + NULL); + if (IS_ERR(handle)) + ret = PTR_ERR(handle); else - blkdev_put(bdev, NULL); + bdev_release(handle); /* * If blkdev_get_by_dev() failed early, GD_NEED_PART_SCAN is still set, @@ -562,6 +562,13 @@ static void blk_report_disk_dead(struct gendisk *disk, bool surprise) struct block_device *bdev; unsigned long idx; + /* + * On surprise disk removal, bdev_mark_dead() may call into file + * systems below. Make it clear that we're expecting to not hold + * disk->open_mutex. + */ + lockdep_assert_not_held(&disk->open_mutex); + rcu_read_lock(); xa_for_each(&disk->part_tbl, idx, bdev) { if (!kobject_get_unless_zero(&bdev->bd_device.kobj)) diff --git a/block/ioctl.c b/block/ioctl.c index d1d8e83912..438f79c564 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -371,9 +371,10 @@ static int blkdev_flushbuf(struct block_device *bdev, unsigned cmd, mutex_lock(&bdev->bd_holder_lock); if (bdev->bd_holder_ops && bdev->bd_holder_ops->sync) bdev->bd_holder_ops->sync(bdev); - else + else { + mutex_unlock(&bdev->bd_holder_lock); sync_blockdev(bdev); - mutex_unlock(&bdev->bd_holder_lock); + } invalidate_bdev(bdev); return 0; @@ -468,6 +469,7 @@ static int blkdev_bszset(struct block_device *bdev, blk_mode_t mode, int __user *argp) { int ret, n; + struct bdev_handle *handle; if (!capable(CAP_SYS_ADMIN)) return -EACCES; @@ -479,10 +481,11 @@ static int blkdev_bszset(struct block_device *bdev, blk_mode_t mode, if (mode & BLK_OPEN_EXCL) return set_blocksize(bdev, n); - if (IS_ERR(blkdev_get_by_dev(bdev->bd_dev, mode, &bdev, NULL))) + handle = bdev_open_by_dev(bdev->bd_dev, mode, &bdev, NULL); + if (IS_ERR(handle)) return -EBUSY; ret = set_blocksize(bdev, n); - blkdev_put(bdev, &bdev); + bdev_release(handle); return ret; } diff --git a/block/partitions/core.c b/block/partitions/core.c index e58c8b5035..f14602022c 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -274,17 +274,6 @@ void drop_partition(struct block_device *part) put_device(&part->bd_device); } -static void delete_partition(struct block_device *part) -{ - /* - * Remove the block device from the inode hash, so that it cannot be - * looked up any more even when openers still hold references. 
- */ - remove_inode_hash(part->bd_inode); - bdev_mark_dead(part, false); - drop_partition(part); -} - static ssize_t whole_disk_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -490,7 +479,18 @@ int bdev_del_partition(struct gendisk *disk, int partno) if (atomic_read(&part->bd_openers)) goto out_unlock; - delete_partition(part); + /* + * We verified that @part->bd_openers is zero above and so + * @part->bd_holder{_ops} can't be set. And since we hold + * @disk->open_mutex the device can't be claimed by anyone. + * + * So no need to call @part->bd_holder_ops->mark_dead() here. + * Just delete the partition and invalidate it. + */ + + remove_inode_hash(part->bd_inode); + invalidate_bdev(part); + drop_partition(part); ret = 0; out_unlock: mutex_unlock(&disk->open_mutex); @@ -668,8 +668,23 @@ rescan: sync_blockdev(disk->part0); invalidate_bdev(disk->part0); - xa_for_each_start(&disk->part_tbl, idx, part, 1) - delete_partition(part); + xa_for_each_start(&disk->part_tbl, idx, part, 1) { + /* + * Remove the block device from the inode hash, so that + * it cannot be looked up any more even when openers + * still hold references. + */ + remove_inode_hash(part->bd_inode); + + /* + * If @disk->open_partitions isn't elevated but there's + * still an active holder of that block device things + * are broken. + */ + WARN_ON_ONCE(atomic_read(&part->bd_openers)); + invalidate_bdev(part); + drop_partition(part); + } clear_bit(GD_NEED_PART_SCAN, &disk->state); /* diff --git a/block/partitions/ibm.c b/block/partitions/ibm.c index 403756dbd5..82d9c4c3fb 100644 --- a/block/partitions/ibm.c +++ b/block/partitions/ibm.c @@ -61,6 +61,47 @@ static sector_t cchhb2blk(struct vtoc_cchhb *ptr, struct hd_geometry *geo) ptr->b; } +/* Volume Label Type/ID Length */ +#define DASD_VOL_TYPE_LEN 4 +#define DASD_VOL_ID_LEN 6 + +/* Volume Label Types */ +#define DASD_VOLLBL_TYPE_VOL1 0 +#define DASD_VOLLBL_TYPE_LNX1 1 +#define DASD_VOLLBL_TYPE_CMS1 2 + +struct dasd_vollabel { + char *type; + int idx; +}; + +static struct dasd_vollabel dasd_vollabels[] = { + [DASD_VOLLBL_TYPE_VOL1] = { + .type = "VOL1", + .idx = DASD_VOLLBL_TYPE_VOL1, + }, + [DASD_VOLLBL_TYPE_LNX1] = { + .type = "LNX1", + .idx = DASD_VOLLBL_TYPE_LNX1, + }, + [DASD_VOLLBL_TYPE_CMS1] = { + .type = "CMS1", + .idx = DASD_VOLLBL_TYPE_CMS1, + }, +}; + +static int get_label_by_type(const char *type) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(dasd_vollabels); i++) { + if (!memcmp(type, dasd_vollabels[i].type, DASD_VOL_TYPE_LEN)) + return dasd_vollabels[i].idx; + } + + return -1; +} + static int find_label(struct parsed_partitions *state, dasd_information2_t *info, struct hd_geometry *geo, @@ -70,12 +111,10 @@ static int find_label(struct parsed_partitions *state, char type[], union label_t *label) { - Sector sect; - unsigned char *data; sector_t testsect[3]; - unsigned char temp[5]; - int found = 0; int i, testcount; + Sector sect; + void *data; /* There a three places where we may find a valid label: * - on an ECKD disk it's block 2 @@ -103,31 +142,27 @@ static int find_label(struct parsed_partitions *state, if (data == NULL) continue; memcpy(label, data, sizeof(*label)); - memcpy(temp, data, 4); - temp[4] = 0; - EBCASC(temp, 4); + memcpy(type, data, DASD_VOL_TYPE_LEN); + EBCASC(type, DASD_VOL_TYPE_LEN); put_dev_sector(sect); - if (!strcmp(temp, "VOL1") || - !strcmp(temp, "LNX1") || - !strcmp(temp, "CMS1")) { - if (!strcmp(temp, "VOL1")) { - strncpy(type, label->vol.vollbl, 4); - strncpy(name, label->vol.volid, 6); - } else { - strncpy(type, 
label->lnx.vollbl, 4); - strncpy(name, label->lnx.volid, 6); - } - EBCASC(type, 4); - EBCASC(name, 6); + switch (get_label_by_type(type)) { + case DASD_VOLLBL_TYPE_VOL1: + memcpy(name, label->vol.volid, DASD_VOL_ID_LEN); + EBCASC(name, DASD_VOL_ID_LEN); + *labelsect = testsect[i]; + return 1; + case DASD_VOLLBL_TYPE_LNX1: + case DASD_VOLLBL_TYPE_CMS1: + memcpy(name, label->lnx.volid, DASD_VOL_ID_LEN); + EBCASC(name, DASD_VOL_ID_LEN); *labelsect = testsect[i]; - found = 1; + return 1; + default: break; } } - if (!found) - memset(label, 0, sizeof(*label)); - return found; + return 0; } static int find_vol1_partitions(struct parsed_partitions *state, @@ -297,8 +332,8 @@ int ibm_partition(struct parsed_partitions *state) sector_t nr_sectors; dasd_information2_t *info; struct hd_geometry *geo; - char type[5] = {0,}; - char name[7] = {0,}; + char type[DASD_VOL_TYPE_LEN + 1] = ""; + char name[DASD_VOL_ID_LEN + 1] = ""; sector_t labelsect; union label_t *label; @@ -330,18 +365,21 @@ int ibm_partition(struct parsed_partitions *state) info = NULL; } - if (find_label(state, info, geo, blocksize, &labelsect, name, type, - label)) { - if (!strncmp(type, "VOL1", 4)) { + if (find_label(state, info, geo, blocksize, &labelsect, name, type, label)) { + switch (get_label_by_type(type)) { + case DASD_VOLLBL_TYPE_VOL1: res = find_vol1_partitions(state, geo, blocksize, name, label); - } else if (!strncmp(type, "LNX1", 4)) { + break; + case DASD_VOLLBL_TYPE_LNX1: res = find_lnx1_partitions(state, geo, blocksize, name, label, labelsect, nr_sectors, info); - } else if (!strncmp(type, "CMS1", 4)) { + break; + case DASD_VOLLBL_TYPE_CMS1: res = find_cms1_partitions(state, geo, blocksize, name, label, labelsect); + break; } } else if (info) { /* diff --git a/block/sed-opal.c b/block/sed-opal.c index 04f38a3f5d..3d9e9cd250 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -3018,7 +3019,13 @@ static int opal_set_new_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw) if (ret) return ret; - /* update keyring with new password */ + /* update keyring and key store with new password */ + ret = sed_write_key(OPAL_AUTH_KEY, + opal_pw->new_user_pw.opal_key.key, + opal_pw->new_user_pw.opal_key.key_len); + if (ret != -EOPNOTSUPP) + pr_warn("error updating SED key: %d\n", ret); + ret = update_sed_opal_key(OPAL_AUTH_KEY, opal_pw->new_user_pw.opal_key.key, opal_pw->new_user_pw.opal_key.key_len); @@ -3291,6 +3298,8 @@ EXPORT_SYMBOL_GPL(sed_ioctl); static int __init sed_opal_init(void) { struct key *kr; + char init_sed_key[OPAL_KEY_MAX]; + int keylen = OPAL_KEY_MAX - 1; kr = keyring_alloc(".sed_opal", GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(), @@ -3303,6 +3312,11 @@ static int __init sed_opal_init(void) sed_opal_keyring = kr; - return 0; + if (sed_read_key(OPAL_AUTH_KEY, init_sed_key, &keylen) < 0) { + memset(init_sed_key, '\0', sizeof(init_sed_key)); + keylen = OPAL_KEY_MAX - 1; + } + + return update_sed_opal_key(OPAL_AUTH_KEY, init_sed_key, keylen); } late_initcall(sed_opal_init); -- cgit v1.2.3
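
For reference, a minimal sketch of how a caller might consume the handle-based open API added to block/bdev.c above (bdev_open_by_path(), bdev_open_by_dev() and bdev_release()). The wrapper names my_open_backing()/my_close_backing() and the holder cookie are illustrative only; the bdev_open_by_path() signature, the struct bdev_handle fields and the ERR_PTR()-style error returns are taken from the hunks in this patch.

	#include <linux/blkdev.h>
	#include <linux/err.h>
	#include <linux/printk.h>

	/* Open a backing device; a non-NULL holder makes the open exclusive. */
	static struct bdev_handle *my_open_backing(const char *path, void *holder)
	{
		struct bdev_handle *handle;

		handle = bdev_open_by_path(path, BLK_OPEN_READ | BLK_OPEN_WRITE,
					   holder, NULL);
		if (IS_ERR(handle))
			/* e.g. ERR_PTR(-EACCES) when asking for write on a read-only bdev */
			return handle;

		pr_info("%pg: %llu sectors\n", handle->bdev,
			(unsigned long long)bdev_nr_sectors(handle->bdev));
		return handle;
	}

	/* Pairs with my_open_backing(); bdev_release() drops the device reference
	 * and frees the handle allocated by bdev_open_by_path(). */
	static void my_close_backing(struct bdev_handle *handle)
	{
		bdev_release(handle);
	}

Compared with the old blkdev_get_by_dev()/blkdev_put() pairing, the caller no longer has to remember the holder pointer at close time; it travels with the handle.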
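
The bvec_try_merge_hw_page() change in the block/bio.c hunk above replaces "bv->bv_len + len > queue_max_segment_size(q)" with "len > queue_max_segment_size(q) - bv->bv_len" so the addition can no longer wrap around an unsigned type. A small stand-alone illustration of that idiom (plain user-space C, not kernel code; the values are made up):

	#include <limits.h>
	#include <stdbool.h>
	#include <stdio.h>

	static bool would_exceed_unsafe(unsigned int cur, unsigned int len, unsigned int max)
	{
		return cur + len > max;		/* cur + len may wrap around */
	}

	static bool would_exceed_safe(unsigned int cur, unsigned int len, unsigned int max)
	{
		return len > max - cur;		/* assumes cur <= max, cannot wrap */
	}

	int main(void)
	{
		unsigned int cur = 4096, len = UINT_MAX - 100, max = 65536;

		/* unsafe form wraps and wrongly reports "fits" (0);
		 * safe form correctly reports "exceeds" (1). */
		printf("unsafe: %d  safe: %d\n",
		       would_exceed_unsafe(cur, len, max),
		       would_exceed_safe(cur, len, max));
		return 0;
	}

The precondition cur <= max holds in the kernel hunk because bv->bv_len was already accepted against the segment size limit when the bvec was built.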