From 5d1646d90e1f2cceb9f0828f4b28318cd0ec7744 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sat, 27 Apr 2024 12:05:51 +0200 Subject: Adding upstream version 5.10.209. Signed-off-by: Daniel Baumann --- block/Kconfig | 236 ++ block/Kconfig.iosched | 49 + block/Makefile | 40 + block/badblocks.c | 603 ++++ block/bfq-cgroup.c | 1478 +++++++++ block/bfq-iosched.c | 6900 ++++++++++++++++++++++++++++++++++++++++++ block/bfq-iosched.h | 1107 +++++++ block/bfq-wf2q.c | 1712 +++++++++++ block/bio-integrity.c | 476 +++ block/bio.c | 1684 +++++++++++ block/blk-cgroup-rwstat.c | 130 + block/blk-cgroup-rwstat.h | 149 + block/blk-cgroup.c | 1950 ++++++++++++ block/blk-core.c | 1816 +++++++++++ block/blk-crypto-fallback.c | 657 ++++ block/blk-crypto-internal.h | 229 ++ block/blk-crypto.c | 422 +++ block/blk-exec.c | 95 + block/blk-flush.c | 511 ++++ block/blk-integrity.c | 455 +++ block/blk-ioc.c | 422 +++ block/blk-iocost.c | 3456 +++++++++++++++++++++ block/blk-iolatency.c | 1063 +++++++ block/blk-lib.c | 441 +++ block/blk-map.c | 718 +++++ block/blk-merge.c | 1147 +++++++ block/blk-mq-cpumap.c | 96 + block/blk-mq-debugfs-zoned.c | 22 + block/blk-mq-debugfs.c | 994 ++++++ block/blk-mq-debugfs.h | 103 + block/blk-mq-pci.c | 48 + block/blk-mq-rdma.c | 44 + block/blk-mq-sched.c | 654 ++++ block/blk-mq-sched.h | 85 + block/blk-mq-sysfs.c | 389 +++ block/blk-mq-tag.c | 643 ++++ block/blk-mq-tag.h | 94 + block/blk-mq-virtio.c | 46 + block/blk-mq.c | 4000 ++++++++++++++++++++++++ block/blk-mq.h | 327 ++ block/blk-pm.c | 216 ++ block/blk-pm.h | 73 + block/blk-rq-qos.c | 304 ++ block/blk-rq-qos.h | 231 ++ block/blk-settings.c | 887 ++++++ block/blk-stat.c | 221 ++ block/blk-stat.h | 171 ++ block/blk-sysfs.c | 977 ++++++ block/blk-throttle.c | 2527 ++++++++++++++++ block/blk-timeout.c | 167 + block/blk-wbt.c | 854 ++++++ block/blk-wbt.h | 134 + block/blk-zoned.c | 565 ++++ block/blk.h | 451 +++ block/bounce.c | 389 +++ block/bsg-lib.c | 415 +++ block/bsg.c | 525 ++++ block/cmdline-parser.c | 255 ++ block/elevator.c | 837 +++++ block/genhd.c | 2367 +++++++++++++++ block/ioctl.c | 707 +++++ block/ioprio.c | 262 ++ block/keyslot-manager.c | 402 +++ block/kyber-iosched.c | 1051 +++++++ block/mq-deadline.c | 824 +++++ block/opal_proto.h | 465 +++ block/partitions/Kconfig | 270 ++ block/partitions/Makefile | 22 + block/partitions/acorn.c | 550 ++++ block/partitions/aix.c | 298 ++ block/partitions/amiga.c | 207 ++ block/partitions/atari.c | 157 + block/partitions/atari.h | 36 + block/partitions/check.h | 70 + block/partitions/cmdline.c | 156 + block/partitions/core.c | 802 +++++ block/partitions/efi.c | 747 +++++ block/partitions/efi.h | 116 + block/partitions/ibm.c | 375 +++ block/partitions/karma.c | 60 + block/partitions/ldm.c | 1498 +++++++++ block/partitions/ldm.h | 194 ++ block/partitions/mac.c | 143 + block/partitions/mac.h | 44 + block/partitions/msdos.c | 715 +++++ block/partitions/osf.c | 87 + block/partitions/sgi.c | 88 + block/partitions/sun.c | 130 + block/partitions/sysv68.c | 95 + block/partitions/ultrix.c | 48 + block/scsi_ioctl.c | 891 ++++++ block/sed-opal.c | 2719 +++++++++++++++++ block/t10-pi.c | 285 ++ 93 files changed, 62571 insertions(+) create mode 100644 block/Kconfig create mode 100644 block/Kconfig.iosched create mode 100644 block/Makefile create mode 100644 block/badblocks.c create mode 100644 block/bfq-cgroup.c create mode 100644 block/bfq-iosched.c create mode 100644 block/bfq-iosched.h create mode 100644 block/bfq-wf2q.c create mode 100644 block/bio-integrity.c create mode 100644 block/bio.c create mode 100644 block/blk-cgroup-rwstat.c create mode 100644 block/blk-cgroup-rwstat.h create mode 100644 block/blk-cgroup.c create mode 100644 block/blk-core.c create mode 100644 block/blk-crypto-fallback.c create mode 100644 block/blk-crypto-internal.h create mode 100644 block/blk-crypto.c create mode 100644 block/blk-exec.c create mode 100644 block/blk-flush.c create mode 100644 block/blk-integrity.c create mode 100644 block/blk-ioc.c create mode 100644 block/blk-iocost.c create mode 100644 block/blk-iolatency.c create mode 100644 block/blk-lib.c create mode 100644 block/blk-map.c create mode 100644 block/blk-merge.c create mode 100644 block/blk-mq-cpumap.c create mode 100644 block/blk-mq-debugfs-zoned.c create mode 100644 block/blk-mq-debugfs.c create mode 100644 block/blk-mq-debugfs.h create mode 100644 block/blk-mq-pci.c create mode 100644 block/blk-mq-rdma.c create mode 100644 block/blk-mq-sched.c create mode 100644 block/blk-mq-sched.h create mode 100644 block/blk-mq-sysfs.c create mode 100644 block/blk-mq-tag.c create mode 100644 block/blk-mq-tag.h create mode 100644 block/blk-mq-virtio.c create mode 100644 block/blk-mq.c create mode 100644 block/blk-mq.h create mode 100644 block/blk-pm.c create mode 100644 block/blk-pm.h create mode 100644 block/blk-rq-qos.c create mode 100644 block/blk-rq-qos.h create mode 100644 block/blk-settings.c create mode 100644 block/blk-stat.c create mode 100644 block/blk-stat.h create mode 100644 block/blk-sysfs.c create mode 100644 block/blk-throttle.c create mode 100644 block/blk-timeout.c create mode 100644 block/blk-wbt.c create mode 100644 block/blk-wbt.h create mode 100644 block/blk-zoned.c create mode 100644 block/blk.h create mode 100644 block/bounce.c create mode 100644 block/bsg-lib.c create mode 100644 block/bsg.c create mode 100644 block/cmdline-parser.c create mode 100644 block/elevator.c create mode 100644 block/genhd.c create mode 100644 block/ioctl.c create mode 100644 block/ioprio.c create mode 100644 block/keyslot-manager.c create mode 100644 block/kyber-iosched.c create mode 100644 block/mq-deadline.c create mode 100644 block/opal_proto.h create mode 100644 block/partitions/Kconfig create mode 100644 block/partitions/Makefile create mode 100644 block/partitions/acorn.c create mode 100644 block/partitions/aix.c create mode 100644 block/partitions/amiga.c create mode 100644 block/partitions/atari.c create mode 100644 block/partitions/atari.h create mode 100644 block/partitions/check.h create mode 100644 block/partitions/cmdline.c create mode 100644 block/partitions/core.c create mode 100644 block/partitions/efi.c create mode 100644 block/partitions/efi.h create mode 100644 block/partitions/ibm.c create mode 100644 block/partitions/karma.c create mode 100644 block/partitions/ldm.c create mode 100644 block/partitions/ldm.h create mode 100644 block/partitions/mac.c create mode 100644 block/partitions/mac.h create mode 100644 block/partitions/msdos.c create mode 100644 block/partitions/osf.c create mode 100644 block/partitions/sgi.c create mode 100644 block/partitions/sun.c create mode 100644 block/partitions/sysv68.c create mode 100644 block/partitions/ultrix.c create mode 100644 block/scsi_ioctl.c create mode 100644 block/sed-opal.c create mode 100644 block/t10-pi.c (limited to 'block') diff --git a/block/Kconfig b/block/Kconfig new file mode 100644 index 000000000..a2297edfd --- /dev/null +++ b/block/Kconfig @@ -0,0 +1,236 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Block layer core configuration +# +menuconfig BLOCK + bool "Enable the block layer" if EXPERT + default y + select SBITMAP + select SRCU + help + Provide block layer support for the kernel. + + Disable this option to remove the block layer support from the + kernel. This may be useful for embedded devices. + + If this option is disabled: + + - block device files will become unusable + - some filesystems (such as ext3) will become unavailable. + + Also, SCSI character devices and USB storage will be disabled since + they make use of various block layer definitions and facilities. + + Say Y here unless you know you really don't want to mount disks and + suchlike. + +if BLOCK + +config BLK_RQ_ALLOC_TIME + bool + +config BLK_SCSI_REQUEST + bool + +config BLK_CGROUP_RWSTAT + bool + +config BLK_DEV_BSG + bool "Block layer SG support v4" + default y + select BLK_SCSI_REQUEST + help + Saying Y here will enable generic SG (SCSI generic) v4 support + for any block device. + + Unlike SG v3 (aka block/scsi_ioctl.c drivers/scsi/sg.c), SG v4 + can handle complicated SCSI commands: tagged variable length cdbs + with bidirectional data transfers and generic request/response + protocols (e.g. Task Management Functions and SMP in Serial + Attached SCSI). + + This option is required by recent UDEV versions to properly + access device serial numbers, etc. + + If unsure, say Y. + +config BLK_DEV_BSGLIB + bool "Block layer SG support v4 helper lib" + select BLK_DEV_BSG + select BLK_SCSI_REQUEST + help + Subsystems will normally enable this if needed. Users will not + normally need to manually enable this. + + If unsure, say N. + +config BLK_DEV_INTEGRITY + bool "Block layer data integrity support" + help + Some storage devices allow extra information to be + stored/retrieved to help protect the data. The block layer + data integrity option provides hooks which can be used by + filesystems to ensure better data integrity. + + Say yes here if you have a storage device that provides the + T10/SCSI Data Integrity Field or the T13/ATA External Path + Protection. If in doubt, say N. + +config BLK_DEV_INTEGRITY_T10 + tristate + depends on BLK_DEV_INTEGRITY + select CRC_T10DIF + +config BLK_DEV_ZONED + bool "Zoned block device support" + select MQ_IOSCHED_DEADLINE + help + Block layer zoned block device support. This option enables + support for ZAC/ZBC/ZNS host-managed and host-aware zoned block + devices. + + Say yes here if you have a ZAC, ZBC, or ZNS storage device. + +config BLK_DEV_THROTTLING + bool "Block layer bio throttling support" + depends on BLK_CGROUP=y + select BLK_CGROUP_RWSTAT + help + Block layer bio throttling support. It can be used to limit + the IO rate to a device. IO rate policies are per cgroup and + one needs to mount and use blkio cgroup controller for creating + cgroups and specifying per device IO rate policies. + + See Documentation/admin-guide/cgroup-v1/blkio-controller.rst for more information. + +config BLK_DEV_THROTTLING_LOW + bool "Block throttling .low limit interface support (EXPERIMENTAL)" + depends on BLK_DEV_THROTTLING + help + Add .low limit interface for block throttling. The low limit is a best + effort limit to prioritize cgroups. Depending on the setting, the limit + can be used to protect cgroups in terms of bandwidth/iops and better + utilize disk resource. + + Note, this is an experimental interface and could be changed someday. + +config BLK_CMDLINE_PARSER + bool "Block device command line partition parser" + help + Enabling this option allows you to specify the partition layout from + the kernel boot args. This is typically of use for embedded devices + which don't otherwise have any standardized method for listing the + partitions on a block device. + + See Documentation/block/cmdline-partition.rst for more information. + +config BLK_WBT + bool "Enable support for block device writeback throttling" + help + Enabling this option enables the block layer to throttle buffered + background writeback from the VM, making it more smooth and having + less impact on foreground operations. The throttling is done + dynamically on an algorithm loosely based on CoDel, factoring in + the realtime performance of the disk. + +config BLK_CGROUP_IOLATENCY + bool "Enable support for latency based cgroup IO protection" + depends on BLK_CGROUP=y + help + Enabling this option enables the .latency interface for IO throttling. + The IO controller will attempt to maintain average IO latencies below + the configured latency target, throttling anybody with a higher latency + target than the victimized group. + + Note, this is an experimental interface and could be changed someday. + +config BLK_CGROUP_IOCOST + bool "Enable support for cost model based cgroup IO controller" + depends on BLK_CGROUP=y + select BLK_RQ_IO_DATA_LEN + select BLK_RQ_ALLOC_TIME + help + Enabling this option enables the .weight interface for cost + model based proportional IO control. The IO controller + distributes IO capacity between different groups based on + their share of the overall weight distribution. + +config BLK_WBT_MQ + bool "Multiqueue writeback throttling" + default y + depends on BLK_WBT + help + Enable writeback throttling by default on multiqueue devices. + +config BLK_DEBUG_FS + bool "Block layer debugging information in debugfs" + default y + depends on DEBUG_FS + help + Include block layer debugging information in debugfs. This information + is mostly useful for kernel developers, but it doesn't incur any cost + at runtime. + + Unless you are building a kernel for a tiny system, you should + say Y here. + +config BLK_DEBUG_FS_ZONED + bool + default BLK_DEBUG_FS && BLK_DEV_ZONED + +config BLK_SED_OPAL + bool "Logic for interfacing with Opal enabled SEDs" + help + Builds Logic for interfacing with Opal enabled controllers. + Enabling this option enables users to setup/unlock/lock + Locking ranges for SED devices using the Opal protocol. + +config BLK_INLINE_ENCRYPTION + bool "Enable inline encryption support in block layer" + help + Build the blk-crypto subsystem. Enabling this lets the + block layer handle encryption, so users can take + advantage of inline encryption hardware if present. + +config BLK_INLINE_ENCRYPTION_FALLBACK + bool "Enable crypto API fallback for blk-crypto" + depends on BLK_INLINE_ENCRYPTION + select CRYPTO + select CRYPTO_SKCIPHER + help + Enabling this lets the block layer handle inline encryption + by falling back to the kernel crypto API when inline + encryption hardware is not present. + +menu "Partition Types" + +source "block/partitions/Kconfig" + +endmenu + +endif # BLOCK + +config BLOCK_COMPAT + bool + depends on BLOCK && COMPAT + default y + +config BLK_MQ_PCI + bool + depends on BLOCK && PCI + default y + +config BLK_MQ_VIRTIO + bool + depends on BLOCK && VIRTIO + default y + +config BLK_MQ_RDMA + bool + depends on BLOCK && INFINIBAND + default y + +config BLK_PM + def_bool BLOCK && PM + +source "block/Kconfig.iosched" diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched new file mode 100644 index 000000000..2f2158e05 --- /dev/null +++ b/block/Kconfig.iosched @@ -0,0 +1,49 @@ +# SPDX-License-Identifier: GPL-2.0 +if BLOCK + +menu "IO Schedulers" + +config MQ_IOSCHED_DEADLINE + tristate "MQ deadline I/O scheduler" + default y + help + MQ version of the deadline IO scheduler. + +config MQ_IOSCHED_KYBER + tristate "Kyber I/O scheduler" + default y + help + The Kyber I/O scheduler is a low-overhead scheduler suitable for + multiqueue and other fast devices. Given target latencies for reads and + synchronous writes, it will self-tune queue depths to achieve that + goal. + +config IOSCHED_BFQ + tristate "BFQ I/O scheduler" + help + BFQ I/O scheduler for BLK-MQ. BFQ distributes the bandwidth of + of the device among all processes according to their weights, + regardless of the device parameters and with any workload. It + also guarantees a low latency to interactive and soft + real-time applications. Details in + Documentation/block/bfq-iosched.rst + +config BFQ_GROUP_IOSCHED + bool "BFQ hierarchical scheduling support" + depends on IOSCHED_BFQ && BLK_CGROUP + select BLK_CGROUP_RWSTAT + help + + Enable hierarchical scheduling in BFQ, using the blkio + (cgroups-v1) or io (cgroups-v2) controller. + +config BFQ_CGROUP_DEBUG + bool "BFQ IO controller debugging" + depends on BFQ_GROUP_IOSCHED + help + Enable some debugging help. Currently it exports additional stat + files in a cgroup which can be useful for debugging. + +endmenu + +endif diff --git a/block/Makefile b/block/Makefile new file mode 100644 index 000000000..8d841f5f9 --- /dev/null +++ b/block/Makefile @@ -0,0 +1,40 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for the kernel block layer +# + +obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-sysfs.o \ + blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ + blk-exec.o blk-merge.o blk-timeout.o \ + blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \ + blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \ + genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o + +obj-$(CONFIG_BOUNCE) += bounce.o +obj-$(CONFIG_BLK_SCSI_REQUEST) += scsi_ioctl.o +obj-$(CONFIG_BLK_DEV_BSG) += bsg.o +obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o +obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o +obj-$(CONFIG_BLK_CGROUP_RWSTAT) += blk-cgroup-rwstat.o +obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o +obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o +obj-$(CONFIG_BLK_CGROUP_IOCOST) += blk-iocost.o +obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o +obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o +bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o +obj-$(CONFIG_IOSCHED_BFQ) += bfq.o + +obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o +obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o +obj-$(CONFIG_BLK_DEV_INTEGRITY_T10) += t10-pi.o +obj-$(CONFIG_BLK_MQ_PCI) += blk-mq-pci.o +obj-$(CONFIG_BLK_MQ_VIRTIO) += blk-mq-virtio.o +obj-$(CONFIG_BLK_MQ_RDMA) += blk-mq-rdma.o +obj-$(CONFIG_BLK_DEV_ZONED) += blk-zoned.o +obj-$(CONFIG_BLK_WBT) += blk-wbt.o +obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o +obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o +obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o +obj-$(CONFIG_BLK_PM) += blk-pm.o +obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += keyslot-manager.o blk-crypto.o +obj-$(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) += blk-crypto-fallback.o diff --git a/block/badblocks.c b/block/badblocks.c new file mode 100644 index 000000000..d39056630 --- /dev/null +++ b/block/badblocks.c @@ -0,0 +1,603 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Bad block management + * + * - Heavily based on MD badblocks code from Neil Brown + * + * Copyright (c) 2015, Intel Corporation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * badblocks_check() - check a given range for bad sectors + * @bb: the badblocks structure that holds all badblock information + * @s: sector (start) at which to check for badblocks + * @sectors: number of sectors to check for badblocks + * @first_bad: pointer to store location of the first badblock + * @bad_sectors: pointer to store number of badblocks after @first_bad + * + * We can record which blocks on each device are 'bad' and so just + * fail those blocks, or that stripe, rather than the whole device. + * Entries in the bad-block table are 64bits wide. This comprises: + * Length of bad-range, in sectors: 0-511 for lengths 1-512 + * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes) + * A 'shift' can be set so that larger blocks are tracked and + * consequently larger devices can be covered. + * 'Acknowledged' flag - 1 bit. - the most significant bit. + * + * Locking of the bad-block table uses a seqlock so badblocks_check + * might need to retry if it is very unlucky. + * We will sometimes want to check for bad blocks in a bi_end_io function, + * so we use the write_seqlock_irq variant. + * + * When looking for a bad block we specify a range and want to + * know if any block in the range is bad. So we binary-search + * to the last range that starts at-or-before the given endpoint, + * (or "before the sector after the target range") + * then see if it ends after the given start. + * + * Return: + * 0: there are no known bad blocks in the range + * 1: there are known bad block which are all acknowledged + * -1: there are bad blocks which have not yet been acknowledged in metadata. + * plus the start/length of the first bad section we overlap. + */ +int badblocks_check(struct badblocks *bb, sector_t s, int sectors, + sector_t *first_bad, int *bad_sectors) +{ + int hi; + int lo; + u64 *p = bb->page; + int rv; + sector_t target = s + sectors; + unsigned seq; + + if (bb->shift > 0) { + /* round the start down, and the end up */ + s >>= bb->shift; + target += (1<shift) - 1; + target >>= bb->shift; + sectors = target - s; + } + /* 'target' is now the first block after the bad range */ + +retry: + seq = read_seqbegin(&bb->lock); + lo = 0; + rv = 0; + hi = bb->count; + + /* Binary search between lo and hi for 'target' + * i.e. for the last range that starts before 'target' + */ + /* INVARIANT: ranges before 'lo' and at-or-after 'hi' + * are known not to be the last range before target. + * VARIANT: hi-lo is the number of possible + * ranges, and decreases until it reaches 1 + */ + while (hi - lo > 1) { + int mid = (lo + hi) / 2; + sector_t a = BB_OFFSET(p[mid]); + + if (a < target) + /* This could still be the one, earlier ranges + * could not. + */ + lo = mid; + else + /* This and later ranges are definitely out. */ + hi = mid; + } + /* 'lo' might be the last that started before target, but 'hi' isn't */ + if (hi > lo) { + /* need to check all range that end after 's' to see if + * any are unacknowledged. + */ + while (lo >= 0 && + BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { + if (BB_OFFSET(p[lo]) < target) { + /* starts before the end, and finishes after + * the start, so they must overlap + */ + if (rv != -1 && BB_ACK(p[lo])) + rv = 1; + else + rv = -1; + *first_bad = BB_OFFSET(p[lo]); + *bad_sectors = BB_LEN(p[lo]); + } + lo--; + } + } + + if (read_seqretry(&bb->lock, seq)) + goto retry; + + return rv; +} +EXPORT_SYMBOL_GPL(badblocks_check); + +static void badblocks_update_acked(struct badblocks *bb) +{ + u64 *p = bb->page; + int i; + bool unacked = false; + + if (!bb->unacked_exist) + return; + + for (i = 0; i < bb->count ; i++) { + if (!BB_ACK(p[i])) { + unacked = true; + break; + } + } + + if (!unacked) + bb->unacked_exist = 0; +} + +/** + * badblocks_set() - Add a range of bad blocks to the table. + * @bb: the badblocks structure that holds all badblock information + * @s: first sector to mark as bad + * @sectors: number of sectors to mark as bad + * @acknowledged: weather to mark the bad sectors as acknowledged + * + * This might extend the table, or might contract it if two adjacent ranges + * can be merged. We binary-search to find the 'insertion' point, then + * decide how best to handle it. + * + * Return: + * 0: success + * 1: failed to set badblocks (out of space) + */ +int badblocks_set(struct badblocks *bb, sector_t s, int sectors, + int acknowledged) +{ + u64 *p; + int lo, hi; + int rv = 0; + unsigned long flags; + + if (bb->shift < 0) + /* badblocks are disabled */ + return 1; + + if (bb->shift) { + /* round the start down, and the end up */ + sector_t next = s + sectors; + + s >>= bb->shift; + next += (1<shift) - 1; + next >>= bb->shift; + sectors = next - s; + } + + write_seqlock_irqsave(&bb->lock, flags); + + p = bb->page; + lo = 0; + hi = bb->count; + /* Find the last range that starts at-or-before 's' */ + while (hi - lo > 1) { + int mid = (lo + hi) / 2; + sector_t a = BB_OFFSET(p[mid]); + + if (a <= s) + lo = mid; + else + hi = mid; + } + if (hi > lo && BB_OFFSET(p[lo]) > s) + hi = lo; + + if (hi > lo) { + /* we found a range that might merge with the start + * of our new range + */ + sector_t a = BB_OFFSET(p[lo]); + sector_t e = a + BB_LEN(p[lo]); + int ack = BB_ACK(p[lo]); + + if (e >= s) { + /* Yes, we can merge with a previous range */ + if (s == a && s + sectors >= e) + /* new range covers old */ + ack = acknowledged; + else + ack = ack && acknowledged; + + if (e < s + sectors) + e = s + sectors; + if (e - a <= BB_MAX_LEN) { + p[lo] = BB_MAKE(a, e-a, ack); + s = e; + } else { + /* does not all fit in one range, + * make p[lo] maximal + */ + if (BB_LEN(p[lo]) != BB_MAX_LEN) + p[lo] = BB_MAKE(a, BB_MAX_LEN, ack); + s = a + BB_MAX_LEN; + } + sectors = e - s; + } + } + if (sectors && hi < bb->count) { + /* 'hi' points to the first range that starts after 's'. + * Maybe we can merge with the start of that range + */ + sector_t a = BB_OFFSET(p[hi]); + sector_t e = a + BB_LEN(p[hi]); + int ack = BB_ACK(p[hi]); + + if (a <= s + sectors) { + /* merging is possible */ + if (e <= s + sectors) { + /* full overlap */ + e = s + sectors; + ack = acknowledged; + } else + ack = ack && acknowledged; + + a = s; + if (e - a <= BB_MAX_LEN) { + p[hi] = BB_MAKE(a, e-a, ack); + s = e; + } else { + p[hi] = BB_MAKE(a, BB_MAX_LEN, ack); + s = a + BB_MAX_LEN; + } + sectors = e - s; + lo = hi; + hi++; + } + } + if (sectors == 0 && hi < bb->count) { + /* we might be able to combine lo and hi */ + /* Note: 's' is at the end of 'lo' */ + sector_t a = BB_OFFSET(p[hi]); + int lolen = BB_LEN(p[lo]); + int hilen = BB_LEN(p[hi]); + int newlen = lolen + hilen - (s - a); + + if (s >= a && newlen < BB_MAX_LEN) { + /* yes, we can combine them */ + int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]); + + p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack); + memmove(p + hi, p + hi + 1, + (bb->count - hi - 1) * 8); + bb->count--; + } + } + while (sectors) { + /* didn't merge (it all). + * Need to add a range just before 'hi' + */ + if (bb->count >= MAX_BADBLOCKS) { + /* No room for more */ + rv = 1; + break; + } else { + int this_sectors = sectors; + + memmove(p + hi + 1, p + hi, + (bb->count - hi) * 8); + bb->count++; + + if (this_sectors > BB_MAX_LEN) + this_sectors = BB_MAX_LEN; + p[hi] = BB_MAKE(s, this_sectors, acknowledged); + sectors -= this_sectors; + s += this_sectors; + } + } + + bb->changed = 1; + if (!acknowledged) + bb->unacked_exist = 1; + else + badblocks_update_acked(bb); + write_sequnlock_irqrestore(&bb->lock, flags); + + return rv; +} +EXPORT_SYMBOL_GPL(badblocks_set); + +/** + * badblocks_clear() - Remove a range of bad blocks to the table. + * @bb: the badblocks structure that holds all badblock information + * @s: first sector to mark as bad + * @sectors: number of sectors to mark as bad + * + * This may involve extending the table if we spilt a region, + * but it must not fail. So if the table becomes full, we just + * drop the remove request. + * + * Return: + * 0: success + * 1: failed to clear badblocks + */ +int badblocks_clear(struct badblocks *bb, sector_t s, int sectors) +{ + u64 *p; + int lo, hi; + sector_t target = s + sectors; + int rv = 0; + + if (bb->shift > 0) { + /* When clearing we round the start up and the end down. + * This should not matter as the shift should align with + * the block size and no rounding should ever be needed. + * However it is better the think a block is bad when it + * isn't than to think a block is not bad when it is. + */ + s += (1<shift) - 1; + s >>= bb->shift; + target >>= bb->shift; + sectors = target - s; + } + + write_seqlock_irq(&bb->lock); + + p = bb->page; + lo = 0; + hi = bb->count; + /* Find the last range that starts before 'target' */ + while (hi - lo > 1) { + int mid = (lo + hi) / 2; + sector_t a = BB_OFFSET(p[mid]); + + if (a < target) + lo = mid; + else + hi = mid; + } + if (hi > lo) { + /* p[lo] is the last range that could overlap the + * current range. Earlier ranges could also overlap, + * but only this one can overlap the end of the range. + */ + if ((BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) && + (BB_OFFSET(p[lo]) < target)) { + /* Partial overlap, leave the tail of this range */ + int ack = BB_ACK(p[lo]); + sector_t a = BB_OFFSET(p[lo]); + sector_t end = a + BB_LEN(p[lo]); + + if (a < s) { + /* we need to split this range */ + if (bb->count >= MAX_BADBLOCKS) { + rv = -ENOSPC; + goto out; + } + memmove(p+lo+1, p+lo, (bb->count - lo) * 8); + bb->count++; + p[lo] = BB_MAKE(a, s-a, ack); + lo++; + } + p[lo] = BB_MAKE(target, end - target, ack); + /* there is no longer an overlap */ + hi = lo; + lo--; + } + while (lo >= 0 && + (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) && + (BB_OFFSET(p[lo]) < target)) { + /* This range does overlap */ + if (BB_OFFSET(p[lo]) < s) { + /* Keep the early parts of this range. */ + int ack = BB_ACK(p[lo]); + sector_t start = BB_OFFSET(p[lo]); + + p[lo] = BB_MAKE(start, s - start, ack); + /* now low doesn't overlap, so.. */ + break; + } + lo--; + } + /* 'lo' is strictly before, 'hi' is strictly after, + * anything between needs to be discarded + */ + if (hi - lo > 1) { + memmove(p+lo+1, p+hi, (bb->count - hi) * 8); + bb->count -= (hi - lo - 1); + } + } + + badblocks_update_acked(bb); + bb->changed = 1; +out: + write_sequnlock_irq(&bb->lock); + return rv; +} +EXPORT_SYMBOL_GPL(badblocks_clear); + +/** + * ack_all_badblocks() - Acknowledge all bad blocks in a list. + * @bb: the badblocks structure that holds all badblock information + * + * This only succeeds if ->changed is clear. It is used by + * in-kernel metadata updates + */ +void ack_all_badblocks(struct badblocks *bb) +{ + if (bb->page == NULL || bb->changed) + /* no point even trying */ + return; + write_seqlock_irq(&bb->lock); + + if (bb->changed == 0 && bb->unacked_exist) { + u64 *p = bb->page; + int i; + + for (i = 0; i < bb->count ; i++) { + if (!BB_ACK(p[i])) { + sector_t start = BB_OFFSET(p[i]); + int len = BB_LEN(p[i]); + + p[i] = BB_MAKE(start, len, 1); + } + } + bb->unacked_exist = 0; + } + write_sequnlock_irq(&bb->lock); +} +EXPORT_SYMBOL_GPL(ack_all_badblocks); + +/** + * badblocks_show() - sysfs access to bad-blocks list + * @bb: the badblocks structure that holds all badblock information + * @page: buffer received from sysfs + * @unack: weather to show unacknowledged badblocks + * + * Return: + * Length of returned data + */ +ssize_t badblocks_show(struct badblocks *bb, char *page, int unack) +{ + size_t len; + int i; + u64 *p = bb->page; + unsigned seq; + + if (bb->shift < 0) + return 0; + +retry: + seq = read_seqbegin(&bb->lock); + + len = 0; + i = 0; + + while (len < PAGE_SIZE && i < bb->count) { + sector_t s = BB_OFFSET(p[i]); + unsigned int length = BB_LEN(p[i]); + int ack = BB_ACK(p[i]); + + i++; + + if (unack && ack) + continue; + + len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n", + (unsigned long long)s << bb->shift, + length << bb->shift); + } + if (unack && len == 0) + bb->unacked_exist = 0; + + if (read_seqretry(&bb->lock, seq)) + goto retry; + + return len; +} +EXPORT_SYMBOL_GPL(badblocks_show); + +/** + * badblocks_store() - sysfs access to bad-blocks list + * @bb: the badblocks structure that holds all badblock information + * @page: buffer received from sysfs + * @len: length of data received from sysfs + * @unack: weather to show unacknowledged badblocks + * + * Return: + * Length of the buffer processed or -ve error. + */ +ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len, + int unack) +{ + unsigned long long sector; + int length; + char newline; + + switch (sscanf(page, "%llu %d%c", §or, &length, &newline)) { + case 3: + if (newline != '\n') + return -EINVAL; + fallthrough; + case 2: + if (length <= 0) + return -EINVAL; + break; + default: + return -EINVAL; + } + + if (badblocks_set(bb, sector, length, !unack)) + return -ENOSPC; + else + return len; +} +EXPORT_SYMBOL_GPL(badblocks_store); + +static int __badblocks_init(struct device *dev, struct badblocks *bb, + int enable) +{ + bb->dev = dev; + bb->count = 0; + if (enable) + bb->shift = 0; + else + bb->shift = -1; + if (dev) + bb->page = devm_kzalloc(dev, PAGE_SIZE, GFP_KERNEL); + else + bb->page = kzalloc(PAGE_SIZE, GFP_KERNEL); + if (!bb->page) { + bb->shift = -1; + return -ENOMEM; + } + seqlock_init(&bb->lock); + + return 0; +} + +/** + * badblocks_init() - initialize the badblocks structure + * @bb: the badblocks structure that holds all badblock information + * @enable: weather to enable badblocks accounting + * + * Return: + * 0: success + * -ve errno: on error + */ +int badblocks_init(struct badblocks *bb, int enable) +{ + return __badblocks_init(NULL, bb, enable); +} +EXPORT_SYMBOL_GPL(badblocks_init); + +int devm_init_badblocks(struct device *dev, struct badblocks *bb) +{ + if (!bb) + return -EINVAL; + return __badblocks_init(dev, bb, 1); +} +EXPORT_SYMBOL_GPL(devm_init_badblocks); + +/** + * badblocks_exit() - free the badblocks structure + * @bb: the badblocks structure that holds all badblock information + */ +void badblocks_exit(struct badblocks *bb) +{ + if (!bb) + return; + if (bb->dev) + devm_kfree(bb->dev, bb->page); + else + kfree(bb->page); + bb->page = NULL; +} +EXPORT_SYMBOL_GPL(badblocks_exit); diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c new file mode 100644 index 000000000..1f9ccc661 --- /dev/null +++ b/block/bfq-cgroup.c @@ -0,0 +1,1478 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * cgroups support for the BFQ I/O scheduler. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bfq-iosched.h" + +#ifdef CONFIG_BFQ_CGROUP_DEBUG +static int bfq_stat_init(struct bfq_stat *stat, gfp_t gfp) +{ + int ret; + + ret = percpu_counter_init(&stat->cpu_cnt, 0, gfp); + if (ret) + return ret; + + atomic64_set(&stat->aux_cnt, 0); + return 0; +} + +static void bfq_stat_exit(struct bfq_stat *stat) +{ + percpu_counter_destroy(&stat->cpu_cnt); +} + +/** + * bfq_stat_add - add a value to a bfq_stat + * @stat: target bfq_stat + * @val: value to add + * + * Add @val to @stat. The caller must ensure that IRQ on the same CPU + * don't re-enter this function for the same counter. + */ +static inline void bfq_stat_add(struct bfq_stat *stat, uint64_t val) +{ + percpu_counter_add_batch(&stat->cpu_cnt, val, BLKG_STAT_CPU_BATCH); +} + +/** + * bfq_stat_read - read the current value of a bfq_stat + * @stat: bfq_stat to read + */ +static inline uint64_t bfq_stat_read(struct bfq_stat *stat) +{ + return percpu_counter_sum_positive(&stat->cpu_cnt); +} + +/** + * bfq_stat_reset - reset a bfq_stat + * @stat: bfq_stat to reset + */ +static inline void bfq_stat_reset(struct bfq_stat *stat) +{ + percpu_counter_set(&stat->cpu_cnt, 0); + atomic64_set(&stat->aux_cnt, 0); +} + +/** + * bfq_stat_add_aux - add a bfq_stat into another's aux count + * @to: the destination bfq_stat + * @from: the source + * + * Add @from's count including the aux one to @to's aux count. + */ +static inline void bfq_stat_add_aux(struct bfq_stat *to, + struct bfq_stat *from) +{ + atomic64_add(bfq_stat_read(from) + atomic64_read(&from->aux_cnt), + &to->aux_cnt); +} + +/** + * blkg_prfill_stat - prfill callback for bfq_stat + * @sf: seq_file to print to + * @pd: policy private data of interest + * @off: offset to the bfq_stat in @pd + * + * prfill callback for printing a bfq_stat. + */ +static u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, + int off) +{ + return __blkg_prfill_u64(sf, pd, bfq_stat_read((void *)pd + off)); +} + +/* bfqg stats flags */ +enum bfqg_stats_flags { + BFQG_stats_waiting = 0, + BFQG_stats_idling, + BFQG_stats_empty, +}; + +#define BFQG_FLAG_FNS(name) \ +static void bfqg_stats_mark_##name(struct bfqg_stats *stats) \ +{ \ + stats->flags |= (1 << BFQG_stats_##name); \ +} \ +static void bfqg_stats_clear_##name(struct bfqg_stats *stats) \ +{ \ + stats->flags &= ~(1 << BFQG_stats_##name); \ +} \ +static int bfqg_stats_##name(struct bfqg_stats *stats) \ +{ \ + return (stats->flags & (1 << BFQG_stats_##name)) != 0; \ +} \ + +BFQG_FLAG_FNS(waiting) +BFQG_FLAG_FNS(idling) +BFQG_FLAG_FNS(empty) +#undef BFQG_FLAG_FNS + +/* This should be called with the scheduler lock held. */ +static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats) +{ + u64 now; + + if (!bfqg_stats_waiting(stats)) + return; + + now = ktime_get_ns(); + if (now > stats->start_group_wait_time) + bfq_stat_add(&stats->group_wait_time, + now - stats->start_group_wait_time); + bfqg_stats_clear_waiting(stats); +} + +/* This should be called with the scheduler lock held. */ +static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, + struct bfq_group *curr_bfqg) +{ + struct bfqg_stats *stats = &bfqg->stats; + + if (bfqg_stats_waiting(stats)) + return; + if (bfqg == curr_bfqg) + return; + stats->start_group_wait_time = ktime_get_ns(); + bfqg_stats_mark_waiting(stats); +} + +/* This should be called with the scheduler lock held. */ +static void bfqg_stats_end_empty_time(struct bfqg_stats *stats) +{ + u64 now; + + if (!bfqg_stats_empty(stats)) + return; + + now = ktime_get_ns(); + if (now > stats->start_empty_time) + bfq_stat_add(&stats->empty_time, + now - stats->start_empty_time); + bfqg_stats_clear_empty(stats); +} + +void bfqg_stats_update_dequeue(struct bfq_group *bfqg) +{ + bfq_stat_add(&bfqg->stats.dequeue, 1); +} + +void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) +{ + struct bfqg_stats *stats = &bfqg->stats; + + if (blkg_rwstat_total(&stats->queued)) + return; + + /* + * group is already marked empty. This can happen if bfqq got new + * request in parent group and moved to this group while being added + * to service tree. Just ignore the event and move on. + */ + if (bfqg_stats_empty(stats)) + return; + + stats->start_empty_time = ktime_get_ns(); + bfqg_stats_mark_empty(stats); +} + +void bfqg_stats_update_idle_time(struct bfq_group *bfqg) +{ + struct bfqg_stats *stats = &bfqg->stats; + + if (bfqg_stats_idling(stats)) { + u64 now = ktime_get_ns(); + + if (now > stats->start_idle_time) + bfq_stat_add(&stats->idle_time, + now - stats->start_idle_time); + bfqg_stats_clear_idling(stats); + } +} + +void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) +{ + struct bfqg_stats *stats = &bfqg->stats; + + stats->start_idle_time = ktime_get_ns(); + bfqg_stats_mark_idling(stats); +} + +void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) +{ + struct bfqg_stats *stats = &bfqg->stats; + + bfq_stat_add(&stats->avg_queue_size_sum, + blkg_rwstat_total(&stats->queued)); + bfq_stat_add(&stats->avg_queue_size_samples, 1); + bfqg_stats_update_group_wait_time(stats); +} + +void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, + unsigned int op) +{ + blkg_rwstat_add(&bfqg->stats.queued, op, 1); + bfqg_stats_end_empty_time(&bfqg->stats); + if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue)) + bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq)); +} + +void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) +{ + blkg_rwstat_add(&bfqg->stats.queued, op, -1); +} + +void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) +{ + blkg_rwstat_add(&bfqg->stats.merged, op, 1); +} + +void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns, + u64 io_start_time_ns, unsigned int op) +{ + struct bfqg_stats *stats = &bfqg->stats; + u64 now = ktime_get_ns(); + + if (now > io_start_time_ns) + blkg_rwstat_add(&stats->service_time, op, + now - io_start_time_ns); + if (io_start_time_ns > start_time_ns) + blkg_rwstat_add(&stats->wait_time, op, + io_start_time_ns - start_time_ns); +} + +#else /* CONFIG_BFQ_CGROUP_DEBUG */ + +void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, + unsigned int op) { } +void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { } +void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { } +void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns, + u64 io_start_time_ns, unsigned int op) { } +void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { } +void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { } +void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { } +void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { } +void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { } + +#endif /* CONFIG_BFQ_CGROUP_DEBUG */ + +#ifdef CONFIG_BFQ_GROUP_IOSCHED + +/* + * blk-cgroup policy-related handlers + * The following functions help in converting between blk-cgroup + * internal structures and BFQ-specific structures. + */ + +static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd) +{ + return pd ? container_of(pd, struct bfq_group, pd) : NULL; +} + +struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg) +{ + return pd_to_blkg(&bfqg->pd); +} + +static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg) +{ + return pd_to_bfqg(blkg_to_pd(blkg, &blkcg_policy_bfq)); +} + +/* + * bfq_group handlers + * The following functions help in navigating the bfq_group hierarchy + * by allowing to find the parent of a bfq_group or the bfq_group + * associated to a bfq_queue. + */ + +static struct bfq_group *bfqg_parent(struct bfq_group *bfqg) +{ + struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent; + + return pblkg ? blkg_to_bfqg(pblkg) : NULL; +} + +struct bfq_group *bfqq_group(struct bfq_queue *bfqq) +{ + struct bfq_entity *group_entity = bfqq->entity.parent; + + return group_entity ? container_of(group_entity, struct bfq_group, + entity) : + bfqq->bfqd->root_group; +} + +/* + * The following two functions handle get and put of a bfq_group by + * wrapping the related blk-cgroup hooks. + */ + +static void bfqg_get(struct bfq_group *bfqg) +{ + bfqg->ref++; +} + +static void bfqg_put(struct bfq_group *bfqg) +{ + bfqg->ref--; + + if (bfqg->ref == 0) + kfree(bfqg); +} + +static void bfqg_and_blkg_get(struct bfq_group *bfqg) +{ + /* see comments in bfq_bic_update_cgroup for why refcounting bfqg */ + bfqg_get(bfqg); + + blkg_get(bfqg_to_blkg(bfqg)); +} + +void bfqg_and_blkg_put(struct bfq_group *bfqg) +{ + blkg_put(bfqg_to_blkg(bfqg)); + + bfqg_put(bfqg); +} + +void bfqg_stats_update_legacy_io(struct request_queue *q, struct request *rq) +{ + struct bfq_group *bfqg = blkg_to_bfqg(rq->bio->bi_blkg); + + if (!bfqg) + return; + + blkg_rwstat_add(&bfqg->stats.bytes, rq->cmd_flags, blk_rq_bytes(rq)); + blkg_rwstat_add(&bfqg->stats.ios, rq->cmd_flags, 1); +} + +/* @stats = 0 */ +static void bfqg_stats_reset(struct bfqg_stats *stats) +{ +#ifdef CONFIG_BFQ_CGROUP_DEBUG + /* queued stats shouldn't be cleared */ + blkg_rwstat_reset(&stats->merged); + blkg_rwstat_reset(&stats->service_time); + blkg_rwstat_reset(&stats->wait_time); + bfq_stat_reset(&stats->time); + bfq_stat_reset(&stats->avg_queue_size_sum); + bfq_stat_reset(&stats->avg_queue_size_samples); + bfq_stat_reset(&stats->dequeue); + bfq_stat_reset(&stats->group_wait_time); + bfq_stat_reset(&stats->idle_time); + bfq_stat_reset(&stats->empty_time); +#endif +} + +/* @to += @from */ +static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from) +{ + if (!to || !from) + return; + +#ifdef CONFIG_BFQ_CGROUP_DEBUG + /* queued stats shouldn't be cleared */ + blkg_rwstat_add_aux(&to->merged, &from->merged); + blkg_rwstat_add_aux(&to->service_time, &from->service_time); + blkg_rwstat_add_aux(&to->wait_time, &from->wait_time); + bfq_stat_add_aux(&from->time, &from->time); + bfq_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum); + bfq_stat_add_aux(&to->avg_queue_size_samples, + &from->avg_queue_size_samples); + bfq_stat_add_aux(&to->dequeue, &from->dequeue); + bfq_stat_add_aux(&to->group_wait_time, &from->group_wait_time); + bfq_stat_add_aux(&to->idle_time, &from->idle_time); + bfq_stat_add_aux(&to->empty_time, &from->empty_time); +#endif +} + +/* + * Transfer @bfqg's stats to its parent's aux counts so that the ancestors' + * recursive stats can still account for the amount used by this bfqg after + * it's gone. + */ +static void bfqg_stats_xfer_dead(struct bfq_group *bfqg) +{ + struct bfq_group *parent; + + if (!bfqg) /* root_group */ + return; + + parent = bfqg_parent(bfqg); + + lockdep_assert_held(&bfqg_to_blkg(bfqg)->q->queue_lock); + + if (unlikely(!parent)) + return; + + bfqg_stats_add_aux(&parent->stats, &bfqg->stats); + bfqg_stats_reset(&bfqg->stats); +} + +void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + + entity->weight = entity->new_weight; + entity->orig_weight = entity->new_weight; + if (bfqq) { + bfqq->ioprio = bfqq->new_ioprio; + bfqq->ioprio_class = bfqq->new_ioprio_class; + /* + * Make sure that bfqg and its associated blkg do not + * disappear before entity. + */ + bfqg_and_blkg_get(bfqg); + } + entity->parent = bfqg->my_entity; /* NULL for root group */ + entity->sched_data = &bfqg->sched_data; +} + +static void bfqg_stats_exit(struct bfqg_stats *stats) +{ + blkg_rwstat_exit(&stats->bytes); + blkg_rwstat_exit(&stats->ios); +#ifdef CONFIG_BFQ_CGROUP_DEBUG + blkg_rwstat_exit(&stats->merged); + blkg_rwstat_exit(&stats->service_time); + blkg_rwstat_exit(&stats->wait_time); + blkg_rwstat_exit(&stats->queued); + bfq_stat_exit(&stats->time); + bfq_stat_exit(&stats->avg_queue_size_sum); + bfq_stat_exit(&stats->avg_queue_size_samples); + bfq_stat_exit(&stats->dequeue); + bfq_stat_exit(&stats->group_wait_time); + bfq_stat_exit(&stats->idle_time); + bfq_stat_exit(&stats->empty_time); +#endif +} + +static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) +{ + if (blkg_rwstat_init(&stats->bytes, gfp) || + blkg_rwstat_init(&stats->ios, gfp)) + return -ENOMEM; + +#ifdef CONFIG_BFQ_CGROUP_DEBUG + if (blkg_rwstat_init(&stats->merged, gfp) || + blkg_rwstat_init(&stats->service_time, gfp) || + blkg_rwstat_init(&stats->wait_time, gfp) || + blkg_rwstat_init(&stats->queued, gfp) || + bfq_stat_init(&stats->time, gfp) || + bfq_stat_init(&stats->avg_queue_size_sum, gfp) || + bfq_stat_init(&stats->avg_queue_size_samples, gfp) || + bfq_stat_init(&stats->dequeue, gfp) || + bfq_stat_init(&stats->group_wait_time, gfp) || + bfq_stat_init(&stats->idle_time, gfp) || + bfq_stat_init(&stats->empty_time, gfp)) { + bfqg_stats_exit(stats); + return -ENOMEM; + } +#endif + + return 0; +} + +static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd) +{ + return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL; +} + +static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg) +{ + return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq)); +} + +static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp) +{ + struct bfq_group_data *bgd; + + bgd = kzalloc(sizeof(*bgd), gfp); + if (!bgd) + return NULL; + return &bgd->pd; +} + +static void bfq_cpd_init(struct blkcg_policy_data *cpd) +{ + struct bfq_group_data *d = cpd_to_bfqgd(cpd); + + d->weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ? + CGROUP_WEIGHT_DFL : BFQ_WEIGHT_LEGACY_DFL; +} + +static void bfq_cpd_free(struct blkcg_policy_data *cpd) +{ + kfree(cpd_to_bfqgd(cpd)); +} + +static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, struct request_queue *q, + struct blkcg *blkcg) +{ + struct bfq_group *bfqg; + + bfqg = kzalloc_node(sizeof(*bfqg), gfp, q->node); + if (!bfqg) + return NULL; + + if (bfqg_stats_init(&bfqg->stats, gfp)) { + kfree(bfqg); + return NULL; + } + + /* see comments in bfq_bic_update_cgroup for why refcounting */ + bfqg_get(bfqg); + return &bfqg->pd; +} + +static void bfq_pd_init(struct blkg_policy_data *pd) +{ + struct blkcg_gq *blkg = pd_to_blkg(pd); + struct bfq_group *bfqg = blkg_to_bfqg(blkg); + struct bfq_data *bfqd = blkg->q->elevator->elevator_data; + struct bfq_entity *entity = &bfqg->entity; + struct bfq_group_data *d = blkcg_to_bfqgd(blkg->blkcg); + + entity->orig_weight = entity->weight = entity->new_weight = d->weight; + entity->my_sched_data = &bfqg->sched_data; + bfqg->my_entity = entity; /* + * the root_group's will be set to NULL + * in bfq_init_queue() + */ + bfqg->bfqd = bfqd; + bfqg->active_entities = 0; + bfqg->online = true; + bfqg->rq_pos_tree = RB_ROOT; +} + +static void bfq_pd_free(struct blkg_policy_data *pd) +{ + struct bfq_group *bfqg = pd_to_bfqg(pd); + + bfqg_stats_exit(&bfqg->stats); + bfqg_put(bfqg); +} + +static void bfq_pd_reset_stats(struct blkg_policy_data *pd) +{ + struct bfq_group *bfqg = pd_to_bfqg(pd); + + bfqg_stats_reset(&bfqg->stats); +} + +static void bfq_group_set_parent(struct bfq_group *bfqg, + struct bfq_group *parent) +{ + struct bfq_entity *entity; + + entity = &bfqg->entity; + entity->parent = parent->my_entity; + entity->sched_data = &parent->sched_data; +} + +static void bfq_link_bfqg(struct bfq_data *bfqd, struct bfq_group *bfqg) +{ + struct bfq_group *parent; + struct bfq_entity *entity; + + /* + * Update chain of bfq_groups as we might be handling a leaf group + * which, along with some of its relatives, has not been hooked yet + * to the private hierarchy of BFQ. + */ + entity = &bfqg->entity; + for_each_entity(entity) { + struct bfq_group *curr_bfqg = container_of(entity, + struct bfq_group, entity); + if (curr_bfqg != bfqd->root_group) { + parent = bfqg_parent(curr_bfqg); + if (!parent) + parent = bfqd->root_group; + bfq_group_set_parent(curr_bfqg, parent); + } + } +} + +struct bfq_group *bfq_bio_bfqg(struct bfq_data *bfqd, struct bio *bio) +{ + struct blkcg_gq *blkg = bio->bi_blkg; + struct bfq_group *bfqg; + + while (blkg) { + if (!blkg->online) { + blkg = blkg->parent; + continue; + } + bfqg = blkg_to_bfqg(blkg); + if (bfqg->online) { + bio_associate_blkg_from_css(bio, &blkg->blkcg->css); + return bfqg; + } + blkg = blkg->parent; + } + bio_associate_blkg_from_css(bio, + &bfqg_to_blkg(bfqd->root_group)->blkcg->css); + return bfqd->root_group; +} + +/** + * bfq_bfqq_move - migrate @bfqq to @bfqg. + * @bfqd: queue descriptor. + * @bfqq: the queue to move. + * @bfqg: the group to move to. + * + * Move @bfqq to @bfqg, deactivating it from its old group and reactivating + * it on the new one. Avoid putting the entity on the old group idle tree. + * + * Must be called under the scheduler lock, to make sure that the blkg + * owning @bfqg does not disappear (see comments in + * bfq_bic_update_cgroup on guaranteeing the consistency of blkg + * objects). + */ +void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct bfq_group *bfqg) +{ + struct bfq_entity *entity = &bfqq->entity; + + /* + * oom_bfqq is not allowed to move, oom_bfqq will hold ref to root_group + * until elevator exit. + */ + if (bfqq == &bfqd->oom_bfqq) + return; + /* + * Get extra reference to prevent bfqq from being freed in + * next possible expire or deactivate. + */ + bfqq->ref++; + + /* If bfqq is empty, then bfq_bfqq_expire also invokes + * bfq_del_bfqq_busy, thereby removing bfqq and its entity + * from data structures related to current group. Otherwise we + * need to remove bfqq explicitly with bfq_deactivate_bfqq, as + * we do below. + */ + if (bfqq == bfqd->in_service_queue) + bfq_bfqq_expire(bfqd, bfqd->in_service_queue, + false, BFQQE_PREEMPTED); + + if (bfq_bfqq_busy(bfqq)) + bfq_deactivate_bfqq(bfqd, bfqq, false, false); + else if (entity->on_st_or_in_serv) + bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); + bfqg_and_blkg_put(bfqq_group(bfqq)); + + entity->parent = bfqg->my_entity; + entity->sched_data = &bfqg->sched_data; + /* pin down bfqg and its associated blkg */ + bfqg_and_blkg_get(bfqg); + + if (bfq_bfqq_busy(bfqq)) { + if (unlikely(!bfqd->nonrot_with_queueing)) + bfq_pos_tree_add_move(bfqd, bfqq); + bfq_activate_bfqq(bfqd, bfqq); + } + + if (!bfqd->in_service_queue && !bfqd->rq_in_driver) + bfq_schedule_dispatch(bfqd); + /* release extra ref taken above, bfqq may happen to be freed now */ + bfq_put_queue(bfqq); +} + +/** + * __bfq_bic_change_cgroup - move @bic to @cgroup. + * @bfqd: the queue descriptor. + * @bic: the bic to move. + * @blkcg: the blk-cgroup to move to. + * + * Move bic to blkcg, assuming that bfqd->lock is held; which makes + * sure that the reference to cgroup is valid across the call (see + * comments in bfq_bic_update_cgroup on this issue) + */ +static void *__bfq_bic_change_cgroup(struct bfq_data *bfqd, + struct bfq_io_cq *bic, + struct bfq_group *bfqg) +{ + struct bfq_queue *async_bfqq = bic_to_bfqq(bic, false); + struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, true); + struct bfq_entity *entity; + + if (async_bfqq) { + entity = &async_bfqq->entity; + + if (entity->sched_data != &bfqg->sched_data) { + bic_set_bfqq(bic, NULL, false); + bfq_release_process_ref(bfqd, async_bfqq); + } + } + + if (sync_bfqq) { + if (!sync_bfqq->new_bfqq && !bfq_bfqq_coop(sync_bfqq)) { + /* We are the only user of this bfqq, just move it */ + if (sync_bfqq->entity.sched_data != &bfqg->sched_data) + bfq_bfqq_move(bfqd, sync_bfqq, bfqg); + } else { + struct bfq_queue *bfqq; + + /* + * The queue was merged to a different queue. Check + * that the merge chain still belongs to the same + * cgroup. + */ + for (bfqq = sync_bfqq; bfqq; bfqq = bfqq->new_bfqq) + if (bfqq->entity.sched_data != + &bfqg->sched_data) + break; + if (bfqq) { + /* + * Some queue changed cgroup so the merge is + * not valid anymore. We cannot easily just + * cancel the merge (by clearing new_bfqq) as + * there may be other processes using this + * queue and holding refs to all queues below + * sync_bfqq->new_bfqq. Similarly if the merge + * already happened, we need to detach from + * bfqq now so that we cannot merge bio to a + * request from the old cgroup. + */ + bfq_put_cooperator(sync_bfqq); + bic_set_bfqq(bic, NULL, true); + bfq_release_process_ref(bfqd, sync_bfqq); + } + } + } + + return bfqg; +} + +void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) +{ + struct bfq_data *bfqd = bic_to_bfqd(bic); + struct bfq_group *bfqg = bfq_bio_bfqg(bfqd, bio); + uint64_t serial_nr; + + serial_nr = bfqg_to_blkg(bfqg)->blkcg->css.serial_nr; + + /* + * Check whether blkcg has changed. The condition may trigger + * spuriously on a newly created cic but there's no harm. + */ + if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr)) + return; + + /* + * New cgroup for this process. Make sure it is linked to bfq internal + * cgroup hierarchy. + */ + bfq_link_bfqg(bfqd, bfqg); + __bfq_bic_change_cgroup(bfqd, bic, bfqg); + /* + * Update blkg_path for bfq_log_* functions. We cache this + * path, and update it here, for the following + * reasons. Operations on blkg objects in blk-cgroup are + * protected with the request_queue lock, and not with the + * lock that protects the instances of this scheduler + * (bfqd->lock). This exposes BFQ to the following sort of + * race. + * + * The blkg_lookup performed in bfq_get_queue, protected + * through rcu, may happen to return the address of a copy of + * the original blkg. If this is the case, then the + * bfqg_and_blkg_get performed in bfq_get_queue, to pin down + * the blkg, is useless: it does not prevent blk-cgroup code + * from destroying both the original blkg and all objects + * directly or indirectly referred by the copy of the + * blkg. + * + * On the bright side, destroy operations on a blkg invoke, as + * a first step, hooks of the scheduler associated with the + * blkg. And these hooks are executed with bfqd->lock held for + * BFQ. As a consequence, for any blkg associated with the + * request queue this instance of the scheduler is attached + * to, we are guaranteed that such a blkg is not destroyed, and + * that all the pointers it contains are consistent, while we + * are holding bfqd->lock. A blkg_lookup performed with + * bfqd->lock held then returns a fully consistent blkg, which + * remains consistent until this lock is held. + * + * Thanks to the last fact, and to the fact that: (1) bfqg has + * been obtained through a blkg_lookup in the above + * assignment, and (2) bfqd->lock is being held, here we can + * safely use the policy data for the involved blkg (i.e., the + * field bfqg->pd) to get to the blkg associated with bfqg, + * and then we can safely use any field of blkg. After we + * release bfqd->lock, even just getting blkg through this + * bfqg may cause dangling references to be traversed, as + * bfqg->pd may not exist any more. + * + * In view of the above facts, here we cache, in the bfqg, any + * blkg data we may need for this bic, and for its associated + * bfq_queue. As of now, we need to cache only the path of the + * blkg, which is used in the bfq_log_* functions. + * + * Finally, note that bfqg itself needs to be protected from + * destruction on the blkg_free of the original blkg (which + * invokes bfq_pd_free). We use an additional private + * refcounter for bfqg, to let it disappear only after no + * bfq_queue refers to it any longer. + */ + blkg_path(bfqg_to_blkg(bfqg), bfqg->blkg_path, sizeof(bfqg->blkg_path)); + bic->blkcg_serial_nr = serial_nr; +} + +/** + * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st. + * @st: the service tree being flushed. + */ +static void bfq_flush_idle_tree(struct bfq_service_tree *st) +{ + struct bfq_entity *entity = st->first_idle; + + for (; entity ; entity = st->first_idle) + __bfq_deactivate_entity(entity, false); +} + +/** + * bfq_reparent_leaf_entity - move leaf entity to the root_group. + * @bfqd: the device data structure with the root group. + * @entity: the entity to move, if entity is a leaf; or the parent entity + * of an active leaf entity to move, if entity is not a leaf. + */ +static void bfq_reparent_leaf_entity(struct bfq_data *bfqd, + struct bfq_entity *entity, + int ioprio_class) +{ + struct bfq_queue *bfqq; + struct bfq_entity *child_entity = entity; + + while (child_entity->my_sched_data) { /* leaf not reached yet */ + struct bfq_sched_data *child_sd = child_entity->my_sched_data; + struct bfq_service_tree *child_st = child_sd->service_tree + + ioprio_class; + struct rb_root *child_active = &child_st->active; + + child_entity = bfq_entity_of(rb_first(child_active)); + + if (!child_entity) + child_entity = child_sd->in_service_entity; + } + + bfqq = bfq_entity_to_bfqq(child_entity); + bfq_bfqq_move(bfqd, bfqq, bfqd->root_group); +} + +/** + * bfq_reparent_active_queues - move to the root group all active queues. + * @bfqd: the device data structure with the root group. + * @bfqg: the group to move from. + * @st: the service tree to start the search from. + */ +static void bfq_reparent_active_queues(struct bfq_data *bfqd, + struct bfq_group *bfqg, + struct bfq_service_tree *st, + int ioprio_class) +{ + struct rb_root *active = &st->active; + struct bfq_entity *entity; + + while ((entity = bfq_entity_of(rb_first(active)))) + bfq_reparent_leaf_entity(bfqd, entity, ioprio_class); + + if (bfqg->sched_data.in_service_entity) + bfq_reparent_leaf_entity(bfqd, + bfqg->sched_data.in_service_entity, + ioprio_class); +} + +/** + * bfq_pd_offline - deactivate the entity associated with @pd, + * and reparent its children entities. + * @pd: descriptor of the policy going offline. + * + * blkio already grabs the queue_lock for us, so no need to use + * RCU-based magic + */ +static void bfq_pd_offline(struct blkg_policy_data *pd) +{ + struct bfq_service_tree *st; + struct bfq_group *bfqg = pd_to_bfqg(pd); + struct bfq_data *bfqd = bfqg->bfqd; + struct bfq_entity *entity = bfqg->my_entity; + unsigned long flags; + int i; + + spin_lock_irqsave(&bfqd->lock, flags); + + if (!entity) /* root group */ + goto put_async_queues; + + /* + * Empty all service_trees belonging to this group before + * deactivating the group itself. + */ + for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) { + st = bfqg->sched_data.service_tree + i; + + /* + * It may happen that some queues are still active + * (busy) upon group destruction (if the corresponding + * processes have been forced to terminate). We move + * all the leaf entities corresponding to these queues + * to the root_group. + * Also, it may happen that the group has an entity + * in service, which is disconnected from the active + * tree: it must be moved, too. + * There is no need to put the sync queues, as the + * scheduler has taken no reference. + */ + bfq_reparent_active_queues(bfqd, bfqg, st, i); + + /* + * The idle tree may still contain bfq_queues + * belonging to exited task because they never + * migrated to a different cgroup from the one being + * destroyed now. In addition, even + * bfq_reparent_active_queues() may happen to add some + * entities to the idle tree. It happens if, in some + * of the calls to bfq_bfqq_move() performed by + * bfq_reparent_active_queues(), the queue to move is + * empty and gets expired. + */ + bfq_flush_idle_tree(st); + } + + __bfq_deactivate_entity(entity, false); + +put_async_queues: + bfq_put_async_queues(bfqd, bfqg); + bfqg->online = false; + + spin_unlock_irqrestore(&bfqd->lock, flags); + /* + * @blkg is going offline and will be ignored by + * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so + * that they don't get lost. If IOs complete after this point, the + * stats for them will be lost. Oh well... + */ + bfqg_stats_xfer_dead(bfqg); +} + +void bfq_end_wr_async(struct bfq_data *bfqd) +{ + struct blkcg_gq *blkg; + + list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) { + struct bfq_group *bfqg = blkg_to_bfqg(blkg); + + bfq_end_wr_async_queues(bfqd, bfqg); + } + bfq_end_wr_async_queues(bfqd, bfqd->root_group); +} + +static int bfq_io_show_weight_legacy(struct seq_file *sf, void *v) +{ + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); + unsigned int val = 0; + + if (bfqgd) + val = bfqgd->weight; + + seq_printf(sf, "%u\n", val); + + return 0; +} + +static u64 bfqg_prfill_weight_device(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct bfq_group *bfqg = pd_to_bfqg(pd); + + if (!bfqg->entity.dev_weight) + return 0; + return __blkg_prfill_u64(sf, pd, bfqg->entity.dev_weight); +} + +static int bfq_io_show_weight(struct seq_file *sf, void *v) +{ + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); + + seq_printf(sf, "default %u\n", bfqgd->weight); + blkcg_print_blkgs(sf, blkcg, bfqg_prfill_weight_device, + &blkcg_policy_bfq, 0, false); + return 0; +} + +static void bfq_group_set_weight(struct bfq_group *bfqg, u64 weight, u64 dev_weight) +{ + weight = dev_weight ?: weight; + + bfqg->entity.dev_weight = dev_weight; + /* + * Setting the prio_changed flag of the entity + * to 1 with new_weight == weight would re-set + * the value of the weight to its ioprio mapping. + * Set the flag only if necessary. + */ + if ((unsigned short)weight != bfqg->entity.new_weight) { + bfqg->entity.new_weight = (unsigned short)weight; + /* + * Make sure that the above new value has been + * stored in bfqg->entity.new_weight before + * setting the prio_changed flag. In fact, + * this flag may be read asynchronously (in + * critical sections protected by a different + * lock than that held here), and finding this + * flag set may cause the execution of the code + * for updating parameters whose value may + * depend also on bfqg->entity.new_weight (in + * __bfq_entity_update_weight_prio). + * This barrier makes sure that the new value + * of bfqg->entity.new_weight is correctly + * seen in that code. + */ + smp_wmb(); + bfqg->entity.prio_changed = 1; + } +} + +static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css, + struct cftype *cftype, + u64 val) +{ + struct blkcg *blkcg = css_to_blkcg(css); + struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); + struct blkcg_gq *blkg; + int ret = -ERANGE; + + if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT) + return ret; + + ret = 0; + spin_lock_irq(&blkcg->lock); + bfqgd->weight = (unsigned short)val; + hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { + struct bfq_group *bfqg = blkg_to_bfqg(blkg); + + if (bfqg) + bfq_group_set_weight(bfqg, val, 0); + } + spin_unlock_irq(&blkcg->lock); + + return ret; +} + +static ssize_t bfq_io_set_device_weight(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + int ret; + struct blkg_conf_ctx ctx; + struct blkcg *blkcg = css_to_blkcg(of_css(of)); + struct bfq_group *bfqg; + u64 v; + + ret = blkg_conf_prep(blkcg, &blkcg_policy_bfq, buf, &ctx); + if (ret) + return ret; + + if (sscanf(ctx.body, "%llu", &v) == 1) { + /* require "default" on dfl */ + ret = -ERANGE; + if (!v) + goto out; + } else if (!strcmp(strim(ctx.body), "default")) { + v = 0; + } else { + ret = -EINVAL; + goto out; + } + + bfqg = blkg_to_bfqg(ctx.blkg); + + ret = -ERANGE; + if (!v || (v >= BFQ_MIN_WEIGHT && v <= BFQ_MAX_WEIGHT)) { + bfq_group_set_weight(bfqg, bfqg->entity.weight, v); + ret = 0; + } +out: + blkg_conf_finish(&ctx); + return ret ?: nbytes; +} + +static ssize_t bfq_io_set_weight(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + char *endp; + int ret; + u64 v; + + buf = strim(buf); + + /* "WEIGHT" or "default WEIGHT" sets the default weight */ + v = simple_strtoull(buf, &endp, 0); + if (*endp == '\0' || sscanf(buf, "default %llu", &v) == 1) { + ret = bfq_io_set_weight_legacy(of_css(of), NULL, v); + return ret ?: nbytes; + } + + return bfq_io_set_device_weight(of, buf, nbytes, off); +} + +static int bfqg_print_rwstat(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, + &blkcg_policy_bfq, seq_cft(sf)->private, true); + return 0; +} + +static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct blkg_rwstat_sample sum; + + blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off, &sum); + return __blkg_prfill_rwstat(sf, pd, &sum); +} + +static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq, + seq_cft(sf)->private, true); + return 0; +} + +#ifdef CONFIG_BFQ_CGROUP_DEBUG +static int bfqg_print_stat(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, + &blkcg_policy_bfq, seq_cft(sf)->private, false); + return 0; +} + +static u64 bfqg_prfill_stat_recursive(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct blkcg_gq *blkg = pd_to_blkg(pd); + struct blkcg_gq *pos_blkg; + struct cgroup_subsys_state *pos_css; + u64 sum = 0; + + lockdep_assert_held(&blkg->q->queue_lock); + + rcu_read_lock(); + blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { + struct bfq_stat *stat; + + if (!pos_blkg->online) + continue; + + stat = (void *)blkg_to_pd(pos_blkg, &blkcg_policy_bfq) + off; + sum += bfq_stat_read(stat) + atomic64_read(&stat->aux_cnt); + } + rcu_read_unlock(); + + return __blkg_prfill_u64(sf, pd, sum); +} + +static int bfqg_print_stat_recursive(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + bfqg_prfill_stat_recursive, &blkcg_policy_bfq, + seq_cft(sf)->private, false); + return 0; +} + +static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd, + int off) +{ + struct bfq_group *bfqg = blkg_to_bfqg(pd->blkg); + u64 sum = blkg_rwstat_total(&bfqg->stats.bytes); + + return __blkg_prfill_u64(sf, pd, sum >> 9); +} + +static int bfqg_print_stat_sectors(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + bfqg_prfill_sectors, &blkcg_policy_bfq, 0, false); + return 0; +} + +static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct blkg_rwstat_sample tmp; + + blkg_rwstat_recursive_sum(pd->blkg, &blkcg_policy_bfq, + offsetof(struct bfq_group, stats.bytes), &tmp); + + return __blkg_prfill_u64(sf, pd, + (tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]) >> 9); +} + +static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + bfqg_prfill_sectors_recursive, &blkcg_policy_bfq, 0, + false); + return 0; +} + +static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct bfq_group *bfqg = pd_to_bfqg(pd); + u64 samples = bfq_stat_read(&bfqg->stats.avg_queue_size_samples); + u64 v = 0; + + if (samples) { + v = bfq_stat_read(&bfqg->stats.avg_queue_size_sum); + v = div64_u64(v, samples); + } + __blkg_prfill_u64(sf, pd, v); + return 0; +} + +/* print avg_queue_size */ +static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + bfqg_prfill_avg_queue_size, &blkcg_policy_bfq, + 0, false); + return 0; +} +#endif /* CONFIG_BFQ_CGROUP_DEBUG */ + +struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) +{ + int ret; + + ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq); + if (ret) + return NULL; + + return blkg_to_bfqg(bfqd->queue->root_blkg); +} + +struct blkcg_policy blkcg_policy_bfq = { + .dfl_cftypes = bfq_blkg_files, + .legacy_cftypes = bfq_blkcg_legacy_files, + + .cpd_alloc_fn = bfq_cpd_alloc, + .cpd_init_fn = bfq_cpd_init, + .cpd_bind_fn = bfq_cpd_init, + .cpd_free_fn = bfq_cpd_free, + + .pd_alloc_fn = bfq_pd_alloc, + .pd_init_fn = bfq_pd_init, + .pd_offline_fn = bfq_pd_offline, + .pd_free_fn = bfq_pd_free, + .pd_reset_stats_fn = bfq_pd_reset_stats, +}; + +struct cftype bfq_blkcg_legacy_files[] = { + { + .name = "bfq.weight", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = bfq_io_show_weight_legacy, + .write_u64 = bfq_io_set_weight_legacy, + }, + { + .name = "bfq.weight_device", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = bfq_io_show_weight, + .write = bfq_io_set_weight, + }, + + /* statistics, covers only the tasks in the bfqg */ + { + .name = "bfq.io_service_bytes", + .private = offsetof(struct bfq_group, stats.bytes), + .seq_show = bfqg_print_rwstat, + }, + { + .name = "bfq.io_serviced", + .private = offsetof(struct bfq_group, stats.ios), + .seq_show = bfqg_print_rwstat, + }, +#ifdef CONFIG_BFQ_CGROUP_DEBUG + { + .name = "bfq.time", + .private = offsetof(struct bfq_group, stats.time), + .seq_show = bfqg_print_stat, + }, + { + .name = "bfq.sectors", + .seq_show = bfqg_print_stat_sectors, + }, + { + .name = "bfq.io_service_time", + .private = offsetof(struct bfq_group, stats.service_time), + .seq_show = bfqg_print_rwstat, + }, + { + .name = "bfq.io_wait_time", + .private = offsetof(struct bfq_group, stats.wait_time), + .seq_show = bfqg_print_rwstat, + }, + { + .name = "bfq.io_merged", + .private = offsetof(struct bfq_group, stats.merged), + .seq_show = bfqg_print_rwstat, + }, + { + .name = "bfq.io_queued", + .private = offsetof(struct bfq_group, stats.queued), + .seq_show = bfqg_print_rwstat, + }, +#endif /* CONFIG_BFQ_CGROUP_DEBUG */ + + /* the same statistics which cover the bfqg and its descendants */ + { + .name = "bfq.io_service_bytes_recursive", + .private = offsetof(struct bfq_group, stats.bytes), + .seq_show = bfqg_print_rwstat_recursive, + }, + { + .name = "bfq.io_serviced_recursive", + .private = offsetof(struct bfq_group, stats.ios), + .seq_show = bfqg_print_rwstat_recursive, + }, +#ifdef CONFIG_BFQ_CGROUP_DEBUG + { + .name = "bfq.time_recursive", + .private = offsetof(struct bfq_group, stats.time), + .seq_show = bfqg_print_stat_recursive, + }, + { + .name = "bfq.sectors_recursive", + .seq_show = bfqg_print_stat_sectors_recursive, + }, + { + .name = "bfq.io_service_time_recursive", + .private = offsetof(struct bfq_group, stats.service_time), + .seq_show = bfqg_print_rwstat_recursive, + }, + { + .name = "bfq.io_wait_time_recursive", + .private = offsetof(struct bfq_group, stats.wait_time), + .seq_show = bfqg_print_rwstat_recursive, + }, + { + .name = "bfq.io_merged_recursive", + .private = offsetof(struct bfq_group, stats.merged), + .seq_show = bfqg_print_rwstat_recursive, + }, + { + .name = "bfq.io_queued_recursive", + .private = offsetof(struct bfq_group, stats.queued), + .seq_show = bfqg_print_rwstat_recursive, + }, + { + .name = "bfq.avg_queue_size", + .seq_show = bfqg_print_avg_queue_size, + }, + { + .name = "bfq.group_wait_time", + .private = offsetof(struct bfq_group, stats.group_wait_time), + .seq_show = bfqg_print_stat, + }, + { + .name = "bfq.idle_time", + .private = offsetof(struct bfq_group, stats.idle_time), + .seq_show = bfqg_print_stat, + }, + { + .name = "bfq.empty_time", + .private = offsetof(struct bfq_group, stats.empty_time), + .seq_show = bfqg_print_stat, + }, + { + .name = "bfq.dequeue", + .private = offsetof(struct bfq_group, stats.dequeue), + .seq_show = bfqg_print_stat, + }, +#endif /* CONFIG_BFQ_CGROUP_DEBUG */ + { } /* terminate */ +}; + +struct cftype bfq_blkg_files[] = { + { + .name = "bfq.weight", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = bfq_io_show_weight, + .write = bfq_io_set_weight, + }, + {} /* terminate */ +}; + +#else /* CONFIG_BFQ_GROUP_IOSCHED */ + +void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct bfq_group *bfqg) {} + +void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + + entity->weight = entity->new_weight; + entity->orig_weight = entity->new_weight; + if (bfqq) { + bfqq->ioprio = bfqq->new_ioprio; + bfqq->ioprio_class = bfqq->new_ioprio_class; + } + entity->sched_data = &bfqg->sched_data; +} + +void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) {} + +void bfq_end_wr_async(struct bfq_data *bfqd) +{ + bfq_end_wr_async_queues(bfqd, bfqd->root_group); +} + +struct bfq_group *bfq_bio_bfqg(struct bfq_data *bfqd, struct bio *bio) +{ + return bfqd->root_group; +} + +struct bfq_group *bfqq_group(struct bfq_queue *bfqq) +{ + return bfqq->bfqd->root_group; +} + +void bfqg_and_blkg_get(struct bfq_group *bfqg) {} + +void bfqg_and_blkg_put(struct bfq_group *bfqg) {} + +struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) +{ + struct bfq_group *bfqg; + int i; + + bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node); + if (!bfqg) + return NULL; + + for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) + bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; + + return bfqg; +} +#endif /* CONFIG_BFQ_GROUP_IOSCHED */ diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c new file mode 100644 index 000000000..6687b805b --- /dev/null +++ b/block/bfq-iosched.c @@ -0,0 +1,6900 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Budget Fair Queueing (BFQ) I/O scheduler. + * + * Based on ideas and code from CFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + * + * Copyright (C) 2010 Paolo Valente + * Arianna Avanzini + * + * Copyright (C) 2017 Paolo Valente + * + * BFQ is a proportional-share I/O scheduler, with some extra + * low-latency capabilities. BFQ also supports full hierarchical + * scheduling through cgroups. Next paragraphs provide an introduction + * on BFQ inner workings. Details on BFQ benefits, usage and + * limitations can be found in Documentation/block/bfq-iosched.rst. + * + * BFQ is a proportional-share storage-I/O scheduling algorithm based + * on the slice-by-slice service scheme of CFQ. But BFQ assigns + * budgets, measured in number of sectors, to processes instead of + * time slices. The device is not granted to the in-service process + * for a given time slice, but until it has exhausted its assigned + * budget. This change from the time to the service domain enables BFQ + * to distribute the device throughput among processes as desired, + * without any distortion due to throughput fluctuations, or to device + * internal queueing. BFQ uses an ad hoc internal scheduler, called + * B-WF2Q+, to schedule processes according to their budgets. More + * precisely, BFQ schedules queues associated with processes. Each + * process/queue is assigned a user-configurable weight, and B-WF2Q+ + * guarantees that each queue receives a fraction of the throughput + * proportional to its weight. Thanks to the accurate policy of + * B-WF2Q+, BFQ can afford to assign high budgets to I/O-bound + * processes issuing sequential requests (to boost the throughput), + * and yet guarantee a low latency to interactive and soft real-time + * applications. + * + * In particular, to provide these low-latency guarantees, BFQ + * explicitly privileges the I/O of two classes of time-sensitive + * applications: interactive and soft real-time. In more detail, BFQ + * behaves this way if the low_latency parameter is set (default + * configuration). This feature enables BFQ to provide applications in + * these classes with a very low latency. + * + * To implement this feature, BFQ constantly tries to detect whether + * the I/O requests in a bfq_queue come from an interactive or a soft + * real-time application. For brevity, in these cases, the queue is + * said to be interactive or soft real-time. In both cases, BFQ + * privileges the service of the queue, over that of non-interactive + * and non-soft-real-time queues. This privileging is performed, + * mainly, by raising the weight of the queue. So, for brevity, we + * call just weight-raising periods the time periods during which a + * queue is privileged, because deemed interactive or soft real-time. + * + * The detection of soft real-time queues/applications is described in + * detail in the comments on the function + * bfq_bfqq_softrt_next_start. On the other hand, the detection of an + * interactive queue works as follows: a queue is deemed interactive + * if it is constantly non empty only for a limited time interval, + * after which it does become empty. The queue may be deemed + * interactive again (for a limited time), if it restarts being + * constantly non empty, provided that this happens only after the + * queue has remained empty for a given minimum idle time. + * + * By default, BFQ computes automatically the above maximum time + * interval, i.e., the time interval after which a constantly + * non-empty queue stops being deemed interactive. Since a queue is + * weight-raised while it is deemed interactive, this maximum time + * interval happens to coincide with the (maximum) duration of the + * weight-raising for interactive queues. + * + * Finally, BFQ also features additional heuristics for + * preserving both a low latency and a high throughput on NCQ-capable, + * rotational or flash-based devices, and to get the job done quickly + * for applications consisting in many I/O-bound processes. + * + * NOTE: if the main or only goal, with a given device, is to achieve + * the maximum-possible throughput at all times, then do switch off + * all low-latency heuristics for that device, by setting low_latency + * to 0. + * + * BFQ is described in [1], where also a reference to the initial, + * more theoretical paper on BFQ can be found. The interested reader + * can find in the latter paper full details on the main algorithm, as + * well as formulas of the guarantees and formal proofs of all the + * properties. With respect to the version of BFQ presented in these + * papers, this implementation adds a few more heuristics, such as the + * ones that guarantee a low latency to interactive and soft real-time + * applications, and a hierarchical extension based on H-WF2Q+. + * + * B-WF2Q+ is based on WF2Q+, which is described in [2], together with + * H-WF2Q+, while the augmented tree used here to implement B-WF2Q+ + * with O(log N) complexity derives from the one introduced with EEVDF + * in [3]. + * + * [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O + * Scheduler", Proceedings of the First Workshop on Mobile System + * Technologies (MST-2015), May 2015. + * http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf + * + * [2] Jon C.R. Bennett and H. Zhang, "Hierarchical Packet Fair Queueing + * Algorithms", IEEE/ACM Transactions on Networking, 5(5):675-689, + * Oct 1997. + * + * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz + * + * [3] I. Stoica and H. Abdel-Wahab, "Earliest Eligible Virtual Deadline + * First: A Flexible and Accurate Mechanism for Proportional Share + * Resource Allocation", technical report. + * + * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "blk.h" +#include "blk-mq.h" +#include "blk-mq-tag.h" +#include "blk-mq-sched.h" +#include "bfq-iosched.h" +#include "blk-wbt.h" + +#define BFQ_BFQQ_FNS(name) \ +void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ +{ \ + __set_bit(BFQQF_##name, &(bfqq)->flags); \ +} \ +void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \ +{ \ + __clear_bit(BFQQF_##name, &(bfqq)->flags); \ +} \ +int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ +{ \ + return test_bit(BFQQF_##name, &(bfqq)->flags); \ +} + +BFQ_BFQQ_FNS(just_created); +BFQ_BFQQ_FNS(busy); +BFQ_BFQQ_FNS(wait_request); +BFQ_BFQQ_FNS(non_blocking_wait_rq); +BFQ_BFQQ_FNS(fifo_expire); +BFQ_BFQQ_FNS(has_short_ttime); +BFQ_BFQQ_FNS(sync); +BFQ_BFQQ_FNS(IO_bound); +BFQ_BFQQ_FNS(in_large_burst); +BFQ_BFQQ_FNS(coop); +BFQ_BFQQ_FNS(split_coop); +BFQ_BFQQ_FNS(softrt_update); +BFQ_BFQQ_FNS(has_waker); +#undef BFQ_BFQQ_FNS \ + +/* Expiration time of sync (0) and async (1) requests, in ns. */ +static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 }; + +/* Maximum backwards seek (magic number lifted from CFQ), in KiB. */ +static const int bfq_back_max = 16 * 1024; + +/* Penalty of a backwards seek, in number of sectors. */ +static const int bfq_back_penalty = 2; + +/* Idling period duration, in ns. */ +static u64 bfq_slice_idle = NSEC_PER_SEC / 125; + +/* Minimum number of assigned budgets for which stats are safe to compute. */ +static const int bfq_stats_min_budgets = 194; + +/* Default maximum budget values, in sectors and number of requests. */ +static const int bfq_default_max_budget = 16 * 1024; + +/* + * When a sync request is dispatched, the queue that contains that + * request, and all the ancestor entities of that queue, are charged + * with the number of sectors of the request. In contrast, if the + * request is async, then the queue and its ancestor entities are + * charged with the number of sectors of the request, multiplied by + * the factor below. This throttles the bandwidth for async I/O, + * w.r.t. to sync I/O, and it is done to counter the tendency of async + * writes to steal I/O throughput to reads. + * + * The current value of this parameter is the result of a tuning with + * several hardware and software configurations. We tried to find the + * lowest value for which writes do not cause noticeable problems to + * reads. In fact, the lower this parameter, the stabler I/O control, + * in the following respect. The lower this parameter is, the less + * the bandwidth enjoyed by a group decreases + * - when the group does writes, w.r.t. to when it does reads; + * - when other groups do reads, w.r.t. to when they do writes. + */ +static const int bfq_async_charge_factor = 3; + +/* Default timeout values, in jiffies, approximating CFQ defaults. */ +const int bfq_timeout = HZ / 8; + +/* + * Time limit for merging (see comments in bfq_setup_cooperator). Set + * to the slowest value that, in our tests, proved to be effective in + * removing false positives, while not causing true positives to miss + * queue merging. + * + * As can be deduced from the low time limit below, queue merging, if + * successful, happens at the very beginning of the I/O of the involved + * cooperating processes, as a consequence of the arrival of the very + * first requests from each cooperator. After that, there is very + * little chance to find cooperators. + */ +static const unsigned long bfq_merge_time_limit = HZ/10; + +static struct kmem_cache *bfq_pool; + +/* Below this threshold (in ns), we consider thinktime immediate. */ +#define BFQ_MIN_TT (2 * NSEC_PER_MSEC) + +/* hw_tag detection: parallel requests threshold and min samples needed. */ +#define BFQ_HW_QUEUE_THRESHOLD 3 +#define BFQ_HW_QUEUE_SAMPLES 32 + +#define BFQQ_SEEK_THR (sector_t)(8 * 100) +#define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32) +#define BFQ_RQ_SEEKY(bfqd, last_pos, rq) \ + (get_sdist(last_pos, rq) > \ + BFQQ_SEEK_THR && \ + (!blk_queue_nonrot(bfqd->queue) || \ + blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT)) +#define BFQQ_CLOSE_THR (sector_t)(8 * 1024) +#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 19) +/* + * Sync random I/O is likely to be confused with soft real-time I/O, + * because it is characterized by limited throughput and apparently + * isochronous arrival pattern. To avoid false positives, queues + * containing only random (seeky) I/O are prevented from being tagged + * as soft real-time. + */ +#define BFQQ_TOTALLY_SEEKY(bfqq) (bfqq->seek_history == -1) + +/* Min number of samples required to perform peak-rate update */ +#define BFQ_RATE_MIN_SAMPLES 32 +/* Min observation time interval required to perform a peak-rate update (ns) */ +#define BFQ_RATE_MIN_INTERVAL (300*NSEC_PER_MSEC) +/* Target observation time interval for a peak-rate update (ns) */ +#define BFQ_RATE_REF_INTERVAL NSEC_PER_SEC + +/* + * Shift used for peak-rate fixed precision calculations. + * With + * - the current shift: 16 positions + * - the current type used to store rate: u32 + * - the current unit of measure for rate: [sectors/usec], or, more precisely, + * [(sectors/usec) / 2^BFQ_RATE_SHIFT] to take into account the shift, + * the range of rates that can be stored is + * [1 / 2^BFQ_RATE_SHIFT, 2^(32 - BFQ_RATE_SHIFT)] sectors/usec = + * [1 / 2^16, 2^16] sectors/usec = [15e-6, 65536] sectors/usec = + * [15, 65G] sectors/sec + * Which, assuming a sector size of 512B, corresponds to a range of + * [7.5K, 33T] B/sec + */ +#define BFQ_RATE_SHIFT 16 + +/* + * When configured for computing the duration of the weight-raising + * for interactive queues automatically (see the comments at the + * beginning of this file), BFQ does it using the following formula: + * duration = (ref_rate / r) * ref_wr_duration, + * where r is the peak rate of the device, and ref_rate and + * ref_wr_duration are two reference parameters. In particular, + * ref_rate is the peak rate of the reference storage device (see + * below), and ref_wr_duration is about the maximum time needed, with + * BFQ and while reading two files in parallel, to load typical large + * applications on the reference device (see the comments on + * max_service_from_wr below, for more details on how ref_wr_duration + * is obtained). In practice, the slower/faster the device at hand + * is, the more/less it takes to load applications with respect to the + * reference device. Accordingly, the longer/shorter BFQ grants + * weight raising to interactive applications. + * + * BFQ uses two different reference pairs (ref_rate, ref_wr_duration), + * depending on whether the device is rotational or non-rotational. + * + * In the following definitions, ref_rate[0] and ref_wr_duration[0] + * are the reference values for a rotational device, whereas + * ref_rate[1] and ref_wr_duration[1] are the reference values for a + * non-rotational device. The reference rates are not the actual peak + * rates of the devices used as a reference, but slightly lower + * values. The reason for using slightly lower values is that the + * peak-rate estimator tends to yield slightly lower values than the + * actual peak rate (it can yield the actual peak rate only if there + * is only one process doing I/O, and the process does sequential + * I/O). + * + * The reference peak rates are measured in sectors/usec, left-shifted + * by BFQ_RATE_SHIFT. + */ +static int ref_rate[2] = {14000, 33000}; +/* + * To improve readability, a conversion function is used to initialize + * the following array, which entails that the array can be + * initialized only in a function. + */ +static int ref_wr_duration[2]; + +/* + * BFQ uses the above-detailed, time-based weight-raising mechanism to + * privilege interactive tasks. This mechanism is vulnerable to the + * following false positives: I/O-bound applications that will go on + * doing I/O for much longer than the duration of weight + * raising. These applications have basically no benefit from being + * weight-raised at the beginning of their I/O. On the opposite end, + * while being weight-raised, these applications + * a) unjustly steal throughput to applications that may actually need + * low latency; + * b) make BFQ uselessly perform device idling; device idling results + * in loss of device throughput with most flash-based storage, and may + * increase latencies when used purposelessly. + * + * BFQ tries to reduce these problems, by adopting the following + * countermeasure. To introduce this countermeasure, we need first to + * finish explaining how the duration of weight-raising for + * interactive tasks is computed. + * + * For a bfq_queue deemed as interactive, the duration of weight + * raising is dynamically adjusted, as a function of the estimated + * peak rate of the device, so as to be equal to the time needed to + * execute the 'largest' interactive task we benchmarked so far. By + * largest task, we mean the task for which each involved process has + * to do more I/O than for any of the other tasks we benchmarked. This + * reference interactive task is the start-up of LibreOffice Writer, + * and in this task each process/bfq_queue needs to have at most ~110K + * sectors transferred. + * + * This last piece of information enables BFQ to reduce the actual + * duration of weight-raising for at least one class of I/O-bound + * applications: those doing sequential or quasi-sequential I/O. An + * example is file copy. In fact, once started, the main I/O-bound + * processes of these applications usually consume the above 110K + * sectors in much less time than the processes of an application that + * is starting, because these I/O-bound processes will greedily devote + * almost all their CPU cycles only to their target, + * throughput-friendly I/O operations. This is even more true if BFQ + * happens to be underestimating the device peak rate, and thus + * overestimating the duration of weight raising. But, according to + * our measurements, once transferred 110K sectors, these processes + * have no right to be weight-raised any longer. + * + * Basing on the last consideration, BFQ ends weight-raising for a + * bfq_queue if the latter happens to have received an amount of + * service at least equal to the following constant. The constant is + * set to slightly more than 110K, to have a minimum safety margin. + * + * This early ending of weight-raising reduces the amount of time + * during which interactive false positives cause the two problems + * described at the beginning of these comments. + */ +static const unsigned long max_service_from_wr = 120000; + +#define RQ_BIC(rq) icq_to_bic((rq)->elv.priv[0]) +#define RQ_BFQQ(rq) ((rq)->elv.priv[1]) + +struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync) +{ + return bic->bfqq[is_sync]; +} + +void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync) +{ + struct bfq_queue *old_bfqq = bic->bfqq[is_sync]; + + /* Clear bic pointer if bfqq is detached from this bic */ + if (old_bfqq && old_bfqq->bic == bic) + old_bfqq->bic = NULL; + + bic->bfqq[is_sync] = bfqq; +} + +struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) +{ + return bic->icq.q->elevator->elevator_data; +} + +/** + * icq_to_bic - convert iocontext queue structure to bfq_io_cq. + * @icq: the iocontext queue. + */ +static struct bfq_io_cq *icq_to_bic(struct io_cq *icq) +{ + /* bic->icq is the first member, %NULL will convert to %NULL */ + return container_of(icq, struct bfq_io_cq, icq); +} + +/** + * bfq_bic_lookup - search into @ioc a bic associated to @bfqd. + * @bfqd: the lookup key. + * @ioc: the io_context of the process doing I/O. + * @q: the request queue. + */ +static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, + struct io_context *ioc, + struct request_queue *q) +{ + if (ioc) { + unsigned long flags; + struct bfq_io_cq *icq; + + spin_lock_irqsave(&q->queue_lock, flags); + icq = icq_to_bic(ioc_lookup_icq(ioc, q)); + spin_unlock_irqrestore(&q->queue_lock, flags); + + return icq; + } + + return NULL; +} + +/* + * Scheduler run of queue, if there are requests pending and no one in the + * driver that will restart queueing. + */ +void bfq_schedule_dispatch(struct bfq_data *bfqd) +{ + lockdep_assert_held(&bfqd->lock); + + if (bfqd->queued != 0) { + bfq_log(bfqd, "schedule dispatch"); + blk_mq_run_hw_queues(bfqd->queue, true); + } +} + +#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE) + +#define bfq_sample_valid(samples) ((samples) > 80) + +/* + * Lifted from AS - choose which of rq1 and rq2 that is best served now. + * We choose the request that is closer to the head right now. Distance + * behind the head is penalized and only allowed to a certain extent. + */ +static struct request *bfq_choose_req(struct bfq_data *bfqd, + struct request *rq1, + struct request *rq2, + sector_t last) +{ + sector_t s1, s2, d1 = 0, d2 = 0; + unsigned long back_max; +#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */ +#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */ + unsigned int wrap = 0; /* bit mask: requests behind the disk head? */ + + if (!rq1 || rq1 == rq2) + return rq2; + if (!rq2) + return rq1; + + if (rq_is_sync(rq1) && !rq_is_sync(rq2)) + return rq1; + else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) + return rq2; + if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) + return rq1; + else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META)) + return rq2; + + s1 = blk_rq_pos(rq1); + s2 = blk_rq_pos(rq2); + + /* + * By definition, 1KiB is 2 sectors. + */ + back_max = bfqd->bfq_back_max * 2; + + /* + * Strict one way elevator _except_ in the case where we allow + * short backward seeks which are biased as twice the cost of a + * similar forward seek. + */ + if (s1 >= last) + d1 = s1 - last; + else if (s1 + back_max >= last) + d1 = (last - s1) * bfqd->bfq_back_penalty; + else + wrap |= BFQ_RQ1_WRAP; + + if (s2 >= last) + d2 = s2 - last; + else if (s2 + back_max >= last) + d2 = (last - s2) * bfqd->bfq_back_penalty; + else + wrap |= BFQ_RQ2_WRAP; + + /* Found required data */ + + /* + * By doing switch() on the bit mask "wrap" we avoid having to + * check two variables for all permutations: --> faster! + */ + switch (wrap) { + case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ + if (d1 < d2) + return rq1; + else if (d2 < d1) + return rq2; + + if (s1 >= s2) + return rq1; + else + return rq2; + + case BFQ_RQ2_WRAP: + return rq1; + case BFQ_RQ1_WRAP: + return rq2; + case BFQ_RQ1_WRAP|BFQ_RQ2_WRAP: /* both rqs wrapped */ + default: + /* + * Since both rqs are wrapped, + * start with the one that's further behind head + * (--> only *one* back seek required), + * since back seek takes more time than forward. + */ + if (s1 <= s2) + return rq1; + else + return rq2; + } +} + +/* + * Async I/O can easily starve sync I/O (both sync reads and sync + * writes), by consuming all tags. Similarly, storms of sync writes, + * such as those that sync(2) may trigger, can starve sync reads. + * Limit depths of async I/O and sync writes so as to counter both + * problems. + */ +static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) +{ + struct bfq_data *bfqd = data->q->elevator->elevator_data; + + if (op_is_sync(op) && !op_is_write(op)) + return; + + data->shallow_depth = + bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)]; + + bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u", + __func__, bfqd->wr_busy_queues, op_is_sync(op), + data->shallow_depth); +} + +static struct bfq_queue * +bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, + sector_t sector, struct rb_node **ret_parent, + struct rb_node ***rb_link) +{ + struct rb_node **p, *parent; + struct bfq_queue *bfqq = NULL; + + parent = NULL; + p = &root->rb_node; + while (*p) { + struct rb_node **n; + + parent = *p; + bfqq = rb_entry(parent, struct bfq_queue, pos_node); + + /* + * Sort strictly based on sector. Smallest to the left, + * largest to the right. + */ + if (sector > blk_rq_pos(bfqq->next_rq)) + n = &(*p)->rb_right; + else if (sector < blk_rq_pos(bfqq->next_rq)) + n = &(*p)->rb_left; + else + break; + p = n; + bfqq = NULL; + } + + *ret_parent = parent; + if (rb_link) + *rb_link = p; + + bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d", + (unsigned long long)sector, + bfqq ? bfqq->pid : 0); + + return bfqq; +} + +static bool bfq_too_late_for_merging(struct bfq_queue *bfqq) +{ + return bfqq->service_from_backlogged > 0 && + time_is_before_jiffies(bfqq->first_IO_time + + bfq_merge_time_limit); +} + +/* + * The following function is not marked as __cold because it is + * actually cold, but for the same performance goal described in the + * comments on the likely() at the beginning of + * bfq_setup_cooperator(). Unexpectedly, to reach an even lower + * execution time for the case where this function is not invoked, we + * had to add an unlikely() in each involved if(). + */ +void __cold +bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + struct rb_node **p, *parent; + struct bfq_queue *__bfqq; + + if (bfqq->pos_root) { + rb_erase(&bfqq->pos_node, bfqq->pos_root); + bfqq->pos_root = NULL; + } + + /* oom_bfqq does not participate in queue merging */ + if (bfqq == &bfqd->oom_bfqq) + return; + + /* + * bfqq cannot be merged any longer (see comments in + * bfq_setup_cooperator): no point in adding bfqq into the + * position tree. + */ + if (bfq_too_late_for_merging(bfqq)) + return; + + if (bfq_class_idle(bfqq)) + return; + if (!bfqq->next_rq) + return; + + bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; + __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root, + blk_rq_pos(bfqq->next_rq), &parent, &p); + if (!__bfqq) { + rb_link_node(&bfqq->pos_node, parent, p); + rb_insert_color(&bfqq->pos_node, bfqq->pos_root); + } else + bfqq->pos_root = NULL; +} + +/* + * The following function returns false either if every active queue + * must receive the same share of the throughput (symmetric scenario), + * or, as a special case, if bfqq must receive a share of the + * throughput lower than or equal to the share that every other active + * queue must receive. If bfqq does sync I/O, then these are the only + * two cases where bfqq happens to be guaranteed its share of the + * throughput even if I/O dispatching is not plugged when bfqq remains + * temporarily empty (for more details, see the comments in the + * function bfq_better_to_idle()). For this reason, the return value + * of this function is used to check whether I/O-dispatch plugging can + * be avoided. + * + * The above first case (symmetric scenario) occurs when: + * 1) all active queues have the same weight, + * 2) all active queues belong to the same I/O-priority class, + * 3) all active groups at the same level in the groups tree have the same + * weight, + * 4) all active groups at the same level in the groups tree have the same + * number of children. + * + * Unfortunately, keeping the necessary state for evaluating exactly + * the last two symmetry sub-conditions above would be quite complex + * and time consuming. Therefore this function evaluates, instead, + * only the following stronger three sub-conditions, for which it is + * much easier to maintain the needed state: + * 1) all active queues have the same weight, + * 2) all active queues belong to the same I/O-priority class, + * 3) there are no active groups. + * In particular, the last condition is always true if hierarchical + * support or the cgroups interface are not enabled, thus no state + * needs to be maintained in this case. + */ +static bool bfq_asymmetric_scenario(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + bool smallest_weight = bfqq && + bfqq->weight_counter && + bfqq->weight_counter == + container_of( + rb_first_cached(&bfqd->queue_weights_tree), + struct bfq_weight_counter, + weights_node); + + /* + * For queue weights to differ, queue_weights_tree must contain + * at least two nodes. + */ + bool varied_queue_weights = !smallest_weight && + !RB_EMPTY_ROOT(&bfqd->queue_weights_tree.rb_root) && + (bfqd->queue_weights_tree.rb_root.rb_node->rb_left || + bfqd->queue_weights_tree.rb_root.rb_node->rb_right); + + bool multiple_classes_busy = + (bfqd->busy_queues[0] && bfqd->busy_queues[1]) || + (bfqd->busy_queues[0] && bfqd->busy_queues[2]) || + (bfqd->busy_queues[1] && bfqd->busy_queues[2]); + + return varied_queue_weights || multiple_classes_busy +#ifdef CONFIG_BFQ_GROUP_IOSCHED + || bfqd->num_groups_with_pending_reqs > 0 +#endif + ; +} + +/* + * If the weight-counter tree passed as input contains no counter for + * the weight of the input queue, then add that counter; otherwise just + * increment the existing counter. + * + * Note that weight-counter trees contain few nodes in mostly symmetric + * scenarios. For example, if all queues have the same weight, then the + * weight-counter tree for the queues may contain at most one node. + * This holds even if low_latency is on, because weight-raised queues + * are not inserted in the tree. + * In most scenarios, the rate at which nodes are created/destroyed + * should be low too. + */ +void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct rb_root_cached *root) +{ + struct bfq_entity *entity = &bfqq->entity; + struct rb_node **new = &(root->rb_root.rb_node), *parent = NULL; + bool leftmost = true; + + /* + * Do not insert if the queue is already associated with a + * counter, which happens if: + * 1) a request arrival has caused the queue to become both + * non-weight-raised, and hence change its weight, and + * backlogged; in this respect, each of the two events + * causes an invocation of this function, + * 2) this is the invocation of this function caused by the + * second event. This second invocation is actually useless, + * and we handle this fact by exiting immediately. More + * efficient or clearer solutions might possibly be adopted. + */ + if (bfqq->weight_counter) + return; + + while (*new) { + struct bfq_weight_counter *__counter = container_of(*new, + struct bfq_weight_counter, + weights_node); + parent = *new; + + if (entity->weight == __counter->weight) { + bfqq->weight_counter = __counter; + goto inc_counter; + } + if (entity->weight < __counter->weight) + new = &((*new)->rb_left); + else { + new = &((*new)->rb_right); + leftmost = false; + } + } + + bfqq->weight_counter = kzalloc(sizeof(struct bfq_weight_counter), + GFP_ATOMIC); + + /* + * In the unlucky event of an allocation failure, we just + * exit. This will cause the weight of queue to not be + * considered in bfq_asymmetric_scenario, which, in its turn, + * causes the scenario to be deemed wrongly symmetric in case + * bfqq's weight would have been the only weight making the + * scenario asymmetric. On the bright side, no unbalance will + * however occur when bfqq becomes inactive again (the + * invocation of this function is triggered by an activation + * of queue). In fact, bfq_weights_tree_remove does nothing + * if !bfqq->weight_counter. + */ + if (unlikely(!bfqq->weight_counter)) + return; + + bfqq->weight_counter->weight = entity->weight; + rb_link_node(&bfqq->weight_counter->weights_node, parent, new); + rb_insert_color_cached(&bfqq->weight_counter->weights_node, root, + leftmost); + +inc_counter: + bfqq->weight_counter->num_active++; + bfqq->ref++; +} + +/* + * Decrement the weight counter associated with the queue, and, if the + * counter reaches 0, remove the counter from the tree. + * See the comments to the function bfq_weights_tree_add() for considerations + * about overhead. + */ +void __bfq_weights_tree_remove(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + struct rb_root_cached *root) +{ + if (!bfqq->weight_counter) + return; + + bfqq->weight_counter->num_active--; + if (bfqq->weight_counter->num_active > 0) + goto reset_entity_pointer; + + rb_erase_cached(&bfqq->weight_counter->weights_node, root); + kfree(bfqq->weight_counter); + +reset_entity_pointer: + bfqq->weight_counter = NULL; + bfq_put_queue(bfqq); +} + +/* + * Invoke __bfq_weights_tree_remove on bfqq and decrement the number + * of active groups for each queue's inactive parent entity. + */ +void bfq_weights_tree_remove(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = bfqq->entity.parent; + + for_each_entity(entity) { + struct bfq_sched_data *sd = entity->my_sched_data; + + if (sd->next_in_service || sd->in_service_entity) { + /* + * entity is still active, because either + * next_in_service or in_service_entity is not + * NULL (see the comments on the definition of + * next_in_service for details on why + * in_service_entity must be checked too). + * + * As a consequence, its parent entities are + * active as well, and thus this loop must + * stop here. + */ + break; + } + + /* + * The decrement of num_groups_with_pending_reqs is + * not performed immediately upon the deactivation of + * entity, but it is delayed to when it also happens + * that the first leaf descendant bfqq of entity gets + * all its pending requests completed. The following + * instructions perform this delayed decrement, if + * needed. See the comments on + * num_groups_with_pending_reqs for details. + */ + if (entity->in_groups_with_pending_reqs) { + entity->in_groups_with_pending_reqs = false; + bfqd->num_groups_with_pending_reqs--; + } + } + + /* + * Next function is invoked last, because it causes bfqq to be + * freed if the following holds: bfqq is not in service and + * has no dispatched request. DO NOT use bfqq after the next + * function invocation. + */ + __bfq_weights_tree_remove(bfqd, bfqq, + &bfqd->queue_weights_tree); +} + +/* + * Return expired entry, or NULL to just start from scratch in rbtree. + */ +static struct request *bfq_check_fifo(struct bfq_queue *bfqq, + struct request *last) +{ + struct request *rq; + + if (bfq_bfqq_fifo_expire(bfqq)) + return NULL; + + bfq_mark_bfqq_fifo_expire(bfqq); + + rq = rq_entry_fifo(bfqq->fifo.next); + + if (rq == last || ktime_get_ns() < rq->fifo_time) + return NULL; + + bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq); + return rq; +} + +static struct request *bfq_find_next_rq(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + struct request *last) +{ + struct rb_node *rbnext = rb_next(&last->rb_node); + struct rb_node *rbprev = rb_prev(&last->rb_node); + struct request *next, *prev = NULL; + + /* Follow expired path, else get first next available. */ + next = bfq_check_fifo(bfqq, last); + if (next) + return next; + + if (rbprev) + prev = rb_entry_rq(rbprev); + + if (rbnext) + next = rb_entry_rq(rbnext); + else { + rbnext = rb_first(&bfqq->sort_list); + if (rbnext && rbnext != &last->rb_node) + next = rb_entry_rq(rbnext); + } + + return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last)); +} + +/* see the definition of bfq_async_charge_factor for details */ +static unsigned long bfq_serv_to_charge(struct request *rq, + struct bfq_queue *bfqq) +{ + if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1 || + bfq_asymmetric_scenario(bfqq->bfqd, bfqq)) + return blk_rq_sectors(rq); + + return blk_rq_sectors(rq) * bfq_async_charge_factor; +} + +/** + * bfq_updated_next_req - update the queue after a new next_rq selection. + * @bfqd: the device data the queue belongs to. + * @bfqq: the queue to update. + * + * If the first request of a queue changes we make sure that the queue + * has enough budget to serve at least its first request (if the + * request has grown). We do this because if the queue has not enough + * budget for its first request, it has to go through two dispatch + * rounds to actually get it dispatched. + */ +static void bfq_updated_next_req(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + struct request *next_rq = bfqq->next_rq; + unsigned long new_budget; + + if (!next_rq) + return; + + if (bfqq == bfqd->in_service_queue) + /* + * In order not to break guarantees, budgets cannot be + * changed after an entity has been selected. + */ + return; + + new_budget = max_t(unsigned long, + max_t(unsigned long, bfqq->max_budget, + bfq_serv_to_charge(next_rq, bfqq)), + entity->service); + if (entity->budget != new_budget) { + entity->budget = new_budget; + bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", + new_budget); + bfq_requeue_bfqq(bfqd, bfqq, false); + } +} + +static unsigned int bfq_wr_duration(struct bfq_data *bfqd) +{ + u64 dur; + + if (bfqd->bfq_wr_max_time > 0) + return bfqd->bfq_wr_max_time; + + dur = bfqd->rate_dur_prod; + do_div(dur, bfqd->peak_rate); + + /* + * Limit duration between 3 and 25 seconds. The upper limit + * has been conservatively set after the following worst case: + * on a QEMU/KVM virtual machine + * - running in a slow PC + * - with a virtual disk stacked on a slow low-end 5400rpm HDD + * - serving a heavy I/O workload, such as the sequential reading + * of several files + * mplayer took 23 seconds to start, if constantly weight-raised. + * + * As for higher values than that accommodating the above bad + * scenario, tests show that higher values would often yield + * the opposite of the desired result, i.e., would worsen + * responsiveness by allowing non-interactive applications to + * preserve weight raising for too long. + * + * On the other end, lower values than 3 seconds make it + * difficult for most interactive tasks to complete their jobs + * before weight-raising finishes. + */ + return clamp_val(dur, msecs_to_jiffies(3000), msecs_to_jiffies(25000)); +} + +/* switch back from soft real-time to interactive weight raising */ +static void switch_back_to_interactive_wr(struct bfq_queue *bfqq, + struct bfq_data *bfqd) +{ + bfqq->wr_coeff = bfqd->bfq_wr_coeff; + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); + bfqq->last_wr_start_finish = bfqq->wr_start_at_switch_to_srt; +} + +static void +bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, + struct bfq_io_cq *bic, bool bfq_already_existing) +{ + unsigned int old_wr_coeff = bfqq->wr_coeff; + bool busy = bfq_already_existing && bfq_bfqq_busy(bfqq); + + if (bic->saved_has_short_ttime) + bfq_mark_bfqq_has_short_ttime(bfqq); + else + bfq_clear_bfqq_has_short_ttime(bfqq); + + if (bic->saved_IO_bound) + bfq_mark_bfqq_IO_bound(bfqq); + else + bfq_clear_bfqq_IO_bound(bfqq); + + bfqq->entity.new_weight = bic->saved_weight; + bfqq->ttime = bic->saved_ttime; + bfqq->wr_coeff = bic->saved_wr_coeff; + bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt; + bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish; + bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time; + + if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) || + time_is_before_jiffies(bfqq->last_wr_start_finish + + bfqq->wr_cur_max_time))) { + if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && + !bfq_bfqq_in_large_burst(bfqq) && + time_is_after_eq_jiffies(bfqq->wr_start_at_switch_to_srt + + bfq_wr_duration(bfqd))) { + switch_back_to_interactive_wr(bfqq, bfqd); + } else { + bfqq->wr_coeff = 1; + bfq_log_bfqq(bfqq->bfqd, bfqq, + "resume state: switching off wr"); + } + } + + /* make sure weight will be updated, however we got here */ + bfqq->entity.prio_changed = 1; + + if (likely(!busy)) + return; + + if (old_wr_coeff == 1 && bfqq->wr_coeff > 1) + bfqd->wr_busy_queues++; + else if (old_wr_coeff > 1 && bfqq->wr_coeff == 1) + bfqd->wr_busy_queues--; +} + +static int bfqq_process_refs(struct bfq_queue *bfqq) +{ + return bfqq->ref - bfqq->allocated - bfqq->entity.on_st_or_in_serv - + (bfqq->weight_counter != NULL); +} + +/* Empty burst list and add just bfqq (see comments on bfq_handle_burst) */ +static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + struct bfq_queue *item; + struct hlist_node *n; + + hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node) + hlist_del_init(&item->burst_list_node); + + /* + * Start the creation of a new burst list only if there is no + * active queue. See comments on the conditional invocation of + * bfq_handle_burst(). + */ + if (bfq_tot_busy_queues(bfqd) == 0) { + hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); + bfqd->burst_size = 1; + } else + bfqd->burst_size = 0; + + bfqd->burst_parent_entity = bfqq->entity.parent; +} + +/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */ +static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + /* Increment burst size to take into account also bfqq */ + bfqd->burst_size++; + + if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) { + struct bfq_queue *pos, *bfqq_item; + struct hlist_node *n; + + /* + * Enough queues have been activated shortly after each + * other to consider this burst as large. + */ + bfqd->large_burst = true; + + /* + * We can now mark all queues in the burst list as + * belonging to a large burst. + */ + hlist_for_each_entry(bfqq_item, &bfqd->burst_list, + burst_list_node) + bfq_mark_bfqq_in_large_burst(bfqq_item); + bfq_mark_bfqq_in_large_burst(bfqq); + + /* + * From now on, and until the current burst finishes, any + * new queue being activated shortly after the last queue + * was inserted in the burst can be immediately marked as + * belonging to a large burst. So the burst list is not + * needed any more. Remove it. + */ + hlist_for_each_entry_safe(pos, n, &bfqd->burst_list, + burst_list_node) + hlist_del_init(&pos->burst_list_node); + } else /* + * Burst not yet large: add bfqq to the burst list. Do + * not increment the ref counter for bfqq, because bfqq + * is removed from the burst list before freeing bfqq + * in put_queue. + */ + hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); +} + +/* + * If many queues belonging to the same group happen to be created + * shortly after each other, then the processes associated with these + * queues have typically a common goal. In particular, bursts of queue + * creations are usually caused by services or applications that spawn + * many parallel threads/processes. Examples are systemd during boot, + * or git grep. To help these processes get their job done as soon as + * possible, it is usually better to not grant either weight-raising + * or device idling to their queues, unless these queues must be + * protected from the I/O flowing through other active queues. + * + * In this comment we describe, firstly, the reasons why this fact + * holds, and, secondly, the next function, which implements the main + * steps needed to properly mark these queues so that they can then be + * treated in a different way. + * + * The above services or applications benefit mostly from a high + * throughput: the quicker the requests of the activated queues are + * cumulatively served, the sooner the target job of these queues gets + * completed. As a consequence, weight-raising any of these queues, + * which also implies idling the device for it, is almost always + * counterproductive, unless there are other active queues to isolate + * these new queues from. If there no other active queues, then + * weight-raising these new queues just lowers throughput in most + * cases. + * + * On the other hand, a burst of queue creations may be caused also by + * the start of an application that does not consist of a lot of + * parallel I/O-bound threads. In fact, with a complex application, + * several short processes may need to be executed to start-up the + * application. In this respect, to start an application as quickly as + * possible, the best thing to do is in any case to privilege the I/O + * related to the application with respect to all other + * I/O. Therefore, the best strategy to start as quickly as possible + * an application that causes a burst of queue creations is to + * weight-raise all the queues created during the burst. This is the + * exact opposite of the best strategy for the other type of bursts. + * + * In the end, to take the best action for each of the two cases, the + * two types of bursts need to be distinguished. Fortunately, this + * seems relatively easy, by looking at the sizes of the bursts. In + * particular, we found a threshold such that only bursts with a + * larger size than that threshold are apparently caused by + * services or commands such as systemd or git grep. For brevity, + * hereafter we call just 'large' these bursts. BFQ *does not* + * weight-raise queues whose creation occurs in a large burst. In + * addition, for each of these queues BFQ performs or does not perform + * idling depending on which choice boosts the throughput more. The + * exact choice depends on the device and request pattern at + * hand. + * + * Unfortunately, false positives may occur while an interactive task + * is starting (e.g., an application is being started). The + * consequence is that the queues associated with the task do not + * enjoy weight raising as expected. Fortunately these false positives + * are very rare. They typically occur if some service happens to + * start doing I/O exactly when the interactive task starts. + * + * Turning back to the next function, it is invoked only if there are + * no active queues (apart from active queues that would belong to the + * same, possible burst bfqq would belong to), and it implements all + * the steps needed to detect the occurrence of a large burst and to + * properly mark all the queues belonging to it (so that they can then + * be treated in a different way). This goal is achieved by + * maintaining a "burst list" that holds, temporarily, the queues that + * belong to the burst in progress. The list is then used to mark + * these queues as belonging to a large burst if the burst does become + * large. The main steps are the following. + * + * . when the very first queue is created, the queue is inserted into the + * list (as it could be the first queue in a possible burst) + * + * . if the current burst has not yet become large, and a queue Q that does + * not yet belong to the burst is activated shortly after the last time + * at which a new queue entered the burst list, then the function appends + * Q to the burst list + * + * . if, as a consequence of the previous step, the burst size reaches + * the large-burst threshold, then + * + * . all the queues in the burst list are marked as belonging to a + * large burst + * + * . the burst list is deleted; in fact, the burst list already served + * its purpose (keeping temporarily track of the queues in a burst, + * so as to be able to mark them as belonging to a large burst in the + * previous sub-step), and now is not needed any more + * + * . the device enters a large-burst mode + * + * . if a queue Q that does not belong to the burst is created while + * the device is in large-burst mode and shortly after the last time + * at which a queue either entered the burst list or was marked as + * belonging to the current large burst, then Q is immediately marked + * as belonging to a large burst. + * + * . if a queue Q that does not belong to the burst is created a while + * later, i.e., not shortly after, than the last time at which a queue + * either entered the burst list or was marked as belonging to the + * current large burst, then the current burst is deemed as finished and: + * + * . the large-burst mode is reset if set + * + * . the burst list is emptied + * + * . Q is inserted in the burst list, as Q may be the first queue + * in a possible new burst (then the burst list contains just Q + * after this step). + */ +static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + /* + * If bfqq is already in the burst list or is part of a large + * burst, or finally has just been split, then there is + * nothing else to do. + */ + if (!hlist_unhashed(&bfqq->burst_list_node) || + bfq_bfqq_in_large_burst(bfqq) || + time_is_after_eq_jiffies(bfqq->split_time + + msecs_to_jiffies(10))) + return; + + /* + * If bfqq's creation happens late enough, or bfqq belongs to + * a different group than the burst group, then the current + * burst is finished, and related data structures must be + * reset. + * + * In this respect, consider the special case where bfqq is + * the very first queue created after BFQ is selected for this + * device. In this case, last_ins_in_burst and + * burst_parent_entity are not yet significant when we get + * here. But it is easy to verify that, whether or not the + * following condition is true, bfqq will end up being + * inserted into the burst list. In particular the list will + * happen to contain only bfqq. And this is exactly what has + * to happen, as bfqq may be the first queue of the first + * burst. + */ + if (time_is_before_jiffies(bfqd->last_ins_in_burst + + bfqd->bfq_burst_interval) || + bfqq->entity.parent != bfqd->burst_parent_entity) { + bfqd->large_burst = false; + bfq_reset_burst_list(bfqd, bfqq); + goto end; + } + + /* + * If we get here, then bfqq is being activated shortly after the + * last queue. So, if the current burst is also large, we can mark + * bfqq as belonging to this large burst immediately. + */ + if (bfqd->large_burst) { + bfq_mark_bfqq_in_large_burst(bfqq); + goto end; + } + + /* + * If we get here, then a large-burst state has not yet been + * reached, but bfqq is being activated shortly after the last + * queue. Then we add bfqq to the burst. + */ + bfq_add_to_burst(bfqd, bfqq); +end: + /* + * At this point, bfqq either has been added to the current + * burst or has caused the current burst to terminate and a + * possible new burst to start. In particular, in the second + * case, bfqq has become the first queue in the possible new + * burst. In both cases last_ins_in_burst needs to be moved + * forward. + */ + bfqd->last_ins_in_burst = jiffies; +} + +static int bfq_bfqq_budget_left(struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + + return entity->budget - entity->service; +} + +/* + * If enough samples have been computed, return the current max budget + * stored in bfqd, which is dynamically updated according to the + * estimated disk peak rate; otherwise return the default max budget + */ +static int bfq_max_budget(struct bfq_data *bfqd) +{ + if (bfqd->budgets_assigned < bfq_stats_min_budgets) + return bfq_default_max_budget; + else + return bfqd->bfq_max_budget; +} + +/* + * Return min budget, which is a fraction of the current or default + * max budget (trying with 1/32) + */ +static int bfq_min_budget(struct bfq_data *bfqd) +{ + if (bfqd->budgets_assigned < bfq_stats_min_budgets) + return bfq_default_max_budget / 32; + else + return bfqd->bfq_max_budget / 32; +} + +/* + * The next function, invoked after the input queue bfqq switches from + * idle to busy, updates the budget of bfqq. The function also tells + * whether the in-service queue should be expired, by returning + * true. The purpose of expiring the in-service queue is to give bfqq + * the chance to possibly preempt the in-service queue, and the reason + * for preempting the in-service queue is to achieve one of the two + * goals below. + * + * 1. Guarantee to bfqq its reserved bandwidth even if bfqq has + * expired because it has remained idle. In particular, bfqq may have + * expired for one of the following two reasons: + * + * - BFQQE_NO_MORE_REQUESTS bfqq did not enjoy any device idling + * and did not make it to issue a new request before its last + * request was served; + * + * - BFQQE_TOO_IDLE bfqq did enjoy device idling, but did not issue + * a new request before the expiration of the idling-time. + * + * Even if bfqq has expired for one of the above reasons, the process + * associated with the queue may be however issuing requests greedily, + * and thus be sensitive to the bandwidth it receives (bfqq may have + * remained idle for other reasons: CPU high load, bfqq not enjoying + * idling, I/O throttling somewhere in the path from the process to + * the I/O scheduler, ...). But if, after every expiration for one of + * the above two reasons, bfqq has to wait for the service of at least + * one full budget of another queue before being served again, then + * bfqq is likely to get a much lower bandwidth or resource time than + * its reserved ones. To address this issue, two countermeasures need + * to be taken. + * + * First, the budget and the timestamps of bfqq need to be updated in + * a special way on bfqq reactivation: they need to be updated as if + * bfqq did not remain idle and did not expire. In fact, if they are + * computed as if bfqq expired and remained idle until reactivation, + * then the process associated with bfqq is treated as if, instead of + * being greedy, it stopped issuing requests when bfqq remained idle, + * and restarts issuing requests only on this reactivation. In other + * words, the scheduler does not help the process recover the "service + * hole" between bfqq expiration and reactivation. As a consequence, + * the process receives a lower bandwidth than its reserved one. In + * contrast, to recover this hole, the budget must be updated as if + * bfqq was not expired at all before this reactivation, i.e., it must + * be set to the value of the remaining budget when bfqq was + * expired. Along the same line, timestamps need to be assigned the + * value they had the last time bfqq was selected for service, i.e., + * before last expiration. Thus timestamps need to be back-shifted + * with respect to their normal computation (see [1] for more details + * on this tricky aspect). + * + * Secondly, to allow the process to recover the hole, the in-service + * queue must be expired too, to give bfqq the chance to preempt it + * immediately. In fact, if bfqq has to wait for a full budget of the + * in-service queue to be completed, then it may become impossible to + * let the process recover the hole, even if the back-shifted + * timestamps of bfqq are lower than those of the in-service queue. If + * this happens for most or all of the holes, then the process may not + * receive its reserved bandwidth. In this respect, it is worth noting + * that, being the service of outstanding requests unpreemptible, a + * little fraction of the holes may however be unrecoverable, thereby + * causing a little loss of bandwidth. + * + * The last important point is detecting whether bfqq does need this + * bandwidth recovery. In this respect, the next function deems the + * process associated with bfqq greedy, and thus allows it to recover + * the hole, if: 1) the process is waiting for the arrival of a new + * request (which implies that bfqq expired for one of the above two + * reasons), and 2) such a request has arrived soon. The first + * condition is controlled through the flag non_blocking_wait_rq, + * while the second through the flag arrived_in_time. If both + * conditions hold, then the function computes the budget in the + * above-described special way, and signals that the in-service queue + * should be expired. Timestamp back-shifting is done later in + * __bfq_activate_entity. + * + * 2. Reduce latency. Even if timestamps are not backshifted to let + * the process associated with bfqq recover a service hole, bfqq may + * however happen to have, after being (re)activated, a lower finish + * timestamp than the in-service queue. That is, the next budget of + * bfqq may have to be completed before the one of the in-service + * queue. If this is the case, then preempting the in-service queue + * allows this goal to be achieved, apart from the unpreemptible, + * outstanding requests mentioned above. + * + * Unfortunately, regardless of which of the above two goals one wants + * to achieve, service trees need first to be updated to know whether + * the in-service queue must be preempted. To have service trees + * correctly updated, the in-service queue must be expired and + * rescheduled, and bfqq must be scheduled too. This is one of the + * most costly operations (in future versions, the scheduling + * mechanism may be re-designed in such a way to make it possible to + * know whether preemption is needed without needing to update service + * trees). In addition, queue preemptions almost always cause random + * I/O, which may in turn cause loss of throughput. Finally, there may + * even be no in-service queue when the next function is invoked (so, + * no queue to compare timestamps with). Because of these facts, the + * next function adopts the following simple scheme to avoid costly + * operations, too frequent preemptions and too many dependencies on + * the state of the scheduler: it requests the expiration of the + * in-service queue (unconditionally) only for queues that need to + * recover a hole. Then it delegates to other parts of the code the + * responsibility of handling the above case 2. + */ +static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + bool arrived_in_time) +{ + struct bfq_entity *entity = &bfqq->entity; + + /* + * In the next compound condition, we check also whether there + * is some budget left, because otherwise there is no point in + * trying to go on serving bfqq with this same budget: bfqq + * would be expired immediately after being selected for + * service. This would only cause useless overhead. + */ + if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time && + bfq_bfqq_budget_left(bfqq) > 0) { + /* + * We do not clear the flag non_blocking_wait_rq here, as + * the latter is used in bfq_activate_bfqq to signal + * that timestamps need to be back-shifted (and is + * cleared right after). + */ + + /* + * In next assignment we rely on that either + * entity->service or entity->budget are not updated + * on expiration if bfqq is empty (see + * __bfq_bfqq_recalc_budget). Thus both quantities + * remain unchanged after such an expiration, and the + * following statement therefore assigns to + * entity->budget the remaining budget on such an + * expiration. + */ + entity->budget = min_t(unsigned long, + bfq_bfqq_budget_left(bfqq), + bfqq->max_budget); + + /* + * At this point, we have used entity->service to get + * the budget left (needed for updating + * entity->budget). Thus we finally can, and have to, + * reset entity->service. The latter must be reset + * because bfqq would otherwise be charged again for + * the service it has received during its previous + * service slot(s). + */ + entity->service = 0; + + return true; + } + + /* + * We can finally complete expiration, by setting service to 0. + */ + entity->service = 0; + entity->budget = max_t(unsigned long, bfqq->max_budget, + bfq_serv_to_charge(bfqq->next_rq, bfqq)); + bfq_clear_bfqq_non_blocking_wait_rq(bfqq); + return false; +} + +/* + * Return the farthest past time instant according to jiffies + * macros. + */ +static unsigned long bfq_smallest_from_now(void) +{ + return jiffies - MAX_JIFFY_OFFSET; +} + +static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + unsigned int old_wr_coeff, + bool wr_or_deserves_wr, + bool interactive, + bool in_burst, + bool soft_rt) +{ + if (old_wr_coeff == 1 && wr_or_deserves_wr) { + /* start a weight-raising period */ + if (interactive) { + bfqq->service_from_wr = 0; + bfqq->wr_coeff = bfqd->bfq_wr_coeff; + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); + } else { + /* + * No interactive weight raising in progress + * here: assign minus infinity to + * wr_start_at_switch_to_srt, to make sure + * that, at the end of the soft-real-time + * weight raising periods that is starting + * now, no interactive weight-raising period + * may be wrongly considered as still in + * progress (and thus actually started by + * mistake). + */ + bfqq->wr_start_at_switch_to_srt = + bfq_smallest_from_now(); + bfqq->wr_coeff = bfqd->bfq_wr_coeff * + BFQ_SOFTRT_WEIGHT_FACTOR; + bfqq->wr_cur_max_time = + bfqd->bfq_wr_rt_max_time; + } + + /* + * If needed, further reduce budget to make sure it is + * close to bfqq's backlog, so as to reduce the + * scheduling-error component due to a too large + * budget. Do not care about throughput consequences, + * but only about latency. Finally, do not assign a + * too small budget either, to avoid increasing + * latency by causing too frequent expirations. + */ + bfqq->entity.budget = min_t(unsigned long, + bfqq->entity.budget, + 2 * bfq_min_budget(bfqd)); + } else if (old_wr_coeff > 1) { + if (interactive) { /* update wr coeff and duration */ + bfqq->wr_coeff = bfqd->bfq_wr_coeff; + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); + } else if (in_burst) + bfqq->wr_coeff = 1; + else if (soft_rt) { + /* + * The application is now or still meeting the + * requirements for being deemed soft rt. We + * can then correctly and safely (re)charge + * the weight-raising duration for the + * application with the weight-raising + * duration for soft rt applications. + * + * In particular, doing this recharge now, i.e., + * before the weight-raising period for the + * application finishes, reduces the probability + * of the following negative scenario: + * 1) the weight of a soft rt application is + * raised at startup (as for any newly + * created application), + * 2) since the application is not interactive, + * at a certain time weight-raising is + * stopped for the application, + * 3) at that time the application happens to + * still have pending requests, and hence + * is destined to not have a chance to be + * deemed soft rt before these requests are + * completed (see the comments to the + * function bfq_bfqq_softrt_next_start() + * for details on soft rt detection), + * 4) these pending requests experience a high + * latency because the application is not + * weight-raised while they are pending. + */ + if (bfqq->wr_cur_max_time != + bfqd->bfq_wr_rt_max_time) { + bfqq->wr_start_at_switch_to_srt = + bfqq->last_wr_start_finish; + + bfqq->wr_cur_max_time = + bfqd->bfq_wr_rt_max_time; + bfqq->wr_coeff = bfqd->bfq_wr_coeff * + BFQ_SOFTRT_WEIGHT_FACTOR; + } + bfqq->last_wr_start_finish = jiffies; + } + } +} + +static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + return bfqq->dispatched == 0 && + time_is_before_jiffies( + bfqq->budget_timeout + + bfqd->bfq_wr_min_idle_time); +} + + +/* + * Return true if bfqq is in a higher priority class, or has a higher + * weight than the in-service queue. + */ +static bool bfq_bfqq_higher_class_or_weight(struct bfq_queue *bfqq, + struct bfq_queue *in_serv_bfqq) +{ + int bfqq_weight, in_serv_weight; + + if (bfqq->ioprio_class < in_serv_bfqq->ioprio_class) + return true; + + if (in_serv_bfqq->entity.parent == bfqq->entity.parent) { + bfqq_weight = bfqq->entity.weight; + in_serv_weight = in_serv_bfqq->entity.weight; + } else { + if (bfqq->entity.parent) + bfqq_weight = bfqq->entity.parent->weight; + else + bfqq_weight = bfqq->entity.weight; + if (in_serv_bfqq->entity.parent) + in_serv_weight = in_serv_bfqq->entity.parent->weight; + else + in_serv_weight = in_serv_bfqq->entity.weight; + } + + return bfqq_weight > in_serv_weight; +} + +static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + int old_wr_coeff, + struct request *rq, + bool *interactive) +{ + bool soft_rt, in_burst, wr_or_deserves_wr, + bfqq_wants_to_preempt, + idle_for_long_time = bfq_bfqq_idle_for_long_time(bfqd, bfqq), + /* + * See the comments on + * bfq_bfqq_update_budg_for_activation for + * details on the usage of the next variable. + */ + arrived_in_time = ktime_get_ns() <= + bfqq->ttime.last_end_request + + bfqd->bfq_slice_idle * 3; + + + /* + * bfqq deserves to be weight-raised if: + * - it is sync, + * - it does not belong to a large burst, + * - it has been idle for enough time or is soft real-time, + * - is linked to a bfq_io_cq (it is not shared in any sense). + */ + in_burst = bfq_bfqq_in_large_burst(bfqq); + soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && + !BFQQ_TOTALLY_SEEKY(bfqq) && + !in_burst && + time_is_before_jiffies(bfqq->soft_rt_next_start) && + bfqq->dispatched == 0; + *interactive = !in_burst && idle_for_long_time; + wr_or_deserves_wr = bfqd->low_latency && + (bfqq->wr_coeff > 1 || + (bfq_bfqq_sync(bfqq) && + bfqq->bic && (*interactive || soft_rt))); + + /* + * Using the last flag, update budget and check whether bfqq + * may want to preempt the in-service queue. + */ + bfqq_wants_to_preempt = + bfq_bfqq_update_budg_for_activation(bfqd, bfqq, + arrived_in_time); + + /* + * If bfqq happened to be activated in a burst, but has been + * idle for much more than an interactive queue, then we + * assume that, in the overall I/O initiated in the burst, the + * I/O associated with bfqq is finished. So bfqq does not need + * to be treated as a queue belonging to a burst + * anymore. Accordingly, we reset bfqq's in_large_burst flag + * if set, and remove bfqq from the burst list if it's + * there. We do not decrement burst_size, because the fact + * that bfqq does not need to belong to the burst list any + * more does not invalidate the fact that bfqq was created in + * a burst. + */ + if (likely(!bfq_bfqq_just_created(bfqq)) && + idle_for_long_time && + time_is_before_jiffies( + bfqq->budget_timeout + + msecs_to_jiffies(10000))) { + hlist_del_init(&bfqq->burst_list_node); + bfq_clear_bfqq_in_large_burst(bfqq); + } + + bfq_clear_bfqq_just_created(bfqq); + + + if (!bfq_bfqq_IO_bound(bfqq)) { + if (arrived_in_time) { + bfqq->requests_within_timer++; + if (bfqq->requests_within_timer >= + bfqd->bfq_requests_within_timer) + bfq_mark_bfqq_IO_bound(bfqq); + } else + bfqq->requests_within_timer = 0; + } + + if (bfqd->low_latency) { + if (unlikely(time_is_after_jiffies(bfqq->split_time))) + /* wraparound */ + bfqq->split_time = + jiffies - bfqd->bfq_wr_min_idle_time - 1; + + if (time_is_before_jiffies(bfqq->split_time + + bfqd->bfq_wr_min_idle_time)) { + bfq_update_bfqq_wr_on_rq_arrival(bfqd, bfqq, + old_wr_coeff, + wr_or_deserves_wr, + *interactive, + in_burst, + soft_rt); + + if (old_wr_coeff != bfqq->wr_coeff) + bfqq->entity.prio_changed = 1; + } + } + + bfqq->last_idle_bklogged = jiffies; + bfqq->service_from_backlogged = 0; + bfq_clear_bfqq_softrt_update(bfqq); + + bfq_add_bfqq_busy(bfqd, bfqq); + + /* + * Expire in-service queue only if preemption may be needed + * for guarantees. In particular, we care only about two + * cases. The first is that bfqq has to recover a service + * hole, as explained in the comments on + * bfq_bfqq_update_budg_for_activation(), i.e., that + * bfqq_wants_to_preempt is true. However, if bfqq does not + * carry time-critical I/O, then bfqq's bandwidth is less + * important than that of queues that carry time-critical I/O. + * So, as a further constraint, we consider this case only if + * bfqq is at least as weight-raised, i.e., at least as time + * critical, as the in-service queue. + * + * The second case is that bfqq is in a higher priority class, + * or has a higher weight than the in-service queue. If this + * condition does not hold, we don't care because, even if + * bfqq does not start to be served immediately, the resulting + * delay for bfqq's I/O is however lower or much lower than + * the ideal completion time to be guaranteed to bfqq's I/O. + * + * In both cases, preemption is needed only if, according to + * the timestamps of both bfqq and of the in-service queue, + * bfqq actually is the next queue to serve. So, to reduce + * useless preemptions, the return value of + * next_queue_may_preempt() is considered in the next compound + * condition too. Yet next_queue_may_preempt() just checks a + * simple, necessary condition for bfqq to be the next queue + * to serve. In fact, to evaluate a sufficient condition, the + * timestamps of the in-service queue would need to be + * updated, and this operation is quite costly (see the + * comments on bfq_bfqq_update_budg_for_activation()). + */ + if (bfqd->in_service_queue && + ((bfqq_wants_to_preempt && + bfqq->wr_coeff >= bfqd->in_service_queue->wr_coeff) || + bfq_bfqq_higher_class_or_weight(bfqq, bfqd->in_service_queue)) && + next_queue_may_preempt(bfqd)) + bfq_bfqq_expire(bfqd, bfqd->in_service_queue, + false, BFQQE_PREEMPTED); +} + +static void bfq_reset_inject_limit(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + /* invalidate baseline total service time */ + bfqq->last_serv_time_ns = 0; + + /* + * Reset pointer in case we are waiting for + * some request completion. + */ + bfqd->waited_rq = NULL; + + /* + * If bfqq has a short think time, then start by setting the + * inject limit to 0 prudentially, because the service time of + * an injected I/O request may be higher than the think time + * of bfqq, and therefore, if one request was injected when + * bfqq remains empty, this injected request might delay the + * service of the next I/O request for bfqq significantly. In + * case bfqq can actually tolerate some injection, then the + * adaptive update will however raise the limit soon. This + * lucky circumstance holds exactly because bfqq has a short + * think time, and thus, after remaining empty, is likely to + * get new I/O enqueued---and then completed---before being + * expired. This is the very pattern that gives the + * limit-update algorithm the chance to measure the effect of + * injection on request service times, and then to update the + * limit accordingly. + * + * However, in the following special case, the inject limit is + * left to 1 even if the think time is short: bfqq's I/O is + * synchronized with that of some other queue, i.e., bfqq may + * receive new I/O only after the I/O of the other queue is + * completed. Keeping the inject limit to 1 allows the + * blocking I/O to be served while bfqq is in service. And + * this is very convenient both for bfqq and for overall + * throughput, as explained in detail in the comments in + * bfq_update_has_short_ttime(). + * + * On the opposite end, if bfqq has a long think time, then + * start directly by 1, because: + * a) on the bright side, keeping at most one request in + * service in the drive is unlikely to cause any harm to the + * latency of bfqq's requests, as the service time of a single + * request is likely to be lower than the think time of bfqq; + * b) on the downside, after becoming empty, bfqq is likely to + * expire before getting its next request. With this request + * arrival pattern, it is very hard to sample total service + * times and update the inject limit accordingly (see comments + * on bfq_update_inject_limit()). So the limit is likely to be + * never, or at least seldom, updated. As a consequence, by + * setting the limit to 1, we avoid that no injection ever + * occurs with bfqq. On the downside, this proactive step + * further reduces chances to actually compute the baseline + * total service time. Thus it reduces chances to execute the + * limit-update algorithm and possibly raise the limit to more + * than 1. + */ + if (bfq_bfqq_has_short_ttime(bfqq)) + bfqq->inject_limit = 0; + else + bfqq->inject_limit = 1; + + bfqq->decrease_time_jif = jiffies; +} + +static void bfq_add_request(struct request *rq) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq); + struct bfq_data *bfqd = bfqq->bfqd; + struct request *next_rq, *prev; + unsigned int old_wr_coeff = bfqq->wr_coeff; + bool interactive = false; + + bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq)); + bfqq->queued[rq_is_sync(rq)]++; + bfqd->queued++; + + if (RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_sync(bfqq)) { + /* + * Detect whether bfqq's I/O seems synchronized with + * that of some other queue, i.e., whether bfqq, after + * remaining empty, happens to receive new I/O only + * right after some I/O request of the other queue has + * been completed. We call waker queue the other + * queue, and we assume, for simplicity, that bfqq may + * have at most one waker queue. + * + * A remarkable throughput boost can be reached by + * unconditionally injecting the I/O of the waker + * queue, every time a new bfq_dispatch_request + * happens to be invoked while I/O is being plugged + * for bfqq. In addition to boosting throughput, this + * unblocks bfqq's I/O, thereby improving bandwidth + * and latency for bfqq. Note that these same results + * may be achieved with the general injection + * mechanism, but less effectively. For details on + * this aspect, see the comments on the choice of the + * queue for injection in bfq_select_queue(). + * + * Turning back to the detection of a waker queue, a + * queue Q is deemed as a waker queue for bfqq if, for + * two consecutive times, bfqq happens to become non + * empty right after a request of Q has been + * completed. In particular, on the first time, Q is + * tentatively set as a candidate waker queue, while + * on the second time, the flag + * bfq_bfqq_has_waker(bfqq) is set to confirm that Q + * is a waker queue for bfqq. These detection steps + * are performed only if bfqq has a long think time, + * so as to make it more likely that bfqq's I/O is + * actually being blocked by a synchronization. This + * last filter, plus the above two-times requirement, + * make false positives less likely. + * + * NOTE + * + * The sooner a waker queue is detected, the sooner + * throughput can be boosted by injecting I/O from the + * waker queue. Fortunately, detection is likely to be + * actually fast, for the following reasons. While + * blocked by synchronization, bfqq has a long think + * time. This implies that bfqq's inject limit is at + * least equal to 1 (see the comments in + * bfq_update_inject_limit()). So, thanks to + * injection, the waker queue is likely to be served + * during the very first I/O-plugging time interval + * for bfqq. This triggers the first step of the + * detection mechanism. Thanks again to injection, the + * candidate waker queue is then likely to be + * confirmed no later than during the next + * I/O-plugging interval for bfqq. + */ + if (bfqd->last_completed_rq_bfqq && + !bfq_bfqq_has_short_ttime(bfqq) && + ktime_get_ns() - bfqd->last_completion < + 200 * NSEC_PER_USEC) { + if (bfqd->last_completed_rq_bfqq != bfqq && + bfqd->last_completed_rq_bfqq != + bfqq->waker_bfqq) { + /* + * First synchronization detected with + * a candidate waker queue, or with a + * different candidate waker queue + * from the current one. + */ + bfqq->waker_bfqq = bfqd->last_completed_rq_bfqq; + + /* + * If the waker queue disappears, then + * bfqq->waker_bfqq must be reset. To + * this goal, we maintain in each + * waker queue a list, woken_list, of + * all the queues that reference the + * waker queue through their + * waker_bfqq pointer. When the waker + * queue exits, the waker_bfqq pointer + * of all the queues in the woken_list + * is reset. + * + * In addition, if bfqq is already in + * the woken_list of a waker queue, + * then, before being inserted into + * the woken_list of a new waker + * queue, bfqq must be removed from + * the woken_list of the old waker + * queue. + */ + if (!hlist_unhashed(&bfqq->woken_list_node)) + hlist_del_init(&bfqq->woken_list_node); + hlist_add_head(&bfqq->woken_list_node, + &bfqd->last_completed_rq_bfqq->woken_list); + + bfq_clear_bfqq_has_waker(bfqq); + } else if (bfqd->last_completed_rq_bfqq == + bfqq->waker_bfqq && + !bfq_bfqq_has_waker(bfqq)) { + /* + * synchronization with waker_bfqq + * seen for the second time + */ + bfq_mark_bfqq_has_waker(bfqq); + } + } + + /* + * Periodically reset inject limit, to make sure that + * the latter eventually drops in case workload + * changes, see step (3) in the comments on + * bfq_update_inject_limit(). + */ + if (time_is_before_eq_jiffies(bfqq->decrease_time_jif + + msecs_to_jiffies(1000))) + bfq_reset_inject_limit(bfqd, bfqq); + + /* + * The following conditions must hold to setup a new + * sampling of total service time, and then a new + * update of the inject limit: + * - bfqq is in service, because the total service + * time is evaluated only for the I/O requests of + * the queues in service; + * - this is the right occasion to compute or to + * lower the baseline total service time, because + * there are actually no requests in the drive, + * or + * the baseline total service time is available, and + * this is the right occasion to compute the other + * quantity needed to update the inject limit, i.e., + * the total service time caused by the amount of + * injection allowed by the current value of the + * limit. It is the right occasion because injection + * has actually been performed during the service + * hole, and there are still in-flight requests, + * which are very likely to be exactly the injected + * requests, or part of them; + * - the minimum interval for sampling the total + * service time and updating the inject limit has + * elapsed. + */ + if (bfqq == bfqd->in_service_queue && + (bfqd->rq_in_driver == 0 || + (bfqq->last_serv_time_ns > 0 && + bfqd->rqs_injected && bfqd->rq_in_driver > 0)) && + time_is_before_eq_jiffies(bfqq->decrease_time_jif + + msecs_to_jiffies(10))) { + bfqd->last_empty_occupied_ns = ktime_get_ns(); + /* + * Start the state machine for measuring the + * total service time of rq: setting + * wait_dispatch will cause bfqd->waited_rq to + * be set when rq will be dispatched. + */ + bfqd->wait_dispatch = true; + /* + * If there is no I/O in service in the drive, + * then possible injection occurred before the + * arrival of rq will not affect the total + * service time of rq. So the injection limit + * must not be updated as a function of such + * total service time, unless new injection + * occurs before rq is completed. To have the + * injection limit updated only in the latter + * case, reset rqs_injected here (rqs_injected + * will be set in case injection is performed + * on bfqq before rq is completed). + */ + if (bfqd->rq_in_driver == 0) + bfqd->rqs_injected = false; + } + } + + elv_rb_add(&bfqq->sort_list, rq); + + /* + * Check if this request is a better next-serve candidate. + */ + prev = bfqq->next_rq; + next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); + bfqq->next_rq = next_rq; + + /* + * Adjust priority tree position, if next_rq changes. + * See comments on bfq_pos_tree_add_move() for the unlikely(). + */ + if (unlikely(!bfqd->nonrot_with_queueing && prev != bfqq->next_rq)) + bfq_pos_tree_add_move(bfqd, bfqq); + + if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... */ + bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, old_wr_coeff, + rq, &interactive); + else { + if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) && + time_is_before_jiffies( + bfqq->last_wr_start_finish + + bfqd->bfq_wr_min_inter_arr_async)) { + bfqq->wr_coeff = bfqd->bfq_wr_coeff; + bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); + + bfqd->wr_busy_queues++; + bfqq->entity.prio_changed = 1; + } + if (prev != bfqq->next_rq) + bfq_updated_next_req(bfqd, bfqq); + } + + /* + * Assign jiffies to last_wr_start_finish in the following + * cases: + * + * . if bfqq is not going to be weight-raised, because, for + * non weight-raised queues, last_wr_start_finish stores the + * arrival time of the last request; as of now, this piece + * of information is used only for deciding whether to + * weight-raise async queues + * + * . if bfqq is not weight-raised, because, if bfqq is now + * switching to weight-raised, then last_wr_start_finish + * stores the time when weight-raising starts + * + * . if bfqq is interactive, because, regardless of whether + * bfqq is currently weight-raised, the weight-raising + * period must start or restart (this case is considered + * separately because it is not detected by the above + * conditions, if bfqq is already weight-raised) + * + * last_wr_start_finish has to be updated also if bfqq is soft + * real-time, because the weight-raising period is constantly + * restarted on idle-to-busy transitions for these queues, but + * this is already done in bfq_bfqq_handle_idle_busy_switch if + * needed. + */ + if (bfqd->low_latency && + (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive)) + bfqq->last_wr_start_finish = jiffies; +} + +static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, + struct bio *bio, + struct request_queue *q) +{ + struct bfq_queue *bfqq = bfqd->bio_bfqq; + + + if (bfqq) + return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio)); + + return NULL; +} + +static sector_t get_sdist(sector_t last_pos, struct request *rq) +{ + if (last_pos) + return abs(blk_rq_pos(rq) - last_pos); + + return 0; +} + +#if 0 /* Still not clear if we can do without next two functions */ +static void bfq_activate_request(struct request_queue *q, struct request *rq) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + + bfqd->rq_in_driver++; +} + +static void bfq_deactivate_request(struct request_queue *q, struct request *rq) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + + bfqd->rq_in_driver--; +} +#endif + +static void bfq_remove_request(struct request_queue *q, + struct request *rq) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq); + struct bfq_data *bfqd = bfqq->bfqd; + const int sync = rq_is_sync(rq); + + if (bfqq->next_rq == rq) { + bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); + bfq_updated_next_req(bfqd, bfqq); + } + + if (rq->queuelist.prev != &rq->queuelist) + list_del_init(&rq->queuelist); + bfqq->queued[sync]--; + bfqd->queued--; + elv_rb_del(&bfqq->sort_list, rq); + + elv_rqhash_del(q, rq); + if (q->last_merge == rq) + q->last_merge = NULL; + + if (RB_EMPTY_ROOT(&bfqq->sort_list)) { + bfqq->next_rq = NULL; + + if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) { + bfq_del_bfqq_busy(bfqd, bfqq, false); + /* + * bfqq emptied. In normal operation, when + * bfqq is empty, bfqq->entity.service and + * bfqq->entity.budget must contain, + * respectively, the service received and the + * budget used last time bfqq emptied. These + * facts do not hold in this case, as at least + * this last removal occurred while bfqq is + * not in service. To avoid inconsistencies, + * reset both bfqq->entity.service and + * bfqq->entity.budget, if bfqq has still a + * process that may issue I/O requests to it. + */ + bfqq->entity.budget = bfqq->entity.service = 0; + } + + /* + * Remove queue from request-position tree as it is empty. + */ + if (bfqq->pos_root) { + rb_erase(&bfqq->pos_node, bfqq->pos_root); + bfqq->pos_root = NULL; + } + } else { + /* see comments on bfq_pos_tree_add_move() for the unlikely() */ + if (unlikely(!bfqd->nonrot_with_queueing)) + bfq_pos_tree_add_move(bfqd, bfqq); + } + + if (rq->cmd_flags & REQ_META) + bfqq->meta_pending--; + +} + +static bool bfq_bio_merge(struct request_queue *q, struct bio *bio, + unsigned int nr_segs) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct request *free = NULL; + /* + * bfq_bic_lookup grabs the queue_lock: invoke it now and + * store its return value for later use, to avoid nesting + * queue_lock inside the bfqd->lock. We assume that the bic + * returned by bfq_bic_lookup does not go away before + * bfqd->lock is taken. + */ + struct bfq_io_cq *bic = bfq_bic_lookup(bfqd, current->io_context, q); + bool ret; + + spin_lock_irq(&bfqd->lock); + + if (bic) { + /* + * Make sure cgroup info is uptodate for current process before + * considering the merge. + */ + bfq_bic_update_cgroup(bic, bio); + + bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf)); + } else { + bfqd->bio_bfqq = NULL; + } + bfqd->bio_bic = bic; + + ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free); + + if (free) + blk_mq_free_request(free); + spin_unlock_irq(&bfqd->lock); + + return ret; +} + +static int bfq_request_merge(struct request_queue *q, struct request **req, + struct bio *bio) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + struct request *__rq; + + __rq = bfq_find_rq_fmerge(bfqd, bio, q); + if (__rq && elv_bio_merge_ok(__rq, bio)) { + *req = __rq; + + if (blk_discard_mergable(__rq)) + return ELEVATOR_DISCARD_MERGE; + return ELEVATOR_FRONT_MERGE; + } + + return ELEVATOR_NO_MERGE; +} + +static void bfq_request_merged(struct request_queue *q, struct request *req, + enum elv_merge type) +{ + if (type == ELEVATOR_FRONT_MERGE && + rb_prev(&req->rb_node) && + blk_rq_pos(req) < + blk_rq_pos(container_of(rb_prev(&req->rb_node), + struct request, rb_node))) { + struct bfq_queue *bfqq = RQ_BFQQ(req); + struct bfq_data *bfqd; + struct request *prev, *next_rq; + + if (!bfqq) + return; + + bfqd = bfqq->bfqd; + + /* Reposition request in its sort_list */ + elv_rb_del(&bfqq->sort_list, req); + elv_rb_add(&bfqq->sort_list, req); + + /* Choose next request to be served for bfqq */ + prev = bfqq->next_rq; + next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req, + bfqd->last_position); + bfqq->next_rq = next_rq; + /* + * If next_rq changes, update both the queue's budget to + * fit the new request and the queue's position in its + * rq_pos_tree. + */ + if (prev != bfqq->next_rq) { + bfq_updated_next_req(bfqd, bfqq); + /* + * See comments on bfq_pos_tree_add_move() for + * the unlikely(). + */ + if (unlikely(!bfqd->nonrot_with_queueing)) + bfq_pos_tree_add_move(bfqd, bfqq); + } + } +} + +/* + * This function is called to notify the scheduler that the requests + * rq and 'next' have been merged, with 'next' going away. BFQ + * exploits this hook to address the following issue: if 'next' has a + * fifo_time lower that rq, then the fifo_time of rq must be set to + * the value of 'next', to not forget the greater age of 'next'. + * + * NOTE: in this function we assume that rq is in a bfq_queue, basing + * on that rq is picked from the hash table q->elevator->hash, which, + * in its turn, is filled only with I/O requests present in + * bfq_queues, while BFQ is in use for the request queue q. In fact, + * the function that fills this hash table (elv_rqhash_add) is called + * only by bfq_insert_request. + */ +static void bfq_requests_merged(struct request_queue *q, struct request *rq, + struct request *next) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq), + *next_bfqq = RQ_BFQQ(next); + + if (!bfqq) + return; + + /* + * If next and rq belong to the same bfq_queue and next is older + * than rq, then reposition rq in the fifo (by substituting next + * with rq). Otherwise, if next and rq belong to different + * bfq_queues, never reposition rq: in fact, we would have to + * reposition it with respect to next's position in its own fifo, + * which would most certainly be too expensive with respect to + * the benefits. + */ + if (bfqq == next_bfqq && + !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && + next->fifo_time < rq->fifo_time) { + list_del_init(&rq->queuelist); + list_replace_init(&next->queuelist, &rq->queuelist); + rq->fifo_time = next->fifo_time; + } + + if (bfqq->next_rq == next) + bfqq->next_rq = rq; + + bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); +} + +/* Must be called with bfqq != NULL */ +static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) +{ + if (bfq_bfqq_busy(bfqq)) + bfqq->bfqd->wr_busy_queues--; + bfqq->wr_coeff = 1; + bfqq->wr_cur_max_time = 0; + bfqq->last_wr_start_finish = jiffies; + /* + * Trigger a weight change on the next invocation of + * __bfq_entity_update_weight_prio. + */ + bfqq->entity.prio_changed = 1; +} + +void bfq_end_wr_async_queues(struct bfq_data *bfqd, + struct bfq_group *bfqg) +{ + int i, j; + + for (i = 0; i < 2; i++) + for (j = 0; j < IOPRIO_BE_NR; j++) + if (bfqg->async_bfqq[i][j]) + bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]); + if (bfqg->async_idle_bfqq) + bfq_bfqq_end_wr(bfqg->async_idle_bfqq); +} + +static void bfq_end_wr(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq; + + spin_lock_irq(&bfqd->lock); + + list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) + bfq_bfqq_end_wr(bfqq); + list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) + bfq_bfqq_end_wr(bfqq); + bfq_end_wr_async(bfqd); + + spin_unlock_irq(&bfqd->lock); +} + +static sector_t bfq_io_struct_pos(void *io_struct, bool request) +{ + if (request) + return blk_rq_pos(io_struct); + else + return ((struct bio *)io_struct)->bi_iter.bi_sector; +} + +static int bfq_rq_close_to_sector(void *io_struct, bool request, + sector_t sector) +{ + return abs(bfq_io_struct_pos(io_struct, request) - sector) <= + BFQQ_CLOSE_THR; +} + +static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + sector_t sector) +{ + struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; + struct rb_node *parent, *node; + struct bfq_queue *__bfqq; + + if (RB_EMPTY_ROOT(root)) + return NULL; + + /* + * First, if we find a request starting at the end of the last + * request, choose it. + */ + __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL); + if (__bfqq) + return __bfqq; + + /* + * If the exact sector wasn't found, the parent of the NULL leaf + * will contain the closest sector (rq_pos_tree sorted by + * next_request position). + */ + __bfqq = rb_entry(parent, struct bfq_queue, pos_node); + if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) + return __bfqq; + + if (blk_rq_pos(__bfqq->next_rq) < sector) + node = rb_next(&__bfqq->pos_node); + else + node = rb_prev(&__bfqq->pos_node); + if (!node) + return NULL; + + __bfqq = rb_entry(node, struct bfq_queue, pos_node); + if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) + return __bfqq; + + return NULL; +} + +static struct bfq_queue *bfq_find_close_cooperator(struct bfq_data *bfqd, + struct bfq_queue *cur_bfqq, + sector_t sector) +{ + struct bfq_queue *bfqq; + + /* + * We shall notice if some of the queues are cooperating, + * e.g., working closely on the same area of the device. In + * that case, we can group them together and: 1) don't waste + * time idling, and 2) serve the union of their requests in + * the best possible order for throughput. + */ + bfqq = bfqq_find_close(bfqd, cur_bfqq, sector); + if (!bfqq || bfqq == cur_bfqq) + return NULL; + + return bfqq; +} + +static struct bfq_queue * +bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) +{ + int process_refs, new_process_refs; + struct bfq_queue *__bfqq; + + /* + * If there are no process references on the new_bfqq, then it is + * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain + * may have dropped their last reference (not just their last process + * reference). + */ + if (!bfqq_process_refs(new_bfqq)) + return NULL; + + /* Avoid a circular list and skip interim queue merges. */ + while ((__bfqq = new_bfqq->new_bfqq)) { + if (__bfqq == bfqq) + return NULL; + new_bfqq = __bfqq; + } + + process_refs = bfqq_process_refs(bfqq); + new_process_refs = bfqq_process_refs(new_bfqq); + /* + * If the process for the bfqq has gone away, there is no + * sense in merging the queues. + */ + if (process_refs == 0 || new_process_refs == 0) + return NULL; + + /* + * Make sure merged queues belong to the same parent. Parents could + * have changed since the time we decided the two queues are suitable + * for merging. + */ + if (new_bfqq->entity.parent != bfqq->entity.parent) + return NULL; + + bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d", + new_bfqq->pid); + + /* + * Merging is just a redirection: the requests of the process + * owning one of the two queues are redirected to the other queue. + * The latter queue, in its turn, is set as shared if this is the + * first time that the requests of some process are redirected to + * it. + * + * We redirect bfqq to new_bfqq and not the opposite, because + * we are in the context of the process owning bfqq, thus we + * have the io_cq of this process. So we can immediately + * configure this io_cq to redirect the requests of the + * process to new_bfqq. In contrast, the io_cq of new_bfqq is + * not available any more (new_bfqq->bic == NULL). + * + * Anyway, even in case new_bfqq coincides with the in-service + * queue, redirecting requests the in-service queue is the + * best option, as we feed the in-service queue with new + * requests close to the last request served and, by doing so, + * are likely to increase the throughput. + */ + bfqq->new_bfqq = new_bfqq; + /* + * The above assignment schedules the following redirections: + * each time some I/O for bfqq arrives, the process that + * generated that I/O is disassociated from bfqq and + * associated with new_bfqq. Here we increases new_bfqq->ref + * in advance, adding the number of processes that are + * expected to be associated with new_bfqq as they happen to + * issue I/O. + */ + new_bfqq->ref += process_refs; + return new_bfqq; +} + +static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, + struct bfq_queue *new_bfqq) +{ + if (bfq_too_late_for_merging(new_bfqq)) + return false; + + if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) || + (bfqq->ioprio_class != new_bfqq->ioprio_class)) + return false; + + /* + * If either of the queues has already been detected as seeky, + * then merging it with the other queue is unlikely to lead to + * sequential I/O. + */ + if (BFQQ_SEEKY(bfqq) || BFQQ_SEEKY(new_bfqq)) + return false; + + /* + * Interleaved I/O is known to be done by (some) applications + * only for reads, so it does not make sense to merge async + * queues. + */ + if (!bfq_bfqq_sync(bfqq) || !bfq_bfqq_sync(new_bfqq)) + return false; + + return true; +} + +/* + * Attempt to schedule a merge of bfqq with the currently in-service + * queue or with a close queue among the scheduled queues. Return + * NULL if no merge was scheduled, a pointer to the shared bfq_queue + * structure otherwise. + * + * The OOM queue is not allowed to participate to cooperation: in fact, since + * the requests temporarily redirected to the OOM queue could be redirected + * again to dedicated queues at any time, the state needed to correctly + * handle merging with the OOM queue would be quite complex and expensive + * to maintain. Besides, in such a critical condition as an out of memory, + * the benefits of queue merging may be little relevant, or even negligible. + * + * WARNING: queue merging may impair fairness among non-weight raised + * queues, for at least two reasons: 1) the original weight of a + * merged queue may change during the merged state, 2) even being the + * weight the same, a merged queue may be bloated with many more + * requests than the ones produced by its originally-associated + * process. + */ +static struct bfq_queue * +bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, + void *io_struct, bool request) +{ + struct bfq_queue *in_service_bfqq, *new_bfqq; + + /* if a merge has already been setup, then proceed with that first */ + if (bfqq->new_bfqq) + return bfqq->new_bfqq; + + /* + * Do not perform queue merging if the device is non + * rotational and performs internal queueing. In fact, such a + * device reaches a high speed through internal parallelism + * and pipelining. This means that, to reach a high + * throughput, it must have many requests enqueued at the same + * time. But, in this configuration, the internal scheduling + * algorithm of the device does exactly the job of queue + * merging: it reorders requests so as to obtain as much as + * possible a sequential I/O pattern. As a consequence, with + * the workload generated by processes doing interleaved I/O, + * the throughput reached by the device is likely to be the + * same, with and without queue merging. + * + * Disabling merging also provides a remarkable benefit in + * terms of throughput. Merging tends to make many workloads + * artificially more uneven, because of shared queues + * remaining non empty for incomparably more time than + * non-merged queues. This may accentuate workload + * asymmetries. For example, if one of the queues in a set of + * merged queues has a higher weight than a normal queue, then + * the shared queue may inherit such a high weight and, by + * staying almost always active, may force BFQ to perform I/O + * plugging most of the time. This evidently makes it harder + * for BFQ to let the device reach a high throughput. + * + * Finally, the likely() macro below is not used because one + * of the two branches is more likely than the other, but to + * have the code path after the following if() executed as + * fast as possible for the case of a non rotational device + * with queueing. We want it because this is the fastest kind + * of device. On the opposite end, the likely() may lengthen + * the execution time of BFQ for the case of slower devices + * (rotational or at least without queueing). But in this case + * the execution time of BFQ matters very little, if not at + * all. + */ + if (likely(bfqd->nonrot_with_queueing)) + return NULL; + + /* + * Prevent bfqq from being merged if it has been created too + * long ago. The idea is that true cooperating processes, and + * thus their associated bfq_queues, are supposed to be + * created shortly after each other. This is the case, e.g., + * for KVM/QEMU and dump I/O threads. Basing on this + * assumption, the following filtering greatly reduces the + * probability that two non-cooperating processes, which just + * happen to do close I/O for some short time interval, have + * their queues merged by mistake. + */ + if (bfq_too_late_for_merging(bfqq)) + return NULL; + + if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) + return NULL; + + /* If there is only one backlogged queue, don't search. */ + if (bfq_tot_busy_queues(bfqd) == 1) + return NULL; + + in_service_bfqq = bfqd->in_service_queue; + + if (in_service_bfqq && in_service_bfqq != bfqq && + likely(in_service_bfqq != &bfqd->oom_bfqq) && + bfq_rq_close_to_sector(io_struct, request, + bfqd->in_serv_last_pos) && + bfqq->entity.parent == in_service_bfqq->entity.parent && + bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) { + new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); + if (new_bfqq) + return new_bfqq; + } + /* + * Check whether there is a cooperator among currently scheduled + * queues. The only thing we need is that the bio/request is not + * NULL, as we need it to establish whether a cooperator exists. + */ + new_bfqq = bfq_find_close_cooperator(bfqd, bfqq, + bfq_io_struct_pos(io_struct, request)); + + if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) && + bfq_may_be_close_cooperator(bfqq, new_bfqq)) + return bfq_setup_merge(bfqq, new_bfqq); + + return NULL; +} + +static void bfq_bfqq_save_state(struct bfq_queue *bfqq) +{ + struct bfq_io_cq *bic = bfqq->bic; + + /* + * If !bfqq->bic, the queue is already shared or its requests + * have already been redirected to a shared queue; both idle window + * and weight raising state have already been saved. Do nothing. + */ + if (!bic) + return; + + bic->saved_weight = bfqq->entity.orig_weight; + bic->saved_ttime = bfqq->ttime; + bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq); + bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); + bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); + bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); + if (unlikely(bfq_bfqq_just_created(bfqq) && + !bfq_bfqq_in_large_burst(bfqq) && + bfqq->bfqd->low_latency)) { + /* + * bfqq being merged right after being created: bfqq + * would have deserved interactive weight raising, but + * did not make it to be set in a weight-raised state, + * because of this early merge. Store directly the + * weight-raising state that would have been assigned + * to bfqq, so that to avoid that bfqq unjustly fails + * to enjoy weight raising if split soon. + */ + bic->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff; + bic->saved_wr_start_at_switch_to_srt = bfq_smallest_from_now(); + bic->saved_wr_cur_max_time = bfq_wr_duration(bfqq->bfqd); + bic->saved_last_wr_start_finish = jiffies; + } else { + bic->saved_wr_coeff = bfqq->wr_coeff; + bic->saved_wr_start_at_switch_to_srt = + bfqq->wr_start_at_switch_to_srt; + bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; + bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; + } +} + +void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + /* + * To prevent bfqq's service guarantees from being violated, + * bfqq may be left busy, i.e., queued for service, even if + * empty (see comments in __bfq_bfqq_expire() for + * details). But, if no process will send requests to bfqq any + * longer, then there is no point in keeping bfqq queued for + * service. In addition, keeping bfqq queued for service, but + * with no process ref any longer, may have caused bfqq to be + * freed when dequeued from service. But this is assumed to + * never happen. + */ + if (bfq_bfqq_busy(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list) && + bfqq != bfqd->in_service_queue) + bfq_del_bfqq_busy(bfqd, bfqq, false); + + bfq_put_queue(bfqq); +} + +static void +bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, + struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) +{ + bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", + (unsigned long)new_bfqq->pid); + /* Save weight raising and idle window of the merged queues */ + bfq_bfqq_save_state(bfqq); + bfq_bfqq_save_state(new_bfqq); + if (bfq_bfqq_IO_bound(bfqq)) + bfq_mark_bfqq_IO_bound(new_bfqq); + bfq_clear_bfqq_IO_bound(bfqq); + + /* + * If bfqq is weight-raised, then let new_bfqq inherit + * weight-raising. To reduce false positives, neglect the case + * where bfqq has just been created, but has not yet made it + * to be weight-raised (which may happen because EQM may merge + * bfqq even before bfq_add_request is executed for the first + * time for bfqq). Handling this case would however be very + * easy, thanks to the flag just_created. + */ + if (new_bfqq->wr_coeff == 1 && bfqq->wr_coeff > 1) { + new_bfqq->wr_coeff = bfqq->wr_coeff; + new_bfqq->wr_cur_max_time = bfqq->wr_cur_max_time; + new_bfqq->last_wr_start_finish = bfqq->last_wr_start_finish; + new_bfqq->wr_start_at_switch_to_srt = + bfqq->wr_start_at_switch_to_srt; + if (bfq_bfqq_busy(new_bfqq)) + bfqd->wr_busy_queues++; + new_bfqq->entity.prio_changed = 1; + } + + if (bfqq->wr_coeff > 1) { /* bfqq has given its wr to new_bfqq */ + bfqq->wr_coeff = 1; + bfqq->entity.prio_changed = 1; + if (bfq_bfqq_busy(bfqq)) + bfqd->wr_busy_queues--; + } + + bfq_log_bfqq(bfqd, new_bfqq, "merge_bfqqs: wr_busy %d", + bfqd->wr_busy_queues); + + /* + * Merge queues (that is, let bic redirect its requests to new_bfqq) + */ + bic_set_bfqq(bic, new_bfqq, true); + bfq_mark_bfqq_coop(new_bfqq); + /* + * new_bfqq now belongs to at least two bics (it is a shared queue): + * set new_bfqq->bic to NULL. bfqq either: + * - does not belong to any bic any more, and hence bfqq->bic must + * be set to NULL, or + * - is a queue whose owning bics have already been redirected to a + * different queue, hence the queue is destined to not belong to + * any bic soon and bfqq->bic is already NULL (therefore the next + * assignment causes no harm). + */ + new_bfqq->bic = NULL; + /* + * If the queue is shared, the pid is the pid of one of the associated + * processes. Which pid depends on the exact sequence of merge events + * the queue underwent. So printing such a pid is useless and confusing + * because it reports a random pid between those of the associated + * processes. + * We mark such a queue with a pid -1, and then print SHARED instead of + * a pid in logging messages. + */ + new_bfqq->pid = -1; + bfqq->bic = NULL; + bfq_release_process_ref(bfqd, bfqq); +} + +static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, + struct bio *bio) +{ + struct bfq_data *bfqd = q->elevator->elevator_data; + bool is_sync = op_is_sync(bio->bi_opf); + struct bfq_queue *bfqq = bfqd->bio_bfqq, *new_bfqq; + + /* + * Disallow merge of a sync bio into an async request. + */ + if (is_sync && !rq_is_sync(rq)) + return false; + + /* + * Lookup the bfqq that this bio will be queued with. Allow + * merge only if rq is queued there. + */ + if (!bfqq) + return false; + + /* + * We take advantage of this function to perform an early merge + * of the queues of possible cooperating processes. + */ + new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); + if (new_bfqq) { + /* + * bic still points to bfqq, then it has not yet been + * redirected to some other bfq_queue, and a queue + * merge between bfqq and new_bfqq can be safely + * fulfilled, i.e., bic can be redirected to new_bfqq + * and bfqq can be put. + */ + bfq_merge_bfqqs(bfqd, bfqd->bio_bic, bfqq, + new_bfqq); + /* + * If we get here, bio will be queued into new_queue, + * so use new_bfqq to decide whether bio and rq can be + * merged. + */ + bfqq = new_bfqq; + + /* + * Change also bqfd->bio_bfqq, as + * bfqd->bio_bic now points to new_bfqq, and + * this function may be invoked again (and then may + * use again bqfd->bio_bfqq). + */ + bfqd->bio_bfqq = bfqq; + } + + return bfqq == RQ_BFQQ(rq); +} + +/* + * Set the maximum time for the in-service queue to consume its + * budget. This prevents seeky processes from lowering the throughput. + * In practice, a time-slice service scheme is used with seeky + * processes. + */ +static void bfq_set_budget_timeout(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + unsigned int timeout_coeff; + + if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time) + timeout_coeff = 1; + else + timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; + + bfqd->last_budget_start = ktime_get(); + + bfqq->budget_timeout = jiffies + + bfqd->bfq_timeout * timeout_coeff; +} + +static void __bfq_set_in_service_queue(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + if (bfqq) { + bfq_clear_bfqq_fifo_expire(bfqq); + + bfqd->budgets_assigned = (bfqd->budgets_assigned * 7 + 256) / 8; + + if (time_is_before_jiffies(bfqq->last_wr_start_finish) && + bfqq->wr_coeff > 1 && + bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && + time_is_before_jiffies(bfqq->budget_timeout)) { + /* + * For soft real-time queues, move the start + * of the weight-raising period forward by the + * time the queue has not received any + * service. Otherwise, a relatively long + * service delay is likely to cause the + * weight-raising period of the queue to end, + * because of the short duration of the + * weight-raising period of a soft real-time + * queue. It is worth noting that this move + * is not so dangerous for the other queues, + * because soft real-time queues are not + * greedy. + * + * To not add a further variable, we use the + * overloaded field budget_timeout to + * determine for how long the queue has not + * received service, i.e., how much time has + * elapsed since the queue expired. However, + * this is a little imprecise, because + * budget_timeout is set to jiffies if bfqq + * not only expires, but also remains with no + * request. + */ + if (time_after(bfqq->budget_timeout, + bfqq->last_wr_start_finish)) + bfqq->last_wr_start_finish += + jiffies - bfqq->budget_timeout; + else + bfqq->last_wr_start_finish = jiffies; + } + + bfq_set_budget_timeout(bfqd, bfqq); + bfq_log_bfqq(bfqd, bfqq, + "set_in_service_queue, cur-budget = %d", + bfqq->entity.budget); + } + + bfqd->in_service_queue = bfqq; + bfqd->in_serv_last_pos = 0; +} + +/* + * Get and set a new queue for service. + */ +static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq = bfq_get_next_queue(bfqd); + + __bfq_set_in_service_queue(bfqd, bfqq); + return bfqq; +} + +static void bfq_arm_slice_timer(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq = bfqd->in_service_queue; + u32 sl; + + bfq_mark_bfqq_wait_request(bfqq); + + /* + * We don't want to idle for seeks, but we do want to allow + * fair distribution of slice time for a process doing back-to-back + * seeks. So allow a little bit of time for him to submit a new rq. + */ + sl = bfqd->bfq_slice_idle; + /* + * Unless the queue is being weight-raised or the scenario is + * asymmetric, grant only minimum idle time if the queue + * is seeky. A long idling is preserved for a weight-raised + * queue, or, more in general, in an asymmetric scenario, + * because a long idling is needed for guaranteeing to a queue + * its reserved share of the throughput (in particular, it is + * needed if the queue has a higher weight than some other + * queue). + */ + if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 && + !bfq_asymmetric_scenario(bfqd, bfqq)) + sl = min_t(u64, sl, BFQ_MIN_TT); + else if (bfqq->wr_coeff > 1) + sl = max_t(u32, sl, 20ULL * NSEC_PER_MSEC); + + bfqd->last_idling_start = ktime_get(); + bfqd->last_idling_start_jiffies = jiffies; + + hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl), + HRTIMER_MODE_REL); + bfqg_stats_set_start_idle_time(bfqq_group(bfqq)); +} + +/* + * In autotuning mode, max_budget is dynamically recomputed as the + * amount of sectors transferred in timeout at the estimated peak + * rate. This enables BFQ to utilize a full timeslice with a full + * budget, even if the in-service queue is served at peak rate. And + * this maximises throughput with sequential workloads. + */ +static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd) +{ + return (u64)bfqd->peak_rate * USEC_PER_MSEC * + jiffies_to_msecs(bfqd->bfq_timeout)>>BFQ_RATE_SHIFT; +} + +/* + * Update parameters related to throughput and responsiveness, as a + * function of the estimated peak rate. See comments on + * bfq_calc_max_budget(), and on the ref_wr_duration array. + */ +static void update_thr_responsiveness_params(struct bfq_data *bfqd) +{ + if (bfqd->bfq_user_max_budget == 0) { + bfqd->bfq_max_budget = + bfq_calc_max_budget(bfqd); + bfq_log(bfqd, "new max_budget = %d", bfqd->bfq_max_budget); + } +} + +static void bfq_reset_rate_computation(struct bfq_data *bfqd, + struct request *rq) +{ + if (rq != NULL) { /* new rq dispatch now, reset accordingly */ + bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns(); + bfqd->peak_rate_samples = 1; + bfqd->sequential_samples = 0; + bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size = + blk_rq_sectors(rq); + } else /* no new rq dispatched, just reset the number of samples */ + bfqd->peak_rate_samples = 0; /* full re-init on next disp. */ + + bfq_log(bfqd, + "reset_rate_computation at end, sample %u/%u tot_sects %llu", + bfqd->peak_rate_samples, bfqd->sequential_samples, + bfqd->tot_sectors_dispatched); +} + +static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) +{ + u32 rate, weight, divisor; + + /* + * For the convergence property to hold (see comments on + * bfq_update_peak_rate()) and for the assessment to be + * reliable, a minimum number of samples must be present, and + * a minimum amount of time must have elapsed. If not so, do + * not compute new rate. Just reset parameters, to get ready + * for a new evaluation attempt. + */ + if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES || + bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL) + goto reset_computation; + + /* + * If a new request completion has occurred after last + * dispatch, then, to approximate the rate at which requests + * have been served by the device, it is more precise to + * extend the observation interval to the last completion. + */ + bfqd->delta_from_first = + max_t(u64, bfqd->delta_from_first, + bfqd->last_completion - bfqd->first_dispatch); + + /* + * Rate computed in sects/usec, and not sects/nsec, for + * precision issues. + */ + rate = div64_ul(bfqd->tot_sectors_dispatched<delta_from_first, NSEC_PER_USEC)); + + /* + * Peak rate not updated if: + * - the percentage of sequential dispatches is below 3/4 of the + * total, and rate is below the current estimated peak rate + * - rate is unreasonably high (> 20M sectors/sec) + */ + if ((bfqd->sequential_samples < (3 * bfqd->peak_rate_samples)>>2 && + rate <= bfqd->peak_rate) || + rate > 20<sequential_samples cannot + * become equal to bfqd->peak_rate_samples, which, in its + * turn, holds true because bfqd->sequential_samples is not + * incremented for the first sample. + */ + weight = (9 * bfqd->sequential_samples) / bfqd->peak_rate_samples; + + /* + * Second step: further refine the weight as a function of the + * duration of the observation interval. + */ + weight = min_t(u32, 8, + div_u64(weight * bfqd->delta_from_first, + BFQ_RATE_REF_INTERVAL)); + + /* + * Divisor ranging from 10, for minimum weight, to 2, for + * maximum weight. + */ + divisor = 10 - weight; + + /* + * Finally, update peak rate: + * + * peak_rate = peak_rate * (divisor-1) / divisor + rate / divisor + */ + bfqd->peak_rate *= divisor-1; + bfqd->peak_rate /= divisor; + rate /= divisor; /* smoothing constant alpha = 1/divisor */ + + bfqd->peak_rate += rate; + + /* + * For a very slow device, bfqd->peak_rate can reach 0 (see + * the minimum representable values reported in the comments + * on BFQ_RATE_SHIFT). Push to 1 if this happens, to avoid + * divisions by zero where bfqd->peak_rate is used as a + * divisor. + */ + bfqd->peak_rate = max_t(u32, 1, bfqd->peak_rate); + + update_thr_responsiveness_params(bfqd); + +reset_computation: + bfq_reset_rate_computation(bfqd, rq); +} + +/* + * Update the read/write peak rate (the main quantity used for + * auto-tuning, see update_thr_responsiveness_params()). + * + * It is not trivial to estimate the peak rate (correctly): because of + * the presence of sw and hw queues between the scheduler and the + * device components that finally serve I/O requests, it is hard to + * say exactly when a given dispatched request is served inside the + * device, and for how long. As a consequence, it is hard to know + * precisely at what rate a given set of requests is actually served + * by the device. + * + * On the opposite end, the dispatch time of any request is trivially + * available, and, from this piece of information, the "dispatch rate" + * of requests can be immediately computed. So, the idea in the next + * function is to use what is known, namely request dispatch times + * (plus, when useful, request completion times), to estimate what is + * unknown, namely in-device request service rate. + * + * The main issue is that, because of the above facts, the rate at + * which a certain set of requests is dispatched over a certain time + * interval can vary greatly with respect to the rate at which the + * same requests are then served. But, since the size of any + * intermediate queue is limited, and the service scheme is lossless + * (no request is silently dropped), the following obvious convergence + * property holds: the number of requests dispatched MUST become + * closer and closer to the number of requests completed as the + * observation interval grows. This is the key property used in + * the next function to estimate the peak service rate as a function + * of the observed dispatch rate. The function assumes to be invoked + * on every request dispatch. + */ +static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) +{ + u64 now_ns = ktime_get_ns(); + + if (bfqd->peak_rate_samples == 0) { /* first dispatch */ + bfq_log(bfqd, "update_peak_rate: goto reset, samples %d", + bfqd->peak_rate_samples); + bfq_reset_rate_computation(bfqd, rq); + goto update_last_values; /* will add one sample */ + } + + /* + * Device idle for very long: the observation interval lasting + * up to this dispatch cannot be a valid observation interval + * for computing a new peak rate (similarly to the late- + * completion event in bfq_completed_request()). Go to + * update_rate_and_reset to have the following three steps + * taken: + * - close the observation interval at the last (previous) + * request dispatch or completion + * - compute rate, if possible, for that observation interval + * - start a new observation interval with this dispatch + */ + if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC && + bfqd->rq_in_driver == 0) + goto update_rate_and_reset; + + /* Update sampling information */ + bfqd->peak_rate_samples++; + + if ((bfqd->rq_in_driver > 0 || + now_ns - bfqd->last_completion < BFQ_MIN_TT) + && !BFQ_RQ_SEEKY(bfqd, bfqd->last_position, rq)) + bfqd->sequential_samples++; + + bfqd->tot_sectors_dispatched += blk_rq_sectors(rq); + + /* Reset max observed rq size every 32 dispatches */ + if (likely(bfqd->peak_rate_samples % 32)) + bfqd->last_rq_max_size = + max_t(u32, blk_rq_sectors(rq), bfqd->last_rq_max_size); + else + bfqd->last_rq_max_size = blk_rq_sectors(rq); + + bfqd->delta_from_first = now_ns - bfqd->first_dispatch; + + /* Target observation interval not yet reached, go on sampling */ + if (bfqd->delta_from_first < BFQ_RATE_REF_INTERVAL) + goto update_last_values; + +update_rate_and_reset: + bfq_update_rate_reset(bfqd, rq); +update_last_values: + bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); + if (RQ_BFQQ(rq) == bfqd->in_service_queue) + bfqd->in_serv_last_pos = bfqd->last_position; + bfqd->last_dispatch = now_ns; +} + +/* + * Remove request from internal lists. + */ +static void bfq_dispatch_remove(struct request_queue *q, struct request *rq) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq); + + /* + * For consistency, the next instruction should have been + * executed after removing the request from the queue and + * dispatching it. We execute instead this instruction before + * bfq_remove_request() (and hence introduce a temporary + * inconsistency), for efficiency. In fact, should this + * dispatch occur for a non in-service bfqq, this anticipated + * increment prevents two counters related to bfqq->dispatched + * from risking to be, first, uselessly decremented, and then + * incremented again when the (new) value of bfqq->dispatched + * happens to be taken into account. + */ + bfqq->dispatched++; + bfq_update_peak_rate(q->elevator->elevator_data, rq); + + bfq_remove_request(q, rq); +} + +/* + * There is a case where idling does not have to be performed for + * throughput concerns, but to preserve the throughput share of + * the process associated with bfqq. + * + * To introduce this case, we can note that allowing the drive + * to enqueue more than one request at a time, and hence + * delegating de facto final scheduling decisions to the + * drive's internal scheduler, entails loss of control on the + * actual request service order. In particular, the critical + * situation is when requests from different processes happen + * to be present, at the same time, in the internal queue(s) + * of the drive. In such a situation, the drive, by deciding + * the service order of the internally-queued requests, does + * determine also the actual throughput distribution among + * these processes. But the drive typically has no notion or + * concern about per-process throughput distribution, and + * makes its decisions only on a per-request basis. Therefore, + * the service distribution enforced by the drive's internal + * scheduler is likely to coincide with the desired throughput + * distribution only in a completely symmetric, or favorably + * skewed scenario where: + * (i-a) each of these processes must get the same throughput as + * the others, + * (i-b) in case (i-a) does not hold, it holds that the process + * associated with bfqq must receive a lower or equal + * throughput than any of the other processes; + * (ii) the I/O of each process has the same properties, in + * terms of locality (sequential or random), direction + * (reads or writes), request sizes, greediness + * (from I/O-bound to sporadic), and so on; + + * In fact, in such a scenario, the drive tends to treat the requests + * of each process in about the same way as the requests of the + * others, and thus to provide each of these processes with about the + * same throughput. This is exactly the desired throughput + * distribution if (i-a) holds, or, if (i-b) holds instead, this is an + * even more convenient distribution for (the process associated with) + * bfqq. + * + * In contrast, in any asymmetric or unfavorable scenario, device + * idling (I/O-dispatch plugging) is certainly needed to guarantee + * that bfqq receives its assigned fraction of the device throughput + * (see [1] for details). + * + * The problem is that idling may significantly reduce throughput with + * certain combinations of types of I/O and devices. An important + * example is sync random I/O on flash storage with command + * queueing. So, unless bfqq falls in cases where idling also boosts + * throughput, it is important to check conditions (i-a), i(-b) and + * (ii) accurately, so as to avoid idling when not strictly needed for + * service guarantees. + * + * Unfortunately, it is extremely difficult to thoroughly check + * condition (ii). And, in case there are active groups, it becomes + * very difficult to check conditions (i-a) and (i-b) too. In fact, + * if there are active groups, then, for conditions (i-a) or (i-b) to + * become false 'indirectly', it is enough that an active group + * contains more active processes or sub-groups than some other active + * group. More precisely, for conditions (i-a) or (i-b) to become + * false because of such a group, it is not even necessary that the + * group is (still) active: it is sufficient that, even if the group + * has become inactive, some of its descendant processes still have + * some request already dispatched but still waiting for + * completion. In fact, requests have still to be guaranteed their + * share of the throughput even after being dispatched. In this + * respect, it is easy to show that, if a group frequently becomes + * inactive while still having in-flight requests, and if, when this + * happens, the group is not considered in the calculation of whether + * the scenario is asymmetric, then the group may fail to be + * guaranteed its fair share of the throughput (basically because + * idling may not be performed for the descendant processes of the + * group, but it had to be). We address this issue with the following + * bi-modal behavior, implemented in the function + * bfq_asymmetric_scenario(). + * + * If there are groups with requests waiting for completion + * (as commented above, some of these groups may even be + * already inactive), then the scenario is tagged as + * asymmetric, conservatively, without checking any of the + * conditions (i-a), (i-b) or (ii). So the device is idled for bfqq. + * This behavior matches also the fact that groups are created + * exactly if controlling I/O is a primary concern (to + * preserve bandwidth and latency guarantees). + * + * On the opposite end, if there are no groups with requests waiting + * for completion, then only conditions (i-a) and (i-b) are actually + * controlled, i.e., provided that conditions (i-a) or (i-b) holds, + * idling is not performed, regardless of whether condition (ii) + * holds. In other words, only if conditions (i-a) and (i-b) do not + * hold, then idling is allowed, and the device tends to be prevented + * from queueing many requests, possibly of several processes. Since + * there are no groups with requests waiting for completion, then, to + * control conditions (i-a) and (i-b) it is enough to check just + * whether all the queues with requests waiting for completion also + * have the same weight. + * + * Not checking condition (ii) evidently exposes bfqq to the + * risk of getting less throughput than its fair share. + * However, for queues with the same weight, a further + * mechanism, preemption, mitigates or even eliminates this + * problem. And it does so without consequences on overall + * throughput. This mechanism and its benefits are explained + * in the next three paragraphs. + * + * Even if a queue, say Q, is expired when it remains idle, Q + * can still preempt the new in-service queue if the next + * request of Q arrives soon (see the comments on + * bfq_bfqq_update_budg_for_activation). If all queues and + * groups have the same weight, this form of preemption, + * combined with the hole-recovery heuristic described in the + * comments on function bfq_bfqq_update_budg_for_activation, + * are enough to preserve a correct bandwidth distribution in + * the mid term, even without idling. In fact, even if not + * idling allows the internal queues of the device to contain + * many requests, and thus to reorder requests, we can rather + * safely assume that the internal scheduler still preserves a + * minimum of mid-term fairness. + * + * More precisely, this preemption-based, idleless approach + * provides fairness in terms of IOPS, and not sectors per + * second. This can be seen with a simple example. Suppose + * that there are two queues with the same weight, but that + * the first queue receives requests of 8 sectors, while the + * second queue receives requests of 1024 sectors. In + * addition, suppose that each of the two queues contains at + * most one request at a time, which implies that each queue + * always remains idle after it is served. Finally, after + * remaining idle, each queue receives very quickly a new + * request. It follows that the two queues are served + * alternatively, preempting each other if needed. This + * implies that, although both queues have the same weight, + * the queue with large requests receives a service that is + * 1024/8 times as high as the service received by the other + * queue. + * + * The motivation for using preemption instead of idling (for + * queues with the same weight) is that, by not idling, + * service guarantees are preserved (completely or at least in + * part) without minimally sacrificing throughput. And, if + * there is no active group, then the primary expectation for + * this device is probably a high throughput. + * + * We are now left only with explaining the two sub-conditions in the + * additional compound condition that is checked below for deciding + * whether the scenario is asymmetric. To explain the first + * sub-condition, we need to add that the function + * bfq_asymmetric_scenario checks the weights of only + * non-weight-raised queues, for efficiency reasons (see comments on + * bfq_weights_tree_add()). Then the fact that bfqq is weight-raised + * is checked explicitly here. More precisely, the compound condition + * below takes into account also the fact that, even if bfqq is being + * weight-raised, the scenario is still symmetric if all queues with + * requests waiting for completion happen to be + * weight-raised. Actually, we should be even more precise here, and + * differentiate between interactive weight raising and soft real-time + * weight raising. + * + * The second sub-condition checked in the compound condition is + * whether there is a fair amount of already in-flight I/O not + * belonging to bfqq. If so, I/O dispatching is to be plugged, for the + * following reason. The drive may decide to serve in-flight + * non-bfqq's I/O requests before bfqq's ones, thereby delaying the + * arrival of new I/O requests for bfqq (recall that bfqq is sync). If + * I/O-dispatching is not plugged, then, while bfqq remains empty, a + * basically uncontrolled amount of I/O from other queues may be + * dispatched too, possibly causing the service of bfqq's I/O to be + * delayed even longer in the drive. This problem gets more and more + * serious as the speed and the queue depth of the drive grow, + * because, as these two quantities grow, the probability to find no + * queue busy but many requests in flight grows too. By contrast, + * plugging I/O dispatching minimizes the delay induced by already + * in-flight I/O, and enables bfqq to recover the bandwidth it may + * lose because of this delay. + * + * As a side note, it is worth considering that the above + * device-idling countermeasures may however fail in the following + * unlucky scenario: if I/O-dispatch plugging is (correctly) disabled + * in a time period during which all symmetry sub-conditions hold, and + * therefore the device is allowed to enqueue many requests, but at + * some later point in time some sub-condition stops to hold, then it + * may become impossible to make requests be served in the desired + * order until all the requests already queued in the device have been + * served. The last sub-condition commented above somewhat mitigates + * this problem for weight-raised queues. + */ +static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + /* No point in idling for bfqq if it won't get requests any longer */ + if (unlikely(!bfqq_process_refs(bfqq))) + return false; + + return (bfqq->wr_coeff > 1 && + (bfqd->wr_busy_queues < + bfq_tot_busy_queues(bfqd) || + bfqd->rq_in_driver >= + bfqq->dispatched + 4)) || + bfq_asymmetric_scenario(bfqd, bfqq); +} + +static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq, + enum bfqq_expiration reason) +{ + /* + * If this bfqq is shared between multiple processes, check + * to make sure that those processes are still issuing I/Os + * within the mean seek distance. If not, it may be time to + * break the queues apart again. + */ + if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq)) + bfq_mark_bfqq_split_coop(bfqq); + + /* + * Consider queues with a higher finish virtual time than + * bfqq. If idling_needed_for_service_guarantees(bfqq) returns + * true, then bfqq's bandwidth would be violated if an + * uncontrolled amount of I/O from these queues were + * dispatched while bfqq is waiting for its new I/O to + * arrive. This is exactly what may happen if this is a forced + * expiration caused by a preemption attempt, and if bfqq is + * not re-scheduled. To prevent this from happening, re-queue + * bfqq if it needs I/O-dispatch plugging, even if it is + * empty. By doing so, bfqq is granted to be served before the + * above queues (provided that bfqq is of course eligible). + */ + if (RB_EMPTY_ROOT(&bfqq->sort_list) && + !(reason == BFQQE_PREEMPTED && + idling_needed_for_service_guarantees(bfqd, bfqq))) { + if (bfqq->dispatched == 0) + /* + * Overloading budget_timeout field to store + * the time at which the queue remains with no + * backlog and no outstanding request; used by + * the weight-raising mechanism. + */ + bfqq->budget_timeout = jiffies; + + bfq_del_bfqq_busy(bfqd, bfqq, true); + } else { + bfq_requeue_bfqq(bfqd, bfqq, true); + /* + * Resort priority tree of potential close cooperators. + * See comments on bfq_pos_tree_add_move() for the unlikely(). + */ + if (unlikely(!bfqd->nonrot_with_queueing && + !RB_EMPTY_ROOT(&bfqq->sort_list))) + bfq_pos_tree_add_move(bfqd, bfqq); + } + + /* + * All in-service entities must have been properly deactivated + * or requeued before executing the next function, which + * resets all in-service entities as no more in service. This + * may cause bfqq to be freed. If this happens, the next + * function returns true. + */ + return __bfq_bfqd_reset_in_service(bfqd); +} + +/** + * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior. + * @bfqd: device data. + * @bfqq: queue to update. + * @reason: reason for expiration. + * + * Handle the feedback on @bfqq budget at queue expiration. + * See the body for detailed comments. + */ +static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + enum bfqq_expiration reason) +{ + struct request *next_rq; + int budget, min_budget; + + min_budget = bfq_min_budget(bfqd); + + if (bfqq->wr_coeff == 1) + budget = bfqq->max_budget; + else /* + * Use a constant, low budget for weight-raised queues, + * to help achieve a low latency. Keep it slightly higher + * than the minimum possible budget, to cause a little + * bit fewer expirations. + */ + budget = 2 * min_budget; + + bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d", + bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); + bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d", + budget, bfq_min_budget(bfqd)); + bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d", + bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue)); + + if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) { + switch (reason) { + /* + * Caveat: in all the following cases we trade latency + * for throughput. + */ + case BFQQE_TOO_IDLE: + /* + * This is the only case where we may reduce + * the budget: if there is no request of the + * process still waiting for completion, then + * we assume (tentatively) that the timer has + * expired because the batch of requests of + * the process could have been served with a + * smaller budget. Hence, betting that + * process will behave in the same way when it + * becomes backlogged again, we reduce its + * next budget. As long as we guess right, + * this budget cut reduces the latency + * experienced by the process. + * + * However, if there are still outstanding + * requests, then the process may have not yet + * issued its next request just because it is + * still waiting for the completion of some of + * the still outstanding ones. So in this + * subcase we do not reduce its budget, on the + * contrary we increase it to possibly boost + * the throughput, as discussed in the + * comments to the BUDGET_TIMEOUT case. + */ + if (bfqq->dispatched > 0) /* still outstanding reqs */ + budget = min(budget * 2, bfqd->bfq_max_budget); + else { + if (budget > 5 * min_budget) + budget -= 4 * min_budget; + else + budget = min_budget; + } + break; + case BFQQE_BUDGET_TIMEOUT: + /* + * We double the budget here because it gives + * the chance to boost the throughput if this + * is not a seeky process (and has bumped into + * this timeout because of, e.g., ZBR). + */ + budget = min(budget * 2, bfqd->bfq_max_budget); + break; + case BFQQE_BUDGET_EXHAUSTED: + /* + * The process still has backlog, and did not + * let either the budget timeout or the disk + * idling timeout expire. Hence it is not + * seeky, has a short thinktime and may be + * happy with a higher budget too. So + * definitely increase the budget of this good + * candidate to boost the disk throughput. + */ + budget = min(budget * 4, bfqd->bfq_max_budget); + break; + case BFQQE_NO_MORE_REQUESTS: + /* + * For queues that expire for this reason, it + * is particularly important to keep the + * budget close to the actual service they + * need. Doing so reduces the timestamp + * misalignment problem described in the + * comments in the body of + * __bfq_activate_entity. In fact, suppose + * that a queue systematically expires for + * BFQQE_NO_MORE_REQUESTS and presents a + * new request in time to enjoy timestamp + * back-shifting. The larger the budget of the + * queue is with respect to the service the + * queue actually requests in each service + * slot, the more times the queue can be + * reactivated with the same virtual finish + * time. It follows that, even if this finish + * time is pushed to the system virtual time + * to reduce the consequent timestamp + * misalignment, the queue unjustly enjoys for + * many re-activations a lower finish time + * than all newly activated queues. + * + * The service needed by bfqq is measured + * quite precisely by bfqq->entity.service. + * Since bfqq does not enjoy device idling, + * bfqq->entity.service is equal to the number + * of sectors that the process associated with + * bfqq requested to read/write before waiting + * for request completions, or blocking for + * other reasons. + */ + budget = max_t(int, bfqq->entity.service, min_budget); + break; + default: + return; + } + } else if (!bfq_bfqq_sync(bfqq)) { + /* + * Async queues get always the maximum possible + * budget, as for them we do not care about latency + * (in addition, their ability to dispatch is limited + * by the charging factor). + */ + budget = bfqd->bfq_max_budget; + } + + bfqq->max_budget = budget; + + if (bfqd->budgets_assigned >= bfq_stats_min_budgets && + !bfqd->bfq_user_max_budget) + bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget); + + /* + * If there is still backlog, then assign a new budget, making + * sure that it is large enough for the next request. Since + * the finish time of bfqq must be kept in sync with the + * budget, be sure to call __bfq_bfqq_expire() *after* this + * update. + * + * If there is no backlog, then no need to update the budget; + * it will be updated on the arrival of a new request. + */ + next_rq = bfqq->next_rq; + if (next_rq) + bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget, + bfq_serv_to_charge(next_rq, bfqq)); + + bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d", + next_rq ? blk_rq_sectors(next_rq) : 0, + bfqq->entity.budget); +} + +/* + * Return true if the process associated with bfqq is "slow". The slow + * flag is used, in addition to the budget timeout, to reduce the + * amount of service provided to seeky processes, and thus reduce + * their chances to lower the throughput. More details in the comments + * on the function bfq_bfqq_expire(). + * + * An important observation is in order: as discussed in the comments + * on the function bfq_update_peak_rate(), with devices with internal + * queues, it is hard if ever possible to know when and for how long + * an I/O request is processed by the device (apart from the trivial + * I/O pattern where a new request is dispatched only after the + * previous one has been completed). This makes it hard to evaluate + * the real rate at which the I/O requests of each bfq_queue are + * served. In fact, for an I/O scheduler like BFQ, serving a + * bfq_queue means just dispatching its requests during its service + * slot (i.e., until the budget of the queue is exhausted, or the + * queue remains idle, or, finally, a timeout fires). But, during the + * service slot of a bfq_queue, around 100 ms at most, the device may + * be even still processing requests of bfq_queues served in previous + * service slots. On the opposite end, the requests of the in-service + * bfq_queue may be completed after the service slot of the queue + * finishes. + * + * Anyway, unless more sophisticated solutions are used + * (where possible), the sum of the sizes of the requests dispatched + * during the service slot of a bfq_queue is probably the only + * approximation available for the service received by the bfq_queue + * during its service slot. And this sum is the quantity used in this + * function to evaluate the I/O speed of a process. + */ +static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, + bool compensate, enum bfqq_expiration reason, + unsigned long *delta_ms) +{ + ktime_t delta_ktime; + u32 delta_usecs; + bool slow = BFQQ_SEEKY(bfqq); /* if delta too short, use seekyness */ + + if (!bfq_bfqq_sync(bfqq)) + return false; + + if (compensate) + delta_ktime = bfqd->last_idling_start; + else + delta_ktime = ktime_get(); + delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start); + delta_usecs = ktime_to_us(delta_ktime); + + /* don't use too short time intervals */ + if (delta_usecs < 1000) { + if (blk_queue_nonrot(bfqd->queue)) + /* + * give same worst-case guarantees as idling + * for seeky + */ + *delta_ms = BFQ_MIN_TT / NSEC_PER_MSEC; + else /* charge at least one seek */ + *delta_ms = bfq_slice_idle / NSEC_PER_MSEC; + + return slow; + } + + *delta_ms = delta_usecs / USEC_PER_MSEC; + + /* + * Use only long (> 20ms) intervals to filter out excessive + * spikes in service rate estimation. + */ + if (delta_usecs > 20000) { + /* + * Caveat for rotational devices: processes doing I/O + * in the slower disk zones tend to be slow(er) even + * if not seeky. In this respect, the estimated peak + * rate is likely to be an average over the disk + * surface. Accordingly, to not be too harsh with + * unlucky processes, a process is deemed slow only if + * its rate has been lower than half of the estimated + * peak rate. + */ + slow = bfqq->entity.service < bfqd->bfq_max_budget / 2; + } + + bfq_log_bfqq(bfqd, bfqq, "bfq_bfqq_is_slow: slow %d", slow); + + return slow; +} + +/* + * To be deemed as soft real-time, an application must meet two + * requirements. First, the application must not require an average + * bandwidth higher than the approximate bandwidth required to playback or + * record a compressed high-definition video. + * The next function is invoked on the completion of the last request of a + * batch, to compute the next-start time instant, soft_rt_next_start, such + * that, if the next request of the application does not arrive before + * soft_rt_next_start, then the above requirement on the bandwidth is met. + * + * The second requirement is that the request pattern of the application is + * isochronous, i.e., that, after issuing a request or a batch of requests, + * the application stops issuing new requests until all its pending requests + * have been completed. After that, the application may issue a new batch, + * and so on. + * For this reason the next function is invoked to compute + * soft_rt_next_start only for applications that meet this requirement, + * whereas soft_rt_next_start is set to infinity for applications that do + * not. + * + * Unfortunately, even a greedy (i.e., I/O-bound) application may + * happen to meet, occasionally or systematically, both the above + * bandwidth and isochrony requirements. This may happen at least in + * the following circumstances. First, if the CPU load is high. The + * application may stop issuing requests while the CPUs are busy + * serving other processes, then restart, then stop again for a while, + * and so on. The other circumstances are related to the storage + * device: the storage device is highly loaded or reaches a low-enough + * throughput with the I/O of the application (e.g., because the I/O + * is random and/or the device is slow). In all these cases, the + * I/O of the application may be simply slowed down enough to meet + * the bandwidth and isochrony requirements. To reduce the probability + * that greedy applications are deemed as soft real-time in these + * corner cases, a further rule is used in the computation of + * soft_rt_next_start: the return value of this function is forced to + * be higher than the maximum between the following two quantities. + * + * (a) Current time plus: (1) the maximum time for which the arrival + * of a request is waited for when a sync queue becomes idle, + * namely bfqd->bfq_slice_idle, and (2) a few extra jiffies. We + * postpone for a moment the reason for adding a few extra + * jiffies; we get back to it after next item (b). Lower-bounding + * the return value of this function with the current time plus + * bfqd->bfq_slice_idle tends to filter out greedy applications, + * because the latter issue their next request as soon as possible + * after the last one has been completed. In contrast, a soft + * real-time application spends some time processing data, after a + * batch of its requests has been completed. + * + * (b) Current value of bfqq->soft_rt_next_start. As pointed out + * above, greedy applications may happen to meet both the + * bandwidth and isochrony requirements under heavy CPU or + * storage-device load. In more detail, in these scenarios, these + * applications happen, only for limited time periods, to do I/O + * slowly enough to meet all the requirements described so far, + * including the filtering in above item (a). These slow-speed + * time intervals are usually interspersed between other time + * intervals during which these applications do I/O at a very high + * speed. Fortunately, exactly because of the high speed of the + * I/O in the high-speed intervals, the values returned by this + * function happen to be so high, near the end of any such + * high-speed interval, to be likely to fall *after* the end of + * the low-speed time interval that follows. These high values are + * stored in bfqq->soft_rt_next_start after each invocation of + * this function. As a consequence, if the last value of + * bfqq->soft_rt_next_start is constantly used to lower-bound the + * next value that this function may return, then, from the very + * beginning of a low-speed interval, bfqq->soft_rt_next_start is + * likely to be constantly kept so high that any I/O request + * issued during the low-speed interval is considered as arriving + * to soon for the application to be deemed as soft + * real-time. Then, in the high-speed interval that follows, the + * application will not be deemed as soft real-time, just because + * it will do I/O at a high speed. And so on. + * + * Getting back to the filtering in item (a), in the following two + * cases this filtering might be easily passed by a greedy + * application, if the reference quantity was just + * bfqd->bfq_slice_idle: + * 1) HZ is so low that the duration of a jiffy is comparable to or + * higher than bfqd->bfq_slice_idle. This happens, e.g., on slow + * devices with HZ=100. The time granularity may be so coarse + * that the approximation, in jiffies, of bfqd->bfq_slice_idle + * is rather lower than the exact value. + * 2) jiffies, instead of increasing at a constant rate, may stop increasing + * for a while, then suddenly 'jump' by several units to recover the lost + * increments. This seems to happen, e.g., inside virtual machines. + * To address this issue, in the filtering in (a) we do not use as a + * reference time interval just bfqd->bfq_slice_idle, but + * bfqd->bfq_slice_idle plus a few jiffies. In particular, we add the + * minimum number of jiffies for which the filter seems to be quite + * precise also in embedded systems and KVM/QEMU virtual machines. + */ +static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + return max3(bfqq->soft_rt_next_start, + bfqq->last_idle_bklogged + + HZ * bfqq->service_from_backlogged / + bfqd->bfq_wr_max_softrt_rate, + jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); +} + +/** + * bfq_bfqq_expire - expire a queue. + * @bfqd: device owning the queue. + * @bfqq: the queue to expire. + * @compensate: if true, compensate for the time spent idling. + * @reason: the reason causing the expiration. + * + * If the process associated with bfqq does slow I/O (e.g., because it + * issues random requests), we charge bfqq with the time it has been + * in service instead of the service it has received (see + * bfq_bfqq_charge_time for details on how this goal is achieved). As + * a consequence, bfqq will typically get higher timestamps upon + * reactivation, and hence it will be rescheduled as if it had + * received more service than what it has actually received. In the + * end, bfqq receives less service in proportion to how slowly its + * associated process consumes its budgets (and hence how seriously it + * tends to lower the throughput). In addition, this time-charging + * strategy guarantees time fairness among slow processes. In + * contrast, if the process associated with bfqq is not slow, we + * charge bfqq exactly with the service it has received. + * + * Charging time to the first type of queues and the exact service to + * the other has the effect of using the WF2Q+ policy to schedule the + * former on a timeslice basis, without violating service domain + * guarantees among the latter. + */ +void bfq_bfqq_expire(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + bool compensate, + enum bfqq_expiration reason) +{ + bool slow; + unsigned long delta = 0; + struct bfq_entity *entity = &bfqq->entity; + + /* + * Check whether the process is slow (see bfq_bfqq_is_slow). + */ + slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta); + + /* + * As above explained, charge slow (typically seeky) and + * timed-out queues with the time and not the service + * received, to favor sequential workloads. + * + * Processes doing I/O in the slower disk zones will tend to + * be slow(er) even if not seeky. Therefore, since the + * estimated peak rate is actually an average over the disk + * surface, these processes may timeout just for bad luck. To + * avoid punishing them, do not charge time to processes that + * succeeded in consuming at least 2/3 of their budget. This + * allows BFQ to preserve enough elasticity to still perform + * bandwidth, and not time, distribution with little unlucky + * or quasi-sequential processes. + */ + if (bfqq->wr_coeff == 1 && + (slow || + (reason == BFQQE_BUDGET_TIMEOUT && + bfq_bfqq_budget_left(bfqq) >= entity->budget / 3))) + bfq_bfqq_charge_time(bfqd, bfqq, delta); + + if (reason == BFQQE_TOO_IDLE && + entity->service <= 2 * entity->budget / 10) + bfq_clear_bfqq_IO_bound(bfqq); + + if (bfqd->low_latency && bfqq->wr_coeff == 1) + bfqq->last_wr_start_finish = jiffies; + + if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 && + RB_EMPTY_ROOT(&bfqq->sort_list)) { + /* + * If we get here, and there are no outstanding + * requests, then the request pattern is isochronous + * (see the comments on the function + * bfq_bfqq_softrt_next_start()). Thus we can compute + * soft_rt_next_start. And we do it, unless bfqq is in + * interactive weight raising. We do not do it in the + * latter subcase, for the following reason. bfqq may + * be conveying the I/O needed to load a soft + * real-time application. Such an application will + * actually exhibit a soft real-time I/O pattern after + * it finally starts doing its job. But, if + * soft_rt_next_start is computed here for an + * interactive bfqq, and bfqq had received a lot of + * service before remaining with no outstanding + * request (likely to happen on a fast device), then + * soft_rt_next_start would be assigned such a high + * value that, for a very long time, bfqq would be + * prevented from being possibly considered as soft + * real time. + * + * If, instead, the queue still has outstanding + * requests, then we have to wait for the completion + * of all the outstanding requests to discover whether + * the request pattern is actually isochronous. + */ + if (bfqq->dispatched == 0 && + bfqq->wr_coeff != bfqd->bfq_wr_coeff) + bfqq->soft_rt_next_start = + bfq_bfqq_softrt_next_start(bfqd, bfqq); + else if (bfqq->dispatched > 0) { + /* + * Schedule an update of soft_rt_next_start to when + * the task may be discovered to be isochronous. + */ + bfq_mark_bfqq_softrt_update(bfqq); + } + } + + bfq_log_bfqq(bfqd, bfqq, + "expire (%d, slow %d, num_disp %d, short_ttime %d)", reason, + slow, bfqq->dispatched, bfq_bfqq_has_short_ttime(bfqq)); + + /* + * bfqq expired, so no total service time needs to be computed + * any longer: reset state machine for measuring total service + * times. + */ + bfqd->rqs_injected = bfqd->wait_dispatch = false; + bfqd->waited_rq = NULL; + + /* + * Increase, decrease or leave budget unchanged according to + * reason. + */ + __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); + if (__bfq_bfqq_expire(bfqd, bfqq, reason)) + /* bfqq is gone, no more actions on it */ + return; + + /* mark bfqq as waiting a request only if a bic still points to it */ + if (!bfq_bfqq_busy(bfqq) && + reason != BFQQE_BUDGET_TIMEOUT && + reason != BFQQE_BUDGET_EXHAUSTED) { + bfq_mark_bfqq_non_blocking_wait_rq(bfqq); + /* + * Not setting service to 0, because, if the next rq + * arrives in time, the queue will go on receiving + * service with this same budget (as if it never expired) + */ + } else + entity->service = 0; + + /* + * Reset the received-service counter for every parent entity. + * Differently from what happens with bfqq->entity.service, + * the resetting of this counter never needs to be postponed + * for parent entities. In fact, in case bfqq may have a + * chance to go on being served using the last, partially + * consumed budget, bfqq->entity.service needs to be kept, + * because if bfqq then actually goes on being served using + * the same budget, the last value of bfqq->entity.service is + * needed to properly decrement bfqq->entity.budget by the + * portion already consumed. In contrast, it is not necessary + * to keep entity->service for parent entities too, because + * the bubble up of the new value of bfqq->entity.budget will + * make sure that the budgets of parent entities are correct, + * even in case bfqq and thus parent entities go on receiving + * service with the same budget. + */ + entity = entity->parent; + for_each_entity(entity) + entity->service = 0; +} + +/* + * Budget timeout is not implemented through a dedicated timer, but + * just checked on request arrivals and completions, as well as on + * idle timer expirations. + */ +static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) +{ + return time_is_before_eq_jiffies(bfqq->budget_timeout); +} + +/* + * If we expire a queue that is actively waiting (i.e., with the + * device idled) for the arrival of a new request, then we may incur + * the timestamp misalignment problem described in the body of the + * function __bfq_activate_entity. Hence we return true only if this + * condition does not hold, or if the queue is slow enough to deserve + * only to be kicked off for preserving a high throughput. + */ +static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) +{ + bfq_log_bfqq(bfqq->bfqd, bfqq, + "may_budget_timeout: wait_request %d left %d timeout %d", + bfq_bfqq_wait_request(bfqq), + bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3, + bfq_bfqq_budget_timeout(bfqq)); + + return (!bfq_bfqq_wait_request(bfqq) || + bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3) + && + bfq_bfqq_budget_timeout(bfqq); +} + +static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + bool rot_without_queueing = + !blk_queue_nonrot(bfqd->queue) && !bfqd->hw_tag, + bfqq_sequential_and_IO_bound, + idling_boosts_thr; + + /* No point in idling for bfqq if it won't get requests any longer */ + if (unlikely(!bfqq_process_refs(bfqq))) + return false; + + bfqq_sequential_and_IO_bound = !BFQQ_SEEKY(bfqq) && + bfq_bfqq_IO_bound(bfqq) && bfq_bfqq_has_short_ttime(bfqq); + + /* + * The next variable takes into account the cases where idling + * boosts the throughput. + * + * The value of the variable is computed considering, first, that + * idling is virtually always beneficial for the throughput if: + * (a) the device is not NCQ-capable and rotational, or + * (b) regardless of the presence of NCQ, the device is rotational and + * the request pattern for bfqq is I/O-bound and sequential, or + * (c) regardless of whether it is rotational, the device is + * not NCQ-capable and the request pattern for bfqq is + * I/O-bound and sequential. + * + * Secondly, and in contrast to the above item (b), idling an + * NCQ-capable flash-based device would not boost the + * throughput even with sequential I/O; rather it would lower + * the throughput in proportion to how fast the device + * is. Accordingly, the next variable is true if any of the + * above conditions (a), (b) or (c) is true, and, in + * particular, happens to be false if bfqd is an NCQ-capable + * flash-based device. + */ + idling_boosts_thr = rot_without_queueing || + ((!blk_queue_nonrot(bfqd->queue) || !bfqd->hw_tag) && + bfqq_sequential_and_IO_bound); + + /* + * The return value of this function is equal to that of + * idling_boosts_thr, unless a special case holds. In this + * special case, described below, idling may cause problems to + * weight-raised queues. + * + * When the request pool is saturated (e.g., in the presence + * of write hogs), if the processes associated with + * non-weight-raised queues ask for requests at a lower rate, + * then processes associated with weight-raised queues have a + * higher probability to get a request from the pool + * immediately (or at least soon) when they need one. Thus + * they have a higher probability to actually get a fraction + * of the device throughput proportional to their high + * weight. This is especially true with NCQ-capable drives, + * which enqueue several requests in advance, and further + * reorder internally-queued requests. + * + * For this reason, we force to false the return value if + * there are weight-raised busy queues. In this case, and if + * bfqq is not weight-raised, this guarantees that the device + * is not idled for bfqq (if, instead, bfqq is weight-raised, + * then idling will be guaranteed by another variable, see + * below). Combined with the timestamping rules of BFQ (see + * [1] for details), this behavior causes bfqq, and hence any + * sync non-weight-raised queue, to get a lower number of + * requests served, and thus to ask for a lower number of + * requests from the request pool, before the busy + * weight-raised queues get served again. This often mitigates + * starvation problems in the presence of heavy write + * workloads and NCQ, thereby guaranteeing a higher + * application and system responsiveness in these hostile + * scenarios. + */ + return idling_boosts_thr && + bfqd->wr_busy_queues == 0; +} + +/* + * For a queue that becomes empty, device idling is allowed only if + * this function returns true for that queue. As a consequence, since + * device idling plays a critical role for both throughput boosting + * and service guarantees, the return value of this function plays a + * critical role as well. + * + * In a nutshell, this function returns true only if idling is + * beneficial for throughput or, even if detrimental for throughput, + * idling is however necessary to preserve service guarantees (low + * latency, desired throughput distribution, ...). In particular, on + * NCQ-capable devices, this function tries to return false, so as to + * help keep the drives' internal queues full, whenever this helps the + * device boost the throughput without causing any service-guarantee + * issue. + * + * Most of the issues taken into account to get the return value of + * this function are not trivial. We discuss these issues in the two + * functions providing the main pieces of information needed by this + * function. + */ +static bool bfq_better_to_idle(struct bfq_queue *bfqq) +{ + struct bfq_data *bfqd = bfqq->bfqd; + bool idling_boosts_thr_with_no_issue, idling_needed_for_service_guar; + + /* No point in idling for bfqq if it won't get requests any longer */ + if (unlikely(!bfqq_process_refs(bfqq))) + return false; + + if (unlikely(bfqd->strict_guarantees)) + return true; + + /* + * Idling is performed only if slice_idle > 0. In addition, we + * do not idle if + * (a) bfqq is async + * (b) bfqq is in the idle io prio class: in this case we do + * not idle because we want to minimize the bandwidth that + * queues in this class can steal to higher-priority queues + */ + if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_sync(bfqq) || + bfq_class_idle(bfqq)) + return false; + + idling_boosts_thr_with_no_issue = + idling_boosts_thr_without_issues(bfqd, bfqq); + + idling_needed_for_service_guar = + idling_needed_for_service_guarantees(bfqd, bfqq); + + /* + * We have now the two components we need to compute the + * return value of the function, which is true only if idling + * either boosts the throughput (without issues), or is + * necessary to preserve service guarantees. + */ + return idling_boosts_thr_with_no_issue || + idling_needed_for_service_guar; +} + +/* + * If the in-service queue is empty but the function bfq_better_to_idle + * returns true, then: + * 1) the queue must remain in service and cannot be expired, and + * 2) the device must be idled to wait for the possible arrival of a new + * request for the queue. + * See the comments on the function bfq_better_to_idle for the reasons + * why performing device idling is the best choice to boost the throughput + * and preserve service guarantees when bfq_better_to_idle itself + * returns true. + */ +static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) +{ + return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_better_to_idle(bfqq); +} + +/* + * This function chooses the queue from which to pick the next extra + * I/O request to inject, if it finds a compatible queue. See the + * comments on bfq_update_inject_limit() for details on the injection + * mechanism, and for the definitions of the quantities mentioned + * below. + */ +static struct bfq_queue * +bfq_choose_bfqq_for_injection(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq, *in_serv_bfqq = bfqd->in_service_queue; + unsigned int limit = in_serv_bfqq->inject_limit; + /* + * If + * - bfqq is not weight-raised and therefore does not carry + * time-critical I/O, + * or + * - regardless of whether bfqq is weight-raised, bfqq has + * however a long think time, during which it can absorb the + * effect of an appropriate number of extra I/O requests + * from other queues (see bfq_update_inject_limit for + * details on the computation of this number); + * then injection can be performed without restrictions. + */ + bool in_serv_always_inject = in_serv_bfqq->wr_coeff == 1 || + !bfq_bfqq_has_short_ttime(in_serv_bfqq); + + /* + * If + * - the baseline total service time could not be sampled yet, + * so the inject limit happens to be still 0, and + * - a lot of time has elapsed since the plugging of I/O + * dispatching started, so drive speed is being wasted + * significantly; + * then temporarily raise inject limit to one request. + */ + if (limit == 0 && in_serv_bfqq->last_serv_time_ns == 0 && + bfq_bfqq_wait_request(in_serv_bfqq) && + time_is_before_eq_jiffies(bfqd->last_idling_start_jiffies + + bfqd->bfq_slice_idle) + ) + limit = 1; + + if (bfqd->rq_in_driver >= limit) + return NULL; + + /* + * Linear search of the source queue for injection; but, with + * a high probability, very few steps are needed to find a + * candidate queue, i.e., a queue with enough budget left for + * its next request. In fact: + * - BFQ dynamically updates the budget of every queue so as + * to accommodate the expected backlog of the queue; + * - if a queue gets all its requests dispatched as injected + * service, then the queue is removed from the active list + * (and re-added only if it gets new requests, but then it + * is assigned again enough budget for its new backlog). + */ + list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) + if (!RB_EMPTY_ROOT(&bfqq->sort_list) && + (in_serv_always_inject || bfqq->wr_coeff > 1) && + bfq_serv_to_charge(bfqq->next_rq, bfqq) <= + bfq_bfqq_budget_left(bfqq)) { + /* + * Allow for only one large in-flight request + * on non-rotational devices, for the + * following reason. On non-rotationl drives, + * large requests take much longer than + * smaller requests to be served. In addition, + * the drive prefers to serve large requests + * w.r.t. to small ones, if it can choose. So, + * having more than one large requests queued + * in the drive may easily make the next first + * request of the in-service queue wait for so + * long to break bfqq's service guarantees. On + * the bright side, large requests let the + * drive reach a very high throughput, even if + * there is only one in-flight large request + * at a time. + */ + if (blk_queue_nonrot(bfqd->queue) && + blk_rq_sectors(bfqq->next_rq) >= + BFQQ_SECT_THR_NONROT) + limit = min_t(unsigned int, 1, limit); + else + limit = in_serv_bfqq->inject_limit; + + if (bfqd->rq_in_driver < limit) { + bfqd->rqs_injected = true; + return bfqq; + } + } + + return NULL; +} + +/* + * Select a queue for service. If we have a current queue in service, + * check whether to continue servicing it, or retrieve and set a new one. + */ +static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq; + struct request *next_rq; + enum bfqq_expiration reason = BFQQE_BUDGET_TIMEOUT; + + bfqq = bfqd->in_service_queue; + if (!bfqq) + goto new_queue; + + bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); + + /* + * Do not expire bfqq for budget timeout if bfqq may be about + * to enjoy device idling. The reason why, in this case, we + * prevent bfqq from expiring is the same as in the comments + * on the case where bfq_bfqq_must_idle() returns true, in + * bfq_completed_request(). + */ + if (bfq_may_expire_for_budg_timeout(bfqq) && + !bfq_bfqq_must_idle(bfqq)) + goto expire; + +check_queue: + /* + * This loop is rarely executed more than once. Even when it + * happens, it is much more convenient to re-execute this loop + * than to return NULL and trigger a new dispatch to get a + * request served. + */ + next_rq = bfqq->next_rq; + /* + * If bfqq has requests queued and it has enough budget left to + * serve them, keep the queue, otherwise expire it. + */ + if (next_rq) { + if (bfq_serv_to_charge(next_rq, bfqq) > + bfq_bfqq_budget_left(bfqq)) { + /* + * Expire the queue for budget exhaustion, + * which makes sure that the next budget is + * enough to serve the next request, even if + * it comes from the fifo expired path. + */ + reason = BFQQE_BUDGET_EXHAUSTED; + goto expire; + } else { + /* + * The idle timer may be pending because we may + * not disable disk idling even when a new request + * arrives. + */ + if (bfq_bfqq_wait_request(bfqq)) { + /* + * If we get here: 1) at least a new request + * has arrived but we have not disabled the + * timer because the request was too small, + * 2) then the block layer has unplugged + * the device, causing the dispatch to be + * invoked. + * + * Since the device is unplugged, now the + * requests are probably large enough to + * provide a reasonable throughput. + * So we disable idling. + */ + bfq_clear_bfqq_wait_request(bfqq); + hrtimer_try_to_cancel(&bfqd->idle_slice_timer); + } + goto keep_queue; + } + } + + /* + * No requests pending. However, if the in-service queue is idling + * for a new request, or has requests waiting for a completion and + * may idle after their completion, then keep it anyway. + * + * Yet, inject service from other queues if it boosts + * throughput and is possible. + */ + if (bfq_bfqq_wait_request(bfqq) || + (bfqq->dispatched != 0 && bfq_better_to_idle(bfqq))) { + struct bfq_queue *async_bfqq = + bfqq->bic && bfqq->bic->bfqq[0] && + bfq_bfqq_busy(bfqq->bic->bfqq[0]) && + bfqq->bic->bfqq[0]->next_rq ? + bfqq->bic->bfqq[0] : NULL; + + /* + * The next three mutually-exclusive ifs decide + * whether to try injection, and choose the queue to + * pick an I/O request from. + * + * The first if checks whether the process associated + * with bfqq has also async I/O pending. If so, it + * injects such I/O unconditionally. Injecting async + * I/O from the same process can cause no harm to the + * process. On the contrary, it can only increase + * bandwidth and reduce latency for the process. + * + * The second if checks whether there happens to be a + * non-empty waker queue for bfqq, i.e., a queue whose + * I/O needs to be completed for bfqq to receive new + * I/O. This happens, e.g., if bfqq is associated with + * a process that does some sync. A sync generates + * extra blocking I/O, which must be completed before + * the process associated with bfqq can go on with its + * I/O. If the I/O of the waker queue is not served, + * then bfqq remains empty, and no I/O is dispatched, + * until the idle timeout fires for bfqq. This is + * likely to result in lower bandwidth and higher + * latencies for bfqq, and in a severe loss of total + * throughput. The best action to take is therefore to + * serve the waker queue as soon as possible. So do it + * (without relying on the third alternative below for + * eventually serving waker_bfqq's I/O; see the last + * paragraph for further details). This systematic + * injection of I/O from the waker queue does not + * cause any delay to bfqq's I/O. On the contrary, + * next bfqq's I/O is brought forward dramatically, + * for it is not blocked for milliseconds. + * + * The third if checks whether bfqq is a queue for + * which it is better to avoid injection. It is so if + * bfqq delivers more throughput when served without + * any further I/O from other queues in the middle, or + * if the service times of bfqq's I/O requests both + * count more than overall throughput, and may be + * easily increased by injection (this happens if bfqq + * has a short think time). If none of these + * conditions holds, then a candidate queue for + * injection is looked for through + * bfq_choose_bfqq_for_injection(). Note that the + * latter may return NULL (for example if the inject + * limit for bfqq is currently 0). + * + * NOTE: motivation for the second alternative + * + * Thanks to the way the inject limit is updated in + * bfq_update_has_short_ttime(), it is rather likely + * that, if I/O is being plugged for bfqq and the + * waker queue has pending I/O requests that are + * blocking bfqq's I/O, then the third alternative + * above lets the waker queue get served before the + * I/O-plugging timeout fires. So one may deem the + * second alternative superfluous. It is not, because + * the third alternative may be way less effective in + * case of a synchronization. For two main + * reasons. First, throughput may be low because the + * inject limit may be too low to guarantee the same + * amount of injected I/O, from the waker queue or + * other queues, that the second alternative + * guarantees (the second alternative unconditionally + * injects a pending I/O request of the waker queue + * for each bfq_dispatch_request()). Second, with the + * third alternative, the duration of the plugging, + * i.e., the time before bfqq finally receives new I/O, + * may not be minimized, because the waker queue may + * happen to be served only after other queues. + */ + if (async_bfqq && + icq_to_bic(async_bfqq->next_rq->elv.icq) == bfqq->bic && + bfq_serv_to_charge(async_bfqq->next_rq, async_bfqq) <= + bfq_bfqq_budget_left(async_bfqq)) + bfqq = bfqq->bic->bfqq[0]; + else if (bfq_bfqq_has_waker(bfqq) && + bfq_bfqq_busy(bfqq->waker_bfqq) && + bfqq->next_rq && + bfq_serv_to_charge(bfqq->waker_bfqq->next_rq, + bfqq->waker_bfqq) <= + bfq_bfqq_budget_left(bfqq->waker_bfqq) + ) + bfqq = bfqq->waker_bfqq; + else if (!idling_boosts_thr_without_issues(bfqd, bfqq) && + (bfqq->wr_coeff == 1 || bfqd->wr_busy_queues > 1 || + !bfq_bfqq_has_short_ttime(bfqq))) + bfqq = bfq_choose_bfqq_for_injection(bfqd); + else + bfqq = NULL; + + goto keep_queue; + } + + reason = BFQQE_NO_MORE_REQUESTS; +expire: + bfq_bfqq_expire(bfqd, bfqq, false, reason); +new_queue: + bfqq = bfq_set_in_service_queue(bfqd); + if (bfqq) { + bfq_log_bfqq(bfqd, bfqq, "select_queue: checking new queue"); + goto check_queue; + } +keep_queue: + if (bfqq) + bfq_log_bfqq(bfqd, bfqq, "select_queue: returned this queue"); + else + bfq_log(bfqd, "select_queue: no queue returned"); + + return bfqq; +} + +static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + + if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */ + bfq_log_bfqq(bfqd, bfqq, + "raising period dur %u/%u msec, old coeff %u, w %d(%d)", + jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), + jiffies_to_msecs(bfqq->wr_cur_max_time), + bfqq->wr_coeff, + bfqq->entity.weight, bfqq->entity.orig_weight); + + if (entity->prio_changed) + bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change"); + + /* + * If the queue was activated in a burst, or too much + * time has elapsed from the beginning of this + * weight-raising period, then end weight raising. + */ + if (bfq_bfqq_in_large_burst(bfqq)) + bfq_bfqq_end_wr(bfqq); + else if (time_is_before_jiffies(bfqq->last_wr_start_finish + + bfqq->wr_cur_max_time)) { + if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time || + time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt + + bfq_wr_duration(bfqd))) + bfq_bfqq_end_wr(bfqq); + else { + switch_back_to_interactive_wr(bfqq, bfqd); + bfqq->entity.prio_changed = 1; + } + } + if (bfqq->wr_coeff > 1 && + bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time && + bfqq->service_from_wr > max_service_from_wr) { + /* see comments on max_service_from_wr */ + bfq_bfqq_end_wr(bfqq); + } + } + /* + * To improve latency (for this or other queues), immediately + * update weight both if it must be raised and if it must be + * lowered. Since, entity may be on some active tree here, and + * might have a pending change of its ioprio class, invoke + * next function with the last parameter unset (see the + * comments on the function). + */ + if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1)) + __bfq_entity_update_weight_prio(bfq_entity_service_tree(entity), + entity, false); +} + +/* + * Dispatch next request from bfqq. + */ +static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + struct request *rq = bfqq->next_rq; + unsigned long service_to_charge; + + service_to_charge = bfq_serv_to_charge(rq, bfqq); + + bfq_bfqq_served(bfqq, service_to_charge); + + if (bfqq == bfqd->in_service_queue && bfqd->wait_dispatch) { + bfqd->wait_dispatch = false; + bfqd->waited_rq = rq; + } + + bfq_dispatch_remove(bfqd->queue, rq); + + if (bfqq != bfqd->in_service_queue) + goto return_rq; + + /* + * If weight raising has to terminate for bfqq, then next + * function causes an immediate update of bfqq's weight, + * without waiting for next activation. As a consequence, on + * expiration, bfqq will be timestamped as if has never been + * weight-raised during this service slot, even if it has + * received part or even most of the service as a + * weight-raised queue. This inflates bfqq's timestamps, which + * is beneficial, as bfqq is then more willing to leave the + * device immediately to possible other weight-raised queues. + */ + bfq_update_wr_data(bfqd, bfqq); + + /* + * Expire bfqq, pretending that its budget expired, if bfqq + * belongs to CLASS_IDLE and other queues are waiting for + * service. + */ + if (!(bfq_tot_busy_queues(bfqd) > 1 && bfq_class_idle(bfqq))) + goto return_rq; + + bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED); + +return_rq: + return rq; +} + +static bool bfq_has_work(struct blk_mq_hw_ctx *hctx) +{ + struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; + + if (!atomic_read(&hctx->elevator_queued)) + return false; + + /* + * Avoiding lock: a race on bfqd->busy_queues should cause at + * most a call to dispatch for nothing + */ + return !list_empty_careful(&bfqd->dispatch) || + bfq_tot_busy_queues(bfqd) > 0; +} + +static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) +{ + struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; + struct request *rq = NULL; + struct bfq_queue *bfqq = NULL; + + if (!list_empty(&bfqd->dispatch)) { + rq = list_first_entry(&bfqd->dispatch, struct request, + queuelist); + list_del_init(&rq->queuelist); + + bfqq = RQ_BFQQ(rq); + + if (bfqq) { + /* + * Increment counters here, because this + * dispatch does not follow the standard + * dispatch flow (where counters are + * incremented) + */ + bfqq->dispatched++; + + goto inc_in_driver_start_rq; + } + + /* + * We exploit the bfq_finish_requeue_request hook to + * decrement rq_in_driver, but + * bfq_finish_requeue_request will not be invoked on + * this request. So, to avoid unbalance, just start + * this request, without incrementing rq_in_driver. As + * a negative consequence, rq_in_driver is deceptively + * lower than it should be while this request is in + * service. This may cause bfq_schedule_dispatch to be + * invoked uselessly. + * + * As for implementing an exact solution, the + * bfq_finish_requeue_request hook, if defined, is + * probably invoked also on this request. So, by + * exploiting this hook, we could 1) increment + * rq_in_driver here, and 2) decrement it in + * bfq_finish_requeue_request. Such a solution would + * let the value of the counter be always accurate, + * but it would entail using an extra interface + * function. This cost seems higher than the benefit, + * being the frequency of non-elevator-private + * requests very low. + */ + goto start_rq; + } + + bfq_log(bfqd, "dispatch requests: %d busy queues", + bfq_tot_busy_queues(bfqd)); + + if (bfq_tot_busy_queues(bfqd) == 0) + goto exit; + + /* + * Force device to serve one request at a time if + * strict_guarantees is true. Forcing this service scheme is + * currently the ONLY way to guarantee that the request + * service order enforced by the scheduler is respected by a + * queueing device. Otherwise the device is free even to make + * some unlucky request wait for as long as the device + * wishes. + * + * Of course, serving one request at a time may cause loss of + * throughput. + */ + if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0) + goto exit; + + bfqq = bfq_select_queue(bfqd); + if (!bfqq) + goto exit; + + rq = bfq_dispatch_rq_from_bfqq(bfqd, bfqq); + + if (rq) { +inc_in_driver_start_rq: + bfqd->rq_in_driver++; +start_rq: + rq->rq_flags |= RQF_STARTED; + } +exit: + return rq; +} + +#ifdef CONFIG_BFQ_CGROUP_DEBUG +static void bfq_update_dispatch_stats(struct request_queue *q, + struct request *rq, + struct bfq_queue *in_serv_queue, + bool idle_timer_disabled) +{ + struct bfq_queue *bfqq = rq ? RQ_BFQQ(rq) : NULL; + + if (!idle_timer_disabled && !bfqq) + return; + + /* + * rq and bfqq are guaranteed to exist until this function + * ends, for the following reasons. First, rq can be + * dispatched to the device, and then can be completed and + * freed, only after this function ends. Second, rq cannot be + * merged (and thus freed because of a merge) any longer, + * because it has already started. Thus rq cannot be freed + * before this function ends, and, since rq has a reference to + * bfqq, the same guarantee holds for bfqq too. + * + * In addition, the following queue lock guarantees that + * bfqq_group(bfqq) exists as well. + */ + spin_lock_irq(&q->queue_lock); + if (idle_timer_disabled) + /* + * Since the idle timer has been disabled, + * in_serv_queue contained some request when + * __bfq_dispatch_request was invoked above, which + * implies that rq was picked exactly from + * in_serv_queue. Thus in_serv_queue == bfqq, and is + * therefore guaranteed to exist because of the above + * arguments. + */ + bfqg_stats_update_idle_time(bfqq_group(in_serv_queue)); + if (bfqq) { + struct bfq_group *bfqg = bfqq_group(bfqq); + + bfqg_stats_update_avg_queue_size(bfqg); + bfqg_stats_set_start_empty_time(bfqg); + bfqg_stats_update_io_remove(bfqg, rq->cmd_flags); + } + spin_unlock_irq(&q->queue_lock); +} +#else +static inline void bfq_update_dispatch_stats(struct request_queue *q, + struct request *rq, + struct bfq_queue *in_serv_queue, + bool idle_timer_disabled) {} +#endif /* CONFIG_BFQ_CGROUP_DEBUG */ + +static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) +{ + struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; + struct request *rq; + struct bfq_queue *in_serv_queue; + bool waiting_rq, idle_timer_disabled = false; + + spin_lock_irq(&bfqd->lock); + + in_serv_queue = bfqd->in_service_queue; + waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue); + + rq = __bfq_dispatch_request(hctx); + if (in_serv_queue == bfqd->in_service_queue) { + idle_timer_disabled = + waiting_rq && !bfq_bfqq_wait_request(in_serv_queue); + } + + spin_unlock_irq(&bfqd->lock); + bfq_update_dispatch_stats(hctx->queue, rq, + idle_timer_disabled ? in_serv_queue : NULL, + idle_timer_disabled); + + return rq; +} + +/* + * Task holds one reference to the queue, dropped when task exits. Each rq + * in-flight on this queue also holds a reference, dropped when rq is freed. + * + * Scheduler lock must be held here. Recall not to use bfqq after calling + * this function on it. + */ +void bfq_put_queue(struct bfq_queue *bfqq) +{ + struct bfq_queue *item; + struct hlist_node *n; + struct bfq_group *bfqg = bfqq_group(bfqq); + + if (bfqq->bfqd) + bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", + bfqq, bfqq->ref); + + bfqq->ref--; + if (bfqq->ref) + return; + + if (!hlist_unhashed(&bfqq->burst_list_node)) { + hlist_del_init(&bfqq->burst_list_node); + /* + * Decrement also burst size after the removal, if the + * process associated with bfqq is exiting, and thus + * does not contribute to the burst any longer. This + * decrement helps filter out false positives of large + * bursts, when some short-lived process (often due to + * the execution of commands by some service) happens + * to start and exit while a complex application is + * starting, and thus spawning several processes that + * do I/O (and that *must not* be treated as a large + * burst, see comments on bfq_handle_burst). + * + * In particular, the decrement is performed only if: + * 1) bfqq is not a merged queue, because, if it is, + * then this free of bfqq is not triggered by the exit + * of the process bfqq is associated with, but exactly + * by the fact that bfqq has just been merged. + * 2) burst_size is greater than 0, to handle + * unbalanced decrements. Unbalanced decrements may + * happen in te following case: bfqq is inserted into + * the current burst list--without incrementing + * bust_size--because of a split, but the current + * burst list is not the burst list bfqq belonged to + * (see comments on the case of a split in + * bfq_set_request). + */ + if (bfqq->bic && bfqq->bfqd->burst_size > 0) + bfqq->bfqd->burst_size--; + } + + /* + * bfqq does not exist any longer, so it cannot be woken by + * any other queue, and cannot wake any other queue. Then bfqq + * must be removed from the woken list of its possible waker + * queue, and all queues in the woken list of bfqq must stop + * having a waker queue. Strictly speaking, these updates + * should be performed when bfqq remains with no I/O source + * attached to it, which happens before bfqq gets freed. In + * particular, this happens when the last process associated + * with bfqq exits or gets associated with a different + * queue. However, both events lead to bfqq being freed soon, + * and dangling references would come out only after bfqq gets + * freed. So these updates are done here, as a simple and safe + * way to handle all cases. + */ + /* remove bfqq from woken list */ + if (!hlist_unhashed(&bfqq->woken_list_node)) + hlist_del_init(&bfqq->woken_list_node); + + /* reset waker for all queues in woken list */ + hlist_for_each_entry_safe(item, n, &bfqq->woken_list, + woken_list_node) { + item->waker_bfqq = NULL; + bfq_clear_bfqq_has_waker(item); + hlist_del_init(&item->woken_list_node); + } + + if (bfqq->bfqd && bfqq->bfqd->last_completed_rq_bfqq == bfqq) + bfqq->bfqd->last_completed_rq_bfqq = NULL; + + kmem_cache_free(bfq_pool, bfqq); + bfqg_and_blkg_put(bfqg); +} + +void bfq_put_cooperator(struct bfq_queue *bfqq) +{ + struct bfq_queue *__bfqq, *next; + + /* + * If this queue was scheduled to merge with another queue, be + * sure to drop the reference taken on that queue (and others in + * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs. + */ + __bfqq = bfqq->new_bfqq; + while (__bfqq) { + if (__bfqq == bfqq) + break; + next = __bfqq->new_bfqq; + bfq_put_queue(__bfqq); + __bfqq = next; + } +} + +static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + if (bfqq == bfqd->in_service_queue) { + __bfq_bfqq_expire(bfqd, bfqq, BFQQE_BUDGET_TIMEOUT); + bfq_schedule_dispatch(bfqd); + } + + bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref); + + bfq_put_cooperator(bfqq); + + bfq_release_process_ref(bfqd, bfqq); +} + +static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) +{ + struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync); + struct bfq_data *bfqd; + + if (bfqq) + bfqd = bfqq->bfqd; /* NULL if scheduler already exited */ + + if (bfqq && bfqd) { + unsigned long flags; + + spin_lock_irqsave(&bfqd->lock, flags); + bic_set_bfqq(bic, NULL, is_sync); + bfq_exit_bfqq(bfqd, bfqq); + spin_unlock_irqrestore(&bfqd->lock, flags); + } +} + +static void bfq_exit_icq(struct io_cq *icq) +{ + struct bfq_io_cq *bic = icq_to_bic(icq); + + bfq_exit_icq_bfqq(bic, true); + bfq_exit_icq_bfqq(bic, false); +} + +/* + * Update the entity prio values; note that the new values will not + * be used until the next (re)activation. + */ +static void +bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) +{ + struct task_struct *tsk = current; + int ioprio_class; + struct bfq_data *bfqd = bfqq->bfqd; + + if (!bfqd) + return; + + ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); + switch (ioprio_class) { + default: + pr_err("bdi %s: bfq: bad prio class %d\n", + bdi_dev_name(bfqq->bfqd->queue->backing_dev_info), + ioprio_class); + fallthrough; + case IOPRIO_CLASS_NONE: + /* + * No prio set, inherit CPU scheduling settings. + */ + bfqq->new_ioprio = task_nice_ioprio(tsk); + bfqq->new_ioprio_class = task_nice_ioclass(tsk); + break; + case IOPRIO_CLASS_RT: + bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); + bfqq->new_ioprio_class = IOPRIO_CLASS_RT; + break; + case IOPRIO_CLASS_BE: + bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); + bfqq->new_ioprio_class = IOPRIO_CLASS_BE; + break; + case IOPRIO_CLASS_IDLE: + bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE; + bfqq->new_ioprio = 7; + break; + } + + if (bfqq->new_ioprio >= IOPRIO_BE_NR) { + pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n", + bfqq->new_ioprio); + bfqq->new_ioprio = IOPRIO_BE_NR - 1; + } + + bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); + bfqq->entity.prio_changed = 1; +} + +static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, + struct bio *bio, bool is_sync, + struct bfq_io_cq *bic); + +static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) +{ + struct bfq_data *bfqd = bic_to_bfqd(bic); + struct bfq_queue *bfqq; + int ioprio = bic->icq.ioc->ioprio; + + /* + * This condition may trigger on a newly created bic, be sure to + * drop the lock before returning. + */ + if (unlikely(!bfqd) || likely(bic->ioprio == ioprio)) + return; + + bic->ioprio = ioprio; + + bfqq = bic_to_bfqq(bic, false); + if (bfqq) { + struct bfq_queue *old_bfqq = bfqq; + + bfqq = bfq_get_queue(bfqd, bio, false, bic); + bic_set_bfqq(bic, bfqq, false); + bfq_release_process_ref(bfqd, old_bfqq); + } + + bfqq = bic_to_bfqq(bic, true); + if (bfqq) + bfq_set_next_ioprio_data(bfqq, bic); +} + +static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct bfq_io_cq *bic, pid_t pid, int is_sync) +{ + RB_CLEAR_NODE(&bfqq->entity.rb_node); + INIT_LIST_HEAD(&bfqq->fifo); + INIT_HLIST_NODE(&bfqq->burst_list_node); + INIT_HLIST_NODE(&bfqq->woken_list_node); + INIT_HLIST_HEAD(&bfqq->woken_list); + + bfqq->ref = 0; + bfqq->bfqd = bfqd; + + if (bic) + bfq_set_next_ioprio_data(bfqq, bic); + + if (is_sync) { + /* + * No need to mark as has_short_ttime if in + * idle_class, because no device idling is performed + * for queues in idle class + */ + if (!bfq_class_idle(bfqq)) + /* tentatively mark as has_short_ttime */ + bfq_mark_bfqq_has_short_ttime(bfqq); + bfq_mark_bfqq_sync(bfqq); + bfq_mark_bfqq_just_created(bfqq); + } else + bfq_clear_bfqq_sync(bfqq); + + /* set end request to minus infinity from now */ + bfqq->ttime.last_end_request = ktime_get_ns() + 1; + + bfq_mark_bfqq_IO_bound(bfqq); + + bfqq->pid = pid; + + /* Tentative initial value to trade off between thr and lat */ + bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3; + bfqq->budget_timeout = bfq_smallest_from_now(); + + bfqq->wr_coeff = 1; + bfqq->last_wr_start_finish = jiffies; + bfqq->wr_start_at_switch_to_srt = bfq_smallest_from_now(); + bfqq->split_time = bfq_smallest_from_now(); + + /* + * To not forget the possibly high bandwidth consumed by a + * process/queue in the recent past, + * bfq_bfqq_softrt_next_start() returns a value at least equal + * to the current value of bfqq->soft_rt_next_start (see + * comments on bfq_bfqq_softrt_next_start). Set + * soft_rt_next_start to now, to mean that bfqq has consumed + * no bandwidth so far. + */ + bfqq->soft_rt_next_start = jiffies; + + /* first request is almost certainly seeky */ + bfqq->seek_history = 1; +} + +static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, + struct bfq_group *bfqg, + int ioprio_class, int ioprio) +{ + switch (ioprio_class) { + case IOPRIO_CLASS_RT: + return &bfqg->async_bfqq[0][ioprio]; + case IOPRIO_CLASS_NONE: + ioprio = IOPRIO_NORM; + fallthrough; + case IOPRIO_CLASS_BE: + return &bfqg->async_bfqq[1][ioprio]; + case IOPRIO_CLASS_IDLE: + return &bfqg->async_idle_bfqq; + default: + return NULL; + } +} + +static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, + struct bio *bio, bool is_sync, + struct bfq_io_cq *bic) +{ + const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio); + const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); + struct bfq_queue **async_bfqq = NULL; + struct bfq_queue *bfqq; + struct bfq_group *bfqg; + + bfqg = bfq_bio_bfqg(bfqd, bio); + if (!is_sync) { + async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, + ioprio); + bfqq = *async_bfqq; + if (bfqq) + goto out; + } + + bfqq = kmem_cache_alloc_node(bfq_pool, + GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN, + bfqd->queue->node); + + if (bfqq) { + bfq_init_bfqq(bfqd, bfqq, bic, current->pid, + is_sync); + bfq_init_entity(&bfqq->entity, bfqg); + bfq_log_bfqq(bfqd, bfqq, "allocated"); + } else { + bfqq = &bfqd->oom_bfqq; + bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); + goto out; + } + + /* + * Pin the queue now that it's allocated, scheduler exit will + * prune it. + */ + if (async_bfqq) { + bfqq->ref++; /* + * Extra group reference, w.r.t. sync + * queue. This extra reference is removed + * only if bfqq->bfqg disappears, to + * guarantee that this queue is not freed + * until its group goes away. + */ + bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", + bfqq, bfqq->ref); + *async_bfqq = bfqq; + } + +out: + bfqq->ref++; /* get a process reference to this queue */ + bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref); + return bfqq; +} + +static void bfq_update_io_thinktime(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + struct bfq_ttime *ttime = &bfqq->ttime; + u64 elapsed = ktime_get_ns() - bfqq->ttime.last_end_request; + + elapsed = min_t(u64, elapsed, 2ULL * bfqd->bfq_slice_idle); + + ttime->ttime_samples = (7*bfqq->ttime.ttime_samples + 256) / 8; + ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8); + ttime->ttime_mean = div64_ul(ttime->ttime_total + 128, + ttime->ttime_samples); +} + +static void +bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct request *rq) +{ + bfqq->seek_history <<= 1; + bfqq->seek_history |= BFQ_RQ_SEEKY(bfqd, bfqq->last_request_pos, rq); + + if (bfqq->wr_coeff > 1 && + bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && + BFQQ_TOTALLY_SEEKY(bfqq)) + bfq_bfqq_end_wr(bfqq); +} + +static void bfq_update_has_short_ttime(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + struct bfq_io_cq *bic) +{ + bool has_short_ttime = true, state_changed; + + /* + * No need to update has_short_ttime if bfqq is async or in + * idle io prio class, or if bfq_slice_idle is zero, because + * no device idling is performed for bfqq in this case. + */ + if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq) || + bfqd->bfq_slice_idle == 0) + return; + + /* Idle window just restored, statistics are meaningless. */ + if (time_is_after_eq_jiffies(bfqq->split_time + + bfqd->bfq_wr_min_idle_time)) + return; + + /* Think time is infinite if no process is linked to + * bfqq. Otherwise check average think time to + * decide whether to mark as has_short_ttime + */ + if (atomic_read(&bic->icq.ioc->active_ref) == 0 || + (bfq_sample_valid(bfqq->ttime.ttime_samples) && + bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle)) + has_short_ttime = false; + + state_changed = has_short_ttime != bfq_bfqq_has_short_ttime(bfqq); + + if (has_short_ttime) + bfq_mark_bfqq_has_short_ttime(bfqq); + else + bfq_clear_bfqq_has_short_ttime(bfqq); + + /* + * Until the base value for the total service time gets + * finally computed for bfqq, the inject limit does depend on + * the think-time state (short|long). In particular, the limit + * is 0 or 1 if the think time is deemed, respectively, as + * short or long (details in the comments in + * bfq_update_inject_limit()). Accordingly, the next + * instructions reset the inject limit if the think-time state + * has changed and the above base value is still to be + * computed. + * + * However, the reset is performed only if more than 100 ms + * have elapsed since the last update of the inject limit, or + * (inclusive) if the change is from short to long think + * time. The reason for this waiting is as follows. + * + * bfqq may have a long think time because of a + * synchronization with some other queue, i.e., because the + * I/O of some other queue may need to be completed for bfqq + * to receive new I/O. Details in the comments on the choice + * of the queue for injection in bfq_select_queue(). + * + * As stressed in those comments, if such a synchronization is + * actually in place, then, without injection on bfqq, the + * blocking I/O cannot happen to served while bfqq is in + * service. As a consequence, if bfqq is granted + * I/O-dispatch-plugging, then bfqq remains empty, and no I/O + * is dispatched, until the idle timeout fires. This is likely + * to result in lower bandwidth and higher latencies for bfqq, + * and in a severe loss of total throughput. + * + * On the opposite end, a non-zero inject limit may allow the + * I/O that blocks bfqq to be executed soon, and therefore + * bfqq to receive new I/O soon. + * + * But, if the blocking gets actually eliminated, then the + * next think-time sample for bfqq may be very low. This in + * turn may cause bfqq's think time to be deemed + * short. Without the 100 ms barrier, this new state change + * would cause the body of the next if to be executed + * immediately. But this would set to 0 the inject + * limit. Without injection, the blocking I/O would cause the + * think time of bfqq to become long again, and therefore the + * inject limit to be raised again, and so on. The only effect + * of such a steady oscillation between the two think-time + * states would be to prevent effective injection on bfqq. + * + * In contrast, if the inject limit is not reset during such a + * long time interval as 100 ms, then the number of short + * think time samples can grow significantly before the reset + * is performed. As a consequence, the think time state can + * become stable before the reset. Therefore there will be no + * state change when the 100 ms elapse, and no reset of the + * inject limit. The inject limit remains steadily equal to 1 + * both during and after the 100 ms. So injection can be + * performed at all times, and throughput gets boosted. + * + * An inject limit equal to 1 is however in conflict, in + * general, with the fact that the think time of bfqq is + * short, because injection may be likely to delay bfqq's I/O + * (as explained in the comments in + * bfq_update_inject_limit()). But this does not happen in + * this special case, because bfqq's low think time is due to + * an effective handling of a synchronization, through + * injection. In this special case, bfqq's I/O does not get + * delayed by injection; on the contrary, bfqq's I/O is + * brought forward, because it is not blocked for + * milliseconds. + * + * In addition, serving the blocking I/O much sooner, and much + * more frequently than once per I/O-plugging timeout, makes + * it much quicker to detect a waker queue (the concept of + * waker queue is defined in the comments in + * bfq_add_request()). This makes it possible to start sooner + * to boost throughput more effectively, by injecting the I/O + * of the waker queue unconditionally on every + * bfq_dispatch_request(). + * + * One last, important benefit of not resetting the inject + * limit before 100 ms is that, during this time interval, the + * base value for the total service time is likely to get + * finally computed for bfqq, freeing the inject limit from + * its relation with the think time. + */ + if (state_changed && bfqq->last_serv_time_ns == 0 && + (time_is_before_eq_jiffies(bfqq->decrease_time_jif + + msecs_to_jiffies(100)) || + !has_short_ttime)) + bfq_reset_inject_limit(bfqd, bfqq); +} + +/* + * Called when a new fs request (rq) is added to bfqq. Check if there's + * something we should do about it. + */ +static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct request *rq) +{ + if (rq->cmd_flags & REQ_META) + bfqq->meta_pending++; + + bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); + + if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) { + bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 && + blk_rq_sectors(rq) < 32; + bool budget_timeout = bfq_bfqq_budget_timeout(bfqq); + + /* + * There is just this request queued: if + * - the request is small, and + * - we are idling to boost throughput, and + * - the queue is not to be expired, + * then just exit. + * + * In this way, if the device is being idled to wait + * for a new request from the in-service queue, we + * avoid unplugging the device and committing the + * device to serve just a small request. In contrast + * we wait for the block layer to decide when to + * unplug the device: hopefully, new requests will be + * merged to this one quickly, then the device will be + * unplugged and larger requests will be dispatched. + */ + if (small_req && idling_boosts_thr_without_issues(bfqd, bfqq) && + !budget_timeout) + return; + + /* + * A large enough request arrived, or idling is being + * performed to preserve service guarantees, or + * finally the queue is to be expired: in all these + * cases disk idling is to be stopped, so clear + * wait_request flag and reset timer. + */ + bfq_clear_bfqq_wait_request(bfqq); + hrtimer_try_to_cancel(&bfqd->idle_slice_timer); + + /* + * The queue is not empty, because a new request just + * arrived. Hence we can safely expire the queue, in + * case of budget timeout, without risking that the + * timestamps of the queue are not updated correctly. + * See [1] for more details. + */ + if (budget_timeout) + bfq_bfqq_expire(bfqd, bfqq, false, + BFQQE_BUDGET_TIMEOUT); + } +} + +/* returns true if it causes the idle timer to be disabled */ +static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq), + *new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true); + bool waiting, idle_timer_disabled = false; + + if (new_bfqq) { + /* + * Release the request's reference to the old bfqq + * and make sure one is taken to the shared queue. + */ + new_bfqq->allocated++; + bfqq->allocated--; + new_bfqq->ref++; + /* + * If the bic associated with the process + * issuing this request still points to bfqq + * (and thus has not been already redirected + * to new_bfqq or even some other bfq_queue), + * then complete the merge and redirect it to + * new_bfqq. + */ + if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) + bfq_merge_bfqqs(bfqd, RQ_BIC(rq), + bfqq, new_bfqq); + + bfq_clear_bfqq_just_created(bfqq); + /* + * rq is about to be enqueued into new_bfqq, + * release rq reference on bfqq + */ + bfq_put_queue(bfqq); + rq->elv.priv[1] = new_bfqq; + bfqq = new_bfqq; + } + + bfq_update_io_thinktime(bfqd, bfqq); + bfq_update_has_short_ttime(bfqd, bfqq, RQ_BIC(rq)); + bfq_update_io_seektime(bfqd, bfqq, rq); + + waiting = bfqq && bfq_bfqq_wait_request(bfqq); + bfq_add_request(rq); + idle_timer_disabled = waiting && !bfq_bfqq_wait_request(bfqq); + + rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; + list_add_tail(&rq->queuelist, &bfqq->fifo); + + bfq_rq_enqueued(bfqd, bfqq, rq); + + return idle_timer_disabled; +} + +#ifdef CONFIG_BFQ_CGROUP_DEBUG +static void bfq_update_insert_stats(struct request_queue *q, + struct bfq_queue *bfqq, + bool idle_timer_disabled, + unsigned int cmd_flags) +{ + if (!bfqq) + return; + + /* + * bfqq still exists, because it can disappear only after + * either it is merged with another queue, or the process it + * is associated with exits. But both actions must be taken by + * the same process currently executing this flow of + * instructions. + * + * In addition, the following queue lock guarantees that + * bfqq_group(bfqq) exists as well. + */ + spin_lock_irq(&q->queue_lock); + bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags); + if (idle_timer_disabled) + bfqg_stats_update_idle_time(bfqq_group(bfqq)); + spin_unlock_irq(&q->queue_lock); +} +#else +static inline void bfq_update_insert_stats(struct request_queue *q, + struct bfq_queue *bfqq, + bool idle_timer_disabled, + unsigned int cmd_flags) {} +#endif /* CONFIG_BFQ_CGROUP_DEBUG */ + +static struct bfq_queue *bfq_init_rq(struct request *rq); + +static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, + bool at_head) +{ + struct request_queue *q = hctx->queue; + struct bfq_data *bfqd = q->elevator->elevator_data; + struct bfq_queue *bfqq; + bool idle_timer_disabled = false; + unsigned int cmd_flags; + +#ifdef CONFIG_BFQ_GROUP_IOSCHED + if (!cgroup_subsys_on_dfl(io_cgrp_subsys) && rq->bio) + bfqg_stats_update_legacy_io(q, rq); +#endif + spin_lock_irq(&bfqd->lock); + bfqq = bfq_init_rq(rq); + if (blk_mq_sched_try_insert_merge(q, rq)) { + spin_unlock_irq(&bfqd->lock); + return; + } + + blk_mq_sched_request_inserted(rq); + + if (!bfqq || at_head || blk_rq_is_passthrough(rq)) { + if (at_head) + list_add(&rq->queuelist, &bfqd->dispatch); + else + list_add_tail(&rq->queuelist, &bfqd->dispatch); + } else { + idle_timer_disabled = __bfq_insert_request(bfqd, rq); + /* + * Update bfqq, because, if a queue merge has occurred + * in __bfq_insert_request, then rq has been + * redirected into a new queue. + */ + bfqq = RQ_BFQQ(rq); + + if (rq_mergeable(rq)) { + elv_rqhash_add(q, rq); + if (!q->last_merge) + q->last_merge = rq; + } + } + + /* + * Cache cmd_flags before releasing scheduler lock, because rq + * may disappear afterwards (for example, because of a request + * merge). + */ + cmd_flags = rq->cmd_flags; + + spin_unlock_irq(&bfqd->lock); + + bfq_update_insert_stats(q, bfqq, idle_timer_disabled, + cmd_flags); +} + +static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, + struct list_head *list, bool at_head) +{ + while (!list_empty(list)) { + struct request *rq; + + rq = list_first_entry(list, struct request, queuelist); + list_del_init(&rq->queuelist); + bfq_insert_request(hctx, rq, at_head); + atomic_inc(&hctx->elevator_queued); + } +} + +static void bfq_update_hw_tag(struct bfq_data *bfqd) +{ + struct bfq_queue *bfqq = bfqd->in_service_queue; + + bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver, + bfqd->rq_in_driver); + + if (bfqd->hw_tag == 1) + return; + + /* + * This sample is valid if the number of outstanding requests + * is large enough to allow a queueing behavior. Note that the + * sum is not exact, as it's not taking into account deactivated + * requests. + */ + if (bfqd->rq_in_driver + bfqd->queued <= BFQ_HW_QUEUE_THRESHOLD) + return; + + /* + * If active queue hasn't enough requests and can idle, bfq might not + * dispatch sufficient requests to hardware. Don't zero hw_tag in this + * case + */ + if (bfqq && bfq_bfqq_has_short_ttime(bfqq) && + bfqq->dispatched + bfqq->queued[0] + bfqq->queued[1] < + BFQ_HW_QUEUE_THRESHOLD && + bfqd->rq_in_driver < BFQ_HW_QUEUE_THRESHOLD) + return; + + if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES) + return; + + bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD; + bfqd->max_rq_in_driver = 0; + bfqd->hw_tag_samples = 0; + + bfqd->nonrot_with_queueing = + blk_queue_nonrot(bfqd->queue) && bfqd->hw_tag; +} + +static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) +{ + u64 now_ns; + u32 delta_us; + + bfq_update_hw_tag(bfqd); + + bfqd->rq_in_driver--; + bfqq->dispatched--; + + if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { + /* + * Set budget_timeout (which we overload to store the + * time at which the queue remains with no backlog and + * no outstanding request; used by the weight-raising + * mechanism). + */ + bfqq->budget_timeout = jiffies; + + bfq_weights_tree_remove(bfqd, bfqq); + } + + now_ns = ktime_get_ns(); + + bfqq->ttime.last_end_request = now_ns; + + /* + * Using us instead of ns, to get a reasonable precision in + * computing rate in next check. + */ + delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC); + + /* + * If the request took rather long to complete, and, according + * to the maximum request size recorded, this completion latency + * implies that the request was certainly served at a very low + * rate (less than 1M sectors/sec), then the whole observation + * interval that lasts up to this time instant cannot be a + * valid time interval for computing a new peak rate. Invoke + * bfq_update_rate_reset to have the following three steps + * taken: + * - close the observation interval at the last (previous) + * request dispatch or completion + * - compute rate, if possible, for that observation interval + * - reset to zero samples, which will trigger a proper + * re-initialization of the observation interval on next + * dispatch + */ + if (delta_us > BFQ_MIN_TT/NSEC_PER_USEC && + (bfqd->last_rq_max_size<last_completion = now_ns; + bfqd->last_completed_rq_bfqq = bfqq; + + /* + * If we are waiting to discover whether the request pattern + * of the task associated with the queue is actually + * isochronous, and both requisites for this condition to hold + * are now satisfied, then compute soft_rt_next_start (see the + * comments on the function bfq_bfqq_softrt_next_start()). We + * do not compute soft_rt_next_start if bfqq is in interactive + * weight raising (see the comments in bfq_bfqq_expire() for + * an explanation). We schedule this delayed update when bfqq + * expires, if it still has in-flight requests. + */ + if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 && + RB_EMPTY_ROOT(&bfqq->sort_list) && + bfqq->wr_coeff != bfqd->bfq_wr_coeff) + bfqq->soft_rt_next_start = + bfq_bfqq_softrt_next_start(bfqd, bfqq); + + /* + * If this is the in-service queue, check if it needs to be expired, + * or if we want to idle in case it has no pending requests. + */ + if (bfqd->in_service_queue == bfqq) { + if (bfq_bfqq_must_idle(bfqq)) { + if (bfqq->dispatched == 0) + bfq_arm_slice_timer(bfqd); + /* + * If we get here, we do not expire bfqq, even + * if bfqq was in budget timeout or had no + * more requests (as controlled in the next + * conditional instructions). The reason for + * not expiring bfqq is as follows. + * + * Here bfqq->dispatched > 0 holds, but + * bfq_bfqq_must_idle() returned true. This + * implies that, even if no request arrives + * for bfqq before bfqq->dispatched reaches 0, + * bfqq will, however, not be expired on the + * completion event that causes bfqq->dispatch + * to reach zero. In contrast, on this event, + * bfqq will start enjoying device idling + * (I/O-dispatch plugging). + * + * But, if we expired bfqq here, bfqq would + * not have the chance to enjoy device idling + * when bfqq->dispatched finally reaches + * zero. This would expose bfqq to violation + * of its reserved service guarantees. + */ + return; + } else if (bfq_may_expire_for_budg_timeout(bfqq)) + bfq_bfqq_expire(bfqd, bfqq, false, + BFQQE_BUDGET_TIMEOUT); + else if (RB_EMPTY_ROOT(&bfqq->sort_list) && + (bfqq->dispatched == 0 || + !bfq_better_to_idle(bfqq))) + bfq_bfqq_expire(bfqd, bfqq, false, + BFQQE_NO_MORE_REQUESTS); + } + + if (!bfqd->rq_in_driver) + bfq_schedule_dispatch(bfqd); +} + +static void bfq_finish_requeue_request_body(struct bfq_queue *bfqq) +{ + bfqq->allocated--; + + bfq_put_queue(bfqq); +} + +/* + * The processes associated with bfqq may happen to generate their + * cumulative I/O at a lower rate than the rate at which the device + * could serve the same I/O. This is rather probable, e.g., if only + * one process is associated with bfqq and the device is an SSD. It + * results in bfqq becoming often empty while in service. In this + * respect, if BFQ is allowed to switch to another queue when bfqq + * remains empty, then the device goes on being fed with I/O requests, + * and the throughput is not affected. In contrast, if BFQ is not + * allowed to switch to another queue---because bfqq is sync and + * I/O-dispatch needs to be plugged while bfqq is temporarily + * empty---then, during the service of bfqq, there will be frequent + * "service holes", i.e., time intervals during which bfqq gets empty + * and the device can only consume the I/O already queued in its + * hardware queues. During service holes, the device may even get to + * remaining idle. In the end, during the service of bfqq, the device + * is driven at a lower speed than the one it can reach with the kind + * of I/O flowing through bfqq. + * + * To counter this loss of throughput, BFQ implements a "request + * injection mechanism", which tries to fill the above service holes + * with I/O requests taken from other queues. The hard part in this + * mechanism is finding the right amount of I/O to inject, so as to + * both boost throughput and not break bfqq's bandwidth and latency + * guarantees. In this respect, the mechanism maintains a per-queue + * inject limit, computed as below. While bfqq is empty, the injection + * mechanism dispatches extra I/O requests only until the total number + * of I/O requests in flight---i.e., already dispatched but not yet + * completed---remains lower than this limit. + * + * A first definition comes in handy to introduce the algorithm by + * which the inject limit is computed. We define as first request for + * bfqq, an I/O request for bfqq that arrives while bfqq is in + * service, and causes bfqq to switch from empty to non-empty. The + * algorithm updates the limit as a function of the effect of + * injection on the service times of only the first requests of + * bfqq. The reason for this restriction is that these are the + * requests whose service time is affected most, because they are the + * first to arrive after injection possibly occurred. + * + * To evaluate the effect of injection, the algorithm measures the + * "total service time" of first requests. We define as total service + * time of an I/O request, the time that elapses since when the + * request is enqueued into bfqq, to when it is completed. This + * quantity allows the whole effect of injection to be measured. It is + * easy to see why. Suppose that some requests of other queues are + * actually injected while bfqq is empty, and that a new request R + * then arrives for bfqq. If the device does start to serve all or + * part of the injected requests during the service hole, then, + * because of this extra service, it may delay the next invocation of + * the dispatch hook of BFQ. Then, even after R gets eventually + * dispatched, the device may delay the actual service of R if it is + * still busy serving the extra requests, or if it decides to serve, + * before R, some extra request still present in its queues. As a + * conclusion, the cumulative extra delay caused by injection can be + * easily evaluated by just comparing the total service time of first + * requests with and without injection. + * + * The limit-update algorithm works as follows. On the arrival of a + * first request of bfqq, the algorithm measures the total time of the + * request only if one of the three cases below holds, and, for each + * case, it updates the limit as described below: + * + * (1) If there is no in-flight request. This gives a baseline for the + * total service time of the requests of bfqq. If the baseline has + * not been computed yet, then, after computing it, the limit is + * set to 1, to start boosting throughput, and to prepare the + * ground for the next case. If the baseline has already been + * computed, then it is updated, in case it results to be lower + * than the previous value. + * + * (2) If the limit is higher than 0 and there are in-flight + * requests. By comparing the total service time in this case with + * the above baseline, it is possible to know at which extent the + * current value of the limit is inflating the total service + * time. If the inflation is below a certain threshold, then bfqq + * is assumed to be suffering from no perceivable loss of its + * service guarantees, and the limit is even tentatively + * increased. If the inflation is above the threshold, then the + * limit is decreased. Due to the lack of any hysteresis, this + * logic makes the limit oscillate even in steady workload + * conditions. Yet we opted for it, because it is fast in reaching + * the best value for the limit, as a function of the current I/O + * workload. To reduce oscillations, this step is disabled for a + * short time interval after the limit happens to be decreased. + * + * (3) Periodically, after resetting the limit, to make sure that the + * limit eventually drops in case the workload changes. This is + * needed because, after the limit has gone safely up for a + * certain workload, it is impossible to guess whether the + * baseline total service time may have changed, without measuring + * it again without injection. A more effective version of this + * step might be to just sample the baseline, by interrupting + * injection only once, and then to reset/lower the limit only if + * the total service time with the current limit does happen to be + * too large. + * + * More details on each step are provided in the comments on the + * pieces of code that implement these steps: the branch handling the + * transition from empty to non empty in bfq_add_request(), the branch + * handling injection in bfq_select_queue(), and the function + * bfq_choose_bfqq_for_injection(). These comments also explain some + * exceptions, made by the injection mechanism in some special cases. + */ +static void bfq_update_inject_limit(struct bfq_data *bfqd, + struct bfq_queue *bfqq) +{ + u64 tot_time_ns = ktime_get_ns() - bfqd->last_empty_occupied_ns; + unsigned int old_limit = bfqq->inject_limit; + + if (bfqq->last_serv_time_ns > 0 && bfqd->rqs_injected) { + u64 threshold = (bfqq->last_serv_time_ns * 3)>>1; + + if (tot_time_ns >= threshold && old_limit > 0) { + bfqq->inject_limit--; + bfqq->decrease_time_jif = jiffies; + } else if (tot_time_ns < threshold && + old_limit <= bfqd->max_rq_in_driver) + bfqq->inject_limit++; + } + + /* + * Either we still have to compute the base value for the + * total service time, and there seem to be the right + * conditions to do it, or we can lower the last base value + * computed. + * + * NOTE: (bfqd->rq_in_driver == 1) means that there is no I/O + * request in flight, because this function is in the code + * path that handles the completion of a request of bfqq, and, + * in particular, this function is executed before + * bfqd->rq_in_driver is decremented in such a code path. + */ + if ((bfqq->last_serv_time_ns == 0 && bfqd->rq_in_driver == 1) || + tot_time_ns < bfqq->last_serv_time_ns) { + if (bfqq->last_serv_time_ns == 0) { + /* + * Now we certainly have a base value: make sure we + * start trying injection. + */ + bfqq->inject_limit = max_t(unsigned int, 1, old_limit); + } + bfqq->last_serv_time_ns = tot_time_ns; + } else if (!bfqd->rqs_injected && bfqd->rq_in_driver == 1) + /* + * No I/O injected and no request still in service in + * the drive: these are the exact conditions for + * computing the base value of the total service time + * for bfqq. So let's update this value, because it is + * rather variable. For example, it varies if the size + * or the spatial locality of the I/O requests in bfqq + * change. + */ + bfqq->last_serv_time_ns = tot_time_ns; + + + /* update complete, not waiting for any request completion any longer */ + bfqd->waited_rq = NULL; + bfqd->rqs_injected = false; +} + +/* + * Handle either a requeue or a finish for rq. The things to do are + * the same in both cases: all references to rq are to be dropped. In + * particular, rq is considered completed from the point of view of + * the scheduler. + */ +static void bfq_finish_requeue_request(struct request *rq) +{ + struct bfq_queue *bfqq = RQ_BFQQ(rq); + struct bfq_data *bfqd; + + /* + * rq either is not associated with any icq, or is an already + * requeued request that has not (yet) been re-inserted into + * a bfq_queue. + */ + if (!rq->elv.icq || !bfqq) + return; + + bfqd = bfqq->bfqd; + + if (rq->rq_flags & RQF_STARTED) + bfqg_stats_update_completion(bfqq_group(bfqq), + rq->start_time_ns, + rq->io_start_time_ns, + rq->cmd_flags); + + if (likely(rq->rq_flags & RQF_STARTED)) { + unsigned long flags; + + spin_lock_irqsave(&bfqd->lock, flags); + + if (rq == bfqd->waited_rq) + bfq_update_inject_limit(bfqd, bfqq); + + bfq_completed_request(bfqq, bfqd); + bfq_finish_requeue_request_body(bfqq); + atomic_dec(&rq->mq_hctx->elevator_queued); + + spin_unlock_irqrestore(&bfqd->lock, flags); + } else { + /* + * Request rq may be still/already in the scheduler, + * in which case we need to remove it (this should + * never happen in case of requeue). And we cannot + * defer such a check and removal, to avoid + * inconsistencies in the time interval from the end + * of this function to the start of the deferred work. + * This situation seems to occur only in process + * context, as a consequence of a merge. In the + * current version of the code, this implies that the + * lock is held. + */ + + if (!RB_EMPTY_NODE(&rq->rb_node)) { + bfq_remove_request(rq->q, rq); + bfqg_stats_update_io_remove(bfqq_group(bfqq), + rq->cmd_flags); + } + bfq_finish_requeue_request_body(bfqq); + } + + /* + * Reset private fields. In case of a requeue, this allows + * this function to correctly do nothing if it is spuriously + * invoked again on this same request (see the check at the + * beginning of the function). Probably, a better general + * design would be to prevent blk-mq from invoking the requeue + * or finish hooks of an elevator, for a request that is not + * referred by that elevator. + * + * Resetting the following fields would break the + * request-insertion logic if rq is re-inserted into a bfq + * internal queue, without a re-preparation. Here we assume + * that re-insertions of requeued requests, without + * re-preparation, can happen only for pass_through or at_head + * requests (which are not re-inserted into bfq internal + * queues). + */ + rq->elv.priv[0] = NULL; + rq->elv.priv[1] = NULL; +} + +/* + * Removes the association between the current task and bfqq, assuming + * that bic points to the bfq iocontext of the task. + * Returns NULL if a new bfqq should be allocated, or the old bfqq if this + * was the last process referring to that bfqq. + */ +static struct bfq_queue * +bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) +{ + bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); + + if (bfqq_process_refs(bfqq) == 1) { + bfqq->pid = current->pid; + bfq_clear_bfqq_coop(bfqq); + bfq_clear_bfqq_split_coop(bfqq); + return bfqq; + } + + bic_set_bfqq(bic, NULL, true); + + bfq_put_cooperator(bfqq); + + bfq_release_process_ref(bfqq->bfqd, bfqq); + return NULL; +} + +static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, + struct bfq_io_cq *bic, + struct bio *bio, + bool split, bool is_sync, + bool *new_queue) +{ + struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync); + + if (likely(bfqq && bfqq != &bfqd->oom_bfqq)) + return bfqq; + + if (new_queue) + *new_queue = true; + + if (bfqq) + bfq_put_queue(bfqq); + bfqq = bfq_get_queue(bfqd, bio, is_sync, bic); + + bic_set_bfqq(bic, bfqq, is_sync); + if (split && is_sync) { + if ((bic->was_in_burst_list && bfqd->large_burst) || + bic->saved_in_large_burst) + bfq_mark_bfqq_in_large_burst(bfqq); + else { + bfq_clear_bfqq_in_large_burst(bfqq); + if (bic->was_in_burst_list) + /* + * If bfqq was in the current + * burst list before being + * merged, then we have to add + * it back. And we do not need + * to increase burst_size, as + * we did not decrement + * burst_size when we removed + * bfqq from the burst list as + * a consequence of a merge + * (see comments in + * bfq_put_queue). In this + * respect, it would be rather + * costly to know whether the + * current burst list is still + * the same burst list from + * which bfqq was removed on + * the merge. To avoid this + * cost, if bfqq was in a + * burst list, then we add + * bfqq to the current burst + * list without any further + * check. This can cause + * inappropriate insertions, + * but rarely enough to not + * harm the detection of large + * bursts significantly. + */ + hlist_add_head(&bfqq->burst_list_node, + &bfqd->burst_list); + } + bfqq->split_time = jiffies; + } + + return bfqq; +} + +/* + * Only reset private fields. The actual request preparation will be + * performed by bfq_init_rq, when rq is either inserted or merged. See + * comments on bfq_init_rq for the reason behind this delayed + * preparation. + */ +static void bfq_prepare_request(struct request *rq) +{ + /* + * Regardless of whether we have an icq attached, we have to + * clear the scheduler pointers, as they might point to + * previously allocated bic/bfqq structs. + */ + rq->elv.priv[0] = rq->elv.priv[1] = NULL; +} + +/* + * If needed, init rq, allocate bfq data structures associated with + * rq, and increment reference counters in the destination bfq_queue + * for rq. Return the destination bfq_queue for rq, or NULL is rq is + * not associated with any bfq_queue. + * + * This function is invoked by the functions that perform rq insertion + * or merging. One may have expected the above preparation operations + * to be performed in bfq_prepare_request, and not delayed to when rq + * is inserted or merged. The rationale behind this delayed + * preparation is that, after the prepare_request hook is invoked for + * rq, rq may still be transformed into a request with no icq, i.e., a + * request not associated with any queue. No bfq hook is invoked to + * signal this transformation. As a consequence, should these + * preparation operations be performed when the prepare_request hook + * is invoked, and should rq be transformed one moment later, bfq + * would end up in an inconsistent state, because it would have + * incremented some queue counters for an rq destined to + * transformation, without any chance to correctly lower these + * counters back. In contrast, no transformation can still happen for + * rq after rq has been inserted or merged. So, it is safe to execute + * these preparation operations when rq is finally inserted or merged. + */ +static struct bfq_queue *bfq_init_rq(struct request *rq) +{ + struct request_queue *q = rq->q; + struct bio *bio = rq->bio; + struct bfq_data *bfqd = q->elevator->elevator_data; + struct bfq_io_cq *bic; + const int is_sync = rq_is_sync(rq); + struct bfq_queue *bfqq; + bool new_queue = false; + bool bfqq_already_existing = false, split = false; + + if (unlikely(!rq->elv.icq)) + return NULL; + + /* + * Assuming that elv.priv[1] is set only if everything is set + * for this rq. This holds true, because this function is + * invoked only for insertion or merging, and, after such + * events, a request cannot be manipulated any longer before + * being removed from bfq. + */ + if (rq->elv.priv[1]) + return rq->elv.priv[1]; + + bic = icq_to_bic(rq->elv.icq); + + bfq_check_ioprio_change(bic, bio); + + bfq_bic_update_cgroup(bic, bio); + + bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, false, is_sync, + &new_queue); + + if (likely(!new_queue)) { + /* If the queue was seeky for too long, break it apart. */ + if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { + bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); + + /* Update bic before losing reference to bfqq */ + if (bfq_bfqq_in_large_burst(bfqq)) + bic->saved_in_large_burst = true; + + bfqq = bfq_split_bfqq(bic, bfqq); + split = true; + + if (!bfqq) + bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, + true, is_sync, + NULL); + else + bfqq_already_existing = true; + } + } + + bfqq->allocated++; + bfqq->ref++; + bfq_log_bfqq(bfqd, bfqq, "get_request %p: bfqq %p, %d", + rq, bfqq, bfqq->ref); + + rq->elv.priv[0] = bic; + rq->elv.priv[1] = bfqq; + + /* + * If a bfq_queue has only one process reference, it is owned + * by only this bic: we can then set bfqq->bic = bic. in + * addition, if the queue has also just been split, we have to + * resume its state. + */ + if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) { + bfqq->bic = bic; + if (split) { + /* + * The queue has just been split from a shared + * queue: restore the idle window and the + * possible weight raising period. + */ + bfq_bfqq_resume_state(bfqq, bfqd, bic, + bfqq_already_existing); + } + } + + /* + * Consider bfqq as possibly belonging to a burst of newly + * created queues only if: + * 1) A burst is actually happening (bfqd->burst_size > 0) + * or + * 2) There is no other active queue. In fact, if, in + * contrast, there are active queues not belonging to the + * possible burst bfqq may belong to, then there is no gain + * in considering bfqq as belonging to a burst, and + * therefore in not weight-raising bfqq. See comments on + * bfq_handle_burst(). + * + * This filtering also helps eliminating false positives, + * occurring when bfqq does not belong to an actual large + * burst, but some background task (e.g., a service) happens + * to trigger the creation of new queues very close to when + * bfqq and its possible companion queues are created. See + * comments on bfq_handle_burst() for further details also on + * this issue. + */ + if (unlikely(bfq_bfqq_just_created(bfqq) && + (bfqd->burst_size > 0 || + bfq_tot_busy_queues(bfqd) == 0))) + bfq_handle_burst(bfqd, bfqq); + + return bfqq; +} + +static void +bfq_idle_slice_timer_body(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + enum bfqq_expiration reason; + unsigned long flags; + + spin_lock_irqsave(&bfqd->lock, flags); + + /* + * Considering that bfqq may be in race, we should firstly check + * whether bfqq is in service before doing something on it. If + * the bfqq in race is not in service, it has already been expired + * through __bfq_bfqq_expire func and its wait_request flags has + * been cleared in __bfq_bfqd_reset_in_service func. + */ + if (bfqq != bfqd->in_service_queue) { + spin_unlock_irqrestore(&bfqd->lock, flags); + return; + } + + bfq_clear_bfqq_wait_request(bfqq); + + if (bfq_bfqq_budget_timeout(bfqq)) + /* + * Also here the queue can be safely expired + * for budget timeout without wasting + * guarantees + */ + reason = BFQQE_BUDGET_TIMEOUT; + else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) + /* + * The queue may not be empty upon timer expiration, + * because we may not disable the timer when the + * first request of the in-service queue arrives + * during disk idling. + */ + reason = BFQQE_TOO_IDLE; + else + goto schedule_dispatch; + + bfq_bfqq_expire(bfqd, bfqq, true, reason); + +schedule_dispatch: + bfq_schedule_dispatch(bfqd); + spin_unlock_irqrestore(&bfqd->lock, flags); +} + +/* + * Handler of the expiration of the timer running if the in-service queue + * is idling inside its time slice. + */ +static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer) +{ + struct bfq_data *bfqd = container_of(timer, struct bfq_data, + idle_slice_timer); + struct bfq_queue *bfqq = bfqd->in_service_queue; + + /* + * Theoretical race here: the in-service queue can be NULL or + * different from the queue that was idling if a new request + * arrives for the current queue and there is a full dispatch + * cycle that changes the in-service queue. This can hardly + * happen, but in the worst case we just expire a queue too + * early. + */ + if (bfqq) + bfq_idle_slice_timer_body(bfqd, bfqq); + + return HRTIMER_NORESTART; +} + +static void __bfq_put_async_bfqq(struct bfq_data *bfqd, + struct bfq_queue **bfqq_ptr) +{ + struct bfq_queue *bfqq = *bfqq_ptr; + + bfq_log(bfqd, "put_async_bfqq: %p", bfqq); + if (bfqq) { + bfq_bfqq_move(bfqd, bfqq, bfqd->root_group); + + bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", + bfqq, bfqq->ref); + bfq_put_queue(bfqq); + *bfqq_ptr = NULL; + } +} + +/* + * Release all the bfqg references to its async queues. If we are + * deallocating the group these queues may still contain requests, so + * we reparent them to the root cgroup (i.e., the only one that will + * exist for sure until all the requests on a device are gone). + */ +void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) +{ + int i, j; + + for (i = 0; i < 2; i++) + for (j = 0; j < IOPRIO_BE_NR; j++) + __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); + + __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); +} + +/* + * See the comments on bfq_limit_depth for the purpose of + * the depths set in the function. Return minimum shallow depth we'll use. + */ +static unsigned int bfq_update_depths(struct bfq_data *bfqd, + struct sbitmap_queue *bt) +{ + unsigned int i, j, min_shallow = UINT_MAX; + + /* + * In-word depths if no bfq_queue is being weight-raised: + * leaving 25% of tags only for sync reads. + * + * In next formulas, right-shift the value + * (1U<sb.shift), instead of computing directly + * (1U<<(bt->sb.shift - something)), to be robust against + * any possible value of bt->sb.shift, without having to + * limit 'something'. + */ + /* no more than 50% of tags for async I/O */ + bfqd->word_depths[0][0] = max((1U << bt->sb.shift) >> 1, 1U); + /* + * no more than 75% of tags for sync writes (25% extra tags + * w.r.t. async I/O, to prevent async I/O from starving sync + * writes) + */ + bfqd->word_depths[0][1] = max(((1U << bt->sb.shift) * 3) >> 2, 1U); + + /* + * In-word depths in case some bfq_queue is being weight- + * raised: leaving ~63% of tags for sync reads. This is the + * highest percentage for which, in our tests, application + * start-up times didn't suffer from any regression due to tag + * shortage. + */ + /* no more than ~18% of tags for async I/O */ + bfqd->word_depths[1][0] = max(((1U << bt->sb.shift) * 3) >> 4, 1U); + /* no more than ~37% of tags for sync writes (~20% extra tags) */ + bfqd->word_depths[1][1] = max(((1U << bt->sb.shift) * 6) >> 4, 1U); + + for (i = 0; i < 2; i++) + for (j = 0; j < 2; j++) + min_shallow = min(min_shallow, bfqd->word_depths[i][j]); + + return min_shallow; +} + +static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx) +{ + struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; + struct blk_mq_tags *tags = hctx->sched_tags; + unsigned int min_shallow; + + min_shallow = bfq_update_depths(bfqd, tags->bitmap_tags); + sbitmap_queue_min_shallow_depth(tags->bitmap_tags, min_shallow); +} + +static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index) +{ + bfq_depth_updated(hctx); + return 0; +} + +static void bfq_exit_queue(struct elevator_queue *e) +{ + struct bfq_data *bfqd = e->elevator_data; + struct bfq_queue *bfqq, *n; + + hrtimer_cancel(&bfqd->idle_slice_timer); + + spin_lock_irq(&bfqd->lock); + list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) + bfq_deactivate_bfqq(bfqd, bfqq, false, false); + spin_unlock_irq(&bfqd->lock); + + hrtimer_cancel(&bfqd->idle_slice_timer); + + /* release oom-queue reference to root group */ + bfqg_and_blkg_put(bfqd->root_group); + +#ifdef CONFIG_BFQ_GROUP_IOSCHED + blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq); +#else + spin_lock_irq(&bfqd->lock); + bfq_put_async_queues(bfqd, bfqd->root_group); + kfree(bfqd->root_group); + spin_unlock_irq(&bfqd->lock); +#endif + + wbt_enable_default(bfqd->queue); + + kfree(bfqd); +} + +static void bfq_init_root_group(struct bfq_group *root_group, + struct bfq_data *bfqd) +{ + int i; + +#ifdef CONFIG_BFQ_GROUP_IOSCHED + root_group->entity.parent = NULL; + root_group->my_entity = NULL; + root_group->bfqd = bfqd; +#endif + root_group->rq_pos_tree = RB_ROOT; + for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) + root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; + root_group->sched_data.bfq_class_idle_last_service = jiffies; +} + +static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) +{ + struct bfq_data *bfqd; + struct elevator_queue *eq; + + eq = elevator_alloc(q, e); + if (!eq) + return -ENOMEM; + + bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node); + if (!bfqd) { + kobject_put(&eq->kobj); + return -ENOMEM; + } + eq->elevator_data = bfqd; + + spin_lock_irq(&q->queue_lock); + q->elevator = eq; + spin_unlock_irq(&q->queue_lock); + + /* + * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. + * Grab a permanent reference to it, so that the normal code flow + * will not attempt to free it. + */ + bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0); + bfqd->oom_bfqq.ref++; + bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO; + bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE; + bfqd->oom_bfqq.entity.new_weight = + bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio); + + /* oom_bfqq does not participate to bursts */ + bfq_clear_bfqq_just_created(&bfqd->oom_bfqq); + + /* + * Trigger weight initialization, according to ioprio, at the + * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio + * class won't be changed any more. + */ + bfqd->oom_bfqq.entity.prio_changed = 1; + + bfqd->queue = q; + + INIT_LIST_HEAD(&bfqd->dispatch); + + hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC, + HRTIMER_MODE_REL); + bfqd->idle_slice_timer.function = bfq_idle_slice_timer; + + bfqd->queue_weights_tree = RB_ROOT_CACHED; + bfqd->num_groups_with_pending_reqs = 0; + + INIT_LIST_HEAD(&bfqd->active_list); + INIT_LIST_HEAD(&bfqd->idle_list); + INIT_HLIST_HEAD(&bfqd->burst_list); + + bfqd->hw_tag = -1; + bfqd->nonrot_with_queueing = blk_queue_nonrot(bfqd->queue); + + bfqd->bfq_max_budget = bfq_default_max_budget; + + bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0]; + bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1]; + bfqd->bfq_back_max = bfq_back_max; + bfqd->bfq_back_penalty = bfq_back_penalty; + bfqd->bfq_slice_idle = bfq_slice_idle; + bfqd->bfq_timeout = bfq_timeout; + + bfqd->bfq_requests_within_timer = 120; + + bfqd->bfq_large_burst_thresh = 8; + bfqd->bfq_burst_interval = msecs_to_jiffies(180); + + bfqd->low_latency = true; + + /* + * Trade-off between responsiveness and fairness. + */ + bfqd->bfq_wr_coeff = 30; + bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300); + bfqd->bfq_wr_max_time = 0; + bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000); + bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500); + bfqd->bfq_wr_max_softrt_rate = 7000; /* + * Approximate rate required + * to playback or record a + * high-definition compressed + * video. + */ + bfqd->wr_busy_queues = 0; + + /* + * Begin by assuming, optimistically, that the device peak + * rate is equal to 2/3 of the highest reference rate. + */ + bfqd->rate_dur_prod = ref_rate[blk_queue_nonrot(bfqd->queue)] * + ref_wr_duration[blk_queue_nonrot(bfqd->queue)]; + bfqd->peak_rate = ref_rate[blk_queue_nonrot(bfqd->queue)] * 2 / 3; + + spin_lock_init(&bfqd->lock); + + /* + * The invocation of the next bfq_create_group_hierarchy + * function is the head of a chain of function calls + * (bfq_create_group_hierarchy->blkcg_activate_policy-> + * blk_mq_freeze_queue) that may lead to the invocation of the + * has_work hook function. For this reason, + * bfq_create_group_hierarchy is invoked only after all + * scheduler data has been initialized, apart from the fields + * that can be initialized only after invoking + * bfq_create_group_hierarchy. This, in particular, enables + * has_work to correctly return false. Of course, to avoid + * other inconsistencies, the blk-mq stack must then refrain + * from invoking further scheduler hooks before this init + * function is finished. + */ + bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node); + if (!bfqd->root_group) + goto out_free; + bfq_init_root_group(bfqd->root_group, bfqd); + bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); + + wbt_disable_default(q); + return 0; + +out_free: + kfree(bfqd); + kobject_put(&eq->kobj); + return -ENOMEM; +} + +static void bfq_slab_kill(void) +{ + kmem_cache_destroy(bfq_pool); +} + +static int __init bfq_slab_setup(void) +{ + bfq_pool = KMEM_CACHE(bfq_queue, 0); + if (!bfq_pool) + return -ENOMEM; + return 0; +} + +static ssize_t bfq_var_show(unsigned int var, char *page) +{ + return sprintf(page, "%u\n", var); +} + +static int bfq_var_store(unsigned long *var, const char *page) +{ + unsigned long new_val; + int ret = kstrtoul(page, 10, &new_val); + + if (ret) + return ret; + *var = new_val; + return 0; +} + +#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ +static ssize_t __FUNC(struct elevator_queue *e, char *page) \ +{ \ + struct bfq_data *bfqd = e->elevator_data; \ + u64 __data = __VAR; \ + if (__CONV == 1) \ + __data = jiffies_to_msecs(__data); \ + else if (__CONV == 2) \ + __data = div_u64(__data, NSEC_PER_MSEC); \ + return bfq_var_show(__data, (page)); \ +} +SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2); +SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2); +SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0); +SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0); +SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2); +SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0); +SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1); +SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0); +SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0); +#undef SHOW_FUNCTION + +#define USEC_SHOW_FUNCTION(__FUNC, __VAR) \ +static ssize_t __FUNC(struct elevator_queue *e, char *page) \ +{ \ + struct bfq_data *bfqd = e->elevator_data; \ + u64 __data = __VAR; \ + __data = div_u64(__data, NSEC_PER_USEC); \ + return bfq_var_show(__data, (page)); \ +} +USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle); +#undef USEC_SHOW_FUNCTION + +#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ +static ssize_t \ +__FUNC(struct elevator_queue *e, const char *page, size_t count) \ +{ \ + struct bfq_data *bfqd = e->elevator_data; \ + unsigned long __data, __min = (MIN), __max = (MAX); \ + int ret; \ + \ + ret = bfq_var_store(&__data, (page)); \ + if (ret) \ + return ret; \ + if (__data < __min) \ + __data = __min; \ + else if (__data > __max) \ + __data = __max; \ + if (__CONV == 1) \ + *(__PTR) = msecs_to_jiffies(__data); \ + else if (__CONV == 2) \ + *(__PTR) = (u64)__data * NSEC_PER_MSEC; \ + else \ + *(__PTR) = __data; \ + return count; \ +} +STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1, + INT_MAX, 2); +STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1, + INT_MAX, 2); +STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0); +STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1, + INT_MAX, 0); +STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2); +#undef STORE_FUNCTION + +#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ +static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\ +{ \ + struct bfq_data *bfqd = e->elevator_data; \ + unsigned long __data, __min = (MIN), __max = (MAX); \ + int ret; \ + \ + ret = bfq_var_store(&__data, (page)); \ + if (ret) \ + return ret; \ + if (__data < __min) \ + __data = __min; \ + else if (__data > __max) \ + __data = __max; \ + *(__PTR) = (u64)__data * NSEC_PER_USEC; \ + return count; \ +} +USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0, + UINT_MAX); +#undef USEC_STORE_FUNCTION + +static ssize_t bfq_max_budget_store(struct elevator_queue *e, + const char *page, size_t count) +{ + struct bfq_data *bfqd = e->elevator_data; + unsigned long __data; + int ret; + + ret = bfq_var_store(&__data, (page)); + if (ret) + return ret; + + if (__data == 0) + bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); + else { + if (__data > INT_MAX) + __data = INT_MAX; + bfqd->bfq_max_budget = __data; + } + + bfqd->bfq_user_max_budget = __data; + + return count; +} + +/* + * Leaving this name to preserve name compatibility with cfq + * parameters, but this timeout is used for both sync and async. + */ +static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, + const char *page, size_t count) +{ + struct bfq_data *bfqd = e->elevator_data; + unsigned long __data; + int ret; + + ret = bfq_var_store(&__data, (page)); + if (ret) + return ret; + + if (__data < 1) + __data = 1; + else if (__data > INT_MAX) + __data = INT_MAX; + + bfqd->bfq_timeout = msecs_to_jiffies(__data); + if (bfqd->bfq_user_max_budget == 0) + bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); + + return count; +} + +static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e, + const char *page, size_t count) +{ + struct bfq_data *bfqd = e->elevator_data; + unsigned long __data; + int ret; + + ret = bfq_var_store(&__data, (page)); + if (ret) + return ret; + + if (__data > 1) + __data = 1; + if (!bfqd->strict_guarantees && __data == 1 + && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC) + bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC; + + bfqd->strict_guarantees = __data; + + return count; +} + +static ssize_t bfq_low_latency_store(struct elevator_queue *e, + const char *page, size_t count) +{ + struct bfq_data *bfqd = e->elevator_data; + unsigned long __data; + int ret; + + ret = bfq_var_store(&__data, (page)); + if (ret) + return ret; + + if (__data > 1) + __data = 1; + if (__data == 0 && bfqd->low_latency != 0) + bfq_end_wr(bfqd); + bfqd->low_latency = __data; + + return count; +} + +#define BFQ_ATTR(name) \ + __ATTR(name, 0644, bfq_##name##_show, bfq_##name##_store) + +static struct elv_fs_entry bfq_attrs[] = { + BFQ_ATTR(fifo_expire_sync), + BFQ_ATTR(fifo_expire_async), + BFQ_ATTR(back_seek_max), + BFQ_ATTR(back_seek_penalty), + BFQ_ATTR(slice_idle), + BFQ_ATTR(slice_idle_us), + BFQ_ATTR(max_budget), + BFQ_ATTR(timeout_sync), + BFQ_ATTR(strict_guarantees), + BFQ_ATTR(low_latency), + __ATTR_NULL +}; + +static struct elevator_type iosched_bfq_mq = { + .ops = { + .limit_depth = bfq_limit_depth, + .prepare_request = bfq_prepare_request, + .requeue_request = bfq_finish_requeue_request, + .finish_request = bfq_finish_requeue_request, + .exit_icq = bfq_exit_icq, + .insert_requests = bfq_insert_requests, + .dispatch_request = bfq_dispatch_request, + .next_request = elv_rb_latter_request, + .former_request = elv_rb_former_request, + .allow_merge = bfq_allow_bio_merge, + .bio_merge = bfq_bio_merge, + .request_merge = bfq_request_merge, + .requests_merged = bfq_requests_merged, + .request_merged = bfq_request_merged, + .has_work = bfq_has_work, + .depth_updated = bfq_depth_updated, + .init_hctx = bfq_init_hctx, + .init_sched = bfq_init_queue, + .exit_sched = bfq_exit_queue, + }, + + .icq_size = sizeof(struct bfq_io_cq), + .icq_align = __alignof__(struct bfq_io_cq), + .elevator_attrs = bfq_attrs, + .elevator_name = "bfq", + .elevator_owner = THIS_MODULE, +}; +MODULE_ALIAS("bfq-iosched"); + +static int __init bfq_init(void) +{ + int ret; + +#ifdef CONFIG_BFQ_GROUP_IOSCHED + ret = blkcg_policy_register(&blkcg_policy_bfq); + if (ret) + return ret; +#endif + + ret = -ENOMEM; + if (bfq_slab_setup()) + goto err_pol_unreg; + + /* + * Times to load large popular applications for the typical + * systems installed on the reference devices (see the + * comments before the definition of the next + * array). Actually, we use slightly lower values, as the + * estimated peak rate tends to be smaller than the actual + * peak rate. The reason for this last fact is that estimates + * are computed over much shorter time intervals than the long + * intervals typically used for benchmarking. Why? First, to + * adapt more quickly to variations. Second, because an I/O + * scheduler cannot rely on a peak-rate-evaluation workload to + * be run for a long time. + */ + ref_wr_duration[0] = msecs_to_jiffies(7000); /* actually 8 sec */ + ref_wr_duration[1] = msecs_to_jiffies(2500); /* actually 3 sec */ + + ret = elv_register(&iosched_bfq_mq); + if (ret) + goto slab_kill; + + return 0; + +slab_kill: + bfq_slab_kill(); +err_pol_unreg: +#ifdef CONFIG_BFQ_GROUP_IOSCHED + blkcg_policy_unregister(&blkcg_policy_bfq); +#endif + return ret; +} + +static void __exit bfq_exit(void) +{ + elv_unregister(&iosched_bfq_mq); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + blkcg_policy_unregister(&blkcg_policy_bfq); +#endif + bfq_slab_kill(); +} + +module_init(bfq_init); +module_exit(bfq_exit); + +MODULE_AUTHOR("Paolo Valente"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("MQ Budget Fair Queueing I/O Scheduler"); diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h new file mode 100644 index 000000000..2a4a6f44e --- /dev/null +++ b/block/bfq-iosched.h @@ -0,0 +1,1107 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/* + * Header file for the BFQ I/O scheduler: data structures and + * prototypes of interface functions among BFQ components. + */ +#ifndef _BFQ_H +#define _BFQ_H + +#include +#include +#include + +#include "blk-cgroup-rwstat.h" + +#define BFQ_IOPRIO_CLASSES 3 +#define BFQ_CL_IDLE_TIMEOUT (HZ/5) + +#define BFQ_MIN_WEIGHT 1 +#define BFQ_MAX_WEIGHT 1000 +#define BFQ_WEIGHT_CONVERSION_COEFF 10 + +#define BFQ_DEFAULT_QUEUE_IOPRIO 4 + +#define BFQ_WEIGHT_LEGACY_DFL 100 +#define BFQ_DEFAULT_GRP_IOPRIO 0 +#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE + +#define MAX_PID_STR_LENGTH 12 + +/* + * Soft real-time applications are extremely more latency sensitive + * than interactive ones. Over-raise the weight of the former to + * privilege them against the latter. + */ +#define BFQ_SOFTRT_WEIGHT_FACTOR 100 + +struct bfq_entity; + +/** + * struct bfq_service_tree - per ioprio_class service tree. + * + * Each service tree represents a B-WF2Q+ scheduler on its own. Each + * ioprio_class has its own independent scheduler, and so its own + * bfq_service_tree. All the fields are protected by the queue lock + * of the containing bfqd. + */ +struct bfq_service_tree { + /* tree for active entities (i.e., those backlogged) */ + struct rb_root active; + /* tree for idle entities (i.e., not backlogged, with V < F_i)*/ + struct rb_root idle; + + /* idle entity with minimum F_i */ + struct bfq_entity *first_idle; + /* idle entity with maximum F_i */ + struct bfq_entity *last_idle; + + /* scheduler virtual time */ + u64 vtime; + /* scheduler weight sum; active and idle entities contribute to it */ + unsigned long wsum; +}; + +/** + * struct bfq_sched_data - multi-class scheduler. + * + * bfq_sched_data is the basic scheduler queue. It supports three + * ioprio_classes, and can be used either as a toplevel queue or as an + * intermediate queue in a hierarchical setup. + * + * The supported ioprio_classes are the same as in CFQ, in descending + * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. + * Requests from higher priority queues are served before all the + * requests from lower priority queues; among requests of the same + * queue requests are served according to B-WF2Q+. + * + * The schedule is implemented by the service trees, plus the field + * @next_in_service, which points to the entity on the active trees + * that will be served next, if 1) no changes in the schedule occurs + * before the current in-service entity is expired, 2) the in-service + * queue becomes idle when it expires, and 3) if the entity pointed by + * in_service_entity is not a queue, then the in-service child entity + * of the entity pointed by in_service_entity becomes idle on + * expiration. This peculiar definition allows for the following + * optimization, not yet exploited: while a given entity is still in + * service, we already know which is the best candidate for next + * service among the other active entities in the same parent + * entity. We can then quickly compare the timestamps of the + * in-service entity with those of such best candidate. + * + * All fields are protected by the lock of the containing bfqd. + */ +struct bfq_sched_data { + /* entity in service */ + struct bfq_entity *in_service_entity; + /* head-of-line entity (see comments above) */ + struct bfq_entity *next_in_service; + /* array of service trees, one per ioprio_class */ + struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; + /* last time CLASS_IDLE was served */ + unsigned long bfq_class_idle_last_service; + +}; + +/** + * struct bfq_weight_counter - counter of the number of all active queues + * with a given weight. + */ +struct bfq_weight_counter { + unsigned int weight; /* weight of the queues this counter refers to */ + unsigned int num_active; /* nr of active queues with this weight */ + /* + * Weights tree member (see bfq_data's @queue_weights_tree) + */ + struct rb_node weights_node; +}; + +/** + * struct bfq_entity - schedulable entity. + * + * A bfq_entity is used to represent either a bfq_queue (leaf node in the + * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each + * entity belongs to the sched_data of the parent group in the cgroup + * hierarchy. Non-leaf entities have also their own sched_data, stored + * in @my_sched_data. + * + * Each entity stores independently its priority values; this would + * allow different weights on different devices, but this + * functionality is not exported to userspace by now. Priorities and + * weights are updated lazily, first storing the new values into the + * new_* fields, then setting the @prio_changed flag. As soon as + * there is a transition in the entity state that allows the priority + * update to take place the effective and the requested priority + * values are synchronized. + * + * Unless cgroups are used, the weight value is calculated from the + * ioprio to export the same interface as CFQ. When dealing with + * "well-behaved" queues (i.e., queues that do not spend too much + * time to consume their budget and have true sequential behavior, and + * when there are no external factors breaking anticipation) the + * relative weights at each level of the cgroups hierarchy should be + * guaranteed. All the fields are protected by the queue lock of the + * containing bfqd. + */ +struct bfq_entity { + /* service_tree member */ + struct rb_node rb_node; + + /* + * Flag, true if the entity is on a tree (either the active or + * the idle one of its service_tree) or is in service. + */ + bool on_st_or_in_serv; + + /* B-WF2Q+ start and finish timestamps [sectors/weight] */ + u64 start, finish; + + /* tree the entity is enqueued into; %NULL if not on a tree */ + struct rb_root *tree; + + /* + * minimum start time of the (active) subtree rooted at this + * entity; used for O(log N) lookups into active trees + */ + u64 min_start; + + /* amount of service received during the last service slot */ + int service; + + /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */ + int budget; + + /* device weight, if non-zero, it overrides the default weight of + * bfq_group_data */ + int dev_weight; + /* weight of the queue */ + int weight; + /* next weight if a change is in progress */ + int new_weight; + + /* original weight, used to implement weight boosting */ + int orig_weight; + + /* parent entity, for hierarchical scheduling */ + struct bfq_entity *parent; + + /* + * For non-leaf nodes in the hierarchy, the associated + * scheduler queue, %NULL on leaf nodes. + */ + struct bfq_sched_data *my_sched_data; + /* the scheduler queue this entity belongs to */ + struct bfq_sched_data *sched_data; + + /* flag, set to request a weight, ioprio or ioprio_class change */ + int prio_changed; + + /* flag, set if the entity is counted in groups_with_pending_reqs */ + bool in_groups_with_pending_reqs; +}; + +struct bfq_group; + +/** + * struct bfq_ttime - per process thinktime stats. + */ +struct bfq_ttime { + /* completion time of the last request */ + u64 last_end_request; + + /* total process thinktime */ + u64 ttime_total; + /* number of thinktime samples */ + unsigned long ttime_samples; + /* average process thinktime */ + u64 ttime_mean; +}; + +/** + * struct bfq_queue - leaf schedulable entity. + * + * A bfq_queue is a leaf request queue; it can be associated with an + * io_context or more, if it is async or shared between cooperating + * processes. @cgroup holds a reference to the cgroup, to be sure that it + * does not disappear while a bfqq still references it (mostly to avoid + * races between request issuing and task migration followed by cgroup + * destruction). + * All the fields are protected by the queue lock of the containing bfqd. + */ +struct bfq_queue { + /* reference counter */ + int ref; + /* parent bfq_data */ + struct bfq_data *bfqd; + + /* current ioprio and ioprio class */ + unsigned short ioprio, ioprio_class; + /* next ioprio and ioprio class if a change is in progress */ + unsigned short new_ioprio, new_ioprio_class; + + /* last total-service-time sample, see bfq_update_inject_limit() */ + u64 last_serv_time_ns; + /* limit for request injection */ + unsigned int inject_limit; + /* last time the inject limit has been decreased, in jiffies */ + unsigned long decrease_time_jif; + + /* + * Shared bfq_queue if queue is cooperating with one or more + * other queues. + */ + struct bfq_queue *new_bfqq; + /* request-position tree member (see bfq_group's @rq_pos_tree) */ + struct rb_node pos_node; + /* request-position tree root (see bfq_group's @rq_pos_tree) */ + struct rb_root *pos_root; + + /* sorted list of pending requests */ + struct rb_root sort_list; + /* if fifo isn't expired, next request to serve */ + struct request *next_rq; + /* number of sync and async requests queued */ + int queued[2]; + /* number of requests currently allocated */ + int allocated; + /* number of pending metadata requests */ + int meta_pending; + /* fifo list of requests in sort_list */ + struct list_head fifo; + + /* entity representing this queue in the scheduler */ + struct bfq_entity entity; + + /* pointer to the weight counter associated with this entity */ + struct bfq_weight_counter *weight_counter; + + /* maximum budget allowed from the feedback mechanism */ + int max_budget; + /* budget expiration (in jiffies) */ + unsigned long budget_timeout; + + /* number of requests on the dispatch list or inside driver */ + int dispatched; + + /* status flags */ + unsigned long flags; + + /* node for active/idle bfqq list inside parent bfqd */ + struct list_head bfqq_list; + + /* associated @bfq_ttime struct */ + struct bfq_ttime ttime; + + /* bit vector: a 1 for each seeky requests in history */ + u32 seek_history; + + /* node for the device's burst list */ + struct hlist_node burst_list_node; + + /* position of the last request enqueued */ + sector_t last_request_pos; + + /* Number of consecutive pairs of request completion and + * arrival, such that the queue becomes idle after the + * completion, but the next request arrives within an idle + * time slice; used only if the queue's IO_bound flag has been + * cleared. + */ + unsigned int requests_within_timer; + + /* pid of the process owning the queue, used for logging purposes */ + pid_t pid; + + /* + * Pointer to the bfq_io_cq owning the bfq_queue, set to %NULL + * if the queue is shared. + */ + struct bfq_io_cq *bic; + + /* current maximum weight-raising time for this queue */ + unsigned long wr_cur_max_time; + /* + * Minimum time instant such that, only if a new request is + * enqueued after this time instant in an idle @bfq_queue with + * no outstanding requests, then the task associated with the + * queue it is deemed as soft real-time (see the comments on + * the function bfq_bfqq_softrt_next_start()) + */ + unsigned long soft_rt_next_start; + /* + * Start time of the current weight-raising period if + * the @bfq-queue is being weight-raised, otherwise + * finish time of the last weight-raising period. + */ + unsigned long last_wr_start_finish; + /* factor by which the weight of this queue is multiplied */ + unsigned int wr_coeff; + /* + * Time of the last transition of the @bfq_queue from idle to + * backlogged. + */ + unsigned long last_idle_bklogged; + /* + * Cumulative service received from the @bfq_queue since the + * last transition from idle to backlogged. + */ + unsigned long service_from_backlogged; + /* + * Cumulative service received from the @bfq_queue since its + * last transition to weight-raised state. + */ + unsigned long service_from_wr; + + /* + * Value of wr start time when switching to soft rt + */ + unsigned long wr_start_at_switch_to_srt; + + unsigned long split_time; /* time of last split */ + + unsigned long first_IO_time; /* time of first I/O for this queue */ + + /* max service rate measured so far */ + u32 max_service_rate; + + /* + * Pointer to the waker queue for this queue, i.e., to the + * queue Q such that this queue happens to get new I/O right + * after some I/O request of Q is completed. For details, see + * the comments on the choice of the queue for injection in + * bfq_select_queue(). + */ + struct bfq_queue *waker_bfqq; + /* node for woken_list, see below */ + struct hlist_node woken_list_node; + /* + * Head of the list of the woken queues for this queue, i.e., + * of the list of the queues for which this queue is a waker + * queue. This list is used to reset the waker_bfqq pointer in + * the woken queues when this queue exits. + */ + struct hlist_head woken_list; +}; + +/** + * struct bfq_io_cq - per (request_queue, io_context) structure. + */ +struct bfq_io_cq { + /* associated io_cq structure */ + struct io_cq icq; /* must be the first member */ + /* array of two process queues, the sync and the async */ + struct bfq_queue *bfqq[2]; + /* per (request_queue, blkcg) ioprio */ + int ioprio; +#ifdef CONFIG_BFQ_GROUP_IOSCHED + uint64_t blkcg_serial_nr; /* the current blkcg serial */ +#endif + /* + * Snapshot of the has_short_time flag before merging; taken + * to remember its value while the queue is merged, so as to + * be able to restore it in case of split. + */ + bool saved_has_short_ttime; + /* + * Same purpose as the previous two fields for the I/O bound + * classification of a queue. + */ + bool saved_IO_bound; + + /* + * Same purpose as the previous fields for the value of the + * field keeping the queue's belonging to a large burst + */ + bool saved_in_large_burst; + /* + * True if the queue belonged to a burst list before its merge + * with another cooperating queue. + */ + bool was_in_burst_list; + + /* + * Save the weight when a merge occurs, to be able + * to restore it in case of split. If the weight is not + * correctly resumed when the queue is recycled, + * then the weight of the recycled queue could differ + * from the weight of the original queue. + */ + unsigned int saved_weight; + + /* + * Similar to previous fields: save wr information. + */ + unsigned long saved_wr_coeff; + unsigned long saved_last_wr_start_finish; + unsigned long saved_wr_start_at_switch_to_srt; + unsigned int saved_wr_cur_max_time; + struct bfq_ttime saved_ttime; +}; + +/** + * struct bfq_data - per-device data structure. + * + * All the fields are protected by @lock. + */ +struct bfq_data { + /* device request queue */ + struct request_queue *queue; + /* dispatch queue */ + struct list_head dispatch; + + /* root bfq_group for the device */ + struct bfq_group *root_group; + + /* + * rbtree of weight counters of @bfq_queues, sorted by + * weight. Used to keep track of whether all @bfq_queues have + * the same weight. The tree contains one counter for each + * distinct weight associated to some active and not + * weight-raised @bfq_queue (see the comments to the functions + * bfq_weights_tree_[add|remove] for further details). + */ + struct rb_root_cached queue_weights_tree; + + /* + * Number of groups with at least one descendant process that + * has at least one request waiting for completion. Note that + * this accounts for also requests already dispatched, but not + * yet completed. Therefore this number of groups may differ + * (be larger) than the number of active groups, as a group is + * considered active only if its corresponding entity has + * descendant queues with at least one request queued. This + * number is used to decide whether a scenario is symmetric. + * For a detailed explanation see comments on the computation + * of the variable asymmetric_scenario in the function + * bfq_better_to_idle(). + * + * However, it is hard to compute this number exactly, for + * groups with multiple descendant processes. Consider a group + * that is inactive, i.e., that has no descendant process with + * pending I/O inside BFQ queues. Then suppose that + * num_groups_with_pending_reqs is still accounting for this + * group, because the group has descendant processes with some + * I/O request still in flight. num_groups_with_pending_reqs + * should be decremented when the in-flight request of the + * last descendant process is finally completed (assuming that + * nothing else has changed for the group in the meantime, in + * terms of composition of the group and active/inactive state of child + * groups and processes). To accomplish this, an additional + * pending-request counter must be added to entities, and must + * be updated correctly. To avoid this additional field and operations, + * we resort to the following tradeoff between simplicity and + * accuracy: for an inactive group that is still counted in + * num_groups_with_pending_reqs, we decrement + * num_groups_with_pending_reqs when the first descendant + * process of the group remains with no request waiting for + * completion. + * + * Even this simpler decrement strategy requires a little + * carefulness: to avoid multiple decrements, we flag a group, + * more precisely an entity representing a group, as still + * counted in num_groups_with_pending_reqs when it becomes + * inactive. Then, when the first descendant queue of the + * entity remains with no request waiting for completion, + * num_groups_with_pending_reqs is decremented, and this flag + * is reset. After this flag is reset for the entity, + * num_groups_with_pending_reqs won't be decremented any + * longer in case a new descendant queue of the entity remains + * with no request waiting for completion. + */ + unsigned int num_groups_with_pending_reqs; + + /* + * Per-class (RT, BE, IDLE) number of bfq_queues containing + * requests (including the queue in service, even if it is + * idling). + */ + unsigned int busy_queues[3]; + /* number of weight-raised busy @bfq_queues */ + int wr_busy_queues; + /* number of queued requests */ + int queued; + /* number of requests dispatched and waiting for completion */ + int rq_in_driver; + + /* true if the device is non rotational and performs queueing */ + bool nonrot_with_queueing; + + /* + * Maximum number of requests in driver in the last + * @hw_tag_samples completed requests. + */ + int max_rq_in_driver; + /* number of samples used to calculate hw_tag */ + int hw_tag_samples; + /* flag set to one if the driver is showing a queueing behavior */ + int hw_tag; + + /* number of budgets assigned */ + int budgets_assigned; + + /* + * Timer set when idling (waiting) for the next request from + * the queue in service. + */ + struct hrtimer idle_slice_timer; + + /* bfq_queue in service */ + struct bfq_queue *in_service_queue; + + /* on-disk position of the last served request */ + sector_t last_position; + + /* position of the last served request for the in-service queue */ + sector_t in_serv_last_pos; + + /* time of last request completion (ns) */ + u64 last_completion; + + /* bfqq owning the last completed rq */ + struct bfq_queue *last_completed_rq_bfqq; + + /* time of last transition from empty to non-empty (ns) */ + u64 last_empty_occupied_ns; + + /* + * Flag set to activate the sampling of the total service time + * of a just-arrived first I/O request (see + * bfq_update_inject_limit()). This will cause the setting of + * waited_rq when the request is finally dispatched. + */ + bool wait_dispatch; + /* + * If set, then bfq_update_inject_limit() is invoked when + * waited_rq is eventually completed. + */ + struct request *waited_rq; + /* + * True if some request has been injected during the last service hole. + */ + bool rqs_injected; + + /* time of first rq dispatch in current observation interval (ns) */ + u64 first_dispatch; + /* time of last rq dispatch in current observation interval (ns) */ + u64 last_dispatch; + + /* beginning of the last budget */ + ktime_t last_budget_start; + /* beginning of the last idle slice */ + ktime_t last_idling_start; + unsigned long last_idling_start_jiffies; + + /* number of samples in current observation interval */ + int peak_rate_samples; + /* num of samples of seq dispatches in current observation interval */ + u32 sequential_samples; + /* total num of sectors transferred in current observation interval */ + u64 tot_sectors_dispatched; + /* max rq size seen during current observation interval (sectors) */ + u32 last_rq_max_size; + /* time elapsed from first dispatch in current observ. interval (us) */ + u64 delta_from_first; + /* + * Current estimate of the device peak rate, measured in + * [(sectors/usec) / 2^BFQ_RATE_SHIFT]. The left-shift by + * BFQ_RATE_SHIFT is performed to increase precision in + * fixed-point calculations. + */ + u32 peak_rate; + + /* maximum budget allotted to a bfq_queue before rescheduling */ + int bfq_max_budget; + + /* list of all the bfq_queues active on the device */ + struct list_head active_list; + /* list of all the bfq_queues idle on the device */ + struct list_head idle_list; + + /* + * Timeout for async/sync requests; when it fires, requests + * are served in fifo order. + */ + u64 bfq_fifo_expire[2]; + /* weight of backward seeks wrt forward ones */ + unsigned int bfq_back_penalty; + /* maximum allowed backward seek */ + unsigned int bfq_back_max; + /* maximum idling time */ + u32 bfq_slice_idle; + + /* user-configured max budget value (0 for auto-tuning) */ + int bfq_user_max_budget; + /* + * Timeout for bfq_queues to consume their budget; used to + * prevent seeky queues from imposing long latencies to + * sequential or quasi-sequential ones (this also implies that + * seeky queues cannot receive guarantees in the service + * domain; after a timeout they are charged for the time they + * have been in service, to preserve fairness among them, but + * without service-domain guarantees). + */ + unsigned int bfq_timeout; + + /* + * Number of consecutive requests that must be issued within + * the idle time slice to set again idling to a queue which + * was marked as non-I/O-bound (see the definition of the + * IO_bound flag for further details). + */ + unsigned int bfq_requests_within_timer; + + /* + * Force device idling whenever needed to provide accurate + * service guarantees, without caring about throughput + * issues. CAVEAT: this may even increase latencies, in case + * of useless idling for processes that did stop doing I/O. + */ + bool strict_guarantees; + + /* + * Last time at which a queue entered the current burst of + * queues being activated shortly after each other; for more + * details about this and the following parameters related to + * a burst of activations, see the comments on the function + * bfq_handle_burst. + */ + unsigned long last_ins_in_burst; + /* + * Reference time interval used to decide whether a queue has + * been activated shortly after @last_ins_in_burst. + */ + unsigned long bfq_burst_interval; + /* number of queues in the current burst of queue activations */ + int burst_size; + + /* common parent entity for the queues in the burst */ + struct bfq_entity *burst_parent_entity; + /* Maximum burst size above which the current queue-activation + * burst is deemed as 'large'. + */ + unsigned long bfq_large_burst_thresh; + /* true if a large queue-activation burst is in progress */ + bool large_burst; + /* + * Head of the burst list (as for the above fields, more + * details in the comments on the function bfq_handle_burst). + */ + struct hlist_head burst_list; + + /* if set to true, low-latency heuristics are enabled */ + bool low_latency; + /* + * Maximum factor by which the weight of a weight-raised queue + * is multiplied. + */ + unsigned int bfq_wr_coeff; + /* maximum duration of a weight-raising period (jiffies) */ + unsigned int bfq_wr_max_time; + + /* Maximum weight-raising duration for soft real-time processes */ + unsigned int bfq_wr_rt_max_time; + /* + * Minimum idle period after which weight-raising may be + * reactivated for a queue (in jiffies). + */ + unsigned int bfq_wr_min_idle_time; + /* + * Minimum period between request arrivals after which + * weight-raising may be reactivated for an already busy async + * queue (in jiffies). + */ + unsigned long bfq_wr_min_inter_arr_async; + + /* Max service-rate for a soft real-time queue, in sectors/sec */ + unsigned int bfq_wr_max_softrt_rate; + /* + * Cached value of the product ref_rate*ref_wr_duration, used + * for computing the maximum duration of weight raising + * automatically. + */ + u64 rate_dur_prod; + + /* fallback dummy bfqq for extreme OOM conditions */ + struct bfq_queue oom_bfqq; + + spinlock_t lock; + + /* + * bic associated with the task issuing current bio for + * merging. This and the next field are used as a support to + * be able to perform the bic lookup, needed by bio-merge + * functions, before the scheduler lock is taken, and thus + * avoid taking the request-queue lock while the scheduler + * lock is being held. + */ + struct bfq_io_cq *bio_bic; + /* bfqq associated with the task issuing current bio for merging */ + struct bfq_queue *bio_bfqq; + + /* + * Depth limits used in bfq_limit_depth (see comments on the + * function) + */ + unsigned int word_depths[2][2]; +}; + +enum bfqq_state_flags { + BFQQF_just_created = 0, /* queue just allocated */ + BFQQF_busy, /* has requests or is in service */ + BFQQF_wait_request, /* waiting for a request */ + BFQQF_non_blocking_wait_rq, /* + * waiting for a request + * without idling the device + */ + BFQQF_fifo_expire, /* FIFO checked in this slice */ + BFQQF_has_short_ttime, /* queue has a short think time */ + BFQQF_sync, /* synchronous queue */ + BFQQF_IO_bound, /* + * bfqq has timed-out at least once + * having consumed at most 2/10 of + * its budget + */ + BFQQF_in_large_burst, /* + * bfqq activated in a large burst, + * see comments to bfq_handle_burst. + */ + BFQQF_softrt_update, /* + * may need softrt-next-start + * update + */ + BFQQF_coop, /* bfqq is shared */ + BFQQF_split_coop, /* shared bfqq will be split */ + BFQQF_has_waker /* bfqq has a waker queue */ +}; + +#define BFQ_BFQQ_FNS(name) \ +void bfq_mark_bfqq_##name(struct bfq_queue *bfqq); \ +void bfq_clear_bfqq_##name(struct bfq_queue *bfqq); \ +int bfq_bfqq_##name(const struct bfq_queue *bfqq); + +BFQ_BFQQ_FNS(just_created); +BFQ_BFQQ_FNS(busy); +BFQ_BFQQ_FNS(wait_request); +BFQ_BFQQ_FNS(non_blocking_wait_rq); +BFQ_BFQQ_FNS(fifo_expire); +BFQ_BFQQ_FNS(has_short_ttime); +BFQ_BFQQ_FNS(sync); +BFQ_BFQQ_FNS(IO_bound); +BFQ_BFQQ_FNS(in_large_burst); +BFQ_BFQQ_FNS(coop); +BFQ_BFQQ_FNS(split_coop); +BFQ_BFQQ_FNS(softrt_update); +BFQ_BFQQ_FNS(has_waker); +#undef BFQ_BFQQ_FNS + +/* Expiration reasons. */ +enum bfqq_expiration { + BFQQE_TOO_IDLE = 0, /* + * queue has been idling for + * too long + */ + BFQQE_BUDGET_TIMEOUT, /* budget took too long to be used */ + BFQQE_BUDGET_EXHAUSTED, /* budget consumed */ + BFQQE_NO_MORE_REQUESTS, /* the queue has no more requests */ + BFQQE_PREEMPTED /* preemption in progress */ +}; + +struct bfq_stat { + struct percpu_counter cpu_cnt; + atomic64_t aux_cnt; +}; + +struct bfqg_stats { + /* basic stats */ + struct blkg_rwstat bytes; + struct blkg_rwstat ios; +#ifdef CONFIG_BFQ_CGROUP_DEBUG + /* number of ios merged */ + struct blkg_rwstat merged; + /* total time spent on device in ns, may not be accurate w/ queueing */ + struct blkg_rwstat service_time; + /* total time spent waiting in scheduler queue in ns */ + struct blkg_rwstat wait_time; + /* number of IOs queued up */ + struct blkg_rwstat queued; + /* total disk time and nr sectors dispatched by this group */ + struct bfq_stat time; + /* sum of number of ios queued across all samples */ + struct bfq_stat avg_queue_size_sum; + /* count of samples taken for average */ + struct bfq_stat avg_queue_size_samples; + /* how many times this group has been removed from service tree */ + struct bfq_stat dequeue; + /* total time spent waiting for it to be assigned a timeslice. */ + struct bfq_stat group_wait_time; + /* time spent idling for this blkcg_gq */ + struct bfq_stat idle_time; + /* total time with empty current active q with other requests queued */ + struct bfq_stat empty_time; + /* fields after this shouldn't be cleared on stat reset */ + u64 start_group_wait_time; + u64 start_idle_time; + u64 start_empty_time; + uint16_t flags; +#endif /* CONFIG_BFQ_CGROUP_DEBUG */ +}; + +#ifdef CONFIG_BFQ_GROUP_IOSCHED + +/* + * struct bfq_group_data - per-blkcg storage for the blkio subsystem. + * + * @ps: @blkcg_policy_storage that this structure inherits + * @weight: weight of the bfq_group + */ +struct bfq_group_data { + /* must be the first member */ + struct blkcg_policy_data pd; + + unsigned int weight; +}; + +/** + * struct bfq_group - per (device, cgroup) data structure. + * @entity: schedulable entity to insert into the parent group sched_data. + * @sched_data: own sched_data, to contain child entities (they may be + * both bfq_queues and bfq_groups). + * @bfqd: the bfq_data for the device this group acts upon. + * @async_bfqq: array of async queues for all the tasks belonging to + * the group, one queue per ioprio value per ioprio_class, + * except for the idle class that has only one queue. + * @async_idle_bfqq: async queue for the idle class (ioprio is ignored). + * @my_entity: pointer to @entity, %NULL for the toplevel group; used + * to avoid too many special cases during group creation/ + * migration. + * @stats: stats for this bfqg. + * @active_entities: number of active entities belonging to the group; + * unused for the root group. Used to know whether there + * are groups with more than one active @bfq_entity + * (see the comments to the function + * bfq_bfqq_may_idle()). + * @rq_pos_tree: rbtree sorted by next_request position, used when + * determining if two or more queues have interleaving + * requests (see bfq_find_close_cooperator()). + * + * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup + * there is a set of bfq_groups, each one collecting the lower-level + * entities belonging to the group that are acting on the same device. + * + * Locking works as follows: + * o @bfqd is protected by the queue lock, RCU is used to access it + * from the readers. + * o All the other fields are protected by the @bfqd queue lock. + */ +struct bfq_group { + /* must be the first member */ + struct blkg_policy_data pd; + + /* cached path for this blkg (see comments in bfq_bic_update_cgroup) */ + char blkg_path[128]; + + /* reference counter (see comments in bfq_bic_update_cgroup) */ + int ref; + /* Is bfq_group still online? */ + bool online; + + struct bfq_entity entity; + struct bfq_sched_data sched_data; + + void *bfqd; + + struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; + struct bfq_queue *async_idle_bfqq; + + struct bfq_entity *my_entity; + + int active_entities; + + struct rb_root rq_pos_tree; + + struct bfqg_stats stats; +}; + +#else +struct bfq_group { + struct bfq_entity entity; + struct bfq_sched_data sched_data; + + struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; + struct bfq_queue *async_idle_bfqq; + + struct rb_root rq_pos_tree; +}; +#endif + +struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); + +/* --------------- main algorithm interface ----------------- */ + +#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ + { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) + +extern const int bfq_timeout; + +struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync); +void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync); +struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic); +void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq); +void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct rb_root_cached *root); +void __bfq_weights_tree_remove(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + struct rb_root_cached *root); +void bfq_weights_tree_remove(struct bfq_data *bfqd, + struct bfq_queue *bfqq); +void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq, + bool compensate, enum bfqq_expiration reason); +void bfq_put_queue(struct bfq_queue *bfqq); +void bfq_put_cooperator(struct bfq_queue *bfqq); +void bfq_end_wr_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); +void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq); +void bfq_schedule_dispatch(struct bfq_data *bfqd); +void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); + +/* ------------ end of main algorithm interface -------------- */ + +/* ---------------- cgroups-support interface ---------------- */ + +void bfqg_stats_update_legacy_io(struct request_queue *q, struct request *rq); +void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, + unsigned int op); +void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op); +void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op); +void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns, + u64 io_start_time_ns, unsigned int op); +void bfqg_stats_update_dequeue(struct bfq_group *bfqg); +void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg); +void bfqg_stats_update_idle_time(struct bfq_group *bfqg); +void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg); +void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg); +void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct bfq_group *bfqg); + +void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg); +void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio); +void bfq_end_wr_async(struct bfq_data *bfqd); +struct bfq_group *bfq_bio_bfqg(struct bfq_data *bfqd, struct bio *bio); +struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); +struct bfq_group *bfqq_group(struct bfq_queue *bfqq); +struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node); +void bfqg_and_blkg_put(struct bfq_group *bfqg); + +#ifdef CONFIG_BFQ_GROUP_IOSCHED +extern struct cftype bfq_blkcg_legacy_files[]; +extern struct cftype bfq_blkg_files[]; +extern struct blkcg_policy blkcg_policy_bfq; +#endif + +/* ------------- end of cgroups-support interface ------------- */ + +/* - interface of the internal hierarchical B-WF2Q+ scheduler - */ + +#ifdef CONFIG_BFQ_GROUP_IOSCHED +/* both next loops stop at one of the child entities of the root group */ +#define for_each_entity(entity) \ + for (; entity ; entity = entity->parent) + +/* + * For each iteration, compute parent in advance, so as to be safe if + * entity is deallocated during the iteration. Such a deallocation may + * happen as a consequence of a bfq_put_queue that frees the bfq_queue + * containing entity. + */ +#define for_each_entity_safe(entity, parent) \ + for (; entity && ({ parent = entity->parent; 1; }); entity = parent) + +#else /* CONFIG_BFQ_GROUP_IOSCHED */ +/* + * Next two macros are fake loops when cgroups support is not + * enabled. I fact, in such a case, there is only one level to go up + * (to reach the root group). + */ +#define for_each_entity(entity) \ + for (; entity ; entity = NULL) + +#define for_each_entity_safe(entity, parent) \ + for (parent = NULL; entity ; entity = parent) +#endif /* CONFIG_BFQ_GROUP_IOSCHED */ + +struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq); +struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); +unsigned int bfq_tot_busy_queues(struct bfq_data *bfqd); +struct bfq_service_tree *bfq_entity_service_tree(struct bfq_entity *entity); +struct bfq_entity *bfq_entity_of(struct rb_node *node); +unsigned short bfq_ioprio_to_weight(int ioprio); +void bfq_put_idle_entity(struct bfq_service_tree *st, + struct bfq_entity *entity); +struct bfq_service_tree * +__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, + struct bfq_entity *entity, + bool update_class_too); +void bfq_bfqq_served(struct bfq_queue *bfqq, int served); +void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq, + unsigned long time_ms); +bool __bfq_deactivate_entity(struct bfq_entity *entity, + bool ins_into_idle_tree); +bool next_queue_may_preempt(struct bfq_data *bfqd); +struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd); +bool __bfq_bfqd_reset_in_service(struct bfq_data *bfqd); +void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, + bool ins_into_idle_tree, bool expiration); +void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); +void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, + bool expiration); +void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, + bool expiration); +void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq); + +/* --------------- end of interface of B-WF2Q+ ---------------- */ + +/* Logging facilities. */ +static inline void bfq_pid_to_str(int pid, char *str, int len) +{ + if (pid != -1) + snprintf(str, len, "%d", pid); + else + snprintf(str, len, "SHARED-"); +} + +#ifdef CONFIG_BFQ_GROUP_IOSCHED +struct bfq_group *bfqq_group(struct bfq_queue *bfqq); + +#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ + char pid_str[MAX_PID_STR_LENGTH]; \ + if (likely(!blk_trace_note_message_enabled((bfqd)->queue))) \ + break; \ + bfq_pid_to_str((bfqq)->pid, pid_str, MAX_PID_STR_LENGTH); \ + blk_add_cgroup_trace_msg((bfqd)->queue, \ + bfqg_to_blkg(bfqq_group(bfqq))->blkcg, \ + "bfq%s%c " fmt, pid_str, \ + bfq_bfqq_sync((bfqq)) ? 'S' : 'A', ##args); \ +} while (0) + +#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ + blk_add_cgroup_trace_msg((bfqd)->queue, \ + bfqg_to_blkg(bfqg)->blkcg, fmt, ##args); \ +} while (0) + +#else /* CONFIG_BFQ_GROUP_IOSCHED */ + +#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ + char pid_str[MAX_PID_STR_LENGTH]; \ + if (likely(!blk_trace_note_message_enabled((bfqd)->queue))) \ + break; \ + bfq_pid_to_str((bfqq)->pid, pid_str, MAX_PID_STR_LENGTH); \ + blk_add_trace_msg((bfqd)->queue, "bfq%s%c " fmt, pid_str, \ + bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ + ##args); \ +} while (0) +#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) + +#endif /* CONFIG_BFQ_GROUP_IOSCHED */ + +#define bfq_log(bfqd, fmt, args...) \ + blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) + +#endif /* _BFQ_H */ diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c new file mode 100644 index 000000000..26776bdbd --- /dev/null +++ b/block/bfq-wf2q.c @@ -0,0 +1,1712 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Hierarchical Budget Worst-case Fair Weighted Fair Queueing + * (B-WF2Q+): hierarchical scheduling algorithm by which the BFQ I/O + * scheduler schedules generic entities. The latter can represent + * either single bfq queues (associated with processes) or groups of + * bfq queues (associated with cgroups). + */ +#include "bfq-iosched.h" + +/** + * bfq_gt - compare two timestamps. + * @a: first ts. + * @b: second ts. + * + * Return @a > @b, dealing with wrapping correctly. + */ +static int bfq_gt(u64 a, u64 b) +{ + return (s64)(a - b) > 0; +} + +static struct bfq_entity *bfq_root_active_entity(struct rb_root *tree) +{ + struct rb_node *node = tree->rb_node; + + return rb_entry(node, struct bfq_entity, rb_node); +} + +static unsigned int bfq_class_idx(struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + + return bfqq ? bfqq->ioprio_class - 1 : + BFQ_DEFAULT_GRP_CLASS - 1; +} + +unsigned int bfq_tot_busy_queues(struct bfq_data *bfqd) +{ + return bfqd->busy_queues[0] + bfqd->busy_queues[1] + + bfqd->busy_queues[2]; +} + +static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, + bool expiration); + +static bool bfq_update_parent_budget(struct bfq_entity *next_in_service); + +/** + * bfq_update_next_in_service - update sd->next_in_service + * @sd: sched_data for which to perform the update. + * @new_entity: if not NULL, pointer to the entity whose activation, + * requeueing or repositioning triggered the invocation of + * this function. + * @expiration: id true, this function is being invoked after the + * expiration of the in-service entity + * + * This function is called to update sd->next_in_service, which, in + * its turn, may change as a consequence of the insertion or + * extraction of an entity into/from one of the active trees of + * sd. These insertions/extractions occur as a consequence of + * activations/deactivations of entities, with some activations being + * 'true' activations, and other activations being requeueings (i.e., + * implementing the second, requeueing phase of the mechanism used to + * reposition an entity in its active tree; see comments on + * __bfq_activate_entity and __bfq_requeue_entity for details). In + * both the last two activation sub-cases, new_entity points to the + * just activated or requeued entity. + * + * Returns true if sd->next_in_service changes in such a way that + * entity->parent may become the next_in_service for its parent + * entity. + */ +static bool bfq_update_next_in_service(struct bfq_sched_data *sd, + struct bfq_entity *new_entity, + bool expiration) +{ + struct bfq_entity *next_in_service = sd->next_in_service; + bool parent_sched_may_change = false; + bool change_without_lookup = false; + + /* + * If this update is triggered by the activation, requeueing + * or repositioning of an entity that does not coincide with + * sd->next_in_service, then a full lookup in the active tree + * can be avoided. In fact, it is enough to check whether the + * just-modified entity has the same priority as + * sd->next_in_service, is eligible and has a lower virtual + * finish time than sd->next_in_service. If this compound + * condition holds, then the new entity becomes the new + * next_in_service. Otherwise no change is needed. + */ + if (new_entity && new_entity != sd->next_in_service) { + /* + * Flag used to decide whether to replace + * sd->next_in_service with new_entity. Tentatively + * set to true, and left as true if + * sd->next_in_service is NULL. + */ + change_without_lookup = true; + + /* + * If there is already a next_in_service candidate + * entity, then compare timestamps to decide whether + * to replace sd->service_tree with new_entity. + */ + if (next_in_service) { + unsigned int new_entity_class_idx = + bfq_class_idx(new_entity); + struct bfq_service_tree *st = + sd->service_tree + new_entity_class_idx; + + change_without_lookup = + (new_entity_class_idx == + bfq_class_idx(next_in_service) + && + !bfq_gt(new_entity->start, st->vtime) + && + bfq_gt(next_in_service->finish, + new_entity->finish)); + } + + if (change_without_lookup) + next_in_service = new_entity; + } + + if (!change_without_lookup) /* lookup needed */ + next_in_service = bfq_lookup_next_entity(sd, expiration); + + if (next_in_service) { + bool new_budget_triggers_change = + bfq_update_parent_budget(next_in_service); + + parent_sched_may_change = !sd->next_in_service || + new_budget_triggers_change; + } + + sd->next_in_service = next_in_service; + + if (!next_in_service) + return parent_sched_may_change; + + return parent_sched_may_change; +} + +#ifdef CONFIG_BFQ_GROUP_IOSCHED + +struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) +{ + struct bfq_entity *group_entity = bfqq->entity.parent; + + if (!group_entity) + group_entity = &bfqq->bfqd->root_group->entity; + + return container_of(group_entity, struct bfq_group, entity); +} + +/* + * Returns true if this budget changes may let next_in_service->parent + * become the next_in_service entity for its parent entity. + */ +static bool bfq_update_parent_budget(struct bfq_entity *next_in_service) +{ + struct bfq_entity *bfqg_entity; + struct bfq_group *bfqg; + struct bfq_sched_data *group_sd; + bool ret = false; + + group_sd = next_in_service->sched_data; + + bfqg = container_of(group_sd, struct bfq_group, sched_data); + /* + * bfq_group's my_entity field is not NULL only if the group + * is not the root group. We must not touch the root entity + * as it must never become an in-service entity. + */ + bfqg_entity = bfqg->my_entity; + if (bfqg_entity) { + if (bfqg_entity->budget > next_in_service->budget) + ret = true; + bfqg_entity->budget = next_in_service->budget; + } + + return ret; +} + +/* + * This function tells whether entity stops being a candidate for next + * service, according to the restrictive definition of the field + * next_in_service. In particular, this function is invoked for an + * entity that is about to be set in service. + * + * If entity is a queue, then the entity is no longer a candidate for + * next service according to the that definition, because entity is + * about to become the in-service queue. This function then returns + * true if entity is a queue. + * + * In contrast, entity could still be a candidate for next service if + * it is not a queue, and has more than one active child. In fact, + * even if one of its children is about to be set in service, other + * active children may still be the next to serve, for the parent + * entity, even according to the above definition. As a consequence, a + * non-queue entity is not a candidate for next-service only if it has + * only one active child. And only if this condition holds, then this + * function returns true for a non-queue entity. + */ +static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) +{ + struct bfq_group *bfqg; + + if (bfq_entity_to_bfqq(entity)) + return true; + + bfqg = container_of(entity, struct bfq_group, entity); + + /* + * The field active_entities does not always contain the + * actual number of active children entities: it happens to + * not account for the in-service entity in case the latter is + * removed from its active tree (which may get done after + * invoking the function bfq_no_longer_next_in_service in + * bfq_get_next_queue). Fortunately, here, i.e., while + * bfq_no_longer_next_in_service is not yet completed in + * bfq_get_next_queue, bfq_active_extract has not yet been + * invoked, and thus active_entities still coincides with the + * actual number of active entities. + */ + if (bfqg->active_entities == 1) + return true; + + return false; +} + +#else /* CONFIG_BFQ_GROUP_IOSCHED */ + +struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) +{ + return bfqq->bfqd->root_group; +} + +static bool bfq_update_parent_budget(struct bfq_entity *next_in_service) +{ + return false; +} + +static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) +{ + return true; +} + +#endif /* CONFIG_BFQ_GROUP_IOSCHED */ + +/* + * Shift for timestamp calculations. This actually limits the maximum + * service allowed in one timestamp delta (small shift values increase it), + * the maximum total weight that can be used for the queues in the system + * (big shift values increase it), and the period of virtual time + * wraparounds. + */ +#define WFQ_SERVICE_SHIFT 22 + +struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = NULL; + + if (!entity->my_sched_data) + bfqq = container_of(entity, struct bfq_queue, entity); + + return bfqq; +} + + +/** + * bfq_delta - map service into the virtual time domain. + * @service: amount of service. + * @weight: scale factor (weight of an entity or weight sum). + */ +static u64 bfq_delta(unsigned long service, unsigned long weight) +{ + return div64_ul((u64)service << WFQ_SERVICE_SHIFT, weight); +} + +/** + * bfq_calc_finish - assign the finish time to an entity. + * @entity: the entity to act upon. + * @service: the service to be charged to the entity. + */ +static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + + entity->finish = entity->start + + bfq_delta(service, entity->weight); + + if (bfqq) { + bfq_log_bfqq(bfqq->bfqd, bfqq, + "calc_finish: serv %lu, w %d", + service, entity->weight); + bfq_log_bfqq(bfqq->bfqd, bfqq, + "calc_finish: start %llu, finish %llu, delta %llu", + entity->start, entity->finish, + bfq_delta(service, entity->weight)); + } +} + +/** + * bfq_entity_of - get an entity from a node. + * @node: the node field of the entity. + * + * Convert a node pointer to the relative entity. This is used only + * to simplify the logic of some functions and not as the generic + * conversion mechanism because, e.g., in the tree walking functions, + * the check for a %NULL value would be redundant. + */ +struct bfq_entity *bfq_entity_of(struct rb_node *node) +{ + struct bfq_entity *entity = NULL; + + if (node) + entity = rb_entry(node, struct bfq_entity, rb_node); + + return entity; +} + +/** + * bfq_extract - remove an entity from a tree. + * @root: the tree root. + * @entity: the entity to remove. + */ +static void bfq_extract(struct rb_root *root, struct bfq_entity *entity) +{ + entity->tree = NULL; + rb_erase(&entity->rb_node, root); +} + +/** + * bfq_idle_extract - extract an entity from the idle tree. + * @st: the service tree of the owning @entity. + * @entity: the entity being removed. + */ +static void bfq_idle_extract(struct bfq_service_tree *st, + struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + struct rb_node *next; + + if (entity == st->first_idle) { + next = rb_next(&entity->rb_node); + st->first_idle = bfq_entity_of(next); + } + + if (entity == st->last_idle) { + next = rb_prev(&entity->rb_node); + st->last_idle = bfq_entity_of(next); + } + + bfq_extract(&st->idle, entity); + + if (bfqq) + list_del(&bfqq->bfqq_list); +} + +/** + * bfq_insert - generic tree insertion. + * @root: tree root. + * @entity: entity to insert. + * + * This is used for the idle and the active tree, since they are both + * ordered by finish time. + */ +static void bfq_insert(struct rb_root *root, struct bfq_entity *entity) +{ + struct bfq_entity *entry; + struct rb_node **node = &root->rb_node; + struct rb_node *parent = NULL; + + while (*node) { + parent = *node; + entry = rb_entry(parent, struct bfq_entity, rb_node); + + if (bfq_gt(entry->finish, entity->finish)) + node = &parent->rb_left; + else + node = &parent->rb_right; + } + + rb_link_node(&entity->rb_node, parent, node); + rb_insert_color(&entity->rb_node, root); + + entity->tree = root; +} + +/** + * bfq_update_min - update the min_start field of a entity. + * @entity: the entity to update. + * @node: one of its children. + * + * This function is called when @entity may store an invalid value for + * min_start due to updates to the active tree. The function assumes + * that the subtree rooted at @node (which may be its left or its right + * child) has a valid min_start value. + */ +static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node) +{ + struct bfq_entity *child; + + if (node) { + child = rb_entry(node, struct bfq_entity, rb_node); + if (bfq_gt(entity->min_start, child->min_start)) + entity->min_start = child->min_start; + } +} + +/** + * bfq_update_active_node - recalculate min_start. + * @node: the node to update. + * + * @node may have changed position or one of its children may have moved, + * this function updates its min_start value. The left and right subtrees + * are assumed to hold a correct min_start value. + */ +static void bfq_update_active_node(struct rb_node *node) +{ + struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node); + + entity->min_start = entity->start; + bfq_update_min(entity, node->rb_right); + bfq_update_min(entity, node->rb_left); +} + +/** + * bfq_update_active_tree - update min_start for the whole active tree. + * @node: the starting node. + * + * @node must be the deepest modified node after an update. This function + * updates its min_start using the values held by its children, assuming + * that they did not change, and then updates all the nodes that may have + * changed in the path to the root. The only nodes that may have changed + * are the ones in the path or their siblings. + */ +static void bfq_update_active_tree(struct rb_node *node) +{ + struct rb_node *parent; + +up: + bfq_update_active_node(node); + + parent = rb_parent(node); + if (!parent) + return; + + if (node == parent->rb_left && parent->rb_right) + bfq_update_active_node(parent->rb_right); + else if (parent->rb_left) + bfq_update_active_node(parent->rb_left); + + node = parent; + goto up; +} + +/** + * bfq_active_insert - insert an entity in the active tree of its + * group/device. + * @st: the service tree of the entity. + * @entity: the entity being inserted. + * + * The active tree is ordered by finish time, but an extra key is kept + * per each node, containing the minimum value for the start times of + * its children (and the node itself), so it's possible to search for + * the eligible node with the lowest finish time in logarithmic time. + */ +static void bfq_active_insert(struct bfq_service_tree *st, + struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + struct rb_node *node = &entity->rb_node; +#ifdef CONFIG_BFQ_GROUP_IOSCHED + struct bfq_sched_data *sd = NULL; + struct bfq_group *bfqg = NULL; + struct bfq_data *bfqd = NULL; +#endif + + bfq_insert(&st->active, entity); + + if (node->rb_left) + node = node->rb_left; + else if (node->rb_right) + node = node->rb_right; + + bfq_update_active_tree(node); + +#ifdef CONFIG_BFQ_GROUP_IOSCHED + sd = entity->sched_data; + bfqg = container_of(sd, struct bfq_group, sched_data); + bfqd = (struct bfq_data *)bfqg->bfqd; +#endif + if (bfqq) + list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + if (bfqg != bfqd->root_group) + bfqg->active_entities++; +#endif +} + +/** + * bfq_ioprio_to_weight - calc a weight from an ioprio. + * @ioprio: the ioprio value to convert. + */ +unsigned short bfq_ioprio_to_weight(int ioprio) +{ + return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF; +} + +/** + * bfq_weight_to_ioprio - calc an ioprio from a weight. + * @weight: the weight value to convert. + * + * To preserve as much as possible the old only-ioprio user interface, + * 0 is used as an escape ioprio value for weights (numerically) equal or + * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF. + */ +static unsigned short bfq_weight_to_ioprio(int weight) +{ + return max_t(int, 0, + IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight); +} + +static void bfq_get_entity(struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + + if (bfqq) { + bfqq->ref++; + bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d", + bfqq, bfqq->ref); + } +} + +/** + * bfq_find_deepest - find the deepest node that an extraction can modify. + * @node: the node being removed. + * + * Do the first step of an extraction in an rb tree, looking for the + * node that will replace @node, and returning the deepest node that + * the following modifications to the tree can touch. If @node is the + * last node in the tree return %NULL. + */ +static struct rb_node *bfq_find_deepest(struct rb_node *node) +{ + struct rb_node *deepest; + + if (!node->rb_right && !node->rb_left) + deepest = rb_parent(node); + else if (!node->rb_right) + deepest = node->rb_left; + else if (!node->rb_left) + deepest = node->rb_right; + else { + deepest = rb_next(node); + if (deepest->rb_right) + deepest = deepest->rb_right; + else if (rb_parent(deepest) != node) + deepest = rb_parent(deepest); + } + + return deepest; +} + +/** + * bfq_active_extract - remove an entity from the active tree. + * @st: the service_tree containing the tree. + * @entity: the entity being removed. + */ +static void bfq_active_extract(struct bfq_service_tree *st, + struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + struct rb_node *node; +#ifdef CONFIG_BFQ_GROUP_IOSCHED + struct bfq_sched_data *sd = NULL; + struct bfq_group *bfqg = NULL; + struct bfq_data *bfqd = NULL; +#endif + + node = bfq_find_deepest(&entity->rb_node); + bfq_extract(&st->active, entity); + + if (node) + bfq_update_active_tree(node); + +#ifdef CONFIG_BFQ_GROUP_IOSCHED + sd = entity->sched_data; + bfqg = container_of(sd, struct bfq_group, sched_data); + bfqd = (struct bfq_data *)bfqg->bfqd; +#endif + if (bfqq) + list_del(&bfqq->bfqq_list); +#ifdef CONFIG_BFQ_GROUP_IOSCHED + if (bfqg != bfqd->root_group) + bfqg->active_entities--; +#endif +} + +/** + * bfq_idle_insert - insert an entity into the idle tree. + * @st: the service tree containing the tree. + * @entity: the entity to insert. + */ +static void bfq_idle_insert(struct bfq_service_tree *st, + struct bfq_entity *entity) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + struct bfq_entity *first_idle = st->first_idle; + struct bfq_entity *last_idle = st->last_idle; + + if (!first_idle || bfq_gt(first_idle->finish, entity->finish)) + st->first_idle = entity; + if (!last_idle || bfq_gt(entity->finish, last_idle->finish)) + st->last_idle = entity; + + bfq_insert(&st->idle, entity); + + if (bfqq) + list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list); +} + +/** + * bfq_forget_entity - do not consider entity any longer for scheduling + * @st: the service tree. + * @entity: the entity being removed. + * @is_in_service: true if entity is currently the in-service entity. + * + * Forget everything about @entity. In addition, if entity represents + * a queue, and the latter is not in service, then release the service + * reference to the queue (the one taken through bfq_get_entity). In + * fact, in this case, there is really no more service reference to + * the queue, as the latter is also outside any service tree. If, + * instead, the queue is in service, then __bfq_bfqd_reset_in_service + * will take care of putting the reference when the queue finally + * stops being served. + */ +static void bfq_forget_entity(struct bfq_service_tree *st, + struct bfq_entity *entity, + bool is_in_service) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + + entity->on_st_or_in_serv = false; + st->wsum -= entity->weight; + if (bfqq && !is_in_service) + bfq_put_queue(bfqq); +} + +/** + * bfq_put_idle_entity - release the idle tree ref of an entity. + * @st: service tree for the entity. + * @entity: the entity being released. + */ +void bfq_put_idle_entity(struct bfq_service_tree *st, struct bfq_entity *entity) +{ + bfq_idle_extract(st, entity); + bfq_forget_entity(st, entity, + entity == entity->sched_data->in_service_entity); +} + +/** + * bfq_forget_idle - update the idle tree if necessary. + * @st: the service tree to act upon. + * + * To preserve the global O(log N) complexity we only remove one entry here; + * as the idle tree will not grow indefinitely this can be done safely. + */ +static void bfq_forget_idle(struct bfq_service_tree *st) +{ + struct bfq_entity *first_idle = st->first_idle; + struct bfq_entity *last_idle = st->last_idle; + + if (RB_EMPTY_ROOT(&st->active) && last_idle && + !bfq_gt(last_idle->finish, st->vtime)) { + /* + * Forget the whole idle tree, increasing the vtime past + * the last finish time of idle entities. + */ + st->vtime = last_idle->finish; + } + + if (first_idle && !bfq_gt(first_idle->finish, st->vtime)) + bfq_put_idle_entity(st, first_idle); +} + +struct bfq_service_tree *bfq_entity_service_tree(struct bfq_entity *entity) +{ + struct bfq_sched_data *sched_data = entity->sched_data; + unsigned int idx = bfq_class_idx(entity); + + return sched_data->service_tree + idx; +} + +/* + * Update weight and priority of entity. If update_class_too is true, + * then update the ioprio_class of entity too. + * + * The reason why the update of ioprio_class is controlled through the + * last parameter is as follows. Changing the ioprio class of an + * entity implies changing the destination service trees for that + * entity. If such a change occurred when the entity is already on one + * of the service trees for its previous class, then the state of the + * entity would become more complex: none of the new possible service + * trees for the entity, according to bfq_entity_service_tree(), would + * match any of the possible service trees on which the entity + * is. Complex operations involving these trees, such as entity + * activations and deactivations, should take into account this + * additional complexity. To avoid this issue, this function is + * invoked with update_class_too unset in the points in the code where + * entity may happen to be on some tree. + */ +struct bfq_service_tree * +__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, + struct bfq_entity *entity, + bool update_class_too) +{ + struct bfq_service_tree *new_st = old_st; + + if (entity->prio_changed) { + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + unsigned int prev_weight, new_weight; + struct bfq_data *bfqd = NULL; + struct rb_root_cached *root; +#ifdef CONFIG_BFQ_GROUP_IOSCHED + struct bfq_sched_data *sd; + struct bfq_group *bfqg; +#endif + + if (bfqq) + bfqd = bfqq->bfqd; +#ifdef CONFIG_BFQ_GROUP_IOSCHED + else { + sd = entity->my_sched_data; + bfqg = container_of(sd, struct bfq_group, sched_data); + bfqd = (struct bfq_data *)bfqg->bfqd; + } +#endif + + /* Matches the smp_wmb() in bfq_group_set_weight. */ + smp_rmb(); + old_st->wsum -= entity->weight; + + if (entity->new_weight != entity->orig_weight) { + if (entity->new_weight < BFQ_MIN_WEIGHT || + entity->new_weight > BFQ_MAX_WEIGHT) { + pr_crit("update_weight_prio: new_weight %d\n", + entity->new_weight); + if (entity->new_weight < BFQ_MIN_WEIGHT) + entity->new_weight = BFQ_MIN_WEIGHT; + else + entity->new_weight = BFQ_MAX_WEIGHT; + } + entity->orig_weight = entity->new_weight; + if (bfqq) + bfqq->ioprio = + bfq_weight_to_ioprio(entity->orig_weight); + } + + if (bfqq && update_class_too) + bfqq->ioprio_class = bfqq->new_ioprio_class; + + /* + * Reset prio_changed only if the ioprio_class change + * is not pending any longer. + */ + if (!bfqq || bfqq->ioprio_class == bfqq->new_ioprio_class) + entity->prio_changed = 0; + + /* + * NOTE: here we may be changing the weight too early, + * this will cause unfairness. The correct approach + * would have required additional complexity to defer + * weight changes to the proper time instants (i.e., + * when entity->finish <= old_st->vtime). + */ + new_st = bfq_entity_service_tree(entity); + + prev_weight = entity->weight; + new_weight = entity->orig_weight * + (bfqq ? bfqq->wr_coeff : 1); + /* + * If the weight of the entity changes, and the entity is a + * queue, remove the entity from its old weight counter (if + * there is a counter associated with the entity). + */ + if (prev_weight != new_weight && bfqq) { + root = &bfqd->queue_weights_tree; + __bfq_weights_tree_remove(bfqd, bfqq, root); + } + entity->weight = new_weight; + /* + * Add the entity, if it is not a weight-raised queue, + * to the counter associated with its new weight. + */ + if (prev_weight != new_weight && bfqq && bfqq->wr_coeff == 1) { + /* If we get here, root has been initialized. */ + bfq_weights_tree_add(bfqd, bfqq, root); + } + + new_st->wsum += entity->weight; + + if (new_st != old_st) + entity->start = new_st->vtime; + } + + return new_st; +} + +/** + * bfq_bfqq_served - update the scheduler status after selection for + * service. + * @bfqq: the queue being served. + * @served: bytes to transfer. + * + * NOTE: this can be optimized, as the timestamps of upper level entities + * are synchronized every time a new bfqq is selected for service. By now, + * we keep it to better check consistency. + */ +void bfq_bfqq_served(struct bfq_queue *bfqq, int served) +{ + struct bfq_entity *entity = &bfqq->entity; + struct bfq_service_tree *st; + + if (!bfqq->service_from_backlogged) + bfqq->first_IO_time = jiffies; + + if (bfqq->wr_coeff > 1) + bfqq->service_from_wr += served; + + bfqq->service_from_backlogged += served; + for_each_entity(entity) { + st = bfq_entity_service_tree(entity); + + entity->service += served; + + st->vtime += bfq_delta(served, st->wsum); + bfq_forget_idle(st); + } + bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served); +} + +/** + * bfq_bfqq_charge_time - charge an amount of service equivalent to the length + * of the time interval during which bfqq has been in + * service. + * @bfqd: the device + * @bfqq: the queue that needs a service update. + * @time_ms: the amount of time during which the queue has received service + * + * If a queue does not consume its budget fast enough, then providing + * the queue with service fairness may impair throughput, more or less + * severely. For this reason, queues that consume their budget slowly + * are provided with time fairness instead of service fairness. This + * goal is achieved through the BFQ scheduling engine, even if such an + * engine works in the service, and not in the time domain. The trick + * is charging these queues with an inflated amount of service, equal + * to the amount of service that they would have received during their + * service slot if they had been fast, i.e., if their requests had + * been dispatched at a rate equal to the estimated peak rate. + * + * It is worth noting that time fairness can cause important + * distortions in terms of bandwidth distribution, on devices with + * internal queueing. The reason is that I/O requests dispatched + * during the service slot of a queue may be served after that service + * slot is finished, and may have a total processing time loosely + * correlated with the duration of the service slot. This is + * especially true for short service slots. + */ +void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq, + unsigned long time_ms) +{ + struct bfq_entity *entity = &bfqq->entity; + unsigned long timeout_ms = jiffies_to_msecs(bfq_timeout); + unsigned long bounded_time_ms = min(time_ms, timeout_ms); + int serv_to_charge_for_time = + (bfqd->bfq_max_budget * bounded_time_ms) / timeout_ms; + int tot_serv_to_charge = max(serv_to_charge_for_time, entity->service); + + /* Increase budget to avoid inconsistencies */ + if (tot_serv_to_charge > entity->budget) + entity->budget = tot_serv_to_charge; + + bfq_bfqq_served(bfqq, + max_t(int, 0, tot_serv_to_charge - entity->service)); +} + +static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, + struct bfq_service_tree *st, + bool backshifted) +{ + struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); + + /* + * When this function is invoked, entity is not in any service + * tree, then it is safe to invoke next function with the last + * parameter set (see the comments on the function). + */ + st = __bfq_entity_update_weight_prio(st, entity, true); + bfq_calc_finish(entity, entity->budget); + + /* + * If some queues enjoy backshifting for a while, then their + * (virtual) finish timestamps may happen to become lower and + * lower than the system virtual time. In particular, if + * these queues often happen to be idle for short time + * periods, and during such time periods other queues with + * higher timestamps happen to be busy, then the backshifted + * timestamps of the former queues can become much lower than + * the system virtual time. In fact, to serve the queues with + * higher timestamps while the ones with lower timestamps are + * idle, the system virtual time may be pushed-up to much + * higher values than the finish timestamps of the idle + * queues. As a consequence, the finish timestamps of all new + * or newly activated queues may end up being much larger than + * those of lucky queues with backshifted timestamps. The + * latter queues may then monopolize the device for a lot of + * time. This would simply break service guarantees. + * + * To reduce this problem, push up a little bit the + * backshifted timestamps of the queue associated with this + * entity (only a queue can happen to have the backshifted + * flag set): just enough to let the finish timestamp of the + * queue be equal to the current value of the system virtual + * time. This may introduce a little unfairness among queues + * with backshifted timestamps, but it does not break + * worst-case fairness guarantees. + * + * As a special case, if bfqq is weight-raised, push up + * timestamps much less, to keep very low the probability that + * this push up causes the backshifted finish timestamps of + * weight-raised queues to become higher than the backshifted + * finish timestamps of non weight-raised queues. + */ + if (backshifted && bfq_gt(st->vtime, entity->finish)) { + unsigned long delta = st->vtime - entity->finish; + + if (bfqq) + delta /= bfqq->wr_coeff; + + entity->start += delta; + entity->finish += delta; + } + + bfq_active_insert(st, entity); +} + +/** + * __bfq_activate_entity - handle activation of entity. + * @entity: the entity being activated. + * @non_blocking_wait_rq: true if entity was waiting for a request + * + * Called for a 'true' activation, i.e., if entity is not active and + * one of its children receives a new request. + * + * Basically, this function updates the timestamps of entity and + * inserts entity into its active tree, after possibly extracting it + * from its idle tree. + */ +static void __bfq_activate_entity(struct bfq_entity *entity, + bool non_blocking_wait_rq) +{ + struct bfq_service_tree *st = bfq_entity_service_tree(entity); + bool backshifted = false; + unsigned long long min_vstart; + + /* See comments on bfq_fqq_update_budg_for_activation */ + if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) { + backshifted = true; + min_vstart = entity->finish; + } else + min_vstart = st->vtime; + + if (entity->tree == &st->idle) { + /* + * Must be on the idle tree, bfq_idle_extract() will + * check for that. + */ + bfq_idle_extract(st, entity); + entity->start = bfq_gt(min_vstart, entity->finish) ? + min_vstart : entity->finish; + } else { + /* + * The finish time of the entity may be invalid, and + * it is in the past for sure, otherwise the queue + * would have been on the idle tree. + */ + entity->start = min_vstart; + st->wsum += entity->weight; + /* + * entity is about to be inserted into a service tree, + * and then set in service: get a reference to make + * sure entity does not disappear until it is no + * longer in service or scheduled for service. + */ + bfq_get_entity(entity); + + entity->on_st_or_in_serv = true; + } + +#ifdef CONFIG_BFQ_GROUP_IOSCHED + if (!bfq_entity_to_bfqq(entity)) { /* bfq_group */ + struct bfq_group *bfqg = + container_of(entity, struct bfq_group, entity); + struct bfq_data *bfqd = bfqg->bfqd; + + if (!entity->in_groups_with_pending_reqs) { + entity->in_groups_with_pending_reqs = true; + bfqd->num_groups_with_pending_reqs++; + } + } +#endif + + bfq_update_fin_time_enqueue(entity, st, backshifted); +} + +/** + * __bfq_requeue_entity - handle requeueing or repositioning of an entity. + * @entity: the entity being requeued or repositioned. + * + * Requeueing is needed if this entity stops being served, which + * happens if a leaf descendant entity has expired. On the other hand, + * repositioning is needed if the next_inservice_entity for the child + * entity has changed. See the comments inside the function for + * details. + * + * Basically, this function: 1) removes entity from its active tree if + * present there, 2) updates the timestamps of entity and 3) inserts + * entity back into its active tree (in the new, right position for + * the new values of the timestamps). + */ +static void __bfq_requeue_entity(struct bfq_entity *entity) +{ + struct bfq_sched_data *sd = entity->sched_data; + struct bfq_service_tree *st = bfq_entity_service_tree(entity); + + if (entity == sd->in_service_entity) { + /* + * We are requeueing the current in-service entity, + * which may have to be done for one of the following + * reasons: + * - entity represents the in-service queue, and the + * in-service queue is being requeued after an + * expiration; + * - entity represents a group, and its budget has + * changed because one of its child entities has + * just been either activated or requeued for some + * reason; the timestamps of the entity need then to + * be updated, and the entity needs to be enqueued + * or repositioned accordingly. + * + * In particular, before requeueing, the start time of + * the entity must be moved forward to account for the + * service that the entity has received while in + * service. This is done by the next instructions. The + * finish time will then be updated according to this + * new value of the start time, and to the budget of + * the entity. + */ + bfq_calc_finish(entity, entity->service); + entity->start = entity->finish; + /* + * In addition, if the entity had more than one child + * when set in service, then it was not extracted from + * the active tree. This implies that the position of + * the entity in the active tree may need to be + * changed now, because we have just updated the start + * time of the entity, and we will update its finish + * time in a moment (the requeueing is then, more + * precisely, a repositioning in this case). To + * implement this repositioning, we: 1) dequeue the + * entity here, 2) update the finish time and requeue + * the entity according to the new timestamps below. + */ + if (entity->tree) + bfq_active_extract(st, entity); + } else { /* The entity is already active, and not in service */ + /* + * In this case, this function gets called only if the + * next_in_service entity below this entity has + * changed, and this change has caused the budget of + * this entity to change, which, finally implies that + * the finish time of this entity must be + * updated. Such an update may cause the scheduling, + * i.e., the position in the active tree, of this + * entity to change. We handle this change by: 1) + * dequeueing the entity here, 2) updating the finish + * time and requeueing the entity according to the new + * timestamps below. This is the same approach as the + * non-extracted-entity sub-case above. + */ + bfq_active_extract(st, entity); + } + + bfq_update_fin_time_enqueue(entity, st, false); +} + +static void __bfq_activate_requeue_entity(struct bfq_entity *entity, + struct bfq_sched_data *sd, + bool non_blocking_wait_rq) +{ + struct bfq_service_tree *st = bfq_entity_service_tree(entity); + + if (sd->in_service_entity == entity || entity->tree == &st->active) + /* + * in service or already queued on the active tree, + * requeue or reposition + */ + __bfq_requeue_entity(entity); + else + /* + * Not in service and not queued on its active tree: + * the activity is idle and this is a true activation. + */ + __bfq_activate_entity(entity, non_blocking_wait_rq); +} + + +/** + * bfq_activate_requeue_entity - activate or requeue an entity representing a + * bfq_queue, and activate, requeue or reposition + * all ancestors for which such an update becomes + * necessary. + * @entity: the entity to activate. + * @non_blocking_wait_rq: true if this entity was waiting for a request + * @requeue: true if this is a requeue, which implies that bfqq is + * being expired; thus ALL its ancestors stop being served and must + * therefore be requeued + * @expiration: true if this function is being invoked in the expiration path + * of the in-service queue + */ +static void bfq_activate_requeue_entity(struct bfq_entity *entity, + bool non_blocking_wait_rq, + bool requeue, bool expiration) +{ + struct bfq_sched_data *sd; + + for_each_entity(entity) { + sd = entity->sched_data; + __bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq); + + if (!bfq_update_next_in_service(sd, entity, expiration) && + !requeue) + break; + } +} + +/** + * __bfq_deactivate_entity - update sched_data and service trees for + * entity, so as to represent entity as inactive + * @entity: the entity being deactivated. + * @ins_into_idle_tree: if false, the entity will not be put into the + * idle tree. + * + * If necessary and allowed, puts entity into the idle tree. NOTE: + * entity may be on no tree if in service. + */ +bool __bfq_deactivate_entity(struct bfq_entity *entity, bool ins_into_idle_tree) +{ + struct bfq_sched_data *sd = entity->sched_data; + struct bfq_service_tree *st; + bool is_in_service; + + if (!entity->on_st_or_in_serv) /* + * entity never activated, or + * already inactive + */ + return false; + + /* + * If we get here, then entity is active, which implies that + * bfq_group_set_parent has already been invoked for the group + * represented by entity. Therefore, the field + * entity->sched_data has been set, and we can safely use it. + */ + st = bfq_entity_service_tree(entity); + is_in_service = entity == sd->in_service_entity; + + bfq_calc_finish(entity, entity->service); + + if (is_in_service) + sd->in_service_entity = NULL; + else + /* + * Non in-service entity: nobody will take care of + * resetting its service counter on expiration. Do it + * now. + */ + entity->service = 0; + + if (entity->tree == &st->active) + bfq_active_extract(st, entity); + else if (!is_in_service && entity->tree == &st->idle) + bfq_idle_extract(st, entity); + + if (!ins_into_idle_tree || !bfq_gt(entity->finish, st->vtime)) + bfq_forget_entity(st, entity, is_in_service); + else + bfq_idle_insert(st, entity); + + return true; +} + +/** + * bfq_deactivate_entity - deactivate an entity representing a bfq_queue. + * @entity: the entity to deactivate. + * @ins_into_idle_tree: true if the entity can be put into the idle tree + * @expiration: true if this function is being invoked in the expiration path + * of the in-service queue + */ +static void bfq_deactivate_entity(struct bfq_entity *entity, + bool ins_into_idle_tree, + bool expiration) +{ + struct bfq_sched_data *sd; + struct bfq_entity *parent = NULL; + + for_each_entity_safe(entity, parent) { + sd = entity->sched_data; + + if (!__bfq_deactivate_entity(entity, ins_into_idle_tree)) { + /* + * entity is not in any tree any more, so + * this deactivation is a no-op, and there is + * nothing to change for upper-level entities + * (in case of expiration, this can never + * happen). + */ + return; + } + + if (sd->next_in_service == entity) + /* + * entity was the next_in_service entity, + * then, since entity has just been + * deactivated, a new one must be found. + */ + bfq_update_next_in_service(sd, NULL, expiration); + + if (sd->next_in_service || sd->in_service_entity) { + /* + * The parent entity is still active, because + * either next_in_service or in_service_entity + * is not NULL. So, no further upwards + * deactivation must be performed. Yet, + * next_in_service has changed. Then the + * schedule does need to be updated upwards. + * + * NOTE If in_service_entity is not NULL, then + * next_in_service may happen to be NULL, + * although the parent entity is evidently + * active. This happens if 1) the entity + * pointed by in_service_entity is the only + * active entity in the parent entity, and 2) + * according to the definition of + * next_in_service, the in_service_entity + * cannot be considered as + * next_in_service. See the comments on the + * definition of next_in_service for details. + */ + break; + } + + /* + * If we get here, then the parent is no more + * backlogged and we need to propagate the + * deactivation upwards. Thus let the loop go on. + */ + + /* + * Also let parent be queued into the idle tree on + * deactivation, to preserve service guarantees, and + * assuming that who invoked this function does not + * need parent entities too to be removed completely. + */ + ins_into_idle_tree = true; + } + + /* + * If the deactivation loop is fully executed, then there are + * no more entities to touch and next loop is not executed at + * all. Otherwise, requeue remaining entities if they are + * about to stop receiving service, or reposition them if this + * is not the case. + */ + entity = parent; + for_each_entity(entity) { + /* + * Invoke __bfq_requeue_entity on entity, even if + * already active, to requeue/reposition it in the + * active tree (because sd->next_in_service has + * changed) + */ + __bfq_requeue_entity(entity); + + sd = entity->sched_data; + if (!bfq_update_next_in_service(sd, entity, expiration) && + !expiration) + /* + * next_in_service unchanged or not causing + * any change in entity->parent->sd, and no + * requeueing needed for expiration: stop + * here. + */ + break; + } +} + +/** + * bfq_calc_vtime_jump - compute the value to which the vtime should jump, + * if needed, to have at least one entity eligible. + * @st: the service tree to act upon. + * + * Assumes that st is not empty. + */ +static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st) +{ + struct bfq_entity *root_entity = bfq_root_active_entity(&st->active); + + if (bfq_gt(root_entity->min_start, st->vtime)) + return root_entity->min_start; + + return st->vtime; +} + +static void bfq_update_vtime(struct bfq_service_tree *st, u64 new_value) +{ + if (new_value > st->vtime) { + st->vtime = new_value; + bfq_forget_idle(st); + } +} + +/** + * bfq_first_active_entity - find the eligible entity with + * the smallest finish time + * @st: the service tree to select from. + * @vtime: the system virtual to use as a reference for eligibility + * + * This function searches the first schedulable entity, starting from the + * root of the tree and going on the left every time on this side there is + * a subtree with at least one eligible (start <= vtime) entity. The path on + * the right is followed only if a) the left subtree contains no eligible + * entities and b) no eligible entity has been found yet. + */ +static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st, + u64 vtime) +{ + struct bfq_entity *entry, *first = NULL; + struct rb_node *node = st->active.rb_node; + + while (node) { + entry = rb_entry(node, struct bfq_entity, rb_node); +left: + if (!bfq_gt(entry->start, vtime)) + first = entry; + + if (node->rb_left) { + entry = rb_entry(node->rb_left, + struct bfq_entity, rb_node); + if (!bfq_gt(entry->min_start, vtime)) { + node = node->rb_left; + goto left; + } + } + if (first) + break; + node = node->rb_right; + } + + return first; +} + +/** + * __bfq_lookup_next_entity - return the first eligible entity in @st. + * @st: the service tree. + * + * If there is no in-service entity for the sched_data st belongs to, + * then return the entity that will be set in service if: + * 1) the parent entity this st belongs to is set in service; + * 2) no entity belonging to such parent entity undergoes a state change + * that would influence the timestamps of the entity (e.g., becomes idle, + * becomes backlogged, changes its budget, ...). + * + * In this first case, update the virtual time in @st too (see the + * comments on this update inside the function). + * + * In contrast, if there is an in-service entity, then return the + * entity that would be set in service if not only the above + * conditions, but also the next one held true: the currently + * in-service entity, on expiration, + * 1) gets a finish time equal to the current one, or + * 2) is not eligible any more, or + * 3) is idle. + */ +static struct bfq_entity * +__bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service) +{ + struct bfq_entity *entity; + u64 new_vtime; + + if (RB_EMPTY_ROOT(&st->active)) + return NULL; + + /* + * Get the value of the system virtual time for which at + * least one entity is eligible. + */ + new_vtime = bfq_calc_vtime_jump(st); + + /* + * If there is no in-service entity for the sched_data this + * active tree belongs to, then push the system virtual time + * up to the value that guarantees that at least one entity is + * eligible. If, instead, there is an in-service entity, then + * do not make any such update, because there is already an + * eligible entity, namely the in-service one (even if the + * entity is not on st, because it was extracted when set in + * service). + */ + if (!in_service) + bfq_update_vtime(st, new_vtime); + + entity = bfq_first_active_entity(st, new_vtime); + + return entity; +} + +/** + * bfq_lookup_next_entity - return the first eligible entity in @sd. + * @sd: the sched_data. + * @expiration: true if we are on the expiration path of the in-service queue + * + * This function is invoked when there has been a change in the trees + * for sd, and we need to know what is the new next entity to serve + * after this change. + */ +static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, + bool expiration) +{ + struct bfq_service_tree *st = sd->service_tree; + struct bfq_service_tree *idle_class_st = st + (BFQ_IOPRIO_CLASSES - 1); + struct bfq_entity *entity = NULL; + int class_idx = 0; + + /* + * Choose from idle class, if needed to guarantee a minimum + * bandwidth to this class (and if there is some active entity + * in idle class). This should also mitigate + * priority-inversion problems in case a low priority task is + * holding file system resources. + */ + if (time_is_before_jiffies(sd->bfq_class_idle_last_service + + BFQ_CL_IDLE_TIMEOUT)) { + if (!RB_EMPTY_ROOT(&idle_class_st->active)) + class_idx = BFQ_IOPRIO_CLASSES - 1; + /* About to be served if backlogged, or not yet backlogged */ + sd->bfq_class_idle_last_service = jiffies; + } + + /* + * Find the next entity to serve for the highest-priority + * class, unless the idle class needs to be served. + */ + for (; class_idx < BFQ_IOPRIO_CLASSES; class_idx++) { + /* + * If expiration is true, then bfq_lookup_next_entity + * is being invoked as a part of the expiration path + * of the in-service queue. In this case, even if + * sd->in_service_entity is not NULL, + * sd->in_service_entity at this point is actually not + * in service any more, and, if needed, has already + * been properly queued or requeued into the right + * tree. The reason why sd->in_service_entity is still + * not NULL here, even if expiration is true, is that + * sd->in_service_entity is reset as a last step in the + * expiration path. So, if expiration is true, tell + * __bfq_lookup_next_entity that there is no + * sd->in_service_entity. + */ + entity = __bfq_lookup_next_entity(st + class_idx, + sd->in_service_entity && + !expiration); + + if (entity) + break; + } + + if (!entity) + return NULL; + + return entity; +} + +bool next_queue_may_preempt(struct bfq_data *bfqd) +{ + struct bfq_sched_data *sd = &bfqd->root_group->sched_data; + + return sd->next_in_service != sd->in_service_entity; +} + +/* + * Get next queue for service. + */ +struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) +{ + struct bfq_entity *entity = NULL; + struct bfq_sched_data *sd; + struct bfq_queue *bfqq; + + if (bfq_tot_busy_queues(bfqd) == 0) + return NULL; + + /* + * Traverse the path from the root to the leaf entity to + * serve. Set in service all the entities visited along the + * way. + */ + sd = &bfqd->root_group->sched_data; + for (; sd ; sd = entity->my_sched_data) { + /* + * WARNING. We are about to set the in-service entity + * to sd->next_in_service, i.e., to the (cached) value + * returned by bfq_lookup_next_entity(sd) the last + * time it was invoked, i.e., the last time when the + * service order in sd changed as a consequence of the + * activation or deactivation of an entity. In this + * respect, if we execute bfq_lookup_next_entity(sd) + * in this very moment, it may, although with low + * probability, yield a different entity than that + * pointed to by sd->next_in_service. This rare event + * happens in case there was no CLASS_IDLE entity to + * serve for sd when bfq_lookup_next_entity(sd) was + * invoked for the last time, while there is now one + * such entity. + * + * If the above event happens, then the scheduling of + * such entity in CLASS_IDLE is postponed until the + * service of the sd->next_in_service entity + * finishes. In fact, when the latter is expired, + * bfq_lookup_next_entity(sd) gets called again, + * exactly to update sd->next_in_service. + */ + + /* Make next_in_service entity become in_service_entity */ + entity = sd->next_in_service; + sd->in_service_entity = entity; + + /* + * If entity is no longer a candidate for next + * service, then it must be extracted from its active + * tree, so as to make sure that it won't be + * considered when computing next_in_service. See the + * comments on the function + * bfq_no_longer_next_in_service() for details. + */ + if (bfq_no_longer_next_in_service(entity)) + bfq_active_extract(bfq_entity_service_tree(entity), + entity); + + /* + * Even if entity is not to be extracted according to + * the above check, a descendant entity may get + * extracted in one of the next iterations of this + * loop. Such an event could cause a change in + * next_in_service for the level of the descendant + * entity, and thus possibly back to this level. + * + * However, we cannot perform the resulting needed + * update of next_in_service for this level before the + * end of the whole loop, because, to know which is + * the correct next-to-serve candidate entity for each + * level, we need first to find the leaf entity to set + * in service. In fact, only after we know which is + * the next-to-serve leaf entity, we can discover + * whether the parent entity of the leaf entity + * becomes the next-to-serve, and so on. + */ + } + + bfqq = bfq_entity_to_bfqq(entity); + + /* + * We can finally update all next-to-serve entities along the + * path from the leaf entity just set in service to the root. + */ + for_each_entity(entity) { + struct bfq_sched_data *sd = entity->sched_data; + + if (!bfq_update_next_in_service(sd, NULL, false)) + break; + } + + return bfqq; +} + +/* returns true if the in-service queue gets freed */ +bool __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) +{ + struct bfq_queue *in_serv_bfqq = bfqd->in_service_queue; + struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity; + struct bfq_entity *entity = in_serv_entity; + + bfq_clear_bfqq_wait_request(in_serv_bfqq); + hrtimer_try_to_cancel(&bfqd->idle_slice_timer); + bfqd->in_service_queue = NULL; + + /* + * When this function is called, all in-service entities have + * been properly deactivated or requeued, so we can safely + * execute the final step: reset in_service_entity along the + * path from entity to the root. + */ + for_each_entity(entity) + entity->sched_data->in_service_entity = NULL; + + /* + * in_serv_entity is no longer in service, so, if it is in no + * service tree either, then release the service reference to + * the queue it represents (taken with bfq_get_entity). + */ + if (!in_serv_entity->on_st_or_in_serv) { + /* + * If no process is referencing in_serv_bfqq any + * longer, then the service reference may be the only + * reference to the queue. If this is the case, then + * bfqq gets freed here. + */ + int ref = in_serv_bfqq->ref; + bfq_put_queue(in_serv_bfqq); + if (ref == 1) + return true; + } + + return false; +} + +void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, + bool ins_into_idle_tree, bool expiration) +{ + struct bfq_entity *entity = &bfqq->entity; + + bfq_deactivate_entity(entity, ins_into_idle_tree, expiration); +} + +void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + + bfq_activate_requeue_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq), + false, false); + bfq_clear_bfqq_non_blocking_wait_rq(bfqq); +} + +void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, + bool expiration) +{ + struct bfq_entity *entity = &bfqq->entity; + + bfq_activate_requeue_entity(entity, false, + bfqq == bfqd->in_service_queue, expiration); +} + +/* + * Called when the bfqq no longer has requests pending, remove it from + * the service tree. As a special case, it can be invoked during an + * expiration. + */ +void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, + bool expiration) +{ + bfq_log_bfqq(bfqd, bfqq, "del from busy"); + + bfq_clear_bfqq_busy(bfqq); + + bfqd->busy_queues[bfqq->ioprio_class - 1]--; + + if (bfqq->wr_coeff > 1) + bfqd->wr_busy_queues--; + + bfqg_stats_update_dequeue(bfqq_group(bfqq)); + + bfq_deactivate_bfqq(bfqd, bfqq, true, expiration); + + if (!bfqq->dispatched) + bfq_weights_tree_remove(bfqd, bfqq); +} + +/* + * Called when an inactive queue receives a new request. + */ +void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) +{ + bfq_log_bfqq(bfqd, bfqq, "add to busy"); + + bfq_activate_bfqq(bfqd, bfqq); + + bfq_mark_bfqq_busy(bfqq); + bfqd->busy_queues[bfqq->ioprio_class - 1]++; + + if (!bfqq->dispatched) + if (bfqq->wr_coeff == 1) + bfq_weights_tree_add(bfqd, bfqq, + &bfqd->queue_weights_tree); + + if (bfqq->wr_coeff > 1) + bfqd->wr_busy_queues++; +} diff --git a/block/bio-integrity.c b/block/bio-integrity.c new file mode 100644 index 000000000..a4cfc9727 --- /dev/null +++ b/block/bio-integrity.c @@ -0,0 +1,476 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * bio-integrity.c - bio data integrity extensions + * + * Copyright (C) 2007, 2008, 2009 Oracle Corporation + * Written by: Martin K. Petersen + */ + +#include +#include +#include +#include +#include +#include +#include "blk.h" + +#define BIP_INLINE_VECS 4 + +static struct kmem_cache *bip_slab; +static struct workqueue_struct *kintegrityd_wq; + +void blk_flush_integrity(void) +{ + flush_workqueue(kintegrityd_wq); +} + +static void __bio_integrity_free(struct bio_set *bs, + struct bio_integrity_payload *bip) +{ + if (bs && mempool_initialized(&bs->bio_integrity_pool)) { + if (bip->bip_vec) + bvec_free(&bs->bvec_integrity_pool, bip->bip_vec, + bip->bip_slab); + mempool_free(bip, &bs->bio_integrity_pool); + } else { + kfree(bip); + } +} + +/** + * bio_integrity_alloc - Allocate integrity payload and attach it to bio + * @bio: bio to attach integrity metadata to + * @gfp_mask: Memory allocation mask + * @nr_vecs: Number of integrity metadata scatter-gather elements + * + * Description: This function prepares a bio for attaching integrity + * metadata. nr_vecs specifies the maximum number of pages containing + * integrity metadata that can be attached. + */ +struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, + gfp_t gfp_mask, + unsigned int nr_vecs) +{ + struct bio_integrity_payload *bip; + struct bio_set *bs = bio->bi_pool; + unsigned inline_vecs; + + if (WARN_ON_ONCE(bio_has_crypt_ctx(bio))) + return ERR_PTR(-EOPNOTSUPP); + + if (!bs || !mempool_initialized(&bs->bio_integrity_pool)) { + bip = kmalloc(struct_size(bip, bip_inline_vecs, nr_vecs), gfp_mask); + inline_vecs = nr_vecs; + } else { + bip = mempool_alloc(&bs->bio_integrity_pool, gfp_mask); + inline_vecs = BIP_INLINE_VECS; + } + + if (unlikely(!bip)) + return ERR_PTR(-ENOMEM); + + memset(bip, 0, sizeof(*bip)); + + if (nr_vecs > inline_vecs) { + unsigned long idx = 0; + + bip->bip_vec = bvec_alloc(gfp_mask, nr_vecs, &idx, + &bs->bvec_integrity_pool); + if (!bip->bip_vec) + goto err; + bip->bip_max_vcnt = bvec_nr_vecs(idx); + bip->bip_slab = idx; + } else { + bip->bip_vec = bip->bip_inline_vecs; + bip->bip_max_vcnt = inline_vecs; + } + + bip->bip_bio = bio; + bio->bi_integrity = bip; + bio->bi_opf |= REQ_INTEGRITY; + + return bip; +err: + __bio_integrity_free(bs, bip); + return ERR_PTR(-ENOMEM); +} +EXPORT_SYMBOL(bio_integrity_alloc); + +/** + * bio_integrity_free - Free bio integrity payload + * @bio: bio containing bip to be freed + * + * Description: Used to free the integrity portion of a bio. Usually + * called from bio_free(). + */ +void bio_integrity_free(struct bio *bio) +{ + struct bio_integrity_payload *bip = bio_integrity(bio); + struct bio_set *bs = bio->bi_pool; + + if (bip->bip_flags & BIP_BLOCK_INTEGRITY) + kfree(page_address(bip->bip_vec->bv_page) + + bip->bip_vec->bv_offset); + + __bio_integrity_free(bs, bip); + bio->bi_integrity = NULL; + bio->bi_opf &= ~REQ_INTEGRITY; +} + +/** + * bio_integrity_add_page - Attach integrity metadata + * @bio: bio to update + * @page: page containing integrity metadata + * @len: number of bytes of integrity metadata in page + * @offset: start offset within page + * + * Description: Attach a page containing integrity metadata to bio. + */ +int bio_integrity_add_page(struct bio *bio, struct page *page, + unsigned int len, unsigned int offset) +{ + struct bio_integrity_payload *bip = bio_integrity(bio); + struct bio_vec *iv; + + if (bip->bip_vcnt >= bip->bip_max_vcnt) { + printk(KERN_ERR "%s: bip_vec full\n", __func__); + return 0; + } + + iv = bip->bip_vec + bip->bip_vcnt; + + if (bip->bip_vcnt && + bvec_gap_to_prev(bio->bi_disk->queue, + &bip->bip_vec[bip->bip_vcnt - 1], offset)) + return 0; + + iv->bv_page = page; + iv->bv_len = len; + iv->bv_offset = offset; + bip->bip_vcnt++; + + return len; +} +EXPORT_SYMBOL(bio_integrity_add_page); + +/** + * bio_integrity_process - Process integrity metadata for a bio + * @bio: bio to generate/verify integrity metadata for + * @proc_iter: iterator to process + * @proc_fn: Pointer to the relevant processing function + */ +static blk_status_t bio_integrity_process(struct bio *bio, + struct bvec_iter *proc_iter, integrity_processing_fn *proc_fn) +{ + struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); + struct blk_integrity_iter iter; + struct bvec_iter bviter; + struct bio_vec bv; + struct bio_integrity_payload *bip = bio_integrity(bio); + blk_status_t ret = BLK_STS_OK; + void *prot_buf = page_address(bip->bip_vec->bv_page) + + bip->bip_vec->bv_offset; + + iter.disk_name = bio->bi_disk->disk_name; + iter.interval = 1 << bi->interval_exp; + iter.seed = proc_iter->bi_sector; + iter.prot_buf = prot_buf; + + __bio_for_each_segment(bv, bio, bviter, *proc_iter) { + void *kaddr = kmap_atomic(bv.bv_page); + + iter.data_buf = kaddr + bv.bv_offset; + iter.data_size = bv.bv_len; + + ret = proc_fn(&iter); + if (ret) { + kunmap_atomic(kaddr); + return ret; + } + + kunmap_atomic(kaddr); + } + return ret; +} + +/** + * bio_integrity_prep - Prepare bio for integrity I/O + * @bio: bio to prepare + * + * Description: Checks if the bio already has an integrity payload attached. + * If it does, the payload has been generated by another kernel subsystem, + * and we just pass it through. Otherwise allocates integrity payload. + * The bio must have data direction, target device and start sector set priot + * to calling. In the WRITE case, integrity metadata will be generated using + * the block device's integrity function. In the READ case, the buffer + * will be prepared for DMA and a suitable end_io handler set up. + */ +bool bio_integrity_prep(struct bio *bio) +{ + struct bio_integrity_payload *bip; + struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); + struct request_queue *q = bio->bi_disk->queue; + void *buf; + unsigned long start, end; + unsigned int len, nr_pages; + unsigned int bytes, offset, i; + unsigned int intervals; + blk_status_t status; + + if (!bi) + return true; + + if (bio_op(bio) != REQ_OP_READ && bio_op(bio) != REQ_OP_WRITE) + return true; + + if (!bio_sectors(bio)) + return true; + + /* Already protected? */ + if (bio_integrity(bio)) + return true; + + if (bio_data_dir(bio) == READ) { + if (!bi->profile->verify_fn || + !(bi->flags & BLK_INTEGRITY_VERIFY)) + return true; + } else { + if (!bi->profile->generate_fn || + !(bi->flags & BLK_INTEGRITY_GENERATE)) + return true; + } + intervals = bio_integrity_intervals(bi, bio_sectors(bio)); + + /* Allocate kernel buffer for protection data */ + len = intervals * bi->tuple_size; + buf = kmalloc(len, GFP_NOIO | q->bounce_gfp); + status = BLK_STS_RESOURCE; + if (unlikely(buf == NULL)) { + printk(KERN_ERR "could not allocate integrity buffer\n"); + goto err_end_io; + } + + end = (((unsigned long) buf) + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + start = ((unsigned long) buf) >> PAGE_SHIFT; + nr_pages = end - start; + + /* Allocate bio integrity payload and integrity vectors */ + bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages); + if (IS_ERR(bip)) { + printk(KERN_ERR "could not allocate data integrity bioset\n"); + kfree(buf); + status = BLK_STS_RESOURCE; + goto err_end_io; + } + + bip->bip_flags |= BIP_BLOCK_INTEGRITY; + bip->bip_iter.bi_size = len; + bip_set_seed(bip, bio->bi_iter.bi_sector); + + if (bi->flags & BLK_INTEGRITY_IP_CHECKSUM) + bip->bip_flags |= BIP_IP_CHECKSUM; + + /* Map it */ + offset = offset_in_page(buf); + for (i = 0 ; i < nr_pages ; i++) { + int ret; + bytes = PAGE_SIZE - offset; + + if (len <= 0) + break; + + if (bytes > len) + bytes = len; + + ret = bio_integrity_add_page(bio, virt_to_page(buf), + bytes, offset); + + if (ret == 0) { + printk(KERN_ERR "could not attach integrity payload\n"); + status = BLK_STS_RESOURCE; + goto err_end_io; + } + + if (ret < bytes) + break; + + buf += bytes; + len -= bytes; + offset = 0; + } + + /* Auto-generate integrity metadata if this is a write */ + if (bio_data_dir(bio) == WRITE) { + bio_integrity_process(bio, &bio->bi_iter, + bi->profile->generate_fn); + } else { + bip->bio_iter = bio->bi_iter; + } + return true; + +err_end_io: + bio->bi_status = status; + bio_endio(bio); + return false; + +} +EXPORT_SYMBOL(bio_integrity_prep); + +/** + * bio_integrity_verify_fn - Integrity I/O completion worker + * @work: Work struct stored in bio to be verified + * + * Description: This workqueue function is called to complete a READ + * request. The function verifies the transferred integrity metadata + * and then calls the original bio end_io function. + */ +static void bio_integrity_verify_fn(struct work_struct *work) +{ + struct bio_integrity_payload *bip = + container_of(work, struct bio_integrity_payload, bip_work); + struct bio *bio = bip->bip_bio; + struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); + + /* + * At the moment verify is called bio's iterator was advanced + * during split and completion, we need to rewind iterator to + * it's original position. + */ + bio->bi_status = bio_integrity_process(bio, &bip->bio_iter, + bi->profile->verify_fn); + bio_integrity_free(bio); + bio_endio(bio); +} + +/** + * __bio_integrity_endio - Integrity I/O completion function + * @bio: Protected bio + * + * Description: Completion for integrity I/O + * + * Normally I/O completion is done in interrupt context. However, + * verifying I/O integrity is a time-consuming task which must be run + * in process context. This function postpones completion + * accordingly. + */ +bool __bio_integrity_endio(struct bio *bio) +{ + struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); + struct bio_integrity_payload *bip = bio_integrity(bio); + + if (bio_op(bio) == REQ_OP_READ && !bio->bi_status && + (bip->bip_flags & BIP_BLOCK_INTEGRITY) && bi->profile->verify_fn) { + INIT_WORK(&bip->bip_work, bio_integrity_verify_fn); + queue_work(kintegrityd_wq, &bip->bip_work); + return false; + } + + bio_integrity_free(bio); + return true; +} + +/** + * bio_integrity_advance - Advance integrity vector + * @bio: bio whose integrity vector to update + * @bytes_done: number of data bytes that have been completed + * + * Description: This function calculates how many integrity bytes the + * number of completed data bytes correspond to and advances the + * integrity vector accordingly. + */ +void bio_integrity_advance(struct bio *bio, unsigned int bytes_done) +{ + struct bio_integrity_payload *bip = bio_integrity(bio); + struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); + unsigned bytes = bio_integrity_bytes(bi, bytes_done >> 9); + + bip->bip_iter.bi_sector += bio_integrity_intervals(bi, bytes_done >> 9); + bvec_iter_advance(bip->bip_vec, &bip->bip_iter, bytes); +} + +/** + * bio_integrity_trim - Trim integrity vector + * @bio: bio whose integrity vector to update + * + * Description: Used to trim the integrity vector in a cloned bio. + */ +void bio_integrity_trim(struct bio *bio) +{ + struct bio_integrity_payload *bip = bio_integrity(bio); + struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); + + bip->bip_iter.bi_size = bio_integrity_bytes(bi, bio_sectors(bio)); +} +EXPORT_SYMBOL(bio_integrity_trim); + +/** + * bio_integrity_clone - Callback for cloning bios with integrity metadata + * @bio: New bio + * @bio_src: Original bio + * @gfp_mask: Memory allocation mask + * + * Description: Called to allocate a bip when cloning a bio + */ +int bio_integrity_clone(struct bio *bio, struct bio *bio_src, + gfp_t gfp_mask) +{ + struct bio_integrity_payload *bip_src = bio_integrity(bio_src); + struct bio_integrity_payload *bip; + + BUG_ON(bip_src == NULL); + + bip = bio_integrity_alloc(bio, gfp_mask, bip_src->bip_vcnt); + if (IS_ERR(bip)) + return PTR_ERR(bip); + + memcpy(bip->bip_vec, bip_src->bip_vec, + bip_src->bip_vcnt * sizeof(struct bio_vec)); + + bip->bip_vcnt = bip_src->bip_vcnt; + bip->bip_iter = bip_src->bip_iter; + bip->bip_flags = bip_src->bip_flags & ~BIP_BLOCK_INTEGRITY; + + return 0; +} +EXPORT_SYMBOL(bio_integrity_clone); + +int bioset_integrity_create(struct bio_set *bs, int pool_size) +{ + if (mempool_initialized(&bs->bio_integrity_pool)) + return 0; + + if (mempool_init_slab_pool(&bs->bio_integrity_pool, + pool_size, bip_slab)) + return -1; + + if (biovec_init_pool(&bs->bvec_integrity_pool, pool_size)) { + mempool_exit(&bs->bio_integrity_pool); + return -1; + } + + return 0; +} +EXPORT_SYMBOL(bioset_integrity_create); + +void bioset_integrity_free(struct bio_set *bs) +{ + mempool_exit(&bs->bio_integrity_pool); + mempool_exit(&bs->bvec_integrity_pool); +} + +void __init bio_integrity_init(void) +{ + /* + * kintegrityd won't block much but may burn a lot of CPU cycles. + * Make it highpri CPU intensive wq with max concurrency of 1. + */ + kintegrityd_wq = alloc_workqueue("kintegrityd", WQ_MEM_RECLAIM | + WQ_HIGHPRI | WQ_CPU_INTENSIVE, 1); + if (!kintegrityd_wq) + panic("Failed to create kintegrityd\n"); + + bip_slab = kmem_cache_create("bio_integrity_payload", + sizeof(struct bio_integrity_payload) + + sizeof(struct bio_vec) * BIP_INLINE_VECS, + 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); +} diff --git a/block/bio.c b/block/bio.c new file mode 100644 index 000000000..6d6e7b96b --- /dev/null +++ b/block/bio.c @@ -0,0 +1,1684 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2001 Jens Axboe + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "blk.h" +#include "blk-rq-qos.h" + +/* + * Test patch to inline a certain number of bi_io_vec's inside the bio + * itself, to shrink a bio data allocation from two mempool calls to one + */ +#define BIO_INLINE_VECS 4 + +/* + * if you change this list, also change bvec_alloc or things will + * break badly! cannot be bigger than what you can fit into an + * unsigned short + */ +#define BV(x, n) { .nr_vecs = x, .name = "biovec-"#n } +static struct biovec_slab bvec_slabs[BVEC_POOL_NR] __read_mostly = { + BV(1, 1), BV(4, 4), BV(16, 16), BV(64, 64), BV(128, 128), BV(BIO_MAX_PAGES, max), +}; +#undef BV + +/* + * fs_bio_set is the bio_set containing bio and iovec memory pools used by + * IO code that does not need private memory pools. + */ +struct bio_set fs_bio_set; +EXPORT_SYMBOL(fs_bio_set); + +/* + * Our slab pool management + */ +struct bio_slab { + struct kmem_cache *slab; + unsigned int slab_ref; + unsigned int slab_size; + char name[8]; +}; +static DEFINE_MUTEX(bio_slab_lock); +static struct bio_slab *bio_slabs; +static unsigned int bio_slab_nr, bio_slab_max; + +static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size) +{ + unsigned int sz = sizeof(struct bio) + extra_size; + struct kmem_cache *slab = NULL; + struct bio_slab *bslab, *new_bio_slabs; + unsigned int new_bio_slab_max; + unsigned int i, entry = -1; + + mutex_lock(&bio_slab_lock); + + i = 0; + while (i < bio_slab_nr) { + bslab = &bio_slabs[i]; + + if (!bslab->slab && entry == -1) + entry = i; + else if (bslab->slab_size == sz) { + slab = bslab->slab; + bslab->slab_ref++; + break; + } + i++; + } + + if (slab) + goto out_unlock; + + if (bio_slab_nr == bio_slab_max && entry == -1) { + new_bio_slab_max = bio_slab_max << 1; + new_bio_slabs = krealloc(bio_slabs, + new_bio_slab_max * sizeof(struct bio_slab), + GFP_KERNEL); + if (!new_bio_slabs) + goto out_unlock; + bio_slab_max = new_bio_slab_max; + bio_slabs = new_bio_slabs; + } + if (entry == -1) + entry = bio_slab_nr++; + + bslab = &bio_slabs[entry]; + + snprintf(bslab->name, sizeof(bslab->name), "bio-%d", entry); + slab = kmem_cache_create(bslab->name, sz, ARCH_KMALLOC_MINALIGN, + SLAB_HWCACHE_ALIGN, NULL); + if (!slab) + goto out_unlock; + + bslab->slab = slab; + bslab->slab_ref = 1; + bslab->slab_size = sz; +out_unlock: + mutex_unlock(&bio_slab_lock); + return slab; +} + +static void bio_put_slab(struct bio_set *bs) +{ + struct bio_slab *bslab = NULL; + unsigned int i; + + mutex_lock(&bio_slab_lock); + + for (i = 0; i < bio_slab_nr; i++) { + if (bs->bio_slab == bio_slabs[i].slab) { + bslab = &bio_slabs[i]; + break; + } + } + + if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n")) + goto out; + + WARN_ON(!bslab->slab_ref); + + if (--bslab->slab_ref) + goto out; + + kmem_cache_destroy(bslab->slab); + bslab->slab = NULL; + +out: + mutex_unlock(&bio_slab_lock); +} + +unsigned int bvec_nr_vecs(unsigned short idx) +{ + return bvec_slabs[--idx].nr_vecs; +} + +void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx) +{ + if (!idx) + return; + idx--; + + BIO_BUG_ON(idx >= BVEC_POOL_NR); + + if (idx == BVEC_POOL_MAX) { + mempool_free(bv, pool); + } else { + struct biovec_slab *bvs = bvec_slabs + idx; + + kmem_cache_free(bvs->slab, bv); + } +} + +struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx, + mempool_t *pool) +{ + struct bio_vec *bvl; + + /* + * see comment near bvec_array define! + */ + switch (nr) { + case 1: + *idx = 0; + break; + case 2 ... 4: + *idx = 1; + break; + case 5 ... 16: + *idx = 2; + break; + case 17 ... 64: + *idx = 3; + break; + case 65 ... 128: + *idx = 4; + break; + case 129 ... BIO_MAX_PAGES: + *idx = 5; + break; + default: + return NULL; + } + + /* + * idx now points to the pool we want to allocate from. only the + * 1-vec entry pool is mempool backed. + */ + if (*idx == BVEC_POOL_MAX) { +fallback: + bvl = mempool_alloc(pool, gfp_mask); + } else { + struct biovec_slab *bvs = bvec_slabs + *idx; + gfp_t __gfp_mask = gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_IO); + + /* + * Make this allocation restricted and don't dump info on + * allocation failures, since we'll fallback to the mempool + * in case of failure. + */ + __gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; + + /* + * Try a slab allocation. If this fails and __GFP_DIRECT_RECLAIM + * is set, retry with the 1-entry mempool + */ + bvl = kmem_cache_alloc(bvs->slab, __gfp_mask); + if (unlikely(!bvl && (gfp_mask & __GFP_DIRECT_RECLAIM))) { + *idx = BVEC_POOL_MAX; + goto fallback; + } + } + + (*idx)++; + return bvl; +} + +void bio_uninit(struct bio *bio) +{ +#ifdef CONFIG_BLK_CGROUP + if (bio->bi_blkg) { + blkg_put(bio->bi_blkg); + bio->bi_blkg = NULL; + } +#endif + if (bio_integrity(bio)) + bio_integrity_free(bio); + + bio_crypt_free_ctx(bio); +} +EXPORT_SYMBOL(bio_uninit); + +static void bio_free(struct bio *bio) +{ + struct bio_set *bs = bio->bi_pool; + void *p; + + bio_uninit(bio); + + if (bs) { + bvec_free(&bs->bvec_pool, bio->bi_io_vec, BVEC_POOL_IDX(bio)); + + /* + * If we have front padding, adjust the bio pointer before freeing + */ + p = bio; + p -= bs->front_pad; + + mempool_free(p, &bs->bio_pool); + } else { + /* Bio was allocated by bio_kmalloc() */ + kfree(bio); + } +} + +/* + * Users of this function have their own bio allocation. Subsequently, + * they must remember to pair any call to bio_init() with bio_uninit() + * when IO has completed, or when the bio is released. + */ +void bio_init(struct bio *bio, struct bio_vec *table, + unsigned short max_vecs) +{ + memset(bio, 0, sizeof(*bio)); + atomic_set(&bio->__bi_remaining, 1); + atomic_set(&bio->__bi_cnt, 1); + + bio->bi_io_vec = table; + bio->bi_max_vecs = max_vecs; +} +EXPORT_SYMBOL(bio_init); + +/** + * bio_reset - reinitialize a bio + * @bio: bio to reset + * + * Description: + * After calling bio_reset(), @bio will be in the same state as a freshly + * allocated bio returned bio bio_alloc_bioset() - the only fields that are + * preserved are the ones that are initialized by bio_alloc_bioset(). See + * comment in struct bio. + */ +void bio_reset(struct bio *bio) +{ + unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS); + + bio_uninit(bio); + + memset(bio, 0, BIO_RESET_BYTES); + bio->bi_flags = flags; + atomic_set(&bio->__bi_remaining, 1); +} +EXPORT_SYMBOL(bio_reset); + +static struct bio *__bio_chain_endio(struct bio *bio) +{ + struct bio *parent = bio->bi_private; + + if (bio->bi_status && !parent->bi_status) + parent->bi_status = bio->bi_status; + bio_put(bio); + return parent; +} + +static void bio_chain_endio(struct bio *bio) +{ + bio_endio(__bio_chain_endio(bio)); +} + +/** + * bio_chain - chain bio completions + * @bio: the target bio + * @parent: the parent bio of @bio + * + * The caller won't have a bi_end_io called when @bio completes - instead, + * @parent's bi_end_io won't be called until both @parent and @bio have + * completed; the chained bio will also be freed when it completes. + * + * The caller must not set bi_private or bi_end_io in @bio. + */ +void bio_chain(struct bio *bio, struct bio *parent) +{ + BUG_ON(bio->bi_private || bio->bi_end_io); + + bio->bi_private = parent; + bio->bi_end_io = bio_chain_endio; + bio_inc_remaining(parent); +} +EXPORT_SYMBOL(bio_chain); + +static void bio_alloc_rescue(struct work_struct *work) +{ + struct bio_set *bs = container_of(work, struct bio_set, rescue_work); + struct bio *bio; + + while (1) { + spin_lock(&bs->rescue_lock); + bio = bio_list_pop(&bs->rescue_list); + spin_unlock(&bs->rescue_lock); + + if (!bio) + break; + + submit_bio_noacct(bio); + } +} + +static void punt_bios_to_rescuer(struct bio_set *bs) +{ + struct bio_list punt, nopunt; + struct bio *bio; + + if (WARN_ON_ONCE(!bs->rescue_workqueue)) + return; + /* + * In order to guarantee forward progress we must punt only bios that + * were allocated from this bio_set; otherwise, if there was a bio on + * there for a stacking driver higher up in the stack, processing it + * could require allocating bios from this bio_set, and doing that from + * our own rescuer would be bad. + * + * Since bio lists are singly linked, pop them all instead of trying to + * remove from the middle of the list: + */ + + bio_list_init(&punt); + bio_list_init(&nopunt); + + while ((bio = bio_list_pop(¤t->bio_list[0]))) + bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio); + current->bio_list[0] = nopunt; + + bio_list_init(&nopunt); + while ((bio = bio_list_pop(¤t->bio_list[1]))) + bio_list_add(bio->bi_pool == bs ? &punt : &nopunt, bio); + current->bio_list[1] = nopunt; + + spin_lock(&bs->rescue_lock); + bio_list_merge(&bs->rescue_list, &punt); + spin_unlock(&bs->rescue_lock); + + queue_work(bs->rescue_workqueue, &bs->rescue_work); +} + +/** + * bio_alloc_bioset - allocate a bio for I/O + * @gfp_mask: the GFP_* mask given to the slab allocator + * @nr_iovecs: number of iovecs to pre-allocate + * @bs: the bio_set to allocate from. + * + * Description: + * If @bs is NULL, uses kmalloc() to allocate the bio; else the allocation is + * backed by the @bs's mempool. + * + * When @bs is not NULL, if %__GFP_DIRECT_RECLAIM is set then bio_alloc will + * always be able to allocate a bio. This is due to the mempool guarantees. + * To make this work, callers must never allocate more than 1 bio at a time + * from this pool. Callers that need to allocate more than 1 bio must always + * submit the previously allocated bio for IO before attempting to allocate + * a new one. Failure to do so can cause deadlocks under memory pressure. + * + * Note that when running under submit_bio_noacct() (i.e. any block + * driver), bios are not submitted until after you return - see the code in + * submit_bio_noacct() that converts recursion into iteration, to prevent + * stack overflows. + * + * This would normally mean allocating multiple bios under + * submit_bio_noacct() would be susceptible to deadlocks, but we have + * deadlock avoidance code that resubmits any blocked bios from a rescuer + * thread. + * + * However, we do not guarantee forward progress for allocations from other + * mempools. Doing multiple allocations from the same mempool under + * submit_bio_noacct() should be avoided - instead, use bio_set's front_pad + * for per bio allocations. + * + * RETURNS: + * Pointer to new bio on success, NULL on failure. + */ +struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs, + struct bio_set *bs) +{ + gfp_t saved_gfp = gfp_mask; + unsigned front_pad; + unsigned inline_vecs; + struct bio_vec *bvl = NULL; + struct bio *bio; + void *p; + + if (!bs) { + if (nr_iovecs > UIO_MAXIOV) + return NULL; + + p = kmalloc(struct_size(bio, bi_inline_vecs, nr_iovecs), gfp_mask); + front_pad = 0; + inline_vecs = nr_iovecs; + } else { + /* should not use nobvec bioset for nr_iovecs > 0 */ + if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && + nr_iovecs > 0)) + return NULL; + /* + * submit_bio_noacct() converts recursion to iteration; this + * means if we're running beneath it, any bios we allocate and + * submit will not be submitted (and thus freed) until after we + * return. + * + * This exposes us to a potential deadlock if we allocate + * multiple bios from the same bio_set() while running + * underneath submit_bio_noacct(). If we were to allocate + * multiple bios (say a stacking block driver that was splitting + * bios), we would deadlock if we exhausted the mempool's + * reserve. + * + * We solve this, and guarantee forward progress, with a rescuer + * workqueue per bio_set. If we go to allocate and there are + * bios on current->bio_list, we first try the allocation + * without __GFP_DIRECT_RECLAIM; if that fails, we punt those + * bios we would be blocking to the rescuer workqueue before + * we retry with the original gfp_flags. + */ + + if (current->bio_list && + (!bio_list_empty(¤t->bio_list[0]) || + !bio_list_empty(¤t->bio_list[1])) && + bs->rescue_workqueue) + gfp_mask &= ~__GFP_DIRECT_RECLAIM; + + p = mempool_alloc(&bs->bio_pool, gfp_mask); + if (!p && gfp_mask != saved_gfp) { + punt_bios_to_rescuer(bs); + gfp_mask = saved_gfp; + p = mempool_alloc(&bs->bio_pool, gfp_mask); + } + + front_pad = bs->front_pad; + inline_vecs = BIO_INLINE_VECS; + } + + if (unlikely(!p)) + return NULL; + + bio = p + front_pad; + bio_init(bio, NULL, 0); + + if (nr_iovecs > inline_vecs) { + unsigned long idx = 0; + + bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, &bs->bvec_pool); + if (!bvl && gfp_mask != saved_gfp) { + punt_bios_to_rescuer(bs); + gfp_mask = saved_gfp; + bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, &bs->bvec_pool); + } + + if (unlikely(!bvl)) + goto err_free; + + bio->bi_flags |= idx << BVEC_POOL_OFFSET; + } else if (nr_iovecs) { + bvl = bio->bi_inline_vecs; + } + + bio->bi_pool = bs; + bio->bi_max_vecs = nr_iovecs; + bio->bi_io_vec = bvl; + return bio; + +err_free: + mempool_free(p, &bs->bio_pool); + return NULL; +} +EXPORT_SYMBOL(bio_alloc_bioset); + +void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start) +{ + unsigned long flags; + struct bio_vec bv; + struct bvec_iter iter; + + __bio_for_each_segment(bv, bio, iter, start) { + char *data = bvec_kmap_irq(&bv, &flags); + memset(data, 0, bv.bv_len); + flush_dcache_page(bv.bv_page); + bvec_kunmap_irq(data, &flags); + } +} +EXPORT_SYMBOL(zero_fill_bio_iter); + +/** + * bio_truncate - truncate the bio to small size of @new_size + * @bio: the bio to be truncated + * @new_size: new size for truncating the bio + * + * Description: + * Truncate the bio to new size of @new_size. If bio_op(bio) is + * REQ_OP_READ, zero the truncated part. This function should only + * be used for handling corner cases, such as bio eod. + */ +void bio_truncate(struct bio *bio, unsigned new_size) +{ + struct bio_vec bv; + struct bvec_iter iter; + unsigned int done = 0; + bool truncated = false; + + if (new_size >= bio->bi_iter.bi_size) + return; + + if (bio_op(bio) != REQ_OP_READ) + goto exit; + + bio_for_each_segment(bv, bio, iter) { + if (done + bv.bv_len > new_size) { + unsigned offset; + + if (!truncated) + offset = new_size - done; + else + offset = 0; + zero_user(bv.bv_page, bv.bv_offset + offset, + bv.bv_len - offset); + truncated = true; + } + done += bv.bv_len; + } + + exit: + /* + * Don't touch bvec table here and make it really immutable, since + * fs bio user has to retrieve all pages via bio_for_each_segment_all + * in its .end_bio() callback. + * + * It is enough to truncate bio by updating .bi_size since we can make + * correct bvec with the updated .bi_size for drivers. + */ + bio->bi_iter.bi_size = new_size; +} + +/** + * guard_bio_eod - truncate a BIO to fit the block device + * @bio: bio to truncate + * + * This allows us to do IO even on the odd last sectors of a device, even if the + * block size is some multiple of the physical sector size. + * + * We'll just truncate the bio to the size of the device, and clear the end of + * the buffer head manually. Truly out-of-range accesses will turn into actual + * I/O errors, this only handles the "we need to be able to do I/O at the final + * sector" case. + */ +void guard_bio_eod(struct bio *bio) +{ + sector_t maxsector; + struct hd_struct *part; + + rcu_read_lock(); + part = __disk_get_part(bio->bi_disk, bio->bi_partno); + if (part) + maxsector = part_nr_sects_read(part); + else + maxsector = get_capacity(bio->bi_disk); + rcu_read_unlock(); + + if (!maxsector) + return; + + /* + * If the *whole* IO is past the end of the device, + * let it through, and the IO layer will turn it into + * an EIO. + */ + if (unlikely(bio->bi_iter.bi_sector >= maxsector)) + return; + + maxsector -= bio->bi_iter.bi_sector; + if (likely((bio->bi_iter.bi_size >> 9) <= maxsector)) + return; + + bio_truncate(bio, maxsector << 9); +} + +/** + * bio_put - release a reference to a bio + * @bio: bio to release reference to + * + * Description: + * Put a reference to a &struct bio, either one you have gotten with + * bio_alloc, bio_get or bio_clone_*. The last put of a bio will free it. + **/ +void bio_put(struct bio *bio) +{ + if (!bio_flagged(bio, BIO_REFFED)) + bio_free(bio); + else { + BIO_BUG_ON(!atomic_read(&bio->__bi_cnt)); + + /* + * last put frees it + */ + if (atomic_dec_and_test(&bio->__bi_cnt)) + bio_free(bio); + } +} +EXPORT_SYMBOL(bio_put); + +/** + * __bio_clone_fast - clone a bio that shares the original bio's biovec + * @bio: destination bio + * @bio_src: bio to clone + * + * Clone a &bio. Caller will own the returned bio, but not + * the actual data it points to. Reference count of returned + * bio will be one. + * + * Caller must ensure that @bio_src is not freed before @bio. + */ +void __bio_clone_fast(struct bio *bio, struct bio *bio_src) +{ + BUG_ON(bio->bi_pool && BVEC_POOL_IDX(bio)); + + /* + * most users will be overriding ->bi_disk with a new target, + * so we don't set nor calculate new physical/hw segment counts here + */ + bio->bi_disk = bio_src->bi_disk; + bio->bi_partno = bio_src->bi_partno; + bio_set_flag(bio, BIO_CLONED); + if (bio_flagged(bio_src, BIO_THROTTLED)) + bio_set_flag(bio, BIO_THROTTLED); + bio->bi_opf = bio_src->bi_opf; + bio->bi_ioprio = bio_src->bi_ioprio; + bio->bi_write_hint = bio_src->bi_write_hint; + bio->bi_iter = bio_src->bi_iter; + bio->bi_io_vec = bio_src->bi_io_vec; + + bio_clone_blkg_association(bio, bio_src); + blkcg_bio_issue_init(bio); +} +EXPORT_SYMBOL(__bio_clone_fast); + +/** + * bio_clone_fast - clone a bio that shares the original bio's biovec + * @bio: bio to clone + * @gfp_mask: allocation priority + * @bs: bio_set to allocate from + * + * Like __bio_clone_fast, only also allocates the returned bio + */ +struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs) +{ + struct bio *b; + + b = bio_alloc_bioset(gfp_mask, 0, bs); + if (!b) + return NULL; + + __bio_clone_fast(b, bio); + + if (bio_crypt_clone(b, bio, gfp_mask) < 0) + goto err_put; + + if (bio_integrity(bio) && + bio_integrity_clone(b, bio, gfp_mask) < 0) + goto err_put; + + return b; + +err_put: + bio_put(b); + return NULL; +} +EXPORT_SYMBOL(bio_clone_fast); + +const char *bio_devname(struct bio *bio, char *buf) +{ + return disk_name(bio->bi_disk, bio->bi_partno, buf); +} +EXPORT_SYMBOL(bio_devname); + +static inline bool page_is_mergeable(const struct bio_vec *bv, + struct page *page, unsigned int len, unsigned int off, + bool *same_page) +{ + size_t bv_end = bv->bv_offset + bv->bv_len; + phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) + bv_end - 1; + phys_addr_t page_addr = page_to_phys(page); + + if (vec_end_addr + 1 != page_addr + off) + return false; + if (xen_domain() && !xen_biovec_phys_mergeable(bv, page)) + return false; + + *same_page = ((vec_end_addr & PAGE_MASK) == page_addr); + if (*same_page) + return true; + return (bv->bv_page + bv_end / PAGE_SIZE) == (page + off / PAGE_SIZE); +} + +/* + * Try to merge a page into a segment, while obeying the hardware segment + * size limit. This is not for normal read/write bios, but for passthrough + * or Zone Append operations that we can't split. + */ +static bool bio_try_merge_hw_seg(struct request_queue *q, struct bio *bio, + struct page *page, unsigned len, + unsigned offset, bool *same_page) +{ + struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; + unsigned long mask = queue_segment_boundary(q); + phys_addr_t addr1 = page_to_phys(bv->bv_page) + bv->bv_offset; + phys_addr_t addr2 = page_to_phys(page) + offset + len - 1; + + if ((addr1 | mask) != (addr2 | mask)) + return false; + if (bv->bv_len + len > queue_max_segment_size(q)) + return false; + return __bio_try_merge_page(bio, page, len, offset, same_page); +} + +/** + * bio_add_hw_page - attempt to add a page to a bio with hw constraints + * @q: the target queue + * @bio: destination bio + * @page: page to add + * @len: vec entry length + * @offset: vec entry offset + * @max_sectors: maximum number of sectors that can be added + * @same_page: return if the segment has been merged inside the same page + * + * Add a page to a bio while respecting the hardware max_sectors, max_segment + * and gap limitations. + */ +int bio_add_hw_page(struct request_queue *q, struct bio *bio, + struct page *page, unsigned int len, unsigned int offset, + unsigned int max_sectors, bool *same_page) +{ + struct bio_vec *bvec; + + if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) + return 0; + + if (((bio->bi_iter.bi_size + len) >> 9) > max_sectors) + return 0; + + if (bio->bi_vcnt > 0) { + if (bio_try_merge_hw_seg(q, bio, page, len, offset, same_page)) + return len; + + /* + * If the queue doesn't support SG gaps and adding this segment + * would create a gap, disallow it. + */ + bvec = &bio->bi_io_vec[bio->bi_vcnt - 1]; + if (bvec_gap_to_prev(q, bvec, offset)) + return 0; + } + + if (bio_full(bio, len)) + return 0; + + if (bio->bi_vcnt >= queue_max_segments(q)) + return 0; + + bvec = &bio->bi_io_vec[bio->bi_vcnt]; + bvec->bv_page = page; + bvec->bv_len = len; + bvec->bv_offset = offset; + bio->bi_vcnt++; + bio->bi_iter.bi_size += len; + return len; +} + +/** + * bio_add_pc_page - attempt to add page to passthrough bio + * @q: the target queue + * @bio: destination bio + * @page: page to add + * @len: vec entry length + * @offset: vec entry offset + * + * Attempt to add a page to the bio_vec maplist. This can fail for a + * number of reasons, such as the bio being full or target block device + * limitations. The target block device must allow bio's up to PAGE_SIZE, + * so it is always possible to add a single page to an empty bio. + * + * This should only be used by passthrough bios. + */ +int bio_add_pc_page(struct request_queue *q, struct bio *bio, + struct page *page, unsigned int len, unsigned int offset) +{ + bool same_page = false; + return bio_add_hw_page(q, bio, page, len, offset, + queue_max_hw_sectors(q), &same_page); +} +EXPORT_SYMBOL(bio_add_pc_page); + +/** + * __bio_try_merge_page - try appending data to an existing bvec. + * @bio: destination bio + * @page: start page to add + * @len: length of the data to add + * @off: offset of the data relative to @page + * @same_page: return if the segment has been merged inside the same page + * + * Try to add the data at @page + @off to the last bvec of @bio. This is a + * useful optimisation for file systems with a block size smaller than the + * page size. + * + * Warn if (@len, @off) crosses pages in case that @same_page is true. + * + * Return %true on success or %false on failure. + */ +bool __bio_try_merge_page(struct bio *bio, struct page *page, + unsigned int len, unsigned int off, bool *same_page) +{ + if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) + return false; + + if (bio->bi_vcnt > 0) { + struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; + + if (page_is_mergeable(bv, page, len, off, same_page)) { + if (bio->bi_iter.bi_size > UINT_MAX - len) { + *same_page = false; + return false; + } + bv->bv_len += len; + bio->bi_iter.bi_size += len; + return true; + } + } + return false; +} +EXPORT_SYMBOL_GPL(__bio_try_merge_page); + +/** + * __bio_add_page - add page(s) to a bio in a new segment + * @bio: destination bio + * @page: start page to add + * @len: length of the data to add, may cross pages + * @off: offset of the data relative to @page, may cross pages + * + * Add the data at @page + @off to @bio as a new bvec. The caller must ensure + * that @bio has space for another bvec. + */ +void __bio_add_page(struct bio *bio, struct page *page, + unsigned int len, unsigned int off) +{ + struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt]; + + WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); + WARN_ON_ONCE(bio_full(bio, len)); + + bv->bv_page = page; + bv->bv_offset = off; + bv->bv_len = len; + + bio->bi_iter.bi_size += len; + bio->bi_vcnt++; + + if (!bio_flagged(bio, BIO_WORKINGSET) && unlikely(PageWorkingset(page))) + bio_set_flag(bio, BIO_WORKINGSET); +} +EXPORT_SYMBOL_GPL(__bio_add_page); + +/** + * bio_add_page - attempt to add page(s) to bio + * @bio: destination bio + * @page: start page to add + * @len: vec entry length, may cross pages + * @offset: vec entry offset relative to @page, may cross pages + * + * Attempt to add page(s) to the bio_vec maplist. This will only fail + * if either bio->bi_vcnt == bio->bi_max_vecs or it's a cloned bio. + */ +int bio_add_page(struct bio *bio, struct page *page, + unsigned int len, unsigned int offset) +{ + bool same_page = false; + + if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) { + if (bio_full(bio, len)) + return 0; + __bio_add_page(bio, page, len, offset); + } + return len; +} +EXPORT_SYMBOL(bio_add_page); + +void bio_release_pages(struct bio *bio, bool mark_dirty) +{ + struct bvec_iter_all iter_all; + struct bio_vec *bvec; + + if (bio_flagged(bio, BIO_NO_PAGE_REF)) + return; + + bio_for_each_segment_all(bvec, bio, iter_all) { + if (mark_dirty && !PageCompound(bvec->bv_page)) + set_page_dirty_lock(bvec->bv_page); + put_page(bvec->bv_page); + } +} +EXPORT_SYMBOL_GPL(bio_release_pages); + +static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter) +{ + const struct bio_vec *bv = iter->bvec; + unsigned int len; + size_t size; + + if (WARN_ON_ONCE(iter->iov_offset > bv->bv_len)) + return -EINVAL; + + len = min_t(size_t, bv->bv_len - iter->iov_offset, iter->count); + size = bio_add_page(bio, bv->bv_page, len, + bv->bv_offset + iter->iov_offset); + if (unlikely(size != len)) + return -EINVAL; + iov_iter_advance(iter, size); + return 0; +} + +static void bio_put_pages(struct page **pages, size_t size, size_t off) +{ + size_t i, nr = DIV_ROUND_UP(size + (off & ~PAGE_MASK), PAGE_SIZE); + + for (i = 0; i < nr; i++) + put_page(pages[i]); +} + +#define PAGE_PTRS_PER_BVEC (sizeof(struct bio_vec) / sizeof(struct page *)) + +/** + * __bio_iov_iter_get_pages - pin user or kernel pages and add them to a bio + * @bio: bio to add pages to + * @iter: iov iterator describing the region to be mapped + * + * Pins pages from *iter and appends them to @bio's bvec array. The + * pages will have to be released using put_page() when done. + * For multi-segment *iter, this function only adds pages from the + * next non-empty segment of the iov iterator. + */ +static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) +{ + unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt; + unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt; + struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt; + struct page **pages = (struct page **)bv; + bool same_page = false; + ssize_t size, left; + unsigned len, i; + size_t offset; + + /* + * Move page array up in the allocated memory for the bio vecs as far as + * possible so that we can start filling biovecs from the beginning + * without overwriting the temporary page array. + */ + BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2); + pages += entries_left * (PAGE_PTRS_PER_BVEC - 1); + + size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset); + if (unlikely(size <= 0)) + return size ? size : -EFAULT; + + for (left = size, i = 0; left > 0; left -= len, i++) { + struct page *page = pages[i]; + + len = min_t(size_t, PAGE_SIZE - offset, left); + + if (__bio_try_merge_page(bio, page, len, offset, &same_page)) { + if (same_page) + put_page(page); + } else { + if (WARN_ON_ONCE(bio_full(bio, len))) { + bio_put_pages(pages + i, left, offset); + return -EINVAL; + } + __bio_add_page(bio, page, len, offset); + } + offset = 0; + } + + iov_iter_advance(iter, size); + return 0; +} + +static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter) +{ + unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt; + unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt; + struct request_queue *q = bio->bi_disk->queue; + unsigned int max_append_sectors = queue_max_zone_append_sectors(q); + struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt; + struct page **pages = (struct page **)bv; + ssize_t size, left; + unsigned len, i; + size_t offset; + int ret = 0; + + /* + * Move page array up in the allocated memory for the bio vecs as far as + * possible so that we can start filling biovecs from the beginning + * without overwriting the temporary page array. + */ + BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2); + pages += entries_left * (PAGE_PTRS_PER_BVEC - 1); + + size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset); + if (unlikely(size <= 0)) + return size ? size : -EFAULT; + + for (left = size, i = 0; left > 0; left -= len, i++) { + struct page *page = pages[i]; + bool same_page = false; + + len = min_t(size_t, PAGE_SIZE - offset, left); + if (bio_add_hw_page(q, bio, page, len, offset, + max_append_sectors, &same_page) != len) { + bio_put_pages(pages + i, left, offset); + ret = -EINVAL; + break; + } + if (same_page) + put_page(page); + offset = 0; + } + + iov_iter_advance(iter, size - left); + return ret; +} + +/** + * bio_iov_iter_get_pages - add user or kernel pages to a bio + * @bio: bio to add pages to + * @iter: iov iterator describing the region to be added + * + * This takes either an iterator pointing to user memory, or one pointing to + * kernel pages (BVEC iterator). If we're adding user pages, we pin them and + * map them into the kernel. On IO completion, the caller should put those + * pages. If we're adding kernel pages, and the caller told us it's safe to + * do so, we just have to add the pages to the bio directly. We don't grab an + * extra reference to those pages (the user should already have that), and we + * don't put the page on IO completion. The caller needs to check if the bio is + * flagged BIO_NO_PAGE_REF on IO completion. If it isn't, then pages should be + * released. + * + * The function tries, but does not guarantee, to pin as many pages as + * fit into the bio, or are requested in @iter, whatever is smaller. If + * MM encounters an error pinning the requested pages, it stops. Error + * is returned only if 0 pages could be pinned. + */ +int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) +{ + const bool is_bvec = iov_iter_is_bvec(iter); + int ret; + + if (WARN_ON_ONCE(bio->bi_vcnt)) + return -EINVAL; + + do { + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + if (WARN_ON_ONCE(is_bvec)) + return -EINVAL; + ret = __bio_iov_append_get_pages(bio, iter); + } else { + if (is_bvec) + ret = __bio_iov_bvec_add_pages(bio, iter); + else + ret = __bio_iov_iter_get_pages(bio, iter); + } + } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0)); + + if (is_bvec) + bio_set_flag(bio, BIO_NO_PAGE_REF); + return bio->bi_vcnt ? 0 : ret; +} +EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages); + +static void submit_bio_wait_endio(struct bio *bio) +{ + complete(bio->bi_private); +} + +/** + * submit_bio_wait - submit a bio, and wait until it completes + * @bio: The &struct bio which describes the I/O + * + * Simple wrapper around submit_bio(). Returns 0 on success, or the error from + * bio_endio() on failure. + * + * WARNING: Unlike to how submit_bio() is usually used, this function does not + * result in bio reference to be consumed. The caller must drop the reference + * on his own. + */ +int submit_bio_wait(struct bio *bio) +{ + DECLARE_COMPLETION_ONSTACK_MAP(done, bio->bi_disk->lockdep_map); + unsigned long hang_check; + + bio->bi_private = &done; + bio->bi_end_io = submit_bio_wait_endio; + bio->bi_opf |= REQ_SYNC; + submit_bio(bio); + + /* Prevent hang_check timer from firing at us during very long I/O */ + hang_check = sysctl_hung_task_timeout_secs; + if (hang_check) + while (!wait_for_completion_io_timeout(&done, + hang_check * (HZ/2))) + ; + else + wait_for_completion_io(&done); + + return blk_status_to_errno(bio->bi_status); +} +EXPORT_SYMBOL(submit_bio_wait); + +/** + * bio_advance - increment/complete a bio by some number of bytes + * @bio: bio to advance + * @bytes: number of bytes to complete + * + * This updates bi_sector, bi_size and bi_idx; if the number of bytes to + * complete doesn't align with a bvec boundary, then bv_len and bv_offset will + * be updated on the last bvec as well. + * + * @bio will then represent the remaining, uncompleted portion of the io. + */ +void bio_advance(struct bio *bio, unsigned bytes) +{ + if (bio_integrity(bio)) + bio_integrity_advance(bio, bytes); + + bio_crypt_advance(bio, bytes); + bio_advance_iter(bio, &bio->bi_iter, bytes); +} +EXPORT_SYMBOL(bio_advance); + +void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, + struct bio *src, struct bvec_iter *src_iter) +{ + struct bio_vec src_bv, dst_bv; + void *src_p, *dst_p; + unsigned bytes; + + while (src_iter->bi_size && dst_iter->bi_size) { + src_bv = bio_iter_iovec(src, *src_iter); + dst_bv = bio_iter_iovec(dst, *dst_iter); + + bytes = min(src_bv.bv_len, dst_bv.bv_len); + + src_p = kmap_atomic(src_bv.bv_page); + dst_p = kmap_atomic(dst_bv.bv_page); + + memcpy(dst_p + dst_bv.bv_offset, + src_p + src_bv.bv_offset, + bytes); + + kunmap_atomic(dst_p); + kunmap_atomic(src_p); + + flush_dcache_page(dst_bv.bv_page); + + bio_advance_iter(src, src_iter, bytes); + bio_advance_iter(dst, dst_iter, bytes); + } +} +EXPORT_SYMBOL(bio_copy_data_iter); + +/** + * bio_copy_data - copy contents of data buffers from one bio to another + * @src: source bio + * @dst: destination bio + * + * Stops when it reaches the end of either @src or @dst - that is, copies + * min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of bios). + */ +void bio_copy_data(struct bio *dst, struct bio *src) +{ + struct bvec_iter src_iter = src->bi_iter; + struct bvec_iter dst_iter = dst->bi_iter; + + bio_copy_data_iter(dst, &dst_iter, src, &src_iter); +} +EXPORT_SYMBOL(bio_copy_data); + +/** + * bio_list_copy_data - copy contents of data buffers from one chain of bios to + * another + * @src: source bio list + * @dst: destination bio list + * + * Stops when it reaches the end of either the @src list or @dst list - that is, + * copies min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of + * bios). + */ +void bio_list_copy_data(struct bio *dst, struct bio *src) +{ + struct bvec_iter src_iter = src->bi_iter; + struct bvec_iter dst_iter = dst->bi_iter; + + while (1) { + if (!src_iter.bi_size) { + src = src->bi_next; + if (!src) + break; + + src_iter = src->bi_iter; + } + + if (!dst_iter.bi_size) { + dst = dst->bi_next; + if (!dst) + break; + + dst_iter = dst->bi_iter; + } + + bio_copy_data_iter(dst, &dst_iter, src, &src_iter); + } +} +EXPORT_SYMBOL(bio_list_copy_data); + +void bio_free_pages(struct bio *bio) +{ + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + bio_for_each_segment_all(bvec, bio, iter_all) + __free_page(bvec->bv_page); +} +EXPORT_SYMBOL(bio_free_pages); + +/* + * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions + * for performing direct-IO in BIOs. + * + * The problem is that we cannot run set_page_dirty() from interrupt context + * because the required locks are not interrupt-safe. So what we can do is to + * mark the pages dirty _before_ performing IO. And in interrupt context, + * check that the pages are still dirty. If so, fine. If not, redirty them + * in process context. + * + * We special-case compound pages here: normally this means reads into hugetlb + * pages. The logic in here doesn't really work right for compound pages + * because the VM does not uniformly chase down the head page in all cases. + * But dirtiness of compound pages is pretty meaningless anyway: the VM doesn't + * handle them at all. So we skip compound pages here at an early stage. + * + * Note that this code is very hard to test under normal circumstances because + * direct-io pins the pages with get_user_pages(). This makes + * is_page_cache_freeable return false, and the VM will not clean the pages. + * But other code (eg, flusher threads) could clean the pages if they are mapped + * pagecache. + * + * Simply disabling the call to bio_set_pages_dirty() is a good way to test the + * deferred bio dirtying paths. + */ + +/* + * bio_set_pages_dirty() will mark all the bio's pages as dirty. + */ +void bio_set_pages_dirty(struct bio *bio) +{ + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + bio_for_each_segment_all(bvec, bio, iter_all) { + if (!PageCompound(bvec->bv_page)) + set_page_dirty_lock(bvec->bv_page); + } +} + +/* + * bio_check_pages_dirty() will check that all the BIO's pages are still dirty. + * If they are, then fine. If, however, some pages are clean then they must + * have been written out during the direct-IO read. So we take another ref on + * the BIO and re-dirty the pages in process context. + * + * It is expected that bio_check_pages_dirty() will wholly own the BIO from + * here on. It will run one put_page() against each page and will run one + * bio_put() against the BIO. + */ + +static void bio_dirty_fn(struct work_struct *work); + +static DECLARE_WORK(bio_dirty_work, bio_dirty_fn); +static DEFINE_SPINLOCK(bio_dirty_lock); +static struct bio *bio_dirty_list; + +/* + * This runs in process context + */ +static void bio_dirty_fn(struct work_struct *work) +{ + struct bio *bio, *next; + + spin_lock_irq(&bio_dirty_lock); + next = bio_dirty_list; + bio_dirty_list = NULL; + spin_unlock_irq(&bio_dirty_lock); + + while ((bio = next) != NULL) { + next = bio->bi_private; + + bio_release_pages(bio, true); + bio_put(bio); + } +} + +void bio_check_pages_dirty(struct bio *bio) +{ + struct bio_vec *bvec; + unsigned long flags; + struct bvec_iter_all iter_all; + + bio_for_each_segment_all(bvec, bio, iter_all) { + if (!PageDirty(bvec->bv_page) && !PageCompound(bvec->bv_page)) + goto defer; + } + + bio_release_pages(bio, false); + bio_put(bio); + return; +defer: + spin_lock_irqsave(&bio_dirty_lock, flags); + bio->bi_private = bio_dirty_list; + bio_dirty_list = bio; + spin_unlock_irqrestore(&bio_dirty_lock, flags); + schedule_work(&bio_dirty_work); +} + +static inline bool bio_remaining_done(struct bio *bio) +{ + /* + * If we're not chaining, then ->__bi_remaining is always 1 and + * we always end io on the first invocation. + */ + if (!bio_flagged(bio, BIO_CHAIN)) + return true; + + BUG_ON(atomic_read(&bio->__bi_remaining) <= 0); + + if (atomic_dec_and_test(&bio->__bi_remaining)) { + bio_clear_flag(bio, BIO_CHAIN); + return true; + } + + return false; +} + +/** + * bio_endio - end I/O on a bio + * @bio: bio + * + * Description: + * bio_endio() will end I/O on the whole bio. bio_endio() is the preferred + * way to end I/O on a bio. No one should call bi_end_io() directly on a + * bio unless they own it and thus know that it has an end_io function. + * + * bio_endio() can be called several times on a bio that has been chained + * using bio_chain(). The ->bi_end_io() function will only be called the + * last time. At this point the BLK_TA_COMPLETE tracing event will be + * generated if BIO_TRACE_COMPLETION is set. + **/ +void bio_endio(struct bio *bio) +{ +again: + if (!bio_remaining_done(bio)) + return; + if (!bio_integrity_endio(bio)) + return; + + if (bio->bi_disk) + rq_qos_done_bio(bio->bi_disk->queue, bio); + + /* + * Need to have a real endio function for chained bios, otherwise + * various corner cases will break (like stacking block devices that + * save/restore bi_end_io) - however, we want to avoid unbounded + * recursion and blowing the stack. Tail call optimization would + * handle this, but compiling with frame pointers also disables + * gcc's sibling call optimization. + */ + if (bio->bi_end_io == bio_chain_endio) { + bio = __bio_chain_endio(bio); + goto again; + } + + if (bio->bi_disk && bio_flagged(bio, BIO_TRACE_COMPLETION)) { + trace_block_bio_complete(bio->bi_disk->queue, bio); + bio_clear_flag(bio, BIO_TRACE_COMPLETION); + } + + blk_throtl_bio_endio(bio); + /* release cgroup info */ + bio_uninit(bio); + if (bio->bi_end_io) + bio->bi_end_io(bio); +} +EXPORT_SYMBOL(bio_endio); + +/** + * bio_split - split a bio + * @bio: bio to split + * @sectors: number of sectors to split from the front of @bio + * @gfp: gfp mask + * @bs: bio set to allocate from + * + * Allocates and returns a new bio which represents @sectors from the start of + * @bio, and updates @bio to represent the remaining sectors. + * + * Unless this is a discard request the newly allocated bio will point + * to @bio's bi_io_vec. It is the caller's responsibility to ensure that + * neither @bio nor @bs are freed before the split bio. + */ +struct bio *bio_split(struct bio *bio, int sectors, + gfp_t gfp, struct bio_set *bs) +{ + struct bio *split; + + BUG_ON(sectors <= 0); + BUG_ON(sectors >= bio_sectors(bio)); + + /* Zone append commands cannot be split */ + if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND)) + return NULL; + + split = bio_clone_fast(bio, gfp, bs); + if (!split) + return NULL; + + split->bi_iter.bi_size = sectors << 9; + + if (bio_integrity(split)) + bio_integrity_trim(split); + + bio_advance(bio, split->bi_iter.bi_size); + + if (bio_flagged(bio, BIO_TRACE_COMPLETION)) + bio_set_flag(split, BIO_TRACE_COMPLETION); + + return split; +} +EXPORT_SYMBOL(bio_split); + +/** + * bio_trim - trim a bio + * @bio: bio to trim + * @offset: number of sectors to trim from the front of @bio + * @size: size we want to trim @bio to, in sectors + */ +void bio_trim(struct bio *bio, int offset, int size) +{ + /* 'bio' is a cloned bio which we need to trim to match + * the given offset and size. + */ + + size <<= 9; + if (offset == 0 && size == bio->bi_iter.bi_size) + return; + + bio_advance(bio, offset << 9); + bio->bi_iter.bi_size = size; + + if (bio_integrity(bio)) + bio_integrity_trim(bio); + +} +EXPORT_SYMBOL_GPL(bio_trim); + +/* + * create memory pools for biovec's in a bio_set. + * use the global biovec slabs created for general use. + */ +int biovec_init_pool(mempool_t *pool, int pool_entries) +{ + struct biovec_slab *bp = bvec_slabs + BVEC_POOL_MAX; + + return mempool_init_slab_pool(pool, pool_entries, bp->slab); +} + +/* + * bioset_exit - exit a bioset initialized with bioset_init() + * + * May be called on a zeroed but uninitialized bioset (i.e. allocated with + * kzalloc()). + */ +void bioset_exit(struct bio_set *bs) +{ + if (bs->rescue_workqueue) + destroy_workqueue(bs->rescue_workqueue); + bs->rescue_workqueue = NULL; + + mempool_exit(&bs->bio_pool); + mempool_exit(&bs->bvec_pool); + + bioset_integrity_free(bs); + if (bs->bio_slab) + bio_put_slab(bs); + bs->bio_slab = NULL; +} +EXPORT_SYMBOL(bioset_exit); + +/** + * bioset_init - Initialize a bio_set + * @bs: pool to initialize + * @pool_size: Number of bio and bio_vecs to cache in the mempool + * @front_pad: Number of bytes to allocate in front of the returned bio + * @flags: Flags to modify behavior, currently %BIOSET_NEED_BVECS + * and %BIOSET_NEED_RESCUER + * + * Description: + * Set up a bio_set to be used with @bio_alloc_bioset. Allows the caller + * to ask for a number of bytes to be allocated in front of the bio. + * Front pad allocation is useful for embedding the bio inside + * another structure, to avoid allocating extra data to go with the bio. + * Note that the bio must be embedded at the END of that structure always, + * or things will break badly. + * If %BIOSET_NEED_BVECS is set in @flags, a separate pool will be allocated + * for allocating iovecs. This pool is not needed e.g. for bio_clone_fast(). + * If %BIOSET_NEED_RESCUER is set, a workqueue is created which can be used to + * dispatch queued requests when the mempool runs out of space. + * + */ +int bioset_init(struct bio_set *bs, + unsigned int pool_size, + unsigned int front_pad, + int flags) +{ + unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec); + + bs->front_pad = front_pad; + + spin_lock_init(&bs->rescue_lock); + bio_list_init(&bs->rescue_list); + INIT_WORK(&bs->rescue_work, bio_alloc_rescue); + + bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad); + if (!bs->bio_slab) + return -ENOMEM; + + if (mempool_init_slab_pool(&bs->bio_pool, pool_size, bs->bio_slab)) + goto bad; + + if ((flags & BIOSET_NEED_BVECS) && + biovec_init_pool(&bs->bvec_pool, pool_size)) + goto bad; + + if (!(flags & BIOSET_NEED_RESCUER)) + return 0; + + bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0); + if (!bs->rescue_workqueue) + goto bad; + + return 0; +bad: + bioset_exit(bs); + return -ENOMEM; +} +EXPORT_SYMBOL(bioset_init); + +/* + * Initialize and setup a new bio_set, based on the settings from + * another bio_set. + */ +int bioset_init_from_src(struct bio_set *bs, struct bio_set *src) +{ + int flags; + + flags = 0; + if (src->bvec_pool.min_nr) + flags |= BIOSET_NEED_BVECS; + if (src->rescue_workqueue) + flags |= BIOSET_NEED_RESCUER; + + return bioset_init(bs, src->bio_pool.min_nr, src->front_pad, flags); +} +EXPORT_SYMBOL(bioset_init_from_src); + +static void __init biovec_init_slabs(void) +{ + int i; + + for (i = 0; i < BVEC_POOL_NR; i++) { + int size; + struct biovec_slab *bvs = bvec_slabs + i; + + if (bvs->nr_vecs <= BIO_INLINE_VECS) { + bvs->slab = NULL; + continue; + } + + size = bvs->nr_vecs * sizeof(struct bio_vec); + bvs->slab = kmem_cache_create(bvs->name, size, 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + } +} + +static int __init init_bio(void) +{ + bio_slab_max = 2; + bio_slab_nr = 0; + bio_slabs = kcalloc(bio_slab_max, sizeof(struct bio_slab), + GFP_KERNEL); + + BUILD_BUG_ON(BIO_FLAG_LAST > BVEC_POOL_OFFSET); + + if (!bio_slabs) + panic("bio: can't allocate bios\n"); + + bio_integrity_init(); + biovec_init_slabs(); + + if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS)) + panic("bio: can't allocate bios\n"); + + if (bioset_integrity_create(&fs_bio_set, BIO_POOL_SIZE)) + panic("bio: can't create integrity pool\n"); + + return 0; +} +subsys_initcall(init_bio); diff --git a/block/blk-cgroup-rwstat.c b/block/blk-cgroup-rwstat.c new file mode 100644 index 000000000..3304e841d --- /dev/null +++ b/block/blk-cgroup-rwstat.c @@ -0,0 +1,130 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Legacy blkg rwstat helpers enabled by CONFIG_BLK_CGROUP_RWSTAT. + * Do not use in new code. + */ +#include "blk-cgroup-rwstat.h" + +int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp) +{ + int i, ret; + + for (i = 0; i < BLKG_RWSTAT_NR; i++) { + ret = percpu_counter_init(&rwstat->cpu_cnt[i], 0, gfp); + if (ret) { + while (--i >= 0) + percpu_counter_destroy(&rwstat->cpu_cnt[i]); + return ret; + } + atomic64_set(&rwstat->aux_cnt[i], 0); + } + return 0; +} +EXPORT_SYMBOL_GPL(blkg_rwstat_init); + +void blkg_rwstat_exit(struct blkg_rwstat *rwstat) +{ + int i; + + for (i = 0; i < BLKG_RWSTAT_NR; i++) + percpu_counter_destroy(&rwstat->cpu_cnt[i]); +} +EXPORT_SYMBOL_GPL(blkg_rwstat_exit); + +/** + * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat + * @sf: seq_file to print to + * @pd: policy private data of interest + * @rwstat: rwstat to print + * + * Print @rwstat to @sf for the device assocaited with @pd. + */ +u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, + const struct blkg_rwstat_sample *rwstat) +{ + static const char *rwstr[] = { + [BLKG_RWSTAT_READ] = "Read", + [BLKG_RWSTAT_WRITE] = "Write", + [BLKG_RWSTAT_SYNC] = "Sync", + [BLKG_RWSTAT_ASYNC] = "Async", + [BLKG_RWSTAT_DISCARD] = "Discard", + }; + const char *dname = blkg_dev_name(pd->blkg); + u64 v; + int i; + + if (!dname) + return 0; + + for (i = 0; i < BLKG_RWSTAT_NR; i++) + seq_printf(sf, "%s %s %llu\n", dname, rwstr[i], + rwstat->cnt[i]); + + v = rwstat->cnt[BLKG_RWSTAT_READ] + + rwstat->cnt[BLKG_RWSTAT_WRITE] + + rwstat->cnt[BLKG_RWSTAT_DISCARD]; + seq_printf(sf, "%s Total %llu\n", dname, v); + return v; +} +EXPORT_SYMBOL_GPL(__blkg_prfill_rwstat); + +/** + * blkg_prfill_rwstat - prfill callback for blkg_rwstat + * @sf: seq_file to print to + * @pd: policy private data of interest + * @off: offset to the blkg_rwstat in @pd + * + * prfill callback for printing a blkg_rwstat. + */ +u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, + int off) +{ + struct blkg_rwstat_sample rwstat = { }; + + blkg_rwstat_read((void *)pd + off, &rwstat); + return __blkg_prfill_rwstat(sf, pd, &rwstat); +} +EXPORT_SYMBOL_GPL(blkg_prfill_rwstat); + +/** + * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat + * @blkg: blkg of interest + * @pol: blkcg_policy which contains the blkg_rwstat + * @off: offset to the blkg_rwstat in blkg_policy_data or @blkg + * @sum: blkg_rwstat_sample structure containing the results + * + * Collect the blkg_rwstat specified by @blkg, @pol and @off and all its + * online descendants and their aux counts. The caller must be holding the + * queue lock for online tests. + * + * If @pol is NULL, blkg_rwstat is at @off bytes into @blkg; otherwise, it + * is at @off bytes into @blkg's blkg_policy_data of the policy. + */ +void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol, + int off, struct blkg_rwstat_sample *sum) +{ + struct blkcg_gq *pos_blkg; + struct cgroup_subsys_state *pos_css; + unsigned int i; + + lockdep_assert_held(&blkg->q->queue_lock); + + memset(sum, 0, sizeof(*sum)); + rcu_read_lock(); + blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { + struct blkg_rwstat *rwstat; + + if (!pos_blkg->online) + continue; + + if (pol) + rwstat = (void *)blkg_to_pd(pos_blkg, pol) + off; + else + rwstat = (void *)pos_blkg + off; + + for (i = 0; i < BLKG_RWSTAT_NR; i++) + sum->cnt[i] += blkg_rwstat_read_counter(rwstat, i); + } + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum); diff --git a/block/blk-cgroup-rwstat.h b/block/blk-cgroup-rwstat.h new file mode 100644 index 000000000..ee746919c --- /dev/null +++ b/block/blk-cgroup-rwstat.h @@ -0,0 +1,149 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * Legacy blkg rwstat helpers enabled by CONFIG_BLK_CGROUP_RWSTAT. + * Do not use in new code. + */ +#ifndef _BLK_CGROUP_RWSTAT_H +#define _BLK_CGROUP_RWSTAT_H + +#include + +enum blkg_rwstat_type { + BLKG_RWSTAT_READ, + BLKG_RWSTAT_WRITE, + BLKG_RWSTAT_SYNC, + BLKG_RWSTAT_ASYNC, + BLKG_RWSTAT_DISCARD, + + BLKG_RWSTAT_NR, + BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR, +}; + +/* + * blkg_[rw]stat->aux_cnt is excluded for local stats but included for + * recursive. Used to carry stats of dead children. + */ +struct blkg_rwstat { + struct percpu_counter cpu_cnt[BLKG_RWSTAT_NR]; + atomic64_t aux_cnt[BLKG_RWSTAT_NR]; +}; + +struct blkg_rwstat_sample { + u64 cnt[BLKG_RWSTAT_NR]; +}; + +static inline u64 blkg_rwstat_read_counter(struct blkg_rwstat *rwstat, + unsigned int idx) +{ + return atomic64_read(&rwstat->aux_cnt[idx]) + + percpu_counter_sum_positive(&rwstat->cpu_cnt[idx]); +} + +int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp); +void blkg_rwstat_exit(struct blkg_rwstat *rwstat); +u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, + const struct blkg_rwstat_sample *rwstat); +u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd, + int off); +void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol, + int off, struct blkg_rwstat_sample *sum); + + +/** + * blkg_rwstat_add - add a value to a blkg_rwstat + * @rwstat: target blkg_rwstat + * @op: REQ_OP and flags + * @val: value to add + * + * Add @val to @rwstat. The counters are chosen according to @rw. The + * caller is responsible for synchronizing calls to this function. + */ +static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, + unsigned int op, uint64_t val) +{ + struct percpu_counter *cnt; + + if (op_is_discard(op)) + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_DISCARD]; + else if (op_is_write(op)) + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_WRITE]; + else + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ]; + + percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH); + + if (op_is_sync(op)) + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC]; + else + cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC]; + + percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH); +} + +/** + * blkg_rwstat_read - read the current values of a blkg_rwstat + * @rwstat: blkg_rwstat to read + * + * Read the current snapshot of @rwstat and return it in the aux counts. + */ +static inline void blkg_rwstat_read(struct blkg_rwstat *rwstat, + struct blkg_rwstat_sample *result) +{ + int i; + + for (i = 0; i < BLKG_RWSTAT_NR; i++) + result->cnt[i] = + percpu_counter_sum_positive(&rwstat->cpu_cnt[i]); +} + +/** + * blkg_rwstat_total - read the total count of a blkg_rwstat + * @rwstat: blkg_rwstat to read + * + * Return the total count of @rwstat regardless of the IO direction. This + * function can be called without synchronization and takes care of u64 + * atomicity. + */ +static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat) +{ + struct blkg_rwstat_sample tmp = { }; + + blkg_rwstat_read(rwstat, &tmp); + return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]; +} + +/** + * blkg_rwstat_reset - reset a blkg_rwstat + * @rwstat: blkg_rwstat to reset + */ +static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat) +{ + int i; + + for (i = 0; i < BLKG_RWSTAT_NR; i++) { + percpu_counter_set(&rwstat->cpu_cnt[i], 0); + atomic64_set(&rwstat->aux_cnt[i], 0); + } +} + +/** + * blkg_rwstat_add_aux - add a blkg_rwstat into another's aux count + * @to: the destination blkg_rwstat + * @from: the source + * + * Add @from's count including the aux one to @to's aux count. + */ +static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to, + struct blkg_rwstat *from) +{ + u64 sum[BLKG_RWSTAT_NR]; + int i; + + for (i = 0; i < BLKG_RWSTAT_NR; i++) + sum[i] = percpu_counter_sum_positive(&from->cpu_cnt[i]); + + for (i = 0; i < BLKG_RWSTAT_NR; i++) + atomic64_add(sum[i] + atomic64_read(&from->aux_cnt[i]), + &to->aux_cnt[i]); +} +#endif /* _BLK_CGROUP_RWSTAT_H */ diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c new file mode 100644 index 000000000..c623632c1 --- /dev/null +++ b/block/blk-cgroup.c @@ -0,0 +1,1950 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Common Block IO controller cgroup interface + * + * Based on ideas and code from CFQ, CFS and BFQ: + * Copyright (C) 2003 Jens Axboe + * + * Copyright (C) 2008 Fabio Checconi + * Paolo Valente + * + * Copyright (C) 2009 Vivek Goyal + * Nauman Rafique + * + * For policy-specific per-blkcg data: + * Copyright (C) 2015 Paolo Valente + * Arianna Avanzini + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "blk.h" + +#define MAX_KEY_LEN 100 + +/* + * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation. + * blkcg_pol_register_mutex nests outside of it and synchronizes entire + * policy [un]register operations including cgroup file additions / + * removals. Putting cgroup file registration outside blkcg_pol_mutex + * allows grabbing it from cgroup callbacks. + */ +static DEFINE_MUTEX(blkcg_pol_register_mutex); +static DEFINE_MUTEX(blkcg_pol_mutex); + +struct blkcg blkcg_root; +EXPORT_SYMBOL_GPL(blkcg_root); + +struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css; +EXPORT_SYMBOL_GPL(blkcg_root_css); + +static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; + +static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */ + +bool blkcg_debug_stats = false; +static struct workqueue_struct *blkcg_punt_bio_wq; + +static bool blkcg_policy_enabled(struct request_queue *q, + const struct blkcg_policy *pol) +{ + return pol && test_bit(pol->plid, q->blkcg_pols); +} + +/** + * blkg_free - free a blkg + * @blkg: blkg to free + * + * Free @blkg which may be partially allocated. + */ +static void blkg_free(struct blkcg_gq *blkg) +{ + int i; + + if (!blkg) + return; + + for (i = 0; i < BLKCG_MAX_POLS; i++) + if (blkg->pd[i]) + blkcg_policy[i]->pd_free_fn(blkg->pd[i]); + + free_percpu(blkg->iostat_cpu); + percpu_ref_exit(&blkg->refcnt); + kfree(blkg); +} + +static void __blkg_release(struct rcu_head *rcu) +{ + struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head); + + WARN_ON(!bio_list_empty(&blkg->async_bios)); + + /* release the blkcg and parent blkg refs this blkg has been holding */ + css_put(&blkg->blkcg->css); + if (blkg->parent) + blkg_put(blkg->parent); + blkg_free(blkg); +} + +/* + * A group is RCU protected, but having an rcu lock does not mean that one + * can access all the fields of blkg and assume these are valid. For + * example, don't try to follow throtl_data and request queue links. + * + * Having a reference to blkg under an rcu allows accesses to only values + * local to groups like group stats and group rate limits. + */ +static void blkg_release(struct percpu_ref *ref) +{ + struct blkcg_gq *blkg = container_of(ref, struct blkcg_gq, refcnt); + + call_rcu(&blkg->rcu_head, __blkg_release); +} + +static void blkg_async_bio_workfn(struct work_struct *work) +{ + struct blkcg_gq *blkg = container_of(work, struct blkcg_gq, + async_bio_work); + struct bio_list bios = BIO_EMPTY_LIST; + struct bio *bio; + struct blk_plug plug; + bool need_plug = false; + + /* as long as there are pending bios, @blkg can't go away */ + spin_lock_bh(&blkg->async_bio_lock); + bio_list_merge(&bios, &blkg->async_bios); + bio_list_init(&blkg->async_bios); + spin_unlock_bh(&blkg->async_bio_lock); + + /* start plug only when bio_list contains at least 2 bios */ + if (bios.head && bios.head->bi_next) { + need_plug = true; + blk_start_plug(&plug); + } + while ((bio = bio_list_pop(&bios))) + submit_bio(bio); + if (need_plug) + blk_finish_plug(&plug); +} + +/** + * blkg_alloc - allocate a blkg + * @blkcg: block cgroup the new blkg is associated with + * @q: request_queue the new blkg is associated with + * @gfp_mask: allocation mask to use + * + * Allocate a new blkg assocating @blkcg and @q. + */ +static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, + gfp_t gfp_mask) +{ + struct blkcg_gq *blkg; + int i, cpu; + + /* alloc and init base part */ + blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node); + if (!blkg) + return NULL; + + if (percpu_ref_init(&blkg->refcnt, blkg_release, 0, gfp_mask)) + goto err_free; + + blkg->iostat_cpu = alloc_percpu_gfp(struct blkg_iostat_set, gfp_mask); + if (!blkg->iostat_cpu) + goto err_free; + + blkg->q = q; + INIT_LIST_HEAD(&blkg->q_node); + spin_lock_init(&blkg->async_bio_lock); + bio_list_init(&blkg->async_bios); + INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn); + blkg->blkcg = blkcg; + + u64_stats_init(&blkg->iostat.sync); + for_each_possible_cpu(cpu) + u64_stats_init(&per_cpu_ptr(blkg->iostat_cpu, cpu)->sync); + + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + struct blkg_policy_data *pd; + + if (!blkcg_policy_enabled(q, pol)) + continue; + + /* alloc per-policy data and attach it to blkg */ + pd = pol->pd_alloc_fn(gfp_mask, q, blkcg); + if (!pd) + goto err_free; + + blkg->pd[i] = pd; + pd->blkg = blkg; + pd->plid = i; + } + + return blkg; + +err_free: + blkg_free(blkg); + return NULL; +} + +struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, + struct request_queue *q, bool update_hint) +{ + struct blkcg_gq *blkg; + + /* + * Hint didn't match. Look up from the radix tree. Note that the + * hint can only be updated under queue_lock as otherwise @blkg + * could have already been removed from blkg_tree. The caller is + * responsible for grabbing queue_lock if @update_hint. + */ + blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id); + if (blkg && blkg->q == q) { + if (update_hint) { + lockdep_assert_held(&q->queue_lock); + rcu_assign_pointer(blkcg->blkg_hint, blkg); + } + return blkg; + } + + return NULL; +} +EXPORT_SYMBOL_GPL(blkg_lookup_slowpath); + +/* + * If @new_blkg is %NULL, this function tries to allocate a new one as + * necessary using %GFP_NOWAIT. @new_blkg is always consumed on return. + */ +static struct blkcg_gq *blkg_create(struct blkcg *blkcg, + struct request_queue *q, + struct blkcg_gq *new_blkg) +{ + struct blkcg_gq *blkg; + int i, ret; + + WARN_ON_ONCE(!rcu_read_lock_held()); + lockdep_assert_held(&q->queue_lock); + + /* request_queue is dying, do not create/recreate a blkg */ + if (blk_queue_dying(q)) { + ret = -ENODEV; + goto err_free_blkg; + } + + /* blkg holds a reference to blkcg */ + if (!css_tryget_online(&blkcg->css)) { + ret = -ENODEV; + goto err_free_blkg; + } + + /* allocate */ + if (!new_blkg) { + new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN); + if (unlikely(!new_blkg)) { + ret = -ENOMEM; + goto err_put_css; + } + } + blkg = new_blkg; + + /* link parent */ + if (blkcg_parent(blkcg)) { + blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false); + if (WARN_ON_ONCE(!blkg->parent)) { + ret = -ENODEV; + goto err_put_css; + } + blkg_get(blkg->parent); + } + + /* invoke per-policy init */ + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + + if (blkg->pd[i] && pol->pd_init_fn) + pol->pd_init_fn(blkg->pd[i]); + } + + /* insert */ + spin_lock(&blkcg->lock); + ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg); + if (likely(!ret)) { + hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); + list_add(&blkg->q_node, &q->blkg_list); + + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + + if (blkg->pd[i] && pol->pd_online_fn) + pol->pd_online_fn(blkg->pd[i]); + } + } + blkg->online = true; + spin_unlock(&blkcg->lock); + + if (!ret) + return blkg; + + /* @blkg failed fully initialized, use the usual release path */ + blkg_put(blkg); + return ERR_PTR(ret); + +err_put_css: + css_put(&blkcg->css); +err_free_blkg: + blkg_free(new_blkg); + return ERR_PTR(ret); +} + +/** + * blkg_lookup_create - lookup blkg, try to create one if not there + * @blkcg: blkcg of interest + * @q: request_queue of interest + * + * Lookup blkg for the @blkcg - @q pair. If it doesn't exist, try to + * create one. blkg creation is performed recursively from blkcg_root such + * that all non-root blkg's have access to the parent blkg. This function + * should be called under RCU read lock and takes @q->queue_lock. + * + * Returns the blkg or the closest blkg if blkg_create() fails as it walks + * down from root. + */ +static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, + struct request_queue *q) +{ + struct blkcg_gq *blkg; + unsigned long flags; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + blkg = blkg_lookup(blkcg, q); + if (blkg) + return blkg; + + spin_lock_irqsave(&q->queue_lock, flags); + blkg = __blkg_lookup(blkcg, q, true); + if (blkg) + goto found; + + /* + * Create blkgs walking down from blkcg_root to @blkcg, so that all + * non-root blkgs have access to their parents. Returns the closest + * blkg to the intended blkg should blkg_create() fail. + */ + while (true) { + struct blkcg *pos = blkcg; + struct blkcg *parent = blkcg_parent(blkcg); + struct blkcg_gq *ret_blkg = q->root_blkg; + + while (parent) { + blkg = __blkg_lookup(parent, q, false); + if (blkg) { + /* remember closest blkg */ + ret_blkg = blkg; + break; + } + pos = parent; + parent = blkcg_parent(parent); + } + + blkg = blkg_create(pos, q, NULL); + if (IS_ERR(blkg)) { + blkg = ret_blkg; + break; + } + if (pos == blkcg) + break; + } + +found: + spin_unlock_irqrestore(&q->queue_lock, flags); + return blkg; +} + +static void blkg_destroy(struct blkcg_gq *blkg) +{ + struct blkcg *blkcg = blkg->blkcg; + int i; + + lockdep_assert_held(&blkg->q->queue_lock); + lockdep_assert_held(&blkcg->lock); + + /* Something wrong if we are trying to remove same group twice */ + WARN_ON_ONCE(list_empty(&blkg->q_node)); + WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node)); + + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + + if (blkg->pd[i] && pol->pd_offline_fn) + pol->pd_offline_fn(blkg->pd[i]); + } + + blkg->online = false; + + radix_tree_delete(&blkcg->blkg_tree, blkg->q->id); + list_del_init(&blkg->q_node); + hlist_del_init_rcu(&blkg->blkcg_node); + + /* + * Both setting lookup hint to and clearing it from @blkg are done + * under queue_lock. If it's not pointing to @blkg now, it never + * will. Hint assignment itself can race safely. + */ + if (rcu_access_pointer(blkcg->blkg_hint) == blkg) + rcu_assign_pointer(blkcg->blkg_hint, NULL); + + /* + * Put the reference taken at the time of creation so that when all + * queues are gone, group can be destroyed. + */ + percpu_ref_kill(&blkg->refcnt); +} + +/** + * blkg_destroy_all - destroy all blkgs associated with a request_queue + * @q: request_queue of interest + * + * Destroy all blkgs associated with @q. + */ +static void blkg_destroy_all(struct request_queue *q) +{ + struct blkcg_gq *blkg, *n; + + spin_lock_irq(&q->queue_lock); + list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) { + struct blkcg *blkcg = blkg->blkcg; + + spin_lock(&blkcg->lock); + blkg_destroy(blkg); + spin_unlock(&blkcg->lock); + } + + q->root_blkg = NULL; + spin_unlock_irq(&q->queue_lock); +} + +static int blkcg_reset_stats(struct cgroup_subsys_state *css, + struct cftype *cftype, u64 val) +{ + struct blkcg *blkcg = css_to_blkcg(css); + struct blkcg_gq *blkg; + int i, cpu; + + mutex_lock(&blkcg_pol_mutex); + spin_lock_irq(&blkcg->lock); + + /* + * Note that stat reset is racy - it doesn't synchronize against + * stat updates. This is a debug feature which shouldn't exist + * anyway. If you get hit by a race, retry. + */ + hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { + for_each_possible_cpu(cpu) { + struct blkg_iostat_set *bis = + per_cpu_ptr(blkg->iostat_cpu, cpu); + memset(bis, 0, sizeof(*bis)); + } + memset(&blkg->iostat, 0, sizeof(blkg->iostat)); + + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + + if (blkg->pd[i] && pol->pd_reset_stats_fn) + pol->pd_reset_stats_fn(blkg->pd[i]); + } + } + + spin_unlock_irq(&blkcg->lock); + mutex_unlock(&blkcg_pol_mutex); + return 0; +} + +const char *blkg_dev_name(struct blkcg_gq *blkg) +{ + /* some drivers (floppy) instantiate a queue w/o disk registered */ + if (blkg->q->backing_dev_info->dev) + return bdi_dev_name(blkg->q->backing_dev_info); + return NULL; +} + +/** + * blkcg_print_blkgs - helper for printing per-blkg data + * @sf: seq_file to print to + * @blkcg: blkcg of interest + * @prfill: fill function to print out a blkg + * @pol: policy in question + * @data: data to be passed to @prfill + * @show_total: to print out sum of prfill return values or not + * + * This function invokes @prfill on each blkg of @blkcg if pd for the + * policy specified by @pol exists. @prfill is invoked with @sf, the + * policy data and @data and the matching queue lock held. If @show_total + * is %true, the sum of the return values from @prfill is printed with + * "Total" label at the end. + * + * This is to be used to construct print functions for + * cftype->read_seq_string method. + */ +void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, + u64 (*prfill)(struct seq_file *, + struct blkg_policy_data *, int), + const struct blkcg_policy *pol, int data, + bool show_total) +{ + struct blkcg_gq *blkg; + u64 total = 0; + + rcu_read_lock(); + hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { + spin_lock_irq(&blkg->q->queue_lock); + if (blkcg_policy_enabled(blkg->q, pol)) + total += prfill(sf, blkg->pd[pol->plid], data); + spin_unlock_irq(&blkg->q->queue_lock); + } + rcu_read_unlock(); + + if (show_total) + seq_printf(sf, "Total %llu\n", (unsigned long long)total); +} +EXPORT_SYMBOL_GPL(blkcg_print_blkgs); + +/** + * __blkg_prfill_u64 - prfill helper for a single u64 value + * @sf: seq_file to print to + * @pd: policy private data of interest + * @v: value to print + * + * Print @v to @sf for the device assocaited with @pd. + */ +u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v) +{ + const char *dname = blkg_dev_name(pd->blkg); + + if (!dname) + return 0; + + seq_printf(sf, "%s %llu\n", dname, (unsigned long long)v); + return v; +} +EXPORT_SYMBOL_GPL(__blkg_prfill_u64); + +/* Performs queue bypass and policy enabled checks then looks up blkg. */ +static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg, + const struct blkcg_policy *pol, + struct request_queue *q) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + lockdep_assert_held(&q->queue_lock); + + if (!blkcg_policy_enabled(q, pol)) + return ERR_PTR(-EOPNOTSUPP); + return __blkg_lookup(blkcg, q, true /* update_hint */); +} + +/** + * blkg_conf_prep - parse and prepare for per-blkg config update + * @inputp: input string pointer + * + * Parse the device node prefix part, MAJ:MIN, of per-blkg config update + * from @input and get and return the matching gendisk. *@inputp is + * updated to point past the device node prefix. Returns an ERR_PTR() + * value on error. + * + * Use this function iff blkg_conf_prep() can't be used for some reason. + */ +struct gendisk *blkcg_conf_get_disk(char **inputp) +{ + char *input = *inputp; + unsigned int major, minor; + struct gendisk *disk; + int key_len, part; + + if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2) + return ERR_PTR(-EINVAL); + + input += key_len; + if (!isspace(*input)) + return ERR_PTR(-EINVAL); + input = skip_spaces(input); + + disk = get_gendisk(MKDEV(major, minor), &part); + if (!disk) + return ERR_PTR(-ENODEV); + if (part) { + put_disk_and_module(disk); + return ERR_PTR(-ENODEV); + } + + *inputp = input; + return disk; +} + +/** + * blkg_conf_prep - parse and prepare for per-blkg config update + * @blkcg: target block cgroup + * @pol: target policy + * @input: input string + * @ctx: blkg_conf_ctx to be filled + * + * Parse per-blkg config update from @input and initialize @ctx with the + * result. @ctx->blkg points to the blkg to be updated and @ctx->body the + * part of @input following MAJ:MIN. This function returns with RCU read + * lock and queue lock held and must be paired with blkg_conf_finish(). + */ +int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, + char *input, struct blkg_conf_ctx *ctx) + __acquires(rcu) __acquires(&disk->queue->queue_lock) +{ + struct gendisk *disk; + struct request_queue *q; + struct blkcg_gq *blkg; + int ret; + + disk = blkcg_conf_get_disk(&input); + if (IS_ERR(disk)) + return PTR_ERR(disk); + + q = disk->queue; + + rcu_read_lock(); + spin_lock_irq(&q->queue_lock); + + blkg = blkg_lookup_check(blkcg, pol, q); + if (IS_ERR(blkg)) { + ret = PTR_ERR(blkg); + goto fail_unlock; + } + + if (blkg) + goto success; + + /* + * Create blkgs walking down from blkcg_root to @blkcg, so that all + * non-root blkgs have access to their parents. + */ + while (true) { + struct blkcg *pos = blkcg; + struct blkcg *parent; + struct blkcg_gq *new_blkg; + + parent = blkcg_parent(blkcg); + while (parent && !__blkg_lookup(parent, q, false)) { + pos = parent; + parent = blkcg_parent(parent); + } + + /* Drop locks to do new blkg allocation with GFP_KERNEL. */ + spin_unlock_irq(&q->queue_lock); + rcu_read_unlock(); + + new_blkg = blkg_alloc(pos, q, GFP_KERNEL); + if (unlikely(!new_blkg)) { + ret = -ENOMEM; + goto fail; + } + + if (radix_tree_preload(GFP_KERNEL)) { + blkg_free(new_blkg); + ret = -ENOMEM; + goto fail; + } + + rcu_read_lock(); + spin_lock_irq(&q->queue_lock); + + blkg = blkg_lookup_check(pos, pol, q); + if (IS_ERR(blkg)) { + ret = PTR_ERR(blkg); + blkg_free(new_blkg); + goto fail_preloaded; + } + + if (blkg) { + blkg_free(new_blkg); + } else { + blkg = blkg_create(pos, q, new_blkg); + if (IS_ERR(blkg)) { + ret = PTR_ERR(blkg); + goto fail_preloaded; + } + } + + radix_tree_preload_end(); + + if (pos == blkcg) + goto success; + } +success: + ctx->disk = disk; + ctx->blkg = blkg; + ctx->body = input; + return 0; + +fail_preloaded: + radix_tree_preload_end(); +fail_unlock: + spin_unlock_irq(&q->queue_lock); + rcu_read_unlock(); +fail: + put_disk_and_module(disk); + /* + * If queue was bypassing, we should retry. Do so after a + * short msleep(). It isn't strictly necessary but queue + * can be bypassing for some time and it's always nice to + * avoid busy looping. + */ + if (ret == -EBUSY) { + msleep(10); + ret = restart_syscall(); + } + return ret; +} +EXPORT_SYMBOL_GPL(blkg_conf_prep); + +/** + * blkg_conf_finish - finish up per-blkg config update + * @ctx: blkg_conf_ctx intiailized by blkg_conf_prep() + * + * Finish up after per-blkg config update. This function must be paired + * with blkg_conf_prep(). + */ +void blkg_conf_finish(struct blkg_conf_ctx *ctx) + __releases(&ctx->disk->queue->queue_lock) __releases(rcu) +{ + spin_unlock_irq(&ctx->disk->queue->queue_lock); + rcu_read_unlock(); + put_disk_and_module(ctx->disk); +} +EXPORT_SYMBOL_GPL(blkg_conf_finish); + +static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src) +{ + int i; + + for (i = 0; i < BLKG_IOSTAT_NR; i++) { + dst->bytes[i] = src->bytes[i]; + dst->ios[i] = src->ios[i]; + } +} + +static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src) +{ + int i; + + for (i = 0; i < BLKG_IOSTAT_NR; i++) { + dst->bytes[i] += src->bytes[i]; + dst->ios[i] += src->ios[i]; + } +} + +static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src) +{ + int i; + + for (i = 0; i < BLKG_IOSTAT_NR; i++) { + dst->bytes[i] -= src->bytes[i]; + dst->ios[i] -= src->ios[i]; + } +} + +static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) +{ + struct blkcg *blkcg = css_to_blkcg(css); + struct blkcg_gq *blkg; + + rcu_read_lock(); + + hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { + struct blkcg_gq *parent = blkg->parent; + struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu); + struct blkg_iostat cur, delta; + unsigned int seq; + + /* fetch the current per-cpu values */ + do { + seq = u64_stats_fetch_begin(&bisc->sync); + blkg_iostat_set(&cur, &bisc->cur); + } while (u64_stats_fetch_retry(&bisc->sync, seq)); + + /* propagate percpu delta to global */ + u64_stats_update_begin(&blkg->iostat.sync); + blkg_iostat_set(&delta, &cur); + blkg_iostat_sub(&delta, &bisc->last); + blkg_iostat_add(&blkg->iostat.cur, &delta); + blkg_iostat_add(&bisc->last, &delta); + u64_stats_update_end(&blkg->iostat.sync); + + /* propagate global delta to parent */ + if (parent) { + u64_stats_update_begin(&parent->iostat.sync); + blkg_iostat_set(&delta, &blkg->iostat.cur); + blkg_iostat_sub(&delta, &blkg->iostat.last); + blkg_iostat_add(&parent->iostat.cur, &delta); + blkg_iostat_add(&blkg->iostat.last, &delta); + u64_stats_update_end(&parent->iostat.sync); + } + } + + rcu_read_unlock(); +} + +/* + * The rstat algorithms intentionally don't handle the root cgroup to avoid + * incurring overhead when no cgroups are defined. For that reason, + * cgroup_rstat_flush in blkcg_print_stat does not actually fill out the + * iostat in the root cgroup's blkcg_gq. + * + * However, we would like to re-use the printing code between the root and + * non-root cgroups to the extent possible. For that reason, we simulate + * flushing the root cgroup's stats by explicitly filling in the iostat + * with disk level statistics. + */ +static void blkcg_fill_root_iostats(void) +{ + struct class_dev_iter iter; + struct device *dev; + + class_dev_iter_init(&iter, &block_class, NULL, &disk_type); + while ((dev = class_dev_iter_next(&iter))) { + struct gendisk *disk = dev_to_disk(dev); + struct hd_struct *part = disk_get_part(disk, 0); + struct blkcg_gq *blkg = blk_queue_root_blkg(disk->queue); + struct blkg_iostat tmp; + int cpu; + + memset(&tmp, 0, sizeof(tmp)); + for_each_possible_cpu(cpu) { + struct disk_stats *cpu_dkstats; + + cpu_dkstats = per_cpu_ptr(part->dkstats, cpu); + tmp.ios[BLKG_IOSTAT_READ] += + cpu_dkstats->ios[STAT_READ]; + tmp.ios[BLKG_IOSTAT_WRITE] += + cpu_dkstats->ios[STAT_WRITE]; + tmp.ios[BLKG_IOSTAT_DISCARD] += + cpu_dkstats->ios[STAT_DISCARD]; + // convert sectors to bytes + tmp.bytes[BLKG_IOSTAT_READ] += + cpu_dkstats->sectors[STAT_READ] << 9; + tmp.bytes[BLKG_IOSTAT_WRITE] += + cpu_dkstats->sectors[STAT_WRITE] << 9; + tmp.bytes[BLKG_IOSTAT_DISCARD] += + cpu_dkstats->sectors[STAT_DISCARD] << 9; + + u64_stats_update_begin(&blkg->iostat.sync); + blkg_iostat_set(&blkg->iostat.cur, &tmp); + u64_stats_update_end(&blkg->iostat.sync); + } + disk_put_part(part); + } +} + +static int blkcg_print_stat(struct seq_file *sf, void *v) +{ + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + struct blkcg_gq *blkg; + + if (!seq_css(sf)->parent) + blkcg_fill_root_iostats(); + else + cgroup_rstat_flush(blkcg->css.cgroup); + + rcu_read_lock(); + + hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { + struct blkg_iostat_set *bis = &blkg->iostat; + const char *dname; + char *buf; + u64 rbytes, wbytes, rios, wios, dbytes, dios; + size_t size = seq_get_buf(sf, &buf), off = 0; + int i; + bool has_stats = false; + unsigned seq; + + spin_lock_irq(&blkg->q->queue_lock); + + if (!blkg->online) + goto skip; + + dname = blkg_dev_name(blkg); + if (!dname) + goto skip; + + /* + * Hooray string manipulation, count is the size written NOT + * INCLUDING THE \0, so size is now count+1 less than what we + * had before, but we want to start writing the next bit from + * the \0 so we only add count to buf. + */ + off += scnprintf(buf+off, size-off, "%s ", dname); + + do { + seq = u64_stats_fetch_begin(&bis->sync); + + rbytes = bis->cur.bytes[BLKG_IOSTAT_READ]; + wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE]; + dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD]; + rios = bis->cur.ios[BLKG_IOSTAT_READ]; + wios = bis->cur.ios[BLKG_IOSTAT_WRITE]; + dios = bis->cur.ios[BLKG_IOSTAT_DISCARD]; + } while (u64_stats_fetch_retry(&bis->sync, seq)); + + if (rbytes || wbytes || rios || wios) { + has_stats = true; + off += scnprintf(buf+off, size-off, + "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu", + rbytes, wbytes, rios, wios, + dbytes, dios); + } + + if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) { + has_stats = true; + off += scnprintf(buf+off, size-off, + " use_delay=%d delay_nsec=%llu", + atomic_read(&blkg->use_delay), + (unsigned long long)atomic64_read(&blkg->delay_nsec)); + } + + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + size_t written; + + if (!blkg->pd[i] || !pol->pd_stat_fn) + continue; + + written = pol->pd_stat_fn(blkg->pd[i], buf+off, size-off); + if (written) + has_stats = true; + off += written; + } + + if (has_stats) { + if (off < size - 1) { + off += scnprintf(buf+off, size-off, "\n"); + seq_commit(sf, off); + } else { + seq_commit(sf, -1); + } + } + skip: + spin_unlock_irq(&blkg->q->queue_lock); + } + + rcu_read_unlock(); + return 0; +} + +static struct cftype blkcg_files[] = { + { + .name = "stat", + .seq_show = blkcg_print_stat, + }, + { } /* terminate */ +}; + +static struct cftype blkcg_legacy_files[] = { + { + .name = "reset_stats", + .write_u64 = blkcg_reset_stats, + }, + { } /* terminate */ +}; + +/* + * blkcg destruction is a three-stage process. + * + * 1. Destruction starts. The blkcg_css_offline() callback is invoked + * which offlines writeback. Here we tie the next stage of blkg destruction + * to the completion of writeback associated with the blkcg. This lets us + * avoid punting potentially large amounts of outstanding writeback to root + * while maintaining any ongoing policies. The next stage is triggered when + * the nr_cgwbs count goes to zero. + * + * 2. When the nr_cgwbs count goes to zero, blkcg_destroy_blkgs() is called + * and handles the destruction of blkgs. Here the css reference held by + * the blkg is put back eventually allowing blkcg_css_free() to be called. + * This work may occur in cgwb_release_workfn() on the cgwb_release + * workqueue. Any submitted ios that fail to get the blkg ref will be + * punted to the root_blkg. + * + * 3. Once the blkcg ref count goes to zero, blkcg_css_free() is called. + * This finally frees the blkcg. + */ + +/** + * blkcg_css_offline - cgroup css_offline callback + * @css: css of interest + * + * This function is called when @css is about to go away. Here the cgwbs are + * offlined first and only once writeback associated with the blkcg has + * finished do we start step 2 (see above). + */ +static void blkcg_css_offline(struct cgroup_subsys_state *css) +{ + struct blkcg *blkcg = css_to_blkcg(css); + + /* this prevents anyone from attaching or migrating to this blkcg */ + wb_blkcg_offline(blkcg); + + /* put the base online pin allowing step 2 to be triggered */ + blkcg_unpin_online(blkcg); +} + +/** + * blkcg_destroy_blkgs - responsible for shooting down blkgs + * @blkcg: blkcg of interest + * + * blkgs should be removed while holding both q and blkcg locks. As blkcg lock + * is nested inside q lock, this function performs reverse double lock dancing. + * Destroying the blkgs releases the reference held on the blkcg's css allowing + * blkcg_css_free to eventually be called. + * + * This is the blkcg counterpart of ioc_release_fn(). + */ +void blkcg_destroy_blkgs(struct blkcg *blkcg) +{ + might_sleep(); + + spin_lock_irq(&blkcg->lock); + + while (!hlist_empty(&blkcg->blkg_list)) { + struct blkcg_gq *blkg = hlist_entry(blkcg->blkg_list.first, + struct blkcg_gq, blkcg_node); + struct request_queue *q = blkg->q; + + if (need_resched() || !spin_trylock(&q->queue_lock)) { + /* + * Given that the system can accumulate a huge number + * of blkgs in pathological cases, check to see if we + * need to rescheduling to avoid softlockup. + */ + spin_unlock_irq(&blkcg->lock); + cond_resched(); + spin_lock_irq(&blkcg->lock); + continue; + } + + blkg_destroy(blkg); + spin_unlock(&q->queue_lock); + } + + spin_unlock_irq(&blkcg->lock); +} + +static void blkcg_css_free(struct cgroup_subsys_state *css) +{ + struct blkcg *blkcg = css_to_blkcg(css); + int i; + + mutex_lock(&blkcg_pol_mutex); + + list_del(&blkcg->all_blkcgs_node); + + for (i = 0; i < BLKCG_MAX_POLS; i++) + if (blkcg->cpd[i]) + blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]); + + mutex_unlock(&blkcg_pol_mutex); + + kfree(blkcg); +} + +static struct cgroup_subsys_state * +blkcg_css_alloc(struct cgroup_subsys_state *parent_css) +{ + struct blkcg *blkcg; + struct cgroup_subsys_state *ret; + int i; + + mutex_lock(&blkcg_pol_mutex); + + if (!parent_css) { + blkcg = &blkcg_root; + } else { + blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); + if (!blkcg) { + ret = ERR_PTR(-ENOMEM); + goto unlock; + } + } + + for (i = 0; i < BLKCG_MAX_POLS ; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + struct blkcg_policy_data *cpd; + + /* + * If the policy hasn't been attached yet, wait for it + * to be attached before doing anything else. Otherwise, + * check if the policy requires any specific per-cgroup + * data: if it does, allocate and initialize it. + */ + if (!pol || !pol->cpd_alloc_fn) + continue; + + cpd = pol->cpd_alloc_fn(GFP_KERNEL); + if (!cpd) { + ret = ERR_PTR(-ENOMEM); + goto free_pd_blkcg; + } + blkcg->cpd[i] = cpd; + cpd->blkcg = blkcg; + cpd->plid = i; + if (pol->cpd_init_fn) + pol->cpd_init_fn(cpd); + } + + spin_lock_init(&blkcg->lock); + refcount_set(&blkcg->online_pin, 1); + INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT | __GFP_NOWARN); + INIT_HLIST_HEAD(&blkcg->blkg_list); +#ifdef CONFIG_CGROUP_WRITEBACK + INIT_LIST_HEAD(&blkcg->cgwb_list); +#endif + list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs); + + mutex_unlock(&blkcg_pol_mutex); + return &blkcg->css; + +free_pd_blkcg: + for (i--; i >= 0; i--) + if (blkcg->cpd[i]) + blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]); + + if (blkcg != &blkcg_root) + kfree(blkcg); +unlock: + mutex_unlock(&blkcg_pol_mutex); + return ret; +} + +static int blkcg_css_online(struct cgroup_subsys_state *css) +{ + struct blkcg *blkcg = css_to_blkcg(css); + struct blkcg *parent = blkcg_parent(blkcg); + + /* + * blkcg_pin_online() is used to delay blkcg offline so that blkgs + * don't go offline while cgwbs are still active on them. Pin the + * parent so that offline always happens towards the root. + */ + if (parent) + blkcg_pin_online(parent); + return 0; +} + +/** + * blkcg_init_queue - initialize blkcg part of request queue + * @q: request_queue to initialize + * + * Called from blk_alloc_queue(). Responsible for initializing blkcg + * part of new request_queue @q. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int blkcg_init_queue(struct request_queue *q) +{ + struct blkcg_gq *new_blkg, *blkg; + bool preloaded; + int ret; + + new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL); + if (!new_blkg) + return -ENOMEM; + + preloaded = !radix_tree_preload(GFP_KERNEL); + + /* Make sure the root blkg exists. */ + rcu_read_lock(); + spin_lock_irq(&q->queue_lock); + blkg = blkg_create(&blkcg_root, q, new_blkg); + if (IS_ERR(blkg)) + goto err_unlock; + q->root_blkg = blkg; + spin_unlock_irq(&q->queue_lock); + rcu_read_unlock(); + + if (preloaded) + radix_tree_preload_end(); + + ret = blk_throtl_init(q); + if (ret) + goto err_destroy_all; + + ret = blk_iolatency_init(q); + if (ret) { + blk_throtl_exit(q); + goto err_destroy_all; + } + return 0; + +err_destroy_all: + blkg_destroy_all(q); + return ret; +err_unlock: + spin_unlock_irq(&q->queue_lock); + rcu_read_unlock(); + if (preloaded) + radix_tree_preload_end(); + return PTR_ERR(blkg); +} + +/** + * blkcg_exit_queue - exit and release blkcg part of request_queue + * @q: request_queue being released + * + * Called from blk_exit_queue(). Responsible for exiting blkcg part. + */ +void blkcg_exit_queue(struct request_queue *q) +{ + blkg_destroy_all(q); + blk_throtl_exit(q); +} + +/* + * We cannot support shared io contexts, as we have no mean to support + * two tasks with the same ioc in two different groups without major rework + * of the main cic data structures. For now we allow a task to change + * its cgroup only if it's the only owner of its ioc. + */ +static int blkcg_can_attach(struct cgroup_taskset *tset) +{ + struct task_struct *task; + struct cgroup_subsys_state *dst_css; + struct io_context *ioc; + int ret = 0; + + /* task_lock() is needed to avoid races with exit_io_context() */ + cgroup_taskset_for_each(task, dst_css, tset) { + task_lock(task); + ioc = task->io_context; + if (ioc && atomic_read(&ioc->nr_tasks) > 1) + ret = -EINVAL; + task_unlock(task); + if (ret) + break; + } + return ret; +} + +static void blkcg_bind(struct cgroup_subsys_state *root_css) +{ + int i; + + mutex_lock(&blkcg_pol_mutex); + + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + struct blkcg *blkcg; + + if (!pol || !pol->cpd_bind_fn) + continue; + + list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) + if (blkcg->cpd[pol->plid]) + pol->cpd_bind_fn(blkcg->cpd[pol->plid]); + } + mutex_unlock(&blkcg_pol_mutex); +} + +static void blkcg_exit(struct task_struct *tsk) +{ + if (tsk->throttle_queue) + blk_put_queue(tsk->throttle_queue); + tsk->throttle_queue = NULL; +} + +struct cgroup_subsys io_cgrp_subsys = { + .css_alloc = blkcg_css_alloc, + .css_online = blkcg_css_online, + .css_offline = blkcg_css_offline, + .css_free = blkcg_css_free, + .can_attach = blkcg_can_attach, + .css_rstat_flush = blkcg_rstat_flush, + .bind = blkcg_bind, + .dfl_cftypes = blkcg_files, + .legacy_cftypes = blkcg_legacy_files, + .legacy_name = "blkio", + .exit = blkcg_exit, +#ifdef CONFIG_MEMCG + /* + * This ensures that, if available, memcg is automatically enabled + * together on the default hierarchy so that the owner cgroup can + * be retrieved from writeback pages. + */ + .depends_on = 1 << memory_cgrp_id, +#endif +}; +EXPORT_SYMBOL_GPL(io_cgrp_subsys); + +/** + * blkcg_activate_policy - activate a blkcg policy on a request_queue + * @q: request_queue of interest + * @pol: blkcg policy to activate + * + * Activate @pol on @q. Requires %GFP_KERNEL context. @q goes through + * bypass mode to populate its blkgs with policy_data for @pol. + * + * Activation happens with @q bypassed, so nobody would be accessing blkgs + * from IO path. Update of each blkg is protected by both queue and blkcg + * locks so that holding either lock and testing blkcg_policy_enabled() is + * always enough for dereferencing policy data. + * + * The caller is responsible for synchronizing [de]activations and policy + * [un]registerations. Returns 0 on success, -errno on failure. + */ +int blkcg_activate_policy(struct request_queue *q, + const struct blkcg_policy *pol) +{ + struct blkg_policy_data *pd_prealloc = NULL; + struct blkcg_gq *blkg, *pinned_blkg = NULL; + int ret; + + if (blkcg_policy_enabled(q, pol)) + return 0; + + if (queue_is_mq(q)) + blk_mq_freeze_queue(q); +retry: + spin_lock_irq(&q->queue_lock); + + /* blkg_list is pushed at the head, reverse walk to allocate parents first */ + list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) { + struct blkg_policy_data *pd; + + if (blkg->pd[pol->plid]) + continue; + + /* If prealloc matches, use it; otherwise try GFP_NOWAIT */ + if (blkg == pinned_blkg) { + pd = pd_prealloc; + pd_prealloc = NULL; + } else { + pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q, + blkg->blkcg); + } + + if (!pd) { + /* + * GFP_NOWAIT failed. Free the existing one and + * prealloc for @blkg w/ GFP_KERNEL. + */ + if (pinned_blkg) + blkg_put(pinned_blkg); + blkg_get(blkg); + pinned_blkg = blkg; + + spin_unlock_irq(&q->queue_lock); + + if (pd_prealloc) + pol->pd_free_fn(pd_prealloc); + pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q, + blkg->blkcg); + if (pd_prealloc) + goto retry; + else + goto enomem; + } + + blkg->pd[pol->plid] = pd; + pd->blkg = blkg; + pd->plid = pol->plid; + } + + /* all allocated, init in the same order */ + if (pol->pd_init_fn) + list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) + pol->pd_init_fn(blkg->pd[pol->plid]); + + if (pol->pd_online_fn) + list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) + pol->pd_online_fn(blkg->pd[pol->plid]); + + __set_bit(pol->plid, q->blkcg_pols); + ret = 0; + + spin_unlock_irq(&q->queue_lock); +out: + if (queue_is_mq(q)) + blk_mq_unfreeze_queue(q); + if (pinned_blkg) + blkg_put(pinned_blkg); + if (pd_prealloc) + pol->pd_free_fn(pd_prealloc); + return ret; + +enomem: + /* alloc failed, nothing's initialized yet, free everything */ + spin_lock_irq(&q->queue_lock); + list_for_each_entry(blkg, &q->blkg_list, q_node) { + struct blkcg *blkcg = blkg->blkcg; + + spin_lock(&blkcg->lock); + if (blkg->pd[pol->plid]) { + pol->pd_free_fn(blkg->pd[pol->plid]); + blkg->pd[pol->plid] = NULL; + } + spin_unlock(&blkcg->lock); + } + spin_unlock_irq(&q->queue_lock); + ret = -ENOMEM; + goto out; +} +EXPORT_SYMBOL_GPL(blkcg_activate_policy); + +/** + * blkcg_deactivate_policy - deactivate a blkcg policy on a request_queue + * @q: request_queue of interest + * @pol: blkcg policy to deactivate + * + * Deactivate @pol on @q. Follows the same synchronization rules as + * blkcg_activate_policy(). + */ +void blkcg_deactivate_policy(struct request_queue *q, + const struct blkcg_policy *pol) +{ + struct blkcg_gq *blkg; + + if (!blkcg_policy_enabled(q, pol)) + return; + + if (queue_is_mq(q)) + blk_mq_freeze_queue(q); + + spin_lock_irq(&q->queue_lock); + + __clear_bit(pol->plid, q->blkcg_pols); + + list_for_each_entry(blkg, &q->blkg_list, q_node) { + struct blkcg *blkcg = blkg->blkcg; + + spin_lock(&blkcg->lock); + if (blkg->pd[pol->plid]) { + if (pol->pd_offline_fn) + pol->pd_offline_fn(blkg->pd[pol->plid]); + pol->pd_free_fn(blkg->pd[pol->plid]); + blkg->pd[pol->plid] = NULL; + } + spin_unlock(&blkcg->lock); + } + + spin_unlock_irq(&q->queue_lock); + + if (queue_is_mq(q)) + blk_mq_unfreeze_queue(q); +} +EXPORT_SYMBOL_GPL(blkcg_deactivate_policy); + +/** + * blkcg_policy_register - register a blkcg policy + * @pol: blkcg policy to register + * + * Register @pol with blkcg core. Might sleep and @pol may be modified on + * successful registration. Returns 0 on success and -errno on failure. + */ +int blkcg_policy_register(struct blkcg_policy *pol) +{ + struct blkcg *blkcg; + int i, ret; + + mutex_lock(&blkcg_pol_register_mutex); + mutex_lock(&blkcg_pol_mutex); + + /* find an empty slot */ + ret = -ENOSPC; + for (i = 0; i < BLKCG_MAX_POLS; i++) + if (!blkcg_policy[i]) + break; + if (i >= BLKCG_MAX_POLS) { + pr_warn("blkcg_policy_register: BLKCG_MAX_POLS too small\n"); + goto err_unlock; + } + + /* Make sure cpd/pd_alloc_fn and cpd/pd_free_fn in pairs */ + if ((!pol->cpd_alloc_fn ^ !pol->cpd_free_fn) || + (!pol->pd_alloc_fn ^ !pol->pd_free_fn)) + goto err_unlock; + + /* register @pol */ + pol->plid = i; + blkcg_policy[pol->plid] = pol; + + /* allocate and install cpd's */ + if (pol->cpd_alloc_fn) { + list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { + struct blkcg_policy_data *cpd; + + cpd = pol->cpd_alloc_fn(GFP_KERNEL); + if (!cpd) + goto err_free_cpds; + + blkcg->cpd[pol->plid] = cpd; + cpd->blkcg = blkcg; + cpd->plid = pol->plid; + if (pol->cpd_init_fn) + pol->cpd_init_fn(cpd); + } + } + + mutex_unlock(&blkcg_pol_mutex); + + /* everything is in place, add intf files for the new policy */ + if (pol->dfl_cftypes) + WARN_ON(cgroup_add_dfl_cftypes(&io_cgrp_subsys, + pol->dfl_cftypes)); + if (pol->legacy_cftypes) + WARN_ON(cgroup_add_legacy_cftypes(&io_cgrp_subsys, + pol->legacy_cftypes)); + mutex_unlock(&blkcg_pol_register_mutex); + return 0; + +err_free_cpds: + if (pol->cpd_free_fn) { + list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { + if (blkcg->cpd[pol->plid]) { + pol->cpd_free_fn(blkcg->cpd[pol->plid]); + blkcg->cpd[pol->plid] = NULL; + } + } + } + blkcg_policy[pol->plid] = NULL; +err_unlock: + mutex_unlock(&blkcg_pol_mutex); + mutex_unlock(&blkcg_pol_register_mutex); + return ret; +} +EXPORT_SYMBOL_GPL(blkcg_policy_register); + +/** + * blkcg_policy_unregister - unregister a blkcg policy + * @pol: blkcg policy to unregister + * + * Undo blkcg_policy_register(@pol). Might sleep. + */ +void blkcg_policy_unregister(struct blkcg_policy *pol) +{ + struct blkcg *blkcg; + + mutex_lock(&blkcg_pol_register_mutex); + + if (WARN_ON(blkcg_policy[pol->plid] != pol)) + goto out_unlock; + + /* kill the intf files first */ + if (pol->dfl_cftypes) + cgroup_rm_cftypes(pol->dfl_cftypes); + if (pol->legacy_cftypes) + cgroup_rm_cftypes(pol->legacy_cftypes); + + /* remove cpds and unregister */ + mutex_lock(&blkcg_pol_mutex); + + if (pol->cpd_free_fn) { + list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { + if (blkcg->cpd[pol->plid]) { + pol->cpd_free_fn(blkcg->cpd[pol->plid]); + blkcg->cpd[pol->plid] = NULL; + } + } + } + blkcg_policy[pol->plid] = NULL; + + mutex_unlock(&blkcg_pol_mutex); +out_unlock: + mutex_unlock(&blkcg_pol_register_mutex); +} +EXPORT_SYMBOL_GPL(blkcg_policy_unregister); + +bool __blkcg_punt_bio_submit(struct bio *bio) +{ + struct blkcg_gq *blkg = bio->bi_blkg; + + /* consume the flag first */ + bio->bi_opf &= ~REQ_CGROUP_PUNT; + + /* never bounce for the root cgroup */ + if (!blkg->parent) + return false; + + spin_lock_bh(&blkg->async_bio_lock); + bio_list_add(&blkg->async_bios, bio); + spin_unlock_bh(&blkg->async_bio_lock); + + queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work); + return true; +} + +/* + * Scale the accumulated delay based on how long it has been since we updated + * the delay. We only call this when we are adding delay, in case it's been a + * while since we added delay, and when we are checking to see if we need to + * delay a task, to account for any delays that may have occurred. + */ +static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now) +{ + u64 old = atomic64_read(&blkg->delay_start); + + /* negative use_delay means no scaling, see blkcg_set_delay() */ + if (atomic_read(&blkg->use_delay) < 0) + return; + + /* + * We only want to scale down every second. The idea here is that we + * want to delay people for min(delay_nsec, NSEC_PER_SEC) in a certain + * time window. We only want to throttle tasks for recent delay that + * has occurred, in 1 second time windows since that's the maximum + * things can be throttled. We save the current delay window in + * blkg->last_delay so we know what amount is still left to be charged + * to the blkg from this point onward. blkg->last_use keeps track of + * the use_delay counter. The idea is if we're unthrottling the blkg we + * are ok with whatever is happening now, and we can take away more of + * the accumulated delay as we've already throttled enough that + * everybody is happy with their IO latencies. + */ + if (time_before64(old + NSEC_PER_SEC, now) && + atomic64_cmpxchg(&blkg->delay_start, old, now) == old) { + u64 cur = atomic64_read(&blkg->delay_nsec); + u64 sub = min_t(u64, blkg->last_delay, now - old); + int cur_use = atomic_read(&blkg->use_delay); + + /* + * We've been unthrottled, subtract a larger chunk of our + * accumulated delay. + */ + if (cur_use < blkg->last_use) + sub = max_t(u64, sub, blkg->last_delay >> 1); + + /* + * This shouldn't happen, but handle it anyway. Our delay_nsec + * should only ever be growing except here where we subtract out + * min(last_delay, 1 second), but lord knows bugs happen and I'd + * rather not end up with negative numbers. + */ + if (unlikely(cur < sub)) { + atomic64_set(&blkg->delay_nsec, 0); + blkg->last_delay = 0; + } else { + atomic64_sub(sub, &blkg->delay_nsec); + blkg->last_delay = cur - sub; + } + blkg->last_use = cur_use; + } +} + +/* + * This is called when we want to actually walk up the hierarchy and check to + * see if we need to throttle, and then actually throttle if there is some + * accumulated delay. This should only be called upon return to user space so + * we're not holding some lock that would induce a priority inversion. + */ +static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) +{ + unsigned long pflags; + bool clamp; + u64 now = ktime_to_ns(ktime_get()); + u64 exp; + u64 delay_nsec = 0; + int tok; + + while (blkg->parent) { + int use_delay = atomic_read(&blkg->use_delay); + + if (use_delay) { + u64 this_delay; + + blkcg_scale_delay(blkg, now); + this_delay = atomic64_read(&blkg->delay_nsec); + if (this_delay > delay_nsec) { + delay_nsec = this_delay; + clamp = use_delay > 0; + } + } + blkg = blkg->parent; + } + + if (!delay_nsec) + return; + + /* + * Let's not sleep for all eternity if we've amassed a huge delay. + * Swapping or metadata IO can accumulate 10's of seconds worth of + * delay, and we want userspace to be able to do _something_ so cap the + * delays at 0.25s. If there's 10's of seconds worth of delay then the + * tasks will be delayed for 0.25 second for every syscall. If + * blkcg_set_delay() was used as indicated by negative use_delay, the + * caller is responsible for regulating the range. + */ + if (clamp) + delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC); + + if (use_memdelay) + psi_memstall_enter(&pflags); + + exp = ktime_add_ns(now, delay_nsec); + tok = io_schedule_prepare(); + do { + __set_current_state(TASK_KILLABLE); + if (!schedule_hrtimeout(&exp, HRTIMER_MODE_ABS)) + break; + } while (!fatal_signal_pending(current)); + io_schedule_finish(tok); + + if (use_memdelay) + psi_memstall_leave(&pflags); +} + +/** + * blkcg_maybe_throttle_current - throttle the current task if it has been marked + * + * This is only called if we've been marked with set_notify_resume(). Obviously + * we can be set_notify_resume() for reasons other than blkcg throttling, so we + * check to see if current->throttle_queue is set and if not this doesn't do + * anything. This should only ever be called by the resume code, it's not meant + * to be called by people willy-nilly as it will actually do the work to + * throttle the task if it is setup for throttling. + */ +void blkcg_maybe_throttle_current(void) +{ + struct request_queue *q = current->throttle_queue; + struct cgroup_subsys_state *css; + struct blkcg *blkcg; + struct blkcg_gq *blkg; + bool use_memdelay = current->use_memdelay; + + if (!q) + return; + + current->throttle_queue = NULL; + current->use_memdelay = false; + + rcu_read_lock(); + css = kthread_blkcg(); + if (css) + blkcg = css_to_blkcg(css); + else + blkcg = css_to_blkcg(task_css(current, io_cgrp_id)); + + if (!blkcg) + goto out; + blkg = blkg_lookup(blkcg, q); + if (!blkg) + goto out; + if (!blkg_tryget(blkg)) + goto out; + rcu_read_unlock(); + + blkcg_maybe_throttle_blkg(blkg, use_memdelay); + blkg_put(blkg); + blk_put_queue(q); + return; +out: + rcu_read_unlock(); + blk_put_queue(q); +} + +/** + * blkcg_schedule_throttle - this task needs to check for throttling + * @q: the request queue IO was submitted on + * @use_memdelay: do we charge this to memory delay for PSI + * + * This is called by the IO controller when we know there's delay accumulated + * for the blkg for this task. We do not pass the blkg because there are places + * we call this that may not have that information, the swapping code for + * instance will only have a request_queue at that point. This set's the + * notify_resume for the task to check and see if it requires throttling before + * returning to user space. + * + * We will only schedule once per syscall. You can call this over and over + * again and it will only do the check once upon return to user space, and only + * throttle once. If the task needs to be throttled again it'll need to be + * re-set at the next time we see the task. + */ +void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) +{ + if (unlikely(current->flags & PF_KTHREAD)) + return; + + if (!blk_get_queue(q)) + return; + + if (current->throttle_queue) + blk_put_queue(current->throttle_queue); + current->throttle_queue = q; + if (use_memdelay) + current->use_memdelay = use_memdelay; + set_notify_resume(current); +} + +/** + * blkcg_add_delay - add delay to this blkg + * @blkg: blkg of interest + * @now: the current time in nanoseconds + * @delta: how many nanoseconds of delay to add + * + * Charge @delta to the blkg's current delay accumulation. This is used to + * throttle tasks if an IO controller thinks we need more throttling. + */ +void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta) +{ + if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0)) + return; + blkcg_scale_delay(blkg, now); + atomic64_add(delta, &blkg->delay_nsec); +} + +/** + * blkg_tryget_closest - try and get a blkg ref on the closet blkg + * @bio: target bio + * @css: target css + * + * As the failure mode here is to walk up the blkg tree, this ensure that the + * blkg->parent pointers are always valid. This returns the blkg that it ended + * up taking a reference on or %NULL if no reference was taken. + */ +static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio, + struct cgroup_subsys_state *css) +{ + struct blkcg_gq *blkg, *ret_blkg = NULL; + + rcu_read_lock(); + blkg = blkg_lookup_create(css_to_blkcg(css), bio->bi_disk->queue); + while (blkg) { + if (blkg_tryget(blkg)) { + ret_blkg = blkg; + break; + } + blkg = blkg->parent; + } + rcu_read_unlock(); + + return ret_blkg; +} + +/** + * bio_associate_blkg_from_css - associate a bio with a specified css + * @bio: target bio + * @css: target css + * + * Associate @bio with the blkg found by combining the css's blkg and the + * request_queue of the @bio. An association failure is handled by walking up + * the blkg tree. Therefore, the blkg associated can be anything between @blkg + * and q->root_blkg. This situation only happens when a cgroup is dying and + * then the remaining bios will spill to the closest alive blkg. + * + * A reference will be taken on the blkg and will be released when @bio is + * freed. + */ +void bio_associate_blkg_from_css(struct bio *bio, + struct cgroup_subsys_state *css) +{ + if (bio->bi_blkg) + blkg_put(bio->bi_blkg); + + if (css && css->parent) { + bio->bi_blkg = blkg_tryget_closest(bio, css); + } else { + blkg_get(bio->bi_disk->queue->root_blkg); + bio->bi_blkg = bio->bi_disk->queue->root_blkg; + } +} +EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css); + +/** + * bio_associate_blkg - associate a bio with a blkg + * @bio: target bio + * + * Associate @bio with the blkg found from the bio's css and request_queue. + * If one is not found, bio_lookup_blkg() creates the blkg. If a blkg is + * already associated, the css is reused and association redone as the + * request_queue may have changed. + */ +void bio_associate_blkg(struct bio *bio) +{ + struct cgroup_subsys_state *css; + + rcu_read_lock(); + + if (bio->bi_blkg) + css = &bio_blkcg(bio)->css; + else + css = blkcg_css(); + + bio_associate_blkg_from_css(bio, css); + + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(bio_associate_blkg); + +/** + * bio_clone_blkg_association - clone blkg association from src to dst bio + * @dst: destination bio + * @src: source bio + */ +void bio_clone_blkg_association(struct bio *dst, struct bio *src) +{ + if (src->bi_blkg) + bio_associate_blkg_from_css(dst, &bio_blkcg(src)->css); +} +EXPORT_SYMBOL_GPL(bio_clone_blkg_association); + +static int blk_cgroup_io_type(struct bio *bio) +{ + if (op_is_discard(bio->bi_opf)) + return BLKG_IOSTAT_DISCARD; + if (op_is_write(bio->bi_opf)) + return BLKG_IOSTAT_WRITE; + return BLKG_IOSTAT_READ; +} + +void blk_cgroup_bio_start(struct bio *bio) +{ + int rwd = blk_cgroup_io_type(bio), cpu; + struct blkg_iostat_set *bis; + + cpu = get_cpu(); + bis = per_cpu_ptr(bio->bi_blkg->iostat_cpu, cpu); + u64_stats_update_begin(&bis->sync); + + /* + * If the bio is flagged with BIO_CGROUP_ACCT it means this is a split + * bio and we would have already accounted for the size of the bio. + */ + if (!bio_flagged(bio, BIO_CGROUP_ACCT)) { + bio_set_flag(bio, BIO_CGROUP_ACCT); + bis->cur.bytes[rwd] += bio->bi_iter.bi_size; + } + bis->cur.ios[rwd]++; + + u64_stats_update_end(&bis->sync); + if (cgroup_subsys_on_dfl(io_cgrp_subsys)) + cgroup_rstat_updated(bio->bi_blkg->blkcg->css.cgroup, cpu); + put_cpu(); +} + +static int __init blkcg_init(void) +{ + blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio", + WQ_MEM_RECLAIM | WQ_FREEZABLE | + WQ_UNBOUND | WQ_SYSFS, 0); + if (!blkcg_punt_bio_wq) + return -ENOMEM; + return 0; +} +subsys_initcall(blkcg_init); + +module_param(blkcg_debug_stats, bool, 0644); +MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not"); diff --git a/block/blk-core.c b/block/blk-core.c new file mode 100644 index 000000000..e5eeec801 --- /dev/null +++ b/block/blk-core.c @@ -0,0 +1,1816 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 1994, Karl Keyte: Added support for disk statistics + * Elevator latency, (C) 2000 Andrea Arcangeli SuSE + * Queue request tables / lock, selectable elevator, Jens Axboe + * kernel-doc documentation started by NeilBrown + * - July2000 + * bio rewrite, highmem i/o, etc, Jens Axboe - may 2001 + */ + +/* + * This handles all read/write requests to block devices + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CREATE_TRACE_POINTS +#include + +#include "blk.h" +#include "blk-mq.h" +#include "blk-mq-sched.h" +#include "blk-pm.h" +#include "blk-rq-qos.h" + +struct dentry *blk_debugfs_root; + +EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); +EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); +EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); +EXPORT_TRACEPOINT_SYMBOL_GPL(block_split); +EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug); + +DEFINE_IDA(blk_queue_ida); + +/* + * For queue allocation + */ +struct kmem_cache *blk_requestq_cachep; + +/* + * Controlling structure to kblockd + */ +static struct workqueue_struct *kblockd_workqueue; + +/** + * blk_queue_flag_set - atomically set a queue flag + * @flag: flag to be set + * @q: request queue + */ +void blk_queue_flag_set(unsigned int flag, struct request_queue *q) +{ + set_bit(flag, &q->queue_flags); +} +EXPORT_SYMBOL(blk_queue_flag_set); + +/** + * blk_queue_flag_clear - atomically clear a queue flag + * @flag: flag to be cleared + * @q: request queue + */ +void blk_queue_flag_clear(unsigned int flag, struct request_queue *q) +{ + clear_bit(flag, &q->queue_flags); +} +EXPORT_SYMBOL(blk_queue_flag_clear); + +/** + * blk_queue_flag_test_and_set - atomically test and set a queue flag + * @flag: flag to be set + * @q: request queue + * + * Returns the previous value of @flag - 0 if the flag was not set and 1 if + * the flag was already set. + */ +bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q) +{ + return test_and_set_bit(flag, &q->queue_flags); +} +EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_set); + +void blk_rq_init(struct request_queue *q, struct request *rq) +{ + memset(rq, 0, sizeof(*rq)); + + INIT_LIST_HEAD(&rq->queuelist); + rq->q = q; + rq->__sector = (sector_t) -1; + INIT_HLIST_NODE(&rq->hash); + RB_CLEAR_NODE(&rq->rb_node); + rq->tag = BLK_MQ_NO_TAG; + rq->internal_tag = BLK_MQ_NO_TAG; + rq->start_time_ns = ktime_get_ns(); + rq->part = NULL; + blk_crypto_rq_set_defaults(rq); +} +EXPORT_SYMBOL(blk_rq_init); + +#define REQ_OP_NAME(name) [REQ_OP_##name] = #name +static const char *const blk_op_name[] = { + REQ_OP_NAME(READ), + REQ_OP_NAME(WRITE), + REQ_OP_NAME(FLUSH), + REQ_OP_NAME(DISCARD), + REQ_OP_NAME(SECURE_ERASE), + REQ_OP_NAME(ZONE_RESET), + REQ_OP_NAME(ZONE_RESET_ALL), + REQ_OP_NAME(ZONE_OPEN), + REQ_OP_NAME(ZONE_CLOSE), + REQ_OP_NAME(ZONE_FINISH), + REQ_OP_NAME(ZONE_APPEND), + REQ_OP_NAME(WRITE_SAME), + REQ_OP_NAME(WRITE_ZEROES), + REQ_OP_NAME(SCSI_IN), + REQ_OP_NAME(SCSI_OUT), + REQ_OP_NAME(DRV_IN), + REQ_OP_NAME(DRV_OUT), +}; +#undef REQ_OP_NAME + +/** + * blk_op_str - Return string XXX in the REQ_OP_XXX. + * @op: REQ_OP_XXX. + * + * Description: Centralize block layer function to convert REQ_OP_XXX into + * string format. Useful in the debugging and tracing bio or request. For + * invalid REQ_OP_XXX it returns string "UNKNOWN". + */ +inline const char *blk_op_str(unsigned int op) +{ + const char *op_str = "UNKNOWN"; + + if (op < ARRAY_SIZE(blk_op_name) && blk_op_name[op]) + op_str = blk_op_name[op]; + + return op_str; +} +EXPORT_SYMBOL_GPL(blk_op_str); + +static const struct { + int errno; + const char *name; +} blk_errors[] = { + [BLK_STS_OK] = { 0, "" }, + [BLK_STS_NOTSUPP] = { -EOPNOTSUPP, "operation not supported" }, + [BLK_STS_TIMEOUT] = { -ETIMEDOUT, "timeout" }, + [BLK_STS_NOSPC] = { -ENOSPC, "critical space allocation" }, + [BLK_STS_TRANSPORT] = { -ENOLINK, "recoverable transport" }, + [BLK_STS_TARGET] = { -EREMOTEIO, "critical target" }, + [BLK_STS_NEXUS] = { -EBADE, "critical nexus" }, + [BLK_STS_MEDIUM] = { -ENODATA, "critical medium" }, + [BLK_STS_PROTECTION] = { -EILSEQ, "protection" }, + [BLK_STS_RESOURCE] = { -ENOMEM, "kernel resource" }, + [BLK_STS_DEV_RESOURCE] = { -EBUSY, "device resource" }, + [BLK_STS_AGAIN] = { -EAGAIN, "nonblocking retry" }, + + /* device mapper special case, should not leak out: */ + [BLK_STS_DM_REQUEUE] = { -EREMCHG, "dm internal retry" }, + + /* zone device specific errors */ + [BLK_STS_ZONE_OPEN_RESOURCE] = { -ETOOMANYREFS, "open zones exceeded" }, + [BLK_STS_ZONE_ACTIVE_RESOURCE] = { -EOVERFLOW, "active zones exceeded" }, + + /* everything else not covered above: */ + [BLK_STS_IOERR] = { -EIO, "I/O" }, +}; + +blk_status_t errno_to_blk_status(int errno) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(blk_errors); i++) { + if (blk_errors[i].errno == errno) + return (__force blk_status_t)i; + } + + return BLK_STS_IOERR; +} +EXPORT_SYMBOL_GPL(errno_to_blk_status); + +int blk_status_to_errno(blk_status_t status) +{ + int idx = (__force int)status; + + if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors))) + return -EIO; + return blk_errors[idx].errno; +} +EXPORT_SYMBOL_GPL(blk_status_to_errno); + +static void print_req_error(struct request *req, blk_status_t status, + const char *caller) +{ + int idx = (__force int)status; + + if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors))) + return; + + printk_ratelimited(KERN_ERR + "%s: %s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x " + "phys_seg %u prio class %u\n", + caller, blk_errors[idx].name, + req->rq_disk ? req->rq_disk->disk_name : "?", + blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)), + req->cmd_flags & ~REQ_OP_MASK, + req->nr_phys_segments, + IOPRIO_PRIO_CLASS(req->ioprio)); +} + +static void req_bio_endio(struct request *rq, struct bio *bio, + unsigned int nbytes, blk_status_t error) +{ + if (error) + bio->bi_status = error; + + if (unlikely(rq->rq_flags & RQF_QUIET)) + bio_set_flag(bio, BIO_QUIET); + + bio_advance(bio, nbytes); + + if (req_op(rq) == REQ_OP_ZONE_APPEND && error == BLK_STS_OK) { + /* + * Partial zone append completions cannot be supported as the + * BIO fragments may end up not being written sequentially. + */ + if (bio->bi_iter.bi_size) + bio->bi_status = BLK_STS_IOERR; + else + bio->bi_iter.bi_sector = rq->__sector; + } + + /* don't actually finish bio if it's part of flush sequence */ + if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ)) + bio_endio(bio); +} + +void blk_dump_rq_flags(struct request *rq, char *msg) +{ + printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg, + rq->rq_disk ? rq->rq_disk->disk_name : "?", + (unsigned long long) rq->cmd_flags); + + printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n", + (unsigned long long)blk_rq_pos(rq), + blk_rq_sectors(rq), blk_rq_cur_sectors(rq)); + printk(KERN_INFO " bio %p, biotail %p, len %u\n", + rq->bio, rq->biotail, blk_rq_bytes(rq)); +} +EXPORT_SYMBOL(blk_dump_rq_flags); + +/** + * blk_sync_queue - cancel any pending callbacks on a queue + * @q: the queue + * + * Description: + * The block layer may perform asynchronous callback activity + * on a queue, such as calling the unplug function after a timeout. + * A block device may call blk_sync_queue to ensure that any + * such activity is cancelled, thus allowing it to release resources + * that the callbacks might use. The caller must already have made sure + * that its ->submit_bio will not re-add plugging prior to calling + * this function. + * + * This function does not cancel any asynchronous activity arising + * out of elevator or throttling code. That would require elevator_exit() + * and blkcg_exit_queue() to be called with queue lock initialized. + * + */ +void blk_sync_queue(struct request_queue *q) +{ + del_timer_sync(&q->timeout); + cancel_work_sync(&q->timeout_work); +} +EXPORT_SYMBOL(blk_sync_queue); + +/** + * blk_set_pm_only - increment pm_only counter + * @q: request queue pointer + */ +void blk_set_pm_only(struct request_queue *q) +{ + atomic_inc(&q->pm_only); +} +EXPORT_SYMBOL_GPL(blk_set_pm_only); + +void blk_clear_pm_only(struct request_queue *q) +{ + int pm_only; + + pm_only = atomic_dec_return(&q->pm_only); + WARN_ON_ONCE(pm_only < 0); + if (pm_only == 0) + wake_up_all(&q->mq_freeze_wq); +} +EXPORT_SYMBOL_GPL(blk_clear_pm_only); + +/** + * blk_put_queue - decrement the request_queue refcount + * @q: the request_queue structure to decrement the refcount for + * + * Decrements the refcount of the request_queue kobject. When this reaches 0 + * we'll have blk_release_queue() called. + * + * Context: Any context, but the last reference must not be dropped from + * atomic context. + */ +void blk_put_queue(struct request_queue *q) +{ + kobject_put(&q->kobj); +} +EXPORT_SYMBOL(blk_put_queue); + +void blk_set_queue_dying(struct request_queue *q) +{ + blk_queue_flag_set(QUEUE_FLAG_DYING, q); + + /* + * When queue DYING flag is set, we need to block new req + * entering queue, so we call blk_freeze_queue_start() to + * prevent I/O from crossing blk_queue_enter(). + */ + blk_freeze_queue_start(q); + + if (queue_is_mq(q)) + blk_mq_wake_waiters(q); + + /* Make blk_queue_enter() reexamine the DYING flag. */ + wake_up_all(&q->mq_freeze_wq); +} +EXPORT_SYMBOL_GPL(blk_set_queue_dying); + +/** + * blk_cleanup_queue - shutdown a request queue + * @q: request queue to shutdown + * + * Mark @q DYING, drain all pending requests, mark @q DEAD, destroy and + * put it. All future requests will be failed immediately with -ENODEV. + * + * Context: can sleep + */ +void blk_cleanup_queue(struct request_queue *q) +{ + /* cannot be called from atomic context */ + might_sleep(); + + WARN_ON_ONCE(blk_queue_registered(q)); + + /* mark @q DYING, no new request or merges will be allowed afterwards */ + blk_set_queue_dying(q); + + blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q); + blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q); + + /* + * Drain all requests queued before DYING marking. Set DEAD flag to + * prevent that blk_mq_run_hw_queues() accesses the hardware queues + * after draining finished. + */ + blk_freeze_queue(q); + + rq_qos_exit(q); + + blk_queue_flag_set(QUEUE_FLAG_DEAD, q); + + /* for synchronous bio-based driver finish in-flight integrity i/o */ + blk_flush_integrity(); + + /* @q won't process any more request, flush async actions */ + del_timer_sync(&q->backing_dev_info->laptop_mode_wb_timer); + blk_sync_queue(q); + + if (queue_is_mq(q)) + blk_mq_exit_queue(q); + + /* + * In theory, request pool of sched_tags belongs to request queue. + * However, the current implementation requires tag_set for freeing + * requests, so free the pool now. + * + * Queue has become frozen, there can't be any in-queue requests, so + * it is safe to free requests now. + */ + mutex_lock(&q->sysfs_lock); + if (q->elevator) + blk_mq_sched_free_requests(q); + mutex_unlock(&q->sysfs_lock); + + /* @q is and will stay empty, shutdown and put */ + blk_put_queue(q); +} +EXPORT_SYMBOL(blk_cleanup_queue); + +/** + * blk_queue_enter() - try to increase q->q_usage_counter + * @q: request queue pointer + * @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PM + */ +int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags) +{ + const bool pm = flags & BLK_MQ_REQ_PM; + + while (true) { + bool success = false; + + rcu_read_lock(); + if (percpu_ref_tryget_live(&q->q_usage_counter)) { + /* + * The code that increments the pm_only counter is + * responsible for ensuring that that counter is + * globally visible before the queue is unfrozen. + */ + if ((pm && queue_rpm_status(q) != RPM_SUSPENDED) || + !blk_queue_pm_only(q)) { + success = true; + } else { + percpu_ref_put(&q->q_usage_counter); + } + } + rcu_read_unlock(); + + if (success) + return 0; + + if (flags & BLK_MQ_REQ_NOWAIT) + return -EBUSY; + + /* + * read pair of barrier in blk_freeze_queue_start(), + * we need to order reading __PERCPU_REF_DEAD flag of + * .q_usage_counter and reading .mq_freeze_depth or + * queue dying flag, otherwise the following wait may + * never return if the two reads are reordered. + */ + smp_rmb(); + + wait_event(q->mq_freeze_wq, + (!q->mq_freeze_depth && + blk_pm_resume_queue(pm, q)) || + blk_queue_dying(q)); + if (blk_queue_dying(q)) + return -ENODEV; + } +} + +static inline int bio_queue_enter(struct bio *bio) +{ + struct request_queue *q = bio->bi_disk->queue; + bool nowait = bio->bi_opf & REQ_NOWAIT; + int ret; + + ret = blk_queue_enter(q, nowait ? BLK_MQ_REQ_NOWAIT : 0); + if (unlikely(ret)) { + if (nowait && !blk_queue_dying(q)) + bio_wouldblock_error(bio); + else + bio_io_error(bio); + } + + return ret; +} + +void blk_queue_exit(struct request_queue *q) +{ + percpu_ref_put(&q->q_usage_counter); +} + +static void blk_queue_usage_counter_release(struct percpu_ref *ref) +{ + struct request_queue *q = + container_of(ref, struct request_queue, q_usage_counter); + + wake_up_all(&q->mq_freeze_wq); +} + +static void blk_rq_timed_out_timer(struct timer_list *t) +{ + struct request_queue *q = from_timer(q, t, timeout); + + kblockd_schedule_work(&q->timeout_work); +} + +static void blk_timeout_work(struct work_struct *work) +{ +} + +struct request_queue *blk_alloc_queue(int node_id) +{ + struct request_queue *q; + int ret; + + q = kmem_cache_alloc_node(blk_requestq_cachep, + GFP_KERNEL | __GFP_ZERO, node_id); + if (!q) + return NULL; + + q->last_merge = NULL; + + q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL); + if (q->id < 0) + goto fail_q; + + ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); + if (ret) + goto fail_id; + + q->backing_dev_info = bdi_alloc(node_id); + if (!q->backing_dev_info) + goto fail_split; + + q->stats = blk_alloc_queue_stats(); + if (!q->stats) + goto fail_stats; + + q->node = node_id; + + atomic_set(&q->nr_active_requests_shared_sbitmap, 0); + + timer_setup(&q->backing_dev_info->laptop_mode_wb_timer, + laptop_mode_timer_fn, 0); + timer_setup(&q->timeout, blk_rq_timed_out_timer, 0); + INIT_WORK(&q->timeout_work, blk_timeout_work); + INIT_LIST_HEAD(&q->icq_list); +#ifdef CONFIG_BLK_CGROUP + INIT_LIST_HEAD(&q->blkg_list); +#endif + + kobject_init(&q->kobj, &blk_queue_ktype); + + mutex_init(&q->debugfs_mutex); + mutex_init(&q->sysfs_lock); + mutex_init(&q->sysfs_dir_lock); + spin_lock_init(&q->queue_lock); + + init_waitqueue_head(&q->mq_freeze_wq); + mutex_init(&q->mq_freeze_lock); + + /* + * Init percpu_ref in atomic mode so that it's faster to shutdown. + * See blk_register_queue() for details. + */ + if (percpu_ref_init(&q->q_usage_counter, + blk_queue_usage_counter_release, + PERCPU_REF_INIT_ATOMIC, GFP_KERNEL)) + goto fail_bdi; + + if (blkcg_init_queue(q)) + goto fail_ref; + + blk_queue_dma_alignment(q, 511); + blk_set_default_limits(&q->limits); + q->nr_requests = BLKDEV_MAX_RQ; + + return q; + +fail_ref: + percpu_ref_exit(&q->q_usage_counter); +fail_bdi: + blk_free_queue_stats(q->stats); +fail_stats: + bdi_put(q->backing_dev_info); +fail_split: + bioset_exit(&q->bio_split); +fail_id: + ida_simple_remove(&blk_queue_ida, q->id); +fail_q: + kmem_cache_free(blk_requestq_cachep, q); + return NULL; +} +EXPORT_SYMBOL(blk_alloc_queue); + +/** + * blk_get_queue - increment the request_queue refcount + * @q: the request_queue structure to increment the refcount for + * + * Increment the refcount of the request_queue kobject. + * + * Context: Any context. + */ +bool blk_get_queue(struct request_queue *q) +{ + if (likely(!blk_queue_dying(q))) { + __blk_get_queue(q); + return true; + } + + return false; +} +EXPORT_SYMBOL(blk_get_queue); + +/** + * blk_get_request - allocate a request + * @q: request queue to allocate a request for + * @op: operation (REQ_OP_*) and REQ_* flags, e.g. REQ_SYNC. + * @flags: BLK_MQ_REQ_* flags, e.g. BLK_MQ_REQ_NOWAIT. + */ +struct request *blk_get_request(struct request_queue *q, unsigned int op, + blk_mq_req_flags_t flags) +{ + struct request *req; + + WARN_ON_ONCE(op & REQ_NOWAIT); + WARN_ON_ONCE(flags & ~(BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PM)); + + req = blk_mq_alloc_request(q, op, flags); + if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn) + q->mq_ops->initialize_rq_fn(req); + + return req; +} +EXPORT_SYMBOL(blk_get_request); + +void blk_put_request(struct request *req) +{ + blk_mq_free_request(req); +} +EXPORT_SYMBOL(blk_put_request); + +static void handle_bad_sector(struct bio *bio, sector_t maxsector) +{ + char b[BDEVNAME_SIZE]; + + pr_info_ratelimited("attempt to access beyond end of device\n" + "%s: rw=%d, want=%llu, limit=%llu\n", + bio_devname(bio, b), bio->bi_opf, + bio_end_sector(bio), maxsector); +} + +#ifdef CONFIG_FAIL_MAKE_REQUEST + +static DECLARE_FAULT_ATTR(fail_make_request); + +static int __init setup_fail_make_request(char *str) +{ + return setup_fault_attr(&fail_make_request, str); +} +__setup("fail_make_request=", setup_fail_make_request); + +static bool should_fail_request(struct hd_struct *part, unsigned int bytes) +{ + return part->make_it_fail && should_fail(&fail_make_request, bytes); +} + +static int __init fail_make_request_debugfs(void) +{ + struct dentry *dir = fault_create_debugfs_attr("fail_make_request", + NULL, &fail_make_request); + + return PTR_ERR_OR_ZERO(dir); +} + +late_initcall(fail_make_request_debugfs); + +#else /* CONFIG_FAIL_MAKE_REQUEST */ + +static inline bool should_fail_request(struct hd_struct *part, + unsigned int bytes) +{ + return false; +} + +#endif /* CONFIG_FAIL_MAKE_REQUEST */ + +static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part) +{ + const int op = bio_op(bio); + + if (part->policy && op_is_write(op)) { + char b[BDEVNAME_SIZE]; + + if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) + return false; + pr_warn("Trying to write to read-only block-device %s (partno %d)\n", + bio_devname(bio, b), part->partno); + /* Older lvm-tools actually trigger this */ + return false; + } + + return false; +} + +static noinline int should_fail_bio(struct bio *bio) +{ + if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size)) + return -EIO; + return 0; +} +ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO); + +/* + * Check whether this bio extends beyond the end of the device or partition. + * This may well happen - the kernel calls bread() without checking the size of + * the device, e.g., when mounting a file system. + */ +static inline int bio_check_eod(struct bio *bio, sector_t maxsector) +{ + unsigned int nr_sectors = bio_sectors(bio); + + if (nr_sectors && maxsector && + (nr_sectors > maxsector || + bio->bi_iter.bi_sector > maxsector - nr_sectors)) { + handle_bad_sector(bio, maxsector); + return -EIO; + } + return 0; +} + +/* + * Remap block n of partition p to block n+start(p) of the disk. + */ +static inline int blk_partition_remap(struct bio *bio) +{ + struct hd_struct *p; + int ret = -EIO; + + rcu_read_lock(); + p = __disk_get_part(bio->bi_disk, bio->bi_partno); + if (unlikely(!p)) + goto out; + if (unlikely(should_fail_request(p, bio->bi_iter.bi_size))) + goto out; + if (unlikely(bio_check_ro(bio, p))) + goto out; + + if (bio_sectors(bio)) { + if (bio_check_eod(bio, part_nr_sects_read(p))) + goto out; + bio->bi_iter.bi_sector += p->start_sect; + trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p), + bio->bi_iter.bi_sector - p->start_sect); + } + bio->bi_partno = 0; + ret = 0; +out: + rcu_read_unlock(); + return ret; +} + +/* + * Check write append to a zoned block device. + */ +static inline blk_status_t blk_check_zone_append(struct request_queue *q, + struct bio *bio) +{ + sector_t pos = bio->bi_iter.bi_sector; + int nr_sectors = bio_sectors(bio); + + /* Only applicable to zoned block devices */ + if (!blk_queue_is_zoned(q)) + return BLK_STS_NOTSUPP; + + /* The bio sector must point to the start of a sequential zone */ + if (pos & (blk_queue_zone_sectors(q) - 1) || + !blk_queue_zone_is_seq(q, pos)) + return BLK_STS_IOERR; + + /* + * Not allowed to cross zone boundaries. Otherwise, the BIO will be + * split and could result in non-contiguous sectors being written in + * different zones. + */ + if (nr_sectors > q->limits.chunk_sectors) + return BLK_STS_IOERR; + + /* Make sure the BIO is small enough and will not get split */ + if (nr_sectors > q->limits.max_zone_append_sectors) + return BLK_STS_IOERR; + + bio->bi_opf |= REQ_NOMERGE; + + return BLK_STS_OK; +} + +static noinline_for_stack bool submit_bio_checks(struct bio *bio) +{ + struct request_queue *q = bio->bi_disk->queue; + blk_status_t status = BLK_STS_IOERR; + struct blk_plug *plug; + + might_sleep(); + + plug = blk_mq_plug(q, bio); + if (plug && plug->nowait) + bio->bi_opf |= REQ_NOWAIT; + + /* + * For a REQ_NOWAIT based request, return -EOPNOTSUPP + * if queue does not support NOWAIT. + */ + if ((bio->bi_opf & REQ_NOWAIT) && !blk_queue_nowait(q)) + goto not_supported; + + if (should_fail_bio(bio)) + goto end_io; + + if (bio->bi_partno) { + if (unlikely(blk_partition_remap(bio))) + goto end_io; + } else { + if (unlikely(bio_check_ro(bio, &bio->bi_disk->part0))) + goto end_io; + if (unlikely(bio_check_eod(bio, get_capacity(bio->bi_disk)))) + goto end_io; + } + + /* + * Filter flush bio's early so that bio based drivers without flush + * support don't have to worry about them. + */ + if (op_is_flush(bio->bi_opf) && + !test_bit(QUEUE_FLAG_WC, &q->queue_flags)) { + bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA); + if (!bio_sectors(bio)) { + status = BLK_STS_OK; + goto end_io; + } + } + + if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) + bio->bi_opf &= ~REQ_HIPRI; + + switch (bio_op(bio)) { + case REQ_OP_DISCARD: + if (!blk_queue_discard(q)) + goto not_supported; + break; + case REQ_OP_SECURE_ERASE: + if (!blk_queue_secure_erase(q)) + goto not_supported; + break; + case REQ_OP_WRITE_SAME: + if (!q->limits.max_write_same_sectors) + goto not_supported; + break; + case REQ_OP_ZONE_APPEND: + status = blk_check_zone_append(q, bio); + if (status != BLK_STS_OK) + goto end_io; + break; + case REQ_OP_ZONE_RESET: + case REQ_OP_ZONE_OPEN: + case REQ_OP_ZONE_CLOSE: + case REQ_OP_ZONE_FINISH: + if (!blk_queue_is_zoned(q)) + goto not_supported; + break; + case REQ_OP_ZONE_RESET_ALL: + if (!blk_queue_is_zoned(q) || !blk_queue_zone_resetall(q)) + goto not_supported; + break; + case REQ_OP_WRITE_ZEROES: + if (!q->limits.max_write_zeroes_sectors) + goto not_supported; + break; + default: + break; + } + + /* + * Various block parts want %current->io_context, so allocate it up + * front rather than dealing with lots of pain to allocate it only + * where needed. This may fail and the block layer knows how to live + * with it. + */ + if (unlikely(!current->io_context)) + create_task_io_context(current, GFP_ATOMIC, q->node); + + if (blk_throtl_bio(bio)) + return false; + + blk_cgroup_bio_start(bio); + blkcg_bio_issue_init(bio); + + if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) { + trace_block_bio_queue(q, bio); + /* Now that enqueuing has been traced, we need to trace + * completion as well. + */ + bio_set_flag(bio, BIO_TRACE_COMPLETION); + } + return true; + +not_supported: + status = BLK_STS_NOTSUPP; +end_io: + bio->bi_status = status; + bio_endio(bio); + return false; +} + +static blk_qc_t __submit_bio(struct bio *bio) +{ + struct gendisk *disk = bio->bi_disk; + blk_qc_t ret = BLK_QC_T_NONE; + + if (blk_crypto_bio_prep(&bio)) { + if (!disk->fops->submit_bio) + return blk_mq_submit_bio(bio); + ret = disk->fops->submit_bio(bio); + } + blk_queue_exit(disk->queue); + return ret; +} + +/* + * The loop in this function may be a bit non-obvious, and so deserves some + * explanation: + * + * - Before entering the loop, bio->bi_next is NULL (as all callers ensure + * that), so we have a list with a single bio. + * - We pretend that we have just taken it off a longer list, so we assign + * bio_list to a pointer to the bio_list_on_stack, thus initialising the + * bio_list of new bios to be added. ->submit_bio() may indeed add some more + * bios through a recursive call to submit_bio_noacct. If it did, we find a + * non-NULL value in bio_list and re-enter the loop from the top. + * - In this case we really did just take the bio of the top of the list (no + * pretending) and so remove it from bio_list, and call into ->submit_bio() + * again. + * + * bio_list_on_stack[0] contains bios submitted by the current ->submit_bio. + * bio_list_on_stack[1] contains bios that were submitted before the current + * ->submit_bio_bio, but that haven't been processed yet. + */ +static blk_qc_t __submit_bio_noacct(struct bio *bio) +{ + struct bio_list bio_list_on_stack[2]; + blk_qc_t ret = BLK_QC_T_NONE; + + BUG_ON(bio->bi_next); + + bio_list_init(&bio_list_on_stack[0]); + current->bio_list = bio_list_on_stack; + + do { + struct request_queue *q = bio->bi_disk->queue; + struct bio_list lower, same; + + if (unlikely(bio_queue_enter(bio) != 0)) + continue; + + /* + * Create a fresh bio_list for all subordinate requests. + */ + bio_list_on_stack[1] = bio_list_on_stack[0]; + bio_list_init(&bio_list_on_stack[0]); + + ret = __submit_bio(bio); + + /* + * Sort new bios into those for a lower level and those for the + * same level. + */ + bio_list_init(&lower); + bio_list_init(&same); + while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL) + if (q == bio->bi_disk->queue) + bio_list_add(&same, bio); + else + bio_list_add(&lower, bio); + + /* + * Now assemble so we handle the lowest level first. + */ + bio_list_merge(&bio_list_on_stack[0], &lower); + bio_list_merge(&bio_list_on_stack[0], &same); + bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]); + } while ((bio = bio_list_pop(&bio_list_on_stack[0]))); + + current->bio_list = NULL; + return ret; +} + +static blk_qc_t __submit_bio_noacct_mq(struct bio *bio) +{ + struct bio_list bio_list[2] = { }; + blk_qc_t ret = BLK_QC_T_NONE; + + current->bio_list = bio_list; + + do { + struct gendisk *disk = bio->bi_disk; + + if (unlikely(bio_queue_enter(bio) != 0)) + continue; + + if (!blk_crypto_bio_prep(&bio)) { + blk_queue_exit(disk->queue); + ret = BLK_QC_T_NONE; + continue; + } + + ret = blk_mq_submit_bio(bio); + } while ((bio = bio_list_pop(&bio_list[0]))); + + current->bio_list = NULL; + return ret; +} + +/** + * submit_bio_noacct - re-submit a bio to the block device layer for I/O + * @bio: The bio describing the location in memory and on the device. + * + * This is a version of submit_bio() that shall only be used for I/O that is + * resubmitted to lower level drivers by stacking block drivers. All file + * systems and other upper level users of the block layer should use + * submit_bio() instead. + */ +blk_qc_t submit_bio_noacct(struct bio *bio) +{ + if (!submit_bio_checks(bio)) + return BLK_QC_T_NONE; + + /* + * We only want one ->submit_bio to be active at a time, else stack + * usage with stacked devices could be a problem. Use current->bio_list + * to collect a list of requests submited by a ->submit_bio method while + * it is active, and then process them after it returned. + */ + if (current->bio_list) { + bio_list_add(¤t->bio_list[0], bio); + return BLK_QC_T_NONE; + } + + if (!bio->bi_disk->fops->submit_bio) + return __submit_bio_noacct_mq(bio); + return __submit_bio_noacct(bio); +} +EXPORT_SYMBOL(submit_bio_noacct); + +/** + * submit_bio - submit a bio to the block device layer for I/O + * @bio: The &struct bio which describes the I/O + * + * submit_bio() is used to submit I/O requests to block devices. It is passed a + * fully set up &struct bio that describes the I/O that needs to be done. The + * bio will be send to the device described by the bi_disk and bi_partno fields. + * + * The success/failure status of the request, along with notification of + * completion, is delivered asynchronously through the ->bi_end_io() callback + * in @bio. The bio must NOT be touched by thecaller until ->bi_end_io() has + * been called. + */ +blk_qc_t submit_bio(struct bio *bio) +{ + if (blkcg_punt_bio_submit(bio)) + return BLK_QC_T_NONE; + + /* + * If it's a regular read/write or a barrier with data attached, + * go through the normal accounting stuff before submission. + */ + if (bio_has_data(bio)) { + unsigned int count; + + if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME)) + count = queue_logical_block_size(bio->bi_disk->queue) >> 9; + else + count = bio_sectors(bio); + + if (op_is_write(bio_op(bio))) { + count_vm_events(PGPGOUT, count); + } else { + task_io_account_read(bio->bi_iter.bi_size); + count_vm_events(PGPGIN, count); + } + + if (unlikely(block_dump)) { + char b[BDEVNAME_SIZE]; + printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n", + current->comm, task_pid_nr(current), + op_is_write(bio_op(bio)) ? "WRITE" : "READ", + (unsigned long long)bio->bi_iter.bi_sector, + bio_devname(bio, b), count); + } + } + + /* + * If we're reading data that is part of the userspace workingset, count + * submission time as memory stall. When the device is congested, or + * the submitting cgroup IO-throttled, submission can be a significant + * part of overall IO time. + */ + if (unlikely(bio_op(bio) == REQ_OP_READ && + bio_flagged(bio, BIO_WORKINGSET))) { + unsigned long pflags; + blk_qc_t ret; + + psi_memstall_enter(&pflags); + ret = submit_bio_noacct(bio); + psi_memstall_leave(&pflags); + + return ret; + } + + return submit_bio_noacct(bio); +} +EXPORT_SYMBOL(submit_bio); + +/** + * blk_cloned_rq_check_limits - Helper function to check a cloned request + * for the new queue limits + * @q: the queue + * @rq: the request being checked + * + * Description: + * @rq may have been made based on weaker limitations of upper-level queues + * in request stacking drivers, and it may violate the limitation of @q. + * Since the block layer and the underlying device driver trust @rq + * after it is inserted to @q, it should be checked against @q before + * the insertion using this generic function. + * + * Request stacking drivers like request-based dm may change the queue + * limits when retrying requests on other queues. Those requests need + * to be checked against the new queue limits again during dispatch. + */ +static blk_status_t blk_cloned_rq_check_limits(struct request_queue *q, + struct request *rq) +{ + unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq)); + + if (blk_rq_sectors(rq) > max_sectors) { + /* + * SCSI device does not have a good way to return if + * Write Same/Zero is actually supported. If a device rejects + * a non-read/write command (discard, write same,etc.) the + * low-level device driver will set the relevant queue limit to + * 0 to prevent blk-lib from issuing more of the offending + * operations. Commands queued prior to the queue limit being + * reset need to be completed with BLK_STS_NOTSUPP to avoid I/O + * errors being propagated to upper layers. + */ + if (max_sectors == 0) + return BLK_STS_NOTSUPP; + + printk(KERN_ERR "%s: over max size limit. (%u > %u)\n", + __func__, blk_rq_sectors(rq), max_sectors); + return BLK_STS_IOERR; + } + + /* + * queue's settings related to segment counting like q->bounce_pfn + * may differ from that of other stacking queues. + * Recalculate it to check the request correctly on this queue's + * limitation. + */ + rq->nr_phys_segments = blk_recalc_rq_segments(rq); + if (rq->nr_phys_segments > queue_max_segments(q)) { + printk(KERN_ERR "%s: over max segments limit. (%hu > %hu)\n", + __func__, rq->nr_phys_segments, queue_max_segments(q)); + return BLK_STS_IOERR; + } + + return BLK_STS_OK; +} + +/** + * blk_insert_cloned_request - Helper for stacking drivers to submit a request + * @q: the queue to submit the request + * @rq: the request being queued + */ +blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq) +{ + blk_status_t ret; + + ret = blk_cloned_rq_check_limits(q, rq); + if (ret != BLK_STS_OK) + return ret; + + if (rq->rq_disk && + should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq))) + return BLK_STS_IOERR; + + if (blk_crypto_insert_cloned_request(rq)) + return BLK_STS_IOERR; + + if (blk_queue_io_stat(q)) + blk_account_io_start(rq); + + /* + * Since we have a scheduler attached on the top device, + * bypass a potential scheduler on the bottom device for + * insert. + */ + return blk_mq_request_issue_directly(rq, true); +} +EXPORT_SYMBOL_GPL(blk_insert_cloned_request); + +/** + * blk_rq_err_bytes - determine number of bytes till the next failure boundary + * @rq: request to examine + * + * Description: + * A request could be merge of IOs which require different failure + * handling. This function determines the number of bytes which + * can be failed from the beginning of the request without + * crossing into area which need to be retried further. + * + * Return: + * The number of bytes to fail. + */ +unsigned int blk_rq_err_bytes(const struct request *rq) +{ + unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK; + unsigned int bytes = 0; + struct bio *bio; + + if (!(rq->rq_flags & RQF_MIXED_MERGE)) + return blk_rq_bytes(rq); + + /* + * Currently the only 'mixing' which can happen is between + * different fastfail types. We can safely fail portions + * which have all the failfast bits that the first one has - + * the ones which are at least as eager to fail as the first + * one. + */ + for (bio = rq->bio; bio; bio = bio->bi_next) { + if ((bio->bi_opf & ff) != ff) + break; + bytes += bio->bi_iter.bi_size; + } + + /* this could lead to infinite loop */ + BUG_ON(blk_rq_bytes(rq) && !bytes); + return bytes; +} +EXPORT_SYMBOL_GPL(blk_rq_err_bytes); + +static void update_io_ticks(struct hd_struct *part, unsigned long now, bool end) +{ + unsigned long stamp; +again: + stamp = READ_ONCE(part->stamp); + if (unlikely(stamp != now)) { + if (likely(cmpxchg(&part->stamp, stamp, now) == stamp)) + __part_stat_add(part, io_ticks, end ? now - stamp : 1); + } + if (part->partno) { + part = &part_to_disk(part)->part0; + goto again; + } +} + +static void blk_account_io_completion(struct request *req, unsigned int bytes) +{ + if (req->part && blk_do_io_stat(req)) { + const int sgrp = op_stat_group(req_op(req)); + struct hd_struct *part; + + part_stat_lock(); + part = req->part; + part_stat_add(part, sectors[sgrp], bytes >> 9); + part_stat_unlock(); + } +} + +void blk_account_io_done(struct request *req, u64 now) +{ + /* + * Account IO completion. flush_rq isn't accounted as a + * normal IO on queueing nor completion. Accounting the + * containing request is enough. + */ + if (req->part && blk_do_io_stat(req) && + !(req->rq_flags & RQF_FLUSH_SEQ)) { + const int sgrp = op_stat_group(req_op(req)); + struct hd_struct *part; + + part_stat_lock(); + part = req->part; + + update_io_ticks(part, jiffies, true); + part_stat_inc(part, ios[sgrp]); + part_stat_add(part, nsecs[sgrp], now - req->start_time_ns); + part_stat_unlock(); + + hd_struct_put(part); + } +} + +void blk_account_io_start(struct request *rq) +{ + if (!blk_do_io_stat(rq)) + return; + + rq->part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); + + part_stat_lock(); + update_io_ticks(rq->part, jiffies, false); + part_stat_unlock(); +} + +static unsigned long __part_start_io_acct(struct hd_struct *part, + unsigned int sectors, unsigned int op) +{ + const int sgrp = op_stat_group(op); + unsigned long now = READ_ONCE(jiffies); + + part_stat_lock(); + update_io_ticks(part, now, false); + part_stat_inc(part, ios[sgrp]); + part_stat_add(part, sectors[sgrp], sectors); + part_stat_local_inc(part, in_flight[op_is_write(op)]); + part_stat_unlock(); + + return now; +} + +unsigned long part_start_io_acct(struct gendisk *disk, struct hd_struct **part, + struct bio *bio) +{ + *part = disk_map_sector_rcu(disk, bio->bi_iter.bi_sector); + + return __part_start_io_acct(*part, bio_sectors(bio), bio_op(bio)); +} +EXPORT_SYMBOL_GPL(part_start_io_acct); + +unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors, + unsigned int op) +{ + return __part_start_io_acct(&disk->part0, sectors, op); +} +EXPORT_SYMBOL(disk_start_io_acct); + +static void __part_end_io_acct(struct hd_struct *part, unsigned int op, + unsigned long start_time) +{ + const int sgrp = op_stat_group(op); + unsigned long now = READ_ONCE(jiffies); + unsigned long duration = now - start_time; + + part_stat_lock(); + update_io_ticks(part, now, true); + part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration)); + part_stat_local_dec(part, in_flight[op_is_write(op)]); + part_stat_unlock(); +} + +void part_end_io_acct(struct hd_struct *part, struct bio *bio, + unsigned long start_time) +{ + __part_end_io_acct(part, bio_op(bio), start_time); + hd_struct_put(part); +} +EXPORT_SYMBOL_GPL(part_end_io_acct); + +void disk_end_io_acct(struct gendisk *disk, unsigned int op, + unsigned long start_time) +{ + __part_end_io_acct(&disk->part0, op, start_time); +} +EXPORT_SYMBOL(disk_end_io_acct); + +/* + * Steal bios from a request and add them to a bio list. + * The request must not have been partially completed before. + */ +void blk_steal_bios(struct bio_list *list, struct request *rq) +{ + if (rq->bio) { + if (list->tail) + list->tail->bi_next = rq->bio; + else + list->head = rq->bio; + list->tail = rq->biotail; + + rq->bio = NULL; + rq->biotail = NULL; + } + + rq->__data_len = 0; +} +EXPORT_SYMBOL_GPL(blk_steal_bios); + +/** + * blk_update_request - Special helper function for request stacking drivers + * @req: the request being processed + * @error: block status code + * @nr_bytes: number of bytes to complete @req + * + * Description: + * Ends I/O on a number of bytes attached to @req, but doesn't complete + * the request structure even if @req doesn't have leftover. + * If @req has leftover, sets it up for the next range of segments. + * + * This special helper function is only for request stacking drivers + * (e.g. request-based dm) so that they can handle partial completion. + * Actual device drivers should use blk_mq_end_request instead. + * + * Passing the result of blk_rq_bytes() as @nr_bytes guarantees + * %false return from this function. + * + * Note: + * The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in both + * blk_rq_bytes() and in blk_update_request(). + * + * Return: + * %false - this request doesn't have any more data + * %true - this request has more data + **/ +bool blk_update_request(struct request *req, blk_status_t error, + unsigned int nr_bytes) +{ + int total_bytes; + + trace_block_rq_complete(req, blk_status_to_errno(error), nr_bytes); + + if (!req->bio) + return false; + +#ifdef CONFIG_BLK_DEV_INTEGRITY + if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ && + error == BLK_STS_OK) + req->q->integrity.profile->complete_fn(req, nr_bytes); +#endif + + /* + * Upper layers may call blk_crypto_evict_key() anytime after the last + * bio_endio(). Therefore, the keyslot must be released before that. + */ + if (blk_crypto_rq_has_keyslot(req) && nr_bytes >= blk_rq_bytes(req)) + __blk_crypto_rq_put_keyslot(req); + + if (unlikely(error && !blk_rq_is_passthrough(req) && + !(req->rq_flags & RQF_QUIET))) + print_req_error(req, error, __func__); + + blk_account_io_completion(req, nr_bytes); + + total_bytes = 0; + while (req->bio) { + struct bio *bio = req->bio; + unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes); + + if (bio_bytes == bio->bi_iter.bi_size) + req->bio = bio->bi_next; + + /* Completion has already been traced */ + bio_clear_flag(bio, BIO_TRACE_COMPLETION); + req_bio_endio(req, bio, bio_bytes, error); + + total_bytes += bio_bytes; + nr_bytes -= bio_bytes; + + if (!nr_bytes) + break; + } + + /* + * completely done + */ + if (!req->bio) { + /* + * Reset counters so that the request stacking driver + * can find how many bytes remain in the request + * later. + */ + req->__data_len = 0; + return false; + } + + req->__data_len -= total_bytes; + + /* update sector only for requests with clear definition of sector */ + if (!blk_rq_is_passthrough(req)) + req->__sector += total_bytes >> 9; + + /* mixed attributes always follow the first bio */ + if (req->rq_flags & RQF_MIXED_MERGE) { + req->cmd_flags &= ~REQ_FAILFAST_MASK; + req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK; + } + + if (!(req->rq_flags & RQF_SPECIAL_PAYLOAD)) { + /* + * If total number of sectors is less than the first segment + * size, something has gone terribly wrong. + */ + if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) { + blk_dump_rq_flags(req, "request botched"); + req->__data_len = blk_rq_cur_bytes(req); + } + + /* recalculate the number of segments */ + req->nr_phys_segments = blk_recalc_rq_segments(req); + } + + return true; +} +EXPORT_SYMBOL_GPL(blk_update_request); + +#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE +/** + * rq_flush_dcache_pages - Helper function to flush all pages in a request + * @rq: the request to be flushed + * + * Description: + * Flush all pages in @rq. + */ +void rq_flush_dcache_pages(struct request *rq) +{ + struct req_iterator iter; + struct bio_vec bvec; + + rq_for_each_segment(bvec, rq, iter) + flush_dcache_page(bvec.bv_page); +} +EXPORT_SYMBOL_GPL(rq_flush_dcache_pages); +#endif + +/** + * blk_lld_busy - Check if underlying low-level drivers of a device are busy + * @q : the queue of the device being checked + * + * Description: + * Check if underlying low-level drivers of a device are busy. + * If the drivers want to export their busy state, they must set own + * exporting function using blk_queue_lld_busy() first. + * + * Basically, this function is used only by request stacking drivers + * to stop dispatching requests to underlying devices when underlying + * devices are busy. This behavior helps more I/O merging on the queue + * of the request stacking driver and prevents I/O throughput regression + * on burst I/O load. + * + * Return: + * 0 - Not busy (The request stacking driver should dispatch request) + * 1 - Busy (The request stacking driver should stop dispatching request) + */ +int blk_lld_busy(struct request_queue *q) +{ + if (queue_is_mq(q) && q->mq_ops->busy) + return q->mq_ops->busy(q); + + return 0; +} +EXPORT_SYMBOL_GPL(blk_lld_busy); + +/** + * blk_rq_unprep_clone - Helper function to free all bios in a cloned request + * @rq: the clone request to be cleaned up + * + * Description: + * Free all bios in @rq for a cloned request. + */ +void blk_rq_unprep_clone(struct request *rq) +{ + struct bio *bio; + + while ((bio = rq->bio) != NULL) { + rq->bio = bio->bi_next; + + bio_put(bio); + } +} +EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); + +/** + * blk_rq_prep_clone - Helper function to setup clone request + * @rq: the request to be setup + * @rq_src: original request to be cloned + * @bs: bio_set that bios for clone are allocated from + * @gfp_mask: memory allocation mask for bio + * @bio_ctr: setup function to be called for each clone bio. + * Returns %0 for success, non %0 for failure. + * @data: private data to be passed to @bio_ctr + * + * Description: + * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq. + * Also, pages which the original bios are pointing to are not copied + * and the cloned bios just point same pages. + * So cloned bios must be completed before original bios, which means + * the caller must complete @rq before @rq_src. + */ +int blk_rq_prep_clone(struct request *rq, struct request *rq_src, + struct bio_set *bs, gfp_t gfp_mask, + int (*bio_ctr)(struct bio *, struct bio *, void *), + void *data) +{ + struct bio *bio, *bio_src; + + if (!bs) + bs = &fs_bio_set; + + __rq_for_each_bio(bio_src, rq_src) { + bio = bio_clone_fast(bio_src, gfp_mask, bs); + if (!bio) + goto free_and_out; + + if (bio_ctr && bio_ctr(bio, bio_src, data)) + goto free_and_out; + + if (rq->bio) { + rq->biotail->bi_next = bio; + rq->biotail = bio; + } else { + rq->bio = rq->biotail = bio; + } + bio = NULL; + } + + /* Copy attributes of the original request to the clone request. */ + rq->__sector = blk_rq_pos(rq_src); + rq->__data_len = blk_rq_bytes(rq_src); + if (rq_src->rq_flags & RQF_SPECIAL_PAYLOAD) { + rq->rq_flags |= RQF_SPECIAL_PAYLOAD; + rq->special_vec = rq_src->special_vec; + } + rq->nr_phys_segments = rq_src->nr_phys_segments; + rq->ioprio = rq_src->ioprio; + + if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0) + goto free_and_out; + + return 0; + +free_and_out: + if (bio) + bio_put(bio); + blk_rq_unprep_clone(rq); + + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(blk_rq_prep_clone); + +int kblockd_schedule_work(struct work_struct *work) +{ + return queue_work(kblockd_workqueue, work); +} +EXPORT_SYMBOL(kblockd_schedule_work); + +int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, + unsigned long delay) +{ + return mod_delayed_work_on(cpu, kblockd_workqueue, dwork, delay); +} +EXPORT_SYMBOL(kblockd_mod_delayed_work_on); + +/** + * blk_start_plug - initialize blk_plug and track it inside the task_struct + * @plug: The &struct blk_plug that needs to be initialized + * + * Description: + * blk_start_plug() indicates to the block layer an intent by the caller + * to submit multiple I/O requests in a batch. The block layer may use + * this hint to defer submitting I/Os from the caller until blk_finish_plug() + * is called. However, the block layer may choose to submit requests + * before a call to blk_finish_plug() if the number of queued I/Os + * exceeds %BLK_MAX_REQUEST_COUNT, or if the size of the I/O is larger than + * %BLK_PLUG_FLUSH_SIZE. The queued I/Os may also be submitted early if + * the task schedules (see below). + * + * Tracking blk_plug inside the task_struct will help with auto-flushing the + * pending I/O should the task end up blocking between blk_start_plug() and + * blk_finish_plug(). This is important from a performance perspective, but + * also ensures that we don't deadlock. For instance, if the task is blocking + * for a memory allocation, memory reclaim could end up wanting to free a + * page belonging to that request that is currently residing in our private + * plug. By flushing the pending I/O when the process goes to sleep, we avoid + * this kind of deadlock. + */ +void blk_start_plug(struct blk_plug *plug) +{ + struct task_struct *tsk = current; + + /* + * If this is a nested plug, don't actually assign it. + */ + if (tsk->plug) + return; + + INIT_LIST_HEAD(&plug->mq_list); + INIT_LIST_HEAD(&plug->cb_list); + plug->rq_count = 0; + plug->multiple_queues = false; + plug->nowait = false; + + /* + * Store ordering should not be needed here, since a potential + * preempt will imply a full memory barrier + */ + tsk->plug = plug; +} +EXPORT_SYMBOL(blk_start_plug); + +static void flush_plug_callbacks(struct blk_plug *plug, bool from_schedule) +{ + LIST_HEAD(callbacks); + + while (!list_empty(&plug->cb_list)) { + list_splice_init(&plug->cb_list, &callbacks); + + while (!list_empty(&callbacks)) { + struct blk_plug_cb *cb = list_first_entry(&callbacks, + struct blk_plug_cb, + list); + list_del(&cb->list); + cb->callback(cb, from_schedule); + } + } +} + +struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, void *data, + int size) +{ + struct blk_plug *plug = current->plug; + struct blk_plug_cb *cb; + + if (!plug) + return NULL; + + list_for_each_entry(cb, &plug->cb_list, list) + if (cb->callback == unplug && cb->data == data) + return cb; + + /* Not currently on the callback list */ + BUG_ON(size < sizeof(*cb)); + cb = kzalloc(size, GFP_ATOMIC); + if (cb) { + cb->data = data; + cb->callback = unplug; + list_add(&cb->list, &plug->cb_list); + } + return cb; +} +EXPORT_SYMBOL(blk_check_plugged); + +void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) +{ + flush_plug_callbacks(plug, from_schedule); + + if (!list_empty(&plug->mq_list)) + blk_mq_flush_plug_list(plug, from_schedule); +} + +/** + * blk_finish_plug - mark the end of a batch of submitted I/O + * @plug: The &struct blk_plug passed to blk_start_plug() + * + * Description: + * Indicate that a batch of I/O submissions is complete. This function + * must be paired with an initial call to blk_start_plug(). The intent + * is to allow the block layer to optimize I/O submission. See the + * documentation for blk_start_plug() for more information. + */ +void blk_finish_plug(struct blk_plug *plug) +{ + if (plug != current->plug) + return; + blk_flush_plug_list(plug, false); + + current->plug = NULL; +} +EXPORT_SYMBOL(blk_finish_plug); + +void blk_io_schedule(void) +{ + /* Prevent hang_check timer from firing at us during very long I/O */ + unsigned long timeout = sysctl_hung_task_timeout_secs * HZ / 2; + + if (timeout) + io_schedule_timeout(timeout); + else + io_schedule(); +} +EXPORT_SYMBOL_GPL(blk_io_schedule); + +int __init blk_dev_init(void) +{ + BUILD_BUG_ON(REQ_OP_LAST >= (1 << REQ_OP_BITS)); + BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 * + sizeof_field(struct request, cmd_flags)); + BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 * + sizeof_field(struct bio, bi_opf)); + + /* used for unplugging and affects IO latency/throughput - HIGHPRI */ + kblockd_workqueue = alloc_workqueue("kblockd", + WQ_MEM_RECLAIM | WQ_HIGHPRI, 0); + if (!kblockd_workqueue) + panic("Failed to create kblockd\n"); + + blk_requestq_cachep = kmem_cache_create("request_queue", + sizeof(struct request_queue), 0, SLAB_PANIC, NULL); + + blk_debugfs_root = debugfs_create_dir("block", NULL); + + return 0; +} diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c new file mode 100644 index 000000000..c162b754e --- /dev/null +++ b/block/blk-crypto-fallback.c @@ -0,0 +1,657 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2019 Google LLC + */ + +/* + * Refer to Documentation/block/inline-encryption.rst for detailed explanation. + */ + +#define pr_fmt(fmt) "blk-crypto-fallback: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "blk-crypto-internal.h" + +static unsigned int num_prealloc_bounce_pg = 32; +module_param(num_prealloc_bounce_pg, uint, 0); +MODULE_PARM_DESC(num_prealloc_bounce_pg, + "Number of preallocated bounce pages for the blk-crypto crypto API fallback"); + +static unsigned int blk_crypto_num_keyslots = 100; +module_param_named(num_keyslots, blk_crypto_num_keyslots, uint, 0); +MODULE_PARM_DESC(num_keyslots, + "Number of keyslots for the blk-crypto crypto API fallback"); + +static unsigned int num_prealloc_fallback_crypt_ctxs = 128; +module_param(num_prealloc_fallback_crypt_ctxs, uint, 0); +MODULE_PARM_DESC(num_prealloc_crypt_fallback_ctxs, + "Number of preallocated bio fallback crypto contexts for blk-crypto to use during crypto API fallback"); + +struct bio_fallback_crypt_ctx { + struct bio_crypt_ctx crypt_ctx; + /* + * Copy of the bvec_iter when this bio was submitted. + * We only want to en/decrypt the part of the bio as described by the + * bvec_iter upon submission because bio might be split before being + * resubmitted + */ + struct bvec_iter crypt_iter; + union { + struct { + struct work_struct work; + struct bio *bio; + }; + struct { + void *bi_private_orig; + bio_end_io_t *bi_end_io_orig; + }; + }; +}; + +static struct kmem_cache *bio_fallback_crypt_ctx_cache; +static mempool_t *bio_fallback_crypt_ctx_pool; + +/* + * Allocating a crypto tfm during I/O can deadlock, so we have to preallocate + * all of a mode's tfms when that mode starts being used. Since each mode may + * need all the keyslots at some point, each mode needs its own tfm for each + * keyslot; thus, a keyslot may contain tfms for multiple modes. However, to + * match the behavior of real inline encryption hardware (which only supports a + * single encryption context per keyslot), we only allow one tfm per keyslot to + * be used at a time - the rest of the unused tfms have their keys cleared. + */ +static DEFINE_MUTEX(tfms_init_lock); +static bool tfms_inited[BLK_ENCRYPTION_MODE_MAX]; + +static struct blk_crypto_keyslot { + enum blk_crypto_mode_num crypto_mode; + struct crypto_skcipher *tfms[BLK_ENCRYPTION_MODE_MAX]; +} *blk_crypto_keyslots; + +static struct blk_keyslot_manager blk_crypto_ksm; +static struct workqueue_struct *blk_crypto_wq; +static mempool_t *blk_crypto_bounce_page_pool; + +/* + * This is the key we set when evicting a keyslot. This *should* be the all 0's + * key, but AES-XTS rejects that key, so we use some random bytes instead. + */ +static u8 blank_key[BLK_CRYPTO_MAX_KEY_SIZE]; + +static void blk_crypto_evict_keyslot(unsigned int slot) +{ + struct blk_crypto_keyslot *slotp = &blk_crypto_keyslots[slot]; + enum blk_crypto_mode_num crypto_mode = slotp->crypto_mode; + int err; + + WARN_ON(slotp->crypto_mode == BLK_ENCRYPTION_MODE_INVALID); + + /* Clear the key in the skcipher */ + err = crypto_skcipher_setkey(slotp->tfms[crypto_mode], blank_key, + blk_crypto_modes[crypto_mode].keysize); + WARN_ON(err); + slotp->crypto_mode = BLK_ENCRYPTION_MODE_INVALID; +} + +static int blk_crypto_keyslot_program(struct blk_keyslot_manager *ksm, + const struct blk_crypto_key *key, + unsigned int slot) +{ + struct blk_crypto_keyslot *slotp = &blk_crypto_keyslots[slot]; + const enum blk_crypto_mode_num crypto_mode = + key->crypto_cfg.crypto_mode; + int err; + + if (crypto_mode != slotp->crypto_mode && + slotp->crypto_mode != BLK_ENCRYPTION_MODE_INVALID) + blk_crypto_evict_keyslot(slot); + + slotp->crypto_mode = crypto_mode; + err = crypto_skcipher_setkey(slotp->tfms[crypto_mode], key->raw, + key->size); + if (err) { + blk_crypto_evict_keyslot(slot); + return err; + } + return 0; +} + +static int blk_crypto_keyslot_evict(struct blk_keyslot_manager *ksm, + const struct blk_crypto_key *key, + unsigned int slot) +{ + blk_crypto_evict_keyslot(slot); + return 0; +} + +/* + * The crypto API fallback KSM ops - only used for a bio when it specifies a + * blk_crypto_key that was not supported by the device's inline encryption + * hardware. + */ +static const struct blk_ksm_ll_ops blk_crypto_ksm_ll_ops = { + .keyslot_program = blk_crypto_keyslot_program, + .keyslot_evict = blk_crypto_keyslot_evict, +}; + +static void blk_crypto_fallback_encrypt_endio(struct bio *enc_bio) +{ + struct bio *src_bio = enc_bio->bi_private; + int i; + + for (i = 0; i < enc_bio->bi_vcnt; i++) + mempool_free(enc_bio->bi_io_vec[i].bv_page, + blk_crypto_bounce_page_pool); + + src_bio->bi_status = enc_bio->bi_status; + + bio_put(enc_bio); + bio_endio(src_bio); +} + +static struct bio *blk_crypto_clone_bio(struct bio *bio_src) +{ + struct bvec_iter iter; + struct bio_vec bv; + struct bio *bio; + + bio = bio_alloc_bioset(GFP_NOIO, bio_segments(bio_src), NULL); + if (!bio) + return NULL; + bio->bi_disk = bio_src->bi_disk; + bio->bi_opf = bio_src->bi_opf; + bio->bi_ioprio = bio_src->bi_ioprio; + bio->bi_write_hint = bio_src->bi_write_hint; + bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; + bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; + + bio_for_each_segment(bv, bio_src, iter) + bio->bi_io_vec[bio->bi_vcnt++] = bv; + + bio_clone_blkg_association(bio, bio_src); + blkcg_bio_issue_init(bio); + + return bio; +} + +static bool blk_crypto_alloc_cipher_req(struct blk_ksm_keyslot *slot, + struct skcipher_request **ciph_req_ret, + struct crypto_wait *wait) +{ + struct skcipher_request *ciph_req; + const struct blk_crypto_keyslot *slotp; + int keyslot_idx = blk_ksm_get_slot_idx(slot); + + slotp = &blk_crypto_keyslots[keyslot_idx]; + ciph_req = skcipher_request_alloc(slotp->tfms[slotp->crypto_mode], + GFP_NOIO); + if (!ciph_req) + return false; + + skcipher_request_set_callback(ciph_req, + CRYPTO_TFM_REQ_MAY_BACKLOG | + CRYPTO_TFM_REQ_MAY_SLEEP, + crypto_req_done, wait); + *ciph_req_ret = ciph_req; + + return true; +} + +static bool blk_crypto_split_bio_if_needed(struct bio **bio_ptr) +{ + struct bio *bio = *bio_ptr; + unsigned int i = 0; + unsigned int num_sectors = 0; + struct bio_vec bv; + struct bvec_iter iter; + + bio_for_each_segment(bv, bio, iter) { + num_sectors += bv.bv_len >> SECTOR_SHIFT; + if (++i == BIO_MAX_PAGES) + break; + } + if (num_sectors < bio_sectors(bio)) { + struct bio *split_bio; + + split_bio = bio_split(bio, num_sectors, GFP_NOIO, NULL); + if (!split_bio) { + bio->bi_status = BLK_STS_RESOURCE; + return false; + } + bio_chain(split_bio, bio); + submit_bio_noacct(bio); + *bio_ptr = split_bio; + } + + return true; +} + +union blk_crypto_iv { + __le64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE]; + u8 bytes[BLK_CRYPTO_MAX_IV_SIZE]; +}; + +static void blk_crypto_dun_to_iv(const u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], + union blk_crypto_iv *iv) +{ + int i; + + for (i = 0; i < BLK_CRYPTO_DUN_ARRAY_SIZE; i++) + iv->dun[i] = cpu_to_le64(dun[i]); +} + +/* + * The crypto API fallback's encryption routine. + * Allocate a bounce bio for encryption, encrypt the input bio using crypto API, + * and replace *bio_ptr with the bounce bio. May split input bio if it's too + * large. Returns true on success. Returns false and sets bio->bi_status on + * error. + */ +static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr) +{ + struct bio *src_bio, *enc_bio; + struct bio_crypt_ctx *bc; + struct blk_ksm_keyslot *slot; + int data_unit_size; + struct skcipher_request *ciph_req = NULL; + DECLARE_CRYPTO_WAIT(wait); + u64 curr_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]; + struct scatterlist src, dst; + union blk_crypto_iv iv; + unsigned int i, j; + bool ret = false; + blk_status_t blk_st; + + /* Split the bio if it's too big for single page bvec */ + if (!blk_crypto_split_bio_if_needed(bio_ptr)) + return false; + + src_bio = *bio_ptr; + bc = src_bio->bi_crypt_context; + data_unit_size = bc->bc_key->crypto_cfg.data_unit_size; + + /* Allocate bounce bio for encryption */ + enc_bio = blk_crypto_clone_bio(src_bio); + if (!enc_bio) { + src_bio->bi_status = BLK_STS_RESOURCE; + return false; + } + + /* + * Use the crypto API fallback keyslot manager to get a crypto_skcipher + * for the algorithm and key specified for this bio. + */ + blk_st = blk_ksm_get_slot_for_key(&blk_crypto_ksm, bc->bc_key, &slot); + if (blk_st != BLK_STS_OK) { + src_bio->bi_status = blk_st; + goto out_put_enc_bio; + } + + /* and then allocate an skcipher_request for it */ + if (!blk_crypto_alloc_cipher_req(slot, &ciph_req, &wait)) { + src_bio->bi_status = BLK_STS_RESOURCE; + goto out_release_keyslot; + } + + memcpy(curr_dun, bc->bc_dun, sizeof(curr_dun)); + sg_init_table(&src, 1); + sg_init_table(&dst, 1); + + skcipher_request_set_crypt(ciph_req, &src, &dst, data_unit_size, + iv.bytes); + + /* Encrypt each page in the bounce bio */ + for (i = 0; i < enc_bio->bi_vcnt; i++) { + struct bio_vec *enc_bvec = &enc_bio->bi_io_vec[i]; + struct page *plaintext_page = enc_bvec->bv_page; + struct page *ciphertext_page = + mempool_alloc(blk_crypto_bounce_page_pool, GFP_NOIO); + + enc_bvec->bv_page = ciphertext_page; + + if (!ciphertext_page) { + src_bio->bi_status = BLK_STS_RESOURCE; + goto out_free_bounce_pages; + } + + sg_set_page(&src, plaintext_page, data_unit_size, + enc_bvec->bv_offset); + sg_set_page(&dst, ciphertext_page, data_unit_size, + enc_bvec->bv_offset); + + /* Encrypt each data unit in this page */ + for (j = 0; j < enc_bvec->bv_len; j += data_unit_size) { + blk_crypto_dun_to_iv(curr_dun, &iv); + if (crypto_wait_req(crypto_skcipher_encrypt(ciph_req), + &wait)) { + i++; + src_bio->bi_status = BLK_STS_IOERR; + goto out_free_bounce_pages; + } + bio_crypt_dun_increment(curr_dun, 1); + src.offset += data_unit_size; + dst.offset += data_unit_size; + } + } + + enc_bio->bi_private = src_bio; + enc_bio->bi_end_io = blk_crypto_fallback_encrypt_endio; + *bio_ptr = enc_bio; + ret = true; + + enc_bio = NULL; + goto out_free_ciph_req; + +out_free_bounce_pages: + while (i > 0) + mempool_free(enc_bio->bi_io_vec[--i].bv_page, + blk_crypto_bounce_page_pool); +out_free_ciph_req: + skcipher_request_free(ciph_req); +out_release_keyslot: + blk_ksm_put_slot(slot); +out_put_enc_bio: + if (enc_bio) + bio_put(enc_bio); + + return ret; +} + +/* + * The crypto API fallback's main decryption routine. + * Decrypts input bio in place, and calls bio_endio on the bio. + */ +static void blk_crypto_fallback_decrypt_bio(struct work_struct *work) +{ + struct bio_fallback_crypt_ctx *f_ctx = + container_of(work, struct bio_fallback_crypt_ctx, work); + struct bio *bio = f_ctx->bio; + struct bio_crypt_ctx *bc = &f_ctx->crypt_ctx; + struct blk_ksm_keyslot *slot; + struct skcipher_request *ciph_req = NULL; + DECLARE_CRYPTO_WAIT(wait); + u64 curr_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]; + union blk_crypto_iv iv; + struct scatterlist sg; + struct bio_vec bv; + struct bvec_iter iter; + const int data_unit_size = bc->bc_key->crypto_cfg.data_unit_size; + unsigned int i; + blk_status_t blk_st; + + /* + * Use the crypto API fallback keyslot manager to get a crypto_skcipher + * for the algorithm and key specified for this bio. + */ + blk_st = blk_ksm_get_slot_for_key(&blk_crypto_ksm, bc->bc_key, &slot); + if (blk_st != BLK_STS_OK) { + bio->bi_status = blk_st; + goto out_no_keyslot; + } + + /* and then allocate an skcipher_request for it */ + if (!blk_crypto_alloc_cipher_req(slot, &ciph_req, &wait)) { + bio->bi_status = BLK_STS_RESOURCE; + goto out; + } + + memcpy(curr_dun, bc->bc_dun, sizeof(curr_dun)); + sg_init_table(&sg, 1); + skcipher_request_set_crypt(ciph_req, &sg, &sg, data_unit_size, + iv.bytes); + + /* Decrypt each segment in the bio */ + __bio_for_each_segment(bv, bio, iter, f_ctx->crypt_iter) { + struct page *page = bv.bv_page; + + sg_set_page(&sg, page, data_unit_size, bv.bv_offset); + + /* Decrypt each data unit in the segment */ + for (i = 0; i < bv.bv_len; i += data_unit_size) { + blk_crypto_dun_to_iv(curr_dun, &iv); + if (crypto_wait_req(crypto_skcipher_decrypt(ciph_req), + &wait)) { + bio->bi_status = BLK_STS_IOERR; + goto out; + } + bio_crypt_dun_increment(curr_dun, 1); + sg.offset += data_unit_size; + } + } + +out: + skcipher_request_free(ciph_req); + blk_ksm_put_slot(slot); +out_no_keyslot: + mempool_free(f_ctx, bio_fallback_crypt_ctx_pool); + bio_endio(bio); +} + +/** + * blk_crypto_fallback_decrypt_endio - queue bio for fallback decryption + * + * @bio: the bio to queue + * + * Restore bi_private and bi_end_io, and queue the bio for decryption into a + * workqueue, since this function will be called from an atomic context. + */ +static void blk_crypto_fallback_decrypt_endio(struct bio *bio) +{ + struct bio_fallback_crypt_ctx *f_ctx = bio->bi_private; + + bio->bi_private = f_ctx->bi_private_orig; + bio->bi_end_io = f_ctx->bi_end_io_orig; + + /* If there was an IO error, don't queue for decrypt. */ + if (bio->bi_status) { + mempool_free(f_ctx, bio_fallback_crypt_ctx_pool); + bio_endio(bio); + return; + } + + INIT_WORK(&f_ctx->work, blk_crypto_fallback_decrypt_bio); + f_ctx->bio = bio; + queue_work(blk_crypto_wq, &f_ctx->work); +} + +/** + * blk_crypto_fallback_bio_prep - Prepare a bio to use fallback en/decryption + * + * @bio_ptr: pointer to the bio to prepare + * + * If bio is doing a WRITE operation, this splits the bio into two parts if it's + * too big (see blk_crypto_split_bio_if_needed). It then allocates a bounce bio + * for the first part, encrypts it, and update bio_ptr to point to the bounce + * bio. + * + * For a READ operation, we mark the bio for decryption by using bi_private and + * bi_end_io. + * + * In either case, this function will make the bio look like a regular bio (i.e. + * as if no encryption context was ever specified) for the purposes of the rest + * of the stack except for blk-integrity (blk-integrity and blk-crypto are not + * currently supported together). + * + * Return: true on success. Sets bio->bi_status and returns false on error. + */ +bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr) +{ + struct bio *bio = *bio_ptr; + struct bio_crypt_ctx *bc = bio->bi_crypt_context; + struct bio_fallback_crypt_ctx *f_ctx; + + if (WARN_ON_ONCE(!tfms_inited[bc->bc_key->crypto_cfg.crypto_mode])) { + /* User didn't call blk_crypto_start_using_key() first */ + bio->bi_status = BLK_STS_IOERR; + return false; + } + + if (!blk_ksm_crypto_cfg_supported(&blk_crypto_ksm, + &bc->bc_key->crypto_cfg)) { + bio->bi_status = BLK_STS_NOTSUPP; + return false; + } + + if (bio_data_dir(bio) == WRITE) + return blk_crypto_fallback_encrypt_bio(bio_ptr); + + /* + * bio READ case: Set up a f_ctx in the bio's bi_private and set the + * bi_end_io appropriately to trigger decryption when the bio is ended. + */ + f_ctx = mempool_alloc(bio_fallback_crypt_ctx_pool, GFP_NOIO); + f_ctx->crypt_ctx = *bc; + f_ctx->crypt_iter = bio->bi_iter; + f_ctx->bi_private_orig = bio->bi_private; + f_ctx->bi_end_io_orig = bio->bi_end_io; + bio->bi_private = (void *)f_ctx; + bio->bi_end_io = blk_crypto_fallback_decrypt_endio; + bio_crypt_free_ctx(bio); + + return true; +} + +int blk_crypto_fallback_evict_key(const struct blk_crypto_key *key) +{ + return blk_ksm_evict_key(&blk_crypto_ksm, key); +} + +static bool blk_crypto_fallback_inited; +static int blk_crypto_fallback_init(void) +{ + int i; + int err; + + if (blk_crypto_fallback_inited) + return 0; + + prandom_bytes(blank_key, BLK_CRYPTO_MAX_KEY_SIZE); + + err = blk_ksm_init(&blk_crypto_ksm, blk_crypto_num_keyslots); + if (err) + goto out; + err = -ENOMEM; + + blk_crypto_ksm.ksm_ll_ops = blk_crypto_ksm_ll_ops; + blk_crypto_ksm.max_dun_bytes_supported = BLK_CRYPTO_MAX_IV_SIZE; + + /* All blk-crypto modes have a crypto API fallback. */ + for (i = 0; i < BLK_ENCRYPTION_MODE_MAX; i++) + blk_crypto_ksm.crypto_modes_supported[i] = 0xFFFFFFFF; + blk_crypto_ksm.crypto_modes_supported[BLK_ENCRYPTION_MODE_INVALID] = 0; + + blk_crypto_wq = alloc_workqueue("blk_crypto_wq", + WQ_UNBOUND | WQ_HIGHPRI | + WQ_MEM_RECLAIM, num_online_cpus()); + if (!blk_crypto_wq) + goto fail_free_ksm; + + blk_crypto_keyslots = kcalloc(blk_crypto_num_keyslots, + sizeof(blk_crypto_keyslots[0]), + GFP_KERNEL); + if (!blk_crypto_keyslots) + goto fail_free_wq; + + blk_crypto_bounce_page_pool = + mempool_create_page_pool(num_prealloc_bounce_pg, 0); + if (!blk_crypto_bounce_page_pool) + goto fail_free_keyslots; + + bio_fallback_crypt_ctx_cache = KMEM_CACHE(bio_fallback_crypt_ctx, 0); + if (!bio_fallback_crypt_ctx_cache) + goto fail_free_bounce_page_pool; + + bio_fallback_crypt_ctx_pool = + mempool_create_slab_pool(num_prealloc_fallback_crypt_ctxs, + bio_fallback_crypt_ctx_cache); + if (!bio_fallback_crypt_ctx_pool) + goto fail_free_crypt_ctx_cache; + + blk_crypto_fallback_inited = true; + + return 0; +fail_free_crypt_ctx_cache: + kmem_cache_destroy(bio_fallback_crypt_ctx_cache); +fail_free_bounce_page_pool: + mempool_destroy(blk_crypto_bounce_page_pool); +fail_free_keyslots: + kfree(blk_crypto_keyslots); +fail_free_wq: + destroy_workqueue(blk_crypto_wq); +fail_free_ksm: + blk_ksm_destroy(&blk_crypto_ksm); +out: + return err; +} + +/* + * Prepare blk-crypto-fallback for the specified crypto mode. + * Returns -ENOPKG if the needed crypto API support is missing. + */ +int blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num) +{ + const char *cipher_str = blk_crypto_modes[mode_num].cipher_str; + struct blk_crypto_keyslot *slotp; + unsigned int i; + int err = 0; + + /* + * Fast path + * Ensure that updates to blk_crypto_keyslots[i].tfms[mode_num] + * for each i are visible before we try to access them. + */ + if (likely(smp_load_acquire(&tfms_inited[mode_num]))) + return 0; + + mutex_lock(&tfms_init_lock); + if (tfms_inited[mode_num]) + goto out; + + err = blk_crypto_fallback_init(); + if (err) + goto out; + + for (i = 0; i < blk_crypto_num_keyslots; i++) { + slotp = &blk_crypto_keyslots[i]; + slotp->tfms[mode_num] = crypto_alloc_skcipher(cipher_str, 0, 0); + if (IS_ERR(slotp->tfms[mode_num])) { + err = PTR_ERR(slotp->tfms[mode_num]); + if (err == -ENOENT) { + pr_warn_once("Missing crypto API support for \"%s\"\n", + cipher_str); + err = -ENOPKG; + } + slotp->tfms[mode_num] = NULL; + goto out_free_tfms; + } + + crypto_skcipher_set_flags(slotp->tfms[mode_num], + CRYPTO_TFM_REQ_FORBID_WEAK_KEYS); + } + + /* + * Ensure that updates to blk_crypto_keyslots[i].tfms[mode_num] + * for each i are visible before we set tfms_inited[mode_num]. + */ + smp_store_release(&tfms_inited[mode_num], true); + goto out; + +out_free_tfms: + for (i = 0; i < blk_crypto_num_keyslots; i++) { + slotp = &blk_crypto_keyslots[i]; + crypto_free_skcipher(slotp->tfms[mode_num]); + slotp->tfms[mode_num] = NULL; + } +out: + mutex_unlock(&tfms_init_lock); + return err; +} diff --git a/block/blk-crypto-internal.h b/block/blk-crypto-internal.h new file mode 100644 index 000000000..8e0834557 --- /dev/null +++ b/block/blk-crypto-internal.h @@ -0,0 +1,229 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright 2019 Google LLC + */ + +#ifndef __LINUX_BLK_CRYPTO_INTERNAL_H +#define __LINUX_BLK_CRYPTO_INTERNAL_H + +#include +#include + +/* Represents a crypto mode supported by blk-crypto */ +struct blk_crypto_mode { + const char *cipher_str; /* crypto API name (for fallback case) */ + unsigned int keysize; /* key size in bytes */ + unsigned int ivsize; /* iv size in bytes */ +}; + +extern const struct blk_crypto_mode blk_crypto_modes[]; + +#ifdef CONFIG_BLK_INLINE_ENCRYPTION + +void bio_crypt_dun_increment(u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], + unsigned int inc); + +bool bio_crypt_rq_ctx_compatible(struct request *rq, struct bio *bio); + +bool bio_crypt_ctx_mergeable(struct bio_crypt_ctx *bc1, unsigned int bc1_bytes, + struct bio_crypt_ctx *bc2); + +static inline bool bio_crypt_ctx_back_mergeable(struct request *req, + struct bio *bio) +{ + return bio_crypt_ctx_mergeable(req->crypt_ctx, blk_rq_bytes(req), + bio->bi_crypt_context); +} + +static inline bool bio_crypt_ctx_front_mergeable(struct request *req, + struct bio *bio) +{ + return bio_crypt_ctx_mergeable(bio->bi_crypt_context, + bio->bi_iter.bi_size, req->crypt_ctx); +} + +static inline bool bio_crypt_ctx_merge_rq(struct request *req, + struct request *next) +{ + return bio_crypt_ctx_mergeable(req->crypt_ctx, blk_rq_bytes(req), + next->crypt_ctx); +} + +static inline void blk_crypto_rq_set_defaults(struct request *rq) +{ + rq->crypt_ctx = NULL; + rq->crypt_keyslot = NULL; +} + +static inline bool blk_crypto_rq_is_encrypted(struct request *rq) +{ + return rq->crypt_ctx; +} + +static inline bool blk_crypto_rq_has_keyslot(struct request *rq) +{ + return rq->crypt_keyslot; +} + +#else /* CONFIG_BLK_INLINE_ENCRYPTION */ + +static inline bool bio_crypt_rq_ctx_compatible(struct request *rq, + struct bio *bio) +{ + return true; +} + +static inline bool bio_crypt_ctx_front_mergeable(struct request *req, + struct bio *bio) +{ + return true; +} + +static inline bool bio_crypt_ctx_back_mergeable(struct request *req, + struct bio *bio) +{ + return true; +} + +static inline bool bio_crypt_ctx_merge_rq(struct request *req, + struct request *next) +{ + return true; +} + +static inline void blk_crypto_rq_set_defaults(struct request *rq) { } + +static inline bool blk_crypto_rq_is_encrypted(struct request *rq) +{ + return false; +} + +static inline bool blk_crypto_rq_has_keyslot(struct request *rq) +{ + return false; +} + +#endif /* CONFIG_BLK_INLINE_ENCRYPTION */ + +void __bio_crypt_advance(struct bio *bio, unsigned int bytes); +static inline void bio_crypt_advance(struct bio *bio, unsigned int bytes) +{ + if (bio_has_crypt_ctx(bio)) + __bio_crypt_advance(bio, bytes); +} + +void __bio_crypt_free_ctx(struct bio *bio); +static inline void bio_crypt_free_ctx(struct bio *bio) +{ + if (bio_has_crypt_ctx(bio)) + __bio_crypt_free_ctx(bio); +} + +static inline void bio_crypt_do_front_merge(struct request *rq, + struct bio *bio) +{ +#ifdef CONFIG_BLK_INLINE_ENCRYPTION + if (bio_has_crypt_ctx(bio)) + memcpy(rq->crypt_ctx->bc_dun, bio->bi_crypt_context->bc_dun, + sizeof(rq->crypt_ctx->bc_dun)); +#endif +} + +bool __blk_crypto_bio_prep(struct bio **bio_ptr); +static inline bool blk_crypto_bio_prep(struct bio **bio_ptr) +{ + if (bio_has_crypt_ctx(*bio_ptr)) + return __blk_crypto_bio_prep(bio_ptr); + return true; +} + +blk_status_t __blk_crypto_rq_get_keyslot(struct request *rq); +static inline blk_status_t blk_crypto_rq_get_keyslot(struct request *rq) +{ + if (blk_crypto_rq_is_encrypted(rq)) + return __blk_crypto_rq_get_keyslot(rq); + return BLK_STS_OK; +} + +void __blk_crypto_rq_put_keyslot(struct request *rq); +static inline void blk_crypto_rq_put_keyslot(struct request *rq) +{ + if (blk_crypto_rq_has_keyslot(rq)) + __blk_crypto_rq_put_keyslot(rq); +} + +void __blk_crypto_free_request(struct request *rq); +static inline void blk_crypto_free_request(struct request *rq) +{ + if (blk_crypto_rq_is_encrypted(rq)) + __blk_crypto_free_request(rq); +} + +int __blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio, + gfp_t gfp_mask); +/** + * blk_crypto_rq_bio_prep - Prepare a request's crypt_ctx when its first bio + * is inserted + * @rq: The request to prepare + * @bio: The first bio being inserted into the request + * @gfp_mask: Memory allocation flags + * + * Return: 0 on success, -ENOMEM if out of memory. -ENOMEM is only possible if + * @gfp_mask doesn't include %__GFP_DIRECT_RECLAIM. + */ +static inline int blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio, + gfp_t gfp_mask) +{ + if (bio_has_crypt_ctx(bio)) + return __blk_crypto_rq_bio_prep(rq, bio, gfp_mask); + return 0; +} + +/** + * blk_crypto_insert_cloned_request - Prepare a cloned request to be inserted + * into a request queue. + * @rq: the request being queued + * + * Return: BLK_STS_OK on success, nonzero on error. + */ +static inline blk_status_t blk_crypto_insert_cloned_request(struct request *rq) +{ + + if (blk_crypto_rq_is_encrypted(rq)) + return blk_crypto_rq_get_keyslot(rq); + return BLK_STS_OK; +} + +#ifdef CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK + +int blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num); + +bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr); + +int blk_crypto_fallback_evict_key(const struct blk_crypto_key *key); + +#else /* CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK */ + +static inline int +blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num) +{ + pr_warn_once("crypto API fallback is disabled\n"); + return -ENOPKG; +} + +static inline bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr) +{ + pr_warn_once("crypto API fallback disabled; failing request.\n"); + (*bio_ptr)->bi_status = BLK_STS_NOTSUPP; + return false; +} + +static inline int +blk_crypto_fallback_evict_key(const struct blk_crypto_key *key) +{ + return 0; +} + +#endif /* CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK */ + +#endif /* __LINUX_BLK_CRYPTO_INTERNAL_H */ diff --git a/block/blk-crypto.c b/block/blk-crypto.c new file mode 100644 index 000000000..87ec55d43 --- /dev/null +++ b/block/blk-crypto.c @@ -0,0 +1,422 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2019 Google LLC + */ + +/* + * Refer to Documentation/block/inline-encryption.rst for detailed explanation. + */ + +#define pr_fmt(fmt) "blk-crypto: " fmt + +#include +#include +#include +#include +#include +#include + +#include "blk-crypto-internal.h" + +const struct blk_crypto_mode blk_crypto_modes[] = { + [BLK_ENCRYPTION_MODE_AES_256_XTS] = { + .cipher_str = "xts(aes)", + .keysize = 64, + .ivsize = 16, + }, + [BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV] = { + .cipher_str = "essiv(cbc(aes),sha256)", + .keysize = 16, + .ivsize = 16, + }, + [BLK_ENCRYPTION_MODE_ADIANTUM] = { + .cipher_str = "adiantum(xchacha12,aes)", + .keysize = 32, + .ivsize = 32, + }, +}; + +/* + * This number needs to be at least (the number of threads doing IO + * concurrently) * (maximum recursive depth of a bio), so that we don't + * deadlock on crypt_ctx allocations. The default is chosen to be the same + * as the default number of post read contexts in both EXT4 and F2FS. + */ +static int num_prealloc_crypt_ctxs = 128; + +module_param(num_prealloc_crypt_ctxs, int, 0444); +MODULE_PARM_DESC(num_prealloc_crypt_ctxs, + "Number of bio crypto contexts to preallocate"); + +static struct kmem_cache *bio_crypt_ctx_cache; +static mempool_t *bio_crypt_ctx_pool; + +static int __init bio_crypt_ctx_init(void) +{ + size_t i; + + bio_crypt_ctx_cache = KMEM_CACHE(bio_crypt_ctx, 0); + if (!bio_crypt_ctx_cache) + goto out_no_mem; + + bio_crypt_ctx_pool = mempool_create_slab_pool(num_prealloc_crypt_ctxs, + bio_crypt_ctx_cache); + if (!bio_crypt_ctx_pool) + goto out_no_mem; + + /* This is assumed in various places. */ + BUILD_BUG_ON(BLK_ENCRYPTION_MODE_INVALID != 0); + + /* Sanity check that no algorithm exceeds the defined limits. */ + for (i = 0; i < BLK_ENCRYPTION_MODE_MAX; i++) { + BUG_ON(blk_crypto_modes[i].keysize > BLK_CRYPTO_MAX_KEY_SIZE); + BUG_ON(blk_crypto_modes[i].ivsize > BLK_CRYPTO_MAX_IV_SIZE); + } + + return 0; +out_no_mem: + panic("Failed to allocate mem for bio crypt ctxs\n"); +} +subsys_initcall(bio_crypt_ctx_init); + +void bio_crypt_set_ctx(struct bio *bio, const struct blk_crypto_key *key, + const u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], gfp_t gfp_mask) +{ + struct bio_crypt_ctx *bc; + + /* + * The caller must use a gfp_mask that contains __GFP_DIRECT_RECLAIM so + * that the mempool_alloc() can't fail. + */ + WARN_ON_ONCE(!(gfp_mask & __GFP_DIRECT_RECLAIM)); + + bc = mempool_alloc(bio_crypt_ctx_pool, gfp_mask); + + bc->bc_key = key; + memcpy(bc->bc_dun, dun, sizeof(bc->bc_dun)); + + bio->bi_crypt_context = bc; +} + +void __bio_crypt_free_ctx(struct bio *bio) +{ + mempool_free(bio->bi_crypt_context, bio_crypt_ctx_pool); + bio->bi_crypt_context = NULL; +} + +int __bio_crypt_clone(struct bio *dst, struct bio *src, gfp_t gfp_mask) +{ + dst->bi_crypt_context = mempool_alloc(bio_crypt_ctx_pool, gfp_mask); + if (!dst->bi_crypt_context) + return -ENOMEM; + *dst->bi_crypt_context = *src->bi_crypt_context; + return 0; +} +EXPORT_SYMBOL_GPL(__bio_crypt_clone); + +/* Increments @dun by @inc, treating @dun as a multi-limb integer. */ +void bio_crypt_dun_increment(u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], + unsigned int inc) +{ + int i; + + for (i = 0; inc && i < BLK_CRYPTO_DUN_ARRAY_SIZE; i++) { + dun[i] += inc; + /* + * If the addition in this limb overflowed, then we need to + * carry 1 into the next limb. Else the carry is 0. + */ + if (dun[i] < inc) + inc = 1; + else + inc = 0; + } +} + +void __bio_crypt_advance(struct bio *bio, unsigned int bytes) +{ + struct bio_crypt_ctx *bc = bio->bi_crypt_context; + + bio_crypt_dun_increment(bc->bc_dun, + bytes >> bc->bc_key->data_unit_size_bits); +} + +/* + * Returns true if @bc->bc_dun plus @bytes converted to data units is equal to + * @next_dun, treating the DUNs as multi-limb integers. + */ +bool bio_crypt_dun_is_contiguous(const struct bio_crypt_ctx *bc, + unsigned int bytes, + const u64 next_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]) +{ + int i; + unsigned int carry = bytes >> bc->bc_key->data_unit_size_bits; + + for (i = 0; i < BLK_CRYPTO_DUN_ARRAY_SIZE; i++) { + if (bc->bc_dun[i] + carry != next_dun[i]) + return false; + /* + * If the addition in this limb overflowed, then we need to + * carry 1 into the next limb. Else the carry is 0. + */ + if ((bc->bc_dun[i] + carry) < carry) + carry = 1; + else + carry = 0; + } + + /* If the DUN wrapped through 0, don't treat it as contiguous. */ + return carry == 0; +} + +/* + * Checks that two bio crypt contexts are compatible - i.e. that + * they are mergeable except for data_unit_num continuity. + */ +static bool bio_crypt_ctx_compatible(struct bio_crypt_ctx *bc1, + struct bio_crypt_ctx *bc2) +{ + if (!bc1) + return !bc2; + + return bc2 && bc1->bc_key == bc2->bc_key; +} + +bool bio_crypt_rq_ctx_compatible(struct request *rq, struct bio *bio) +{ + return bio_crypt_ctx_compatible(rq->crypt_ctx, bio->bi_crypt_context); +} + +/* + * Checks that two bio crypt contexts are compatible, and also + * that their data_unit_nums are continuous (and can hence be merged) + * in the order @bc1 followed by @bc2. + */ +bool bio_crypt_ctx_mergeable(struct bio_crypt_ctx *bc1, unsigned int bc1_bytes, + struct bio_crypt_ctx *bc2) +{ + if (!bio_crypt_ctx_compatible(bc1, bc2)) + return false; + + return !bc1 || bio_crypt_dun_is_contiguous(bc1, bc1_bytes, bc2->bc_dun); +} + +/* Check that all I/O segments are data unit aligned. */ +static bool bio_crypt_check_alignment(struct bio *bio) +{ + const unsigned int data_unit_size = + bio->bi_crypt_context->bc_key->crypto_cfg.data_unit_size; + struct bvec_iter iter; + struct bio_vec bv; + + bio_for_each_segment(bv, bio, iter) { + if (!IS_ALIGNED(bv.bv_len | bv.bv_offset, data_unit_size)) + return false; + } + + return true; +} + +blk_status_t __blk_crypto_rq_get_keyslot(struct request *rq) +{ + return blk_ksm_get_slot_for_key(rq->q->ksm, rq->crypt_ctx->bc_key, + &rq->crypt_keyslot); +} + +void __blk_crypto_rq_put_keyslot(struct request *rq) +{ + blk_ksm_put_slot(rq->crypt_keyslot); + rq->crypt_keyslot = NULL; +} + +void __blk_crypto_free_request(struct request *rq) +{ + /* The keyslot, if one was needed, should have been released earlier. */ + if (WARN_ON_ONCE(rq->crypt_keyslot)) + __blk_crypto_rq_put_keyslot(rq); + + mempool_free(rq->crypt_ctx, bio_crypt_ctx_pool); + rq->crypt_ctx = NULL; +} + +/** + * __blk_crypto_bio_prep - Prepare bio for inline encryption + * + * @bio_ptr: pointer to original bio pointer + * + * If the bio crypt context provided for the bio is supported by the underlying + * device's inline encryption hardware, do nothing. + * + * Otherwise, try to perform en/decryption for this bio by falling back to the + * kernel crypto API. When the crypto API fallback is used for encryption, + * blk-crypto may choose to split the bio into 2 - the first one that will + * continue to be processed and the second one that will be resubmitted via + * submit_bio_noacct. A bounce bio will be allocated to encrypt the contents + * of the aforementioned "first one", and *bio_ptr will be updated to this + * bounce bio. + * + * Caller must ensure bio has bio_crypt_ctx. + * + * Return: true on success; false on error (and bio->bi_status will be set + * appropriately, and bio_endio() will have been called so bio + * submission should abort). + */ +bool __blk_crypto_bio_prep(struct bio **bio_ptr) +{ + struct bio *bio = *bio_ptr; + const struct blk_crypto_key *bc_key = bio->bi_crypt_context->bc_key; + + /* Error if bio has no data. */ + if (WARN_ON_ONCE(!bio_has_data(bio))) { + bio->bi_status = BLK_STS_IOERR; + goto fail; + } + + if (!bio_crypt_check_alignment(bio)) { + bio->bi_status = BLK_STS_IOERR; + goto fail; + } + + /* + * Success if device supports the encryption context, or if we succeeded + * in falling back to the crypto API. + */ + if (blk_ksm_crypto_cfg_supported(bio->bi_disk->queue->ksm, + &bc_key->crypto_cfg)) + return true; + + if (blk_crypto_fallback_bio_prep(bio_ptr)) + return true; +fail: + bio_endio(*bio_ptr); + return false; +} + +int __blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio, + gfp_t gfp_mask) +{ + if (!rq->crypt_ctx) { + rq->crypt_ctx = mempool_alloc(bio_crypt_ctx_pool, gfp_mask); + if (!rq->crypt_ctx) + return -ENOMEM; + } + *rq->crypt_ctx = *bio->bi_crypt_context; + return 0; +} + +/** + * blk_crypto_init_key() - Prepare a key for use with blk-crypto + * @blk_key: Pointer to the blk_crypto_key to initialize. + * @raw_key: Pointer to the raw key. Must be the correct length for the chosen + * @crypto_mode; see blk_crypto_modes[]. + * @crypto_mode: identifier for the encryption algorithm to use + * @dun_bytes: number of bytes that will be used to specify the DUN when this + * key is used + * @data_unit_size: the data unit size to use for en/decryption + * + * Return: 0 on success, -errno on failure. The caller is responsible for + * zeroizing both blk_key and raw_key when done with them. + */ +int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key, + enum blk_crypto_mode_num crypto_mode, + unsigned int dun_bytes, + unsigned int data_unit_size) +{ + const struct blk_crypto_mode *mode; + + memset(blk_key, 0, sizeof(*blk_key)); + + if (crypto_mode >= ARRAY_SIZE(blk_crypto_modes)) + return -EINVAL; + + mode = &blk_crypto_modes[crypto_mode]; + if (mode->keysize == 0) + return -EINVAL; + + if (dun_bytes == 0 || dun_bytes > mode->ivsize) + return -EINVAL; + + if (!is_power_of_2(data_unit_size)) + return -EINVAL; + + blk_key->crypto_cfg.crypto_mode = crypto_mode; + blk_key->crypto_cfg.dun_bytes = dun_bytes; + blk_key->crypto_cfg.data_unit_size = data_unit_size; + blk_key->data_unit_size_bits = ilog2(data_unit_size); + blk_key->size = mode->keysize; + memcpy(blk_key->raw, raw_key, mode->keysize); + + return 0; +} + +/* + * Check if bios with @cfg can be en/decrypted by blk-crypto (i.e. either the + * request queue it's submitted to supports inline crypto, or the + * blk-crypto-fallback is enabled and supports the cfg). + */ +bool blk_crypto_config_supported(struct request_queue *q, + const struct blk_crypto_config *cfg) +{ + return IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) || + blk_ksm_crypto_cfg_supported(q->ksm, cfg); +} + +/** + * blk_crypto_start_using_key() - Start using a blk_crypto_key on a device + * @key: A key to use on the device + * @q: the request queue for the device + * + * Upper layers must call this function to ensure that either the hardware + * supports the key's crypto settings, or the crypto API fallback has transforms + * for the needed mode allocated and ready to go. This function may allocate + * an skcipher, and *should not* be called from the data path, since that might + * cause a deadlock + * + * Return: 0 on success; -ENOPKG if the hardware doesn't support the key and + * blk-crypto-fallback is either disabled or the needed algorithm + * is disabled in the crypto API; or another -errno code. + */ +int blk_crypto_start_using_key(const struct blk_crypto_key *key, + struct request_queue *q) +{ + if (blk_ksm_crypto_cfg_supported(q->ksm, &key->crypto_cfg)) + return 0; + return blk_crypto_fallback_start_using_mode(key->crypto_cfg.crypto_mode); +} + +/** + * blk_crypto_evict_key() - Evict a blk_crypto_key from a request_queue + * @q: a request_queue on which I/O using the key may have been done + * @key: the key to evict + * + * For a given request_queue, this function removes the given blk_crypto_key + * from the keyslot management structures and evicts it from any underlying + * hardware keyslot(s) or blk-crypto-fallback keyslot it may have been + * programmed into. + * + * Upper layers must call this before freeing the blk_crypto_key. It must be + * called for every request_queue the key may have been used on. The key must + * no longer be in use by any I/O when this function is called. + * + * Context: May sleep. + */ +void blk_crypto_evict_key(struct request_queue *q, + const struct blk_crypto_key *key) +{ + int err; + + if (blk_ksm_crypto_cfg_supported(q->ksm, &key->crypto_cfg)) + err = blk_ksm_evict_key(q->ksm, key); + else + err = blk_crypto_fallback_evict_key(key); + /* + * An error can only occur here if the key failed to be evicted from a + * keyslot (due to a hardware or driver issue) or is allegedly still in + * use by I/O (due to a kernel bug). Even in these cases, the key is + * still unlinked from the keyslot management structures, and the caller + * is allowed and expected to free it right away. There's nothing + * callers can do to handle errors, so just log them and return void. + */ + if (err) + pr_warn_ratelimited("error %d evicting key\n", err); +} diff --git a/block/blk-exec.c b/block/blk-exec.c new file mode 100644 index 000000000..85324d53d --- /dev/null +++ b/block/blk-exec.c @@ -0,0 +1,95 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Functions related to setting various queue properties from drivers + */ +#include +#include +#include +#include +#include +#include + +#include "blk.h" +#include "blk-mq-sched.h" + +/** + * blk_end_sync_rq - executes a completion event on a request + * @rq: request to complete + * @error: end I/O status of the request + */ +static void blk_end_sync_rq(struct request *rq, blk_status_t error) +{ + struct completion *waiting = rq->end_io_data; + + rq->end_io_data = NULL; + + /* + * complete last, if this is a stack request the process (and thus + * the rq pointer) could be invalid right after this complete() + */ + complete(waiting); +} + +/** + * blk_execute_rq_nowait - insert a request into queue for execution + * @q: queue to insert the request in + * @bd_disk: matching gendisk + * @rq: request to insert + * @at_head: insert request at head or tail of queue + * @done: I/O completion handler + * + * Description: + * Insert a fully prepared request at the back of the I/O scheduler queue + * for execution. Don't wait for completion. + * + * Note: + * This function will invoke @done directly if the queue is dead. + */ +void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, + struct request *rq, int at_head, + rq_end_io_fn *done) +{ + WARN_ON(irqs_disabled()); + WARN_ON(!blk_rq_is_passthrough(rq)); + + rq->rq_disk = bd_disk; + rq->end_io = done; + + blk_account_io_start(rq); + + /* + * don't check dying flag for MQ because the request won't + * be reused after dying flag is set + */ + blk_mq_sched_insert_request(rq, at_head, true, false); +} +EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); + +/** + * blk_execute_rq - insert a request into queue for execution + * @q: queue to insert the request in + * @bd_disk: matching gendisk + * @rq: request to insert + * @at_head: insert request at head or tail of queue + * + * Description: + * Insert a fully prepared request at the back of the I/O scheduler queue + * for execution and wait for completion. + */ +void blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, + struct request *rq, int at_head) +{ + DECLARE_COMPLETION_ONSTACK(wait); + unsigned long hang_check; + + rq->end_io_data = &wait; + blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq); + + /* Prevent hang_check timer from firing at us during very long I/O */ + hang_check = sysctl_hung_task_timeout_secs; + if (hang_check) + while (!wait_for_completion_io_timeout(&wait, hang_check * (HZ/2))); + else + wait_for_completion_io(&wait); +} +EXPORT_SYMBOL(blk_execute_rq); diff --git a/block/blk-flush.c b/block/blk-flush.c new file mode 100644 index 000000000..33b487b5c --- /dev/null +++ b/block/blk-flush.c @@ -0,0 +1,511 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Functions to sequence PREFLUSH and FUA writes. + * + * Copyright (C) 2011 Max Planck Institute for Gravitational Physics + * Copyright (C) 2011 Tejun Heo + * + * REQ_{PREFLUSH|FUA} requests are decomposed to sequences consisted of three + * optional steps - PREFLUSH, DATA and POSTFLUSH - according to the request + * properties and hardware capability. + * + * If a request doesn't have data, only REQ_PREFLUSH makes sense, which + * indicates a simple flush request. If there is data, REQ_PREFLUSH indicates + * that the device cache should be flushed before the data is executed, and + * REQ_FUA means that the data must be on non-volatile media on request + * completion. + * + * If the device doesn't have writeback cache, PREFLUSH and FUA don't make any + * difference. The requests are either completed immediately if there's no data + * or executed as normal requests otherwise. + * + * If the device has writeback cache and supports FUA, REQ_PREFLUSH is + * translated to PREFLUSH but REQ_FUA is passed down directly with DATA. + * + * If the device has writeback cache and doesn't support FUA, REQ_PREFLUSH + * is translated to PREFLUSH and REQ_FUA to POSTFLUSH. + * + * The actual execution of flush is double buffered. Whenever a request + * needs to execute PRE or POSTFLUSH, it queues at + * fq->flush_queue[fq->flush_pending_idx]. Once certain criteria are met, a + * REQ_OP_FLUSH is issued and the pending_idx is toggled. When the flush + * completes, all the requests which were pending are proceeded to the next + * step. This allows arbitrary merging of different types of PREFLUSH/FUA + * requests. + * + * Currently, the following conditions are used to determine when to issue + * flush. + * + * C1. At any given time, only one flush shall be in progress. This makes + * double buffering sufficient. + * + * C2. Flush is deferred if any request is executing DATA of its sequence. + * This avoids issuing separate POSTFLUSHes for requests which shared + * PREFLUSH. + * + * C3. The second condition is ignored if there is a request which has + * waited longer than FLUSH_PENDING_TIMEOUT. This is to avoid + * starvation in the unlikely case where there are continuous stream of + * FUA (without PREFLUSH) requests. + * + * For devices which support FUA, it isn't clear whether C2 (and thus C3) + * is beneficial. + * + * Note that a sequenced PREFLUSH/FUA request with DATA is completed twice. + * Once while executing DATA and again after the whole sequence is + * complete. The first completion updates the contained bio but doesn't + * finish it so that the bio submitter is notified only after the whole + * sequence is complete. This is implemented by testing RQF_FLUSH_SEQ in + * req_bio_endio(). + * + * The above peculiarity requires that each PREFLUSH/FUA request has only one + * bio attached to it, which is guaranteed as they aren't allowed to be + * merged in the usual way. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "blk.h" +#include "blk-mq.h" +#include "blk-mq-tag.h" +#include "blk-mq-sched.h" + +/* PREFLUSH/FUA sequences */ +enum { + REQ_FSEQ_PREFLUSH = (1 << 0), /* pre-flushing in progress */ + REQ_FSEQ_DATA = (1 << 1), /* data write in progress */ + REQ_FSEQ_POSTFLUSH = (1 << 2), /* post-flushing in progress */ + REQ_FSEQ_DONE = (1 << 3), + + REQ_FSEQ_ACTIONS = REQ_FSEQ_PREFLUSH | REQ_FSEQ_DATA | + REQ_FSEQ_POSTFLUSH, + + /* + * If flush has been pending longer than the following timeout, + * it's issued even if flush_data requests are still in flight. + */ + FLUSH_PENDING_TIMEOUT = 5 * HZ, +}; + +static void blk_kick_flush(struct request_queue *q, + struct blk_flush_queue *fq, unsigned int flags); + +static unsigned int blk_flush_policy(unsigned long fflags, struct request *rq) +{ + unsigned int policy = 0; + + if (blk_rq_sectors(rq)) + policy |= REQ_FSEQ_DATA; + + if (fflags & (1UL << QUEUE_FLAG_WC)) { + if (rq->cmd_flags & REQ_PREFLUSH) + policy |= REQ_FSEQ_PREFLUSH; + if (!(fflags & (1UL << QUEUE_FLAG_FUA)) && + (rq->cmd_flags & REQ_FUA)) + policy |= REQ_FSEQ_POSTFLUSH; + } + return policy; +} + +static unsigned int blk_flush_cur_seq(struct request *rq) +{ + return 1 << ffz(rq->flush.seq); +} + +static void blk_flush_restore_request(struct request *rq) +{ + /* + * After flush data completion, @rq->bio is %NULL but we need to + * complete the bio again. @rq->biotail is guaranteed to equal the + * original @rq->bio. Restore it. + */ + rq->bio = rq->biotail; + + /* make @rq a normal request */ + rq->rq_flags &= ~RQF_FLUSH_SEQ; + rq->end_io = rq->flush.saved_end_io; +} + +static void blk_flush_queue_rq(struct request *rq, bool add_front) +{ + blk_mq_add_to_requeue_list(rq, add_front, true); +} + +static void blk_account_io_flush(struct request *rq) +{ + struct hd_struct *part = &rq->rq_disk->part0; + + part_stat_lock(); + part_stat_inc(part, ios[STAT_FLUSH]); + part_stat_add(part, nsecs[STAT_FLUSH], + ktime_get_ns() - rq->start_time_ns); + part_stat_unlock(); +} + +/** + * blk_flush_complete_seq - complete flush sequence + * @rq: PREFLUSH/FUA request being sequenced + * @fq: flush queue + * @seq: sequences to complete (mask of %REQ_FSEQ_*, can be zero) + * @error: whether an error occurred + * + * @rq just completed @seq part of its flush sequence, record the + * completion and trigger the next step. + * + * CONTEXT: + * spin_lock_irq(fq->mq_flush_lock) + */ +static void blk_flush_complete_seq(struct request *rq, + struct blk_flush_queue *fq, + unsigned int seq, blk_status_t error) +{ + struct request_queue *q = rq->q; + struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx]; + unsigned int cmd_flags; + + BUG_ON(rq->flush.seq & seq); + rq->flush.seq |= seq; + cmd_flags = rq->cmd_flags; + + if (likely(!error)) + seq = blk_flush_cur_seq(rq); + else + seq = REQ_FSEQ_DONE; + + switch (seq) { + case REQ_FSEQ_PREFLUSH: + case REQ_FSEQ_POSTFLUSH: + /* queue for flush */ + if (list_empty(pending)) + fq->flush_pending_since = jiffies; + list_move_tail(&rq->flush.list, pending); + break; + + case REQ_FSEQ_DATA: + list_move_tail(&rq->flush.list, &fq->flush_data_in_flight); + blk_flush_queue_rq(rq, true); + break; + + case REQ_FSEQ_DONE: + /* + * @rq was previously adjusted by blk_insert_flush() for + * flush sequencing and may already have gone through the + * flush data request completion path. Restore @rq for + * normal completion and end it. + */ + BUG_ON(!list_empty(&rq->queuelist)); + list_del_init(&rq->flush.list); + blk_flush_restore_request(rq); + blk_mq_end_request(rq, error); + break; + + default: + BUG(); + } + + blk_kick_flush(q, fq, cmd_flags); +} + +static void flush_end_io(struct request *flush_rq, blk_status_t error) +{ + struct request_queue *q = flush_rq->q; + struct list_head *running; + struct request *rq, *n; + unsigned long flags = 0; + struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx); + + /* release the tag's ownership to the req cloned from */ + spin_lock_irqsave(&fq->mq_flush_lock, flags); + + if (!refcount_dec_and_test(&flush_rq->ref)) { + fq->rq_status = error; + spin_unlock_irqrestore(&fq->mq_flush_lock, flags); + return; + } + + blk_account_io_flush(flush_rq); + /* + * Flush request has to be marked as IDLE when it is really ended + * because its .end_io() is called from timeout code path too for + * avoiding use-after-free. + */ + WRITE_ONCE(flush_rq->state, MQ_RQ_IDLE); + if (fq->rq_status != BLK_STS_OK) { + error = fq->rq_status; + fq->rq_status = BLK_STS_OK; + } + + if (!q->elevator) { + flush_rq->tag = BLK_MQ_NO_TAG; + } else { + blk_mq_put_driver_tag(flush_rq); + flush_rq->internal_tag = BLK_MQ_NO_TAG; + } + + running = &fq->flush_queue[fq->flush_running_idx]; + BUG_ON(fq->flush_pending_idx == fq->flush_running_idx); + + /* account completion of the flush request */ + fq->flush_running_idx ^= 1; + + /* and push the waiting requests to the next stage */ + list_for_each_entry_safe(rq, n, running, flush.list) { + unsigned int seq = blk_flush_cur_seq(rq); + + BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH); + blk_flush_complete_seq(rq, fq, seq, error); + } + + spin_unlock_irqrestore(&fq->mq_flush_lock, flags); +} + +bool is_flush_rq(struct request *rq) +{ + return rq->end_io == flush_end_io; +} + +/** + * blk_kick_flush - consider issuing flush request + * @q: request_queue being kicked + * @fq: flush queue + * @flags: cmd_flags of the original request + * + * Flush related states of @q have changed, consider issuing flush request. + * Please read the comment at the top of this file for more info. + * + * CONTEXT: + * spin_lock_irq(fq->mq_flush_lock) + * + */ +static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, + unsigned int flags) +{ + struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx]; + struct request *first_rq = + list_first_entry(pending, struct request, flush.list); + struct request *flush_rq = fq->flush_rq; + + /* C1 described at the top of this file */ + if (fq->flush_pending_idx != fq->flush_running_idx || list_empty(pending)) + return; + + /* C2 and C3 */ + if (!list_empty(&fq->flush_data_in_flight) && + time_before(jiffies, + fq->flush_pending_since + FLUSH_PENDING_TIMEOUT)) + return; + + /* + * Issue flush and toggle pending_idx. This makes pending_idx + * different from running_idx, which means flush is in flight. + */ + fq->flush_pending_idx ^= 1; + + blk_rq_init(q, flush_rq); + + /* + * In case of none scheduler, borrow tag from the first request + * since they can't be in flight at the same time. And acquire + * the tag's ownership for flush req. + * + * In case of IO scheduler, flush rq need to borrow scheduler tag + * just for cheating put/get driver tag. + */ + flush_rq->mq_ctx = first_rq->mq_ctx; + flush_rq->mq_hctx = first_rq->mq_hctx; + + if (!q->elevator) { + flush_rq->tag = first_rq->tag; + + /* + * We borrow data request's driver tag, so have to mark + * this flush request as INFLIGHT for avoiding double + * account of this driver tag + */ + flush_rq->rq_flags |= RQF_MQ_INFLIGHT; + } else + flush_rq->internal_tag = first_rq->internal_tag; + + flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH; + flush_rq->cmd_flags |= (flags & REQ_DRV) | (flags & REQ_FAILFAST_MASK); + flush_rq->rq_flags |= RQF_FLUSH_SEQ; + flush_rq->rq_disk = first_rq->rq_disk; + flush_rq->end_io = flush_end_io; + /* + * Order WRITE ->end_io and WRITE rq->ref, and its pair is the one + * implied in refcount_inc_not_zero() called from + * blk_mq_find_and_get_req(), which orders WRITE/READ flush_rq->ref + * and READ flush_rq->end_io + */ + smp_wmb(); + refcount_set(&flush_rq->ref, 1); + + blk_flush_queue_rq(flush_rq, false); +} + +static void mq_flush_data_end_io(struct request *rq, blk_status_t error) +{ + struct request_queue *q = rq->q; + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; + struct blk_mq_ctx *ctx = rq->mq_ctx; + unsigned long flags; + struct blk_flush_queue *fq = blk_get_flush_queue(q, ctx); + + if (q->elevator) { + WARN_ON(rq->tag < 0); + blk_mq_put_driver_tag(rq); + } + + /* + * After populating an empty queue, kick it to avoid stall. Read + * the comment in flush_end_io(). + */ + spin_lock_irqsave(&fq->mq_flush_lock, flags); + blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error); + spin_unlock_irqrestore(&fq->mq_flush_lock, flags); + + blk_mq_sched_restart(hctx); +} + +/** + * blk_insert_flush - insert a new PREFLUSH/FUA request + * @rq: request to insert + * + * To be called from __elv_add_request() for %ELEVATOR_INSERT_FLUSH insertions. + * or __blk_mq_run_hw_queue() to dispatch request. + * @rq is being submitted. Analyze what needs to be done and put it on the + * right queue. + */ +void blk_insert_flush(struct request *rq) +{ + struct request_queue *q = rq->q; + unsigned long fflags = q->queue_flags; /* may change, cache */ + unsigned int policy = blk_flush_policy(fflags, rq); + struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx); + + /* + * @policy now records what operations need to be done. Adjust + * REQ_PREFLUSH and FUA for the driver. + */ + rq->cmd_flags &= ~REQ_PREFLUSH; + if (!(fflags & (1UL << QUEUE_FLAG_FUA))) + rq->cmd_flags &= ~REQ_FUA; + + /* + * REQ_PREFLUSH|REQ_FUA implies REQ_SYNC, so if we clear any + * of those flags, we have to set REQ_SYNC to avoid skewing + * the request accounting. + */ + rq->cmd_flags |= REQ_SYNC; + + /* + * An empty flush handed down from a stacking driver may + * translate into nothing if the underlying device does not + * advertise a write-back cache. In this case, simply + * complete the request. + */ + if (!policy) { + blk_mq_end_request(rq, 0); + return; + } + + BUG_ON(rq->bio != rq->biotail); /*assumes zero or single bio rq */ + + /* + * If there's data but flush is not necessary, the request can be + * processed directly without going through flush machinery. Queue + * for normal execution. + */ + if ((policy & REQ_FSEQ_DATA) && + !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { + blk_mq_request_bypass_insert(rq, false, false); + return; + } + + /* + * @rq should go through flush machinery. Mark it part of flush + * sequence and submit for further processing. + */ + memset(&rq->flush, 0, sizeof(rq->flush)); + INIT_LIST_HEAD(&rq->flush.list); + rq->rq_flags |= RQF_FLUSH_SEQ; + rq->flush.saved_end_io = rq->end_io; /* Usually NULL */ + + rq->end_io = mq_flush_data_end_io; + + spin_lock_irq(&fq->mq_flush_lock); + blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0); + spin_unlock_irq(&fq->mq_flush_lock); +} + +/** + * blkdev_issue_flush - queue a flush + * @bdev: blockdev to issue flush for + * @gfp_mask: memory allocation flags (for bio_alloc) + * + * Description: + * Issue a flush for the block device in question. + */ +int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask) +{ + struct bio *bio; + int ret = 0; + + bio = bio_alloc(gfp_mask, 0); + bio_set_dev(bio, bdev); + bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; + + ret = submit_bio_wait(bio); + bio_put(bio); + return ret; +} +EXPORT_SYMBOL(blkdev_issue_flush); + +struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size, + gfp_t flags) +{ + struct blk_flush_queue *fq; + int rq_sz = sizeof(struct request); + + fq = kzalloc_node(sizeof(*fq), flags, node); + if (!fq) + goto fail; + + spin_lock_init(&fq->mq_flush_lock); + + rq_sz = round_up(rq_sz + cmd_size, cache_line_size()); + fq->flush_rq = kzalloc_node(rq_sz, flags, node); + if (!fq->flush_rq) + goto fail_rq; + + INIT_LIST_HEAD(&fq->flush_queue[0]); + INIT_LIST_HEAD(&fq->flush_queue[1]); + INIT_LIST_HEAD(&fq->flush_data_in_flight); + + lockdep_register_key(&fq->key); + lockdep_set_class(&fq->mq_flush_lock, &fq->key); + + return fq; + + fail_rq: + kfree(fq); + fail: + return NULL; +} + +void blk_free_flush_queue(struct blk_flush_queue *fq) +{ + /* bio based request queue hasn't flush queue */ + if (!fq) + return; + + lockdep_unregister_key(&fq->key); + kfree(fq->flush_rq); + kfree(fq); +} diff --git a/block/blk-integrity.c b/block/blk-integrity.c new file mode 100644 index 000000000..9e83159f5 --- /dev/null +++ b/block/blk-integrity.c @@ -0,0 +1,455 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * blk-integrity.c - Block layer data integrity extensions + * + * Copyright (C) 2007, 2008 Oracle Corporation + * Written by: Martin K. Petersen + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "blk.h" + +/** + * blk_rq_count_integrity_sg - Count number of integrity scatterlist elements + * @q: request queue + * @bio: bio with integrity metadata attached + * + * Description: Returns the number of elements required in a + * scatterlist corresponding to the integrity metadata in a bio. + */ +int blk_rq_count_integrity_sg(struct request_queue *q, struct bio *bio) +{ + struct bio_vec iv, ivprv = { NULL }; + unsigned int segments = 0; + unsigned int seg_size = 0; + struct bvec_iter iter; + int prev = 0; + + bio_for_each_integrity_vec(iv, bio, iter) { + + if (prev) { + if (!biovec_phys_mergeable(q, &ivprv, &iv)) + goto new_segment; + if (seg_size + iv.bv_len > queue_max_segment_size(q)) + goto new_segment; + + seg_size += iv.bv_len; + } else { +new_segment: + segments++; + seg_size = iv.bv_len; + } + + prev = 1; + ivprv = iv; + } + + return segments; +} +EXPORT_SYMBOL(blk_rq_count_integrity_sg); + +/** + * blk_rq_map_integrity_sg - Map integrity metadata into a scatterlist + * @q: request queue + * @bio: bio with integrity metadata attached + * @sglist: target scatterlist + * + * Description: Map the integrity vectors in request into a + * scatterlist. The scatterlist must be big enough to hold all + * elements. I.e. sized using blk_rq_count_integrity_sg(). + */ +int blk_rq_map_integrity_sg(struct request_queue *q, struct bio *bio, + struct scatterlist *sglist) +{ + struct bio_vec iv, ivprv = { NULL }; + struct scatterlist *sg = NULL; + unsigned int segments = 0; + struct bvec_iter iter; + int prev = 0; + + bio_for_each_integrity_vec(iv, bio, iter) { + + if (prev) { + if (!biovec_phys_mergeable(q, &ivprv, &iv)) + goto new_segment; + if (sg->length + iv.bv_len > queue_max_segment_size(q)) + goto new_segment; + + sg->length += iv.bv_len; + } else { +new_segment: + if (!sg) + sg = sglist; + else { + sg_unmark_end(sg); + sg = sg_next(sg); + } + + sg_set_page(sg, iv.bv_page, iv.bv_len, iv.bv_offset); + segments++; + } + + prev = 1; + ivprv = iv; + } + + if (sg) + sg_mark_end(sg); + + return segments; +} +EXPORT_SYMBOL(blk_rq_map_integrity_sg); + +/** + * blk_integrity_compare - Compare integrity profile of two disks + * @gd1: Disk to compare + * @gd2: Disk to compare + * + * Description: Meta-devices like DM and MD need to verify that all + * sub-devices use the same integrity format before advertising to + * upper layers that they can send/receive integrity metadata. This + * function can be used to check whether two gendisk devices have + * compatible integrity formats. + */ +int blk_integrity_compare(struct gendisk *gd1, struct gendisk *gd2) +{ + struct blk_integrity *b1 = &gd1->queue->integrity; + struct blk_integrity *b2 = &gd2->queue->integrity; + + if (!b1->profile && !b2->profile) + return 0; + + if (!b1->profile || !b2->profile) + return -1; + + if (b1->interval_exp != b2->interval_exp) { + pr_err("%s: %s/%s protection interval %u != %u\n", + __func__, gd1->disk_name, gd2->disk_name, + 1 << b1->interval_exp, 1 << b2->interval_exp); + return -1; + } + + if (b1->tuple_size != b2->tuple_size) { + pr_err("%s: %s/%s tuple sz %u != %u\n", __func__, + gd1->disk_name, gd2->disk_name, + b1->tuple_size, b2->tuple_size); + return -1; + } + + if (b1->tag_size && b2->tag_size && (b1->tag_size != b2->tag_size)) { + pr_err("%s: %s/%s tag sz %u != %u\n", __func__, + gd1->disk_name, gd2->disk_name, + b1->tag_size, b2->tag_size); + return -1; + } + + if (b1->profile != b2->profile) { + pr_err("%s: %s/%s type %s != %s\n", __func__, + gd1->disk_name, gd2->disk_name, + b1->profile->name, b2->profile->name); + return -1; + } + + return 0; +} +EXPORT_SYMBOL(blk_integrity_compare); + +bool blk_integrity_merge_rq(struct request_queue *q, struct request *req, + struct request *next) +{ + if (blk_integrity_rq(req) == 0 && blk_integrity_rq(next) == 0) + return true; + + if (blk_integrity_rq(req) == 0 || blk_integrity_rq(next) == 0) + return false; + + if (bio_integrity(req->bio)->bip_flags != + bio_integrity(next->bio)->bip_flags) + return false; + + if (req->nr_integrity_segments + next->nr_integrity_segments > + q->limits.max_integrity_segments) + return false; + + if (integrity_req_gap_back_merge(req, next->bio)) + return false; + + return true; +} + +bool blk_integrity_merge_bio(struct request_queue *q, struct request *req, + struct bio *bio) +{ + int nr_integrity_segs; + struct bio *next = bio->bi_next; + + if (blk_integrity_rq(req) == 0 && bio_integrity(bio) == NULL) + return true; + + if (blk_integrity_rq(req) == 0 || bio_integrity(bio) == NULL) + return false; + + if (bio_integrity(req->bio)->bip_flags != bio_integrity(bio)->bip_flags) + return false; + + bio->bi_next = NULL; + nr_integrity_segs = blk_rq_count_integrity_sg(q, bio); + bio->bi_next = next; + + if (req->nr_integrity_segments + nr_integrity_segs > + q->limits.max_integrity_segments) + return false; + + req->nr_integrity_segments += nr_integrity_segs; + + return true; +} + +struct integrity_sysfs_entry { + struct attribute attr; + ssize_t (*show)(struct blk_integrity *, char *); + ssize_t (*store)(struct blk_integrity *, const char *, size_t); +}; + +static ssize_t integrity_attr_show(struct kobject *kobj, struct attribute *attr, + char *page) +{ + struct gendisk *disk = container_of(kobj, struct gendisk, integrity_kobj); + struct blk_integrity *bi = &disk->queue->integrity; + struct integrity_sysfs_entry *entry = + container_of(attr, struct integrity_sysfs_entry, attr); + + return entry->show(bi, page); +} + +static ssize_t integrity_attr_store(struct kobject *kobj, + struct attribute *attr, const char *page, + size_t count) +{ + struct gendisk *disk = container_of(kobj, struct gendisk, integrity_kobj); + struct blk_integrity *bi = &disk->queue->integrity; + struct integrity_sysfs_entry *entry = + container_of(attr, struct integrity_sysfs_entry, attr); + ssize_t ret = 0; + + if (entry->store) + ret = entry->store(bi, page, count); + + return ret; +} + +static ssize_t integrity_format_show(struct blk_integrity *bi, char *page) +{ + if (bi->profile && bi->profile->name) + return sprintf(page, "%s\n", bi->profile->name); + else + return sprintf(page, "none\n"); +} + +static ssize_t integrity_tag_size_show(struct blk_integrity *bi, char *page) +{ + return sprintf(page, "%u\n", bi->tag_size); +} + +static ssize_t integrity_interval_show(struct blk_integrity *bi, char *page) +{ + return sprintf(page, "%u\n", + bi->interval_exp ? 1 << bi->interval_exp : 0); +} + +static ssize_t integrity_verify_store(struct blk_integrity *bi, + const char *page, size_t count) +{ + char *p = (char *) page; + unsigned long val = simple_strtoul(p, &p, 10); + + if (val) + bi->flags |= BLK_INTEGRITY_VERIFY; + else + bi->flags &= ~BLK_INTEGRITY_VERIFY; + + return count; +} + +static ssize_t integrity_verify_show(struct blk_integrity *bi, char *page) +{ + return sprintf(page, "%d\n", (bi->flags & BLK_INTEGRITY_VERIFY) != 0); +} + +static ssize_t integrity_generate_store(struct blk_integrity *bi, + const char *page, size_t count) +{ + char *p = (char *) page; + unsigned long val = simple_strtoul(p, &p, 10); + + if (val) + bi->flags |= BLK_INTEGRITY_GENERATE; + else + bi->flags &= ~BLK_INTEGRITY_GENERATE; + + return count; +} + +static ssize_t integrity_generate_show(struct blk_integrity *bi, char *page) +{ + return sprintf(page, "%d\n", (bi->flags & BLK_INTEGRITY_GENERATE) != 0); +} + +static ssize_t integrity_device_show(struct blk_integrity *bi, char *page) +{ + return sprintf(page, "%u\n", + (bi->flags & BLK_INTEGRITY_DEVICE_CAPABLE) != 0); +} + +static struct integrity_sysfs_entry integrity_format_entry = { + .attr = { .name = "format", .mode = 0444 }, + .show = integrity_format_show, +}; + +static struct integrity_sysfs_entry integrity_tag_size_entry = { + .attr = { .name = "tag_size", .mode = 0444 }, + .show = integrity_tag_size_show, +}; + +static struct integrity_sysfs_entry integrity_interval_entry = { + .attr = { .name = "protection_interval_bytes", .mode = 0444 }, + .show = integrity_interval_show, +}; + +static struct integrity_sysfs_entry integrity_verify_entry = { + .attr = { .name = "read_verify", .mode = 0644 }, + .show = integrity_verify_show, + .store = integrity_verify_store, +}; + +static struct integrity_sysfs_entry integrity_generate_entry = { + .attr = { .name = "write_generate", .mode = 0644 }, + .show = integrity_generate_show, + .store = integrity_generate_store, +}; + +static struct integrity_sysfs_entry integrity_device_entry = { + .attr = { .name = "device_is_integrity_capable", .mode = 0444 }, + .show = integrity_device_show, +}; + +static struct attribute *integrity_attrs[] = { + &integrity_format_entry.attr, + &integrity_tag_size_entry.attr, + &integrity_interval_entry.attr, + &integrity_verify_entry.attr, + &integrity_generate_entry.attr, + &integrity_device_entry.attr, + NULL, +}; +ATTRIBUTE_GROUPS(integrity); + +static const struct sysfs_ops integrity_ops = { + .show = &integrity_attr_show, + .store = &integrity_attr_store, +}; + +static struct kobj_type integrity_ktype = { + .default_groups = integrity_groups, + .sysfs_ops = &integrity_ops, +}; + +static blk_status_t blk_integrity_nop_fn(struct blk_integrity_iter *iter) +{ + return BLK_STS_OK; +} + +static void blk_integrity_nop_prepare(struct request *rq) +{ +} + +static void blk_integrity_nop_complete(struct request *rq, + unsigned int nr_bytes) +{ +} + +static const struct blk_integrity_profile nop_profile = { + .name = "nop", + .generate_fn = blk_integrity_nop_fn, + .verify_fn = blk_integrity_nop_fn, + .prepare_fn = blk_integrity_nop_prepare, + .complete_fn = blk_integrity_nop_complete, +}; + +/** + * blk_integrity_register - Register a gendisk as being integrity-capable + * @disk: struct gendisk pointer to make integrity-aware + * @template: block integrity profile to register + * + * Description: When a device needs to advertise itself as being able to + * send/receive integrity metadata it must use this function to register + * the capability with the block layer. The template is a blk_integrity + * struct with values appropriate for the underlying hardware. See + * Documentation/block/data-integrity.rst. + */ +void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template) +{ + struct blk_integrity *bi = &disk->queue->integrity; + + bi->flags = BLK_INTEGRITY_VERIFY | BLK_INTEGRITY_GENERATE | + template->flags; + bi->interval_exp = template->interval_exp ? : + ilog2(queue_logical_block_size(disk->queue)); + bi->profile = template->profile ? template->profile : &nop_profile; + bi->tuple_size = template->tuple_size; + bi->tag_size = template->tag_size; + + blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue); + +#ifdef CONFIG_BLK_INLINE_ENCRYPTION + if (disk->queue->ksm) { + pr_warn("blk-integrity: Integrity and hardware inline encryption are not supported together. Disabling hardware inline encryption.\n"); + blk_ksm_unregister(disk->queue); + } +#endif +} +EXPORT_SYMBOL(blk_integrity_register); + +/** + * blk_integrity_unregister - Unregister block integrity profile + * @disk: disk whose integrity profile to unregister + * + * Description: This function unregisters the integrity capability from + * a block device. + */ +void blk_integrity_unregister(struct gendisk *disk) +{ + struct blk_integrity *bi = &disk->queue->integrity; + + if (!bi->profile) + return; + + /* ensure all bios are off the integrity workqueue */ + blk_flush_integrity(); + blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, disk->queue); + memset(bi, 0, sizeof(*bi)); +} +EXPORT_SYMBOL(blk_integrity_unregister); + +void blk_integrity_add(struct gendisk *disk) +{ + if (kobject_init_and_add(&disk->integrity_kobj, &integrity_ktype, + &disk_to_dev(disk)->kobj, "%s", "integrity")) + return; + + kobject_uevent(&disk->integrity_kobj, KOBJ_ADD); +} + +void blk_integrity_del(struct gendisk *disk) +{ + kobject_uevent(&disk->integrity_kobj, KOBJ_REMOVE); + kobject_del(&disk->integrity_kobj); + kobject_put(&disk->integrity_kobj); +} diff --git a/block/blk-ioc.c b/block/blk-ioc.c new file mode 100644 index 000000000..57299f860 --- /dev/null +++ b/block/blk-ioc.c @@ -0,0 +1,422 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Functions related to io context handling + */ +#include +#include +#include +#include +#include +#include +#include + +#include "blk.h" + +/* + * For io context allocations + */ +static struct kmem_cache *iocontext_cachep; + +/** + * get_io_context - increment reference count to io_context + * @ioc: io_context to get + * + * Increment reference count to @ioc. + */ +void get_io_context(struct io_context *ioc) +{ + BUG_ON(atomic_long_read(&ioc->refcount) <= 0); + atomic_long_inc(&ioc->refcount); +} + +static void icq_free_icq_rcu(struct rcu_head *head) +{ + struct io_cq *icq = container_of(head, struct io_cq, __rcu_head); + + kmem_cache_free(icq->__rcu_icq_cache, icq); +} + +/* + * Exit an icq. Called with ioc locked for blk-mq, and with both ioc + * and queue locked for legacy. + */ +static void ioc_exit_icq(struct io_cq *icq) +{ + struct elevator_type *et = icq->q->elevator->type; + + if (icq->flags & ICQ_EXITED) + return; + + if (et->ops.exit_icq) + et->ops.exit_icq(icq); + + icq->flags |= ICQ_EXITED; +} + +/* + * Release an icq. Called with ioc locked for blk-mq, and with both ioc + * and queue locked for legacy. + */ +static void ioc_destroy_icq(struct io_cq *icq) +{ + struct io_context *ioc = icq->ioc; + struct request_queue *q = icq->q; + struct elevator_type *et = q->elevator->type; + + lockdep_assert_held(&ioc->lock); + + radix_tree_delete(&ioc->icq_tree, icq->q->id); + hlist_del_init(&icq->ioc_node); + list_del_init(&icq->q_node); + + /* + * Both setting lookup hint to and clearing it from @icq are done + * under queue_lock. If it's not pointing to @icq now, it never + * will. Hint assignment itself can race safely. + */ + if (rcu_access_pointer(ioc->icq_hint) == icq) + rcu_assign_pointer(ioc->icq_hint, NULL); + + ioc_exit_icq(icq); + + /* + * @icq->q might have gone away by the time RCU callback runs + * making it impossible to determine icq_cache. Record it in @icq. + */ + icq->__rcu_icq_cache = et->icq_cache; + icq->flags |= ICQ_DESTROYED; + call_rcu(&icq->__rcu_head, icq_free_icq_rcu); +} + +/* + * Slow path for ioc release in put_io_context(). Performs double-lock + * dancing to unlink all icq's and then frees ioc. + */ +static void ioc_release_fn(struct work_struct *work) +{ + struct io_context *ioc = container_of(work, struct io_context, + release_work); + spin_lock_irq(&ioc->lock); + + while (!hlist_empty(&ioc->icq_list)) { + struct io_cq *icq = hlist_entry(ioc->icq_list.first, + struct io_cq, ioc_node); + struct request_queue *q = icq->q; + + if (spin_trylock(&q->queue_lock)) { + ioc_destroy_icq(icq); + spin_unlock(&q->queue_lock); + } else { + /* Make sure q and icq cannot be freed. */ + rcu_read_lock(); + + /* Re-acquire the locks in the correct order. */ + spin_unlock(&ioc->lock); + spin_lock(&q->queue_lock); + spin_lock(&ioc->lock); + + /* + * The icq may have been destroyed when the ioc lock + * was released. + */ + if (!(icq->flags & ICQ_DESTROYED)) + ioc_destroy_icq(icq); + + spin_unlock(&q->queue_lock); + rcu_read_unlock(); + } + } + + spin_unlock_irq(&ioc->lock); + + kmem_cache_free(iocontext_cachep, ioc); +} + +/** + * put_io_context - put a reference of io_context + * @ioc: io_context to put + * + * Decrement reference count of @ioc and release it if the count reaches + * zero. + */ +void put_io_context(struct io_context *ioc) +{ + unsigned long flags; + bool free_ioc = false; + + if (ioc == NULL) + return; + + BUG_ON(atomic_long_read(&ioc->refcount) <= 0); + + /* + * Releasing ioc requires reverse order double locking and we may + * already be holding a queue_lock. Do it asynchronously from wq. + */ + if (atomic_long_dec_and_test(&ioc->refcount)) { + spin_lock_irqsave(&ioc->lock, flags); + if (!hlist_empty(&ioc->icq_list)) + queue_work(system_power_efficient_wq, + &ioc->release_work); + else + free_ioc = true; + spin_unlock_irqrestore(&ioc->lock, flags); + } + + if (free_ioc) + kmem_cache_free(iocontext_cachep, ioc); +} + +/** + * put_io_context_active - put active reference on ioc + * @ioc: ioc of interest + * + * Undo get_io_context_active(). If active reference reaches zero after + * put, @ioc can never issue further IOs and ioscheds are notified. + */ +void put_io_context_active(struct io_context *ioc) +{ + struct io_cq *icq; + + if (!atomic_dec_and_test(&ioc->active_ref)) { + put_io_context(ioc); + return; + } + + spin_lock_irq(&ioc->lock); + hlist_for_each_entry(icq, &ioc->icq_list, ioc_node) { + if (icq->flags & ICQ_EXITED) + continue; + + ioc_exit_icq(icq); + } + spin_unlock_irq(&ioc->lock); + + put_io_context(ioc); +} + +/* Called by the exiting task */ +void exit_io_context(struct task_struct *task) +{ + struct io_context *ioc; + + task_lock(task); + ioc = task->io_context; + task->io_context = NULL; + task_unlock(task); + + atomic_dec(&ioc->nr_tasks); + put_io_context_active(ioc); +} + +static void __ioc_clear_queue(struct list_head *icq_list) +{ + unsigned long flags; + + rcu_read_lock(); + while (!list_empty(icq_list)) { + struct io_cq *icq = list_entry(icq_list->next, + struct io_cq, q_node); + struct io_context *ioc = icq->ioc; + + spin_lock_irqsave(&ioc->lock, flags); + if (icq->flags & ICQ_DESTROYED) { + spin_unlock_irqrestore(&ioc->lock, flags); + continue; + } + ioc_destroy_icq(icq); + spin_unlock_irqrestore(&ioc->lock, flags); + } + rcu_read_unlock(); +} + +/** + * ioc_clear_queue - break any ioc association with the specified queue + * @q: request_queue being cleared + * + * Walk @q->icq_list and exit all io_cq's. + */ +void ioc_clear_queue(struct request_queue *q) +{ + LIST_HEAD(icq_list); + + spin_lock_irq(&q->queue_lock); + list_splice_init(&q->icq_list, &icq_list); + spin_unlock_irq(&q->queue_lock); + + __ioc_clear_queue(&icq_list); +} + +int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node) +{ + struct io_context *ioc; + int ret; + + ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags | __GFP_ZERO, + node); + if (unlikely(!ioc)) + return -ENOMEM; + + /* initialize */ + atomic_long_set(&ioc->refcount, 1); + atomic_set(&ioc->nr_tasks, 1); + atomic_set(&ioc->active_ref, 1); + spin_lock_init(&ioc->lock); + INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC); + INIT_HLIST_HEAD(&ioc->icq_list); + INIT_WORK(&ioc->release_work, ioc_release_fn); + + /* + * Try to install. ioc shouldn't be installed if someone else + * already did or @task, which isn't %current, is exiting. Note + * that we need to allow ioc creation on exiting %current as exit + * path may issue IOs from e.g. exit_files(). The exit path is + * responsible for not issuing IO after exit_io_context(). + */ + task_lock(task); + if (!task->io_context && + (task == current || !(task->flags & PF_EXITING))) + task->io_context = ioc; + else + kmem_cache_free(iocontext_cachep, ioc); + + ret = task->io_context ? 0 : -EBUSY; + + task_unlock(task); + + return ret; +} + +/** + * get_task_io_context - get io_context of a task + * @task: task of interest + * @gfp_flags: allocation flags, used if allocation is necessary + * @node: allocation node, used if allocation is necessary + * + * Return io_context of @task. If it doesn't exist, it is created with + * @gfp_flags and @node. The returned io_context has its reference count + * incremented. + * + * This function always goes through task_lock() and it's better to use + * %current->io_context + get_io_context() for %current. + */ +struct io_context *get_task_io_context(struct task_struct *task, + gfp_t gfp_flags, int node) +{ + struct io_context *ioc; + + might_sleep_if(gfpflags_allow_blocking(gfp_flags)); + + do { + task_lock(task); + ioc = task->io_context; + if (likely(ioc)) { + get_io_context(ioc); + task_unlock(task); + return ioc; + } + task_unlock(task); + } while (!create_task_io_context(task, gfp_flags, node)); + + return NULL; +} + +/** + * ioc_lookup_icq - lookup io_cq from ioc + * @ioc: the associated io_context + * @q: the associated request_queue + * + * Look up io_cq associated with @ioc - @q pair from @ioc. Must be called + * with @q->queue_lock held. + */ +struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q) +{ + struct io_cq *icq; + + lockdep_assert_held(&q->queue_lock); + + /* + * icq's are indexed from @ioc using radix tree and hint pointer, + * both of which are protected with RCU. All removals are done + * holding both q and ioc locks, and we're holding q lock - if we + * find a icq which points to us, it's guaranteed to be valid. + */ + rcu_read_lock(); + icq = rcu_dereference(ioc->icq_hint); + if (icq && icq->q == q) + goto out; + + icq = radix_tree_lookup(&ioc->icq_tree, q->id); + if (icq && icq->q == q) + rcu_assign_pointer(ioc->icq_hint, icq); /* allowed to race */ + else + icq = NULL; +out: + rcu_read_unlock(); + return icq; +} +EXPORT_SYMBOL(ioc_lookup_icq); + +/** + * ioc_create_icq - create and link io_cq + * @ioc: io_context of interest + * @q: request_queue of interest + * @gfp_mask: allocation mask + * + * Make sure io_cq linking @ioc and @q exists. If icq doesn't exist, they + * will be created using @gfp_mask. + * + * The caller is responsible for ensuring @ioc won't go away and @q is + * alive and will stay alive until this function returns. + */ +struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, + gfp_t gfp_mask) +{ + struct elevator_type *et = q->elevator->type; + struct io_cq *icq; + + /* allocate stuff */ + icq = kmem_cache_alloc_node(et->icq_cache, gfp_mask | __GFP_ZERO, + q->node); + if (!icq) + return NULL; + + if (radix_tree_maybe_preload(gfp_mask) < 0) { + kmem_cache_free(et->icq_cache, icq); + return NULL; + } + + icq->ioc = ioc; + icq->q = q; + INIT_LIST_HEAD(&icq->q_node); + INIT_HLIST_NODE(&icq->ioc_node); + + /* lock both q and ioc and try to link @icq */ + spin_lock_irq(&q->queue_lock); + spin_lock(&ioc->lock); + + if (likely(!radix_tree_insert(&ioc->icq_tree, q->id, icq))) { + hlist_add_head(&icq->ioc_node, &ioc->icq_list); + list_add(&icq->q_node, &q->icq_list); + if (et->ops.init_icq) + et->ops.init_icq(icq); + } else { + kmem_cache_free(et->icq_cache, icq); + icq = ioc_lookup_icq(ioc, q); + if (!icq) + printk(KERN_ERR "cfq: icq link failed!\n"); + } + + spin_unlock(&ioc->lock); + spin_unlock_irq(&q->queue_lock); + radix_tree_preload_end(); + return icq; +} + +static int __init blk_ioc_init(void) +{ + iocontext_cachep = kmem_cache_create("blkdev_ioc", + sizeof(struct io_context), 0, SLAB_PANIC, NULL); + return 0; +} +subsys_initcall(blk_ioc_init); diff --git a/block/blk-iocost.c b/block/blk-iocost.c new file mode 100644 index 000000000..7ba7c4e4e --- /dev/null +++ b/block/blk-iocost.c @@ -0,0 +1,3456 @@ +/* SPDX-License-Identifier: GPL-2.0 + * + * IO cost model based controller. + * + * Copyright (C) 2019 Tejun Heo + * Copyright (C) 2019 Andy Newell + * Copyright (C) 2019 Facebook + * + * One challenge of controlling IO resources is the lack of trivially + * observable cost metric. This is distinguished from CPU and memory where + * wallclock time and the number of bytes can serve as accurate enough + * approximations. + * + * Bandwidth and iops are the most commonly used metrics for IO devices but + * depending on the type and specifics of the device, different IO patterns + * easily lead to multiple orders of magnitude variations rendering them + * useless for the purpose of IO capacity distribution. While on-device + * time, with a lot of clutches, could serve as a useful approximation for + * non-queued rotational devices, this is no longer viable with modern + * devices, even the rotational ones. + * + * While there is no cost metric we can trivially observe, it isn't a + * complete mystery. For example, on a rotational device, seek cost + * dominates while a contiguous transfer contributes a smaller amount + * proportional to the size. If we can characterize at least the relative + * costs of these different types of IOs, it should be possible to + * implement a reasonable work-conserving proportional IO resource + * distribution. + * + * 1. IO Cost Model + * + * IO cost model estimates the cost of an IO given its basic parameters and + * history (e.g. the end sector of the last IO). The cost is measured in + * device time. If a given IO is estimated to cost 10ms, the device should + * be able to process ~100 of those IOs in a second. + * + * Currently, there's only one builtin cost model - linear. Each IO is + * classified as sequential or random and given a base cost accordingly. + * On top of that, a size cost proportional to the length of the IO is + * added. While simple, this model captures the operational + * characteristics of a wide varienty of devices well enough. Default + * paramters for several different classes of devices are provided and the + * parameters can be configured from userspace via + * /sys/fs/cgroup/io.cost.model. + * + * If needed, tools/cgroup/iocost_coef_gen.py can be used to generate + * device-specific coefficients. + * + * 2. Control Strategy + * + * The device virtual time (vtime) is used as the primary control metric. + * The control strategy is composed of the following three parts. + * + * 2-1. Vtime Distribution + * + * When a cgroup becomes active in terms of IOs, its hierarchical share is + * calculated. Please consider the following hierarchy where the numbers + * inside parentheses denote the configured weights. + * + * root + * / \ + * A (w:100) B (w:300) + * / \ + * A0 (w:100) A1 (w:100) + * + * If B is idle and only A0 and A1 are actively issuing IOs, as the two are + * of equal weight, each gets 50% share. If then B starts issuing IOs, B + * gets 300/(100+300) or 75% share, and A0 and A1 equally splits the rest, + * 12.5% each. The distribution mechanism only cares about these flattened + * shares. They're called hweights (hierarchical weights) and always add + * upto 1 (WEIGHT_ONE). + * + * A given cgroup's vtime runs slower in inverse proportion to its hweight. + * For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5) + * against the device vtime - an IO which takes 10ms on the underlying + * device is considered to take 80ms on A0. + * + * This constitutes the basis of IO capacity distribution. Each cgroup's + * vtime is running at a rate determined by its hweight. A cgroup tracks + * the vtime consumed by past IOs and can issue a new IO iff doing so + * wouldn't outrun the current device vtime. Otherwise, the IO is + * suspended until the vtime has progressed enough to cover it. + * + * 2-2. Vrate Adjustment + * + * It's unrealistic to expect the cost model to be perfect. There are too + * many devices and even on the same device the overall performance + * fluctuates depending on numerous factors such as IO mixture and device + * internal garbage collection. The controller needs to adapt dynamically. + * + * This is achieved by adjusting the overall IO rate according to how busy + * the device is. If the device becomes overloaded, we're sending down too + * many IOs and should generally slow down. If there are waiting issuers + * but the device isn't saturated, we're issuing too few and should + * generally speed up. + * + * To slow down, we lower the vrate - the rate at which the device vtime + * passes compared to the wall clock. For example, if the vtime is running + * at the vrate of 75%, all cgroups added up would only be able to issue + * 750ms worth of IOs per second, and vice-versa for speeding up. + * + * Device business is determined using two criteria - rq wait and + * completion latencies. + * + * When a device gets saturated, the on-device and then the request queues + * fill up and a bio which is ready to be issued has to wait for a request + * to become available. When this delay becomes noticeable, it's a clear + * indication that the device is saturated and we lower the vrate. This + * saturation signal is fairly conservative as it only triggers when both + * hardware and software queues are filled up, and is used as the default + * busy signal. + * + * As devices can have deep queues and be unfair in how the queued commands + * are executed, soley depending on rq wait may not result in satisfactory + * control quality. For a better control quality, completion latency QoS + * parameters can be configured so that the device is considered saturated + * if N'th percentile completion latency rises above the set point. + * + * The completion latency requirements are a function of both the + * underlying device characteristics and the desired IO latency quality of + * service. There is an inherent trade-off - the tighter the latency QoS, + * the higher the bandwidth lossage. Latency QoS is disabled by default + * and can be set through /sys/fs/cgroup/io.cost.qos. + * + * 2-3. Work Conservation + * + * Imagine two cgroups A and B with equal weights. A is issuing a small IO + * periodically while B is sending out enough parallel IOs to saturate the + * device on its own. Let's say A's usage amounts to 100ms worth of IO + * cost per second, i.e., 10% of the device capacity. The naive + * distribution of half and half would lead to 60% utilization of the + * device, a significant reduction in the total amount of work done + * compared to free-for-all competition. This is too high a cost to pay + * for IO control. + * + * To conserve the total amount of work done, we keep track of how much + * each active cgroup is actually using and yield part of its weight if + * there are other cgroups which can make use of it. In the above case, + * A's weight will be lowered so that it hovers above the actual usage and + * B would be able to use the rest. + * + * As we don't want to penalize a cgroup for donating its weight, the + * surplus weight adjustment factors in a margin and has an immediate + * snapback mechanism in case the cgroup needs more IO vtime for itself. + * + * Note that adjusting down surplus weights has the same effects as + * accelerating vtime for other cgroups and work conservation can also be + * implemented by adjusting vrate dynamically. However, squaring who can + * donate and should take back how much requires hweight propagations + * anyway making it easier to implement and understand as a separate + * mechanism. + * + * 3. Monitoring + * + * Instead of debugfs or other clumsy monitoring mechanisms, this + * controller uses a drgn based monitoring script - + * tools/cgroup/iocost_monitor.py. For details on drgn, please see + * https://github.com/osandov/drgn. The ouput looks like the following. + * + * sdb RUN per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12% + * active weight hweight% inflt% dbt delay usages% + * test/a * 50/ 50 33.33/ 33.33 27.65 2 0*041 033:033:033 + * test/b * 100/ 100 66.67/ 66.67 17.56 0 0*000 066:079:077 + * + * - per : Timer period + * - cur_per : Internal wall and device vtime clock + * - vrate : Device virtual time rate against wall clock + * - weight : Surplus-adjusted and configured weights + * - hweight : Surplus-adjusted and configured hierarchical weights + * - inflt : The percentage of in-flight IO cost at the end of last period + * - del_ms : Deferred issuer delay induction level and duration + * - usages : Usage history + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "blk-rq-qos.h" +#include "blk-stat.h" +#include "blk-wbt.h" + +#ifdef CONFIG_TRACEPOINTS + +/* copied from TRACE_CGROUP_PATH, see cgroup-internal.h */ +#define TRACE_IOCG_PATH_LEN 1024 +static DEFINE_SPINLOCK(trace_iocg_path_lock); +static char trace_iocg_path[TRACE_IOCG_PATH_LEN]; + +#define TRACE_IOCG_PATH(type, iocg, ...) \ + do { \ + unsigned long flags; \ + if (trace_iocost_##type##_enabled()) { \ + spin_lock_irqsave(&trace_iocg_path_lock, flags); \ + cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup, \ + trace_iocg_path, TRACE_IOCG_PATH_LEN); \ + trace_iocost_##type(iocg, trace_iocg_path, \ + ##__VA_ARGS__); \ + spin_unlock_irqrestore(&trace_iocg_path_lock, flags); \ + } \ + } while (0) + +#else /* CONFIG_TRACE_POINTS */ +#define TRACE_IOCG_PATH(type, iocg, ...) do { } while (0) +#endif /* CONFIG_TRACE_POINTS */ + +enum { + MILLION = 1000000, + + /* timer period is calculated from latency requirements, bound it */ + MIN_PERIOD = USEC_PER_MSEC, + MAX_PERIOD = USEC_PER_SEC, + + /* + * iocg->vtime is targeted at 50% behind the device vtime, which + * serves as its IO credit buffer. Surplus weight adjustment is + * immediately canceled if the vtime margin runs below 10%. + */ + MARGIN_MIN_PCT = 10, + MARGIN_LOW_PCT = 20, + MARGIN_TARGET_PCT = 50, + + INUSE_ADJ_STEP_PCT = 25, + + /* Have some play in timer operations */ + TIMER_SLACK_PCT = 1, + + /* 1/64k is granular enough and can easily be handled w/ u32 */ + WEIGHT_ONE = 1 << 16, +}; + +enum { + /* + * As vtime is used to calculate the cost of each IO, it needs to + * be fairly high precision. For example, it should be able to + * represent the cost of a single page worth of discard with + * suffificient accuracy. At the same time, it should be able to + * represent reasonably long enough durations to be useful and + * convenient during operation. + * + * 1s worth of vtime is 2^37. This gives us both sub-nanosecond + * granularity and days of wrap-around time even at extreme vrates. + */ + VTIME_PER_SEC_SHIFT = 37, + VTIME_PER_SEC = 1LLU << VTIME_PER_SEC_SHIFT, + VTIME_PER_USEC = VTIME_PER_SEC / USEC_PER_SEC, + VTIME_PER_NSEC = VTIME_PER_SEC / NSEC_PER_SEC, + + /* bound vrate adjustments within two orders of magnitude */ + VRATE_MIN_PPM = 10000, /* 1% */ + VRATE_MAX_PPM = 100000000, /* 10000% */ + + VRATE_MIN = VTIME_PER_USEC * VRATE_MIN_PPM / MILLION, + VRATE_CLAMP_ADJ_PCT = 4, + + /* switch iff the conditions are met for longer than this */ + AUTOP_CYCLE_NSEC = 10LLU * NSEC_PER_SEC, +}; + +enum { + /* if IOs end up waiting for requests, issue less */ + RQ_WAIT_BUSY_PCT = 5, + + /* unbusy hysterisis */ + UNBUSY_THR_PCT = 75, + + /* + * The effect of delay is indirect and non-linear and a huge amount of + * future debt can accumulate abruptly while unthrottled. Linearly scale + * up delay as debt is going up and then let it decay exponentially. + * This gives us quick ramp ups while delay is accumulating and long + * tails which can help reducing the frequency of debt explosions on + * unthrottle. The parameters are experimentally determined. + * + * The delay mechanism provides adequate protection and behavior in many + * cases. However, this is far from ideal and falls shorts on both + * fronts. The debtors are often throttled too harshly costing a + * significant level of fairness and possibly total work while the + * protection against their impacts on the system can be choppy and + * unreliable. + * + * The shortcoming primarily stems from the fact that, unlike for page + * cache, the kernel doesn't have well-defined back-pressure propagation + * mechanism and policies for anonymous memory. Fully addressing this + * issue will likely require substantial improvements in the area. + */ + MIN_DELAY_THR_PCT = 500, + MAX_DELAY_THR_PCT = 25000, + MIN_DELAY = 250, + MAX_DELAY = 250 * USEC_PER_MSEC, + + /* halve debts if avg usage over 100ms is under 50% */ + DFGV_USAGE_PCT = 50, + DFGV_PERIOD = 100 * USEC_PER_MSEC, + + /* don't let cmds which take a very long time pin lagging for too long */ + MAX_LAGGING_PERIODS = 10, + + /* + * Count IO size in 4k pages. The 12bit shift helps keeping + * size-proportional components of cost calculation in closer + * numbers of digits to per-IO cost components. + */ + IOC_PAGE_SHIFT = 12, + IOC_PAGE_SIZE = 1 << IOC_PAGE_SHIFT, + IOC_SECT_TO_PAGE_SHIFT = IOC_PAGE_SHIFT - SECTOR_SHIFT, + + /* if apart further than 16M, consider randio for linear model */ + LCOEF_RANDIO_PAGES = 4096, +}; + +enum ioc_running { + IOC_IDLE, + IOC_RUNNING, + IOC_STOP, +}; + +/* io.cost.qos controls including per-dev enable of the whole controller */ +enum { + QOS_ENABLE, + QOS_CTRL, + NR_QOS_CTRL_PARAMS, +}; + +/* io.cost.qos params */ +enum { + QOS_RPPM, + QOS_RLAT, + QOS_WPPM, + QOS_WLAT, + QOS_MIN, + QOS_MAX, + NR_QOS_PARAMS, +}; + +/* io.cost.model controls */ +enum { + COST_CTRL, + COST_MODEL, + NR_COST_CTRL_PARAMS, +}; + +/* builtin linear cost model coefficients */ +enum { + I_LCOEF_RBPS, + I_LCOEF_RSEQIOPS, + I_LCOEF_RRANDIOPS, + I_LCOEF_WBPS, + I_LCOEF_WSEQIOPS, + I_LCOEF_WRANDIOPS, + NR_I_LCOEFS, +}; + +enum { + LCOEF_RPAGE, + LCOEF_RSEQIO, + LCOEF_RRANDIO, + LCOEF_WPAGE, + LCOEF_WSEQIO, + LCOEF_WRANDIO, + NR_LCOEFS, +}; + +enum { + AUTOP_INVALID, + AUTOP_HDD, + AUTOP_SSD_QD1, + AUTOP_SSD_DFL, + AUTOP_SSD_FAST, +}; + +struct ioc_gq; + +struct ioc_params { + u32 qos[NR_QOS_PARAMS]; + u64 i_lcoefs[NR_I_LCOEFS]; + u64 lcoefs[NR_LCOEFS]; + u32 too_fast_vrate_pct; + u32 too_slow_vrate_pct; +}; + +struct ioc_margins { + s64 min; + s64 low; + s64 target; +}; + +struct ioc_missed { + local_t nr_met; + local_t nr_missed; + u32 last_met; + u32 last_missed; +}; + +struct ioc_pcpu_stat { + struct ioc_missed missed[2]; + + local64_t rq_wait_ns; + u64 last_rq_wait_ns; +}; + +/* per device */ +struct ioc { + struct rq_qos rqos; + + bool enabled; + + struct ioc_params params; + struct ioc_margins margins; + u32 period_us; + u32 timer_slack_ns; + u64 vrate_min; + u64 vrate_max; + + spinlock_t lock; + struct timer_list timer; + struct list_head active_iocgs; /* active cgroups */ + struct ioc_pcpu_stat __percpu *pcpu_stat; + + enum ioc_running running; + atomic64_t vtime_rate; + u64 vtime_base_rate; + s64 vtime_err; + + seqcount_spinlock_t period_seqcount; + u64 period_at; /* wallclock starttime */ + u64 period_at_vtime; /* vtime starttime */ + + atomic64_t cur_period; /* inc'd each period */ + int busy_level; /* saturation history */ + + bool weights_updated; + atomic_t hweight_gen; /* for lazy hweights */ + + /* debt forgivness */ + u64 dfgv_period_at; + u64 dfgv_period_rem; + u64 dfgv_usage_us_sum; + + u64 autop_too_fast_at; + u64 autop_too_slow_at; + int autop_idx; + bool user_qos_params:1; + bool user_cost_model:1; +}; + +struct iocg_pcpu_stat { + local64_t abs_vusage; +}; + +struct iocg_stat { + u64 usage_us; + u64 wait_us; + u64 indebt_us; + u64 indelay_us; +}; + +/* per device-cgroup pair */ +struct ioc_gq { + struct blkg_policy_data pd; + struct ioc *ioc; + + /* + * A iocg can get its weight from two sources - an explicit + * per-device-cgroup configuration or the default weight of the + * cgroup. `cfg_weight` is the explicit per-device-cgroup + * configuration. `weight` is the effective considering both + * sources. + * + * When an idle cgroup becomes active its `active` goes from 0 to + * `weight`. `inuse` is the surplus adjusted active weight. + * `active` and `inuse` are used to calculate `hweight_active` and + * `hweight_inuse`. + * + * `last_inuse` remembers `inuse` while an iocg is idle to persist + * surplus adjustments. + * + * `inuse` may be adjusted dynamically during period. `saved_*` are used + * to determine and track adjustments. + */ + u32 cfg_weight; + u32 weight; + u32 active; + u32 inuse; + + u32 last_inuse; + s64 saved_margin; + + sector_t cursor; /* to detect randio */ + + /* + * `vtime` is this iocg's vtime cursor which progresses as IOs are + * issued. If lagging behind device vtime, the delta represents + * the currently available IO budget. If runnning ahead, the + * overage. + * + * `vtime_done` is the same but progressed on completion rather + * than issue. The delta behind `vtime` represents the cost of + * currently in-flight IOs. + */ + atomic64_t vtime; + atomic64_t done_vtime; + u64 abs_vdebt; + + /* current delay in effect and when it started */ + u64 delay; + u64 delay_at; + + /* + * The period this iocg was last active in. Used for deactivation + * and invalidating `vtime`. + */ + atomic64_t active_period; + struct list_head active_list; + + /* see __propagate_weights() and current_hweight() for details */ + u64 child_active_sum; + u64 child_inuse_sum; + u64 child_adjusted_sum; + int hweight_gen; + u32 hweight_active; + u32 hweight_inuse; + u32 hweight_donating; + u32 hweight_after_donation; + + struct list_head walk_list; + struct list_head surplus_list; + + struct wait_queue_head waitq; + struct hrtimer waitq_timer; + + /* timestamp at the latest activation */ + u64 activated_at; + + /* statistics */ + struct iocg_pcpu_stat __percpu *pcpu_stat; + struct iocg_stat local_stat; + struct iocg_stat desc_stat; + struct iocg_stat last_stat; + u64 last_stat_abs_vusage; + u64 usage_delta_us; + u64 wait_since; + u64 indebt_since; + u64 indelay_since; + + /* this iocg's depth in the hierarchy and ancestors including self */ + int level; + struct ioc_gq *ancestors[]; +}; + +/* per cgroup */ +struct ioc_cgrp { + struct blkcg_policy_data cpd; + unsigned int dfl_weight; +}; + +struct ioc_now { + u64 now_ns; + u64 now; + u64 vnow; + u64 vrate; +}; + +struct iocg_wait { + struct wait_queue_entry wait; + struct bio *bio; + u64 abs_cost; + bool committed; +}; + +struct iocg_wake_ctx { + struct ioc_gq *iocg; + u32 hw_inuse; + s64 vbudget; +}; + +static const struct ioc_params autop[] = { + [AUTOP_HDD] = { + .qos = { + [QOS_RLAT] = 250000, /* 250ms */ + [QOS_WLAT] = 250000, + [QOS_MIN] = VRATE_MIN_PPM, + [QOS_MAX] = VRATE_MAX_PPM, + }, + .i_lcoefs = { + [I_LCOEF_RBPS] = 174019176, + [I_LCOEF_RSEQIOPS] = 41708, + [I_LCOEF_RRANDIOPS] = 370, + [I_LCOEF_WBPS] = 178075866, + [I_LCOEF_WSEQIOPS] = 42705, + [I_LCOEF_WRANDIOPS] = 378, + }, + }, + [AUTOP_SSD_QD1] = { + .qos = { + [QOS_RLAT] = 25000, /* 25ms */ + [QOS_WLAT] = 25000, + [QOS_MIN] = VRATE_MIN_PPM, + [QOS_MAX] = VRATE_MAX_PPM, + }, + .i_lcoefs = { + [I_LCOEF_RBPS] = 245855193, + [I_LCOEF_RSEQIOPS] = 61575, + [I_LCOEF_RRANDIOPS] = 6946, + [I_LCOEF_WBPS] = 141365009, + [I_LCOEF_WSEQIOPS] = 33716, + [I_LCOEF_WRANDIOPS] = 26796, + }, + }, + [AUTOP_SSD_DFL] = { + .qos = { + [QOS_RLAT] = 25000, /* 25ms */ + [QOS_WLAT] = 25000, + [QOS_MIN] = VRATE_MIN_PPM, + [QOS_MAX] = VRATE_MAX_PPM, + }, + .i_lcoefs = { + [I_LCOEF_RBPS] = 488636629, + [I_LCOEF_RSEQIOPS] = 8932, + [I_LCOEF_RRANDIOPS] = 8518, + [I_LCOEF_WBPS] = 427891549, + [I_LCOEF_WSEQIOPS] = 28755, + [I_LCOEF_WRANDIOPS] = 21940, + }, + .too_fast_vrate_pct = 500, + }, + [AUTOP_SSD_FAST] = { + .qos = { + [QOS_RLAT] = 5000, /* 5ms */ + [QOS_WLAT] = 5000, + [QOS_MIN] = VRATE_MIN_PPM, + [QOS_MAX] = VRATE_MAX_PPM, + }, + .i_lcoefs = { + [I_LCOEF_RBPS] = 3102524156LLU, + [I_LCOEF_RSEQIOPS] = 724816, + [I_LCOEF_RRANDIOPS] = 778122, + [I_LCOEF_WBPS] = 1742780862LLU, + [I_LCOEF_WSEQIOPS] = 425702, + [I_LCOEF_WRANDIOPS] = 443193, + }, + .too_slow_vrate_pct = 10, + }, +}; + +/* + * vrate adjust percentages indexed by ioc->busy_level. We adjust up on + * vtime credit shortage and down on device saturation. + */ +static u32 vrate_adj_pct[] = + { 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16 }; + +static struct blkcg_policy blkcg_policy_iocost; + +/* accessors and helpers */ +static struct ioc *rqos_to_ioc(struct rq_qos *rqos) +{ + return container_of(rqos, struct ioc, rqos); +} + +static struct ioc *q_to_ioc(struct request_queue *q) +{ + return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST)); +} + +static const char *q_name(struct request_queue *q) +{ + if (blk_queue_registered(q)) + return kobject_name(q->kobj.parent); + else + return ""; +} + +static const char __maybe_unused *ioc_name(struct ioc *ioc) +{ + return q_name(ioc->rqos.q); +} + +static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd) +{ + return pd ? container_of(pd, struct ioc_gq, pd) : NULL; +} + +static struct ioc_gq *blkg_to_iocg(struct blkcg_gq *blkg) +{ + return pd_to_iocg(blkg_to_pd(blkg, &blkcg_policy_iocost)); +} + +static struct blkcg_gq *iocg_to_blkg(struct ioc_gq *iocg) +{ + return pd_to_blkg(&iocg->pd); +} + +static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg) +{ + return container_of(blkcg_to_cpd(blkcg, &blkcg_policy_iocost), + struct ioc_cgrp, cpd); +} + +/* + * Scale @abs_cost to the inverse of @hw_inuse. The lower the hierarchical + * weight, the more expensive each IO. Must round up. + */ +static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse) +{ + return DIV64_U64_ROUND_UP(abs_cost * WEIGHT_ONE, hw_inuse); +} + +/* + * The inverse of abs_cost_to_cost(). Must round up. + */ +static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse) +{ + return DIV64_U64_ROUND_UP(cost * hw_inuse, WEIGHT_ONE); +} + +static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, + u64 abs_cost, u64 cost) +{ + struct iocg_pcpu_stat *gcs; + + bio->bi_iocost_cost = cost; + atomic64_add(cost, &iocg->vtime); + + gcs = get_cpu_ptr(iocg->pcpu_stat); + local64_add(abs_cost, &gcs->abs_vusage); + put_cpu_ptr(gcs); +} + +static void iocg_lock(struct ioc_gq *iocg, bool lock_ioc, unsigned long *flags) +{ + if (lock_ioc) { + spin_lock_irqsave(&iocg->ioc->lock, *flags); + spin_lock(&iocg->waitq.lock); + } else { + spin_lock_irqsave(&iocg->waitq.lock, *flags); + } +} + +static void iocg_unlock(struct ioc_gq *iocg, bool unlock_ioc, unsigned long *flags) +{ + if (unlock_ioc) { + spin_unlock(&iocg->waitq.lock); + spin_unlock_irqrestore(&iocg->ioc->lock, *flags); + } else { + spin_unlock_irqrestore(&iocg->waitq.lock, *flags); + } +} + +#define CREATE_TRACE_POINTS +#include + +static void ioc_refresh_margins(struct ioc *ioc) +{ + struct ioc_margins *margins = &ioc->margins; + u32 period_us = ioc->period_us; + u64 vrate = ioc->vtime_base_rate; + + margins->min = (period_us * MARGIN_MIN_PCT / 100) * vrate; + margins->low = (period_us * MARGIN_LOW_PCT / 100) * vrate; + margins->target = (period_us * MARGIN_TARGET_PCT / 100) * vrate; +} + +/* latency Qos params changed, update period_us and all the dependent params */ +static void ioc_refresh_period_us(struct ioc *ioc) +{ + u32 ppm, lat, multi, period_us; + + lockdep_assert_held(&ioc->lock); + + /* pick the higher latency target */ + if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) { + ppm = ioc->params.qos[QOS_RPPM]; + lat = ioc->params.qos[QOS_RLAT]; + } else { + ppm = ioc->params.qos[QOS_WPPM]; + lat = ioc->params.qos[QOS_WLAT]; + } + + /* + * We want the period to be long enough to contain a healthy number + * of IOs while short enough for granular control. Define it as a + * multiple of the latency target. Ideally, the multiplier should + * be scaled according to the percentile so that it would nominally + * contain a certain number of requests. Let's be simpler and + * scale it linearly so that it's 2x >= pct(90) and 10x at pct(50). + */ + if (ppm) + multi = max_t(u32, (MILLION - ppm) / 50000, 2); + else + multi = 2; + period_us = multi * lat; + period_us = clamp_t(u32, period_us, MIN_PERIOD, MAX_PERIOD); + + /* calculate dependent params */ + ioc->period_us = period_us; + ioc->timer_slack_ns = div64_u64( + (u64)period_us * NSEC_PER_USEC * TIMER_SLACK_PCT, + 100); + ioc_refresh_margins(ioc); +} + +static int ioc_autop_idx(struct ioc *ioc) +{ + int idx = ioc->autop_idx; + const struct ioc_params *p = &autop[idx]; + u32 vrate_pct; + u64 now_ns; + + /* rotational? */ + if (!blk_queue_nonrot(ioc->rqos.q)) + return AUTOP_HDD; + + /* handle SATA SSDs w/ broken NCQ */ + if (blk_queue_depth(ioc->rqos.q) == 1) + return AUTOP_SSD_QD1; + + /* use one of the normal ssd sets */ + if (idx < AUTOP_SSD_DFL) + return AUTOP_SSD_DFL; + + /* if user is overriding anything, maintain what was there */ + if (ioc->user_qos_params || ioc->user_cost_model) + return idx; + + /* step up/down based on the vrate */ + vrate_pct = div64_u64(ioc->vtime_base_rate * 100, VTIME_PER_USEC); + now_ns = ktime_get_ns(); + + if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) { + if (!ioc->autop_too_fast_at) + ioc->autop_too_fast_at = now_ns; + if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC) + return idx + 1; + } else { + ioc->autop_too_fast_at = 0; + } + + if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) { + if (!ioc->autop_too_slow_at) + ioc->autop_too_slow_at = now_ns; + if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC) + return idx - 1; + } else { + ioc->autop_too_slow_at = 0; + } + + return idx; +} + +/* + * Take the followings as input + * + * @bps maximum sequential throughput + * @seqiops maximum sequential 4k iops + * @randiops maximum random 4k iops + * + * and calculate the linear model cost coefficients. + * + * *@page per-page cost 1s / (@bps / 4096) + * *@seqio base cost of a seq IO max((1s / @seqiops) - *@page, 0) + * @randiops base cost of a rand IO max((1s / @randiops) - *@page, 0) + */ +static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops, + u64 *page, u64 *seqio, u64 *randio) +{ + u64 v; + + *page = *seqio = *randio = 0; + + if (bps) { + u64 bps_pages = DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE); + + if (bps_pages) + *page = DIV64_U64_ROUND_UP(VTIME_PER_SEC, bps_pages); + else + *page = 1; + } + + if (seqiops) { + v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops); + if (v > *page) + *seqio = v - *page; + } + + if (randiops) { + v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, randiops); + if (v > *page) + *randio = v - *page; + } +} + +static void ioc_refresh_lcoefs(struct ioc *ioc) +{ + u64 *u = ioc->params.i_lcoefs; + u64 *c = ioc->params.lcoefs; + + calc_lcoefs(u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS], + &c[LCOEF_RPAGE], &c[LCOEF_RSEQIO], &c[LCOEF_RRANDIO]); + calc_lcoefs(u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS], + &c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]); +} + +static bool ioc_refresh_params(struct ioc *ioc, bool force) +{ + const struct ioc_params *p; + int idx; + + lockdep_assert_held(&ioc->lock); + + idx = ioc_autop_idx(ioc); + p = &autop[idx]; + + if (idx == ioc->autop_idx && !force) + return false; + + if (idx != ioc->autop_idx) + atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC); + + ioc->autop_idx = idx; + ioc->autop_too_fast_at = 0; + ioc->autop_too_slow_at = 0; + + if (!ioc->user_qos_params) + memcpy(ioc->params.qos, p->qos, sizeof(p->qos)); + if (!ioc->user_cost_model) + memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs)); + + ioc_refresh_period_us(ioc); + ioc_refresh_lcoefs(ioc); + + ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] * + VTIME_PER_USEC, MILLION); + ioc->vrate_max = div64_u64((u64)ioc->params.qos[QOS_MAX] * + VTIME_PER_USEC, MILLION); + + return true; +} + +/* + * When an iocg accumulates too much vtime or gets deactivated, we throw away + * some vtime, which lowers the overall device utilization. As the exact amount + * which is being thrown away is known, we can compensate by accelerating the + * vrate accordingly so that the extra vtime generated in the current period + * matches what got lost. + */ +static void ioc_refresh_vrate(struct ioc *ioc, struct ioc_now *now) +{ + s64 pleft = ioc->period_at + ioc->period_us - now->now; + s64 vperiod = ioc->period_us * ioc->vtime_base_rate; + s64 vcomp, vcomp_min, vcomp_max; + + lockdep_assert_held(&ioc->lock); + + /* we need some time left in this period */ + if (pleft <= 0) + goto done; + + /* + * Calculate how much vrate should be adjusted to offset the error. + * Limit the amount of adjustment and deduct the adjusted amount from + * the error. + */ + vcomp = -div64_s64(ioc->vtime_err, pleft); + vcomp_min = -(ioc->vtime_base_rate >> 1); + vcomp_max = ioc->vtime_base_rate; + vcomp = clamp(vcomp, vcomp_min, vcomp_max); + + ioc->vtime_err += vcomp * pleft; + + atomic64_set(&ioc->vtime_rate, ioc->vtime_base_rate + vcomp); +done: + /* bound how much error can accumulate */ + ioc->vtime_err = clamp(ioc->vtime_err, -vperiod, vperiod); +} + +/* take a snapshot of the current [v]time and vrate */ +static void ioc_now(struct ioc *ioc, struct ioc_now *now) +{ + unsigned seq; + + now->now_ns = ktime_get(); + now->now = ktime_to_us(now->now_ns); + now->vrate = atomic64_read(&ioc->vtime_rate); + + /* + * The current vtime is + * + * vtime at period start + (wallclock time since the start) * vrate + * + * As a consistent snapshot of `period_at_vtime` and `period_at` is + * needed, they're seqcount protected. + */ + do { + seq = read_seqcount_begin(&ioc->period_seqcount); + now->vnow = ioc->period_at_vtime + + (now->now - ioc->period_at) * now->vrate; + } while (read_seqcount_retry(&ioc->period_seqcount, seq)); +} + +static void ioc_start_period(struct ioc *ioc, struct ioc_now *now) +{ + WARN_ON_ONCE(ioc->running != IOC_RUNNING); + + write_seqcount_begin(&ioc->period_seqcount); + ioc->period_at = now->now; + ioc->period_at_vtime = now->vnow; + write_seqcount_end(&ioc->period_seqcount); + + ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us); + add_timer(&ioc->timer); +} + +/* + * Update @iocg's `active` and `inuse` to @active and @inuse, update level + * weight sums and propagate upwards accordingly. If @save, the current margin + * is saved to be used as reference for later inuse in-period adjustments. + */ +static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse, + bool save, struct ioc_now *now) +{ + struct ioc *ioc = iocg->ioc; + int lvl; + + lockdep_assert_held(&ioc->lock); + + /* + * For an active leaf node, its inuse shouldn't be zero or exceed + * @active. An active internal node's inuse is solely determined by the + * inuse to active ratio of its children regardless of @inuse. + */ + if (list_empty(&iocg->active_list) && iocg->child_active_sum) { + inuse = DIV64_U64_ROUND_UP(active * iocg->child_inuse_sum, + iocg->child_active_sum); + } else { + inuse = clamp_t(u32, inuse, 1, active); + } + + iocg->last_inuse = iocg->inuse; + if (save) + iocg->saved_margin = now->vnow - atomic64_read(&iocg->vtime); + + if (active == iocg->active && inuse == iocg->inuse) + return; + + for (lvl = iocg->level - 1; lvl >= 0; lvl--) { + struct ioc_gq *parent = iocg->ancestors[lvl]; + struct ioc_gq *child = iocg->ancestors[lvl + 1]; + u32 parent_active = 0, parent_inuse = 0; + + /* update the level sums */ + parent->child_active_sum += (s32)(active - child->active); + parent->child_inuse_sum += (s32)(inuse - child->inuse); + /* apply the updates */ + child->active = active; + child->inuse = inuse; + + /* + * The delta between inuse and active sums indicates that + * that much of weight is being given away. Parent's inuse + * and active should reflect the ratio. + */ + if (parent->child_active_sum) { + parent_active = parent->weight; + parent_inuse = DIV64_U64_ROUND_UP( + parent_active * parent->child_inuse_sum, + parent->child_active_sum); + } + + /* do we need to keep walking up? */ + if (parent_active == parent->active && + parent_inuse == parent->inuse) + break; + + active = parent_active; + inuse = parent_inuse; + } + + ioc->weights_updated = true; +} + +static void commit_weights(struct ioc *ioc) +{ + lockdep_assert_held(&ioc->lock); + + if (ioc->weights_updated) { + /* paired with rmb in current_hweight(), see there */ + smp_wmb(); + atomic_inc(&ioc->hweight_gen); + ioc->weights_updated = false; + } +} + +static void propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse, + bool save, struct ioc_now *now) +{ + __propagate_weights(iocg, active, inuse, save, now); + commit_weights(iocg->ioc); +} + +static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep) +{ + struct ioc *ioc = iocg->ioc; + int lvl; + u32 hwa, hwi; + int ioc_gen; + + /* hot path - if uptodate, use cached */ + ioc_gen = atomic_read(&ioc->hweight_gen); + if (ioc_gen == iocg->hweight_gen) + goto out; + + /* + * Paired with wmb in commit_weights(). If we saw the updated + * hweight_gen, all the weight updates from __propagate_weights() are + * visible too. + * + * We can race with weight updates during calculation and get it + * wrong. However, hweight_gen would have changed and a future + * reader will recalculate and we're guaranteed to discard the + * wrong result soon. + */ + smp_rmb(); + + hwa = hwi = WEIGHT_ONE; + for (lvl = 0; lvl <= iocg->level - 1; lvl++) { + struct ioc_gq *parent = iocg->ancestors[lvl]; + struct ioc_gq *child = iocg->ancestors[lvl + 1]; + u64 active_sum = READ_ONCE(parent->child_active_sum); + u64 inuse_sum = READ_ONCE(parent->child_inuse_sum); + u32 active = READ_ONCE(child->active); + u32 inuse = READ_ONCE(child->inuse); + + /* we can race with deactivations and either may read as zero */ + if (!active_sum || !inuse_sum) + continue; + + active_sum = max_t(u64, active, active_sum); + hwa = div64_u64((u64)hwa * active, active_sum); + + inuse_sum = max_t(u64, inuse, inuse_sum); + hwi = div64_u64((u64)hwi * inuse, inuse_sum); + } + + iocg->hweight_active = max_t(u32, hwa, 1); + iocg->hweight_inuse = max_t(u32, hwi, 1); + iocg->hweight_gen = ioc_gen; +out: + if (hw_activep) + *hw_activep = iocg->hweight_active; + if (hw_inusep) + *hw_inusep = iocg->hweight_inuse; +} + +/* + * Calculate the hweight_inuse @iocg would get with max @inuse assuming all the + * other weights stay unchanged. + */ +static u32 current_hweight_max(struct ioc_gq *iocg) +{ + u32 hwm = WEIGHT_ONE; + u32 inuse = iocg->active; + u64 child_inuse_sum; + int lvl; + + lockdep_assert_held(&iocg->ioc->lock); + + for (lvl = iocg->level - 1; lvl >= 0; lvl--) { + struct ioc_gq *parent = iocg->ancestors[lvl]; + struct ioc_gq *child = iocg->ancestors[lvl + 1]; + + child_inuse_sum = parent->child_inuse_sum + inuse - child->inuse; + hwm = div64_u64((u64)hwm * inuse, child_inuse_sum); + inuse = DIV64_U64_ROUND_UP(parent->active * child_inuse_sum, + parent->child_active_sum); + } + + return max_t(u32, hwm, 1); +} + +static void weight_updated(struct ioc_gq *iocg, struct ioc_now *now) +{ + struct ioc *ioc = iocg->ioc; + struct blkcg_gq *blkg = iocg_to_blkg(iocg); + struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg); + u32 weight; + + lockdep_assert_held(&ioc->lock); + + weight = iocg->cfg_weight ?: iocc->dfl_weight; + if (weight != iocg->weight && iocg->active) + propagate_weights(iocg, weight, iocg->inuse, true, now); + iocg->weight = weight; +} + +static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now) +{ + struct ioc *ioc = iocg->ioc; + u64 last_period, cur_period; + u64 vtime, vtarget; + int i; + + /* + * If seem to be already active, just update the stamp to tell the + * timer that we're still active. We don't mind occassional races. + */ + if (!list_empty(&iocg->active_list)) { + ioc_now(ioc, now); + cur_period = atomic64_read(&ioc->cur_period); + if (atomic64_read(&iocg->active_period) != cur_period) + atomic64_set(&iocg->active_period, cur_period); + return true; + } + + /* racy check on internal node IOs, treat as root level IOs */ + if (iocg->child_active_sum) + return false; + + spin_lock_irq(&ioc->lock); + + ioc_now(ioc, now); + + /* update period */ + cur_period = atomic64_read(&ioc->cur_period); + last_period = atomic64_read(&iocg->active_period); + atomic64_set(&iocg->active_period, cur_period); + + /* already activated or breaking leaf-only constraint? */ + if (!list_empty(&iocg->active_list)) + goto succeed_unlock; + for (i = iocg->level - 1; i > 0; i--) + if (!list_empty(&iocg->ancestors[i]->active_list)) + goto fail_unlock; + + if (iocg->child_active_sum) + goto fail_unlock; + + /* + * Always start with the target budget. On deactivation, we throw away + * anything above it. + */ + vtarget = now->vnow - ioc->margins.target; + vtime = atomic64_read(&iocg->vtime); + + atomic64_add(vtarget - vtime, &iocg->vtime); + atomic64_add(vtarget - vtime, &iocg->done_vtime); + vtime = vtarget; + + /* + * Activate, propagate weight and start period timer if not + * running. Reset hweight_gen to avoid accidental match from + * wrapping. + */ + iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1; + list_add(&iocg->active_list, &ioc->active_iocgs); + + propagate_weights(iocg, iocg->weight, + iocg->last_inuse ?: iocg->weight, true, now); + + TRACE_IOCG_PATH(iocg_activate, iocg, now, + last_period, cur_period, vtime); + + iocg->activated_at = now->now; + + if (ioc->running == IOC_IDLE) { + ioc->running = IOC_RUNNING; + ioc->dfgv_period_at = now->now; + ioc->dfgv_period_rem = 0; + ioc_start_period(ioc, now); + } + +succeed_unlock: + spin_unlock_irq(&ioc->lock); + return true; + +fail_unlock: + spin_unlock_irq(&ioc->lock); + return false; +} + +static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now) +{ + struct ioc *ioc = iocg->ioc; + struct blkcg_gq *blkg = iocg_to_blkg(iocg); + u64 tdelta, delay, new_delay; + s64 vover, vover_pct; + u32 hwa; + + lockdep_assert_held(&iocg->waitq.lock); + + /* calculate the current delay in effect - 1/2 every second */ + tdelta = now->now - iocg->delay_at; + if (iocg->delay) + delay = iocg->delay >> div64_u64(tdelta, USEC_PER_SEC); + else + delay = 0; + + /* calculate the new delay from the debt amount */ + current_hweight(iocg, &hwa, NULL); + vover = atomic64_read(&iocg->vtime) + + abs_cost_to_cost(iocg->abs_vdebt, hwa) - now->vnow; + vover_pct = div64_s64(100 * vover, + ioc->period_us * ioc->vtime_base_rate); + + if (vover_pct <= MIN_DELAY_THR_PCT) + new_delay = 0; + else if (vover_pct >= MAX_DELAY_THR_PCT) + new_delay = MAX_DELAY; + else + new_delay = MIN_DELAY + + div_u64((MAX_DELAY - MIN_DELAY) * + (vover_pct - MIN_DELAY_THR_PCT), + MAX_DELAY_THR_PCT - MIN_DELAY_THR_PCT); + + /* pick the higher one and apply */ + if (new_delay > delay) { + iocg->delay = new_delay; + iocg->delay_at = now->now; + delay = new_delay; + } + + if (delay >= MIN_DELAY) { + if (!iocg->indelay_since) + iocg->indelay_since = now->now; + blkcg_set_delay(blkg, delay * NSEC_PER_USEC); + return true; + } else { + if (iocg->indelay_since) { + iocg->local_stat.indelay_us += now->now - iocg->indelay_since; + iocg->indelay_since = 0; + } + iocg->delay = 0; + blkcg_clear_delay(blkg); + return false; + } +} + +static void iocg_incur_debt(struct ioc_gq *iocg, u64 abs_cost, + struct ioc_now *now) +{ + struct iocg_pcpu_stat *gcs; + + lockdep_assert_held(&iocg->ioc->lock); + lockdep_assert_held(&iocg->waitq.lock); + WARN_ON_ONCE(list_empty(&iocg->active_list)); + + /* + * Once in debt, debt handling owns inuse. @iocg stays at the minimum + * inuse donating all of it share to others until its debt is paid off. + */ + if (!iocg->abs_vdebt && abs_cost) { + iocg->indebt_since = now->now; + propagate_weights(iocg, iocg->active, 0, false, now); + } + + iocg->abs_vdebt += abs_cost; + + gcs = get_cpu_ptr(iocg->pcpu_stat); + local64_add(abs_cost, &gcs->abs_vusage); + put_cpu_ptr(gcs); +} + +static void iocg_pay_debt(struct ioc_gq *iocg, u64 abs_vpay, + struct ioc_now *now) +{ + lockdep_assert_held(&iocg->ioc->lock); + lockdep_assert_held(&iocg->waitq.lock); + + /* make sure that nobody messed with @iocg */ + WARN_ON_ONCE(list_empty(&iocg->active_list)); + WARN_ON_ONCE(iocg->inuse > 1); + + iocg->abs_vdebt -= min(abs_vpay, iocg->abs_vdebt); + + /* if debt is paid in full, restore inuse */ + if (!iocg->abs_vdebt) { + iocg->local_stat.indebt_us += now->now - iocg->indebt_since; + iocg->indebt_since = 0; + + propagate_weights(iocg, iocg->active, iocg->last_inuse, + false, now); + } +} + +static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode, + int flags, void *key) +{ + struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait); + struct iocg_wake_ctx *ctx = (struct iocg_wake_ctx *)key; + u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse); + + ctx->vbudget -= cost; + + if (ctx->vbudget < 0) + return -1; + + iocg_commit_bio(ctx->iocg, wait->bio, wait->abs_cost, cost); + wait->committed = true; + + /* + * autoremove_wake_function() removes the wait entry only when it + * actually changed the task state. We want the wait always removed. + * Remove explicitly and use default_wake_function(). Note that the + * order of operations is important as finish_wait() tests whether + * @wq_entry is removed without grabbing the lock. + */ + default_wake_function(wq_entry, mode, flags, key); + list_del_init_careful(&wq_entry->entry); + return 0; +} + +/* + * Calculate the accumulated budget, pay debt if @pay_debt and wake up waiters + * accordingly. When @pay_debt is %true, the caller must be holding ioc->lock in + * addition to iocg->waitq.lock. + */ +static void iocg_kick_waitq(struct ioc_gq *iocg, bool pay_debt, + struct ioc_now *now) +{ + struct ioc *ioc = iocg->ioc; + struct iocg_wake_ctx ctx = { .iocg = iocg }; + u64 vshortage, expires, oexpires; + s64 vbudget; + u32 hwa; + + lockdep_assert_held(&iocg->waitq.lock); + + current_hweight(iocg, &hwa, NULL); + vbudget = now->vnow - atomic64_read(&iocg->vtime); + + /* pay off debt */ + if (pay_debt && iocg->abs_vdebt && vbudget > 0) { + u64 abs_vbudget = cost_to_abs_cost(vbudget, hwa); + u64 abs_vpay = min_t(u64, abs_vbudget, iocg->abs_vdebt); + u64 vpay = abs_cost_to_cost(abs_vpay, hwa); + + lockdep_assert_held(&ioc->lock); + + atomic64_add(vpay, &iocg->vtime); + atomic64_add(vpay, &iocg->done_vtime); + iocg_pay_debt(iocg, abs_vpay, now); + vbudget -= vpay; + } + + if (iocg->abs_vdebt || iocg->delay) + iocg_kick_delay(iocg, now); + + /* + * Debt can still be outstanding if we haven't paid all yet or the + * caller raced and called without @pay_debt. Shouldn't wake up waiters + * under debt. Make sure @vbudget reflects the outstanding amount and is + * not positive. + */ + if (iocg->abs_vdebt) { + s64 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hwa); + vbudget = min_t(s64, 0, vbudget - vdebt); + } + + /* + * Wake up the ones which are due and see how much vtime we'll need for + * the next one. As paying off debt restores hw_inuse, it must be read + * after the above debt payment. + */ + ctx.vbudget = vbudget; + current_hweight(iocg, NULL, &ctx.hw_inuse); + + __wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx); + + if (!waitqueue_active(&iocg->waitq)) { + if (iocg->wait_since) { + iocg->local_stat.wait_us += now->now - iocg->wait_since; + iocg->wait_since = 0; + } + return; + } + + if (!iocg->wait_since) + iocg->wait_since = now->now; + + if (WARN_ON_ONCE(ctx.vbudget >= 0)) + return; + + /* determine next wakeup, add a timer margin to guarantee chunking */ + vshortage = -ctx.vbudget; + expires = now->now_ns + + DIV64_U64_ROUND_UP(vshortage, ioc->vtime_base_rate) * + NSEC_PER_USEC; + expires += ioc->timer_slack_ns; + + /* if already active and close enough, don't bother */ + oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer)); + if (hrtimer_is_queued(&iocg->waitq_timer) && + abs(oexpires - expires) <= ioc->timer_slack_ns) + return; + + hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires), + ioc->timer_slack_ns, HRTIMER_MODE_ABS); +} + +static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer) +{ + struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer); + bool pay_debt = READ_ONCE(iocg->abs_vdebt); + struct ioc_now now; + unsigned long flags; + + ioc_now(iocg->ioc, &now); + + iocg_lock(iocg, pay_debt, &flags); + iocg_kick_waitq(iocg, pay_debt, &now); + iocg_unlock(iocg, pay_debt, &flags); + + return HRTIMER_NORESTART; +} + +static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p) +{ + u32 nr_met[2] = { }; + u32 nr_missed[2] = { }; + u64 rq_wait_ns = 0; + int cpu, rw; + + for_each_online_cpu(cpu) { + struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu); + u64 this_rq_wait_ns; + + for (rw = READ; rw <= WRITE; rw++) { + u32 this_met = local_read(&stat->missed[rw].nr_met); + u32 this_missed = local_read(&stat->missed[rw].nr_missed); + + nr_met[rw] += this_met - stat->missed[rw].last_met; + nr_missed[rw] += this_missed - stat->missed[rw].last_missed; + stat->missed[rw].last_met = this_met; + stat->missed[rw].last_missed = this_missed; + } + + this_rq_wait_ns = local64_read(&stat->rq_wait_ns); + rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns; + stat->last_rq_wait_ns = this_rq_wait_ns; + } + + for (rw = READ; rw <= WRITE; rw++) { + if (nr_met[rw] + nr_missed[rw]) + missed_ppm_ar[rw] = + DIV64_U64_ROUND_UP((u64)nr_missed[rw] * MILLION, + nr_met[rw] + nr_missed[rw]); + else + missed_ppm_ar[rw] = 0; + } + + *rq_wait_pct_p = div64_u64(rq_wait_ns * 100, + ioc->period_us * NSEC_PER_USEC); +} + +/* was iocg idle this period? */ +static bool iocg_is_idle(struct ioc_gq *iocg) +{ + struct ioc *ioc = iocg->ioc; + + /* did something get issued this period? */ + if (atomic64_read(&iocg->active_period) == + atomic64_read(&ioc->cur_period)) + return false; + + /* is something in flight? */ + if (atomic64_read(&iocg->done_vtime) != atomic64_read(&iocg->vtime)) + return false; + + return true; +} + +/* + * Call this function on the target leaf @iocg's to build pre-order traversal + * list of all the ancestors in @inner_walk. The inner nodes are linked through + * ->walk_list and the caller is responsible for dissolving the list after use. + */ +static void iocg_build_inner_walk(struct ioc_gq *iocg, + struct list_head *inner_walk) +{ + int lvl; + + WARN_ON_ONCE(!list_empty(&iocg->walk_list)); + + /* find the first ancestor which hasn't been visited yet */ + for (lvl = iocg->level - 1; lvl >= 0; lvl--) { + if (!list_empty(&iocg->ancestors[lvl]->walk_list)) + break; + } + + /* walk down and visit the inner nodes to get pre-order traversal */ + while (++lvl <= iocg->level - 1) { + struct ioc_gq *inner = iocg->ancestors[lvl]; + + /* record traversal order */ + list_add_tail(&inner->walk_list, inner_walk); + } +} + +/* collect per-cpu counters and propagate the deltas to the parent */ +static void iocg_flush_stat_one(struct ioc_gq *iocg, struct ioc_now *now) +{ + struct ioc *ioc = iocg->ioc; + struct iocg_stat new_stat; + u64 abs_vusage = 0; + u64 vusage_delta; + int cpu; + + lockdep_assert_held(&iocg->ioc->lock); + + /* collect per-cpu counters */ + for_each_possible_cpu(cpu) { + abs_vusage += local64_read( + per_cpu_ptr(&iocg->pcpu_stat->abs_vusage, cpu)); + } + vusage_delta = abs_vusage - iocg->last_stat_abs_vusage; + iocg->last_stat_abs_vusage = abs_vusage; + + iocg->usage_delta_us = div64_u64(vusage_delta, ioc->vtime_base_rate); + iocg->local_stat.usage_us += iocg->usage_delta_us; + + /* propagate upwards */ + new_stat.usage_us = + iocg->local_stat.usage_us + iocg->desc_stat.usage_us; + new_stat.wait_us = + iocg->local_stat.wait_us + iocg->desc_stat.wait_us; + new_stat.indebt_us = + iocg->local_stat.indebt_us + iocg->desc_stat.indebt_us; + new_stat.indelay_us = + iocg->local_stat.indelay_us + iocg->desc_stat.indelay_us; + + /* propagate the deltas to the parent */ + if (iocg->level > 0) { + struct iocg_stat *parent_stat = + &iocg->ancestors[iocg->level - 1]->desc_stat; + + parent_stat->usage_us += + new_stat.usage_us - iocg->last_stat.usage_us; + parent_stat->wait_us += + new_stat.wait_us - iocg->last_stat.wait_us; + parent_stat->indebt_us += + new_stat.indebt_us - iocg->last_stat.indebt_us; + parent_stat->indelay_us += + new_stat.indelay_us - iocg->last_stat.indelay_us; + } + + iocg->last_stat = new_stat; +} + +/* get stat counters ready for reading on all active iocgs */ +static void iocg_flush_stat(struct list_head *target_iocgs, struct ioc_now *now) +{ + LIST_HEAD(inner_walk); + struct ioc_gq *iocg, *tiocg; + + /* flush leaves and build inner node walk list */ + list_for_each_entry(iocg, target_iocgs, active_list) { + iocg_flush_stat_one(iocg, now); + iocg_build_inner_walk(iocg, &inner_walk); + } + + /* keep flushing upwards by walking the inner list backwards */ + list_for_each_entry_safe_reverse(iocg, tiocg, &inner_walk, walk_list) { + iocg_flush_stat_one(iocg, now); + list_del_init(&iocg->walk_list); + } +} + +/* + * Determine what @iocg's hweight_inuse should be after donating unused + * capacity. @hwm is the upper bound and used to signal no donation. This + * function also throws away @iocg's excess budget. + */ +static u32 hweight_after_donation(struct ioc_gq *iocg, u32 old_hwi, u32 hwm, + u32 usage, struct ioc_now *now) +{ + struct ioc *ioc = iocg->ioc; + u64 vtime = atomic64_read(&iocg->vtime); + s64 excess, delta, target, new_hwi; + + /* debt handling owns inuse for debtors */ + if (iocg->abs_vdebt) + return 1; + + /* see whether minimum margin requirement is met */ + if (waitqueue_active(&iocg->waitq) || + time_after64(vtime, now->vnow - ioc->margins.min)) + return hwm; + + /* throw away excess above target */ + excess = now->vnow - vtime - ioc->margins.target; + if (excess > 0) { + atomic64_add(excess, &iocg->vtime); + atomic64_add(excess, &iocg->done_vtime); + vtime += excess; + ioc->vtime_err -= div64_u64(excess * old_hwi, WEIGHT_ONE); + } + + /* + * Let's say the distance between iocg's and device's vtimes as a + * fraction of period duration is delta. Assuming that the iocg will + * consume the usage determined above, we want to determine new_hwi so + * that delta equals MARGIN_TARGET at the end of the next period. + * + * We need to execute usage worth of IOs while spending the sum of the + * new budget (1 - MARGIN_TARGET) and the leftover from the last period + * (delta): + * + * usage = (1 - MARGIN_TARGET + delta) * new_hwi + * + * Therefore, the new_hwi is: + * + * new_hwi = usage / (1 - MARGIN_TARGET + delta) + */ + delta = div64_s64(WEIGHT_ONE * (now->vnow - vtime), + now->vnow - ioc->period_at_vtime); + target = WEIGHT_ONE * MARGIN_TARGET_PCT / 100; + new_hwi = div64_s64(WEIGHT_ONE * usage, WEIGHT_ONE - target + delta); + + return clamp_t(s64, new_hwi, 1, hwm); +} + +/* + * For work-conservation, an iocg which isn't using all of its share should + * donate the leftover to other iocgs. There are two ways to achieve this - 1. + * bumping up vrate accordingly 2. lowering the donating iocg's inuse weight. + * + * #1 is mathematically simpler but has the drawback of requiring synchronous + * global hweight_inuse updates when idle iocg's get activated or inuse weights + * change due to donation snapbacks as it has the possibility of grossly + * overshooting what's allowed by the model and vrate. + * + * #2 is inherently safe with local operations. The donating iocg can easily + * snap back to higher weights when needed without worrying about impacts on + * other nodes as the impacts will be inherently correct. This also makes idle + * iocg activations safe. The only effect activations have is decreasing + * hweight_inuse of others, the right solution to which is for those iocgs to + * snap back to higher weights. + * + * So, we go with #2. The challenge is calculating how each donating iocg's + * inuse should be adjusted to achieve the target donation amounts. This is done + * using Andy's method described in the following pdf. + * + * https://drive.google.com/file/d/1PsJwxPFtjUnwOY1QJ5AeICCcsL7BM3bo + * + * Given the weights and target after-donation hweight_inuse values, Andy's + * method determines how the proportional distribution should look like at each + * sibling level to maintain the relative relationship between all non-donating + * pairs. To roughly summarize, it divides the tree into donating and + * non-donating parts, calculates global donation rate which is used to + * determine the target hweight_inuse for each node, and then derives per-level + * proportions. + * + * The following pdf shows that global distribution calculated this way can be + * achieved by scaling inuse weights of donating leaves and propagating the + * adjustments upwards proportionally. + * + * https://drive.google.com/file/d/1vONz1-fzVO7oY5DXXsLjSxEtYYQbOvsE + * + * Combining the above two, we can determine how each leaf iocg's inuse should + * be adjusted to achieve the target donation. + * + * https://drive.google.com/file/d/1WcrltBOSPN0qXVdBgnKm4mdp9FhuEFQN + * + * The inline comments use symbols from the last pdf. + * + * b is the sum of the absolute budgets in the subtree. 1 for the root node. + * f is the sum of the absolute budgets of non-donating nodes in the subtree. + * t is the sum of the absolute budgets of donating nodes in the subtree. + * w is the weight of the node. w = w_f + w_t + * w_f is the non-donating portion of w. w_f = w * f / b + * w_b is the donating portion of w. w_t = w * t / b + * s is the sum of all sibling weights. s = Sum(w) for siblings + * s_f and s_t are the non-donating and donating portions of s. + * + * Subscript p denotes the parent's counterpart and ' the adjusted value - e.g. + * w_pt is the donating portion of the parent's weight and w'_pt the same value + * after adjustments. Subscript r denotes the root node's values. + */ +static void transfer_surpluses(struct list_head *surpluses, struct ioc_now *now) +{ + LIST_HEAD(over_hwa); + LIST_HEAD(inner_walk); + struct ioc_gq *iocg, *tiocg, *root_iocg; + u32 after_sum, over_sum, over_target, gamma; + + /* + * It's pretty unlikely but possible for the total sum of + * hweight_after_donation's to be higher than WEIGHT_ONE, which will + * confuse the following calculations. If such condition is detected, + * scale down everyone over its full share equally to keep the sum below + * WEIGHT_ONE. + */ + after_sum = 0; + over_sum = 0; + list_for_each_entry(iocg, surpluses, surplus_list) { + u32 hwa; + + current_hweight(iocg, &hwa, NULL); + after_sum += iocg->hweight_after_donation; + + if (iocg->hweight_after_donation > hwa) { + over_sum += iocg->hweight_after_donation; + list_add(&iocg->walk_list, &over_hwa); + } + } + + if (after_sum >= WEIGHT_ONE) { + /* + * The delta should be deducted from the over_sum, calculate + * target over_sum value. + */ + u32 over_delta = after_sum - (WEIGHT_ONE - 1); + WARN_ON_ONCE(over_sum <= over_delta); + over_target = over_sum - over_delta; + } else { + over_target = 0; + } + + list_for_each_entry_safe(iocg, tiocg, &over_hwa, walk_list) { + if (over_target) + iocg->hweight_after_donation = + div_u64((u64)iocg->hweight_after_donation * + over_target, over_sum); + list_del_init(&iocg->walk_list); + } + + /* + * Build pre-order inner node walk list and prepare for donation + * adjustment calculations. + */ + list_for_each_entry(iocg, surpluses, surplus_list) { + iocg_build_inner_walk(iocg, &inner_walk); + } + + root_iocg = list_first_entry(&inner_walk, struct ioc_gq, walk_list); + WARN_ON_ONCE(root_iocg->level > 0); + + list_for_each_entry(iocg, &inner_walk, walk_list) { + iocg->child_adjusted_sum = 0; + iocg->hweight_donating = 0; + iocg->hweight_after_donation = 0; + } + + /* + * Propagate the donating budget (b_t) and after donation budget (b'_t) + * up the hierarchy. + */ + list_for_each_entry(iocg, surpluses, surplus_list) { + struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; + + parent->hweight_donating += iocg->hweight_donating; + parent->hweight_after_donation += iocg->hweight_after_donation; + } + + list_for_each_entry_reverse(iocg, &inner_walk, walk_list) { + if (iocg->level > 0) { + struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; + + parent->hweight_donating += iocg->hweight_donating; + parent->hweight_after_donation += iocg->hweight_after_donation; + } + } + + /* + * Calculate inner hwa's (b) and make sure the donation values are + * within the accepted ranges as we're doing low res calculations with + * roundups. + */ + list_for_each_entry(iocg, &inner_walk, walk_list) { + if (iocg->level) { + struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; + + iocg->hweight_active = DIV64_U64_ROUND_UP( + (u64)parent->hweight_active * iocg->active, + parent->child_active_sum); + + } + + iocg->hweight_donating = min(iocg->hweight_donating, + iocg->hweight_active); + iocg->hweight_after_donation = min(iocg->hweight_after_donation, + iocg->hweight_donating - 1); + if (WARN_ON_ONCE(iocg->hweight_active <= 1 || + iocg->hweight_donating <= 1 || + iocg->hweight_after_donation == 0)) { + pr_warn("iocg: invalid donation weights in "); + pr_cont_cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup); + pr_cont(": active=%u donating=%u after=%u\n", + iocg->hweight_active, iocg->hweight_donating, + iocg->hweight_after_donation); + } + } + + /* + * Calculate the global donation rate (gamma) - the rate to adjust + * non-donating budgets by. + * + * No need to use 64bit multiplication here as the first operand is + * guaranteed to be smaller than WEIGHT_ONE (1<<16). + * + * We know that there are beneficiary nodes and the sum of the donating + * hweights can't be whole; however, due to the round-ups during hweight + * calculations, root_iocg->hweight_donating might still end up equal to + * or greater than whole. Limit the range when calculating the divider. + * + * gamma = (1 - t_r') / (1 - t_r) + */ + gamma = DIV_ROUND_UP( + (WEIGHT_ONE - root_iocg->hweight_after_donation) * WEIGHT_ONE, + WEIGHT_ONE - min_t(u32, root_iocg->hweight_donating, WEIGHT_ONE - 1)); + + /* + * Calculate adjusted hwi, child_adjusted_sum and inuse for the inner + * nodes. + */ + list_for_each_entry(iocg, &inner_walk, walk_list) { + struct ioc_gq *parent; + u32 inuse, wpt, wptp; + u64 st, sf; + + if (iocg->level == 0) { + /* adjusted weight sum for 1st level: s' = s * b_pf / b'_pf */ + iocg->child_adjusted_sum = DIV64_U64_ROUND_UP( + iocg->child_active_sum * (WEIGHT_ONE - iocg->hweight_donating), + WEIGHT_ONE - iocg->hweight_after_donation); + continue; + } + + parent = iocg->ancestors[iocg->level - 1]; + + /* b' = gamma * b_f + b_t' */ + iocg->hweight_inuse = DIV64_U64_ROUND_UP( + (u64)gamma * (iocg->hweight_active - iocg->hweight_donating), + WEIGHT_ONE) + iocg->hweight_after_donation; + + /* w' = s' * b' / b'_p */ + inuse = DIV64_U64_ROUND_UP( + (u64)parent->child_adjusted_sum * iocg->hweight_inuse, + parent->hweight_inuse); + + /* adjusted weight sum for children: s' = s_f + s_t * w'_pt / w_pt */ + st = DIV64_U64_ROUND_UP( + iocg->child_active_sum * iocg->hweight_donating, + iocg->hweight_active); + sf = iocg->child_active_sum - st; + wpt = DIV64_U64_ROUND_UP( + (u64)iocg->active * iocg->hweight_donating, + iocg->hweight_active); + wptp = DIV64_U64_ROUND_UP( + (u64)inuse * iocg->hweight_after_donation, + iocg->hweight_inuse); + + iocg->child_adjusted_sum = sf + DIV64_U64_ROUND_UP(st * wptp, wpt); + } + + /* + * All inner nodes now have ->hweight_inuse and ->child_adjusted_sum and + * we can finally determine leaf adjustments. + */ + list_for_each_entry(iocg, surpluses, surplus_list) { + struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; + u32 inuse; + + /* + * In-debt iocgs participated in the donation calculation with + * the minimum target hweight_inuse. Configuring inuse + * accordingly would work fine but debt handling expects + * @iocg->inuse stay at the minimum and we don't wanna + * interfere. + */ + if (iocg->abs_vdebt) { + WARN_ON_ONCE(iocg->inuse > 1); + continue; + } + + /* w' = s' * b' / b'_p, note that b' == b'_t for donating leaves */ + inuse = DIV64_U64_ROUND_UP( + parent->child_adjusted_sum * iocg->hweight_after_donation, + parent->hweight_inuse); + + TRACE_IOCG_PATH(inuse_transfer, iocg, now, + iocg->inuse, inuse, + iocg->hweight_inuse, + iocg->hweight_after_donation); + + __propagate_weights(iocg, iocg->active, inuse, true, now); + } + + /* walk list should be dissolved after use */ + list_for_each_entry_safe(iocg, tiocg, &inner_walk, walk_list) + list_del_init(&iocg->walk_list); +} + +/* + * A low weight iocg can amass a large amount of debt, for example, when + * anonymous memory gets reclaimed aggressively. If the system has a lot of + * memory paired with a slow IO device, the debt can span multiple seconds or + * more. If there are no other subsequent IO issuers, the in-debt iocg may end + * up blocked paying its debt while the IO device is idle. + * + * The following protects against such cases. If the device has been + * sufficiently idle for a while, the debts are halved and delays are + * recalculated. + */ +static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors, + struct ioc_now *now) +{ + struct ioc_gq *iocg; + u64 dur, usage_pct, nr_cycles; + + /* if no debtor, reset the cycle */ + if (!nr_debtors) { + ioc->dfgv_period_at = now->now; + ioc->dfgv_period_rem = 0; + ioc->dfgv_usage_us_sum = 0; + return; + } + + /* + * Debtors can pass through a lot of writes choking the device and we + * don't want to be forgiving debts while the device is struggling from + * write bursts. If we're missing latency targets, consider the device + * fully utilized. + */ + if (ioc->busy_level > 0) + usage_us_sum = max_t(u64, usage_us_sum, ioc->period_us); + + ioc->dfgv_usage_us_sum += usage_us_sum; + if (time_before64(now->now, ioc->dfgv_period_at + DFGV_PERIOD)) + return; + + /* + * At least DFGV_PERIOD has passed since the last period. Calculate the + * average usage and reset the period counters. + */ + dur = now->now - ioc->dfgv_period_at; + usage_pct = div64_u64(100 * ioc->dfgv_usage_us_sum, dur); + + ioc->dfgv_period_at = now->now; + ioc->dfgv_usage_us_sum = 0; + + /* if was too busy, reset everything */ + if (usage_pct > DFGV_USAGE_PCT) { + ioc->dfgv_period_rem = 0; + return; + } + + /* + * Usage is lower than threshold. Let's forgive some debts. Debt + * forgiveness runs off of the usual ioc timer but its period usually + * doesn't match ioc's. Compensate the difference by performing the + * reduction as many times as would fit in the duration since the last + * run and carrying over the left-over duration in @ioc->dfgv_period_rem + * - if ioc period is 75% of DFGV_PERIOD, one out of three consecutive + * reductions is doubled. + */ + nr_cycles = dur + ioc->dfgv_period_rem; + ioc->dfgv_period_rem = do_div(nr_cycles, DFGV_PERIOD); + + list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { + u64 __maybe_unused old_debt, __maybe_unused old_delay; + + if (!iocg->abs_vdebt && !iocg->delay) + continue; + + spin_lock(&iocg->waitq.lock); + + old_debt = iocg->abs_vdebt; + old_delay = iocg->delay; + + if (iocg->abs_vdebt) + iocg->abs_vdebt = iocg->abs_vdebt >> nr_cycles ?: 1; + if (iocg->delay) + iocg->delay = iocg->delay >> nr_cycles ?: 1; + + iocg_kick_waitq(iocg, true, now); + + TRACE_IOCG_PATH(iocg_forgive_debt, iocg, now, usage_pct, + old_debt, iocg->abs_vdebt, + old_delay, iocg->delay); + + spin_unlock(&iocg->waitq.lock); + } +} + +static void ioc_timer_fn(struct timer_list *timer) +{ + struct ioc *ioc = container_of(timer, struct ioc, timer); + struct ioc_gq *iocg, *tiocg; + struct ioc_now now; + LIST_HEAD(surpluses); + int nr_debtors = 0, nr_shortages = 0, nr_lagging = 0; + u64 usage_us_sum = 0; + u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM]; + u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM]; + u32 missed_ppm[2], rq_wait_pct; + u64 period_vtime; + int prev_busy_level; + + /* how were the latencies during the period? */ + ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct); + + /* take care of active iocgs */ + spin_lock_irq(&ioc->lock); + + ioc_now(ioc, &now); + + period_vtime = now.vnow - ioc->period_at_vtime; + if (WARN_ON_ONCE(!period_vtime)) { + spin_unlock_irq(&ioc->lock); + return; + } + + /* + * Waiters determine the sleep durations based on the vrate they + * saw at the time of sleep. If vrate has increased, some waiters + * could be sleeping for too long. Wake up tardy waiters which + * should have woken up in the last period and expire idle iocgs. + */ + list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) { + if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt && + !iocg->delay && !iocg_is_idle(iocg)) + continue; + + spin_lock(&iocg->waitq.lock); + + /* flush wait and indebt stat deltas */ + if (iocg->wait_since) { + iocg->local_stat.wait_us += now.now - iocg->wait_since; + iocg->wait_since = now.now; + } + if (iocg->indebt_since) { + iocg->local_stat.indebt_us += + now.now - iocg->indebt_since; + iocg->indebt_since = now.now; + } + if (iocg->indelay_since) { + iocg->local_stat.indelay_us += + now.now - iocg->indelay_since; + iocg->indelay_since = now.now; + } + + if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt || + iocg->delay) { + /* might be oversleeping vtime / hweight changes, kick */ + iocg_kick_waitq(iocg, true, &now); + if (iocg->abs_vdebt || iocg->delay) + nr_debtors++; + } else if (iocg_is_idle(iocg)) { + /* no waiter and idle, deactivate */ + u64 vtime = atomic64_read(&iocg->vtime); + s64 excess; + + /* + * @iocg has been inactive for a full duration and will + * have a high budget. Account anything above target as + * error and throw away. On reactivation, it'll start + * with the target budget. + */ + excess = now.vnow - vtime - ioc->margins.target; + if (excess > 0) { + u32 old_hwi; + + current_hweight(iocg, NULL, &old_hwi); + ioc->vtime_err -= div64_u64(excess * old_hwi, + WEIGHT_ONE); + } + + __propagate_weights(iocg, 0, 0, false, &now); + list_del_init(&iocg->active_list); + } + + spin_unlock(&iocg->waitq.lock); + } + commit_weights(ioc); + + /* + * Wait and indebt stat are flushed above and the donation calculation + * below needs updated usage stat. Let's bring stat up-to-date. + */ + iocg_flush_stat(&ioc->active_iocgs, &now); + + /* calc usage and see whether some weights need to be moved around */ + list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { + u64 vdone, vtime, usage_us, usage_dur; + u32 usage, hw_active, hw_inuse; + + /* + * Collect unused and wind vtime closer to vnow to prevent + * iocgs from accumulating a large amount of budget. + */ + vdone = atomic64_read(&iocg->done_vtime); + vtime = atomic64_read(&iocg->vtime); + current_hweight(iocg, &hw_active, &hw_inuse); + + /* + * Latency QoS detection doesn't account for IOs which are + * in-flight for longer than a period. Detect them by + * comparing vdone against period start. If lagging behind + * IOs from past periods, don't increase vrate. + */ + if ((ppm_rthr != MILLION || ppm_wthr != MILLION) && + !atomic_read(&iocg_to_blkg(iocg)->use_delay) && + time_after64(vtime, vdone) && + time_after64(vtime, now.vnow - + MAX_LAGGING_PERIODS * period_vtime) && + time_before64(vdone, now.vnow - period_vtime)) + nr_lagging++; + + /* + * Determine absolute usage factoring in in-flight IOs to avoid + * high-latency completions appearing as idle. + */ + usage_us = iocg->usage_delta_us; + usage_us_sum += usage_us; + + if (vdone != vtime) { + u64 inflight_us = DIV64_U64_ROUND_UP( + cost_to_abs_cost(vtime - vdone, hw_inuse), + ioc->vtime_base_rate); + usage_us = max(usage_us, inflight_us); + } + + /* convert to hweight based usage ratio */ + if (time_after64(iocg->activated_at, ioc->period_at)) + usage_dur = max_t(u64, now.now - iocg->activated_at, 1); + else + usage_dur = max_t(u64, now.now - ioc->period_at, 1); + + usage = clamp_t(u32, + DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE, + usage_dur), + 1, WEIGHT_ONE); + + /* see whether there's surplus vtime */ + WARN_ON_ONCE(!list_empty(&iocg->surplus_list)); + if (hw_inuse < hw_active || + (!waitqueue_active(&iocg->waitq) && + time_before64(vtime, now.vnow - ioc->margins.low))) { + u32 hwa, old_hwi, hwm, new_hwi; + + /* + * Already donating or accumulated enough to start. + * Determine the donation amount. + */ + current_hweight(iocg, &hwa, &old_hwi); + hwm = current_hweight_max(iocg); + new_hwi = hweight_after_donation(iocg, old_hwi, hwm, + usage, &now); + /* + * Donation calculation assumes hweight_after_donation + * to be positive, a condition that a donor w/ hwa < 2 + * can't meet. Don't bother with donation if hwa is + * below 2. It's not gonna make a meaningful difference + * anyway. + */ + if (new_hwi < hwm && hwa >= 2) { + iocg->hweight_donating = hwa; + iocg->hweight_after_donation = new_hwi; + list_add(&iocg->surplus_list, &surpluses); + } else if (!iocg->abs_vdebt) { + /* + * @iocg doesn't have enough to donate. Reset + * its inuse to active. + * + * Don't reset debtors as their inuse's are + * owned by debt handling. This shouldn't affect + * donation calculuation in any meaningful way + * as @iocg doesn't have a meaningful amount of + * share anyway. + */ + TRACE_IOCG_PATH(inuse_shortage, iocg, &now, + iocg->inuse, iocg->active, + iocg->hweight_inuse, new_hwi); + + __propagate_weights(iocg, iocg->active, + iocg->active, true, &now); + nr_shortages++; + } + } else { + /* genuinely short on vtime */ + nr_shortages++; + } + } + + if (!list_empty(&surpluses) && nr_shortages) + transfer_surpluses(&surpluses, &now); + + commit_weights(ioc); + + /* surplus list should be dissolved after use */ + list_for_each_entry_safe(iocg, tiocg, &surpluses, surplus_list) + list_del_init(&iocg->surplus_list); + + /* + * If q is getting clogged or we're missing too much, we're issuing + * too much IO and should lower vtime rate. If we're not missing + * and experiencing shortages but not surpluses, we're too stingy + * and should increase vtime rate. + */ + prev_busy_level = ioc->busy_level; + if (rq_wait_pct > RQ_WAIT_BUSY_PCT || + missed_ppm[READ] > ppm_rthr || + missed_ppm[WRITE] > ppm_wthr) { + /* clearly missing QoS targets, slow down vrate */ + ioc->busy_level = max(ioc->busy_level, 0); + ioc->busy_level++; + } else if (rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 && + missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 && + missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) { + /* QoS targets are being met with >25% margin */ + if (nr_shortages) { + /* + * We're throttling while the device has spare + * capacity. If vrate was being slowed down, stop. + */ + ioc->busy_level = min(ioc->busy_level, 0); + + /* + * If there are IOs spanning multiple periods, wait + * them out before pushing the device harder. + */ + if (!nr_lagging) + ioc->busy_level--; + } else { + /* + * Nobody is being throttled and the users aren't + * issuing enough IOs to saturate the device. We + * simply don't know how close the device is to + * saturation. Coast. + */ + ioc->busy_level = 0; + } + } else { + /* inside the hysterisis margin, we're good */ + ioc->busy_level = 0; + } + + ioc->busy_level = clamp(ioc->busy_level, -1000, 1000); + + if (ioc->busy_level > 0 || (ioc->busy_level < 0 && !nr_lagging)) { + u64 vrate = ioc->vtime_base_rate; + u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max; + + /* rq_wait signal is always reliable, ignore user vrate_min */ + if (rq_wait_pct > RQ_WAIT_BUSY_PCT) + vrate_min = VRATE_MIN; + + /* + * If vrate is out of bounds, apply clamp gradually as the + * bounds can change abruptly. Otherwise, apply busy_level + * based adjustment. + */ + if (vrate < vrate_min) { + vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT), + 100); + vrate = min(vrate, vrate_min); + } else if (vrate > vrate_max) { + vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT), + 100); + vrate = max(vrate, vrate_max); + } else { + int idx = min_t(int, abs(ioc->busy_level), + ARRAY_SIZE(vrate_adj_pct) - 1); + u32 adj_pct = vrate_adj_pct[idx]; + + if (ioc->busy_level > 0) + adj_pct = 100 - adj_pct; + else + adj_pct = 100 + adj_pct; + + vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100), + vrate_min, vrate_max); + } + + trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct, + nr_lagging, nr_shortages); + + ioc->vtime_base_rate = vrate; + ioc_refresh_margins(ioc); + } else if (ioc->busy_level != prev_busy_level || nr_lagging) { + trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate), + missed_ppm, rq_wait_pct, nr_lagging, + nr_shortages); + } + + ioc_refresh_params(ioc, false); + + ioc_forgive_debts(ioc, usage_us_sum, nr_debtors, &now); + + /* + * This period is done. Move onto the next one. If nothing's + * going on with the device, stop the timer. + */ + atomic64_inc(&ioc->cur_period); + + if (ioc->running != IOC_STOP) { + if (!list_empty(&ioc->active_iocgs)) { + ioc_start_period(ioc, &now); + } else { + ioc->busy_level = 0; + ioc->vtime_err = 0; + ioc->running = IOC_IDLE; + } + + ioc_refresh_vrate(ioc, &now); + } + + spin_unlock_irq(&ioc->lock); +} + +static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime, + u64 abs_cost, struct ioc_now *now) +{ + struct ioc *ioc = iocg->ioc; + struct ioc_margins *margins = &ioc->margins; + u32 __maybe_unused old_inuse = iocg->inuse, __maybe_unused old_hwi; + u32 hwi, adj_step; + s64 margin; + u64 cost, new_inuse; + unsigned long flags; + + current_hweight(iocg, NULL, &hwi); + old_hwi = hwi; + cost = abs_cost_to_cost(abs_cost, hwi); + margin = now->vnow - vtime - cost; + + /* debt handling owns inuse for debtors */ + if (iocg->abs_vdebt) + return cost; + + /* + * We only increase inuse during period and do so iff the margin has + * deteriorated since the previous adjustment. + */ + if (margin >= iocg->saved_margin || margin >= margins->low || + iocg->inuse == iocg->active) + return cost; + + spin_lock_irqsave(&ioc->lock, flags); + + /* we own inuse only when @iocg is in the normal active state */ + if (iocg->abs_vdebt || list_empty(&iocg->active_list)) { + spin_unlock_irqrestore(&ioc->lock, flags); + return cost; + } + + /* + * Bump up inuse till @abs_cost fits in the existing budget. + * adj_step must be determined after acquiring ioc->lock - we might + * have raced and lost to another thread for activation and could + * be reading 0 iocg->active before ioc->lock which will lead to + * infinite loop. + */ + new_inuse = iocg->inuse; + adj_step = DIV_ROUND_UP(iocg->active * INUSE_ADJ_STEP_PCT, 100); + do { + new_inuse = new_inuse + adj_step; + propagate_weights(iocg, iocg->active, new_inuse, true, now); + current_hweight(iocg, NULL, &hwi); + cost = abs_cost_to_cost(abs_cost, hwi); + } while (time_after64(vtime + cost, now->vnow) && + iocg->inuse != iocg->active); + + spin_unlock_irqrestore(&ioc->lock, flags); + + TRACE_IOCG_PATH(inuse_adjust, iocg, now, + old_inuse, iocg->inuse, old_hwi, hwi); + + return cost; +} + +static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg, + bool is_merge, u64 *costp) +{ + struct ioc *ioc = iocg->ioc; + u64 coef_seqio, coef_randio, coef_page; + u64 pages = max_t(u64, bio_sectors(bio) >> IOC_SECT_TO_PAGE_SHIFT, 1); + u64 seek_pages = 0; + u64 cost = 0; + + switch (bio_op(bio)) { + case REQ_OP_READ: + coef_seqio = ioc->params.lcoefs[LCOEF_RSEQIO]; + coef_randio = ioc->params.lcoefs[LCOEF_RRANDIO]; + coef_page = ioc->params.lcoefs[LCOEF_RPAGE]; + break; + case REQ_OP_WRITE: + coef_seqio = ioc->params.lcoefs[LCOEF_WSEQIO]; + coef_randio = ioc->params.lcoefs[LCOEF_WRANDIO]; + coef_page = ioc->params.lcoefs[LCOEF_WPAGE]; + break; + default: + goto out; + } + + if (iocg->cursor) { + seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor); + seek_pages >>= IOC_SECT_TO_PAGE_SHIFT; + } + + if (!is_merge) { + if (seek_pages > LCOEF_RANDIO_PAGES) { + cost += coef_randio; + } else { + cost += coef_seqio; + } + } + cost += pages * coef_page; +out: + *costp = cost; +} + +static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge) +{ + u64 cost; + + calc_vtime_cost_builtin(bio, iocg, is_merge, &cost); + return cost; +} + +static void calc_size_vtime_cost_builtin(struct request *rq, struct ioc *ioc, + u64 *costp) +{ + unsigned int pages = blk_rq_stats_sectors(rq) >> IOC_SECT_TO_PAGE_SHIFT; + + switch (req_op(rq)) { + case REQ_OP_READ: + *costp = pages * ioc->params.lcoefs[LCOEF_RPAGE]; + break; + case REQ_OP_WRITE: + *costp = pages * ioc->params.lcoefs[LCOEF_WPAGE]; + break; + default: + *costp = 0; + } +} + +static u64 calc_size_vtime_cost(struct request *rq, struct ioc *ioc) +{ + u64 cost; + + calc_size_vtime_cost_builtin(rq, ioc, &cost); + return cost; +} + +static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) +{ + struct blkcg_gq *blkg = bio->bi_blkg; + struct ioc *ioc = rqos_to_ioc(rqos); + struct ioc_gq *iocg = blkg_to_iocg(blkg); + struct ioc_now now; + struct iocg_wait wait; + u64 abs_cost, cost, vtime; + bool use_debt, ioc_locked; + unsigned long flags; + + /* bypass IOs if disabled, still initializing, or for root cgroup */ + if (!ioc->enabled || !iocg || !iocg->level) + return; + + /* calculate the absolute vtime cost */ + abs_cost = calc_vtime_cost(bio, iocg, false); + if (!abs_cost) + return; + + if (!iocg_activate(iocg, &now)) + return; + + iocg->cursor = bio_end_sector(bio); + vtime = atomic64_read(&iocg->vtime); + cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now); + + /* + * If no one's waiting and within budget, issue right away. The + * tests are racy but the races aren't systemic - we only miss once + * in a while which is fine. + */ + if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt && + time_before_eq64(vtime + cost, now.vnow)) { + iocg_commit_bio(iocg, bio, abs_cost, cost); + return; + } + + /* + * We're over budget. This can be handled in two ways. IOs which may + * cause priority inversions are punted to @ioc->aux_iocg and charged as + * debt. Otherwise, the issuer is blocked on @iocg->waitq. Debt handling + * requires @ioc->lock, waitq handling @iocg->waitq.lock. Determine + * whether debt handling is needed and acquire locks accordingly. + */ + use_debt = bio_issue_as_root_blkg(bio) || fatal_signal_pending(current); + ioc_locked = use_debt || READ_ONCE(iocg->abs_vdebt); +retry_lock: + iocg_lock(iocg, ioc_locked, &flags); + + /* + * @iocg must stay activated for debt and waitq handling. Deactivation + * is synchronized against both ioc->lock and waitq.lock and we won't + * get deactivated as long as we're waiting or has debt, so we're good + * if we're activated here. In the unlikely cases that we aren't, just + * issue the IO. + */ + if (unlikely(list_empty(&iocg->active_list))) { + iocg_unlock(iocg, ioc_locked, &flags); + iocg_commit_bio(iocg, bio, abs_cost, cost); + return; + } + + /* + * We're over budget. If @bio has to be issued regardless, remember + * the abs_cost instead of advancing vtime. iocg_kick_waitq() will pay + * off the debt before waking more IOs. + * + * This way, the debt is continuously paid off each period with the + * actual budget available to the cgroup. If we just wound vtime, we + * would incorrectly use the current hw_inuse for the entire amount + * which, for example, can lead to the cgroup staying blocked for a + * long time even with substantially raised hw_inuse. + * + * An iocg with vdebt should stay online so that the timer can keep + * deducting its vdebt and [de]activate use_delay mechanism + * accordingly. We don't want to race against the timer trying to + * clear them and leave @iocg inactive w/ dangling use_delay heavily + * penalizing the cgroup and its descendants. + */ + if (use_debt) { + iocg_incur_debt(iocg, abs_cost, &now); + if (iocg_kick_delay(iocg, &now)) + blkcg_schedule_throttle(rqos->q, + (bio->bi_opf & REQ_SWAP) == REQ_SWAP); + iocg_unlock(iocg, ioc_locked, &flags); + return; + } + + /* guarantee that iocgs w/ waiters have maximum inuse */ + if (!iocg->abs_vdebt && iocg->inuse != iocg->active) { + if (!ioc_locked) { + iocg_unlock(iocg, false, &flags); + ioc_locked = true; + goto retry_lock; + } + propagate_weights(iocg, iocg->active, iocg->active, true, + &now); + } + + /* + * Append self to the waitq and schedule the wakeup timer if we're + * the first waiter. The timer duration is calculated based on the + * current vrate. vtime and hweight changes can make it too short + * or too long. Each wait entry records the absolute cost it's + * waiting for to allow re-evaluation using a custom wait entry. + * + * If too short, the timer simply reschedules itself. If too long, + * the period timer will notice and trigger wakeups. + * + * All waiters are on iocg->waitq and the wait states are + * synchronized using waitq.lock. + */ + init_waitqueue_func_entry(&wait.wait, iocg_wake_fn); + wait.wait.private = current; + wait.bio = bio; + wait.abs_cost = abs_cost; + wait.committed = false; /* will be set true by waker */ + + __add_wait_queue_entry_tail(&iocg->waitq, &wait.wait); + iocg_kick_waitq(iocg, ioc_locked, &now); + + iocg_unlock(iocg, ioc_locked, &flags); + + while (true) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (wait.committed) + break; + io_schedule(); + } + + /* waker already committed us, proceed */ + finish_wait(&iocg->waitq, &wait.wait); +} + +static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq, + struct bio *bio) +{ + struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg); + struct ioc *ioc = rqos_to_ioc(rqos); + sector_t bio_end = bio_end_sector(bio); + struct ioc_now now; + u64 vtime, abs_cost, cost; + unsigned long flags; + + /* bypass if disabled, still initializing, or for root cgroup */ + if (!ioc->enabled || !iocg || !iocg->level) + return; + + abs_cost = calc_vtime_cost(bio, iocg, true); + if (!abs_cost) + return; + + ioc_now(ioc, &now); + + vtime = atomic64_read(&iocg->vtime); + cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now); + + /* update cursor if backmerging into the request at the cursor */ + if (blk_rq_pos(rq) < bio_end && + blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor) + iocg->cursor = bio_end; + + /* + * Charge if there's enough vtime budget and the existing request has + * cost assigned. + */ + if (rq->bio && rq->bio->bi_iocost_cost && + time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) { + iocg_commit_bio(iocg, bio, abs_cost, cost); + return; + } + + /* + * Otherwise, account it as debt if @iocg is online, which it should + * be for the vast majority of cases. See debt handling in + * ioc_rqos_throttle() for details. + */ + spin_lock_irqsave(&ioc->lock, flags); + spin_lock(&iocg->waitq.lock); + + if (likely(!list_empty(&iocg->active_list))) { + iocg_incur_debt(iocg, abs_cost, &now); + if (iocg_kick_delay(iocg, &now)) + blkcg_schedule_throttle(rqos->q, + (bio->bi_opf & REQ_SWAP) == REQ_SWAP); + } else { + iocg_commit_bio(iocg, bio, abs_cost, cost); + } + + spin_unlock(&iocg->waitq.lock); + spin_unlock_irqrestore(&ioc->lock, flags); +} + +static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio) +{ + struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg); + + if (iocg && bio->bi_iocost_cost) + atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime); +} + +static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq) +{ + struct ioc *ioc = rqos_to_ioc(rqos); + struct ioc_pcpu_stat *ccs; + u64 on_q_ns, rq_wait_ns, size_nsec; + int pidx, rw; + + if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns) + return; + + switch (req_op(rq) & REQ_OP_MASK) { + case REQ_OP_READ: + pidx = QOS_RLAT; + rw = READ; + break; + case REQ_OP_WRITE: + pidx = QOS_WLAT; + rw = WRITE; + break; + default: + return; + } + + on_q_ns = ktime_get_ns() - rq->alloc_time_ns; + rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns; + size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC); + + ccs = get_cpu_ptr(ioc->pcpu_stat); + + if (on_q_ns <= size_nsec || + on_q_ns - size_nsec <= ioc->params.qos[pidx] * NSEC_PER_USEC) + local_inc(&ccs->missed[rw].nr_met); + else + local_inc(&ccs->missed[rw].nr_missed); + + local64_add(rq_wait_ns, &ccs->rq_wait_ns); + + put_cpu_ptr(ccs); +} + +static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos) +{ + struct ioc *ioc = rqos_to_ioc(rqos); + + spin_lock_irq(&ioc->lock); + ioc_refresh_params(ioc, false); + spin_unlock_irq(&ioc->lock); +} + +static void ioc_rqos_exit(struct rq_qos *rqos) +{ + struct ioc *ioc = rqos_to_ioc(rqos); + + blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost); + + spin_lock_irq(&ioc->lock); + ioc->running = IOC_STOP; + spin_unlock_irq(&ioc->lock); + + del_timer_sync(&ioc->timer); + free_percpu(ioc->pcpu_stat); + kfree(ioc); +} + +static struct rq_qos_ops ioc_rqos_ops = { + .throttle = ioc_rqos_throttle, + .merge = ioc_rqos_merge, + .done_bio = ioc_rqos_done_bio, + .done = ioc_rqos_done, + .queue_depth_changed = ioc_rqos_queue_depth_changed, + .exit = ioc_rqos_exit, +}; + +static int blk_iocost_init(struct request_queue *q) +{ + struct ioc *ioc; + struct rq_qos *rqos; + int i, cpu, ret; + + ioc = kzalloc(sizeof(*ioc), GFP_KERNEL); + if (!ioc) + return -ENOMEM; + + ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat); + if (!ioc->pcpu_stat) { + kfree(ioc); + return -ENOMEM; + } + + for_each_possible_cpu(cpu) { + struct ioc_pcpu_stat *ccs = per_cpu_ptr(ioc->pcpu_stat, cpu); + + for (i = 0; i < ARRAY_SIZE(ccs->missed); i++) { + local_set(&ccs->missed[i].nr_met, 0); + local_set(&ccs->missed[i].nr_missed, 0); + } + local64_set(&ccs->rq_wait_ns, 0); + } + + rqos = &ioc->rqos; + rqos->id = RQ_QOS_COST; + rqos->ops = &ioc_rqos_ops; + rqos->q = q; + + spin_lock_init(&ioc->lock); + timer_setup(&ioc->timer, ioc_timer_fn, 0); + INIT_LIST_HEAD(&ioc->active_iocgs); + + ioc->running = IOC_IDLE; + ioc->vtime_base_rate = VTIME_PER_USEC; + atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC); + seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock); + ioc->period_at = ktime_to_us(ktime_get()); + atomic64_set(&ioc->cur_period, 0); + atomic_set(&ioc->hweight_gen, 0); + + spin_lock_irq(&ioc->lock); + ioc->autop_idx = AUTOP_INVALID; + ioc_refresh_params(ioc, true); + spin_unlock_irq(&ioc->lock); + + /* + * rqos must be added before activation to allow iocg_pd_init() to + * lookup the ioc from q. This means that the rqos methods may get + * called before policy activation completion, can't assume that the + * target bio has an iocg associated and need to test for NULL iocg. + */ + rq_qos_add(q, rqos); + ret = blkcg_activate_policy(q, &blkcg_policy_iocost); + if (ret) { + rq_qos_del(q, rqos); + free_percpu(ioc->pcpu_stat); + kfree(ioc); + return ret; + } + return 0; +} + +static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp) +{ + struct ioc_cgrp *iocc; + + iocc = kzalloc(sizeof(struct ioc_cgrp), gfp); + if (!iocc) + return NULL; + + iocc->dfl_weight = CGROUP_WEIGHT_DFL * WEIGHT_ONE; + return &iocc->cpd; +} + +static void ioc_cpd_free(struct blkcg_policy_data *cpd) +{ + kfree(container_of(cpd, struct ioc_cgrp, cpd)); +} + +static struct blkg_policy_data *ioc_pd_alloc(gfp_t gfp, struct request_queue *q, + struct blkcg *blkcg) +{ + int levels = blkcg->css.cgroup->level + 1; + struct ioc_gq *iocg; + + iocg = kzalloc_node(struct_size(iocg, ancestors, levels), gfp, q->node); + if (!iocg) + return NULL; + + iocg->pcpu_stat = alloc_percpu_gfp(struct iocg_pcpu_stat, gfp); + if (!iocg->pcpu_stat) { + kfree(iocg); + return NULL; + } + + return &iocg->pd; +} + +static void ioc_pd_init(struct blkg_policy_data *pd) +{ + struct ioc_gq *iocg = pd_to_iocg(pd); + struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd); + struct ioc *ioc = q_to_ioc(blkg->q); + struct ioc_now now; + struct blkcg_gq *tblkg; + unsigned long flags; + + ioc_now(ioc, &now); + + iocg->ioc = ioc; + atomic64_set(&iocg->vtime, now.vnow); + atomic64_set(&iocg->done_vtime, now.vnow); + atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period)); + INIT_LIST_HEAD(&iocg->active_list); + INIT_LIST_HEAD(&iocg->walk_list); + INIT_LIST_HEAD(&iocg->surplus_list); + iocg->hweight_active = WEIGHT_ONE; + iocg->hweight_inuse = WEIGHT_ONE; + + init_waitqueue_head(&iocg->waitq); + hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + iocg->waitq_timer.function = iocg_waitq_timer_fn; + + iocg->level = blkg->blkcg->css.cgroup->level; + + for (tblkg = blkg; tblkg; tblkg = tblkg->parent) { + struct ioc_gq *tiocg = blkg_to_iocg(tblkg); + iocg->ancestors[tiocg->level] = tiocg; + } + + spin_lock_irqsave(&ioc->lock, flags); + weight_updated(iocg, &now); + spin_unlock_irqrestore(&ioc->lock, flags); +} + +static void ioc_pd_free(struct blkg_policy_data *pd) +{ + struct ioc_gq *iocg = pd_to_iocg(pd); + struct ioc *ioc = iocg->ioc; + unsigned long flags; + + if (ioc) { + spin_lock_irqsave(&ioc->lock, flags); + + if (!list_empty(&iocg->active_list)) { + struct ioc_now now; + + ioc_now(ioc, &now); + propagate_weights(iocg, 0, 0, false, &now); + list_del_init(&iocg->active_list); + } + + WARN_ON_ONCE(!list_empty(&iocg->walk_list)); + WARN_ON_ONCE(!list_empty(&iocg->surplus_list)); + + spin_unlock_irqrestore(&ioc->lock, flags); + + hrtimer_cancel(&iocg->waitq_timer); + } + free_percpu(iocg->pcpu_stat); + kfree(iocg); +} + +static size_t ioc_pd_stat(struct blkg_policy_data *pd, char *buf, size_t size) +{ + struct ioc_gq *iocg = pd_to_iocg(pd); + struct ioc *ioc = iocg->ioc; + size_t pos = 0; + + if (!ioc->enabled) + return 0; + + if (iocg->level == 0) { + unsigned vp10k = DIV64_U64_ROUND_CLOSEST( + ioc->vtime_base_rate * 10000, + VTIME_PER_USEC); + pos += scnprintf(buf + pos, size - pos, " cost.vrate=%u.%02u", + vp10k / 100, vp10k % 100); + } + + pos += scnprintf(buf + pos, size - pos, " cost.usage=%llu", + iocg->last_stat.usage_us); + + if (blkcg_debug_stats) + pos += scnprintf(buf + pos, size - pos, + " cost.wait=%llu cost.indebt=%llu cost.indelay=%llu", + iocg->last_stat.wait_us, + iocg->last_stat.indebt_us, + iocg->last_stat.indelay_us); + + return pos; +} + +static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd, + int off) +{ + const char *dname = blkg_dev_name(pd->blkg); + struct ioc_gq *iocg = pd_to_iocg(pd); + + if (dname && iocg->cfg_weight) + seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight / WEIGHT_ONE); + return 0; +} + + +static int ioc_weight_show(struct seq_file *sf, void *v) +{ + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg); + + seq_printf(sf, "default %u\n", iocc->dfl_weight / WEIGHT_ONE); + blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill, + &blkcg_policy_iocost, seq_cft(sf)->private, false); + return 0; +} + +static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct blkcg *blkcg = css_to_blkcg(of_css(of)); + struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg); + struct blkg_conf_ctx ctx; + struct ioc_now now; + struct ioc_gq *iocg; + u32 v; + int ret; + + if (!strchr(buf, ':')) { + struct blkcg_gq *blkg; + + if (!sscanf(buf, "default %u", &v) && !sscanf(buf, "%u", &v)) + return -EINVAL; + + if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX) + return -EINVAL; + + spin_lock_irq(&blkcg->lock); + iocc->dfl_weight = v * WEIGHT_ONE; + hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { + struct ioc_gq *iocg = blkg_to_iocg(blkg); + + if (iocg) { + spin_lock(&iocg->ioc->lock); + ioc_now(iocg->ioc, &now); + weight_updated(iocg, &now); + spin_unlock(&iocg->ioc->lock); + } + } + spin_unlock_irq(&blkcg->lock); + + return nbytes; + } + + ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, buf, &ctx); + if (ret) + return ret; + + iocg = blkg_to_iocg(ctx.blkg); + + if (!strncmp(ctx.body, "default", 7)) { + v = 0; + } else { + if (!sscanf(ctx.body, "%u", &v)) + goto einval; + if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX) + goto einval; + } + + spin_lock(&iocg->ioc->lock); + iocg->cfg_weight = v * WEIGHT_ONE; + ioc_now(iocg->ioc, &now); + weight_updated(iocg, &now); + spin_unlock(&iocg->ioc->lock); + + blkg_conf_finish(&ctx); + return nbytes; + +einval: + blkg_conf_finish(&ctx); + return -EINVAL; +} + +static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd, + int off) +{ + const char *dname = blkg_dev_name(pd->blkg); + struct ioc *ioc = pd_to_iocg(pd)->ioc; + + if (!dname) + return 0; + + seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n", + dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto", + ioc->params.qos[QOS_RPPM] / 10000, + ioc->params.qos[QOS_RPPM] % 10000 / 100, + ioc->params.qos[QOS_RLAT], + ioc->params.qos[QOS_WPPM] / 10000, + ioc->params.qos[QOS_WPPM] % 10000 / 100, + ioc->params.qos[QOS_WLAT], + ioc->params.qos[QOS_MIN] / 10000, + ioc->params.qos[QOS_MIN] % 10000 / 100, + ioc->params.qos[QOS_MAX] / 10000, + ioc->params.qos[QOS_MAX] % 10000 / 100); + return 0; +} + +static int ioc_qos_show(struct seq_file *sf, void *v) +{ + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + + blkcg_print_blkgs(sf, blkcg, ioc_qos_prfill, + &blkcg_policy_iocost, seq_cft(sf)->private, false); + return 0; +} + +static const match_table_t qos_ctrl_tokens = { + { QOS_ENABLE, "enable=%u" }, + { QOS_CTRL, "ctrl=%s" }, + { NR_QOS_CTRL_PARAMS, NULL }, +}; + +static const match_table_t qos_tokens = { + { QOS_RPPM, "rpct=%s" }, + { QOS_RLAT, "rlat=%u" }, + { QOS_WPPM, "wpct=%s" }, + { QOS_WLAT, "wlat=%u" }, + { QOS_MIN, "min=%s" }, + { QOS_MAX, "max=%s" }, + { NR_QOS_PARAMS, NULL }, +}; + +static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, + size_t nbytes, loff_t off) +{ + struct gendisk *disk; + struct ioc *ioc; + u32 qos[NR_QOS_PARAMS]; + bool enable, user; + char *p; + int ret; + + disk = blkcg_conf_get_disk(&input); + if (IS_ERR(disk)) + return PTR_ERR(disk); + + ioc = q_to_ioc(disk->queue); + if (!ioc) { + ret = blk_iocost_init(disk->queue); + if (ret) + goto err; + ioc = q_to_ioc(disk->queue); + } + + spin_lock_irq(&ioc->lock); + memcpy(qos, ioc->params.qos, sizeof(qos)); + enable = ioc->enabled; + user = ioc->user_qos_params; + spin_unlock_irq(&ioc->lock); + + while ((p = strsep(&input, " \t\n"))) { + substring_t args[MAX_OPT_ARGS]; + char buf[32]; + int tok; + s64 v; + + if (!*p) + continue; + + switch (match_token(p, qos_ctrl_tokens, args)) { + case QOS_ENABLE: + match_u64(&args[0], &v); + enable = v; + continue; + case QOS_CTRL: + match_strlcpy(buf, &args[0], sizeof(buf)); + if (!strcmp(buf, "auto")) + user = false; + else if (!strcmp(buf, "user")) + user = true; + else + goto einval; + continue; + } + + tok = match_token(p, qos_tokens, args); + switch (tok) { + case QOS_RPPM: + case QOS_WPPM: + if (match_strlcpy(buf, &args[0], sizeof(buf)) >= + sizeof(buf)) + goto einval; + if (cgroup_parse_float(buf, 2, &v)) + goto einval; + if (v < 0 || v > 10000) + goto einval; + qos[tok] = v * 100; + break; + case QOS_RLAT: + case QOS_WLAT: + if (match_u64(&args[0], &v)) + goto einval; + qos[tok] = v; + break; + case QOS_MIN: + case QOS_MAX: + if (match_strlcpy(buf, &args[0], sizeof(buf)) >= + sizeof(buf)) + goto einval; + if (cgroup_parse_float(buf, 2, &v)) + goto einval; + if (v < 0) + goto einval; + qos[tok] = clamp_t(s64, v * 100, + VRATE_MIN_PPM, VRATE_MAX_PPM); + break; + default: + goto einval; + } + user = true; + } + + if (qos[QOS_MIN] > qos[QOS_MAX]) + goto einval; + + spin_lock_irq(&ioc->lock); + + if (enable) { + blk_stat_enable_accounting(ioc->rqos.q); + blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q); + ioc->enabled = true; + } else { + blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q); + ioc->enabled = false; + } + + if (user) { + memcpy(ioc->params.qos, qos, sizeof(qos)); + ioc->user_qos_params = true; + } else { + ioc->user_qos_params = false; + } + + ioc_refresh_params(ioc, true); + spin_unlock_irq(&ioc->lock); + + put_disk_and_module(disk); + return nbytes; +einval: + ret = -EINVAL; +err: + put_disk_and_module(disk); + return ret; +} + +static u64 ioc_cost_model_prfill(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + const char *dname = blkg_dev_name(pd->blkg); + struct ioc *ioc = pd_to_iocg(pd)->ioc; + u64 *u = ioc->params.i_lcoefs; + + if (!dname) + return 0; + + seq_printf(sf, "%s ctrl=%s model=linear " + "rbps=%llu rseqiops=%llu rrandiops=%llu " + "wbps=%llu wseqiops=%llu wrandiops=%llu\n", + dname, ioc->user_cost_model ? "user" : "auto", + u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS], + u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]); + return 0; +} + +static int ioc_cost_model_show(struct seq_file *sf, void *v) +{ + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + + blkcg_print_blkgs(sf, blkcg, ioc_cost_model_prfill, + &blkcg_policy_iocost, seq_cft(sf)->private, false); + return 0; +} + +static const match_table_t cost_ctrl_tokens = { + { COST_CTRL, "ctrl=%s" }, + { COST_MODEL, "model=%s" }, + { NR_COST_CTRL_PARAMS, NULL }, +}; + +static const match_table_t i_lcoef_tokens = { + { I_LCOEF_RBPS, "rbps=%u" }, + { I_LCOEF_RSEQIOPS, "rseqiops=%u" }, + { I_LCOEF_RRANDIOPS, "rrandiops=%u" }, + { I_LCOEF_WBPS, "wbps=%u" }, + { I_LCOEF_WSEQIOPS, "wseqiops=%u" }, + { I_LCOEF_WRANDIOPS, "wrandiops=%u" }, + { NR_I_LCOEFS, NULL }, +}; + +static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, + size_t nbytes, loff_t off) +{ + struct gendisk *disk; + struct ioc *ioc; + u64 u[NR_I_LCOEFS]; + bool user; + char *p; + int ret; + + disk = blkcg_conf_get_disk(&input); + if (IS_ERR(disk)) + return PTR_ERR(disk); + + ioc = q_to_ioc(disk->queue); + if (!ioc) { + ret = blk_iocost_init(disk->queue); + if (ret) + goto err; + ioc = q_to_ioc(disk->queue); + } + + spin_lock_irq(&ioc->lock); + memcpy(u, ioc->params.i_lcoefs, sizeof(u)); + user = ioc->user_cost_model; + spin_unlock_irq(&ioc->lock); + + while ((p = strsep(&input, " \t\n"))) { + substring_t args[MAX_OPT_ARGS]; + char buf[32]; + int tok; + u64 v; + + if (!*p) + continue; + + switch (match_token(p, cost_ctrl_tokens, args)) { + case COST_CTRL: + match_strlcpy(buf, &args[0], sizeof(buf)); + if (!strcmp(buf, "auto")) + user = false; + else if (!strcmp(buf, "user")) + user = true; + else + goto einval; + continue; + case COST_MODEL: + match_strlcpy(buf, &args[0], sizeof(buf)); + if (strcmp(buf, "linear")) + goto einval; + continue; + } + + tok = match_token(p, i_lcoef_tokens, args); + if (tok == NR_I_LCOEFS) + goto einval; + if (match_u64(&args[0], &v)) + goto einval; + u[tok] = v; + user = true; + } + + spin_lock_irq(&ioc->lock); + if (user) { + memcpy(ioc->params.i_lcoefs, u, sizeof(u)); + ioc->user_cost_model = true; + } else { + ioc->user_cost_model = false; + } + ioc_refresh_params(ioc, true); + spin_unlock_irq(&ioc->lock); + + put_disk_and_module(disk); + return nbytes; + +einval: + ret = -EINVAL; +err: + put_disk_and_module(disk); + return ret; +} + +static struct cftype ioc_files[] = { + { + .name = "weight", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = ioc_weight_show, + .write = ioc_weight_write, + }, + { + .name = "cost.qos", + .flags = CFTYPE_ONLY_ON_ROOT, + .seq_show = ioc_qos_show, + .write = ioc_qos_write, + }, + { + .name = "cost.model", + .flags = CFTYPE_ONLY_ON_ROOT, + .seq_show = ioc_cost_model_show, + .write = ioc_cost_model_write, + }, + {} +}; + +static struct blkcg_policy blkcg_policy_iocost = { + .dfl_cftypes = ioc_files, + .cpd_alloc_fn = ioc_cpd_alloc, + .cpd_free_fn = ioc_cpd_free, + .pd_alloc_fn = ioc_pd_alloc, + .pd_init_fn = ioc_pd_init, + .pd_free_fn = ioc_pd_free, + .pd_stat_fn = ioc_pd_stat, +}; + +static int __init ioc_init(void) +{ + return blkcg_policy_register(&blkcg_policy_iocost); +} + +static void __exit ioc_exit(void) +{ + blkcg_policy_unregister(&blkcg_policy_iocost); +} + +module_init(ioc_init); +module_exit(ioc_exit); diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c new file mode 100644 index 000000000..74511a060 --- /dev/null +++ b/block/blk-iolatency.c @@ -0,0 +1,1063 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Block rq-qos base io controller + * + * This works similar to wbt with a few exceptions + * + * - It's bio based, so the latency covers the whole block layer in addition to + * the actual io. + * - We will throttle all IO that comes in here if we need to. + * - We use the mean latency over the 100ms window. This is because writes can + * be particularly fast, which could give us a false sense of the impact of + * other workloads on our protected workload. + * - By default there's no throttling, we set the queue_depth to UINT_MAX so + * that we can have as many outstanding bio's as we're allowed to. Only at + * throttle time do we pay attention to the actual queue depth. + * + * The hierarchy works like the cpu controller does, we track the latency at + * every configured node, and each configured node has it's own independent + * queue depth. This means that we only care about our latency targets at the + * peer level. Some group at the bottom of the hierarchy isn't going to affect + * a group at the end of some other path if we're only configred at leaf level. + * + * Consider the following + * + * root blkg + * / \ + * fast (target=5ms) slow (target=10ms) + * / \ / \ + * a b normal(15ms) unloved + * + * "a" and "b" have no target, but their combined io under "fast" cannot exceed + * an average latency of 5ms. If it does then we will throttle the "slow" + * group. In the case of "normal", if it exceeds its 15ms target, we will + * throttle "unloved", but nobody else. + * + * In this example "fast", "slow", and "normal" will be the only groups actually + * accounting their io latencies. We have to walk up the heirarchy to the root + * on every submit and complete so we can do the appropriate stat recording and + * adjust the queue depth of ourselves if needed. + * + * There are 2 ways we throttle IO. + * + * 1) Queue depth throttling. As we throttle down we will adjust the maximum + * number of IO's we're allowed to have in flight. This starts at (u64)-1 down + * to 1. If the group is only ever submitting IO for itself then this is the + * only way we throttle. + * + * 2) Induced delay throttling. This is for the case that a group is generating + * IO that has to be issued by the root cg to avoid priority inversion. So think + * REQ_META or REQ_SWAP. If we are already at qd == 1 and we're getting a lot + * of work done for us on behalf of the root cg and are being asked to scale + * down more then we induce a latency at userspace return. We accumulate the + * total amount of time we need to be punished by doing + * + * total_time += min_lat_nsec - actual_io_completion + * + * and then at throttle time will do + * + * throttle_time = min(total_time, NSEC_PER_SEC) + * + * This induced delay will throttle back the activity that is generating the + * root cg issued io's, wethere that's some metadata intensive operation or the + * group is using so much memory that it is pushing us into swap. + * + * Copyright (C) 2018 Josef Bacik + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "blk-rq-qos.h" +#include "blk-stat.h" +#include "blk.h" + +#define DEFAULT_SCALE_COOKIE 1000000U + +static struct blkcg_policy blkcg_policy_iolatency; +struct iolatency_grp; + +struct blk_iolatency { + struct rq_qos rqos; + struct timer_list timer; + + /* + * ->enabled is the master enable switch gating the throttling logic and + * inflight tracking. The number of cgroups which have iolat enabled is + * tracked in ->enable_cnt, and ->enable is flipped on/off accordingly + * from ->enable_work with the request_queue frozen. For details, See + * blkiolatency_enable_work_fn(). + */ + bool enabled; + atomic_t enable_cnt; + struct work_struct enable_work; +}; + +static inline struct blk_iolatency *BLKIOLATENCY(struct rq_qos *rqos) +{ + return container_of(rqos, struct blk_iolatency, rqos); +} + +struct child_latency_info { + spinlock_t lock; + + /* Last time we adjusted the scale of everybody. */ + u64 last_scale_event; + + /* The latency that we missed. */ + u64 scale_lat; + + /* Total io's from all of our children for the last summation. */ + u64 nr_samples; + + /* The guy who actually changed the latency numbers. */ + struct iolatency_grp *scale_grp; + + /* Cookie to tell if we need to scale up or down. */ + atomic_t scale_cookie; +}; + +struct percentile_stats { + u64 total; + u64 missed; +}; + +struct latency_stat { + union { + struct percentile_stats ps; + struct blk_rq_stat rqs; + }; +}; + +struct iolatency_grp { + struct blkg_policy_data pd; + struct latency_stat __percpu *stats; + struct latency_stat cur_stat; + struct blk_iolatency *blkiolat; + struct rq_depth rq_depth; + struct rq_wait rq_wait; + atomic64_t window_start; + atomic_t scale_cookie; + u64 min_lat_nsec; + u64 cur_win_nsec; + + /* total running average of our io latency. */ + u64 lat_avg; + + /* Our current number of IO's for the last summation. */ + u64 nr_samples; + + bool ssd; + struct child_latency_info child_lat; +}; + +#define BLKIOLATENCY_MIN_WIN_SIZE (100 * NSEC_PER_MSEC) +#define BLKIOLATENCY_MAX_WIN_SIZE NSEC_PER_SEC +/* + * These are the constants used to fake the fixed-point moving average + * calculation just like load average. The call to calc_load() folds + * (FIXED_1 (2048) - exp_factor) * new_sample into lat_avg. The sampling + * window size is bucketed to try to approximately calculate average + * latency such that 1/exp (decay rate) is [1 min, 2.5 min) when windows + * elapse immediately. Note, windows only elapse with IO activity. Idle + * periods extend the most recent window. + */ +#define BLKIOLATENCY_NR_EXP_FACTORS 5 +#define BLKIOLATENCY_EXP_BUCKET_SIZE (BLKIOLATENCY_MAX_WIN_SIZE / \ + (BLKIOLATENCY_NR_EXP_FACTORS - 1)) +static const u64 iolatency_exp_factors[BLKIOLATENCY_NR_EXP_FACTORS] = { + 2045, // exp(1/600) - 600 samples + 2039, // exp(1/240) - 240 samples + 2031, // exp(1/120) - 120 samples + 2023, // exp(1/80) - 80 samples + 2014, // exp(1/60) - 60 samples +}; + +static inline struct iolatency_grp *pd_to_lat(struct blkg_policy_data *pd) +{ + return pd ? container_of(pd, struct iolatency_grp, pd) : NULL; +} + +static inline struct iolatency_grp *blkg_to_lat(struct blkcg_gq *blkg) +{ + return pd_to_lat(blkg_to_pd(blkg, &blkcg_policy_iolatency)); +} + +static inline struct blkcg_gq *lat_to_blkg(struct iolatency_grp *iolat) +{ + return pd_to_blkg(&iolat->pd); +} + +static inline void latency_stat_init(struct iolatency_grp *iolat, + struct latency_stat *stat) +{ + if (iolat->ssd) { + stat->ps.total = 0; + stat->ps.missed = 0; + } else + blk_rq_stat_init(&stat->rqs); +} + +static inline void latency_stat_sum(struct iolatency_grp *iolat, + struct latency_stat *sum, + struct latency_stat *stat) +{ + if (iolat->ssd) { + sum->ps.total += stat->ps.total; + sum->ps.missed += stat->ps.missed; + } else + blk_rq_stat_sum(&sum->rqs, &stat->rqs); +} + +static inline void latency_stat_record_time(struct iolatency_grp *iolat, + u64 req_time) +{ + struct latency_stat *stat = get_cpu_ptr(iolat->stats); + if (iolat->ssd) { + if (req_time >= iolat->min_lat_nsec) + stat->ps.missed++; + stat->ps.total++; + } else + blk_rq_stat_add(&stat->rqs, req_time); + put_cpu_ptr(stat); +} + +static inline bool latency_sum_ok(struct iolatency_grp *iolat, + struct latency_stat *stat) +{ + if (iolat->ssd) { + u64 thresh = div64_u64(stat->ps.total, 10); + thresh = max(thresh, 1ULL); + return stat->ps.missed < thresh; + } + return stat->rqs.mean <= iolat->min_lat_nsec; +} + +static inline u64 latency_stat_samples(struct iolatency_grp *iolat, + struct latency_stat *stat) +{ + if (iolat->ssd) + return stat->ps.total; + return stat->rqs.nr_samples; +} + +static inline void iolat_update_total_lat_avg(struct iolatency_grp *iolat, + struct latency_stat *stat) +{ + int exp_idx; + + if (iolat->ssd) + return; + + /* + * calc_load() takes in a number stored in fixed point representation. + * Because we are using this for IO time in ns, the values stored + * are significantly larger than the FIXED_1 denominator (2048). + * Therefore, rounding errors in the calculation are negligible and + * can be ignored. + */ + exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1, + div64_u64(iolat->cur_win_nsec, + BLKIOLATENCY_EXP_BUCKET_SIZE)); + iolat->lat_avg = calc_load(iolat->lat_avg, + iolatency_exp_factors[exp_idx], + stat->rqs.mean); +} + +static void iolat_cleanup_cb(struct rq_wait *rqw, void *private_data) +{ + atomic_dec(&rqw->inflight); + wake_up(&rqw->wait); +} + +static bool iolat_acquire_inflight(struct rq_wait *rqw, void *private_data) +{ + struct iolatency_grp *iolat = private_data; + return rq_wait_inc_below(rqw, iolat->rq_depth.max_depth); +} + +static void __blkcg_iolatency_throttle(struct rq_qos *rqos, + struct iolatency_grp *iolat, + bool issue_as_root, + bool use_memdelay) +{ + struct rq_wait *rqw = &iolat->rq_wait; + unsigned use_delay = atomic_read(&lat_to_blkg(iolat)->use_delay); + + if (use_delay) + blkcg_schedule_throttle(rqos->q, use_memdelay); + + /* + * To avoid priority inversions we want to just take a slot if we are + * issuing as root. If we're being killed off there's no point in + * delaying things, we may have been killed by OOM so throttling may + * make recovery take even longer, so just let the IO's through so the + * task can go away. + */ + if (issue_as_root || fatal_signal_pending(current)) { + atomic_inc(&rqw->inflight); + return; + } + + rq_qos_wait(rqw, iolat, iolat_acquire_inflight, iolat_cleanup_cb); +} + +#define SCALE_DOWN_FACTOR 2 +#define SCALE_UP_FACTOR 4 + +static inline unsigned long scale_amount(unsigned long qd, bool up) +{ + return max(up ? qd >> SCALE_UP_FACTOR : qd >> SCALE_DOWN_FACTOR, 1UL); +} + +/* + * We scale the qd down faster than we scale up, so we need to use this helper + * to adjust the scale_cookie accordingly so we don't prematurely get + * scale_cookie at DEFAULT_SCALE_COOKIE and unthrottle too much. + * + * Each group has their own local copy of the last scale cookie they saw, so if + * the global scale cookie goes up or down they know which way they need to go + * based on their last knowledge of it. + */ +static void scale_cookie_change(struct blk_iolatency *blkiolat, + struct child_latency_info *lat_info, + bool up) +{ + unsigned long qd = blkiolat->rqos.q->nr_requests; + unsigned long scale = scale_amount(qd, up); + unsigned long old = atomic_read(&lat_info->scale_cookie); + unsigned long max_scale = qd << 1; + unsigned long diff = 0; + + if (old < DEFAULT_SCALE_COOKIE) + diff = DEFAULT_SCALE_COOKIE - old; + + if (up) { + if (scale + old > DEFAULT_SCALE_COOKIE) + atomic_set(&lat_info->scale_cookie, + DEFAULT_SCALE_COOKIE); + else if (diff > qd) + atomic_inc(&lat_info->scale_cookie); + else + atomic_add(scale, &lat_info->scale_cookie); + } else { + /* + * We don't want to dig a hole so deep that it takes us hours to + * dig out of it. Just enough that we don't throttle/unthrottle + * with jagged workloads but can still unthrottle once pressure + * has sufficiently dissipated. + */ + if (diff > qd) { + if (diff < max_scale) + atomic_dec(&lat_info->scale_cookie); + } else { + atomic_sub(scale, &lat_info->scale_cookie); + } + } +} + +/* + * Change the queue depth of the iolatency_grp. We add/subtract 1/16th of the + * queue depth at a time so we don't get wild swings and hopefully dial in to + * fairer distribution of the overall queue depth. + */ +static void scale_change(struct iolatency_grp *iolat, bool up) +{ + unsigned long qd = iolat->blkiolat->rqos.q->nr_requests; + unsigned long scale = scale_amount(qd, up); + unsigned long old = iolat->rq_depth.max_depth; + + if (old > qd) + old = qd; + + if (up) { + if (old == 1 && blkcg_unuse_delay(lat_to_blkg(iolat))) + return; + + if (old < qd) { + old += scale; + old = min(old, qd); + iolat->rq_depth.max_depth = old; + wake_up_all(&iolat->rq_wait.wait); + } + } else { + old >>= 1; + iolat->rq_depth.max_depth = max(old, 1UL); + } +} + +/* Check our parent and see if the scale cookie has changed. */ +static void check_scale_change(struct iolatency_grp *iolat) +{ + struct iolatency_grp *parent; + struct child_latency_info *lat_info; + unsigned int cur_cookie; + unsigned int our_cookie = atomic_read(&iolat->scale_cookie); + u64 scale_lat; + unsigned int old; + int direction = 0; + + if (lat_to_blkg(iolat)->parent == NULL) + return; + + parent = blkg_to_lat(lat_to_blkg(iolat)->parent); + if (!parent) + return; + + lat_info = &parent->child_lat; + cur_cookie = atomic_read(&lat_info->scale_cookie); + scale_lat = READ_ONCE(lat_info->scale_lat); + + if (cur_cookie < our_cookie) + direction = -1; + else if (cur_cookie > our_cookie) + direction = 1; + else + return; + + old = atomic_cmpxchg(&iolat->scale_cookie, our_cookie, cur_cookie); + + /* Somebody beat us to the punch, just bail. */ + if (old != our_cookie) + return; + + if (direction < 0 && iolat->min_lat_nsec) { + u64 samples_thresh; + + if (!scale_lat || iolat->min_lat_nsec <= scale_lat) + return; + + /* + * Sometimes high priority groups are their own worst enemy, so + * instead of taking it out on some poor other group that did 5% + * or less of the IO's for the last summation just skip this + * scale down event. + */ + samples_thresh = lat_info->nr_samples * 5; + samples_thresh = max(1ULL, div64_u64(samples_thresh, 100)); + if (iolat->nr_samples <= samples_thresh) + return; + } + + /* We're as low as we can go. */ + if (iolat->rq_depth.max_depth == 1 && direction < 0) { + blkcg_use_delay(lat_to_blkg(iolat)); + return; + } + + /* We're back to the default cookie, unthrottle all the things. */ + if (cur_cookie == DEFAULT_SCALE_COOKIE) { + blkcg_clear_delay(lat_to_blkg(iolat)); + iolat->rq_depth.max_depth = UINT_MAX; + wake_up_all(&iolat->rq_wait.wait); + return; + } + + scale_change(iolat, direction > 0); +} + +static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio) +{ + struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos); + struct blkcg_gq *blkg = bio->bi_blkg; + bool issue_as_root = bio_issue_as_root_blkg(bio); + + if (!blkiolat->enabled) + return; + + while (blkg && blkg->parent) { + struct iolatency_grp *iolat = blkg_to_lat(blkg); + if (!iolat) { + blkg = blkg->parent; + continue; + } + + check_scale_change(iolat); + __blkcg_iolatency_throttle(rqos, iolat, issue_as_root, + (bio->bi_opf & REQ_SWAP) == REQ_SWAP); + blkg = blkg->parent; + } + if (!timer_pending(&blkiolat->timer)) + mod_timer(&blkiolat->timer, jiffies + HZ); +} + +static void iolatency_record_time(struct iolatency_grp *iolat, + struct bio_issue *issue, u64 now, + bool issue_as_root) +{ + u64 start = bio_issue_time(issue); + u64 req_time; + + /* + * Have to do this so we are truncated to the correct time that our + * issue is truncated to. + */ + now = __bio_issue_time(now); + + if (now <= start) + return; + + req_time = now - start; + + /* + * We don't want to count issue_as_root bio's in the cgroups latency + * statistics as it could skew the numbers downwards. + */ + if (unlikely(issue_as_root && iolat->rq_depth.max_depth != UINT_MAX)) { + u64 sub = iolat->min_lat_nsec; + if (req_time < sub) + blkcg_add_delay(lat_to_blkg(iolat), now, sub - req_time); + return; + } + + latency_stat_record_time(iolat, req_time); +} + +#define BLKIOLATENCY_MIN_ADJUST_TIME (500 * NSEC_PER_MSEC) +#define BLKIOLATENCY_MIN_GOOD_SAMPLES 5 + +static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now) +{ + struct blkcg_gq *blkg = lat_to_blkg(iolat); + struct iolatency_grp *parent; + struct child_latency_info *lat_info; + struct latency_stat stat; + unsigned long flags; + int cpu; + + latency_stat_init(iolat, &stat); + preempt_disable(); + for_each_online_cpu(cpu) { + struct latency_stat *s; + s = per_cpu_ptr(iolat->stats, cpu); + latency_stat_sum(iolat, &stat, s); + latency_stat_init(iolat, s); + } + preempt_enable(); + + parent = blkg_to_lat(blkg->parent); + if (!parent) + return; + + lat_info = &parent->child_lat; + + iolat_update_total_lat_avg(iolat, &stat); + + /* Everything is ok and we don't need to adjust the scale. */ + if (latency_sum_ok(iolat, &stat) && + atomic_read(&lat_info->scale_cookie) == DEFAULT_SCALE_COOKIE) + return; + + /* Somebody beat us to the punch, just bail. */ + spin_lock_irqsave(&lat_info->lock, flags); + + latency_stat_sum(iolat, &iolat->cur_stat, &stat); + lat_info->nr_samples -= iolat->nr_samples; + lat_info->nr_samples += latency_stat_samples(iolat, &iolat->cur_stat); + iolat->nr_samples = latency_stat_samples(iolat, &iolat->cur_stat); + + if ((lat_info->last_scale_event >= now || + now - lat_info->last_scale_event < BLKIOLATENCY_MIN_ADJUST_TIME)) + goto out; + + if (latency_sum_ok(iolat, &iolat->cur_stat) && + latency_sum_ok(iolat, &stat)) { + if (latency_stat_samples(iolat, &iolat->cur_stat) < + BLKIOLATENCY_MIN_GOOD_SAMPLES) + goto out; + if (lat_info->scale_grp == iolat) { + lat_info->last_scale_event = now; + scale_cookie_change(iolat->blkiolat, lat_info, true); + } + } else if (lat_info->scale_lat == 0 || + lat_info->scale_lat >= iolat->min_lat_nsec) { + lat_info->last_scale_event = now; + if (!lat_info->scale_grp || + lat_info->scale_lat > iolat->min_lat_nsec) { + WRITE_ONCE(lat_info->scale_lat, iolat->min_lat_nsec); + lat_info->scale_grp = iolat; + } + scale_cookie_change(iolat->blkiolat, lat_info, false); + } + latency_stat_init(iolat, &iolat->cur_stat); +out: + spin_unlock_irqrestore(&lat_info->lock, flags); +} + +static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio) +{ + struct blkcg_gq *blkg; + struct rq_wait *rqw; + struct iolatency_grp *iolat; + u64 window_start; + u64 now; + bool issue_as_root = bio_issue_as_root_blkg(bio); + int inflight = 0; + + blkg = bio->bi_blkg; + if (!blkg || !bio_flagged(bio, BIO_TRACKED)) + return; + + iolat = blkg_to_lat(bio->bi_blkg); + if (!iolat) + return; + + if (!iolat->blkiolat->enabled) + return; + + now = ktime_to_ns(ktime_get()); + while (blkg && blkg->parent) { + iolat = blkg_to_lat(blkg); + if (!iolat) { + blkg = blkg->parent; + continue; + } + rqw = &iolat->rq_wait; + + inflight = atomic_dec_return(&rqw->inflight); + WARN_ON_ONCE(inflight < 0); + /* + * If bi_status is BLK_STS_AGAIN, the bio wasn't actually + * submitted, so do not account for it. + */ + if (iolat->min_lat_nsec && bio->bi_status != BLK_STS_AGAIN) { + iolatency_record_time(iolat, &bio->bi_issue, now, + issue_as_root); + window_start = atomic64_read(&iolat->window_start); + if (now > window_start && + (now - window_start) >= iolat->cur_win_nsec) { + if (atomic64_cmpxchg(&iolat->window_start, + window_start, now) == window_start) + iolatency_check_latencies(iolat, now); + } + } + wake_up(&rqw->wait); + blkg = blkg->parent; + } +} + +static void blkcg_iolatency_exit(struct rq_qos *rqos) +{ + struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos); + + del_timer_sync(&blkiolat->timer); + flush_work(&blkiolat->enable_work); + blkcg_deactivate_policy(rqos->q, &blkcg_policy_iolatency); + kfree(blkiolat); +} + +static struct rq_qos_ops blkcg_iolatency_ops = { + .throttle = blkcg_iolatency_throttle, + .done_bio = blkcg_iolatency_done_bio, + .exit = blkcg_iolatency_exit, +}; + +static void blkiolatency_timer_fn(struct timer_list *t) +{ + struct blk_iolatency *blkiolat = from_timer(blkiolat, t, timer); + struct blkcg_gq *blkg; + struct cgroup_subsys_state *pos_css; + u64 now = ktime_to_ns(ktime_get()); + + rcu_read_lock(); + blkg_for_each_descendant_pre(blkg, pos_css, + blkiolat->rqos.q->root_blkg) { + struct iolatency_grp *iolat; + struct child_latency_info *lat_info; + unsigned long flags; + u64 cookie; + + /* + * We could be exiting, don't access the pd unless we have a + * ref on the blkg. + */ + if (!blkg_tryget(blkg)) + continue; + + iolat = blkg_to_lat(blkg); + if (!iolat) + goto next; + + lat_info = &iolat->child_lat; + cookie = atomic_read(&lat_info->scale_cookie); + + if (cookie >= DEFAULT_SCALE_COOKIE) + goto next; + + spin_lock_irqsave(&lat_info->lock, flags); + if (lat_info->last_scale_event >= now) + goto next_lock; + + /* + * We scaled down but don't have a scale_grp, scale up and carry + * on. + */ + if (lat_info->scale_grp == NULL) { + scale_cookie_change(iolat->blkiolat, lat_info, true); + goto next_lock; + } + + /* + * It's been 5 seconds since our last scale event, clear the + * scale grp in case the group that needed the scale down isn't + * doing any IO currently. + */ + if (now - lat_info->last_scale_event >= + ((u64)NSEC_PER_SEC * 5)) + lat_info->scale_grp = NULL; +next_lock: + spin_unlock_irqrestore(&lat_info->lock, flags); +next: + blkg_put(blkg); + } + rcu_read_unlock(); +} + +/** + * blkiolatency_enable_work_fn - Enable or disable iolatency on the device + * @work: enable_work of the blk_iolatency of interest + * + * iolatency needs to keep track of the number of in-flight IOs per cgroup. This + * is relatively expensive as it involves walking up the hierarchy twice for + * every IO. Thus, if iolatency is not enabled in any cgroup for the device, we + * want to disable the in-flight tracking. + * + * We have to make sure that the counting is balanced - we don't want to leak + * the in-flight counts by disabling accounting in the completion path while IOs + * are in flight. This is achieved by ensuring that no IO is in flight by + * freezing the queue while flipping ->enabled. As this requires a sleepable + * context, ->enabled flipping is punted to this work function. + */ +static void blkiolatency_enable_work_fn(struct work_struct *work) +{ + struct blk_iolatency *blkiolat = container_of(work, struct blk_iolatency, + enable_work); + bool enabled; + + /* + * There can only be one instance of this function running for @blkiolat + * and it's guaranteed to be executed at least once after the latest + * ->enabled_cnt modification. Acting on the latest ->enable_cnt is + * sufficient. + * + * Also, we know @blkiolat is safe to access as ->enable_work is flushed + * in blkcg_iolatency_exit(). + */ + enabled = atomic_read(&blkiolat->enable_cnt); + if (enabled != blkiolat->enabled) { + blk_mq_freeze_queue(blkiolat->rqos.q); + blkiolat->enabled = enabled; + blk_mq_unfreeze_queue(blkiolat->rqos.q); + } +} + +int blk_iolatency_init(struct request_queue *q) +{ + struct blk_iolatency *blkiolat; + struct rq_qos *rqos; + int ret; + + blkiolat = kzalloc(sizeof(*blkiolat), GFP_KERNEL); + if (!blkiolat) + return -ENOMEM; + + rqos = &blkiolat->rqos; + rqos->id = RQ_QOS_LATENCY; + rqos->ops = &blkcg_iolatency_ops; + rqos->q = q; + + rq_qos_add(q, rqos); + + ret = blkcg_activate_policy(q, &blkcg_policy_iolatency); + if (ret) { + rq_qos_del(q, rqos); + kfree(blkiolat); + return ret; + } + + timer_setup(&blkiolat->timer, blkiolatency_timer_fn, 0); + INIT_WORK(&blkiolat->enable_work, blkiolatency_enable_work_fn); + + return 0; +} + +static void iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val) +{ + struct iolatency_grp *iolat = blkg_to_lat(blkg); + struct blk_iolatency *blkiolat = iolat->blkiolat; + u64 oldval = iolat->min_lat_nsec; + + iolat->min_lat_nsec = val; + iolat->cur_win_nsec = max_t(u64, val << 4, BLKIOLATENCY_MIN_WIN_SIZE); + iolat->cur_win_nsec = min_t(u64, iolat->cur_win_nsec, + BLKIOLATENCY_MAX_WIN_SIZE); + + if (!oldval && val) { + if (atomic_inc_return(&blkiolat->enable_cnt) == 1) + schedule_work(&blkiolat->enable_work); + } + if (oldval && !val) { + blkcg_clear_delay(blkg); + if (atomic_dec_return(&blkiolat->enable_cnt) == 0) + schedule_work(&blkiolat->enable_work); + } +} + +static void iolatency_clear_scaling(struct blkcg_gq *blkg) +{ + if (blkg->parent) { + struct iolatency_grp *iolat = blkg_to_lat(blkg->parent); + struct child_latency_info *lat_info; + if (!iolat) + return; + + lat_info = &iolat->child_lat; + spin_lock(&lat_info->lock); + atomic_set(&lat_info->scale_cookie, DEFAULT_SCALE_COOKIE); + lat_info->last_scale_event = 0; + lat_info->scale_grp = NULL; + lat_info->scale_lat = 0; + spin_unlock(&lat_info->lock); + } +} + +static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct blkcg *blkcg = css_to_blkcg(of_css(of)); + struct blkcg_gq *blkg; + struct blkg_conf_ctx ctx; + struct iolatency_grp *iolat; + char *p, *tok; + u64 lat_val = 0; + u64 oldval; + int ret; + + ret = blkg_conf_prep(blkcg, &blkcg_policy_iolatency, buf, &ctx); + if (ret) + return ret; + + iolat = blkg_to_lat(ctx.blkg); + p = ctx.body; + + ret = -EINVAL; + while ((tok = strsep(&p, " "))) { + char key[16]; + char val[21]; /* 18446744073709551616 */ + + if (sscanf(tok, "%15[^=]=%20s", key, val) != 2) + goto out; + + if (!strcmp(key, "target")) { + u64 v; + + if (!strcmp(val, "max")) + lat_val = 0; + else if (sscanf(val, "%llu", &v) == 1) + lat_val = v * NSEC_PER_USEC; + else + goto out; + } else { + goto out; + } + } + + /* Walk up the tree to see if our new val is lower than it should be. */ + blkg = ctx.blkg; + oldval = iolat->min_lat_nsec; + + iolatency_set_min_lat_nsec(blkg, lat_val); + if (oldval != iolat->min_lat_nsec) + iolatency_clear_scaling(blkg); + ret = 0; +out: + blkg_conf_finish(&ctx); + return ret ?: nbytes; +} + +static u64 iolatency_prfill_limit(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct iolatency_grp *iolat = pd_to_lat(pd); + const char *dname = blkg_dev_name(pd->blkg); + + if (!dname || !iolat->min_lat_nsec) + return 0; + seq_printf(sf, "%s target=%llu\n", + dname, div_u64(iolat->min_lat_nsec, NSEC_PER_USEC)); + return 0; +} + +static int iolatency_print_limit(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + iolatency_prfill_limit, + &blkcg_policy_iolatency, seq_cft(sf)->private, false); + return 0; +} + +static size_t iolatency_ssd_stat(struct iolatency_grp *iolat, char *buf, + size_t size) +{ + struct latency_stat stat; + int cpu; + + latency_stat_init(iolat, &stat); + preempt_disable(); + for_each_online_cpu(cpu) { + struct latency_stat *s; + s = per_cpu_ptr(iolat->stats, cpu); + latency_stat_sum(iolat, &stat, s); + } + preempt_enable(); + + if (iolat->rq_depth.max_depth == UINT_MAX) + return scnprintf(buf, size, " missed=%llu total=%llu depth=max", + (unsigned long long)stat.ps.missed, + (unsigned long long)stat.ps.total); + return scnprintf(buf, size, " missed=%llu total=%llu depth=%u", + (unsigned long long)stat.ps.missed, + (unsigned long long)stat.ps.total, + iolat->rq_depth.max_depth); +} + +static size_t iolatency_pd_stat(struct blkg_policy_data *pd, char *buf, + size_t size) +{ + struct iolatency_grp *iolat = pd_to_lat(pd); + unsigned long long avg_lat; + unsigned long long cur_win; + + if (!blkcg_debug_stats) + return 0; + + if (iolat->ssd) + return iolatency_ssd_stat(iolat, buf, size); + + avg_lat = div64_u64(iolat->lat_avg, NSEC_PER_USEC); + cur_win = div64_u64(iolat->cur_win_nsec, NSEC_PER_MSEC); + if (iolat->rq_depth.max_depth == UINT_MAX) + return scnprintf(buf, size, " depth=max avg_lat=%llu win=%llu", + avg_lat, cur_win); + + return scnprintf(buf, size, " depth=%u avg_lat=%llu win=%llu", + iolat->rq_depth.max_depth, avg_lat, cur_win); +} + + +static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp, + struct request_queue *q, + struct blkcg *blkcg) +{ + struct iolatency_grp *iolat; + + iolat = kzalloc_node(sizeof(*iolat), gfp, q->node); + if (!iolat) + return NULL; + iolat->stats = __alloc_percpu_gfp(sizeof(struct latency_stat), + __alignof__(struct latency_stat), gfp); + if (!iolat->stats) { + kfree(iolat); + return NULL; + } + return &iolat->pd; +} + +static void iolatency_pd_init(struct blkg_policy_data *pd) +{ + struct iolatency_grp *iolat = pd_to_lat(pd); + struct blkcg_gq *blkg = lat_to_blkg(iolat); + struct rq_qos *rqos = blkcg_rq_qos(blkg->q); + struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos); + u64 now = ktime_to_ns(ktime_get()); + int cpu; + + if (blk_queue_nonrot(blkg->q)) + iolat->ssd = true; + else + iolat->ssd = false; + + for_each_possible_cpu(cpu) { + struct latency_stat *stat; + stat = per_cpu_ptr(iolat->stats, cpu); + latency_stat_init(iolat, stat); + } + + latency_stat_init(iolat, &iolat->cur_stat); + rq_wait_init(&iolat->rq_wait); + spin_lock_init(&iolat->child_lat.lock); + iolat->rq_depth.queue_depth = blkg->q->nr_requests; + iolat->rq_depth.max_depth = UINT_MAX; + iolat->rq_depth.default_depth = iolat->rq_depth.queue_depth; + iolat->blkiolat = blkiolat; + iolat->cur_win_nsec = 100 * NSEC_PER_MSEC; + atomic64_set(&iolat->window_start, now); + + /* + * We init things in list order, so the pd for the parent may not be + * init'ed yet for whatever reason. + */ + if (blkg->parent && blkg_to_pd(blkg->parent, &blkcg_policy_iolatency)) { + struct iolatency_grp *parent = blkg_to_lat(blkg->parent); + atomic_set(&iolat->scale_cookie, + atomic_read(&parent->child_lat.scale_cookie)); + } else { + atomic_set(&iolat->scale_cookie, DEFAULT_SCALE_COOKIE); + } + + atomic_set(&iolat->child_lat.scale_cookie, DEFAULT_SCALE_COOKIE); +} + +static void iolatency_pd_offline(struct blkg_policy_data *pd) +{ + struct iolatency_grp *iolat = pd_to_lat(pd); + struct blkcg_gq *blkg = lat_to_blkg(iolat); + + iolatency_set_min_lat_nsec(blkg, 0); + iolatency_clear_scaling(blkg); +} + +static void iolatency_pd_free(struct blkg_policy_data *pd) +{ + struct iolatency_grp *iolat = pd_to_lat(pd); + free_percpu(iolat->stats); + kfree(iolat); +} + +static struct cftype iolatency_files[] = { + { + .name = "latency", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = iolatency_print_limit, + .write = iolatency_set_limit, + }, + {} +}; + +static struct blkcg_policy blkcg_policy_iolatency = { + .dfl_cftypes = iolatency_files, + .pd_alloc_fn = iolatency_pd_alloc, + .pd_init_fn = iolatency_pd_init, + .pd_offline_fn = iolatency_pd_offline, + .pd_free_fn = iolatency_pd_free, + .pd_stat_fn = iolatency_pd_stat, +}; + +static int __init iolatency_init(void) +{ + return blkcg_policy_register(&blkcg_policy_iolatency); +} + +static void __exit iolatency_exit(void) +{ + blkcg_policy_unregister(&blkcg_policy_iolatency); +} + +module_init(iolatency_init); +module_exit(iolatency_exit); diff --git a/block/blk-lib.c b/block/blk-lib.c new file mode 100644 index 000000000..e90614fd8 --- /dev/null +++ b/block/blk-lib.c @@ -0,0 +1,441 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Functions related to generic helpers functions + */ +#include +#include +#include +#include +#include + +#include "blk.h" + +struct bio *blk_next_bio(struct bio *bio, unsigned int nr_pages, gfp_t gfp) +{ + struct bio *new = bio_alloc(gfp, nr_pages); + + if (bio) { + bio_chain(bio, new); + submit_bio(bio); + } + + return new; +} + +int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, int flags, + struct bio **biop) +{ + struct request_queue *q = bdev_get_queue(bdev); + struct bio *bio = *biop; + unsigned int op; + sector_t bs_mask, part_offset = 0; + + if (!q) + return -ENXIO; + + if (bdev_read_only(bdev)) + return -EPERM; + + if (flags & BLKDEV_DISCARD_SECURE) { + if (!blk_queue_secure_erase(q)) + return -EOPNOTSUPP; + op = REQ_OP_SECURE_ERASE; + } else { + if (!blk_queue_discard(q)) + return -EOPNOTSUPP; + op = REQ_OP_DISCARD; + } + + /* In case the discard granularity isn't set by buggy device driver */ + if (WARN_ON_ONCE(!q->limits.discard_granularity)) { + char dev_name[BDEVNAME_SIZE]; + + bdevname(bdev, dev_name); + pr_err_ratelimited("%s: Error: discard_granularity is 0.\n", dev_name); + return -EOPNOTSUPP; + } + + bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; + if ((sector | nr_sects) & bs_mask) + return -EINVAL; + + if (!nr_sects) + return -EINVAL; + + /* In case the discard request is in a partition */ + if (bdev_is_partition(bdev)) + part_offset = bdev->bd_part->start_sect; + + while (nr_sects) { + sector_t granularity_aligned_lba, req_sects; + sector_t sector_mapped = sector + part_offset; + + granularity_aligned_lba = round_up(sector_mapped, + q->limits.discard_granularity >> SECTOR_SHIFT); + + /* + * Check whether the discard bio starts at a discard_granularity + * aligned LBA, + * - If no: set (granularity_aligned_lba - sector_mapped) to + * bi_size of the first split bio, then the second bio will + * start at a discard_granularity aligned LBA on the device. + * - If yes: use bio_aligned_discard_max_sectors() as the max + * possible bi_size of the first split bio. Then when this bio + * is split in device drive, the split ones are very probably + * to be aligned to discard_granularity of the device's queue. + */ + if (granularity_aligned_lba == sector_mapped) + req_sects = min_t(sector_t, nr_sects, + bio_aligned_discard_max_sectors(q)); + else + req_sects = min_t(sector_t, nr_sects, + granularity_aligned_lba - sector_mapped); + + WARN_ON_ONCE((req_sects << 9) > UINT_MAX); + + bio = blk_next_bio(bio, 0, gfp_mask); + bio->bi_iter.bi_sector = sector; + bio_set_dev(bio, bdev); + bio_set_op_attrs(bio, op, 0); + + bio->bi_iter.bi_size = req_sects << 9; + sector += req_sects; + nr_sects -= req_sects; + + /* + * We can loop for a long time in here, if someone does + * full device discards (like mkfs). Be nice and allow + * us to schedule out to avoid softlocking if preempt + * is disabled. + */ + cond_resched(); + } + + *biop = bio; + return 0; +} +EXPORT_SYMBOL(__blkdev_issue_discard); + +/** + * blkdev_issue_discard - queue a discard + * @bdev: blockdev to issue discard for + * @sector: start sector + * @nr_sects: number of sectors to discard + * @gfp_mask: memory allocation flags (for bio_alloc) + * @flags: BLKDEV_DISCARD_* flags to control behaviour + * + * Description: + * Issue a discard request for the sectors in question. + */ +int blkdev_issue_discard(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, unsigned long flags) +{ + struct bio *bio = NULL; + struct blk_plug plug; + int ret; + + blk_start_plug(&plug); + ret = __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, flags, + &bio); + if (!ret && bio) { + ret = submit_bio_wait(bio); + if (ret == -EOPNOTSUPP) + ret = 0; + bio_put(bio); + } + blk_finish_plug(&plug); + + return ret; +} +EXPORT_SYMBOL(blkdev_issue_discard); + +/** + * __blkdev_issue_write_same - generate number of bios with same page + * @bdev: target blockdev + * @sector: start sector + * @nr_sects: number of sectors to write + * @gfp_mask: memory allocation flags (for bio_alloc) + * @page: page containing data to write + * @biop: pointer to anchor bio + * + * Description: + * Generate and issue number of bios(REQ_OP_WRITE_SAME) with same page. + */ +static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, struct page *page, + struct bio **biop) +{ + struct request_queue *q = bdev_get_queue(bdev); + unsigned int max_write_same_sectors; + struct bio *bio = *biop; + sector_t bs_mask; + + if (!q) + return -ENXIO; + + if (bdev_read_only(bdev)) + return -EPERM; + + bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; + if ((sector | nr_sects) & bs_mask) + return -EINVAL; + + if (!bdev_write_same(bdev)) + return -EOPNOTSUPP; + + /* Ensure that max_write_same_sectors doesn't overflow bi_size */ + max_write_same_sectors = bio_allowed_max_sectors(q); + + while (nr_sects) { + bio = blk_next_bio(bio, 1, gfp_mask); + bio->bi_iter.bi_sector = sector; + bio_set_dev(bio, bdev); + bio->bi_vcnt = 1; + bio->bi_io_vec->bv_page = page; + bio->bi_io_vec->bv_offset = 0; + bio->bi_io_vec->bv_len = bdev_logical_block_size(bdev); + bio_set_op_attrs(bio, REQ_OP_WRITE_SAME, 0); + + if (nr_sects > max_write_same_sectors) { + bio->bi_iter.bi_size = max_write_same_sectors << 9; + nr_sects -= max_write_same_sectors; + sector += max_write_same_sectors; + } else { + bio->bi_iter.bi_size = nr_sects << 9; + nr_sects = 0; + } + cond_resched(); + } + + *biop = bio; + return 0; +} + +/** + * blkdev_issue_write_same - queue a write same operation + * @bdev: target blockdev + * @sector: start sector + * @nr_sects: number of sectors to write + * @gfp_mask: memory allocation flags (for bio_alloc) + * @page: page containing data + * + * Description: + * Issue a write same request for the sectors in question. + */ +int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, + struct page *page) +{ + struct bio *bio = NULL; + struct blk_plug plug; + int ret; + + blk_start_plug(&plug); + ret = __blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask, page, + &bio); + if (ret == 0 && bio) { + ret = submit_bio_wait(bio); + bio_put(bio); + } + blk_finish_plug(&plug); + return ret; +} +EXPORT_SYMBOL(blkdev_issue_write_same); + +static int __blkdev_issue_write_zeroes(struct block_device *bdev, + sector_t sector, sector_t nr_sects, gfp_t gfp_mask, + struct bio **biop, unsigned flags) +{ + struct bio *bio = *biop; + unsigned int max_write_zeroes_sectors; + struct request_queue *q = bdev_get_queue(bdev); + + if (!q) + return -ENXIO; + + if (bdev_read_only(bdev)) + return -EPERM; + + /* Ensure that max_write_zeroes_sectors doesn't overflow bi_size */ + max_write_zeroes_sectors = bdev_write_zeroes_sectors(bdev); + + if (max_write_zeroes_sectors == 0) + return -EOPNOTSUPP; + + while (nr_sects) { + bio = blk_next_bio(bio, 0, gfp_mask); + bio->bi_iter.bi_sector = sector; + bio_set_dev(bio, bdev); + bio->bi_opf = REQ_OP_WRITE_ZEROES; + if (flags & BLKDEV_ZERO_NOUNMAP) + bio->bi_opf |= REQ_NOUNMAP; + + if (nr_sects > max_write_zeroes_sectors) { + bio->bi_iter.bi_size = max_write_zeroes_sectors << 9; + nr_sects -= max_write_zeroes_sectors; + sector += max_write_zeroes_sectors; + } else { + bio->bi_iter.bi_size = nr_sects << 9; + nr_sects = 0; + } + cond_resched(); + } + + *biop = bio; + return 0; +} + +/* + * Convert a number of 512B sectors to a number of pages. + * The result is limited to a number of pages that can fit into a BIO. + * Also make sure that the result is always at least 1 (page) for the cases + * where nr_sects is lower than the number of sectors in a page. + */ +static unsigned int __blkdev_sectors_to_bio_pages(sector_t nr_sects) +{ + sector_t pages = DIV_ROUND_UP_SECTOR_T(nr_sects, PAGE_SIZE / 512); + + return min(pages, (sector_t)BIO_MAX_PAGES); +} + +static int __blkdev_issue_zero_pages(struct block_device *bdev, + sector_t sector, sector_t nr_sects, gfp_t gfp_mask, + struct bio **biop) +{ + struct request_queue *q = bdev_get_queue(bdev); + struct bio *bio = *biop; + int bi_size = 0; + unsigned int sz; + + if (!q) + return -ENXIO; + + if (bdev_read_only(bdev)) + return -EPERM; + + while (nr_sects != 0) { + bio = blk_next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects), + gfp_mask); + bio->bi_iter.bi_sector = sector; + bio_set_dev(bio, bdev); + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + + while (nr_sects != 0) { + sz = min((sector_t) PAGE_SIZE, nr_sects << 9); + bi_size = bio_add_page(bio, ZERO_PAGE(0), sz, 0); + nr_sects -= bi_size >> 9; + sector += bi_size >> 9; + if (bi_size < sz) + break; + } + cond_resched(); + } + + *biop = bio; + return 0; +} + +/** + * __blkdev_issue_zeroout - generate number of zero filed write bios + * @bdev: blockdev to issue + * @sector: start sector + * @nr_sects: number of sectors to write + * @gfp_mask: memory allocation flags (for bio_alloc) + * @biop: pointer to anchor bio + * @flags: controls detailed behavior + * + * Description: + * Zero-fill a block range, either using hardware offload or by explicitly + * writing zeroes to the device. + * + * If a device is using logical block provisioning, the underlying space will + * not be released if %flags contains BLKDEV_ZERO_NOUNMAP. + * + * If %flags contains BLKDEV_ZERO_NOFALLBACK, the function will return + * -EOPNOTSUPP if no explicit hardware offload for zeroing is provided. + */ +int __blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, struct bio **biop, + unsigned flags) +{ + int ret; + sector_t bs_mask; + + bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; + if ((sector | nr_sects) & bs_mask) + return -EINVAL; + + ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, gfp_mask, + biop, flags); + if (ret != -EOPNOTSUPP || (flags & BLKDEV_ZERO_NOFALLBACK)) + return ret; + + return __blkdev_issue_zero_pages(bdev, sector, nr_sects, gfp_mask, + biop); +} +EXPORT_SYMBOL(__blkdev_issue_zeroout); + +/** + * blkdev_issue_zeroout - zero-fill a block range + * @bdev: blockdev to write + * @sector: start sector + * @nr_sects: number of sectors to write + * @gfp_mask: memory allocation flags (for bio_alloc) + * @flags: controls detailed behavior + * + * Description: + * Zero-fill a block range, either using hardware offload or by explicitly + * writing zeroes to the device. See __blkdev_issue_zeroout() for the + * valid values for %flags. + */ +int blkdev_issue_zeroout(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, unsigned flags) +{ + int ret = 0; + sector_t bs_mask; + struct bio *bio; + struct blk_plug plug; + bool try_write_zeroes = !!bdev_write_zeroes_sectors(bdev); + + bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; + if ((sector | nr_sects) & bs_mask) + return -EINVAL; + +retry: + bio = NULL; + blk_start_plug(&plug); + if (try_write_zeroes) { + ret = __blkdev_issue_write_zeroes(bdev, sector, nr_sects, + gfp_mask, &bio, flags); + } else if (!(flags & BLKDEV_ZERO_NOFALLBACK)) { + ret = __blkdev_issue_zero_pages(bdev, sector, nr_sects, + gfp_mask, &bio); + } else { + /* No zeroing offload support */ + ret = -EOPNOTSUPP; + } + if (ret == 0 && bio) { + ret = submit_bio_wait(bio); + bio_put(bio); + } + blk_finish_plug(&plug); + if (ret && try_write_zeroes) { + if (!(flags & BLKDEV_ZERO_NOFALLBACK)) { + try_write_zeroes = false; + goto retry; + } + if (!bdev_write_zeroes_sectors(bdev)) { + /* + * Zeroing offload support was indicated, but the + * device reported ILLEGAL REQUEST (for some devices + * there is no non-destructive way to verify whether + * WRITE ZEROES is actually supported). + */ + ret = -EOPNOTSUPP; + } + } + + return ret; +} +EXPORT_SYMBOL(blkdev_issue_zeroout); diff --git a/block/blk-map.c b/block/blk-map.c new file mode 100644 index 000000000..ede73f4f7 --- /dev/null +++ b/block/blk-map.c @@ -0,0 +1,718 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Functions related to mapping data to requests + */ +#include +#include +#include +#include +#include +#include + +#include "blk.h" + +struct bio_map_data { + bool is_our_pages : 1; + bool is_null_mapped : 1; + struct iov_iter iter; + struct iovec iov[]; +}; + +static struct bio_map_data *bio_alloc_map_data(struct iov_iter *data, + gfp_t gfp_mask) +{ + struct bio_map_data *bmd; + + if (data->nr_segs > UIO_MAXIOV) + return NULL; + + bmd = kmalloc(struct_size(bmd, iov, data->nr_segs), gfp_mask); + if (!bmd) + return NULL; + memcpy(bmd->iov, data->iov, sizeof(struct iovec) * data->nr_segs); + bmd->iter = *data; + bmd->iter.iov = bmd->iov; + return bmd; +} + +/** + * bio_copy_from_iter - copy all pages from iov_iter to bio + * @bio: The &struct bio which describes the I/O as destination + * @iter: iov_iter as source + * + * Copy all pages from iov_iter to bio. + * Returns 0 on success, or error on failure. + */ +static int bio_copy_from_iter(struct bio *bio, struct iov_iter *iter) +{ + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + bio_for_each_segment_all(bvec, bio, iter_all) { + ssize_t ret; + + ret = copy_page_from_iter(bvec->bv_page, + bvec->bv_offset, + bvec->bv_len, + iter); + + if (!iov_iter_count(iter)) + break; + + if (ret < bvec->bv_len) + return -EFAULT; + } + + return 0; +} + +/** + * bio_copy_to_iter - copy all pages from bio to iov_iter + * @bio: The &struct bio which describes the I/O as source + * @iter: iov_iter as destination + * + * Copy all pages from bio to iov_iter. + * Returns 0 on success, or error on failure. + */ +static int bio_copy_to_iter(struct bio *bio, struct iov_iter iter) +{ + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + bio_for_each_segment_all(bvec, bio, iter_all) { + ssize_t ret; + + ret = copy_page_to_iter(bvec->bv_page, + bvec->bv_offset, + bvec->bv_len, + &iter); + + if (!iov_iter_count(&iter)) + break; + + if (ret < bvec->bv_len) + return -EFAULT; + } + + return 0; +} + +/** + * bio_uncopy_user - finish previously mapped bio + * @bio: bio being terminated + * + * Free pages allocated from bio_copy_user_iov() and write back data + * to user space in case of a read. + */ +static int bio_uncopy_user(struct bio *bio) +{ + struct bio_map_data *bmd = bio->bi_private; + int ret = 0; + + if (!bmd->is_null_mapped) { + /* + * if we're in a workqueue, the request is orphaned, so + * don't copy into a random user address space, just free + * and return -EINTR so user space doesn't expect any data. + */ + if (!current->mm) + ret = -EINTR; + else if (bio_data_dir(bio) == READ) + ret = bio_copy_to_iter(bio, bmd->iter); + if (bmd->is_our_pages) + bio_free_pages(bio); + } + kfree(bmd); + bio_put(bio); + return ret; +} + +static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data, + struct iov_iter *iter, gfp_t gfp_mask) +{ + struct bio_map_data *bmd; + struct page *page; + struct bio *bio, *bounce_bio; + int i = 0, ret; + int nr_pages; + unsigned int len = iter->count; + unsigned int offset = map_data ? offset_in_page(map_data->offset) : 0; + + bmd = bio_alloc_map_data(iter, gfp_mask); + if (!bmd) + return -ENOMEM; + + /* + * We need to do a deep copy of the iov_iter including the iovecs. + * The caller provided iov might point to an on-stack or otherwise + * shortlived one. + */ + bmd->is_our_pages = !map_data; + bmd->is_null_mapped = (map_data && map_data->null_mapped); + + nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE); + if (nr_pages > BIO_MAX_PAGES) + nr_pages = BIO_MAX_PAGES; + + ret = -ENOMEM; + bio = bio_kmalloc(gfp_mask, nr_pages); + if (!bio) + goto out_bmd; + bio->bi_opf |= req_op(rq); + + if (map_data) { + nr_pages = 1 << map_data->page_order; + i = map_data->offset / PAGE_SIZE; + } + while (len) { + unsigned int bytes = PAGE_SIZE; + + bytes -= offset; + + if (bytes > len) + bytes = len; + + if (map_data) { + if (i == map_data->nr_entries * nr_pages) { + ret = -ENOMEM; + goto cleanup; + } + + page = map_data->pages[i / nr_pages]; + page += (i % nr_pages); + + i++; + } else { + page = alloc_page(rq->q->bounce_gfp | gfp_mask); + if (!page) { + ret = -ENOMEM; + goto cleanup; + } + } + + if (bio_add_pc_page(rq->q, bio, page, bytes, offset) < bytes) { + if (!map_data) + __free_page(page); + break; + } + + len -= bytes; + offset = 0; + } + + if (map_data) + map_data->offset += bio->bi_iter.bi_size; + + /* + * success + */ + if ((iov_iter_rw(iter) == WRITE && + (!map_data || !map_data->null_mapped)) || + (map_data && map_data->from_user)) { + ret = bio_copy_from_iter(bio, iter); + if (ret) + goto cleanup; + } else { + if (bmd->is_our_pages) + zero_fill_bio(bio); + iov_iter_advance(iter, bio->bi_iter.bi_size); + } + + bio->bi_private = bmd; + + bounce_bio = bio; + ret = blk_rq_append_bio(rq, &bounce_bio); + if (ret) + goto cleanup; + + /* + * We link the bounce buffer in and could have to traverse it later, so + * we have to get a ref to prevent it from being freed + */ + bio_get(bounce_bio); + return 0; +cleanup: + if (!map_data) + bio_free_pages(bio); + bio_put(bio); +out_bmd: + kfree(bmd); + return ret; +} + +static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, + gfp_t gfp_mask) +{ + unsigned int max_sectors = queue_max_hw_sectors(rq->q); + struct bio *bio, *bounce_bio; + int ret; + int j; + + if (!iov_iter_count(iter)) + return -EINVAL; + + bio = bio_kmalloc(gfp_mask, iov_iter_npages(iter, BIO_MAX_PAGES)); + if (!bio) + return -ENOMEM; + bio->bi_opf |= req_op(rq); + + while (iov_iter_count(iter)) { + struct page **pages; + ssize_t bytes; + size_t offs, added = 0; + int npages; + + bytes = iov_iter_get_pages_alloc(iter, &pages, LONG_MAX, &offs); + if (unlikely(bytes <= 0)) { + ret = bytes ? bytes : -EFAULT; + goto out_unmap; + } + + npages = DIV_ROUND_UP(offs + bytes, PAGE_SIZE); + + if (unlikely(offs & queue_dma_alignment(rq->q))) { + ret = -EINVAL; + j = 0; + } else { + for (j = 0; j < npages; j++) { + struct page *page = pages[j]; + unsigned int n = PAGE_SIZE - offs; + bool same_page = false; + + if (n > bytes) + n = bytes; + + if (!bio_add_hw_page(rq->q, bio, page, n, offs, + max_sectors, &same_page)) { + if (same_page) + put_page(page); + break; + } + + added += n; + bytes -= n; + offs = 0; + } + iov_iter_advance(iter, added); + } + /* + * release the pages we didn't map into the bio, if any + */ + while (j < npages) + put_page(pages[j++]); + kvfree(pages); + /* couldn't stuff something into bio? */ + if (bytes) + break; + } + + /* + * Subtle: if we end up needing to bounce a bio, it would normally + * disappear when its bi_end_io is run. However, we need the original + * bio for the unmap, so grab an extra reference to it + */ + bio_get(bio); + + bounce_bio = bio; + ret = blk_rq_append_bio(rq, &bounce_bio); + if (ret) + goto out_put_orig; + + /* + * We link the bounce buffer in and could have to traverse it + * later, so we have to get a ref to prevent it from being freed + */ + bio_get(bounce_bio); + return 0; + + out_put_orig: + bio_put(bio); + out_unmap: + bio_release_pages(bio, false); + bio_put(bio); + return ret; +} + +/** + * bio_unmap_user - unmap a bio + * @bio: the bio being unmapped + * + * Unmap a bio previously mapped by bio_map_user_iov(). Must be called from + * process context. + * + * bio_unmap_user() may sleep. + */ +static void bio_unmap_user(struct bio *bio) +{ + bio_release_pages(bio, bio_data_dir(bio) == READ); + bio_put(bio); + bio_put(bio); +} + +static void bio_invalidate_vmalloc_pages(struct bio *bio) +{ +#ifdef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE + if (bio->bi_private && !op_is_write(bio_op(bio))) { + unsigned long i, len = 0; + + for (i = 0; i < bio->bi_vcnt; i++) + len += bio->bi_io_vec[i].bv_len; + invalidate_kernel_vmap_range(bio->bi_private, len); + } +#endif +} + +static void bio_map_kern_endio(struct bio *bio) +{ + bio_invalidate_vmalloc_pages(bio); + bio_put(bio); +} + +/** + * bio_map_kern - map kernel address into bio + * @q: the struct request_queue for the bio + * @data: pointer to buffer to map + * @len: length in bytes + * @gfp_mask: allocation flags for bio allocation + * + * Map the kernel address into a bio suitable for io to a block + * device. Returns an error pointer in case of error. + */ +static struct bio *bio_map_kern(struct request_queue *q, void *data, + unsigned int len, gfp_t gfp_mask) +{ + unsigned long kaddr = (unsigned long)data; + unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + unsigned long start = kaddr >> PAGE_SHIFT; + const int nr_pages = end - start; + bool is_vmalloc = is_vmalloc_addr(data); + struct page *page; + int offset, i; + struct bio *bio; + + bio = bio_kmalloc(gfp_mask, nr_pages); + if (!bio) + return ERR_PTR(-ENOMEM); + + if (is_vmalloc) { + flush_kernel_vmap_range(data, len); + bio->bi_private = data; + } + + offset = offset_in_page(kaddr); + for (i = 0; i < nr_pages; i++) { + unsigned int bytes = PAGE_SIZE - offset; + + if (len <= 0) + break; + + if (bytes > len) + bytes = len; + + if (!is_vmalloc) + page = virt_to_page(data); + else + page = vmalloc_to_page(data); + if (bio_add_pc_page(q, bio, page, bytes, + offset) < bytes) { + /* we don't support partial mappings */ + bio_put(bio); + return ERR_PTR(-EINVAL); + } + + data += bytes; + len -= bytes; + offset = 0; + } + + bio->bi_end_io = bio_map_kern_endio; + return bio; +} + +static void bio_copy_kern_endio(struct bio *bio) +{ + bio_free_pages(bio); + bio_put(bio); +} + +static void bio_copy_kern_endio_read(struct bio *bio) +{ + char *p = bio->bi_private; + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + bio_for_each_segment_all(bvec, bio, iter_all) { + memcpy(p, page_address(bvec->bv_page), bvec->bv_len); + p += bvec->bv_len; + } + + bio_copy_kern_endio(bio); +} + +/** + * bio_copy_kern - copy kernel address into bio + * @q: the struct request_queue for the bio + * @data: pointer to buffer to copy + * @len: length in bytes + * @gfp_mask: allocation flags for bio and page allocation + * @reading: data direction is READ + * + * copy the kernel address into a bio suitable for io to a block + * device. Returns an error pointer in case of error. + */ +static struct bio *bio_copy_kern(struct request_queue *q, void *data, + unsigned int len, gfp_t gfp_mask, int reading) +{ + unsigned long kaddr = (unsigned long)data; + unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; + unsigned long start = kaddr >> PAGE_SHIFT; + struct bio *bio; + void *p = data; + int nr_pages = 0; + + /* + * Overflow, abort + */ + if (end < start) + return ERR_PTR(-EINVAL); + + nr_pages = end - start; + bio = bio_kmalloc(gfp_mask, nr_pages); + if (!bio) + return ERR_PTR(-ENOMEM); + + while (len) { + struct page *page; + unsigned int bytes = PAGE_SIZE; + + if (bytes > len) + bytes = len; + + page = alloc_page(q->bounce_gfp | __GFP_ZERO | gfp_mask); + if (!page) + goto cleanup; + + if (!reading) + memcpy(page_address(page), p, bytes); + + if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes) + break; + + len -= bytes; + p += bytes; + } + + if (reading) { + bio->bi_end_io = bio_copy_kern_endio_read; + bio->bi_private = data; + } else { + bio->bi_end_io = bio_copy_kern_endio; + } + + return bio; + +cleanup: + bio_free_pages(bio); + bio_put(bio); + return ERR_PTR(-ENOMEM); +} + +/* + * Append a bio to a passthrough request. Only works if the bio can be merged + * into the request based on the driver constraints. + */ +int blk_rq_append_bio(struct request *rq, struct bio **bio) +{ + struct bio *orig_bio = *bio; + struct bvec_iter iter; + struct bio_vec bv; + unsigned int nr_segs = 0; + + blk_queue_bounce(rq->q, bio); + + bio_for_each_bvec(bv, *bio, iter) + nr_segs++; + + if (!rq->bio) { + blk_rq_bio_prep(rq, *bio, nr_segs); + } else { + if (!ll_back_merge_fn(rq, *bio, nr_segs)) { + if (orig_bio != *bio) { + bio_put(*bio); + *bio = orig_bio; + } + return -EINVAL; + } + + rq->biotail->bi_next = *bio; + rq->biotail = *bio; + rq->__data_len += (*bio)->bi_iter.bi_size; + bio_crypt_free_ctx(*bio); + } + + return 0; +} +EXPORT_SYMBOL(blk_rq_append_bio); + +/** + * blk_rq_map_user_iov - map user data to a request, for passthrough requests + * @q: request queue where request should be inserted + * @rq: request to map data to + * @map_data: pointer to the rq_map_data holding pages (if necessary) + * @iter: iovec iterator + * @gfp_mask: memory allocation flags + * + * Description: + * Data will be mapped directly for zero copy I/O, if possible. Otherwise + * a kernel bounce buffer is used. + * + * A matching blk_rq_unmap_user() must be issued at the end of I/O, while + * still in process context. + * + * Note: The mapped bio may need to be bounced through blk_queue_bounce() + * before being submitted to the device, as pages mapped may be out of + * reach. It's the callers responsibility to make sure this happens. The + * original bio must be passed back in to blk_rq_unmap_user() for proper + * unmapping. + */ +int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, + struct rq_map_data *map_data, + const struct iov_iter *iter, gfp_t gfp_mask) +{ + bool copy = false; + unsigned long align = q->dma_pad_mask | queue_dma_alignment(q); + struct bio *bio = NULL; + struct iov_iter i; + int ret = -EINVAL; + + if (!iter_is_iovec(iter)) + goto fail; + + if (map_data) + copy = true; + else if (iov_iter_alignment(iter) & align) + copy = true; + else if (queue_virt_boundary(q)) + copy = queue_virt_boundary(q) & iov_iter_gap_alignment(iter); + + i = *iter; + do { + if (copy) + ret = bio_copy_user_iov(rq, map_data, &i, gfp_mask); + else + ret = bio_map_user_iov(rq, &i, gfp_mask); + if (ret) + goto unmap_rq; + if (!bio) + bio = rq->bio; + } while (iov_iter_count(&i)); + + return 0; + +unmap_rq: + blk_rq_unmap_user(bio); +fail: + rq->bio = NULL; + return ret; +} +EXPORT_SYMBOL(blk_rq_map_user_iov); + +int blk_rq_map_user(struct request_queue *q, struct request *rq, + struct rq_map_data *map_data, void __user *ubuf, + unsigned long len, gfp_t gfp_mask) +{ + struct iovec iov; + struct iov_iter i; + int ret = import_single_range(rq_data_dir(rq), ubuf, len, &iov, &i); + + if (unlikely(ret < 0)) + return ret; + + return blk_rq_map_user_iov(q, rq, map_data, &i, gfp_mask); +} +EXPORT_SYMBOL(blk_rq_map_user); + +/** + * blk_rq_unmap_user - unmap a request with user data + * @bio: start of bio list + * + * Description: + * Unmap a rq previously mapped by blk_rq_map_user(). The caller must + * supply the original rq->bio from the blk_rq_map_user() return, since + * the I/O completion may have changed rq->bio. + */ +int blk_rq_unmap_user(struct bio *bio) +{ + struct bio *mapped_bio; + int ret = 0, ret2; + + while (bio) { + mapped_bio = bio; + if (unlikely(bio_flagged(bio, BIO_BOUNCED))) + mapped_bio = bio->bi_private; + + if (bio->bi_private) { + ret2 = bio_uncopy_user(mapped_bio); + if (ret2 && !ret) + ret = ret2; + } else { + bio_unmap_user(mapped_bio); + } + + mapped_bio = bio; + bio = bio->bi_next; + bio_put(mapped_bio); + } + + return ret; +} +EXPORT_SYMBOL(blk_rq_unmap_user); + +/** + * blk_rq_map_kern - map kernel data to a request, for passthrough requests + * @q: request queue where request should be inserted + * @rq: request to fill + * @kbuf: the kernel buffer + * @len: length of user data + * @gfp_mask: memory allocation flags + * + * Description: + * Data will be mapped directly if possible. Otherwise a bounce + * buffer is used. Can be called multiple times to append multiple + * buffers. + */ +int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, + unsigned int len, gfp_t gfp_mask) +{ + int reading = rq_data_dir(rq) == READ; + unsigned long addr = (unsigned long) kbuf; + struct bio *bio, *orig_bio; + int ret; + + if (len > (queue_max_hw_sectors(q) << 9)) + return -EINVAL; + if (!len || !kbuf) + return -EINVAL; + + if (!blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf)) + bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading); + else + bio = bio_map_kern(q, kbuf, len, gfp_mask); + + if (IS_ERR(bio)) + return PTR_ERR(bio); + + bio->bi_opf &= ~REQ_OP_MASK; + bio->bi_opf |= req_op(rq); + + orig_bio = bio; + ret = blk_rq_append_bio(rq, &bio); + if (unlikely(ret)) { + /* request is too big */ + bio_put(orig_bio); + return ret; + } + + return 0; +} +EXPORT_SYMBOL(blk_rq_map_kern); diff --git a/block/blk-merge.c b/block/blk-merge.c new file mode 100644 index 000000000..f3b016b31 --- /dev/null +++ b/block/blk-merge.c @@ -0,0 +1,1147 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Functions related to segment and merge handling + */ +#include +#include +#include +#include +#include +#include + +#include + +#include "blk.h" +#include "blk-rq-qos.h" + +static inline bool bio_will_gap(struct request_queue *q, + struct request *prev_rq, struct bio *prev, struct bio *next) +{ + struct bio_vec pb, nb; + + if (!bio_has_data(prev) || !queue_virt_boundary(q)) + return false; + + /* + * Don't merge if the 1st bio starts with non-zero offset, otherwise it + * is quite difficult to respect the sg gap limit. We work hard to + * merge a huge number of small single bios in case of mkfs. + */ + if (prev_rq) + bio_get_first_bvec(prev_rq->bio, &pb); + else + bio_get_first_bvec(prev, &pb); + if (pb.bv_offset & queue_virt_boundary(q)) + return true; + + /* + * We don't need to worry about the situation that the merged segment + * ends in unaligned virt boundary: + * + * - if 'pb' ends aligned, the merged segment ends aligned + * - if 'pb' ends unaligned, the next bio must include + * one single bvec of 'nb', otherwise the 'nb' can't + * merge with 'pb' + */ + bio_get_last_bvec(prev, &pb); + bio_get_first_bvec(next, &nb); + if (biovec_phys_mergeable(q, &pb, &nb)) + return false; + return __bvec_gap_to_prev(q, &pb, nb.bv_offset); +} + +static inline bool req_gap_back_merge(struct request *req, struct bio *bio) +{ + return bio_will_gap(req->q, req, req->biotail, bio); +} + +static inline bool req_gap_front_merge(struct request *req, struct bio *bio) +{ + return bio_will_gap(req->q, NULL, bio, req->bio); +} + +static struct bio *blk_bio_discard_split(struct request_queue *q, + struct bio *bio, + struct bio_set *bs, + unsigned *nsegs) +{ + unsigned int max_discard_sectors, granularity; + int alignment; + sector_t tmp; + unsigned split_sectors; + + *nsegs = 1; + + /* Zero-sector (unknown) and one-sector granularities are the same. */ + granularity = max(q->limits.discard_granularity >> 9, 1U); + + max_discard_sectors = min(q->limits.max_discard_sectors, + bio_allowed_max_sectors(q)); + max_discard_sectors -= max_discard_sectors % granularity; + + if (unlikely(!max_discard_sectors)) { + /* XXX: warn */ + return NULL; + } + + if (bio_sectors(bio) <= max_discard_sectors) + return NULL; + + split_sectors = max_discard_sectors; + + /* + * If the next starting sector would be misaligned, stop the discard at + * the previous aligned sector. + */ + alignment = (q->limits.discard_alignment >> 9) % granularity; + + tmp = bio->bi_iter.bi_sector + split_sectors - alignment; + tmp = sector_div(tmp, granularity); + + if (split_sectors > tmp) + split_sectors -= tmp; + + return bio_split(bio, split_sectors, GFP_NOIO, bs); +} + +static struct bio *blk_bio_write_zeroes_split(struct request_queue *q, + struct bio *bio, struct bio_set *bs, unsigned *nsegs) +{ + *nsegs = 0; + + if (!q->limits.max_write_zeroes_sectors) + return NULL; + + if (bio_sectors(bio) <= q->limits.max_write_zeroes_sectors) + return NULL; + + return bio_split(bio, q->limits.max_write_zeroes_sectors, GFP_NOIO, bs); +} + +static struct bio *blk_bio_write_same_split(struct request_queue *q, + struct bio *bio, + struct bio_set *bs, + unsigned *nsegs) +{ + *nsegs = 1; + + if (!q->limits.max_write_same_sectors) + return NULL; + + if (bio_sectors(bio) <= q->limits.max_write_same_sectors) + return NULL; + + return bio_split(bio, q->limits.max_write_same_sectors, GFP_NOIO, bs); +} + +/* + * Return the maximum number of sectors from the start of a bio that may be + * submitted as a single request to a block device. If enough sectors remain, + * align the end to the physical block size. Otherwise align the end to the + * logical block size. This approach minimizes the number of non-aligned + * requests that are submitted to a block device if the start of a bio is not + * aligned to a physical block boundary. + */ +static inline unsigned get_max_io_size(struct request_queue *q, + struct bio *bio) +{ + unsigned sectors = blk_max_size_offset(q, bio->bi_iter.bi_sector, 0); + unsigned max_sectors = sectors; + unsigned pbs = queue_physical_block_size(q) >> SECTOR_SHIFT; + unsigned lbs = queue_logical_block_size(q) >> SECTOR_SHIFT; + unsigned start_offset = bio->bi_iter.bi_sector & (pbs - 1); + + max_sectors += start_offset; + max_sectors &= ~(pbs - 1); + if (max_sectors > start_offset) + return max_sectors - start_offset; + + return sectors & ~(lbs - 1); +} + +static inline unsigned get_max_segment_size(const struct request_queue *q, + struct page *start_page, + unsigned long offset) +{ + unsigned long mask = queue_segment_boundary(q); + + offset = mask & (page_to_phys(start_page) + offset); + + /* + * overflow may be triggered in case of zero page physical address + * on 32bit arch, use queue's max segment size when that happens. + */ + return min_not_zero(mask - offset + 1, + (unsigned long)queue_max_segment_size(q)); +} + +/** + * bvec_split_segs - verify whether or not a bvec should be split in the middle + * @q: [in] request queue associated with the bio associated with @bv + * @bv: [in] bvec to examine + * @nsegs: [in,out] Number of segments in the bio being built. Incremented + * by the number of segments from @bv that may be appended to that + * bio without exceeding @max_segs + * @sectors: [in,out] Number of sectors in the bio being built. Incremented + * by the number of sectors from @bv that may be appended to that + * bio without exceeding @max_sectors + * @max_segs: [in] upper bound for *@nsegs + * @max_sectors: [in] upper bound for *@sectors + * + * When splitting a bio, it can happen that a bvec is encountered that is too + * big to fit in a single segment and hence that it has to be split in the + * middle. This function verifies whether or not that should happen. The value + * %true is returned if and only if appending the entire @bv to a bio with + * *@nsegs segments and *@sectors sectors would make that bio unacceptable for + * the block driver. + */ +static bool bvec_split_segs(const struct request_queue *q, + const struct bio_vec *bv, unsigned *nsegs, + unsigned *sectors, unsigned max_segs, + unsigned max_sectors) +{ + unsigned max_len = (min(max_sectors, UINT_MAX >> 9) - *sectors) << 9; + unsigned len = min(bv->bv_len, max_len); + unsigned total_len = 0; + unsigned seg_size = 0; + + while (len && *nsegs < max_segs) { + seg_size = get_max_segment_size(q, bv->bv_page, + bv->bv_offset + total_len); + seg_size = min(seg_size, len); + + (*nsegs)++; + total_len += seg_size; + len -= seg_size; + + if ((bv->bv_offset + total_len) & queue_virt_boundary(q)) + break; + } + + *sectors += total_len >> 9; + + /* tell the caller to split the bvec if it is too big to fit */ + return len > 0 || bv->bv_len > max_len; +} + +/** + * blk_bio_segment_split - split a bio in two bios + * @q: [in] request queue pointer + * @bio: [in] bio to be split + * @bs: [in] bio set to allocate the clone from + * @segs: [out] number of segments in the bio with the first half of the sectors + * + * Clone @bio, update the bi_iter of the clone to represent the first sectors + * of @bio and update @bio->bi_iter to represent the remaining sectors. The + * following is guaranteed for the cloned bio: + * - That it has at most get_max_io_size(@q, @bio) sectors. + * - That it has at most queue_max_segments(@q) segments. + * + * Except for discard requests the cloned bio will point at the bi_io_vec of + * the original bio. It is the responsibility of the caller to ensure that the + * original bio is not freed before the cloned bio. The caller is also + * responsible for ensuring that @bs is only destroyed after processing of the + * split bio has finished. + */ +static struct bio *blk_bio_segment_split(struct request_queue *q, + struct bio *bio, + struct bio_set *bs, + unsigned *segs) +{ + struct bio_vec bv, bvprv, *bvprvp = NULL; + struct bvec_iter iter; + unsigned nsegs = 0, sectors = 0; + const unsigned max_sectors = get_max_io_size(q, bio); + const unsigned max_segs = queue_max_segments(q); + + bio_for_each_bvec(bv, bio, iter) { + /* + * If the queue doesn't support SG gaps and adding this + * offset would create a gap, disallow it. + */ + if (bvprvp && bvec_gap_to_prev(q, bvprvp, bv.bv_offset)) + goto split; + + if (nsegs < max_segs && + sectors + (bv.bv_len >> 9) <= max_sectors && + bv.bv_offset + bv.bv_len <= PAGE_SIZE) { + nsegs++; + sectors += bv.bv_len >> 9; + } else if (bvec_split_segs(q, &bv, &nsegs, §ors, max_segs, + max_sectors)) { + goto split; + } + + bvprv = bv; + bvprvp = &bvprv; + } + + *segs = nsegs; + return NULL; +split: + *segs = nsegs; + return bio_split(bio, sectors, GFP_NOIO, bs); +} + +/** + * __blk_queue_split - split a bio and submit the second half + * @bio: [in, out] bio to be split + * @nr_segs: [out] number of segments in the first bio + * + * Split a bio into two bios, chain the two bios, submit the second half and + * store a pointer to the first half in *@bio. If the second bio is still too + * big it will be split by a recursive call to this function. Since this + * function may allocate a new bio from @bio->bi_disk->queue->bio_split, it is + * the responsibility of the caller to ensure that + * @bio->bi_disk->queue->bio_split is only released after processing of the + * split bio has finished. + */ +void __blk_queue_split(struct bio **bio, unsigned int *nr_segs) +{ + struct request_queue *q = (*bio)->bi_disk->queue; + struct bio *split = NULL; + + switch (bio_op(*bio)) { + case REQ_OP_DISCARD: + case REQ_OP_SECURE_ERASE: + split = blk_bio_discard_split(q, *bio, &q->bio_split, nr_segs); + break; + case REQ_OP_WRITE_ZEROES: + split = blk_bio_write_zeroes_split(q, *bio, &q->bio_split, + nr_segs); + break; + case REQ_OP_WRITE_SAME: + split = blk_bio_write_same_split(q, *bio, &q->bio_split, + nr_segs); + break; + default: + /* + * All drivers must accept single-segments bios that are <= + * PAGE_SIZE. This is a quick and dirty check that relies on + * the fact that bi_io_vec[0] is always valid if a bio has data. + * The check might lead to occasional false negatives when bios + * are cloned, but compared to the performance impact of cloned + * bios themselves the loop below doesn't matter anyway. + */ + if (!q->limits.chunk_sectors && + (*bio)->bi_vcnt == 1 && + ((*bio)->bi_io_vec[0].bv_len + + (*bio)->bi_io_vec[0].bv_offset) <= PAGE_SIZE) { + *nr_segs = 1; + break; + } + split = blk_bio_segment_split(q, *bio, &q->bio_split, nr_segs); + break; + } + + if (split) { + /* there isn't chance to merge the splitted bio */ + split->bi_opf |= REQ_NOMERGE; + + bio_chain(split, *bio); + trace_block_split(q, split, (*bio)->bi_iter.bi_sector); + submit_bio_noacct(*bio); + *bio = split; + + blk_throtl_charge_bio_split(*bio); + } +} + +/** + * blk_queue_split - split a bio and submit the second half + * @bio: [in, out] bio to be split + * + * Split a bio into two bios, chains the two bios, submit the second half and + * store a pointer to the first half in *@bio. Since this function may allocate + * a new bio from @bio->bi_disk->queue->bio_split, it is the responsibility of + * the caller to ensure that @bio->bi_disk->queue->bio_split is only released + * after processing of the split bio has finished. + */ +void blk_queue_split(struct bio **bio) +{ + unsigned int nr_segs; + + __blk_queue_split(bio, &nr_segs); +} +EXPORT_SYMBOL(blk_queue_split); + +unsigned int blk_recalc_rq_segments(struct request *rq) +{ + unsigned int nr_phys_segs = 0; + unsigned int nr_sectors = 0; + struct req_iterator iter; + struct bio_vec bv; + + if (!rq->bio) + return 0; + + switch (bio_op(rq->bio)) { + case REQ_OP_DISCARD: + case REQ_OP_SECURE_ERASE: + if (queue_max_discard_segments(rq->q) > 1) { + struct bio *bio = rq->bio; + + for_each_bio(bio) + nr_phys_segs++; + return nr_phys_segs; + } + return 1; + case REQ_OP_WRITE_ZEROES: + return 0; + case REQ_OP_WRITE_SAME: + return 1; + } + + rq_for_each_bvec(bv, rq, iter) + bvec_split_segs(rq->q, &bv, &nr_phys_segs, &nr_sectors, + UINT_MAX, UINT_MAX); + return nr_phys_segs; +} + +static inline struct scatterlist *blk_next_sg(struct scatterlist **sg, + struct scatterlist *sglist) +{ + if (!*sg) + return sglist; + + /* + * If the driver previously mapped a shorter list, we could see a + * termination bit prematurely unless it fully inits the sg table + * on each mapping. We KNOW that there must be more entries here + * or the driver would be buggy, so force clear the termination bit + * to avoid doing a full sg_init_table() in drivers for each command. + */ + sg_unmark_end(*sg); + return sg_next(*sg); +} + +static unsigned blk_bvec_map_sg(struct request_queue *q, + struct bio_vec *bvec, struct scatterlist *sglist, + struct scatterlist **sg) +{ + unsigned nbytes = bvec->bv_len; + unsigned nsegs = 0, total = 0; + + while (nbytes > 0) { + unsigned offset = bvec->bv_offset + total; + unsigned len = min(get_max_segment_size(q, bvec->bv_page, + offset), nbytes); + struct page *page = bvec->bv_page; + + /* + * Unfortunately a fair number of drivers barf on scatterlists + * that have an offset larger than PAGE_SIZE, despite other + * subsystems dealing with that invariant just fine. For now + * stick to the legacy format where we never present those from + * the block layer, but the code below should be removed once + * these offenders (mostly MMC/SD drivers) are fixed. + */ + page += (offset >> PAGE_SHIFT); + offset &= ~PAGE_MASK; + + *sg = blk_next_sg(sg, sglist); + sg_set_page(*sg, page, len, offset); + + total += len; + nbytes -= len; + nsegs++; + } + + return nsegs; +} + +static inline int __blk_bvec_map_sg(struct bio_vec bv, + struct scatterlist *sglist, struct scatterlist **sg) +{ + *sg = blk_next_sg(sg, sglist); + sg_set_page(*sg, bv.bv_page, bv.bv_len, bv.bv_offset); + return 1; +} + +/* only try to merge bvecs into one sg if they are from two bios */ +static inline bool +__blk_segment_map_sg_merge(struct request_queue *q, struct bio_vec *bvec, + struct bio_vec *bvprv, struct scatterlist **sg) +{ + + int nbytes = bvec->bv_len; + + if (!*sg) + return false; + + if ((*sg)->length + nbytes > queue_max_segment_size(q)) + return false; + + if (!biovec_phys_mergeable(q, bvprv, bvec)) + return false; + + (*sg)->length += nbytes; + + return true; +} + +static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio, + struct scatterlist *sglist, + struct scatterlist **sg) +{ + struct bio_vec bvec, bvprv = { NULL }; + struct bvec_iter iter; + int nsegs = 0; + bool new_bio = false; + + for_each_bio(bio) { + bio_for_each_bvec(bvec, bio, iter) { + /* + * Only try to merge bvecs from two bios given we + * have done bio internal merge when adding pages + * to bio + */ + if (new_bio && + __blk_segment_map_sg_merge(q, &bvec, &bvprv, sg)) + goto next_bvec; + + if (bvec.bv_offset + bvec.bv_len <= PAGE_SIZE) + nsegs += __blk_bvec_map_sg(bvec, sglist, sg); + else + nsegs += blk_bvec_map_sg(q, &bvec, sglist, sg); + next_bvec: + new_bio = false; + } + if (likely(bio->bi_iter.bi_size)) { + bvprv = bvec; + new_bio = true; + } + } + + return nsegs; +} + +/* + * map a request to scatterlist, return number of sg entries setup. Caller + * must make sure sg can hold rq->nr_phys_segments entries + */ +int __blk_rq_map_sg(struct request_queue *q, struct request *rq, + struct scatterlist *sglist, struct scatterlist **last_sg) +{ + int nsegs = 0; + + if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) + nsegs = __blk_bvec_map_sg(rq->special_vec, sglist, last_sg); + else if (rq->bio && bio_op(rq->bio) == REQ_OP_WRITE_SAME) + nsegs = __blk_bvec_map_sg(bio_iovec(rq->bio), sglist, last_sg); + else if (rq->bio) + nsegs = __blk_bios_map_sg(q, rq->bio, sglist, last_sg); + + if (*last_sg) + sg_mark_end(*last_sg); + + /* + * Something must have been wrong if the figured number of + * segment is bigger than number of req's physical segments + */ + WARN_ON(nsegs > blk_rq_nr_phys_segments(rq)); + + return nsegs; +} +EXPORT_SYMBOL(__blk_rq_map_sg); + +static inline unsigned int blk_rq_get_max_segments(struct request *rq) +{ + if (req_op(rq) == REQ_OP_DISCARD) + return queue_max_discard_segments(rq->q); + return queue_max_segments(rq->q); +} + +static inline int ll_new_hw_segment(struct request *req, struct bio *bio, + unsigned int nr_phys_segs) +{ + if (!blk_cgroup_mergeable(req, bio)) + goto no_merge; + + if (blk_integrity_merge_bio(req->q, req, bio) == false) + goto no_merge; + + /* discard request merge won't add new segment */ + if (req_op(req) == REQ_OP_DISCARD) + return 1; + + if (req->nr_phys_segments + nr_phys_segs > blk_rq_get_max_segments(req)) + goto no_merge; + + /* + * This will form the start of a new hw segment. Bump both + * counters. + */ + req->nr_phys_segments += nr_phys_segs; + return 1; + +no_merge: + req_set_nomerge(req->q, req); + return 0; +} + +int ll_back_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs) +{ + if (req_gap_back_merge(req, bio)) + return 0; + if (blk_integrity_rq(req) && + integrity_req_gap_back_merge(req, bio)) + return 0; + if (!bio_crypt_ctx_back_mergeable(req, bio)) + return 0; + if (blk_rq_sectors(req) + bio_sectors(bio) > + blk_rq_get_max_sectors(req, blk_rq_pos(req))) { + req_set_nomerge(req->q, req); + return 0; + } + + return ll_new_hw_segment(req, bio, nr_segs); +} + +static int ll_front_merge_fn(struct request *req, struct bio *bio, + unsigned int nr_segs) +{ + if (req_gap_front_merge(req, bio)) + return 0; + if (blk_integrity_rq(req) && + integrity_req_gap_front_merge(req, bio)) + return 0; + if (!bio_crypt_ctx_front_mergeable(req, bio)) + return 0; + if (blk_rq_sectors(req) + bio_sectors(bio) > + blk_rq_get_max_sectors(req, bio->bi_iter.bi_sector)) { + req_set_nomerge(req->q, req); + return 0; + } + + return ll_new_hw_segment(req, bio, nr_segs); +} + +static bool req_attempt_discard_merge(struct request_queue *q, struct request *req, + struct request *next) +{ + unsigned short segments = blk_rq_nr_discard_segments(req); + + if (segments >= queue_max_discard_segments(q)) + goto no_merge; + if (blk_rq_sectors(req) + bio_sectors(next->bio) > + blk_rq_get_max_sectors(req, blk_rq_pos(req))) + goto no_merge; + + req->nr_phys_segments = segments + blk_rq_nr_discard_segments(next); + return true; +no_merge: + req_set_nomerge(q, req); + return false; +} + +static int ll_merge_requests_fn(struct request_queue *q, struct request *req, + struct request *next) +{ + int total_phys_segments; + + if (req_gap_back_merge(req, next->bio)) + return 0; + + /* + * Will it become too large? + */ + if ((blk_rq_sectors(req) + blk_rq_sectors(next)) > + blk_rq_get_max_sectors(req, blk_rq_pos(req))) + return 0; + + total_phys_segments = req->nr_phys_segments + next->nr_phys_segments; + if (total_phys_segments > blk_rq_get_max_segments(req)) + return 0; + + if (!blk_cgroup_mergeable(req, next->bio)) + return 0; + + if (blk_integrity_merge_rq(q, req, next) == false) + return 0; + + if (!bio_crypt_ctx_merge_rq(req, next)) + return 0; + + /* Merge is OK... */ + req->nr_phys_segments = total_phys_segments; + return 1; +} + +/** + * blk_rq_set_mixed_merge - mark a request as mixed merge + * @rq: request to mark as mixed merge + * + * Description: + * @rq is about to be mixed merged. Make sure the attributes + * which can be mixed are set in each bio and mark @rq as mixed + * merged. + */ +void blk_rq_set_mixed_merge(struct request *rq) +{ + unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK; + struct bio *bio; + + if (rq->rq_flags & RQF_MIXED_MERGE) + return; + + /* + * @rq will no longer represent mixable attributes for all the + * contained bios. It will just track those of the first one. + * Distributes the attributs to each bio. + */ + for (bio = rq->bio; bio; bio = bio->bi_next) { + WARN_ON_ONCE((bio->bi_opf & REQ_FAILFAST_MASK) && + (bio->bi_opf & REQ_FAILFAST_MASK) != ff); + bio->bi_opf |= ff; + } + rq->rq_flags |= RQF_MIXED_MERGE; +} + +static void blk_account_io_merge_request(struct request *req) +{ + if (blk_do_io_stat(req)) { + part_stat_lock(); + part_stat_inc(req->part, merges[op_stat_group(req_op(req))]); + part_stat_unlock(); + + hd_struct_put(req->part); + } +} + +static enum elv_merge blk_try_req_merge(struct request *req, + struct request *next) +{ + if (blk_discard_mergable(req)) + return ELEVATOR_DISCARD_MERGE; + else if (blk_rq_pos(req) + blk_rq_sectors(req) == blk_rq_pos(next)) + return ELEVATOR_BACK_MERGE; + + return ELEVATOR_NO_MERGE; +} + +/* + * For non-mq, this has to be called with the request spinlock acquired. + * For mq with scheduling, the appropriate queue wide lock should be held. + */ +static struct request *attempt_merge(struct request_queue *q, + struct request *req, struct request *next) +{ + if (!rq_mergeable(req) || !rq_mergeable(next)) + return NULL; + + if (req_op(req) != req_op(next)) + return NULL; + + if (rq_data_dir(req) != rq_data_dir(next) + || req->rq_disk != next->rq_disk) + return NULL; + + if (req_op(req) == REQ_OP_WRITE_SAME && + !blk_write_same_mergeable(req->bio, next->bio)) + return NULL; + + /* + * Don't allow merge of different write hints, or for a hint with + * non-hint IO. + */ + if (req->write_hint != next->write_hint) + return NULL; + + if (req->ioprio != next->ioprio) + return NULL; + + /* + * If we are allowed to merge, then append bio list + * from next to rq and release next. merge_requests_fn + * will have updated segment counts, update sector + * counts here. Handle DISCARDs separately, as they + * have separate settings. + */ + + switch (blk_try_req_merge(req, next)) { + case ELEVATOR_DISCARD_MERGE: + if (!req_attempt_discard_merge(q, req, next)) + return NULL; + break; + case ELEVATOR_BACK_MERGE: + if (!ll_merge_requests_fn(q, req, next)) + return NULL; + break; + default: + return NULL; + } + + /* + * If failfast settings disagree or any of the two is already + * a mixed merge, mark both as mixed before proceeding. This + * makes sure that all involved bios have mixable attributes + * set properly. + */ + if (((req->rq_flags | next->rq_flags) & RQF_MIXED_MERGE) || + (req->cmd_flags & REQ_FAILFAST_MASK) != + (next->cmd_flags & REQ_FAILFAST_MASK)) { + blk_rq_set_mixed_merge(req); + blk_rq_set_mixed_merge(next); + } + + /* + * At this point we have either done a back merge or front merge. We + * need the smaller start_time_ns of the merged requests to be the + * current request for accounting purposes. + */ + if (next->start_time_ns < req->start_time_ns) + req->start_time_ns = next->start_time_ns; + + req->biotail->bi_next = next->bio; + req->biotail = next->biotail; + + req->__data_len += blk_rq_bytes(next); + + if (!blk_discard_mergable(req)) + elv_merge_requests(q, req, next); + + blk_crypto_rq_put_keyslot(next); + + /* + * 'next' is going away, so update stats accordingly + */ + blk_account_io_merge_request(next); + + trace_block_rq_merge(next); + + /* + * ownership of bio passed from next to req, return 'next' for + * the caller to free + */ + next->bio = NULL; + return next; +} + +static struct request *attempt_back_merge(struct request_queue *q, + struct request *rq) +{ + struct request *next = elv_latter_request(q, rq); + + if (next) + return attempt_merge(q, rq, next); + + return NULL; +} + +static struct request *attempt_front_merge(struct request_queue *q, + struct request *rq) +{ + struct request *prev = elv_former_request(q, rq); + + if (prev) + return attempt_merge(q, prev, rq); + + return NULL; +} + +int blk_attempt_req_merge(struct request_queue *q, struct request *rq, + struct request *next) +{ + struct request *free; + + free = attempt_merge(q, rq, next); + if (free) { + blk_put_request(free); + return 1; + } + + return 0; +} + +bool blk_rq_merge_ok(struct request *rq, struct bio *bio) +{ + if (!rq_mergeable(rq) || !bio_mergeable(bio)) + return false; + + if (req_op(rq) != bio_op(bio)) + return false; + + /* different data direction or already started, don't merge */ + if (bio_data_dir(bio) != rq_data_dir(rq)) + return false; + + /* must be same device */ + if (rq->rq_disk != bio->bi_disk) + return false; + + /* don't merge across cgroup boundaries */ + if (!blk_cgroup_mergeable(rq, bio)) + return false; + + /* only merge integrity protected bio into ditto rq */ + if (blk_integrity_merge_bio(rq->q, rq, bio) == false) + return false; + + /* Only merge if the crypt contexts are compatible */ + if (!bio_crypt_rq_ctx_compatible(rq, bio)) + return false; + + /* must be using the same buffer */ + if (req_op(rq) == REQ_OP_WRITE_SAME && + !blk_write_same_mergeable(rq->bio, bio)) + return false; + + /* + * Don't allow merge of different write hints, or for a hint with + * non-hint IO. + */ + if (rq->write_hint != bio->bi_write_hint) + return false; + + if (rq->ioprio != bio_prio(bio)) + return false; + + return true; +} + +enum elv_merge blk_try_merge(struct request *rq, struct bio *bio) +{ + if (blk_discard_mergable(rq)) + return ELEVATOR_DISCARD_MERGE; + else if (blk_rq_pos(rq) + blk_rq_sectors(rq) == bio->bi_iter.bi_sector) + return ELEVATOR_BACK_MERGE; + else if (blk_rq_pos(rq) - bio_sectors(bio) == bio->bi_iter.bi_sector) + return ELEVATOR_FRONT_MERGE; + return ELEVATOR_NO_MERGE; +} + +static void blk_account_io_merge_bio(struct request *req) +{ + if (!blk_do_io_stat(req)) + return; + + part_stat_lock(); + part_stat_inc(req->part, merges[op_stat_group(req_op(req))]); + part_stat_unlock(); +} + +enum bio_merge_status { + BIO_MERGE_OK, + BIO_MERGE_NONE, + BIO_MERGE_FAILED, +}; + +static enum bio_merge_status bio_attempt_back_merge(struct request *req, + struct bio *bio, unsigned int nr_segs) +{ + const int ff = bio->bi_opf & REQ_FAILFAST_MASK; + + if (!ll_back_merge_fn(req, bio, nr_segs)) + return BIO_MERGE_FAILED; + + trace_block_bio_backmerge(req->q, req, bio); + rq_qos_merge(req->q, req, bio); + + if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) + blk_rq_set_mixed_merge(req); + + req->biotail->bi_next = bio; + req->biotail = bio; + req->__data_len += bio->bi_iter.bi_size; + + bio_crypt_free_ctx(bio); + + blk_account_io_merge_bio(req); + return BIO_MERGE_OK; +} + +static enum bio_merge_status bio_attempt_front_merge(struct request *req, + struct bio *bio, unsigned int nr_segs) +{ + const int ff = bio->bi_opf & REQ_FAILFAST_MASK; + + if (!ll_front_merge_fn(req, bio, nr_segs)) + return BIO_MERGE_FAILED; + + trace_block_bio_frontmerge(req->q, req, bio); + rq_qos_merge(req->q, req, bio); + + if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) + blk_rq_set_mixed_merge(req); + + bio->bi_next = req->bio; + req->bio = bio; + + req->__sector = bio->bi_iter.bi_sector; + req->__data_len += bio->bi_iter.bi_size; + + bio_crypt_do_front_merge(req, bio); + + blk_account_io_merge_bio(req); + return BIO_MERGE_OK; +} + +static enum bio_merge_status bio_attempt_discard_merge(struct request_queue *q, + struct request *req, struct bio *bio) +{ + unsigned short segments = blk_rq_nr_discard_segments(req); + + if (segments >= queue_max_discard_segments(q)) + goto no_merge; + if (blk_rq_sectors(req) + bio_sectors(bio) > + blk_rq_get_max_sectors(req, blk_rq_pos(req))) + goto no_merge; + + rq_qos_merge(q, req, bio); + + req->biotail->bi_next = bio; + req->biotail = bio; + req->__data_len += bio->bi_iter.bi_size; + req->nr_phys_segments = segments + 1; + + blk_account_io_merge_bio(req); + return BIO_MERGE_OK; +no_merge: + req_set_nomerge(q, req); + return BIO_MERGE_FAILED; +} + +static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q, + struct request *rq, + struct bio *bio, + unsigned int nr_segs, + bool sched_allow_merge) +{ + if (!blk_rq_merge_ok(rq, bio)) + return BIO_MERGE_NONE; + + switch (blk_try_merge(rq, bio)) { + case ELEVATOR_BACK_MERGE: + if (!sched_allow_merge || blk_mq_sched_allow_merge(q, rq, bio)) + return bio_attempt_back_merge(rq, bio, nr_segs); + break; + case ELEVATOR_FRONT_MERGE: + if (!sched_allow_merge || blk_mq_sched_allow_merge(q, rq, bio)) + return bio_attempt_front_merge(rq, bio, nr_segs); + break; + case ELEVATOR_DISCARD_MERGE: + return bio_attempt_discard_merge(q, rq, bio); + default: + return BIO_MERGE_NONE; + } + + return BIO_MERGE_FAILED; +} + +/** + * blk_attempt_plug_merge - try to merge with %current's plugged list + * @q: request_queue new bio is being queued at + * @bio: new bio being queued + * @nr_segs: number of segments in @bio + * @same_queue_rq: pointer to &struct request that gets filled in when + * another request associated with @q is found on the plug list + * (optional, may be %NULL) + * + * Determine whether @bio being queued on @q can be merged with a request + * on %current's plugged list. Returns %true if merge was successful, + * otherwise %false. + * + * Plugging coalesces IOs from the same issuer for the same purpose without + * going through @q->queue_lock. As such it's more of an issuing mechanism + * than scheduling, and the request, while may have elvpriv data, is not + * added on the elevator at this point. In addition, we don't have + * reliable access to the elevator outside queue lock. Only check basic + * merging parameters without querying the elevator. + * + * Caller must ensure !blk_queue_nomerges(q) beforehand. + */ +bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, + unsigned int nr_segs, struct request **same_queue_rq) +{ + struct blk_plug *plug; + struct request *rq; + struct list_head *plug_list; + + plug = blk_mq_plug(q, bio); + if (!plug) + return false; + + plug_list = &plug->mq_list; + + list_for_each_entry_reverse(rq, plug_list, queuelist) { + if (rq->q == q && same_queue_rq) { + /* + * Only blk-mq multiple hardware queues case checks the + * rq in the same queue, there should be only one such + * rq in a queue + **/ + *same_queue_rq = rq; + } + + if (rq->q != q) + continue; + + if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) == + BIO_MERGE_OK) + return true; + } + + return false; +} + +/* + * Iterate list of requests and see if we can merge this bio with any + * of them. + */ +bool blk_bio_list_merge(struct request_queue *q, struct list_head *list, + struct bio *bio, unsigned int nr_segs) +{ + struct request *rq; + int checked = 8; + + list_for_each_entry_reverse(rq, list, queuelist) { + if (!checked--) + break; + + switch (blk_attempt_bio_merge(q, rq, bio, nr_segs, true)) { + case BIO_MERGE_NONE: + continue; + case BIO_MERGE_OK: + return true; + case BIO_MERGE_FAILED: + return false; + } + + } + + return false; +} +EXPORT_SYMBOL_GPL(blk_bio_list_merge); + +bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, + unsigned int nr_segs, struct request **merged_request) +{ + struct request *rq; + + switch (elv_merge(q, &rq, bio)) { + case ELEVATOR_BACK_MERGE: + if (!blk_mq_sched_allow_merge(q, rq, bio)) + return false; + if (bio_attempt_back_merge(rq, bio, nr_segs) != BIO_MERGE_OK) + return false; + *merged_request = attempt_back_merge(q, rq); + if (!*merged_request) + elv_merged_request(q, rq, ELEVATOR_BACK_MERGE); + return true; + case ELEVATOR_FRONT_MERGE: + if (!blk_mq_sched_allow_merge(q, rq, bio)) + return false; + if (bio_attempt_front_merge(rq, bio, nr_segs) != BIO_MERGE_OK) + return false; + *merged_request = attempt_front_merge(q, rq); + if (!*merged_request) + elv_merged_request(q, rq, ELEVATOR_FRONT_MERGE); + return true; + case ELEVATOR_DISCARD_MERGE: + return bio_attempt_discard_merge(q, rq, bio) == BIO_MERGE_OK; + default: + return false; + } +} +EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge); diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c new file mode 100644 index 000000000..3db84d319 --- /dev/null +++ b/block/blk-mq-cpumap.c @@ -0,0 +1,96 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * CPU <-> hardware queue mapping helpers + * + * Copyright (C) 2013-2014 Jens Axboe + */ +#include +#include +#include +#include +#include +#include + +#include +#include "blk.h" +#include "blk-mq.h" + +static int queue_index(struct blk_mq_queue_map *qmap, + unsigned int nr_queues, const int q) +{ + return qmap->queue_offset + (q % nr_queues); +} + +static int get_first_sibling(unsigned int cpu) +{ + unsigned int ret; + + ret = cpumask_first(topology_sibling_cpumask(cpu)); + if (ret < nr_cpu_ids) + return ret; + + return cpu; +} + +int blk_mq_map_queues(struct blk_mq_queue_map *qmap) +{ + unsigned int *map = qmap->mq_map; + unsigned int nr_queues = qmap->nr_queues; + unsigned int cpu, first_sibling, q = 0; + + for_each_possible_cpu(cpu) + map[cpu] = -1; + + /* + * Spread queues among present CPUs first for minimizing + * count of dead queues which are mapped by all un-present CPUs + */ + for_each_present_cpu(cpu) { + if (q >= nr_queues) + break; + map[cpu] = queue_index(qmap, nr_queues, q++); + } + + for_each_possible_cpu(cpu) { + if (map[cpu] != -1) + continue; + /* + * First do sequential mapping between CPUs and queues. + * In case we still have CPUs to map, and we have some number of + * threads per cores then map sibling threads to the same queue + * for performance optimizations. + */ + if (q < nr_queues) { + map[cpu] = queue_index(qmap, nr_queues, q++); + } else { + first_sibling = get_first_sibling(cpu); + if (first_sibling == cpu) + map[cpu] = queue_index(qmap, nr_queues, q++); + else + map[cpu] = map[first_sibling]; + } + } + + return 0; +} +EXPORT_SYMBOL_GPL(blk_mq_map_queues); + +/** + * blk_mq_hw_queue_to_node - Look up the memory node for a hardware queue index + * @qmap: CPU to hardware queue map. + * @index: hardware queue index. + * + * We have no quick way of doing reverse lookups. This is only used at + * queue init time, so runtime isn't important. + */ +int blk_mq_hw_queue_to_node(struct blk_mq_queue_map *qmap, unsigned int index) +{ + int i; + + for_each_possible_cpu(i) { + if (index == qmap->mq_map[i]) + return cpu_to_node(i); + } + + return NUMA_NO_NODE; +} diff --git a/block/blk-mq-debugfs-zoned.c b/block/blk-mq-debugfs-zoned.c new file mode 100644 index 000000000..038cb627c --- /dev/null +++ b/block/blk-mq-debugfs-zoned.c @@ -0,0 +1,22 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2017 Western Digital Corporation or its affiliates. + */ + +#include +#include "blk-mq-debugfs.h" + +int queue_zone_wlock_show(void *data, struct seq_file *m) +{ + struct request_queue *q = data; + unsigned int i; + + if (!q->seq_zones_wlock) + return 0; + + for (i = 0; i < q->nr_zones; i++) + if (test_bit(i, q->seq_zones_wlock)) + seq_printf(m, "%u\n", i); + + return 0; +} diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c new file mode 100644 index 000000000..212e1e795 --- /dev/null +++ b/block/blk-mq-debugfs.c @@ -0,0 +1,994 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2017 Facebook + */ + +#include +#include +#include + +#include +#include "blk.h" +#include "blk-mq.h" +#include "blk-mq-debugfs.h" +#include "blk-mq-tag.h" +#include "blk-rq-qos.h" + +static void print_stat(struct seq_file *m, struct blk_rq_stat *stat) +{ + if (stat->nr_samples) { + seq_printf(m, "samples=%d, mean=%llu, min=%llu, max=%llu", + stat->nr_samples, stat->mean, stat->min, stat->max); + } else { + seq_puts(m, "samples=0"); + } +} + +static int queue_poll_stat_show(void *data, struct seq_file *m) +{ + struct request_queue *q = data; + int bucket; + + for (bucket = 0; bucket < (BLK_MQ_POLL_STATS_BKTS / 2); bucket++) { + seq_printf(m, "read (%d Bytes): ", 1 << (9 + bucket)); + print_stat(m, &q->poll_stat[2 * bucket]); + seq_puts(m, "\n"); + + seq_printf(m, "write (%d Bytes): ", 1 << (9 + bucket)); + print_stat(m, &q->poll_stat[2 * bucket + 1]); + seq_puts(m, "\n"); + } + return 0; +} + +static void *queue_requeue_list_start(struct seq_file *m, loff_t *pos) + __acquires(&q->requeue_lock) +{ + struct request_queue *q = m->private; + + spin_lock_irq(&q->requeue_lock); + return seq_list_start(&q->requeue_list, *pos); +} + +static void *queue_requeue_list_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct request_queue *q = m->private; + + return seq_list_next(v, &q->requeue_list, pos); +} + +static void queue_requeue_list_stop(struct seq_file *m, void *v) + __releases(&q->requeue_lock) +{ + struct request_queue *q = m->private; + + spin_unlock_irq(&q->requeue_lock); +} + +static const struct seq_operations queue_requeue_list_seq_ops = { + .start = queue_requeue_list_start, + .next = queue_requeue_list_next, + .stop = queue_requeue_list_stop, + .show = blk_mq_debugfs_rq_show, +}; + +static int blk_flags_show(struct seq_file *m, const unsigned long flags, + const char *const *flag_name, int flag_name_count) +{ + bool sep = false; + int i; + + for (i = 0; i < sizeof(flags) * BITS_PER_BYTE; i++) { + if (!(flags & BIT(i))) + continue; + if (sep) + seq_puts(m, "|"); + sep = true; + if (i < flag_name_count && flag_name[i]) + seq_puts(m, flag_name[i]); + else + seq_printf(m, "%d", i); + } + return 0; +} + +static int queue_pm_only_show(void *data, struct seq_file *m) +{ + struct request_queue *q = data; + + seq_printf(m, "%d\n", atomic_read(&q->pm_only)); + return 0; +} + +#define QUEUE_FLAG_NAME(name) [QUEUE_FLAG_##name] = #name +static const char *const blk_queue_flag_name[] = { + QUEUE_FLAG_NAME(STOPPED), + QUEUE_FLAG_NAME(DYING), + QUEUE_FLAG_NAME(NOMERGES), + QUEUE_FLAG_NAME(SAME_COMP), + QUEUE_FLAG_NAME(FAIL_IO), + QUEUE_FLAG_NAME(NONROT), + QUEUE_FLAG_NAME(IO_STAT), + QUEUE_FLAG_NAME(DISCARD), + QUEUE_FLAG_NAME(NOXMERGES), + QUEUE_FLAG_NAME(ADD_RANDOM), + QUEUE_FLAG_NAME(SECERASE), + QUEUE_FLAG_NAME(SAME_FORCE), + QUEUE_FLAG_NAME(DEAD), + QUEUE_FLAG_NAME(INIT_DONE), + QUEUE_FLAG_NAME(STABLE_WRITES), + QUEUE_FLAG_NAME(POLL), + QUEUE_FLAG_NAME(WC), + QUEUE_FLAG_NAME(FUA), + QUEUE_FLAG_NAME(DAX), + QUEUE_FLAG_NAME(STATS), + QUEUE_FLAG_NAME(POLL_STATS), + QUEUE_FLAG_NAME(REGISTERED), + QUEUE_FLAG_NAME(SCSI_PASSTHROUGH), + QUEUE_FLAG_NAME(QUIESCED), + QUEUE_FLAG_NAME(PCI_P2PDMA), + QUEUE_FLAG_NAME(ZONE_RESETALL), + QUEUE_FLAG_NAME(RQ_ALLOC_TIME), + QUEUE_FLAG_NAME(HCTX_ACTIVE), + QUEUE_FLAG_NAME(NOWAIT), +}; +#undef QUEUE_FLAG_NAME + +static int queue_state_show(void *data, struct seq_file *m) +{ + struct request_queue *q = data; + + blk_flags_show(m, q->queue_flags, blk_queue_flag_name, + ARRAY_SIZE(blk_queue_flag_name)); + seq_puts(m, "\n"); + return 0; +} + +static ssize_t queue_state_write(void *data, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct request_queue *q = data; + char opbuf[16] = { }, *op; + + /* + * The "state" attribute is removed after blk_cleanup_queue() has called + * blk_mq_free_queue(). Return if QUEUE_FLAG_DEAD has been set to avoid + * triggering a use-after-free. + */ + if (blk_queue_dead(q)) + return -ENOENT; + + if (count >= sizeof(opbuf)) { + pr_err("%s: operation too long\n", __func__); + goto inval; + } + + if (copy_from_user(opbuf, buf, count)) + return -EFAULT; + op = strstrip(opbuf); + if (strcmp(op, "run") == 0) { + blk_mq_run_hw_queues(q, true); + } else if (strcmp(op, "start") == 0) { + blk_mq_start_stopped_hw_queues(q, true); + } else if (strcmp(op, "kick") == 0) { + blk_mq_kick_requeue_list(q); + } else { + pr_err("%s: unsupported operation '%s'\n", __func__, op); +inval: + pr_err("%s: use 'run', 'start' or 'kick'\n", __func__); + return -EINVAL; + } + return count; +} + +static int queue_write_hint_show(void *data, struct seq_file *m) +{ + struct request_queue *q = data; + int i; + + for (i = 0; i < BLK_MAX_WRITE_HINTS; i++) + seq_printf(m, "hint%d: %llu\n", i, q->write_hints[i]); + + return 0; +} + +static ssize_t queue_write_hint_store(void *data, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct request_queue *q = data; + int i; + + for (i = 0; i < BLK_MAX_WRITE_HINTS; i++) + q->write_hints[i] = 0; + + return count; +} + +static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = { + { "poll_stat", 0400, queue_poll_stat_show }, + { "requeue_list", 0400, .seq_ops = &queue_requeue_list_seq_ops }, + { "pm_only", 0600, queue_pm_only_show, NULL }, + { "state", 0600, queue_state_show, queue_state_write }, + { "write_hints", 0600, queue_write_hint_show, queue_write_hint_store }, + { "zone_wlock", 0400, queue_zone_wlock_show, NULL }, + { }, +}; + +#define HCTX_STATE_NAME(name) [BLK_MQ_S_##name] = #name +static const char *const hctx_state_name[] = { + HCTX_STATE_NAME(STOPPED), + HCTX_STATE_NAME(TAG_ACTIVE), + HCTX_STATE_NAME(SCHED_RESTART), + HCTX_STATE_NAME(INACTIVE), +}; +#undef HCTX_STATE_NAME + +static int hctx_state_show(void *data, struct seq_file *m) +{ + struct blk_mq_hw_ctx *hctx = data; + + blk_flags_show(m, hctx->state, hctx_state_name, + ARRAY_SIZE(hctx_state_name)); + seq_puts(m, "\n"); + return 0; +} + +#define BLK_TAG_ALLOC_NAME(name) [BLK_TAG_ALLOC_##name] = #name +static const char *const alloc_policy_name[] = { + BLK_TAG_ALLOC_NAME(FIFO), + BLK_TAG_ALLOC_NAME(RR), +}; +#undef BLK_TAG_ALLOC_NAME + +#define HCTX_FLAG_NAME(name) [ilog2(BLK_MQ_F_##name)] = #name +static const char *const hctx_flag_name[] = { + HCTX_FLAG_NAME(SHOULD_MERGE), + HCTX_FLAG_NAME(TAG_QUEUE_SHARED), + HCTX_FLAG_NAME(BLOCKING), + HCTX_FLAG_NAME(NO_SCHED), + HCTX_FLAG_NAME(STACKING), + HCTX_FLAG_NAME(TAG_HCTX_SHARED), +}; +#undef HCTX_FLAG_NAME + +static int hctx_flags_show(void *data, struct seq_file *m) +{ + struct blk_mq_hw_ctx *hctx = data; + const int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(hctx->flags); + + seq_puts(m, "alloc_policy="); + if (alloc_policy < ARRAY_SIZE(alloc_policy_name) && + alloc_policy_name[alloc_policy]) + seq_puts(m, alloc_policy_name[alloc_policy]); + else + seq_printf(m, "%d", alloc_policy); + seq_puts(m, " "); + blk_flags_show(m, + hctx->flags ^ BLK_ALLOC_POLICY_TO_MQ_FLAG(alloc_policy), + hctx_flag_name, ARRAY_SIZE(hctx_flag_name)); + seq_puts(m, "\n"); + return 0; +} + +#define CMD_FLAG_NAME(name) [__REQ_##name] = #name +static const char *const cmd_flag_name[] = { + CMD_FLAG_NAME(FAILFAST_DEV), + CMD_FLAG_NAME(FAILFAST_TRANSPORT), + CMD_FLAG_NAME(FAILFAST_DRIVER), + CMD_FLAG_NAME(SYNC), + CMD_FLAG_NAME(META), + CMD_FLAG_NAME(PRIO), + CMD_FLAG_NAME(NOMERGE), + CMD_FLAG_NAME(IDLE), + CMD_FLAG_NAME(INTEGRITY), + CMD_FLAG_NAME(FUA), + CMD_FLAG_NAME(PREFLUSH), + CMD_FLAG_NAME(RAHEAD), + CMD_FLAG_NAME(BACKGROUND), + CMD_FLAG_NAME(NOWAIT), + CMD_FLAG_NAME(NOUNMAP), + CMD_FLAG_NAME(HIPRI), +}; +#undef CMD_FLAG_NAME + +#define RQF_NAME(name) [ilog2((__force u32)RQF_##name)] = #name +static const char *const rqf_name[] = { + RQF_NAME(SORTED), + RQF_NAME(STARTED), + RQF_NAME(SOFTBARRIER), + RQF_NAME(FLUSH_SEQ), + RQF_NAME(MIXED_MERGE), + RQF_NAME(MQ_INFLIGHT), + RQF_NAME(DONTPREP), + RQF_NAME(FAILED), + RQF_NAME(QUIET), + RQF_NAME(ELVPRIV), + RQF_NAME(IO_STAT), + RQF_NAME(ALLOCED), + RQF_NAME(PM), + RQF_NAME(HASHED), + RQF_NAME(STATS), + RQF_NAME(SPECIAL_PAYLOAD), + RQF_NAME(ZONE_WRITE_LOCKED), + RQF_NAME(MQ_POLL_SLEPT), +}; +#undef RQF_NAME + +static const char *const blk_mq_rq_state_name_array[] = { + [MQ_RQ_IDLE] = "idle", + [MQ_RQ_IN_FLIGHT] = "in_flight", + [MQ_RQ_COMPLETE] = "complete", +}; + +static const char *blk_mq_rq_state_name(enum mq_rq_state rq_state) +{ + if (WARN_ON_ONCE((unsigned int)rq_state >= + ARRAY_SIZE(blk_mq_rq_state_name_array))) + return "(?)"; + return blk_mq_rq_state_name_array[rq_state]; +} + +int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq) +{ + const struct blk_mq_ops *const mq_ops = rq->q->mq_ops; + const unsigned int op = req_op(rq); + const char *op_str = blk_op_str(op); + + seq_printf(m, "%p {.op=", rq); + if (strcmp(op_str, "UNKNOWN") == 0) + seq_printf(m, "%u", op); + else + seq_printf(m, "%s", op_str); + seq_puts(m, ", .cmd_flags="); + blk_flags_show(m, rq->cmd_flags & ~REQ_OP_MASK, cmd_flag_name, + ARRAY_SIZE(cmd_flag_name)); + seq_puts(m, ", .rq_flags="); + blk_flags_show(m, (__force unsigned int)rq->rq_flags, rqf_name, + ARRAY_SIZE(rqf_name)); + seq_printf(m, ", .state=%s", blk_mq_rq_state_name(blk_mq_rq_state(rq))); + seq_printf(m, ", .tag=%d, .internal_tag=%d", rq->tag, + rq->internal_tag); + if (mq_ops->show_rq) + mq_ops->show_rq(m, rq); + seq_puts(m, "}\n"); + return 0; +} +EXPORT_SYMBOL_GPL(__blk_mq_debugfs_rq_show); + +int blk_mq_debugfs_rq_show(struct seq_file *m, void *v) +{ + return __blk_mq_debugfs_rq_show(m, list_entry_rq(v)); +} +EXPORT_SYMBOL_GPL(blk_mq_debugfs_rq_show); + +static void *hctx_dispatch_start(struct seq_file *m, loff_t *pos) + __acquires(&hctx->lock) +{ + struct blk_mq_hw_ctx *hctx = m->private; + + spin_lock(&hctx->lock); + return seq_list_start(&hctx->dispatch, *pos); +} + +static void *hctx_dispatch_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct blk_mq_hw_ctx *hctx = m->private; + + return seq_list_next(v, &hctx->dispatch, pos); +} + +static void hctx_dispatch_stop(struct seq_file *m, void *v) + __releases(&hctx->lock) +{ + struct blk_mq_hw_ctx *hctx = m->private; + + spin_unlock(&hctx->lock); +} + +static const struct seq_operations hctx_dispatch_seq_ops = { + .start = hctx_dispatch_start, + .next = hctx_dispatch_next, + .stop = hctx_dispatch_stop, + .show = blk_mq_debugfs_rq_show, +}; + +struct show_busy_params { + struct seq_file *m; + struct blk_mq_hw_ctx *hctx; +}; + +/* + * Note: the state of a request may change while this function is in progress, + * e.g. due to a concurrent blk_mq_finish_request() call. Returns true to + * keep iterating requests. + */ +static bool hctx_show_busy_rq(struct request *rq, void *data, bool reserved) +{ + const struct show_busy_params *params = data; + + if (rq->mq_hctx == params->hctx) + __blk_mq_debugfs_rq_show(params->m, rq); + + return true; +} + +static int hctx_busy_show(void *data, struct seq_file *m) +{ + struct blk_mq_hw_ctx *hctx = data; + struct show_busy_params params = { .m = m, .hctx = hctx }; + + blk_mq_tagset_busy_iter(hctx->queue->tag_set, hctx_show_busy_rq, + ¶ms); + + return 0; +} + +static const char *const hctx_types[] = { + [HCTX_TYPE_DEFAULT] = "default", + [HCTX_TYPE_READ] = "read", + [HCTX_TYPE_POLL] = "poll", +}; + +static int hctx_type_show(void *data, struct seq_file *m) +{ + struct blk_mq_hw_ctx *hctx = data; + + BUILD_BUG_ON(ARRAY_SIZE(hctx_types) != HCTX_MAX_TYPES); + seq_printf(m, "%s\n", hctx_types[hctx->type]); + return 0; +} + +static int hctx_ctx_map_show(void *data, struct seq_file *m) +{ + struct blk_mq_hw_ctx *hctx = data; + + sbitmap_bitmap_show(&hctx->ctx_map, m); + return 0; +} + +static void blk_mq_debugfs_tags_show(struct seq_file *m, + struct blk_mq_tags *tags) +{ + seq_printf(m, "nr_tags=%u\n", tags->nr_tags); + seq_printf(m, "nr_reserved_tags=%u\n", tags->nr_reserved_tags); + seq_printf(m, "active_queues=%d\n", + atomic_read(&tags->active_queues)); + + seq_puts(m, "\nbitmap_tags:\n"); + sbitmap_queue_show(tags->bitmap_tags, m); + + if (tags->nr_reserved_tags) { + seq_puts(m, "\nbreserved_tags:\n"); + sbitmap_queue_show(tags->breserved_tags, m); + } +} + +static int hctx_tags_show(void *data, struct seq_file *m) +{ + struct blk_mq_hw_ctx *hctx = data; + struct request_queue *q = hctx->queue; + int res; + + res = mutex_lock_interruptible(&q->sysfs_lock); + if (res) + goto out; + if (hctx->tags) + blk_mq_debugfs_tags_show(m, hctx->tags); + mutex_unlock(&q->sysfs_lock); + +out: + return res; +} + +static int hctx_tags_bitmap_show(void *data, struct seq_file *m) +{ + struct blk_mq_hw_ctx *hctx = data; + struct request_queue *q = hctx->queue; + int res; + + res = mutex_lock_interruptible(&q->sysfs_lock); + if (res) + goto out; + if (hctx->tags) + sbitmap_bitmap_show(&hctx->tags->bitmap_tags->sb, m); + mutex_unlock(&q->sysfs_lock); + +out: + return res; +} + +static int hctx_sched_tags_show(void *data, struct seq_file *m) +{ + struct blk_mq_hw_ctx *hctx = data; + struct request_queue *q = hctx->queue; + int res; + + res = mutex_lock_interruptible(&q->sysfs_lock); + if (res) + goto out; + if (hctx->sched_tags) + blk_mq_debugfs_tags_show(m, hctx->sched_tags); + mutex_unlock(&q->sysfs_lock); + +out: + return res; +} + +static int hctx_sched_tags_bitmap_show(void *data, struct seq_file *m) +{ + struct blk_mq_hw_ctx *hctx = data; + struct request_queue *q = hctx->queue; + int res; + + res = mutex_lock_interruptible(&q->sysfs_lock); + if (res) + goto out; + if (hctx->sched_tags) + sbitmap_bitmap_show(&hctx->sched_tags->bitmap_tags->sb, m); + mutex_unlock(&q->sysfs_lock); + +out: + return res; +} + +static int hctx_io_poll_show(void *data, struct seq_file *m) +{ + struct blk_mq_hw_ctx *hctx = data; + + seq_printf(m, "considered=%lu\n", hctx->poll_considered); + seq_printf(m, "invoked=%lu\n", hctx->poll_invoked); + seq_printf(m, "success=%lu\n", hctx->poll_success); + return 0; +} + +static ssize_t hctx_io_poll_write(void *data, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct blk_mq_hw_ctx *hctx = data; + + hctx->poll_considered = hctx->poll_invoked = hctx->poll_success = 0; + return count; +} + +static int hctx_dispatched_show(void *data, struct seq_file *m) +{ + struct blk_mq_hw_ctx *hctx = data; + int i; + + seq_printf(m, "%8u\t%lu\n", 0U, hctx->dispatched[0]); + + for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER - 1; i++) { + unsigned int d = 1U << (i - 1); + + seq_printf(m, "%8u\t%lu\n", d, hctx->dispatched[i]); + } + + seq_printf(m, "%8u+\t%lu\n", 1U << (i - 1), hctx->dispatched[i]); + return 0; +} + +static ssize_t hctx_dispatched_write(void *data, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct blk_mq_hw_ctx *hctx = data; + int i; + + for (i = 0; i < BLK_MQ_MAX_DISPATCH_ORDER; i++) + hctx->dispatched[i] = 0; + return count; +} + +static int hctx_queued_show(void *data, struct seq_file *m) +{ + struct blk_mq_hw_ctx *hctx = data; + + seq_printf(m, "%lu\n", hctx->queued); + return 0; +} + +static ssize_t hctx_queued_write(void *data, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct blk_mq_hw_ctx *hctx = data; + + hctx->queued = 0; + return count; +} + +static int hctx_run_show(void *data, struct seq_file *m) +{ + struct blk_mq_hw_ctx *hctx = data; + + seq_printf(m, "%lu\n", hctx->run); + return 0; +} + +static ssize_t hctx_run_write(void *data, const char __user *buf, size_t count, + loff_t *ppos) +{ + struct blk_mq_hw_ctx *hctx = data; + + hctx->run = 0; + return count; +} + +static int hctx_active_show(void *data, struct seq_file *m) +{ + struct blk_mq_hw_ctx *hctx = data; + + seq_printf(m, "%d\n", atomic_read(&hctx->nr_active)); + return 0; +} + +static int hctx_dispatch_busy_show(void *data, struct seq_file *m) +{ + struct blk_mq_hw_ctx *hctx = data; + + seq_printf(m, "%u\n", hctx->dispatch_busy); + return 0; +} + +#define CTX_RQ_SEQ_OPS(name, type) \ +static void *ctx_##name##_rq_list_start(struct seq_file *m, loff_t *pos) \ + __acquires(&ctx->lock) \ +{ \ + struct blk_mq_ctx *ctx = m->private; \ + \ + spin_lock(&ctx->lock); \ + return seq_list_start(&ctx->rq_lists[type], *pos); \ +} \ + \ +static void *ctx_##name##_rq_list_next(struct seq_file *m, void *v, \ + loff_t *pos) \ +{ \ + struct blk_mq_ctx *ctx = m->private; \ + \ + return seq_list_next(v, &ctx->rq_lists[type], pos); \ +} \ + \ +static void ctx_##name##_rq_list_stop(struct seq_file *m, void *v) \ + __releases(&ctx->lock) \ +{ \ + struct blk_mq_ctx *ctx = m->private; \ + \ + spin_unlock(&ctx->lock); \ +} \ + \ +static const struct seq_operations ctx_##name##_rq_list_seq_ops = { \ + .start = ctx_##name##_rq_list_start, \ + .next = ctx_##name##_rq_list_next, \ + .stop = ctx_##name##_rq_list_stop, \ + .show = blk_mq_debugfs_rq_show, \ +} + +CTX_RQ_SEQ_OPS(default, HCTX_TYPE_DEFAULT); +CTX_RQ_SEQ_OPS(read, HCTX_TYPE_READ); +CTX_RQ_SEQ_OPS(poll, HCTX_TYPE_POLL); + +static int ctx_dispatched_show(void *data, struct seq_file *m) +{ + struct blk_mq_ctx *ctx = data; + + seq_printf(m, "%lu %lu\n", ctx->rq_dispatched[1], ctx->rq_dispatched[0]); + return 0; +} + +static ssize_t ctx_dispatched_write(void *data, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct blk_mq_ctx *ctx = data; + + ctx->rq_dispatched[0] = ctx->rq_dispatched[1] = 0; + return count; +} + +static int ctx_merged_show(void *data, struct seq_file *m) +{ + struct blk_mq_ctx *ctx = data; + + seq_printf(m, "%lu\n", ctx->rq_merged); + return 0; +} + +static ssize_t ctx_merged_write(void *data, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct blk_mq_ctx *ctx = data; + + ctx->rq_merged = 0; + return count; +} + +static int ctx_completed_show(void *data, struct seq_file *m) +{ + struct blk_mq_ctx *ctx = data; + + seq_printf(m, "%lu %lu\n", ctx->rq_completed[1], ctx->rq_completed[0]); + return 0; +} + +static ssize_t ctx_completed_write(void *data, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct blk_mq_ctx *ctx = data; + + ctx->rq_completed[0] = ctx->rq_completed[1] = 0; + return count; +} + +static int blk_mq_debugfs_show(struct seq_file *m, void *v) +{ + const struct blk_mq_debugfs_attr *attr = m->private; + void *data = d_inode(m->file->f_path.dentry->d_parent)->i_private; + + return attr->show(data, m); +} + +static ssize_t blk_mq_debugfs_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct seq_file *m = file->private_data; + const struct blk_mq_debugfs_attr *attr = m->private; + void *data = d_inode(file->f_path.dentry->d_parent)->i_private; + + /* + * Attributes that only implement .seq_ops are read-only and 'attr' is + * the same with 'data' in this case. + */ + if (attr == data || !attr->write) + return -EPERM; + + return attr->write(data, buf, count, ppos); +} + +static int blk_mq_debugfs_open(struct inode *inode, struct file *file) +{ + const struct blk_mq_debugfs_attr *attr = inode->i_private; + void *data = d_inode(file->f_path.dentry->d_parent)->i_private; + struct seq_file *m; + int ret; + + if (attr->seq_ops) { + ret = seq_open(file, attr->seq_ops); + if (!ret) { + m = file->private_data; + m->private = data; + } + return ret; + } + + if (WARN_ON_ONCE(!attr->show)) + return -EPERM; + + return single_open(file, blk_mq_debugfs_show, inode->i_private); +} + +static int blk_mq_debugfs_release(struct inode *inode, struct file *file) +{ + const struct blk_mq_debugfs_attr *attr = inode->i_private; + + if (attr->show) + return single_release(inode, file); + + return seq_release(inode, file); +} + +static const struct file_operations blk_mq_debugfs_fops = { + .open = blk_mq_debugfs_open, + .read = seq_read, + .write = blk_mq_debugfs_write, + .llseek = seq_lseek, + .release = blk_mq_debugfs_release, +}; + +static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = { + {"state", 0400, hctx_state_show}, + {"flags", 0400, hctx_flags_show}, + {"dispatch", 0400, .seq_ops = &hctx_dispatch_seq_ops}, + {"busy", 0400, hctx_busy_show}, + {"ctx_map", 0400, hctx_ctx_map_show}, + {"tags", 0400, hctx_tags_show}, + {"tags_bitmap", 0400, hctx_tags_bitmap_show}, + {"sched_tags", 0400, hctx_sched_tags_show}, + {"sched_tags_bitmap", 0400, hctx_sched_tags_bitmap_show}, + {"io_poll", 0600, hctx_io_poll_show, hctx_io_poll_write}, + {"dispatched", 0600, hctx_dispatched_show, hctx_dispatched_write}, + {"queued", 0600, hctx_queued_show, hctx_queued_write}, + {"run", 0600, hctx_run_show, hctx_run_write}, + {"active", 0400, hctx_active_show}, + {"dispatch_busy", 0400, hctx_dispatch_busy_show}, + {"type", 0400, hctx_type_show}, + {}, +}; + +static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = { + {"default_rq_list", 0400, .seq_ops = &ctx_default_rq_list_seq_ops}, + {"read_rq_list", 0400, .seq_ops = &ctx_read_rq_list_seq_ops}, + {"poll_rq_list", 0400, .seq_ops = &ctx_poll_rq_list_seq_ops}, + {"dispatched", 0600, ctx_dispatched_show, ctx_dispatched_write}, + {"merged", 0600, ctx_merged_show, ctx_merged_write}, + {"completed", 0600, ctx_completed_show, ctx_completed_write}, + {}, +}; + +static void debugfs_create_files(struct dentry *parent, void *data, + const struct blk_mq_debugfs_attr *attr) +{ + if (IS_ERR_OR_NULL(parent)) + return; + + d_inode(parent)->i_private = data; + + for (; attr->name; attr++) + debugfs_create_file(attr->name, attr->mode, parent, + (void *)attr, &blk_mq_debugfs_fops); +} + +void blk_mq_debugfs_register(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + int i; + + debugfs_create_files(q->debugfs_dir, q, blk_mq_debugfs_queue_attrs); + + /* + * blk_mq_init_sched() attempted to do this already, but q->debugfs_dir + * didn't exist yet (because we don't know what to name the directory + * until the queue is registered to a gendisk). + */ + if (q->elevator && !q->sched_debugfs_dir) + blk_mq_debugfs_register_sched(q); + + /* Similarly, blk_mq_init_hctx() couldn't do this previously. */ + queue_for_each_hw_ctx(q, hctx, i) { + if (!hctx->debugfs_dir) + blk_mq_debugfs_register_hctx(q, hctx); + if (q->elevator && !hctx->sched_debugfs_dir) + blk_mq_debugfs_register_sched_hctx(q, hctx); + } + + if (q->rq_qos) { + struct rq_qos *rqos = q->rq_qos; + + while (rqos) { + blk_mq_debugfs_register_rqos(rqos); + rqos = rqos->next; + } + } +} + +void blk_mq_debugfs_unregister(struct request_queue *q) +{ + q->sched_debugfs_dir = NULL; +} + +static void blk_mq_debugfs_register_ctx(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx) +{ + struct dentry *ctx_dir; + char name[20]; + + snprintf(name, sizeof(name), "cpu%u", ctx->cpu); + ctx_dir = debugfs_create_dir(name, hctx->debugfs_dir); + + debugfs_create_files(ctx_dir, ctx, blk_mq_debugfs_ctx_attrs); +} + +void blk_mq_debugfs_register_hctx(struct request_queue *q, + struct blk_mq_hw_ctx *hctx) +{ + struct blk_mq_ctx *ctx; + char name[20]; + int i; + + if (!q->debugfs_dir) + return; + + snprintf(name, sizeof(name), "hctx%u", hctx->queue_num); + hctx->debugfs_dir = debugfs_create_dir(name, q->debugfs_dir); + + debugfs_create_files(hctx->debugfs_dir, hctx, blk_mq_debugfs_hctx_attrs); + + hctx_for_each_ctx(hctx, ctx, i) + blk_mq_debugfs_register_ctx(hctx, ctx); +} + +void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx) +{ + debugfs_remove_recursive(hctx->debugfs_dir); + hctx->sched_debugfs_dir = NULL; + hctx->debugfs_dir = NULL; +} + +void blk_mq_debugfs_register_hctxs(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + int i; + + queue_for_each_hw_ctx(q, hctx, i) + blk_mq_debugfs_register_hctx(q, hctx); +} + +void blk_mq_debugfs_unregister_hctxs(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + int i; + + queue_for_each_hw_ctx(q, hctx, i) + blk_mq_debugfs_unregister_hctx(hctx); +} + +void blk_mq_debugfs_register_sched(struct request_queue *q) +{ + struct elevator_type *e = q->elevator->type; + + /* + * If the parent directory has not been created yet, return, we will be + * called again later on and the directory/files will be created then. + */ + if (!q->debugfs_dir) + return; + + if (!e->queue_debugfs_attrs) + return; + + q->sched_debugfs_dir = debugfs_create_dir("sched", q->debugfs_dir); + + debugfs_create_files(q->sched_debugfs_dir, q, e->queue_debugfs_attrs); +} + +void blk_mq_debugfs_unregister_sched(struct request_queue *q) +{ + debugfs_remove_recursive(q->sched_debugfs_dir); + q->sched_debugfs_dir = NULL; +} + +void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos) +{ + debugfs_remove_recursive(rqos->debugfs_dir); + rqos->debugfs_dir = NULL; +} + +void blk_mq_debugfs_register_rqos(struct rq_qos *rqos) +{ + struct request_queue *q = rqos->q; + const char *dir_name = rq_qos_id_to_name(rqos->id); + + if (rqos->debugfs_dir || !rqos->ops->debugfs_attrs) + return; + + if (!q->rqos_debugfs_dir) + q->rqos_debugfs_dir = debugfs_create_dir("rqos", + q->debugfs_dir); + + rqos->debugfs_dir = debugfs_create_dir(dir_name, + rqos->q->rqos_debugfs_dir); + + debugfs_create_files(rqos->debugfs_dir, rqos, rqos->ops->debugfs_attrs); +} + +void blk_mq_debugfs_unregister_queue_rqos(struct request_queue *q) +{ + debugfs_remove_recursive(q->rqos_debugfs_dir); + q->rqos_debugfs_dir = NULL; +} + +void blk_mq_debugfs_register_sched_hctx(struct request_queue *q, + struct blk_mq_hw_ctx *hctx) +{ + struct elevator_type *e = q->elevator->type; + + if (!e->hctx_debugfs_attrs) + return; + + hctx->sched_debugfs_dir = debugfs_create_dir("sched", + hctx->debugfs_dir); + debugfs_create_files(hctx->sched_debugfs_dir, hctx, + e->hctx_debugfs_attrs); +} + +void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx) +{ + debugfs_remove_recursive(hctx->sched_debugfs_dir); + hctx->sched_debugfs_dir = NULL; +} diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h new file mode 100644 index 000000000..a68aa6041 --- /dev/null +++ b/block/blk-mq-debugfs.h @@ -0,0 +1,103 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef INT_BLK_MQ_DEBUGFS_H +#define INT_BLK_MQ_DEBUGFS_H + +#ifdef CONFIG_BLK_DEBUG_FS + +#include + +struct blk_mq_debugfs_attr { + const char *name; + umode_t mode; + int (*show)(void *, struct seq_file *); + ssize_t (*write)(void *, const char __user *, size_t, loff_t *); + /* Set either .show or .seq_ops. */ + const struct seq_operations *seq_ops; +}; + +int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq); +int blk_mq_debugfs_rq_show(struct seq_file *m, void *v); + +void blk_mq_debugfs_register(struct request_queue *q); +void blk_mq_debugfs_unregister(struct request_queue *q); +void blk_mq_debugfs_register_hctx(struct request_queue *q, + struct blk_mq_hw_ctx *hctx); +void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx); +void blk_mq_debugfs_register_hctxs(struct request_queue *q); +void blk_mq_debugfs_unregister_hctxs(struct request_queue *q); + +void blk_mq_debugfs_register_sched(struct request_queue *q); +void blk_mq_debugfs_unregister_sched(struct request_queue *q); +void blk_mq_debugfs_register_sched_hctx(struct request_queue *q, + struct blk_mq_hw_ctx *hctx); +void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx); + +void blk_mq_debugfs_register_rqos(struct rq_qos *rqos); +void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos); +void blk_mq_debugfs_unregister_queue_rqos(struct request_queue *q); +#else +static inline void blk_mq_debugfs_register(struct request_queue *q) +{ +} + +static inline void blk_mq_debugfs_unregister(struct request_queue *q) +{ +} + +static inline void blk_mq_debugfs_register_hctx(struct request_queue *q, + struct blk_mq_hw_ctx *hctx) +{ +} + +static inline void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx) +{ +} + +static inline void blk_mq_debugfs_register_hctxs(struct request_queue *q) +{ +} + +static inline void blk_mq_debugfs_unregister_hctxs(struct request_queue *q) +{ +} + +static inline void blk_mq_debugfs_register_sched(struct request_queue *q) +{ +} + +static inline void blk_mq_debugfs_unregister_sched(struct request_queue *q) +{ +} + +static inline void blk_mq_debugfs_register_sched_hctx(struct request_queue *q, + struct blk_mq_hw_ctx *hctx) +{ +} + +static inline void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx) +{ +} + +static inline void blk_mq_debugfs_register_rqos(struct rq_qos *rqos) +{ +} + +static inline void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos) +{ +} + +static inline void blk_mq_debugfs_unregister_queue_rqos(struct request_queue *q) +{ +} +#endif + +#ifdef CONFIG_BLK_DEBUG_FS_ZONED +int queue_zone_wlock_show(void *data, struct seq_file *m); +#else +static inline int queue_zone_wlock_show(void *data, struct seq_file *m) +{ + return 0; +} +#endif + +#endif diff --git a/block/blk-mq-pci.c b/block/blk-mq-pci.c new file mode 100644 index 000000000..b595a94c4 --- /dev/null +++ b/block/blk-mq-pci.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2016 Christoph Hellwig. + */ +#include +#include +#include +#include +#include +#include + +#include "blk-mq.h" + +/** + * blk_mq_pci_map_queues - provide a default queue mapping for PCI device + * @qmap: CPU to hardware queue map. + * @pdev: PCI device associated with @set. + * @offset: Offset to use for the pci irq vector + * + * This function assumes the PCI device @pdev has at least as many available + * interrupt vectors as @set has queues. It will then query the vector + * corresponding to each queue for it's affinity mask and built queue mapping + * that maps a queue to the CPUs that have irq affinity for the corresponding + * vector. + */ +int blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev, + int offset) +{ + const struct cpumask *mask; + unsigned int queue, cpu; + + for (queue = 0; queue < qmap->nr_queues; queue++) { + mask = pci_irq_get_affinity(pdev, queue + offset); + if (!mask) + goto fallback; + + for_each_cpu(cpu, mask) + qmap->mq_map[cpu] = qmap->queue_offset + queue; + } + + return 0; + +fallback: + WARN_ON_ONCE(qmap->nr_queues > 1); + blk_mq_clear_mq_map(qmap); + return 0; +} +EXPORT_SYMBOL_GPL(blk_mq_pci_map_queues); diff --git a/block/blk-mq-rdma.c b/block/blk-mq-rdma.c new file mode 100644 index 000000000..14f968e58 --- /dev/null +++ b/block/blk-mq-rdma.c @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2017 Sagi Grimberg. + */ +#include +#include +#include + +/** + * blk_mq_rdma_map_queues - provide a default queue mapping for rdma device + * @map: CPU to hardware queue map. + * @dev: rdma device to provide a mapping for. + * @first_vec: first interrupt vectors to use for queues (usually 0) + * + * This function assumes the rdma device @dev has at least as many available + * interrupt vetors as @set has queues. It will then query it's affinity mask + * and built queue mapping that maps a queue to the CPUs that have irq affinity + * for the corresponding vector. + * + * In case either the driver passed a @dev with less vectors than + * @set->nr_hw_queues, or @dev does not provide an affinity mask for a + * vector, we fallback to the naive mapping. + */ +int blk_mq_rdma_map_queues(struct blk_mq_queue_map *map, + struct ib_device *dev, int first_vec) +{ + const struct cpumask *mask; + unsigned int queue, cpu; + + for (queue = 0; queue < map->nr_queues; queue++) { + mask = ib_get_vector_affinity(dev, first_vec + queue); + if (!mask) + goto fallback; + + for_each_cpu(cpu, mask) + map->mq_map[cpu] = map->queue_offset + queue; + } + + return 0; + +fallback: + return blk_mq_map_queues(map); +} +EXPORT_SYMBOL_GPL(blk_mq_rdma_map_queues); diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c new file mode 100644 index 000000000..7858c5a35 --- /dev/null +++ b/block/blk-mq-sched.c @@ -0,0 +1,654 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * blk-mq scheduling framework + * + * Copyright (C) 2016 Jens Axboe + */ +#include +#include +#include +#include + +#include + +#include "blk.h" +#include "blk-mq.h" +#include "blk-mq-debugfs.h" +#include "blk-mq-sched.h" +#include "blk-mq-tag.h" +#include "blk-wbt.h" + +void blk_mq_sched_assign_ioc(struct request *rq) +{ + struct request_queue *q = rq->q; + struct io_context *ioc; + struct io_cq *icq; + + /* + * May not have an IO context if it's a passthrough request + */ + ioc = current->io_context; + if (!ioc) + return; + + spin_lock_irq(&q->queue_lock); + icq = ioc_lookup_icq(ioc, q); + spin_unlock_irq(&q->queue_lock); + + if (!icq) { + icq = ioc_create_icq(ioc, q, GFP_ATOMIC); + if (!icq) + return; + } + get_io_context(icq->ioc); + rq->elv.icq = icq; +} + +/* + * Mark a hardware queue as needing a restart. + */ +void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx) +{ + if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) + return; + + set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); +} +EXPORT_SYMBOL_GPL(blk_mq_sched_mark_restart_hctx); + +void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) +{ + if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) + return; + clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); + + /* + * Order clearing SCHED_RESTART and list_empty_careful(&hctx->dispatch) + * in blk_mq_run_hw_queue(). Its pair is the barrier in + * blk_mq_dispatch_rq_list(). So dispatch code won't see SCHED_RESTART, + * meantime new request added to hctx->dispatch is missed to check in + * blk_mq_run_hw_queue(). + */ + smp_mb(); + + blk_mq_run_hw_queue(hctx, true); +} + +static int sched_rq_cmp(void *priv, const struct list_head *a, + const struct list_head *b) +{ + struct request *rqa = container_of(a, struct request, queuelist); + struct request *rqb = container_of(b, struct request, queuelist); + + return rqa->mq_hctx > rqb->mq_hctx; +} + +static bool blk_mq_dispatch_hctx_list(struct list_head *rq_list) +{ + struct blk_mq_hw_ctx *hctx = + list_first_entry(rq_list, struct request, queuelist)->mq_hctx; + struct request *rq; + LIST_HEAD(hctx_list); + unsigned int count = 0; + + list_for_each_entry(rq, rq_list, queuelist) { + if (rq->mq_hctx != hctx) { + list_cut_before(&hctx_list, rq_list, &rq->queuelist); + goto dispatch; + } + count++; + } + list_splice_tail_init(rq_list, &hctx_list); + +dispatch: + return blk_mq_dispatch_rq_list(hctx, &hctx_list, count); +} + +#define BLK_MQ_BUDGET_DELAY 3 /* ms units */ + +/* + * Only SCSI implements .get_budget and .put_budget, and SCSI restarts + * its queue by itself in its completion handler, so we don't need to + * restart queue if .get_budget() fails to get the budget. + * + * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to + * be run again. This is necessary to avoid starving flushes. + */ +static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) +{ + struct request_queue *q = hctx->queue; + struct elevator_queue *e = q->elevator; + bool multi_hctxs = false, run_queue = false; + bool dispatched = false, busy = false; + unsigned int max_dispatch; + LIST_HEAD(rq_list); + int count = 0; + + if (hctx->dispatch_busy) + max_dispatch = 1; + else + max_dispatch = hctx->queue->nr_requests; + + do { + struct request *rq; + + if (e->type->ops.has_work && !e->type->ops.has_work(hctx)) + break; + + if (!list_empty_careful(&hctx->dispatch)) { + busy = true; + break; + } + + if (!blk_mq_get_dispatch_budget(q)) + break; + + rq = e->type->ops.dispatch_request(hctx); + if (!rq) { + blk_mq_put_dispatch_budget(q); + /* + * We're releasing without dispatching. Holding the + * budget could have blocked any "hctx"s with the + * same queue and if we didn't dispatch then there's + * no guarantee anyone will kick the queue. Kick it + * ourselves. + */ + run_queue = true; + break; + } + + /* + * Now this rq owns the budget which has to be released + * if this rq won't be queued to driver via .queue_rq() + * in blk_mq_dispatch_rq_list(). + */ + list_add_tail(&rq->queuelist, &rq_list); + if (rq->mq_hctx != hctx) + multi_hctxs = true; + } while (++count < max_dispatch); + + if (!count) { + if (run_queue) + blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY); + } else if (multi_hctxs) { + /* + * Requests from different hctx may be dequeued from some + * schedulers, such as bfq and deadline. + * + * Sort the requests in the list according to their hctx, + * dispatch batching requests from same hctx at a time. + */ + list_sort(NULL, &rq_list, sched_rq_cmp); + do { + dispatched |= blk_mq_dispatch_hctx_list(&rq_list); + } while (!list_empty(&rq_list)); + } else { + dispatched = blk_mq_dispatch_rq_list(hctx, &rq_list, count); + } + + if (busy) + return -EAGAIN; + return !!dispatched; +} + +static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) +{ + unsigned long end = jiffies + HZ; + int ret; + + do { + ret = __blk_mq_do_dispatch_sched(hctx); + if (ret != 1) + break; + if (need_resched() || time_is_before_jiffies(end)) { + blk_mq_delay_run_hw_queue(hctx, 0); + break; + } + } while (1); + + return ret; +} + +static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx) +{ + unsigned short idx = ctx->index_hw[hctx->type]; + + if (++idx == hctx->nr_ctx) + idx = 0; + + return hctx->ctxs[idx]; +} + +/* + * Only SCSI implements .get_budget and .put_budget, and SCSI restarts + * its queue by itself in its completion handler, so we don't need to + * restart queue if .get_budget() fails to get the budget. + * + * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to + * be run again. This is necessary to avoid starving flushes. + */ +static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) +{ + struct request_queue *q = hctx->queue; + LIST_HEAD(rq_list); + struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from); + int ret = 0; + struct request *rq; + + do { + if (!list_empty_careful(&hctx->dispatch)) { + ret = -EAGAIN; + break; + } + + if (!sbitmap_any_bit_set(&hctx->ctx_map)) + break; + + if (!blk_mq_get_dispatch_budget(q)) + break; + + rq = blk_mq_dequeue_from_ctx(hctx, ctx); + if (!rq) { + blk_mq_put_dispatch_budget(q); + /* + * We're releasing without dispatching. Holding the + * budget could have blocked any "hctx"s with the + * same queue and if we didn't dispatch then there's + * no guarantee anyone will kick the queue. Kick it + * ourselves. + */ + blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY); + break; + } + + /* + * Now this rq owns the budget which has to be released + * if this rq won't be queued to driver via .queue_rq() + * in blk_mq_dispatch_rq_list(). + */ + list_add(&rq->queuelist, &rq_list); + + /* round robin for fair dispatch */ + ctx = blk_mq_next_ctx(hctx, rq->mq_ctx); + + } while (blk_mq_dispatch_rq_list(rq->mq_hctx, &rq_list, 1)); + + WRITE_ONCE(hctx->dispatch_from, ctx); + return ret; +} + +static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) +{ + struct request_queue *q = hctx->queue; + struct elevator_queue *e = q->elevator; + const bool has_sched_dispatch = e && e->type->ops.dispatch_request; + int ret = 0; + LIST_HEAD(rq_list); + + /* + * If we have previous entries on our dispatch list, grab them first for + * more fair dispatch. + */ + if (!list_empty_careful(&hctx->dispatch)) { + spin_lock(&hctx->lock); + if (!list_empty(&hctx->dispatch)) + list_splice_init(&hctx->dispatch, &rq_list); + spin_unlock(&hctx->lock); + } + + /* + * Only ask the scheduler for requests, if we didn't have residual + * requests from the dispatch list. This is to avoid the case where + * we only ever dispatch a fraction of the requests available because + * of low device queue depth. Once we pull requests out of the IO + * scheduler, we can no longer merge or sort them. So it's best to + * leave them there for as long as we can. Mark the hw queue as + * needing a restart in that case. + * + * We want to dispatch from the scheduler if there was nothing + * on the dispatch list or we were able to dispatch from the + * dispatch list. + */ + if (!list_empty(&rq_list)) { + blk_mq_sched_mark_restart_hctx(hctx); + if (blk_mq_dispatch_rq_list(hctx, &rq_list, 0)) { + if (has_sched_dispatch) + ret = blk_mq_do_dispatch_sched(hctx); + else + ret = blk_mq_do_dispatch_ctx(hctx); + } + } else if (has_sched_dispatch) { + ret = blk_mq_do_dispatch_sched(hctx); + } else if (hctx->dispatch_busy) { + /* dequeue request one by one from sw queue if queue is busy */ + ret = blk_mq_do_dispatch_ctx(hctx); + } else { + blk_mq_flush_busy_ctxs(hctx, &rq_list); + blk_mq_dispatch_rq_list(hctx, &rq_list, 0); + } + + return ret; +} + +void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) +{ + struct request_queue *q = hctx->queue; + + /* RCU or SRCU read lock is needed before checking quiesced flag */ + if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q))) + return; + + hctx->run++; + + /* + * A return of -EAGAIN is an indication that hctx->dispatch is not + * empty and we must run again in order to avoid starving flushes. + */ + if (__blk_mq_sched_dispatch_requests(hctx) == -EAGAIN) { + if (__blk_mq_sched_dispatch_requests(hctx) == -EAGAIN) + blk_mq_run_hw_queue(hctx, true); + } +} + +bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, + unsigned int nr_segs) +{ + struct elevator_queue *e = q->elevator; + struct blk_mq_ctx *ctx; + struct blk_mq_hw_ctx *hctx; + bool ret = false; + enum hctx_type type; + + if (e && e->type->ops.bio_merge) + return e->type->ops.bio_merge(q, bio, nr_segs); + + ctx = blk_mq_get_ctx(q); + hctx = blk_mq_map_queue(q, bio->bi_opf, ctx); + type = hctx->type; + if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE) || + list_empty_careful(&ctx->rq_lists[type])) + return false; + + /* default per sw-queue merge */ + spin_lock(&ctx->lock); + /* + * Reverse check our software queue for entries that we could + * potentially merge with. Currently includes a hand-wavy stop + * count of 8, to not spend too much time checking for merges. + */ + if (blk_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs)) { + ctx->rq_merged++; + ret = true; + } + + spin_unlock(&ctx->lock); + + return ret; +} + +bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq) +{ + return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq); +} +EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge); + +void blk_mq_sched_request_inserted(struct request *rq) +{ + trace_block_rq_insert(rq); +} +EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted); + +static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, + bool has_sched, + struct request *rq) +{ + /* + * dispatch flush and passthrough rq directly + * + * passthrough request has to be added to hctx->dispatch directly. + * For some reason, device may be in one situation which can't + * handle FS request, so STS_RESOURCE is always returned and the + * FS request will be added to hctx->dispatch. However passthrough + * request may be required at that time for fixing the problem. If + * passthrough request is added to scheduler queue, there isn't any + * chance to dispatch it given we prioritize requests in hctx->dispatch. + */ + if ((rq->rq_flags & RQF_FLUSH_SEQ) || blk_rq_is_passthrough(rq)) + return true; + + if (has_sched) + rq->rq_flags |= RQF_SORTED; + + return false; +} + +void blk_mq_sched_insert_request(struct request *rq, bool at_head, + bool run_queue, bool async) +{ + struct request_queue *q = rq->q; + struct elevator_queue *e = q->elevator; + struct blk_mq_ctx *ctx = rq->mq_ctx; + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; + + WARN_ON(e && (rq->tag != BLK_MQ_NO_TAG)); + + if (blk_mq_sched_bypass_insert(hctx, !!e, rq)) { + /* + * Firstly normal IO request is inserted to scheduler queue or + * sw queue, meantime we add flush request to dispatch queue( + * hctx->dispatch) directly and there is at most one in-flight + * flush request for each hw queue, so it doesn't matter to add + * flush request to tail or front of the dispatch queue. + * + * Secondly in case of NCQ, flush request belongs to non-NCQ + * command, and queueing it will fail when there is any + * in-flight normal IO request(NCQ command). When adding flush + * rq to the front of hctx->dispatch, it is easier to introduce + * extra time to flush rq's latency because of S_SCHED_RESTART + * compared with adding to the tail of dispatch queue, then + * chance of flush merge is increased, and less flush requests + * will be issued to controller. It is observed that ~10% time + * is saved in blktests block/004 on disk attached to AHCI/NCQ + * drive when adding flush rq to the front of hctx->dispatch. + * + * Simply queue flush rq to the front of hctx->dispatch so that + * intensive flush workloads can benefit in case of NCQ HW. + */ + at_head = (rq->rq_flags & RQF_FLUSH_SEQ) ? true : at_head; + blk_mq_request_bypass_insert(rq, at_head, false); + goto run; + } + + if (e && e->type->ops.insert_requests) { + LIST_HEAD(list); + + list_add(&rq->queuelist, &list); + e->type->ops.insert_requests(hctx, &list, at_head); + } else { + spin_lock(&ctx->lock); + __blk_mq_insert_request(hctx, rq, at_head); + spin_unlock(&ctx->lock); + } + +run: + if (run_queue) + blk_mq_run_hw_queue(hctx, async); +} + +void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx, + struct list_head *list, bool run_queue_async) +{ + struct elevator_queue *e; + struct request_queue *q = hctx->queue; + + /* + * blk_mq_sched_insert_requests() is called from flush plug + * context only, and hold one usage counter to prevent queue + * from being released. + */ + percpu_ref_get(&q->q_usage_counter); + + e = hctx->queue->elevator; + if (e && e->type->ops.insert_requests) + e->type->ops.insert_requests(hctx, list, false); + else { + /* + * try to issue requests directly if the hw queue isn't + * busy in case of 'none' scheduler, and this way may save + * us one extra enqueue & dequeue to sw queue. + */ + if (!hctx->dispatch_busy && !e && !run_queue_async) { + blk_mq_try_issue_list_directly(hctx, list); + if (list_empty(list)) + goto out; + } + blk_mq_insert_requests(hctx, ctx, list); + } + + blk_mq_run_hw_queue(hctx, run_queue_async); + out: + percpu_ref_put(&q->q_usage_counter); +} + +static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set, + struct blk_mq_hw_ctx *hctx, + unsigned int hctx_idx) +{ + unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED; + + if (hctx->sched_tags) { + blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx); + blk_mq_free_rq_map(hctx->sched_tags, flags); + hctx->sched_tags = NULL; + } +} + +static int blk_mq_sched_alloc_tags(struct request_queue *q, + struct blk_mq_hw_ctx *hctx, + unsigned int hctx_idx) +{ + struct blk_mq_tag_set *set = q->tag_set; + /* Clear HCTX_SHARED so tags are init'ed */ + unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED; + int ret; + + hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests, + set->reserved_tags, flags); + if (!hctx->sched_tags) + return -ENOMEM; + + ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests); + if (ret) + blk_mq_sched_free_tags(set, hctx, hctx_idx); + + return ret; +} + +/* called in queue's release handler, tagset has gone away */ +static void blk_mq_sched_tags_teardown(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + int i; + + queue_for_each_hw_ctx(q, hctx, i) { + /* Clear HCTX_SHARED so tags are freed */ + unsigned int flags = hctx->flags & ~BLK_MQ_F_TAG_HCTX_SHARED; + + if (hctx->sched_tags) { + blk_mq_free_rq_map(hctx->sched_tags, flags); + hctx->sched_tags = NULL; + } + } +} + +int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) +{ + struct blk_mq_hw_ctx *hctx; + struct elevator_queue *eq; + unsigned int i; + int ret; + + if (!e) { + q->elevator = NULL; + q->nr_requests = q->tag_set->queue_depth; + return 0; + } + + /* + * Default to double of smaller one between hw queue_depth and 128, + * since we don't split into sync/async like the old code did. + * Additionally, this is a per-hw queue depth. + */ + q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth, + BLKDEV_MAX_RQ); + + queue_for_each_hw_ctx(q, hctx, i) { + ret = blk_mq_sched_alloc_tags(q, hctx, i); + if (ret) + goto err; + } + + ret = e->ops.init_sched(q, e); + if (ret) + goto err; + + blk_mq_debugfs_register_sched(q); + + queue_for_each_hw_ctx(q, hctx, i) { + if (e->ops.init_hctx) { + ret = e->ops.init_hctx(hctx, i); + if (ret) { + eq = q->elevator; + blk_mq_sched_free_requests(q); + blk_mq_exit_sched(q, eq); + kobject_put(&eq->kobj); + return ret; + } + } + blk_mq_debugfs_register_sched_hctx(q, hctx); + } + + return 0; + +err: + blk_mq_sched_free_requests(q); + blk_mq_sched_tags_teardown(q); + q->elevator = NULL; + return ret; +} + +/* + * called in either blk_queue_cleanup or elevator_switch, tagset + * is required for freeing requests + */ +void blk_mq_sched_free_requests(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + int i; + + queue_for_each_hw_ctx(q, hctx, i) { + if (hctx->sched_tags) + blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i); + } +} + +void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e) +{ + struct blk_mq_hw_ctx *hctx; + unsigned int i; + + queue_for_each_hw_ctx(q, hctx, i) { + blk_mq_debugfs_unregister_sched_hctx(hctx); + if (e->type->ops.exit_hctx && hctx->sched_data) { + e->type->ops.exit_hctx(hctx, i); + hctx->sched_data = NULL; + } + } + blk_mq_debugfs_unregister_sched(q); + if (e->type->ops.exit_sched) + e->type->ops.exit_sched(e); + blk_mq_sched_tags_teardown(q); + q->elevator = NULL; +} diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h new file mode 100644 index 000000000..0476360f0 --- /dev/null +++ b/block/blk-mq-sched.h @@ -0,0 +1,85 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef BLK_MQ_SCHED_H +#define BLK_MQ_SCHED_H + +#include "blk-mq.h" +#include "blk-mq-tag.h" + +void blk_mq_sched_assign_ioc(struct request *rq); + +void blk_mq_sched_request_inserted(struct request *rq); +bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, + unsigned int nr_segs, struct request **merged_request); +bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, + unsigned int nr_segs); +bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq); +void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx); +void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); + +void blk_mq_sched_insert_request(struct request *rq, bool at_head, + bool run_queue, bool async); +void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx, + struct list_head *list, bool run_queue_async); + +void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx); + +int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e); +void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e); +void blk_mq_sched_free_requests(struct request_queue *q); + +static inline bool +blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, + unsigned int nr_segs) +{ + if (blk_queue_nomerges(q) || !bio_mergeable(bio)) + return false; + + return __blk_mq_sched_bio_merge(q, bio, nr_segs); +} + +static inline bool +blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq, + struct bio *bio) +{ + struct elevator_queue *e = q->elevator; + + if (e && e->type->ops.allow_merge) + return e->type->ops.allow_merge(q, rq, bio); + + return true; +} + +static inline void blk_mq_sched_completed_request(struct request *rq, u64 now) +{ + struct elevator_queue *e = rq->q->elevator; + + if (e && e->type->ops.completed_request) + e->type->ops.completed_request(rq, now); +} + +static inline void blk_mq_sched_requeue_request(struct request *rq) +{ + struct request_queue *q = rq->q; + struct elevator_queue *e = q->elevator; + + if ((rq->rq_flags & RQF_ELVPRIV) && e && e->type->ops.requeue_request) + e->type->ops.requeue_request(rq); +} + +static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx) +{ + struct elevator_queue *e = hctx->queue->elevator; + + if (e && e->type->ops.has_work) + return e->type->ops.has_work(hctx); + + return false; +} + +static inline bool blk_mq_sched_needs_restart(struct blk_mq_hw_ctx *hctx) +{ + return test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); +} + +#endif diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c new file mode 100644 index 000000000..f0bc3398f --- /dev/null +++ b/block/blk-mq-sysfs.c @@ -0,0 +1,389 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "blk.h" +#include "blk-mq.h" +#include "blk-mq-tag.h" + +static void blk_mq_sysfs_release(struct kobject *kobj) +{ + struct blk_mq_ctxs *ctxs = container_of(kobj, struct blk_mq_ctxs, kobj); + + free_percpu(ctxs->queue_ctx); + kfree(ctxs); +} + +static void blk_mq_ctx_sysfs_release(struct kobject *kobj) +{ + struct blk_mq_ctx *ctx = container_of(kobj, struct blk_mq_ctx, kobj); + + /* ctx->ctxs won't be released until all ctx are freed */ + kobject_put(&ctx->ctxs->kobj); +} + +static void blk_mq_hw_sysfs_release(struct kobject *kobj) +{ + struct blk_mq_hw_ctx *hctx = container_of(kobj, struct blk_mq_hw_ctx, + kobj); + + if (hctx->flags & BLK_MQ_F_BLOCKING) + cleanup_srcu_struct(hctx->srcu); + blk_free_flush_queue(hctx->fq); + sbitmap_free(&hctx->ctx_map); + free_cpumask_var(hctx->cpumask); + kfree(hctx->ctxs); + kfree(hctx); +} + +struct blk_mq_ctx_sysfs_entry { + struct attribute attr; + ssize_t (*show)(struct blk_mq_ctx *, char *); + ssize_t (*store)(struct blk_mq_ctx *, const char *, size_t); +}; + +struct blk_mq_hw_ctx_sysfs_entry { + struct attribute attr; + ssize_t (*show)(struct blk_mq_hw_ctx *, char *); + ssize_t (*store)(struct blk_mq_hw_ctx *, const char *, size_t); +}; + +static ssize_t blk_mq_sysfs_show(struct kobject *kobj, struct attribute *attr, + char *page) +{ + struct blk_mq_ctx_sysfs_entry *entry; + struct blk_mq_ctx *ctx; + struct request_queue *q; + ssize_t res; + + entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr); + ctx = container_of(kobj, struct blk_mq_ctx, kobj); + q = ctx->queue; + + if (!entry->show) + return -EIO; + + mutex_lock(&q->sysfs_lock); + res = entry->show(ctx, page); + mutex_unlock(&q->sysfs_lock); + return res; +} + +static ssize_t blk_mq_sysfs_store(struct kobject *kobj, struct attribute *attr, + const char *page, size_t length) +{ + struct blk_mq_ctx_sysfs_entry *entry; + struct blk_mq_ctx *ctx; + struct request_queue *q; + ssize_t res; + + entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr); + ctx = container_of(kobj, struct blk_mq_ctx, kobj); + q = ctx->queue; + + if (!entry->store) + return -EIO; + + mutex_lock(&q->sysfs_lock); + res = entry->store(ctx, page, length); + mutex_unlock(&q->sysfs_lock); + return res; +} + +static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj, + struct attribute *attr, char *page) +{ + struct blk_mq_hw_ctx_sysfs_entry *entry; + struct blk_mq_hw_ctx *hctx; + struct request_queue *q; + ssize_t res; + + entry = container_of(attr, struct blk_mq_hw_ctx_sysfs_entry, attr); + hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj); + q = hctx->queue; + + if (!entry->show) + return -EIO; + + mutex_lock(&q->sysfs_lock); + res = entry->show(hctx, page); + mutex_unlock(&q->sysfs_lock); + return res; +} + +static ssize_t blk_mq_hw_sysfs_store(struct kobject *kobj, + struct attribute *attr, const char *page, + size_t length) +{ + struct blk_mq_hw_ctx_sysfs_entry *entry; + struct blk_mq_hw_ctx *hctx; + struct request_queue *q; + ssize_t res; + + entry = container_of(attr, struct blk_mq_hw_ctx_sysfs_entry, attr); + hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj); + q = hctx->queue; + + if (!entry->store) + return -EIO; + + mutex_lock(&q->sysfs_lock); + res = entry->store(hctx, page, length); + mutex_unlock(&q->sysfs_lock); + return res; +} + +static ssize_t blk_mq_hw_sysfs_nr_tags_show(struct blk_mq_hw_ctx *hctx, + char *page) +{ + return sprintf(page, "%u\n", hctx->tags->nr_tags); +} + +static ssize_t blk_mq_hw_sysfs_nr_reserved_tags_show(struct blk_mq_hw_ctx *hctx, + char *page) +{ + return sprintf(page, "%u\n", hctx->tags->nr_reserved_tags); +} + +static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page) +{ + const size_t size = PAGE_SIZE - 1; + unsigned int i, first = 1; + int ret = 0, pos = 0; + + for_each_cpu(i, hctx->cpumask) { + if (first) + ret = snprintf(pos + page, size - pos, "%u", i); + else + ret = snprintf(pos + page, size - pos, ", %u", i); + + if (ret >= size - pos) + break; + + first = 0; + pos += ret; + } + + ret = snprintf(pos + page, size + 1 - pos, "\n"); + return pos + ret; +} + +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_tags = { + .attr = {.name = "nr_tags", .mode = 0444 }, + .show = blk_mq_hw_sysfs_nr_tags_show, +}; +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_reserved_tags = { + .attr = {.name = "nr_reserved_tags", .mode = 0444 }, + .show = blk_mq_hw_sysfs_nr_reserved_tags_show, +}; +static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_cpus = { + .attr = {.name = "cpu_list", .mode = 0444 }, + .show = blk_mq_hw_sysfs_cpus_show, +}; + +static struct attribute *default_hw_ctx_attrs[] = { + &blk_mq_hw_sysfs_nr_tags.attr, + &blk_mq_hw_sysfs_nr_reserved_tags.attr, + &blk_mq_hw_sysfs_cpus.attr, + NULL, +}; +ATTRIBUTE_GROUPS(default_hw_ctx); + +static const struct sysfs_ops blk_mq_sysfs_ops = { + .show = blk_mq_sysfs_show, + .store = blk_mq_sysfs_store, +}; + +static const struct sysfs_ops blk_mq_hw_sysfs_ops = { + .show = blk_mq_hw_sysfs_show, + .store = blk_mq_hw_sysfs_store, +}; + +static struct kobj_type blk_mq_ktype = { + .sysfs_ops = &blk_mq_sysfs_ops, + .release = blk_mq_sysfs_release, +}; + +static struct kobj_type blk_mq_ctx_ktype = { + .sysfs_ops = &blk_mq_sysfs_ops, + .release = blk_mq_ctx_sysfs_release, +}; + +static struct kobj_type blk_mq_hw_ktype = { + .sysfs_ops = &blk_mq_hw_sysfs_ops, + .default_groups = default_hw_ctx_groups, + .release = blk_mq_hw_sysfs_release, +}; + +static void blk_mq_unregister_hctx(struct blk_mq_hw_ctx *hctx) +{ + struct blk_mq_ctx *ctx; + int i; + + if (!hctx->nr_ctx) + return; + + hctx_for_each_ctx(hctx, ctx, i) + kobject_del(&ctx->kobj); + + kobject_del(&hctx->kobj); +} + +static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx) +{ + struct request_queue *q = hctx->queue; + struct blk_mq_ctx *ctx; + int i, j, ret; + + if (!hctx->nr_ctx) + return 0; + + ret = kobject_add(&hctx->kobj, q->mq_kobj, "%u", hctx->queue_num); + if (ret) + return ret; + + hctx_for_each_ctx(hctx, ctx, i) { + ret = kobject_add(&ctx->kobj, &hctx->kobj, "cpu%u", ctx->cpu); + if (ret) + goto out; + } + + return 0; +out: + hctx_for_each_ctx(hctx, ctx, j) { + if (j < i) + kobject_del(&ctx->kobj); + } + kobject_del(&hctx->kobj); + return ret; +} + +void blk_mq_unregister_dev(struct device *dev, struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + int i; + + lockdep_assert_held(&q->sysfs_dir_lock); + + queue_for_each_hw_ctx(q, hctx, i) + blk_mq_unregister_hctx(hctx); + + kobject_uevent(q->mq_kobj, KOBJ_REMOVE); + kobject_del(q->mq_kobj); + kobject_put(&dev->kobj); + + q->mq_sysfs_init_done = false; +} + +void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx) +{ + kobject_init(&hctx->kobj, &blk_mq_hw_ktype); +} + +void blk_mq_sysfs_deinit(struct request_queue *q) +{ + struct blk_mq_ctx *ctx; + int cpu; + + for_each_possible_cpu(cpu) { + ctx = per_cpu_ptr(q->queue_ctx, cpu); + kobject_put(&ctx->kobj); + } + kobject_put(q->mq_kobj); +} + +void blk_mq_sysfs_init(struct request_queue *q) +{ + struct blk_mq_ctx *ctx; + int cpu; + + kobject_init(q->mq_kobj, &blk_mq_ktype); + + for_each_possible_cpu(cpu) { + ctx = per_cpu_ptr(q->queue_ctx, cpu); + + kobject_get(q->mq_kobj); + kobject_init(&ctx->kobj, &blk_mq_ctx_ktype); + } +} + +int __blk_mq_register_dev(struct device *dev, struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + int ret, i; + + WARN_ON_ONCE(!q->kobj.parent); + lockdep_assert_held(&q->sysfs_dir_lock); + + ret = kobject_add(q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq"); + if (ret < 0) + goto out; + + kobject_uevent(q->mq_kobj, KOBJ_ADD); + + queue_for_each_hw_ctx(q, hctx, i) { + ret = blk_mq_register_hctx(hctx); + if (ret) + goto unreg; + } + + q->mq_sysfs_init_done = true; + +out: + return ret; + +unreg: + while (--i >= 0) + blk_mq_unregister_hctx(q->queue_hw_ctx[i]); + + kobject_uevent(q->mq_kobj, KOBJ_REMOVE); + kobject_del(q->mq_kobj); + kobject_put(&dev->kobj); + return ret; +} + +void blk_mq_sysfs_unregister(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + int i; + + mutex_lock(&q->sysfs_dir_lock); + if (!q->mq_sysfs_init_done) + goto unlock; + + queue_for_each_hw_ctx(q, hctx, i) + blk_mq_unregister_hctx(hctx); + +unlock: + mutex_unlock(&q->sysfs_dir_lock); +} + +int blk_mq_sysfs_register(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + int i, ret = 0; + + mutex_lock(&q->sysfs_dir_lock); + if (!q->mq_sysfs_init_done) + goto unlock; + + queue_for_each_hw_ctx(q, hctx, i) { + ret = blk_mq_register_hctx(hctx); + if (ret) + break; + } + +unlock: + mutex_unlock(&q->sysfs_dir_lock); + + return ret; +} diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c new file mode 100644 index 000000000..16ad9e656 --- /dev/null +++ b/block/blk-mq-tag.c @@ -0,0 +1,643 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Tag allocation using scalable bitmaps. Uses active queue tracking to support + * fairer distribution of tags between multiple submitters when a shared tag map + * is used. + * + * Copyright (C) 2013-2014 Jens Axboe + */ +#include +#include + +#include +#include +#include "blk.h" +#include "blk-mq.h" +#include "blk-mq-tag.h" + +/* + * If a previously inactive queue goes active, bump the active user count. + * We need to do this before try to allocate driver tag, then even if fail + * to get tag when first time, the other shared-tag users could reserve + * budget for it. + */ +bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) +{ + if (blk_mq_is_sbitmap_shared(hctx->flags)) { + struct request_queue *q = hctx->queue; + struct blk_mq_tag_set *set = q->tag_set; + + if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) && + !test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) + atomic_inc(&set->active_queues_shared_sbitmap); + } else { + if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) && + !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) + atomic_inc(&hctx->tags->active_queues); + } + + return true; +} + +/* + * Wakeup all potentially sleeping on tags + */ +void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve) +{ + sbitmap_queue_wake_all(tags->bitmap_tags); + if (include_reserve) + sbitmap_queue_wake_all(tags->breserved_tags); +} + +/* + * If a previously busy queue goes inactive, potential waiters could now + * be allowed to queue. Wake them up and check. + */ +void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) +{ + struct blk_mq_tags *tags = hctx->tags; + struct request_queue *q = hctx->queue; + struct blk_mq_tag_set *set = q->tag_set; + + if (blk_mq_is_sbitmap_shared(hctx->flags)) { + if (!test_and_clear_bit(QUEUE_FLAG_HCTX_ACTIVE, + &q->queue_flags)) + return; + atomic_dec(&set->active_queues_shared_sbitmap); + } else { + if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) + return; + atomic_dec(&tags->active_queues); + } + + blk_mq_tag_wakeup_all(tags, false); +} + +static int __blk_mq_get_tag(struct blk_mq_alloc_data *data, + struct sbitmap_queue *bt) +{ + if (!data->q->elevator && !(data->flags & BLK_MQ_REQ_RESERVED) && + !hctx_may_queue(data->hctx, bt)) + return BLK_MQ_NO_TAG; + + if (data->shallow_depth) + return __sbitmap_queue_get_shallow(bt, data->shallow_depth); + else + return __sbitmap_queue_get(bt); +} + +unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) +{ + struct blk_mq_tags *tags = blk_mq_tags_from_data(data); + struct sbitmap_queue *bt; + struct sbq_wait_state *ws; + DEFINE_SBQ_WAIT(wait); + unsigned int tag_offset; + int tag; + + if (data->flags & BLK_MQ_REQ_RESERVED) { + if (unlikely(!tags->nr_reserved_tags)) { + WARN_ON_ONCE(1); + return BLK_MQ_NO_TAG; + } + bt = tags->breserved_tags; + tag_offset = 0; + } else { + bt = tags->bitmap_tags; + tag_offset = tags->nr_reserved_tags; + } + + tag = __blk_mq_get_tag(data, bt); + if (tag != BLK_MQ_NO_TAG) + goto found_tag; + + if (data->flags & BLK_MQ_REQ_NOWAIT) + return BLK_MQ_NO_TAG; + + ws = bt_wait_ptr(bt, data->hctx); + do { + struct sbitmap_queue *bt_prev; + + /* + * We're out of tags on this hardware queue, kick any + * pending IO submits before going to sleep waiting for + * some to complete. + */ + blk_mq_run_hw_queue(data->hctx, false); + + /* + * Retry tag allocation after running the hardware queue, + * as running the queue may also have found completions. + */ + tag = __blk_mq_get_tag(data, bt); + if (tag != BLK_MQ_NO_TAG) + break; + + sbitmap_prepare_to_wait(bt, ws, &wait, TASK_UNINTERRUPTIBLE); + + tag = __blk_mq_get_tag(data, bt); + if (tag != BLK_MQ_NO_TAG) + break; + + bt_prev = bt; + io_schedule(); + + sbitmap_finish_wait(bt, ws, &wait); + + data->ctx = blk_mq_get_ctx(data->q); + data->hctx = blk_mq_map_queue(data->q, data->cmd_flags, + data->ctx); + tags = blk_mq_tags_from_data(data); + if (data->flags & BLK_MQ_REQ_RESERVED) + bt = tags->breserved_tags; + else + bt = tags->bitmap_tags; + + /* + * If destination hw queue is changed, fake wake up on + * previous queue for compensating the wake up miss, so + * other allocations on previous queue won't be starved. + */ + if (bt != bt_prev) + sbitmap_queue_wake_up(bt_prev); + + ws = bt_wait_ptr(bt, data->hctx); + } while (1); + + sbitmap_finish_wait(bt, ws, &wait); + +found_tag: + /* + * Give up this allocation if the hctx is inactive. The caller will + * retry on an active hctx. + */ + if (unlikely(test_bit(BLK_MQ_S_INACTIVE, &data->hctx->state))) { + blk_mq_put_tag(tags, data->ctx, tag + tag_offset); + return BLK_MQ_NO_TAG; + } + return tag + tag_offset; +} + +void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, + unsigned int tag) +{ + if (!blk_mq_tag_is_reserved(tags, tag)) { + const int real_tag = tag - tags->nr_reserved_tags; + + BUG_ON(real_tag >= tags->nr_tags); + sbitmap_queue_clear(tags->bitmap_tags, real_tag, ctx->cpu); + } else { + BUG_ON(tag >= tags->nr_reserved_tags); + sbitmap_queue_clear(tags->breserved_tags, tag, ctx->cpu); + } +} + +struct bt_iter_data { + struct blk_mq_hw_ctx *hctx; + busy_iter_fn *fn; + void *data; + bool reserved; +}; + +static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags, + unsigned int bitnr) +{ + struct request *rq; + unsigned long flags; + + spin_lock_irqsave(&tags->lock, flags); + rq = tags->rqs[bitnr]; + if (!rq || rq->tag != bitnr || !refcount_inc_not_zero(&rq->ref)) + rq = NULL; + spin_unlock_irqrestore(&tags->lock, flags); + return rq; +} + +static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) +{ + struct bt_iter_data *iter_data = data; + struct blk_mq_hw_ctx *hctx = iter_data->hctx; + struct blk_mq_tags *tags = hctx->tags; + bool reserved = iter_data->reserved; + struct request *rq; + bool ret = true; + + if (!reserved) + bitnr += tags->nr_reserved_tags; + /* + * We can hit rq == NULL here, because the tagging functions + * test and set the bit before assigning ->rqs[]. + */ + rq = blk_mq_find_and_get_req(tags, bitnr); + if (!rq) + return true; + + if (rq->q == hctx->queue && rq->mq_hctx == hctx) + ret = iter_data->fn(hctx, rq, iter_data->data, reserved); + blk_mq_put_rq_ref(rq); + return ret; +} + +/** + * bt_for_each - iterate over the requests associated with a hardware queue + * @hctx: Hardware queue to examine. + * @bt: sbitmap to examine. This is either the breserved_tags member + * or the bitmap_tags member of struct blk_mq_tags. + * @fn: Pointer to the function that will be called for each request + * associated with @hctx that has been assigned a driver tag. + * @fn will be called as follows: @fn(@hctx, rq, @data, @reserved) + * where rq is a pointer to a request. Return true to continue + * iterating tags, false to stop. + * @data: Will be passed as third argument to @fn. + * @reserved: Indicates whether @bt is the breserved_tags member or the + * bitmap_tags member of struct blk_mq_tags. + */ +static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt, + busy_iter_fn *fn, void *data, bool reserved) +{ + struct bt_iter_data iter_data = { + .hctx = hctx, + .fn = fn, + .data = data, + .reserved = reserved, + }; + + sbitmap_for_each_set(&bt->sb, bt_iter, &iter_data); +} + +struct bt_tags_iter_data { + struct blk_mq_tags *tags; + busy_tag_iter_fn *fn; + void *data; + unsigned int flags; +}; + +#define BT_TAG_ITER_RESERVED (1 << 0) +#define BT_TAG_ITER_STARTED (1 << 1) +#define BT_TAG_ITER_STATIC_RQS (1 << 2) + +static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) +{ + struct bt_tags_iter_data *iter_data = data; + struct blk_mq_tags *tags = iter_data->tags; + bool reserved = iter_data->flags & BT_TAG_ITER_RESERVED; + struct request *rq; + bool ret = true; + bool iter_static_rqs = !!(iter_data->flags & BT_TAG_ITER_STATIC_RQS); + + if (!reserved) + bitnr += tags->nr_reserved_tags; + + /* + * We can hit rq == NULL here, because the tagging functions + * test and set the bit before assigning ->rqs[]. + */ + if (iter_static_rqs) + rq = tags->static_rqs[bitnr]; + else + rq = blk_mq_find_and_get_req(tags, bitnr); + if (!rq) + return true; + + if (!(iter_data->flags & BT_TAG_ITER_STARTED) || + blk_mq_request_started(rq)) + ret = iter_data->fn(rq, iter_data->data, reserved); + if (!iter_static_rqs) + blk_mq_put_rq_ref(rq); + return ret; +} + +/** + * bt_tags_for_each - iterate over the requests in a tag map + * @tags: Tag map to iterate over. + * @bt: sbitmap to examine. This is either the breserved_tags member + * or the bitmap_tags member of struct blk_mq_tags. + * @fn: Pointer to the function that will be called for each started + * request. @fn will be called as follows: @fn(rq, @data, + * @reserved) where rq is a pointer to a request. Return true + * to continue iterating tags, false to stop. + * @data: Will be passed as second argument to @fn. + * @flags: BT_TAG_ITER_* + */ +static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt, + busy_tag_iter_fn *fn, void *data, unsigned int flags) +{ + struct bt_tags_iter_data iter_data = { + .tags = tags, + .fn = fn, + .data = data, + .flags = flags, + }; + + if (tags->rqs) + sbitmap_for_each_set(&bt->sb, bt_tags_iter, &iter_data); +} + +static void __blk_mq_all_tag_iter(struct blk_mq_tags *tags, + busy_tag_iter_fn *fn, void *priv, unsigned int flags) +{ + WARN_ON_ONCE(flags & BT_TAG_ITER_RESERVED); + + if (tags->nr_reserved_tags) + bt_tags_for_each(tags, tags->breserved_tags, fn, priv, + flags | BT_TAG_ITER_RESERVED); + bt_tags_for_each(tags, tags->bitmap_tags, fn, priv, flags); +} + +/** + * blk_mq_all_tag_iter - iterate over all requests in a tag map + * @tags: Tag map to iterate over. + * @fn: Pointer to the function that will be called for each + * request. @fn will be called as follows: @fn(rq, @priv, + * reserved) where rq is a pointer to a request. 'reserved' + * indicates whether or not @rq is a reserved request. Return + * true to continue iterating tags, false to stop. + * @priv: Will be passed as second argument to @fn. + * + * Caller has to pass the tag map from which requests are allocated. + */ +void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, + void *priv) +{ + __blk_mq_all_tag_iter(tags, fn, priv, BT_TAG_ITER_STATIC_RQS); +} + +/** + * blk_mq_tagset_busy_iter - iterate over all started requests in a tag set + * @tagset: Tag set to iterate over. + * @fn: Pointer to the function that will be called for each started + * request. @fn will be called as follows: @fn(rq, @priv, + * reserved) where rq is a pointer to a request. 'reserved' + * indicates whether or not @rq is a reserved request. Return + * true to continue iterating tags, false to stop. + * @priv: Will be passed as second argument to @fn. + * + * We grab one request reference before calling @fn and release it after + * @fn returns. + */ +void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, + busy_tag_iter_fn *fn, void *priv) +{ + int i; + + for (i = 0; i < tagset->nr_hw_queues; i++) { + if (tagset->tags && tagset->tags[i]) + __blk_mq_all_tag_iter(tagset->tags[i], fn, priv, + BT_TAG_ITER_STARTED); + } +} +EXPORT_SYMBOL(blk_mq_tagset_busy_iter); + +static bool blk_mq_tagset_count_completed_rqs(struct request *rq, + void *data, bool reserved) +{ + unsigned *count = data; + + if (blk_mq_request_completed(rq)) + (*count)++; + return true; +} + +/** + * blk_mq_tagset_wait_completed_request - wait until all completed req's + * complete funtion is run + * @tagset: Tag set to drain completed request + * + * Note: This function has to be run after all IO queues are shutdown + */ +void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset) +{ + while (true) { + unsigned count = 0; + + blk_mq_tagset_busy_iter(tagset, + blk_mq_tagset_count_completed_rqs, &count); + if (!count) + break; + msleep(5); + } +} +EXPORT_SYMBOL(blk_mq_tagset_wait_completed_request); + +/** + * blk_mq_queue_tag_busy_iter - iterate over all requests with a driver tag + * @q: Request queue to examine. + * @fn: Pointer to the function that will be called for each request + * on @q. @fn will be called as follows: @fn(hctx, rq, @priv, + * reserved) where rq is a pointer to a request and hctx points + * to the hardware queue associated with the request. 'reserved' + * indicates whether or not @rq is a reserved request. + * @priv: Will be passed as third argument to @fn. + * + * Note: if @q->tag_set is shared with other request queues then @fn will be + * called for all requests on all queues that share that tag set and not only + * for requests associated with @q. + */ +void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, + void *priv) +{ + struct blk_mq_hw_ctx *hctx; + int i; + + /* + * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and queue_hw_ctx + * while the queue is frozen. So we can use q_usage_counter to avoid + * racing with it. + */ + if (!percpu_ref_tryget(&q->q_usage_counter)) + return; + + queue_for_each_hw_ctx(q, hctx, i) { + struct blk_mq_tags *tags = hctx->tags; + + /* + * If no software queues are currently mapped to this + * hardware queue, there's nothing to check + */ + if (!blk_mq_hw_queue_mapped(hctx)) + continue; + + if (tags->nr_reserved_tags) + bt_for_each(hctx, tags->breserved_tags, fn, priv, true); + bt_for_each(hctx, tags->bitmap_tags, fn, priv, false); + } + blk_queue_exit(q); +} + +static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth, + bool round_robin, int node) +{ + return sbitmap_queue_init_node(bt, depth, -1, round_robin, GFP_KERNEL, + node); +} + +static int blk_mq_init_bitmap_tags(struct blk_mq_tags *tags, + int node, int alloc_policy) +{ + unsigned int depth = tags->nr_tags - tags->nr_reserved_tags; + bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR; + + if (bt_alloc(&tags->__bitmap_tags, depth, round_robin, node)) + return -ENOMEM; + if (bt_alloc(&tags->__breserved_tags, tags->nr_reserved_tags, + round_robin, node)) + goto free_bitmap_tags; + + tags->bitmap_tags = &tags->__bitmap_tags; + tags->breserved_tags = &tags->__breserved_tags; + + return 0; +free_bitmap_tags: + sbitmap_queue_free(&tags->__bitmap_tags); + return -ENOMEM; +} + +int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int flags) +{ + unsigned int depth = set->queue_depth - set->reserved_tags; + int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags); + bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR; + int i, node = set->numa_node; + + if (bt_alloc(&set->__bitmap_tags, depth, round_robin, node)) + return -ENOMEM; + if (bt_alloc(&set->__breserved_tags, set->reserved_tags, + round_robin, node)) + goto free_bitmap_tags; + + for (i = 0; i < set->nr_hw_queues; i++) { + struct blk_mq_tags *tags = set->tags[i]; + + tags->bitmap_tags = &set->__bitmap_tags; + tags->breserved_tags = &set->__breserved_tags; + } + + return 0; +free_bitmap_tags: + sbitmap_queue_free(&set->__bitmap_tags); + return -ENOMEM; +} + +void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set) +{ + sbitmap_queue_free(&set->__bitmap_tags); + sbitmap_queue_free(&set->__breserved_tags); +} + +struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, + unsigned int reserved_tags, + int node, unsigned int flags) +{ + int alloc_policy = BLK_MQ_FLAG_TO_ALLOC_POLICY(flags); + struct blk_mq_tags *tags; + + if (total_tags > BLK_MQ_TAG_MAX) { + pr_err("blk-mq: tag depth too large\n"); + return NULL; + } + + tags = kzalloc_node(sizeof(*tags), GFP_KERNEL, node); + if (!tags) + return NULL; + + tags->nr_tags = total_tags; + tags->nr_reserved_tags = reserved_tags; + spin_lock_init(&tags->lock); + + if (flags & BLK_MQ_F_TAG_HCTX_SHARED) + return tags; + + if (blk_mq_init_bitmap_tags(tags, node, alloc_policy) < 0) { + kfree(tags); + return NULL; + } + return tags; +} + +void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags) +{ + if (!(flags & BLK_MQ_F_TAG_HCTX_SHARED)) { + sbitmap_queue_free(tags->bitmap_tags); + sbitmap_queue_free(tags->breserved_tags); + } + kfree(tags); +} + +int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, + struct blk_mq_tags **tagsptr, unsigned int tdepth, + bool can_grow) +{ + struct blk_mq_tags *tags = *tagsptr; + + if (tdepth <= tags->nr_reserved_tags) + return -EINVAL; + + /* + * If we are allowed to grow beyond the original size, allocate + * a new set of tags before freeing the old one. + */ + if (tdepth > tags->nr_tags) { + struct blk_mq_tag_set *set = hctx->queue->tag_set; + /* Only sched tags can grow, so clear HCTX_SHARED flag */ + unsigned int flags = set->flags & ~BLK_MQ_F_TAG_HCTX_SHARED; + struct blk_mq_tags *new; + bool ret; + + if (!can_grow) + return -EINVAL; + + /* + * We need some sort of upper limit, set it high enough that + * no valid use cases should require more. + */ + if (tdepth > 16 * BLKDEV_MAX_RQ) + return -EINVAL; + + new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth, + tags->nr_reserved_tags, flags); + if (!new) + return -ENOMEM; + ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth); + if (ret) { + blk_mq_free_rq_map(new, flags); + return -ENOMEM; + } + + blk_mq_free_rqs(set, *tagsptr, hctx->queue_num); + blk_mq_free_rq_map(*tagsptr, flags); + *tagsptr = new; + } else { + /* + * Don't need (or can't) update reserved tags here, they + * remain static and should never need resizing. + */ + sbitmap_queue_resize(tags->bitmap_tags, + tdepth - tags->nr_reserved_tags); + } + + return 0; +} + +void blk_mq_tag_resize_shared_sbitmap(struct blk_mq_tag_set *set, unsigned int size) +{ + sbitmap_queue_resize(&set->__bitmap_tags, size - set->reserved_tags); +} + +/** + * blk_mq_unique_tag() - return a tag that is unique queue-wide + * @rq: request for which to compute a unique tag + * + * The tag field in struct request is unique per hardware queue but not over + * all hardware queues. Hence this function that returns a tag with the + * hardware context index in the upper bits and the per hardware queue tag in + * the lower bits. + * + * Note: When called for a request that is queued on a non-multiqueue request + * queue, the hardware context index is set to zero. + */ +u32 blk_mq_unique_tag(struct request *rq) +{ + return (rq->mq_hctx->queue_num << BLK_MQ_UNIQUE_TAG_BITS) | + (rq->tag & BLK_MQ_UNIQUE_TAG_MASK); +} +EXPORT_SYMBOL(blk_mq_unique_tag); diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h new file mode 100644 index 000000000..f887988e5 --- /dev/null +++ b/block/blk-mq-tag.h @@ -0,0 +1,94 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef INT_BLK_MQ_TAG_H +#define INT_BLK_MQ_TAG_H + +/* + * Tag address space map. + */ +struct blk_mq_tags { + unsigned int nr_tags; + unsigned int nr_reserved_tags; + + atomic_t active_queues; + + struct sbitmap_queue *bitmap_tags; + struct sbitmap_queue *breserved_tags; + + struct sbitmap_queue __bitmap_tags; + struct sbitmap_queue __breserved_tags; + + struct request **rqs; + struct request **static_rqs; + struct list_head page_list; + + /* + * used to clear request reference in rqs[] before freeing one + * request pool + */ + spinlock_t lock; +}; + +extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, + unsigned int reserved_tags, + int node, unsigned int flags); +extern void blk_mq_free_tags(struct blk_mq_tags *tags, unsigned int flags); + +extern int blk_mq_init_shared_sbitmap(struct blk_mq_tag_set *set, + unsigned int flags); +extern void blk_mq_exit_shared_sbitmap(struct blk_mq_tag_set *set); + +extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); +extern void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, + unsigned int tag); +extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, + struct blk_mq_tags **tags, + unsigned int depth, bool can_grow); +extern void blk_mq_tag_resize_shared_sbitmap(struct blk_mq_tag_set *set, + unsigned int size); + +extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool); +void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, + void *priv); +void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, + void *priv); + +static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt, + struct blk_mq_hw_ctx *hctx) +{ + if (!hctx) + return &bt->ws[0]; + return sbq_wait_ptr(bt, &hctx->wait_index); +} + +enum { + BLK_MQ_NO_TAG = -1U, + BLK_MQ_TAG_MIN = 1, + BLK_MQ_TAG_MAX = BLK_MQ_NO_TAG - 1, +}; + +extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *); +extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *); + +static inline bool blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) +{ + if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) + return false; + + return __blk_mq_tag_busy(hctx); +} + +static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) +{ + if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) + return; + + __blk_mq_tag_idle(hctx); +} + +static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags, + unsigned int tag) +{ + return tag < tags->nr_reserved_tags; +} + +#endif diff --git a/block/blk-mq-virtio.c b/block/blk-mq-virtio.c new file mode 100644 index 000000000..7b8a42c35 --- /dev/null +++ b/block/blk-mq-virtio.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2016 Christoph Hellwig. + */ +#include +#include +#include +#include +#include +#include "blk-mq.h" + +/** + * blk_mq_virtio_map_queues - provide a default queue mapping for virtio device + * @qmap: CPU to hardware queue map. + * @vdev: virtio device to provide a mapping for. + * @first_vec: first interrupt vectors to use for queues (usually 0) + * + * This function assumes the virtio device @vdev has at least as many available + * interrupt vectors as @set has queues. It will then query the vector + * corresponding to each queue for it's affinity mask and built queue mapping + * that maps a queue to the CPUs that have irq affinity for the corresponding + * vector. + */ +int blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap, + struct virtio_device *vdev, int first_vec) +{ + const struct cpumask *mask; + unsigned int queue, cpu; + + if (!vdev->config->get_vq_affinity) + goto fallback; + + for (queue = 0; queue < qmap->nr_queues; queue++) { + mask = vdev->config->get_vq_affinity(vdev, first_vec + queue); + if (!mask) + goto fallback; + + for_each_cpu(cpu, mask) + qmap->mq_map[cpu] = qmap->queue_offset + queue; + } + + return 0; +fallback: + return blk_mq_map_queues(qmap); +} +EXPORT_SYMBOL_GPL(blk_mq_virtio_map_queues); diff --git a/block/blk-mq.c b/block/blk-mq.c new file mode 100644 index 000000000..e153a36c9 --- /dev/null +++ b/block/blk-mq.c @@ -0,0 +1,4000 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Block multiqueue core code + * + * Copyright (C) 2013-2014 Jens Axboe + * Copyright (C) 2013-2014 Christoph Hellwig + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include "blk.h" +#include "blk-mq.h" +#include "blk-mq-debugfs.h" +#include "blk-mq-tag.h" +#include "blk-pm.h" +#include "blk-stat.h" +#include "blk-mq-sched.h" +#include "blk-rq-qos.h" + +static DEFINE_PER_CPU(struct list_head, blk_cpu_done); + +static void blk_mq_poll_stats_start(struct request_queue *q); +static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb); + +static int blk_mq_poll_stats_bkt(const struct request *rq) +{ + int ddir, sectors, bucket; + + ddir = rq_data_dir(rq); + sectors = blk_rq_stats_sectors(rq); + + bucket = ddir + 2 * ilog2(sectors); + + if (bucket < 0) + return -1; + else if (bucket >= BLK_MQ_POLL_STATS_BKTS) + return ddir + BLK_MQ_POLL_STATS_BKTS - 2; + + return bucket; +} + +/* + * Check if any of the ctx, dispatch list or elevator + * have pending work in this hardware queue. + */ +static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) +{ + return !list_empty_careful(&hctx->dispatch) || + sbitmap_any_bit_set(&hctx->ctx_map) || + blk_mq_sched_has_work(hctx); +} + +/* + * Mark this ctx as having pending work in this hardware queue + */ +static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx) +{ + const int bit = ctx->index_hw[hctx->type]; + + if (!sbitmap_test_bit(&hctx->ctx_map, bit)) + sbitmap_set_bit(&hctx->ctx_map, bit); +} + +static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx) +{ + const int bit = ctx->index_hw[hctx->type]; + + sbitmap_clear_bit(&hctx->ctx_map, bit); +} + +struct mq_inflight { + struct hd_struct *part; + unsigned int inflight[2]; +}; + +static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, + struct request *rq, void *priv, + bool reserved) +{ + struct mq_inflight *mi = priv; + + if ((!mi->part->partno || rq->part == mi->part) && + blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) + mi->inflight[rq_data_dir(rq)]++; + + return true; +} + +unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part) +{ + struct mq_inflight mi = { .part = part }; + + blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi); + + return mi.inflight[0] + mi.inflight[1]; +} + +void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, + unsigned int inflight[2]) +{ + struct mq_inflight mi = { .part = part }; + + blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi); + inflight[0] = mi.inflight[0]; + inflight[1] = mi.inflight[1]; +} + +void blk_freeze_queue_start(struct request_queue *q) +{ + mutex_lock(&q->mq_freeze_lock); + if (++q->mq_freeze_depth == 1) { + percpu_ref_kill(&q->q_usage_counter); + mutex_unlock(&q->mq_freeze_lock); + if (queue_is_mq(q)) + blk_mq_run_hw_queues(q, false); + } else { + mutex_unlock(&q->mq_freeze_lock); + } +} +EXPORT_SYMBOL_GPL(blk_freeze_queue_start); + +void blk_mq_freeze_queue_wait(struct request_queue *q) +{ + wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter)); +} +EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait); + +int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, + unsigned long timeout) +{ + return wait_event_timeout(q->mq_freeze_wq, + percpu_ref_is_zero(&q->q_usage_counter), + timeout); +} +EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout); + +/* + * Guarantee no request is in use, so we can change any data structure of + * the queue afterward. + */ +void blk_freeze_queue(struct request_queue *q) +{ + /* + * In the !blk_mq case we are only calling this to kill the + * q_usage_counter, otherwise this increases the freeze depth + * and waits for it to return to zero. For this reason there is + * no blk_unfreeze_queue(), and blk_freeze_queue() is not + * exported to drivers as the only user for unfreeze is blk_mq. + */ + blk_freeze_queue_start(q); + blk_mq_freeze_queue_wait(q); +} + +void blk_mq_freeze_queue(struct request_queue *q) +{ + /* + * ...just an alias to keep freeze and unfreeze actions balanced + * in the blk_mq_* namespace + */ + blk_freeze_queue(q); +} +EXPORT_SYMBOL_GPL(blk_mq_freeze_queue); + +void blk_mq_unfreeze_queue(struct request_queue *q) +{ + mutex_lock(&q->mq_freeze_lock); + q->mq_freeze_depth--; + WARN_ON_ONCE(q->mq_freeze_depth < 0); + if (!q->mq_freeze_depth) { + percpu_ref_resurrect(&q->q_usage_counter); + wake_up_all(&q->mq_freeze_wq); + } + mutex_unlock(&q->mq_freeze_lock); +} +EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue); + +/* + * FIXME: replace the scsi_internal_device_*block_nowait() calls in the + * mpt3sas driver such that this function can be removed. + */ +void blk_mq_quiesce_queue_nowait(struct request_queue *q) +{ + blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q); +} +EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait); + +/** + * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished + * @q: request queue. + * + * Note: this function does not prevent that the struct request end_io() + * callback function is invoked. Once this function is returned, we make + * sure no dispatch can happen until the queue is unquiesced via + * blk_mq_unquiesce_queue(). + */ +void blk_mq_quiesce_queue(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + unsigned int i; + bool rcu = false; + + blk_mq_quiesce_queue_nowait(q); + + queue_for_each_hw_ctx(q, hctx, i) { + if (hctx->flags & BLK_MQ_F_BLOCKING) + synchronize_srcu(hctx->srcu); + else + rcu = true; + } + if (rcu) + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue); + +/* + * blk_mq_unquiesce_queue() - counterpart of blk_mq_quiesce_queue() + * @q: request queue. + * + * This function recovers queue into the state before quiescing + * which is done by blk_mq_quiesce_queue. + */ +void blk_mq_unquiesce_queue(struct request_queue *q) +{ + blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q); + + /* dispatch requests which are inserted during quiescing */ + blk_mq_run_hw_queues(q, true); +} +EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue); + +void blk_mq_wake_waiters(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + unsigned int i; + + queue_for_each_hw_ctx(q, hctx, i) + if (blk_mq_hw_queue_mapped(hctx)) + blk_mq_tag_wakeup_all(hctx->tags, true); +} + +/* + * Only need start/end time stamping if we have iostat or + * blk stats enabled, or using an IO scheduler. + */ +static inline bool blk_mq_need_time_stamp(struct request *rq) +{ + return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS)) || rq->q->elevator; +} + +static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, + unsigned int tag, u64 alloc_time_ns) +{ + struct blk_mq_tags *tags = blk_mq_tags_from_data(data); + struct request *rq = tags->static_rqs[tag]; + + if (data->q->elevator) { + rq->tag = BLK_MQ_NO_TAG; + rq->internal_tag = tag; + } else { + rq->tag = tag; + rq->internal_tag = BLK_MQ_NO_TAG; + } + + /* csd/requeue_work/fifo_time is initialized before use */ + rq->q = data->q; + rq->mq_ctx = data->ctx; + rq->mq_hctx = data->hctx; + rq->rq_flags = 0; + rq->cmd_flags = data->cmd_flags; + if (data->flags & BLK_MQ_REQ_PM) + rq->rq_flags |= RQF_PM; + if (blk_queue_io_stat(data->q)) + rq->rq_flags |= RQF_IO_STAT; + INIT_LIST_HEAD(&rq->queuelist); + INIT_HLIST_NODE(&rq->hash); + RB_CLEAR_NODE(&rq->rb_node); + rq->rq_disk = NULL; + rq->part = NULL; +#ifdef CONFIG_BLK_RQ_ALLOC_TIME + rq->alloc_time_ns = alloc_time_ns; +#endif + if (blk_mq_need_time_stamp(rq)) + rq->start_time_ns = ktime_get_ns(); + else + rq->start_time_ns = 0; + rq->io_start_time_ns = 0; + rq->stats_sectors = 0; + rq->nr_phys_segments = 0; +#if defined(CONFIG_BLK_DEV_INTEGRITY) + rq->nr_integrity_segments = 0; +#endif + blk_crypto_rq_set_defaults(rq); + /* tag was already set */ + WRITE_ONCE(rq->deadline, 0); + + rq->timeout = 0; + + rq->end_io = NULL; + rq->end_io_data = NULL; + + data->ctx->rq_dispatched[op_is_sync(data->cmd_flags)]++; + refcount_set(&rq->ref, 1); + + if (!op_is_flush(data->cmd_flags)) { + struct elevator_queue *e = data->q->elevator; + + rq->elv.icq = NULL; + if (e && e->type->ops.prepare_request) { + if (e->type->icq_cache) + blk_mq_sched_assign_ioc(rq); + + e->type->ops.prepare_request(rq); + rq->rq_flags |= RQF_ELVPRIV; + } + } + + data->hctx->queued++; + return rq; +} + +static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data) +{ + struct request_queue *q = data->q; + struct elevator_queue *e = q->elevator; + u64 alloc_time_ns = 0; + unsigned int tag; + + /* alloc_time includes depth and tag waits */ + if (blk_queue_rq_alloc_time(q)) + alloc_time_ns = ktime_get_ns(); + + if (data->cmd_flags & REQ_NOWAIT) + data->flags |= BLK_MQ_REQ_NOWAIT; + + if (e) { + /* + * Flush requests are special and go directly to the + * dispatch list. Don't include reserved tags in the + * limiting, as it isn't useful. + */ + if (!op_is_flush(data->cmd_flags) && + e->type->ops.limit_depth && + !(data->flags & BLK_MQ_REQ_RESERVED)) + e->type->ops.limit_depth(data->cmd_flags, data); + } + +retry: + data->ctx = blk_mq_get_ctx(q); + data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx); + if (!e) + blk_mq_tag_busy(data->hctx); + + /* + * Waiting allocations only fail because of an inactive hctx. In that + * case just retry the hctx assignment and tag allocation as CPU hotplug + * should have migrated us to an online CPU by now. + */ + tag = blk_mq_get_tag(data); + if (tag == BLK_MQ_NO_TAG) { + if (data->flags & BLK_MQ_REQ_NOWAIT) + return NULL; + + /* + * Give up the CPU and sleep for a random short time to ensure + * that thread using a realtime scheduling class are migrated + * off the CPU, and thus off the hctx that is going away. + */ + msleep(3); + goto retry; + } + return blk_mq_rq_ctx_init(data, tag, alloc_time_ns); +} + +struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, + blk_mq_req_flags_t flags) +{ + struct blk_mq_alloc_data data = { + .q = q, + .flags = flags, + .cmd_flags = op, + }; + struct request *rq; + int ret; + + ret = blk_queue_enter(q, flags); + if (ret) + return ERR_PTR(ret); + + rq = __blk_mq_alloc_request(&data); + if (!rq) + goto out_queue_exit; + rq->__data_len = 0; + rq->__sector = (sector_t) -1; + rq->bio = rq->biotail = NULL; + return rq; +out_queue_exit: + blk_queue_exit(q); + return ERR_PTR(-EWOULDBLOCK); +} +EXPORT_SYMBOL(blk_mq_alloc_request); + +struct request *blk_mq_alloc_request_hctx(struct request_queue *q, + unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx) +{ + struct blk_mq_alloc_data data = { + .q = q, + .flags = flags, + .cmd_flags = op, + }; + u64 alloc_time_ns = 0; + unsigned int cpu; + unsigned int tag; + int ret; + + /* alloc_time includes depth and tag waits */ + if (blk_queue_rq_alloc_time(q)) + alloc_time_ns = ktime_get_ns(); + + /* + * If the tag allocator sleeps we could get an allocation for a + * different hardware context. No need to complicate the low level + * allocator for this for the rare use case of a command tied to + * a specific queue. + */ + if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)) || + WARN_ON_ONCE(!(flags & BLK_MQ_REQ_RESERVED))) + return ERR_PTR(-EINVAL); + + if (hctx_idx >= q->nr_hw_queues) + return ERR_PTR(-EIO); + + ret = blk_queue_enter(q, flags); + if (ret) + return ERR_PTR(ret); + + /* + * Check if the hardware context is actually mapped to anything. + * If not tell the caller that it should skip this queue. + */ + ret = -EXDEV; + data.hctx = q->queue_hw_ctx[hctx_idx]; + if (!blk_mq_hw_queue_mapped(data.hctx)) + goto out_queue_exit; + cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask); + if (cpu >= nr_cpu_ids) + goto out_queue_exit; + data.ctx = __blk_mq_get_ctx(q, cpu); + + if (!q->elevator) + blk_mq_tag_busy(data.hctx); + + ret = -EWOULDBLOCK; + tag = blk_mq_get_tag(&data); + if (tag == BLK_MQ_NO_TAG) + goto out_queue_exit; + return blk_mq_rq_ctx_init(&data, tag, alloc_time_ns); + +out_queue_exit: + blk_queue_exit(q); + return ERR_PTR(ret); +} +EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx); + +static void __blk_mq_free_request(struct request *rq) +{ + struct request_queue *q = rq->q; + struct blk_mq_ctx *ctx = rq->mq_ctx; + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; + const int sched_tag = rq->internal_tag; + + blk_crypto_free_request(rq); + blk_pm_mark_last_busy(rq); + rq->mq_hctx = NULL; + if (rq->tag != BLK_MQ_NO_TAG) + blk_mq_put_tag(hctx->tags, ctx, rq->tag); + if (sched_tag != BLK_MQ_NO_TAG) + blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag); + blk_mq_sched_restart(hctx); + blk_queue_exit(q); +} + +void blk_mq_free_request(struct request *rq) +{ + struct request_queue *q = rq->q; + struct elevator_queue *e = q->elevator; + struct blk_mq_ctx *ctx = rq->mq_ctx; + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; + + if (rq->rq_flags & RQF_ELVPRIV) { + if (e && e->type->ops.finish_request) + e->type->ops.finish_request(rq); + if (rq->elv.icq) { + put_io_context(rq->elv.icq->ioc); + rq->elv.icq = NULL; + } + } + + ctx->rq_completed[rq_is_sync(rq)]++; + if (rq->rq_flags & RQF_MQ_INFLIGHT) + __blk_mq_dec_active_requests(hctx); + + if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq))) + laptop_io_completion(q->backing_dev_info); + + rq_qos_done(q, rq); + + WRITE_ONCE(rq->state, MQ_RQ_IDLE); + if (refcount_dec_and_test(&rq->ref)) + __blk_mq_free_request(rq); +} +EXPORT_SYMBOL_GPL(blk_mq_free_request); + +inline void __blk_mq_end_request(struct request *rq, blk_status_t error) +{ + u64 now = 0; + + if (blk_mq_need_time_stamp(rq)) + now = ktime_get_ns(); + + if (rq->rq_flags & RQF_STATS) { + blk_mq_poll_stats_start(rq->q); + blk_stat_add(rq, now); + } + + blk_mq_sched_completed_request(rq, now); + + blk_account_io_done(rq, now); + + if (rq->end_io) { + rq_qos_done(rq->q, rq); + rq->end_io(rq, error); + } else { + blk_mq_free_request(rq); + } +} +EXPORT_SYMBOL(__blk_mq_end_request); + +void blk_mq_end_request(struct request *rq, blk_status_t error) +{ + if (blk_update_request(rq, error, blk_rq_bytes(rq))) + BUG(); + __blk_mq_end_request(rq, error); +} +EXPORT_SYMBOL(blk_mq_end_request); + +/* + * Softirq action handler - move entries to local list and loop over them + * while passing them to the queue registered handler. + */ +static __latent_entropy void blk_done_softirq(struct softirq_action *h) +{ + struct list_head *cpu_list, local_list; + + local_irq_disable(); + cpu_list = this_cpu_ptr(&blk_cpu_done); + list_replace_init(cpu_list, &local_list); + local_irq_enable(); + + while (!list_empty(&local_list)) { + struct request *rq; + + rq = list_entry(local_list.next, struct request, ipi_list); + list_del_init(&rq->ipi_list); + rq->q->mq_ops->complete(rq); + } +} + +static void blk_mq_trigger_softirq(struct request *rq) +{ + struct list_head *list; + unsigned long flags; + + local_irq_save(flags); + list = this_cpu_ptr(&blk_cpu_done); + list_add_tail(&rq->ipi_list, list); + + /* + * If the list only contains our just added request, signal a raise of + * the softirq. If there are already entries there, someone already + * raised the irq but it hasn't run yet. + */ + if (list->next == &rq->ipi_list) + raise_softirq_irqoff(BLOCK_SOFTIRQ); + local_irq_restore(flags); +} + +static int blk_softirq_cpu_dead(unsigned int cpu) +{ + /* + * If a CPU goes away, splice its entries to the current CPU + * and trigger a run of the softirq + */ + local_irq_disable(); + list_splice_init(&per_cpu(blk_cpu_done, cpu), + this_cpu_ptr(&blk_cpu_done)); + raise_softirq_irqoff(BLOCK_SOFTIRQ); + local_irq_enable(); + + return 0; +} + + +static void __blk_mq_complete_request_remote(void *data) +{ + struct request *rq = data; + + /* + * For most of single queue controllers, there is only one irq vector + * for handling I/O completion, and the only irq's affinity is set + * to all possible CPUs. On most of ARCHs, this affinity means the irq + * is handled on one specific CPU. + * + * So complete I/O requests in softirq context in case of single queue + * devices to avoid degrading I/O performance due to irqsoff latency. + */ + if (rq->q->nr_hw_queues == 1) + blk_mq_trigger_softirq(rq); + else + rq->q->mq_ops->complete(rq); +} + +static inline bool blk_mq_complete_need_ipi(struct request *rq) +{ + int cpu = raw_smp_processor_id(); + + if (!IS_ENABLED(CONFIG_SMP) || + !test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) + return false; + + /* same CPU or cache domain? Complete locally */ + if (cpu == rq->mq_ctx->cpu || + (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) && + cpus_share_cache(cpu, rq->mq_ctx->cpu))) + return false; + + /* don't try to IPI to an offline CPU */ + return cpu_online(rq->mq_ctx->cpu); +} + +bool blk_mq_complete_request_remote(struct request *rq) +{ + WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); + + /* + * For a polled request, always complete locallly, it's pointless + * to redirect the completion. + */ + if (rq->cmd_flags & REQ_HIPRI) + return false; + + if (blk_mq_complete_need_ipi(rq)) { + rq->csd.func = __blk_mq_complete_request_remote; + rq->csd.info = rq; + rq->csd.flags = 0; + smp_call_function_single_async(rq->mq_ctx->cpu, &rq->csd); + } else { + if (rq->q->nr_hw_queues > 1) + return false; + blk_mq_trigger_softirq(rq); + } + + return true; +} +EXPORT_SYMBOL_GPL(blk_mq_complete_request_remote); + +/** + * blk_mq_complete_request - end I/O on a request + * @rq: the request being processed + * + * Description: + * Complete a request by scheduling the ->complete_rq operation. + **/ +void blk_mq_complete_request(struct request *rq) +{ + if (!blk_mq_complete_request_remote(rq)) + rq->q->mq_ops->complete(rq); +} +EXPORT_SYMBOL(blk_mq_complete_request); + +static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx) + __releases(hctx->srcu) +{ + if (!(hctx->flags & BLK_MQ_F_BLOCKING)) + rcu_read_unlock(); + else + srcu_read_unlock(hctx->srcu, srcu_idx); +} + +static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx) + __acquires(hctx->srcu) +{ + if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { + /* shut up gcc false positive */ + *srcu_idx = 0; + rcu_read_lock(); + } else + *srcu_idx = srcu_read_lock(hctx->srcu); +} + +/** + * blk_mq_start_request - Start processing a request + * @rq: Pointer to request to be started + * + * Function used by device drivers to notify the block layer that a request + * is going to be processed now, so blk layer can do proper initializations + * such as starting the timeout timer. + */ +void blk_mq_start_request(struct request *rq) +{ + struct request_queue *q = rq->q; + + trace_block_rq_issue(rq); + + if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { + rq->io_start_time_ns = ktime_get_ns(); + rq->stats_sectors = blk_rq_sectors(rq); + rq->rq_flags |= RQF_STATS; + rq_qos_issue(q, rq); + } + + WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE); + + blk_add_timer(rq); + WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT); + +#ifdef CONFIG_BLK_DEV_INTEGRITY + if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE) + q->integrity.profile->prepare_fn(rq); +#endif +} +EXPORT_SYMBOL(blk_mq_start_request); + +static void __blk_mq_requeue_request(struct request *rq) +{ + struct request_queue *q = rq->q; + + blk_mq_put_driver_tag(rq); + + trace_block_rq_requeue(rq); + rq_qos_requeue(q, rq); + + if (blk_mq_request_started(rq)) { + WRITE_ONCE(rq->state, MQ_RQ_IDLE); + rq->rq_flags &= ~RQF_TIMED_OUT; + } +} + +void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list) +{ + __blk_mq_requeue_request(rq); + + /* this request will be re-inserted to io scheduler queue */ + blk_mq_sched_requeue_request(rq); + + blk_mq_add_to_requeue_list(rq, true, kick_requeue_list); +} +EXPORT_SYMBOL(blk_mq_requeue_request); + +static void blk_mq_requeue_work(struct work_struct *work) +{ + struct request_queue *q = + container_of(work, struct request_queue, requeue_work.work); + LIST_HEAD(rq_list); + struct request *rq, *next; + + spin_lock_irq(&q->requeue_lock); + list_splice_init(&q->requeue_list, &rq_list); + spin_unlock_irq(&q->requeue_lock); + + list_for_each_entry_safe(rq, next, &rq_list, queuelist) { + if (!(rq->rq_flags & (RQF_SOFTBARRIER | RQF_DONTPREP))) + continue; + + rq->rq_flags &= ~RQF_SOFTBARRIER; + list_del_init(&rq->queuelist); + /* + * If RQF_DONTPREP, rq has contained some driver specific + * data, so insert it to hctx dispatch list to avoid any + * merge. + */ + if (rq->rq_flags & RQF_DONTPREP) + blk_mq_request_bypass_insert(rq, false, false); + else + blk_mq_sched_insert_request(rq, true, false, false); + } + + while (!list_empty(&rq_list)) { + rq = list_entry(rq_list.next, struct request, queuelist); + list_del_init(&rq->queuelist); + blk_mq_sched_insert_request(rq, false, false, false); + } + + blk_mq_run_hw_queues(q, false); +} + +void blk_mq_add_to_requeue_list(struct request *rq, bool at_head, + bool kick_requeue_list) +{ + struct request_queue *q = rq->q; + unsigned long flags; + + /* + * We abuse this flag that is otherwise used by the I/O scheduler to + * request head insertion from the workqueue. + */ + BUG_ON(rq->rq_flags & RQF_SOFTBARRIER); + + spin_lock_irqsave(&q->requeue_lock, flags); + if (at_head) { + rq->rq_flags |= RQF_SOFTBARRIER; + list_add(&rq->queuelist, &q->requeue_list); + } else { + list_add_tail(&rq->queuelist, &q->requeue_list); + } + spin_unlock_irqrestore(&q->requeue_lock, flags); + + if (kick_requeue_list) + blk_mq_kick_requeue_list(q); +} + +void blk_mq_kick_requeue_list(struct request_queue *q) +{ + kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0); +} +EXPORT_SYMBOL(blk_mq_kick_requeue_list); + +void blk_mq_delay_kick_requeue_list(struct request_queue *q, + unsigned long msecs) +{ + kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, + msecs_to_jiffies(msecs)); +} +EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list); + +struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) +{ + if (tag < tags->nr_tags) { + prefetch(tags->rqs[tag]); + return tags->rqs[tag]; + } + + return NULL; +} +EXPORT_SYMBOL(blk_mq_tag_to_rq); + +static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq, + void *priv, bool reserved) +{ + /* + * If we find a request that isn't idle and the queue matches, + * we know the queue is busy. Return false to stop the iteration. + */ + if (blk_mq_request_started(rq) && rq->q == hctx->queue) { + bool *busy = priv; + + *busy = true; + return false; + } + + return true; +} + +bool blk_mq_queue_inflight(struct request_queue *q) +{ + bool busy = false; + + blk_mq_queue_tag_busy_iter(q, blk_mq_rq_inflight, &busy); + return busy; +} +EXPORT_SYMBOL_GPL(blk_mq_queue_inflight); + +static void blk_mq_rq_timed_out(struct request *req, bool reserved) +{ + req->rq_flags |= RQF_TIMED_OUT; + if (req->q->mq_ops->timeout) { + enum blk_eh_timer_return ret; + + ret = req->q->mq_ops->timeout(req, reserved); + if (ret == BLK_EH_DONE) + return; + WARN_ON_ONCE(ret != BLK_EH_RESET_TIMER); + } + + blk_add_timer(req); +} + +static bool blk_mq_req_expired(struct request *rq, unsigned long *next) +{ + unsigned long deadline; + + if (blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT) + return false; + if (rq->rq_flags & RQF_TIMED_OUT) + return false; + + deadline = READ_ONCE(rq->deadline); + if (time_after_eq(jiffies, deadline)) + return true; + + if (*next == 0) + *next = deadline; + else if (time_after(*next, deadline)) + *next = deadline; + return false; +} + +void blk_mq_put_rq_ref(struct request *rq) +{ + if (is_flush_rq(rq)) + rq->end_io(rq, 0); + else if (refcount_dec_and_test(&rq->ref)) + __blk_mq_free_request(rq); +} + +static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, + struct request *rq, void *priv, bool reserved) +{ + unsigned long *next = priv; + + /* + * blk_mq_queue_tag_busy_iter() has locked the request, so it cannot + * be reallocated underneath the timeout handler's processing, then + * the expire check is reliable. If the request is not expired, then + * it was completed and reallocated as a new request after returning + * from blk_mq_check_expired(). + */ + if (blk_mq_req_expired(rq, next)) + blk_mq_rq_timed_out(rq, reserved); + return true; +} + +static void blk_mq_timeout_work(struct work_struct *work) +{ + struct request_queue *q = + container_of(work, struct request_queue, timeout_work); + unsigned long next = 0; + struct blk_mq_hw_ctx *hctx; + int i; + + /* A deadlock might occur if a request is stuck requiring a + * timeout at the same time a queue freeze is waiting + * completion, since the timeout code would not be able to + * acquire the queue reference here. + * + * That's why we don't use blk_queue_enter here; instead, we use + * percpu_ref_tryget directly, because we need to be able to + * obtain a reference even in the short window between the queue + * starting to freeze, by dropping the first reference in + * blk_freeze_queue_start, and the moment the last request is + * consumed, marked by the instant q_usage_counter reaches + * zero. + */ + if (!percpu_ref_tryget(&q->q_usage_counter)) + return; + + blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next); + + if (next != 0) { + mod_timer(&q->timeout, next); + } else { + /* + * Request timeouts are handled as a forward rolling timer. If + * we end up here it means that no requests are pending and + * also that no request has been pending for a while. Mark + * each hctx as idle. + */ + queue_for_each_hw_ctx(q, hctx, i) { + /* the hctx may be unmapped, so check it here */ + if (blk_mq_hw_queue_mapped(hctx)) + blk_mq_tag_idle(hctx); + } + } + blk_queue_exit(q); +} + +struct flush_busy_ctx_data { + struct blk_mq_hw_ctx *hctx; + struct list_head *list; +}; + +static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data) +{ + struct flush_busy_ctx_data *flush_data = data; + struct blk_mq_hw_ctx *hctx = flush_data->hctx; + struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; + enum hctx_type type = hctx->type; + + spin_lock(&ctx->lock); + list_splice_tail_init(&ctx->rq_lists[type], flush_data->list); + sbitmap_clear_bit(sb, bitnr); + spin_unlock(&ctx->lock); + return true; +} + +/* + * Process software queues that have been marked busy, splicing them + * to the for-dispatch + */ +void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) +{ + struct flush_busy_ctx_data data = { + .hctx = hctx, + .list = list, + }; + + sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data); +} +EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs); + +struct dispatch_rq_data { + struct blk_mq_hw_ctx *hctx; + struct request *rq; +}; + +static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr, + void *data) +{ + struct dispatch_rq_data *dispatch_data = data; + struct blk_mq_hw_ctx *hctx = dispatch_data->hctx; + struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; + enum hctx_type type = hctx->type; + + spin_lock(&ctx->lock); + if (!list_empty(&ctx->rq_lists[type])) { + dispatch_data->rq = list_entry_rq(ctx->rq_lists[type].next); + list_del_init(&dispatch_data->rq->queuelist); + if (list_empty(&ctx->rq_lists[type])) + sbitmap_clear_bit(sb, bitnr); + } + spin_unlock(&ctx->lock); + + return !dispatch_data->rq; +} + +struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *start) +{ + unsigned off = start ? start->index_hw[hctx->type] : 0; + struct dispatch_rq_data data = { + .hctx = hctx, + .rq = NULL, + }; + + __sbitmap_for_each_set(&hctx->ctx_map, off, + dispatch_rq_from_ctx, &data); + + return data.rq; +} + +static inline unsigned int queued_to_index(unsigned int queued) +{ + if (!queued) + return 0; + + return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1); +} + +static bool __blk_mq_get_driver_tag(struct request *rq) +{ + struct sbitmap_queue *bt = rq->mq_hctx->tags->bitmap_tags; + unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags; + int tag; + + blk_mq_tag_busy(rq->mq_hctx); + + if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) { + bt = rq->mq_hctx->tags->breserved_tags; + tag_offset = 0; + } else { + if (!hctx_may_queue(rq->mq_hctx, bt)) + return false; + } + + tag = __sbitmap_queue_get(bt); + if (tag == BLK_MQ_NO_TAG) + return false; + + rq->tag = tag + tag_offset; + return true; +} + +static bool blk_mq_get_driver_tag(struct request *rq) +{ + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; + + if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_get_driver_tag(rq)) + return false; + + if ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) && + !(rq->rq_flags & RQF_MQ_INFLIGHT)) { + rq->rq_flags |= RQF_MQ_INFLIGHT; + __blk_mq_inc_active_requests(hctx); + } + hctx->tags->rqs[rq->tag] = rq; + return true; +} + +static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, + int flags, void *key) +{ + struct blk_mq_hw_ctx *hctx; + + hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait); + + spin_lock(&hctx->dispatch_wait_lock); + if (!list_empty(&wait->entry)) { + struct sbitmap_queue *sbq; + + list_del_init(&wait->entry); + sbq = hctx->tags->bitmap_tags; + atomic_dec(&sbq->ws_active); + } + spin_unlock(&hctx->dispatch_wait_lock); + + blk_mq_run_hw_queue(hctx, true); + return 1; +} + +/* + * Mark us waiting for a tag. For shared tags, this involves hooking us into + * the tag wakeups. For non-shared tags, we can simply mark us needing a + * restart. For both cases, take care to check the condition again after + * marking us as waiting. + */ +static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, + struct request *rq) +{ + struct sbitmap_queue *sbq = hctx->tags->bitmap_tags; + struct wait_queue_head *wq; + wait_queue_entry_t *wait; + bool ret; + + if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) { + blk_mq_sched_mark_restart_hctx(hctx); + + /* + * It's possible that a tag was freed in the window between the + * allocation failure and adding the hardware queue to the wait + * queue. + * + * Don't clear RESTART here, someone else could have set it. + * At most this will cost an extra queue run. + */ + return blk_mq_get_driver_tag(rq); + } + + wait = &hctx->dispatch_wait; + if (!list_empty_careful(&wait->entry)) + return false; + + wq = &bt_wait_ptr(sbq, hctx)->wait; + + spin_lock_irq(&wq->lock); + spin_lock(&hctx->dispatch_wait_lock); + if (!list_empty(&wait->entry)) { + spin_unlock(&hctx->dispatch_wait_lock); + spin_unlock_irq(&wq->lock); + return false; + } + + atomic_inc(&sbq->ws_active); + wait->flags &= ~WQ_FLAG_EXCLUSIVE; + __add_wait_queue(wq, wait); + + /* + * It's possible that a tag was freed in the window between the + * allocation failure and adding the hardware queue to the wait + * queue. + */ + ret = blk_mq_get_driver_tag(rq); + if (!ret) { + spin_unlock(&hctx->dispatch_wait_lock); + spin_unlock_irq(&wq->lock); + return false; + } + + /* + * We got a tag, remove ourselves from the wait queue to ensure + * someone else gets the wakeup. + */ + list_del_init(&wait->entry); + atomic_dec(&sbq->ws_active); + spin_unlock(&hctx->dispatch_wait_lock); + spin_unlock_irq(&wq->lock); + + return true; +} + +#define BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT 8 +#define BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR 4 +/* + * Update dispatch busy with the Exponential Weighted Moving Average(EWMA): + * - EWMA is one simple way to compute running average value + * - weight(7/8 and 1/8) is applied so that it can decrease exponentially + * - take 4 as factor for avoiding to get too small(0) result, and this + * factor doesn't matter because EWMA decreases exponentially + */ +static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy) +{ + unsigned int ewma; + + ewma = hctx->dispatch_busy; + + if (!ewma && !busy) + return; + + ewma *= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT - 1; + if (busy) + ewma += 1 << BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR; + ewma /= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT; + + hctx->dispatch_busy = ewma; +} + +#define BLK_MQ_RESOURCE_DELAY 3 /* ms units */ + +static void blk_mq_handle_dev_resource(struct request *rq, + struct list_head *list) +{ + struct request *next = + list_first_entry_or_null(list, struct request, queuelist); + + /* + * If an I/O scheduler has been configured and we got a driver tag for + * the next request already, free it. + */ + if (next) + blk_mq_put_driver_tag(next); + + list_add(&rq->queuelist, list); + __blk_mq_requeue_request(rq); +} + +static void blk_mq_handle_zone_resource(struct request *rq, + struct list_head *zone_list) +{ + /* + * If we end up here it is because we cannot dispatch a request to a + * specific zone due to LLD level zone-write locking or other zone + * related resource not being available. In this case, set the request + * aside in zone_list for retrying it later. + */ + list_add(&rq->queuelist, zone_list); + __blk_mq_requeue_request(rq); +} + +enum prep_dispatch { + PREP_DISPATCH_OK, + PREP_DISPATCH_NO_TAG, + PREP_DISPATCH_NO_BUDGET, +}; + +static enum prep_dispatch blk_mq_prep_dispatch_rq(struct request *rq, + bool need_budget) +{ + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; + + if (need_budget && !blk_mq_get_dispatch_budget(rq->q)) { + blk_mq_put_driver_tag(rq); + return PREP_DISPATCH_NO_BUDGET; + } + + if (!blk_mq_get_driver_tag(rq)) { + /* + * The initial allocation attempt failed, so we need to + * rerun the hardware queue when a tag is freed. The + * waitqueue takes care of that. If the queue is run + * before we add this entry back on the dispatch list, + * we'll re-run it below. + */ + if (!blk_mq_mark_tag_wait(hctx, rq)) { + /* + * All budgets not got from this function will be put + * together during handling partial dispatch + */ + if (need_budget) + blk_mq_put_dispatch_budget(rq->q); + return PREP_DISPATCH_NO_TAG; + } + } + + return PREP_DISPATCH_OK; +} + +/* release all allocated budgets before calling to blk_mq_dispatch_rq_list */ +static void blk_mq_release_budgets(struct request_queue *q, + unsigned int nr_budgets) +{ + int i; + + for (i = 0; i < nr_budgets; i++) + blk_mq_put_dispatch_budget(q); +} + +/* + * Returns true if we did some work AND can potentially do more. + */ +bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, + unsigned int nr_budgets) +{ + enum prep_dispatch prep; + struct request_queue *q = hctx->queue; + struct request *rq, *nxt; + int errors, queued; + blk_status_t ret = BLK_STS_OK; + LIST_HEAD(zone_list); + bool needs_resource = false; + + if (list_empty(list)) + return false; + + /* + * Now process all the entries, sending them to the driver. + */ + errors = queued = 0; + do { + struct blk_mq_queue_data bd; + + rq = list_first_entry(list, struct request, queuelist); + + WARN_ON_ONCE(hctx != rq->mq_hctx); + prep = blk_mq_prep_dispatch_rq(rq, !nr_budgets); + if (prep != PREP_DISPATCH_OK) + break; + + list_del_init(&rq->queuelist); + + bd.rq = rq; + + /* + * Flag last if we have no more requests, or if we have more + * but can't assign a driver tag to it. + */ + if (list_empty(list)) + bd.last = true; + else { + nxt = list_first_entry(list, struct request, queuelist); + bd.last = !blk_mq_get_driver_tag(nxt); + } + + /* + * once the request is queued to lld, no need to cover the + * budget any more + */ + if (nr_budgets) + nr_budgets--; + ret = q->mq_ops->queue_rq(hctx, &bd); + switch (ret) { + case BLK_STS_OK: + queued++; + break; + case BLK_STS_RESOURCE: + needs_resource = true; + fallthrough; + case BLK_STS_DEV_RESOURCE: + blk_mq_handle_dev_resource(rq, list); + goto out; + case BLK_STS_ZONE_RESOURCE: + /* + * Move the request to zone_list and keep going through + * the dispatch list to find more requests the drive can + * accept. + */ + blk_mq_handle_zone_resource(rq, &zone_list); + needs_resource = true; + break; + default: + errors++; + blk_mq_end_request(rq, BLK_STS_IOERR); + } + } while (!list_empty(list)); +out: + if (!list_empty(&zone_list)) + list_splice_tail_init(&zone_list, list); + + hctx->dispatched[queued_to_index(queued)]++; + + /* If we didn't flush the entire list, we could have told the driver + * there was more coming, but that turned out to be a lie. + */ + if ((!list_empty(list) || errors || needs_resource || + ret == BLK_STS_DEV_RESOURCE) && q->mq_ops->commit_rqs && queued) + q->mq_ops->commit_rqs(hctx); + /* + * Any items that need requeuing? Stuff them into hctx->dispatch, + * that is where we will continue on next queue run. + */ + if (!list_empty(list)) { + bool needs_restart; + /* For non-shared tags, the RESTART check will suffice */ + bool no_tag = prep == PREP_DISPATCH_NO_TAG && + (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED); + + blk_mq_release_budgets(q, nr_budgets); + + spin_lock(&hctx->lock); + list_splice_tail_init(list, &hctx->dispatch); + spin_unlock(&hctx->lock); + + /* + * Order adding requests to hctx->dispatch and checking + * SCHED_RESTART flag. The pair of this smp_mb() is the one + * in blk_mq_sched_restart(). Avoid restart code path to + * miss the new added requests to hctx->dispatch, meantime + * SCHED_RESTART is observed here. + */ + smp_mb(); + + /* + * If SCHED_RESTART was set by the caller of this function and + * it is no longer set that means that it was cleared by another + * thread and hence that a queue rerun is needed. + * + * If 'no_tag' is set, that means that we failed getting + * a driver tag with an I/O scheduler attached. If our dispatch + * waitqueue is no longer active, ensure that we run the queue + * AFTER adding our entries back to the list. + * + * If no I/O scheduler has been configured it is possible that + * the hardware queue got stopped and restarted before requests + * were pushed back onto the dispatch list. Rerun the queue to + * avoid starvation. Notes: + * - blk_mq_run_hw_queue() checks whether or not a queue has + * been stopped before rerunning a queue. + * - Some but not all block drivers stop a queue before + * returning BLK_STS_RESOURCE. Two exceptions are scsi-mq + * and dm-rq. + * + * If driver returns BLK_STS_RESOURCE and SCHED_RESTART + * bit is set, run queue after a delay to avoid IO stalls + * that could otherwise occur if the queue is idle. We'll do + * similar if we couldn't get budget or couldn't lock a zone + * and SCHED_RESTART is set. + */ + needs_restart = blk_mq_sched_needs_restart(hctx); + if (prep == PREP_DISPATCH_NO_BUDGET) + needs_resource = true; + if (!needs_restart || + (no_tag && list_empty_careful(&hctx->dispatch_wait.entry))) + blk_mq_run_hw_queue(hctx, true); + else if (needs_restart && needs_resource) + blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY); + + blk_mq_update_dispatch_busy(hctx, true); + return false; + } else + blk_mq_update_dispatch_busy(hctx, false); + + return (queued + errors) != 0; +} + +/** + * __blk_mq_run_hw_queue - Run a hardware queue. + * @hctx: Pointer to the hardware queue to run. + * + * Send pending requests to the hardware. + */ +static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) +{ + int srcu_idx; + + /* + * We should be running this queue from one of the CPUs that + * are mapped to it. + * + * There are at least two related races now between setting + * hctx->next_cpu from blk_mq_hctx_next_cpu() and running + * __blk_mq_run_hw_queue(): + * + * - hctx->next_cpu is found offline in blk_mq_hctx_next_cpu(), + * but later it becomes online, then this warning is harmless + * at all + * + * - hctx->next_cpu is found online in blk_mq_hctx_next_cpu(), + * but later it becomes offline, then the warning can't be + * triggered, and we depend on blk-mq timeout handler to + * handle dispatched requests to this hctx + */ + if (!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) && + cpu_online(hctx->next_cpu)) { + printk(KERN_WARNING "run queue from wrong CPU %d, hctx %s\n", + raw_smp_processor_id(), + cpumask_empty(hctx->cpumask) ? "inactive": "active"); + dump_stack(); + } + + /* + * We can't run the queue inline with ints disabled. Ensure that + * we catch bad users of this early. + */ + WARN_ON_ONCE(in_interrupt()); + + might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); + + hctx_lock(hctx, &srcu_idx); + blk_mq_sched_dispatch_requests(hctx); + hctx_unlock(hctx, srcu_idx); +} + +static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx) +{ + int cpu = cpumask_first_and(hctx->cpumask, cpu_online_mask); + + if (cpu >= nr_cpu_ids) + cpu = cpumask_first(hctx->cpumask); + return cpu; +} + +/* + * It'd be great if the workqueue API had a way to pass + * in a mask and had some smarts for more clever placement. + * For now we just round-robin here, switching for every + * BLK_MQ_CPU_WORK_BATCH queued items. + */ +static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) +{ + bool tried = false; + int next_cpu = hctx->next_cpu; + + if (hctx->queue->nr_hw_queues == 1) + return WORK_CPU_UNBOUND; + + if (--hctx->next_cpu_batch <= 0) { +select_cpu: + next_cpu = cpumask_next_and(next_cpu, hctx->cpumask, + cpu_online_mask); + if (next_cpu >= nr_cpu_ids) + next_cpu = blk_mq_first_mapped_cpu(hctx); + hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; + } + + /* + * Do unbound schedule if we can't find a online CPU for this hctx, + * and it should only happen in the path of handling CPU DEAD. + */ + if (!cpu_online(next_cpu)) { + if (!tried) { + tried = true; + goto select_cpu; + } + + /* + * Make sure to re-select CPU next time once after CPUs + * in hctx->cpumask become online again. + */ + hctx->next_cpu = next_cpu; + hctx->next_cpu_batch = 1; + return WORK_CPU_UNBOUND; + } + + hctx->next_cpu = next_cpu; + return next_cpu; +} + +/** + * __blk_mq_delay_run_hw_queue - Run (or schedule to run) a hardware queue. + * @hctx: Pointer to the hardware queue to run. + * @async: If we want to run the queue asynchronously. + * @msecs: Microseconds of delay to wait before running the queue. + * + * If !@async, try to run the queue now. Else, run the queue asynchronously and + * with a delay of @msecs. + */ +static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async, + unsigned long msecs) +{ + if (unlikely(blk_mq_hctx_stopped(hctx))) + return; + + if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) { + int cpu = get_cpu(); + if (cpumask_test_cpu(cpu, hctx->cpumask)) { + __blk_mq_run_hw_queue(hctx); + put_cpu(); + return; + } + + put_cpu(); + } + + kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work, + msecs_to_jiffies(msecs)); +} + +/** + * blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously. + * @hctx: Pointer to the hardware queue to run. + * @msecs: Microseconds of delay to wait before running the queue. + * + * Run a hardware queue asynchronously with a delay of @msecs. + */ +void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) +{ + __blk_mq_delay_run_hw_queue(hctx, true, msecs); +} +EXPORT_SYMBOL(blk_mq_delay_run_hw_queue); + +/** + * blk_mq_run_hw_queue - Start to run a hardware queue. + * @hctx: Pointer to the hardware queue to run. + * @async: If we want to run the queue asynchronously. + * + * Check if the request queue is not in a quiesced state and if there are + * pending requests to be sent. If this is true, run the queue to send requests + * to hardware. + */ +void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) +{ + int srcu_idx; + bool need_run; + + /* + * When queue is quiesced, we may be switching io scheduler, or + * updating nr_hw_queues, or other things, and we can't run queue + * any more, even __blk_mq_hctx_has_pending() can't be called safely. + * + * And queue will be rerun in blk_mq_unquiesce_queue() if it is + * quiesced. + */ + hctx_lock(hctx, &srcu_idx); + need_run = !blk_queue_quiesced(hctx->queue) && + blk_mq_hctx_has_pending(hctx); + hctx_unlock(hctx, srcu_idx); + + if (need_run) + __blk_mq_delay_run_hw_queue(hctx, async, 0); +} +EXPORT_SYMBOL(blk_mq_run_hw_queue); + +/** + * blk_mq_run_hw_queues - Run all hardware queues in a request queue. + * @q: Pointer to the request queue to run. + * @async: If we want to run the queue asynchronously. + */ +void blk_mq_run_hw_queues(struct request_queue *q, bool async) +{ + struct blk_mq_hw_ctx *hctx; + int i; + + queue_for_each_hw_ctx(q, hctx, i) { + if (blk_mq_hctx_stopped(hctx)) + continue; + + blk_mq_run_hw_queue(hctx, async); + } +} +EXPORT_SYMBOL(blk_mq_run_hw_queues); + +/** + * blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously. + * @q: Pointer to the request queue to run. + * @msecs: Microseconds of delay to wait before running the queues. + */ +void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs) +{ + struct blk_mq_hw_ctx *hctx; + int i; + + queue_for_each_hw_ctx(q, hctx, i) { + if (blk_mq_hctx_stopped(hctx)) + continue; + + blk_mq_delay_run_hw_queue(hctx, msecs); + } +} +EXPORT_SYMBOL(blk_mq_delay_run_hw_queues); + +/** + * blk_mq_queue_stopped() - check whether one or more hctxs have been stopped + * @q: request queue. + * + * The caller is responsible for serializing this function against + * blk_mq_{start,stop}_hw_queue(). + */ +bool blk_mq_queue_stopped(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + int i; + + queue_for_each_hw_ctx(q, hctx, i) + if (blk_mq_hctx_stopped(hctx)) + return true; + + return false; +} +EXPORT_SYMBOL(blk_mq_queue_stopped); + +/* + * This function is often used for pausing .queue_rq() by driver when + * there isn't enough resource or some conditions aren't satisfied, and + * BLK_STS_RESOURCE is usually returned. + * + * We do not guarantee that dispatch can be drained or blocked + * after blk_mq_stop_hw_queue() returns. Please use + * blk_mq_quiesce_queue() for that requirement. + */ +void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) +{ + cancel_delayed_work(&hctx->run_work); + + set_bit(BLK_MQ_S_STOPPED, &hctx->state); +} +EXPORT_SYMBOL(blk_mq_stop_hw_queue); + +/* + * This function is often used for pausing .queue_rq() by driver when + * there isn't enough resource or some conditions aren't satisfied, and + * BLK_STS_RESOURCE is usually returned. + * + * We do not guarantee that dispatch can be drained or blocked + * after blk_mq_stop_hw_queues() returns. Please use + * blk_mq_quiesce_queue() for that requirement. + */ +void blk_mq_stop_hw_queues(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + int i; + + queue_for_each_hw_ctx(q, hctx, i) + blk_mq_stop_hw_queue(hctx); +} +EXPORT_SYMBOL(blk_mq_stop_hw_queues); + +void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx) +{ + clear_bit(BLK_MQ_S_STOPPED, &hctx->state); + + blk_mq_run_hw_queue(hctx, false); +} +EXPORT_SYMBOL(blk_mq_start_hw_queue); + +void blk_mq_start_hw_queues(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + int i; + + queue_for_each_hw_ctx(q, hctx, i) + blk_mq_start_hw_queue(hctx); +} +EXPORT_SYMBOL(blk_mq_start_hw_queues); + +void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) +{ + if (!blk_mq_hctx_stopped(hctx)) + return; + + clear_bit(BLK_MQ_S_STOPPED, &hctx->state); + blk_mq_run_hw_queue(hctx, async); +} +EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue); + +void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async) +{ + struct blk_mq_hw_ctx *hctx; + int i; + + queue_for_each_hw_ctx(q, hctx, i) + blk_mq_start_stopped_hw_queue(hctx, async); +} +EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); + +static void blk_mq_run_work_fn(struct work_struct *work) +{ + struct blk_mq_hw_ctx *hctx; + + hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work); + + /* + * If we are stopped, don't run the queue. + */ + if (blk_mq_hctx_stopped(hctx)) + return; + + __blk_mq_run_hw_queue(hctx); +} + +static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx, + struct request *rq, + bool at_head) +{ + struct blk_mq_ctx *ctx = rq->mq_ctx; + enum hctx_type type = hctx->type; + + lockdep_assert_held(&ctx->lock); + + trace_block_rq_insert(rq); + + if (at_head) + list_add(&rq->queuelist, &ctx->rq_lists[type]); + else + list_add_tail(&rq->queuelist, &ctx->rq_lists[type]); +} + +void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, + bool at_head) +{ + struct blk_mq_ctx *ctx = rq->mq_ctx; + + lockdep_assert_held(&ctx->lock); + + __blk_mq_insert_req_list(hctx, rq, at_head); + blk_mq_hctx_mark_pending(hctx, ctx); +} + +/** + * blk_mq_request_bypass_insert - Insert a request at dispatch list. + * @rq: Pointer to request to be inserted. + * @at_head: true if the request should be inserted at the head of the list. + * @run_queue: If we should run the hardware queue after inserting the request. + * + * Should only be used carefully, when the caller knows we want to + * bypass a potential IO scheduler on the target device. + */ +void blk_mq_request_bypass_insert(struct request *rq, bool at_head, + bool run_queue) +{ + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; + + spin_lock(&hctx->lock); + if (at_head) + list_add(&rq->queuelist, &hctx->dispatch); + else + list_add_tail(&rq->queuelist, &hctx->dispatch); + spin_unlock(&hctx->lock); + + if (run_queue) + blk_mq_run_hw_queue(hctx, false); +} + +void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, + struct list_head *list) + +{ + struct request *rq; + enum hctx_type type = hctx->type; + + /* + * preemption doesn't flush plug list, so it's possible ctx->cpu is + * offline now + */ + list_for_each_entry(rq, list, queuelist) { + BUG_ON(rq->mq_ctx != ctx); + trace_block_rq_insert(rq); + } + + spin_lock(&ctx->lock); + list_splice_tail_init(list, &ctx->rq_lists[type]); + blk_mq_hctx_mark_pending(hctx, ctx); + spin_unlock(&ctx->lock); +} + +static int plug_rq_cmp(void *priv, const struct list_head *a, + const struct list_head *b) +{ + struct request *rqa = container_of(a, struct request, queuelist); + struct request *rqb = container_of(b, struct request, queuelist); + + if (rqa->mq_ctx != rqb->mq_ctx) + return rqa->mq_ctx > rqb->mq_ctx; + if (rqa->mq_hctx != rqb->mq_hctx) + return rqa->mq_hctx > rqb->mq_hctx; + + return blk_rq_pos(rqa) > blk_rq_pos(rqb); +} + +void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) +{ + LIST_HEAD(list); + + if (list_empty(&plug->mq_list)) + return; + list_splice_init(&plug->mq_list, &list); + + if (plug->rq_count > 2 && plug->multiple_queues) + list_sort(NULL, &list, plug_rq_cmp); + + plug->rq_count = 0; + + do { + struct list_head rq_list; + struct request *rq, *head_rq = list_entry_rq(list.next); + struct list_head *pos = &head_rq->queuelist; /* skip first */ + struct blk_mq_hw_ctx *this_hctx = head_rq->mq_hctx; + struct blk_mq_ctx *this_ctx = head_rq->mq_ctx; + unsigned int depth = 1; + + list_for_each_continue(pos, &list) { + rq = list_entry_rq(pos); + BUG_ON(!rq->q); + if (rq->mq_hctx != this_hctx || rq->mq_ctx != this_ctx) + break; + depth++; + } + + list_cut_before(&rq_list, &list, pos); + trace_block_unplug(head_rq->q, depth, !from_schedule); + blk_mq_sched_insert_requests(this_hctx, this_ctx, &rq_list, + from_schedule); + } while(!list_empty(&list)); +} + +static void blk_mq_bio_to_request(struct request *rq, struct bio *bio, + unsigned int nr_segs) +{ + int err; + + if (bio->bi_opf & REQ_RAHEAD) + rq->cmd_flags |= REQ_FAILFAST_MASK; + + rq->__sector = bio->bi_iter.bi_sector; + rq->write_hint = bio->bi_write_hint; + blk_rq_bio_prep(rq, bio, nr_segs); + + /* This can't fail, since GFP_NOIO includes __GFP_DIRECT_RECLAIM. */ + err = blk_crypto_rq_bio_prep(rq, bio, GFP_NOIO); + WARN_ON_ONCE(err); + + blk_account_io_start(rq); +} + +static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, + struct request *rq, + blk_qc_t *cookie, bool last) +{ + struct request_queue *q = rq->q; + struct blk_mq_queue_data bd = { + .rq = rq, + .last = last, + }; + blk_qc_t new_cookie; + blk_status_t ret; + + new_cookie = request_to_qc_t(hctx, rq); + + /* + * For OK queue, we are done. For error, caller may kill it. + * Any other error (busy), just add it to our list as we + * previously would have done. + */ + ret = q->mq_ops->queue_rq(hctx, &bd); + switch (ret) { + case BLK_STS_OK: + blk_mq_update_dispatch_busy(hctx, false); + *cookie = new_cookie; + break; + case BLK_STS_RESOURCE: + case BLK_STS_DEV_RESOURCE: + blk_mq_update_dispatch_busy(hctx, true); + __blk_mq_requeue_request(rq); + break; + default: + blk_mq_update_dispatch_busy(hctx, false); + *cookie = BLK_QC_T_NONE; + break; + } + + return ret; +} + +static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, + struct request *rq, + blk_qc_t *cookie, + bool bypass_insert, bool last) +{ + struct request_queue *q = rq->q; + bool run_queue = true; + + /* + * RCU or SRCU read lock is needed before checking quiesced flag. + * + * When queue is stopped or quiesced, ignore 'bypass_insert' from + * blk_mq_request_issue_directly(), and return BLK_STS_OK to caller, + * and avoid driver to try to dispatch again. + */ + if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) { + run_queue = false; + bypass_insert = false; + goto insert; + } + + if (q->elevator && !bypass_insert) + goto insert; + + if (!blk_mq_get_dispatch_budget(q)) + goto insert; + + if (!blk_mq_get_driver_tag(rq)) { + blk_mq_put_dispatch_budget(q); + goto insert; + } + + return __blk_mq_issue_directly(hctx, rq, cookie, last); +insert: + if (bypass_insert) + return BLK_STS_RESOURCE; + + blk_mq_sched_insert_request(rq, false, run_queue, false); + + return BLK_STS_OK; +} + +/** + * blk_mq_try_issue_directly - Try to send a request directly to device driver. + * @hctx: Pointer of the associated hardware queue. + * @rq: Pointer to request to be sent. + * @cookie: Request queue cookie. + * + * If the device has enough resources to accept a new request now, send the + * request directly to device driver. Else, insert at hctx->dispatch queue, so + * we can try send it another time in the future. Requests inserted at this + * queue have higher priority. + */ +static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, + struct request *rq, blk_qc_t *cookie) +{ + blk_status_t ret; + int srcu_idx; + + might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); + + hctx_lock(hctx, &srcu_idx); + + ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false, true); + if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) + blk_mq_request_bypass_insert(rq, false, true); + else if (ret != BLK_STS_OK) + blk_mq_end_request(rq, ret); + + hctx_unlock(hctx, srcu_idx); +} + +blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last) +{ + blk_status_t ret; + int srcu_idx; + blk_qc_t unused_cookie; + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; + + hctx_lock(hctx, &srcu_idx); + ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true, last); + hctx_unlock(hctx, srcu_idx); + + return ret; +} + +void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, + struct list_head *list) +{ + int queued = 0; + int errors = 0; + + while (!list_empty(list)) { + blk_status_t ret; + struct request *rq = list_first_entry(list, struct request, + queuelist); + + list_del_init(&rq->queuelist); + ret = blk_mq_request_issue_directly(rq, list_empty(list)); + if (ret != BLK_STS_OK) { + errors++; + if (ret == BLK_STS_RESOURCE || + ret == BLK_STS_DEV_RESOURCE) { + blk_mq_request_bypass_insert(rq, false, + list_empty(list)); + break; + } + blk_mq_end_request(rq, ret); + } else + queued++; + } + + /* + * If we didn't flush the entire list, we could have told + * the driver there was more coming, but that turned out to + * be a lie. + */ + if ((!list_empty(list) || errors) && + hctx->queue->mq_ops->commit_rqs && queued) + hctx->queue->mq_ops->commit_rqs(hctx); +} + +static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) +{ + list_add_tail(&rq->queuelist, &plug->mq_list); + plug->rq_count++; + if (!plug->multiple_queues && !list_is_singular(&plug->mq_list)) { + struct request *tmp; + + tmp = list_first_entry(&plug->mq_list, struct request, + queuelist); + if (tmp->q != rq->q) + plug->multiple_queues = true; + } +} + +/* + * Allow 2x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple + * queues. This is important for md arrays to benefit from merging + * requests. + */ +static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug) +{ + if (plug->multiple_queues) + return BLK_MAX_REQUEST_COUNT * 2; + return BLK_MAX_REQUEST_COUNT; +} + +/** + * blk_mq_submit_bio - Create and send a request to block device. + * @bio: Bio pointer. + * + * Builds up a request structure from @q and @bio and send to the device. The + * request may not be queued directly to hardware if: + * * This request can be merged with another one + * * We want to place request at plug queue for possible future merging + * * There is an IO scheduler active at this queue + * + * It will not queue the request if there is an error with the bio, or at the + * request creation. + * + * Returns: Request queue cookie. + */ +blk_qc_t blk_mq_submit_bio(struct bio *bio) +{ + struct request_queue *q = bio->bi_disk->queue; + const int is_sync = op_is_sync(bio->bi_opf); + const int is_flush_fua = op_is_flush(bio->bi_opf); + struct blk_mq_alloc_data data = { + .q = q, + }; + struct request *rq; + struct blk_plug *plug; + struct request *same_queue_rq = NULL; + unsigned int nr_segs; + blk_qc_t cookie; + blk_status_t ret; + + blk_queue_bounce(q, &bio); + __blk_queue_split(&bio, &nr_segs); + + if (!bio_integrity_prep(bio)) + goto queue_exit; + + if (!is_flush_fua && !blk_queue_nomerges(q) && + blk_attempt_plug_merge(q, bio, nr_segs, &same_queue_rq)) + goto queue_exit; + + if (blk_mq_sched_bio_merge(q, bio, nr_segs)) + goto queue_exit; + + rq_qos_throttle(q, bio); + + data.cmd_flags = bio->bi_opf; + rq = __blk_mq_alloc_request(&data); + if (unlikely(!rq)) { + rq_qos_cleanup(q, bio); + if (bio->bi_opf & REQ_NOWAIT) + bio_wouldblock_error(bio); + goto queue_exit; + } + + trace_block_getrq(q, bio, bio->bi_opf); + + rq_qos_track(q, rq, bio); + + cookie = request_to_qc_t(data.hctx, rq); + + blk_mq_bio_to_request(rq, bio, nr_segs); + + ret = blk_crypto_rq_get_keyslot(rq); + if (ret != BLK_STS_OK) { + bio->bi_status = ret; + bio_endio(bio); + blk_mq_free_request(rq); + return BLK_QC_T_NONE; + } + + plug = blk_mq_plug(q, bio); + if (unlikely(is_flush_fua)) { + /* Bypass scheduler for flush requests */ + blk_insert_flush(rq); + blk_mq_run_hw_queue(data.hctx, true); + } else if (plug && (q->nr_hw_queues == 1 || + blk_mq_is_sbitmap_shared(rq->mq_hctx->flags) || + q->mq_ops->commit_rqs || !blk_queue_nonrot(q))) { + /* + * Use plugging if we have a ->commit_rqs() hook as well, as + * we know the driver uses bd->last in a smart fashion. + * + * Use normal plugging if this disk is slow HDD, as sequential + * IO may benefit a lot from plug merging. + */ + unsigned int request_count = plug->rq_count; + struct request *last = NULL; + + if (!request_count) + trace_block_plug(q); + else + last = list_entry_rq(plug->mq_list.prev); + + if (request_count >= blk_plug_max_rq_count(plug) || (last && + blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) { + blk_flush_plug_list(plug, false); + trace_block_plug(q); + } + + blk_add_rq_to_plug(plug, rq); + } else if (q->elevator) { + /* Insert the request at the IO scheduler queue */ + blk_mq_sched_insert_request(rq, false, true, true); + } else if (plug && !blk_queue_nomerges(q)) { + /* + * We do limited plugging. If the bio can be merged, do that. + * Otherwise the existing request in the plug list will be + * issued. So the plug list will have one request at most + * The plug list might get flushed before this. If that happens, + * the plug list is empty, and same_queue_rq is invalid. + */ + if (list_empty(&plug->mq_list)) + same_queue_rq = NULL; + if (same_queue_rq) { + list_del_init(&same_queue_rq->queuelist); + plug->rq_count--; + } + blk_add_rq_to_plug(plug, rq); + trace_block_plug(q); + + if (same_queue_rq) { + data.hctx = same_queue_rq->mq_hctx; + trace_block_unplug(q, 1, true); + blk_mq_try_issue_directly(data.hctx, same_queue_rq, + &cookie); + } + } else if ((q->nr_hw_queues > 1 && is_sync) || + !data.hctx->dispatch_busy) { + /* + * There is no scheduler and we can try to send directly + * to the hardware. + */ + blk_mq_try_issue_directly(data.hctx, rq, &cookie); + } else { + /* Default case. */ + blk_mq_sched_insert_request(rq, false, true, true); + } + + return cookie; +queue_exit: + blk_queue_exit(q); + return BLK_QC_T_NONE; +} + +static size_t order_to_size(unsigned int order) +{ + return (size_t)PAGE_SIZE << order; +} + +/* called before freeing request pool in @tags */ +static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set, + struct blk_mq_tags *tags, unsigned int hctx_idx) +{ + struct blk_mq_tags *drv_tags = set->tags[hctx_idx]; + struct page *page; + unsigned long flags; + + list_for_each_entry(page, &tags->page_list, lru) { + unsigned long start = (unsigned long)page_address(page); + unsigned long end = start + order_to_size(page->private); + int i; + + for (i = 0; i < set->queue_depth; i++) { + struct request *rq = drv_tags->rqs[i]; + unsigned long rq_addr = (unsigned long)rq; + + if (rq_addr >= start && rq_addr < end) { + WARN_ON_ONCE(refcount_read(&rq->ref) != 0); + cmpxchg(&drv_tags->rqs[i], rq, NULL); + } + } + } + + /* + * Wait until all pending iteration is done. + * + * Request reference is cleared and it is guaranteed to be observed + * after the ->lock is released. + */ + spin_lock_irqsave(&drv_tags->lock, flags); + spin_unlock_irqrestore(&drv_tags->lock, flags); +} + +void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, + unsigned int hctx_idx) +{ + struct page *page; + + if (tags->rqs && set->ops->exit_request) { + int i; + + for (i = 0; i < tags->nr_tags; i++) { + struct request *rq = tags->static_rqs[i]; + + if (!rq) + continue; + set->ops->exit_request(set, rq, hctx_idx); + tags->static_rqs[i] = NULL; + } + } + + blk_mq_clear_rq_mapping(set, tags, hctx_idx); + + while (!list_empty(&tags->page_list)) { + page = list_first_entry(&tags->page_list, struct page, lru); + list_del_init(&page->lru); + /* + * Remove kmemleak object previously allocated in + * blk_mq_alloc_rqs(). + */ + kmemleak_free(page_address(page)); + __free_pages(page, page->private); + } +} + +void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags) +{ + kfree(tags->rqs); + tags->rqs = NULL; + kfree(tags->static_rqs); + tags->static_rqs = NULL; + + blk_mq_free_tags(tags, flags); +} + +struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, + unsigned int hctx_idx, + unsigned int nr_tags, + unsigned int reserved_tags, + unsigned int flags) +{ + struct blk_mq_tags *tags; + int node; + + node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx); + if (node == NUMA_NO_NODE) + node = set->numa_node; + + tags = blk_mq_init_tags(nr_tags, reserved_tags, node, flags); + if (!tags) + return NULL; + + tags->rqs = kcalloc_node(nr_tags, sizeof(struct request *), + GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, + node); + if (!tags->rqs) { + blk_mq_free_tags(tags, flags); + return NULL; + } + + tags->static_rqs = kcalloc_node(nr_tags, sizeof(struct request *), + GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, + node); + if (!tags->static_rqs) { + kfree(tags->rqs); + blk_mq_free_tags(tags, flags); + return NULL; + } + + return tags; +} + +static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq, + unsigned int hctx_idx, int node) +{ + int ret; + + if (set->ops->init_request) { + ret = set->ops->init_request(set, rq, hctx_idx, node); + if (ret) + return ret; + } + + WRITE_ONCE(rq->state, MQ_RQ_IDLE); + return 0; +} + +int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, + unsigned int hctx_idx, unsigned int depth) +{ + unsigned int i, j, entries_per_page, max_order = 4; + size_t rq_size, left; + int node; + + node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx); + if (node == NUMA_NO_NODE) + node = set->numa_node; + + INIT_LIST_HEAD(&tags->page_list); + + /* + * rq_size is the size of the request plus driver payload, rounded + * to the cacheline size + */ + rq_size = round_up(sizeof(struct request) + set->cmd_size, + cache_line_size()); + left = rq_size * depth; + + for (i = 0; i < depth; ) { + int this_order = max_order; + struct page *page; + int to_do; + void *p; + + while (this_order && left < order_to_size(this_order - 1)) + this_order--; + + do { + page = alloc_pages_node(node, + GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO, + this_order); + if (page) + break; + if (!this_order--) + break; + if (order_to_size(this_order) < rq_size) + break; + } while (1); + + if (!page) + goto fail; + + page->private = this_order; + list_add_tail(&page->lru, &tags->page_list); + + p = page_address(page); + /* + * Allow kmemleak to scan these pages as they contain pointers + * to additional allocations like via ops->init_request(). + */ + kmemleak_alloc(p, order_to_size(this_order), 1, GFP_NOIO); + entries_per_page = order_to_size(this_order) / rq_size; + to_do = min(entries_per_page, depth - i); + left -= to_do * rq_size; + for (j = 0; j < to_do; j++) { + struct request *rq = p; + + tags->static_rqs[i] = rq; + if (blk_mq_init_request(set, rq, hctx_idx, node)) { + tags->static_rqs[i] = NULL; + goto fail; + } + + p += rq_size; + i++; + } + } + return 0; + +fail: + blk_mq_free_rqs(set, tags, hctx_idx); + return -ENOMEM; +} + +struct rq_iter_data { + struct blk_mq_hw_ctx *hctx; + bool has_rq; +}; + +static bool blk_mq_has_request(struct request *rq, void *data, bool reserved) +{ + struct rq_iter_data *iter_data = data; + + if (rq->mq_hctx != iter_data->hctx) + return true; + iter_data->has_rq = true; + return false; +} + +static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx) +{ + struct blk_mq_tags *tags = hctx->sched_tags ? + hctx->sched_tags : hctx->tags; + struct rq_iter_data data = { + .hctx = hctx, + }; + + blk_mq_all_tag_iter(tags, blk_mq_has_request, &data); + return data.has_rq; +} + +static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu, + struct blk_mq_hw_ctx *hctx) +{ + if (cpumask_next_and(-1, hctx->cpumask, cpu_online_mask) != cpu) + return false; + if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids) + return false; + return true; +} + +static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node) +{ + struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node, + struct blk_mq_hw_ctx, cpuhp_online); + + if (!cpumask_test_cpu(cpu, hctx->cpumask) || + !blk_mq_last_cpu_in_hctx(cpu, hctx)) + return 0; + + /* + * Prevent new request from being allocated on the current hctx. + * + * The smp_mb__after_atomic() Pairs with the implied barrier in + * test_and_set_bit_lock in sbitmap_get(). Ensures the inactive flag is + * seen once we return from the tag allocator. + */ + set_bit(BLK_MQ_S_INACTIVE, &hctx->state); + smp_mb__after_atomic(); + + /* + * Try to grab a reference to the queue and wait for any outstanding + * requests. If we could not grab a reference the queue has been + * frozen and there are no requests. + */ + if (percpu_ref_tryget(&hctx->queue->q_usage_counter)) { + while (blk_mq_hctx_has_requests(hctx)) + msleep(5); + percpu_ref_put(&hctx->queue->q_usage_counter); + } + + return 0; +} + +static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node) +{ + struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node, + struct blk_mq_hw_ctx, cpuhp_online); + + if (cpumask_test_cpu(cpu, hctx->cpumask)) + clear_bit(BLK_MQ_S_INACTIVE, &hctx->state); + return 0; +} + +/* + * 'cpu' is going away. splice any existing rq_list entries from this + * software queue to the hw queue dispatch list, and ensure that it + * gets run. + */ +static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node) +{ + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx; + LIST_HEAD(tmp); + enum hctx_type type; + + hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead); + if (!cpumask_test_cpu(cpu, hctx->cpumask)) + return 0; + + ctx = __blk_mq_get_ctx(hctx->queue, cpu); + type = hctx->type; + + spin_lock(&ctx->lock); + if (!list_empty(&ctx->rq_lists[type])) { + list_splice_init(&ctx->rq_lists[type], &tmp); + blk_mq_hctx_clear_pending(hctx, ctx); + } + spin_unlock(&ctx->lock); + + if (list_empty(&tmp)) + return 0; + + spin_lock(&hctx->lock); + list_splice_tail_init(&tmp, &hctx->dispatch); + spin_unlock(&hctx->lock); + + blk_mq_run_hw_queue(hctx, true); + return 0; +} + +static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx) +{ + if (!(hctx->flags & BLK_MQ_F_STACKING)) + cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE, + &hctx->cpuhp_online); + cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD, + &hctx->cpuhp_dead); +} + +/* + * Before freeing hw queue, clearing the flush request reference in + * tags->rqs[] for avoiding potential UAF. + */ +static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags, + unsigned int queue_depth, struct request *flush_rq) +{ + int i; + unsigned long flags; + + /* The hw queue may not be mapped yet */ + if (!tags) + return; + + WARN_ON_ONCE(refcount_read(&flush_rq->ref) != 0); + + for (i = 0; i < queue_depth; i++) + cmpxchg(&tags->rqs[i], flush_rq, NULL); + + /* + * Wait until all pending iteration is done. + * + * Request reference is cleared and it is guaranteed to be observed + * after the ->lock is released. + */ + spin_lock_irqsave(&tags->lock, flags); + spin_unlock_irqrestore(&tags->lock, flags); +} + +/* hctx->ctxs will be freed in queue's release handler */ +static void blk_mq_exit_hctx(struct request_queue *q, + struct blk_mq_tag_set *set, + struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) +{ + struct request *flush_rq = hctx->fq->flush_rq; + + if (blk_mq_hw_queue_mapped(hctx)) + blk_mq_tag_idle(hctx); + + blk_mq_clear_flush_rq_mapping(set->tags[hctx_idx], + set->queue_depth, flush_rq); + if (set->ops->exit_request) + set->ops->exit_request(set, flush_rq, hctx_idx); + + if (set->ops->exit_hctx) + set->ops->exit_hctx(hctx, hctx_idx); + + blk_mq_remove_cpuhp(hctx); + + spin_lock(&q->unused_hctx_lock); + list_add(&hctx->hctx_list, &q->unused_hctx_list); + spin_unlock(&q->unused_hctx_lock); +} + +static void blk_mq_exit_hw_queues(struct request_queue *q, + struct blk_mq_tag_set *set, int nr_queue) +{ + struct blk_mq_hw_ctx *hctx; + unsigned int i; + + queue_for_each_hw_ctx(q, hctx, i) { + if (i == nr_queue) + break; + blk_mq_debugfs_unregister_hctx(hctx); + blk_mq_exit_hctx(q, set, hctx, i); + } +} + +static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set) +{ + int hw_ctx_size = sizeof(struct blk_mq_hw_ctx); + + BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu), + __alignof__(struct blk_mq_hw_ctx)) != + sizeof(struct blk_mq_hw_ctx)); + + if (tag_set->flags & BLK_MQ_F_BLOCKING) + hw_ctx_size += sizeof(struct srcu_struct); + + return hw_ctx_size; +} + +static int blk_mq_init_hctx(struct request_queue *q, + struct blk_mq_tag_set *set, + struct blk_mq_hw_ctx *hctx, unsigned hctx_idx) +{ + hctx->queue_num = hctx_idx; + + if (!(hctx->flags & BLK_MQ_F_STACKING)) + cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE, + &hctx->cpuhp_online); + cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead); + + hctx->tags = set->tags[hctx_idx]; + + if (set->ops->init_hctx && + set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) + goto unregister_cpu_notifier; + + if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, + hctx->numa_node)) + goto exit_hctx; + return 0; + + exit_hctx: + if (set->ops->exit_hctx) + set->ops->exit_hctx(hctx, hctx_idx); + unregister_cpu_notifier: + blk_mq_remove_cpuhp(hctx); + return -1; +} + +static struct blk_mq_hw_ctx * +blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set, + int node) +{ + struct blk_mq_hw_ctx *hctx; + gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY; + + hctx = kzalloc_node(blk_mq_hw_ctx_size(set), gfp, node); + if (!hctx) + goto fail_alloc_hctx; + + if (!zalloc_cpumask_var_node(&hctx->cpumask, gfp, node)) + goto free_hctx; + + atomic_set(&hctx->nr_active, 0); + atomic_set(&hctx->elevator_queued, 0); + if (node == NUMA_NO_NODE) + node = set->numa_node; + hctx->numa_node = node; + + INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn); + spin_lock_init(&hctx->lock); + INIT_LIST_HEAD(&hctx->dispatch); + hctx->queue = q; + hctx->flags = set->flags & ~BLK_MQ_F_TAG_QUEUE_SHARED; + + INIT_LIST_HEAD(&hctx->hctx_list); + + /* + * Allocate space for all possible cpus to avoid allocation at + * runtime + */ + hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *), + gfp, node); + if (!hctx->ctxs) + goto free_cpumask; + + if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8), + gfp, node)) + goto free_ctxs; + hctx->nr_ctx = 0; + + spin_lock_init(&hctx->dispatch_wait_lock); + init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); + INIT_LIST_HEAD(&hctx->dispatch_wait.entry); + + hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp); + if (!hctx->fq) + goto free_bitmap; + + if (hctx->flags & BLK_MQ_F_BLOCKING) + init_srcu_struct(hctx->srcu); + blk_mq_hctx_kobj_init(hctx); + + return hctx; + + free_bitmap: + sbitmap_free(&hctx->ctx_map); + free_ctxs: + kfree(hctx->ctxs); + free_cpumask: + free_cpumask_var(hctx->cpumask); + free_hctx: + kfree(hctx); + fail_alloc_hctx: + return NULL; +} + +static void blk_mq_init_cpu_queues(struct request_queue *q, + unsigned int nr_hw_queues) +{ + struct blk_mq_tag_set *set = q->tag_set; + unsigned int i, j; + + for_each_possible_cpu(i) { + struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i); + struct blk_mq_hw_ctx *hctx; + int k; + + __ctx->cpu = i; + spin_lock_init(&__ctx->lock); + for (k = HCTX_TYPE_DEFAULT; k < HCTX_MAX_TYPES; k++) + INIT_LIST_HEAD(&__ctx->rq_lists[k]); + + __ctx->queue = q; + + /* + * Set local node, IFF we have more than one hw queue. If + * not, we remain on the home node of the device + */ + for (j = 0; j < set->nr_maps; j++) { + hctx = blk_mq_map_queue_type(q, j, i); + if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) + hctx->numa_node = cpu_to_node(i); + } + } +} + +static bool __blk_mq_alloc_map_and_request(struct blk_mq_tag_set *set, + int hctx_idx) +{ + unsigned int flags = set->flags; + int ret = 0; + + set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx, + set->queue_depth, set->reserved_tags, flags); + if (!set->tags[hctx_idx]) + return false; + + ret = blk_mq_alloc_rqs(set, set->tags[hctx_idx], hctx_idx, + set->queue_depth); + if (!ret) + return true; + + blk_mq_free_rq_map(set->tags[hctx_idx], flags); + set->tags[hctx_idx] = NULL; + return false; +} + +static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set, + unsigned int hctx_idx) +{ + unsigned int flags = set->flags; + + if (set->tags && set->tags[hctx_idx]) { + blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx); + blk_mq_free_rq_map(set->tags[hctx_idx], flags); + set->tags[hctx_idx] = NULL; + } +} + +static void blk_mq_map_swqueue(struct request_queue *q) +{ + unsigned int i, j, hctx_idx; + struct blk_mq_hw_ctx *hctx; + struct blk_mq_ctx *ctx; + struct blk_mq_tag_set *set = q->tag_set; + + queue_for_each_hw_ctx(q, hctx, i) { + cpumask_clear(hctx->cpumask); + hctx->nr_ctx = 0; + hctx->dispatch_from = NULL; + } + + /* + * Map software to hardware queues. + * + * If the cpu isn't present, the cpu is mapped to first hctx. + */ + for_each_possible_cpu(i) { + + ctx = per_cpu_ptr(q->queue_ctx, i); + for (j = 0; j < set->nr_maps; j++) { + if (!set->map[j].nr_queues) { + ctx->hctxs[j] = blk_mq_map_queue_type(q, + HCTX_TYPE_DEFAULT, i); + continue; + } + hctx_idx = set->map[j].mq_map[i]; + /* unmapped hw queue can be remapped after CPU topo changed */ + if (!set->tags[hctx_idx] && + !__blk_mq_alloc_map_and_request(set, hctx_idx)) { + /* + * If tags initialization fail for some hctx, + * that hctx won't be brought online. In this + * case, remap the current ctx to hctx[0] which + * is guaranteed to always have tags allocated + */ + set->map[j].mq_map[i] = 0; + } + + hctx = blk_mq_map_queue_type(q, j, i); + ctx->hctxs[j] = hctx; + /* + * If the CPU is already set in the mask, then we've + * mapped this one already. This can happen if + * devices share queues across queue maps. + */ + if (cpumask_test_cpu(i, hctx->cpumask)) + continue; + + cpumask_set_cpu(i, hctx->cpumask); + hctx->type = j; + ctx->index_hw[hctx->type] = hctx->nr_ctx; + hctx->ctxs[hctx->nr_ctx++] = ctx; + + /* + * If the nr_ctx type overflows, we have exceeded the + * amount of sw queues we can support. + */ + BUG_ON(!hctx->nr_ctx); + } + + for (; j < HCTX_MAX_TYPES; j++) + ctx->hctxs[j] = blk_mq_map_queue_type(q, + HCTX_TYPE_DEFAULT, i); + } + + queue_for_each_hw_ctx(q, hctx, i) { + /* + * If no software queues are mapped to this hardware queue, + * disable it and free the request entries. + */ + if (!hctx->nr_ctx) { + /* Never unmap queue 0. We need it as a + * fallback in case of a new remap fails + * allocation + */ + if (i && set->tags[i]) + blk_mq_free_map_and_requests(set, i); + + hctx->tags = NULL; + continue; + } + + hctx->tags = set->tags[i]; + WARN_ON(!hctx->tags); + + /* + * Set the map size to the number of mapped software queues. + * This is more accurate and more efficient than looping + * over all possibly mapped software queues. + */ + sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx); + + /* + * Initialize batch roundrobin counts + */ + hctx->next_cpu = blk_mq_first_mapped_cpu(hctx); + hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; + } +} + +/* + * Caller needs to ensure that we're either frozen/quiesced, or that + * the queue isn't live yet. + */ +static void queue_set_hctx_shared(struct request_queue *q, bool shared) +{ + struct blk_mq_hw_ctx *hctx; + int i; + + queue_for_each_hw_ctx(q, hctx, i) { + if (shared) + hctx->flags |= BLK_MQ_F_TAG_QUEUE_SHARED; + else + hctx->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED; + } +} + +static void blk_mq_update_tag_set_shared(struct blk_mq_tag_set *set, + bool shared) +{ + struct request_queue *q; + + lockdep_assert_held(&set->tag_list_lock); + + list_for_each_entry(q, &set->tag_list, tag_set_list) { + blk_mq_freeze_queue(q); + queue_set_hctx_shared(q, shared); + blk_mq_unfreeze_queue(q); + } +} + +static void blk_mq_del_queue_tag_set(struct request_queue *q) +{ + struct blk_mq_tag_set *set = q->tag_set; + + mutex_lock(&set->tag_list_lock); + list_del(&q->tag_set_list); + if (list_is_singular(&set->tag_list)) { + /* just transitioned to unshared */ + set->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED; + /* update existing queue */ + blk_mq_update_tag_set_shared(set, false); + } + mutex_unlock(&set->tag_list_lock); + INIT_LIST_HEAD(&q->tag_set_list); +} + +static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, + struct request_queue *q) +{ + mutex_lock(&set->tag_list_lock); + + /* + * Check to see if we're transitioning to shared (from 1 to 2 queues). + */ + if (!list_empty(&set->tag_list) && + !(set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) { + set->flags |= BLK_MQ_F_TAG_QUEUE_SHARED; + /* update existing queue */ + blk_mq_update_tag_set_shared(set, true); + } + if (set->flags & BLK_MQ_F_TAG_QUEUE_SHARED) + queue_set_hctx_shared(q, true); + list_add_tail(&q->tag_set_list, &set->tag_list); + + mutex_unlock(&set->tag_list_lock); +} + +/* All allocations will be freed in release handler of q->mq_kobj */ +static int blk_mq_alloc_ctxs(struct request_queue *q) +{ + struct blk_mq_ctxs *ctxs; + int cpu; + + ctxs = kzalloc(sizeof(*ctxs), GFP_KERNEL); + if (!ctxs) + return -ENOMEM; + + ctxs->queue_ctx = alloc_percpu(struct blk_mq_ctx); + if (!ctxs->queue_ctx) + goto fail; + + for_each_possible_cpu(cpu) { + struct blk_mq_ctx *ctx = per_cpu_ptr(ctxs->queue_ctx, cpu); + ctx->ctxs = ctxs; + } + + q->mq_kobj = &ctxs->kobj; + q->queue_ctx = ctxs->queue_ctx; + + return 0; + fail: + kfree(ctxs); + return -ENOMEM; +} + +/* + * It is the actual release handler for mq, but we do it from + * request queue's release handler for avoiding use-after-free + * and headache because q->mq_kobj shouldn't have been introduced, + * but we can't group ctx/kctx kobj without it. + */ +void blk_mq_release(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx, *next; + int i; + + queue_for_each_hw_ctx(q, hctx, i) + WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list)); + + /* all hctx are in .unused_hctx_list now */ + list_for_each_entry_safe(hctx, next, &q->unused_hctx_list, hctx_list) { + list_del_init(&hctx->hctx_list); + kobject_put(&hctx->kobj); + } + + kfree(q->queue_hw_ctx); + + /* + * release .mq_kobj and sw queue's kobject now because + * both share lifetime with request queue. + */ + blk_mq_sysfs_deinit(q); +} + +struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, + void *queuedata) +{ + struct request_queue *uninit_q, *q; + + uninit_q = blk_alloc_queue(set->numa_node); + if (!uninit_q) + return ERR_PTR(-ENOMEM); + uninit_q->queuedata = queuedata; + + /* + * Initialize the queue without an elevator. device_add_disk() will do + * the initialization. + */ + q = blk_mq_init_allocated_queue(set, uninit_q, false); + if (IS_ERR(q)) + blk_cleanup_queue(uninit_q); + + return q; +} +EXPORT_SYMBOL_GPL(blk_mq_init_queue_data); + +struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) +{ + return blk_mq_init_queue_data(set, NULL); +} +EXPORT_SYMBOL(blk_mq_init_queue); + +/* + * Helper for setting up a queue with mq ops, given queue depth, and + * the passed in mq ops flags. + */ +struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set, + const struct blk_mq_ops *ops, + unsigned int queue_depth, + unsigned int set_flags) +{ + struct request_queue *q; + int ret; + + memset(set, 0, sizeof(*set)); + set->ops = ops; + set->nr_hw_queues = 1; + set->nr_maps = 1; + set->queue_depth = queue_depth; + set->numa_node = NUMA_NO_NODE; + set->flags = set_flags; + + ret = blk_mq_alloc_tag_set(set); + if (ret) + return ERR_PTR(ret); + + q = blk_mq_init_queue(set); + if (IS_ERR(q)) { + blk_mq_free_tag_set(set); + return q; + } + + return q; +} +EXPORT_SYMBOL(blk_mq_init_sq_queue); + +static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx( + struct blk_mq_tag_set *set, struct request_queue *q, + int hctx_idx, int node) +{ + struct blk_mq_hw_ctx *hctx = NULL, *tmp; + + /* reuse dead hctx first */ + spin_lock(&q->unused_hctx_lock); + list_for_each_entry(tmp, &q->unused_hctx_list, hctx_list) { + if (tmp->numa_node == node) { + hctx = tmp; + break; + } + } + if (hctx) + list_del_init(&hctx->hctx_list); + spin_unlock(&q->unused_hctx_lock); + + if (!hctx) + hctx = blk_mq_alloc_hctx(q, set, node); + if (!hctx) + goto fail; + + if (blk_mq_init_hctx(q, set, hctx, hctx_idx)) + goto free_hctx; + + return hctx; + + free_hctx: + kobject_put(&hctx->kobj); + fail: + return NULL; +} + +static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, + struct request_queue *q) +{ + int i, j, end; + struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx; + + if (q->nr_hw_queues < set->nr_hw_queues) { + struct blk_mq_hw_ctx **new_hctxs; + + new_hctxs = kcalloc_node(set->nr_hw_queues, + sizeof(*new_hctxs), GFP_KERNEL, + set->numa_node); + if (!new_hctxs) + return; + if (hctxs) + memcpy(new_hctxs, hctxs, q->nr_hw_queues * + sizeof(*hctxs)); + q->queue_hw_ctx = new_hctxs; + kfree(hctxs); + hctxs = new_hctxs; + } + + /* protect against switching io scheduler */ + mutex_lock(&q->sysfs_lock); + for (i = 0; i < set->nr_hw_queues; i++) { + int node; + struct blk_mq_hw_ctx *hctx; + + node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], i); + /* + * If the hw queue has been mapped to another numa node, + * we need to realloc the hctx. If allocation fails, fallback + * to use the previous one. + */ + if (hctxs[i] && (hctxs[i]->numa_node == node)) + continue; + + hctx = blk_mq_alloc_and_init_hctx(set, q, i, node); + if (hctx) { + if (hctxs[i]) + blk_mq_exit_hctx(q, set, hctxs[i], i); + hctxs[i] = hctx; + } else { + if (hctxs[i]) + pr_warn("Allocate new hctx on node %d fails,\ + fallback to previous one on node %d\n", + node, hctxs[i]->numa_node); + else + break; + } + } + /* + * Increasing nr_hw_queues fails. Free the newly allocated + * hctxs and keep the previous q->nr_hw_queues. + */ + if (i != set->nr_hw_queues) { + j = q->nr_hw_queues; + end = i; + } else { + j = i; + end = q->nr_hw_queues; + q->nr_hw_queues = set->nr_hw_queues; + } + + for (; j < end; j++) { + struct blk_mq_hw_ctx *hctx = hctxs[j]; + + if (hctx) { + if (hctx->tags) + blk_mq_free_map_and_requests(set, j); + blk_mq_exit_hctx(q, set, hctx, j); + hctxs[j] = NULL; + } + } + mutex_unlock(&q->sysfs_lock); +} + +struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, + struct request_queue *q, + bool elevator_init) +{ + /* mark the queue as mq asap */ + q->mq_ops = set->ops; + + q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn, + blk_mq_poll_stats_bkt, + BLK_MQ_POLL_STATS_BKTS, q); + if (!q->poll_cb) + goto err_exit; + + if (blk_mq_alloc_ctxs(q)) + goto err_poll; + + /* init q->mq_kobj and sw queues' kobjects */ + blk_mq_sysfs_init(q); + + INIT_LIST_HEAD(&q->unused_hctx_list); + spin_lock_init(&q->unused_hctx_lock); + + blk_mq_realloc_hw_ctxs(set, q); + if (!q->nr_hw_queues) + goto err_hctxs; + + INIT_WORK(&q->timeout_work, blk_mq_timeout_work); + blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ); + + q->tag_set = set; + + q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; + if (set->nr_maps > HCTX_TYPE_POLL && + set->map[HCTX_TYPE_POLL].nr_queues) + blk_queue_flag_set(QUEUE_FLAG_POLL, q); + + q->sg_reserved_size = INT_MAX; + + INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work); + INIT_LIST_HEAD(&q->requeue_list); + spin_lock_init(&q->requeue_lock); + + q->nr_requests = set->queue_depth; + + /* + * Default to classic polling + */ + q->poll_nsec = BLK_MQ_POLL_CLASSIC; + + blk_mq_init_cpu_queues(q, set->nr_hw_queues); + blk_mq_add_queue_tag_set(set, q); + blk_mq_map_swqueue(q); + + if (elevator_init) + elevator_init_mq(q); + + return q; + +err_hctxs: + kfree(q->queue_hw_ctx); + q->nr_hw_queues = 0; + blk_mq_sysfs_deinit(q); +err_poll: + blk_stat_free_callback(q->poll_cb); + q->poll_cb = NULL; +err_exit: + q->mq_ops = NULL; + return ERR_PTR(-ENOMEM); +} +EXPORT_SYMBOL(blk_mq_init_allocated_queue); + +/* tags can _not_ be used after returning from blk_mq_exit_queue */ +void blk_mq_exit_queue(struct request_queue *q) +{ + struct blk_mq_tag_set *set = q->tag_set; + + /* Checks hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED. */ + blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); + /* May clear BLK_MQ_F_TAG_QUEUE_SHARED in hctx->flags. */ + blk_mq_del_queue_tag_set(q); +} + +static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) +{ + int i; + + for (i = 0; i < set->nr_hw_queues; i++) { + if (!__blk_mq_alloc_map_and_request(set, i)) + goto out_unwind; + cond_resched(); + } + + return 0; + +out_unwind: + while (--i >= 0) + blk_mq_free_map_and_requests(set, i); + + return -ENOMEM; +} + +/* + * Allocate the request maps associated with this tag_set. Note that this + * may reduce the depth asked for, if memory is tight. set->queue_depth + * will be updated to reflect the allocated depth. + */ +static int blk_mq_alloc_map_and_requests(struct blk_mq_tag_set *set) +{ + unsigned int depth; + int err; + + depth = set->queue_depth; + do { + err = __blk_mq_alloc_rq_maps(set); + if (!err) + break; + + set->queue_depth >>= 1; + if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) { + err = -ENOMEM; + break; + } + } while (set->queue_depth); + + if (!set->queue_depth || err) { + pr_err("blk-mq: failed to allocate request map\n"); + return -ENOMEM; + } + + if (depth != set->queue_depth) + pr_info("blk-mq: reduced tag depth (%u -> %u)\n", + depth, set->queue_depth); + + return 0; +} + +static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) +{ + /* + * blk_mq_map_queues() and multiple .map_queues() implementations + * expect that set->map[HCTX_TYPE_DEFAULT].nr_queues is set to the + * number of hardware queues. + */ + if (set->nr_maps == 1) + set->map[HCTX_TYPE_DEFAULT].nr_queues = set->nr_hw_queues; + + if (set->ops->map_queues && !is_kdump_kernel()) { + int i; + + /* + * transport .map_queues is usually done in the following + * way: + * + * for (queue = 0; queue < set->nr_hw_queues; queue++) { + * mask = get_cpu_mask(queue) + * for_each_cpu(cpu, mask) + * set->map[x].mq_map[cpu] = queue; + * } + * + * When we need to remap, the table has to be cleared for + * killing stale mapping since one CPU may not be mapped + * to any hw queue. + */ + for (i = 0; i < set->nr_maps; i++) + blk_mq_clear_mq_map(&set->map[i]); + + return set->ops->map_queues(set); + } else { + BUG_ON(set->nr_maps > 1); + return blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); + } +} + +static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set, + int cur_nr_hw_queues, int new_nr_hw_queues) +{ + struct blk_mq_tags **new_tags; + + if (cur_nr_hw_queues >= new_nr_hw_queues) + return 0; + + new_tags = kcalloc_node(new_nr_hw_queues, sizeof(struct blk_mq_tags *), + GFP_KERNEL, set->numa_node); + if (!new_tags) + return -ENOMEM; + + if (set->tags) + memcpy(new_tags, set->tags, cur_nr_hw_queues * + sizeof(*set->tags)); + kfree(set->tags); + set->tags = new_tags; + set->nr_hw_queues = new_nr_hw_queues; + + return 0; +} + +/* + * Alloc a tag set to be associated with one or more request queues. + * May fail with EINVAL for various error conditions. May adjust the + * requested depth down, if it's too large. In that case, the set + * value will be stored in set->queue_depth. + */ +int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) +{ + int i, ret; + + BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS); + + if (!set->nr_hw_queues) + return -EINVAL; + if (!set->queue_depth) + return -EINVAL; + if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) + return -EINVAL; + + if (!set->ops->queue_rq) + return -EINVAL; + + if (!set->ops->get_budget ^ !set->ops->put_budget) + return -EINVAL; + + if (set->queue_depth > BLK_MQ_MAX_DEPTH) { + pr_info("blk-mq: reduced tag depth to %u\n", + BLK_MQ_MAX_DEPTH); + set->queue_depth = BLK_MQ_MAX_DEPTH; + } + + if (!set->nr_maps) + set->nr_maps = 1; + else if (set->nr_maps > HCTX_MAX_TYPES) + return -EINVAL; + + /* + * If a crashdump is active, then we are potentially in a very + * memory constrained environment. Limit us to 1 queue and + * 64 tags to prevent using too much memory. + */ + if (is_kdump_kernel()) { + set->nr_hw_queues = 1; + set->nr_maps = 1; + set->queue_depth = min(64U, set->queue_depth); + } + /* + * There is no use for more h/w queues than cpus if we just have + * a single map + */ + if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids) + set->nr_hw_queues = nr_cpu_ids; + + if (blk_mq_realloc_tag_set_tags(set, 0, set->nr_hw_queues) < 0) + return -ENOMEM; + + ret = -ENOMEM; + for (i = 0; i < set->nr_maps; i++) { + set->map[i].mq_map = kcalloc_node(nr_cpu_ids, + sizeof(set->map[i].mq_map[0]), + GFP_KERNEL, set->numa_node); + if (!set->map[i].mq_map) + goto out_free_mq_map; + set->map[i].nr_queues = is_kdump_kernel() ? 1 : set->nr_hw_queues; + } + + ret = blk_mq_update_queue_map(set); + if (ret) + goto out_free_mq_map; + + ret = blk_mq_alloc_map_and_requests(set); + if (ret) + goto out_free_mq_map; + + if (blk_mq_is_sbitmap_shared(set->flags)) { + atomic_set(&set->active_queues_shared_sbitmap, 0); + + if (blk_mq_init_shared_sbitmap(set, set->flags)) { + ret = -ENOMEM; + goto out_free_mq_rq_maps; + } + } + + mutex_init(&set->tag_list_lock); + INIT_LIST_HEAD(&set->tag_list); + + return 0; + +out_free_mq_rq_maps: + for (i = 0; i < set->nr_hw_queues; i++) + blk_mq_free_map_and_requests(set, i); +out_free_mq_map: + for (i = 0; i < set->nr_maps; i++) { + kfree(set->map[i].mq_map); + set->map[i].mq_map = NULL; + } + kfree(set->tags); + set->tags = NULL; + return ret; +} +EXPORT_SYMBOL(blk_mq_alloc_tag_set); + +void blk_mq_free_tag_set(struct blk_mq_tag_set *set) +{ + int i, j; + + for (i = 0; i < set->nr_hw_queues; i++) + blk_mq_free_map_and_requests(set, i); + + if (blk_mq_is_sbitmap_shared(set->flags)) + blk_mq_exit_shared_sbitmap(set); + + for (j = 0; j < set->nr_maps; j++) { + kfree(set->map[j].mq_map); + set->map[j].mq_map = NULL; + } + + kfree(set->tags); + set->tags = NULL; +} +EXPORT_SYMBOL(blk_mq_free_tag_set); + +int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) +{ + struct blk_mq_tag_set *set = q->tag_set; + struct blk_mq_hw_ctx *hctx; + int i, ret; + + if (!set) + return -EINVAL; + + if (q->nr_requests == nr) + return 0; + + blk_mq_freeze_queue(q); + blk_mq_quiesce_queue(q); + + ret = 0; + queue_for_each_hw_ctx(q, hctx, i) { + if (!hctx->tags) + continue; + /* + * If we're using an MQ scheduler, just update the scheduler + * queue depth. This is similar to what the old code would do. + */ + if (!hctx->sched_tags) { + ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr, + false); + if (!ret && blk_mq_is_sbitmap_shared(set->flags)) + blk_mq_tag_resize_shared_sbitmap(set, nr); + } else { + ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags, + nr, true); + } + if (ret) + break; + if (q->elevator && q->elevator->type->ops.depth_updated) + q->elevator->type->ops.depth_updated(hctx); + } + + if (!ret) + q->nr_requests = nr; + + blk_mq_unquiesce_queue(q); + blk_mq_unfreeze_queue(q); + + return ret; +} + +/* + * request_queue and elevator_type pair. + * It is just used by __blk_mq_update_nr_hw_queues to cache + * the elevator_type associated with a request_queue. + */ +struct blk_mq_qe_pair { + struct list_head node; + struct request_queue *q; + struct elevator_type *type; +}; + +/* + * Cache the elevator_type in qe pair list and switch the + * io scheduler to 'none' + */ +static bool blk_mq_elv_switch_none(struct list_head *head, + struct request_queue *q) +{ + struct blk_mq_qe_pair *qe; + + if (!q->elevator) + return true; + + qe = kmalloc(sizeof(*qe), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY); + if (!qe) + return false; + + INIT_LIST_HEAD(&qe->node); + qe->q = q; + qe->type = q->elevator->type; + list_add(&qe->node, head); + + mutex_lock(&q->sysfs_lock); + /* + * After elevator_switch_mq, the previous elevator_queue will be + * released by elevator_release. The reference of the io scheduler + * module get by elevator_get will also be put. So we need to get + * a reference of the io scheduler module here to prevent it to be + * removed. + */ + __module_get(qe->type->elevator_owner); + elevator_switch_mq(q, NULL); + mutex_unlock(&q->sysfs_lock); + + return true; +} + +static void blk_mq_elv_switch_back(struct list_head *head, + struct request_queue *q) +{ + struct blk_mq_qe_pair *qe; + struct elevator_type *t = NULL; + + list_for_each_entry(qe, head, node) + if (qe->q == q) { + t = qe->type; + break; + } + + if (!t) + return; + + list_del(&qe->node); + kfree(qe); + + mutex_lock(&q->sysfs_lock); + elevator_switch_mq(q, t); + mutex_unlock(&q->sysfs_lock); +} + +static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, + int nr_hw_queues) +{ + struct request_queue *q; + LIST_HEAD(head); + int prev_nr_hw_queues; + + lockdep_assert_held(&set->tag_list_lock); + + if (set->nr_maps == 1 && nr_hw_queues > nr_cpu_ids) + nr_hw_queues = nr_cpu_ids; + if (nr_hw_queues < 1) + return; + if (set->nr_maps == 1 && nr_hw_queues == set->nr_hw_queues) + return; + + list_for_each_entry(q, &set->tag_list, tag_set_list) + blk_mq_freeze_queue(q); + /* + * Switch IO scheduler to 'none', cleaning up the data associated + * with the previous scheduler. We will switch back once we are done + * updating the new sw to hw queue mappings. + */ + list_for_each_entry(q, &set->tag_list, tag_set_list) + if (!blk_mq_elv_switch_none(&head, q)) + goto switch_back; + + list_for_each_entry(q, &set->tag_list, tag_set_list) { + blk_mq_debugfs_unregister_hctxs(q); + blk_mq_sysfs_unregister(q); + } + + prev_nr_hw_queues = set->nr_hw_queues; + if (blk_mq_realloc_tag_set_tags(set, set->nr_hw_queues, nr_hw_queues) < + 0) + goto reregister; + + set->nr_hw_queues = nr_hw_queues; +fallback: + blk_mq_update_queue_map(set); + list_for_each_entry(q, &set->tag_list, tag_set_list) { + blk_mq_realloc_hw_ctxs(set, q); + if (q->nr_hw_queues != set->nr_hw_queues) { + pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n", + nr_hw_queues, prev_nr_hw_queues); + set->nr_hw_queues = prev_nr_hw_queues; + blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); + goto fallback; + } + blk_mq_map_swqueue(q); + } + +reregister: + list_for_each_entry(q, &set->tag_list, tag_set_list) { + blk_mq_sysfs_register(q); + blk_mq_debugfs_register_hctxs(q); + } + +switch_back: + list_for_each_entry(q, &set->tag_list, tag_set_list) + blk_mq_elv_switch_back(&head, q); + + list_for_each_entry(q, &set->tag_list, tag_set_list) + blk_mq_unfreeze_queue(q); +} + +void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) +{ + mutex_lock(&set->tag_list_lock); + __blk_mq_update_nr_hw_queues(set, nr_hw_queues); + mutex_unlock(&set->tag_list_lock); +} +EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues); + +/* Enable polling stats and return whether they were already enabled. */ +static bool blk_poll_stats_enable(struct request_queue *q) +{ + if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) || + blk_queue_flag_test_and_set(QUEUE_FLAG_POLL_STATS, q)) + return true; + blk_stat_add_callback(q, q->poll_cb); + return false; +} + +static void blk_mq_poll_stats_start(struct request_queue *q) +{ + /* + * We don't arm the callback if polling stats are not enabled or the + * callback is already active. + */ + if (!test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) || + blk_stat_is_active(q->poll_cb)) + return; + + blk_stat_activate_msecs(q->poll_cb, 100); +} + +static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb) +{ + struct request_queue *q = cb->data; + int bucket; + + for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS; bucket++) { + if (cb->stat[bucket].nr_samples) + q->poll_stat[bucket] = cb->stat[bucket]; + } +} + +static unsigned long blk_mq_poll_nsecs(struct request_queue *q, + struct request *rq) +{ + unsigned long ret = 0; + int bucket; + + /* + * If stats collection isn't on, don't sleep but turn it on for + * future users + */ + if (!blk_poll_stats_enable(q)) + return 0; + + /* + * As an optimistic guess, use half of the mean service time + * for this type of request. We can (and should) make this smarter. + * For instance, if the completion latencies are tight, we can + * get closer than just half the mean. This is especially + * important on devices where the completion latencies are longer + * than ~10 usec. We do use the stats for the relevant IO size + * if available which does lead to better estimates. + */ + bucket = blk_mq_poll_stats_bkt(rq); + if (bucket < 0) + return ret; + + if (q->poll_stat[bucket].nr_samples) + ret = (q->poll_stat[bucket].mean + 1) / 2; + + return ret; +} + +static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, + struct request *rq) +{ + struct hrtimer_sleeper hs; + enum hrtimer_mode mode; + unsigned int nsecs; + ktime_t kt; + + if (rq->rq_flags & RQF_MQ_POLL_SLEPT) + return false; + + /* + * If we get here, hybrid polling is enabled. Hence poll_nsec can be: + * + * 0: use half of prev avg + * >0: use this specific value + */ + if (q->poll_nsec > 0) + nsecs = q->poll_nsec; + else + nsecs = blk_mq_poll_nsecs(q, rq); + + if (!nsecs) + return false; + + rq->rq_flags |= RQF_MQ_POLL_SLEPT; + + /* + * This will be replaced with the stats tracking code, using + * 'avg_completion_time / 2' as the pre-sleep target. + */ + kt = nsecs; + + mode = HRTIMER_MODE_REL; + hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, mode); + hrtimer_set_expires(&hs.timer, kt); + + do { + if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE) + break; + set_current_state(TASK_UNINTERRUPTIBLE); + hrtimer_sleeper_start_expires(&hs, mode); + if (hs.task) + io_schedule(); + hrtimer_cancel(&hs.timer); + mode = HRTIMER_MODE_ABS; + } while (hs.task && !signal_pending(current)); + + __set_current_state(TASK_RUNNING); + destroy_hrtimer_on_stack(&hs.timer); + return true; +} + +static bool blk_mq_poll_hybrid(struct request_queue *q, + struct blk_mq_hw_ctx *hctx, blk_qc_t cookie) +{ + struct request *rq; + + if (q->poll_nsec == BLK_MQ_POLL_CLASSIC) + return false; + + if (!blk_qc_t_is_internal(cookie)) + rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie)); + else { + rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie)); + /* + * With scheduling, if the request has completed, we'll + * get a NULL return here, as we clear the sched tag when + * that happens. The request still remains valid, like always, + * so we should be safe with just the NULL check. + */ + if (!rq) + return false; + } + + return blk_mq_poll_hybrid_sleep(q, rq); +} + +/** + * blk_poll - poll for IO completions + * @q: the queue + * @cookie: cookie passed back at IO submission time + * @spin: whether to spin for completions + * + * Description: + * Poll for completions on the passed in queue. Returns number of + * completed entries found. If @spin is true, then blk_poll will continue + * looping until at least one completion is found, unless the task is + * otherwise marked running (or we need to reschedule). + */ +int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin) +{ + struct blk_mq_hw_ctx *hctx; + long state; + + if (!blk_qc_t_valid(cookie) || + !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) + return 0; + + if (current->plug) + blk_flush_plug_list(current->plug, false); + + hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)]; + + /* + * If we sleep, have the caller restart the poll loop to reset + * the state. Like for the other success return cases, the + * caller is responsible for checking if the IO completed. If + * the IO isn't complete, we'll get called again and will go + * straight to the busy poll loop. + */ + if (blk_mq_poll_hybrid(q, hctx, cookie)) + return 1; + + hctx->poll_considered++; + + state = current->state; + do { + int ret; + + hctx->poll_invoked++; + + ret = q->mq_ops->poll(hctx); + if (ret > 0) { + hctx->poll_success++; + __set_current_state(TASK_RUNNING); + return ret; + } + + if (signal_pending_state(state, current)) + __set_current_state(TASK_RUNNING); + + if (current->state == TASK_RUNNING) + return 1; + if (ret < 0 || !spin) + break; + cpu_relax(); + } while (!need_resched()); + + __set_current_state(TASK_RUNNING); + return 0; +} +EXPORT_SYMBOL_GPL(blk_poll); + +unsigned int blk_mq_rq_cpu(struct request *rq) +{ + return rq->mq_ctx->cpu; +} +EXPORT_SYMBOL(blk_mq_rq_cpu); + +static int __init blk_mq_init(void) +{ + int i; + + for_each_possible_cpu(i) + INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i)); + open_softirq(BLOCK_SOFTIRQ, blk_done_softirq); + + cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD, + "block/softirq:dead", NULL, + blk_softirq_cpu_dead); + cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL, + blk_mq_hctx_notify_dead); + cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online", + blk_mq_hctx_notify_online, + blk_mq_hctx_notify_offline); + return 0; +} +subsys_initcall(blk_mq_init); diff --git a/block/blk-mq.h b/block/blk-mq.h new file mode 100644 index 000000000..f792a0920 --- /dev/null +++ b/block/blk-mq.h @@ -0,0 +1,327 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef INT_BLK_MQ_H +#define INT_BLK_MQ_H + +#include "blk-stat.h" +#include "blk-mq-tag.h" + +struct blk_mq_tag_set; + +struct blk_mq_ctxs { + struct kobject kobj; + struct blk_mq_ctx __percpu *queue_ctx; +}; + +/** + * struct blk_mq_ctx - State for a software queue facing the submitting CPUs + */ +struct blk_mq_ctx { + struct { + spinlock_t lock; + struct list_head rq_lists[HCTX_MAX_TYPES]; + } ____cacheline_aligned_in_smp; + + unsigned int cpu; + unsigned short index_hw[HCTX_MAX_TYPES]; + struct blk_mq_hw_ctx *hctxs[HCTX_MAX_TYPES]; + + /* incremented at dispatch time */ + unsigned long rq_dispatched[2]; + unsigned long rq_merged; + + /* incremented at completion time */ + unsigned long ____cacheline_aligned_in_smp rq_completed[2]; + + struct request_queue *queue; + struct blk_mq_ctxs *ctxs; + struct kobject kobj; +} ____cacheline_aligned_in_smp; + +void blk_mq_exit_queue(struct request_queue *q); +int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); +void blk_mq_wake_waiters(struct request_queue *q); +bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *, + unsigned int); +void blk_mq_add_to_requeue_list(struct request *rq, bool at_head, + bool kick_requeue_list); +void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); +struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *start); +void blk_mq_put_rq_ref(struct request *rq); + +/* + * Internal helpers for allocating/freeing the request map + */ +void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, + unsigned int hctx_idx); +void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags); +struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, + unsigned int hctx_idx, + unsigned int nr_tags, + unsigned int reserved_tags, + unsigned int flags); +int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, + unsigned int hctx_idx, unsigned int depth); + +/* + * Internal helpers for request insertion into sw queues + */ +void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, + bool at_head); +void blk_mq_request_bypass_insert(struct request *rq, bool at_head, + bool run_queue); +void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, + struct list_head *list); + +/* Used by blk_insert_cloned_request() to issue request directly */ +blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last); +void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, + struct list_head *list); + +/* + * CPU -> queue mappings + */ +extern int blk_mq_hw_queue_to_node(struct blk_mq_queue_map *qmap, unsigned int); + +/* + * blk_mq_map_queue_type() - map (hctx_type,cpu) to hardware queue + * @q: request queue + * @type: the hctx type index + * @cpu: CPU + */ +static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue *q, + enum hctx_type type, + unsigned int cpu) +{ + return q->queue_hw_ctx[q->tag_set->map[type].mq_map[cpu]]; +} + +/* + * blk_mq_map_queue() - map (cmd_flags,type) to hardware queue + * @q: request queue + * @flags: request command flags + * @cpu: cpu ctx + */ +static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, + unsigned int flags, + struct blk_mq_ctx *ctx) +{ + enum hctx_type type = HCTX_TYPE_DEFAULT; + + /* + * The caller ensure that if REQ_HIPRI, poll must be enabled. + */ + if (flags & REQ_HIPRI) + type = HCTX_TYPE_POLL; + else if ((flags & REQ_OP_MASK) == REQ_OP_READ) + type = HCTX_TYPE_READ; + + return ctx->hctxs[type]; +} + +/* + * sysfs helpers + */ +extern void blk_mq_sysfs_init(struct request_queue *q); +extern void blk_mq_sysfs_deinit(struct request_queue *q); +extern int __blk_mq_register_dev(struct device *dev, struct request_queue *q); +extern int blk_mq_sysfs_register(struct request_queue *q); +extern void blk_mq_sysfs_unregister(struct request_queue *q); +extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx); + +void blk_mq_release(struct request_queue *q); + +static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q, + unsigned int cpu) +{ + return per_cpu_ptr(q->queue_ctx, cpu); +} + +/* + * This assumes per-cpu software queueing queues. They could be per-node + * as well, for instance. For now this is hardcoded as-is. Note that we don't + * care about preemption, since we know the ctx's are persistent. This does + * mean that we can't rely on ctx always matching the currently running CPU. + */ +static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q) +{ + return __blk_mq_get_ctx(q, raw_smp_processor_id()); +} + +struct blk_mq_alloc_data { + /* input parameter */ + struct request_queue *q; + blk_mq_req_flags_t flags; + unsigned int shallow_depth; + unsigned int cmd_flags; + + /* input & output parameter */ + struct blk_mq_ctx *ctx; + struct blk_mq_hw_ctx *hctx; +}; + +static inline bool blk_mq_is_sbitmap_shared(unsigned int flags) +{ + return flags & BLK_MQ_F_TAG_HCTX_SHARED; +} + +static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data) +{ + if (data->q->elevator) + return data->hctx->sched_tags; + + return data->hctx->tags; +} + +static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx) +{ + return test_bit(BLK_MQ_S_STOPPED, &hctx->state); +} + +static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx) +{ + return hctx->nr_ctx && hctx->tags; +} + +unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part); +void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, + unsigned int inflight[2]); + +static inline void blk_mq_put_dispatch_budget(struct request_queue *q) +{ + if (q->mq_ops->put_budget) + q->mq_ops->put_budget(q); +} + +static inline bool blk_mq_get_dispatch_budget(struct request_queue *q) +{ + if (q->mq_ops->get_budget) + return q->mq_ops->get_budget(q); + return true; +} + +static inline void __blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx) +{ + if (blk_mq_is_sbitmap_shared(hctx->flags)) + atomic_inc(&hctx->queue->nr_active_requests_shared_sbitmap); + else + atomic_inc(&hctx->nr_active); +} + +static inline void __blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx) +{ + if (blk_mq_is_sbitmap_shared(hctx->flags)) + atomic_dec(&hctx->queue->nr_active_requests_shared_sbitmap); + else + atomic_dec(&hctx->nr_active); +} + +static inline int __blk_mq_active_requests(struct blk_mq_hw_ctx *hctx) +{ + if (blk_mq_is_sbitmap_shared(hctx->flags)) + return atomic_read(&hctx->queue->nr_active_requests_shared_sbitmap); + return atomic_read(&hctx->nr_active); +} +static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, + struct request *rq) +{ + blk_mq_put_tag(hctx->tags, rq->mq_ctx, rq->tag); + rq->tag = BLK_MQ_NO_TAG; + + if (rq->rq_flags & RQF_MQ_INFLIGHT) { + rq->rq_flags &= ~RQF_MQ_INFLIGHT; + __blk_mq_dec_active_requests(hctx); + } +} + +static inline void blk_mq_put_driver_tag(struct request *rq) +{ + if (rq->tag == BLK_MQ_NO_TAG || rq->internal_tag == BLK_MQ_NO_TAG) + return; + + __blk_mq_put_driver_tag(rq->mq_hctx, rq); +} + +static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap) +{ + int cpu; + + for_each_possible_cpu(cpu) + qmap->mq_map[cpu] = 0; +} + +/* + * blk_mq_plug() - Get caller context plug + * @q: request queue + * @bio : the bio being submitted by the caller context + * + * Plugging, by design, may delay the insertion of BIOs into the elevator in + * order to increase BIO merging opportunities. This however can cause BIO + * insertion order to change from the order in which submit_bio() is being + * executed in the case of multiple contexts concurrently issuing BIOs to a + * device, even if these context are synchronized to tightly control BIO issuing + * order. While this is not a problem with regular block devices, this ordering + * change can cause write BIO failures with zoned block devices as these + * require sequential write patterns to zones. Prevent this from happening by + * ignoring the plug state of a BIO issuing context if the target request queue + * is for a zoned block device and the BIO to plug is a write operation. + * + * Return current->plug if the bio can be plugged and NULL otherwise + */ +static inline struct blk_plug *blk_mq_plug(struct request_queue *q, + struct bio *bio) +{ + /* + * For regular block devices or read operations, use the context plug + * which may be NULL if blk_start_plug() was not executed. + */ + if (!blk_queue_is_zoned(q) || !op_is_write(bio_op(bio))) + return current->plug; + + /* Zoned block device write operation case: do not plug the BIO */ + return NULL; +} + +/* + * For shared tag users, we track the number of currently active users + * and attempt to provide a fair share of the tag depth for each of them. + */ +static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, + struct sbitmap_queue *bt) +{ + unsigned int depth, users; + + if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) + return true; + + /* + * Don't try dividing an ant + */ + if (bt->sb.depth == 1) + return true; + + if (blk_mq_is_sbitmap_shared(hctx->flags)) { + struct request_queue *q = hctx->queue; + struct blk_mq_tag_set *set = q->tag_set; + + if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) + return true; + users = atomic_read(&set->active_queues_shared_sbitmap); + } else { + if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) + return true; + users = atomic_read(&hctx->tags->active_queues); + } + + if (!users) + return true; + + /* + * Allow at least some tags + */ + depth = max((bt->sb.depth + users - 1) / users, 4U); + return __blk_mq_active_requests(hctx) < depth; +} + + +#endif diff --git a/block/blk-pm.c b/block/blk-pm.c new file mode 100644 index 000000000..2dad62cc1 --- /dev/null +++ b/block/blk-pm.c @@ -0,0 +1,216 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include "blk-mq.h" +#include "blk-mq-tag.h" + +/** + * blk_pm_runtime_init - Block layer runtime PM initialization routine + * @q: the queue of the device + * @dev: the device the queue belongs to + * + * Description: + * Initialize runtime-PM-related fields for @q and start auto suspend for + * @dev. Drivers that want to take advantage of request-based runtime PM + * should call this function after @dev has been initialized, and its + * request queue @q has been allocated, and runtime PM for it can not happen + * yet(either due to disabled/forbidden or its usage_count > 0). In most + * cases, driver should call this function before any I/O has taken place. + * + * This function takes care of setting up using auto suspend for the device, + * the autosuspend delay is set to -1 to make runtime suspend impossible + * until an updated value is either set by user or by driver. Drivers do + * not need to touch other autosuspend settings. + * + * The block layer runtime PM is request based, so only works for drivers + * that use request as their IO unit instead of those directly use bio's. + */ +void blk_pm_runtime_init(struct request_queue *q, struct device *dev) +{ + q->dev = dev; + q->rpm_status = RPM_ACTIVE; + pm_runtime_set_autosuspend_delay(q->dev, -1); + pm_runtime_use_autosuspend(q->dev); +} +EXPORT_SYMBOL(blk_pm_runtime_init); + +/** + * blk_pre_runtime_suspend - Pre runtime suspend check + * @q: the queue of the device + * + * Description: + * This function will check if runtime suspend is allowed for the device + * by examining if there are any requests pending in the queue. If there + * are requests pending, the device can not be runtime suspended; otherwise, + * the queue's status will be updated to SUSPENDING and the driver can + * proceed to suspend the device. + * + * For the not allowed case, we mark last busy for the device so that + * runtime PM core will try to autosuspend it some time later. + * + * This function should be called near the start of the device's + * runtime_suspend callback. + * + * Return: + * 0 - OK to runtime suspend the device + * -EBUSY - Device should not be runtime suspended + */ +int blk_pre_runtime_suspend(struct request_queue *q) +{ + int ret = 0; + + if (!q->dev) + return ret; + + WARN_ON_ONCE(q->rpm_status != RPM_ACTIVE); + + spin_lock_irq(&q->queue_lock); + q->rpm_status = RPM_SUSPENDING; + spin_unlock_irq(&q->queue_lock); + + /* + * Increase the pm_only counter before checking whether any + * non-PM blk_queue_enter() calls are in progress to avoid that any + * new non-PM blk_queue_enter() calls succeed before the pm_only + * counter is decreased again. + */ + blk_set_pm_only(q); + ret = -EBUSY; + /* Switch q_usage_counter from per-cpu to atomic mode. */ + blk_freeze_queue_start(q); + /* + * Wait until atomic mode has been reached. Since that + * involves calling call_rcu(), it is guaranteed that later + * blk_queue_enter() calls see the pm-only state. See also + * http://lwn.net/Articles/573497/. + */ + percpu_ref_switch_to_atomic_sync(&q->q_usage_counter); + if (percpu_ref_is_zero(&q->q_usage_counter)) + ret = 0; + /* Switch q_usage_counter back to per-cpu mode. */ + blk_mq_unfreeze_queue(q); + + if (ret < 0) { + spin_lock_irq(&q->queue_lock); + q->rpm_status = RPM_ACTIVE; + pm_runtime_mark_last_busy(q->dev); + spin_unlock_irq(&q->queue_lock); + + blk_clear_pm_only(q); + } + + return ret; +} +EXPORT_SYMBOL(blk_pre_runtime_suspend); + +/** + * blk_post_runtime_suspend - Post runtime suspend processing + * @q: the queue of the device + * @err: return value of the device's runtime_suspend function + * + * Description: + * Update the queue's runtime status according to the return value of the + * device's runtime suspend function and mark last busy for the device so + * that PM core will try to auto suspend the device at a later time. + * + * This function should be called near the end of the device's + * runtime_suspend callback. + */ +void blk_post_runtime_suspend(struct request_queue *q, int err) +{ + if (!q->dev) + return; + + spin_lock_irq(&q->queue_lock); + if (!err) { + q->rpm_status = RPM_SUSPENDED; + } else { + q->rpm_status = RPM_ACTIVE; + pm_runtime_mark_last_busy(q->dev); + } + spin_unlock_irq(&q->queue_lock); + + if (err) + blk_clear_pm_only(q); +} +EXPORT_SYMBOL(blk_post_runtime_suspend); + +/** + * blk_pre_runtime_resume - Pre runtime resume processing + * @q: the queue of the device + * + * Description: + * Update the queue's runtime status to RESUMING in preparation for the + * runtime resume of the device. + * + * This function should be called near the start of the device's + * runtime_resume callback. + */ +void blk_pre_runtime_resume(struct request_queue *q) +{ + if (!q->dev) + return; + + spin_lock_irq(&q->queue_lock); + q->rpm_status = RPM_RESUMING; + spin_unlock_irq(&q->queue_lock); +} +EXPORT_SYMBOL(blk_pre_runtime_resume); + +/** + * blk_post_runtime_resume - Post runtime resume processing + * @q: the queue of the device + * + * Description: + * For historical reasons, this routine merely calls blk_set_runtime_active() + * to do the real work of restarting the queue. It does this regardless of + * whether the device's runtime-resume succeeded; even if it failed the + * driver or error handler will need to communicate with the device. + * + * This function should be called near the end of the device's + * runtime_resume callback. + */ +void blk_post_runtime_resume(struct request_queue *q) +{ + blk_set_runtime_active(q); +} +EXPORT_SYMBOL(blk_post_runtime_resume); + +/** + * blk_set_runtime_active - Force runtime status of the queue to be active + * @q: the queue of the device + * + * If the device is left runtime suspended during system suspend the resume + * hook typically resumes the device and corrects runtime status + * accordingly. However, that does not affect the queue runtime PM status + * which is still "suspended". This prevents processing requests from the + * queue. + * + * This function can be used in driver's resume hook to correct queue + * runtime PM status and re-enable peeking requests from the queue. It + * should be called before first request is added to the queue. + * + * This function is also called by blk_post_runtime_resume() for + * runtime resumes. It does everything necessary to restart the queue. + */ +void blk_set_runtime_active(struct request_queue *q) +{ + int old_status; + + if (!q->dev) + return; + + spin_lock_irq(&q->queue_lock); + old_status = q->rpm_status; + q->rpm_status = RPM_ACTIVE; + pm_runtime_mark_last_busy(q->dev); + pm_request_autosuspend(q->dev); + spin_unlock_irq(&q->queue_lock); + + if (old_status != RPM_ACTIVE) + blk_clear_pm_only(q); +} +EXPORT_SYMBOL(blk_set_runtime_active); diff --git a/block/blk-pm.h b/block/blk-pm.h new file mode 100644 index 000000000..a2283cc9f --- /dev/null +++ b/block/blk-pm.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _BLOCK_BLK_PM_H_ +#define _BLOCK_BLK_PM_H_ + +#include + +#ifdef CONFIG_PM +static inline int blk_pm_resume_queue(const bool pm, struct request_queue *q) +{ + if (!q->dev || !blk_queue_pm_only(q)) + return 1; /* Nothing to do */ + if (pm && q->rpm_status != RPM_SUSPENDED) + return 1; /* Request allowed */ + pm_request_resume(q->dev); + return 0; +} + +static inline void blk_pm_mark_last_busy(struct request *rq) +{ + if (rq->q->dev && !(rq->rq_flags & RQF_PM)) + pm_runtime_mark_last_busy(rq->q->dev); +} + +static inline void blk_pm_requeue_request(struct request *rq) +{ + lockdep_assert_held(&rq->q->queue_lock); + + if (rq->q->dev && !(rq->rq_flags & RQF_PM)) + rq->q->nr_pending--; +} + +static inline void blk_pm_add_request(struct request_queue *q, + struct request *rq) +{ + lockdep_assert_held(&q->queue_lock); + + if (q->dev && !(rq->rq_flags & RQF_PM)) + q->nr_pending++; +} + +static inline void blk_pm_put_request(struct request *rq) +{ + lockdep_assert_held(&rq->q->queue_lock); + + if (rq->q->dev && !(rq->rq_flags & RQF_PM)) + --rq->q->nr_pending; +} +#else +static inline int blk_pm_resume_queue(const bool pm, struct request_queue *q) +{ + return 1; +} + +static inline void blk_pm_mark_last_busy(struct request *rq) +{ +} + +static inline void blk_pm_requeue_request(struct request *rq) +{ +} + +static inline void blk_pm_add_request(struct request_queue *q, + struct request *rq) +{ +} + +static inline void blk_pm_put_request(struct request *rq) +{ +} +#endif + +#endif /* _BLOCK_BLK_PM_H_ */ diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c new file mode 100644 index 000000000..e83af7bc7 --- /dev/null +++ b/block/blk-rq-qos.c @@ -0,0 +1,304 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "blk-rq-qos.h" + +/* + * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded, + * false if 'v' + 1 would be bigger than 'below'. + */ +static bool atomic_inc_below(atomic_t *v, unsigned int below) +{ + unsigned int cur = atomic_read(v); + + for (;;) { + unsigned int old; + + if (cur >= below) + return false; + old = atomic_cmpxchg(v, cur, cur + 1); + if (old == cur) + break; + cur = old; + } + + return true; +} + +bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit) +{ + return atomic_inc_below(&rq_wait->inflight, limit); +} + +void __rq_qos_cleanup(struct rq_qos *rqos, struct bio *bio) +{ + do { + if (rqos->ops->cleanup) + rqos->ops->cleanup(rqos, bio); + rqos = rqos->next; + } while (rqos); +} + +void __rq_qos_done(struct rq_qos *rqos, struct request *rq) +{ + do { + if (rqos->ops->done) + rqos->ops->done(rqos, rq); + rqos = rqos->next; + } while (rqos); +} + +void __rq_qos_issue(struct rq_qos *rqos, struct request *rq) +{ + do { + if (rqos->ops->issue) + rqos->ops->issue(rqos, rq); + rqos = rqos->next; + } while (rqos); +} + +void __rq_qos_requeue(struct rq_qos *rqos, struct request *rq) +{ + do { + if (rqos->ops->requeue) + rqos->ops->requeue(rqos, rq); + rqos = rqos->next; + } while (rqos); +} + +void __rq_qos_throttle(struct rq_qos *rqos, struct bio *bio) +{ + do { + if (rqos->ops->throttle) + rqos->ops->throttle(rqos, bio); + rqos = rqos->next; + } while (rqos); +} + +void __rq_qos_track(struct rq_qos *rqos, struct request *rq, struct bio *bio) +{ + do { + if (rqos->ops->track) + rqos->ops->track(rqos, rq, bio); + rqos = rqos->next; + } while (rqos); +} + +void __rq_qos_merge(struct rq_qos *rqos, struct request *rq, struct bio *bio) +{ + do { + if (rqos->ops->merge) + rqos->ops->merge(rqos, rq, bio); + rqos = rqos->next; + } while (rqos); +} + +void __rq_qos_done_bio(struct rq_qos *rqos, struct bio *bio) +{ + do { + if (rqos->ops->done_bio) + rqos->ops->done_bio(rqos, bio); + rqos = rqos->next; + } while (rqos); +} + +void __rq_qos_queue_depth_changed(struct rq_qos *rqos) +{ + do { + if (rqos->ops->queue_depth_changed) + rqos->ops->queue_depth_changed(rqos); + rqos = rqos->next; + } while (rqos); +} + +/* + * Return true, if we can't increase the depth further by scaling + */ +bool rq_depth_calc_max_depth(struct rq_depth *rqd) +{ + unsigned int depth; + bool ret = false; + + /* + * For QD=1 devices, this is a special case. It's important for those + * to have one request ready when one completes, so force a depth of + * 2 for those devices. On the backend, it'll be a depth of 1 anyway, + * since the device can't have more than that in flight. If we're + * scaling down, then keep a setting of 1/1/1. + */ + if (rqd->queue_depth == 1) { + if (rqd->scale_step > 0) + rqd->max_depth = 1; + else { + rqd->max_depth = 2; + ret = true; + } + } else { + /* + * scale_step == 0 is our default state. If we have suffered + * latency spikes, step will be > 0, and we shrink the + * allowed write depths. If step is < 0, we're only doing + * writes, and we allow a temporarily higher depth to + * increase performance. + */ + depth = min_t(unsigned int, rqd->default_depth, + rqd->queue_depth); + if (rqd->scale_step > 0) + depth = 1 + ((depth - 1) >> min(31, rqd->scale_step)); + else if (rqd->scale_step < 0) { + unsigned int maxd = 3 * rqd->queue_depth / 4; + + depth = 1 + ((depth - 1) << -rqd->scale_step); + if (depth > maxd) { + depth = maxd; + ret = true; + } + } + + rqd->max_depth = depth; + } + + return ret; +} + +/* Returns true on success and false if scaling up wasn't possible */ +bool rq_depth_scale_up(struct rq_depth *rqd) +{ + /* + * Hit max in previous round, stop here + */ + if (rqd->scaled_max) + return false; + + rqd->scale_step--; + + rqd->scaled_max = rq_depth_calc_max_depth(rqd); + return true; +} + +/* + * Scale rwb down. If 'hard_throttle' is set, do it quicker, since we + * had a latency violation. Returns true on success and returns false if + * scaling down wasn't possible. + */ +bool rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle) +{ + /* + * Stop scaling down when we've hit the limit. This also prevents + * ->scale_step from going to crazy values, if the device can't + * keep up. + */ + if (rqd->max_depth == 1) + return false; + + if (rqd->scale_step < 0 && hard_throttle) + rqd->scale_step = 0; + else + rqd->scale_step++; + + rqd->scaled_max = false; + rq_depth_calc_max_depth(rqd); + return true; +} + +struct rq_qos_wait_data { + struct wait_queue_entry wq; + struct task_struct *task; + struct rq_wait *rqw; + acquire_inflight_cb_t *cb; + void *private_data; + bool got_token; +}; + +static int rq_qos_wake_function(struct wait_queue_entry *curr, + unsigned int mode, int wake_flags, void *key) +{ + struct rq_qos_wait_data *data = container_of(curr, + struct rq_qos_wait_data, + wq); + + /* + * If we fail to get a budget, return -1 to interrupt the wake up loop + * in __wake_up_common. + */ + if (!data->cb(data->rqw, data->private_data)) + return -1; + + data->got_token = true; + smp_wmb(); + list_del_init(&curr->entry); + wake_up_process(data->task); + return 1; +} + +/** + * rq_qos_wait - throttle on a rqw if we need to + * @rqw: rqw to throttle on + * @private_data: caller provided specific data + * @acquire_inflight_cb: inc the rqw->inflight counter if we can + * @cleanup_cb: the callback to cleanup in case we race with a waker + * + * This provides a uniform place for the rq_qos users to do their throttling. + * Since you can end up with a lot of things sleeping at once, this manages the + * waking up based on the resources available. The acquire_inflight_cb should + * inc the rqw->inflight if we have the ability to do so, or return false if not + * and then we will sleep until the room becomes available. + * + * cleanup_cb is in case that we race with a waker and need to cleanup the + * inflight count accordingly. + */ +void rq_qos_wait(struct rq_wait *rqw, void *private_data, + acquire_inflight_cb_t *acquire_inflight_cb, + cleanup_cb_t *cleanup_cb) +{ + struct rq_qos_wait_data data = { + .wq = { + .func = rq_qos_wake_function, + .entry = LIST_HEAD_INIT(data.wq.entry), + }, + .task = current, + .rqw = rqw, + .cb = acquire_inflight_cb, + .private_data = private_data, + }; + bool has_sleeper; + + has_sleeper = wq_has_sleeper(&rqw->wait); + if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) + return; + + has_sleeper = !prepare_to_wait_exclusive(&rqw->wait, &data.wq, + TASK_UNINTERRUPTIBLE); + do { + /* The memory barrier in set_task_state saves us here. */ + if (data.got_token) + break; + if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) { + finish_wait(&rqw->wait, &data.wq); + + /* + * We raced with wbt_wake_function() getting a token, + * which means we now have two. Put our local token + * and wake anyone else potentially waiting for one. + */ + smp_rmb(); + if (data.got_token) + cleanup_cb(rqw, private_data); + break; + } + io_schedule(); + has_sleeper = true; + set_current_state(TASK_UNINTERRUPTIBLE); + } while (1); + finish_wait(&rqw->wait, &data.wq); +} + +void rq_qos_exit(struct request_queue *q) +{ + blk_mq_debugfs_unregister_queue_rqos(q); + + while (q->rq_qos) { + struct rq_qos *rqos = q->rq_qos; + q->rq_qos = rqos->next; + rqos->ops->exit(rqos); + } +} diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h new file mode 100644 index 000000000..2bcb3495e --- /dev/null +++ b/block/blk-rq-qos.h @@ -0,0 +1,231 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef RQ_QOS_H +#define RQ_QOS_H + +#include +#include +#include +#include +#include +#include + +#include "blk-mq-debugfs.h" + +struct blk_mq_debugfs_attr; + +enum rq_qos_id { + RQ_QOS_WBT, + RQ_QOS_LATENCY, + RQ_QOS_COST, +}; + +struct rq_wait { + wait_queue_head_t wait; + atomic_t inflight; +}; + +struct rq_qos { + struct rq_qos_ops *ops; + struct request_queue *q; + enum rq_qos_id id; + struct rq_qos *next; +#ifdef CONFIG_BLK_DEBUG_FS + struct dentry *debugfs_dir; +#endif +}; + +struct rq_qos_ops { + void (*throttle)(struct rq_qos *, struct bio *); + void (*track)(struct rq_qos *, struct request *, struct bio *); + void (*merge)(struct rq_qos *, struct request *, struct bio *); + void (*issue)(struct rq_qos *, struct request *); + void (*requeue)(struct rq_qos *, struct request *); + void (*done)(struct rq_qos *, struct request *); + void (*done_bio)(struct rq_qos *, struct bio *); + void (*cleanup)(struct rq_qos *, struct bio *); + void (*queue_depth_changed)(struct rq_qos *); + void (*exit)(struct rq_qos *); + const struct blk_mq_debugfs_attr *debugfs_attrs; +}; + +struct rq_depth { + unsigned int max_depth; + + int scale_step; + bool scaled_max; + + unsigned int queue_depth; + unsigned int default_depth; +}; + +static inline struct rq_qos *rq_qos_id(struct request_queue *q, + enum rq_qos_id id) +{ + struct rq_qos *rqos; + for (rqos = q->rq_qos; rqos; rqos = rqos->next) { + if (rqos->id == id) + break; + } + return rqos; +} + +static inline struct rq_qos *wbt_rq_qos(struct request_queue *q) +{ + return rq_qos_id(q, RQ_QOS_WBT); +} + +static inline struct rq_qos *blkcg_rq_qos(struct request_queue *q) +{ + return rq_qos_id(q, RQ_QOS_LATENCY); +} + +static inline const char *rq_qos_id_to_name(enum rq_qos_id id) +{ + switch (id) { + case RQ_QOS_WBT: + return "wbt"; + case RQ_QOS_LATENCY: + return "latency"; + case RQ_QOS_COST: + return "cost"; + } + return "unknown"; +} + +static inline void rq_wait_init(struct rq_wait *rq_wait) +{ + atomic_set(&rq_wait->inflight, 0); + init_waitqueue_head(&rq_wait->wait); +} + +static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos) +{ + /* + * No IO can be in-flight when adding rqos, so freeze queue, which + * is fine since we only support rq_qos for blk-mq queue. + * + * Reuse ->queue_lock for protecting against other concurrent + * rq_qos adding/deleting + */ + blk_mq_freeze_queue(q); + + spin_lock_irq(&q->queue_lock); + rqos->next = q->rq_qos; + q->rq_qos = rqos; + spin_unlock_irq(&q->queue_lock); + + blk_mq_unfreeze_queue(q); + + if (rqos->ops->debugfs_attrs) + blk_mq_debugfs_register_rqos(rqos); +} + +static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos) +{ + struct rq_qos **cur; + + /* + * See comment in rq_qos_add() about freezing queue & using + * ->queue_lock. + */ + blk_mq_freeze_queue(q); + + spin_lock_irq(&q->queue_lock); + for (cur = &q->rq_qos; *cur; cur = &(*cur)->next) { + if (*cur == rqos) { + *cur = rqos->next; + break; + } + } + spin_unlock_irq(&q->queue_lock); + + blk_mq_unfreeze_queue(q); + + blk_mq_debugfs_unregister_rqos(rqos); +} + +typedef bool (acquire_inflight_cb_t)(struct rq_wait *rqw, void *private_data); +typedef void (cleanup_cb_t)(struct rq_wait *rqw, void *private_data); + +void rq_qos_wait(struct rq_wait *rqw, void *private_data, + acquire_inflight_cb_t *acquire_inflight_cb, + cleanup_cb_t *cleanup_cb); +bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit); +bool rq_depth_scale_up(struct rq_depth *rqd); +bool rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle); +bool rq_depth_calc_max_depth(struct rq_depth *rqd); + +void __rq_qos_cleanup(struct rq_qos *rqos, struct bio *bio); +void __rq_qos_done(struct rq_qos *rqos, struct request *rq); +void __rq_qos_issue(struct rq_qos *rqos, struct request *rq); +void __rq_qos_requeue(struct rq_qos *rqos, struct request *rq); +void __rq_qos_throttle(struct rq_qos *rqos, struct bio *bio); +void __rq_qos_track(struct rq_qos *rqos, struct request *rq, struct bio *bio); +void __rq_qos_merge(struct rq_qos *rqos, struct request *rq, struct bio *bio); +void __rq_qos_done_bio(struct rq_qos *rqos, struct bio *bio); +void __rq_qos_queue_depth_changed(struct rq_qos *rqos); + +static inline void rq_qos_cleanup(struct request_queue *q, struct bio *bio) +{ + if (q->rq_qos) + __rq_qos_cleanup(q->rq_qos, bio); +} + +static inline void rq_qos_done(struct request_queue *q, struct request *rq) +{ + if (q->rq_qos) + __rq_qos_done(q->rq_qos, rq); +} + +static inline void rq_qos_issue(struct request_queue *q, struct request *rq) +{ + if (q->rq_qos) + __rq_qos_issue(q->rq_qos, rq); +} + +static inline void rq_qos_requeue(struct request_queue *q, struct request *rq) +{ + if (q->rq_qos) + __rq_qos_requeue(q->rq_qos, rq); +} + +static inline void rq_qos_done_bio(struct request_queue *q, struct bio *bio) +{ + if (q->rq_qos) + __rq_qos_done_bio(q->rq_qos, bio); +} + +static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio) +{ + /* + * BIO_TRACKED lets controllers know that a bio went through the + * normal rq_qos path. + */ + bio_set_flag(bio, BIO_TRACKED); + if (q->rq_qos) + __rq_qos_throttle(q->rq_qos, bio); +} + +static inline void rq_qos_track(struct request_queue *q, struct request *rq, + struct bio *bio) +{ + if (q->rq_qos) + __rq_qos_track(q->rq_qos, rq, bio); +} + +static inline void rq_qos_merge(struct request_queue *q, struct request *rq, + struct bio *bio) +{ + if (q->rq_qos) + __rq_qos_merge(q->rq_qos, rq, bio); +} + +static inline void rq_qos_queue_depth_changed(struct request_queue *q) +{ + if (q->rq_qos) + __rq_qos_queue_depth_changed(q->rq_qos); +} + +void rq_qos_exit(struct request_queue *); + +#endif diff --git a/block/blk-settings.c b/block/blk-settings.c new file mode 100644 index 000000000..c3aa7f8ee --- /dev/null +++ b/block/blk-settings.c @@ -0,0 +1,887 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Functions related to setting various queue properties from drivers + */ +#include +#include +#include +#include +#include +#include /* for max_pfn/max_low_pfn */ +#include +#include +#include +#include +#include + +#include "blk.h" +#include "blk-wbt.h" + +unsigned long blk_max_low_pfn; +EXPORT_SYMBOL(blk_max_low_pfn); + +unsigned long blk_max_pfn; + +void blk_queue_rq_timeout(struct request_queue *q, unsigned int timeout) +{ + q->rq_timeout = timeout; +} +EXPORT_SYMBOL_GPL(blk_queue_rq_timeout); + +/** + * blk_set_default_limits - reset limits to default values + * @lim: the queue_limits structure to reset + * + * Description: + * Returns a queue_limit struct to its default state. + */ +void blk_set_default_limits(struct queue_limits *lim) +{ + lim->max_segments = BLK_MAX_SEGMENTS; + lim->max_discard_segments = 1; + lim->max_integrity_segments = 0; + lim->seg_boundary_mask = BLK_SEG_BOUNDARY_MASK; + lim->virt_boundary_mask = 0; + lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; + lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS; + lim->max_dev_sectors = 0; + lim->chunk_sectors = 0; + lim->max_write_same_sectors = 0; + lim->max_write_zeroes_sectors = 0; + lim->max_zone_append_sectors = 0; + lim->max_discard_sectors = 0; + lim->max_hw_discard_sectors = 0; + lim->discard_granularity = 0; + lim->discard_alignment = 0; + lim->discard_misaligned = 0; + lim->logical_block_size = lim->physical_block_size = lim->io_min = 512; + lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT); + lim->alignment_offset = 0; + lim->io_opt = 0; + lim->misaligned = 0; + lim->zoned = BLK_ZONED_NONE; +} +EXPORT_SYMBOL(blk_set_default_limits); + +/** + * blk_set_stacking_limits - set default limits for stacking devices + * @lim: the queue_limits structure to reset + * + * Description: + * Returns a queue_limit struct to its default state. Should be used + * by stacking drivers like DM that have no internal limits. + */ +void blk_set_stacking_limits(struct queue_limits *lim) +{ + blk_set_default_limits(lim); + + /* Inherit limits from component devices */ + lim->max_segments = USHRT_MAX; + lim->max_discard_segments = USHRT_MAX; + lim->max_hw_sectors = UINT_MAX; + lim->max_segment_size = UINT_MAX; + lim->max_sectors = UINT_MAX; + lim->max_dev_sectors = UINT_MAX; + lim->max_write_same_sectors = UINT_MAX; + lim->max_write_zeroes_sectors = UINT_MAX; + lim->max_zone_append_sectors = UINT_MAX; +} +EXPORT_SYMBOL(blk_set_stacking_limits); + +/** + * blk_queue_bounce_limit - set bounce buffer limit for queue + * @q: the request queue for the device + * @max_addr: the maximum address the device can handle + * + * Description: + * Different hardware can have different requirements as to what pages + * it can do I/O directly to. A low level driver can call + * blk_queue_bounce_limit to have lower memory pages allocated as bounce + * buffers for doing I/O to pages residing above @max_addr. + **/ +void blk_queue_bounce_limit(struct request_queue *q, u64 max_addr) +{ + unsigned long b_pfn = max_addr >> PAGE_SHIFT; + int dma = 0; + + q->bounce_gfp = GFP_NOIO; +#if BITS_PER_LONG == 64 + /* + * Assume anything <= 4GB can be handled by IOMMU. Actually + * some IOMMUs can handle everything, but I don't know of a + * way to test this here. + */ + if (b_pfn < (min_t(u64, 0xffffffffUL, BLK_BOUNCE_HIGH) >> PAGE_SHIFT)) + dma = 1; + q->limits.bounce_pfn = max(max_low_pfn, b_pfn); +#else + if (b_pfn < blk_max_low_pfn) + dma = 1; + q->limits.bounce_pfn = b_pfn; +#endif + if (dma) { + init_emergency_isa_pool(); + q->bounce_gfp = GFP_NOIO | GFP_DMA; + q->limits.bounce_pfn = b_pfn; + } +} +EXPORT_SYMBOL(blk_queue_bounce_limit); + +/** + * blk_queue_max_hw_sectors - set max sectors for a request for this queue + * @q: the request queue for the device + * @max_hw_sectors: max hardware sectors in the usual 512b unit + * + * Description: + * Enables a low level driver to set a hard upper limit, + * max_hw_sectors, on the size of requests. max_hw_sectors is set by + * the device driver based upon the capabilities of the I/O + * controller. + * + * max_dev_sectors is a hard limit imposed by the storage device for + * READ/WRITE requests. It is set by the disk driver. + * + * max_sectors is a soft limit imposed by the block layer for + * filesystem type requests. This value can be overridden on a + * per-device basis in /sys/block//queue/max_sectors_kb. + * The soft limit can not exceed max_hw_sectors. + **/ +void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_sectors) +{ + struct queue_limits *limits = &q->limits; + unsigned int max_sectors; + + if ((max_hw_sectors << 9) < PAGE_SIZE) { + max_hw_sectors = 1 << (PAGE_SHIFT - 9); + printk(KERN_INFO "%s: set to minimum %d\n", + __func__, max_hw_sectors); + } + + limits->max_hw_sectors = max_hw_sectors; + max_sectors = min_not_zero(max_hw_sectors, limits->max_dev_sectors); + max_sectors = min_t(unsigned int, max_sectors, BLK_DEF_MAX_SECTORS); + limits->max_sectors = max_sectors; + q->backing_dev_info->io_pages = max_sectors >> (PAGE_SHIFT - 9); +} +EXPORT_SYMBOL(blk_queue_max_hw_sectors); + +/** + * blk_queue_chunk_sectors - set size of the chunk for this queue + * @q: the request queue for the device + * @chunk_sectors: chunk sectors in the usual 512b unit + * + * Description: + * If a driver doesn't want IOs to cross a given chunk size, it can set + * this limit and prevent merging across chunks. Note that the block layer + * must accept a page worth of data at any offset. So if the crossing of + * chunks is a hard limitation in the driver, it must still be prepared + * to split single page bios. + **/ +void blk_queue_chunk_sectors(struct request_queue *q, unsigned int chunk_sectors) +{ + q->limits.chunk_sectors = chunk_sectors; +} +EXPORT_SYMBOL(blk_queue_chunk_sectors); + +/** + * blk_queue_max_discard_sectors - set max sectors for a single discard + * @q: the request queue for the device + * @max_discard_sectors: maximum number of sectors to discard + **/ +void blk_queue_max_discard_sectors(struct request_queue *q, + unsigned int max_discard_sectors) +{ + q->limits.max_hw_discard_sectors = max_discard_sectors; + q->limits.max_discard_sectors = max_discard_sectors; +} +EXPORT_SYMBOL(blk_queue_max_discard_sectors); + +/** + * blk_queue_max_write_same_sectors - set max sectors for a single write same + * @q: the request queue for the device + * @max_write_same_sectors: maximum number of sectors to write per command + **/ +void blk_queue_max_write_same_sectors(struct request_queue *q, + unsigned int max_write_same_sectors) +{ + q->limits.max_write_same_sectors = max_write_same_sectors; +} +EXPORT_SYMBOL(blk_queue_max_write_same_sectors); + +/** + * blk_queue_max_write_zeroes_sectors - set max sectors for a single + * write zeroes + * @q: the request queue for the device + * @max_write_zeroes_sectors: maximum number of sectors to write per command + **/ +void blk_queue_max_write_zeroes_sectors(struct request_queue *q, + unsigned int max_write_zeroes_sectors) +{ + q->limits.max_write_zeroes_sectors = max_write_zeroes_sectors; +} +EXPORT_SYMBOL(blk_queue_max_write_zeroes_sectors); + +/** + * blk_queue_max_zone_append_sectors - set max sectors for a single zone append + * @q: the request queue for the device + * @max_zone_append_sectors: maximum number of sectors to write per command + **/ +void blk_queue_max_zone_append_sectors(struct request_queue *q, + unsigned int max_zone_append_sectors) +{ + unsigned int max_sectors; + + if (WARN_ON(!blk_queue_is_zoned(q))) + return; + + max_sectors = min(q->limits.max_hw_sectors, max_zone_append_sectors); + max_sectors = min(q->limits.chunk_sectors, max_sectors); + + /* + * Signal eventual driver bugs resulting in the max_zone_append sectors limit + * being 0 due to a 0 argument, the chunk_sectors limit (zone size) not set, + * or the max_hw_sectors limit not set. + */ + WARN_ON(!max_sectors); + + q->limits.max_zone_append_sectors = max_sectors; +} +EXPORT_SYMBOL_GPL(blk_queue_max_zone_append_sectors); + +/** + * blk_queue_max_segments - set max hw segments for a request for this queue + * @q: the request queue for the device + * @max_segments: max number of segments + * + * Description: + * Enables a low level driver to set an upper limit on the number of + * hw data segments in a request. + **/ +void blk_queue_max_segments(struct request_queue *q, unsigned short max_segments) +{ + if (!max_segments) { + max_segments = 1; + printk(KERN_INFO "%s: set to minimum %d\n", + __func__, max_segments); + } + + q->limits.max_segments = max_segments; +} +EXPORT_SYMBOL(blk_queue_max_segments); + +/** + * blk_queue_max_discard_segments - set max segments for discard requests + * @q: the request queue for the device + * @max_segments: max number of segments + * + * Description: + * Enables a low level driver to set an upper limit on the number of + * segments in a discard request. + **/ +void blk_queue_max_discard_segments(struct request_queue *q, + unsigned short max_segments) +{ + q->limits.max_discard_segments = max_segments; +} +EXPORT_SYMBOL_GPL(blk_queue_max_discard_segments); + +/** + * blk_queue_max_segment_size - set max segment size for blk_rq_map_sg + * @q: the request queue for the device + * @max_size: max size of segment in bytes + * + * Description: + * Enables a low level driver to set an upper limit on the size of a + * coalesced segment + **/ +void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size) +{ + if (max_size < PAGE_SIZE) { + max_size = PAGE_SIZE; + printk(KERN_INFO "%s: set to minimum %d\n", + __func__, max_size); + } + + /* see blk_queue_virt_boundary() for the explanation */ + WARN_ON_ONCE(q->limits.virt_boundary_mask); + + q->limits.max_segment_size = max_size; +} +EXPORT_SYMBOL(blk_queue_max_segment_size); + +/** + * blk_queue_logical_block_size - set logical block size for the queue + * @q: the request queue for the device + * @size: the logical block size, in bytes + * + * Description: + * This should be set to the lowest possible block size that the + * storage device can address. The default of 512 covers most + * hardware. + **/ +void blk_queue_logical_block_size(struct request_queue *q, unsigned int size) +{ + q->limits.logical_block_size = size; + + if (q->limits.physical_block_size < size) + q->limits.physical_block_size = size; + + if (q->limits.io_min < q->limits.physical_block_size) + q->limits.io_min = q->limits.physical_block_size; +} +EXPORT_SYMBOL(blk_queue_logical_block_size); + +/** + * blk_queue_physical_block_size - set physical block size for the queue + * @q: the request queue for the device + * @size: the physical block size, in bytes + * + * Description: + * This should be set to the lowest possible sector size that the + * hardware can operate on without reverting to read-modify-write + * operations. + */ +void blk_queue_physical_block_size(struct request_queue *q, unsigned int size) +{ + q->limits.physical_block_size = size; + + if (q->limits.physical_block_size < q->limits.logical_block_size) + q->limits.physical_block_size = q->limits.logical_block_size; + + if (q->limits.io_min < q->limits.physical_block_size) + q->limits.io_min = q->limits.physical_block_size; +} +EXPORT_SYMBOL(blk_queue_physical_block_size); + +/** + * blk_queue_alignment_offset - set physical block alignment offset + * @q: the request queue for the device + * @offset: alignment offset in bytes + * + * Description: + * Some devices are naturally misaligned to compensate for things like + * the legacy DOS partition table 63-sector offset. Low-level drivers + * should call this function for devices whose first sector is not + * naturally aligned. + */ +void blk_queue_alignment_offset(struct request_queue *q, unsigned int offset) +{ + q->limits.alignment_offset = + offset & (q->limits.physical_block_size - 1); + q->limits.misaligned = 0; +} +EXPORT_SYMBOL(blk_queue_alignment_offset); + +void blk_queue_update_readahead(struct request_queue *q) +{ + /* + * For read-ahead of large files to be effective, we need to read ahead + * at least twice the optimal I/O size. + */ + q->backing_dev_info->ra_pages = + max(queue_io_opt(q) * 2 / PAGE_SIZE, VM_READAHEAD_PAGES); + q->backing_dev_info->io_pages = + queue_max_sectors(q) >> (PAGE_SHIFT - 9); +} +EXPORT_SYMBOL_GPL(blk_queue_update_readahead); + +/** + * blk_limits_io_min - set minimum request size for a device + * @limits: the queue limits + * @min: smallest I/O size in bytes + * + * Description: + * Some devices have an internal block size bigger than the reported + * hardware sector size. This function can be used to signal the + * smallest I/O the device can perform without incurring a performance + * penalty. + */ +void blk_limits_io_min(struct queue_limits *limits, unsigned int min) +{ + limits->io_min = min; + + if (limits->io_min < limits->logical_block_size) + limits->io_min = limits->logical_block_size; + + if (limits->io_min < limits->physical_block_size) + limits->io_min = limits->physical_block_size; +} +EXPORT_SYMBOL(blk_limits_io_min); + +/** + * blk_queue_io_min - set minimum request size for the queue + * @q: the request queue for the device + * @min: smallest I/O size in bytes + * + * Description: + * Storage devices may report a granularity or preferred minimum I/O + * size which is the smallest request the device can perform without + * incurring a performance penalty. For disk drives this is often the + * physical block size. For RAID arrays it is often the stripe chunk + * size. A properly aligned multiple of minimum_io_size is the + * preferred request size for workloads where a high number of I/O + * operations is desired. + */ +void blk_queue_io_min(struct request_queue *q, unsigned int min) +{ + blk_limits_io_min(&q->limits, min); +} +EXPORT_SYMBOL(blk_queue_io_min); + +/** + * blk_limits_io_opt - set optimal request size for a device + * @limits: the queue limits + * @opt: smallest I/O size in bytes + * + * Description: + * Storage devices may report an optimal I/O size, which is the + * device's preferred unit for sustained I/O. This is rarely reported + * for disk drives. For RAID arrays it is usually the stripe width or + * the internal track size. A properly aligned multiple of + * optimal_io_size is the preferred request size for workloads where + * sustained throughput is desired. + */ +void blk_limits_io_opt(struct queue_limits *limits, unsigned int opt) +{ + limits->io_opt = opt; +} +EXPORT_SYMBOL(blk_limits_io_opt); + +/** + * blk_queue_io_opt - set optimal request size for the queue + * @q: the request queue for the device + * @opt: optimal request size in bytes + * + * Description: + * Storage devices may report an optimal I/O size, which is the + * device's preferred unit for sustained I/O. This is rarely reported + * for disk drives. For RAID arrays it is usually the stripe width or + * the internal track size. A properly aligned multiple of + * optimal_io_size is the preferred request size for workloads where + * sustained throughput is desired. + */ +void blk_queue_io_opt(struct request_queue *q, unsigned int opt) +{ + blk_limits_io_opt(&q->limits, opt); + q->backing_dev_info->ra_pages = + max(queue_io_opt(q) * 2 / PAGE_SIZE, VM_READAHEAD_PAGES); +} +EXPORT_SYMBOL(blk_queue_io_opt); + +static unsigned int blk_round_down_sectors(unsigned int sectors, unsigned int lbs) +{ + sectors = round_down(sectors, lbs >> SECTOR_SHIFT); + if (sectors < PAGE_SIZE >> SECTOR_SHIFT) + sectors = PAGE_SIZE >> SECTOR_SHIFT; + return sectors; +} + +/** + * blk_stack_limits - adjust queue_limits for stacked devices + * @t: the stacking driver limits (top device) + * @b: the underlying queue limits (bottom, component device) + * @start: first data sector within component device + * + * Description: + * This function is used by stacking drivers like MD and DM to ensure + * that all component devices have compatible block sizes and + * alignments. The stacking driver must provide a queue_limits + * struct (top) and then iteratively call the stacking function for + * all component (bottom) devices. The stacking function will + * attempt to combine the values and ensure proper alignment. + * + * Returns 0 if the top and bottom queue_limits are compatible. The + * top device's block sizes and alignment offsets may be adjusted to + * ensure alignment with the bottom device. If no compatible sizes + * and alignments exist, -1 is returned and the resulting top + * queue_limits will have the misaligned flag set to indicate that + * the alignment_offset is undefined. + */ +int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, + sector_t start) +{ + unsigned int top, bottom, alignment, ret = 0; + + t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); + t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); + t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors); + t->max_write_same_sectors = min(t->max_write_same_sectors, + b->max_write_same_sectors); + t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors, + b->max_write_zeroes_sectors); + t->max_zone_append_sectors = min(t->max_zone_append_sectors, + b->max_zone_append_sectors); + t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn); + + t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, + b->seg_boundary_mask); + t->virt_boundary_mask = min_not_zero(t->virt_boundary_mask, + b->virt_boundary_mask); + + t->max_segments = min_not_zero(t->max_segments, b->max_segments); + t->max_discard_segments = min_not_zero(t->max_discard_segments, + b->max_discard_segments); + t->max_integrity_segments = min_not_zero(t->max_integrity_segments, + b->max_integrity_segments); + + t->max_segment_size = min_not_zero(t->max_segment_size, + b->max_segment_size); + + t->misaligned |= b->misaligned; + + alignment = queue_limit_alignment_offset(b, start); + + /* Bottom device has different alignment. Check that it is + * compatible with the current top alignment. + */ + if (t->alignment_offset != alignment) { + + top = max(t->physical_block_size, t->io_min) + + t->alignment_offset; + bottom = max(b->physical_block_size, b->io_min) + alignment; + + /* Verify that top and bottom intervals line up */ + if (max(top, bottom) % min(top, bottom)) { + t->misaligned = 1; + ret = -1; + } + } + + t->logical_block_size = max(t->logical_block_size, + b->logical_block_size); + + t->physical_block_size = max(t->physical_block_size, + b->physical_block_size); + + t->io_min = max(t->io_min, b->io_min); + t->io_opt = lcm_not_zero(t->io_opt, b->io_opt); + + /* Set non-power-of-2 compatible chunk_sectors boundary */ + if (b->chunk_sectors) + t->chunk_sectors = gcd(t->chunk_sectors, b->chunk_sectors); + + /* Physical block size a multiple of the logical block size? */ + if (t->physical_block_size & (t->logical_block_size - 1)) { + t->physical_block_size = t->logical_block_size; + t->misaligned = 1; + ret = -1; + } + + /* Minimum I/O a multiple of the physical block size? */ + if (t->io_min & (t->physical_block_size - 1)) { + t->io_min = t->physical_block_size; + t->misaligned = 1; + ret = -1; + } + + /* Optimal I/O a multiple of the physical block size? */ + if (t->io_opt & (t->physical_block_size - 1)) { + t->io_opt = 0; + t->misaligned = 1; + ret = -1; + } + + /* chunk_sectors a multiple of the physical block size? */ + if ((t->chunk_sectors << 9) & (t->physical_block_size - 1)) { + t->chunk_sectors = 0; + t->misaligned = 1; + ret = -1; + } + + t->raid_partial_stripes_expensive = + max(t->raid_partial_stripes_expensive, + b->raid_partial_stripes_expensive); + + /* Find lowest common alignment_offset */ + t->alignment_offset = lcm_not_zero(t->alignment_offset, alignment) + % max(t->physical_block_size, t->io_min); + + /* Verify that new alignment_offset is on a logical block boundary */ + if (t->alignment_offset & (t->logical_block_size - 1)) { + t->misaligned = 1; + ret = -1; + } + + t->max_sectors = blk_round_down_sectors(t->max_sectors, t->logical_block_size); + t->max_hw_sectors = blk_round_down_sectors(t->max_hw_sectors, t->logical_block_size); + t->max_dev_sectors = blk_round_down_sectors(t->max_dev_sectors, t->logical_block_size); + + /* Discard alignment and granularity */ + if (b->discard_granularity) { + alignment = queue_limit_discard_alignment(b, start); + + if (t->discard_granularity != 0 && + t->discard_alignment != alignment) { + top = t->discard_granularity + t->discard_alignment; + bottom = b->discard_granularity + alignment; + + /* Verify that top and bottom intervals line up */ + if ((max(top, bottom) % min(top, bottom)) != 0) + t->discard_misaligned = 1; + } + + t->max_discard_sectors = min_not_zero(t->max_discard_sectors, + b->max_discard_sectors); + t->max_hw_discard_sectors = min_not_zero(t->max_hw_discard_sectors, + b->max_hw_discard_sectors); + t->discard_granularity = max(t->discard_granularity, + b->discard_granularity); + t->discard_alignment = lcm_not_zero(t->discard_alignment, alignment) % + t->discard_granularity; + } + + t->zoned = max(t->zoned, b->zoned); + return ret; +} +EXPORT_SYMBOL(blk_stack_limits); + +/** + * disk_stack_limits - adjust queue limits for stacked drivers + * @disk: MD/DM gendisk (top) + * @bdev: the underlying block device (bottom) + * @offset: offset to beginning of data within component device + * + * Description: + * Merges the limits for a top level gendisk and a bottom level + * block_device. + */ +void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, + sector_t offset) +{ + struct request_queue *t = disk->queue; + + if (blk_stack_limits(&t->limits, &bdev_get_queue(bdev)->limits, + get_start_sect(bdev) + (offset >> 9)) < 0) { + char top[BDEVNAME_SIZE], bottom[BDEVNAME_SIZE]; + + disk_name(disk, 0, top); + bdevname(bdev, bottom); + + printk(KERN_NOTICE "%s: Warning: Device %s is misaligned\n", + top, bottom); + } + + blk_queue_update_readahead(disk->queue); +} +EXPORT_SYMBOL(disk_stack_limits); + +/** + * blk_queue_update_dma_pad - update pad mask + * @q: the request queue for the device + * @mask: pad mask + * + * Update dma pad mask. + * + * Appending pad buffer to a request modifies the last entry of a + * scatter list such that it includes the pad buffer. + **/ +void blk_queue_update_dma_pad(struct request_queue *q, unsigned int mask) +{ + if (mask > q->dma_pad_mask) + q->dma_pad_mask = mask; +} +EXPORT_SYMBOL(blk_queue_update_dma_pad); + +/** + * blk_queue_segment_boundary - set boundary rules for segment merging + * @q: the request queue for the device + * @mask: the memory boundary mask + **/ +void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask) +{ + if (mask < PAGE_SIZE - 1) { + mask = PAGE_SIZE - 1; + printk(KERN_INFO "%s: set to minimum %lx\n", + __func__, mask); + } + + q->limits.seg_boundary_mask = mask; +} +EXPORT_SYMBOL(blk_queue_segment_boundary); + +/** + * blk_queue_virt_boundary - set boundary rules for bio merging + * @q: the request queue for the device + * @mask: the memory boundary mask + **/ +void blk_queue_virt_boundary(struct request_queue *q, unsigned long mask) +{ + q->limits.virt_boundary_mask = mask; + + /* + * Devices that require a virtual boundary do not support scatter/gather + * I/O natively, but instead require a descriptor list entry for each + * page (which might not be idential to the Linux PAGE_SIZE). Because + * of that they are not limited by our notion of "segment size". + */ + if (mask) + q->limits.max_segment_size = UINT_MAX; +} +EXPORT_SYMBOL(blk_queue_virt_boundary); + +/** + * blk_queue_dma_alignment - set dma length and memory alignment + * @q: the request queue for the device + * @mask: alignment mask + * + * description: + * set required memory and length alignment for direct dma transactions. + * this is used when building direct io requests for the queue. + * + **/ +void blk_queue_dma_alignment(struct request_queue *q, int mask) +{ + q->dma_alignment = mask; +} +EXPORT_SYMBOL(blk_queue_dma_alignment); + +/** + * blk_queue_update_dma_alignment - update dma length and memory alignment + * @q: the request queue for the device + * @mask: alignment mask + * + * description: + * update required memory and length alignment for direct dma transactions. + * If the requested alignment is larger than the current alignment, then + * the current queue alignment is updated to the new value, otherwise it + * is left alone. The design of this is to allow multiple objects + * (driver, device, transport etc) to set their respective + * alignments without having them interfere. + * + **/ +void blk_queue_update_dma_alignment(struct request_queue *q, int mask) +{ + BUG_ON(mask > PAGE_SIZE); + + if (mask > q->dma_alignment) + q->dma_alignment = mask; +} +EXPORT_SYMBOL(blk_queue_update_dma_alignment); + +/** + * blk_set_queue_depth - tell the block layer about the device queue depth + * @q: the request queue for the device + * @depth: queue depth + * + */ +void blk_set_queue_depth(struct request_queue *q, unsigned int depth) +{ + q->queue_depth = depth; + rq_qos_queue_depth_changed(q); +} +EXPORT_SYMBOL(blk_set_queue_depth); + +/** + * blk_queue_write_cache - configure queue's write cache + * @q: the request queue for the device + * @wc: write back cache on or off + * @fua: device supports FUA writes, if true + * + * Tell the block layer about the write cache of @q. + */ +void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua) +{ + if (wc) + blk_queue_flag_set(QUEUE_FLAG_WC, q); + else + blk_queue_flag_clear(QUEUE_FLAG_WC, q); + if (fua) + blk_queue_flag_set(QUEUE_FLAG_FUA, q); + else + blk_queue_flag_clear(QUEUE_FLAG_FUA, q); + + wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); +} +EXPORT_SYMBOL_GPL(blk_queue_write_cache); + +/** + * blk_queue_required_elevator_features - Set a queue required elevator features + * @q: the request queue for the target device + * @features: Required elevator features OR'ed together + * + * Tell the block layer that for the device controlled through @q, only the + * only elevators that can be used are those that implement at least the set of + * features specified by @features. + */ +void blk_queue_required_elevator_features(struct request_queue *q, + unsigned int features) +{ + q->required_elevator_features = features; +} +EXPORT_SYMBOL_GPL(blk_queue_required_elevator_features); + +/** + * blk_queue_can_use_dma_map_merging - configure queue for merging segments. + * @q: the request queue for the device + * @dev: the device pointer for dma + * + * Tell the block layer about merging the segments by dma map of @q. + */ +bool blk_queue_can_use_dma_map_merging(struct request_queue *q, + struct device *dev) +{ + unsigned long boundary = dma_get_merge_boundary(dev); + + if (!boundary) + return false; + + /* No need to update max_segment_size. see blk_queue_virt_boundary() */ + blk_queue_virt_boundary(q, boundary); + + return true; +} +EXPORT_SYMBOL_GPL(blk_queue_can_use_dma_map_merging); + +/** + * blk_queue_set_zoned - configure a disk queue zoned model. + * @disk: the gendisk of the queue to configure + * @model: the zoned model to set + * + * Set the zoned model of the request queue of @disk according to @model. + * When @model is BLK_ZONED_HM (host managed), this should be called only + * if zoned block device support is enabled (CONFIG_BLK_DEV_ZONED option). + * If @model specifies BLK_ZONED_HA (host aware), the effective model used + * depends on CONFIG_BLK_DEV_ZONED settings and on the existence of partitions + * on the disk. + */ +void blk_queue_set_zoned(struct gendisk *disk, enum blk_zoned_model model) +{ + switch (model) { + case BLK_ZONED_HM: + /* + * Host managed devices are supported only if + * CONFIG_BLK_DEV_ZONED is enabled. + */ + WARN_ON_ONCE(!IS_ENABLED(CONFIG_BLK_DEV_ZONED)); + break; + case BLK_ZONED_HA: + /* + * Host aware devices can be treated either as regular block + * devices (similar to drive managed devices) or as zoned block + * devices to take advantage of the zone command set, similarly + * to host managed devices. We try the latter if there are no + * partitions and zoned block device support is enabled, else + * we do nothing special as far as the block layer is concerned. + */ + if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || + disk_has_partitions(disk)) + model = BLK_ZONED_NONE; + break; + case BLK_ZONED_NONE: + default: + if (WARN_ON_ONCE(model != BLK_ZONED_NONE)) + model = BLK_ZONED_NONE; + break; + } + + disk->queue->limits.zoned = model; +} +EXPORT_SYMBOL_GPL(blk_queue_set_zoned); + +static int __init blk_settings_init(void) +{ + blk_max_low_pfn = max_low_pfn - 1; + blk_max_pfn = max_pfn - 1; + return 0; +} +subsys_initcall(blk_settings_init); diff --git a/block/blk-stat.c b/block/blk-stat.c new file mode 100644 index 000000000..ae3dd1fb8 --- /dev/null +++ b/block/blk-stat.c @@ -0,0 +1,221 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Block stat tracking code + * + * Copyright (C) 2016 Jens Axboe + */ +#include +#include +#include + +#include "blk-stat.h" +#include "blk-mq.h" +#include "blk.h" + +struct blk_queue_stats { + struct list_head callbacks; + spinlock_t lock; + bool enable_accounting; +}; + +void blk_rq_stat_init(struct blk_rq_stat *stat) +{ + stat->min = -1ULL; + stat->max = stat->nr_samples = stat->mean = 0; + stat->batch = 0; +} + +/* src is a per-cpu stat, mean isn't initialized */ +void blk_rq_stat_sum(struct blk_rq_stat *dst, struct blk_rq_stat *src) +{ + if (!src->nr_samples) + return; + + dst->min = min(dst->min, src->min); + dst->max = max(dst->max, src->max); + + dst->mean = div_u64(src->batch + dst->mean * dst->nr_samples, + dst->nr_samples + src->nr_samples); + + dst->nr_samples += src->nr_samples; +} + +void blk_rq_stat_add(struct blk_rq_stat *stat, u64 value) +{ + stat->min = min(stat->min, value); + stat->max = max(stat->max, value); + stat->batch += value; + stat->nr_samples++; +} + +void blk_stat_add(struct request *rq, u64 now) +{ + struct request_queue *q = rq->q; + struct blk_stat_callback *cb; + struct blk_rq_stat *stat; + int bucket, cpu; + u64 value; + + value = (now >= rq->io_start_time_ns) ? now - rq->io_start_time_ns : 0; + + blk_throtl_stat_add(rq, value); + + rcu_read_lock(); + cpu = get_cpu(); + list_for_each_entry_rcu(cb, &q->stats->callbacks, list) { + if (!blk_stat_is_active(cb)) + continue; + + bucket = cb->bucket_fn(rq); + if (bucket < 0) + continue; + + stat = &per_cpu_ptr(cb->cpu_stat, cpu)[bucket]; + blk_rq_stat_add(stat, value); + } + put_cpu(); + rcu_read_unlock(); +} + +static void blk_stat_timer_fn(struct timer_list *t) +{ + struct blk_stat_callback *cb = from_timer(cb, t, timer); + unsigned int bucket; + int cpu; + + for (bucket = 0; bucket < cb->buckets; bucket++) + blk_rq_stat_init(&cb->stat[bucket]); + + for_each_online_cpu(cpu) { + struct blk_rq_stat *cpu_stat; + + cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu); + for (bucket = 0; bucket < cb->buckets; bucket++) { + blk_rq_stat_sum(&cb->stat[bucket], &cpu_stat[bucket]); + blk_rq_stat_init(&cpu_stat[bucket]); + } + } + + cb->timer_fn(cb); +} + +struct blk_stat_callback * +blk_stat_alloc_callback(void (*timer_fn)(struct blk_stat_callback *), + int (*bucket_fn)(const struct request *), + unsigned int buckets, void *data) +{ + struct blk_stat_callback *cb; + + cb = kmalloc(sizeof(*cb), GFP_KERNEL); + if (!cb) + return NULL; + + cb->stat = kmalloc_array(buckets, sizeof(struct blk_rq_stat), + GFP_KERNEL); + if (!cb->stat) { + kfree(cb); + return NULL; + } + cb->cpu_stat = __alloc_percpu(buckets * sizeof(struct blk_rq_stat), + __alignof__(struct blk_rq_stat)); + if (!cb->cpu_stat) { + kfree(cb->stat); + kfree(cb); + return NULL; + } + + cb->timer_fn = timer_fn; + cb->bucket_fn = bucket_fn; + cb->data = data; + cb->buckets = buckets; + timer_setup(&cb->timer, blk_stat_timer_fn, 0); + + return cb; +} + +void blk_stat_add_callback(struct request_queue *q, + struct blk_stat_callback *cb) +{ + unsigned int bucket; + unsigned long flags; + int cpu; + + for_each_possible_cpu(cpu) { + struct blk_rq_stat *cpu_stat; + + cpu_stat = per_cpu_ptr(cb->cpu_stat, cpu); + for (bucket = 0; bucket < cb->buckets; bucket++) + blk_rq_stat_init(&cpu_stat[bucket]); + } + + spin_lock_irqsave(&q->stats->lock, flags); + list_add_tail_rcu(&cb->list, &q->stats->callbacks); + blk_queue_flag_set(QUEUE_FLAG_STATS, q); + spin_unlock_irqrestore(&q->stats->lock, flags); +} + +void blk_stat_remove_callback(struct request_queue *q, + struct blk_stat_callback *cb) +{ + unsigned long flags; + + spin_lock_irqsave(&q->stats->lock, flags); + list_del_rcu(&cb->list); + if (list_empty(&q->stats->callbacks) && !q->stats->enable_accounting) + blk_queue_flag_clear(QUEUE_FLAG_STATS, q); + spin_unlock_irqrestore(&q->stats->lock, flags); + + del_timer_sync(&cb->timer); +} + +static void blk_stat_free_callback_rcu(struct rcu_head *head) +{ + struct blk_stat_callback *cb; + + cb = container_of(head, struct blk_stat_callback, rcu); + free_percpu(cb->cpu_stat); + kfree(cb->stat); + kfree(cb); +} + +void blk_stat_free_callback(struct blk_stat_callback *cb) +{ + if (cb) + call_rcu(&cb->rcu, blk_stat_free_callback_rcu); +} + +void blk_stat_enable_accounting(struct request_queue *q) +{ + unsigned long flags; + + spin_lock_irqsave(&q->stats->lock, flags); + q->stats->enable_accounting = true; + blk_queue_flag_set(QUEUE_FLAG_STATS, q); + spin_unlock_irqrestore(&q->stats->lock, flags); +} +EXPORT_SYMBOL_GPL(blk_stat_enable_accounting); + +struct blk_queue_stats *blk_alloc_queue_stats(void) +{ + struct blk_queue_stats *stats; + + stats = kmalloc(sizeof(*stats), GFP_KERNEL); + if (!stats) + return NULL; + + INIT_LIST_HEAD(&stats->callbacks); + spin_lock_init(&stats->lock); + stats->enable_accounting = false; + + return stats; +} + +void blk_free_queue_stats(struct blk_queue_stats *stats) +{ + if (!stats) + return; + + WARN_ON(!list_empty(&stats->callbacks)); + + kfree(stats); +} diff --git a/block/blk-stat.h b/block/blk-stat.h new file mode 100644 index 000000000..17b47a86e --- /dev/null +++ b/block/blk-stat.h @@ -0,0 +1,171 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef BLK_STAT_H +#define BLK_STAT_H + +#include +#include +#include +#include +#include + +/** + * struct blk_stat_callback - Block statistics callback. + * + * A &struct blk_stat_callback is associated with a &struct request_queue. While + * @timer is active, that queue's request completion latencies are sorted into + * buckets by @bucket_fn and added to a per-cpu buffer, @cpu_stat. When the + * timer fires, @cpu_stat is flushed to @stat and @timer_fn is invoked. + */ +struct blk_stat_callback { + /* + * @list: RCU list of callbacks for a &struct request_queue. + */ + struct list_head list; + + /** + * @timer: Timer for the next callback invocation. + */ + struct timer_list timer; + + /** + * @cpu_stat: Per-cpu statistics buckets. + */ + struct blk_rq_stat __percpu *cpu_stat; + + /** + * @bucket_fn: Given a request, returns which statistics bucket it + * should be accounted under. Return -1 for no bucket for this + * request. + */ + int (*bucket_fn)(const struct request *); + + /** + * @buckets: Number of statistics buckets. + */ + unsigned int buckets; + + /** + * @stat: Array of statistics buckets. + */ + struct blk_rq_stat *stat; + + /** + * @fn: Callback function. + */ + void (*timer_fn)(struct blk_stat_callback *); + + /** + * @data: Private pointer for the user. + */ + void *data; + + struct rcu_head rcu; +}; + +struct blk_queue_stats *blk_alloc_queue_stats(void); +void blk_free_queue_stats(struct blk_queue_stats *); + +void blk_stat_add(struct request *rq, u64 now); + +/* record time/size info in request but not add a callback */ +void blk_stat_enable_accounting(struct request_queue *q); + +/** + * blk_stat_alloc_callback() - Allocate a block statistics callback. + * @timer_fn: Timer callback function. + * @bucket_fn: Bucket callback function. + * @buckets: Number of statistics buckets. + * @data: Value for the @data field of the &struct blk_stat_callback. + * + * See &struct blk_stat_callback for details on the callback functions. + * + * Return: &struct blk_stat_callback on success or NULL on ENOMEM. + */ +struct blk_stat_callback * +blk_stat_alloc_callback(void (*timer_fn)(struct blk_stat_callback *), + int (*bucket_fn)(const struct request *), + unsigned int buckets, void *data); + +/** + * blk_stat_add_callback() - Add a block statistics callback to be run on a + * request queue. + * @q: The request queue. + * @cb: The callback. + * + * Note that a single &struct blk_stat_callback can only be added to a single + * &struct request_queue. + */ +void blk_stat_add_callback(struct request_queue *q, + struct blk_stat_callback *cb); + +/** + * blk_stat_remove_callback() - Remove a block statistics callback from a + * request queue. + * @q: The request queue. + * @cb: The callback. + * + * When this returns, the callback is not running on any CPUs and will not be + * called again unless readded. + */ +void blk_stat_remove_callback(struct request_queue *q, + struct blk_stat_callback *cb); + +/** + * blk_stat_free_callback() - Free a block statistics callback. + * @cb: The callback. + * + * @cb may be NULL, in which case this does nothing. If it is not NULL, @cb must + * not be associated with a request queue. I.e., if it was previously added with + * blk_stat_add_callback(), it must also have been removed since then with + * blk_stat_remove_callback(). + */ +void blk_stat_free_callback(struct blk_stat_callback *cb); + +/** + * blk_stat_is_active() - Check if a block statistics callback is currently + * gathering statistics. + * @cb: The callback. + */ +static inline bool blk_stat_is_active(struct blk_stat_callback *cb) +{ + return timer_pending(&cb->timer); +} + +/** + * blk_stat_activate_nsecs() - Gather block statistics during a time window in + * nanoseconds. + * @cb: The callback. + * @nsecs: Number of nanoseconds to gather statistics for. + * + * The timer callback will be called when the window expires. + */ +static inline void blk_stat_activate_nsecs(struct blk_stat_callback *cb, + u64 nsecs) +{ + mod_timer(&cb->timer, jiffies + nsecs_to_jiffies(nsecs)); +} + +static inline void blk_stat_deactivate(struct blk_stat_callback *cb) +{ + del_timer_sync(&cb->timer); +} + +/** + * blk_stat_activate_msecs() - Gather block statistics during a time window in + * milliseconds. + * @cb: The callback. + * @msecs: Number of milliseconds to gather statistics for. + * + * The timer callback will be called when the window expires. + */ +static inline void blk_stat_activate_msecs(struct blk_stat_callback *cb, + unsigned int msecs) +{ + mod_timer(&cb->timer, jiffies + msecs_to_jiffies(msecs)); +} + +void blk_rq_stat_add(struct blk_rq_stat *, u64); +void blk_rq_stat_sum(struct blk_rq_stat *, struct blk_rq_stat *); +void blk_rq_stat_init(struct blk_rq_stat *); + +#endif diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c new file mode 100644 index 000000000..9174137a9 --- /dev/null +++ b/block/blk-sysfs.c @@ -0,0 +1,977 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Functions related to sysfs handling + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "blk.h" +#include "blk-mq.h" +#include "blk-mq-debugfs.h" +#include "blk-wbt.h" + +struct queue_sysfs_entry { + struct attribute attr; + ssize_t (*show)(struct request_queue *, char *); + ssize_t (*store)(struct request_queue *, const char *, size_t); +}; + +static ssize_t +queue_var_show(unsigned long var, char *page) +{ + return sprintf(page, "%lu\n", var); +} + +static ssize_t +queue_var_store(unsigned long *var, const char *page, size_t count) +{ + int err; + unsigned long v; + + err = kstrtoul(page, 10, &v); + if (err || v > UINT_MAX) + return -EINVAL; + + *var = v; + + return count; +} + +static ssize_t queue_var_store64(s64 *var, const char *page) +{ + int err; + s64 v; + + err = kstrtos64(page, 10, &v); + if (err < 0) + return err; + + *var = v; + return 0; +} + +static ssize_t queue_requests_show(struct request_queue *q, char *page) +{ + return queue_var_show(q->nr_requests, (page)); +} + +static ssize_t +queue_requests_store(struct request_queue *q, const char *page, size_t count) +{ + unsigned long nr; + int ret, err; + + if (!queue_is_mq(q)) + return -EINVAL; + + ret = queue_var_store(&nr, page, count); + if (ret < 0) + return ret; + + if (nr < BLKDEV_MIN_RQ) + nr = BLKDEV_MIN_RQ; + + err = blk_mq_update_nr_requests(q, nr); + if (err) + return err; + + return ret; +} + +static ssize_t queue_ra_show(struct request_queue *q, char *page) +{ + unsigned long ra_kb = q->backing_dev_info->ra_pages << + (PAGE_SHIFT - 10); + + return queue_var_show(ra_kb, (page)); +} + +static ssize_t +queue_ra_store(struct request_queue *q, const char *page, size_t count) +{ + unsigned long ra_kb; + ssize_t ret = queue_var_store(&ra_kb, page, count); + + if (ret < 0) + return ret; + + q->backing_dev_info->ra_pages = ra_kb >> (PAGE_SHIFT - 10); + + return ret; +} + +static ssize_t queue_max_sectors_show(struct request_queue *q, char *page) +{ + int max_sectors_kb = queue_max_sectors(q) >> 1; + + return queue_var_show(max_sectors_kb, (page)); +} + +static ssize_t queue_max_segments_show(struct request_queue *q, char *page) +{ + return queue_var_show(queue_max_segments(q), (page)); +} + +static ssize_t queue_max_discard_segments_show(struct request_queue *q, + char *page) +{ + return queue_var_show(queue_max_discard_segments(q), (page)); +} + +static ssize_t queue_max_integrity_segments_show(struct request_queue *q, char *page) +{ + return queue_var_show(q->limits.max_integrity_segments, (page)); +} + +static ssize_t queue_max_segment_size_show(struct request_queue *q, char *page) +{ + return queue_var_show(queue_max_segment_size(q), (page)); +} + +static ssize_t queue_logical_block_size_show(struct request_queue *q, char *page) +{ + return queue_var_show(queue_logical_block_size(q), page); +} + +static ssize_t queue_physical_block_size_show(struct request_queue *q, char *page) +{ + return queue_var_show(queue_physical_block_size(q), page); +} + +static ssize_t queue_chunk_sectors_show(struct request_queue *q, char *page) +{ + return queue_var_show(q->limits.chunk_sectors, page); +} + +static ssize_t queue_io_min_show(struct request_queue *q, char *page) +{ + return queue_var_show(queue_io_min(q), page); +} + +static ssize_t queue_io_opt_show(struct request_queue *q, char *page) +{ + return queue_var_show(queue_io_opt(q), page); +} + +static ssize_t queue_discard_granularity_show(struct request_queue *q, char *page) +{ + return queue_var_show(q->limits.discard_granularity, page); +} + +static ssize_t queue_discard_max_hw_show(struct request_queue *q, char *page) +{ + + return sprintf(page, "%llu\n", + (unsigned long long)q->limits.max_hw_discard_sectors << 9); +} + +static ssize_t queue_discard_max_show(struct request_queue *q, char *page) +{ + return sprintf(page, "%llu\n", + (unsigned long long)q->limits.max_discard_sectors << 9); +} + +static ssize_t queue_discard_max_store(struct request_queue *q, + const char *page, size_t count) +{ + unsigned long max_discard; + ssize_t ret = queue_var_store(&max_discard, page, count); + + if (ret < 0) + return ret; + + if (max_discard & (q->limits.discard_granularity - 1)) + return -EINVAL; + + max_discard >>= 9; + if (max_discard > UINT_MAX) + return -EINVAL; + + if (max_discard > q->limits.max_hw_discard_sectors) + max_discard = q->limits.max_hw_discard_sectors; + + q->limits.max_discard_sectors = max_discard; + return ret; +} + +static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *page) +{ + return queue_var_show(0, page); +} + +static ssize_t queue_write_same_max_show(struct request_queue *q, char *page) +{ + return sprintf(page, "%llu\n", + (unsigned long long)q->limits.max_write_same_sectors << 9); +} + +static ssize_t queue_write_zeroes_max_show(struct request_queue *q, char *page) +{ + return sprintf(page, "%llu\n", + (unsigned long long)q->limits.max_write_zeroes_sectors << 9); +} + +static ssize_t queue_zone_append_max_show(struct request_queue *q, char *page) +{ + unsigned long long max_sectors = q->limits.max_zone_append_sectors; + + return sprintf(page, "%llu\n", max_sectors << SECTOR_SHIFT); +} + +static ssize_t +queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) +{ + unsigned long max_sectors_kb, + max_hw_sectors_kb = queue_max_hw_sectors(q) >> 1, + page_kb = 1 << (PAGE_SHIFT - 10); + ssize_t ret = queue_var_store(&max_sectors_kb, page, count); + + if (ret < 0) + return ret; + + max_hw_sectors_kb = min_not_zero(max_hw_sectors_kb, (unsigned long) + q->limits.max_dev_sectors >> 1); + + if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb) + return -EINVAL; + + spin_lock_irq(&q->queue_lock); + q->limits.max_sectors = max_sectors_kb << 1; + q->backing_dev_info->io_pages = max_sectors_kb >> (PAGE_SHIFT - 10); + spin_unlock_irq(&q->queue_lock); + + return ret; +} + +static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page) +{ + int max_hw_sectors_kb = queue_max_hw_sectors(q) >> 1; + + return queue_var_show(max_hw_sectors_kb, (page)); +} + +#define QUEUE_SYSFS_BIT_FNS(name, flag, neg) \ +static ssize_t \ +queue_##name##_show(struct request_queue *q, char *page) \ +{ \ + int bit; \ + bit = test_bit(QUEUE_FLAG_##flag, &q->queue_flags); \ + return queue_var_show(neg ? !bit : bit, page); \ +} \ +static ssize_t \ +queue_##name##_store(struct request_queue *q, const char *page, size_t count) \ +{ \ + unsigned long val; \ + ssize_t ret; \ + ret = queue_var_store(&val, page, count); \ + if (ret < 0) \ + return ret; \ + if (neg) \ + val = !val; \ + \ + if (val) \ + blk_queue_flag_set(QUEUE_FLAG_##flag, q); \ + else \ + blk_queue_flag_clear(QUEUE_FLAG_##flag, q); \ + return ret; \ +} + +QUEUE_SYSFS_BIT_FNS(nonrot, NONROT, 1); +QUEUE_SYSFS_BIT_FNS(random, ADD_RANDOM, 0); +QUEUE_SYSFS_BIT_FNS(iostats, IO_STAT, 0); +QUEUE_SYSFS_BIT_FNS(stable_writes, STABLE_WRITES, 0); +#undef QUEUE_SYSFS_BIT_FNS + +static ssize_t queue_zoned_show(struct request_queue *q, char *page) +{ + switch (blk_queue_zoned_model(q)) { + case BLK_ZONED_HA: + return sprintf(page, "host-aware\n"); + case BLK_ZONED_HM: + return sprintf(page, "host-managed\n"); + default: + return sprintf(page, "none\n"); + } +} + +static ssize_t queue_nr_zones_show(struct request_queue *q, char *page) +{ + return queue_var_show(blk_queue_nr_zones(q), page); +} + +static ssize_t queue_max_open_zones_show(struct request_queue *q, char *page) +{ + return queue_var_show(queue_max_open_zones(q), page); +} + +static ssize_t queue_max_active_zones_show(struct request_queue *q, char *page) +{ + return queue_var_show(queue_max_active_zones(q), page); +} + +static ssize_t queue_nomerges_show(struct request_queue *q, char *page) +{ + return queue_var_show((blk_queue_nomerges(q) << 1) | + blk_queue_noxmerges(q), page); +} + +static ssize_t queue_nomerges_store(struct request_queue *q, const char *page, + size_t count) +{ + unsigned long nm; + ssize_t ret = queue_var_store(&nm, page, count); + + if (ret < 0) + return ret; + + blk_queue_flag_clear(QUEUE_FLAG_NOMERGES, q); + blk_queue_flag_clear(QUEUE_FLAG_NOXMERGES, q); + if (nm == 2) + blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q); + else if (nm) + blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q); + + return ret; +} + +static ssize_t queue_rq_affinity_show(struct request_queue *q, char *page) +{ + bool set = test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags); + bool force = test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags); + + return queue_var_show(set << force, page); +} + +static ssize_t +queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count) +{ + ssize_t ret = -EINVAL; +#ifdef CONFIG_SMP + unsigned long val; + + ret = queue_var_store(&val, page, count); + if (ret < 0) + return ret; + + if (val == 2) { + blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, q); + blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, q); + } else if (val == 1) { + blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, q); + blk_queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q); + } else if (val == 0) { + blk_queue_flag_clear(QUEUE_FLAG_SAME_COMP, q); + blk_queue_flag_clear(QUEUE_FLAG_SAME_FORCE, q); + } +#endif + return ret; +} + +static ssize_t queue_poll_delay_show(struct request_queue *q, char *page) +{ + int val; + + if (q->poll_nsec == BLK_MQ_POLL_CLASSIC) + val = BLK_MQ_POLL_CLASSIC; + else + val = q->poll_nsec / 1000; + + return sprintf(page, "%d\n", val); +} + +static ssize_t queue_poll_delay_store(struct request_queue *q, const char *page, + size_t count) +{ + int err, val; + + if (!q->mq_ops || !q->mq_ops->poll) + return -EINVAL; + + err = kstrtoint(page, 10, &val); + if (err < 0) + return err; + + if (val == BLK_MQ_POLL_CLASSIC) + q->poll_nsec = BLK_MQ_POLL_CLASSIC; + else if (val >= 0) + q->poll_nsec = val * 1000; + else + return -EINVAL; + + return count; +} + +static ssize_t queue_poll_show(struct request_queue *q, char *page) +{ + return queue_var_show(test_bit(QUEUE_FLAG_POLL, &q->queue_flags), page); +} + +static ssize_t queue_poll_store(struct request_queue *q, const char *page, + size_t count) +{ + unsigned long poll_on; + ssize_t ret; + + if (!q->tag_set || q->tag_set->nr_maps <= HCTX_TYPE_POLL || + !q->tag_set->map[HCTX_TYPE_POLL].nr_queues) + return -EINVAL; + + ret = queue_var_store(&poll_on, page, count); + if (ret < 0) + return ret; + + if (poll_on) + blk_queue_flag_set(QUEUE_FLAG_POLL, q); + else + blk_queue_flag_clear(QUEUE_FLAG_POLL, q); + + return ret; +} + +static ssize_t queue_io_timeout_show(struct request_queue *q, char *page) +{ + return sprintf(page, "%u\n", jiffies_to_msecs(q->rq_timeout)); +} + +static ssize_t queue_io_timeout_store(struct request_queue *q, const char *page, + size_t count) +{ + unsigned int val; + int err; + + err = kstrtou32(page, 10, &val); + if (err || val == 0) + return -EINVAL; + + blk_queue_rq_timeout(q, msecs_to_jiffies(val)); + + return count; +} + +static ssize_t queue_wb_lat_show(struct request_queue *q, char *page) +{ + if (!wbt_rq_qos(q)) + return -EINVAL; + + return sprintf(page, "%llu\n", div_u64(wbt_get_min_lat(q), 1000)); +} + +static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page, + size_t count) +{ + struct rq_qos *rqos; + ssize_t ret; + s64 val; + + ret = queue_var_store64(&val, page); + if (ret < 0) + return ret; + if (val < -1) + return -EINVAL; + + rqos = wbt_rq_qos(q); + if (!rqos) { + ret = wbt_init(q); + if (ret) + return ret; + } + + if (val == -1) + val = wbt_default_latency_nsec(q); + else if (val >= 0) + val *= 1000ULL; + + if (wbt_get_min_lat(q) == val) + return count; + + /* + * Ensure that the queue is idled, in case the latency update + * ends up either enabling or disabling wbt completely. We can't + * have IO inflight if that happens. + */ + blk_mq_freeze_queue(q); + blk_mq_quiesce_queue(q); + + wbt_set_min_lat(q, val); + + blk_mq_unquiesce_queue(q); + blk_mq_unfreeze_queue(q); + + return count; +} + +static ssize_t queue_wc_show(struct request_queue *q, char *page) +{ + if (test_bit(QUEUE_FLAG_WC, &q->queue_flags)) + return sprintf(page, "write back\n"); + + return sprintf(page, "write through\n"); +} + +static ssize_t queue_wc_store(struct request_queue *q, const char *page, + size_t count) +{ + int set = -1; + + if (!strncmp(page, "write back", 10)) + set = 1; + else if (!strncmp(page, "write through", 13) || + !strncmp(page, "none", 4)) + set = 0; + + if (set == -1) + return -EINVAL; + + if (set) + blk_queue_flag_set(QUEUE_FLAG_WC, q); + else + blk_queue_flag_clear(QUEUE_FLAG_WC, q); + + return count; +} + +static ssize_t queue_fua_show(struct request_queue *q, char *page) +{ + return sprintf(page, "%u\n", test_bit(QUEUE_FLAG_FUA, &q->queue_flags)); +} + +static ssize_t queue_dax_show(struct request_queue *q, char *page) +{ + return queue_var_show(blk_queue_dax(q), page); +} + +#define QUEUE_RO_ENTRY(_prefix, _name) \ +static struct queue_sysfs_entry _prefix##_entry = { \ + .attr = { .name = _name, .mode = 0444 }, \ + .show = _prefix##_show, \ +}; + +#define QUEUE_RW_ENTRY(_prefix, _name) \ +static struct queue_sysfs_entry _prefix##_entry = { \ + .attr = { .name = _name, .mode = 0644 }, \ + .show = _prefix##_show, \ + .store = _prefix##_store, \ +}; + +QUEUE_RW_ENTRY(queue_requests, "nr_requests"); +QUEUE_RW_ENTRY(queue_ra, "read_ahead_kb"); +QUEUE_RW_ENTRY(queue_max_sectors, "max_sectors_kb"); +QUEUE_RO_ENTRY(queue_max_hw_sectors, "max_hw_sectors_kb"); +QUEUE_RO_ENTRY(queue_max_segments, "max_segments"); +QUEUE_RO_ENTRY(queue_max_integrity_segments, "max_integrity_segments"); +QUEUE_RO_ENTRY(queue_max_segment_size, "max_segment_size"); +QUEUE_RW_ENTRY(elv_iosched, "scheduler"); + +QUEUE_RO_ENTRY(queue_logical_block_size, "logical_block_size"); +QUEUE_RO_ENTRY(queue_physical_block_size, "physical_block_size"); +QUEUE_RO_ENTRY(queue_chunk_sectors, "chunk_sectors"); +QUEUE_RO_ENTRY(queue_io_min, "minimum_io_size"); +QUEUE_RO_ENTRY(queue_io_opt, "optimal_io_size"); + +QUEUE_RO_ENTRY(queue_max_discard_segments, "max_discard_segments"); +QUEUE_RO_ENTRY(queue_discard_granularity, "discard_granularity"); +QUEUE_RO_ENTRY(queue_discard_max_hw, "discard_max_hw_bytes"); +QUEUE_RW_ENTRY(queue_discard_max, "discard_max_bytes"); +QUEUE_RO_ENTRY(queue_discard_zeroes_data, "discard_zeroes_data"); + +QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes"); +QUEUE_RO_ENTRY(queue_write_zeroes_max, "write_zeroes_max_bytes"); +QUEUE_RO_ENTRY(queue_zone_append_max, "zone_append_max_bytes"); + +QUEUE_RO_ENTRY(queue_zoned, "zoned"); +QUEUE_RO_ENTRY(queue_nr_zones, "nr_zones"); +QUEUE_RO_ENTRY(queue_max_open_zones, "max_open_zones"); +QUEUE_RO_ENTRY(queue_max_active_zones, "max_active_zones"); + +QUEUE_RW_ENTRY(queue_nomerges, "nomerges"); +QUEUE_RW_ENTRY(queue_rq_affinity, "rq_affinity"); +QUEUE_RW_ENTRY(queue_poll, "io_poll"); +QUEUE_RW_ENTRY(queue_poll_delay, "io_poll_delay"); +QUEUE_RW_ENTRY(queue_wc, "write_cache"); +QUEUE_RO_ENTRY(queue_fua, "fua"); +QUEUE_RO_ENTRY(queue_dax, "dax"); +QUEUE_RW_ENTRY(queue_io_timeout, "io_timeout"); +QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec"); + +#ifdef CONFIG_BLK_DEV_THROTTLING_LOW +QUEUE_RW_ENTRY(blk_throtl_sample_time, "throttle_sample_time"); +#endif + +/* legacy alias for logical_block_size: */ +static struct queue_sysfs_entry queue_hw_sector_size_entry = { + .attr = {.name = "hw_sector_size", .mode = 0444 }, + .show = queue_logical_block_size_show, +}; + +QUEUE_RW_ENTRY(queue_nonrot, "rotational"); +QUEUE_RW_ENTRY(queue_iostats, "iostats"); +QUEUE_RW_ENTRY(queue_random, "add_random"); +QUEUE_RW_ENTRY(queue_stable_writes, "stable_writes"); + +static struct attribute *queue_attrs[] = { + &queue_requests_entry.attr, + &queue_ra_entry.attr, + &queue_max_hw_sectors_entry.attr, + &queue_max_sectors_entry.attr, + &queue_max_segments_entry.attr, + &queue_max_discard_segments_entry.attr, + &queue_max_integrity_segments_entry.attr, + &queue_max_segment_size_entry.attr, + &elv_iosched_entry.attr, + &queue_hw_sector_size_entry.attr, + &queue_logical_block_size_entry.attr, + &queue_physical_block_size_entry.attr, + &queue_chunk_sectors_entry.attr, + &queue_io_min_entry.attr, + &queue_io_opt_entry.attr, + &queue_discard_granularity_entry.attr, + &queue_discard_max_entry.attr, + &queue_discard_max_hw_entry.attr, + &queue_discard_zeroes_data_entry.attr, + &queue_write_same_max_entry.attr, + &queue_write_zeroes_max_entry.attr, + &queue_zone_append_max_entry.attr, + &queue_nonrot_entry.attr, + &queue_zoned_entry.attr, + &queue_nr_zones_entry.attr, + &queue_max_open_zones_entry.attr, + &queue_max_active_zones_entry.attr, + &queue_nomerges_entry.attr, + &queue_rq_affinity_entry.attr, + &queue_iostats_entry.attr, + &queue_stable_writes_entry.attr, + &queue_random_entry.attr, + &queue_poll_entry.attr, + &queue_wc_entry.attr, + &queue_fua_entry.attr, + &queue_dax_entry.attr, + &queue_wb_lat_entry.attr, + &queue_poll_delay_entry.attr, + &queue_io_timeout_entry.attr, +#ifdef CONFIG_BLK_DEV_THROTTLING_LOW + &blk_throtl_sample_time_entry.attr, +#endif + NULL, +}; + +static umode_t queue_attr_visible(struct kobject *kobj, struct attribute *attr, + int n) +{ + struct request_queue *q = + container_of(kobj, struct request_queue, kobj); + + if (attr == &queue_io_timeout_entry.attr && + (!q->mq_ops || !q->mq_ops->timeout)) + return 0; + + if ((attr == &queue_max_open_zones_entry.attr || + attr == &queue_max_active_zones_entry.attr) && + !blk_queue_is_zoned(q)) + return 0; + + return attr->mode; +} + +static struct attribute_group queue_attr_group = { + .attrs = queue_attrs, + .is_visible = queue_attr_visible, +}; + + +#define to_queue(atr) container_of((atr), struct queue_sysfs_entry, attr) + +static ssize_t +queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page) +{ + struct queue_sysfs_entry *entry = to_queue(attr); + struct request_queue *q = + container_of(kobj, struct request_queue, kobj); + ssize_t res; + + if (!entry->show) + return -EIO; + mutex_lock(&q->sysfs_lock); + res = entry->show(q, page); + mutex_unlock(&q->sysfs_lock); + return res; +} + +static ssize_t +queue_attr_store(struct kobject *kobj, struct attribute *attr, + const char *page, size_t length) +{ + struct queue_sysfs_entry *entry = to_queue(attr); + struct request_queue *q; + ssize_t res; + + if (!entry->store) + return -EIO; + + q = container_of(kobj, struct request_queue, kobj); + mutex_lock(&q->sysfs_lock); + res = entry->store(q, page, length); + mutex_unlock(&q->sysfs_lock); + return res; +} + +static void blk_free_queue_rcu(struct rcu_head *rcu_head) +{ + struct request_queue *q = container_of(rcu_head, struct request_queue, + rcu_head); + + percpu_ref_exit(&q->q_usage_counter); + kmem_cache_free(blk_requestq_cachep, q); +} + +/* Unconfigure the I/O scheduler and dissociate from the cgroup controller. */ +static void blk_exit_queue(struct request_queue *q) +{ + /* + * Since the I/O scheduler exit code may access cgroup information, + * perform I/O scheduler exit before disassociating from the block + * cgroup controller. + */ + if (q->elevator) { + ioc_clear_queue(q); + __elevator_exit(q, q->elevator); + } + + /* + * Remove all references to @q from the block cgroup controller before + * restoring @q->queue_lock to avoid that restoring this pointer causes + * e.g. blkcg_print_blkgs() to crash. + */ + blkcg_exit_queue(q); + + /* + * Since the cgroup code may dereference the @q->backing_dev_info + * pointer, only decrease its reference count after having removed the + * association with the block cgroup controller. + */ + bdi_put(q->backing_dev_info); +} + +/** + * blk_release_queue - releases all allocated resources of the request_queue + * @kobj: pointer to a kobject, whose container is a request_queue + * + * This function releases all allocated resources of the request queue. + * + * The struct request_queue refcount is incremented with blk_get_queue() and + * decremented with blk_put_queue(). Once the refcount reaches 0 this function + * is called. + * + * For drivers that have a request_queue on a gendisk and added with + * __device_add_disk() the refcount to request_queue will reach 0 with + * the last put_disk() called by the driver. For drivers which don't use + * __device_add_disk() this happens with blk_cleanup_queue(). + * + * Drivers exist which depend on the release of the request_queue to be + * synchronous, it should not be deferred. + * + * Context: can sleep + */ +static void blk_release_queue(struct kobject *kobj) +{ + struct request_queue *q = + container_of(kobj, struct request_queue, kobj); + + might_sleep(); + + if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags)) + blk_stat_remove_callback(q, q->poll_cb); + blk_stat_free_callback(q->poll_cb); + + blk_free_queue_stats(q->stats); + + if (queue_is_mq(q)) { + struct blk_mq_hw_ctx *hctx; + int i; + + cancel_delayed_work_sync(&q->requeue_work); + + queue_for_each_hw_ctx(q, hctx, i) + cancel_delayed_work_sync(&hctx->run_work); + } + + blk_exit_queue(q); + + blk_queue_free_zone_bitmaps(q); + + if (queue_is_mq(q)) + blk_mq_release(q); + + blk_trace_shutdown(q); + mutex_lock(&q->debugfs_mutex); + debugfs_remove_recursive(q->debugfs_dir); + mutex_unlock(&q->debugfs_mutex); + + if (queue_is_mq(q)) + blk_mq_debugfs_unregister(q); + + bioset_exit(&q->bio_split); + + ida_simple_remove(&blk_queue_ida, q->id); + call_rcu(&q->rcu_head, blk_free_queue_rcu); +} + +static const struct sysfs_ops queue_sysfs_ops = { + .show = queue_attr_show, + .store = queue_attr_store, +}; + +struct kobj_type blk_queue_ktype = { + .sysfs_ops = &queue_sysfs_ops, + .release = blk_release_queue, +}; + +/** + * blk_register_queue - register a block layer queue with sysfs + * @disk: Disk of which the request queue should be registered with sysfs. + */ +int blk_register_queue(struct gendisk *disk) +{ + int ret; + struct device *dev = disk_to_dev(disk); + struct request_queue *q = disk->queue; + + if (WARN_ON(!q)) + return -ENXIO; + + WARN_ONCE(blk_queue_registered(q), + "%s is registering an already registered queue\n", + kobject_name(&dev->kobj)); + + /* + * SCSI probing may synchronously create and destroy a lot of + * request_queues for non-existent devices. Shutting down a fully + * functional queue takes measureable wallclock time as RCU grace + * periods are involved. To avoid excessive latency in these + * cases, a request_queue starts out in a degraded mode which is + * faster to shut down and is made fully functional here as + * request_queues for non-existent devices never get registered. + */ + if (!blk_queue_init_done(q)) { + blk_queue_flag_set(QUEUE_FLAG_INIT_DONE, q); + percpu_ref_switch_to_percpu(&q->q_usage_counter); + } + + blk_queue_update_readahead(q); + + ret = blk_trace_init_sysfs(dev); + if (ret) + return ret; + + mutex_lock(&q->sysfs_dir_lock); + + ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue"); + if (ret < 0) { + blk_trace_remove_sysfs(dev); + goto unlock; + } + + ret = sysfs_create_group(&q->kobj, &queue_attr_group); + if (ret) { + blk_trace_remove_sysfs(dev); + kobject_del(&q->kobj); + kobject_put(&dev->kobj); + goto unlock; + } + + mutex_lock(&q->debugfs_mutex); + q->debugfs_dir = debugfs_create_dir(kobject_name(q->kobj.parent), + blk_debugfs_root); + mutex_unlock(&q->debugfs_mutex); + + if (queue_is_mq(q)) { + __blk_mq_register_dev(dev, q); + blk_mq_debugfs_register(q); + } + + mutex_lock(&q->sysfs_lock); + if (q->elevator) { + ret = elv_register_queue(q, false); + if (ret) { + mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->sysfs_dir_lock); + kobject_del(&q->kobj); + blk_trace_remove_sysfs(dev); + kobject_put(&dev->kobj); + return ret; + } + } + + blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); + wbt_enable_default(q); + blk_throtl_register_queue(q); + + /* Now everything is ready and send out KOBJ_ADD uevent */ + kobject_uevent(&q->kobj, KOBJ_ADD); + if (q->elevator) + kobject_uevent(&q->elevator->kobj, KOBJ_ADD); + mutex_unlock(&q->sysfs_lock); + + ret = 0; +unlock: + mutex_unlock(&q->sysfs_dir_lock); + return ret; +} +EXPORT_SYMBOL_GPL(blk_register_queue); + +/** + * blk_unregister_queue - counterpart of blk_register_queue() + * @disk: Disk of which the request queue should be unregistered from sysfs. + * + * Note: the caller is responsible for guaranteeing that this function is called + * after blk_register_queue() has finished. + */ +void blk_unregister_queue(struct gendisk *disk) +{ + struct request_queue *q = disk->queue; + + if (WARN_ON(!q)) + return; + + /* Return early if disk->queue was never registered. */ + if (!blk_queue_registered(q)) + return; + + /* + * Since sysfs_remove_dir() prevents adding new directory entries + * before removal of existing entries starts, protect against + * concurrent elv_iosched_store() calls. + */ + mutex_lock(&q->sysfs_lock); + blk_queue_flag_clear(QUEUE_FLAG_REGISTERED, q); + mutex_unlock(&q->sysfs_lock); + + mutex_lock(&q->sysfs_dir_lock); + /* + * Remove the sysfs attributes before unregistering the queue data + * structures that can be modified through sysfs. + */ + if (queue_is_mq(q)) + blk_mq_unregister_dev(disk_to_dev(disk), q); + blk_trace_remove_sysfs(disk_to_dev(disk)); + + mutex_lock(&q->sysfs_lock); + if (q->elevator) + elv_unregister_queue(q); + mutex_unlock(&q->sysfs_lock); + + /* Now that we've deleted all child objects, we can delete the queue. */ + kobject_uevent(&q->kobj, KOBJ_REMOVE); + kobject_del(&q->kobj); + + mutex_unlock(&q->sysfs_dir_lock); + + kobject_put(&disk_to_dev(disk)->kobj); +} diff --git a/block/blk-throttle.c b/block/blk-throttle.c new file mode 100644 index 000000000..4bf514a7b --- /dev/null +++ b/block/blk-throttle.c @@ -0,0 +1,2527 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Interface for controlling IO bandwidth on a request queue + * + * Copyright (C) 2010 Vivek Goyal + */ + +#include +#include +#include +#include +#include +#include +#include "blk.h" +#include "blk-cgroup-rwstat.h" + +/* Max dispatch from a group in 1 round */ +#define THROTL_GRP_QUANTUM 8 + +/* Total max dispatch from all groups in one round */ +#define THROTL_QUANTUM 32 + +/* Throttling is performed over a slice and after that slice is renewed */ +#define DFL_THROTL_SLICE_HD (HZ / 10) +#define DFL_THROTL_SLICE_SSD (HZ / 50) +#define MAX_THROTL_SLICE (HZ) +#define MAX_IDLE_TIME (5L * 1000 * 1000) /* 5 s */ +#define MIN_THROTL_BPS (320 * 1024) +#define MIN_THROTL_IOPS (10) +#define DFL_LATENCY_TARGET (-1L) +#define DFL_IDLE_THRESHOLD (0) +#define DFL_HD_BASELINE_LATENCY (4000L) /* 4ms */ +#define LATENCY_FILTERED_SSD (0) +/* + * For HD, very small latency comes from sequential IO. Such IO is helpless to + * help determine if its IO is impacted by others, hence we ignore the IO + */ +#define LATENCY_FILTERED_HD (1000L) /* 1ms */ + +static struct blkcg_policy blkcg_policy_throtl; + +/* A workqueue to queue throttle related work */ +static struct workqueue_struct *kthrotld_workqueue; + +/* + * To implement hierarchical throttling, throtl_grps form a tree and bios + * are dispatched upwards level by level until they reach the top and get + * issued. When dispatching bios from the children and local group at each + * level, if the bios are dispatched into a single bio_list, there's a risk + * of a local or child group which can queue many bios at once filling up + * the list starving others. + * + * To avoid such starvation, dispatched bios are queued separately + * according to where they came from. When they are again dispatched to + * the parent, they're popped in round-robin order so that no single source + * hogs the dispatch window. + * + * throtl_qnode is used to keep the queued bios separated by their sources. + * Bios are queued to throtl_qnode which in turn is queued to + * throtl_service_queue and then dispatched in round-robin order. + * + * It's also used to track the reference counts on blkg's. A qnode always + * belongs to a throtl_grp and gets queued on itself or the parent, so + * incrementing the reference of the associated throtl_grp when a qnode is + * queued and decrementing when dequeued is enough to keep the whole blkg + * tree pinned while bios are in flight. + */ +struct throtl_qnode { + struct list_head node; /* service_queue->queued[] */ + struct bio_list bios; /* queued bios */ + struct throtl_grp *tg; /* tg this qnode belongs to */ +}; + +struct throtl_service_queue { + struct throtl_service_queue *parent_sq; /* the parent service_queue */ + + /* + * Bios queued directly to this service_queue or dispatched from + * children throtl_grp's. + */ + struct list_head queued[2]; /* throtl_qnode [READ/WRITE] */ + unsigned int nr_queued[2]; /* number of queued bios */ + + /* + * RB tree of active children throtl_grp's, which are sorted by + * their ->disptime. + */ + struct rb_root_cached pending_tree; /* RB tree of active tgs */ + unsigned int nr_pending; /* # queued in the tree */ + unsigned long first_pending_disptime; /* disptime of the first tg */ + struct timer_list pending_timer; /* fires on first_pending_disptime */ +}; + +enum tg_state_flags { + THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */ + THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */ +}; + +#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) + +enum { + LIMIT_LOW, + LIMIT_MAX, + LIMIT_CNT, +}; + +struct throtl_grp { + /* must be the first member */ + struct blkg_policy_data pd; + + /* active throtl group service_queue member */ + struct rb_node rb_node; + + /* throtl_data this group belongs to */ + struct throtl_data *td; + + /* this group's service queue */ + struct throtl_service_queue service_queue; + + /* + * qnode_on_self is used when bios are directly queued to this + * throtl_grp so that local bios compete fairly with bios + * dispatched from children. qnode_on_parent is used when bios are + * dispatched from this throtl_grp into its parent and will compete + * with the sibling qnode_on_parents and the parent's + * qnode_on_self. + */ + struct throtl_qnode qnode_on_self[2]; + struct throtl_qnode qnode_on_parent[2]; + + /* + * Dispatch time in jiffies. This is the estimated time when group + * will unthrottle and is ready to dispatch more bio. It is used as + * key to sort active groups in service tree. + */ + unsigned long disptime; + + unsigned int flags; + + /* are there any throtl rules between this group and td? */ + bool has_rules[2]; + + /* internally used bytes per second rate limits */ + uint64_t bps[2][LIMIT_CNT]; + /* user configured bps limits */ + uint64_t bps_conf[2][LIMIT_CNT]; + + /* internally used IOPS limits */ + unsigned int iops[2][LIMIT_CNT]; + /* user configured IOPS limits */ + unsigned int iops_conf[2][LIMIT_CNT]; + + /* Number of bytes dispatched in current slice */ + uint64_t bytes_disp[2]; + /* Number of bio's dispatched in current slice */ + unsigned int io_disp[2]; + + unsigned long last_low_overflow_time[2]; + + uint64_t last_bytes_disp[2]; + unsigned int last_io_disp[2]; + + unsigned long last_check_time; + + unsigned long latency_target; /* us */ + unsigned long latency_target_conf; /* us */ + /* When did we start a new slice */ + unsigned long slice_start[2]; + unsigned long slice_end[2]; + + unsigned long last_finish_time; /* ns / 1024 */ + unsigned long checked_last_finish_time; /* ns / 1024 */ + unsigned long avg_idletime; /* ns / 1024 */ + unsigned long idletime_threshold; /* us */ + unsigned long idletime_threshold_conf; /* us */ + + unsigned int bio_cnt; /* total bios */ + unsigned int bad_bio_cnt; /* bios exceeding latency threshold */ + unsigned long bio_cnt_reset_time; + + atomic_t io_split_cnt[2]; + atomic_t last_io_split_cnt[2]; + + struct blkg_rwstat stat_bytes; + struct blkg_rwstat stat_ios; +}; + +/* We measure latency for request size from <= 4k to >= 1M */ +#define LATENCY_BUCKET_SIZE 9 + +struct latency_bucket { + unsigned long total_latency; /* ns / 1024 */ + int samples; +}; + +struct avg_latency_bucket { + unsigned long latency; /* ns / 1024 */ + bool valid; +}; + +struct throtl_data +{ + /* service tree for active throtl groups */ + struct throtl_service_queue service_queue; + + struct request_queue *queue; + + /* Total Number of queued bios on READ and WRITE lists */ + unsigned int nr_queued[2]; + + unsigned int throtl_slice; + + /* Work for dispatching throttled bios */ + struct work_struct dispatch_work; + unsigned int limit_index; + bool limit_valid[LIMIT_CNT]; + + unsigned long low_upgrade_time; + unsigned long low_downgrade_time; + + unsigned int scale; + + struct latency_bucket tmp_buckets[2][LATENCY_BUCKET_SIZE]; + struct avg_latency_bucket avg_buckets[2][LATENCY_BUCKET_SIZE]; + struct latency_bucket __percpu *latency_buckets[2]; + unsigned long last_calculate_time; + unsigned long filtered_latency; + + bool track_bio_latency; +}; + +static void throtl_pending_timer_fn(struct timer_list *t); + +static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd) +{ + return pd ? container_of(pd, struct throtl_grp, pd) : NULL; +} + +static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg) +{ + return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl)); +} + +static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg) +{ + return pd_to_blkg(&tg->pd); +} + +/** + * sq_to_tg - return the throl_grp the specified service queue belongs to + * @sq: the throtl_service_queue of interest + * + * Return the throtl_grp @sq belongs to. If @sq is the top-level one + * embedded in throtl_data, %NULL is returned. + */ +static struct throtl_grp *sq_to_tg(struct throtl_service_queue *sq) +{ + if (sq && sq->parent_sq) + return container_of(sq, struct throtl_grp, service_queue); + else + return NULL; +} + +/** + * sq_to_td - return throtl_data the specified service queue belongs to + * @sq: the throtl_service_queue of interest + * + * A service_queue can be embedded in either a throtl_grp or throtl_data. + * Determine the associated throtl_data accordingly and return it. + */ +static struct throtl_data *sq_to_td(struct throtl_service_queue *sq) +{ + struct throtl_grp *tg = sq_to_tg(sq); + + if (tg) + return tg->td; + else + return container_of(sq, struct throtl_data, service_queue); +} + +/* + * cgroup's limit in LIMIT_MAX is scaled if low limit is set. This scale is to + * make the IO dispatch more smooth. + * Scale up: linearly scale up according to lapsed time since upgrade. For + * every throtl_slice, the limit scales up 1/2 .low limit till the + * limit hits .max limit + * Scale down: exponentially scale down if a cgroup doesn't hit its .low limit + */ +static uint64_t throtl_adjusted_limit(uint64_t low, struct throtl_data *td) +{ + /* arbitrary value to avoid too big scale */ + if (td->scale < 4096 && time_after_eq(jiffies, + td->low_upgrade_time + td->scale * td->throtl_slice)) + td->scale = (jiffies - td->low_upgrade_time) / td->throtl_slice; + + return low + (low >> 1) * td->scale; +} + +static uint64_t tg_bps_limit(struct throtl_grp *tg, int rw) +{ + struct blkcg_gq *blkg = tg_to_blkg(tg); + struct throtl_data *td; + uint64_t ret; + + if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent) + return U64_MAX; + + td = tg->td; + ret = tg->bps[rw][td->limit_index]; + if (ret == 0 && td->limit_index == LIMIT_LOW) { + /* intermediate node or iops isn't 0 */ + if (!list_empty(&blkg->blkcg->css.children) || + tg->iops[rw][td->limit_index]) + return U64_MAX; + else + return MIN_THROTL_BPS; + } + + if (td->limit_index == LIMIT_MAX && tg->bps[rw][LIMIT_LOW] && + tg->bps[rw][LIMIT_LOW] != tg->bps[rw][LIMIT_MAX]) { + uint64_t adjusted; + + adjusted = throtl_adjusted_limit(tg->bps[rw][LIMIT_LOW], td); + ret = min(tg->bps[rw][LIMIT_MAX], adjusted); + } + return ret; +} + +static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw) +{ + struct blkcg_gq *blkg = tg_to_blkg(tg); + struct throtl_data *td; + unsigned int ret; + + if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent) + return UINT_MAX; + + td = tg->td; + ret = tg->iops[rw][td->limit_index]; + if (ret == 0 && tg->td->limit_index == LIMIT_LOW) { + /* intermediate node or bps isn't 0 */ + if (!list_empty(&blkg->blkcg->css.children) || + tg->bps[rw][td->limit_index]) + return UINT_MAX; + else + return MIN_THROTL_IOPS; + } + + if (td->limit_index == LIMIT_MAX && tg->iops[rw][LIMIT_LOW] && + tg->iops[rw][LIMIT_LOW] != tg->iops[rw][LIMIT_MAX]) { + uint64_t adjusted; + + adjusted = throtl_adjusted_limit(tg->iops[rw][LIMIT_LOW], td); + if (adjusted > UINT_MAX) + adjusted = UINT_MAX; + ret = min_t(unsigned int, tg->iops[rw][LIMIT_MAX], adjusted); + } + return ret; +} + +#define request_bucket_index(sectors) \ + clamp_t(int, order_base_2(sectors) - 3, 0, LATENCY_BUCKET_SIZE - 1) + +/** + * throtl_log - log debug message via blktrace + * @sq: the service_queue being reported + * @fmt: printf format string + * @args: printf args + * + * The messages are prefixed with "throtl BLKG_NAME" if @sq belongs to a + * throtl_grp; otherwise, just "throtl". + */ +#define throtl_log(sq, fmt, args...) do { \ + struct throtl_grp *__tg = sq_to_tg((sq)); \ + struct throtl_data *__td = sq_to_td((sq)); \ + \ + (void)__td; \ + if (likely(!blk_trace_note_message_enabled(__td->queue))) \ + break; \ + if ((__tg)) { \ + blk_add_cgroup_trace_msg(__td->queue, \ + tg_to_blkg(__tg)->blkcg, "throtl " fmt, ##args);\ + } else { \ + blk_add_trace_msg(__td->queue, "throtl " fmt, ##args); \ + } \ +} while (0) + +static inline unsigned int throtl_bio_data_size(struct bio *bio) +{ + /* assume it's one sector */ + if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) + return 512; + return bio->bi_iter.bi_size; +} + +static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg) +{ + INIT_LIST_HEAD(&qn->node); + bio_list_init(&qn->bios); + qn->tg = tg; +} + +/** + * throtl_qnode_add_bio - add a bio to a throtl_qnode and activate it + * @bio: bio being added + * @qn: qnode to add bio to + * @queued: the service_queue->queued[] list @qn belongs to + * + * Add @bio to @qn and put @qn on @queued if it's not already on. + * @qn->tg's reference count is bumped when @qn is activated. See the + * comment on top of throtl_qnode definition for details. + */ +static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn, + struct list_head *queued) +{ + bio_list_add(&qn->bios, bio); + if (list_empty(&qn->node)) { + list_add_tail(&qn->node, queued); + blkg_get(tg_to_blkg(qn->tg)); + } +} + +/** + * throtl_peek_queued - peek the first bio on a qnode list + * @queued: the qnode list to peek + */ +static struct bio *throtl_peek_queued(struct list_head *queued) +{ + struct throtl_qnode *qn; + struct bio *bio; + + if (list_empty(queued)) + return NULL; + + qn = list_first_entry(queued, struct throtl_qnode, node); + bio = bio_list_peek(&qn->bios); + WARN_ON_ONCE(!bio); + return bio; +} + +/** + * throtl_pop_queued - pop the first bio form a qnode list + * @queued: the qnode list to pop a bio from + * @tg_to_put: optional out argument for throtl_grp to put + * + * Pop the first bio from the qnode list @queued. After popping, the first + * qnode is removed from @queued if empty or moved to the end of @queued so + * that the popping order is round-robin. + * + * When the first qnode is removed, its associated throtl_grp should be put + * too. If @tg_to_put is NULL, this function automatically puts it; + * otherwise, *@tg_to_put is set to the throtl_grp to put and the caller is + * responsible for putting it. + */ +static struct bio *throtl_pop_queued(struct list_head *queued, + struct throtl_grp **tg_to_put) +{ + struct throtl_qnode *qn; + struct bio *bio; + + if (list_empty(queued)) + return NULL; + + qn = list_first_entry(queued, struct throtl_qnode, node); + bio = bio_list_pop(&qn->bios); + WARN_ON_ONCE(!bio); + + if (bio_list_empty(&qn->bios)) { + list_del_init(&qn->node); + if (tg_to_put) + *tg_to_put = qn->tg; + else + blkg_put(tg_to_blkg(qn->tg)); + } else { + list_move_tail(&qn->node, queued); + } + + return bio; +} + +/* init a service_queue, assumes the caller zeroed it */ +static void throtl_service_queue_init(struct throtl_service_queue *sq) +{ + INIT_LIST_HEAD(&sq->queued[0]); + INIT_LIST_HEAD(&sq->queued[1]); + sq->pending_tree = RB_ROOT_CACHED; + timer_setup(&sq->pending_timer, throtl_pending_timer_fn, 0); +} + +static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, + struct request_queue *q, + struct blkcg *blkcg) +{ + struct throtl_grp *tg; + int rw; + + tg = kzalloc_node(sizeof(*tg), gfp, q->node); + if (!tg) + return NULL; + + if (blkg_rwstat_init(&tg->stat_bytes, gfp)) + goto err_free_tg; + + if (blkg_rwstat_init(&tg->stat_ios, gfp)) + goto err_exit_stat_bytes; + + throtl_service_queue_init(&tg->service_queue); + + for (rw = READ; rw <= WRITE; rw++) { + throtl_qnode_init(&tg->qnode_on_self[rw], tg); + throtl_qnode_init(&tg->qnode_on_parent[rw], tg); + } + + RB_CLEAR_NODE(&tg->rb_node); + tg->bps[READ][LIMIT_MAX] = U64_MAX; + tg->bps[WRITE][LIMIT_MAX] = U64_MAX; + tg->iops[READ][LIMIT_MAX] = UINT_MAX; + tg->iops[WRITE][LIMIT_MAX] = UINT_MAX; + tg->bps_conf[READ][LIMIT_MAX] = U64_MAX; + tg->bps_conf[WRITE][LIMIT_MAX] = U64_MAX; + tg->iops_conf[READ][LIMIT_MAX] = UINT_MAX; + tg->iops_conf[WRITE][LIMIT_MAX] = UINT_MAX; + /* LIMIT_LOW will have default value 0 */ + + tg->latency_target = DFL_LATENCY_TARGET; + tg->latency_target_conf = DFL_LATENCY_TARGET; + tg->idletime_threshold = DFL_IDLE_THRESHOLD; + tg->idletime_threshold_conf = DFL_IDLE_THRESHOLD; + + return &tg->pd; + +err_exit_stat_bytes: + blkg_rwstat_exit(&tg->stat_bytes); +err_free_tg: + kfree(tg); + return NULL; +} + +static void throtl_pd_init(struct blkg_policy_data *pd) +{ + struct throtl_grp *tg = pd_to_tg(pd); + struct blkcg_gq *blkg = tg_to_blkg(tg); + struct throtl_data *td = blkg->q->td; + struct throtl_service_queue *sq = &tg->service_queue; + + /* + * If on the default hierarchy, we switch to properly hierarchical + * behavior where limits on a given throtl_grp are applied to the + * whole subtree rather than just the group itself. e.g. If 16M + * read_bps limit is set on the root group, the whole system can't + * exceed 16M for the device. + * + * If not on the default hierarchy, the broken flat hierarchy + * behavior is retained where all throtl_grps are treated as if + * they're all separate root groups right below throtl_data. + * Limits of a group don't interact with limits of other groups + * regardless of the position of the group in the hierarchy. + */ + sq->parent_sq = &td->service_queue; + if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent) + sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue; + tg->td = td; +} + +/* + * Set has_rules[] if @tg or any of its parents have limits configured. + * This doesn't require walking up to the top of the hierarchy as the + * parent's has_rules[] is guaranteed to be correct. + */ +static void tg_update_has_rules(struct throtl_grp *tg) +{ + struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq); + struct throtl_data *td = tg->td; + int rw; + + for (rw = READ; rw <= WRITE; rw++) + tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) || + (td->limit_valid[td->limit_index] && + (tg_bps_limit(tg, rw) != U64_MAX || + tg_iops_limit(tg, rw) != UINT_MAX)); +} + +static void throtl_pd_online(struct blkg_policy_data *pd) +{ + struct throtl_grp *tg = pd_to_tg(pd); + /* + * We don't want new groups to escape the limits of its ancestors. + * Update has_rules[] after a new group is brought online. + */ + tg_update_has_rules(tg); +} + +static void blk_throtl_update_limit_valid(struct throtl_data *td) +{ + struct cgroup_subsys_state *pos_css; + struct blkcg_gq *blkg; + bool low_valid = false; + + rcu_read_lock(); + blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) { + struct throtl_grp *tg = blkg_to_tg(blkg); + + if (tg->bps[READ][LIMIT_LOW] || tg->bps[WRITE][LIMIT_LOW] || + tg->iops[READ][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) { + low_valid = true; + break; + } + } + rcu_read_unlock(); + + td->limit_valid[LIMIT_LOW] = low_valid; +} + +static void throtl_upgrade_state(struct throtl_data *td); +static void throtl_pd_offline(struct blkg_policy_data *pd) +{ + struct throtl_grp *tg = pd_to_tg(pd); + + tg->bps[READ][LIMIT_LOW] = 0; + tg->bps[WRITE][LIMIT_LOW] = 0; + tg->iops[READ][LIMIT_LOW] = 0; + tg->iops[WRITE][LIMIT_LOW] = 0; + + blk_throtl_update_limit_valid(tg->td); + + if (!tg->td->limit_valid[tg->td->limit_index]) + throtl_upgrade_state(tg->td); +} + +static void throtl_pd_free(struct blkg_policy_data *pd) +{ + struct throtl_grp *tg = pd_to_tg(pd); + + del_timer_sync(&tg->service_queue.pending_timer); + blkg_rwstat_exit(&tg->stat_bytes); + blkg_rwstat_exit(&tg->stat_ios); + kfree(tg); +} + +static struct throtl_grp * +throtl_rb_first(struct throtl_service_queue *parent_sq) +{ + struct rb_node *n; + + n = rb_first_cached(&parent_sq->pending_tree); + WARN_ON_ONCE(!n); + if (!n) + return NULL; + return rb_entry_tg(n); +} + +static void throtl_rb_erase(struct rb_node *n, + struct throtl_service_queue *parent_sq) +{ + rb_erase_cached(n, &parent_sq->pending_tree); + RB_CLEAR_NODE(n); + --parent_sq->nr_pending; +} + +static void update_min_dispatch_time(struct throtl_service_queue *parent_sq) +{ + struct throtl_grp *tg; + + tg = throtl_rb_first(parent_sq); + if (!tg) + return; + + parent_sq->first_pending_disptime = tg->disptime; +} + +static void tg_service_queue_add(struct throtl_grp *tg) +{ + struct throtl_service_queue *parent_sq = tg->service_queue.parent_sq; + struct rb_node **node = &parent_sq->pending_tree.rb_root.rb_node; + struct rb_node *parent = NULL; + struct throtl_grp *__tg; + unsigned long key = tg->disptime; + bool leftmost = true; + + while (*node != NULL) { + parent = *node; + __tg = rb_entry_tg(parent); + + if (time_before(key, __tg->disptime)) + node = &parent->rb_left; + else { + node = &parent->rb_right; + leftmost = false; + } + } + + rb_link_node(&tg->rb_node, parent, node); + rb_insert_color_cached(&tg->rb_node, &parent_sq->pending_tree, + leftmost); +} + +static void throtl_enqueue_tg(struct throtl_grp *tg) +{ + if (!(tg->flags & THROTL_TG_PENDING)) { + tg_service_queue_add(tg); + tg->flags |= THROTL_TG_PENDING; + tg->service_queue.parent_sq->nr_pending++; + } +} + +static void throtl_dequeue_tg(struct throtl_grp *tg) +{ + if (tg->flags & THROTL_TG_PENDING) { + throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq); + tg->flags &= ~THROTL_TG_PENDING; + } +} + +/* Call with queue lock held */ +static void throtl_schedule_pending_timer(struct throtl_service_queue *sq, + unsigned long expires) +{ + unsigned long max_expire = jiffies + 8 * sq_to_td(sq)->throtl_slice; + + /* + * Since we are adjusting the throttle limit dynamically, the sleep + * time calculated according to previous limit might be invalid. It's + * possible the cgroup sleep time is very long and no other cgroups + * have IO running so notify the limit changes. Make sure the cgroup + * doesn't sleep too long to avoid the missed notification. + */ + if (time_after(expires, max_expire)) + expires = max_expire; + mod_timer(&sq->pending_timer, expires); + throtl_log(sq, "schedule timer. delay=%lu jiffies=%lu", + expires - jiffies, jiffies); +} + +/** + * throtl_schedule_next_dispatch - schedule the next dispatch cycle + * @sq: the service_queue to schedule dispatch for + * @force: force scheduling + * + * Arm @sq->pending_timer so that the next dispatch cycle starts on the + * dispatch time of the first pending child. Returns %true if either timer + * is armed or there's no pending child left. %false if the current + * dispatch window is still open and the caller should continue + * dispatching. + * + * If @force is %true, the dispatch timer is always scheduled and this + * function is guaranteed to return %true. This is to be used when the + * caller can't dispatch itself and needs to invoke pending_timer + * unconditionally. Note that forced scheduling is likely to induce short + * delay before dispatch starts even if @sq->first_pending_disptime is not + * in the future and thus shouldn't be used in hot paths. + */ +static bool throtl_schedule_next_dispatch(struct throtl_service_queue *sq, + bool force) +{ + /* any pending children left? */ + if (!sq->nr_pending) + return true; + + update_min_dispatch_time(sq); + + /* is the next dispatch time in the future? */ + if (force || time_after(sq->first_pending_disptime, jiffies)) { + throtl_schedule_pending_timer(sq, sq->first_pending_disptime); + return true; + } + + /* tell the caller to continue dispatching */ + return false; +} + +static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg, + bool rw, unsigned long start) +{ + tg->bytes_disp[rw] = 0; + tg->io_disp[rw] = 0; + + atomic_set(&tg->io_split_cnt[rw], 0); + + /* + * Previous slice has expired. We must have trimmed it after last + * bio dispatch. That means since start of last slice, we never used + * that bandwidth. Do try to make use of that bandwidth while giving + * credit. + */ + if (time_after_eq(start, tg->slice_start[rw])) + tg->slice_start[rw] = start; + + tg->slice_end[rw] = jiffies + tg->td->throtl_slice; + throtl_log(&tg->service_queue, + "[%c] new slice with credit start=%lu end=%lu jiffies=%lu", + rw == READ ? 'R' : 'W', tg->slice_start[rw], + tg->slice_end[rw], jiffies); +} + +static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw) +{ + tg->bytes_disp[rw] = 0; + tg->io_disp[rw] = 0; + tg->slice_start[rw] = jiffies; + tg->slice_end[rw] = jiffies + tg->td->throtl_slice; + + atomic_set(&tg->io_split_cnt[rw], 0); + + throtl_log(&tg->service_queue, + "[%c] new slice start=%lu end=%lu jiffies=%lu", + rw == READ ? 'R' : 'W', tg->slice_start[rw], + tg->slice_end[rw], jiffies); +} + +static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw, + unsigned long jiffy_end) +{ + tg->slice_end[rw] = roundup(jiffy_end, tg->td->throtl_slice); +} + +static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw, + unsigned long jiffy_end) +{ + throtl_set_slice_end(tg, rw, jiffy_end); + throtl_log(&tg->service_queue, + "[%c] extend slice start=%lu end=%lu jiffies=%lu", + rw == READ ? 'R' : 'W', tg->slice_start[rw], + tg->slice_end[rw], jiffies); +} + +/* Determine if previously allocated or extended slice is complete or not */ +static bool throtl_slice_used(struct throtl_grp *tg, bool rw) +{ + if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw])) + return false; + + return true; +} + +/* Trim the used slices and adjust slice start accordingly */ +static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) +{ + unsigned long nr_slices, time_elapsed, io_trim; + u64 bytes_trim, tmp; + + BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw])); + + /* + * If bps are unlimited (-1), then time slice don't get + * renewed. Don't try to trim the slice if slice is used. A new + * slice will start when appropriate. + */ + if (throtl_slice_used(tg, rw)) + return; + + /* + * A bio has been dispatched. Also adjust slice_end. It might happen + * that initially cgroup limit was very low resulting in high + * slice_end, but later limit was bumped up and bio was dispatched + * sooner, then we need to reduce slice_end. A high bogus slice_end + * is bad because it does not allow new slice to start. + */ + + throtl_set_slice_end(tg, rw, jiffies + tg->td->throtl_slice); + + time_elapsed = jiffies - tg->slice_start[rw]; + + nr_slices = time_elapsed / tg->td->throtl_slice; + + if (!nr_slices) + return; + tmp = tg_bps_limit(tg, rw) * tg->td->throtl_slice * nr_slices; + do_div(tmp, HZ); + bytes_trim = tmp; + + io_trim = (tg_iops_limit(tg, rw) * tg->td->throtl_slice * nr_slices) / + HZ; + + if (!bytes_trim && !io_trim) + return; + + if (tg->bytes_disp[rw] >= bytes_trim) + tg->bytes_disp[rw] -= bytes_trim; + else + tg->bytes_disp[rw] = 0; + + if (tg->io_disp[rw] >= io_trim) + tg->io_disp[rw] -= io_trim; + else + tg->io_disp[rw] = 0; + + tg->slice_start[rw] += nr_slices * tg->td->throtl_slice; + + throtl_log(&tg->service_queue, + "[%c] trim slice nr=%lu bytes=%llu io=%lu start=%lu end=%lu jiffies=%lu", + rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim, + tg->slice_start[rw], tg->slice_end[rw], jiffies); +} + +static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, + u32 iops_limit, unsigned long *wait) +{ + bool rw = bio_data_dir(bio); + unsigned int io_allowed; + unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; + u64 tmp; + + if (iops_limit == UINT_MAX) { + if (wait) + *wait = 0; + return true; + } + + jiffy_elapsed = jiffies - tg->slice_start[rw]; + + /* Round up to the next throttle slice, wait time must be nonzero */ + jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice); + + /* + * jiffy_elapsed_rnd should not be a big value as minimum iops can be + * 1 then at max jiffy elapsed should be equivalent of 1 second as we + * will allow dispatch after 1 second and after that slice should + * have been trimmed. + */ + + tmp = (u64)iops_limit * jiffy_elapsed_rnd; + do_div(tmp, HZ); + + if (tmp > UINT_MAX) + io_allowed = UINT_MAX; + else + io_allowed = tmp; + + if (tg->io_disp[rw] + 1 <= io_allowed) { + if (wait) + *wait = 0; + return true; + } + + /* Calc approx time to dispatch */ + jiffy_wait = jiffy_elapsed_rnd - jiffy_elapsed; + + if (wait) + *wait = jiffy_wait; + return false; +} + +static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio, + u64 bps_limit, unsigned long *wait) +{ + bool rw = bio_data_dir(bio); + u64 bytes_allowed, extra_bytes; + unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; + unsigned int bio_size = throtl_bio_data_size(bio); + + if (bps_limit == U64_MAX) { + if (wait) + *wait = 0; + return true; + } + + jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; + + /* Slice has just started. Consider one slice interval */ + if (!jiffy_elapsed) + jiffy_elapsed_rnd = tg->td->throtl_slice; + + jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice); + bytes_allowed = mul_u64_u64_div_u64(bps_limit, (u64)jiffy_elapsed_rnd, + (u64)HZ); + + if (tg->bytes_disp[rw] + bio_size <= bytes_allowed) { + if (wait) + *wait = 0; + return true; + } + + /* Calc approx time to dispatch */ + extra_bytes = tg->bytes_disp[rw] + bio_size - bytes_allowed; + jiffy_wait = div64_u64(extra_bytes * HZ, bps_limit); + + if (!jiffy_wait) + jiffy_wait = 1; + + /* + * This wait time is without taking into consideration the rounding + * up we did. Add that time also. + */ + jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed); + if (wait) + *wait = jiffy_wait; + return false; +} + +/* + * Returns whether one can dispatch a bio or not. Also returns approx number + * of jiffies to wait before this bio is with-in IO rate and can be dispatched + */ +static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, + unsigned long *wait) +{ + bool rw = bio_data_dir(bio); + unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0; + u64 bps_limit = tg_bps_limit(tg, rw); + u32 iops_limit = tg_iops_limit(tg, rw); + + /* + * Currently whole state machine of group depends on first bio + * queued in the group bio list. So one should not be calling + * this function with a different bio if there are other bios + * queued. + */ + BUG_ON(tg->service_queue.nr_queued[rw] && + bio != throtl_peek_queued(&tg->service_queue.queued[rw])); + + /* If tg->bps = -1, then BW is unlimited */ + if (bps_limit == U64_MAX && iops_limit == UINT_MAX) { + if (wait) + *wait = 0; + return true; + } + + /* + * If previous slice expired, start a new one otherwise renew/extend + * existing slice to make sure it is at least throtl_slice interval + * long since now. New slice is started only for empty throttle group. + * If there is queued bio, that means there should be an active + * slice and it should be extended instead. + */ + if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw])) + throtl_start_new_slice(tg, rw); + else { + if (time_before(tg->slice_end[rw], + jiffies + tg->td->throtl_slice)) + throtl_extend_slice(tg, rw, + jiffies + tg->td->throtl_slice); + } + + if (iops_limit != UINT_MAX) + tg->io_disp[rw] += atomic_xchg(&tg->io_split_cnt[rw], 0); + + if (tg_with_in_bps_limit(tg, bio, bps_limit, &bps_wait) && + tg_with_in_iops_limit(tg, bio, iops_limit, &iops_wait)) { + if (wait) + *wait = 0; + return true; + } + + max_wait = max(bps_wait, iops_wait); + + if (wait) + *wait = max_wait; + + if (time_before(tg->slice_end[rw], jiffies + max_wait)) + throtl_extend_slice(tg, rw, jiffies + max_wait); + + return false; +} + +static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) +{ + bool rw = bio_data_dir(bio); + unsigned int bio_size = throtl_bio_data_size(bio); + + /* Charge the bio to the group */ + tg->bytes_disp[rw] += bio_size; + tg->io_disp[rw]++; + tg->last_bytes_disp[rw] += bio_size; + tg->last_io_disp[rw]++; + + /* + * BIO_THROTTLED is used to prevent the same bio to be throttled + * more than once as a throttled bio will go through blk-throtl the + * second time when it eventually gets issued. Set it when a bio + * is being charged to a tg. + */ + if (!bio_flagged(bio, BIO_THROTTLED)) + bio_set_flag(bio, BIO_THROTTLED); +} + +/** + * throtl_add_bio_tg - add a bio to the specified throtl_grp + * @bio: bio to add + * @qn: qnode to use + * @tg: the target throtl_grp + * + * Add @bio to @tg's service_queue using @qn. If @qn is not specified, + * tg->qnode_on_self[] is used. + */ +static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn, + struct throtl_grp *tg) +{ + struct throtl_service_queue *sq = &tg->service_queue; + bool rw = bio_data_dir(bio); + + if (!qn) + qn = &tg->qnode_on_self[rw]; + + /* + * If @tg doesn't currently have any bios queued in the same + * direction, queueing @bio can change when @tg should be + * dispatched. Mark that @tg was empty. This is automatically + * cleared on the next tg_update_disptime(). + */ + if (!sq->nr_queued[rw]) + tg->flags |= THROTL_TG_WAS_EMPTY; + + throtl_qnode_add_bio(bio, qn, &sq->queued[rw]); + + sq->nr_queued[rw]++; + throtl_enqueue_tg(tg); +} + +static void tg_update_disptime(struct throtl_grp *tg) +{ + struct throtl_service_queue *sq = &tg->service_queue; + unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime; + struct bio *bio; + + bio = throtl_peek_queued(&sq->queued[READ]); + if (bio) + tg_may_dispatch(tg, bio, &read_wait); + + bio = throtl_peek_queued(&sq->queued[WRITE]); + if (bio) + tg_may_dispatch(tg, bio, &write_wait); + + min_wait = min(read_wait, write_wait); + disptime = jiffies + min_wait; + + /* Update dispatch time */ + throtl_dequeue_tg(tg); + tg->disptime = disptime; + throtl_enqueue_tg(tg); + + /* see throtl_add_bio_tg() */ + tg->flags &= ~THROTL_TG_WAS_EMPTY; +} + +static void start_parent_slice_with_credit(struct throtl_grp *child_tg, + struct throtl_grp *parent_tg, bool rw) +{ + if (throtl_slice_used(parent_tg, rw)) { + throtl_start_new_slice_with_credit(parent_tg, rw, + child_tg->slice_start[rw]); + } + +} + +static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw) +{ + struct throtl_service_queue *sq = &tg->service_queue; + struct throtl_service_queue *parent_sq = sq->parent_sq; + struct throtl_grp *parent_tg = sq_to_tg(parent_sq); + struct throtl_grp *tg_to_put = NULL; + struct bio *bio; + + /* + * @bio is being transferred from @tg to @parent_sq. Popping a bio + * from @tg may put its reference and @parent_sq might end up + * getting released prematurely. Remember the tg to put and put it + * after @bio is transferred to @parent_sq. + */ + bio = throtl_pop_queued(&sq->queued[rw], &tg_to_put); + sq->nr_queued[rw]--; + + throtl_charge_bio(tg, bio); + + /* + * If our parent is another tg, we just need to transfer @bio to + * the parent using throtl_add_bio_tg(). If our parent is + * @td->service_queue, @bio is ready to be issued. Put it on its + * bio_lists[] and decrease total number queued. The caller is + * responsible for issuing these bios. + */ + if (parent_tg) { + throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg); + start_parent_slice_with_credit(tg, parent_tg, rw); + } else { + throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw], + &parent_sq->queued[rw]); + BUG_ON(tg->td->nr_queued[rw] <= 0); + tg->td->nr_queued[rw]--; + } + + throtl_trim_slice(tg, rw); + + if (tg_to_put) + blkg_put(tg_to_blkg(tg_to_put)); +} + +static int throtl_dispatch_tg(struct throtl_grp *tg) +{ + struct throtl_service_queue *sq = &tg->service_queue; + unsigned int nr_reads = 0, nr_writes = 0; + unsigned int max_nr_reads = THROTL_GRP_QUANTUM * 3 / 4; + unsigned int max_nr_writes = THROTL_GRP_QUANTUM - max_nr_reads; + struct bio *bio; + + /* Try to dispatch 75% READS and 25% WRITES */ + + while ((bio = throtl_peek_queued(&sq->queued[READ])) && + tg_may_dispatch(tg, bio, NULL)) { + + tg_dispatch_one_bio(tg, bio_data_dir(bio)); + nr_reads++; + + if (nr_reads >= max_nr_reads) + break; + } + + while ((bio = throtl_peek_queued(&sq->queued[WRITE])) && + tg_may_dispatch(tg, bio, NULL)) { + + tg_dispatch_one_bio(tg, bio_data_dir(bio)); + nr_writes++; + + if (nr_writes >= max_nr_writes) + break; + } + + return nr_reads + nr_writes; +} + +static int throtl_select_dispatch(struct throtl_service_queue *parent_sq) +{ + unsigned int nr_disp = 0; + + while (1) { + struct throtl_grp *tg; + struct throtl_service_queue *sq; + + if (!parent_sq->nr_pending) + break; + + tg = throtl_rb_first(parent_sq); + if (!tg) + break; + + if (time_before(jiffies, tg->disptime)) + break; + + throtl_dequeue_tg(tg); + + nr_disp += throtl_dispatch_tg(tg); + + sq = &tg->service_queue; + if (sq->nr_queued[0] || sq->nr_queued[1]) + tg_update_disptime(tg); + + if (nr_disp >= THROTL_QUANTUM) + break; + } + + return nr_disp; +} + +static bool throtl_can_upgrade(struct throtl_data *td, + struct throtl_grp *this_tg); +/** + * throtl_pending_timer_fn - timer function for service_queue->pending_timer + * @t: the pending_timer member of the throtl_service_queue being serviced + * + * This timer is armed when a child throtl_grp with active bio's become + * pending and queued on the service_queue's pending_tree and expires when + * the first child throtl_grp should be dispatched. This function + * dispatches bio's from the children throtl_grps to the parent + * service_queue. + * + * If the parent's parent is another throtl_grp, dispatching is propagated + * by either arming its pending_timer or repeating dispatch directly. If + * the top-level service_tree is reached, throtl_data->dispatch_work is + * kicked so that the ready bio's are issued. + */ +static void throtl_pending_timer_fn(struct timer_list *t) +{ + struct throtl_service_queue *sq = from_timer(sq, t, pending_timer); + struct throtl_grp *tg = sq_to_tg(sq); + struct throtl_data *td = sq_to_td(sq); + struct request_queue *q = td->queue; + struct throtl_service_queue *parent_sq; + bool dispatched; + int ret; + + spin_lock_irq(&q->queue_lock); + if (throtl_can_upgrade(td, NULL)) + throtl_upgrade_state(td); + +again: + parent_sq = sq->parent_sq; + dispatched = false; + + while (true) { + throtl_log(sq, "dispatch nr_queued=%u read=%u write=%u", + sq->nr_queued[READ] + sq->nr_queued[WRITE], + sq->nr_queued[READ], sq->nr_queued[WRITE]); + + ret = throtl_select_dispatch(sq); + if (ret) { + throtl_log(sq, "bios disp=%u", ret); + dispatched = true; + } + + if (throtl_schedule_next_dispatch(sq, false)) + break; + + /* this dispatch windows is still open, relax and repeat */ + spin_unlock_irq(&q->queue_lock); + cpu_relax(); + spin_lock_irq(&q->queue_lock); + } + + if (!dispatched) + goto out_unlock; + + if (parent_sq) { + /* @parent_sq is another throl_grp, propagate dispatch */ + if (tg->flags & THROTL_TG_WAS_EMPTY) { + tg_update_disptime(tg); + if (!throtl_schedule_next_dispatch(parent_sq, false)) { + /* window is already open, repeat dispatching */ + sq = parent_sq; + tg = sq_to_tg(sq); + goto again; + } + } + } else { + /* reached the top-level, queue issuing */ + queue_work(kthrotld_workqueue, &td->dispatch_work); + } +out_unlock: + spin_unlock_irq(&q->queue_lock); +} + +/** + * blk_throtl_dispatch_work_fn - work function for throtl_data->dispatch_work + * @work: work item being executed + * + * This function is queued for execution when bios reach the bio_lists[] + * of throtl_data->service_queue. Those bios are ready and issued by this + * function. + */ +static void blk_throtl_dispatch_work_fn(struct work_struct *work) +{ + struct throtl_data *td = container_of(work, struct throtl_data, + dispatch_work); + struct throtl_service_queue *td_sq = &td->service_queue; + struct request_queue *q = td->queue; + struct bio_list bio_list_on_stack; + struct bio *bio; + struct blk_plug plug; + int rw; + + bio_list_init(&bio_list_on_stack); + + spin_lock_irq(&q->queue_lock); + for (rw = READ; rw <= WRITE; rw++) + while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL))) + bio_list_add(&bio_list_on_stack, bio); + spin_unlock_irq(&q->queue_lock); + + if (!bio_list_empty(&bio_list_on_stack)) { + blk_start_plug(&plug); + while ((bio = bio_list_pop(&bio_list_on_stack))) + submit_bio_noacct(bio); + blk_finish_plug(&plug); + } +} + +static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd, + int off) +{ + struct throtl_grp *tg = pd_to_tg(pd); + u64 v = *(u64 *)((void *)tg + off); + + if (v == U64_MAX) + return 0; + return __blkg_prfill_u64(sf, pd, v); +} + +static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd, + int off) +{ + struct throtl_grp *tg = pd_to_tg(pd); + unsigned int v = *(unsigned int *)((void *)tg + off); + + if (v == UINT_MAX) + return 0; + return __blkg_prfill_u64(sf, pd, v); +} + +static int tg_print_conf_u64(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_u64, + &blkcg_policy_throtl, seq_cft(sf)->private, false); + return 0; +} + +static int tg_print_conf_uint(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_uint, + &blkcg_policy_throtl, seq_cft(sf)->private, false); + return 0; +} + +static void tg_conf_updated(struct throtl_grp *tg, bool global) +{ + struct throtl_service_queue *sq = &tg->service_queue; + struct cgroup_subsys_state *pos_css; + struct blkcg_gq *blkg; + + throtl_log(&tg->service_queue, + "limit change rbps=%llu wbps=%llu riops=%u wiops=%u", + tg_bps_limit(tg, READ), tg_bps_limit(tg, WRITE), + tg_iops_limit(tg, READ), tg_iops_limit(tg, WRITE)); + + rcu_read_lock(); + /* + * Update has_rules[] flags for the updated tg's subtree. A tg is + * considered to have rules if either the tg itself or any of its + * ancestors has rules. This identifies groups without any + * restrictions in the whole hierarchy and allows them to bypass + * blk-throttle. + */ + blkg_for_each_descendant_pre(blkg, pos_css, + global ? tg->td->queue->root_blkg : tg_to_blkg(tg)) { + struct throtl_grp *this_tg = blkg_to_tg(blkg); + struct throtl_grp *parent_tg; + + tg_update_has_rules(this_tg); + /* ignore root/second level */ + if (!cgroup_subsys_on_dfl(io_cgrp_subsys) || !blkg->parent || + !blkg->parent->parent) + continue; + parent_tg = blkg_to_tg(blkg->parent); + /* + * make sure all children has lower idle time threshold and + * higher latency target + */ + this_tg->idletime_threshold = min(this_tg->idletime_threshold, + parent_tg->idletime_threshold); + this_tg->latency_target = max(this_tg->latency_target, + parent_tg->latency_target); + } + rcu_read_unlock(); + + /* + * We're already holding queue_lock and know @tg is valid. Let's + * apply the new config directly. + * + * Restart the slices for both READ and WRITES. It might happen + * that a group's limit are dropped suddenly and we don't want to + * account recently dispatched IO with new low rate. + */ + throtl_start_new_slice(tg, READ); + throtl_start_new_slice(tg, WRITE); + + if (tg->flags & THROTL_TG_PENDING) { + tg_update_disptime(tg); + throtl_schedule_next_dispatch(sq->parent_sq, true); + } +} + +static ssize_t tg_set_conf(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off, bool is_u64) +{ + struct blkcg *blkcg = css_to_blkcg(of_css(of)); + struct blkg_conf_ctx ctx; + struct throtl_grp *tg; + int ret; + u64 v; + + ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); + if (ret) + return ret; + + ret = -EINVAL; + if (sscanf(ctx.body, "%llu", &v) != 1) + goto out_finish; + if (!v) + v = U64_MAX; + + tg = blkg_to_tg(ctx.blkg); + + if (is_u64) + *(u64 *)((void *)tg + of_cft(of)->private) = v; + else + *(unsigned int *)((void *)tg + of_cft(of)->private) = v; + + tg_conf_updated(tg, false); + ret = 0; +out_finish: + blkg_conf_finish(&ctx); + return ret ?: nbytes; +} + +static ssize_t tg_set_conf_u64(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + return tg_set_conf(of, buf, nbytes, off, true); +} + +static ssize_t tg_set_conf_uint(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + return tg_set_conf(of, buf, nbytes, off, false); +} + +static int tg_print_rwstat(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + blkg_prfill_rwstat, &blkcg_policy_throtl, + seq_cft(sf)->private, true); + return 0; +} + +static u64 tg_prfill_rwstat_recursive(struct seq_file *sf, + struct blkg_policy_data *pd, int off) +{ + struct blkg_rwstat_sample sum; + + blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_throtl, off, + &sum); + return __blkg_prfill_rwstat(sf, pd, &sum); +} + +static int tg_print_rwstat_recursive(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), + tg_prfill_rwstat_recursive, &blkcg_policy_throtl, + seq_cft(sf)->private, true); + return 0; +} + +static struct cftype throtl_legacy_files[] = { + { + .name = "throttle.read_bps_device", + .private = offsetof(struct throtl_grp, bps[READ][LIMIT_MAX]), + .seq_show = tg_print_conf_u64, + .write = tg_set_conf_u64, + }, + { + .name = "throttle.write_bps_device", + .private = offsetof(struct throtl_grp, bps[WRITE][LIMIT_MAX]), + .seq_show = tg_print_conf_u64, + .write = tg_set_conf_u64, + }, + { + .name = "throttle.read_iops_device", + .private = offsetof(struct throtl_grp, iops[READ][LIMIT_MAX]), + .seq_show = tg_print_conf_uint, + .write = tg_set_conf_uint, + }, + { + .name = "throttle.write_iops_device", + .private = offsetof(struct throtl_grp, iops[WRITE][LIMIT_MAX]), + .seq_show = tg_print_conf_uint, + .write = tg_set_conf_uint, + }, + { + .name = "throttle.io_service_bytes", + .private = offsetof(struct throtl_grp, stat_bytes), + .seq_show = tg_print_rwstat, + }, + { + .name = "throttle.io_service_bytes_recursive", + .private = offsetof(struct throtl_grp, stat_bytes), + .seq_show = tg_print_rwstat_recursive, + }, + { + .name = "throttle.io_serviced", + .private = offsetof(struct throtl_grp, stat_ios), + .seq_show = tg_print_rwstat, + }, + { + .name = "throttle.io_serviced_recursive", + .private = offsetof(struct throtl_grp, stat_ios), + .seq_show = tg_print_rwstat_recursive, + }, + { } /* terminate */ +}; + +static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd, + int off) +{ + struct throtl_grp *tg = pd_to_tg(pd); + const char *dname = blkg_dev_name(pd->blkg); + char bufs[4][21] = { "max", "max", "max", "max" }; + u64 bps_dft; + unsigned int iops_dft; + char idle_time[26] = ""; + char latency_time[26] = ""; + + if (!dname) + return 0; + + if (off == LIMIT_LOW) { + bps_dft = 0; + iops_dft = 0; + } else { + bps_dft = U64_MAX; + iops_dft = UINT_MAX; + } + + if (tg->bps_conf[READ][off] == bps_dft && + tg->bps_conf[WRITE][off] == bps_dft && + tg->iops_conf[READ][off] == iops_dft && + tg->iops_conf[WRITE][off] == iops_dft && + (off != LIMIT_LOW || + (tg->idletime_threshold_conf == DFL_IDLE_THRESHOLD && + tg->latency_target_conf == DFL_LATENCY_TARGET))) + return 0; + + if (tg->bps_conf[READ][off] != U64_MAX) + snprintf(bufs[0], sizeof(bufs[0]), "%llu", + tg->bps_conf[READ][off]); + if (tg->bps_conf[WRITE][off] != U64_MAX) + snprintf(bufs[1], sizeof(bufs[1]), "%llu", + tg->bps_conf[WRITE][off]); + if (tg->iops_conf[READ][off] != UINT_MAX) + snprintf(bufs[2], sizeof(bufs[2]), "%u", + tg->iops_conf[READ][off]); + if (tg->iops_conf[WRITE][off] != UINT_MAX) + snprintf(bufs[3], sizeof(bufs[3]), "%u", + tg->iops_conf[WRITE][off]); + if (off == LIMIT_LOW) { + if (tg->idletime_threshold_conf == ULONG_MAX) + strcpy(idle_time, " idle=max"); + else + snprintf(idle_time, sizeof(idle_time), " idle=%lu", + tg->idletime_threshold_conf); + + if (tg->latency_target_conf == ULONG_MAX) + strcpy(latency_time, " latency=max"); + else + snprintf(latency_time, sizeof(latency_time), + " latency=%lu", tg->latency_target_conf); + } + + seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s%s%s\n", + dname, bufs[0], bufs[1], bufs[2], bufs[3], idle_time, + latency_time); + return 0; +} + +static int tg_print_limit(struct seq_file *sf, void *v) +{ + blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_limit, + &blkcg_policy_throtl, seq_cft(sf)->private, false); + return 0; +} + +static ssize_t tg_set_limit(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct blkcg *blkcg = css_to_blkcg(of_css(of)); + struct blkg_conf_ctx ctx; + struct throtl_grp *tg; + u64 v[4]; + unsigned long idle_time; + unsigned long latency_time; + int ret; + int index = of_cft(of)->private; + + ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); + if (ret) + return ret; + + tg = blkg_to_tg(ctx.blkg); + + v[0] = tg->bps_conf[READ][index]; + v[1] = tg->bps_conf[WRITE][index]; + v[2] = tg->iops_conf[READ][index]; + v[3] = tg->iops_conf[WRITE][index]; + + idle_time = tg->idletime_threshold_conf; + latency_time = tg->latency_target_conf; + while (true) { + char tok[27]; /* wiops=18446744073709551616 */ + char *p; + u64 val = U64_MAX; + int len; + + if (sscanf(ctx.body, "%26s%n", tok, &len) != 1) + break; + if (tok[0] == '\0') + break; + ctx.body += len; + + ret = -EINVAL; + p = tok; + strsep(&p, "="); + if (!p || (sscanf(p, "%llu", &val) != 1 && strcmp(p, "max"))) + goto out_finish; + + ret = -ERANGE; + if (!val) + goto out_finish; + + ret = -EINVAL; + if (!strcmp(tok, "rbps") && val > 1) + v[0] = val; + else if (!strcmp(tok, "wbps") && val > 1) + v[1] = val; + else if (!strcmp(tok, "riops") && val > 1) + v[2] = min_t(u64, val, UINT_MAX); + else if (!strcmp(tok, "wiops") && val > 1) + v[3] = min_t(u64, val, UINT_MAX); + else if (off == LIMIT_LOW && !strcmp(tok, "idle")) + idle_time = val; + else if (off == LIMIT_LOW && !strcmp(tok, "latency")) + latency_time = val; + else + goto out_finish; + } + + tg->bps_conf[READ][index] = v[0]; + tg->bps_conf[WRITE][index] = v[1]; + tg->iops_conf[READ][index] = v[2]; + tg->iops_conf[WRITE][index] = v[3]; + + if (index == LIMIT_MAX) { + tg->bps[READ][index] = v[0]; + tg->bps[WRITE][index] = v[1]; + tg->iops[READ][index] = v[2]; + tg->iops[WRITE][index] = v[3]; + } + tg->bps[READ][LIMIT_LOW] = min(tg->bps_conf[READ][LIMIT_LOW], + tg->bps_conf[READ][LIMIT_MAX]); + tg->bps[WRITE][LIMIT_LOW] = min(tg->bps_conf[WRITE][LIMIT_LOW], + tg->bps_conf[WRITE][LIMIT_MAX]); + tg->iops[READ][LIMIT_LOW] = min(tg->iops_conf[READ][LIMIT_LOW], + tg->iops_conf[READ][LIMIT_MAX]); + tg->iops[WRITE][LIMIT_LOW] = min(tg->iops_conf[WRITE][LIMIT_LOW], + tg->iops_conf[WRITE][LIMIT_MAX]); + tg->idletime_threshold_conf = idle_time; + tg->latency_target_conf = latency_time; + + /* force user to configure all settings for low limit */ + if (!(tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW] || + tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) || + tg->idletime_threshold_conf == DFL_IDLE_THRESHOLD || + tg->latency_target_conf == DFL_LATENCY_TARGET) { + tg->bps[READ][LIMIT_LOW] = 0; + tg->bps[WRITE][LIMIT_LOW] = 0; + tg->iops[READ][LIMIT_LOW] = 0; + tg->iops[WRITE][LIMIT_LOW] = 0; + tg->idletime_threshold = DFL_IDLE_THRESHOLD; + tg->latency_target = DFL_LATENCY_TARGET; + } else if (index == LIMIT_LOW) { + tg->idletime_threshold = tg->idletime_threshold_conf; + tg->latency_target = tg->latency_target_conf; + } + + blk_throtl_update_limit_valid(tg->td); + if (tg->td->limit_valid[LIMIT_LOW]) { + if (index == LIMIT_LOW) + tg->td->limit_index = LIMIT_LOW; + } else + tg->td->limit_index = LIMIT_MAX; + tg_conf_updated(tg, index == LIMIT_LOW && + tg->td->limit_valid[LIMIT_LOW]); + ret = 0; +out_finish: + blkg_conf_finish(&ctx); + return ret ?: nbytes; +} + +static struct cftype throtl_files[] = { +#ifdef CONFIG_BLK_DEV_THROTTLING_LOW + { + .name = "low", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = tg_print_limit, + .write = tg_set_limit, + .private = LIMIT_LOW, + }, +#endif + { + .name = "max", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = tg_print_limit, + .write = tg_set_limit, + .private = LIMIT_MAX, + }, + { } /* terminate */ +}; + +static void throtl_shutdown_wq(struct request_queue *q) +{ + struct throtl_data *td = q->td; + + cancel_work_sync(&td->dispatch_work); +} + +static struct blkcg_policy blkcg_policy_throtl = { + .dfl_cftypes = throtl_files, + .legacy_cftypes = throtl_legacy_files, + + .pd_alloc_fn = throtl_pd_alloc, + .pd_init_fn = throtl_pd_init, + .pd_online_fn = throtl_pd_online, + .pd_offline_fn = throtl_pd_offline, + .pd_free_fn = throtl_pd_free, +}; + +static unsigned long __tg_last_low_overflow_time(struct throtl_grp *tg) +{ + unsigned long rtime = jiffies, wtime = jiffies; + + if (tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW]) + rtime = tg->last_low_overflow_time[READ]; + if (tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) + wtime = tg->last_low_overflow_time[WRITE]; + return min(rtime, wtime); +} + +/* tg should not be an intermediate node */ +static unsigned long tg_last_low_overflow_time(struct throtl_grp *tg) +{ + struct throtl_service_queue *parent_sq; + struct throtl_grp *parent = tg; + unsigned long ret = __tg_last_low_overflow_time(tg); + + while (true) { + parent_sq = parent->service_queue.parent_sq; + parent = sq_to_tg(parent_sq); + if (!parent) + break; + + /* + * The parent doesn't have low limit, it always reaches low + * limit. Its overflow time is useless for children + */ + if (!parent->bps[READ][LIMIT_LOW] && + !parent->iops[READ][LIMIT_LOW] && + !parent->bps[WRITE][LIMIT_LOW] && + !parent->iops[WRITE][LIMIT_LOW]) + continue; + if (time_after(__tg_last_low_overflow_time(parent), ret)) + ret = __tg_last_low_overflow_time(parent); + } + return ret; +} + +static bool throtl_tg_is_idle(struct throtl_grp *tg) +{ + /* + * cgroup is idle if: + * - single idle is too long, longer than a fixed value (in case user + * configure a too big threshold) or 4 times of idletime threshold + * - average think time is more than threshold + * - IO latency is largely below threshold + */ + unsigned long time; + bool ret; + + time = min_t(unsigned long, MAX_IDLE_TIME, 4 * tg->idletime_threshold); + ret = tg->latency_target == DFL_LATENCY_TARGET || + tg->idletime_threshold == DFL_IDLE_THRESHOLD || + (ktime_get_ns() >> 10) - tg->last_finish_time > time || + tg->avg_idletime > tg->idletime_threshold || + (tg->latency_target && tg->bio_cnt && + tg->bad_bio_cnt * 5 < tg->bio_cnt); + throtl_log(&tg->service_queue, + "avg_idle=%ld, idle_threshold=%ld, bad_bio=%d, total_bio=%d, is_idle=%d, scale=%d", + tg->avg_idletime, tg->idletime_threshold, tg->bad_bio_cnt, + tg->bio_cnt, ret, tg->td->scale); + return ret; +} + +static bool throtl_tg_can_upgrade(struct throtl_grp *tg) +{ + struct throtl_service_queue *sq = &tg->service_queue; + bool read_limit, write_limit; + + /* + * if cgroup reaches low limit (if low limit is 0, the cgroup always + * reaches), it's ok to upgrade to next limit + */ + read_limit = tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW]; + write_limit = tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]; + if (!read_limit && !write_limit) + return true; + if (read_limit && sq->nr_queued[READ] && + (!write_limit || sq->nr_queued[WRITE])) + return true; + if (write_limit && sq->nr_queued[WRITE] && + (!read_limit || sq->nr_queued[READ])) + return true; + + if (time_after_eq(jiffies, + tg_last_low_overflow_time(tg) + tg->td->throtl_slice) && + throtl_tg_is_idle(tg)) + return true; + return false; +} + +static bool throtl_hierarchy_can_upgrade(struct throtl_grp *tg) +{ + while (true) { + if (throtl_tg_can_upgrade(tg)) + return true; + tg = sq_to_tg(tg->service_queue.parent_sq); + if (!tg || !tg_to_blkg(tg)->parent) + return false; + } + return false; +} + +static bool throtl_can_upgrade(struct throtl_data *td, + struct throtl_grp *this_tg) +{ + struct cgroup_subsys_state *pos_css; + struct blkcg_gq *blkg; + + if (td->limit_index != LIMIT_LOW) + return false; + + if (time_before(jiffies, td->low_downgrade_time + td->throtl_slice)) + return false; + + rcu_read_lock(); + blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) { + struct throtl_grp *tg = blkg_to_tg(blkg); + + if (tg == this_tg) + continue; + if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children)) + continue; + if (!throtl_hierarchy_can_upgrade(tg)) { + rcu_read_unlock(); + return false; + } + } + rcu_read_unlock(); + return true; +} + +static void throtl_upgrade_check(struct throtl_grp *tg) +{ + unsigned long now = jiffies; + + if (tg->td->limit_index != LIMIT_LOW) + return; + + if (time_after(tg->last_check_time + tg->td->throtl_slice, now)) + return; + + tg->last_check_time = now; + + if (!time_after_eq(now, + __tg_last_low_overflow_time(tg) + tg->td->throtl_slice)) + return; + + if (throtl_can_upgrade(tg->td, NULL)) + throtl_upgrade_state(tg->td); +} + +static void throtl_upgrade_state(struct throtl_data *td) +{ + struct cgroup_subsys_state *pos_css; + struct blkcg_gq *blkg; + + throtl_log(&td->service_queue, "upgrade to max"); + td->limit_index = LIMIT_MAX; + td->low_upgrade_time = jiffies; + td->scale = 0; + rcu_read_lock(); + blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) { + struct throtl_grp *tg = blkg_to_tg(blkg); + struct throtl_service_queue *sq = &tg->service_queue; + + tg->disptime = jiffies - 1; + throtl_select_dispatch(sq); + throtl_schedule_next_dispatch(sq, true); + } + rcu_read_unlock(); + throtl_select_dispatch(&td->service_queue); + throtl_schedule_next_dispatch(&td->service_queue, true); + queue_work(kthrotld_workqueue, &td->dispatch_work); +} + +static void throtl_downgrade_state(struct throtl_data *td) +{ + td->scale /= 2; + + throtl_log(&td->service_queue, "downgrade, scale %d", td->scale); + if (td->scale) { + td->low_upgrade_time = jiffies - td->scale * td->throtl_slice; + return; + } + + td->limit_index = LIMIT_LOW; + td->low_downgrade_time = jiffies; +} + +static bool throtl_tg_can_downgrade(struct throtl_grp *tg) +{ + struct throtl_data *td = tg->td; + unsigned long now = jiffies; + + /* + * If cgroup is below low limit, consider downgrade and throttle other + * cgroups + */ + if (time_after_eq(now, td->low_upgrade_time + td->throtl_slice) && + time_after_eq(now, tg_last_low_overflow_time(tg) + + td->throtl_slice) && + (!throtl_tg_is_idle(tg) || + !list_empty(&tg_to_blkg(tg)->blkcg->css.children))) + return true; + return false; +} + +static bool throtl_hierarchy_can_downgrade(struct throtl_grp *tg) +{ + while (true) { + if (!throtl_tg_can_downgrade(tg)) + return false; + tg = sq_to_tg(tg->service_queue.parent_sq); + if (!tg || !tg_to_blkg(tg)->parent) + break; + } + return true; +} + +static void throtl_downgrade_check(struct throtl_grp *tg) +{ + uint64_t bps; + unsigned int iops; + unsigned long elapsed_time; + unsigned long now = jiffies; + + if (tg->td->limit_index != LIMIT_MAX || + !tg->td->limit_valid[LIMIT_LOW]) + return; + if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children)) + return; + if (time_after(tg->last_check_time + tg->td->throtl_slice, now)) + return; + + elapsed_time = now - tg->last_check_time; + tg->last_check_time = now; + + if (time_before(now, tg_last_low_overflow_time(tg) + + tg->td->throtl_slice)) + return; + + if (tg->bps[READ][LIMIT_LOW]) { + bps = tg->last_bytes_disp[READ] * HZ; + do_div(bps, elapsed_time); + if (bps >= tg->bps[READ][LIMIT_LOW]) + tg->last_low_overflow_time[READ] = now; + } + + if (tg->bps[WRITE][LIMIT_LOW]) { + bps = tg->last_bytes_disp[WRITE] * HZ; + do_div(bps, elapsed_time); + if (bps >= tg->bps[WRITE][LIMIT_LOW]) + tg->last_low_overflow_time[WRITE] = now; + } + + if (tg->iops[READ][LIMIT_LOW]) { + tg->last_io_disp[READ] += atomic_xchg(&tg->last_io_split_cnt[READ], 0); + iops = tg->last_io_disp[READ] * HZ / elapsed_time; + if (iops >= tg->iops[READ][LIMIT_LOW]) + tg->last_low_overflow_time[READ] = now; + } + + if (tg->iops[WRITE][LIMIT_LOW]) { + tg->last_io_disp[WRITE] += atomic_xchg(&tg->last_io_split_cnt[WRITE], 0); + iops = tg->last_io_disp[WRITE] * HZ / elapsed_time; + if (iops >= tg->iops[WRITE][LIMIT_LOW]) + tg->last_low_overflow_time[WRITE] = now; + } + + /* + * If cgroup is below low limit, consider downgrade and throttle other + * cgroups + */ + if (throtl_hierarchy_can_downgrade(tg)) + throtl_downgrade_state(tg->td); + + tg->last_bytes_disp[READ] = 0; + tg->last_bytes_disp[WRITE] = 0; + tg->last_io_disp[READ] = 0; + tg->last_io_disp[WRITE] = 0; +} + +static void blk_throtl_update_idletime(struct throtl_grp *tg) +{ + unsigned long now; + unsigned long last_finish_time = tg->last_finish_time; + + if (last_finish_time == 0) + return; + + now = ktime_get_ns() >> 10; + if (now <= last_finish_time || + last_finish_time == tg->checked_last_finish_time) + return; + + tg->avg_idletime = (tg->avg_idletime * 7 + now - last_finish_time) >> 3; + tg->checked_last_finish_time = last_finish_time; +} + +#ifdef CONFIG_BLK_DEV_THROTTLING_LOW +static void throtl_update_latency_buckets(struct throtl_data *td) +{ + struct avg_latency_bucket avg_latency[2][LATENCY_BUCKET_SIZE]; + int i, cpu, rw; + unsigned long last_latency[2] = { 0 }; + unsigned long latency[2]; + + if (!blk_queue_nonrot(td->queue) || !td->limit_valid[LIMIT_LOW]) + return; + if (time_before(jiffies, td->last_calculate_time + HZ)) + return; + td->last_calculate_time = jiffies; + + memset(avg_latency, 0, sizeof(avg_latency)); + for (rw = READ; rw <= WRITE; rw++) { + for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { + struct latency_bucket *tmp = &td->tmp_buckets[rw][i]; + + for_each_possible_cpu(cpu) { + struct latency_bucket *bucket; + + /* this isn't race free, but ok in practice */ + bucket = per_cpu_ptr(td->latency_buckets[rw], + cpu); + tmp->total_latency += bucket[i].total_latency; + tmp->samples += bucket[i].samples; + bucket[i].total_latency = 0; + bucket[i].samples = 0; + } + + if (tmp->samples >= 32) { + int samples = tmp->samples; + + latency[rw] = tmp->total_latency; + + tmp->total_latency = 0; + tmp->samples = 0; + latency[rw] /= samples; + if (latency[rw] == 0) + continue; + avg_latency[rw][i].latency = latency[rw]; + } + } + } + + for (rw = READ; rw <= WRITE; rw++) { + for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { + if (!avg_latency[rw][i].latency) { + if (td->avg_buckets[rw][i].latency < last_latency[rw]) + td->avg_buckets[rw][i].latency = + last_latency[rw]; + continue; + } + + if (!td->avg_buckets[rw][i].valid) + latency[rw] = avg_latency[rw][i].latency; + else + latency[rw] = (td->avg_buckets[rw][i].latency * 7 + + avg_latency[rw][i].latency) >> 3; + + td->avg_buckets[rw][i].latency = max(latency[rw], + last_latency[rw]); + td->avg_buckets[rw][i].valid = true; + last_latency[rw] = td->avg_buckets[rw][i].latency; + } + } + + for (i = 0; i < LATENCY_BUCKET_SIZE; i++) + throtl_log(&td->service_queue, + "Latency bucket %d: read latency=%ld, read valid=%d, " + "write latency=%ld, write valid=%d", i, + td->avg_buckets[READ][i].latency, + td->avg_buckets[READ][i].valid, + td->avg_buckets[WRITE][i].latency, + td->avg_buckets[WRITE][i].valid); +} +#else +static inline void throtl_update_latency_buckets(struct throtl_data *td) +{ +} +#endif + +void blk_throtl_charge_bio_split(struct bio *bio) +{ + struct blkcg_gq *blkg = bio->bi_blkg; + struct throtl_grp *parent = blkg_to_tg(blkg); + struct throtl_service_queue *parent_sq; + bool rw = bio_data_dir(bio); + + do { + if (!parent->has_rules[rw]) + break; + + atomic_inc(&parent->io_split_cnt[rw]); + atomic_inc(&parent->last_io_split_cnt[rw]); + + parent_sq = parent->service_queue.parent_sq; + parent = sq_to_tg(parent_sq); + } while (parent); +} + +bool blk_throtl_bio(struct bio *bio) +{ + struct request_queue *q = bio->bi_disk->queue; + struct blkcg_gq *blkg = bio->bi_blkg; + struct throtl_qnode *qn = NULL; + struct throtl_grp *tg = blkg_to_tg(blkg); + struct throtl_service_queue *sq; + bool rw = bio_data_dir(bio); + bool throttled = false; + struct throtl_data *td = tg->td; + + rcu_read_lock(); + + /* see throtl_charge_bio() */ + if (bio_flagged(bio, BIO_THROTTLED)) + goto out; + + if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) { + blkg_rwstat_add(&tg->stat_bytes, bio->bi_opf, + bio->bi_iter.bi_size); + blkg_rwstat_add(&tg->stat_ios, bio->bi_opf, 1); + } + + if (!tg->has_rules[rw]) + goto out; + + spin_lock_irq(&q->queue_lock); + + throtl_update_latency_buckets(td); + + blk_throtl_update_idletime(tg); + + sq = &tg->service_queue; + +again: + while (true) { + if (tg->last_low_overflow_time[rw] == 0) + tg->last_low_overflow_time[rw] = jiffies; + throtl_downgrade_check(tg); + throtl_upgrade_check(tg); + /* throtl is FIFO - if bios are already queued, should queue */ + if (sq->nr_queued[rw]) + break; + + /* if above limits, break to queue */ + if (!tg_may_dispatch(tg, bio, NULL)) { + tg->last_low_overflow_time[rw] = jiffies; + if (throtl_can_upgrade(td, tg)) { + throtl_upgrade_state(td); + goto again; + } + break; + } + + /* within limits, let's charge and dispatch directly */ + throtl_charge_bio(tg, bio); + + /* + * We need to trim slice even when bios are not being queued + * otherwise it might happen that a bio is not queued for + * a long time and slice keeps on extending and trim is not + * called for a long time. Now if limits are reduced suddenly + * we take into account all the IO dispatched so far at new + * low rate and * newly queued IO gets a really long dispatch + * time. + * + * So keep on trimming slice even if bio is not queued. + */ + throtl_trim_slice(tg, rw); + + /* + * @bio passed through this layer without being throttled. + * Climb up the ladder. If we're already at the top, it + * can be executed directly. + */ + qn = &tg->qnode_on_parent[rw]; + sq = sq->parent_sq; + tg = sq_to_tg(sq); + if (!tg) + goto out_unlock; + } + + /* out-of-limit, queue to @tg */ + throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d", + rw == READ ? 'R' : 'W', + tg->bytes_disp[rw], bio->bi_iter.bi_size, + tg_bps_limit(tg, rw), + tg->io_disp[rw], tg_iops_limit(tg, rw), + sq->nr_queued[READ], sq->nr_queued[WRITE]); + + tg->last_low_overflow_time[rw] = jiffies; + + td->nr_queued[rw]++; + throtl_add_bio_tg(bio, qn, tg); + throttled = true; + + /* + * Update @tg's dispatch time and force schedule dispatch if @tg + * was empty before @bio. The forced scheduling isn't likely to + * cause undue delay as @bio is likely to be dispatched directly if + * its @tg's disptime is not in the future. + */ + if (tg->flags & THROTL_TG_WAS_EMPTY) { + tg_update_disptime(tg); + throtl_schedule_next_dispatch(tg->service_queue.parent_sq, true); + } + +out_unlock: + spin_unlock_irq(&q->queue_lock); +out: + bio_set_flag(bio, BIO_THROTTLED); + +#ifdef CONFIG_BLK_DEV_THROTTLING_LOW + if (throttled || !td->track_bio_latency) + bio->bi_issue.value |= BIO_ISSUE_THROTL_SKIP_LATENCY; +#endif + rcu_read_unlock(); + return throttled; +} + +#ifdef CONFIG_BLK_DEV_THROTTLING_LOW +static void throtl_track_latency(struct throtl_data *td, sector_t size, + int op, unsigned long time) +{ + struct latency_bucket *latency; + int index; + + if (!td || td->limit_index != LIMIT_LOW || + !(op == REQ_OP_READ || op == REQ_OP_WRITE) || + !blk_queue_nonrot(td->queue)) + return; + + index = request_bucket_index(size); + + latency = get_cpu_ptr(td->latency_buckets[op]); + latency[index].total_latency += time; + latency[index].samples++; + put_cpu_ptr(td->latency_buckets[op]); +} + +void blk_throtl_stat_add(struct request *rq, u64 time_ns) +{ + struct request_queue *q = rq->q; + struct throtl_data *td = q->td; + + throtl_track_latency(td, blk_rq_stats_sectors(rq), req_op(rq), + time_ns >> 10); +} + +void blk_throtl_bio_endio(struct bio *bio) +{ + struct blkcg_gq *blkg; + struct throtl_grp *tg; + u64 finish_time_ns; + unsigned long finish_time; + unsigned long start_time; + unsigned long lat; + int rw = bio_data_dir(bio); + + blkg = bio->bi_blkg; + if (!blkg) + return; + tg = blkg_to_tg(blkg); + if (!tg->td->limit_valid[LIMIT_LOW]) + return; + + finish_time_ns = ktime_get_ns(); + tg->last_finish_time = finish_time_ns >> 10; + + start_time = bio_issue_time(&bio->bi_issue) >> 10; + finish_time = __bio_issue_time(finish_time_ns) >> 10; + if (!start_time || finish_time <= start_time) + return; + + lat = finish_time - start_time; + /* this is only for bio based driver */ + if (!(bio->bi_issue.value & BIO_ISSUE_THROTL_SKIP_LATENCY)) + throtl_track_latency(tg->td, bio_issue_size(&bio->bi_issue), + bio_op(bio), lat); + + if (tg->latency_target && lat >= tg->td->filtered_latency) { + int bucket; + unsigned int threshold; + + bucket = request_bucket_index(bio_issue_size(&bio->bi_issue)); + threshold = tg->td->avg_buckets[rw][bucket].latency + + tg->latency_target; + if (lat > threshold) + tg->bad_bio_cnt++; + /* + * Not race free, could get wrong count, which means cgroups + * will be throttled + */ + tg->bio_cnt++; + } + + if (time_after(jiffies, tg->bio_cnt_reset_time) || tg->bio_cnt > 1024) { + tg->bio_cnt_reset_time = tg->td->throtl_slice + jiffies; + tg->bio_cnt /= 2; + tg->bad_bio_cnt /= 2; + } +} +#endif + +int blk_throtl_init(struct request_queue *q) +{ + struct throtl_data *td; + int ret; + + td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); + if (!td) + return -ENOMEM; + td->latency_buckets[READ] = __alloc_percpu(sizeof(struct latency_bucket) * + LATENCY_BUCKET_SIZE, __alignof__(u64)); + if (!td->latency_buckets[READ]) { + kfree(td); + return -ENOMEM; + } + td->latency_buckets[WRITE] = __alloc_percpu(sizeof(struct latency_bucket) * + LATENCY_BUCKET_SIZE, __alignof__(u64)); + if (!td->latency_buckets[WRITE]) { + free_percpu(td->latency_buckets[READ]); + kfree(td); + return -ENOMEM; + } + + INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn); + throtl_service_queue_init(&td->service_queue); + + q->td = td; + td->queue = q; + + td->limit_valid[LIMIT_MAX] = true; + td->limit_index = LIMIT_MAX; + td->low_upgrade_time = jiffies; + td->low_downgrade_time = jiffies; + + /* activate policy */ + ret = blkcg_activate_policy(q, &blkcg_policy_throtl); + if (ret) { + free_percpu(td->latency_buckets[READ]); + free_percpu(td->latency_buckets[WRITE]); + kfree(td); + } + return ret; +} + +void blk_throtl_exit(struct request_queue *q) +{ + BUG_ON(!q->td); + del_timer_sync(&q->td->service_queue.pending_timer); + throtl_shutdown_wq(q); + blkcg_deactivate_policy(q, &blkcg_policy_throtl); + free_percpu(q->td->latency_buckets[READ]); + free_percpu(q->td->latency_buckets[WRITE]); + kfree(q->td); +} + +void blk_throtl_register_queue(struct request_queue *q) +{ + struct throtl_data *td; + int i; + + td = q->td; + BUG_ON(!td); + + if (blk_queue_nonrot(q)) { + td->throtl_slice = DFL_THROTL_SLICE_SSD; + td->filtered_latency = LATENCY_FILTERED_SSD; + } else { + td->throtl_slice = DFL_THROTL_SLICE_HD; + td->filtered_latency = LATENCY_FILTERED_HD; + for (i = 0; i < LATENCY_BUCKET_SIZE; i++) { + td->avg_buckets[READ][i].latency = DFL_HD_BASELINE_LATENCY; + td->avg_buckets[WRITE][i].latency = DFL_HD_BASELINE_LATENCY; + } + } +#ifndef CONFIG_BLK_DEV_THROTTLING_LOW + /* if no low limit, use previous default */ + td->throtl_slice = DFL_THROTL_SLICE_HD; +#endif + + td->track_bio_latency = !queue_is_mq(q); + if (!td->track_bio_latency) + blk_stat_enable_accounting(q); +} + +#ifdef CONFIG_BLK_DEV_THROTTLING_LOW +ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page) +{ + if (!q->td) + return -EINVAL; + return sprintf(page, "%u\n", jiffies_to_msecs(q->td->throtl_slice)); +} + +ssize_t blk_throtl_sample_time_store(struct request_queue *q, + const char *page, size_t count) +{ + unsigned long v; + unsigned long t; + + if (!q->td) + return -EINVAL; + if (kstrtoul(page, 10, &v)) + return -EINVAL; + t = msecs_to_jiffies(v); + if (t == 0 || t > MAX_THROTL_SLICE) + return -EINVAL; + q->td->throtl_slice = t; + return count; +} +#endif + +static int __init throtl_init(void) +{ + kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0); + if (!kthrotld_workqueue) + panic("Failed to create kthrotld\n"); + + return blkcg_policy_register(&blkcg_policy_throtl); +} + +module_init(throtl_init); diff --git a/block/blk-timeout.c b/block/blk-timeout.c new file mode 100644 index 000000000..1b8de0417 --- /dev/null +++ b/block/blk-timeout.c @@ -0,0 +1,167 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Functions related to generic timeout handling of requests. + */ +#include +#include +#include +#include + +#include "blk.h" +#include "blk-mq.h" + +#ifdef CONFIG_FAIL_IO_TIMEOUT + +static DECLARE_FAULT_ATTR(fail_io_timeout); + +static int __init setup_fail_io_timeout(char *str) +{ + return setup_fault_attr(&fail_io_timeout, str); +} +__setup("fail_io_timeout=", setup_fail_io_timeout); + +bool __blk_should_fake_timeout(struct request_queue *q) +{ + return should_fail(&fail_io_timeout, 1); +} +EXPORT_SYMBOL_GPL(__blk_should_fake_timeout); + +static int __init fail_io_timeout_debugfs(void) +{ + struct dentry *dir = fault_create_debugfs_attr("fail_io_timeout", + NULL, &fail_io_timeout); + + return PTR_ERR_OR_ZERO(dir); +} + +late_initcall(fail_io_timeout_debugfs); + +ssize_t part_timeout_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + int set = test_bit(QUEUE_FLAG_FAIL_IO, &disk->queue->queue_flags); + + return sprintf(buf, "%d\n", set != 0); +} + +ssize_t part_timeout_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count) +{ + struct gendisk *disk = dev_to_disk(dev); + int val; + + if (count) { + struct request_queue *q = disk->queue; + char *p = (char *) buf; + + val = simple_strtoul(p, &p, 10); + if (val) + blk_queue_flag_set(QUEUE_FLAG_FAIL_IO, q); + else + blk_queue_flag_clear(QUEUE_FLAG_FAIL_IO, q); + } + + return count; +} + +#endif /* CONFIG_FAIL_IO_TIMEOUT */ + +/** + * blk_abort_request - Request recovery for the specified command + * @req: pointer to the request of interest + * + * This function requests that the block layer start recovery for the + * request by deleting the timer and calling the q's timeout function. + * LLDDs who implement their own error recovery MAY ignore the timeout + * event if they generated blk_abort_request. + */ +void blk_abort_request(struct request *req) +{ + /* + * All we need to ensure is that timeout scan takes place + * immediately and that scan sees the new timeout value. + * No need for fancy synchronizations. + */ + WRITE_ONCE(req->deadline, jiffies); + kblockd_schedule_work(&req->q->timeout_work); +} +EXPORT_SYMBOL_GPL(blk_abort_request); + +static unsigned long blk_timeout_mask __read_mostly; + +static int __init blk_timeout_init(void) +{ + blk_timeout_mask = roundup_pow_of_two(HZ) - 1; + return 0; +} + +late_initcall(blk_timeout_init); + +/* + * Just a rough estimate, we don't care about specific values for timeouts. + */ +static inline unsigned long blk_round_jiffies(unsigned long j) +{ + return (j + blk_timeout_mask) + 1; +} + +unsigned long blk_rq_timeout(unsigned long timeout) +{ + unsigned long maxt; + + maxt = blk_round_jiffies(jiffies + BLK_MAX_TIMEOUT); + if (time_after(timeout, maxt)) + timeout = maxt; + + return timeout; +} + +/** + * blk_add_timer - Start timeout timer for a single request + * @req: request that is about to start running. + * + * Notes: + * Each request has its own timer, and as it is added to the queue, we + * set up the timer. When the request completes, we cancel the timer. + */ +void blk_add_timer(struct request *req) +{ + struct request_queue *q = req->q; + unsigned long expiry; + + /* + * Some LLDs, like scsi, peek at the timeout to prevent a + * command from being retried forever. + */ + if (!req->timeout) + req->timeout = q->rq_timeout; + + req->rq_flags &= ~RQF_TIMED_OUT; + + expiry = jiffies + req->timeout; + WRITE_ONCE(req->deadline, expiry); + + /* + * If the timer isn't already pending or this timeout is earlier + * than an existing one, modify the timer. Round up to next nearest + * second. + */ + expiry = blk_rq_timeout(blk_round_jiffies(expiry)); + + if (!timer_pending(&q->timeout) || + time_before(expiry, q->timeout.expires)) { + unsigned long diff = q->timeout.expires - expiry; + + /* + * Due to added timer slack to group timers, the timer + * will often be a little in front of what we asked for. + * So apply some tolerance here too, otherwise we keep + * modifying the timer because expires for value X + * will be X + something. + */ + if (!timer_pending(&q->timeout) || (diff >= HZ / 2)) + mod_timer(&q->timeout, expiry); + } + +} diff --git a/block/blk-wbt.c b/block/blk-wbt.c new file mode 100644 index 000000000..6f63920f0 --- /dev/null +++ b/block/blk-wbt.c @@ -0,0 +1,854 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * buffered writeback throttling. loosely based on CoDel. We can't drop + * packets for IO scheduling, so the logic is something like this: + * + * - Monitor latencies in a defined window of time. + * - If the minimum latency in the above window exceeds some target, increment + * scaling step and scale down queue depth by a factor of 2x. The monitoring + * window is then shrunk to 100 / sqrt(scaling step + 1). + * - For any window where we don't have solid data on what the latencies + * look like, retain status quo. + * - If latencies look good, decrement scaling step. + * - If we're only doing writes, allow the scaling step to go negative. This + * will temporarily boost write performance, snapping back to a stable + * scaling step of 0 if reads show up or the heavy writers finish. Unlike + * positive scaling steps where we shrink the monitoring window, a negative + * scaling step retains the default step==0 window size. + * + * Copyright (C) 2016 Jens Axboe + * + */ +#include +#include +#include +#include +#include + +#include "blk-wbt.h" +#include "blk-rq-qos.h" + +#define CREATE_TRACE_POINTS +#include + +static inline void wbt_clear_state(struct request *rq) +{ + rq->wbt_flags = 0; +} + +static inline enum wbt_flags wbt_flags(struct request *rq) +{ + return rq->wbt_flags; +} + +static inline bool wbt_is_tracked(struct request *rq) +{ + return rq->wbt_flags & WBT_TRACKED; +} + +static inline bool wbt_is_read(struct request *rq) +{ + return rq->wbt_flags & WBT_READ; +} + +enum { + /* + * Default setting, we'll scale up (to 75% of QD max) or down (min 1) + * from here depending on device stats + */ + RWB_DEF_DEPTH = 16, + + /* + * 100msec window + */ + RWB_WINDOW_NSEC = 100 * 1000 * 1000ULL, + + /* + * Disregard stats, if we don't meet this minimum + */ + RWB_MIN_WRITE_SAMPLES = 3, + + /* + * If we have this number of consecutive windows with not enough + * information to scale up or down, scale up. + */ + RWB_UNKNOWN_BUMP = 5, +}; + +static inline bool rwb_enabled(struct rq_wb *rwb) +{ + return rwb && rwb->enable_state != WBT_STATE_OFF_DEFAULT && + rwb->wb_normal != 0; +} + +static void wb_timestamp(struct rq_wb *rwb, unsigned long *var) +{ + if (rwb_enabled(rwb)) { + const unsigned long cur = jiffies; + + if (cur != *var) + *var = cur; + } +} + +/* + * If a task was rate throttled in balance_dirty_pages() within the last + * second or so, use that to indicate a higher cleaning rate. + */ +static bool wb_recent_wait(struct rq_wb *rwb) +{ + struct bdi_writeback *wb = &rwb->rqos.q->backing_dev_info->wb; + + return time_before(jiffies, wb->dirty_sleep + HZ); +} + +static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb, + enum wbt_flags wb_acct) +{ + if (wb_acct & WBT_KSWAPD) + return &rwb->rq_wait[WBT_RWQ_KSWAPD]; + else if (wb_acct & WBT_DISCARD) + return &rwb->rq_wait[WBT_RWQ_DISCARD]; + + return &rwb->rq_wait[WBT_RWQ_BG]; +} + +static void rwb_wake_all(struct rq_wb *rwb) +{ + int i; + + for (i = 0; i < WBT_NUM_RWQ; i++) { + struct rq_wait *rqw = &rwb->rq_wait[i]; + + if (wq_has_sleeper(&rqw->wait)) + wake_up_all(&rqw->wait); + } +} + +static void wbt_rqw_done(struct rq_wb *rwb, struct rq_wait *rqw, + enum wbt_flags wb_acct) +{ + int inflight, limit; + + inflight = atomic_dec_return(&rqw->inflight); + + /* + * wbt got disabled with IO in flight. Wake up any potential + * waiters, we don't have to do more than that. + */ + if (unlikely(!rwb_enabled(rwb))) { + rwb_wake_all(rwb); + return; + } + + /* + * For discards, our limit is always the background. For writes, if + * the device does write back caching, drop further down before we + * wake people up. + */ + if (wb_acct & WBT_DISCARD) + limit = rwb->wb_background; + else if (rwb->wc && !wb_recent_wait(rwb)) + limit = 0; + else + limit = rwb->wb_normal; + + /* + * Don't wake anyone up if we are above the normal limit. + */ + if (inflight && inflight >= limit) + return; + + if (wq_has_sleeper(&rqw->wait)) { + int diff = limit - inflight; + + if (!inflight || diff >= rwb->wb_background / 2) + wake_up_all(&rqw->wait); + } +} + +static void __wbt_done(struct rq_qos *rqos, enum wbt_flags wb_acct) +{ + struct rq_wb *rwb = RQWB(rqos); + struct rq_wait *rqw; + + if (!(wb_acct & WBT_TRACKED)) + return; + + rqw = get_rq_wait(rwb, wb_acct); + wbt_rqw_done(rwb, rqw, wb_acct); +} + +/* + * Called on completion of a request. Note that it's also called when + * a request is merged, when the request gets freed. + */ +static void wbt_done(struct rq_qos *rqos, struct request *rq) +{ + struct rq_wb *rwb = RQWB(rqos); + + if (!wbt_is_tracked(rq)) { + if (rwb->sync_cookie == rq) { + rwb->sync_issue = 0; + rwb->sync_cookie = NULL; + } + + if (wbt_is_read(rq)) + wb_timestamp(rwb, &rwb->last_comp); + } else { + WARN_ON_ONCE(rq == rwb->sync_cookie); + __wbt_done(rqos, wbt_flags(rq)); + } + wbt_clear_state(rq); +} + +static inline bool stat_sample_valid(struct blk_rq_stat *stat) +{ + /* + * We need at least one read sample, and a minimum of + * RWB_MIN_WRITE_SAMPLES. We require some write samples to know + * that it's writes impacting us, and not just some sole read on + * a device that is in a lower power state. + */ + return (stat[READ].nr_samples >= 1 && + stat[WRITE].nr_samples >= RWB_MIN_WRITE_SAMPLES); +} + +static u64 rwb_sync_issue_lat(struct rq_wb *rwb) +{ + u64 now, issue = READ_ONCE(rwb->sync_issue); + + if (!issue || !rwb->sync_cookie) + return 0; + + now = ktime_to_ns(ktime_get()); + return now - issue; +} + +enum { + LAT_OK = 1, + LAT_UNKNOWN, + LAT_UNKNOWN_WRITES, + LAT_EXCEEDED, +}; + +static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) +{ + struct backing_dev_info *bdi = rwb->rqos.q->backing_dev_info; + struct rq_depth *rqd = &rwb->rq_depth; + u64 thislat; + + /* + * If our stored sync issue exceeds the window size, or it + * exceeds our min target AND we haven't logged any entries, + * flag the latency as exceeded. wbt works off completion latencies, + * but for a flooded device, a single sync IO can take a long time + * to complete after being issued. If this time exceeds our + * monitoring window AND we didn't see any other completions in that + * window, then count that sync IO as a violation of the latency. + */ + thislat = rwb_sync_issue_lat(rwb); + if (thislat > rwb->cur_win_nsec || + (thislat > rwb->min_lat_nsec && !stat[READ].nr_samples)) { + trace_wbt_lat(bdi, thislat); + return LAT_EXCEEDED; + } + + /* + * No read/write mix, if stat isn't valid + */ + if (!stat_sample_valid(stat)) { + /* + * If we had writes in this stat window and the window is + * current, we're only doing writes. If a task recently + * waited or still has writes in flights, consider us doing + * just writes as well. + */ + if (stat[WRITE].nr_samples || wb_recent_wait(rwb) || + wbt_inflight(rwb)) + return LAT_UNKNOWN_WRITES; + return LAT_UNKNOWN; + } + + /* + * If the 'min' latency exceeds our target, step down. + */ + if (stat[READ].min > rwb->min_lat_nsec) { + trace_wbt_lat(bdi, stat[READ].min); + trace_wbt_stat(bdi, stat); + return LAT_EXCEEDED; + } + + if (rqd->scale_step) + trace_wbt_stat(bdi, stat); + + return LAT_OK; +} + +static void rwb_trace_step(struct rq_wb *rwb, const char *msg) +{ + struct backing_dev_info *bdi = rwb->rqos.q->backing_dev_info; + struct rq_depth *rqd = &rwb->rq_depth; + + trace_wbt_step(bdi, msg, rqd->scale_step, rwb->cur_win_nsec, + rwb->wb_background, rwb->wb_normal, rqd->max_depth); +} + +static void calc_wb_limits(struct rq_wb *rwb) +{ + if (rwb->min_lat_nsec == 0) { + rwb->wb_normal = rwb->wb_background = 0; + } else if (rwb->rq_depth.max_depth <= 2) { + rwb->wb_normal = rwb->rq_depth.max_depth; + rwb->wb_background = 1; + } else { + rwb->wb_normal = (rwb->rq_depth.max_depth + 1) / 2; + rwb->wb_background = (rwb->rq_depth.max_depth + 3) / 4; + } +} + +static void scale_up(struct rq_wb *rwb) +{ + if (!rq_depth_scale_up(&rwb->rq_depth)) + return; + calc_wb_limits(rwb); + rwb->unknown_cnt = 0; + rwb_wake_all(rwb); + rwb_trace_step(rwb, tracepoint_string("scale up")); +} + +static void scale_down(struct rq_wb *rwb, bool hard_throttle) +{ + if (!rq_depth_scale_down(&rwb->rq_depth, hard_throttle)) + return; + calc_wb_limits(rwb); + rwb->unknown_cnt = 0; + rwb_trace_step(rwb, tracepoint_string("scale down")); +} + +static void rwb_arm_timer(struct rq_wb *rwb) +{ + struct rq_depth *rqd = &rwb->rq_depth; + + if (rqd->scale_step > 0) { + /* + * We should speed this up, using some variant of a fast + * integer inverse square root calculation. Since we only do + * this for every window expiration, it's not a huge deal, + * though. + */ + rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4, + int_sqrt((rqd->scale_step + 1) << 8)); + } else { + /* + * For step < 0, we don't want to increase/decrease the + * window size. + */ + rwb->cur_win_nsec = rwb->win_nsec; + } + + blk_stat_activate_nsecs(rwb->cb, rwb->cur_win_nsec); +} + +static void wb_timer_fn(struct blk_stat_callback *cb) +{ + struct rq_wb *rwb = cb->data; + struct rq_depth *rqd = &rwb->rq_depth; + unsigned int inflight = wbt_inflight(rwb); + int status; + + status = latency_exceeded(rwb, cb->stat); + + trace_wbt_timer(rwb->rqos.q->backing_dev_info, status, rqd->scale_step, + inflight); + + /* + * If we exceeded the latency target, step down. If we did not, + * step one level up. If we don't know enough to say either exceeded + * or ok, then don't do anything. + */ + switch (status) { + case LAT_EXCEEDED: + scale_down(rwb, true); + break; + case LAT_OK: + scale_up(rwb); + break; + case LAT_UNKNOWN_WRITES: + /* + * We started a the center step, but don't have a valid + * read/write sample, but we do have writes going on. + * Allow step to go negative, to increase write perf. + */ + scale_up(rwb); + break; + case LAT_UNKNOWN: + if (++rwb->unknown_cnt < RWB_UNKNOWN_BUMP) + break; + /* + * We get here when previously scaled reduced depth, and we + * currently don't have a valid read/write sample. For that + * case, slowly return to center state (step == 0). + */ + if (rqd->scale_step > 0) + scale_up(rwb); + else if (rqd->scale_step < 0) + scale_down(rwb, false); + break; + default: + break; + } + + /* + * Re-arm timer, if we have IO in flight + */ + if (rqd->scale_step || inflight) + rwb_arm_timer(rwb); +} + +static void wbt_update_limits(struct rq_wb *rwb) +{ + struct rq_depth *rqd = &rwb->rq_depth; + + rqd->scale_step = 0; + rqd->scaled_max = false; + + rq_depth_calc_max_depth(rqd); + calc_wb_limits(rwb); + + rwb_wake_all(rwb); +} + +u64 wbt_get_min_lat(struct request_queue *q) +{ + struct rq_qos *rqos = wbt_rq_qos(q); + if (!rqos) + return 0; + return RQWB(rqos)->min_lat_nsec; +} + +void wbt_set_min_lat(struct request_queue *q, u64 val) +{ + struct rq_qos *rqos = wbt_rq_qos(q); + if (!rqos) + return; + RQWB(rqos)->min_lat_nsec = val; + RQWB(rqos)->enable_state = WBT_STATE_ON_MANUAL; + wbt_update_limits(RQWB(rqos)); +} + + +static bool close_io(struct rq_wb *rwb) +{ + const unsigned long now = jiffies; + + return time_before(now, rwb->last_issue + HZ / 10) || + time_before(now, rwb->last_comp + HZ / 10); +} + +#define REQ_HIPRIO (REQ_SYNC | REQ_META | REQ_PRIO) + +static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw) +{ + unsigned int limit; + + /* + * If we got disabled, just return UINT_MAX. This ensures that + * we'll properly inc a new IO, and dec+wakeup at the end. + */ + if (!rwb_enabled(rwb)) + return UINT_MAX; + + if ((rw & REQ_OP_MASK) == REQ_OP_DISCARD) + return rwb->wb_background; + + /* + * At this point we know it's a buffered write. If this is + * kswapd trying to free memory, or REQ_SYNC is set, then + * it's WB_SYNC_ALL writeback, and we'll use the max limit for + * that. If the write is marked as a background write, then use + * the idle limit, or go to normal if we haven't had competing + * IO for a bit. + */ + if ((rw & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd()) + limit = rwb->rq_depth.max_depth; + else if ((rw & REQ_BACKGROUND) || close_io(rwb)) { + /* + * If less than 100ms since we completed unrelated IO, + * limit us to half the depth for background writeback. + */ + limit = rwb->wb_background; + } else + limit = rwb->wb_normal; + + return limit; +} + +struct wbt_wait_data { + struct rq_wb *rwb; + enum wbt_flags wb_acct; + unsigned long rw; +}; + +static bool wbt_inflight_cb(struct rq_wait *rqw, void *private_data) +{ + struct wbt_wait_data *data = private_data; + return rq_wait_inc_below(rqw, get_limit(data->rwb, data->rw)); +} + +static void wbt_cleanup_cb(struct rq_wait *rqw, void *private_data) +{ + struct wbt_wait_data *data = private_data; + wbt_rqw_done(data->rwb, rqw, data->wb_acct); +} + +/* + * Block if we will exceed our limit, or if we are currently waiting for + * the timer to kick off queuing again. + */ +static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct, + unsigned long rw) +{ + struct rq_wait *rqw = get_rq_wait(rwb, wb_acct); + struct wbt_wait_data data = { + .rwb = rwb, + .wb_acct = wb_acct, + .rw = rw, + }; + + rq_qos_wait(rqw, &data, wbt_inflight_cb, wbt_cleanup_cb); +} + +static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio) +{ + switch (bio_op(bio)) { + case REQ_OP_WRITE: + /* + * Don't throttle WRITE_ODIRECT + */ + if ((bio->bi_opf & (REQ_SYNC | REQ_IDLE)) == + (REQ_SYNC | REQ_IDLE)) + return false; + fallthrough; + case REQ_OP_DISCARD: + return true; + default: + return false; + } +} + +static enum wbt_flags bio_to_wbt_flags(struct rq_wb *rwb, struct bio *bio) +{ + enum wbt_flags flags = 0; + + if (!rwb_enabled(rwb)) + return 0; + + if (bio_op(bio) == REQ_OP_READ) { + flags = WBT_READ; + } else if (wbt_should_throttle(rwb, bio)) { + if (current_is_kswapd()) + flags |= WBT_KSWAPD; + if (bio_op(bio) == REQ_OP_DISCARD) + flags |= WBT_DISCARD; + flags |= WBT_TRACKED; + } + return flags; +} + +static void wbt_cleanup(struct rq_qos *rqos, struct bio *bio) +{ + struct rq_wb *rwb = RQWB(rqos); + enum wbt_flags flags = bio_to_wbt_flags(rwb, bio); + __wbt_done(rqos, flags); +} + +/* + * Returns true if the IO request should be accounted, false if not. + * May sleep, if we have exceeded the writeback limits. Caller can pass + * in an irq held spinlock, if it holds one when calling this function. + * If we do sleep, we'll release and re-grab it. + */ +static void wbt_wait(struct rq_qos *rqos, struct bio *bio) +{ + struct rq_wb *rwb = RQWB(rqos); + enum wbt_flags flags; + + flags = bio_to_wbt_flags(rwb, bio); + if (!(flags & WBT_TRACKED)) { + if (flags & WBT_READ) + wb_timestamp(rwb, &rwb->last_issue); + return; + } + + __wbt_wait(rwb, flags, bio->bi_opf); + + if (!blk_stat_is_active(rwb->cb)) + rwb_arm_timer(rwb); +} + +static void wbt_track(struct rq_qos *rqos, struct request *rq, struct bio *bio) +{ + struct rq_wb *rwb = RQWB(rqos); + rq->wbt_flags |= bio_to_wbt_flags(rwb, bio); +} + +static void wbt_issue(struct rq_qos *rqos, struct request *rq) +{ + struct rq_wb *rwb = RQWB(rqos); + + if (!rwb_enabled(rwb)) + return; + + /* + * Track sync issue, in case it takes a long time to complete. Allows us + * to react quicker, if a sync IO takes a long time to complete. Note + * that this is just a hint. The request can go away when it completes, + * so it's important we never dereference it. We only use the address to + * compare with, which is why we store the sync_issue time locally. + */ + if (wbt_is_read(rq) && !rwb->sync_issue) { + rwb->sync_cookie = rq; + rwb->sync_issue = rq->io_start_time_ns; + } +} + +static void wbt_requeue(struct rq_qos *rqos, struct request *rq) +{ + struct rq_wb *rwb = RQWB(rqos); + if (!rwb_enabled(rwb)) + return; + if (rq == rwb->sync_cookie) { + rwb->sync_issue = 0; + rwb->sync_cookie = NULL; + } +} + +void wbt_set_write_cache(struct request_queue *q, bool write_cache_on) +{ + struct rq_qos *rqos = wbt_rq_qos(q); + if (rqos) + RQWB(rqos)->wc = write_cache_on; +} + +/* + * Enable wbt if defaults are configured that way + */ +void wbt_enable_default(struct request_queue *q) +{ + struct rq_qos *rqos = wbt_rq_qos(q); + + /* Throttling already enabled? */ + if (rqos) { + if (RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT) + RQWB(rqos)->enable_state = WBT_STATE_ON_DEFAULT; + return; + } + + /* Queue not registered? Maybe shutting down... */ + if (!blk_queue_registered(q)) + return; + + if (queue_is_mq(q) && IS_ENABLED(CONFIG_BLK_WBT_MQ)) + wbt_init(q); +} +EXPORT_SYMBOL_GPL(wbt_enable_default); + +u64 wbt_default_latency_nsec(struct request_queue *q) +{ + /* + * We default to 2msec for non-rotational storage, and 75msec + * for rotational storage. + */ + if (blk_queue_nonrot(q)) + return 2000000ULL; + else + return 75000000ULL; +} + +static int wbt_data_dir(const struct request *rq) +{ + const int op = req_op(rq); + + if (op == REQ_OP_READ) + return READ; + else if (op_is_write(op)) + return WRITE; + + /* don't account */ + return -1; +} + +static void wbt_queue_depth_changed(struct rq_qos *rqos) +{ + RQWB(rqos)->rq_depth.queue_depth = blk_queue_depth(rqos->q); + wbt_update_limits(RQWB(rqos)); +} + +static void wbt_exit(struct rq_qos *rqos) +{ + struct rq_wb *rwb = RQWB(rqos); + struct request_queue *q = rqos->q; + + blk_stat_remove_callback(q, rwb->cb); + blk_stat_free_callback(rwb->cb); + kfree(rwb); +} + +/* + * Disable wbt, if enabled by default. + */ +void wbt_disable_default(struct request_queue *q) +{ + struct rq_qos *rqos = wbt_rq_qos(q); + struct rq_wb *rwb; + if (!rqos) + return; + rwb = RQWB(rqos); + if (rwb->enable_state == WBT_STATE_ON_DEFAULT) { + blk_stat_deactivate(rwb->cb); + rwb->enable_state = WBT_STATE_OFF_DEFAULT; + } +} +EXPORT_SYMBOL_GPL(wbt_disable_default); + +#ifdef CONFIG_BLK_DEBUG_FS +static int wbt_curr_win_nsec_show(void *data, struct seq_file *m) +{ + struct rq_qos *rqos = data; + struct rq_wb *rwb = RQWB(rqos); + + seq_printf(m, "%llu\n", rwb->cur_win_nsec); + return 0; +} + +static int wbt_enabled_show(void *data, struct seq_file *m) +{ + struct rq_qos *rqos = data; + struct rq_wb *rwb = RQWB(rqos); + + seq_printf(m, "%d\n", rwb->enable_state); + return 0; +} + +static int wbt_id_show(void *data, struct seq_file *m) +{ + struct rq_qos *rqos = data; + + seq_printf(m, "%u\n", rqos->id); + return 0; +} + +static int wbt_inflight_show(void *data, struct seq_file *m) +{ + struct rq_qos *rqos = data; + struct rq_wb *rwb = RQWB(rqos); + int i; + + for (i = 0; i < WBT_NUM_RWQ; i++) + seq_printf(m, "%d: inflight %d\n", i, + atomic_read(&rwb->rq_wait[i].inflight)); + return 0; +} + +static int wbt_min_lat_nsec_show(void *data, struct seq_file *m) +{ + struct rq_qos *rqos = data; + struct rq_wb *rwb = RQWB(rqos); + + seq_printf(m, "%lu\n", rwb->min_lat_nsec); + return 0; +} + +static int wbt_unknown_cnt_show(void *data, struct seq_file *m) +{ + struct rq_qos *rqos = data; + struct rq_wb *rwb = RQWB(rqos); + + seq_printf(m, "%u\n", rwb->unknown_cnt); + return 0; +} + +static int wbt_normal_show(void *data, struct seq_file *m) +{ + struct rq_qos *rqos = data; + struct rq_wb *rwb = RQWB(rqos); + + seq_printf(m, "%u\n", rwb->wb_normal); + return 0; +} + +static int wbt_background_show(void *data, struct seq_file *m) +{ + struct rq_qos *rqos = data; + struct rq_wb *rwb = RQWB(rqos); + + seq_printf(m, "%u\n", rwb->wb_background); + return 0; +} + +static const struct blk_mq_debugfs_attr wbt_debugfs_attrs[] = { + {"curr_win_nsec", 0400, wbt_curr_win_nsec_show}, + {"enabled", 0400, wbt_enabled_show}, + {"id", 0400, wbt_id_show}, + {"inflight", 0400, wbt_inflight_show}, + {"min_lat_nsec", 0400, wbt_min_lat_nsec_show}, + {"unknown_cnt", 0400, wbt_unknown_cnt_show}, + {"wb_normal", 0400, wbt_normal_show}, + {"wb_background", 0400, wbt_background_show}, + {}, +}; +#endif + +static struct rq_qos_ops wbt_rqos_ops = { + .throttle = wbt_wait, + .issue = wbt_issue, + .track = wbt_track, + .requeue = wbt_requeue, + .done = wbt_done, + .cleanup = wbt_cleanup, + .queue_depth_changed = wbt_queue_depth_changed, + .exit = wbt_exit, +#ifdef CONFIG_BLK_DEBUG_FS + .debugfs_attrs = wbt_debugfs_attrs, +#endif +}; + +int wbt_init(struct request_queue *q) +{ + struct rq_wb *rwb; + int i; + + rwb = kzalloc(sizeof(*rwb), GFP_KERNEL); + if (!rwb) + return -ENOMEM; + + rwb->cb = blk_stat_alloc_callback(wb_timer_fn, wbt_data_dir, 2, rwb); + if (!rwb->cb) { + kfree(rwb); + return -ENOMEM; + } + + for (i = 0; i < WBT_NUM_RWQ; i++) + rq_wait_init(&rwb->rq_wait[i]); + + rwb->rqos.id = RQ_QOS_WBT; + rwb->rqos.ops = &wbt_rqos_ops; + rwb->rqos.q = q; + rwb->last_comp = rwb->last_issue = jiffies; + rwb->win_nsec = RWB_WINDOW_NSEC; + rwb->enable_state = WBT_STATE_ON_DEFAULT; + rwb->wc = test_bit(QUEUE_FLAG_WC, &q->queue_flags); + rwb->rq_depth.default_depth = RWB_DEF_DEPTH; + rwb->min_lat_nsec = wbt_default_latency_nsec(q); + + wbt_queue_depth_changed(&rwb->rqos); + + /* + * Assign rwb and add the stats callback. + */ + rq_qos_add(q, &rwb->rqos); + blk_stat_add_callback(q, rwb->cb); + + return 0; +} diff --git a/block/blk-wbt.h b/block/blk-wbt.h new file mode 100644 index 000000000..2eb01becd --- /dev/null +++ b/block/blk-wbt.h @@ -0,0 +1,134 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef WB_THROTTLE_H +#define WB_THROTTLE_H + +#include +#include +#include +#include +#include + +#include "blk-stat.h" +#include "blk-rq-qos.h" + +enum wbt_flags { + WBT_TRACKED = 1, /* write, tracked for throttling */ + WBT_READ = 2, /* read */ + WBT_KSWAPD = 4, /* write, from kswapd */ + WBT_DISCARD = 8, /* discard */ + + WBT_NR_BITS = 4, /* number of bits */ +}; + +enum { + WBT_RWQ_BG = 0, + WBT_RWQ_KSWAPD, + WBT_RWQ_DISCARD, + WBT_NUM_RWQ, +}; + +/* + * Enable states. Either off, or on by default (done at init time), + * or on through manual setup in sysfs. + */ +enum { + WBT_STATE_ON_DEFAULT = 1, + WBT_STATE_ON_MANUAL = 2, + WBT_STATE_OFF_DEFAULT +}; + +struct rq_wb { + /* + * Settings that govern how we throttle + */ + unsigned int wb_background; /* background writeback */ + unsigned int wb_normal; /* normal writeback */ + + short enable_state; /* WBT_STATE_* */ + + /* + * Number of consecutive periods where we don't have enough + * information to make a firm scale up/down decision. + */ + unsigned int unknown_cnt; + + u64 win_nsec; /* default window size */ + u64 cur_win_nsec; /* current window size */ + + struct blk_stat_callback *cb; + + u64 sync_issue; + void *sync_cookie; + + unsigned int wc; + + unsigned long last_issue; /* last non-throttled issue */ + unsigned long last_comp; /* last non-throttled comp */ + unsigned long min_lat_nsec; + struct rq_qos rqos; + struct rq_wait rq_wait[WBT_NUM_RWQ]; + struct rq_depth rq_depth; +}; + +static inline struct rq_wb *RQWB(struct rq_qos *rqos) +{ + return container_of(rqos, struct rq_wb, rqos); +} + +static inline unsigned int wbt_inflight(struct rq_wb *rwb) +{ + unsigned int i, ret = 0; + + for (i = 0; i < WBT_NUM_RWQ; i++) + ret += atomic_read(&rwb->rq_wait[i].inflight); + + return ret; +} + + +#ifdef CONFIG_BLK_WBT + +int wbt_init(struct request_queue *); +void wbt_disable_default(struct request_queue *); +void wbt_enable_default(struct request_queue *); + +u64 wbt_get_min_lat(struct request_queue *q); +void wbt_set_min_lat(struct request_queue *q, u64 val); + +void wbt_set_write_cache(struct request_queue *, bool); + +u64 wbt_default_latency_nsec(struct request_queue *); + +#else + +static inline void wbt_track(struct request *rq, enum wbt_flags flags) +{ +} +static inline int wbt_init(struct request_queue *q) +{ + return -EINVAL; +} +static inline void wbt_disable_default(struct request_queue *q) +{ +} +static inline void wbt_enable_default(struct request_queue *q) +{ +} +static inline void wbt_set_write_cache(struct request_queue *q, bool wc) +{ +} +static inline u64 wbt_get_min_lat(struct request_queue *q) +{ + return 0; +} +static inline void wbt_set_min_lat(struct request_queue *q, u64 val) +{ +} +static inline u64 wbt_default_latency_nsec(struct request_queue *q) +{ + return 0; +} + +#endif /* CONFIG_BLK_WBT */ + +#endif diff --git a/block/blk-zoned.c b/block/blk-zoned.c new file mode 100644 index 000000000..61b452272 --- /dev/null +++ b/block/blk-zoned.c @@ -0,0 +1,565 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Zoned block device handling + * + * Copyright (c) 2015, Hannes Reinecke + * Copyright (c) 2015, SUSE Linux GmbH + * + * Copyright (c) 2016, Damien Le Moal + * Copyright (c) 2016, Western Digital + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "blk.h" + +#define ZONE_COND_NAME(name) [BLK_ZONE_COND_##name] = #name +static const char *const zone_cond_name[] = { + ZONE_COND_NAME(NOT_WP), + ZONE_COND_NAME(EMPTY), + ZONE_COND_NAME(IMP_OPEN), + ZONE_COND_NAME(EXP_OPEN), + ZONE_COND_NAME(CLOSED), + ZONE_COND_NAME(READONLY), + ZONE_COND_NAME(FULL), + ZONE_COND_NAME(OFFLINE), +}; +#undef ZONE_COND_NAME + +/** + * blk_zone_cond_str - Return string XXX in BLK_ZONE_COND_XXX. + * @zone_cond: BLK_ZONE_COND_XXX. + * + * Description: Centralize block layer function to convert BLK_ZONE_COND_XXX + * into string format. Useful in the debugging and tracing zone conditions. For + * invalid BLK_ZONE_COND_XXX it returns string "UNKNOWN". + */ +const char *blk_zone_cond_str(enum blk_zone_cond zone_cond) +{ + static const char *zone_cond_str = "UNKNOWN"; + + if (zone_cond < ARRAY_SIZE(zone_cond_name) && zone_cond_name[zone_cond]) + zone_cond_str = zone_cond_name[zone_cond]; + + return zone_cond_str; +} +EXPORT_SYMBOL_GPL(blk_zone_cond_str); + +static inline sector_t blk_zone_start(struct request_queue *q, + sector_t sector) +{ + sector_t zone_mask = blk_queue_zone_sectors(q) - 1; + + return sector & ~zone_mask; +} + +/* + * Return true if a request is a write requests that needs zone write locking. + */ +bool blk_req_needs_zone_write_lock(struct request *rq) +{ + if (!rq->q->seq_zones_wlock) + return false; + + if (blk_rq_is_passthrough(rq)) + return false; + + switch (req_op(rq)) { + case REQ_OP_WRITE_ZEROES: + case REQ_OP_WRITE_SAME: + case REQ_OP_WRITE: + return blk_rq_zone_is_seq(rq); + default: + return false; + } +} +EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock); + +bool blk_req_zone_write_trylock(struct request *rq) +{ + unsigned int zno = blk_rq_zone_no(rq); + + if (test_and_set_bit(zno, rq->q->seq_zones_wlock)) + return false; + + WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED); + rq->rq_flags |= RQF_ZONE_WRITE_LOCKED; + + return true; +} +EXPORT_SYMBOL_GPL(blk_req_zone_write_trylock); + +void __blk_req_zone_write_lock(struct request *rq) +{ + if (WARN_ON_ONCE(test_and_set_bit(blk_rq_zone_no(rq), + rq->q->seq_zones_wlock))) + return; + + WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED); + rq->rq_flags |= RQF_ZONE_WRITE_LOCKED; +} +EXPORT_SYMBOL_GPL(__blk_req_zone_write_lock); + +void __blk_req_zone_write_unlock(struct request *rq) +{ + rq->rq_flags &= ~RQF_ZONE_WRITE_LOCKED; + if (rq->q->seq_zones_wlock) + WARN_ON_ONCE(!test_and_clear_bit(blk_rq_zone_no(rq), + rq->q->seq_zones_wlock)); +} +EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock); + +/** + * blkdev_nr_zones - Get number of zones + * @disk: Target gendisk + * + * Return the total number of zones of a zoned block device. For a block + * device without zone capabilities, the number of zones is always 0. + */ +unsigned int blkdev_nr_zones(struct gendisk *disk) +{ + sector_t zone_sectors = blk_queue_zone_sectors(disk->queue); + + if (!blk_queue_is_zoned(disk->queue)) + return 0; + return (get_capacity(disk) + zone_sectors - 1) >> ilog2(zone_sectors); +} +EXPORT_SYMBOL_GPL(blkdev_nr_zones); + +/** + * blkdev_report_zones - Get zones information + * @bdev: Target block device + * @sector: Sector from which to report zones + * @nr_zones: Maximum number of zones to report + * @cb: Callback function called for each reported zone + * @data: Private data for the callback + * + * Description: + * Get zone information starting from the zone containing @sector for at most + * @nr_zones, and call @cb for each zone reported by the device. + * To report all zones in a device starting from @sector, the BLK_ALL_ZONES + * constant can be passed to @nr_zones. + * Returns the number of zones reported by the device, or a negative errno + * value in case of failure. + * + * Note: The caller must use memalloc_noXX_save/restore() calls to control + * memory allocations done within this function. + */ +int blkdev_report_zones(struct block_device *bdev, sector_t sector, + unsigned int nr_zones, report_zones_cb cb, void *data) +{ + struct gendisk *disk = bdev->bd_disk; + sector_t capacity = get_capacity(disk); + + if (!blk_queue_is_zoned(bdev_get_queue(bdev)) || + WARN_ON_ONCE(!disk->fops->report_zones)) + return -EOPNOTSUPP; + + if (!nr_zones || sector >= capacity) + return 0; + + return disk->fops->report_zones(disk, sector, nr_zones, cb, data); +} +EXPORT_SYMBOL_GPL(blkdev_report_zones); + +static inline bool blkdev_allow_reset_all_zones(struct block_device *bdev, + sector_t sector, + sector_t nr_sectors) +{ + if (!blk_queue_zone_resetall(bdev_get_queue(bdev))) + return false; + + /* + * REQ_OP_ZONE_RESET_ALL can be executed only if the number of sectors + * of the applicable zone range is the entire disk. + */ + return !sector && nr_sectors == get_capacity(bdev->bd_disk); +} + +/** + * blkdev_zone_mgmt - Execute a zone management operation on a range of zones + * @bdev: Target block device + * @op: Operation to be performed on the zones + * @sector: Start sector of the first zone to operate on + * @nr_sectors: Number of sectors, should be at least the length of one zone and + * must be zone size aligned. + * @gfp_mask: Memory allocation flags (for bio_alloc) + * + * Description: + * Perform the specified operation on the range of zones specified by + * @sector..@sector+@nr_sectors. Specifying the entire disk sector range + * is valid, but the specified range should not contain conventional zones. + * The operation to execute on each zone can be a zone reset, open, close + * or finish request. + */ +int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op, + sector_t sector, sector_t nr_sectors, + gfp_t gfp_mask) +{ + struct request_queue *q = bdev_get_queue(bdev); + sector_t zone_sectors = blk_queue_zone_sectors(q); + sector_t capacity = get_capacity(bdev->bd_disk); + sector_t end_sector = sector + nr_sectors; + struct bio *bio = NULL; + int ret; + + if (!blk_queue_is_zoned(q)) + return -EOPNOTSUPP; + + if (bdev_read_only(bdev)) + return -EPERM; + + if (!op_is_zone_mgmt(op)) + return -EOPNOTSUPP; + + if (end_sector <= sector || end_sector > capacity) + /* Out of range */ + return -EINVAL; + + /* Check alignment (handle eventual smaller last zone) */ + if (sector & (zone_sectors - 1)) + return -EINVAL; + + if ((nr_sectors & (zone_sectors - 1)) && end_sector != capacity) + return -EINVAL; + + while (sector < end_sector) { + bio = blk_next_bio(bio, 0, gfp_mask); + bio_set_dev(bio, bdev); + + /* + * Special case for the zone reset operation that reset all + * zones, this is useful for applications like mkfs. + */ + if (op == REQ_OP_ZONE_RESET && + blkdev_allow_reset_all_zones(bdev, sector, nr_sectors)) { + bio->bi_opf = REQ_OP_ZONE_RESET_ALL | REQ_SYNC; + break; + } + + bio->bi_opf = op | REQ_SYNC; + bio->bi_iter.bi_sector = sector; + sector += zone_sectors; + + /* This may take a while, so be nice to others */ + cond_resched(); + } + + ret = submit_bio_wait(bio); + bio_put(bio); + + return ret; +} +EXPORT_SYMBOL_GPL(blkdev_zone_mgmt); + +struct zone_report_args { + struct blk_zone __user *zones; +}; + +static int blkdev_copy_zone_to_user(struct blk_zone *zone, unsigned int idx, + void *data) +{ + struct zone_report_args *args = data; + + if (copy_to_user(&args->zones[idx], zone, sizeof(struct blk_zone))) + return -EFAULT; + return 0; +} + +/* + * BLKREPORTZONE ioctl processing. + * Called from blkdev_ioctl. + */ +int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + struct zone_report_args args; + struct request_queue *q; + struct blk_zone_report rep; + int ret; + + if (!argp) + return -EINVAL; + + q = bdev_get_queue(bdev); + if (!q) + return -ENXIO; + + if (!blk_queue_is_zoned(q)) + return -ENOTTY; + + if (copy_from_user(&rep, argp, sizeof(struct blk_zone_report))) + return -EFAULT; + + if (!rep.nr_zones) + return -EINVAL; + + args.zones = argp + sizeof(struct blk_zone_report); + ret = blkdev_report_zones(bdev, rep.sector, rep.nr_zones, + blkdev_copy_zone_to_user, &args); + if (ret < 0) + return ret; + + rep.nr_zones = ret; + rep.flags = BLK_ZONE_REP_CAPACITY; + if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report))) + return -EFAULT; + return 0; +} + +static int blkdev_truncate_zone_range(struct block_device *bdev, fmode_t mode, + const struct blk_zone_range *zrange) +{ + loff_t start, end; + + if (zrange->sector + zrange->nr_sectors <= zrange->sector || + zrange->sector + zrange->nr_sectors > get_capacity(bdev->bd_disk)) + /* Out of range */ + return -EINVAL; + + start = zrange->sector << SECTOR_SHIFT; + end = ((zrange->sector + zrange->nr_sectors) << SECTOR_SHIFT) - 1; + + return truncate_bdev_range(bdev, mode, start, end); +} + +/* + * BLKRESETZONE, BLKOPENZONE, BLKCLOSEZONE and BLKFINISHZONE ioctl processing. + * Called from blkdev_ioctl. + */ +int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + void __user *argp = (void __user *)arg; + struct request_queue *q; + struct blk_zone_range zrange; + enum req_opf op; + int ret; + + if (!argp) + return -EINVAL; + + q = bdev_get_queue(bdev); + if (!q) + return -ENXIO; + + if (!blk_queue_is_zoned(q)) + return -ENOTTY; + + if (!(mode & FMODE_WRITE)) + return -EBADF; + + if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range))) + return -EFAULT; + + switch (cmd) { + case BLKRESETZONE: + op = REQ_OP_ZONE_RESET; + + /* Invalidate the page cache, including dirty pages. */ + ret = blkdev_truncate_zone_range(bdev, mode, &zrange); + if (ret) + return ret; + break; + case BLKOPENZONE: + op = REQ_OP_ZONE_OPEN; + break; + case BLKCLOSEZONE: + op = REQ_OP_ZONE_CLOSE; + break; + case BLKFINISHZONE: + op = REQ_OP_ZONE_FINISH; + break; + default: + return -ENOTTY; + } + + ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors, + GFP_KERNEL); + + /* + * Invalidate the page cache again for zone reset: writes can only be + * direct for zoned devices so concurrent writes would not add any page + * to the page cache after/during reset. The page cache may be filled + * again due to concurrent reads though and dropping the pages for + * these is fine. + */ + if (!ret && cmd == BLKRESETZONE) + ret = blkdev_truncate_zone_range(bdev, mode, &zrange); + + return ret; +} + +static inline unsigned long *blk_alloc_zone_bitmap(int node, + unsigned int nr_zones) +{ + return kcalloc_node(BITS_TO_LONGS(nr_zones), sizeof(unsigned long), + GFP_NOIO, node); +} + +void blk_queue_free_zone_bitmaps(struct request_queue *q) +{ + kfree(q->conv_zones_bitmap); + q->conv_zones_bitmap = NULL; + kfree(q->seq_zones_wlock); + q->seq_zones_wlock = NULL; +} + +struct blk_revalidate_zone_args { + struct gendisk *disk; + unsigned long *conv_zones_bitmap; + unsigned long *seq_zones_wlock; + unsigned int nr_zones; + sector_t zone_sectors; + sector_t sector; +}; + +/* + * Helper function to check the validity of zones of a zoned block device. + */ +static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, + void *data) +{ + struct blk_revalidate_zone_args *args = data; + struct gendisk *disk = args->disk; + struct request_queue *q = disk->queue; + sector_t capacity = get_capacity(disk); + + /* + * All zones must have the same size, with the exception on an eventual + * smaller last zone. + */ + if (zone->start == 0) { + if (zone->len == 0 || !is_power_of_2(zone->len)) { + pr_warn("%s: Invalid zoned device with non power of two zone size (%llu)\n", + disk->disk_name, zone->len); + return -ENODEV; + } + + args->zone_sectors = zone->len; + args->nr_zones = (capacity + zone->len - 1) >> ilog2(zone->len); + } else if (zone->start + args->zone_sectors < capacity) { + if (zone->len != args->zone_sectors) { + pr_warn("%s: Invalid zoned device with non constant zone size\n", + disk->disk_name); + return -ENODEV; + } + } else { + if (zone->len > args->zone_sectors) { + pr_warn("%s: Invalid zoned device with larger last zone size\n", + disk->disk_name); + return -ENODEV; + } + } + + /* Check for holes in the zone report */ + if (zone->start != args->sector) { + pr_warn("%s: Zone gap at sectors %llu..%llu\n", + disk->disk_name, args->sector, zone->start); + return -ENODEV; + } + + /* Check zone type */ + switch (zone->type) { + case BLK_ZONE_TYPE_CONVENTIONAL: + if (!args->conv_zones_bitmap) { + args->conv_zones_bitmap = + blk_alloc_zone_bitmap(q->node, args->nr_zones); + if (!args->conv_zones_bitmap) + return -ENOMEM; + } + set_bit(idx, args->conv_zones_bitmap); + break; + case BLK_ZONE_TYPE_SEQWRITE_REQ: + case BLK_ZONE_TYPE_SEQWRITE_PREF: + if (!args->seq_zones_wlock) { + args->seq_zones_wlock = + blk_alloc_zone_bitmap(q->node, args->nr_zones); + if (!args->seq_zones_wlock) + return -ENOMEM; + } + break; + default: + pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n", + disk->disk_name, (int)zone->type, zone->start); + return -ENODEV; + } + + args->sector += zone->len; + return 0; +} + +/** + * blk_revalidate_disk_zones - (re)allocate and initialize zone bitmaps + * @disk: Target disk + * @update_driver_data: Callback to update driver data on the frozen disk + * + * Helper function for low-level device drivers to (re) allocate and initialize + * a disk request queue zone bitmaps. This functions should normally be called + * within the disk ->revalidate method for blk-mq based drivers. For BIO based + * drivers only q->nr_zones needs to be updated so that the sysfs exposed value + * is correct. + * If the @update_driver_data callback function is not NULL, the callback is + * executed with the device request queue frozen after all zones have been + * checked. + */ +int blk_revalidate_disk_zones(struct gendisk *disk, + void (*update_driver_data)(struct gendisk *disk)) +{ + struct request_queue *q = disk->queue; + struct blk_revalidate_zone_args args = { + .disk = disk, + }; + unsigned int noio_flag; + int ret; + + if (WARN_ON_ONCE(!blk_queue_is_zoned(q))) + return -EIO; + if (WARN_ON_ONCE(!queue_is_mq(q))) + return -EIO; + + if (!get_capacity(disk)) + return -EIO; + + /* + * Ensure that all memory allocations in this context are done as if + * GFP_NOIO was specified. + */ + noio_flag = memalloc_noio_save(); + ret = disk->fops->report_zones(disk, 0, UINT_MAX, + blk_revalidate_zone_cb, &args); + memalloc_noio_restore(noio_flag); + + /* + * Install the new bitmaps and update nr_zones only once the queue is + * stopped and all I/Os are completed (i.e. a scheduler is not + * referencing the bitmaps). + */ + blk_mq_freeze_queue(q); + if (ret >= 0) { + blk_queue_chunk_sectors(q, args.zone_sectors); + q->nr_zones = args.nr_zones; + swap(q->seq_zones_wlock, args.seq_zones_wlock); + swap(q->conv_zones_bitmap, args.conv_zones_bitmap); + if (update_driver_data) + update_driver_data(disk); + ret = 0; + } else { + pr_warn("%s: failed to revalidate zones\n", disk->disk_name); + blk_queue_free_zone_bitmaps(q); + } + blk_mq_unfreeze_queue(q); + + kfree(args.seq_zones_wlock); + kfree(args.conv_zones_bitmap); + return ret; +} +EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones); diff --git a/block/blk.h b/block/blk.h new file mode 100644 index 000000000..997941cd9 --- /dev/null +++ b/block/blk.h @@ -0,0 +1,451 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef BLK_INTERNAL_H +#define BLK_INTERNAL_H + +#include +#include +#include +#include +#include +#include "blk-crypto-internal.h" +#include "blk-mq.h" +#include "blk-mq-sched.h" + +/* Max future timer expiry for timeouts */ +#define BLK_MAX_TIMEOUT (5 * HZ) + +extern struct dentry *blk_debugfs_root; + +struct blk_flush_queue { + unsigned int flush_pending_idx:1; + unsigned int flush_running_idx:1; + blk_status_t rq_status; + unsigned long flush_pending_since; + struct list_head flush_queue[2]; + struct list_head flush_data_in_flight; + struct request *flush_rq; + + struct lock_class_key key; + spinlock_t mq_flush_lock; +}; + +extern struct kmem_cache *blk_requestq_cachep; +extern struct kobj_type blk_queue_ktype; +extern struct ida blk_queue_ida; + +static inline struct blk_flush_queue * +blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx) +{ + return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx)->fq; +} + +static inline void __blk_get_queue(struct request_queue *q) +{ + kobject_get(&q->kobj); +} + +bool is_flush_rq(struct request *req); + +struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size, + gfp_t flags); +void blk_free_flush_queue(struct blk_flush_queue *q); + +void blk_freeze_queue(struct request_queue *q); + +static inline bool biovec_phys_mergeable(struct request_queue *q, + struct bio_vec *vec1, struct bio_vec *vec2) +{ + unsigned long mask = queue_segment_boundary(q); + phys_addr_t addr1 = page_to_phys(vec1->bv_page) + vec1->bv_offset; + phys_addr_t addr2 = page_to_phys(vec2->bv_page) + vec2->bv_offset; + + if (addr1 + vec1->bv_len != addr2) + return false; + if (xen_domain() && !xen_biovec_phys_mergeable(vec1, vec2->bv_page)) + return false; + if ((addr1 | mask) != ((addr2 + vec2->bv_len - 1) | mask)) + return false; + return true; +} + +static inline bool __bvec_gap_to_prev(struct request_queue *q, + struct bio_vec *bprv, unsigned int offset) +{ + return (offset & queue_virt_boundary(q)) || + ((bprv->bv_offset + bprv->bv_len) & queue_virt_boundary(q)); +} + +/* + * Check if adding a bio_vec after bprv with offset would create a gap in + * the SG list. Most drivers don't care about this, but some do. + */ +static inline bool bvec_gap_to_prev(struct request_queue *q, + struct bio_vec *bprv, unsigned int offset) +{ + if (!queue_virt_boundary(q)) + return false; + return __bvec_gap_to_prev(q, bprv, offset); +} + +static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio, + unsigned int nr_segs) +{ + rq->nr_phys_segments = nr_segs; + rq->__data_len = bio->bi_iter.bi_size; + rq->bio = rq->biotail = bio; + rq->ioprio = bio_prio(bio); + + if (bio->bi_disk) + rq->rq_disk = bio->bi_disk; +} + +#ifdef CONFIG_BLK_DEV_INTEGRITY +void blk_flush_integrity(void); +bool __bio_integrity_endio(struct bio *); +void bio_integrity_free(struct bio *bio); +static inline bool bio_integrity_endio(struct bio *bio) +{ + if (bio_integrity(bio)) + return __bio_integrity_endio(bio); + return true; +} + +bool blk_integrity_merge_rq(struct request_queue *, struct request *, + struct request *); +bool blk_integrity_merge_bio(struct request_queue *, struct request *, + struct bio *); + +static inline bool integrity_req_gap_back_merge(struct request *req, + struct bio *next) +{ + struct bio_integrity_payload *bip = bio_integrity(req->bio); + struct bio_integrity_payload *bip_next = bio_integrity(next); + + return bvec_gap_to_prev(req->q, &bip->bip_vec[bip->bip_vcnt - 1], + bip_next->bip_vec[0].bv_offset); +} + +static inline bool integrity_req_gap_front_merge(struct request *req, + struct bio *bio) +{ + struct bio_integrity_payload *bip = bio_integrity(bio); + struct bio_integrity_payload *bip_next = bio_integrity(req->bio); + + return bvec_gap_to_prev(req->q, &bip->bip_vec[bip->bip_vcnt - 1], + bip_next->bip_vec[0].bv_offset); +} + +void blk_integrity_add(struct gendisk *); +void blk_integrity_del(struct gendisk *); +#else /* CONFIG_BLK_DEV_INTEGRITY */ +static inline bool blk_integrity_merge_rq(struct request_queue *rq, + struct request *r1, struct request *r2) +{ + return true; +} +static inline bool blk_integrity_merge_bio(struct request_queue *rq, + struct request *r, struct bio *b) +{ + return true; +} +static inline bool integrity_req_gap_back_merge(struct request *req, + struct bio *next) +{ + return false; +} +static inline bool integrity_req_gap_front_merge(struct request *req, + struct bio *bio) +{ + return false; +} + +static inline void blk_flush_integrity(void) +{ +} +static inline bool bio_integrity_endio(struct bio *bio) +{ + return true; +} +static inline void bio_integrity_free(struct bio *bio) +{ +} +static inline void blk_integrity_add(struct gendisk *disk) +{ +} +static inline void blk_integrity_del(struct gendisk *disk) +{ +} +#endif /* CONFIG_BLK_DEV_INTEGRITY */ + +unsigned long blk_rq_timeout(unsigned long timeout); +void blk_add_timer(struct request *req); + +bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, + unsigned int nr_segs, struct request **same_queue_rq); +bool blk_bio_list_merge(struct request_queue *q, struct list_head *list, + struct bio *bio, unsigned int nr_segs); + +void blk_account_io_start(struct request *req); +void blk_account_io_done(struct request *req, u64 now); + +/* + * Plug flush limits + */ +#define BLK_MAX_REQUEST_COUNT 32 +#define BLK_PLUG_FLUSH_SIZE (128 * 1024) + +/* + * Internal elevator interface + */ +#define ELV_ON_HASH(rq) ((rq)->rq_flags & RQF_HASHED) + +void blk_insert_flush(struct request *rq); + +void elevator_init_mq(struct request_queue *q); +int elevator_switch_mq(struct request_queue *q, + struct elevator_type *new_e); +void __elevator_exit(struct request_queue *, struct elevator_queue *); +int elv_register_queue(struct request_queue *q, bool uevent); +void elv_unregister_queue(struct request_queue *q); + +static inline void elevator_exit(struct request_queue *q, + struct elevator_queue *e) +{ + lockdep_assert_held(&q->sysfs_lock); + + blk_mq_sched_free_requests(q); + __elevator_exit(q, e); +} + +struct hd_struct *__disk_get_part(struct gendisk *disk, int partno); + +ssize_t part_size_show(struct device *dev, struct device_attribute *attr, + char *buf); +ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, + char *buf); +ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, + char *buf); +ssize_t part_fail_show(struct device *dev, struct device_attribute *attr, + char *buf); +ssize_t part_fail_store(struct device *dev, struct device_attribute *attr, + const char *buf, size_t count); +ssize_t part_timeout_show(struct device *, struct device_attribute *, char *); +ssize_t part_timeout_store(struct device *, struct device_attribute *, + const char *, size_t); + +void __blk_queue_split(struct bio **bio, unsigned int *nr_segs); +int ll_back_merge_fn(struct request *req, struct bio *bio, + unsigned int nr_segs); +int blk_attempt_req_merge(struct request_queue *q, struct request *rq, + struct request *next); +unsigned int blk_recalc_rq_segments(struct request *rq); +void blk_rq_set_mixed_merge(struct request *rq); +bool blk_rq_merge_ok(struct request *rq, struct bio *bio); +enum elv_merge blk_try_merge(struct request *rq, struct bio *bio); + +int blk_dev_init(void); + +/* + * Contribute to IO statistics IFF: + * + * a) it's attached to a gendisk, and + * b) the queue had IO stats enabled when this request was started + */ +static inline bool blk_do_io_stat(struct request *rq) +{ + return rq->rq_disk && (rq->rq_flags & RQF_IO_STAT); +} + +static inline void req_set_nomerge(struct request_queue *q, struct request *req) +{ + req->cmd_flags |= REQ_NOMERGE; + if (req == q->last_merge) + q->last_merge = NULL; +} + +/* + * The max size one bio can handle is UINT_MAX becasue bvec_iter.bi_size + * is defined as 'unsigned int', meantime it has to aligned to with logical + * block size which is the minimum accepted unit by hardware. + */ +static inline unsigned int bio_allowed_max_sectors(struct request_queue *q) +{ + return round_down(UINT_MAX, queue_logical_block_size(q)) >> 9; +} + +/* + * The max bio size which is aligned to q->limits.discard_granularity. This + * is a hint to split large discard bio in generic block layer, then if device + * driver needs to split the discard bio into smaller ones, their bi_size can + * be very probably and easily aligned to discard_granularity of the device's + * queue. + */ +static inline unsigned int bio_aligned_discard_max_sectors( + struct request_queue *q) +{ + return round_down(UINT_MAX, q->limits.discard_granularity) >> + SECTOR_SHIFT; +} + +/* + * Internal io_context interface + */ +void get_io_context(struct io_context *ioc); +struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q); +struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, + gfp_t gfp_mask); +void ioc_clear_queue(struct request_queue *q); + +int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node); + +/* + * Internal throttling interface + */ +#ifdef CONFIG_BLK_DEV_THROTTLING +extern int blk_throtl_init(struct request_queue *q); +extern void blk_throtl_exit(struct request_queue *q); +extern void blk_throtl_register_queue(struct request_queue *q); +extern void blk_throtl_charge_bio_split(struct bio *bio); +bool blk_throtl_bio(struct bio *bio); +#else /* CONFIG_BLK_DEV_THROTTLING */ +static inline int blk_throtl_init(struct request_queue *q) { return 0; } +static inline void blk_throtl_exit(struct request_queue *q) { } +static inline void blk_throtl_register_queue(struct request_queue *q) { } +static inline void blk_throtl_charge_bio_split(struct bio *bio) { } +static inline bool blk_throtl_bio(struct bio *bio) { return false; } +#endif /* CONFIG_BLK_DEV_THROTTLING */ +#ifdef CONFIG_BLK_DEV_THROTTLING_LOW +extern ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page); +extern ssize_t blk_throtl_sample_time_store(struct request_queue *q, + const char *page, size_t count); +extern void blk_throtl_bio_endio(struct bio *bio); +extern void blk_throtl_stat_add(struct request *rq, u64 time); +#else +static inline void blk_throtl_bio_endio(struct bio *bio) { } +static inline void blk_throtl_stat_add(struct request *rq, u64 time) { } +#endif + +#ifdef CONFIG_BOUNCE +extern int init_emergency_isa_pool(void); +extern void blk_queue_bounce(struct request_queue *q, struct bio **bio); +#else +static inline int init_emergency_isa_pool(void) +{ + return 0; +} +static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio) +{ +} +#endif /* CONFIG_BOUNCE */ + +#ifdef CONFIG_BLK_CGROUP_IOLATENCY +extern int blk_iolatency_init(struct request_queue *q); +#else +static inline int blk_iolatency_init(struct request_queue *q) { return 0; } +#endif + +struct bio *blk_next_bio(struct bio *bio, unsigned int nr_pages, gfp_t gfp); + +#ifdef CONFIG_BLK_DEV_ZONED +void blk_queue_free_zone_bitmaps(struct request_queue *q); +#else +static inline void blk_queue_free_zone_bitmaps(struct request_queue *q) {} +#endif + +struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector); + +int blk_alloc_devt(struct hd_struct *part, dev_t *devt); +void blk_free_devt(dev_t devt); +void blk_invalidate_devt(dev_t devt); +char *disk_name(struct gendisk *hd, int partno, char *buf); +#define ADDPART_FLAG_NONE 0 +#define ADDPART_FLAG_RAID 1 +#define ADDPART_FLAG_WHOLEDISK 2 +void delete_partition(struct hd_struct *part); +int bdev_add_partition(struct block_device *bdev, int partno, + sector_t start, sector_t length); +int bdev_del_partition(struct block_device *bdev, int partno); +int bdev_resize_partition(struct block_device *bdev, int partno, + sector_t start, sector_t length); +int disk_expand_part_tbl(struct gendisk *disk, int target); +int hd_ref_init(struct hd_struct *part); + +/* no need to get/put refcount of part0 */ +static inline int hd_struct_try_get(struct hd_struct *part) +{ + if (part->partno) + return percpu_ref_tryget_live(&part->ref); + return 1; +} + +static inline void hd_struct_put(struct hd_struct *part) +{ + if (part->partno) + percpu_ref_put(&part->ref); +} + +static inline void hd_free_part(struct hd_struct *part) +{ + free_percpu(part->dkstats); + kfree(part->info); + percpu_ref_exit(&part->ref); +} + +/* + * Any access of part->nr_sects which is not protected by partition + * bd_mutex or gendisk bdev bd_mutex, should be done using this + * accessor function. + * + * Code written along the lines of i_size_read() and i_size_write(). + * CONFIG_PREEMPTION case optimizes the case of UP kernel with preemption + * on. + */ +static inline sector_t part_nr_sects_read(struct hd_struct *part) +{ +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + sector_t nr_sects; + unsigned seq; + do { + seq = read_seqcount_begin(&part->nr_sects_seq); + nr_sects = part->nr_sects; + } while (read_seqcount_retry(&part->nr_sects_seq, seq)); + return nr_sects; +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) + sector_t nr_sects; + + preempt_disable(); + nr_sects = part->nr_sects; + preempt_enable(); + return nr_sects; +#else + return part->nr_sects; +#endif +} + +/* + * Should be called with mutex lock held (typically bd_mutex) of partition + * to provide mutual exlusion among writers otherwise seqcount might be + * left in wrong state leaving the readers spinning infinitely. + */ +static inline void part_nr_sects_write(struct hd_struct *part, sector_t size) +{ +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) + preempt_disable(); + write_seqcount_begin(&part->nr_sects_seq); + part->nr_sects = size; + write_seqcount_end(&part->nr_sects_seq); + preempt_enable(); +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) + preempt_disable(); + part->nr_sects = size; + preempt_enable(); +#else + part->nr_sects = size; +#endif +} + +int bio_add_hw_page(struct request_queue *q, struct bio *bio, + struct page *page, unsigned int len, unsigned int offset, + unsigned int max_sectors, bool *same_page); + +#endif /* BLK_INTERNAL_H */ diff --git a/block/bounce.c b/block/bounce.c new file mode 100644 index 000000000..162a6eee8 --- /dev/null +++ b/block/bounce.c @@ -0,0 +1,389 @@ +// SPDX-License-Identifier: GPL-2.0 +/* bounce buffer handling for block devices + * + * - Split from highmem.c + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include "blk.h" + +#define POOL_SIZE 64 +#define ISA_POOL_SIZE 16 + +static struct bio_set bounce_bio_set, bounce_bio_split; +static mempool_t page_pool, isa_page_pool; + +static void init_bounce_bioset(void) +{ + static bool bounce_bs_setup; + int ret; + + if (bounce_bs_setup) + return; + + ret = bioset_init(&bounce_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); + BUG_ON(ret); + if (bioset_integrity_create(&bounce_bio_set, BIO_POOL_SIZE)) + BUG_ON(1); + + ret = bioset_init(&bounce_bio_split, BIO_POOL_SIZE, 0, 0); + BUG_ON(ret); + bounce_bs_setup = true; +} + +#if defined(CONFIG_HIGHMEM) +static __init int init_emergency_pool(void) +{ + int ret; +#if defined(CONFIG_HIGHMEM) && !defined(CONFIG_MEMORY_HOTPLUG) + if (max_pfn <= max_low_pfn) + return 0; +#endif + + ret = mempool_init_page_pool(&page_pool, POOL_SIZE, 0); + BUG_ON(ret); + pr_info("pool size: %d pages\n", POOL_SIZE); + + init_bounce_bioset(); + return 0; +} + +__initcall(init_emergency_pool); +#endif + +#ifdef CONFIG_HIGHMEM +/* + * highmem version, map in to vec + */ +static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom) +{ + unsigned char *vto; + + vto = kmap_atomic(to->bv_page); + memcpy(vto + to->bv_offset, vfrom, to->bv_len); + kunmap_atomic(vto); +} + +#else /* CONFIG_HIGHMEM */ + +#define bounce_copy_vec(to, vfrom) \ + memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len) + +#endif /* CONFIG_HIGHMEM */ + +/* + * allocate pages in the DMA region for the ISA pool + */ +static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data) +{ + return mempool_alloc_pages(gfp_mask | GFP_DMA, data); +} + +static DEFINE_MUTEX(isa_mutex); + +/* + * gets called "every" time someone init's a queue with BLK_BOUNCE_ISA + * as the max address, so check if the pool has already been created. + */ +int init_emergency_isa_pool(void) +{ + int ret; + + mutex_lock(&isa_mutex); + + if (mempool_initialized(&isa_page_pool)) { + mutex_unlock(&isa_mutex); + return 0; + } + + ret = mempool_init(&isa_page_pool, ISA_POOL_SIZE, mempool_alloc_pages_isa, + mempool_free_pages, (void *) 0); + BUG_ON(ret); + + pr_info("isa pool size: %d pages\n", ISA_POOL_SIZE); + init_bounce_bioset(); + mutex_unlock(&isa_mutex); + return 0; +} + +/* + * Simple bounce buffer support for highmem pages. Depending on the + * queue gfp mask set, *to may or may not be a highmem page. kmap it + * always, it will do the Right Thing + */ +static void copy_to_high_bio_irq(struct bio *to, struct bio *from) +{ + unsigned char *vfrom; + struct bio_vec tovec, fromvec; + struct bvec_iter iter; + /* + * The bio of @from is created by bounce, so we can iterate + * its bvec from start to end, but the @from->bi_iter can't be + * trusted because it might be changed by splitting. + */ + struct bvec_iter from_iter = BVEC_ITER_ALL_INIT; + + bio_for_each_segment(tovec, to, iter) { + fromvec = bio_iter_iovec(from, from_iter); + if (tovec.bv_page != fromvec.bv_page) { + /* + * fromvec->bv_offset and fromvec->bv_len might have + * been modified by the block layer, so use the original + * copy, bounce_copy_vec already uses tovec->bv_len + */ + vfrom = page_address(fromvec.bv_page) + + tovec.bv_offset; + + bounce_copy_vec(&tovec, vfrom); + flush_dcache_page(tovec.bv_page); + } + bio_advance_iter(from, &from_iter, tovec.bv_len); + } +} + +static void bounce_end_io(struct bio *bio, mempool_t *pool) +{ + struct bio *bio_orig = bio->bi_private; + struct bio_vec *bvec, orig_vec; + struct bvec_iter orig_iter = bio_orig->bi_iter; + struct bvec_iter_all iter_all; + + /* + * free up bounce indirect pages used + */ + bio_for_each_segment_all(bvec, bio, iter_all) { + orig_vec = bio_iter_iovec(bio_orig, orig_iter); + if (bvec->bv_page != orig_vec.bv_page) { + dec_zone_page_state(bvec->bv_page, NR_BOUNCE); + mempool_free(bvec->bv_page, pool); + } + bio_advance_iter(bio_orig, &orig_iter, orig_vec.bv_len); + } + + bio_orig->bi_status = bio->bi_status; + bio_endio(bio_orig); + bio_put(bio); +} + +static void bounce_end_io_write(struct bio *bio) +{ + bounce_end_io(bio, &page_pool); +} + +static void bounce_end_io_write_isa(struct bio *bio) +{ + + bounce_end_io(bio, &isa_page_pool); +} + +static void __bounce_end_io_read(struct bio *bio, mempool_t *pool) +{ + struct bio *bio_orig = bio->bi_private; + + if (!bio->bi_status) + copy_to_high_bio_irq(bio_orig, bio); + + bounce_end_io(bio, pool); +} + +static void bounce_end_io_read(struct bio *bio) +{ + __bounce_end_io_read(bio, &page_pool); +} + +static void bounce_end_io_read_isa(struct bio *bio) +{ + __bounce_end_io_read(bio, &isa_page_pool); +} + +static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask, + struct bio_set *bs) +{ + struct bvec_iter iter; + struct bio_vec bv; + struct bio *bio; + + /* + * Pre immutable biovecs, __bio_clone() used to just do a memcpy from + * bio_src->bi_io_vec to bio->bi_io_vec. + * + * We can't do that anymore, because: + * + * - The point of cloning the biovec is to produce a bio with a biovec + * the caller can modify: bi_idx and bi_bvec_done should be 0. + * + * - The original bio could've had more than BIO_MAX_PAGES biovecs; if + * we tried to clone the whole thing bio_alloc_bioset() would fail. + * But the clone should succeed as long as the number of biovecs we + * actually need to allocate is fewer than BIO_MAX_PAGES. + * + * - Lastly, bi_vcnt should not be looked at or relied upon by code + * that does not own the bio - reason being drivers don't use it for + * iterating over the biovec anymore, so expecting it to be kept up + * to date (i.e. for clones that share the parent biovec) is just + * asking for trouble and would force extra work on + * __bio_clone_fast() anyways. + */ + + bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs); + if (!bio) + return NULL; + bio->bi_disk = bio_src->bi_disk; + bio->bi_opf = bio_src->bi_opf; + bio->bi_ioprio = bio_src->bi_ioprio; + bio->bi_write_hint = bio_src->bi_write_hint; + bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; + bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; + + switch (bio_op(bio)) { + case REQ_OP_DISCARD: + case REQ_OP_SECURE_ERASE: + case REQ_OP_WRITE_ZEROES: + break; + case REQ_OP_WRITE_SAME: + bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0]; + break; + default: + bio_for_each_segment(bv, bio_src, iter) + bio->bi_io_vec[bio->bi_vcnt++] = bv; + break; + } + + if (bio_crypt_clone(bio, bio_src, gfp_mask) < 0) + goto err_put; + + if (bio_integrity(bio_src) && + bio_integrity_clone(bio, bio_src, gfp_mask) < 0) + goto err_put; + + bio_clone_blkg_association(bio, bio_src); + blkcg_bio_issue_init(bio); + + return bio; + +err_put: + bio_put(bio); + return NULL; +} + +static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, + mempool_t *pool) +{ + struct bio *bio; + int rw = bio_data_dir(*bio_orig); + struct bio_vec *to, from; + struct bvec_iter iter; + unsigned i = 0; + bool bounce = false; + int sectors = 0; + bool passthrough = bio_is_passthrough(*bio_orig); + + bio_for_each_segment(from, *bio_orig, iter) { + if (i++ < BIO_MAX_PAGES) + sectors += from.bv_len >> 9; + if (page_to_pfn(from.bv_page) > q->limits.bounce_pfn) + bounce = true; + } + if (!bounce) + return; + + if (!passthrough && sectors < bio_sectors(*bio_orig)) { + bio = bio_split(*bio_orig, sectors, GFP_NOIO, &bounce_bio_split); + bio_chain(bio, *bio_orig); + submit_bio_noacct(*bio_orig); + *bio_orig = bio; + } + bio = bounce_clone_bio(*bio_orig, GFP_NOIO, passthrough ? NULL : + &bounce_bio_set); + + /* + * Bvec table can't be updated by bio_for_each_segment_all(), + * so retrieve bvec from the table directly. This way is safe + * because the 'bio' is single-page bvec. + */ + for (i = 0, to = bio->bi_io_vec; i < bio->bi_vcnt; to++, i++) { + struct page *page = to->bv_page; + + if (page_to_pfn(page) <= q->limits.bounce_pfn) + continue; + + to->bv_page = mempool_alloc(pool, q->bounce_gfp); + inc_zone_page_state(to->bv_page, NR_BOUNCE); + + if (rw == WRITE) { + char *vto, *vfrom; + + flush_dcache_page(page); + + vto = page_address(to->bv_page) + to->bv_offset; + vfrom = kmap_atomic(page) + to->bv_offset; + memcpy(vto, vfrom, to->bv_len); + kunmap_atomic(vfrom); + } + } + + trace_block_bio_bounce(q, *bio_orig); + + bio->bi_flags |= (1 << BIO_BOUNCED); + + if (pool == &page_pool) { + bio->bi_end_io = bounce_end_io_write; + if (rw == READ) + bio->bi_end_io = bounce_end_io_read; + } else { + bio->bi_end_io = bounce_end_io_write_isa; + if (rw == READ) + bio->bi_end_io = bounce_end_io_read_isa; + } + + bio->bi_private = *bio_orig; + *bio_orig = bio; +} + +void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) +{ + mempool_t *pool; + + /* + * Data-less bio, nothing to bounce + */ + if (!bio_has_data(*bio_orig)) + return; + + /* + * for non-isa bounce case, just check if the bounce pfn is equal + * to or bigger than the highest pfn in the system -- in that case, + * don't waste time iterating over bio segments + */ + if (!(q->bounce_gfp & GFP_DMA)) { + if (q->limits.bounce_pfn >= blk_max_pfn) + return; + pool = &page_pool; + } else { + BUG_ON(!mempool_initialized(&isa_page_pool)); + pool = &isa_page_pool; + } + + /* + * slow path + */ + __blk_queue_bounce(q, bio_orig, pool); +} diff --git a/block/bsg-lib.c b/block/bsg-lib.c new file mode 100644 index 000000000..330fede77 --- /dev/null +++ b/block/bsg-lib.c @@ -0,0 +1,415 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * BSG helper library + * + * Copyright (C) 2008 James Smart, Emulex Corporation + * Copyright (C) 2011 Red Hat, Inc. All rights reserved. + * Copyright (C) 2011 Mike Christie + */ +#include +#include +#include +#include +#include +#include +#include +#include + +#define uptr64(val) ((void __user *)(uintptr_t)(val)) + +struct bsg_set { + struct blk_mq_tag_set tag_set; + bsg_job_fn *job_fn; + bsg_timeout_fn *timeout_fn; +}; + +static int bsg_transport_check_proto(struct sg_io_v4 *hdr) +{ + if (hdr->protocol != BSG_PROTOCOL_SCSI || + hdr->subprotocol != BSG_SUB_PROTOCOL_SCSI_TRANSPORT) + return -EINVAL; + if (!capable(CAP_SYS_RAWIO)) + return -EPERM; + return 0; +} + +static int bsg_transport_fill_hdr(struct request *rq, struct sg_io_v4 *hdr, + fmode_t mode) +{ + struct bsg_job *job = blk_mq_rq_to_pdu(rq); + int ret; + + job->request_len = hdr->request_len; + job->request = memdup_user(uptr64(hdr->request), hdr->request_len); + if (IS_ERR(job->request)) + return PTR_ERR(job->request); + + if (hdr->dout_xfer_len && hdr->din_xfer_len) { + job->bidi_rq = blk_get_request(rq->q, REQ_OP_SCSI_IN, 0); + if (IS_ERR(job->bidi_rq)) { + ret = PTR_ERR(job->bidi_rq); + goto out; + } + + ret = blk_rq_map_user(rq->q, job->bidi_rq, NULL, + uptr64(hdr->din_xferp), hdr->din_xfer_len, + GFP_KERNEL); + if (ret) + goto out_free_bidi_rq; + + job->bidi_bio = job->bidi_rq->bio; + } else { + job->bidi_rq = NULL; + job->bidi_bio = NULL; + } + + return 0; + +out_free_bidi_rq: + if (job->bidi_rq) + blk_put_request(job->bidi_rq); +out: + kfree(job->request); + return ret; +} + +static int bsg_transport_complete_rq(struct request *rq, struct sg_io_v4 *hdr) +{ + struct bsg_job *job = blk_mq_rq_to_pdu(rq); + int ret = 0; + + /* + * The assignments below don't make much sense, but are kept for + * bug by bug backwards compatibility: + */ + hdr->device_status = job->result & 0xff; + hdr->transport_status = host_byte(job->result); + hdr->driver_status = driver_byte(job->result); + hdr->info = 0; + if (hdr->device_status || hdr->transport_status || hdr->driver_status) + hdr->info |= SG_INFO_CHECK; + hdr->response_len = 0; + + if (job->result < 0) { + /* we're only returning the result field in the reply */ + job->reply_len = sizeof(u32); + ret = job->result; + } + + if (job->reply_len && hdr->response) { + int len = min(hdr->max_response_len, job->reply_len); + + if (copy_to_user(uptr64(hdr->response), job->reply, len)) + ret = -EFAULT; + else + hdr->response_len = len; + } + + /* we assume all request payload was transferred, residual == 0 */ + hdr->dout_resid = 0; + + if (job->bidi_rq) { + unsigned int rsp_len = job->reply_payload.payload_len; + + if (WARN_ON(job->reply_payload_rcv_len > rsp_len)) + hdr->din_resid = 0; + else + hdr->din_resid = rsp_len - job->reply_payload_rcv_len; + } else { + hdr->din_resid = 0; + } + + return ret; +} + +static void bsg_transport_free_rq(struct request *rq) +{ + struct bsg_job *job = blk_mq_rq_to_pdu(rq); + + if (job->bidi_rq) { + blk_rq_unmap_user(job->bidi_bio); + blk_put_request(job->bidi_rq); + } + + kfree(job->request); +} + +static const struct bsg_ops bsg_transport_ops = { + .check_proto = bsg_transport_check_proto, + .fill_hdr = bsg_transport_fill_hdr, + .complete_rq = bsg_transport_complete_rq, + .free_rq = bsg_transport_free_rq, +}; + +/** + * bsg_teardown_job - routine to teardown a bsg job + * @kref: kref inside bsg_job that is to be torn down + */ +static void bsg_teardown_job(struct kref *kref) +{ + struct bsg_job *job = container_of(kref, struct bsg_job, kref); + struct request *rq = blk_mq_rq_from_pdu(job); + + put_device(job->dev); /* release reference for the request */ + + kfree(job->request_payload.sg_list); + kfree(job->reply_payload.sg_list); + + blk_mq_end_request(rq, BLK_STS_OK); +} + +void bsg_job_put(struct bsg_job *job) +{ + kref_put(&job->kref, bsg_teardown_job); +} +EXPORT_SYMBOL_GPL(bsg_job_put); + +int bsg_job_get(struct bsg_job *job) +{ + return kref_get_unless_zero(&job->kref); +} +EXPORT_SYMBOL_GPL(bsg_job_get); + +/** + * bsg_job_done - completion routine for bsg requests + * @job: bsg_job that is complete + * @result: job reply result + * @reply_payload_rcv_len: length of payload recvd + * + * The LLD should call this when the bsg job has completed. + */ +void bsg_job_done(struct bsg_job *job, int result, + unsigned int reply_payload_rcv_len) +{ + struct request *rq = blk_mq_rq_from_pdu(job); + + job->result = result; + job->reply_payload_rcv_len = reply_payload_rcv_len; + if (likely(!blk_should_fake_timeout(rq->q))) + blk_mq_complete_request(rq); +} +EXPORT_SYMBOL_GPL(bsg_job_done); + +/** + * bsg_complete - softirq done routine for destroying the bsg requests + * @rq: BSG request that holds the job to be destroyed + */ +static void bsg_complete(struct request *rq) +{ + struct bsg_job *job = blk_mq_rq_to_pdu(rq); + + bsg_job_put(job); +} + +static int bsg_map_buffer(struct bsg_buffer *buf, struct request *req) +{ + size_t sz = (sizeof(struct scatterlist) * req->nr_phys_segments); + + BUG_ON(!req->nr_phys_segments); + + buf->sg_list = kmalloc(sz, GFP_KERNEL); + if (!buf->sg_list) + return -ENOMEM; + sg_init_table(buf->sg_list, req->nr_phys_segments); + buf->sg_cnt = blk_rq_map_sg(req->q, req, buf->sg_list); + buf->payload_len = blk_rq_bytes(req); + return 0; +} + +/** + * bsg_prepare_job - create the bsg_job structure for the bsg request + * @dev: device that is being sent the bsg request + * @req: BSG request that needs a job structure + */ +static bool bsg_prepare_job(struct device *dev, struct request *req) +{ + struct bsg_job *job = blk_mq_rq_to_pdu(req); + int ret; + + job->timeout = req->timeout; + + if (req->bio) { + ret = bsg_map_buffer(&job->request_payload, req); + if (ret) + goto failjob_rls_job; + } + if (job->bidi_rq) { + ret = bsg_map_buffer(&job->reply_payload, job->bidi_rq); + if (ret) + goto failjob_rls_rqst_payload; + } + job->dev = dev; + /* take a reference for the request */ + get_device(job->dev); + kref_init(&job->kref); + return true; + +failjob_rls_rqst_payload: + kfree(job->request_payload.sg_list); +failjob_rls_job: + job->result = -ENOMEM; + return false; +} + +/** + * bsg_queue_rq - generic handler for bsg requests + * @hctx: hardware queue + * @bd: queue data + * + * On error the create_bsg_job function should return a -Exyz error value + * that will be set to ->result. + * + * Drivers/subsys should pass this to the queue init function. + */ +static blk_status_t bsg_queue_rq(struct blk_mq_hw_ctx *hctx, + const struct blk_mq_queue_data *bd) +{ + struct request_queue *q = hctx->queue; + struct device *dev = q->queuedata; + struct request *req = bd->rq; + struct bsg_set *bset = + container_of(q->tag_set, struct bsg_set, tag_set); + blk_status_t sts = BLK_STS_IOERR; + int ret; + + blk_mq_start_request(req); + + if (!get_device(dev)) + return BLK_STS_IOERR; + + if (!bsg_prepare_job(dev, req)) + goto out; + + ret = bset->job_fn(blk_mq_rq_to_pdu(req)); + if (!ret) + sts = BLK_STS_OK; + +out: + put_device(dev); + return sts; +} + +/* called right after the request is allocated for the request_queue */ +static int bsg_init_rq(struct blk_mq_tag_set *set, struct request *req, + unsigned int hctx_idx, unsigned int numa_node) +{ + struct bsg_job *job = blk_mq_rq_to_pdu(req); + + job->reply = kzalloc(SCSI_SENSE_BUFFERSIZE, GFP_KERNEL); + if (!job->reply) + return -ENOMEM; + return 0; +} + +/* called right before the request is given to the request_queue user */ +static void bsg_initialize_rq(struct request *req) +{ + struct bsg_job *job = blk_mq_rq_to_pdu(req); + void *reply = job->reply; + + memset(job, 0, sizeof(*job)); + job->reply = reply; + job->reply_len = SCSI_SENSE_BUFFERSIZE; + job->dd_data = job + 1; +} + +static void bsg_exit_rq(struct blk_mq_tag_set *set, struct request *req, + unsigned int hctx_idx) +{ + struct bsg_job *job = blk_mq_rq_to_pdu(req); + + kfree(job->reply); +} + +void bsg_remove_queue(struct request_queue *q) +{ + if (q) { + struct bsg_set *bset = + container_of(q->tag_set, struct bsg_set, tag_set); + + bsg_unregister_queue(q); + blk_cleanup_queue(q); + blk_mq_free_tag_set(&bset->tag_set); + kfree(bset); + } +} +EXPORT_SYMBOL_GPL(bsg_remove_queue); + +static enum blk_eh_timer_return bsg_timeout(struct request *rq, bool reserved) +{ + struct bsg_set *bset = + container_of(rq->q->tag_set, struct bsg_set, tag_set); + + if (!bset->timeout_fn) + return BLK_EH_DONE; + return bset->timeout_fn(rq); +} + +static const struct blk_mq_ops bsg_mq_ops = { + .queue_rq = bsg_queue_rq, + .init_request = bsg_init_rq, + .exit_request = bsg_exit_rq, + .initialize_rq_fn = bsg_initialize_rq, + .complete = bsg_complete, + .timeout = bsg_timeout, +}; + +/** + * bsg_setup_queue - Create and add the bsg hooks so we can receive requests + * @dev: device to attach bsg device to + * @name: device to give bsg device + * @job_fn: bsg job handler + * @timeout: timeout handler function pointer + * @dd_job_size: size of LLD data needed for each job + */ +struct request_queue *bsg_setup_queue(struct device *dev, const char *name, + bsg_job_fn *job_fn, bsg_timeout_fn *timeout, int dd_job_size) +{ + struct bsg_set *bset; + struct blk_mq_tag_set *set; + struct request_queue *q; + int ret = -ENOMEM; + + bset = kzalloc(sizeof(*bset), GFP_KERNEL); + if (!bset) + return ERR_PTR(-ENOMEM); + + bset->job_fn = job_fn; + bset->timeout_fn = timeout; + + set = &bset->tag_set; + set->ops = &bsg_mq_ops; + set->nr_hw_queues = 1; + set->queue_depth = 128; + set->numa_node = NUMA_NO_NODE; + set->cmd_size = sizeof(struct bsg_job) + dd_job_size; + set->flags = BLK_MQ_F_NO_SCHED | BLK_MQ_F_BLOCKING; + if (blk_mq_alloc_tag_set(set)) + goto out_tag_set; + + q = blk_mq_init_queue(set); + if (IS_ERR(q)) { + ret = PTR_ERR(q); + goto out_queue; + } + + q->queuedata = dev; + blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT); + + ret = bsg_register_queue(q, dev, name, &bsg_transport_ops); + if (ret) { + printk(KERN_ERR "%s: bsg interface failed to " + "initialize - register queue\n", dev->kobj.name); + goto out_cleanup_queue; + } + + return q; +out_cleanup_queue: + blk_cleanup_queue(q); +out_queue: + blk_mq_free_tag_set(set); +out_tag_set: + kfree(bset); + return ERR_PTR(ret); +} +EXPORT_SYMBOL_GPL(bsg_setup_queue); diff --git a/block/bsg.c b/block/bsg.c new file mode 100644 index 000000000..2cbc1fcc8 --- /dev/null +++ b/block/bsg.c @@ -0,0 +1,525 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * bsg.c - block layer implementation of the sg v4 interface + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#define BSG_DESCRIPTION "Block layer SCSI generic (bsg) driver" +#define BSG_VERSION "0.4" + +#define bsg_dbg(bd, fmt, ...) \ + pr_debug("%s: " fmt, (bd)->name, ##__VA_ARGS__) + +struct bsg_device { + struct request_queue *queue; + spinlock_t lock; + struct hlist_node dev_list; + refcount_t ref_count; + char name[20]; + int max_queue; +}; + +#define BSG_DEFAULT_CMDS 64 +#define BSG_MAX_DEVS 32768 + +static DEFINE_MUTEX(bsg_mutex); +static DEFINE_IDR(bsg_minor_idr); + +#define BSG_LIST_ARRAY_SIZE 8 +static struct hlist_head bsg_device_list[BSG_LIST_ARRAY_SIZE]; + +static struct class *bsg_class; +static int bsg_major; + +static inline struct hlist_head *bsg_dev_idx_hash(int index) +{ + return &bsg_device_list[index & (BSG_LIST_ARRAY_SIZE - 1)]; +} + +#define uptr64(val) ((void __user *)(uintptr_t)(val)) + +static int bsg_scsi_check_proto(struct sg_io_v4 *hdr) +{ + if (hdr->protocol != BSG_PROTOCOL_SCSI || + hdr->subprotocol != BSG_SUB_PROTOCOL_SCSI_CMD) + return -EINVAL; + return 0; +} + +static int bsg_scsi_fill_hdr(struct request *rq, struct sg_io_v4 *hdr, + fmode_t mode) +{ + struct scsi_request *sreq = scsi_req(rq); + + if (hdr->dout_xfer_len && hdr->din_xfer_len) { + pr_warn_once("BIDI support in bsg has been removed.\n"); + return -EOPNOTSUPP; + } + + sreq->cmd_len = hdr->request_len; + if (sreq->cmd_len > BLK_MAX_CDB) { + sreq->cmd = kzalloc(sreq->cmd_len, GFP_KERNEL); + if (!sreq->cmd) + return -ENOMEM; + } + + if (copy_from_user(sreq->cmd, uptr64(hdr->request), sreq->cmd_len)) + return -EFAULT; + if (blk_verify_command(sreq->cmd, mode)) + return -EPERM; + return 0; +} + +static int bsg_scsi_complete_rq(struct request *rq, struct sg_io_v4 *hdr) +{ + struct scsi_request *sreq = scsi_req(rq); + int ret = 0; + + /* + * fill in all the output members + */ + hdr->device_status = sreq->result & 0xff; + hdr->transport_status = host_byte(sreq->result); + hdr->driver_status = driver_byte(sreq->result); + hdr->info = 0; + if (hdr->device_status || hdr->transport_status || hdr->driver_status) + hdr->info |= SG_INFO_CHECK; + hdr->response_len = 0; + + if (sreq->sense_len && hdr->response) { + int len = min_t(unsigned int, hdr->max_response_len, + sreq->sense_len); + + if (copy_to_user(uptr64(hdr->response), sreq->sense, len)) + ret = -EFAULT; + else + hdr->response_len = len; + } + + if (rq_data_dir(rq) == READ) + hdr->din_resid = sreq->resid_len; + else + hdr->dout_resid = sreq->resid_len; + + return ret; +} + +static void bsg_scsi_free_rq(struct request *rq) +{ + scsi_req_free_cmd(scsi_req(rq)); +} + +static const struct bsg_ops bsg_scsi_ops = { + .check_proto = bsg_scsi_check_proto, + .fill_hdr = bsg_scsi_fill_hdr, + .complete_rq = bsg_scsi_complete_rq, + .free_rq = bsg_scsi_free_rq, +}; + +static int bsg_sg_io(struct request_queue *q, fmode_t mode, void __user *uarg) +{ + struct request *rq; + struct bio *bio; + struct sg_io_v4 hdr; + int ret; + + if (copy_from_user(&hdr, uarg, sizeof(hdr))) + return -EFAULT; + + if (!q->bsg_dev.class_dev) + return -ENXIO; + + if (hdr.guard != 'Q') + return -EINVAL; + ret = q->bsg_dev.ops->check_proto(&hdr); + if (ret) + return ret; + + rq = blk_get_request(q, hdr.dout_xfer_len ? + REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, 0); + if (IS_ERR(rq)) + return PTR_ERR(rq); + + ret = q->bsg_dev.ops->fill_hdr(rq, &hdr, mode); + if (ret) { + blk_put_request(rq); + return ret; + } + + rq->timeout = msecs_to_jiffies(hdr.timeout); + if (!rq->timeout) + rq->timeout = q->sg_timeout; + if (!rq->timeout) + rq->timeout = BLK_DEFAULT_SG_TIMEOUT; + if (rq->timeout < BLK_MIN_SG_TIMEOUT) + rq->timeout = BLK_MIN_SG_TIMEOUT; + + if (hdr.dout_xfer_len) { + ret = blk_rq_map_user(q, rq, NULL, uptr64(hdr.dout_xferp), + hdr.dout_xfer_len, GFP_KERNEL); + } else if (hdr.din_xfer_len) { + ret = blk_rq_map_user(q, rq, NULL, uptr64(hdr.din_xferp), + hdr.din_xfer_len, GFP_KERNEL); + } + + if (ret) + goto out_free_rq; + + bio = rq->bio; + + blk_execute_rq(q, NULL, rq, !(hdr.flags & BSG_FLAG_Q_AT_TAIL)); + ret = rq->q->bsg_dev.ops->complete_rq(rq, &hdr); + blk_rq_unmap_user(bio); + +out_free_rq: + rq->q->bsg_dev.ops->free_rq(rq); + blk_put_request(rq); + if (!ret && copy_to_user(uarg, &hdr, sizeof(hdr))) + return -EFAULT; + return ret; +} + +static struct bsg_device *bsg_alloc_device(void) +{ + struct bsg_device *bd; + + bd = kzalloc(sizeof(struct bsg_device), GFP_KERNEL); + if (unlikely(!bd)) + return NULL; + + spin_lock_init(&bd->lock); + bd->max_queue = BSG_DEFAULT_CMDS; + INIT_HLIST_NODE(&bd->dev_list); + return bd; +} + +static int bsg_put_device(struct bsg_device *bd) +{ + struct request_queue *q = bd->queue; + + mutex_lock(&bsg_mutex); + + if (!refcount_dec_and_test(&bd->ref_count)) { + mutex_unlock(&bsg_mutex); + return 0; + } + + hlist_del(&bd->dev_list); + mutex_unlock(&bsg_mutex); + + bsg_dbg(bd, "tearing down\n"); + + /* + * close can always block + */ + kfree(bd); + blk_put_queue(q); + return 0; +} + +static struct bsg_device *bsg_add_device(struct inode *inode, + struct request_queue *rq, + struct file *file) +{ + struct bsg_device *bd; + unsigned char buf[32]; + + lockdep_assert_held(&bsg_mutex); + + if (!blk_get_queue(rq)) + return ERR_PTR(-ENXIO); + + bd = bsg_alloc_device(); + if (!bd) { + blk_put_queue(rq); + return ERR_PTR(-ENOMEM); + } + + bd->queue = rq; + + refcount_set(&bd->ref_count, 1); + hlist_add_head(&bd->dev_list, bsg_dev_idx_hash(iminor(inode))); + + strncpy(bd->name, dev_name(rq->bsg_dev.class_dev), sizeof(bd->name) - 1); + bsg_dbg(bd, "bound to <%s>, max queue %d\n", + format_dev_t(buf, inode->i_rdev), bd->max_queue); + + return bd; +} + +static struct bsg_device *__bsg_get_device(int minor, struct request_queue *q) +{ + struct bsg_device *bd; + + lockdep_assert_held(&bsg_mutex); + + hlist_for_each_entry(bd, bsg_dev_idx_hash(minor), dev_list) { + if (bd->queue == q) { + refcount_inc(&bd->ref_count); + goto found; + } + } + bd = NULL; +found: + return bd; +} + +static struct bsg_device *bsg_get_device(struct inode *inode, struct file *file) +{ + struct bsg_device *bd; + struct bsg_class_device *bcd; + + /* + * find the class device + */ + mutex_lock(&bsg_mutex); + bcd = idr_find(&bsg_minor_idr, iminor(inode)); + + if (!bcd) { + bd = ERR_PTR(-ENODEV); + goto out_unlock; + } + + bd = __bsg_get_device(iminor(inode), bcd->queue); + if (!bd) + bd = bsg_add_device(inode, bcd->queue, file); + +out_unlock: + mutex_unlock(&bsg_mutex); + return bd; +} + +static int bsg_open(struct inode *inode, struct file *file) +{ + struct bsg_device *bd; + + bd = bsg_get_device(inode, file); + + if (IS_ERR(bd)) + return PTR_ERR(bd); + + file->private_data = bd; + return 0; +} + +static int bsg_release(struct inode *inode, struct file *file) +{ + struct bsg_device *bd = file->private_data; + + file->private_data = NULL; + return bsg_put_device(bd); +} + +static int bsg_get_command_q(struct bsg_device *bd, int __user *uarg) +{ + return put_user(bd->max_queue, uarg); +} + +static int bsg_set_command_q(struct bsg_device *bd, int __user *uarg) +{ + int queue; + + if (get_user(queue, uarg)) + return -EFAULT; + if (queue < 1) + return -EINVAL; + + spin_lock_irq(&bd->lock); + bd->max_queue = queue; + spin_unlock_irq(&bd->lock); + return 0; +} + +static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct bsg_device *bd = file->private_data; + void __user *uarg = (void __user *) arg; + + switch (cmd) { + /* + * Our own ioctls + */ + case SG_GET_COMMAND_Q: + return bsg_get_command_q(bd, uarg); + case SG_SET_COMMAND_Q: + return bsg_set_command_q(bd, uarg); + + /* + * SCSI/sg ioctls + */ + case SG_GET_VERSION_NUM: + case SCSI_IOCTL_GET_IDLUN: + case SCSI_IOCTL_GET_BUS_NUMBER: + case SG_SET_TIMEOUT: + case SG_GET_TIMEOUT: + case SG_GET_RESERVED_SIZE: + case SG_SET_RESERVED_SIZE: + case SG_EMULATED_HOST: + return scsi_cmd_ioctl(bd->queue, NULL, file->f_mode, cmd, uarg); + case SG_IO: + return bsg_sg_io(bd->queue, file->f_mode, uarg); + case SCSI_IOCTL_SEND_COMMAND: + pr_warn_ratelimited("%s: calling unsupported SCSI_IOCTL_SEND_COMMAND\n", + current->comm); + return -EINVAL; + default: + return -ENOTTY; + } +} + +static const struct file_operations bsg_fops = { + .open = bsg_open, + .release = bsg_release, + .unlocked_ioctl = bsg_ioctl, + .compat_ioctl = compat_ptr_ioctl, + .owner = THIS_MODULE, + .llseek = default_llseek, +}; + +void bsg_unregister_queue(struct request_queue *q) +{ + struct bsg_class_device *bcd = &q->bsg_dev; + + if (!bcd->class_dev) + return; + + mutex_lock(&bsg_mutex); + idr_remove(&bsg_minor_idr, bcd->minor); + if (q->kobj.sd) + sysfs_remove_link(&q->kobj, "bsg"); + device_unregister(bcd->class_dev); + bcd->class_dev = NULL; + mutex_unlock(&bsg_mutex); +} +EXPORT_SYMBOL_GPL(bsg_unregister_queue); + +int bsg_register_queue(struct request_queue *q, struct device *parent, + const char *name, const struct bsg_ops *ops) +{ + struct bsg_class_device *bcd; + dev_t dev; + int ret; + struct device *class_dev = NULL; + + /* + * we need a proper transport to send commands, not a stacked device + */ + if (!queue_is_mq(q)) + return 0; + + bcd = &q->bsg_dev; + memset(bcd, 0, sizeof(*bcd)); + + mutex_lock(&bsg_mutex); + + ret = idr_alloc(&bsg_minor_idr, bcd, 0, BSG_MAX_DEVS, GFP_KERNEL); + if (ret < 0) { + if (ret == -ENOSPC) { + printk(KERN_ERR "bsg: too many bsg devices\n"); + ret = -EINVAL; + } + goto unlock; + } + + bcd->minor = ret; + bcd->queue = q; + bcd->ops = ops; + dev = MKDEV(bsg_major, bcd->minor); + class_dev = device_create(bsg_class, parent, dev, NULL, "%s", name); + if (IS_ERR(class_dev)) { + ret = PTR_ERR(class_dev); + goto idr_remove; + } + bcd->class_dev = class_dev; + + if (q->kobj.sd) { + ret = sysfs_create_link(&q->kobj, &bcd->class_dev->kobj, "bsg"); + if (ret) + goto unregister_class_dev; + } + + mutex_unlock(&bsg_mutex); + return 0; + +unregister_class_dev: + device_unregister(class_dev); +idr_remove: + idr_remove(&bsg_minor_idr, bcd->minor); +unlock: + mutex_unlock(&bsg_mutex); + return ret; +} + +int bsg_scsi_register_queue(struct request_queue *q, struct device *parent) +{ + if (!blk_queue_scsi_passthrough(q)) { + WARN_ONCE(true, "Attempt to register a non-SCSI queue\n"); + return -EINVAL; + } + + return bsg_register_queue(q, parent, dev_name(parent), &bsg_scsi_ops); +} +EXPORT_SYMBOL_GPL(bsg_scsi_register_queue); + +static struct cdev bsg_cdev; + +static char *bsg_devnode(struct device *dev, umode_t *mode) +{ + return kasprintf(GFP_KERNEL, "bsg/%s", dev_name(dev)); +} + +static int __init bsg_init(void) +{ + int ret, i; + dev_t devid; + + for (i = 0; i < BSG_LIST_ARRAY_SIZE; i++) + INIT_HLIST_HEAD(&bsg_device_list[i]); + + bsg_class = class_create(THIS_MODULE, "bsg"); + if (IS_ERR(bsg_class)) + return PTR_ERR(bsg_class); + bsg_class->devnode = bsg_devnode; + + ret = alloc_chrdev_region(&devid, 0, BSG_MAX_DEVS, "bsg"); + if (ret) + goto destroy_bsg_class; + + bsg_major = MAJOR(devid); + + cdev_init(&bsg_cdev, &bsg_fops); + ret = cdev_add(&bsg_cdev, MKDEV(bsg_major, 0), BSG_MAX_DEVS); + if (ret) + goto unregister_chrdev; + + printk(KERN_INFO BSG_DESCRIPTION " version " BSG_VERSION + " loaded (major %d)\n", bsg_major); + return 0; +unregister_chrdev: + unregister_chrdev_region(MKDEV(bsg_major, 0), BSG_MAX_DEVS); +destroy_bsg_class: + class_destroy(bsg_class); + return ret; +} + +MODULE_AUTHOR("Jens Axboe"); +MODULE_DESCRIPTION(BSG_DESCRIPTION); +MODULE_LICENSE("GPL"); + +device_initcall(bsg_init); diff --git a/block/cmdline-parser.c b/block/cmdline-parser.c new file mode 100644 index 000000000..f2a145718 --- /dev/null +++ b/block/cmdline-parser.c @@ -0,0 +1,255 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Parse command line, get partition information + * + * Written by Cai Zhiyong + * + */ +#include +#include + +static int parse_subpart(struct cmdline_subpart **subpart, char *partdef) +{ + int ret = 0; + struct cmdline_subpart *new_subpart; + + *subpart = NULL; + + new_subpart = kzalloc(sizeof(struct cmdline_subpart), GFP_KERNEL); + if (!new_subpart) + return -ENOMEM; + + if (*partdef == '-') { + new_subpart->size = (sector_t)(~0ULL); + partdef++; + } else { + new_subpart->size = (sector_t)memparse(partdef, &partdef); + if (new_subpart->size < (sector_t)PAGE_SIZE) { + pr_warn("cmdline partition size is invalid."); + ret = -EINVAL; + goto fail; + } + } + + if (*partdef == '@') { + partdef++; + new_subpart->from = (sector_t)memparse(partdef, &partdef); + } else { + new_subpart->from = (sector_t)(~0ULL); + } + + if (*partdef == '(') { + int length; + char *next = strchr(++partdef, ')'); + + if (!next) { + pr_warn("cmdline partition format is invalid."); + ret = -EINVAL; + goto fail; + } + + length = min_t(int, next - partdef, + sizeof(new_subpart->name) - 1); + strncpy(new_subpart->name, partdef, length); + new_subpart->name[length] = '\0'; + + partdef = ++next; + } else + new_subpart->name[0] = '\0'; + + new_subpart->flags = 0; + + if (!strncmp(partdef, "ro", 2)) { + new_subpart->flags |= PF_RDONLY; + partdef += 2; + } + + if (!strncmp(partdef, "lk", 2)) { + new_subpart->flags |= PF_POWERUP_LOCK; + partdef += 2; + } + + *subpart = new_subpart; + return 0; +fail: + kfree(new_subpart); + return ret; +} + +static void free_subpart(struct cmdline_parts *parts) +{ + struct cmdline_subpart *subpart; + + while (parts->subpart) { + subpart = parts->subpart; + parts->subpart = subpart->next_subpart; + kfree(subpart); + } +} + +static int parse_parts(struct cmdline_parts **parts, const char *bdevdef) +{ + int ret = -EINVAL; + char *next; + int length; + struct cmdline_subpart **next_subpart; + struct cmdline_parts *newparts; + char buf[BDEVNAME_SIZE + 32 + 4]; + + *parts = NULL; + + newparts = kzalloc(sizeof(struct cmdline_parts), GFP_KERNEL); + if (!newparts) + return -ENOMEM; + + next = strchr(bdevdef, ':'); + if (!next) { + pr_warn("cmdline partition has no block device."); + goto fail; + } + + length = min_t(int, next - bdevdef, sizeof(newparts->name) - 1); + strncpy(newparts->name, bdevdef, length); + newparts->name[length] = '\0'; + newparts->nr_subparts = 0; + + next_subpart = &newparts->subpart; + + while (next && *(++next)) { + bdevdef = next; + next = strchr(bdevdef, ','); + + length = (!next) ? (sizeof(buf) - 1) : + min_t(int, next - bdevdef, sizeof(buf) - 1); + + strncpy(buf, bdevdef, length); + buf[length] = '\0'; + + ret = parse_subpart(next_subpart, buf); + if (ret) + goto fail; + + newparts->nr_subparts++; + next_subpart = &(*next_subpart)->next_subpart; + } + + if (!newparts->subpart) { + pr_warn("cmdline partition has no valid partition."); + ret = -EINVAL; + goto fail; + } + + *parts = newparts; + + return 0; +fail: + free_subpart(newparts); + kfree(newparts); + return ret; +} + +void cmdline_parts_free(struct cmdline_parts **parts) +{ + struct cmdline_parts *next_parts; + + while (*parts) { + next_parts = (*parts)->next_parts; + free_subpart(*parts); + kfree(*parts); + *parts = next_parts; + } +} +EXPORT_SYMBOL(cmdline_parts_free); + +int cmdline_parts_parse(struct cmdline_parts **parts, const char *cmdline) +{ + int ret; + char *buf; + char *pbuf; + char *next; + struct cmdline_parts **next_parts; + + *parts = NULL; + + next = pbuf = buf = kstrdup(cmdline, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + next_parts = parts; + + while (next && *pbuf) { + next = strchr(pbuf, ';'); + if (next) + *next = '\0'; + + ret = parse_parts(next_parts, pbuf); + if (ret) + goto fail; + + if (next) + pbuf = ++next; + + next_parts = &(*next_parts)->next_parts; + } + + if (!*parts) { + pr_warn("cmdline partition has no valid partition."); + ret = -EINVAL; + goto fail; + } + + ret = 0; +done: + kfree(buf); + return ret; + +fail: + cmdline_parts_free(parts); + goto done; +} +EXPORT_SYMBOL(cmdline_parts_parse); + +struct cmdline_parts *cmdline_parts_find(struct cmdline_parts *parts, + const char *bdev) +{ + while (parts && strncmp(bdev, parts->name, sizeof(parts->name))) + parts = parts->next_parts; + return parts; +} +EXPORT_SYMBOL(cmdline_parts_find); + +/* + * add_part() + * 0 success. + * 1 can not add so many partitions. + */ +int cmdline_parts_set(struct cmdline_parts *parts, sector_t disk_size, + int slot, + int (*add_part)(int, struct cmdline_subpart *, void *), + void *param) +{ + sector_t from = 0; + struct cmdline_subpart *subpart; + + for (subpart = parts->subpart; subpart; + subpart = subpart->next_subpart, slot++) { + if (subpart->from == (sector_t)(~0ULL)) + subpart->from = from; + else + from = subpart->from; + + if (from >= disk_size) + break; + + if (subpart->size > (disk_size - from)) + subpart->size = disk_size - from; + + from += subpart->size; + + if (add_part(slot, subpart, param)) + break; + } + + return slot; +} +EXPORT_SYMBOL(cmdline_parts_set); diff --git a/block/elevator.c b/block/elevator.c new file mode 100644 index 000000000..2f962662c --- /dev/null +++ b/block/elevator.c @@ -0,0 +1,837 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Block device elevator/IO-scheduler. + * + * Copyright (C) 2000 Andrea Arcangeli SuSE + * + * 30042000 Jens Axboe : + * + * Split the elevator a bit so that it is possible to choose a different + * one or even write a new "plug in". There are three pieces: + * - elevator_fn, inserts a new request in the queue list + * - elevator_merge_fn, decides whether a new buffer can be merged with + * an existing request + * - elevator_dequeue_fn, called when a request is taken off the active list + * + * 20082000 Dave Jones : + * Removed tests for max-bomb-segments, which was breaking elvtune + * when run without -bN + * + * Jens: + * - Rework again to work with bio instead of buffer_heads + * - loose bi_dev comparisons, partition handling is right now + * - completely modularize elevator setup and teardown + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "blk.h" +#include "blk-mq-sched.h" +#include "blk-pm.h" +#include "blk-wbt.h" + +static DEFINE_SPINLOCK(elv_list_lock); +static LIST_HEAD(elv_list); + +/* + * Merge hash stuff. + */ +#define rq_hash_key(rq) (blk_rq_pos(rq) + blk_rq_sectors(rq)) + +/* + * Query io scheduler to see if the current process issuing bio may be + * merged with rq. + */ +static int elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio) +{ + struct request_queue *q = rq->q; + struct elevator_queue *e = q->elevator; + + if (e->type->ops.allow_merge) + return e->type->ops.allow_merge(q, rq, bio); + + return 1; +} + +/* + * can we safely merge with this request? + */ +bool elv_bio_merge_ok(struct request *rq, struct bio *bio) +{ + if (!blk_rq_merge_ok(rq, bio)) + return false; + + if (!elv_iosched_allow_bio_merge(rq, bio)) + return false; + + return true; +} +EXPORT_SYMBOL(elv_bio_merge_ok); + +static inline bool elv_support_features(unsigned int elv_features, + unsigned int required_features) +{ + return (required_features & elv_features) == required_features; +} + +/** + * elevator_match - Test an elevator name and features + * @e: Scheduler to test + * @name: Elevator name to test + * @required_features: Features that the elevator must provide + * + * Return true if the elevator @e name matches @name and if @e provides all + * the features specified by @required_features. + */ +static bool elevator_match(const struct elevator_type *e, const char *name, + unsigned int required_features) +{ + if (!elv_support_features(e->elevator_features, required_features)) + return false; + if (!strcmp(e->elevator_name, name)) + return true; + if (e->elevator_alias && !strcmp(e->elevator_alias, name)) + return true; + + return false; +} + +/** + * elevator_find - Find an elevator + * @name: Name of the elevator to find + * @required_features: Features that the elevator must provide + * + * Return the first registered scheduler with name @name and supporting the + * features @required_features and NULL otherwise. + */ +static struct elevator_type *elevator_find(const char *name, + unsigned int required_features) +{ + struct elevator_type *e; + + list_for_each_entry(e, &elv_list, list) { + if (elevator_match(e, name, required_features)) + return e; + } + + return NULL; +} + +static void elevator_put(struct elevator_type *e) +{ + module_put(e->elevator_owner); +} + +static struct elevator_type *elevator_get(struct request_queue *q, + const char *name, bool try_loading) +{ + struct elevator_type *e; + + spin_lock(&elv_list_lock); + + e = elevator_find(name, q->required_elevator_features); + if (!e && try_loading) { + spin_unlock(&elv_list_lock); + request_module("%s-iosched", name); + spin_lock(&elv_list_lock); + e = elevator_find(name, q->required_elevator_features); + } + + if (e && !try_module_get(e->elevator_owner)) + e = NULL; + + spin_unlock(&elv_list_lock); + return e; +} + +static struct kobj_type elv_ktype; + +struct elevator_queue *elevator_alloc(struct request_queue *q, + struct elevator_type *e) +{ + struct elevator_queue *eq; + + eq = kzalloc_node(sizeof(*eq), GFP_KERNEL, q->node); + if (unlikely(!eq)) + return NULL; + + eq->type = e; + kobject_init(&eq->kobj, &elv_ktype); + mutex_init(&eq->sysfs_lock); + hash_init(eq->hash); + + return eq; +} +EXPORT_SYMBOL(elevator_alloc); + +static void elevator_release(struct kobject *kobj) +{ + struct elevator_queue *e; + + e = container_of(kobj, struct elevator_queue, kobj); + elevator_put(e->type); + kfree(e); +} + +void __elevator_exit(struct request_queue *q, struct elevator_queue *e) +{ + mutex_lock(&e->sysfs_lock); + blk_mq_exit_sched(q, e); + mutex_unlock(&e->sysfs_lock); + + kobject_put(&e->kobj); +} + +static inline void __elv_rqhash_del(struct request *rq) +{ + hash_del(&rq->hash); + rq->rq_flags &= ~RQF_HASHED; +} + +void elv_rqhash_del(struct request_queue *q, struct request *rq) +{ + if (ELV_ON_HASH(rq)) + __elv_rqhash_del(rq); +} +EXPORT_SYMBOL_GPL(elv_rqhash_del); + +void elv_rqhash_add(struct request_queue *q, struct request *rq) +{ + struct elevator_queue *e = q->elevator; + + BUG_ON(ELV_ON_HASH(rq)); + hash_add(e->hash, &rq->hash, rq_hash_key(rq)); + rq->rq_flags |= RQF_HASHED; +} +EXPORT_SYMBOL_GPL(elv_rqhash_add); + +void elv_rqhash_reposition(struct request_queue *q, struct request *rq) +{ + __elv_rqhash_del(rq); + elv_rqhash_add(q, rq); +} + +struct request *elv_rqhash_find(struct request_queue *q, sector_t offset) +{ + struct elevator_queue *e = q->elevator; + struct hlist_node *next; + struct request *rq; + + hash_for_each_possible_safe(e->hash, rq, next, hash, offset) { + BUG_ON(!ELV_ON_HASH(rq)); + + if (unlikely(!rq_mergeable(rq))) { + __elv_rqhash_del(rq); + continue; + } + + if (rq_hash_key(rq) == offset) + return rq; + } + + return NULL; +} + +/* + * RB-tree support functions for inserting/lookup/removal of requests + * in a sorted RB tree. + */ +void elv_rb_add(struct rb_root *root, struct request *rq) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct request *__rq; + + while (*p) { + parent = *p; + __rq = rb_entry(parent, struct request, rb_node); + + if (blk_rq_pos(rq) < blk_rq_pos(__rq)) + p = &(*p)->rb_left; + else if (blk_rq_pos(rq) >= blk_rq_pos(__rq)) + p = &(*p)->rb_right; + } + + rb_link_node(&rq->rb_node, parent, p); + rb_insert_color(&rq->rb_node, root); +} +EXPORT_SYMBOL(elv_rb_add); + +void elv_rb_del(struct rb_root *root, struct request *rq) +{ + BUG_ON(RB_EMPTY_NODE(&rq->rb_node)); + rb_erase(&rq->rb_node, root); + RB_CLEAR_NODE(&rq->rb_node); +} +EXPORT_SYMBOL(elv_rb_del); + +struct request *elv_rb_find(struct rb_root *root, sector_t sector) +{ + struct rb_node *n = root->rb_node; + struct request *rq; + + while (n) { + rq = rb_entry(n, struct request, rb_node); + + if (sector < blk_rq_pos(rq)) + n = n->rb_left; + else if (sector > blk_rq_pos(rq)) + n = n->rb_right; + else + return rq; + } + + return NULL; +} +EXPORT_SYMBOL(elv_rb_find); + +enum elv_merge elv_merge(struct request_queue *q, struct request **req, + struct bio *bio) +{ + struct elevator_queue *e = q->elevator; + struct request *__rq; + + /* + * Levels of merges: + * nomerges: No merges at all attempted + * noxmerges: Only simple one-hit cache try + * merges: All merge tries attempted + */ + if (blk_queue_nomerges(q) || !bio_mergeable(bio)) + return ELEVATOR_NO_MERGE; + + /* + * First try one-hit cache. + */ + if (q->last_merge && elv_bio_merge_ok(q->last_merge, bio)) { + enum elv_merge ret = blk_try_merge(q->last_merge, bio); + + if (ret != ELEVATOR_NO_MERGE) { + *req = q->last_merge; + return ret; + } + } + + if (blk_queue_noxmerges(q)) + return ELEVATOR_NO_MERGE; + + /* + * See if our hash lookup can find a potential backmerge. + */ + __rq = elv_rqhash_find(q, bio->bi_iter.bi_sector); + if (__rq && elv_bio_merge_ok(__rq, bio)) { + *req = __rq; + + if (blk_discard_mergable(__rq)) + return ELEVATOR_DISCARD_MERGE; + return ELEVATOR_BACK_MERGE; + } + + if (e->type->ops.request_merge) + return e->type->ops.request_merge(q, req, bio); + + return ELEVATOR_NO_MERGE; +} + +/* + * Attempt to do an insertion back merge. Only check for the case where + * we can append 'rq' to an existing request, so we can throw 'rq' away + * afterwards. + * + * Returns true if we merged, false otherwise + */ +bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq) +{ + struct request *__rq; + bool ret; + + if (blk_queue_nomerges(q)) + return false; + + /* + * First try one-hit cache. + */ + if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq)) + return true; + + if (blk_queue_noxmerges(q)) + return false; + + ret = false; + /* + * See if our hash lookup can find a potential backmerge. + */ + while (1) { + __rq = elv_rqhash_find(q, blk_rq_pos(rq)); + if (!__rq || !blk_attempt_req_merge(q, __rq, rq)) + break; + + /* The merged request could be merged with others, try again */ + ret = true; + rq = __rq; + } + + return ret; +} + +void elv_merged_request(struct request_queue *q, struct request *rq, + enum elv_merge type) +{ + struct elevator_queue *e = q->elevator; + + if (e->type->ops.request_merged) + e->type->ops.request_merged(q, rq, type); + + if (type == ELEVATOR_BACK_MERGE) + elv_rqhash_reposition(q, rq); + + q->last_merge = rq; +} + +void elv_merge_requests(struct request_queue *q, struct request *rq, + struct request *next) +{ + struct elevator_queue *e = q->elevator; + + if (e->type->ops.requests_merged) + e->type->ops.requests_merged(q, rq, next); + + elv_rqhash_reposition(q, rq); + q->last_merge = rq; +} + +struct request *elv_latter_request(struct request_queue *q, struct request *rq) +{ + struct elevator_queue *e = q->elevator; + + if (e->type->ops.next_request) + return e->type->ops.next_request(q, rq); + + return NULL; +} + +struct request *elv_former_request(struct request_queue *q, struct request *rq) +{ + struct elevator_queue *e = q->elevator; + + if (e->type->ops.former_request) + return e->type->ops.former_request(q, rq); + + return NULL; +} + +#define to_elv(atr) container_of((atr), struct elv_fs_entry, attr) + +static ssize_t +elv_attr_show(struct kobject *kobj, struct attribute *attr, char *page) +{ + struct elv_fs_entry *entry = to_elv(attr); + struct elevator_queue *e; + ssize_t error; + + if (!entry->show) + return -EIO; + + e = container_of(kobj, struct elevator_queue, kobj); + mutex_lock(&e->sysfs_lock); + error = e->type ? entry->show(e, page) : -ENOENT; + mutex_unlock(&e->sysfs_lock); + return error; +} + +static ssize_t +elv_attr_store(struct kobject *kobj, struct attribute *attr, + const char *page, size_t length) +{ + struct elv_fs_entry *entry = to_elv(attr); + struct elevator_queue *e; + ssize_t error; + + if (!entry->store) + return -EIO; + + e = container_of(kobj, struct elevator_queue, kobj); + mutex_lock(&e->sysfs_lock); + error = e->type ? entry->store(e, page, length) : -ENOENT; + mutex_unlock(&e->sysfs_lock); + return error; +} + +static const struct sysfs_ops elv_sysfs_ops = { + .show = elv_attr_show, + .store = elv_attr_store, +}; + +static struct kobj_type elv_ktype = { + .sysfs_ops = &elv_sysfs_ops, + .release = elevator_release, +}; + +int elv_register_queue(struct request_queue *q, bool uevent) +{ + struct elevator_queue *e = q->elevator; + int error; + + lockdep_assert_held(&q->sysfs_lock); + + error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched"); + if (!error) { + struct elv_fs_entry *attr = e->type->elevator_attrs; + if (attr) { + while (attr->attr.name) { + if (sysfs_create_file(&e->kobj, &attr->attr)) + break; + attr++; + } + } + if (uevent) + kobject_uevent(&e->kobj, KOBJ_ADD); + + e->registered = 1; + } + return error; +} + +void elv_unregister_queue(struct request_queue *q) +{ + lockdep_assert_held(&q->sysfs_lock); + + if (q) { + struct elevator_queue *e = q->elevator; + + kobject_uevent(&e->kobj, KOBJ_REMOVE); + kobject_del(&e->kobj); + + e->registered = 0; + } +} + +int elv_register(struct elevator_type *e) +{ + /* create icq_cache if requested */ + if (e->icq_size) { + if (WARN_ON(e->icq_size < sizeof(struct io_cq)) || + WARN_ON(e->icq_align < __alignof__(struct io_cq))) + return -EINVAL; + + snprintf(e->icq_cache_name, sizeof(e->icq_cache_name), + "%s_io_cq", e->elevator_name); + e->icq_cache = kmem_cache_create(e->icq_cache_name, e->icq_size, + e->icq_align, 0, NULL); + if (!e->icq_cache) + return -ENOMEM; + } + + /* register, don't allow duplicate names */ + spin_lock(&elv_list_lock); + if (elevator_find(e->elevator_name, 0)) { + spin_unlock(&elv_list_lock); + kmem_cache_destroy(e->icq_cache); + return -EBUSY; + } + list_add_tail(&e->list, &elv_list); + spin_unlock(&elv_list_lock); + + printk(KERN_INFO "io scheduler %s registered\n", e->elevator_name); + + return 0; +} +EXPORT_SYMBOL_GPL(elv_register); + +void elv_unregister(struct elevator_type *e) +{ + /* unregister */ + spin_lock(&elv_list_lock); + list_del_init(&e->list); + spin_unlock(&elv_list_lock); + + /* + * Destroy icq_cache if it exists. icq's are RCU managed. Make + * sure all RCU operations are complete before proceeding. + */ + if (e->icq_cache) { + rcu_barrier(); + kmem_cache_destroy(e->icq_cache); + e->icq_cache = NULL; + } +} +EXPORT_SYMBOL_GPL(elv_unregister); + +int elevator_switch_mq(struct request_queue *q, + struct elevator_type *new_e) +{ + int ret; + + lockdep_assert_held(&q->sysfs_lock); + + if (q->elevator) { + if (q->elevator->registered) + elv_unregister_queue(q); + + ioc_clear_queue(q); + elevator_exit(q, q->elevator); + } + + ret = blk_mq_init_sched(q, new_e); + if (ret) + goto out; + + if (new_e) { + ret = elv_register_queue(q, true); + if (ret) { + elevator_exit(q, q->elevator); + goto out; + } + } + + if (new_e) + blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name); + else + blk_add_trace_msg(q, "elv switch: none"); + +out: + return ret; +} + +static inline bool elv_support_iosched(struct request_queue *q) +{ + if (!queue_is_mq(q) || + (q->tag_set && (q->tag_set->flags & BLK_MQ_F_NO_SCHED))) + return false; + return true; +} + +/* + * For single queue devices, default to using mq-deadline. If we have multiple + * queues or mq-deadline is not available, default to "none". + */ +static struct elevator_type *elevator_get_default(struct request_queue *q) +{ + if (q->nr_hw_queues != 1) + return NULL; + + return elevator_get(q, "mq-deadline", false); +} + +/* + * Get the first elevator providing the features required by the request queue. + * Default to "none" if no matching elevator is found. + */ +static struct elevator_type *elevator_get_by_features(struct request_queue *q) +{ + struct elevator_type *e, *found = NULL; + + spin_lock(&elv_list_lock); + + list_for_each_entry(e, &elv_list, list) { + if (elv_support_features(e->elevator_features, + q->required_elevator_features)) { + found = e; + break; + } + } + + if (found && !try_module_get(found->elevator_owner)) + found = NULL; + + spin_unlock(&elv_list_lock); + return found; +} + +/* + * For a device queue that has no required features, use the default elevator + * settings. Otherwise, use the first elevator available matching the required + * features. If no suitable elevator is find or if the chosen elevator + * initialization fails, fall back to the "none" elevator (no elevator). + */ +void elevator_init_mq(struct request_queue *q) +{ + struct elevator_type *e; + int err; + + if (!elv_support_iosched(q)) + return; + + WARN_ON_ONCE(blk_queue_registered(q)); + + if (unlikely(q->elevator)) + return; + + if (!q->required_elevator_features) + e = elevator_get_default(q); + else + e = elevator_get_by_features(q); + if (!e) + return; + + blk_mq_freeze_queue(q); + blk_mq_quiesce_queue(q); + + err = blk_mq_init_sched(q, e); + + blk_mq_unquiesce_queue(q); + blk_mq_unfreeze_queue(q); + + if (err) { + pr_warn("\"%s\" elevator initialization failed, " + "falling back to \"none\"\n", e->elevator_name); + elevator_put(e); + } +} + + +/* + * switch to new_e io scheduler. be careful not to introduce deadlocks - + * we don't free the old io scheduler, before we have allocated what we + * need for the new one. this way we have a chance of going back to the old + * one, if the new one fails init for some reason. + */ +static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) +{ + int err; + + lockdep_assert_held(&q->sysfs_lock); + + blk_mq_freeze_queue(q); + blk_mq_quiesce_queue(q); + + err = elevator_switch_mq(q, new_e); + + blk_mq_unquiesce_queue(q); + blk_mq_unfreeze_queue(q); + + return err; +} + +/* + * Switch this queue to the given IO scheduler. + */ +static int __elevator_change(struct request_queue *q, const char *name) +{ + char elevator_name[ELV_NAME_MAX]; + struct elevator_type *e; + + /* Make sure queue is not in the middle of being removed */ + if (!blk_queue_registered(q)) + return -ENOENT; + + /* + * Special case for mq, turn off scheduling + */ + if (!strncmp(name, "none", 4)) { + if (!q->elevator) + return 0; + return elevator_switch(q, NULL); + } + + strlcpy(elevator_name, name, sizeof(elevator_name)); + e = elevator_get(q, strstrip(elevator_name), true); + if (!e) + return -EINVAL; + + if (q->elevator && + elevator_match(q->elevator->type, elevator_name, 0)) { + elevator_put(e); + return 0; + } + + return elevator_switch(q, e); +} + +ssize_t elv_iosched_store(struct request_queue *q, const char *name, + size_t count) +{ + int ret; + + if (!elv_support_iosched(q)) + return count; + + ret = __elevator_change(q, name); + if (!ret) + return count; + + return ret; +} + +ssize_t elv_iosched_show(struct request_queue *q, char *name) +{ + struct elevator_queue *e = q->elevator; + struct elevator_type *elv = NULL; + struct elevator_type *__e; + int len = 0; + + if (!queue_is_mq(q)) + return sprintf(name, "none\n"); + + if (!q->elevator) + len += sprintf(name+len, "[none] "); + else + elv = e->type; + + spin_lock(&elv_list_lock); + list_for_each_entry(__e, &elv_list, list) { + if (elv && elevator_match(elv, __e->elevator_name, 0)) { + len += sprintf(name+len, "[%s] ", elv->elevator_name); + continue; + } + if (elv_support_iosched(q) && + elevator_match(__e, __e->elevator_name, + q->required_elevator_features)) + len += sprintf(name+len, "%s ", __e->elevator_name); + } + spin_unlock(&elv_list_lock); + + if (q->elevator) + len += sprintf(name+len, "none"); + + len += sprintf(len+name, "\n"); + return len; +} + +struct request *elv_rb_former_request(struct request_queue *q, + struct request *rq) +{ + struct rb_node *rbprev = rb_prev(&rq->rb_node); + + if (rbprev) + return rb_entry_rq(rbprev); + + return NULL; +} +EXPORT_SYMBOL(elv_rb_former_request); + +struct request *elv_rb_latter_request(struct request_queue *q, + struct request *rq) +{ + struct rb_node *rbnext = rb_next(&rq->rb_node); + + if (rbnext) + return rb_entry_rq(rbnext); + + return NULL; +} +EXPORT_SYMBOL(elv_rb_latter_request); + +static int __init elevator_setup(char *str) +{ + pr_warn("Kernel parameter elevator= does not have any effect anymore.\n" + "Please use sysfs to set IO scheduler for individual devices.\n"); + return 1; +} + +__setup("elevator=", elevator_setup); diff --git a/block/genhd.c b/block/genhd.c new file mode 100644 index 000000000..796baf761 --- /dev/null +++ b/block/genhd.c @@ -0,0 +1,2367 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * gendisk handling + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "blk.h" + +static DEFINE_MUTEX(block_class_lock); +static struct kobject *block_depr; + +/* for extended dynamic devt allocation, currently only one major is used */ +#define NR_EXT_DEVT (1 << MINORBITS) + +/* For extended devt allocation. ext_devt_lock prevents look up + * results from going away underneath its user. + */ +static DEFINE_SPINLOCK(ext_devt_lock); +static DEFINE_IDR(ext_devt_idr); + +static void disk_check_events(struct disk_events *ev, + unsigned int *clearing_ptr); +static void disk_alloc_events(struct gendisk *disk); +static void disk_add_events(struct gendisk *disk); +static void disk_del_events(struct gendisk *disk); +static void disk_release_events(struct gendisk *disk); + +/* + * Set disk capacity and notify if the size is not currently + * zero and will not be set to zero + */ +bool set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size, + bool update_bdev) +{ + sector_t capacity = get_capacity(disk); + + set_capacity(disk, size); + if (update_bdev) + revalidate_disk_size(disk, true); + + if (capacity != size && capacity != 0 && size != 0) { + char *envp[] = { "RESIZE=1", NULL }; + + kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); + return true; + } + + return false; +} + +EXPORT_SYMBOL_GPL(set_capacity_revalidate_and_notify); + +/* + * Format the device name of the indicated disk into the supplied buffer and + * return a pointer to that same buffer for convenience. + */ +char *disk_name(struct gendisk *hd, int partno, char *buf) +{ + if (!partno) + snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name); + else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1])) + snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, partno); + else + snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, partno); + + return buf; +} + +const char *bdevname(struct block_device *bdev, char *buf) +{ + return disk_name(bdev->bd_disk, bdev->bd_partno, buf); +} +EXPORT_SYMBOL(bdevname); + +static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat) +{ + int cpu; + + memset(stat, 0, sizeof(struct disk_stats)); + for_each_possible_cpu(cpu) { + struct disk_stats *ptr = per_cpu_ptr(part->dkstats, cpu); + int group; + + for (group = 0; group < NR_STAT_GROUPS; group++) { + stat->nsecs[group] += ptr->nsecs[group]; + stat->sectors[group] += ptr->sectors[group]; + stat->ios[group] += ptr->ios[group]; + stat->merges[group] += ptr->merges[group]; + } + + stat->io_ticks += ptr->io_ticks; + } +} + +static unsigned int part_in_flight(struct hd_struct *part) +{ + unsigned int inflight = 0; + int cpu; + + for_each_possible_cpu(cpu) { + inflight += part_stat_local_read_cpu(part, in_flight[0], cpu) + + part_stat_local_read_cpu(part, in_flight[1], cpu); + } + if ((int)inflight < 0) + inflight = 0; + + return inflight; +} + +static void part_in_flight_rw(struct hd_struct *part, unsigned int inflight[2]) +{ + int cpu; + + inflight[0] = 0; + inflight[1] = 0; + for_each_possible_cpu(cpu) { + inflight[0] += part_stat_local_read_cpu(part, in_flight[0], cpu); + inflight[1] += part_stat_local_read_cpu(part, in_flight[1], cpu); + } + if ((int)inflight[0] < 0) + inflight[0] = 0; + if ((int)inflight[1] < 0) + inflight[1] = 0; +} + +struct hd_struct *__disk_get_part(struct gendisk *disk, int partno) +{ + struct disk_part_tbl *ptbl = rcu_dereference(disk->part_tbl); + + if (unlikely(partno < 0 || partno >= ptbl->len)) + return NULL; + return rcu_dereference(ptbl->part[partno]); +} + +/** + * disk_get_part - get partition + * @disk: disk to look partition from + * @partno: partition number + * + * Look for partition @partno from @disk. If found, increment + * reference count and return it. + * + * CONTEXT: + * Don't care. + * + * RETURNS: + * Pointer to the found partition on success, NULL if not found. + */ +struct hd_struct *disk_get_part(struct gendisk *disk, int partno) +{ + struct hd_struct *part; + + rcu_read_lock(); + part = __disk_get_part(disk, partno); + if (part) + get_device(part_to_dev(part)); + rcu_read_unlock(); + + return part; +} + +/** + * disk_part_iter_init - initialize partition iterator + * @piter: iterator to initialize + * @disk: disk to iterate over + * @flags: DISK_PITER_* flags + * + * Initialize @piter so that it iterates over partitions of @disk. + * + * CONTEXT: + * Don't care. + */ +void disk_part_iter_init(struct disk_part_iter *piter, struct gendisk *disk, + unsigned int flags) +{ + struct disk_part_tbl *ptbl; + + rcu_read_lock(); + ptbl = rcu_dereference(disk->part_tbl); + + piter->disk = disk; + piter->part = NULL; + + if (flags & DISK_PITER_REVERSE) + piter->idx = ptbl->len - 1; + else if (flags & (DISK_PITER_INCL_PART0 | DISK_PITER_INCL_EMPTY_PART0)) + piter->idx = 0; + else + piter->idx = 1; + + piter->flags = flags; + + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(disk_part_iter_init); + +/** + * disk_part_iter_next - proceed iterator to the next partition and return it + * @piter: iterator of interest + * + * Proceed @piter to the next partition and return it. + * + * CONTEXT: + * Don't care. + */ +struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter) +{ + struct disk_part_tbl *ptbl; + int inc, end; + + /* put the last partition */ + disk_put_part(piter->part); + piter->part = NULL; + + /* get part_tbl */ + rcu_read_lock(); + ptbl = rcu_dereference(piter->disk->part_tbl); + + /* determine iteration parameters */ + if (piter->flags & DISK_PITER_REVERSE) { + inc = -1; + if (piter->flags & (DISK_PITER_INCL_PART0 | + DISK_PITER_INCL_EMPTY_PART0)) + end = -1; + else + end = 0; + } else { + inc = 1; + end = ptbl->len; + } + + /* iterate to the next partition */ + for (; piter->idx != end; piter->idx += inc) { + struct hd_struct *part; + + part = rcu_dereference(ptbl->part[piter->idx]); + if (!part) + continue; + get_device(part_to_dev(part)); + piter->part = part; + if (!part_nr_sects_read(part) && + !(piter->flags & DISK_PITER_INCL_EMPTY) && + !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 && + piter->idx == 0)) { + put_device(part_to_dev(part)); + piter->part = NULL; + continue; + } + + piter->idx += inc; + break; + } + + rcu_read_unlock(); + + return piter->part; +} +EXPORT_SYMBOL_GPL(disk_part_iter_next); + +/** + * disk_part_iter_exit - finish up partition iteration + * @piter: iter of interest + * + * Called when iteration is over. Cleans up @piter. + * + * CONTEXT: + * Don't care. + */ +void disk_part_iter_exit(struct disk_part_iter *piter) +{ + disk_put_part(piter->part); + piter->part = NULL; +} +EXPORT_SYMBOL_GPL(disk_part_iter_exit); + +static inline int sector_in_part(struct hd_struct *part, sector_t sector) +{ + return part->start_sect <= sector && + sector < part->start_sect + part_nr_sects_read(part); +} + +/** + * disk_map_sector_rcu - map sector to partition + * @disk: gendisk of interest + * @sector: sector to map + * + * Find out which partition @sector maps to on @disk. This is + * primarily used for stats accounting. + * + * CONTEXT: + * RCU read locked. The returned partition pointer is always valid + * because its refcount is grabbed except for part0, which lifetime + * is same with the disk. + * + * RETURNS: + * Found partition on success, part0 is returned if no partition matches + * or the matched partition is being deleted. + */ +struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector) +{ + struct disk_part_tbl *ptbl; + struct hd_struct *part; + int i; + + rcu_read_lock(); + ptbl = rcu_dereference(disk->part_tbl); + + part = rcu_dereference(ptbl->last_lookup); + if (part && sector_in_part(part, sector) && hd_struct_try_get(part)) + goto out_unlock; + + for (i = 1; i < ptbl->len; i++) { + part = rcu_dereference(ptbl->part[i]); + + if (part && sector_in_part(part, sector)) { + /* + * only live partition can be cached for lookup, + * so use-after-free on cached & deleting partition + * can be avoided + */ + if (!hd_struct_try_get(part)) + break; + rcu_assign_pointer(ptbl->last_lookup, part); + goto out_unlock; + } + } + + part = &disk->part0; +out_unlock: + rcu_read_unlock(); + return part; +} + +/** + * disk_has_partitions + * @disk: gendisk of interest + * + * Walk through the partition table and check if valid partition exists. + * + * CONTEXT: + * Don't care. + * + * RETURNS: + * True if the gendisk has at least one valid non-zero size partition. + * Otherwise false. + */ +bool disk_has_partitions(struct gendisk *disk) +{ + struct disk_part_tbl *ptbl; + int i; + bool ret = false; + + rcu_read_lock(); + ptbl = rcu_dereference(disk->part_tbl); + + /* Iterate partitions skipping the whole device at index 0 */ + for (i = 1; i < ptbl->len; i++) { + if (rcu_dereference(ptbl->part[i])) { + ret = true; + break; + } + } + + rcu_read_unlock(); + + return ret; +} +EXPORT_SYMBOL_GPL(disk_has_partitions); + +/* + * Can be deleted altogether. Later. + * + */ +#define BLKDEV_MAJOR_HASH_SIZE 255 +static struct blk_major_name { + struct blk_major_name *next; + int major; + char name[16]; +} *major_names[BLKDEV_MAJOR_HASH_SIZE]; + +/* index in the above - for now: assume no multimajor ranges */ +static inline int major_to_index(unsigned major) +{ + return major % BLKDEV_MAJOR_HASH_SIZE; +} + +#ifdef CONFIG_PROC_FS +void blkdev_show(struct seq_file *seqf, off_t offset) +{ + struct blk_major_name *dp; + + mutex_lock(&block_class_lock); + for (dp = major_names[major_to_index(offset)]; dp; dp = dp->next) + if (dp->major == offset) + seq_printf(seqf, "%3d %s\n", dp->major, dp->name); + mutex_unlock(&block_class_lock); +} +#endif /* CONFIG_PROC_FS */ + +/** + * register_blkdev - register a new block device + * + * @major: the requested major device number [1..BLKDEV_MAJOR_MAX-1]. If + * @major = 0, try to allocate any unused major number. + * @name: the name of the new block device as a zero terminated string + * + * The @name must be unique within the system. + * + * The return value depends on the @major input parameter: + * + * - if a major device number was requested in range [1..BLKDEV_MAJOR_MAX-1] + * then the function returns zero on success, or a negative error code + * - if any unused major number was requested with @major = 0 parameter + * then the return value is the allocated major number in range + * [1..BLKDEV_MAJOR_MAX-1] or a negative error code otherwise + * + * See Documentation/admin-guide/devices.txt for the list of allocated + * major numbers. + */ +int register_blkdev(unsigned int major, const char *name) +{ + struct blk_major_name **n, *p; + int index, ret = 0; + + mutex_lock(&block_class_lock); + + /* temporary */ + if (major == 0) { + for (index = ARRAY_SIZE(major_names)-1; index > 0; index--) { + if (major_names[index] == NULL) + break; + } + + if (index == 0) { + printk("%s: failed to get major for %s\n", + __func__, name); + ret = -EBUSY; + goto out; + } + major = index; + ret = major; + } + + if (major >= BLKDEV_MAJOR_MAX) { + pr_err("%s: major requested (%u) is greater than the maximum (%u) for %s\n", + __func__, major, BLKDEV_MAJOR_MAX-1, name); + + ret = -EINVAL; + goto out; + } + + p = kmalloc(sizeof(struct blk_major_name), GFP_KERNEL); + if (p == NULL) { + ret = -ENOMEM; + goto out; + } + + p->major = major; + strlcpy(p->name, name, sizeof(p->name)); + p->next = NULL; + index = major_to_index(major); + + for (n = &major_names[index]; *n; n = &(*n)->next) { + if ((*n)->major == major) + break; + } + if (!*n) + *n = p; + else + ret = -EBUSY; + + if (ret < 0) { + printk("register_blkdev: cannot get major %u for %s\n", + major, name); + kfree(p); + } +out: + mutex_unlock(&block_class_lock); + return ret; +} + +EXPORT_SYMBOL(register_blkdev); + +void unregister_blkdev(unsigned int major, const char *name) +{ + struct blk_major_name **n; + struct blk_major_name *p = NULL; + int index = major_to_index(major); + + mutex_lock(&block_class_lock); + for (n = &major_names[index]; *n; n = &(*n)->next) + if ((*n)->major == major) + break; + if (!*n || strcmp((*n)->name, name)) { + WARN_ON(1); + } else { + p = *n; + *n = p->next; + } + mutex_unlock(&block_class_lock); + kfree(p); +} + +EXPORT_SYMBOL(unregister_blkdev); + +static struct kobj_map *bdev_map; + +/** + * blk_mangle_minor - scatter minor numbers apart + * @minor: minor number to mangle + * + * Scatter consecutively allocated @minor number apart if MANGLE_DEVT + * is enabled. Mangling twice gives the original value. + * + * RETURNS: + * Mangled value. + * + * CONTEXT: + * Don't care. + */ +static int blk_mangle_minor(int minor) +{ +#ifdef CONFIG_DEBUG_BLOCK_EXT_DEVT + int i; + + for (i = 0; i < MINORBITS / 2; i++) { + int low = minor & (1 << i); + int high = minor & (1 << (MINORBITS - 1 - i)); + int distance = MINORBITS - 1 - 2 * i; + + minor ^= low | high; /* clear both bits */ + low <<= distance; /* swap the positions */ + high >>= distance; + minor |= low | high; /* and set */ + } +#endif + return minor; +} + +/** + * blk_alloc_devt - allocate a dev_t for a partition + * @part: partition to allocate dev_t for + * @devt: out parameter for resulting dev_t + * + * Allocate a dev_t for block device. + * + * RETURNS: + * 0 on success, allocated dev_t is returned in *@devt. -errno on + * failure. + * + * CONTEXT: + * Might sleep. + */ +int blk_alloc_devt(struct hd_struct *part, dev_t *devt) +{ + struct gendisk *disk = part_to_disk(part); + int idx; + + /* in consecutive minor range? */ + if (part->partno < disk->minors) { + *devt = MKDEV(disk->major, disk->first_minor + part->partno); + return 0; + } + + /* allocate ext devt */ + idr_preload(GFP_KERNEL); + + spin_lock_bh(&ext_devt_lock); + idx = idr_alloc(&ext_devt_idr, part, 0, NR_EXT_DEVT, GFP_NOWAIT); + spin_unlock_bh(&ext_devt_lock); + + idr_preload_end(); + if (idx < 0) + return idx == -ENOSPC ? -EBUSY : idx; + + *devt = MKDEV(BLOCK_EXT_MAJOR, blk_mangle_minor(idx)); + return 0; +} + +/** + * blk_free_devt - free a dev_t + * @devt: dev_t to free + * + * Free @devt which was allocated using blk_alloc_devt(). + * + * CONTEXT: + * Might sleep. + */ +void blk_free_devt(dev_t devt) +{ + if (devt == MKDEV(0, 0)) + return; + + if (MAJOR(devt) == BLOCK_EXT_MAJOR) { + spin_lock_bh(&ext_devt_lock); + idr_remove(&ext_devt_idr, blk_mangle_minor(MINOR(devt))); + spin_unlock_bh(&ext_devt_lock); + } +} + +/* + * We invalidate devt by assigning NULL pointer for devt in idr. + */ +void blk_invalidate_devt(dev_t devt) +{ + if (MAJOR(devt) == BLOCK_EXT_MAJOR) { + spin_lock_bh(&ext_devt_lock); + idr_replace(&ext_devt_idr, NULL, blk_mangle_minor(MINOR(devt))); + spin_unlock_bh(&ext_devt_lock); + } +} + +static char *bdevt_str(dev_t devt, char *buf) +{ + if (MAJOR(devt) <= 0xff && MINOR(devt) <= 0xff) { + char tbuf[BDEVT_SIZE]; + snprintf(tbuf, BDEVT_SIZE, "%02x%02x", MAJOR(devt), MINOR(devt)); + snprintf(buf, BDEVT_SIZE, "%-9s", tbuf); + } else + snprintf(buf, BDEVT_SIZE, "%03x:%05x", MAJOR(devt), MINOR(devt)); + + return buf; +} + +/* + * Register device numbers dev..(dev+range-1) + * range must be nonzero + * The hash chain is sorted on range, so that subranges can override. + */ +void blk_register_region(dev_t devt, unsigned long range, struct module *module, + struct kobject *(*probe)(dev_t, int *, void *), + int (*lock)(dev_t, void *), void *data) +{ + kobj_map(bdev_map, devt, range, module, probe, lock, data); +} + +EXPORT_SYMBOL(blk_register_region); + +void blk_unregister_region(dev_t devt, unsigned long range) +{ + kobj_unmap(bdev_map, devt, range); +} + +EXPORT_SYMBOL(blk_unregister_region); + +static struct kobject *exact_match(dev_t devt, int *partno, void *data) +{ + struct gendisk *p = data; + + return &disk_to_dev(p)->kobj; +} + +static int exact_lock(dev_t devt, void *data) +{ + struct gendisk *p = data; + + if (!get_disk_and_module(p)) + return -1; + return 0; +} + +static void disk_scan_partitions(struct gendisk *disk) +{ + struct block_device *bdev; + + if (!get_capacity(disk) || !disk_part_scan_enabled(disk)) + return; + + set_bit(GD_NEED_PART_SCAN, &disk->state); + bdev = blkdev_get_by_dev(disk_devt(disk), FMODE_READ, NULL); + if (!IS_ERR(bdev)) + blkdev_put(bdev, FMODE_READ); +} + +static void register_disk(struct device *parent, struct gendisk *disk, + const struct attribute_group **groups) +{ + struct device *ddev = disk_to_dev(disk); + struct disk_part_iter piter; + struct hd_struct *part; + int err; + + ddev->parent = parent; + + dev_set_name(ddev, "%s", disk->disk_name); + + /* delay uevents, until we scanned partition table */ + dev_set_uevent_suppress(ddev, 1); + + if (groups) { + WARN_ON(ddev->groups); + ddev->groups = groups; + } + if (device_add(ddev)) + return; + if (!sysfs_deprecated) { + err = sysfs_create_link(block_depr, &ddev->kobj, + kobject_name(&ddev->kobj)); + if (err) { + device_del(ddev); + return; + } + } + + /* + * avoid probable deadlock caused by allocating memory with + * GFP_KERNEL in runtime_resume callback of its all ancestor + * devices + */ + pm_runtime_set_memalloc_noio(ddev, true); + + disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj); + disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); + + if (disk->flags & GENHD_FL_HIDDEN) + return; + + disk_scan_partitions(disk); + + /* announce disk after possible partitions are created */ + dev_set_uevent_suppress(ddev, 0); + kobject_uevent(&ddev->kobj, KOBJ_ADD); + + /* announce possible partitions */ + disk_part_iter_init(&piter, disk, 0); + while ((part = disk_part_iter_next(&piter))) + kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD); + disk_part_iter_exit(&piter); + + if (disk->queue->backing_dev_info->dev) { + err = sysfs_create_link(&ddev->kobj, + &disk->queue->backing_dev_info->dev->kobj, + "bdi"); + WARN_ON(err); + } +} + +/** + * __device_add_disk - add disk information to kernel list + * @parent: parent device for the disk + * @disk: per-device partitioning information + * @groups: Additional per-device sysfs groups + * @register_queue: register the queue if set to true + * + * This function registers the partitioning information in @disk + * with the kernel. + * + * FIXME: error handling + */ +static void __device_add_disk(struct device *parent, struct gendisk *disk, + const struct attribute_group **groups, + bool register_queue) +{ + dev_t devt; + int retval; + + /* + * The disk queue should now be all set with enough information about + * the device for the elevator code to pick an adequate default + * elevator if one is needed, that is, for devices requesting queue + * registration. + */ + if (register_queue) + elevator_init_mq(disk->queue); + + /* minors == 0 indicates to use ext devt from part0 and should + * be accompanied with EXT_DEVT flag. Make sure all + * parameters make sense. + */ + WARN_ON(disk->minors && !(disk->major || disk->first_minor)); + WARN_ON(!disk->minors && + !(disk->flags & (GENHD_FL_EXT_DEVT | GENHD_FL_HIDDEN))); + + disk->flags |= GENHD_FL_UP; + + retval = blk_alloc_devt(&disk->part0, &devt); + if (retval) { + WARN_ON(1); + return; + } + disk->major = MAJOR(devt); + disk->first_minor = MINOR(devt); + + disk_alloc_events(disk); + + if (disk->flags & GENHD_FL_HIDDEN) { + /* + * Don't let hidden disks show up in /proc/partitions, + * and don't bother scanning for partitions either. + */ + disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO; + disk->flags |= GENHD_FL_NO_PART_SCAN; + } else { + struct backing_dev_info *bdi = disk->queue->backing_dev_info; + struct device *dev = disk_to_dev(disk); + int ret; + + /* Register BDI before referencing it from bdev */ + dev->devt = devt; + ret = bdi_register(bdi, "%u:%u", MAJOR(devt), MINOR(devt)); + WARN_ON(ret); + bdi_set_owner(bdi, dev); + blk_register_region(disk_devt(disk), disk->minors, NULL, + exact_match, exact_lock, disk); + } + register_disk(parent, disk, groups); + if (register_queue) + blk_register_queue(disk); + + /* + * Take an extra ref on queue which will be put on disk_release() + * so that it sticks around as long as @disk is there. + */ + WARN_ON_ONCE(!blk_get_queue(disk->queue)); + + disk_add_events(disk); + blk_integrity_add(disk); +} + +void device_add_disk(struct device *parent, struct gendisk *disk, + const struct attribute_group **groups) + +{ + __device_add_disk(parent, disk, groups, true); +} +EXPORT_SYMBOL(device_add_disk); + +void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk) +{ + __device_add_disk(parent, disk, NULL, false); +} +EXPORT_SYMBOL(device_add_disk_no_queue_reg); + +static void invalidate_partition(struct gendisk *disk, int partno) +{ + struct block_device *bdev; + + bdev = bdget_disk(disk, partno); + if (!bdev) + return; + + fsync_bdev(bdev); + __invalidate_device(bdev, true); + + /* + * Unhash the bdev inode for this device so that it gets evicted as soon + * as last inode reference is dropped. + */ + remove_inode_hash(bdev->bd_inode); + bdput(bdev); +} + +/** + * del_gendisk - remove the gendisk + * @disk: the struct gendisk to remove + * + * Removes the gendisk and all its associated resources. This deletes the + * partitions associated with the gendisk, and unregisters the associated + * request_queue. + * + * This is the counter to the respective __device_add_disk() call. + * + * The final removal of the struct gendisk happens when its refcount reaches 0 + * with put_disk(), which should be called after del_gendisk(), if + * __device_add_disk() was used. + * + * Drivers exist which depend on the release of the gendisk to be synchronous, + * it should not be deferred. + * + * Context: can sleep + */ +void del_gendisk(struct gendisk *disk) +{ + struct disk_part_iter piter; + struct hd_struct *part; + + might_sleep(); + + blk_integrity_del(disk); + disk_del_events(disk); + + /* + * Block lookups of the disk until all bdevs are unhashed and the + * disk is marked as dead (GENHD_FL_UP cleared). + */ + down_write(&disk->lookup_sem); + /* invalidate stuff */ + disk_part_iter_init(&piter, disk, + DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); + while ((part = disk_part_iter_next(&piter))) { + invalidate_partition(disk, part->partno); + delete_partition(part); + } + disk_part_iter_exit(&piter); + + invalidate_partition(disk, 0); + set_capacity(disk, 0); + disk->flags &= ~GENHD_FL_UP; + up_write(&disk->lookup_sem); + + if (!(disk->flags & GENHD_FL_HIDDEN)) + sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); + if (disk->queue) { + /* + * Unregister bdi before releasing device numbers (as they can + * get reused and we'd get clashes in sysfs). + */ + if (!(disk->flags & GENHD_FL_HIDDEN)) + bdi_unregister(disk->queue->backing_dev_info); + blk_unregister_queue(disk); + } else { + WARN_ON(1); + } + + if (!(disk->flags & GENHD_FL_HIDDEN)) + blk_unregister_region(disk_devt(disk), disk->minors); + /* + * Remove gendisk pointer from idr so that it cannot be looked up + * while RCU period before freeing gendisk is running to prevent + * use-after-free issues. Note that the device number stays + * "in-use" until we really free the gendisk. + */ + blk_invalidate_devt(disk_devt(disk)); + + kobject_put(disk->part0.holder_dir); + kobject_put(disk->slave_dir); + + part_stat_set_all(&disk->part0, 0); + disk->part0.stamp = 0; + if (!sysfs_deprecated) + sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); + pm_runtime_set_memalloc_noio(disk_to_dev(disk), false); + device_del(disk_to_dev(disk)); +} +EXPORT_SYMBOL(del_gendisk); + +/* sysfs access to bad-blocks list. */ +static ssize_t disk_badblocks_show(struct device *dev, + struct device_attribute *attr, + char *page) +{ + struct gendisk *disk = dev_to_disk(dev); + + if (!disk->bb) + return sprintf(page, "\n"); + + return badblocks_show(disk->bb, page, 0); +} + +static ssize_t disk_badblocks_store(struct device *dev, + struct device_attribute *attr, + const char *page, size_t len) +{ + struct gendisk *disk = dev_to_disk(dev); + + if (!disk->bb) + return -ENXIO; + + return badblocks_store(disk->bb, page, len, 0); +} + +/** + * get_gendisk - get partitioning information for a given device + * @devt: device to get partitioning information for + * @partno: returned partition index + * + * This function gets the structure containing partitioning + * information for the given device @devt. + * + * Context: can sleep + */ +struct gendisk *get_gendisk(dev_t devt, int *partno) +{ + struct gendisk *disk = NULL; + + might_sleep(); + + if (MAJOR(devt) != BLOCK_EXT_MAJOR) { + struct kobject *kobj; + + kobj = kobj_lookup(bdev_map, devt, partno); + if (kobj) + disk = dev_to_disk(kobj_to_dev(kobj)); + } else { + struct hd_struct *part; + + spin_lock_bh(&ext_devt_lock); + part = idr_find(&ext_devt_idr, blk_mangle_minor(MINOR(devt))); + if (part && get_disk_and_module(part_to_disk(part))) { + *partno = part->partno; + disk = part_to_disk(part); + } + spin_unlock_bh(&ext_devt_lock); + } + + if (!disk) + return NULL; + + /* + * Synchronize with del_gendisk() to not return disk that is being + * destroyed. + */ + down_read(&disk->lookup_sem); + if (unlikely((disk->flags & GENHD_FL_HIDDEN) || + !(disk->flags & GENHD_FL_UP))) { + up_read(&disk->lookup_sem); + put_disk_and_module(disk); + disk = NULL; + } else { + up_read(&disk->lookup_sem); + } + return disk; +} + +/** + * bdget_disk - do bdget() by gendisk and partition number + * @disk: gendisk of interest + * @partno: partition number + * + * Find partition @partno from @disk, do bdget() on it. + * + * CONTEXT: + * Don't care. + * + * RETURNS: + * Resulting block_device on success, NULL on failure. + */ +struct block_device *bdget_disk(struct gendisk *disk, int partno) +{ + struct hd_struct *part; + struct block_device *bdev = NULL; + + part = disk_get_part(disk, partno); + if (part) + bdev = bdget_part(part); + disk_put_part(part); + + return bdev; +} +EXPORT_SYMBOL(bdget_disk); + +/* + * print a full list of all partitions - intended for places where the root + * filesystem can't be mounted and thus to give the victim some idea of what + * went wrong + */ +void __init printk_all_partitions(void) +{ + struct class_dev_iter iter; + struct device *dev; + + class_dev_iter_init(&iter, &block_class, NULL, &disk_type); + while ((dev = class_dev_iter_next(&iter))) { + struct gendisk *disk = dev_to_disk(dev); + struct disk_part_iter piter; + struct hd_struct *part; + char name_buf[BDEVNAME_SIZE]; + char devt_buf[BDEVT_SIZE]; + + /* + * Don't show empty devices or things that have been + * suppressed + */ + if (get_capacity(disk) == 0 || + (disk->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)) + continue; + + /* + * Note, unlike /proc/partitions, I am showing the + * numbers in hex - the same format as the root= + * option takes. + */ + disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0); + while ((part = disk_part_iter_next(&piter))) { + bool is_part0 = part == &disk->part0; + + printk("%s%s %10llu %s %s", is_part0 ? "" : " ", + bdevt_str(part_devt(part), devt_buf), + (unsigned long long)part_nr_sects_read(part) >> 1 + , disk_name(disk, part->partno, name_buf), + part->info ? part->info->uuid : ""); + if (is_part0) { + if (dev->parent && dev->parent->driver) + printk(" driver: %s\n", + dev->parent->driver->name); + else + printk(" (driver?)\n"); + } else + printk("\n"); + } + disk_part_iter_exit(&piter); + } + class_dev_iter_exit(&iter); +} + +#ifdef CONFIG_PROC_FS +/* iterator */ +static void *disk_seqf_start(struct seq_file *seqf, loff_t *pos) +{ + loff_t skip = *pos; + struct class_dev_iter *iter; + struct device *dev; + + iter = kmalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return ERR_PTR(-ENOMEM); + + seqf->private = iter; + class_dev_iter_init(iter, &block_class, NULL, &disk_type); + do { + dev = class_dev_iter_next(iter); + if (!dev) + return NULL; + } while (skip--); + + return dev_to_disk(dev); +} + +static void *disk_seqf_next(struct seq_file *seqf, void *v, loff_t *pos) +{ + struct device *dev; + + (*pos)++; + dev = class_dev_iter_next(seqf->private); + if (dev) + return dev_to_disk(dev); + + return NULL; +} + +static void disk_seqf_stop(struct seq_file *seqf, void *v) +{ + struct class_dev_iter *iter = seqf->private; + + /* stop is called even after start failed :-( */ + if (iter) { + class_dev_iter_exit(iter); + kfree(iter); + seqf->private = NULL; + } +} + +static void *show_partition_start(struct seq_file *seqf, loff_t *pos) +{ + void *p; + + p = disk_seqf_start(seqf, pos); + if (!IS_ERR_OR_NULL(p) && !*pos) + seq_puts(seqf, "major minor #blocks name\n\n"); + return p; +} + +static int show_partition(struct seq_file *seqf, void *v) +{ + struct gendisk *sgp = v; + struct disk_part_iter piter; + struct hd_struct *part; + char buf[BDEVNAME_SIZE]; + + /* Don't show non-partitionable removeable devices or empty devices */ + if (!get_capacity(sgp) || (!disk_max_parts(sgp) && + (sgp->flags & GENHD_FL_REMOVABLE))) + return 0; + if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO) + return 0; + + /* show the full disk and all non-0 size partitions of it */ + disk_part_iter_init(&piter, sgp, DISK_PITER_INCL_PART0); + while ((part = disk_part_iter_next(&piter))) + seq_printf(seqf, "%4d %7d %10llu %s\n", + MAJOR(part_devt(part)), MINOR(part_devt(part)), + (unsigned long long)part_nr_sects_read(part) >> 1, + disk_name(sgp, part->partno, buf)); + disk_part_iter_exit(&piter); + + return 0; +} + +static const struct seq_operations partitions_op = { + .start = show_partition_start, + .next = disk_seqf_next, + .stop = disk_seqf_stop, + .show = show_partition +}; +#endif + + +static struct kobject *base_probe(dev_t devt, int *partno, void *data) +{ + if (request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)) > 0) + /* Make old-style 2.4 aliases work */ + request_module("block-major-%d", MAJOR(devt)); + return NULL; +} + +static int __init genhd_device_init(void) +{ + int error; + + block_class.dev_kobj = sysfs_dev_block_kobj; + error = class_register(&block_class); + if (unlikely(error)) + return error; + bdev_map = kobj_map_init(base_probe, &block_class_lock); + blk_dev_init(); + + register_blkdev(BLOCK_EXT_MAJOR, "blkext"); + + /* create top-level block dir */ + if (!sysfs_deprecated) + block_depr = kobject_create_and_add("block", NULL); + return 0; +} + +subsys_initcall(genhd_device_init); + +static ssize_t disk_range_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%d\n", disk->minors); +} + +static ssize_t disk_ext_range_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%d\n", disk_max_parts(disk)); +} + +static ssize_t disk_removable_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%d\n", + (disk->flags & GENHD_FL_REMOVABLE ? 1 : 0)); +} + +static ssize_t disk_hidden_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%d\n", + (disk->flags & GENHD_FL_HIDDEN ? 1 : 0)); +} + +static ssize_t disk_ro_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%d\n", get_disk_ro(disk) ? 1 : 0); +} + +ssize_t part_size_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + + return sprintf(buf, "%llu\n", + (unsigned long long)part_nr_sects_read(p)); +} + +ssize_t part_stat_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + struct request_queue *q = part_to_disk(p)->queue; + struct disk_stats stat; + unsigned int inflight; + + part_stat_read_all(p, &stat); + if (queue_is_mq(q)) + inflight = blk_mq_in_flight(q, p); + else + inflight = part_in_flight(p); + + return sprintf(buf, + "%8lu %8lu %8llu %8u " + "%8lu %8lu %8llu %8u " + "%8u %8u %8u " + "%8lu %8lu %8llu %8u " + "%8lu %8u" + "\n", + stat.ios[STAT_READ], + stat.merges[STAT_READ], + (unsigned long long)stat.sectors[STAT_READ], + (unsigned int)div_u64(stat.nsecs[STAT_READ], NSEC_PER_MSEC), + stat.ios[STAT_WRITE], + stat.merges[STAT_WRITE], + (unsigned long long)stat.sectors[STAT_WRITE], + (unsigned int)div_u64(stat.nsecs[STAT_WRITE], NSEC_PER_MSEC), + inflight, + jiffies_to_msecs(stat.io_ticks), + (unsigned int)div_u64(stat.nsecs[STAT_READ] + + stat.nsecs[STAT_WRITE] + + stat.nsecs[STAT_DISCARD] + + stat.nsecs[STAT_FLUSH], + NSEC_PER_MSEC), + stat.ios[STAT_DISCARD], + stat.merges[STAT_DISCARD], + (unsigned long long)stat.sectors[STAT_DISCARD], + (unsigned int)div_u64(stat.nsecs[STAT_DISCARD], NSEC_PER_MSEC), + stat.ios[STAT_FLUSH], + (unsigned int)div_u64(stat.nsecs[STAT_FLUSH], NSEC_PER_MSEC)); +} + +ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + struct request_queue *q = part_to_disk(p)->queue; + unsigned int inflight[2]; + + if (queue_is_mq(q)) + blk_mq_in_flight_rw(q, p, inflight); + else + part_in_flight_rw(p, inflight); + + return sprintf(buf, "%8u %8u\n", inflight[0], inflight[1]); +} + +static ssize_t disk_capability_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%x\n", disk->flags); +} + +static ssize_t disk_alignment_offset_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%d\n", queue_alignment_offset(disk->queue)); +} + +static ssize_t disk_discard_alignment_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%d\n", queue_discard_alignment(disk->queue)); +} + +static DEVICE_ATTR(range, 0444, disk_range_show, NULL); +static DEVICE_ATTR(ext_range, 0444, disk_ext_range_show, NULL); +static DEVICE_ATTR(removable, 0444, disk_removable_show, NULL); +static DEVICE_ATTR(hidden, 0444, disk_hidden_show, NULL); +static DEVICE_ATTR(ro, 0444, disk_ro_show, NULL); +static DEVICE_ATTR(size, 0444, part_size_show, NULL); +static DEVICE_ATTR(alignment_offset, 0444, disk_alignment_offset_show, NULL); +static DEVICE_ATTR(discard_alignment, 0444, disk_discard_alignment_show, NULL); +static DEVICE_ATTR(capability, 0444, disk_capability_show, NULL); +static DEVICE_ATTR(stat, 0444, part_stat_show, NULL); +static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL); +static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store); + +#ifdef CONFIG_FAIL_MAKE_REQUEST +ssize_t part_fail_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + + return sprintf(buf, "%d\n", p->make_it_fail); +} + +ssize_t part_fail_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct hd_struct *p = dev_to_part(dev); + int i; + + if (count > 0 && sscanf(buf, "%d", &i) > 0) + p->make_it_fail = (i == 0) ? 0 : 1; + + return count; +} + +static struct device_attribute dev_attr_fail = + __ATTR(make-it-fail, 0644, part_fail_show, part_fail_store); +#endif /* CONFIG_FAIL_MAKE_REQUEST */ + +#ifdef CONFIG_FAIL_IO_TIMEOUT +static struct device_attribute dev_attr_fail_timeout = + __ATTR(io-timeout-fail, 0644, part_timeout_show, part_timeout_store); +#endif + +static struct attribute *disk_attrs[] = { + &dev_attr_range.attr, + &dev_attr_ext_range.attr, + &dev_attr_removable.attr, + &dev_attr_hidden.attr, + &dev_attr_ro.attr, + &dev_attr_size.attr, + &dev_attr_alignment_offset.attr, + &dev_attr_discard_alignment.attr, + &dev_attr_capability.attr, + &dev_attr_stat.attr, + &dev_attr_inflight.attr, + &dev_attr_badblocks.attr, +#ifdef CONFIG_FAIL_MAKE_REQUEST + &dev_attr_fail.attr, +#endif +#ifdef CONFIG_FAIL_IO_TIMEOUT + &dev_attr_fail_timeout.attr, +#endif + NULL +}; + +static umode_t disk_visible(struct kobject *kobj, struct attribute *a, int n) +{ + struct device *dev = container_of(kobj, typeof(*dev), kobj); + struct gendisk *disk = dev_to_disk(dev); + + if (a == &dev_attr_badblocks.attr && !disk->bb) + return 0; + return a->mode; +} + +static struct attribute_group disk_attr_group = { + .attrs = disk_attrs, + .is_visible = disk_visible, +}; + +static const struct attribute_group *disk_attr_groups[] = { + &disk_attr_group, + NULL +}; + +/** + * disk_replace_part_tbl - replace disk->part_tbl in RCU-safe way + * @disk: disk to replace part_tbl for + * @new_ptbl: new part_tbl to install + * + * Replace disk->part_tbl with @new_ptbl in RCU-safe way. The + * original ptbl is freed using RCU callback. + * + * LOCKING: + * Matching bd_mutex locked or the caller is the only user of @disk. + */ +static void disk_replace_part_tbl(struct gendisk *disk, + struct disk_part_tbl *new_ptbl) +{ + struct disk_part_tbl *old_ptbl = + rcu_dereference_protected(disk->part_tbl, 1); + + rcu_assign_pointer(disk->part_tbl, new_ptbl); + + if (old_ptbl) { + rcu_assign_pointer(old_ptbl->last_lookup, NULL); + kfree_rcu(old_ptbl, rcu_head); + } +} + +/** + * disk_expand_part_tbl - expand disk->part_tbl + * @disk: disk to expand part_tbl for + * @partno: expand such that this partno can fit in + * + * Expand disk->part_tbl such that @partno can fit in. disk->part_tbl + * uses RCU to allow unlocked dereferencing for stats and other stuff. + * + * LOCKING: + * Matching bd_mutex locked or the caller is the only user of @disk. + * Might sleep. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int disk_expand_part_tbl(struct gendisk *disk, int partno) +{ + struct disk_part_tbl *old_ptbl = + rcu_dereference_protected(disk->part_tbl, 1); + struct disk_part_tbl *new_ptbl; + int len = old_ptbl ? old_ptbl->len : 0; + int i, target; + + /* + * check for int overflow, since we can get here from blkpg_ioctl() + * with a user passed 'partno'. + */ + target = partno + 1; + if (target < 0) + return -EINVAL; + + /* disk_max_parts() is zero during initialization, ignore if so */ + if (disk_max_parts(disk) && target > disk_max_parts(disk)) + return -EINVAL; + + if (target <= len) + return 0; + + new_ptbl = kzalloc_node(struct_size(new_ptbl, part, target), GFP_KERNEL, + disk->node_id); + if (!new_ptbl) + return -ENOMEM; + + new_ptbl->len = target; + + for (i = 0; i < len; i++) + rcu_assign_pointer(new_ptbl->part[i], old_ptbl->part[i]); + + disk_replace_part_tbl(disk, new_ptbl); + return 0; +} + +/** + * disk_release - releases all allocated resources of the gendisk + * @dev: the device representing this disk + * + * This function releases all allocated resources of the gendisk. + * + * The struct gendisk refcount is incremented with get_gendisk() or + * get_disk_and_module(), and its refcount is decremented with + * put_disk_and_module() or put_disk(). Once the refcount reaches 0 this + * function is called. + * + * Drivers which used __device_add_disk() have a gendisk with a request_queue + * assigned. Since the request_queue sits on top of the gendisk for these + * drivers we also call blk_put_queue() for them, and we expect the + * request_queue refcount to reach 0 at this point, and so the request_queue + * will also be freed prior to the disk. + * + * Context: can sleep + */ +static void disk_release(struct device *dev) +{ + struct gendisk *disk = dev_to_disk(dev); + + might_sleep(); + + blk_free_devt(dev->devt); + disk_release_events(disk); + kfree(disk->random); + disk_replace_part_tbl(disk, NULL); + hd_free_part(&disk->part0); + if (disk->queue) + blk_put_queue(disk->queue); + kfree(disk); +} +struct class block_class = { + .name = "block", +}; + +static char *block_devnode(struct device *dev, umode_t *mode, + kuid_t *uid, kgid_t *gid) +{ + struct gendisk *disk = dev_to_disk(dev); + + if (disk->fops->devnode) + return disk->fops->devnode(disk, mode); + return NULL; +} + +const struct device_type disk_type = { + .name = "disk", + .groups = disk_attr_groups, + .release = disk_release, + .devnode = block_devnode, +}; + +#ifdef CONFIG_PROC_FS +/* + * aggregate disk stat collector. Uses the same stats that the sysfs + * entries do, above, but makes them available through one seq_file. + * + * The output looks suspiciously like /proc/partitions with a bunch of + * extra fields. + */ +static int diskstats_show(struct seq_file *seqf, void *v) +{ + struct gendisk *gp = v; + struct disk_part_iter piter; + struct hd_struct *hd; + char buf[BDEVNAME_SIZE]; + unsigned int inflight; + struct disk_stats stat; + + /* + if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next) + seq_puts(seqf, "major minor name" + " rio rmerge rsect ruse wio wmerge " + "wsect wuse running use aveq" + "\n\n"); + */ + + disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0); + while ((hd = disk_part_iter_next(&piter))) { + part_stat_read_all(hd, &stat); + if (queue_is_mq(gp->queue)) + inflight = blk_mq_in_flight(gp->queue, hd); + else + inflight = part_in_flight(hd); + + seq_printf(seqf, "%4d %7d %s " + "%lu %lu %lu %u " + "%lu %lu %lu %u " + "%u %u %u " + "%lu %lu %lu %u " + "%lu %u" + "\n", + MAJOR(part_devt(hd)), MINOR(part_devt(hd)), + disk_name(gp, hd->partno, buf), + stat.ios[STAT_READ], + stat.merges[STAT_READ], + stat.sectors[STAT_READ], + (unsigned int)div_u64(stat.nsecs[STAT_READ], + NSEC_PER_MSEC), + stat.ios[STAT_WRITE], + stat.merges[STAT_WRITE], + stat.sectors[STAT_WRITE], + (unsigned int)div_u64(stat.nsecs[STAT_WRITE], + NSEC_PER_MSEC), + inflight, + jiffies_to_msecs(stat.io_ticks), + (unsigned int)div_u64(stat.nsecs[STAT_READ] + + stat.nsecs[STAT_WRITE] + + stat.nsecs[STAT_DISCARD] + + stat.nsecs[STAT_FLUSH], + NSEC_PER_MSEC), + stat.ios[STAT_DISCARD], + stat.merges[STAT_DISCARD], + stat.sectors[STAT_DISCARD], + (unsigned int)div_u64(stat.nsecs[STAT_DISCARD], + NSEC_PER_MSEC), + stat.ios[STAT_FLUSH], + (unsigned int)div_u64(stat.nsecs[STAT_FLUSH], + NSEC_PER_MSEC) + ); + } + disk_part_iter_exit(&piter); + + return 0; +} + +static const struct seq_operations diskstats_op = { + .start = disk_seqf_start, + .next = disk_seqf_next, + .stop = disk_seqf_stop, + .show = diskstats_show +}; + +static int __init proc_genhd_init(void) +{ + proc_create_seq("diskstats", 0, NULL, &diskstats_op); + proc_create_seq("partitions", 0, NULL, &partitions_op); + return 0; +} +module_init(proc_genhd_init); +#endif /* CONFIG_PROC_FS */ + +dev_t blk_lookup_devt(const char *name, int partno) +{ + dev_t devt = MKDEV(0, 0); + struct class_dev_iter iter; + struct device *dev; + + class_dev_iter_init(&iter, &block_class, NULL, &disk_type); + while ((dev = class_dev_iter_next(&iter))) { + struct gendisk *disk = dev_to_disk(dev); + struct hd_struct *part; + + if (strcmp(dev_name(dev), name)) + continue; + + if (partno < disk->minors) { + /* We need to return the right devno, even + * if the partition doesn't exist yet. + */ + devt = MKDEV(MAJOR(dev->devt), + MINOR(dev->devt) + partno); + break; + } + part = disk_get_part(disk, partno); + if (part) { + devt = part_devt(part); + disk_put_part(part); + break; + } + disk_put_part(part); + } + class_dev_iter_exit(&iter); + return devt; +} + +struct gendisk *__alloc_disk_node(int minors, int node_id) +{ + struct gendisk *disk; + struct disk_part_tbl *ptbl; + + if (minors > DISK_MAX_PARTS) { + printk(KERN_ERR + "block: can't allocate more than %d partitions\n", + DISK_MAX_PARTS); + minors = DISK_MAX_PARTS; + } + + disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id); + if (!disk) + return NULL; + + disk->part0.dkstats = alloc_percpu(struct disk_stats); + if (!disk->part0.dkstats) + goto out_free_disk; + + init_rwsem(&disk->lookup_sem); + disk->node_id = node_id; + if (disk_expand_part_tbl(disk, 0)) { + free_percpu(disk->part0.dkstats); + goto out_free_disk; + } + + ptbl = rcu_dereference_protected(disk->part_tbl, 1); + rcu_assign_pointer(ptbl->part[0], &disk->part0); + + /* + * set_capacity() and get_capacity() currently don't use + * seqcounter to read/update the part0->nr_sects. Still init + * the counter as we can read the sectors in IO submission + * patch using seqence counters. + * + * TODO: Ideally set_capacity() and get_capacity() should be + * converted to make use of bd_mutex and sequence counters. + */ + hd_sects_seq_init(&disk->part0); + if (hd_ref_init(&disk->part0)) + goto out_free_part0; + + disk->minors = minors; + rand_initialize_disk(disk); + disk_to_dev(disk)->class = &block_class; + disk_to_dev(disk)->type = &disk_type; + device_initialize(disk_to_dev(disk)); + return disk; + +out_free_part0: + hd_free_part(&disk->part0); +out_free_disk: + kfree(disk); + return NULL; +} +EXPORT_SYMBOL(__alloc_disk_node); + +/** + * get_disk_and_module - increments the gendisk and gendisk fops module refcount + * @disk: the struct gendisk to increment the refcount for + * + * This increments the refcount for the struct gendisk, and the gendisk's + * fops module owner. + * + * Context: Any context. + */ +struct kobject *get_disk_and_module(struct gendisk *disk) +{ + struct module *owner; + struct kobject *kobj; + + if (!disk->fops) + return NULL; + owner = disk->fops->owner; + if (owner && !try_module_get(owner)) + return NULL; + kobj = kobject_get_unless_zero(&disk_to_dev(disk)->kobj); + if (kobj == NULL) { + module_put(owner); + return NULL; + } + return kobj; + +} +EXPORT_SYMBOL(get_disk_and_module); + +/** + * put_disk - decrements the gendisk refcount + * @disk: the struct gendisk to decrement the refcount for + * + * This decrements the refcount for the struct gendisk. When this reaches 0 + * we'll have disk_release() called. + * + * Context: Any context, but the last reference must not be dropped from + * atomic context. + */ +void put_disk(struct gendisk *disk) +{ + if (disk) + kobject_put(&disk_to_dev(disk)->kobj); +} +EXPORT_SYMBOL(put_disk); + +/** + * put_disk_and_module - decrements the module and gendisk refcount + * @disk: the struct gendisk to decrement the refcount for + * + * This is a counterpart of get_disk_and_module() and thus also of + * get_gendisk(). + * + * Context: Any context, but the last reference must not be dropped from + * atomic context. + */ +void put_disk_and_module(struct gendisk *disk) +{ + if (disk) { + struct module *owner = disk->fops->owner; + + put_disk(disk); + module_put(owner); + } +} +EXPORT_SYMBOL(put_disk_and_module); + +static void set_disk_ro_uevent(struct gendisk *gd, int ro) +{ + char event[] = "DISK_RO=1"; + char *envp[] = { event, NULL }; + + if (!ro) + event[8] = '0'; + kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp); +} + +void set_device_ro(struct block_device *bdev, int flag) +{ + bdev->bd_part->policy = flag; +} + +EXPORT_SYMBOL(set_device_ro); + +void set_disk_ro(struct gendisk *disk, int flag) +{ + struct disk_part_iter piter; + struct hd_struct *part; + + if (disk->part0.policy != flag) { + set_disk_ro_uevent(disk, flag); + disk->part0.policy = flag; + } + + disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); + while ((part = disk_part_iter_next(&piter))) + part->policy = flag; + disk_part_iter_exit(&piter); +} + +EXPORT_SYMBOL(set_disk_ro); + +int bdev_read_only(struct block_device *bdev) +{ + if (!bdev) + return 0; + return bdev->bd_part->policy; +} + +EXPORT_SYMBOL(bdev_read_only); + +/* + * Disk events - monitor disk events like media change and eject request. + */ +struct disk_events { + struct list_head node; /* all disk_event's */ + struct gendisk *disk; /* the associated disk */ + spinlock_t lock; + + struct mutex block_mutex; /* protects blocking */ + int block; /* event blocking depth */ + unsigned int pending; /* events already sent out */ + unsigned int clearing; /* events being cleared */ + + long poll_msecs; /* interval, -1 for default */ + struct delayed_work dwork; +}; + +static const char *disk_events_strs[] = { + [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "media_change", + [ilog2(DISK_EVENT_EJECT_REQUEST)] = "eject_request", +}; + +static char *disk_uevents[] = { + [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "DISK_MEDIA_CHANGE=1", + [ilog2(DISK_EVENT_EJECT_REQUEST)] = "DISK_EJECT_REQUEST=1", +}; + +/* list of all disk_events */ +static DEFINE_MUTEX(disk_events_mutex); +static LIST_HEAD(disk_events); + +/* disable in-kernel polling by default */ +static unsigned long disk_events_dfl_poll_msecs; + +static unsigned long disk_events_poll_jiffies(struct gendisk *disk) +{ + struct disk_events *ev = disk->ev; + long intv_msecs = 0; + + /* + * If device-specific poll interval is set, always use it. If + * the default is being used, poll if the POLL flag is set. + */ + if (ev->poll_msecs >= 0) + intv_msecs = ev->poll_msecs; + else if (disk->event_flags & DISK_EVENT_FLAG_POLL) + intv_msecs = disk_events_dfl_poll_msecs; + + return msecs_to_jiffies(intv_msecs); +} + +/** + * disk_block_events - block and flush disk event checking + * @disk: disk to block events for + * + * On return from this function, it is guaranteed that event checking + * isn't in progress and won't happen until unblocked by + * disk_unblock_events(). Events blocking is counted and the actual + * unblocking happens after the matching number of unblocks are done. + * + * Note that this intentionally does not block event checking from + * disk_clear_events(). + * + * CONTEXT: + * Might sleep. + */ +void disk_block_events(struct gendisk *disk) +{ + struct disk_events *ev = disk->ev; + unsigned long flags; + bool cancel; + + if (!ev) + return; + + /* + * Outer mutex ensures that the first blocker completes canceling + * the event work before further blockers are allowed to finish. + */ + mutex_lock(&ev->block_mutex); + + spin_lock_irqsave(&ev->lock, flags); + cancel = !ev->block++; + spin_unlock_irqrestore(&ev->lock, flags); + + if (cancel) + cancel_delayed_work_sync(&disk->ev->dwork); + + mutex_unlock(&ev->block_mutex); +} + +static void __disk_unblock_events(struct gendisk *disk, bool check_now) +{ + struct disk_events *ev = disk->ev; + unsigned long intv; + unsigned long flags; + + spin_lock_irqsave(&ev->lock, flags); + + if (WARN_ON_ONCE(ev->block <= 0)) + goto out_unlock; + + if (--ev->block) + goto out_unlock; + + intv = disk_events_poll_jiffies(disk); + if (check_now) + queue_delayed_work(system_freezable_power_efficient_wq, + &ev->dwork, 0); + else if (intv) + queue_delayed_work(system_freezable_power_efficient_wq, + &ev->dwork, intv); +out_unlock: + spin_unlock_irqrestore(&ev->lock, flags); +} + +/** + * disk_unblock_events - unblock disk event checking + * @disk: disk to unblock events for + * + * Undo disk_block_events(). When the block count reaches zero, it + * starts events polling if configured. + * + * CONTEXT: + * Don't care. Safe to call from irq context. + */ +void disk_unblock_events(struct gendisk *disk) +{ + if (disk->ev) + __disk_unblock_events(disk, false); +} + +/** + * disk_flush_events - schedule immediate event checking and flushing + * @disk: disk to check and flush events for + * @mask: events to flush + * + * Schedule immediate event checking on @disk if not blocked. Events in + * @mask are scheduled to be cleared from the driver. Note that this + * doesn't clear the events from @disk->ev. + * + * CONTEXT: + * If @mask is non-zero must be called with bdev->bd_mutex held. + */ +void disk_flush_events(struct gendisk *disk, unsigned int mask) +{ + struct disk_events *ev = disk->ev; + + if (!ev) + return; + + spin_lock_irq(&ev->lock); + ev->clearing |= mask; + if (!ev->block) + mod_delayed_work(system_freezable_power_efficient_wq, + &ev->dwork, 0); + spin_unlock_irq(&ev->lock); +} + +/** + * disk_clear_events - synchronously check, clear and return pending events + * @disk: disk to fetch and clear events from + * @mask: mask of events to be fetched and cleared + * + * Disk events are synchronously checked and pending events in @mask + * are cleared and returned. This ignores the block count. + * + * CONTEXT: + * Might sleep. + */ +static unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask) +{ + struct disk_events *ev = disk->ev; + unsigned int pending; + unsigned int clearing = mask; + + if (!ev) + return 0; + + disk_block_events(disk); + + /* + * store the union of mask and ev->clearing on the stack so that the + * race with disk_flush_events does not cause ambiguity (ev->clearing + * can still be modified even if events are blocked). + */ + spin_lock_irq(&ev->lock); + clearing |= ev->clearing; + ev->clearing = 0; + spin_unlock_irq(&ev->lock); + + disk_check_events(ev, &clearing); + /* + * if ev->clearing is not 0, the disk_flush_events got called in the + * middle of this function, so we want to run the workfn without delay. + */ + __disk_unblock_events(disk, ev->clearing ? true : false); + + /* then, fetch and clear pending events */ + spin_lock_irq(&ev->lock); + pending = ev->pending & mask; + ev->pending &= ~mask; + spin_unlock_irq(&ev->lock); + WARN_ON_ONCE(clearing & mask); + + return pending; +} + +/** + * bdev_check_media_change - check if a removable media has been changed + * @bdev: block device to check + * + * Check whether a removable media has been changed, and attempt to free all + * dentries and inodes and invalidates all block device page cache entries in + * that case. + * + * Returns %true if the block device changed, or %false if not. + */ +bool bdev_check_media_change(struct block_device *bdev) +{ + unsigned int events; + + events = disk_clear_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE | + DISK_EVENT_EJECT_REQUEST); + if (!(events & DISK_EVENT_MEDIA_CHANGE)) + return false; + + if (__invalidate_device(bdev, true)) + pr_warn("VFS: busy inodes on changed media %s\n", + bdev->bd_disk->disk_name); + set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state); + return true; +} +EXPORT_SYMBOL(bdev_check_media_change); + +/* + * Separate this part out so that a different pointer for clearing_ptr can be + * passed in for disk_clear_events. + */ +static void disk_events_workfn(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct disk_events *ev = container_of(dwork, struct disk_events, dwork); + + disk_check_events(ev, &ev->clearing); +} + +static void disk_check_events(struct disk_events *ev, + unsigned int *clearing_ptr) +{ + struct gendisk *disk = ev->disk; + char *envp[ARRAY_SIZE(disk_uevents) + 1] = { }; + unsigned int clearing = *clearing_ptr; + unsigned int events; + unsigned long intv; + int nr_events = 0, i; + + /* check events */ + events = disk->fops->check_events(disk, clearing); + + /* accumulate pending events and schedule next poll if necessary */ + spin_lock_irq(&ev->lock); + + events &= ~ev->pending; + ev->pending |= events; + *clearing_ptr &= ~clearing; + + intv = disk_events_poll_jiffies(disk); + if (!ev->block && intv) + queue_delayed_work(system_freezable_power_efficient_wq, + &ev->dwork, intv); + + spin_unlock_irq(&ev->lock); + + /* + * Tell userland about new events. Only the events listed in + * @disk->events are reported, and only if DISK_EVENT_FLAG_UEVENT + * is set. Otherwise, events are processed internally but never + * get reported to userland. + */ + for (i = 0; i < ARRAY_SIZE(disk_uevents); i++) + if ((events & disk->events & (1 << i)) && + (disk->event_flags & DISK_EVENT_FLAG_UEVENT)) + envp[nr_events++] = disk_uevents[i]; + + if (nr_events) + kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); +} + +/* + * A disk events enabled device has the following sysfs nodes under + * its /sys/block/X/ directory. + * + * events : list of all supported events + * events_async : list of events which can be detected w/o polling + * (always empty, only for backwards compatibility) + * events_poll_msecs : polling interval, 0: disable, -1: system default + */ +static ssize_t __disk_events_show(unsigned int events, char *buf) +{ + const char *delim = ""; + ssize_t pos = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(disk_events_strs); i++) + if (events & (1 << i)) { + pos += sprintf(buf + pos, "%s%s", + delim, disk_events_strs[i]); + delim = " "; + } + if (pos) + pos += sprintf(buf + pos, "\n"); + return pos; +} + +static ssize_t disk_events_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + if (!(disk->event_flags & DISK_EVENT_FLAG_UEVENT)) + return 0; + + return __disk_events_show(disk->events, buf); +} + +static ssize_t disk_events_async_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return 0; +} + +static ssize_t disk_events_poll_msecs_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + if (!disk->ev) + return sprintf(buf, "-1\n"); + + return sprintf(buf, "%ld\n", disk->ev->poll_msecs); +} + +static ssize_t disk_events_poll_msecs_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct gendisk *disk = dev_to_disk(dev); + long intv; + + if (!count || !sscanf(buf, "%ld", &intv)) + return -EINVAL; + + if (intv < 0 && intv != -1) + return -EINVAL; + + if (!disk->ev) + return -ENODEV; + + disk_block_events(disk); + disk->ev->poll_msecs = intv; + __disk_unblock_events(disk, true); + + return count; +} + +static const DEVICE_ATTR(events, 0444, disk_events_show, NULL); +static const DEVICE_ATTR(events_async, 0444, disk_events_async_show, NULL); +static const DEVICE_ATTR(events_poll_msecs, 0644, + disk_events_poll_msecs_show, + disk_events_poll_msecs_store); + +static const struct attribute *disk_events_attrs[] = { + &dev_attr_events.attr, + &dev_attr_events_async.attr, + &dev_attr_events_poll_msecs.attr, + NULL, +}; + +/* + * The default polling interval can be specified by the kernel + * parameter block.events_dfl_poll_msecs which defaults to 0 + * (disable). This can also be modified runtime by writing to + * /sys/module/block/parameters/events_dfl_poll_msecs. + */ +static int disk_events_set_dfl_poll_msecs(const char *val, + const struct kernel_param *kp) +{ + struct disk_events *ev; + int ret; + + ret = param_set_ulong(val, kp); + if (ret < 0) + return ret; + + mutex_lock(&disk_events_mutex); + + list_for_each_entry(ev, &disk_events, node) + disk_flush_events(ev->disk, 0); + + mutex_unlock(&disk_events_mutex); + + return 0; +} + +static const struct kernel_param_ops disk_events_dfl_poll_msecs_param_ops = { + .set = disk_events_set_dfl_poll_msecs, + .get = param_get_ulong, +}; + +#undef MODULE_PARAM_PREFIX +#define MODULE_PARAM_PREFIX "block." + +module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops, + &disk_events_dfl_poll_msecs, 0644); + +/* + * disk_{alloc|add|del|release}_events - initialize and destroy disk_events. + */ +static void disk_alloc_events(struct gendisk *disk) +{ + struct disk_events *ev; + + if (!disk->fops->check_events || !disk->events) + return; + + ev = kzalloc(sizeof(*ev), GFP_KERNEL); + if (!ev) { + pr_warn("%s: failed to initialize events\n", disk->disk_name); + return; + } + + INIT_LIST_HEAD(&ev->node); + ev->disk = disk; + spin_lock_init(&ev->lock); + mutex_init(&ev->block_mutex); + ev->block = 1; + ev->poll_msecs = -1; + INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn); + + disk->ev = ev; +} + +static void disk_add_events(struct gendisk *disk) +{ + /* FIXME: error handling */ + if (sysfs_create_files(&disk_to_dev(disk)->kobj, disk_events_attrs) < 0) + pr_warn("%s: failed to create sysfs files for events\n", + disk->disk_name); + + if (!disk->ev) + return; + + mutex_lock(&disk_events_mutex); + list_add_tail(&disk->ev->node, &disk_events); + mutex_unlock(&disk_events_mutex); + + /* + * Block count is initialized to 1 and the following initial + * unblock kicks it into action. + */ + __disk_unblock_events(disk, true); +} + +static void disk_del_events(struct gendisk *disk) +{ + if (disk->ev) { + disk_block_events(disk); + + mutex_lock(&disk_events_mutex); + list_del_init(&disk->ev->node); + mutex_unlock(&disk_events_mutex); + } + + sysfs_remove_files(&disk_to_dev(disk)->kobj, disk_events_attrs); +} + +static void disk_release_events(struct gendisk *disk) +{ + /* the block count should be 1 from disk_del_events() */ + WARN_ON_ONCE(disk->ev && disk->ev->block != 1); + kfree(disk->ev); +} diff --git a/block/ioctl.c b/block/ioctl.c new file mode 100644 index 000000000..e7eed7dad --- /dev/null +++ b/block/ioctl.c @@ -0,0 +1,707 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "blk.h" + +static int blkpg_do_ioctl(struct block_device *bdev, + struct blkpg_partition __user *upart, int op) +{ + struct blkpg_partition p; + long long start, length; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + if (copy_from_user(&p, upart, sizeof(struct blkpg_partition))) + return -EFAULT; + if (bdev_is_partition(bdev)) + return -EINVAL; + + if (p.pno <= 0) + return -EINVAL; + + if (op == BLKPG_DEL_PARTITION) + return bdev_del_partition(bdev, p.pno); + + start = p.start >> SECTOR_SHIFT; + length = p.length >> SECTOR_SHIFT; + + /* check for fit in a hd_struct */ + if (sizeof(sector_t) < sizeof(long long)) { + long pstart = start, plength = length; + + if (pstart != start || plength != length || pstart < 0 || + plength < 0 || p.pno > 65535) + return -EINVAL; + } + + switch (op) { + case BLKPG_ADD_PARTITION: + /* check if partition is aligned to blocksize */ + if (p.start & (bdev_logical_block_size(bdev) - 1)) + return -EINVAL; + return bdev_add_partition(bdev, p.pno, start, length); + case BLKPG_RESIZE_PARTITION: + return bdev_resize_partition(bdev, p.pno, start, length); + default: + return -EINVAL; + } +} + +static int blkpg_ioctl(struct block_device *bdev, + struct blkpg_ioctl_arg __user *arg) +{ + struct blkpg_partition __user *udata; + int op; + + if (get_user(op, &arg->op) || get_user(udata, &arg->data)) + return -EFAULT; + + return blkpg_do_ioctl(bdev, udata, op); +} + +#ifdef CONFIG_COMPAT +struct compat_blkpg_ioctl_arg { + compat_int_t op; + compat_int_t flags; + compat_int_t datalen; + compat_caddr_t data; +}; + +static int compat_blkpg_ioctl(struct block_device *bdev, + struct compat_blkpg_ioctl_arg __user *arg) +{ + compat_caddr_t udata; + int op; + + if (get_user(op, &arg->op) || get_user(udata, &arg->data)) + return -EFAULT; + + return blkpg_do_ioctl(bdev, compat_ptr(udata), op); +} +#endif + +static int blkdev_reread_part(struct block_device *bdev, fmode_t mode) +{ + struct block_device *tmp; + + if (!disk_part_scan_enabled(bdev->bd_disk) || bdev_is_partition(bdev)) + return -EINVAL; + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + if (bdev->bd_part_count) + return -EBUSY; + + /* + * Reopen the device to revalidate the driver state and force a + * partition rescan. + */ + mode &= ~FMODE_EXCL; + set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state); + + tmp = blkdev_get_by_dev(bdev->bd_dev, mode, NULL); + if (IS_ERR(tmp)) + return PTR_ERR(tmp); + blkdev_put(tmp, mode); + return 0; +} + +static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode, + unsigned long arg, unsigned long flags) +{ + uint64_t range[2]; + uint64_t start, len; + struct request_queue *q = bdev_get_queue(bdev); + int err; + + if (!(mode & FMODE_WRITE)) + return -EBADF; + + if (!blk_queue_discard(q)) + return -EOPNOTSUPP; + + if (copy_from_user(range, (void __user *)arg, sizeof(range))) + return -EFAULT; + + start = range[0]; + len = range[1]; + + if (start & 511) + return -EINVAL; + if (len & 511) + return -EINVAL; + + if (start + len > i_size_read(bdev->bd_inode)) + return -EINVAL; + + err = truncate_bdev_range(bdev, mode, start, start + len - 1); + if (err) + return err; + + return blkdev_issue_discard(bdev, start >> 9, len >> 9, + GFP_KERNEL, flags); +} + +static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode, + unsigned long arg) +{ + uint64_t range[2]; + uint64_t start, end, len; + int err; + + if (!(mode & FMODE_WRITE)) + return -EBADF; + + if (copy_from_user(range, (void __user *)arg, sizeof(range))) + return -EFAULT; + + start = range[0]; + len = range[1]; + end = start + len - 1; + + if (start & 511) + return -EINVAL; + if (len & 511) + return -EINVAL; + if (end >= (uint64_t)i_size_read(bdev->bd_inode)) + return -EINVAL; + if (end < start) + return -EINVAL; + + /* Invalidate the page cache, including dirty pages */ + err = truncate_bdev_range(bdev, mode, start, end); + if (err) + return err; + + return blkdev_issue_zeroout(bdev, start >> 9, len >> 9, GFP_KERNEL, + BLKDEV_ZERO_NOUNMAP); +} + +static int put_ushort(unsigned short __user *argp, unsigned short val) +{ + return put_user(val, argp); +} + +static int put_int(int __user *argp, int val) +{ + return put_user(val, argp); +} + +static int put_uint(unsigned int __user *argp, unsigned int val) +{ + return put_user(val, argp); +} + +static int put_long(long __user *argp, long val) +{ + return put_user(val, argp); +} + +static int put_ulong(unsigned long __user *argp, unsigned long val) +{ + return put_user(val, argp); +} + +static int put_u64(u64 __user *argp, u64 val) +{ + return put_user(val, argp); +} + +#ifdef CONFIG_COMPAT +static int compat_put_long(compat_long_t __user *argp, long val) +{ + return put_user(val, argp); +} + +static int compat_put_ulong(compat_ulong_t __user *argp, compat_ulong_t val) +{ + return put_user(val, argp); +} +#endif + +int __blkdev_driver_ioctl(struct block_device *bdev, fmode_t mode, + unsigned cmd, unsigned long arg) +{ + struct gendisk *disk = bdev->bd_disk; + + if (disk->fops->ioctl) + return disk->fops->ioctl(bdev, mode, cmd, arg); + + return -ENOTTY; +} +/* + * For the record: _GPL here is only because somebody decided to slap it + * on the previous export. Sheer idiocy, since it wasn't copyrightable + * at all and could be open-coded without any exports by anybody who cares. + */ +EXPORT_SYMBOL_GPL(__blkdev_driver_ioctl); + +#ifdef CONFIG_COMPAT +/* + * This is the equivalent of compat_ptr_ioctl(), to be used by block + * drivers that implement only commands that are completely compatible + * between 32-bit and 64-bit user space + */ +int blkdev_compat_ptr_ioctl(struct block_device *bdev, fmode_t mode, + unsigned cmd, unsigned long arg) +{ + struct gendisk *disk = bdev->bd_disk; + + if (disk->fops->ioctl) + return disk->fops->ioctl(bdev, mode, cmd, + (unsigned long)compat_ptr(arg)); + + return -ENOIOCTLCMD; +} +EXPORT_SYMBOL(blkdev_compat_ptr_ioctl); +#endif + +static int blkdev_pr_register(struct block_device *bdev, + struct pr_registration __user *arg) +{ + const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; + struct pr_registration reg; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (!ops || !ops->pr_register) + return -EOPNOTSUPP; + if (copy_from_user(®, arg, sizeof(reg))) + return -EFAULT; + + if (reg.flags & ~PR_FL_IGNORE_KEY) + return -EOPNOTSUPP; + return ops->pr_register(bdev, reg.old_key, reg.new_key, reg.flags); +} + +static int blkdev_pr_reserve(struct block_device *bdev, + struct pr_reservation __user *arg) +{ + const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; + struct pr_reservation rsv; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (!ops || !ops->pr_reserve) + return -EOPNOTSUPP; + if (copy_from_user(&rsv, arg, sizeof(rsv))) + return -EFAULT; + + if (rsv.flags & ~PR_FL_IGNORE_KEY) + return -EOPNOTSUPP; + return ops->pr_reserve(bdev, rsv.key, rsv.type, rsv.flags); +} + +static int blkdev_pr_release(struct block_device *bdev, + struct pr_reservation __user *arg) +{ + const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; + struct pr_reservation rsv; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (!ops || !ops->pr_release) + return -EOPNOTSUPP; + if (copy_from_user(&rsv, arg, sizeof(rsv))) + return -EFAULT; + + if (rsv.flags) + return -EOPNOTSUPP; + return ops->pr_release(bdev, rsv.key, rsv.type); +} + +static int blkdev_pr_preempt(struct block_device *bdev, + struct pr_preempt __user *arg, bool abort) +{ + const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; + struct pr_preempt p; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (!ops || !ops->pr_preempt) + return -EOPNOTSUPP; + if (copy_from_user(&p, arg, sizeof(p))) + return -EFAULT; + + if (p.flags) + return -EOPNOTSUPP; + return ops->pr_preempt(bdev, p.old_key, p.new_key, p.type, abort); +} + +static int blkdev_pr_clear(struct block_device *bdev, + struct pr_clear __user *arg) +{ + const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; + struct pr_clear c; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (!ops || !ops->pr_clear) + return -EOPNOTSUPP; + if (copy_from_user(&c, arg, sizeof(c))) + return -EFAULT; + + if (c.flags) + return -EOPNOTSUPP; + return ops->pr_clear(bdev, c.key); +} + +/* + * Is it an unrecognized ioctl? The correct returns are either + * ENOTTY (final) or ENOIOCTLCMD ("I don't know this one, try a + * fallback"). ENOIOCTLCMD gets turned into ENOTTY by the ioctl + * code before returning. + * + * Confused drivers sometimes return EINVAL, which is wrong. It + * means "I understood the ioctl command, but the parameters to + * it were wrong". + * + * We should aim to just fix the broken drivers, the EINVAL case + * should go away. + */ +static inline int is_unrecognized_ioctl(int ret) +{ + return ret == -EINVAL || + ret == -ENOTTY || + ret == -ENOIOCTLCMD; +} + +static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode, + unsigned cmd, unsigned long arg) +{ + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg); + if (!is_unrecognized_ioctl(ret)) + return ret; + + fsync_bdev(bdev); + invalidate_bdev(bdev); + return 0; +} + +static int blkdev_roset(struct block_device *bdev, fmode_t mode, + unsigned cmd, unsigned long arg) +{ + int ret, n; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + + ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg); + if (!is_unrecognized_ioctl(ret)) + return ret; + if (get_user(n, (int __user *)arg)) + return -EFAULT; + set_device_ro(bdev, n); + return 0; +} + +static int blkdev_getgeo(struct block_device *bdev, + struct hd_geometry __user *argp) +{ + struct gendisk *disk = bdev->bd_disk; + struct hd_geometry geo; + int ret; + + if (!argp) + return -EINVAL; + if (!disk->fops->getgeo) + return -ENOTTY; + + /* + * We need to set the startsect first, the driver may + * want to override it. + */ + memset(&geo, 0, sizeof(geo)); + geo.start = get_start_sect(bdev); + ret = disk->fops->getgeo(bdev, &geo); + if (ret) + return ret; + if (copy_to_user(argp, &geo, sizeof(geo))) + return -EFAULT; + return 0; +} + +#ifdef CONFIG_COMPAT +struct compat_hd_geometry { + unsigned char heads; + unsigned char sectors; + unsigned short cylinders; + u32 start; +}; + +static int compat_hdio_getgeo(struct block_device *bdev, + struct compat_hd_geometry __user *ugeo) +{ + struct gendisk *disk = bdev->bd_disk; + struct hd_geometry geo; + int ret; + + if (!ugeo) + return -EINVAL; + if (!disk->fops->getgeo) + return -ENOTTY; + + memset(&geo, 0, sizeof(geo)); + /* + * We need to set the startsect first, the driver may + * want to override it. + */ + geo.start = get_start_sect(bdev); + ret = disk->fops->getgeo(bdev, &geo); + if (ret) + return ret; + + ret = copy_to_user(ugeo, &geo, 4); + ret |= put_user(geo.start, &ugeo->start); + if (ret) + ret = -EFAULT; + + return ret; +} +#endif + +/* set the logical block size */ +static int blkdev_bszset(struct block_device *bdev, fmode_t mode, + int __user *argp) +{ + int ret, n; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + if (!argp) + return -EINVAL; + if (get_user(n, argp)) + return -EFAULT; + + if (mode & FMODE_EXCL) + return set_blocksize(bdev, n); + + if (IS_ERR(blkdev_get_by_dev(bdev->bd_dev, mode | FMODE_EXCL, &bdev))) + return -EBUSY; + ret = set_blocksize(bdev, n); + blkdev_put(bdev, mode | FMODE_EXCL); + + return ret; +} + +/* + * Common commands that are handled the same way on native and compat + * user space. Note the separate arg/argp parameters that are needed + * to deal with the compat_ptr() conversion. + */ +static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode, + unsigned cmd, unsigned long arg, void __user *argp) +{ + unsigned int max_sectors; + + switch (cmd) { + case BLKFLSBUF: + return blkdev_flushbuf(bdev, mode, cmd, arg); + case BLKROSET: + return blkdev_roset(bdev, mode, cmd, arg); + case BLKDISCARD: + return blk_ioctl_discard(bdev, mode, arg, 0); + case BLKSECDISCARD: + return blk_ioctl_discard(bdev, mode, arg, + BLKDEV_DISCARD_SECURE); + case BLKZEROOUT: + return blk_ioctl_zeroout(bdev, mode, arg); + case BLKREPORTZONE: + return blkdev_report_zones_ioctl(bdev, mode, cmd, arg); + case BLKRESETZONE: + case BLKOPENZONE: + case BLKCLOSEZONE: + case BLKFINISHZONE: + return blkdev_zone_mgmt_ioctl(bdev, mode, cmd, arg); + case BLKGETZONESZ: + return put_uint(argp, bdev_zone_sectors(bdev)); + case BLKGETNRZONES: + return put_uint(argp, blkdev_nr_zones(bdev->bd_disk)); + case BLKROGET: + return put_int(argp, bdev_read_only(bdev) != 0); + case BLKSSZGET: /* get block device logical block size */ + return put_int(argp, bdev_logical_block_size(bdev)); + case BLKPBSZGET: /* get block device physical block size */ + return put_uint(argp, bdev_physical_block_size(bdev)); + case BLKIOMIN: + return put_uint(argp, bdev_io_min(bdev)); + case BLKIOOPT: + return put_uint(argp, bdev_io_opt(bdev)); + case BLKALIGNOFF: + return put_int(argp, bdev_alignment_offset(bdev)); + case BLKDISCARDZEROES: + return put_uint(argp, 0); + case BLKSECTGET: + max_sectors = min_t(unsigned int, USHRT_MAX, + queue_max_sectors(bdev_get_queue(bdev))); + return put_ushort(argp, max_sectors); + case BLKROTATIONAL: + return put_ushort(argp, !blk_queue_nonrot(bdev_get_queue(bdev))); + case BLKRASET: + case BLKFRASET: + if(!capable(CAP_SYS_ADMIN)) + return -EACCES; + bdev->bd_bdi->ra_pages = (arg * 512) / PAGE_SIZE; + return 0; + case BLKRRPART: + return blkdev_reread_part(bdev, mode); + case BLKTRACESTART: + case BLKTRACESTOP: + case BLKTRACETEARDOWN: + return blk_trace_ioctl(bdev, cmd, argp); + case IOC_PR_REGISTER: + return blkdev_pr_register(bdev, argp); + case IOC_PR_RESERVE: + return blkdev_pr_reserve(bdev, argp); + case IOC_PR_RELEASE: + return blkdev_pr_release(bdev, argp); + case IOC_PR_PREEMPT: + return blkdev_pr_preempt(bdev, argp, false); + case IOC_PR_PREEMPT_ABORT: + return blkdev_pr_preempt(bdev, argp, true); + case IOC_PR_CLEAR: + return blkdev_pr_clear(bdev, argp); + default: + return -ENOIOCTLCMD; + } +} + +/* + * Always keep this in sync with compat_blkdev_ioctl() + * to handle all incompatible commands in both functions. + * + * New commands must be compatible and go into blkdev_common_ioctl + */ +int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, + unsigned long arg) +{ + int ret; + loff_t size; + void __user *argp = (void __user *)arg; + + switch (cmd) { + /* These need separate implementations for the data structure */ + case HDIO_GETGEO: + return blkdev_getgeo(bdev, argp); + case BLKPG: + return blkpg_ioctl(bdev, argp); + + /* Compat mode returns 32-bit data instead of 'long' */ + case BLKRAGET: + case BLKFRAGET: + if (!argp) + return -EINVAL; + return put_long(argp, (bdev->bd_bdi->ra_pages*PAGE_SIZE) / 512); + case BLKGETSIZE: + size = i_size_read(bdev->bd_inode); + if ((size >> 9) > ~0UL) + return -EFBIG; + return put_ulong(argp, size >> 9); + + /* The data is compatible, but the command number is different */ + case BLKBSZGET: /* get block device soft block size (cf. BLKSSZGET) */ + return put_int(argp, block_size(bdev)); + case BLKBSZSET: + return blkdev_bszset(bdev, mode, argp); + case BLKGETSIZE64: + return put_u64(argp, i_size_read(bdev->bd_inode)); + + /* Incompatible alignment on i386 */ + case BLKTRACESETUP: + return blk_trace_ioctl(bdev, cmd, argp); + default: + break; + } + + ret = blkdev_common_ioctl(bdev, mode, cmd, arg, argp); + if (ret == -ENOIOCTLCMD) + return __blkdev_driver_ioctl(bdev, mode, cmd, arg); + + return ret; +} +EXPORT_SYMBOL_GPL(blkdev_ioctl); /* for /dev/raw */ + +#ifdef CONFIG_COMPAT + +#define BLKBSZGET_32 _IOR(0x12, 112, int) +#define BLKBSZSET_32 _IOW(0x12, 113, int) +#define BLKGETSIZE64_32 _IOR(0x12, 114, int) + +/* Most of the generic ioctls are handled in the normal fallback path. + This assumes the blkdev's low level compat_ioctl always returns + ENOIOCTLCMD for unknown ioctls. */ +long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) +{ + int ret; + void __user *argp = compat_ptr(arg); + struct inode *inode = file->f_mapping->host; + struct block_device *bdev = inode->i_bdev; + struct gendisk *disk = bdev->bd_disk; + fmode_t mode = file->f_mode; + loff_t size; + + /* + * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have + * to updated it before every ioctl. + */ + if (file->f_flags & O_NDELAY) + mode |= FMODE_NDELAY; + else + mode &= ~FMODE_NDELAY; + + switch (cmd) { + /* These need separate implementations for the data structure */ + case HDIO_GETGEO: + return compat_hdio_getgeo(bdev, argp); + case BLKPG: + return compat_blkpg_ioctl(bdev, argp); + + /* Compat mode returns 32-bit data instead of 'long' */ + case BLKRAGET: + case BLKFRAGET: + if (!argp) + return -EINVAL; + return compat_put_long(argp, + (bdev->bd_bdi->ra_pages * PAGE_SIZE) / 512); + case BLKGETSIZE: + size = i_size_read(bdev->bd_inode); + if ((size >> 9) > ~(compat_ulong_t)0) + return -EFBIG; + return compat_put_ulong(argp, size >> 9); + + /* The data is compatible, but the command number is different */ + case BLKBSZGET_32: /* get the logical block size (cf. BLKSSZGET) */ + return put_int(argp, bdev_logical_block_size(bdev)); + case BLKBSZSET_32: + return blkdev_bszset(bdev, mode, argp); + case BLKGETSIZE64_32: + return put_u64(argp, i_size_read(bdev->bd_inode)); + + /* Incompatible alignment on i386 */ + case BLKTRACESETUP32: + return blk_trace_ioctl(bdev, cmd, argp); + default: + break; + } + + ret = blkdev_common_ioctl(bdev, mode, cmd, arg, argp); + if (ret == -ENOIOCTLCMD && disk->fops->compat_ioctl) + ret = disk->fops->compat_ioctl(bdev, mode, cmd, arg); + + return ret; +} +#endif diff --git a/block/ioprio.c b/block/ioprio.c new file mode 100644 index 000000000..c8878647d --- /dev/null +++ b/block/ioprio.c @@ -0,0 +1,262 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/ioprio.c + * + * Copyright (C) 2004 Jens Axboe + * + * Helper functions for setting/querying io priorities of processes. The + * system calls closely mimmick getpriority/setpriority, see the man page for + * those. The prio argument is a composite of prio class and prio data, where + * the data argument has meaning within that class. The standard scheduling + * classes have 8 distinct prio levels, with 0 being the highest prio and 7 + * being the lowest. + * + * IOW, setting BE scheduling class with prio 2 is done ala: + * + * unsigned int prio = (IOPRIO_CLASS_BE << IOPRIO_CLASS_SHIFT) | 2; + * + * ioprio_set(PRIO_PROCESS, pid, prio); + * + * See also Documentation/block/ioprio.rst + * + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int set_task_ioprio(struct task_struct *task, int ioprio) +{ + int err; + struct io_context *ioc; + const struct cred *cred = current_cred(), *tcred; + + rcu_read_lock(); + tcred = __task_cred(task); + if (!uid_eq(tcred->uid, cred->euid) && + !uid_eq(tcred->uid, cred->uid) && !capable(CAP_SYS_NICE)) { + rcu_read_unlock(); + return -EPERM; + } + rcu_read_unlock(); + + err = security_task_setioprio(task, ioprio); + if (err) + return err; + + ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); + if (ioc) { + ioc->ioprio = ioprio; + put_io_context(ioc); + } + + return err; +} +EXPORT_SYMBOL_GPL(set_task_ioprio); + +int ioprio_check_cap(int ioprio) +{ + int class = IOPRIO_PRIO_CLASS(ioprio); + int data = IOPRIO_PRIO_DATA(ioprio); + + switch (class) { + case IOPRIO_CLASS_RT: + /* + * Originally this only checked for CAP_SYS_ADMIN, + * which was implicitly allowed for pid 0 by security + * modules such as SELinux. Make sure we check + * CAP_SYS_ADMIN first to avoid a denial/avc for + * possibly missing CAP_SYS_NICE permission. + */ + if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_NICE)) + return -EPERM; + fallthrough; + /* rt has prio field too */ + case IOPRIO_CLASS_BE: + if (data >= IOPRIO_BE_NR || data < 0) + return -EINVAL; + + break; + case IOPRIO_CLASS_IDLE: + break; + case IOPRIO_CLASS_NONE: + if (data) + return -EINVAL; + break; + default: + return -EINVAL; + } + + return 0; +} + +SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio) +{ + struct task_struct *p, *g; + struct user_struct *user; + struct pid *pgrp; + kuid_t uid; + int ret; + + ret = ioprio_check_cap(ioprio); + if (ret) + return ret; + + ret = -ESRCH; + rcu_read_lock(); + switch (which) { + case IOPRIO_WHO_PROCESS: + if (!who) + p = current; + else + p = find_task_by_vpid(who); + if (p) + ret = set_task_ioprio(p, ioprio); + break; + case IOPRIO_WHO_PGRP: + if (!who) + pgrp = task_pgrp(current); + else + pgrp = find_vpid(who); + do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { + ret = set_task_ioprio(p, ioprio); + if (ret) + break; + } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); + break; + case IOPRIO_WHO_USER: + uid = make_kuid(current_user_ns(), who); + if (!uid_valid(uid)) + break; + if (!who) + user = current_user(); + else + user = find_user(uid); + + if (!user) + break; + + for_each_process_thread(g, p) { + if (!uid_eq(task_uid(p), uid) || + !task_pid_vnr(p)) + continue; + ret = set_task_ioprio(p, ioprio); + if (ret) + goto free_uid; + } +free_uid: + if (who) + free_uid(user); + break; + default: + ret = -EINVAL; + } + + rcu_read_unlock(); + return ret; +} + +static int get_task_ioprio(struct task_struct *p) +{ + int ret; + + ret = security_task_getioprio(p); + if (ret) + goto out; + ret = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, IOPRIO_NORM); + task_lock(p); + if (p->io_context) + ret = p->io_context->ioprio; + task_unlock(p); +out: + return ret; +} + +int ioprio_best(unsigned short aprio, unsigned short bprio) +{ + if (!ioprio_valid(aprio)) + aprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM); + if (!ioprio_valid(bprio)) + bprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM); + + return min(aprio, bprio); +} + +SYSCALL_DEFINE2(ioprio_get, int, which, int, who) +{ + struct task_struct *g, *p; + struct user_struct *user; + struct pid *pgrp; + kuid_t uid; + int ret = -ESRCH; + int tmpio; + + rcu_read_lock(); + switch (which) { + case IOPRIO_WHO_PROCESS: + if (!who) + p = current; + else + p = find_task_by_vpid(who); + if (p) + ret = get_task_ioprio(p); + break; + case IOPRIO_WHO_PGRP: + if (!who) + pgrp = task_pgrp(current); + else + pgrp = find_vpid(who); + read_lock(&tasklist_lock); + do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { + tmpio = get_task_ioprio(p); + if (tmpio < 0) + continue; + if (ret == -ESRCH) + ret = tmpio; + else + ret = ioprio_best(ret, tmpio); + } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); + read_unlock(&tasklist_lock); + + break; + case IOPRIO_WHO_USER: + uid = make_kuid(current_user_ns(), who); + if (!who) + user = current_user(); + else + user = find_user(uid); + + if (!user) + break; + + for_each_process_thread(g, p) { + if (!uid_eq(task_uid(p), user->uid) || + !task_pid_vnr(p)) + continue; + tmpio = get_task_ioprio(p); + if (tmpio < 0) + continue; + if (ret == -ESRCH) + ret = tmpio; + else + ret = ioprio_best(ret, tmpio); + } + + if (who) + free_uid(user); + break; + default: + ret = -EINVAL; + } + + rcu_read_unlock(); + return ret; +} diff --git a/block/keyslot-manager.c b/block/keyslot-manager.c new file mode 100644 index 000000000..17a1f1ba4 --- /dev/null +++ b/block/keyslot-manager.c @@ -0,0 +1,402 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2019 Google LLC + */ + +/** + * DOC: The Keyslot Manager + * + * Many devices with inline encryption support have a limited number of "slots" + * into which encryption contexts may be programmed, and requests can be tagged + * with a slot number to specify the key to use for en/decryption. + * + * As the number of slots is limited, and programming keys is expensive on + * many inline encryption hardware, we don't want to program the same key into + * multiple slots - if multiple requests are using the same key, we want to + * program just one slot with that key and use that slot for all requests. + * + * The keyslot manager manages these keyslots appropriately, and also acts as + * an abstraction between the inline encryption hardware and the upper layers. + * + * Lower layer devices will set up a keyslot manager in their request queue + * and tell it how to perform device specific operations like programming/ + * evicting keys from keyslots. + * + * Upper layers will call blk_ksm_get_slot_for_key() to program a + * key into some slot in the inline encryption hardware. + */ + +#define pr_fmt(fmt) "blk-crypto: " fmt + +#include +#include +#include +#include +#include +#include + +struct blk_ksm_keyslot { + atomic_t slot_refs; + struct list_head idle_slot_node; + struct hlist_node hash_node; + const struct blk_crypto_key *key; + struct blk_keyslot_manager *ksm; +}; + +static inline void blk_ksm_hw_enter(struct blk_keyslot_manager *ksm) +{ + /* + * Calling into the driver requires ksm->lock held and the device + * resumed. But we must resume the device first, since that can acquire + * and release ksm->lock via blk_ksm_reprogram_all_keys(). + */ + if (ksm->dev) + pm_runtime_get_sync(ksm->dev); + down_write(&ksm->lock); +} + +static inline void blk_ksm_hw_exit(struct blk_keyslot_manager *ksm) +{ + up_write(&ksm->lock); + if (ksm->dev) + pm_runtime_put_sync(ksm->dev); +} + +/** + * blk_ksm_init() - Initialize a keyslot manager + * @ksm: The keyslot_manager to initialize. + * @num_slots: The number of key slots to manage. + * + * Allocate memory for keyslots and initialize a keyslot manager. Called by + * e.g. storage drivers to set up a keyslot manager in their request_queue. + * + * Return: 0 on success, or else a negative error code. + */ +int blk_ksm_init(struct blk_keyslot_manager *ksm, unsigned int num_slots) +{ + unsigned int slot; + unsigned int i; + unsigned int slot_hashtable_size; + + memset(ksm, 0, sizeof(*ksm)); + + if (num_slots == 0) + return -EINVAL; + + ksm->slots = kvcalloc(num_slots, sizeof(ksm->slots[0]), GFP_KERNEL); + if (!ksm->slots) + return -ENOMEM; + + ksm->num_slots = num_slots; + + init_rwsem(&ksm->lock); + + init_waitqueue_head(&ksm->idle_slots_wait_queue); + INIT_LIST_HEAD(&ksm->idle_slots); + + for (slot = 0; slot < num_slots; slot++) { + ksm->slots[slot].ksm = ksm; + list_add_tail(&ksm->slots[slot].idle_slot_node, + &ksm->idle_slots); + } + + spin_lock_init(&ksm->idle_slots_lock); + + slot_hashtable_size = roundup_pow_of_two(num_slots); + /* + * hash_ptr() assumes bits != 0, so ensure the hash table has at least 2 + * buckets. This only makes a difference when there is only 1 keyslot. + */ + if (slot_hashtable_size < 2) + slot_hashtable_size = 2; + + ksm->log_slot_ht_size = ilog2(slot_hashtable_size); + ksm->slot_hashtable = kvmalloc_array(slot_hashtable_size, + sizeof(ksm->slot_hashtable[0]), + GFP_KERNEL); + if (!ksm->slot_hashtable) + goto err_destroy_ksm; + for (i = 0; i < slot_hashtable_size; i++) + INIT_HLIST_HEAD(&ksm->slot_hashtable[i]); + + return 0; + +err_destroy_ksm: + blk_ksm_destroy(ksm); + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(blk_ksm_init); + +static inline struct hlist_head * +blk_ksm_hash_bucket_for_key(struct blk_keyslot_manager *ksm, + const struct blk_crypto_key *key) +{ + return &ksm->slot_hashtable[hash_ptr(key, ksm->log_slot_ht_size)]; +} + +static void blk_ksm_remove_slot_from_lru_list(struct blk_ksm_keyslot *slot) +{ + struct blk_keyslot_manager *ksm = slot->ksm; + unsigned long flags; + + spin_lock_irqsave(&ksm->idle_slots_lock, flags); + list_del(&slot->idle_slot_node); + spin_unlock_irqrestore(&ksm->idle_slots_lock, flags); +} + +static struct blk_ksm_keyslot *blk_ksm_find_keyslot( + struct blk_keyslot_manager *ksm, + const struct blk_crypto_key *key) +{ + const struct hlist_head *head = blk_ksm_hash_bucket_for_key(ksm, key); + struct blk_ksm_keyslot *slotp; + + hlist_for_each_entry(slotp, head, hash_node) { + if (slotp->key == key) + return slotp; + } + return NULL; +} + +static struct blk_ksm_keyslot *blk_ksm_find_and_grab_keyslot( + struct blk_keyslot_manager *ksm, + const struct blk_crypto_key *key) +{ + struct blk_ksm_keyslot *slot; + + slot = blk_ksm_find_keyslot(ksm, key); + if (!slot) + return NULL; + if (atomic_inc_return(&slot->slot_refs) == 1) { + /* Took first reference to this slot; remove it from LRU list */ + blk_ksm_remove_slot_from_lru_list(slot); + } + return slot; +} + +unsigned int blk_ksm_get_slot_idx(struct blk_ksm_keyslot *slot) +{ + return slot - slot->ksm->slots; +} +EXPORT_SYMBOL_GPL(blk_ksm_get_slot_idx); + +/** + * blk_ksm_get_slot_for_key() - Program a key into a keyslot. + * @ksm: The keyslot manager to program the key into. + * @key: Pointer to the key object to program, including the raw key, crypto + * mode, and data unit size. + * @slot_ptr: A pointer to return the pointer of the allocated keyslot. + * + * Get a keyslot that's been programmed with the specified key. If one already + * exists, return it with incremented refcount. Otherwise, wait for a keyslot + * to become idle and program it. + * + * Context: Process context. Takes and releases ksm->lock. + * Return: BLK_STS_OK on success (and keyslot is set to the pointer of the + * allocated keyslot), or some other blk_status_t otherwise (and + * keyslot is set to NULL). + */ +blk_status_t blk_ksm_get_slot_for_key(struct blk_keyslot_manager *ksm, + const struct blk_crypto_key *key, + struct blk_ksm_keyslot **slot_ptr) +{ + struct blk_ksm_keyslot *slot; + int slot_idx; + int err; + + *slot_ptr = NULL; + down_read(&ksm->lock); + slot = blk_ksm_find_and_grab_keyslot(ksm, key); + up_read(&ksm->lock); + if (slot) + goto success; + + for (;;) { + blk_ksm_hw_enter(ksm); + slot = blk_ksm_find_and_grab_keyslot(ksm, key); + if (slot) { + blk_ksm_hw_exit(ksm); + goto success; + } + + /* + * If we're here, that means there wasn't a slot that was + * already programmed with the key. So try to program it. + */ + if (!list_empty(&ksm->idle_slots)) + break; + + blk_ksm_hw_exit(ksm); + wait_event(ksm->idle_slots_wait_queue, + !list_empty(&ksm->idle_slots)); + } + + slot = list_first_entry(&ksm->idle_slots, struct blk_ksm_keyslot, + idle_slot_node); + slot_idx = blk_ksm_get_slot_idx(slot); + + err = ksm->ksm_ll_ops.keyslot_program(ksm, key, slot_idx); + if (err) { + wake_up(&ksm->idle_slots_wait_queue); + blk_ksm_hw_exit(ksm); + return errno_to_blk_status(err); + } + + /* Move this slot to the hash list for the new key. */ + if (slot->key) + hlist_del(&slot->hash_node); + slot->key = key; + hlist_add_head(&slot->hash_node, blk_ksm_hash_bucket_for_key(ksm, key)); + + atomic_set(&slot->slot_refs, 1); + + blk_ksm_remove_slot_from_lru_list(slot); + + blk_ksm_hw_exit(ksm); +success: + *slot_ptr = slot; + return BLK_STS_OK; +} + +/** + * blk_ksm_put_slot() - Release a reference to a slot + * @slot: The keyslot to release the reference of. + * + * Context: Any context. + */ +void blk_ksm_put_slot(struct blk_ksm_keyslot *slot) +{ + struct blk_keyslot_manager *ksm; + unsigned long flags; + + if (!slot) + return; + + ksm = slot->ksm; + + if (atomic_dec_and_lock_irqsave(&slot->slot_refs, + &ksm->idle_slots_lock, flags)) { + list_add_tail(&slot->idle_slot_node, &ksm->idle_slots); + spin_unlock_irqrestore(&ksm->idle_slots_lock, flags); + wake_up(&ksm->idle_slots_wait_queue); + } +} + +/** + * blk_ksm_crypto_cfg_supported() - Find out if a crypto configuration is + * supported by a ksm. + * @ksm: The keyslot manager to check + * @cfg: The crypto configuration to check for. + * + * Checks for crypto_mode/data unit size/dun bytes support. + * + * Return: Whether or not this ksm supports the specified crypto config. + */ +bool blk_ksm_crypto_cfg_supported(struct blk_keyslot_manager *ksm, + const struct blk_crypto_config *cfg) +{ + if (!ksm) + return false; + if (!(ksm->crypto_modes_supported[cfg->crypto_mode] & + cfg->data_unit_size)) + return false; + if (ksm->max_dun_bytes_supported < cfg->dun_bytes) + return false; + return true; +} + +/* + * This is an internal function that evicts a key from an inline encryption + * device that can be either a real device or the blk-crypto-fallback "device". + * It is used only by blk_crypto_evict_key(); see that function for details. + */ +int blk_ksm_evict_key(struct blk_keyslot_manager *ksm, + const struct blk_crypto_key *key) +{ + struct blk_ksm_keyslot *slot; + int err; + + blk_ksm_hw_enter(ksm); + slot = blk_ksm_find_keyslot(ksm, key); + if (!slot) { + /* + * Not an error, since a key not in use by I/O is not guaranteed + * to be in a keyslot. There can be more keys than keyslots. + */ + err = 0; + goto out; + } + + if (WARN_ON_ONCE(atomic_read(&slot->slot_refs) != 0)) { + /* BUG: key is still in use by I/O */ + err = -EBUSY; + goto out_remove; + } + err = ksm->ksm_ll_ops.keyslot_evict(ksm, key, + blk_ksm_get_slot_idx(slot)); +out_remove: + /* + * Callers free the key even on error, so unlink the key from the hash + * table and clear slot->key even on error. + */ + hlist_del(&slot->hash_node); + slot->key = NULL; +out: + blk_ksm_hw_exit(ksm); + return err; +} + +/** + * blk_ksm_reprogram_all_keys() - Re-program all keyslots. + * @ksm: The keyslot manager + * + * Re-program all keyslots that are supposed to have a key programmed. This is + * intended only for use by drivers for hardware that loses its keys on reset. + * + * Context: Process context. Takes and releases ksm->lock. + */ +void blk_ksm_reprogram_all_keys(struct blk_keyslot_manager *ksm) +{ + unsigned int slot; + + /* This is for device initialization, so don't resume the device */ + down_write(&ksm->lock); + for (slot = 0; slot < ksm->num_slots; slot++) { + const struct blk_crypto_key *key = ksm->slots[slot].key; + int err; + + if (!key) + continue; + + err = ksm->ksm_ll_ops.keyslot_program(ksm, key, slot); + WARN_ON(err); + } + up_write(&ksm->lock); +} +EXPORT_SYMBOL_GPL(blk_ksm_reprogram_all_keys); + +void blk_ksm_destroy(struct blk_keyslot_manager *ksm) +{ + if (!ksm) + return; + kvfree(ksm->slot_hashtable); + kvfree_sensitive(ksm->slots, sizeof(ksm->slots[0]) * ksm->num_slots); + memzero_explicit(ksm, sizeof(*ksm)); +} +EXPORT_SYMBOL_GPL(blk_ksm_destroy); + +bool blk_ksm_register(struct blk_keyslot_manager *ksm, struct request_queue *q) +{ + if (blk_integrity_queue_supports_integrity(q)) { + pr_warn("Integrity and hardware inline encryption are not supported together. Disabling hardware inline encryption.\n"); + return false; + } + q->ksm = ksm; + return true; +} +EXPORT_SYMBOL_GPL(blk_ksm_register); + +void blk_ksm_unregister(struct request_queue *q) +{ + q->ksm = NULL; +} diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c new file mode 100644 index 000000000..7f9ef773b --- /dev/null +++ b/block/kyber-iosched.c @@ -0,0 +1,1051 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * The Kyber I/O scheduler. Controls latency by throttling queue depths using + * scalable techniques. + * + * Copyright (C) 2017 Facebook + */ + +#include +#include +#include +#include +#include +#include + +#include "blk.h" +#include "blk-mq.h" +#include "blk-mq-debugfs.h" +#include "blk-mq-sched.h" +#include "blk-mq-tag.h" + +#define CREATE_TRACE_POINTS +#include + +/* + * Scheduling domains: the device is divided into multiple domains based on the + * request type. + */ +enum { + KYBER_READ, + KYBER_WRITE, + KYBER_DISCARD, + KYBER_OTHER, + KYBER_NUM_DOMAINS, +}; + +static const char *kyber_domain_names[] = { + [KYBER_READ] = "READ", + [KYBER_WRITE] = "WRITE", + [KYBER_DISCARD] = "DISCARD", + [KYBER_OTHER] = "OTHER", +}; + +enum { + /* + * In order to prevent starvation of synchronous requests by a flood of + * asynchronous requests, we reserve 25% of requests for synchronous + * operations. + */ + KYBER_ASYNC_PERCENT = 75, +}; + +/* + * Maximum device-wide depth for each scheduling domain. + * + * Even for fast devices with lots of tags like NVMe, you can saturate the + * device with only a fraction of the maximum possible queue depth. So, we cap + * these to a reasonable value. + */ +static const unsigned int kyber_depth[] = { + [KYBER_READ] = 256, + [KYBER_WRITE] = 128, + [KYBER_DISCARD] = 64, + [KYBER_OTHER] = 16, +}; + +/* + * Default latency targets for each scheduling domain. + */ +static const u64 kyber_latency_targets[] = { + [KYBER_READ] = 2ULL * NSEC_PER_MSEC, + [KYBER_WRITE] = 10ULL * NSEC_PER_MSEC, + [KYBER_DISCARD] = 5ULL * NSEC_PER_SEC, +}; + +/* + * Batch size (number of requests we'll dispatch in a row) for each scheduling + * domain. + */ +static const unsigned int kyber_batch_size[] = { + [KYBER_READ] = 16, + [KYBER_WRITE] = 8, + [KYBER_DISCARD] = 1, + [KYBER_OTHER] = 1, +}; + +/* + * Requests latencies are recorded in a histogram with buckets defined relative + * to the target latency: + * + * <= 1/4 * target latency + * <= 1/2 * target latency + * <= 3/4 * target latency + * <= target latency + * <= 1 1/4 * target latency + * <= 1 1/2 * target latency + * <= 1 3/4 * target latency + * > 1 3/4 * target latency + */ +enum { + /* + * The width of the latency histogram buckets is + * 1 / (1 << KYBER_LATENCY_SHIFT) * target latency. + */ + KYBER_LATENCY_SHIFT = 2, + /* + * The first (1 << KYBER_LATENCY_SHIFT) buckets are <= target latency, + * thus, "good". + */ + KYBER_GOOD_BUCKETS = 1 << KYBER_LATENCY_SHIFT, + /* There are also (1 << KYBER_LATENCY_SHIFT) "bad" buckets. */ + KYBER_LATENCY_BUCKETS = 2 << KYBER_LATENCY_SHIFT, +}; + +/* + * We measure both the total latency and the I/O latency (i.e., latency after + * submitting to the device). + */ +enum { + KYBER_TOTAL_LATENCY, + KYBER_IO_LATENCY, +}; + +static const char *kyber_latency_type_names[] = { + [KYBER_TOTAL_LATENCY] = "total", + [KYBER_IO_LATENCY] = "I/O", +}; + +/* + * Per-cpu latency histograms: total latency and I/O latency for each scheduling + * domain except for KYBER_OTHER. + */ +struct kyber_cpu_latency { + atomic_t buckets[KYBER_OTHER][2][KYBER_LATENCY_BUCKETS]; +}; + +/* + * There is a same mapping between ctx & hctx and kcq & khd, + * we use request->mq_ctx->index_hw to index the kcq in khd. + */ +struct kyber_ctx_queue { + /* + * Used to ensure operations on rq_list and kcq_map to be an atmoic one. + * Also protect the rqs on rq_list when merge. + */ + spinlock_t lock; + struct list_head rq_list[KYBER_NUM_DOMAINS]; +} ____cacheline_aligned_in_smp; + +struct kyber_queue_data { + struct request_queue *q; + + /* + * Each scheduling domain has a limited number of in-flight requests + * device-wide, limited by these tokens. + */ + struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS]; + + /* + * Async request percentage, converted to per-word depth for + * sbitmap_get_shallow(). + */ + unsigned int async_depth; + + struct kyber_cpu_latency __percpu *cpu_latency; + + /* Timer for stats aggregation and adjusting domain tokens. */ + struct timer_list timer; + + unsigned int latency_buckets[KYBER_OTHER][2][KYBER_LATENCY_BUCKETS]; + + unsigned long latency_timeout[KYBER_OTHER]; + + int domain_p99[KYBER_OTHER]; + + /* Target latencies in nanoseconds. */ + u64 latency_targets[KYBER_OTHER]; +}; + +struct kyber_hctx_data { + spinlock_t lock; + struct list_head rqs[KYBER_NUM_DOMAINS]; + unsigned int cur_domain; + unsigned int batching; + struct kyber_ctx_queue *kcqs; + struct sbitmap kcq_map[KYBER_NUM_DOMAINS]; + struct sbq_wait domain_wait[KYBER_NUM_DOMAINS]; + struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS]; + atomic_t wait_index[KYBER_NUM_DOMAINS]; +}; + +static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags, + void *key); + +static unsigned int kyber_sched_domain(unsigned int op) +{ + switch (op & REQ_OP_MASK) { + case REQ_OP_READ: + return KYBER_READ; + case REQ_OP_WRITE: + return KYBER_WRITE; + case REQ_OP_DISCARD: + return KYBER_DISCARD; + default: + return KYBER_OTHER; + } +} + +static void flush_latency_buckets(struct kyber_queue_data *kqd, + struct kyber_cpu_latency *cpu_latency, + unsigned int sched_domain, unsigned int type) +{ + unsigned int *buckets = kqd->latency_buckets[sched_domain][type]; + atomic_t *cpu_buckets = cpu_latency->buckets[sched_domain][type]; + unsigned int bucket; + + for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS; bucket++) + buckets[bucket] += atomic_xchg(&cpu_buckets[bucket], 0); +} + +/* + * Calculate the histogram bucket with the given percentile rank, or -1 if there + * aren't enough samples yet. + */ +static int calculate_percentile(struct kyber_queue_data *kqd, + unsigned int sched_domain, unsigned int type, + unsigned int percentile) +{ + unsigned int *buckets = kqd->latency_buckets[sched_domain][type]; + unsigned int bucket, samples = 0, percentile_samples; + + for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS; bucket++) + samples += buckets[bucket]; + + if (!samples) + return -1; + + /* + * We do the calculation once we have 500 samples or one second passes + * since the first sample was recorded, whichever comes first. + */ + if (!kqd->latency_timeout[sched_domain]) + kqd->latency_timeout[sched_domain] = max(jiffies + HZ, 1UL); + if (samples < 500 && + time_is_after_jiffies(kqd->latency_timeout[sched_domain])) { + return -1; + } + kqd->latency_timeout[sched_domain] = 0; + + percentile_samples = DIV_ROUND_UP(samples * percentile, 100); + for (bucket = 0; bucket < KYBER_LATENCY_BUCKETS - 1; bucket++) { + if (buckets[bucket] >= percentile_samples) + break; + percentile_samples -= buckets[bucket]; + } + memset(buckets, 0, sizeof(kqd->latency_buckets[sched_domain][type])); + + trace_kyber_latency(kqd->q, kyber_domain_names[sched_domain], + kyber_latency_type_names[type], percentile, + bucket + 1, 1 << KYBER_LATENCY_SHIFT, samples); + + return bucket; +} + +static void kyber_resize_domain(struct kyber_queue_data *kqd, + unsigned int sched_domain, unsigned int depth) +{ + depth = clamp(depth, 1U, kyber_depth[sched_domain]); + if (depth != kqd->domain_tokens[sched_domain].sb.depth) { + sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth); + trace_kyber_adjust(kqd->q, kyber_domain_names[sched_domain], + depth); + } +} + +static void kyber_timer_fn(struct timer_list *t) +{ + struct kyber_queue_data *kqd = from_timer(kqd, t, timer); + unsigned int sched_domain; + int cpu; + bool bad = false; + + /* Sum all of the per-cpu latency histograms. */ + for_each_online_cpu(cpu) { + struct kyber_cpu_latency *cpu_latency; + + cpu_latency = per_cpu_ptr(kqd->cpu_latency, cpu); + for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) { + flush_latency_buckets(kqd, cpu_latency, sched_domain, + KYBER_TOTAL_LATENCY); + flush_latency_buckets(kqd, cpu_latency, sched_domain, + KYBER_IO_LATENCY); + } + } + + /* + * Check if any domains have a high I/O latency, which might indicate + * congestion in the device. Note that we use the p90; we don't want to + * be too sensitive to outliers here. + */ + for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) { + int p90; + + p90 = calculate_percentile(kqd, sched_domain, KYBER_IO_LATENCY, + 90); + if (p90 >= KYBER_GOOD_BUCKETS) + bad = true; + } + + /* + * Adjust the scheduling domain depths. If we determined that there was + * congestion, we throttle all domains with good latencies. Either way, + * we ease up on throttling domains with bad latencies. + */ + for (sched_domain = 0; sched_domain < KYBER_OTHER; sched_domain++) { + unsigned int orig_depth, depth; + int p99; + + p99 = calculate_percentile(kqd, sched_domain, + KYBER_TOTAL_LATENCY, 99); + /* + * This is kind of subtle: different domains will not + * necessarily have enough samples to calculate the latency + * percentiles during the same window, so we have to remember + * the p99 for the next time we observe congestion; once we do, + * we don't want to throttle again until we get more data, so we + * reset it to -1. + */ + if (bad) { + if (p99 < 0) + p99 = kqd->domain_p99[sched_domain]; + kqd->domain_p99[sched_domain] = -1; + } else if (p99 >= 0) { + kqd->domain_p99[sched_domain] = p99; + } + if (p99 < 0) + continue; + + /* + * If this domain has bad latency, throttle less. Otherwise, + * throttle more iff we determined that there is congestion. + * + * The new depth is scaled linearly with the p99 latency vs the + * latency target. E.g., if the p99 is 3/4 of the target, then + * we throttle down to 3/4 of the current depth, and if the p99 + * is 2x the target, then we double the depth. + */ + if (bad || p99 >= KYBER_GOOD_BUCKETS) { + orig_depth = kqd->domain_tokens[sched_domain].sb.depth; + depth = (orig_depth * (p99 + 1)) >> KYBER_LATENCY_SHIFT; + kyber_resize_domain(kqd, sched_domain, depth); + } + } +} + +static unsigned int kyber_sched_tags_shift(struct request_queue *q) +{ + /* + * All of the hardware queues have the same depth, so we can just grab + * the shift of the first one. + */ + return q->queue_hw_ctx[0]->sched_tags->bitmap_tags->sb.shift; +} + +static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q) +{ + struct kyber_queue_data *kqd; + unsigned int shift; + int ret = -ENOMEM; + int i; + + kqd = kzalloc_node(sizeof(*kqd), GFP_KERNEL, q->node); + if (!kqd) + goto err; + + kqd->q = q; + + kqd->cpu_latency = alloc_percpu_gfp(struct kyber_cpu_latency, + GFP_KERNEL | __GFP_ZERO); + if (!kqd->cpu_latency) + goto err_kqd; + + timer_setup(&kqd->timer, kyber_timer_fn, 0); + + for (i = 0; i < KYBER_NUM_DOMAINS; i++) { + WARN_ON(!kyber_depth[i]); + WARN_ON(!kyber_batch_size[i]); + ret = sbitmap_queue_init_node(&kqd->domain_tokens[i], + kyber_depth[i], -1, false, + GFP_KERNEL, q->node); + if (ret) { + while (--i >= 0) + sbitmap_queue_free(&kqd->domain_tokens[i]); + goto err_buckets; + } + } + + for (i = 0; i < KYBER_OTHER; i++) { + kqd->domain_p99[i] = -1; + kqd->latency_targets[i] = kyber_latency_targets[i]; + } + + shift = kyber_sched_tags_shift(q); + kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U; + + return kqd; + +err_buckets: + free_percpu(kqd->cpu_latency); +err_kqd: + kfree(kqd); +err: + return ERR_PTR(ret); +} + +static int kyber_init_sched(struct request_queue *q, struct elevator_type *e) +{ + struct kyber_queue_data *kqd; + struct elevator_queue *eq; + + eq = elevator_alloc(q, e); + if (!eq) + return -ENOMEM; + + kqd = kyber_queue_data_alloc(q); + if (IS_ERR(kqd)) { + kobject_put(&eq->kobj); + return PTR_ERR(kqd); + } + + blk_stat_enable_accounting(q); + + eq->elevator_data = kqd; + q->elevator = eq; + + return 0; +} + +static void kyber_exit_sched(struct elevator_queue *e) +{ + struct kyber_queue_data *kqd = e->elevator_data; + int i; + + del_timer_sync(&kqd->timer); + + for (i = 0; i < KYBER_NUM_DOMAINS; i++) + sbitmap_queue_free(&kqd->domain_tokens[i]); + free_percpu(kqd->cpu_latency); + kfree(kqd); +} + +static void kyber_ctx_queue_init(struct kyber_ctx_queue *kcq) +{ + unsigned int i; + + spin_lock_init(&kcq->lock); + for (i = 0; i < KYBER_NUM_DOMAINS; i++) + INIT_LIST_HEAD(&kcq->rq_list[i]); +} + +static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) +{ + struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data; + struct kyber_hctx_data *khd; + int i; + + khd = kmalloc_node(sizeof(*khd), GFP_KERNEL, hctx->numa_node); + if (!khd) + return -ENOMEM; + + khd->kcqs = kmalloc_array_node(hctx->nr_ctx, + sizeof(struct kyber_ctx_queue), + GFP_KERNEL, hctx->numa_node); + if (!khd->kcqs) + goto err_khd; + + for (i = 0; i < hctx->nr_ctx; i++) + kyber_ctx_queue_init(&khd->kcqs[i]); + + for (i = 0; i < KYBER_NUM_DOMAINS; i++) { + if (sbitmap_init_node(&khd->kcq_map[i], hctx->nr_ctx, + ilog2(8), GFP_KERNEL, hctx->numa_node)) { + while (--i >= 0) + sbitmap_free(&khd->kcq_map[i]); + goto err_kcqs; + } + } + + spin_lock_init(&khd->lock); + + for (i = 0; i < KYBER_NUM_DOMAINS; i++) { + INIT_LIST_HEAD(&khd->rqs[i]); + khd->domain_wait[i].sbq = NULL; + init_waitqueue_func_entry(&khd->domain_wait[i].wait, + kyber_domain_wake); + khd->domain_wait[i].wait.private = hctx; + INIT_LIST_HEAD(&khd->domain_wait[i].wait.entry); + atomic_set(&khd->wait_index[i], 0); + } + + khd->cur_domain = 0; + khd->batching = 0; + + hctx->sched_data = khd; + sbitmap_queue_min_shallow_depth(hctx->sched_tags->bitmap_tags, + kqd->async_depth); + + return 0; + +err_kcqs: + kfree(khd->kcqs); +err_khd: + kfree(khd); + return -ENOMEM; +} + +static void kyber_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) +{ + struct kyber_hctx_data *khd = hctx->sched_data; + int i; + + for (i = 0; i < KYBER_NUM_DOMAINS; i++) + sbitmap_free(&khd->kcq_map[i]); + kfree(khd->kcqs); + kfree(hctx->sched_data); +} + +static int rq_get_domain_token(struct request *rq) +{ + return (long)rq->elv.priv[0]; +} + +static void rq_set_domain_token(struct request *rq, int token) +{ + rq->elv.priv[0] = (void *)(long)token; +} + +static void rq_clear_domain_token(struct kyber_queue_data *kqd, + struct request *rq) +{ + unsigned int sched_domain; + int nr; + + nr = rq_get_domain_token(rq); + if (nr != -1) { + sched_domain = kyber_sched_domain(rq->cmd_flags); + sbitmap_queue_clear(&kqd->domain_tokens[sched_domain], nr, + rq->mq_ctx->cpu); + } +} + +static void kyber_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) +{ + /* + * We use the scheduler tags as per-hardware queue queueing tokens. + * Async requests can be limited at this stage. + */ + if (!op_is_sync(op)) { + struct kyber_queue_data *kqd = data->q->elevator->elevator_data; + + data->shallow_depth = kqd->async_depth; + } +} + +static bool kyber_bio_merge(struct request_queue *q, struct bio *bio, + unsigned int nr_segs) +{ + struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx); + struct kyber_hctx_data *khd = hctx->sched_data; + struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw[hctx->type]]; + unsigned int sched_domain = kyber_sched_domain(bio->bi_opf); + struct list_head *rq_list = &kcq->rq_list[sched_domain]; + bool merged; + + spin_lock(&kcq->lock); + merged = blk_bio_list_merge(hctx->queue, rq_list, bio, nr_segs); + spin_unlock(&kcq->lock); + + return merged; +} + +static void kyber_prepare_request(struct request *rq) +{ + rq_set_domain_token(rq, -1); +} + +static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx, + struct list_head *rq_list, bool at_head) +{ + struct kyber_hctx_data *khd = hctx->sched_data; + struct request *rq, *next; + + list_for_each_entry_safe(rq, next, rq_list, queuelist) { + unsigned int sched_domain = kyber_sched_domain(rq->cmd_flags); + struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw[hctx->type]]; + struct list_head *head = &kcq->rq_list[sched_domain]; + + spin_lock(&kcq->lock); + if (at_head) + list_move(&rq->queuelist, head); + else + list_move_tail(&rq->queuelist, head); + sbitmap_set_bit(&khd->kcq_map[sched_domain], + rq->mq_ctx->index_hw[hctx->type]); + blk_mq_sched_request_inserted(rq); + spin_unlock(&kcq->lock); + } +} + +static void kyber_finish_request(struct request *rq) +{ + struct kyber_queue_data *kqd = rq->q->elevator->elevator_data; + + rq_clear_domain_token(kqd, rq); +} + +static void add_latency_sample(struct kyber_cpu_latency *cpu_latency, + unsigned int sched_domain, unsigned int type, + u64 target, u64 latency) +{ + unsigned int bucket; + u64 divisor; + + if (latency > 0) { + divisor = max_t(u64, target >> KYBER_LATENCY_SHIFT, 1); + bucket = min_t(unsigned int, div64_u64(latency - 1, divisor), + KYBER_LATENCY_BUCKETS - 1); + } else { + bucket = 0; + } + + atomic_inc(&cpu_latency->buckets[sched_domain][type][bucket]); +} + +static void kyber_completed_request(struct request *rq, u64 now) +{ + struct kyber_queue_data *kqd = rq->q->elevator->elevator_data; + struct kyber_cpu_latency *cpu_latency; + unsigned int sched_domain; + u64 target; + + sched_domain = kyber_sched_domain(rq->cmd_flags); + if (sched_domain == KYBER_OTHER) + return; + + cpu_latency = get_cpu_ptr(kqd->cpu_latency); + target = kqd->latency_targets[sched_domain]; + add_latency_sample(cpu_latency, sched_domain, KYBER_TOTAL_LATENCY, + target, now - rq->start_time_ns); + add_latency_sample(cpu_latency, sched_domain, KYBER_IO_LATENCY, target, + now - rq->io_start_time_ns); + put_cpu_ptr(kqd->cpu_latency); + + timer_reduce(&kqd->timer, jiffies + HZ / 10); +} + +struct flush_kcq_data { + struct kyber_hctx_data *khd; + unsigned int sched_domain; + struct list_head *list; +}; + +static bool flush_busy_kcq(struct sbitmap *sb, unsigned int bitnr, void *data) +{ + struct flush_kcq_data *flush_data = data; + struct kyber_ctx_queue *kcq = &flush_data->khd->kcqs[bitnr]; + + spin_lock(&kcq->lock); + list_splice_tail_init(&kcq->rq_list[flush_data->sched_domain], + flush_data->list); + sbitmap_clear_bit(sb, bitnr); + spin_unlock(&kcq->lock); + + return true; +} + +static void kyber_flush_busy_kcqs(struct kyber_hctx_data *khd, + unsigned int sched_domain, + struct list_head *list) +{ + struct flush_kcq_data data = { + .khd = khd, + .sched_domain = sched_domain, + .list = list, + }; + + sbitmap_for_each_set(&khd->kcq_map[sched_domain], + flush_busy_kcq, &data); +} + +static int kyber_domain_wake(wait_queue_entry_t *wqe, unsigned mode, int flags, + void *key) +{ + struct blk_mq_hw_ctx *hctx = READ_ONCE(wqe->private); + struct sbq_wait *wait = container_of(wqe, struct sbq_wait, wait); + + sbitmap_del_wait_queue(wait); + blk_mq_run_hw_queue(hctx, true); + return 1; +} + +static int kyber_get_domain_token(struct kyber_queue_data *kqd, + struct kyber_hctx_data *khd, + struct blk_mq_hw_ctx *hctx) +{ + unsigned int sched_domain = khd->cur_domain; + struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain]; + struct sbq_wait *wait = &khd->domain_wait[sched_domain]; + struct sbq_wait_state *ws; + int nr; + + nr = __sbitmap_queue_get(domain_tokens); + + /* + * If we failed to get a domain token, make sure the hardware queue is + * run when one becomes available. Note that this is serialized on + * khd->lock, but we still need to be careful about the waker. + */ + if (nr < 0 && list_empty_careful(&wait->wait.entry)) { + ws = sbq_wait_ptr(domain_tokens, + &khd->wait_index[sched_domain]); + khd->domain_ws[sched_domain] = ws; + sbitmap_add_wait_queue(domain_tokens, ws, wait); + + /* + * Try again in case a token was freed before we got on the wait + * queue. + */ + nr = __sbitmap_queue_get(domain_tokens); + } + + /* + * If we got a token while we were on the wait queue, remove ourselves + * from the wait queue to ensure that all wake ups make forward + * progress. It's possible that the waker already deleted the entry + * between the !list_empty_careful() check and us grabbing the lock, but + * list_del_init() is okay with that. + */ + if (nr >= 0 && !list_empty_careful(&wait->wait.entry)) { + ws = khd->domain_ws[sched_domain]; + spin_lock_irq(&ws->wait.lock); + sbitmap_del_wait_queue(wait); + spin_unlock_irq(&ws->wait.lock); + } + + return nr; +} + +static struct request * +kyber_dispatch_cur_domain(struct kyber_queue_data *kqd, + struct kyber_hctx_data *khd, + struct blk_mq_hw_ctx *hctx) +{ + struct list_head *rqs; + struct request *rq; + int nr; + + rqs = &khd->rqs[khd->cur_domain]; + + /* + * If we already have a flushed request, then we just need to get a + * token for it. Otherwise, if there are pending requests in the kcqs, + * flush the kcqs, but only if we can get a token. If not, we should + * leave the requests in the kcqs so that they can be merged. Note that + * khd->lock serializes the flushes, so if we observed any bit set in + * the kcq_map, we will always get a request. + */ + rq = list_first_entry_or_null(rqs, struct request, queuelist); + if (rq) { + nr = kyber_get_domain_token(kqd, khd, hctx); + if (nr >= 0) { + khd->batching++; + rq_set_domain_token(rq, nr); + list_del_init(&rq->queuelist); + return rq; + } else { + trace_kyber_throttled(kqd->q, + kyber_domain_names[khd->cur_domain]); + } + } else if (sbitmap_any_bit_set(&khd->kcq_map[khd->cur_domain])) { + nr = kyber_get_domain_token(kqd, khd, hctx); + if (nr >= 0) { + kyber_flush_busy_kcqs(khd, khd->cur_domain, rqs); + rq = list_first_entry(rqs, struct request, queuelist); + khd->batching++; + rq_set_domain_token(rq, nr); + list_del_init(&rq->queuelist); + return rq; + } else { + trace_kyber_throttled(kqd->q, + kyber_domain_names[khd->cur_domain]); + } + } + + /* There were either no pending requests or no tokens. */ + return NULL; +} + +static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx) +{ + struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data; + struct kyber_hctx_data *khd = hctx->sched_data; + struct request *rq; + int i; + + spin_lock(&khd->lock); + + /* + * First, if we are still entitled to batch, try to dispatch a request + * from the batch. + */ + if (khd->batching < kyber_batch_size[khd->cur_domain]) { + rq = kyber_dispatch_cur_domain(kqd, khd, hctx); + if (rq) + goto out; + } + + /* + * Either, + * 1. We were no longer entitled to a batch. + * 2. The domain we were batching didn't have any requests. + * 3. The domain we were batching was out of tokens. + * + * Start another batch. Note that this wraps back around to the original + * domain if no other domains have requests or tokens. + */ + khd->batching = 0; + for (i = 0; i < KYBER_NUM_DOMAINS; i++) { + if (khd->cur_domain == KYBER_NUM_DOMAINS - 1) + khd->cur_domain = 0; + else + khd->cur_domain++; + + rq = kyber_dispatch_cur_domain(kqd, khd, hctx); + if (rq) + goto out; + } + + rq = NULL; +out: + spin_unlock(&khd->lock); + return rq; +} + +static bool kyber_has_work(struct blk_mq_hw_ctx *hctx) +{ + struct kyber_hctx_data *khd = hctx->sched_data; + int i; + + for (i = 0; i < KYBER_NUM_DOMAINS; i++) { + if (!list_empty_careful(&khd->rqs[i]) || + sbitmap_any_bit_set(&khd->kcq_map[i])) + return true; + } + + return false; +} + +#define KYBER_LAT_SHOW_STORE(domain, name) \ +static ssize_t kyber_##name##_lat_show(struct elevator_queue *e, \ + char *page) \ +{ \ + struct kyber_queue_data *kqd = e->elevator_data; \ + \ + return sprintf(page, "%llu\n", kqd->latency_targets[domain]); \ +} \ + \ +static ssize_t kyber_##name##_lat_store(struct elevator_queue *e, \ + const char *page, size_t count) \ +{ \ + struct kyber_queue_data *kqd = e->elevator_data; \ + unsigned long long nsec; \ + int ret; \ + \ + ret = kstrtoull(page, 10, &nsec); \ + if (ret) \ + return ret; \ + \ + kqd->latency_targets[domain] = nsec; \ + \ + return count; \ +} +KYBER_LAT_SHOW_STORE(KYBER_READ, read); +KYBER_LAT_SHOW_STORE(KYBER_WRITE, write); +#undef KYBER_LAT_SHOW_STORE + +#define KYBER_LAT_ATTR(op) __ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store) +static struct elv_fs_entry kyber_sched_attrs[] = { + KYBER_LAT_ATTR(read), + KYBER_LAT_ATTR(write), + __ATTR_NULL +}; +#undef KYBER_LAT_ATTR + +#ifdef CONFIG_BLK_DEBUG_FS +#define KYBER_DEBUGFS_DOMAIN_ATTRS(domain, name) \ +static int kyber_##name##_tokens_show(void *data, struct seq_file *m) \ +{ \ + struct request_queue *q = data; \ + struct kyber_queue_data *kqd = q->elevator->elevator_data; \ + \ + sbitmap_queue_show(&kqd->domain_tokens[domain], m); \ + return 0; \ +} \ + \ +static void *kyber_##name##_rqs_start(struct seq_file *m, loff_t *pos) \ + __acquires(&khd->lock) \ +{ \ + struct blk_mq_hw_ctx *hctx = m->private; \ + struct kyber_hctx_data *khd = hctx->sched_data; \ + \ + spin_lock(&khd->lock); \ + return seq_list_start(&khd->rqs[domain], *pos); \ +} \ + \ +static void *kyber_##name##_rqs_next(struct seq_file *m, void *v, \ + loff_t *pos) \ +{ \ + struct blk_mq_hw_ctx *hctx = m->private; \ + struct kyber_hctx_data *khd = hctx->sched_data; \ + \ + return seq_list_next(v, &khd->rqs[domain], pos); \ +} \ + \ +static void kyber_##name##_rqs_stop(struct seq_file *m, void *v) \ + __releases(&khd->lock) \ +{ \ + struct blk_mq_hw_ctx *hctx = m->private; \ + struct kyber_hctx_data *khd = hctx->sched_data; \ + \ + spin_unlock(&khd->lock); \ +} \ + \ +static const struct seq_operations kyber_##name##_rqs_seq_ops = { \ + .start = kyber_##name##_rqs_start, \ + .next = kyber_##name##_rqs_next, \ + .stop = kyber_##name##_rqs_stop, \ + .show = blk_mq_debugfs_rq_show, \ +}; \ + \ +static int kyber_##name##_waiting_show(void *data, struct seq_file *m) \ +{ \ + struct blk_mq_hw_ctx *hctx = data; \ + struct kyber_hctx_data *khd = hctx->sched_data; \ + wait_queue_entry_t *wait = &khd->domain_wait[domain].wait; \ + \ + seq_printf(m, "%d\n", !list_empty_careful(&wait->entry)); \ + return 0; \ +} +KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_READ, read) +KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_WRITE, write) +KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_DISCARD, discard) +KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_OTHER, other) +#undef KYBER_DEBUGFS_DOMAIN_ATTRS + +static int kyber_async_depth_show(void *data, struct seq_file *m) +{ + struct request_queue *q = data; + struct kyber_queue_data *kqd = q->elevator->elevator_data; + + seq_printf(m, "%u\n", kqd->async_depth); + return 0; +} + +static int kyber_cur_domain_show(void *data, struct seq_file *m) +{ + struct blk_mq_hw_ctx *hctx = data; + struct kyber_hctx_data *khd = hctx->sched_data; + + seq_printf(m, "%s\n", kyber_domain_names[khd->cur_domain]); + return 0; +} + +static int kyber_batching_show(void *data, struct seq_file *m) +{ + struct blk_mq_hw_ctx *hctx = data; + struct kyber_hctx_data *khd = hctx->sched_data; + + seq_printf(m, "%u\n", khd->batching); + return 0; +} + +#define KYBER_QUEUE_DOMAIN_ATTRS(name) \ + {#name "_tokens", 0400, kyber_##name##_tokens_show} +static const struct blk_mq_debugfs_attr kyber_queue_debugfs_attrs[] = { + KYBER_QUEUE_DOMAIN_ATTRS(read), + KYBER_QUEUE_DOMAIN_ATTRS(write), + KYBER_QUEUE_DOMAIN_ATTRS(discard), + KYBER_QUEUE_DOMAIN_ATTRS(other), + {"async_depth", 0400, kyber_async_depth_show}, + {}, +}; +#undef KYBER_QUEUE_DOMAIN_ATTRS + +#define KYBER_HCTX_DOMAIN_ATTRS(name) \ + {#name "_rqs", 0400, .seq_ops = &kyber_##name##_rqs_seq_ops}, \ + {#name "_waiting", 0400, kyber_##name##_waiting_show} +static const struct blk_mq_debugfs_attr kyber_hctx_debugfs_attrs[] = { + KYBER_HCTX_DOMAIN_ATTRS(read), + KYBER_HCTX_DOMAIN_ATTRS(write), + KYBER_HCTX_DOMAIN_ATTRS(discard), + KYBER_HCTX_DOMAIN_ATTRS(other), + {"cur_domain", 0400, kyber_cur_domain_show}, + {"batching", 0400, kyber_batching_show}, + {}, +}; +#undef KYBER_HCTX_DOMAIN_ATTRS +#endif + +static struct elevator_type kyber_sched = { + .ops = { + .init_sched = kyber_init_sched, + .exit_sched = kyber_exit_sched, + .init_hctx = kyber_init_hctx, + .exit_hctx = kyber_exit_hctx, + .limit_depth = kyber_limit_depth, + .bio_merge = kyber_bio_merge, + .prepare_request = kyber_prepare_request, + .insert_requests = kyber_insert_requests, + .finish_request = kyber_finish_request, + .requeue_request = kyber_finish_request, + .completed_request = kyber_completed_request, + .dispatch_request = kyber_dispatch_request, + .has_work = kyber_has_work, + }, +#ifdef CONFIG_BLK_DEBUG_FS + .queue_debugfs_attrs = kyber_queue_debugfs_attrs, + .hctx_debugfs_attrs = kyber_hctx_debugfs_attrs, +#endif + .elevator_attrs = kyber_sched_attrs, + .elevator_name = "kyber", + .elevator_owner = THIS_MODULE, +}; + +static int __init kyber_init(void) +{ + return elv_register(&kyber_sched); +} + +static void __exit kyber_exit(void) +{ + elv_unregister(&kyber_sched); +} + +module_init(kyber_init); +module_exit(kyber_exit); + +MODULE_AUTHOR("Omar Sandoval"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("Kyber I/O scheduler"); diff --git a/block/mq-deadline.c b/block/mq-deadline.c new file mode 100644 index 000000000..e4e90761e --- /dev/null +++ b/block/mq-deadline.c @@ -0,0 +1,824 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * MQ Deadline i/o scheduler - adaptation of the legacy deadline scheduler, + * for the blk-mq scheduling framework + * + * Copyright (C) 2016 Jens Axboe + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "blk.h" +#include "blk-mq.h" +#include "blk-mq-debugfs.h" +#include "blk-mq-tag.h" +#include "blk-mq-sched.h" + +/* + * See Documentation/block/deadline-iosched.rst + */ +static const int read_expire = HZ / 2; /* max time before a read is submitted. */ +static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */ +static const int writes_starved = 2; /* max times reads can starve a write */ +static const int fifo_batch = 16; /* # of sequential requests treated as one + by the above parameters. For throughput. */ + +struct deadline_data { + /* + * run time data + */ + + /* + * requests (deadline_rq s) are present on both sort_list and fifo_list + */ + struct rb_root sort_list[2]; + struct list_head fifo_list[2]; + + /* + * next in sort order. read, write or both are NULL + */ + struct request *next_rq[2]; + unsigned int batching; /* number of sequential requests made */ + unsigned int starved; /* times reads have starved writes */ + + /* + * settings that change how the i/o scheduler behaves + */ + int fifo_expire[2]; + int fifo_batch; + int writes_starved; + int front_merges; + + spinlock_t lock; + spinlock_t zone_lock; + struct list_head dispatch; +}; + +static inline struct rb_root * +deadline_rb_root(struct deadline_data *dd, struct request *rq) +{ + return &dd->sort_list[rq_data_dir(rq)]; +} + +/* + * get the request after `rq' in sector-sorted order + */ +static inline struct request * +deadline_latter_request(struct request *rq) +{ + struct rb_node *node = rb_next(&rq->rb_node); + + if (node) + return rb_entry_rq(node); + + return NULL; +} + +static void +deadline_add_rq_rb(struct deadline_data *dd, struct request *rq) +{ + struct rb_root *root = deadline_rb_root(dd, rq); + + elv_rb_add(root, rq); +} + +static inline void +deadline_del_rq_rb(struct deadline_data *dd, struct request *rq) +{ + const int data_dir = rq_data_dir(rq); + + if (dd->next_rq[data_dir] == rq) + dd->next_rq[data_dir] = deadline_latter_request(rq); + + elv_rb_del(deadline_rb_root(dd, rq), rq); +} + +/* + * remove rq from rbtree and fifo. + */ +static void deadline_remove_request(struct request_queue *q, struct request *rq) +{ + struct deadline_data *dd = q->elevator->elevator_data; + + list_del_init(&rq->queuelist); + + /* + * We might not be on the rbtree, if we are doing an insert merge + */ + if (!RB_EMPTY_NODE(&rq->rb_node)) + deadline_del_rq_rb(dd, rq); + + elv_rqhash_del(q, rq); + if (q->last_merge == rq) + q->last_merge = NULL; +} + +static void dd_request_merged(struct request_queue *q, struct request *req, + enum elv_merge type) +{ + struct deadline_data *dd = q->elevator->elevator_data; + + /* + * if the merge was a front merge, we need to reposition request + */ + if (type == ELEVATOR_FRONT_MERGE) { + elv_rb_del(deadline_rb_root(dd, req), req); + deadline_add_rq_rb(dd, req); + } +} + +static void dd_merged_requests(struct request_queue *q, struct request *req, + struct request *next) +{ + /* + * if next expires before rq, assign its expire time to rq + * and move into next position (next will be deleted) in fifo + */ + if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) { + if (time_before((unsigned long)next->fifo_time, + (unsigned long)req->fifo_time)) { + list_move(&req->queuelist, &next->queuelist); + req->fifo_time = next->fifo_time; + } + } + + /* + * kill knowledge of next, this one is a goner + */ + deadline_remove_request(q, next); +} + +/* + * move an entry to dispatch queue + */ +static void +deadline_move_request(struct deadline_data *dd, struct request *rq) +{ + const int data_dir = rq_data_dir(rq); + + dd->next_rq[READ] = NULL; + dd->next_rq[WRITE] = NULL; + dd->next_rq[data_dir] = deadline_latter_request(rq); + + /* + * take it off the sort and fifo list + */ + deadline_remove_request(rq->q, rq); +} + +/* + * deadline_check_fifo returns 0 if there are no expired requests on the fifo, + * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir]) + */ +static inline int deadline_check_fifo(struct deadline_data *dd, int ddir) +{ + struct request *rq = rq_entry_fifo(dd->fifo_list[ddir].next); + + /* + * rq is expired! + */ + if (time_after_eq(jiffies, (unsigned long)rq->fifo_time)) + return 1; + + return 0; +} + +/* + * For the specified data direction, return the next request to + * dispatch using arrival ordered lists. + */ +static struct request * +deadline_fifo_request(struct deadline_data *dd, int data_dir) +{ + struct request *rq; + unsigned long flags; + + if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE)) + return NULL; + + if (list_empty(&dd->fifo_list[data_dir])) + return NULL; + + rq = rq_entry_fifo(dd->fifo_list[data_dir].next); + if (data_dir == READ || !blk_queue_is_zoned(rq->q)) + return rq; + + /* + * Look for a write request that can be dispatched, that is one with + * an unlocked target zone. + */ + spin_lock_irqsave(&dd->zone_lock, flags); + list_for_each_entry(rq, &dd->fifo_list[WRITE], queuelist) { + if (blk_req_can_dispatch_to_zone(rq)) + goto out; + } + rq = NULL; +out: + spin_unlock_irqrestore(&dd->zone_lock, flags); + + return rq; +} + +/* + * For the specified data direction, return the next request to + * dispatch using sector position sorted lists. + */ +static struct request * +deadline_next_request(struct deadline_data *dd, int data_dir) +{ + struct request *rq; + unsigned long flags; + + if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE)) + return NULL; + + rq = dd->next_rq[data_dir]; + if (!rq) + return NULL; + + if (data_dir == READ || !blk_queue_is_zoned(rq->q)) + return rq; + + /* + * Look for a write request that can be dispatched, that is one with + * an unlocked target zone. + */ + spin_lock_irqsave(&dd->zone_lock, flags); + while (rq) { + if (blk_req_can_dispatch_to_zone(rq)) + break; + rq = deadline_latter_request(rq); + } + spin_unlock_irqrestore(&dd->zone_lock, flags); + + return rq; +} + +/* + * deadline_dispatch_requests selects the best request according to + * read/write expire, fifo_batch, etc + */ +static struct request *__dd_dispatch_request(struct deadline_data *dd) +{ + struct request *rq, *next_rq; + bool reads, writes; + int data_dir; + + if (!list_empty(&dd->dispatch)) { + rq = list_first_entry(&dd->dispatch, struct request, queuelist); + list_del_init(&rq->queuelist); + goto done; + } + + reads = !list_empty(&dd->fifo_list[READ]); + writes = !list_empty(&dd->fifo_list[WRITE]); + + /* + * batches are currently reads XOR writes + */ + rq = deadline_next_request(dd, WRITE); + if (!rq) + rq = deadline_next_request(dd, READ); + + if (rq && dd->batching < dd->fifo_batch) + /* we have a next request are still entitled to batch */ + goto dispatch_request; + + /* + * at this point we are not running a batch. select the appropriate + * data direction (read / write) + */ + + if (reads) { + BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ])); + + if (deadline_fifo_request(dd, WRITE) && + (dd->starved++ >= dd->writes_starved)) + goto dispatch_writes; + + data_dir = READ; + + goto dispatch_find_request; + } + + /* + * there are either no reads or writes have been starved + */ + + if (writes) { +dispatch_writes: + BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[WRITE])); + + dd->starved = 0; + + data_dir = WRITE; + + goto dispatch_find_request; + } + + return NULL; + +dispatch_find_request: + /* + * we are not running a batch, find best request for selected data_dir + */ + next_rq = deadline_next_request(dd, data_dir); + if (deadline_check_fifo(dd, data_dir) || !next_rq) { + /* + * A deadline has expired, the last request was in the other + * direction, or we have run out of higher-sectored requests. + * Start again from the request with the earliest expiry time. + */ + rq = deadline_fifo_request(dd, data_dir); + } else { + /* + * The last req was the same dir and we have a next request in + * sort order. No expired requests so continue on from here. + */ + rq = next_rq; + } + + /* + * For a zoned block device, if we only have writes queued and none of + * them can be dispatched, rq will be NULL. + */ + if (!rq) + return NULL; + + dd->batching = 0; + +dispatch_request: + /* + * rq is the selected appropriate request. + */ + dd->batching++; + deadline_move_request(dd, rq); +done: + /* + * If the request needs its target zone locked, do it. + */ + blk_req_zone_write_lock(rq); + rq->rq_flags |= RQF_STARTED; + return rq; +} + +/* + * One confusing aspect here is that we get called for a specific + * hardware queue, but we may return a request that is for a + * different hardware queue. This is because mq-deadline has shared + * state for all hardware queues, in terms of sorting, FIFOs, etc. + */ +static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) +{ + struct deadline_data *dd = hctx->queue->elevator->elevator_data; + struct request *rq; + + spin_lock(&dd->lock); + rq = __dd_dispatch_request(dd); + spin_unlock(&dd->lock); + if (rq) + atomic_dec(&rq->mq_hctx->elevator_queued); + + return rq; +} + +static void dd_exit_queue(struct elevator_queue *e) +{ + struct deadline_data *dd = e->elevator_data; + + BUG_ON(!list_empty(&dd->fifo_list[READ])); + BUG_ON(!list_empty(&dd->fifo_list[WRITE])); + + kfree(dd); +} + +/* + * initialize elevator private data (deadline_data). + */ +static int dd_init_queue(struct request_queue *q, struct elevator_type *e) +{ + struct deadline_data *dd; + struct elevator_queue *eq; + + eq = elevator_alloc(q, e); + if (!eq) + return -ENOMEM; + + dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node); + if (!dd) { + kobject_put(&eq->kobj); + return -ENOMEM; + } + eq->elevator_data = dd; + + INIT_LIST_HEAD(&dd->fifo_list[READ]); + INIT_LIST_HEAD(&dd->fifo_list[WRITE]); + dd->sort_list[READ] = RB_ROOT; + dd->sort_list[WRITE] = RB_ROOT; + dd->fifo_expire[READ] = read_expire; + dd->fifo_expire[WRITE] = write_expire; + dd->writes_starved = writes_starved; + dd->front_merges = 1; + dd->fifo_batch = fifo_batch; + spin_lock_init(&dd->lock); + spin_lock_init(&dd->zone_lock); + INIT_LIST_HEAD(&dd->dispatch); + + q->elevator = eq; + return 0; +} + +static int dd_request_merge(struct request_queue *q, struct request **rq, + struct bio *bio) +{ + struct deadline_data *dd = q->elevator->elevator_data; + sector_t sector = bio_end_sector(bio); + struct request *__rq; + + if (!dd->front_merges) + return ELEVATOR_NO_MERGE; + + __rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector); + if (__rq) { + BUG_ON(sector != blk_rq_pos(__rq)); + + if (elv_bio_merge_ok(__rq, bio)) { + *rq = __rq; + if (blk_discard_mergable(__rq)) + return ELEVATOR_DISCARD_MERGE; + return ELEVATOR_FRONT_MERGE; + } + } + + return ELEVATOR_NO_MERGE; +} + +static bool dd_bio_merge(struct request_queue *q, struct bio *bio, + unsigned int nr_segs) +{ + struct deadline_data *dd = q->elevator->elevator_data; + struct request *free = NULL; + bool ret; + + spin_lock(&dd->lock); + ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free); + spin_unlock(&dd->lock); + + if (free) + blk_mq_free_request(free); + + return ret; +} + +/* + * add rq to rbtree and fifo + */ +static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, + bool at_head) +{ + struct request_queue *q = hctx->queue; + struct deadline_data *dd = q->elevator->elevator_data; + const int data_dir = rq_data_dir(rq); + + /* + * This may be a requeue of a write request that has locked its + * target zone. If it is the case, this releases the zone lock. + */ + blk_req_zone_write_unlock(rq); + + if (blk_mq_sched_try_insert_merge(q, rq)) + return; + + blk_mq_sched_request_inserted(rq); + + if (at_head || blk_rq_is_passthrough(rq)) { + if (at_head) + list_add(&rq->queuelist, &dd->dispatch); + else + list_add_tail(&rq->queuelist, &dd->dispatch); + } else { + deadline_add_rq_rb(dd, rq); + + if (rq_mergeable(rq)) { + elv_rqhash_add(q, rq); + if (!q->last_merge) + q->last_merge = rq; + } + + /* + * set expire time and add to fifo list + */ + rq->fifo_time = jiffies + dd->fifo_expire[data_dir]; + list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]); + } +} + +static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, + struct list_head *list, bool at_head) +{ + struct request_queue *q = hctx->queue; + struct deadline_data *dd = q->elevator->elevator_data; + + spin_lock(&dd->lock); + while (!list_empty(list)) { + struct request *rq; + + rq = list_first_entry(list, struct request, queuelist); + list_del_init(&rq->queuelist); + dd_insert_request(hctx, rq, at_head); + atomic_inc(&hctx->elevator_queued); + } + spin_unlock(&dd->lock); +} + +/* + * Nothing to do here. This is defined only to ensure that .finish_request + * method is called upon request completion. + */ +static void dd_prepare_request(struct request *rq) +{ +} + +/* + * For zoned block devices, write unlock the target zone of + * completed write requests. Do this while holding the zone lock + * spinlock so that the zone is never unlocked while deadline_fifo_request() + * or deadline_next_request() are executing. This function is called for + * all requests, whether or not these requests complete successfully. + * + * For a zoned block device, __dd_dispatch_request() may have stopped + * dispatching requests if all the queued requests are write requests directed + * at zones that are already locked due to on-going write requests. To ensure + * write request dispatch progress in this case, mark the queue as needing a + * restart to ensure that the queue is run again after completion of the + * request and zones being unlocked. + */ +static void dd_finish_request(struct request *rq) +{ + struct request_queue *q = rq->q; + + if (blk_queue_is_zoned(q)) { + struct deadline_data *dd = q->elevator->elevator_data; + unsigned long flags; + + spin_lock_irqsave(&dd->zone_lock, flags); + blk_req_zone_write_unlock(rq); + if (!list_empty(&dd->fifo_list[WRITE])) + blk_mq_sched_mark_restart_hctx(rq->mq_hctx); + spin_unlock_irqrestore(&dd->zone_lock, flags); + } +} + +static bool dd_has_work(struct blk_mq_hw_ctx *hctx) +{ + struct deadline_data *dd = hctx->queue->elevator->elevator_data; + + if (!atomic_read(&hctx->elevator_queued)) + return false; + + return !list_empty_careful(&dd->dispatch) || + !list_empty_careful(&dd->fifo_list[0]) || + !list_empty_careful(&dd->fifo_list[1]); +} + +/* + * sysfs parts below + */ +static ssize_t +deadline_var_show(int var, char *page) +{ + return sprintf(page, "%d\n", var); +} + +static void +deadline_var_store(int *var, const char *page) +{ + char *p = (char *) page; + + *var = simple_strtol(p, &p, 10); +} + +#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ +static ssize_t __FUNC(struct elevator_queue *e, char *page) \ +{ \ + struct deadline_data *dd = e->elevator_data; \ + int __data = __VAR; \ + if (__CONV) \ + __data = jiffies_to_msecs(__data); \ + return deadline_var_show(__data, (page)); \ +} +SHOW_FUNCTION(deadline_read_expire_show, dd->fifo_expire[READ], 1); +SHOW_FUNCTION(deadline_write_expire_show, dd->fifo_expire[WRITE], 1); +SHOW_FUNCTION(deadline_writes_starved_show, dd->writes_starved, 0); +SHOW_FUNCTION(deadline_front_merges_show, dd->front_merges, 0); +SHOW_FUNCTION(deadline_fifo_batch_show, dd->fifo_batch, 0); +#undef SHOW_FUNCTION + +#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ +static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ +{ \ + struct deadline_data *dd = e->elevator_data; \ + int __data; \ + deadline_var_store(&__data, (page)); \ + if (__data < (MIN)) \ + __data = (MIN); \ + else if (__data > (MAX)) \ + __data = (MAX); \ + if (__CONV) \ + *(__PTR) = msecs_to_jiffies(__data); \ + else \ + *(__PTR) = __data; \ + return count; \ +} +STORE_FUNCTION(deadline_read_expire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1); +STORE_FUNCTION(deadline_write_expire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1); +STORE_FUNCTION(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0); +STORE_FUNCTION(deadline_front_merges_store, &dd->front_merges, 0, 1, 0); +STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0); +#undef STORE_FUNCTION + +#define DD_ATTR(name) \ + __ATTR(name, 0644, deadline_##name##_show, deadline_##name##_store) + +static struct elv_fs_entry deadline_attrs[] = { + DD_ATTR(read_expire), + DD_ATTR(write_expire), + DD_ATTR(writes_starved), + DD_ATTR(front_merges), + DD_ATTR(fifo_batch), + __ATTR_NULL +}; + +#ifdef CONFIG_BLK_DEBUG_FS +#define DEADLINE_DEBUGFS_DDIR_ATTRS(ddir, name) \ +static void *deadline_##name##_fifo_start(struct seq_file *m, \ + loff_t *pos) \ + __acquires(&dd->lock) \ +{ \ + struct request_queue *q = m->private; \ + struct deadline_data *dd = q->elevator->elevator_data; \ + \ + spin_lock(&dd->lock); \ + return seq_list_start(&dd->fifo_list[ddir], *pos); \ +} \ + \ +static void *deadline_##name##_fifo_next(struct seq_file *m, void *v, \ + loff_t *pos) \ +{ \ + struct request_queue *q = m->private; \ + struct deadline_data *dd = q->elevator->elevator_data; \ + \ + return seq_list_next(v, &dd->fifo_list[ddir], pos); \ +} \ + \ +static void deadline_##name##_fifo_stop(struct seq_file *m, void *v) \ + __releases(&dd->lock) \ +{ \ + struct request_queue *q = m->private; \ + struct deadline_data *dd = q->elevator->elevator_data; \ + \ + spin_unlock(&dd->lock); \ +} \ + \ +static const struct seq_operations deadline_##name##_fifo_seq_ops = { \ + .start = deadline_##name##_fifo_start, \ + .next = deadline_##name##_fifo_next, \ + .stop = deadline_##name##_fifo_stop, \ + .show = blk_mq_debugfs_rq_show, \ +}; \ + \ +static int deadline_##name##_next_rq_show(void *data, \ + struct seq_file *m) \ +{ \ + struct request_queue *q = data; \ + struct deadline_data *dd = q->elevator->elevator_data; \ + struct request *rq = dd->next_rq[ddir]; \ + \ + if (rq) \ + __blk_mq_debugfs_rq_show(m, rq); \ + return 0; \ +} +DEADLINE_DEBUGFS_DDIR_ATTRS(READ, read) +DEADLINE_DEBUGFS_DDIR_ATTRS(WRITE, write) +#undef DEADLINE_DEBUGFS_DDIR_ATTRS + +static int deadline_batching_show(void *data, struct seq_file *m) +{ + struct request_queue *q = data; + struct deadline_data *dd = q->elevator->elevator_data; + + seq_printf(m, "%u\n", dd->batching); + return 0; +} + +static int deadline_starved_show(void *data, struct seq_file *m) +{ + struct request_queue *q = data; + struct deadline_data *dd = q->elevator->elevator_data; + + seq_printf(m, "%u\n", dd->starved); + return 0; +} + +static void *deadline_dispatch_start(struct seq_file *m, loff_t *pos) + __acquires(&dd->lock) +{ + struct request_queue *q = m->private; + struct deadline_data *dd = q->elevator->elevator_data; + + spin_lock(&dd->lock); + return seq_list_start(&dd->dispatch, *pos); +} + +static void *deadline_dispatch_next(struct seq_file *m, void *v, loff_t *pos) +{ + struct request_queue *q = m->private; + struct deadline_data *dd = q->elevator->elevator_data; + + return seq_list_next(v, &dd->dispatch, pos); +} + +static void deadline_dispatch_stop(struct seq_file *m, void *v) + __releases(&dd->lock) +{ + struct request_queue *q = m->private; + struct deadline_data *dd = q->elevator->elevator_data; + + spin_unlock(&dd->lock); +} + +static const struct seq_operations deadline_dispatch_seq_ops = { + .start = deadline_dispatch_start, + .next = deadline_dispatch_next, + .stop = deadline_dispatch_stop, + .show = blk_mq_debugfs_rq_show, +}; + +#define DEADLINE_QUEUE_DDIR_ATTRS(name) \ + {#name "_fifo_list", 0400, .seq_ops = &deadline_##name##_fifo_seq_ops}, \ + {#name "_next_rq", 0400, deadline_##name##_next_rq_show} +static const struct blk_mq_debugfs_attr deadline_queue_debugfs_attrs[] = { + DEADLINE_QUEUE_DDIR_ATTRS(read), + DEADLINE_QUEUE_DDIR_ATTRS(write), + {"batching", 0400, deadline_batching_show}, + {"starved", 0400, deadline_starved_show}, + {"dispatch", 0400, .seq_ops = &deadline_dispatch_seq_ops}, + {}, +}; +#undef DEADLINE_QUEUE_DDIR_ATTRS +#endif + +static struct elevator_type mq_deadline = { + .ops = { + .insert_requests = dd_insert_requests, + .dispatch_request = dd_dispatch_request, + .prepare_request = dd_prepare_request, + .finish_request = dd_finish_request, + .next_request = elv_rb_latter_request, + .former_request = elv_rb_former_request, + .bio_merge = dd_bio_merge, + .request_merge = dd_request_merge, + .requests_merged = dd_merged_requests, + .request_merged = dd_request_merged, + .has_work = dd_has_work, + .init_sched = dd_init_queue, + .exit_sched = dd_exit_queue, + }, + +#ifdef CONFIG_BLK_DEBUG_FS + .queue_debugfs_attrs = deadline_queue_debugfs_attrs, +#endif + .elevator_attrs = deadline_attrs, + .elevator_name = "mq-deadline", + .elevator_alias = "deadline", + .elevator_features = ELEVATOR_F_ZBD_SEQ_WRITE, + .elevator_owner = THIS_MODULE, +}; +MODULE_ALIAS("mq-deadline-iosched"); + +static int __init deadline_init(void) +{ + return elv_register(&mq_deadline); +} + +static void __exit deadline_exit(void) +{ + elv_unregister(&mq_deadline); +} + +module_init(deadline_init); +module_exit(deadline_exit); + +MODULE_AUTHOR("Jens Axboe"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("MQ deadline IO scheduler"); diff --git a/block/opal_proto.h b/block/opal_proto.h new file mode 100644 index 000000000..b486b3ec7 --- /dev/null +++ b/block/opal_proto.h @@ -0,0 +1,465 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright © 2016 Intel Corporation + * + * Authors: + * Rafael Antognolli + * Scott Bauer + */ +#include + +#ifndef _OPAL_PROTO_H +#define _OPAL_PROTO_H + +/* + * These constant values come from: + * SPC-4 section + * 6.30 SECURITY PROTOCOL IN command / table 265. + */ +enum { + TCG_SECP_00 = 0, + TCG_SECP_01, +}; + +/* + * Token defs derived from: + * TCG_Storage_Architecture_Core_Spec_v2.01_r1.00 + * 3.2.2 Data Stream Encoding + */ +enum opal_response_token { + OPAL_DTA_TOKENID_BYTESTRING = 0xe0, + OPAL_DTA_TOKENID_SINT = 0xe1, + OPAL_DTA_TOKENID_UINT = 0xe2, + OPAL_DTA_TOKENID_TOKEN = 0xe3, /* actual token is returned */ + OPAL_DTA_TOKENID_INVALID = 0X0 +}; + +#define DTAERROR_NO_METHOD_STATUS 0x89 +#define GENERIC_HOST_SESSION_NUM 0x41 +#define FIRST_TPER_SESSION_NUM 4096 + +#define TPER_SYNC_SUPPORTED 0x01 +#define MBR_ENABLED_MASK 0x10 + +#define TINY_ATOM_DATA_MASK 0x3F +#define TINY_ATOM_SIGNED 0x40 + +#define SHORT_ATOM_ID 0x80 +#define SHORT_ATOM_BYTESTRING 0x20 +#define SHORT_ATOM_SIGNED 0x10 +#define SHORT_ATOM_LEN_MASK 0xF + +#define MEDIUM_ATOM_ID 0xC0 +#define MEDIUM_ATOM_BYTESTRING 0x10 +#define MEDIUM_ATOM_SIGNED 0x8 +#define MEDIUM_ATOM_LEN_MASK 0x7 + +#define LONG_ATOM_ID 0xe0 +#define LONG_ATOM_BYTESTRING 0x2 +#define LONG_ATOM_SIGNED 0x1 + +/* Derived from TCG Core spec 2.01 Section: + * 3.2.2.1 + * Data Type + */ +#define TINY_ATOM_BYTE 0x7F +#define SHORT_ATOM_BYTE 0xBF +#define MEDIUM_ATOM_BYTE 0xDF +#define LONG_ATOM_BYTE 0xE3 + +#define OPAL_INVAL_PARAM 12 +#define OPAL_MANUFACTURED_INACTIVE 0x08 +#define OPAL_DISCOVERY_COMID 0x0001 + +#define LOCKING_RANGE_NON_GLOBAL 0x03 +/* + * User IDs used in the TCG storage SSCs + * Derived from: TCG_Storage_Architecture_Core_Spec_v2.01_r1.00 + * Section: 6.3 Assigned UIDs + */ +#define OPAL_METHOD_LENGTH 8 +#define OPAL_MSID_KEYLEN 15 +#define OPAL_UID_LENGTH_HALF 4 + +/* Enum to index OPALUID array */ +enum opal_uid { + /* users */ + OPAL_SMUID_UID, + OPAL_THISSP_UID, + OPAL_ADMINSP_UID, + OPAL_LOCKINGSP_UID, + OPAL_ENTERPRISE_LOCKINGSP_UID, + OPAL_ANYBODY_UID, + OPAL_SID_UID, + OPAL_ADMIN1_UID, + OPAL_USER1_UID, + OPAL_USER2_UID, + OPAL_PSID_UID, + OPAL_ENTERPRISE_BANDMASTER0_UID, + OPAL_ENTERPRISE_ERASEMASTER_UID, + /* tables */ + OPAL_TABLE_TABLE, + OPAL_LOCKINGRANGE_GLOBAL, + OPAL_LOCKINGRANGE_ACE_RDLOCKED, + OPAL_LOCKINGRANGE_ACE_WRLOCKED, + OPAL_MBRCONTROL, + OPAL_MBR, + OPAL_AUTHORITY_TABLE, + OPAL_C_PIN_TABLE, + OPAL_LOCKING_INFO_TABLE, + OPAL_ENTERPRISE_LOCKING_INFO_TABLE, + OPAL_DATASTORE, + /* C_PIN_TABLE object ID's */ + OPAL_C_PIN_MSID, + OPAL_C_PIN_SID, + OPAL_C_PIN_ADMIN1, + /* half UID's (only first 4 bytes used) */ + OPAL_HALF_UID_AUTHORITY_OBJ_REF, + OPAL_HALF_UID_BOOLEAN_ACE, + /* omitted optional parameter */ + OPAL_UID_HEXFF, +}; + +/* Enum for indexing the OPALMETHOD array */ +enum opal_method { + OPAL_PROPERTIES, + OPAL_STARTSESSION, + OPAL_REVERT, + OPAL_ACTIVATE, + OPAL_EGET, + OPAL_ESET, + OPAL_NEXT, + OPAL_EAUTHENTICATE, + OPAL_GETACL, + OPAL_GENKEY, + OPAL_REVERTSP, + OPAL_GET, + OPAL_SET, + OPAL_AUTHENTICATE, + OPAL_RANDOM, + OPAL_ERASE, +}; + +enum opal_token { + /* Boolean */ + OPAL_TRUE = 0x01, + OPAL_FALSE = 0x00, + OPAL_BOOLEAN_EXPR = 0x03, + /* cellblocks */ + OPAL_TABLE = 0x00, + OPAL_STARTROW = 0x01, + OPAL_ENDROW = 0x02, + OPAL_STARTCOLUMN = 0x03, + OPAL_ENDCOLUMN = 0x04, + OPAL_VALUES = 0x01, + /* table table */ + OPAL_TABLE_UID = 0x00, + OPAL_TABLE_NAME = 0x01, + OPAL_TABLE_COMMON = 0x02, + OPAL_TABLE_TEMPLATE = 0x03, + OPAL_TABLE_KIND = 0x04, + OPAL_TABLE_COLUMN = 0x05, + OPAL_TABLE_COLUMNS = 0x06, + OPAL_TABLE_ROWS = 0x07, + OPAL_TABLE_ROWS_FREE = 0x08, + OPAL_TABLE_ROW_BYTES = 0x09, + OPAL_TABLE_LASTID = 0x0A, + OPAL_TABLE_MIN = 0x0B, + OPAL_TABLE_MAX = 0x0C, + /* authority table */ + OPAL_PIN = 0x03, + /* locking tokens */ + OPAL_RANGESTART = 0x03, + OPAL_RANGELENGTH = 0x04, + OPAL_READLOCKENABLED = 0x05, + OPAL_WRITELOCKENABLED = 0x06, + OPAL_READLOCKED = 0x07, + OPAL_WRITELOCKED = 0x08, + OPAL_ACTIVEKEY = 0x0A, + /* lockingsp table */ + OPAL_LIFECYCLE = 0x06, + /* locking info table */ + OPAL_MAXRANGES = 0x04, + /* mbr control */ + OPAL_MBRENABLE = 0x01, + OPAL_MBRDONE = 0x02, + /* properties */ + OPAL_HOSTPROPERTIES = 0x00, + /* atoms */ + OPAL_STARTLIST = 0xf0, + OPAL_ENDLIST = 0xf1, + OPAL_STARTNAME = 0xf2, + OPAL_ENDNAME = 0xf3, + OPAL_CALL = 0xf8, + OPAL_ENDOFDATA = 0xf9, + OPAL_ENDOFSESSION = 0xfa, + OPAL_STARTTRANSACTON = 0xfb, + OPAL_ENDTRANSACTON = 0xfC, + OPAL_EMPTYATOM = 0xff, + OPAL_WHERE = 0x00, +}; + +/* Locking state for a locking range */ +enum opal_lockingstate { + OPAL_LOCKING_READWRITE = 0x01, + OPAL_LOCKING_READONLY = 0x02, + OPAL_LOCKING_LOCKED = 0x03, +}; + +enum opal_parameter { + OPAL_SUM_SET_LIST = 0x060000, +}; + +/* Packets derived from: + * TCG_Storage_Architecture_Core_Spec_v2.01_r1.00 + * Secion: 3.2.3 ComPackets, Packets & Subpackets + */ + +/* Comm Packet (header) for transmissions. */ +struct opal_compacket { + __be32 reserved0; + u8 extendedComID[4]; + __be32 outstandingData; + __be32 minTransfer; + __be32 length; +}; + +/* Packet structure. */ +struct opal_packet { + __be32 tsn; + __be32 hsn; + __be32 seq_number; + __be16 reserved0; + __be16 ack_type; + __be32 acknowledgment; + __be32 length; +}; + +/* Data sub packet header */ +struct opal_data_subpacket { + u8 reserved0[6]; + __be16 kind; + __be32 length; +}; + +/* header of a response */ +struct opal_header { + struct opal_compacket cp; + struct opal_packet pkt; + struct opal_data_subpacket subpkt; +}; + +#define FC_TPER 0x0001 +#define FC_LOCKING 0x0002 +#define FC_GEOMETRY 0x0003 +#define FC_ENTERPRISE 0x0100 +#define FC_DATASTORE 0x0202 +#define FC_SINGLEUSER 0x0201 +#define FC_OPALV100 0x0200 +#define FC_OPALV200 0x0203 + +/* + * The Discovery 0 Header. As defined in + * Opal SSC Documentation + * Section: 3.3.5 Capability Discovery + */ +struct d0_header { + __be32 length; /* the length of the header 48 in 2.00.100 */ + __be32 revision; /**< revision of the header 1 in 2.00.100 */ + __be32 reserved01; + __be32 reserved02; + /* + * the remainder of the structure is vendor specific and will not be + * addressed now + */ + u8 ignored[32]; +}; + +/* + * TPer Feature Descriptor. Contains flags indicating support for the + * TPer features described in the OPAL specification. The names match the + * OPAL terminology + * + * code == 0x001 in 2.00.100 + */ +struct d0_tper_features { + /* + * supported_features bits: + * bit 7: reserved + * bit 6: com ID management + * bit 5: reserved + * bit 4: streaming support + * bit 3: buffer management + * bit 2: ACK/NACK + * bit 1: async + * bit 0: sync + */ + u8 supported_features; + /* + * bytes 5 through 15 are reserved, but we represent the first 3 as + * u8 to keep the other two 32bits integers aligned. + */ + u8 reserved01[3]; + __be32 reserved02; + __be32 reserved03; +}; + +/* + * Locking Feature Descriptor. Contains flags indicating support for the + * locking features described in the OPAL specification. The names match the + * OPAL terminology + * + * code == 0x0002 in 2.00.100 + */ +struct d0_locking_features { + /* + * supported_features bits: + * bits 6-7: reserved + * bit 5: MBR done + * bit 4: MBR enabled + * bit 3: media encryption + * bit 2: locked + * bit 1: locking enabled + * bit 0: locking supported + */ + u8 supported_features; + /* + * bytes 5 through 15 are reserved, but we represent the first 3 as + * u8 to keep the other two 32bits integers aligned. + */ + u8 reserved01[3]; + __be32 reserved02; + __be32 reserved03; +}; + +/* + * Geometry Feature Descriptor. Contains flags indicating support for the + * geometry features described in the OPAL specification. The names match the + * OPAL terminology + * + * code == 0x0003 in 2.00.100 + */ +struct d0_geometry_features { + /* + * skip 32 bits from header, needed to align the struct to 64 bits. + */ + u8 header[4]; + /* + * reserved01: + * bits 1-6: reserved + * bit 0: align + */ + u8 reserved01; + u8 reserved02[7]; + __be32 logical_block_size; + __be64 alignment_granularity; + __be64 lowest_aligned_lba; +}; + +/* + * Enterprise SSC Feature + * + * code == 0x0100 + */ +struct d0_enterprise_ssc { + __be16 baseComID; + __be16 numComIDs; + /* range_crossing: + * bits 1-6: reserved + * bit 0: range crossing + */ + u8 range_crossing; + u8 reserved01; + __be16 reserved02; + __be32 reserved03; + __be32 reserved04; +}; + +/* + * Opal V1 feature + * + * code == 0x0200 + */ +struct d0_opal_v100 { + __be16 baseComID; + __be16 numComIDs; +}; + +/* + * Single User Mode feature + * + * code == 0x0201 + */ +struct d0_single_user_mode { + __be32 num_locking_objects; + /* reserved01: + * bit 0: any + * bit 1: all + * bit 2: policy + * bits 3-7: reserved + */ + u8 reserved01; + u8 reserved02; + __be16 reserved03; + __be32 reserved04; +}; + +/* + * Additonal Datastores feature + * + * code == 0x0202 + */ +struct d0_datastore_table { + __be16 reserved01; + __be16 max_tables; + __be32 max_size_tables; + __be32 table_size_alignment; +}; + +/* + * OPAL 2.0 feature + * + * code == 0x0203 + */ +struct d0_opal_v200 { + __be16 baseComID; + __be16 numComIDs; + /* range_crossing: + * bits 1-6: reserved + * bit 0: range crossing + */ + u8 range_crossing; + /* num_locking_admin_auth: + * not aligned to 16 bits, so use two u8. + * stored in big endian: + * 0: MSB + * 1: LSB + */ + u8 num_locking_admin_auth[2]; + /* num_locking_user_auth: + * not aligned to 16 bits, so use two u8. + * stored in big endian: + * 0: MSB + * 1: LSB + */ + u8 num_locking_user_auth[2]; + u8 initialPIN; + u8 revertedPIN; + u8 reserved01; + __be32 reserved02; +}; + +/* Union of features used to parse the discovery 0 response */ +struct d0_features { + __be16 code; + /* + * r_version bits: + * bits 4-7: version + * bits 0-3: reserved + */ + u8 r_version; + u8 length; + u8 features[]; +}; + +#endif /* _OPAL_PROTO_H */ diff --git a/block/partitions/Kconfig b/block/partitions/Kconfig new file mode 100644 index 000000000..6e2a64966 --- /dev/null +++ b/block/partitions/Kconfig @@ -0,0 +1,270 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Partition configuration +# +config PARTITION_ADVANCED + bool "Advanced partition selection" + help + Say Y here if you would like to use hard disks under Linux which + were partitioned under an operating system running on a different + architecture than your Linux system. + + Note that the answer to this question won't directly affect the + kernel: saying N will just cause the configurator to skip all + the questions about foreign partitioning schemes. + + If unsure, say N. + +config ACORN_PARTITION + bool "Acorn partition support" if PARTITION_ADVANCED + default y if ARCH_ACORN + help + Support hard disks partitioned under Acorn operating systems. + +config ACORN_PARTITION_CUMANA + bool "Cumana partition support" if PARTITION_ADVANCED + default y if ARCH_ACORN + depends on ACORN_PARTITION + help + Say Y here if you would like to use hard disks under Linux which + were partitioned using the Cumana interface on Acorn machines. + +config ACORN_PARTITION_EESOX + bool "EESOX partition support" if PARTITION_ADVANCED + default y if ARCH_ACORN + depends on ACORN_PARTITION + +config ACORN_PARTITION_ICS + bool "ICS partition support" if PARTITION_ADVANCED + default y if ARCH_ACORN + depends on ACORN_PARTITION + help + Say Y here if you would like to use hard disks under Linux which + were partitioned using the ICS interface on Acorn machines. + +config ACORN_PARTITION_ADFS + bool "Native filecore partition support" if PARTITION_ADVANCED + default y if ARCH_ACORN + depends on ACORN_PARTITION + help + The Acorn Disc Filing System is the standard file system of the + RiscOS operating system which runs on Acorn's ARM-based Risc PC + systems and the Acorn Archimedes range of machines. If you say + `Y' here, Linux will support disk partitions created under ADFS. + +config ACORN_PARTITION_POWERTEC + bool "PowerTec partition support" if PARTITION_ADVANCED + default y if ARCH_ACORN + depends on ACORN_PARTITION + help + Support reading partition tables created on Acorn machines using + the PowerTec SCSI drive. + +config ACORN_PARTITION_RISCIX + bool "RISCiX partition support" if PARTITION_ADVANCED + default y if ARCH_ACORN + depends on ACORN_PARTITION + help + Once upon a time, there was a native Unix port for the Acorn series + of machines called RISCiX. If you say 'Y' here, Linux will be able + to read disks partitioned under RISCiX. + +config AIX_PARTITION + bool "AIX basic partition table support" if PARTITION_ADVANCED + help + Say Y here if you would like to be able to read the hard disk + partition table format used by IBM or Motorola PowerPC machines + running AIX. AIX actually uses a Logical Volume Manager, where + "logical volumes" can be spread across one or multiple disks, + but this driver works only for the simple case of partitions which + are contiguous. + Otherwise, say N. + +config OSF_PARTITION + bool "Alpha OSF partition support" if PARTITION_ADVANCED + default y if ALPHA + help + Say Y here if you would like to use hard disks under Linux which + were partitioned on an Alpha machine. + +config AMIGA_PARTITION + bool "Amiga partition table support" if PARTITION_ADVANCED + default y if (AMIGA || AFFS_FS=y) + help + Say Y here if you would like to use hard disks under Linux which + were partitioned under AmigaOS. + +config ATARI_PARTITION + bool "Atari partition table support" if PARTITION_ADVANCED + default y if ATARI + help + Say Y here if you would like to use hard disks under Linux which + were partitioned under the Atari OS. + +config IBM_PARTITION + bool "IBM disk label and partition support" + depends on PARTITION_ADVANCED && S390 + help + Say Y here if you would like to be able to read the hard disk + partition table format used by IBM DASD disks operating under CMS. + Otherwise, say N. + +config MAC_PARTITION + bool "Macintosh partition map support" if PARTITION_ADVANCED + default y if (MAC || PPC_PMAC) + help + Say Y here if you would like to use hard disks under Linux which + were partitioned on a Macintosh. + +config MSDOS_PARTITION + bool "PC BIOS (MSDOS partition tables) support" if PARTITION_ADVANCED + default y + help + Say Y here. + +config BSD_DISKLABEL + bool "BSD disklabel (FreeBSD partition tables) support" + depends on PARTITION_ADVANCED && MSDOS_PARTITION + help + FreeBSD uses its own hard disk partition scheme on your PC. It + requires only one entry in the primary partition table of your disk + and manages it similarly to DOS extended partitions, putting in its + first sector a new partition table in BSD disklabel format. Saying Y + here allows you to read these disklabels and further mount FreeBSD + partitions from within Linux if you have also said Y to "UFS + file system support", above. If you don't know what all this is + about, say N. + +config MINIX_SUBPARTITION + bool "Minix subpartition support" + depends on PARTITION_ADVANCED && MSDOS_PARTITION + help + Minix 2.0.0/2.0.2 subpartition table support for Linux. + Say Y here if you want to mount and use Minix 2.0.0/2.0.2 + subpartitions. + +config SOLARIS_X86_PARTITION + bool "Solaris (x86) partition table support" + depends on PARTITION_ADVANCED && MSDOS_PARTITION + help + Like most systems, Solaris x86 uses its own hard disk partition + table format, incompatible with all others. Saying Y here allows you + to read these partition tables and further mount Solaris x86 + partitions from within Linux if you have also said Y to "UFS + file system support", above. + +config UNIXWARE_DISKLABEL + bool "Unixware slices support" + depends on PARTITION_ADVANCED && MSDOS_PARTITION + help + Like some systems, UnixWare uses its own slice table inside a + partition (VTOC - Virtual Table of Contents). Its format is + incompatible with all other OSes. Saying Y here allows you to read + VTOC and further mount UnixWare partitions read-only from within + Linux if you have also said Y to "UFS file system support" or + "System V and Coherent file system support", above. + + This is mainly used to carry data from a UnixWare box to your + Linux box via a removable medium like magneto-optical, ZIP or + removable IDE drives. Note, however, that a good portable way to + transport files and directories between unixes (and even other + operating systems) is given by the tar program ("man tar" or + preferably "info tar"). + + If you don't know what all this is about, say N. + +config LDM_PARTITION + bool "Windows Logical Disk Manager (Dynamic Disk) support" + depends on PARTITION_ADVANCED + help + Say Y here if you would like to use hard disks under Linux which + were partitioned using Windows 2000's/XP's or Vista's Logical Disk + Manager. They are also known as "Dynamic Disks". + + Note this driver only supports Dynamic Disks with a protective MBR + label, i.e. DOS partition table. It does not support GPT labelled + Dynamic Disks yet as can be created with Vista. + + Windows 2000 introduced the concept of Dynamic Disks to get around + the limitations of the PC's partitioning scheme. The Logical Disk + Manager allows the user to repartition a disk and create spanned, + mirrored, striped or RAID volumes, all without the need for + rebooting. + + Normal partitions are now called Basic Disks under Windows 2000, XP, + and Vista. + + For a fuller description read . + + If unsure, say N. + +config LDM_DEBUG + bool "Windows LDM extra logging" + depends on LDM_PARTITION + help + Say Y here if you would like LDM to log verbosely. This could be + helpful if the driver doesn't work as expected and you'd like to + report a bug. + + If unsure, say N. + +config SGI_PARTITION + bool "SGI partition support" if PARTITION_ADVANCED + default y if DEFAULT_SGI_PARTITION + help + Say Y here if you would like to be able to read the hard disk + partition table format used by SGI machines. + +config ULTRIX_PARTITION + bool "Ultrix partition table support" if PARTITION_ADVANCED + default y if MACH_DECSTATION + help + Say Y here if you would like to be able to read the hard disk + partition table format used by DEC (now Compaq) Ultrix machines. + Otherwise, say N. + +config SUN_PARTITION + bool "Sun partition tables support" if PARTITION_ADVANCED + default y if (SPARC || SUN3 || SUN3X) + help + Like most systems, SunOS uses its own hard disk partition table + format, incompatible with all others. Saying Y here allows you to + read these partition tables and further mount SunOS partitions from + within Linux if you have also said Y to "UFS file system support", + above. This is mainly used to carry data from a SPARC under SunOS to + your Linux box via a removable medium like magneto-optical or ZIP + drives; note however that a good portable way to transport files and + directories between unixes (and even other operating systems) is + given by the tar program ("man tar" or preferably "info tar"). If + you don't know what all this is about, say N. + +config KARMA_PARTITION + bool "Karma Partition support" + depends on PARTITION_ADVANCED + help + Say Y here if you would like to mount the Rio Karma MP3 player, as it + uses a proprietary partition table. + +config EFI_PARTITION + bool "EFI GUID Partition support" if PARTITION_ADVANCED + default y + select CRC32 + help + Say Y here if you would like to use hard disks under Linux which + were partitioned using EFI GPT. + +config SYSV68_PARTITION + bool "SYSV68 partition table support" if PARTITION_ADVANCED + default y if VME + help + Say Y here if you would like to be able to read the hard disk + partition table format used by Motorola Delta machines (using + sysv68). + Otherwise, say N. + +config CMDLINE_PARTITION + bool "Command line partition support" if PARTITION_ADVANCED + select BLK_CMDLINE_PARSER + help + Say Y here if you want to read the partition table from bootargs. + The format for the command line is just like mtdparts. diff --git a/block/partitions/Makefile b/block/partitions/Makefile new file mode 100644 index 000000000..a7f05cdb0 --- /dev/null +++ b/block/partitions/Makefile @@ -0,0 +1,22 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for the linux kernel. +# + +obj-$(CONFIG_BLOCK) += core.o +obj-$(CONFIG_ACORN_PARTITION) += acorn.o +obj-$(CONFIG_AMIGA_PARTITION) += amiga.o +obj-$(CONFIG_ATARI_PARTITION) += atari.o +obj-$(CONFIG_AIX_PARTITION) += aix.o +obj-$(CONFIG_CMDLINE_PARTITION) += cmdline.o +obj-$(CONFIG_MAC_PARTITION) += mac.o +obj-$(CONFIG_LDM_PARTITION) += ldm.o +obj-$(CONFIG_MSDOS_PARTITION) += msdos.o +obj-$(CONFIG_OSF_PARTITION) += osf.o +obj-$(CONFIG_SGI_PARTITION) += sgi.o +obj-$(CONFIG_SUN_PARTITION) += sun.o +obj-$(CONFIG_ULTRIX_PARTITION) += ultrix.o +obj-$(CONFIG_IBM_PARTITION) += ibm.o +obj-$(CONFIG_EFI_PARTITION) += efi.o +obj-$(CONFIG_KARMA_PARTITION) += karma.o +obj-$(CONFIG_SYSV68_PARTITION) += sysv68.o diff --git a/block/partitions/acorn.c b/block/partitions/acorn.c new file mode 100644 index 000000000..c64c57b95 --- /dev/null +++ b/block/partitions/acorn.c @@ -0,0 +1,550 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 1996-2000 Russell King. + * + * Scan ADFS partitions on hard disk drives. Unfortunately, there + * isn't a standard for partitioning drives on Acorn machines, so + * every single manufacturer of SCSI and IDE cards created their own + * method. + */ +#include +#include + +#include "check.h" + +/* + * Partition types. (Oh for reusability) + */ +#define PARTITION_RISCIX_MFM 1 +#define PARTITION_RISCIX_SCSI 2 +#define PARTITION_LINUX 9 + +#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \ + defined(CONFIG_ACORN_PARTITION_ADFS) +static struct adfs_discrecord * +adfs_partition(struct parsed_partitions *state, char *name, char *data, + unsigned long first_sector, int slot) +{ + struct adfs_discrecord *dr; + unsigned int nr_sects; + + if (adfs_checkbblk(data)) + return NULL; + + dr = (struct adfs_discrecord *)(data + 0x1c0); + + if (dr->disc_size == 0 && dr->disc_size_high == 0) + return NULL; + + nr_sects = (le32_to_cpu(dr->disc_size_high) << 23) | + (le32_to_cpu(dr->disc_size) >> 9); + + if (name) { + strlcat(state->pp_buf, " [", PAGE_SIZE); + strlcat(state->pp_buf, name, PAGE_SIZE); + strlcat(state->pp_buf, "]", PAGE_SIZE); + } + put_partition(state, slot, first_sector, nr_sects); + return dr; +} +#endif + +#ifdef CONFIG_ACORN_PARTITION_RISCIX + +struct riscix_part { + __le32 start; + __le32 length; + __le32 one; + char name[16]; +}; + +struct riscix_record { + __le32 magic; +#define RISCIX_MAGIC cpu_to_le32(0x4a657320) + __le32 date; + struct riscix_part part[8]; +}; + +#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \ + defined(CONFIG_ACORN_PARTITION_ADFS) +static int riscix_partition(struct parsed_partitions *state, + unsigned long first_sect, int slot, + unsigned long nr_sects) +{ + Sector sect; + struct riscix_record *rr; + + rr = read_part_sector(state, first_sect, §); + if (!rr) + return -1; + + strlcat(state->pp_buf, " [RISCiX]", PAGE_SIZE); + + + if (rr->magic == RISCIX_MAGIC) { + unsigned long size = nr_sects > 2 ? 2 : nr_sects; + int part; + + strlcat(state->pp_buf, " <", PAGE_SIZE); + + put_partition(state, slot++, first_sect, size); + for (part = 0; part < 8; part++) { + if (rr->part[part].one && + memcmp(rr->part[part].name, "All\0", 4)) { + put_partition(state, slot++, + le32_to_cpu(rr->part[part].start), + le32_to_cpu(rr->part[part].length)); + strlcat(state->pp_buf, "(", PAGE_SIZE); + strlcat(state->pp_buf, rr->part[part].name, PAGE_SIZE); + strlcat(state->pp_buf, ")", PAGE_SIZE); + } + } + + strlcat(state->pp_buf, " >\n", PAGE_SIZE); + } else { + put_partition(state, slot++, first_sect, nr_sects); + } + + put_dev_sector(sect); + return slot; +} +#endif +#endif + +#define LINUX_NATIVE_MAGIC 0xdeafa1de +#define LINUX_SWAP_MAGIC 0xdeafab1e + +struct linux_part { + __le32 magic; + __le32 start_sect; + __le32 nr_sects; +}; + +#if defined(CONFIG_ACORN_PARTITION_CUMANA) || \ + defined(CONFIG_ACORN_PARTITION_ADFS) +static int linux_partition(struct parsed_partitions *state, + unsigned long first_sect, int slot, + unsigned long nr_sects) +{ + Sector sect; + struct linux_part *linuxp; + unsigned long size = nr_sects > 2 ? 2 : nr_sects; + + strlcat(state->pp_buf, " [Linux]", PAGE_SIZE); + + put_partition(state, slot++, first_sect, size); + + linuxp = read_part_sector(state, first_sect, §); + if (!linuxp) + return -1; + + strlcat(state->pp_buf, " <", PAGE_SIZE); + while (linuxp->magic == cpu_to_le32(LINUX_NATIVE_MAGIC) || + linuxp->magic == cpu_to_le32(LINUX_SWAP_MAGIC)) { + if (slot == state->limit) + break; + put_partition(state, slot++, first_sect + + le32_to_cpu(linuxp->start_sect), + le32_to_cpu(linuxp->nr_sects)); + linuxp ++; + } + strlcat(state->pp_buf, " >", PAGE_SIZE); + + put_dev_sector(sect); + return slot; +} +#endif + +#ifdef CONFIG_ACORN_PARTITION_CUMANA +int adfspart_check_CUMANA(struct parsed_partitions *state) +{ + unsigned long first_sector = 0; + unsigned int start_blk = 0; + Sector sect; + unsigned char *data; + char *name = "CUMANA/ADFS"; + int first = 1; + int slot = 1; + + /* + * Try Cumana style partitions - sector 6 contains ADFS boot block + * with pointer to next 'drive'. + * + * There are unknowns in this code - is the 'cylinder number' of the + * next partition relative to the start of this one - I'm assuming + * it is. + * + * Also, which ID did Cumana use? + * + * This is totally unfinished, and will require more work to get it + * going. Hence it is totally untested. + */ + do { + struct adfs_discrecord *dr; + unsigned int nr_sects; + + data = read_part_sector(state, start_blk * 2 + 6, §); + if (!data) + return -1; + + if (slot == state->limit) + break; + + dr = adfs_partition(state, name, data, first_sector, slot++); + if (!dr) + break; + + name = NULL; + + nr_sects = (data[0x1fd] + (data[0x1fe] << 8)) * + (dr->heads + (dr->lowsector & 0x40 ? 1 : 0)) * + dr->secspertrack; + + if (!nr_sects) + break; + + first = 0; + first_sector += nr_sects; + start_blk += nr_sects >> (BLOCK_SIZE_BITS - 9); + nr_sects = 0; /* hmm - should be partition size */ + + switch (data[0x1fc] & 15) { + case 0: /* No partition / ADFS? */ + break; + +#ifdef CONFIG_ACORN_PARTITION_RISCIX + case PARTITION_RISCIX_SCSI: + /* RISCiX - we don't know how to find the next one. */ + slot = riscix_partition(state, first_sector, slot, + nr_sects); + break; +#endif + + case PARTITION_LINUX: + slot = linux_partition(state, first_sector, slot, + nr_sects); + break; + } + put_dev_sector(sect); + if (slot == -1) + return -1; + } while (1); + put_dev_sector(sect); + return first ? 0 : 1; +} +#endif + +#ifdef CONFIG_ACORN_PARTITION_ADFS +/* + * Purpose: allocate ADFS partitions. + * + * Params : hd - pointer to gendisk structure to store partition info. + * dev - device number to access. + * + * Returns: -1 on error, 0 for no ADFS boot sector, 1 for ok. + * + * Alloc : hda = whole drive + * hda1 = ADFS partition on first drive. + * hda2 = non-ADFS partition. + */ +int adfspart_check_ADFS(struct parsed_partitions *state) +{ + unsigned long start_sect, nr_sects, sectscyl, heads; + Sector sect; + unsigned char *data; + struct adfs_discrecord *dr; + unsigned char id; + int slot = 1; + + data = read_part_sector(state, 6, §); + if (!data) + return -1; + + dr = adfs_partition(state, "ADFS", data, 0, slot++); + if (!dr) { + put_dev_sector(sect); + return 0; + } + + heads = dr->heads + ((dr->lowsector >> 6) & 1); + sectscyl = dr->secspertrack * heads; + start_sect = ((data[0x1fe] << 8) + data[0x1fd]) * sectscyl; + id = data[0x1fc] & 15; + put_dev_sector(sect); + + /* + * Work out start of non-adfs partition. + */ + nr_sects = (state->bdev->bd_inode->i_size >> 9) - start_sect; + + if (start_sect) { + switch (id) { +#ifdef CONFIG_ACORN_PARTITION_RISCIX + case PARTITION_RISCIX_SCSI: + case PARTITION_RISCIX_MFM: + slot = riscix_partition(state, start_sect, slot, + nr_sects); + break; +#endif + + case PARTITION_LINUX: + slot = linux_partition(state, start_sect, slot, + nr_sects); + break; + } + } + strlcat(state->pp_buf, "\n", PAGE_SIZE); + return 1; +} +#endif + +#ifdef CONFIG_ACORN_PARTITION_ICS + +struct ics_part { + __le32 start; + __le32 size; +}; + +static int adfspart_check_ICSLinux(struct parsed_partitions *state, + unsigned long block) +{ + Sector sect; + unsigned char *data = read_part_sector(state, block, §); + int result = 0; + + if (data) { + if (memcmp(data, "LinuxPart", 9) == 0) + result = 1; + put_dev_sector(sect); + } + + return result; +} + +/* + * Check for a valid ICS partition using the checksum. + */ +static inline int valid_ics_sector(const unsigned char *data) +{ + unsigned long sum; + int i; + + for (i = 0, sum = 0x50617274; i < 508; i++) + sum += data[i]; + + sum -= le32_to_cpu(*(__le32 *)(&data[508])); + + return sum == 0; +} + +/* + * Purpose: allocate ICS partitions. + * Params : hd - pointer to gendisk structure to store partition info. + * dev - device number to access. + * Returns: -1 on error, 0 for no ICS table, 1 for partitions ok. + * Alloc : hda = whole drive + * hda1 = ADFS partition 0 on first drive. + * hda2 = ADFS partition 1 on first drive. + * ..etc.. + */ +int adfspart_check_ICS(struct parsed_partitions *state) +{ + const unsigned char *data; + const struct ics_part *p; + int slot; + Sector sect; + + /* + * Try ICS style partitions - sector 0 contains partition info. + */ + data = read_part_sector(state, 0, §); + if (!data) + return -1; + + if (!valid_ics_sector(data)) { + put_dev_sector(sect); + return 0; + } + + strlcat(state->pp_buf, " [ICS]", PAGE_SIZE); + + for (slot = 1, p = (const struct ics_part *)data; p->size; p++) { + u32 start = le32_to_cpu(p->start); + s32 size = le32_to_cpu(p->size); /* yes, it's signed. */ + + if (slot == state->limit) + break; + + /* + * Negative sizes tell the RISC OS ICS driver to ignore + * this partition - in effect it says that this does not + * contain an ADFS filesystem. + */ + if (size < 0) { + size = -size; + + /* + * Our own extension - We use the first sector + * of the partition to identify what type this + * partition is. We must not make this visible + * to the filesystem. + */ + if (size > 1 && adfspart_check_ICSLinux(state, start)) { + start += 1; + size -= 1; + } + } + + if (size) + put_partition(state, slot++, start, size); + } + + put_dev_sector(sect); + strlcat(state->pp_buf, "\n", PAGE_SIZE); + return 1; +} +#endif + +#ifdef CONFIG_ACORN_PARTITION_POWERTEC +struct ptec_part { + __le32 unused1; + __le32 unused2; + __le32 start; + __le32 size; + __le32 unused5; + char type[8]; +}; + +static inline int valid_ptec_sector(const unsigned char *data) +{ + unsigned char checksum = 0x2a; + int i; + + /* + * If it looks like a PC/BIOS partition, then it + * probably isn't PowerTec. + */ + if (data[510] == 0x55 && data[511] == 0xaa) + return 0; + + for (i = 0; i < 511; i++) + checksum += data[i]; + + return checksum == data[511]; +} + +/* + * Purpose: allocate ICS partitions. + * Params : hd - pointer to gendisk structure to store partition info. + * dev - device number to access. + * Returns: -1 on error, 0 for no ICS table, 1 for partitions ok. + * Alloc : hda = whole drive + * hda1 = ADFS partition 0 on first drive. + * hda2 = ADFS partition 1 on first drive. + * ..etc.. + */ +int adfspart_check_POWERTEC(struct parsed_partitions *state) +{ + Sector sect; + const unsigned char *data; + const struct ptec_part *p; + int slot = 1; + int i; + + data = read_part_sector(state, 0, §); + if (!data) + return -1; + + if (!valid_ptec_sector(data)) { + put_dev_sector(sect); + return 0; + } + + strlcat(state->pp_buf, " [POWERTEC]", PAGE_SIZE); + + for (i = 0, p = (const struct ptec_part *)data; i < 12; i++, p++) { + u32 start = le32_to_cpu(p->start); + u32 size = le32_to_cpu(p->size); + + if (size) + put_partition(state, slot++, start, size); + } + + put_dev_sector(sect); + strlcat(state->pp_buf, "\n", PAGE_SIZE); + return 1; +} +#endif + +#ifdef CONFIG_ACORN_PARTITION_EESOX +struct eesox_part { + char magic[6]; + char name[10]; + __le32 start; + __le32 unused6; + __le32 unused7; + __le32 unused8; +}; + +/* + * Guess who created this format? + */ +static const char eesox_name[] = { + 'N', 'e', 'i', 'l', ' ', + 'C', 'r', 'i', 't', 'c', 'h', 'e', 'l', 'l', ' ', ' ' +}; + +/* + * EESOX SCSI partition format. + * + * This is a goddamned awful partition format. We don't seem to store + * the size of the partition in this table, only the start addresses. + * + * There are two possibilities where the size comes from: + * 1. The individual ADFS boot block entries that are placed on the disk. + * 2. The start address of the next entry. + */ +int adfspart_check_EESOX(struct parsed_partitions *state) +{ + Sector sect; + const unsigned char *data; + unsigned char buffer[256]; + struct eesox_part *p; + sector_t start = 0; + int i, slot = 1; + + data = read_part_sector(state, 7, §); + if (!data) + return -1; + + /* + * "Decrypt" the partition table. God knows why... + */ + for (i = 0; i < 256; i++) + buffer[i] = data[i] ^ eesox_name[i & 15]; + + put_dev_sector(sect); + + for (i = 0, p = (struct eesox_part *)buffer; i < 8; i++, p++) { + sector_t next; + + if (memcmp(p->magic, "Eesox", 6)) + break; + + next = le32_to_cpu(p->start); + if (i) + put_partition(state, slot++, start, next - start); + start = next; + } + + if (i != 0) { + sector_t size; + + size = get_capacity(state->bdev->bd_disk); + put_partition(state, slot++, start, size - start); + strlcat(state->pp_buf, "\n", PAGE_SIZE); + } + + return i ? 1 : 0; +} +#endif diff --git a/block/partitions/aix.c b/block/partitions/aix.c new file mode 100644 index 000000000..c7b4fd1a4 --- /dev/null +++ b/block/partitions/aix.c @@ -0,0 +1,298 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/partitions/aix.c + * + * Copyright (C) 2012-2013 Philippe De Muyter + */ + +#include "check.h" + +struct lvm_rec { + char lvm_id[4]; /* "_LVM" */ + char reserved4[16]; + __be32 lvmarea_len; + __be32 vgda_len; + __be32 vgda_psn[2]; + char reserved36[10]; + __be16 pp_size; /* log2(pp_size) */ + char reserved46[12]; + __be16 version; + }; + +struct vgda { + __be32 secs; + __be32 usec; + char reserved8[16]; + __be16 numlvs; + __be16 maxlvs; + __be16 pp_size; + __be16 numpvs; + __be16 total_vgdas; + __be16 vgda_size; + }; + +struct lvd { + __be16 lv_ix; + __be16 res2; + __be16 res4; + __be16 maxsize; + __be16 lv_state; + __be16 mirror; + __be16 mirror_policy; + __be16 num_lps; + __be16 res10[8]; + }; + +struct lvname { + char name[64]; + }; + +struct ppe { + __be16 lv_ix; + unsigned short res2; + unsigned short res4; + __be16 lp_ix; + unsigned short res8[12]; + }; + +struct pvd { + char reserved0[16]; + __be16 pp_count; + char reserved18[2]; + __be32 psn_part1; + char reserved24[8]; + struct ppe ppe[1016]; + }; + +#define LVM_MAXLVS 256 + +/** + * last_lba(): return number of last logical block of device + * @bdev: block device + * + * Description: Returns last LBA value on success, 0 on error. + * This is stored (by sd and ide-geometry) in + * the part[0] entry for this disk, and is the number of + * physical sectors available on the disk. + */ +static u64 last_lba(struct block_device *bdev) +{ + if (!bdev || !bdev->bd_inode) + return 0; + return (bdev->bd_inode->i_size >> 9) - 1ULL; +} + +/** + * read_lba(): Read bytes from disk, starting at given LBA + * @state + * @lba + * @buffer + * @count + * + * Description: Reads @count bytes from @state->bdev into @buffer. + * Returns number of bytes read on success, 0 on error. + */ +static size_t read_lba(struct parsed_partitions *state, u64 lba, u8 *buffer, + size_t count) +{ + size_t totalreadcount = 0; + + if (!buffer || lba + count / 512 > last_lba(state->bdev)) + return 0; + + while (count) { + int copied = 512; + Sector sect; + unsigned char *data = read_part_sector(state, lba++, §); + if (!data) + break; + if (copied > count) + copied = count; + memcpy(buffer, data, copied); + put_dev_sector(sect); + buffer += copied; + totalreadcount += copied; + count -= copied; + } + return totalreadcount; +} + +/** + * alloc_pvd(): reads physical volume descriptor + * @state + * @lba + * + * Description: Returns pvd on success, NULL on error. + * Allocates space for pvd and fill it with disk blocks at @lba + * Notes: remember to free pvd when you're done! + */ +static struct pvd *alloc_pvd(struct parsed_partitions *state, u32 lba) +{ + size_t count = sizeof(struct pvd); + struct pvd *p; + + p = kmalloc(count, GFP_KERNEL); + if (!p) + return NULL; + + if (read_lba(state, lba, (u8 *) p, count) < count) { + kfree(p); + return NULL; + } + return p; +} + +/** + * alloc_lvn(): reads logical volume names + * @state + * @lba + * + * Description: Returns lvn on success, NULL on error. + * Allocates space for lvn and fill it with disk blocks at @lba + * Notes: remember to free lvn when you're done! + */ +static struct lvname *alloc_lvn(struct parsed_partitions *state, u32 lba) +{ + size_t count = sizeof(struct lvname) * LVM_MAXLVS; + struct lvname *p; + + p = kmalloc(count, GFP_KERNEL); + if (!p) + return NULL; + + if (read_lba(state, lba, (u8 *) p, count) < count) { + kfree(p); + return NULL; + } + return p; +} + +int aix_partition(struct parsed_partitions *state) +{ + int ret = 0; + Sector sect; + unsigned char *d; + u32 pp_bytes_size; + u32 pp_blocks_size = 0; + u32 vgda_sector = 0; + u32 vgda_len = 0; + int numlvs = 0; + struct pvd *pvd = NULL; + struct lv_info { + unsigned short pps_per_lv; + unsigned short pps_found; + unsigned char lv_is_contiguous; + } *lvip; + struct lvname *n = NULL; + + d = read_part_sector(state, 7, §); + if (d) { + struct lvm_rec *p = (struct lvm_rec *)d; + u16 lvm_version = be16_to_cpu(p->version); + char tmp[64]; + + if (lvm_version == 1) { + int pp_size_log2 = be16_to_cpu(p->pp_size); + + pp_bytes_size = 1 << pp_size_log2; + pp_blocks_size = pp_bytes_size / 512; + snprintf(tmp, sizeof(tmp), + " AIX LVM header version %u found\n", + lvm_version); + vgda_len = be32_to_cpu(p->vgda_len); + vgda_sector = be32_to_cpu(p->vgda_psn[0]); + } else { + snprintf(tmp, sizeof(tmp), + " unsupported AIX LVM version %d found\n", + lvm_version); + } + strlcat(state->pp_buf, tmp, PAGE_SIZE); + put_dev_sector(sect); + } + if (vgda_sector && (d = read_part_sector(state, vgda_sector, §))) { + struct vgda *p = (struct vgda *)d; + + numlvs = be16_to_cpu(p->numlvs); + put_dev_sector(sect); + } + lvip = kcalloc(state->limit, sizeof(struct lv_info), GFP_KERNEL); + if (!lvip) + return 0; + if (numlvs && (d = read_part_sector(state, vgda_sector + 1, §))) { + struct lvd *p = (struct lvd *)d; + int i; + + n = alloc_lvn(state, vgda_sector + vgda_len - 33); + if (n) { + int foundlvs = 0; + + for (i = 0; foundlvs < numlvs && i < state->limit; i += 1) { + lvip[i].pps_per_lv = be16_to_cpu(p[i].num_lps); + if (lvip[i].pps_per_lv) + foundlvs += 1; + } + /* pvd loops depend on n[].name and lvip[].pps_per_lv */ + pvd = alloc_pvd(state, vgda_sector + 17); + } + put_dev_sector(sect); + } + if (pvd) { + int numpps = be16_to_cpu(pvd->pp_count); + int psn_part1 = be32_to_cpu(pvd->psn_part1); + int i; + int cur_lv_ix = -1; + int next_lp_ix = 1; + int lp_ix; + + for (i = 0; i < numpps; i += 1) { + struct ppe *p = pvd->ppe + i; + unsigned int lv_ix; + + lp_ix = be16_to_cpu(p->lp_ix); + if (!lp_ix) { + next_lp_ix = 1; + continue; + } + lv_ix = be16_to_cpu(p->lv_ix) - 1; + if (lv_ix >= state->limit) { + cur_lv_ix = -1; + continue; + } + lvip[lv_ix].pps_found += 1; + if (lp_ix == 1) { + cur_lv_ix = lv_ix; + next_lp_ix = 1; + } else if (lv_ix != cur_lv_ix || lp_ix != next_lp_ix) { + next_lp_ix = 1; + continue; + } + if (lp_ix == lvip[lv_ix].pps_per_lv) { + char tmp[70]; + + put_partition(state, lv_ix + 1, + (i + 1 - lp_ix) * pp_blocks_size + psn_part1, + lvip[lv_ix].pps_per_lv * pp_blocks_size); + snprintf(tmp, sizeof(tmp), " <%s>\n", + n[lv_ix].name); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + lvip[lv_ix].lv_is_contiguous = 1; + ret = 1; + next_lp_ix = 1; + } else + next_lp_ix += 1; + } + for (i = 0; i < state->limit; i += 1) + if (lvip[i].pps_found && !lvip[i].lv_is_contiguous) { + char tmp[sizeof(n[i].name) + 1]; // null char + + snprintf(tmp, sizeof(tmp), "%s", n[i].name); + pr_warn("partition %s (%u pp's found) is " + "not contiguous\n", + tmp, lvip[i].pps_found); + } + kfree(pvd); + } + kfree(n); + kfree(lvip); + return ret; +} diff --git a/block/partitions/amiga.c b/block/partitions/amiga.c new file mode 100644 index 000000000..a99ec7f1a --- /dev/null +++ b/block/partitions/amiga.c @@ -0,0 +1,207 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/partitions/amiga.c + * + * Code extracted from drivers/block/genhd.c + * + * Copyright (C) 1991-1998 Linus Torvalds + * Re-organised Feb 1998 Russell King + */ + +#define pr_fmt(fmt) fmt + +#include +#include +#include +#include + +#include "check.h" + +/* magic offsets in partition DosEnvVec */ +#define NR_HD 3 +#define NR_SECT 5 +#define LO_CYL 9 +#define HI_CYL 10 + +static __inline__ u32 +checksum_block(__be32 *m, int size) +{ + u32 sum = 0; + + while (size--) + sum += be32_to_cpu(*m++); + return sum; +} + +int amiga_partition(struct parsed_partitions *state) +{ + Sector sect; + unsigned char *data; + struct RigidDiskBlock *rdb; + struct PartitionBlock *pb; + u64 start_sect, nr_sects; + sector_t blk, end_sect; + u32 cylblk; /* rdb_CylBlocks = nr_heads*sect_per_track */ + u32 nr_hd, nr_sect, lo_cyl, hi_cyl; + int part, res = 0; + unsigned int blksize = 1; /* Multiplier for disk block size */ + int slot = 1; + char b[BDEVNAME_SIZE]; + + for (blk = 0; ; blk++, put_dev_sector(sect)) { + if (blk == RDB_ALLOCATION_LIMIT) + goto rdb_done; + data = read_part_sector(state, blk, §); + if (!data) { + pr_err("Dev %s: unable to read RDB block %llu\n", + bdevname(state->bdev, b), blk); + res = -1; + goto rdb_done; + } + if (*(__be32 *)data != cpu_to_be32(IDNAME_RIGIDDISK)) + continue; + + rdb = (struct RigidDiskBlock *)data; + if (checksum_block((__be32 *)data, be32_to_cpu(rdb->rdb_SummedLongs) & 0x7F) == 0) + break; + /* Try again with 0xdc..0xdf zeroed, Windows might have + * trashed it. + */ + *(__be32 *)(data+0xdc) = 0; + if (checksum_block((__be32 *)data, + be32_to_cpu(rdb->rdb_SummedLongs) & 0x7F)==0) { + pr_err("Trashed word at 0xd0 in block %llu ignored in checksum calculation\n", + blk); + break; + } + + pr_err("Dev %s: RDB in block %llu has bad checksum\n", + bdevname(state->bdev, b), blk); + } + + /* blksize is blocks per 512 byte standard block */ + blksize = be32_to_cpu( rdb->rdb_BlockBytes ) / 512; + + { + char tmp[7 + 10 + 1 + 1]; + + /* Be more informative */ + snprintf(tmp, sizeof(tmp), " RDSK (%d)", blksize * 512); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + } + blk = be32_to_cpu(rdb->rdb_PartitionList); + put_dev_sector(sect); + for (part = 1; (s32) blk>0 && part<=16; part++, put_dev_sector(sect)) { + /* Read in terms partition table understands */ + if (check_mul_overflow(blk, (sector_t) blksize, &blk)) { + pr_err("Dev %s: overflow calculating partition block %llu! Skipping partitions %u and beyond\n", + bdevname(state->bdev, b), blk, part); + break; + } + data = read_part_sector(state, blk, §); + if (!data) { + pr_err("Dev %s: unable to read partition block %llu\n", + bdevname(state->bdev, b), blk); + res = -1; + goto rdb_done; + } + pb = (struct PartitionBlock *)data; + blk = be32_to_cpu(pb->pb_Next); + if (pb->pb_ID != cpu_to_be32(IDNAME_PARTITION)) + continue; + if (checksum_block((__be32 *)pb, be32_to_cpu(pb->pb_SummedLongs) & 0x7F) != 0 ) + continue; + + /* RDB gives us more than enough rope to hang ourselves with, + * many times over (2^128 bytes if all fields max out). + * Some careful checks are in order, so check for potential + * overflows. + * We are multiplying four 32 bit numbers to one sector_t! + */ + + nr_hd = be32_to_cpu(pb->pb_Environment[NR_HD]); + nr_sect = be32_to_cpu(pb->pb_Environment[NR_SECT]); + + /* CylBlocks is total number of blocks per cylinder */ + if (check_mul_overflow(nr_hd, nr_sect, &cylblk)) { + pr_err("Dev %s: heads*sects %u overflows u32, skipping partition!\n", + bdevname(state->bdev, b), cylblk); + continue; + } + + /* check for consistency with RDB defined CylBlocks */ + if (cylblk > be32_to_cpu(rdb->rdb_CylBlocks)) { + pr_warn("Dev %s: cylblk %u > rdb_CylBlocks %u!\n", + bdevname(state->bdev, b), cylblk, + be32_to_cpu(rdb->rdb_CylBlocks)); + } + + /* RDB allows for variable logical block size - + * normalize to 512 byte blocks and check result. + */ + + if (check_mul_overflow(cylblk, blksize, &cylblk)) { + pr_err("Dev %s: partition %u bytes per cyl. overflows u32, skipping partition!\n", + bdevname(state->bdev, b), part); + continue; + } + + /* Calculate partition start and end. Limit of 32 bit on cylblk + * guarantees no overflow occurs if LBD support is enabled. + */ + + lo_cyl = be32_to_cpu(pb->pb_Environment[LO_CYL]); + start_sect = ((u64) lo_cyl * cylblk); + + hi_cyl = be32_to_cpu(pb->pb_Environment[HI_CYL]); + nr_sects = (((u64) hi_cyl - lo_cyl + 1) * cylblk); + + if (!nr_sects) + continue; + + /* Warn user if partition end overflows u32 (AmigaDOS limit) */ + + if ((start_sect + nr_sects) > UINT_MAX) { + pr_warn("Dev %s: partition %u (%llu-%llu) needs 64 bit device support!\n", + bdevname(state->bdev, b), part, + start_sect, start_sect + nr_sects); + } + + if (check_add_overflow(start_sect, nr_sects, &end_sect)) { + pr_err("Dev %s: partition %u (%llu-%llu) needs LBD device support, skipping partition!\n", + bdevname(state->bdev, b), part, + start_sect, end_sect); + continue; + } + + /* Tell Kernel about it */ + + put_partition(state,slot++,start_sect,nr_sects); + { + /* Be even more informative to aid mounting */ + char dostype[4]; + char tmp[42]; + + __be32 *dt = (__be32 *)dostype; + *dt = pb->pb_Environment[16]; + if (dostype[3] < ' ') + snprintf(tmp, sizeof(tmp), " (%c%c%c^%c)", + dostype[0], dostype[1], + dostype[2], dostype[3] + '@' ); + else + snprintf(tmp, sizeof(tmp), " (%c%c%c%c)", + dostype[0], dostype[1], + dostype[2], dostype[3]); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + snprintf(tmp, sizeof(tmp), "(res %d spb %d)", + be32_to_cpu(pb->pb_Environment[6]), + be32_to_cpu(pb->pb_Environment[4])); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + } + res = 1; + } + strlcat(state->pp_buf, "\n", PAGE_SIZE); + +rdb_done: + return res; +} diff --git a/block/partitions/atari.c b/block/partitions/atari.c new file mode 100644 index 000000000..2305840c8 --- /dev/null +++ b/block/partitions/atari.c @@ -0,0 +1,157 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/partitions/atari.c + * + * Code extracted from drivers/block/genhd.c + * + * Copyright (C) 1991-1998 Linus Torvalds + * Re-organised Feb 1998 Russell King + */ + +#include +#include "check.h" +#include "atari.h" + +/* ++guenther: this should be settable by the user ("make config")?. + */ +#define ICD_PARTS + +/* check if a partition entry looks valid -- Atari format is assumed if at + least one of the primary entries is ok this way */ +#define VALID_PARTITION(pi,hdsiz) \ + (((pi)->flg & 1) && \ + isalnum((pi)->id[0]) && isalnum((pi)->id[1]) && isalnum((pi)->id[2]) && \ + be32_to_cpu((pi)->st) <= (hdsiz) && \ + be32_to_cpu((pi)->st) + be32_to_cpu((pi)->siz) <= (hdsiz)) + +static inline int OK_id(char *s) +{ + return memcmp (s, "GEM", 3) == 0 || memcmp (s, "BGM", 3) == 0 || + memcmp (s, "LNX", 3) == 0 || memcmp (s, "SWP", 3) == 0 || + memcmp (s, "RAW", 3) == 0 ; +} + +int atari_partition(struct parsed_partitions *state) +{ + Sector sect; + struct rootsector *rs; + struct partition_info *pi; + u32 extensect; + u32 hd_size; + int slot; +#ifdef ICD_PARTS + int part_fmt = 0; /* 0:unknown, 1:AHDI, 2:ICD/Supra */ +#endif + + /* + * ATARI partition scheme supports 512 lba only. If this is not + * the case, bail early to avoid miscalculating hd_size. + */ + if (bdev_logical_block_size(state->bdev) != 512) + return 0; + + rs = read_part_sector(state, 0, §); + if (!rs) + return -1; + + /* Verify this is an Atari rootsector: */ + hd_size = state->bdev->bd_inode->i_size >> 9; + if (!VALID_PARTITION(&rs->part[0], hd_size) && + !VALID_PARTITION(&rs->part[1], hd_size) && + !VALID_PARTITION(&rs->part[2], hd_size) && + !VALID_PARTITION(&rs->part[3], hd_size)) { + /* + * if there's no valid primary partition, assume that no Atari + * format partition table (there's no reliable magic or the like + * :-() + */ + put_dev_sector(sect); + return 0; + } + + pi = &rs->part[0]; + strlcat(state->pp_buf, " AHDI", PAGE_SIZE); + for (slot = 1; pi < &rs->part[4] && slot < state->limit; slot++, pi++) { + struct rootsector *xrs; + Sector sect2; + ulong partsect; + + if ( !(pi->flg & 1) ) + continue; + /* active partition */ + if (memcmp (pi->id, "XGM", 3) != 0) { + /* we don't care about other id's */ + put_partition (state, slot, be32_to_cpu(pi->st), + be32_to_cpu(pi->siz)); + continue; + } + /* extension partition */ +#ifdef ICD_PARTS + part_fmt = 1; +#endif + strlcat(state->pp_buf, " XGM<", PAGE_SIZE); + partsect = extensect = be32_to_cpu(pi->st); + while (1) { + xrs = read_part_sector(state, partsect, §2); + if (!xrs) { + printk (" block %ld read failed\n", partsect); + put_dev_sector(sect); + return -1; + } + + /* ++roman: sanity check: bit 0 of flg field must be set */ + if (!(xrs->part[0].flg & 1)) { + printk( "\nFirst sub-partition in extended partition is not valid!\n" ); + put_dev_sector(sect2); + break; + } + + put_partition(state, slot, + partsect + be32_to_cpu(xrs->part[0].st), + be32_to_cpu(xrs->part[0].siz)); + + if (!(xrs->part[1].flg & 1)) { + /* end of linked partition list */ + put_dev_sector(sect2); + break; + } + if (memcmp( xrs->part[1].id, "XGM", 3 ) != 0) { + printk("\nID of extended partition is not XGM!\n"); + put_dev_sector(sect2); + break; + } + + partsect = be32_to_cpu(xrs->part[1].st) + extensect; + put_dev_sector(sect2); + if (++slot == state->limit) { + printk( "\nMaximum number of partitions reached!\n" ); + break; + } + } + strlcat(state->pp_buf, " >", PAGE_SIZE); + } +#ifdef ICD_PARTS + if ( part_fmt!=1 ) { /* no extended partitions -> test ICD-format */ + pi = &rs->icdpart[0]; + /* sanity check: no ICD format if first partition invalid */ + if (OK_id(pi->id)) { + strlcat(state->pp_buf, " ICD<", PAGE_SIZE); + for (; pi < &rs->icdpart[8] && slot < state->limit; slot++, pi++) { + /* accept only GEM,BGM,RAW,LNX,SWP partitions */ + if (!((pi->flg & 1) && OK_id(pi->id))) + continue; + part_fmt = 2; + put_partition (state, slot, + be32_to_cpu(pi->st), + be32_to_cpu(pi->siz)); + } + strlcat(state->pp_buf, " >", PAGE_SIZE); + } + } +#endif + put_dev_sector(sect); + + strlcat(state->pp_buf, "\n", PAGE_SIZE); + + return 1; +} diff --git a/block/partitions/atari.h b/block/partitions/atari.h new file mode 100644 index 000000000..678202442 --- /dev/null +++ b/block/partitions/atari.h @@ -0,0 +1,36 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * fs/partitions/atari.h + * Moved by Russell King from: + * + * linux/include/linux/atari_rootsec.h + * definitions for Atari Rootsector layout + * by Andreas Schwab (schwab@ls5.informatik.uni-dortmund.de) + * + * modified for ICD/Supra partitioning scheme restricted to at most 12 + * partitions + * by Guenther Kelleter (guenther@pool.informatik.rwth-aachen.de) + */ + +#include + +struct partition_info +{ + u8 flg; /* bit 0: active; bit 7: bootable */ + char id[3]; /* "GEM", "BGM", "XGM", or other */ + __be32 st; /* start of partition */ + __be32 siz; /* length of partition */ +}; + +struct rootsector +{ + char unused[0x156]; /* room for boot code */ + struct partition_info icdpart[8]; /* info for ICD-partitions 5..12 */ + char unused2[0xc]; + u32 hd_siz; /* size of disk in blocks */ + struct partition_info part[4]; + u32 bsl_st; /* start of bad sector list */ + u32 bsl_cnt; /* length of bad sector list */ + u16 checksum; /* checksum for bootable disks */ +} __packed; + diff --git a/block/partitions/check.h b/block/partitions/check.h new file mode 100644 index 000000000..c577e9ee6 --- /dev/null +++ b/block/partitions/check.h @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include +#include +#include +#include "../blk.h" + +/* + * add_gd_partition adds a partitions details to the devices partition + * description. + */ +struct parsed_partitions { + struct block_device *bdev; + char name[BDEVNAME_SIZE]; + struct { + sector_t from; + sector_t size; + int flags; + bool has_info; + struct partition_meta_info info; + } *parts; + int next; + int limit; + bool access_beyond_eod; + char *pp_buf; +}; + +typedef struct { + struct page *v; +} Sector; + +void *read_part_sector(struct parsed_partitions *state, sector_t n, Sector *p); +static inline void put_dev_sector(Sector p) +{ + put_page(p.v); +} + +static inline void +put_partition(struct parsed_partitions *p, int n, sector_t from, sector_t size) +{ + if (n < p->limit) { + char tmp[1 + BDEVNAME_SIZE + 10 + 1]; + + p->parts[n].from = from; + p->parts[n].size = size; + snprintf(tmp, sizeof(tmp), " %s%d", p->name, n); + strlcat(p->pp_buf, tmp, PAGE_SIZE); + } +} + +/* detection routines go here in alphabetical order: */ +int adfspart_check_ADFS(struct parsed_partitions *state); +int adfspart_check_CUMANA(struct parsed_partitions *state); +int adfspart_check_EESOX(struct parsed_partitions *state); +int adfspart_check_ICS(struct parsed_partitions *state); +int adfspart_check_POWERTEC(struct parsed_partitions *state); +int aix_partition(struct parsed_partitions *state); +int amiga_partition(struct parsed_partitions *state); +int atari_partition(struct parsed_partitions *state); +int cmdline_partition(struct parsed_partitions *state); +int efi_partition(struct parsed_partitions *state); +int ibm_partition(struct parsed_partitions *); +int karma_partition(struct parsed_partitions *state); +int ldm_partition(struct parsed_partitions *state); +int mac_partition(struct parsed_partitions *state); +int msdos_partition(struct parsed_partitions *state); +int osf_partition(struct parsed_partitions *state); +int sgi_partition(struct parsed_partitions *state); +int sun_partition(struct parsed_partitions *state); +int sysv68_partition(struct parsed_partitions *state); +int ultrix_partition(struct parsed_partitions *state); diff --git a/block/partitions/cmdline.c b/block/partitions/cmdline.c new file mode 100644 index 000000000..8f545c36c --- /dev/null +++ b/block/partitions/cmdline.c @@ -0,0 +1,156 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2013 HUAWEI + * Author: Cai Zhiyong + * + * Read block device partition table from the command line. + * Typically used for fixed block (eMMC) embedded devices. + * It has no MBR, so saves storage space. Bootloader can be easily accessed + * by absolute address of data on the block device. + * Users can easily change the partition. + * + * The format for the command line is just like mtdparts. + * + * For further information, see "Documentation/block/cmdline-partition.rst" + * + */ + +#include + +#include "check.h" + +static char *cmdline; +static struct cmdline_parts *bdev_parts; + +static int add_part(int slot, struct cmdline_subpart *subpart, void *param) +{ + int label_min; + struct partition_meta_info *info; + char tmp[sizeof(info->volname) + 4]; + struct parsed_partitions *state = (struct parsed_partitions *)param; + + if (slot >= state->limit) + return 1; + + put_partition(state, slot, subpart->from >> 9, + subpart->size >> 9); + + info = &state->parts[slot].info; + + label_min = min_t(int, sizeof(info->volname) - 1, + sizeof(subpart->name)); + strncpy(info->volname, subpart->name, label_min); + info->volname[label_min] = '\0'; + + snprintf(tmp, sizeof(tmp), "(%s)", info->volname); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + + state->parts[slot].has_info = true; + + return 0; +} + +static int __init cmdline_parts_setup(char *s) +{ + cmdline = s; + return 1; +} +__setup("blkdevparts=", cmdline_parts_setup); + +static bool has_overlaps(sector_t from, sector_t size, + sector_t from2, sector_t size2) +{ + sector_t end = from + size; + sector_t end2 = from2 + size2; + + if (from >= from2 && from < end2) + return true; + + if (end > from2 && end <= end2) + return true; + + if (from2 >= from && from2 < end) + return true; + + if (end2 > from && end2 <= end) + return true; + + return false; +} + +static inline void overlaps_warns_header(void) +{ + pr_warn("Overlapping partitions are used in command line partitions."); + pr_warn("Don't use filesystems on overlapping partitions:"); +} + +static void cmdline_parts_verifier(int slot, struct parsed_partitions *state) +{ + int i; + bool header = true; + + for (; slot < state->limit && state->parts[slot].has_info; slot++) { + for (i = slot+1; i < state->limit && state->parts[i].has_info; + i++) { + if (has_overlaps(state->parts[slot].from, + state->parts[slot].size, + state->parts[i].from, + state->parts[i].size)) { + if (header) { + header = false; + overlaps_warns_header(); + } + pr_warn("%s[%llu,%llu] overlaps with " + "%s[%llu,%llu].", + state->parts[slot].info.volname, + (u64)state->parts[slot].from << 9, + (u64)state->parts[slot].size << 9, + state->parts[i].info.volname, + (u64)state->parts[i].from << 9, + (u64)state->parts[i].size << 9); + } + } + } +} + +/* + * Purpose: allocate cmdline partitions. + * Returns: + * -1 if unable to read the partition table + * 0 if this isn't our partition table + * 1 if successful + */ +int cmdline_partition(struct parsed_partitions *state) +{ + sector_t disk_size; + char bdev[BDEVNAME_SIZE]; + struct cmdline_parts *parts; + + if (cmdline) { + if (bdev_parts) + cmdline_parts_free(&bdev_parts); + + if (cmdline_parts_parse(&bdev_parts, cmdline)) { + cmdline = NULL; + return -1; + } + cmdline = NULL; + } + + if (!bdev_parts) + return 0; + + bdevname(state->bdev, bdev); + parts = cmdline_parts_find(bdev_parts, bdev); + if (!parts) + return 0; + + disk_size = get_capacity(state->bdev->bd_disk) << 9; + + cmdline_parts_set(parts, disk_size, 1, add_part, (void *)state); + cmdline_parts_verifier(1, state); + + strlcat(state->pp_buf, "\n", PAGE_SIZE); + + return 1; +} diff --git a/block/partitions/core.c b/block/partitions/core.c new file mode 100644 index 000000000..e3d61ec4a --- /dev/null +++ b/block/partitions/core.c @@ -0,0 +1,802 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 1991-1998 Linus Torvalds + * Re-organised Feb 1998 Russell King + */ +#include +#include +#include +#include +#include +#include +#include +#include "check.h" + +static int (*check_part[])(struct parsed_partitions *) = { + /* + * Probe partition formats with tables at disk address 0 + * that also have an ADFS boot block at 0xdc0. + */ +#ifdef CONFIG_ACORN_PARTITION_ICS + adfspart_check_ICS, +#endif +#ifdef CONFIG_ACORN_PARTITION_POWERTEC + adfspart_check_POWERTEC, +#endif +#ifdef CONFIG_ACORN_PARTITION_EESOX + adfspart_check_EESOX, +#endif + + /* + * Now move on to formats that only have partition info at + * disk address 0xdc0. Since these may also have stale + * PC/BIOS partition tables, they need to come before + * the msdos entry. + */ +#ifdef CONFIG_ACORN_PARTITION_CUMANA + adfspart_check_CUMANA, +#endif +#ifdef CONFIG_ACORN_PARTITION_ADFS + adfspart_check_ADFS, +#endif + +#ifdef CONFIG_CMDLINE_PARTITION + cmdline_partition, +#endif +#ifdef CONFIG_EFI_PARTITION + efi_partition, /* this must come before msdos */ +#endif +#ifdef CONFIG_SGI_PARTITION + sgi_partition, +#endif +#ifdef CONFIG_LDM_PARTITION + ldm_partition, /* this must come before msdos */ +#endif +#ifdef CONFIG_MSDOS_PARTITION + msdos_partition, +#endif +#ifdef CONFIG_OSF_PARTITION + osf_partition, +#endif +#ifdef CONFIG_SUN_PARTITION + sun_partition, +#endif +#ifdef CONFIG_AMIGA_PARTITION + amiga_partition, +#endif +#ifdef CONFIG_ATARI_PARTITION + atari_partition, +#endif +#ifdef CONFIG_MAC_PARTITION + mac_partition, +#endif +#ifdef CONFIG_ULTRIX_PARTITION + ultrix_partition, +#endif +#ifdef CONFIG_IBM_PARTITION + ibm_partition, +#endif +#ifdef CONFIG_KARMA_PARTITION + karma_partition, +#endif +#ifdef CONFIG_SYSV68_PARTITION + sysv68_partition, +#endif + NULL +}; + +static struct parsed_partitions *allocate_partitions(struct gendisk *hd) +{ + struct parsed_partitions *state; + int nr; + + state = kzalloc(sizeof(*state), GFP_KERNEL); + if (!state) + return NULL; + + nr = disk_max_parts(hd); + state->parts = vzalloc(array_size(nr, sizeof(state->parts[0]))); + if (!state->parts) { + kfree(state); + return NULL; + } + + state->limit = nr; + + return state; +} + +static void free_partitions(struct parsed_partitions *state) +{ + vfree(state->parts); + kfree(state); +} + +static struct parsed_partitions *check_partition(struct gendisk *hd, + struct block_device *bdev) +{ + struct parsed_partitions *state; + int i, res, err; + + state = allocate_partitions(hd); + if (!state) + return NULL; + state->pp_buf = (char *)__get_free_page(GFP_KERNEL); + if (!state->pp_buf) { + free_partitions(state); + return NULL; + } + state->pp_buf[0] = '\0'; + + state->bdev = bdev; + disk_name(hd, 0, state->name); + snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name); + if (isdigit(state->name[strlen(state->name)-1])) + sprintf(state->name, "p"); + + i = res = err = 0; + while (!res && check_part[i]) { + memset(state->parts, 0, state->limit * sizeof(state->parts[0])); + res = check_part[i++](state); + if (res < 0) { + /* + * We have hit an I/O error which we don't report now. + * But record it, and let the others do their job. + */ + err = res; + res = 0; + } + + } + if (res > 0) { + printk(KERN_INFO "%s", state->pp_buf); + + free_page((unsigned long)state->pp_buf); + return state; + } + if (state->access_beyond_eod) + err = -ENOSPC; + /* + * The partition is unrecognized. So report I/O errors if there were any + */ + if (err) + res = err; + if (res) { + strlcat(state->pp_buf, + " unable to read partition table\n", PAGE_SIZE); + printk(KERN_INFO "%s", state->pp_buf); + } + + free_page((unsigned long)state->pp_buf); + free_partitions(state); + return ERR_PTR(res); +} + +static ssize_t part_partition_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + + return sprintf(buf, "%d\n", p->partno); +} + +static ssize_t part_start_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + + return sprintf(buf, "%llu\n",(unsigned long long)p->start_sect); +} + +static ssize_t part_ro_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + return sprintf(buf, "%d\n", p->policy ? 1 : 0); +} + +static ssize_t part_alignment_offset_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + + return sprintf(buf, "%u\n", + queue_limit_alignment_offset(&part_to_disk(p)->queue->limits, + p->start_sect)); +} + +static ssize_t part_discard_alignment_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + + return sprintf(buf, "%u\n", + queue_limit_discard_alignment(&part_to_disk(p)->queue->limits, + p->start_sect)); +} + +static DEVICE_ATTR(partition, 0444, part_partition_show, NULL); +static DEVICE_ATTR(start, 0444, part_start_show, NULL); +static DEVICE_ATTR(size, 0444, part_size_show, NULL); +static DEVICE_ATTR(ro, 0444, part_ro_show, NULL); +static DEVICE_ATTR(alignment_offset, 0444, part_alignment_offset_show, NULL); +static DEVICE_ATTR(discard_alignment, 0444, part_discard_alignment_show, NULL); +static DEVICE_ATTR(stat, 0444, part_stat_show, NULL); +static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL); +#ifdef CONFIG_FAIL_MAKE_REQUEST +static struct device_attribute dev_attr_fail = + __ATTR(make-it-fail, 0644, part_fail_show, part_fail_store); +#endif + +static struct attribute *part_attrs[] = { + &dev_attr_partition.attr, + &dev_attr_start.attr, + &dev_attr_size.attr, + &dev_attr_ro.attr, + &dev_attr_alignment_offset.attr, + &dev_attr_discard_alignment.attr, + &dev_attr_stat.attr, + &dev_attr_inflight.attr, +#ifdef CONFIG_FAIL_MAKE_REQUEST + &dev_attr_fail.attr, +#endif + NULL +}; + +static struct attribute_group part_attr_group = { + .attrs = part_attrs, +}; + +static const struct attribute_group *part_attr_groups[] = { + &part_attr_group, +#ifdef CONFIG_BLK_DEV_IO_TRACE + &blk_trace_attr_group, +#endif + NULL +}; + +static void part_release(struct device *dev) +{ + struct hd_struct *p = dev_to_part(dev); + blk_free_devt(dev->devt); + hd_free_part(p); + kfree(p); +} + +static int part_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + struct hd_struct *part = dev_to_part(dev); + + add_uevent_var(env, "PARTN=%u", part->partno); + if (part->info && part->info->volname[0]) + add_uevent_var(env, "PARTNAME=%s", part->info->volname); + return 0; +} + +struct device_type part_type = { + .name = "partition", + .groups = part_attr_groups, + .release = part_release, + .uevent = part_uevent, +}; + +static void hd_struct_free_work(struct work_struct *work) +{ + struct hd_struct *part = + container_of(to_rcu_work(work), struct hd_struct, rcu_work); + struct gendisk *disk = part_to_disk(part); + + /* + * Release the disk reference acquired in delete_partition here. + * We can't release it in hd_struct_free because the final put_device + * needs process context and thus can't be run directly from a + * percpu_ref ->release handler. + */ + put_device(disk_to_dev(disk)); + + part->start_sect = 0; + part->nr_sects = 0; + part_stat_set_all(part, 0); + put_device(part_to_dev(part)); +} + +static void hd_struct_free(struct percpu_ref *ref) +{ + struct hd_struct *part = container_of(ref, struct hd_struct, ref); + struct gendisk *disk = part_to_disk(part); + struct disk_part_tbl *ptbl = + rcu_dereference_protected(disk->part_tbl, 1); + + rcu_assign_pointer(ptbl->last_lookup, NULL); + + INIT_RCU_WORK(&part->rcu_work, hd_struct_free_work); + queue_rcu_work(system_wq, &part->rcu_work); +} + +int hd_ref_init(struct hd_struct *part) +{ + if (percpu_ref_init(&part->ref, hd_struct_free, 0, GFP_KERNEL)) + return -ENOMEM; + return 0; +} + +/* + * Must be called either with bd_mutex held, before a disk can be opened or + * after all disk users are gone. + */ +void delete_partition(struct hd_struct *part) +{ + struct gendisk *disk = part_to_disk(part); + struct disk_part_tbl *ptbl = + rcu_dereference_protected(disk->part_tbl, 1); + struct block_device *bdev; + + /* + * ->part_tbl is referenced in this part's release handler, so + * we have to hold the disk device + */ + get_device(disk_to_dev(disk)); + rcu_assign_pointer(ptbl->part[part->partno], NULL); + kobject_put(part->holder_dir); + device_del(part_to_dev(part)); + + /* + * Remove gendisk pointer from idr so that it cannot be looked up + * while RCU period before freeing gendisk is running to prevent + * use-after-free issues. Note that the device number stays + * "in-use" until we really free the gendisk. + */ + blk_invalidate_devt(part_devt(part)); + + bdev = bdget_part(part); + if (bdev) { + remove_inode_hash(bdev->bd_inode); + bdput(bdev); + } + percpu_ref_kill(&part->ref); +} + +static ssize_t whole_disk_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return 0; +} +static DEVICE_ATTR(whole_disk, 0444, whole_disk_show, NULL); + +/* + * Must be called either with bd_mutex held, before a disk can be opened or + * after all disk users are gone. + */ +static struct hd_struct *add_partition(struct gendisk *disk, int partno, + sector_t start, sector_t len, int flags, + struct partition_meta_info *info) +{ + struct hd_struct *p; + dev_t devt = MKDEV(0, 0); + struct device *ddev = disk_to_dev(disk); + struct device *pdev; + struct disk_part_tbl *ptbl; + const char *dname; + int err; + + /* + * Partitions are not supported on zoned block devices that are used as + * such. + */ + switch (disk->queue->limits.zoned) { + case BLK_ZONED_HM: + pr_warn("%s: partitions not supported on host managed zoned block device\n", + disk->disk_name); + return ERR_PTR(-ENXIO); + case BLK_ZONED_HA: + pr_info("%s: disabling host aware zoned block device support due to partitions\n", + disk->disk_name); + disk->queue->limits.zoned = BLK_ZONED_NONE; + break; + case BLK_ZONED_NONE: + break; + } + + err = disk_expand_part_tbl(disk, partno); + if (err) + return ERR_PTR(err); + ptbl = rcu_dereference_protected(disk->part_tbl, 1); + + if (ptbl->part[partno]) + return ERR_PTR(-EBUSY); + + p = kzalloc(sizeof(*p), GFP_KERNEL); + if (!p) + return ERR_PTR(-EBUSY); + + p->dkstats = alloc_percpu(struct disk_stats); + if (!p->dkstats) { + err = -ENOMEM; + goto out_free; + } + + hd_sects_seq_init(p); + pdev = part_to_dev(p); + + p->start_sect = start; + p->nr_sects = len; + p->partno = partno; + p->policy = get_disk_ro(disk); + + if (info) { + struct partition_meta_info *pinfo; + + pinfo = kzalloc_node(sizeof(*pinfo), GFP_KERNEL, disk->node_id); + if (!pinfo) { + err = -ENOMEM; + goto out_free_stats; + } + memcpy(pinfo, info, sizeof(*info)); + p->info = pinfo; + } + + dname = dev_name(ddev); + if (isdigit(dname[strlen(dname) - 1])) + dev_set_name(pdev, "%sp%d", dname, partno); + else + dev_set_name(pdev, "%s%d", dname, partno); + + device_initialize(pdev); + pdev->class = &block_class; + pdev->type = &part_type; + pdev->parent = ddev; + + err = blk_alloc_devt(p, &devt); + if (err) + goto out_free_info; + pdev->devt = devt; + + /* delay uevent until 'holders' subdir is created */ + dev_set_uevent_suppress(pdev, 1); + err = device_add(pdev); + if (err) + goto out_put; + + err = -ENOMEM; + p->holder_dir = kobject_create_and_add("holders", &pdev->kobj); + if (!p->holder_dir) + goto out_del; + + dev_set_uevent_suppress(pdev, 0); + if (flags & ADDPART_FLAG_WHOLEDISK) { + err = device_create_file(pdev, &dev_attr_whole_disk); + if (err) + goto out_del; + } + + err = hd_ref_init(p); + if (err) { + if (flags & ADDPART_FLAG_WHOLEDISK) + goto out_remove_file; + goto out_del; + } + + /* everything is up and running, commence */ + rcu_assign_pointer(ptbl->part[partno], p); + + /* suppress uevent if the disk suppresses it */ + if (!dev_get_uevent_suppress(ddev)) + kobject_uevent(&pdev->kobj, KOBJ_ADD); + return p; + +out_free_info: + kfree(p->info); +out_free_stats: + free_percpu(p->dkstats); +out_free: + kfree(p); + return ERR_PTR(err); +out_remove_file: + device_remove_file(pdev, &dev_attr_whole_disk); +out_del: + kobject_put(p->holder_dir); + device_del(pdev); +out_put: + put_device(pdev); + return ERR_PTR(err); +} + +static bool partition_overlaps(struct gendisk *disk, sector_t start, + sector_t length, int skip_partno) +{ + struct disk_part_iter piter; + struct hd_struct *part; + bool overlap = false; + + disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); + while ((part = disk_part_iter_next(&piter))) { + if (part->partno == skip_partno || + start >= part->start_sect + part->nr_sects || + start + length <= part->start_sect) + continue; + overlap = true; + break; + } + + disk_part_iter_exit(&piter); + return overlap; +} + +int bdev_add_partition(struct block_device *bdev, int partno, + sector_t start, sector_t length) +{ + struct hd_struct *part; + + mutex_lock(&bdev->bd_mutex); + if (partition_overlaps(bdev->bd_disk, start, length, -1)) { + mutex_unlock(&bdev->bd_mutex); + return -EBUSY; + } + + part = add_partition(bdev->bd_disk, partno, start, length, + ADDPART_FLAG_NONE, NULL); + mutex_unlock(&bdev->bd_mutex); + return PTR_ERR_OR_ZERO(part); +} + +int bdev_del_partition(struct block_device *bdev, int partno) +{ + struct block_device *bdevp; + struct hd_struct *part = NULL; + int ret; + + bdevp = bdget_disk(bdev->bd_disk, partno); + if (!bdevp) + return -ENXIO; + + mutex_lock(&bdevp->bd_mutex); + mutex_lock_nested(&bdev->bd_mutex, 1); + + ret = -ENXIO; + part = disk_get_part(bdev->bd_disk, partno); + if (!part) + goto out_unlock; + + ret = -EBUSY; + if (bdevp->bd_openers) + goto out_unlock; + + sync_blockdev(bdevp); + invalidate_bdev(bdevp); + + delete_partition(part); + ret = 0; +out_unlock: + mutex_unlock(&bdev->bd_mutex); + mutex_unlock(&bdevp->bd_mutex); + bdput(bdevp); + if (part) + disk_put_part(part); + return ret; +} + +int bdev_resize_partition(struct block_device *bdev, int partno, + sector_t start, sector_t length) +{ + struct block_device *bdevp; + struct hd_struct *part; + int ret = 0; + + part = disk_get_part(bdev->bd_disk, partno); + if (!part) + return -ENXIO; + + ret = -ENOMEM; + bdevp = bdget_part(part); + if (!bdevp) + goto out_put_part; + + mutex_lock(&bdevp->bd_mutex); + mutex_lock_nested(&bdev->bd_mutex, 1); + + ret = -EINVAL; + if (start != part->start_sect) + goto out_unlock; + + ret = -EBUSY; + if (partition_overlaps(bdev->bd_disk, start, length, partno)) + goto out_unlock; + + part_nr_sects_write(part, length); + bd_set_nr_sectors(bdevp, length); + + ret = 0; +out_unlock: + mutex_unlock(&bdevp->bd_mutex); + mutex_unlock(&bdev->bd_mutex); + bdput(bdevp); +out_put_part: + disk_put_part(part); + return ret; +} + +static bool disk_unlock_native_capacity(struct gendisk *disk) +{ + const struct block_device_operations *bdops = disk->fops; + + if (bdops->unlock_native_capacity && + !(disk->flags & GENHD_FL_NATIVE_CAPACITY)) { + printk(KERN_CONT "enabling native capacity\n"); + bdops->unlock_native_capacity(disk); + disk->flags |= GENHD_FL_NATIVE_CAPACITY; + return true; + } else { + printk(KERN_CONT "truncated\n"); + return false; + } +} + +int blk_drop_partitions(struct block_device *bdev) +{ + struct disk_part_iter piter; + struct hd_struct *part; + + if (bdev->bd_part_count) + return -EBUSY; + + sync_blockdev(bdev); + invalidate_bdev(bdev); + + disk_part_iter_init(&piter, bdev->bd_disk, DISK_PITER_INCL_EMPTY); + while ((part = disk_part_iter_next(&piter))) + delete_partition(part); + disk_part_iter_exit(&piter); + + return 0; +} +#ifdef CONFIG_S390 +/* for historic reasons in the DASD driver */ +EXPORT_SYMBOL_GPL(blk_drop_partitions); +#endif + +static bool blk_add_partition(struct gendisk *disk, struct block_device *bdev, + struct parsed_partitions *state, int p) +{ + sector_t size = state->parts[p].size; + sector_t from = state->parts[p].from; + struct hd_struct *part; + + if (!size) + return true; + + if (from >= get_capacity(disk)) { + printk(KERN_WARNING + "%s: p%d start %llu is beyond EOD, ", + disk->disk_name, p, (unsigned long long) from); + if (disk_unlock_native_capacity(disk)) + return false; + return true; + } + + if (from + size > get_capacity(disk)) { + printk(KERN_WARNING + "%s: p%d size %llu extends beyond EOD, ", + disk->disk_name, p, (unsigned long long) size); + + if (disk_unlock_native_capacity(disk)) + return false; + + /* + * We can not ignore partitions of broken tables created by for + * example camera firmware, but we limit them to the end of the + * disk to avoid creating invalid block devices. + */ + size = get_capacity(disk) - from; + } + + part = add_partition(disk, p, from, size, state->parts[p].flags, + &state->parts[p].info); + if (IS_ERR(part) && PTR_ERR(part) != -ENXIO) { + printk(KERN_ERR " %s: p%d could not be added: %ld\n", + disk->disk_name, p, -PTR_ERR(part)); + return true; + } + + if (IS_BUILTIN(CONFIG_BLK_DEV_MD) && + (state->parts[p].flags & ADDPART_FLAG_RAID)) + md_autodetect_dev(part_to_dev(part)->devt); + + return true; +} + +int blk_add_partitions(struct gendisk *disk, struct block_device *bdev) +{ + struct parsed_partitions *state; + int ret = -EAGAIN, p, highest; + + if (!disk_part_scan_enabled(disk)) + return 0; + + state = check_partition(disk, bdev); + if (!state) + return 0; + if (IS_ERR(state)) { + /* + * I/O error reading the partition table. If we tried to read + * beyond EOD, retry after unlocking the native capacity. + */ + if (PTR_ERR(state) == -ENOSPC) { + printk(KERN_WARNING "%s: partition table beyond EOD, ", + disk->disk_name); + if (disk_unlock_native_capacity(disk)) + return -EAGAIN; + } + return -EIO; + } + + /* + * Partitions are not supported on host managed zoned block devices. + */ + if (disk->queue->limits.zoned == BLK_ZONED_HM) { + pr_warn("%s: ignoring partition table on host managed zoned block device\n", + disk->disk_name); + ret = 0; + goto out_free_state; + } + + /* + * If we read beyond EOD, try unlocking native capacity even if the + * partition table was successfully read as we could be missing some + * partitions. + */ + if (state->access_beyond_eod) { + printk(KERN_WARNING + "%s: partition table partially beyond EOD, ", + disk->disk_name); + if (disk_unlock_native_capacity(disk)) + goto out_free_state; + } + + /* tell userspace that the media / partition table may have changed */ + kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); + + /* + * Detect the highest partition number and preallocate disk->part_tbl. + * This is an optimization and not strictly necessary. + */ + for (p = 1, highest = 0; p < state->limit; p++) + if (state->parts[p].size) + highest = p; + disk_expand_part_tbl(disk, highest); + + for (p = 1; p < state->limit; p++) + if (!blk_add_partition(disk, bdev, state, p)) + goto out_free_state; + + ret = 0; +out_free_state: + free_partitions(state); + return ret; +} + +void *read_part_sector(struct parsed_partitions *state, sector_t n, Sector *p) +{ + struct address_space *mapping = state->bdev->bd_inode->i_mapping; + struct page *page; + + if (n >= get_capacity(state->bdev->bd_disk)) { + state->access_beyond_eod = true; + return NULL; + } + + page = read_mapping_page(mapping, + (pgoff_t)(n >> (PAGE_SHIFT - 9)), NULL); + if (IS_ERR(page)) + goto out; + if (PageError(page)) + goto out_put_page; + + p->v = page; + return (unsigned char *)page_address(page) + + ((n & ((1 << (PAGE_SHIFT - 9)) - 1)) << SECTOR_SHIFT); +out_put_page: + put_page(page); +out: + p->v = NULL; + return NULL; +} diff --git a/block/partitions/efi.c b/block/partitions/efi.c new file mode 100644 index 000000000..b64bfdd43 --- /dev/null +++ b/block/partitions/efi.c @@ -0,0 +1,747 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/************************************************************ + * EFI GUID Partition Table handling + * + * http://www.uefi.org/specs/ + * http://www.intel.com/technology/efi/ + * + * efi.[ch] by Matt Domsch + * Copyright 2000,2001,2002,2004 Dell Inc. + * + * TODO: + * + * Changelog: + * Mon August 5th, 2013 Davidlohr Bueso + * - detect hybrid MBRs, tighter pMBR checking & cleanups. + * + * Mon Nov 09 2004 Matt Domsch + * - test for valid PMBR and valid PGPT before ever reading + * AGPT, allow override with 'gpt' kernel command line option. + * - check for first/last_usable_lba outside of size of disk + * + * Tue Mar 26 2002 Matt Domsch + * - Ported to 2.5.7-pre1 and 2.5.7-dj2 + * - Applied patch to avoid fault in alternate header handling + * - cleaned up find_valid_gpt + * - On-disk structure and copy in memory is *always* LE now - + * swab fields as needed + * - remove print_gpt_header() + * - only use first max_p partition entries, to keep the kernel minor number + * and partition numbers tied. + * + * Mon Feb 04 2002 Matt Domsch + * - Removed __PRIPTR_PREFIX - not being used + * + * Mon Jan 14 2002 Matt Domsch + * - Ported to 2.5.2-pre11 + library crc32 patch Linus applied + * + * Thu Dec 6 2001 Matt Domsch + * - Added compare_gpts(). + * - moved le_efi_guid_to_cpus() back into this file. GPT is the only + * thing that keeps EFI GUIDs on disk. + * - Changed gpt structure names and members to be simpler and more Linux-like. + * + * Wed Oct 17 2001 Matt Domsch + * - Removed CONFIG_DEVFS_VOLUMES_UUID code entirely per Martin Wilck + * + * Wed Oct 10 2001 Matt Domsch + * - Changed function comments to DocBook style per Andreas Dilger suggestion. + * + * Mon Oct 08 2001 Matt Domsch + * - Change read_lba() to use the page cache per Al Viro's work. + * - print u64s properly on all architectures + * - fixed debug_printk(), now Dprintk() + * + * Mon Oct 01 2001 Matt Domsch + * - Style cleanups + * - made most functions static + * - Endianness addition + * - remove test for second alternate header, as it's not per spec, + * and is unnecessary. There's now a method to read/write the last + * sector of an odd-sized disk from user space. No tools have ever + * been released which used this code, so it's effectively dead. + * - Per Asit Mallick of Intel, added a test for a valid PMBR. + * - Added kernel command line option 'gpt' to override valid PMBR test. + * + * Wed Jun 6 2001 Martin Wilck + * - added devfs volume UUID support (/dev/volumes/uuids) for + * mounting file systems by the partition GUID. + * + * Tue Dec 5 2000 Matt Domsch + * - Moved crc32() to linux/lib, added efi_crc32(). + * + * Thu Nov 30 2000 Matt Domsch + * - Replaced Intel's CRC32 function with an equivalent + * non-license-restricted version. + * + * Wed Oct 25 2000 Matt Domsch + * - Fixed the last_lba() call to return the proper last block + * + * Thu Oct 12 2000 Matt Domsch + * - Thanks to Andries Brouwer for his debugging assistance. + * - Code works, detects all the partitions. + * + ************************************************************/ +#include +#include +#include +#include +#include +#include "check.h" +#include "efi.h" + +/* This allows a kernel command line option 'gpt' to override + * the test for invalid PMBR. Not __initdata because reloading + * the partition tables happens after init too. + */ +static int force_gpt; +static int __init +force_gpt_fn(char *str) +{ + force_gpt = 1; + return 1; +} +__setup("gpt", force_gpt_fn); + + +/** + * efi_crc32() - EFI version of crc32 function + * @buf: buffer to calculate crc32 of + * @len: length of buf + * + * Description: Returns EFI-style CRC32 value for @buf + * + * This function uses the little endian Ethernet polynomial + * but seeds the function with ~0, and xor's with ~0 at the end. + * Note, the EFI Specification, v1.02, has a reference to + * Dr. Dobbs Journal, May 1994 (actually it's in May 1992). + */ +static inline u32 +efi_crc32(const void *buf, unsigned long len) +{ + return (crc32(~0L, buf, len) ^ ~0L); +} + +/** + * last_lba(): return number of last logical block of device + * @bdev: block device + * + * Description: Returns last LBA value on success, 0 on error. + * This is stored (by sd and ide-geometry) in + * the part[0] entry for this disk, and is the number of + * physical sectors available on the disk. + */ +static u64 last_lba(struct block_device *bdev) +{ + if (!bdev || !bdev->bd_inode) + return 0; + return div_u64(bdev->bd_inode->i_size, + bdev_logical_block_size(bdev)) - 1ULL; +} + +static inline int pmbr_part_valid(gpt_mbr_record *part) +{ + if (part->os_type != EFI_PMBR_OSTYPE_EFI_GPT) + goto invalid; + + /* set to 0x00000001 (i.e., the LBA of the GPT Partition Header) */ + if (le32_to_cpu(part->starting_lba) != GPT_PRIMARY_PARTITION_TABLE_LBA) + goto invalid; + + return GPT_MBR_PROTECTIVE; +invalid: + return 0; +} + +/** + * is_pmbr_valid(): test Protective MBR for validity + * @mbr: pointer to a legacy mbr structure + * @total_sectors: amount of sectors in the device + * + * Description: Checks for a valid protective or hybrid + * master boot record (MBR). The validity of a pMBR depends + * on all of the following properties: + * 1) MSDOS signature is in the last two bytes of the MBR + * 2) One partition of type 0xEE is found + * + * In addition, a hybrid MBR will have up to three additional + * primary partitions, which point to the same space that's + * marked out by up to three GPT partitions. + * + * Returns 0 upon invalid MBR, or GPT_MBR_PROTECTIVE or + * GPT_MBR_HYBRID depending on the device layout. + */ +static int is_pmbr_valid(legacy_mbr *mbr, sector_t total_sectors) +{ + uint32_t sz = 0; + int i, part = 0, ret = 0; /* invalid by default */ + + if (!mbr || le16_to_cpu(mbr->signature) != MSDOS_MBR_SIGNATURE) + goto done; + + for (i = 0; i < 4; i++) { + ret = pmbr_part_valid(&mbr->partition_record[i]); + if (ret == GPT_MBR_PROTECTIVE) { + part = i; + /* + * Ok, we at least know that there's a protective MBR, + * now check if there are other partition types for + * hybrid MBR. + */ + goto check_hybrid; + } + } + + if (ret != GPT_MBR_PROTECTIVE) + goto done; +check_hybrid: + for (i = 0; i < 4; i++) + if ((mbr->partition_record[i].os_type != + EFI_PMBR_OSTYPE_EFI_GPT) && + (mbr->partition_record[i].os_type != 0x00)) + ret = GPT_MBR_HYBRID; + + /* + * Protective MBRs take up the lesser of the whole disk + * or 2 TiB (32bit LBA), ignoring the rest of the disk. + * Some partitioning programs, nonetheless, choose to set + * the size to the maximum 32-bit limitation, disregarding + * the disk size. + * + * Hybrid MBRs do not necessarily comply with this. + * + * Consider a bad value here to be a warning to support dd'ing + * an image from a smaller disk to a larger disk. + */ + if (ret == GPT_MBR_PROTECTIVE) { + sz = le32_to_cpu(mbr->partition_record[part].size_in_lba); + if (sz != (uint32_t) total_sectors - 1 && sz != 0xFFFFFFFF) + pr_debug("GPT: mbr size in lba (%u) different than whole disk (%u).\n", + sz, min_t(uint32_t, + total_sectors - 1, 0xFFFFFFFF)); + } +done: + return ret; +} + +/** + * read_lba(): Read bytes from disk, starting at given LBA + * @state: disk parsed partitions + * @lba: the Logical Block Address of the partition table + * @buffer: destination buffer + * @count: bytes to read + * + * Description: Reads @count bytes from @state->bdev into @buffer. + * Returns number of bytes read on success, 0 on error. + */ +static size_t read_lba(struct parsed_partitions *state, + u64 lba, u8 *buffer, size_t count) +{ + size_t totalreadcount = 0; + struct block_device *bdev = state->bdev; + sector_t n = lba * (bdev_logical_block_size(bdev) / 512); + + if (!buffer || lba > last_lba(bdev)) + return 0; + + while (count) { + int copied = 512; + Sector sect; + unsigned char *data = read_part_sector(state, n++, §); + if (!data) + break; + if (copied > count) + copied = count; + memcpy(buffer, data, copied); + put_dev_sector(sect); + buffer += copied; + totalreadcount +=copied; + count -= copied; + } + return totalreadcount; +} + +/** + * alloc_read_gpt_entries(): reads partition entries from disk + * @state: disk parsed partitions + * @gpt: GPT header + * + * Description: Returns ptes on success, NULL on error. + * Allocates space for PTEs based on information found in @gpt. + * Notes: remember to free pte when you're done! + */ +static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state, + gpt_header *gpt) +{ + size_t count; + gpt_entry *pte; + + if (!gpt) + return NULL; + + count = (size_t)le32_to_cpu(gpt->num_partition_entries) * + le32_to_cpu(gpt->sizeof_partition_entry); + if (!count) + return NULL; + pte = kmalloc(count, GFP_KERNEL); + if (!pte) + return NULL; + + if (read_lba(state, le64_to_cpu(gpt->partition_entry_lba), + (u8 *) pte, count) < count) { + kfree(pte); + pte=NULL; + return NULL; + } + return pte; +} + +/** + * alloc_read_gpt_header(): Allocates GPT header, reads into it from disk + * @state: disk parsed partitions + * @lba: the Logical Block Address of the partition table + * + * Description: returns GPT header on success, NULL on error. Allocates + * and fills a GPT header starting at @ from @state->bdev. + * Note: remember to free gpt when finished with it. + */ +static gpt_header *alloc_read_gpt_header(struct parsed_partitions *state, + u64 lba) +{ + gpt_header *gpt; + unsigned ssz = bdev_logical_block_size(state->bdev); + + gpt = kmalloc(ssz, GFP_KERNEL); + if (!gpt) + return NULL; + + if (read_lba(state, lba, (u8 *) gpt, ssz) < ssz) { + kfree(gpt); + gpt=NULL; + return NULL; + } + + return gpt; +} + +/** + * is_gpt_valid() - tests one GPT header and PTEs for validity + * @state: disk parsed partitions + * @lba: logical block address of the GPT header to test + * @gpt: GPT header ptr, filled on return. + * @ptes: PTEs ptr, filled on return. + * + * Description: returns 1 if valid, 0 on error. + * If valid, returns pointers to newly allocated GPT header and PTEs. + */ +static int is_gpt_valid(struct parsed_partitions *state, u64 lba, + gpt_header **gpt, gpt_entry **ptes) +{ + u32 crc, origcrc; + u64 lastlba, pt_size; + + if (!ptes) + return 0; + if (!(*gpt = alloc_read_gpt_header(state, lba))) + return 0; + + /* Check the GUID Partition Table signature */ + if (le64_to_cpu((*gpt)->signature) != GPT_HEADER_SIGNATURE) { + pr_debug("GUID Partition Table Header signature is wrong:" + "%lld != %lld\n", + (unsigned long long)le64_to_cpu((*gpt)->signature), + (unsigned long long)GPT_HEADER_SIGNATURE); + goto fail; + } + + /* Check the GUID Partition Table header size is too big */ + if (le32_to_cpu((*gpt)->header_size) > + bdev_logical_block_size(state->bdev)) { + pr_debug("GUID Partition Table Header size is too large: %u > %u\n", + le32_to_cpu((*gpt)->header_size), + bdev_logical_block_size(state->bdev)); + goto fail; + } + + /* Check the GUID Partition Table header size is too small */ + if (le32_to_cpu((*gpt)->header_size) < sizeof(gpt_header)) { + pr_debug("GUID Partition Table Header size is too small: %u < %zu\n", + le32_to_cpu((*gpt)->header_size), + sizeof(gpt_header)); + goto fail; + } + + /* Check the GUID Partition Table CRC */ + origcrc = le32_to_cpu((*gpt)->header_crc32); + (*gpt)->header_crc32 = 0; + crc = efi_crc32((const unsigned char *) (*gpt), le32_to_cpu((*gpt)->header_size)); + + if (crc != origcrc) { + pr_debug("GUID Partition Table Header CRC is wrong: %x != %x\n", + crc, origcrc); + goto fail; + } + (*gpt)->header_crc32 = cpu_to_le32(origcrc); + + /* Check that the my_lba entry points to the LBA that contains + * the GUID Partition Table */ + if (le64_to_cpu((*gpt)->my_lba) != lba) { + pr_debug("GPT my_lba incorrect: %lld != %lld\n", + (unsigned long long)le64_to_cpu((*gpt)->my_lba), + (unsigned long long)lba); + goto fail; + } + + /* Check the first_usable_lba and last_usable_lba are + * within the disk. + */ + lastlba = last_lba(state->bdev); + if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) { + pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n", + (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba), + (unsigned long long)lastlba); + goto fail; + } + if (le64_to_cpu((*gpt)->last_usable_lba) > lastlba) { + pr_debug("GPT: last_usable_lba incorrect: %lld > %lld\n", + (unsigned long long)le64_to_cpu((*gpt)->last_usable_lba), + (unsigned long long)lastlba); + goto fail; + } + if (le64_to_cpu((*gpt)->last_usable_lba) < le64_to_cpu((*gpt)->first_usable_lba)) { + pr_debug("GPT: last_usable_lba incorrect: %lld > %lld\n", + (unsigned long long)le64_to_cpu((*gpt)->last_usable_lba), + (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba)); + goto fail; + } + /* Check that sizeof_partition_entry has the correct value */ + if (le32_to_cpu((*gpt)->sizeof_partition_entry) != sizeof(gpt_entry)) { + pr_debug("GUID Partition Entry Size check failed.\n"); + goto fail; + } + + /* Sanity check partition table size */ + pt_size = (u64)le32_to_cpu((*gpt)->num_partition_entries) * + le32_to_cpu((*gpt)->sizeof_partition_entry); + if (pt_size > KMALLOC_MAX_SIZE) { + pr_debug("GUID Partition Table is too large: %llu > %lu bytes\n", + (unsigned long long)pt_size, KMALLOC_MAX_SIZE); + goto fail; + } + + if (!(*ptes = alloc_read_gpt_entries(state, *gpt))) + goto fail; + + /* Check the GUID Partition Entry Array CRC */ + crc = efi_crc32((const unsigned char *) (*ptes), pt_size); + + if (crc != le32_to_cpu((*gpt)->partition_entry_array_crc32)) { + pr_debug("GUID Partition Entry Array CRC check failed.\n"); + goto fail_ptes; + } + + /* We're done, all's well */ + return 1; + + fail_ptes: + kfree(*ptes); + *ptes = NULL; + fail: + kfree(*gpt); + *gpt = NULL; + return 0; +} + +/** + * is_pte_valid() - tests one PTE for validity + * @pte:pte to check + * @lastlba: last lba of the disk + * + * Description: returns 1 if valid, 0 on error. + */ +static inline int +is_pte_valid(const gpt_entry *pte, const u64 lastlba) +{ + if ((!efi_guidcmp(pte->partition_type_guid, NULL_GUID)) || + le64_to_cpu(pte->starting_lba) > lastlba || + le64_to_cpu(pte->ending_lba) > lastlba) + return 0; + return 1; +} + +/** + * compare_gpts() - Search disk for valid GPT headers and PTEs + * @pgpt: primary GPT header + * @agpt: alternate GPT header + * @lastlba: last LBA number + * + * Description: Returns nothing. Sanity checks pgpt and agpt fields + * and prints warnings on discrepancies. + * + */ +static void +compare_gpts(gpt_header *pgpt, gpt_header *agpt, u64 lastlba) +{ + int error_found = 0; + if (!pgpt || !agpt) + return; + if (le64_to_cpu(pgpt->my_lba) != le64_to_cpu(agpt->alternate_lba)) { + pr_warn("GPT:Primary header LBA != Alt. header alternate_lba\n"); + pr_warn("GPT:%lld != %lld\n", + (unsigned long long)le64_to_cpu(pgpt->my_lba), + (unsigned long long)le64_to_cpu(agpt->alternate_lba)); + error_found++; + } + if (le64_to_cpu(pgpt->alternate_lba) != le64_to_cpu(agpt->my_lba)) { + pr_warn("GPT:Primary header alternate_lba != Alt. header my_lba\n"); + pr_warn("GPT:%lld != %lld\n", + (unsigned long long)le64_to_cpu(pgpt->alternate_lba), + (unsigned long long)le64_to_cpu(agpt->my_lba)); + error_found++; + } + if (le64_to_cpu(pgpt->first_usable_lba) != + le64_to_cpu(agpt->first_usable_lba)) { + pr_warn("GPT:first_usable_lbas don't match.\n"); + pr_warn("GPT:%lld != %lld\n", + (unsigned long long)le64_to_cpu(pgpt->first_usable_lba), + (unsigned long long)le64_to_cpu(agpt->first_usable_lba)); + error_found++; + } + if (le64_to_cpu(pgpt->last_usable_lba) != + le64_to_cpu(agpt->last_usable_lba)) { + pr_warn("GPT:last_usable_lbas don't match.\n"); + pr_warn("GPT:%lld != %lld\n", + (unsigned long long)le64_to_cpu(pgpt->last_usable_lba), + (unsigned long long)le64_to_cpu(agpt->last_usable_lba)); + error_found++; + } + if (efi_guidcmp(pgpt->disk_guid, agpt->disk_guid)) { + pr_warn("GPT:disk_guids don't match.\n"); + error_found++; + } + if (le32_to_cpu(pgpt->num_partition_entries) != + le32_to_cpu(agpt->num_partition_entries)) { + pr_warn("GPT:num_partition_entries don't match: " + "0x%x != 0x%x\n", + le32_to_cpu(pgpt->num_partition_entries), + le32_to_cpu(agpt->num_partition_entries)); + error_found++; + } + if (le32_to_cpu(pgpt->sizeof_partition_entry) != + le32_to_cpu(agpt->sizeof_partition_entry)) { + pr_warn("GPT:sizeof_partition_entry values don't match: " + "0x%x != 0x%x\n", + le32_to_cpu(pgpt->sizeof_partition_entry), + le32_to_cpu(agpt->sizeof_partition_entry)); + error_found++; + } + if (le32_to_cpu(pgpt->partition_entry_array_crc32) != + le32_to_cpu(agpt->partition_entry_array_crc32)) { + pr_warn("GPT:partition_entry_array_crc32 values don't match: " + "0x%x != 0x%x\n", + le32_to_cpu(pgpt->partition_entry_array_crc32), + le32_to_cpu(agpt->partition_entry_array_crc32)); + error_found++; + } + if (le64_to_cpu(pgpt->alternate_lba) != lastlba) { + pr_warn("GPT:Primary header thinks Alt. header is not at the end of the disk.\n"); + pr_warn("GPT:%lld != %lld\n", + (unsigned long long)le64_to_cpu(pgpt->alternate_lba), + (unsigned long long)lastlba); + error_found++; + } + + if (le64_to_cpu(agpt->my_lba) != lastlba) { + pr_warn("GPT:Alternate GPT header not at the end of the disk.\n"); + pr_warn("GPT:%lld != %lld\n", + (unsigned long long)le64_to_cpu(agpt->my_lba), + (unsigned long long)lastlba); + error_found++; + } + + if (error_found) + pr_warn("GPT: Use GNU Parted to correct GPT errors.\n"); + return; +} + +/** + * find_valid_gpt() - Search disk for valid GPT headers and PTEs + * @state: disk parsed partitions + * @gpt: GPT header ptr, filled on return. + * @ptes: PTEs ptr, filled on return. + * + * Description: Returns 1 if valid, 0 on error. + * If valid, returns pointers to newly allocated GPT header and PTEs. + * Validity depends on PMBR being valid (or being overridden by the + * 'gpt' kernel command line option) and finding either the Primary + * GPT header and PTEs valid, or the Alternate GPT header and PTEs + * valid. If the Primary GPT header is not valid, the Alternate GPT header + * is not checked unless the 'gpt' kernel command line option is passed. + * This protects against devices which misreport their size, and forces + * the user to decide to use the Alternate GPT. + */ +static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, + gpt_entry **ptes) +{ + int good_pgpt = 0, good_agpt = 0, good_pmbr = 0; + gpt_header *pgpt = NULL, *agpt = NULL; + gpt_entry *pptes = NULL, *aptes = NULL; + legacy_mbr *legacymbr; + sector_t total_sectors = i_size_read(state->bdev->bd_inode) >> 9; + u64 lastlba; + + if (!ptes) + return 0; + + lastlba = last_lba(state->bdev); + if (!force_gpt) { + /* This will be added to the EFI Spec. per Intel after v1.02. */ + legacymbr = kzalloc(sizeof(*legacymbr), GFP_KERNEL); + if (!legacymbr) + goto fail; + + read_lba(state, 0, (u8 *)legacymbr, sizeof(*legacymbr)); + good_pmbr = is_pmbr_valid(legacymbr, total_sectors); + kfree(legacymbr); + + if (!good_pmbr) + goto fail; + + pr_debug("Device has a %s MBR\n", + good_pmbr == GPT_MBR_PROTECTIVE ? + "protective" : "hybrid"); + } + + good_pgpt = is_gpt_valid(state, GPT_PRIMARY_PARTITION_TABLE_LBA, + &pgpt, &pptes); + if (good_pgpt) + good_agpt = is_gpt_valid(state, + le64_to_cpu(pgpt->alternate_lba), + &agpt, &aptes); + if (!good_agpt && force_gpt) + good_agpt = is_gpt_valid(state, lastlba, &agpt, &aptes); + + /* The obviously unsuccessful case */ + if (!good_pgpt && !good_agpt) + goto fail; + + compare_gpts(pgpt, agpt, lastlba); + + /* The good cases */ + if (good_pgpt) { + *gpt = pgpt; + *ptes = pptes; + kfree(agpt); + kfree(aptes); + if (!good_agpt) + pr_warn("Alternate GPT is invalid, using primary GPT.\n"); + return 1; + } + else if (good_agpt) { + *gpt = agpt; + *ptes = aptes; + kfree(pgpt); + kfree(pptes); + pr_warn("Primary GPT is invalid, using alternate GPT.\n"); + return 1; + } + + fail: + kfree(pgpt); + kfree(agpt); + kfree(pptes); + kfree(aptes); + *gpt = NULL; + *ptes = NULL; + return 0; +} + +/** + * utf16_le_to_7bit(): Naively converts a UTF-16LE string to 7-bit ASCII characters + * @in: input UTF-16LE string + * @size: size of the input string + * @out: output string ptr, should be capable to store @size+1 characters + * + * Description: Converts @size UTF16-LE symbols from @in string to 7-bit + * ASCII characters and stores them to @out. Adds trailing zero to @out array. + */ +static void utf16_le_to_7bit(const __le16 *in, unsigned int size, u8 *out) +{ + unsigned int i = 0; + + out[size] = 0; + + while (i < size) { + u8 c = le16_to_cpu(in[i]) & 0xff; + + if (c && !isprint(c)) + c = '!'; + out[i] = c; + i++; + } +} + +/** + * efi_partition(struct parsed_partitions *state) + * @state: disk parsed partitions + * + * Description: called from check.c, if the disk contains GPT + * partitions, sets up partition entries in the kernel. + * + * If the first block on the disk is a legacy MBR, + * it will get handled by msdos_partition(). + * If it's a Protective MBR, we'll handle it here. + * + * We do not create a Linux partition for GPT, but + * only for the actual data partitions. + * Returns: + * -1 if unable to read the partition table + * 0 if this isn't our partition table + * 1 if successful + * + */ +int efi_partition(struct parsed_partitions *state) +{ + gpt_header *gpt = NULL; + gpt_entry *ptes = NULL; + u32 i; + unsigned ssz = bdev_logical_block_size(state->bdev) / 512; + + if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) { + kfree(gpt); + kfree(ptes); + return 0; + } + + pr_debug("GUID Partition Table is valid! Yea!\n"); + + for (i = 0; i < le32_to_cpu(gpt->num_partition_entries) && i < state->limit-1; i++) { + struct partition_meta_info *info; + unsigned label_max; + u64 start = le64_to_cpu(ptes[i].starting_lba); + u64 size = le64_to_cpu(ptes[i].ending_lba) - + le64_to_cpu(ptes[i].starting_lba) + 1ULL; + + if (!is_pte_valid(&ptes[i], last_lba(state->bdev))) + continue; + + put_partition(state, i+1, start * ssz, size * ssz); + + /* If this is a RAID volume, tell md */ + if (!efi_guidcmp(ptes[i].partition_type_guid, PARTITION_LINUX_RAID_GUID)) + state->parts[i + 1].flags = ADDPART_FLAG_RAID; + + info = &state->parts[i + 1].info; + efi_guid_to_str(&ptes[i].unique_partition_guid, info->uuid); + + /* Naively convert UTF16-LE to 7 bits. */ + label_max = min(ARRAY_SIZE(info->volname) - 1, + ARRAY_SIZE(ptes[i].partition_name)); + utf16_le_to_7bit(ptes[i].partition_name, label_max, info->volname); + state->parts[i + 1].has_info = true; + } + kfree(ptes); + kfree(gpt); + strlcat(state->pp_buf, "\n", PAGE_SIZE); + return 1; +} diff --git a/block/partitions/efi.h b/block/partitions/efi.h new file mode 100644 index 000000000..8cc2b88d0 --- /dev/null +++ b/block/partitions/efi.h @@ -0,0 +1,116 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +/************************************************************ + * EFI GUID Partition Table + * Per Intel EFI Specification v1.02 + * http://developer.intel.com/technology/efi/efi.htm + * + * By Matt Domsch Fri Sep 22 22:15:56 CDT 2000 + * Copyright 2000,2001 Dell Inc. + ************************************************************/ + +#ifndef FS_PART_EFI_H_INCLUDED +#define FS_PART_EFI_H_INCLUDED + +#include +#include +#include +#include +#include +#include +#include +#include + +#define MSDOS_MBR_SIGNATURE 0xaa55 +#define EFI_PMBR_OSTYPE_EFI 0xEF +#define EFI_PMBR_OSTYPE_EFI_GPT 0xEE + +#define GPT_MBR_PROTECTIVE 1 +#define GPT_MBR_HYBRID 2 + +#define GPT_HEADER_SIGNATURE 0x5452415020494645ULL +#define GPT_HEADER_REVISION_V1 0x00010000 +#define GPT_PRIMARY_PARTITION_TABLE_LBA 1 + +#define PARTITION_SYSTEM_GUID \ + EFI_GUID( 0xC12A7328, 0xF81F, 0x11d2, \ + 0xBA, 0x4B, 0x00, 0xA0, 0xC9, 0x3E, 0xC9, 0x3B) +#define LEGACY_MBR_PARTITION_GUID \ + EFI_GUID( 0x024DEE41, 0x33E7, 0x11d3, \ + 0x9D, 0x69, 0x00, 0x08, 0xC7, 0x81, 0xF3, 0x9F) +#define PARTITION_MSFT_RESERVED_GUID \ + EFI_GUID( 0xE3C9E316, 0x0B5C, 0x4DB8, \ + 0x81, 0x7D, 0xF9, 0x2D, 0xF0, 0x02, 0x15, 0xAE) +#define PARTITION_BASIC_DATA_GUID \ + EFI_GUID( 0xEBD0A0A2, 0xB9E5, 0x4433, \ + 0x87, 0xC0, 0x68, 0xB6, 0xB7, 0x26, 0x99, 0xC7) +#define PARTITION_LINUX_RAID_GUID \ + EFI_GUID( 0xa19d880f, 0x05fc, 0x4d3b, \ + 0xa0, 0x06, 0x74, 0x3f, 0x0f, 0x84, 0x91, 0x1e) +#define PARTITION_LINUX_SWAP_GUID \ + EFI_GUID( 0x0657fd6d, 0xa4ab, 0x43c4, \ + 0x84, 0xe5, 0x09, 0x33, 0xc8, 0x4b, 0x4f, 0x4f) +#define PARTITION_LINUX_LVM_GUID \ + EFI_GUID( 0xe6d6d379, 0xf507, 0x44c2, \ + 0xa2, 0x3c, 0x23, 0x8f, 0x2a, 0x3d, 0xf9, 0x28) + +typedef struct _gpt_header { + __le64 signature; + __le32 revision; + __le32 header_size; + __le32 header_crc32; + __le32 reserved1; + __le64 my_lba; + __le64 alternate_lba; + __le64 first_usable_lba; + __le64 last_usable_lba; + efi_guid_t disk_guid; + __le64 partition_entry_lba; + __le32 num_partition_entries; + __le32 sizeof_partition_entry; + __le32 partition_entry_array_crc32; + + /* The rest of the logical block is reserved by UEFI and must be zero. + * EFI standard handles this by: + * + * uint8_t reserved2[ BlockSize - 92 ]; + */ +} __packed gpt_header; + +typedef struct _gpt_entry_attributes { + u64 required_to_function:1; + u64 reserved:47; + u64 type_guid_specific:16; +} __packed gpt_entry_attributes; + +typedef struct _gpt_entry { + efi_guid_t partition_type_guid; + efi_guid_t unique_partition_guid; + __le64 starting_lba; + __le64 ending_lba; + gpt_entry_attributes attributes; + __le16 partition_name[72/sizeof(__le16)]; +} __packed gpt_entry; + +typedef struct _gpt_mbr_record { + u8 boot_indicator; /* unused by EFI, set to 0x80 for bootable */ + u8 start_head; /* unused by EFI, pt start in CHS */ + u8 start_sector; /* unused by EFI, pt start in CHS */ + u8 start_track; + u8 os_type; /* EFI and legacy non-EFI OS types */ + u8 end_head; /* unused by EFI, pt end in CHS */ + u8 end_sector; /* unused by EFI, pt end in CHS */ + u8 end_track; /* unused by EFI, pt end in CHS */ + __le32 starting_lba; /* used by EFI - start addr of the on disk pt */ + __le32 size_in_lba; /* used by EFI - size of pt in LBA */ +} __packed gpt_mbr_record; + + +typedef struct _legacy_mbr { + u8 boot_code[440]; + __le32 unique_mbr_signature; + __le16 unknown; + gpt_mbr_record partition_record[4]; + __le16 signature; +} __packed legacy_mbr; + +#endif diff --git a/block/partitions/ibm.c b/block/partitions/ibm.c new file mode 100644 index 000000000..4b044e620 --- /dev/null +++ b/block/partitions/ibm.c @@ -0,0 +1,375 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Author(s)......: Holger Smolinski + * Volker Sameske + * Bugreports.to..: + * Copyright IBM Corp. 1999, 2012 + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "check.h" + +union label_t { + struct vtoc_volume_label_cdl vol; + struct vtoc_volume_label_ldl lnx; + struct vtoc_cms_label cms; +}; + +/* + * compute the block number from a + * cyl-cyl-head-head structure + */ +static sector_t cchh2blk(struct vtoc_cchh *ptr, struct hd_geometry *geo) +{ + sector_t cyl; + __u16 head; + + /* decode cylinder and heads for large volumes */ + cyl = ptr->hh & 0xFFF0; + cyl <<= 12; + cyl |= ptr->cc; + head = ptr->hh & 0x000F; + return cyl * geo->heads * geo->sectors + + head * geo->sectors; +} + +/* + * compute the block number from a + * cyl-cyl-head-head-block structure + */ +static sector_t cchhb2blk(struct vtoc_cchhb *ptr, struct hd_geometry *geo) +{ + sector_t cyl; + __u16 head; + + /* decode cylinder and heads for large volumes */ + cyl = ptr->hh & 0xFFF0; + cyl <<= 12; + cyl |= ptr->cc; + head = ptr->hh & 0x000F; + return cyl * geo->heads * geo->sectors + + head * geo->sectors + + ptr->b; +} + +static int find_label(struct parsed_partitions *state, + dasd_information2_t *info, + struct hd_geometry *geo, + int blocksize, + sector_t *labelsect, + char name[], + char type[], + union label_t *label) +{ + Sector sect; + unsigned char *data; + sector_t testsect[3]; + unsigned char temp[5]; + int found = 0; + int i, testcount; + + /* There a three places where we may find a valid label: + * - on an ECKD disk it's block 2 + * - on an FBA disk it's block 1 + * - on an CMS formatted FBA disk it is sector 1, even if the block size + * is larger than 512 bytes (possible if the DIAG discipline is used) + * If we have a valid info structure, then we know exactly which case we + * have, otherwise we just search through all possebilities. + */ + if (info) { + if ((info->cu_type == 0x6310 && info->dev_type == 0x9336) || + (info->cu_type == 0x3880 && info->dev_type == 0x3370)) + testsect[0] = info->label_block; + else + testsect[0] = info->label_block * (blocksize >> 9); + testcount = 1; + } else { + testsect[0] = 1; + testsect[1] = (blocksize >> 9); + testsect[2] = 2 * (blocksize >> 9); + testcount = 3; + } + for (i = 0; i < testcount; ++i) { + data = read_part_sector(state, testsect[i], §); + if (data == NULL) + continue; + memcpy(label, data, sizeof(*label)); + memcpy(temp, data, 4); + temp[4] = 0; + EBCASC(temp, 4); + put_dev_sector(sect); + if (!strcmp(temp, "VOL1") || + !strcmp(temp, "LNX1") || + !strcmp(temp, "CMS1")) { + if (!strcmp(temp, "VOL1")) { + strncpy(type, label->vol.vollbl, 4); + strncpy(name, label->vol.volid, 6); + } else { + strncpy(type, label->lnx.vollbl, 4); + strncpy(name, label->lnx.volid, 6); + } + EBCASC(type, 4); + EBCASC(name, 6); + *labelsect = testsect[i]; + found = 1; + break; + } + } + if (!found) + memset(label, 0, sizeof(*label)); + + return found; +} + +static int find_vol1_partitions(struct parsed_partitions *state, + struct hd_geometry *geo, + int blocksize, + char name[], + union label_t *label) +{ + sector_t blk; + int counter; + char tmp[64]; + Sector sect; + unsigned char *data; + loff_t offset, size; + struct vtoc_format1_label f1; + int secperblk; + + snprintf(tmp, sizeof(tmp), "VOL1/%8s:", name); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + /* + * get start of VTOC from the disk label and then search for format1 + * and format8 labels + */ + secperblk = blocksize >> 9; + blk = cchhb2blk(&label->vol.vtoc, geo) + 1; + counter = 0; + data = read_part_sector(state, blk * secperblk, §); + while (data != NULL) { + memcpy(&f1, data, sizeof(struct vtoc_format1_label)); + put_dev_sector(sect); + /* skip FMT4 / FMT5 / FMT7 labels */ + if (f1.DS1FMTID == _ascebc['4'] + || f1.DS1FMTID == _ascebc['5'] + || f1.DS1FMTID == _ascebc['7'] + || f1.DS1FMTID == _ascebc['9']) { + blk++; + data = read_part_sector(state, blk * secperblk, §); + continue; + } + /* only FMT1 and 8 labels valid at this point */ + if (f1.DS1FMTID != _ascebc['1'] && + f1.DS1FMTID != _ascebc['8']) + break; + /* OK, we got valid partition data */ + offset = cchh2blk(&f1.DS1EXT1.llimit, geo); + size = cchh2blk(&f1.DS1EXT1.ulimit, geo) - + offset + geo->sectors; + offset *= secperblk; + size *= secperblk; + if (counter >= state->limit) + break; + put_partition(state, counter + 1, offset, size); + counter++; + blk++; + data = read_part_sector(state, blk * secperblk, §); + } + strlcat(state->pp_buf, "\n", PAGE_SIZE); + + if (!data) + return -1; + + return 1; +} + +static int find_lnx1_partitions(struct parsed_partitions *state, + struct hd_geometry *geo, + int blocksize, + char name[], + union label_t *label, + sector_t labelsect, + loff_t i_size, + dasd_information2_t *info) +{ + loff_t offset, geo_size, size; + char tmp[64]; + int secperblk; + + snprintf(tmp, sizeof(tmp), "LNX1/%8s:", name); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + secperblk = blocksize >> 9; + if (label->lnx.ldl_version == 0xf2) { + size = label->lnx.formatted_blocks * secperblk; + } else { + /* + * Formated w/o large volume support. If the sanity check + * 'size based on geo == size based on i_size' is true, then + * we can safely assume that we know the formatted size of + * the disk, otherwise we need additional information + * that we can only get from a real DASD device. + */ + geo_size = geo->cylinders * geo->heads + * geo->sectors * secperblk; + size = i_size >> 9; + if (size != geo_size) { + if (!info) { + strlcat(state->pp_buf, "\n", PAGE_SIZE); + return 1; + } + if (!strcmp(info->type, "ECKD")) + if (geo_size < size) + size = geo_size; + /* else keep size based on i_size */ + } + } + /* first and only partition starts in the first block after the label */ + offset = labelsect + secperblk; + put_partition(state, 1, offset, size - offset); + strlcat(state->pp_buf, "\n", PAGE_SIZE); + return 1; +} + +static int find_cms1_partitions(struct parsed_partitions *state, + struct hd_geometry *geo, + int blocksize, + char name[], + union label_t *label, + sector_t labelsect) +{ + loff_t offset, size; + char tmp[64]; + int secperblk; + + /* + * VM style CMS1 labeled disk + */ + blocksize = label->cms.block_size; + secperblk = blocksize >> 9; + if (label->cms.disk_offset != 0) { + snprintf(tmp, sizeof(tmp), "CMS1/%8s(MDSK):", name); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + /* disk is reserved minidisk */ + offset = label->cms.disk_offset * secperblk; + size = (label->cms.block_count - 1) * secperblk; + } else { + snprintf(tmp, sizeof(tmp), "CMS1/%8s:", name); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + /* + * Special case for FBA devices: + * If an FBA device is CMS formatted with blocksize > 512 byte + * and the DIAG discipline is used, then the CMS label is found + * in sector 1 instead of block 1. However, the partition is + * still supposed to start in block 2. + */ + if (labelsect == 1) + offset = 2 * secperblk; + else + offset = labelsect + secperblk; + size = label->cms.block_count * secperblk; + } + + put_partition(state, 1, offset, size-offset); + strlcat(state->pp_buf, "\n", PAGE_SIZE); + return 1; +} + + +/* + * This is the main function, called by check.c + */ +int ibm_partition(struct parsed_partitions *state) +{ + int (*fn)(struct gendisk *disk, dasd_information2_t *info); + struct block_device *bdev = state->bdev; + struct gendisk *disk = bdev->bd_disk; + int blocksize, res; + loff_t i_size, offset, size; + dasd_information2_t *info; + struct hd_geometry *geo; + char type[5] = {0,}; + char name[7] = {0,}; + sector_t labelsect; + union label_t *label; + + res = 0; + if (!disk->fops->getgeo) + goto out_exit; + fn = symbol_get(dasd_biodasdinfo); + blocksize = bdev_logical_block_size(bdev); + if (blocksize <= 0) + goto out_symbol; + i_size = i_size_read(bdev->bd_inode); + if (i_size == 0) + goto out_symbol; + info = kmalloc(sizeof(dasd_information2_t), GFP_KERNEL); + if (info == NULL) + goto out_symbol; + geo = kmalloc(sizeof(struct hd_geometry), GFP_KERNEL); + if (geo == NULL) + goto out_nogeo; + label = kmalloc(sizeof(union label_t), GFP_KERNEL); + if (label == NULL) + goto out_nolab; + /* set start if not filled by getgeo function e.g. virtblk */ + geo->start = get_start_sect(bdev); + if (disk->fops->getgeo(bdev, geo)) + goto out_freeall; + if (!fn || fn(disk, info)) { + kfree(info); + info = NULL; + } + + if (find_label(state, info, geo, blocksize, &labelsect, name, type, + label)) { + if (!strncmp(type, "VOL1", 4)) { + res = find_vol1_partitions(state, geo, blocksize, name, + label); + } else if (!strncmp(type, "LNX1", 4)) { + res = find_lnx1_partitions(state, geo, blocksize, name, + label, labelsect, i_size, + info); + } else if (!strncmp(type, "CMS1", 4)) { + res = find_cms1_partitions(state, geo, blocksize, name, + label, labelsect); + } + } else if (info) { + /* + * ugly but needed for backward compatibility: + * If the block device is a DASD (i.e. BIODASDINFO2 works), + * then we claim it in any case, even though it has no valid + * label. If it has the LDL format, then we simply define a + * partition as if it had an LNX1 label. + */ + res = 1; + if (info->format == DASD_FORMAT_LDL) { + strlcat(state->pp_buf, "(nonl)", PAGE_SIZE); + size = i_size >> 9; + offset = (info->label_block + 1) * (blocksize >> 9); + put_partition(state, 1, offset, size-offset); + strlcat(state->pp_buf, "\n", PAGE_SIZE); + } + } else + res = 0; + +out_freeall: + kfree(label); +out_nolab: + kfree(geo); +out_nogeo: + kfree(info); +out_symbol: + if (fn) + symbol_put(dasd_biodasdinfo); +out_exit: + return res; +} diff --git a/block/partitions/karma.c b/block/partitions/karma.c new file mode 100644 index 000000000..4d93512f4 --- /dev/null +++ b/block/partitions/karma.c @@ -0,0 +1,60 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/partitions/karma.c + * Rio Karma partition info. + * + * Copyright (C) 2006 Bob Copeland (me@bobcopeland.com) + * based on osf.c + */ + +#include "check.h" +#include + +#define KARMA_LABEL_MAGIC 0xAB56 + +int karma_partition(struct parsed_partitions *state) +{ + int i; + int slot = 1; + Sector sect; + unsigned char *data; + struct disklabel { + u8 d_reserved[270]; + struct d_partition { + __le32 p_res; + u8 p_fstype; + u8 p_res2[3]; + __le32 p_offset; + __le32 p_size; + } d_partitions[2]; + u8 d_blank[208]; + __le16 d_magic; + } __packed *label; + struct d_partition *p; + + data = read_part_sector(state, 0, §); + if (!data) + return -1; + + label = (struct disklabel *)data; + if (le16_to_cpu(label->d_magic) != KARMA_LABEL_MAGIC) { + put_dev_sector(sect); + return 0; + } + + p = label->d_partitions; + for (i = 0 ; i < 2; i++, p++) { + if (slot == state->limit) + break; + + if (p->p_fstype == 0x4d && le32_to_cpu(p->p_size)) { + put_partition(state, slot, le32_to_cpu(p->p_offset), + le32_to_cpu(p->p_size)); + } + slot++; + } + strlcat(state->pp_buf, "\n", PAGE_SIZE); + put_dev_sector(sect); + return 1; +} + diff --git a/block/partitions/ldm.c b/block/partitions/ldm.c new file mode 100644 index 000000000..cc86534c8 --- /dev/null +++ b/block/partitions/ldm.c @@ -0,0 +1,1498 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/** + * ldm - Support for Windows Logical Disk Manager (Dynamic Disks) + * + * Copyright (C) 2001,2002 Richard Russon + * Copyright (c) 2001-2012 Anton Altaparmakov + * Copyright (C) 2001,2002 Jakob Kemi + * + * Documentation is available at http://www.linux-ntfs.org/doku.php?id=downloads + */ + +#include +#include +#include +#include +#include +#include + +#include "ldm.h" +#include "check.h" + +/* + * ldm_debug/info/error/crit - Output an error message + * @f: A printf format string containing the message + * @...: Variables to substitute into @f + * + * ldm_debug() writes a DEBUG level message to the syslog but only if the + * driver was compiled with debug enabled. Otherwise, the call turns into a NOP. + */ +#ifndef CONFIG_LDM_DEBUG +#define ldm_debug(...) do {} while (0) +#else +#define ldm_debug(f, a...) _ldm_printk (KERN_DEBUG, __func__, f, ##a) +#endif + +#define ldm_crit(f, a...) _ldm_printk (KERN_CRIT, __func__, f, ##a) +#define ldm_error(f, a...) _ldm_printk (KERN_ERR, __func__, f, ##a) +#define ldm_info(f, a...) _ldm_printk (KERN_INFO, __func__, f, ##a) + +static __printf(3, 4) +void _ldm_printk(const char *level, const char *function, const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start (args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + printk("%s%s(): %pV\n", level, function, &vaf); + + va_end(args); +} + +/** + * ldm_parse_privhead - Read the LDM Database PRIVHEAD structure + * @data: Raw database PRIVHEAD structure loaded from the device + * @ph: In-memory privhead structure in which to return parsed information + * + * This parses the LDM database PRIVHEAD structure supplied in @data and + * sets up the in-memory privhead structure @ph with the obtained information. + * + * Return: 'true' @ph contains the PRIVHEAD data + * 'false' @ph contents are undefined + */ +static bool ldm_parse_privhead(const u8 *data, struct privhead *ph) +{ + bool is_vista = false; + + BUG_ON(!data || !ph); + if (MAGIC_PRIVHEAD != get_unaligned_be64(data)) { + ldm_error("Cannot find PRIVHEAD structure. LDM database is" + " corrupt. Aborting."); + return false; + } + ph->ver_major = get_unaligned_be16(data + 0x000C); + ph->ver_minor = get_unaligned_be16(data + 0x000E); + ph->logical_disk_start = get_unaligned_be64(data + 0x011B); + ph->logical_disk_size = get_unaligned_be64(data + 0x0123); + ph->config_start = get_unaligned_be64(data + 0x012B); + ph->config_size = get_unaligned_be64(data + 0x0133); + /* Version 2.11 is Win2k/XP and version 2.12 is Vista. */ + if (ph->ver_major == 2 && ph->ver_minor == 12) + is_vista = true; + if (!is_vista && (ph->ver_major != 2 || ph->ver_minor != 11)) { + ldm_error("Expected PRIVHEAD version 2.11 or 2.12, got %d.%d." + " Aborting.", ph->ver_major, ph->ver_minor); + return false; + } + ldm_debug("PRIVHEAD version %d.%d (Windows %s).", ph->ver_major, + ph->ver_minor, is_vista ? "Vista" : "2000/XP"); + if (ph->config_size != LDM_DB_SIZE) { /* 1 MiB in sectors. */ + /* Warn the user and continue, carefully. */ + ldm_info("Database is normally %u bytes, it claims to " + "be %llu bytes.", LDM_DB_SIZE, + (unsigned long long)ph->config_size); + } + if ((ph->logical_disk_size == 0) || (ph->logical_disk_start + + ph->logical_disk_size > ph->config_start)) { + ldm_error("PRIVHEAD disk size doesn't match real disk size"); + return false; + } + if (uuid_parse(data + 0x0030, &ph->disk_id)) { + ldm_error("PRIVHEAD contains an invalid GUID."); + return false; + } + ldm_debug("Parsed PRIVHEAD successfully."); + return true; +} + +/** + * ldm_parse_tocblock - Read the LDM Database TOCBLOCK structure + * @data: Raw database TOCBLOCK structure loaded from the device + * @toc: In-memory toc structure in which to return parsed information + * + * This parses the LDM Database TOCBLOCK (table of contents) structure supplied + * in @data and sets up the in-memory tocblock structure @toc with the obtained + * information. + * + * N.B. The *_start and *_size values returned in @toc are not range-checked. + * + * Return: 'true' @toc contains the TOCBLOCK data + * 'false' @toc contents are undefined + */ +static bool ldm_parse_tocblock (const u8 *data, struct tocblock *toc) +{ + BUG_ON (!data || !toc); + + if (MAGIC_TOCBLOCK != get_unaligned_be64(data)) { + ldm_crit ("Cannot find TOCBLOCK, database may be corrupt."); + return false; + } + strncpy (toc->bitmap1_name, data + 0x24, sizeof (toc->bitmap1_name)); + toc->bitmap1_name[sizeof (toc->bitmap1_name) - 1] = 0; + toc->bitmap1_start = get_unaligned_be64(data + 0x2E); + toc->bitmap1_size = get_unaligned_be64(data + 0x36); + + if (strncmp (toc->bitmap1_name, TOC_BITMAP1, + sizeof (toc->bitmap1_name)) != 0) { + ldm_crit ("TOCBLOCK's first bitmap is '%s', should be '%s'.", + TOC_BITMAP1, toc->bitmap1_name); + return false; + } + strncpy (toc->bitmap2_name, data + 0x46, sizeof (toc->bitmap2_name)); + toc->bitmap2_name[sizeof (toc->bitmap2_name) - 1] = 0; + toc->bitmap2_start = get_unaligned_be64(data + 0x50); + toc->bitmap2_size = get_unaligned_be64(data + 0x58); + if (strncmp (toc->bitmap2_name, TOC_BITMAP2, + sizeof (toc->bitmap2_name)) != 0) { + ldm_crit ("TOCBLOCK's second bitmap is '%s', should be '%s'.", + TOC_BITMAP2, toc->bitmap2_name); + return false; + } + ldm_debug ("Parsed TOCBLOCK successfully."); + return true; +} + +/** + * ldm_parse_vmdb - Read the LDM Database VMDB structure + * @data: Raw database VMDB structure loaded from the device + * @vm: In-memory vmdb structure in which to return parsed information + * + * This parses the LDM Database VMDB structure supplied in @data and sets up + * the in-memory vmdb structure @vm with the obtained information. + * + * N.B. The *_start, *_size and *_seq values will be range-checked later. + * + * Return: 'true' @vm contains VMDB info + * 'false' @vm contents are undefined + */ +static bool ldm_parse_vmdb (const u8 *data, struct vmdb *vm) +{ + BUG_ON (!data || !vm); + + if (MAGIC_VMDB != get_unaligned_be32(data)) { + ldm_crit ("Cannot find the VMDB, database may be corrupt."); + return false; + } + + vm->ver_major = get_unaligned_be16(data + 0x12); + vm->ver_minor = get_unaligned_be16(data + 0x14); + if ((vm->ver_major != 4) || (vm->ver_minor != 10)) { + ldm_error ("Expected VMDB version %d.%d, got %d.%d. " + "Aborting.", 4, 10, vm->ver_major, vm->ver_minor); + return false; + } + + vm->vblk_size = get_unaligned_be32(data + 0x08); + if (vm->vblk_size == 0) { + ldm_error ("Illegal VBLK size"); + return false; + } + + vm->vblk_offset = get_unaligned_be32(data + 0x0C); + vm->last_vblk_seq = get_unaligned_be32(data + 0x04); + + ldm_debug ("Parsed VMDB successfully."); + return true; +} + +/** + * ldm_compare_privheads - Compare two privhead objects + * @ph1: First privhead + * @ph2: Second privhead + * + * This compares the two privhead structures @ph1 and @ph2. + * + * Return: 'true' Identical + * 'false' Different + */ +static bool ldm_compare_privheads (const struct privhead *ph1, + const struct privhead *ph2) +{ + BUG_ON (!ph1 || !ph2); + + return ((ph1->ver_major == ph2->ver_major) && + (ph1->ver_minor == ph2->ver_minor) && + (ph1->logical_disk_start == ph2->logical_disk_start) && + (ph1->logical_disk_size == ph2->logical_disk_size) && + (ph1->config_start == ph2->config_start) && + (ph1->config_size == ph2->config_size) && + uuid_equal(&ph1->disk_id, &ph2->disk_id)); +} + +/** + * ldm_compare_tocblocks - Compare two tocblock objects + * @toc1: First toc + * @toc2: Second toc + * + * This compares the two tocblock structures @toc1 and @toc2. + * + * Return: 'true' Identical + * 'false' Different + */ +static bool ldm_compare_tocblocks (const struct tocblock *toc1, + const struct tocblock *toc2) +{ + BUG_ON (!toc1 || !toc2); + + return ((toc1->bitmap1_start == toc2->bitmap1_start) && + (toc1->bitmap1_size == toc2->bitmap1_size) && + (toc1->bitmap2_start == toc2->bitmap2_start) && + (toc1->bitmap2_size == toc2->bitmap2_size) && + !strncmp (toc1->bitmap1_name, toc2->bitmap1_name, + sizeof (toc1->bitmap1_name)) && + !strncmp (toc1->bitmap2_name, toc2->bitmap2_name, + sizeof (toc1->bitmap2_name))); +} + +/** + * ldm_validate_privheads - Compare the primary privhead with its backups + * @state: Partition check state including device holding the LDM Database + * @ph1: Memory struct to fill with ph contents + * + * Read and compare all three privheads from disk. + * + * The privheads on disk show the size and location of the main disk area and + * the configuration area (the database). The values are range-checked against + * @hd, which contains the real size of the disk. + * + * Return: 'true' Success + * 'false' Error + */ +static bool ldm_validate_privheads(struct parsed_partitions *state, + struct privhead *ph1) +{ + static const int off[3] = { OFF_PRIV1, OFF_PRIV2, OFF_PRIV3 }; + struct privhead *ph[3] = { ph1 }; + Sector sect; + u8 *data; + bool result = false; + long num_sects; + int i; + + BUG_ON (!state || !ph1); + + ph[1] = kmalloc (sizeof (*ph[1]), GFP_KERNEL); + ph[2] = kmalloc (sizeof (*ph[2]), GFP_KERNEL); + if (!ph[1] || !ph[2]) { + ldm_crit ("Out of memory."); + goto out; + } + + /* off[1 & 2] are relative to ph[0]->config_start */ + ph[0]->config_start = 0; + + /* Read and parse privheads */ + for (i = 0; i < 3; i++) { + data = read_part_sector(state, ph[0]->config_start + off[i], + §); + if (!data) { + ldm_crit ("Disk read failed."); + goto out; + } + result = ldm_parse_privhead (data, ph[i]); + put_dev_sector (sect); + if (!result) { + ldm_error ("Cannot find PRIVHEAD %d.", i+1); /* Log again */ + if (i < 2) + goto out; /* Already logged */ + else + break; /* FIXME ignore for now, 3rd PH can fail on odd-sized disks */ + } + } + + num_sects = state->bdev->bd_inode->i_size >> 9; + + if ((ph[0]->config_start > num_sects) || + ((ph[0]->config_start + ph[0]->config_size) > num_sects)) { + ldm_crit ("Database extends beyond the end of the disk."); + goto out; + } + + if ((ph[0]->logical_disk_start > ph[0]->config_start) || + ((ph[0]->logical_disk_start + ph[0]->logical_disk_size) + > ph[0]->config_start)) { + ldm_crit ("Disk and database overlap."); + goto out; + } + + if (!ldm_compare_privheads (ph[0], ph[1])) { + ldm_crit ("Primary and backup PRIVHEADs don't match."); + goto out; + } + /* FIXME ignore this for now + if (!ldm_compare_privheads (ph[0], ph[2])) { + ldm_crit ("Primary and backup PRIVHEADs don't match."); + goto out; + }*/ + ldm_debug ("Validated PRIVHEADs successfully."); + result = true; +out: + kfree (ph[1]); + kfree (ph[2]); + return result; +} + +/** + * ldm_validate_tocblocks - Validate the table of contents and its backups + * @state: Partition check state including device holding the LDM Database + * @base: Offset, into @state->bdev, of the database + * @ldb: Cache of the database structures + * + * Find and compare the four tables of contents of the LDM Database stored on + * @state->bdev and return the parsed information into @toc1. + * + * The offsets and sizes of the configs are range-checked against a privhead. + * + * Return: 'true' @toc1 contains validated TOCBLOCK info + * 'false' @toc1 contents are undefined + */ +static bool ldm_validate_tocblocks(struct parsed_partitions *state, + unsigned long base, struct ldmdb *ldb) +{ + static const int off[4] = { OFF_TOCB1, OFF_TOCB2, OFF_TOCB3, OFF_TOCB4}; + struct tocblock *tb[4]; + struct privhead *ph; + Sector sect; + u8 *data; + int i, nr_tbs; + bool result = false; + + BUG_ON(!state || !ldb); + ph = &ldb->ph; + tb[0] = &ldb->toc; + tb[1] = kmalloc_array(3, sizeof(*tb[1]), GFP_KERNEL); + if (!tb[1]) { + ldm_crit("Out of memory."); + goto err; + } + tb[2] = (struct tocblock*)((u8*)tb[1] + sizeof(*tb[1])); + tb[3] = (struct tocblock*)((u8*)tb[2] + sizeof(*tb[2])); + /* + * Try to read and parse all four TOCBLOCKs. + * + * Windows Vista LDM v2.12 does not always have all four TOCBLOCKs so + * skip any that fail as long as we get at least one valid TOCBLOCK. + */ + for (nr_tbs = i = 0; i < 4; i++) { + data = read_part_sector(state, base + off[i], §); + if (!data) { + ldm_error("Disk read failed for TOCBLOCK %d.", i); + continue; + } + if (ldm_parse_tocblock(data, tb[nr_tbs])) + nr_tbs++; + put_dev_sector(sect); + } + if (!nr_tbs) { + ldm_crit("Failed to find a valid TOCBLOCK."); + goto err; + } + /* Range check the TOCBLOCK against a privhead. */ + if (((tb[0]->bitmap1_start + tb[0]->bitmap1_size) > ph->config_size) || + ((tb[0]->bitmap2_start + tb[0]->bitmap2_size) > + ph->config_size)) { + ldm_crit("The bitmaps are out of range. Giving up."); + goto err; + } + /* Compare all loaded TOCBLOCKs. */ + for (i = 1; i < nr_tbs; i++) { + if (!ldm_compare_tocblocks(tb[0], tb[i])) { + ldm_crit("TOCBLOCKs 0 and %d do not match.", i); + goto err; + } + } + ldm_debug("Validated %d TOCBLOCKs successfully.", nr_tbs); + result = true; +err: + kfree(tb[1]); + return result; +} + +/** + * ldm_validate_vmdb - Read the VMDB and validate it + * @state: Partition check state including device holding the LDM Database + * @base: Offset, into @bdev, of the database + * @ldb: Cache of the database structures + * + * Find the vmdb of the LDM Database stored on @bdev and return the parsed + * information in @ldb. + * + * Return: 'true' @ldb contains validated VBDB info + * 'false' @ldb contents are undefined + */ +static bool ldm_validate_vmdb(struct parsed_partitions *state, + unsigned long base, struct ldmdb *ldb) +{ + Sector sect; + u8 *data; + bool result = false; + struct vmdb *vm; + struct tocblock *toc; + + BUG_ON (!state || !ldb); + + vm = &ldb->vm; + toc = &ldb->toc; + + data = read_part_sector(state, base + OFF_VMDB, §); + if (!data) { + ldm_crit ("Disk read failed."); + return false; + } + + if (!ldm_parse_vmdb (data, vm)) + goto out; /* Already logged */ + + /* Are there uncommitted transactions? */ + if (get_unaligned_be16(data + 0x10) != 0x01) { + ldm_crit ("Database is not in a consistent state. Aborting."); + goto out; + } + + if (vm->vblk_offset != 512) + ldm_info ("VBLKs start at offset 0x%04x.", vm->vblk_offset); + + /* + * The last_vblkd_seq can be before the end of the vmdb, just make sure + * it is not out of bounds. + */ + if ((vm->vblk_size * vm->last_vblk_seq) > (toc->bitmap1_size << 9)) { + ldm_crit ("VMDB exceeds allowed size specified by TOCBLOCK. " + "Database is corrupt. Aborting."); + goto out; + } + + result = true; +out: + put_dev_sector (sect); + return result; +} + + +/** + * ldm_validate_partition_table - Determine whether bdev might be a dynamic disk + * @state: Partition check state including device holding the LDM Database + * + * This function provides a weak test to decide whether the device is a dynamic + * disk or not. It looks for an MS-DOS-style partition table containing at + * least one partition of type 0x42 (formerly SFS, now used by Windows for + * dynamic disks). + * + * N.B. The only possible error can come from the read_part_sector and that is + * only likely to happen if the underlying device is strange. If that IS + * the case we should return zero to let someone else try. + * + * Return: 'true' @state->bdev is a dynamic disk + * 'false' @state->bdev is not a dynamic disk, or an error occurred + */ +static bool ldm_validate_partition_table(struct parsed_partitions *state) +{ + Sector sect; + u8 *data; + struct msdos_partition *p; + int i; + bool result = false; + + BUG_ON(!state); + + data = read_part_sector(state, 0, §); + if (!data) { + ldm_info ("Disk read failed."); + return false; + } + + if (*(__le16*) (data + 0x01FE) != cpu_to_le16 (MSDOS_LABEL_MAGIC)) + goto out; + + p = (struct msdos_partition *)(data + 0x01BE); + for (i = 0; i < 4; i++, p++) + if (p->sys_ind == LDM_PARTITION) { + result = true; + break; + } + + if (result) + ldm_debug ("Found W2K dynamic disk partition type."); + +out: + put_dev_sector (sect); + return result; +} + +/** + * ldm_get_disk_objid - Search a linked list of vblk's for a given Disk Id + * @ldb: Cache of the database structures + * + * The LDM Database contains a list of all partitions on all dynamic disks. + * The primary PRIVHEAD, at the beginning of the physical disk, tells us + * the GUID of this disk. This function searches for the GUID in a linked + * list of vblk's. + * + * Return: Pointer, A matching vblk was found + * NULL, No match, or an error + */ +static struct vblk * ldm_get_disk_objid (const struct ldmdb *ldb) +{ + struct list_head *item; + + BUG_ON (!ldb); + + list_for_each (item, &ldb->v_disk) { + struct vblk *v = list_entry (item, struct vblk, list); + if (uuid_equal(&v->vblk.disk.disk_id, &ldb->ph.disk_id)) + return v; + } + + return NULL; +} + +/** + * ldm_create_data_partitions - Create data partitions for this device + * @pp: List of the partitions parsed so far + * @ldb: Cache of the database structures + * + * The database contains ALL the partitions for ALL disk groups, so we need to + * filter out this specific disk. Using the disk's object id, we can find all + * the partitions in the database that belong to this disk. + * + * Add each partition in our database, to the parsed_partitions structure. + * + * N.B. This function creates the partitions in the order it finds partition + * objects in the linked list. + * + * Return: 'true' Partition created + * 'false' Error, probably a range checking problem + */ +static bool ldm_create_data_partitions (struct parsed_partitions *pp, + const struct ldmdb *ldb) +{ + struct list_head *item; + struct vblk *vb; + struct vblk *disk; + struct vblk_part *part; + int part_num = 1; + + BUG_ON (!pp || !ldb); + + disk = ldm_get_disk_objid (ldb); + if (!disk) { + ldm_crit ("Can't find the ID of this disk in the database."); + return false; + } + + strlcat(pp->pp_buf, " [LDM]", PAGE_SIZE); + + /* Create the data partitions */ + list_for_each (item, &ldb->v_part) { + vb = list_entry (item, struct vblk, list); + part = &vb->vblk.part; + + if (part->disk_id != disk->obj_id) + continue; + + put_partition (pp, part_num, ldb->ph.logical_disk_start + + part->start, part->size); + part_num++; + } + + strlcat(pp->pp_buf, "\n", PAGE_SIZE); + return true; +} + + +/** + * ldm_relative - Calculate the next relative offset + * @buffer: Block of data being worked on + * @buflen: Size of the block of data + * @base: Size of the previous fixed width fields + * @offset: Cumulative size of the previous variable-width fields + * + * Because many of the VBLK fields are variable-width, it's necessary + * to calculate each offset based on the previous one and the length + * of the field it pointed to. + * + * Return: -1 Error, the calculated offset exceeded the size of the buffer + * n OK, a range-checked offset into buffer + */ +static int ldm_relative(const u8 *buffer, int buflen, int base, int offset) +{ + + base += offset; + if (!buffer || offset < 0 || base > buflen) { + if (!buffer) + ldm_error("!buffer"); + if (offset < 0) + ldm_error("offset (%d) < 0", offset); + if (base > buflen) + ldm_error("base (%d) > buflen (%d)", base, buflen); + return -1; + } + if (base + buffer[base] >= buflen) { + ldm_error("base (%d) + buffer[base] (%d) >= buflen (%d)", base, + buffer[base], buflen); + return -1; + } + return buffer[base] + offset + 1; +} + +/** + * ldm_get_vnum - Convert a variable-width, big endian number, into cpu order + * @block: Pointer to the variable-width number to convert + * + * Large numbers in the LDM Database are often stored in a packed format. Each + * number is prefixed by a one byte width marker. All numbers in the database + * are stored in big-endian byte order. This function reads one of these + * numbers and returns the result + * + * N.B. This function DOES NOT perform any range checking, though the most + * it will read is eight bytes. + * + * Return: n A number + * 0 Zero, or an error occurred + */ +static u64 ldm_get_vnum (const u8 *block) +{ + u64 tmp = 0; + u8 length; + + BUG_ON (!block); + + length = *block++; + + if (length && length <= 8) + while (length--) + tmp = (tmp << 8) | *block++; + else + ldm_error ("Illegal length %d.", length); + + return tmp; +} + +/** + * ldm_get_vstr - Read a length-prefixed string into a buffer + * @block: Pointer to the length marker + * @buffer: Location to copy string to + * @buflen: Size of the output buffer + * + * Many of the strings in the LDM Database are not NULL terminated. Instead + * they are prefixed by a one byte length marker. This function copies one of + * these strings into a buffer. + * + * N.B. This function DOES NOT perform any range checking on the input. + * If the buffer is too small, the output will be truncated. + * + * Return: 0, Error and @buffer contents are undefined + * n, String length in characters (excluding NULL) + * buflen-1, String was truncated. + */ +static int ldm_get_vstr (const u8 *block, u8 *buffer, int buflen) +{ + int length; + + BUG_ON (!block || !buffer); + + length = block[0]; + if (length >= buflen) { + ldm_error ("Truncating string %d -> %d.", length, buflen); + length = buflen - 1; + } + memcpy (buffer, block + 1, length); + buffer[length] = 0; + return length; +} + + +/** + * ldm_parse_cmp3 - Read a raw VBLK Component object into a vblk structure + * @buffer: Block of data being worked on + * @buflen: Size of the block of data + * @vb: In-memory vblk in which to return information + * + * Read a raw VBLK Component object (version 3) into a vblk structure. + * + * Return: 'true' @vb contains a Component VBLK + * 'false' @vb contents are not defined + */ +static bool ldm_parse_cmp3 (const u8 *buffer, int buflen, struct vblk *vb) +{ + int r_objid, r_name, r_vstate, r_child, r_parent, r_stripe, r_cols, len; + struct vblk_comp *comp; + + BUG_ON (!buffer || !vb); + + r_objid = ldm_relative (buffer, buflen, 0x18, 0); + r_name = ldm_relative (buffer, buflen, 0x18, r_objid); + r_vstate = ldm_relative (buffer, buflen, 0x18, r_name); + r_child = ldm_relative (buffer, buflen, 0x1D, r_vstate); + r_parent = ldm_relative (buffer, buflen, 0x2D, r_child); + + if (buffer[0x12] & VBLK_FLAG_COMP_STRIPE) { + r_stripe = ldm_relative (buffer, buflen, 0x2E, r_parent); + r_cols = ldm_relative (buffer, buflen, 0x2E, r_stripe); + len = r_cols; + } else { + r_stripe = 0; + r_cols = 0; + len = r_parent; + } + if (len < 0) + return false; + + len += VBLK_SIZE_CMP3; + if (len != get_unaligned_be32(buffer + 0x14)) + return false; + + comp = &vb->vblk.comp; + ldm_get_vstr (buffer + 0x18 + r_name, comp->state, + sizeof (comp->state)); + comp->type = buffer[0x18 + r_vstate]; + comp->children = ldm_get_vnum (buffer + 0x1D + r_vstate); + comp->parent_id = ldm_get_vnum (buffer + 0x2D + r_child); + comp->chunksize = r_stripe ? ldm_get_vnum (buffer+r_parent+0x2E) : 0; + + return true; +} + +/** + * ldm_parse_dgr3 - Read a raw VBLK Disk Group object into a vblk structure + * @buffer: Block of data being worked on + * @buflen: Size of the block of data + * @vb: In-memory vblk in which to return information + * + * Read a raw VBLK Disk Group object (version 3) into a vblk structure. + * + * Return: 'true' @vb contains a Disk Group VBLK + * 'false' @vb contents are not defined + */ +static int ldm_parse_dgr3 (const u8 *buffer, int buflen, struct vblk *vb) +{ + int r_objid, r_name, r_diskid, r_id1, r_id2, len; + struct vblk_dgrp *dgrp; + + BUG_ON (!buffer || !vb); + + r_objid = ldm_relative (buffer, buflen, 0x18, 0); + r_name = ldm_relative (buffer, buflen, 0x18, r_objid); + r_diskid = ldm_relative (buffer, buflen, 0x18, r_name); + + if (buffer[0x12] & VBLK_FLAG_DGR3_IDS) { + r_id1 = ldm_relative (buffer, buflen, 0x24, r_diskid); + r_id2 = ldm_relative (buffer, buflen, 0x24, r_id1); + len = r_id2; + } else { + r_id1 = 0; + r_id2 = 0; + len = r_diskid; + } + if (len < 0) + return false; + + len += VBLK_SIZE_DGR3; + if (len != get_unaligned_be32(buffer + 0x14)) + return false; + + dgrp = &vb->vblk.dgrp; + ldm_get_vstr (buffer + 0x18 + r_name, dgrp->disk_id, + sizeof (dgrp->disk_id)); + return true; +} + +/** + * ldm_parse_dgr4 - Read a raw VBLK Disk Group object into a vblk structure + * @buffer: Block of data being worked on + * @buflen: Size of the block of data + * @vb: In-memory vblk in which to return information + * + * Read a raw VBLK Disk Group object (version 4) into a vblk structure. + * + * Return: 'true' @vb contains a Disk Group VBLK + * 'false' @vb contents are not defined + */ +static bool ldm_parse_dgr4 (const u8 *buffer, int buflen, struct vblk *vb) +{ + char buf[64]; + int r_objid, r_name, r_id1, r_id2, len; + + BUG_ON (!buffer || !vb); + + r_objid = ldm_relative (buffer, buflen, 0x18, 0); + r_name = ldm_relative (buffer, buflen, 0x18, r_objid); + + if (buffer[0x12] & VBLK_FLAG_DGR4_IDS) { + r_id1 = ldm_relative (buffer, buflen, 0x44, r_name); + r_id2 = ldm_relative (buffer, buflen, 0x44, r_id1); + len = r_id2; + } else { + r_id1 = 0; + r_id2 = 0; + len = r_name; + } + if (len < 0) + return false; + + len += VBLK_SIZE_DGR4; + if (len != get_unaligned_be32(buffer + 0x14)) + return false; + + ldm_get_vstr (buffer + 0x18 + r_objid, buf, sizeof (buf)); + return true; +} + +/** + * ldm_parse_dsk3 - Read a raw VBLK Disk object into a vblk structure + * @buffer: Block of data being worked on + * @buflen: Size of the block of data + * @vb: In-memory vblk in which to return information + * + * Read a raw VBLK Disk object (version 3) into a vblk structure. + * + * Return: 'true' @vb contains a Disk VBLK + * 'false' @vb contents are not defined + */ +static bool ldm_parse_dsk3 (const u8 *buffer, int buflen, struct vblk *vb) +{ + int r_objid, r_name, r_diskid, r_altname, len; + struct vblk_disk *disk; + + BUG_ON (!buffer || !vb); + + r_objid = ldm_relative (buffer, buflen, 0x18, 0); + r_name = ldm_relative (buffer, buflen, 0x18, r_objid); + r_diskid = ldm_relative (buffer, buflen, 0x18, r_name); + r_altname = ldm_relative (buffer, buflen, 0x18, r_diskid); + len = r_altname; + if (len < 0) + return false; + + len += VBLK_SIZE_DSK3; + if (len != get_unaligned_be32(buffer + 0x14)) + return false; + + disk = &vb->vblk.disk; + ldm_get_vstr (buffer + 0x18 + r_diskid, disk->alt_name, + sizeof (disk->alt_name)); + if (uuid_parse(buffer + 0x19 + r_name, &disk->disk_id)) + return false; + + return true; +} + +/** + * ldm_parse_dsk4 - Read a raw VBLK Disk object into a vblk structure + * @buffer: Block of data being worked on + * @buflen: Size of the block of data + * @vb: In-memory vblk in which to return information + * + * Read a raw VBLK Disk object (version 4) into a vblk structure. + * + * Return: 'true' @vb contains a Disk VBLK + * 'false' @vb contents are not defined + */ +static bool ldm_parse_dsk4 (const u8 *buffer, int buflen, struct vblk *vb) +{ + int r_objid, r_name, len; + struct vblk_disk *disk; + + BUG_ON (!buffer || !vb); + + r_objid = ldm_relative (buffer, buflen, 0x18, 0); + r_name = ldm_relative (buffer, buflen, 0x18, r_objid); + len = r_name; + if (len < 0) + return false; + + len += VBLK_SIZE_DSK4; + if (len != get_unaligned_be32(buffer + 0x14)) + return false; + + disk = &vb->vblk.disk; + import_uuid(&disk->disk_id, buffer + 0x18 + r_name); + return true; +} + +/** + * ldm_parse_prt3 - Read a raw VBLK Partition object into a vblk structure + * @buffer: Block of data being worked on + * @buflen: Size of the block of data + * @vb: In-memory vblk in which to return information + * + * Read a raw VBLK Partition object (version 3) into a vblk structure. + * + * Return: 'true' @vb contains a Partition VBLK + * 'false' @vb contents are not defined + */ +static bool ldm_parse_prt3(const u8 *buffer, int buflen, struct vblk *vb) +{ + int r_objid, r_name, r_size, r_parent, r_diskid, r_index, len; + struct vblk_part *part; + + BUG_ON(!buffer || !vb); + r_objid = ldm_relative(buffer, buflen, 0x18, 0); + if (r_objid < 0) { + ldm_error("r_objid %d < 0", r_objid); + return false; + } + r_name = ldm_relative(buffer, buflen, 0x18, r_objid); + if (r_name < 0) { + ldm_error("r_name %d < 0", r_name); + return false; + } + r_size = ldm_relative(buffer, buflen, 0x34, r_name); + if (r_size < 0) { + ldm_error("r_size %d < 0", r_size); + return false; + } + r_parent = ldm_relative(buffer, buflen, 0x34, r_size); + if (r_parent < 0) { + ldm_error("r_parent %d < 0", r_parent); + return false; + } + r_diskid = ldm_relative(buffer, buflen, 0x34, r_parent); + if (r_diskid < 0) { + ldm_error("r_diskid %d < 0", r_diskid); + return false; + } + if (buffer[0x12] & VBLK_FLAG_PART_INDEX) { + r_index = ldm_relative(buffer, buflen, 0x34, r_diskid); + if (r_index < 0) { + ldm_error("r_index %d < 0", r_index); + return false; + } + len = r_index; + } else { + r_index = 0; + len = r_diskid; + } + if (len < 0) { + ldm_error("len %d < 0", len); + return false; + } + len += VBLK_SIZE_PRT3; + if (len > get_unaligned_be32(buffer + 0x14)) { + ldm_error("len %d > BE32(buffer + 0x14) %d", len, + get_unaligned_be32(buffer + 0x14)); + return false; + } + part = &vb->vblk.part; + part->start = get_unaligned_be64(buffer + 0x24 + r_name); + part->volume_offset = get_unaligned_be64(buffer + 0x2C + r_name); + part->size = ldm_get_vnum(buffer + 0x34 + r_name); + part->parent_id = ldm_get_vnum(buffer + 0x34 + r_size); + part->disk_id = ldm_get_vnum(buffer + 0x34 + r_parent); + if (vb->flags & VBLK_FLAG_PART_INDEX) + part->partnum = buffer[0x35 + r_diskid]; + else + part->partnum = 0; + return true; +} + +/** + * ldm_parse_vol5 - Read a raw VBLK Volume object into a vblk structure + * @buffer: Block of data being worked on + * @buflen: Size of the block of data + * @vb: In-memory vblk in which to return information + * + * Read a raw VBLK Volume object (version 5) into a vblk structure. + * + * Return: 'true' @vb contains a Volume VBLK + * 'false' @vb contents are not defined + */ +static bool ldm_parse_vol5(const u8 *buffer, int buflen, struct vblk *vb) +{ + int r_objid, r_name, r_vtype, r_disable_drive_letter, r_child, r_size; + int r_id1, r_id2, r_size2, r_drive, len; + struct vblk_volu *volu; + + BUG_ON(!buffer || !vb); + r_objid = ldm_relative(buffer, buflen, 0x18, 0); + if (r_objid < 0) { + ldm_error("r_objid %d < 0", r_objid); + return false; + } + r_name = ldm_relative(buffer, buflen, 0x18, r_objid); + if (r_name < 0) { + ldm_error("r_name %d < 0", r_name); + return false; + } + r_vtype = ldm_relative(buffer, buflen, 0x18, r_name); + if (r_vtype < 0) { + ldm_error("r_vtype %d < 0", r_vtype); + return false; + } + r_disable_drive_letter = ldm_relative(buffer, buflen, 0x18, r_vtype); + if (r_disable_drive_letter < 0) { + ldm_error("r_disable_drive_letter %d < 0", + r_disable_drive_letter); + return false; + } + r_child = ldm_relative(buffer, buflen, 0x2D, r_disable_drive_letter); + if (r_child < 0) { + ldm_error("r_child %d < 0", r_child); + return false; + } + r_size = ldm_relative(buffer, buflen, 0x3D, r_child); + if (r_size < 0) { + ldm_error("r_size %d < 0", r_size); + return false; + } + if (buffer[0x12] & VBLK_FLAG_VOLU_ID1) { + r_id1 = ldm_relative(buffer, buflen, 0x52, r_size); + if (r_id1 < 0) { + ldm_error("r_id1 %d < 0", r_id1); + return false; + } + } else + r_id1 = r_size; + if (buffer[0x12] & VBLK_FLAG_VOLU_ID2) { + r_id2 = ldm_relative(buffer, buflen, 0x52, r_id1); + if (r_id2 < 0) { + ldm_error("r_id2 %d < 0", r_id2); + return false; + } + } else + r_id2 = r_id1; + if (buffer[0x12] & VBLK_FLAG_VOLU_SIZE) { + r_size2 = ldm_relative(buffer, buflen, 0x52, r_id2); + if (r_size2 < 0) { + ldm_error("r_size2 %d < 0", r_size2); + return false; + } + } else + r_size2 = r_id2; + if (buffer[0x12] & VBLK_FLAG_VOLU_DRIVE) { + r_drive = ldm_relative(buffer, buflen, 0x52, r_size2); + if (r_drive < 0) { + ldm_error("r_drive %d < 0", r_drive); + return false; + } + } else + r_drive = r_size2; + len = r_drive; + if (len < 0) { + ldm_error("len %d < 0", len); + return false; + } + len += VBLK_SIZE_VOL5; + if (len > get_unaligned_be32(buffer + 0x14)) { + ldm_error("len %d > BE32(buffer + 0x14) %d", len, + get_unaligned_be32(buffer + 0x14)); + return false; + } + volu = &vb->vblk.volu; + ldm_get_vstr(buffer + 0x18 + r_name, volu->volume_type, + sizeof(volu->volume_type)); + memcpy(volu->volume_state, buffer + 0x18 + r_disable_drive_letter, + sizeof(volu->volume_state)); + volu->size = ldm_get_vnum(buffer + 0x3D + r_child); + volu->partition_type = buffer[0x41 + r_size]; + memcpy(volu->guid, buffer + 0x42 + r_size, sizeof(volu->guid)); + if (buffer[0x12] & VBLK_FLAG_VOLU_DRIVE) { + ldm_get_vstr(buffer + 0x52 + r_size, volu->drive_hint, + sizeof(volu->drive_hint)); + } + return true; +} + +/** + * ldm_parse_vblk - Read a raw VBLK object into a vblk structure + * @buf: Block of data being worked on + * @len: Size of the block of data + * @vb: In-memory vblk in which to return information + * + * Read a raw VBLK object into a vblk structure. This function just reads the + * information common to all VBLK types, then delegates the rest of the work to + * helper functions: ldm_parse_*. + * + * Return: 'true' @vb contains a VBLK + * 'false' @vb contents are not defined + */ +static bool ldm_parse_vblk (const u8 *buf, int len, struct vblk *vb) +{ + bool result = false; + int r_objid; + + BUG_ON (!buf || !vb); + + r_objid = ldm_relative (buf, len, 0x18, 0); + if (r_objid < 0) { + ldm_error ("VBLK header is corrupt."); + return false; + } + + vb->flags = buf[0x12]; + vb->type = buf[0x13]; + vb->obj_id = ldm_get_vnum (buf + 0x18); + ldm_get_vstr (buf+0x18+r_objid, vb->name, sizeof (vb->name)); + + switch (vb->type) { + case VBLK_CMP3: result = ldm_parse_cmp3 (buf, len, vb); break; + case VBLK_DSK3: result = ldm_parse_dsk3 (buf, len, vb); break; + case VBLK_DSK4: result = ldm_parse_dsk4 (buf, len, vb); break; + case VBLK_DGR3: result = ldm_parse_dgr3 (buf, len, vb); break; + case VBLK_DGR4: result = ldm_parse_dgr4 (buf, len, vb); break; + case VBLK_PRT3: result = ldm_parse_prt3 (buf, len, vb); break; + case VBLK_VOL5: result = ldm_parse_vol5 (buf, len, vb); break; + } + + if (result) + ldm_debug ("Parsed VBLK 0x%llx (type: 0x%02x) ok.", + (unsigned long long) vb->obj_id, vb->type); + else + ldm_error ("Failed to parse VBLK 0x%llx (type: 0x%02x).", + (unsigned long long) vb->obj_id, vb->type); + + return result; +} + + +/** + * ldm_ldmdb_add - Adds a raw VBLK entry to the ldmdb database + * @data: Raw VBLK to add to the database + * @len: Size of the raw VBLK + * @ldb: Cache of the database structures + * + * The VBLKs are sorted into categories. Partitions are also sorted by offset. + * + * N.B. This function does not check the validity of the VBLKs. + * + * Return: 'true' The VBLK was added + * 'false' An error occurred + */ +static bool ldm_ldmdb_add (u8 *data, int len, struct ldmdb *ldb) +{ + struct vblk *vb; + struct list_head *item; + + BUG_ON (!data || !ldb); + + vb = kmalloc (sizeof (*vb), GFP_KERNEL); + if (!vb) { + ldm_crit ("Out of memory."); + return false; + } + + if (!ldm_parse_vblk (data, len, vb)) { + kfree(vb); + return false; /* Already logged */ + } + + /* Put vblk into the correct list. */ + switch (vb->type) { + case VBLK_DGR3: + case VBLK_DGR4: + list_add (&vb->list, &ldb->v_dgrp); + break; + case VBLK_DSK3: + case VBLK_DSK4: + list_add (&vb->list, &ldb->v_disk); + break; + case VBLK_VOL5: + list_add (&vb->list, &ldb->v_volu); + break; + case VBLK_CMP3: + list_add (&vb->list, &ldb->v_comp); + break; + case VBLK_PRT3: + /* Sort by the partition's start sector. */ + list_for_each (item, &ldb->v_part) { + struct vblk *v = list_entry (item, struct vblk, list); + if ((v->vblk.part.disk_id == vb->vblk.part.disk_id) && + (v->vblk.part.start > vb->vblk.part.start)) { + list_add_tail (&vb->list, &v->list); + return true; + } + } + list_add_tail (&vb->list, &ldb->v_part); + break; + } + return true; +} + +/** + * ldm_frag_add - Add a VBLK fragment to a list + * @data: Raw fragment to be added to the list + * @size: Size of the raw fragment + * @frags: Linked list of VBLK fragments + * + * Fragmented VBLKs may not be consecutive in the database, so they are placed + * in a list so they can be pieced together later. + * + * Return: 'true' Success, the VBLK was added to the list + * 'false' Error, a problem occurred + */ +static bool ldm_frag_add (const u8 *data, int size, struct list_head *frags) +{ + struct frag *f; + struct list_head *item; + int rec, num, group; + + BUG_ON (!data || !frags); + + if (size < 2 * VBLK_SIZE_HEAD) { + ldm_error("Value of size is too small."); + return false; + } + + group = get_unaligned_be32(data + 0x08); + rec = get_unaligned_be16(data + 0x0C); + num = get_unaligned_be16(data + 0x0E); + if ((num < 1) || (num > 4)) { + ldm_error ("A VBLK claims to have %d parts.", num); + return false; + } + if (rec >= num) { + ldm_error("REC value (%d) exceeds NUM value (%d)", rec, num); + return false; + } + + list_for_each (item, frags) { + f = list_entry (item, struct frag, list); + if (f->group == group) + goto found; + } + + f = kmalloc (sizeof (*f) + size*num, GFP_KERNEL); + if (!f) { + ldm_crit ("Out of memory."); + return false; + } + + f->group = group; + f->num = num; + f->rec = rec; + f->map = 0xFF << num; + + list_add_tail (&f->list, frags); +found: + if (rec >= f->num) { + ldm_error("REC value (%d) exceeds NUM value (%d)", rec, f->num); + return false; + } + if (f->map & (1 << rec)) { + ldm_error ("Duplicate VBLK, part %d.", rec); + f->map &= 0x7F; /* Mark the group as broken */ + return false; + } + f->map |= (1 << rec); + if (!rec) + memcpy(f->data, data, VBLK_SIZE_HEAD); + data += VBLK_SIZE_HEAD; + size -= VBLK_SIZE_HEAD; + memcpy(f->data + VBLK_SIZE_HEAD + rec * size, data, size); + return true; +} + +/** + * ldm_frag_free - Free a linked list of VBLK fragments + * @list: Linked list of fragments + * + * Free a linked list of VBLK fragments + * + * Return: none + */ +static void ldm_frag_free (struct list_head *list) +{ + struct list_head *item, *tmp; + + BUG_ON (!list); + + list_for_each_safe (item, tmp, list) + kfree (list_entry (item, struct frag, list)); +} + +/** + * ldm_frag_commit - Validate fragmented VBLKs and add them to the database + * @frags: Linked list of VBLK fragments + * @ldb: Cache of the database structures + * + * Now that all the fragmented VBLKs have been collected, they must be added to + * the database for later use. + * + * Return: 'true' All the fragments we added successfully + * 'false' One or more of the fragments we invalid + */ +static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb) +{ + struct frag *f; + struct list_head *item; + + BUG_ON (!frags || !ldb); + + list_for_each (item, frags) { + f = list_entry (item, struct frag, list); + + if (f->map != 0xFF) { + ldm_error ("VBLK group %d is incomplete (0x%02x).", + f->group, f->map); + return false; + } + + if (!ldm_ldmdb_add (f->data, f->num*ldb->vm.vblk_size, ldb)) + return false; /* Already logged */ + } + return true; +} + +/** + * ldm_get_vblks - Read the on-disk database of VBLKs into memory + * @state: Partition check state including device holding the LDM Database + * @base: Offset, into @state->bdev, of the database + * @ldb: Cache of the database structures + * + * To use the information from the VBLKs, they need to be read from the disk, + * unpacked and validated. We cache them in @ldb according to their type. + * + * Return: 'true' All the VBLKs were read successfully + * 'false' An error occurred + */ +static bool ldm_get_vblks(struct parsed_partitions *state, unsigned long base, + struct ldmdb *ldb) +{ + int size, perbuf, skip, finish, s, v, recs; + u8 *data = NULL; + Sector sect; + bool result = false; + LIST_HEAD (frags); + + BUG_ON(!state || !ldb); + + size = ldb->vm.vblk_size; + perbuf = 512 / size; + skip = ldb->vm.vblk_offset >> 9; /* Bytes to sectors */ + finish = (size * ldb->vm.last_vblk_seq) >> 9; + + for (s = skip; s < finish; s++) { /* For each sector */ + data = read_part_sector(state, base + OFF_VMDB + s, §); + if (!data) { + ldm_crit ("Disk read failed."); + goto out; + } + + for (v = 0; v < perbuf; v++, data+=size) { /* For each vblk */ + if (MAGIC_VBLK != get_unaligned_be32(data)) { + ldm_error ("Expected to find a VBLK."); + goto out; + } + + recs = get_unaligned_be16(data + 0x0E); /* Number of records */ + if (recs == 1) { + if (!ldm_ldmdb_add (data, size, ldb)) + goto out; /* Already logged */ + } else if (recs > 1) { + if (!ldm_frag_add (data, size, &frags)) + goto out; /* Already logged */ + } + /* else Record is not in use, ignore it. */ + } + put_dev_sector (sect); + data = NULL; + } + + result = ldm_frag_commit (&frags, ldb); /* Failures, already logged */ +out: + if (data) + put_dev_sector (sect); + ldm_frag_free (&frags); + + return result; +} + +/** + * ldm_free_vblks - Free a linked list of vblk's + * @lh: Head of a linked list of struct vblk + * + * Free a list of vblk's and free the memory used to maintain the list. + * + * Return: none + */ +static void ldm_free_vblks (struct list_head *lh) +{ + struct list_head *item, *tmp; + + BUG_ON (!lh); + + list_for_each_safe (item, tmp, lh) + kfree (list_entry (item, struct vblk, list)); +} + + +/** + * ldm_partition - Find out whether a device is a dynamic disk and handle it + * @state: Partition check state including device holding the LDM Database + * + * This determines whether the device @bdev is a dynamic disk and if so creates + * the partitions necessary in the gendisk structure pointed to by @hd. + * + * We create a dummy device 1, which contains the LDM database, and then create + * each partition described by the LDM database in sequence as devices 2+. For + * example, if the device is hda, we would have: hda1: LDM database, hda2, hda3, + * and so on: the actual data containing partitions. + * + * Return: 1 Success, @state->bdev is a dynamic disk and we handled it + * 0 Success, @state->bdev is not a dynamic disk + * -1 An error occurred before enough information had been read + * Or @state->bdev is a dynamic disk, but it may be corrupted + */ +int ldm_partition(struct parsed_partitions *state) +{ + struct ldmdb *ldb; + unsigned long base; + int result = -1; + + BUG_ON(!state); + + /* Look for signs of a Dynamic Disk */ + if (!ldm_validate_partition_table(state)) + return 0; + + ldb = kmalloc (sizeof (*ldb), GFP_KERNEL); + if (!ldb) { + ldm_crit ("Out of memory."); + goto out; + } + + /* Parse and check privheads. */ + if (!ldm_validate_privheads(state, &ldb->ph)) + goto out; /* Already logged */ + + /* All further references are relative to base (database start). */ + base = ldb->ph.config_start; + + /* Parse and check tocs and vmdb. */ + if (!ldm_validate_tocblocks(state, base, ldb) || + !ldm_validate_vmdb(state, base, ldb)) + goto out; /* Already logged */ + + /* Initialize vblk lists in ldmdb struct */ + INIT_LIST_HEAD (&ldb->v_dgrp); + INIT_LIST_HEAD (&ldb->v_disk); + INIT_LIST_HEAD (&ldb->v_volu); + INIT_LIST_HEAD (&ldb->v_comp); + INIT_LIST_HEAD (&ldb->v_part); + + if (!ldm_get_vblks(state, base, ldb)) { + ldm_crit ("Failed to read the VBLKs from the database."); + goto cleanup; + } + + /* Finally, create the data partition devices. */ + if (ldm_create_data_partitions(state, ldb)) { + ldm_debug ("Parsed LDM database successfully."); + result = 1; + } + /* else Already logged */ + +cleanup: + ldm_free_vblks (&ldb->v_dgrp); + ldm_free_vblks (&ldb->v_disk); + ldm_free_vblks (&ldb->v_volu); + ldm_free_vblks (&ldb->v_comp); + ldm_free_vblks (&ldb->v_part); +out: + kfree (ldb); + return result; +} diff --git a/block/partitions/ldm.h b/block/partitions/ldm.h new file mode 100644 index 000000000..8693704dc --- /dev/null +++ b/block/partitions/ldm.h @@ -0,0 +1,194 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/** + * ldm - Part of the Linux-NTFS project. + * + * Copyright (C) 2001,2002 Richard Russon + * Copyright (c) 2001-2007 Anton Altaparmakov + * Copyright (C) 2001,2002 Jakob Kemi + * + * Documentation is available at http://www.linux-ntfs.org/doku.php?id=downloads + */ + +#ifndef _FS_PT_LDM_H_ +#define _FS_PT_LDM_H_ + +#include +#include +#include +#include +#include +#include + +struct parsed_partitions; + +/* Magic numbers in CPU format. */ +#define MAGIC_VMDB 0x564D4442 /* VMDB */ +#define MAGIC_VBLK 0x56424C4B /* VBLK */ +#define MAGIC_PRIVHEAD 0x5052495648454144ULL /* PRIVHEAD */ +#define MAGIC_TOCBLOCK 0x544F43424C4F434BULL /* TOCBLOCK */ + +/* The defined vblk types. */ +#define VBLK_VOL5 0x51 /* Volume, version 5 */ +#define VBLK_CMP3 0x32 /* Component, version 3 */ +#define VBLK_PRT3 0x33 /* Partition, version 3 */ +#define VBLK_DSK3 0x34 /* Disk, version 3 */ +#define VBLK_DSK4 0x44 /* Disk, version 4 */ +#define VBLK_DGR3 0x35 /* Disk Group, version 3 */ +#define VBLK_DGR4 0x45 /* Disk Group, version 4 */ + +/* vblk flags indicating extra information will be present */ +#define VBLK_FLAG_COMP_STRIPE 0x10 +#define VBLK_FLAG_PART_INDEX 0x08 +#define VBLK_FLAG_DGR3_IDS 0x08 +#define VBLK_FLAG_DGR4_IDS 0x08 +#define VBLK_FLAG_VOLU_ID1 0x08 +#define VBLK_FLAG_VOLU_ID2 0x20 +#define VBLK_FLAG_VOLU_SIZE 0x80 +#define VBLK_FLAG_VOLU_DRIVE 0x02 + +/* size of a vblk's static parts */ +#define VBLK_SIZE_HEAD 16 +#define VBLK_SIZE_CMP3 22 /* Name and version */ +#define VBLK_SIZE_DGR3 12 +#define VBLK_SIZE_DGR4 44 +#define VBLK_SIZE_DSK3 12 +#define VBLK_SIZE_DSK4 45 +#define VBLK_SIZE_PRT3 28 +#define VBLK_SIZE_VOL5 58 + +/* component types */ +#define COMP_STRIPE 0x01 /* Stripe-set */ +#define COMP_BASIC 0x02 /* Basic disk */ +#define COMP_RAID 0x03 /* Raid-set */ + +/* Other constants. */ +#define LDM_DB_SIZE 2048 /* Size in sectors (= 1MiB). */ + +#define OFF_PRIV1 6 /* Offset of the first privhead + relative to the start of the + device in sectors */ + +/* Offsets to structures within the LDM Database in sectors. */ +#define OFF_PRIV2 1856 /* Backup private headers. */ +#define OFF_PRIV3 2047 + +#define OFF_TOCB1 1 /* Tables of contents. */ +#define OFF_TOCB2 2 +#define OFF_TOCB3 2045 +#define OFF_TOCB4 2046 + +#define OFF_VMDB 17 /* List of partitions. */ + +#define LDM_PARTITION 0x42 /* Formerly SFS (Landis). */ + +#define TOC_BITMAP1 "config" /* Names of the two defined */ +#define TOC_BITMAP2 "log" /* bitmaps in the TOCBLOCK. */ + +struct frag { /* VBLK Fragment handling */ + struct list_head list; + u32 group; + u8 num; /* Total number of records */ + u8 rec; /* This is record number n */ + u8 map; /* Which portions are in use */ + u8 data[]; +}; + +/* In memory LDM database structures. */ + +struct privhead { /* Offsets and sizes are in sectors. */ + u16 ver_major; + u16 ver_minor; + u64 logical_disk_start; + u64 logical_disk_size; + u64 config_start; + u64 config_size; + uuid_t disk_id; +}; + +struct tocblock { /* We have exactly two bitmaps. */ + u8 bitmap1_name[16]; + u64 bitmap1_start; + u64 bitmap1_size; + u8 bitmap2_name[16]; + u64 bitmap2_start; + u64 bitmap2_size; +}; + +struct vmdb { /* VMDB: The database header */ + u16 ver_major; + u16 ver_minor; + u32 vblk_size; + u32 vblk_offset; + u32 last_vblk_seq; +}; + +struct vblk_comp { /* VBLK Component */ + u8 state[16]; + u64 parent_id; + u8 type; + u8 children; + u16 chunksize; +}; + +struct vblk_dgrp { /* VBLK Disk Group */ + u8 disk_id[64]; +}; + +struct vblk_disk { /* VBLK Disk */ + uuid_t disk_id; + u8 alt_name[128]; +}; + +struct vblk_part { /* VBLK Partition */ + u64 start; + u64 size; /* start, size and vol_off in sectors */ + u64 volume_offset; + u64 parent_id; + u64 disk_id; + u8 partnum; +}; + +struct vblk_volu { /* VBLK Volume */ + u8 volume_type[16]; + u8 volume_state[16]; + u8 guid[16]; + u8 drive_hint[4]; + u64 size; + u8 partition_type; +}; + +struct vblk_head { /* VBLK standard header */ + u32 group; + u16 rec; + u16 nrec; +}; + +struct vblk { /* Generalised VBLK */ + u8 name[64]; + u64 obj_id; + u32 sequence; + u8 flags; + u8 type; + union { + struct vblk_comp comp; + struct vblk_dgrp dgrp; + struct vblk_disk disk; + struct vblk_part part; + struct vblk_volu volu; + } vblk; + struct list_head list; +}; + +struct ldmdb { /* Cache of the database */ + struct privhead ph; + struct tocblock toc; + struct vmdb vm; + struct list_head v_dgrp; + struct list_head v_disk; + struct list_head v_volu; + struct list_head v_comp; + struct list_head v_part; +}; + +#endif /* _FS_PT_LDM_H_ */ + diff --git a/block/partitions/mac.c b/block/partitions/mac.c new file mode 100644 index 000000000..b60953356 --- /dev/null +++ b/block/partitions/mac.c @@ -0,0 +1,143 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/partitions/mac.c + * + * Code extracted from drivers/block/genhd.c + * Copyright (C) 1991-1998 Linus Torvalds + * Re-organised Feb 1998 Russell King + */ + +#include +#include "check.h" +#include "mac.h" + +#ifdef CONFIG_PPC_PMAC +#include +extern void note_bootable_part(dev_t dev, int part, int goodness); +#endif + +/* + * Code to understand MacOS partition tables. + */ + +static inline void mac_fix_string(char *stg, int len) +{ + int i; + + for (i = len - 1; i >= 0 && stg[i] == ' '; i--) + stg[i] = 0; +} + +int mac_partition(struct parsed_partitions *state) +{ + Sector sect; + unsigned char *data; + int slot, blocks_in_map; + unsigned secsize, datasize, partoffset; +#ifdef CONFIG_PPC_PMAC + int found_root = 0; + int found_root_goodness = 0; +#endif + struct mac_partition *part; + struct mac_driver_desc *md; + + /* Get 0th block and look at the first partition map entry. */ + md = read_part_sector(state, 0, §); + if (!md) + return -1; + if (be16_to_cpu(md->signature) != MAC_DRIVER_MAGIC) { + put_dev_sector(sect); + return 0; + } + secsize = be16_to_cpu(md->block_size); + put_dev_sector(sect); + datasize = round_down(secsize, 512); + data = read_part_sector(state, datasize / 512, §); + if (!data) + return -1; + partoffset = secsize % 512; + if (partoffset + sizeof(*part) > datasize) + return -1; + part = (struct mac_partition *) (data + partoffset); + if (be16_to_cpu(part->signature) != MAC_PARTITION_MAGIC) { + put_dev_sector(sect); + return 0; /* not a MacOS disk */ + } + blocks_in_map = be32_to_cpu(part->map_count); + if (blocks_in_map < 0 || blocks_in_map >= DISK_MAX_PARTS) { + put_dev_sector(sect); + return 0; + } + + if (blocks_in_map >= state->limit) + blocks_in_map = state->limit - 1; + + strlcat(state->pp_buf, " [mac]", PAGE_SIZE); + for (slot = 1; slot <= blocks_in_map; ++slot) { + int pos = slot * secsize; + put_dev_sector(sect); + data = read_part_sector(state, pos/512, §); + if (!data) + return -1; + part = (struct mac_partition *) (data + pos%512); + if (be16_to_cpu(part->signature) != MAC_PARTITION_MAGIC) + break; + put_partition(state, slot, + be32_to_cpu(part->start_block) * (secsize/512), + be32_to_cpu(part->block_count) * (secsize/512)); + + if (!strncasecmp(part->type, "Linux_RAID", 10)) + state->parts[slot].flags = ADDPART_FLAG_RAID; +#ifdef CONFIG_PPC_PMAC + /* + * If this is the first bootable partition, tell the + * setup code, in case it wants to make this the root. + */ + if (machine_is(powermac)) { + int goodness = 0; + + mac_fix_string(part->processor, 16); + mac_fix_string(part->name, 32); + mac_fix_string(part->type, 32); + + if ((be32_to_cpu(part->status) & MAC_STATUS_BOOTABLE) + && strcasecmp(part->processor, "powerpc") == 0) + goodness++; + + if (strcasecmp(part->type, "Apple_UNIX_SVR2") == 0 + || (strncasecmp(part->type, "Linux", 5) == 0 + && strcasecmp(part->type, "Linux_swap") != 0)) { + int i, l; + + goodness++; + l = strlen(part->name); + if (strcmp(part->name, "/") == 0) + goodness++; + for (i = 0; i <= l - 4; ++i) { + if (strncasecmp(part->name + i, "root", + 4) == 0) { + goodness += 2; + break; + } + } + if (strncasecmp(part->name, "swap", 4) == 0) + goodness--; + } + + if (goodness > found_root_goodness) { + found_root = slot; + found_root_goodness = goodness; + } + } +#endif /* CONFIG_PPC_PMAC */ + } +#ifdef CONFIG_PPC_PMAC + if (found_root_goodness) + note_bootable_part(state->bdev->bd_dev, found_root, + found_root_goodness); +#endif + + put_dev_sector(sect); + strlcat(state->pp_buf, "\n", PAGE_SIZE); + return 1; +} diff --git a/block/partitions/mac.h b/block/partitions/mac.h new file mode 100644 index 000000000..0e41c9da7 --- /dev/null +++ b/block/partitions/mac.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * fs/partitions/mac.h + */ + +#define MAC_PARTITION_MAGIC 0x504d + +/* type field value for A/UX or other Unix partitions */ +#define APPLE_AUX_TYPE "Apple_UNIX_SVR2" + +struct mac_partition { + __be16 signature; /* expected to be MAC_PARTITION_MAGIC */ + __be16 res1; + __be32 map_count; /* # blocks in partition map */ + __be32 start_block; /* absolute starting block # of partition */ + __be32 block_count; /* number of blocks in partition */ + char name[32]; /* partition name */ + char type[32]; /* string type description */ + __be32 data_start; /* rel block # of first data block */ + __be32 data_count; /* number of data blocks */ + __be32 status; /* partition status bits */ + __be32 boot_start; + __be32 boot_size; + __be32 boot_load; + __be32 boot_load2; + __be32 boot_entry; + __be32 boot_entry2; + __be32 boot_cksum; + char processor[16]; /* identifies ISA of boot */ + /* there is more stuff after this that we don't need */ +}; + +#define MAC_STATUS_BOOTABLE 8 /* partition is bootable */ + +#define MAC_DRIVER_MAGIC 0x4552 + +/* Driver descriptor structure, in block 0 */ +struct mac_driver_desc { + __be16 signature; /* expected to be MAC_DRIVER_MAGIC */ + __be16 block_size; + __be32 block_count; + /* ... more stuff */ +}; + diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c new file mode 100644 index 000000000..c94de377c --- /dev/null +++ b/block/partitions/msdos.c @@ -0,0 +1,715 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/partitions/msdos.c + * + * Code extracted from drivers/block/genhd.c + * Copyright (C) 1991-1998 Linus Torvalds + * + * Thanks to Branko Lankester, lankeste@fwi.uva.nl, who found a bug + * in the early extended-partition checks and added DM partitions + * + * Support for DiskManager v6.0x added by Mark Lord, + * with information provided by OnTrack. This now works for linux fdisk + * and LILO, as well as loadlin and bootln. Note that disks other than + * /dev/hda *must* have a "DOS" type 0x51 partition in the first slot (hda1). + * + * More flexible handling of extended partitions - aeb, 950831 + * + * Check partition table on IDE disks for common CHS translations + * + * Re-organised Feb 1998 Russell King + * + * BSD disklabel support by Yossi Gottlieb + * updated by Marc Espie + * + * Unixware slices support by Andrzej Krzysztofowicz + * and Krzysztof G. Baranowski + */ +#include +#include + +#include "check.h" +#include "efi.h" + +/* + * Many architectures don't like unaligned accesses, while + * the nr_sects and start_sect partition table entries are + * at a 2 (mod 4) address. + */ +#include + +static inline sector_t nr_sects(struct msdos_partition *p) +{ + return (sector_t)get_unaligned_le32(&p->nr_sects); +} + +static inline sector_t start_sect(struct msdos_partition *p) +{ + return (sector_t)get_unaligned_le32(&p->start_sect); +} + +static inline int is_extended_partition(struct msdos_partition *p) +{ + return (p->sys_ind == DOS_EXTENDED_PARTITION || + p->sys_ind == WIN98_EXTENDED_PARTITION || + p->sys_ind == LINUX_EXTENDED_PARTITION); +} + +#define MSDOS_LABEL_MAGIC1 0x55 +#define MSDOS_LABEL_MAGIC2 0xAA + +static inline int +msdos_magic_present(unsigned char *p) +{ + return (p[0] == MSDOS_LABEL_MAGIC1 && p[1] == MSDOS_LABEL_MAGIC2); +} + +/* Value is EBCDIC 'IBMA' */ +#define AIX_LABEL_MAGIC1 0xC9 +#define AIX_LABEL_MAGIC2 0xC2 +#define AIX_LABEL_MAGIC3 0xD4 +#define AIX_LABEL_MAGIC4 0xC1 +static int aix_magic_present(struct parsed_partitions *state, unsigned char *p) +{ + struct msdos_partition *pt = (struct msdos_partition *) (p + 0x1be); + Sector sect; + unsigned char *d; + int slot, ret = 0; + + if (!(p[0] == AIX_LABEL_MAGIC1 && + p[1] == AIX_LABEL_MAGIC2 && + p[2] == AIX_LABEL_MAGIC3 && + p[3] == AIX_LABEL_MAGIC4)) + return 0; + + /* + * Assume the partition table is valid if Linux partitions exists. + * Note that old Solaris/x86 partitions use the same indicator as + * Linux swap partitions, so we consider that a Linux partition as + * well. + */ + for (slot = 1; slot <= 4; slot++, pt++) { + if (pt->sys_ind == SOLARIS_X86_PARTITION || + pt->sys_ind == LINUX_RAID_PARTITION || + pt->sys_ind == LINUX_DATA_PARTITION || + pt->sys_ind == LINUX_LVM_PARTITION || + is_extended_partition(pt)) + return 0; + } + d = read_part_sector(state, 7, §); + if (d) { + if (d[0] == '_' && d[1] == 'L' && d[2] == 'V' && d[3] == 'M') + ret = 1; + put_dev_sector(sect); + } + return ret; +} + +static void set_info(struct parsed_partitions *state, int slot, + u32 disksig) +{ + struct partition_meta_info *info = &state->parts[slot].info; + + snprintf(info->uuid, sizeof(info->uuid), "%08x-%02x", disksig, + slot); + info->volname[0] = 0; + state->parts[slot].has_info = true; +} + +/* + * Create devices for each logical partition in an extended partition. + * The logical partitions form a linked list, with each entry being + * a partition table with two entries. The first entry + * is the real data partition (with a start relative to the partition + * table start). The second is a pointer to the next logical partition + * (with a start relative to the entire extended partition). + * We do not create a Linux partition for the partition tables, but + * only for the actual data partitions. + */ + +static void parse_extended(struct parsed_partitions *state, + sector_t first_sector, sector_t first_size, + u32 disksig) +{ + struct msdos_partition *p; + Sector sect; + unsigned char *data; + sector_t this_sector, this_size; + sector_t sector_size = bdev_logical_block_size(state->bdev) / 512; + int loopct = 0; /* number of links followed + without finding a data partition */ + int i; + + this_sector = first_sector; + this_size = first_size; + + while (1) { + if (++loopct > 100) + return; + if (state->next == state->limit) + return; + data = read_part_sector(state, this_sector, §); + if (!data) + return; + + if (!msdos_magic_present(data + 510)) + goto done; + + p = (struct msdos_partition *) (data + 0x1be); + + /* + * Usually, the first entry is the real data partition, + * the 2nd entry is the next extended partition, or empty, + * and the 3rd and 4th entries are unused. + * However, DRDOS sometimes has the extended partition as + * the first entry (when the data partition is empty), + * and OS/2 seems to use all four entries. + */ + + /* + * First process the data partition(s) + */ + for (i = 0; i < 4; i++, p++) { + sector_t offs, size, next; + + if (!nr_sects(p) || is_extended_partition(p)) + continue; + + /* Check the 3rd and 4th entries - + these sometimes contain random garbage */ + offs = start_sect(p)*sector_size; + size = nr_sects(p)*sector_size; + next = this_sector + offs; + if (i >= 2) { + if (offs + size > this_size) + continue; + if (next < first_sector) + continue; + if (next + size > first_sector + first_size) + continue; + } + + put_partition(state, state->next, next, size); + set_info(state, state->next, disksig); + if (p->sys_ind == LINUX_RAID_PARTITION) + state->parts[state->next].flags = ADDPART_FLAG_RAID; + loopct = 0; + if (++state->next == state->limit) + goto done; + } + /* + * Next, process the (first) extended partition, if present. + * (So far, there seems to be no reason to make + * parse_extended() recursive and allow a tree + * of extended partitions.) + * It should be a link to the next logical partition. + */ + p -= 4; + for (i = 0; i < 4; i++, p++) + if (nr_sects(p) && is_extended_partition(p)) + break; + if (i == 4) + goto done; /* nothing left to do */ + + this_sector = first_sector + start_sect(p) * sector_size; + this_size = nr_sects(p) * sector_size; + put_dev_sector(sect); + } +done: + put_dev_sector(sect); +} + +#define SOLARIS_X86_NUMSLICE 16 +#define SOLARIS_X86_VTOC_SANE (0x600DDEEEUL) + +struct solaris_x86_slice { + __le16 s_tag; /* ID tag of partition */ + __le16 s_flag; /* permission flags */ + __le32 s_start; /* start sector no of partition */ + __le32 s_size; /* # of blocks in partition */ +}; + +struct solaris_x86_vtoc { + unsigned int v_bootinfo[3]; /* info needed by mboot */ + __le32 v_sanity; /* to verify vtoc sanity */ + __le32 v_version; /* layout version */ + char v_volume[8]; /* volume name */ + __le16 v_sectorsz; /* sector size in bytes */ + __le16 v_nparts; /* number of partitions */ + unsigned int v_reserved[10]; /* free space */ + struct solaris_x86_slice + v_slice[SOLARIS_X86_NUMSLICE]; /* slice headers */ + unsigned int timestamp[SOLARIS_X86_NUMSLICE]; /* timestamp */ + char v_asciilabel[128]; /* for compatibility */ +}; + +/* james@bpgc.com: Solaris has a nasty indicator: 0x82 which also + indicates linux swap. Be careful before believing this is Solaris. */ + +static void parse_solaris_x86(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin) +{ +#ifdef CONFIG_SOLARIS_X86_PARTITION + Sector sect; + struct solaris_x86_vtoc *v; + int i; + short max_nparts; + + v = read_part_sector(state, offset + 1, §); + if (!v) + return; + if (le32_to_cpu(v->v_sanity) != SOLARIS_X86_VTOC_SANE) { + put_dev_sector(sect); + return; + } + { + char tmp[1 + BDEVNAME_SIZE + 10 + 11 + 1]; + + snprintf(tmp, sizeof(tmp), " %s%d: name, origin); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + } + if (le32_to_cpu(v->v_version) != 1) { + char tmp[64]; + + snprintf(tmp, sizeof(tmp), " cannot handle version %d vtoc>\n", + le32_to_cpu(v->v_version)); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + put_dev_sector(sect); + return; + } + /* Ensure we can handle previous case of VTOC with 8 entries gracefully */ + max_nparts = le16_to_cpu(v->v_nparts) > 8 ? SOLARIS_X86_NUMSLICE : 8; + for (i = 0; i < max_nparts && state->next < state->limit; i++) { + struct solaris_x86_slice *s = &v->v_slice[i]; + char tmp[3 + 10 + 1 + 1]; + + if (s->s_size == 0) + continue; + snprintf(tmp, sizeof(tmp), " [s%d]", i); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + /* solaris partitions are relative to current MS-DOS + * one; must add the offset of the current partition */ + put_partition(state, state->next++, + le32_to_cpu(s->s_start)+offset, + le32_to_cpu(s->s_size)); + } + put_dev_sector(sect); + strlcat(state->pp_buf, " >\n", PAGE_SIZE); +#endif +} + +/* check against BSD src/sys/sys/disklabel.h for consistency */ +#define BSD_DISKMAGIC (0x82564557UL) /* The disk magic number */ +#define BSD_MAXPARTITIONS 16 +#define OPENBSD_MAXPARTITIONS 16 +#define BSD_FS_UNUSED 0 /* disklabel unused partition entry ID */ +struct bsd_disklabel { + __le32 d_magic; /* the magic number */ + __s16 d_type; /* drive type */ + __s16 d_subtype; /* controller/d_type specific */ + char d_typename[16]; /* type name, e.g. "eagle" */ + char d_packname[16]; /* pack identifier */ + __u32 d_secsize; /* # of bytes per sector */ + __u32 d_nsectors; /* # of data sectors per track */ + __u32 d_ntracks; /* # of tracks per cylinder */ + __u32 d_ncylinders; /* # of data cylinders per unit */ + __u32 d_secpercyl; /* # of data sectors per cylinder */ + __u32 d_secperunit; /* # of data sectors per unit */ + __u16 d_sparespertrack; /* # of spare sectors per track */ + __u16 d_sparespercyl; /* # of spare sectors per cylinder */ + __u32 d_acylinders; /* # of alt. cylinders per unit */ + __u16 d_rpm; /* rotational speed */ + __u16 d_interleave; /* hardware sector interleave */ + __u16 d_trackskew; /* sector 0 skew, per track */ + __u16 d_cylskew; /* sector 0 skew, per cylinder */ + __u32 d_headswitch; /* head switch time, usec */ + __u32 d_trkseek; /* track-to-track seek, usec */ + __u32 d_flags; /* generic flags */ +#define NDDATA 5 + __u32 d_drivedata[NDDATA]; /* drive-type specific information */ +#define NSPARE 5 + __u32 d_spare[NSPARE]; /* reserved for future use */ + __le32 d_magic2; /* the magic number (again) */ + __le16 d_checksum; /* xor of data incl. partitions */ + + /* filesystem and partition information: */ + __le16 d_npartitions; /* number of partitions in following */ + __le32 d_bbsize; /* size of boot area at sn0, bytes */ + __le32 d_sbsize; /* max size of fs superblock, bytes */ + struct bsd_partition { /* the partition table */ + __le32 p_size; /* number of sectors in partition */ + __le32 p_offset; /* starting sector */ + __le32 p_fsize; /* filesystem basic fragment size */ + __u8 p_fstype; /* filesystem type, see below */ + __u8 p_frag; /* filesystem fragments per block */ + __le16 p_cpg; /* filesystem cylinders per group */ + } d_partitions[BSD_MAXPARTITIONS]; /* actually may be more */ +}; + +#if defined(CONFIG_BSD_DISKLABEL) +/* + * Create devices for BSD partitions listed in a disklabel, under a + * dos-like partition. See parse_extended() for more information. + */ +static void parse_bsd(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin, char *flavour, + int max_partitions) +{ + Sector sect; + struct bsd_disklabel *l; + struct bsd_partition *p; + char tmp[64]; + + l = read_part_sector(state, offset + 1, §); + if (!l) + return; + if (le32_to_cpu(l->d_magic) != BSD_DISKMAGIC) { + put_dev_sector(sect); + return; + } + + snprintf(tmp, sizeof(tmp), " %s%d: <%s:", state->name, origin, flavour); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + + if (le16_to_cpu(l->d_npartitions) < max_partitions) + max_partitions = le16_to_cpu(l->d_npartitions); + for (p = l->d_partitions; p - l->d_partitions < max_partitions; p++) { + sector_t bsd_start, bsd_size; + + if (state->next == state->limit) + break; + if (p->p_fstype == BSD_FS_UNUSED) + continue; + bsd_start = le32_to_cpu(p->p_offset); + bsd_size = le32_to_cpu(p->p_size); + /* FreeBSD has relative offset if C partition offset is zero */ + if (memcmp(flavour, "bsd\0", 4) == 0 && + le32_to_cpu(l->d_partitions[2].p_offset) == 0) + bsd_start += offset; + if (offset == bsd_start && size == bsd_size) + /* full parent partition, we have it already */ + continue; + if (offset > bsd_start || offset+size < bsd_start+bsd_size) { + strlcat(state->pp_buf, "bad subpartition - ignored\n", PAGE_SIZE); + continue; + } + put_partition(state, state->next++, bsd_start, bsd_size); + } + put_dev_sector(sect); + if (le16_to_cpu(l->d_npartitions) > max_partitions) { + snprintf(tmp, sizeof(tmp), " (ignored %d more)", + le16_to_cpu(l->d_npartitions) - max_partitions); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + } + strlcat(state->pp_buf, " >\n", PAGE_SIZE); +} +#endif + +static void parse_freebsd(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin) +{ +#ifdef CONFIG_BSD_DISKLABEL + parse_bsd(state, offset, size, origin, "bsd", BSD_MAXPARTITIONS); +#endif +} + +static void parse_netbsd(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin) +{ +#ifdef CONFIG_BSD_DISKLABEL + parse_bsd(state, offset, size, origin, "netbsd", BSD_MAXPARTITIONS); +#endif +} + +static void parse_openbsd(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin) +{ +#ifdef CONFIG_BSD_DISKLABEL + parse_bsd(state, offset, size, origin, "openbsd", + OPENBSD_MAXPARTITIONS); +#endif +} + +#define UNIXWARE_DISKMAGIC (0xCA5E600DUL) /* The disk magic number */ +#define UNIXWARE_DISKMAGIC2 (0x600DDEEEUL) /* The slice table magic nr */ +#define UNIXWARE_NUMSLICE 16 +#define UNIXWARE_FS_UNUSED 0 /* Unused slice entry ID */ + +struct unixware_slice { + __le16 s_label; /* label */ + __le16 s_flags; /* permission flags */ + __le32 start_sect; /* starting sector */ + __le32 nr_sects; /* number of sectors in slice */ +}; + +struct unixware_disklabel { + __le32 d_type; /* drive type */ + __le32 d_magic; /* the magic number */ + __le32 d_version; /* version number */ + char d_serial[12]; /* serial number of the device */ + __le32 d_ncylinders; /* # of data cylinders per device */ + __le32 d_ntracks; /* # of tracks per cylinder */ + __le32 d_nsectors; /* # of data sectors per track */ + __le32 d_secsize; /* # of bytes per sector */ + __le32 d_part_start; /* # of first sector of this partition*/ + __le32 d_unknown1[12]; /* ? */ + __le32 d_alt_tbl; /* byte offset of alternate table */ + __le32 d_alt_len; /* byte length of alternate table */ + __le32 d_phys_cyl; /* # of physical cylinders per device */ + __le32 d_phys_trk; /* # of physical tracks per cylinder */ + __le32 d_phys_sec; /* # of physical sectors per track */ + __le32 d_phys_bytes; /* # of physical bytes per sector */ + __le32 d_unknown2; /* ? */ + __le32 d_unknown3; /* ? */ + __le32 d_pad[8]; /* pad */ + + struct unixware_vtoc { + __le32 v_magic; /* the magic number */ + __le32 v_version; /* version number */ + char v_name[8]; /* volume name */ + __le16 v_nslices; /* # of slices */ + __le16 v_unknown1; /* ? */ + __le32 v_reserved[10]; /* reserved */ + struct unixware_slice + v_slice[UNIXWARE_NUMSLICE]; /* slice headers */ + } vtoc; +}; /* 408 */ + +/* + * Create devices for Unixware partitions listed in a disklabel, under a + * dos-like partition. See parse_extended() for more information. + */ +static void parse_unixware(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin) +{ +#ifdef CONFIG_UNIXWARE_DISKLABEL + Sector sect; + struct unixware_disklabel *l; + struct unixware_slice *p; + + l = read_part_sector(state, offset + 29, §); + if (!l) + return; + if (le32_to_cpu(l->d_magic) != UNIXWARE_DISKMAGIC || + le32_to_cpu(l->vtoc.v_magic) != UNIXWARE_DISKMAGIC2) { + put_dev_sector(sect); + return; + } + { + char tmp[1 + BDEVNAME_SIZE + 10 + 12 + 1]; + + snprintf(tmp, sizeof(tmp), " %s%d: name, origin); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + } + p = &l->vtoc.v_slice[1]; + /* I omit the 0th slice as it is the same as whole disk. */ + while (p - &l->vtoc.v_slice[0] < UNIXWARE_NUMSLICE) { + if (state->next == state->limit) + break; + + if (p->s_label != UNIXWARE_FS_UNUSED) + put_partition(state, state->next++, + le32_to_cpu(p->start_sect), + le32_to_cpu(p->nr_sects)); + p++; + } + put_dev_sector(sect); + strlcat(state->pp_buf, " >\n", PAGE_SIZE); +#endif +} + +#define MINIX_NR_SUBPARTITIONS 4 + +/* + * Minix 2.0.0/2.0.2 subpartition support. + * Anand Krishnamurthy + * Rajeev V. Pillai + */ +static void parse_minix(struct parsed_partitions *state, + sector_t offset, sector_t size, int origin) +{ +#ifdef CONFIG_MINIX_SUBPARTITION + Sector sect; + unsigned char *data; + struct msdos_partition *p; + int i; + + data = read_part_sector(state, offset, §); + if (!data) + return; + + p = (struct msdos_partition *)(data + 0x1be); + + /* The first sector of a Minix partition can have either + * a secondary MBR describing its subpartitions, or + * the normal boot sector. */ + if (msdos_magic_present(data + 510) && + p->sys_ind == MINIX_PARTITION) { /* subpartition table present */ + char tmp[1 + BDEVNAME_SIZE + 10 + 9 + 1]; + + snprintf(tmp, sizeof(tmp), " %s%d: name, origin); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + for (i = 0; i < MINIX_NR_SUBPARTITIONS; i++, p++) { + if (state->next == state->limit) + break; + /* add each partition in use */ + if (p->sys_ind == MINIX_PARTITION) + put_partition(state, state->next++, + start_sect(p), nr_sects(p)); + } + strlcat(state->pp_buf, " >\n", PAGE_SIZE); + } + put_dev_sector(sect); +#endif /* CONFIG_MINIX_SUBPARTITION */ +} + +static struct { + unsigned char id; + void (*parse)(struct parsed_partitions *, sector_t, sector_t, int); +} subtypes[] = { + {FREEBSD_PARTITION, parse_freebsd}, + {NETBSD_PARTITION, parse_netbsd}, + {OPENBSD_PARTITION, parse_openbsd}, + {MINIX_PARTITION, parse_minix}, + {UNIXWARE_PARTITION, parse_unixware}, + {SOLARIS_X86_PARTITION, parse_solaris_x86}, + {NEW_SOLARIS_X86_PARTITION, parse_solaris_x86}, + {0, NULL}, +}; + +int msdos_partition(struct parsed_partitions *state) +{ + sector_t sector_size = bdev_logical_block_size(state->bdev) / 512; + Sector sect; + unsigned char *data; + struct msdos_partition *p; + struct fat_boot_sector *fb; + int slot; + u32 disksig; + + data = read_part_sector(state, 0, §); + if (!data) + return -1; + + /* + * Note order! (some AIX disks, e.g. unbootable kind, + * have no MSDOS 55aa) + */ + if (aix_magic_present(state, data)) { + put_dev_sector(sect); +#ifdef CONFIG_AIX_PARTITION + return aix_partition(state); +#else + strlcat(state->pp_buf, " [AIX]", PAGE_SIZE); + return 0; +#endif + } + + if (!msdos_magic_present(data + 510)) { + put_dev_sector(sect); + return 0; + } + + /* + * Now that the 55aa signature is present, this is probably + * either the boot sector of a FAT filesystem or a DOS-type + * partition table. Reject this in case the boot indicator + * is not 0 or 0x80. + */ + p = (struct msdos_partition *) (data + 0x1be); + for (slot = 1; slot <= 4; slot++, p++) { + if (p->boot_ind != 0 && p->boot_ind != 0x80) { + /* + * Even without a valid boot inidicator value + * its still possible this is valid FAT filesystem + * without a partition table. + */ + fb = (struct fat_boot_sector *) data; + if (slot == 1 && fb->reserved && fb->fats + && fat_valid_media(fb->media)) { + strlcat(state->pp_buf, "\n", PAGE_SIZE); + put_dev_sector(sect); + return 1; + } else { + put_dev_sector(sect); + return 0; + } + } + } + +#ifdef CONFIG_EFI_PARTITION + p = (struct msdos_partition *) (data + 0x1be); + for (slot = 1 ; slot <= 4 ; slot++, p++) { + /* If this is an EFI GPT disk, msdos should ignore it. */ + if (p->sys_ind == EFI_PMBR_OSTYPE_EFI_GPT) { + put_dev_sector(sect); + return 0; + } + } +#endif + p = (struct msdos_partition *) (data + 0x1be); + + disksig = le32_to_cpup((__le32 *)(data + 0x1b8)); + + /* + * Look for partitions in two passes: + * First find the primary and DOS-type extended partitions. + * On the second pass look inside *BSD, Unixware and Solaris partitions. + */ + + state->next = 5; + for (slot = 1 ; slot <= 4 ; slot++, p++) { + sector_t start = start_sect(p)*sector_size; + sector_t size = nr_sects(p)*sector_size; + + if (!size) + continue; + if (is_extended_partition(p)) { + /* + * prevent someone doing mkfs or mkswap on an + * extended partition, but leave room for LILO + * FIXME: this uses one logical sector for > 512b + * sector, although it may not be enough/proper. + */ + sector_t n = 2; + + n = min(size, max(sector_size, n)); + put_partition(state, slot, start, n); + + strlcat(state->pp_buf, " <", PAGE_SIZE); + parse_extended(state, start, size, disksig); + strlcat(state->pp_buf, " >", PAGE_SIZE); + continue; + } + put_partition(state, slot, start, size); + set_info(state, slot, disksig); + if (p->sys_ind == LINUX_RAID_PARTITION) + state->parts[slot].flags = ADDPART_FLAG_RAID; + if (p->sys_ind == DM6_PARTITION) + strlcat(state->pp_buf, "[DM]", PAGE_SIZE); + if (p->sys_ind == EZD_PARTITION) + strlcat(state->pp_buf, "[EZD]", PAGE_SIZE); + } + + strlcat(state->pp_buf, "\n", PAGE_SIZE); + + /* second pass - output for each on a separate line */ + p = (struct msdos_partition *) (0x1be + data); + for (slot = 1 ; slot <= 4 ; slot++, p++) { + unsigned char id = p->sys_ind; + int n; + + if (!nr_sects(p)) + continue; + + for (n = 0; subtypes[n].parse && id != subtypes[n].id; n++) + ; + + if (!subtypes[n].parse) + continue; + subtypes[n].parse(state, start_sect(p) * sector_size, + nr_sects(p) * sector_size, slot); + } + put_dev_sector(sect); + return 1; +} diff --git a/block/partitions/osf.c b/block/partitions/osf.c new file mode 100644 index 000000000..84560d076 --- /dev/null +++ b/block/partitions/osf.c @@ -0,0 +1,87 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/partitions/osf.c + * + * Code extracted from drivers/block/genhd.c + * + * Copyright (C) 1991-1998 Linus Torvalds + * Re-organised Feb 1998 Russell King + */ + +#include "check.h" + +#define MAX_OSF_PARTITIONS 18 +#define DISKLABELMAGIC (0x82564557UL) + +int osf_partition(struct parsed_partitions *state) +{ + int i; + int slot = 1; + unsigned int npartitions; + Sector sect; + unsigned char *data; + struct disklabel { + __le32 d_magic; + __le16 d_type,d_subtype; + u8 d_typename[16]; + u8 d_packname[16]; + __le32 d_secsize; + __le32 d_nsectors; + __le32 d_ntracks; + __le32 d_ncylinders; + __le32 d_secpercyl; + __le32 d_secprtunit; + __le16 d_sparespertrack; + __le16 d_sparespercyl; + __le32 d_acylinders; + __le16 d_rpm, d_interleave, d_trackskew, d_cylskew; + __le32 d_headswitch, d_trkseek, d_flags; + __le32 d_drivedata[5]; + __le32 d_spare[5]; + __le32 d_magic2; + __le16 d_checksum; + __le16 d_npartitions; + __le32 d_bbsize, d_sbsize; + struct d_partition { + __le32 p_size; + __le32 p_offset; + __le32 p_fsize; + u8 p_fstype; + u8 p_frag; + __le16 p_cpg; + } d_partitions[MAX_OSF_PARTITIONS]; + } * label; + struct d_partition * partition; + + data = read_part_sector(state, 0, §); + if (!data) + return -1; + + label = (struct disklabel *) (data+64); + partition = label->d_partitions; + if (le32_to_cpu(label->d_magic) != DISKLABELMAGIC) { + put_dev_sector(sect); + return 0; + } + if (le32_to_cpu(label->d_magic2) != DISKLABELMAGIC) { + put_dev_sector(sect); + return 0; + } + npartitions = le16_to_cpu(label->d_npartitions); + if (npartitions > MAX_OSF_PARTITIONS) { + put_dev_sector(sect); + return 0; + } + for (i = 0 ; i < npartitions; i++, partition++) { + if (slot == state->limit) + break; + if (le32_to_cpu(partition->p_size)) + put_partition(state, slot, + le32_to_cpu(partition->p_offset), + le32_to_cpu(partition->p_size)); + slot++; + } + strlcat(state->pp_buf, "\n", PAGE_SIZE); + put_dev_sector(sect); + return 1; +} diff --git a/block/partitions/sgi.c b/block/partitions/sgi.c new file mode 100644 index 000000000..4273f1bb0 --- /dev/null +++ b/block/partitions/sgi.c @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/partitions/sgi.c + * + * Code extracted from drivers/block/genhd.c + */ + +#include "check.h" + +#define SGI_LABEL_MAGIC 0x0be5a941 + +enum { + LINUX_RAID_PARTITION = 0xfd, /* autodetect RAID partition */ +}; + +struct sgi_disklabel { + __be32 magic_mushroom; /* Big fat spliff... */ + __be16 root_part_num; /* Root partition number */ + __be16 swap_part_num; /* Swap partition number */ + s8 boot_file[16]; /* Name of boot file for ARCS */ + u8 _unused0[48]; /* Device parameter useless crapola.. */ + struct sgi_volume { + s8 name[8]; /* Name of volume */ + __be32 block_num; /* Logical block number */ + __be32 num_bytes; /* How big, in bytes */ + } volume[15]; + struct sgi_partition { + __be32 num_blocks; /* Size in logical blocks */ + __be32 first_block; /* First logical block */ + __be32 type; /* Type of this partition */ + } partitions[16]; + __be32 csum; /* Disk label checksum */ + __be32 _unused1; /* Padding */ +}; + +int sgi_partition(struct parsed_partitions *state) +{ + int i, csum; + __be32 magic; + int slot = 1; + unsigned int start, blocks; + __be32 *ui, cs; + Sector sect; + struct sgi_disklabel *label; + struct sgi_partition *p; + char b[BDEVNAME_SIZE]; + + label = read_part_sector(state, 0, §); + if (!label) + return -1; + p = &label->partitions[0]; + magic = label->magic_mushroom; + if(be32_to_cpu(magic) != SGI_LABEL_MAGIC) { + /*printk("Dev %s SGI disklabel: bad magic %08x\n", + bdevname(bdev, b), be32_to_cpu(magic));*/ + put_dev_sector(sect); + return 0; + } + ui = ((__be32 *) (label + 1)) - 1; + for(csum = 0; ui >= ((__be32 *) label);) { + cs = *ui--; + csum += be32_to_cpu(cs); + } + if(csum) { + printk(KERN_WARNING "Dev %s SGI disklabel: csum bad, label corrupted\n", + bdevname(state->bdev, b)); + put_dev_sector(sect); + return 0; + } + /* All SGI disk labels have 16 partitions, disks under Linux only + * have 15 minor's. Luckily there are always a few zero length + * partitions which we don't care about so we never overflow the + * current_minor. + */ + for(i = 0; i < 16; i++, p++) { + blocks = be32_to_cpu(p->num_blocks); + start = be32_to_cpu(p->first_block); + if (blocks) { + put_partition(state, slot, start, blocks); + if (be32_to_cpu(p->type) == LINUX_RAID_PARTITION) + state->parts[slot].flags = ADDPART_FLAG_RAID; + } + slot++; + } + strlcat(state->pp_buf, "\n", PAGE_SIZE); + put_dev_sector(sect); + return 1; +} diff --git a/block/partitions/sun.c b/block/partitions/sun.c new file mode 100644 index 000000000..47dc53ecc --- /dev/null +++ b/block/partitions/sun.c @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/partitions/sun.c + * + * Code extracted from drivers/block/genhd.c + * + * Copyright (C) 1991-1998 Linus Torvalds + * Re-organised Feb 1998 Russell King + */ + +#include "check.h" + +#define SUN_LABEL_MAGIC 0xDABE +#define SUN_VTOC_SANITY 0x600DDEEE + +enum { + SUN_WHOLE_DISK = 5, + LINUX_RAID_PARTITION = 0xfd, /* autodetect RAID partition */ +}; + +int sun_partition(struct parsed_partitions *state) +{ + int i; + __be16 csum; + int slot = 1; + __be16 *ush; + Sector sect; + struct sun_disklabel { + unsigned char info[128]; /* Informative text string */ + struct sun_vtoc { + __be32 version; /* Layout version */ + char volume[8]; /* Volume name */ + __be16 nparts; /* Number of partitions */ + struct sun_info { /* Partition hdrs, sec 2 */ + __be16 id; + __be16 flags; + } infos[8]; + __be16 padding; /* Alignment padding */ + __be32 bootinfo[3]; /* Info needed by mboot */ + __be32 sanity; /* To verify vtoc sanity */ + __be32 reserved[10]; /* Free space */ + __be32 timestamp[8]; /* Partition timestamp */ + } vtoc; + __be32 write_reinstruct; /* sectors to skip, writes */ + __be32 read_reinstruct; /* sectors to skip, reads */ + unsigned char spare[148]; /* Padding */ + __be16 rspeed; /* Disk rotational speed */ + __be16 pcylcount; /* Physical cylinder count */ + __be16 sparecyl; /* extra sects per cylinder */ + __be16 obs1; /* gap1 */ + __be16 obs2; /* gap2 */ + __be16 ilfact; /* Interleave factor */ + __be16 ncyl; /* Data cylinder count */ + __be16 nacyl; /* Alt. cylinder count */ + __be16 ntrks; /* Tracks per cylinder */ + __be16 nsect; /* Sectors per track */ + __be16 obs3; /* bhead - Label head offset */ + __be16 obs4; /* ppart - Physical Partition */ + struct sun_partition { + __be32 start_cylinder; + __be32 num_sectors; + } partitions[8]; + __be16 magic; /* Magic number */ + __be16 csum; /* Label xor'd checksum */ + } * label; + struct sun_partition *p; + unsigned long spc; + char b[BDEVNAME_SIZE]; + int use_vtoc; + int nparts; + + label = read_part_sector(state, 0, §); + if (!label) + return -1; + + p = label->partitions; + if (be16_to_cpu(label->magic) != SUN_LABEL_MAGIC) { +/* printk(KERN_INFO "Dev %s Sun disklabel: bad magic %04x\n", + bdevname(bdev, b), be16_to_cpu(label->magic)); */ + put_dev_sector(sect); + return 0; + } + /* Look at the checksum */ + ush = ((__be16 *) (label+1)) - 1; + for (csum = 0; ush >= ((__be16 *) label);) + csum ^= *ush--; + if (csum) { + printk("Dev %s Sun disklabel: Csum bad, label corrupted\n", + bdevname(state->bdev, b)); + put_dev_sector(sect); + return 0; + } + + /* Check to see if we can use the VTOC table */ + use_vtoc = ((be32_to_cpu(label->vtoc.sanity) == SUN_VTOC_SANITY) && + (be32_to_cpu(label->vtoc.version) == 1) && + (be16_to_cpu(label->vtoc.nparts) <= 8)); + + /* Use 8 partition entries if not specified in validated VTOC */ + nparts = (use_vtoc) ? be16_to_cpu(label->vtoc.nparts) : 8; + + /* + * So that old Linux-Sun partitions continue to work, + * alow the VTOC to be used under the additional condition ... + */ + use_vtoc = use_vtoc || !(label->vtoc.sanity || + label->vtoc.version || label->vtoc.nparts); + spc = be16_to_cpu(label->ntrks) * be16_to_cpu(label->nsect); + for (i = 0; i < nparts; i++, p++) { + unsigned long st_sector; + unsigned int num_sectors; + + st_sector = be32_to_cpu(p->start_cylinder) * spc; + num_sectors = be32_to_cpu(p->num_sectors); + if (num_sectors) { + put_partition(state, slot, st_sector, num_sectors); + state->parts[slot].flags = 0; + if (use_vtoc) { + if (be16_to_cpu(label->vtoc.infos[i].id) == LINUX_RAID_PARTITION) + state->parts[slot].flags |= ADDPART_FLAG_RAID; + else if (be16_to_cpu(label->vtoc.infos[i].id) == SUN_WHOLE_DISK) + state->parts[slot].flags |= ADDPART_FLAG_WHOLEDISK; + } + } + slot++; + } + strlcat(state->pp_buf, "\n", PAGE_SIZE); + put_dev_sector(sect); + return 1; +} diff --git a/block/partitions/sysv68.c b/block/partitions/sysv68.c new file mode 100644 index 000000000..6f6257fd4 --- /dev/null +++ b/block/partitions/sysv68.c @@ -0,0 +1,95 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/partitions/sysv68.c + * + * Copyright (C) 2007 Philippe De Muyter + */ + +#include "check.h" + +/* + * Volume ID structure: on first 256-bytes sector of disk + */ + +struct volumeid { + u8 vid_unused[248]; + u8 vid_mac[8]; /* ASCII string "MOTOROLA" */ +}; + +/* + * config block: second 256-bytes sector on disk + */ + +struct dkconfig { + u8 ios_unused0[128]; + __be32 ios_slcblk; /* Slice table block number */ + __be16 ios_slccnt; /* Number of entries in slice table */ + u8 ios_unused1[122]; +}; + +/* + * combined volumeid and dkconfig block + */ + +struct dkblk0 { + struct volumeid dk_vid; + struct dkconfig dk_ios; +}; + +/* + * Slice Table Structure + */ + +struct slice { + __be32 nblocks; /* slice size (in blocks) */ + __be32 blkoff; /* block offset of slice */ +}; + + +int sysv68_partition(struct parsed_partitions *state) +{ + int i, slices; + int slot = 1; + Sector sect; + unsigned char *data; + struct dkblk0 *b; + struct slice *slice; + char tmp[64]; + + data = read_part_sector(state, 0, §); + if (!data) + return -1; + + b = (struct dkblk0 *)data; + if (memcmp(b->dk_vid.vid_mac, "MOTOROLA", sizeof(b->dk_vid.vid_mac))) { + put_dev_sector(sect); + return 0; + } + slices = be16_to_cpu(b->dk_ios.ios_slccnt); + i = be32_to_cpu(b->dk_ios.ios_slcblk); + put_dev_sector(sect); + + data = read_part_sector(state, i, §); + if (!data) + return -1; + + slices -= 1; /* last slice is the whole disk */ + snprintf(tmp, sizeof(tmp), "sysV68: %s(s%u)", state->name, slices); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + slice = (struct slice *)data; + for (i = 0; i < slices; i++, slice++) { + if (slot == state->limit) + break; + if (be32_to_cpu(slice->nblocks)) { + put_partition(state, slot, + be32_to_cpu(slice->blkoff), + be32_to_cpu(slice->nblocks)); + snprintf(tmp, sizeof(tmp), "(s%u)", i); + strlcat(state->pp_buf, tmp, PAGE_SIZE); + } + slot++; + } + strlcat(state->pp_buf, "\n", PAGE_SIZE); + put_dev_sector(sect); + return 1; +} diff --git a/block/partitions/ultrix.c b/block/partitions/ultrix.c new file mode 100644 index 000000000..4aaa81043 --- /dev/null +++ b/block/partitions/ultrix.c @@ -0,0 +1,48 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * fs/partitions/ultrix.c + * + * Code extracted from drivers/block/genhd.c + * + * Re-organised Jul 1999 Russell King + */ + +#include "check.h" + +int ultrix_partition(struct parsed_partitions *state) +{ + int i; + Sector sect; + unsigned char *data; + struct ultrix_disklabel { + s32 pt_magic; /* magic no. indicating part. info exits */ + s32 pt_valid; /* set by driver if pt is current */ + struct pt_info { + s32 pi_nblocks; /* no. of sectors */ + u32 pi_blkoff; /* block offset for start */ + } pt_part[8]; + } *label; + +#define PT_MAGIC 0x032957 /* Partition magic number */ +#define PT_VALID 1 /* Indicates if struct is valid */ + + data = read_part_sector(state, (16384 - sizeof(*label))/512, §); + if (!data) + return -1; + + label = (struct ultrix_disklabel *)(data + 512 - sizeof(*label)); + + if (label->pt_magic == PT_MAGIC && label->pt_valid == PT_VALID) { + for (i=0; i<8; i++) + if (label->pt_part[i].pi_nblocks) + put_partition(state, i+1, + label->pt_part[i].pi_blkoff, + label->pt_part[i].pi_nblocks); + put_dev_sector(sect); + strlcat(state->pp_buf, "\n", PAGE_SIZE); + return 1; + } else { + put_dev_sector(sect); + return 0; + } +} diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c new file mode 100644 index 000000000..c9f009cc0 --- /dev/null +++ b/block/scsi_ioctl.c @@ -0,0 +1,891 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2001 Jens Axboe + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +struct blk_cmd_filter { + unsigned long read_ok[BLK_SCSI_CMD_PER_LONG]; + unsigned long write_ok[BLK_SCSI_CMD_PER_LONG]; +}; + +static struct blk_cmd_filter blk_default_cmd_filter; + +/* Command group 3 is reserved and should never be used. */ +const unsigned char scsi_command_size_tbl[8] = +{ + 6, 10, 10, 12, + 16, 12, 10, 10 +}; +EXPORT_SYMBOL(scsi_command_size_tbl); + +static int sg_get_version(int __user *p) +{ + static const int sg_version_num = 30527; + return put_user(sg_version_num, p); +} + +static int scsi_get_idlun(struct request_queue *q, int __user *p) +{ + return put_user(0, p); +} + +static int scsi_get_bus(struct request_queue *q, int __user *p) +{ + return put_user(0, p); +} + +static int sg_get_timeout(struct request_queue *q) +{ + return jiffies_to_clock_t(q->sg_timeout); +} + +static int sg_set_timeout(struct request_queue *q, int __user *p) +{ + int timeout, err = get_user(timeout, p); + + if (!err) + q->sg_timeout = clock_t_to_jiffies(timeout); + + return err; +} + +static int max_sectors_bytes(struct request_queue *q) +{ + unsigned int max_sectors = queue_max_sectors(q); + + max_sectors = min_t(unsigned int, max_sectors, INT_MAX >> 9); + + return max_sectors << 9; +} + +static int sg_get_reserved_size(struct request_queue *q, int __user *p) +{ + int val = min_t(int, q->sg_reserved_size, max_sectors_bytes(q)); + + return put_user(val, p); +} + +static int sg_set_reserved_size(struct request_queue *q, int __user *p) +{ + int size, err = get_user(size, p); + + if (err) + return err; + + if (size < 0) + return -EINVAL; + + q->sg_reserved_size = min(size, max_sectors_bytes(q)); + return 0; +} + +/* + * will always return that we are ATAPI even for a real SCSI drive, I'm not + * so sure this is worth doing anything about (why would you care??) + */ +static int sg_emulated_host(struct request_queue *q, int __user *p) +{ + return put_user(1, p); +} + +static void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter) +{ + /* Basic read-only commands */ + __set_bit(TEST_UNIT_READY, filter->read_ok); + __set_bit(REQUEST_SENSE, filter->read_ok); + __set_bit(READ_6, filter->read_ok); + __set_bit(READ_10, filter->read_ok); + __set_bit(READ_12, filter->read_ok); + __set_bit(READ_16, filter->read_ok); + __set_bit(READ_BUFFER, filter->read_ok); + __set_bit(READ_DEFECT_DATA, filter->read_ok); + __set_bit(READ_CAPACITY, filter->read_ok); + __set_bit(READ_LONG, filter->read_ok); + __set_bit(INQUIRY, filter->read_ok); + __set_bit(MODE_SENSE, filter->read_ok); + __set_bit(MODE_SENSE_10, filter->read_ok); + __set_bit(LOG_SENSE, filter->read_ok); + __set_bit(START_STOP, filter->read_ok); + __set_bit(GPCMD_VERIFY_10, filter->read_ok); + __set_bit(VERIFY_16, filter->read_ok); + __set_bit(REPORT_LUNS, filter->read_ok); + __set_bit(SERVICE_ACTION_IN_16, filter->read_ok); + __set_bit(RECEIVE_DIAGNOSTIC, filter->read_ok); + __set_bit(MAINTENANCE_IN, filter->read_ok); + __set_bit(GPCMD_READ_BUFFER_CAPACITY, filter->read_ok); + + /* Audio CD commands */ + __set_bit(GPCMD_PLAY_CD, filter->read_ok); + __set_bit(GPCMD_PLAY_AUDIO_10, filter->read_ok); + __set_bit(GPCMD_PLAY_AUDIO_MSF, filter->read_ok); + __set_bit(GPCMD_PLAY_AUDIO_TI, filter->read_ok); + __set_bit(GPCMD_PAUSE_RESUME, filter->read_ok); + + /* CD/DVD data reading */ + __set_bit(GPCMD_READ_CD, filter->read_ok); + __set_bit(GPCMD_READ_CD_MSF, filter->read_ok); + __set_bit(GPCMD_READ_DISC_INFO, filter->read_ok); + __set_bit(GPCMD_READ_CDVD_CAPACITY, filter->read_ok); + __set_bit(GPCMD_READ_DVD_STRUCTURE, filter->read_ok); + __set_bit(GPCMD_READ_HEADER, filter->read_ok); + __set_bit(GPCMD_READ_TRACK_RZONE_INFO, filter->read_ok); + __set_bit(GPCMD_READ_SUBCHANNEL, filter->read_ok); + __set_bit(GPCMD_READ_TOC_PMA_ATIP, filter->read_ok); + __set_bit(GPCMD_REPORT_KEY, filter->read_ok); + __set_bit(GPCMD_SCAN, filter->read_ok); + __set_bit(GPCMD_GET_CONFIGURATION, filter->read_ok); + __set_bit(GPCMD_READ_FORMAT_CAPACITIES, filter->read_ok); + __set_bit(GPCMD_GET_EVENT_STATUS_NOTIFICATION, filter->read_ok); + __set_bit(GPCMD_GET_PERFORMANCE, filter->read_ok); + __set_bit(GPCMD_SEEK, filter->read_ok); + __set_bit(GPCMD_STOP_PLAY_SCAN, filter->read_ok); + + /* Basic writing commands */ + __set_bit(WRITE_6, filter->write_ok); + __set_bit(WRITE_10, filter->write_ok); + __set_bit(WRITE_VERIFY, filter->write_ok); + __set_bit(WRITE_12, filter->write_ok); + __set_bit(WRITE_VERIFY_12, filter->write_ok); + __set_bit(WRITE_16, filter->write_ok); + __set_bit(WRITE_LONG, filter->write_ok); + __set_bit(WRITE_LONG_2, filter->write_ok); + __set_bit(WRITE_SAME, filter->write_ok); + __set_bit(WRITE_SAME_16, filter->write_ok); + __set_bit(WRITE_SAME_32, filter->write_ok); + __set_bit(ERASE, filter->write_ok); + __set_bit(GPCMD_MODE_SELECT_10, filter->write_ok); + __set_bit(MODE_SELECT, filter->write_ok); + __set_bit(LOG_SELECT, filter->write_ok); + __set_bit(GPCMD_BLANK, filter->write_ok); + __set_bit(GPCMD_CLOSE_TRACK, filter->write_ok); + __set_bit(GPCMD_FLUSH_CACHE, filter->write_ok); + __set_bit(GPCMD_FORMAT_UNIT, filter->write_ok); + __set_bit(GPCMD_REPAIR_RZONE_TRACK, filter->write_ok); + __set_bit(GPCMD_RESERVE_RZONE_TRACK, filter->write_ok); + __set_bit(GPCMD_SEND_DVD_STRUCTURE, filter->write_ok); + __set_bit(GPCMD_SEND_EVENT, filter->write_ok); + __set_bit(GPCMD_SEND_KEY, filter->write_ok); + __set_bit(GPCMD_SEND_OPC, filter->write_ok); + __set_bit(GPCMD_SEND_CUE_SHEET, filter->write_ok); + __set_bit(GPCMD_SET_SPEED, filter->write_ok); + __set_bit(GPCMD_PREVENT_ALLOW_MEDIUM_REMOVAL, filter->write_ok); + __set_bit(GPCMD_LOAD_UNLOAD, filter->write_ok); + __set_bit(GPCMD_SET_STREAMING, filter->write_ok); + __set_bit(GPCMD_SET_READ_AHEAD, filter->write_ok); + + /* ZBC Commands */ + __set_bit(ZBC_OUT, filter->write_ok); + __set_bit(ZBC_IN, filter->read_ok); +} + +int blk_verify_command(unsigned char *cmd, fmode_t mode) +{ + struct blk_cmd_filter *filter = &blk_default_cmd_filter; + + /* root can do any command. */ + if (capable(CAP_SYS_RAWIO)) + return 0; + + /* Anybody who can open the device can do a read-safe command */ + if (test_bit(cmd[0], filter->read_ok)) + return 0; + + /* Write-safe commands require a writable open */ + if (test_bit(cmd[0], filter->write_ok) && (mode & FMODE_WRITE)) + return 0; + + return -EPERM; +} +EXPORT_SYMBOL(blk_verify_command); + +static int blk_fill_sghdr_rq(struct request_queue *q, struct request *rq, + struct sg_io_hdr *hdr, fmode_t mode) +{ + struct scsi_request *req = scsi_req(rq); + + if (copy_from_user(req->cmd, hdr->cmdp, hdr->cmd_len)) + return -EFAULT; + if (blk_verify_command(req->cmd, mode)) + return -EPERM; + + /* + * fill in request structure + */ + req->cmd_len = hdr->cmd_len; + + rq->timeout = msecs_to_jiffies(hdr->timeout); + if (!rq->timeout) + rq->timeout = q->sg_timeout; + if (!rq->timeout) + rq->timeout = BLK_DEFAULT_SG_TIMEOUT; + if (rq->timeout < BLK_MIN_SG_TIMEOUT) + rq->timeout = BLK_MIN_SG_TIMEOUT; + + return 0; +} + +static int blk_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr, + struct bio *bio) +{ + struct scsi_request *req = scsi_req(rq); + int r, ret = 0; + + /* + * fill in all the output members + */ + hdr->status = req->result & 0xff; + hdr->masked_status = status_byte(req->result); + hdr->msg_status = msg_byte(req->result); + hdr->host_status = host_byte(req->result); + hdr->driver_status = driver_byte(req->result); + hdr->info = 0; + if (hdr->masked_status || hdr->host_status || hdr->driver_status) + hdr->info |= SG_INFO_CHECK; + hdr->resid = req->resid_len; + hdr->sb_len_wr = 0; + + if (req->sense_len && hdr->sbp) { + int len = min((unsigned int) hdr->mx_sb_len, req->sense_len); + + if (!copy_to_user(hdr->sbp, req->sense, len)) + hdr->sb_len_wr = len; + else + ret = -EFAULT; + } + + r = blk_rq_unmap_user(bio); + if (!ret) + ret = r; + + return ret; +} + +static int sg_io(struct request_queue *q, struct gendisk *bd_disk, + struct sg_io_hdr *hdr, fmode_t mode) +{ + unsigned long start_time; + ssize_t ret = 0; + int writing = 0; + int at_head = 0; + struct request *rq; + struct scsi_request *req; + struct bio *bio; + + if (hdr->interface_id != 'S') + return -EINVAL; + + if (hdr->dxfer_len > (queue_max_hw_sectors(q) << 9)) + return -EIO; + + if (hdr->dxfer_len) + switch (hdr->dxfer_direction) { + default: + return -EINVAL; + case SG_DXFER_TO_DEV: + writing = 1; + break; + case SG_DXFER_TO_FROM_DEV: + case SG_DXFER_FROM_DEV: + break; + } + if (hdr->flags & SG_FLAG_Q_AT_HEAD) + at_head = 1; + + ret = -ENOMEM; + rq = blk_get_request(q, writing ? REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, 0); + if (IS_ERR(rq)) + return PTR_ERR(rq); + req = scsi_req(rq); + + if (hdr->cmd_len > BLK_MAX_CDB) { + req->cmd = kzalloc(hdr->cmd_len, GFP_KERNEL); + if (!req->cmd) + goto out_put_request; + } + + ret = blk_fill_sghdr_rq(q, rq, hdr, mode); + if (ret < 0) + goto out_free_cdb; + + ret = 0; + if (hdr->iovec_count) { + struct iov_iter i; + struct iovec *iov = NULL; + + ret = import_iovec(rq_data_dir(rq), hdr->dxferp, + hdr->iovec_count, 0, &iov, &i); + if (ret < 0) + goto out_free_cdb; + + /* SG_IO howto says that the shorter of the two wins */ + iov_iter_truncate(&i, hdr->dxfer_len); + + ret = blk_rq_map_user_iov(q, rq, NULL, &i, GFP_KERNEL); + kfree(iov); + } else if (hdr->dxfer_len) + ret = blk_rq_map_user(q, rq, NULL, hdr->dxferp, hdr->dxfer_len, + GFP_KERNEL); + + if (ret) + goto out_free_cdb; + + bio = rq->bio; + req->retries = 0; + + start_time = jiffies; + + /* ignore return value. All information is passed back to caller + * (if he doesn't check that is his problem). + * N.B. a non-zero SCSI status is _not_ necessarily an error. + */ + blk_execute_rq(q, bd_disk, rq, at_head); + + hdr->duration = jiffies_to_msecs(jiffies - start_time); + + ret = blk_complete_sghdr_rq(rq, hdr, bio); + +out_free_cdb: + scsi_req_free_cmd(req); +out_put_request: + blk_put_request(rq); + return ret; +} + +/** + * sg_scsi_ioctl -- handle deprecated SCSI_IOCTL_SEND_COMMAND ioctl + * @q: request queue to send scsi commands down + * @disk: gendisk to operate on (option) + * @mode: mode used to open the file through which the ioctl has been + * submitted + * @sic: userspace structure describing the command to perform + * + * Send down the scsi command described by @sic to the device below + * the request queue @q. If @file is non-NULL it's used to perform + * fine-grained permission checks that allow users to send down + * non-destructive SCSI commands. If the caller has a struct gendisk + * available it should be passed in as @disk to allow the low level + * driver to use the information contained in it. A non-NULL @disk + * is only allowed if the caller knows that the low level driver doesn't + * need it (e.g. in the scsi subsystem). + * + * Notes: + * - This interface is deprecated - users should use the SG_IO + * interface instead, as this is a more flexible approach to + * performing SCSI commands on a device. + * - The SCSI command length is determined by examining the 1st byte + * of the given command. There is no way to override this. + * - Data transfers are limited to PAGE_SIZE + * - The length (x + y) must be at least OMAX_SB_LEN bytes long to + * accommodate the sense buffer when an error occurs. + * The sense buffer is truncated to OMAX_SB_LEN (16) bytes so that + * old code will not be surprised. + * - If a Unix error occurs (e.g. ENOMEM) then the user will receive + * a negative return and the Unix error code in 'errno'. + * If the SCSI command succeeds then 0 is returned. + * Positive numbers returned are the compacted SCSI error codes (4 + * bytes in one int) where the lowest byte is the SCSI status. + */ +int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, + struct scsi_ioctl_command __user *sic) +{ + enum { OMAX_SB_LEN = 16 }; /* For backward compatibility */ + struct request *rq; + struct scsi_request *req; + int err; + unsigned int in_len, out_len, bytes, opcode, cmdlen; + char *buffer = NULL; + + if (!sic) + return -EINVAL; + + /* + * get in an out lengths, verify they don't exceed a page worth of data + */ + if (get_user(in_len, &sic->inlen)) + return -EFAULT; + if (get_user(out_len, &sic->outlen)) + return -EFAULT; + if (in_len > PAGE_SIZE || out_len > PAGE_SIZE) + return -EINVAL; + if (get_user(opcode, sic->data)) + return -EFAULT; + + bytes = max(in_len, out_len); + if (bytes) { + buffer = kzalloc(bytes, q->bounce_gfp | GFP_USER| __GFP_NOWARN); + if (!buffer) + return -ENOMEM; + + } + + rq = blk_get_request(q, in_len ? REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, 0); + if (IS_ERR(rq)) { + err = PTR_ERR(rq); + goto error_free_buffer; + } + req = scsi_req(rq); + + cmdlen = COMMAND_SIZE(opcode); + + /* + * get command and data to send to device, if any + */ + err = -EFAULT; + req->cmd_len = cmdlen; + if (copy_from_user(req->cmd, sic->data, cmdlen)) + goto error; + + if (in_len && copy_from_user(buffer, sic->data + cmdlen, in_len)) + goto error; + + err = blk_verify_command(req->cmd, mode); + if (err) + goto error; + + /* default. possible overriden later */ + req->retries = 5; + + switch (opcode) { + case SEND_DIAGNOSTIC: + case FORMAT_UNIT: + rq->timeout = FORMAT_UNIT_TIMEOUT; + req->retries = 1; + break; + case START_STOP: + rq->timeout = START_STOP_TIMEOUT; + break; + case MOVE_MEDIUM: + rq->timeout = MOVE_MEDIUM_TIMEOUT; + break; + case READ_ELEMENT_STATUS: + rq->timeout = READ_ELEMENT_STATUS_TIMEOUT; + break; + case READ_DEFECT_DATA: + rq->timeout = READ_DEFECT_DATA_TIMEOUT; + req->retries = 1; + break; + default: + rq->timeout = BLK_DEFAULT_SG_TIMEOUT; + break; + } + + if (bytes && blk_rq_map_kern(q, rq, buffer, bytes, GFP_NOIO)) { + err = DRIVER_ERROR << 24; + goto error; + } + + blk_execute_rq(q, disk, rq, 0); + + err = req->result & 0xff; /* only 8 bit SCSI status */ + if (err) { + if (req->sense_len && req->sense) { + bytes = (OMAX_SB_LEN > req->sense_len) ? + req->sense_len : OMAX_SB_LEN; + if (copy_to_user(sic->data, req->sense, bytes)) + err = -EFAULT; + } + } else { + if (copy_to_user(sic->data, buffer, out_len)) + err = -EFAULT; + } + +error: + blk_put_request(rq); + +error_free_buffer: + kfree(buffer); + + return err; +} +EXPORT_SYMBOL_GPL(sg_scsi_ioctl); + +/* Send basic block requests */ +static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk, + int cmd, int data) +{ + struct request *rq; + int err; + + rq = blk_get_request(q, REQ_OP_SCSI_OUT, 0); + if (IS_ERR(rq)) + return PTR_ERR(rq); + rq->timeout = BLK_DEFAULT_SG_TIMEOUT; + scsi_req(rq)->cmd[0] = cmd; + scsi_req(rq)->cmd[4] = data; + scsi_req(rq)->cmd_len = 6; + blk_execute_rq(q, bd_disk, rq, 0); + err = scsi_req(rq)->result ? -EIO : 0; + blk_put_request(rq); + + return err; +} + +static inline int blk_send_start_stop(struct request_queue *q, + struct gendisk *bd_disk, int data) +{ + return __blk_send_generic(q, bd_disk, GPCMD_START_STOP_UNIT, data); +} + +int put_sg_io_hdr(const struct sg_io_hdr *hdr, void __user *argp) +{ +#ifdef CONFIG_COMPAT + if (in_compat_syscall()) { + struct compat_sg_io_hdr hdr32 = { + .interface_id = hdr->interface_id, + .dxfer_direction = hdr->dxfer_direction, + .cmd_len = hdr->cmd_len, + .mx_sb_len = hdr->mx_sb_len, + .iovec_count = hdr->iovec_count, + .dxfer_len = hdr->dxfer_len, + .dxferp = (uintptr_t)hdr->dxferp, + .cmdp = (uintptr_t)hdr->cmdp, + .sbp = (uintptr_t)hdr->sbp, + .timeout = hdr->timeout, + .flags = hdr->flags, + .pack_id = hdr->pack_id, + .usr_ptr = (uintptr_t)hdr->usr_ptr, + .status = hdr->status, + .masked_status = hdr->masked_status, + .msg_status = hdr->msg_status, + .sb_len_wr = hdr->sb_len_wr, + .host_status = hdr->host_status, + .driver_status = hdr->driver_status, + .resid = hdr->resid, + .duration = hdr->duration, + .info = hdr->info, + }; + + if (copy_to_user(argp, &hdr32, sizeof(hdr32))) + return -EFAULT; + + return 0; + } +#endif + + if (copy_to_user(argp, hdr, sizeof(*hdr))) + return -EFAULT; + + return 0; +} +EXPORT_SYMBOL(put_sg_io_hdr); + +int get_sg_io_hdr(struct sg_io_hdr *hdr, const void __user *argp) +{ +#ifdef CONFIG_COMPAT + struct compat_sg_io_hdr hdr32; + + if (in_compat_syscall()) { + if (copy_from_user(&hdr32, argp, sizeof(hdr32))) + return -EFAULT; + + *hdr = (struct sg_io_hdr) { + .interface_id = hdr32.interface_id, + .dxfer_direction = hdr32.dxfer_direction, + .cmd_len = hdr32.cmd_len, + .mx_sb_len = hdr32.mx_sb_len, + .iovec_count = hdr32.iovec_count, + .dxfer_len = hdr32.dxfer_len, + .dxferp = compat_ptr(hdr32.dxferp), + .cmdp = compat_ptr(hdr32.cmdp), + .sbp = compat_ptr(hdr32.sbp), + .timeout = hdr32.timeout, + .flags = hdr32.flags, + .pack_id = hdr32.pack_id, + .usr_ptr = compat_ptr(hdr32.usr_ptr), + .status = hdr32.status, + .masked_status = hdr32.masked_status, + .msg_status = hdr32.msg_status, + .sb_len_wr = hdr32.sb_len_wr, + .host_status = hdr32.host_status, + .driver_status = hdr32.driver_status, + .resid = hdr32.resid, + .duration = hdr32.duration, + .info = hdr32.info, + }; + + return 0; + } +#endif + + if (copy_from_user(hdr, argp, sizeof(*hdr))) + return -EFAULT; + + return 0; +} +EXPORT_SYMBOL(get_sg_io_hdr); + +#ifdef CONFIG_COMPAT +struct compat_cdrom_generic_command { + unsigned char cmd[CDROM_PACKET_SIZE]; + compat_caddr_t buffer; + compat_uint_t buflen; + compat_int_t stat; + compat_caddr_t sense; + unsigned char data_direction; + unsigned char pad[3]; + compat_int_t quiet; + compat_int_t timeout; + compat_caddr_t unused; +}; +#endif + +static int scsi_get_cdrom_generic_arg(struct cdrom_generic_command *cgc, + const void __user *arg) +{ +#ifdef CONFIG_COMPAT + if (in_compat_syscall()) { + struct compat_cdrom_generic_command cgc32; + + if (copy_from_user(&cgc32, arg, sizeof(cgc32))) + return -EFAULT; + + *cgc = (struct cdrom_generic_command) { + .buffer = compat_ptr(cgc32.buffer), + .buflen = cgc32.buflen, + .stat = cgc32.stat, + .sense = compat_ptr(cgc32.sense), + .data_direction = cgc32.data_direction, + .quiet = cgc32.quiet, + .timeout = cgc32.timeout, + .unused = compat_ptr(cgc32.unused), + }; + memcpy(&cgc->cmd, &cgc32.cmd, CDROM_PACKET_SIZE); + return 0; + } +#endif + if (copy_from_user(cgc, arg, sizeof(*cgc))) + return -EFAULT; + + return 0; +} + +static int scsi_put_cdrom_generic_arg(const struct cdrom_generic_command *cgc, + void __user *arg) +{ +#ifdef CONFIG_COMPAT + if (in_compat_syscall()) { + struct compat_cdrom_generic_command cgc32 = { + .buffer = (uintptr_t)(cgc->buffer), + .buflen = cgc->buflen, + .stat = cgc->stat, + .sense = (uintptr_t)(cgc->sense), + .data_direction = cgc->data_direction, + .quiet = cgc->quiet, + .timeout = cgc->timeout, + .unused = (uintptr_t)(cgc->unused), + }; + memcpy(&cgc32.cmd, &cgc->cmd, CDROM_PACKET_SIZE); + + if (copy_to_user(arg, &cgc32, sizeof(cgc32))) + return -EFAULT; + + return 0; + } +#endif + if (copy_to_user(arg, cgc, sizeof(*cgc))) + return -EFAULT; + + return 0; +} + +static int scsi_cdrom_send_packet(struct request_queue *q, + struct gendisk *bd_disk, + fmode_t mode, void __user *arg) +{ + struct cdrom_generic_command cgc; + struct sg_io_hdr hdr; + int err; + + err = scsi_get_cdrom_generic_arg(&cgc, arg); + if (err) + return err; + + cgc.timeout = clock_t_to_jiffies(cgc.timeout); + memset(&hdr, 0, sizeof(hdr)); + hdr.interface_id = 'S'; + hdr.cmd_len = sizeof(cgc.cmd); + hdr.dxfer_len = cgc.buflen; + switch (cgc.data_direction) { + case CGC_DATA_UNKNOWN: + hdr.dxfer_direction = SG_DXFER_UNKNOWN; + break; + case CGC_DATA_WRITE: + hdr.dxfer_direction = SG_DXFER_TO_DEV; + break; + case CGC_DATA_READ: + hdr.dxfer_direction = SG_DXFER_FROM_DEV; + break; + case CGC_DATA_NONE: + hdr.dxfer_direction = SG_DXFER_NONE; + break; + default: + return -EINVAL; + } + + hdr.dxferp = cgc.buffer; + hdr.sbp = cgc.sense; + if (hdr.sbp) + hdr.mx_sb_len = sizeof(struct request_sense); + hdr.timeout = jiffies_to_msecs(cgc.timeout); + hdr.cmdp = ((struct cdrom_generic_command __user*) arg)->cmd; + hdr.cmd_len = sizeof(cgc.cmd); + + err = sg_io(q, bd_disk, &hdr, mode); + if (err == -EFAULT) + return -EFAULT; + + if (hdr.status) + return -EIO; + + cgc.stat = err; + cgc.buflen = hdr.resid; + if (scsi_put_cdrom_generic_arg(&cgc, arg)) + return -EFAULT; + + return err; +} + +int scsi_cmd_ioctl(struct request_queue *q, struct gendisk *bd_disk, fmode_t mode, + unsigned int cmd, void __user *arg) +{ + int err; + + if (!q) + return -ENXIO; + + switch (cmd) { + /* + * new sgv3 interface + */ + case SG_GET_VERSION_NUM: + err = sg_get_version(arg); + break; + case SCSI_IOCTL_GET_IDLUN: + err = scsi_get_idlun(q, arg); + break; + case SCSI_IOCTL_GET_BUS_NUMBER: + err = scsi_get_bus(q, arg); + break; + case SG_SET_TIMEOUT: + err = sg_set_timeout(q, arg); + break; + case SG_GET_TIMEOUT: + err = sg_get_timeout(q); + break; + case SG_GET_RESERVED_SIZE: + err = sg_get_reserved_size(q, arg); + break; + case SG_SET_RESERVED_SIZE: + err = sg_set_reserved_size(q, arg); + break; + case SG_EMULATED_HOST: + err = sg_emulated_host(q, arg); + break; + case SG_IO: { + struct sg_io_hdr hdr; + + err = get_sg_io_hdr(&hdr, arg); + if (err) + break; + err = sg_io(q, bd_disk, &hdr, mode); + if (err == -EFAULT) + break; + + if (put_sg_io_hdr(&hdr, arg)) + err = -EFAULT; + break; + } + case CDROM_SEND_PACKET: + err = scsi_cdrom_send_packet(q, bd_disk, mode, arg); + break; + + /* + * old junk scsi send command ioctl + */ + case SCSI_IOCTL_SEND_COMMAND: + printk(KERN_WARNING "program %s is using a deprecated SCSI ioctl, please convert it to SG_IO\n", current->comm); + err = -EINVAL; + if (!arg) + break; + + err = sg_scsi_ioctl(q, bd_disk, mode, arg); + break; + case CDROMCLOSETRAY: + err = blk_send_start_stop(q, bd_disk, 0x03); + break; + case CDROMEJECT: + err = blk_send_start_stop(q, bd_disk, 0x02); + break; + default: + err = -ENOTTY; + } + + return err; +} +EXPORT_SYMBOL(scsi_cmd_ioctl); + +int scsi_verify_blk_ioctl(struct block_device *bd, unsigned int cmd) +{ + if (bd && !bdev_is_partition(bd)) + return 0; + + if (capable(CAP_SYS_RAWIO)) + return 0; + + return -ENOIOCTLCMD; +} +EXPORT_SYMBOL(scsi_verify_blk_ioctl); + +int scsi_cmd_blk_ioctl(struct block_device *bd, fmode_t mode, + unsigned int cmd, void __user *arg) +{ + int ret; + + ret = scsi_verify_blk_ioctl(bd, cmd); + if (ret < 0) + return ret; + + return scsi_cmd_ioctl(bd->bd_disk->queue, bd->bd_disk, mode, cmd, arg); +} +EXPORT_SYMBOL(scsi_cmd_blk_ioctl); + +/** + * scsi_req_init - initialize certain fields of a scsi_request structure + * @req: Pointer to a scsi_request structure. + * Initializes .__cmd[], .cmd, .cmd_len and .sense_len but no other members + * of struct scsi_request. + */ +void scsi_req_init(struct scsi_request *req) +{ + memset(req->__cmd, 0, sizeof(req->__cmd)); + req->cmd = req->__cmd; + req->cmd_len = BLK_MAX_CDB; + req->sense_len = 0; +} +EXPORT_SYMBOL(scsi_req_init); + +static int __init blk_scsi_ioctl_init(void) +{ + blk_set_cmd_filter_defaults(&blk_default_cmd_filter); + return 0; +} +fs_initcall(blk_scsi_ioctl_init); diff --git a/block/sed-opal.c b/block/sed-opal.c new file mode 100644 index 000000000..0ac5a4f3f --- /dev/null +++ b/block/sed-opal.c @@ -0,0 +1,2719 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright © 2016 Intel Corporation + * + * Authors: + * Scott Bauer + * Rafael Antognolli + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ":OPAL: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "opal_proto.h" + +#define IO_BUFFER_LENGTH 2048 +#define MAX_TOKS 64 + +/* Number of bytes needed by cmd_finalize. */ +#define CMD_FINALIZE_BYTES_NEEDED 7 + +struct opal_step { + int (*fn)(struct opal_dev *dev, void *data); + void *data; +}; +typedef int (cont_fn)(struct opal_dev *dev); + +enum opal_atom_width { + OPAL_WIDTH_TINY, + OPAL_WIDTH_SHORT, + OPAL_WIDTH_MEDIUM, + OPAL_WIDTH_LONG, + OPAL_WIDTH_TOKEN +}; + +/* + * On the parsed response, we don't store again the toks that are already + * stored in the response buffer. Instead, for each token, we just store a + * pointer to the position in the buffer where the token starts, and the size + * of the token in bytes. + */ +struct opal_resp_tok { + const u8 *pos; + size_t len; + enum opal_response_token type; + enum opal_atom_width width; + union { + u64 u; + s64 s; + } stored; +}; + +/* + * From the response header it's not possible to know how many tokens there are + * on the payload. So we hardcode that the maximum will be MAX_TOKS, and later + * if we start dealing with messages that have more than that, we can increase + * this number. This is done to avoid having to make two passes through the + * response, the first one counting how many tokens we have and the second one + * actually storing the positions. + */ +struct parsed_resp { + int num; + struct opal_resp_tok toks[MAX_TOKS]; +}; + +struct opal_dev { + bool supported; + bool mbr_enabled; + + void *data; + sec_send_recv *send_recv; + + struct mutex dev_lock; + u16 comid; + u32 hsn; + u32 tsn; + u64 align; + u64 lowest_lba; + + size_t pos; + u8 *cmd; + u8 *resp; + + struct parsed_resp parsed; + size_t prev_d_len; + void *prev_data; + + struct list_head unlk_lst; +}; + + +static const u8 opaluid[][OPAL_UID_LENGTH] = { + /* users */ + [OPAL_SMUID_UID] = + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff }, + [OPAL_THISSP_UID] = + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01 }, + [OPAL_ADMINSP_UID] = + { 0x00, 0x00, 0x02, 0x05, 0x00, 0x00, 0x00, 0x01 }, + [OPAL_LOCKINGSP_UID] = + { 0x00, 0x00, 0x02, 0x05, 0x00, 0x00, 0x00, 0x02 }, + [OPAL_ENTERPRISE_LOCKINGSP_UID] = + { 0x00, 0x00, 0x02, 0x05, 0x00, 0x01, 0x00, 0x01 }, + [OPAL_ANYBODY_UID] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x01 }, + [OPAL_SID_UID] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x06 }, + [OPAL_ADMIN1_UID] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x01, 0x00, 0x01 }, + [OPAL_USER1_UID] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x03, 0x00, 0x01 }, + [OPAL_USER2_UID] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x03, 0x00, 0x02 }, + [OPAL_PSID_UID] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x01, 0xff, 0x01 }, + [OPAL_ENTERPRISE_BANDMASTER0_UID] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x80, 0x01 }, + [OPAL_ENTERPRISE_ERASEMASTER_UID] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x84, 0x01 }, + + /* tables */ + [OPAL_TABLE_TABLE] = + { 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01 }, + [OPAL_LOCKINGRANGE_GLOBAL] = + { 0x00, 0x00, 0x08, 0x02, 0x00, 0x00, 0x00, 0x01 }, + [OPAL_LOCKINGRANGE_ACE_RDLOCKED] = + { 0x00, 0x00, 0x00, 0x08, 0x00, 0x03, 0xE0, 0x01 }, + [OPAL_LOCKINGRANGE_ACE_WRLOCKED] = + { 0x00, 0x00, 0x00, 0x08, 0x00, 0x03, 0xE8, 0x01 }, + [OPAL_MBRCONTROL] = + { 0x00, 0x00, 0x08, 0x03, 0x00, 0x00, 0x00, 0x01 }, + [OPAL_MBR] = + { 0x00, 0x00, 0x08, 0x04, 0x00, 0x00, 0x00, 0x00 }, + [OPAL_AUTHORITY_TABLE] = + { 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x00, 0x00}, + [OPAL_C_PIN_TABLE] = + { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x00, 0x00}, + [OPAL_LOCKING_INFO_TABLE] = + { 0x00, 0x00, 0x08, 0x01, 0x00, 0x00, 0x00, 0x01 }, + [OPAL_ENTERPRISE_LOCKING_INFO_TABLE] = + { 0x00, 0x00, 0x08, 0x01, 0x00, 0x00, 0x00, 0x00 }, + [OPAL_DATASTORE] = + { 0x00, 0x00, 0x10, 0x01, 0x00, 0x00, 0x00, 0x00 }, + + /* C_PIN_TABLE object ID's */ + [OPAL_C_PIN_MSID] = + { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x84, 0x02}, + [OPAL_C_PIN_SID] = + { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x00, 0x01}, + [OPAL_C_PIN_ADMIN1] = + { 0x00, 0x00, 0x00, 0x0B, 0x00, 0x01, 0x00, 0x01}, + + /* half UID's (only first 4 bytes used) */ + [OPAL_HALF_UID_AUTHORITY_OBJ_REF] = + { 0x00, 0x00, 0x0C, 0x05, 0xff, 0xff, 0xff, 0xff }, + [OPAL_HALF_UID_BOOLEAN_ACE] = + { 0x00, 0x00, 0x04, 0x0E, 0xff, 0xff, 0xff, 0xff }, + + /* special value for omitted optional parameter */ + [OPAL_UID_HEXFF] = + { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff}, +}; + +/* + * TCG Storage SSC Methods. + * Derived from: TCG_Storage_Architecture_Core_Spec_v2.01_r1.00 + * Section: 6.3 Assigned UIDs + */ +static const u8 opalmethod[][OPAL_METHOD_LENGTH] = { + [OPAL_PROPERTIES] = + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x01 }, + [OPAL_STARTSESSION] = + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xff, 0x02 }, + [OPAL_REVERT] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x02, 0x02 }, + [OPAL_ACTIVATE] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x02, 0x03 }, + [OPAL_EGET] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x06 }, + [OPAL_ESET] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x07 }, + [OPAL_NEXT] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x08 }, + [OPAL_EAUTHENTICATE] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x0c }, + [OPAL_GETACL] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x0d }, + [OPAL_GENKEY] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x10 }, + [OPAL_REVERTSP] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x11 }, + [OPAL_GET] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x16 }, + [OPAL_SET] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x17 }, + [OPAL_AUTHENTICATE] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x00, 0x1c }, + [OPAL_RANDOM] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x06, 0x01 }, + [OPAL_ERASE] = + { 0x00, 0x00, 0x00, 0x06, 0x00, 0x00, 0x08, 0x03 }, +}; + +static int end_opal_session_error(struct opal_dev *dev); +static int opal_discovery0_step(struct opal_dev *dev); + +struct opal_suspend_data { + struct opal_lock_unlock unlk; + u8 lr; + struct list_head node; +}; + +/* + * Derived from: + * TCG_Storage_Architecture_Core_Spec_v2.01_r1.00 + * Section: 5.1.5 Method Status Codes + */ +static const char * const opal_errors[] = { + "Success", + "Not Authorized", + "Unknown Error", + "SP Busy", + "SP Failed", + "SP Disabled", + "SP Frozen", + "No Sessions Available", + "Uniqueness Conflict", + "Insufficient Space", + "Insufficient Rows", + "Invalid Function", + "Invalid Parameter", + "Invalid Reference", + "Unknown Error", + "TPER Malfunction", + "Transaction Failure", + "Response Overflow", + "Authority Locked Out", +}; + +static const char *opal_error_to_human(int error) +{ + if (error == 0x3f) + return "Failed"; + + if (error >= ARRAY_SIZE(opal_errors) || error < 0) + return "Unknown Error"; + + return opal_errors[error]; +} + +static void print_buffer(const u8 *ptr, u32 length) +{ +#ifdef DEBUG + print_hex_dump_bytes("OPAL: ", DUMP_PREFIX_OFFSET, ptr, length); + pr_debug("\n"); +#endif +} + +static bool check_tper(const void *data) +{ + const struct d0_tper_features *tper = data; + u8 flags = tper->supported_features; + + if (!(flags & TPER_SYNC_SUPPORTED)) { + pr_debug("TPer sync not supported. flags = %d\n", + tper->supported_features); + return false; + } + + return true; +} + +static bool check_mbrenabled(const void *data) +{ + const struct d0_locking_features *lfeat = data; + u8 sup_feat = lfeat->supported_features; + + return !!(sup_feat & MBR_ENABLED_MASK); +} + +static bool check_sum(const void *data) +{ + const struct d0_single_user_mode *sum = data; + u32 nlo = be32_to_cpu(sum->num_locking_objects); + + if (nlo == 0) { + pr_debug("Need at least one locking object.\n"); + return false; + } + + pr_debug("Number of locking objects: %d\n", nlo); + + return true; +} + +static u16 get_comid_v100(const void *data) +{ + const struct d0_opal_v100 *v100 = data; + + return be16_to_cpu(v100->baseComID); +} + +static u16 get_comid_v200(const void *data) +{ + const struct d0_opal_v200 *v200 = data; + + return be16_to_cpu(v200->baseComID); +} + +static int opal_send_cmd(struct opal_dev *dev) +{ + return dev->send_recv(dev->data, dev->comid, TCG_SECP_01, + dev->cmd, IO_BUFFER_LENGTH, + true); +} + +static int opal_recv_cmd(struct opal_dev *dev) +{ + return dev->send_recv(dev->data, dev->comid, TCG_SECP_01, + dev->resp, IO_BUFFER_LENGTH, + false); +} + +static int opal_recv_check(struct opal_dev *dev) +{ + size_t buflen = IO_BUFFER_LENGTH; + void *buffer = dev->resp; + struct opal_header *hdr = buffer; + int ret; + + do { + pr_debug("Sent OPAL command: outstanding=%d, minTransfer=%d\n", + hdr->cp.outstandingData, + hdr->cp.minTransfer); + + if (hdr->cp.outstandingData == 0 || + hdr->cp.minTransfer != 0) + return 0; + + memset(buffer, 0, buflen); + ret = opal_recv_cmd(dev); + } while (!ret); + + return ret; +} + +static int opal_send_recv(struct opal_dev *dev, cont_fn *cont) +{ + int ret; + + ret = opal_send_cmd(dev); + if (ret) + return ret; + ret = opal_recv_cmd(dev); + if (ret) + return ret; + ret = opal_recv_check(dev); + if (ret) + return ret; + return cont(dev); +} + +static void check_geometry(struct opal_dev *dev, const void *data) +{ + const struct d0_geometry_features *geo = data; + + dev->align = be64_to_cpu(geo->alignment_granularity); + dev->lowest_lba = be64_to_cpu(geo->lowest_aligned_lba); +} + +static int execute_step(struct opal_dev *dev, + const struct opal_step *step, size_t stepIndex) +{ + int error = step->fn(dev, step->data); + + if (error) { + pr_debug("Step %zu (%pS) failed with error %d: %s\n", + stepIndex, step->fn, error, + opal_error_to_human(error)); + } + + return error; +} + +static int execute_steps(struct opal_dev *dev, + const struct opal_step *steps, size_t n_steps) +{ + size_t state = 0; + int error; + + /* first do a discovery0 */ + error = opal_discovery0_step(dev); + if (error) + return error; + + for (state = 0; state < n_steps; state++) { + error = execute_step(dev, &steps[state], state); + if (error) + goto out_error; + } + + return 0; + +out_error: + /* + * For each OPAL command the first step in steps starts some sort of + * session. If an error occurred in the initial discovery0 or if an + * error occurred in the first step (and thus stopping the loop with + * state == 0) then there was an error before or during the attempt to + * start a session. Therefore we shouldn't attempt to terminate a + * session, as one has not yet been created. + */ + if (state > 0) + end_opal_session_error(dev); + + return error; +} + +static int opal_discovery0_end(struct opal_dev *dev) +{ + bool found_com_id = false, supported = true, single_user = false; + const struct d0_header *hdr = (struct d0_header *)dev->resp; + const u8 *epos = dev->resp, *cpos = dev->resp; + u16 comid = 0; + u32 hlen = be32_to_cpu(hdr->length); + + print_buffer(dev->resp, hlen); + dev->mbr_enabled = false; + + if (hlen > IO_BUFFER_LENGTH - sizeof(*hdr)) { + pr_debug("Discovery length overflows buffer (%zu+%u)/%u\n", + sizeof(*hdr), hlen, IO_BUFFER_LENGTH); + return -EFAULT; + } + + epos += hlen; /* end of buffer */ + cpos += sizeof(*hdr); /* current position on buffer */ + + while (cpos < epos && supported) { + const struct d0_features *body = + (const struct d0_features *)cpos; + + switch (be16_to_cpu(body->code)) { + case FC_TPER: + supported = check_tper(body->features); + break; + case FC_SINGLEUSER: + single_user = check_sum(body->features); + break; + case FC_GEOMETRY: + check_geometry(dev, body); + break; + case FC_LOCKING: + dev->mbr_enabled = check_mbrenabled(body->features); + break; + case FC_ENTERPRISE: + case FC_DATASTORE: + /* some ignored properties */ + pr_debug("Found OPAL feature description: %d\n", + be16_to_cpu(body->code)); + break; + case FC_OPALV100: + comid = get_comid_v100(body->features); + found_com_id = true; + break; + case FC_OPALV200: + comid = get_comid_v200(body->features); + found_com_id = true; + break; + case 0xbfff ... 0xffff: + /* vendor specific, just ignore */ + break; + default: + pr_debug("OPAL Unknown feature: %d\n", + be16_to_cpu(body->code)); + + } + cpos += body->length + 4; + } + + if (!supported) { + pr_debug("This device is not Opal enabled. Not Supported!\n"); + return -EOPNOTSUPP; + } + + if (!single_user) + pr_debug("Device doesn't support single user mode\n"); + + + if (!found_com_id) { + pr_debug("Could not find OPAL comid for device. Returning early\n"); + return -EOPNOTSUPP; + } + + dev->comid = comid; + + return 0; +} + +static int opal_discovery0(struct opal_dev *dev, void *data) +{ + int ret; + + memset(dev->resp, 0, IO_BUFFER_LENGTH); + dev->comid = OPAL_DISCOVERY_COMID; + ret = opal_recv_cmd(dev); + if (ret) + return ret; + + return opal_discovery0_end(dev); +} + +static int opal_discovery0_step(struct opal_dev *dev) +{ + const struct opal_step discovery0_step = { + opal_discovery0, + }; + + return execute_step(dev, &discovery0_step, 0); +} + +static size_t remaining_size(struct opal_dev *cmd) +{ + return IO_BUFFER_LENGTH - cmd->pos; +} + +static bool can_add(int *err, struct opal_dev *cmd, size_t len) +{ + if (*err) + return false; + + if (remaining_size(cmd) < len) { + pr_debug("Error adding %zu bytes: end of buffer.\n", len); + *err = -ERANGE; + return false; + } + + return true; +} + +static void add_token_u8(int *err, struct opal_dev *cmd, u8 tok) +{ + if (!can_add(err, cmd, 1)) + return; + + cmd->cmd[cmd->pos++] = tok; +} + +static void add_short_atom_header(struct opal_dev *cmd, bool bytestring, + bool has_sign, int len) +{ + u8 atom; + int err = 0; + + atom = SHORT_ATOM_ID; + atom |= bytestring ? SHORT_ATOM_BYTESTRING : 0; + atom |= has_sign ? SHORT_ATOM_SIGNED : 0; + atom |= len & SHORT_ATOM_LEN_MASK; + + add_token_u8(&err, cmd, atom); +} + +static void add_medium_atom_header(struct opal_dev *cmd, bool bytestring, + bool has_sign, int len) +{ + u8 header0; + + header0 = MEDIUM_ATOM_ID; + header0 |= bytestring ? MEDIUM_ATOM_BYTESTRING : 0; + header0 |= has_sign ? MEDIUM_ATOM_SIGNED : 0; + header0 |= (len >> 8) & MEDIUM_ATOM_LEN_MASK; + + cmd->cmd[cmd->pos++] = header0; + cmd->cmd[cmd->pos++] = len; +} + +static void add_token_u64(int *err, struct opal_dev *cmd, u64 number) +{ + size_t len; + int msb; + + if (!(number & ~TINY_ATOM_DATA_MASK)) { + add_token_u8(err, cmd, number); + return; + } + + msb = fls64(number); + len = DIV_ROUND_UP(msb, 8); + + if (!can_add(err, cmd, len + 1)) { + pr_debug("Error adding u64: end of buffer.\n"); + return; + } + add_short_atom_header(cmd, false, false, len); + while (len--) + add_token_u8(err, cmd, number >> (len * 8)); +} + +static u8 *add_bytestring_header(int *err, struct opal_dev *cmd, size_t len) +{ + size_t header_len = 1; + bool is_short_atom = true; + + if (len & ~SHORT_ATOM_LEN_MASK) { + header_len = 2; + is_short_atom = false; + } + + if (!can_add(err, cmd, header_len + len)) { + pr_debug("Error adding bytestring: end of buffer.\n"); + return NULL; + } + + if (is_short_atom) + add_short_atom_header(cmd, true, false, len); + else + add_medium_atom_header(cmd, true, false, len); + + return &cmd->cmd[cmd->pos]; +} + +static void add_token_bytestring(int *err, struct opal_dev *cmd, + const u8 *bytestring, size_t len) +{ + u8 *start; + + start = add_bytestring_header(err, cmd, len); + if (!start) + return; + memcpy(start, bytestring, len); + cmd->pos += len; +} + +static int build_locking_range(u8 *buffer, size_t length, u8 lr) +{ + if (length > OPAL_UID_LENGTH) { + pr_debug("Can't build locking range. Length OOB\n"); + return -ERANGE; + } + + memcpy(buffer, opaluid[OPAL_LOCKINGRANGE_GLOBAL], OPAL_UID_LENGTH); + + if (lr == 0) + return 0; + + buffer[5] = LOCKING_RANGE_NON_GLOBAL; + buffer[7] = lr; + + return 0; +} + +static int build_locking_user(u8 *buffer, size_t length, u8 lr) +{ + if (length > OPAL_UID_LENGTH) { + pr_debug("Can't build locking range user. Length OOB\n"); + return -ERANGE; + } + + memcpy(buffer, opaluid[OPAL_USER1_UID], OPAL_UID_LENGTH); + + buffer[7] = lr + 1; + + return 0; +} + +static void set_comid(struct opal_dev *cmd, u16 comid) +{ + struct opal_header *hdr = (struct opal_header *)cmd->cmd; + + hdr->cp.extendedComID[0] = comid >> 8; + hdr->cp.extendedComID[1] = comid; + hdr->cp.extendedComID[2] = 0; + hdr->cp.extendedComID[3] = 0; +} + +static int cmd_finalize(struct opal_dev *cmd, u32 hsn, u32 tsn) +{ + struct opal_header *hdr; + int err = 0; + + /* + * Close the parameter list opened from cmd_start. + * The number of bytes added must be equal to + * CMD_FINALIZE_BYTES_NEEDED. + */ + add_token_u8(&err, cmd, OPAL_ENDLIST); + + add_token_u8(&err, cmd, OPAL_ENDOFDATA); + add_token_u8(&err, cmd, OPAL_STARTLIST); + add_token_u8(&err, cmd, 0); + add_token_u8(&err, cmd, 0); + add_token_u8(&err, cmd, 0); + add_token_u8(&err, cmd, OPAL_ENDLIST); + + if (err) { + pr_debug("Error finalizing command.\n"); + return -EFAULT; + } + + hdr = (struct opal_header *) cmd->cmd; + + hdr->pkt.tsn = cpu_to_be32(tsn); + hdr->pkt.hsn = cpu_to_be32(hsn); + + hdr->subpkt.length = cpu_to_be32(cmd->pos - sizeof(*hdr)); + while (cmd->pos % 4) { + if (cmd->pos >= IO_BUFFER_LENGTH) { + pr_debug("Error: Buffer overrun\n"); + return -ERANGE; + } + cmd->cmd[cmd->pos++] = 0; + } + hdr->pkt.length = cpu_to_be32(cmd->pos - sizeof(hdr->cp) - + sizeof(hdr->pkt)); + hdr->cp.length = cpu_to_be32(cmd->pos - sizeof(hdr->cp)); + + return 0; +} + +static const struct opal_resp_tok *response_get_token( + const struct parsed_resp *resp, + int n) +{ + const struct opal_resp_tok *tok; + + if (!resp) { + pr_debug("Response is NULL\n"); + return ERR_PTR(-EINVAL); + } + + if (n >= resp->num) { + pr_debug("Token number doesn't exist: %d, resp: %d\n", + n, resp->num); + return ERR_PTR(-EINVAL); + } + + tok = &resp->toks[n]; + if (tok->len == 0) { + pr_debug("Token length must be non-zero\n"); + return ERR_PTR(-EINVAL); + } + + return tok; +} + +static ssize_t response_parse_tiny(struct opal_resp_tok *tok, + const u8 *pos) +{ + tok->pos = pos; + tok->len = 1; + tok->width = OPAL_WIDTH_TINY; + + if (pos[0] & TINY_ATOM_SIGNED) { + tok->type = OPAL_DTA_TOKENID_SINT; + } else { + tok->type = OPAL_DTA_TOKENID_UINT; + tok->stored.u = pos[0] & 0x3f; + } + + return tok->len; +} + +static ssize_t response_parse_short(struct opal_resp_tok *tok, + const u8 *pos) +{ + tok->pos = pos; + tok->len = (pos[0] & SHORT_ATOM_LEN_MASK) + 1; + tok->width = OPAL_WIDTH_SHORT; + + if (pos[0] & SHORT_ATOM_BYTESTRING) { + tok->type = OPAL_DTA_TOKENID_BYTESTRING; + } else if (pos[0] & SHORT_ATOM_SIGNED) { + tok->type = OPAL_DTA_TOKENID_SINT; + } else { + u64 u_integer = 0; + ssize_t i, b = 0; + + tok->type = OPAL_DTA_TOKENID_UINT; + if (tok->len > 9) { + pr_debug("uint64 with more than 8 bytes\n"); + return -EINVAL; + } + for (i = tok->len - 1; i > 0; i--) { + u_integer |= ((u64)pos[i] << (8 * b)); + b++; + } + tok->stored.u = u_integer; + } + + return tok->len; +} + +static ssize_t response_parse_medium(struct opal_resp_tok *tok, + const u8 *pos) +{ + tok->pos = pos; + tok->len = (((pos[0] & MEDIUM_ATOM_LEN_MASK) << 8) | pos[1]) + 2; + tok->width = OPAL_WIDTH_MEDIUM; + + if (pos[0] & MEDIUM_ATOM_BYTESTRING) + tok->type = OPAL_DTA_TOKENID_BYTESTRING; + else if (pos[0] & MEDIUM_ATOM_SIGNED) + tok->type = OPAL_DTA_TOKENID_SINT; + else + tok->type = OPAL_DTA_TOKENID_UINT; + + return tok->len; +} + +static ssize_t response_parse_long(struct opal_resp_tok *tok, + const u8 *pos) +{ + tok->pos = pos; + tok->len = ((pos[1] << 16) | (pos[2] << 8) | pos[3]) + 4; + tok->width = OPAL_WIDTH_LONG; + + if (pos[0] & LONG_ATOM_BYTESTRING) + tok->type = OPAL_DTA_TOKENID_BYTESTRING; + else if (pos[0] & LONG_ATOM_SIGNED) + tok->type = OPAL_DTA_TOKENID_SINT; + else + tok->type = OPAL_DTA_TOKENID_UINT; + + return tok->len; +} + +static ssize_t response_parse_token(struct opal_resp_tok *tok, + const u8 *pos) +{ + tok->pos = pos; + tok->len = 1; + tok->type = OPAL_DTA_TOKENID_TOKEN; + tok->width = OPAL_WIDTH_TOKEN; + + return tok->len; +} + +static int response_parse(const u8 *buf, size_t length, + struct parsed_resp *resp) +{ + const struct opal_header *hdr; + struct opal_resp_tok *iter; + int num_entries = 0; + int total; + ssize_t token_length; + const u8 *pos; + u32 clen, plen, slen; + + if (!buf) + return -EFAULT; + + if (!resp) + return -EFAULT; + + hdr = (struct opal_header *)buf; + pos = buf; + pos += sizeof(*hdr); + + clen = be32_to_cpu(hdr->cp.length); + plen = be32_to_cpu(hdr->pkt.length); + slen = be32_to_cpu(hdr->subpkt.length); + pr_debug("Response size: cp: %u, pkt: %u, subpkt: %u\n", + clen, plen, slen); + + if (clen == 0 || plen == 0 || slen == 0 || + slen > IO_BUFFER_LENGTH - sizeof(*hdr)) { + pr_debug("Bad header length. cp: %u, pkt: %u, subpkt: %u\n", + clen, plen, slen); + print_buffer(pos, sizeof(*hdr)); + return -EINVAL; + } + + if (pos > buf + length) + return -EFAULT; + + iter = resp->toks; + total = slen; + print_buffer(pos, total); + while (total > 0) { + if (pos[0] <= TINY_ATOM_BYTE) /* tiny atom */ + token_length = response_parse_tiny(iter, pos); + else if (pos[0] <= SHORT_ATOM_BYTE) /* short atom */ + token_length = response_parse_short(iter, pos); + else if (pos[0] <= MEDIUM_ATOM_BYTE) /* medium atom */ + token_length = response_parse_medium(iter, pos); + else if (pos[0] <= LONG_ATOM_BYTE) /* long atom */ + token_length = response_parse_long(iter, pos); + else /* TOKEN */ + token_length = response_parse_token(iter, pos); + + if (token_length < 0) + return token_length; + + pos += token_length; + total -= token_length; + iter++; + num_entries++; + } + + resp->num = num_entries; + + return 0; +} + +static size_t response_get_string(const struct parsed_resp *resp, int n, + const char **store) +{ + u8 skip; + const struct opal_resp_tok *tok; + + *store = NULL; + tok = response_get_token(resp, n); + if (IS_ERR(tok)) + return 0; + + if (tok->type != OPAL_DTA_TOKENID_BYTESTRING) { + pr_debug("Token is not a byte string!\n"); + return 0; + } + + switch (tok->width) { + case OPAL_WIDTH_TINY: + case OPAL_WIDTH_SHORT: + skip = 1; + break; + case OPAL_WIDTH_MEDIUM: + skip = 2; + break; + case OPAL_WIDTH_LONG: + skip = 4; + break; + default: + pr_debug("Token has invalid width!\n"); + return 0; + } + + *store = tok->pos + skip; + + return tok->len - skip; +} + +static u64 response_get_u64(const struct parsed_resp *resp, int n) +{ + const struct opal_resp_tok *tok; + + tok = response_get_token(resp, n); + if (IS_ERR(tok)) + return 0; + + if (tok->type != OPAL_DTA_TOKENID_UINT) { + pr_debug("Token is not unsigned int: %d\n", tok->type); + return 0; + } + + if (tok->width != OPAL_WIDTH_TINY && tok->width != OPAL_WIDTH_SHORT) { + pr_debug("Atom is not short or tiny: %d\n", tok->width); + return 0; + } + + return tok->stored.u; +} + +static bool response_token_matches(const struct opal_resp_tok *token, u8 match) +{ + if (IS_ERR(token) || + token->type != OPAL_DTA_TOKENID_TOKEN || + token->pos[0] != match) + return false; + return true; +} + +static u8 response_status(const struct parsed_resp *resp) +{ + const struct opal_resp_tok *tok; + + tok = response_get_token(resp, 0); + if (response_token_matches(tok, OPAL_ENDOFSESSION)) + return 0; + + if (resp->num < 5) + return DTAERROR_NO_METHOD_STATUS; + + tok = response_get_token(resp, resp->num - 5); + if (!response_token_matches(tok, OPAL_STARTLIST)) + return DTAERROR_NO_METHOD_STATUS; + + tok = response_get_token(resp, resp->num - 1); + if (!response_token_matches(tok, OPAL_ENDLIST)) + return DTAERROR_NO_METHOD_STATUS; + + return response_get_u64(resp, resp->num - 4); +} + +/* Parses and checks for errors */ +static int parse_and_check_status(struct opal_dev *dev) +{ + int error; + + print_buffer(dev->cmd, dev->pos); + + error = response_parse(dev->resp, IO_BUFFER_LENGTH, &dev->parsed); + if (error) { + pr_debug("Couldn't parse response.\n"); + return error; + } + + return response_status(&dev->parsed); +} + +static void clear_opal_cmd(struct opal_dev *dev) +{ + dev->pos = sizeof(struct opal_header); + memset(dev->cmd, 0, IO_BUFFER_LENGTH); +} + +static int cmd_start(struct opal_dev *dev, const u8 *uid, const u8 *method) +{ + int err = 0; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + add_token_u8(&err, dev, OPAL_CALL); + add_token_bytestring(&err, dev, uid, OPAL_UID_LENGTH); + add_token_bytestring(&err, dev, method, OPAL_METHOD_LENGTH); + + /* + * Every method call is followed by its parameters enclosed within + * OPAL_STARTLIST and OPAL_ENDLIST tokens. We automatically open the + * parameter list here and close it later in cmd_finalize. + */ + add_token_u8(&err, dev, OPAL_STARTLIST); + + return err; +} + +static int start_opal_session_cont(struct opal_dev *dev) +{ + u32 hsn, tsn; + int error = 0; + + error = parse_and_check_status(dev); + if (error) + return error; + + hsn = response_get_u64(&dev->parsed, 4); + tsn = response_get_u64(&dev->parsed, 5); + + if (hsn != GENERIC_HOST_SESSION_NUM || tsn < FIRST_TPER_SESSION_NUM) { + pr_debug("Couldn't authenticate session\n"); + return -EPERM; + } + + dev->hsn = hsn; + dev->tsn = tsn; + + return 0; +} + +static void add_suspend_info(struct opal_dev *dev, + struct opal_suspend_data *sus) +{ + struct opal_suspend_data *iter; + + list_for_each_entry(iter, &dev->unlk_lst, node) { + if (iter->lr == sus->lr) { + list_del(&iter->node); + kfree(iter); + break; + } + } + list_add_tail(&sus->node, &dev->unlk_lst); +} + +static int end_session_cont(struct opal_dev *dev) +{ + dev->hsn = 0; + dev->tsn = 0; + + return parse_and_check_status(dev); +} + +static int finalize_and_send(struct opal_dev *dev, cont_fn cont) +{ + int ret; + + ret = cmd_finalize(dev, dev->hsn, dev->tsn); + if (ret) { + pr_debug("Error finalizing command buffer: %d\n", ret); + return ret; + } + + print_buffer(dev->cmd, dev->pos); + + return opal_send_recv(dev, cont); +} + +/* + * request @column from table @table on device @dev. On success, the column + * data will be available in dev->resp->tok[4] + */ +static int generic_get_column(struct opal_dev *dev, const u8 *table, + u64 column) +{ + int err; + + err = cmd_start(dev, table, opalmethod[OPAL_GET]); + + add_token_u8(&err, dev, OPAL_STARTLIST); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_STARTCOLUMN); + add_token_u64(&err, dev, column); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_ENDCOLUMN); + add_token_u64(&err, dev, column); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_ENDLIST); + + if (err) + return err; + + return finalize_and_send(dev, parse_and_check_status); +} + +/* + * see TCG SAS 5.3.2.3 for a description of the available columns + * + * the result is provided in dev->resp->tok[4] + */ +static int generic_get_table_info(struct opal_dev *dev, const u8 *table_uid, + u64 column) +{ + u8 uid[OPAL_UID_LENGTH]; + const unsigned int half = OPAL_UID_LENGTH_HALF; + + /* sed-opal UIDs can be split in two halves: + * first: actual table index + * second: relative index in the table + * so we have to get the first half of the OPAL_TABLE_TABLE and use the + * first part of the target table as relative index into that table + */ + memcpy(uid, opaluid[OPAL_TABLE_TABLE], half); + memcpy(uid + half, table_uid, half); + + return generic_get_column(dev, uid, column); +} + +static int gen_key(struct opal_dev *dev, void *data) +{ + u8 uid[OPAL_UID_LENGTH]; + int err; + + memcpy(uid, dev->prev_data, min(sizeof(uid), dev->prev_d_len)); + kfree(dev->prev_data); + dev->prev_data = NULL; + + err = cmd_start(dev, uid, opalmethod[OPAL_GENKEY]); + + if (err) { + pr_debug("Error building gen key command\n"); + return err; + + } + + return finalize_and_send(dev, parse_and_check_status); +} + +static int get_active_key_cont(struct opal_dev *dev) +{ + const char *activekey; + size_t keylen; + int error = 0; + + error = parse_and_check_status(dev); + if (error) + return error; + + keylen = response_get_string(&dev->parsed, 4, &activekey); + if (!activekey) { + pr_debug("%s: Couldn't extract the Activekey from the response\n", + __func__); + return OPAL_INVAL_PARAM; + } + + dev->prev_data = kmemdup(activekey, keylen, GFP_KERNEL); + + if (!dev->prev_data) + return -ENOMEM; + + dev->prev_d_len = keylen; + + return 0; +} + +static int get_active_key(struct opal_dev *dev, void *data) +{ + u8 uid[OPAL_UID_LENGTH]; + int err; + u8 *lr = data; + + err = build_locking_range(uid, sizeof(uid), *lr); + if (err) + return err; + + err = generic_get_column(dev, uid, OPAL_ACTIVEKEY); + if (err) + return err; + + return get_active_key_cont(dev); +} + +static int generic_table_write_data(struct opal_dev *dev, const u64 data, + u64 offset, u64 size, const u8 *uid) +{ + const u8 __user *src = (u8 __user *)(uintptr_t)data; + u8 *dst; + u64 len; + size_t off = 0; + int err; + + /* do we fit in the available space? */ + err = generic_get_table_info(dev, uid, OPAL_TABLE_ROWS); + if (err) { + pr_debug("Couldn't get the table size\n"); + return err; + } + + len = response_get_u64(&dev->parsed, 4); + if (size > len || offset > len - size) { + pr_debug("Does not fit in the table (%llu vs. %llu)\n", + offset + size, len); + return -ENOSPC; + } + + /* do the actual transmission(s) */ + while (off < size) { + err = cmd_start(dev, uid, opalmethod[OPAL_SET]); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_WHERE); + add_token_u64(&err, dev, offset + off); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_VALUES); + + /* + * The bytestring header is either 1 or 2 bytes, so assume 2. + * There also needs to be enough space to accommodate the + * trailing OPAL_ENDNAME (1 byte) and tokens added by + * cmd_finalize. + */ + len = min(remaining_size(dev) - (2+1+CMD_FINALIZE_BYTES_NEEDED), + (size_t)(size - off)); + pr_debug("Write bytes %zu+%llu/%llu\n", off, len, size); + + dst = add_bytestring_header(&err, dev, len); + if (!dst) + break; + + if (copy_from_user(dst, src + off, len)) { + err = -EFAULT; + break; + } + + dev->pos += len; + + add_token_u8(&err, dev, OPAL_ENDNAME); + if (err) + break; + + err = finalize_and_send(dev, parse_and_check_status); + if (err) + break; + + off += len; + } + + return err; +} + +static int generic_lr_enable_disable(struct opal_dev *dev, + u8 *uid, bool rle, bool wle, + bool rl, bool wl) +{ + int err; + + err = cmd_start(dev, uid, opalmethod[OPAL_SET]); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_VALUES); + add_token_u8(&err, dev, OPAL_STARTLIST); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_READLOCKENABLED); + add_token_u8(&err, dev, rle); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_WRITELOCKENABLED); + add_token_u8(&err, dev, wle); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_READLOCKED); + add_token_u8(&err, dev, rl); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_WRITELOCKED); + add_token_u8(&err, dev, wl); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + + return err; +} + +static inline int enable_global_lr(struct opal_dev *dev, u8 *uid, + struct opal_user_lr_setup *setup) +{ + int err; + + err = generic_lr_enable_disable(dev, uid, !!setup->RLE, !!setup->WLE, + 0, 0); + if (err) + pr_debug("Failed to create enable global lr command\n"); + + return err; +} + +static int setup_locking_range(struct opal_dev *dev, void *data) +{ + u8 uid[OPAL_UID_LENGTH]; + struct opal_user_lr_setup *setup = data; + u8 lr; + int err; + + lr = setup->session.opal_key.lr; + err = build_locking_range(uid, sizeof(uid), lr); + if (err) + return err; + + if (lr == 0) + err = enable_global_lr(dev, uid, setup); + else { + err = cmd_start(dev, uid, opalmethod[OPAL_SET]); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_VALUES); + add_token_u8(&err, dev, OPAL_STARTLIST); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_RANGESTART); + add_token_u64(&err, dev, setup->range_start); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_RANGELENGTH); + add_token_u64(&err, dev, setup->range_length); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_READLOCKENABLED); + add_token_u64(&err, dev, !!setup->RLE); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_WRITELOCKENABLED); + add_token_u64(&err, dev, !!setup->WLE); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + } + if (err) { + pr_debug("Error building Setup Locking range command.\n"); + return err; + } + + return finalize_and_send(dev, parse_and_check_status); +} + +static int start_generic_opal_session(struct opal_dev *dev, + enum opal_uid auth, + enum opal_uid sp_type, + const char *key, + u8 key_len) +{ + u32 hsn; + int err; + + if (key == NULL && auth != OPAL_ANYBODY_UID) + return OPAL_INVAL_PARAM; + + hsn = GENERIC_HOST_SESSION_NUM; + err = cmd_start(dev, opaluid[OPAL_SMUID_UID], + opalmethod[OPAL_STARTSESSION]); + + add_token_u64(&err, dev, hsn); + add_token_bytestring(&err, dev, opaluid[sp_type], OPAL_UID_LENGTH); + add_token_u8(&err, dev, 1); + + switch (auth) { + case OPAL_ANYBODY_UID: + break; + case OPAL_ADMIN1_UID: + case OPAL_SID_UID: + case OPAL_PSID_UID: + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 0); /* HostChallenge */ + add_token_bytestring(&err, dev, key, key_len); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 3); /* HostSignAuth */ + add_token_bytestring(&err, dev, opaluid[auth], + OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_ENDNAME); + break; + default: + pr_debug("Cannot start Admin SP session with auth %d\n", auth); + return OPAL_INVAL_PARAM; + } + + if (err) { + pr_debug("Error building start adminsp session command.\n"); + return err; + } + + return finalize_and_send(dev, start_opal_session_cont); +} + +static int start_anybodyASP_opal_session(struct opal_dev *dev, void *data) +{ + return start_generic_opal_session(dev, OPAL_ANYBODY_UID, + OPAL_ADMINSP_UID, NULL, 0); +} + +static int start_SIDASP_opal_session(struct opal_dev *dev, void *data) +{ + int ret; + const u8 *key = dev->prev_data; + + if (!key) { + const struct opal_key *okey = data; + + ret = start_generic_opal_session(dev, OPAL_SID_UID, + OPAL_ADMINSP_UID, + okey->key, + okey->key_len); + } else { + ret = start_generic_opal_session(dev, OPAL_SID_UID, + OPAL_ADMINSP_UID, + key, dev->prev_d_len); + kfree(key); + dev->prev_data = NULL; + } + + return ret; +} + +static int start_admin1LSP_opal_session(struct opal_dev *dev, void *data) +{ + struct opal_key *key = data; + + return start_generic_opal_session(dev, OPAL_ADMIN1_UID, + OPAL_LOCKINGSP_UID, + key->key, key->key_len); +} + +static int start_PSID_opal_session(struct opal_dev *dev, void *data) +{ + const struct opal_key *okey = data; + + return start_generic_opal_session(dev, OPAL_PSID_UID, + OPAL_ADMINSP_UID, + okey->key, + okey->key_len); +} + +static int start_auth_opal_session(struct opal_dev *dev, void *data) +{ + struct opal_session_info *session = data; + u8 lk_ul_user[OPAL_UID_LENGTH]; + size_t keylen = session->opal_key.key_len; + int err = 0; + + u8 *key = session->opal_key.key; + u32 hsn = GENERIC_HOST_SESSION_NUM; + + if (session->sum) + err = build_locking_user(lk_ul_user, sizeof(lk_ul_user), + session->opal_key.lr); + else if (session->who != OPAL_ADMIN1 && !session->sum) + err = build_locking_user(lk_ul_user, sizeof(lk_ul_user), + session->who - 1); + else + memcpy(lk_ul_user, opaluid[OPAL_ADMIN1_UID], OPAL_UID_LENGTH); + + if (err) + return err; + + err = cmd_start(dev, opaluid[OPAL_SMUID_UID], + opalmethod[OPAL_STARTSESSION]); + + add_token_u64(&err, dev, hsn); + add_token_bytestring(&err, dev, opaluid[OPAL_LOCKINGSP_UID], + OPAL_UID_LENGTH); + add_token_u8(&err, dev, 1); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 0); + add_token_bytestring(&err, dev, key, keylen); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 3); + add_token_bytestring(&err, dev, lk_ul_user, OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_ENDNAME); + + if (err) { + pr_debug("Error building STARTSESSION command.\n"); + return err; + } + + return finalize_and_send(dev, start_opal_session_cont); +} + +static int revert_tper(struct opal_dev *dev, void *data) +{ + int err; + + err = cmd_start(dev, opaluid[OPAL_ADMINSP_UID], + opalmethod[OPAL_REVERT]); + if (err) { + pr_debug("Error building REVERT TPER command.\n"); + return err; + } + + return finalize_and_send(dev, parse_and_check_status); +} + +static int internal_activate_user(struct opal_dev *dev, void *data) +{ + struct opal_session_info *session = data; + u8 uid[OPAL_UID_LENGTH]; + int err; + + memcpy(uid, opaluid[OPAL_USER1_UID], OPAL_UID_LENGTH); + uid[7] = session->who; + + err = cmd_start(dev, uid, opalmethod[OPAL_SET]); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_VALUES); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 5); /* Enabled */ + add_token_u8(&err, dev, OPAL_TRUE); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + + if (err) { + pr_debug("Error building Activate UserN command.\n"); + return err; + } + + return finalize_and_send(dev, parse_and_check_status); +} + +static int erase_locking_range(struct opal_dev *dev, void *data) +{ + struct opal_session_info *session = data; + u8 uid[OPAL_UID_LENGTH]; + int err; + + if (build_locking_range(uid, sizeof(uid), session->opal_key.lr) < 0) + return -ERANGE; + + err = cmd_start(dev, uid, opalmethod[OPAL_ERASE]); + + if (err) { + pr_debug("Error building Erase Locking Range Command.\n"); + return err; + } + + return finalize_and_send(dev, parse_and_check_status); +} + +static int set_mbr_done(struct opal_dev *dev, void *data) +{ + u8 *mbr_done_tf = data; + int err; + + err = cmd_start(dev, opaluid[OPAL_MBRCONTROL], + opalmethod[OPAL_SET]); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_VALUES); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_MBRDONE); + add_token_u8(&err, dev, *mbr_done_tf); /* Done T or F */ + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + + if (err) { + pr_debug("Error Building set MBR Done command\n"); + return err; + } + + return finalize_and_send(dev, parse_and_check_status); +} + +static int set_mbr_enable_disable(struct opal_dev *dev, void *data) +{ + u8 *mbr_en_dis = data; + int err; + + err = cmd_start(dev, opaluid[OPAL_MBRCONTROL], + opalmethod[OPAL_SET]); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_VALUES); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_MBRENABLE); + add_token_u8(&err, dev, *mbr_en_dis); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + + if (err) { + pr_debug("Error Building set MBR done command\n"); + return err; + } + + return finalize_and_send(dev, parse_and_check_status); +} + +static int write_shadow_mbr(struct opal_dev *dev, void *data) +{ + struct opal_shadow_mbr *shadow = data; + + return generic_table_write_data(dev, shadow->data, shadow->offset, + shadow->size, opaluid[OPAL_MBR]); +} + +static int generic_pw_cmd(u8 *key, size_t key_len, u8 *cpin_uid, + struct opal_dev *dev) +{ + int err; + + err = cmd_start(dev, cpin_uid, opalmethod[OPAL_SET]); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_VALUES); + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_PIN); + add_token_bytestring(&err, dev, key, key_len); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + + return err; +} + +static int set_new_pw(struct opal_dev *dev, void *data) +{ + u8 cpin_uid[OPAL_UID_LENGTH]; + struct opal_session_info *usr = data; + + memcpy(cpin_uid, opaluid[OPAL_C_PIN_ADMIN1], OPAL_UID_LENGTH); + + if (usr->who != OPAL_ADMIN1) { + cpin_uid[5] = 0x03; + if (usr->sum) + cpin_uid[7] = usr->opal_key.lr + 1; + else + cpin_uid[7] = usr->who; + } + + if (generic_pw_cmd(usr->opal_key.key, usr->opal_key.key_len, + cpin_uid, dev)) { + pr_debug("Error building set password command.\n"); + return -ERANGE; + } + + return finalize_and_send(dev, parse_and_check_status); +} + +static int set_sid_cpin_pin(struct opal_dev *dev, void *data) +{ + u8 cpin_uid[OPAL_UID_LENGTH]; + struct opal_key *key = data; + + memcpy(cpin_uid, opaluid[OPAL_C_PIN_SID], OPAL_UID_LENGTH); + + if (generic_pw_cmd(key->key, key->key_len, cpin_uid, dev)) { + pr_debug("Error building Set SID cpin\n"); + return -ERANGE; + } + return finalize_and_send(dev, parse_and_check_status); +} + +static int add_user_to_lr(struct opal_dev *dev, void *data) +{ + u8 lr_buffer[OPAL_UID_LENGTH]; + u8 user_uid[OPAL_UID_LENGTH]; + struct opal_lock_unlock *lkul = data; + int err; + + memcpy(lr_buffer, opaluid[OPAL_LOCKINGRANGE_ACE_RDLOCKED], + OPAL_UID_LENGTH); + + if (lkul->l_state == OPAL_RW) + memcpy(lr_buffer, opaluid[OPAL_LOCKINGRANGE_ACE_WRLOCKED], + OPAL_UID_LENGTH); + + lr_buffer[7] = lkul->session.opal_key.lr; + + memcpy(user_uid, opaluid[OPAL_USER1_UID], OPAL_UID_LENGTH); + + user_uid[7] = lkul->session.who; + + err = cmd_start(dev, lr_buffer, opalmethod[OPAL_SET]); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_VALUES); + + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, 3); + + add_token_u8(&err, dev, OPAL_STARTLIST); + + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_bytestring(&err, dev, + opaluid[OPAL_HALF_UID_AUTHORITY_OBJ_REF], + OPAL_UID_LENGTH/2); + add_token_bytestring(&err, dev, user_uid, OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_ENDNAME); + + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_bytestring(&err, dev, + opaluid[OPAL_HALF_UID_AUTHORITY_OBJ_REF], + OPAL_UID_LENGTH/2); + add_token_bytestring(&err, dev, user_uid, OPAL_UID_LENGTH); + add_token_u8(&err, dev, OPAL_ENDNAME); + + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_bytestring(&err, dev, opaluid[OPAL_HALF_UID_BOOLEAN_ACE], + OPAL_UID_LENGTH/2); + add_token_u8(&err, dev, 1); + add_token_u8(&err, dev, OPAL_ENDNAME); + + + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + + if (err) { + pr_debug("Error building add user to locking range command.\n"); + return err; + } + + return finalize_and_send(dev, parse_and_check_status); +} + +static int lock_unlock_locking_range(struct opal_dev *dev, void *data) +{ + u8 lr_buffer[OPAL_UID_LENGTH]; + struct opal_lock_unlock *lkul = data; + u8 read_locked = 1, write_locked = 1; + int err = 0; + + if (build_locking_range(lr_buffer, sizeof(lr_buffer), + lkul->session.opal_key.lr) < 0) + return -ERANGE; + + switch (lkul->l_state) { + case OPAL_RO: + read_locked = 0; + write_locked = 1; + break; + case OPAL_RW: + read_locked = 0; + write_locked = 0; + break; + case OPAL_LK: + /* vars are initialized to locked */ + break; + default: + pr_debug("Tried to set an invalid locking state... returning to uland\n"); + return OPAL_INVAL_PARAM; + } + + err = cmd_start(dev, lr_buffer, opalmethod[OPAL_SET]); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_VALUES); + add_token_u8(&err, dev, OPAL_STARTLIST); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_READLOCKED); + add_token_u8(&err, dev, read_locked); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_WRITELOCKED); + add_token_u8(&err, dev, write_locked); + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + + if (err) { + pr_debug("Error building SET command.\n"); + return err; + } + + return finalize_and_send(dev, parse_and_check_status); +} + + +static int lock_unlock_locking_range_sum(struct opal_dev *dev, void *data) +{ + u8 lr_buffer[OPAL_UID_LENGTH]; + u8 read_locked = 1, write_locked = 1; + struct opal_lock_unlock *lkul = data; + int ret; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + + if (build_locking_range(lr_buffer, sizeof(lr_buffer), + lkul->session.opal_key.lr) < 0) + return -ERANGE; + + switch (lkul->l_state) { + case OPAL_RO: + read_locked = 0; + write_locked = 1; + break; + case OPAL_RW: + read_locked = 0; + write_locked = 0; + break; + case OPAL_LK: + /* vars are initialized to locked */ + break; + default: + pr_debug("Tried to set an invalid locking state.\n"); + return OPAL_INVAL_PARAM; + } + ret = generic_lr_enable_disable(dev, lr_buffer, 1, 1, + read_locked, write_locked); + + if (ret < 0) { + pr_debug("Error building SET command.\n"); + return ret; + } + + return finalize_and_send(dev, parse_and_check_status); +} + +static int activate_lsp(struct opal_dev *dev, void *data) +{ + struct opal_lr_act *opal_act = data; + u8 user_lr[OPAL_UID_LENGTH]; + int err, i; + + err = cmd_start(dev, opaluid[OPAL_LOCKINGSP_UID], + opalmethod[OPAL_ACTIVATE]); + + if (opal_act->sum) { + err = build_locking_range(user_lr, sizeof(user_lr), + opal_act->lr[0]); + if (err) + return err; + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u64(&err, dev, OPAL_SUM_SET_LIST); + + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_bytestring(&err, dev, user_lr, OPAL_UID_LENGTH); + for (i = 1; i < opal_act->num_lrs; i++) { + user_lr[7] = opal_act->lr[i]; + add_token_bytestring(&err, dev, user_lr, OPAL_UID_LENGTH); + } + add_token_u8(&err, dev, OPAL_ENDLIST); + add_token_u8(&err, dev, OPAL_ENDNAME); + } + + if (err) { + pr_debug("Error building Activate LockingSP command.\n"); + return err; + } + + return finalize_and_send(dev, parse_and_check_status); +} + +/* Determine if we're in the Manufactured Inactive or Active state */ +static int get_lsp_lifecycle(struct opal_dev *dev, void *data) +{ + u8 lc_status; + int err; + + err = generic_get_column(dev, opaluid[OPAL_LOCKINGSP_UID], + OPAL_LIFECYCLE); + if (err) + return err; + + lc_status = response_get_u64(&dev->parsed, 4); + /* 0x08 is Manufactured Inactive */ + /* 0x09 is Manufactured */ + if (lc_status != OPAL_MANUFACTURED_INACTIVE) { + pr_debug("Couldn't determine the status of the Lifecycle state\n"); + return -ENODEV; + } + + return 0; +} + +static int get_msid_cpin_pin(struct opal_dev *dev, void *data) +{ + const char *msid_pin; + size_t strlen; + int err; + + err = generic_get_column(dev, opaluid[OPAL_C_PIN_MSID], OPAL_PIN); + if (err) + return err; + + strlen = response_get_string(&dev->parsed, 4, &msid_pin); + if (!msid_pin) { + pr_debug("Couldn't extract MSID_CPIN from response\n"); + return OPAL_INVAL_PARAM; + } + + dev->prev_data = kmemdup(msid_pin, strlen, GFP_KERNEL); + if (!dev->prev_data) + return -ENOMEM; + + dev->prev_d_len = strlen; + + return 0; +} + +static int write_table_data(struct opal_dev *dev, void *data) +{ + struct opal_read_write_table *write_tbl = data; + + return generic_table_write_data(dev, write_tbl->data, write_tbl->offset, + write_tbl->size, write_tbl->table_uid); +} + +static int read_table_data_cont(struct opal_dev *dev) +{ + int err; + const char *data_read; + + err = parse_and_check_status(dev); + if (err) + return err; + + dev->prev_d_len = response_get_string(&dev->parsed, 1, &data_read); + dev->prev_data = (void *)data_read; + if (!dev->prev_data) { + pr_debug("%s: Couldn't read data from the table.\n", __func__); + return OPAL_INVAL_PARAM; + } + + return 0; +} + +/* + * IO_BUFFER_LENGTH = 2048 + * sizeof(header) = 56 + * No. of Token Bytes in the Response = 11 + * MAX size of data that can be carried in response buffer + * at a time is : 2048 - (56 + 11) = 1981 = 0x7BD. + */ +#define OPAL_MAX_READ_TABLE (0x7BD) + +static int read_table_data(struct opal_dev *dev, void *data) +{ + struct opal_read_write_table *read_tbl = data; + int err; + size_t off = 0, max_read_size = OPAL_MAX_READ_TABLE; + u64 table_len, len; + u64 offset = read_tbl->offset, read_size = read_tbl->size - 1; + u8 __user *dst; + + err = generic_get_table_info(dev, read_tbl->table_uid, OPAL_TABLE_ROWS); + if (err) { + pr_debug("Couldn't get the table size\n"); + return err; + } + + table_len = response_get_u64(&dev->parsed, 4); + + /* Check if the user is trying to read from the table limits */ + if (read_size > table_len || offset > table_len - read_size) { + pr_debug("Read size exceeds the Table size limits (%llu vs. %llu)\n", + offset + read_size, table_len); + return -EINVAL; + } + + while (off < read_size) { + err = cmd_start(dev, read_tbl->table_uid, opalmethod[OPAL_GET]); + + add_token_u8(&err, dev, OPAL_STARTLIST); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_STARTROW); + add_token_u64(&err, dev, offset + off); /* start row value */ + add_token_u8(&err, dev, OPAL_ENDNAME); + + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u8(&err, dev, OPAL_ENDROW); + + len = min(max_read_size, (size_t)(read_size - off)); + add_token_u64(&err, dev, offset + off + len); /* end row value + */ + add_token_u8(&err, dev, OPAL_ENDNAME); + add_token_u8(&err, dev, OPAL_ENDLIST); + + if (err) { + pr_debug("Error building read table data command.\n"); + break; + } + + err = finalize_and_send(dev, read_table_data_cont); + if (err) + break; + + /* len+1: This includes the NULL terminator at the end*/ + if (dev->prev_d_len > len + 1) { + err = -EOVERFLOW; + break; + } + + dst = (u8 __user *)(uintptr_t)read_tbl->data; + if (copy_to_user(dst + off, dev->prev_data, dev->prev_d_len)) { + pr_debug("Error copying data to userspace\n"); + err = -EFAULT; + break; + } + dev->prev_data = NULL; + + off += len; + } + + return err; +} + +static int end_opal_session(struct opal_dev *dev, void *data) +{ + int err = 0; + + clear_opal_cmd(dev); + set_comid(dev, dev->comid); + add_token_u8(&err, dev, OPAL_ENDOFSESSION); + + if (err < 0) + return err; + + return finalize_and_send(dev, end_session_cont); +} + +static int end_opal_session_error(struct opal_dev *dev) +{ + const struct opal_step error_end_session = { + end_opal_session, + }; + + return execute_step(dev, &error_end_session, 0); +} + +static inline void setup_opal_dev(struct opal_dev *dev) +{ + dev->tsn = 0; + dev->hsn = 0; + dev->prev_data = NULL; +} + +static int check_opal_support(struct opal_dev *dev) +{ + int ret; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + ret = opal_discovery0_step(dev); + dev->supported = !ret; + mutex_unlock(&dev->dev_lock); + + return ret; +} + +static void clean_opal_dev(struct opal_dev *dev) +{ + + struct opal_suspend_data *suspend, *next; + + mutex_lock(&dev->dev_lock); + list_for_each_entry_safe(suspend, next, &dev->unlk_lst, node) { + list_del(&suspend->node); + kfree(suspend); + } + mutex_unlock(&dev->dev_lock); +} + +void free_opal_dev(struct opal_dev *dev) +{ + if (!dev) + return; + + clean_opal_dev(dev); + kfree(dev->resp); + kfree(dev->cmd); + kfree(dev); +} +EXPORT_SYMBOL(free_opal_dev); + +struct opal_dev *init_opal_dev(void *data, sec_send_recv *send_recv) +{ + struct opal_dev *dev; + + dev = kmalloc(sizeof(*dev), GFP_KERNEL); + if (!dev) + return NULL; + + /* + * Presumably DMA-able buffers must be cache-aligned. Kmalloc makes + * sure the allocated buffer is DMA-safe in that regard. + */ + dev->cmd = kmalloc(IO_BUFFER_LENGTH, GFP_KERNEL); + if (!dev->cmd) + goto err_free_dev; + + dev->resp = kmalloc(IO_BUFFER_LENGTH, GFP_KERNEL); + if (!dev->resp) + goto err_free_cmd; + + INIT_LIST_HEAD(&dev->unlk_lst); + mutex_init(&dev->dev_lock); + dev->data = data; + dev->send_recv = send_recv; + if (check_opal_support(dev) != 0) { + pr_debug("Opal is not supported on this device\n"); + goto err_free_resp; + } + + return dev; + +err_free_resp: + kfree(dev->resp); + +err_free_cmd: + kfree(dev->cmd); + +err_free_dev: + kfree(dev); + + return NULL; +} +EXPORT_SYMBOL(init_opal_dev); + +static int opal_secure_erase_locking_range(struct opal_dev *dev, + struct opal_session_info *opal_session) +{ + const struct opal_step erase_steps[] = { + { start_auth_opal_session, opal_session }, + { get_active_key, &opal_session->opal_key.lr }, + { gen_key, }, + { end_opal_session, } + }; + int ret; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + ret = execute_steps(dev, erase_steps, ARRAY_SIZE(erase_steps)); + mutex_unlock(&dev->dev_lock); + + return ret; +} + +static int opal_erase_locking_range(struct opal_dev *dev, + struct opal_session_info *opal_session) +{ + const struct opal_step erase_steps[] = { + { start_auth_opal_session, opal_session }, + { erase_locking_range, opal_session }, + { end_opal_session, } + }; + int ret; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + ret = execute_steps(dev, erase_steps, ARRAY_SIZE(erase_steps)); + mutex_unlock(&dev->dev_lock); + + return ret; +} + +static int opal_enable_disable_shadow_mbr(struct opal_dev *dev, + struct opal_mbr_data *opal_mbr) +{ + u8 enable_disable = opal_mbr->enable_disable == OPAL_MBR_ENABLE ? + OPAL_TRUE : OPAL_FALSE; + + const struct opal_step mbr_steps[] = { + { start_admin1LSP_opal_session, &opal_mbr->key }, + { set_mbr_done, &enable_disable }, + { end_opal_session, }, + { start_admin1LSP_opal_session, &opal_mbr->key }, + { set_mbr_enable_disable, &enable_disable }, + { end_opal_session, } + }; + int ret; + + if (opal_mbr->enable_disable != OPAL_MBR_ENABLE && + opal_mbr->enable_disable != OPAL_MBR_DISABLE) + return -EINVAL; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + ret = execute_steps(dev, mbr_steps, ARRAY_SIZE(mbr_steps)); + mutex_unlock(&dev->dev_lock); + + return ret; +} + +static int opal_set_mbr_done(struct opal_dev *dev, + struct opal_mbr_done *mbr_done) +{ + u8 mbr_done_tf = mbr_done->done_flag == OPAL_MBR_DONE ? + OPAL_TRUE : OPAL_FALSE; + + const struct opal_step mbr_steps[] = { + { start_admin1LSP_opal_session, &mbr_done->key }, + { set_mbr_done, &mbr_done_tf }, + { end_opal_session, } + }; + int ret; + + if (mbr_done->done_flag != OPAL_MBR_DONE && + mbr_done->done_flag != OPAL_MBR_NOT_DONE) + return -EINVAL; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + ret = execute_steps(dev, mbr_steps, ARRAY_SIZE(mbr_steps)); + mutex_unlock(&dev->dev_lock); + + return ret; +} + +static int opal_write_shadow_mbr(struct opal_dev *dev, + struct opal_shadow_mbr *info) +{ + const struct opal_step mbr_steps[] = { + { start_admin1LSP_opal_session, &info->key }, + { write_shadow_mbr, info }, + { end_opal_session, } + }; + int ret; + + if (info->size == 0) + return 0; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + ret = execute_steps(dev, mbr_steps, ARRAY_SIZE(mbr_steps)); + mutex_unlock(&dev->dev_lock); + + return ret; +} + +static int opal_save(struct opal_dev *dev, struct opal_lock_unlock *lk_unlk) +{ + struct opal_suspend_data *suspend; + + suspend = kzalloc(sizeof(*suspend), GFP_KERNEL); + if (!suspend) + return -ENOMEM; + + suspend->unlk = *lk_unlk; + suspend->lr = lk_unlk->session.opal_key.lr; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + add_suspend_info(dev, suspend); + mutex_unlock(&dev->dev_lock); + + return 0; +} + +static int opal_add_user_to_lr(struct opal_dev *dev, + struct opal_lock_unlock *lk_unlk) +{ + const struct opal_step steps[] = { + { start_admin1LSP_opal_session, &lk_unlk->session.opal_key }, + { add_user_to_lr, lk_unlk }, + { end_opal_session, } + }; + int ret; + + if (lk_unlk->l_state != OPAL_RO && + lk_unlk->l_state != OPAL_RW) { + pr_debug("Locking state was not RO or RW\n"); + return -EINVAL; + } + + if (lk_unlk->session.who < OPAL_USER1 || + lk_unlk->session.who > OPAL_USER9) { + pr_debug("Authority was not within the range of users: %d\n", + lk_unlk->session.who); + return -EINVAL; + } + + if (lk_unlk->session.sum) { + pr_debug("%s not supported in sum. Use setup locking range\n", + __func__); + return -EINVAL; + } + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + ret = execute_steps(dev, steps, ARRAY_SIZE(steps)); + mutex_unlock(&dev->dev_lock); + + return ret; +} + +static int opal_reverttper(struct opal_dev *dev, struct opal_key *opal, bool psid) +{ + /* controller will terminate session */ + const struct opal_step revert_steps[] = { + { start_SIDASP_opal_session, opal }, + { revert_tper, } + }; + const struct opal_step psid_revert_steps[] = { + { start_PSID_opal_session, opal }, + { revert_tper, } + }; + + int ret; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + if (psid) + ret = execute_steps(dev, psid_revert_steps, + ARRAY_SIZE(psid_revert_steps)); + else + ret = execute_steps(dev, revert_steps, + ARRAY_SIZE(revert_steps)); + mutex_unlock(&dev->dev_lock); + + /* + * If we successfully reverted lets clean + * any saved locking ranges. + */ + if (!ret) + clean_opal_dev(dev); + + return ret; +} + +static int __opal_lock_unlock(struct opal_dev *dev, + struct opal_lock_unlock *lk_unlk) +{ + const struct opal_step unlock_steps[] = { + { start_auth_opal_session, &lk_unlk->session }, + { lock_unlock_locking_range, lk_unlk }, + { end_opal_session, } + }; + const struct opal_step unlock_sum_steps[] = { + { start_auth_opal_session, &lk_unlk->session }, + { lock_unlock_locking_range_sum, lk_unlk }, + { end_opal_session, } + }; + + if (lk_unlk->session.sum) + return execute_steps(dev, unlock_sum_steps, + ARRAY_SIZE(unlock_sum_steps)); + else + return execute_steps(dev, unlock_steps, + ARRAY_SIZE(unlock_steps)); +} + +static int __opal_set_mbr_done(struct opal_dev *dev, struct opal_key *key) +{ + u8 mbr_done_tf = OPAL_TRUE; + const struct opal_step mbrdone_step[] = { + { start_admin1LSP_opal_session, key }, + { set_mbr_done, &mbr_done_tf }, + { end_opal_session, } + }; + + return execute_steps(dev, mbrdone_step, ARRAY_SIZE(mbrdone_step)); +} + +static int opal_lock_unlock(struct opal_dev *dev, + struct opal_lock_unlock *lk_unlk) +{ + int ret; + + if (lk_unlk->session.who > OPAL_USER9) + return -EINVAL; + + mutex_lock(&dev->dev_lock); + ret = __opal_lock_unlock(dev, lk_unlk); + mutex_unlock(&dev->dev_lock); + + return ret; +} + +static int opal_take_ownership(struct opal_dev *dev, struct opal_key *opal) +{ + const struct opal_step owner_steps[] = { + { start_anybodyASP_opal_session, }, + { get_msid_cpin_pin, }, + { end_opal_session, }, + { start_SIDASP_opal_session, opal }, + { set_sid_cpin_pin, opal }, + { end_opal_session, } + }; + int ret; + + if (!dev) + return -ENODEV; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + ret = execute_steps(dev, owner_steps, ARRAY_SIZE(owner_steps)); + mutex_unlock(&dev->dev_lock); + + return ret; +} + +static int opal_activate_lsp(struct opal_dev *dev, + struct opal_lr_act *opal_lr_act) +{ + const struct opal_step active_steps[] = { + { start_SIDASP_opal_session, &opal_lr_act->key }, + { get_lsp_lifecycle, }, + { activate_lsp, opal_lr_act }, + { end_opal_session, } + }; + int ret; + + if (!opal_lr_act->num_lrs || opal_lr_act->num_lrs > OPAL_MAX_LRS) + return -EINVAL; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + ret = execute_steps(dev, active_steps, ARRAY_SIZE(active_steps)); + mutex_unlock(&dev->dev_lock); + + return ret; +} + +static int opal_setup_locking_range(struct opal_dev *dev, + struct opal_user_lr_setup *opal_lrs) +{ + const struct opal_step lr_steps[] = { + { start_auth_opal_session, &opal_lrs->session }, + { setup_locking_range, opal_lrs }, + { end_opal_session, } + }; + int ret; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + ret = execute_steps(dev, lr_steps, ARRAY_SIZE(lr_steps)); + mutex_unlock(&dev->dev_lock); + + return ret; +} + +static int opal_set_new_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw) +{ + const struct opal_step pw_steps[] = { + { start_auth_opal_session, &opal_pw->session }, + { set_new_pw, &opal_pw->new_user_pw }, + { end_opal_session, } + }; + int ret; + + if (opal_pw->session.who > OPAL_USER9 || + opal_pw->new_user_pw.who > OPAL_USER9) + return -EINVAL; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + ret = execute_steps(dev, pw_steps, ARRAY_SIZE(pw_steps)); + mutex_unlock(&dev->dev_lock); + + return ret; +} + +static int opal_activate_user(struct opal_dev *dev, + struct opal_session_info *opal_session) +{ + const struct opal_step act_steps[] = { + { start_admin1LSP_opal_session, &opal_session->opal_key }, + { internal_activate_user, opal_session }, + { end_opal_session, } + }; + int ret; + + /* We can't activate Admin1 it's active as manufactured */ + if (opal_session->who < OPAL_USER1 || + opal_session->who > OPAL_USER9) { + pr_debug("Who was not a valid user: %d\n", opal_session->who); + return -EINVAL; + } + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + ret = execute_steps(dev, act_steps, ARRAY_SIZE(act_steps)); + mutex_unlock(&dev->dev_lock); + + return ret; +} + +bool opal_unlock_from_suspend(struct opal_dev *dev) +{ + struct opal_suspend_data *suspend; + bool was_failure = false; + int ret = 0; + + if (!dev) + return false; + + if (!dev->supported) + return false; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + + list_for_each_entry(suspend, &dev->unlk_lst, node) { + dev->tsn = 0; + dev->hsn = 0; + + ret = __opal_lock_unlock(dev, &suspend->unlk); + if (ret) { + pr_debug("Failed to unlock LR %hhu with sum %d\n", + suspend->unlk.session.opal_key.lr, + suspend->unlk.session.sum); + was_failure = true; + } + + if (dev->mbr_enabled) { + ret = __opal_set_mbr_done(dev, &suspend->unlk.session.opal_key); + if (ret) + pr_debug("Failed to set MBR Done in S3 resume\n"); + } + } + mutex_unlock(&dev->dev_lock); + + return was_failure; +} +EXPORT_SYMBOL(opal_unlock_from_suspend); + +static int opal_read_table(struct opal_dev *dev, + struct opal_read_write_table *rw_tbl) +{ + const struct opal_step read_table_steps[] = { + { start_admin1LSP_opal_session, &rw_tbl->key }, + { read_table_data, rw_tbl }, + { end_opal_session, } + }; + int ret = 0; + + if (!rw_tbl->size) + return ret; + + return execute_steps(dev, read_table_steps, + ARRAY_SIZE(read_table_steps)); +} + +static int opal_write_table(struct opal_dev *dev, + struct opal_read_write_table *rw_tbl) +{ + const struct opal_step write_table_steps[] = { + { start_admin1LSP_opal_session, &rw_tbl->key }, + { write_table_data, rw_tbl }, + { end_opal_session, } + }; + int ret = 0; + + if (!rw_tbl->size) + return ret; + + return execute_steps(dev, write_table_steps, + ARRAY_SIZE(write_table_steps)); +} + +static int opal_generic_read_write_table(struct opal_dev *dev, + struct opal_read_write_table *rw_tbl) +{ + int ret, bit_set; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + + bit_set = fls64(rw_tbl->flags) - 1; + switch (bit_set) { + case OPAL_READ_TABLE: + ret = opal_read_table(dev, rw_tbl); + break; + case OPAL_WRITE_TABLE: + ret = opal_write_table(dev, rw_tbl); + break; + default: + pr_debug("Invalid bit set in the flag (%016llx).\n", + rw_tbl->flags); + ret = -EINVAL; + break; + } + + mutex_unlock(&dev->dev_lock); + + return ret; +} + +int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) +{ + void *p; + int ret = -ENOTTY; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + if (!dev) + return -ENOTSUPP; + if (!dev->supported) + return -ENOTSUPP; + + p = memdup_user(arg, _IOC_SIZE(cmd)); + if (IS_ERR(p)) + return PTR_ERR(p); + + switch (cmd) { + case IOC_OPAL_SAVE: + ret = opal_save(dev, p); + break; + case IOC_OPAL_LOCK_UNLOCK: + ret = opal_lock_unlock(dev, p); + break; + case IOC_OPAL_TAKE_OWNERSHIP: + ret = opal_take_ownership(dev, p); + break; + case IOC_OPAL_ACTIVATE_LSP: + ret = opal_activate_lsp(dev, p); + break; + case IOC_OPAL_SET_PW: + ret = opal_set_new_pw(dev, p); + break; + case IOC_OPAL_ACTIVATE_USR: + ret = opal_activate_user(dev, p); + break; + case IOC_OPAL_REVERT_TPR: + ret = opal_reverttper(dev, p, false); + break; + case IOC_OPAL_LR_SETUP: + ret = opal_setup_locking_range(dev, p); + break; + case IOC_OPAL_ADD_USR_TO_LR: + ret = opal_add_user_to_lr(dev, p); + break; + case IOC_OPAL_ENABLE_DISABLE_MBR: + ret = opal_enable_disable_shadow_mbr(dev, p); + break; + case IOC_OPAL_MBR_DONE: + ret = opal_set_mbr_done(dev, p); + break; + case IOC_OPAL_WRITE_SHADOW_MBR: + ret = opal_write_shadow_mbr(dev, p); + break; + case IOC_OPAL_ERASE_LR: + ret = opal_erase_locking_range(dev, p); + break; + case IOC_OPAL_SECURE_ERASE_LR: + ret = opal_secure_erase_locking_range(dev, p); + break; + case IOC_OPAL_PSID_REVERT_TPR: + ret = opal_reverttper(dev, p, true); + break; + case IOC_OPAL_GENERIC_TABLE_RW: + ret = opal_generic_read_write_table(dev, p); + break; + default: + break; + } + + kfree(p); + return ret; +} +EXPORT_SYMBOL_GPL(sed_ioctl); diff --git a/block/t10-pi.c b/block/t10-pi.c new file mode 100644 index 000000000..d910534b3 --- /dev/null +++ b/block/t10-pi.c @@ -0,0 +1,285 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * t10_pi.c - Functions for generating and verifying T10 Protection + * Information. + */ + +#include +#include +#include +#include +#include + +typedef __be16 (csum_fn) (void *, unsigned int); + +static __be16 t10_pi_crc_fn(void *data, unsigned int len) +{ + return cpu_to_be16(crc_t10dif(data, len)); +} + +static __be16 t10_pi_ip_fn(void *data, unsigned int len) +{ + return (__force __be16)ip_compute_csum(data, len); +} + +/* + * Type 1 and Type 2 protection use the same format: 16 bit guard tag, + * 16 bit app tag, 32 bit reference tag. Type 3 does not define the ref + * tag. + */ +static blk_status_t t10_pi_generate(struct blk_integrity_iter *iter, + csum_fn *fn, enum t10_dif_type type) +{ + unsigned int i; + + for (i = 0 ; i < iter->data_size ; i += iter->interval) { + struct t10_pi_tuple *pi = iter->prot_buf; + + pi->guard_tag = fn(iter->data_buf, iter->interval); + pi->app_tag = 0; + + if (type == T10_PI_TYPE1_PROTECTION) + pi->ref_tag = cpu_to_be32(lower_32_bits(iter->seed)); + else + pi->ref_tag = 0; + + iter->data_buf += iter->interval; + iter->prot_buf += sizeof(struct t10_pi_tuple); + iter->seed++; + } + + return BLK_STS_OK; +} + +static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter, + csum_fn *fn, enum t10_dif_type type) +{ + unsigned int i; + + BUG_ON(type == T10_PI_TYPE0_PROTECTION); + + for (i = 0 ; i < iter->data_size ; i += iter->interval) { + struct t10_pi_tuple *pi = iter->prot_buf; + __be16 csum; + + if (type == T10_PI_TYPE1_PROTECTION || + type == T10_PI_TYPE2_PROTECTION) { + if (pi->app_tag == T10_PI_APP_ESCAPE) + goto next; + + if (be32_to_cpu(pi->ref_tag) != + lower_32_bits(iter->seed)) { + pr_err("%s: ref tag error at location %llu " \ + "(rcvd %u)\n", iter->disk_name, + (unsigned long long) + iter->seed, be32_to_cpu(pi->ref_tag)); + return BLK_STS_PROTECTION; + } + } else if (type == T10_PI_TYPE3_PROTECTION) { + if (pi->app_tag == T10_PI_APP_ESCAPE && + pi->ref_tag == T10_PI_REF_ESCAPE) + goto next; + } + + csum = fn(iter->data_buf, iter->interval); + + if (pi->guard_tag != csum) { + pr_err("%s: guard tag error at sector %llu " \ + "(rcvd %04x, want %04x)\n", iter->disk_name, + (unsigned long long)iter->seed, + be16_to_cpu(pi->guard_tag), be16_to_cpu(csum)); + return BLK_STS_PROTECTION; + } + +next: + iter->data_buf += iter->interval; + iter->prot_buf += sizeof(struct t10_pi_tuple); + iter->seed++; + } + + return BLK_STS_OK; +} + +static blk_status_t t10_pi_type1_generate_crc(struct blk_integrity_iter *iter) +{ + return t10_pi_generate(iter, t10_pi_crc_fn, T10_PI_TYPE1_PROTECTION); +} + +static blk_status_t t10_pi_type1_generate_ip(struct blk_integrity_iter *iter) +{ + return t10_pi_generate(iter, t10_pi_ip_fn, T10_PI_TYPE1_PROTECTION); +} + +static blk_status_t t10_pi_type1_verify_crc(struct blk_integrity_iter *iter) +{ + return t10_pi_verify(iter, t10_pi_crc_fn, T10_PI_TYPE1_PROTECTION); +} + +static blk_status_t t10_pi_type1_verify_ip(struct blk_integrity_iter *iter) +{ + return t10_pi_verify(iter, t10_pi_ip_fn, T10_PI_TYPE1_PROTECTION); +} + +/** + * t10_pi_type1_prepare - prepare PI prior submitting request to device + * @rq: request with PI that should be prepared + * + * For Type 1/Type 2, the virtual start sector is the one that was + * originally submitted by the block layer for the ref_tag usage. Due to + * partitioning, MD/DM cloning, etc. the actual physical start sector is + * likely to be different. Remap protection information to match the + * physical LBA. + */ +static void t10_pi_type1_prepare(struct request *rq) +{ + const int tuple_sz = rq->q->integrity.tuple_size; + u32 ref_tag = t10_pi_ref_tag(rq); + struct bio *bio; + + __rq_for_each_bio(bio, rq) { + struct bio_integrity_payload *bip = bio_integrity(bio); + u32 virt = bip_get_seed(bip) & 0xffffffff; + struct bio_vec iv; + struct bvec_iter iter; + + /* Already remapped? */ + if (bip->bip_flags & BIP_MAPPED_INTEGRITY) + break; + + bip_for_each_vec(iv, bip, iter) { + void *p, *pmap; + unsigned int j; + + pmap = kmap_atomic(iv.bv_page); + p = pmap + iv.bv_offset; + for (j = 0; j < iv.bv_len; j += tuple_sz) { + struct t10_pi_tuple *pi = p; + + if (be32_to_cpu(pi->ref_tag) == virt) + pi->ref_tag = cpu_to_be32(ref_tag); + virt++; + ref_tag++; + p += tuple_sz; + } + + kunmap_atomic(pmap); + } + + bip->bip_flags |= BIP_MAPPED_INTEGRITY; + } +} + +/** + * t10_pi_type1_complete - prepare PI prior returning request to the blk layer + * @rq: request with PI that should be prepared + * @nr_bytes: total bytes to prepare + * + * For Type 1/Type 2, the virtual start sector is the one that was + * originally submitted by the block layer for the ref_tag usage. Due to + * partitioning, MD/DM cloning, etc. the actual physical start sector is + * likely to be different. Since the physical start sector was submitted + * to the device, we should remap it back to virtual values expected by the + * block layer. + */ +static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes) +{ + unsigned intervals = nr_bytes >> rq->q->integrity.interval_exp; + const int tuple_sz = rq->q->integrity.tuple_size; + u32 ref_tag = t10_pi_ref_tag(rq); + struct bio *bio; + + __rq_for_each_bio(bio, rq) { + struct bio_integrity_payload *bip = bio_integrity(bio); + u32 virt = bip_get_seed(bip) & 0xffffffff; + struct bio_vec iv; + struct bvec_iter iter; + + bip_for_each_vec(iv, bip, iter) { + void *p, *pmap; + unsigned int j; + + pmap = kmap_atomic(iv.bv_page); + p = pmap + iv.bv_offset; + for (j = 0; j < iv.bv_len && intervals; j += tuple_sz) { + struct t10_pi_tuple *pi = p; + + if (be32_to_cpu(pi->ref_tag) == ref_tag) + pi->ref_tag = cpu_to_be32(virt); + virt++; + ref_tag++; + intervals--; + p += tuple_sz; + } + + kunmap_atomic(pmap); + } + } +} + +static blk_status_t t10_pi_type3_generate_crc(struct blk_integrity_iter *iter) +{ + return t10_pi_generate(iter, t10_pi_crc_fn, T10_PI_TYPE3_PROTECTION); +} + +static blk_status_t t10_pi_type3_generate_ip(struct blk_integrity_iter *iter) +{ + return t10_pi_generate(iter, t10_pi_ip_fn, T10_PI_TYPE3_PROTECTION); +} + +static blk_status_t t10_pi_type3_verify_crc(struct blk_integrity_iter *iter) +{ + return t10_pi_verify(iter, t10_pi_crc_fn, T10_PI_TYPE3_PROTECTION); +} + +static blk_status_t t10_pi_type3_verify_ip(struct blk_integrity_iter *iter) +{ + return t10_pi_verify(iter, t10_pi_ip_fn, T10_PI_TYPE3_PROTECTION); +} + +/* Type 3 does not have a reference tag so no remapping is required. */ +static void t10_pi_type3_prepare(struct request *rq) +{ +} + +/* Type 3 does not have a reference tag so no remapping is required. */ +static void t10_pi_type3_complete(struct request *rq, unsigned int nr_bytes) +{ +} + +const struct blk_integrity_profile t10_pi_type1_crc = { + .name = "T10-DIF-TYPE1-CRC", + .generate_fn = t10_pi_type1_generate_crc, + .verify_fn = t10_pi_type1_verify_crc, + .prepare_fn = t10_pi_type1_prepare, + .complete_fn = t10_pi_type1_complete, +}; +EXPORT_SYMBOL(t10_pi_type1_crc); + +const struct blk_integrity_profile t10_pi_type1_ip = { + .name = "T10-DIF-TYPE1-IP", + .generate_fn = t10_pi_type1_generate_ip, + .verify_fn = t10_pi_type1_verify_ip, + .prepare_fn = t10_pi_type1_prepare, + .complete_fn = t10_pi_type1_complete, +}; +EXPORT_SYMBOL(t10_pi_type1_ip); + +const struct blk_integrity_profile t10_pi_type3_crc = { + .name = "T10-DIF-TYPE3-CRC", + .generate_fn = t10_pi_type3_generate_crc, + .verify_fn = t10_pi_type3_verify_crc, + .prepare_fn = t10_pi_type3_prepare, + .complete_fn = t10_pi_type3_complete, +}; +EXPORT_SYMBOL(t10_pi_type3_crc); + +const struct blk_integrity_profile t10_pi_type3_ip = { + .name = "T10-DIF-TYPE3-IP", + .generate_fn = t10_pi_type3_generate_ip, + .verify_fn = t10_pi_type3_verify_ip, + .prepare_fn = t10_pi_type3_prepare, + .complete_fn = t10_pi_type3_complete, +}; +EXPORT_SYMBOL(t10_pi_type3_ip); + +MODULE_LICENSE("GPL"); -- cgit v1.2.3