118 files changed, 99615 insertions, 0 deletions
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
new file mode 100644
index 000000000..a41145d52
--- /dev/null
+++ b/drivers/block/Kconfig
@@ -0,0 +1,419 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Block device driver configuration
+#
+
+menuconfig BLK_DEV
+	bool "Block devices"
+	depends on BLOCK
+	default y
+	help
+	  Say Y here to get to see options for various different block device
+	  drivers. This option alone does not add any kernel code.
+
+	  If you say N, all options in this submenu will be skipped and disabled;
+	  only do this if you know what you are doing.
+
+if BLK_DEV
+
+source "drivers/block/null_blk/Kconfig"
+
+config BLK_DEV_FD
+	tristate "Normal floppy disk support"
+	depends on ARCH_MAY_HAVE_PC_FDC
+	help
+	  If you want to use the floppy disk drive(s) of your PC under Linux,
+	  say Y. Information about this driver, especially important for IBM
+	  Thinkpad users, is contained in
+	  <file:Documentation/admin-guide/blockdev/floppy.rst>.
+	  That file also contains the location of the Floppy driver FAQ as
+	  well as location of the fdutils package used to configure additional
+	  parameters of the driver at run time.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called floppy.
+
+config BLK_DEV_FD_RAWCMD
+	bool "Support for raw floppy disk commands (DEPRECATED)"
+	depends on BLK_DEV_FD
+	help
+	  If you want to use actual physical floppies and expect to do
+	  special low-level hardware accesses to them (access and use
+	  non-standard formats, for example), then enable this.
+
+	  Note that the code enabled by this option is rarely used and
+	  might be unstable or insecure, and distros should not enable it.
+
+	  Note: FDRAWCMD is deprecated and will be removed from the kernel
+	  in the near future.
+
+	  If unsure, say N.
+
+config AMIGA_FLOPPY
+	tristate "Amiga floppy support"
+	depends on AMIGA
+
+config ATARI_FLOPPY
+	tristate "Atari floppy support"
+	depends on ATARI
+
+config MAC_FLOPPY
+	tristate "Support for PowerMac floppy"
+	depends on PPC_PMAC && !PPC_PMAC64
+	help
+	  If you have a SWIM-3 (Super Woz Integrated Machine 3; from Apple)
+	  floppy controller, say Y here. Most commonly found in PowerMacs.
+
+config BLK_DEV_SWIM
+	tristate "Support for SWIM Macintosh floppy"
+	depends on M68K && MAC && !HIGHMEM
+	help
+	  You should select this option if you want floppy support
+	  and you don't have a II, IIfx, Q900, Q950 or AV series.
+
+config AMIGA_Z2RAM
+	tristate "Amiga Zorro II ramdisk support"
+	depends on ZORRO
+	help
+	  This enables support for using Chip RAM and Zorro II RAM as a
+	  ramdisk or as a swap partition. Say Y if you want to include this
+	  driver in the kernel.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called z2ram.
+
+config N64CART
+	bool "N64 cart support"
+	depends on MACH_NINTENDO64
+	help
+	  Support for the N64 cart.
+
+config CDROM
+	tristate
+
+config GDROM
+	tristate "SEGA Dreamcast GD-ROM drive"
+	depends on SH_DREAMCAST
+	select CDROM
+	help
+	  A standard SEGA Dreamcast comes with a modified CD ROM drive called a
+	  "GD-ROM" by SEGA to signify it is capable of reading special disks
+	  with up to 1 GB of data. This drive will also read standard CD ROM
+	  disks. Select this option to access any disks in your GD ROM drive.
+	  Most users will want to say "Y" here.
+	  You can also build this as a module which will be called gdrom.
+
+config PARIDE
+	tristate "Parallel port IDE device support"
+	depends on PARPORT_PC
+	help
+	  There are many external CD-ROM and disk devices that connect through
+	  your computer's parallel port. Most of them are actually IDE devices
+	  using a parallel port IDE adapter. This option enables the PARIDE
+	  subsystem which contains drivers for many of these external drives.
+	  Read <file:Documentation/admin-guide/blockdev/paride.rst> for more information.
+
+	  If you have said Y to the "Parallel-port support" configuration
+	  option, you may share a single port between your printer and other
+	  parallel port devices. Answer Y to build PARIDE support into your
+	  kernel, or M if you would like to build it as a loadable module. If
+	  your parallel port support is in a loadable module, you must build
+	  PARIDE as a module. If you built PARIDE support into your kernel,
+	  you may still build the individual protocol modules and high-level
+	  drivers as loadable modules. If you build this support as a module,
+	  it will be called paride.
+
+	  To use the PARIDE support, you must say Y or M here and also to at
+	  least one high-level driver (e.g. "Parallel port IDE disks",
+	  "Parallel port ATAPI CD-ROMs", "Parallel port ATAPI disks" etc.) and
+	  to at least one protocol driver (e.g. "ATEN EH-100 protocol",
+	  "MicroSolutions backpack protocol", "DataStor Commuter protocol"
+	  etc.).
+
+source "drivers/block/paride/Kconfig"
+
+source "drivers/block/mtip32xx/Kconfig"
+
+source "drivers/block/zram/Kconfig"
+
+config BLK_DEV_UBD
+	bool "Virtual block device"
+	depends on UML
+	help
+          The User-Mode Linux port includes a driver called UBD which will let
+          you access arbitrary files on the host computer as block devices.
+          Unless you know that you do not need such virtual block devices say
+          Y here.
+
+config BLK_DEV_UBD_SYNC
+	bool "Always do synchronous disk IO for UBD"
+	depends on BLK_DEV_UBD
+	help
+	  Writes to the virtual block device are not immediately written to the
+	  host's disk; this may cause problems if, for example, the User-Mode
+	  Linux 'Virtual Machine' uses a journalling filesystem and the host
+	  computer crashes.
+
+          Synchronous operation (i.e. always writing data to the host's disk
+          immediately) is configurable on a per-UBD basis by using a special
+          kernel command line option.  Alternatively, you can say Y here to
+          turn on synchronous operation by default for all block devices.
+
+          If you're running a journalling file system (like reiserfs, for
+          example) in your virtual machine, you will want to say Y here.  If
+          you care for the safety of the data in your virtual machine, Y is a
+          wise choice too.  In all other cases (for example, if you're just
+          playing around with User-Mode Linux) you can choose N.
+
+config BLK_DEV_COW_COMMON
+	bool
+	default BLK_DEV_UBD
+
+config BLK_DEV_LOOP
+	tristate "Loopback device support"
+	help
+	  Saying Y here will allow you to use a regular file as a block
+	  device; you can then create a file system on that block device and
+	  mount it just as you would mount other block devices such as hard
+	  drive partitions, CD-ROM drives or floppy drives. The loop devices
+	  are block special device files with major number 7 and typically
+	  called /dev/loop0, /dev/loop1 etc.
+
+	  This is useful if you want to check an ISO 9660 file system before
+	  burning the CD, or if you want to use floppy images without first
+	  writing them to floppy. Furthermore, some Linux distributions avoid
+	  the need for a dedicated Linux partition by keeping their complete
+	  root file system inside a DOS FAT file using this loop device
+	  driver.
+
+	  To use the loop device, you need the losetup utility, found in the
+	  util-linux package, see
+	  <https://www.kernel.org/pub/linux/utils/util-linux/>.
+
+	  The loop device driver can also be used to "hide" a file system in
+	  a disk partition, floppy, or regular file, either using encryption
+	  (scrambling the data) or steganography (hiding the data in the low
+	  bits of, say, a sound file). This is also safe if the file resides
+	  on a remote file server.
+
+	  Note that this loop device has nothing to do with the loopback
+	  device used for network connections from the machine to itself.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called loop.
+
+	  Most users will answer N here.
+
+config BLK_DEV_LOOP_MIN_COUNT
+	int "Number of loop devices to pre-create at init time"
+	depends on BLK_DEV_LOOP
+	default 8
+	help
+	  Static number of loop devices to be unconditionally pre-created
+	  at init time.
+
+	  This default value can be overwritten on the kernel command
+	  line or with module-parameter loop.max_loop.
+
+	  The historic default is 8. If a late 2011 version of losetup(8)
+	  is used, it can be set to 0, since needed loop devices can be
+	  dynamically allocated with the /dev/loop-control interface.
+
+source "drivers/block/drbd/Kconfig"
+
+config BLK_DEV_NBD
+	tristate "Network block device support"
+	depends on NET
+	help
+	  Saying Y here will allow your computer to be a client for network
+	  block devices, i.e. it will be able to use block devices exported by
+	  servers (mount file systems on them etc.). Communication between
+	  client and server works over TCP/IP networking, but to the client
+	  program this is hidden: it looks like a regular local file access to
+	  a block device special file such as /dev/nd0.
+
+	  Network block devices also allows you to run a block-device in
+	  userland (making server and client physically the same computer,
+	  communicating using the loopback network device).
+
+	  Read <file:Documentation/admin-guide/blockdev/nbd.rst> for more information,
+	  especially about where to find the server code, which runs in user
+	  space and does not need special kernel support.
+
+	  Note that this has nothing to do with the network file systems NFS
+	  or Coda; you can say N here even if you intend to use NFS or Coda.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called nbd.
+
+	  If unsure, say N.
+
+config BLK_DEV_RAM
+	tristate "RAM block device support"
+	help
+	  Saying Y here will allow you to use a portion of your RAM memory as
+	  a block device, so that you can make file systems on it, read and
+	  write to it and do all the other things that you can do with normal
+	  block devices (such as hard drives). It is usually used to load and
+	  store a copy of a minimal root file system off of a floppy into RAM
+	  during the initial install of Linux.
+
+	  Note that the kernel command line option "ramdisk=XX" is now obsolete.
+	  For details, read <file:Documentation/admin-guide/blockdev/ramdisk.rst>.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called brd. An alias "rd" has been defined
+	  for historical reasons.
+
+	  Most normal users won't need the RAM disk functionality, and can
+	  thus say N here.
+
+config BLK_DEV_RAM_COUNT
+	int "Default number of RAM disks"
+	default "16"
+	depends on BLK_DEV_RAM
+	help
+	  The default value is 16 RAM disks. Change this if you know what you
+	  are doing. If you boot from a filesystem that needs to be extracted
+	  in memory, you will need at least one RAM disk (e.g. root on cramfs).
+
+config BLK_DEV_RAM_SIZE
+	int "Default RAM disk size (kbytes)"
+	depends on BLK_DEV_RAM
+	default "4096"
+	help
+	  The default value is 4096 kilobytes. Only change this if you know
+	  what you are doing.
+
+config CDROM_PKTCDVD
+	tristate "Packet writing on CD/DVD media (DEPRECATED)"
+	depends on !UML
+	depends on SCSI
+	select CDROM
+	help
+	  Note: This driver is deprecated and will be removed from the
+	  kernel in the near future!
+
+	  If you have a CDROM/DVD drive that supports packet writing, say
+	  Y to include support. It should work with any MMC/Mt Fuji
+	  compliant ATAPI or SCSI drive, which is just about any newer
+	  DVD/CD writer.
+
+	  Currently only writing to CD-RW, DVD-RW, DVD+RW and DVDRAM discs
+	  is possible.
+	  DVD-RW disks must be in restricted overwrite mode.
+
+	  See the file <file:Documentation/cdrom/packet-writing.rst>
+	  for further information on the use of this driver.
+
+	  To compile this driver as a module, choose M here: the
+	  module will be called pktcdvd.
+
+config CDROM_PKTCDVD_BUFFERS
+	int "Free buffers for data gathering"
+	depends on CDROM_PKTCDVD
+	default "8"
+	help
+	  This controls the maximum number of active concurrent packets. More
+	  concurrent packets can increase write performance, but also require
+	  more memory. Each concurrent packet will require approximately 64Kb
+	  of non-swappable kernel memory, memory which will be allocated when
+	  a disc is opened for writing.
+
+config CDROM_PKTCDVD_WCACHE
+	bool "Enable write caching"
+	depends on CDROM_PKTCDVD
+	help
+	  If enabled, write caching will be set for the CD-R/W device. For now
+	  this option is dangerous unless the CD-RW media is known good, as we
+	  don't do deferred write error handling yet.
+
+config ATA_OVER_ETH
+	tristate "ATA over Ethernet support"
+	depends on NET
+	help
+	This driver provides Support for ATA over Ethernet block
+	devices like the Coraid EtherDrive (R) Storage Blade.
+
+config SUNVDC
+	tristate "Sun Virtual Disk Client support"
+	depends on SUN_LDOMS
+	help
+	  Support for virtual disk devices as a client under Sun
+	  Logical Domains.
+
+source "drivers/s390/block/Kconfig"
+
+config XEN_BLKDEV_FRONTEND
+	tristate "Xen virtual block device support"
+	depends on XEN
+	default y
+	select XEN_XENBUS_FRONTEND
+	help
+	  This driver implements the front-end of the Xen virtual
+	  block device driver.  It communicates with a back-end driver
+	  in another domain which drives the actual block device.
+
+config XEN_BLKDEV_BACKEND
+	tristate "Xen block-device backend driver"
+	depends on XEN_BACKEND
+	help
+	  The block-device backend driver allows the kernel to export its
+	  block devices to other guests via a high-performance shared-memory
+	  interface.
+
+	  The corresponding Linux frontend driver is enabled by the
+	  CONFIG_XEN_BLKDEV_FRONTEND configuration option.
+
+	  The backend driver attaches itself to a any block device specified
+	  in the XenBus configuration. There are no limits to what the block
+	  device as long as it has a major and minor.
+
+	  If you are compiling a kernel to run in a Xen block backend driver
+	  domain (often this is domain 0) you should say Y here. To
+	  compile this driver as a module, chose M here: the module
+	  will be called xen-blkback.
+
+
+config VIRTIO_BLK
+	tristate "Virtio block driver"
+	depends on VIRTIO
+	select SG_POOL
+	help
+	  This is the virtual block driver for virtio.  It can be used with
+          QEMU based VMMs (like KVM or Xen).  Say Y or M.
+
+config BLK_DEV_RBD
+	tristate "Rados block device (RBD)"
+	depends on INET && BLOCK
+	select CEPH_LIB
+	select LIBCRC32C
+	select CRYPTO_AES
+	select CRYPTO
+	help
+	  Say Y here if you want include the Rados block device, which stripes
+	  a block device over objects stored in the Ceph distributed object
+	  store.
+
+	  More information at http://ceph.newdream.net/.
+
+	  If unsure, say N.
+
+config BLK_DEV_UBLK
+	tristate "Userspace block driver (Experimental)"
+	select IO_URING
+	help
+	  io_uring based userspace block driver. Together with ublk server, ublk
+	  has been working well, but interface with userspace or command data
+	  definition isn't finalized yet, and might change according to future
+	  requirement, so mark is as experimental now.
+
+	  Say Y if you want to get better performance because task_work_add()
+	  can be used in IO path for replacing io_uring cmd, which will become
+	  shared between IO tasks and ubq daemon, meantime task_work_add() can
+	  can handle batch more effectively, but task_work_add() isn't exported
+	  for module, so ublk has to be built to kernel.
+
+source "drivers/block/rnbd/Kconfig"
+
+endif # BLK_DEV
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
new file mode 100644
index 000000000..101612cba
--- /dev/null
+++ b/drivers/block/Makefile
@@ -0,0 +1,42 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for the kernel block device drivers.
+#
+# 12 June 2000, Christoph Hellwig <hch@infradead.org>
+# Rewritten to use lists instead of if-statements.
+# 
+
+# needed for trace events
+ccflags-y				+= -I$(src)
+
+obj-$(CONFIG_MAC_FLOPPY)	+= swim3.o
+obj-$(CONFIG_BLK_DEV_SWIM)	+= swim_mod.o
+obj-$(CONFIG_BLK_DEV_FD)	+= floppy.o
+obj-$(CONFIG_AMIGA_FLOPPY)	+= amiflop.o
+obj-$(CONFIG_PS3_DISK)		+= ps3disk.o
+obj-$(CONFIG_PS3_VRAM)		+= ps3vram.o
+obj-$(CONFIG_ATARI_FLOPPY)	+= ataflop.o
+obj-$(CONFIG_AMIGA_Z2RAM)	+= z2ram.o
+obj-$(CONFIG_N64CART)		+= n64cart.o
+obj-$(CONFIG_BLK_DEV_RAM)	+= brd.o
+obj-$(CONFIG_BLK_DEV_LOOP)	+= loop.o
+obj-$(CONFIG_CDROM_PKTCDVD)	+= pktcdvd.o
+obj-$(CONFIG_SUNVDC)		+= sunvdc.o
+
+obj-$(CONFIG_BLK_DEV_NBD)	+= nbd.o
+obj-$(CONFIG_VIRTIO_BLK)	+= virtio_blk.o
+
+obj-$(CONFIG_XEN_BLKDEV_FRONTEND)	+= xen-blkfront.o
+obj-$(CONFIG_XEN_BLKDEV_BACKEND)	+= xen-blkback/
+obj-$(CONFIG_BLK_DEV_DRBD)     += drbd/
+obj-$(CONFIG_BLK_DEV_RBD)     += rbd.o
+obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX)	+= mtip32xx/
+
+obj-$(CONFIG_ZRAM) += zram/
+obj-$(CONFIG_BLK_DEV_RNBD)	+= rnbd/
+
+obj-$(CONFIG_BLK_DEV_NULL_BLK)	+= null_blk/
+
+obj-$(CONFIG_BLK_DEV_UBLK)			+= ublk_drv.o
+
+swim_mod-y	:= swim.o swim_asm.o
diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c
new file mode 100644
index 000000000..4c8b2ba57
--- /dev/null
+++ b/drivers/block/amiflop.c
@@ -0,0 +1,1965 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *  linux/amiga/amiflop.c
+ *
+ *  Copyright (C) 1993  Greg Harp
+ *  Portions of this driver are based on code contributed by Brad Pepers
+ *  
+ *  revised 28.5.95 by Joerg Dorchain
+ *  - now no bugs(?) any more for both HD & DD
+ *  - added support for 40 Track 5.25" drives, 80-track hopefully behaves
+ *    like 3.5" dd (no way to test - are there any 5.25" drives out there
+ *    that work on an A4000?)
+ *  - wrote formatting routine (maybe dirty, but works)
+ *
+ *  june/july 1995 added ms-dos support by Joerg Dorchain
+ *  (portions based on messydos.device and various contributors)
+ *  - currently only 9 and 18 sector disks
+ *
+ *  - fixed a bug with the internal trackbuffer when using multiple 
+ *    disks the same time
+ *  - made formatting a bit safer
+ *  - added command line and machine based default for "silent" df0
+ *
+ *  december 1995 adapted for 1.2.13pl4 by Joerg Dorchain
+ *  - works but I think it's inefficient. (look in redo_fd_request)
+ *    But the changes were very efficient. (only three and a half lines)
+ *
+ *  january 1996 added special ioctl for tracking down read/write problems
+ *  - usage ioctl(d, RAW_TRACK, ptr); the raw track buffer (MFM-encoded data
+ *    is copied to area. (area should be large enough since no checking is
+ *    done - 30K is currently sufficient). return the actual size of the
+ *    trackbuffer
+ *  - replaced udelays() by a timer (CIAA timer B) for the waits 
+ *    needed for the disk mechanic.
+ *
+ *  february 1996 fixed error recovery and multiple disk access
+ *  - both got broken the first time I tampered with the driver :-(
+ *  - still not safe, but better than before
+ *
+ *  revised Marts 3rd, 1996 by Jes Sorensen for use in the 1.3.28 kernel.
+ *  - Minor changes to accept the kdev_t.
+ *  - Replaced some more udelays with ms_delays. Udelay is just a loop,
+ *    and so the delay will be different depending on the given
+ *    processor :-(
+ *  - The driver could use a major cleanup because of the new
+ *    major/minor handling that came with kdev_t. It seems to work for
+ *    the time being, but I can't guarantee that it will stay like
+ *    that when we start using 16 (24?) bit minors.
+ *
+ * restructured jan 1997 by Joerg Dorchain
+ * - Fixed Bug accessing multiple disks
+ * - some code cleanup
+ * - added trackbuffer for each drive to speed things up
+ * - fixed some race conditions (who finds the next may send it to me ;-)
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#include <linux/fd.h>
+#include <linux/hdreg.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/major.h>
+#include <linux/mutex.h>
+#include <linux/fs.h>
+#include <linux/blk-mq.h>
+#include <linux/interrupt.h>
+#include <linux/platform_device.h>
+
+#include <asm/setup.h>
+#include <linux/uaccess.h>
+#include <asm/amigahw.h>
+#include <asm/amigaints.h>
+#include <asm/irq.h>
+
+#undef DEBUG /* print _LOTS_ of infos */
+
+#define RAW_IOCTL
+#ifdef RAW_IOCTL
+#define IOCTL_RAW_TRACK 0x5254524B  /* 'RTRK' */
+#endif
+
+/*
+ *  Defines
+ */
+
+/*
+ * CIAAPRA bits (read only)
+ */
+
+#define DSKRDY      (0x1<<5)        /* disk ready when low */
+#define DSKTRACK0   (0x1<<4)        /* head at track zero when low */
+#define DSKPROT     (0x1<<3)        /* disk protected when low */
+#define DSKCHANGE   (0x1<<2)        /* low when disk removed */
+
+/*
+ * CIAAPRB bits (read/write)
+ */
+
+#define DSKMOTOR    (0x1<<7)        /* motor on when low */
+#define DSKSEL3     (0x1<<6)        /* select drive 3 when low */
+#define DSKSEL2     (0x1<<5)        /* select drive 2 when low */
+#define DSKSEL1     (0x1<<4)        /* select drive 1 when low */
+#define DSKSEL0     (0x1<<3)        /* select drive 0 when low */
+#define DSKSIDE     (0x1<<2)        /* side selection: 0 = upper, 1 = lower */
+#define DSKDIREC    (0x1<<1)        /* step direction: 0=in, 1=out (to trk 0) */
+#define DSKSTEP     (0x1)           /* pulse low to step head 1 track */
+
+/*
+ * DSKBYTR bits (read only)
+ */
+
+#define DSKBYT      (1<<15)         /* register contains valid byte when set */
+#define DMAON       (1<<14)         /* disk DMA enabled */
+#define DISKWRITE   (1<<13)         /* disk write bit in DSKLEN enabled */
+#define WORDEQUAL   (1<<12)         /* DSKSYNC register match when true */
+/* bits 7-0 are data */
+
+/*
+ * ADKCON/ADKCONR bits
+ */
+
+#ifndef SETCLR
+#define ADK_SETCLR      (1<<15)     /* control bit */
+#endif
+#define ADK_PRECOMP1    (1<<14)     /* precompensation selection */
+#define ADK_PRECOMP0    (1<<13)     /* 00=none, 01=140ns, 10=280ns, 11=500ns */
+#define ADK_MFMPREC     (1<<12)     /* 0=GCR precomp., 1=MFM precomp. */
+#define ADK_WORDSYNC    (1<<10)     /* enable DSKSYNC auto DMA */
+#define ADK_MSBSYNC     (1<<9)      /* when 1, enable sync on MSbit (for GCR) */
+#define ADK_FAST        (1<<8)      /* bit cell: 0=2us (GCR), 1=1us (MFM) */
+
+/*
+ * DSKLEN bits
+ */
+
+#define DSKLEN_DMAEN    (1<<15)
+#define DSKLEN_WRITE    (1<<14)
+
+/*
+ * INTENA/INTREQ bits
+ */
+
+#define DSKINDEX    (0x1<<4)        /* DSKINDEX bit */
+
+/*
+ * Misc
+ */
+
+#define MFM_SYNC    0x4489          /* standard MFM sync value */
+
+/* Values for FD_COMMAND */
+#define FD_RECALIBRATE		0x07	/* move to track 0 */
+#define FD_SEEK			0x0F	/* seek track */
+#define FD_READ			0xE6	/* read with MT, MFM, SKip deleted */
+#define FD_WRITE		0xC5	/* write with MT, MFM */
+#define FD_SENSEI		0x08	/* Sense Interrupt Status */
+#define FD_SPECIFY		0x03	/* specify HUT etc */
+#define FD_FORMAT		0x4D	/* format one track */
+#define FD_VERSION		0x10	/* get version code */
+#define FD_CONFIGURE		0x13	/* configure FIFO operation */
+#define FD_PERPENDICULAR	0x12	/* perpendicular r/w mode */
+
+#define FD_MAX_UNITS    4	/* Max. Number of drives */
+#define FLOPPY_MAX_SECTORS	22	/* Max. Number of sectors per track */
+
+struct fd_data_type {
+	char *name;		/* description of data type */
+	int sects;		/* sectors per track */
+	int (*read_fkt)(int);	/* read whole track */
+	void (*write_fkt)(int);	/* write whole track */
+};
+
+struct fd_drive_type {
+	unsigned long code;		/* code returned from drive */
+	char *name;			/* description of drive */
+	unsigned int tracks;	/* number of tracks */
+	unsigned int heads;		/* number of heads */
+	unsigned int read_size;	/* raw read size for one track */
+	unsigned int write_size;	/* raw write size for one track */
+	unsigned int sect_mult;	/* sectors and gap multiplier (HD = 2) */
+	unsigned int precomp1;	/* start track for precomp 1 */
+	unsigned int precomp2;	/* start track for precomp 2 */
+	unsigned int step_delay;	/* time (in ms) for delay after step */
+	unsigned int settle_time;	/* time to settle after dir change */
+	unsigned int side_time;	/* time needed to change sides */
+};
+
+struct amiga_floppy_struct {
+	struct fd_drive_type *type;	/* type of floppy for this unit */
+	struct fd_data_type *dtype;	/* type of floppy for this unit */
+	int track;			/* current track (-1 == unknown) */
+	unsigned char *trackbuf;	/* current track (kmaloc()'d */
+
+	int blocks;			/* total # blocks on disk */
+
+	int changed;			/* true when not known */
+	int disk;			/* disk in drive (-1 == unknown) */
+	int motor;			/* true when motor is at speed */
+	int busy;			/* true when drive is active */
+	int dirty;			/* true when trackbuf is not on disk */
+	int status;			/* current error code for unit */
+	struct gendisk *gendisk[2];
+	struct blk_mq_tag_set tag_set;
+};
+
+/*
+ *  Error codes
+ */
+#define FD_OK		0	/* operation succeeded */
+#define FD_ERROR	-1	/* general error (seek, read, write, etc) */
+#define FD_NOUNIT	1	/* unit does not exist */
+#define FD_UNITBUSY	2	/* unit already active */
+#define FD_NOTACTIVE	3	/* unit is not active */
+#define FD_NOTREADY	4	/* unit is not ready (motor not on/no disk) */
+
+#define MFM_NOSYNC	1
+#define MFM_HEADER	2
+#define MFM_DATA	3
+#define MFM_TRACK	4
+
+/*
+ *  Floppy ID values
+ */
+#define FD_NODRIVE	0x00000000  /* response when no unit is present */
+#define FD_DD_3 	0xffffffff  /* double-density 3.5" (880K) drive */
+#define FD_HD_3 	0x55555555  /* high-density 3.5" (1760K) drive */
+#define FD_DD_5 	0xaaaaaaaa  /* double-density 5.25" (440K) drive */
+
+static DEFINE_MUTEX(amiflop_mutex);
+static unsigned long int fd_def_df0 = FD_DD_3;     /* default for df0 if it doesn't identify */
+
+module_param(fd_def_df0, ulong, 0);
+MODULE_LICENSE("GPL");
+
+/*
+ *  Macros
+ */
+#define MOTOR_ON	(ciab.prb &= ~DSKMOTOR)
+#define MOTOR_OFF	(ciab.prb |= DSKMOTOR)
+#define SELECT(mask)    (ciab.prb &= ~mask)
+#define DESELECT(mask)  (ciab.prb |= mask)
+#define SELMASK(drive)  (1 << (3 + (drive & 3)))
+
+static struct fd_drive_type drive_types[] = {
+/*  code	name	   tr he   rdsz   wrsz sm pc1 pc2 sd  st st*/
+/*  warning: times are now in milliseconds (ms)                    */
+{ FD_DD_3,	"DD 3.5",  80, 2, 14716, 13630, 1, 80,161, 3, 18, 1},
+{ FD_HD_3,	"HD 3.5",  80, 2, 28344, 27258, 2, 80,161, 3, 18, 1},
+{ FD_DD_5,	"DD 5.25", 40, 2, 14716, 13630, 1, 40, 81, 6, 30, 2},
+{ FD_NODRIVE, "No Drive", 0, 0,     0,     0, 0,  0,  0,  0,  0, 0}
+};
+static int num_dr_types = ARRAY_SIZE(drive_types);
+
+static int amiga_read(int), dos_read(int);
+static void amiga_write(int), dos_write(int);
+static struct fd_data_type data_types[] = {
+	{ "Amiga", 11 , amiga_read, amiga_write},
+	{ "MS-Dos", 9, dos_read, dos_write}
+};
+
+/* current info on each unit */
+static struct amiga_floppy_struct unit[FD_MAX_UNITS];
+
+static struct timer_list flush_track_timer[FD_MAX_UNITS];
+static struct timer_list post_write_timer;
+static unsigned long post_write_timer_drive;
+static struct timer_list motor_on_timer;
+static struct timer_list motor_off_timer[FD_MAX_UNITS];
+static int on_attempts;
+
+/* Synchronization of FDC access */
+/* request loop (trackbuffer) */
+static volatile int fdc_busy = -1;
+static volatile int fdc_nested;
+static DECLARE_WAIT_QUEUE_HEAD(fdc_wait);
+ 
+static DECLARE_COMPLETION(motor_on_completion);
+
+static volatile int selected = -1;	/* currently selected drive */
+
+static int writepending;
+static int writefromint;
+static char *raw_buf;
+
+static DEFINE_SPINLOCK(amiflop_lock);
+
+#define RAW_BUF_SIZE 30000  /* size of raw disk data */
+
+/*
+ * These are global variables, as that's the easiest way to give
+ * information to interrupts. They are the data used for the current
+ * request.
+ */
+static volatile char block_flag;
+static DECLARE_WAIT_QUEUE_HEAD(wait_fd_block);
+
+/* MS-Dos MFM Coding tables (should go quick and easy) */
+static unsigned char mfmencode[16]={
+	0x2a, 0x29, 0x24, 0x25, 0x12, 0x11, 0x14, 0x15,
+	0x4a, 0x49, 0x44, 0x45, 0x52, 0x51, 0x54, 0x55
+};
+static unsigned char mfmdecode[128];
+
+/* floppy internal millisecond timer stuff */
+static DECLARE_COMPLETION(ms_wait_completion);
+#define MS_TICKS ((amiga_eclock+50)/1000)
+
+/*
+ * Note that MAX_ERRORS=X doesn't imply that we retry every bad read
+ * max X times - some types of errors increase the errorcount by 2 or
+ * even 3, so we might actually retry only X/2 times before giving up.
+ */
+#define MAX_ERRORS 12
+
+#define custom amiga_custom
+
+/* Prevent "aliased" accesses. */
+static int fd_ref[4] = { 0,0,0,0 };
+static int fd_device[4] = { 0, 0, 0, 0 };
+
+/*
+ * Here come the actual hardware access and helper functions.
+ * They are not reentrant and single threaded because all drives
+ * share the same hardware and the same trackbuffer.
+ */
+
+/* Milliseconds timer */
+
+static irqreturn_t ms_isr(int irq, void *dummy)
+{
+	complete(&ms_wait_completion);
+	return IRQ_HANDLED;
+}
+
+/* all waits are queued up 
+   A more generic routine would do a schedule a la timer.device */
+static void ms_delay(int ms)
+{
+	int ticks;
+	static DEFINE_MUTEX(mutex);
+
+	if (ms > 0) {
+		mutex_lock(&mutex);
+		ticks = MS_TICKS*ms-1;
+		ciaa.tblo=ticks%256;
+		ciaa.tbhi=ticks/256;
+		ciaa.crb=0x19; /*count eclock, force load, one-shoot, start */
+		wait_for_completion(&ms_wait_completion);
+		mutex_unlock(&mutex);
+	}
+}
+
+/* Hardware semaphore */
+
+/* returns true when we would get the semaphore */
+static inline int try_fdc(int drive)
+{
+	drive &= 3;
+	return ((fdc_busy < 0) || (fdc_busy == drive));
+}
+
+static void get_fdc(int drive)
+{
+	unsigned long flags;
+
+	drive &= 3;
+#ifdef DEBUG
+	printk("get_fdc: drive %d  fdc_busy %d  fdc_nested %d\n",drive,fdc_busy,fdc_nested);
+#endif
+	local_irq_save(flags);
+	wait_event(fdc_wait, try_fdc(drive));
+	fdc_busy = drive;
+	fdc_nested++;
+	local_irq_restore(flags);
+}
+
+static inline void rel_fdc(void)
+{
+#ifdef DEBUG
+	if (fdc_nested == 0)
+		printk("fd: unmatched rel_fdc\n");
+	printk("rel_fdc: fdc_busy %d fdc_nested %d\n",fdc_busy,fdc_nested);
+#endif
+	fdc_nested--;
+	if (fdc_nested == 0) {
+		fdc_busy = -1;
+		wake_up(&fdc_wait);
+	}
+}
+
+static void fd_select (int drive)
+{
+	unsigned char prb = ~0;
+
+	drive&=3;
+#ifdef DEBUG
+	printk("selecting %d\n",drive);
+#endif
+	if (drive == selected)
+		return;
+	get_fdc(drive);
+	selected = drive;
+
+	if (unit[drive].track % 2 != 0)
+		prb &= ~DSKSIDE;
+	if (unit[drive].motor == 1)
+		prb &= ~DSKMOTOR;
+	ciab.prb |= (SELMASK(0)|SELMASK(1)|SELMASK(2)|SELMASK(3));
+	ciab.prb = prb;
+	prb &= ~SELMASK(drive);
+	ciab.prb = prb;
+	rel_fdc();
+}
+
+static void fd_deselect (int drive)
+{
+	unsigned char prb;
+	unsigned long flags;
+
+	drive&=3;
+#ifdef DEBUG
+	printk("deselecting %d\n",drive);
+#endif
+	if (drive != selected) {
+		printk(KERN_WARNING "Deselecting drive %d while %d was selected!\n",drive,selected);
+		return;
+	}
+
+	get_fdc(drive);
+	local_irq_save(flags);
+
+	selected = -1;
+
+	prb = ciab.prb;
+	prb |= (SELMASK(0)|SELMASK(1)|SELMASK(2)|SELMASK(3));
+	ciab.prb = prb;
+
+	local_irq_restore (flags);
+	rel_fdc();
+
+}
+
+static void motor_on_callback(struct timer_list *unused)
+{
+	if (!(ciaa.pra & DSKRDY) || --on_attempts == 0) {
+		complete_all(&motor_on_completion);
+	} else {
+		motor_on_timer.expires = jiffies + HZ/10;
+		add_timer(&motor_on_timer);
+	}
+}
+
+static int fd_motor_on(int nr)
+{
+	nr &= 3;
+
+	del_timer(motor_off_timer + nr);
+
+	if (!unit[nr].motor) {
+		unit[nr].motor = 1;
+		fd_select(nr);
+
+		reinit_completion(&motor_on_completion);
+		mod_timer(&motor_on_timer, jiffies + HZ/2);
+
+		on_attempts = 10;
+		wait_for_completion(&motor_on_completion);
+		fd_deselect(nr);
+	}
+
+	if (on_attempts == 0) {
+		on_attempts = -1;
+#if 0
+		printk (KERN_ERR "motor_on failed, turning motor off\n");
+		fd_motor_off (motor_off_timer + nr);
+		return 0;
+#else
+		printk (KERN_WARNING "DSKRDY not set after 1.5 seconds - assuming drive is spinning notwithstanding\n");
+#endif
+	}
+
+	return 1;
+}
+
+static void fd_motor_off(struct timer_list *timer)
+{
+	unsigned long drive = ((unsigned long)timer -
+			       (unsigned long)&motor_off_timer[0]) /
+					sizeof(motor_off_timer[0]);
+
+	drive&=3;
+	if (!try_fdc(drive)) {
+		/* We would be blocked in an interrupt, so try again later */
+		timer->expires = jiffies + 1;
+		add_timer(timer);
+		return;
+	}
+	unit[drive].motor = 0;
+	fd_select(drive);
+	udelay (1);
+	fd_deselect(drive);
+}
+
+static void floppy_off (unsigned int nr)
+{
+	int drive;
+
+	drive = nr & 3;
+	mod_timer(motor_off_timer + drive, jiffies + 3*HZ);
+}
+
+static int fd_calibrate(int drive)
+{
+	unsigned char prb;
+	int n;
+
+	drive &= 3;
+	get_fdc(drive);
+	if (!fd_motor_on (drive))
+		return 0;
+	fd_select (drive);
+	prb = ciab.prb;
+	prb |= DSKSIDE;
+	prb &= ~DSKDIREC;
+	ciab.prb = prb;
+	for (n = unit[drive].type->tracks/2; n != 0; --n) {
+		if (ciaa.pra & DSKTRACK0)
+			break;
+		prb &= ~DSKSTEP;
+		ciab.prb = prb;
+		prb |= DSKSTEP;
+		udelay (2);
+		ciab.prb = prb;
+		ms_delay(unit[drive].type->step_delay);
+	}
+	ms_delay (unit[drive].type->settle_time);
+	prb |= DSKDIREC;
+	n = unit[drive].type->tracks + 20;
+	for (;;) {
+		prb &= ~DSKSTEP;
+		ciab.prb = prb;
+		prb |= DSKSTEP;
+		udelay (2);
+		ciab.prb = prb;
+		ms_delay(unit[drive].type->step_delay + 1);
+		if ((ciaa.pra & DSKTRACK0) == 0)
+			break;
+		if (--n == 0) {
+			printk (KERN_ERR "fd%d: calibrate failed, turning motor off\n", drive);
+			fd_motor_off (motor_off_timer + drive);
+			unit[drive].track = -1;
+			rel_fdc();
+			return 0;
+		}
+	}
+	unit[drive].track = 0;
+	ms_delay(unit[drive].type->settle_time);
+
+	rel_fdc();
+	fd_deselect(drive);
+	return 1;
+}
+
+static int fd_seek(int drive, int track)
+{
+	unsigned char prb;
+	int cnt;
+
+#ifdef DEBUG
+	printk("seeking drive %d to track %d\n",drive,track);
+#endif
+	drive &= 3;
+	get_fdc(drive);
+	if (unit[drive].track == track) {
+		rel_fdc();
+		return 1;
+	}
+	if (!fd_motor_on(drive)) {
+		rel_fdc();
+		return 0;
+	}
+	if (unit[drive].track < 0 && !fd_calibrate(drive)) {
+		rel_fdc();
+		return 0;
+	}
+
+	fd_select (drive);
+	cnt = unit[drive].track/2 - track/2;
+	prb = ciab.prb;
+	prb |= DSKSIDE | DSKDIREC;
+	if (track % 2 != 0)
+		prb &= ~DSKSIDE;
+	if (cnt < 0) {
+		cnt = - cnt;
+		prb &= ~DSKDIREC;
+	}
+	ciab.prb = prb;
+	if (track % 2 != unit[drive].track % 2)
+		ms_delay (unit[drive].type->side_time);
+	unit[drive].track = track;
+	if (cnt == 0) {
+		rel_fdc();
+		fd_deselect(drive);
+		return 1;
+	}
+	do {
+		prb &= ~DSKSTEP;
+		ciab.prb = prb;
+		prb |= DSKSTEP;
+		udelay (1);
+		ciab.prb = prb;
+		ms_delay (unit[drive].type->step_delay);
+	} while (--cnt != 0);
+	ms_delay (unit[drive].type->settle_time);
+
+	rel_fdc();
+	fd_deselect(drive);
+	return 1;
+}
+
+static unsigned long fd_get_drive_id(int drive)
+{
+	int i;
+	ulong id = 0;
+
+  	drive&=3;
+  	get_fdc(drive);
+	/* set up for ID */
+	MOTOR_ON;
+	udelay(2);
+	SELECT(SELMASK(drive));
+	udelay(2);
+	DESELECT(SELMASK(drive));
+	udelay(2);
+	MOTOR_OFF;
+	udelay(2);
+	SELECT(SELMASK(drive));
+	udelay(2);
+	DESELECT(SELMASK(drive));
+	udelay(2);
+
+	/* loop and read disk ID */
+	for (i=0; i<32; i++) {
+		SELECT(SELMASK(drive));
+		udelay(2);
+
+		/* read and store value of DSKRDY */
+		id <<= 1;
+		id |= (ciaa.pra & DSKRDY) ? 0 : 1;	/* cia regs are low-active! */
+
+		DESELECT(SELMASK(drive));
+	}
+
+	rel_fdc();
+
+        /*
+         * RB: At least A500/A2000's df0: don't identify themselves.
+         * As every (real) Amiga has at least a 3.5" DD drive as df0:
+         * we default to that if df0: doesn't identify as a certain
+         * type.
+         */
+        if(drive == 0 && id == FD_NODRIVE)
+	{
+                id = fd_def_df0;
+                printk(KERN_NOTICE "fd: drive 0 didn't identify, setting default %08lx\n", (ulong)fd_def_df0);
+	}
+	/* return the ID value */
+	return (id);
+}
+
+static irqreturn_t fd_block_done(int irq, void *dummy)
+{
+	if (block_flag)
+		custom.dsklen = 0x4000;
+
+	if (block_flag == 2) { /* writing */
+		writepending = 2;
+		post_write_timer.expires = jiffies + 1; /* at least 2 ms */
+		post_write_timer_drive = selected;
+		add_timer(&post_write_timer);
+	}
+	else {                /* reading */
+		block_flag = 0;
+		wake_up (&wait_fd_block);
+	}
+	return IRQ_HANDLED;
+}
+
+static void raw_read(int drive)
+{
+	drive&=3;
+	get_fdc(drive);
+	wait_event(wait_fd_block, !block_flag);
+	fd_select(drive);
+	/* setup adkcon bits correctly */
+	custom.adkcon = ADK_MSBSYNC;
+	custom.adkcon = ADK_SETCLR|ADK_WORDSYNC|ADK_FAST;
+
+	custom.dsksync = MFM_SYNC;
+
+	custom.dsklen = 0;
+	custom.dskptr = (u_char *)ZTWO_PADDR((u_char *)raw_buf);
+	custom.dsklen = unit[drive].type->read_size/sizeof(short) | DSKLEN_DMAEN;
+	custom.dsklen = unit[drive].type->read_size/sizeof(short) | DSKLEN_DMAEN;
+
+	block_flag = 1;
+
+	wait_event(wait_fd_block, !block_flag);
+
+	custom.dsklen = 0;
+	fd_deselect(drive);
+	rel_fdc();
+}
+
+static int raw_write(int drive)
+{
+	ushort adk;
+
+	drive&=3;
+	get_fdc(drive); /* corresponds to rel_fdc() in post_write() */
+	if ((ciaa.pra & DSKPROT) == 0) {
+		rel_fdc();
+		return 0;
+	}
+	wait_event(wait_fd_block, !block_flag);
+	fd_select(drive);
+	/* clear adkcon bits */
+	custom.adkcon = ADK_PRECOMP1|ADK_PRECOMP0|ADK_WORDSYNC|ADK_MSBSYNC;
+	/* set appropriate adkcon bits */
+	adk = ADK_SETCLR|ADK_FAST;
+	if ((ulong)unit[drive].track >= unit[drive].type->precomp2)
+		adk |= ADK_PRECOMP1;
+	else if ((ulong)unit[drive].track >= unit[drive].type->precomp1)
+		adk |= ADK_PRECOMP0;
+	custom.adkcon = adk;
+
+	custom.dsklen = DSKLEN_WRITE;
+	custom.dskptr = (u_char *)ZTWO_PADDR((u_char *)raw_buf);
+	custom.dsklen = unit[drive].type->write_size/sizeof(short) | DSKLEN_DMAEN|DSKLEN_WRITE;
+	custom.dsklen = unit[drive].type->write_size/sizeof(short) | DSKLEN_DMAEN|DSKLEN_WRITE;
+
+	block_flag = 2;
+	return 1;
+}
+
+/*
+ * to be called at least 2ms after the write has finished but before any
+ * other access to the hardware.
+ */
+static void post_write (unsigned long drive)
+{
+#ifdef DEBUG
+	printk("post_write for drive %ld\n",drive);
+#endif
+	drive &= 3;
+	custom.dsklen = 0;
+	block_flag = 0;
+	writepending = 0;
+	writefromint = 0;
+	unit[drive].dirty = 0;
+	wake_up(&wait_fd_block);
+	fd_deselect(drive);
+	rel_fdc(); /* corresponds to get_fdc() in raw_write */
+}
+
+static void post_write_callback(struct timer_list *timer)
+{
+	post_write(post_write_timer_drive);
+}
+
+/*
+ * The following functions are to convert the block contents into raw data
+ * written to disk and vice versa.
+ * (Add other formats here ;-))
+ */
+
+static unsigned long scan_sync(unsigned long raw, unsigned long end)
+{
+	ushort *ptr = (ushort *)raw, *endp = (ushort *)end;
+
+	while (ptr < endp && *ptr++ != 0x4489)
+		;
+	if (ptr < endp) {
+		while (*ptr == 0x4489 && ptr < endp)
+			ptr++;
+		return (ulong)ptr;
+	}
+	return 0;
+}
+
+static inline unsigned long checksum(unsigned long *addr, int len)
+{
+	unsigned long csum = 0;
+
+	len /= sizeof(*addr);
+	while (len-- > 0)
+		csum ^= *addr++;
+	csum = ((csum>>1) & 0x55555555)  ^  (csum & 0x55555555);
+
+	return csum;
+}
+
+static unsigned long decode (unsigned long *data, unsigned long *raw,
+			     int len)
+{
+	ulong *odd, *even;
+
+	/* convert length from bytes to longwords */
+	len >>= 2;
+	odd = raw;
+	even = odd + len;
+
+	/* prepare return pointer */
+	raw += len * 2;
+
+	do {
+		*data++ = ((*odd++ & 0x55555555) << 1) | (*even++ & 0x55555555);
+	} while (--len != 0);
+
+	return (ulong)raw;
+}
+
+struct header {
+	unsigned char magic;
+	unsigned char track;
+	unsigned char sect;
+	unsigned char ord;
+	unsigned char labels[16];
+	unsigned long hdrchk;
+	unsigned long datachk;
+};
+
+static int amiga_read(int drive)
+{
+	unsigned long raw;
+	unsigned long end;
+	int scnt;
+	unsigned long csum;
+	struct header hdr;
+
+	drive&=3;
+	raw = (long) raw_buf;
+	end = raw + unit[drive].type->read_size;
+
+	for (scnt = 0;scnt < unit[drive].dtype->sects * unit[drive].type->sect_mult; scnt++) {
+		if (!(raw = scan_sync(raw, end))) {
+			printk (KERN_INFO "can't find sync for sector %d\n", scnt);
+			return MFM_NOSYNC;
+		}
+
+		raw = decode ((ulong *)&hdr.magic, (ulong *)raw, 4);
+		raw = decode ((ulong *)&hdr.labels, (ulong *)raw, 16);
+		raw = decode ((ulong *)&hdr.hdrchk, (ulong *)raw, 4);
+		raw = decode ((ulong *)&hdr.datachk, (ulong *)raw, 4);
+		csum = checksum((ulong *)&hdr,
+				(char *)&hdr.hdrchk-(char *)&hdr);
+
+#ifdef DEBUG
+		printk ("(%x,%d,%d,%d) (%lx,%lx,%lx,%lx) %lx %lx\n",
+			hdr.magic, hdr.track, hdr.sect, hdr.ord,
+			*(ulong *)&hdr.labels[0], *(ulong *)&hdr.labels[4],
+			*(ulong *)&hdr.labels[8], *(ulong *)&hdr.labels[12],
+			hdr.hdrchk, hdr.datachk);
+#endif
+
+		if (hdr.hdrchk != csum) {
+			printk(KERN_INFO "MFM_HEADER: %08lx,%08lx\n", hdr.hdrchk, csum);
+			return MFM_HEADER;
+		}
+
+		/* verify track */
+		if (hdr.track != unit[drive].track) {
+			printk(KERN_INFO "MFM_TRACK: %d, %d\n", hdr.track, unit[drive].track);
+			return MFM_TRACK;
+		}
+
+		raw = decode ((ulong *)(unit[drive].trackbuf + hdr.sect*512),
+			      (ulong *)raw, 512);
+		csum = checksum((ulong *)(unit[drive].trackbuf + hdr.sect*512), 512);
+
+		if (hdr.datachk != csum) {
+			printk(KERN_INFO "MFM_DATA: (%x:%d:%d:%d) sc=%d %lx, %lx\n",
+			       hdr.magic, hdr.track, hdr.sect, hdr.ord, scnt,
+			       hdr.datachk, csum);
+			printk (KERN_INFO "data=(%lx,%lx,%lx,%lx)\n",
+				((ulong *)(unit[drive].trackbuf+hdr.sect*512))[0],
+				((ulong *)(unit[drive].trackbuf+hdr.sect*512))[1],
+				((ulong *)(unit[drive].trackbuf+hdr.sect*512))[2],
+				((ulong *)(unit[drive].trackbuf+hdr.sect*512))[3]);
+			return MFM_DATA;
+		}
+	}
+
+	return 0;
+}
+
+static void encode(unsigned long data, unsigned long *dest)
+{
+	unsigned long data2;
+
+	data &= 0x55555555;
+	data2 = data ^ 0x55555555;
+	data |= ((data2 >> 1) | 0x80000000) & (data2 << 1);
+
+	if (*(dest - 1) & 0x00000001)
+		data &= 0x7FFFFFFF;
+
+	*dest = data;
+}
+
+static void encode_block(unsigned long *dest, unsigned long *src, int len)
+{
+	int cnt, to_cnt = 0;
+	unsigned long data;
+
+	/* odd bits */
+	for (cnt = 0; cnt < len / 4; cnt++) {
+		data = src[cnt] >> 1;
+		encode(data, dest + to_cnt++);
+	}
+
+	/* even bits */
+	for (cnt = 0; cnt < len / 4; cnt++) {
+		data = src[cnt];
+		encode(data, dest + to_cnt++);
+	}
+}
+
+static unsigned long *putsec(int disk, unsigned long *raw, int cnt)
+{
+	struct header hdr;
+	int i;
+
+	disk&=3;
+	*raw = (raw[-1]&1) ? 0x2AAAAAAA : 0xAAAAAAAA;
+	raw++;
+	*raw++ = 0x44894489;
+
+	hdr.magic = 0xFF;
+	hdr.track = unit[disk].track;
+	hdr.sect = cnt;
+	hdr.ord = unit[disk].dtype->sects * unit[disk].type->sect_mult - cnt;
+	for (i = 0; i < 16; i++)
+		hdr.labels[i] = 0;
+	hdr.hdrchk = checksum((ulong *)&hdr,
+			      (char *)&hdr.hdrchk-(char *)&hdr);
+	hdr.datachk = checksum((ulong *)(unit[disk].trackbuf+cnt*512), 512);
+
+	encode_block(raw, (ulong *)&hdr.magic, 4);
+	raw += 2;
+	encode_block(raw, (ulong *)&hdr.labels, 16);
+	raw += 8;
+	encode_block(raw, (ulong *)&hdr.hdrchk, 4);
+	raw += 2;
+	encode_block(raw, (ulong *)&hdr.datachk, 4);
+	raw += 2;
+	encode_block(raw, (ulong *)(unit[disk].trackbuf+cnt*512), 512);
+	raw += 256;
+
+	return raw;
+}
+
+static void amiga_write(int disk)
+{
+	unsigned int cnt;
+	unsigned long *ptr = (unsigned long *)raw_buf;
+
+	disk&=3;
+	/* gap space */
+	for (cnt = 0; cnt < 415 * unit[disk].type->sect_mult; cnt++)
+		*ptr++ = 0xaaaaaaaa;
+
+	/* sectors */
+	for (cnt = 0; cnt < unit[disk].dtype->sects * unit[disk].type->sect_mult; cnt++)
+		ptr = putsec (disk, ptr, cnt);
+	*(ushort *)ptr = (ptr[-1]&1) ? 0x2AA8 : 0xAAA8;
+}
+
+
+struct dos_header {
+	unsigned char track,   /* 0-80 */
+		side,    /* 0-1 */
+		sec,     /* 0-...*/
+		len_desc;/* 2 */
+	unsigned short crc;     /* on 68000 we got an alignment problem, 
+				   but this compiler solves it  by adding silently 
+				   adding a pad byte so data won't fit
+				   and this took about 3h to discover.... */
+	unsigned char gap1[22];     /* for longword-alignedness (0x4e) */
+};
+
+/* crc routines are borrowed from the messydos-handler  */
+
+/* excerpt from the messydos-device           
+; The CRC is computed not only over the actual data, but including
+; the SYNC mark (3 * $a1) and the 'ID/DATA - Address Mark' ($fe/$fb).
+; As we don't read or encode these fields into our buffers, we have to
+; preload the registers containing the CRC with the values they would have
+; after stepping over these fields.
+;
+; How CRCs "really" work:
+;
+; First, you should regard a bitstring as a series of coefficients of
+; polynomials. We calculate with these polynomials in modulo-2
+; arithmetic, in which both add and subtract are done the same as
+; exclusive-or. Now, we modify our data (a very long polynomial) in
+; such a way that it becomes divisible by the CCITT-standard 16-bit
+;		 16   12   5
+; polynomial:	x  + x	+ x + 1, represented by $11021. The easiest
+; way to do this would be to multiply (using proper arithmetic) our
+; datablock with $11021. So we have:
+;   data * $11021		 =
+;   data * ($10000 + $1021)      =
+;   data * $10000 + data * $1021
+; The left part of this is simple: Just add two 0 bytes. But then
+; the right part (data $1021) remains difficult and even could have
+; a carry into the left part. The solution is to use a modified
+; multiplication, which has a result that is not correct, but with
+; a difference of any multiple of $11021. We then only need to keep
+; the 16 least significant bits of the result.
+;
+; The following algorithm does this for us:
+;
+;   unsigned char *data, c, crclo, crchi;
+;   while (not done) {
+;	c = *data++ + crchi;
+;	crchi = (@ c) >> 8 + crclo;
+;	crclo = @ c;
+;   }
+;
+; Remember, + is done with EOR, the @ operator is in two tables (high
+; and low byte separately), which is calculated as
+;
+;      $1021 * (c & $F0)
+;  xor $1021 * (c & $0F)
+;  xor $1021 * (c >> 4)         (* is regular multiplication)
+;
+;
+; Anyway, the end result is the same as the remainder of the division of
+; the data by $11021. I am afraid I need to study theory a bit more...
+
+
+my only works was to code this from manx to C....
+
+*/
+
+static ushort dos_crc(void * data_a3, int data_d0, int data_d1, int data_d3)
+{
+	static unsigned char CRCTable1[] = {
+		0x00,0x10,0x20,0x30,0x40,0x50,0x60,0x70,0x81,0x91,0xa1,0xb1,0xc1,0xd1,0xe1,0xf1,
+		0x12,0x02,0x32,0x22,0x52,0x42,0x72,0x62,0x93,0x83,0xb3,0xa3,0xd3,0xc3,0xf3,0xe3,
+		0x24,0x34,0x04,0x14,0x64,0x74,0x44,0x54,0xa5,0xb5,0x85,0x95,0xe5,0xf5,0xc5,0xd5,
+		0x36,0x26,0x16,0x06,0x76,0x66,0x56,0x46,0xb7,0xa7,0x97,0x87,0xf7,0xe7,0xd7,0xc7,
+		0x48,0x58,0x68,0x78,0x08,0x18,0x28,0x38,0xc9,0xd9,0xe9,0xf9,0x89,0x99,0xa9,0xb9,
+		0x5a,0x4a,0x7a,0x6a,0x1a,0x0a,0x3a,0x2a,0xdb,0xcb,0xfb,0xeb,0x9b,0x8b,0xbb,0xab,
+		0x6c,0x7c,0x4c,0x5c,0x2c,0x3c,0x0c,0x1c,0xed,0xfd,0xcd,0xdd,0xad,0xbd,0x8d,0x9d,
+		0x7e,0x6e,0x5e,0x4e,0x3e,0x2e,0x1e,0x0e,0xff,0xef,0xdf,0xcf,0xbf,0xaf,0x9f,0x8f,
+		0x91,0x81,0xb1,0xa1,0xd1,0xc1,0xf1,0xe1,0x10,0x00,0x30,0x20,0x50,0x40,0x70,0x60,
+		0x83,0x93,0xa3,0xb3,0xc3,0xd3,0xe3,0xf3,0x02,0x12,0x22,0x32,0x42,0x52,0x62,0x72,
+		0xb5,0xa5,0x95,0x85,0xf5,0xe5,0xd5,0xc5,0x34,0x24,0x14,0x04,0x74,0x64,0x54,0x44,
+		0xa7,0xb7,0x87,0x97,0xe7,0xf7,0xc7,0xd7,0x26,0x36,0x06,0x16,0x66,0x76,0x46,0x56,
+		0xd9,0xc9,0xf9,0xe9,0x99,0x89,0xb9,0xa9,0x58,0x48,0x78,0x68,0x18,0x08,0x38,0x28,
+		0xcb,0xdb,0xeb,0xfb,0x8b,0x9b,0xab,0xbb,0x4a,0x5a,0x6a,0x7a,0x0a,0x1a,0x2a,0x3a,
+		0xfd,0xed,0xdd,0xcd,0xbd,0xad,0x9d,0x8d,0x7c,0x6c,0x5c,0x4c,0x3c,0x2c,0x1c,0x0c,
+		0xef,0xff,0xcf,0xdf,0xaf,0xbf,0x8f,0x9f,0x6e,0x7e,0x4e,0x5e,0x2e,0x3e,0x0e,0x1e
+	};
+
+	static unsigned char CRCTable2[] = {
+		0x00,0x21,0x42,0x63,0x84,0xa5,0xc6,0xe7,0x08,0x29,0x4a,0x6b,0x8c,0xad,0xce,0xef,
+		0x31,0x10,0x73,0x52,0xb5,0x94,0xf7,0xd6,0x39,0x18,0x7b,0x5a,0xbd,0x9c,0xff,0xde,
+		0x62,0x43,0x20,0x01,0xe6,0xc7,0xa4,0x85,0x6a,0x4b,0x28,0x09,0xee,0xcf,0xac,0x8d,
+		0x53,0x72,0x11,0x30,0xd7,0xf6,0x95,0xb4,0x5b,0x7a,0x19,0x38,0xdf,0xfe,0x9d,0xbc,
+		0xc4,0xe5,0x86,0xa7,0x40,0x61,0x02,0x23,0xcc,0xed,0x8e,0xaf,0x48,0x69,0x0a,0x2b,
+		0xf5,0xd4,0xb7,0x96,0x71,0x50,0x33,0x12,0xfd,0xdc,0xbf,0x9e,0x79,0x58,0x3b,0x1a,
+		0xa6,0x87,0xe4,0xc5,0x22,0x03,0x60,0x41,0xae,0x8f,0xec,0xcd,0x2a,0x0b,0x68,0x49,
+		0x97,0xb6,0xd5,0xf4,0x13,0x32,0x51,0x70,0x9f,0xbe,0xdd,0xfc,0x1b,0x3a,0x59,0x78,
+		0x88,0xa9,0xca,0xeb,0x0c,0x2d,0x4e,0x6f,0x80,0xa1,0xc2,0xe3,0x04,0x25,0x46,0x67,
+		0xb9,0x98,0xfb,0xda,0x3d,0x1c,0x7f,0x5e,0xb1,0x90,0xf3,0xd2,0x35,0x14,0x77,0x56,
+		0xea,0xcb,0xa8,0x89,0x6e,0x4f,0x2c,0x0d,0xe2,0xc3,0xa0,0x81,0x66,0x47,0x24,0x05,
+		0xdb,0xfa,0x99,0xb8,0x5f,0x7e,0x1d,0x3c,0xd3,0xf2,0x91,0xb0,0x57,0x76,0x15,0x34,
+		0x4c,0x6d,0x0e,0x2f,0xc8,0xe9,0x8a,0xab,0x44,0x65,0x06,0x27,0xc0,0xe1,0x82,0xa3,
+		0x7d,0x5c,0x3f,0x1e,0xf9,0xd8,0xbb,0x9a,0x75,0x54,0x37,0x16,0xf1,0xd0,0xb3,0x92,
+		0x2e,0x0f,0x6c,0x4d,0xaa,0x8b,0xe8,0xc9,0x26,0x07,0x64,0x45,0xa2,0x83,0xe0,0xc1,
+		0x1f,0x3e,0x5d,0x7c,0x9b,0xba,0xd9,0xf8,0x17,0x36,0x55,0x74,0x93,0xb2,0xd1,0xf0
+	};
+
+/* look at the asm-code - what looks in C a bit strange is almost as good as handmade */
+	register int i;
+	register unsigned char *CRCT1, *CRCT2, *data, c, crch, crcl;
+
+	CRCT1=CRCTable1;
+	CRCT2=CRCTable2;
+	data=data_a3;
+	crcl=data_d1;
+	crch=data_d0;
+	for (i=data_d3; i>=0; i--) {
+		c = (*data++) ^ crch;
+		crch = CRCT1[c] ^ crcl;
+		crcl = CRCT2[c];
+	}
+	return (crch<<8)|crcl;
+}
+
+static inline ushort dos_hdr_crc (struct dos_header *hdr)
+{
+	return dos_crc(&(hdr->track), 0xb2, 0x30, 3); /* precomputed magic */
+}
+
+static inline ushort dos_data_crc(unsigned char *data)
+{
+	return dos_crc(data, 0xe2, 0x95 ,511); /* precomputed magic */
+}
+
+static inline unsigned char dos_decode_byte(ushort word)
+{
+	register ushort w2;
+	register unsigned char byte;
+	register unsigned char *dec = mfmdecode;
+
+	w2=word;
+	w2>>=8;
+	w2&=127;
+	byte = dec[w2];
+	byte <<= 4;
+	w2 = word & 127;
+	byte |= dec[w2];
+	return byte;
+}
+
+static unsigned long dos_decode(unsigned char *data, unsigned short *raw, int len)
+{
+	int i;
+
+	for (i = 0; i < len; i++)
+		*data++=dos_decode_byte(*raw++);
+	return ((ulong)raw);
+}
+
+#ifdef DEBUG
+static void dbg(unsigned long ptr)
+{
+	printk("raw data @%08lx: %08lx, %08lx ,%08lx, %08lx\n", ptr,
+	       ((ulong *)ptr)[0], ((ulong *)ptr)[1],
+	       ((ulong *)ptr)[2], ((ulong *)ptr)[3]);
+}
+#endif
+
+static int dos_read(int drive)
+{
+	unsigned long end;
+	unsigned long raw;
+	int scnt;
+	unsigned short crc,data_crc[2];
+	struct dos_header hdr;
+
+	drive&=3;
+	raw = (long) raw_buf;
+	end = raw + unit[drive].type->read_size;
+
+	for (scnt=0; scnt < unit[drive].dtype->sects * unit[drive].type->sect_mult; scnt++) {
+		do { /* search for the right sync of each sec-hdr */
+			if (!(raw = scan_sync (raw, end))) {
+				printk(KERN_INFO "dos_read: no hdr sync on "
+				       "track %d, unit %d for sector %d\n",
+				       unit[drive].track,drive,scnt);
+				return MFM_NOSYNC;
+			}
+#ifdef DEBUG
+			dbg(raw);
+#endif
+		} while (*((ushort *)raw)!=0x5554); /* loop usually only once done */
+		raw+=2; /* skip over headermark */
+		raw = dos_decode((unsigned char *)&hdr,(ushort *) raw,8);
+		crc = dos_hdr_crc(&hdr);
+
+#ifdef DEBUG
+		printk("(%3d,%d,%2d,%d) %x\n", hdr.track, hdr.side,
+		       hdr.sec, hdr.len_desc, hdr.crc);
+#endif
+
+		if (crc != hdr.crc) {
+			printk(KERN_INFO "dos_read: MFM_HEADER %04x,%04x\n",
+			       hdr.crc, crc);
+			return MFM_HEADER;
+		}
+		if (hdr.track != unit[drive].track/unit[drive].type->heads) {
+			printk(KERN_INFO "dos_read: MFM_TRACK %d, %d\n",
+			       hdr.track,
+			       unit[drive].track/unit[drive].type->heads);
+			return MFM_TRACK;
+		}
+
+		if (hdr.side != unit[drive].track%unit[drive].type->heads) {
+			printk(KERN_INFO "dos_read: MFM_SIDE %d, %d\n",
+			       hdr.side,
+			       unit[drive].track%unit[drive].type->heads);
+			return MFM_TRACK;
+		}
+
+		if (hdr.len_desc != 2) {
+			printk(KERN_INFO "dos_read: unknown sector len "
+			       "descriptor %d\n", hdr.len_desc);
+			return MFM_DATA;
+		}
+#ifdef DEBUG
+		printk("hdr accepted\n");
+#endif
+		if (!(raw = scan_sync (raw, end))) {
+			printk(KERN_INFO "dos_read: no data sync on track "
+			       "%d, unit %d for sector%d, disk sector %d\n",
+			       unit[drive].track, drive, scnt, hdr.sec);
+			return MFM_NOSYNC;
+		}
+#ifdef DEBUG
+		dbg(raw);
+#endif
+
+		if (*((ushort *)raw)!=0x5545) {
+			printk(KERN_INFO "dos_read: no data mark after "
+			       "sync (%d,%d,%d,%d) sc=%d\n",
+			       hdr.track,hdr.side,hdr.sec,hdr.len_desc,scnt);
+			return MFM_NOSYNC;
+		}
+
+		raw+=2;  /* skip data mark (included in checksum) */
+		raw = dos_decode((unsigned char *)(unit[drive].trackbuf + (hdr.sec - 1) * 512), (ushort *) raw, 512);
+		raw = dos_decode((unsigned char  *)data_crc,(ushort *) raw,4);
+		crc = dos_data_crc(unit[drive].trackbuf + (hdr.sec - 1) * 512);
+
+		if (crc != data_crc[0]) {
+			printk(KERN_INFO "dos_read: MFM_DATA (%d,%d,%d,%d) "
+			       "sc=%d, %x %x\n", hdr.track, hdr.side,
+			       hdr.sec, hdr.len_desc, scnt,data_crc[0], crc);
+			printk(KERN_INFO "data=(%lx,%lx,%lx,%lx,...)\n",
+			       ((ulong *)(unit[drive].trackbuf+(hdr.sec-1)*512))[0],
+			       ((ulong *)(unit[drive].trackbuf+(hdr.sec-1)*512))[1],
+			       ((ulong *)(unit[drive].trackbuf+(hdr.sec-1)*512))[2],
+			       ((ulong *)(unit[drive].trackbuf+(hdr.sec-1)*512))[3]);
+			return MFM_DATA;
+		}
+	}
+	return 0;
+}
+
+static inline ushort dos_encode_byte(unsigned char byte)
+{
+	register unsigned char *enc, b2, b1;
+	register ushort word;
+
+	enc=mfmencode;
+	b1=byte;
+	b2=b1>>4;
+	b1&=15;
+	word=enc[b2] <<8 | enc [b1];
+	return (word|((word&(256|64)) ? 0: 128));
+}
+
+static void dos_encode_block(ushort *dest, unsigned char *src, int len)
+{
+	int i;
+
+	for (i = 0; i < len; i++) {
+		*dest=dos_encode_byte(*src++);
+		*dest|=((dest[-1]&1)||(*dest&0x4000))? 0: 0x8000;
+		dest++;
+	}
+}
+
+static unsigned long *ms_putsec(int drive, unsigned long *raw, int cnt)
+{
+	static struct dos_header hdr={0,0,0,2,0,
+	  {78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78,78}};
+	int i;
+	static ushort crc[2]={0,0x4e4e};
+
+	drive&=3;
+/* id gap 1 */
+/* the MFM word before is always 9254 */
+	for(i=0;i<6;i++)
+		*raw++=0xaaaaaaaa;
+/* 3 sync + 1 headermark */
+	*raw++=0x44894489;
+	*raw++=0x44895554;
+
+/* fill in the variable parts of the header */
+	hdr.track=unit[drive].track/unit[drive].type->heads;
+	hdr.side=unit[drive].track%unit[drive].type->heads;
+	hdr.sec=cnt+1;
+	hdr.crc=dos_hdr_crc(&hdr);
+
+/* header (without "magic") and id gap 2*/
+	dos_encode_block((ushort *)raw,(unsigned char *) &hdr.track,28);
+	raw+=14;
+
+/*id gap 3 */
+	for(i=0;i<6;i++)
+		*raw++=0xaaaaaaaa;
+
+/* 3 syncs and 1 datamark */
+	*raw++=0x44894489;
+	*raw++=0x44895545;
+
+/* data */
+	dos_encode_block((ushort *)raw,
+			 (unsigned char *)unit[drive].trackbuf+cnt*512,512);
+	raw+=256;
+
+/*data crc + jd's special gap (long words :-/) */
+	crc[0]=dos_data_crc(unit[drive].trackbuf+cnt*512);
+	dos_encode_block((ushort *) raw,(unsigned char *)crc,4);
+	raw+=2;
+
+/* data gap */
+	for(i=0;i<38;i++)
+		*raw++=0x92549254;
+
+	return raw; /* wrote 652 MFM words */
+}
+
+static void dos_write(int disk)
+{
+	int cnt;
+	unsigned long raw = (unsigned long) raw_buf;
+	unsigned long *ptr=(unsigned long *)raw;
+
+	disk&=3;
+/* really gap4 + indexgap , but we write it first and round it up */
+	for (cnt=0;cnt<425;cnt++)
+		*ptr++=0x92549254;
+
+/* the following is just guessed */
+	if (unit[disk].type->sect_mult==2)  /* check for HD-Disks */
+		for(cnt=0;cnt<473;cnt++)
+			*ptr++=0x92549254;
+
+/* now the index marks...*/
+	for (cnt=0;cnt<20;cnt++)
+		*ptr++=0x92549254;
+	for (cnt=0;cnt<6;cnt++)
+		*ptr++=0xaaaaaaaa;
+	*ptr++=0x52245224;
+	*ptr++=0x52245552;
+	for (cnt=0;cnt<20;cnt++)
+		*ptr++=0x92549254;
+
+/* sectors */
+	for(cnt = 0; cnt < unit[disk].dtype->sects * unit[disk].type->sect_mult; cnt++)
+		ptr=ms_putsec(disk,ptr,cnt);
+
+	*(ushort *)ptr = 0xaaa8; /* MFM word before is always 0x9254 */
+}
+
+/*
+ * Here comes the high level stuff (i.e. the filesystem interface)
+ * and helper functions.
+ * Normally this should be the only part that has to be adapted to
+ * different kernel versions.
+ */
+
+/* FIXME: this assumes the drive is still spinning -
+ * which is only true if we complete writing a track within three seconds
+ */
+static void flush_track_callback(struct timer_list *timer)
+{
+	unsigned long nr = ((unsigned long)timer -
+			       (unsigned long)&flush_track_timer[0]) /
+					sizeof(flush_track_timer[0]);
+
+	nr&=3;
+	writefromint = 1;
+	if (!try_fdc(nr)) {
+		/* we might block in an interrupt, so try again later */
+		flush_track_timer[nr].expires = jiffies + 1;
+		add_timer(flush_track_timer + nr);
+		return;
+	}
+	get_fdc(nr);
+	(*unit[nr].dtype->write_fkt)(nr);
+	if (!raw_write(nr)) {
+		printk (KERN_NOTICE "floppy disk write protected\n");
+		writefromint = 0;
+		writepending = 0;
+	}
+	rel_fdc();
+}
+
+static int non_int_flush_track (unsigned long nr)
+{
+	unsigned long flags;
+
+	nr&=3;
+	writefromint = 0;
+	del_timer(&post_write_timer);
+	get_fdc(nr);
+	if (!fd_motor_on(nr)) {
+		writepending = 0;
+		rel_fdc();
+		return 0;
+	}
+	local_irq_save(flags);
+	if (writepending != 2) {
+		local_irq_restore(flags);
+		(*unit[nr].dtype->write_fkt)(nr);
+		if (!raw_write(nr)) {
+			printk (KERN_NOTICE "floppy disk write protected "
+				"in write!\n");
+			writepending = 0;
+			return 0;
+		}
+		wait_event(wait_fd_block, block_flag != 2);
+	}
+	else {
+		local_irq_restore(flags);
+		ms_delay(2); /* 2 ms post_write delay */
+		post_write(nr);
+	}
+	rel_fdc();
+	return 1;
+}
+
+static int get_track(int drive, int track)
+{
+	int error, errcnt;
+
+	drive&=3;
+	if (unit[drive].track == track)
+		return 0;
+	get_fdc(drive);
+	if (!fd_motor_on(drive)) {
+		rel_fdc();
+		return -1;
+	}
+
+	if (unit[drive].dirty == 1) {
+		del_timer (flush_track_timer + drive);
+		non_int_flush_track (drive);
+	}
+	errcnt = 0;
+	while (errcnt < MAX_ERRORS) {
+		if (!fd_seek(drive, track))
+			return -1;
+		raw_read(drive);
+		error = (*unit[drive].dtype->read_fkt)(drive);
+		if (error == 0) {
+			rel_fdc();
+			return 0;
+		}
+		/* Read Error Handling: recalibrate and try again */
+		unit[drive].track = -1;
+		errcnt++;
+	}
+	rel_fdc();
+	return -1;
+}
+
+static blk_status_t amiflop_rw_cur_segment(struct amiga_floppy_struct *floppy,
+					   struct request *rq)
+{
+	int drive = floppy - unit;
+	unsigned int cnt, block, track, sector;
+	char *data;
+
+	for (cnt = 0; cnt < blk_rq_cur_sectors(rq); cnt++) {
+#ifdef DEBUG
+		printk("fd: sector %ld + %d requested for %s\n",
+		       blk_rq_pos(rq), cnt,
+		       (rq_data_dir(rq) == READ) ? "read" : "write");
+#endif
+		block = blk_rq_pos(rq) + cnt;
+		track = block / (floppy->dtype->sects * floppy->type->sect_mult);
+		sector = block % (floppy->dtype->sects * floppy->type->sect_mult);
+		data = bio_data(rq->bio) + 512 * cnt;
+#ifdef DEBUG
+		printk("access to track %d, sector %d, with buffer at "
+		       "0x%08lx\n", track, sector, data);
+#endif
+
+		if (get_track(drive, track) == -1)
+			return BLK_STS_IOERR;
+
+		if (rq_data_dir(rq) == READ) {
+			memcpy(data, floppy->trackbuf + sector * 512, 512);
+		} else {
+			memcpy(floppy->trackbuf + sector * 512, data, 512);
+
+			/* keep the drive spinning while writes are scheduled */
+			if (!fd_motor_on(drive))
+				return BLK_STS_IOERR;
+			/*
+			 * setup a callback to write the track buffer
+			 * after a short (1 tick) delay.
+			 */
+			floppy->dirty = 1;
+		        /* reset the timer */
+			mod_timer (flush_track_timer + drive, jiffies + 1);
+		}
+	}
+
+	return BLK_STS_OK;
+}
+
+static blk_status_t amiflop_queue_rq(struct blk_mq_hw_ctx *hctx,
+				     const struct blk_mq_queue_data *bd)
+{
+	struct request *rq = bd->rq;
+	struct amiga_floppy_struct *floppy = rq->q->disk->private_data;
+	blk_status_t err;
+
+	if (!spin_trylock_irq(&amiflop_lock))
+		return BLK_STS_DEV_RESOURCE;
+
+	blk_mq_start_request(rq);
+
+	do {
+		err = amiflop_rw_cur_segment(floppy, rq);
+	} while (blk_update_request(rq, err, blk_rq_cur_bytes(rq)));
+	blk_mq_end_request(rq, err);
+
+	spin_unlock_irq(&amiflop_lock);
+	return BLK_STS_OK;
+}
+
+static int fd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+	int drive = MINOR(bdev->bd_dev) & 3;
+
+	geo->heads = unit[drive].type->heads;
+	geo->sectors = unit[drive].dtype->sects * unit[drive].type->sect_mult;
+	geo->cylinders = unit[drive].type->tracks;
+	return 0;
+}
+
+static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode,
+		    unsigned int cmd, unsigned long param)
+{
+	struct amiga_floppy_struct *p = bdev->bd_disk->private_data;
+	int drive = p - unit;
+	static struct floppy_struct getprm;
+	void __user *argp = (void __user *)param;
+
+	switch(cmd){
+	case FDFMTBEG:
+		get_fdc(drive);
+		if (fd_ref[drive] > 1) {
+			rel_fdc();
+			return -EBUSY;
+		}
+		fsync_bdev(bdev);
+		if (fd_motor_on(drive) == 0) {
+			rel_fdc();
+			return -ENODEV;
+		}
+		if (fd_calibrate(drive) == 0) {
+			rel_fdc();
+			return -ENXIO;
+		}
+		floppy_off(drive);
+		rel_fdc();
+		break;
+	case FDFMTTRK:
+		if (param < p->type->tracks * p->type->heads)
+		{
+			get_fdc(drive);
+			if (fd_seek(drive,param) != 0){
+				memset(p->trackbuf, FD_FILL_BYTE,
+				       p->dtype->sects * p->type->sect_mult * 512);
+				non_int_flush_track(drive);
+			}
+			floppy_off(drive);
+			rel_fdc();
+		}
+		else
+			return -EINVAL;
+		break;
+	case FDFMTEND:
+		floppy_off(drive);
+		invalidate_bdev(bdev);
+		break;
+	case FDGETPRM:
+		memset((void *)&getprm, 0, sizeof (getprm));
+		getprm.track=p->type->tracks;
+		getprm.head=p->type->heads;
+		getprm.sect=p->dtype->sects * p->type->sect_mult;
+		getprm.size=p->blocks;
+		if (copy_to_user(argp, &getprm, sizeof(struct floppy_struct)))
+			return -EFAULT;
+		break;
+	case FDSETPRM:
+	case FDDEFPRM:
+		return -EINVAL;
+	case FDFLUSH: /* unconditionally, even if not needed */
+		del_timer (flush_track_timer + drive);
+		non_int_flush_track(drive);
+		break;
+#ifdef RAW_IOCTL
+	case IOCTL_RAW_TRACK:
+		if (copy_to_user(argp, raw_buf, p->type->read_size))
+			return -EFAULT;
+		else
+			return p->type->read_size;
+#endif
+	default:
+		return -ENOSYS;
+	}
+	return 0;
+}
+
+static int fd_ioctl(struct block_device *bdev, fmode_t mode,
+			     unsigned int cmd, unsigned long param)
+{
+	int ret;
+
+	mutex_lock(&amiflop_mutex);
+	ret = fd_locked_ioctl(bdev, mode, cmd, param);
+	mutex_unlock(&amiflop_mutex);
+
+	return ret;
+}
+
+static void fd_probe(int dev)
+{
+	unsigned long code;
+	int type;
+	int drive;
+
+	drive = dev & 3;
+	code = fd_get_drive_id(drive);
+
+	/* get drive type */
+	for (type = 0; type < num_dr_types; type++)
+		if (drive_types[type].code == code)
+			break;
+
+	if (type >= num_dr_types) {
+		printk(KERN_WARNING "fd_probe: unsupported drive type "
+		       "%08lx found\n", code);
+		unit[drive].type = &drive_types[num_dr_types-1]; /* FD_NODRIVE */
+		return;
+	}
+
+	unit[drive].type = drive_types + type;
+	unit[drive].track = -1;
+
+	unit[drive].disk = -1;
+	unit[drive].motor = 0;
+	unit[drive].busy = 0;
+	unit[drive].status = -1;
+}
+
+/*
+ * floppy_open check for aliasing (/dev/fd0 can be the same as
+ * /dev/PS0 etc), and disallows simultaneous access to the same
+ * drive with different device numbers.
+ */
+static int floppy_open(struct block_device *bdev, fmode_t mode)
+{
+	int drive = MINOR(bdev->bd_dev) & 3;
+	int system =  (MINOR(bdev->bd_dev) & 4) >> 2;
+	int old_dev;
+	unsigned long flags;
+
+	mutex_lock(&amiflop_mutex);
+	old_dev = fd_device[drive];
+
+	if (fd_ref[drive] && old_dev != system) {
+		mutex_unlock(&amiflop_mutex);
+		return -EBUSY;
+	}
+
+	if (unit[drive].type->code == FD_NODRIVE) {
+		mutex_unlock(&amiflop_mutex);
+		return -ENXIO;
+	}
+
+	if (mode & (FMODE_READ|FMODE_WRITE)) {
+		bdev_check_media_change(bdev);
+		if (mode & FMODE_WRITE) {
+			int wrprot;
+
+			get_fdc(drive);
+			fd_select (drive);
+			wrprot = !(ciaa.pra & DSKPROT);
+			fd_deselect (drive);
+			rel_fdc();
+
+			if (wrprot) {
+				mutex_unlock(&amiflop_mutex);
+				return -EROFS;
+			}
+		}
+	}
+
+	local_irq_save(flags);
+	fd_ref[drive]++;
+	fd_device[drive] = system;
+	local_irq_restore(flags);
+
+	unit[drive].dtype=&data_types[system];
+	unit[drive].blocks=unit[drive].type->heads*unit[drive].type->tracks*
+		data_types[system].sects*unit[drive].type->sect_mult;
+	set_capacity(unit[drive].gendisk[system], unit[drive].blocks);
+
+	printk(KERN_INFO "fd%d: accessing %s-disk with %s-layout\n",drive,
+	       unit[drive].type->name, data_types[system].name);
+
+	mutex_unlock(&amiflop_mutex);
+	return 0;
+}
+
+static void floppy_release(struct gendisk *disk, fmode_t mode)
+{
+	struct amiga_floppy_struct *p = disk->private_data;
+	int drive = p - unit;
+
+	mutex_lock(&amiflop_mutex);
+	if (unit[drive].dirty == 1) {
+		del_timer (flush_track_timer + drive);
+		non_int_flush_track (drive);
+	}
+  
+	if (!fd_ref[drive]--) {
+		printk(KERN_CRIT "floppy_release with fd_ref == 0");
+		fd_ref[drive] = 0;
+	}
+#ifdef MODULE
+	floppy_off (drive);
+#endif
+	mutex_unlock(&amiflop_mutex);
+}
+
+/*
+ * check_events is never called from an interrupt, so we can relax a bit
+ * here, sleep etc. Note that floppy-on tries to set current_DOR to point
+ * to the desired drive, but it will probably not survive the sleep if
+ * several floppies are used at the same time: thus the loop.
+ */
+static unsigned amiga_check_events(struct gendisk *disk, unsigned int clearing)
+{
+	struct amiga_floppy_struct *p = disk->private_data;
+	int drive = p - unit;
+	int changed;
+	static int first_time = 1;
+
+	if (first_time)
+		changed = first_time--;
+	else {
+		get_fdc(drive);
+		fd_select (drive);
+		changed = !(ciaa.pra & DSKCHANGE);
+		fd_deselect (drive);
+		rel_fdc();
+	}
+
+	if (changed) {
+		fd_probe(drive);
+		p->track = -1;
+		p->dirty = 0;
+		writepending = 0; /* if this was true before, too bad! */
+		writefromint = 0;
+		return DISK_EVENT_MEDIA_CHANGE;
+	}
+	return 0;
+}
+
+static const struct block_device_operations floppy_fops = {
+	.owner		= THIS_MODULE,
+	.open		= floppy_open,
+	.release	= floppy_release,
+	.ioctl		= fd_ioctl,
+	.getgeo		= fd_getgeo,
+	.check_events	= amiga_check_events,
+};
+
+static const struct blk_mq_ops amiflop_mq_ops = {
+	.queue_rq = amiflop_queue_rq,
+};
+
+static int fd_alloc_disk(int drive, int system)
+{
+	struct gendisk *disk;
+	int err;
+
+	disk = blk_mq_alloc_disk(&unit[drive].tag_set, NULL);
+	if (IS_ERR(disk))
+		return PTR_ERR(disk);
+
+	disk->major = FLOPPY_MAJOR;
+	disk->first_minor = drive + system;
+	disk->minors = 1;
+	disk->fops = &floppy_fops;
+	disk->flags |= GENHD_FL_NO_PART;
+	disk->events = DISK_EVENT_MEDIA_CHANGE;
+	if (system)
+		sprintf(disk->disk_name, "fd%d_msdos", drive);
+	else
+		sprintf(disk->disk_name, "fd%d", drive);
+	disk->private_data = &unit[drive];
+	set_capacity(disk, 880 * 2);
+
+	unit[drive].gendisk[system] = disk;
+	err = add_disk(disk);
+	if (err)
+		put_disk(disk);
+	return err;
+}
+
+static int fd_alloc_drive(int drive)
+{
+	unit[drive].trackbuf = kmalloc(FLOPPY_MAX_SECTORS * 512, GFP_KERNEL);
+	if (!unit[drive].trackbuf)
+		goto out;
+
+	memset(&unit[drive].tag_set, 0, sizeof(unit[drive].tag_set));
+	unit[drive].tag_set.ops = &amiflop_mq_ops;
+	unit[drive].tag_set.nr_hw_queues = 1;
+	unit[drive].tag_set.nr_maps = 1;
+	unit[drive].tag_set.queue_depth = 2;
+	unit[drive].tag_set.numa_node = NUMA_NO_NODE;
+	unit[drive].tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
+	if (blk_mq_alloc_tag_set(&unit[drive].tag_set))
+		goto out_cleanup_trackbuf;
+
+	pr_cont(" fd%d", drive);
+
+	if (fd_alloc_disk(drive, 0) || fd_alloc_disk(drive, 1))
+		goto out_cleanup_tagset;
+	return 0;
+
+out_cleanup_tagset:
+	blk_mq_free_tag_set(&unit[drive].tag_set);
+out_cleanup_trackbuf:
+	kfree(unit[drive].trackbuf);
+out:
+	unit[drive].type->code = FD_NODRIVE;
+	return -ENOMEM;
+}
+
+static int __init fd_probe_drives(void)
+{
+	int drive,drives,nomem;
+
+	pr_info("FD: probing units\nfound");
+	drives=0;
+	nomem=0;
+	for(drive=0;drive<FD_MAX_UNITS;drive++) {
+		fd_probe(drive);
+		if (unit[drive].type->code == FD_NODRIVE)
+			continue;
+
+		if (fd_alloc_drive(drive) < 0) {
+			pr_cont(" no mem for fd%d", drive);
+			nomem = 1;
+			continue;
+		}
+		drives++;
+	}
+	if ((drives > 0) || (nomem == 0)) {
+		if (drives == 0)
+			pr_cont(" no drives");
+		pr_cont("\n");
+		return drives;
+	}
+	pr_cont("\n");
+	return -ENOMEM;
+}
+ 
+static int __init amiga_floppy_probe(struct platform_device *pdev)
+{
+	int i, ret;
+
+	if (register_blkdev(FLOPPY_MAJOR,"fd"))
+		return -EBUSY;
+
+	ret = -ENOMEM;
+	raw_buf = amiga_chip_alloc(RAW_BUF_SIZE, "Floppy");
+	if (!raw_buf) {
+		printk("fd: cannot get chip mem buffer\n");
+		goto out_blkdev;
+	}
+
+	ret = -EBUSY;
+	if (request_irq(IRQ_AMIGA_DSKBLK, fd_block_done, 0, "floppy_dma", NULL)) {
+		printk("fd: cannot get irq for dma\n");
+		goto out_irq;
+	}
+
+	if (request_irq(IRQ_AMIGA_CIAA_TB, ms_isr, 0, "floppy_timer", NULL)) {
+		printk("fd: cannot get irq for timer\n");
+		goto out_irq2;
+	}
+
+	ret = -ENODEV;
+	if (fd_probe_drives() < 1) /* No usable drives */
+		goto out_probe;
+
+	/* initialize variables */
+	timer_setup(&motor_on_timer, motor_on_callback, 0);
+	motor_on_timer.expires = 0;
+	for (i = 0; i < FD_MAX_UNITS; i++) {
+		timer_setup(&motor_off_timer[i], fd_motor_off, 0);
+		motor_off_timer[i].expires = 0;
+		timer_setup(&flush_track_timer[i], flush_track_callback, 0);
+		flush_track_timer[i].expires = 0;
+
+		unit[i].track = -1;
+	}
+
+	timer_setup(&post_write_timer, post_write_callback, 0);
+	post_write_timer.expires = 0;
+  
+	for (i = 0; i < 128; i++)
+		mfmdecode[i]=255;
+	for (i = 0; i < 16; i++)
+		mfmdecode[mfmencode[i]]=i;
+
+	/* make sure that disk DMA is enabled */
+	custom.dmacon = DMAF_SETCLR | DMAF_DISK;
+
+	/* init ms timer */
+	ciaa.crb = 8; /* one-shot, stop */
+	return 0;
+
+out_probe:
+	free_irq(IRQ_AMIGA_CIAA_TB, NULL);
+out_irq2:
+	free_irq(IRQ_AMIGA_DSKBLK, NULL);
+out_irq:
+	amiga_chip_free(raw_buf);
+out_blkdev:
+	unregister_blkdev(FLOPPY_MAJOR,"fd");
+	return ret;
+}
+
+static struct platform_driver amiga_floppy_driver = {
+	.driver   = {
+		.name	= "amiga-floppy",
+	},
+};
+
+static int __init amiga_floppy_init(void)
+{
+	return platform_driver_probe(&amiga_floppy_driver, amiga_floppy_probe);
+}
+
+module_init(amiga_floppy_init);
+
+#ifndef MODULE
+static int __init amiga_floppy_setup (char *str)
+{
+	int n;
+	if (!MACH_IS_AMIGA)
+		return 0;
+	if (!get_option(&str, &n))
+		return 0;
+	printk (KERN_INFO "amiflop: Setting default df0 to %x\n", n);
+	fd_def_df0 = n;
+	return 1;
+}
+
+__setup("floppy=", amiga_floppy_setup);
+#endif
+
+MODULE_ALIAS("platform:amiga-floppy");
diff --git a/drivers/block/aoe/Makefile b/drivers/block/aoe/Makefile
new file mode 100644
index 000000000..b7545ce2f
--- /dev/null
+++ b/drivers/block/aoe/Makefile
@@ -0,0 +1,7 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Makefile for ATA over Ethernet
+#
+
+obj-$(CONFIG_ATA_OVER_ETH)	+= aoe.o
+aoe-y := aoeblk.o aoechr.o aoecmd.o aoedev.o aoemain.o aoenet.o
diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h
new file mode 100644
index 000000000..749ae1246
--- /dev/null
+++ b/drivers/block/aoe/aoe.h
@@ -0,0 +1,248 @@
+/* Copyright (c) 2013 Coraid, Inc.  See COPYING for GPL terms. */
+#include <linux/blk-mq.h>
+
+#define VERSION "85"
+#define AOE_MAJOR 152
+#define DEVICE_NAME "aoe"
+
+/* set AOE_PARTITIONS to 1 to use whole-disks only
+ * default is 16, which is 15 partitions plus the whole disk
+ */
+#ifndef AOE_PARTITIONS
+#define AOE_PARTITIONS (16)
+#endif
+
+#define WHITESPACE " \t\v\f\n,"
+
+enum {
+	AOECMD_ATA,
+	AOECMD_CFG,
+	AOECMD_VEND_MIN = 0xf0,
+
+	AOEFL_RSP = (1<<3),
+	AOEFL_ERR = (1<<2),
+
+	AOEAFL_EXT = (1<<6),
+	AOEAFL_DEV = (1<<4),
+	AOEAFL_ASYNC = (1<<1),
+	AOEAFL_WRITE = (1<<0),
+
+	AOECCMD_READ = 0,
+	AOECCMD_TEST,
+	AOECCMD_PTEST,
+	AOECCMD_SET,
+	AOECCMD_FSET,
+
+	AOE_HVER = 0x10,
+};
+
+struct aoe_hdr {
+	unsigned char dst[6];
+	unsigned char src[6];
+	__be16 type;
+	unsigned char verfl;
+	unsigned char err;
+	__be16 major;
+	unsigned char minor;
+	unsigned char cmd;
+	__be32 tag;
+};
+
+struct aoe_atahdr {
+	unsigned char aflags;
+	unsigned char errfeat;
+	unsigned char scnt;
+	unsigned char cmdstat;
+	unsigned char lba0;
+	unsigned char lba1;
+	unsigned char lba2;
+	unsigned char lba3;
+	unsigned char lba4;
+	unsigned char lba5;
+	unsigned char res[2];
+};
+
+struct aoe_cfghdr {
+	__be16 bufcnt;
+	__be16 fwver;
+	unsigned char scnt;
+	unsigned char aoeccmd;
+	unsigned char cslen[2];
+};
+
+enum {
+	DEVFL_UP = 1,	/* device is installed in system and ready for AoE->ATA commands */
+	DEVFL_TKILL = (1<<1),	/* flag for timer to know when to kill self */
+	DEVFL_EXT = (1<<2),	/* device accepts lba48 commands */
+	DEVFL_GDALLOC = (1<<3),	/* need to alloc gendisk */
+	DEVFL_GD_NOW = (1<<4),	/* allocating gendisk */
+	DEVFL_KICKME = (1<<5),	/* slow polling network card catch */
+	DEVFL_NEWSIZE = (1<<6),	/* need to update dev size in block layer */
+	DEVFL_FREEING = (1<<7),	/* set when device is being cleaned up */
+	DEVFL_FREED = (1<<8),	/* device has been cleaned up */
+};
+
+enum {
+	DEFAULTBCNT = 2 * 512,	/* 2 sectors */
+	MIN_BUFS = 16,
+	NTARGETS = 4,
+	NAOEIFS = 8,
+	NSKBPOOLMAX = 256,
+	NFACTIVE = 61,
+
+	TIMERTICK = HZ / 10,
+	RTTSCALE = 8,
+	RTTDSCALE = 3,
+	RTTAVG_INIT = USEC_PER_SEC / 4 << RTTSCALE,
+	RTTDEV_INIT = RTTAVG_INIT / 4,
+
+	HARD_SCORN_SECS = 10,	/* try another remote port after this */
+	MAX_TAINT = 1000,	/* cap on aoetgt taint */
+};
+
+struct aoe_req {
+	unsigned long nr_bios;
+};
+
+struct buf {
+	ulong nframesout;
+	struct bio *bio;
+	struct bvec_iter iter;
+	struct request *rq;
+};
+
+enum frame_flags {
+	FFL_PROBE = 1,
+};
+
+struct frame {
+	struct list_head head;
+	u32 tag;
+	ktime_t sent;			/* high-res time packet was sent */
+	ulong waited;
+	ulong waited_total;
+	struct aoetgt *t;		/* parent target I belong to */
+	struct sk_buff *skb;		/* command skb freed on module exit */
+	struct sk_buff *r_skb;		/* response skb for async processing */
+	struct buf *buf;
+	struct bvec_iter iter;
+	char flags;
+};
+
+struct aoeif {
+	struct net_device *nd;
+	ulong lost;
+	int bcnt;
+};
+
+struct aoetgt {
+	unsigned char addr[6];
+	ushort nframes;		/* cap on frames to use */
+	struct aoedev *d;			/* parent device I belong to */
+	struct list_head ffree;			/* list of free frames */
+	struct aoeif ifs[NAOEIFS];
+	struct aoeif *ifp;	/* current aoeif in use */
+	ushort nout;		/* number of AoE commands outstanding */
+	ushort maxout;		/* current value for max outstanding */
+	ushort next_cwnd;	/* incr maxout after decrementing to zero */
+	ushort ssthresh;	/* slow start threshold */
+	ulong falloc;		/* number of allocated frames */
+	int taint;		/* how much we want to avoid this aoetgt */
+	int minbcnt;
+	int wpkts, rpkts;
+	char nout_probes;
+};
+
+struct aoedev {
+	struct aoedev *next;
+	ulong sysminor;
+	ulong aoemajor;
+	u32 rttavg;		/* scaled AoE round trip time average */
+	u32 rttdev;		/* scaled round trip time mean deviation */
+	u16 aoeminor;
+	u16 flags;
+	u16 nopen;		/* (bd_openers isn't available without sleeping) */
+	u16 fw_ver;		/* version of blade's firmware */
+	u16 lasttag;		/* last tag sent */
+	u16 useme;
+	ulong ref;
+	struct work_struct work;/* disk create work struct */
+	struct gendisk *gd;
+	struct dentry *debugfs;
+	struct request_queue *blkq;
+	struct list_head rq_list;
+	struct blk_mq_tag_set tag_set;
+	struct hd_geometry geo;
+	sector_t ssize;
+	struct timer_list timer;
+	spinlock_t lock;
+	struct sk_buff_head skbpool;
+	mempool_t *bufpool;	/* for deadlock-free Buf allocation */
+	struct {		/* pointers to work in progress */
+		struct buf *buf;
+		struct bio *nxbio;
+		struct request *rq;
+	} ip;
+	ulong maxbcnt;
+	struct list_head factive[NFACTIVE];	/* hash of active frames */
+	struct list_head rexmitq; /* deferred retransmissions */
+	struct aoetgt **targets;
+	ulong ntargets;		/* number of allocated aoetgt pointers */
+	struct aoetgt **tgt;	/* target in use when working */
+	ulong kicked;
+	char ident[512];
+};
+
+/* kthread tracking */
+struct ktstate {
+	struct completion rendez;
+	struct task_struct *task;
+	wait_queue_head_t *waitq;
+	int (*fn) (int);
+	char name[12];
+	spinlock_t *lock;
+	int id;
+	int active;
+};
+
+int aoeblk_init(void);
+void aoeblk_exit(void);
+void aoeblk_gdalloc(void *);
+void aoedisk_rm_debugfs(struct aoedev *d);
+
+int aoechr_init(void);
+void aoechr_exit(void);
+void aoechr_error(char *);
+
+void aoecmd_work(struct aoedev *d);
+void aoecmd_cfg(ushort aoemajor, unsigned char aoeminor);
+struct sk_buff *aoecmd_ata_rsp(struct sk_buff *);
+void aoecmd_cfg_rsp(struct sk_buff *);
+void aoecmd_sleepwork(struct work_struct *);
+void aoecmd_wreset(struct aoetgt *t);
+void aoecmd_cleanslate(struct aoedev *);
+void aoecmd_exit(void);
+int aoecmd_init(void);
+struct sk_buff *aoecmd_ata_id(struct aoedev *);
+void aoe_freetframe(struct frame *);
+void aoe_flush_iocq(void);
+void aoe_flush_iocq_by_index(int);
+void aoe_end_request(struct aoedev *, struct request *, int);
+int aoe_ktstart(struct ktstate *k);
+void aoe_ktstop(struct ktstate *k);
+
+int aoedev_init(void);
+void aoedev_exit(void);
+struct aoedev *aoedev_by_aoeaddr(ulong maj, int min, int do_alloc);
+void aoedev_downdev(struct aoedev *d);
+int aoedev_flush(const char __user *str, size_t size);
+void aoe_failbuf(struct aoedev *, struct buf *);
+void aoedev_put(struct aoedev *);
+
+int aoenet_init(void);
+void aoenet_exit(void);
+void aoenet_xmit(struct sk_buff_head *);
+int is_aoe_netif(struct net_device *ifp);
+int set_aoe_iflist(const char __user *str, size_t size);
+
+extern struct workqueue_struct *aoe_wq;
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
new file mode 100644
index 000000000..128722cf6
--- /dev/null
+++ b/drivers/block/aoe/aoeblk.c
@@ -0,0 +1,450 @@
+/* Copyright (c) 2013 Coraid, Inc.  See COPYING for GPL terms. */
+/*
+ * aoeblk.c
+ * block device routines
+ */
+
+#include <linux/kernel.h>
+#include <linux/hdreg.h>
+#include <linux/blk-mq.h>
+#include <linux/backing-dev.h>
+#include <linux/fs.h>
+#include <linux/ioctl.h>
+#include <linux/slab.h>
+#include <linux/ratelimit.h>
+#include <linux/netdevice.h>
+#include <linux/mutex.h>
+#include <linux/export.h>
+#include <linux/moduleparam.h>
+#include <linux/debugfs.h>
+#include <scsi/sg.h>
+#include "aoe.h"
+
+static DEFINE_MUTEX(aoeblk_mutex);
+static struct kmem_cache *buf_pool_cache;
+static struct dentry *aoe_debugfs_dir;
+
+/* GPFS needs a larger value than the default. */
+static int aoe_maxsectors;
+module_param(aoe_maxsectors, int, 0644);
+MODULE_PARM_DESC(aoe_maxsectors,
+	"When nonzero, set the maximum number of sectors per I/O request");
+
+static ssize_t aoedisk_show_state(struct device *dev,
+				  struct device_attribute *attr, char *page)
+{
+	struct gendisk *disk = dev_to_disk(dev);
+	struct aoedev *d = disk->private_data;
+
+	return sysfs_emit(page, "%s%s\n",
+			(d->flags & DEVFL_UP) ? "up" : "down",
+			(d->flags & DEVFL_KICKME) ? ",kickme" :
+			(d->nopen && !(d->flags & DEVFL_UP)) ? ",closewait" : "");
+	/* I'd rather see nopen exported so we can ditch closewait */
+}
+static ssize_t aoedisk_show_mac(struct device *dev,
+				struct device_attribute *attr, char *page)
+{
+	struct gendisk *disk = dev_to_disk(dev);
+	struct aoedev *d = disk->private_data;
+	struct aoetgt *t = d->targets[0];
+
+	if (t == NULL)
+		return sysfs_emit(page, "none\n");
+	return sysfs_emit(page, "%pm\n", t->addr);
+}
+static ssize_t aoedisk_show_netif(struct device *dev,
+				  struct device_attribute *attr, char *page)
+{
+	struct gendisk *disk = dev_to_disk(dev);
+	struct aoedev *d = disk->private_data;
+	struct net_device *nds[8], **nd, **nnd, **ne;
+	struct aoetgt **t, **te;
+	struct aoeif *ifp, *e;
+	char *p;
+
+	memset(nds, 0, sizeof nds);
+	nd = nds;
+	ne = nd + ARRAY_SIZE(nds);
+	t = d->targets;
+	te = t + d->ntargets;
+	for (; t < te && *t; t++) {
+		ifp = (*t)->ifs;
+		e = ifp + NAOEIFS;
+		for (; ifp < e && ifp->nd; ifp++) {
+			for (nnd = nds; nnd < nd; nnd++)
+				if (*nnd == ifp->nd)
+					break;
+			if (nnd == nd && nd != ne)
+				*nd++ = ifp->nd;
+		}
+	}
+
+	ne = nd;
+	nd = nds;
+	if (*nd == NULL)
+		return sysfs_emit(page, "none\n");
+	for (p = page; nd < ne; nd++)
+		p += scnprintf(p, PAGE_SIZE - (p-page), "%s%s",
+			p == page ? "" : ",", (*nd)->name);
+	p += scnprintf(p, PAGE_SIZE - (p-page), "\n");
+	return p-page;
+}
+/* firmware version */
+static ssize_t aoedisk_show_fwver(struct device *dev,
+				  struct device_attribute *attr, char *page)
+{
+	struct gendisk *disk = dev_to_disk(dev);
+	struct aoedev *d = disk->private_data;
+
+	return sysfs_emit(page, "0x%04x\n", (unsigned int) d->fw_ver);
+}
+static ssize_t aoedisk_show_payload(struct device *dev,
+				    struct device_attribute *attr, char *page)
+{
+	struct gendisk *disk = dev_to_disk(dev);
+	struct aoedev *d = disk->private_data;
+
+	return sysfs_emit(page, "%lu\n", d->maxbcnt);
+}
+
+static int aoe_debugfs_show(struct seq_file *s, void *ignored)
+{
+	struct aoedev *d;
+	struct aoetgt **t, **te;
+	struct aoeif *ifp, *ife;
+	unsigned long flags;
+	char c;
+
+	d = s->private;
+	seq_printf(s, "rttavg: %d rttdev: %d\n",
+		d->rttavg >> RTTSCALE,
+		d->rttdev >> RTTDSCALE);
+	seq_printf(s, "nskbpool: %d\n", skb_queue_len(&d->skbpool));
+	seq_printf(s, "kicked: %ld\n", d->kicked);
+	seq_printf(s, "maxbcnt: %ld\n", d->maxbcnt);
+	seq_printf(s, "ref: %ld\n", d->ref);
+
+	spin_lock_irqsave(&d->lock, flags);
+	t = d->targets;
+	te = t + d->ntargets;
+	for (; t < te && *t; t++) {
+		c = '\t';
+		seq_printf(s, "falloc: %ld\n", (*t)->falloc);
+		seq_printf(s, "ffree: %p\n",
+			list_empty(&(*t)->ffree) ? NULL : (*t)->ffree.next);
+		seq_printf(s, "%pm:%d:%d:%d\n", (*t)->addr, (*t)->nout,
+			(*t)->maxout, (*t)->nframes);
+		seq_printf(s, "\tssthresh:%d\n", (*t)->ssthresh);
+		seq_printf(s, "\ttaint:%d\n", (*t)->taint);
+		seq_printf(s, "\tr:%d\n", (*t)->rpkts);
+		seq_printf(s, "\tw:%d\n", (*t)->wpkts);
+		ifp = (*t)->ifs;
+		ife = ifp + ARRAY_SIZE((*t)->ifs);
+		for (; ifp->nd && ifp < ife; ifp++) {
+			seq_printf(s, "%c%s", c, ifp->nd->name);
+			c = ',';
+		}
+		seq_puts(s, "\n");
+	}
+	spin_unlock_irqrestore(&d->lock, flags);
+
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(aoe_debugfs);
+
+static DEVICE_ATTR(state, 0444, aoedisk_show_state, NULL);
+static DEVICE_ATTR(mac, 0444, aoedisk_show_mac, NULL);
+static DEVICE_ATTR(netif, 0444, aoedisk_show_netif, NULL);
+static struct device_attribute dev_attr_firmware_version = {
+	.attr = { .name = "firmware-version", .mode = 0444 },
+	.show = aoedisk_show_fwver,
+};
+static DEVICE_ATTR(payload, 0444, aoedisk_show_payload, NULL);
+
+static struct attribute *aoe_attrs[] = {
+	&dev_attr_state.attr,
+	&dev_attr_mac.attr,
+	&dev_attr_netif.attr,
+	&dev_attr_firmware_version.attr,
+	&dev_attr_payload.attr,
+	NULL,
+};
+
+static const struct attribute_group aoe_attr_group = {
+	.attrs = aoe_attrs,
+};
+
+static const struct attribute_group *aoe_attr_groups[] = {
+	&aoe_attr_group,
+	NULL,
+};
+
+static void
+aoedisk_add_debugfs(struct aoedev *d)
+{
+	char *p;
+
+	if (aoe_debugfs_dir == NULL)
+		return;
+	p = strchr(d->gd->disk_name, '/');
+	if (p == NULL)
+		p = d->gd->disk_name;
+	else
+		p++;
+	BUG_ON(*p == '\0');
+	d->debugfs = debugfs_create_file(p, 0444, aoe_debugfs_dir, d,
+					 &aoe_debugfs_fops);
+}
+void
+aoedisk_rm_debugfs(struct aoedev *d)
+{
+	debugfs_remove(d->debugfs);
+	d->debugfs = NULL;
+}
+
+static int
+aoeblk_open(struct block_device *bdev, fmode_t mode)
+{
+	struct aoedev *d = bdev->bd_disk->private_data;
+	ulong flags;
+
+	if (!virt_addr_valid(d)) {
+		pr_crit("aoe: invalid device pointer in %s\n",
+			__func__);
+		WARN_ON(1);
+		return -ENODEV;
+	}
+	if (!(d->flags & DEVFL_UP) || d->flags & DEVFL_TKILL)
+		return -ENODEV;
+
+	mutex_lock(&aoeblk_mutex);
+	spin_lock_irqsave(&d->lock, flags);
+	if (d->flags & DEVFL_UP && !(d->flags & DEVFL_TKILL)) {
+		d->nopen++;
+		spin_unlock_irqrestore(&d->lock, flags);
+		mutex_unlock(&aoeblk_mutex);
+		return 0;
+	}
+	spin_unlock_irqrestore(&d->lock, flags);
+	mutex_unlock(&aoeblk_mutex);
+	return -ENODEV;
+}
+
+static void
+aoeblk_release(struct gendisk *disk, fmode_t mode)
+{
+	struct aoedev *d = disk->private_data;
+	ulong flags;
+
+	spin_lock_irqsave(&d->lock, flags);
+
+	if (--d->nopen == 0) {
+		spin_unlock_irqrestore(&d->lock, flags);
+		aoecmd_cfg(d->aoemajor, d->aoeminor);
+		return;
+	}
+	spin_unlock_irqrestore(&d->lock, flags);
+}
+
+static blk_status_t aoeblk_queue_rq(struct blk_mq_hw_ctx *hctx,
+				    const struct blk_mq_queue_data *bd)
+{
+	struct aoedev *d = hctx->queue->queuedata;
+
+	spin_lock_irq(&d->lock);
+
+	if ((d->flags & DEVFL_UP) == 0) {
+		pr_info_ratelimited("aoe: device %ld.%d is not up\n",
+			d->aoemajor, d->aoeminor);
+		spin_unlock_irq(&d->lock);
+		blk_mq_start_request(bd->rq);
+		return BLK_STS_IOERR;
+	}
+
+	list_add_tail(&bd->rq->queuelist, &d->rq_list);
+	aoecmd_work(d);
+	spin_unlock_irq(&d->lock);
+	return BLK_STS_OK;
+}
+
+static int
+aoeblk_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+	struct aoedev *d = bdev->bd_disk->private_data;
+
+	if ((d->flags & DEVFL_UP) == 0) {
+		printk(KERN_ERR "aoe: disk not up\n");
+		return -ENODEV;
+	}
+
+	geo->cylinders = d->geo.cylinders;
+	geo->heads = d->geo.heads;
+	geo->sectors = d->geo.sectors;
+	return 0;
+}
+
+static int
+aoeblk_ioctl(struct block_device *bdev, fmode_t mode, uint cmd, ulong arg)
+{
+	struct aoedev *d;
+
+	if (!arg)
+		return -EINVAL;
+
+	d = bdev->bd_disk->private_data;
+	if ((d->flags & DEVFL_UP) == 0) {
+		pr_err("aoe: disk not up\n");
+		return -ENODEV;
+	}
+
+	if (cmd == HDIO_GET_IDENTITY) {
+		if (!copy_to_user((void __user *) arg, &d->ident,
+			sizeof(d->ident)))
+			return 0;
+		return -EFAULT;
+	}
+
+	/* udev calls scsi_id, which uses SG_IO, resulting in noise */
+	if (cmd != SG_IO)
+		pr_info("aoe: unknown ioctl 0x%x\n", cmd);
+
+	return -ENOTTY;
+}
+
+static const struct block_device_operations aoe_bdops = {
+	.open = aoeblk_open,
+	.release = aoeblk_release,
+	.ioctl = aoeblk_ioctl,
+	.compat_ioctl = blkdev_compat_ptr_ioctl,
+	.getgeo = aoeblk_getgeo,
+	.owner = THIS_MODULE,
+};
+
+static const struct blk_mq_ops aoeblk_mq_ops = {
+	.queue_rq	= aoeblk_queue_rq,
+};
+
+/* blk_mq_alloc_disk and add_disk can sleep */
+void
+aoeblk_gdalloc(void *vp)
+{
+	struct aoedev *d = vp;
+	struct gendisk *gd;
+	mempool_t *mp;
+	struct blk_mq_tag_set *set;
+	ulong flags;
+	int late = 0;
+	int err;
+
+	spin_lock_irqsave(&d->lock, flags);
+	if (d->flags & DEVFL_GDALLOC
+	&& !(d->flags & DEVFL_TKILL)
+	&& !(d->flags & DEVFL_GD_NOW))
+		d->flags |= DEVFL_GD_NOW;
+	else
+		late = 1;
+	spin_unlock_irqrestore(&d->lock, flags);
+	if (late)
+		return;
+
+	mp = mempool_create(MIN_BUFS, mempool_alloc_slab, mempool_free_slab,
+		buf_pool_cache);
+	if (mp == NULL) {
+		printk(KERN_ERR "aoe: cannot allocate bufpool for %ld.%d\n",
+			d->aoemajor, d->aoeminor);
+		goto err;
+	}
+
+	set = &d->tag_set;
+	set->ops = &aoeblk_mq_ops;
+	set->cmd_size = sizeof(struct aoe_req);
+	set->nr_hw_queues = 1;
+	set->queue_depth = 128;
+	set->numa_node = NUMA_NO_NODE;
+	set->flags = BLK_MQ_F_SHOULD_MERGE;
+	err = blk_mq_alloc_tag_set(set);
+	if (err) {
+		pr_err("aoe: cannot allocate tag set for %ld.%d\n",
+			d->aoemajor, d->aoeminor);
+		goto err_mempool;
+	}
+
+	gd = blk_mq_alloc_disk(set, d);
+	if (IS_ERR(gd)) {
+		pr_err("aoe: cannot allocate block queue for %ld.%d\n",
+			d->aoemajor, d->aoeminor);
+		goto err_tagset;
+	}
+
+	spin_lock_irqsave(&d->lock, flags);
+	WARN_ON(!(d->flags & DEVFL_GD_NOW));
+	WARN_ON(!(d->flags & DEVFL_GDALLOC));
+	WARN_ON(d->flags & DEVFL_TKILL);
+	WARN_ON(d->gd);
+	WARN_ON(d->flags & DEVFL_UP);
+	blk_queue_max_hw_sectors(gd->queue, BLK_DEF_MAX_SECTORS);
+	blk_queue_io_opt(gd->queue, SZ_2M);
+	d->bufpool = mp;
+	d->blkq = gd->queue;
+	d->gd = gd;
+	if (aoe_maxsectors)
+		blk_queue_max_hw_sectors(gd->queue, aoe_maxsectors);
+	gd->major = AOE_MAJOR;
+	gd->first_minor = d->sysminor;
+	gd->minors = AOE_PARTITIONS;
+	gd->fops = &aoe_bdops;
+	gd->private_data = d;
+	set_capacity(gd, d->ssize);
+	snprintf(gd->disk_name, sizeof gd->disk_name, "etherd/e%ld.%d",
+		d->aoemajor, d->aoeminor);
+
+	d->flags &= ~DEVFL_GDALLOC;
+	d->flags |= DEVFL_UP;
+
+	spin_unlock_irqrestore(&d->lock, flags);
+
+	err = device_add_disk(NULL, gd, aoe_attr_groups);
+	if (err)
+		goto out_disk_cleanup;
+	aoedisk_add_debugfs(d);
+
+	spin_lock_irqsave(&d->lock, flags);
+	WARN_ON(!(d->flags & DEVFL_GD_NOW));
+	d->flags &= ~DEVFL_GD_NOW;
+	spin_unlock_irqrestore(&d->lock, flags);
+	return;
+
+out_disk_cleanup:
+	put_disk(gd);
+err_tagset:
+	blk_mq_free_tag_set(set);
+err_mempool:
+	mempool_destroy(mp);
+err:
+	spin_lock_irqsave(&d->lock, flags);
+	d->flags &= ~DEVFL_GD_NOW;
+	queue_work(aoe_wq, &d->work);
+	spin_unlock_irqrestore(&d->lock, flags);
+}
+
+void
+aoeblk_exit(void)
+{
+	debugfs_remove_recursive(aoe_debugfs_dir);
+	aoe_debugfs_dir = NULL;
+	kmem_cache_destroy(buf_pool_cache);
+}
+
+int __init
+aoeblk_init(void)
+{
+	buf_pool_cache = kmem_cache_create("aoe_bufs",
+					   sizeof(struct buf),
+					   0, 0, NULL);
+	if (buf_pool_cache == NULL)
+		return -ENOMEM;
+	aoe_debugfs_dir = debugfs_create_dir("aoe", NULL);
+	return 0;
+}
+
diff --git a/drivers/block/aoe/aoechr.c b/drivers/block/aoe/aoechr.c
new file mode 100644
index 000000000..8eea2529d
--- /dev/null
+++ b/drivers/block/aoe/aoechr.c
@@ -0,0 +1,318 @@
+/* Copyright (c) 2012 Coraid, Inc.  See COPYING for GPL terms. */
+/*
+ * aoechr.c
+ * AoE character device driver
+ */
+
+#include <linux/hdreg.h>
+#include <linux/blkdev.h>
+#include <linux/completion.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/mutex.h>
+#include <linux/skbuff.h>
+#include <linux/export.h>
+#include "aoe.h"
+
+enum {
+	//MINOR_STAT = 1, (moved to sysfs)
+	MINOR_ERR = 2,
+	MINOR_DISCOVER,
+	MINOR_INTERFACES,
+	MINOR_REVALIDATE,
+	MINOR_FLUSH,
+	MSGSZ = 2048,
+	NMSG = 100,		/* message backlog to retain */
+};
+
+struct aoe_chardev {
+	ulong minor;
+	char name[32];
+};
+
+enum { EMFL_VALID = 1 };
+
+struct ErrMsg {
+	short flags;
+	short len;
+	char *msg;
+};
+
+static DEFINE_MUTEX(aoechr_mutex);
+
+/* A ring buffer of error messages, to be read through
+ * "/dev/etherd/err".  When no messages are present,
+ * readers will block waiting for messages to appear.
+ */
+static struct ErrMsg emsgs[NMSG];
+static int emsgs_head_idx, emsgs_tail_idx;
+static struct completion emsgs_comp;
+static spinlock_t emsgs_lock;
+static int nblocked_emsgs_readers;
+static struct class *aoe_class;
+static struct aoe_chardev chardevs[] = {
+	{ MINOR_ERR, "err" },
+	{ MINOR_DISCOVER, "discover" },
+	{ MINOR_INTERFACES, "interfaces" },
+	{ MINOR_REVALIDATE, "revalidate" },
+	{ MINOR_FLUSH, "flush" },
+};
+
+static int
+discover(void)
+{
+	aoecmd_cfg(0xffff, 0xff);
+	return 0;
+}
+
+static int
+interfaces(const char __user *str, size_t size)
+{
+	if (set_aoe_iflist(str, size)) {
+		printk(KERN_ERR
+			"aoe: could not set interface list: too many interfaces\n");
+		return -EINVAL;
+	}
+	return 0;
+}
+
+static int
+revalidate(const char __user *str, size_t size)
+{
+	int major, minor, n;
+	ulong flags;
+	struct aoedev *d;
+	struct sk_buff *skb;
+	char buf[16];
+
+	if (size >= sizeof buf)
+		return -EINVAL;
+	buf[sizeof buf - 1] = '\0';
+	if (copy_from_user(buf, str, size))
+		return -EFAULT;
+
+	n = sscanf(buf, "e%d.%d", &major, &minor);
+	if (n != 2) {
+		pr_err("aoe: invalid device specification %s\n", buf);
+		return -EINVAL;
+	}
+	d = aoedev_by_aoeaddr(major, minor, 0);
+	if (!d)
+		return -EINVAL;
+	spin_lock_irqsave(&d->lock, flags);
+	aoecmd_cleanslate(d);
+	aoecmd_cfg(major, minor);
+loop:
+	skb = aoecmd_ata_id(d);
+	spin_unlock_irqrestore(&d->lock, flags);
+	/* try again if we are able to sleep a bit,
+	 * otherwise give up this revalidation
+	 */
+	if (!skb && !msleep_interruptible(250)) {
+		spin_lock_irqsave(&d->lock, flags);
+		goto loop;
+	}
+	aoedev_put(d);
+	if (skb) {
+		struct sk_buff_head queue;
+		__skb_queue_head_init(&queue);
+		__skb_queue_tail(&queue, skb);
+		aoenet_xmit(&queue);
+	}
+	return 0;
+}
+
+void
+aoechr_error(char *msg)
+{
+	struct ErrMsg *em;
+	char *mp;
+	ulong flags, n;
+
+	n = strlen(msg);
+
+	spin_lock_irqsave(&emsgs_lock, flags);
+
+	em = emsgs + emsgs_tail_idx;
+	if ((em->flags & EMFL_VALID)) {
+bail:		spin_unlock_irqrestore(&emsgs_lock, flags);
+		return;
+	}
+
+	mp = kmemdup(msg, n, GFP_ATOMIC);
+	if (!mp)
+		goto bail;
+
+	em->msg = mp;
+	em->flags |= EMFL_VALID;
+	em->len = n;
+
+	emsgs_tail_idx++;
+	emsgs_tail_idx %= ARRAY_SIZE(emsgs);
+
+	spin_unlock_irqrestore(&emsgs_lock, flags);
+
+	if (nblocked_emsgs_readers)
+		complete(&emsgs_comp);
+}
+
+static ssize_t
+aoechr_write(struct file *filp, const char __user *buf, size_t cnt, loff_t *offp)
+{
+	int ret = -EINVAL;
+
+	switch ((unsigned long) filp->private_data) {
+	default:
+		printk(KERN_INFO "aoe: can't write to that file.\n");
+		break;
+	case MINOR_DISCOVER:
+		ret = discover();
+		break;
+	case MINOR_INTERFACES:
+		ret = interfaces(buf, cnt);
+		break;
+	case MINOR_REVALIDATE:
+		ret = revalidate(buf, cnt);
+		break;
+	case MINOR_FLUSH:
+		ret = aoedev_flush(buf, cnt);
+		break;
+	}
+	if (ret == 0)
+		ret = cnt;
+	return ret;
+}
+
+static int
+aoechr_open(struct inode *inode, struct file *filp)
+{
+	int n, i;
+
+	mutex_lock(&aoechr_mutex);
+	n = iminor(inode);
+	filp->private_data = (void *) (unsigned long) n;
+
+	for (i = 0; i < ARRAY_SIZE(chardevs); ++i)
+		if (chardevs[i].minor == n) {
+			mutex_unlock(&aoechr_mutex);
+			return 0;
+		}
+	mutex_unlock(&aoechr_mutex);
+	return -EINVAL;
+}
+
+static int
+aoechr_rel(struct inode *inode, struct file *filp)
+{
+	return 0;
+}
+
+static ssize_t
+aoechr_read(struct file *filp, char __user *buf, size_t cnt, loff_t *off)
+{
+	unsigned long n;
+	char *mp;
+	struct ErrMsg *em;
+	ssize_t len;
+	ulong flags;
+
+	n = (unsigned long) filp->private_data;
+	if (n != MINOR_ERR)
+		return -EFAULT;
+
+	spin_lock_irqsave(&emsgs_lock, flags);
+
+	for (;;) {
+		em = emsgs + emsgs_head_idx;
+		if ((em->flags & EMFL_VALID) != 0)
+			break;
+		if (filp->f_flags & O_NDELAY) {
+			spin_unlock_irqrestore(&emsgs_lock, flags);
+			return -EAGAIN;
+		}
+		nblocked_emsgs_readers++;
+
+		spin_unlock_irqrestore(&emsgs_lock, flags);
+
+		n = wait_for_completion_interruptible(&emsgs_comp);
+
+		spin_lock_irqsave(&emsgs_lock, flags);
+
+		nblocked_emsgs_readers--;
+
+		if (n) {
+			spin_unlock_irqrestore(&emsgs_lock, flags);
+			return -ERESTARTSYS;
+		}
+	}
+	if (em->len > cnt) {
+		spin_unlock_irqrestore(&emsgs_lock, flags);
+		return -EAGAIN;
+	}
+	mp = em->msg;
+	len = em->len;
+	em->msg = NULL;
+	em->flags &= ~EMFL_VALID;
+
+	emsgs_head_idx++;
+	emsgs_head_idx %= ARRAY_SIZE(emsgs);
+
+	spin_unlock_irqrestore(&emsgs_lock, flags);
+
+	n = copy_to_user(buf, mp, len);
+	kfree(mp);
+	return n == 0 ? len : -EFAULT;
+}
+
+static const struct file_operations aoe_fops = {
+	.write = aoechr_write,
+	.read = aoechr_read,
+	.open = aoechr_open,
+	.release = aoechr_rel,
+	.owner = THIS_MODULE,
+	.llseek = noop_llseek,
+};
+
+static char *aoe_devnode(struct device *dev, umode_t *mode)
+{
+	return kasprintf(GFP_KERNEL, "etherd/%s", dev_name(dev));
+}
+
+int __init
+aoechr_init(void)
+{
+	int n, i;
+
+	n = register_chrdev(AOE_MAJOR, "aoechr", &aoe_fops);
+	if (n < 0) {
+		printk(KERN_ERR "aoe: can't register char device\n");
+		return n;
+	}
+	init_completion(&emsgs_comp);
+	spin_lock_init(&emsgs_lock);
+	aoe_class = class_create(THIS_MODULE, "aoe");
+	if (IS_ERR(aoe_class)) {
+		unregister_chrdev(AOE_MAJOR, "aoechr");
+		return PTR_ERR(aoe_class);
+	}
+	aoe_class->devnode = aoe_devnode;
+
+	for (i = 0; i < ARRAY_SIZE(chardevs); ++i)
+		device_create(aoe_class, NULL,
+			      MKDEV(AOE_MAJOR, chardevs[i].minor), NULL,
+			      chardevs[i].name);
+
+	return 0;
+}
+
+void
+aoechr_exit(void)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(chardevs); ++i)
+		device_destroy(aoe_class, MKDEV(AOE_MAJOR, chardevs[i].minor));
+	class_destroy(aoe_class);
+	unregister_chrdev(AOE_MAJOR, "aoechr");
+}
+
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
new file mode 100644
index 000000000..d7317425b
--- /dev/null
+++ b/drivers/block/aoe/aoecmd.c
@@ -0,0 +1,1751 @@
+/* Copyright (c) 2013 Coraid, Inc.  See COPYING for GPL terms. */
+/*
+ * aoecmd.c
+ * Filesystem request handling methods
+ */
+
+#include <linux/ata.h>
+#include <linux/slab.h>
+#include <linux/hdreg.h>
+#include <linux/blk-mq.h>
+#include <linux/skbuff.h>
+#include <linux/netdevice.h>
+#include <linux/moduleparam.h>
+#include <linux/workqueue.h>
+#include <linux/kthread.h>
+#include <net/net_namespace.h>
+#include <asm/unaligned.h>
+#include <linux/uio.h>
+#include "aoe.h"
+
+#define MAXIOC (8192)	/* default meant to avoid most soft lockups */
+
+static void ktcomplete(struct frame *, struct sk_buff *);
+static int count_targets(struct aoedev *d, int *untainted);
+
+static struct buf *nextbuf(struct aoedev *);
+
+static int aoe_deadsecs = 60 * 3;
+module_param(aoe_deadsecs, int, 0644);
+MODULE_PARM_DESC(aoe_deadsecs, "After aoe_deadsecs seconds, give up and fail dev.");
+
+static int aoe_maxout = 64;
+module_param(aoe_maxout, int, 0644);
+MODULE_PARM_DESC(aoe_maxout,
+	"Only aoe_maxout outstanding packets for every MAC on eX.Y.");
+
+/* The number of online cpus during module initialization gives us a
+ * convenient heuristic cap on the parallelism used for ktio threads
+ * doing I/O completion.  It is not important that the cap equal the
+ * actual number of running CPUs at any given time, but because of CPU
+ * hotplug, we take care to use ncpus instead of using
+ * num_online_cpus() after module initialization.
+ */
+static int ncpus;
+
+/* mutex lock used for synchronization while thread spawning */
+static DEFINE_MUTEX(ktio_spawn_lock);
+
+static wait_queue_head_t *ktiowq;
+static struct ktstate *kts;
+
+/* io completion queue */
+struct iocq_ktio {
+	struct list_head head;
+	spinlock_t lock;
+};
+static struct iocq_ktio *iocq;
+
+static struct page *empty_page;
+
+static struct sk_buff *
+new_skb(ulong len)
+{
+	struct sk_buff *skb;
+
+	skb = alloc_skb(len + MAX_HEADER, GFP_ATOMIC);
+	if (skb) {
+		skb_reserve(skb, MAX_HEADER);
+		skb_reset_mac_header(skb);
+		skb_reset_network_header(skb);
+		skb->protocol = __constant_htons(ETH_P_AOE);
+		skb_checksum_none_assert(skb);
+	}
+	return skb;
+}
+
+static struct frame *
+getframe_deferred(struct aoedev *d, u32 tag)
+{
+	struct list_head *head, *pos, *nx;
+	struct frame *f;
+
+	head = &d->rexmitq;
+	list_for_each_safe(pos, nx, head) {
+		f = list_entry(pos, struct frame, head);
+		if (f->tag == tag) {
+			list_del(pos);
+			return f;
+		}
+	}
+	return NULL;
+}
+
+static struct frame *
+getframe(struct aoedev *d, u32 tag)
+{
+	struct frame *f;
+	struct list_head *head, *pos, *nx;
+	u32 n;
+
+	n = tag % NFACTIVE;
+	head = &d->factive[n];
+	list_for_each_safe(pos, nx, head) {
+		f = list_entry(pos, struct frame, head);
+		if (f->tag == tag) {
+			list_del(pos);
+			return f;
+		}
+	}
+	return NULL;
+}
+
+/*
+ * Leave the top bit clear so we have tagspace for userland.
+ * The bottom 16 bits are the xmit tick for rexmit/rttavg processing.
+ * This driver reserves tag -1 to mean "unused frame."
+ */
+static int
+newtag(struct aoedev *d)
+{
+	register ulong n;
+
+	n = jiffies & 0xffff;
+	return n | (++d->lasttag & 0x7fff) << 16;
+}
+
+static u32
+aoehdr_atainit(struct aoedev *d, struct aoetgt *t, struct aoe_hdr *h)
+{
+	u32 host_tag = newtag(d);
+
+	memcpy(h->src, t->ifp->nd->dev_addr, sizeof h->src);
+	memcpy(h->dst, t->addr, sizeof h->dst);
+	h->type = __constant_cpu_to_be16(ETH_P_AOE);
+	h->verfl = AOE_HVER;
+	h->major = cpu_to_be16(d->aoemajor);
+	h->minor = d->aoeminor;
+	h->cmd = AOECMD_ATA;
+	h->tag = cpu_to_be32(host_tag);
+
+	return host_tag;
+}
+
+static inline void
+put_lba(struct aoe_atahdr *ah, sector_t lba)
+{
+	ah->lba0 = lba;
+	ah->lba1 = lba >>= 8;
+	ah->lba2 = lba >>= 8;
+	ah->lba3 = lba >>= 8;
+	ah->lba4 = lba >>= 8;
+	ah->lba5 = lba >>= 8;
+}
+
+static struct aoeif *
+ifrotate(struct aoetgt *t)
+{
+	struct aoeif *ifp;
+
+	ifp = t->ifp;
+	ifp++;
+	if (ifp >= &t->ifs[NAOEIFS] || ifp->nd == NULL)
+		ifp = t->ifs;
+	if (ifp->nd == NULL)
+		return NULL;
+	return t->ifp = ifp;
+}
+
+static void
+skb_pool_put(struct aoedev *d, struct sk_buff *skb)
+{
+	__skb_queue_tail(&d->skbpool, skb);
+}
+
+static struct sk_buff *
+skb_pool_get(struct aoedev *d)
+{
+	struct sk_buff *skb = skb_peek(&d->skbpool);
+
+	if (skb && atomic_read(&skb_shinfo(skb)->dataref) == 1) {
+		__skb_unlink(skb, &d->skbpool);
+		return skb;
+	}
+	if (skb_queue_len(&d->skbpool) < NSKBPOOLMAX &&
+	    (skb = new_skb(ETH_ZLEN)))
+		return skb;
+
+	return NULL;
+}
+
+void
+aoe_freetframe(struct frame *f)
+{
+	struct aoetgt *t;
+
+	t = f->t;
+	f->buf = NULL;
+	memset(&f->iter, 0, sizeof(f->iter));
+	f->r_skb = NULL;
+	f->flags = 0;
+	list_add(&f->head, &t->ffree);
+}
+
+static struct frame *
+newtframe(struct aoedev *d, struct aoetgt *t)
+{
+	struct frame *f;
+	struct sk_buff *skb;
+	struct list_head *pos;
+
+	if (list_empty(&t->ffree)) {
+		if (t->falloc >= NSKBPOOLMAX*2)
+			return NULL;
+		f = kcalloc(1, sizeof(*f), GFP_ATOMIC);
+		if (f == NULL)
+			return NULL;
+		t->falloc++;
+		f->t = t;
+	} else {
+		pos = t->ffree.next;
+		list_del(pos);
+		f = list_entry(pos, struct frame, head);
+	}
+
+	skb = f->skb;
+	if (skb == NULL) {
+		f->skb = skb = new_skb(ETH_ZLEN);
+		if (!skb) {
+bail:			aoe_freetframe(f);
+			return NULL;
+		}
+	}
+
+	if (atomic_read(&skb_shinfo(skb)->dataref) != 1) {
+		skb = skb_pool_get(d);
+		if (skb == NULL)
+			goto bail;
+		skb_pool_put(d, f->skb);
+		f->skb = skb;
+	}
+
+	skb->truesize -= skb->data_len;
+	skb_shinfo(skb)->nr_frags = skb->data_len = 0;
+	skb_trim(skb, 0);
+	return f;
+}
+
+static struct frame *
+newframe(struct aoedev *d)
+{
+	struct frame *f;
+	struct aoetgt *t, **tt;
+	int totout = 0;
+	int use_tainted;
+	int has_untainted;
+
+	if (!d->targets || !d->targets[0]) {
+		printk(KERN_ERR "aoe: NULL TARGETS!\n");
+		return NULL;
+	}
+	tt = d->tgt;	/* last used target */
+	for (use_tainted = 0, has_untainted = 0;;) {
+		tt++;
+		if (tt >= &d->targets[d->ntargets] || !*tt)
+			tt = d->targets;
+		t = *tt;
+		if (!t->taint) {
+			has_untainted = 1;
+			totout += t->nout;
+		}
+		if (t->nout < t->maxout
+		&& (use_tainted || !t->taint)
+		&& t->ifp->nd) {
+			f = newtframe(d, t);
+			if (f) {
+				ifrotate(t);
+				d->tgt = tt;
+				return f;
+			}
+		}
+		if (tt == d->tgt) {	/* we've looped and found nada */
+			if (!use_tainted && !has_untainted)
+				use_tainted = 1;
+			else
+				break;
+		}
+	}
+	if (totout == 0) {
+		d->kicked++;
+		d->flags |= DEVFL_KICKME;
+	}
+	return NULL;
+}
+
+static void
+skb_fillup(struct sk_buff *skb, struct bio *bio, struct bvec_iter iter)
+{
+	int frag = 0;
+	struct bio_vec bv;
+
+	__bio_for_each_segment(bv, bio, iter, iter)
+		skb_fill_page_desc(skb, frag++, bv.bv_page,
+				   bv.bv_offset, bv.bv_len);
+}
+
+static void
+fhash(struct frame *f)
+{
+	struct aoedev *d = f->t->d;
+	u32 n;
+
+	n = f->tag % NFACTIVE;
+	list_add_tail(&f->head, &d->factive[n]);
+}
+
+static void
+ata_rw_frameinit(struct frame *f)
+{
+	struct aoetgt *t;
+	struct aoe_hdr *h;
+	struct aoe_atahdr *ah;
+	struct sk_buff *skb;
+	char writebit, extbit;
+
+	skb = f->skb;
+	h = (struct aoe_hdr *) skb_mac_header(skb);
+	ah = (struct aoe_atahdr *) (h + 1);
+	skb_put(skb, sizeof(*h) + sizeof(*ah));
+	memset(h, 0, skb->len);
+
+	writebit = 0x10;
+	extbit = 0x4;
+
+	t = f->t;
+	f->tag = aoehdr_atainit(t->d, t, h);
+	fhash(f);
+	t->nout++;
+	f->waited = 0;
+	f->waited_total = 0;
+
+	/* set up ata header */
+	ah->scnt = f->iter.bi_size >> 9;
+	put_lba(ah, f->iter.bi_sector);
+	if (t->d->flags & DEVFL_EXT) {
+		ah->aflags |= AOEAFL_EXT;
+	} else {
+		extbit = 0;
+		ah->lba3 &= 0x0f;
+		ah->lba3 |= 0xe0;	/* LBA bit + obsolete 0xa0 */
+	}
+	if (f->buf && bio_data_dir(f->buf->bio) == WRITE) {
+		skb_fillup(skb, f->buf->bio, f->iter);
+		ah->aflags |= AOEAFL_WRITE;
+		skb->len += f->iter.bi_size;
+		skb->data_len = f->iter.bi_size;
+		skb->truesize += f->iter.bi_size;
+		t->wpkts++;
+	} else {
+		t->rpkts++;
+		writebit = 0;
+	}
+
+	ah->cmdstat = ATA_CMD_PIO_READ | writebit | extbit;
+	skb->dev = t->ifp->nd;
+}
+
+static int
+aoecmd_ata_rw(struct aoedev *d)
+{
+	struct frame *f;
+	struct buf *buf;
+	struct sk_buff *skb;
+	struct sk_buff_head queue;
+
+	buf = nextbuf(d);
+	if (buf == NULL)
+		return 0;
+	f = newframe(d);
+	if (f == NULL)
+		return 0;
+
+	/* initialize the headers & frame */
+	f->buf = buf;
+	f->iter = buf->iter;
+	f->iter.bi_size = min_t(unsigned long,
+				d->maxbcnt ?: DEFAULTBCNT,
+				f->iter.bi_size);
+	bio_advance_iter(buf->bio, &buf->iter, f->iter.bi_size);
+
+	if (!buf->iter.bi_size)
+		d->ip.buf = NULL;
+
+	/* mark all tracking fields and load out */
+	buf->nframesout += 1;
+
+	ata_rw_frameinit(f);
+
+	skb = skb_clone(f->skb, GFP_ATOMIC);
+	if (skb) {
+		f->sent = ktime_get();
+		__skb_queue_head_init(&queue);
+		__skb_queue_tail(&queue, skb);
+		aoenet_xmit(&queue);
+	}
+	return 1;
+}
+
+/* some callers cannot sleep, and they can call this function,
+ * transmitting the packets later, when interrupts are on
+ */
+static void
+aoecmd_cfg_pkts(ushort aoemajor, unsigned char aoeminor, struct sk_buff_head *queue)
+{
+	struct aoe_hdr *h;
+	struct aoe_cfghdr *ch;
+	struct sk_buff *skb;
+	struct net_device *ifp;
+
+	rcu_read_lock();
+	for_each_netdev_rcu(&init_net, ifp) {
+		dev_hold(ifp);
+		if (!is_aoe_netif(ifp))
+			goto cont;
+
+		skb = new_skb(sizeof *h + sizeof *ch);
+		if (skb == NULL) {
+			printk(KERN_INFO "aoe: skb alloc failure\n");
+			goto cont;
+		}
+		skb_put(skb, sizeof *h + sizeof *ch);
+		skb->dev = ifp;
+		__skb_queue_tail(queue, skb);
+		h = (struct aoe_hdr *) skb_mac_header(skb);
+		memset(h, 0, sizeof *h + sizeof *ch);
+
+		memset(h->dst, 0xff, sizeof h->dst);
+		memcpy(h->src, ifp->dev_addr, sizeof h->src);
+		h->type = __constant_cpu_to_be16(ETH_P_AOE);
+		h->verfl = AOE_HVER;
+		h->major = cpu_to_be16(aoemajor);
+		h->minor = aoeminor;
+		h->cmd = AOECMD_CFG;
+
+cont:
+		dev_put(ifp);
+	}
+	rcu_read_unlock();
+}
+
+static void
+resend(struct aoedev *d, struct frame *f)
+{
+	struct sk_buff *skb;
+	struct sk_buff_head queue;
+	struct aoe_hdr *h;
+	struct aoetgt *t;
+	char buf[128];
+	u32 n;
+
+	t = f->t;
+	n = newtag(d);
+	skb = f->skb;
+	if (ifrotate(t) == NULL) {
+		/* probably can't happen, but set it up to fail anyway */
+		pr_info("aoe: resend: no interfaces to rotate to.\n");
+		ktcomplete(f, NULL);
+		return;
+	}
+	h = (struct aoe_hdr *) skb_mac_header(skb);
+
+	if (!(f->flags & FFL_PROBE)) {
+		snprintf(buf, sizeof(buf),
+			"%15s e%ld.%d oldtag=%08x@%08lx newtag=%08x s=%pm d=%pm nout=%d\n",
+			"retransmit", d->aoemajor, d->aoeminor,
+			f->tag, jiffies, n,
+			h->src, h->dst, t->nout);
+		aoechr_error(buf);
+	}
+
+	f->tag = n;
+	fhash(f);
+	h->tag = cpu_to_be32(n);
+	memcpy(h->dst, t->addr, sizeof h->dst);
+	memcpy(h->src, t->ifp->nd->dev_addr, sizeof h->src);
+
+	skb->dev = t->ifp->nd;
+	skb = skb_clone(skb, GFP_ATOMIC);
+	if (skb == NULL)
+		return;
+	f->sent = ktime_get();
+	__skb_queue_head_init(&queue);
+	__skb_queue_tail(&queue, skb);
+	aoenet_xmit(&queue);
+}
+
+static int
+tsince_hr(struct frame *f)
+{
+	u64 delta = ktime_to_ns(ktime_sub(ktime_get(), f->sent));
+
+	/* delta is normally under 4.2 seconds, avoid 64-bit division */
+	if (likely(delta <= UINT_MAX))
+		return (u32)delta / NSEC_PER_USEC;
+
+	/* avoid overflow after 71 minutes */
+	if (delta > ((u64)INT_MAX * NSEC_PER_USEC))
+		return INT_MAX;
+
+	return div_u64(delta, NSEC_PER_USEC);
+}
+
+static int
+tsince(u32 tag)
+{
+	int n;
+
+	n = jiffies & 0xffff;
+	n -= tag & 0xffff;
+	if (n < 0)
+		n += 1<<16;
+	return jiffies_to_usecs(n + 1);
+}
+
+static struct aoeif *
+getif(struct aoetgt *t, struct net_device *nd)
+{
+	struct aoeif *p, *e;
+
+	p = t->ifs;
+	e = p + NAOEIFS;
+	for (; p < e; p++)
+		if (p->nd == nd)
+			return p;
+	return NULL;
+}
+
+static void
+ejectif(struct aoetgt *t, struct aoeif *ifp)
+{
+	struct aoeif *e;
+	struct net_device *nd;
+	ulong n;
+
+	nd = ifp->nd;
+	e = t->ifs + NAOEIFS - 1;
+	n = (e - ifp) * sizeof *ifp;
+	memmove(ifp, ifp+1, n);
+	e->nd = NULL;
+	dev_put(nd);
+}
+
+static struct frame *
+reassign_frame(struct frame *f)
+{
+	struct frame *nf;
+	struct sk_buff *skb;
+
+	nf = newframe(f->t->d);
+	if (!nf)
+		return NULL;
+	if (nf->t == f->t) {
+		aoe_freetframe(nf);
+		return NULL;
+	}
+
+	skb = nf->skb;
+	nf->skb = f->skb;
+	nf->buf = f->buf;
+	nf->iter = f->iter;
+	nf->waited = 0;
+	nf->waited_total = f->waited_total;
+	nf->sent = f->sent;
+	f->skb = skb;
+
+	return nf;
+}
+
+static void
+probe(struct aoetgt *t)
+{
+	struct aoedev *d;
+	struct frame *f;
+	struct sk_buff *skb;
+	struct sk_buff_head queue;
+	size_t n, m;
+	int frag;
+
+	d = t->d;
+	f = newtframe(d, t);
+	if (!f) {
+		pr_err("%s %pm for e%ld.%d: %s\n",
+			"aoe: cannot probe remote address",
+			t->addr,
+			(long) d->aoemajor, d->aoeminor,
+			"no frame available");
+		return;
+	}
+	f->flags |= FFL_PROBE;
+	ifrotate(t);
+	f->iter.bi_size = t->d->maxbcnt ? t->d->maxbcnt : DEFAULTBCNT;
+	ata_rw_frameinit(f);
+	skb = f->skb;
+	for (frag = 0, n = f->iter.bi_size; n > 0; ++frag, n -= m) {
+		if (n < PAGE_SIZE)
+			m = n;
+		else
+			m = PAGE_SIZE;
+		skb_fill_page_desc(skb, frag, empty_page, 0, m);
+	}
+	skb->len += f->iter.bi_size;
+	skb->data_len = f->iter.bi_size;
+	skb->truesize += f->iter.bi_size;
+
+	skb = skb_clone(f->skb, GFP_ATOMIC);
+	if (skb) {
+		f->sent = ktime_get();
+		__skb_queue_head_init(&queue);
+		__skb_queue_tail(&queue, skb);
+		aoenet_xmit(&queue);
+	}
+}
+
+static long
+rto(struct aoedev *d)
+{
+	long t;
+
+	t = 2 * d->rttavg >> RTTSCALE;
+	t += 8 * d->rttdev >> RTTDSCALE;
+	if (t == 0)
+		t = 1;
+
+	return t;
+}
+
+static void
+rexmit_deferred(struct aoedev *d)
+{
+	struct aoetgt *t;
+	struct frame *f;
+	struct frame *nf;
+	struct list_head *pos, *nx, *head;
+	int since;
+	int untainted;
+
+	count_targets(d, &untainted);
+
+	head = &d->rexmitq;
+	list_for_each_safe(pos, nx, head) {
+		f = list_entry(pos, struct frame, head);
+		t = f->t;
+		if (t->taint) {
+			if (!(f->flags & FFL_PROBE)) {
+				nf = reassign_frame(f);
+				if (nf) {
+					if (t->nout_probes == 0
+					&& untainted > 0) {
+						probe(t);
+						t->nout_probes++;
+					}
+					list_replace(&f->head, &nf->head);
+					pos = &nf->head;
+					aoe_freetframe(f);
+					f = nf;
+					t = f->t;
+				}
+			} else if (untainted < 1) {
+				/* don't probe w/o other untainted aoetgts */
+				goto stop_probe;
+			} else if (tsince_hr(f) < t->taint * rto(d)) {
+				/* reprobe slowly when taint is high */
+				continue;
+			}
+		} else if (f->flags & FFL_PROBE) {
+stop_probe:		/* don't probe untainted aoetgts */
+			list_del(pos);
+			aoe_freetframe(f);
+			/* leaving d->kicked, because this is routine */
+			f->t->d->flags |= DEVFL_KICKME;
+			continue;
+		}
+		if (t->nout >= t->maxout)
+			continue;
+		list_del(pos);
+		t->nout++;
+		if (f->flags & FFL_PROBE)
+			t->nout_probes++;
+		since = tsince_hr(f);
+		f->waited += since;
+		f->waited_total += since;
+		resend(d, f);
+	}
+}
+
+/* An aoetgt accumulates demerits quickly, and successful
+ * probing redeems the aoetgt slowly.
+ */
+static void
+scorn(struct aoetgt *t)
+{
+	int n;
+
+	n = t->taint++;
+	t->taint += t->taint * 2;
+	if (n > t->taint)
+		t->taint = n;
+	if (t->taint > MAX_TAINT)
+		t->taint = MAX_TAINT;
+}
+
+static int
+count_targets(struct aoedev *d, int *untainted)
+{
+	int i, good;
+
+	for (i = good = 0; i < d->ntargets && d->targets[i]; ++i)
+		if (d->targets[i]->taint == 0)
+			good++;
+
+	if (untainted)
+		*untainted = good;
+	return i;
+}
+
+static void
+rexmit_timer(struct timer_list *timer)
+{
+	struct aoedev *d;
+	struct aoetgt *t;
+	struct aoeif *ifp;
+	struct frame *f;
+	struct list_head *head, *pos, *nx;
+	LIST_HEAD(flist);
+	register long timeout;
+	ulong flags, n;
+	int i;
+	int utgts;	/* number of aoetgt descriptors (not slots) */
+	int since;
+
+	d = from_timer(d, timer, timer);
+
+	spin_lock_irqsave(&d->lock, flags);
+
+	/* timeout based on observed timings and variations */
+	timeout = rto(d);
+
+	utgts = count_targets(d, NULL);
+
+	if (d->flags & DEVFL_TKILL) {
+		spin_unlock_irqrestore(&d->lock, flags);
+		return;
+	}
+
+	/* collect all frames to rexmit into flist */
+	for (i = 0; i < NFACTIVE; i++) {
+		head = &d->factive[i];
+		list_for_each_safe(pos, nx, head) {
+			f = list_entry(pos, struct frame, head);
+			if (tsince_hr(f) < timeout)
+				break;	/* end of expired frames */
+			/* move to flist for later processing */
+			list_move_tail(pos, &flist);
+		}
+	}
+
+	/* process expired frames */
+	while (!list_empty(&flist)) {
+		pos = flist.next;
+		f = list_entry(pos, struct frame, head);
+		since = tsince_hr(f);
+		n = f->waited_total + since;
+		n /= USEC_PER_SEC;
+		if (aoe_deadsecs
+		&& n > aoe_deadsecs
+		&& !(f->flags & FFL_PROBE)) {
+			/* Waited too long.  Device failure.
+			 * Hang all frames on first hash bucket for downdev
+			 * to clean up.
+			 */
+			list_splice(&flist, &d->factive[0]);
+			aoedev_downdev(d);
+			goto out;
+		}
+
+		t = f->t;
+		n = f->waited + since;
+		n /= USEC_PER_SEC;
+		if (aoe_deadsecs && utgts > 0
+		&& (n > aoe_deadsecs / utgts || n > HARD_SCORN_SECS))
+			scorn(t); /* avoid this target */
+
+		if (t->maxout != 1) {
+			t->ssthresh = t->maxout / 2;
+			t->maxout = 1;
+		}
+
+		if (f->flags & FFL_PROBE) {
+			t->nout_probes--;
+		} else {
+			ifp = getif(t, f->skb->dev);
+			if (ifp && ++ifp->lost > (t->nframes << 1)
+			&& (ifp != t->ifs || t->ifs[1].nd)) {
+				ejectif(t, ifp);
+				ifp = NULL;
+			}
+		}
+		list_move_tail(pos, &d->rexmitq);
+		t->nout--;
+	}
+	rexmit_deferred(d);
+
+out:
+	if ((d->flags & DEVFL_KICKME) && d->blkq) {
+		d->flags &= ~DEVFL_KICKME;
+		blk_mq_run_hw_queues(d->blkq, true);
+	}
+
+	d->timer.expires = jiffies + TIMERTICK;
+	add_timer(&d->timer);
+
+	spin_unlock_irqrestore(&d->lock, flags);
+}
+
+static void
+bufinit(struct buf *buf, struct request *rq, struct bio *bio)
+{
+	memset(buf, 0, sizeof(*buf));
+	buf->rq = rq;
+	buf->bio = bio;
+	buf->iter = bio->bi_iter;
+}
+
+static struct buf *
+nextbuf(struct aoedev *d)
+{
+	struct request *rq;
+	struct request_queue *q;
+	struct aoe_req *req;
+	struct buf *buf;
+	struct bio *bio;
+
+	q = d->blkq;
+	if (q == NULL)
+		return NULL;	/* initializing */
+	if (d->ip.buf)
+		return d->ip.buf;
+	rq = d->ip.rq;
+	if (rq == NULL) {
+		rq = list_first_entry_or_null(&d->rq_list, struct request,
+						queuelist);
+		if (rq == NULL)
+			return NULL;
+		list_del_init(&rq->queuelist);
+		blk_mq_start_request(rq);
+		d->ip.rq = rq;
+		d->ip.nxbio = rq->bio;
+
+		req = blk_mq_rq_to_pdu(rq);
+		req->nr_bios = 0;
+		__rq_for_each_bio(bio, rq)
+			req->nr_bios++;
+	}
+	buf = mempool_alloc(d->bufpool, GFP_ATOMIC);
+	if (buf == NULL) {
+		pr_err("aoe: nextbuf: unable to mempool_alloc!\n");
+		return NULL;
+	}
+	bio = d->ip.nxbio;
+	bufinit(buf, rq, bio);
+	bio = bio->bi_next;
+	d->ip.nxbio = bio;
+	if (bio == NULL)
+		d->ip.rq = NULL;
+	return d->ip.buf = buf;
+}
+
+/* enters with d->lock held */
+void
+aoecmd_work(struct aoedev *d)
+{
+	rexmit_deferred(d);
+	while (aoecmd_ata_rw(d))
+		;
+}
+
+/* this function performs work that has been deferred until sleeping is OK
+ */
+void
+aoecmd_sleepwork(struct work_struct *work)
+{
+	struct aoedev *d = container_of(work, struct aoedev, work);
+
+	if (d->flags & DEVFL_GDALLOC)
+		aoeblk_gdalloc(d);
+
+	if (d->flags & DEVFL_NEWSIZE) {
+		set_capacity_and_notify(d->gd, d->ssize);
+
+		spin_lock_irq(&d->lock);
+		d->flags |= DEVFL_UP;
+		d->flags &= ~DEVFL_NEWSIZE;
+		spin_unlock_irq(&d->lock);
+	}
+}
+
+static void
+ata_ident_fixstring(u16 *id, int ns)
+{
+	u16 s;
+
+	while (ns-- > 0) {
+		s = *id;
+		*id++ = s >> 8 | s << 8;
+	}
+}
+
+static void
+ataid_complete(struct aoedev *d, struct aoetgt *t, unsigned char *id)
+{
+	u64 ssize;
+	u16 n;
+
+	/* word 83: command set supported */
+	n = get_unaligned_le16(&id[83 << 1]);
+
+	/* word 86: command set/feature enabled */
+	n |= get_unaligned_le16(&id[86 << 1]);
+
+	if (n & (1<<10)) {	/* bit 10: LBA 48 */
+		d->flags |= DEVFL_EXT;
+
+		/* word 100: number lba48 sectors */
+		ssize = get_unaligned_le64(&id[100 << 1]);
+
+		/* set as in ide-disk.c:init_idedisk_capacity */
+		d->geo.cylinders = ssize;
+		d->geo.cylinders /= (255 * 63);
+		d->geo.heads = 255;
+		d->geo.sectors = 63;
+	} else {
+		d->flags &= ~DEVFL_EXT;
+
+		/* number lba28 sectors */
+		ssize = get_unaligned_le32(&id[60 << 1]);
+
+		/* NOTE: obsolete in ATA 6 */
+		d->geo.cylinders = get_unaligned_le16(&id[54 << 1]);
+		d->geo.heads = get_unaligned_le16(&id[55 << 1]);
+		d->geo.sectors = get_unaligned_le16(&id[56 << 1]);
+	}
+
+	ata_ident_fixstring((u16 *) &id[10<<1], 10);	/* serial */
+	ata_ident_fixstring((u16 *) &id[23<<1], 4);	/* firmware */
+	ata_ident_fixstring((u16 *) &id[27<<1], 20);	/* model */
+	memcpy(d->ident, id, sizeof(d->ident));
+
+	if (d->ssize != ssize)
+		printk(KERN_INFO
+			"aoe: %pm e%ld.%d v%04x has %llu sectors\n",
+			t->addr,
+			d->aoemajor, d->aoeminor,
+			d->fw_ver, (long long)ssize);
+	d->ssize = ssize;
+	d->geo.start = 0;
+	if (d->flags & (DEVFL_GDALLOC|DEVFL_NEWSIZE))
+		return;
+	if (d->gd != NULL)
+		d->flags |= DEVFL_NEWSIZE;
+	else
+		d->flags |= DEVFL_GDALLOC;
+	queue_work(aoe_wq, &d->work);
+}
+
+static void
+calc_rttavg(struct aoedev *d, struct aoetgt *t, int rtt)
+{
+	register long n;
+
+	n = rtt;
+
+	/* cf. Congestion Avoidance and Control, Jacobson & Karels, 1988 */
+	n -= d->rttavg >> RTTSCALE;
+	d->rttavg += n;
+	if (n < 0)
+		n = -n;
+	n -= d->rttdev >> RTTDSCALE;
+	d->rttdev += n;
+
+	if (!t || t->maxout >= t->nframes)
+		return;
+	if (t->maxout < t->ssthresh)
+		t->maxout += 1;
+	else if (t->nout == t->maxout && t->next_cwnd-- == 0) {
+		t->maxout += 1;
+		t->next_cwnd = t->maxout;
+	}
+}
+
+static struct aoetgt *
+gettgt(struct aoedev *d, char *addr)
+{
+	struct aoetgt **t, **e;
+
+	t = d->targets;
+	e = t + d->ntargets;
+	for (; t < e && *t; t++)
+		if (memcmp((*t)->addr, addr, sizeof((*t)->addr)) == 0)
+			return *t;
+	return NULL;
+}
+
+static void
+bvcpy(struct sk_buff *skb, struct bio *bio, struct bvec_iter iter, long cnt)
+{
+	int soff = 0;
+	struct bio_vec bv;
+
+	iter.bi_size = cnt;
+
+	__bio_for_each_segment(bv, bio, iter, iter) {
+		char *p = bvec_kmap_local(&bv);
+		skb_copy_bits(skb, soff, p, bv.bv_len);
+		kunmap_local(p);
+		soff += bv.bv_len;
+	}
+}
+
+void
+aoe_end_request(struct aoedev *d, struct request *rq, int fastfail)
+{
+	struct bio *bio;
+	int bok;
+	struct request_queue *q;
+	blk_status_t err = BLK_STS_OK;
+
+	q = d->blkq;
+	if (rq == d->ip.rq)
+		d->ip.rq = NULL;
+	do {
+		bio = rq->bio;
+		bok = !fastfail && !bio->bi_status;
+		if (!bok)
+			err = BLK_STS_IOERR;
+	} while (blk_update_request(rq, bok ? BLK_STS_OK : BLK_STS_IOERR, bio->bi_iter.bi_size));
+
+	__blk_mq_end_request(rq, err);
+
+	/* cf. https://lore.kernel.org/lkml/20061031071040.GS14055@kernel.dk/ */
+	if (!fastfail)
+		blk_mq_run_hw_queues(q, true);
+}
+
+static void
+aoe_end_buf(struct aoedev *d, struct buf *buf)
+{
+	struct request *rq = buf->rq;
+	struct aoe_req *req = blk_mq_rq_to_pdu(rq);
+
+	if (buf == d->ip.buf)
+		d->ip.buf = NULL;
+	mempool_free(buf, d->bufpool);
+	if (--req->nr_bios == 0)
+		aoe_end_request(d, rq, 0);
+}
+
+static void
+ktiocomplete(struct frame *f)
+{
+	struct aoe_hdr *hin, *hout;
+	struct aoe_atahdr *ahin, *ahout;
+	struct buf *buf;
+	struct sk_buff *skb;
+	struct aoetgt *t;
+	struct aoeif *ifp;
+	struct aoedev *d;
+	long n;
+	int untainted;
+
+	if (f == NULL)
+		return;
+
+	t = f->t;
+	d = t->d;
+	skb = f->r_skb;
+	buf = f->buf;
+	if (f->flags & FFL_PROBE)
+		goto out;
+	if (!skb)		/* just fail the buf. */
+		goto noskb;
+
+	hout = (struct aoe_hdr *) skb_mac_header(f->skb);
+	ahout = (struct aoe_atahdr *) (hout+1);
+
+	hin = (struct aoe_hdr *) skb->data;
+	skb_pull(skb, sizeof(*hin));
+	ahin = (struct aoe_atahdr *) skb->data;
+	skb_pull(skb, sizeof(*ahin));
+	if (ahin->cmdstat & 0xa9) {	/* these bits cleared on success */
+		pr_err("aoe: ata error cmd=%2.2Xh stat=%2.2Xh from e%ld.%d\n",
+			ahout->cmdstat, ahin->cmdstat,
+			d->aoemajor, d->aoeminor);
+noskb:		if (buf)
+			buf->bio->bi_status = BLK_STS_IOERR;
+		goto out;
+	}
+
+	n = ahout->scnt << 9;
+	switch (ahout->cmdstat) {
+	case ATA_CMD_PIO_READ:
+	case ATA_CMD_PIO_READ_EXT:
+		if (skb->len < n) {
+			pr_err("%s e%ld.%d.  skb->len=%d need=%ld\n",
+				"aoe: runt data size in read from",
+				(long) d->aoemajor, d->aoeminor,
+			       skb->len, n);
+			buf->bio->bi_status = BLK_STS_IOERR;
+			break;
+		}
+		if (n > f->iter.bi_size) {
+			pr_err_ratelimited("%s e%ld.%d.  bytes=%ld need=%u\n",
+				"aoe: too-large data size in read from",
+				(long) d->aoemajor, d->aoeminor,
+				n, f->iter.bi_size);
+			buf->bio->bi_status = BLK_STS_IOERR;
+			break;
+		}
+		bvcpy(skb, f->buf->bio, f->iter, n);
+		fallthrough;
+	case ATA_CMD_PIO_WRITE:
+	case ATA_CMD_PIO_WRITE_EXT:
+		spin_lock_irq(&d->lock);
+		ifp = getif(t, skb->dev);
+		if (ifp)
+			ifp->lost = 0;
+		spin_unlock_irq(&d->lock);
+		break;
+	case ATA_CMD_ID_ATA:
+		if (skb->len < 512) {
+			pr_info("%s e%ld.%d.  skb->len=%d need=512\n",
+				"aoe: runt data size in ataid from",
+				(long) d->aoemajor, d->aoeminor,
+				skb->len);
+			break;
+		}
+		if (skb_linearize(skb))
+			break;
+		spin_lock_irq(&d->lock);
+		ataid_complete(d, t, skb->data);
+		spin_unlock_irq(&d->lock);
+		break;
+	default:
+		pr_info("aoe: unrecognized ata command %2.2Xh for %d.%d\n",
+			ahout->cmdstat,
+			be16_to_cpu(get_unaligned(&hin->major)),
+			hin->minor);
+	}
+out:
+	spin_lock_irq(&d->lock);
+	if (t->taint > 0
+	&& --t->taint > 0
+	&& t->nout_probes == 0) {
+		count_targets(d, &untainted);
+		if (untainted > 0) {
+			probe(t);
+			t->nout_probes++;
+		}
+	}
+
+	aoe_freetframe(f);
+
+	if (buf && --buf->nframesout == 0 && buf->iter.bi_size == 0)
+		aoe_end_buf(d, buf);
+
+	spin_unlock_irq(&d->lock);
+	aoedev_put(d);
+	dev_kfree_skb(skb);
+}
+
+/* Enters with iocq.lock held.
+ * Returns true iff responses needing processing remain.
+ */
+static int
+ktio(int id)
+{
+	struct frame *f;
+	struct list_head *pos;
+	int i;
+	int actual_id;
+
+	for (i = 0; ; ++i) {
+		if (i == MAXIOC)
+			return 1;
+		if (list_empty(&iocq[id].head))
+			return 0;
+		pos = iocq[id].head.next;
+		list_del(pos);
+		f = list_entry(pos, struct frame, head);
+		spin_unlock_irq(&iocq[id].lock);
+		ktiocomplete(f);
+
+		/* Figure out if extra threads are required. */
+		actual_id = f->t->d->aoeminor % ncpus;
+
+		if (!kts[actual_id].active) {
+			BUG_ON(id != 0);
+			mutex_lock(&ktio_spawn_lock);
+			if (!kts[actual_id].active
+				&& aoe_ktstart(&kts[actual_id]) == 0)
+				kts[actual_id].active = 1;
+			mutex_unlock(&ktio_spawn_lock);
+		}
+		spin_lock_irq(&iocq[id].lock);
+	}
+}
+
+static int
+kthread(void *vp)
+{
+	struct ktstate *k;
+	DECLARE_WAITQUEUE(wait, current);
+	int more;
+
+	k = vp;
+	current->flags |= PF_NOFREEZE;
+	set_user_nice(current, -10);
+	complete(&k->rendez);	/* tell spawner we're running */
+	do {
+		spin_lock_irq(k->lock);
+		more = k->fn(k->id);
+		if (!more) {
+			add_wait_queue(k->waitq, &wait);
+			__set_current_state(TASK_INTERRUPTIBLE);
+		}
+		spin_unlock_irq(k->lock);
+		if (!more) {
+			schedule();
+			remove_wait_queue(k->waitq, &wait);
+		} else
+			cond_resched();
+	} while (!kthread_should_stop());
+	complete(&k->rendez);	/* tell spawner we're stopping */
+	return 0;
+}
+
+void
+aoe_ktstop(struct ktstate *k)
+{
+	kthread_stop(k->task);
+	wait_for_completion(&k->rendez);
+}
+
+int
+aoe_ktstart(struct ktstate *k)
+{
+	struct task_struct *task;
+
+	init_completion(&k->rendez);
+	task = kthread_run(kthread, k, "%s", k->name);
+	if (task == NULL || IS_ERR(task))
+		return -ENOMEM;
+	k->task = task;
+	wait_for_completion(&k->rendez); /* allow kthread to start */
+	init_completion(&k->rendez);	/* for waiting for exit later */
+	return 0;
+}
+
+/* pass it off to kthreads for processing */
+static void
+ktcomplete(struct frame *f, struct sk_buff *skb)
+{
+	int id;
+	ulong flags;
+
+	f->r_skb = skb;
+	id = f->t->d->aoeminor % ncpus;
+	spin_lock_irqsave(&iocq[id].lock, flags);
+	if (!kts[id].active) {
+		spin_unlock_irqrestore(&iocq[id].lock, flags);
+		/* The thread with id has not been spawned yet,
+		 * so delegate the work to the main thread and
+		 * try spawning a new thread.
+		 */
+		id = 0;
+		spin_lock_irqsave(&iocq[id].lock, flags);
+	}
+	list_add_tail(&f->head, &iocq[id].head);
+	spin_unlock_irqrestore(&iocq[id].lock, flags);
+	wake_up(&ktiowq[id]);
+}
+
+struct sk_buff *
+aoecmd_ata_rsp(struct sk_buff *skb)
+{
+	struct aoedev *d;
+	struct aoe_hdr *h;
+	struct frame *f;
+	u32 n;
+	ulong flags;
+	char ebuf[128];
+	u16 aoemajor;
+
+	h = (struct aoe_hdr *) skb->data;
+	aoemajor = be16_to_cpu(get_unaligned(&h->major));
+	d = aoedev_by_aoeaddr(aoemajor, h->minor, 0);
+	if (d == NULL) {
+		snprintf(ebuf, sizeof ebuf, "aoecmd_ata_rsp: ata response "
+			"for unknown device %d.%d\n",
+			aoemajor, h->minor);
+		aoechr_error(ebuf);
+		return skb;
+	}
+
+	spin_lock_irqsave(&d->lock, flags);
+
+	n = be32_to_cpu(get_unaligned(&h->tag));
+	f = getframe(d, n);
+	if (f) {
+		calc_rttavg(d, f->t, tsince_hr(f));
+		f->t->nout--;
+		if (f->flags & FFL_PROBE)
+			f->t->nout_probes--;
+	} else {
+		f = getframe_deferred(d, n);
+		if (f) {
+			calc_rttavg(d, NULL, tsince_hr(f));
+		} else {
+			calc_rttavg(d, NULL, tsince(n));
+			spin_unlock_irqrestore(&d->lock, flags);
+			aoedev_put(d);
+			snprintf(ebuf, sizeof(ebuf),
+				 "%15s e%d.%d    tag=%08x@%08lx s=%pm d=%pm\n",
+				 "unexpected rsp",
+				 get_unaligned_be16(&h->major),
+				 h->minor,
+				 get_unaligned_be32(&h->tag),
+				 jiffies,
+				 h->src,
+				 h->dst);
+			aoechr_error(ebuf);
+			return skb;
+		}
+	}
+	aoecmd_work(d);
+
+	spin_unlock_irqrestore(&d->lock, flags);
+
+	ktcomplete(f, skb);
+
+	/*
+	 * Note here that we do not perform an aoedev_put, as we are
+	 * leaving this reference for the ktio to release.
+	 */
+	return NULL;
+}
+
+void
+aoecmd_cfg(ushort aoemajor, unsigned char aoeminor)
+{
+	struct sk_buff_head queue;
+
+	__skb_queue_head_init(&queue);
+	aoecmd_cfg_pkts(aoemajor, aoeminor, &queue);
+	aoenet_xmit(&queue);
+}
+
+struct sk_buff *
+aoecmd_ata_id(struct aoedev *d)
+{
+	struct aoe_hdr *h;
+	struct aoe_atahdr *ah;
+	struct frame *f;
+	struct sk_buff *skb;
+	struct aoetgt *t;
+
+	f = newframe(d);
+	if (f == NULL)
+		return NULL;
+
+	t = *d->tgt;
+
+	/* initialize the headers & frame */
+	skb = f->skb;
+	h = (struct aoe_hdr *) skb_mac_header(skb);
+	ah = (struct aoe_atahdr *) (h+1);
+	skb_put(skb, sizeof *h + sizeof *ah);
+	memset(h, 0, skb->len);
+	f->tag = aoehdr_atainit(d, t, h);
+	fhash(f);
+	t->nout++;
+	f->waited = 0;
+	f->waited_total = 0;
+
+	/* set up ata header */
+	ah->scnt = 1;
+	ah->cmdstat = ATA_CMD_ID_ATA;
+	ah->lba3 = 0xa0;
+
+	skb->dev = t->ifp->nd;
+
+	d->rttavg = RTTAVG_INIT;
+	d->rttdev = RTTDEV_INIT;
+	d->timer.function = rexmit_timer;
+
+	skb = skb_clone(skb, GFP_ATOMIC);
+	if (skb)
+		f->sent = ktime_get();
+
+	return skb;
+}
+
+static struct aoetgt **
+grow_targets(struct aoedev *d)
+{
+	ulong oldn, newn;
+	struct aoetgt **tt;
+
+	oldn = d->ntargets;
+	newn = oldn * 2;
+	tt = kcalloc(newn, sizeof(*d->targets), GFP_ATOMIC);
+	if (!tt)
+		return NULL;
+	memmove(tt, d->targets, sizeof(*d->targets) * oldn);
+	d->tgt = tt + (d->tgt - d->targets);
+	kfree(d->targets);
+	d->targets = tt;
+	d->ntargets = newn;
+
+	return &d->targets[oldn];
+}
+
+static struct aoetgt *
+addtgt(struct aoedev *d, char *addr, ulong nframes)
+{
+	struct aoetgt *t, **tt, **te;
+
+	tt = d->targets;
+	te = tt + d->ntargets;
+	for (; tt < te && *tt; tt++)
+		;
+
+	if (tt == te) {
+		tt = grow_targets(d);
+		if (!tt)
+			goto nomem;
+	}
+	t = kzalloc(sizeof(*t), GFP_ATOMIC);
+	if (!t)
+		goto nomem;
+	t->nframes = nframes;
+	t->d = d;
+	memcpy(t->addr, addr, sizeof t->addr);
+	t->ifp = t->ifs;
+	aoecmd_wreset(t);
+	t->maxout = t->nframes / 2;
+	INIT_LIST_HEAD(&t->ffree);
+	return *tt = t;
+
+ nomem:
+	pr_info("aoe: cannot allocate memory to add target\n");
+	return NULL;
+}
+
+static void
+setdbcnt(struct aoedev *d)
+{
+	struct aoetgt **t, **e;
+	int bcnt = 0;
+
+	t = d->targets;
+	e = t + d->ntargets;
+	for (; t < e && *t; t++)
+		if (bcnt == 0 || bcnt > (*t)->minbcnt)
+			bcnt = (*t)->minbcnt;
+	if (bcnt != d->maxbcnt) {
+		d->maxbcnt = bcnt;
+		pr_info("aoe: e%ld.%d: setting %d byte data frames\n",
+			d->aoemajor, d->aoeminor, bcnt);
+	}
+}
+
+static void
+setifbcnt(struct aoetgt *t, struct net_device *nd, int bcnt)
+{
+	struct aoedev *d;
+	struct aoeif *p, *e;
+	int minbcnt;
+
+	d = t->d;
+	minbcnt = bcnt;
+	p = t->ifs;
+	e = p + NAOEIFS;
+	for (; p < e; p++) {
+		if (p->nd == NULL)
+			break;		/* end of the valid interfaces */
+		if (p->nd == nd) {
+			p->bcnt = bcnt;	/* we're updating */
+			nd = NULL;
+		} else if (minbcnt > p->bcnt)
+			minbcnt = p->bcnt; /* find the min interface */
+	}
+	if (nd) {
+		if (p == e) {
+			pr_err("aoe: device setifbcnt failure; too many interfaces.\n");
+			return;
+		}
+		dev_hold(nd);
+		p->nd = nd;
+		p->bcnt = bcnt;
+	}
+	t->minbcnt = minbcnt;
+	setdbcnt(d);
+}
+
+void
+aoecmd_cfg_rsp(struct sk_buff *skb)
+{
+	struct aoedev *d;
+	struct aoe_hdr *h;
+	struct aoe_cfghdr *ch;
+	struct aoetgt *t;
+	ulong flags, aoemajor;
+	struct sk_buff *sl;
+	struct sk_buff_head queue;
+	u16 n;
+
+	sl = NULL;
+	h = (struct aoe_hdr *) skb_mac_header(skb);
+	ch = (struct aoe_cfghdr *) (h+1);
+
+	/*
+	 * Enough people have their dip switches set backwards to
+	 * warrant a loud message for this special case.
+	 */
+	aoemajor = get_unaligned_be16(&h->major);
+	if (aoemajor == 0xfff) {
+		printk(KERN_ERR "aoe: Warning: shelf address is all ones.  "
+			"Check shelf dip switches.\n");
+		return;
+	}
+	if (aoemajor == 0xffff) {
+		pr_info("aoe: e%ld.%d: broadcast shelf number invalid\n",
+			aoemajor, (int) h->minor);
+		return;
+	}
+	if (h->minor == 0xff) {
+		pr_info("aoe: e%ld.%d: broadcast slot number invalid\n",
+			aoemajor, (int) h->minor);
+		return;
+	}
+
+	n = be16_to_cpu(ch->bufcnt);
+	if (n > aoe_maxout)	/* keep it reasonable */
+		n = aoe_maxout;
+
+	d = aoedev_by_aoeaddr(aoemajor, h->minor, 1);
+	if (d == NULL) {
+		pr_info("aoe: device allocation failure\n");
+		return;
+	}
+
+	spin_lock_irqsave(&d->lock, flags);
+
+	t = gettgt(d, h->src);
+	if (t) {
+		t->nframes = n;
+		if (n < t->maxout)
+			aoecmd_wreset(t);
+	} else {
+		t = addtgt(d, h->src, n);
+		if (!t)
+			goto bail;
+	}
+	n = skb->dev->mtu;
+	n -= sizeof(struct aoe_hdr) + sizeof(struct aoe_atahdr);
+	n /= 512;
+	if (n > ch->scnt)
+		n = ch->scnt;
+	n = n ? n * 512 : DEFAULTBCNT;
+	setifbcnt(t, skb->dev, n);
+
+	/* don't change users' perspective */
+	if (d->nopen == 0) {
+		d->fw_ver = be16_to_cpu(ch->fwver);
+		sl = aoecmd_ata_id(d);
+	}
+bail:
+	spin_unlock_irqrestore(&d->lock, flags);
+	aoedev_put(d);
+	if (sl) {
+		__skb_queue_head_init(&queue);
+		__skb_queue_tail(&queue, sl);
+		aoenet_xmit(&queue);
+	}
+}
+
+void
+aoecmd_wreset(struct aoetgt *t)
+{
+	t->maxout = 1;
+	t->ssthresh = t->nframes / 2;
+	t->next_cwnd = t->nframes;
+}
+
+void
+aoecmd_cleanslate(struct aoedev *d)
+{
+	struct aoetgt **t, **te;
+
+	d->rttavg = RTTAVG_INIT;
+	d->rttdev = RTTDEV_INIT;
+	d->maxbcnt = 0;
+
+	t = d->targets;
+	te = t + d->ntargets;
+	for (; t < te && *t; t++)
+		aoecmd_wreset(*t);
+}
+
+void
+aoe_failbuf(struct aoedev *d, struct buf *buf)
+{
+	if (buf == NULL)
+		return;
+	buf->iter.bi_size = 0;
+	buf->bio->bi_status = BLK_STS_IOERR;
+	if (buf->nframesout == 0)
+		aoe_end_buf(d, buf);
+}
+
+void
+aoe_flush_iocq(void)
+{
+	int i;
+
+	for (i = 0; i < ncpus; i++) {
+		if (kts[i].active)
+			aoe_flush_iocq_by_index(i);
+	}
+}
+
+void
+aoe_flush_iocq_by_index(int id)
+{
+	struct frame *f;
+	struct aoedev *d;
+	LIST_HEAD(flist);
+	struct list_head *pos;
+	struct sk_buff *skb;
+	ulong flags;
+
+	spin_lock_irqsave(&iocq[id].lock, flags);
+	list_splice_init(&iocq[id].head, &flist);
+	spin_unlock_irqrestore(&iocq[id].lock, flags);
+	while (!list_empty(&flist)) {
+		pos = flist.next;
+		list_del(pos);
+		f = list_entry(pos, struct frame, head);
+		d = f->t->d;
+		skb = f->r_skb;
+		spin_lock_irqsave(&d->lock, flags);
+		if (f->buf) {
+			f->buf->nframesout--;
+			aoe_failbuf(d, f->buf);
+		}
+		aoe_freetframe(f);
+		spin_unlock_irqrestore(&d->lock, flags);
+		dev_kfree_skb(skb);
+		aoedev_put(d);
+	}
+}
+
+int __init
+aoecmd_init(void)
+{
+	void *p;
+	int i;
+	int ret;
+
+	/* get_zeroed_page returns page with ref count 1 */
+	p = (void *) get_zeroed_page(GFP_KERNEL);
+	if (!p)
+		return -ENOMEM;
+	empty_page = virt_to_page(p);
+
+	ncpus = num_online_cpus();
+
+	iocq = kcalloc(ncpus, sizeof(struct iocq_ktio), GFP_KERNEL);
+	if (!iocq)
+		return -ENOMEM;
+
+	kts = kcalloc(ncpus, sizeof(struct ktstate), GFP_KERNEL);
+	if (!kts) {
+		ret = -ENOMEM;
+		goto kts_fail;
+	}
+
+	ktiowq = kcalloc(ncpus, sizeof(wait_queue_head_t), GFP_KERNEL);
+	if (!ktiowq) {
+		ret = -ENOMEM;
+		goto ktiowq_fail;
+	}
+
+	for (i = 0; i < ncpus; i++) {
+		INIT_LIST_HEAD(&iocq[i].head);
+		spin_lock_init(&iocq[i].lock);
+		init_waitqueue_head(&ktiowq[i]);
+		snprintf(kts[i].name, sizeof(kts[i].name), "aoe_ktio%d", i);
+		kts[i].fn = ktio;
+		kts[i].waitq = &ktiowq[i];
+		kts[i].lock = &iocq[i].lock;
+		kts[i].id = i;
+		kts[i].active = 0;
+	}
+	kts[0].active = 1;
+	if (aoe_ktstart(&kts[0])) {
+		ret = -ENOMEM;
+		goto ktstart_fail;
+	}
+	return 0;
+
+ktstart_fail:
+	kfree(ktiowq);
+ktiowq_fail:
+	kfree(kts);
+kts_fail:
+	kfree(iocq);
+
+	return ret;
+}
+
+void
+aoecmd_exit(void)
+{
+	int i;
+
+	for (i = 0; i < ncpus; i++)
+		if (kts[i].active)
+			aoe_ktstop(&kts[i]);
+
+	aoe_flush_iocq();
+
+	/* Free up the iocq and thread speicific configuration
+	* allocated during startup.
+	*/
+	kfree(iocq);
+	kfree(kts);
+	kfree(ktiowq);
+
+	free_page((unsigned long) page_address(empty_page));
+	empty_page = NULL;
+}
diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c
new file mode 100644
index 000000000..3523dd82d
--- /dev/null
+++ b/drivers/block/aoe/aoedev.c
@@ -0,0 +1,531 @@
+/* Copyright (c) 2013 Coraid, Inc.  See COPYING for GPL terms. */
+/*
+ * aoedev.c
+ * AoE device utility functions; maintains device list.
+ */
+
+#include <linux/hdreg.h>
+#include <linux/blk-mq.h>
+#include <linux/netdevice.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/bitmap.h>
+#include <linux/kdev_t.h>
+#include <linux/moduleparam.h>
+#include <linux/string.h>
+#include "aoe.h"
+
+static void freetgt(struct aoedev *d, struct aoetgt *t);
+static void skbpoolfree(struct aoedev *d);
+
+static int aoe_dyndevs = 1;
+module_param(aoe_dyndevs, int, 0644);
+MODULE_PARM_DESC(aoe_dyndevs, "Use dynamic minor numbers for devices.");
+
+static struct aoedev *devlist;
+static DEFINE_SPINLOCK(devlist_lock);
+
+/* Because some systems will have one, many, or no
+ *   - partitions,
+ *   - slots per shelf,
+ *   - or shelves,
+ * we need some flexibility in the way the minor numbers
+ * are allocated.  So they are dynamic.
+ */
+#define N_DEVS ((1U<<MINORBITS)/AOE_PARTITIONS)
+
+static DEFINE_SPINLOCK(used_minors_lock);
+static DECLARE_BITMAP(used_minors, N_DEVS);
+
+static int
+minor_get_dyn(ulong *sysminor)
+{
+	ulong flags;
+	ulong n;
+	int error = 0;
+
+	spin_lock_irqsave(&used_minors_lock, flags);
+	n = find_first_zero_bit(used_minors, N_DEVS);
+	if (n < N_DEVS)
+		set_bit(n, used_minors);
+	else
+		error = -1;
+	spin_unlock_irqrestore(&used_minors_lock, flags);
+
+	*sysminor = n * AOE_PARTITIONS;
+	return error;
+}
+
+static int
+minor_get_static(ulong *sysminor, ulong aoemaj, int aoemin)
+{
+	ulong flags;
+	ulong n;
+	int error = 0;
+	enum {
+		/* for backwards compatibility when !aoe_dyndevs,
+		 * a static number of supported slots per shelf */
+		NPERSHELF = 16,
+	};
+
+	if (aoemin >= NPERSHELF) {
+		pr_err("aoe: %s %d slots per shelf\n",
+			"static minor device numbers support only",
+			NPERSHELF);
+		error = -1;
+		goto out;
+	}
+
+	n = aoemaj * NPERSHELF + aoemin;
+	if (n >= N_DEVS) {
+		pr_err("aoe: %s with e%ld.%d\n",
+			"cannot use static minor device numbers",
+			aoemaj, aoemin);
+		error = -1;
+		goto out;
+	}
+
+	spin_lock_irqsave(&used_minors_lock, flags);
+	if (test_bit(n, used_minors)) {
+		pr_err("aoe: %s %lu\n",
+			"existing device already has static minor number",
+			n);
+		error = -1;
+	} else
+		set_bit(n, used_minors);
+	spin_unlock_irqrestore(&used_minors_lock, flags);
+	*sysminor = n * AOE_PARTITIONS;
+out:
+	return error;
+}
+
+static int
+minor_get(ulong *sysminor, ulong aoemaj, int aoemin)
+{
+	if (aoe_dyndevs)
+		return minor_get_dyn(sysminor);
+	else
+		return minor_get_static(sysminor, aoemaj, aoemin);
+}
+
+static void
+minor_free(ulong minor)
+{
+	ulong flags;
+
+	minor /= AOE_PARTITIONS;
+	BUG_ON(minor >= N_DEVS);
+
+	spin_lock_irqsave(&used_minors_lock, flags);
+	BUG_ON(!test_bit(minor, used_minors));
+	clear_bit(minor, used_minors);
+	spin_unlock_irqrestore(&used_minors_lock, flags);
+}
+
+/*
+ * Users who grab a pointer to the device with aoedev_by_aoeaddr
+ * automatically get a reference count and must be responsible
+ * for performing a aoedev_put.  With the addition of async
+ * kthread processing I'm no longer confident that we can
+ * guarantee consistency in the face of device flushes.
+ *
+ * For the time being, we only bother to add extra references for
+ * frames sitting on the iocq.  When the kthreads finish processing
+ * these frames, they will aoedev_put the device.
+ */
+
+void
+aoedev_put(struct aoedev *d)
+{
+	ulong flags;
+
+	spin_lock_irqsave(&devlist_lock, flags);
+	d->ref--;
+	spin_unlock_irqrestore(&devlist_lock, flags);
+}
+
+static void
+dummy_timer(struct timer_list *t)
+{
+	struct aoedev *d;
+
+	d = from_timer(d, t, timer);
+	if (d->flags & DEVFL_TKILL)
+		return;
+	d->timer.expires = jiffies + HZ;
+	add_timer(&d->timer);
+}
+
+static void
+aoe_failip(struct aoedev *d)
+{
+	struct request *rq;
+	struct aoe_req *req;
+	struct bio *bio;
+
+	aoe_failbuf(d, d->ip.buf);
+	rq = d->ip.rq;
+	if (rq == NULL)
+		return;
+
+	req = blk_mq_rq_to_pdu(rq);
+	while ((bio = d->ip.nxbio)) {
+		bio->bi_status = BLK_STS_IOERR;
+		d->ip.nxbio = bio->bi_next;
+		req->nr_bios--;
+	}
+
+	if (!req->nr_bios)
+		aoe_end_request(d, rq, 0);
+}
+
+static void
+downdev_frame(struct list_head *pos)
+{
+	struct frame *f;
+
+	f = list_entry(pos, struct frame, head);
+	list_del(pos);
+	if (f->buf) {
+		f->buf->nframesout--;
+		aoe_failbuf(f->t->d, f->buf);
+	}
+	aoe_freetframe(f);
+}
+
+void
+aoedev_downdev(struct aoedev *d)
+{
+	struct aoetgt *t, **tt, **te;
+	struct list_head *head, *pos, *nx;
+	int i;
+
+	d->flags &= ~DEVFL_UP;
+
+	/* clean out active and to-be-retransmitted buffers */
+	for (i = 0; i < NFACTIVE; i++) {
+		head = &d->factive[i];
+		list_for_each_safe(pos, nx, head)
+			downdev_frame(pos);
+	}
+	head = &d->rexmitq;
+	list_for_each_safe(pos, nx, head)
+		downdev_frame(pos);
+
+	/* reset window dressings */
+	tt = d->targets;
+	te = tt + d->ntargets;
+	for (; tt < te && (t = *tt); tt++) {
+		aoecmd_wreset(t);
+		t->nout = 0;
+	}
+
+	/* clean out the in-process request (if any) */
+	aoe_failip(d);
+
+	/* fast fail all pending I/O */
+	if (d->blkq) {
+		/* UP is cleared, freeze+quiesce to insure all are errored */
+		blk_mq_freeze_queue(d->blkq);
+		blk_mq_quiesce_queue(d->blkq);
+		blk_mq_unquiesce_queue(d->blkq);
+		blk_mq_unfreeze_queue(d->blkq);
+	}
+
+	if (d->gd)
+		set_capacity(d->gd, 0);
+}
+
+/* return whether the user asked for this particular
+ * device to be flushed
+ */
+static int
+user_req(char *s, size_t slen, struct aoedev *d)
+{
+	const char *p;
+	size_t lim;
+
+	if (!d->gd)
+		return 0;
+	p = kbasename(d->gd->disk_name);
+	lim = sizeof(d->gd->disk_name);
+	lim -= p - d->gd->disk_name;
+	if (slen < lim)
+		lim = slen;
+
+	return !strncmp(s, p, lim);
+}
+
+static void
+freedev(struct aoedev *d)
+{
+	struct aoetgt **t, **e;
+	int freeing = 0;
+	unsigned long flags;
+
+	spin_lock_irqsave(&d->lock, flags);
+	if (d->flags & DEVFL_TKILL
+	&& !(d->flags & DEVFL_FREEING)) {
+		d->flags |= DEVFL_FREEING;
+		freeing = 1;
+	}
+	spin_unlock_irqrestore(&d->lock, flags);
+	if (!freeing)
+		return;
+
+	del_timer_sync(&d->timer);
+	if (d->gd) {
+		aoedisk_rm_debugfs(d);
+		del_gendisk(d->gd);
+		put_disk(d->gd);
+		blk_mq_free_tag_set(&d->tag_set);
+	}
+	t = d->targets;
+	e = t + d->ntargets;
+	for (; t < e && *t; t++)
+		freetgt(d, *t);
+
+	mempool_destroy(d->bufpool);
+	skbpoolfree(d);
+	minor_free(d->sysminor);
+
+	spin_lock_irqsave(&d->lock, flags);
+	d->flags |= DEVFL_FREED;
+	spin_unlock_irqrestore(&d->lock, flags);
+}
+
+enum flush_parms {
+	NOT_EXITING = 0,
+	EXITING = 1,
+};
+
+static int
+flush(const char __user *str, size_t cnt, int exiting)
+{
+	ulong flags;
+	struct aoedev *d, **dd;
+	char buf[16];
+	int all = 0;
+	int specified = 0;	/* flush a specific device */
+	unsigned int skipflags;
+
+	skipflags = DEVFL_GDALLOC | DEVFL_NEWSIZE | DEVFL_TKILL;
+
+	if (!exiting && cnt >= 3) {
+		if (cnt > sizeof buf)
+			cnt = sizeof buf;
+		if (copy_from_user(buf, str, cnt))
+			return -EFAULT;
+		all = !strncmp(buf, "all", 3);
+		if (!all)
+			specified = 1;
+	}
+
+	flush_workqueue(aoe_wq);
+	/* pass one: do aoedev_downdev, which might sleep */
+restart1:
+	spin_lock_irqsave(&devlist_lock, flags);
+	for (d = devlist; d; d = d->next) {
+		spin_lock(&d->lock);
+		if (d->flags & DEVFL_TKILL)
+			goto cont;
+
+		if (exiting) {
+			/* unconditionally take each device down */
+		} else if (specified) {
+			if (!user_req(buf, cnt, d))
+				goto cont;
+		} else if ((!all && (d->flags & DEVFL_UP))
+		|| d->flags & skipflags
+		|| d->nopen
+		|| d->ref)
+			goto cont;
+
+		spin_unlock(&d->lock);
+		spin_unlock_irqrestore(&devlist_lock, flags);
+		aoedev_downdev(d);
+		d->flags |= DEVFL_TKILL;
+		goto restart1;
+cont:
+		spin_unlock(&d->lock);
+	}
+	spin_unlock_irqrestore(&devlist_lock, flags);
+
+	/* pass two: call freedev, which might sleep,
+	 * for aoedevs marked with DEVFL_TKILL
+	 */
+restart2:
+	spin_lock_irqsave(&devlist_lock, flags);
+	for (d = devlist; d; d = d->next) {
+		spin_lock(&d->lock);
+		if (d->flags & DEVFL_TKILL
+		&& !(d->flags & DEVFL_FREEING)) {
+			spin_unlock(&d->lock);
+			spin_unlock_irqrestore(&devlist_lock, flags);
+			freedev(d);
+			goto restart2;
+		}
+		spin_unlock(&d->lock);
+	}
+
+	/* pass three: remove aoedevs marked with DEVFL_FREED */
+	for (dd = &devlist, d = *dd; d; d = *dd) {
+		struct aoedev *doomed = NULL;
+
+		spin_lock(&d->lock);
+		if (d->flags & DEVFL_FREED) {
+			*dd = d->next;
+			doomed = d;
+		} else {
+			dd = &d->next;
+		}
+		spin_unlock(&d->lock);
+		if (doomed)
+			kfree(doomed->targets);
+		kfree(doomed);
+	}
+	spin_unlock_irqrestore(&devlist_lock, flags);
+
+	return 0;
+}
+
+int
+aoedev_flush(const char __user *str, size_t cnt)
+{
+	return flush(str, cnt, NOT_EXITING);
+}
+
+/* This has been confirmed to occur once with Tms=3*1000 due to the
+ * driver changing link and not processing its transmit ring.  The
+ * problem is hard enough to solve by returning an error that I'm
+ * still punting on "solving" this.
+ */
+static void
+skbfree(struct sk_buff *skb)
+{
+	enum { Sms = 250, Tms = 30 * 1000};
+	int i = Tms / Sms;
+
+	if (skb == NULL)
+		return;
+	while (atomic_read(&skb_shinfo(skb)->dataref) != 1 && i-- > 0)
+		msleep(Sms);
+	if (i < 0) {
+		printk(KERN_ERR
+			"aoe: %s holds ref: %s\n",
+			skb->dev ? skb->dev->name : "netif",
+			"cannot free skb -- memory leaked.");
+		return;
+	}
+	skb->truesize -= skb->data_len;
+	skb_shinfo(skb)->nr_frags = skb->data_len = 0;
+	skb_trim(skb, 0);
+	dev_kfree_skb(skb);
+}
+
+static void
+skbpoolfree(struct aoedev *d)
+{
+	struct sk_buff *skb, *tmp;
+
+	skb_queue_walk_safe(&d->skbpool, skb, tmp)
+		skbfree(skb);
+
+	__skb_queue_head_init(&d->skbpool);
+}
+
+/* find it or allocate it */
+struct aoedev *
+aoedev_by_aoeaddr(ulong maj, int min, int do_alloc)
+{
+	struct aoedev *d;
+	int i;
+	ulong flags;
+	ulong sysminor = 0;
+
+	spin_lock_irqsave(&devlist_lock, flags);
+
+	for (d=devlist; d; d=d->next)
+		if (d->aoemajor == maj && d->aoeminor == min) {
+			spin_lock(&d->lock);
+			if (d->flags & DEVFL_TKILL) {
+				spin_unlock(&d->lock);
+				d = NULL;
+				goto out;
+			}
+			d->ref++;
+			spin_unlock(&d->lock);
+			break;
+		}
+	if (d || !do_alloc || minor_get(&sysminor, maj, min) < 0)
+		goto out;
+	d = kcalloc(1, sizeof *d, GFP_ATOMIC);
+	if (!d)
+		goto out;
+	d->targets = kcalloc(NTARGETS, sizeof(*d->targets), GFP_ATOMIC);
+	if (!d->targets) {
+		kfree(d);
+		d = NULL;
+		goto out;
+	}
+	d->ntargets = NTARGETS;
+	INIT_WORK(&d->work, aoecmd_sleepwork);
+	spin_lock_init(&d->lock);
+	INIT_LIST_HEAD(&d->rq_list);
+	skb_queue_head_init(&d->skbpool);
+	timer_setup(&d->timer, dummy_timer, 0);
+	d->timer.expires = jiffies + HZ;
+	add_timer(&d->timer);
+	d->bufpool = NULL;	/* defer to aoeblk_gdalloc */
+	d->tgt = d->targets;
+	d->ref = 1;
+	for (i = 0; i < NFACTIVE; i++)
+		INIT_LIST_HEAD(&d->factive[i]);
+	INIT_LIST_HEAD(&d->rexmitq);
+	d->sysminor = sysminor;
+	d->aoemajor = maj;
+	d->aoeminor = min;
+	d->rttavg = RTTAVG_INIT;
+	d->rttdev = RTTDEV_INIT;
+	d->next = devlist;
+	devlist = d;
+ out:
+	spin_unlock_irqrestore(&devlist_lock, flags);
+	return d;
+}
+
+static void
+freetgt(struct aoedev *d, struct aoetgt *t)
+{
+	struct frame *f;
+	struct list_head *pos, *nx, *head;
+	struct aoeif *ifp;
+
+	for (ifp = t->ifs; ifp < &t->ifs[NAOEIFS]; ++ifp) {
+		if (!ifp->nd)
+			break;
+		dev_put(ifp->nd);
+	}
+
+	head = &t->ffree;
+	list_for_each_safe(pos, nx, head) {
+		list_del(pos);
+		f = list_entry(pos, struct frame, head);
+		skbfree(f->skb);
+		kfree(f);
+	}
+	kfree(t);
+}
+
+void
+aoedev_exit(void)
+{
+	flush_workqueue(aoe_wq);
+	flush(NULL, 0, EXITING);
+}
+
+int __init
+aoedev_init(void)
+{
+	return 0;
+}
diff --git a/drivers/block/aoe/aoemain.c b/drivers/block/aoe/aoemain.c
new file mode 100644
index 000000000..6238c4c87
--- /dev/null
+++ b/drivers/block/aoe/aoemain.c
@@ -0,0 +1,95 @@
+/* Copyright (c) 2012 Coraid, Inc.  See COPYING for GPL terms. */
+/*
+ * aoemain.c
+ * Module initialization routines, discover timer
+ */
+
+#include <linux/hdreg.h>
+#include <linux/blkdev.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include "aoe.h"
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Sam Hopkins <sah@coraid.com>");
+MODULE_DESCRIPTION("AoE block/char driver for 2.6.2 and newer 2.6 kernels");
+MODULE_VERSION(VERSION);
+
+static struct timer_list timer;
+struct workqueue_struct *aoe_wq;
+
+static void discover_timer(struct timer_list *t)
+{
+	mod_timer(t, jiffies + HZ * 60); /* one minute */
+
+	aoecmd_cfg(0xffff, 0xff);
+}
+
+static void __exit
+aoe_exit(void)
+{
+	del_timer_sync(&timer);
+
+	aoenet_exit();
+	unregister_blkdev(AOE_MAJOR, DEVICE_NAME);
+	aoecmd_exit();
+	aoechr_exit();
+	aoedev_exit();
+	aoeblk_exit();		/* free cache after de-allocating bufs */
+	destroy_workqueue(aoe_wq);
+}
+
+static int __init
+aoe_init(void)
+{
+	int ret;
+
+	aoe_wq = alloc_workqueue("aoe_wq", 0, 0);
+	if (!aoe_wq)
+		return -ENOMEM;
+
+	ret = aoedev_init();
+	if (ret)
+		goto dev_fail;
+	ret = aoechr_init();
+	if (ret)
+		goto chr_fail;
+	ret = aoeblk_init();
+	if (ret)
+		goto blk_fail;
+	ret = aoenet_init();
+	if (ret)
+		goto net_fail;
+	ret = aoecmd_init();
+	if (ret)
+		goto cmd_fail;
+	ret = register_blkdev(AOE_MAJOR, DEVICE_NAME);
+	if (ret < 0) {
+		printk(KERN_ERR "aoe: can't register major\n");
+		goto blkreg_fail;
+	}
+	printk(KERN_INFO "aoe: AoE v%s initialised.\n", VERSION);
+
+	timer_setup(&timer, discover_timer, 0);
+	discover_timer(&timer);
+	return 0;
+ blkreg_fail:
+	aoecmd_exit();
+ cmd_fail:
+	aoenet_exit();
+ net_fail:
+	aoeblk_exit();
+ blk_fail:
+	aoechr_exit();
+ chr_fail:
+	aoedev_exit();
+ dev_fail:
+	destroy_workqueue(aoe_wq);
+
+	printk(KERN_INFO "aoe: initialisation failure.\n");
+	return ret;
+}
+
+module_init(aoe_init);
+module_exit(aoe_exit);
+
diff --git a/drivers/block/aoe/aoenet.c b/drivers/block/aoe/aoenet.c
new file mode 100644
index 000000000..63773a905
--- /dev/null
+++ b/drivers/block/aoe/aoenet.c
@@ -0,0 +1,223 @@
+/* Copyright (c) 2013 Coraid, Inc.  See COPYING for GPL terms. */
+/*
+ * aoenet.c
+ * Ethernet portion of AoE driver
+ */
+
+#include <linux/gfp.h>
+#include <linux/hdreg.h>
+#include <linux/blkdev.h>
+#include <linux/netdevice.h>
+#include <linux/moduleparam.h>
+#include <net/net_namespace.h>
+#include <asm/unaligned.h>
+#include "aoe.h"
+
+#define NECODES 5
+
+static char *aoe_errlist[] =
+{
+	"no such error",
+	"unrecognized command code",
+	"bad argument parameter",
+	"device unavailable",
+	"config string present",
+	"unsupported version"
+};
+
+enum {
+	IFLISTSZ = 1024,
+};
+
+static char aoe_iflist[IFLISTSZ];
+module_param_string(aoe_iflist, aoe_iflist, IFLISTSZ, 0600);
+MODULE_PARM_DESC(aoe_iflist, "aoe_iflist=dev1[,dev2...]");
+
+static wait_queue_head_t txwq;
+static struct ktstate kts;
+
+#ifndef MODULE
+static int __init aoe_iflist_setup(char *str)
+{
+	strncpy(aoe_iflist, str, IFLISTSZ);
+	aoe_iflist[IFLISTSZ - 1] = '\0';
+	return 1;
+}
+
+__setup("aoe_iflist=", aoe_iflist_setup);
+#endif
+
+static spinlock_t txlock;
+static struct sk_buff_head skbtxq;
+
+/* enters with txlock held */
+static int
+tx(int id) __must_hold(&txlock)
+{
+	struct sk_buff *skb;
+	struct net_device *ifp;
+
+	while ((skb = skb_dequeue(&skbtxq))) {
+		spin_unlock_irq(&txlock);
+		ifp = skb->dev;
+		if (dev_queue_xmit(skb) == NET_XMIT_DROP && net_ratelimit())
+			pr_warn("aoe: packet could not be sent on %s.  %s\n",
+				ifp ? ifp->name : "netif",
+				"consider increasing tx_queue_len");
+		spin_lock_irq(&txlock);
+	}
+	return 0;
+}
+
+int
+is_aoe_netif(struct net_device *ifp)
+{
+	register char *p, *q;
+	register int len;
+
+	if (aoe_iflist[0] == '\0')
+		return 1;
+
+	p = aoe_iflist + strspn(aoe_iflist, WHITESPACE);
+	for (; *p; p = q + strspn(q, WHITESPACE)) {
+		q = p + strcspn(p, WHITESPACE);
+		if (q != p)
+			len = q - p;
+		else
+			len = strlen(p); /* last token in aoe_iflist */
+
+		if (strlen(ifp->name) == len && !strncmp(ifp->name, p, len))
+			return 1;
+		if (q == p)
+			break;
+	}
+
+	return 0;
+}
+
+int
+set_aoe_iflist(const char __user *user_str, size_t size)
+{
+	if (size >= IFLISTSZ)
+		return -EINVAL;
+
+	if (copy_from_user(aoe_iflist, user_str, size)) {
+		printk(KERN_INFO "aoe: copy from user failed\n");
+		return -EFAULT;
+	}
+	aoe_iflist[size] = 0x00;
+	return 0;
+}
+
+void
+aoenet_xmit(struct sk_buff_head *queue)
+{
+	struct sk_buff *skb, *tmp;
+	ulong flags;
+
+	skb_queue_walk_safe(queue, skb, tmp) {
+		__skb_unlink(skb, queue);
+		spin_lock_irqsave(&txlock, flags);
+		skb_queue_tail(&skbtxq, skb);
+		spin_unlock_irqrestore(&txlock, flags);
+		wake_up(&txwq);
+	}
+}
+
+/*
+ * (1) len doesn't include the header by default.  I want this.
+ */
+static int
+aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt, struct net_device *orig_dev)
+{
+	struct aoe_hdr *h;
+	struct aoe_atahdr *ah;
+	u32 n;
+	int sn;
+
+	if (dev_net(ifp) != &init_net)
+		goto exit;
+
+	skb = skb_share_check(skb, GFP_ATOMIC);
+	if (skb == NULL)
+		return 0;
+	if (!is_aoe_netif(ifp))
+		goto exit;
+	skb_push(skb, ETH_HLEN);	/* (1) */
+	sn = sizeof(*h) + sizeof(*ah);
+	if (skb->len >= sn) {
+		sn -= skb_headlen(skb);
+		if (sn > 0 && !__pskb_pull_tail(skb, sn))
+			goto exit;
+	}
+	h = (struct aoe_hdr *) skb->data;
+	n = get_unaligned_be32(&h->tag);
+	if ((h->verfl & AOEFL_RSP) == 0 || (n & 1<<31))
+		goto exit;
+
+	if (h->verfl & AOEFL_ERR) {
+		n = h->err;
+		if (n > NECODES)
+			n = 0;
+		if (net_ratelimit())
+			printk(KERN_ERR
+				"%s%d.%d@%s; ecode=%d '%s'\n",
+				"aoe: error packet from ",
+				get_unaligned_be16(&h->major),
+				h->minor, skb->dev->name,
+				h->err, aoe_errlist[n]);
+		goto exit;
+	}
+
+	switch (h->cmd) {
+	case AOECMD_ATA:
+		/* ata_rsp may keep skb for later processing or give it back */
+		skb = aoecmd_ata_rsp(skb);
+		break;
+	case AOECMD_CFG:
+		aoecmd_cfg_rsp(skb);
+		break;
+	default:
+		if (h->cmd >= AOECMD_VEND_MIN)
+			break;	/* don't complain about vendor commands */
+		pr_info("aoe: unknown AoE command type 0x%02x\n", h->cmd);
+		break;
+	}
+
+	if (!skb)
+		return 0;
+exit:
+	dev_kfree_skb(skb);
+	return 0;
+}
+
+static struct packet_type aoe_pt __read_mostly = {
+	.type = __constant_htons(ETH_P_AOE),
+	.func = aoenet_rcv,
+};
+
+int __init
+aoenet_init(void)
+{
+	skb_queue_head_init(&skbtxq);
+	init_waitqueue_head(&txwq);
+	spin_lock_init(&txlock);
+	kts.lock = &txlock;
+	kts.fn = tx;
+	kts.waitq = &txwq;
+	kts.id = 0;
+	snprintf(kts.name, sizeof(kts.name), "aoe_tx%d", kts.id);
+	if (aoe_ktstart(&kts))
+		return -EAGAIN;
+	dev_add_pack(&aoe_pt);
+	return 0;
+}
+
+void
+aoenet_exit(void)
+{
+	aoe_ktstop(&kts);
+	skb_queue_purge(&skbtxq);
+	dev_remove_pack(&aoe_pt);
+}
+
diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c
new file mode 100644
index 000000000..9deb4df6b
--- /dev/null
+++ b/drivers/block/ataflop.c
@@ -0,0 +1,2199 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *  drivers/block/ataflop.c
+ *
+ *  Copyright (C) 1993  Greg Harp
+ *  Atari Support by Bjoern Brauel, Roman Hodek
+ *
+ *  Big cleanup Sep 11..14 1994 Roman Hodek:
+ *   - Driver now works interrupt driven
+ *   - Support for two drives; should work, but I cannot test that :-(
+ *   - Reading is done in whole tracks and buffered to speed up things
+ *   - Disk change detection and drive deselecting after motor-off
+ *     similar to TOS
+ *   - Autodetection of disk format (DD/HD); untested yet, because I
+ *     don't have an HD drive :-(
+ *
+ *  Fixes Nov 13 1994 Martin Schaller:
+ *   - Autodetection works now
+ *   - Support for 5 1/4'' disks
+ *   - Removed drive type (unknown on atari)
+ *   - Do seeks with 8 Mhz
+ *
+ *  Changes by Andreas Schwab:
+ *   - After errors in multiple read mode try again reading single sectors
+ *  (Feb 1995):
+ *   - Clean up error handling
+ *   - Set blk_size for proper size checking
+ *   - Initialize track register when testing presence of floppy
+ *   - Implement some ioctl's
+ *
+ *  Changes by Torsten Lang:
+ *   - When probing the floppies we should add the FDCCMDADD_H flag since
+ *     the FDC will otherwise wait forever when no disk is inserted...
+ *
+ * ++ Freddi Aschwanden (fa) 20.9.95 fixes for medusa:
+ *  - MFPDELAY() after each FDC access -> atari 
+ *  - more/other disk formats
+ *  - DMA to the block buffer directly if we have a 32bit DMA
+ *  - for medusa, the step rate is always 3ms
+ *  - on medusa, use only cache_push()
+ * Roman:
+ *  - Make disk format numbering independent from minors
+ *  - Let user set max. supported drive type (speeds up format
+ *    detection, saves buffer space)
+ *
+ * Roman 10/15/95:
+ *  - implement some more ioctls
+ *  - disk formatting
+ *  
+ * Andreas 95/12/12:
+ *  - increase gap size at start of track for HD/ED disks
+ *
+ * Michael (MSch) 11/07/96:
+ *  - implemented FDSETPRM and FDDEFPRM ioctl
+ *
+ * Andreas (97/03/19):
+ *  - implemented missing BLK* ioctls
+ *
+ *  Things left to do:
+ *   - Formatting
+ *   - Maybe a better strategy for disk change detection (does anyone
+ *     know one?)
+ */
+
+#include <linux/module.h>
+
+#include <linux/fd.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/blk-mq.h>
+#include <linux/major.h>
+#include <linux/mutex.h>
+#include <linux/completion.h>
+#include <linux/wait.h>
+
+#include <asm/atariints.h>
+#include <asm/atari_stdma.h>
+#include <asm/atari_stram.h>
+
+#define	FD_MAX_UNITS 2
+
+#undef DEBUG
+
+static DEFINE_MUTEX(ataflop_mutex);
+static struct request *fd_request;
+
+/*
+ * WD1772 stuff
+ */
+
+/* register codes */
+
+#define FDCSELREG_STP   (0x80)   /* command/status register */
+#define FDCSELREG_TRA   (0x82)   /* track register */
+#define FDCSELREG_SEC   (0x84)   /* sector register */
+#define FDCSELREG_DTA   (0x86)   /* data register */
+
+/* register names for FDC_READ/WRITE macros */
+
+#define FDCREG_CMD		0
+#define FDCREG_STATUS	0
+#define FDCREG_TRACK	2
+#define FDCREG_SECTOR	4
+#define FDCREG_DATA		6
+
+/* command opcodes */
+
+#define FDCCMD_RESTORE  (0x00)   /*  -                   */
+#define FDCCMD_SEEK     (0x10)   /*   |                  */
+#define FDCCMD_STEP     (0x20)   /*   |  TYP 1 Commands  */
+#define FDCCMD_STIN     (0x40)   /*   |                  */
+#define FDCCMD_STOT     (0x60)   /*  -                   */
+#define FDCCMD_RDSEC    (0x80)   /*  -   TYP 2 Commands  */
+#define FDCCMD_WRSEC    (0xa0)   /*  -          "        */
+#define FDCCMD_RDADR    (0xc0)   /*  -                   */
+#define FDCCMD_RDTRA    (0xe0)   /*   |  TYP 3 Commands  */
+#define FDCCMD_WRTRA    (0xf0)   /*  -                   */
+#define FDCCMD_FORCI    (0xd0)   /*  -   TYP 4 Command   */
+
+/* command modifier bits */
+
+#define FDCCMDADD_SR6   (0x00)   /* step rate settings */
+#define FDCCMDADD_SR12  (0x01)
+#define FDCCMDADD_SR2   (0x02)
+#define FDCCMDADD_SR3   (0x03)
+#define FDCCMDADD_V     (0x04)   /* verify */
+#define FDCCMDADD_H     (0x08)   /* wait for spin-up */
+#define FDCCMDADD_U     (0x10)   /* update track register */
+#define FDCCMDADD_M     (0x10)   /* multiple sector access */
+#define FDCCMDADD_E     (0x04)   /* head settling flag */
+#define FDCCMDADD_P     (0x02)   /* precompensation off */
+#define FDCCMDADD_A0    (0x01)   /* DAM flag */
+
+/* status register bits */
+
+#define	FDCSTAT_MOTORON	(0x80)   /* motor on */
+#define	FDCSTAT_WPROT	(0x40)   /* write protected (FDCCMD_WR*) */
+#define	FDCSTAT_SPINUP	(0x20)   /* motor speed stable (Type I) */
+#define	FDCSTAT_DELDAM	(0x20)   /* sector has deleted DAM (Type II+III) */
+#define	FDCSTAT_RECNF	(0x10)   /* record not found */
+#define	FDCSTAT_CRC		(0x08)   /* CRC error */
+#define	FDCSTAT_TR00	(0x04)   /* Track 00 flag (Type I) */
+#define	FDCSTAT_LOST	(0x04)   /* Lost Data (Type II+III) */
+#define	FDCSTAT_IDX		(0x02)   /* Index status (Type I) */
+#define	FDCSTAT_DRQ		(0x02)   /* DRQ status (Type II+III) */
+#define	FDCSTAT_BUSY	(0x01)   /* FDC is busy */
+
+
+/* PSG Port A Bit Nr 0 .. Side Sel .. 0 -> Side 1  1 -> Side 2 */
+#define DSKSIDE     (0x01)
+
+#define DSKDRVNONE  (0x06)
+#define DSKDRV0     (0x02)
+#define DSKDRV1     (0x04)
+
+/* step rates */
+#define	FDCSTEP_6	0x00
+#define	FDCSTEP_12	0x01
+#define	FDCSTEP_2	0x02
+#define	FDCSTEP_3	0x03
+
+struct atari_format_descr {
+	int track;		/* to be formatted */
+	int head;		/*   ""     ""     */
+	int sect_offset;	/* offset of first sector */
+};
+
+/* Disk types: DD, HD, ED */
+static struct atari_disk_type {
+	const char	*name;
+	unsigned	spt;		/* sectors per track */
+	unsigned	blocks;		/* total number of blocks */
+	unsigned	fdc_speed;	/* fdc_speed setting */
+	unsigned 	stretch;	/* track doubling ? */
+} atari_disk_type[] = {
+	{ "d360",  9, 720, 0, 0},	/*  0: 360kB diskette */
+	{ "D360",  9, 720, 0, 1},	/*  1: 360kb in 720k or 1.2MB drive */
+	{ "D720",  9,1440, 0, 0},	/*  2: 720kb in 720k or 1.2MB drive */
+	{ "D820", 10,1640, 0, 0},	/*  3: DD disk with 82 tracks/10 sectors */
+/* formats above are probed for type DD */
+#define	MAX_TYPE_DD 3
+	{ "h1200",15,2400, 3, 0},	/*  4: 1.2MB diskette */
+	{ "H1440",18,2880, 3, 0},	/*  5: 1.4 MB diskette (HD) */
+	{ "H1640",20,3280, 3, 0},	/*  6: 1.64MB diskette (fat HD) 82 tr 20 sec */
+/* formats above are probed for types DD and HD */
+#define	MAX_TYPE_HD 6
+	{ "E2880",36,5760, 3, 0},	/*  7: 2.8 MB diskette (ED) */
+	{ "E3280",40,6560, 3, 0},	/*  8: 3.2 MB diskette (fat ED) 82 tr 40 sec */
+/* formats above are probed for types DD, HD and ED */
+#define	MAX_TYPE_ED 8
+/* types below are never autoprobed */
+	{ "H1680",21,3360, 3, 0},	/*  9: 1.68MB diskette (fat HD) 80 tr 21 sec */
+	{ "h410",10,820, 0, 1},		/* 10: 410k diskette 41 tr 10 sec, stretch */
+	{ "h1476",18,2952, 3, 0},	/* 11: 1.48MB diskette 82 tr 18 sec */
+	{ "H1722",21,3444, 3, 0},	/* 12: 1.72MB diskette 82 tr 21 sec */
+	{ "h420",10,840, 0, 1},		/* 13: 420k diskette 42 tr 10 sec, stretch */
+	{ "H830",10,1660, 0, 0},	/* 14: 820k diskette 83 tr 10 sec */
+	{ "h1494",18,2952, 3, 0},	/* 15: 1.49MB diskette 83 tr 18 sec */
+	{ "H1743",21,3486, 3, 0},	/* 16: 1.74MB diskette 83 tr 21 sec */
+	{ "h880",11,1760, 0, 0},	/* 17: 880k diskette 80 tr 11 sec */
+	{ "D1040",13,2080, 0, 0},	/* 18: 1.04MB diskette 80 tr 13 sec */
+	{ "D1120",14,2240, 0, 0},	/* 19: 1.12MB diskette 80 tr 14 sec */
+	{ "h1600",20,3200, 3, 0},	/* 20: 1.60MB diskette 80 tr 20 sec */
+	{ "H1760",22,3520, 3, 0},	/* 21: 1.76MB diskette 80 tr 22 sec */
+	{ "H1920",24,3840, 3, 0},	/* 22: 1.92MB diskette 80 tr 24 sec */
+	{ "E3200",40,6400, 3, 0},	/* 23: 3.2MB diskette 80 tr 40 sec */
+	{ "E3520",44,7040, 3, 0},	/* 24: 3.52MB diskette 80 tr 44 sec */
+	{ "E3840",48,7680, 3, 0},	/* 25: 3.84MB diskette 80 tr 48 sec */
+	{ "H1840",23,3680, 3, 0},	/* 26: 1.84MB diskette 80 tr 23 sec */
+	{ "D800",10,1600, 0, 0},	/* 27: 800k diskette 80 tr 10 sec */
+};
+
+static int StartDiskType[] = {
+	MAX_TYPE_DD,
+	MAX_TYPE_HD,
+	MAX_TYPE_ED
+};
+
+#define	TYPE_DD		0
+#define	TYPE_HD		1
+#define	TYPE_ED		2
+
+static int DriveType = TYPE_HD;
+
+static DEFINE_SPINLOCK(ataflop_lock);
+
+/* Array for translating minors into disk formats */
+static struct {
+	int 	 index;
+	unsigned drive_types;
+} minor2disktype[] = {
+	{  0, TYPE_DD },	/*  1: d360 */
+	{  4, TYPE_HD },	/*  2: h1200 */
+	{  1, TYPE_DD },	/*  3: D360 */
+	{  2, TYPE_DD },	/*  4: D720 */
+	{  1, TYPE_DD },	/*  5: h360 = D360 */
+	{  2, TYPE_DD },	/*  6: h720 = D720 */
+	{  5, TYPE_HD },	/*  7: H1440 */
+	{  7, TYPE_ED },	/*  8: E2880 */
+/* some PC formats :-) */
+	{  8, TYPE_ED },	/*  9: E3280    <- was "CompaQ" == E2880 for PC */
+	{  5, TYPE_HD },	/* 10: h1440 = H1440 */
+	{  9, TYPE_HD },	/* 11: H1680 */
+	{ 10, TYPE_DD },	/* 12: h410  */
+	{  3, TYPE_DD },	/* 13: H820     <- == D820, 82x10 */
+	{ 11, TYPE_HD },	/* 14: h1476 */
+	{ 12, TYPE_HD },	/* 15: H1722 */
+	{ 13, TYPE_DD },	/* 16: h420  */
+	{ 14, TYPE_DD },	/* 17: H830  */
+	{ 15, TYPE_HD },	/* 18: h1494 */
+	{ 16, TYPE_HD },	/* 19: H1743 */
+	{ 17, TYPE_DD },	/* 20: h880  */
+	{ 18, TYPE_DD },	/* 21: D1040 */
+	{ 19, TYPE_DD },	/* 22: D1120 */
+	{ 20, TYPE_HD },	/* 23: h1600 */
+	{ 21, TYPE_HD },	/* 24: H1760 */
+	{ 22, TYPE_HD },	/* 25: H1920 */
+	{ 23, TYPE_ED },	/* 26: E3200 */
+	{ 24, TYPE_ED },	/* 27: E3520 */
+	{ 25, TYPE_ED },	/* 28: E3840 */
+	{ 26, TYPE_HD },	/* 29: H1840 */
+	{ 27, TYPE_DD },	/* 30: D800  */
+	{  6, TYPE_HD },	/* 31: H1640    <- was H1600 == h1600 for PC */
+};
+
+#define NUM_DISK_MINORS ARRAY_SIZE(minor2disktype)
+
+/*
+ * Maximum disk size (in kilobytes). This default is used whenever the
+ * current disk size is unknown.
+ */
+#define MAX_DISK_SIZE 3280
+
+/*
+ * MSch: User-provided type information. 'drive' points to
+ * the respective entry of this array. Set by FDSETPRM ioctls.
+ */
+static struct atari_disk_type user_params[FD_MAX_UNITS];
+
+/*
+ * User-provided permanent type information. 'drive' points to
+ * the respective entry of this array.  Set by FDDEFPRM ioctls, 
+ * restored upon disk change by floppy_revalidate() if valid (as seen by
+ * default_params[].blocks > 0 - a bit in unit[].flags might be used for this?)
+ */
+static struct atari_disk_type default_params[FD_MAX_UNITS];
+
+/* current info on each unit */
+static struct atari_floppy_struct {
+	int connected;				/* !=0 : drive is connected */
+	int autoprobe;				/* !=0 : do autoprobe	    */
+
+	struct atari_disk_type	*disktype;	/* current type of disk */
+
+	int track;		/* current head position or -1 if
+				   unknown */
+	unsigned int steprate;	/* steprate setting */
+	unsigned int wpstat;	/* current state of WP signal (for
+				   disk change detection) */
+	int flags;		/* flags */
+	struct gendisk *disk[NUM_DISK_MINORS];
+	bool registered[NUM_DISK_MINORS];
+	int ref;
+	int type;
+	struct blk_mq_tag_set tag_set;
+	int error_count;
+} unit[FD_MAX_UNITS];
+
+#define	UD	unit[drive]
+#define	UDT	unit[drive].disktype
+#define	SUD	unit[SelectedDrive]
+#define	SUDT	unit[SelectedDrive].disktype
+
+
+#define FDC_READ(reg) ({			\
+    /* unsigned long __flags; */		\
+    unsigned short __val;			\
+    /* local_irq_save(__flags); */		\
+    dma_wd.dma_mode_status = 0x80 | (reg);	\
+    udelay(25);					\
+    __val = dma_wd.fdc_acces_seccount;		\
+    MFPDELAY();					\
+    /* local_irq_restore(__flags); */		\
+    __val & 0xff;				\
+})
+
+#define FDC_WRITE(reg,val)			\
+    do {					\
+	/* unsigned long __flags; */		\
+	/* local_irq_save(__flags); */		\
+	dma_wd.dma_mode_status = 0x80 | (reg);	\
+	udelay(25);				\
+	dma_wd.fdc_acces_seccount = (val);	\
+	MFPDELAY();				\
+        /* local_irq_restore(__flags); */	\
+    } while(0)
+
+
+/* Buffering variables:
+ * First, there is a DMA buffer in ST-RAM that is used for floppy DMA
+ * operations. Second, a track buffer is used to cache a whole track
+ * of the disk to save read operations. These are two separate buffers
+ * because that allows write operations without clearing the track buffer.
+ */
+
+static int MaxSectors[] = {
+	11, 22, 44
+};
+static int BufferSize[] = {
+	15*512, 30*512, 60*512
+};
+
+#define	BUFFER_SIZE	(BufferSize[DriveType])
+
+unsigned char *DMABuffer;			  /* buffer for writes */
+static unsigned long PhysDMABuffer;   /* physical address */
+
+static int UseTrackbuffer = -1;		  /* Do track buffering? */
+module_param(UseTrackbuffer, int, 0);
+
+unsigned char *TrackBuffer;			  /* buffer for reads */
+static unsigned long PhysTrackBuffer; /* physical address */
+static int BufferDrive, BufferSide, BufferTrack;
+static int read_track;		/* non-zero if we are reading whole tracks */
+
+#define	SECTOR_BUFFER(sec)	(TrackBuffer + ((sec)-1)*512)
+#define	IS_BUFFERED(drive,side,track) \
+    (BufferDrive == (drive) && BufferSide == (side) && BufferTrack == (track))
+
+/*
+ * These are global variables, as that's the easiest way to give
+ * information to interrupts. They are the data used for the current
+ * request.
+ */
+static int SelectedDrive = 0;
+static int ReqCmd, ReqBlock;
+static int ReqSide, ReqTrack, ReqSector, ReqCnt;
+static int HeadSettleFlag = 0;
+static unsigned char *ReqData, *ReqBuffer;
+static int MotorOn = 0, MotorOffTrys;
+static int IsFormatting = 0, FormatError;
+
+static int UserSteprate[FD_MAX_UNITS] = { -1, -1 };
+module_param_array(UserSteprate, int, NULL, 0);
+
+static DECLARE_COMPLETION(format_wait);
+
+static unsigned long changed_floppies = 0xff, fake_change = 0;
+#define	CHECK_CHANGE_DELAY	HZ/2
+
+#define	FD_MOTOR_OFF_DELAY	(3*HZ)
+#define	FD_MOTOR_OFF_MAXTRY	(10*20)
+
+#define FLOPPY_TIMEOUT		(6*HZ)
+#define RECALIBRATE_ERRORS	4	/* After this many errors the drive
+					 * will be recalibrated. */
+#define MAX_ERRORS		8	/* After this many errors the driver
+					 * will give up. */
+
+
+/*
+ * The driver is trying to determine the correct media format
+ * while Probing is set. fd_rwsec_done() clears it after a
+ * successful access.
+ */
+static int Probing = 0;
+
+/* This flag is set when a dummy seek is necessary to make the WP
+ * status bit accessible.
+ */
+static int NeedSeek = 0;
+
+
+#ifdef DEBUG
+#define DPRINT(a)	printk a
+#else
+#define DPRINT(a)
+#endif
+
+/***************************** Prototypes *****************************/
+
+static void fd_select_side( int side );
+static void fd_select_drive( int drive );
+static void fd_deselect( void );
+static void fd_motor_off_timer(struct timer_list *unused);
+static void check_change(struct timer_list *unused);
+static irqreturn_t floppy_irq (int irq, void *dummy);
+static void fd_error( void );
+static int do_format(int drive, int type, struct atari_format_descr *desc);
+static void do_fd_action( int drive );
+static void fd_calibrate( void );
+static void fd_calibrate_done( int status );
+static void fd_seek( void );
+static void fd_seek_done( int status );
+static void fd_rwsec( void );
+static void fd_readtrack_check(struct timer_list *unused);
+static void fd_rwsec_done( int status );
+static void fd_rwsec_done1(int status);
+static void fd_writetrack( void );
+static void fd_writetrack_done( int status );
+static void fd_times_out(struct timer_list *unused);
+static void finish_fdc( void );
+static void finish_fdc_done( int dummy );
+static void setup_req_params( int drive );
+static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int
+                     cmd, unsigned long param);
+static void fd_probe( int drive );
+static int fd_test_drive_present( int drive );
+static void config_types( void );
+static int floppy_open(struct block_device *bdev, fmode_t mode);
+static void floppy_release(struct gendisk *disk, fmode_t mode);
+
+/************************* End of Prototypes **************************/
+
+static DEFINE_TIMER(motor_off_timer, fd_motor_off_timer);
+static DEFINE_TIMER(readtrack_timer, fd_readtrack_check);
+static DEFINE_TIMER(timeout_timer, fd_times_out);
+static DEFINE_TIMER(fd_timer, check_change);
+	
+static void fd_end_request_cur(blk_status_t err)
+{
+	DPRINT(("fd_end_request_cur(), bytes %d of %d\n",
+		blk_rq_cur_bytes(fd_request),
+		blk_rq_bytes(fd_request)));
+
+	if (!blk_update_request(fd_request, err,
+				blk_rq_cur_bytes(fd_request))) {
+		DPRINT(("calling __blk_mq_end_request()\n"));
+		__blk_mq_end_request(fd_request, err);
+		fd_request = NULL;
+	} else {
+		/* requeue rest of request */
+		DPRINT(("calling blk_mq_requeue_request()\n"));
+		blk_mq_requeue_request(fd_request, true);
+		fd_request = NULL;
+	}
+}
+
+static inline void start_motor_off_timer(void)
+{
+	mod_timer(&motor_off_timer, jiffies + FD_MOTOR_OFF_DELAY);
+	MotorOffTrys = 0;
+}
+
+static inline void start_check_change_timer( void )
+{
+	mod_timer(&fd_timer, jiffies + CHECK_CHANGE_DELAY);
+}
+
+static inline void start_timeout(void)
+{
+	mod_timer(&timeout_timer, jiffies + FLOPPY_TIMEOUT);
+}
+
+static inline void stop_timeout(void)
+{
+	del_timer(&timeout_timer);
+}
+
+/* Select the side to use. */
+
+static void fd_select_side( int side )
+{
+	unsigned long flags;
+
+	/* protect against various other ints mucking around with the PSG */
+	local_irq_save(flags);
+  
+	sound_ym.rd_data_reg_sel = 14; /* Select PSG Port A */
+	sound_ym.wd_data = (side == 0) ? sound_ym.rd_data_reg_sel | 0x01 :
+	                                 sound_ym.rd_data_reg_sel & 0xfe;
+
+	local_irq_restore(flags);
+}
+
+
+/* Select a drive, update the FDC's track register and set the correct
+ * clock speed for this disk's type.
+ */
+
+static void fd_select_drive( int drive )
+{
+	unsigned long flags;
+	unsigned char tmp;
+  
+	if (drive == SelectedDrive)
+	  return;
+
+	/* protect against various other ints mucking around with the PSG */
+	local_irq_save(flags);
+	sound_ym.rd_data_reg_sel = 14; /* Select PSG Port A */
+	tmp = sound_ym.rd_data_reg_sel;
+	sound_ym.wd_data = (tmp | DSKDRVNONE) & ~(drive == 0 ? DSKDRV0 : DSKDRV1);
+	atari_dont_touch_floppy_select = 1;
+	local_irq_restore(flags);
+
+	/* restore track register to saved value */
+	FDC_WRITE( FDCREG_TRACK, UD.track );
+	udelay(25);
+
+	/* select 8/16 MHz */
+	if (UDT)
+		if (ATARIHW_PRESENT(FDCSPEED))
+			dma_wd.fdc_speed = UDT->fdc_speed;
+	
+	SelectedDrive = drive;
+}
+
+
+/* Deselect both drives. */
+
+static void fd_deselect( void )
+{
+	unsigned long flags;
+
+	/* protect against various other ints mucking around with the PSG */
+	local_irq_save(flags);
+	atari_dont_touch_floppy_select = 0;
+	sound_ym.rd_data_reg_sel=14;	/* Select PSG Port A */
+	sound_ym.wd_data = (sound_ym.rd_data_reg_sel |
+			    (MACH_IS_FALCON ? 3 : 7)); /* no drives selected */
+	/* On Falcon, the drive B select line is used on the printer port, so
+	 * leave it alone... */
+	SelectedDrive = -1;
+	local_irq_restore(flags);
+}
+
+
+/* This timer function deselects the drives when the FDC switched the
+ * motor off. The deselection cannot happen earlier because the FDC
+ * counts the index signals, which arrive only if one drive is selected.
+ */
+
+static void fd_motor_off_timer(struct timer_list *unused)
+{
+	unsigned char status;
+
+	if (SelectedDrive < 0)
+		/* no drive selected, needn't deselect anyone */
+		return;
+
+	if (stdma_islocked())
+		goto retry;
+
+	status = FDC_READ( FDCREG_STATUS );
+
+	if (!(status & 0x80)) {
+		/* motor already turned off by FDC -> deselect drives */
+		MotorOn = 0;
+		fd_deselect();
+		return;
+	}
+	/* not yet off, try again */
+
+  retry:
+	/* Test again later; if tested too often, it seems there is no disk
+	 * in the drive and the FDC will leave the motor on forever (or,
+	 * at least until a disk is inserted). So we'll test only twice
+	 * per second from then on...
+	 */
+	mod_timer(&motor_off_timer,
+		  jiffies + (MotorOffTrys++ < FD_MOTOR_OFF_MAXTRY ? HZ/20 : HZ/2));
+}
+
+
+/* This function is repeatedly called to detect disk changes (as good
+ * as possible) and keep track of the current state of the write protection.
+ */
+
+static void check_change(struct timer_list *unused)
+{
+	static int    drive = 0;
+
+	unsigned long flags;
+	unsigned char old_porta;
+	int			  stat;
+
+	if (++drive > 1 || !UD.connected)
+		drive = 0;
+
+	/* protect against various other ints mucking around with the PSG */
+	local_irq_save(flags);
+
+	if (!stdma_islocked()) {
+		sound_ym.rd_data_reg_sel = 14;
+		old_porta = sound_ym.rd_data_reg_sel;
+		sound_ym.wd_data = (old_porta | DSKDRVNONE) &
+			               ~(drive == 0 ? DSKDRV0 : DSKDRV1);
+		stat = !!(FDC_READ( FDCREG_STATUS ) & FDCSTAT_WPROT);
+		sound_ym.wd_data = old_porta;
+
+		if (stat != UD.wpstat) {
+			DPRINT(( "wpstat[%d] = %d\n", drive, stat ));
+			UD.wpstat = stat;
+			set_bit (drive, &changed_floppies);
+		}
+	}
+	local_irq_restore(flags);
+
+	start_check_change_timer();
+}
+
+ 
+/* Handling of the Head Settling Flag: This flag should be set after each
+ * seek operation, because we don't use seeks with verify.
+ */
+
+static inline void set_head_settle_flag(void)
+{
+	HeadSettleFlag = FDCCMDADD_E;
+}
+
+static inline int get_head_settle_flag(void)
+{
+	int	tmp = HeadSettleFlag;
+	HeadSettleFlag = 0;
+	return( tmp );
+}
+
+static inline void copy_buffer(void *from, void *to)
+{
+	ulong *p1 = (ulong *)from, *p2 = (ulong *)to;
+	int cnt;
+
+	for (cnt = 512/4; cnt; cnt--)
+		*p2++ = *p1++;
+}
+
+/* General Interrupt Handling */
+
+static void (*FloppyIRQHandler)( int status ) = NULL;
+
+static irqreturn_t floppy_irq (int irq, void *dummy)
+{
+	unsigned char status;
+	void (*handler)( int );
+
+	handler = xchg(&FloppyIRQHandler, NULL);
+
+	if (handler) {
+		nop();
+		status = FDC_READ( FDCREG_STATUS );
+		DPRINT(("FDC irq, status = %02x handler = %08lx\n",status,(unsigned long)handler));
+		handler( status );
+	}
+	else {
+		DPRINT(("FDC irq, no handler\n"));
+	}
+	return IRQ_HANDLED;
+}
+
+
+/* Error handling: If some error happened, retry some times, then
+ * recalibrate, then try again, and fail after MAX_ERRORS.
+ */
+
+static void fd_error( void )
+{
+	if (IsFormatting) {
+		IsFormatting = 0;
+		FormatError = 1;
+		complete(&format_wait);
+		return;
+	}
+
+	if (!fd_request)
+		return;
+
+	unit[SelectedDrive].error_count++;
+	if (unit[SelectedDrive].error_count >= MAX_ERRORS) {
+		printk(KERN_ERR "fd%d: too many errors.\n", SelectedDrive );
+		fd_end_request_cur(BLK_STS_IOERR);
+		finish_fdc();
+		return;
+	}
+	else if (unit[SelectedDrive].error_count == RECALIBRATE_ERRORS) {
+		printk(KERN_WARNING "fd%d: recalibrating\n", SelectedDrive );
+		if (SelectedDrive != -1)
+			SUD.track = -1;
+	}
+	/* need to re-run request to recalibrate */
+	atari_disable_irq( IRQ_MFP_FDC );
+
+	setup_req_params( SelectedDrive );
+	do_fd_action( SelectedDrive );
+
+	atari_enable_irq( IRQ_MFP_FDC );
+}
+
+
+
+#define	SET_IRQ_HANDLER(proc) do { FloppyIRQHandler = (proc); } while(0)
+
+
+/* ---------- Formatting ---------- */
+
+#define FILL(n,val)		\
+    do {			\
+	memset( p, val, n );	\
+	p += n;			\
+    } while(0)
+
+static int do_format(int drive, int type, struct atari_format_descr *desc)
+{
+	struct request_queue *q;
+	unsigned char	*p;
+	int sect, nsect;
+	unsigned long	flags;
+	int ret;
+
+	if (type) {
+		type--;
+		if (type >= NUM_DISK_MINORS ||
+		    minor2disktype[type].drive_types > DriveType) {
+			finish_fdc();
+			return -EINVAL;
+		}
+	}
+
+	q = unit[drive].disk[type]->queue;
+	blk_mq_freeze_queue(q);
+	blk_mq_quiesce_queue(q);
+
+	local_irq_save(flags);
+	stdma_lock(floppy_irq, NULL);
+	atari_turnon_irq( IRQ_MFP_FDC ); /* should be already, just to be sure */
+	local_irq_restore(flags);
+
+	if (type) {
+		type = minor2disktype[type].index;
+		UDT = &atari_disk_type[type];
+	}
+
+	if (!UDT || desc->track >= UDT->blocks/UDT->spt/2 || desc->head >= 2) {
+		finish_fdc();
+		ret = -EINVAL;
+		goto out;
+	}
+
+	nsect = UDT->spt;
+	p = TrackBuffer;
+	/* The track buffer is used for the raw track data, so its
+	   contents become invalid! */
+	BufferDrive = -1;
+	/* stop deselect timer */
+	del_timer( &motor_off_timer );
+
+	FILL( 60 * (nsect / 9), 0x4e );
+	for( sect = 0; sect < nsect; ++sect ) {
+		FILL( 12, 0 );
+		FILL( 3, 0xf5 );
+		*p++ = 0xfe;
+		*p++ = desc->track;
+		*p++ = desc->head;
+		*p++ = (nsect + sect - desc->sect_offset) % nsect + 1;
+		*p++ = 2;
+		*p++ = 0xf7;
+		FILL( 22, 0x4e );
+		FILL( 12, 0 );
+		FILL( 3, 0xf5 );
+		*p++ = 0xfb;
+		FILL( 512, 0xe5 );
+		*p++ = 0xf7;
+		FILL( 40, 0x4e );
+	}
+	FILL( TrackBuffer+BUFFER_SIZE-p, 0x4e );
+
+	IsFormatting = 1;
+	FormatError = 0;
+	ReqTrack = desc->track;
+	ReqSide  = desc->head;
+	do_fd_action( drive );
+
+	wait_for_completion(&format_wait);
+
+	finish_fdc();
+	ret = FormatError ? -EIO : 0;
+out:
+	blk_mq_unquiesce_queue(q);
+	blk_mq_unfreeze_queue(q);
+	return ret;
+}
+
+
+/* do_fd_action() is the general procedure for a fd request: All
+ * required parameter settings (drive select, side select, track
+ * position) are checked and set if needed. For each of these
+ * parameters and the actual reading or writing exist two functions:
+ * one that starts the setting (or skips it if possible) and one
+ * callback for the "done" interrupt. Each done func calls the next
+ * set function to propagate the request down to fd_rwsec_done().
+ */
+
+static void do_fd_action( int drive )
+{
+	DPRINT(("do_fd_action\n"));
+	
+	if (UseTrackbuffer && !IsFormatting) {
+	repeat:
+	    if (IS_BUFFERED( drive, ReqSide, ReqTrack )) {
+		if (ReqCmd == READ) {
+		    copy_buffer( SECTOR_BUFFER(ReqSector), ReqData );
+		    if (++ReqCnt < blk_rq_cur_sectors(fd_request)) {
+			/* read next sector */
+			setup_req_params( drive );
+			goto repeat;
+		    }
+		    else {
+			/* all sectors finished */
+			fd_end_request_cur(BLK_STS_OK);
+			finish_fdc();
+			return;
+		    }
+		}
+		else {
+		    /* cmd == WRITE, pay attention to track buffer
+		     * consistency! */
+		    copy_buffer( ReqData, SECTOR_BUFFER(ReqSector) );
+		}
+	    }
+	}
+
+	if (SelectedDrive != drive)
+		fd_select_drive( drive );
+    
+	if (UD.track == -1)
+		fd_calibrate();
+	else if (UD.track != ReqTrack << UDT->stretch)
+		fd_seek();
+	else if (IsFormatting)
+		fd_writetrack();
+	else
+		fd_rwsec();
+}
+
+
+/* Seek to track 0 if the current track is unknown */
+
+static void fd_calibrate( void )
+{
+	if (SUD.track >= 0) {
+		fd_calibrate_done( 0 );
+		return;
+	}
+
+	if (ATARIHW_PRESENT(FDCSPEED))
+		dma_wd.fdc_speed = 0;   /* always seek with 8 Mhz */
+	DPRINT(("fd_calibrate\n"));
+	SET_IRQ_HANDLER( fd_calibrate_done );
+	/* we can't verify, since the speed may be incorrect */
+	FDC_WRITE( FDCREG_CMD, FDCCMD_RESTORE | SUD.steprate );
+
+	NeedSeek = 1;
+	MotorOn = 1;
+	start_timeout();
+	/* wait for IRQ */
+}
+
+
+static void fd_calibrate_done( int status )
+{
+	DPRINT(("fd_calibrate_done()\n"));
+	stop_timeout();
+    
+	/* set the correct speed now */
+	if (ATARIHW_PRESENT(FDCSPEED))
+		dma_wd.fdc_speed = SUDT->fdc_speed;
+	if (status & FDCSTAT_RECNF) {
+		printk(KERN_ERR "fd%d: restore failed\n", SelectedDrive );
+		fd_error();
+	}
+	else {
+		SUD.track = 0;
+		fd_seek();
+	}
+}
+  
+  
+/* Seek the drive to the requested track. The drive must have been
+ * calibrated at some point before this.
+ */
+  
+static void fd_seek( void )
+{
+	if (SUD.track == ReqTrack << SUDT->stretch) {
+		fd_seek_done( 0 );
+		return;
+	}
+
+	if (ATARIHW_PRESENT(FDCSPEED)) {
+		dma_wd.fdc_speed = 0;	/* always seek witch 8 Mhz */
+		MFPDELAY();
+	}
+
+	DPRINT(("fd_seek() to track %d\n",ReqTrack));
+	FDC_WRITE( FDCREG_DATA, ReqTrack << SUDT->stretch);
+	udelay(25);
+	SET_IRQ_HANDLER( fd_seek_done );
+	FDC_WRITE( FDCREG_CMD, FDCCMD_SEEK | SUD.steprate );
+
+	MotorOn = 1;
+	set_head_settle_flag();
+	start_timeout();
+	/* wait for IRQ */
+}
+
+
+static void fd_seek_done( int status )
+{
+	DPRINT(("fd_seek_done()\n"));
+	stop_timeout();
+	
+	/* set the correct speed */
+	if (ATARIHW_PRESENT(FDCSPEED))
+		dma_wd.fdc_speed = SUDT->fdc_speed;
+	if (status & FDCSTAT_RECNF) {
+		printk(KERN_ERR "fd%d: seek error (to track %d)\n",
+				SelectedDrive, ReqTrack );
+		/* we don't know exactly which track we are on now! */
+		SUD.track = -1;
+		fd_error();
+	}
+	else {
+		SUD.track = ReqTrack << SUDT->stretch;
+		NeedSeek = 0;
+		if (IsFormatting)
+			fd_writetrack();
+		else
+			fd_rwsec();
+	}
+}
+
+
+/* This does the actual reading/writing after positioning the head
+ * over the correct track.
+ */
+
+static int MultReadInProgress = 0;
+
+
+static void fd_rwsec( void )
+{
+	unsigned long paddr, flags;
+	unsigned int  rwflag, old_motoron;
+	unsigned int track;
+	
+	DPRINT(("fd_rwsec(), Sec=%d, Access=%c\n",ReqSector, ReqCmd == WRITE ? 'w' : 'r' ));
+	if (ReqCmd == WRITE) {
+		if (ATARIHW_PRESENT(EXTD_DMA)) {
+			paddr = virt_to_phys(ReqData);
+		}
+		else {
+			copy_buffer( ReqData, DMABuffer );
+			paddr = PhysDMABuffer;
+		}
+		dma_cache_maintenance( paddr, 512, 1 );
+		rwflag = 0x100;
+	}
+	else {
+		if (read_track)
+			paddr = PhysTrackBuffer;
+		else
+			paddr = ATARIHW_PRESENT(EXTD_DMA) ? 
+				virt_to_phys(ReqData) : PhysDMABuffer;
+		rwflag = 0;
+	}
+
+	fd_select_side( ReqSide );
+  
+	/* Start sector of this operation */
+	FDC_WRITE( FDCREG_SECTOR, read_track ? 1 : ReqSector );
+	MFPDELAY();
+	/* Cheat for track if stretch != 0 */
+	if (SUDT->stretch) {
+		track = FDC_READ( FDCREG_TRACK);
+		MFPDELAY();
+		FDC_WRITE( FDCREG_TRACK, track >> SUDT->stretch);
+	}
+	udelay(25);
+  
+	/* Setup DMA */
+	local_irq_save(flags);
+	dma_wd.dma_lo = (unsigned char)paddr;
+	MFPDELAY();
+	paddr >>= 8;
+	dma_wd.dma_md = (unsigned char)paddr;
+	MFPDELAY();
+	paddr >>= 8;
+	if (ATARIHW_PRESENT(EXTD_DMA))
+		st_dma_ext_dmahi = (unsigned short)paddr;
+	else
+		dma_wd.dma_hi = (unsigned char)paddr;
+	MFPDELAY();
+	local_irq_restore(flags);
+  
+	/* Clear FIFO and switch DMA to correct mode */  
+	dma_wd.dma_mode_status = 0x90 | rwflag;  
+	MFPDELAY();
+	dma_wd.dma_mode_status = 0x90 | (rwflag ^ 0x100);  
+	MFPDELAY();
+	dma_wd.dma_mode_status = 0x90 | rwflag;
+	MFPDELAY();
+  
+	/* How many sectors for DMA */
+	dma_wd.fdc_acces_seccount = read_track ? SUDT->spt : 1;
+  
+	udelay(25);  
+  
+	/* Start operation */
+	dma_wd.dma_mode_status = FDCSELREG_STP | rwflag;
+	udelay(25);
+	SET_IRQ_HANDLER( fd_rwsec_done );
+	dma_wd.fdc_acces_seccount =
+	  (get_head_settle_flag() |
+	   (rwflag ? FDCCMD_WRSEC : (FDCCMD_RDSEC | (read_track ? FDCCMDADD_M : 0))));
+
+	old_motoron = MotorOn;
+	MotorOn = 1;
+	NeedSeek = 1;
+	/* wait for interrupt */
+
+	if (read_track) {
+		/* If reading a whole track, wait about one disk rotation and
+		 * then check if all sectors are read. The FDC will even
+		 * search for the first non-existent sector and need 1 sec to
+		 * recognise that it isn't present :-(
+		 */
+		MultReadInProgress = 1;
+		mod_timer(&readtrack_timer,
+			  /* 1 rot. + 5 rot.s if motor was off  */
+			  jiffies + HZ/5 + (old_motoron ? 0 : HZ));
+	}
+	start_timeout();
+}
+
+    
+static void fd_readtrack_check(struct timer_list *unused)
+{
+	unsigned long flags, addr, addr2;
+
+	local_irq_save(flags);
+
+	if (!MultReadInProgress) {
+		/* This prevents a race condition that could arise if the
+		 * interrupt is triggered while the calling of this timer
+		 * callback function takes place. The IRQ function then has
+		 * already cleared 'MultReadInProgress'  when flow of control
+		 * gets here.
+		 */
+		local_irq_restore(flags);
+		return;
+	}
+
+	/* get the current DMA address */
+	/* ++ f.a. read twice to avoid being fooled by switcher */
+	addr = 0;
+	do {
+		addr2 = addr;
+		addr = dma_wd.dma_lo & 0xff;
+		MFPDELAY();
+		addr |= (dma_wd.dma_md & 0xff) << 8;
+		MFPDELAY();
+		if (ATARIHW_PRESENT( EXTD_DMA ))
+			addr |= (st_dma_ext_dmahi & 0xffff) << 16;
+		else
+			addr |= (dma_wd.dma_hi & 0xff) << 16;
+		MFPDELAY();
+	} while(addr != addr2);
+  
+	if (addr >= PhysTrackBuffer + SUDT->spt*512) {
+		/* already read enough data, force an FDC interrupt to stop
+		 * the read operation
+		 */
+		SET_IRQ_HANDLER( NULL );
+		MultReadInProgress = 0;
+		local_irq_restore(flags);
+		DPRINT(("fd_readtrack_check(): done\n"));
+		FDC_WRITE( FDCREG_CMD, FDCCMD_FORCI );
+		udelay(25);
+
+		/* No error until now -- the FDC would have interrupted
+		 * otherwise!
+		 */
+		fd_rwsec_done1(0);
+	}
+	else {
+		/* not yet finished, wait another tenth rotation */
+		local_irq_restore(flags);
+		DPRINT(("fd_readtrack_check(): not yet finished\n"));
+		mod_timer(&readtrack_timer, jiffies + HZ/5/10);
+	}
+}
+
+
+static void fd_rwsec_done( int status )
+{
+	DPRINT(("fd_rwsec_done()\n"));
+
+	if (read_track) {
+		del_timer(&readtrack_timer);
+		if (!MultReadInProgress)
+			return;
+		MultReadInProgress = 0;
+	}
+	fd_rwsec_done1(status);
+}
+
+static void fd_rwsec_done1(int status)
+{
+	unsigned int track;
+
+	stop_timeout();
+	
+	/* Correct the track if stretch != 0 */
+	if (SUDT->stretch) {
+		track = FDC_READ( FDCREG_TRACK);
+		MFPDELAY();
+		FDC_WRITE( FDCREG_TRACK, track << SUDT->stretch);
+	}
+
+	if (!UseTrackbuffer) {
+		dma_wd.dma_mode_status = 0x90;
+		MFPDELAY();
+		if (!(dma_wd.dma_mode_status & 0x01)) {
+			printk(KERN_ERR "fd%d: DMA error\n", SelectedDrive );
+			goto err_end;
+		}
+	}
+	MFPDELAY();
+
+	if (ReqCmd == WRITE && (status & FDCSTAT_WPROT)) {
+		printk(KERN_NOTICE "fd%d: is write protected\n", SelectedDrive );
+		goto err_end;
+	}	
+	if ((status & FDCSTAT_RECNF) &&
+	    /* RECNF is no error after a multiple read when the FDC
+	       searched for a non-existent sector! */
+	    !(read_track && FDC_READ(FDCREG_SECTOR) > SUDT->spt)) {
+		if (Probing) {
+			if (SUDT > atari_disk_type) {
+			    if (SUDT[-1].blocks > ReqBlock) {
+				/* try another disk type */
+				SUDT--;
+				set_capacity(unit[SelectedDrive].disk[0],
+							SUDT->blocks);
+			    } else
+				Probing = 0;
+			}
+			else {
+				if (SUD.flags & FTD_MSG)
+					printk(KERN_INFO "fd%d: Auto-detected floppy type %s\n",
+					       SelectedDrive, SUDT->name );
+				Probing=0;
+			}
+		} else {	
+/* record not found, but not probing. Maybe stretch wrong ? Restart probing */
+			if (SUD.autoprobe) {
+				SUDT = atari_disk_type + StartDiskType[DriveType];
+				set_capacity(unit[SelectedDrive].disk[0],
+							SUDT->blocks);
+				Probing = 1;
+			}
+		}
+		if (Probing) {
+			if (ATARIHW_PRESENT(FDCSPEED)) {
+				dma_wd.fdc_speed = SUDT->fdc_speed;
+				MFPDELAY();
+			}
+			setup_req_params( SelectedDrive );
+			BufferDrive = -1;
+			do_fd_action( SelectedDrive );
+			return;
+		}
+
+		printk(KERN_ERR "fd%d: sector %d not found (side %d, track %d)\n",
+		       SelectedDrive, FDC_READ (FDCREG_SECTOR), ReqSide, ReqTrack );
+		goto err_end;
+	}
+	if (status & FDCSTAT_CRC) {
+		printk(KERN_ERR "fd%d: CRC error (side %d, track %d, sector %d)\n",
+		       SelectedDrive, ReqSide, ReqTrack, FDC_READ (FDCREG_SECTOR) );
+		goto err_end;
+	}
+	if (status & FDCSTAT_LOST) {
+		printk(KERN_ERR "fd%d: lost data (side %d, track %d, sector %d)\n",
+		       SelectedDrive, ReqSide, ReqTrack, FDC_READ (FDCREG_SECTOR) );
+		goto err_end;
+	}
+
+	Probing = 0;
+	
+	if (ReqCmd == READ) {
+		if (!read_track) {
+			void *addr;
+			addr = ATARIHW_PRESENT( EXTD_DMA ) ? ReqData : DMABuffer;
+			dma_cache_maintenance( virt_to_phys(addr), 512, 0 );
+			if (!ATARIHW_PRESENT( EXTD_DMA ))
+				copy_buffer (addr, ReqData);
+		} else {
+			dma_cache_maintenance( PhysTrackBuffer, MaxSectors[DriveType] * 512, 0 );
+			BufferDrive = SelectedDrive;
+			BufferSide  = ReqSide;
+			BufferTrack = ReqTrack;
+			copy_buffer (SECTOR_BUFFER (ReqSector), ReqData);
+		}
+	}
+  
+	if (++ReqCnt < blk_rq_cur_sectors(fd_request)) {
+		/* read next sector */
+		setup_req_params( SelectedDrive );
+		do_fd_action( SelectedDrive );
+	}
+	else {
+		/* all sectors finished */
+		fd_end_request_cur(BLK_STS_OK);
+		finish_fdc();
+	}
+	return;
+  
+  err_end:
+	BufferDrive = -1;
+	fd_error();
+}
+
+
+static void fd_writetrack( void )
+{
+	unsigned long paddr, flags;
+	unsigned int track;
+	
+	DPRINT(("fd_writetrack() Tr=%d Si=%d\n", ReqTrack, ReqSide ));
+
+	paddr = PhysTrackBuffer;
+	dma_cache_maintenance( paddr, BUFFER_SIZE, 1 );
+
+	fd_select_side( ReqSide );
+  
+	/* Cheat for track if stretch != 0 */
+	if (SUDT->stretch) {
+		track = FDC_READ( FDCREG_TRACK);
+		MFPDELAY();
+		FDC_WRITE(FDCREG_TRACK,track >> SUDT->stretch);
+	}
+	udelay(40);
+  
+	/* Setup DMA */
+	local_irq_save(flags);
+	dma_wd.dma_lo = (unsigned char)paddr;
+	MFPDELAY();
+	paddr >>= 8;
+	dma_wd.dma_md = (unsigned char)paddr;
+	MFPDELAY();
+	paddr >>= 8;
+	if (ATARIHW_PRESENT( EXTD_DMA ))
+		st_dma_ext_dmahi = (unsigned short)paddr;
+	else
+		dma_wd.dma_hi = (unsigned char)paddr;
+	MFPDELAY();
+	local_irq_restore(flags);
+  
+	/* Clear FIFO and switch DMA to correct mode */  
+	dma_wd.dma_mode_status = 0x190;  
+	MFPDELAY();
+	dma_wd.dma_mode_status = 0x90;  
+	MFPDELAY();
+	dma_wd.dma_mode_status = 0x190;
+	MFPDELAY();
+  
+	/* How many sectors for DMA */
+	dma_wd.fdc_acces_seccount = BUFFER_SIZE/512;
+	udelay(40);  
+  
+	/* Start operation */
+	dma_wd.dma_mode_status = FDCSELREG_STP | 0x100;
+	udelay(40);
+	SET_IRQ_HANDLER( fd_writetrack_done );
+	dma_wd.fdc_acces_seccount = FDCCMD_WRTRA | get_head_settle_flag(); 
+
+	MotorOn = 1;
+	start_timeout();
+	/* wait for interrupt */
+}
+
+
+static void fd_writetrack_done( int status )
+{
+	DPRINT(("fd_writetrack_done()\n"));
+
+	stop_timeout();
+
+	if (status & FDCSTAT_WPROT) {
+		printk(KERN_NOTICE "fd%d: is write protected\n", SelectedDrive );
+		goto err_end;
+	}	
+	if (status & FDCSTAT_LOST) {
+		printk(KERN_ERR "fd%d: lost data (side %d, track %d)\n",
+				SelectedDrive, ReqSide, ReqTrack );
+		goto err_end;
+	}
+
+	complete(&format_wait);
+	return;
+
+  err_end:
+	fd_error();
+}
+
+static void fd_times_out(struct timer_list *unused)
+{
+	atari_disable_irq( IRQ_MFP_FDC );
+	if (!FloppyIRQHandler) goto end; /* int occurred after timer was fired, but
+					  * before we came here... */
+
+	SET_IRQ_HANDLER( NULL );
+	/* If the timeout occurred while the readtrack_check timer was
+	 * active, we need to cancel it, else bad things will happen */
+	if (UseTrackbuffer)
+		del_timer( &readtrack_timer );
+	FDC_WRITE( FDCREG_CMD, FDCCMD_FORCI );
+	udelay( 25 );
+	
+	printk(KERN_ERR "floppy timeout\n" );
+	fd_error();
+  end:
+	atari_enable_irq( IRQ_MFP_FDC );
+}
+
+
+/* The (noop) seek operation here is needed to make the WP bit in the
+ * FDC status register accessible for check_change. If the last disk
+ * operation would have been a RDSEC, this bit would always read as 0
+ * no matter what :-( To save time, the seek goes to the track we're
+ * already on.
+ */
+
+static void finish_fdc( void )
+{
+	if (!NeedSeek || !stdma_is_locked_by(floppy_irq)) {
+		finish_fdc_done( 0 );
+	}
+	else {
+		DPRINT(("finish_fdc: dummy seek started\n"));
+		FDC_WRITE (FDCREG_DATA, SUD.track);
+		SET_IRQ_HANDLER( finish_fdc_done );
+		FDC_WRITE (FDCREG_CMD, FDCCMD_SEEK);
+		MotorOn = 1;
+		start_timeout();
+		/* we must wait for the IRQ here, because the ST-DMA
+		   is released immediately afterwards and the interrupt
+		   may be delivered to the wrong driver. */
+	  }
+}
+
+
+static void finish_fdc_done( int dummy )
+{
+	unsigned long flags;
+
+	DPRINT(("finish_fdc_done entered\n"));
+	stop_timeout();
+	NeedSeek = 0;
+
+	if (timer_pending(&fd_timer) && time_before(fd_timer.expires, jiffies + 5))
+		/* If the check for a disk change is done too early after this
+		 * last seek command, the WP bit still reads wrong :-((
+		 */
+		mod_timer(&fd_timer, jiffies + 5);
+	else
+		start_check_change_timer();
+	start_motor_off_timer();
+
+	local_irq_save(flags);
+	if (stdma_is_locked_by(floppy_irq))
+		stdma_release();
+	local_irq_restore(flags);
+
+	DPRINT(("finish_fdc() finished\n"));
+}
+
+/* The detection of disk changes is a dark chapter in Atari history :-(
+ * Because the "Drive ready" signal isn't present in the Atari
+ * hardware, one has to rely on the "Write Protect". This works fine,
+ * as long as no write protected disks are used. TOS solves this
+ * problem by introducing tri-state logic ("maybe changed") and
+ * looking at the serial number in block 0. This isn't possible for
+ * Linux, since the floppy driver can't make assumptions about the
+ * filesystem used on the disk and thus the contents of block 0. I've
+ * chosen the method to always say "The disk was changed" if it is
+ * unsure whether it was. This implies that every open or mount
+ * invalidates the disk buffers if you work with write protected
+ * disks. But at least this is better than working with incorrect data
+ * due to unrecognised disk changes.
+ */
+
+static unsigned int floppy_check_events(struct gendisk *disk,
+					unsigned int clearing)
+{
+	struct atari_floppy_struct *p = disk->private_data;
+	unsigned int drive = p - unit;
+	if (test_bit (drive, &fake_change)) {
+		/* simulated change (e.g. after formatting) */
+		return DISK_EVENT_MEDIA_CHANGE;
+	}
+	if (test_bit (drive, &changed_floppies)) {
+		/* surely changed (the WP signal changed at least once) */
+		return DISK_EVENT_MEDIA_CHANGE;
+	}
+	if (UD.wpstat) {
+		/* WP is on -> could be changed: to be sure, buffers should be
+		 * invalidated...
+		 */
+		return DISK_EVENT_MEDIA_CHANGE;
+	}
+
+	return 0;
+}
+
+static int floppy_revalidate(struct gendisk *disk)
+{
+	struct atari_floppy_struct *p = disk->private_data;
+	unsigned int drive = p - unit;
+
+	if (test_bit(drive, &changed_floppies) ||
+	    test_bit(drive, &fake_change) || !p->disktype) {
+		if (UD.flags & FTD_MSG)
+			printk(KERN_ERR "floppy: clear format %p!\n", UDT);
+		BufferDrive = -1;
+		clear_bit(drive, &fake_change);
+		clear_bit(drive, &changed_floppies);
+		/* MSch: clearing geometry makes sense only for autoprobe
+		   formats, for 'permanent user-defined' parameter:
+		   restore default_params[] here if flagged valid! */
+		if (default_params[drive].blocks == 0)
+			UDT = NULL;
+		else
+			UDT = &default_params[drive];
+	}
+	return 0;
+}
+
+
+/* This sets up the global variables describing the current request. */
+
+static void setup_req_params( int drive )
+{
+	int block = ReqBlock + ReqCnt;
+
+	ReqTrack = block / UDT->spt;
+	ReqSector = block - ReqTrack * UDT->spt + 1;
+	ReqSide = ReqTrack & 1;
+	ReqTrack >>= 1;
+	ReqData = ReqBuffer + 512 * ReqCnt;
+
+	if (UseTrackbuffer)
+		read_track = (ReqCmd == READ && unit[drive].error_count == 0);
+	else
+		read_track = 0;
+
+	DPRINT(("Request params: Si=%d Tr=%d Se=%d Data=%08lx\n",ReqSide,
+			ReqTrack, ReqSector, (unsigned long)ReqData ));
+}
+
+static blk_status_t ataflop_queue_rq(struct blk_mq_hw_ctx *hctx,
+				     const struct blk_mq_queue_data *bd)
+{
+	struct atari_floppy_struct *floppy = bd->rq->q->disk->private_data;
+	int drive = floppy - unit;
+	int type = floppy->type;
+
+	DPRINT(("Queue request: drive %d type %d sectors %d of %d last %d\n",
+		drive, type, blk_rq_cur_sectors(bd->rq),
+		blk_rq_sectors(bd->rq), bd->last));
+
+	spin_lock_irq(&ataflop_lock);
+	if (fd_request) {
+		spin_unlock_irq(&ataflop_lock);
+		return BLK_STS_DEV_RESOURCE;
+	}
+	if (!stdma_try_lock(floppy_irq, NULL))  {
+		spin_unlock_irq(&ataflop_lock);
+		return BLK_STS_RESOURCE;
+	}
+	fd_request = bd->rq;
+	unit[drive].error_count = 0;
+	blk_mq_start_request(fd_request);
+
+	atari_disable_irq( IRQ_MFP_FDC );
+
+	IsFormatting = 0;
+
+	if (!UD.connected) {
+		/* drive not connected */
+		printk(KERN_ERR "Unknown Device: fd%d\n", drive );
+		fd_end_request_cur(BLK_STS_IOERR);
+		stdma_release();
+		goto out;
+	}
+		
+	if (type == 0) {
+		if (!UDT) {
+			Probing = 1;
+			UDT = atari_disk_type + StartDiskType[DriveType];
+			set_capacity(bd->rq->q->disk, UDT->blocks);
+			UD.autoprobe = 1;
+		}
+	} 
+	else {
+		/* user supplied disk type */
+		if (--type >= NUM_DISK_MINORS) {
+			printk(KERN_WARNING "fd%d: invalid disk format", drive );
+			fd_end_request_cur(BLK_STS_IOERR);
+			stdma_release();
+			goto out;
+		}
+		if (minor2disktype[type].drive_types > DriveType)  {
+			printk(KERN_WARNING "fd%d: unsupported disk format", drive );
+			fd_end_request_cur(BLK_STS_IOERR);
+			stdma_release();
+			goto out;
+		}
+		type = minor2disktype[type].index;
+		UDT = &atari_disk_type[type];
+		set_capacity(bd->rq->q->disk, UDT->blocks);
+		UD.autoprobe = 0;
+	}
+
+	/* stop deselect timer */
+	del_timer( &motor_off_timer );
+		
+	ReqCnt = 0;
+	ReqCmd = rq_data_dir(fd_request);
+	ReqBlock = blk_rq_pos(fd_request);
+	ReqBuffer = bio_data(fd_request->bio);
+	setup_req_params( drive );
+	do_fd_action( drive );
+
+	atari_enable_irq( IRQ_MFP_FDC );
+
+out:
+	spin_unlock_irq(&ataflop_lock);
+	return BLK_STS_OK;
+}
+
+static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode,
+		    unsigned int cmd, unsigned long param)
+{
+	struct gendisk *disk = bdev->bd_disk;
+	struct atari_floppy_struct *floppy = disk->private_data;
+	int drive = floppy - unit;
+	int type = floppy->type;
+	struct atari_format_descr fmt_desc;
+	struct atari_disk_type *dtp;
+	struct floppy_struct getprm;
+	int settype;
+	struct floppy_struct setprm;
+	void __user *argp = (void __user *)param;
+
+	switch (cmd) {
+	case FDGETPRM:
+		if (type) {
+			if (--type >= NUM_DISK_MINORS)
+				return -ENODEV;
+			if (minor2disktype[type].drive_types > DriveType)
+				return -ENODEV;
+			type = minor2disktype[type].index;
+			dtp = &atari_disk_type[type];
+			if (UD.flags & FTD_MSG)
+			    printk (KERN_ERR "floppy%d: found dtp %p name %s!\n",
+			        drive, dtp, dtp->name);
+		}
+		else {
+			if (!UDT)
+				return -ENXIO;
+			else
+				dtp = UDT;
+		}
+		memset((void *)&getprm, 0, sizeof(getprm));
+		getprm.size = dtp->blocks;
+		getprm.sect = dtp->spt;
+		getprm.head = 2;
+		getprm.track = dtp->blocks/dtp->spt/2;
+		getprm.stretch = dtp->stretch;
+		if (copy_to_user(argp, &getprm, sizeof(getprm)))
+			return -EFAULT;
+		return 0;
+	}
+	switch (cmd) {
+	case FDSETPRM:
+	case FDDEFPRM:
+	        /* 
+		 * MSch 7/96: simple 'set geometry' case: just set the
+		 * 'default' device params (minor == 0).
+		 * Currently, the drive geometry is cleared after each
+		 * disk change and subsequent revalidate()! simple
+		 * implementation of FDDEFPRM: save geometry from a
+		 * FDDEFPRM call and restore it in floppy_revalidate() !
+		 */
+
+		/* get the parameters from user space */
+		if (floppy->ref != 1 && floppy->ref != -1)
+			return -EBUSY;
+		if (copy_from_user(&setprm, argp, sizeof(setprm)))
+			return -EFAULT;
+		/* 
+		 * first of all: check for floppy change and revalidate, 
+		 * or the next access will revalidate - and clear UDT :-(
+		 */
+
+		if (floppy_check_events(disk, 0))
+		        floppy_revalidate(disk);
+
+		if (UD.flags & FTD_MSG)
+		    printk (KERN_INFO "floppy%d: setting size %d spt %d str %d!\n",
+			drive, setprm.size, setprm.sect, setprm.stretch);
+
+		/* what if type > 0 here? Overwrite specified entry ? */
+		if (type) {
+		        /* refuse to re-set a predefined type for now */
+			finish_fdc();
+			return -EINVAL;
+		}
+
+		/* 
+		 * type == 0: first look for a matching entry in the type list,
+		 * and set the UD.disktype field to use the perdefined entry.
+		 * TODO: add user-defined format to head of autoprobe list ? 
+		 * Useful to include the user-type for future autodetection!
+		 */
+
+		for (settype = 0; settype < NUM_DISK_MINORS; settype++) {
+			int setidx = 0;
+			if (minor2disktype[settype].drive_types > DriveType) {
+				/* skip this one, invalid for drive ... */
+				continue;
+			}
+			setidx = minor2disktype[settype].index;
+			dtp = &atari_disk_type[setidx];
+
+			/* found matching entry ?? */
+			if (   dtp->blocks  == setprm.size 
+			    && dtp->spt     == setprm.sect
+			    && dtp->stretch == setprm.stretch ) {
+				if (UD.flags & FTD_MSG)
+				    printk (KERN_INFO "floppy%d: setting %s %p!\n",
+				        drive, dtp->name, dtp);
+				UDT = dtp;
+				set_capacity(disk, UDT->blocks);
+
+				if (cmd == FDDEFPRM) {
+				  /* save settings as permanent default type */
+				  default_params[drive].name    = dtp->name;
+				  default_params[drive].spt     = dtp->spt;
+				  default_params[drive].blocks  = dtp->blocks;
+				  default_params[drive].fdc_speed = dtp->fdc_speed;
+				  default_params[drive].stretch = dtp->stretch;
+				}
+				
+				return 0;
+			}
+
+		}
+
+		/* no matching disk type found above - setting user_params */
+
+	       	if (cmd == FDDEFPRM) {
+			/* set permanent type */
+			dtp = &default_params[drive];
+		} else
+			/* set user type (reset by disk change!) */
+			dtp = &user_params[drive];
+
+		dtp->name   = "user format";
+		dtp->blocks = setprm.size;
+		dtp->spt    = setprm.sect;
+		if (setprm.sect > 14) 
+			dtp->fdc_speed = 3;
+		else
+			dtp->fdc_speed = 0;
+		dtp->stretch = setprm.stretch;
+
+		if (UD.flags & FTD_MSG)
+			printk (KERN_INFO "floppy%d: blk %d spt %d str %d!\n",
+				drive, dtp->blocks, dtp->spt, dtp->stretch);
+
+		/* sanity check */
+		if (setprm.track != dtp->blocks/dtp->spt/2 ||
+		    setprm.head != 2) {
+			finish_fdc();
+			return -EINVAL;
+		}
+
+		UDT = dtp;
+		set_capacity(disk, UDT->blocks);
+
+		return 0;
+	case FDMSGON:
+		UD.flags |= FTD_MSG;
+		return 0;
+	case FDMSGOFF:
+		UD.flags &= ~FTD_MSG;
+		return 0;
+	case FDSETEMSGTRESH:
+		return -EINVAL;
+	case FDFMTBEG:
+		return 0;
+	case FDFMTTRK:
+		if (floppy->ref != 1 && floppy->ref != -1)
+			return -EBUSY;
+		if (copy_from_user(&fmt_desc, argp, sizeof(fmt_desc)))
+			return -EFAULT;
+		return do_format(drive, type, &fmt_desc);
+	case FDCLRPRM:
+		UDT = NULL;
+		/* MSch: invalidate default_params */
+		default_params[drive].blocks  = 0;
+		set_capacity(disk, MAX_DISK_SIZE * 2);
+		fallthrough;
+	case FDFMTEND:
+	case FDFLUSH:
+		/* invalidate the buffer track to force a reread */
+		BufferDrive = -1;
+		set_bit(drive, &fake_change);
+		if (bdev_check_media_change(bdev))
+			floppy_revalidate(bdev->bd_disk);
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
+static int fd_ioctl(struct block_device *bdev, fmode_t mode,
+			     unsigned int cmd, unsigned long arg)
+{
+	int ret;
+
+	mutex_lock(&ataflop_mutex);
+	ret = fd_locked_ioctl(bdev, mode, cmd, arg);
+	mutex_unlock(&ataflop_mutex);
+
+	return ret;
+}
+
+/* Initialize the 'unit' variable for drive 'drive' */
+
+static void __init fd_probe( int drive )
+{
+	UD.connected = 0;
+	UDT  = NULL;
+
+	if (!fd_test_drive_present( drive ))
+		return;
+
+	UD.connected = 1;
+	UD.track     = 0;
+	switch( UserSteprate[drive] ) {
+	case 2:
+		UD.steprate = FDCSTEP_2;
+		break;
+	case 3:
+		UD.steprate = FDCSTEP_3;
+		break;
+	case 6:
+		UD.steprate = FDCSTEP_6;
+		break;
+	case 12:
+		UD.steprate = FDCSTEP_12;
+		break;
+	default: /* should be -1 for "not set by user" */
+		if (ATARIHW_PRESENT( FDCSPEED ) || MACH_IS_MEDUSA)
+			UD.steprate = FDCSTEP_3;
+		else
+			UD.steprate = FDCSTEP_6;
+		break;
+	}
+	MotorOn = 1;	/* from probe restore operation! */
+}
+
+
+/* This function tests the physical presence of a floppy drive (not
+ * whether a disk is inserted). This is done by issuing a restore
+ * command, waiting max. 2 seconds (that should be enough to move the
+ * head across the whole disk) and looking at the state of the "TR00"
+ * signal. This should now be raised if there is a drive connected
+ * (and there is no hardware failure :-) Otherwise, the drive is
+ * declared absent.
+ */
+
+static int __init fd_test_drive_present( int drive )
+{
+	unsigned long timeout;
+	unsigned char status;
+	int ok;
+	
+	if (drive >= (MACH_IS_FALCON ? 1 : 2)) return( 0 );
+	fd_select_drive( drive );
+
+	/* disable interrupt temporarily */
+	atari_turnoff_irq( IRQ_MFP_FDC );
+	FDC_WRITE (FDCREG_TRACK, 0xff00);
+	FDC_WRITE( FDCREG_CMD, FDCCMD_RESTORE | FDCCMDADD_H | FDCSTEP_6 );
+
+	timeout = jiffies + 2*HZ+HZ/2;
+	while (time_before(jiffies, timeout))
+		if (!(st_mfp.par_dt_reg & 0x20))
+			break;
+
+	status = FDC_READ( FDCREG_STATUS );
+	ok = (status & FDCSTAT_TR00) != 0;
+
+	/* force interrupt to abort restore operation (FDC would try
+	 * about 50 seconds!) */
+	FDC_WRITE( FDCREG_CMD, FDCCMD_FORCI );
+	udelay(500);
+	status = FDC_READ( FDCREG_STATUS );
+	udelay(20);
+
+	if (ok) {
+		/* dummy seek command to make WP bit accessible */
+		FDC_WRITE( FDCREG_DATA, 0 );
+		FDC_WRITE( FDCREG_CMD, FDCCMD_SEEK );
+		while( st_mfp.par_dt_reg & 0x20 )
+			;
+		status = FDC_READ( FDCREG_STATUS );
+	}
+
+	atari_turnon_irq( IRQ_MFP_FDC );
+	return( ok );
+}
+
+
+/* Look how many and which kind of drives are connected. If there are
+ * floppies, additionally start the disk-change and motor-off timers.
+ */
+
+static void __init config_types( void )
+{
+	int drive, cnt = 0;
+
+	/* for probing drives, set the FDC speed to 8 MHz */
+	if (ATARIHW_PRESENT(FDCSPEED))
+		dma_wd.fdc_speed = 0;
+
+	printk(KERN_INFO "Probing floppy drive(s):\n");
+	for( drive = 0; drive < FD_MAX_UNITS; drive++ ) {
+		fd_probe( drive );
+		if (UD.connected) {
+			printk(KERN_INFO "fd%d\n", drive);
+			++cnt;
+		}
+	}
+
+	if (FDC_READ( FDCREG_STATUS ) & FDCSTAT_BUSY) {
+		/* If FDC is still busy from probing, give it another FORCI
+		 * command to abort the operation. If this isn't done, the FDC
+		 * will interrupt later and its IRQ line stays low, because
+		 * the status register isn't read. And this will block any
+		 * interrupts on this IRQ line :-(
+		 */
+		FDC_WRITE( FDCREG_CMD, FDCCMD_FORCI );
+		udelay(500);
+		FDC_READ( FDCREG_STATUS );
+		udelay(20);
+	}
+	
+	if (cnt > 0) {
+		start_motor_off_timer();
+		if (cnt == 1) fd_select_drive( 0 );
+		start_check_change_timer();
+	}
+}
+
+/*
+ * floppy_open check for aliasing (/dev/fd0 can be the same as
+ * /dev/PS0 etc), and disallows simultaneous access to the same
+ * drive with different device numbers.
+ */
+
+static int floppy_open(struct block_device *bdev, fmode_t mode)
+{
+	struct atari_floppy_struct *p = bdev->bd_disk->private_data;
+	int type  = MINOR(bdev->bd_dev) >> 2;
+
+	DPRINT(("fd_open: type=%d\n",type));
+	if (p->ref && p->type != type)
+		return -EBUSY;
+
+	if (p->ref == -1 || (p->ref && mode & FMODE_EXCL))
+		return -EBUSY;
+
+	if (mode & FMODE_EXCL)
+		p->ref = -1;
+	else
+		p->ref++;
+
+	p->type = type;
+
+	if (mode & FMODE_NDELAY)
+		return 0;
+
+	if (mode & (FMODE_READ|FMODE_WRITE)) {
+		if (bdev_check_media_change(bdev))
+			floppy_revalidate(bdev->bd_disk);
+		if (mode & FMODE_WRITE) {
+			if (p->wpstat) {
+				if (p->ref < 0)
+					p->ref = 0;
+				else
+					p->ref--;
+				return -EROFS;
+			}
+		}
+	}
+	return 0;
+}
+
+static int floppy_unlocked_open(struct block_device *bdev, fmode_t mode)
+{
+	int ret;
+
+	mutex_lock(&ataflop_mutex);
+	ret = floppy_open(bdev, mode);
+	mutex_unlock(&ataflop_mutex);
+
+	return ret;
+}
+
+static void floppy_release(struct gendisk *disk, fmode_t mode)
+{
+	struct atari_floppy_struct *p = disk->private_data;
+	mutex_lock(&ataflop_mutex);
+	if (p->ref < 0)
+		p->ref = 0;
+	else if (!p->ref--) {
+		printk(KERN_ERR "floppy_release with fd_ref == 0");
+		p->ref = 0;
+	}
+	mutex_unlock(&ataflop_mutex);
+}
+
+static const struct block_device_operations floppy_fops = {
+	.owner		= THIS_MODULE,
+	.open		= floppy_unlocked_open,
+	.release	= floppy_release,
+	.ioctl		= fd_ioctl,
+	.check_events	= floppy_check_events,
+};
+
+static const struct blk_mq_ops ataflop_mq_ops = {
+	.queue_rq = ataflop_queue_rq,
+};
+
+static int ataflop_alloc_disk(unsigned int drive, unsigned int type)
+{
+	struct gendisk *disk;
+
+	disk = blk_mq_alloc_disk(&unit[drive].tag_set, NULL);
+	if (IS_ERR(disk))
+		return PTR_ERR(disk);
+
+	disk->major = FLOPPY_MAJOR;
+	disk->first_minor = drive + (type << 2);
+	disk->minors = 1;
+	sprintf(disk->disk_name, "fd%d", drive);
+	disk->fops = &floppy_fops;
+	disk->flags |= GENHD_FL_NO_PART;
+	disk->events = DISK_EVENT_MEDIA_CHANGE;
+	disk->private_data = &unit[drive];
+	set_capacity(disk, MAX_DISK_SIZE * 2);
+
+	unit[drive].disk[type] = disk;
+	return 0;
+}
+
+static void ataflop_probe(dev_t dev)
+{
+	int drive = MINOR(dev) & 3;
+	int type  = MINOR(dev) >> 2;
+
+	if (type)
+		type--;
+
+	if (drive >= FD_MAX_UNITS || type >= NUM_DISK_MINORS)
+		return;
+	if (unit[drive].disk[type])
+		return;
+	if (ataflop_alloc_disk(drive, type))
+		return;
+	if (add_disk(unit[drive].disk[type]))
+		goto cleanup_disk;
+	unit[drive].registered[type] = true;
+	return;
+
+cleanup_disk:
+	put_disk(unit[drive].disk[type]);
+	unit[drive].disk[type] = NULL;
+}
+
+static void atari_floppy_cleanup(void)
+{
+	int i;
+	int type;
+
+	for (i = 0; i < FD_MAX_UNITS; i++) {
+		for (type = 0; type < NUM_DISK_MINORS; type++) {
+			if (!unit[i].disk[type])
+				continue;
+			del_gendisk(unit[i].disk[type]);
+			put_disk(unit[i].disk[type]);
+		}
+		blk_mq_free_tag_set(&unit[i].tag_set);
+	}
+
+	del_timer_sync(&fd_timer);
+	atari_stram_free(DMABuffer);
+}
+
+static void atari_cleanup_floppy_disk(struct atari_floppy_struct *fs)
+{
+	int type;
+
+	for (type = 0; type < NUM_DISK_MINORS; type++) {
+		if (!fs->disk[type])
+			continue;
+		if (fs->registered[type])
+			del_gendisk(fs->disk[type]);
+		put_disk(fs->disk[type]);
+	}
+	blk_mq_free_tag_set(&fs->tag_set);
+}
+
+static int __init atari_floppy_init (void)
+{
+	int i;
+	int ret;
+
+	if (!MACH_IS_ATARI)
+		/* Amiga, Mac, ... don't have Atari-compatible floppy :-) */
+		return -ENODEV;
+
+	for (i = 0; i < FD_MAX_UNITS; i++) {
+		memset(&unit[i].tag_set, 0, sizeof(unit[i].tag_set));
+		unit[i].tag_set.ops = &ataflop_mq_ops;
+		unit[i].tag_set.nr_hw_queues = 1;
+		unit[i].tag_set.nr_maps = 1;
+		unit[i].tag_set.queue_depth = 2;
+		unit[i].tag_set.numa_node = NUMA_NO_NODE;
+		unit[i].tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
+		ret = blk_mq_alloc_tag_set(&unit[i].tag_set);
+		if (ret)
+			goto err;
+
+		ret = ataflop_alloc_disk(i, 0);
+		if (ret) {
+			blk_mq_free_tag_set(&unit[i].tag_set);
+			goto err;
+		}
+	}
+
+	if (UseTrackbuffer < 0)
+		/* not set by user -> use default: for now, we turn
+		   track buffering off for all Medusas, though it
+		   could be used with ones that have a counter
+		   card. But the test is too hard :-( */
+		UseTrackbuffer = !MACH_IS_MEDUSA;
+
+	/* initialize variables */
+	SelectedDrive = -1;
+	BufferDrive = -1;
+
+	DMABuffer = atari_stram_alloc(BUFFER_SIZE+512, "ataflop");
+	if (!DMABuffer) {
+		printk(KERN_ERR "atari_floppy_init: cannot get dma buffer\n");
+		ret = -ENOMEM;
+		goto err;
+	}
+	TrackBuffer = DMABuffer + 512;
+	PhysDMABuffer = atari_stram_to_phys(DMABuffer);
+	PhysTrackBuffer = virt_to_phys(TrackBuffer);
+	BufferDrive = BufferSide = BufferTrack = -1;
+
+	for (i = 0; i < FD_MAX_UNITS; i++) {
+		unit[i].track = -1;
+		unit[i].flags = 0;
+		ret = add_disk(unit[i].disk[0]);
+		if (ret)
+			goto err_out_dma;
+		unit[i].registered[0] = true;
+	}
+
+	printk(KERN_INFO "Atari floppy driver: max. %cD, %strack buffering\n",
+	       DriveType == 0 ? 'D' : DriveType == 1 ? 'H' : 'E',
+	       UseTrackbuffer ? "" : "no ");
+	config_types();
+
+	ret = __register_blkdev(FLOPPY_MAJOR, "fd", ataflop_probe);
+	if (ret) {
+		printk(KERN_ERR "atari_floppy_init: cannot register block device\n");
+		atari_floppy_cleanup();
+	}
+	return ret;
+
+err_out_dma:
+	atari_stram_free(DMABuffer);
+err:
+	while (--i >= 0)
+		atari_cleanup_floppy_disk(&unit[i]);
+
+	return ret;
+}
+
+#ifndef MODULE
+static int __init atari_floppy_setup(char *str)
+{
+	int ints[3 + FD_MAX_UNITS];
+	int i;
+
+	if (!MACH_IS_ATARI)
+		return 0;
+
+	str = get_options(str, 3 + FD_MAX_UNITS, ints);
+	
+	if (ints[0] < 1) {
+		printk(KERN_ERR "ataflop_setup: no arguments!\n" );
+		return 0;
+	}
+	else if (ints[0] > 2+FD_MAX_UNITS) {
+		printk(KERN_ERR "ataflop_setup: too many arguments\n" );
+	}
+
+	if (ints[1] < 0 || ints[1] > 2)
+		printk(KERN_ERR "ataflop_setup: bad drive type\n" );
+	else
+		DriveType = ints[1];
+
+	if (ints[0] >= 2)
+		UseTrackbuffer = (ints[2] > 0);
+
+	for( i = 3; i <= ints[0] && i-3 < FD_MAX_UNITS; ++i ) {
+		if (ints[i] != 2 && ints[i] != 3 && ints[i] != 6 && ints[i] != 12)
+			printk(KERN_ERR "ataflop_setup: bad steprate\n" );
+		else
+			UserSteprate[i-3] = ints[i];
+	}
+	return 1;
+}
+
+__setup("floppy=", atari_floppy_setup);
+#endif
+
+static void __exit atari_floppy_exit(void)
+{
+	unregister_blkdev(FLOPPY_MAJOR, "fd");
+	atari_floppy_cleanup();
+}
+
+module_init(atari_floppy_init)
+module_exit(atari_floppy_exit)
+
+MODULE_LICENSE("GPL");
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
new file mode 100644
index 000000000..a8a77a1ef
--- /dev/null
+++ b/drivers/block/brd.c
@@ -0,0 +1,530 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Ram backed block device driver.
+ *
+ * Copyright (C) 2007 Nick Piggin
+ * Copyright (C) 2007 Novell Inc.
+ *
+ * Parts derived from drivers/block/rd.c, and drivers/block/loop.c, copyright
+ * of their respective owners.
+ */
+
+#include <linux/init.h>
+#include <linux/initrd.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/major.h>
+#include <linux/blkdev.h>
+#include <linux/bio.h>
+#include <linux/highmem.h>
+#include <linux/mutex.h>
+#include <linux/pagemap.h>
+#include <linux/radix-tree.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/backing-dev.h>
+#include <linux/debugfs.h>
+
+#include <linux/uaccess.h>
+
+/*
+ * Each block ramdisk device has a radix_tree brd_pages of pages that stores
+ * the pages containing the block device's contents. A brd page's ->index is
+ * its offset in PAGE_SIZE units. This is similar to, but in no way connected
+ * with, the kernel's pagecache or buffer cache (which sit above our block
+ * device).
+ */
+struct brd_device {
+	int			brd_number;
+	struct gendisk		*brd_disk;
+	struct list_head	brd_list;
+
+	/*
+	 * Backing store of pages and lock to protect it. This is the contents
+	 * of the block device.
+	 */
+	spinlock_t		brd_lock;
+	struct radix_tree_root	brd_pages;
+	u64			brd_nr_pages;
+};
+
+/*
+ * Look up and return a brd's page for a given sector.
+ */
+static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
+{
+	pgoff_t idx;
+	struct page *page;
+
+	/*
+	 * The page lifetime is protected by the fact that we have opened the
+	 * device node -- brd pages will never be deleted under us, so we
+	 * don't need any further locking or refcounting.
+	 *
+	 * This is strictly true for the radix-tree nodes as well (ie. we
+	 * don't actually need the rcu_read_lock()), however that is not a
+	 * documented feature of the radix-tree API so it is better to be
+	 * safe here (we don't have total exclusion from radix tree updates
+	 * here, only deletes).
+	 */
+	rcu_read_lock();
+	idx = sector >> PAGE_SECTORS_SHIFT; /* sector to page index */
+	page = radix_tree_lookup(&brd->brd_pages, idx);
+	rcu_read_unlock();
+
+	BUG_ON(page && page->index != idx);
+
+	return page;
+}
+
+/*
+ * Insert a new page for a given sector, if one does not already exist.
+ */
+static int brd_insert_page(struct brd_device *brd, sector_t sector, gfp_t gfp)
+{
+	pgoff_t idx;
+	struct page *page;
+	int ret = 0;
+
+	page = brd_lookup_page(brd, sector);
+	if (page)
+		return 0;
+
+	page = alloc_page(gfp | __GFP_ZERO | __GFP_HIGHMEM);
+	if (!page)
+		return -ENOMEM;
+
+	if (radix_tree_maybe_preload(gfp)) {
+		__free_page(page);
+		return -ENOMEM;
+	}
+
+	spin_lock(&brd->brd_lock);
+	idx = sector >> PAGE_SECTORS_SHIFT;
+	page->index = idx;
+	if (radix_tree_insert(&brd->brd_pages, idx, page)) {
+		__free_page(page);
+		page = radix_tree_lookup(&brd->brd_pages, idx);
+		if (!page)
+			ret = -ENOMEM;
+		else if (page->index != idx)
+			ret = -EIO;
+	} else {
+		brd->brd_nr_pages++;
+	}
+	spin_unlock(&brd->brd_lock);
+
+	radix_tree_preload_end();
+	return ret;
+}
+
+/*
+ * Free all backing store pages and radix tree. This must only be called when
+ * there are no other users of the device.
+ */
+#define FREE_BATCH 16
+static void brd_free_pages(struct brd_device *brd)
+{
+	unsigned long pos = 0;
+	struct page *pages[FREE_BATCH];
+	int nr_pages;
+
+	do {
+		int i;
+
+		nr_pages = radix_tree_gang_lookup(&brd->brd_pages,
+				(void **)pages, pos, FREE_BATCH);
+
+		for (i = 0; i < nr_pages; i++) {
+			void *ret;
+
+			BUG_ON(pages[i]->index < pos);
+			pos = pages[i]->index;
+			ret = radix_tree_delete(&brd->brd_pages, pos);
+			BUG_ON(!ret || ret != pages[i]);
+			__free_page(pages[i]);
+		}
+
+		pos++;
+
+		/*
+		 * It takes 3.4 seconds to remove 80GiB ramdisk.
+		 * So, we need cond_resched to avoid stalling the CPU.
+		 */
+		cond_resched();
+
+		/*
+		 * This assumes radix_tree_gang_lookup always returns as
+		 * many pages as possible. If the radix-tree code changes,
+		 * so will this have to.
+		 */
+	} while (nr_pages == FREE_BATCH);
+}
+
+/*
+ * copy_to_brd_setup must be called before copy_to_brd. It may sleep.
+ */
+static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n,
+			     gfp_t gfp)
+{
+	unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
+	size_t copy;
+	int ret;
+
+	copy = min_t(size_t, n, PAGE_SIZE - offset);
+	ret = brd_insert_page(brd, sector, gfp);
+	if (ret)
+		return ret;
+	if (copy < n) {
+		sector += copy >> SECTOR_SHIFT;
+		ret = brd_insert_page(brd, sector, gfp);
+	}
+	return ret;
+}
+
+/*
+ * Copy n bytes from src to the brd starting at sector. Does not sleep.
+ */
+static void copy_to_brd(struct brd_device *brd, const void *src,
+			sector_t sector, size_t n)
+{
+	struct page *page;
+	void *dst;
+	unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
+	size_t copy;
+
+	copy = min_t(size_t, n, PAGE_SIZE - offset);
+	page = brd_lookup_page(brd, sector);
+	BUG_ON(!page);
+
+	dst = kmap_atomic(page);
+	memcpy(dst + offset, src, copy);
+	kunmap_atomic(dst);
+
+	if (copy < n) {
+		src += copy;
+		sector += copy >> SECTOR_SHIFT;
+		copy = n - copy;
+		page = brd_lookup_page(brd, sector);
+		BUG_ON(!page);
+
+		dst = kmap_atomic(page);
+		memcpy(dst, src, copy);
+		kunmap_atomic(dst);
+	}
+}
+
+/*
+ * Copy n bytes to dst from the brd starting at sector. Does not sleep.
+ */
+static void copy_from_brd(void *dst, struct brd_device *brd,
+			sector_t sector, size_t n)
+{
+	struct page *page;
+	void *src;
+	unsigned int offset = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
+	size_t copy;
+
+	copy = min_t(size_t, n, PAGE_SIZE - offset);
+	page = brd_lookup_page(brd, sector);
+	if (page) {
+		src = kmap_atomic(page);
+		memcpy(dst, src + offset, copy);
+		kunmap_atomic(src);
+	} else
+		memset(dst, 0, copy);
+
+	if (copy < n) {
+		dst += copy;
+		sector += copy >> SECTOR_SHIFT;
+		copy = n - copy;
+		page = brd_lookup_page(brd, sector);
+		if (page) {
+			src = kmap_atomic(page);
+			memcpy(dst, src, copy);
+			kunmap_atomic(src);
+		} else
+			memset(dst, 0, copy);
+	}
+}
+
+/*
+ * Process a single bvec of a bio.
+ */
+static int brd_do_bvec(struct brd_device *brd, struct page *page,
+			unsigned int len, unsigned int off, blk_opf_t opf,
+			sector_t sector)
+{
+	void *mem;
+	int err = 0;
+
+	if (op_is_write(opf)) {
+		/*
+		 * Must use NOIO because we don't want to recurse back into the
+		 * block or filesystem layers from page reclaim.
+		 */
+		gfp_t gfp = opf & REQ_NOWAIT ? GFP_NOWAIT : GFP_NOIO;
+
+		err = copy_to_brd_setup(brd, sector, len, gfp);
+		if (err)
+			goto out;
+	}
+
+	mem = kmap_atomic(page);
+	if (!op_is_write(opf)) {
+		copy_from_brd(mem + off, brd, sector, len);
+		flush_dcache_page(page);
+	} else {
+		flush_dcache_page(page);
+		copy_to_brd(brd, mem + off, sector, len);
+	}
+	kunmap_atomic(mem);
+
+out:
+	return err;
+}
+
+static void brd_submit_bio(struct bio *bio)
+{
+	struct brd_device *brd = bio->bi_bdev->bd_disk->private_data;
+	sector_t sector = bio->bi_iter.bi_sector;
+	struct bio_vec bvec;
+	struct bvec_iter iter;
+
+	bio_for_each_segment(bvec, bio, iter) {
+		unsigned int len = bvec.bv_len;
+		int err;
+
+		/* Don't support un-aligned buffer */
+		WARN_ON_ONCE((bvec.bv_offset & (SECTOR_SIZE - 1)) ||
+				(len & (SECTOR_SIZE - 1)));
+
+		err = brd_do_bvec(brd, bvec.bv_page, len, bvec.bv_offset,
+				  bio->bi_opf, sector);
+		if (err) {
+			if (err == -ENOMEM && bio->bi_opf & REQ_NOWAIT) {
+				bio_wouldblock_error(bio);
+				return;
+			}
+			bio_io_error(bio);
+			return;
+		}
+		sector += len >> SECTOR_SHIFT;
+	}
+
+	bio_endio(bio);
+}
+
+static int brd_rw_page(struct block_device *bdev, sector_t sector,
+		       struct page *page, enum req_op op)
+{
+	struct brd_device *brd = bdev->bd_disk->private_data;
+	int err;
+
+	if (PageTransHuge(page))
+		return -ENOTSUPP;
+	err = brd_do_bvec(brd, page, PAGE_SIZE, 0, op, sector);
+	page_endio(page, op_is_write(op), err);
+	return err;
+}
+
+static const struct block_device_operations brd_fops = {
+	.owner =		THIS_MODULE,
+	.submit_bio =		brd_submit_bio,
+	.rw_page =		brd_rw_page,
+};
+
+/*
+ * And now the modules code and kernel interface.
+ */
+static int rd_nr = CONFIG_BLK_DEV_RAM_COUNT;
+module_param(rd_nr, int, 0444);
+MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices");
+
+unsigned long rd_size = CONFIG_BLK_DEV_RAM_SIZE;
+module_param(rd_size, ulong, 0444);
+MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes.");
+
+static int max_part = 1;
+module_param(max_part, int, 0444);
+MODULE_PARM_DESC(max_part, "Num Minors to reserve between devices");
+
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR);
+MODULE_ALIAS("rd");
+
+#ifndef MODULE
+/* Legacy boot options - nonmodular */
+static int __init ramdisk_size(char *str)
+{
+	rd_size = simple_strtol(str, NULL, 0);
+	return 1;
+}
+__setup("ramdisk_size=", ramdisk_size);
+#endif
+
+/*
+ * The device scheme is derived from loop.c. Keep them in synch where possible
+ * (should share code eventually).
+ */
+static LIST_HEAD(brd_devices);
+static struct dentry *brd_debugfs_dir;
+
+static int brd_alloc(int i)
+{
+	struct brd_device *brd;
+	struct gendisk *disk;
+	char buf[DISK_NAME_LEN];
+	int err = -ENOMEM;
+
+	list_for_each_entry(brd, &brd_devices, brd_list)
+		if (brd->brd_number == i)
+			return -EEXIST;
+	brd = kzalloc(sizeof(*brd), GFP_KERNEL);
+	if (!brd)
+		return -ENOMEM;
+	brd->brd_number		= i;
+	list_add_tail(&brd->brd_list, &brd_devices);
+
+	spin_lock_init(&brd->brd_lock);
+	INIT_RADIX_TREE(&brd->brd_pages, GFP_ATOMIC);
+
+	snprintf(buf, DISK_NAME_LEN, "ram%d", i);
+	if (!IS_ERR_OR_NULL(brd_debugfs_dir))
+		debugfs_create_u64(buf, 0444, brd_debugfs_dir,
+				&brd->brd_nr_pages);
+
+	disk = brd->brd_disk = blk_alloc_disk(NUMA_NO_NODE);
+	if (!disk)
+		goto out_free_dev;
+
+	disk->major		= RAMDISK_MAJOR;
+	disk->first_minor	= i * max_part;
+	disk->minors		= max_part;
+	disk->fops		= &brd_fops;
+	disk->private_data	= brd;
+	strscpy(disk->disk_name, buf, DISK_NAME_LEN);
+	set_capacity(disk, rd_size * 2);
+	
+	/*
+	 * This is so fdisk will align partitions on 4k, because of
+	 * direct_access API needing 4k alignment, returning a PFN
+	 * (This is only a problem on very small devices <= 4M,
+	 *  otherwise fdisk will align on 1M. Regardless this call
+	 *  is harmless)
+	 */
+	blk_queue_physical_block_size(disk->queue, PAGE_SIZE);
+
+	/* Tell the block layer that this is not a rotational device */
+	blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
+	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue);
+	blk_queue_flag_set(QUEUE_FLAG_NOWAIT, disk->queue);
+	err = add_disk(disk);
+	if (err)
+		goto out_cleanup_disk;
+
+	return 0;
+
+out_cleanup_disk:
+	put_disk(disk);
+out_free_dev:
+	list_del(&brd->brd_list);
+	kfree(brd);
+	return err;
+}
+
+static void brd_probe(dev_t dev)
+{
+	brd_alloc(MINOR(dev) / max_part);
+}
+
+static void brd_cleanup(void)
+{
+	struct brd_device *brd, *next;
+
+	debugfs_remove_recursive(brd_debugfs_dir);
+
+	list_for_each_entry_safe(brd, next, &brd_devices, brd_list) {
+		del_gendisk(brd->brd_disk);
+		put_disk(brd->brd_disk);
+		brd_free_pages(brd);
+		list_del(&brd->brd_list);
+		kfree(brd);
+	}
+}
+
+static inline void brd_check_and_reset_par(void)
+{
+	if (unlikely(!max_part))
+		max_part = 1;
+
+	/*
+	 * make sure 'max_part' can be divided exactly by (1U << MINORBITS),
+	 * otherwise, it is possiable to get same dev_t when adding partitions.
+	 */
+	if ((1U << MINORBITS) % max_part != 0)
+		max_part = 1UL << fls(max_part);
+
+	if (max_part > DISK_MAX_PARTS) {
+		pr_info("brd: max_part can't be larger than %d, reset max_part = %d.\n",
+			DISK_MAX_PARTS, DISK_MAX_PARTS);
+		max_part = DISK_MAX_PARTS;
+	}
+}
+
+static int __init brd_init(void)
+{
+	int err, i;
+
+	brd_check_and_reset_par();
+
+	brd_debugfs_dir = debugfs_create_dir("ramdisk_pages", NULL);
+
+	for (i = 0; i < rd_nr; i++) {
+		err = brd_alloc(i);
+		if (err)
+			goto out_free;
+	}
+
+	/*
+	 * brd module now has a feature to instantiate underlying device
+	 * structure on-demand, provided that there is an access dev node.
+	 *
+	 * (1) if rd_nr is specified, create that many upfront. else
+	 *     it defaults to CONFIG_BLK_DEV_RAM_COUNT
+	 * (2) User can further extend brd devices by create dev node themselves
+	 *     and have kernel automatically instantiate actual device
+	 *     on-demand. Example:
+	 *		mknod /path/devnod_name b 1 X	# 1 is the rd major
+	 *		fdisk -l /path/devnod_name
+	 *	If (X / max_part) was not already created it will be created
+	 *	dynamically.
+	 */
+
+	if (__register_blkdev(RAMDISK_MAJOR, "ramdisk", brd_probe)) {
+		err = -EIO;
+		goto out_free;
+	}
+
+	pr_info("brd: module loaded\n");
+	return 0;
+
+out_free:
+	brd_cleanup();
+
+	pr_info("brd: module NOT loaded !!!\n");
+	return err;
+}
+
+static void __exit brd_exit(void)
+{
+
+	unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
+	brd_cleanup();
+
+	pr_info("brd: module unloaded\n");
+}
+
+module_init(brd_init);
+module_exit(brd_exit);
+
diff --git a/drivers/block/drbd/Kconfig b/drivers/block/drbd/Kconfig
new file mode 100644
index 000000000..cbacddc55
--- /dev/null
+++ b/drivers/block/drbd/Kconfig
@@ -0,0 +1,73 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# DRBD device driver configuration
+#
+
+comment "DRBD disabled because PROC_FS or INET not selected"
+	depends on PROC_FS='n' || INET='n'
+
+config BLK_DEV_DRBD
+	tristate "DRBD Distributed Replicated Block Device support"
+	depends on PROC_FS && INET
+	select LRU_CACHE
+	select LIBCRC32C
+	help
+
+	  NOTE: In order to authenticate connections you have to select
+	  CRYPTO_HMAC and a hash function as well.
+
+	  DRBD is a shared-nothing, synchronously replicated block device. It
+	  is designed to serve as a building block for high availability
+	  clusters and in this context, is a "drop-in" replacement for shared
+	  storage. Simplistically, you could see it as a network RAID 1.
+
+	  Each minor device has a role, which can be 'primary' or 'secondary'.
+	  On the node with the primary device the application is supposed to
+	  run and to access the device (/dev/drbdX). Every write is sent to
+	  the local 'lower level block device' and, across the network, to the
+	  node with the device in 'secondary' state.  The secondary device
+	  simply writes the data to its lower level block device.
+
+	  DRBD can also be used in dual-Primary mode (device writable on both
+	  nodes), which means it can exhibit shared disk semantics in a
+	  shared-nothing cluster.  Needless to say, on top of dual-Primary
+	  DRBD utilizing a cluster file system is necessary to maintain for
+	  cache coherency.
+
+	  For automatic failover you need a cluster manager (e.g. heartbeat).
+	  See also: https://www.drbd.org/, http://www.linux-ha.org
+
+	  If unsure, say N.
+
+config DRBD_FAULT_INJECTION
+	bool "DRBD fault injection"
+	depends on BLK_DEV_DRBD
+	help
+
+	  Say Y here if you want to simulate IO errors, in order to test DRBD's
+	  behavior.
+
+	  The actual simulation of IO errors is done by writing 3 values to
+	  /sys/module/drbd/parameters/
+
+	  enable_faults: bitmask of...
+	  1	meta data write
+	  2               read
+	  4	resync data write
+	  8	            read
+	  16	data write
+	  32	data read
+	  64	read ahead
+	  128	kmalloc of bitmap
+	  256	allocation of peer_requests
+	  512	insert data corruption on receiving side
+
+	  fault_devs: bitmask of minor numbers
+	  fault_rate: frequency in percent
+
+	  Example: Simulate data write errors on /dev/drbd0 with a probability of 5%.
+		echo 16 > /sys/module/drbd/parameters/enable_faults
+		echo 1 > /sys/module/drbd/parameters/fault_devs
+		echo 5 > /sys/module/drbd/parameters/fault_rate
+
+	  If unsure, say N.
diff --git a/drivers/block/drbd/Makefile b/drivers/block/drbd/Makefile
new file mode 100644
index 000000000..8bd534697
--- /dev/null
+++ b/drivers/block/drbd/Makefile
@@ -0,0 +1,9 @@
+# SPDX-License-Identifier: GPL-2.0
+drbd-y := drbd_bitmap.o drbd_proc.o
+drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o
+drbd-y += drbd_main.o drbd_strings.o drbd_nl.o
+drbd-y += drbd_interval.o drbd_state.o
+drbd-y += drbd_nla.o
+drbd-$(CONFIG_DEBUG_FS) += drbd_debugfs.o
+
+obj-$(CONFIG_BLK_DEV_DRBD)     += drbd.o
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
new file mode 100644
index 000000000..e27478ae5
--- /dev/null
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -0,0 +1,1236 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+   drbd_actlog.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+
+ */
+
+#include <linux/slab.h>
+#include <linux/crc32c.h>
+#include <linux/drbd.h>
+#include <linux/drbd_limits.h>
+#include "drbd_int.h"
+
+
+enum al_transaction_types {
+	AL_TR_UPDATE = 0,
+	AL_TR_INITIALIZED = 0xffff
+};
+/* all fields on disc in big endian */
+struct __packed al_transaction_on_disk {
+	/* don't we all like magic */
+	__be32	magic;
+
+	/* to identify the most recent transaction block
+	 * in the on disk ring buffer */
+	__be32	tr_number;
+
+	/* checksum on the full 4k block, with this field set to 0. */
+	__be32	crc32c;
+
+	/* type of transaction, special transaction types like:
+	 * purge-all, set-all-idle, set-all-active, ... to-be-defined
+	 * see also enum al_transaction_types */
+	__be16	transaction_type;
+
+	/* we currently allow only a few thousand extents,
+	 * so 16bit will be enough for the slot number. */
+
+	/* how many updates in this transaction */
+	__be16	n_updates;
+
+	/* maximum slot number, "al-extents" in drbd.conf speak.
+	 * Having this in each transaction should make reconfiguration
+	 * of that parameter easier. */
+	__be16	context_size;
+
+	/* slot number the context starts with */
+	__be16	context_start_slot_nr;
+
+	/* Some reserved bytes.  Expected usage is a 64bit counter of
+	 * sectors-written since device creation, and other data generation tag
+	 * supporting usage */
+	__be32	__reserved[4];
+
+	/* --- 36 byte used --- */
+
+	/* Reserve space for up to AL_UPDATES_PER_TRANSACTION changes
+	 * in one transaction, then use the remaining byte in the 4k block for
+	 * context information.  "Flexible" number of updates per transaction
+	 * does not help, as we have to account for the case when all update
+	 * slots are used anyways, so it would only complicate code without
+	 * additional benefit.
+	 */
+	__be16	update_slot_nr[AL_UPDATES_PER_TRANSACTION];
+
+	/* but the extent number is 32bit, which at an extent size of 4 MiB
+	 * allows to cover device sizes of up to 2**54 Byte (16 PiB) */
+	__be32	update_extent_nr[AL_UPDATES_PER_TRANSACTION];
+
+	/* --- 420 bytes used (36 + 64*6) --- */
+
+	/* 4096 - 420 = 3676 = 919 * 4 */
+	__be32	context[AL_CONTEXT_PER_TRANSACTION];
+};
+
+void *drbd_md_get_buffer(struct drbd_device *device, const char *intent)
+{
+	int r;
+
+	wait_event(device->misc_wait,
+		   (r = atomic_cmpxchg(&device->md_io.in_use, 0, 1)) == 0 ||
+		   device->state.disk <= D_FAILED);
+
+	if (r)
+		return NULL;
+
+	device->md_io.current_use = intent;
+	device->md_io.start_jif = jiffies;
+	device->md_io.submit_jif = device->md_io.start_jif - 1;
+	return page_address(device->md_io.page);
+}
+
+void drbd_md_put_buffer(struct drbd_device *device)
+{
+	if (atomic_dec_and_test(&device->md_io.in_use))
+		wake_up(&device->misc_wait);
+}
+
+void wait_until_done_or_force_detached(struct drbd_device *device, struct drbd_backing_dev *bdev,
+				     unsigned int *done)
+{
+	long dt;
+
+	rcu_read_lock();
+	dt = rcu_dereference(bdev->disk_conf)->disk_timeout;
+	rcu_read_unlock();
+	dt = dt * HZ / 10;
+	if (dt == 0)
+		dt = MAX_SCHEDULE_TIMEOUT;
+
+	dt = wait_event_timeout(device->misc_wait,
+			*done || test_bit(FORCE_DETACH, &device->flags), dt);
+	if (dt == 0) {
+		drbd_err(device, "meta-data IO operation timed out\n");
+		drbd_chk_io_error(device, 1, DRBD_FORCE_DETACH);
+	}
+}
+
+static int _drbd_md_sync_page_io(struct drbd_device *device,
+				 struct drbd_backing_dev *bdev,
+				 sector_t sector, enum req_op op)
+{
+	struct bio *bio;
+	/* we do all our meta data IO in aligned 4k blocks. */
+	const int size = 4096;
+	int err;
+	blk_opf_t op_flags = 0;
+
+	device->md_io.done = 0;
+	device->md_io.error = -ENODEV;
+
+	if ((op == REQ_OP_WRITE) && !test_bit(MD_NO_FUA, &device->flags))
+		op_flags |= REQ_FUA | REQ_PREFLUSH;
+	op_flags |= REQ_SYNC;
+
+	bio = bio_alloc_bioset(bdev->md_bdev, 1, op | op_flags, GFP_NOIO,
+			       &drbd_md_io_bio_set);
+	bio->bi_iter.bi_sector = sector;
+	err = -EIO;
+	if (bio_add_page(bio, device->md_io.page, size, 0) != size)
+		goto out;
+	bio->bi_private = device;
+	bio->bi_end_io = drbd_md_endio;
+
+	if (op != REQ_OP_WRITE && device->state.disk == D_DISKLESS && device->ldev == NULL)
+		/* special case, drbd_md_read() during drbd_adm_attach(): no get_ldev */
+		;
+	else if (!get_ldev_if_state(device, D_ATTACHING)) {
+		/* Corresponding put_ldev in drbd_md_endio() */
+		drbd_err(device, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n");
+		err = -ENODEV;
+		goto out;
+	}
+
+	bio_get(bio); /* one bio_put() is in the completion handler */
+	atomic_inc(&device->md_io.in_use); /* drbd_md_put_buffer() is in the completion handler */
+	device->md_io.submit_jif = jiffies;
+	if (drbd_insert_fault(device, (op == REQ_OP_WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD))
+		bio_io_error(bio);
+	else
+		submit_bio(bio);
+	wait_until_done_or_force_detached(device, bdev, &device->md_io.done);
+	if (!bio->bi_status)
+		err = device->md_io.error;
+
+ out:
+	bio_put(bio);
+	return err;
+}
+
+int drbd_md_sync_page_io(struct drbd_device *device, struct drbd_backing_dev *bdev,
+			 sector_t sector, enum req_op op)
+{
+	int err;
+	D_ASSERT(device, atomic_read(&device->md_io.in_use) == 1);
+
+	BUG_ON(!bdev->md_bdev);
+
+	dynamic_drbd_dbg(device, "meta_data io: %s [%d]:%s(,%llus,%s) %pS\n",
+	     current->comm, current->pid, __func__,
+	     (unsigned long long)sector, (op == REQ_OP_WRITE) ? "WRITE" : "READ",
+	     (void*)_RET_IP_ );
+
+	if (sector < drbd_md_first_sector(bdev) ||
+	    sector + 7 > drbd_md_last_sector(bdev))
+		drbd_alert(device, "%s [%d]:%s(,%llus,%s) out of range md access!\n",
+		     current->comm, current->pid, __func__,
+		     (unsigned long long)sector,
+		     (op == REQ_OP_WRITE) ? "WRITE" : "READ");
+
+	err = _drbd_md_sync_page_io(device, bdev, sector, op);
+	if (err) {
+		drbd_err(device, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n",
+		    (unsigned long long)sector,
+		    (op == REQ_OP_WRITE) ? "WRITE" : "READ", err);
+	}
+	return err;
+}
+
+static struct bm_extent *find_active_resync_extent(struct drbd_device *device, unsigned int enr)
+{
+	struct lc_element *tmp;
+	tmp = lc_find(device->resync, enr/AL_EXT_PER_BM_SECT);
+	if (unlikely(tmp != NULL)) {
+		struct bm_extent  *bm_ext = lc_entry(tmp, struct bm_extent, lce);
+		if (test_bit(BME_NO_WRITES, &bm_ext->flags))
+			return bm_ext;
+	}
+	return NULL;
+}
+
+static struct lc_element *_al_get(struct drbd_device *device, unsigned int enr, bool nonblock)
+{
+	struct lc_element *al_ext;
+	struct bm_extent *bm_ext;
+	int wake;
+
+	spin_lock_irq(&device->al_lock);
+	bm_ext = find_active_resync_extent(device, enr);
+	if (bm_ext) {
+		wake = !test_and_set_bit(BME_PRIORITY, &bm_ext->flags);
+		spin_unlock_irq(&device->al_lock);
+		if (wake)
+			wake_up(&device->al_wait);
+		return NULL;
+	}
+	if (nonblock)
+		al_ext = lc_try_get(device->act_log, enr);
+	else
+		al_ext = lc_get(device->act_log, enr);
+	spin_unlock_irq(&device->al_lock);
+	return al_ext;
+}
+
+bool drbd_al_begin_io_fastpath(struct drbd_device *device, struct drbd_interval *i)
+{
+	/* for bios crossing activity log extent boundaries,
+	 * we may need to activate two extents in one go */
+	unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
+	unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
+
+	D_ASSERT(device, first <= last);
+	D_ASSERT(device, atomic_read(&device->local_cnt) > 0);
+
+	/* FIXME figure out a fast path for bios crossing AL extent boundaries */
+	if (first != last)
+		return false;
+
+	return _al_get(device, first, true);
+}
+
+bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *i)
+{
+	/* for bios crossing activity log extent boundaries,
+	 * we may need to activate two extents in one go */
+	unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
+	unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
+	unsigned enr;
+	bool need_transaction = false;
+
+	D_ASSERT(device, first <= last);
+	D_ASSERT(device, atomic_read(&device->local_cnt) > 0);
+
+	for (enr = first; enr <= last; enr++) {
+		struct lc_element *al_ext;
+		wait_event(device->al_wait,
+				(al_ext = _al_get(device, enr, false)) != NULL);
+		if (al_ext->lc_number != enr)
+			need_transaction = true;
+	}
+	return need_transaction;
+}
+
+#if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)
+/* Currently BM_BLOCK_SHIFT, BM_EXT_SHIFT and AL_EXTENT_SHIFT
+ * are still coupled, or assume too much about their relation.
+ * Code below will not work if this is violated.
+ * Will be cleaned up with some followup patch.
+ */
+# error FIXME
+#endif
+
+static unsigned int al_extent_to_bm_page(unsigned int al_enr)
+{
+	return al_enr >>
+		/* bit to page */
+		((PAGE_SHIFT + 3) -
+		/* al extent number to bit */
+		 (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT));
+}
+
+static sector_t al_tr_number_to_on_disk_sector(struct drbd_device *device)
+{
+	const unsigned int stripes = device->ldev->md.al_stripes;
+	const unsigned int stripe_size_4kB = device->ldev->md.al_stripe_size_4k;
+
+	/* transaction number, modulo on-disk ring buffer wrap around */
+	unsigned int t = device->al_tr_number % (device->ldev->md.al_size_4k);
+
+	/* ... to aligned 4k on disk block */
+	t = ((t % stripes) * stripe_size_4kB) + t/stripes;
+
+	/* ... to 512 byte sector in activity log */
+	t *= 8;
+
+	/* ... plus offset to the on disk position */
+	return device->ldev->md.md_offset + device->ldev->md.al_offset + t;
+}
+
+static int __al_write_transaction(struct drbd_device *device, struct al_transaction_on_disk *buffer)
+{
+	struct lc_element *e;
+	sector_t sector;
+	int i, mx;
+	unsigned extent_nr;
+	unsigned crc = 0;
+	int err = 0;
+
+	memset(buffer, 0, sizeof(*buffer));
+	buffer->magic = cpu_to_be32(DRBD_AL_MAGIC);
+	buffer->tr_number = cpu_to_be32(device->al_tr_number);
+
+	i = 0;
+
+	drbd_bm_reset_al_hints(device);
+
+	/* Even though no one can start to change this list
+	 * once we set the LC_LOCKED -- from drbd_al_begin_io(),
+	 * lc_try_lock_for_transaction() --, someone may still
+	 * be in the process of changing it. */
+	spin_lock_irq(&device->al_lock);
+	list_for_each_entry(e, &device->act_log->to_be_changed, list) {
+		if (i == AL_UPDATES_PER_TRANSACTION) {
+			i++;
+			break;
+		}
+		buffer->update_slot_nr[i] = cpu_to_be16(e->lc_index);
+		buffer->update_extent_nr[i] = cpu_to_be32(e->lc_new_number);
+		if (e->lc_number != LC_FREE)
+			drbd_bm_mark_for_writeout(device,
+					al_extent_to_bm_page(e->lc_number));
+		i++;
+	}
+	spin_unlock_irq(&device->al_lock);
+	BUG_ON(i > AL_UPDATES_PER_TRANSACTION);
+
+	buffer->n_updates = cpu_to_be16(i);
+	for ( ; i < AL_UPDATES_PER_TRANSACTION; i++) {
+		buffer->update_slot_nr[i] = cpu_to_be16(-1);
+		buffer->update_extent_nr[i] = cpu_to_be32(LC_FREE);
+	}
+
+	buffer->context_size = cpu_to_be16(device->act_log->nr_elements);
+	buffer->context_start_slot_nr = cpu_to_be16(device->al_tr_cycle);
+
+	mx = min_t(int, AL_CONTEXT_PER_TRANSACTION,
+		   device->act_log->nr_elements - device->al_tr_cycle);
+	for (i = 0; i < mx; i++) {
+		unsigned idx = device->al_tr_cycle + i;
+		extent_nr = lc_element_by_index(device->act_log, idx)->lc_number;
+		buffer->context[i] = cpu_to_be32(extent_nr);
+	}
+	for (; i < AL_CONTEXT_PER_TRANSACTION; i++)
+		buffer->context[i] = cpu_to_be32(LC_FREE);
+
+	device->al_tr_cycle += AL_CONTEXT_PER_TRANSACTION;
+	if (device->al_tr_cycle >= device->act_log->nr_elements)
+		device->al_tr_cycle = 0;
+
+	sector = al_tr_number_to_on_disk_sector(device);
+
+	crc = crc32c(0, buffer, 4096);
+	buffer->crc32c = cpu_to_be32(crc);
+
+	if (drbd_bm_write_hinted(device))
+		err = -EIO;
+	else {
+		bool write_al_updates;
+		rcu_read_lock();
+		write_al_updates = rcu_dereference(device->ldev->disk_conf)->al_updates;
+		rcu_read_unlock();
+		if (write_al_updates) {
+			if (drbd_md_sync_page_io(device, device->ldev, sector, REQ_OP_WRITE)) {
+				err = -EIO;
+				drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR);
+			} else {
+				device->al_tr_number++;
+				device->al_writ_cnt++;
+			}
+		}
+	}
+
+	return err;
+}
+
+static int al_write_transaction(struct drbd_device *device)
+{
+	struct al_transaction_on_disk *buffer;
+	int err;
+
+	if (!get_ldev(device)) {
+		drbd_err(device, "disk is %s, cannot start al transaction\n",
+			drbd_disk_str(device->state.disk));
+		return -EIO;
+	}
+
+	/* The bitmap write may have failed, causing a state change. */
+	if (device->state.disk < D_INCONSISTENT) {
+		drbd_err(device,
+			"disk is %s, cannot write al transaction\n",
+			drbd_disk_str(device->state.disk));
+		put_ldev(device);
+		return -EIO;
+	}
+
+	/* protects md_io_buffer, al_tr_cycle, ... */
+	buffer = drbd_md_get_buffer(device, __func__);
+	if (!buffer) {
+		drbd_err(device, "disk failed while waiting for md_io buffer\n");
+		put_ldev(device);
+		return -ENODEV;
+	}
+
+	err = __al_write_transaction(device, buffer);
+
+	drbd_md_put_buffer(device);
+	put_ldev(device);
+
+	return err;
+}
+
+
+void drbd_al_begin_io_commit(struct drbd_device *device)
+{
+	bool locked = false;
+
+	/* Serialize multiple transactions.
+	 * This uses test_and_set_bit, memory barrier is implicit.
+	 */
+	wait_event(device->al_wait,
+			device->act_log->pending_changes == 0 ||
+			(locked = lc_try_lock_for_transaction(device->act_log)));
+
+	if (locked) {
+		/* Double check: it may have been committed by someone else,
+		 * while we have been waiting for the lock. */
+		if (device->act_log->pending_changes) {
+			bool write_al_updates;
+
+			rcu_read_lock();
+			write_al_updates = rcu_dereference(device->ldev->disk_conf)->al_updates;
+			rcu_read_unlock();
+
+			if (write_al_updates)
+				al_write_transaction(device);
+			spin_lock_irq(&device->al_lock);
+			/* FIXME
+			if (err)
+				we need an "lc_cancel" here;
+			*/
+			lc_committed(device->act_log);
+			spin_unlock_irq(&device->al_lock);
+		}
+		lc_unlock(device->act_log);
+		wake_up(&device->al_wait);
+	}
+}
+
+/*
+ * @delegate:   delegate activity log I/O to the worker thread
+ */
+void drbd_al_begin_io(struct drbd_device *device, struct drbd_interval *i)
+{
+	if (drbd_al_begin_io_prepare(device, i))
+		drbd_al_begin_io_commit(device);
+}
+
+int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval *i)
+{
+	struct lru_cache *al = device->act_log;
+	/* for bios crossing activity log extent boundaries,
+	 * we may need to activate two extents in one go */
+	unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
+	unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
+	unsigned nr_al_extents;
+	unsigned available_update_slots;
+	unsigned enr;
+
+	D_ASSERT(device, first <= last);
+
+	nr_al_extents = 1 + last - first; /* worst case: all touched extends are cold. */
+	available_update_slots = min(al->nr_elements - al->used,
+				al->max_pending_changes - al->pending_changes);
+
+	/* We want all necessary updates for a given request within the same transaction
+	 * We could first check how many updates are *actually* needed,
+	 * and use that instead of the worst-case nr_al_extents */
+	if (available_update_slots < nr_al_extents) {
+		/* Too many activity log extents are currently "hot".
+		 *
+		 * If we have accumulated pending changes already,
+		 * we made progress.
+		 *
+		 * If we cannot get even a single pending change through,
+		 * stop the fast path until we made some progress,
+		 * or requests to "cold" extents could be starved. */
+		if (!al->pending_changes)
+			__set_bit(__LC_STARVING, &device->act_log->flags);
+		return -ENOBUFS;
+	}
+
+	/* Is resync active in this area? */
+	for (enr = first; enr <= last; enr++) {
+		struct lc_element *tmp;
+		tmp = lc_find(device->resync, enr/AL_EXT_PER_BM_SECT);
+		if (unlikely(tmp != NULL)) {
+			struct bm_extent  *bm_ext = lc_entry(tmp, struct bm_extent, lce);
+			if (test_bit(BME_NO_WRITES, &bm_ext->flags)) {
+				if (!test_and_set_bit(BME_PRIORITY, &bm_ext->flags))
+					return -EBUSY;
+				return -EWOULDBLOCK;
+			}
+		}
+	}
+
+	/* Checkout the refcounts.
+	 * Given that we checked for available elements and update slots above,
+	 * this has to be successful. */
+	for (enr = first; enr <= last; enr++) {
+		struct lc_element *al_ext;
+		al_ext = lc_get_cumulative(device->act_log, enr);
+		if (!al_ext)
+			drbd_info(device, "LOGIC BUG for enr=%u\n", enr);
+	}
+	return 0;
+}
+
+void drbd_al_complete_io(struct drbd_device *device, struct drbd_interval *i)
+{
+	/* for bios crossing activity log extent boundaries,
+	 * we may need to activate two extents in one go */
+	unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
+	unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
+	unsigned enr;
+	struct lc_element *extent;
+	unsigned long flags;
+
+	D_ASSERT(device, first <= last);
+	spin_lock_irqsave(&device->al_lock, flags);
+
+	for (enr = first; enr <= last; enr++) {
+		extent = lc_find(device->act_log, enr);
+		if (!extent) {
+			drbd_err(device, "al_complete_io() called on inactive extent %u\n", enr);
+			continue;
+		}
+		lc_put(device->act_log, extent);
+	}
+	spin_unlock_irqrestore(&device->al_lock, flags);
+	wake_up(&device->al_wait);
+}
+
+static int _try_lc_del(struct drbd_device *device, struct lc_element *al_ext)
+{
+	int rv;
+
+	spin_lock_irq(&device->al_lock);
+	rv = (al_ext->refcnt == 0);
+	if (likely(rv))
+		lc_del(device->act_log, al_ext);
+	spin_unlock_irq(&device->al_lock);
+
+	return rv;
+}
+
+/**
+ * drbd_al_shrink() - Removes all active extents form the activity log
+ * @device:	DRBD device.
+ *
+ * Removes all active extents form the activity log, waiting until
+ * the reference count of each entry dropped to 0 first, of course.
+ *
+ * You need to lock device->act_log with lc_try_lock() / lc_unlock()
+ */
+void drbd_al_shrink(struct drbd_device *device)
+{
+	struct lc_element *al_ext;
+	int i;
+
+	D_ASSERT(device, test_bit(__LC_LOCKED, &device->act_log->flags));
+
+	for (i = 0; i < device->act_log->nr_elements; i++) {
+		al_ext = lc_element_by_index(device->act_log, i);
+		if (al_ext->lc_number == LC_FREE)
+			continue;
+		wait_event(device->al_wait, _try_lc_del(device, al_ext));
+	}
+
+	wake_up(&device->al_wait);
+}
+
+int drbd_al_initialize(struct drbd_device *device, void *buffer)
+{
+	struct al_transaction_on_disk *al = buffer;
+	struct drbd_md *md = &device->ldev->md;
+	int al_size_4k = md->al_stripes * md->al_stripe_size_4k;
+	int i;
+
+	__al_write_transaction(device, al);
+	/* There may or may not have been a pending transaction. */
+	spin_lock_irq(&device->al_lock);
+	lc_committed(device->act_log);
+	spin_unlock_irq(&device->al_lock);
+
+	/* The rest of the transactions will have an empty "updates" list, and
+	 * are written out only to provide the context, and to initialize the
+	 * on-disk ring buffer. */
+	for (i = 1; i < al_size_4k; i++) {
+		int err = __al_write_transaction(device, al);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+static const char *drbd_change_sync_fname[] = {
+	[RECORD_RS_FAILED] = "drbd_rs_failed_io",
+	[SET_IN_SYNC] = "drbd_set_in_sync",
+	[SET_OUT_OF_SYNC] = "drbd_set_out_of_sync"
+};
+
+/* ATTENTION. The AL's extents are 4MB each, while the extents in the
+ * resync LRU-cache are 16MB each.
+ * The caller of this function has to hold an get_ldev() reference.
+ *
+ * Adjusts the caching members ->rs_left (success) or ->rs_failed (!success),
+ * potentially pulling in (and recounting the corresponding bits)
+ * this resync extent into the resync extent lru cache.
+ *
+ * Returns whether all bits have been cleared for this resync extent,
+ * precisely: (rs_left <= rs_failed)
+ *
+ * TODO will be obsoleted once we have a caching lru of the on disk bitmap
+ */
+static bool update_rs_extent(struct drbd_device *device,
+		unsigned int enr, int count,
+		enum update_sync_bits_mode mode)
+{
+	struct lc_element *e;
+
+	D_ASSERT(device, atomic_read(&device->local_cnt));
+
+	/* When setting out-of-sync bits,
+	 * we don't need it cached (lc_find).
+	 * But if it is present in the cache,
+	 * we should update the cached bit count.
+	 * Otherwise, that extent should be in the resync extent lru cache
+	 * already -- or we want to pull it in if necessary -- (lc_get),
+	 * then update and check rs_left and rs_failed. */
+	if (mode == SET_OUT_OF_SYNC)
+		e = lc_find(device->resync, enr);
+	else
+		e = lc_get(device->resync, enr);
+	if (e) {
+		struct bm_extent *ext = lc_entry(e, struct bm_extent, lce);
+		if (ext->lce.lc_number == enr) {
+			if (mode == SET_IN_SYNC)
+				ext->rs_left -= count;
+			else if (mode == SET_OUT_OF_SYNC)
+				ext->rs_left += count;
+			else
+				ext->rs_failed += count;
+			if (ext->rs_left < ext->rs_failed) {
+				drbd_warn(device, "BAD! enr=%u rs_left=%d "
+				    "rs_failed=%d count=%d cstate=%s\n",
+				     ext->lce.lc_number, ext->rs_left,
+				     ext->rs_failed, count,
+				     drbd_conn_str(device->state.conn));
+
+				/* We don't expect to be able to clear more bits
+				 * than have been set when we originally counted
+				 * the set bits to cache that value in ext->rs_left.
+				 * Whatever the reason (disconnect during resync,
+				 * delayed local completion of an application write),
+				 * try to fix it up by recounting here. */
+				ext->rs_left = drbd_bm_e_weight(device, enr);
+			}
+		} else {
+			/* Normally this element should be in the cache,
+			 * since drbd_rs_begin_io() pulled it already in.
+			 *
+			 * But maybe an application write finished, and we set
+			 * something outside the resync lru_cache in sync.
+			 */
+			int rs_left = drbd_bm_e_weight(device, enr);
+			if (ext->flags != 0) {
+				drbd_warn(device, "changing resync lce: %d[%u;%02lx]"
+				     " -> %d[%u;00]\n",
+				     ext->lce.lc_number, ext->rs_left,
+				     ext->flags, enr, rs_left);
+				ext->flags = 0;
+			}
+			if (ext->rs_failed) {
+				drbd_warn(device, "Kicking resync_lru element enr=%u "
+				     "out with rs_failed=%d\n",
+				     ext->lce.lc_number, ext->rs_failed);
+			}
+			ext->rs_left = rs_left;
+			ext->rs_failed = (mode == RECORD_RS_FAILED) ? count : 0;
+			/* we don't keep a persistent log of the resync lru,
+			 * we can commit any change right away. */
+			lc_committed(device->resync);
+		}
+		if (mode != SET_OUT_OF_SYNC)
+			lc_put(device->resync, &ext->lce);
+		/* no race, we are within the al_lock! */
+
+		if (ext->rs_left <= ext->rs_failed) {
+			ext->rs_failed = 0;
+			return true;
+		}
+	} else if (mode != SET_OUT_OF_SYNC) {
+		/* be quiet if lc_find() did not find it. */
+		drbd_err(device, "lc_get() failed! locked=%d/%d flags=%lu\n",
+		    device->resync_locked,
+		    device->resync->nr_elements,
+		    device->resync->flags);
+	}
+	return false;
+}
+
+void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go)
+{
+	unsigned long now = jiffies;
+	unsigned long last = device->rs_mark_time[device->rs_last_mark];
+	int next = (device->rs_last_mark + 1) % DRBD_SYNC_MARKS;
+	if (time_after_eq(now, last + DRBD_SYNC_MARK_STEP)) {
+		if (device->rs_mark_left[device->rs_last_mark] != still_to_go &&
+		    device->state.conn != C_PAUSED_SYNC_T &&
+		    device->state.conn != C_PAUSED_SYNC_S) {
+			device->rs_mark_time[next] = now;
+			device->rs_mark_left[next] = still_to_go;
+			device->rs_last_mark = next;
+		}
+	}
+}
+
+/* It is called lazy update, so don't do write-out too often. */
+static bool lazy_bitmap_update_due(struct drbd_device *device)
+{
+	return time_after(jiffies, device->rs_last_bcast + 2*HZ);
+}
+
+static void maybe_schedule_on_disk_bitmap_update(struct drbd_device *device, bool rs_done)
+{
+	if (rs_done) {
+		struct drbd_connection *connection = first_peer_device(device)->connection;
+		if (connection->agreed_pro_version <= 95 ||
+		    is_sync_target_state(device->state.conn))
+			set_bit(RS_DONE, &device->flags);
+			/* and also set RS_PROGRESS below */
+
+		/* Else: rather wait for explicit notification via receive_state,
+		 * to avoid uuids-rotated-too-fast causing full resync
+		 * in next handshake, in case the replication link breaks
+		 * at the most unfortunate time... */
+	} else if (!lazy_bitmap_update_due(device))
+		return;
+
+	drbd_device_post_work(device, RS_PROGRESS);
+}
+
+static int update_sync_bits(struct drbd_device *device,
+		unsigned long sbnr, unsigned long ebnr,
+		enum update_sync_bits_mode mode)
+{
+	/*
+	 * We keep a count of set bits per resync-extent in the ->rs_left
+	 * caching member, so we need to loop and work within the resync extent
+	 * alignment. Typically this loop will execute exactly once.
+	 */
+	unsigned long flags;
+	unsigned long count = 0;
+	unsigned int cleared = 0;
+	while (sbnr <= ebnr) {
+		/* set temporary boundary bit number to last bit number within
+		 * the resync extent of the current start bit number,
+		 * but cap at provided end bit number */
+		unsigned long tbnr = min(ebnr, sbnr | BM_BLOCKS_PER_BM_EXT_MASK);
+		unsigned long c;
+
+		if (mode == RECORD_RS_FAILED)
+			/* Only called from drbd_rs_failed_io(), bits
+			 * supposedly still set.  Recount, maybe some
+			 * of the bits have been successfully cleared
+			 * by application IO meanwhile.
+			 */
+			c = drbd_bm_count_bits(device, sbnr, tbnr);
+		else if (mode == SET_IN_SYNC)
+			c = drbd_bm_clear_bits(device, sbnr, tbnr);
+		else /* if (mode == SET_OUT_OF_SYNC) */
+			c = drbd_bm_set_bits(device, sbnr, tbnr);
+
+		if (c) {
+			spin_lock_irqsave(&device->al_lock, flags);
+			cleared += update_rs_extent(device, BM_BIT_TO_EXT(sbnr), c, mode);
+			spin_unlock_irqrestore(&device->al_lock, flags);
+			count += c;
+		}
+		sbnr = tbnr + 1;
+	}
+	if (count) {
+		if (mode == SET_IN_SYNC) {
+			unsigned long still_to_go = drbd_bm_total_weight(device);
+			bool rs_is_done = (still_to_go <= device->rs_failed);
+			drbd_advance_rs_marks(device, still_to_go);
+			if (cleared || rs_is_done)
+				maybe_schedule_on_disk_bitmap_update(device, rs_is_done);
+		} else if (mode == RECORD_RS_FAILED)
+			device->rs_failed += count;
+		wake_up(&device->al_wait);
+	}
+	return count;
+}
+
+static bool plausible_request_size(int size)
+{
+	return size > 0
+		&& size <= DRBD_MAX_BATCH_BIO_SIZE
+		&& IS_ALIGNED(size, 512);
+}
+
+/* clear the bit corresponding to the piece of storage in question:
+ * size byte of data starting from sector.  Only clear a bits of the affected
+ * one ore more _aligned_ BM_BLOCK_SIZE blocks.
+ *
+ * called by worker on C_SYNC_TARGET and receiver on SyncSource.
+ *
+ */
+int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size,
+		enum update_sync_bits_mode mode)
+{
+	/* Is called from worker and receiver context _only_ */
+	unsigned long sbnr, ebnr, lbnr;
+	unsigned long count = 0;
+	sector_t esector, nr_sectors;
+
+	/* This would be an empty REQ_PREFLUSH, be silent. */
+	if ((mode == SET_OUT_OF_SYNC) && size == 0)
+		return 0;
+
+	if (!plausible_request_size(size)) {
+		drbd_err(device, "%s: sector=%llus size=%d nonsense!\n",
+				drbd_change_sync_fname[mode],
+				(unsigned long long)sector, size);
+		return 0;
+	}
+
+	if (!get_ldev(device))
+		return 0; /* no disk, no metadata, no bitmap to manipulate bits in */
+
+	nr_sectors = get_capacity(device->vdisk);
+	esector = sector + (size >> 9) - 1;
+
+	if (!expect(sector < nr_sectors))
+		goto out;
+	if (!expect(esector < nr_sectors))
+		esector = nr_sectors - 1;
+
+	lbnr = BM_SECT_TO_BIT(nr_sectors-1);
+
+	if (mode == SET_IN_SYNC) {
+		/* Round up start sector, round down end sector.  We make sure
+		 * we only clear full, aligned, BM_BLOCK_SIZE blocks. */
+		if (unlikely(esector < BM_SECT_PER_BIT-1))
+			goto out;
+		if (unlikely(esector == (nr_sectors-1)))
+			ebnr = lbnr;
+		else
+			ebnr = BM_SECT_TO_BIT(esector - (BM_SECT_PER_BIT-1));
+		sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
+	} else {
+		/* We set it out of sync, or record resync failure.
+		 * Should not round anything here. */
+		sbnr = BM_SECT_TO_BIT(sector);
+		ebnr = BM_SECT_TO_BIT(esector);
+	}
+
+	count = update_sync_bits(device, sbnr, ebnr, mode);
+out:
+	put_ldev(device);
+	return count;
+}
+
+static
+struct bm_extent *_bme_get(struct drbd_device *device, unsigned int enr)
+{
+	struct lc_element *e;
+	struct bm_extent *bm_ext;
+	int wakeup = 0;
+	unsigned long rs_flags;
+
+	spin_lock_irq(&device->al_lock);
+	if (device->resync_locked > device->resync->nr_elements/2) {
+		spin_unlock_irq(&device->al_lock);
+		return NULL;
+	}
+	e = lc_get(device->resync, enr);
+	bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
+	if (bm_ext) {
+		if (bm_ext->lce.lc_number != enr) {
+			bm_ext->rs_left = drbd_bm_e_weight(device, enr);
+			bm_ext->rs_failed = 0;
+			lc_committed(device->resync);
+			wakeup = 1;
+		}
+		if (bm_ext->lce.refcnt == 1)
+			device->resync_locked++;
+		set_bit(BME_NO_WRITES, &bm_ext->flags);
+	}
+	rs_flags = device->resync->flags;
+	spin_unlock_irq(&device->al_lock);
+	if (wakeup)
+		wake_up(&device->al_wait);
+
+	if (!bm_ext) {
+		if (rs_flags & LC_STARVING)
+			drbd_warn(device, "Have to wait for element"
+			     " (resync LRU too small?)\n");
+		BUG_ON(rs_flags & LC_LOCKED);
+	}
+
+	return bm_ext;
+}
+
+static int _is_in_al(struct drbd_device *device, unsigned int enr)
+{
+	int rv;
+
+	spin_lock_irq(&device->al_lock);
+	rv = lc_is_used(device->act_log, enr);
+	spin_unlock_irq(&device->al_lock);
+
+	return rv;
+}
+
+/**
+ * drbd_rs_begin_io() - Gets an extent in the resync LRU cache and sets it to BME_LOCKED
+ * @device:	DRBD device.
+ * @sector:	The sector number.
+ *
+ * This functions sleeps on al_wait. Returns 0 on success, -EINTR if interrupted.
+ */
+int drbd_rs_begin_io(struct drbd_device *device, sector_t sector)
+{
+	unsigned int enr = BM_SECT_TO_EXT(sector);
+	struct bm_extent *bm_ext;
+	int i, sig;
+	bool sa;
+
+retry:
+	sig = wait_event_interruptible(device->al_wait,
+			(bm_ext = _bme_get(device, enr)));
+	if (sig)
+		return -EINTR;
+
+	if (test_bit(BME_LOCKED, &bm_ext->flags))
+		return 0;
+
+	/* step aside only while we are above c-min-rate; unless disabled. */
+	sa = drbd_rs_c_min_rate_throttle(device);
+
+	for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
+		sig = wait_event_interruptible(device->al_wait,
+					       !_is_in_al(device, enr * AL_EXT_PER_BM_SECT + i) ||
+					       (sa && test_bit(BME_PRIORITY, &bm_ext->flags)));
+
+		if (sig || (sa && test_bit(BME_PRIORITY, &bm_ext->flags))) {
+			spin_lock_irq(&device->al_lock);
+			if (lc_put(device->resync, &bm_ext->lce) == 0) {
+				bm_ext->flags = 0; /* clears BME_NO_WRITES and eventually BME_PRIORITY */
+				device->resync_locked--;
+				wake_up(&device->al_wait);
+			}
+			spin_unlock_irq(&device->al_lock);
+			if (sig)
+				return -EINTR;
+			if (schedule_timeout_interruptible(HZ/10))
+				return -EINTR;
+			goto retry;
+		}
+	}
+	set_bit(BME_LOCKED, &bm_ext->flags);
+	return 0;
+}
+
+/**
+ * drbd_try_rs_begin_io() - Gets an extent in the resync LRU cache, does not sleep
+ * @device:	DRBD device.
+ * @sector:	The sector number.
+ *
+ * Gets an extent in the resync LRU cache, sets it to BME_NO_WRITES, then
+ * tries to set it to BME_LOCKED. Returns 0 upon success, and -EAGAIN
+ * if there is still application IO going on in this area.
+ */
+int drbd_try_rs_begin_io(struct drbd_device *device, sector_t sector)
+{
+	unsigned int enr = BM_SECT_TO_EXT(sector);
+	const unsigned int al_enr = enr*AL_EXT_PER_BM_SECT;
+	struct lc_element *e;
+	struct bm_extent *bm_ext;
+	int i;
+	bool throttle = drbd_rs_should_slow_down(device, sector, true);
+
+	/* If we need to throttle, a half-locked (only marked BME_NO_WRITES,
+	 * not yet BME_LOCKED) extent needs to be kicked out explicitly if we
+	 * need to throttle. There is at most one such half-locked extent,
+	 * which is remembered in resync_wenr. */
+
+	if (throttle && device->resync_wenr != enr)
+		return -EAGAIN;
+
+	spin_lock_irq(&device->al_lock);
+	if (device->resync_wenr != LC_FREE && device->resync_wenr != enr) {
+		/* in case you have very heavy scattered io, it may
+		 * stall the syncer undefined if we give up the ref count
+		 * when we try again and requeue.
+		 *
+		 * if we don't give up the refcount, but the next time
+		 * we are scheduled this extent has been "synced" by new
+		 * application writes, we'd miss the lc_put on the
+		 * extent we keep the refcount on.
+		 * so we remembered which extent we had to try again, and
+		 * if the next requested one is something else, we do
+		 * the lc_put here...
+		 * we also have to wake_up
+		 */
+		e = lc_find(device->resync, device->resync_wenr);
+		bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
+		if (bm_ext) {
+			D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags));
+			D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags));
+			clear_bit(BME_NO_WRITES, &bm_ext->flags);
+			device->resync_wenr = LC_FREE;
+			if (lc_put(device->resync, &bm_ext->lce) == 0) {
+				bm_ext->flags = 0;
+				device->resync_locked--;
+			}
+			wake_up(&device->al_wait);
+		} else {
+			drbd_alert(device, "LOGIC BUG\n");
+		}
+	}
+	/* TRY. */
+	e = lc_try_get(device->resync, enr);
+	bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
+	if (bm_ext) {
+		if (test_bit(BME_LOCKED, &bm_ext->flags))
+			goto proceed;
+		if (!test_and_set_bit(BME_NO_WRITES, &bm_ext->flags)) {
+			device->resync_locked++;
+		} else {
+			/* we did set the BME_NO_WRITES,
+			 * but then could not set BME_LOCKED,
+			 * so we tried again.
+			 * drop the extra reference. */
+			bm_ext->lce.refcnt--;
+			D_ASSERT(device, bm_ext->lce.refcnt > 0);
+		}
+		goto check_al;
+	} else {
+		/* do we rather want to try later? */
+		if (device->resync_locked > device->resync->nr_elements-3)
+			goto try_again;
+		/* Do or do not. There is no try. -- Yoda */
+		e = lc_get(device->resync, enr);
+		bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
+		if (!bm_ext) {
+			const unsigned long rs_flags = device->resync->flags;
+			if (rs_flags & LC_STARVING)
+				drbd_warn(device, "Have to wait for element"
+				     " (resync LRU too small?)\n");
+			BUG_ON(rs_flags & LC_LOCKED);
+			goto try_again;
+		}
+		if (bm_ext->lce.lc_number != enr) {
+			bm_ext->rs_left = drbd_bm_e_weight(device, enr);
+			bm_ext->rs_failed = 0;
+			lc_committed(device->resync);
+			wake_up(&device->al_wait);
+			D_ASSERT(device, test_bit(BME_LOCKED, &bm_ext->flags) == 0);
+		}
+		set_bit(BME_NO_WRITES, &bm_ext->flags);
+		D_ASSERT(device, bm_ext->lce.refcnt == 1);
+		device->resync_locked++;
+		goto check_al;
+	}
+check_al:
+	for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
+		if (lc_is_used(device->act_log, al_enr+i))
+			goto try_again;
+	}
+	set_bit(BME_LOCKED, &bm_ext->flags);
+proceed:
+	device->resync_wenr = LC_FREE;
+	spin_unlock_irq(&device->al_lock);
+	return 0;
+
+try_again:
+	if (bm_ext) {
+		if (throttle) {
+			D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags));
+			D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags));
+			clear_bit(BME_NO_WRITES, &bm_ext->flags);
+			device->resync_wenr = LC_FREE;
+			if (lc_put(device->resync, &bm_ext->lce) == 0) {
+				bm_ext->flags = 0;
+				device->resync_locked--;
+			}
+			wake_up(&device->al_wait);
+		} else
+			device->resync_wenr = enr;
+	}
+	spin_unlock_irq(&device->al_lock);
+	return -EAGAIN;
+}
+
+void drbd_rs_complete_io(struct drbd_device *device, sector_t sector)
+{
+	unsigned int enr = BM_SECT_TO_EXT(sector);
+	struct lc_element *e;
+	struct bm_extent *bm_ext;
+	unsigned long flags;
+
+	spin_lock_irqsave(&device->al_lock, flags);
+	e = lc_find(device->resync, enr);
+	bm_ext = e ? lc_entry(e, struct bm_extent, lce) : NULL;
+	if (!bm_ext) {
+		spin_unlock_irqrestore(&device->al_lock, flags);
+		if (__ratelimit(&drbd_ratelimit_state))
+			drbd_err(device, "drbd_rs_complete_io() called, but extent not found\n");
+		return;
+	}
+
+	if (bm_ext->lce.refcnt == 0) {
+		spin_unlock_irqrestore(&device->al_lock, flags);
+		drbd_err(device, "drbd_rs_complete_io(,%llu [=%u]) called, "
+		    "but refcnt is 0!?\n",
+		    (unsigned long long)sector, enr);
+		return;
+	}
+
+	if (lc_put(device->resync, &bm_ext->lce) == 0) {
+		bm_ext->flags = 0; /* clear BME_LOCKED, BME_NO_WRITES and BME_PRIORITY */
+		device->resync_locked--;
+		wake_up(&device->al_wait);
+	}
+
+	spin_unlock_irqrestore(&device->al_lock, flags);
+}
+
+/**
+ * drbd_rs_cancel_all() - Removes all extents from the resync LRU (even BME_LOCKED)
+ * @device:	DRBD device.
+ */
+void drbd_rs_cancel_all(struct drbd_device *device)
+{
+	spin_lock_irq(&device->al_lock);
+
+	if (get_ldev_if_state(device, D_FAILED)) { /* Makes sure ->resync is there. */
+		lc_reset(device->resync);
+		put_ldev(device);
+	}
+	device->resync_locked = 0;
+	device->resync_wenr = LC_FREE;
+	spin_unlock_irq(&device->al_lock);
+	wake_up(&device->al_wait);
+}
+
+/**
+ * drbd_rs_del_all() - Gracefully remove all extents from the resync LRU
+ * @device:	DRBD device.
+ *
+ * Returns 0 upon success, -EAGAIN if at least one reference count was
+ * not zero.
+ */
+int drbd_rs_del_all(struct drbd_device *device)
+{
+	struct lc_element *e;
+	struct bm_extent *bm_ext;
+	int i;
+
+	spin_lock_irq(&device->al_lock);
+
+	if (get_ldev_if_state(device, D_FAILED)) {
+		/* ok, ->resync is there. */
+		for (i = 0; i < device->resync->nr_elements; i++) {
+			e = lc_element_by_index(device->resync, i);
+			bm_ext = lc_entry(e, struct bm_extent, lce);
+			if (bm_ext->lce.lc_number == LC_FREE)
+				continue;
+			if (bm_ext->lce.lc_number == device->resync_wenr) {
+				drbd_info(device, "dropping %u in drbd_rs_del_all, apparently"
+				     " got 'synced' by application io\n",
+				     device->resync_wenr);
+				D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags));
+				D_ASSERT(device, test_bit(BME_NO_WRITES, &bm_ext->flags));
+				clear_bit(BME_NO_WRITES, &bm_ext->flags);
+				device->resync_wenr = LC_FREE;
+				lc_put(device->resync, &bm_ext->lce);
+			}
+			if (bm_ext->lce.refcnt != 0) {
+				drbd_info(device, "Retrying drbd_rs_del_all() later. "
+				     "refcnt=%d\n", bm_ext->lce.refcnt);
+				put_ldev(device);
+				spin_unlock_irq(&device->al_lock);
+				return -EAGAIN;
+			}
+			D_ASSERT(device, !test_bit(BME_LOCKED, &bm_ext->flags));
+			D_ASSERT(device, !test_bit(BME_NO_WRITES, &bm_ext->flags));
+			lc_del(device->resync, &bm_ext->lce);
+		}
+		D_ASSERT(device, device->resync->used == 0);
+		put_ldev(device);
+	}
+	spin_unlock_irq(&device->al_lock);
+	wake_up(&device->al_wait);
+
+	return 0;
+}
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
new file mode 100644
index 000000000..7d9db3336
--- /dev/null
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -0,0 +1,1694 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+   drbd_bitmap.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2004-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 2004-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2004-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+ */
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <linux/bitmap.h>
+#include <linux/vmalloc.h>
+#include <linux/string.h>
+#include <linux/drbd.h>
+#include <linux/slab.h>
+#include <linux/highmem.h>
+
+#include "drbd_int.h"
+
+
+/* OPAQUE outside this file!
+ * interface defined in drbd_int.h
+
+ * convention:
+ * function name drbd_bm_... => used elsewhere, "public".
+ * function name      bm_... => internal to implementation, "private".
+ */
+
+
+/*
+ * LIMITATIONS:
+ * We want to support >= peta byte of backend storage, while for now still using
+ * a granularity of one bit per 4KiB of storage.
+ * 1 << 50		bytes backend storage (1 PiB)
+ * 1 << (50 - 12)	bits needed
+ *	38 --> we need u64 to index and count bits
+ * 1 << (38 - 3)	bitmap bytes needed
+ *	35 --> we still need u64 to index and count bytes
+ *			(that's 32 GiB of bitmap for 1 PiB storage)
+ * 1 << (35 - 2)	32bit longs needed
+ *	33 --> we'd even need u64 to index and count 32bit long words.
+ * 1 << (35 - 3)	64bit longs needed
+ *	32 --> we could get away with a 32bit unsigned int to index and count
+ *	64bit long words, but I rather stay with unsigned long for now.
+ *	We probably should neither count nor point to bytes or long words
+ *	directly, but either by bitnumber, or by page index and offset.
+ * 1 << (35 - 12)
+ *	22 --> we need that much 4KiB pages of bitmap.
+ *	1 << (22 + 3) --> on a 64bit arch,
+ *	we need 32 MiB to store the array of page pointers.
+ *
+ * Because I'm lazy, and because the resulting patch was too large, too ugly
+ * and still incomplete, on 32bit we still "only" support 16 TiB (minus some),
+ * (1 << 32) bits * 4k storage.
+ *
+
+ * bitmap storage and IO:
+ *	Bitmap is stored little endian on disk, and is kept little endian in
+ *	core memory. Currently we still hold the full bitmap in core as long
+ *	as we are "attached" to a local disk, which at 32 GiB for 1PiB storage
+ *	seems excessive.
+ *
+ *	We plan to reduce the amount of in-core bitmap pages by paging them in
+ *	and out against their on-disk location as necessary, but need to make
+ *	sure we don't cause too much meta data IO, and must not deadlock in
+ *	tight memory situations. This needs some more work.
+ */
+
+/*
+ * NOTE
+ *  Access to the *bm_pages is protected by bm_lock.
+ *  It is safe to read the other members within the lock.
+ *
+ *  drbd_bm_set_bits is called from bio_endio callbacks,
+ *  We may be called with irq already disabled,
+ *  so we need spin_lock_irqsave().
+ *  And we need the kmap_atomic.
+ */
+struct drbd_bitmap {
+	struct page **bm_pages;
+	spinlock_t bm_lock;
+
+	/* exclusively to be used by __al_write_transaction(),
+	 * drbd_bm_mark_for_writeout() and
+	 * and drbd_bm_write_hinted() -> bm_rw() called from there.
+	 */
+	unsigned int n_bitmap_hints;
+	unsigned int al_bitmap_hints[AL_UPDATES_PER_TRANSACTION];
+
+	/* see LIMITATIONS: above */
+
+	unsigned long bm_set;       /* nr of set bits; THINK maybe atomic_t? */
+	unsigned long bm_bits;
+	size_t   bm_words;
+	size_t   bm_number_of_pages;
+	sector_t bm_dev_capacity;
+	struct mutex bm_change; /* serializes resize operations */
+
+	wait_queue_head_t bm_io_wait; /* used to serialize IO of single pages */
+
+	enum bm_flag bm_flags;
+
+	/* debugging aid, in case we are still racy somewhere */
+	char          *bm_why;
+	struct task_struct *bm_task;
+};
+
+#define bm_print_lock_info(m) __bm_print_lock_info(m, __func__)
+static void __bm_print_lock_info(struct drbd_device *device, const char *func)
+{
+	struct drbd_bitmap *b = device->bitmap;
+	if (!__ratelimit(&drbd_ratelimit_state))
+		return;
+	drbd_err(device, "FIXME %s[%d] in %s, bitmap locked for '%s' by %s[%d]\n",
+		 current->comm, task_pid_nr(current),
+		 func, b->bm_why ?: "?",
+		 b->bm_task->comm, task_pid_nr(b->bm_task));
+}
+
+void drbd_bm_lock(struct drbd_device *device, char *why, enum bm_flag flags)
+{
+	struct drbd_bitmap *b = device->bitmap;
+	int trylock_failed;
+
+	if (!b) {
+		drbd_err(device, "FIXME no bitmap in drbd_bm_lock!?\n");
+		return;
+	}
+
+	trylock_failed = !mutex_trylock(&b->bm_change);
+
+	if (trylock_failed) {
+		drbd_warn(device, "%s[%d] going to '%s' but bitmap already locked for '%s' by %s[%d]\n",
+			  current->comm, task_pid_nr(current),
+			  why, b->bm_why ?: "?",
+			  b->bm_task->comm, task_pid_nr(b->bm_task));
+		mutex_lock(&b->bm_change);
+	}
+	if (BM_LOCKED_MASK & b->bm_flags)
+		drbd_err(device, "FIXME bitmap already locked in bm_lock\n");
+	b->bm_flags |= flags & BM_LOCKED_MASK;
+
+	b->bm_why  = why;
+	b->bm_task = current;
+}
+
+void drbd_bm_unlock(struct drbd_device *device)
+{
+	struct drbd_bitmap *b = device->bitmap;
+	if (!b) {
+		drbd_err(device, "FIXME no bitmap in drbd_bm_unlock!?\n");
+		return;
+	}
+
+	if (!(BM_LOCKED_MASK & device->bitmap->bm_flags))
+		drbd_err(device, "FIXME bitmap not locked in bm_unlock\n");
+
+	b->bm_flags &= ~BM_LOCKED_MASK;
+	b->bm_why  = NULL;
+	b->bm_task = NULL;
+	mutex_unlock(&b->bm_change);
+}
+
+/* we store some "meta" info about our pages in page->private */
+/* at a granularity of 4k storage per bitmap bit:
+ * one peta byte storage: 1<<50 byte, 1<<38 * 4k storage blocks
+ *  1<<38 bits,
+ *  1<<23 4k bitmap pages.
+ * Use 24 bits as page index, covers 2 peta byte storage
+ * at a granularity of 4k per bit.
+ * Used to report the failed page idx on io error from the endio handlers.
+ */
+#define BM_PAGE_IDX_MASK	((1UL<<24)-1)
+/* this page is currently read in, or written back */
+#define BM_PAGE_IO_LOCK		31
+/* if there has been an IO error for this page */
+#define BM_PAGE_IO_ERROR	30
+/* this is to be able to intelligently skip disk IO,
+ * set if bits have been set since last IO. */
+#define BM_PAGE_NEED_WRITEOUT	29
+/* to mark for lazy writeout once syncer cleared all clearable bits,
+ * we if bits have been cleared since last IO. */
+#define BM_PAGE_LAZY_WRITEOUT	28
+/* pages marked with this "HINT" will be considered for writeout
+ * on activity log transactions */
+#define BM_PAGE_HINT_WRITEOUT	27
+
+/* store_page_idx uses non-atomic assignment. It is only used directly after
+ * allocating the page.  All other bm_set_page_* and bm_clear_page_* need to
+ * use atomic bit manipulation, as set_out_of_sync (and therefore bitmap
+ * changes) may happen from various contexts, and wait_on_bit/wake_up_bit
+ * requires it all to be atomic as well. */
+static void bm_store_page_idx(struct page *page, unsigned long idx)
+{
+	BUG_ON(0 != (idx & ~BM_PAGE_IDX_MASK));
+	set_page_private(page, idx);
+}
+
+static unsigned long bm_page_to_idx(struct page *page)
+{
+	return page_private(page) & BM_PAGE_IDX_MASK;
+}
+
+/* As is very unlikely that the same page is under IO from more than one
+ * context, we can get away with a bit per page and one wait queue per bitmap.
+ */
+static void bm_page_lock_io(struct drbd_device *device, int page_nr)
+{
+	struct drbd_bitmap *b = device->bitmap;
+	void *addr = &page_private(b->bm_pages[page_nr]);
+	wait_event(b->bm_io_wait, !test_and_set_bit(BM_PAGE_IO_LOCK, addr));
+}
+
+static void bm_page_unlock_io(struct drbd_device *device, int page_nr)
+{
+	struct drbd_bitmap *b = device->bitmap;
+	void *addr = &page_private(b->bm_pages[page_nr]);
+	clear_bit_unlock(BM_PAGE_IO_LOCK, addr);
+	wake_up(&device->bitmap->bm_io_wait);
+}
+
+/* set _before_ submit_io, so it may be reset due to being changed
+ * while this page is in flight... will get submitted later again */
+static void bm_set_page_unchanged(struct page *page)
+{
+	/* use cmpxchg? */
+	clear_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));
+	clear_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
+}
+
+static void bm_set_page_need_writeout(struct page *page)
+{
+	set_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));
+}
+
+void drbd_bm_reset_al_hints(struct drbd_device *device)
+{
+	device->bitmap->n_bitmap_hints = 0;
+}
+
+/**
+ * drbd_bm_mark_for_writeout() - mark a page with a "hint" to be considered for writeout
+ * @device:	DRBD device.
+ * @page_nr:	the bitmap page to mark with the "hint" flag
+ *
+ * From within an activity log transaction, we mark a few pages with these
+ * hints, then call drbd_bm_write_hinted(), which will only write out changed
+ * pages which are flagged with this mark.
+ */
+void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr)
+{
+	struct drbd_bitmap *b = device->bitmap;
+	struct page *page;
+	if (page_nr >= device->bitmap->bm_number_of_pages) {
+		drbd_warn(device, "BAD: page_nr: %u, number_of_pages: %u\n",
+			 page_nr, (int)device->bitmap->bm_number_of_pages);
+		return;
+	}
+	page = device->bitmap->bm_pages[page_nr];
+	BUG_ON(b->n_bitmap_hints >= ARRAY_SIZE(b->al_bitmap_hints));
+	if (!test_and_set_bit(BM_PAGE_HINT_WRITEOUT, &page_private(page)))
+		b->al_bitmap_hints[b->n_bitmap_hints++] = page_nr;
+}
+
+static int bm_test_page_unchanged(struct page *page)
+{
+	volatile const unsigned long *addr = &page_private(page);
+	return (*addr & ((1UL<<BM_PAGE_NEED_WRITEOUT)|(1UL<<BM_PAGE_LAZY_WRITEOUT))) == 0;
+}
+
+static void bm_set_page_io_err(struct page *page)
+{
+	set_bit(BM_PAGE_IO_ERROR, &page_private(page));
+}
+
+static void bm_clear_page_io_err(struct page *page)
+{
+	clear_bit(BM_PAGE_IO_ERROR, &page_private(page));
+}
+
+static void bm_set_page_lazy_writeout(struct page *page)
+{
+	set_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
+}
+
+static int bm_test_page_lazy_writeout(struct page *page)
+{
+	return test_bit(BM_PAGE_LAZY_WRITEOUT, &page_private(page));
+}
+
+/* on a 32bit box, this would allow for exactly (2<<38) bits. */
+static unsigned int bm_word_to_page_idx(struct drbd_bitmap *b, unsigned long long_nr)
+{
+	/* page_nr = (word*sizeof(long)) >> PAGE_SHIFT; */
+	unsigned int page_nr = long_nr >> (PAGE_SHIFT - LN2_BPL + 3);
+	BUG_ON(page_nr >= b->bm_number_of_pages);
+	return page_nr;
+}
+
+static unsigned int bm_bit_to_page_idx(struct drbd_bitmap *b, u64 bitnr)
+{
+	/* page_nr = (bitnr/8) >> PAGE_SHIFT; */
+	unsigned int page_nr = bitnr >> (PAGE_SHIFT + 3);
+	BUG_ON(page_nr >= b->bm_number_of_pages);
+	return page_nr;
+}
+
+static unsigned long *__bm_map_pidx(struct drbd_bitmap *b, unsigned int idx)
+{
+	struct page *page = b->bm_pages[idx];
+	return (unsigned long *) kmap_atomic(page);
+}
+
+static unsigned long *bm_map_pidx(struct drbd_bitmap *b, unsigned int idx)
+{
+	return __bm_map_pidx(b, idx);
+}
+
+static void __bm_unmap(unsigned long *p_addr)
+{
+	kunmap_atomic(p_addr);
+};
+
+static void bm_unmap(unsigned long *p_addr)
+{
+	return __bm_unmap(p_addr);
+}
+
+/* long word offset of _bitmap_ sector */
+#define S2W(s)	((s)<<(BM_EXT_SHIFT-BM_BLOCK_SHIFT-LN2_BPL))
+/* word offset from start of bitmap to word number _in_page_
+ * modulo longs per page
+#define MLPP(X) ((X) % (PAGE_SIZE/sizeof(long))
+ hm, well, Philipp thinks gcc might not optimize the % into & (... - 1)
+ so do it explicitly:
+ */
+#define MLPP(X) ((X) & ((PAGE_SIZE/sizeof(long))-1))
+
+/* Long words per page */
+#define LWPP (PAGE_SIZE/sizeof(long))
+
+/*
+ * actually most functions herein should take a struct drbd_bitmap*, not a
+ * struct drbd_device*, but for the debug macros I like to have the device around
+ * to be able to report device specific.
+ */
+
+
+static void bm_free_pages(struct page **pages, unsigned long number)
+{
+	unsigned long i;
+	if (!pages)
+		return;
+
+	for (i = 0; i < number; i++) {
+		if (!pages[i]) {
+			pr_alert("bm_free_pages tried to free a NULL pointer; i=%lu n=%lu\n",
+				 i, number);
+			continue;
+		}
+		__free_page(pages[i]);
+		pages[i] = NULL;
+	}
+}
+
+static inline void bm_vk_free(void *ptr)
+{
+	kvfree(ptr);
+}
+
+/*
+ * "have" and "want" are NUMBER OF PAGES.
+ */
+static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want)
+{
+	struct page **old_pages = b->bm_pages;
+	struct page **new_pages, *page;
+	unsigned int i, bytes;
+	unsigned long have = b->bm_number_of_pages;
+
+	BUG_ON(have == 0 && old_pages != NULL);
+	BUG_ON(have != 0 && old_pages == NULL);
+
+	if (have == want)
+		return old_pages;
+
+	/* Trying kmalloc first, falling back to vmalloc.
+	 * GFP_NOIO, as this is called while drbd IO is "suspended",
+	 * and during resize or attach on diskless Primary,
+	 * we must not block on IO to ourselves.
+	 * Context is receiver thread or dmsetup. */
+	bytes = sizeof(struct page *)*want;
+	new_pages = kzalloc(bytes, GFP_NOIO | __GFP_NOWARN);
+	if (!new_pages) {
+		new_pages = __vmalloc(bytes, GFP_NOIO | __GFP_ZERO);
+		if (!new_pages)
+			return NULL;
+	}
+
+	if (want >= have) {
+		for (i = 0; i < have; i++)
+			new_pages[i] = old_pages[i];
+		for (; i < want; i++) {
+			page = alloc_page(GFP_NOIO | __GFP_HIGHMEM);
+			if (!page) {
+				bm_free_pages(new_pages + have, i - have);
+				bm_vk_free(new_pages);
+				return NULL;
+			}
+			/* we want to know which page it is
+			 * from the endio handlers */
+			bm_store_page_idx(page, i);
+			new_pages[i] = page;
+		}
+	} else {
+		for (i = 0; i < want; i++)
+			new_pages[i] = old_pages[i];
+		/* NOT HERE, we are outside the spinlock!
+		bm_free_pages(old_pages + want, have - want);
+		*/
+	}
+
+	return new_pages;
+}
+
+/*
+ * allocates the drbd_bitmap and stores it in device->bitmap.
+ */
+int drbd_bm_init(struct drbd_device *device)
+{
+	struct drbd_bitmap *b = device->bitmap;
+	WARN_ON(b != NULL);
+	b = kzalloc(sizeof(struct drbd_bitmap), GFP_KERNEL);
+	if (!b)
+		return -ENOMEM;
+	spin_lock_init(&b->bm_lock);
+	mutex_init(&b->bm_change);
+	init_waitqueue_head(&b->bm_io_wait);
+
+	device->bitmap = b;
+
+	return 0;
+}
+
+sector_t drbd_bm_capacity(struct drbd_device *device)
+{
+	if (!expect(device->bitmap))
+		return 0;
+	return device->bitmap->bm_dev_capacity;
+}
+
+/* called on driver unload. TODO: call when a device is destroyed.
+ */
+void drbd_bm_cleanup(struct drbd_device *device)
+{
+	if (!expect(device->bitmap))
+		return;
+	bm_free_pages(device->bitmap->bm_pages, device->bitmap->bm_number_of_pages);
+	bm_vk_free(device->bitmap->bm_pages);
+	kfree(device->bitmap);
+	device->bitmap = NULL;
+}
+
+/*
+ * since (b->bm_bits % BITS_PER_LONG) != 0,
+ * this masks out the remaining bits.
+ * Returns the number of bits cleared.
+ */
+#ifndef BITS_PER_PAGE
+#define BITS_PER_PAGE		(1UL << (PAGE_SHIFT + 3))
+#define BITS_PER_PAGE_MASK	(BITS_PER_PAGE - 1)
+#else
+# if BITS_PER_PAGE != (1UL << (PAGE_SHIFT + 3))
+#  error "ambiguous BITS_PER_PAGE"
+# endif
+#endif
+#define BITS_PER_LONG_MASK	(BITS_PER_LONG - 1)
+static int bm_clear_surplus(struct drbd_bitmap *b)
+{
+	unsigned long mask;
+	unsigned long *p_addr, *bm;
+	int tmp;
+	int cleared = 0;
+
+	/* number of bits modulo bits per page */
+	tmp = (b->bm_bits & BITS_PER_PAGE_MASK);
+	/* mask the used bits of the word containing the last bit */
+	mask = (1UL << (tmp & BITS_PER_LONG_MASK)) -1;
+	/* bitmap is always stored little endian,
+	 * on disk and in core memory alike */
+	mask = cpu_to_lel(mask);
+
+	p_addr = bm_map_pidx(b, b->bm_number_of_pages - 1);
+	bm = p_addr + (tmp/BITS_PER_LONG);
+	if (mask) {
+		/* If mask != 0, we are not exactly aligned, so bm now points
+		 * to the long containing the last bit.
+		 * If mask == 0, bm already points to the word immediately
+		 * after the last (long word aligned) bit. */
+		cleared = hweight_long(*bm & ~mask);
+		*bm &= mask;
+		bm++;
+	}
+
+	if (BITS_PER_LONG == 32 && ((bm - p_addr) & 1) == 1) {
+		/* on a 32bit arch, we may need to zero out
+		 * a padding long to align with a 64bit remote */
+		cleared += hweight_long(*bm);
+		*bm = 0;
+	}
+	bm_unmap(p_addr);
+	return cleared;
+}
+
+static void bm_set_surplus(struct drbd_bitmap *b)
+{
+	unsigned long mask;
+	unsigned long *p_addr, *bm;
+	int tmp;
+
+	/* number of bits modulo bits per page */
+	tmp = (b->bm_bits & BITS_PER_PAGE_MASK);
+	/* mask the used bits of the word containing the last bit */
+	mask = (1UL << (tmp & BITS_PER_LONG_MASK)) -1;
+	/* bitmap is always stored little endian,
+	 * on disk and in core memory alike */
+	mask = cpu_to_lel(mask);
+
+	p_addr = bm_map_pidx(b, b->bm_number_of_pages - 1);
+	bm = p_addr + (tmp/BITS_PER_LONG);
+	if (mask) {
+		/* If mask != 0, we are not exactly aligned, so bm now points
+		 * to the long containing the last bit.
+		 * If mask == 0, bm already points to the word immediately
+		 * after the last (long word aligned) bit. */
+		*bm |= ~mask;
+		bm++;
+	}
+
+	if (BITS_PER_LONG == 32 && ((bm - p_addr) & 1) == 1) {
+		/* on a 32bit arch, we may need to zero out
+		 * a padding long to align with a 64bit remote */
+		*bm = ~0UL;
+	}
+	bm_unmap(p_addr);
+}
+
+/* you better not modify the bitmap while this is running,
+ * or its results will be stale */
+static unsigned long bm_count_bits(struct drbd_bitmap *b)
+{
+	unsigned long *p_addr;
+	unsigned long bits = 0;
+	unsigned long mask = (1UL << (b->bm_bits & BITS_PER_LONG_MASK)) -1;
+	int idx, last_word;
+
+	/* all but last page */
+	for (idx = 0; idx < b->bm_number_of_pages - 1; idx++) {
+		p_addr = __bm_map_pidx(b, idx);
+		bits += bitmap_weight(p_addr, BITS_PER_PAGE);
+		__bm_unmap(p_addr);
+		cond_resched();
+	}
+	/* last (or only) page */
+	last_word = ((b->bm_bits - 1) & BITS_PER_PAGE_MASK) >> LN2_BPL;
+	p_addr = __bm_map_pidx(b, idx);
+	bits += bitmap_weight(p_addr, last_word * BITS_PER_LONG);
+	p_addr[last_word] &= cpu_to_lel(mask);
+	bits += hweight_long(p_addr[last_word]);
+	/* 32bit arch, may have an unused padding long */
+	if (BITS_PER_LONG == 32 && (last_word & 1) == 0)
+		p_addr[last_word+1] = 0;
+	__bm_unmap(p_addr);
+	return bits;
+}
+
+/* offset and len in long words.*/
+static void bm_memset(struct drbd_bitmap *b, size_t offset, int c, size_t len)
+{
+	unsigned long *p_addr, *bm;
+	unsigned int idx;
+	size_t do_now, end;
+
+	end = offset + len;
+
+	if (end > b->bm_words) {
+		pr_alert("bm_memset end > bm_words\n");
+		return;
+	}
+
+	while (offset < end) {
+		do_now = min_t(size_t, ALIGN(offset + 1, LWPP), end) - offset;
+		idx = bm_word_to_page_idx(b, offset);
+		p_addr = bm_map_pidx(b, idx);
+		bm = p_addr + MLPP(offset);
+		if (bm+do_now > p_addr + LWPP) {
+			pr_alert("BUG BUG BUG! p_addr:%p bm:%p do_now:%d\n",
+			       p_addr, bm, (int)do_now);
+		} else
+			memset(bm, c, do_now * sizeof(long));
+		bm_unmap(p_addr);
+		bm_set_page_need_writeout(b->bm_pages[idx]);
+		offset += do_now;
+	}
+}
+
+/* For the layout, see comment above drbd_md_set_sector_offsets(). */
+static u64 drbd_md_on_disk_bits(struct drbd_backing_dev *ldev)
+{
+	u64 bitmap_sectors;
+	if (ldev->md.al_offset == 8)
+		bitmap_sectors = ldev->md.md_size_sect - ldev->md.bm_offset;
+	else
+		bitmap_sectors = ldev->md.al_offset - ldev->md.bm_offset;
+	return bitmap_sectors << (9 + 3);
+}
+
+/*
+ * make sure the bitmap has enough room for the attached storage,
+ * if necessary, resize.
+ * called whenever we may have changed the device size.
+ * returns -ENOMEM if we could not allocate enough memory, 0 on success.
+ * In case this is actually a resize, we copy the old bitmap into the new one.
+ * Otherwise, the bitmap is initialized to all bits set.
+ */
+int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bits)
+{
+	struct drbd_bitmap *b = device->bitmap;
+	unsigned long bits, words, owords, obits;
+	unsigned long want, have, onpages; /* number of pages */
+	struct page **npages, **opages = NULL;
+	int err = 0;
+	bool growing;
+
+	if (!expect(b))
+		return -ENOMEM;
+
+	drbd_bm_lock(device, "resize", BM_LOCKED_MASK);
+
+	drbd_info(device, "drbd_bm_resize called with capacity == %llu\n",
+			(unsigned long long)capacity);
+
+	if (capacity == b->bm_dev_capacity)
+		goto out;
+
+	if (capacity == 0) {
+		spin_lock_irq(&b->bm_lock);
+		opages = b->bm_pages;
+		onpages = b->bm_number_of_pages;
+		owords = b->bm_words;
+		b->bm_pages = NULL;
+		b->bm_number_of_pages =
+		b->bm_set   =
+		b->bm_bits  =
+		b->bm_words =
+		b->bm_dev_capacity = 0;
+		spin_unlock_irq(&b->bm_lock);
+		bm_free_pages(opages, onpages);
+		bm_vk_free(opages);
+		goto out;
+	}
+	bits  = BM_SECT_TO_BIT(ALIGN(capacity, BM_SECT_PER_BIT));
+
+	/* if we would use
+	   words = ALIGN(bits,BITS_PER_LONG) >> LN2_BPL;
+	   a 32bit host could present the wrong number of words
+	   to a 64bit host.
+	*/
+	words = ALIGN(bits, 64) >> LN2_BPL;
+
+	if (get_ldev(device)) {
+		u64 bits_on_disk = drbd_md_on_disk_bits(device->ldev);
+		put_ldev(device);
+		if (bits > bits_on_disk) {
+			drbd_info(device, "bits = %lu\n", bits);
+			drbd_info(device, "bits_on_disk = %llu\n", bits_on_disk);
+			err = -ENOSPC;
+			goto out;
+		}
+	}
+
+	want = PFN_UP(words*sizeof(long));
+	have = b->bm_number_of_pages;
+	if (want == have) {
+		D_ASSERT(device, b->bm_pages != NULL);
+		npages = b->bm_pages;
+	} else {
+		if (drbd_insert_fault(device, DRBD_FAULT_BM_ALLOC))
+			npages = NULL;
+		else
+			npages = bm_realloc_pages(b, want);
+	}
+
+	if (!npages) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	spin_lock_irq(&b->bm_lock);
+	opages = b->bm_pages;
+	owords = b->bm_words;
+	obits  = b->bm_bits;
+
+	growing = bits > obits;
+	if (opages && growing && set_new_bits)
+		bm_set_surplus(b);
+
+	b->bm_pages = npages;
+	b->bm_number_of_pages = want;
+	b->bm_bits  = bits;
+	b->bm_words = words;
+	b->bm_dev_capacity = capacity;
+
+	if (growing) {
+		if (set_new_bits) {
+			bm_memset(b, owords, 0xff, words-owords);
+			b->bm_set += bits - obits;
+		} else
+			bm_memset(b, owords, 0x00, words-owords);
+
+	}
+
+	if (want < have) {
+		/* implicit: (opages != NULL) && (opages != npages) */
+		bm_free_pages(opages + want, have - want);
+	}
+
+	(void)bm_clear_surplus(b);
+
+	spin_unlock_irq(&b->bm_lock);
+	if (opages != npages)
+		bm_vk_free(opages);
+	if (!growing)
+		b->bm_set = bm_count_bits(b);
+	drbd_info(device, "resync bitmap: bits=%lu words=%lu pages=%lu\n", bits, words, want);
+
+ out:
+	drbd_bm_unlock(device);
+	return err;
+}
+
+/* inherently racy:
+ * if not protected by other means, return value may be out of date when
+ * leaving this function...
+ * we still need to lock it, since it is important that this returns
+ * bm_set == 0 precisely.
+ *
+ * maybe bm_set should be atomic_t ?
+ */
+unsigned long _drbd_bm_total_weight(struct drbd_device *device)
+{
+	struct drbd_bitmap *b = device->bitmap;
+	unsigned long s;
+	unsigned long flags;
+
+	if (!expect(b))
+		return 0;
+	if (!expect(b->bm_pages))
+		return 0;
+
+	spin_lock_irqsave(&b->bm_lock, flags);
+	s = b->bm_set;
+	spin_unlock_irqrestore(&b->bm_lock, flags);
+
+	return s;
+}
+
+unsigned long drbd_bm_total_weight(struct drbd_device *device)
+{
+	unsigned long s;
+	/* if I don't have a disk, I don't know about out-of-sync status */
+	if (!get_ldev_if_state(device, D_NEGOTIATING))
+		return 0;
+	s = _drbd_bm_total_weight(device);
+	put_ldev(device);
+	return s;
+}
+
+size_t drbd_bm_words(struct drbd_device *device)
+{
+	struct drbd_bitmap *b = device->bitmap;
+	if (!expect(b))
+		return 0;
+	if (!expect(b->bm_pages))
+		return 0;
+
+	return b->bm_words;
+}
+
+unsigned long drbd_bm_bits(struct drbd_device *device)
+{
+	struct drbd_bitmap *b = device->bitmap;
+	if (!expect(b))
+		return 0;
+
+	return b->bm_bits;
+}
+
+/* merge number words from buffer into the bitmap starting at offset.
+ * buffer[i] is expected to be little endian unsigned long.
+ * bitmap must be locked by drbd_bm_lock.
+ * currently only used from receive_bitmap.
+ */
+void drbd_bm_merge_lel(struct drbd_device *device, size_t offset, size_t number,
+			unsigned long *buffer)
+{
+	struct drbd_bitmap *b = device->bitmap;
+	unsigned long *p_addr, *bm;
+	unsigned long word, bits;
+	unsigned int idx;
+	size_t end, do_now;
+
+	end = offset + number;
+
+	if (!expect(b))
+		return;
+	if (!expect(b->bm_pages))
+		return;
+	if (number == 0)
+		return;
+	WARN_ON(offset >= b->bm_words);
+	WARN_ON(end    >  b->bm_words);
+
+	spin_lock_irq(&b->bm_lock);
+	while (offset < end) {
+		do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
+		idx = bm_word_to_page_idx(b, offset);
+		p_addr = bm_map_pidx(b, idx);
+		bm = p_addr + MLPP(offset);
+		offset += do_now;
+		while (do_now--) {
+			bits = hweight_long(*bm);
+			word = *bm | *buffer++;
+			*bm++ = word;
+			b->bm_set += hweight_long(word) - bits;
+		}
+		bm_unmap(p_addr);
+		bm_set_page_need_writeout(b->bm_pages[idx]);
+	}
+	/* with 32bit <-> 64bit cross-platform connect
+	 * this is only correct for current usage,
+	 * where we _know_ that we are 64 bit aligned,
+	 * and know that this function is used in this way, too...
+	 */
+	if (end == b->bm_words)
+		b->bm_set -= bm_clear_surplus(b);
+	spin_unlock_irq(&b->bm_lock);
+}
+
+/* copy number words from the bitmap starting at offset into the buffer.
+ * buffer[i] will be little endian unsigned long.
+ */
+void drbd_bm_get_lel(struct drbd_device *device, size_t offset, size_t number,
+		     unsigned long *buffer)
+{
+	struct drbd_bitmap *b = device->bitmap;
+	unsigned long *p_addr, *bm;
+	size_t end, do_now;
+
+	end = offset + number;
+
+	if (!expect(b))
+		return;
+	if (!expect(b->bm_pages))
+		return;
+
+	spin_lock_irq(&b->bm_lock);
+	if ((offset >= b->bm_words) ||
+	    (end    >  b->bm_words) ||
+	    (number <= 0))
+		drbd_err(device, "offset=%lu number=%lu bm_words=%lu\n",
+			(unsigned long)	offset,
+			(unsigned long)	number,
+			(unsigned long) b->bm_words);
+	else {
+		while (offset < end) {
+			do_now = min_t(size_t, ALIGN(offset+1, LWPP), end) - offset;
+			p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, offset));
+			bm = p_addr + MLPP(offset);
+			offset += do_now;
+			while (do_now--)
+				*buffer++ = *bm++;
+			bm_unmap(p_addr);
+		}
+	}
+	spin_unlock_irq(&b->bm_lock);
+}
+
+/* set all bits in the bitmap */
+void drbd_bm_set_all(struct drbd_device *device)
+{
+	struct drbd_bitmap *b = device->bitmap;
+	if (!expect(b))
+		return;
+	if (!expect(b->bm_pages))
+		return;
+
+	spin_lock_irq(&b->bm_lock);
+	bm_memset(b, 0, 0xff, b->bm_words);
+	(void)bm_clear_surplus(b);
+	b->bm_set = b->bm_bits;
+	spin_unlock_irq(&b->bm_lock);
+}
+
+/* clear all bits in the bitmap */
+void drbd_bm_clear_all(struct drbd_device *device)
+{
+	struct drbd_bitmap *b = device->bitmap;
+	if (!expect(b))
+		return;
+	if (!expect(b->bm_pages))
+		return;
+
+	spin_lock_irq(&b->bm_lock);
+	bm_memset(b, 0, 0, b->bm_words);
+	b->bm_set = 0;
+	spin_unlock_irq(&b->bm_lock);
+}
+
+static void drbd_bm_aio_ctx_destroy(struct kref *kref)
+{
+	struct drbd_bm_aio_ctx *ctx = container_of(kref, struct drbd_bm_aio_ctx, kref);
+	unsigned long flags;
+
+	spin_lock_irqsave(&ctx->device->resource->req_lock, flags);
+	list_del(&ctx->list);
+	spin_unlock_irqrestore(&ctx->device->resource->req_lock, flags);
+	put_ldev(ctx->device);
+	kfree(ctx);
+}
+
+/* bv_page may be a copy, or may be the original */
+static void drbd_bm_endio(struct bio *bio)
+{
+	struct drbd_bm_aio_ctx *ctx = bio->bi_private;
+	struct drbd_device *device = ctx->device;
+	struct drbd_bitmap *b = device->bitmap;
+	unsigned int idx = bm_page_to_idx(bio_first_page_all(bio));
+
+	if ((ctx->flags & BM_AIO_COPY_PAGES) == 0 &&
+	    !bm_test_page_unchanged(b->bm_pages[idx]))
+		drbd_warn(device, "bitmap page idx %u changed during IO!\n", idx);
+
+	if (bio->bi_status) {
+		/* ctx error will hold the completed-last non-zero error code,
+		 * in case error codes differ. */
+		ctx->error = blk_status_to_errno(bio->bi_status);
+		bm_set_page_io_err(b->bm_pages[idx]);
+		/* Not identical to on disk version of it.
+		 * Is BM_PAGE_IO_ERROR enough? */
+		if (__ratelimit(&drbd_ratelimit_state))
+			drbd_err(device, "IO ERROR %d on bitmap page idx %u\n",
+					bio->bi_status, idx);
+	} else {
+		bm_clear_page_io_err(b->bm_pages[idx]);
+		dynamic_drbd_dbg(device, "bitmap page idx %u completed\n", idx);
+	}
+
+	bm_page_unlock_io(device, idx);
+
+	if (ctx->flags & BM_AIO_COPY_PAGES)
+		mempool_free(bio->bi_io_vec[0].bv_page, &drbd_md_io_page_pool);
+
+	bio_put(bio);
+
+	if (atomic_dec_and_test(&ctx->in_flight)) {
+		ctx->done = 1;
+		wake_up(&device->misc_wait);
+		kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy);
+	}
+}
+
+/* For the layout, see comment above drbd_md_set_sector_offsets(). */
+static inline sector_t drbd_md_last_bitmap_sector(struct drbd_backing_dev *bdev)
+{
+	switch (bdev->md.meta_dev_idx) {
+	case DRBD_MD_INDEX_INTERNAL:
+	case DRBD_MD_INDEX_FLEX_INT:
+		return bdev->md.md_offset + bdev->md.al_offset -1;
+	case DRBD_MD_INDEX_FLEX_EXT:
+	default:
+		return bdev->md.md_offset + bdev->md.md_size_sect -1;
+	}
+}
+
+static void bm_page_io_async(struct drbd_bm_aio_ctx *ctx, int page_nr) __must_hold(local)
+{
+	struct drbd_device *device = ctx->device;
+	enum req_op op = ctx->flags & BM_AIO_READ ? REQ_OP_READ : REQ_OP_WRITE;
+	struct drbd_bitmap *b = device->bitmap;
+	struct bio *bio;
+	struct page *page;
+	sector_t last_bm_sect;
+	sector_t first_bm_sect;
+	sector_t on_disk_sector;
+	unsigned int len;
+
+	first_bm_sect = device->ldev->md.md_offset + device->ldev->md.bm_offset;
+	on_disk_sector = first_bm_sect + (((sector_t)page_nr) << (PAGE_SHIFT-SECTOR_SHIFT));
+
+	/* this might happen with very small
+	 * flexible external meta data device,
+	 * or with PAGE_SIZE > 4k */
+	last_bm_sect = drbd_md_last_bitmap_sector(device->ldev);
+	if (first_bm_sect <= on_disk_sector && last_bm_sect >= on_disk_sector) {
+		sector_t len_sect = last_bm_sect - on_disk_sector + 1;
+		if (len_sect < PAGE_SIZE/SECTOR_SIZE)
+			len = (unsigned int)len_sect*SECTOR_SIZE;
+		else
+			len = PAGE_SIZE;
+	} else {
+		if (__ratelimit(&drbd_ratelimit_state)) {
+			drbd_err(device, "Invalid offset during on-disk bitmap access: "
+				 "page idx %u, sector %llu\n", page_nr, on_disk_sector);
+		}
+		ctx->error = -EIO;
+		bm_set_page_io_err(b->bm_pages[page_nr]);
+		if (atomic_dec_and_test(&ctx->in_flight)) {
+			ctx->done = 1;
+			wake_up(&device->misc_wait);
+			kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy);
+		}
+		return;
+	}
+
+	/* serialize IO on this page */
+	bm_page_lock_io(device, page_nr);
+	/* before memcpy and submit,
+	 * so it can be redirtied any time */
+	bm_set_page_unchanged(b->bm_pages[page_nr]);
+
+	if (ctx->flags & BM_AIO_COPY_PAGES) {
+		page = mempool_alloc(&drbd_md_io_page_pool,
+				GFP_NOIO | __GFP_HIGHMEM);
+		copy_highpage(page, b->bm_pages[page_nr]);
+		bm_store_page_idx(page, page_nr);
+	} else
+		page = b->bm_pages[page_nr];
+	bio = bio_alloc_bioset(device->ldev->md_bdev, 1, op, GFP_NOIO,
+			&drbd_md_io_bio_set);
+	bio->bi_iter.bi_sector = on_disk_sector;
+	/* bio_add_page of a single page to an empty bio will always succeed,
+	 * according to api.  Do we want to assert that? */
+	bio_add_page(bio, page, len, 0);
+	bio->bi_private = ctx;
+	bio->bi_end_io = drbd_bm_endio;
+
+	if (drbd_insert_fault(device, (op == REQ_OP_WRITE) ? DRBD_FAULT_MD_WR : DRBD_FAULT_MD_RD)) {
+		bio_io_error(bio);
+	} else {
+		submit_bio(bio);
+		/* this should not count as user activity and cause the
+		 * resync to throttle -- see drbd_rs_should_slow_down(). */
+		atomic_add(len >> 9, &device->rs_sect_ev);
+	}
+}
+
+/*
+ * bm_rw: read/write the whole bitmap from/to its on disk location.
+ */
+static int bm_rw(struct drbd_device *device, const unsigned int flags, unsigned lazy_writeout_upper_idx) __must_hold(local)
+{
+	struct drbd_bm_aio_ctx *ctx;
+	struct drbd_bitmap *b = device->bitmap;
+	unsigned int num_pages, i, count = 0;
+	unsigned long now;
+	char ppb[10];
+	int err = 0;
+
+	/*
+	 * We are protected against bitmap disappearing/resizing by holding an
+	 * ldev reference (caller must have called get_ldev()).
+	 * For read/write, we are protected against changes to the bitmap by
+	 * the bitmap lock (see drbd_bitmap_io).
+	 * For lazy writeout, we don't care for ongoing changes to the bitmap,
+	 * as we submit copies of pages anyways.
+	 */
+
+	ctx = kmalloc(sizeof(struct drbd_bm_aio_ctx), GFP_NOIO);
+	if (!ctx)
+		return -ENOMEM;
+
+	*ctx = (struct drbd_bm_aio_ctx) {
+		.device = device,
+		.start_jif = jiffies,
+		.in_flight = ATOMIC_INIT(1),
+		.done = 0,
+		.flags = flags,
+		.error = 0,
+		.kref = KREF_INIT(2),
+	};
+
+	if (!get_ldev_if_state(device, D_ATTACHING)) {  /* put is in drbd_bm_aio_ctx_destroy() */
+		drbd_err(device, "ASSERT FAILED: get_ldev_if_state() == 1 in bm_rw()\n");
+		kfree(ctx);
+		return -ENODEV;
+	}
+	/* Here D_ATTACHING is sufficient since drbd_bm_read() is called only from
+	   drbd_adm_attach(), after device->ldev was assigned. */
+
+	if (0 == (ctx->flags & ~BM_AIO_READ))
+		WARN_ON(!(BM_LOCKED_MASK & b->bm_flags));
+
+	spin_lock_irq(&device->resource->req_lock);
+	list_add_tail(&ctx->list, &device->pending_bitmap_io);
+	spin_unlock_irq(&device->resource->req_lock);
+
+	num_pages = b->bm_number_of_pages;
+
+	now = jiffies;
+
+	/* let the layers below us try to merge these bios... */
+
+	if (flags & BM_AIO_READ) {
+		for (i = 0; i < num_pages; i++) {
+			atomic_inc(&ctx->in_flight);
+			bm_page_io_async(ctx, i);
+			++count;
+			cond_resched();
+		}
+	} else if (flags & BM_AIO_WRITE_HINTED) {
+		/* ASSERT: BM_AIO_WRITE_ALL_PAGES is not set. */
+		unsigned int hint;
+		for (hint = 0; hint < b->n_bitmap_hints; hint++) {
+			i = b->al_bitmap_hints[hint];
+			if (i >= num_pages) /* == -1U: no hint here. */
+				continue;
+			/* Several AL-extents may point to the same page. */
+			if (!test_and_clear_bit(BM_PAGE_HINT_WRITEOUT,
+			    &page_private(b->bm_pages[i])))
+				continue;
+			/* Has it even changed? */
+			if (bm_test_page_unchanged(b->bm_pages[i]))
+				continue;
+			atomic_inc(&ctx->in_flight);
+			bm_page_io_async(ctx, i);
+			++count;
+		}
+	} else {
+		for (i = 0; i < num_pages; i++) {
+			/* ignore completely unchanged pages */
+			if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx)
+				break;
+			if (!(flags & BM_AIO_WRITE_ALL_PAGES) &&
+			    bm_test_page_unchanged(b->bm_pages[i])) {
+				dynamic_drbd_dbg(device, "skipped bm write for idx %u\n", i);
+				continue;
+			}
+			/* during lazy writeout,
+			 * ignore those pages not marked for lazy writeout. */
+			if (lazy_writeout_upper_idx &&
+			    !bm_test_page_lazy_writeout(b->bm_pages[i])) {
+				dynamic_drbd_dbg(device, "skipped bm lazy write for idx %u\n", i);
+				continue;
+			}
+			atomic_inc(&ctx->in_flight);
+			bm_page_io_async(ctx, i);
+			++count;
+			cond_resched();
+		}
+	}
+
+	/*
+	 * We initialize ctx->in_flight to one to make sure drbd_bm_endio
+	 * will not set ctx->done early, and decrement / test it here.  If there
+	 * are still some bios in flight, we need to wait for them here.
+	 * If all IO is done already (or nothing had been submitted), there is
+	 * no need to wait.  Still, we need to put the kref associated with the
+	 * "in_flight reached zero, all done" event.
+	 */
+	if (!atomic_dec_and_test(&ctx->in_flight))
+		wait_until_done_or_force_detached(device, device->ldev, &ctx->done);
+	else
+		kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy);
+
+	/* summary for global bitmap IO */
+	if (flags == 0) {
+		unsigned int ms = jiffies_to_msecs(jiffies - now);
+		if (ms > 5) {
+			drbd_info(device, "bitmap %s of %u pages took %u ms\n",
+				 (flags & BM_AIO_READ) ? "READ" : "WRITE",
+				 count, ms);
+		}
+	}
+
+	if (ctx->error) {
+		drbd_alert(device, "we had at least one MD IO ERROR during bitmap IO\n");
+		drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR);
+		err = -EIO; /* ctx->error ? */
+	}
+
+	if (atomic_read(&ctx->in_flight))
+		err = -EIO; /* Disk timeout/force-detach during IO... */
+
+	now = jiffies;
+	if (flags & BM_AIO_READ) {
+		b->bm_set = bm_count_bits(b);
+		drbd_info(device, "recounting of set bits took additional %lu jiffies\n",
+		     jiffies - now);
+	}
+	now = b->bm_set;
+
+	if ((flags & ~BM_AIO_READ) == 0)
+		drbd_info(device, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n",
+		     ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now);
+
+	kref_put(&ctx->kref, &drbd_bm_aio_ctx_destroy);
+	return err;
+}
+
+/**
+ * drbd_bm_read() - Read the whole bitmap from its on disk location.
+ * @device:	DRBD device.
+ */
+int drbd_bm_read(struct drbd_device *device) __must_hold(local)
+{
+	return bm_rw(device, BM_AIO_READ, 0);
+}
+
+/**
+ * drbd_bm_write() - Write the whole bitmap to its on disk location.
+ * @device:	DRBD device.
+ *
+ * Will only write pages that have changed since last IO.
+ */
+int drbd_bm_write(struct drbd_device *device) __must_hold(local)
+{
+	return bm_rw(device, 0, 0);
+}
+
+/**
+ * drbd_bm_write_all() - Write the whole bitmap to its on disk location.
+ * @device:	DRBD device.
+ *
+ * Will write all pages.
+ */
+int drbd_bm_write_all(struct drbd_device *device) __must_hold(local)
+{
+	return bm_rw(device, BM_AIO_WRITE_ALL_PAGES, 0);
+}
+
+/**
+ * drbd_bm_write_lazy() - Write bitmap pages 0 to @upper_idx-1, if they have changed.
+ * @device:	DRBD device.
+ * @upper_idx:	0: write all changed pages; +ve: page index to stop scanning for changed pages
+ */
+int drbd_bm_write_lazy(struct drbd_device *device, unsigned upper_idx) __must_hold(local)
+{
+	return bm_rw(device, BM_AIO_COPY_PAGES, upper_idx);
+}
+
+/**
+ * drbd_bm_write_copy_pages() - Write the whole bitmap to its on disk location.
+ * @device:	DRBD device.
+ *
+ * Will only write pages that have changed since last IO.
+ * In contrast to drbd_bm_write(), this will copy the bitmap pages
+ * to temporary writeout pages. It is intended to trigger a full write-out
+ * while still allowing the bitmap to change, for example if a resync or online
+ * verify is aborted due to a failed peer disk, while local IO continues, or
+ * pending resync acks are still being processed.
+ */
+int drbd_bm_write_copy_pages(struct drbd_device *device) __must_hold(local)
+{
+	return bm_rw(device, BM_AIO_COPY_PAGES, 0);
+}
+
+/**
+ * drbd_bm_write_hinted() - Write bitmap pages with "hint" marks, if they have changed.
+ * @device:	DRBD device.
+ */
+int drbd_bm_write_hinted(struct drbd_device *device) __must_hold(local)
+{
+	return bm_rw(device, BM_AIO_WRITE_HINTED | BM_AIO_COPY_PAGES, 0);
+}
+
+/* NOTE
+ * find_first_bit returns int, we return unsigned long.
+ * For this to work on 32bit arch with bitnumbers > (1<<32),
+ * we'd need to return u64, and get a whole lot of other places
+ * fixed where we still use unsigned long.
+ *
+ * this returns a bit number, NOT a sector!
+ */
+static unsigned long __bm_find_next(struct drbd_device *device, unsigned long bm_fo,
+	const int find_zero_bit)
+{
+	struct drbd_bitmap *b = device->bitmap;
+	unsigned long *p_addr;
+	unsigned long bit_offset;
+	unsigned i;
+
+
+	if (bm_fo > b->bm_bits) {
+		drbd_err(device, "bm_fo=%lu bm_bits=%lu\n", bm_fo, b->bm_bits);
+		bm_fo = DRBD_END_OF_BITMAP;
+	} else {
+		while (bm_fo < b->bm_bits) {
+			/* bit offset of the first bit in the page */
+			bit_offset = bm_fo & ~BITS_PER_PAGE_MASK;
+			p_addr = __bm_map_pidx(b, bm_bit_to_page_idx(b, bm_fo));
+
+			if (find_zero_bit)
+				i = find_next_zero_bit_le(p_addr,
+						PAGE_SIZE*8, bm_fo & BITS_PER_PAGE_MASK);
+			else
+				i = find_next_bit_le(p_addr,
+						PAGE_SIZE*8, bm_fo & BITS_PER_PAGE_MASK);
+
+			__bm_unmap(p_addr);
+			if (i < PAGE_SIZE*8) {
+				bm_fo = bit_offset + i;
+				if (bm_fo >= b->bm_bits)
+					break;
+				goto found;
+			}
+			bm_fo = bit_offset + PAGE_SIZE*8;
+		}
+		bm_fo = DRBD_END_OF_BITMAP;
+	}
+ found:
+	return bm_fo;
+}
+
+static unsigned long bm_find_next(struct drbd_device *device,
+	unsigned long bm_fo, const int find_zero_bit)
+{
+	struct drbd_bitmap *b = device->bitmap;
+	unsigned long i = DRBD_END_OF_BITMAP;
+
+	if (!expect(b))
+		return i;
+	if (!expect(b->bm_pages))
+		return i;
+
+	spin_lock_irq(&b->bm_lock);
+	if (BM_DONT_TEST & b->bm_flags)
+		bm_print_lock_info(device);
+
+	i = __bm_find_next(device, bm_fo, find_zero_bit);
+
+	spin_unlock_irq(&b->bm_lock);
+	return i;
+}
+
+unsigned long drbd_bm_find_next(struct drbd_device *device, unsigned long bm_fo)
+{
+	return bm_find_next(device, bm_fo, 0);
+}
+
+#if 0
+/* not yet needed for anything. */
+unsigned long drbd_bm_find_next_zero(struct drbd_device *device, unsigned long bm_fo)
+{
+	return bm_find_next(device, bm_fo, 1);
+}
+#endif
+
+/* does not spin_lock_irqsave.
+ * you must take drbd_bm_lock() first */
+unsigned long _drbd_bm_find_next(struct drbd_device *device, unsigned long bm_fo)
+{
+	/* WARN_ON(!(BM_DONT_SET & device->b->bm_flags)); */
+	return __bm_find_next(device, bm_fo, 0);
+}
+
+unsigned long _drbd_bm_find_next_zero(struct drbd_device *device, unsigned long bm_fo)
+{
+	/* WARN_ON(!(BM_DONT_SET & device->b->bm_flags)); */
+	return __bm_find_next(device, bm_fo, 1);
+}
+
+/* returns number of bits actually changed.
+ * for val != 0, we change 0 -> 1, return code positive
+ * for val == 0, we change 1 -> 0, return code negative
+ * wants bitnr, not sector.
+ * expected to be called for only a few bits (e - s about BITS_PER_LONG).
+ * Must hold bitmap lock already. */
+static int __bm_change_bits_to(struct drbd_device *device, const unsigned long s,
+	unsigned long e, int val)
+{
+	struct drbd_bitmap *b = device->bitmap;
+	unsigned long *p_addr = NULL;
+	unsigned long bitnr;
+	unsigned int last_page_nr = -1U;
+	int c = 0;
+	int changed_total = 0;
+
+	if (e >= b->bm_bits) {
+		drbd_err(device, "ASSERT FAILED: bit_s=%lu bit_e=%lu bm_bits=%lu\n",
+				s, e, b->bm_bits);
+		e = b->bm_bits ? b->bm_bits -1 : 0;
+	}
+	for (bitnr = s; bitnr <= e; bitnr++) {
+		unsigned int page_nr = bm_bit_to_page_idx(b, bitnr);
+		if (page_nr != last_page_nr) {
+			if (p_addr)
+				__bm_unmap(p_addr);
+			if (c < 0)
+				bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]);
+			else if (c > 0)
+				bm_set_page_need_writeout(b->bm_pages[last_page_nr]);
+			changed_total += c;
+			c = 0;
+			p_addr = __bm_map_pidx(b, page_nr);
+			last_page_nr = page_nr;
+		}
+		if (val)
+			c += (0 == __test_and_set_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr));
+		else
+			c -= (0 != __test_and_clear_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr));
+	}
+	if (p_addr)
+		__bm_unmap(p_addr);
+	if (c < 0)
+		bm_set_page_lazy_writeout(b->bm_pages[last_page_nr]);
+	else if (c > 0)
+		bm_set_page_need_writeout(b->bm_pages[last_page_nr]);
+	changed_total += c;
+	b->bm_set += changed_total;
+	return changed_total;
+}
+
+/* returns number of bits actually changed.
+ * for val != 0, we change 0 -> 1, return code positive
+ * for val == 0, we change 1 -> 0, return code negative
+ * wants bitnr, not sector */
+static int bm_change_bits_to(struct drbd_device *device, const unsigned long s,
+	const unsigned long e, int val)
+{
+	unsigned long flags;
+	struct drbd_bitmap *b = device->bitmap;
+	int c = 0;
+
+	if (!expect(b))
+		return 1;
+	if (!expect(b->bm_pages))
+		return 0;
+
+	spin_lock_irqsave(&b->bm_lock, flags);
+	if ((val ? BM_DONT_SET : BM_DONT_CLEAR) & b->bm_flags)
+		bm_print_lock_info(device);
+
+	c = __bm_change_bits_to(device, s, e, val);
+
+	spin_unlock_irqrestore(&b->bm_lock, flags);
+	return c;
+}
+
+/* returns number of bits changed 0 -> 1 */
+int drbd_bm_set_bits(struct drbd_device *device, const unsigned long s, const unsigned long e)
+{
+	return bm_change_bits_to(device, s, e, 1);
+}
+
+/* returns number of bits changed 1 -> 0 */
+int drbd_bm_clear_bits(struct drbd_device *device, const unsigned long s, const unsigned long e)
+{
+	return -bm_change_bits_to(device, s, e, 0);
+}
+
+/* sets all bits in full words,
+ * from first_word up to, but not including, last_word */
+static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b,
+		int page_nr, int first_word, int last_word)
+{
+	int i;
+	int bits;
+	int changed = 0;
+	unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr]);
+
+	/* I think it is more cache line friendly to hweight_long then set to ~0UL,
+	 * than to first bitmap_weight() all words, then bitmap_fill() all words */
+	for (i = first_word; i < last_word; i++) {
+		bits = hweight_long(paddr[i]);
+		paddr[i] = ~0UL;
+		changed += BITS_PER_LONG - bits;
+	}
+	kunmap_atomic(paddr);
+	if (changed) {
+		/* We only need lazy writeout, the information is still in the
+		 * remote bitmap as well, and is reconstructed during the next
+		 * bitmap exchange, if lost locally due to a crash. */
+		bm_set_page_lazy_writeout(b->bm_pages[page_nr]);
+		b->bm_set += changed;
+	}
+}
+
+/* Same thing as drbd_bm_set_bits,
+ * but more efficient for a large bit range.
+ * You must first drbd_bm_lock().
+ * Can be called to set the whole bitmap in one go.
+ * Sets bits from s to e _inclusive_. */
+void _drbd_bm_set_bits(struct drbd_device *device, const unsigned long s, const unsigned long e)
+{
+	/* First set_bit from the first bit (s)
+	 * up to the next long boundary (sl),
+	 * then assign full words up to the last long boundary (el),
+	 * then set_bit up to and including the last bit (e).
+	 *
+	 * Do not use memset, because we must account for changes,
+	 * so we need to loop over the words with hweight() anyways.
+	 */
+	struct drbd_bitmap *b = device->bitmap;
+	unsigned long sl = ALIGN(s,BITS_PER_LONG);
+	unsigned long el = (e+1) & ~((unsigned long)BITS_PER_LONG-1);
+	int first_page;
+	int last_page;
+	int page_nr;
+	int first_word;
+	int last_word;
+
+	if (e - s <= 3*BITS_PER_LONG) {
+		/* don't bother; el and sl may even be wrong. */
+		spin_lock_irq(&b->bm_lock);
+		__bm_change_bits_to(device, s, e, 1);
+		spin_unlock_irq(&b->bm_lock);
+		return;
+	}
+
+	/* difference is large enough that we can trust sl and el */
+
+	spin_lock_irq(&b->bm_lock);
+
+	/* bits filling the current long */
+	if (sl)
+		__bm_change_bits_to(device, s, sl-1, 1);
+
+	first_page = sl >> (3 + PAGE_SHIFT);
+	last_page = el >> (3 + PAGE_SHIFT);
+
+	/* MLPP: modulo longs per page */
+	/* LWPP: long words per page */
+	first_word = MLPP(sl >> LN2_BPL);
+	last_word = LWPP;
+
+	/* first and full pages, unless first page == last page */
+	for (page_nr = first_page; page_nr < last_page; page_nr++) {
+		bm_set_full_words_within_one_page(device->bitmap, page_nr, first_word, last_word);
+		spin_unlock_irq(&b->bm_lock);
+		cond_resched();
+		first_word = 0;
+		spin_lock_irq(&b->bm_lock);
+	}
+	/* last page (respectively only page, for first page == last page) */
+	last_word = MLPP(el >> LN2_BPL);
+
+	/* consider bitmap->bm_bits = 32768, bitmap->bm_number_of_pages = 1. (or multiples).
+	 * ==> e = 32767, el = 32768, last_page = 2,
+	 * and now last_word = 0.
+	 * We do not want to touch last_page in this case,
+	 * as we did not allocate it, it is not present in bitmap->bm_pages.
+	 */
+	if (last_word)
+		bm_set_full_words_within_one_page(device->bitmap, last_page, first_word, last_word);
+
+	/* possibly trailing bits.
+	 * example: (e & 63) == 63, el will be e+1.
+	 * if that even was the very last bit,
+	 * it would trigger an assert in __bm_change_bits_to()
+	 */
+	if (el <= e)
+		__bm_change_bits_to(device, el, e, 1);
+	spin_unlock_irq(&b->bm_lock);
+}
+
+/* returns bit state
+ * wants bitnr, NOT sector.
+ * inherently racy... area needs to be locked by means of {al,rs}_lru
+ *  1 ... bit set
+ *  0 ... bit not set
+ * -1 ... first out of bounds access, stop testing for bits!
+ */
+int drbd_bm_test_bit(struct drbd_device *device, const unsigned long bitnr)
+{
+	unsigned long flags;
+	struct drbd_bitmap *b = device->bitmap;
+	unsigned long *p_addr;
+	int i;
+
+	if (!expect(b))
+		return 0;
+	if (!expect(b->bm_pages))
+		return 0;
+
+	spin_lock_irqsave(&b->bm_lock, flags);
+	if (BM_DONT_TEST & b->bm_flags)
+		bm_print_lock_info(device);
+	if (bitnr < b->bm_bits) {
+		p_addr = bm_map_pidx(b, bm_bit_to_page_idx(b, bitnr));
+		i = test_bit_le(bitnr & BITS_PER_PAGE_MASK, p_addr) ? 1 : 0;
+		bm_unmap(p_addr);
+	} else if (bitnr == b->bm_bits) {
+		i = -1;
+	} else { /* (bitnr > b->bm_bits) */
+		drbd_err(device, "bitnr=%lu > bm_bits=%lu\n", bitnr, b->bm_bits);
+		i = 0;
+	}
+
+	spin_unlock_irqrestore(&b->bm_lock, flags);
+	return i;
+}
+
+/* returns number of bits set in the range [s, e] */
+int drbd_bm_count_bits(struct drbd_device *device, const unsigned long s, const unsigned long e)
+{
+	unsigned long flags;
+	struct drbd_bitmap *b = device->bitmap;
+	unsigned long *p_addr = NULL;
+	unsigned long bitnr;
+	unsigned int page_nr = -1U;
+	int c = 0;
+
+	/* If this is called without a bitmap, that is a bug.  But just to be
+	 * robust in case we screwed up elsewhere, in that case pretend there
+	 * was one dirty bit in the requested area, so we won't try to do a
+	 * local read there (no bitmap probably implies no disk) */
+	if (!expect(b))
+		return 1;
+	if (!expect(b->bm_pages))
+		return 1;
+
+	spin_lock_irqsave(&b->bm_lock, flags);
+	if (BM_DONT_TEST & b->bm_flags)
+		bm_print_lock_info(device);
+	for (bitnr = s; bitnr <= e; bitnr++) {
+		unsigned int idx = bm_bit_to_page_idx(b, bitnr);
+		if (page_nr != idx) {
+			page_nr = idx;
+			if (p_addr)
+				bm_unmap(p_addr);
+			p_addr = bm_map_pidx(b, idx);
+		}
+		if (expect(bitnr < b->bm_bits))
+			c += (0 != test_bit_le(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr));
+		else
+			drbd_err(device, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits);
+	}
+	if (p_addr)
+		bm_unmap(p_addr);
+	spin_unlock_irqrestore(&b->bm_lock, flags);
+	return c;
+}
+
+
+/* inherently racy...
+ * return value may be already out-of-date when this function returns.
+ * but the general usage is that this is only use during a cstate when bits are
+ * only cleared, not set, and typically only care for the case when the return
+ * value is zero, or we already "locked" this "bitmap extent" by other means.
+ *
+ * enr is bm-extent number, since we chose to name one sector (512 bytes)
+ * worth of the bitmap a "bitmap extent".
+ *
+ * TODO
+ * I think since we use it like a reference count, we should use the real
+ * reference count of some bitmap extent element from some lru instead...
+ *
+ */
+int drbd_bm_e_weight(struct drbd_device *device, unsigned long enr)
+{
+	struct drbd_bitmap *b = device->bitmap;
+	int count, s, e;
+	unsigned long flags;
+	unsigned long *p_addr, *bm;
+
+	if (!expect(b))
+		return 0;
+	if (!expect(b->bm_pages))
+		return 0;
+
+	spin_lock_irqsave(&b->bm_lock, flags);
+	if (BM_DONT_TEST & b->bm_flags)
+		bm_print_lock_info(device);
+
+	s = S2W(enr);
+	e = min((size_t)S2W(enr+1), b->bm_words);
+	count = 0;
+	if (s < b->bm_words) {
+		int n = e-s;
+		p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s));
+		bm = p_addr + MLPP(s);
+		count += bitmap_weight(bm, n * BITS_PER_LONG);
+		bm_unmap(p_addr);
+	} else {
+		drbd_err(device, "start offset (%d) too large in drbd_bm_e_weight\n", s);
+	}
+	spin_unlock_irqrestore(&b->bm_lock, flags);
+	return count;
+}
diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c
new file mode 100644
index 000000000..b3b9cd562
--- /dev/null
+++ b/drivers/block/drbd/drbd_debugfs.c
@@ -0,0 +1,891 @@
+// SPDX-License-Identifier: GPL-2.0
+#define pr_fmt(fmt) "drbd debugfs: " fmt
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+#include <linux/stat.h>
+#include <linux/jiffies.h>
+#include <linux/list.h>
+
+#include "drbd_int.h"
+#include "drbd_req.h"
+#include "drbd_debugfs.h"
+
+
+/**********************************************************************
+ * Whenever you change the file format, remember to bump the version. *
+ **********************************************************************/
+
+static struct dentry *drbd_debugfs_root;
+static struct dentry *drbd_debugfs_version;
+static struct dentry *drbd_debugfs_resources;
+static struct dentry *drbd_debugfs_minors;
+
+static void seq_print_age_or_dash(struct seq_file *m, bool valid, unsigned long dt)
+{
+	if (valid)
+		seq_printf(m, "\t%d", jiffies_to_msecs(dt));
+	else
+		seq_printf(m, "\t-");
+}
+
+static void __seq_print_rq_state_bit(struct seq_file *m,
+	bool is_set, char *sep, const char *set_name, const char *unset_name)
+{
+	if (is_set && set_name) {
+		seq_putc(m, *sep);
+		seq_puts(m, set_name);
+		*sep = '|';
+	} else if (!is_set && unset_name) {
+		seq_putc(m, *sep);
+		seq_puts(m, unset_name);
+		*sep = '|';
+	}
+}
+
+static void seq_print_rq_state_bit(struct seq_file *m,
+	bool is_set, char *sep, const char *set_name)
+{
+	__seq_print_rq_state_bit(m, is_set, sep, set_name, NULL);
+}
+
+/* pretty print enum drbd_req_state_bits req->rq_state */
+static void seq_print_request_state(struct seq_file *m, struct drbd_request *req)
+{
+	unsigned int s = req->rq_state;
+	char sep = ' ';
+	seq_printf(m, "\t0x%08x", s);
+	seq_printf(m, "\tmaster: %s", req->master_bio ? "pending" : "completed");
+
+	/* RQ_WRITE ignored, already reported */
+	seq_puts(m, "\tlocal:");
+	seq_print_rq_state_bit(m, s & RQ_IN_ACT_LOG, &sep, "in-AL");
+	seq_print_rq_state_bit(m, s & RQ_POSTPONED, &sep, "postponed");
+	seq_print_rq_state_bit(m, s & RQ_COMPLETION_SUSP, &sep, "suspended");
+	sep = ' ';
+	seq_print_rq_state_bit(m, s & RQ_LOCAL_PENDING, &sep, "pending");
+	seq_print_rq_state_bit(m, s & RQ_LOCAL_COMPLETED, &sep, "completed");
+	seq_print_rq_state_bit(m, s & RQ_LOCAL_ABORTED, &sep, "aborted");
+	seq_print_rq_state_bit(m, s & RQ_LOCAL_OK, &sep, "ok");
+	if (sep == ' ')
+		seq_puts(m, " -");
+
+	/* for_each_connection ... */
+	seq_printf(m, "\tnet:");
+	sep = ' ';
+	seq_print_rq_state_bit(m, s & RQ_NET_PENDING, &sep, "pending");
+	seq_print_rq_state_bit(m, s & RQ_NET_QUEUED, &sep, "queued");
+	seq_print_rq_state_bit(m, s & RQ_NET_SENT, &sep, "sent");
+	seq_print_rq_state_bit(m, s & RQ_NET_DONE, &sep, "done");
+	seq_print_rq_state_bit(m, s & RQ_NET_SIS, &sep, "sis");
+	seq_print_rq_state_bit(m, s & RQ_NET_OK, &sep, "ok");
+	if (sep == ' ')
+		seq_puts(m, " -");
+
+	seq_printf(m, " :");
+	sep = ' ';
+	seq_print_rq_state_bit(m, s & RQ_EXP_RECEIVE_ACK, &sep, "B");
+	seq_print_rq_state_bit(m, s & RQ_EXP_WRITE_ACK, &sep, "C");
+	seq_print_rq_state_bit(m, s & RQ_EXP_BARR_ACK, &sep, "barr");
+	if (sep == ' ')
+		seq_puts(m, " -");
+	seq_printf(m, "\n");
+}
+
+static void seq_print_one_request(struct seq_file *m, struct drbd_request *req, unsigned long now)
+{
+	/* change anything here, fixup header below! */
+	unsigned int s = req->rq_state;
+
+#define RQ_HDR_1 "epoch\tsector\tsize\trw"
+	seq_printf(m, "0x%x\t%llu\t%u\t%s",
+		req->epoch,
+		(unsigned long long)req->i.sector, req->i.size >> 9,
+		(s & RQ_WRITE) ? "W" : "R");
+
+#define RQ_HDR_2 "\tstart\tin AL\tsubmit"
+	seq_printf(m, "\t%d", jiffies_to_msecs(now - req->start_jif));
+	seq_print_age_or_dash(m, s & RQ_IN_ACT_LOG, now - req->in_actlog_jif);
+	seq_print_age_or_dash(m, s & RQ_LOCAL_PENDING, now - req->pre_submit_jif);
+
+#define RQ_HDR_3 "\tsent\tacked\tdone"
+	seq_print_age_or_dash(m, s & RQ_NET_SENT, now - req->pre_send_jif);
+	seq_print_age_or_dash(m, (s & RQ_NET_SENT) && !(s & RQ_NET_PENDING), now - req->acked_jif);
+	seq_print_age_or_dash(m, s & RQ_NET_DONE, now - req->net_done_jif);
+
+#define RQ_HDR_4 "\tstate\n"
+	seq_print_request_state(m, req);
+}
+#define RQ_HDR RQ_HDR_1 RQ_HDR_2 RQ_HDR_3 RQ_HDR_4
+
+static void seq_print_minor_vnr_req(struct seq_file *m, struct drbd_request *req, unsigned long now)
+{
+	seq_printf(m, "%u\t%u\t", req->device->minor, req->device->vnr);
+	seq_print_one_request(m, req, now);
+}
+
+static void seq_print_resource_pending_meta_io(struct seq_file *m, struct drbd_resource *resource, unsigned long now)
+{
+	struct drbd_device *device;
+	unsigned int i;
+
+	seq_puts(m, "minor\tvnr\tstart\tsubmit\tintent\n");
+	rcu_read_lock();
+	idr_for_each_entry(&resource->devices, device, i) {
+		struct drbd_md_io tmp;
+		/* In theory this is racy,
+		 * in the sense that there could have been a
+		 * drbd_md_put_buffer(); drbd_md_get_buffer();
+		 * between accessing these members here.  */
+		tmp = device->md_io;
+		if (atomic_read(&tmp.in_use)) {
+			seq_printf(m, "%u\t%u\t%d\t",
+				device->minor, device->vnr,
+				jiffies_to_msecs(now - tmp.start_jif));
+			if (time_before(tmp.submit_jif, tmp.start_jif))
+				seq_puts(m, "-\t");
+			else
+				seq_printf(m, "%d\t", jiffies_to_msecs(now - tmp.submit_jif));
+			seq_printf(m, "%s\n", tmp.current_use);
+		}
+	}
+	rcu_read_unlock();
+}
+
+static void seq_print_waiting_for_AL(struct seq_file *m, struct drbd_resource *resource, unsigned long now)
+{
+	struct drbd_device *device;
+	unsigned int i;
+
+	seq_puts(m, "minor\tvnr\tage\t#waiting\n");
+	rcu_read_lock();
+	idr_for_each_entry(&resource->devices, device, i) {
+		unsigned long jif;
+		struct drbd_request *req;
+		int n = atomic_read(&device->ap_actlog_cnt);
+		if (n) {
+			spin_lock_irq(&device->resource->req_lock);
+			req = list_first_entry_or_null(&device->pending_master_completion[1],
+				struct drbd_request, req_pending_master_completion);
+			/* if the oldest request does not wait for the activity log
+			 * it is not interesting for us here */
+			if (req && !(req->rq_state & RQ_IN_ACT_LOG))
+				jif = req->start_jif;
+			else
+				req = NULL;
+			spin_unlock_irq(&device->resource->req_lock);
+		}
+		if (n) {
+			seq_printf(m, "%u\t%u\t", device->minor, device->vnr);
+			if (req)
+				seq_printf(m, "%u\t", jiffies_to_msecs(now - jif));
+			else
+				seq_puts(m, "-\t");
+			seq_printf(m, "%u\n", n);
+		}
+	}
+	rcu_read_unlock();
+}
+
+static void seq_print_device_bitmap_io(struct seq_file *m, struct drbd_device *device, unsigned long now)
+{
+	struct drbd_bm_aio_ctx *ctx;
+	unsigned long start_jif;
+	unsigned int in_flight;
+	unsigned int flags;
+	spin_lock_irq(&device->resource->req_lock);
+	ctx = list_first_entry_or_null(&device->pending_bitmap_io, struct drbd_bm_aio_ctx, list);
+	if (ctx && ctx->done)
+		ctx = NULL;
+	if (ctx) {
+		start_jif = ctx->start_jif;
+		in_flight = atomic_read(&ctx->in_flight);
+		flags = ctx->flags;
+	}
+	spin_unlock_irq(&device->resource->req_lock);
+	if (ctx) {
+		seq_printf(m, "%u\t%u\t%c\t%u\t%u\n",
+			device->minor, device->vnr,
+			(flags & BM_AIO_READ) ? 'R' : 'W',
+			jiffies_to_msecs(now - start_jif),
+			in_flight);
+	}
+}
+
+static void seq_print_resource_pending_bitmap_io(struct seq_file *m, struct drbd_resource *resource, unsigned long now)
+{
+	struct drbd_device *device;
+	unsigned int i;
+
+	seq_puts(m, "minor\tvnr\trw\tage\t#in-flight\n");
+	rcu_read_lock();
+	idr_for_each_entry(&resource->devices, device, i) {
+		seq_print_device_bitmap_io(m, device, now);
+	}
+	rcu_read_unlock();
+}
+
+/* pretty print enum peer_req->flags */
+static void seq_print_peer_request_flags(struct seq_file *m, struct drbd_peer_request *peer_req)
+{
+	unsigned long f = peer_req->flags;
+	char sep = ' ';
+
+	__seq_print_rq_state_bit(m, f & EE_SUBMITTED, &sep, "submitted", "preparing");
+	__seq_print_rq_state_bit(m, f & EE_APPLICATION, &sep, "application", "internal");
+	seq_print_rq_state_bit(m, f & EE_CALL_AL_COMPLETE_IO, &sep, "in-AL");
+	seq_print_rq_state_bit(m, f & EE_SEND_WRITE_ACK, &sep, "C");
+	seq_print_rq_state_bit(m, f & EE_MAY_SET_IN_SYNC, &sep, "set-in-sync");
+	seq_print_rq_state_bit(m, f & EE_TRIM, &sep, "trim");
+	seq_print_rq_state_bit(m, f & EE_ZEROOUT, &sep, "zero-out");
+	seq_print_rq_state_bit(m, f & EE_WRITE_SAME, &sep, "write-same");
+	seq_putc(m, '\n');
+}
+
+static void seq_print_peer_request(struct seq_file *m,
+	struct drbd_device *device, struct list_head *lh,
+	unsigned long now)
+{
+	bool reported_preparing = false;
+	struct drbd_peer_request *peer_req;
+	list_for_each_entry(peer_req, lh, w.list) {
+		if (reported_preparing && !(peer_req->flags & EE_SUBMITTED))
+			continue;
+
+		if (device)
+			seq_printf(m, "%u\t%u\t", device->minor, device->vnr);
+
+		seq_printf(m, "%llu\t%u\t%c\t%u\t",
+			(unsigned long long)peer_req->i.sector, peer_req->i.size >> 9,
+			(peer_req->flags & EE_WRITE) ? 'W' : 'R',
+			jiffies_to_msecs(now - peer_req->submit_jif));
+		seq_print_peer_request_flags(m, peer_req);
+		if (peer_req->flags & EE_SUBMITTED)
+			break;
+		else
+			reported_preparing = true;
+	}
+}
+
+static void seq_print_device_peer_requests(struct seq_file *m,
+	struct drbd_device *device, unsigned long now)
+{
+	seq_puts(m, "minor\tvnr\tsector\tsize\trw\tage\tflags\n");
+	spin_lock_irq(&device->resource->req_lock);
+	seq_print_peer_request(m, device, &device->active_ee, now);
+	seq_print_peer_request(m, device, &device->read_ee, now);
+	seq_print_peer_request(m, device, &device->sync_ee, now);
+	spin_unlock_irq(&device->resource->req_lock);
+	if (test_bit(FLUSH_PENDING, &device->flags)) {
+		seq_printf(m, "%u\t%u\t-\t-\tF\t%u\tflush\n",
+			device->minor, device->vnr,
+			jiffies_to_msecs(now - device->flush_jif));
+	}
+}
+
+static void seq_print_resource_pending_peer_requests(struct seq_file *m,
+	struct drbd_resource *resource, unsigned long now)
+{
+	struct drbd_device *device;
+	unsigned int i;
+
+	rcu_read_lock();
+	idr_for_each_entry(&resource->devices, device, i) {
+		seq_print_device_peer_requests(m, device, now);
+	}
+	rcu_read_unlock();
+}
+
+static void seq_print_resource_transfer_log_summary(struct seq_file *m,
+	struct drbd_resource *resource,
+	struct drbd_connection *connection,
+	unsigned long now)
+{
+	struct drbd_request *req;
+	unsigned int count = 0;
+	unsigned int show_state = 0;
+
+	seq_puts(m, "n\tdevice\tvnr\t" RQ_HDR);
+	spin_lock_irq(&resource->req_lock);
+	list_for_each_entry(req, &connection->transfer_log, tl_requests) {
+		unsigned int tmp = 0;
+		unsigned int s;
+		++count;
+
+		/* don't disable irq "forever" */
+		if (!(count & 0x1ff)) {
+			struct drbd_request *req_next;
+			kref_get(&req->kref);
+			spin_unlock_irq(&resource->req_lock);
+			cond_resched();
+			spin_lock_irq(&resource->req_lock);
+			req_next = list_next_entry(req, tl_requests);
+			if (kref_put(&req->kref, drbd_req_destroy))
+				req = req_next;
+			if (&req->tl_requests == &connection->transfer_log)
+				break;
+		}
+
+		s = req->rq_state;
+
+		/* This is meant to summarize timing issues, to be able to tell
+		 * local disk problems from network problems.
+		 * Skip requests, if we have shown an even older request with
+		 * similar aspects already.  */
+		if (req->master_bio == NULL)
+			tmp |= 1;
+		if ((s & RQ_LOCAL_MASK) && (s & RQ_LOCAL_PENDING))
+			tmp |= 2;
+		if (s & RQ_NET_MASK) {
+			if (!(s & RQ_NET_SENT))
+				tmp |= 4;
+			if (s & RQ_NET_PENDING)
+				tmp |= 8;
+			if (!(s & RQ_NET_DONE))
+				tmp |= 16;
+		}
+		if ((tmp & show_state) == tmp)
+			continue;
+		show_state |= tmp;
+		seq_printf(m, "%u\t", count);
+		seq_print_minor_vnr_req(m, req, now);
+		if (show_state == 0x1f)
+			break;
+	}
+	spin_unlock_irq(&resource->req_lock);
+}
+
+/* TODO: transfer_log and friends should be moved to resource */
+static int in_flight_summary_show(struct seq_file *m, void *pos)
+{
+	struct drbd_resource *resource = m->private;
+	struct drbd_connection *connection;
+	unsigned long jif = jiffies;
+
+	connection = first_connection(resource);
+	/* This does not happen, actually.
+	 * But be robust and prepare for future code changes. */
+	if (!connection || !kref_get_unless_zero(&connection->kref))
+		return -ESTALE;
+
+	/* BUMP me if you change the file format/content/presentation */
+	seq_printf(m, "v: %u\n\n", 0);
+
+	seq_puts(m, "oldest bitmap IO\n");
+	seq_print_resource_pending_bitmap_io(m, resource, jif);
+	seq_putc(m, '\n');
+
+	seq_puts(m, "meta data IO\n");
+	seq_print_resource_pending_meta_io(m, resource, jif);
+	seq_putc(m, '\n');
+
+	seq_puts(m, "socket buffer stats\n");
+	/* for each connection ... once we have more than one */
+	rcu_read_lock();
+	if (connection->data.socket) {
+		/* open coded SIOCINQ, the "relevant" part */
+		struct tcp_sock *tp = tcp_sk(connection->data.socket->sk);
+		int answ = tp->rcv_nxt - tp->copied_seq;
+		seq_printf(m, "unread receive buffer: %u Byte\n", answ);
+		/* open coded SIOCOUTQ, the "relevant" part */
+		answ = tp->write_seq - tp->snd_una;
+		seq_printf(m, "unacked send buffer: %u Byte\n", answ);
+	}
+	rcu_read_unlock();
+	seq_putc(m, '\n');
+
+	seq_puts(m, "oldest peer requests\n");
+	seq_print_resource_pending_peer_requests(m, resource, jif);
+	seq_putc(m, '\n');
+
+	seq_puts(m, "application requests waiting for activity log\n");
+	seq_print_waiting_for_AL(m, resource, jif);
+	seq_putc(m, '\n');
+
+	seq_puts(m, "oldest application requests\n");
+	seq_print_resource_transfer_log_summary(m, resource, connection, jif);
+	seq_putc(m, '\n');
+
+	jif = jiffies - jif;
+	if (jif)
+		seq_printf(m, "generated in %d ms\n", jiffies_to_msecs(jif));
+	kref_put(&connection->kref, drbd_destroy_connection);
+	return 0;
+}
+
+/* make sure at *open* time that the respective object won't go away. */
+static int drbd_single_open(struct file *file, int (*show)(struct seq_file *, void *),
+		                void *data, struct kref *kref,
+				void (*release)(struct kref *))
+{
+	struct dentry *parent;
+	int ret = -ESTALE;
+
+	/* Are we still linked,
+	 * or has debugfs_remove() already been called? */
+	parent = file->f_path.dentry->d_parent;
+	/* serialize with d_delete() */
+	inode_lock(d_inode(parent));
+	/* Make sure the object is still alive */
+	if (simple_positive(file->f_path.dentry)
+	&& kref_get_unless_zero(kref))
+		ret = 0;
+	inode_unlock(d_inode(parent));
+	if (!ret) {
+		ret = single_open(file, show, data);
+		if (ret)
+			kref_put(kref, release);
+	}
+	return ret;
+}
+
+static int in_flight_summary_open(struct inode *inode, struct file *file)
+{
+	struct drbd_resource *resource = inode->i_private;
+	return drbd_single_open(file, in_flight_summary_show, resource,
+				&resource->kref, drbd_destroy_resource);
+}
+
+static int in_flight_summary_release(struct inode *inode, struct file *file)
+{
+	struct drbd_resource *resource = inode->i_private;
+	kref_put(&resource->kref, drbd_destroy_resource);
+	return single_release(inode, file);
+}
+
+static const struct file_operations in_flight_summary_fops = {
+	.owner		= THIS_MODULE,
+	.open		= in_flight_summary_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= in_flight_summary_release,
+};
+
+void drbd_debugfs_resource_add(struct drbd_resource *resource)
+{
+	struct dentry *dentry;
+
+	dentry = debugfs_create_dir(resource->name, drbd_debugfs_resources);
+	resource->debugfs_res = dentry;
+
+	dentry = debugfs_create_dir("volumes", resource->debugfs_res);
+	resource->debugfs_res_volumes = dentry;
+
+	dentry = debugfs_create_dir("connections", resource->debugfs_res);
+	resource->debugfs_res_connections = dentry;
+
+	dentry = debugfs_create_file("in_flight_summary", 0440,
+				     resource->debugfs_res, resource,
+				     &in_flight_summary_fops);
+	resource->debugfs_res_in_flight_summary = dentry;
+}
+
+static void drbd_debugfs_remove(struct dentry **dp)
+{
+	debugfs_remove(*dp);
+	*dp = NULL;
+}
+
+void drbd_debugfs_resource_cleanup(struct drbd_resource *resource)
+{
+	/* it is ok to call debugfs_remove(NULL) */
+	drbd_debugfs_remove(&resource->debugfs_res_in_flight_summary);
+	drbd_debugfs_remove(&resource->debugfs_res_connections);
+	drbd_debugfs_remove(&resource->debugfs_res_volumes);
+	drbd_debugfs_remove(&resource->debugfs_res);
+}
+
+static void seq_print_one_timing_detail(struct seq_file *m,
+	const struct drbd_thread_timing_details *tdp,
+	unsigned long now)
+{
+	struct drbd_thread_timing_details td;
+	/* No locking...
+	 * use temporary assignment to get at consistent data. */
+	do {
+		td = *tdp;
+	} while (td.cb_nr != tdp->cb_nr);
+	if (!td.cb_addr)
+		return;
+	seq_printf(m, "%u\t%d\t%s:%u\t%ps\n",
+			td.cb_nr,
+			jiffies_to_msecs(now - td.start_jif),
+			td.caller_fn, td.line,
+			td.cb_addr);
+}
+
+static void seq_print_timing_details(struct seq_file *m,
+		const char *title,
+		unsigned int cb_nr, struct drbd_thread_timing_details *tdp, unsigned long now)
+{
+	unsigned int start_idx;
+	unsigned int i;
+
+	seq_printf(m, "%s\n", title);
+	/* If not much is going on, this will result in natural ordering.
+	 * If it is very busy, we will possibly skip events, or even see wrap
+	 * arounds, which could only be avoided with locking.
+	 */
+	start_idx = cb_nr % DRBD_THREAD_DETAILS_HIST;
+	for (i = start_idx; i < DRBD_THREAD_DETAILS_HIST; i++)
+		seq_print_one_timing_detail(m, tdp+i, now);
+	for (i = 0; i < start_idx; i++)
+		seq_print_one_timing_detail(m, tdp+i, now);
+}
+
+static int callback_history_show(struct seq_file *m, void *ignored)
+{
+	struct drbd_connection *connection = m->private;
+	unsigned long jif = jiffies;
+
+	/* BUMP me if you change the file format/content/presentation */
+	seq_printf(m, "v: %u\n\n", 0);
+
+	seq_puts(m, "n\tage\tcallsite\tfn\n");
+	seq_print_timing_details(m, "worker", connection->w_cb_nr, connection->w_timing_details, jif);
+	seq_print_timing_details(m, "receiver", connection->r_cb_nr, connection->r_timing_details, jif);
+	return 0;
+}
+
+static int callback_history_open(struct inode *inode, struct file *file)
+{
+	struct drbd_connection *connection = inode->i_private;
+	return drbd_single_open(file, callback_history_show, connection,
+				&connection->kref, drbd_destroy_connection);
+}
+
+static int callback_history_release(struct inode *inode, struct file *file)
+{
+	struct drbd_connection *connection = inode->i_private;
+	kref_put(&connection->kref, drbd_destroy_connection);
+	return single_release(inode, file);
+}
+
+static const struct file_operations connection_callback_history_fops = {
+	.owner		= THIS_MODULE,
+	.open		= callback_history_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= callback_history_release,
+};
+
+static int connection_oldest_requests_show(struct seq_file *m, void *ignored)
+{
+	struct drbd_connection *connection = m->private;
+	unsigned long now = jiffies;
+	struct drbd_request *r1, *r2;
+
+	/* BUMP me if you change the file format/content/presentation */
+	seq_printf(m, "v: %u\n\n", 0);
+
+	spin_lock_irq(&connection->resource->req_lock);
+	r1 = connection->req_next;
+	if (r1)
+		seq_print_minor_vnr_req(m, r1, now);
+	r2 = connection->req_ack_pending;
+	if (r2 && r2 != r1) {
+		r1 = r2;
+		seq_print_minor_vnr_req(m, r1, now);
+	}
+	r2 = connection->req_not_net_done;
+	if (r2 && r2 != r1)
+		seq_print_minor_vnr_req(m, r2, now);
+	spin_unlock_irq(&connection->resource->req_lock);
+	return 0;
+}
+
+static int connection_oldest_requests_open(struct inode *inode, struct file *file)
+{
+	struct drbd_connection *connection = inode->i_private;
+	return drbd_single_open(file, connection_oldest_requests_show, connection,
+				&connection->kref, drbd_destroy_connection);
+}
+
+static int connection_oldest_requests_release(struct inode *inode, struct file *file)
+{
+	struct drbd_connection *connection = inode->i_private;
+	kref_put(&connection->kref, drbd_destroy_connection);
+	return single_release(inode, file);
+}
+
+static const struct file_operations connection_oldest_requests_fops = {
+	.owner		= THIS_MODULE,
+	.open		= connection_oldest_requests_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= connection_oldest_requests_release,
+};
+
+void drbd_debugfs_connection_add(struct drbd_connection *connection)
+{
+	struct dentry *conns_dir = connection->resource->debugfs_res_connections;
+	struct dentry *dentry;
+
+	/* Once we enable mutliple peers,
+	 * these connections will have descriptive names.
+	 * For now, it is just the one connection to the (only) "peer". */
+	dentry = debugfs_create_dir("peer", conns_dir);
+	connection->debugfs_conn = dentry;
+
+	dentry = debugfs_create_file("callback_history", 0440,
+				     connection->debugfs_conn, connection,
+				     &connection_callback_history_fops);
+	connection->debugfs_conn_callback_history = dentry;
+
+	dentry = debugfs_create_file("oldest_requests", 0440,
+				     connection->debugfs_conn, connection,
+				     &connection_oldest_requests_fops);
+	connection->debugfs_conn_oldest_requests = dentry;
+}
+
+void drbd_debugfs_connection_cleanup(struct drbd_connection *connection)
+{
+	drbd_debugfs_remove(&connection->debugfs_conn_callback_history);
+	drbd_debugfs_remove(&connection->debugfs_conn_oldest_requests);
+	drbd_debugfs_remove(&connection->debugfs_conn);
+}
+
+static void resync_dump_detail(struct seq_file *m, struct lc_element *e)
+{
+       struct bm_extent *bme = lc_entry(e, struct bm_extent, lce);
+
+       seq_printf(m, "%5d %s %s %s", bme->rs_left,
+		  test_bit(BME_NO_WRITES, &bme->flags) ? "NO_WRITES" : "---------",
+		  test_bit(BME_LOCKED, &bme->flags) ? "LOCKED" : "------",
+		  test_bit(BME_PRIORITY, &bme->flags) ? "PRIORITY" : "--------"
+		  );
+}
+
+static int device_resync_extents_show(struct seq_file *m, void *ignored)
+{
+	struct drbd_device *device = m->private;
+
+	/* BUMP me if you change the file format/content/presentation */
+	seq_printf(m, "v: %u\n\n", 0);
+
+	if (get_ldev_if_state(device, D_FAILED)) {
+		lc_seq_printf_stats(m, device->resync);
+		lc_seq_dump_details(m, device->resync, "rs_left flags", resync_dump_detail);
+		put_ldev(device);
+	}
+	return 0;
+}
+
+static int device_act_log_extents_show(struct seq_file *m, void *ignored)
+{
+	struct drbd_device *device = m->private;
+
+	/* BUMP me if you change the file format/content/presentation */
+	seq_printf(m, "v: %u\n\n", 0);
+
+	if (get_ldev_if_state(device, D_FAILED)) {
+		lc_seq_printf_stats(m, device->act_log);
+		lc_seq_dump_details(m, device->act_log, "", NULL);
+		put_ldev(device);
+	}
+	return 0;
+}
+
+static int device_oldest_requests_show(struct seq_file *m, void *ignored)
+{
+	struct drbd_device *device = m->private;
+	struct drbd_resource *resource = device->resource;
+	unsigned long now = jiffies;
+	struct drbd_request *r1, *r2;
+	int i;
+
+	/* BUMP me if you change the file format/content/presentation */
+	seq_printf(m, "v: %u\n\n", 0);
+
+	seq_puts(m, RQ_HDR);
+	spin_lock_irq(&resource->req_lock);
+	/* WRITE, then READ */
+	for (i = 1; i >= 0; --i) {
+		r1 = list_first_entry_or_null(&device->pending_master_completion[i],
+			struct drbd_request, req_pending_master_completion);
+		r2 = list_first_entry_or_null(&device->pending_completion[i],
+			struct drbd_request, req_pending_local);
+		if (r1)
+			seq_print_one_request(m, r1, now);
+		if (r2 && r2 != r1)
+			seq_print_one_request(m, r2, now);
+	}
+	spin_unlock_irq(&resource->req_lock);
+	return 0;
+}
+
+static int device_data_gen_id_show(struct seq_file *m, void *ignored)
+{
+	struct drbd_device *device = m->private;
+	struct drbd_md *md;
+	enum drbd_uuid_index idx;
+
+	if (!get_ldev_if_state(device, D_FAILED))
+		return -ENODEV;
+
+	md = &device->ldev->md;
+	spin_lock_irq(&md->uuid_lock);
+	for (idx = UI_CURRENT; idx <= UI_HISTORY_END; idx++) {
+		seq_printf(m, "0x%016llX\n", md->uuid[idx]);
+	}
+	spin_unlock_irq(&md->uuid_lock);
+	put_ldev(device);
+	return 0;
+}
+
+static int device_ed_gen_id_show(struct seq_file *m, void *ignored)
+{
+	struct drbd_device *device = m->private;
+	seq_printf(m, "0x%016llX\n", (unsigned long long)device->ed_uuid);
+	return 0;
+}
+
+#define drbd_debugfs_device_attr(name)						\
+static int device_ ## name ## _open(struct inode *inode, struct file *file)	\
+{										\
+	struct drbd_device *device = inode->i_private;				\
+	return drbd_single_open(file, device_ ## name ## _show, device,		\
+				&device->kref, drbd_destroy_device);		\
+}										\
+static int device_ ## name ## _release(struct inode *inode, struct file *file)	\
+{										\
+	struct drbd_device *device = inode->i_private;				\
+	kref_put(&device->kref, drbd_destroy_device);				\
+	return single_release(inode, file);					\
+}										\
+static const struct file_operations device_ ## name ## _fops = {		\
+	.owner		= THIS_MODULE,						\
+	.open		= device_ ## name ## _open,				\
+	.read		= seq_read,						\
+	.llseek		= seq_lseek,						\
+	.release	= device_ ## name ## _release,				\
+};
+
+drbd_debugfs_device_attr(oldest_requests)
+drbd_debugfs_device_attr(act_log_extents)
+drbd_debugfs_device_attr(resync_extents)
+drbd_debugfs_device_attr(data_gen_id)
+drbd_debugfs_device_attr(ed_gen_id)
+
+void drbd_debugfs_device_add(struct drbd_device *device)
+{
+	struct dentry *vols_dir = device->resource->debugfs_res_volumes;
+	char minor_buf[8]; /* MINORMASK, MINORBITS == 20; */
+	char vnr_buf[8];   /* volume number vnr is even 16 bit only; */
+	char *slink_name = NULL;
+
+	struct dentry *dentry;
+	if (!vols_dir || !drbd_debugfs_minors)
+		return;
+
+	snprintf(vnr_buf, sizeof(vnr_buf), "%u", device->vnr);
+	dentry = debugfs_create_dir(vnr_buf, vols_dir);
+	device->debugfs_vol = dentry;
+
+	snprintf(minor_buf, sizeof(minor_buf), "%u", device->minor);
+	slink_name = kasprintf(GFP_KERNEL, "../resources/%s/volumes/%u",
+			device->resource->name, device->vnr);
+	if (!slink_name)
+		goto fail;
+	dentry = debugfs_create_symlink(minor_buf, drbd_debugfs_minors, slink_name);
+	device->debugfs_minor = dentry;
+	kfree(slink_name);
+	slink_name = NULL;
+
+#define DCF(name)	do {					\
+	dentry = debugfs_create_file(#name, 0440,	\
+			device->debugfs_vol, device,		\
+			&device_ ## name ## _fops);		\
+	device->debugfs_vol_ ## name = dentry;			\
+	} while (0)
+
+	DCF(oldest_requests);
+	DCF(act_log_extents);
+	DCF(resync_extents);
+	DCF(data_gen_id);
+	DCF(ed_gen_id);
+#undef DCF
+	return;
+
+fail:
+	drbd_debugfs_device_cleanup(device);
+	drbd_err(device, "failed to create debugfs entries\n");
+}
+
+void drbd_debugfs_device_cleanup(struct drbd_device *device)
+{
+	drbd_debugfs_remove(&device->debugfs_minor);
+	drbd_debugfs_remove(&device->debugfs_vol_oldest_requests);
+	drbd_debugfs_remove(&device->debugfs_vol_act_log_extents);
+	drbd_debugfs_remove(&device->debugfs_vol_resync_extents);
+	drbd_debugfs_remove(&device->debugfs_vol_data_gen_id);
+	drbd_debugfs_remove(&device->debugfs_vol_ed_gen_id);
+	drbd_debugfs_remove(&device->debugfs_vol);
+}
+
+void drbd_debugfs_peer_device_add(struct drbd_peer_device *peer_device)
+{
+	struct dentry *conn_dir = peer_device->connection->debugfs_conn;
+	struct dentry *dentry;
+	char vnr_buf[8];
+
+	snprintf(vnr_buf, sizeof(vnr_buf), "%u", peer_device->device->vnr);
+	dentry = debugfs_create_dir(vnr_buf, conn_dir);
+	peer_device->debugfs_peer_dev = dentry;
+}
+
+void drbd_debugfs_peer_device_cleanup(struct drbd_peer_device *peer_device)
+{
+	drbd_debugfs_remove(&peer_device->debugfs_peer_dev);
+}
+
+static int drbd_version_show(struct seq_file *m, void *ignored)
+{
+	seq_printf(m, "# %s\n", drbd_buildtag());
+	seq_printf(m, "VERSION=%s\n", REL_VERSION);
+	seq_printf(m, "API_VERSION=%u\n", API_VERSION);
+	seq_printf(m, "PRO_VERSION_MIN=%u\n", PRO_VERSION_MIN);
+	seq_printf(m, "PRO_VERSION_MAX=%u\n", PRO_VERSION_MAX);
+	return 0;
+}
+
+static int drbd_version_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, drbd_version_show, NULL);
+}
+
+static const struct file_operations drbd_version_fops = {
+	.owner = THIS_MODULE,
+	.open = drbd_version_open,
+	.llseek = seq_lseek,
+	.read = seq_read,
+	.release = single_release,
+};
+
+/* not __exit, may be indirectly called
+ * from the module-load-failure path as well. */
+void drbd_debugfs_cleanup(void)
+{
+	drbd_debugfs_remove(&drbd_debugfs_resources);
+	drbd_debugfs_remove(&drbd_debugfs_minors);
+	drbd_debugfs_remove(&drbd_debugfs_version);
+	drbd_debugfs_remove(&drbd_debugfs_root);
+}
+
+void __init drbd_debugfs_init(void)
+{
+	struct dentry *dentry;
+
+	dentry = debugfs_create_dir("drbd", NULL);
+	drbd_debugfs_root = dentry;
+
+	dentry = debugfs_create_file("version", 0444, drbd_debugfs_root, NULL, &drbd_version_fops);
+	drbd_debugfs_version = dentry;
+
+	dentry = debugfs_create_dir("resources", drbd_debugfs_root);
+	drbd_debugfs_resources = dentry;
+
+	dentry = debugfs_create_dir("minors", drbd_debugfs_root);
+	drbd_debugfs_minors = dentry;
+}
diff --git a/drivers/block/drbd/drbd_debugfs.h b/drivers/block/drbd/drbd_debugfs.h
new file mode 100644
index 000000000..58e31cef0
--- /dev/null
+++ b/drivers/block/drbd/drbd_debugfs.h
@@ -0,0 +1,40 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/debugfs.h>
+
+#include "drbd_int.h"
+
+#ifdef CONFIG_DEBUG_FS
+void __init drbd_debugfs_init(void);
+void drbd_debugfs_cleanup(void);
+
+void drbd_debugfs_resource_add(struct drbd_resource *resource);
+void drbd_debugfs_resource_cleanup(struct drbd_resource *resource);
+
+void drbd_debugfs_connection_add(struct drbd_connection *connection);
+void drbd_debugfs_connection_cleanup(struct drbd_connection *connection);
+
+void drbd_debugfs_device_add(struct drbd_device *device);
+void drbd_debugfs_device_cleanup(struct drbd_device *device);
+
+void drbd_debugfs_peer_device_add(struct drbd_peer_device *peer_device);
+void drbd_debugfs_peer_device_cleanup(struct drbd_peer_device *peer_device);
+#else
+
+static inline void __init drbd_debugfs_init(void) { }
+static inline void drbd_debugfs_cleanup(void) { }
+
+static inline void drbd_debugfs_resource_add(struct drbd_resource *resource) { }
+static inline void drbd_debugfs_resource_cleanup(struct drbd_resource *resource) { }
+
+static inline void drbd_debugfs_connection_add(struct drbd_connection *connection) { }
+static inline void drbd_debugfs_connection_cleanup(struct drbd_connection *connection) { }
+
+static inline void drbd_debugfs_device_add(struct drbd_device *device) { }
+static inline void drbd_debugfs_device_cleanup(struct drbd_device *device) { }
+
+static inline void drbd_debugfs_peer_device_add(struct drbd_peer_device *peer_device) { }
+static inline void drbd_debugfs_peer_device_cleanup(struct drbd_peer_device *peer_device) { }
+
+#endif
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
new file mode 100644
index 000000000..4d661282f
--- /dev/null
+++ b/drivers/block/drbd/drbd_int.h
@@ -0,0 +1,2320 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+  drbd_int.h
+
+  This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+  Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+  Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+  Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+
+*/
+
+#ifndef _DRBD_INT_H
+#define _DRBD_INT_H
+
+#include <crypto/hash.h>
+#include <linux/compiler.h>
+#include <linux/types.h>
+#include <linux/list.h>
+#include <linux/sched/signal.h>
+#include <linux/bitops.h>
+#include <linux/slab.h>
+#include <linux/ratelimit.h>
+#include <linux/tcp.h>
+#include <linux/mutex.h>
+#include <linux/major.h>
+#include <linux/blkdev.h>
+#include <linux/backing-dev.h>
+#include <linux/idr.h>
+#include <linux/dynamic_debug.h>
+#include <net/tcp.h>
+#include <linux/lru_cache.h>
+#include <linux/prefetch.h>
+#include <linux/drbd_genl_api.h>
+#include <linux/drbd.h>
+#include "drbd_strings.h"
+#include "drbd_state.h"
+#include "drbd_protocol.h"
+
+#ifdef __CHECKER__
+# define __protected_by(x)       __attribute__((require_context(x,1,999,"rdwr")))
+# define __protected_read_by(x)  __attribute__((require_context(x,1,999,"read")))
+# define __protected_write_by(x) __attribute__((require_context(x,1,999,"write")))
+#else
+# define __protected_by(x)
+# define __protected_read_by(x)
+# define __protected_write_by(x)
+#endif
+
+/* shared module parameters, defined in drbd_main.c */
+#ifdef CONFIG_DRBD_FAULT_INJECTION
+extern int drbd_enable_faults;
+extern int drbd_fault_rate;
+#endif
+
+extern unsigned int drbd_minor_count;
+extern char drbd_usermode_helper[];
+extern int drbd_proc_details;
+
+
+/* This is used to stop/restart our threads.
+ * Cannot use SIGTERM nor SIGKILL, since these
+ * are sent out by init on runlevel changes
+ * I choose SIGHUP for now.
+ */
+#define DRBD_SIGKILL SIGHUP
+
+#define ID_IN_SYNC      (4711ULL)
+#define ID_OUT_OF_SYNC  (4712ULL)
+#define ID_SYNCER (-1ULL)
+
+#define UUID_NEW_BM_OFFSET ((u64)0x0001000000000000ULL)
+
+struct drbd_device;
+struct drbd_connection;
+
+#define __drbd_printk_device(level, device, fmt, args...) \
+	dev_printk(level, disk_to_dev((device)->vdisk), fmt, ## args)
+#define __drbd_printk_peer_device(level, peer_device, fmt, args...) \
+	dev_printk(level, disk_to_dev((peer_device)->device->vdisk), fmt, ## args)
+#define __drbd_printk_resource(level, resource, fmt, args...) \
+	printk(level "drbd %s: " fmt, (resource)->name, ## args)
+#define __drbd_printk_connection(level, connection, fmt, args...) \
+	printk(level "drbd %s: " fmt, (connection)->resource->name, ## args)
+
+void drbd_printk_with_wrong_object_type(void);
+
+#define __drbd_printk_if_same_type(obj, type, func, level, fmt, args...) \
+	(__builtin_types_compatible_p(typeof(obj), type) || \
+	 __builtin_types_compatible_p(typeof(obj), const type)), \
+	func(level, (const type)(obj), fmt, ## args)
+
+#define drbd_printk(level, obj, fmt, args...) \
+	__builtin_choose_expr( \
+	  __drbd_printk_if_same_type(obj, struct drbd_device *, \
+			     __drbd_printk_device, level, fmt, ## args), \
+	  __builtin_choose_expr( \
+	    __drbd_printk_if_same_type(obj, struct drbd_resource *, \
+			       __drbd_printk_resource, level, fmt, ## args), \
+	    __builtin_choose_expr( \
+	      __drbd_printk_if_same_type(obj, struct drbd_connection *, \
+				 __drbd_printk_connection, level, fmt, ## args), \
+	      __builtin_choose_expr( \
+		__drbd_printk_if_same_type(obj, struct drbd_peer_device *, \
+				 __drbd_printk_peer_device, level, fmt, ## args), \
+		drbd_printk_with_wrong_object_type()))))
+
+#define drbd_dbg(obj, fmt, args...) \
+	drbd_printk(KERN_DEBUG, obj, fmt, ## args)
+#define drbd_alert(obj, fmt, args...) \
+	drbd_printk(KERN_ALERT, obj, fmt, ## args)
+#define drbd_err(obj, fmt, args...) \
+	drbd_printk(KERN_ERR, obj, fmt, ## args)
+#define drbd_warn(obj, fmt, args...) \
+	drbd_printk(KERN_WARNING, obj, fmt, ## args)
+#define drbd_info(obj, fmt, args...) \
+	drbd_printk(KERN_INFO, obj, fmt, ## args)
+#define drbd_emerg(obj, fmt, args...) \
+	drbd_printk(KERN_EMERG, obj, fmt, ## args)
+
+#define dynamic_drbd_dbg(device, fmt, args...) \
+	dynamic_dev_dbg(disk_to_dev(device->vdisk), fmt, ## args)
+
+#define D_ASSERT(device, exp)	do { \
+	if (!(exp)) \
+		drbd_err(device, "ASSERT( " #exp " ) in %s:%d\n", __FILE__, __LINE__); \
+	} while (0)
+
+/**
+ * expect  -  Make an assertion
+ *
+ * Unlike the assert macro, this macro returns a boolean result.
+ */
+#define expect(exp) ({								\
+		bool _bool = (exp);						\
+		if (!_bool)							\
+			drbd_err(device, "ASSERTION %s FAILED in %s\n",		\
+			        #exp, __func__);				\
+		_bool;								\
+		})
+
+/* Defines to control fault insertion */
+enum {
+	DRBD_FAULT_MD_WR = 0,	/* meta data write */
+	DRBD_FAULT_MD_RD = 1,	/*           read  */
+	DRBD_FAULT_RS_WR = 2,	/* resync          */
+	DRBD_FAULT_RS_RD = 3,
+	DRBD_FAULT_DT_WR = 4,	/* data            */
+	DRBD_FAULT_DT_RD = 5,
+	DRBD_FAULT_DT_RA = 6,	/* data read ahead */
+	DRBD_FAULT_BM_ALLOC = 7,	/* bitmap allocation */
+	DRBD_FAULT_AL_EE = 8,	/* alloc ee */
+	DRBD_FAULT_RECEIVE = 9, /* Changes some bytes upon receiving a [rs]data block */
+
+	DRBD_FAULT_MAX,
+};
+
+extern unsigned int
+_drbd_insert_fault(struct drbd_device *device, unsigned int type);
+
+static inline int
+drbd_insert_fault(struct drbd_device *device, unsigned int type) {
+#ifdef CONFIG_DRBD_FAULT_INJECTION
+	return drbd_fault_rate &&
+		(drbd_enable_faults & (1<<type)) &&
+		_drbd_insert_fault(device, type);
+#else
+	return 0;
+#endif
+}
+
+/* integer division, round _UP_ to the next integer */
+#define div_ceil(A, B) ((A)/(B) + ((A)%(B) ? 1 : 0))
+/* usual integer division */
+#define div_floor(A, B) ((A)/(B))
+
+extern struct ratelimit_state drbd_ratelimit_state;
+extern struct idr drbd_devices; /* RCU, updates: genl_lock() */
+extern struct list_head drbd_resources; /* RCU, updates: genl_lock() */
+
+extern const char *cmdname(enum drbd_packet cmd);
+
+/* for sending/receiving the bitmap,
+ * possibly in some encoding scheme */
+struct bm_xfer_ctx {
+	/* "const"
+	 * stores total bits and long words
+	 * of the bitmap, so we don't need to
+	 * call the accessor functions over and again. */
+	unsigned long bm_bits;
+	unsigned long bm_words;
+	/* during xfer, current position within the bitmap */
+	unsigned long bit_offset;
+	unsigned long word_offset;
+
+	/* statistics; index: (h->command == P_BITMAP) */
+	unsigned packets[2];
+	unsigned bytes[2];
+};
+
+extern void INFO_bm_xfer_stats(struct drbd_device *device,
+		const char *direction, struct bm_xfer_ctx *c);
+
+static inline void bm_xfer_ctx_bit_to_word_offset(struct bm_xfer_ctx *c)
+{
+	/* word_offset counts "native long words" (32 or 64 bit),
+	 * aligned at 64 bit.
+	 * Encoded packet may end at an unaligned bit offset.
+	 * In case a fallback clear text packet is transmitted in
+	 * between, we adjust this offset back to the last 64bit
+	 * aligned "native long word", which makes coding and decoding
+	 * the plain text bitmap much more convenient.  */
+#if BITS_PER_LONG == 64
+	c->word_offset = c->bit_offset >> 6;
+#elif BITS_PER_LONG == 32
+	c->word_offset = c->bit_offset >> 5;
+	c->word_offset &= ~(1UL);
+#else
+# error "unsupported BITS_PER_LONG"
+#endif
+}
+
+extern unsigned int drbd_header_size(struct drbd_connection *connection);
+
+/**********************************************************************/
+enum drbd_thread_state {
+	NONE,
+	RUNNING,
+	EXITING,
+	RESTARTING
+};
+
+struct drbd_thread {
+	spinlock_t t_lock;
+	struct task_struct *task;
+	struct completion stop;
+	enum drbd_thread_state t_state;
+	int (*function) (struct drbd_thread *);
+	struct drbd_resource *resource;
+	struct drbd_connection *connection;
+	int reset_cpu_mask;
+	const char *name;
+};
+
+static inline enum drbd_thread_state get_t_state(struct drbd_thread *thi)
+{
+	/* THINK testing the t_state seems to be uncritical in all cases
+	 * (but thread_{start,stop}), so we can read it *without* the lock.
+	 *	--lge */
+
+	smp_rmb();
+	return thi->t_state;
+}
+
+struct drbd_work {
+	struct list_head list;
+	int (*cb)(struct drbd_work *, int cancel);
+};
+
+struct drbd_device_work {
+	struct drbd_work w;
+	struct drbd_device *device;
+};
+
+#include "drbd_interval.h"
+
+extern int drbd_wait_misc(struct drbd_device *, struct drbd_interval *);
+
+extern void lock_all_resources(void);
+extern void unlock_all_resources(void);
+
+struct drbd_request {
+	struct drbd_work w;
+	struct drbd_device *device;
+
+	/* if local IO is not allowed, will be NULL.
+	 * if local IO _is_ allowed, holds the locally submitted bio clone,
+	 * or, after local IO completion, the ERR_PTR(error).
+	 * see drbd_request_endio(). */
+	struct bio *private_bio;
+
+	struct drbd_interval i;
+
+	/* epoch: used to check on "completion" whether this req was in
+	 * the current epoch, and we therefore have to close it,
+	 * causing a p_barrier packet to be send, starting a new epoch.
+	 *
+	 * This corresponds to "barrier" in struct p_barrier[_ack],
+	 * and to "barrier_nr" in struct drbd_epoch (and various
+	 * comments/function parameters/local variable names).
+	 */
+	unsigned int epoch;
+
+	struct list_head tl_requests; /* ring list in the transfer log */
+	struct bio *master_bio;       /* master bio pointer */
+
+	/* see struct drbd_device */
+	struct list_head req_pending_master_completion;
+	struct list_head req_pending_local;
+
+	/* for generic IO accounting */
+	unsigned long start_jif;
+
+	/* for DRBD internal statistics */
+
+	/* Minimal set of time stamps to determine if we wait for activity log
+	 * transactions, local disk or peer.  32 bit "jiffies" are good enough,
+	 * we don't expect a DRBD request to be stalled for several month.
+	 */
+
+	/* before actual request processing */
+	unsigned long in_actlog_jif;
+
+	/* local disk */
+	unsigned long pre_submit_jif;
+
+	/* per connection */
+	unsigned long pre_send_jif;
+	unsigned long acked_jif;
+	unsigned long net_done_jif;
+
+	/* Possibly even more detail to track each phase:
+	 *  master_completion_jif
+	 *      how long did it take to complete the master bio
+	 *      (application visible latency)
+	 *  allocated_jif
+	 *      how long the master bio was blocked until we finally allocated
+	 *      a tracking struct
+	 *  in_actlog_jif
+	 *      how long did we wait for activity log transactions
+	 *
+	 *  net_queued_jif
+	 *      when did we finally queue it for sending
+	 *  pre_send_jif
+	 *      when did we start sending it
+	 *  post_send_jif
+	 *      how long did we block in the network stack trying to send it
+	 *  acked_jif
+	 *      when did we receive (or fake, in protocol A) a remote ACK
+	 *  net_done_jif
+	 *      when did we receive final acknowledgement (P_BARRIER_ACK),
+	 *      or decide, e.g. on connection loss, that we do no longer expect
+	 *      anything from this peer for this request.
+	 *
+	 *  pre_submit_jif
+	 *  post_sub_jif
+	 *      when did we start submiting to the lower level device,
+	 *      and how long did we block in that submit function
+	 *  local_completion_jif
+	 *      how long did it take the lower level device to complete this request
+	 */
+
+
+	/* once it hits 0, we may complete the master_bio */
+	atomic_t completion_ref;
+	/* once it hits 0, we may destroy this drbd_request object */
+	struct kref kref;
+
+	unsigned rq_state; /* see comments above _req_mod() */
+};
+
+struct drbd_epoch {
+	struct drbd_connection *connection;
+	struct list_head list;
+	unsigned int barrier_nr;
+	atomic_t epoch_size; /* increased on every request added. */
+	atomic_t active;     /* increased on every req. added, and dec on every finished. */
+	unsigned long flags;
+};
+
+/* Prototype declaration of function defined in drbd_receiver.c */
+int drbdd_init(struct drbd_thread *);
+int drbd_asender(struct drbd_thread *);
+
+/* drbd_epoch flag bits */
+enum {
+	DE_HAVE_BARRIER_NUMBER,
+};
+
+enum epoch_event {
+	EV_PUT,
+	EV_GOT_BARRIER_NR,
+	EV_BECAME_LAST,
+	EV_CLEANUP = 32, /* used as flag */
+};
+
+struct digest_info {
+	int digest_size;
+	void *digest;
+};
+
+struct drbd_peer_request {
+	struct drbd_work w;
+	struct drbd_peer_device *peer_device;
+	struct drbd_epoch *epoch; /* for writes */
+	struct page *pages;
+	atomic_t pending_bios;
+	struct drbd_interval i;
+	/* see comments on ee flag bits below */
+	unsigned long flags;
+	unsigned long submit_jif;
+	union {
+		u64 block_id;
+		struct digest_info *digest;
+	};
+};
+
+/* ee flag bits.
+ * While corresponding bios are in flight, the only modification will be
+ * set_bit WAS_ERROR, which has to be atomic.
+ * If no bios are in flight yet, or all have been completed,
+ * non-atomic modification to ee->flags is ok.
+ */
+enum {
+	__EE_CALL_AL_COMPLETE_IO,
+	__EE_MAY_SET_IN_SYNC,
+
+	/* is this a TRIM aka REQ_OP_DISCARD? */
+	__EE_TRIM,
+	/* explicit zero-out requested, or
+	 * our lower level cannot handle trim,
+	 * and we want to fall back to zeroout instead */
+	__EE_ZEROOUT,
+
+	/* In case a barrier failed,
+	 * we need to resubmit without the barrier flag. */
+	__EE_RESUBMITTED,
+
+	/* we may have several bios per peer request.
+	 * if any of those fail, we set this flag atomically
+	 * from the endio callback */
+	__EE_WAS_ERROR,
+
+	/* This ee has a pointer to a digest instead of a block id */
+	__EE_HAS_DIGEST,
+
+	/* Conflicting local requests need to be restarted after this request */
+	__EE_RESTART_REQUESTS,
+
+	/* The peer wants a write ACK for this (wire proto C) */
+	__EE_SEND_WRITE_ACK,
+
+	/* Is set when net_conf had two_primaries set while creating this peer_req */
+	__EE_IN_INTERVAL_TREE,
+
+	/* for debugfs: */
+	/* has this been submitted, or does it still wait for something else? */
+	__EE_SUBMITTED,
+
+	/* this is/was a write request */
+	__EE_WRITE,
+
+	/* this is/was a write same request */
+	__EE_WRITE_SAME,
+
+	/* this originates from application on peer
+	 * (not some resync or verify or other DRBD internal request) */
+	__EE_APPLICATION,
+
+	/* If it contains only 0 bytes, send back P_RS_DEALLOCATED */
+	__EE_RS_THIN_REQ,
+};
+#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
+#define EE_MAY_SET_IN_SYNC     (1<<__EE_MAY_SET_IN_SYNC)
+#define EE_TRIM                (1<<__EE_TRIM)
+#define EE_ZEROOUT             (1<<__EE_ZEROOUT)
+#define EE_RESUBMITTED         (1<<__EE_RESUBMITTED)
+#define EE_WAS_ERROR           (1<<__EE_WAS_ERROR)
+#define EE_HAS_DIGEST          (1<<__EE_HAS_DIGEST)
+#define EE_RESTART_REQUESTS	(1<<__EE_RESTART_REQUESTS)
+#define EE_SEND_WRITE_ACK	(1<<__EE_SEND_WRITE_ACK)
+#define EE_IN_INTERVAL_TREE	(1<<__EE_IN_INTERVAL_TREE)
+#define EE_SUBMITTED		(1<<__EE_SUBMITTED)
+#define EE_WRITE		(1<<__EE_WRITE)
+#define EE_WRITE_SAME		(1<<__EE_WRITE_SAME)
+#define EE_APPLICATION		(1<<__EE_APPLICATION)
+#define EE_RS_THIN_REQ		(1<<__EE_RS_THIN_REQ)
+
+/* flag bits per device */
+enum {
+	UNPLUG_REMOTE,		/* sending a "UnplugRemote" could help */
+	MD_DIRTY,		/* current uuids and flags not yet on disk */
+	USE_DEGR_WFC_T,		/* degr-wfc-timeout instead of wfc-timeout. */
+	CL_ST_CHG_SUCCESS,
+	CL_ST_CHG_FAIL,
+	CRASHED_PRIMARY,	/* This node was a crashed primary.
+				 * Gets cleared when the state.conn
+				 * goes into C_CONNECTED state. */
+	CONSIDER_RESYNC,
+
+	MD_NO_FUA,		/* Users wants us to not use FUA/FLUSH on meta data dev */
+
+	BITMAP_IO,		/* suspend application io;
+				   once no more io in flight, start bitmap io */
+	BITMAP_IO_QUEUED,       /* Started bitmap IO */
+	WAS_IO_ERROR,		/* Local disk failed, returned IO error */
+	WAS_READ_ERROR,		/* Local disk READ failed (set additionally to the above) */
+	FORCE_DETACH,		/* Force-detach from local disk, aborting any pending local IO */
+	RESYNC_AFTER_NEG,       /* Resync after online grow after the attach&negotiate finished. */
+	RESIZE_PENDING,		/* Size change detected locally, waiting for the response from
+				 * the peer, if it changed there as well. */
+	NEW_CUR_UUID,		/* Create new current UUID when thawing IO */
+	AL_SUSPENDED,		/* Activity logging is currently suspended. */
+	AHEAD_TO_SYNC_SOURCE,   /* Ahead -> SyncSource queued */
+	B_RS_H_DONE,		/* Before resync handler done (already executed) */
+	DISCARD_MY_DATA,	/* discard_my_data flag per volume */
+	READ_BALANCE_RR,
+
+	FLUSH_PENDING,		/* if set, device->flush_jif is when we submitted that flush
+				 * from drbd_flush_after_epoch() */
+
+	/* cleared only after backing device related structures have been destroyed. */
+	GOING_DISKLESS,		/* Disk is being detached, because of io-error, or admin request. */
+
+	/* to be used in drbd_device_post_work() */
+	GO_DISKLESS,		/* tell worker to schedule cleanup before detach */
+	DESTROY_DISK,		/* tell worker to close backing devices and destroy related structures. */
+	MD_SYNC,		/* tell worker to call drbd_md_sync() */
+	RS_START,		/* tell worker to start resync/OV */
+	RS_PROGRESS,		/* tell worker that resync made significant progress */
+	RS_DONE,		/* tell worker that resync is done */
+};
+
+struct drbd_bitmap; /* opaque for drbd_device */
+
+/* definition of bits in bm_flags to be used in drbd_bm_lock
+ * and drbd_bitmap_io and friends. */
+enum bm_flag {
+	/* currently locked for bulk operation */
+	BM_LOCKED_MASK = 0xf,
+
+	/* in detail, that is: */
+	BM_DONT_CLEAR = 0x1,
+	BM_DONT_SET   = 0x2,
+	BM_DONT_TEST  = 0x4,
+
+	/* so we can mark it locked for bulk operation,
+	 * and still allow all non-bulk operations */
+	BM_IS_LOCKED  = 0x8,
+
+	/* (test bit, count bit) allowed (common case) */
+	BM_LOCKED_TEST_ALLOWED = BM_DONT_CLEAR | BM_DONT_SET | BM_IS_LOCKED,
+
+	/* testing bits, as well as setting new bits allowed, but clearing bits
+	 * would be unexpected.  Used during bitmap receive.  Setting new bits
+	 * requires sending of "out-of-sync" information, though. */
+	BM_LOCKED_SET_ALLOWED = BM_DONT_CLEAR | BM_IS_LOCKED,
+
+	/* for drbd_bm_write_copy_pages, everything is allowed,
+	 * only concurrent bulk operations are locked out. */
+	BM_LOCKED_CHANGE_ALLOWED = BM_IS_LOCKED,
+};
+
+struct drbd_work_queue {
+	struct list_head q;
+	spinlock_t q_lock;  /* to protect the list. */
+	wait_queue_head_t q_wait;
+};
+
+struct drbd_socket {
+	struct mutex mutex;
+	struct socket    *socket;
+	/* this way we get our
+	 * send/receive buffers off the stack */
+	void *sbuf;
+	void *rbuf;
+};
+
+struct drbd_md {
+	u64 md_offset;		/* sector offset to 'super' block */
+
+	u64 la_size_sect;	/* last agreed size, unit sectors */
+	spinlock_t uuid_lock;
+	u64 uuid[UI_SIZE];
+	u64 device_uuid;
+	u32 flags;
+	u32 md_size_sect;
+
+	s32 al_offset;	/* signed relative sector offset to activity log */
+	s32 bm_offset;	/* signed relative sector offset to bitmap */
+
+	/* cached value of bdev->disk_conf->meta_dev_idx (see below) */
+	s32 meta_dev_idx;
+
+	/* see al_tr_number_to_on_disk_sector() */
+	u32 al_stripes;
+	u32 al_stripe_size_4k;
+	u32 al_size_4k; /* cached product of the above */
+};
+
+struct drbd_backing_dev {
+	struct block_device *backing_bdev;
+	struct block_device *md_bdev;
+	struct drbd_md md;
+	struct disk_conf *disk_conf; /* RCU, for updates: resource->conf_update */
+	sector_t known_size; /* last known size of that backing device */
+};
+
+struct drbd_md_io {
+	struct page *page;
+	unsigned long start_jif;	/* last call to drbd_md_get_buffer */
+	unsigned long submit_jif;	/* last _drbd_md_sync_page_io() submit */
+	const char *current_use;
+	atomic_t in_use;
+	unsigned int done;
+	int error;
+};
+
+struct bm_io_work {
+	struct drbd_work w;
+	char *why;
+	enum bm_flag flags;
+	int (*io_fn)(struct drbd_device *device);
+	void (*done)(struct drbd_device *device, int rv);
+};
+
+struct fifo_buffer {
+	unsigned int head_index;
+	unsigned int size;
+	int total; /* sum of all values */
+	int values[];
+};
+extern struct fifo_buffer *fifo_alloc(unsigned int fifo_size);
+
+/* flag bits per connection */
+enum {
+	NET_CONGESTED,		/* The data socket is congested */
+	RESOLVE_CONFLICTS,	/* Set on one node, cleared on the peer! */
+	SEND_PING,
+	GOT_PING_ACK,		/* set when we receive a ping_ack packet, ping_wait gets woken */
+	CONN_WD_ST_CHG_REQ,	/* A cluster wide state change on the connection is active */
+	CONN_WD_ST_CHG_OKAY,
+	CONN_WD_ST_CHG_FAIL,
+	CONN_DRY_RUN,		/* Expect disconnect after resync handshake. */
+	CREATE_BARRIER,		/* next P_DATA is preceded by a P_BARRIER */
+	STATE_SENT,		/* Do not change state/UUIDs while this is set */
+	CALLBACK_PENDING,	/* Whether we have a call_usermodehelper(, UMH_WAIT_PROC)
+				 * pending, from drbd worker context.
+				 */
+	DISCONNECT_SENT,
+
+	DEVICE_WORK_PENDING,	/* tell worker that some device has pending work */
+};
+
+enum which_state { NOW, OLD = NOW, NEW };
+
+struct drbd_resource {
+	char *name;
+#ifdef CONFIG_DEBUG_FS
+	struct dentry *debugfs_res;
+	struct dentry *debugfs_res_volumes;
+	struct dentry *debugfs_res_connections;
+	struct dentry *debugfs_res_in_flight_summary;
+#endif
+	struct kref kref;
+	struct idr devices;		/* volume number to device mapping */
+	struct list_head connections;
+	struct list_head resources;
+	struct res_opts res_opts;
+	struct mutex conf_update;	/* mutex for ready-copy-update of net_conf and disk_conf */
+	struct mutex adm_mutex;		/* mutex to serialize administrative requests */
+	spinlock_t req_lock;
+
+	unsigned susp:1;		/* IO suspended by user */
+	unsigned susp_nod:1;		/* IO suspended because no data */
+	unsigned susp_fen:1;		/* IO suspended because fence peer handler runs */
+
+	enum write_ordering_e write_ordering;
+
+	cpumask_var_t cpu_mask;
+};
+
+struct drbd_thread_timing_details
+{
+	unsigned long start_jif;
+	void *cb_addr;
+	const char *caller_fn;
+	unsigned int line;
+	unsigned int cb_nr;
+};
+
+struct drbd_connection {
+	struct list_head connections;
+	struct drbd_resource *resource;
+#ifdef CONFIG_DEBUG_FS
+	struct dentry *debugfs_conn;
+	struct dentry *debugfs_conn_callback_history;
+	struct dentry *debugfs_conn_oldest_requests;
+#endif
+	struct kref kref;
+	struct idr peer_devices;	/* volume number to peer device mapping */
+	enum drbd_conns cstate;		/* Only C_STANDALONE to C_WF_REPORT_PARAMS */
+	struct mutex cstate_mutex;	/* Protects graceful disconnects */
+	unsigned int connect_cnt;	/* Inc each time a connection is established */
+
+	unsigned long flags;
+	struct net_conf *net_conf;	/* content protected by rcu */
+	wait_queue_head_t ping_wait;	/* Woken upon reception of a ping, and a state change */
+
+	struct sockaddr_storage my_addr;
+	int my_addr_len;
+	struct sockaddr_storage peer_addr;
+	int peer_addr_len;
+
+	struct drbd_socket data;	/* data/barrier/cstate/parameter packets */
+	struct drbd_socket meta;	/* ping/ack (metadata) packets */
+	int agreed_pro_version;		/* actually used protocol version */
+	u32 agreed_features;
+	unsigned long last_received;	/* in jiffies, either socket */
+	unsigned int ko_count;
+
+	struct list_head transfer_log;	/* all requests not yet fully processed */
+
+	struct crypto_shash *cram_hmac_tfm;
+	struct crypto_shash *integrity_tfm;  /* checksums we compute, updates protected by connection->data->mutex */
+	struct crypto_shash *peer_integrity_tfm;  /* checksums we verify, only accessed from receiver thread  */
+	struct crypto_shash *csums_tfm;
+	struct crypto_shash *verify_tfm;
+	void *int_dig_in;
+	void *int_dig_vv;
+
+	/* receiver side */
+	struct drbd_epoch *current_epoch;
+	spinlock_t epoch_lock;
+	unsigned int epochs;
+	atomic_t current_tle_nr;	/* transfer log epoch number */
+	unsigned current_tle_writes;	/* writes seen within this tl epoch */
+
+	unsigned long last_reconnect_jif;
+	/* empty member on older kernels without blk_start_plug() */
+	struct blk_plug receiver_plug;
+	struct drbd_thread receiver;
+	struct drbd_thread worker;
+	struct drbd_thread ack_receiver;
+	struct workqueue_struct *ack_sender;
+
+	/* cached pointers,
+	 * so we can look up the oldest pending requests more quickly.
+	 * protected by resource->req_lock */
+	struct drbd_request *req_next; /* DRBD 9: todo.req_next */
+	struct drbd_request *req_ack_pending;
+	struct drbd_request *req_not_net_done;
+
+	/* sender side */
+	struct drbd_work_queue sender_work;
+
+#define DRBD_THREAD_DETAILS_HIST	16
+	unsigned int w_cb_nr; /* keeps counting up */
+	unsigned int r_cb_nr; /* keeps counting up */
+	struct drbd_thread_timing_details w_timing_details[DRBD_THREAD_DETAILS_HIST];
+	struct drbd_thread_timing_details r_timing_details[DRBD_THREAD_DETAILS_HIST];
+
+	struct {
+		unsigned long last_sent_barrier_jif;
+
+		/* whether this sender thread
+		 * has processed a single write yet. */
+		bool seen_any_write_yet;
+
+		/* Which barrier number to send with the next P_BARRIER */
+		int current_epoch_nr;
+
+		/* how many write requests have been sent
+		 * with req->epoch == current_epoch_nr.
+		 * If none, no P_BARRIER will be sent. */
+		unsigned current_epoch_writes;
+	} send;
+};
+
+static inline bool has_net_conf(struct drbd_connection *connection)
+{
+	bool has_net_conf;
+
+	rcu_read_lock();
+	has_net_conf = rcu_dereference(connection->net_conf);
+	rcu_read_unlock();
+
+	return has_net_conf;
+}
+
+void __update_timing_details(
+		struct drbd_thread_timing_details *tdp,
+		unsigned int *cb_nr,
+		void *cb,
+		const char *fn, const unsigned int line);
+
+#define update_worker_timing_details(c, cb) \
+	__update_timing_details(c->w_timing_details, &c->w_cb_nr, cb, __func__ , __LINE__ )
+#define update_receiver_timing_details(c, cb) \
+	__update_timing_details(c->r_timing_details, &c->r_cb_nr, cb, __func__ , __LINE__ )
+
+struct submit_worker {
+	struct workqueue_struct *wq;
+	struct work_struct worker;
+
+	/* protected by ..->resource->req_lock */
+	struct list_head writes;
+};
+
+struct drbd_peer_device {
+	struct list_head peer_devices;
+	struct drbd_device *device;
+	struct drbd_connection *connection;
+	struct work_struct send_acks_work;
+#ifdef CONFIG_DEBUG_FS
+	struct dentry *debugfs_peer_dev;
+#endif
+};
+
+struct drbd_device {
+	struct drbd_resource *resource;
+	struct list_head peer_devices;
+	struct list_head pending_bitmap_io;
+
+	unsigned long flush_jif;
+#ifdef CONFIG_DEBUG_FS
+	struct dentry *debugfs_minor;
+	struct dentry *debugfs_vol;
+	struct dentry *debugfs_vol_oldest_requests;
+	struct dentry *debugfs_vol_act_log_extents;
+	struct dentry *debugfs_vol_resync_extents;
+	struct dentry *debugfs_vol_data_gen_id;
+	struct dentry *debugfs_vol_ed_gen_id;
+#endif
+
+	unsigned int vnr;	/* volume number within the connection */
+	unsigned int minor;	/* device minor number */
+
+	struct kref kref;
+
+	/* things that are stored as / read from meta data on disk */
+	unsigned long flags;
+
+	/* configured by drbdsetup */
+	struct drbd_backing_dev *ldev __protected_by(local);
+
+	sector_t p_size;     /* partner's disk size */
+	struct request_queue *rq_queue;
+	struct gendisk	    *vdisk;
+
+	unsigned long last_reattach_jif;
+	struct drbd_work resync_work;
+	struct drbd_work unplug_work;
+	struct timer_list resync_timer;
+	struct timer_list md_sync_timer;
+	struct timer_list start_resync_timer;
+	struct timer_list request_timer;
+
+	/* Used after attach while negotiating new disk state. */
+	union drbd_state new_state_tmp;
+
+	union drbd_dev_state state;
+	wait_queue_head_t misc_wait;
+	wait_queue_head_t state_wait;  /* upon each state change. */
+	unsigned int send_cnt;
+	unsigned int recv_cnt;
+	unsigned int read_cnt;
+	unsigned int writ_cnt;
+	unsigned int al_writ_cnt;
+	unsigned int bm_writ_cnt;
+	atomic_t ap_bio_cnt;	 /* Requests we need to complete */
+	atomic_t ap_actlog_cnt;  /* Requests waiting for activity log */
+	atomic_t ap_pending_cnt; /* AP data packets on the wire, ack expected */
+	atomic_t rs_pending_cnt; /* RS request/data packets on the wire */
+	atomic_t unacked_cnt;	 /* Need to send replies for */
+	atomic_t local_cnt;	 /* Waiting for local completion */
+	atomic_t suspend_cnt;
+
+	/* Interval tree of pending local requests */
+	struct rb_root read_requests;
+	struct rb_root write_requests;
+
+	/* for statistics and timeouts */
+	/* [0] read, [1] write */
+	struct list_head pending_master_completion[2];
+	struct list_head pending_completion[2];
+
+	/* use checksums for *this* resync */
+	bool use_csums;
+	/* blocks to resync in this run [unit BM_BLOCK_SIZE] */
+	unsigned long rs_total;
+	/* number of resync blocks that failed in this run */
+	unsigned long rs_failed;
+	/* Syncer's start time [unit jiffies] */
+	unsigned long rs_start;
+	/* cumulated time in PausedSyncX state [unit jiffies] */
+	unsigned long rs_paused;
+	/* skipped because csum was equal [unit BM_BLOCK_SIZE] */
+	unsigned long rs_same_csum;
+#define DRBD_SYNC_MARKS 8
+#define DRBD_SYNC_MARK_STEP (3*HZ)
+	/* block not up-to-date at mark [unit BM_BLOCK_SIZE] */
+	unsigned long rs_mark_left[DRBD_SYNC_MARKS];
+	/* marks's time [unit jiffies] */
+	unsigned long rs_mark_time[DRBD_SYNC_MARKS];
+	/* current index into rs_mark_{left,time} */
+	int rs_last_mark;
+	unsigned long rs_last_bcast; /* [unit jiffies] */
+
+	/* where does the admin want us to start? (sector) */
+	sector_t ov_start_sector;
+	sector_t ov_stop_sector;
+	/* where are we now? (sector) */
+	sector_t ov_position;
+	/* Start sector of out of sync range (to merge printk reporting). */
+	sector_t ov_last_oos_start;
+	/* size of out-of-sync range in sectors. */
+	sector_t ov_last_oos_size;
+	unsigned long ov_left; /* in bits */
+
+	struct drbd_bitmap *bitmap;
+	unsigned long bm_resync_fo; /* bit offset for drbd_bm_find_next */
+
+	/* Used to track operations of resync... */
+	struct lru_cache *resync;
+	/* Number of locked elements in resync LRU */
+	unsigned int resync_locked;
+	/* resync extent number waiting for application requests */
+	unsigned int resync_wenr;
+
+	int open_cnt;
+	u64 *p_uuid;
+
+	struct list_head active_ee; /* IO in progress (P_DATA gets written to disk) */
+	struct list_head sync_ee;   /* IO in progress (P_RS_DATA_REPLY gets written to disk) */
+	struct list_head done_ee;   /* need to send P_WRITE_ACK */
+	struct list_head read_ee;   /* [RS]P_DATA_REQUEST being read */
+	struct list_head net_ee;    /* zero-copy network send in progress */
+
+	int next_barrier_nr;
+	struct list_head resync_reads;
+	atomic_t pp_in_use;		/* allocated from page pool */
+	atomic_t pp_in_use_by_net;	/* sendpage()d, still referenced by tcp */
+	wait_queue_head_t ee_wait;
+	struct drbd_md_io md_io;
+	spinlock_t al_lock;
+	wait_queue_head_t al_wait;
+	struct lru_cache *act_log;	/* activity log */
+	unsigned int al_tr_number;
+	int al_tr_cycle;
+	wait_queue_head_t seq_wait;
+	atomic_t packet_seq;
+	unsigned int peer_seq;
+	spinlock_t peer_seq_lock;
+	unsigned long comm_bm_set; /* communicated number of set bits. */
+	struct bm_io_work bm_io_work;
+	u64 ed_uuid; /* UUID of the exposed data */
+	struct mutex own_state_mutex;
+	struct mutex *state_mutex; /* either own_state_mutex or first_peer_device(device)->connection->cstate_mutex */
+	char congestion_reason;  /* Why we where congested... */
+	atomic_t rs_sect_in; /* for incoming resync data rate, SyncTarget */
+	atomic_t rs_sect_ev; /* for submitted resync data rate, both */
+	int rs_last_sect_ev; /* counter to compare with */
+	int rs_last_events;  /* counter of read or write "events" (unit sectors)
+			      * on the lower level device when we last looked. */
+	int c_sync_rate; /* current resync rate after syncer throttle magic */
+	struct fifo_buffer *rs_plan_s; /* correction values of resync planer (RCU, connection->conn_update) */
+	int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */
+	atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */
+	unsigned int peer_max_bio_size;
+	unsigned int local_max_bio_size;
+
+	/* any requests that would block in drbd_make_request()
+	 * are deferred to this single-threaded work queue */
+	struct submit_worker submit;
+};
+
+struct drbd_bm_aio_ctx {
+	struct drbd_device *device;
+	struct list_head list; /* on device->pending_bitmap_io */;
+	unsigned long start_jif;
+	atomic_t in_flight;
+	unsigned int done;
+	unsigned flags;
+#define BM_AIO_COPY_PAGES	1
+#define BM_AIO_WRITE_HINTED	2
+#define BM_AIO_WRITE_ALL_PAGES	4
+#define BM_AIO_READ		8
+	int error;
+	struct kref kref;
+};
+
+struct drbd_config_context {
+	/* assigned from drbd_genlmsghdr */
+	unsigned int minor;
+	/* assigned from request attributes, if present */
+	unsigned int volume;
+#define VOLUME_UNSPECIFIED		(-1U)
+	/* pointer into the request skb,
+	 * limited lifetime! */
+	char *resource_name;
+	struct nlattr *my_addr;
+	struct nlattr *peer_addr;
+
+	/* reply buffer */
+	struct sk_buff *reply_skb;
+	/* pointer into reply buffer */
+	struct drbd_genlmsghdr *reply_dh;
+	/* resolved from attributes, if possible */
+	struct drbd_device *device;
+	struct drbd_resource *resource;
+	struct drbd_connection *connection;
+};
+
+static inline struct drbd_device *minor_to_device(unsigned int minor)
+{
+	return (struct drbd_device *)idr_find(&drbd_devices, minor);
+}
+
+static inline struct drbd_peer_device *first_peer_device(struct drbd_device *device)
+{
+	return list_first_entry_or_null(&device->peer_devices, struct drbd_peer_device, peer_devices);
+}
+
+static inline struct drbd_peer_device *
+conn_peer_device(struct drbd_connection *connection, int volume_number)
+{
+	return idr_find(&connection->peer_devices, volume_number);
+}
+
+#define for_each_resource(resource, _resources) \
+	list_for_each_entry(resource, _resources, resources)
+
+#define for_each_resource_rcu(resource, _resources) \
+	list_for_each_entry_rcu(resource, _resources, resources)
+
+#define for_each_resource_safe(resource, tmp, _resources) \
+	list_for_each_entry_safe(resource, tmp, _resources, resources)
+
+#define for_each_connection(connection, resource) \
+	list_for_each_entry(connection, &resource->connections, connections)
+
+#define for_each_connection_rcu(connection, resource) \
+	list_for_each_entry_rcu(connection, &resource->connections, connections)
+
+#define for_each_connection_safe(connection, tmp, resource) \
+	list_for_each_entry_safe(connection, tmp, &resource->connections, connections)
+
+#define for_each_peer_device(peer_device, device) \
+	list_for_each_entry(peer_device, &device->peer_devices, peer_devices)
+
+#define for_each_peer_device_rcu(peer_device, device) \
+	list_for_each_entry_rcu(peer_device, &device->peer_devices, peer_devices)
+
+#define for_each_peer_device_safe(peer_device, tmp, device) \
+	list_for_each_entry_safe(peer_device, tmp, &device->peer_devices, peer_devices)
+
+static inline unsigned int device_to_minor(struct drbd_device *device)
+{
+	return device->minor;
+}
+
+/*
+ * function declarations
+ *************************/
+
+/* drbd_main.c */
+
+enum dds_flags {
+	DDSF_FORCED    = 1,
+	DDSF_NO_RESYNC = 2, /* Do not run a resync for the new space */
+};
+
+extern void drbd_init_set_defaults(struct drbd_device *device);
+extern int  drbd_thread_start(struct drbd_thread *thi);
+extern void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait);
+#ifdef CONFIG_SMP
+extern void drbd_thread_current_set_cpu(struct drbd_thread *thi);
+#else
+#define drbd_thread_current_set_cpu(A) ({})
+#endif
+extern void tl_release(struct drbd_connection *, unsigned int barrier_nr,
+		       unsigned int set_size);
+extern void tl_clear(struct drbd_connection *);
+extern void drbd_free_sock(struct drbd_connection *connection);
+extern int drbd_send(struct drbd_connection *connection, struct socket *sock,
+		     void *buf, size_t size, unsigned msg_flags);
+extern int drbd_send_all(struct drbd_connection *, struct socket *, void *, size_t,
+			 unsigned);
+
+extern int __drbd_send_protocol(struct drbd_connection *connection, enum drbd_packet cmd);
+extern int drbd_send_protocol(struct drbd_connection *connection);
+extern int drbd_send_uuids(struct drbd_peer_device *);
+extern int drbd_send_uuids_skip_initial_sync(struct drbd_peer_device *);
+extern void drbd_gen_and_send_sync_uuid(struct drbd_peer_device *);
+extern int drbd_send_sizes(struct drbd_peer_device *, int trigger_reply, enum dds_flags flags);
+extern int drbd_send_state(struct drbd_peer_device *, union drbd_state s);
+extern int drbd_send_current_state(struct drbd_peer_device *);
+extern int drbd_send_sync_param(struct drbd_peer_device *);
+extern void drbd_send_b_ack(struct drbd_connection *connection, u32 barrier_nr,
+			    u32 set_size);
+extern int drbd_send_ack(struct drbd_peer_device *, enum drbd_packet,
+			 struct drbd_peer_request *);
+extern void drbd_send_ack_rp(struct drbd_peer_device *, enum drbd_packet,
+			     struct p_block_req *rp);
+extern void drbd_send_ack_dp(struct drbd_peer_device *, enum drbd_packet,
+			     struct p_data *dp, int data_size);
+extern int drbd_send_ack_ex(struct drbd_peer_device *, enum drbd_packet,
+			    sector_t sector, int blksize, u64 block_id);
+extern int drbd_send_out_of_sync(struct drbd_peer_device *, struct drbd_request *);
+extern int drbd_send_block(struct drbd_peer_device *, enum drbd_packet,
+			   struct drbd_peer_request *);
+extern int drbd_send_dblock(struct drbd_peer_device *, struct drbd_request *req);
+extern int drbd_send_drequest(struct drbd_peer_device *, int cmd,
+			      sector_t sector, int size, u64 block_id);
+extern int drbd_send_drequest_csum(struct drbd_peer_device *, sector_t sector,
+				   int size, void *digest, int digest_size,
+				   enum drbd_packet cmd);
+extern int drbd_send_ov_request(struct drbd_peer_device *, sector_t sector, int size);
+
+extern int drbd_send_bitmap(struct drbd_device *device);
+extern void drbd_send_sr_reply(struct drbd_peer_device *, enum drbd_state_rv retcode);
+extern void conn_send_sr_reply(struct drbd_connection *connection, enum drbd_state_rv retcode);
+extern int drbd_send_rs_deallocated(struct drbd_peer_device *, struct drbd_peer_request *);
+extern void drbd_backing_dev_free(struct drbd_device *device, struct drbd_backing_dev *ldev);
+extern void drbd_device_cleanup(struct drbd_device *device);
+extern void drbd_print_uuids(struct drbd_device *device, const char *text);
+extern void drbd_queue_unplug(struct drbd_device *device);
+
+extern void conn_md_sync(struct drbd_connection *connection);
+extern void drbd_md_write(struct drbd_device *device, void *buffer);
+extern void drbd_md_sync(struct drbd_device *device);
+extern int  drbd_md_read(struct drbd_device *device, struct drbd_backing_dev *bdev);
+extern void drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local);
+extern void _drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local);
+extern void drbd_uuid_new_current(struct drbd_device *device) __must_hold(local);
+extern void drbd_uuid_set_bm(struct drbd_device *device, u64 val) __must_hold(local);
+extern void drbd_uuid_move_history(struct drbd_device *device) __must_hold(local);
+extern void __drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local);
+extern void drbd_md_set_flag(struct drbd_device *device, int flags) __must_hold(local);
+extern void drbd_md_clear_flag(struct drbd_device *device, int flags)__must_hold(local);
+extern int drbd_md_test_flag(struct drbd_backing_dev *, int);
+extern void drbd_md_mark_dirty(struct drbd_device *device);
+extern void drbd_queue_bitmap_io(struct drbd_device *device,
+				 int (*io_fn)(struct drbd_device *),
+				 void (*done)(struct drbd_device *, int),
+				 char *why, enum bm_flag flags);
+extern int drbd_bitmap_io(struct drbd_device *device,
+		int (*io_fn)(struct drbd_device *),
+		char *why, enum bm_flag flags);
+extern int drbd_bitmap_io_from_worker(struct drbd_device *device,
+		int (*io_fn)(struct drbd_device *),
+		char *why, enum bm_flag flags);
+extern int drbd_bmio_set_n_write(struct drbd_device *device) __must_hold(local);
+extern int drbd_bmio_clear_n_write(struct drbd_device *device) __must_hold(local);
+
+/* Meta data layout
+ *
+ * We currently have two possible layouts.
+ * Offsets in (512 byte) sectors.
+ * external:
+ *   |----------- md_size_sect ------------------|
+ *   [ 4k superblock ][ activity log ][  Bitmap  ]
+ *   | al_offset == 8 |
+ *   | bm_offset = al_offset + X      |
+ *  ==> bitmap sectors = md_size_sect - bm_offset
+ *
+ *  Variants:
+ *     old, indexed fixed size meta data:
+ *
+ * internal:
+ *            |----------- md_size_sect ------------------|
+ * [data.....][  Bitmap  ][ activity log ][ 4k superblock ][padding*]
+ *                        | al_offset < 0 |
+ *            | bm_offset = al_offset - Y |
+ *  ==> bitmap sectors = Y = al_offset - bm_offset
+ *
+ *  [padding*] are zero or up to 7 unused 512 Byte sectors to the
+ *  end of the device, so that the [4k superblock] will be 4k aligned.
+ *
+ *  The activity log consists of 4k transaction blocks,
+ *  which are written in a ring-buffer, or striped ring-buffer like fashion,
+ *  which are writtensize used to be fixed 32kB,
+ *  but is about to become configurable.
+ */
+
+/* Our old fixed size meta data layout
+ * allows up to about 3.8TB, so if you want more,
+ * you need to use the "flexible" meta data format. */
+#define MD_128MB_SECT (128LLU << 11)  /* 128 MB, unit sectors */
+#define MD_4kB_SECT	 8
+#define MD_32kB_SECT	64
+
+/* One activity log extent represents 4M of storage */
+#define AL_EXTENT_SHIFT 22
+#define AL_EXTENT_SIZE (1<<AL_EXTENT_SHIFT)
+
+/* We could make these currently hardcoded constants configurable
+ * variables at create-md time (or even re-configurable at runtime?).
+ * Which will require some more changes to the DRBD "super block"
+ * and attach code.
+ *
+ * updates per transaction:
+ *   This many changes to the active set can be logged with one transaction.
+ *   This number is arbitrary.
+ * context per transaction:
+ *   This many context extent numbers are logged with each transaction.
+ *   This number is resulting from the transaction block size (4k), the layout
+ *   of the transaction header, and the number of updates per transaction.
+ *   See drbd_actlog.c:struct al_transaction_on_disk
+ * */
+#define AL_UPDATES_PER_TRANSACTION	 64	// arbitrary
+#define AL_CONTEXT_PER_TRANSACTION	919	// (4096 - 36 - 6*64)/4
+
+#if BITS_PER_LONG == 32
+#define LN2_BPL 5
+#define cpu_to_lel(A) cpu_to_le32(A)
+#define lel_to_cpu(A) le32_to_cpu(A)
+#elif BITS_PER_LONG == 64
+#define LN2_BPL 6
+#define cpu_to_lel(A) cpu_to_le64(A)
+#define lel_to_cpu(A) le64_to_cpu(A)
+#else
+#error "LN2 of BITS_PER_LONG unknown!"
+#endif
+
+/* resync bitmap */
+/* 16MB sized 'bitmap extent' to track syncer usage */
+struct bm_extent {
+	int rs_left; /* number of bits set (out of sync) in this extent. */
+	int rs_failed; /* number of failed resync requests in this extent. */
+	unsigned long flags;
+	struct lc_element lce;
+};
+
+#define BME_NO_WRITES  0  /* bm_extent.flags: no more requests on this one! */
+#define BME_LOCKED     1  /* bm_extent.flags: syncer active on this one. */
+#define BME_PRIORITY   2  /* finish resync IO on this extent ASAP! App IO waiting! */
+
+/* drbd_bitmap.c */
+/*
+ * We need to store one bit for a block.
+ * Example: 1GB disk @ 4096 byte blocks ==> we need 32 KB bitmap.
+ * Bit 0 ==> local node thinks this block is binary identical on both nodes
+ * Bit 1 ==> local node thinks this block needs to be synced.
+ */
+
+#define SLEEP_TIME (HZ/10)
+
+/* We do bitmap IO in units of 4k blocks.
+ * We also still have a hardcoded 4k per bit relation. */
+#define BM_BLOCK_SHIFT	12			 /* 4k per bit */
+#define BM_BLOCK_SIZE	 (1<<BM_BLOCK_SHIFT)
+/* mostly arbitrarily set the represented size of one bitmap extent,
+ * aka resync extent, to 16 MiB (which is also 512 Byte worth of bitmap
+ * at 4k per bit resolution) */
+#define BM_EXT_SHIFT	 24	/* 16 MiB per resync extent */
+#define BM_EXT_SIZE	 (1<<BM_EXT_SHIFT)
+
+#if (BM_EXT_SHIFT != 24) || (BM_BLOCK_SHIFT != 12)
+#error "HAVE YOU FIXED drbdmeta AS WELL??"
+#endif
+
+/* thus many _storage_ sectors are described by one bit */
+#define BM_SECT_TO_BIT(x)   ((x)>>(BM_BLOCK_SHIFT-9))
+#define BM_BIT_TO_SECT(x)   ((sector_t)(x)<<(BM_BLOCK_SHIFT-9))
+#define BM_SECT_PER_BIT     BM_BIT_TO_SECT(1)
+
+/* bit to represented kilo byte conversion */
+#define Bit2KB(bits) ((bits)<<(BM_BLOCK_SHIFT-10))
+
+/* in which _bitmap_ extent (resp. sector) the bit for a certain
+ * _storage_ sector is located in */
+#define BM_SECT_TO_EXT(x)   ((x)>>(BM_EXT_SHIFT-9))
+#define BM_BIT_TO_EXT(x)    ((x) >> (BM_EXT_SHIFT - BM_BLOCK_SHIFT))
+
+/* first storage sector a bitmap extent corresponds to */
+#define BM_EXT_TO_SECT(x)   ((sector_t)(x) << (BM_EXT_SHIFT-9))
+/* how much _storage_ sectors we have per bitmap extent */
+#define BM_SECT_PER_EXT     BM_EXT_TO_SECT(1)
+/* how many bits are covered by one bitmap extent (resync extent) */
+#define BM_BITS_PER_EXT     (1UL << (BM_EXT_SHIFT - BM_BLOCK_SHIFT))
+
+#define BM_BLOCKS_PER_BM_EXT_MASK  (BM_BITS_PER_EXT - 1)
+
+
+/* in one sector of the bitmap, we have this many activity_log extents. */
+#define AL_EXT_PER_BM_SECT  (1 << (BM_EXT_SHIFT - AL_EXTENT_SHIFT))
+
+/* the extent in "PER_EXTENT" below is an activity log extent
+ * we need that many (long words/bytes) to store the bitmap
+ *		     of one AL_EXTENT_SIZE chunk of storage.
+ * we can store the bitmap for that many AL_EXTENTS within
+ * one sector of the _on_disk_ bitmap:
+ * bit	 0	  bit 37   bit 38	     bit (512*8)-1
+ *	     ...|........|........|.. // ..|........|
+ * sect. 0	 `296	  `304			   ^(512*8*8)-1
+ *
+#define BM_WORDS_PER_EXT    ( (AL_EXT_SIZE/BM_BLOCK_SIZE) / BITS_PER_LONG )
+#define BM_BYTES_PER_EXT    ( (AL_EXT_SIZE/BM_BLOCK_SIZE) / 8 )  // 128
+#define BM_EXT_PER_SECT	    ( 512 / BM_BYTES_PER_EXTENT )	 //   4
+ */
+
+#define DRBD_MAX_SECTORS_32 (0xffffffffLU)
+/* we have a certain meta data variant that has a fixed on-disk size of 128
+ * MiB, of which 4k are our "superblock", and 32k are the fixed size activity
+ * log, leaving this many sectors for the bitmap.
+ */
+
+#define DRBD_MAX_SECTORS_FIXED_BM \
+	  ((MD_128MB_SECT - MD_32kB_SECT - MD_4kB_SECT) * (1LL<<(BM_EXT_SHIFT-9)))
+#define DRBD_MAX_SECTORS      DRBD_MAX_SECTORS_FIXED_BM
+/* 16 TB in units of sectors */
+#if BITS_PER_LONG == 32
+/* adjust by one page worth of bitmap,
+ * so we won't wrap around in drbd_bm_find_next_bit.
+ * you should use 64bit OS for that much storage, anyways. */
+#define DRBD_MAX_SECTORS_FLEX BM_BIT_TO_SECT(0xffff7fff)
+#else
+/* we allow up to 1 PiB now on 64bit architecture with "flexible" meta data */
+#define DRBD_MAX_SECTORS_FLEX (1UL << 51)
+/* corresponds to (1UL << 38) bits right now. */
+#endif
+
+/* Estimate max bio size as 256 * PAGE_SIZE,
+ * so for typical PAGE_SIZE of 4k, that is (1<<20) Byte.
+ * Since we may live in a mixed-platform cluster,
+ * we limit us to a platform agnostic constant here for now.
+ * A followup commit may allow even bigger BIO sizes,
+ * once we thought that through. */
+#define DRBD_MAX_BIO_SIZE (1U << 20)
+#if DRBD_MAX_BIO_SIZE > (BIO_MAX_VECS << PAGE_SHIFT)
+#error Architecture not supported: DRBD_MAX_BIO_SIZE > BIO_MAX_SIZE
+#endif
+#define DRBD_MAX_BIO_SIZE_SAFE (1U << 12)       /* Works always = 4k */
+
+#define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* Header 80 only allows packets up to 32KiB data */
+#define DRBD_MAX_BIO_SIZE_P95    (1U << 17) /* Protocol 95 to 99 allows bios up to 128KiB */
+
+/* For now, don't allow more than half of what we can "activate" in one
+ * activity log transaction to be discarded in one go. We may need to rework
+ * drbd_al_begin_io() to allow for even larger discard ranges */
+#define DRBD_MAX_BATCH_BIO_SIZE	 (AL_UPDATES_PER_TRANSACTION/2*AL_EXTENT_SIZE)
+#define DRBD_MAX_BBIO_SECTORS    (DRBD_MAX_BATCH_BIO_SIZE >> 9)
+
+extern int  drbd_bm_init(struct drbd_device *device);
+extern int  drbd_bm_resize(struct drbd_device *device, sector_t sectors, int set_new_bits);
+extern void drbd_bm_cleanup(struct drbd_device *device);
+extern void drbd_bm_set_all(struct drbd_device *device);
+extern void drbd_bm_clear_all(struct drbd_device *device);
+/* set/clear/test only a few bits at a time */
+extern int  drbd_bm_set_bits(
+		struct drbd_device *device, unsigned long s, unsigned long e);
+extern int  drbd_bm_clear_bits(
+		struct drbd_device *device, unsigned long s, unsigned long e);
+extern int drbd_bm_count_bits(
+	struct drbd_device *device, const unsigned long s, const unsigned long e);
+/* bm_set_bits variant for use while holding drbd_bm_lock,
+ * may process the whole bitmap in one go */
+extern void _drbd_bm_set_bits(struct drbd_device *device,
+		const unsigned long s, const unsigned long e);
+extern int  drbd_bm_test_bit(struct drbd_device *device, unsigned long bitnr);
+extern int  drbd_bm_e_weight(struct drbd_device *device, unsigned long enr);
+extern int  drbd_bm_read(struct drbd_device *device) __must_hold(local);
+extern void drbd_bm_mark_for_writeout(struct drbd_device *device, int page_nr);
+extern int  drbd_bm_write(struct drbd_device *device) __must_hold(local);
+extern void drbd_bm_reset_al_hints(struct drbd_device *device) __must_hold(local);
+extern int  drbd_bm_write_hinted(struct drbd_device *device) __must_hold(local);
+extern int  drbd_bm_write_lazy(struct drbd_device *device, unsigned upper_idx) __must_hold(local);
+extern int drbd_bm_write_all(struct drbd_device *device) __must_hold(local);
+extern int  drbd_bm_write_copy_pages(struct drbd_device *device) __must_hold(local);
+extern size_t	     drbd_bm_words(struct drbd_device *device);
+extern unsigned long drbd_bm_bits(struct drbd_device *device);
+extern sector_t      drbd_bm_capacity(struct drbd_device *device);
+
+#define DRBD_END_OF_BITMAP	(~(unsigned long)0)
+extern unsigned long drbd_bm_find_next(struct drbd_device *device, unsigned long bm_fo);
+/* bm_find_next variants for use while you hold drbd_bm_lock() */
+extern unsigned long _drbd_bm_find_next(struct drbd_device *device, unsigned long bm_fo);
+extern unsigned long _drbd_bm_find_next_zero(struct drbd_device *device, unsigned long bm_fo);
+extern unsigned long _drbd_bm_total_weight(struct drbd_device *device);
+extern unsigned long drbd_bm_total_weight(struct drbd_device *device);
+/* for receive_bitmap */
+extern void drbd_bm_merge_lel(struct drbd_device *device, size_t offset,
+		size_t number, unsigned long *buffer);
+/* for _drbd_send_bitmap */
+extern void drbd_bm_get_lel(struct drbd_device *device, size_t offset,
+		size_t number, unsigned long *buffer);
+
+extern void drbd_bm_lock(struct drbd_device *device, char *why, enum bm_flag flags);
+extern void drbd_bm_unlock(struct drbd_device *device);
+/* drbd_main.c */
+
+extern struct kmem_cache *drbd_request_cache;
+extern struct kmem_cache *drbd_ee_cache;	/* peer requests */
+extern struct kmem_cache *drbd_bm_ext_cache;	/* bitmap extents */
+extern struct kmem_cache *drbd_al_ext_cache;	/* activity log extents */
+extern mempool_t drbd_request_mempool;
+extern mempool_t drbd_ee_mempool;
+
+/* drbd's page pool, used to buffer data received from the peer,
+ * or data requested by the peer.
+ *
+ * This does not have an emergency reserve.
+ *
+ * When allocating from this pool, it first takes pages from the pool.
+ * Only if the pool is depleted will try to allocate from the system.
+ *
+ * The assumption is that pages taken from this pool will be processed,
+ * and given back, "quickly", and then can be recycled, so we can avoid
+ * frequent calls to alloc_page(), and still will be able to make progress even
+ * under memory pressure.
+ */
+extern struct page *drbd_pp_pool;
+extern spinlock_t   drbd_pp_lock;
+extern int	    drbd_pp_vacant;
+extern wait_queue_head_t drbd_pp_wait;
+
+/* We also need a standard (emergency-reserve backed) page pool
+ * for meta data IO (activity log, bitmap).
+ * We can keep it global, as long as it is used as "N pages at a time".
+ * 128 should be plenty, currently we probably can get away with as few as 1.
+ */
+#define DRBD_MIN_POOL_PAGES	128
+extern mempool_t drbd_md_io_page_pool;
+
+/* We also need to make sure we get a bio
+ * when we need it for housekeeping purposes */
+extern struct bio_set drbd_md_io_bio_set;
+
+/* And a bio_set for cloning */
+extern struct bio_set drbd_io_bio_set;
+
+extern struct mutex resources_mutex;
+
+extern int conn_lowest_minor(struct drbd_connection *connection);
+extern enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsigned int minor);
+extern void drbd_destroy_device(struct kref *kref);
+extern void drbd_delete_device(struct drbd_device *device);
+
+extern struct drbd_resource *drbd_create_resource(const char *name);
+extern void drbd_free_resource(struct drbd_resource *resource);
+
+extern int set_resource_options(struct drbd_resource *resource, struct res_opts *res_opts);
+extern struct drbd_connection *conn_create(const char *name, struct res_opts *res_opts);
+extern void drbd_destroy_connection(struct kref *kref);
+extern struct drbd_connection *conn_get_by_addrs(void *my_addr, int my_addr_len,
+					    void *peer_addr, int peer_addr_len);
+extern struct drbd_resource *drbd_find_resource(const char *name);
+extern void drbd_destroy_resource(struct kref *kref);
+extern void conn_free_crypto(struct drbd_connection *connection);
+
+/* drbd_req */
+extern void do_submit(struct work_struct *ws);
+extern void __drbd_make_request(struct drbd_device *, struct bio *);
+void drbd_submit_bio(struct bio *bio);
+extern int drbd_read_remote(struct drbd_device *device, struct drbd_request *req);
+extern int is_valid_ar_handle(struct drbd_request *, sector_t);
+
+
+/* drbd_nl.c */
+
+extern struct mutex notification_mutex;
+
+extern void drbd_suspend_io(struct drbd_device *device);
+extern void drbd_resume_io(struct drbd_device *device);
+extern char *ppsize(char *buf, unsigned long long size);
+extern sector_t drbd_new_dev_size(struct drbd_device *, struct drbd_backing_dev *, sector_t, int);
+enum determine_dev_size {
+	DS_ERROR_SHRINK = -3,
+	DS_ERROR_SPACE_MD = -2,
+	DS_ERROR = -1,
+	DS_UNCHANGED = 0,
+	DS_SHRUNK = 1,
+	DS_GREW = 2,
+	DS_GREW_FROM_ZERO = 3,
+};
+extern enum determine_dev_size
+drbd_determine_dev_size(struct drbd_device *, enum dds_flags, struct resize_parms *) __must_hold(local);
+extern void resync_after_online_grow(struct drbd_device *);
+extern void drbd_reconsider_queue_parameters(struct drbd_device *device,
+			struct drbd_backing_dev *bdev, struct o_qlim *o);
+extern enum drbd_state_rv drbd_set_role(struct drbd_device *device,
+					enum drbd_role new_role,
+					int force);
+extern bool conn_try_outdate_peer(struct drbd_connection *connection);
+extern void conn_try_outdate_peer_async(struct drbd_connection *connection);
+extern enum drbd_peer_state conn_khelper(struct drbd_connection *connection, char *cmd);
+extern int drbd_khelper(struct drbd_device *device, char *cmd);
+
+/* drbd_worker.c */
+/* bi_end_io handlers */
+extern void drbd_md_endio(struct bio *bio);
+extern void drbd_peer_request_endio(struct bio *bio);
+extern void drbd_request_endio(struct bio *bio);
+extern int drbd_worker(struct drbd_thread *thi);
+enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_minor);
+void drbd_resync_after_changed(struct drbd_device *device);
+extern void drbd_start_resync(struct drbd_device *device, enum drbd_conns side);
+extern void resume_next_sg(struct drbd_device *device);
+extern void suspend_other_sg(struct drbd_device *device);
+extern int drbd_resync_finished(struct drbd_device *device);
+/* maybe rather drbd_main.c ? */
+extern void *drbd_md_get_buffer(struct drbd_device *device, const char *intent);
+extern void drbd_md_put_buffer(struct drbd_device *device);
+extern int drbd_md_sync_page_io(struct drbd_device *device,
+		struct drbd_backing_dev *bdev, sector_t sector, enum req_op op);
+extern void drbd_ov_out_of_sync_found(struct drbd_device *, sector_t, int);
+extern void wait_until_done_or_force_detached(struct drbd_device *device,
+		struct drbd_backing_dev *bdev, unsigned int *done);
+extern void drbd_rs_controller_reset(struct drbd_device *device);
+
+static inline void ov_out_of_sync_print(struct drbd_device *device)
+{
+	if (device->ov_last_oos_size) {
+		drbd_err(device, "Out of sync: start=%llu, size=%lu (sectors)\n",
+		     (unsigned long long)device->ov_last_oos_start,
+		     (unsigned long)device->ov_last_oos_size);
+	}
+	device->ov_last_oos_size = 0;
+}
+
+
+extern void drbd_csum_bio(struct crypto_shash *, struct bio *, void *);
+extern void drbd_csum_ee(struct crypto_shash *, struct drbd_peer_request *,
+			 void *);
+/* worker callbacks */
+extern int w_e_end_data_req(struct drbd_work *, int);
+extern int w_e_end_rsdata_req(struct drbd_work *, int);
+extern int w_e_end_csum_rs_req(struct drbd_work *, int);
+extern int w_e_end_ov_reply(struct drbd_work *, int);
+extern int w_e_end_ov_req(struct drbd_work *, int);
+extern int w_ov_finished(struct drbd_work *, int);
+extern int w_resync_timer(struct drbd_work *, int);
+extern int w_send_write_hint(struct drbd_work *, int);
+extern int w_send_dblock(struct drbd_work *, int);
+extern int w_send_read_req(struct drbd_work *, int);
+extern int w_e_reissue(struct drbd_work *, int);
+extern int w_restart_disk_io(struct drbd_work *, int);
+extern int w_send_out_of_sync(struct drbd_work *, int);
+
+extern void resync_timer_fn(struct timer_list *t);
+extern void start_resync_timer_fn(struct timer_list *t);
+
+extern void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req);
+
+/* drbd_receiver.c */
+extern int drbd_issue_discard_or_zero_out(struct drbd_device *device,
+		sector_t start, unsigned int nr_sectors, int flags);
+extern int drbd_receiver(struct drbd_thread *thi);
+extern int drbd_ack_receiver(struct drbd_thread *thi);
+extern void drbd_send_ping_wf(struct work_struct *ws);
+extern void drbd_send_acks_wf(struct work_struct *ws);
+extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device);
+extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector,
+		bool throttle_if_app_is_waiting);
+extern int drbd_submit_peer_request(struct drbd_device *,
+				    struct drbd_peer_request *, blk_opf_t, int);
+extern int drbd_free_peer_reqs(struct drbd_device *, struct list_head *);
+extern struct drbd_peer_request *drbd_alloc_peer_req(struct drbd_peer_device *, u64,
+						     sector_t, unsigned int,
+						     unsigned int,
+						     gfp_t) __must_hold(local);
+extern void __drbd_free_peer_req(struct drbd_device *, struct drbd_peer_request *,
+				 int);
+#define drbd_free_peer_req(m,e) __drbd_free_peer_req(m, e, 0)
+#define drbd_free_net_peer_req(m,e) __drbd_free_peer_req(m, e, 1)
+extern struct page *drbd_alloc_pages(struct drbd_peer_device *, unsigned int, bool);
+extern void drbd_set_recv_tcq(struct drbd_device *device, int tcq_enabled);
+extern void _drbd_clear_done_ee(struct drbd_device *device, struct list_head *to_be_freed);
+extern int drbd_connected(struct drbd_peer_device *);
+
+/* sets the number of 512 byte sectors of our virtual device */
+void drbd_set_my_capacity(struct drbd_device *device, sector_t size);
+
+/*
+ * used to submit our private bio
+ */
+static inline void drbd_submit_bio_noacct(struct drbd_device *device,
+					     int fault_type, struct bio *bio)
+{
+	__release(local);
+	if (!bio->bi_bdev) {
+		drbd_err(device, "drbd_submit_bio_noacct: bio->bi_bdev == NULL\n");
+		bio->bi_status = BLK_STS_IOERR;
+		bio_endio(bio);
+		return;
+	}
+
+	if (drbd_insert_fault(device, fault_type))
+		bio_io_error(bio);
+	else
+		submit_bio_noacct(bio);
+}
+
+void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backing_dev *bdev,
+			      enum write_ordering_e wo);
+
+/* drbd_proc.c */
+extern struct proc_dir_entry *drbd_proc;
+int drbd_seq_show(struct seq_file *seq, void *v);
+
+/* drbd_actlog.c */
+extern bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval *i);
+extern int drbd_al_begin_io_nonblock(struct drbd_device *device, struct drbd_interval *i);
+extern void drbd_al_begin_io_commit(struct drbd_device *device);
+extern bool drbd_al_begin_io_fastpath(struct drbd_device *device, struct drbd_interval *i);
+extern void drbd_al_begin_io(struct drbd_device *device, struct drbd_interval *i);
+extern void drbd_al_complete_io(struct drbd_device *device, struct drbd_interval *i);
+extern void drbd_rs_complete_io(struct drbd_device *device, sector_t sector);
+extern int drbd_rs_begin_io(struct drbd_device *device, sector_t sector);
+extern int drbd_try_rs_begin_io(struct drbd_device *device, sector_t sector);
+extern void drbd_rs_cancel_all(struct drbd_device *device);
+extern int drbd_rs_del_all(struct drbd_device *device);
+extern void drbd_rs_failed_io(struct drbd_device *device,
+		sector_t sector, int size);
+extern void drbd_advance_rs_marks(struct drbd_device *device, unsigned long still_to_go);
+
+enum update_sync_bits_mode { RECORD_RS_FAILED, SET_OUT_OF_SYNC, SET_IN_SYNC };
+extern int __drbd_change_sync(struct drbd_device *device, sector_t sector, int size,
+		enum update_sync_bits_mode mode);
+#define drbd_set_in_sync(device, sector, size) \
+	__drbd_change_sync(device, sector, size, SET_IN_SYNC)
+#define drbd_set_out_of_sync(device, sector, size) \
+	__drbd_change_sync(device, sector, size, SET_OUT_OF_SYNC)
+#define drbd_rs_failed_io(device, sector, size) \
+	__drbd_change_sync(device, sector, size, RECORD_RS_FAILED)
+extern void drbd_al_shrink(struct drbd_device *device);
+extern int drbd_al_initialize(struct drbd_device *, void *);
+
+/* drbd_nl.c */
+/* state info broadcast */
+struct sib_info {
+	enum drbd_state_info_bcast_reason sib_reason;
+	union {
+		struct {
+			char *helper_name;
+			unsigned helper_exit_code;
+		};
+		struct {
+			union drbd_state os;
+			union drbd_state ns;
+		};
+	};
+};
+void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib);
+
+extern int notify_resource_state(struct sk_buff *,
+				  unsigned int,
+				  struct drbd_resource *,
+				  struct resource_info *,
+				  enum drbd_notification_type);
+extern int notify_device_state(struct sk_buff *,
+				unsigned int,
+				struct drbd_device *,
+				struct device_info *,
+				enum drbd_notification_type);
+extern int notify_connection_state(struct sk_buff *,
+				    unsigned int,
+				    struct drbd_connection *,
+				    struct connection_info *,
+				    enum drbd_notification_type);
+extern int notify_peer_device_state(struct sk_buff *,
+				     unsigned int,
+				     struct drbd_peer_device *,
+				     struct peer_device_info *,
+				     enum drbd_notification_type);
+extern void notify_helper(enum drbd_notification_type, struct drbd_device *,
+			  struct drbd_connection *, const char *, int);
+
+/*
+ * inline helper functions
+ *************************/
+
+/* see also page_chain_add and friends in drbd_receiver.c */
+static inline struct page *page_chain_next(struct page *page)
+{
+	return (struct page *)page_private(page);
+}
+#define page_chain_for_each(page) \
+	for (; page && ({ prefetch(page_chain_next(page)); 1; }); \
+			page = page_chain_next(page))
+#define page_chain_for_each_safe(page, n) \
+	for (; page && ({ n = page_chain_next(page); 1; }); page = n)
+
+
+static inline int drbd_peer_req_has_active_page(struct drbd_peer_request *peer_req)
+{
+	struct page *page = peer_req->pages;
+	page_chain_for_each(page) {
+		if (page_count(page) > 1)
+			return 1;
+	}
+	return 0;
+}
+
+static inline union drbd_state drbd_read_state(struct drbd_device *device)
+{
+	struct drbd_resource *resource = device->resource;
+	union drbd_state rv;
+
+	rv.i = device->state.i;
+	rv.susp = resource->susp;
+	rv.susp_nod = resource->susp_nod;
+	rv.susp_fen = resource->susp_fen;
+
+	return rv;
+}
+
+enum drbd_force_detach_flags {
+	DRBD_READ_ERROR,
+	DRBD_WRITE_ERROR,
+	DRBD_META_IO_ERROR,
+	DRBD_FORCE_DETACH,
+};
+
+#define __drbd_chk_io_error(m,f) __drbd_chk_io_error_(m,f, __func__)
+static inline void __drbd_chk_io_error_(struct drbd_device *device,
+		enum drbd_force_detach_flags df,
+		const char *where)
+{
+	enum drbd_io_error_p ep;
+
+	rcu_read_lock();
+	ep = rcu_dereference(device->ldev->disk_conf)->on_io_error;
+	rcu_read_unlock();
+	switch (ep) {
+	case EP_PASS_ON: /* FIXME would this be better named "Ignore"? */
+		if (df == DRBD_READ_ERROR || df == DRBD_WRITE_ERROR) {
+			if (__ratelimit(&drbd_ratelimit_state))
+				drbd_err(device, "Local IO failed in %s.\n", where);
+			if (device->state.disk > D_INCONSISTENT)
+				_drbd_set_state(_NS(device, disk, D_INCONSISTENT), CS_HARD, NULL);
+			break;
+		}
+		fallthrough;	/* for DRBD_META_IO_ERROR or DRBD_FORCE_DETACH */
+	case EP_DETACH:
+	case EP_CALL_HELPER:
+		/* Remember whether we saw a READ or WRITE error.
+		 *
+		 * Recovery of the affected area for WRITE failure is covered
+		 * by the activity log.
+		 * READ errors may fall outside that area though. Certain READ
+		 * errors can be "healed" by writing good data to the affected
+		 * blocks, which triggers block re-allocation in lower layers.
+		 *
+		 * If we can not write the bitmap after a READ error,
+		 * we may need to trigger a full sync (see w_go_diskless()).
+		 *
+		 * Force-detach is not really an IO error, but rather a
+		 * desperate measure to try to deal with a completely
+		 * unresponsive lower level IO stack.
+		 * Still it should be treated as a WRITE error.
+		 *
+		 * Meta IO error is always WRITE error:
+		 * we read meta data only once during attach,
+		 * which will fail in case of errors.
+		 */
+		set_bit(WAS_IO_ERROR, &device->flags);
+		if (df == DRBD_READ_ERROR)
+			set_bit(WAS_READ_ERROR, &device->flags);
+		if (df == DRBD_FORCE_DETACH)
+			set_bit(FORCE_DETACH, &device->flags);
+		if (device->state.disk > D_FAILED) {
+			_drbd_set_state(_NS(device, disk, D_FAILED), CS_HARD, NULL);
+			drbd_err(device,
+				"Local IO failed in %s. Detaching...\n", where);
+		}
+		break;
+	}
+}
+
+/**
+ * drbd_chk_io_error: Handle the on_io_error setting, should be called from all io completion handlers
+ * @device:	 DRBD device.
+ * @error:	 Error code passed to the IO completion callback
+ * @forcedetach: Force detach. I.e. the error happened while accessing the meta data
+ *
+ * See also drbd_main.c:after_state_ch() if (os.disk > D_FAILED && ns.disk == D_FAILED)
+ */
+#define drbd_chk_io_error(m,e,f) drbd_chk_io_error_(m,e,f, __func__)
+static inline void drbd_chk_io_error_(struct drbd_device *device,
+	int error, enum drbd_force_detach_flags forcedetach, const char *where)
+{
+	if (error) {
+		unsigned long flags;
+		spin_lock_irqsave(&device->resource->req_lock, flags);
+		__drbd_chk_io_error_(device, forcedetach, where);
+		spin_unlock_irqrestore(&device->resource->req_lock, flags);
+	}
+}
+
+
+/**
+ * drbd_md_first_sector() - Returns the first sector number of the meta data area
+ * @bdev:	Meta data block device.
+ *
+ * BTW, for internal meta data, this happens to be the maximum capacity
+ * we could agree upon with our peer node.
+ */
+static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev)
+{
+	switch (bdev->md.meta_dev_idx) {
+	case DRBD_MD_INDEX_INTERNAL:
+	case DRBD_MD_INDEX_FLEX_INT:
+		return bdev->md.md_offset + bdev->md.bm_offset;
+	case DRBD_MD_INDEX_FLEX_EXT:
+	default:
+		return bdev->md.md_offset;
+	}
+}
+
+/**
+ * drbd_md_last_sector() - Return the last sector number of the meta data area
+ * @bdev:	Meta data block device.
+ */
+static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev)
+{
+	switch (bdev->md.meta_dev_idx) {
+	case DRBD_MD_INDEX_INTERNAL:
+	case DRBD_MD_INDEX_FLEX_INT:
+		return bdev->md.md_offset + MD_4kB_SECT -1;
+	case DRBD_MD_INDEX_FLEX_EXT:
+	default:
+		return bdev->md.md_offset + bdev->md.md_size_sect -1;
+	}
+}
+
+/* Returns the number of 512 byte sectors of the device */
+static inline sector_t drbd_get_capacity(struct block_device *bdev)
+{
+	return bdev ? bdev_nr_sectors(bdev) : 0;
+}
+
+/**
+ * drbd_get_max_capacity() - Returns the capacity we announce to out peer
+ * @bdev:	Meta data block device.
+ *
+ * returns the capacity we announce to out peer.  we clip ourselves at the
+ * various MAX_SECTORS, because if we don't, current implementation will
+ * oops sooner or later
+ */
+static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev)
+{
+	sector_t s;
+
+	switch (bdev->md.meta_dev_idx) {
+	case DRBD_MD_INDEX_INTERNAL:
+	case DRBD_MD_INDEX_FLEX_INT:
+		s = drbd_get_capacity(bdev->backing_bdev)
+			? min_t(sector_t, DRBD_MAX_SECTORS_FLEX,
+				drbd_md_first_sector(bdev))
+			: 0;
+		break;
+	case DRBD_MD_INDEX_FLEX_EXT:
+		s = min_t(sector_t, DRBD_MAX_SECTORS_FLEX,
+				drbd_get_capacity(bdev->backing_bdev));
+		/* clip at maximum size the meta device can support */
+		s = min_t(sector_t, s,
+			BM_EXT_TO_SECT(bdev->md.md_size_sect
+				     - bdev->md.bm_offset));
+		break;
+	default:
+		s = min_t(sector_t, DRBD_MAX_SECTORS,
+				drbd_get_capacity(bdev->backing_bdev));
+	}
+	return s;
+}
+
+/**
+ * drbd_md_ss() - Return the sector number of our meta data super block
+ * @bdev:	Meta data block device.
+ */
+static inline sector_t drbd_md_ss(struct drbd_backing_dev *bdev)
+{
+	const int meta_dev_idx = bdev->md.meta_dev_idx;
+
+	if (meta_dev_idx == DRBD_MD_INDEX_FLEX_EXT)
+		return 0;
+
+	/* Since drbd08, internal meta data is always "flexible".
+	 * position: last 4k aligned block of 4k size */
+	if (meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
+	    meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)
+		return (drbd_get_capacity(bdev->backing_bdev) & ~7ULL) - 8;
+
+	/* external, some index; this is the old fixed size layout */
+	return MD_128MB_SECT * bdev->md.meta_dev_idx;
+}
+
+static inline void
+drbd_queue_work(struct drbd_work_queue *q, struct drbd_work *w)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&q->q_lock, flags);
+	list_add_tail(&w->list, &q->q);
+	spin_unlock_irqrestore(&q->q_lock, flags);
+	wake_up(&q->q_wait);
+}
+
+static inline void
+drbd_queue_work_if_unqueued(struct drbd_work_queue *q, struct drbd_work *w)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&q->q_lock, flags);
+	if (list_empty_careful(&w->list))
+		list_add_tail(&w->list, &q->q);
+	spin_unlock_irqrestore(&q->q_lock, flags);
+	wake_up(&q->q_wait);
+}
+
+static inline void
+drbd_device_post_work(struct drbd_device *device, int work_bit)
+{
+	if (!test_and_set_bit(work_bit, &device->flags)) {
+		struct drbd_connection *connection =
+			first_peer_device(device)->connection;
+		struct drbd_work_queue *q = &connection->sender_work;
+		if (!test_and_set_bit(DEVICE_WORK_PENDING, &connection->flags))
+			wake_up(&q->q_wait);
+	}
+}
+
+extern void drbd_flush_workqueue(struct drbd_work_queue *work_queue);
+
+/* To get the ack_receiver out of the blocking network stack,
+ * so it can change its sk_rcvtimeo from idle- to ping-timeout,
+ * and send a ping, we need to send a signal.
+ * Which signal we send is irrelevant. */
+static inline void wake_ack_receiver(struct drbd_connection *connection)
+{
+	struct task_struct *task = connection->ack_receiver.task;
+	if (task && get_t_state(&connection->ack_receiver) == RUNNING)
+		send_sig(SIGXCPU, task, 1);
+}
+
+static inline void request_ping(struct drbd_connection *connection)
+{
+	set_bit(SEND_PING, &connection->flags);
+	wake_ack_receiver(connection);
+}
+
+extern void *conn_prepare_command(struct drbd_connection *, struct drbd_socket *);
+extern void *drbd_prepare_command(struct drbd_peer_device *, struct drbd_socket *);
+extern int conn_send_command(struct drbd_connection *, struct drbd_socket *,
+			     enum drbd_packet, unsigned int, void *,
+			     unsigned int);
+extern int drbd_send_command(struct drbd_peer_device *, struct drbd_socket *,
+			     enum drbd_packet, unsigned int, void *,
+			     unsigned int);
+
+extern int drbd_send_ping(struct drbd_connection *connection);
+extern int drbd_send_ping_ack(struct drbd_connection *connection);
+extern int drbd_send_state_req(struct drbd_peer_device *, union drbd_state, union drbd_state);
+extern int conn_send_state_req(struct drbd_connection *, union drbd_state, union drbd_state);
+
+static inline void drbd_thread_stop(struct drbd_thread *thi)
+{
+	_drbd_thread_stop(thi, false, true);
+}
+
+static inline void drbd_thread_stop_nowait(struct drbd_thread *thi)
+{
+	_drbd_thread_stop(thi, false, false);
+}
+
+static inline void drbd_thread_restart_nowait(struct drbd_thread *thi)
+{
+	_drbd_thread_stop(thi, true, false);
+}
+
+/* counts how many answer packets packets we expect from our peer,
+ * for either explicit application requests,
+ * or implicit barrier packets as necessary.
+ * increased:
+ *  w_send_barrier
+ *  _req_mod(req, QUEUE_FOR_NET_WRITE or QUEUE_FOR_NET_READ);
+ *    it is much easier and equally valid to count what we queue for the
+ *    worker, even before it actually was queued or send.
+ *    (drbd_make_request_common; recovery path on read io-error)
+ * decreased:
+ *  got_BarrierAck (respective tl_clear, tl_clear_barrier)
+ *  _req_mod(req, DATA_RECEIVED)
+ *     [from receive_DataReply]
+ *  _req_mod(req, WRITE_ACKED_BY_PEER or RECV_ACKED_BY_PEER or NEG_ACKED)
+ *     [from got_BlockAck (P_WRITE_ACK, P_RECV_ACK)]
+ *     for some reason it is NOT decreased in got_NegAck,
+ *     but in the resulting cleanup code from report_params.
+ *     we should try to remember the reason for that...
+ *  _req_mod(req, SEND_FAILED or SEND_CANCELED)
+ *  _req_mod(req, CONNECTION_LOST_WHILE_PENDING)
+ *     [from tl_clear_barrier]
+ */
+static inline void inc_ap_pending(struct drbd_device *device)
+{
+	atomic_inc(&device->ap_pending_cnt);
+}
+
+#define ERR_IF_CNT_IS_NEGATIVE(which, func, line)			\
+	if (atomic_read(&device->which) < 0)				\
+		drbd_err(device, "in %s:%d: " #which " = %d < 0 !\n",	\
+			func, line,					\
+			atomic_read(&device->which))
+
+#define dec_ap_pending(device) _dec_ap_pending(device, __func__, __LINE__)
+static inline void _dec_ap_pending(struct drbd_device *device, const char *func, int line)
+{
+	if (atomic_dec_and_test(&device->ap_pending_cnt))
+		wake_up(&device->misc_wait);
+	ERR_IF_CNT_IS_NEGATIVE(ap_pending_cnt, func, line);
+}
+
+/* counts how many resync-related answers we still expect from the peer
+ *		     increase			decrease
+ * C_SYNC_TARGET sends P_RS_DATA_REQUEST (and expects P_RS_DATA_REPLY)
+ * C_SYNC_SOURCE sends P_RS_DATA_REPLY   (and expects P_WRITE_ACK with ID_SYNCER)
+ *					   (or P_NEG_ACK with ID_SYNCER)
+ */
+static inline void inc_rs_pending(struct drbd_device *device)
+{
+	atomic_inc(&device->rs_pending_cnt);
+}
+
+#define dec_rs_pending(device) _dec_rs_pending(device, __func__, __LINE__)
+static inline void _dec_rs_pending(struct drbd_device *device, const char *func, int line)
+{
+	atomic_dec(&device->rs_pending_cnt);
+	ERR_IF_CNT_IS_NEGATIVE(rs_pending_cnt, func, line);
+}
+
+/* counts how many answers we still need to send to the peer.
+ * increased on
+ *  receive_Data	unless protocol A;
+ *			we need to send a P_RECV_ACK (proto B)
+ *			or P_WRITE_ACK (proto C)
+ *  receive_RSDataReply (recv_resync_read) we need to send a P_WRITE_ACK
+ *  receive_DataRequest (receive_RSDataRequest) we need to send back P_DATA
+ *  receive_Barrier_*	we need to send a P_BARRIER_ACK
+ */
+static inline void inc_unacked(struct drbd_device *device)
+{
+	atomic_inc(&device->unacked_cnt);
+}
+
+#define dec_unacked(device) _dec_unacked(device, __func__, __LINE__)
+static inline void _dec_unacked(struct drbd_device *device, const char *func, int line)
+{
+	atomic_dec(&device->unacked_cnt);
+	ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line);
+}
+
+#define sub_unacked(device, n) _sub_unacked(device, n, __func__, __LINE__)
+static inline void _sub_unacked(struct drbd_device *device, int n, const char *func, int line)
+{
+	atomic_sub(n, &device->unacked_cnt);
+	ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line);
+}
+
+static inline bool is_sync_target_state(enum drbd_conns connection_state)
+{
+	return	connection_state == C_SYNC_TARGET ||
+		connection_state == C_PAUSED_SYNC_T;
+}
+
+static inline bool is_sync_source_state(enum drbd_conns connection_state)
+{
+	return	connection_state == C_SYNC_SOURCE ||
+		connection_state == C_PAUSED_SYNC_S;
+}
+
+static inline bool is_sync_state(enum drbd_conns connection_state)
+{
+	return	is_sync_source_state(connection_state) ||
+		is_sync_target_state(connection_state);
+}
+
+/**
+ * get_ldev() - Increase the ref count on device->ldev. Returns 0 if there is no ldev
+ * @_device:		DRBD device.
+ * @_min_state:		Minimum device state required for success.
+ *
+ * You have to call put_ldev() when finished working with device->ldev.
+ */
+#define get_ldev_if_state(_device, _min_state)				\
+	(_get_ldev_if_state((_device), (_min_state)) ?			\
+	 ({ __acquire(x); true; }) : false)
+#define get_ldev(_device) get_ldev_if_state(_device, D_INCONSISTENT)
+
+static inline void put_ldev(struct drbd_device *device)
+{
+	enum drbd_disk_state disk_state = device->state.disk;
+	/* We must check the state *before* the atomic_dec becomes visible,
+	 * or we have a theoretical race where someone hitting zero,
+	 * while state still D_FAILED, will then see D_DISKLESS in the
+	 * condition below and calling into destroy, where he must not, yet. */
+	int i = atomic_dec_return(&device->local_cnt);
+
+	/* This may be called from some endio handler,
+	 * so we must not sleep here. */
+
+	__release(local);
+	D_ASSERT(device, i >= 0);
+	if (i == 0) {
+		if (disk_state == D_DISKLESS)
+			/* even internal references gone, safe to destroy */
+			drbd_device_post_work(device, DESTROY_DISK);
+		if (disk_state == D_FAILED)
+			/* all application IO references gone. */
+			if (!test_and_set_bit(GOING_DISKLESS, &device->flags))
+				drbd_device_post_work(device, GO_DISKLESS);
+		wake_up(&device->misc_wait);
+	}
+}
+
+#ifndef __CHECKER__
+static inline int _get_ldev_if_state(struct drbd_device *device, enum drbd_disk_state mins)
+{
+	int io_allowed;
+
+	/* never get a reference while D_DISKLESS */
+	if (device->state.disk == D_DISKLESS)
+		return 0;
+
+	atomic_inc(&device->local_cnt);
+	io_allowed = (device->state.disk >= mins);
+	if (!io_allowed)
+		put_ldev(device);
+	return io_allowed;
+}
+#else
+extern int _get_ldev_if_state(struct drbd_device *device, enum drbd_disk_state mins);
+#endif
+
+/* this throttles on-the-fly application requests
+ * according to max_buffers settings;
+ * maybe re-implement using semaphores? */
+static inline int drbd_get_max_buffers(struct drbd_device *device)
+{
+	struct net_conf *nc;
+	int mxb;
+
+	rcu_read_lock();
+	nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+	mxb = nc ? nc->max_buffers : 1000000;  /* arbitrary limit on open requests */
+	rcu_read_unlock();
+
+	return mxb;
+}
+
+static inline int drbd_state_is_stable(struct drbd_device *device)
+{
+	union drbd_dev_state s = device->state;
+
+	/* DO NOT add a default clause, we want the compiler to warn us
+	 * for any newly introduced state we may have forgotten to add here */
+
+	switch ((enum drbd_conns)s.conn) {
+	/* new io only accepted when there is no connection, ... */
+	case C_STANDALONE:
+	case C_WF_CONNECTION:
+	/* ... or there is a well established connection. */
+	case C_CONNECTED:
+	case C_SYNC_SOURCE:
+	case C_SYNC_TARGET:
+	case C_VERIFY_S:
+	case C_VERIFY_T:
+	case C_PAUSED_SYNC_S:
+	case C_PAUSED_SYNC_T:
+	case C_AHEAD:
+	case C_BEHIND:
+		/* transitional states, IO allowed */
+	case C_DISCONNECTING:
+	case C_UNCONNECTED:
+	case C_TIMEOUT:
+	case C_BROKEN_PIPE:
+	case C_NETWORK_FAILURE:
+	case C_PROTOCOL_ERROR:
+	case C_TEAR_DOWN:
+	case C_WF_REPORT_PARAMS:
+	case C_STARTING_SYNC_S:
+	case C_STARTING_SYNC_T:
+		break;
+
+		/* Allow IO in BM exchange states with new protocols */
+	case C_WF_BITMAP_S:
+		if (first_peer_device(device)->connection->agreed_pro_version < 96)
+			return 0;
+		break;
+
+		/* no new io accepted in these states */
+	case C_WF_BITMAP_T:
+	case C_WF_SYNC_UUID:
+	case C_MASK:
+		/* not "stable" */
+		return 0;
+	}
+
+	switch ((enum drbd_disk_state)s.disk) {
+	case D_DISKLESS:
+	case D_INCONSISTENT:
+	case D_OUTDATED:
+	case D_CONSISTENT:
+	case D_UP_TO_DATE:
+	case D_FAILED:
+		/* disk state is stable as well. */
+		break;
+
+	/* no new io accepted during transitional states */
+	case D_ATTACHING:
+	case D_NEGOTIATING:
+	case D_UNKNOWN:
+	case D_MASK:
+		/* not "stable" */
+		return 0;
+	}
+
+	return 1;
+}
+
+static inline int drbd_suspended(struct drbd_device *device)
+{
+	struct drbd_resource *resource = device->resource;
+
+	return resource->susp || resource->susp_fen || resource->susp_nod;
+}
+
+static inline bool may_inc_ap_bio(struct drbd_device *device)
+{
+	int mxb = drbd_get_max_buffers(device);
+
+	if (drbd_suspended(device))
+		return false;
+	if (atomic_read(&device->suspend_cnt))
+		return false;
+
+	/* to avoid potential deadlock or bitmap corruption,
+	 * in various places, we only allow new application io
+	 * to start during "stable" states. */
+
+	/* no new io accepted when attaching or detaching the disk */
+	if (!drbd_state_is_stable(device))
+		return false;
+
+	/* since some older kernels don't have atomic_add_unless,
+	 * and we are within the spinlock anyways, we have this workaround.  */
+	if (atomic_read(&device->ap_bio_cnt) > mxb)
+		return false;
+	if (test_bit(BITMAP_IO, &device->flags))
+		return false;
+	return true;
+}
+
+static inline bool inc_ap_bio_cond(struct drbd_device *device)
+{
+	bool rv = false;
+
+	spin_lock_irq(&device->resource->req_lock);
+	rv = may_inc_ap_bio(device);
+	if (rv)
+		atomic_inc(&device->ap_bio_cnt);
+	spin_unlock_irq(&device->resource->req_lock);
+
+	return rv;
+}
+
+static inline void inc_ap_bio(struct drbd_device *device)
+{
+	/* we wait here
+	 *    as long as the device is suspended
+	 *    until the bitmap is no longer on the fly during connection
+	 *    handshake as long as we would exceed the max_buffer limit.
+	 *
+	 * to avoid races with the reconnect code,
+	 * we need to atomic_inc within the spinlock. */
+
+	wait_event(device->misc_wait, inc_ap_bio_cond(device));
+}
+
+static inline void dec_ap_bio(struct drbd_device *device)
+{
+	int mxb = drbd_get_max_buffers(device);
+	int ap_bio = atomic_dec_return(&device->ap_bio_cnt);
+
+	D_ASSERT(device, ap_bio >= 0);
+
+	if (ap_bio == 0 && test_bit(BITMAP_IO, &device->flags)) {
+		if (!test_and_set_bit(BITMAP_IO_QUEUED, &device->flags))
+			drbd_queue_work(&first_peer_device(device)->
+				connection->sender_work,
+				&device->bm_io_work.w);
+	}
+
+	/* this currently does wake_up for every dec_ap_bio!
+	 * maybe rather introduce some type of hysteresis?
+	 * e.g. (ap_bio == mxb/2 || ap_bio == 0) ? */
+	if (ap_bio < mxb)
+		wake_up(&device->misc_wait);
+}
+
+static inline bool verify_can_do_stop_sector(struct drbd_device *device)
+{
+	return first_peer_device(device)->connection->agreed_pro_version >= 97 &&
+		first_peer_device(device)->connection->agreed_pro_version != 100;
+}
+
+static inline int drbd_set_ed_uuid(struct drbd_device *device, u64 val)
+{
+	int changed = device->ed_uuid != val;
+	device->ed_uuid = val;
+	return changed;
+}
+
+static inline int drbd_queue_order_type(struct drbd_device *device)
+{
+	/* sorry, we currently have no working implementation
+	 * of distributed TCQ stuff */
+#ifndef QUEUE_ORDERED_NONE
+#define QUEUE_ORDERED_NONE 0
+#endif
+	return QUEUE_ORDERED_NONE;
+}
+
+static inline struct drbd_connection *first_connection(struct drbd_resource *resource)
+{
+	return list_first_entry_or_null(&resource->connections,
+				struct drbd_connection, connections);
+}
+
+#endif
diff --git a/drivers/block/drbd/drbd_interval.c b/drivers/block/drbd/drbd_interval.c
new file mode 100644
index 000000000..f07b43783
--- /dev/null
+++ b/drivers/block/drbd/drbd_interval.c
@@ -0,0 +1,159 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <asm/bug.h>
+#include <linux/rbtree_augmented.h>
+#include "drbd_interval.h"
+
+/*
+ * interval_end  -  return end of @node
+ */
+static inline
+sector_t interval_end(struct rb_node *node)
+{
+	struct drbd_interval *this = rb_entry(node, struct drbd_interval, rb);
+	return this->end;
+}
+
+#define NODE_END(node) ((node)->sector + ((node)->size >> 9))
+
+RB_DECLARE_CALLBACKS_MAX(static, augment_callbacks,
+			 struct drbd_interval, rb, sector_t, end, NODE_END);
+
+/*
+ * drbd_insert_interval  -  insert a new interval into a tree
+ */
+bool
+drbd_insert_interval(struct rb_root *root, struct drbd_interval *this)
+{
+	struct rb_node **new = &root->rb_node, *parent = NULL;
+	sector_t this_end = this->sector + (this->size >> 9);
+
+	BUG_ON(!IS_ALIGNED(this->size, 512));
+
+	while (*new) {
+		struct drbd_interval *here =
+			rb_entry(*new, struct drbd_interval, rb);
+
+		parent = *new;
+		if (here->end < this_end)
+			here->end = this_end;
+		if (this->sector < here->sector)
+			new = &(*new)->rb_left;
+		else if (this->sector > here->sector)
+			new = &(*new)->rb_right;
+		else if (this < here)
+			new = &(*new)->rb_left;
+		else if (this > here)
+			new = &(*new)->rb_right;
+		else
+			return false;
+	}
+
+	this->end = this_end;
+	rb_link_node(&this->rb, parent, new);
+	rb_insert_augmented(&this->rb, root, &augment_callbacks);
+	return true;
+}
+
+/**
+ * drbd_contains_interval  -  check if a tree contains a given interval
+ * @root:	red black tree root
+ * @sector:	start sector of @interval
+ * @interval:	may not be a valid pointer
+ *
+ * Returns if the tree contains the node @interval with start sector @start.
+ * Does not dereference @interval until @interval is known to be a valid object
+ * in @tree.  Returns %false if @interval is in the tree but with a different
+ * sector number.
+ */
+bool
+drbd_contains_interval(struct rb_root *root, sector_t sector,
+		       struct drbd_interval *interval)
+{
+	struct rb_node *node = root->rb_node;
+
+	while (node) {
+		struct drbd_interval *here =
+			rb_entry(node, struct drbd_interval, rb);
+
+		if (sector < here->sector)
+			node = node->rb_left;
+		else if (sector > here->sector)
+			node = node->rb_right;
+		else if (interval < here)
+			node = node->rb_left;
+		else if (interval > here)
+			node = node->rb_right;
+		else
+			return true;
+	}
+	return false;
+}
+
+/*
+ * drbd_remove_interval  -  remove an interval from a tree
+ */
+void
+drbd_remove_interval(struct rb_root *root, struct drbd_interval *this)
+{
+	rb_erase_augmented(&this->rb, root, &augment_callbacks);
+}
+
+/**
+ * drbd_find_overlap  - search for an interval overlapping with [sector, sector + size)
+ * @root:	red black tree root
+ * @sector:	start sector
+ * @size:	size, aligned to 512 bytes
+ *
+ * Returns an interval overlapping with [sector, sector + size), or NULL if
+ * there is none.  When there is more than one overlapping interval in the
+ * tree, the interval with the lowest start sector is returned, and all other
+ * overlapping intervals will be on the right side of the tree, reachable with
+ * rb_next().
+ */
+struct drbd_interval *
+drbd_find_overlap(struct rb_root *root, sector_t sector, unsigned int size)
+{
+	struct rb_node *node = root->rb_node;
+	struct drbd_interval *overlap = NULL;
+	sector_t end = sector + (size >> 9);
+
+	BUG_ON(!IS_ALIGNED(size, 512));
+
+	while (node) {
+		struct drbd_interval *here =
+			rb_entry(node, struct drbd_interval, rb);
+
+		if (node->rb_left &&
+		    sector < interval_end(node->rb_left)) {
+			/* Overlap if any must be on left side */
+			node = node->rb_left;
+		} else if (here->sector < end &&
+			   sector < here->sector + (here->size >> 9)) {
+			overlap = here;
+			break;
+		} else if (sector >= here->sector) {
+			/* Overlap if any must be on right side */
+			node = node->rb_right;
+		} else
+			break;
+	}
+	return overlap;
+}
+
+struct drbd_interval *
+drbd_next_overlap(struct drbd_interval *i, sector_t sector, unsigned int size)
+{
+	sector_t end = sector + (size >> 9);
+	struct rb_node *node;
+
+	for (;;) {
+		node = rb_next(&i->rb);
+		if (!node)
+			return NULL;
+		i = rb_entry(node, struct drbd_interval, rb);
+		if (i->sector >= end)
+			return NULL;
+		if (sector < i->sector + (i->size >> 9))
+			return i;
+	}
+}
diff --git a/drivers/block/drbd/drbd_interval.h b/drivers/block/drbd/drbd_interval.h
new file mode 100644
index 000000000..b8c2dee5e
--- /dev/null
+++ b/drivers/block/drbd/drbd_interval.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __DRBD_INTERVAL_H
+#define __DRBD_INTERVAL_H
+
+#include <linux/types.h>
+#include <linux/rbtree.h>
+
+struct drbd_interval {
+	struct rb_node rb;
+	sector_t sector;		/* start sector of the interval */
+	unsigned int size;		/* size in bytes */
+	sector_t end;			/* highest interval end in subtree */
+	unsigned int local:1		/* local or remote request? */;
+	unsigned int waiting:1;		/* someone is waiting for completion */
+	unsigned int completed:1;	/* this has been completed already;
+					 * ignore for conflict detection */
+};
+
+static inline void drbd_clear_interval(struct drbd_interval *i)
+{
+	RB_CLEAR_NODE(&i->rb);
+}
+
+static inline bool drbd_interval_empty(struct drbd_interval *i)
+{
+	return RB_EMPTY_NODE(&i->rb);
+}
+
+extern bool drbd_insert_interval(struct rb_root *, struct drbd_interval *);
+extern bool drbd_contains_interval(struct rb_root *, sector_t,
+				   struct drbd_interval *);
+extern void drbd_remove_interval(struct rb_root *, struct drbd_interval *);
+extern struct drbd_interval *drbd_find_overlap(struct rb_root *, sector_t,
+					unsigned int);
+extern struct drbd_interval *drbd_next_overlap(struct drbd_interval *, sector_t,
+					unsigned int);
+
+#define drbd_for_each_overlap(i, root, sector, size)		\
+	for (i = drbd_find_overlap(root, sector, size);		\
+	     i;							\
+	     i = drbd_next_overlap(i, sector, size))
+
+#endif  /* __DRBD_INTERVAL_H */
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
new file mode 100644
index 000000000..590d1b50a
--- /dev/null
+++ b/drivers/block/drbd/drbd_main.c
@@ -0,0 +1,3803 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+   drbd.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+   Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
+   from Logicworks, Inc. for making SDP replication support possible.
+
+
+ */
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/jiffies.h>
+#include <linux/drbd.h>
+#include <linux/uaccess.h>
+#include <asm/types.h>
+#include <net/sock.h>
+#include <linux/ctype.h>
+#include <linux/mutex.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/proc_fs.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/memcontrol.h>
+#include <linux/mm_inline.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/reboot.h>
+#include <linux/notifier.h>
+#include <linux/kthread.h>
+#include <linux/workqueue.h>
+#define __KERNEL_SYSCALLS__
+#include <linux/unistd.h>
+#include <linux/vmalloc.h>
+#include <linux/sched/signal.h>
+
+#include <linux/drbd_limits.h>
+#include "drbd_int.h"
+#include "drbd_protocol.h"
+#include "drbd_req.h" /* only for _req_mod in tl_release and tl_clear */
+#include "drbd_vli.h"
+#include "drbd_debugfs.h"
+
+static DEFINE_MUTEX(drbd_main_mutex);
+static int drbd_open(struct block_device *bdev, fmode_t mode);
+static void drbd_release(struct gendisk *gd, fmode_t mode);
+static void md_sync_timer_fn(struct timer_list *t);
+static int w_bitmap_io(struct drbd_work *w, int unused);
+
+MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
+	      "Lars Ellenberg <lars@linbit.com>");
+MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
+MODULE_VERSION(REL_VERSION);
+MODULE_LICENSE("GPL");
+MODULE_PARM_DESC(minor_count, "Approximate number of drbd devices ("
+		 __stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")");
+MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);
+
+#include <linux/moduleparam.h>
+/* thanks to these macros, if compiled into the kernel (not-module),
+ * these become boot parameters (e.g., drbd.minor_count) */
+
+#ifdef CONFIG_DRBD_FAULT_INJECTION
+int drbd_enable_faults;
+int drbd_fault_rate;
+static int drbd_fault_count;
+static int drbd_fault_devs;
+/* bitmap of enabled faults */
+module_param_named(enable_faults, drbd_enable_faults, int, 0664);
+/* fault rate % value - applies to all enabled faults */
+module_param_named(fault_rate, drbd_fault_rate, int, 0664);
+/* count of faults inserted */
+module_param_named(fault_count, drbd_fault_count, int, 0664);
+/* bitmap of devices to insert faults on */
+module_param_named(fault_devs, drbd_fault_devs, int, 0644);
+#endif
+
+/* module parameters we can keep static */
+static bool drbd_allow_oos; /* allow_open_on_secondary */
+static bool drbd_disable_sendpage;
+MODULE_PARM_DESC(allow_oos, "DONT USE!");
+module_param_named(allow_oos, drbd_allow_oos, bool, 0);
+module_param_named(disable_sendpage, drbd_disable_sendpage, bool, 0644);
+
+/* module parameters we share */
+int drbd_proc_details; /* Detail level in proc drbd*/
+module_param_named(proc_details, drbd_proc_details, int, 0644);
+/* module parameters shared with defaults */
+unsigned int drbd_minor_count = DRBD_MINOR_COUNT_DEF;
+/* Module parameter for setting the user mode helper program
+ * to run. Default is /sbin/drbdadm */
+char drbd_usermode_helper[80] = "/sbin/drbdadm";
+module_param_named(minor_count, drbd_minor_count, uint, 0444);
+module_param_string(usermode_helper, drbd_usermode_helper, sizeof(drbd_usermode_helper), 0644);
+
+/* in 2.6.x, our device mapping and config info contains our virtual gendisks
+ * as member "struct gendisk *vdisk;"
+ */
+struct idr drbd_devices;
+struct list_head drbd_resources;
+struct mutex resources_mutex;
+
+struct kmem_cache *drbd_request_cache;
+struct kmem_cache *drbd_ee_cache;	/* peer requests */
+struct kmem_cache *drbd_bm_ext_cache;	/* bitmap extents */
+struct kmem_cache *drbd_al_ext_cache;	/* activity log extents */
+mempool_t drbd_request_mempool;
+mempool_t drbd_ee_mempool;
+mempool_t drbd_md_io_page_pool;
+struct bio_set drbd_md_io_bio_set;
+struct bio_set drbd_io_bio_set;
+
+/* I do not use a standard mempool, because:
+   1) I want to hand out the pre-allocated objects first.
+   2) I want to be able to interrupt sleeping allocation with a signal.
+   Note: This is a single linked list, the next pointer is the private
+	 member of struct page.
+ */
+struct page *drbd_pp_pool;
+DEFINE_SPINLOCK(drbd_pp_lock);
+int          drbd_pp_vacant;
+wait_queue_head_t drbd_pp_wait;
+
+DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);
+
+static const struct block_device_operations drbd_ops = {
+	.owner		= THIS_MODULE,
+	.submit_bio	= drbd_submit_bio,
+	.open		= drbd_open,
+	.release	= drbd_release,
+};
+
+#ifdef __CHECKER__
+/* When checking with sparse, and this is an inline function, sparse will
+   give tons of false positives. When this is a real functions sparse works.
+ */
+int _get_ldev_if_state(struct drbd_device *device, enum drbd_disk_state mins)
+{
+	int io_allowed;
+
+	atomic_inc(&device->local_cnt);
+	io_allowed = (device->state.disk >= mins);
+	if (!io_allowed) {
+		if (atomic_dec_and_test(&device->local_cnt))
+			wake_up(&device->misc_wait);
+	}
+	return io_allowed;
+}
+
+#endif
+
+/**
+ * tl_release() - mark as BARRIER_ACKED all requests in the corresponding transfer log epoch
+ * @connection:	DRBD connection.
+ * @barrier_nr:	Expected identifier of the DRBD write barrier packet.
+ * @set_size:	Expected number of requests before that barrier.
+ *
+ * In case the passed barrier_nr or set_size does not match the oldest
+ * epoch of not yet barrier-acked requests, this function will cause a
+ * termination of the connection.
+ */
+void tl_release(struct drbd_connection *connection, unsigned int barrier_nr,
+		unsigned int set_size)
+{
+	struct drbd_request *r;
+	struct drbd_request *req = NULL, *tmp = NULL;
+	int expect_epoch = 0;
+	int expect_size = 0;
+
+	spin_lock_irq(&connection->resource->req_lock);
+
+	/* find oldest not yet barrier-acked write request,
+	 * count writes in its epoch. */
+	list_for_each_entry(r, &connection->transfer_log, tl_requests) {
+		const unsigned s = r->rq_state;
+		if (!req) {
+			if (!(s & RQ_WRITE))
+				continue;
+			if (!(s & RQ_NET_MASK))
+				continue;
+			if (s & RQ_NET_DONE)
+				continue;
+			req = r;
+			expect_epoch = req->epoch;
+			expect_size ++;
+		} else {
+			if (r->epoch != expect_epoch)
+				break;
+			if (!(s & RQ_WRITE))
+				continue;
+			/* if (s & RQ_DONE): not expected */
+			/* if (!(s & RQ_NET_MASK)): not expected */
+			expect_size++;
+		}
+	}
+
+	/* first some paranoia code */
+	if (req == NULL) {
+		drbd_err(connection, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
+			 barrier_nr);
+		goto bail;
+	}
+	if (expect_epoch != barrier_nr) {
+		drbd_err(connection, "BAD! BarrierAck #%u received, expected #%u!\n",
+			 barrier_nr, expect_epoch);
+		goto bail;
+	}
+
+	if (expect_size != set_size) {
+		drbd_err(connection, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
+			 barrier_nr, set_size, expect_size);
+		goto bail;
+	}
+
+	/* Clean up list of requests processed during current epoch. */
+	/* this extra list walk restart is paranoia,
+	 * to catch requests being barrier-acked "unexpectedly".
+	 * It usually should find the same req again, or some READ preceding it. */
+	list_for_each_entry(req, &connection->transfer_log, tl_requests)
+		if (req->epoch == expect_epoch) {
+			tmp = req;
+			break;
+		}
+	req = list_prepare_entry(tmp, &connection->transfer_log, tl_requests);
+	list_for_each_entry_safe_from(req, r, &connection->transfer_log, tl_requests) {
+		if (req->epoch != expect_epoch)
+			break;
+		_req_mod(req, BARRIER_ACKED);
+	}
+	spin_unlock_irq(&connection->resource->req_lock);
+
+	return;
+
+bail:
+	spin_unlock_irq(&connection->resource->req_lock);
+	conn_request_state(connection, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
+}
+
+
+/**
+ * _tl_restart() - Walks the transfer log, and applies an action to all requests
+ * @connection:	DRBD connection to operate on.
+ * @what:       The action/event to perform with all request objects
+ *
+ * @what might be one of CONNECTION_LOST_WHILE_PENDING, RESEND, FAIL_FROZEN_DISK_IO,
+ * RESTART_FROZEN_DISK_IO.
+ */
+/* must hold resource->req_lock */
+void _tl_restart(struct drbd_connection *connection, enum drbd_req_event what)
+{
+	struct drbd_request *req, *r;
+
+	list_for_each_entry_safe(req, r, &connection->transfer_log, tl_requests)
+		_req_mod(req, what);
+}
+
+void tl_restart(struct drbd_connection *connection, enum drbd_req_event what)
+{
+	spin_lock_irq(&connection->resource->req_lock);
+	_tl_restart(connection, what);
+	spin_unlock_irq(&connection->resource->req_lock);
+}
+
+/**
+ * tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
+ * @connection:	DRBD connection.
+ *
+ * This is called after the connection to the peer was lost. The storage covered
+ * by the requests on the transfer gets marked as our of sync. Called from the
+ * receiver thread and the worker thread.
+ */
+void tl_clear(struct drbd_connection *connection)
+{
+	tl_restart(connection, CONNECTION_LOST_WHILE_PENDING);
+}
+
+/**
+ * tl_abort_disk_io() - Abort disk I/O for all requests for a certain device in the TL
+ * @device:	DRBD device.
+ */
+void tl_abort_disk_io(struct drbd_device *device)
+{
+	struct drbd_connection *connection = first_peer_device(device)->connection;
+	struct drbd_request *req, *r;
+
+	spin_lock_irq(&connection->resource->req_lock);
+	list_for_each_entry_safe(req, r, &connection->transfer_log, tl_requests) {
+		if (!(req->rq_state & RQ_LOCAL_PENDING))
+			continue;
+		if (req->device != device)
+			continue;
+		_req_mod(req, ABORT_DISK_IO);
+	}
+	spin_unlock_irq(&connection->resource->req_lock);
+}
+
+static int drbd_thread_setup(void *arg)
+{
+	struct drbd_thread *thi = (struct drbd_thread *) arg;
+	struct drbd_resource *resource = thi->resource;
+	unsigned long flags;
+	int retval;
+
+	snprintf(current->comm, sizeof(current->comm), "drbd_%c_%s",
+		 thi->name[0],
+		 resource->name);
+
+	allow_kernel_signal(DRBD_SIGKILL);
+	allow_kernel_signal(SIGXCPU);
+restart:
+	retval = thi->function(thi);
+
+	spin_lock_irqsave(&thi->t_lock, flags);
+
+	/* if the receiver has been "EXITING", the last thing it did
+	 * was set the conn state to "StandAlone",
+	 * if now a re-connect request comes in, conn state goes C_UNCONNECTED,
+	 * and receiver thread will be "started".
+	 * drbd_thread_start needs to set "RESTARTING" in that case.
+	 * t_state check and assignment needs to be within the same spinlock,
+	 * so either thread_start sees EXITING, and can remap to RESTARTING,
+	 * or thread_start see NONE, and can proceed as normal.
+	 */
+
+	if (thi->t_state == RESTARTING) {
+		drbd_info(resource, "Restarting %s thread\n", thi->name);
+		thi->t_state = RUNNING;
+		spin_unlock_irqrestore(&thi->t_lock, flags);
+		goto restart;
+	}
+
+	thi->task = NULL;
+	thi->t_state = NONE;
+	smp_mb();
+	complete_all(&thi->stop);
+	spin_unlock_irqrestore(&thi->t_lock, flags);
+
+	drbd_info(resource, "Terminating %s\n", current->comm);
+
+	/* Release mod reference taken when thread was started */
+
+	if (thi->connection)
+		kref_put(&thi->connection->kref, drbd_destroy_connection);
+	kref_put(&resource->kref, drbd_destroy_resource);
+	module_put(THIS_MODULE);
+	return retval;
+}
+
+static void drbd_thread_init(struct drbd_resource *resource, struct drbd_thread *thi,
+			     int (*func) (struct drbd_thread *), const char *name)
+{
+	spin_lock_init(&thi->t_lock);
+	thi->task    = NULL;
+	thi->t_state = NONE;
+	thi->function = func;
+	thi->resource = resource;
+	thi->connection = NULL;
+	thi->name = name;
+}
+
+int drbd_thread_start(struct drbd_thread *thi)
+{
+	struct drbd_resource *resource = thi->resource;
+	struct task_struct *nt;
+	unsigned long flags;
+
+	/* is used from state engine doing drbd_thread_stop_nowait,
+	 * while holding the req lock irqsave */
+	spin_lock_irqsave(&thi->t_lock, flags);
+
+	switch (thi->t_state) {
+	case NONE:
+		drbd_info(resource, "Starting %s thread (from %s [%d])\n",
+			 thi->name, current->comm, current->pid);
+
+		/* Get ref on module for thread - this is released when thread exits */
+		if (!try_module_get(THIS_MODULE)) {
+			drbd_err(resource, "Failed to get module reference in drbd_thread_start\n");
+			spin_unlock_irqrestore(&thi->t_lock, flags);
+			return false;
+		}
+
+		kref_get(&resource->kref);
+		if (thi->connection)
+			kref_get(&thi->connection->kref);
+
+		init_completion(&thi->stop);
+		thi->reset_cpu_mask = 1;
+		thi->t_state = RUNNING;
+		spin_unlock_irqrestore(&thi->t_lock, flags);
+		flush_signals(current); /* otherw. may get -ERESTARTNOINTR */
+
+		nt = kthread_create(drbd_thread_setup, (void *) thi,
+				    "drbd_%c_%s", thi->name[0], thi->resource->name);
+
+		if (IS_ERR(nt)) {
+			drbd_err(resource, "Couldn't start thread\n");
+
+			if (thi->connection)
+				kref_put(&thi->connection->kref, drbd_destroy_connection);
+			kref_put(&resource->kref, drbd_destroy_resource);
+			module_put(THIS_MODULE);
+			return false;
+		}
+		spin_lock_irqsave(&thi->t_lock, flags);
+		thi->task = nt;
+		thi->t_state = RUNNING;
+		spin_unlock_irqrestore(&thi->t_lock, flags);
+		wake_up_process(nt);
+		break;
+	case EXITING:
+		thi->t_state = RESTARTING;
+		drbd_info(resource, "Restarting %s thread (from %s [%d])\n",
+				thi->name, current->comm, current->pid);
+		fallthrough;
+	case RUNNING:
+	case RESTARTING:
+	default:
+		spin_unlock_irqrestore(&thi->t_lock, flags);
+		break;
+	}
+
+	return true;
+}
+
+
+void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait)
+{
+	unsigned long flags;
+
+	enum drbd_thread_state ns = restart ? RESTARTING : EXITING;
+
+	/* may be called from state engine, holding the req lock irqsave */
+	spin_lock_irqsave(&thi->t_lock, flags);
+
+	if (thi->t_state == NONE) {
+		spin_unlock_irqrestore(&thi->t_lock, flags);
+		if (restart)
+			drbd_thread_start(thi);
+		return;
+	}
+
+	if (thi->t_state != ns) {
+		if (thi->task == NULL) {
+			spin_unlock_irqrestore(&thi->t_lock, flags);
+			return;
+		}
+
+		thi->t_state = ns;
+		smp_mb();
+		init_completion(&thi->stop);
+		if (thi->task != current)
+			send_sig(DRBD_SIGKILL, thi->task, 1);
+	}
+
+	spin_unlock_irqrestore(&thi->t_lock, flags);
+
+	if (wait)
+		wait_for_completion(&thi->stop);
+}
+
+int conn_lowest_minor(struct drbd_connection *connection)
+{
+	struct drbd_peer_device *peer_device;
+	int vnr = 0, minor = -1;
+
+	rcu_read_lock();
+	peer_device = idr_get_next(&connection->peer_devices, &vnr);
+	if (peer_device)
+		minor = device_to_minor(peer_device->device);
+	rcu_read_unlock();
+
+	return minor;
+}
+
+#ifdef CONFIG_SMP
+/*
+ * drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
+ *
+ * Forces all threads of a resource onto the same CPU. This is beneficial for
+ * DRBD's performance. May be overwritten by user's configuration.
+ */
+static void drbd_calc_cpu_mask(cpumask_var_t *cpu_mask)
+{
+	unsigned int *resources_per_cpu, min_index = ~0;
+
+	resources_per_cpu = kcalloc(nr_cpu_ids, sizeof(*resources_per_cpu),
+				    GFP_KERNEL);
+	if (resources_per_cpu) {
+		struct drbd_resource *resource;
+		unsigned int cpu, min = ~0;
+
+		rcu_read_lock();
+		for_each_resource_rcu(resource, &drbd_resources) {
+			for_each_cpu(cpu, resource->cpu_mask)
+				resources_per_cpu[cpu]++;
+		}
+		rcu_read_unlock();
+		for_each_online_cpu(cpu) {
+			if (resources_per_cpu[cpu] < min) {
+				min = resources_per_cpu[cpu];
+				min_index = cpu;
+			}
+		}
+		kfree(resources_per_cpu);
+	}
+	if (min_index == ~0) {
+		cpumask_setall(*cpu_mask);
+		return;
+	}
+	cpumask_set_cpu(min_index, *cpu_mask);
+}
+
+/**
+ * drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
+ * @thi:	drbd_thread object
+ *
+ * call in the "main loop" of _all_ threads, no need for any mutex, current won't die
+ * prematurely.
+ */
+void drbd_thread_current_set_cpu(struct drbd_thread *thi)
+{
+	struct drbd_resource *resource = thi->resource;
+	struct task_struct *p = current;
+
+	if (!thi->reset_cpu_mask)
+		return;
+	thi->reset_cpu_mask = 0;
+	set_cpus_allowed_ptr(p, resource->cpu_mask);
+}
+#else
+#define drbd_calc_cpu_mask(A) ({})
+#endif
+
+/*
+ * drbd_header_size  -  size of a packet header
+ *
+ * The header size is a multiple of 8, so any payload following the header is
+ * word aligned on 64-bit architectures.  (The bitmap send and receive code
+ * relies on this.)
+ */
+unsigned int drbd_header_size(struct drbd_connection *connection)
+{
+	if (connection->agreed_pro_version >= 100) {
+		BUILD_BUG_ON(!IS_ALIGNED(sizeof(struct p_header100), 8));
+		return sizeof(struct p_header100);
+	} else {
+		BUILD_BUG_ON(sizeof(struct p_header80) !=
+			     sizeof(struct p_header95));
+		BUILD_BUG_ON(!IS_ALIGNED(sizeof(struct p_header80), 8));
+		return sizeof(struct p_header80);
+	}
+}
+
+static unsigned int prepare_header80(struct p_header80 *h, enum drbd_packet cmd, int size)
+{
+	h->magic   = cpu_to_be32(DRBD_MAGIC);
+	h->command = cpu_to_be16(cmd);
+	h->length  = cpu_to_be16(size);
+	return sizeof(struct p_header80);
+}
+
+static unsigned int prepare_header95(struct p_header95 *h, enum drbd_packet cmd, int size)
+{
+	h->magic   = cpu_to_be16(DRBD_MAGIC_BIG);
+	h->command = cpu_to_be16(cmd);
+	h->length = cpu_to_be32(size);
+	return sizeof(struct p_header95);
+}
+
+static unsigned int prepare_header100(struct p_header100 *h, enum drbd_packet cmd,
+				      int size, int vnr)
+{
+	h->magic = cpu_to_be32(DRBD_MAGIC_100);
+	h->volume = cpu_to_be16(vnr);
+	h->command = cpu_to_be16(cmd);
+	h->length = cpu_to_be32(size);
+	h->pad = 0;
+	return sizeof(struct p_header100);
+}
+
+static unsigned int prepare_header(struct drbd_connection *connection, int vnr,
+				   void *buffer, enum drbd_packet cmd, int size)
+{
+	if (connection->agreed_pro_version >= 100)
+		return prepare_header100(buffer, cmd, size, vnr);
+	else if (connection->agreed_pro_version >= 95 &&
+		 size > DRBD_MAX_SIZE_H80_PACKET)
+		return prepare_header95(buffer, cmd, size);
+	else
+		return prepare_header80(buffer, cmd, size);
+}
+
+static void *__conn_prepare_command(struct drbd_connection *connection,
+				    struct drbd_socket *sock)
+{
+	if (!sock->socket)
+		return NULL;
+	return sock->sbuf + drbd_header_size(connection);
+}
+
+void *conn_prepare_command(struct drbd_connection *connection, struct drbd_socket *sock)
+{
+	void *p;
+
+	mutex_lock(&sock->mutex);
+	p = __conn_prepare_command(connection, sock);
+	if (!p)
+		mutex_unlock(&sock->mutex);
+
+	return p;
+}
+
+void *drbd_prepare_command(struct drbd_peer_device *peer_device, struct drbd_socket *sock)
+{
+	return conn_prepare_command(peer_device->connection, sock);
+}
+
+static int __send_command(struct drbd_connection *connection, int vnr,
+			  struct drbd_socket *sock, enum drbd_packet cmd,
+			  unsigned int header_size, void *data,
+			  unsigned int size)
+{
+	int msg_flags;
+	int err;
+
+	/*
+	 * Called with @data == NULL and the size of the data blocks in @size
+	 * for commands that send data blocks.  For those commands, omit the
+	 * MSG_MORE flag: this will increase the likelihood that data blocks
+	 * which are page aligned on the sender will end up page aligned on the
+	 * receiver.
+	 */
+	msg_flags = data ? MSG_MORE : 0;
+
+	header_size += prepare_header(connection, vnr, sock->sbuf, cmd,
+				      header_size + size);
+	err = drbd_send_all(connection, sock->socket, sock->sbuf, header_size,
+			    msg_flags);
+	if (data && !err)
+		err = drbd_send_all(connection, sock->socket, data, size, 0);
+	/* DRBD protocol "pings" are latency critical.
+	 * This is supposed to trigger tcp_push_pending_frames() */
+	if (!err && (cmd == P_PING || cmd == P_PING_ACK))
+		tcp_sock_set_nodelay(sock->socket->sk);
+
+	return err;
+}
+
+static int __conn_send_command(struct drbd_connection *connection, struct drbd_socket *sock,
+			       enum drbd_packet cmd, unsigned int header_size,
+			       void *data, unsigned int size)
+{
+	return __send_command(connection, 0, sock, cmd, header_size, data, size);
+}
+
+int conn_send_command(struct drbd_connection *connection, struct drbd_socket *sock,
+		      enum drbd_packet cmd, unsigned int header_size,
+		      void *data, unsigned int size)
+{
+	int err;
+
+	err = __conn_send_command(connection, sock, cmd, header_size, data, size);
+	mutex_unlock(&sock->mutex);
+	return err;
+}
+
+int drbd_send_command(struct drbd_peer_device *peer_device, struct drbd_socket *sock,
+		      enum drbd_packet cmd, unsigned int header_size,
+		      void *data, unsigned int size)
+{
+	int err;
+
+	err = __send_command(peer_device->connection, peer_device->device->vnr,
+			     sock, cmd, header_size, data, size);
+	mutex_unlock(&sock->mutex);
+	return err;
+}
+
+int drbd_send_ping(struct drbd_connection *connection)
+{
+	struct drbd_socket *sock;
+
+	sock = &connection->meta;
+	if (!conn_prepare_command(connection, sock))
+		return -EIO;
+	return conn_send_command(connection, sock, P_PING, 0, NULL, 0);
+}
+
+int drbd_send_ping_ack(struct drbd_connection *connection)
+{
+	struct drbd_socket *sock;
+
+	sock = &connection->meta;
+	if (!conn_prepare_command(connection, sock))
+		return -EIO;
+	return conn_send_command(connection, sock, P_PING_ACK, 0, NULL, 0);
+}
+
+int drbd_send_sync_param(struct drbd_peer_device *peer_device)
+{
+	struct drbd_socket *sock;
+	struct p_rs_param_95 *p;
+	int size;
+	const int apv = peer_device->connection->agreed_pro_version;
+	enum drbd_packet cmd;
+	struct net_conf *nc;
+	struct disk_conf *dc;
+
+	sock = &peer_device->connection->data;
+	p = drbd_prepare_command(peer_device, sock);
+	if (!p)
+		return -EIO;
+
+	rcu_read_lock();
+	nc = rcu_dereference(peer_device->connection->net_conf);
+
+	size = apv <= 87 ? sizeof(struct p_rs_param)
+		: apv == 88 ? sizeof(struct p_rs_param)
+			+ strlen(nc->verify_alg) + 1
+		: apv <= 94 ? sizeof(struct p_rs_param_89)
+		: /* apv >= 95 */ sizeof(struct p_rs_param_95);
+
+	cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
+
+	/* initialize verify_alg and csums_alg */
+	BUILD_BUG_ON(sizeof(p->algs) != 2 * SHARED_SECRET_MAX);
+	memset(&p->algs, 0, sizeof(p->algs));
+
+	if (get_ldev(peer_device->device)) {
+		dc = rcu_dereference(peer_device->device->ldev->disk_conf);
+		p->resync_rate = cpu_to_be32(dc->resync_rate);
+		p->c_plan_ahead = cpu_to_be32(dc->c_plan_ahead);
+		p->c_delay_target = cpu_to_be32(dc->c_delay_target);
+		p->c_fill_target = cpu_to_be32(dc->c_fill_target);
+		p->c_max_rate = cpu_to_be32(dc->c_max_rate);
+		put_ldev(peer_device->device);
+	} else {
+		p->resync_rate = cpu_to_be32(DRBD_RESYNC_RATE_DEF);
+		p->c_plan_ahead = cpu_to_be32(DRBD_C_PLAN_AHEAD_DEF);
+		p->c_delay_target = cpu_to_be32(DRBD_C_DELAY_TARGET_DEF);
+		p->c_fill_target = cpu_to_be32(DRBD_C_FILL_TARGET_DEF);
+		p->c_max_rate = cpu_to_be32(DRBD_C_MAX_RATE_DEF);
+	}
+
+	if (apv >= 88)
+		strcpy(p->verify_alg, nc->verify_alg);
+	if (apv >= 89)
+		strcpy(p->csums_alg, nc->csums_alg);
+	rcu_read_unlock();
+
+	return drbd_send_command(peer_device, sock, cmd, size, NULL, 0);
+}
+
+int __drbd_send_protocol(struct drbd_connection *connection, enum drbd_packet cmd)
+{
+	struct drbd_socket *sock;
+	struct p_protocol *p;
+	struct net_conf *nc;
+	int size, cf;
+
+	sock = &connection->data;
+	p = __conn_prepare_command(connection, sock);
+	if (!p)
+		return -EIO;
+
+	rcu_read_lock();
+	nc = rcu_dereference(connection->net_conf);
+
+	if (nc->tentative && connection->agreed_pro_version < 92) {
+		rcu_read_unlock();
+		drbd_err(connection, "--dry-run is not supported by peer");
+		return -EOPNOTSUPP;
+	}
+
+	size = sizeof(*p);
+	if (connection->agreed_pro_version >= 87)
+		size += strlen(nc->integrity_alg) + 1;
+
+	p->protocol      = cpu_to_be32(nc->wire_protocol);
+	p->after_sb_0p   = cpu_to_be32(nc->after_sb_0p);
+	p->after_sb_1p   = cpu_to_be32(nc->after_sb_1p);
+	p->after_sb_2p   = cpu_to_be32(nc->after_sb_2p);
+	p->two_primaries = cpu_to_be32(nc->two_primaries);
+	cf = 0;
+	if (nc->discard_my_data)
+		cf |= CF_DISCARD_MY_DATA;
+	if (nc->tentative)
+		cf |= CF_DRY_RUN;
+	p->conn_flags    = cpu_to_be32(cf);
+
+	if (connection->agreed_pro_version >= 87)
+		strcpy(p->integrity_alg, nc->integrity_alg);
+	rcu_read_unlock();
+
+	return __conn_send_command(connection, sock, cmd, size, NULL, 0);
+}
+
+int drbd_send_protocol(struct drbd_connection *connection)
+{
+	int err;
+
+	mutex_lock(&connection->data.mutex);
+	err = __drbd_send_protocol(connection, P_PROTOCOL);
+	mutex_unlock(&connection->data.mutex);
+
+	return err;
+}
+
+static int _drbd_send_uuids(struct drbd_peer_device *peer_device, u64 uuid_flags)
+{
+	struct drbd_device *device = peer_device->device;
+	struct drbd_socket *sock;
+	struct p_uuids *p;
+	int i;
+
+	if (!get_ldev_if_state(device, D_NEGOTIATING))
+		return 0;
+
+	sock = &peer_device->connection->data;
+	p = drbd_prepare_command(peer_device, sock);
+	if (!p) {
+		put_ldev(device);
+		return -EIO;
+	}
+	spin_lock_irq(&device->ldev->md.uuid_lock);
+	for (i = UI_CURRENT; i < UI_SIZE; i++)
+		p->uuid[i] = cpu_to_be64(device->ldev->md.uuid[i]);
+	spin_unlock_irq(&device->ldev->md.uuid_lock);
+
+	device->comm_bm_set = drbd_bm_total_weight(device);
+	p->uuid[UI_SIZE] = cpu_to_be64(device->comm_bm_set);
+	rcu_read_lock();
+	uuid_flags |= rcu_dereference(peer_device->connection->net_conf)->discard_my_data ? 1 : 0;
+	rcu_read_unlock();
+	uuid_flags |= test_bit(CRASHED_PRIMARY, &device->flags) ? 2 : 0;
+	uuid_flags |= device->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
+	p->uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
+
+	put_ldev(device);
+	return drbd_send_command(peer_device, sock, P_UUIDS, sizeof(*p), NULL, 0);
+}
+
+int drbd_send_uuids(struct drbd_peer_device *peer_device)
+{
+	return _drbd_send_uuids(peer_device, 0);
+}
+
+int drbd_send_uuids_skip_initial_sync(struct drbd_peer_device *peer_device)
+{
+	return _drbd_send_uuids(peer_device, 8);
+}
+
+void drbd_print_uuids(struct drbd_device *device, const char *text)
+{
+	if (get_ldev_if_state(device, D_NEGOTIATING)) {
+		u64 *uuid = device->ldev->md.uuid;
+		drbd_info(device, "%s %016llX:%016llX:%016llX:%016llX\n",
+		     text,
+		     (unsigned long long)uuid[UI_CURRENT],
+		     (unsigned long long)uuid[UI_BITMAP],
+		     (unsigned long long)uuid[UI_HISTORY_START],
+		     (unsigned long long)uuid[UI_HISTORY_END]);
+		put_ldev(device);
+	} else {
+		drbd_info(device, "%s effective data uuid: %016llX\n",
+				text,
+				(unsigned long long)device->ed_uuid);
+	}
+}
+
+void drbd_gen_and_send_sync_uuid(struct drbd_peer_device *peer_device)
+{
+	struct drbd_device *device = peer_device->device;
+	struct drbd_socket *sock;
+	struct p_rs_uuid *p;
+	u64 uuid;
+
+	D_ASSERT(device, device->state.disk == D_UP_TO_DATE);
+
+	uuid = device->ldev->md.uuid[UI_BITMAP];
+	if (uuid && uuid != UUID_JUST_CREATED)
+		uuid = uuid + UUID_NEW_BM_OFFSET;
+	else
+		get_random_bytes(&uuid, sizeof(u64));
+	drbd_uuid_set(device, UI_BITMAP, uuid);
+	drbd_print_uuids(device, "updated sync UUID");
+	drbd_md_sync(device);
+
+	sock = &peer_device->connection->data;
+	p = drbd_prepare_command(peer_device, sock);
+	if (p) {
+		p->uuid = cpu_to_be64(uuid);
+		drbd_send_command(peer_device, sock, P_SYNC_UUID, sizeof(*p), NULL, 0);
+	}
+}
+
+int drbd_send_sizes(struct drbd_peer_device *peer_device, int trigger_reply, enum dds_flags flags)
+{
+	struct drbd_device *device = peer_device->device;
+	struct drbd_socket *sock;
+	struct p_sizes *p;
+	sector_t d_size, u_size;
+	int q_order_type;
+	unsigned int max_bio_size;
+	unsigned int packet_size;
+
+	sock = &peer_device->connection->data;
+	p = drbd_prepare_command(peer_device, sock);
+	if (!p)
+		return -EIO;
+
+	packet_size = sizeof(*p);
+	if (peer_device->connection->agreed_features & DRBD_FF_WSAME)
+		packet_size += sizeof(p->qlim[0]);
+
+	memset(p, 0, packet_size);
+	if (get_ldev_if_state(device, D_NEGOTIATING)) {
+		struct block_device *bdev = device->ldev->backing_bdev;
+		struct request_queue *q = bdev_get_queue(bdev);
+
+		d_size = drbd_get_max_capacity(device->ldev);
+		rcu_read_lock();
+		u_size = rcu_dereference(device->ldev->disk_conf)->disk_size;
+		rcu_read_unlock();
+		q_order_type = drbd_queue_order_type(device);
+		max_bio_size = queue_max_hw_sectors(q) << 9;
+		max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE);
+		p->qlim->physical_block_size =
+			cpu_to_be32(bdev_physical_block_size(bdev));
+		p->qlim->logical_block_size =
+			cpu_to_be32(bdev_logical_block_size(bdev));
+		p->qlim->alignment_offset =
+			cpu_to_be32(bdev_alignment_offset(bdev));
+		p->qlim->io_min = cpu_to_be32(bdev_io_min(bdev));
+		p->qlim->io_opt = cpu_to_be32(bdev_io_opt(bdev));
+		p->qlim->discard_enabled = !!bdev_max_discard_sectors(bdev);
+		put_ldev(device);
+	} else {
+		struct request_queue *q = device->rq_queue;
+
+		p->qlim->physical_block_size =
+			cpu_to_be32(queue_physical_block_size(q));
+		p->qlim->logical_block_size =
+			cpu_to_be32(queue_logical_block_size(q));
+		p->qlim->alignment_offset = 0;
+		p->qlim->io_min = cpu_to_be32(queue_io_min(q));
+		p->qlim->io_opt = cpu_to_be32(queue_io_opt(q));
+		p->qlim->discard_enabled = 0;
+
+		d_size = 0;
+		u_size = 0;
+		q_order_type = QUEUE_ORDERED_NONE;
+		max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
+	}
+
+	if (peer_device->connection->agreed_pro_version <= 94)
+		max_bio_size = min(max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
+	else if (peer_device->connection->agreed_pro_version < 100)
+		max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE_P95);
+
+	p->d_size = cpu_to_be64(d_size);
+	p->u_size = cpu_to_be64(u_size);
+	if (trigger_reply)
+		p->c_size = 0;
+	else
+		p->c_size = cpu_to_be64(get_capacity(device->vdisk));
+	p->max_bio_size = cpu_to_be32(max_bio_size);
+	p->queue_order_type = cpu_to_be16(q_order_type);
+	p->dds_flags = cpu_to_be16(flags);
+
+	return drbd_send_command(peer_device, sock, P_SIZES, packet_size, NULL, 0);
+}
+
+/**
+ * drbd_send_current_state() - Sends the drbd state to the peer
+ * @peer_device:	DRBD peer device.
+ */
+int drbd_send_current_state(struct drbd_peer_device *peer_device)
+{
+	struct drbd_socket *sock;
+	struct p_state *p;
+
+	sock = &peer_device->connection->data;
+	p = drbd_prepare_command(peer_device, sock);
+	if (!p)
+		return -EIO;
+	p->state = cpu_to_be32(peer_device->device->state.i); /* Within the send mutex */
+	return drbd_send_command(peer_device, sock, P_STATE, sizeof(*p), NULL, 0);
+}
+
+/**
+ * drbd_send_state() - After a state change, sends the new state to the peer
+ * @peer_device:      DRBD peer device.
+ * @state:     the state to send, not necessarily the current state.
+ *
+ * Each state change queues an "after_state_ch" work, which will eventually
+ * send the resulting new state to the peer. If more state changes happen
+ * between queuing and processing of the after_state_ch work, we still
+ * want to send each intermediary state in the order it occurred.
+ */
+int drbd_send_state(struct drbd_peer_device *peer_device, union drbd_state state)
+{
+	struct drbd_socket *sock;
+	struct p_state *p;
+
+	sock = &peer_device->connection->data;
+	p = drbd_prepare_command(peer_device, sock);
+	if (!p)
+		return -EIO;
+	p->state = cpu_to_be32(state.i); /* Within the send mutex */
+	return drbd_send_command(peer_device, sock, P_STATE, sizeof(*p), NULL, 0);
+}
+
+int drbd_send_state_req(struct drbd_peer_device *peer_device, union drbd_state mask, union drbd_state val)
+{
+	struct drbd_socket *sock;
+	struct p_req_state *p;
+
+	sock = &peer_device->connection->data;
+	p = drbd_prepare_command(peer_device, sock);
+	if (!p)
+		return -EIO;
+	p->mask = cpu_to_be32(mask.i);
+	p->val = cpu_to_be32(val.i);
+	return drbd_send_command(peer_device, sock, P_STATE_CHG_REQ, sizeof(*p), NULL, 0);
+}
+
+int conn_send_state_req(struct drbd_connection *connection, union drbd_state mask, union drbd_state val)
+{
+	enum drbd_packet cmd;
+	struct drbd_socket *sock;
+	struct p_req_state *p;
+
+	cmd = connection->agreed_pro_version < 100 ? P_STATE_CHG_REQ : P_CONN_ST_CHG_REQ;
+	sock = &connection->data;
+	p = conn_prepare_command(connection, sock);
+	if (!p)
+		return -EIO;
+	p->mask = cpu_to_be32(mask.i);
+	p->val = cpu_to_be32(val.i);
+	return conn_send_command(connection, sock, cmd, sizeof(*p), NULL, 0);
+}
+
+void drbd_send_sr_reply(struct drbd_peer_device *peer_device, enum drbd_state_rv retcode)
+{
+	struct drbd_socket *sock;
+	struct p_req_state_reply *p;
+
+	sock = &peer_device->connection->meta;
+	p = drbd_prepare_command(peer_device, sock);
+	if (p) {
+		p->retcode = cpu_to_be32(retcode);
+		drbd_send_command(peer_device, sock, P_STATE_CHG_REPLY, sizeof(*p), NULL, 0);
+	}
+}
+
+void conn_send_sr_reply(struct drbd_connection *connection, enum drbd_state_rv retcode)
+{
+	struct drbd_socket *sock;
+	struct p_req_state_reply *p;
+	enum drbd_packet cmd = connection->agreed_pro_version < 100 ? P_STATE_CHG_REPLY : P_CONN_ST_CHG_REPLY;
+
+	sock = &connection->meta;
+	p = conn_prepare_command(connection, sock);
+	if (p) {
+		p->retcode = cpu_to_be32(retcode);
+		conn_send_command(connection, sock, cmd, sizeof(*p), NULL, 0);
+	}
+}
+
+static void dcbp_set_code(struct p_compressed_bm *p, enum drbd_bitmap_code code)
+{
+	BUG_ON(code & ~0xf);
+	p->encoding = (p->encoding & ~0xf) | code;
+}
+
+static void dcbp_set_start(struct p_compressed_bm *p, int set)
+{
+	p->encoding = (p->encoding & ~0x80) | (set ? 0x80 : 0);
+}
+
+static void dcbp_set_pad_bits(struct p_compressed_bm *p, int n)
+{
+	BUG_ON(n & ~0x7);
+	p->encoding = (p->encoding & (~0x7 << 4)) | (n << 4);
+}
+
+static int fill_bitmap_rle_bits(struct drbd_device *device,
+			 struct p_compressed_bm *p,
+			 unsigned int size,
+			 struct bm_xfer_ctx *c)
+{
+	struct bitstream bs;
+	unsigned long plain_bits;
+	unsigned long tmp;
+	unsigned long rl;
+	unsigned len;
+	unsigned toggle;
+	int bits, use_rle;
+
+	/* may we use this feature? */
+	rcu_read_lock();
+	use_rle = rcu_dereference(first_peer_device(device)->connection->net_conf)->use_rle;
+	rcu_read_unlock();
+	if (!use_rle || first_peer_device(device)->connection->agreed_pro_version < 90)
+		return 0;
+
+	if (c->bit_offset >= c->bm_bits)
+		return 0; /* nothing to do. */
+
+	/* use at most thus many bytes */
+	bitstream_init(&bs, p->code, size, 0);
+	memset(p->code, 0, size);
+	/* plain bits covered in this code string */
+	plain_bits = 0;
+
+	/* p->encoding & 0x80 stores whether the first run length is set.
+	 * bit offset is implicit.
+	 * start with toggle == 2 to be able to tell the first iteration */
+	toggle = 2;
+
+	/* see how much plain bits we can stuff into one packet
+	 * using RLE and VLI. */
+	do {
+		tmp = (toggle == 0) ? _drbd_bm_find_next_zero(device, c->bit_offset)
+				    : _drbd_bm_find_next(device, c->bit_offset);
+		if (tmp == -1UL)
+			tmp = c->bm_bits;
+		rl = tmp - c->bit_offset;
+
+		if (toggle == 2) { /* first iteration */
+			if (rl == 0) {
+				/* the first checked bit was set,
+				 * store start value, */
+				dcbp_set_start(p, 1);
+				/* but skip encoding of zero run length */
+				toggle = !toggle;
+				continue;
+			}
+			dcbp_set_start(p, 0);
+		}
+
+		/* paranoia: catch zero runlength.
+		 * can only happen if bitmap is modified while we scan it. */
+		if (rl == 0) {
+			drbd_err(device, "unexpected zero runlength while encoding bitmap "
+			    "t:%u bo:%lu\n", toggle, c->bit_offset);
+			return -1;
+		}
+
+		bits = vli_encode_bits(&bs, rl);
+		if (bits == -ENOBUFS) /* buffer full */
+			break;
+		if (bits <= 0) {
+			drbd_err(device, "error while encoding bitmap: %d\n", bits);
+			return 0;
+		}
+
+		toggle = !toggle;
+		plain_bits += rl;
+		c->bit_offset = tmp;
+	} while (c->bit_offset < c->bm_bits);
+
+	len = bs.cur.b - p->code + !!bs.cur.bit;
+
+	if (plain_bits < (len << 3)) {
+		/* incompressible with this method.
+		 * we need to rewind both word and bit position. */
+		c->bit_offset -= plain_bits;
+		bm_xfer_ctx_bit_to_word_offset(c);
+		c->bit_offset = c->word_offset * BITS_PER_LONG;
+		return 0;
+	}
+
+	/* RLE + VLI was able to compress it just fine.
+	 * update c->word_offset. */
+	bm_xfer_ctx_bit_to_word_offset(c);
+
+	/* store pad_bits */
+	dcbp_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
+
+	return len;
+}
+
+/*
+ * send_bitmap_rle_or_plain
+ *
+ * Return 0 when done, 1 when another iteration is needed, and a negative error
+ * code upon failure.
+ */
+static int
+send_bitmap_rle_or_plain(struct drbd_device *device, struct bm_xfer_ctx *c)
+{
+	struct drbd_socket *sock = &first_peer_device(device)->connection->data;
+	unsigned int header_size = drbd_header_size(first_peer_device(device)->connection);
+	struct p_compressed_bm *p = sock->sbuf + header_size;
+	int len, err;
+
+	len = fill_bitmap_rle_bits(device, p,
+			DRBD_SOCKET_BUFFER_SIZE - header_size - sizeof(*p), c);
+	if (len < 0)
+		return -EIO;
+
+	if (len) {
+		dcbp_set_code(p, RLE_VLI_Bits);
+		err = __send_command(first_peer_device(device)->connection, device->vnr, sock,
+				     P_COMPRESSED_BITMAP, sizeof(*p) + len,
+				     NULL, 0);
+		c->packets[0]++;
+		c->bytes[0] += header_size + sizeof(*p) + len;
+
+		if (c->bit_offset >= c->bm_bits)
+			len = 0; /* DONE */
+	} else {
+		/* was not compressible.
+		 * send a buffer full of plain text bits instead. */
+		unsigned int data_size;
+		unsigned long num_words;
+		unsigned long *p = sock->sbuf + header_size;
+
+		data_size = DRBD_SOCKET_BUFFER_SIZE - header_size;
+		num_words = min_t(size_t, data_size / sizeof(*p),
+				  c->bm_words - c->word_offset);
+		len = num_words * sizeof(*p);
+		if (len)
+			drbd_bm_get_lel(device, c->word_offset, num_words, p);
+		err = __send_command(first_peer_device(device)->connection, device->vnr, sock, P_BITMAP, len, NULL, 0);
+		c->word_offset += num_words;
+		c->bit_offset = c->word_offset * BITS_PER_LONG;
+
+		c->packets[1]++;
+		c->bytes[1] += header_size + len;
+
+		if (c->bit_offset > c->bm_bits)
+			c->bit_offset = c->bm_bits;
+	}
+	if (!err) {
+		if (len == 0) {
+			INFO_bm_xfer_stats(device, "send", c);
+			return 0;
+		} else
+			return 1;
+	}
+	return -EIO;
+}
+
+/* See the comment at receive_bitmap() */
+static int _drbd_send_bitmap(struct drbd_device *device)
+{
+	struct bm_xfer_ctx c;
+	int err;
+
+	if (!expect(device->bitmap))
+		return false;
+
+	if (get_ldev(device)) {
+		if (drbd_md_test_flag(device->ldev, MDF_FULL_SYNC)) {
+			drbd_info(device, "Writing the whole bitmap, MDF_FullSync was set.\n");
+			drbd_bm_set_all(device);
+			if (drbd_bm_write(device)) {
+				/* write_bm did fail! Leave full sync flag set in Meta P_DATA
+				 * but otherwise process as per normal - need to tell other
+				 * side that a full resync is required! */
+				drbd_err(device, "Failed to write bitmap to disk!\n");
+			} else {
+				drbd_md_clear_flag(device, MDF_FULL_SYNC);
+				drbd_md_sync(device);
+			}
+		}
+		put_ldev(device);
+	}
+
+	c = (struct bm_xfer_ctx) {
+		.bm_bits = drbd_bm_bits(device),
+		.bm_words = drbd_bm_words(device),
+	};
+
+	do {
+		err = send_bitmap_rle_or_plain(device, &c);
+	} while (err > 0);
+
+	return err == 0;
+}
+
+int drbd_send_bitmap(struct drbd_device *device)
+{
+	struct drbd_socket *sock = &first_peer_device(device)->connection->data;
+	int err = -1;
+
+	mutex_lock(&sock->mutex);
+	if (sock->socket)
+		err = !_drbd_send_bitmap(device);
+	mutex_unlock(&sock->mutex);
+	return err;
+}
+
+void drbd_send_b_ack(struct drbd_connection *connection, u32 barrier_nr, u32 set_size)
+{
+	struct drbd_socket *sock;
+	struct p_barrier_ack *p;
+
+	if (connection->cstate < C_WF_REPORT_PARAMS)
+		return;
+
+	sock = &connection->meta;
+	p = conn_prepare_command(connection, sock);
+	if (!p)
+		return;
+	p->barrier = barrier_nr;
+	p->set_size = cpu_to_be32(set_size);
+	conn_send_command(connection, sock, P_BARRIER_ACK, sizeof(*p), NULL, 0);
+}
+
+/**
+ * _drbd_send_ack() - Sends an ack packet
+ * @peer_device:	DRBD peer device.
+ * @cmd:		Packet command code.
+ * @sector:		sector, needs to be in big endian byte order
+ * @blksize:		size in byte, needs to be in big endian byte order
+ * @block_id:		Id, big endian byte order
+ */
+static int _drbd_send_ack(struct drbd_peer_device *peer_device, enum drbd_packet cmd,
+			  u64 sector, u32 blksize, u64 block_id)
+{
+	struct drbd_socket *sock;
+	struct p_block_ack *p;
+
+	if (peer_device->device->state.conn < C_CONNECTED)
+		return -EIO;
+
+	sock = &peer_device->connection->meta;
+	p = drbd_prepare_command(peer_device, sock);
+	if (!p)
+		return -EIO;
+	p->sector = sector;
+	p->block_id = block_id;
+	p->blksize = blksize;
+	p->seq_num = cpu_to_be32(atomic_inc_return(&peer_device->device->packet_seq));
+	return drbd_send_command(peer_device, sock, cmd, sizeof(*p), NULL, 0);
+}
+
+/* dp->sector and dp->block_id already/still in network byte order,
+ * data_size is payload size according to dp->head,
+ * and may need to be corrected for digest size. */
+void drbd_send_ack_dp(struct drbd_peer_device *peer_device, enum drbd_packet cmd,
+		      struct p_data *dp, int data_size)
+{
+	if (peer_device->connection->peer_integrity_tfm)
+		data_size -= crypto_shash_digestsize(peer_device->connection->peer_integrity_tfm);
+	_drbd_send_ack(peer_device, cmd, dp->sector, cpu_to_be32(data_size),
+		       dp->block_id);
+}
+
+void drbd_send_ack_rp(struct drbd_peer_device *peer_device, enum drbd_packet cmd,
+		      struct p_block_req *rp)
+{
+	_drbd_send_ack(peer_device, cmd, rp->sector, rp->blksize, rp->block_id);
+}
+
+/**
+ * drbd_send_ack() - Sends an ack packet
+ * @peer_device:	DRBD peer device
+ * @cmd:		packet command code
+ * @peer_req:		peer request
+ */
+int drbd_send_ack(struct drbd_peer_device *peer_device, enum drbd_packet cmd,
+		  struct drbd_peer_request *peer_req)
+{
+	return _drbd_send_ack(peer_device, cmd,
+			      cpu_to_be64(peer_req->i.sector),
+			      cpu_to_be32(peer_req->i.size),
+			      peer_req->block_id);
+}
+
+/* This function misuses the block_id field to signal if the blocks
+ * are is sync or not. */
+int drbd_send_ack_ex(struct drbd_peer_device *peer_device, enum drbd_packet cmd,
+		     sector_t sector, int blksize, u64 block_id)
+{
+	return _drbd_send_ack(peer_device, cmd,
+			      cpu_to_be64(sector),
+			      cpu_to_be32(blksize),
+			      cpu_to_be64(block_id));
+}
+
+int drbd_send_rs_deallocated(struct drbd_peer_device *peer_device,
+			     struct drbd_peer_request *peer_req)
+{
+	struct drbd_socket *sock;
+	struct p_block_desc *p;
+
+	sock = &peer_device->connection->data;
+	p = drbd_prepare_command(peer_device, sock);
+	if (!p)
+		return -EIO;
+	p->sector = cpu_to_be64(peer_req->i.sector);
+	p->blksize = cpu_to_be32(peer_req->i.size);
+	p->pad = 0;
+	return drbd_send_command(peer_device, sock, P_RS_DEALLOCATED, sizeof(*p), NULL, 0);
+}
+
+int drbd_send_drequest(struct drbd_peer_device *peer_device, int cmd,
+		       sector_t sector, int size, u64 block_id)
+{
+	struct drbd_socket *sock;
+	struct p_block_req *p;
+
+	sock = &peer_device->connection->data;
+	p = drbd_prepare_command(peer_device, sock);
+	if (!p)
+		return -EIO;
+	p->sector = cpu_to_be64(sector);
+	p->block_id = block_id;
+	p->blksize = cpu_to_be32(size);
+	return drbd_send_command(peer_device, sock, cmd, sizeof(*p), NULL, 0);
+}
+
+int drbd_send_drequest_csum(struct drbd_peer_device *peer_device, sector_t sector, int size,
+			    void *digest, int digest_size, enum drbd_packet cmd)
+{
+	struct drbd_socket *sock;
+	struct p_block_req *p;
+
+	/* FIXME: Put the digest into the preallocated socket buffer.  */
+
+	sock = &peer_device->connection->data;
+	p = drbd_prepare_command(peer_device, sock);
+	if (!p)
+		return -EIO;
+	p->sector = cpu_to_be64(sector);
+	p->block_id = ID_SYNCER /* unused */;
+	p->blksize = cpu_to_be32(size);
+	return drbd_send_command(peer_device, sock, cmd, sizeof(*p), digest, digest_size);
+}
+
+int drbd_send_ov_request(struct drbd_peer_device *peer_device, sector_t sector, int size)
+{
+	struct drbd_socket *sock;
+	struct p_block_req *p;
+
+	sock = &peer_device->connection->data;
+	p = drbd_prepare_command(peer_device, sock);
+	if (!p)
+		return -EIO;
+	p->sector = cpu_to_be64(sector);
+	p->block_id = ID_SYNCER /* unused */;
+	p->blksize = cpu_to_be32(size);
+	return drbd_send_command(peer_device, sock, P_OV_REQUEST, sizeof(*p), NULL, 0);
+}
+
+/* called on sndtimeo
+ * returns false if we should retry,
+ * true if we think connection is dead
+ */
+static int we_should_drop_the_connection(struct drbd_connection *connection, struct socket *sock)
+{
+	int drop_it;
+	/* long elapsed = (long)(jiffies - device->last_received); */
+
+	drop_it =   connection->meta.socket == sock
+		|| !connection->ack_receiver.task
+		|| get_t_state(&connection->ack_receiver) != RUNNING
+		|| connection->cstate < C_WF_REPORT_PARAMS;
+
+	if (drop_it)
+		return true;
+
+	drop_it = !--connection->ko_count;
+	if (!drop_it) {
+		drbd_err(connection, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
+			 current->comm, current->pid, connection->ko_count);
+		request_ping(connection);
+	}
+
+	return drop_it; /* && (device->state == R_PRIMARY) */;
+}
+
+static void drbd_update_congested(struct drbd_connection *connection)
+{
+	struct sock *sk = connection->data.socket->sk;
+	if (sk->sk_wmem_queued > sk->sk_sndbuf * 4 / 5)
+		set_bit(NET_CONGESTED, &connection->flags);
+}
+
+/* The idea of sendpage seems to be to put some kind of reference
+ * to the page into the skb, and to hand it over to the NIC. In
+ * this process get_page() gets called.
+ *
+ * As soon as the page was really sent over the network put_page()
+ * gets called by some part of the network layer. [ NIC driver? ]
+ *
+ * [ get_page() / put_page() increment/decrement the count. If count
+ *   reaches 0 the page will be freed. ]
+ *
+ * This works nicely with pages from FSs.
+ * But this means that in protocol A we might signal IO completion too early!
+ *
+ * In order not to corrupt data during a resync we must make sure
+ * that we do not reuse our own buffer pages (EEs) to early, therefore
+ * we have the net_ee list.
+ *
+ * XFS seems to have problems, still, it submits pages with page_count == 0!
+ * As a workaround, we disable sendpage on pages
+ * with page_count == 0 or PageSlab.
+ */
+static int _drbd_no_send_page(struct drbd_peer_device *peer_device, struct page *page,
+			      int offset, size_t size, unsigned msg_flags)
+{
+	struct socket *socket;
+	void *addr;
+	int err;
+
+	socket = peer_device->connection->data.socket;
+	addr = kmap(page) + offset;
+	err = drbd_send_all(peer_device->connection, socket, addr, size, msg_flags);
+	kunmap(page);
+	if (!err)
+		peer_device->device->send_cnt += size >> 9;
+	return err;
+}
+
+static int _drbd_send_page(struct drbd_peer_device *peer_device, struct page *page,
+		    int offset, size_t size, unsigned msg_flags)
+{
+	struct socket *socket = peer_device->connection->data.socket;
+	int len = size;
+	int err = -EIO;
+
+	/* e.g. XFS meta- & log-data is in slab pages, which have a
+	 * page_count of 0 and/or have PageSlab() set.
+	 * we cannot use send_page for those, as that does get_page();
+	 * put_page(); and would cause either a VM_BUG directly, or
+	 * __page_cache_release a page that would actually still be referenced
+	 * by someone, leading to some obscure delayed Oops somewhere else. */
+	if (drbd_disable_sendpage || !sendpage_ok(page))
+		return _drbd_no_send_page(peer_device, page, offset, size, msg_flags);
+
+	msg_flags |= MSG_NOSIGNAL;
+	drbd_update_congested(peer_device->connection);
+	do {
+		int sent;
+
+		sent = socket->ops->sendpage(socket, page, offset, len, msg_flags);
+		if (sent <= 0) {
+			if (sent == -EAGAIN) {
+				if (we_should_drop_the_connection(peer_device->connection, socket))
+					break;
+				continue;
+			}
+			drbd_warn(peer_device->device, "%s: size=%d len=%d sent=%d\n",
+			     __func__, (int)size, len, sent);
+			if (sent < 0)
+				err = sent;
+			break;
+		}
+		len    -= sent;
+		offset += sent;
+	} while (len > 0 /* THINK && device->cstate >= C_CONNECTED*/);
+	clear_bit(NET_CONGESTED, &peer_device->connection->flags);
+
+	if (len == 0) {
+		err = 0;
+		peer_device->device->send_cnt += size >> 9;
+	}
+	return err;
+}
+
+static int _drbd_send_bio(struct drbd_peer_device *peer_device, struct bio *bio)
+{
+	struct bio_vec bvec;
+	struct bvec_iter iter;
+
+	/* hint all but last page with MSG_MORE */
+	bio_for_each_segment(bvec, bio, iter) {
+		int err;
+
+		err = _drbd_no_send_page(peer_device, bvec.bv_page,
+					 bvec.bv_offset, bvec.bv_len,
+					 bio_iter_last(bvec, iter)
+					 ? 0 : MSG_MORE);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+static int _drbd_send_zc_bio(struct drbd_peer_device *peer_device, struct bio *bio)
+{
+	struct bio_vec bvec;
+	struct bvec_iter iter;
+
+	/* hint all but last page with MSG_MORE */
+	bio_for_each_segment(bvec, bio, iter) {
+		int err;
+
+		err = _drbd_send_page(peer_device, bvec.bv_page,
+				      bvec.bv_offset, bvec.bv_len,
+				      bio_iter_last(bvec, iter) ? 0 : MSG_MORE);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
+static int _drbd_send_zc_ee(struct drbd_peer_device *peer_device,
+			    struct drbd_peer_request *peer_req)
+{
+	struct page *page = peer_req->pages;
+	unsigned len = peer_req->i.size;
+	int err;
+
+	/* hint all but last page with MSG_MORE */
+	page_chain_for_each(page) {
+		unsigned l = min_t(unsigned, len, PAGE_SIZE);
+
+		err = _drbd_send_page(peer_device, page, 0, l,
+				      page_chain_next(page) ? MSG_MORE : 0);
+		if (err)
+			return err;
+		len -= l;
+	}
+	return 0;
+}
+
+static u32 bio_flags_to_wire(struct drbd_connection *connection,
+			     struct bio *bio)
+{
+	if (connection->agreed_pro_version >= 95)
+		return  (bio->bi_opf & REQ_SYNC ? DP_RW_SYNC : 0) |
+			(bio->bi_opf & REQ_FUA ? DP_FUA : 0) |
+			(bio->bi_opf & REQ_PREFLUSH ? DP_FLUSH : 0) |
+			(bio_op(bio) == REQ_OP_DISCARD ? DP_DISCARD : 0) |
+			(bio_op(bio) == REQ_OP_WRITE_ZEROES ?
+			  ((connection->agreed_features & DRBD_FF_WZEROES) ?
+			   (DP_ZEROES |(!(bio->bi_opf & REQ_NOUNMAP) ? DP_DISCARD : 0))
+			   : DP_DISCARD)
+			: 0);
+	else
+		return bio->bi_opf & REQ_SYNC ? DP_RW_SYNC : 0;
+}
+
+/* Used to send write or TRIM aka REQ_OP_DISCARD requests
+ * R_PRIMARY -> Peer	(P_DATA, P_TRIM)
+ */
+int drbd_send_dblock(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+	struct drbd_device *device = peer_device->device;
+	struct drbd_socket *sock;
+	struct p_data *p;
+	void *digest_out;
+	unsigned int dp_flags = 0;
+	int digest_size;
+	int err;
+
+	sock = &peer_device->connection->data;
+	p = drbd_prepare_command(peer_device, sock);
+	digest_size = peer_device->connection->integrity_tfm ?
+		      crypto_shash_digestsize(peer_device->connection->integrity_tfm) : 0;
+
+	if (!p)
+		return -EIO;
+	p->sector = cpu_to_be64(req->i.sector);
+	p->block_id = (unsigned long)req;
+	p->seq_num = cpu_to_be32(atomic_inc_return(&device->packet_seq));
+	dp_flags = bio_flags_to_wire(peer_device->connection, req->master_bio);
+	if (device->state.conn >= C_SYNC_SOURCE &&
+	    device->state.conn <= C_PAUSED_SYNC_T)
+		dp_flags |= DP_MAY_SET_IN_SYNC;
+	if (peer_device->connection->agreed_pro_version >= 100) {
+		if (req->rq_state & RQ_EXP_RECEIVE_ACK)
+			dp_flags |= DP_SEND_RECEIVE_ACK;
+		/* During resync, request an explicit write ack,
+		 * even in protocol != C */
+		if (req->rq_state & RQ_EXP_WRITE_ACK
+		|| (dp_flags & DP_MAY_SET_IN_SYNC))
+			dp_flags |= DP_SEND_WRITE_ACK;
+	}
+	p->dp_flags = cpu_to_be32(dp_flags);
+
+	if (dp_flags & (DP_DISCARD|DP_ZEROES)) {
+		enum drbd_packet cmd = (dp_flags & DP_ZEROES) ? P_ZEROES : P_TRIM;
+		struct p_trim *t = (struct p_trim*)p;
+		t->size = cpu_to_be32(req->i.size);
+		err = __send_command(peer_device->connection, device->vnr, sock, cmd, sizeof(*t), NULL, 0);
+		goto out;
+	}
+	digest_out = p + 1;
+
+	/* our digest is still only over the payload.
+	 * TRIM does not carry any payload. */
+	if (digest_size)
+		drbd_csum_bio(peer_device->connection->integrity_tfm, req->master_bio, digest_out);
+	err = __send_command(peer_device->connection, device->vnr, sock, P_DATA,
+			     sizeof(*p) + digest_size, NULL, req->i.size);
+	if (!err) {
+		/* For protocol A, we have to memcpy the payload into
+		 * socket buffers, as we may complete right away
+		 * as soon as we handed it over to tcp, at which point the data
+		 * pages may become invalid.
+		 *
+		 * For data-integrity enabled, we copy it as well, so we can be
+		 * sure that even if the bio pages may still be modified, it
+		 * won't change the data on the wire, thus if the digest checks
+		 * out ok after sending on this side, but does not fit on the
+		 * receiving side, we sure have detected corruption elsewhere.
+		 */
+		if (!(req->rq_state & (RQ_EXP_RECEIVE_ACK | RQ_EXP_WRITE_ACK)) || digest_size)
+			err = _drbd_send_bio(peer_device, req->master_bio);
+		else
+			err = _drbd_send_zc_bio(peer_device, req->master_bio);
+
+		/* double check digest, sometimes buffers have been modified in flight. */
+		if (digest_size > 0 && digest_size <= 64) {
+			/* 64 byte, 512 bit, is the largest digest size
+			 * currently supported in kernel crypto. */
+			unsigned char digest[64];
+			drbd_csum_bio(peer_device->connection->integrity_tfm, req->master_bio, digest);
+			if (memcmp(p + 1, digest, digest_size)) {
+				drbd_warn(device,
+					"Digest mismatch, buffer modified by upper layers during write: %llus +%u\n",
+					(unsigned long long)req->i.sector, req->i.size);
+			}
+		} /* else if (digest_size > 64) {
+		     ... Be noisy about digest too large ...
+		} */
+	}
+out:
+	mutex_unlock(&sock->mutex);  /* locked by drbd_prepare_command() */
+
+	return err;
+}
+
+/* answer packet, used to send data back for read requests:
+ *  Peer       -> (diskless) R_PRIMARY   (P_DATA_REPLY)
+ *  C_SYNC_SOURCE -> C_SYNC_TARGET         (P_RS_DATA_REPLY)
+ */
+int drbd_send_block(struct drbd_peer_device *peer_device, enum drbd_packet cmd,
+		    struct drbd_peer_request *peer_req)
+{
+	struct drbd_device *device = peer_device->device;
+	struct drbd_socket *sock;
+	struct p_data *p;
+	int err;
+	int digest_size;
+
+	sock = &peer_device->connection->data;
+	p = drbd_prepare_command(peer_device, sock);
+
+	digest_size = peer_device->connection->integrity_tfm ?
+		      crypto_shash_digestsize(peer_device->connection->integrity_tfm) : 0;
+
+	if (!p)
+		return -EIO;
+	p->sector = cpu_to_be64(peer_req->i.sector);
+	p->block_id = peer_req->block_id;
+	p->seq_num = 0;  /* unused */
+	p->dp_flags = 0;
+	if (digest_size)
+		drbd_csum_ee(peer_device->connection->integrity_tfm, peer_req, p + 1);
+	err = __send_command(peer_device->connection, device->vnr, sock, cmd, sizeof(*p) + digest_size, NULL, peer_req->i.size);
+	if (!err)
+		err = _drbd_send_zc_ee(peer_device, peer_req);
+	mutex_unlock(&sock->mutex);  /* locked by drbd_prepare_command() */
+
+	return err;
+}
+
+int drbd_send_out_of_sync(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+	struct drbd_socket *sock;
+	struct p_block_desc *p;
+
+	sock = &peer_device->connection->data;
+	p = drbd_prepare_command(peer_device, sock);
+	if (!p)
+		return -EIO;
+	p->sector = cpu_to_be64(req->i.sector);
+	p->blksize = cpu_to_be32(req->i.size);
+	return drbd_send_command(peer_device, sock, P_OUT_OF_SYNC, sizeof(*p), NULL, 0);
+}
+
+/*
+  drbd_send distinguishes two cases:
+
+  Packets sent via the data socket "sock"
+  and packets sent via the meta data socket "msock"
+
+		    sock                      msock
+  -----------------+-------------------------+------------------------------
+  timeout           conf.timeout / 2          conf.timeout / 2
+  timeout action    send a ping via msock     Abort communication
+					      and close all sockets
+*/
+
+/*
+ * you must have down()ed the appropriate [m]sock_mutex elsewhere!
+ */
+int drbd_send(struct drbd_connection *connection, struct socket *sock,
+	      void *buf, size_t size, unsigned msg_flags)
+{
+	struct kvec iov = {.iov_base = buf, .iov_len = size};
+	struct msghdr msg = {.msg_flags = msg_flags | MSG_NOSIGNAL};
+	int rv, sent = 0;
+
+	if (!sock)
+		return -EBADR;
+
+	/* THINK  if (signal_pending) return ... ? */
+
+	iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &iov, 1, size);
+
+	if (sock == connection->data.socket) {
+		rcu_read_lock();
+		connection->ko_count = rcu_dereference(connection->net_conf)->ko_count;
+		rcu_read_unlock();
+		drbd_update_congested(connection);
+	}
+	do {
+		rv = sock_sendmsg(sock, &msg);
+		if (rv == -EAGAIN) {
+			if (we_should_drop_the_connection(connection, sock))
+				break;
+			else
+				continue;
+		}
+		if (rv == -EINTR) {
+			flush_signals(current);
+			rv = 0;
+		}
+		if (rv < 0)
+			break;
+		sent += rv;
+	} while (sent < size);
+
+	if (sock == connection->data.socket)
+		clear_bit(NET_CONGESTED, &connection->flags);
+
+	if (rv <= 0) {
+		if (rv != -EAGAIN) {
+			drbd_err(connection, "%s_sendmsg returned %d\n",
+				 sock == connection->meta.socket ? "msock" : "sock",
+				 rv);
+			conn_request_state(connection, NS(conn, C_BROKEN_PIPE), CS_HARD);
+		} else
+			conn_request_state(connection, NS(conn, C_TIMEOUT), CS_HARD);
+	}
+
+	return sent;
+}
+
+/*
+ * drbd_send_all  -  Send an entire buffer
+ *
+ * Returns 0 upon success and a negative error value otherwise.
+ */
+int drbd_send_all(struct drbd_connection *connection, struct socket *sock, void *buffer,
+		  size_t size, unsigned msg_flags)
+{
+	int err;
+
+	err = drbd_send(connection, sock, buffer, size, msg_flags);
+	if (err < 0)
+		return err;
+	if (err != size)
+		return -EIO;
+	return 0;
+}
+
+static int drbd_open(struct block_device *bdev, fmode_t mode)
+{
+	struct drbd_device *device = bdev->bd_disk->private_data;
+	unsigned long flags;
+	int rv = 0;
+
+	mutex_lock(&drbd_main_mutex);
+	spin_lock_irqsave(&device->resource->req_lock, flags);
+	/* to have a stable device->state.role
+	 * and no race with updating open_cnt */
+
+	if (device->state.role != R_PRIMARY) {
+		if (mode & FMODE_WRITE)
+			rv = -EROFS;
+		else if (!drbd_allow_oos)
+			rv = -EMEDIUMTYPE;
+	}
+
+	if (!rv)
+		device->open_cnt++;
+	spin_unlock_irqrestore(&device->resource->req_lock, flags);
+	mutex_unlock(&drbd_main_mutex);
+
+	return rv;
+}
+
+static void drbd_release(struct gendisk *gd, fmode_t mode)
+{
+	struct drbd_device *device = gd->private_data;
+	mutex_lock(&drbd_main_mutex);
+	device->open_cnt--;
+	mutex_unlock(&drbd_main_mutex);
+}
+
+/* need to hold resource->req_lock */
+void drbd_queue_unplug(struct drbd_device *device)
+{
+	if (device->state.pdsk >= D_INCONSISTENT && device->state.conn >= C_CONNECTED) {
+		D_ASSERT(device, device->state.role == R_PRIMARY);
+		if (test_and_clear_bit(UNPLUG_REMOTE, &device->flags)) {
+			drbd_queue_work_if_unqueued(
+				&first_peer_device(device)->connection->sender_work,
+				&device->unplug_work);
+		}
+	}
+}
+
+static void drbd_set_defaults(struct drbd_device *device)
+{
+	/* Beware! The actual layout differs
+	 * between big endian and little endian */
+	device->state = (union drbd_dev_state) {
+		{ .role = R_SECONDARY,
+		  .peer = R_UNKNOWN,
+		  .conn = C_STANDALONE,
+		  .disk = D_DISKLESS,
+		  .pdsk = D_UNKNOWN,
+		} };
+}
+
+void drbd_init_set_defaults(struct drbd_device *device)
+{
+	/* the memset(,0,) did most of this.
+	 * note: only assignments, no allocation in here */
+
+	drbd_set_defaults(device);
+
+	atomic_set(&device->ap_bio_cnt, 0);
+	atomic_set(&device->ap_actlog_cnt, 0);
+	atomic_set(&device->ap_pending_cnt, 0);
+	atomic_set(&device->rs_pending_cnt, 0);
+	atomic_set(&device->unacked_cnt, 0);
+	atomic_set(&device->local_cnt, 0);
+	atomic_set(&device->pp_in_use_by_net, 0);
+	atomic_set(&device->rs_sect_in, 0);
+	atomic_set(&device->rs_sect_ev, 0);
+	atomic_set(&device->ap_in_flight, 0);
+	atomic_set(&device->md_io.in_use, 0);
+
+	mutex_init(&device->own_state_mutex);
+	device->state_mutex = &device->own_state_mutex;
+
+	spin_lock_init(&device->al_lock);
+	spin_lock_init(&device->peer_seq_lock);
+
+	INIT_LIST_HEAD(&device->active_ee);
+	INIT_LIST_HEAD(&device->sync_ee);
+	INIT_LIST_HEAD(&device->done_ee);
+	INIT_LIST_HEAD(&device->read_ee);
+	INIT_LIST_HEAD(&device->net_ee);
+	INIT_LIST_HEAD(&device->resync_reads);
+	INIT_LIST_HEAD(&device->resync_work.list);
+	INIT_LIST_HEAD(&device->unplug_work.list);
+	INIT_LIST_HEAD(&device->bm_io_work.w.list);
+	INIT_LIST_HEAD(&device->pending_master_completion[0]);
+	INIT_LIST_HEAD(&device->pending_master_completion[1]);
+	INIT_LIST_HEAD(&device->pending_completion[0]);
+	INIT_LIST_HEAD(&device->pending_completion[1]);
+
+	device->resync_work.cb  = w_resync_timer;
+	device->unplug_work.cb  = w_send_write_hint;
+	device->bm_io_work.w.cb = w_bitmap_io;
+
+	timer_setup(&device->resync_timer, resync_timer_fn, 0);
+	timer_setup(&device->md_sync_timer, md_sync_timer_fn, 0);
+	timer_setup(&device->start_resync_timer, start_resync_timer_fn, 0);
+	timer_setup(&device->request_timer, request_timer_fn, 0);
+
+	init_waitqueue_head(&device->misc_wait);
+	init_waitqueue_head(&device->state_wait);
+	init_waitqueue_head(&device->ee_wait);
+	init_waitqueue_head(&device->al_wait);
+	init_waitqueue_head(&device->seq_wait);
+
+	device->resync_wenr = LC_FREE;
+	device->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
+	device->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
+}
+
+void drbd_set_my_capacity(struct drbd_device *device, sector_t size)
+{
+	char ppb[10];
+
+	set_capacity_and_notify(device->vdisk, size);
+
+	drbd_info(device, "size = %s (%llu KB)\n",
+		ppsize(ppb, size>>1), (unsigned long long)size>>1);
+}
+
+void drbd_device_cleanup(struct drbd_device *device)
+{
+	int i;
+	if (first_peer_device(device)->connection->receiver.t_state != NONE)
+		drbd_err(device, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
+				first_peer_device(device)->connection->receiver.t_state);
+
+	device->al_writ_cnt  =
+	device->bm_writ_cnt  =
+	device->read_cnt     =
+	device->recv_cnt     =
+	device->send_cnt     =
+	device->writ_cnt     =
+	device->p_size       =
+	device->rs_start     =
+	device->rs_total     =
+	device->rs_failed    = 0;
+	device->rs_last_events = 0;
+	device->rs_last_sect_ev = 0;
+	for (i = 0; i < DRBD_SYNC_MARKS; i++) {
+		device->rs_mark_left[i] = 0;
+		device->rs_mark_time[i] = 0;
+	}
+	D_ASSERT(device, first_peer_device(device)->connection->net_conf == NULL);
+
+	set_capacity_and_notify(device->vdisk, 0);
+	if (device->bitmap) {
+		/* maybe never allocated. */
+		drbd_bm_resize(device, 0, 1);
+		drbd_bm_cleanup(device);
+	}
+
+	drbd_backing_dev_free(device, device->ldev);
+	device->ldev = NULL;
+
+	clear_bit(AL_SUSPENDED, &device->flags);
+
+	D_ASSERT(device, list_empty(&device->active_ee));
+	D_ASSERT(device, list_empty(&device->sync_ee));
+	D_ASSERT(device, list_empty(&device->done_ee));
+	D_ASSERT(device, list_empty(&device->read_ee));
+	D_ASSERT(device, list_empty(&device->net_ee));
+	D_ASSERT(device, list_empty(&device->resync_reads));
+	D_ASSERT(device, list_empty(&first_peer_device(device)->connection->sender_work.q));
+	D_ASSERT(device, list_empty(&device->resync_work.list));
+	D_ASSERT(device, list_empty(&device->unplug_work.list));
+
+	drbd_set_defaults(device);
+}
+
+
+static void drbd_destroy_mempools(void)
+{
+	struct page *page;
+
+	while (drbd_pp_pool) {
+		page = drbd_pp_pool;
+		drbd_pp_pool = (struct page *)page_private(page);
+		__free_page(page);
+		drbd_pp_vacant--;
+	}
+
+	/* D_ASSERT(device, atomic_read(&drbd_pp_vacant)==0); */
+
+	bioset_exit(&drbd_io_bio_set);
+	bioset_exit(&drbd_md_io_bio_set);
+	mempool_exit(&drbd_md_io_page_pool);
+	mempool_exit(&drbd_ee_mempool);
+	mempool_exit(&drbd_request_mempool);
+	kmem_cache_destroy(drbd_ee_cache);
+	kmem_cache_destroy(drbd_request_cache);
+	kmem_cache_destroy(drbd_bm_ext_cache);
+	kmem_cache_destroy(drbd_al_ext_cache);
+
+	drbd_ee_cache        = NULL;
+	drbd_request_cache   = NULL;
+	drbd_bm_ext_cache    = NULL;
+	drbd_al_ext_cache    = NULL;
+
+	return;
+}
+
+static int drbd_create_mempools(void)
+{
+	struct page *page;
+	const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * drbd_minor_count;
+	int i, ret;
+
+	/* caches */
+	drbd_request_cache = kmem_cache_create(
+		"drbd_req", sizeof(struct drbd_request), 0, 0, NULL);
+	if (drbd_request_cache == NULL)
+		goto Enomem;
+
+	drbd_ee_cache = kmem_cache_create(
+		"drbd_ee", sizeof(struct drbd_peer_request), 0, 0, NULL);
+	if (drbd_ee_cache == NULL)
+		goto Enomem;
+
+	drbd_bm_ext_cache = kmem_cache_create(
+		"drbd_bm", sizeof(struct bm_extent), 0, 0, NULL);
+	if (drbd_bm_ext_cache == NULL)
+		goto Enomem;
+
+	drbd_al_ext_cache = kmem_cache_create(
+		"drbd_al", sizeof(struct lc_element), 0, 0, NULL);
+	if (drbd_al_ext_cache == NULL)
+		goto Enomem;
+
+	/* mempools */
+	ret = bioset_init(&drbd_io_bio_set, BIO_POOL_SIZE, 0, 0);
+	if (ret)
+		goto Enomem;
+
+	ret = bioset_init(&drbd_md_io_bio_set, DRBD_MIN_POOL_PAGES, 0,
+			  BIOSET_NEED_BVECS);
+	if (ret)
+		goto Enomem;
+
+	ret = mempool_init_page_pool(&drbd_md_io_page_pool, DRBD_MIN_POOL_PAGES, 0);
+	if (ret)
+		goto Enomem;
+
+	ret = mempool_init_slab_pool(&drbd_request_mempool, number,
+				     drbd_request_cache);
+	if (ret)
+		goto Enomem;
+
+	ret = mempool_init_slab_pool(&drbd_ee_mempool, number, drbd_ee_cache);
+	if (ret)
+		goto Enomem;
+
+	for (i = 0; i < number; i++) {
+		page = alloc_page(GFP_HIGHUSER);
+		if (!page)
+			goto Enomem;
+		set_page_private(page, (unsigned long)drbd_pp_pool);
+		drbd_pp_pool = page;
+	}
+	drbd_pp_vacant = number;
+
+	return 0;
+
+Enomem:
+	drbd_destroy_mempools(); /* in case we allocated some */
+	return -ENOMEM;
+}
+
+static void drbd_release_all_peer_reqs(struct drbd_device *device)
+{
+	int rr;
+
+	rr = drbd_free_peer_reqs(device, &device->active_ee);
+	if (rr)
+		drbd_err(device, "%d EEs in active list found!\n", rr);
+
+	rr = drbd_free_peer_reqs(device, &device->sync_ee);
+	if (rr)
+		drbd_err(device, "%d EEs in sync list found!\n", rr);
+
+	rr = drbd_free_peer_reqs(device, &device->read_ee);
+	if (rr)
+		drbd_err(device, "%d EEs in read list found!\n", rr);
+
+	rr = drbd_free_peer_reqs(device, &device->done_ee);
+	if (rr)
+		drbd_err(device, "%d EEs in done list found!\n", rr);
+
+	rr = drbd_free_peer_reqs(device, &device->net_ee);
+	if (rr)
+		drbd_err(device, "%d EEs in net list found!\n", rr);
+}
+
+/* caution. no locking. */
+void drbd_destroy_device(struct kref *kref)
+{
+	struct drbd_device *device = container_of(kref, struct drbd_device, kref);
+	struct drbd_resource *resource = device->resource;
+	struct drbd_peer_device *peer_device, *tmp_peer_device;
+
+	del_timer_sync(&device->request_timer);
+
+	/* paranoia asserts */
+	D_ASSERT(device, device->open_cnt == 0);
+	/* end paranoia asserts */
+
+	/* cleanup stuff that may have been allocated during
+	 * device (re-)configuration or state changes */
+
+	drbd_backing_dev_free(device, device->ldev);
+	device->ldev = NULL;
+
+	drbd_release_all_peer_reqs(device);
+
+	lc_destroy(device->act_log);
+	lc_destroy(device->resync);
+
+	kfree(device->p_uuid);
+	/* device->p_uuid = NULL; */
+
+	if (device->bitmap) /* should no longer be there. */
+		drbd_bm_cleanup(device);
+	__free_page(device->md_io.page);
+	put_disk(device->vdisk);
+	kfree(device->rs_plan_s);
+
+	/* not for_each_connection(connection, resource):
+	 * those may have been cleaned up and disassociated already.
+	 */
+	for_each_peer_device_safe(peer_device, tmp_peer_device, device) {
+		kref_put(&peer_device->connection->kref, drbd_destroy_connection);
+		kfree(peer_device);
+	}
+	if (device->submit.wq)
+		destroy_workqueue(device->submit.wq);
+	kfree(device);
+	kref_put(&resource->kref, drbd_destroy_resource);
+}
+
+/* One global retry thread, if we need to push back some bio and have it
+ * reinserted through our make request function.
+ */
+static struct retry_worker {
+	struct workqueue_struct *wq;
+	struct work_struct worker;
+
+	spinlock_t lock;
+	struct list_head writes;
+} retry;
+
+static void do_retry(struct work_struct *ws)
+{
+	struct retry_worker *retry = container_of(ws, struct retry_worker, worker);
+	LIST_HEAD(writes);
+	struct drbd_request *req, *tmp;
+
+	spin_lock_irq(&retry->lock);
+	list_splice_init(&retry->writes, &writes);
+	spin_unlock_irq(&retry->lock);
+
+	list_for_each_entry_safe(req, tmp, &writes, tl_requests) {
+		struct drbd_device *device = req->device;
+		struct bio *bio = req->master_bio;
+		bool expected;
+
+		expected =
+			expect(atomic_read(&req->completion_ref) == 0) &&
+			expect(req->rq_state & RQ_POSTPONED) &&
+			expect((req->rq_state & RQ_LOCAL_PENDING) == 0 ||
+				(req->rq_state & RQ_LOCAL_ABORTED) != 0);
+
+		if (!expected)
+			drbd_err(device, "req=%p completion_ref=%d rq_state=%x\n",
+				req, atomic_read(&req->completion_ref),
+				req->rq_state);
+
+		/* We still need to put one kref associated with the
+		 * "completion_ref" going zero in the code path that queued it
+		 * here.  The request object may still be referenced by a
+		 * frozen local req->private_bio, in case we force-detached.
+		 */
+		kref_put(&req->kref, drbd_req_destroy);
+
+		/* A single suspended or otherwise blocking device may stall
+		 * all others as well.  Fortunately, this code path is to
+		 * recover from a situation that "should not happen":
+		 * concurrent writes in multi-primary setup.
+		 * In a "normal" lifecycle, this workqueue is supposed to be
+		 * destroyed without ever doing anything.
+		 * If it turns out to be an issue anyways, we can do per
+		 * resource (replication group) or per device (minor) retry
+		 * workqueues instead.
+		 */
+
+		/* We are not just doing submit_bio_noacct(),
+		 * as we want to keep the start_time information. */
+		inc_ap_bio(device);
+		__drbd_make_request(device, bio);
+	}
+}
+
+/* called via drbd_req_put_completion_ref(),
+ * holds resource->req_lock */
+void drbd_restart_request(struct drbd_request *req)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&retry.lock, flags);
+	list_move_tail(&req->tl_requests, &retry.writes);
+	spin_unlock_irqrestore(&retry.lock, flags);
+
+	/* Drop the extra reference that would otherwise
+	 * have been dropped by complete_master_bio.
+	 * do_retry() needs to grab a new one. */
+	dec_ap_bio(req->device);
+
+	queue_work(retry.wq, &retry.worker);
+}
+
+void drbd_destroy_resource(struct kref *kref)
+{
+	struct drbd_resource *resource =
+		container_of(kref, struct drbd_resource, kref);
+
+	idr_destroy(&resource->devices);
+	free_cpumask_var(resource->cpu_mask);
+	kfree(resource->name);
+	kfree(resource);
+}
+
+void drbd_free_resource(struct drbd_resource *resource)
+{
+	struct drbd_connection *connection, *tmp;
+
+	for_each_connection_safe(connection, tmp, resource) {
+		list_del(&connection->connections);
+		drbd_debugfs_connection_cleanup(connection);
+		kref_put(&connection->kref, drbd_destroy_connection);
+	}
+	drbd_debugfs_resource_cleanup(resource);
+	kref_put(&resource->kref, drbd_destroy_resource);
+}
+
+static void drbd_cleanup(void)
+{
+	unsigned int i;
+	struct drbd_device *device;
+	struct drbd_resource *resource, *tmp;
+
+	/* first remove proc,
+	 * drbdsetup uses it's presence to detect
+	 * whether DRBD is loaded.
+	 * If we would get stuck in proc removal,
+	 * but have netlink already deregistered,
+	 * some drbdsetup commands may wait forever
+	 * for an answer.
+	 */
+	if (drbd_proc)
+		remove_proc_entry("drbd", NULL);
+
+	if (retry.wq)
+		destroy_workqueue(retry.wq);
+
+	drbd_genl_unregister();
+
+	idr_for_each_entry(&drbd_devices, device, i)
+		drbd_delete_device(device);
+
+	/* not _rcu since, no other updater anymore. Genl already unregistered */
+	for_each_resource_safe(resource, tmp, &drbd_resources) {
+		list_del(&resource->resources);
+		drbd_free_resource(resource);
+	}
+
+	drbd_debugfs_cleanup();
+
+	drbd_destroy_mempools();
+	unregister_blkdev(DRBD_MAJOR, "drbd");
+
+	idr_destroy(&drbd_devices);
+
+	pr_info("module cleanup done.\n");
+}
+
+static void drbd_init_workqueue(struct drbd_work_queue* wq)
+{
+	spin_lock_init(&wq->q_lock);
+	INIT_LIST_HEAD(&wq->q);
+	init_waitqueue_head(&wq->q_wait);
+}
+
+struct completion_work {
+	struct drbd_work w;
+	struct completion done;
+};
+
+static int w_complete(struct drbd_work *w, int cancel)
+{
+	struct completion_work *completion_work =
+		container_of(w, struct completion_work, w);
+
+	complete(&completion_work->done);
+	return 0;
+}
+
+void drbd_flush_workqueue(struct drbd_work_queue *work_queue)
+{
+	struct completion_work completion_work;
+
+	completion_work.w.cb = w_complete;
+	init_completion(&completion_work.done);
+	drbd_queue_work(work_queue, &completion_work.w);
+	wait_for_completion(&completion_work.done);
+}
+
+struct drbd_resource *drbd_find_resource(const char *name)
+{
+	struct drbd_resource *resource;
+
+	if (!name || !name[0])
+		return NULL;
+
+	rcu_read_lock();
+	for_each_resource_rcu(resource, &drbd_resources) {
+		if (!strcmp(resource->name, name)) {
+			kref_get(&resource->kref);
+			goto found;
+		}
+	}
+	resource = NULL;
+found:
+	rcu_read_unlock();
+	return resource;
+}
+
+struct drbd_connection *conn_get_by_addrs(void *my_addr, int my_addr_len,
+				     void *peer_addr, int peer_addr_len)
+{
+	struct drbd_resource *resource;
+	struct drbd_connection *connection;
+
+	rcu_read_lock();
+	for_each_resource_rcu(resource, &drbd_resources) {
+		for_each_connection_rcu(connection, resource) {
+			if (connection->my_addr_len == my_addr_len &&
+			    connection->peer_addr_len == peer_addr_len &&
+			    !memcmp(&connection->my_addr, my_addr, my_addr_len) &&
+			    !memcmp(&connection->peer_addr, peer_addr, peer_addr_len)) {
+				kref_get(&connection->kref);
+				goto found;
+			}
+		}
+	}
+	connection = NULL;
+found:
+	rcu_read_unlock();
+	return connection;
+}
+
+static int drbd_alloc_socket(struct drbd_socket *socket)
+{
+	socket->rbuf = (void *) __get_free_page(GFP_KERNEL);
+	if (!socket->rbuf)
+		return -ENOMEM;
+	socket->sbuf = (void *) __get_free_page(GFP_KERNEL);
+	if (!socket->sbuf)
+		return -ENOMEM;
+	return 0;
+}
+
+static void drbd_free_socket(struct drbd_socket *socket)
+{
+	free_page((unsigned long) socket->sbuf);
+	free_page((unsigned long) socket->rbuf);
+}
+
+void conn_free_crypto(struct drbd_connection *connection)
+{
+	drbd_free_sock(connection);
+
+	crypto_free_shash(connection->csums_tfm);
+	crypto_free_shash(connection->verify_tfm);
+	crypto_free_shash(connection->cram_hmac_tfm);
+	crypto_free_shash(connection->integrity_tfm);
+	crypto_free_shash(connection->peer_integrity_tfm);
+	kfree(connection->int_dig_in);
+	kfree(connection->int_dig_vv);
+
+	connection->csums_tfm = NULL;
+	connection->verify_tfm = NULL;
+	connection->cram_hmac_tfm = NULL;
+	connection->integrity_tfm = NULL;
+	connection->peer_integrity_tfm = NULL;
+	connection->int_dig_in = NULL;
+	connection->int_dig_vv = NULL;
+}
+
+int set_resource_options(struct drbd_resource *resource, struct res_opts *res_opts)
+{
+	struct drbd_connection *connection;
+	cpumask_var_t new_cpu_mask;
+	int err;
+
+	if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL))
+		return -ENOMEM;
+
+	/* silently ignore cpu mask on UP kernel */
+	if (nr_cpu_ids > 1 && res_opts->cpu_mask[0] != 0) {
+		err = bitmap_parse(res_opts->cpu_mask, DRBD_CPU_MASK_SIZE,
+				   cpumask_bits(new_cpu_mask), nr_cpu_ids);
+		if (err == -EOVERFLOW) {
+			/* So what. mask it out. */
+			cpumask_var_t tmp_cpu_mask;
+			if (zalloc_cpumask_var(&tmp_cpu_mask, GFP_KERNEL)) {
+				cpumask_setall(tmp_cpu_mask);
+				cpumask_and(new_cpu_mask, new_cpu_mask, tmp_cpu_mask);
+				drbd_warn(resource, "Overflow in bitmap_parse(%.12s%s), truncating to %u bits\n",
+					res_opts->cpu_mask,
+					strlen(res_opts->cpu_mask) > 12 ? "..." : "",
+					nr_cpu_ids);
+				free_cpumask_var(tmp_cpu_mask);
+				err = 0;
+			}
+		}
+		if (err) {
+			drbd_warn(resource, "bitmap_parse() failed with %d\n", err);
+			/* retcode = ERR_CPU_MASK_PARSE; */
+			goto fail;
+		}
+	}
+	resource->res_opts = *res_opts;
+	if (cpumask_empty(new_cpu_mask))
+		drbd_calc_cpu_mask(&new_cpu_mask);
+	if (!cpumask_equal(resource->cpu_mask, new_cpu_mask)) {
+		cpumask_copy(resource->cpu_mask, new_cpu_mask);
+		for_each_connection_rcu(connection, resource) {
+			connection->receiver.reset_cpu_mask = 1;
+			connection->ack_receiver.reset_cpu_mask = 1;
+			connection->worker.reset_cpu_mask = 1;
+		}
+	}
+	err = 0;
+
+fail:
+	free_cpumask_var(new_cpu_mask);
+	return err;
+
+}
+
+struct drbd_resource *drbd_create_resource(const char *name)
+{
+	struct drbd_resource *resource;
+
+	resource = kzalloc(sizeof(struct drbd_resource), GFP_KERNEL);
+	if (!resource)
+		goto fail;
+	resource->name = kstrdup(name, GFP_KERNEL);
+	if (!resource->name)
+		goto fail_free_resource;
+	if (!zalloc_cpumask_var(&resource->cpu_mask, GFP_KERNEL))
+		goto fail_free_name;
+	kref_init(&resource->kref);
+	idr_init(&resource->devices);
+	INIT_LIST_HEAD(&resource->connections);
+	resource->write_ordering = WO_BDEV_FLUSH;
+	list_add_tail_rcu(&resource->resources, &drbd_resources);
+	mutex_init(&resource->conf_update);
+	mutex_init(&resource->adm_mutex);
+	spin_lock_init(&resource->req_lock);
+	drbd_debugfs_resource_add(resource);
+	return resource;
+
+fail_free_name:
+	kfree(resource->name);
+fail_free_resource:
+	kfree(resource);
+fail:
+	return NULL;
+}
+
+/* caller must be under adm_mutex */
+struct drbd_connection *conn_create(const char *name, struct res_opts *res_opts)
+{
+	struct drbd_resource *resource;
+	struct drbd_connection *connection;
+
+	connection = kzalloc(sizeof(struct drbd_connection), GFP_KERNEL);
+	if (!connection)
+		return NULL;
+
+	if (drbd_alloc_socket(&connection->data))
+		goto fail;
+	if (drbd_alloc_socket(&connection->meta))
+		goto fail;
+
+	connection->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
+	if (!connection->current_epoch)
+		goto fail;
+
+	INIT_LIST_HEAD(&connection->transfer_log);
+
+	INIT_LIST_HEAD(&connection->current_epoch->list);
+	connection->epochs = 1;
+	spin_lock_init(&connection->epoch_lock);
+
+	connection->send.seen_any_write_yet = false;
+	connection->send.current_epoch_nr = 0;
+	connection->send.current_epoch_writes = 0;
+
+	resource = drbd_create_resource(name);
+	if (!resource)
+		goto fail;
+
+	connection->cstate = C_STANDALONE;
+	mutex_init(&connection->cstate_mutex);
+	init_waitqueue_head(&connection->ping_wait);
+	idr_init(&connection->peer_devices);
+
+	drbd_init_workqueue(&connection->sender_work);
+	mutex_init(&connection->data.mutex);
+	mutex_init(&connection->meta.mutex);
+
+	drbd_thread_init(resource, &connection->receiver, drbd_receiver, "receiver");
+	connection->receiver.connection = connection;
+	drbd_thread_init(resource, &connection->worker, drbd_worker, "worker");
+	connection->worker.connection = connection;
+	drbd_thread_init(resource, &connection->ack_receiver, drbd_ack_receiver, "ack_recv");
+	connection->ack_receiver.connection = connection;
+
+	kref_init(&connection->kref);
+
+	connection->resource = resource;
+
+	if (set_resource_options(resource, res_opts))
+		goto fail_resource;
+
+	kref_get(&resource->kref);
+	list_add_tail_rcu(&connection->connections, &resource->connections);
+	drbd_debugfs_connection_add(connection);
+	return connection;
+
+fail_resource:
+	list_del(&resource->resources);
+	drbd_free_resource(resource);
+fail:
+	kfree(connection->current_epoch);
+	drbd_free_socket(&connection->meta);
+	drbd_free_socket(&connection->data);
+	kfree(connection);
+	return NULL;
+}
+
+void drbd_destroy_connection(struct kref *kref)
+{
+	struct drbd_connection *connection = container_of(kref, struct drbd_connection, kref);
+	struct drbd_resource *resource = connection->resource;
+
+	if (atomic_read(&connection->current_epoch->epoch_size) !=  0)
+		drbd_err(connection, "epoch_size:%d\n", atomic_read(&connection->current_epoch->epoch_size));
+	kfree(connection->current_epoch);
+
+	idr_destroy(&connection->peer_devices);
+
+	drbd_free_socket(&connection->meta);
+	drbd_free_socket(&connection->data);
+	kfree(connection->int_dig_in);
+	kfree(connection->int_dig_vv);
+	kfree(connection);
+	kref_put(&resource->kref, drbd_destroy_resource);
+}
+
+static int init_submitter(struct drbd_device *device)
+{
+	/* opencoded create_singlethread_workqueue(),
+	 * to be able to say "drbd%d", ..., minor */
+	device->submit.wq =
+		alloc_ordered_workqueue("drbd%u_submit", WQ_MEM_RECLAIM, device->minor);
+	if (!device->submit.wq)
+		return -ENOMEM;
+
+	INIT_WORK(&device->submit.worker, do_submit);
+	INIT_LIST_HEAD(&device->submit.writes);
+	return 0;
+}
+
+enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsigned int minor)
+{
+	struct drbd_resource *resource = adm_ctx->resource;
+	struct drbd_connection *connection, *n;
+	struct drbd_device *device;
+	struct drbd_peer_device *peer_device, *tmp_peer_device;
+	struct gendisk *disk;
+	int id;
+	int vnr = adm_ctx->volume;
+	enum drbd_ret_code err = ERR_NOMEM;
+
+	device = minor_to_device(minor);
+	if (device)
+		return ERR_MINOR_OR_VOLUME_EXISTS;
+
+	/* GFP_KERNEL, we are outside of all write-out paths */
+	device = kzalloc(sizeof(struct drbd_device), GFP_KERNEL);
+	if (!device)
+		return ERR_NOMEM;
+	kref_init(&device->kref);
+
+	kref_get(&resource->kref);
+	device->resource = resource;
+	device->minor = minor;
+	device->vnr = vnr;
+
+	drbd_init_set_defaults(device);
+
+	disk = blk_alloc_disk(NUMA_NO_NODE);
+	if (!disk)
+		goto out_no_disk;
+
+	device->vdisk = disk;
+	device->rq_queue = disk->queue;
+
+	set_disk_ro(disk, true);
+
+	disk->major = DRBD_MAJOR;
+	disk->first_minor = minor;
+	disk->minors = 1;
+	disk->fops = &drbd_ops;
+	disk->flags |= GENHD_FL_NO_PART;
+	sprintf(disk->disk_name, "drbd%d", minor);
+	disk->private_data = device;
+
+	blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue);
+	blk_queue_write_cache(disk->queue, true, true);
+	/* Setting the max_hw_sectors to an odd value of 8kibyte here
+	   This triggers a max_bio_size message upon first attach or connect */
+	blk_queue_max_hw_sectors(disk->queue, DRBD_MAX_BIO_SIZE_SAFE >> 8);
+
+	device->md_io.page = alloc_page(GFP_KERNEL);
+	if (!device->md_io.page)
+		goto out_no_io_page;
+
+	if (drbd_bm_init(device))
+		goto out_no_bitmap;
+	device->read_requests = RB_ROOT;
+	device->write_requests = RB_ROOT;
+
+	id = idr_alloc(&drbd_devices, device, minor, minor + 1, GFP_KERNEL);
+	if (id < 0) {
+		if (id == -ENOSPC)
+			err = ERR_MINOR_OR_VOLUME_EXISTS;
+		goto out_no_minor_idr;
+	}
+	kref_get(&device->kref);
+
+	id = idr_alloc(&resource->devices, device, vnr, vnr + 1, GFP_KERNEL);
+	if (id < 0) {
+		if (id == -ENOSPC)
+			err = ERR_MINOR_OR_VOLUME_EXISTS;
+		goto out_idr_remove_minor;
+	}
+	kref_get(&device->kref);
+
+	INIT_LIST_HEAD(&device->peer_devices);
+	INIT_LIST_HEAD(&device->pending_bitmap_io);
+	for_each_connection(connection, resource) {
+		peer_device = kzalloc(sizeof(struct drbd_peer_device), GFP_KERNEL);
+		if (!peer_device)
+			goto out_idr_remove_from_resource;
+		peer_device->connection = connection;
+		peer_device->device = device;
+
+		list_add(&peer_device->peer_devices, &device->peer_devices);
+		kref_get(&device->kref);
+
+		id = idr_alloc(&connection->peer_devices, peer_device, vnr, vnr + 1, GFP_KERNEL);
+		if (id < 0) {
+			if (id == -ENOSPC)
+				err = ERR_INVALID_REQUEST;
+			goto out_idr_remove_from_resource;
+		}
+		kref_get(&connection->kref);
+		INIT_WORK(&peer_device->send_acks_work, drbd_send_acks_wf);
+	}
+
+	if (init_submitter(device)) {
+		err = ERR_NOMEM;
+		goto out_idr_remove_from_resource;
+	}
+
+	err = add_disk(disk);
+	if (err)
+		goto out_destroy_workqueue;
+
+	/* inherit the connection state */
+	device->state.conn = first_connection(resource)->cstate;
+	if (device->state.conn == C_WF_REPORT_PARAMS) {
+		for_each_peer_device(peer_device, device)
+			drbd_connected(peer_device);
+	}
+	/* move to create_peer_device() */
+	for_each_peer_device(peer_device, device)
+		drbd_debugfs_peer_device_add(peer_device);
+	drbd_debugfs_device_add(device);
+	return NO_ERROR;
+
+out_destroy_workqueue:
+	destroy_workqueue(device->submit.wq);
+out_idr_remove_from_resource:
+	for_each_connection_safe(connection, n, resource) {
+		peer_device = idr_remove(&connection->peer_devices, vnr);
+		if (peer_device)
+			kref_put(&connection->kref, drbd_destroy_connection);
+	}
+	for_each_peer_device_safe(peer_device, tmp_peer_device, device) {
+		list_del(&peer_device->peer_devices);
+		kfree(peer_device);
+	}
+	idr_remove(&resource->devices, vnr);
+out_idr_remove_minor:
+	idr_remove(&drbd_devices, minor);
+	synchronize_rcu();
+out_no_minor_idr:
+	drbd_bm_cleanup(device);
+out_no_bitmap:
+	__free_page(device->md_io.page);
+out_no_io_page:
+	put_disk(disk);
+out_no_disk:
+	kref_put(&resource->kref, drbd_destroy_resource);
+	kfree(device);
+	return err;
+}
+
+void drbd_delete_device(struct drbd_device *device)
+{
+	struct drbd_resource *resource = device->resource;
+	struct drbd_connection *connection;
+	struct drbd_peer_device *peer_device;
+
+	/* move to free_peer_device() */
+	for_each_peer_device(peer_device, device)
+		drbd_debugfs_peer_device_cleanup(peer_device);
+	drbd_debugfs_device_cleanup(device);
+	for_each_connection(connection, resource) {
+		idr_remove(&connection->peer_devices, device->vnr);
+		kref_put(&device->kref, drbd_destroy_device);
+	}
+	idr_remove(&resource->devices, device->vnr);
+	kref_put(&device->kref, drbd_destroy_device);
+	idr_remove(&drbd_devices, device_to_minor(device));
+	kref_put(&device->kref, drbd_destroy_device);
+	del_gendisk(device->vdisk);
+	synchronize_rcu();
+	kref_put(&device->kref, drbd_destroy_device);
+}
+
+static int __init drbd_init(void)
+{
+	int err;
+
+	if (drbd_minor_count < DRBD_MINOR_COUNT_MIN || drbd_minor_count > DRBD_MINOR_COUNT_MAX) {
+		pr_err("invalid minor_count (%d)\n", drbd_minor_count);
+#ifdef MODULE
+		return -EINVAL;
+#else
+		drbd_minor_count = DRBD_MINOR_COUNT_DEF;
+#endif
+	}
+
+	err = register_blkdev(DRBD_MAJOR, "drbd");
+	if (err) {
+		pr_err("unable to register block device major %d\n",
+		       DRBD_MAJOR);
+		return err;
+	}
+
+	/*
+	 * allocate all necessary structs
+	 */
+	init_waitqueue_head(&drbd_pp_wait);
+
+	drbd_proc = NULL; /* play safe for drbd_cleanup */
+	idr_init(&drbd_devices);
+
+	mutex_init(&resources_mutex);
+	INIT_LIST_HEAD(&drbd_resources);
+
+	err = drbd_genl_register();
+	if (err) {
+		pr_err("unable to register generic netlink family\n");
+		goto fail;
+	}
+
+	err = drbd_create_mempools();
+	if (err)
+		goto fail;
+
+	err = -ENOMEM;
+	drbd_proc = proc_create_single("drbd", S_IFREG | 0444 , NULL, drbd_seq_show);
+	if (!drbd_proc)	{
+		pr_err("unable to register proc file\n");
+		goto fail;
+	}
+
+	retry.wq = create_singlethread_workqueue("drbd-reissue");
+	if (!retry.wq) {
+		pr_err("unable to create retry workqueue\n");
+		goto fail;
+	}
+	INIT_WORK(&retry.worker, do_retry);
+	spin_lock_init(&retry.lock);
+	INIT_LIST_HEAD(&retry.writes);
+
+	drbd_debugfs_init();
+
+	pr_info("initialized. "
+	       "Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
+	       API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX);
+	pr_info("%s\n", drbd_buildtag());
+	pr_info("registered as block device major %d\n", DRBD_MAJOR);
+	return 0; /* Success! */
+
+fail:
+	drbd_cleanup();
+	if (err == -ENOMEM)
+		pr_err("ran out of memory\n");
+	else
+		pr_err("initialization failure\n");
+	return err;
+}
+
+static void drbd_free_one_sock(struct drbd_socket *ds)
+{
+	struct socket *s;
+	mutex_lock(&ds->mutex);
+	s = ds->socket;
+	ds->socket = NULL;
+	mutex_unlock(&ds->mutex);
+	if (s) {
+		/* so debugfs does not need to mutex_lock() */
+		synchronize_rcu();
+		kernel_sock_shutdown(s, SHUT_RDWR);
+		sock_release(s);
+	}
+}
+
+void drbd_free_sock(struct drbd_connection *connection)
+{
+	if (connection->data.socket)
+		drbd_free_one_sock(&connection->data);
+	if (connection->meta.socket)
+		drbd_free_one_sock(&connection->meta);
+}
+
+/* meta data management */
+
+void conn_md_sync(struct drbd_connection *connection)
+{
+	struct drbd_peer_device *peer_device;
+	int vnr;
+
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+		struct drbd_device *device = peer_device->device;
+
+		kref_get(&device->kref);
+		rcu_read_unlock();
+		drbd_md_sync(device);
+		kref_put(&device->kref, drbd_destroy_device);
+		rcu_read_lock();
+	}
+	rcu_read_unlock();
+}
+
+/* aligned 4kByte */
+struct meta_data_on_disk {
+	u64 la_size_sect;      /* last agreed size. */
+	u64 uuid[UI_SIZE];   /* UUIDs. */
+	u64 device_uuid;
+	u64 reserved_u64_1;
+	u32 flags;             /* MDF */
+	u32 magic;
+	u32 md_size_sect;
+	u32 al_offset;         /* offset to this block */
+	u32 al_nr_extents;     /* important for restoring the AL (userspace) */
+	      /* `-- act_log->nr_elements <-- ldev->dc.al_extents */
+	u32 bm_offset;         /* offset to the bitmap, from here */
+	u32 bm_bytes_per_bit;  /* BM_BLOCK_SIZE */
+	u32 la_peer_max_bio_size;   /* last peer max_bio_size */
+
+	/* see al_tr_number_to_on_disk_sector() */
+	u32 al_stripes;
+	u32 al_stripe_size_4k;
+
+	u8 reserved_u8[4096 - (7*8 + 10*4)];
+} __packed;
+
+
+
+void drbd_md_write(struct drbd_device *device, void *b)
+{
+	struct meta_data_on_disk *buffer = b;
+	sector_t sector;
+	int i;
+
+	memset(buffer, 0, sizeof(*buffer));
+
+	buffer->la_size_sect = cpu_to_be64(get_capacity(device->vdisk));
+	for (i = UI_CURRENT; i < UI_SIZE; i++)
+		buffer->uuid[i] = cpu_to_be64(device->ldev->md.uuid[i]);
+	buffer->flags = cpu_to_be32(device->ldev->md.flags);
+	buffer->magic = cpu_to_be32(DRBD_MD_MAGIC_84_UNCLEAN);
+
+	buffer->md_size_sect  = cpu_to_be32(device->ldev->md.md_size_sect);
+	buffer->al_offset     = cpu_to_be32(device->ldev->md.al_offset);
+	buffer->al_nr_extents = cpu_to_be32(device->act_log->nr_elements);
+	buffer->bm_bytes_per_bit = cpu_to_be32(BM_BLOCK_SIZE);
+	buffer->device_uuid = cpu_to_be64(device->ldev->md.device_uuid);
+
+	buffer->bm_offset = cpu_to_be32(device->ldev->md.bm_offset);
+	buffer->la_peer_max_bio_size = cpu_to_be32(device->peer_max_bio_size);
+
+	buffer->al_stripes = cpu_to_be32(device->ldev->md.al_stripes);
+	buffer->al_stripe_size_4k = cpu_to_be32(device->ldev->md.al_stripe_size_4k);
+
+	D_ASSERT(device, drbd_md_ss(device->ldev) == device->ldev->md.md_offset);
+	sector = device->ldev->md.md_offset;
+
+	if (drbd_md_sync_page_io(device, device->ldev, sector, REQ_OP_WRITE)) {
+		/* this was a try anyways ... */
+		drbd_err(device, "meta data update failed!\n");
+		drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR);
+	}
+}
+
+/**
+ * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
+ * @device:	DRBD device.
+ */
+void drbd_md_sync(struct drbd_device *device)
+{
+	struct meta_data_on_disk *buffer;
+
+	/* Don't accidentally change the DRBD meta data layout. */
+	BUILD_BUG_ON(UI_SIZE != 4);
+	BUILD_BUG_ON(sizeof(struct meta_data_on_disk) != 4096);
+
+	del_timer(&device->md_sync_timer);
+	/* timer may be rearmed by drbd_md_mark_dirty() now. */
+	if (!test_and_clear_bit(MD_DIRTY, &device->flags))
+		return;
+
+	/* We use here D_FAILED and not D_ATTACHING because we try to write
+	 * metadata even if we detach due to a disk failure! */
+	if (!get_ldev_if_state(device, D_FAILED))
+		return;
+
+	buffer = drbd_md_get_buffer(device, __func__);
+	if (!buffer)
+		goto out;
+
+	drbd_md_write(device, buffer);
+
+	/* Update device->ldev->md.la_size_sect,
+	 * since we updated it on metadata. */
+	device->ldev->md.la_size_sect = get_capacity(device->vdisk);
+
+	drbd_md_put_buffer(device);
+out:
+	put_ldev(device);
+}
+
+static int check_activity_log_stripe_size(struct drbd_device *device,
+		struct meta_data_on_disk *on_disk,
+		struct drbd_md *in_core)
+{
+	u32 al_stripes = be32_to_cpu(on_disk->al_stripes);
+	u32 al_stripe_size_4k = be32_to_cpu(on_disk->al_stripe_size_4k);
+	u64 al_size_4k;
+
+	/* both not set: default to old fixed size activity log */
+	if (al_stripes == 0 && al_stripe_size_4k == 0) {
+		al_stripes = 1;
+		al_stripe_size_4k = MD_32kB_SECT/8;
+	}
+
+	/* some paranoia plausibility checks */
+
+	/* we need both values to be set */
+	if (al_stripes == 0 || al_stripe_size_4k == 0)
+		goto err;
+
+	al_size_4k = (u64)al_stripes * al_stripe_size_4k;
+
+	/* Upper limit of activity log area, to avoid potential overflow
+	 * problems in al_tr_number_to_on_disk_sector(). As right now, more
+	 * than 72 * 4k blocks total only increases the amount of history,
+	 * limiting this arbitrarily to 16 GB is not a real limitation ;-)  */
+	if (al_size_4k > (16 * 1024 * 1024/4))
+		goto err;
+
+	/* Lower limit: we need at least 8 transaction slots (32kB)
+	 * to not break existing setups */
+	if (al_size_4k < MD_32kB_SECT/8)
+		goto err;
+
+	in_core->al_stripe_size_4k = al_stripe_size_4k;
+	in_core->al_stripes = al_stripes;
+	in_core->al_size_4k = al_size_4k;
+
+	return 0;
+err:
+	drbd_err(device, "invalid activity log striping: al_stripes=%u, al_stripe_size_4k=%u\n",
+			al_stripes, al_stripe_size_4k);
+	return -EINVAL;
+}
+
+static int check_offsets_and_sizes(struct drbd_device *device, struct drbd_backing_dev *bdev)
+{
+	sector_t capacity = drbd_get_capacity(bdev->md_bdev);
+	struct drbd_md *in_core = &bdev->md;
+	s32 on_disk_al_sect;
+	s32 on_disk_bm_sect;
+
+	/* The on-disk size of the activity log, calculated from offsets, and
+	 * the size of the activity log calculated from the stripe settings,
+	 * should match.
+	 * Though we could relax this a bit: it is ok, if the striped activity log
+	 * fits in the available on-disk activity log size.
+	 * Right now, that would break how resize is implemented.
+	 * TODO: make drbd_determine_dev_size() (and the drbdmeta tool) aware
+	 * of possible unused padding space in the on disk layout. */
+	if (in_core->al_offset < 0) {
+		if (in_core->bm_offset > in_core->al_offset)
+			goto err;
+		on_disk_al_sect = -in_core->al_offset;
+		on_disk_bm_sect = in_core->al_offset - in_core->bm_offset;
+	} else {
+		if (in_core->al_offset != MD_4kB_SECT)
+			goto err;
+		if (in_core->bm_offset < in_core->al_offset + in_core->al_size_4k * MD_4kB_SECT)
+			goto err;
+
+		on_disk_al_sect = in_core->bm_offset - MD_4kB_SECT;
+		on_disk_bm_sect = in_core->md_size_sect - in_core->bm_offset;
+	}
+
+	/* old fixed size meta data is exactly that: fixed. */
+	if (in_core->meta_dev_idx >= 0) {
+		if (in_core->md_size_sect != MD_128MB_SECT
+		||  in_core->al_offset != MD_4kB_SECT
+		||  in_core->bm_offset != MD_4kB_SECT + MD_32kB_SECT
+		||  in_core->al_stripes != 1
+		||  in_core->al_stripe_size_4k != MD_32kB_SECT/8)
+			goto err;
+	}
+
+	if (capacity < in_core->md_size_sect)
+		goto err;
+	if (capacity - in_core->md_size_sect < drbd_md_first_sector(bdev))
+		goto err;
+
+	/* should be aligned, and at least 32k */
+	if ((on_disk_al_sect & 7) || (on_disk_al_sect < MD_32kB_SECT))
+		goto err;
+
+	/* should fit (for now: exactly) into the available on-disk space;
+	 * overflow prevention is in check_activity_log_stripe_size() above. */
+	if (on_disk_al_sect != in_core->al_size_4k * MD_4kB_SECT)
+		goto err;
+
+	/* again, should be aligned */
+	if (in_core->bm_offset & 7)
+		goto err;
+
+	/* FIXME check for device grow with flex external meta data? */
+
+	/* can the available bitmap space cover the last agreed device size? */
+	if (on_disk_bm_sect < (in_core->la_size_sect+7)/MD_4kB_SECT/8/512)
+		goto err;
+
+	return 0;
+
+err:
+	drbd_err(device, "meta data offsets don't make sense: idx=%d "
+			"al_s=%u, al_sz4k=%u, al_offset=%d, bm_offset=%d, "
+			"md_size_sect=%u, la_size=%llu, md_capacity=%llu\n",
+			in_core->meta_dev_idx,
+			in_core->al_stripes, in_core->al_stripe_size_4k,
+			in_core->al_offset, in_core->bm_offset, in_core->md_size_sect,
+			(unsigned long long)in_core->la_size_sect,
+			(unsigned long long)capacity);
+
+	return -EINVAL;
+}
+
+
+/**
+ * drbd_md_read() - Reads in the meta data super block
+ * @device:	DRBD device.
+ * @bdev:	Device from which the meta data should be read in.
+ *
+ * Return NO_ERROR on success, and an enum drbd_ret_code in case
+ * something goes wrong.
+ *
+ * Called exactly once during drbd_adm_attach(), while still being D_DISKLESS,
+ * even before @bdev is assigned to @device->ldev.
+ */
+int drbd_md_read(struct drbd_device *device, struct drbd_backing_dev *bdev)
+{
+	struct meta_data_on_disk *buffer;
+	u32 magic, flags;
+	int i, rv = NO_ERROR;
+
+	if (device->state.disk != D_DISKLESS)
+		return ERR_DISK_CONFIGURED;
+
+	buffer = drbd_md_get_buffer(device, __func__);
+	if (!buffer)
+		return ERR_NOMEM;
+
+	/* First, figure out where our meta data superblock is located,
+	 * and read it. */
+	bdev->md.meta_dev_idx = bdev->disk_conf->meta_dev_idx;
+	bdev->md.md_offset = drbd_md_ss(bdev);
+	/* Even for (flexible or indexed) external meta data,
+	 * initially restrict us to the 4k superblock for now.
+	 * Affects the paranoia out-of-range access check in drbd_md_sync_page_io(). */
+	bdev->md.md_size_sect = 8;
+
+	if (drbd_md_sync_page_io(device, bdev, bdev->md.md_offset,
+				 REQ_OP_READ)) {
+		/* NOTE: can't do normal error processing here as this is
+		   called BEFORE disk is attached */
+		drbd_err(device, "Error while reading metadata.\n");
+		rv = ERR_IO_MD_DISK;
+		goto err;
+	}
+
+	magic = be32_to_cpu(buffer->magic);
+	flags = be32_to_cpu(buffer->flags);
+	if (magic == DRBD_MD_MAGIC_84_UNCLEAN ||
+	    (magic == DRBD_MD_MAGIC_08 && !(flags & MDF_AL_CLEAN))) {
+			/* btw: that's Activity Log clean, not "all" clean. */
+		drbd_err(device, "Found unclean meta data. Did you \"drbdadm apply-al\"?\n");
+		rv = ERR_MD_UNCLEAN;
+		goto err;
+	}
+
+	rv = ERR_MD_INVALID;
+	if (magic != DRBD_MD_MAGIC_08) {
+		if (magic == DRBD_MD_MAGIC_07)
+			drbd_err(device, "Found old (0.7) meta data magic. Did you \"drbdadm create-md\"?\n");
+		else
+			drbd_err(device, "Meta data magic not found. Did you \"drbdadm create-md\"?\n");
+		goto err;
+	}
+
+	if (be32_to_cpu(buffer->bm_bytes_per_bit) != BM_BLOCK_SIZE) {
+		drbd_err(device, "unexpected bm_bytes_per_bit: %u (expected %u)\n",
+		    be32_to_cpu(buffer->bm_bytes_per_bit), BM_BLOCK_SIZE);
+		goto err;
+	}
+
+
+	/* convert to in_core endian */
+	bdev->md.la_size_sect = be64_to_cpu(buffer->la_size_sect);
+	for (i = UI_CURRENT; i < UI_SIZE; i++)
+		bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
+	bdev->md.flags = be32_to_cpu(buffer->flags);
+	bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
+
+	bdev->md.md_size_sect = be32_to_cpu(buffer->md_size_sect);
+	bdev->md.al_offset = be32_to_cpu(buffer->al_offset);
+	bdev->md.bm_offset = be32_to_cpu(buffer->bm_offset);
+
+	if (check_activity_log_stripe_size(device, buffer, &bdev->md))
+		goto err;
+	if (check_offsets_and_sizes(device, bdev))
+		goto err;
+
+	if (be32_to_cpu(buffer->bm_offset) != bdev->md.bm_offset) {
+		drbd_err(device, "unexpected bm_offset: %d (expected %d)\n",
+		    be32_to_cpu(buffer->bm_offset), bdev->md.bm_offset);
+		goto err;
+	}
+	if (be32_to_cpu(buffer->md_size_sect) != bdev->md.md_size_sect) {
+		drbd_err(device, "unexpected md_size: %u (expected %u)\n",
+		    be32_to_cpu(buffer->md_size_sect), bdev->md.md_size_sect);
+		goto err;
+	}
+
+	rv = NO_ERROR;
+
+	spin_lock_irq(&device->resource->req_lock);
+	if (device->state.conn < C_CONNECTED) {
+		unsigned int peer;
+		peer = be32_to_cpu(buffer->la_peer_max_bio_size);
+		peer = max(peer, DRBD_MAX_BIO_SIZE_SAFE);
+		device->peer_max_bio_size = peer;
+	}
+	spin_unlock_irq(&device->resource->req_lock);
+
+ err:
+	drbd_md_put_buffer(device);
+
+	return rv;
+}
+
+/**
+ * drbd_md_mark_dirty() - Mark meta data super block as dirty
+ * @device:	DRBD device.
+ *
+ * Call this function if you change anything that should be written to
+ * the meta-data super block. This function sets MD_DIRTY, and starts a
+ * timer that ensures that within five seconds you have to call drbd_md_sync().
+ */
+void drbd_md_mark_dirty(struct drbd_device *device)
+{
+	if (!test_and_set_bit(MD_DIRTY, &device->flags))
+		mod_timer(&device->md_sync_timer, jiffies + 5*HZ);
+}
+
+void drbd_uuid_move_history(struct drbd_device *device) __must_hold(local)
+{
+	int i;
+
+	for (i = UI_HISTORY_START; i < UI_HISTORY_END; i++)
+		device->ldev->md.uuid[i+1] = device->ldev->md.uuid[i];
+}
+
+void __drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local)
+{
+	if (idx == UI_CURRENT) {
+		if (device->state.role == R_PRIMARY)
+			val |= 1;
+		else
+			val &= ~((u64)1);
+
+		drbd_set_ed_uuid(device, val);
+	}
+
+	device->ldev->md.uuid[idx] = val;
+	drbd_md_mark_dirty(device);
+}
+
+void _drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&device->ldev->md.uuid_lock, flags);
+	__drbd_uuid_set(device, idx, val);
+	spin_unlock_irqrestore(&device->ldev->md.uuid_lock, flags);
+}
+
+void drbd_uuid_set(struct drbd_device *device, int idx, u64 val) __must_hold(local)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&device->ldev->md.uuid_lock, flags);
+	if (device->ldev->md.uuid[idx]) {
+		drbd_uuid_move_history(device);
+		device->ldev->md.uuid[UI_HISTORY_START] = device->ldev->md.uuid[idx];
+	}
+	__drbd_uuid_set(device, idx, val);
+	spin_unlock_irqrestore(&device->ldev->md.uuid_lock, flags);
+}
+
+/**
+ * drbd_uuid_new_current() - Creates a new current UUID
+ * @device:	DRBD device.
+ *
+ * Creates a new current UUID, and rotates the old current UUID into
+ * the bitmap slot. Causes an incremental resync upon next connect.
+ */
+void drbd_uuid_new_current(struct drbd_device *device) __must_hold(local)
+{
+	u64 val;
+	unsigned long long bm_uuid;
+
+	get_random_bytes(&val, sizeof(u64));
+
+	spin_lock_irq(&device->ldev->md.uuid_lock);
+	bm_uuid = device->ldev->md.uuid[UI_BITMAP];
+
+	if (bm_uuid)
+		drbd_warn(device, "bm UUID was already set: %llX\n", bm_uuid);
+
+	device->ldev->md.uuid[UI_BITMAP] = device->ldev->md.uuid[UI_CURRENT];
+	__drbd_uuid_set(device, UI_CURRENT, val);
+	spin_unlock_irq(&device->ldev->md.uuid_lock);
+
+	drbd_print_uuids(device, "new current UUID");
+	/* get it to stable storage _now_ */
+	drbd_md_sync(device);
+}
+
+void drbd_uuid_set_bm(struct drbd_device *device, u64 val) __must_hold(local)
+{
+	unsigned long flags;
+	if (device->ldev->md.uuid[UI_BITMAP] == 0 && val == 0)
+		return;
+
+	spin_lock_irqsave(&device->ldev->md.uuid_lock, flags);
+	if (val == 0) {
+		drbd_uuid_move_history(device);
+		device->ldev->md.uuid[UI_HISTORY_START] = device->ldev->md.uuid[UI_BITMAP];
+		device->ldev->md.uuid[UI_BITMAP] = 0;
+	} else {
+		unsigned long long bm_uuid = device->ldev->md.uuid[UI_BITMAP];
+		if (bm_uuid)
+			drbd_warn(device, "bm UUID was already set: %llX\n", bm_uuid);
+
+		device->ldev->md.uuid[UI_BITMAP] = val & ~((u64)1);
+	}
+	spin_unlock_irqrestore(&device->ldev->md.uuid_lock, flags);
+
+	drbd_md_mark_dirty(device);
+}
+
+/**
+ * drbd_bmio_set_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
+ * @device:	DRBD device.
+ *
+ * Sets all bits in the bitmap and writes the whole bitmap to stable storage.
+ */
+int drbd_bmio_set_n_write(struct drbd_device *device) __must_hold(local)
+{
+	int rv = -EIO;
+
+	drbd_md_set_flag(device, MDF_FULL_SYNC);
+	drbd_md_sync(device);
+	drbd_bm_set_all(device);
+
+	rv = drbd_bm_write(device);
+
+	if (!rv) {
+		drbd_md_clear_flag(device, MDF_FULL_SYNC);
+		drbd_md_sync(device);
+	}
+
+	return rv;
+}
+
+/**
+ * drbd_bmio_clear_n_write() - io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io()
+ * @device:	DRBD device.
+ *
+ * Clears all bits in the bitmap and writes the whole bitmap to stable storage.
+ */
+int drbd_bmio_clear_n_write(struct drbd_device *device) __must_hold(local)
+{
+	drbd_resume_al(device);
+	drbd_bm_clear_all(device);
+	return drbd_bm_write(device);
+}
+
+static int w_bitmap_io(struct drbd_work *w, int unused)
+{
+	struct drbd_device *device =
+		container_of(w, struct drbd_device, bm_io_work.w);
+	struct bm_io_work *work = &device->bm_io_work;
+	int rv = -EIO;
+
+	if (work->flags != BM_LOCKED_CHANGE_ALLOWED) {
+		int cnt = atomic_read(&device->ap_bio_cnt);
+		if (cnt)
+			drbd_err(device, "FIXME: ap_bio_cnt %d, expected 0; queued for '%s'\n",
+					cnt, work->why);
+	}
+
+	if (get_ldev(device)) {
+		drbd_bm_lock(device, work->why, work->flags);
+		rv = work->io_fn(device);
+		drbd_bm_unlock(device);
+		put_ldev(device);
+	}
+
+	clear_bit_unlock(BITMAP_IO, &device->flags);
+	wake_up(&device->misc_wait);
+
+	if (work->done)
+		work->done(device, rv);
+
+	clear_bit(BITMAP_IO_QUEUED, &device->flags);
+	work->why = NULL;
+	work->flags = 0;
+
+	return 0;
+}
+
+/**
+ * drbd_queue_bitmap_io() - Queues an IO operation on the whole bitmap
+ * @device:	DRBD device.
+ * @io_fn:	IO callback to be called when bitmap IO is possible
+ * @done:	callback to be called after the bitmap IO was performed
+ * @why:	Descriptive text of the reason for doing the IO
+ * @flags:	Bitmap flags
+ *
+ * While IO on the bitmap happens we freeze application IO thus we ensure
+ * that drbd_set_out_of_sync() can not be called. This function MAY ONLY be
+ * called from worker context. It MUST NOT be used while a previous such
+ * work is still pending!
+ *
+ * Its worker function encloses the call of io_fn() by get_ldev() and
+ * put_ldev().
+ */
+void drbd_queue_bitmap_io(struct drbd_device *device,
+			  int (*io_fn)(struct drbd_device *),
+			  void (*done)(struct drbd_device *, int),
+			  char *why, enum bm_flag flags)
+{
+	D_ASSERT(device, current == first_peer_device(device)->connection->worker.task);
+
+	D_ASSERT(device, !test_bit(BITMAP_IO_QUEUED, &device->flags));
+	D_ASSERT(device, !test_bit(BITMAP_IO, &device->flags));
+	D_ASSERT(device, list_empty(&device->bm_io_work.w.list));
+	if (device->bm_io_work.why)
+		drbd_err(device, "FIXME going to queue '%s' but '%s' still pending?\n",
+			why, device->bm_io_work.why);
+
+	device->bm_io_work.io_fn = io_fn;
+	device->bm_io_work.done = done;
+	device->bm_io_work.why = why;
+	device->bm_io_work.flags = flags;
+
+	spin_lock_irq(&device->resource->req_lock);
+	set_bit(BITMAP_IO, &device->flags);
+	/* don't wait for pending application IO if the caller indicates that
+	 * application IO does not conflict anyways. */
+	if (flags == BM_LOCKED_CHANGE_ALLOWED || atomic_read(&device->ap_bio_cnt) == 0) {
+		if (!test_and_set_bit(BITMAP_IO_QUEUED, &device->flags))
+			drbd_queue_work(&first_peer_device(device)->connection->sender_work,
+					&device->bm_io_work.w);
+	}
+	spin_unlock_irq(&device->resource->req_lock);
+}
+
+/**
+ * drbd_bitmap_io() -  Does an IO operation on the whole bitmap
+ * @device:	DRBD device.
+ * @io_fn:	IO callback to be called when bitmap IO is possible
+ * @why:	Descriptive text of the reason for doing the IO
+ * @flags:	Bitmap flags
+ *
+ * freezes application IO while that the actual IO operations runs. This
+ * functions MAY NOT be called from worker context.
+ */
+int drbd_bitmap_io(struct drbd_device *device, int (*io_fn)(struct drbd_device *),
+		char *why, enum bm_flag flags)
+{
+	/* Only suspend io, if some operation is supposed to be locked out */
+	const bool do_suspend_io = flags & (BM_DONT_CLEAR|BM_DONT_SET|BM_DONT_TEST);
+	int rv;
+
+	D_ASSERT(device, current != first_peer_device(device)->connection->worker.task);
+
+	if (do_suspend_io)
+		drbd_suspend_io(device);
+
+	drbd_bm_lock(device, why, flags);
+	rv = io_fn(device);
+	drbd_bm_unlock(device);
+
+	if (do_suspend_io)
+		drbd_resume_io(device);
+
+	return rv;
+}
+
+void drbd_md_set_flag(struct drbd_device *device, int flag) __must_hold(local)
+{
+	if ((device->ldev->md.flags & flag) != flag) {
+		drbd_md_mark_dirty(device);
+		device->ldev->md.flags |= flag;
+	}
+}
+
+void drbd_md_clear_flag(struct drbd_device *device, int flag) __must_hold(local)
+{
+	if ((device->ldev->md.flags & flag) != 0) {
+		drbd_md_mark_dirty(device);
+		device->ldev->md.flags &= ~flag;
+	}
+}
+int drbd_md_test_flag(struct drbd_backing_dev *bdev, int flag)
+{
+	return (bdev->md.flags & flag) != 0;
+}
+
+static void md_sync_timer_fn(struct timer_list *t)
+{
+	struct drbd_device *device = from_timer(device, t, md_sync_timer);
+	drbd_device_post_work(device, MD_SYNC);
+}
+
+const char *cmdname(enum drbd_packet cmd)
+{
+	/* THINK may need to become several global tables
+	 * when we want to support more than
+	 * one PRO_VERSION */
+	static const char *cmdnames[] = {
+
+		[P_DATA]	        = "Data",
+		[P_DATA_REPLY]	        = "DataReply",
+		[P_RS_DATA_REPLY]	= "RSDataReply",
+		[P_BARRIER]	        = "Barrier",
+		[P_BITMAP]	        = "ReportBitMap",
+		[P_BECOME_SYNC_TARGET]  = "BecomeSyncTarget",
+		[P_BECOME_SYNC_SOURCE]  = "BecomeSyncSource",
+		[P_UNPLUG_REMOTE]	= "UnplugRemote",
+		[P_DATA_REQUEST]	= "DataRequest",
+		[P_RS_DATA_REQUEST]     = "RSDataRequest",
+		[P_SYNC_PARAM]	        = "SyncParam",
+		[P_PROTOCOL]            = "ReportProtocol",
+		[P_UUIDS]	        = "ReportUUIDs",
+		[P_SIZES]	        = "ReportSizes",
+		[P_STATE]	        = "ReportState",
+		[P_SYNC_UUID]           = "ReportSyncUUID",
+		[P_AUTH_CHALLENGE]      = "AuthChallenge",
+		[P_AUTH_RESPONSE]	= "AuthResponse",
+		[P_STATE_CHG_REQ]       = "StateChgRequest",
+		[P_PING]		= "Ping",
+		[P_PING_ACK]	        = "PingAck",
+		[P_RECV_ACK]	        = "RecvAck",
+		[P_WRITE_ACK]	        = "WriteAck",
+		[P_RS_WRITE_ACK]	= "RSWriteAck",
+		[P_SUPERSEDED]          = "Superseded",
+		[P_NEG_ACK]	        = "NegAck",
+		[P_NEG_DREPLY]	        = "NegDReply",
+		[P_NEG_RS_DREPLY]	= "NegRSDReply",
+		[P_BARRIER_ACK]	        = "BarrierAck",
+		[P_STATE_CHG_REPLY]     = "StateChgReply",
+		[P_OV_REQUEST]          = "OVRequest",
+		[P_OV_REPLY]            = "OVReply",
+		[P_OV_RESULT]           = "OVResult",
+		[P_CSUM_RS_REQUEST]     = "CsumRSRequest",
+		[P_RS_IS_IN_SYNC]	= "CsumRSIsInSync",
+		[P_SYNC_PARAM89]	= "SyncParam89",
+		[P_COMPRESSED_BITMAP]   = "CBitmap",
+		[P_DELAY_PROBE]         = "DelayProbe",
+		[P_OUT_OF_SYNC]		= "OutOfSync",
+		[P_RS_CANCEL]		= "RSCancel",
+		[P_CONN_ST_CHG_REQ]	= "conn_st_chg_req",
+		[P_CONN_ST_CHG_REPLY]	= "conn_st_chg_reply",
+		[P_PROTOCOL_UPDATE]	= "protocol_update",
+		[P_TRIM]	        = "Trim",
+		[P_RS_THIN_REQ]         = "rs_thin_req",
+		[P_RS_DEALLOCATED]      = "rs_deallocated",
+		[P_WSAME]	        = "WriteSame",
+		[P_ZEROES]		= "Zeroes",
+
+		/* enum drbd_packet, but not commands - obsoleted flags:
+		 *	P_MAY_IGNORE
+		 *	P_MAX_OPT_CMD
+		 */
+	};
+
+	/* too big for the array: 0xfffX */
+	if (cmd == P_INITIAL_META)
+		return "InitialMeta";
+	if (cmd == P_INITIAL_DATA)
+		return "InitialData";
+	if (cmd == P_CONNECTION_FEATURES)
+		return "ConnectionFeatures";
+	if (cmd >= ARRAY_SIZE(cmdnames))
+		return "Unknown";
+	return cmdnames[cmd];
+}
+
+/**
+ * drbd_wait_misc  -  wait for a request to make progress
+ * @device:	device associated with the request
+ * @i:		the struct drbd_interval embedded in struct drbd_request or
+ *		struct drbd_peer_request
+ */
+int drbd_wait_misc(struct drbd_device *device, struct drbd_interval *i)
+{
+	struct net_conf *nc;
+	DEFINE_WAIT(wait);
+	long timeout;
+
+	rcu_read_lock();
+	nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+	if (!nc) {
+		rcu_read_unlock();
+		return -ETIMEDOUT;
+	}
+	timeout = nc->ko_count ? nc->timeout * HZ / 10 * nc->ko_count : MAX_SCHEDULE_TIMEOUT;
+	rcu_read_unlock();
+
+	/* Indicate to wake up device->misc_wait on progress.  */
+	i->waiting = true;
+	prepare_to_wait(&device->misc_wait, &wait, TASK_INTERRUPTIBLE);
+	spin_unlock_irq(&device->resource->req_lock);
+	timeout = schedule_timeout(timeout);
+	finish_wait(&device->misc_wait, &wait);
+	spin_lock_irq(&device->resource->req_lock);
+	if (!timeout || device->state.conn < C_CONNECTED)
+		return -ETIMEDOUT;
+	if (signal_pending(current))
+		return -ERESTARTSYS;
+	return 0;
+}
+
+void lock_all_resources(void)
+{
+	struct drbd_resource *resource;
+	int __maybe_unused i = 0;
+
+	mutex_lock(&resources_mutex);
+	local_irq_disable();
+	for_each_resource(resource, &drbd_resources)
+		spin_lock_nested(&resource->req_lock, i++);
+}
+
+void unlock_all_resources(void)
+{
+	struct drbd_resource *resource;
+
+	for_each_resource(resource, &drbd_resources)
+		spin_unlock(&resource->req_lock);
+	local_irq_enable();
+	mutex_unlock(&resources_mutex);
+}
+
+#ifdef CONFIG_DRBD_FAULT_INJECTION
+/* Fault insertion support including random number generator shamelessly
+ * stolen from kernel/rcutorture.c */
+struct fault_random_state {
+	unsigned long state;
+	unsigned long count;
+};
+
+#define FAULT_RANDOM_MULT 39916801  /* prime */
+#define FAULT_RANDOM_ADD	479001701 /* prime */
+#define FAULT_RANDOM_REFRESH 10000
+
+/*
+ * Crude but fast random-number generator.  Uses a linear congruential
+ * generator, with occasional help from get_random_bytes().
+ */
+static unsigned long
+_drbd_fault_random(struct fault_random_state *rsp)
+{
+	long refresh;
+
+	if (!rsp->count--) {
+		get_random_bytes(&refresh, sizeof(refresh));
+		rsp->state += refresh;
+		rsp->count = FAULT_RANDOM_REFRESH;
+	}
+	rsp->state = rsp->state * FAULT_RANDOM_MULT + FAULT_RANDOM_ADD;
+	return swahw32(rsp->state);
+}
+
+static char *
+_drbd_fault_str(unsigned int type) {
+	static char *_faults[] = {
+		[DRBD_FAULT_MD_WR] = "Meta-data write",
+		[DRBD_FAULT_MD_RD] = "Meta-data read",
+		[DRBD_FAULT_RS_WR] = "Resync write",
+		[DRBD_FAULT_RS_RD] = "Resync read",
+		[DRBD_FAULT_DT_WR] = "Data write",
+		[DRBD_FAULT_DT_RD] = "Data read",
+		[DRBD_FAULT_DT_RA] = "Data read ahead",
+		[DRBD_FAULT_BM_ALLOC] = "BM allocation",
+		[DRBD_FAULT_AL_EE] = "EE allocation",
+		[DRBD_FAULT_RECEIVE] = "receive data corruption",
+	};
+
+	return (type < DRBD_FAULT_MAX) ? _faults[type] : "**Unknown**";
+}
+
+unsigned int
+_drbd_insert_fault(struct drbd_device *device, unsigned int type)
+{
+	static struct fault_random_state rrs = {0, 0};
+
+	unsigned int ret = (
+		(drbd_fault_devs == 0 ||
+			((1 << device_to_minor(device)) & drbd_fault_devs) != 0) &&
+		(((_drbd_fault_random(&rrs) % 100) + 1) <= drbd_fault_rate));
+
+	if (ret) {
+		drbd_fault_count++;
+
+		if (__ratelimit(&drbd_ratelimit_state))
+			drbd_warn(device, "***Simulating %s failure\n",
+				_drbd_fault_str(type));
+	}
+
+	return ret;
+}
+#endif
+
+const char *drbd_buildtag(void)
+{
+	/* DRBD built from external sources has here a reference to the
+	   git hash of the source code. */
+
+	static char buildtag[38] = "\0uilt-in";
+
+	if (buildtag[0] == 0) {
+#ifdef MODULE
+		sprintf(buildtag, "srcversion: %-24s", THIS_MODULE->srcversion);
+#else
+		buildtag[0] = 'b';
+#endif
+	}
+
+	return buildtag;
+}
+
+module_init(drbd_init)
+module_exit(drbd_cleanup)
+
+EXPORT_SYMBOL(drbd_conn_str);
+EXPORT_SYMBOL(drbd_role_str);
+EXPORT_SYMBOL(drbd_disk_str);
+EXPORT_SYMBOL(drbd_set_st_err_str);
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
new file mode 100644
index 000000000..249eba7d2
--- /dev/null
+++ b/drivers/block/drbd/drbd_nl.c
@@ -0,0 +1,4940 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+   drbd_nl.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+
+ */
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <linux/module.h>
+#include <linux/drbd.h>
+#include <linux/in.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/slab.h>
+#include <linux/blkpg.h>
+#include <linux/cpumask.h>
+#include "drbd_int.h"
+#include "drbd_protocol.h"
+#include "drbd_req.h"
+#include "drbd_state_change.h"
+#include <asm/unaligned.h>
+#include <linux/drbd_limits.h>
+#include <linux/kthread.h>
+
+#include <net/genetlink.h>
+
+/* .doit */
+// int drbd_adm_create_resource(struct sk_buff *skb, struct genl_info *info);
+// int drbd_adm_delete_resource(struct sk_buff *skb, struct genl_info *info);
+
+int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_del_minor(struct sk_buff *skb, struct genl_info *info);
+
+int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_down(struct sk_buff *skb, struct genl_info *info);
+
+int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_suspend_io(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_outdate(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info);
+/* .dumpit */
+int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb);
+int drbd_adm_dump_resources(struct sk_buff *skb, struct netlink_callback *cb);
+int drbd_adm_dump_devices(struct sk_buff *skb, struct netlink_callback *cb);
+int drbd_adm_dump_devices_done(struct netlink_callback *cb);
+int drbd_adm_dump_connections(struct sk_buff *skb, struct netlink_callback *cb);
+int drbd_adm_dump_connections_done(struct netlink_callback *cb);
+int drbd_adm_dump_peer_devices(struct sk_buff *skb, struct netlink_callback *cb);
+int drbd_adm_dump_peer_devices_done(struct netlink_callback *cb);
+int drbd_adm_get_initial_state(struct sk_buff *skb, struct netlink_callback *cb);
+
+#include <linux/drbd_genl_api.h>
+#include "drbd_nla.h"
+#include <linux/genl_magic_func.h>
+
+static atomic_t drbd_genl_seq = ATOMIC_INIT(2); /* two. */
+static atomic_t notify_genl_seq = ATOMIC_INIT(2); /* two. */
+
+DEFINE_MUTEX(notification_mutex);
+
+/* used blkdev_get_by_path, to claim our meta data device(s) */
+static char *drbd_m_holder = "Hands off! this is DRBD's meta data device.";
+
+static void drbd_adm_send_reply(struct sk_buff *skb, struct genl_info *info)
+{
+	genlmsg_end(skb, genlmsg_data(nlmsg_data(nlmsg_hdr(skb))));
+	if (genlmsg_reply(skb, info))
+		pr_err("error sending genl reply\n");
+}
+
+/* Used on a fresh "drbd_adm_prepare"d reply_skb, this cannot fail: The only
+ * reason it could fail was no space in skb, and there are 4k available. */
+static int drbd_msg_put_info(struct sk_buff *skb, const char *info)
+{
+	struct nlattr *nla;
+	int err = -EMSGSIZE;
+
+	if (!info || !info[0])
+		return 0;
+
+	nla = nla_nest_start_noflag(skb, DRBD_NLA_CFG_REPLY);
+	if (!nla)
+		return err;
+
+	err = nla_put_string(skb, T_info_text, info);
+	if (err) {
+		nla_nest_cancel(skb, nla);
+		return err;
+	} else
+		nla_nest_end(skb, nla);
+	return 0;
+}
+
+__printf(2, 3)
+static int drbd_msg_sprintf_info(struct sk_buff *skb, const char *fmt, ...)
+{
+	va_list args;
+	struct nlattr *nla, *txt;
+	int err = -EMSGSIZE;
+	int len;
+
+	nla = nla_nest_start_noflag(skb, DRBD_NLA_CFG_REPLY);
+	if (!nla)
+		return err;
+
+	txt = nla_reserve(skb, T_info_text, 256);
+	if (!txt) {
+		nla_nest_cancel(skb, nla);
+		return err;
+	}
+	va_start(args, fmt);
+	len = vscnprintf(nla_data(txt), 256, fmt, args);
+	va_end(args);
+
+	/* maybe: retry with larger reserve, if truncated */
+	txt->nla_len = nla_attr_size(len+1);
+	nlmsg_trim(skb, (char*)txt + NLA_ALIGN(txt->nla_len));
+	nla_nest_end(skb, nla);
+
+	return 0;
+}
+
+/* This would be a good candidate for a "pre_doit" hook,
+ * and per-family private info->pointers.
+ * But we need to stay compatible with older kernels.
+ * If it returns successfully, adm_ctx members are valid.
+ *
+ * At this point, we still rely on the global genl_lock().
+ * If we want to avoid that, and allow "genl_family.parallel_ops", we may need
+ * to add additional synchronization against object destruction/modification.
+ */
+#define DRBD_ADM_NEED_MINOR	1
+#define DRBD_ADM_NEED_RESOURCE	2
+#define DRBD_ADM_NEED_CONNECTION 4
+static int drbd_adm_prepare(struct drbd_config_context *adm_ctx,
+	struct sk_buff *skb, struct genl_info *info, unsigned flags)
+{
+	struct drbd_genlmsghdr *d_in = info->userhdr;
+	const u8 cmd = info->genlhdr->cmd;
+	int err;
+
+	memset(adm_ctx, 0, sizeof(*adm_ctx));
+
+	/* genl_rcv_msg only checks for CAP_NET_ADMIN on "GENL_ADMIN_PERM" :( */
+	if (cmd != DRBD_ADM_GET_STATUS && !capable(CAP_NET_ADMIN))
+	       return -EPERM;
+
+	adm_ctx->reply_skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+	if (!adm_ctx->reply_skb) {
+		err = -ENOMEM;
+		goto fail;
+	}
+
+	adm_ctx->reply_dh = genlmsg_put_reply(adm_ctx->reply_skb,
+					info, &drbd_genl_family, 0, cmd);
+	/* put of a few bytes into a fresh skb of >= 4k will always succeed.
+	 * but anyways */
+	if (!adm_ctx->reply_dh) {
+		err = -ENOMEM;
+		goto fail;
+	}
+
+	adm_ctx->reply_dh->minor = d_in->minor;
+	adm_ctx->reply_dh->ret_code = NO_ERROR;
+
+	adm_ctx->volume = VOLUME_UNSPECIFIED;
+	if (info->attrs[DRBD_NLA_CFG_CONTEXT]) {
+		struct nlattr *nla;
+		/* parse and validate only */
+		err = drbd_cfg_context_from_attrs(NULL, info);
+		if (err)
+			goto fail;
+
+		/* It was present, and valid,
+		 * copy it over to the reply skb. */
+		err = nla_put_nohdr(adm_ctx->reply_skb,
+				info->attrs[DRBD_NLA_CFG_CONTEXT]->nla_len,
+				info->attrs[DRBD_NLA_CFG_CONTEXT]);
+		if (err)
+			goto fail;
+
+		/* and assign stuff to the adm_ctx */
+		nla = nested_attr_tb[__nla_type(T_ctx_volume)];
+		if (nla)
+			adm_ctx->volume = nla_get_u32(nla);
+		nla = nested_attr_tb[__nla_type(T_ctx_resource_name)];
+		if (nla)
+			adm_ctx->resource_name = nla_data(nla);
+		adm_ctx->my_addr = nested_attr_tb[__nla_type(T_ctx_my_addr)];
+		adm_ctx->peer_addr = nested_attr_tb[__nla_type(T_ctx_peer_addr)];
+		if ((adm_ctx->my_addr &&
+		     nla_len(adm_ctx->my_addr) > sizeof(adm_ctx->connection->my_addr)) ||
+		    (adm_ctx->peer_addr &&
+		     nla_len(adm_ctx->peer_addr) > sizeof(adm_ctx->connection->peer_addr))) {
+			err = -EINVAL;
+			goto fail;
+		}
+	}
+
+	adm_ctx->minor = d_in->minor;
+	adm_ctx->device = minor_to_device(d_in->minor);
+
+	/* We are protected by the global genl_lock().
+	 * But we may explicitly drop it/retake it in drbd_adm_set_role(),
+	 * so make sure this object stays around. */
+	if (adm_ctx->device)
+		kref_get(&adm_ctx->device->kref);
+
+	if (adm_ctx->resource_name) {
+		adm_ctx->resource = drbd_find_resource(adm_ctx->resource_name);
+	}
+
+	if (!adm_ctx->device && (flags & DRBD_ADM_NEED_MINOR)) {
+		drbd_msg_put_info(adm_ctx->reply_skb, "unknown minor");
+		return ERR_MINOR_INVALID;
+	}
+	if (!adm_ctx->resource && (flags & DRBD_ADM_NEED_RESOURCE)) {
+		drbd_msg_put_info(adm_ctx->reply_skb, "unknown resource");
+		if (adm_ctx->resource_name)
+			return ERR_RES_NOT_KNOWN;
+		return ERR_INVALID_REQUEST;
+	}
+
+	if (flags & DRBD_ADM_NEED_CONNECTION) {
+		if (adm_ctx->resource) {
+			drbd_msg_put_info(adm_ctx->reply_skb, "no resource name expected");
+			return ERR_INVALID_REQUEST;
+		}
+		if (adm_ctx->device) {
+			drbd_msg_put_info(adm_ctx->reply_skb, "no minor number expected");
+			return ERR_INVALID_REQUEST;
+		}
+		if (adm_ctx->my_addr && adm_ctx->peer_addr)
+			adm_ctx->connection = conn_get_by_addrs(nla_data(adm_ctx->my_addr),
+							  nla_len(adm_ctx->my_addr),
+							  nla_data(adm_ctx->peer_addr),
+							  nla_len(adm_ctx->peer_addr));
+		if (!adm_ctx->connection) {
+			drbd_msg_put_info(adm_ctx->reply_skb, "unknown connection");
+			return ERR_INVALID_REQUEST;
+		}
+	}
+
+	/* some more paranoia, if the request was over-determined */
+	if (adm_ctx->device && adm_ctx->resource &&
+	    adm_ctx->device->resource != adm_ctx->resource) {
+		pr_warn("request: minor=%u, resource=%s; but that minor belongs to resource %s\n",
+			adm_ctx->minor, adm_ctx->resource->name,
+			adm_ctx->device->resource->name);
+		drbd_msg_put_info(adm_ctx->reply_skb, "minor exists in different resource");
+		return ERR_INVALID_REQUEST;
+	}
+	if (adm_ctx->device &&
+	    adm_ctx->volume != VOLUME_UNSPECIFIED &&
+	    adm_ctx->volume != adm_ctx->device->vnr) {
+		pr_warn("request: minor=%u, volume=%u; but that minor is volume %u in %s\n",
+			adm_ctx->minor, adm_ctx->volume,
+			adm_ctx->device->vnr, adm_ctx->device->resource->name);
+		drbd_msg_put_info(adm_ctx->reply_skb, "minor exists as different volume");
+		return ERR_INVALID_REQUEST;
+	}
+
+	/* still, provide adm_ctx->resource always, if possible. */
+	if (!adm_ctx->resource) {
+		adm_ctx->resource = adm_ctx->device ? adm_ctx->device->resource
+			: adm_ctx->connection ? adm_ctx->connection->resource : NULL;
+		if (adm_ctx->resource)
+			kref_get(&adm_ctx->resource->kref);
+	}
+
+	return NO_ERROR;
+
+fail:
+	nlmsg_free(adm_ctx->reply_skb);
+	adm_ctx->reply_skb = NULL;
+	return err;
+}
+
+static int drbd_adm_finish(struct drbd_config_context *adm_ctx,
+	struct genl_info *info, int retcode)
+{
+	if (adm_ctx->device) {
+		kref_put(&adm_ctx->device->kref, drbd_destroy_device);
+		adm_ctx->device = NULL;
+	}
+	if (adm_ctx->connection) {
+		kref_put(&adm_ctx->connection->kref, &drbd_destroy_connection);
+		adm_ctx->connection = NULL;
+	}
+	if (adm_ctx->resource) {
+		kref_put(&adm_ctx->resource->kref, drbd_destroy_resource);
+		adm_ctx->resource = NULL;
+	}
+
+	if (!adm_ctx->reply_skb)
+		return -ENOMEM;
+
+	adm_ctx->reply_dh->ret_code = retcode;
+	drbd_adm_send_reply(adm_ctx->reply_skb, info);
+	return 0;
+}
+
+static void setup_khelper_env(struct drbd_connection *connection, char **envp)
+{
+	char *afs;
+
+	/* FIXME: A future version will not allow this case. */
+	if (connection->my_addr_len == 0 || connection->peer_addr_len == 0)
+		return;
+
+	switch (((struct sockaddr *)&connection->peer_addr)->sa_family) {
+	case AF_INET6:
+		afs = "ipv6";
+		snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI6",
+			 &((struct sockaddr_in6 *)&connection->peer_addr)->sin6_addr);
+		break;
+	case AF_INET:
+		afs = "ipv4";
+		snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI4",
+			 &((struct sockaddr_in *)&connection->peer_addr)->sin_addr);
+		break;
+	default:
+		afs = "ssocks";
+		snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI4",
+			 &((struct sockaddr_in *)&connection->peer_addr)->sin_addr);
+	}
+	snprintf(envp[3], 20, "DRBD_PEER_AF=%s", afs);
+}
+
+int drbd_khelper(struct drbd_device *device, char *cmd)
+{
+	char *envp[] = { "HOME=/",
+			"TERM=linux",
+			"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
+			 (char[20]) { }, /* address family */
+			 (char[60]) { }, /* address */
+			NULL };
+	char mb[14];
+	char *argv[] = {drbd_usermode_helper, cmd, mb, NULL };
+	struct drbd_connection *connection = first_peer_device(device)->connection;
+	struct sib_info sib;
+	int ret;
+
+	if (current == connection->worker.task)
+		set_bit(CALLBACK_PENDING, &connection->flags);
+
+	snprintf(mb, 14, "minor-%d", device_to_minor(device));
+	setup_khelper_env(connection, envp);
+
+	/* The helper may take some time.
+	 * write out any unsynced meta data changes now */
+	drbd_md_sync(device);
+
+	drbd_info(device, "helper command: %s %s %s\n", drbd_usermode_helper, cmd, mb);
+	sib.sib_reason = SIB_HELPER_PRE;
+	sib.helper_name = cmd;
+	drbd_bcast_event(device, &sib);
+	notify_helper(NOTIFY_CALL, device, connection, cmd, 0);
+	ret = call_usermodehelper(drbd_usermode_helper, argv, envp, UMH_WAIT_PROC);
+	if (ret)
+		drbd_warn(device, "helper command: %s %s %s exit code %u (0x%x)\n",
+				drbd_usermode_helper, cmd, mb,
+				(ret >> 8) & 0xff, ret);
+	else
+		drbd_info(device, "helper command: %s %s %s exit code %u (0x%x)\n",
+				drbd_usermode_helper, cmd, mb,
+				(ret >> 8) & 0xff, ret);
+	sib.sib_reason = SIB_HELPER_POST;
+	sib.helper_exit_code = ret;
+	drbd_bcast_event(device, &sib);
+	notify_helper(NOTIFY_RESPONSE, device, connection, cmd, ret);
+
+	if (current == connection->worker.task)
+		clear_bit(CALLBACK_PENDING, &connection->flags);
+
+	if (ret < 0) /* Ignore any ERRNOs we got. */
+		ret = 0;
+
+	return ret;
+}
+
+enum drbd_peer_state conn_khelper(struct drbd_connection *connection, char *cmd)
+{
+	char *envp[] = { "HOME=/",
+			"TERM=linux",
+			"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
+			 (char[20]) { }, /* address family */
+			 (char[60]) { }, /* address */
+			NULL };
+	char *resource_name = connection->resource->name;
+	char *argv[] = {drbd_usermode_helper, cmd, resource_name, NULL };
+	int ret;
+
+	setup_khelper_env(connection, envp);
+	conn_md_sync(connection);
+
+	drbd_info(connection, "helper command: %s %s %s\n", drbd_usermode_helper, cmd, resource_name);
+	/* TODO: conn_bcast_event() ?? */
+	notify_helper(NOTIFY_CALL, NULL, connection, cmd, 0);
+
+	ret = call_usermodehelper(drbd_usermode_helper, argv, envp, UMH_WAIT_PROC);
+	if (ret)
+		drbd_warn(connection, "helper command: %s %s %s exit code %u (0x%x)\n",
+			  drbd_usermode_helper, cmd, resource_name,
+			  (ret >> 8) & 0xff, ret);
+	else
+		drbd_info(connection, "helper command: %s %s %s exit code %u (0x%x)\n",
+			  drbd_usermode_helper, cmd, resource_name,
+			  (ret >> 8) & 0xff, ret);
+	/* TODO: conn_bcast_event() ?? */
+	notify_helper(NOTIFY_RESPONSE, NULL, connection, cmd, ret);
+
+	if (ret < 0) /* Ignore any ERRNOs we got. */
+		ret = 0;
+
+	return ret;
+}
+
+static enum drbd_fencing_p highest_fencing_policy(struct drbd_connection *connection)
+{
+	enum drbd_fencing_p fp = FP_NOT_AVAIL;
+	struct drbd_peer_device *peer_device;
+	int vnr;
+
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+		struct drbd_device *device = peer_device->device;
+		if (get_ldev_if_state(device, D_CONSISTENT)) {
+			struct disk_conf *disk_conf =
+				rcu_dereference(peer_device->device->ldev->disk_conf);
+			fp = max_t(enum drbd_fencing_p, fp, disk_conf->fencing);
+			put_ldev(device);
+		}
+	}
+	rcu_read_unlock();
+
+	return fp;
+}
+
+static bool resource_is_supended(struct drbd_resource *resource)
+{
+	return resource->susp || resource->susp_fen || resource->susp_nod;
+}
+
+bool conn_try_outdate_peer(struct drbd_connection *connection)
+{
+	struct drbd_resource * const resource = connection->resource;
+	unsigned int connect_cnt;
+	union drbd_state mask = { };
+	union drbd_state val = { };
+	enum drbd_fencing_p fp;
+	char *ex_to_string;
+	int r;
+
+	spin_lock_irq(&resource->req_lock);
+	if (connection->cstate >= C_WF_REPORT_PARAMS) {
+		drbd_err(connection, "Expected cstate < C_WF_REPORT_PARAMS\n");
+		spin_unlock_irq(&resource->req_lock);
+		return false;
+	}
+
+	connect_cnt = connection->connect_cnt;
+	spin_unlock_irq(&resource->req_lock);
+
+	fp = highest_fencing_policy(connection);
+	switch (fp) {
+	case FP_NOT_AVAIL:
+		drbd_warn(connection, "Not fencing peer, I'm not even Consistent myself.\n");
+		spin_lock_irq(&resource->req_lock);
+		if (connection->cstate < C_WF_REPORT_PARAMS) {
+			_conn_request_state(connection,
+					    (union drbd_state) { { .susp_fen = 1 } },
+					    (union drbd_state) { { .susp_fen = 0 } },
+					    CS_VERBOSE | CS_HARD | CS_DC_SUSP);
+			/* We are no longer suspended due to the fencing policy.
+			 * We may still be suspended due to the on-no-data-accessible policy.
+			 * If that was OND_IO_ERROR, fail pending requests. */
+			if (!resource_is_supended(resource))
+				_tl_restart(connection, CONNECTION_LOST_WHILE_PENDING);
+		}
+		/* Else: in case we raced with a connection handshake,
+		 * let the handshake figure out if we maybe can RESEND,
+		 * and do not resume/fail pending requests here.
+		 * Worst case is we stay suspended for now, which may be
+		 * resolved by either re-establishing the replication link, or
+		 * the next link failure, or eventually the administrator.  */
+		spin_unlock_irq(&resource->req_lock);
+		return false;
+
+	case FP_DONT_CARE:
+		return true;
+	default: ;
+	}
+
+	r = conn_khelper(connection, "fence-peer");
+
+	switch ((r>>8) & 0xff) {
+	case P_INCONSISTENT: /* peer is inconsistent */
+		ex_to_string = "peer is inconsistent or worse";
+		mask.pdsk = D_MASK;
+		val.pdsk = D_INCONSISTENT;
+		break;
+	case P_OUTDATED: /* peer got outdated, or was already outdated */
+		ex_to_string = "peer was fenced";
+		mask.pdsk = D_MASK;
+		val.pdsk = D_OUTDATED;
+		break;
+	case P_DOWN: /* peer was down */
+		if (conn_highest_disk(connection) == D_UP_TO_DATE) {
+			/* we will(have) create(d) a new UUID anyways... */
+			ex_to_string = "peer is unreachable, assumed to be dead";
+			mask.pdsk = D_MASK;
+			val.pdsk = D_OUTDATED;
+		} else {
+			ex_to_string = "peer unreachable, doing nothing since disk != UpToDate";
+		}
+		break;
+	case P_PRIMARY: /* Peer is primary, voluntarily outdate myself.
+		 * This is useful when an unconnected R_SECONDARY is asked to
+		 * become R_PRIMARY, but finds the other peer being active. */
+		ex_to_string = "peer is active";
+		drbd_warn(connection, "Peer is primary, outdating myself.\n");
+		mask.disk = D_MASK;
+		val.disk = D_OUTDATED;
+		break;
+	case P_FENCING:
+		/* THINK: do we need to handle this
+		 * like case 4, or more like case 5? */
+		if (fp != FP_STONITH)
+			drbd_err(connection, "fence-peer() = 7 && fencing != Stonith !!!\n");
+		ex_to_string = "peer was stonithed";
+		mask.pdsk = D_MASK;
+		val.pdsk = D_OUTDATED;
+		break;
+	default:
+		/* The script is broken ... */
+		drbd_err(connection, "fence-peer helper broken, returned %d\n", (r>>8)&0xff);
+		return false; /* Eventually leave IO frozen */
+	}
+
+	drbd_info(connection, "fence-peer helper returned %d (%s)\n",
+		  (r>>8) & 0xff, ex_to_string);
+
+	/* Not using
+	   conn_request_state(connection, mask, val, CS_VERBOSE);
+	   here, because we might were able to re-establish the connection in the
+	   meantime. */
+	spin_lock_irq(&resource->req_lock);
+	if (connection->cstate < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &connection->flags)) {
+		if (connection->connect_cnt != connect_cnt)
+			/* In case the connection was established and droped
+			   while the fence-peer handler was running, ignore it */
+			drbd_info(connection, "Ignoring fence-peer exit code\n");
+		else
+			_conn_request_state(connection, mask, val, CS_VERBOSE);
+	}
+	spin_unlock_irq(&resource->req_lock);
+
+	return conn_highest_pdsk(connection) <= D_OUTDATED;
+}
+
+static int _try_outdate_peer_async(void *data)
+{
+	struct drbd_connection *connection = (struct drbd_connection *)data;
+
+	conn_try_outdate_peer(connection);
+
+	kref_put(&connection->kref, drbd_destroy_connection);
+	return 0;
+}
+
+void conn_try_outdate_peer_async(struct drbd_connection *connection)
+{
+	struct task_struct *opa;
+
+	kref_get(&connection->kref);
+	/* We may have just sent a signal to this thread
+	 * to get it out of some blocking network function.
+	 * Clear signals; otherwise kthread_run(), which internally uses
+	 * wait_on_completion_killable(), will mistake our pending signal
+	 * for a new fatal signal and fail. */
+	flush_signals(current);
+	opa = kthread_run(_try_outdate_peer_async, connection, "drbd_async_h");
+	if (IS_ERR(opa)) {
+		drbd_err(connection, "out of mem, failed to invoke fence-peer helper\n");
+		kref_put(&connection->kref, drbd_destroy_connection);
+	}
+}
+
+enum drbd_state_rv
+drbd_set_role(struct drbd_device *const device, enum drbd_role new_role, int force)
+{
+	struct drbd_peer_device *const peer_device = first_peer_device(device);
+	struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
+	const int max_tries = 4;
+	enum drbd_state_rv rv = SS_UNKNOWN_ERROR;
+	struct net_conf *nc;
+	int try = 0;
+	int forced = 0;
+	union drbd_state mask, val;
+
+	if (new_role == R_PRIMARY) {
+		struct drbd_connection *connection;
+
+		/* Detect dead peers as soon as possible.  */
+
+		rcu_read_lock();
+		for_each_connection(connection, device->resource)
+			request_ping(connection);
+		rcu_read_unlock();
+	}
+
+	mutex_lock(device->state_mutex);
+
+	mask.i = 0; mask.role = R_MASK;
+	val.i  = 0; val.role  = new_role;
+
+	while (try++ < max_tries) {
+		rv = _drbd_request_state_holding_state_mutex(device, mask, val, CS_WAIT_COMPLETE);
+
+		/* in case we first succeeded to outdate,
+		 * but now suddenly could establish a connection */
+		if (rv == SS_CW_FAILED_BY_PEER && mask.pdsk != 0) {
+			val.pdsk = 0;
+			mask.pdsk = 0;
+			continue;
+		}
+
+		if (rv == SS_NO_UP_TO_DATE_DISK && force &&
+		    (device->state.disk < D_UP_TO_DATE &&
+		     device->state.disk >= D_INCONSISTENT)) {
+			mask.disk = D_MASK;
+			val.disk  = D_UP_TO_DATE;
+			forced = 1;
+			continue;
+		}
+
+		if (rv == SS_NO_UP_TO_DATE_DISK &&
+		    device->state.disk == D_CONSISTENT && mask.pdsk == 0) {
+			D_ASSERT(device, device->state.pdsk == D_UNKNOWN);
+
+			if (conn_try_outdate_peer(connection)) {
+				val.disk = D_UP_TO_DATE;
+				mask.disk = D_MASK;
+			}
+			continue;
+		}
+
+		if (rv == SS_NOTHING_TO_DO)
+			goto out;
+		if (rv == SS_PRIMARY_NOP && mask.pdsk == 0) {
+			if (!conn_try_outdate_peer(connection) && force) {
+				drbd_warn(device, "Forced into split brain situation!\n");
+				mask.pdsk = D_MASK;
+				val.pdsk  = D_OUTDATED;
+
+			}
+			continue;
+		}
+		if (rv == SS_TWO_PRIMARIES) {
+			/* Maybe the peer is detected as dead very soon...
+			   retry at most once more in this case. */
+			if (try < max_tries) {
+				int timeo;
+				try = max_tries - 1;
+				rcu_read_lock();
+				nc = rcu_dereference(connection->net_conf);
+				timeo = nc ? (nc->ping_timeo + 1) * HZ / 10 : 1;
+				rcu_read_unlock();
+				schedule_timeout_interruptible(timeo);
+			}
+			continue;
+		}
+		if (rv < SS_SUCCESS) {
+			rv = _drbd_request_state(device, mask, val,
+						CS_VERBOSE + CS_WAIT_COMPLETE);
+			if (rv < SS_SUCCESS)
+				goto out;
+		}
+		break;
+	}
+
+	if (rv < SS_SUCCESS)
+		goto out;
+
+	if (forced)
+		drbd_warn(device, "Forced to consider local data as UpToDate!\n");
+
+	/* Wait until nothing is on the fly :) */
+	wait_event(device->misc_wait, atomic_read(&device->ap_pending_cnt) == 0);
+
+	/* FIXME also wait for all pending P_BARRIER_ACK? */
+
+	if (new_role == R_SECONDARY) {
+		if (get_ldev(device)) {
+			device->ldev->md.uuid[UI_CURRENT] &= ~(u64)1;
+			put_ldev(device);
+		}
+	} else {
+		mutex_lock(&device->resource->conf_update);
+		nc = connection->net_conf;
+		if (nc)
+			nc->discard_my_data = 0; /* without copy; single bit op is atomic */
+		mutex_unlock(&device->resource->conf_update);
+
+		if (get_ldev(device)) {
+			if (((device->state.conn < C_CONNECTED ||
+			       device->state.pdsk <= D_FAILED)
+			      && device->ldev->md.uuid[UI_BITMAP] == 0) || forced)
+				drbd_uuid_new_current(device);
+
+			device->ldev->md.uuid[UI_CURRENT] |=  (u64)1;
+			put_ldev(device);
+		}
+	}
+
+	/* writeout of activity log covered areas of the bitmap
+	 * to stable storage done in after state change already */
+
+	if (device->state.conn >= C_WF_REPORT_PARAMS) {
+		/* if this was forced, we should consider sync */
+		if (forced)
+			drbd_send_uuids(peer_device);
+		drbd_send_current_state(peer_device);
+	}
+
+	drbd_md_sync(device);
+	set_disk_ro(device->vdisk, new_role == R_SECONDARY);
+	kobject_uevent(&disk_to_dev(device->vdisk)->kobj, KOBJ_CHANGE);
+out:
+	mutex_unlock(device->state_mutex);
+	return rv;
+}
+
+static const char *from_attrs_err_to_txt(int err)
+{
+	return	err == -ENOMSG ? "required attribute missing" :
+		err == -EOPNOTSUPP ? "unknown mandatory attribute" :
+		err == -EEXIST ? "can not change invariant setting" :
+		"invalid attribute value";
+}
+
+int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info)
+{
+	struct drbd_config_context adm_ctx;
+	struct set_role_parms parms;
+	int err;
+	enum drbd_ret_code retcode;
+	enum drbd_state_rv rv;
+
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+	if (!adm_ctx.reply_skb)
+		return retcode;
+	if (retcode != NO_ERROR)
+		goto out;
+
+	memset(&parms, 0, sizeof(parms));
+	if (info->attrs[DRBD_NLA_SET_ROLE_PARMS]) {
+		err = set_role_parms_from_attrs(&parms, info);
+		if (err) {
+			retcode = ERR_MANDATORY_TAG;
+			drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+			goto out;
+		}
+	}
+	genl_unlock();
+	mutex_lock(&adm_ctx.resource->adm_mutex);
+
+	if (info->genlhdr->cmd == DRBD_ADM_PRIMARY)
+		rv = drbd_set_role(adm_ctx.device, R_PRIMARY, parms.assume_uptodate);
+	else
+		rv = drbd_set_role(adm_ctx.device, R_SECONDARY, 0);
+
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
+	genl_lock();
+	drbd_adm_finish(&adm_ctx, info, rv);
+	return 0;
+out:
+	drbd_adm_finish(&adm_ctx, info, retcode);
+	return 0;
+}
+
+/* Initializes the md.*_offset members, so we are able to find
+ * the on disk meta data.
+ *
+ * We currently have two possible layouts:
+ * external:
+ *   |----------- md_size_sect ------------------|
+ *   [ 4k superblock ][ activity log ][  Bitmap  ]
+ *   | al_offset == 8 |
+ *   | bm_offset = al_offset + X      |
+ *  ==> bitmap sectors = md_size_sect - bm_offset
+ *
+ * internal:
+ *            |----------- md_size_sect ------------------|
+ * [data.....][  Bitmap  ][ activity log ][ 4k superblock ]
+ *                        | al_offset < 0 |
+ *            | bm_offset = al_offset - Y |
+ *  ==> bitmap sectors = Y = al_offset - bm_offset
+ *
+ *  Activity log size used to be fixed 32kB,
+ *  but is about to become configurable.
+ */
+static void drbd_md_set_sector_offsets(struct drbd_device *device,
+				       struct drbd_backing_dev *bdev)
+{
+	sector_t md_size_sect = 0;
+	unsigned int al_size_sect = bdev->md.al_size_4k * 8;
+
+	bdev->md.md_offset = drbd_md_ss(bdev);
+
+	switch (bdev->md.meta_dev_idx) {
+	default:
+		/* v07 style fixed size indexed meta data */
+		bdev->md.md_size_sect = MD_128MB_SECT;
+		bdev->md.al_offset = MD_4kB_SECT;
+		bdev->md.bm_offset = MD_4kB_SECT + al_size_sect;
+		break;
+	case DRBD_MD_INDEX_FLEX_EXT:
+		/* just occupy the full device; unit: sectors */
+		bdev->md.md_size_sect = drbd_get_capacity(bdev->md_bdev);
+		bdev->md.al_offset = MD_4kB_SECT;
+		bdev->md.bm_offset = MD_4kB_SECT + al_size_sect;
+		break;
+	case DRBD_MD_INDEX_INTERNAL:
+	case DRBD_MD_INDEX_FLEX_INT:
+		/* al size is still fixed */
+		bdev->md.al_offset = -al_size_sect;
+		/* we need (slightly less than) ~ this much bitmap sectors: */
+		md_size_sect = drbd_get_capacity(bdev->backing_bdev);
+		md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT);
+		md_size_sect = BM_SECT_TO_EXT(md_size_sect);
+		md_size_sect = ALIGN(md_size_sect, 8);
+
+		/* plus the "drbd meta data super block",
+		 * and the activity log; */
+		md_size_sect += MD_4kB_SECT + al_size_sect;
+
+		bdev->md.md_size_sect = md_size_sect;
+		/* bitmap offset is adjusted by 'super' block size */
+		bdev->md.bm_offset   = -md_size_sect + MD_4kB_SECT;
+		break;
+	}
+}
+
+/* input size is expected to be in KB */
+char *ppsize(char *buf, unsigned long long size)
+{
+	/* Needs 9 bytes at max including trailing NUL:
+	 * -1ULL ==> "16384 EB" */
+	static char units[] = { 'K', 'M', 'G', 'T', 'P', 'E' };
+	int base = 0;
+	while (size >= 10000 && base < sizeof(units)-1) {
+		/* shift + round */
+		size = (size >> 10) + !!(size & (1<<9));
+		base++;
+	}
+	sprintf(buf, "%u %cB", (unsigned)size, units[base]);
+
+	return buf;
+}
+
+/* there is still a theoretical deadlock when called from receiver
+ * on an D_INCONSISTENT R_PRIMARY:
+ *  remote READ does inc_ap_bio, receiver would need to receive answer
+ *  packet from remote to dec_ap_bio again.
+ *  receiver receive_sizes(), comes here,
+ *  waits for ap_bio_cnt == 0. -> deadlock.
+ * but this cannot happen, actually, because:
+ *  R_PRIMARY D_INCONSISTENT, and peer's disk is unreachable
+ *  (not connected, or bad/no disk on peer):
+ *  see drbd_fail_request_early, ap_bio_cnt is zero.
+ *  R_PRIMARY D_INCONSISTENT, and C_SYNC_TARGET:
+ *  peer may not initiate a resize.
+ */
+/* Note these are not to be confused with
+ * drbd_adm_suspend_io/drbd_adm_resume_io,
+ * which are (sub) state changes triggered by admin (drbdsetup),
+ * and can be long lived.
+ * This changes an device->flag, is triggered by drbd internals,
+ * and should be short-lived. */
+/* It needs to be a counter, since multiple threads might
+   independently suspend and resume IO. */
+void drbd_suspend_io(struct drbd_device *device)
+{
+	atomic_inc(&device->suspend_cnt);
+	if (drbd_suspended(device))
+		return;
+	wait_event(device->misc_wait, !atomic_read(&device->ap_bio_cnt));
+}
+
+void drbd_resume_io(struct drbd_device *device)
+{
+	if (atomic_dec_and_test(&device->suspend_cnt))
+		wake_up(&device->misc_wait);
+}
+
+/*
+ * drbd_determine_dev_size() -  Sets the right device size obeying all constraints
+ * @device:	DRBD device.
+ *
+ * Returns 0 on success, negative return values indicate errors.
+ * You should call drbd_md_sync() after calling this function.
+ */
+enum determine_dev_size
+drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct resize_parms *rs) __must_hold(local)
+{
+	struct md_offsets_and_sizes {
+		u64 last_agreed_sect;
+		u64 md_offset;
+		s32 al_offset;
+		s32 bm_offset;
+		u32 md_size_sect;
+
+		u32 al_stripes;
+		u32 al_stripe_size_4k;
+	} prev;
+	sector_t u_size, size;
+	struct drbd_md *md = &device->ldev->md;
+	void *buffer;
+
+	int md_moved, la_size_changed;
+	enum determine_dev_size rv = DS_UNCHANGED;
+
+	/* We may change the on-disk offsets of our meta data below.  Lock out
+	 * anything that may cause meta data IO, to avoid acting on incomplete
+	 * layout changes or scribbling over meta data that is in the process
+	 * of being moved.
+	 *
+	 * Move is not exactly correct, btw, currently we have all our meta
+	 * data in core memory, to "move" it we just write it all out, there
+	 * are no reads. */
+	drbd_suspend_io(device);
+	buffer = drbd_md_get_buffer(device, __func__); /* Lock meta-data IO */
+	if (!buffer) {
+		drbd_resume_io(device);
+		return DS_ERROR;
+	}
+
+	/* remember current offset and sizes */
+	prev.last_agreed_sect = md->la_size_sect;
+	prev.md_offset = md->md_offset;
+	prev.al_offset = md->al_offset;
+	prev.bm_offset = md->bm_offset;
+	prev.md_size_sect = md->md_size_sect;
+	prev.al_stripes = md->al_stripes;
+	prev.al_stripe_size_4k = md->al_stripe_size_4k;
+
+	if (rs) {
+		/* rs is non NULL if we should change the AL layout only */
+		md->al_stripes = rs->al_stripes;
+		md->al_stripe_size_4k = rs->al_stripe_size / 4;
+		md->al_size_4k = (u64)rs->al_stripes * rs->al_stripe_size / 4;
+	}
+
+	drbd_md_set_sector_offsets(device, device->ldev);
+
+	rcu_read_lock();
+	u_size = rcu_dereference(device->ldev->disk_conf)->disk_size;
+	rcu_read_unlock();
+	size = drbd_new_dev_size(device, device->ldev, u_size, flags & DDSF_FORCED);
+
+	if (size < prev.last_agreed_sect) {
+		if (rs && u_size == 0) {
+			/* Remove "rs &&" later. This check should always be active, but
+			   right now the receiver expects the permissive behavior */
+			drbd_warn(device, "Implicit shrink not allowed. "
+				 "Use --size=%llus for explicit shrink.\n",
+				 (unsigned long long)size);
+			rv = DS_ERROR_SHRINK;
+		}
+		if (u_size > size)
+			rv = DS_ERROR_SPACE_MD;
+		if (rv != DS_UNCHANGED)
+			goto err_out;
+	}
+
+	if (get_capacity(device->vdisk) != size ||
+	    drbd_bm_capacity(device) != size) {
+		int err;
+		err = drbd_bm_resize(device, size, !(flags & DDSF_NO_RESYNC));
+		if (unlikely(err)) {
+			/* currently there is only one error: ENOMEM! */
+			size = drbd_bm_capacity(device);
+			if (size == 0) {
+				drbd_err(device, "OUT OF MEMORY! "
+				    "Could not allocate bitmap!\n");
+			} else {
+				drbd_err(device, "BM resizing failed. "
+				    "Leaving size unchanged\n");
+			}
+			rv = DS_ERROR;
+		}
+		/* racy, see comments above. */
+		drbd_set_my_capacity(device, size);
+		md->la_size_sect = size;
+	}
+	if (rv <= DS_ERROR)
+		goto err_out;
+
+	la_size_changed = (prev.last_agreed_sect != md->la_size_sect);
+
+	md_moved = prev.md_offset    != md->md_offset
+		|| prev.md_size_sect != md->md_size_sect;
+
+	if (la_size_changed || md_moved || rs) {
+		u32 prev_flags;
+
+		/* We do some synchronous IO below, which may take some time.
+		 * Clear the timer, to avoid scary "timer expired!" messages,
+		 * "Superblock" is written out at least twice below, anyways. */
+		del_timer(&device->md_sync_timer);
+
+		/* We won't change the "al-extents" setting, we just may need
+		 * to move the on-disk location of the activity log ringbuffer.
+		 * Lock for transaction is good enough, it may well be "dirty"
+		 * or even "starving". */
+		wait_event(device->al_wait, lc_try_lock_for_transaction(device->act_log));
+
+		/* mark current on-disk bitmap and activity log as unreliable */
+		prev_flags = md->flags;
+		md->flags |= MDF_FULL_SYNC | MDF_AL_DISABLED;
+		drbd_md_write(device, buffer);
+
+		drbd_al_initialize(device, buffer);
+
+		drbd_info(device, "Writing the whole bitmap, %s\n",
+			 la_size_changed && md_moved ? "size changed and md moved" :
+			 la_size_changed ? "size changed" : "md moved");
+		/* next line implicitly does drbd_suspend_io()+drbd_resume_io() */
+		drbd_bitmap_io(device, md_moved ? &drbd_bm_write_all : &drbd_bm_write,
+			       "size changed", BM_LOCKED_MASK);
+
+		/* on-disk bitmap and activity log is authoritative again
+		 * (unless there was an IO error meanwhile...) */
+		md->flags = prev_flags;
+		drbd_md_write(device, buffer);
+
+		if (rs)
+			drbd_info(device, "Changed AL layout to al-stripes = %d, al-stripe-size-kB = %d\n",
+				  md->al_stripes, md->al_stripe_size_4k * 4);
+	}
+
+	if (size > prev.last_agreed_sect)
+		rv = prev.last_agreed_sect ? DS_GREW : DS_GREW_FROM_ZERO;
+	if (size < prev.last_agreed_sect)
+		rv = DS_SHRUNK;
+
+	if (0) {
+	err_out:
+		/* restore previous offset and sizes */
+		md->la_size_sect = prev.last_agreed_sect;
+		md->md_offset = prev.md_offset;
+		md->al_offset = prev.al_offset;
+		md->bm_offset = prev.bm_offset;
+		md->md_size_sect = prev.md_size_sect;
+		md->al_stripes = prev.al_stripes;
+		md->al_stripe_size_4k = prev.al_stripe_size_4k;
+		md->al_size_4k = (u64)prev.al_stripes * prev.al_stripe_size_4k;
+	}
+	lc_unlock(device->act_log);
+	wake_up(&device->al_wait);
+	drbd_md_put_buffer(device);
+	drbd_resume_io(device);
+
+	return rv;
+}
+
+sector_t
+drbd_new_dev_size(struct drbd_device *device, struct drbd_backing_dev *bdev,
+		  sector_t u_size, int assume_peer_has_space)
+{
+	sector_t p_size = device->p_size;   /* partner's disk size. */
+	sector_t la_size_sect = bdev->md.la_size_sect; /* last agreed size. */
+	sector_t m_size; /* my size */
+	sector_t size = 0;
+
+	m_size = drbd_get_max_capacity(bdev);
+
+	if (device->state.conn < C_CONNECTED && assume_peer_has_space) {
+		drbd_warn(device, "Resize while not connected was forced by the user!\n");
+		p_size = m_size;
+	}
+
+	if (p_size && m_size) {
+		size = min_t(sector_t, p_size, m_size);
+	} else {
+		if (la_size_sect) {
+			size = la_size_sect;
+			if (m_size && m_size < size)
+				size = m_size;
+			if (p_size && p_size < size)
+				size = p_size;
+		} else {
+			if (m_size)
+				size = m_size;
+			if (p_size)
+				size = p_size;
+		}
+	}
+
+	if (size == 0)
+		drbd_err(device, "Both nodes diskless!\n");
+
+	if (u_size) {
+		if (u_size > size)
+			drbd_err(device, "Requested disk size is too big (%lu > %lu)\n",
+			    (unsigned long)u_size>>1, (unsigned long)size>>1);
+		else
+			size = u_size;
+	}
+
+	return size;
+}
+
+/*
+ * drbd_check_al_size() - Ensures that the AL is of the right size
+ * @device:	DRBD device.
+ *
+ * Returns -EBUSY if current al lru is still used, -ENOMEM when allocation
+ * failed, and 0 on success. You should call drbd_md_sync() after you called
+ * this function.
+ */
+static int drbd_check_al_size(struct drbd_device *device, struct disk_conf *dc)
+{
+	struct lru_cache *n, *t;
+	struct lc_element *e;
+	unsigned int in_use;
+	int i;
+
+	if (device->act_log &&
+	    device->act_log->nr_elements == dc->al_extents)
+		return 0;
+
+	in_use = 0;
+	t = device->act_log;
+	n = lc_create("act_log", drbd_al_ext_cache, AL_UPDATES_PER_TRANSACTION,
+		dc->al_extents, sizeof(struct lc_element), 0);
+
+	if (n == NULL) {
+		drbd_err(device, "Cannot allocate act_log lru!\n");
+		return -ENOMEM;
+	}
+	spin_lock_irq(&device->al_lock);
+	if (t) {
+		for (i = 0; i < t->nr_elements; i++) {
+			e = lc_element_by_index(t, i);
+			if (e->refcnt)
+				drbd_err(device, "refcnt(%d)==%d\n",
+				    e->lc_number, e->refcnt);
+			in_use += e->refcnt;
+		}
+	}
+	if (!in_use)
+		device->act_log = n;
+	spin_unlock_irq(&device->al_lock);
+	if (in_use) {
+		drbd_err(device, "Activity log still in use!\n");
+		lc_destroy(n);
+		return -EBUSY;
+	} else {
+		lc_destroy(t);
+	}
+	drbd_md_mark_dirty(device); /* we changed device->act_log->nr_elemens */
+	return 0;
+}
+
+static void blk_queue_discard_granularity(struct request_queue *q, unsigned int granularity)
+{
+	q->limits.discard_granularity = granularity;
+}
+
+static unsigned int drbd_max_discard_sectors(struct drbd_connection *connection)
+{
+	/* when we introduced REQ_WRITE_SAME support, we also bumped
+	 * our maximum supported batch bio size used for discards. */
+	if (connection->agreed_features & DRBD_FF_WSAME)
+		return DRBD_MAX_BBIO_SECTORS;
+	/* before, with DRBD <= 8.4.6, we only allowed up to one AL_EXTENT_SIZE. */
+	return AL_EXTENT_SIZE >> 9;
+}
+
+static void decide_on_discard_support(struct drbd_device *device,
+		struct drbd_backing_dev *bdev)
+{
+	struct drbd_connection *connection =
+		first_peer_device(device)->connection;
+	struct request_queue *q = device->rq_queue;
+	unsigned int max_discard_sectors;
+
+	if (bdev && !bdev_max_discard_sectors(bdev->backing_bdev))
+		goto not_supported;
+
+	if (connection->cstate >= C_CONNECTED &&
+	    !(connection->agreed_features & DRBD_FF_TRIM)) {
+		drbd_info(connection,
+			"peer DRBD too old, does not support TRIM: disabling discards\n");
+		goto not_supported;
+	}
+
+	/*
+	 * We don't care for the granularity, really.
+	 *
+	 * Stacking limits below should fix it for the local device.  Whether or
+	 * not it is a suitable granularity on the remote device is not our
+	 * problem, really. If you care, you need to use devices with similar
+	 * topology on all peers.
+	 */
+	blk_queue_discard_granularity(q, 512);
+	max_discard_sectors = drbd_max_discard_sectors(connection);
+	blk_queue_max_discard_sectors(q, max_discard_sectors);
+	blk_queue_max_write_zeroes_sectors(q, max_discard_sectors);
+	return;
+
+not_supported:
+	blk_queue_discard_granularity(q, 0);
+	blk_queue_max_discard_sectors(q, 0);
+}
+
+static void fixup_write_zeroes(struct drbd_device *device, struct request_queue *q)
+{
+	/* Fixup max_write_zeroes_sectors after blk_stack_limits():
+	 * if we can handle "zeroes" efficiently on the protocol,
+	 * we want to do that, even if our backend does not announce
+	 * max_write_zeroes_sectors itself. */
+	struct drbd_connection *connection = first_peer_device(device)->connection;
+	/* If the peer announces WZEROES support, use it.  Otherwise, rather
+	 * send explicit zeroes than rely on some discard-zeroes-data magic. */
+	if (connection->agreed_features & DRBD_FF_WZEROES)
+		q->limits.max_write_zeroes_sectors = DRBD_MAX_BBIO_SECTORS;
+	else
+		q->limits.max_write_zeroes_sectors = 0;
+}
+
+static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backing_dev *bdev,
+				   unsigned int max_bio_size, struct o_qlim *o)
+{
+	struct request_queue * const q = device->rq_queue;
+	unsigned int max_hw_sectors = max_bio_size >> 9;
+	unsigned int max_segments = 0;
+	struct request_queue *b = NULL;
+	struct disk_conf *dc;
+
+	if (bdev) {
+		b = bdev->backing_bdev->bd_disk->queue;
+
+		max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9);
+		rcu_read_lock();
+		dc = rcu_dereference(device->ldev->disk_conf);
+		max_segments = dc->max_bio_bvecs;
+		rcu_read_unlock();
+
+		blk_set_stacking_limits(&q->limits);
+	}
+
+	blk_queue_max_hw_sectors(q, max_hw_sectors);
+	/* This is the workaround for "bio would need to, but cannot, be split" */
+	blk_queue_max_segments(q, max_segments ? max_segments : BLK_MAX_SEGMENTS);
+	blk_queue_segment_boundary(q, PAGE_SIZE-1);
+	decide_on_discard_support(device, bdev);
+
+	if (b) {
+		blk_stack_limits(&q->limits, &b->limits, 0);
+		disk_update_readahead(device->vdisk);
+	}
+	fixup_write_zeroes(device, q);
+}
+
+void drbd_reconsider_queue_parameters(struct drbd_device *device, struct drbd_backing_dev *bdev, struct o_qlim *o)
+{
+	unsigned int now, new, local, peer;
+
+	now = queue_max_hw_sectors(device->rq_queue) << 9;
+	local = device->local_max_bio_size; /* Eventually last known value, from volatile memory */
+	peer = device->peer_max_bio_size; /* Eventually last known value, from meta data */
+
+	if (bdev) {
+		local = queue_max_hw_sectors(bdev->backing_bdev->bd_disk->queue) << 9;
+		device->local_max_bio_size = local;
+	}
+	local = min(local, DRBD_MAX_BIO_SIZE);
+
+	/* We may ignore peer limits if the peer is modern enough.
+	   Because new from 8.3.8 onwards the peer can use multiple
+	   BIOs for a single peer_request */
+	if (device->state.conn >= C_WF_REPORT_PARAMS) {
+		if (first_peer_device(device)->connection->agreed_pro_version < 94)
+			peer = min(device->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
+			/* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */
+		else if (first_peer_device(device)->connection->agreed_pro_version == 94)
+			peer = DRBD_MAX_SIZE_H80_PACKET;
+		else if (first_peer_device(device)->connection->agreed_pro_version < 100)
+			peer = DRBD_MAX_BIO_SIZE_P95;  /* drbd 8.3.8 onwards, before 8.4.0 */
+		else
+			peer = DRBD_MAX_BIO_SIZE;
+
+		/* We may later detach and re-attach on a disconnected Primary.
+		 * Avoid this setting to jump back in that case.
+		 * We want to store what we know the peer DRBD can handle,
+		 * not what the peer IO backend can handle. */
+		if (peer > device->peer_max_bio_size)
+			device->peer_max_bio_size = peer;
+	}
+	new = min(local, peer);
+
+	if (device->state.role == R_PRIMARY && new < now)
+		drbd_err(device, "ASSERT FAILED new < now; (%u < %u)\n", new, now);
+
+	if (new != now)
+		drbd_info(device, "max BIO size = %u\n", new);
+
+	drbd_setup_queue_param(device, bdev, new, o);
+}
+
+/* Starts the worker thread */
+static void conn_reconfig_start(struct drbd_connection *connection)
+{
+	drbd_thread_start(&connection->worker);
+	drbd_flush_workqueue(&connection->sender_work);
+}
+
+/* if still unconfigured, stops worker again. */
+static void conn_reconfig_done(struct drbd_connection *connection)
+{
+	bool stop_threads;
+	spin_lock_irq(&connection->resource->req_lock);
+	stop_threads = conn_all_vols_unconf(connection) &&
+		connection->cstate == C_STANDALONE;
+	spin_unlock_irq(&connection->resource->req_lock);
+	if (stop_threads) {
+		/* ack_receiver thread and ack_sender workqueue are implicitly
+		 * stopped by receiver in conn_disconnect() */
+		drbd_thread_stop(&connection->receiver);
+		drbd_thread_stop(&connection->worker);
+	}
+}
+
+/* Make sure IO is suspended before calling this function(). */
+static void drbd_suspend_al(struct drbd_device *device)
+{
+	int s = 0;
+
+	if (!lc_try_lock(device->act_log)) {
+		drbd_warn(device, "Failed to lock al in drbd_suspend_al()\n");
+		return;
+	}
+
+	drbd_al_shrink(device);
+	spin_lock_irq(&device->resource->req_lock);
+	if (device->state.conn < C_CONNECTED)
+		s = !test_and_set_bit(AL_SUSPENDED, &device->flags);
+	spin_unlock_irq(&device->resource->req_lock);
+	lc_unlock(device->act_log);
+
+	if (s)
+		drbd_info(device, "Suspended AL updates\n");
+}
+
+
+static bool should_set_defaults(struct genl_info *info)
+{
+	unsigned flags = ((struct drbd_genlmsghdr*)info->userhdr)->flags;
+	return 0 != (flags & DRBD_GENL_F_SET_DEFAULTS);
+}
+
+static unsigned int drbd_al_extents_max(struct drbd_backing_dev *bdev)
+{
+	/* This is limited by 16 bit "slot" numbers,
+	 * and by available on-disk context storage.
+	 *
+	 * Also (u16)~0 is special (denotes a "free" extent).
+	 *
+	 * One transaction occupies one 4kB on-disk block,
+	 * we have n such blocks in the on disk ring buffer,
+	 * the "current" transaction may fail (n-1),
+	 * and there is 919 slot numbers context information per transaction.
+	 *
+	 * 72 transaction blocks amounts to more than 2**16 context slots,
+	 * so cap there first.
+	 */
+	const unsigned int max_al_nr = DRBD_AL_EXTENTS_MAX;
+	const unsigned int sufficient_on_disk =
+		(max_al_nr + AL_CONTEXT_PER_TRANSACTION -1)
+		/AL_CONTEXT_PER_TRANSACTION;
+
+	unsigned int al_size_4k = bdev->md.al_size_4k;
+
+	if (al_size_4k > sufficient_on_disk)
+		return max_al_nr;
+
+	return (al_size_4k - 1) * AL_CONTEXT_PER_TRANSACTION;
+}
+
+static bool write_ordering_changed(struct disk_conf *a, struct disk_conf *b)
+{
+	return	a->disk_barrier != b->disk_barrier ||
+		a->disk_flushes != b->disk_flushes ||
+		a->disk_drain != b->disk_drain;
+}
+
+static void sanitize_disk_conf(struct drbd_device *device, struct disk_conf *disk_conf,
+			       struct drbd_backing_dev *nbc)
+{
+	struct block_device *bdev = nbc->backing_bdev;
+
+	if (disk_conf->al_extents < DRBD_AL_EXTENTS_MIN)
+		disk_conf->al_extents = DRBD_AL_EXTENTS_MIN;
+	if (disk_conf->al_extents > drbd_al_extents_max(nbc))
+		disk_conf->al_extents = drbd_al_extents_max(nbc);
+
+	if (!bdev_max_discard_sectors(bdev)) {
+		if (disk_conf->rs_discard_granularity) {
+			disk_conf->rs_discard_granularity = 0; /* disable feature */
+			drbd_info(device, "rs_discard_granularity feature disabled\n");
+		}
+	}
+
+	if (disk_conf->rs_discard_granularity) {
+		int orig_value = disk_conf->rs_discard_granularity;
+		sector_t discard_size = bdev_max_discard_sectors(bdev) << 9;
+		unsigned int discard_granularity = bdev_discard_granularity(bdev);
+		int remainder;
+
+		if (discard_granularity > disk_conf->rs_discard_granularity)
+			disk_conf->rs_discard_granularity = discard_granularity;
+
+		remainder = disk_conf->rs_discard_granularity %
+				discard_granularity;
+		disk_conf->rs_discard_granularity += remainder;
+
+		if (disk_conf->rs_discard_granularity > discard_size)
+			disk_conf->rs_discard_granularity = discard_size;
+
+		if (disk_conf->rs_discard_granularity != orig_value)
+			drbd_info(device, "rs_discard_granularity changed to %d\n",
+				  disk_conf->rs_discard_granularity);
+	}
+}
+
+static int disk_opts_check_al_size(struct drbd_device *device, struct disk_conf *dc)
+{
+	int err = -EBUSY;
+
+	if (device->act_log &&
+	    device->act_log->nr_elements == dc->al_extents)
+		return 0;
+
+	drbd_suspend_io(device);
+	/* If IO completion is currently blocked, we would likely wait
+	 * "forever" for the activity log to become unused. So we don't. */
+	if (atomic_read(&device->ap_bio_cnt))
+		goto out;
+
+	wait_event(device->al_wait, lc_try_lock(device->act_log));
+	drbd_al_shrink(device);
+	err = drbd_check_al_size(device, dc);
+	lc_unlock(device->act_log);
+	wake_up(&device->al_wait);
+out:
+	drbd_resume_io(device);
+	return err;
+}
+
+int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
+{
+	struct drbd_config_context adm_ctx;
+	enum drbd_ret_code retcode;
+	struct drbd_device *device;
+	struct disk_conf *new_disk_conf, *old_disk_conf;
+	struct fifo_buffer *old_plan = NULL, *new_plan = NULL;
+	int err;
+	unsigned int fifo_size;
+
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+	if (!adm_ctx.reply_skb)
+		return retcode;
+	if (retcode != NO_ERROR)
+		goto finish;
+
+	device = adm_ctx.device;
+	mutex_lock(&adm_ctx.resource->adm_mutex);
+
+	/* we also need a disk
+	 * to change the options on */
+	if (!get_ldev(device)) {
+		retcode = ERR_NO_DISK;
+		goto out;
+	}
+
+	new_disk_conf = kmalloc(sizeof(struct disk_conf), GFP_KERNEL);
+	if (!new_disk_conf) {
+		retcode = ERR_NOMEM;
+		goto fail;
+	}
+
+	mutex_lock(&device->resource->conf_update);
+	old_disk_conf = device->ldev->disk_conf;
+	*new_disk_conf = *old_disk_conf;
+	if (should_set_defaults(info))
+		set_disk_conf_defaults(new_disk_conf);
+
+	err = disk_conf_from_attrs_for_change(new_disk_conf, info);
+	if (err && err != -ENOMSG) {
+		retcode = ERR_MANDATORY_TAG;
+		drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+		goto fail_unlock;
+	}
+
+	if (!expect(new_disk_conf->resync_rate >= 1))
+		new_disk_conf->resync_rate = 1;
+
+	sanitize_disk_conf(device, new_disk_conf, device->ldev);
+
+	if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX)
+		new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX;
+
+	fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ;
+	if (fifo_size != device->rs_plan_s->size) {
+		new_plan = fifo_alloc(fifo_size);
+		if (!new_plan) {
+			drbd_err(device, "kmalloc of fifo_buffer failed");
+			retcode = ERR_NOMEM;
+			goto fail_unlock;
+		}
+	}
+
+	err = disk_opts_check_al_size(device, new_disk_conf);
+	if (err) {
+		/* Could be just "busy". Ignore?
+		 * Introduce dedicated error code? */
+		drbd_msg_put_info(adm_ctx.reply_skb,
+			"Try again without changing current al-extents setting");
+		retcode = ERR_NOMEM;
+		goto fail_unlock;
+	}
+
+	lock_all_resources();
+	retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after);
+	if (retcode == NO_ERROR) {
+		rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf);
+		drbd_resync_after_changed(device);
+	}
+	unlock_all_resources();
+
+	if (retcode != NO_ERROR)
+		goto fail_unlock;
+
+	if (new_plan) {
+		old_plan = device->rs_plan_s;
+		rcu_assign_pointer(device->rs_plan_s, new_plan);
+	}
+
+	mutex_unlock(&device->resource->conf_update);
+
+	if (new_disk_conf->al_updates)
+		device->ldev->md.flags &= ~MDF_AL_DISABLED;
+	else
+		device->ldev->md.flags |= MDF_AL_DISABLED;
+
+	if (new_disk_conf->md_flushes)
+		clear_bit(MD_NO_FUA, &device->flags);
+	else
+		set_bit(MD_NO_FUA, &device->flags);
+
+	if (write_ordering_changed(old_disk_conf, new_disk_conf))
+		drbd_bump_write_ordering(device->resource, NULL, WO_BDEV_FLUSH);
+
+	if (old_disk_conf->discard_zeroes_if_aligned !=
+	    new_disk_conf->discard_zeroes_if_aligned)
+		drbd_reconsider_queue_parameters(device, device->ldev, NULL);
+
+	drbd_md_sync(device);
+
+	if (device->state.conn >= C_CONNECTED) {
+		struct drbd_peer_device *peer_device;
+
+		for_each_peer_device(peer_device, device)
+			drbd_send_sync_param(peer_device);
+	}
+
+	kvfree_rcu(old_disk_conf);
+	kfree(old_plan);
+	mod_timer(&device->request_timer, jiffies + HZ);
+	goto success;
+
+fail_unlock:
+	mutex_unlock(&device->resource->conf_update);
+ fail:
+	kfree(new_disk_conf);
+	kfree(new_plan);
+success:
+	put_ldev(device);
+ out:
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
+ finish:
+	drbd_adm_finish(&adm_ctx, info, retcode);
+	return 0;
+}
+
+static struct block_device *open_backing_dev(struct drbd_device *device,
+		const char *bdev_path, void *claim_ptr, bool do_bd_link)
+{
+	struct block_device *bdev;
+	int err = 0;
+
+	bdev = blkdev_get_by_path(bdev_path,
+				  FMODE_READ | FMODE_WRITE | FMODE_EXCL, claim_ptr);
+	if (IS_ERR(bdev)) {
+		drbd_err(device, "open(\"%s\") failed with %ld\n",
+				bdev_path, PTR_ERR(bdev));
+		return bdev;
+	}
+
+	if (!do_bd_link)
+		return bdev;
+
+	err = bd_link_disk_holder(bdev, device->vdisk);
+	if (err) {
+		blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+		drbd_err(device, "bd_link_disk_holder(\"%s\", ...) failed with %d\n",
+				bdev_path, err);
+		bdev = ERR_PTR(err);
+	}
+	return bdev;
+}
+
+static int open_backing_devices(struct drbd_device *device,
+		struct disk_conf *new_disk_conf,
+		struct drbd_backing_dev *nbc)
+{
+	struct block_device *bdev;
+
+	bdev = open_backing_dev(device, new_disk_conf->backing_dev, device, true);
+	if (IS_ERR(bdev))
+		return ERR_OPEN_DISK;
+	nbc->backing_bdev = bdev;
+
+	/*
+	 * meta_dev_idx >= 0: external fixed size, possibly multiple
+	 * drbd sharing one meta device.  TODO in that case, paranoia
+	 * check that [md_bdev, meta_dev_idx] is not yet used by some
+	 * other drbd minor!  (if you use drbd.conf + drbdadm, that
+	 * should check it for you already; but if you don't, or
+	 * someone fooled it, we need to double check here)
+	 */
+	bdev = open_backing_dev(device, new_disk_conf->meta_dev,
+		/* claim ptr: device, if claimed exclusively; shared drbd_m_holder,
+		 * if potentially shared with other drbd minors */
+			(new_disk_conf->meta_dev_idx < 0) ? (void*)device : (void*)drbd_m_holder,
+		/* avoid double bd_claim_by_disk() for the same (source,target) tuple,
+		 * as would happen with internal metadata. */
+			(new_disk_conf->meta_dev_idx != DRBD_MD_INDEX_FLEX_INT &&
+			 new_disk_conf->meta_dev_idx != DRBD_MD_INDEX_INTERNAL));
+	if (IS_ERR(bdev))
+		return ERR_OPEN_MD_DISK;
+	nbc->md_bdev = bdev;
+	return NO_ERROR;
+}
+
+static void close_backing_dev(struct drbd_device *device, struct block_device *bdev,
+	bool do_bd_unlink)
+{
+	if (!bdev)
+		return;
+	if (do_bd_unlink)
+		bd_unlink_disk_holder(bdev, device->vdisk);
+	blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+}
+
+void drbd_backing_dev_free(struct drbd_device *device, struct drbd_backing_dev *ldev)
+{
+	if (ldev == NULL)
+		return;
+
+	close_backing_dev(device, ldev->md_bdev, ldev->md_bdev != ldev->backing_bdev);
+	close_backing_dev(device, ldev->backing_bdev, true);
+
+	kfree(ldev->disk_conf);
+	kfree(ldev);
+}
+
+int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
+{
+	struct drbd_config_context adm_ctx;
+	struct drbd_device *device;
+	struct drbd_peer_device *peer_device;
+	struct drbd_connection *connection;
+	int err;
+	enum drbd_ret_code retcode;
+	enum determine_dev_size dd;
+	sector_t max_possible_sectors;
+	sector_t min_md_device_sectors;
+	struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */
+	struct disk_conf *new_disk_conf = NULL;
+	struct lru_cache *resync_lru = NULL;
+	struct fifo_buffer *new_plan = NULL;
+	union drbd_state ns, os;
+	enum drbd_state_rv rv;
+	struct net_conf *nc;
+
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+	if (!adm_ctx.reply_skb)
+		return retcode;
+	if (retcode != NO_ERROR)
+		goto finish;
+
+	device = adm_ctx.device;
+	mutex_lock(&adm_ctx.resource->adm_mutex);
+	peer_device = first_peer_device(device);
+	connection = peer_device->connection;
+	conn_reconfig_start(connection);
+
+	/* if you want to reconfigure, please tear down first */
+	if (device->state.disk > D_DISKLESS) {
+		retcode = ERR_DISK_CONFIGURED;
+		goto fail;
+	}
+	/* It may just now have detached because of IO error.  Make sure
+	 * drbd_ldev_destroy is done already, we may end up here very fast,
+	 * e.g. if someone calls attach from the on-io-error handler,
+	 * to realize a "hot spare" feature (not that I'd recommend that) */
+	wait_event(device->misc_wait, !test_bit(GOING_DISKLESS, &device->flags));
+
+	/* make sure there is no leftover from previous force-detach attempts */
+	clear_bit(FORCE_DETACH, &device->flags);
+	clear_bit(WAS_IO_ERROR, &device->flags);
+	clear_bit(WAS_READ_ERROR, &device->flags);
+
+	/* and no leftover from previously aborted resync or verify, either */
+	device->rs_total = 0;
+	device->rs_failed = 0;
+	atomic_set(&device->rs_pending_cnt, 0);
+
+	/* allocation not in the IO path, drbdsetup context */
+	nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL);
+	if (!nbc) {
+		retcode = ERR_NOMEM;
+		goto fail;
+	}
+	spin_lock_init(&nbc->md.uuid_lock);
+
+	new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
+	if (!new_disk_conf) {
+		retcode = ERR_NOMEM;
+		goto fail;
+	}
+	nbc->disk_conf = new_disk_conf;
+
+	set_disk_conf_defaults(new_disk_conf);
+	err = disk_conf_from_attrs(new_disk_conf, info);
+	if (err) {
+		retcode = ERR_MANDATORY_TAG;
+		drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+		goto fail;
+	}
+
+	if (new_disk_conf->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX)
+		new_disk_conf->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX;
+
+	new_plan = fifo_alloc((new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ);
+	if (!new_plan) {
+		retcode = ERR_NOMEM;
+		goto fail;
+	}
+
+	if (new_disk_conf->meta_dev_idx < DRBD_MD_INDEX_FLEX_INT) {
+		retcode = ERR_MD_IDX_INVALID;
+		goto fail;
+	}
+
+	rcu_read_lock();
+	nc = rcu_dereference(connection->net_conf);
+	if (nc) {
+		if (new_disk_conf->fencing == FP_STONITH && nc->wire_protocol == DRBD_PROT_A) {
+			rcu_read_unlock();
+			retcode = ERR_STONITH_AND_PROT_A;
+			goto fail;
+		}
+	}
+	rcu_read_unlock();
+
+	retcode = open_backing_devices(device, new_disk_conf, nbc);
+	if (retcode != NO_ERROR)
+		goto fail;
+
+	if ((nbc->backing_bdev == nbc->md_bdev) !=
+	    (new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
+	     new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) {
+		retcode = ERR_MD_IDX_INVALID;
+		goto fail;
+	}
+
+	resync_lru = lc_create("resync", drbd_bm_ext_cache,
+			1, 61, sizeof(struct bm_extent),
+			offsetof(struct bm_extent, lce));
+	if (!resync_lru) {
+		retcode = ERR_NOMEM;
+		goto fail;
+	}
+
+	/* Read our meta data super block early.
+	 * This also sets other on-disk offsets. */
+	retcode = drbd_md_read(device, nbc);
+	if (retcode != NO_ERROR)
+		goto fail;
+
+	sanitize_disk_conf(device, new_disk_conf, nbc);
+
+	if (drbd_get_max_capacity(nbc) < new_disk_conf->disk_size) {
+		drbd_err(device, "max capacity %llu smaller than disk size %llu\n",
+			(unsigned long long) drbd_get_max_capacity(nbc),
+			(unsigned long long) new_disk_conf->disk_size);
+		retcode = ERR_DISK_TOO_SMALL;
+		goto fail;
+	}
+
+	if (new_disk_conf->meta_dev_idx < 0) {
+		max_possible_sectors = DRBD_MAX_SECTORS_FLEX;
+		/* at least one MB, otherwise it does not make sense */
+		min_md_device_sectors = (2<<10);
+	} else {
+		max_possible_sectors = DRBD_MAX_SECTORS;
+		min_md_device_sectors = MD_128MB_SECT * (new_disk_conf->meta_dev_idx + 1);
+	}
+
+	if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) {
+		retcode = ERR_MD_DISK_TOO_SMALL;
+		drbd_warn(device, "refusing attach: md-device too small, "
+		     "at least %llu sectors needed for this meta-disk type\n",
+		     (unsigned long long) min_md_device_sectors);
+		goto fail;
+	}
+
+	/* Make sure the new disk is big enough
+	 * (we may currently be R_PRIMARY with no local disk...) */
+	if (drbd_get_max_capacity(nbc) < get_capacity(device->vdisk)) {
+		retcode = ERR_DISK_TOO_SMALL;
+		goto fail;
+	}
+
+	nbc->known_size = drbd_get_capacity(nbc->backing_bdev);
+
+	if (nbc->known_size > max_possible_sectors) {
+		drbd_warn(device, "==> truncating very big lower level device "
+			"to currently maximum possible %llu sectors <==\n",
+			(unsigned long long) max_possible_sectors);
+		if (new_disk_conf->meta_dev_idx >= 0)
+			drbd_warn(device, "==>> using internal or flexible "
+				      "meta data may help <<==\n");
+	}
+
+	drbd_suspend_io(device);
+	/* also wait for the last barrier ack. */
+	/* FIXME see also https://daiquiri.linbit/cgi-bin/bugzilla/show_bug.cgi?id=171
+	 * We need a way to either ignore barrier acks for barriers sent before a device
+	 * was attached, or a way to wait for all pending barrier acks to come in.
+	 * As barriers are counted per resource,
+	 * we'd need to suspend io on all devices of a resource.
+	 */
+	wait_event(device->misc_wait, !atomic_read(&device->ap_pending_cnt) || drbd_suspended(device));
+	/* and for any other previously queued work */
+	drbd_flush_workqueue(&connection->sender_work);
+
+	rv = _drbd_request_state(device, NS(disk, D_ATTACHING), CS_VERBOSE);
+	retcode = (enum drbd_ret_code)rv;
+	drbd_resume_io(device);
+	if (rv < SS_SUCCESS)
+		goto fail;
+
+	if (!get_ldev_if_state(device, D_ATTACHING))
+		goto force_diskless;
+
+	if (!device->bitmap) {
+		if (drbd_bm_init(device)) {
+			retcode = ERR_NOMEM;
+			goto force_diskless_dec;
+		}
+	}
+
+	if (device->state.pdsk != D_UP_TO_DATE && device->ed_uuid &&
+	    (device->state.role == R_PRIMARY || device->state.peer == R_PRIMARY) &&
+            (device->ed_uuid & ~((u64)1)) != (nbc->md.uuid[UI_CURRENT] & ~((u64)1))) {
+		drbd_err(device, "Can only attach to data with current UUID=%016llX\n",
+		    (unsigned long long)device->ed_uuid);
+		retcode = ERR_DATA_NOT_CURRENT;
+		goto force_diskless_dec;
+	}
+
+	/* Since we are diskless, fix the activity log first... */
+	if (drbd_check_al_size(device, new_disk_conf)) {
+		retcode = ERR_NOMEM;
+		goto force_diskless_dec;
+	}
+
+	/* Prevent shrinking of consistent devices ! */
+	{
+	unsigned long long nsz = drbd_new_dev_size(device, nbc, nbc->disk_conf->disk_size, 0);
+	unsigned long long eff = nbc->md.la_size_sect;
+	if (drbd_md_test_flag(nbc, MDF_CONSISTENT) && nsz < eff) {
+		if (nsz == nbc->disk_conf->disk_size) {
+			drbd_warn(device, "truncating a consistent device during attach (%llu < %llu)\n", nsz, eff);
+		} else {
+			drbd_warn(device, "refusing to truncate a consistent device (%llu < %llu)\n", nsz, eff);
+			drbd_msg_sprintf_info(adm_ctx.reply_skb,
+				"To-be-attached device has last effective > current size, and is consistent\n"
+				"(%llu > %llu sectors). Refusing to attach.", eff, nsz);
+			retcode = ERR_IMPLICIT_SHRINK;
+			goto force_diskless_dec;
+		}
+	}
+	}
+
+	lock_all_resources();
+	retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after);
+	if (retcode != NO_ERROR) {
+		unlock_all_resources();
+		goto force_diskless_dec;
+	}
+
+	/* Reset the "barriers don't work" bits here, then force meta data to
+	 * be written, to ensure we determine if barriers are supported. */
+	if (new_disk_conf->md_flushes)
+		clear_bit(MD_NO_FUA, &device->flags);
+	else
+		set_bit(MD_NO_FUA, &device->flags);
+
+	/* Point of no return reached.
+	 * Devices and memory are no longer released by error cleanup below.
+	 * now device takes over responsibility, and the state engine should
+	 * clean it up somewhere.  */
+	D_ASSERT(device, device->ldev == NULL);
+	device->ldev = nbc;
+	device->resync = resync_lru;
+	device->rs_plan_s = new_plan;
+	nbc = NULL;
+	resync_lru = NULL;
+	new_disk_conf = NULL;
+	new_plan = NULL;
+
+	drbd_resync_after_changed(device);
+	drbd_bump_write_ordering(device->resource, device->ldev, WO_BDEV_FLUSH);
+	unlock_all_resources();
+
+	if (drbd_md_test_flag(device->ldev, MDF_CRASHED_PRIMARY))
+		set_bit(CRASHED_PRIMARY, &device->flags);
+	else
+		clear_bit(CRASHED_PRIMARY, &device->flags);
+
+	if (drbd_md_test_flag(device->ldev, MDF_PRIMARY_IND) &&
+	    !(device->state.role == R_PRIMARY && device->resource->susp_nod))
+		set_bit(CRASHED_PRIMARY, &device->flags);
+
+	device->send_cnt = 0;
+	device->recv_cnt = 0;
+	device->read_cnt = 0;
+	device->writ_cnt = 0;
+
+	drbd_reconsider_queue_parameters(device, device->ldev, NULL);
+
+	/* If I am currently not R_PRIMARY,
+	 * but meta data primary indicator is set,
+	 * I just now recover from a hard crash,
+	 * and have been R_PRIMARY before that crash.
+	 *
+	 * Now, if I had no connection before that crash
+	 * (have been degraded R_PRIMARY), chances are that
+	 * I won't find my peer now either.
+	 *
+	 * In that case, and _only_ in that case,
+	 * we use the degr-wfc-timeout instead of the default,
+	 * so we can automatically recover from a crash of a
+	 * degraded but active "cluster" after a certain timeout.
+	 */
+	clear_bit(USE_DEGR_WFC_T, &device->flags);
+	if (device->state.role != R_PRIMARY &&
+	     drbd_md_test_flag(device->ldev, MDF_PRIMARY_IND) &&
+	    !drbd_md_test_flag(device->ldev, MDF_CONNECTED_IND))
+		set_bit(USE_DEGR_WFC_T, &device->flags);
+
+	dd = drbd_determine_dev_size(device, 0, NULL);
+	if (dd <= DS_ERROR) {
+		retcode = ERR_NOMEM_BITMAP;
+		goto force_diskless_dec;
+	} else if (dd == DS_GREW)
+		set_bit(RESYNC_AFTER_NEG, &device->flags);
+
+	if (drbd_md_test_flag(device->ldev, MDF_FULL_SYNC) ||
+	    (test_bit(CRASHED_PRIMARY, &device->flags) &&
+	     drbd_md_test_flag(device->ldev, MDF_AL_DISABLED))) {
+		drbd_info(device, "Assuming that all blocks are out of sync "
+		     "(aka FullSync)\n");
+		if (drbd_bitmap_io(device, &drbd_bmio_set_n_write,
+			"set_n_write from attaching", BM_LOCKED_MASK)) {
+			retcode = ERR_IO_MD_DISK;
+			goto force_diskless_dec;
+		}
+	} else {
+		if (drbd_bitmap_io(device, &drbd_bm_read,
+			"read from attaching", BM_LOCKED_MASK)) {
+			retcode = ERR_IO_MD_DISK;
+			goto force_diskless_dec;
+		}
+	}
+
+	if (_drbd_bm_total_weight(device) == drbd_bm_bits(device))
+		drbd_suspend_al(device); /* IO is still suspended here... */
+
+	spin_lock_irq(&device->resource->req_lock);
+	os = drbd_read_state(device);
+	ns = os;
+	/* If MDF_CONSISTENT is not set go into inconsistent state,
+	   otherwise investigate MDF_WasUpToDate...
+	   If MDF_WAS_UP_TO_DATE is not set go into D_OUTDATED disk state,
+	   otherwise into D_CONSISTENT state.
+	*/
+	if (drbd_md_test_flag(device->ldev, MDF_CONSISTENT)) {
+		if (drbd_md_test_flag(device->ldev, MDF_WAS_UP_TO_DATE))
+			ns.disk = D_CONSISTENT;
+		else
+			ns.disk = D_OUTDATED;
+	} else {
+		ns.disk = D_INCONSISTENT;
+	}
+
+	if (drbd_md_test_flag(device->ldev, MDF_PEER_OUT_DATED))
+		ns.pdsk = D_OUTDATED;
+
+	rcu_read_lock();
+	if (ns.disk == D_CONSISTENT &&
+	    (ns.pdsk == D_OUTDATED || rcu_dereference(device->ldev->disk_conf)->fencing == FP_DONT_CARE))
+		ns.disk = D_UP_TO_DATE;
+
+	/* All tests on MDF_PRIMARY_IND, MDF_CONNECTED_IND,
+	   MDF_CONSISTENT and MDF_WAS_UP_TO_DATE must happen before
+	   this point, because drbd_request_state() modifies these
+	   flags. */
+
+	if (rcu_dereference(device->ldev->disk_conf)->al_updates)
+		device->ldev->md.flags &= ~MDF_AL_DISABLED;
+	else
+		device->ldev->md.flags |= MDF_AL_DISABLED;
+
+	rcu_read_unlock();
+
+	/* In case we are C_CONNECTED postpone any decision on the new disk
+	   state after the negotiation phase. */
+	if (device->state.conn == C_CONNECTED) {
+		device->new_state_tmp.i = ns.i;
+		ns.i = os.i;
+		ns.disk = D_NEGOTIATING;
+
+		/* We expect to receive up-to-date UUIDs soon.
+		   To avoid a race in receive_state, free p_uuid while
+		   holding req_lock. I.e. atomic with the state change */
+		kfree(device->p_uuid);
+		device->p_uuid = NULL;
+	}
+
+	rv = _drbd_set_state(device, ns, CS_VERBOSE, NULL);
+	spin_unlock_irq(&device->resource->req_lock);
+
+	if (rv < SS_SUCCESS)
+		goto force_diskless_dec;
+
+	mod_timer(&device->request_timer, jiffies + HZ);
+
+	if (device->state.role == R_PRIMARY)
+		device->ldev->md.uuid[UI_CURRENT] |=  (u64)1;
+	else
+		device->ldev->md.uuid[UI_CURRENT] &= ~(u64)1;
+
+	drbd_md_mark_dirty(device);
+	drbd_md_sync(device);
+
+	kobject_uevent(&disk_to_dev(device->vdisk)->kobj, KOBJ_CHANGE);
+	put_ldev(device);
+	conn_reconfig_done(connection);
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
+	drbd_adm_finish(&adm_ctx, info, retcode);
+	return 0;
+
+ force_diskless_dec:
+	put_ldev(device);
+ force_diskless:
+	drbd_force_state(device, NS(disk, D_DISKLESS));
+	drbd_md_sync(device);
+ fail:
+	conn_reconfig_done(connection);
+	if (nbc) {
+		close_backing_dev(device, nbc->md_bdev, nbc->md_bdev != nbc->backing_bdev);
+		close_backing_dev(device, nbc->backing_bdev, true);
+		kfree(nbc);
+	}
+	kfree(new_disk_conf);
+	lc_destroy(resync_lru);
+	kfree(new_plan);
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
+ finish:
+	drbd_adm_finish(&adm_ctx, info, retcode);
+	return 0;
+}
+
+static int adm_detach(struct drbd_device *device, int force)
+{
+	if (force) {
+		set_bit(FORCE_DETACH, &device->flags);
+		drbd_force_state(device, NS(disk, D_FAILED));
+		return SS_SUCCESS;
+	}
+
+	return drbd_request_detach_interruptible(device);
+}
+
+/* Detaching the disk is a process in multiple stages.  First we need to lock
+ * out application IO, in-flight IO, IO stuck in drbd_al_begin_io.
+ * Then we transition to D_DISKLESS, and wait for put_ldev() to return all
+ * internal references as well.
+ * Only then we have finally detached. */
+int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info)
+{
+	struct drbd_config_context adm_ctx;
+	enum drbd_ret_code retcode;
+	struct detach_parms parms = { };
+	int err;
+
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+	if (!adm_ctx.reply_skb)
+		return retcode;
+	if (retcode != NO_ERROR)
+		goto out;
+
+	if (info->attrs[DRBD_NLA_DETACH_PARMS]) {
+		err = detach_parms_from_attrs(&parms, info);
+		if (err) {
+			retcode = ERR_MANDATORY_TAG;
+			drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+			goto out;
+		}
+	}
+
+	mutex_lock(&adm_ctx.resource->adm_mutex);
+	retcode = adm_detach(adm_ctx.device, parms.force_detach);
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
+out:
+	drbd_adm_finish(&adm_ctx, info, retcode);
+	return 0;
+}
+
+static bool conn_resync_running(struct drbd_connection *connection)
+{
+	struct drbd_peer_device *peer_device;
+	bool rv = false;
+	int vnr;
+
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+		struct drbd_device *device = peer_device->device;
+		if (device->state.conn == C_SYNC_SOURCE ||
+		    device->state.conn == C_SYNC_TARGET ||
+		    device->state.conn == C_PAUSED_SYNC_S ||
+		    device->state.conn == C_PAUSED_SYNC_T) {
+			rv = true;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return rv;
+}
+
+static bool conn_ov_running(struct drbd_connection *connection)
+{
+	struct drbd_peer_device *peer_device;
+	bool rv = false;
+	int vnr;
+
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+		struct drbd_device *device = peer_device->device;
+		if (device->state.conn == C_VERIFY_S ||
+		    device->state.conn == C_VERIFY_T) {
+			rv = true;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return rv;
+}
+
+static enum drbd_ret_code
+_check_net_options(struct drbd_connection *connection, struct net_conf *old_net_conf, struct net_conf *new_net_conf)
+{
+	struct drbd_peer_device *peer_device;
+	int i;
+
+	if (old_net_conf && connection->cstate == C_WF_REPORT_PARAMS && connection->agreed_pro_version < 100) {
+		if (new_net_conf->wire_protocol != old_net_conf->wire_protocol)
+			return ERR_NEED_APV_100;
+
+		if (new_net_conf->two_primaries != old_net_conf->two_primaries)
+			return ERR_NEED_APV_100;
+
+		if (strcmp(new_net_conf->integrity_alg, old_net_conf->integrity_alg))
+			return ERR_NEED_APV_100;
+	}
+
+	if (!new_net_conf->two_primaries &&
+	    conn_highest_role(connection) == R_PRIMARY &&
+	    conn_highest_peer(connection) == R_PRIMARY)
+		return ERR_NEED_ALLOW_TWO_PRI;
+
+	if (new_net_conf->two_primaries &&
+	    (new_net_conf->wire_protocol != DRBD_PROT_C))
+		return ERR_NOT_PROTO_C;
+
+	idr_for_each_entry(&connection->peer_devices, peer_device, i) {
+		struct drbd_device *device = peer_device->device;
+		if (get_ldev(device)) {
+			enum drbd_fencing_p fp = rcu_dereference(device->ldev->disk_conf)->fencing;
+			put_ldev(device);
+			if (new_net_conf->wire_protocol == DRBD_PROT_A && fp == FP_STONITH)
+				return ERR_STONITH_AND_PROT_A;
+		}
+		if (device->state.role == R_PRIMARY && new_net_conf->discard_my_data)
+			return ERR_DISCARD_IMPOSSIBLE;
+	}
+
+	if (new_net_conf->on_congestion != OC_BLOCK && new_net_conf->wire_protocol != DRBD_PROT_A)
+		return ERR_CONG_NOT_PROTO_A;
+
+	return NO_ERROR;
+}
+
+static enum drbd_ret_code
+check_net_options(struct drbd_connection *connection, struct net_conf *new_net_conf)
+{
+	enum drbd_ret_code rv;
+	struct drbd_peer_device *peer_device;
+	int i;
+
+	rcu_read_lock();
+	rv = _check_net_options(connection, rcu_dereference(connection->net_conf), new_net_conf);
+	rcu_read_unlock();
+
+	/* connection->peer_devices protected by genl_lock() here */
+	idr_for_each_entry(&connection->peer_devices, peer_device, i) {
+		struct drbd_device *device = peer_device->device;
+		if (!device->bitmap) {
+			if (drbd_bm_init(device))
+				return ERR_NOMEM;
+		}
+	}
+
+	return rv;
+}
+
+struct crypto {
+	struct crypto_shash *verify_tfm;
+	struct crypto_shash *csums_tfm;
+	struct crypto_shash *cram_hmac_tfm;
+	struct crypto_shash *integrity_tfm;
+};
+
+static int
+alloc_shash(struct crypto_shash **tfm, char *tfm_name, int err_alg)
+{
+	if (!tfm_name[0])
+		return NO_ERROR;
+
+	*tfm = crypto_alloc_shash(tfm_name, 0, 0);
+	if (IS_ERR(*tfm)) {
+		*tfm = NULL;
+		return err_alg;
+	}
+
+	return NO_ERROR;
+}
+
+static enum drbd_ret_code
+alloc_crypto(struct crypto *crypto, struct net_conf *new_net_conf)
+{
+	char hmac_name[CRYPTO_MAX_ALG_NAME];
+	enum drbd_ret_code rv;
+
+	rv = alloc_shash(&crypto->csums_tfm, new_net_conf->csums_alg,
+			 ERR_CSUMS_ALG);
+	if (rv != NO_ERROR)
+		return rv;
+	rv = alloc_shash(&crypto->verify_tfm, new_net_conf->verify_alg,
+			 ERR_VERIFY_ALG);
+	if (rv != NO_ERROR)
+		return rv;
+	rv = alloc_shash(&crypto->integrity_tfm, new_net_conf->integrity_alg,
+			 ERR_INTEGRITY_ALG);
+	if (rv != NO_ERROR)
+		return rv;
+	if (new_net_conf->cram_hmac_alg[0] != 0) {
+		snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)",
+			 new_net_conf->cram_hmac_alg);
+
+		rv = alloc_shash(&crypto->cram_hmac_tfm, hmac_name,
+				 ERR_AUTH_ALG);
+	}
+
+	return rv;
+}
+
+static void free_crypto(struct crypto *crypto)
+{
+	crypto_free_shash(crypto->cram_hmac_tfm);
+	crypto_free_shash(crypto->integrity_tfm);
+	crypto_free_shash(crypto->csums_tfm);
+	crypto_free_shash(crypto->verify_tfm);
+}
+
+int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info)
+{
+	struct drbd_config_context adm_ctx;
+	enum drbd_ret_code retcode;
+	struct drbd_connection *connection;
+	struct net_conf *old_net_conf, *new_net_conf = NULL;
+	int err;
+	int ovr; /* online verify running */
+	int rsr; /* re-sync running */
+	struct crypto crypto = { };
+
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_CONNECTION);
+	if (!adm_ctx.reply_skb)
+		return retcode;
+	if (retcode != NO_ERROR)
+		goto finish;
+
+	connection = adm_ctx.connection;
+	mutex_lock(&adm_ctx.resource->adm_mutex);
+
+	new_net_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL);
+	if (!new_net_conf) {
+		retcode = ERR_NOMEM;
+		goto out;
+	}
+
+	conn_reconfig_start(connection);
+
+	mutex_lock(&connection->data.mutex);
+	mutex_lock(&connection->resource->conf_update);
+	old_net_conf = connection->net_conf;
+
+	if (!old_net_conf) {
+		drbd_msg_put_info(adm_ctx.reply_skb, "net conf missing, try connect");
+		retcode = ERR_INVALID_REQUEST;
+		goto fail;
+	}
+
+	*new_net_conf = *old_net_conf;
+	if (should_set_defaults(info))
+		set_net_conf_defaults(new_net_conf);
+
+	err = net_conf_from_attrs_for_change(new_net_conf, info);
+	if (err && err != -ENOMSG) {
+		retcode = ERR_MANDATORY_TAG;
+		drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+		goto fail;
+	}
+
+	retcode = check_net_options(connection, new_net_conf);
+	if (retcode != NO_ERROR)
+		goto fail;
+
+	/* re-sync running */
+	rsr = conn_resync_running(connection);
+	if (rsr && strcmp(new_net_conf->csums_alg, old_net_conf->csums_alg)) {
+		retcode = ERR_CSUMS_RESYNC_RUNNING;
+		goto fail;
+	}
+
+	/* online verify running */
+	ovr = conn_ov_running(connection);
+	if (ovr && strcmp(new_net_conf->verify_alg, old_net_conf->verify_alg)) {
+		retcode = ERR_VERIFY_RUNNING;
+		goto fail;
+	}
+
+	retcode = alloc_crypto(&crypto, new_net_conf);
+	if (retcode != NO_ERROR)
+		goto fail;
+
+	rcu_assign_pointer(connection->net_conf, new_net_conf);
+
+	if (!rsr) {
+		crypto_free_shash(connection->csums_tfm);
+		connection->csums_tfm = crypto.csums_tfm;
+		crypto.csums_tfm = NULL;
+	}
+	if (!ovr) {
+		crypto_free_shash(connection->verify_tfm);
+		connection->verify_tfm = crypto.verify_tfm;
+		crypto.verify_tfm = NULL;
+	}
+
+	crypto_free_shash(connection->integrity_tfm);
+	connection->integrity_tfm = crypto.integrity_tfm;
+	if (connection->cstate >= C_WF_REPORT_PARAMS && connection->agreed_pro_version >= 100)
+		/* Do this without trying to take connection->data.mutex again.  */
+		__drbd_send_protocol(connection, P_PROTOCOL_UPDATE);
+
+	crypto_free_shash(connection->cram_hmac_tfm);
+	connection->cram_hmac_tfm = crypto.cram_hmac_tfm;
+
+	mutex_unlock(&connection->resource->conf_update);
+	mutex_unlock(&connection->data.mutex);
+	kvfree_rcu(old_net_conf);
+
+	if (connection->cstate >= C_WF_REPORT_PARAMS) {
+		struct drbd_peer_device *peer_device;
+		int vnr;
+
+		idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
+			drbd_send_sync_param(peer_device);
+	}
+
+	goto done;
+
+ fail:
+	mutex_unlock(&connection->resource->conf_update);
+	mutex_unlock(&connection->data.mutex);
+	free_crypto(&crypto);
+	kfree(new_net_conf);
+ done:
+	conn_reconfig_done(connection);
+ out:
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
+ finish:
+	drbd_adm_finish(&adm_ctx, info, retcode);
+	return 0;
+}
+
+static void connection_to_info(struct connection_info *info,
+			       struct drbd_connection *connection)
+{
+	info->conn_connection_state = connection->cstate;
+	info->conn_role = conn_highest_peer(connection);
+}
+
+static void peer_device_to_info(struct peer_device_info *info,
+				struct drbd_peer_device *peer_device)
+{
+	struct drbd_device *device = peer_device->device;
+
+	info->peer_repl_state =
+		max_t(enum drbd_conns, C_WF_REPORT_PARAMS, device->state.conn);
+	info->peer_disk_state = device->state.pdsk;
+	info->peer_resync_susp_user = device->state.user_isp;
+	info->peer_resync_susp_peer = device->state.peer_isp;
+	info->peer_resync_susp_dependency = device->state.aftr_isp;
+}
+
+int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info)
+{
+	struct connection_info connection_info;
+	enum drbd_notification_type flags;
+	unsigned int peer_devices = 0;
+	struct drbd_config_context adm_ctx;
+	struct drbd_peer_device *peer_device;
+	struct net_conf *old_net_conf, *new_net_conf = NULL;
+	struct crypto crypto = { };
+	struct drbd_resource *resource;
+	struct drbd_connection *connection;
+	enum drbd_ret_code retcode;
+	enum drbd_state_rv rv;
+	int i;
+	int err;
+
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE);
+
+	if (!adm_ctx.reply_skb)
+		return retcode;
+	if (retcode != NO_ERROR)
+		goto out;
+	if (!(adm_ctx.my_addr && adm_ctx.peer_addr)) {
+		drbd_msg_put_info(adm_ctx.reply_skb, "connection endpoint(s) missing");
+		retcode = ERR_INVALID_REQUEST;
+		goto out;
+	}
+
+	/* No need for _rcu here. All reconfiguration is
+	 * strictly serialized on genl_lock(). We are protected against
+	 * concurrent reconfiguration/addition/deletion */
+	for_each_resource(resource, &drbd_resources) {
+		for_each_connection(connection, resource) {
+			if (nla_len(adm_ctx.my_addr) == connection->my_addr_len &&
+			    !memcmp(nla_data(adm_ctx.my_addr), &connection->my_addr,
+				    connection->my_addr_len)) {
+				retcode = ERR_LOCAL_ADDR;
+				goto out;
+			}
+
+			if (nla_len(adm_ctx.peer_addr) == connection->peer_addr_len &&
+			    !memcmp(nla_data(adm_ctx.peer_addr), &connection->peer_addr,
+				    connection->peer_addr_len)) {
+				retcode = ERR_PEER_ADDR;
+				goto out;
+			}
+		}
+	}
+
+	mutex_lock(&adm_ctx.resource->adm_mutex);
+	connection = first_connection(adm_ctx.resource);
+	conn_reconfig_start(connection);
+
+	if (connection->cstate > C_STANDALONE) {
+		retcode = ERR_NET_CONFIGURED;
+		goto fail;
+	}
+
+	/* allocation not in the IO path, drbdsetup / netlink process context */
+	new_net_conf = kzalloc(sizeof(*new_net_conf), GFP_KERNEL);
+	if (!new_net_conf) {
+		retcode = ERR_NOMEM;
+		goto fail;
+	}
+
+	set_net_conf_defaults(new_net_conf);
+
+	err = net_conf_from_attrs(new_net_conf, info);
+	if (err && err != -ENOMSG) {
+		retcode = ERR_MANDATORY_TAG;
+		drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+		goto fail;
+	}
+
+	retcode = check_net_options(connection, new_net_conf);
+	if (retcode != NO_ERROR)
+		goto fail;
+
+	retcode = alloc_crypto(&crypto, new_net_conf);
+	if (retcode != NO_ERROR)
+		goto fail;
+
+	((char *)new_net_conf->shared_secret)[SHARED_SECRET_MAX-1] = 0;
+
+	drbd_flush_workqueue(&connection->sender_work);
+
+	mutex_lock(&adm_ctx.resource->conf_update);
+	old_net_conf = connection->net_conf;
+	if (old_net_conf) {
+		retcode = ERR_NET_CONFIGURED;
+		mutex_unlock(&adm_ctx.resource->conf_update);
+		goto fail;
+	}
+	rcu_assign_pointer(connection->net_conf, new_net_conf);
+
+	conn_free_crypto(connection);
+	connection->cram_hmac_tfm = crypto.cram_hmac_tfm;
+	connection->integrity_tfm = crypto.integrity_tfm;
+	connection->csums_tfm = crypto.csums_tfm;
+	connection->verify_tfm = crypto.verify_tfm;
+
+	connection->my_addr_len = nla_len(adm_ctx.my_addr);
+	memcpy(&connection->my_addr, nla_data(adm_ctx.my_addr), connection->my_addr_len);
+	connection->peer_addr_len = nla_len(adm_ctx.peer_addr);
+	memcpy(&connection->peer_addr, nla_data(adm_ctx.peer_addr), connection->peer_addr_len);
+
+	idr_for_each_entry(&connection->peer_devices, peer_device, i) {
+		peer_devices++;
+	}
+
+	connection_to_info(&connection_info, connection);
+	flags = (peer_devices--) ? NOTIFY_CONTINUES : 0;
+	mutex_lock(&notification_mutex);
+	notify_connection_state(NULL, 0, connection, &connection_info, NOTIFY_CREATE | flags);
+	idr_for_each_entry(&connection->peer_devices, peer_device, i) {
+		struct peer_device_info peer_device_info;
+
+		peer_device_to_info(&peer_device_info, peer_device);
+		flags = (peer_devices--) ? NOTIFY_CONTINUES : 0;
+		notify_peer_device_state(NULL, 0, peer_device, &peer_device_info, NOTIFY_CREATE | flags);
+	}
+	mutex_unlock(&notification_mutex);
+	mutex_unlock(&adm_ctx.resource->conf_update);
+
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, peer_device, i) {
+		struct drbd_device *device = peer_device->device;
+		device->send_cnt = 0;
+		device->recv_cnt = 0;
+	}
+	rcu_read_unlock();
+
+	rv = conn_request_state(connection, NS(conn, C_UNCONNECTED), CS_VERBOSE);
+
+	conn_reconfig_done(connection);
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
+	drbd_adm_finish(&adm_ctx, info, rv);
+	return 0;
+
+fail:
+	free_crypto(&crypto);
+	kfree(new_net_conf);
+
+	conn_reconfig_done(connection);
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
+out:
+	drbd_adm_finish(&adm_ctx, info, retcode);
+	return 0;
+}
+
+static enum drbd_state_rv conn_try_disconnect(struct drbd_connection *connection, bool force)
+{
+	enum drbd_conns cstate;
+	enum drbd_state_rv rv;
+
+repeat:
+	rv = conn_request_state(connection, NS(conn, C_DISCONNECTING),
+			force ? CS_HARD : 0);
+
+	switch (rv) {
+	case SS_NOTHING_TO_DO:
+		break;
+	case SS_ALREADY_STANDALONE:
+		return SS_SUCCESS;
+	case SS_PRIMARY_NOP:
+		/* Our state checking code wants to see the peer outdated. */
+		rv = conn_request_state(connection, NS2(conn, C_DISCONNECTING, pdsk, D_OUTDATED), 0);
+
+		if (rv == SS_OUTDATE_WO_CONN) /* lost connection before graceful disconnect succeeded */
+			rv = conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_VERBOSE);
+
+		break;
+	case SS_CW_FAILED_BY_PEER:
+		spin_lock_irq(&connection->resource->req_lock);
+		cstate = connection->cstate;
+		spin_unlock_irq(&connection->resource->req_lock);
+		if (cstate <= C_WF_CONNECTION)
+			goto repeat;
+		/* The peer probably wants to see us outdated. */
+		rv = conn_request_state(connection, NS2(conn, C_DISCONNECTING,
+							disk, D_OUTDATED), 0);
+		if (rv == SS_IS_DISKLESS || rv == SS_LOWER_THAN_OUTDATED) {
+			rv = conn_request_state(connection, NS(conn, C_DISCONNECTING),
+					CS_HARD);
+		}
+		break;
+	default:;
+		/* no special handling necessary */
+	}
+
+	if (rv >= SS_SUCCESS) {
+		enum drbd_state_rv rv2;
+		/* No one else can reconfigure the network while I am here.
+		 * The state handling only uses drbd_thread_stop_nowait(),
+		 * we want to really wait here until the receiver is no more.
+		 */
+		drbd_thread_stop(&connection->receiver);
+
+		/* Race breaker.  This additional state change request may be
+		 * necessary, if this was a forced disconnect during a receiver
+		 * restart.  We may have "killed" the receiver thread just
+		 * after drbd_receiver() returned.  Typically, we should be
+		 * C_STANDALONE already, now, and this becomes a no-op.
+		 */
+		rv2 = conn_request_state(connection, NS(conn, C_STANDALONE),
+				CS_VERBOSE | CS_HARD);
+		if (rv2 < SS_SUCCESS)
+			drbd_err(connection,
+				"unexpected rv2=%d in conn_try_disconnect()\n",
+				rv2);
+		/* Unlike in DRBD 9, the state engine has generated
+		 * NOTIFY_DESTROY events before clearing connection->net_conf. */
+	}
+	return rv;
+}
+
+int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info)
+{
+	struct drbd_config_context adm_ctx;
+	struct disconnect_parms parms;
+	struct drbd_connection *connection;
+	enum drbd_state_rv rv;
+	enum drbd_ret_code retcode;
+	int err;
+
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_CONNECTION);
+	if (!adm_ctx.reply_skb)
+		return retcode;
+	if (retcode != NO_ERROR)
+		goto fail;
+
+	connection = adm_ctx.connection;
+	memset(&parms, 0, sizeof(parms));
+	if (info->attrs[DRBD_NLA_DISCONNECT_PARMS]) {
+		err = disconnect_parms_from_attrs(&parms, info);
+		if (err) {
+			retcode = ERR_MANDATORY_TAG;
+			drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+			goto fail;
+		}
+	}
+
+	mutex_lock(&adm_ctx.resource->adm_mutex);
+	rv = conn_try_disconnect(connection, parms.force_disconnect);
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
+	if (rv < SS_SUCCESS) {
+		drbd_adm_finish(&adm_ctx, info, rv);
+		return 0;
+	}
+	retcode = NO_ERROR;
+ fail:
+	drbd_adm_finish(&adm_ctx, info, retcode);
+	return 0;
+}
+
+void resync_after_online_grow(struct drbd_device *device)
+{
+	int iass; /* I am sync source */
+
+	drbd_info(device, "Resync of new storage after online grow\n");
+	if (device->state.role != device->state.peer)
+		iass = (device->state.role == R_PRIMARY);
+	else
+		iass = test_bit(RESOLVE_CONFLICTS, &first_peer_device(device)->connection->flags);
+
+	if (iass)
+		drbd_start_resync(device, C_SYNC_SOURCE);
+	else
+		_drbd_request_state(device, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE + CS_SERIALIZE);
+}
+
+int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
+{
+	struct drbd_config_context adm_ctx;
+	struct disk_conf *old_disk_conf, *new_disk_conf = NULL;
+	struct resize_parms rs;
+	struct drbd_device *device;
+	enum drbd_ret_code retcode;
+	enum determine_dev_size dd;
+	bool change_al_layout = false;
+	enum dds_flags ddsf;
+	sector_t u_size;
+	int err;
+
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+	if (!adm_ctx.reply_skb)
+		return retcode;
+	if (retcode != NO_ERROR)
+		goto finish;
+
+	mutex_lock(&adm_ctx.resource->adm_mutex);
+	device = adm_ctx.device;
+	if (!get_ldev(device)) {
+		retcode = ERR_NO_DISK;
+		goto fail;
+	}
+
+	memset(&rs, 0, sizeof(struct resize_parms));
+	rs.al_stripes = device->ldev->md.al_stripes;
+	rs.al_stripe_size = device->ldev->md.al_stripe_size_4k * 4;
+	if (info->attrs[DRBD_NLA_RESIZE_PARMS]) {
+		err = resize_parms_from_attrs(&rs, info);
+		if (err) {
+			retcode = ERR_MANDATORY_TAG;
+			drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+			goto fail_ldev;
+		}
+	}
+
+	if (device->state.conn > C_CONNECTED) {
+		retcode = ERR_RESIZE_RESYNC;
+		goto fail_ldev;
+	}
+
+	if (device->state.role == R_SECONDARY &&
+	    device->state.peer == R_SECONDARY) {
+		retcode = ERR_NO_PRIMARY;
+		goto fail_ldev;
+	}
+
+	if (rs.no_resync && first_peer_device(device)->connection->agreed_pro_version < 93) {
+		retcode = ERR_NEED_APV_93;
+		goto fail_ldev;
+	}
+
+	rcu_read_lock();
+	u_size = rcu_dereference(device->ldev->disk_conf)->disk_size;
+	rcu_read_unlock();
+	if (u_size != (sector_t)rs.resize_size) {
+		new_disk_conf = kmalloc(sizeof(struct disk_conf), GFP_KERNEL);
+		if (!new_disk_conf) {
+			retcode = ERR_NOMEM;
+			goto fail_ldev;
+		}
+	}
+
+	if (device->ldev->md.al_stripes != rs.al_stripes ||
+	    device->ldev->md.al_stripe_size_4k != rs.al_stripe_size / 4) {
+		u32 al_size_k = rs.al_stripes * rs.al_stripe_size;
+
+		if (al_size_k > (16 * 1024 * 1024)) {
+			retcode = ERR_MD_LAYOUT_TOO_BIG;
+			goto fail_ldev;
+		}
+
+		if (al_size_k < MD_32kB_SECT/2) {
+			retcode = ERR_MD_LAYOUT_TOO_SMALL;
+			goto fail_ldev;
+		}
+
+		if (device->state.conn != C_CONNECTED && !rs.resize_force) {
+			retcode = ERR_MD_LAYOUT_CONNECTED;
+			goto fail_ldev;
+		}
+
+		change_al_layout = true;
+	}
+
+	if (device->ldev->known_size != drbd_get_capacity(device->ldev->backing_bdev))
+		device->ldev->known_size = drbd_get_capacity(device->ldev->backing_bdev);
+
+	if (new_disk_conf) {
+		mutex_lock(&device->resource->conf_update);
+		old_disk_conf = device->ldev->disk_conf;
+		*new_disk_conf = *old_disk_conf;
+		new_disk_conf->disk_size = (sector_t)rs.resize_size;
+		rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf);
+		mutex_unlock(&device->resource->conf_update);
+		kvfree_rcu(old_disk_conf);
+		new_disk_conf = NULL;
+	}
+
+	ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0);
+	dd = drbd_determine_dev_size(device, ddsf, change_al_layout ? &rs : NULL);
+	drbd_md_sync(device);
+	put_ldev(device);
+	if (dd == DS_ERROR) {
+		retcode = ERR_NOMEM_BITMAP;
+		goto fail;
+	} else if (dd == DS_ERROR_SPACE_MD) {
+		retcode = ERR_MD_LAYOUT_NO_FIT;
+		goto fail;
+	} else if (dd == DS_ERROR_SHRINK) {
+		retcode = ERR_IMPLICIT_SHRINK;
+		goto fail;
+	}
+
+	if (device->state.conn == C_CONNECTED) {
+		if (dd == DS_GREW)
+			set_bit(RESIZE_PENDING, &device->flags);
+
+		drbd_send_uuids(first_peer_device(device));
+		drbd_send_sizes(first_peer_device(device), 1, ddsf);
+	}
+
+ fail:
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
+ finish:
+	drbd_adm_finish(&adm_ctx, info, retcode);
+	return 0;
+
+ fail_ldev:
+	put_ldev(device);
+	kfree(new_disk_conf);
+	goto fail;
+}
+
+int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info)
+{
+	struct drbd_config_context adm_ctx;
+	enum drbd_ret_code retcode;
+	struct res_opts res_opts;
+	int err;
+
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE);
+	if (!adm_ctx.reply_skb)
+		return retcode;
+	if (retcode != NO_ERROR)
+		goto fail;
+
+	res_opts = adm_ctx.resource->res_opts;
+	if (should_set_defaults(info))
+		set_res_opts_defaults(&res_opts);
+
+	err = res_opts_from_attrs(&res_opts, info);
+	if (err && err != -ENOMSG) {
+		retcode = ERR_MANDATORY_TAG;
+		drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+		goto fail;
+	}
+
+	mutex_lock(&adm_ctx.resource->adm_mutex);
+	err = set_resource_options(adm_ctx.resource, &res_opts);
+	if (err) {
+		retcode = ERR_INVALID_REQUEST;
+		if (err == -ENOMEM)
+			retcode = ERR_NOMEM;
+	}
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
+
+fail:
+	drbd_adm_finish(&adm_ctx, info, retcode);
+	return 0;
+}
+
+int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info)
+{
+	struct drbd_config_context adm_ctx;
+	struct drbd_device *device;
+	int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */
+
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+	if (!adm_ctx.reply_skb)
+		return retcode;
+	if (retcode != NO_ERROR)
+		goto out;
+
+	device = adm_ctx.device;
+	if (!get_ldev(device)) {
+		retcode = ERR_NO_DISK;
+		goto out;
+	}
+
+	mutex_lock(&adm_ctx.resource->adm_mutex);
+
+	/* If there is still bitmap IO pending, probably because of a previous
+	 * resync just being finished, wait for it before requesting a new resync.
+	 * Also wait for it's after_state_ch(). */
+	drbd_suspend_io(device);
+	wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags));
+	drbd_flush_workqueue(&first_peer_device(device)->connection->sender_work);
+
+	/* If we happen to be C_STANDALONE R_SECONDARY, just change to
+	 * D_INCONSISTENT, and set all bits in the bitmap.  Otherwise,
+	 * try to start a resync handshake as sync target for full sync.
+	 */
+	if (device->state.conn == C_STANDALONE && device->state.role == R_SECONDARY) {
+		retcode = drbd_request_state(device, NS(disk, D_INCONSISTENT));
+		if (retcode >= SS_SUCCESS) {
+			if (drbd_bitmap_io(device, &drbd_bmio_set_n_write,
+				"set_n_write from invalidate", BM_LOCKED_MASK))
+				retcode = ERR_IO_MD_DISK;
+		}
+	} else
+		retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_T));
+	drbd_resume_io(device);
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
+	put_ldev(device);
+out:
+	drbd_adm_finish(&adm_ctx, info, retcode);
+	return 0;
+}
+
+static int drbd_adm_simple_request_state(struct sk_buff *skb, struct genl_info *info,
+		union drbd_state mask, union drbd_state val)
+{
+	struct drbd_config_context adm_ctx;
+	enum drbd_ret_code retcode;
+
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+	if (!adm_ctx.reply_skb)
+		return retcode;
+	if (retcode != NO_ERROR)
+		goto out;
+
+	mutex_lock(&adm_ctx.resource->adm_mutex);
+	retcode = drbd_request_state(adm_ctx.device, mask, val);
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
+out:
+	drbd_adm_finish(&adm_ctx, info, retcode);
+	return 0;
+}
+
+static int drbd_bmio_set_susp_al(struct drbd_device *device) __must_hold(local)
+{
+	int rv;
+
+	rv = drbd_bmio_set_n_write(device);
+	drbd_suspend_al(device);
+	return rv;
+}
+
+int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info)
+{
+	struct drbd_config_context adm_ctx;
+	int retcode; /* drbd_ret_code, drbd_state_rv */
+	struct drbd_device *device;
+
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+	if (!adm_ctx.reply_skb)
+		return retcode;
+	if (retcode != NO_ERROR)
+		goto out;
+
+	device = adm_ctx.device;
+	if (!get_ldev(device)) {
+		retcode = ERR_NO_DISK;
+		goto out;
+	}
+
+	mutex_lock(&adm_ctx.resource->adm_mutex);
+
+	/* If there is still bitmap IO pending, probably because of a previous
+	 * resync just being finished, wait for it before requesting a new resync.
+	 * Also wait for it's after_state_ch(). */
+	drbd_suspend_io(device);
+	wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags));
+	drbd_flush_workqueue(&first_peer_device(device)->connection->sender_work);
+
+	/* If we happen to be C_STANDALONE R_PRIMARY, just set all bits
+	 * in the bitmap.  Otherwise, try to start a resync handshake
+	 * as sync source for full sync.
+	 */
+	if (device->state.conn == C_STANDALONE && device->state.role == R_PRIMARY) {
+		/* The peer will get a resync upon connect anyways. Just make that
+		   into a full resync. */
+		retcode = drbd_request_state(device, NS(pdsk, D_INCONSISTENT));
+		if (retcode >= SS_SUCCESS) {
+			if (drbd_bitmap_io(device, &drbd_bmio_set_susp_al,
+				"set_n_write from invalidate_peer",
+				BM_LOCKED_SET_ALLOWED))
+				retcode = ERR_IO_MD_DISK;
+		}
+	} else
+		retcode = drbd_request_state(device, NS(conn, C_STARTING_SYNC_S));
+	drbd_resume_io(device);
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
+	put_ldev(device);
+out:
+	drbd_adm_finish(&adm_ctx, info, retcode);
+	return 0;
+}
+
+int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info)
+{
+	struct drbd_config_context adm_ctx;
+	enum drbd_ret_code retcode;
+
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+	if (!adm_ctx.reply_skb)
+		return retcode;
+	if (retcode != NO_ERROR)
+		goto out;
+
+	mutex_lock(&adm_ctx.resource->adm_mutex);
+	if (drbd_request_state(adm_ctx.device, NS(user_isp, 1)) == SS_NOTHING_TO_DO)
+		retcode = ERR_PAUSE_IS_SET;
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
+out:
+	drbd_adm_finish(&adm_ctx, info, retcode);
+	return 0;
+}
+
+int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info)
+{
+	struct drbd_config_context adm_ctx;
+	union drbd_dev_state s;
+	enum drbd_ret_code retcode;
+
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+	if (!adm_ctx.reply_skb)
+		return retcode;
+	if (retcode != NO_ERROR)
+		goto out;
+
+	mutex_lock(&adm_ctx.resource->adm_mutex);
+	if (drbd_request_state(adm_ctx.device, NS(user_isp, 0)) == SS_NOTHING_TO_DO) {
+		s = adm_ctx.device->state;
+		if (s.conn == C_PAUSED_SYNC_S || s.conn == C_PAUSED_SYNC_T) {
+			retcode = s.aftr_isp ? ERR_PIC_AFTER_DEP :
+				  s.peer_isp ? ERR_PIC_PEER_DEP : ERR_PAUSE_IS_CLEAR;
+		} else {
+			retcode = ERR_PAUSE_IS_CLEAR;
+		}
+	}
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
+out:
+	drbd_adm_finish(&adm_ctx, info, retcode);
+	return 0;
+}
+
+int drbd_adm_suspend_io(struct sk_buff *skb, struct genl_info *info)
+{
+	return drbd_adm_simple_request_state(skb, info, NS(susp, 1));
+}
+
+int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info)
+{
+	struct drbd_config_context adm_ctx;
+	struct drbd_device *device;
+	int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */
+
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+	if (!adm_ctx.reply_skb)
+		return retcode;
+	if (retcode != NO_ERROR)
+		goto out;
+
+	mutex_lock(&adm_ctx.resource->adm_mutex);
+	device = adm_ctx.device;
+	if (test_bit(NEW_CUR_UUID, &device->flags)) {
+		if (get_ldev_if_state(device, D_ATTACHING)) {
+			drbd_uuid_new_current(device);
+			put_ldev(device);
+		} else {
+			/* This is effectively a multi-stage "forced down".
+			 * The NEW_CUR_UUID bit is supposedly only set, if we
+			 * lost the replication connection, and are configured
+			 * to freeze IO and wait for some fence-peer handler.
+			 * So we still don't have a replication connection.
+			 * And now we don't have a local disk either.  After
+			 * resume, we will fail all pending and new IO, because
+			 * we don't have any data anymore.  Which means we will
+			 * eventually be able to terminate all users of this
+			 * device, and then take it down.  By bumping the
+			 * "effective" data uuid, we make sure that you really
+			 * need to tear down before you reconfigure, we will
+			 * the refuse to re-connect or re-attach (because no
+			 * matching real data uuid exists).
+			 */
+			u64 val;
+			get_random_bytes(&val, sizeof(u64));
+			drbd_set_ed_uuid(device, val);
+			drbd_warn(device, "Resumed without access to data; please tear down before attempting to re-configure.\n");
+		}
+		clear_bit(NEW_CUR_UUID, &device->flags);
+	}
+	drbd_suspend_io(device);
+	retcode = drbd_request_state(device, NS3(susp, 0, susp_nod, 0, susp_fen, 0));
+	if (retcode == SS_SUCCESS) {
+		if (device->state.conn < C_CONNECTED)
+			tl_clear(first_peer_device(device)->connection);
+		if (device->state.disk == D_DISKLESS || device->state.disk == D_FAILED)
+			tl_restart(first_peer_device(device)->connection, FAIL_FROZEN_DISK_IO);
+	}
+	drbd_resume_io(device);
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
+out:
+	drbd_adm_finish(&adm_ctx, info, retcode);
+	return 0;
+}
+
+int drbd_adm_outdate(struct sk_buff *skb, struct genl_info *info)
+{
+	return drbd_adm_simple_request_state(skb, info, NS(disk, D_OUTDATED));
+}
+
+static int nla_put_drbd_cfg_context(struct sk_buff *skb,
+				    struct drbd_resource *resource,
+				    struct drbd_connection *connection,
+				    struct drbd_device *device)
+{
+	struct nlattr *nla;
+	nla = nla_nest_start_noflag(skb, DRBD_NLA_CFG_CONTEXT);
+	if (!nla)
+		goto nla_put_failure;
+	if (device &&
+	    nla_put_u32(skb, T_ctx_volume, device->vnr))
+		goto nla_put_failure;
+	if (nla_put_string(skb, T_ctx_resource_name, resource->name))
+		goto nla_put_failure;
+	if (connection) {
+		if (connection->my_addr_len &&
+		    nla_put(skb, T_ctx_my_addr, connection->my_addr_len, &connection->my_addr))
+			goto nla_put_failure;
+		if (connection->peer_addr_len &&
+		    nla_put(skb, T_ctx_peer_addr, connection->peer_addr_len, &connection->peer_addr))
+			goto nla_put_failure;
+	}
+	nla_nest_end(skb, nla);
+	return 0;
+
+nla_put_failure:
+	if (nla)
+		nla_nest_cancel(skb, nla);
+	return -EMSGSIZE;
+}
+
+/*
+ * The generic netlink dump callbacks are called outside the genl_lock(), so
+ * they cannot use the simple attribute parsing code which uses global
+ * attribute tables.
+ */
+static struct nlattr *find_cfg_context_attr(const struct nlmsghdr *nlh, int attr)
+{
+	const unsigned hdrlen = GENL_HDRLEN + GENL_MAGIC_FAMILY_HDRSZ;
+	const int maxtype = ARRAY_SIZE(drbd_cfg_context_nl_policy) - 1;
+	struct nlattr *nla;
+
+	nla = nla_find(nlmsg_attrdata(nlh, hdrlen), nlmsg_attrlen(nlh, hdrlen),
+		       DRBD_NLA_CFG_CONTEXT);
+	if (!nla)
+		return NULL;
+	return drbd_nla_find_nested(maxtype, nla, __nla_type(attr));
+}
+
+static void resource_to_info(struct resource_info *, struct drbd_resource *);
+
+int drbd_adm_dump_resources(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct drbd_genlmsghdr *dh;
+	struct drbd_resource *resource;
+	struct resource_info resource_info;
+	struct resource_statistics resource_statistics;
+	int err;
+
+	rcu_read_lock();
+	if (cb->args[0]) {
+		for_each_resource_rcu(resource, &drbd_resources)
+			if (resource == (struct drbd_resource *)cb->args[0])
+				goto found_resource;
+		err = 0;  /* resource was probably deleted */
+		goto out;
+	}
+	resource = list_entry(&drbd_resources,
+			      struct drbd_resource, resources);
+
+found_resource:
+	list_for_each_entry_continue_rcu(resource, &drbd_resources, resources) {
+		goto put_result;
+	}
+	err = 0;
+	goto out;
+
+put_result:
+	dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+			cb->nlh->nlmsg_seq, &drbd_genl_family,
+			NLM_F_MULTI, DRBD_ADM_GET_RESOURCES);
+	err = -ENOMEM;
+	if (!dh)
+		goto out;
+	dh->minor = -1U;
+	dh->ret_code = NO_ERROR;
+	err = nla_put_drbd_cfg_context(skb, resource, NULL, NULL);
+	if (err)
+		goto out;
+	err = res_opts_to_skb(skb, &resource->res_opts, !capable(CAP_SYS_ADMIN));
+	if (err)
+		goto out;
+	resource_to_info(&resource_info, resource);
+	err = resource_info_to_skb(skb, &resource_info, !capable(CAP_SYS_ADMIN));
+	if (err)
+		goto out;
+	resource_statistics.res_stat_write_ordering = resource->write_ordering;
+	err = resource_statistics_to_skb(skb, &resource_statistics, !capable(CAP_SYS_ADMIN));
+	if (err)
+		goto out;
+	cb->args[0] = (long)resource;
+	genlmsg_end(skb, dh);
+	err = 0;
+
+out:
+	rcu_read_unlock();
+	if (err)
+		return err;
+	return skb->len;
+}
+
+static void device_to_statistics(struct device_statistics *s,
+				 struct drbd_device *device)
+{
+	memset(s, 0, sizeof(*s));
+	s->dev_upper_blocked = !may_inc_ap_bio(device);
+	if (get_ldev(device)) {
+		struct drbd_md *md = &device->ldev->md;
+		u64 *history_uuids = (u64 *)s->history_uuids;
+		int n;
+
+		spin_lock_irq(&md->uuid_lock);
+		s->dev_current_uuid = md->uuid[UI_CURRENT];
+		BUILD_BUG_ON(sizeof(s->history_uuids) < UI_HISTORY_END - UI_HISTORY_START + 1);
+		for (n = 0; n < UI_HISTORY_END - UI_HISTORY_START + 1; n++)
+			history_uuids[n] = md->uuid[UI_HISTORY_START + n];
+		for (; n < HISTORY_UUIDS; n++)
+			history_uuids[n] = 0;
+		s->history_uuids_len = HISTORY_UUIDS;
+		spin_unlock_irq(&md->uuid_lock);
+
+		s->dev_disk_flags = md->flags;
+		put_ldev(device);
+	}
+	s->dev_size = get_capacity(device->vdisk);
+	s->dev_read = device->read_cnt;
+	s->dev_write = device->writ_cnt;
+	s->dev_al_writes = device->al_writ_cnt;
+	s->dev_bm_writes = device->bm_writ_cnt;
+	s->dev_upper_pending = atomic_read(&device->ap_bio_cnt);
+	s->dev_lower_pending = atomic_read(&device->local_cnt);
+	s->dev_al_suspended = test_bit(AL_SUSPENDED, &device->flags);
+	s->dev_exposed_data_uuid = device->ed_uuid;
+}
+
+static int put_resource_in_arg0(struct netlink_callback *cb, int holder_nr)
+{
+	if (cb->args[0]) {
+		struct drbd_resource *resource =
+			(struct drbd_resource *)cb->args[0];
+		kref_put(&resource->kref, drbd_destroy_resource);
+	}
+
+	return 0;
+}
+
+int drbd_adm_dump_devices_done(struct netlink_callback *cb) {
+	return put_resource_in_arg0(cb, 7);
+}
+
+static void device_to_info(struct device_info *, struct drbd_device *);
+
+int drbd_adm_dump_devices(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct nlattr *resource_filter;
+	struct drbd_resource *resource;
+	struct drbd_device *device;
+	int minor, err, retcode;
+	struct drbd_genlmsghdr *dh;
+	struct device_info device_info;
+	struct device_statistics device_statistics;
+	struct idr *idr_to_search;
+
+	resource = (struct drbd_resource *)cb->args[0];
+	if (!cb->args[0] && !cb->args[1]) {
+		resource_filter = find_cfg_context_attr(cb->nlh, T_ctx_resource_name);
+		if (resource_filter) {
+			retcode = ERR_RES_NOT_KNOWN;
+			resource = drbd_find_resource(nla_data(resource_filter));
+			if (!resource)
+				goto put_result;
+			cb->args[0] = (long)resource;
+		}
+	}
+
+	rcu_read_lock();
+	minor = cb->args[1];
+	idr_to_search = resource ? &resource->devices : &drbd_devices;
+	device = idr_get_next(idr_to_search, &minor);
+	if (!device) {
+		err = 0;
+		goto out;
+	}
+	idr_for_each_entry_continue(idr_to_search, device, minor) {
+		retcode = NO_ERROR;
+		goto put_result;  /* only one iteration */
+	}
+	err = 0;
+	goto out;  /* no more devices */
+
+put_result:
+	dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+			cb->nlh->nlmsg_seq, &drbd_genl_family,
+			NLM_F_MULTI, DRBD_ADM_GET_DEVICES);
+	err = -ENOMEM;
+	if (!dh)
+		goto out;
+	dh->ret_code = retcode;
+	dh->minor = -1U;
+	if (retcode == NO_ERROR) {
+		dh->minor = device->minor;
+		err = nla_put_drbd_cfg_context(skb, device->resource, NULL, device);
+		if (err)
+			goto out;
+		if (get_ldev(device)) {
+			struct disk_conf *disk_conf =
+				rcu_dereference(device->ldev->disk_conf);
+
+			err = disk_conf_to_skb(skb, disk_conf, !capable(CAP_SYS_ADMIN));
+			put_ldev(device);
+			if (err)
+				goto out;
+		}
+		device_to_info(&device_info, device);
+		err = device_info_to_skb(skb, &device_info, !capable(CAP_SYS_ADMIN));
+		if (err)
+			goto out;
+
+		device_to_statistics(&device_statistics, device);
+		err = device_statistics_to_skb(skb, &device_statistics, !capable(CAP_SYS_ADMIN));
+		if (err)
+			goto out;
+		cb->args[1] = minor + 1;
+	}
+	genlmsg_end(skb, dh);
+	err = 0;
+
+out:
+	rcu_read_unlock();
+	if (err)
+		return err;
+	return skb->len;
+}
+
+int drbd_adm_dump_connections_done(struct netlink_callback *cb)
+{
+	return put_resource_in_arg0(cb, 6);
+}
+
+enum { SINGLE_RESOURCE, ITERATE_RESOURCES };
+
+int drbd_adm_dump_connections(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct nlattr *resource_filter;
+	struct drbd_resource *resource = NULL, *next_resource;
+	struct drbd_connection *connection;
+	int err = 0, retcode;
+	struct drbd_genlmsghdr *dh;
+	struct connection_info connection_info;
+	struct connection_statistics connection_statistics;
+
+	rcu_read_lock();
+	resource = (struct drbd_resource *)cb->args[0];
+	if (!cb->args[0]) {
+		resource_filter = find_cfg_context_attr(cb->nlh, T_ctx_resource_name);
+		if (resource_filter) {
+			retcode = ERR_RES_NOT_KNOWN;
+			resource = drbd_find_resource(nla_data(resource_filter));
+			if (!resource)
+				goto put_result;
+			cb->args[0] = (long)resource;
+			cb->args[1] = SINGLE_RESOURCE;
+		}
+	}
+	if (!resource) {
+		if (list_empty(&drbd_resources))
+			goto out;
+		resource = list_first_entry(&drbd_resources, struct drbd_resource, resources);
+		kref_get(&resource->kref);
+		cb->args[0] = (long)resource;
+		cb->args[1] = ITERATE_RESOURCES;
+	}
+
+    next_resource:
+	rcu_read_unlock();
+	mutex_lock(&resource->conf_update);
+	rcu_read_lock();
+	if (cb->args[2]) {
+		for_each_connection_rcu(connection, resource)
+			if (connection == (struct drbd_connection *)cb->args[2])
+				goto found_connection;
+		/* connection was probably deleted */
+		goto no_more_connections;
+	}
+	connection = list_entry(&resource->connections, struct drbd_connection, connections);
+
+found_connection:
+	list_for_each_entry_continue_rcu(connection, &resource->connections, connections) {
+		if (!has_net_conf(connection))
+			continue;
+		retcode = NO_ERROR;
+		goto put_result;  /* only one iteration */
+	}
+
+no_more_connections:
+	if (cb->args[1] == ITERATE_RESOURCES) {
+		for_each_resource_rcu(next_resource, &drbd_resources) {
+			if (next_resource == resource)
+				goto found_resource;
+		}
+		/* resource was probably deleted */
+	}
+	goto out;
+
+found_resource:
+	list_for_each_entry_continue_rcu(next_resource, &drbd_resources, resources) {
+		mutex_unlock(&resource->conf_update);
+		kref_put(&resource->kref, drbd_destroy_resource);
+		resource = next_resource;
+		kref_get(&resource->kref);
+		cb->args[0] = (long)resource;
+		cb->args[2] = 0;
+		goto next_resource;
+	}
+	goto out;  /* no more resources */
+
+put_result:
+	dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+			cb->nlh->nlmsg_seq, &drbd_genl_family,
+			NLM_F_MULTI, DRBD_ADM_GET_CONNECTIONS);
+	err = -ENOMEM;
+	if (!dh)
+		goto out;
+	dh->ret_code = retcode;
+	dh->minor = -1U;
+	if (retcode == NO_ERROR) {
+		struct net_conf *net_conf;
+
+		err = nla_put_drbd_cfg_context(skb, resource, connection, NULL);
+		if (err)
+			goto out;
+		net_conf = rcu_dereference(connection->net_conf);
+		if (net_conf) {
+			err = net_conf_to_skb(skb, net_conf, !capable(CAP_SYS_ADMIN));
+			if (err)
+				goto out;
+		}
+		connection_to_info(&connection_info, connection);
+		err = connection_info_to_skb(skb, &connection_info, !capable(CAP_SYS_ADMIN));
+		if (err)
+			goto out;
+		connection_statistics.conn_congested = test_bit(NET_CONGESTED, &connection->flags);
+		err = connection_statistics_to_skb(skb, &connection_statistics, !capable(CAP_SYS_ADMIN));
+		if (err)
+			goto out;
+		cb->args[2] = (long)connection;
+	}
+	genlmsg_end(skb, dh);
+	err = 0;
+
+out:
+	rcu_read_unlock();
+	if (resource)
+		mutex_unlock(&resource->conf_update);
+	if (err)
+		return err;
+	return skb->len;
+}
+
+enum mdf_peer_flag {
+	MDF_PEER_CONNECTED =	1 << 0,
+	MDF_PEER_OUTDATED =	1 << 1,
+	MDF_PEER_FENCING =	1 << 2,
+	MDF_PEER_FULL_SYNC =	1 << 3,
+};
+
+static void peer_device_to_statistics(struct peer_device_statistics *s,
+				      struct drbd_peer_device *peer_device)
+{
+	struct drbd_device *device = peer_device->device;
+
+	memset(s, 0, sizeof(*s));
+	s->peer_dev_received = device->recv_cnt;
+	s->peer_dev_sent = device->send_cnt;
+	s->peer_dev_pending = atomic_read(&device->ap_pending_cnt) +
+			      atomic_read(&device->rs_pending_cnt);
+	s->peer_dev_unacked = atomic_read(&device->unacked_cnt);
+	s->peer_dev_out_of_sync = drbd_bm_total_weight(device) << (BM_BLOCK_SHIFT - 9);
+	s->peer_dev_resync_failed = device->rs_failed << (BM_BLOCK_SHIFT - 9);
+	if (get_ldev(device)) {
+		struct drbd_md *md = &device->ldev->md;
+
+		spin_lock_irq(&md->uuid_lock);
+		s->peer_dev_bitmap_uuid = md->uuid[UI_BITMAP];
+		spin_unlock_irq(&md->uuid_lock);
+		s->peer_dev_flags =
+			(drbd_md_test_flag(device->ldev, MDF_CONNECTED_IND) ?
+				MDF_PEER_CONNECTED : 0) +
+			(drbd_md_test_flag(device->ldev, MDF_CONSISTENT) &&
+			 !drbd_md_test_flag(device->ldev, MDF_WAS_UP_TO_DATE) ?
+				MDF_PEER_OUTDATED : 0) +
+			/* FIXME: MDF_PEER_FENCING? */
+			(drbd_md_test_flag(device->ldev, MDF_FULL_SYNC) ?
+				MDF_PEER_FULL_SYNC : 0);
+		put_ldev(device);
+	}
+}
+
+int drbd_adm_dump_peer_devices_done(struct netlink_callback *cb)
+{
+	return put_resource_in_arg0(cb, 9);
+}
+
+int drbd_adm_dump_peer_devices(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct nlattr *resource_filter;
+	struct drbd_resource *resource;
+	struct drbd_device *device;
+	struct drbd_peer_device *peer_device = NULL;
+	int minor, err, retcode;
+	struct drbd_genlmsghdr *dh;
+	struct idr *idr_to_search;
+
+	resource = (struct drbd_resource *)cb->args[0];
+	if (!cb->args[0] && !cb->args[1]) {
+		resource_filter = find_cfg_context_attr(cb->nlh, T_ctx_resource_name);
+		if (resource_filter) {
+			retcode = ERR_RES_NOT_KNOWN;
+			resource = drbd_find_resource(nla_data(resource_filter));
+			if (!resource)
+				goto put_result;
+		}
+		cb->args[0] = (long)resource;
+	}
+
+	rcu_read_lock();
+	minor = cb->args[1];
+	idr_to_search = resource ? &resource->devices : &drbd_devices;
+	device = idr_find(idr_to_search, minor);
+	if (!device) {
+next_device:
+		minor++;
+		cb->args[2] = 0;
+		device = idr_get_next(idr_to_search, &minor);
+		if (!device) {
+			err = 0;
+			goto out;
+		}
+	}
+	if (cb->args[2]) {
+		for_each_peer_device(peer_device, device)
+			if (peer_device == (struct drbd_peer_device *)cb->args[2])
+				goto found_peer_device;
+		/* peer device was probably deleted */
+		goto next_device;
+	}
+	/* Make peer_device point to the list head (not the first entry). */
+	peer_device = list_entry(&device->peer_devices, struct drbd_peer_device, peer_devices);
+
+found_peer_device:
+	list_for_each_entry_continue_rcu(peer_device, &device->peer_devices, peer_devices) {
+		if (!has_net_conf(peer_device->connection))
+			continue;
+		retcode = NO_ERROR;
+		goto put_result;  /* only one iteration */
+	}
+	goto next_device;
+
+put_result:
+	dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+			cb->nlh->nlmsg_seq, &drbd_genl_family,
+			NLM_F_MULTI, DRBD_ADM_GET_PEER_DEVICES);
+	err = -ENOMEM;
+	if (!dh)
+		goto out;
+	dh->ret_code = retcode;
+	dh->minor = -1U;
+	if (retcode == NO_ERROR) {
+		struct peer_device_info peer_device_info;
+		struct peer_device_statistics peer_device_statistics;
+
+		dh->minor = minor;
+		err = nla_put_drbd_cfg_context(skb, device->resource, peer_device->connection, device);
+		if (err)
+			goto out;
+		peer_device_to_info(&peer_device_info, peer_device);
+		err = peer_device_info_to_skb(skb, &peer_device_info, !capable(CAP_SYS_ADMIN));
+		if (err)
+			goto out;
+		peer_device_to_statistics(&peer_device_statistics, peer_device);
+		err = peer_device_statistics_to_skb(skb, &peer_device_statistics, !capable(CAP_SYS_ADMIN));
+		if (err)
+			goto out;
+		cb->args[1] = minor;
+		cb->args[2] = (long)peer_device;
+	}
+	genlmsg_end(skb, dh);
+	err = 0;
+
+out:
+	rcu_read_unlock();
+	if (err)
+		return err;
+	return skb->len;
+}
+/*
+ * Return the connection of @resource if @resource has exactly one connection.
+ */
+static struct drbd_connection *the_only_connection(struct drbd_resource *resource)
+{
+	struct list_head *connections = &resource->connections;
+
+	if (list_empty(connections) || connections->next->next != connections)
+		return NULL;
+	return list_first_entry(&resource->connections, struct drbd_connection, connections);
+}
+
+static int nla_put_status_info(struct sk_buff *skb, struct drbd_device *device,
+		const struct sib_info *sib)
+{
+	struct drbd_resource *resource = device->resource;
+	struct state_info *si = NULL; /* for sizeof(si->member); */
+	struct nlattr *nla;
+	int got_ldev;
+	int err = 0;
+	int exclude_sensitive;
+
+	/* If sib != NULL, this is drbd_bcast_event, which anyone can listen
+	 * to.  So we better exclude_sensitive information.
+	 *
+	 * If sib == NULL, this is drbd_adm_get_status, executed synchronously
+	 * in the context of the requesting user process. Exclude sensitive
+	 * information, unless current has superuser.
+	 *
+	 * NOTE: for drbd_adm_get_status_all(), this is a netlink dump, and
+	 * relies on the current implementation of netlink_dump(), which
+	 * executes the dump callback successively from netlink_recvmsg(),
+	 * always in the context of the receiving process */
+	exclude_sensitive = sib || !capable(CAP_SYS_ADMIN);
+
+	got_ldev = get_ldev(device);
+
+	/* We need to add connection name and volume number information still.
+	 * Minor number is in drbd_genlmsghdr. */
+	if (nla_put_drbd_cfg_context(skb, resource, the_only_connection(resource), device))
+		goto nla_put_failure;
+
+	if (res_opts_to_skb(skb, &device->resource->res_opts, exclude_sensitive))
+		goto nla_put_failure;
+
+	rcu_read_lock();
+	if (got_ldev) {
+		struct disk_conf *disk_conf;
+
+		disk_conf = rcu_dereference(device->ldev->disk_conf);
+		err = disk_conf_to_skb(skb, disk_conf, exclude_sensitive);
+	}
+	if (!err) {
+		struct net_conf *nc;
+
+		nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+		if (nc)
+			err = net_conf_to_skb(skb, nc, exclude_sensitive);
+	}
+	rcu_read_unlock();
+	if (err)
+		goto nla_put_failure;
+
+	nla = nla_nest_start_noflag(skb, DRBD_NLA_STATE_INFO);
+	if (!nla)
+		goto nla_put_failure;
+	if (nla_put_u32(skb, T_sib_reason, sib ? sib->sib_reason : SIB_GET_STATUS_REPLY) ||
+	    nla_put_u32(skb, T_current_state, device->state.i) ||
+	    nla_put_u64_0pad(skb, T_ed_uuid, device->ed_uuid) ||
+	    nla_put_u64_0pad(skb, T_capacity, get_capacity(device->vdisk)) ||
+	    nla_put_u64_0pad(skb, T_send_cnt, device->send_cnt) ||
+	    nla_put_u64_0pad(skb, T_recv_cnt, device->recv_cnt) ||
+	    nla_put_u64_0pad(skb, T_read_cnt, device->read_cnt) ||
+	    nla_put_u64_0pad(skb, T_writ_cnt, device->writ_cnt) ||
+	    nla_put_u64_0pad(skb, T_al_writ_cnt, device->al_writ_cnt) ||
+	    nla_put_u64_0pad(skb, T_bm_writ_cnt, device->bm_writ_cnt) ||
+	    nla_put_u32(skb, T_ap_bio_cnt, atomic_read(&device->ap_bio_cnt)) ||
+	    nla_put_u32(skb, T_ap_pending_cnt, atomic_read(&device->ap_pending_cnt)) ||
+	    nla_put_u32(skb, T_rs_pending_cnt, atomic_read(&device->rs_pending_cnt)))
+		goto nla_put_failure;
+
+	if (got_ldev) {
+		int err;
+
+		spin_lock_irq(&device->ldev->md.uuid_lock);
+		err = nla_put(skb, T_uuids, sizeof(si->uuids), device->ldev->md.uuid);
+		spin_unlock_irq(&device->ldev->md.uuid_lock);
+
+		if (err)
+			goto nla_put_failure;
+
+		if (nla_put_u32(skb, T_disk_flags, device->ldev->md.flags) ||
+		    nla_put_u64_0pad(skb, T_bits_total, drbd_bm_bits(device)) ||
+		    nla_put_u64_0pad(skb, T_bits_oos,
+				     drbd_bm_total_weight(device)))
+			goto nla_put_failure;
+		if (C_SYNC_SOURCE <= device->state.conn &&
+		    C_PAUSED_SYNC_T >= device->state.conn) {
+			if (nla_put_u64_0pad(skb, T_bits_rs_total,
+					     device->rs_total) ||
+			    nla_put_u64_0pad(skb, T_bits_rs_failed,
+					     device->rs_failed))
+				goto nla_put_failure;
+		}
+	}
+
+	if (sib) {
+		switch(sib->sib_reason) {
+		case SIB_SYNC_PROGRESS:
+		case SIB_GET_STATUS_REPLY:
+			break;
+		case SIB_STATE_CHANGE:
+			if (nla_put_u32(skb, T_prev_state, sib->os.i) ||
+			    nla_put_u32(skb, T_new_state, sib->ns.i))
+				goto nla_put_failure;
+			break;
+		case SIB_HELPER_POST:
+			if (nla_put_u32(skb, T_helper_exit_code,
+					sib->helper_exit_code))
+				goto nla_put_failure;
+			fallthrough;
+		case SIB_HELPER_PRE:
+			if (nla_put_string(skb, T_helper, sib->helper_name))
+				goto nla_put_failure;
+			break;
+		}
+	}
+	nla_nest_end(skb, nla);
+
+	if (0)
+nla_put_failure:
+		err = -EMSGSIZE;
+	if (got_ldev)
+		put_ldev(device);
+	return err;
+}
+
+int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info)
+{
+	struct drbd_config_context adm_ctx;
+	enum drbd_ret_code retcode;
+	int err;
+
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+	if (!adm_ctx.reply_skb)
+		return retcode;
+	if (retcode != NO_ERROR)
+		goto out;
+
+	err = nla_put_status_info(adm_ctx.reply_skb, adm_ctx.device, NULL);
+	if (err) {
+		nlmsg_free(adm_ctx.reply_skb);
+		return err;
+	}
+out:
+	drbd_adm_finish(&adm_ctx, info, retcode);
+	return 0;
+}
+
+static int get_one_status(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct drbd_device *device;
+	struct drbd_genlmsghdr *dh;
+	struct drbd_resource *pos = (struct drbd_resource *)cb->args[0];
+	struct drbd_resource *resource = NULL;
+	struct drbd_resource *tmp;
+	unsigned volume = cb->args[1];
+
+	/* Open coded, deferred, iteration:
+	 * for_each_resource_safe(resource, tmp, &drbd_resources) {
+	 *      connection = "first connection of resource or undefined";
+	 *	idr_for_each_entry(&resource->devices, device, i) {
+	 *	  ...
+	 *	}
+	 * }
+	 * where resource is cb->args[0];
+	 * and i is cb->args[1];
+	 *
+	 * cb->args[2] indicates if we shall loop over all resources,
+	 * or just dump all volumes of a single resource.
+	 *
+	 * This may miss entries inserted after this dump started,
+	 * or entries deleted before they are reached.
+	 *
+	 * We need to make sure the device won't disappear while
+	 * we are looking at it, and revalidate our iterators
+	 * on each iteration.
+	 */
+
+	/* synchronize with conn_create()/drbd_destroy_connection() */
+	rcu_read_lock();
+	/* revalidate iterator position */
+	for_each_resource_rcu(tmp, &drbd_resources) {
+		if (pos == NULL) {
+			/* first iteration */
+			pos = tmp;
+			resource = pos;
+			break;
+		}
+		if (tmp == pos) {
+			resource = pos;
+			break;
+		}
+	}
+	if (resource) {
+next_resource:
+		device = idr_get_next(&resource->devices, &volume);
+		if (!device) {
+			/* No more volumes to dump on this resource.
+			 * Advance resource iterator. */
+			pos = list_entry_rcu(resource->resources.next,
+					     struct drbd_resource, resources);
+			/* Did we dump any volume of this resource yet? */
+			if (volume != 0) {
+				/* If we reached the end of the list,
+				 * or only a single resource dump was requested,
+				 * we are done. */
+				if (&pos->resources == &drbd_resources || cb->args[2])
+					goto out;
+				volume = 0;
+				resource = pos;
+				goto next_resource;
+			}
+		}
+
+		dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+				cb->nlh->nlmsg_seq, &drbd_genl_family,
+				NLM_F_MULTI, DRBD_ADM_GET_STATUS);
+		if (!dh)
+			goto out;
+
+		if (!device) {
+			/* This is a connection without a single volume.
+			 * Suprisingly enough, it may have a network
+			 * configuration. */
+			struct drbd_connection *connection;
+
+			dh->minor = -1U;
+			dh->ret_code = NO_ERROR;
+			connection = the_only_connection(resource);
+			if (nla_put_drbd_cfg_context(skb, resource, connection, NULL))
+				goto cancel;
+			if (connection) {
+				struct net_conf *nc;
+
+				nc = rcu_dereference(connection->net_conf);
+				if (nc && net_conf_to_skb(skb, nc, 1) != 0)
+					goto cancel;
+			}
+			goto done;
+		}
+
+		D_ASSERT(device, device->vnr == volume);
+		D_ASSERT(device, device->resource == resource);
+
+		dh->minor = device_to_minor(device);
+		dh->ret_code = NO_ERROR;
+
+		if (nla_put_status_info(skb, device, NULL)) {
+cancel:
+			genlmsg_cancel(skb, dh);
+			goto out;
+		}
+done:
+		genlmsg_end(skb, dh);
+	}
+
+out:
+	rcu_read_unlock();
+	/* where to start the next iteration */
+	cb->args[0] = (long)pos;
+	cb->args[1] = (pos == resource) ? volume + 1 : 0;
+
+	/* No more resources/volumes/minors found results in an empty skb.
+	 * Which will terminate the dump. */
+        return skb->len;
+}
+
+/*
+ * Request status of all resources, or of all volumes within a single resource.
+ *
+ * This is a dump, as the answer may not fit in a single reply skb otherwise.
+ * Which means we cannot use the family->attrbuf or other such members, because
+ * dump is NOT protected by the genl_lock().  During dump, we only have access
+ * to the incoming skb, and need to opencode "parsing" of the nlattr payload.
+ *
+ * Once things are setup properly, we call into get_one_status().
+ */
+int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	const unsigned hdrlen = GENL_HDRLEN + GENL_MAGIC_FAMILY_HDRSZ;
+	struct nlattr *nla;
+	const char *resource_name;
+	struct drbd_resource *resource;
+	int maxtype;
+
+	/* Is this a followup call? */
+	if (cb->args[0]) {
+		/* ... of a single resource dump,
+		 * and the resource iterator has been advanced already? */
+		if (cb->args[2] && cb->args[2] != cb->args[0])
+			return 0; /* DONE. */
+		goto dump;
+	}
+
+	/* First call (from netlink_dump_start).  We need to figure out
+	 * which resource(s) the user wants us to dump. */
+	nla = nla_find(nlmsg_attrdata(cb->nlh, hdrlen),
+			nlmsg_attrlen(cb->nlh, hdrlen),
+			DRBD_NLA_CFG_CONTEXT);
+
+	/* No explicit context given.  Dump all. */
+	if (!nla)
+		goto dump;
+	maxtype = ARRAY_SIZE(drbd_cfg_context_nl_policy) - 1;
+	nla = drbd_nla_find_nested(maxtype, nla, __nla_type(T_ctx_resource_name));
+	if (IS_ERR(nla))
+		return PTR_ERR(nla);
+	/* context given, but no name present? */
+	if (!nla)
+		return -EINVAL;
+	resource_name = nla_data(nla);
+	if (!*resource_name)
+		return -ENODEV;
+	resource = drbd_find_resource(resource_name);
+	if (!resource)
+		return -ENODEV;
+
+	kref_put(&resource->kref, drbd_destroy_resource); /* get_one_status() revalidates the resource */
+
+	/* prime iterators, and set "filter" mode mark:
+	 * only dump this connection. */
+	cb->args[0] = (long)resource;
+	/* cb->args[1] = 0; passed in this way. */
+	cb->args[2] = (long)resource;
+
+dump:
+	return get_one_status(skb, cb);
+}
+
+int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info)
+{
+	struct drbd_config_context adm_ctx;
+	enum drbd_ret_code retcode;
+	struct timeout_parms tp;
+	int err;
+
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+	if (!adm_ctx.reply_skb)
+		return retcode;
+	if (retcode != NO_ERROR)
+		goto out;
+
+	tp.timeout_type =
+		adm_ctx.device->state.pdsk == D_OUTDATED ? UT_PEER_OUTDATED :
+		test_bit(USE_DEGR_WFC_T, &adm_ctx.device->flags) ? UT_DEGRADED :
+		UT_DEFAULT;
+
+	err = timeout_parms_to_priv_skb(adm_ctx.reply_skb, &tp);
+	if (err) {
+		nlmsg_free(adm_ctx.reply_skb);
+		return err;
+	}
+out:
+	drbd_adm_finish(&adm_ctx, info, retcode);
+	return 0;
+}
+
+int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info)
+{
+	struct drbd_config_context adm_ctx;
+	struct drbd_device *device;
+	enum drbd_ret_code retcode;
+	struct start_ov_parms parms;
+
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+	if (!adm_ctx.reply_skb)
+		return retcode;
+	if (retcode != NO_ERROR)
+		goto out;
+
+	device = adm_ctx.device;
+
+	/* resume from last known position, if possible */
+	parms.ov_start_sector = device->ov_start_sector;
+	parms.ov_stop_sector = ULLONG_MAX;
+	if (info->attrs[DRBD_NLA_START_OV_PARMS]) {
+		int err = start_ov_parms_from_attrs(&parms, info);
+		if (err) {
+			retcode = ERR_MANDATORY_TAG;
+			drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+			goto out;
+		}
+	}
+	mutex_lock(&adm_ctx.resource->adm_mutex);
+
+	/* w_make_ov_request expects position to be aligned */
+	device->ov_start_sector = parms.ov_start_sector & ~(BM_SECT_PER_BIT-1);
+	device->ov_stop_sector = parms.ov_stop_sector;
+
+	/* If there is still bitmap IO pending, e.g. previous resync or verify
+	 * just being finished, wait for it before requesting a new resync. */
+	drbd_suspend_io(device);
+	wait_event(device->misc_wait, !test_bit(BITMAP_IO, &device->flags));
+	retcode = drbd_request_state(device, NS(conn, C_VERIFY_S));
+	drbd_resume_io(device);
+
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
+out:
+	drbd_adm_finish(&adm_ctx, info, retcode);
+	return 0;
+}
+
+
+int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info)
+{
+	struct drbd_config_context adm_ctx;
+	struct drbd_device *device;
+	enum drbd_ret_code retcode;
+	int skip_initial_sync = 0;
+	int err;
+	struct new_c_uuid_parms args;
+
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+	if (!adm_ctx.reply_skb)
+		return retcode;
+	if (retcode != NO_ERROR)
+		goto out_nolock;
+
+	device = adm_ctx.device;
+	memset(&args, 0, sizeof(args));
+	if (info->attrs[DRBD_NLA_NEW_C_UUID_PARMS]) {
+		err = new_c_uuid_parms_from_attrs(&args, info);
+		if (err) {
+			retcode = ERR_MANDATORY_TAG;
+			drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+			goto out_nolock;
+		}
+	}
+
+	mutex_lock(&adm_ctx.resource->adm_mutex);
+	mutex_lock(device->state_mutex); /* Protects us against serialized state changes. */
+
+	if (!get_ldev(device)) {
+		retcode = ERR_NO_DISK;
+		goto out;
+	}
+
+	/* this is "skip initial sync", assume to be clean */
+	if (device->state.conn == C_CONNECTED &&
+	    first_peer_device(device)->connection->agreed_pro_version >= 90 &&
+	    device->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && args.clear_bm) {
+		drbd_info(device, "Preparing to skip initial sync\n");
+		skip_initial_sync = 1;
+	} else if (device->state.conn != C_STANDALONE) {
+		retcode = ERR_CONNECTED;
+		goto out_dec;
+	}
+
+	drbd_uuid_set(device, UI_BITMAP, 0); /* Rotate UI_BITMAP to History 1, etc... */
+	drbd_uuid_new_current(device); /* New current, previous to UI_BITMAP */
+
+	if (args.clear_bm) {
+		err = drbd_bitmap_io(device, &drbd_bmio_clear_n_write,
+			"clear_n_write from new_c_uuid", BM_LOCKED_MASK);
+		if (err) {
+			drbd_err(device, "Writing bitmap failed with %d\n", err);
+			retcode = ERR_IO_MD_DISK;
+		}
+		if (skip_initial_sync) {
+			drbd_send_uuids_skip_initial_sync(first_peer_device(device));
+			_drbd_uuid_set(device, UI_BITMAP, 0);
+			drbd_print_uuids(device, "cleared bitmap UUID");
+			spin_lock_irq(&device->resource->req_lock);
+			_drbd_set_state(_NS2(device, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
+					CS_VERBOSE, NULL);
+			spin_unlock_irq(&device->resource->req_lock);
+		}
+	}
+
+	drbd_md_sync(device);
+out_dec:
+	put_ldev(device);
+out:
+	mutex_unlock(device->state_mutex);
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
+out_nolock:
+	drbd_adm_finish(&adm_ctx, info, retcode);
+	return 0;
+}
+
+static enum drbd_ret_code
+drbd_check_resource_name(struct drbd_config_context *adm_ctx)
+{
+	const char *name = adm_ctx->resource_name;
+	if (!name || !name[0]) {
+		drbd_msg_put_info(adm_ctx->reply_skb, "resource name missing");
+		return ERR_MANDATORY_TAG;
+	}
+	/* if we want to use these in sysfs/configfs/debugfs some day,
+	 * we must not allow slashes */
+	if (strchr(name, '/')) {
+		drbd_msg_put_info(adm_ctx->reply_skb, "invalid resource name");
+		return ERR_INVALID_REQUEST;
+	}
+	return NO_ERROR;
+}
+
+static void resource_to_info(struct resource_info *info,
+			     struct drbd_resource *resource)
+{
+	info->res_role = conn_highest_role(first_connection(resource));
+	info->res_susp = resource->susp;
+	info->res_susp_nod = resource->susp_nod;
+	info->res_susp_fen = resource->susp_fen;
+}
+
+int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info)
+{
+	struct drbd_connection *connection;
+	struct drbd_config_context adm_ctx;
+	enum drbd_ret_code retcode;
+	struct res_opts res_opts;
+	int err;
+
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, 0);
+	if (!adm_ctx.reply_skb)
+		return retcode;
+	if (retcode != NO_ERROR)
+		goto out;
+
+	set_res_opts_defaults(&res_opts);
+	err = res_opts_from_attrs(&res_opts, info);
+	if (err && err != -ENOMSG) {
+		retcode = ERR_MANDATORY_TAG;
+		drbd_msg_put_info(adm_ctx.reply_skb, from_attrs_err_to_txt(err));
+		goto out;
+	}
+
+	retcode = drbd_check_resource_name(&adm_ctx);
+	if (retcode != NO_ERROR)
+		goto out;
+
+	if (adm_ctx.resource) {
+		if (info->nlhdr->nlmsg_flags & NLM_F_EXCL) {
+			retcode = ERR_INVALID_REQUEST;
+			drbd_msg_put_info(adm_ctx.reply_skb, "resource exists");
+		}
+		/* else: still NO_ERROR */
+		goto out;
+	}
+
+	/* not yet safe for genl_family.parallel_ops */
+	mutex_lock(&resources_mutex);
+	connection = conn_create(adm_ctx.resource_name, &res_opts);
+	mutex_unlock(&resources_mutex);
+
+	if (connection) {
+		struct resource_info resource_info;
+
+		mutex_lock(&notification_mutex);
+		resource_to_info(&resource_info, connection->resource);
+		notify_resource_state(NULL, 0, connection->resource,
+				      &resource_info, NOTIFY_CREATE);
+		mutex_unlock(&notification_mutex);
+	} else
+		retcode = ERR_NOMEM;
+
+out:
+	drbd_adm_finish(&adm_ctx, info, retcode);
+	return 0;
+}
+
+static void device_to_info(struct device_info *info,
+			   struct drbd_device *device)
+{
+	info->dev_disk_state = device->state.disk;
+}
+
+
+int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info)
+{
+	struct drbd_config_context adm_ctx;
+	struct drbd_genlmsghdr *dh = info->userhdr;
+	enum drbd_ret_code retcode;
+
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE);
+	if (!adm_ctx.reply_skb)
+		return retcode;
+	if (retcode != NO_ERROR)
+		goto out;
+
+	if (dh->minor > MINORMASK) {
+		drbd_msg_put_info(adm_ctx.reply_skb, "requested minor out of range");
+		retcode = ERR_INVALID_REQUEST;
+		goto out;
+	}
+	if (adm_ctx.volume > DRBD_VOLUME_MAX) {
+		drbd_msg_put_info(adm_ctx.reply_skb, "requested volume id out of range");
+		retcode = ERR_INVALID_REQUEST;
+		goto out;
+	}
+
+	/* drbd_adm_prepare made sure already
+	 * that first_peer_device(device)->connection and device->vnr match the request. */
+	if (adm_ctx.device) {
+		if (info->nlhdr->nlmsg_flags & NLM_F_EXCL)
+			retcode = ERR_MINOR_OR_VOLUME_EXISTS;
+		/* else: still NO_ERROR */
+		goto out;
+	}
+
+	mutex_lock(&adm_ctx.resource->adm_mutex);
+	retcode = drbd_create_device(&adm_ctx, dh->minor);
+	if (retcode == NO_ERROR) {
+		struct drbd_device *device;
+		struct drbd_peer_device *peer_device;
+		struct device_info info;
+		unsigned int peer_devices = 0;
+		enum drbd_notification_type flags;
+
+		device = minor_to_device(dh->minor);
+		for_each_peer_device(peer_device, device) {
+			if (!has_net_conf(peer_device->connection))
+				continue;
+			peer_devices++;
+		}
+
+		device_to_info(&info, device);
+		mutex_lock(&notification_mutex);
+		flags = (peer_devices--) ? NOTIFY_CONTINUES : 0;
+		notify_device_state(NULL, 0, device, &info, NOTIFY_CREATE | flags);
+		for_each_peer_device(peer_device, device) {
+			struct peer_device_info peer_device_info;
+
+			if (!has_net_conf(peer_device->connection))
+				continue;
+			peer_device_to_info(&peer_device_info, peer_device);
+			flags = (peer_devices--) ? NOTIFY_CONTINUES : 0;
+			notify_peer_device_state(NULL, 0, peer_device, &peer_device_info,
+						 NOTIFY_CREATE | flags);
+		}
+		mutex_unlock(&notification_mutex);
+	}
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
+out:
+	drbd_adm_finish(&adm_ctx, info, retcode);
+	return 0;
+}
+
+static enum drbd_ret_code adm_del_minor(struct drbd_device *device)
+{
+	struct drbd_peer_device *peer_device;
+
+	if (device->state.disk == D_DISKLESS &&
+	    /* no need to be device->state.conn == C_STANDALONE &&
+	     * we may want to delete a minor from a live replication group.
+	     */
+	    device->state.role == R_SECONDARY) {
+		struct drbd_connection *connection =
+			first_connection(device->resource);
+
+		_drbd_request_state(device, NS(conn, C_WF_REPORT_PARAMS),
+				    CS_VERBOSE + CS_WAIT_COMPLETE);
+
+		/* If the state engine hasn't stopped the sender thread yet, we
+		 * need to flush the sender work queue before generating the
+		 * DESTROY events here. */
+		if (get_t_state(&connection->worker) == RUNNING)
+			drbd_flush_workqueue(&connection->sender_work);
+
+		mutex_lock(&notification_mutex);
+		for_each_peer_device(peer_device, device) {
+			if (!has_net_conf(peer_device->connection))
+				continue;
+			notify_peer_device_state(NULL, 0, peer_device, NULL,
+						 NOTIFY_DESTROY | NOTIFY_CONTINUES);
+		}
+		notify_device_state(NULL, 0, device, NULL, NOTIFY_DESTROY);
+		mutex_unlock(&notification_mutex);
+
+		drbd_delete_device(device);
+		return NO_ERROR;
+	} else
+		return ERR_MINOR_CONFIGURED;
+}
+
+int drbd_adm_del_minor(struct sk_buff *skb, struct genl_info *info)
+{
+	struct drbd_config_context adm_ctx;
+	enum drbd_ret_code retcode;
+
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_MINOR);
+	if (!adm_ctx.reply_skb)
+		return retcode;
+	if (retcode != NO_ERROR)
+		goto out;
+
+	mutex_lock(&adm_ctx.resource->adm_mutex);
+	retcode = adm_del_minor(adm_ctx.device);
+	mutex_unlock(&adm_ctx.resource->adm_mutex);
+out:
+	drbd_adm_finish(&adm_ctx, info, retcode);
+	return 0;
+}
+
+static int adm_del_resource(struct drbd_resource *resource)
+{
+	struct drbd_connection *connection;
+
+	for_each_connection(connection, resource) {
+		if (connection->cstate > C_STANDALONE)
+			return ERR_NET_CONFIGURED;
+	}
+	if (!idr_is_empty(&resource->devices))
+		return ERR_RES_IN_USE;
+
+	/* The state engine has stopped the sender thread, so we don't
+	 * need to flush the sender work queue before generating the
+	 * DESTROY event here. */
+	mutex_lock(&notification_mutex);
+	notify_resource_state(NULL, 0, resource, NULL, NOTIFY_DESTROY);
+	mutex_unlock(&notification_mutex);
+
+	mutex_lock(&resources_mutex);
+	list_del_rcu(&resource->resources);
+	mutex_unlock(&resources_mutex);
+	/* Make sure all threads have actually stopped: state handling only
+	 * does drbd_thread_stop_nowait(). */
+	list_for_each_entry(connection, &resource->connections, connections)
+		drbd_thread_stop(&connection->worker);
+	synchronize_rcu();
+	drbd_free_resource(resource);
+	return NO_ERROR;
+}
+
+int drbd_adm_down(struct sk_buff *skb, struct genl_info *info)
+{
+	struct drbd_config_context adm_ctx;
+	struct drbd_resource *resource;
+	struct drbd_connection *connection;
+	struct drbd_device *device;
+	int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */
+	unsigned i;
+
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE);
+	if (!adm_ctx.reply_skb)
+		return retcode;
+	if (retcode != NO_ERROR)
+		goto finish;
+
+	resource = adm_ctx.resource;
+	mutex_lock(&resource->adm_mutex);
+	/* demote */
+	for_each_connection(connection, resource) {
+		struct drbd_peer_device *peer_device;
+
+		idr_for_each_entry(&connection->peer_devices, peer_device, i) {
+			retcode = drbd_set_role(peer_device->device, R_SECONDARY, 0);
+			if (retcode < SS_SUCCESS) {
+				drbd_msg_put_info(adm_ctx.reply_skb, "failed to demote");
+				goto out;
+			}
+		}
+
+		retcode = conn_try_disconnect(connection, 0);
+		if (retcode < SS_SUCCESS) {
+			drbd_msg_put_info(adm_ctx.reply_skb, "failed to disconnect");
+			goto out;
+		}
+	}
+
+	/* detach */
+	idr_for_each_entry(&resource->devices, device, i) {
+		retcode = adm_detach(device, 0);
+		if (retcode < SS_SUCCESS || retcode > NO_ERROR) {
+			drbd_msg_put_info(adm_ctx.reply_skb, "failed to detach");
+			goto out;
+		}
+	}
+
+	/* delete volumes */
+	idr_for_each_entry(&resource->devices, device, i) {
+		retcode = adm_del_minor(device);
+		if (retcode != NO_ERROR) {
+			/* "can not happen" */
+			drbd_msg_put_info(adm_ctx.reply_skb, "failed to delete volume");
+			goto out;
+		}
+	}
+
+	retcode = adm_del_resource(resource);
+out:
+	mutex_unlock(&resource->adm_mutex);
+finish:
+	drbd_adm_finish(&adm_ctx, info, retcode);
+	return 0;
+}
+
+int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info)
+{
+	struct drbd_config_context adm_ctx;
+	struct drbd_resource *resource;
+	enum drbd_ret_code retcode;
+
+	retcode = drbd_adm_prepare(&adm_ctx, skb, info, DRBD_ADM_NEED_RESOURCE);
+	if (!adm_ctx.reply_skb)
+		return retcode;
+	if (retcode != NO_ERROR)
+		goto finish;
+	resource = adm_ctx.resource;
+
+	mutex_lock(&resource->adm_mutex);
+	retcode = adm_del_resource(resource);
+	mutex_unlock(&resource->adm_mutex);
+finish:
+	drbd_adm_finish(&adm_ctx, info, retcode);
+	return 0;
+}
+
+void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib)
+{
+	struct sk_buff *msg;
+	struct drbd_genlmsghdr *d_out;
+	unsigned seq;
+	int err = -ENOMEM;
+
+	seq = atomic_inc_return(&drbd_genl_seq);
+	msg = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO);
+	if (!msg)
+		goto failed;
+
+	err = -EMSGSIZE;
+	d_out = genlmsg_put(msg, 0, seq, &drbd_genl_family, 0, DRBD_EVENT);
+	if (!d_out) /* cannot happen, but anyways. */
+		goto nla_put_failure;
+	d_out->minor = device_to_minor(device);
+	d_out->ret_code = NO_ERROR;
+
+	if (nla_put_status_info(msg, device, sib))
+		goto nla_put_failure;
+	genlmsg_end(msg, d_out);
+	err = drbd_genl_multicast_events(msg, GFP_NOWAIT);
+	/* msg has been consumed or freed in netlink_broadcast() */
+	if (err && err != -ESRCH)
+		goto failed;
+
+	return;
+
+nla_put_failure:
+	nlmsg_free(msg);
+failed:
+	drbd_err(device, "Error %d while broadcasting event. "
+			"Event seq:%u sib_reason:%u\n",
+			err, seq, sib->sib_reason);
+}
+
+static int nla_put_notification_header(struct sk_buff *msg,
+				       enum drbd_notification_type type)
+{
+	struct drbd_notification_header nh = {
+		.nh_type = type,
+	};
+
+	return drbd_notification_header_to_skb(msg, &nh, true);
+}
+
+int notify_resource_state(struct sk_buff *skb,
+			   unsigned int seq,
+			   struct drbd_resource *resource,
+			   struct resource_info *resource_info,
+			   enum drbd_notification_type type)
+{
+	struct resource_statistics resource_statistics;
+	struct drbd_genlmsghdr *dh;
+	bool multicast = false;
+	int err;
+
+	if (!skb) {
+		seq = atomic_inc_return(&notify_genl_seq);
+		skb = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO);
+		err = -ENOMEM;
+		if (!skb)
+			goto failed;
+		multicast = true;
+	}
+
+	err = -EMSGSIZE;
+	dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_RESOURCE_STATE);
+	if (!dh)
+		goto nla_put_failure;
+	dh->minor = -1U;
+	dh->ret_code = NO_ERROR;
+	if (nla_put_drbd_cfg_context(skb, resource, NULL, NULL) ||
+	    nla_put_notification_header(skb, type) ||
+	    ((type & ~NOTIFY_FLAGS) != NOTIFY_DESTROY &&
+	     resource_info_to_skb(skb, resource_info, true)))
+		goto nla_put_failure;
+	resource_statistics.res_stat_write_ordering = resource->write_ordering;
+	err = resource_statistics_to_skb(skb, &resource_statistics, !capable(CAP_SYS_ADMIN));
+	if (err)
+		goto nla_put_failure;
+	genlmsg_end(skb, dh);
+	if (multicast) {
+		err = drbd_genl_multicast_events(skb, GFP_NOWAIT);
+		/* skb has been consumed or freed in netlink_broadcast() */
+		if (err && err != -ESRCH)
+			goto failed;
+	}
+	return 0;
+
+nla_put_failure:
+	nlmsg_free(skb);
+failed:
+	drbd_err(resource, "Error %d while broadcasting event. Event seq:%u\n",
+			err, seq);
+	return err;
+}
+
+int notify_device_state(struct sk_buff *skb,
+			 unsigned int seq,
+			 struct drbd_device *device,
+			 struct device_info *device_info,
+			 enum drbd_notification_type type)
+{
+	struct device_statistics device_statistics;
+	struct drbd_genlmsghdr *dh;
+	bool multicast = false;
+	int err;
+
+	if (!skb) {
+		seq = atomic_inc_return(&notify_genl_seq);
+		skb = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO);
+		err = -ENOMEM;
+		if (!skb)
+			goto failed;
+		multicast = true;
+	}
+
+	err = -EMSGSIZE;
+	dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_DEVICE_STATE);
+	if (!dh)
+		goto nla_put_failure;
+	dh->minor = device->minor;
+	dh->ret_code = NO_ERROR;
+	if (nla_put_drbd_cfg_context(skb, device->resource, NULL, device) ||
+	    nla_put_notification_header(skb, type) ||
+	    ((type & ~NOTIFY_FLAGS) != NOTIFY_DESTROY &&
+	     device_info_to_skb(skb, device_info, true)))
+		goto nla_put_failure;
+	device_to_statistics(&device_statistics, device);
+	device_statistics_to_skb(skb, &device_statistics, !capable(CAP_SYS_ADMIN));
+	genlmsg_end(skb, dh);
+	if (multicast) {
+		err = drbd_genl_multicast_events(skb, GFP_NOWAIT);
+		/* skb has been consumed or freed in netlink_broadcast() */
+		if (err && err != -ESRCH)
+			goto failed;
+	}
+	return 0;
+
+nla_put_failure:
+	nlmsg_free(skb);
+failed:
+	drbd_err(device, "Error %d while broadcasting event. Event seq:%u\n",
+		 err, seq);
+	return err;
+}
+
+int notify_connection_state(struct sk_buff *skb,
+			     unsigned int seq,
+			     struct drbd_connection *connection,
+			     struct connection_info *connection_info,
+			     enum drbd_notification_type type)
+{
+	struct connection_statistics connection_statistics;
+	struct drbd_genlmsghdr *dh;
+	bool multicast = false;
+	int err;
+
+	if (!skb) {
+		seq = atomic_inc_return(&notify_genl_seq);
+		skb = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO);
+		err = -ENOMEM;
+		if (!skb)
+			goto failed;
+		multicast = true;
+	}
+
+	err = -EMSGSIZE;
+	dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_CONNECTION_STATE);
+	if (!dh)
+		goto nla_put_failure;
+	dh->minor = -1U;
+	dh->ret_code = NO_ERROR;
+	if (nla_put_drbd_cfg_context(skb, connection->resource, connection, NULL) ||
+	    nla_put_notification_header(skb, type) ||
+	    ((type & ~NOTIFY_FLAGS) != NOTIFY_DESTROY &&
+	     connection_info_to_skb(skb, connection_info, true)))
+		goto nla_put_failure;
+	connection_statistics.conn_congested = test_bit(NET_CONGESTED, &connection->flags);
+	connection_statistics_to_skb(skb, &connection_statistics, !capable(CAP_SYS_ADMIN));
+	genlmsg_end(skb, dh);
+	if (multicast) {
+		err = drbd_genl_multicast_events(skb, GFP_NOWAIT);
+		/* skb has been consumed or freed in netlink_broadcast() */
+		if (err && err != -ESRCH)
+			goto failed;
+	}
+	return 0;
+
+nla_put_failure:
+	nlmsg_free(skb);
+failed:
+	drbd_err(connection, "Error %d while broadcasting event. Event seq:%u\n",
+		 err, seq);
+	return err;
+}
+
+int notify_peer_device_state(struct sk_buff *skb,
+			      unsigned int seq,
+			      struct drbd_peer_device *peer_device,
+			      struct peer_device_info *peer_device_info,
+			      enum drbd_notification_type type)
+{
+	struct peer_device_statistics peer_device_statistics;
+	struct drbd_resource *resource = peer_device->device->resource;
+	struct drbd_genlmsghdr *dh;
+	bool multicast = false;
+	int err;
+
+	if (!skb) {
+		seq = atomic_inc_return(&notify_genl_seq);
+		skb = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO);
+		err = -ENOMEM;
+		if (!skb)
+			goto failed;
+		multicast = true;
+	}
+
+	err = -EMSGSIZE;
+	dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_PEER_DEVICE_STATE);
+	if (!dh)
+		goto nla_put_failure;
+	dh->minor = -1U;
+	dh->ret_code = NO_ERROR;
+	if (nla_put_drbd_cfg_context(skb, resource, peer_device->connection, peer_device->device) ||
+	    nla_put_notification_header(skb, type) ||
+	    ((type & ~NOTIFY_FLAGS) != NOTIFY_DESTROY &&
+	     peer_device_info_to_skb(skb, peer_device_info, true)))
+		goto nla_put_failure;
+	peer_device_to_statistics(&peer_device_statistics, peer_device);
+	peer_device_statistics_to_skb(skb, &peer_device_statistics, !capable(CAP_SYS_ADMIN));
+	genlmsg_end(skb, dh);
+	if (multicast) {
+		err = drbd_genl_multicast_events(skb, GFP_NOWAIT);
+		/* skb has been consumed or freed in netlink_broadcast() */
+		if (err && err != -ESRCH)
+			goto failed;
+	}
+	return 0;
+
+nla_put_failure:
+	nlmsg_free(skb);
+failed:
+	drbd_err(peer_device, "Error %d while broadcasting event. Event seq:%u\n",
+		 err, seq);
+	return err;
+}
+
+void notify_helper(enum drbd_notification_type type,
+		   struct drbd_device *device, struct drbd_connection *connection,
+		   const char *name, int status)
+{
+	struct drbd_resource *resource = device ? device->resource : connection->resource;
+	struct drbd_helper_info helper_info;
+	unsigned int seq = atomic_inc_return(&notify_genl_seq);
+	struct sk_buff *skb = NULL;
+	struct drbd_genlmsghdr *dh;
+	int err;
+
+	strscpy(helper_info.helper_name, name, sizeof(helper_info.helper_name));
+	helper_info.helper_name_len = min(strlen(name), sizeof(helper_info.helper_name));
+	helper_info.helper_status = status;
+
+	skb = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO);
+	err = -ENOMEM;
+	if (!skb)
+		goto fail;
+
+	err = -EMSGSIZE;
+	dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_HELPER);
+	if (!dh)
+		goto fail;
+	dh->minor = device ? device->minor : -1;
+	dh->ret_code = NO_ERROR;
+	mutex_lock(&notification_mutex);
+	if (nla_put_drbd_cfg_context(skb, resource, connection, device) ||
+	    nla_put_notification_header(skb, type) ||
+	    drbd_helper_info_to_skb(skb, &helper_info, true))
+		goto unlock_fail;
+	genlmsg_end(skb, dh);
+	err = drbd_genl_multicast_events(skb, GFP_NOWAIT);
+	skb = NULL;
+	/* skb has been consumed or freed in netlink_broadcast() */
+	if (err && err != -ESRCH)
+		goto unlock_fail;
+	mutex_unlock(&notification_mutex);
+	return;
+
+unlock_fail:
+	mutex_unlock(&notification_mutex);
+fail:
+	nlmsg_free(skb);
+	drbd_err(resource, "Error %d while broadcasting event. Event seq:%u\n",
+		 err, seq);
+}
+
+static int notify_initial_state_done(struct sk_buff *skb, unsigned int seq)
+{
+	struct drbd_genlmsghdr *dh;
+	int err;
+
+	err = -EMSGSIZE;
+	dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_INITIAL_STATE_DONE);
+	if (!dh)
+		goto nla_put_failure;
+	dh->minor = -1U;
+	dh->ret_code = NO_ERROR;
+	if (nla_put_notification_header(skb, NOTIFY_EXISTS))
+		goto nla_put_failure;
+	genlmsg_end(skb, dh);
+	return 0;
+
+nla_put_failure:
+	nlmsg_free(skb);
+	pr_err("Error %d sending event. Event seq:%u\n", err, seq);
+	return err;
+}
+
+static void free_state_changes(struct list_head *list)
+{
+	while (!list_empty(list)) {
+		struct drbd_state_change *state_change =
+			list_first_entry(list, struct drbd_state_change, list);
+		list_del(&state_change->list);
+		forget_state_change(state_change);
+	}
+}
+
+static unsigned int notifications_for_state_change(struct drbd_state_change *state_change)
+{
+	return 1 +
+	       state_change->n_connections +
+	       state_change->n_devices +
+	       state_change->n_devices * state_change->n_connections;
+}
+
+static int get_initial_state(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct drbd_state_change *state_change = (struct drbd_state_change *)cb->args[0];
+	unsigned int seq = cb->args[2];
+	unsigned int n;
+	enum drbd_notification_type flags = 0;
+	int err = 0;
+
+	/* There is no need for taking notification_mutex here: it doesn't
+	   matter if the initial state events mix with later state chage
+	   events; we can always tell the events apart by the NOTIFY_EXISTS
+	   flag. */
+
+	cb->args[5]--;
+	if (cb->args[5] == 1) {
+		err = notify_initial_state_done(skb, seq);
+		goto out;
+	}
+	n = cb->args[4]++;
+	if (cb->args[4] < cb->args[3])
+		flags |= NOTIFY_CONTINUES;
+	if (n < 1) {
+		err = notify_resource_state_change(skb, seq, state_change->resource,
+					     NOTIFY_EXISTS | flags);
+		goto next;
+	}
+	n--;
+	if (n < state_change->n_connections) {
+		err = notify_connection_state_change(skb, seq, &state_change->connections[n],
+					       NOTIFY_EXISTS | flags);
+		goto next;
+	}
+	n -= state_change->n_connections;
+	if (n < state_change->n_devices) {
+		err = notify_device_state_change(skb, seq, &state_change->devices[n],
+					   NOTIFY_EXISTS | flags);
+		goto next;
+	}
+	n -= state_change->n_devices;
+	if (n < state_change->n_devices * state_change->n_connections) {
+		err = notify_peer_device_state_change(skb, seq, &state_change->peer_devices[n],
+						NOTIFY_EXISTS | flags);
+		goto next;
+	}
+
+next:
+	if (cb->args[4] == cb->args[3]) {
+		struct drbd_state_change *next_state_change =
+			list_entry(state_change->list.next,
+				   struct drbd_state_change, list);
+		cb->args[0] = (long)next_state_change;
+		cb->args[3] = notifications_for_state_change(next_state_change);
+		cb->args[4] = 0;
+	}
+out:
+	if (err)
+		return err;
+	else
+		return skb->len;
+}
+
+int drbd_adm_get_initial_state(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct drbd_resource *resource;
+	LIST_HEAD(head);
+
+	if (cb->args[5] >= 1) {
+		if (cb->args[5] > 1)
+			return get_initial_state(skb, cb);
+		if (cb->args[0]) {
+			struct drbd_state_change *state_change =
+				(struct drbd_state_change *)cb->args[0];
+
+			/* connect list to head */
+			list_add(&head, &state_change->list);
+			free_state_changes(&head);
+		}
+		return 0;
+	}
+
+	cb->args[5] = 2;  /* number of iterations */
+	mutex_lock(&resources_mutex);
+	for_each_resource(resource, &drbd_resources) {
+		struct drbd_state_change *state_change;
+
+		state_change = remember_old_state(resource, GFP_KERNEL);
+		if (!state_change) {
+			if (!list_empty(&head))
+				free_state_changes(&head);
+			mutex_unlock(&resources_mutex);
+			return -ENOMEM;
+		}
+		copy_old_to_new_state_change(state_change);
+		list_add_tail(&state_change->list, &head);
+		cb->args[5] += notifications_for_state_change(state_change);
+	}
+	mutex_unlock(&resources_mutex);
+
+	if (!list_empty(&head)) {
+		struct drbd_state_change *state_change =
+			list_entry(head.next, struct drbd_state_change, list);
+		cb->args[0] = (long)state_change;
+		cb->args[3] = notifications_for_state_change(state_change);
+		list_del(&head);  /* detach list from head */
+	}
+
+	cb->args[2] = cb->nlh->nlmsg_seq;
+	return get_initial_state(skb, cb);
+}
diff --git a/drivers/block/drbd/drbd_nla.c b/drivers/block/drbd/drbd_nla.c
new file mode 100644
index 000000000..6a09b0b98
--- /dev/null
+++ b/drivers/block/drbd/drbd_nla.c
@@ -0,0 +1,56 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/kernel.h>
+#include <net/netlink.h>
+#include <linux/drbd_genl_api.h>
+#include "drbd_nla.h"
+
+static int drbd_nla_check_mandatory(int maxtype, struct nlattr *nla)
+{
+	struct nlattr *head = nla_data(nla);
+	int len = nla_len(nla);
+	int rem;
+
+	/*
+	 * validate_nla (called from nla_parse_nested) ignores attributes
+	 * beyond maxtype, and does not understand the DRBD_GENLA_F_MANDATORY flag.
+	 * In order to have it validate attributes with the DRBD_GENLA_F_MANDATORY
+	 * flag set also, check and remove that flag before calling
+	 * nla_parse_nested.
+	 */
+
+	nla_for_each_attr(nla, head, len, rem) {
+		if (nla->nla_type & DRBD_GENLA_F_MANDATORY) {
+			nla->nla_type &= ~DRBD_GENLA_F_MANDATORY;
+			if (nla_type(nla) > maxtype)
+				return -EOPNOTSUPP;
+		}
+	}
+	return 0;
+}
+
+int drbd_nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla,
+			  const struct nla_policy *policy)
+{
+	int err;
+
+	err = drbd_nla_check_mandatory(maxtype, nla);
+	if (!err)
+		err = nla_parse_nested_deprecated(tb, maxtype, nla, policy,
+						  NULL);
+
+	return err;
+}
+
+struct nlattr *drbd_nla_find_nested(int maxtype, struct nlattr *nla, int attrtype)
+{
+	int err;
+	/*
+	 * If any nested attribute has the DRBD_GENLA_F_MANDATORY flag set and
+	 * we don't know about that attribute, reject all the nested
+	 * attributes.
+	 */
+	err = drbd_nla_check_mandatory(maxtype, nla);
+	if (err)
+		return ERR_PTR(err);
+	return nla_find_nested(nla, attrtype);
+}
diff --git a/drivers/block/drbd/drbd_nla.h b/drivers/block/drbd/drbd_nla.h
new file mode 100644
index 000000000..f5eaffb64
--- /dev/null
+++ b/drivers/block/drbd/drbd_nla.h
@@ -0,0 +1,9 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __DRBD_NLA_H
+#define __DRBD_NLA_H
+
+extern int drbd_nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla,
+				 const struct nla_policy *policy);
+extern struct nlattr *drbd_nla_find_nested(int maxtype, struct nlattr *nla, int attrtype);
+
+#endif  /* __DRBD_NLA_H */
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
new file mode 100644
index 000000000..3c0193de2
--- /dev/null
+++ b/drivers/block/drbd/drbd_proc.c
@@ -0,0 +1,323 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+   drbd_proc.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+
+ */
+
+#include <linux/module.h>
+
+#include <linux/uaccess.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/drbd.h>
+#include "drbd_int.h"
+
+struct proc_dir_entry *drbd_proc;
+
+static void seq_printf_with_thousands_grouping(struct seq_file *seq, long v)
+{
+	/* v is in kB/sec. We don't expect TiByte/sec yet. */
+	if (unlikely(v >= 1000000)) {
+		/* cool: > GiByte/s */
+		seq_printf(seq, "%ld,", v / 1000000);
+		v %= 1000000;
+		seq_printf(seq, "%03ld,%03ld", v/1000, v % 1000);
+	} else if (likely(v >= 1000))
+		seq_printf(seq, "%ld,%03ld", v/1000, v % 1000);
+	else
+		seq_printf(seq, "%ld", v);
+}
+
+static void drbd_get_syncer_progress(struct drbd_device *device,
+		union drbd_dev_state state, unsigned long *rs_total,
+		unsigned long *bits_left, unsigned int *per_mil_done)
+{
+	/* this is to break it at compile time when we change that, in case we
+	 * want to support more than (1<<32) bits on a 32bit arch. */
+	typecheck(unsigned long, device->rs_total);
+	*rs_total = device->rs_total;
+
+	/* note: both rs_total and rs_left are in bits, i.e. in
+	 * units of BM_BLOCK_SIZE.
+	 * for the percentage, we don't care. */
+
+	if (state.conn == C_VERIFY_S || state.conn == C_VERIFY_T)
+		*bits_left = device->ov_left;
+	else
+		*bits_left = drbd_bm_total_weight(device) - device->rs_failed;
+	/* >> 10 to prevent overflow,
+	 * +1 to prevent division by zero */
+	if (*bits_left > *rs_total) {
+		/* D'oh. Maybe a logic bug somewhere.  More likely just a race
+		 * between state change and reset of rs_total.
+		 */
+		*bits_left = *rs_total;
+		*per_mil_done = *rs_total ? 0 : 1000;
+	} else {
+		/* Make sure the division happens in long context.
+		 * We allow up to one petabyte storage right now,
+		 * at a granularity of 4k per bit that is 2**38 bits.
+		 * After shift right and multiplication by 1000,
+		 * this should still fit easily into a 32bit long,
+		 * so we don't need a 64bit division on 32bit arch.
+		 * Note: currently we don't support such large bitmaps on 32bit
+		 * arch anyways, but no harm done to be prepared for it here.
+		 */
+		unsigned int shift = *rs_total > UINT_MAX ? 16 : 10;
+		unsigned long left = *bits_left >> shift;
+		unsigned long total = 1UL + (*rs_total >> shift);
+		unsigned long tmp = 1000UL - left * 1000UL/total;
+		*per_mil_done = tmp;
+	}
+}
+
+
+/*lge
+ * progress bars shamelessly adapted from driver/md/md.c
+ * output looks like
+ *	[=====>..............] 33.5% (23456/123456)
+ *	finish: 2:20:20 speed: 6,345 (6,456) K/sec
+ */
+static void drbd_syncer_progress(struct drbd_device *device, struct seq_file *seq,
+		union drbd_dev_state state)
+{
+	unsigned long db, dt, dbdt, rt, rs_total, rs_left;
+	unsigned int res;
+	int i, x, y;
+	int stalled = 0;
+
+	drbd_get_syncer_progress(device, state, &rs_total, &rs_left, &res);
+
+	x = res/50;
+	y = 20-x;
+	seq_puts(seq, "\t[");
+	for (i = 1; i < x; i++)
+		seq_putc(seq, '=');
+	seq_putc(seq, '>');
+	for (i = 0; i < y; i++)
+		seq_putc(seq, '.');
+	seq_puts(seq, "] ");
+
+	if (state.conn == C_VERIFY_S || state.conn == C_VERIFY_T)
+		seq_puts(seq, "verified:");
+	else
+		seq_puts(seq, "sync'ed:");
+	seq_printf(seq, "%3u.%u%% ", res / 10, res % 10);
+
+	/* if more than a few GB, display in MB */
+	if (rs_total > (4UL << (30 - BM_BLOCK_SHIFT)))
+		seq_printf(seq, "(%lu/%lu)M",
+			    (unsigned long) Bit2KB(rs_left >> 10),
+			    (unsigned long) Bit2KB(rs_total >> 10));
+	else
+		seq_printf(seq, "(%lu/%lu)K",
+			    (unsigned long) Bit2KB(rs_left),
+			    (unsigned long) Bit2KB(rs_total));
+
+	seq_puts(seq, "\n\t");
+
+	/* see drivers/md/md.c
+	 * We do not want to overflow, so the order of operands and
+	 * the * 100 / 100 trick are important. We do a +1 to be
+	 * safe against division by zero. We only estimate anyway.
+	 *
+	 * dt: time from mark until now
+	 * db: blocks written from mark until now
+	 * rt: remaining time
+	 */
+	/* Rolling marks. last_mark+1 may just now be modified.  last_mark+2 is
+	 * at least (DRBD_SYNC_MARKS-2)*DRBD_SYNC_MARK_STEP old, and has at
+	 * least DRBD_SYNC_MARK_STEP time before it will be modified. */
+	/* ------------------------ ~18s average ------------------------ */
+	i = (device->rs_last_mark + 2) % DRBD_SYNC_MARKS;
+	dt = (jiffies - device->rs_mark_time[i]) / HZ;
+	if (dt > 180)
+		stalled = 1;
+
+	if (!dt)
+		dt++;
+	db = device->rs_mark_left[i] - rs_left;
+	rt = (dt * (rs_left / (db/100+1)))/100; /* seconds */
+
+	seq_printf(seq, "finish: %lu:%02lu:%02lu",
+		rt / 3600, (rt % 3600) / 60, rt % 60);
+
+	dbdt = Bit2KB(db/dt);
+	seq_puts(seq, " speed: ");
+	seq_printf_with_thousands_grouping(seq, dbdt);
+	seq_puts(seq, " (");
+	/* ------------------------- ~3s average ------------------------ */
+	if (drbd_proc_details >= 1) {
+		/* this is what drbd_rs_should_slow_down() uses */
+		i = (device->rs_last_mark + DRBD_SYNC_MARKS-1) % DRBD_SYNC_MARKS;
+		dt = (jiffies - device->rs_mark_time[i]) / HZ;
+		if (!dt)
+			dt++;
+		db = device->rs_mark_left[i] - rs_left;
+		dbdt = Bit2KB(db/dt);
+		seq_printf_with_thousands_grouping(seq, dbdt);
+		seq_puts(seq, " -- ");
+	}
+
+	/* --------------------- long term average ---------------------- */
+	/* mean speed since syncer started
+	 * we do account for PausedSync periods */
+	dt = (jiffies - device->rs_start - device->rs_paused) / HZ;
+	if (dt == 0)
+		dt = 1;
+	db = rs_total - rs_left;
+	dbdt = Bit2KB(db/dt);
+	seq_printf_with_thousands_grouping(seq, dbdt);
+	seq_putc(seq, ')');
+
+	if (state.conn == C_SYNC_TARGET ||
+	    state.conn == C_VERIFY_S) {
+		seq_puts(seq, " want: ");
+		seq_printf_with_thousands_grouping(seq, device->c_sync_rate);
+	}
+	seq_printf(seq, " K/sec%s\n", stalled ? " (stalled)" : "");
+
+	if (drbd_proc_details >= 1) {
+		/* 64 bit:
+		 * we convert to sectors in the display below. */
+		unsigned long bm_bits = drbd_bm_bits(device);
+		unsigned long bit_pos;
+		unsigned long long stop_sector = 0;
+		if (state.conn == C_VERIFY_S ||
+		    state.conn == C_VERIFY_T) {
+			bit_pos = bm_bits - device->ov_left;
+			if (verify_can_do_stop_sector(device))
+				stop_sector = device->ov_stop_sector;
+		} else
+			bit_pos = device->bm_resync_fo;
+		/* Total sectors may be slightly off for oddly
+		 * sized devices. So what. */
+		seq_printf(seq,
+			"\t%3d%% sector pos: %llu/%llu",
+			(int)(bit_pos / (bm_bits/100+1)),
+			(unsigned long long)bit_pos * BM_SECT_PER_BIT,
+			(unsigned long long)bm_bits * BM_SECT_PER_BIT);
+		if (stop_sector != 0 && stop_sector != ULLONG_MAX)
+			seq_printf(seq, " stop sector: %llu", stop_sector);
+		seq_putc(seq, '\n');
+	}
+}
+
+int drbd_seq_show(struct seq_file *seq, void *v)
+{
+	int i, prev_i = -1;
+	const char *sn;
+	struct drbd_device *device;
+	struct net_conf *nc;
+	union drbd_dev_state state;
+	char wp;
+
+	static char write_ordering_chars[] = {
+		[WO_NONE] = 'n',
+		[WO_DRAIN_IO] = 'd',
+		[WO_BDEV_FLUSH] = 'f',
+	};
+
+	seq_printf(seq, "version: " REL_VERSION " (api:%d/proto:%d-%d)\n%s\n",
+		   API_VERSION, PRO_VERSION_MIN, PRO_VERSION_MAX, drbd_buildtag());
+
+	/*
+	  cs .. connection state
+	  ro .. node role (local/remote)
+	  ds .. disk state (local/remote)
+	     protocol
+	     various flags
+	  ns .. network send
+	  nr .. network receive
+	  dw .. disk write
+	  dr .. disk read
+	  al .. activity log write count
+	  bm .. bitmap update write count
+	  pe .. pending (waiting for ack or data reply)
+	  ua .. unack'd (still need to send ack or data reply)
+	  ap .. application requests accepted, but not yet completed
+	  ep .. number of epochs currently "on the fly", P_BARRIER_ACK pending
+	  wo .. write ordering mode currently in use
+	 oos .. known out-of-sync kB
+	*/
+
+	rcu_read_lock();
+	idr_for_each_entry(&drbd_devices, device, i) {
+		if (prev_i != i - 1)
+			seq_putc(seq, '\n');
+		prev_i = i;
+
+		state = device->state;
+		sn = drbd_conn_str(state.conn);
+
+		if (state.conn == C_STANDALONE &&
+		    state.disk == D_DISKLESS &&
+		    state.role == R_SECONDARY) {
+			seq_printf(seq, "%2d: cs:Unconfigured\n", i);
+		} else {
+			/* reset device->congestion_reason */
+
+			nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+			wp = nc ? nc->wire_protocol - DRBD_PROT_A + 'A' : ' ';
+			seq_printf(seq,
+			   "%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c%c\n"
+			   "    ns:%u nr:%u dw:%u dr:%u al:%u bm:%u "
+			   "lo:%d pe:%d ua:%d ap:%d ep:%d wo:%c",
+			   i, sn,
+			   drbd_role_str(state.role),
+			   drbd_role_str(state.peer),
+			   drbd_disk_str(state.disk),
+			   drbd_disk_str(state.pdsk),
+			   wp,
+			   drbd_suspended(device) ? 's' : 'r',
+			   state.aftr_isp ? 'a' : '-',
+			   state.peer_isp ? 'p' : '-',
+			   state.user_isp ? 'u' : '-',
+			   device->congestion_reason ?: '-',
+			   test_bit(AL_SUSPENDED, &device->flags) ? 's' : '-',
+			   device->send_cnt/2,
+			   device->recv_cnt/2,
+			   device->writ_cnt/2,
+			   device->read_cnt/2,
+			   device->al_writ_cnt,
+			   device->bm_writ_cnt,
+			   atomic_read(&device->local_cnt),
+			   atomic_read(&device->ap_pending_cnt) +
+			   atomic_read(&device->rs_pending_cnt),
+			   atomic_read(&device->unacked_cnt),
+			   atomic_read(&device->ap_bio_cnt),
+			   first_peer_device(device)->connection->epochs,
+			   write_ordering_chars[device->resource->write_ordering]
+			);
+			seq_printf(seq, " oos:%llu\n",
+				   Bit2KB((unsigned long long)
+					   drbd_bm_total_weight(device)));
+		}
+		if (state.conn == C_SYNC_SOURCE ||
+		    state.conn == C_SYNC_TARGET ||
+		    state.conn == C_VERIFY_S ||
+		    state.conn == C_VERIFY_T)
+			drbd_syncer_progress(device, seq, state);
+
+		if (drbd_proc_details >= 1 && get_ldev_if_state(device, D_FAILED)) {
+			lc_seq_printf_stats(seq, device->resync);
+			lc_seq_printf_stats(seq, device->act_log);
+			put_ldev(device);
+		}
+
+		if (drbd_proc_details >= 2)
+			seq_printf(seq, "\tblocked on activity log: %d\n", atomic_read(&device->ap_actlog_cnt));
+	}
+	rcu_read_unlock();
+
+	return 0;
+}
diff --git a/drivers/block/drbd/drbd_protocol.h b/drivers/block/drbd/drbd_protocol.h
new file mode 100644
index 000000000..a882b65ab
--- /dev/null
+++ b/drivers/block/drbd/drbd_protocol.h
@@ -0,0 +1,428 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __DRBD_PROTOCOL_H
+#define __DRBD_PROTOCOL_H
+
+enum drbd_packet {
+	/* receiver (data socket) */
+	P_DATA		      = 0x00,
+	P_DATA_REPLY	      = 0x01, /* Response to P_DATA_REQUEST */
+	P_RS_DATA_REPLY	      = 0x02, /* Response to P_RS_DATA_REQUEST */
+	P_BARRIER	      = 0x03,
+	P_BITMAP	      = 0x04,
+	P_BECOME_SYNC_TARGET  = 0x05,
+	P_BECOME_SYNC_SOURCE  = 0x06,
+	P_UNPLUG_REMOTE	      = 0x07, /* Used at various times to hint the peer */
+	P_DATA_REQUEST	      = 0x08, /* Used to ask for a data block */
+	P_RS_DATA_REQUEST     = 0x09, /* Used to ask for a data block for resync */
+	P_SYNC_PARAM	      = 0x0a,
+	P_PROTOCOL	      = 0x0b,
+	P_UUIDS		      = 0x0c,
+	P_SIZES		      = 0x0d,
+	P_STATE		      = 0x0e,
+	P_SYNC_UUID	      = 0x0f,
+	P_AUTH_CHALLENGE      = 0x10,
+	P_AUTH_RESPONSE	      = 0x11,
+	P_STATE_CHG_REQ	      = 0x12,
+
+	/* (meta socket) */
+	P_PING		      = 0x13,
+	P_PING_ACK	      = 0x14,
+	P_RECV_ACK	      = 0x15, /* Used in protocol B */
+	P_WRITE_ACK	      = 0x16, /* Used in protocol C */
+	P_RS_WRITE_ACK	      = 0x17, /* Is a P_WRITE_ACK, additionally call set_in_sync(). */
+	P_SUPERSEDED	      = 0x18, /* Used in proto C, two-primaries conflict detection */
+	P_NEG_ACK	      = 0x19, /* Sent if local disk is unusable */
+	P_NEG_DREPLY	      = 0x1a, /* Local disk is broken... */
+	P_NEG_RS_DREPLY	      = 0x1b, /* Local disk is broken... */
+	P_BARRIER_ACK	      = 0x1c,
+	P_STATE_CHG_REPLY     = 0x1d,
+
+	/* "new" commands, no longer fitting into the ordering scheme above */
+
+	P_OV_REQUEST	      = 0x1e, /* data socket */
+	P_OV_REPLY	      = 0x1f,
+	P_OV_RESULT	      = 0x20, /* meta socket */
+	P_CSUM_RS_REQUEST     = 0x21, /* data socket */
+	P_RS_IS_IN_SYNC	      = 0x22, /* meta socket */
+	P_SYNC_PARAM89	      = 0x23, /* data socket, protocol version 89 replacement for P_SYNC_PARAM */
+	P_COMPRESSED_BITMAP   = 0x24, /* compressed or otherwise encoded bitmap transfer */
+	/* P_CKPT_FENCE_REQ      = 0x25, * currently reserved for protocol D */
+	/* P_CKPT_DISABLE_REQ    = 0x26, * currently reserved for protocol D */
+	P_DELAY_PROBE         = 0x27, /* is used on BOTH sockets */
+	P_OUT_OF_SYNC         = 0x28, /* Mark as out of sync (Outrunning), data socket */
+	P_RS_CANCEL           = 0x29, /* meta: Used to cancel RS_DATA_REQUEST packet by SyncSource */
+	P_CONN_ST_CHG_REQ     = 0x2a, /* data sock: Connection wide state request */
+	P_CONN_ST_CHG_REPLY   = 0x2b, /* meta sock: Connection side state req reply */
+	P_RETRY_WRITE	      = 0x2c, /* Protocol C: retry conflicting write request */
+	P_PROTOCOL_UPDATE     = 0x2d, /* data sock: is used in established connections */
+        /* 0x2e to 0x30 reserved, used in drbd 9 */
+
+	/* REQ_OP_DISCARD. We used "discard" in different contexts before,
+	 * which is why I chose TRIM here, to disambiguate. */
+	P_TRIM                = 0x31,
+
+	/* Only use these two if both support FF_THIN_RESYNC */
+	P_RS_THIN_REQ         = 0x32, /* Request a block for resync or reply P_RS_DEALLOCATED */
+	P_RS_DEALLOCATED      = 0x33, /* Contains only zeros on sync source node */
+
+	/* REQ_WRITE_SAME.
+	 * On a receiving side without REQ_WRITE_SAME,
+	 * we may fall back to an opencoded loop instead. */
+	P_WSAME               = 0x34,
+
+	/* 0x35 already claimed in DRBD 9 */
+	P_ZEROES              = 0x36, /* data sock: zero-out, WRITE_ZEROES */
+
+	/* 0x40 .. 0x48 already claimed in DRBD 9 */
+
+	P_MAY_IGNORE	      = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */
+	P_MAX_OPT_CMD	      = 0x101,
+
+	/* special command ids for handshake */
+
+	P_INITIAL_META	      = 0xfff1, /* First Packet on the MetaSock */
+	P_INITIAL_DATA	      = 0xfff2, /* First Packet on the Socket */
+
+	P_CONNECTION_FEATURES = 0xfffe	/* FIXED for the next century! */
+};
+
+#ifndef __packed
+#define __packed __attribute__((packed))
+#endif
+
+/* This is the layout for a packet on the wire.
+ * The byteorder is the network byte order.
+ *     (except block_id and barrier fields.
+ *	these are pointers to local structs
+ *	and have no relevance for the partner,
+ *	which just echoes them as received.)
+ *
+ * NOTE that the payload starts at a long aligned offset,
+ * regardless of 32 or 64 bit arch!
+ */
+struct p_header80 {
+	u32	  magic;
+	u16	  command;
+	u16	  length;	/* bytes of data after this header */
+} __packed;
+
+/* Header for big packets, Used for data packets exceeding 64kB */
+struct p_header95 {
+	u16	  magic;	/* use DRBD_MAGIC_BIG here */
+	u16	  command;
+	u32	  length;
+} __packed;
+
+struct p_header100 {
+	u32	  magic;
+	u16	  volume;
+	u16	  command;
+	u32	  length;
+	u32	  pad;
+} __packed;
+
+/* These defines must not be changed without changing the protocol version.
+ * New defines may only be introduced together with protocol version bump or
+ * new protocol feature flags.
+ */
+#define DP_HARDBARRIER	      1 /* no longer used */
+#define DP_RW_SYNC	      2 /* equals REQ_SYNC    */
+#define DP_MAY_SET_IN_SYNC    4
+#define DP_UNPLUG             8 /* not used anymore   */
+#define DP_FUA               16 /* equals REQ_FUA     */
+#define DP_FLUSH             32 /* equals REQ_PREFLUSH   */
+#define DP_DISCARD           64 /* equals REQ_OP_DISCARD */
+#define DP_SEND_RECEIVE_ACK 128 /* This is a proto B write request */
+#define DP_SEND_WRITE_ACK   256 /* This is a proto C write request */
+#define DP_WSAME            512 /* equiv. REQ_WRITE_SAME */
+#define DP_ZEROES          1024 /* equiv. REQ_OP_WRITE_ZEROES */
+
+/* possible combinations:
+ * REQ_OP_WRITE_ZEROES:  DP_DISCARD | DP_ZEROES
+ * REQ_OP_WRITE_ZEROES + REQ_NOUNMAP: DP_ZEROES
+ */
+
+struct p_data {
+	u64	    sector;    /* 64 bits sector number */
+	u64	    block_id;  /* to identify the request in protocol B&C */
+	u32	    seq_num;
+	u32	    dp_flags;
+} __packed;
+
+struct p_trim {
+	struct p_data p_data;
+	u32	    size;	/* == bio->bi_size */
+} __packed;
+
+struct p_wsame {
+	struct p_data p_data;
+	u32           size;     /* == bio->bi_size */
+} __packed;
+
+/*
+ * commands which share a struct:
+ *  p_block_ack:
+ *   P_RECV_ACK (proto B), P_WRITE_ACK (proto C),
+ *   P_SUPERSEDED (proto C, two-primaries conflict detection)
+ *  p_block_req:
+ *   P_DATA_REQUEST, P_RS_DATA_REQUEST
+ */
+struct p_block_ack {
+	u64	    sector;
+	u64	    block_id;
+	u32	    blksize;
+	u32	    seq_num;
+} __packed;
+
+struct p_block_req {
+	u64 sector;
+	u64 block_id;
+	u32 blksize;
+	u32 pad;	/* to multiple of 8 Byte */
+} __packed;
+
+/*
+ * commands with their own struct for additional fields:
+ *   P_CONNECTION_FEATURES
+ *   P_BARRIER
+ *   P_BARRIER_ACK
+ *   P_SYNC_PARAM
+ *   ReportParams
+ */
+
+/* supports TRIM/DISCARD on the "wire" protocol */
+#define DRBD_FF_TRIM 1
+
+/* Detect all-zeros during resync, and rather TRIM/UNMAP/DISCARD those blocks
+ * instead of fully allocate a supposedly thin volume on initial resync */
+#define DRBD_FF_THIN_RESYNC 2
+
+/* supports REQ_WRITE_SAME on the "wire" protocol.
+ * Note: this flag is overloaded,
+ * its presence also
+ *   - indicates support for 128 MiB "batch bios",
+ *     max discard size of 128 MiB
+ *     instead of 4M before that.
+ *   - indicates that we exchange additional settings in p_sizes
+ *     drbd_send_sizes()/receive_sizes()
+ */
+#define DRBD_FF_WSAME 4
+
+/* supports REQ_OP_WRITE_ZEROES on the "wire" protocol.
+ *
+ * We used to map that to "discard" on the sending side, and if we cannot
+ * guarantee that discard zeroes data, the receiving side would map discard
+ * back to zero-out.
+ *
+ * With the introduction of REQ_OP_WRITE_ZEROES,
+ * we started to use that for both WRITE_ZEROES and DISCARDS,
+ * hoping that WRITE_ZEROES would "do what we want",
+ * UNMAP if possible, zero-out the rest.
+ *
+ * The example scenario is some LVM "thin" backend.
+ *
+ * While an un-allocated block on dm-thin reads as zeroes, on a dm-thin
+ * with "skip_block_zeroing=true", after a partial block write allocated
+ * that block, that same block may well map "undefined old garbage" from
+ * the backends on LBAs that have not yet been written to.
+ *
+ * If we cannot distinguish between zero-out and discard on the receiving
+ * side, to avoid "undefined old garbage" to pop up randomly at later times
+ * on supposedly zero-initialized blocks, we'd need to map all discards to
+ * zero-out on the receiving side.  But that would potentially do a full
+ * alloc on thinly provisioned backends, even when the expectation was to
+ * unmap/trim/discard/de-allocate.
+ *
+ * We need to distinguish on the protocol level, whether we need to guarantee
+ * zeroes (and thus use zero-out, potentially doing the mentioned full-alloc),
+ * or if we want to put the emphasis on discard, and only do a "best effort
+ * zeroing" (by "discarding" blocks aligned to discard-granularity, and zeroing
+ * only potential unaligned head and tail clippings), to at least *try* to
+ * avoid "false positives" in an online-verify later, hoping that someone
+ * set skip_block_zeroing=false.
+ */
+#define DRBD_FF_WZEROES 8
+
+
+struct p_connection_features {
+	u32 protocol_min;
+	u32 feature_flags;
+	u32 protocol_max;
+
+	/* should be more than enough for future enhancements
+	 * for now, feature_flags and the reserved array shall be zero.
+	 */
+
+	u32 _pad;
+	u64 reserved[7];
+} __packed;
+
+struct p_barrier {
+	u32 barrier;	/* barrier number _handle_ only */
+	u32 pad;	/* to multiple of 8 Byte */
+} __packed;
+
+struct p_barrier_ack {
+	u32 barrier;
+	u32 set_size;
+} __packed;
+
+struct p_rs_param {
+	u32 resync_rate;
+
+	      /* Since protocol version 88 and higher. */
+	char verify_alg[];
+} __packed;
+
+struct p_rs_param_89 {
+	u32 resync_rate;
+	/* protocol version 89: */
+	char verify_alg[SHARED_SECRET_MAX];
+	char csums_alg[SHARED_SECRET_MAX];
+} __packed;
+
+struct p_rs_param_95 {
+	u32 resync_rate;
+	struct_group(algs,
+		char verify_alg[SHARED_SECRET_MAX];
+		char csums_alg[SHARED_SECRET_MAX];
+	);
+	u32 c_plan_ahead;
+	u32 c_delay_target;
+	u32 c_fill_target;
+	u32 c_max_rate;
+} __packed;
+
+enum drbd_conn_flags {
+	CF_DISCARD_MY_DATA = 1,
+	CF_DRY_RUN = 2,
+};
+
+struct p_protocol {
+	u32 protocol;
+	u32 after_sb_0p;
+	u32 after_sb_1p;
+	u32 after_sb_2p;
+	u32 conn_flags;
+	u32 two_primaries;
+
+	/* Since protocol version 87 and higher. */
+	char integrity_alg[];
+
+} __packed;
+
+struct p_uuids {
+	u64 uuid[UI_EXTENDED_SIZE];
+} __packed;
+
+struct p_rs_uuid {
+	u64	    uuid;
+} __packed;
+
+/* optional queue_limits if (agreed_features & DRBD_FF_WSAME)
+ * see also struct queue_limits, as of late 2015 */
+struct o_qlim {
+	/* we don't need it yet, but we may as well communicate it now */
+	u32 physical_block_size;
+
+	/* so the original in struct queue_limits is unsigned short,
+	 * but I'd have to put in padding anyways. */
+	u32 logical_block_size;
+
+	/* One incoming bio becomes one DRBD request,
+	 * which may be translated to several bio on the receiving side.
+	 * We don't need to communicate chunk/boundary/segment ... limits.
+	 */
+
+	/* various IO hints may be useful with "diskless client" setups */
+	u32 alignment_offset;
+	u32 io_min;
+	u32 io_opt;
+
+	/* We may need to communicate integrity stuff at some point,
+	 * but let's not get ahead of ourselves. */
+
+	/* Backend discard capabilities.
+	 * Receiving side uses "blkdev_issue_discard()", no need to communicate
+	 * more specifics.  If the backend cannot do discards, the DRBD peer
+	 * may fall back to blkdev_issue_zeroout().
+	 */
+	u8 discard_enabled;
+	u8 discard_zeroes_data;
+	u8 write_same_capable;
+	u8 _pad;
+} __packed;
+
+struct p_sizes {
+	u64	    d_size;  /* size of disk */
+	u64	    u_size;  /* user requested size */
+	u64	    c_size;  /* current exported size */
+	u32	    max_bio_size;  /* Maximal size of a BIO */
+	u16	    queue_order_type;  /* not yet implemented in DRBD*/
+	u16	    dds_flags; /* use enum dds_flags here. */
+
+	/* optional queue_limits if (agreed_features & DRBD_FF_WSAME) */
+	struct o_qlim qlim[];
+} __packed;
+
+struct p_state {
+	u32	    state;
+} __packed;
+
+struct p_req_state {
+	u32	    mask;
+	u32	    val;
+} __packed;
+
+struct p_req_state_reply {
+	u32	    retcode;
+} __packed;
+
+struct p_drbd06_param {
+	u64	  size;
+	u32	  state;
+	u32	  blksize;
+	u32	  protocol;
+	u32	  version;
+	u32	  gen_cnt[5];
+	u32	  bit_map_gen[5];
+} __packed;
+
+struct p_block_desc {
+	u64 sector;
+	u32 blksize;
+	u32 pad;	/* to multiple of 8 Byte */
+} __packed;
+
+/* Valid values for the encoding field.
+ * Bump proto version when changing this. */
+enum drbd_bitmap_code {
+	/* RLE_VLI_Bytes = 0,
+	 * and other bit variants had been defined during
+	 * algorithm evaluation. */
+	RLE_VLI_Bits = 2,
+};
+
+struct p_compressed_bm {
+	/* (encoding & 0x0f): actual encoding, see enum drbd_bitmap_code
+	 * (encoding & 0x80): polarity (set/unset) of first runlength
+	 * ((encoding >> 4) & 0x07): pad_bits, number of trailing zero bits
+	 * used to pad up to head.length bytes
+	 */
+	u8 encoding;
+
+	u8 code[];
+} __packed;
+
+struct p_delay_probe93 {
+	u32     seq_num; /* sequence number to match the two probe packets */
+	u32     offset;  /* usecs the probe got sent after the reference time point */
+} __packed;
+
+/*
+ * Bitmap packets need to fit within a single page on the sender and receiver,
+ * so we are limited to 4 KiB (and not to PAGE_SIZE, which can be bigger).
+ */
+#define DRBD_SOCKET_BUFFER_SIZE 4096
+
+#endif  /* __DRBD_PROTOCOL_H */
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
new file mode 100644
index 000000000..4ba09abbc
--- /dev/null
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -0,0 +1,6113 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+   drbd_receiver.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+ */
+
+
+#include <linux/module.h>
+
+#include <linux/uaccess.h>
+#include <net/sock.h>
+
+#include <linux/drbd.h>
+#include <linux/fs.h>
+#include <linux/file.h>
+#include <linux/in.h>
+#include <linux/mm.h>
+#include <linux/memcontrol.h>
+#include <linux/mm_inline.h>
+#include <linux/slab.h>
+#include <uapi/linux/sched/types.h>
+#include <linux/sched/signal.h>
+#include <linux/pkt_sched.h>
+#define __KERNEL_SYSCALLS__
+#include <linux/unistd.h>
+#include <linux/vmalloc.h>
+#include <linux/random.h>
+#include <linux/string.h>
+#include <linux/scatterlist.h>
+#include <linux/part_stat.h>
+#include "drbd_int.h"
+#include "drbd_protocol.h"
+#include "drbd_req.h"
+#include "drbd_vli.h"
+
+#define PRO_FEATURES (DRBD_FF_TRIM|DRBD_FF_THIN_RESYNC|DRBD_FF_WSAME|DRBD_FF_WZEROES)
+
+struct packet_info {
+	enum drbd_packet cmd;
+	unsigned int size;
+	unsigned int vnr;
+	void *data;
+};
+
+enum finish_epoch {
+	FE_STILL_LIVE,
+	FE_DESTROYED,
+	FE_RECYCLED,
+};
+
+static int drbd_do_features(struct drbd_connection *connection);
+static int drbd_do_auth(struct drbd_connection *connection);
+static int drbd_disconnected(struct drbd_peer_device *);
+static void conn_wait_active_ee_empty(struct drbd_connection *connection);
+static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *, struct drbd_epoch *, enum epoch_event);
+static int e_end_block(struct drbd_work *, int);
+
+
+#define GFP_TRY	(__GFP_HIGHMEM | __GFP_NOWARN)
+
+/*
+ * some helper functions to deal with single linked page lists,
+ * page->private being our "next" pointer.
+ */
+
+/* If at least n pages are linked at head, get n pages off.
+ * Otherwise, don't modify head, and return NULL.
+ * Locking is the responsibility of the caller.
+ */
+static struct page *page_chain_del(struct page **head, int n)
+{
+	struct page *page;
+	struct page *tmp;
+
+	BUG_ON(!n);
+	BUG_ON(!head);
+
+	page = *head;
+
+	if (!page)
+		return NULL;
+
+	while (page) {
+		tmp = page_chain_next(page);
+		if (--n == 0)
+			break; /* found sufficient pages */
+		if (tmp == NULL)
+			/* insufficient pages, don't use any of them. */
+			return NULL;
+		page = tmp;
+	}
+
+	/* add end of list marker for the returned list */
+	set_page_private(page, 0);
+	/* actual return value, and adjustment of head */
+	page = *head;
+	*head = tmp;
+	return page;
+}
+
+/* may be used outside of locks to find the tail of a (usually short)
+ * "private" page chain, before adding it back to a global chain head
+ * with page_chain_add() under a spinlock. */
+static struct page *page_chain_tail(struct page *page, int *len)
+{
+	struct page *tmp;
+	int i = 1;
+	while ((tmp = page_chain_next(page))) {
+		++i;
+		page = tmp;
+	}
+	if (len)
+		*len = i;
+	return page;
+}
+
+static int page_chain_free(struct page *page)
+{
+	struct page *tmp;
+	int i = 0;
+	page_chain_for_each_safe(page, tmp) {
+		put_page(page);
+		++i;
+	}
+	return i;
+}
+
+static void page_chain_add(struct page **head,
+		struct page *chain_first, struct page *chain_last)
+{
+#if 1
+	struct page *tmp;
+	tmp = page_chain_tail(chain_first, NULL);
+	BUG_ON(tmp != chain_last);
+#endif
+
+	/* add chain to head */
+	set_page_private(chain_last, (unsigned long)*head);
+	*head = chain_first;
+}
+
+static struct page *__drbd_alloc_pages(struct drbd_device *device,
+				       unsigned int number)
+{
+	struct page *page = NULL;
+	struct page *tmp = NULL;
+	unsigned int i = 0;
+
+	/* Yes, testing drbd_pp_vacant outside the lock is racy.
+	 * So what. It saves a spin_lock. */
+	if (drbd_pp_vacant >= number) {
+		spin_lock(&drbd_pp_lock);
+		page = page_chain_del(&drbd_pp_pool, number);
+		if (page)
+			drbd_pp_vacant -= number;
+		spin_unlock(&drbd_pp_lock);
+		if (page)
+			return page;
+	}
+
+	/* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD
+	 * "criss-cross" setup, that might cause write-out on some other DRBD,
+	 * which in turn might block on the other node at this very place.  */
+	for (i = 0; i < number; i++) {
+		tmp = alloc_page(GFP_TRY);
+		if (!tmp)
+			break;
+		set_page_private(tmp, (unsigned long)page);
+		page = tmp;
+	}
+
+	if (i == number)
+		return page;
+
+	/* Not enough pages immediately available this time.
+	 * No need to jump around here, drbd_alloc_pages will retry this
+	 * function "soon". */
+	if (page) {
+		tmp = page_chain_tail(page, NULL);
+		spin_lock(&drbd_pp_lock);
+		page_chain_add(&drbd_pp_pool, page, tmp);
+		drbd_pp_vacant += i;
+		spin_unlock(&drbd_pp_lock);
+	}
+	return NULL;
+}
+
+static void reclaim_finished_net_peer_reqs(struct drbd_device *device,
+					   struct list_head *to_be_freed)
+{
+	struct drbd_peer_request *peer_req, *tmp;
+
+	/* The EEs are always appended to the end of the list. Since
+	   they are sent in order over the wire, they have to finish
+	   in order. As soon as we see the first not finished we can
+	   stop to examine the list... */
+
+	list_for_each_entry_safe(peer_req, tmp, &device->net_ee, w.list) {
+		if (drbd_peer_req_has_active_page(peer_req))
+			break;
+		list_move(&peer_req->w.list, to_be_freed);
+	}
+}
+
+static void drbd_reclaim_net_peer_reqs(struct drbd_device *device)
+{
+	LIST_HEAD(reclaimed);
+	struct drbd_peer_request *peer_req, *t;
+
+	spin_lock_irq(&device->resource->req_lock);
+	reclaim_finished_net_peer_reqs(device, &reclaimed);
+	spin_unlock_irq(&device->resource->req_lock);
+	list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
+		drbd_free_net_peer_req(device, peer_req);
+}
+
+static void conn_reclaim_net_peer_reqs(struct drbd_connection *connection)
+{
+	struct drbd_peer_device *peer_device;
+	int vnr;
+
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+		struct drbd_device *device = peer_device->device;
+		if (!atomic_read(&device->pp_in_use_by_net))
+			continue;
+
+		kref_get(&device->kref);
+		rcu_read_unlock();
+		drbd_reclaim_net_peer_reqs(device);
+		kref_put(&device->kref, drbd_destroy_device);
+		rcu_read_lock();
+	}
+	rcu_read_unlock();
+}
+
+/**
+ * drbd_alloc_pages() - Returns @number pages, retries forever (or until signalled)
+ * @peer_device:	DRBD device.
+ * @number:		number of pages requested
+ * @retry:		whether to retry, if not enough pages are available right now
+ *
+ * Tries to allocate number pages, first from our own page pool, then from
+ * the kernel.
+ * Possibly retry until DRBD frees sufficient pages somewhere else.
+ *
+ * If this allocation would exceed the max_buffers setting, we throttle
+ * allocation (schedule_timeout) to give the system some room to breathe.
+ *
+ * We do not use max-buffers as hard limit, because it could lead to
+ * congestion and further to a distributed deadlock during online-verify or
+ * (checksum based) resync, if the max-buffers, socket buffer sizes and
+ * resync-rate settings are mis-configured.
+ *
+ * Returns a page chain linked via page->private.
+ */
+struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int number,
+			      bool retry)
+{
+	struct drbd_device *device = peer_device->device;
+	struct page *page = NULL;
+	struct net_conf *nc;
+	DEFINE_WAIT(wait);
+	unsigned int mxb;
+
+	rcu_read_lock();
+	nc = rcu_dereference(peer_device->connection->net_conf);
+	mxb = nc ? nc->max_buffers : 1000000;
+	rcu_read_unlock();
+
+	if (atomic_read(&device->pp_in_use) < mxb)
+		page = __drbd_alloc_pages(device, number);
+
+	/* Try to keep the fast path fast, but occasionally we need
+	 * to reclaim the pages we lended to the network stack. */
+	if (page && atomic_read(&device->pp_in_use_by_net) > 512)
+		drbd_reclaim_net_peer_reqs(device);
+
+	while (page == NULL) {
+		prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE);
+
+		drbd_reclaim_net_peer_reqs(device);
+
+		if (atomic_read(&device->pp_in_use) < mxb) {
+			page = __drbd_alloc_pages(device, number);
+			if (page)
+				break;
+		}
+
+		if (!retry)
+			break;
+
+		if (signal_pending(current)) {
+			drbd_warn(device, "drbd_alloc_pages interrupted!\n");
+			break;
+		}
+
+		if (schedule_timeout(HZ/10) == 0)
+			mxb = UINT_MAX;
+	}
+	finish_wait(&drbd_pp_wait, &wait);
+
+	if (page)
+		atomic_add(number, &device->pp_in_use);
+	return page;
+}
+
+/* Must not be used from irq, as that may deadlock: see drbd_alloc_pages.
+ * Is also used from inside an other spin_lock_irq(&resource->req_lock);
+ * Either links the page chain back to the global pool,
+ * or returns all pages to the system. */
+static void drbd_free_pages(struct drbd_device *device, struct page *page, int is_net)
+{
+	atomic_t *a = is_net ? &device->pp_in_use_by_net : &device->pp_in_use;
+	int i;
+
+	if (page == NULL)
+		return;
+
+	if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * drbd_minor_count)
+		i = page_chain_free(page);
+	else {
+		struct page *tmp;
+		tmp = page_chain_tail(page, &i);
+		spin_lock(&drbd_pp_lock);
+		page_chain_add(&drbd_pp_pool, page, tmp);
+		drbd_pp_vacant += i;
+		spin_unlock(&drbd_pp_lock);
+	}
+	i = atomic_sub_return(i, a);
+	if (i < 0)
+		drbd_warn(device, "ASSERTION FAILED: %s: %d < 0\n",
+			is_net ? "pp_in_use_by_net" : "pp_in_use", i);
+	wake_up(&drbd_pp_wait);
+}
+
+/*
+You need to hold the req_lock:
+ _drbd_wait_ee_list_empty()
+
+You must not have the req_lock:
+ drbd_free_peer_req()
+ drbd_alloc_peer_req()
+ drbd_free_peer_reqs()
+ drbd_ee_fix_bhs()
+ drbd_finish_peer_reqs()
+ drbd_clear_done_ee()
+ drbd_wait_ee_list_empty()
+*/
+
+/* normal: payload_size == request size (bi_size)
+ * w_same: payload_size == logical_block_size
+ * trim: payload_size == 0 */
+struct drbd_peer_request *
+drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
+		    unsigned int request_size, unsigned int payload_size, gfp_t gfp_mask) __must_hold(local)
+{
+	struct drbd_device *device = peer_device->device;
+	struct drbd_peer_request *peer_req;
+	struct page *page = NULL;
+	unsigned int nr_pages = PFN_UP(payload_size);
+
+	if (drbd_insert_fault(device, DRBD_FAULT_AL_EE))
+		return NULL;
+
+	peer_req = mempool_alloc(&drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM);
+	if (!peer_req) {
+		if (!(gfp_mask & __GFP_NOWARN))
+			drbd_err(device, "%s: allocation failed\n", __func__);
+		return NULL;
+	}
+
+	if (nr_pages) {
+		page = drbd_alloc_pages(peer_device, nr_pages,
+					gfpflags_allow_blocking(gfp_mask));
+		if (!page)
+			goto fail;
+	}
+
+	memset(peer_req, 0, sizeof(*peer_req));
+	INIT_LIST_HEAD(&peer_req->w.list);
+	drbd_clear_interval(&peer_req->i);
+	peer_req->i.size = request_size;
+	peer_req->i.sector = sector;
+	peer_req->submit_jif = jiffies;
+	peer_req->peer_device = peer_device;
+	peer_req->pages = page;
+	/*
+	 * The block_id is opaque to the receiver.  It is not endianness
+	 * converted, and sent back to the sender unchanged.
+	 */
+	peer_req->block_id = id;
+
+	return peer_req;
+
+ fail:
+	mempool_free(peer_req, &drbd_ee_mempool);
+	return NULL;
+}
+
+void __drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *peer_req,
+		       int is_net)
+{
+	might_sleep();
+	if (peer_req->flags & EE_HAS_DIGEST)
+		kfree(peer_req->digest);
+	drbd_free_pages(device, peer_req->pages, is_net);
+	D_ASSERT(device, atomic_read(&peer_req->pending_bios) == 0);
+	D_ASSERT(device, drbd_interval_empty(&peer_req->i));
+	if (!expect(!(peer_req->flags & EE_CALL_AL_COMPLETE_IO))) {
+		peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
+		drbd_al_complete_io(device, &peer_req->i);
+	}
+	mempool_free(peer_req, &drbd_ee_mempool);
+}
+
+int drbd_free_peer_reqs(struct drbd_device *device, struct list_head *list)
+{
+	LIST_HEAD(work_list);
+	struct drbd_peer_request *peer_req, *t;
+	int count = 0;
+	int is_net = list == &device->net_ee;
+
+	spin_lock_irq(&device->resource->req_lock);
+	list_splice_init(list, &work_list);
+	spin_unlock_irq(&device->resource->req_lock);
+
+	list_for_each_entry_safe(peer_req, t, &work_list, w.list) {
+		__drbd_free_peer_req(device, peer_req, is_net);
+		count++;
+	}
+	return count;
+}
+
+/*
+ * See also comments in _req_mod(,BARRIER_ACKED) and receive_Barrier.
+ */
+static int drbd_finish_peer_reqs(struct drbd_device *device)
+{
+	LIST_HEAD(work_list);
+	LIST_HEAD(reclaimed);
+	struct drbd_peer_request *peer_req, *t;
+	int err = 0;
+
+	spin_lock_irq(&device->resource->req_lock);
+	reclaim_finished_net_peer_reqs(device, &reclaimed);
+	list_splice_init(&device->done_ee, &work_list);
+	spin_unlock_irq(&device->resource->req_lock);
+
+	list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
+		drbd_free_net_peer_req(device, peer_req);
+
+	/* possible callbacks here:
+	 * e_end_block, and e_end_resync_block, e_send_superseded.
+	 * all ignore the last argument.
+	 */
+	list_for_each_entry_safe(peer_req, t, &work_list, w.list) {
+		int err2;
+
+		/* list_del not necessary, next/prev members not touched */
+		err2 = peer_req->w.cb(&peer_req->w, !!err);
+		if (!err)
+			err = err2;
+		drbd_free_peer_req(device, peer_req);
+	}
+	wake_up(&device->ee_wait);
+
+	return err;
+}
+
+static void _drbd_wait_ee_list_empty(struct drbd_device *device,
+				     struct list_head *head)
+{
+	DEFINE_WAIT(wait);
+
+	/* avoids spin_lock/unlock
+	 * and calling prepare_to_wait in the fast path */
+	while (!list_empty(head)) {
+		prepare_to_wait(&device->ee_wait, &wait, TASK_UNINTERRUPTIBLE);
+		spin_unlock_irq(&device->resource->req_lock);
+		io_schedule();
+		finish_wait(&device->ee_wait, &wait);
+		spin_lock_irq(&device->resource->req_lock);
+	}
+}
+
+static void drbd_wait_ee_list_empty(struct drbd_device *device,
+				    struct list_head *head)
+{
+	spin_lock_irq(&device->resource->req_lock);
+	_drbd_wait_ee_list_empty(device, head);
+	spin_unlock_irq(&device->resource->req_lock);
+}
+
+static int drbd_recv_short(struct socket *sock, void *buf, size_t size, int flags)
+{
+	struct kvec iov = {
+		.iov_base = buf,
+		.iov_len = size,
+	};
+	struct msghdr msg = {
+		.msg_flags = (flags ? flags : MSG_WAITALL | MSG_NOSIGNAL)
+	};
+	iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, size);
+	return sock_recvmsg(sock, &msg, msg.msg_flags);
+}
+
+static int drbd_recv(struct drbd_connection *connection, void *buf, size_t size)
+{
+	int rv;
+
+	rv = drbd_recv_short(connection->data.socket, buf, size, 0);
+
+	if (rv < 0) {
+		if (rv == -ECONNRESET)
+			drbd_info(connection, "sock was reset by peer\n");
+		else if (rv != -ERESTARTSYS)
+			drbd_err(connection, "sock_recvmsg returned %d\n", rv);
+	} else if (rv == 0) {
+		if (test_bit(DISCONNECT_SENT, &connection->flags)) {
+			long t;
+			rcu_read_lock();
+			t = rcu_dereference(connection->net_conf)->ping_timeo * HZ/10;
+			rcu_read_unlock();
+
+			t = wait_event_timeout(connection->ping_wait, connection->cstate < C_WF_REPORT_PARAMS, t);
+
+			if (t)
+				goto out;
+		}
+		drbd_info(connection, "sock was shut down by peer\n");
+	}
+
+	if (rv != size)
+		conn_request_state(connection, NS(conn, C_BROKEN_PIPE), CS_HARD);
+
+out:
+	return rv;
+}
+
+static int drbd_recv_all(struct drbd_connection *connection, void *buf, size_t size)
+{
+	int err;
+
+	err = drbd_recv(connection, buf, size);
+	if (err != size) {
+		if (err >= 0)
+			err = -EIO;
+	} else
+		err = 0;
+	return err;
+}
+
+static int drbd_recv_all_warn(struct drbd_connection *connection, void *buf, size_t size)
+{
+	int err;
+
+	err = drbd_recv_all(connection, buf, size);
+	if (err && !signal_pending(current))
+		drbd_warn(connection, "short read (expected size %d)\n", (int)size);
+	return err;
+}
+
+/* quoting tcp(7):
+ *   On individual connections, the socket buffer size must be set prior to the
+ *   listen(2) or connect(2) calls in order to have it take effect.
+ * This is our wrapper to do so.
+ */
+static void drbd_setbufsize(struct socket *sock, unsigned int snd,
+		unsigned int rcv)
+{
+	/* open coded SO_SNDBUF, SO_RCVBUF */
+	if (snd) {
+		sock->sk->sk_sndbuf = snd;
+		sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
+	}
+	if (rcv) {
+		sock->sk->sk_rcvbuf = rcv;
+		sock->sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
+	}
+}
+
+static struct socket *drbd_try_connect(struct drbd_connection *connection)
+{
+	const char *what;
+	struct socket *sock;
+	struct sockaddr_in6 src_in6;
+	struct sockaddr_in6 peer_in6;
+	struct net_conf *nc;
+	int err, peer_addr_len, my_addr_len;
+	int sndbuf_size, rcvbuf_size, connect_int;
+	int disconnect_on_error = 1;
+
+	rcu_read_lock();
+	nc = rcu_dereference(connection->net_conf);
+	if (!nc) {
+		rcu_read_unlock();
+		return NULL;
+	}
+	sndbuf_size = nc->sndbuf_size;
+	rcvbuf_size = nc->rcvbuf_size;
+	connect_int = nc->connect_int;
+	rcu_read_unlock();
+
+	my_addr_len = min_t(int, connection->my_addr_len, sizeof(src_in6));
+	memcpy(&src_in6, &connection->my_addr, my_addr_len);
+
+	if (((struct sockaddr *)&connection->my_addr)->sa_family == AF_INET6)
+		src_in6.sin6_port = 0;
+	else
+		((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */
+
+	peer_addr_len = min_t(int, connection->peer_addr_len, sizeof(src_in6));
+	memcpy(&peer_in6, &connection->peer_addr, peer_addr_len);
+
+	what = "sock_create_kern";
+	err = sock_create_kern(&init_net, ((struct sockaddr *)&src_in6)->sa_family,
+			       SOCK_STREAM, IPPROTO_TCP, &sock);
+	if (err < 0) {
+		sock = NULL;
+		goto out;
+	}
+
+	sock->sk->sk_rcvtimeo =
+	sock->sk->sk_sndtimeo = connect_int * HZ;
+	drbd_setbufsize(sock, sndbuf_size, rcvbuf_size);
+
+       /* explicitly bind to the configured IP as source IP
+	*  for the outgoing connections.
+	*  This is needed for multihomed hosts and to be
+	*  able to use lo: interfaces for drbd.
+	* Make sure to use 0 as port number, so linux selects
+	*  a free one dynamically.
+	*/
+	what = "bind before connect";
+	err = sock->ops->bind(sock, (struct sockaddr *) &src_in6, my_addr_len);
+	if (err < 0)
+		goto out;
+
+	/* connect may fail, peer not yet available.
+	 * stay C_WF_CONNECTION, don't go Disconnecting! */
+	disconnect_on_error = 0;
+	what = "connect";
+	err = sock->ops->connect(sock, (struct sockaddr *) &peer_in6, peer_addr_len, 0);
+
+out:
+	if (err < 0) {
+		if (sock) {
+			sock_release(sock);
+			sock = NULL;
+		}
+		switch (-err) {
+			/* timeout, busy, signal pending */
+		case ETIMEDOUT: case EAGAIN: case EINPROGRESS:
+		case EINTR: case ERESTARTSYS:
+			/* peer not (yet) available, network problem */
+		case ECONNREFUSED: case ENETUNREACH:
+		case EHOSTDOWN:    case EHOSTUNREACH:
+			disconnect_on_error = 0;
+			break;
+		default:
+			drbd_err(connection, "%s failed, err = %d\n", what, err);
+		}
+		if (disconnect_on_error)
+			conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
+	}
+
+	return sock;
+}
+
+struct accept_wait_data {
+	struct drbd_connection *connection;
+	struct socket *s_listen;
+	struct completion door_bell;
+	void (*original_sk_state_change)(struct sock *sk);
+
+};
+
+static void drbd_incoming_connection(struct sock *sk)
+{
+	struct accept_wait_data *ad = sk->sk_user_data;
+	void (*state_change)(struct sock *sk);
+
+	state_change = ad->original_sk_state_change;
+	if (sk->sk_state == TCP_ESTABLISHED)
+		complete(&ad->door_bell);
+	state_change(sk);
+}
+
+static int prepare_listen_socket(struct drbd_connection *connection, struct accept_wait_data *ad)
+{
+	int err, sndbuf_size, rcvbuf_size, my_addr_len;
+	struct sockaddr_in6 my_addr;
+	struct socket *s_listen;
+	struct net_conf *nc;
+	const char *what;
+
+	rcu_read_lock();
+	nc = rcu_dereference(connection->net_conf);
+	if (!nc) {
+		rcu_read_unlock();
+		return -EIO;
+	}
+	sndbuf_size = nc->sndbuf_size;
+	rcvbuf_size = nc->rcvbuf_size;
+	rcu_read_unlock();
+
+	my_addr_len = min_t(int, connection->my_addr_len, sizeof(struct sockaddr_in6));
+	memcpy(&my_addr, &connection->my_addr, my_addr_len);
+
+	what = "sock_create_kern";
+	err = sock_create_kern(&init_net, ((struct sockaddr *)&my_addr)->sa_family,
+			       SOCK_STREAM, IPPROTO_TCP, &s_listen);
+	if (err) {
+		s_listen = NULL;
+		goto out;
+	}
+
+	s_listen->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
+	drbd_setbufsize(s_listen, sndbuf_size, rcvbuf_size);
+
+	what = "bind before listen";
+	err = s_listen->ops->bind(s_listen, (struct sockaddr *)&my_addr, my_addr_len);
+	if (err < 0)
+		goto out;
+
+	ad->s_listen = s_listen;
+	write_lock_bh(&s_listen->sk->sk_callback_lock);
+	ad->original_sk_state_change = s_listen->sk->sk_state_change;
+	s_listen->sk->sk_state_change = drbd_incoming_connection;
+	s_listen->sk->sk_user_data = ad;
+	write_unlock_bh(&s_listen->sk->sk_callback_lock);
+
+	what = "listen";
+	err = s_listen->ops->listen(s_listen, 5);
+	if (err < 0)
+		goto out;
+
+	return 0;
+out:
+	if (s_listen)
+		sock_release(s_listen);
+	if (err < 0) {
+		if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
+			drbd_err(connection, "%s failed, err = %d\n", what, err);
+			conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
+		}
+	}
+
+	return -EIO;
+}
+
+static void unregister_state_change(struct sock *sk, struct accept_wait_data *ad)
+{
+	write_lock_bh(&sk->sk_callback_lock);
+	sk->sk_state_change = ad->original_sk_state_change;
+	sk->sk_user_data = NULL;
+	write_unlock_bh(&sk->sk_callback_lock);
+}
+
+static struct socket *drbd_wait_for_connect(struct drbd_connection *connection, struct accept_wait_data *ad)
+{
+	int timeo, connect_int, err = 0;
+	struct socket *s_estab = NULL;
+	struct net_conf *nc;
+
+	rcu_read_lock();
+	nc = rcu_dereference(connection->net_conf);
+	if (!nc) {
+		rcu_read_unlock();
+		return NULL;
+	}
+	connect_int = nc->connect_int;
+	rcu_read_unlock();
+
+	timeo = connect_int * HZ;
+	/* 28.5% random jitter */
+	timeo += prandom_u32_max(2) ? timeo / 7 : -timeo / 7;
+
+	err = wait_for_completion_interruptible_timeout(&ad->door_bell, timeo);
+	if (err <= 0)
+		return NULL;
+
+	err = kernel_accept(ad->s_listen, &s_estab, 0);
+	if (err < 0) {
+		if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
+			drbd_err(connection, "accept failed, err = %d\n", err);
+			conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
+		}
+	}
+
+	if (s_estab)
+		unregister_state_change(s_estab->sk, ad);
+
+	return s_estab;
+}
+
+static int decode_header(struct drbd_connection *, void *, struct packet_info *);
+
+static int send_first_packet(struct drbd_connection *connection, struct drbd_socket *sock,
+			     enum drbd_packet cmd)
+{
+	if (!conn_prepare_command(connection, sock))
+		return -EIO;
+	return conn_send_command(connection, sock, cmd, 0, NULL, 0);
+}
+
+static int receive_first_packet(struct drbd_connection *connection, struct socket *sock)
+{
+	unsigned int header_size = drbd_header_size(connection);
+	struct packet_info pi;
+	struct net_conf *nc;
+	int err;
+
+	rcu_read_lock();
+	nc = rcu_dereference(connection->net_conf);
+	if (!nc) {
+		rcu_read_unlock();
+		return -EIO;
+	}
+	sock->sk->sk_rcvtimeo = nc->ping_timeo * 4 * HZ / 10;
+	rcu_read_unlock();
+
+	err = drbd_recv_short(sock, connection->data.rbuf, header_size, 0);
+	if (err != header_size) {
+		if (err >= 0)
+			err = -EIO;
+		return err;
+	}
+	err = decode_header(connection, connection->data.rbuf, &pi);
+	if (err)
+		return err;
+	return pi.cmd;
+}
+
+/**
+ * drbd_socket_okay() - Free the socket if its connection is not okay
+ * @sock:	pointer to the pointer to the socket.
+ */
+static bool drbd_socket_okay(struct socket **sock)
+{
+	int rr;
+	char tb[4];
+
+	if (!*sock)
+		return false;
+
+	rr = drbd_recv_short(*sock, tb, 4, MSG_DONTWAIT | MSG_PEEK);
+
+	if (rr > 0 || rr == -EAGAIN) {
+		return true;
+	} else {
+		sock_release(*sock);
+		*sock = NULL;
+		return false;
+	}
+}
+
+static bool connection_established(struct drbd_connection *connection,
+				   struct socket **sock1,
+				   struct socket **sock2)
+{
+	struct net_conf *nc;
+	int timeout;
+	bool ok;
+
+	if (!*sock1 || !*sock2)
+		return false;
+
+	rcu_read_lock();
+	nc = rcu_dereference(connection->net_conf);
+	timeout = (nc->sock_check_timeo ?: nc->ping_timeo) * HZ / 10;
+	rcu_read_unlock();
+	schedule_timeout_interruptible(timeout);
+
+	ok = drbd_socket_okay(sock1);
+	ok = drbd_socket_okay(sock2) && ok;
+
+	return ok;
+}
+
+/* Gets called if a connection is established, or if a new minor gets created
+   in a connection */
+int drbd_connected(struct drbd_peer_device *peer_device)
+{
+	struct drbd_device *device = peer_device->device;
+	int err;
+
+	atomic_set(&device->packet_seq, 0);
+	device->peer_seq = 0;
+
+	device->state_mutex = peer_device->connection->agreed_pro_version < 100 ?
+		&peer_device->connection->cstate_mutex :
+		&device->own_state_mutex;
+
+	err = drbd_send_sync_param(peer_device);
+	if (!err)
+		err = drbd_send_sizes(peer_device, 0, 0);
+	if (!err)
+		err = drbd_send_uuids(peer_device);
+	if (!err)
+		err = drbd_send_current_state(peer_device);
+	clear_bit(USE_DEGR_WFC_T, &device->flags);
+	clear_bit(RESIZE_PENDING, &device->flags);
+	atomic_set(&device->ap_in_flight, 0);
+	mod_timer(&device->request_timer, jiffies + HZ); /* just start it here. */
+	return err;
+}
+
+/*
+ * return values:
+ *   1 yes, we have a valid connection
+ *   0 oops, did not work out, please try again
+ *  -1 peer talks different language,
+ *     no point in trying again, please go standalone.
+ *  -2 We do not have a network config...
+ */
+static int conn_connect(struct drbd_connection *connection)
+{
+	struct drbd_socket sock, msock;
+	struct drbd_peer_device *peer_device;
+	struct net_conf *nc;
+	int vnr, timeout, h;
+	bool discard_my_data, ok;
+	enum drbd_state_rv rv;
+	struct accept_wait_data ad = {
+		.connection = connection,
+		.door_bell = COMPLETION_INITIALIZER_ONSTACK(ad.door_bell),
+	};
+
+	clear_bit(DISCONNECT_SENT, &connection->flags);
+	if (conn_request_state(connection, NS(conn, C_WF_CONNECTION), CS_VERBOSE) < SS_SUCCESS)
+		return -2;
+
+	mutex_init(&sock.mutex);
+	sock.sbuf = connection->data.sbuf;
+	sock.rbuf = connection->data.rbuf;
+	sock.socket = NULL;
+	mutex_init(&msock.mutex);
+	msock.sbuf = connection->meta.sbuf;
+	msock.rbuf = connection->meta.rbuf;
+	msock.socket = NULL;
+
+	/* Assume that the peer only understands protocol 80 until we know better.  */
+	connection->agreed_pro_version = 80;
+
+	if (prepare_listen_socket(connection, &ad))
+		return 0;
+
+	do {
+		struct socket *s;
+
+		s = drbd_try_connect(connection);
+		if (s) {
+			if (!sock.socket) {
+				sock.socket = s;
+				send_first_packet(connection, &sock, P_INITIAL_DATA);
+			} else if (!msock.socket) {
+				clear_bit(RESOLVE_CONFLICTS, &connection->flags);
+				msock.socket = s;
+				send_first_packet(connection, &msock, P_INITIAL_META);
+			} else {
+				drbd_err(connection, "Logic error in conn_connect()\n");
+				goto out_release_sockets;
+			}
+		}
+
+		if (connection_established(connection, &sock.socket, &msock.socket))
+			break;
+
+retry:
+		s = drbd_wait_for_connect(connection, &ad);
+		if (s) {
+			int fp = receive_first_packet(connection, s);
+			drbd_socket_okay(&sock.socket);
+			drbd_socket_okay(&msock.socket);
+			switch (fp) {
+			case P_INITIAL_DATA:
+				if (sock.socket) {
+					drbd_warn(connection, "initial packet S crossed\n");
+					sock_release(sock.socket);
+					sock.socket = s;
+					goto randomize;
+				}
+				sock.socket = s;
+				break;
+			case P_INITIAL_META:
+				set_bit(RESOLVE_CONFLICTS, &connection->flags);
+				if (msock.socket) {
+					drbd_warn(connection, "initial packet M crossed\n");
+					sock_release(msock.socket);
+					msock.socket = s;
+					goto randomize;
+				}
+				msock.socket = s;
+				break;
+			default:
+				drbd_warn(connection, "Error receiving initial packet\n");
+				sock_release(s);
+randomize:
+				if (prandom_u32_max(2))
+					goto retry;
+			}
+		}
+
+		if (connection->cstate <= C_DISCONNECTING)
+			goto out_release_sockets;
+		if (signal_pending(current)) {
+			flush_signals(current);
+			smp_rmb();
+			if (get_t_state(&connection->receiver) == EXITING)
+				goto out_release_sockets;
+		}
+
+		ok = connection_established(connection, &sock.socket, &msock.socket);
+	} while (!ok);
+
+	if (ad.s_listen)
+		sock_release(ad.s_listen);
+
+	sock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
+	msock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
+
+	sock.socket->sk->sk_allocation = GFP_NOIO;
+	msock.socket->sk->sk_allocation = GFP_NOIO;
+
+	sock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
+	msock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE;
+
+	/* NOT YET ...
+	 * sock.socket->sk->sk_sndtimeo = connection->net_conf->timeout*HZ/10;
+	 * sock.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
+	 * first set it to the P_CONNECTION_FEATURES timeout,
+	 * which we set to 4x the configured ping_timeout. */
+	rcu_read_lock();
+	nc = rcu_dereference(connection->net_conf);
+
+	sock.socket->sk->sk_sndtimeo =
+	sock.socket->sk->sk_rcvtimeo = nc->ping_timeo*4*HZ/10;
+
+	msock.socket->sk->sk_rcvtimeo = nc->ping_int*HZ;
+	timeout = nc->timeout * HZ / 10;
+	discard_my_data = nc->discard_my_data;
+	rcu_read_unlock();
+
+	msock.socket->sk->sk_sndtimeo = timeout;
+
+	/* we don't want delays.
+	 * we use TCP_CORK where appropriate, though */
+	tcp_sock_set_nodelay(sock.socket->sk);
+	tcp_sock_set_nodelay(msock.socket->sk);
+
+	connection->data.socket = sock.socket;
+	connection->meta.socket = msock.socket;
+	connection->last_received = jiffies;
+
+	h = drbd_do_features(connection);
+	if (h <= 0)
+		return h;
+
+	if (connection->cram_hmac_tfm) {
+		/* drbd_request_state(device, NS(conn, WFAuth)); */
+		switch (drbd_do_auth(connection)) {
+		case -1:
+			drbd_err(connection, "Authentication of peer failed\n");
+			return -1;
+		case 0:
+			drbd_err(connection, "Authentication of peer failed, trying again.\n");
+			return 0;
+		}
+	}
+
+	connection->data.socket->sk->sk_sndtimeo = timeout;
+	connection->data.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
+
+	if (drbd_send_protocol(connection) == -EOPNOTSUPP)
+		return -1;
+
+	/* Prevent a race between resync-handshake and
+	 * being promoted to Primary.
+	 *
+	 * Grab and release the state mutex, so we know that any current
+	 * drbd_set_role() is finished, and any incoming drbd_set_role
+	 * will see the STATE_SENT flag, and wait for it to be cleared.
+	 */
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
+		mutex_lock(peer_device->device->state_mutex);
+
+	/* avoid a race with conn_request_state( C_DISCONNECTING ) */
+	spin_lock_irq(&connection->resource->req_lock);
+	set_bit(STATE_SENT, &connection->flags);
+	spin_unlock_irq(&connection->resource->req_lock);
+
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
+		mutex_unlock(peer_device->device->state_mutex);
+
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+		struct drbd_device *device = peer_device->device;
+		kref_get(&device->kref);
+		rcu_read_unlock();
+
+		if (discard_my_data)
+			set_bit(DISCARD_MY_DATA, &device->flags);
+		else
+			clear_bit(DISCARD_MY_DATA, &device->flags);
+
+		drbd_connected(peer_device);
+		kref_put(&device->kref, drbd_destroy_device);
+		rcu_read_lock();
+	}
+	rcu_read_unlock();
+
+	rv = conn_request_state(connection, NS(conn, C_WF_REPORT_PARAMS), CS_VERBOSE);
+	if (rv < SS_SUCCESS || connection->cstate != C_WF_REPORT_PARAMS) {
+		clear_bit(STATE_SENT, &connection->flags);
+		return 0;
+	}
+
+	drbd_thread_start(&connection->ack_receiver);
+	/* opencoded create_singlethread_workqueue(),
+	 * to be able to use format string arguments */
+	connection->ack_sender =
+		alloc_ordered_workqueue("drbd_as_%s", WQ_MEM_RECLAIM, connection->resource->name);
+	if (!connection->ack_sender) {
+		drbd_err(connection, "Failed to create workqueue ack_sender\n");
+		return 0;
+	}
+
+	mutex_lock(&connection->resource->conf_update);
+	/* The discard_my_data flag is a single-shot modifier to the next
+	 * connection attempt, the handshake of which is now well underway.
+	 * No need for rcu style copying of the whole struct
+	 * just to clear a single value. */
+	connection->net_conf->discard_my_data = 0;
+	mutex_unlock(&connection->resource->conf_update);
+
+	return h;
+
+out_release_sockets:
+	if (ad.s_listen)
+		sock_release(ad.s_listen);
+	if (sock.socket)
+		sock_release(sock.socket);
+	if (msock.socket)
+		sock_release(msock.socket);
+	return -1;
+}
+
+static int decode_header(struct drbd_connection *connection, void *header, struct packet_info *pi)
+{
+	unsigned int header_size = drbd_header_size(connection);
+
+	if (header_size == sizeof(struct p_header100) &&
+	    *(__be32 *)header == cpu_to_be32(DRBD_MAGIC_100)) {
+		struct p_header100 *h = header;
+		if (h->pad != 0) {
+			drbd_err(connection, "Header padding is not zero\n");
+			return -EINVAL;
+		}
+		pi->vnr = be16_to_cpu(h->volume);
+		pi->cmd = be16_to_cpu(h->command);
+		pi->size = be32_to_cpu(h->length);
+	} else if (header_size == sizeof(struct p_header95) &&
+		   *(__be16 *)header == cpu_to_be16(DRBD_MAGIC_BIG)) {
+		struct p_header95 *h = header;
+		pi->cmd = be16_to_cpu(h->command);
+		pi->size = be32_to_cpu(h->length);
+		pi->vnr = 0;
+	} else if (header_size == sizeof(struct p_header80) &&
+		   *(__be32 *)header == cpu_to_be32(DRBD_MAGIC)) {
+		struct p_header80 *h = header;
+		pi->cmd = be16_to_cpu(h->command);
+		pi->size = be16_to_cpu(h->length);
+		pi->vnr = 0;
+	} else {
+		drbd_err(connection, "Wrong magic value 0x%08x in protocol version %d\n",
+			 be32_to_cpu(*(__be32 *)header),
+			 connection->agreed_pro_version);
+		return -EINVAL;
+	}
+	pi->data = header + header_size;
+	return 0;
+}
+
+static void drbd_unplug_all_devices(struct drbd_connection *connection)
+{
+	if (current->plug == &connection->receiver_plug) {
+		blk_finish_plug(&connection->receiver_plug);
+		blk_start_plug(&connection->receiver_plug);
+	} /* else: maybe just schedule() ?? */
+}
+
+static int drbd_recv_header(struct drbd_connection *connection, struct packet_info *pi)
+{
+	void *buffer = connection->data.rbuf;
+	int err;
+
+	err = drbd_recv_all_warn(connection, buffer, drbd_header_size(connection));
+	if (err)
+		return err;
+
+	err = decode_header(connection, buffer, pi);
+	connection->last_received = jiffies;
+
+	return err;
+}
+
+static int drbd_recv_header_maybe_unplug(struct drbd_connection *connection, struct packet_info *pi)
+{
+	void *buffer = connection->data.rbuf;
+	unsigned int size = drbd_header_size(connection);
+	int err;
+
+	err = drbd_recv_short(connection->data.socket, buffer, size, MSG_NOSIGNAL|MSG_DONTWAIT);
+	if (err != size) {
+		/* If we have nothing in the receive buffer now, to reduce
+		 * application latency, try to drain the backend queues as
+		 * quickly as possible, and let remote TCP know what we have
+		 * received so far. */
+		if (err == -EAGAIN) {
+			tcp_sock_set_quickack(connection->data.socket->sk, 2);
+			drbd_unplug_all_devices(connection);
+		}
+		if (err > 0) {
+			buffer += err;
+			size -= err;
+		}
+		err = drbd_recv_all_warn(connection, buffer, size);
+		if (err)
+			return err;
+	}
+
+	err = decode_header(connection, connection->data.rbuf, pi);
+	connection->last_received = jiffies;
+
+	return err;
+}
+/* This is blkdev_issue_flush, but asynchronous.
+ * We want to submit to all component volumes in parallel,
+ * then wait for all completions.
+ */
+struct issue_flush_context {
+	atomic_t pending;
+	int error;
+	struct completion done;
+};
+struct one_flush_context {
+	struct drbd_device *device;
+	struct issue_flush_context *ctx;
+};
+
+static void one_flush_endio(struct bio *bio)
+{
+	struct one_flush_context *octx = bio->bi_private;
+	struct drbd_device *device = octx->device;
+	struct issue_flush_context *ctx = octx->ctx;
+
+	if (bio->bi_status) {
+		ctx->error = blk_status_to_errno(bio->bi_status);
+		drbd_info(device, "local disk FLUSH FAILED with status %d\n", bio->bi_status);
+	}
+	kfree(octx);
+	bio_put(bio);
+
+	clear_bit(FLUSH_PENDING, &device->flags);
+	put_ldev(device);
+	kref_put(&device->kref, drbd_destroy_device);
+
+	if (atomic_dec_and_test(&ctx->pending))
+		complete(&ctx->done);
+}
+
+static void submit_one_flush(struct drbd_device *device, struct issue_flush_context *ctx)
+{
+	struct bio *bio = bio_alloc(device->ldev->backing_bdev, 0,
+				    REQ_OP_WRITE | REQ_PREFLUSH, GFP_NOIO);
+	struct one_flush_context *octx = kmalloc(sizeof(*octx), GFP_NOIO);
+
+	if (!octx) {
+		drbd_warn(device, "Could not allocate a octx, CANNOT ISSUE FLUSH\n");
+		/* FIXME: what else can I do now?  disconnecting or detaching
+		 * really does not help to improve the state of the world, either.
+		 */
+		bio_put(bio);
+
+		ctx->error = -ENOMEM;
+		put_ldev(device);
+		kref_put(&device->kref, drbd_destroy_device);
+		return;
+	}
+
+	octx->device = device;
+	octx->ctx = ctx;
+	bio->bi_private = octx;
+	bio->bi_end_io = one_flush_endio;
+
+	device->flush_jif = jiffies;
+	set_bit(FLUSH_PENDING, &device->flags);
+	atomic_inc(&ctx->pending);
+	submit_bio(bio);
+}
+
+static void drbd_flush(struct drbd_connection *connection)
+{
+	if (connection->resource->write_ordering >= WO_BDEV_FLUSH) {
+		struct drbd_peer_device *peer_device;
+		struct issue_flush_context ctx;
+		int vnr;
+
+		atomic_set(&ctx.pending, 1);
+		ctx.error = 0;
+		init_completion(&ctx.done);
+
+		rcu_read_lock();
+		idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+			struct drbd_device *device = peer_device->device;
+
+			if (!get_ldev(device))
+				continue;
+			kref_get(&device->kref);
+			rcu_read_unlock();
+
+			submit_one_flush(device, &ctx);
+
+			rcu_read_lock();
+		}
+		rcu_read_unlock();
+
+		/* Do we want to add a timeout,
+		 * if disk-timeout is set? */
+		if (!atomic_dec_and_test(&ctx.pending))
+			wait_for_completion(&ctx.done);
+
+		if (ctx.error) {
+			/* would rather check on EOPNOTSUPP, but that is not reliable.
+			 * don't try again for ANY return value != 0
+			 * if (rv == -EOPNOTSUPP) */
+			/* Any error is already reported by bio_endio callback. */
+			drbd_bump_write_ordering(connection->resource, NULL, WO_DRAIN_IO);
+		}
+	}
+}
+
+/**
+ * drbd_may_finish_epoch() - Applies an epoch_event to the epoch's state, eventually finishes it.
+ * @connection:	DRBD connection.
+ * @epoch:	Epoch object.
+ * @ev:		Epoch event.
+ */
+static enum finish_epoch drbd_may_finish_epoch(struct drbd_connection *connection,
+					       struct drbd_epoch *epoch,
+					       enum epoch_event ev)
+{
+	int epoch_size;
+	struct drbd_epoch *next_epoch;
+	enum finish_epoch rv = FE_STILL_LIVE;
+
+	spin_lock(&connection->epoch_lock);
+	do {
+		next_epoch = NULL;
+
+		epoch_size = atomic_read(&epoch->epoch_size);
+
+		switch (ev & ~EV_CLEANUP) {
+		case EV_PUT:
+			atomic_dec(&epoch->active);
+			break;
+		case EV_GOT_BARRIER_NR:
+			set_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags);
+			break;
+		case EV_BECAME_LAST:
+			/* nothing to do*/
+			break;
+		}
+
+		if (epoch_size != 0 &&
+		    atomic_read(&epoch->active) == 0 &&
+		    (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) || ev & EV_CLEANUP)) {
+			if (!(ev & EV_CLEANUP)) {
+				spin_unlock(&connection->epoch_lock);
+				drbd_send_b_ack(epoch->connection, epoch->barrier_nr, epoch_size);
+				spin_lock(&connection->epoch_lock);
+			}
+#if 0
+			/* FIXME: dec unacked on connection, once we have
+			 * something to count pending connection packets in. */
+			if (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags))
+				dec_unacked(epoch->connection);
+#endif
+
+			if (connection->current_epoch != epoch) {
+				next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list);
+				list_del(&epoch->list);
+				ev = EV_BECAME_LAST | (ev & EV_CLEANUP);
+				connection->epochs--;
+				kfree(epoch);
+
+				if (rv == FE_STILL_LIVE)
+					rv = FE_DESTROYED;
+			} else {
+				epoch->flags = 0;
+				atomic_set(&epoch->epoch_size, 0);
+				/* atomic_set(&epoch->active, 0); is already zero */
+				if (rv == FE_STILL_LIVE)
+					rv = FE_RECYCLED;
+			}
+		}
+
+		if (!next_epoch)
+			break;
+
+		epoch = next_epoch;
+	} while (1);
+
+	spin_unlock(&connection->epoch_lock);
+
+	return rv;
+}
+
+static enum write_ordering_e
+max_allowed_wo(struct drbd_backing_dev *bdev, enum write_ordering_e wo)
+{
+	struct disk_conf *dc;
+
+	dc = rcu_dereference(bdev->disk_conf);
+
+	if (wo == WO_BDEV_FLUSH && !dc->disk_flushes)
+		wo = WO_DRAIN_IO;
+	if (wo == WO_DRAIN_IO && !dc->disk_drain)
+		wo = WO_NONE;
+
+	return wo;
+}
+
+/*
+ * drbd_bump_write_ordering() - Fall back to an other write ordering method
+ * @wo:		Write ordering method to try.
+ */
+void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backing_dev *bdev,
+			      enum write_ordering_e wo)
+{
+	struct drbd_device *device;
+	enum write_ordering_e pwo;
+	int vnr;
+	static char *write_ordering_str[] = {
+		[WO_NONE] = "none",
+		[WO_DRAIN_IO] = "drain",
+		[WO_BDEV_FLUSH] = "flush",
+	};
+
+	pwo = resource->write_ordering;
+	if (wo != WO_BDEV_FLUSH)
+		wo = min(pwo, wo);
+	rcu_read_lock();
+	idr_for_each_entry(&resource->devices, device, vnr) {
+		if (get_ldev(device)) {
+			wo = max_allowed_wo(device->ldev, wo);
+			if (device->ldev == bdev)
+				bdev = NULL;
+			put_ldev(device);
+		}
+	}
+
+	if (bdev)
+		wo = max_allowed_wo(bdev, wo);
+
+	rcu_read_unlock();
+
+	resource->write_ordering = wo;
+	if (pwo != resource->write_ordering || wo == WO_BDEV_FLUSH)
+		drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]);
+}
+
+/*
+ * Mapping "discard" to ZEROOUT with UNMAP does not work for us:
+ * Drivers have to "announce" q->limits.max_write_zeroes_sectors, or it
+ * will directly go to fallback mode, submitting normal writes, and
+ * never even try to UNMAP.
+ *
+ * And dm-thin does not do this (yet), mostly because in general it has
+ * to assume that "skip_block_zeroing" is set.  See also:
+ * https://www.mail-archive.com/dm-devel%40redhat.com/msg07965.html
+ * https://www.redhat.com/archives/dm-devel/2018-January/msg00271.html
+ *
+ * We *may* ignore the discard-zeroes-data setting, if so configured.
+ *
+ * Assumption is that this "discard_zeroes_data=0" is only because the backend
+ * may ignore partial unaligned discards.
+ *
+ * LVM/DM thin as of at least
+ *   LVM version:     2.02.115(2)-RHEL7 (2015-01-28)
+ *   Library version: 1.02.93-RHEL7 (2015-01-28)
+ *   Driver version:  4.29.0
+ * still behaves this way.
+ *
+ * For unaligned (wrt. alignment and granularity) or too small discards,
+ * we zero-out the initial (and/or) trailing unaligned partial chunks,
+ * but discard all the aligned full chunks.
+ *
+ * At least for LVM/DM thin, with skip_block_zeroing=false,
+ * the result is effectively "discard_zeroes_data=1".
+ */
+/* flags: EE_TRIM|EE_ZEROOUT */
+int drbd_issue_discard_or_zero_out(struct drbd_device *device, sector_t start, unsigned int nr_sectors, int flags)
+{
+	struct block_device *bdev = device->ldev->backing_bdev;
+	sector_t tmp, nr;
+	unsigned int max_discard_sectors, granularity;
+	int alignment;
+	int err = 0;
+
+	if ((flags & EE_ZEROOUT) || !(flags & EE_TRIM))
+		goto zero_out;
+
+	/* Zero-sector (unknown) and one-sector granularities are the same.  */
+	granularity = max(bdev_discard_granularity(bdev) >> 9, 1U);
+	alignment = (bdev_discard_alignment(bdev) >> 9) % granularity;
+
+	max_discard_sectors = min(bdev_max_discard_sectors(bdev), (1U << 22));
+	max_discard_sectors -= max_discard_sectors % granularity;
+	if (unlikely(!max_discard_sectors))
+		goto zero_out;
+
+	if (nr_sectors < granularity)
+		goto zero_out;
+
+	tmp = start;
+	if (sector_div(tmp, granularity) != alignment) {
+		if (nr_sectors < 2*granularity)
+			goto zero_out;
+		/* start + gran - (start + gran - align) % gran */
+		tmp = start + granularity - alignment;
+		tmp = start + granularity - sector_div(tmp, granularity);
+
+		nr = tmp - start;
+		/* don't flag BLKDEV_ZERO_NOUNMAP, we don't know how many
+		 * layers are below us, some may have smaller granularity */
+		err |= blkdev_issue_zeroout(bdev, start, nr, GFP_NOIO, 0);
+		nr_sectors -= nr;
+		start = tmp;
+	}
+	while (nr_sectors >= max_discard_sectors) {
+		err |= blkdev_issue_discard(bdev, start, max_discard_sectors,
+					    GFP_NOIO);
+		nr_sectors -= max_discard_sectors;
+		start += max_discard_sectors;
+	}
+	if (nr_sectors) {
+		/* max_discard_sectors is unsigned int (and a multiple of
+		 * granularity, we made sure of that above already);
+		 * nr is < max_discard_sectors;
+		 * I don't need sector_div here, even though nr is sector_t */
+		nr = nr_sectors;
+		nr -= (unsigned int)nr % granularity;
+		if (nr) {
+			err |= blkdev_issue_discard(bdev, start, nr, GFP_NOIO);
+			nr_sectors -= nr;
+			start += nr;
+		}
+	}
+ zero_out:
+	if (nr_sectors) {
+		err |= blkdev_issue_zeroout(bdev, start, nr_sectors, GFP_NOIO,
+				(flags & EE_TRIM) ? 0 : BLKDEV_ZERO_NOUNMAP);
+	}
+	return err != 0;
+}
+
+static bool can_do_reliable_discards(struct drbd_device *device)
+{
+	struct disk_conf *dc;
+	bool can_do;
+
+	if (!bdev_max_discard_sectors(device->ldev->backing_bdev))
+		return false;
+
+	rcu_read_lock();
+	dc = rcu_dereference(device->ldev->disk_conf);
+	can_do = dc->discard_zeroes_if_aligned;
+	rcu_read_unlock();
+	return can_do;
+}
+
+static void drbd_issue_peer_discard_or_zero_out(struct drbd_device *device, struct drbd_peer_request *peer_req)
+{
+	/* If the backend cannot discard, or does not guarantee
+	 * read-back zeroes in discarded ranges, we fall back to
+	 * zero-out.  Unless configuration specifically requested
+	 * otherwise. */
+	if (!can_do_reliable_discards(device))
+		peer_req->flags |= EE_ZEROOUT;
+
+	if (drbd_issue_discard_or_zero_out(device, peer_req->i.sector,
+	    peer_req->i.size >> 9, peer_req->flags & (EE_ZEROOUT|EE_TRIM)))
+		peer_req->flags |= EE_WAS_ERROR;
+	drbd_endio_write_sec_final(peer_req);
+}
+
+/**
+ * drbd_submit_peer_request()
+ * @device:	DRBD device.
+ * @peer_req:	peer request
+ *
+ * May spread the pages to multiple bios,
+ * depending on bio_add_page restrictions.
+ *
+ * Returns 0 if all bios have been submitted,
+ * -ENOMEM if we could not allocate enough bios,
+ * -ENOSPC (any better suggestion?) if we have not been able to bio_add_page a
+ *  single page to an empty bio (which should never happen and likely indicates
+ *  that the lower level IO stack is in some way broken). This has been observed
+ *  on certain Xen deployments.
+ */
+/* TODO allocate from our own bio_set. */
+int drbd_submit_peer_request(struct drbd_device *device,
+			     struct drbd_peer_request *peer_req,
+			     const blk_opf_t opf, const int fault_type)
+{
+	struct bio *bios = NULL;
+	struct bio *bio;
+	struct page *page = peer_req->pages;
+	sector_t sector = peer_req->i.sector;
+	unsigned int data_size = peer_req->i.size;
+	unsigned int n_bios = 0;
+	unsigned int nr_pages = PFN_UP(data_size);
+
+	/* TRIM/DISCARD: for now, always use the helper function
+	 * blkdev_issue_zeroout(..., discard=true).
+	 * It's synchronous, but it does the right thing wrt. bio splitting.
+	 * Correctness first, performance later.  Next step is to code an
+	 * asynchronous variant of the same.
+	 */
+	if (peer_req->flags & (EE_TRIM | EE_ZEROOUT)) {
+		/* wait for all pending IO completions, before we start
+		 * zeroing things out. */
+		conn_wait_active_ee_empty(peer_req->peer_device->connection);
+		/* add it to the active list now,
+		 * so we can find it to present it in debugfs */
+		peer_req->submit_jif = jiffies;
+		peer_req->flags |= EE_SUBMITTED;
+
+		/* If this was a resync request from receive_rs_deallocated(),
+		 * it is already on the sync_ee list */
+		if (list_empty(&peer_req->w.list)) {
+			spin_lock_irq(&device->resource->req_lock);
+			list_add_tail(&peer_req->w.list, &device->active_ee);
+			spin_unlock_irq(&device->resource->req_lock);
+		}
+
+		drbd_issue_peer_discard_or_zero_out(device, peer_req);
+		return 0;
+	}
+
+	/* In most cases, we will only need one bio.  But in case the lower
+	 * level restrictions happen to be different at this offset on this
+	 * side than those of the sending peer, we may need to submit the
+	 * request in more than one bio.
+	 *
+	 * Plain bio_alloc is good enough here, this is no DRBD internally
+	 * generated bio, but a bio allocated on behalf of the peer.
+	 */
+next_bio:
+	bio = bio_alloc(device->ldev->backing_bdev, nr_pages, opf, GFP_NOIO);
+	/* > peer_req->i.sector, unless this is the first bio */
+	bio->bi_iter.bi_sector = sector;
+	bio->bi_private = peer_req;
+	bio->bi_end_io = drbd_peer_request_endio;
+
+	bio->bi_next = bios;
+	bios = bio;
+	++n_bios;
+
+	page_chain_for_each(page) {
+		unsigned len = min_t(unsigned, data_size, PAGE_SIZE);
+		if (!bio_add_page(bio, page, len, 0))
+			goto next_bio;
+		data_size -= len;
+		sector += len >> 9;
+		--nr_pages;
+	}
+	D_ASSERT(device, data_size == 0);
+	D_ASSERT(device, page == NULL);
+
+	atomic_set(&peer_req->pending_bios, n_bios);
+	/* for debugfs: update timestamp, mark as submitted */
+	peer_req->submit_jif = jiffies;
+	peer_req->flags |= EE_SUBMITTED;
+	do {
+		bio = bios;
+		bios = bios->bi_next;
+		bio->bi_next = NULL;
+
+		drbd_submit_bio_noacct(device, fault_type, bio);
+	} while (bios);
+	return 0;
+}
+
+static void drbd_remove_epoch_entry_interval(struct drbd_device *device,
+					     struct drbd_peer_request *peer_req)
+{
+	struct drbd_interval *i = &peer_req->i;
+
+	drbd_remove_interval(&device->write_requests, i);
+	drbd_clear_interval(i);
+
+	/* Wake up any processes waiting for this peer request to complete.  */
+	if (i->waiting)
+		wake_up(&device->misc_wait);
+}
+
+static void conn_wait_active_ee_empty(struct drbd_connection *connection)
+{
+	struct drbd_peer_device *peer_device;
+	int vnr;
+
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+		struct drbd_device *device = peer_device->device;
+
+		kref_get(&device->kref);
+		rcu_read_unlock();
+		drbd_wait_ee_list_empty(device, &device->active_ee);
+		kref_put(&device->kref, drbd_destroy_device);
+		rcu_read_lock();
+	}
+	rcu_read_unlock();
+}
+
+static int receive_Barrier(struct drbd_connection *connection, struct packet_info *pi)
+{
+	int rv;
+	struct p_barrier *p = pi->data;
+	struct drbd_epoch *epoch;
+
+	/* FIXME these are unacked on connection,
+	 * not a specific (peer)device.
+	 */
+	connection->current_epoch->barrier_nr = p->barrier;
+	connection->current_epoch->connection = connection;
+	rv = drbd_may_finish_epoch(connection, connection->current_epoch, EV_GOT_BARRIER_NR);
+
+	/* P_BARRIER_ACK may imply that the corresponding extent is dropped from
+	 * the activity log, which means it would not be resynced in case the
+	 * R_PRIMARY crashes now.
+	 * Therefore we must send the barrier_ack after the barrier request was
+	 * completed. */
+	switch (connection->resource->write_ordering) {
+	case WO_NONE:
+		if (rv == FE_RECYCLED)
+			return 0;
+
+		/* receiver context, in the writeout path of the other node.
+		 * avoid potential distributed deadlock */
+		epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
+		if (epoch)
+			break;
+		else
+			drbd_warn(connection, "Allocation of an epoch failed, slowing down\n");
+		fallthrough;
+
+	case WO_BDEV_FLUSH:
+	case WO_DRAIN_IO:
+		conn_wait_active_ee_empty(connection);
+		drbd_flush(connection);
+
+		if (atomic_read(&connection->current_epoch->epoch_size)) {
+			epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
+			if (epoch)
+				break;
+		}
+
+		return 0;
+	default:
+		drbd_err(connection, "Strangeness in connection->write_ordering %d\n",
+			 connection->resource->write_ordering);
+		return -EIO;
+	}
+
+	epoch->flags = 0;
+	atomic_set(&epoch->epoch_size, 0);
+	atomic_set(&epoch->active, 0);
+
+	spin_lock(&connection->epoch_lock);
+	if (atomic_read(&connection->current_epoch->epoch_size)) {
+		list_add(&epoch->list, &connection->current_epoch->list);
+		connection->current_epoch = epoch;
+		connection->epochs++;
+	} else {
+		/* The current_epoch got recycled while we allocated this one... */
+		kfree(epoch);
+	}
+	spin_unlock(&connection->epoch_lock);
+
+	return 0;
+}
+
+/* quick wrapper in case payload size != request_size (write same) */
+static void drbd_csum_ee_size(struct crypto_shash *h,
+			      struct drbd_peer_request *r, void *d,
+			      unsigned int payload_size)
+{
+	unsigned int tmp = r->i.size;
+	r->i.size = payload_size;
+	drbd_csum_ee(h, r, d);
+	r->i.size = tmp;
+}
+
+/* used from receive_RSDataReply (recv_resync_read)
+ * and from receive_Data.
+ * data_size: actual payload ("data in")
+ * 	for normal writes that is bi_size.
+ * 	for discards, that is zero.
+ * 	for write same, it is logical_block_size.
+ * both trim and write same have the bi_size ("data len to be affected")
+ * as extra argument in the packet header.
+ */
+static struct drbd_peer_request *
+read_in_block(struct drbd_peer_device *peer_device, u64 id, sector_t sector,
+	      struct packet_info *pi) __must_hold(local)
+{
+	struct drbd_device *device = peer_device->device;
+	const sector_t capacity = get_capacity(device->vdisk);
+	struct drbd_peer_request *peer_req;
+	struct page *page;
+	int digest_size, err;
+	unsigned int data_size = pi->size, ds;
+	void *dig_in = peer_device->connection->int_dig_in;
+	void *dig_vv = peer_device->connection->int_dig_vv;
+	unsigned long *data;
+	struct p_trim *trim = (pi->cmd == P_TRIM) ? pi->data : NULL;
+	struct p_trim *zeroes = (pi->cmd == P_ZEROES) ? pi->data : NULL;
+
+	digest_size = 0;
+	if (!trim && peer_device->connection->peer_integrity_tfm) {
+		digest_size = crypto_shash_digestsize(peer_device->connection->peer_integrity_tfm);
+		/*
+		 * FIXME: Receive the incoming digest into the receive buffer
+		 *	  here, together with its struct p_data?
+		 */
+		err = drbd_recv_all_warn(peer_device->connection, dig_in, digest_size);
+		if (err)
+			return NULL;
+		data_size -= digest_size;
+	}
+
+	/* assume request_size == data_size, but special case trim. */
+	ds = data_size;
+	if (trim) {
+		if (!expect(data_size == 0))
+			return NULL;
+		ds = be32_to_cpu(trim->size);
+	} else if (zeroes) {
+		if (!expect(data_size == 0))
+			return NULL;
+		ds = be32_to_cpu(zeroes->size);
+	}
+
+	if (!expect(IS_ALIGNED(ds, 512)))
+		return NULL;
+	if (trim || zeroes) {
+		if (!expect(ds <= (DRBD_MAX_BBIO_SECTORS << 9)))
+			return NULL;
+	} else if (!expect(ds <= DRBD_MAX_BIO_SIZE))
+		return NULL;
+
+	/* even though we trust out peer,
+	 * we sometimes have to double check. */
+	if (sector + (ds>>9) > capacity) {
+		drbd_err(device, "request from peer beyond end of local disk: "
+			"capacity: %llus < sector: %llus + size: %u\n",
+			(unsigned long long)capacity,
+			(unsigned long long)sector, ds);
+		return NULL;
+	}
+
+	/* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
+	 * "criss-cross" setup, that might cause write-out on some other DRBD,
+	 * which in turn might block on the other node at this very place.  */
+	peer_req = drbd_alloc_peer_req(peer_device, id, sector, ds, data_size, GFP_NOIO);
+	if (!peer_req)
+		return NULL;
+
+	peer_req->flags |= EE_WRITE;
+	if (trim) {
+		peer_req->flags |= EE_TRIM;
+		return peer_req;
+	}
+	if (zeroes) {
+		peer_req->flags |= EE_ZEROOUT;
+		return peer_req;
+	}
+
+	/* receive payload size bytes into page chain */
+	ds = data_size;
+	page = peer_req->pages;
+	page_chain_for_each(page) {
+		unsigned len = min_t(int, ds, PAGE_SIZE);
+		data = kmap(page);
+		err = drbd_recv_all_warn(peer_device->connection, data, len);
+		if (drbd_insert_fault(device, DRBD_FAULT_RECEIVE)) {
+			drbd_err(device, "Fault injection: Corrupting data on receive\n");
+			data[0] = data[0] ^ (unsigned long)-1;
+		}
+		kunmap(page);
+		if (err) {
+			drbd_free_peer_req(device, peer_req);
+			return NULL;
+		}
+		ds -= len;
+	}
+
+	if (digest_size) {
+		drbd_csum_ee_size(peer_device->connection->peer_integrity_tfm, peer_req, dig_vv, data_size);
+		if (memcmp(dig_in, dig_vv, digest_size)) {
+			drbd_err(device, "Digest integrity check FAILED: %llus +%u\n",
+				(unsigned long long)sector, data_size);
+			drbd_free_peer_req(device, peer_req);
+			return NULL;
+		}
+	}
+	device->recv_cnt += data_size >> 9;
+	return peer_req;
+}
+
+/* drbd_drain_block() just takes a data block
+ * out of the socket input buffer, and discards it.
+ */
+static int drbd_drain_block(struct drbd_peer_device *peer_device, int data_size)
+{
+	struct page *page;
+	int err = 0;
+	void *data;
+
+	if (!data_size)
+		return 0;
+
+	page = drbd_alloc_pages(peer_device, 1, 1);
+
+	data = kmap(page);
+	while (data_size) {
+		unsigned int len = min_t(int, data_size, PAGE_SIZE);
+
+		err = drbd_recv_all_warn(peer_device->connection, data, len);
+		if (err)
+			break;
+		data_size -= len;
+	}
+	kunmap(page);
+	drbd_free_pages(peer_device->device, page, 0);
+	return err;
+}
+
+static int recv_dless_read(struct drbd_peer_device *peer_device, struct drbd_request *req,
+			   sector_t sector, int data_size)
+{
+	struct bio_vec bvec;
+	struct bvec_iter iter;
+	struct bio *bio;
+	int digest_size, err, expect;
+	void *dig_in = peer_device->connection->int_dig_in;
+	void *dig_vv = peer_device->connection->int_dig_vv;
+
+	digest_size = 0;
+	if (peer_device->connection->peer_integrity_tfm) {
+		digest_size = crypto_shash_digestsize(peer_device->connection->peer_integrity_tfm);
+		err = drbd_recv_all_warn(peer_device->connection, dig_in, digest_size);
+		if (err)
+			return err;
+		data_size -= digest_size;
+	}
+
+	/* optimistically update recv_cnt.  if receiving fails below,
+	 * we disconnect anyways, and counters will be reset. */
+	peer_device->device->recv_cnt += data_size>>9;
+
+	bio = req->master_bio;
+	D_ASSERT(peer_device->device, sector == bio->bi_iter.bi_sector);
+
+	bio_for_each_segment(bvec, bio, iter) {
+		void *mapped = bvec_kmap_local(&bvec);
+		expect = min_t(int, data_size, bvec.bv_len);
+		err = drbd_recv_all_warn(peer_device->connection, mapped, expect);
+		kunmap_local(mapped);
+		if (err)
+			return err;
+		data_size -= expect;
+	}
+
+	if (digest_size) {
+		drbd_csum_bio(peer_device->connection->peer_integrity_tfm, bio, dig_vv);
+		if (memcmp(dig_in, dig_vv, digest_size)) {
+			drbd_err(peer_device, "Digest integrity check FAILED. Broken NICs?\n");
+			return -EINVAL;
+		}
+	}
+
+	D_ASSERT(peer_device->device, data_size == 0);
+	return 0;
+}
+
+/*
+ * e_end_resync_block() is called in ack_sender context via
+ * drbd_finish_peer_reqs().
+ */
+static int e_end_resync_block(struct drbd_work *w, int unused)
+{
+	struct drbd_peer_request *peer_req =
+		container_of(w, struct drbd_peer_request, w);
+	struct drbd_peer_device *peer_device = peer_req->peer_device;
+	struct drbd_device *device = peer_device->device;
+	sector_t sector = peer_req->i.sector;
+	int err;
+
+	D_ASSERT(device, drbd_interval_empty(&peer_req->i));
+
+	if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
+		drbd_set_in_sync(device, sector, peer_req->i.size);
+		err = drbd_send_ack(peer_device, P_RS_WRITE_ACK, peer_req);
+	} else {
+		/* Record failure to sync */
+		drbd_rs_failed_io(device, sector, peer_req->i.size);
+
+		err  = drbd_send_ack(peer_device, P_NEG_ACK, peer_req);
+	}
+	dec_unacked(device);
+
+	return err;
+}
+
+static int recv_resync_read(struct drbd_peer_device *peer_device, sector_t sector,
+			    struct packet_info *pi) __releases(local)
+{
+	struct drbd_device *device = peer_device->device;
+	struct drbd_peer_request *peer_req;
+
+	peer_req = read_in_block(peer_device, ID_SYNCER, sector, pi);
+	if (!peer_req)
+		goto fail;
+
+	dec_rs_pending(device);
+
+	inc_unacked(device);
+	/* corresponding dec_unacked() in e_end_resync_block()
+	 * respective _drbd_clear_done_ee */
+
+	peer_req->w.cb = e_end_resync_block;
+	peer_req->submit_jif = jiffies;
+
+	spin_lock_irq(&device->resource->req_lock);
+	list_add_tail(&peer_req->w.list, &device->sync_ee);
+	spin_unlock_irq(&device->resource->req_lock);
+
+	atomic_add(pi->size >> 9, &device->rs_sect_ev);
+	if (drbd_submit_peer_request(device, peer_req, REQ_OP_WRITE,
+				     DRBD_FAULT_RS_WR) == 0)
+		return 0;
+
+	/* don't care for the reason here */
+	drbd_err(device, "submit failed, triggering re-connect\n");
+	spin_lock_irq(&device->resource->req_lock);
+	list_del(&peer_req->w.list);
+	spin_unlock_irq(&device->resource->req_lock);
+
+	drbd_free_peer_req(device, peer_req);
+fail:
+	put_ldev(device);
+	return -EIO;
+}
+
+static struct drbd_request *
+find_request(struct drbd_device *device, struct rb_root *root, u64 id,
+	     sector_t sector, bool missing_ok, const char *func)
+{
+	struct drbd_request *req;
+
+	/* Request object according to our peer */
+	req = (struct drbd_request *)(unsigned long)id;
+	if (drbd_contains_interval(root, sector, &req->i) && req->i.local)
+		return req;
+	if (!missing_ok) {
+		drbd_err(device, "%s: failed to find request 0x%lx, sector %llus\n", func,
+			(unsigned long)id, (unsigned long long)sector);
+	}
+	return NULL;
+}
+
+static int receive_DataReply(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_peer_device *peer_device;
+	struct drbd_device *device;
+	struct drbd_request *req;
+	sector_t sector;
+	int err;
+	struct p_data *p = pi->data;
+
+	peer_device = conn_peer_device(connection, pi->vnr);
+	if (!peer_device)
+		return -EIO;
+	device = peer_device->device;
+
+	sector = be64_to_cpu(p->sector);
+
+	spin_lock_irq(&device->resource->req_lock);
+	req = find_request(device, &device->read_requests, p->block_id, sector, false, __func__);
+	spin_unlock_irq(&device->resource->req_lock);
+	if (unlikely(!req))
+		return -EIO;
+
+	err = recv_dless_read(peer_device, req, sector, pi->size);
+	if (!err)
+		req_mod(req, DATA_RECEIVED);
+	/* else: nothing. handled from drbd_disconnect...
+	 * I don't think we may complete this just yet
+	 * in case we are "on-disconnect: freeze" */
+
+	return err;
+}
+
+static int receive_RSDataReply(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_peer_device *peer_device;
+	struct drbd_device *device;
+	sector_t sector;
+	int err;
+	struct p_data *p = pi->data;
+
+	peer_device = conn_peer_device(connection, pi->vnr);
+	if (!peer_device)
+		return -EIO;
+	device = peer_device->device;
+
+	sector = be64_to_cpu(p->sector);
+	D_ASSERT(device, p->block_id == ID_SYNCER);
+
+	if (get_ldev(device)) {
+		/* data is submitted to disk within recv_resync_read.
+		 * corresponding put_ldev done below on error,
+		 * or in drbd_peer_request_endio. */
+		err = recv_resync_read(peer_device, sector, pi);
+	} else {
+		if (__ratelimit(&drbd_ratelimit_state))
+			drbd_err(device, "Can not write resync data to local disk.\n");
+
+		err = drbd_drain_block(peer_device, pi->size);
+
+		drbd_send_ack_dp(peer_device, P_NEG_ACK, p, pi->size);
+	}
+
+	atomic_add(pi->size >> 9, &device->rs_sect_in);
+
+	return err;
+}
+
+static void restart_conflicting_writes(struct drbd_device *device,
+				       sector_t sector, int size)
+{
+	struct drbd_interval *i;
+	struct drbd_request *req;
+
+	drbd_for_each_overlap(i, &device->write_requests, sector, size) {
+		if (!i->local)
+			continue;
+		req = container_of(i, struct drbd_request, i);
+		if (req->rq_state & RQ_LOCAL_PENDING ||
+		    !(req->rq_state & RQ_POSTPONED))
+			continue;
+		/* as it is RQ_POSTPONED, this will cause it to
+		 * be queued on the retry workqueue. */
+		__req_mod(req, CONFLICT_RESOLVED, NULL);
+	}
+}
+
+/*
+ * e_end_block() is called in ack_sender context via drbd_finish_peer_reqs().
+ */
+static int e_end_block(struct drbd_work *w, int cancel)
+{
+	struct drbd_peer_request *peer_req =
+		container_of(w, struct drbd_peer_request, w);
+	struct drbd_peer_device *peer_device = peer_req->peer_device;
+	struct drbd_device *device = peer_device->device;
+	sector_t sector = peer_req->i.sector;
+	int err = 0, pcmd;
+
+	if (peer_req->flags & EE_SEND_WRITE_ACK) {
+		if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
+			pcmd = (device->state.conn >= C_SYNC_SOURCE &&
+				device->state.conn <= C_PAUSED_SYNC_T &&
+				peer_req->flags & EE_MAY_SET_IN_SYNC) ?
+				P_RS_WRITE_ACK : P_WRITE_ACK;
+			err = drbd_send_ack(peer_device, pcmd, peer_req);
+			if (pcmd == P_RS_WRITE_ACK)
+				drbd_set_in_sync(device, sector, peer_req->i.size);
+		} else {
+			err = drbd_send_ack(peer_device, P_NEG_ACK, peer_req);
+			/* we expect it to be marked out of sync anyways...
+			 * maybe assert this?  */
+		}
+		dec_unacked(device);
+	}
+
+	/* we delete from the conflict detection hash _after_ we sent out the
+	 * P_WRITE_ACK / P_NEG_ACK, to get the sequence number right.  */
+	if (peer_req->flags & EE_IN_INTERVAL_TREE) {
+		spin_lock_irq(&device->resource->req_lock);
+		D_ASSERT(device, !drbd_interval_empty(&peer_req->i));
+		drbd_remove_epoch_entry_interval(device, peer_req);
+		if (peer_req->flags & EE_RESTART_REQUESTS)
+			restart_conflicting_writes(device, sector, peer_req->i.size);
+		spin_unlock_irq(&device->resource->req_lock);
+	} else
+		D_ASSERT(device, drbd_interval_empty(&peer_req->i));
+
+	drbd_may_finish_epoch(peer_device->connection, peer_req->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0));
+
+	return err;
+}
+
+static int e_send_ack(struct drbd_work *w, enum drbd_packet ack)
+{
+	struct drbd_peer_request *peer_req =
+		container_of(w, struct drbd_peer_request, w);
+	struct drbd_peer_device *peer_device = peer_req->peer_device;
+	int err;
+
+	err = drbd_send_ack(peer_device, ack, peer_req);
+	dec_unacked(peer_device->device);
+
+	return err;
+}
+
+static int e_send_superseded(struct drbd_work *w, int unused)
+{
+	return e_send_ack(w, P_SUPERSEDED);
+}
+
+static int e_send_retry_write(struct drbd_work *w, int unused)
+{
+	struct drbd_peer_request *peer_req =
+		container_of(w, struct drbd_peer_request, w);
+	struct drbd_connection *connection = peer_req->peer_device->connection;
+
+	return e_send_ack(w, connection->agreed_pro_version >= 100 ?
+			     P_RETRY_WRITE : P_SUPERSEDED);
+}
+
+static bool seq_greater(u32 a, u32 b)
+{
+	/*
+	 * We assume 32-bit wrap-around here.
+	 * For 24-bit wrap-around, we would have to shift:
+	 *  a <<= 8; b <<= 8;
+	 */
+	return (s32)a - (s32)b > 0;
+}
+
+static u32 seq_max(u32 a, u32 b)
+{
+	return seq_greater(a, b) ? a : b;
+}
+
+static void update_peer_seq(struct drbd_peer_device *peer_device, unsigned int peer_seq)
+{
+	struct drbd_device *device = peer_device->device;
+	unsigned int newest_peer_seq;
+
+	if (test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)) {
+		spin_lock(&device->peer_seq_lock);
+		newest_peer_seq = seq_max(device->peer_seq, peer_seq);
+		device->peer_seq = newest_peer_seq;
+		spin_unlock(&device->peer_seq_lock);
+		/* wake up only if we actually changed device->peer_seq */
+		if (peer_seq == newest_peer_seq)
+			wake_up(&device->seq_wait);
+	}
+}
+
+static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2)
+{
+	return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9)));
+}
+
+/* maybe change sync_ee into interval trees as well? */
+static bool overlapping_resync_write(struct drbd_device *device, struct drbd_peer_request *peer_req)
+{
+	struct drbd_peer_request *rs_req;
+	bool rv = false;
+
+	spin_lock_irq(&device->resource->req_lock);
+	list_for_each_entry(rs_req, &device->sync_ee, w.list) {
+		if (overlaps(peer_req->i.sector, peer_req->i.size,
+			     rs_req->i.sector, rs_req->i.size)) {
+			rv = true;
+			break;
+		}
+	}
+	spin_unlock_irq(&device->resource->req_lock);
+
+	return rv;
+}
+
+/* Called from receive_Data.
+ * Synchronize packets on sock with packets on msock.
+ *
+ * This is here so even when a P_DATA packet traveling via sock overtook an Ack
+ * packet traveling on msock, they are still processed in the order they have
+ * been sent.
+ *
+ * Note: we don't care for Ack packets overtaking P_DATA packets.
+ *
+ * In case packet_seq is larger than device->peer_seq number, there are
+ * outstanding packets on the msock. We wait for them to arrive.
+ * In case we are the logically next packet, we update device->peer_seq
+ * ourselves. Correctly handles 32bit wrap around.
+ *
+ * Assume we have a 10 GBit connection, that is about 1<<30 byte per second,
+ * about 1<<21 sectors per second. So "worst" case, we have 1<<3 == 8 seconds
+ * for the 24bit wrap (historical atomic_t guarantee on some archs), and we have
+ * 1<<9 == 512 seconds aka ages for the 32bit wrap around...
+ *
+ * returns 0 if we may process the packet,
+ * -ERESTARTSYS if we were interrupted (by disconnect signal). */
+static int wait_for_and_update_peer_seq(struct drbd_peer_device *peer_device, const u32 peer_seq)
+{
+	struct drbd_device *device = peer_device->device;
+	DEFINE_WAIT(wait);
+	long timeout;
+	int ret = 0, tp;
+
+	if (!test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags))
+		return 0;
+
+	spin_lock(&device->peer_seq_lock);
+	for (;;) {
+		if (!seq_greater(peer_seq - 1, device->peer_seq)) {
+			device->peer_seq = seq_max(device->peer_seq, peer_seq);
+			break;
+		}
+
+		if (signal_pending(current)) {
+			ret = -ERESTARTSYS;
+			break;
+		}
+
+		rcu_read_lock();
+		tp = rcu_dereference(peer_device->connection->net_conf)->two_primaries;
+		rcu_read_unlock();
+
+		if (!tp)
+			break;
+
+		/* Only need to wait if two_primaries is enabled */
+		prepare_to_wait(&device->seq_wait, &wait, TASK_INTERRUPTIBLE);
+		spin_unlock(&device->peer_seq_lock);
+		rcu_read_lock();
+		timeout = rcu_dereference(peer_device->connection->net_conf)->ping_timeo*HZ/10;
+		rcu_read_unlock();
+		timeout = schedule_timeout(timeout);
+		spin_lock(&device->peer_seq_lock);
+		if (!timeout) {
+			ret = -ETIMEDOUT;
+			drbd_err(device, "Timed out waiting for missing ack packets; disconnecting\n");
+			break;
+		}
+	}
+	spin_unlock(&device->peer_seq_lock);
+	finish_wait(&device->seq_wait, &wait);
+	return ret;
+}
+
+/* see also bio_flags_to_wire()
+ * DRBD_REQ_*, because we need to semantically map the flags to data packet
+ * flags and back. We may replicate to other kernel versions. */
+static blk_opf_t wire_flags_to_bio_flags(u32 dpf)
+{
+	return  (dpf & DP_RW_SYNC ? REQ_SYNC : 0) |
+		(dpf & DP_FUA ? REQ_FUA : 0) |
+		(dpf & DP_FLUSH ? REQ_PREFLUSH : 0);
+}
+
+static enum req_op wire_flags_to_bio_op(u32 dpf)
+{
+	if (dpf & DP_ZEROES)
+		return REQ_OP_WRITE_ZEROES;
+	if (dpf & DP_DISCARD)
+		return REQ_OP_DISCARD;
+	else
+		return REQ_OP_WRITE;
+}
+
+static void fail_postponed_requests(struct drbd_device *device, sector_t sector,
+				    unsigned int size)
+{
+	struct drbd_interval *i;
+
+    repeat:
+	drbd_for_each_overlap(i, &device->write_requests, sector, size) {
+		struct drbd_request *req;
+		struct bio_and_error m;
+
+		if (!i->local)
+			continue;
+		req = container_of(i, struct drbd_request, i);
+		if (!(req->rq_state & RQ_POSTPONED))
+			continue;
+		req->rq_state &= ~RQ_POSTPONED;
+		__req_mod(req, NEG_ACKED, &m);
+		spin_unlock_irq(&device->resource->req_lock);
+		if (m.bio)
+			complete_master_bio(device, &m);
+		spin_lock_irq(&device->resource->req_lock);
+		goto repeat;
+	}
+}
+
+static int handle_write_conflicts(struct drbd_device *device,
+				  struct drbd_peer_request *peer_req)
+{
+	struct drbd_connection *connection = peer_req->peer_device->connection;
+	bool resolve_conflicts = test_bit(RESOLVE_CONFLICTS, &connection->flags);
+	sector_t sector = peer_req->i.sector;
+	const unsigned int size = peer_req->i.size;
+	struct drbd_interval *i;
+	bool equal;
+	int err;
+
+	/*
+	 * Inserting the peer request into the write_requests tree will prevent
+	 * new conflicting local requests from being added.
+	 */
+	drbd_insert_interval(&device->write_requests, &peer_req->i);
+
+    repeat:
+	drbd_for_each_overlap(i, &device->write_requests, sector, size) {
+		if (i == &peer_req->i)
+			continue;
+		if (i->completed)
+			continue;
+
+		if (!i->local) {
+			/*
+			 * Our peer has sent a conflicting remote request; this
+			 * should not happen in a two-node setup.  Wait for the
+			 * earlier peer request to complete.
+			 */
+			err = drbd_wait_misc(device, i);
+			if (err)
+				goto out;
+			goto repeat;
+		}
+
+		equal = i->sector == sector && i->size == size;
+		if (resolve_conflicts) {
+			/*
+			 * If the peer request is fully contained within the
+			 * overlapping request, it can be considered overwritten
+			 * and thus superseded; otherwise, it will be retried
+			 * once all overlapping requests have completed.
+			 */
+			bool superseded = i->sector <= sector && i->sector +
+				       (i->size >> 9) >= sector + (size >> 9);
+
+			if (!equal)
+				drbd_alert(device, "Concurrent writes detected: "
+					       "local=%llus +%u, remote=%llus +%u, "
+					       "assuming %s came first\n",
+					  (unsigned long long)i->sector, i->size,
+					  (unsigned long long)sector, size,
+					  superseded ? "local" : "remote");
+
+			peer_req->w.cb = superseded ? e_send_superseded :
+						   e_send_retry_write;
+			list_add_tail(&peer_req->w.list, &device->done_ee);
+			queue_work(connection->ack_sender, &peer_req->peer_device->send_acks_work);
+
+			err = -ENOENT;
+			goto out;
+		} else {
+			struct drbd_request *req =
+				container_of(i, struct drbd_request, i);
+
+			if (!equal)
+				drbd_alert(device, "Concurrent writes detected: "
+					       "local=%llus +%u, remote=%llus +%u\n",
+					  (unsigned long long)i->sector, i->size,
+					  (unsigned long long)sector, size);
+
+			if (req->rq_state & RQ_LOCAL_PENDING ||
+			    !(req->rq_state & RQ_POSTPONED)) {
+				/*
+				 * Wait for the node with the discard flag to
+				 * decide if this request has been superseded
+				 * or needs to be retried.
+				 * Requests that have been superseded will
+				 * disappear from the write_requests tree.
+				 *
+				 * In addition, wait for the conflicting
+				 * request to finish locally before submitting
+				 * the conflicting peer request.
+				 */
+				err = drbd_wait_misc(device, &req->i);
+				if (err) {
+					_conn_request_state(connection, NS(conn, C_TIMEOUT), CS_HARD);
+					fail_postponed_requests(device, sector, size);
+					goto out;
+				}
+				goto repeat;
+			}
+			/*
+			 * Remember to restart the conflicting requests after
+			 * the new peer request has completed.
+			 */
+			peer_req->flags |= EE_RESTART_REQUESTS;
+		}
+	}
+	err = 0;
+
+    out:
+	if (err)
+		drbd_remove_epoch_entry_interval(device, peer_req);
+	return err;
+}
+
+/* mirrored write */
+static int receive_Data(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_peer_device *peer_device;
+	struct drbd_device *device;
+	struct net_conf *nc;
+	sector_t sector;
+	struct drbd_peer_request *peer_req;
+	struct p_data *p = pi->data;
+	u32 peer_seq = be32_to_cpu(p->seq_num);
+	enum req_op op;
+	blk_opf_t op_flags;
+	u32 dp_flags;
+	int err, tp;
+
+	peer_device = conn_peer_device(connection, pi->vnr);
+	if (!peer_device)
+		return -EIO;
+	device = peer_device->device;
+
+	if (!get_ldev(device)) {
+		int err2;
+
+		err = wait_for_and_update_peer_seq(peer_device, peer_seq);
+		drbd_send_ack_dp(peer_device, P_NEG_ACK, p, pi->size);
+		atomic_inc(&connection->current_epoch->epoch_size);
+		err2 = drbd_drain_block(peer_device, pi->size);
+		if (!err)
+			err = err2;
+		return err;
+	}
+
+	/*
+	 * Corresponding put_ldev done either below (on various errors), or in
+	 * drbd_peer_request_endio, if we successfully submit the data at the
+	 * end of this function.
+	 */
+
+	sector = be64_to_cpu(p->sector);
+	peer_req = read_in_block(peer_device, p->block_id, sector, pi);
+	if (!peer_req) {
+		put_ldev(device);
+		return -EIO;
+	}
+
+	peer_req->w.cb = e_end_block;
+	peer_req->submit_jif = jiffies;
+	peer_req->flags |= EE_APPLICATION;
+
+	dp_flags = be32_to_cpu(p->dp_flags);
+	op = wire_flags_to_bio_op(dp_flags);
+	op_flags = wire_flags_to_bio_flags(dp_flags);
+	if (pi->cmd == P_TRIM) {
+		D_ASSERT(peer_device, peer_req->i.size > 0);
+		D_ASSERT(peer_device, op == REQ_OP_DISCARD);
+		D_ASSERT(peer_device, peer_req->pages == NULL);
+		/* need to play safe: an older DRBD sender
+		 * may mean zero-out while sending P_TRIM. */
+		if (0 == (connection->agreed_features & DRBD_FF_WZEROES))
+			peer_req->flags |= EE_ZEROOUT;
+	} else if (pi->cmd == P_ZEROES) {
+		D_ASSERT(peer_device, peer_req->i.size > 0);
+		D_ASSERT(peer_device, op == REQ_OP_WRITE_ZEROES);
+		D_ASSERT(peer_device, peer_req->pages == NULL);
+		/* Do (not) pass down BLKDEV_ZERO_NOUNMAP? */
+		if (dp_flags & DP_DISCARD)
+			peer_req->flags |= EE_TRIM;
+	} else if (peer_req->pages == NULL) {
+		D_ASSERT(device, peer_req->i.size == 0);
+		D_ASSERT(device, dp_flags & DP_FLUSH);
+	}
+
+	if (dp_flags & DP_MAY_SET_IN_SYNC)
+		peer_req->flags |= EE_MAY_SET_IN_SYNC;
+
+	spin_lock(&connection->epoch_lock);
+	peer_req->epoch = connection->current_epoch;
+	atomic_inc(&peer_req->epoch->epoch_size);
+	atomic_inc(&peer_req->epoch->active);
+	spin_unlock(&connection->epoch_lock);
+
+	rcu_read_lock();
+	nc = rcu_dereference(peer_device->connection->net_conf);
+	tp = nc->two_primaries;
+	if (peer_device->connection->agreed_pro_version < 100) {
+		switch (nc->wire_protocol) {
+		case DRBD_PROT_C:
+			dp_flags |= DP_SEND_WRITE_ACK;
+			break;
+		case DRBD_PROT_B:
+			dp_flags |= DP_SEND_RECEIVE_ACK;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	if (dp_flags & DP_SEND_WRITE_ACK) {
+		peer_req->flags |= EE_SEND_WRITE_ACK;
+		inc_unacked(device);
+		/* corresponding dec_unacked() in e_end_block()
+		 * respective _drbd_clear_done_ee */
+	}
+
+	if (dp_flags & DP_SEND_RECEIVE_ACK) {
+		/* I really don't like it that the receiver thread
+		 * sends on the msock, but anyways */
+		drbd_send_ack(peer_device, P_RECV_ACK, peer_req);
+	}
+
+	if (tp) {
+		/* two primaries implies protocol C */
+		D_ASSERT(device, dp_flags & DP_SEND_WRITE_ACK);
+		peer_req->flags |= EE_IN_INTERVAL_TREE;
+		err = wait_for_and_update_peer_seq(peer_device, peer_seq);
+		if (err)
+			goto out_interrupted;
+		spin_lock_irq(&device->resource->req_lock);
+		err = handle_write_conflicts(device, peer_req);
+		if (err) {
+			spin_unlock_irq(&device->resource->req_lock);
+			if (err == -ENOENT) {
+				put_ldev(device);
+				return 0;
+			}
+			goto out_interrupted;
+		}
+	} else {
+		update_peer_seq(peer_device, peer_seq);
+		spin_lock_irq(&device->resource->req_lock);
+	}
+	/* TRIM and is processed synchronously,
+	 * we wait for all pending requests, respectively wait for
+	 * active_ee to become empty in drbd_submit_peer_request();
+	 * better not add ourselves here. */
+	if ((peer_req->flags & (EE_TRIM | EE_ZEROOUT)) == 0)
+		list_add_tail(&peer_req->w.list, &device->active_ee);
+	spin_unlock_irq(&device->resource->req_lock);
+
+	if (device->state.conn == C_SYNC_TARGET)
+		wait_event(device->ee_wait, !overlapping_resync_write(device, peer_req));
+
+	if (device->state.pdsk < D_INCONSISTENT) {
+		/* In case we have the only disk of the cluster, */
+		drbd_set_out_of_sync(device, peer_req->i.sector, peer_req->i.size);
+		peer_req->flags &= ~EE_MAY_SET_IN_SYNC;
+		drbd_al_begin_io(device, &peer_req->i);
+		peer_req->flags |= EE_CALL_AL_COMPLETE_IO;
+	}
+
+	err = drbd_submit_peer_request(device, peer_req, op | op_flags,
+				       DRBD_FAULT_DT_WR);
+	if (!err)
+		return 0;
+
+	/* don't care for the reason here */
+	drbd_err(device, "submit failed, triggering re-connect\n");
+	spin_lock_irq(&device->resource->req_lock);
+	list_del(&peer_req->w.list);
+	drbd_remove_epoch_entry_interval(device, peer_req);
+	spin_unlock_irq(&device->resource->req_lock);
+	if (peer_req->flags & EE_CALL_AL_COMPLETE_IO) {
+		peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
+		drbd_al_complete_io(device, &peer_req->i);
+	}
+
+out_interrupted:
+	drbd_may_finish_epoch(connection, peer_req->epoch, EV_PUT | EV_CLEANUP);
+	put_ldev(device);
+	drbd_free_peer_req(device, peer_req);
+	return err;
+}
+
+/* We may throttle resync, if the lower device seems to be busy,
+ * and current sync rate is above c_min_rate.
+ *
+ * To decide whether or not the lower device is busy, we use a scheme similar
+ * to MD RAID is_mddev_idle(): if the partition stats reveal "significant"
+ * (more than 64 sectors) of activity we cannot account for with our own resync
+ * activity, it obviously is "busy".
+ *
+ * The current sync rate used here uses only the most recent two step marks,
+ * to have a short time average so we can react faster.
+ */
+bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector,
+		bool throttle_if_app_is_waiting)
+{
+	struct lc_element *tmp;
+	bool throttle = drbd_rs_c_min_rate_throttle(device);
+
+	if (!throttle || throttle_if_app_is_waiting)
+		return throttle;
+
+	spin_lock_irq(&device->al_lock);
+	tmp = lc_find(device->resync, BM_SECT_TO_EXT(sector));
+	if (tmp) {
+		struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
+		if (test_bit(BME_PRIORITY, &bm_ext->flags))
+			throttle = false;
+		/* Do not slow down if app IO is already waiting for this extent,
+		 * and our progress is necessary for application IO to complete. */
+	}
+	spin_unlock_irq(&device->al_lock);
+
+	return throttle;
+}
+
+bool drbd_rs_c_min_rate_throttle(struct drbd_device *device)
+{
+	struct gendisk *disk = device->ldev->backing_bdev->bd_disk;
+	unsigned long db, dt, dbdt;
+	unsigned int c_min_rate;
+	int curr_events;
+
+	rcu_read_lock();
+	c_min_rate = rcu_dereference(device->ldev->disk_conf)->c_min_rate;
+	rcu_read_unlock();
+
+	/* feature disabled? */
+	if (c_min_rate == 0)
+		return false;
+
+	curr_events = (int)part_stat_read_accum(disk->part0, sectors) -
+			atomic_read(&device->rs_sect_ev);
+
+	if (atomic_read(&device->ap_actlog_cnt)
+	    || curr_events - device->rs_last_events > 64) {
+		unsigned long rs_left;
+		int i;
+
+		device->rs_last_events = curr_events;
+
+		/* sync speed average over the last 2*DRBD_SYNC_MARK_STEP,
+		 * approx. */
+		i = (device->rs_last_mark + DRBD_SYNC_MARKS-1) % DRBD_SYNC_MARKS;
+
+		if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T)
+			rs_left = device->ov_left;
+		else
+			rs_left = drbd_bm_total_weight(device) - device->rs_failed;
+
+		dt = ((long)jiffies - (long)device->rs_mark_time[i]) / HZ;
+		if (!dt)
+			dt++;
+		db = device->rs_mark_left[i] - rs_left;
+		dbdt = Bit2KB(db/dt);
+
+		if (dbdt > c_min_rate)
+			return true;
+	}
+	return false;
+}
+
+static int receive_DataRequest(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_peer_device *peer_device;
+	struct drbd_device *device;
+	sector_t sector;
+	sector_t capacity;
+	struct drbd_peer_request *peer_req;
+	struct digest_info *di = NULL;
+	int size, verb;
+	unsigned int fault_type;
+	struct p_block_req *p =	pi->data;
+
+	peer_device = conn_peer_device(connection, pi->vnr);
+	if (!peer_device)
+		return -EIO;
+	device = peer_device->device;
+	capacity = get_capacity(device->vdisk);
+
+	sector = be64_to_cpu(p->sector);
+	size   = be32_to_cpu(p->blksize);
+
+	if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) {
+		drbd_err(device, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
+				(unsigned long long)sector, size);
+		return -EINVAL;
+	}
+	if (sector + (size>>9) > capacity) {
+		drbd_err(device, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
+				(unsigned long long)sector, size);
+		return -EINVAL;
+	}
+
+	if (!get_ldev_if_state(device, D_UP_TO_DATE)) {
+		verb = 1;
+		switch (pi->cmd) {
+		case P_DATA_REQUEST:
+			drbd_send_ack_rp(peer_device, P_NEG_DREPLY, p);
+			break;
+		case P_RS_THIN_REQ:
+		case P_RS_DATA_REQUEST:
+		case P_CSUM_RS_REQUEST:
+		case P_OV_REQUEST:
+			drbd_send_ack_rp(peer_device, P_NEG_RS_DREPLY , p);
+			break;
+		case P_OV_REPLY:
+			verb = 0;
+			dec_rs_pending(device);
+			drbd_send_ack_ex(peer_device, P_OV_RESULT, sector, size, ID_IN_SYNC);
+			break;
+		default:
+			BUG();
+		}
+		if (verb && __ratelimit(&drbd_ratelimit_state))
+			drbd_err(device, "Can not satisfy peer's read request, "
+			    "no local data.\n");
+
+		/* drain possibly payload */
+		return drbd_drain_block(peer_device, pi->size);
+	}
+
+	/* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
+	 * "criss-cross" setup, that might cause write-out on some other DRBD,
+	 * which in turn might block on the other node at this very place.  */
+	peer_req = drbd_alloc_peer_req(peer_device, p->block_id, sector, size,
+			size, GFP_NOIO);
+	if (!peer_req) {
+		put_ldev(device);
+		return -ENOMEM;
+	}
+
+	switch (pi->cmd) {
+	case P_DATA_REQUEST:
+		peer_req->w.cb = w_e_end_data_req;
+		fault_type = DRBD_FAULT_DT_RD;
+		/* application IO, don't drbd_rs_begin_io */
+		peer_req->flags |= EE_APPLICATION;
+		goto submit;
+
+	case P_RS_THIN_REQ:
+		/* If at some point in the future we have a smart way to
+		   find out if this data block is completely deallocated,
+		   then we would do something smarter here than reading
+		   the block... */
+		peer_req->flags |= EE_RS_THIN_REQ;
+		fallthrough;
+	case P_RS_DATA_REQUEST:
+		peer_req->w.cb = w_e_end_rsdata_req;
+		fault_type = DRBD_FAULT_RS_RD;
+		/* used in the sector offset progress display */
+		device->bm_resync_fo = BM_SECT_TO_BIT(sector);
+		break;
+
+	case P_OV_REPLY:
+	case P_CSUM_RS_REQUEST:
+		fault_type = DRBD_FAULT_RS_RD;
+		di = kmalloc(sizeof(*di) + pi->size, GFP_NOIO);
+		if (!di)
+			goto out_free_e;
+
+		di->digest_size = pi->size;
+		di->digest = (((char *)di)+sizeof(struct digest_info));
+
+		peer_req->digest = di;
+		peer_req->flags |= EE_HAS_DIGEST;
+
+		if (drbd_recv_all(peer_device->connection, di->digest, pi->size))
+			goto out_free_e;
+
+		if (pi->cmd == P_CSUM_RS_REQUEST) {
+			D_ASSERT(device, peer_device->connection->agreed_pro_version >= 89);
+			peer_req->w.cb = w_e_end_csum_rs_req;
+			/* used in the sector offset progress display */
+			device->bm_resync_fo = BM_SECT_TO_BIT(sector);
+			/* remember to report stats in drbd_resync_finished */
+			device->use_csums = true;
+		} else if (pi->cmd == P_OV_REPLY) {
+			/* track progress, we may need to throttle */
+			atomic_add(size >> 9, &device->rs_sect_in);
+			peer_req->w.cb = w_e_end_ov_reply;
+			dec_rs_pending(device);
+			/* drbd_rs_begin_io done when we sent this request,
+			 * but accounting still needs to be done. */
+			goto submit_for_resync;
+		}
+		break;
+
+	case P_OV_REQUEST:
+		if (device->ov_start_sector == ~(sector_t)0 &&
+		    peer_device->connection->agreed_pro_version >= 90) {
+			unsigned long now = jiffies;
+			int i;
+			device->ov_start_sector = sector;
+			device->ov_position = sector;
+			device->ov_left = drbd_bm_bits(device) - BM_SECT_TO_BIT(sector);
+			device->rs_total = device->ov_left;
+			for (i = 0; i < DRBD_SYNC_MARKS; i++) {
+				device->rs_mark_left[i] = device->ov_left;
+				device->rs_mark_time[i] = now;
+			}
+			drbd_info(device, "Online Verify start sector: %llu\n",
+					(unsigned long long)sector);
+		}
+		peer_req->w.cb = w_e_end_ov_req;
+		fault_type = DRBD_FAULT_RS_RD;
+		break;
+
+	default:
+		BUG();
+	}
+
+	/* Throttle, drbd_rs_begin_io and submit should become asynchronous
+	 * wrt the receiver, but it is not as straightforward as it may seem.
+	 * Various places in the resync start and stop logic assume resync
+	 * requests are processed in order, requeuing this on the worker thread
+	 * introduces a bunch of new code for synchronization between threads.
+	 *
+	 * Unlimited throttling before drbd_rs_begin_io may stall the resync
+	 * "forever", throttling after drbd_rs_begin_io will lock that extent
+	 * for application writes for the same time.  For now, just throttle
+	 * here, where the rest of the code expects the receiver to sleep for
+	 * a while, anyways.
+	 */
+
+	/* Throttle before drbd_rs_begin_io, as that locks out application IO;
+	 * this defers syncer requests for some time, before letting at least
+	 * on request through.  The resync controller on the receiving side
+	 * will adapt to the incoming rate accordingly.
+	 *
+	 * We cannot throttle here if remote is Primary/SyncTarget:
+	 * we would also throttle its application reads.
+	 * In that case, throttling is done on the SyncTarget only.
+	 */
+
+	/* Even though this may be a resync request, we do add to "read_ee";
+	 * "sync_ee" is only used for resync WRITEs.
+	 * Add to list early, so debugfs can find this request
+	 * even if we have to sleep below. */
+	spin_lock_irq(&device->resource->req_lock);
+	list_add_tail(&peer_req->w.list, &device->read_ee);
+	spin_unlock_irq(&device->resource->req_lock);
+
+	update_receiver_timing_details(connection, drbd_rs_should_slow_down);
+	if (device->state.peer != R_PRIMARY
+	&& drbd_rs_should_slow_down(device, sector, false))
+		schedule_timeout_uninterruptible(HZ/10);
+	update_receiver_timing_details(connection, drbd_rs_begin_io);
+	if (drbd_rs_begin_io(device, sector))
+		goto out_free_e;
+
+submit_for_resync:
+	atomic_add(size >> 9, &device->rs_sect_ev);
+
+submit:
+	update_receiver_timing_details(connection, drbd_submit_peer_request);
+	inc_unacked(device);
+	if (drbd_submit_peer_request(device, peer_req, REQ_OP_READ,
+				     fault_type) == 0)
+		return 0;
+
+	/* don't care for the reason here */
+	drbd_err(device, "submit failed, triggering re-connect\n");
+
+out_free_e:
+	spin_lock_irq(&device->resource->req_lock);
+	list_del(&peer_req->w.list);
+	spin_unlock_irq(&device->resource->req_lock);
+	/* no drbd_rs_complete_io(), we are dropping the connection anyways */
+
+	put_ldev(device);
+	drbd_free_peer_req(device, peer_req);
+	return -EIO;
+}
+
+/*
+ * drbd_asb_recover_0p  -  Recover after split-brain with no remaining primaries
+ */
+static int drbd_asb_recover_0p(struct drbd_peer_device *peer_device) __must_hold(local)
+{
+	struct drbd_device *device = peer_device->device;
+	int self, peer, rv = -100;
+	unsigned long ch_self, ch_peer;
+	enum drbd_after_sb_p after_sb_0p;
+
+	self = device->ldev->md.uuid[UI_BITMAP] & 1;
+	peer = device->p_uuid[UI_BITMAP] & 1;
+
+	ch_peer = device->p_uuid[UI_SIZE];
+	ch_self = device->comm_bm_set;
+
+	rcu_read_lock();
+	after_sb_0p = rcu_dereference(peer_device->connection->net_conf)->after_sb_0p;
+	rcu_read_unlock();
+	switch (after_sb_0p) {
+	case ASB_CONSENSUS:
+	case ASB_DISCARD_SECONDARY:
+	case ASB_CALL_HELPER:
+	case ASB_VIOLENTLY:
+		drbd_err(device, "Configuration error.\n");
+		break;
+	case ASB_DISCONNECT:
+		break;
+	case ASB_DISCARD_YOUNGER_PRI:
+		if (self == 0 && peer == 1) {
+			rv = -1;
+			break;
+		}
+		if (self == 1 && peer == 0) {
+			rv =  1;
+			break;
+		}
+		fallthrough;	/* to one of the other strategies */
+	case ASB_DISCARD_OLDER_PRI:
+		if (self == 0 && peer == 1) {
+			rv = 1;
+			break;
+		}
+		if (self == 1 && peer == 0) {
+			rv = -1;
+			break;
+		}
+		/* Else fall through to one of the other strategies... */
+		drbd_warn(device, "Discard younger/older primary did not find a decision\n"
+		     "Using discard-least-changes instead\n");
+		fallthrough;
+	case ASB_DISCARD_ZERO_CHG:
+		if (ch_peer == 0 && ch_self == 0) {
+			rv = test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)
+				? -1 : 1;
+			break;
+		} else {
+			if (ch_peer == 0) { rv =  1; break; }
+			if (ch_self == 0) { rv = -1; break; }
+		}
+		if (after_sb_0p == ASB_DISCARD_ZERO_CHG)
+			break;
+		fallthrough;
+	case ASB_DISCARD_LEAST_CHG:
+		if	(ch_self < ch_peer)
+			rv = -1;
+		else if (ch_self > ch_peer)
+			rv =  1;
+		else /* ( ch_self == ch_peer ) */
+		     /* Well, then use something else. */
+			rv = test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags)
+				? -1 : 1;
+		break;
+	case ASB_DISCARD_LOCAL:
+		rv = -1;
+		break;
+	case ASB_DISCARD_REMOTE:
+		rv =  1;
+	}
+
+	return rv;
+}
+
+/*
+ * drbd_asb_recover_1p  -  Recover after split-brain with one remaining primary
+ */
+static int drbd_asb_recover_1p(struct drbd_peer_device *peer_device) __must_hold(local)
+{
+	struct drbd_device *device = peer_device->device;
+	int hg, rv = -100;
+	enum drbd_after_sb_p after_sb_1p;
+
+	rcu_read_lock();
+	after_sb_1p = rcu_dereference(peer_device->connection->net_conf)->after_sb_1p;
+	rcu_read_unlock();
+	switch (after_sb_1p) {
+	case ASB_DISCARD_YOUNGER_PRI:
+	case ASB_DISCARD_OLDER_PRI:
+	case ASB_DISCARD_LEAST_CHG:
+	case ASB_DISCARD_LOCAL:
+	case ASB_DISCARD_REMOTE:
+	case ASB_DISCARD_ZERO_CHG:
+		drbd_err(device, "Configuration error.\n");
+		break;
+	case ASB_DISCONNECT:
+		break;
+	case ASB_CONSENSUS:
+		hg = drbd_asb_recover_0p(peer_device);
+		if (hg == -1 && device->state.role == R_SECONDARY)
+			rv = hg;
+		if (hg == 1  && device->state.role == R_PRIMARY)
+			rv = hg;
+		break;
+	case ASB_VIOLENTLY:
+		rv = drbd_asb_recover_0p(peer_device);
+		break;
+	case ASB_DISCARD_SECONDARY:
+		return device->state.role == R_PRIMARY ? 1 : -1;
+	case ASB_CALL_HELPER:
+		hg = drbd_asb_recover_0p(peer_device);
+		if (hg == -1 && device->state.role == R_PRIMARY) {
+			enum drbd_state_rv rv2;
+
+			 /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
+			  * we might be here in C_WF_REPORT_PARAMS which is transient.
+			  * we do not need to wait for the after state change work either. */
+			rv2 = drbd_change_state(device, CS_VERBOSE, NS(role, R_SECONDARY));
+			if (rv2 != SS_SUCCESS) {
+				drbd_khelper(device, "pri-lost-after-sb");
+			} else {
+				drbd_warn(device, "Successfully gave up primary role.\n");
+				rv = hg;
+			}
+		} else
+			rv = hg;
+	}
+
+	return rv;
+}
+
+/*
+ * drbd_asb_recover_2p  -  Recover after split-brain with two remaining primaries
+ */
+static int drbd_asb_recover_2p(struct drbd_peer_device *peer_device) __must_hold(local)
+{
+	struct drbd_device *device = peer_device->device;
+	int hg, rv = -100;
+	enum drbd_after_sb_p after_sb_2p;
+
+	rcu_read_lock();
+	after_sb_2p = rcu_dereference(peer_device->connection->net_conf)->after_sb_2p;
+	rcu_read_unlock();
+	switch (after_sb_2p) {
+	case ASB_DISCARD_YOUNGER_PRI:
+	case ASB_DISCARD_OLDER_PRI:
+	case ASB_DISCARD_LEAST_CHG:
+	case ASB_DISCARD_LOCAL:
+	case ASB_DISCARD_REMOTE:
+	case ASB_CONSENSUS:
+	case ASB_DISCARD_SECONDARY:
+	case ASB_DISCARD_ZERO_CHG:
+		drbd_err(device, "Configuration error.\n");
+		break;
+	case ASB_VIOLENTLY:
+		rv = drbd_asb_recover_0p(peer_device);
+		break;
+	case ASB_DISCONNECT:
+		break;
+	case ASB_CALL_HELPER:
+		hg = drbd_asb_recover_0p(peer_device);
+		if (hg == -1) {
+			enum drbd_state_rv rv2;
+
+			 /* drbd_change_state() does not sleep while in SS_IN_TRANSIENT_STATE,
+			  * we might be here in C_WF_REPORT_PARAMS which is transient.
+			  * we do not need to wait for the after state change work either. */
+			rv2 = drbd_change_state(device, CS_VERBOSE, NS(role, R_SECONDARY));
+			if (rv2 != SS_SUCCESS) {
+				drbd_khelper(device, "pri-lost-after-sb");
+			} else {
+				drbd_warn(device, "Successfully gave up primary role.\n");
+				rv = hg;
+			}
+		} else
+			rv = hg;
+	}
+
+	return rv;
+}
+
+static void drbd_uuid_dump(struct drbd_device *device, char *text, u64 *uuid,
+			   u64 bits, u64 flags)
+{
+	if (!uuid) {
+		drbd_info(device, "%s uuid info vanished while I was looking!\n", text);
+		return;
+	}
+	drbd_info(device, "%s %016llX:%016llX:%016llX:%016llX bits:%llu flags:%llX\n",
+	     text,
+	     (unsigned long long)uuid[UI_CURRENT],
+	     (unsigned long long)uuid[UI_BITMAP],
+	     (unsigned long long)uuid[UI_HISTORY_START],
+	     (unsigned long long)uuid[UI_HISTORY_END],
+	     (unsigned long long)bits,
+	     (unsigned long long)flags);
+}
+
+/*
+  100	after split brain try auto recover
+    2	C_SYNC_SOURCE set BitMap
+    1	C_SYNC_SOURCE use BitMap
+    0	no Sync
+   -1	C_SYNC_TARGET use BitMap
+   -2	C_SYNC_TARGET set BitMap
+ -100	after split brain, disconnect
+-1000	unrelated data
+-1091   requires proto 91
+-1096   requires proto 96
+ */
+
+static int drbd_uuid_compare(struct drbd_device *const device, enum drbd_role const peer_role, int *rule_nr) __must_hold(local)
+{
+	struct drbd_peer_device *const peer_device = first_peer_device(device);
+	struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
+	u64 self, peer;
+	int i, j;
+
+	self = device->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
+	peer = device->p_uuid[UI_CURRENT] & ~((u64)1);
+
+	*rule_nr = 10;
+	if (self == UUID_JUST_CREATED && peer == UUID_JUST_CREATED)
+		return 0;
+
+	*rule_nr = 20;
+	if ((self == UUID_JUST_CREATED || self == (u64)0) &&
+	     peer != UUID_JUST_CREATED)
+		return -2;
+
+	*rule_nr = 30;
+	if (self != UUID_JUST_CREATED &&
+	    (peer == UUID_JUST_CREATED || peer == (u64)0))
+		return 2;
+
+	if (self == peer) {
+		int rct, dc; /* roles at crash time */
+
+		if (device->p_uuid[UI_BITMAP] == (u64)0 && device->ldev->md.uuid[UI_BITMAP] != (u64)0) {
+
+			if (connection->agreed_pro_version < 91)
+				return -1091;
+
+			if ((device->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) &&
+			    (device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START + 1] & ~((u64)1))) {
+				drbd_info(device, "was SyncSource, missed the resync finished event, corrected myself:\n");
+				drbd_uuid_move_history(device);
+				device->ldev->md.uuid[UI_HISTORY_START] = device->ldev->md.uuid[UI_BITMAP];
+				device->ldev->md.uuid[UI_BITMAP] = 0;
+
+				drbd_uuid_dump(device, "self", device->ldev->md.uuid,
+					       device->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(device) : 0, 0);
+				*rule_nr = 34;
+			} else {
+				drbd_info(device, "was SyncSource (peer failed to write sync_uuid)\n");
+				*rule_nr = 36;
+			}
+
+			return 1;
+		}
+
+		if (device->ldev->md.uuid[UI_BITMAP] == (u64)0 && device->p_uuid[UI_BITMAP] != (u64)0) {
+
+			if (connection->agreed_pro_version < 91)
+				return -1091;
+
+			if ((device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (device->p_uuid[UI_BITMAP] & ~((u64)1)) &&
+			    (device->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) == (device->p_uuid[UI_HISTORY_START] & ~((u64)1))) {
+				drbd_info(device, "was SyncTarget, peer missed the resync finished event, corrected peer:\n");
+
+				device->p_uuid[UI_HISTORY_START + 1] = device->p_uuid[UI_HISTORY_START];
+				device->p_uuid[UI_HISTORY_START] = device->p_uuid[UI_BITMAP];
+				device->p_uuid[UI_BITMAP] = 0UL;
+
+				drbd_uuid_dump(device, "peer", device->p_uuid, device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]);
+				*rule_nr = 35;
+			} else {
+				drbd_info(device, "was SyncTarget (failed to write sync_uuid)\n");
+				*rule_nr = 37;
+			}
+
+			return -1;
+		}
+
+		/* Common power [off|failure] */
+		rct = (test_bit(CRASHED_PRIMARY, &device->flags) ? 1 : 0) +
+			(device->p_uuid[UI_FLAGS] & 2);
+		/* lowest bit is set when we were primary,
+		 * next bit (weight 2) is set when peer was primary */
+		*rule_nr = 40;
+
+		/* Neither has the "crashed primary" flag set,
+		 * only a replication link hickup. */
+		if (rct == 0)
+			return 0;
+
+		/* Current UUID equal and no bitmap uuid; does not necessarily
+		 * mean this was a "simultaneous hard crash", maybe IO was
+		 * frozen, so no UUID-bump happened.
+		 * This is a protocol change, overload DRBD_FF_WSAME as flag
+		 * for "new-enough" peer DRBD version. */
+		if (device->state.role == R_PRIMARY || peer_role == R_PRIMARY) {
+			*rule_nr = 41;
+			if (!(connection->agreed_features & DRBD_FF_WSAME)) {
+				drbd_warn(peer_device, "Equivalent unrotated UUIDs, but current primary present.\n");
+				return -(0x10000 | PRO_VERSION_MAX | (DRBD_FF_WSAME << 8));
+			}
+			if (device->state.role == R_PRIMARY && peer_role == R_PRIMARY) {
+				/* At least one has the "crashed primary" bit set,
+				 * both are primary now, but neither has rotated its UUIDs?
+				 * "Can not happen." */
+				drbd_err(peer_device, "Equivalent unrotated UUIDs, but both are primary. Can not resolve this.\n");
+				return -100;
+			}
+			if (device->state.role == R_PRIMARY)
+				return 1;
+			return -1;
+		}
+
+		/* Both are secondary.
+		 * Really looks like recovery from simultaneous hard crash.
+		 * Check which had been primary before, and arbitrate. */
+		switch (rct) {
+		case 0: /* !self_pri && !peer_pri */ return 0; /* already handled */
+		case 1: /*  self_pri && !peer_pri */ return 1;
+		case 2: /* !self_pri &&  peer_pri */ return -1;
+		case 3: /*  self_pri &&  peer_pri */
+			dc = test_bit(RESOLVE_CONFLICTS, &connection->flags);
+			return dc ? -1 : 1;
+		}
+	}
+
+	*rule_nr = 50;
+	peer = device->p_uuid[UI_BITMAP] & ~((u64)1);
+	if (self == peer)
+		return -1;
+
+	*rule_nr = 51;
+	peer = device->p_uuid[UI_HISTORY_START] & ~((u64)1);
+	if (self == peer) {
+		if (connection->agreed_pro_version < 96 ?
+		    (device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) ==
+		    (device->p_uuid[UI_HISTORY_START + 1] & ~((u64)1)) :
+		    peer + UUID_NEW_BM_OFFSET == (device->p_uuid[UI_BITMAP] & ~((u64)1))) {
+			/* The last P_SYNC_UUID did not get though. Undo the last start of
+			   resync as sync source modifications of the peer's UUIDs. */
+
+			if (connection->agreed_pro_version < 91)
+				return -1091;
+
+			device->p_uuid[UI_BITMAP] = device->p_uuid[UI_HISTORY_START];
+			device->p_uuid[UI_HISTORY_START] = device->p_uuid[UI_HISTORY_START + 1];
+
+			drbd_info(device, "Lost last syncUUID packet, corrected:\n");
+			drbd_uuid_dump(device, "peer", device->p_uuid, device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]);
+
+			return -1;
+		}
+	}
+
+	*rule_nr = 60;
+	self = device->ldev->md.uuid[UI_CURRENT] & ~((u64)1);
+	for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
+		peer = device->p_uuid[i] & ~((u64)1);
+		if (self == peer)
+			return -2;
+	}
+
+	*rule_nr = 70;
+	self = device->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
+	peer = device->p_uuid[UI_CURRENT] & ~((u64)1);
+	if (self == peer)
+		return 1;
+
+	*rule_nr = 71;
+	self = device->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
+	if (self == peer) {
+		if (connection->agreed_pro_version < 96 ?
+		    (device->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) ==
+		    (device->p_uuid[UI_HISTORY_START] & ~((u64)1)) :
+		    self + UUID_NEW_BM_OFFSET == (device->ldev->md.uuid[UI_BITMAP] & ~((u64)1))) {
+			/* The last P_SYNC_UUID did not get though. Undo the last start of
+			   resync as sync source modifications of our UUIDs. */
+
+			if (connection->agreed_pro_version < 91)
+				return -1091;
+
+			__drbd_uuid_set(device, UI_BITMAP, device->ldev->md.uuid[UI_HISTORY_START]);
+			__drbd_uuid_set(device, UI_HISTORY_START, device->ldev->md.uuid[UI_HISTORY_START + 1]);
+
+			drbd_info(device, "Last syncUUID did not get through, corrected:\n");
+			drbd_uuid_dump(device, "self", device->ldev->md.uuid,
+				       device->state.disk >= D_NEGOTIATING ? drbd_bm_total_weight(device) : 0, 0);
+
+			return 1;
+		}
+	}
+
+
+	*rule_nr = 80;
+	peer = device->p_uuid[UI_CURRENT] & ~((u64)1);
+	for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
+		self = device->ldev->md.uuid[i] & ~((u64)1);
+		if (self == peer)
+			return 2;
+	}
+
+	*rule_nr = 90;
+	self = device->ldev->md.uuid[UI_BITMAP] & ~((u64)1);
+	peer = device->p_uuid[UI_BITMAP] & ~((u64)1);
+	if (self == peer && self != ((u64)0))
+		return 100;
+
+	*rule_nr = 100;
+	for (i = UI_HISTORY_START; i <= UI_HISTORY_END; i++) {
+		self = device->ldev->md.uuid[i] & ~((u64)1);
+		for (j = UI_HISTORY_START; j <= UI_HISTORY_END; j++) {
+			peer = device->p_uuid[j] & ~((u64)1);
+			if (self == peer)
+				return -100;
+		}
+	}
+
+	return -1000;
+}
+
+/* drbd_sync_handshake() returns the new conn state on success, or
+   CONN_MASK (-1) on failure.
+ */
+static enum drbd_conns drbd_sync_handshake(struct drbd_peer_device *peer_device,
+					   enum drbd_role peer_role,
+					   enum drbd_disk_state peer_disk) __must_hold(local)
+{
+	struct drbd_device *device = peer_device->device;
+	enum drbd_conns rv = C_MASK;
+	enum drbd_disk_state mydisk;
+	struct net_conf *nc;
+	int hg, rule_nr, rr_conflict, tentative, always_asbp;
+
+	mydisk = device->state.disk;
+	if (mydisk == D_NEGOTIATING)
+		mydisk = device->new_state_tmp.disk;
+
+	drbd_info(device, "drbd_sync_handshake:\n");
+
+	spin_lock_irq(&device->ldev->md.uuid_lock);
+	drbd_uuid_dump(device, "self", device->ldev->md.uuid, device->comm_bm_set, 0);
+	drbd_uuid_dump(device, "peer", device->p_uuid,
+		       device->p_uuid[UI_SIZE], device->p_uuid[UI_FLAGS]);
+
+	hg = drbd_uuid_compare(device, peer_role, &rule_nr);
+	spin_unlock_irq(&device->ldev->md.uuid_lock);
+
+	drbd_info(device, "uuid_compare()=%d by rule %d\n", hg, rule_nr);
+
+	if (hg == -1000) {
+		drbd_alert(device, "Unrelated data, aborting!\n");
+		return C_MASK;
+	}
+	if (hg < -0x10000) {
+		int proto, fflags;
+		hg = -hg;
+		proto = hg & 0xff;
+		fflags = (hg >> 8) & 0xff;
+		drbd_alert(device, "To resolve this both sides have to support at least protocol %d and feature flags 0x%x\n",
+					proto, fflags);
+		return C_MASK;
+	}
+	if (hg < -1000) {
+		drbd_alert(device, "To resolve this both sides have to support at least protocol %d\n", -hg - 1000);
+		return C_MASK;
+	}
+
+	if    ((mydisk == D_INCONSISTENT && peer_disk > D_INCONSISTENT) ||
+	    (peer_disk == D_INCONSISTENT && mydisk    > D_INCONSISTENT)) {
+		int f = (hg == -100) || abs(hg) == 2;
+		hg = mydisk > D_INCONSISTENT ? 1 : -1;
+		if (f)
+			hg = hg*2;
+		drbd_info(device, "Becoming sync %s due to disk states.\n",
+		     hg > 0 ? "source" : "target");
+	}
+
+	if (abs(hg) == 100)
+		drbd_khelper(device, "initial-split-brain");
+
+	rcu_read_lock();
+	nc = rcu_dereference(peer_device->connection->net_conf);
+	always_asbp = nc->always_asbp;
+	rr_conflict = nc->rr_conflict;
+	tentative = nc->tentative;
+	rcu_read_unlock();
+
+	if (hg == 100 || (hg == -100 && always_asbp)) {
+		int pcount = (device->state.role == R_PRIMARY)
+			   + (peer_role == R_PRIMARY);
+		int forced = (hg == -100);
+
+		switch (pcount) {
+		case 0:
+			hg = drbd_asb_recover_0p(peer_device);
+			break;
+		case 1:
+			hg = drbd_asb_recover_1p(peer_device);
+			break;
+		case 2:
+			hg = drbd_asb_recover_2p(peer_device);
+			break;
+		}
+		if (abs(hg) < 100) {
+			drbd_warn(device, "Split-Brain detected, %d primaries, "
+			     "automatically solved. Sync from %s node\n",
+			     pcount, (hg < 0) ? "peer" : "this");
+			if (forced) {
+				drbd_warn(device, "Doing a full sync, since"
+				     " UUIDs where ambiguous.\n");
+				hg = hg*2;
+			}
+		}
+	}
+
+	if (hg == -100) {
+		if (test_bit(DISCARD_MY_DATA, &device->flags) && !(device->p_uuid[UI_FLAGS]&1))
+			hg = -1;
+		if (!test_bit(DISCARD_MY_DATA, &device->flags) && (device->p_uuid[UI_FLAGS]&1))
+			hg = 1;
+
+		if (abs(hg) < 100)
+			drbd_warn(device, "Split-Brain detected, manually solved. "
+			     "Sync from %s node\n",
+			     (hg < 0) ? "peer" : "this");
+	}
+
+	if (hg == -100) {
+		/* FIXME this log message is not correct if we end up here
+		 * after an attempted attach on a diskless node.
+		 * We just refuse to attach -- well, we drop the "connection"
+		 * to that disk, in a way... */
+		drbd_alert(device, "Split-Brain detected but unresolved, dropping connection!\n");
+		drbd_khelper(device, "split-brain");
+		return C_MASK;
+	}
+
+	if (hg > 0 && mydisk <= D_INCONSISTENT) {
+		drbd_err(device, "I shall become SyncSource, but I am inconsistent!\n");
+		return C_MASK;
+	}
+
+	if (hg < 0 && /* by intention we do not use mydisk here. */
+	    device->state.role == R_PRIMARY && device->state.disk >= D_CONSISTENT) {
+		switch (rr_conflict) {
+		case ASB_CALL_HELPER:
+			drbd_khelper(device, "pri-lost");
+			fallthrough;
+		case ASB_DISCONNECT:
+			drbd_err(device, "I shall become SyncTarget, but I am primary!\n");
+			return C_MASK;
+		case ASB_VIOLENTLY:
+			drbd_warn(device, "Becoming SyncTarget, violating the stable-data"
+			     "assumption\n");
+		}
+	}
+
+	if (tentative || test_bit(CONN_DRY_RUN, &peer_device->connection->flags)) {
+		if (hg == 0)
+			drbd_info(device, "dry-run connect: No resync, would become Connected immediately.\n");
+		else
+			drbd_info(device, "dry-run connect: Would become %s, doing a %s resync.",
+				 drbd_conn_str(hg > 0 ? C_SYNC_SOURCE : C_SYNC_TARGET),
+				 abs(hg) >= 2 ? "full" : "bit-map based");
+		return C_MASK;
+	}
+
+	if (abs(hg) >= 2) {
+		drbd_info(device, "Writing the whole bitmap, full sync required after drbd_sync_handshake.\n");
+		if (drbd_bitmap_io(device, &drbd_bmio_set_n_write, "set_n_write from sync_handshake",
+					BM_LOCKED_SET_ALLOWED))
+			return C_MASK;
+	}
+
+	if (hg > 0) { /* become sync source. */
+		rv = C_WF_BITMAP_S;
+	} else if (hg < 0) { /* become sync target */
+		rv = C_WF_BITMAP_T;
+	} else {
+		rv = C_CONNECTED;
+		if (drbd_bm_total_weight(device)) {
+			drbd_info(device, "No resync, but %lu bits in bitmap!\n",
+			     drbd_bm_total_weight(device));
+		}
+	}
+
+	return rv;
+}
+
+static enum drbd_after_sb_p convert_after_sb(enum drbd_after_sb_p peer)
+{
+	/* ASB_DISCARD_REMOTE - ASB_DISCARD_LOCAL is valid */
+	if (peer == ASB_DISCARD_REMOTE)
+		return ASB_DISCARD_LOCAL;
+
+	/* any other things with ASB_DISCARD_REMOTE or ASB_DISCARD_LOCAL are invalid */
+	if (peer == ASB_DISCARD_LOCAL)
+		return ASB_DISCARD_REMOTE;
+
+	/* everything else is valid if they are equal on both sides. */
+	return peer;
+}
+
+static int receive_protocol(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct p_protocol *p = pi->data;
+	enum drbd_after_sb_p p_after_sb_0p, p_after_sb_1p, p_after_sb_2p;
+	int p_proto, p_discard_my_data, p_two_primaries, cf;
+	struct net_conf *nc, *old_net_conf, *new_net_conf = NULL;
+	char integrity_alg[SHARED_SECRET_MAX] = "";
+	struct crypto_shash *peer_integrity_tfm = NULL;
+	void *int_dig_in = NULL, *int_dig_vv = NULL;
+
+	p_proto		= be32_to_cpu(p->protocol);
+	p_after_sb_0p	= be32_to_cpu(p->after_sb_0p);
+	p_after_sb_1p	= be32_to_cpu(p->after_sb_1p);
+	p_after_sb_2p	= be32_to_cpu(p->after_sb_2p);
+	p_two_primaries = be32_to_cpu(p->two_primaries);
+	cf		= be32_to_cpu(p->conn_flags);
+	p_discard_my_data = cf & CF_DISCARD_MY_DATA;
+
+	if (connection->agreed_pro_version >= 87) {
+		int err;
+
+		if (pi->size > sizeof(integrity_alg))
+			return -EIO;
+		err = drbd_recv_all(connection, integrity_alg, pi->size);
+		if (err)
+			return err;
+		integrity_alg[SHARED_SECRET_MAX - 1] = 0;
+	}
+
+	if (pi->cmd != P_PROTOCOL_UPDATE) {
+		clear_bit(CONN_DRY_RUN, &connection->flags);
+
+		if (cf & CF_DRY_RUN)
+			set_bit(CONN_DRY_RUN, &connection->flags);
+
+		rcu_read_lock();
+		nc = rcu_dereference(connection->net_conf);
+
+		if (p_proto != nc->wire_protocol) {
+			drbd_err(connection, "incompatible %s settings\n", "protocol");
+			goto disconnect_rcu_unlock;
+		}
+
+		if (convert_after_sb(p_after_sb_0p) != nc->after_sb_0p) {
+			drbd_err(connection, "incompatible %s settings\n", "after-sb-0pri");
+			goto disconnect_rcu_unlock;
+		}
+
+		if (convert_after_sb(p_after_sb_1p) != nc->after_sb_1p) {
+			drbd_err(connection, "incompatible %s settings\n", "after-sb-1pri");
+			goto disconnect_rcu_unlock;
+		}
+
+		if (convert_after_sb(p_after_sb_2p) != nc->after_sb_2p) {
+			drbd_err(connection, "incompatible %s settings\n", "after-sb-2pri");
+			goto disconnect_rcu_unlock;
+		}
+
+		if (p_discard_my_data && nc->discard_my_data) {
+			drbd_err(connection, "incompatible %s settings\n", "discard-my-data");
+			goto disconnect_rcu_unlock;
+		}
+
+		if (p_two_primaries != nc->two_primaries) {
+			drbd_err(connection, "incompatible %s settings\n", "allow-two-primaries");
+			goto disconnect_rcu_unlock;
+		}
+
+		if (strcmp(integrity_alg, nc->integrity_alg)) {
+			drbd_err(connection, "incompatible %s settings\n", "data-integrity-alg");
+			goto disconnect_rcu_unlock;
+		}
+
+		rcu_read_unlock();
+	}
+
+	if (integrity_alg[0]) {
+		int hash_size;
+
+		/*
+		 * We can only change the peer data integrity algorithm
+		 * here.  Changing our own data integrity algorithm
+		 * requires that we send a P_PROTOCOL_UPDATE packet at
+		 * the same time; otherwise, the peer has no way to
+		 * tell between which packets the algorithm should
+		 * change.
+		 */
+
+		peer_integrity_tfm = crypto_alloc_shash(integrity_alg, 0, 0);
+		if (IS_ERR(peer_integrity_tfm)) {
+			peer_integrity_tfm = NULL;
+			drbd_err(connection, "peer data-integrity-alg %s not supported\n",
+				 integrity_alg);
+			goto disconnect;
+		}
+
+		hash_size = crypto_shash_digestsize(peer_integrity_tfm);
+		int_dig_in = kmalloc(hash_size, GFP_KERNEL);
+		int_dig_vv = kmalloc(hash_size, GFP_KERNEL);
+		if (!(int_dig_in && int_dig_vv)) {
+			drbd_err(connection, "Allocation of buffers for data integrity checking failed\n");
+			goto disconnect;
+		}
+	}
+
+	new_net_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL);
+	if (!new_net_conf)
+		goto disconnect;
+
+	mutex_lock(&connection->data.mutex);
+	mutex_lock(&connection->resource->conf_update);
+	old_net_conf = connection->net_conf;
+	*new_net_conf = *old_net_conf;
+
+	new_net_conf->wire_protocol = p_proto;
+	new_net_conf->after_sb_0p = convert_after_sb(p_after_sb_0p);
+	new_net_conf->after_sb_1p = convert_after_sb(p_after_sb_1p);
+	new_net_conf->after_sb_2p = convert_after_sb(p_after_sb_2p);
+	new_net_conf->two_primaries = p_two_primaries;
+
+	rcu_assign_pointer(connection->net_conf, new_net_conf);
+	mutex_unlock(&connection->resource->conf_update);
+	mutex_unlock(&connection->data.mutex);
+
+	crypto_free_shash(connection->peer_integrity_tfm);
+	kfree(connection->int_dig_in);
+	kfree(connection->int_dig_vv);
+	connection->peer_integrity_tfm = peer_integrity_tfm;
+	connection->int_dig_in = int_dig_in;
+	connection->int_dig_vv = int_dig_vv;
+
+	if (strcmp(old_net_conf->integrity_alg, integrity_alg))
+		drbd_info(connection, "peer data-integrity-alg: %s\n",
+			  integrity_alg[0] ? integrity_alg : "(none)");
+
+	kvfree_rcu(old_net_conf);
+	return 0;
+
+disconnect_rcu_unlock:
+	rcu_read_unlock();
+disconnect:
+	crypto_free_shash(peer_integrity_tfm);
+	kfree(int_dig_in);
+	kfree(int_dig_vv);
+	conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
+	return -EIO;
+}
+
+/* helper function
+ * input: alg name, feature name
+ * return: NULL (alg name was "")
+ *         ERR_PTR(error) if something goes wrong
+ *         or the crypto hash ptr, if it worked out ok. */
+static struct crypto_shash *drbd_crypto_alloc_digest_safe(
+		const struct drbd_device *device,
+		const char *alg, const char *name)
+{
+	struct crypto_shash *tfm;
+
+	if (!alg[0])
+		return NULL;
+
+	tfm = crypto_alloc_shash(alg, 0, 0);
+	if (IS_ERR(tfm)) {
+		drbd_err(device, "Can not allocate \"%s\" as %s (reason: %ld)\n",
+			alg, name, PTR_ERR(tfm));
+		return tfm;
+	}
+	return tfm;
+}
+
+static int ignore_remaining_packet(struct drbd_connection *connection, struct packet_info *pi)
+{
+	void *buffer = connection->data.rbuf;
+	int size = pi->size;
+
+	while (size) {
+		int s = min_t(int, size, DRBD_SOCKET_BUFFER_SIZE);
+		s = drbd_recv(connection, buffer, s);
+		if (s <= 0) {
+			if (s < 0)
+				return s;
+			break;
+		}
+		size -= s;
+	}
+	if (size)
+		return -EIO;
+	return 0;
+}
+
+/*
+ * config_unknown_volume  -  device configuration command for unknown volume
+ *
+ * When a device is added to an existing connection, the node on which the
+ * device is added first will send configuration commands to its peer but the
+ * peer will not know about the device yet.  It will warn and ignore these
+ * commands.  Once the device is added on the second node, the second node will
+ * send the same device configuration commands, but in the other direction.
+ *
+ * (We can also end up here if drbd is misconfigured.)
+ */
+static int config_unknown_volume(struct drbd_connection *connection, struct packet_info *pi)
+{
+	drbd_warn(connection, "%s packet received for volume %u, which is not configured locally\n",
+		  cmdname(pi->cmd), pi->vnr);
+	return ignore_remaining_packet(connection, pi);
+}
+
+static int receive_SyncParam(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_peer_device *peer_device;
+	struct drbd_device *device;
+	struct p_rs_param_95 *p;
+	unsigned int header_size, data_size, exp_max_sz;
+	struct crypto_shash *verify_tfm = NULL;
+	struct crypto_shash *csums_tfm = NULL;
+	struct net_conf *old_net_conf, *new_net_conf = NULL;
+	struct disk_conf *old_disk_conf = NULL, *new_disk_conf = NULL;
+	const int apv = connection->agreed_pro_version;
+	struct fifo_buffer *old_plan = NULL, *new_plan = NULL;
+	unsigned int fifo_size = 0;
+	int err;
+
+	peer_device = conn_peer_device(connection, pi->vnr);
+	if (!peer_device)
+		return config_unknown_volume(connection, pi);
+	device = peer_device->device;
+
+	exp_max_sz  = apv <= 87 ? sizeof(struct p_rs_param)
+		    : apv == 88 ? sizeof(struct p_rs_param)
+					+ SHARED_SECRET_MAX
+		    : apv <= 94 ? sizeof(struct p_rs_param_89)
+		    : /* apv >= 95 */ sizeof(struct p_rs_param_95);
+
+	if (pi->size > exp_max_sz) {
+		drbd_err(device, "SyncParam packet too long: received %u, expected <= %u bytes\n",
+		    pi->size, exp_max_sz);
+		return -EIO;
+	}
+
+	if (apv <= 88) {
+		header_size = sizeof(struct p_rs_param);
+		data_size = pi->size - header_size;
+	} else if (apv <= 94) {
+		header_size = sizeof(struct p_rs_param_89);
+		data_size = pi->size - header_size;
+		D_ASSERT(device, data_size == 0);
+	} else {
+		header_size = sizeof(struct p_rs_param_95);
+		data_size = pi->size - header_size;
+		D_ASSERT(device, data_size == 0);
+	}
+
+	/* initialize verify_alg and csums_alg */
+	p = pi->data;
+	BUILD_BUG_ON(sizeof(p->algs) != 2 * SHARED_SECRET_MAX);
+	memset(&p->algs, 0, sizeof(p->algs));
+
+	err = drbd_recv_all(peer_device->connection, p, header_size);
+	if (err)
+		return err;
+
+	mutex_lock(&connection->resource->conf_update);
+	old_net_conf = peer_device->connection->net_conf;
+	if (get_ldev(device)) {
+		new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
+		if (!new_disk_conf) {
+			put_ldev(device);
+			mutex_unlock(&connection->resource->conf_update);
+			drbd_err(device, "Allocation of new disk_conf failed\n");
+			return -ENOMEM;
+		}
+
+		old_disk_conf = device->ldev->disk_conf;
+		*new_disk_conf = *old_disk_conf;
+
+		new_disk_conf->resync_rate = be32_to_cpu(p->resync_rate);
+	}
+
+	if (apv >= 88) {
+		if (apv == 88) {
+			if (data_size > SHARED_SECRET_MAX || data_size == 0) {
+				drbd_err(device, "verify-alg of wrong size, "
+					"peer wants %u, accepting only up to %u byte\n",
+					data_size, SHARED_SECRET_MAX);
+				goto reconnect;
+			}
+
+			err = drbd_recv_all(peer_device->connection, p->verify_alg, data_size);
+			if (err)
+				goto reconnect;
+			/* we expect NUL terminated string */
+			/* but just in case someone tries to be evil */
+			D_ASSERT(device, p->verify_alg[data_size-1] == 0);
+			p->verify_alg[data_size-1] = 0;
+
+		} else /* apv >= 89 */ {
+			/* we still expect NUL terminated strings */
+			/* but just in case someone tries to be evil */
+			D_ASSERT(device, p->verify_alg[SHARED_SECRET_MAX-1] == 0);
+			D_ASSERT(device, p->csums_alg[SHARED_SECRET_MAX-1] == 0);
+			p->verify_alg[SHARED_SECRET_MAX-1] = 0;
+			p->csums_alg[SHARED_SECRET_MAX-1] = 0;
+		}
+
+		if (strcmp(old_net_conf->verify_alg, p->verify_alg)) {
+			if (device->state.conn == C_WF_REPORT_PARAMS) {
+				drbd_err(device, "Different verify-alg settings. me=\"%s\" peer=\"%s\"\n",
+				    old_net_conf->verify_alg, p->verify_alg);
+				goto disconnect;
+			}
+			verify_tfm = drbd_crypto_alloc_digest_safe(device,
+					p->verify_alg, "verify-alg");
+			if (IS_ERR(verify_tfm)) {
+				verify_tfm = NULL;
+				goto disconnect;
+			}
+		}
+
+		if (apv >= 89 && strcmp(old_net_conf->csums_alg, p->csums_alg)) {
+			if (device->state.conn == C_WF_REPORT_PARAMS) {
+				drbd_err(device, "Different csums-alg settings. me=\"%s\" peer=\"%s\"\n",
+				    old_net_conf->csums_alg, p->csums_alg);
+				goto disconnect;
+			}
+			csums_tfm = drbd_crypto_alloc_digest_safe(device,
+					p->csums_alg, "csums-alg");
+			if (IS_ERR(csums_tfm)) {
+				csums_tfm = NULL;
+				goto disconnect;
+			}
+		}
+
+		if (apv > 94 && new_disk_conf) {
+			new_disk_conf->c_plan_ahead = be32_to_cpu(p->c_plan_ahead);
+			new_disk_conf->c_delay_target = be32_to_cpu(p->c_delay_target);
+			new_disk_conf->c_fill_target = be32_to_cpu(p->c_fill_target);
+			new_disk_conf->c_max_rate = be32_to_cpu(p->c_max_rate);
+
+			fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ;
+			if (fifo_size != device->rs_plan_s->size) {
+				new_plan = fifo_alloc(fifo_size);
+				if (!new_plan) {
+					drbd_err(device, "kmalloc of fifo_buffer failed");
+					put_ldev(device);
+					goto disconnect;
+				}
+			}
+		}
+
+		if (verify_tfm || csums_tfm) {
+			new_net_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL);
+			if (!new_net_conf)
+				goto disconnect;
+
+			*new_net_conf = *old_net_conf;
+
+			if (verify_tfm) {
+				strcpy(new_net_conf->verify_alg, p->verify_alg);
+				new_net_conf->verify_alg_len = strlen(p->verify_alg) + 1;
+				crypto_free_shash(peer_device->connection->verify_tfm);
+				peer_device->connection->verify_tfm = verify_tfm;
+				drbd_info(device, "using verify-alg: \"%s\"\n", p->verify_alg);
+			}
+			if (csums_tfm) {
+				strcpy(new_net_conf->csums_alg, p->csums_alg);
+				new_net_conf->csums_alg_len = strlen(p->csums_alg) + 1;
+				crypto_free_shash(peer_device->connection->csums_tfm);
+				peer_device->connection->csums_tfm = csums_tfm;
+				drbd_info(device, "using csums-alg: \"%s\"\n", p->csums_alg);
+			}
+			rcu_assign_pointer(connection->net_conf, new_net_conf);
+		}
+	}
+
+	if (new_disk_conf) {
+		rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf);
+		put_ldev(device);
+	}
+
+	if (new_plan) {
+		old_plan = device->rs_plan_s;
+		rcu_assign_pointer(device->rs_plan_s, new_plan);
+	}
+
+	mutex_unlock(&connection->resource->conf_update);
+	synchronize_rcu();
+	if (new_net_conf)
+		kfree(old_net_conf);
+	kfree(old_disk_conf);
+	kfree(old_plan);
+
+	return 0;
+
+reconnect:
+	if (new_disk_conf) {
+		put_ldev(device);
+		kfree(new_disk_conf);
+	}
+	mutex_unlock(&connection->resource->conf_update);
+	return -EIO;
+
+disconnect:
+	kfree(new_plan);
+	if (new_disk_conf) {
+		put_ldev(device);
+		kfree(new_disk_conf);
+	}
+	mutex_unlock(&connection->resource->conf_update);
+	/* just for completeness: actually not needed,
+	 * as this is not reached if csums_tfm was ok. */
+	crypto_free_shash(csums_tfm);
+	/* but free the verify_tfm again, if csums_tfm did not work out */
+	crypto_free_shash(verify_tfm);
+	conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
+	return -EIO;
+}
+
+/* warn if the arguments differ by more than 12.5% */
+static void warn_if_differ_considerably(struct drbd_device *device,
+	const char *s, sector_t a, sector_t b)
+{
+	sector_t d;
+	if (a == 0 || b == 0)
+		return;
+	d = (a > b) ? (a - b) : (b - a);
+	if (d > (a>>3) || d > (b>>3))
+		drbd_warn(device, "Considerable difference in %s: %llus vs. %llus\n", s,
+		     (unsigned long long)a, (unsigned long long)b);
+}
+
+static int receive_sizes(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_peer_device *peer_device;
+	struct drbd_device *device;
+	struct p_sizes *p = pi->data;
+	struct o_qlim *o = (connection->agreed_features & DRBD_FF_WSAME) ? p->qlim : NULL;
+	enum determine_dev_size dd = DS_UNCHANGED;
+	sector_t p_size, p_usize, p_csize, my_usize;
+	sector_t new_size, cur_size;
+	int ldsc = 0; /* local disk size changed */
+	enum dds_flags ddsf;
+
+	peer_device = conn_peer_device(connection, pi->vnr);
+	if (!peer_device)
+		return config_unknown_volume(connection, pi);
+	device = peer_device->device;
+	cur_size = get_capacity(device->vdisk);
+
+	p_size = be64_to_cpu(p->d_size);
+	p_usize = be64_to_cpu(p->u_size);
+	p_csize = be64_to_cpu(p->c_size);
+
+	/* just store the peer's disk size for now.
+	 * we still need to figure out whether we accept that. */
+	device->p_size = p_size;
+
+	if (get_ldev(device)) {
+		rcu_read_lock();
+		my_usize = rcu_dereference(device->ldev->disk_conf)->disk_size;
+		rcu_read_unlock();
+
+		warn_if_differ_considerably(device, "lower level device sizes",
+			   p_size, drbd_get_max_capacity(device->ldev));
+		warn_if_differ_considerably(device, "user requested size",
+					    p_usize, my_usize);
+
+		/* if this is the first connect, or an otherwise expected
+		 * param exchange, choose the minimum */
+		if (device->state.conn == C_WF_REPORT_PARAMS)
+			p_usize = min_not_zero(my_usize, p_usize);
+
+		/* Never shrink a device with usable data during connect,
+		 * or "attach" on the peer.
+		 * But allow online shrinking if we are connected. */
+		new_size = drbd_new_dev_size(device, device->ldev, p_usize, 0);
+		if (new_size < cur_size &&
+		    device->state.disk >= D_OUTDATED &&
+		    (device->state.conn < C_CONNECTED || device->state.pdsk == D_DISKLESS)) {
+			drbd_err(device, "The peer's disk size is too small! (%llu < %llu sectors)\n",
+					(unsigned long long)new_size, (unsigned long long)cur_size);
+			conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
+			put_ldev(device);
+			return -EIO;
+		}
+
+		if (my_usize != p_usize) {
+			struct disk_conf *old_disk_conf, *new_disk_conf = NULL;
+
+			new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
+			if (!new_disk_conf) {
+				put_ldev(device);
+				return -ENOMEM;
+			}
+
+			mutex_lock(&connection->resource->conf_update);
+			old_disk_conf = device->ldev->disk_conf;
+			*new_disk_conf = *old_disk_conf;
+			new_disk_conf->disk_size = p_usize;
+
+			rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf);
+			mutex_unlock(&connection->resource->conf_update);
+			kvfree_rcu(old_disk_conf);
+
+			drbd_info(device, "Peer sets u_size to %lu sectors (old: %lu)\n",
+				 (unsigned long)p_usize, (unsigned long)my_usize);
+		}
+
+		put_ldev(device);
+	}
+
+	device->peer_max_bio_size = be32_to_cpu(p->max_bio_size);
+	/* Leave drbd_reconsider_queue_parameters() before drbd_determine_dev_size().
+	   In case we cleared the QUEUE_FLAG_DISCARD from our queue in
+	   drbd_reconsider_queue_parameters(), we can be sure that after
+	   drbd_determine_dev_size() no REQ_DISCARDs are in the queue. */
+
+	ddsf = be16_to_cpu(p->dds_flags);
+	if (get_ldev(device)) {
+		drbd_reconsider_queue_parameters(device, device->ldev, o);
+		dd = drbd_determine_dev_size(device, ddsf, NULL);
+		put_ldev(device);
+		if (dd == DS_ERROR)
+			return -EIO;
+		drbd_md_sync(device);
+	} else {
+		/*
+		 * I am diskless, need to accept the peer's *current* size.
+		 * I must NOT accept the peers backing disk size,
+		 * it may have been larger than mine all along...
+		 *
+		 * At this point, the peer knows more about my disk, or at
+		 * least about what we last agreed upon, than myself.
+		 * So if his c_size is less than his d_size, the most likely
+		 * reason is that *my* d_size was smaller last time we checked.
+		 *
+		 * However, if he sends a zero current size,
+		 * take his (user-capped or) backing disk size anyways.
+		 *
+		 * Unless of course he does not have a disk himself.
+		 * In which case we ignore this completely.
+		 */
+		sector_t new_size = p_csize ?: p_usize ?: p_size;
+		drbd_reconsider_queue_parameters(device, NULL, o);
+		if (new_size == 0) {
+			/* Ignore, peer does not know nothing. */
+		} else if (new_size == cur_size) {
+			/* nothing to do */
+		} else if (cur_size != 0 && p_size == 0) {
+			drbd_warn(device, "Ignored diskless peer device size (peer:%llu != me:%llu sectors)!\n",
+					(unsigned long long)new_size, (unsigned long long)cur_size);
+		} else if (new_size < cur_size && device->state.role == R_PRIMARY) {
+			drbd_err(device, "The peer's device size is too small! (%llu < %llu sectors); demote me first!\n",
+					(unsigned long long)new_size, (unsigned long long)cur_size);
+			conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
+			return -EIO;
+		} else {
+			/* I believe the peer, if
+			 *  - I don't have a current size myself
+			 *  - we agree on the size anyways
+			 *  - I do have a current size, am Secondary,
+			 *    and he has the only disk
+			 *  - I do have a current size, am Primary,
+			 *    and he has the only disk,
+			 *    which is larger than my current size
+			 */
+			drbd_set_my_capacity(device, new_size);
+		}
+	}
+
+	if (get_ldev(device)) {
+		if (device->ldev->known_size != drbd_get_capacity(device->ldev->backing_bdev)) {
+			device->ldev->known_size = drbd_get_capacity(device->ldev->backing_bdev);
+			ldsc = 1;
+		}
+
+		put_ldev(device);
+	}
+
+	if (device->state.conn > C_WF_REPORT_PARAMS) {
+		if (be64_to_cpu(p->c_size) != get_capacity(device->vdisk) ||
+		    ldsc) {
+			/* we have different sizes, probably peer
+			 * needs to know my new size... */
+			drbd_send_sizes(peer_device, 0, ddsf);
+		}
+		if (test_and_clear_bit(RESIZE_PENDING, &device->flags) ||
+		    (dd == DS_GREW && device->state.conn == C_CONNECTED)) {
+			if (device->state.pdsk >= D_INCONSISTENT &&
+			    device->state.disk >= D_INCONSISTENT) {
+				if (ddsf & DDSF_NO_RESYNC)
+					drbd_info(device, "Resync of new storage suppressed with --assume-clean\n");
+				else
+					resync_after_online_grow(device);
+			} else
+				set_bit(RESYNC_AFTER_NEG, &device->flags);
+		}
+	}
+
+	return 0;
+}
+
+static int receive_uuids(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_peer_device *peer_device;
+	struct drbd_device *device;
+	struct p_uuids *p = pi->data;
+	u64 *p_uuid;
+	int i, updated_uuids = 0;
+
+	peer_device = conn_peer_device(connection, pi->vnr);
+	if (!peer_device)
+		return config_unknown_volume(connection, pi);
+	device = peer_device->device;
+
+	p_uuid = kmalloc_array(UI_EXTENDED_SIZE, sizeof(*p_uuid), GFP_NOIO);
+	if (!p_uuid)
+		return false;
+
+	for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++)
+		p_uuid[i] = be64_to_cpu(p->uuid[i]);
+
+	kfree(device->p_uuid);
+	device->p_uuid = p_uuid;
+
+	if ((device->state.conn < C_CONNECTED || device->state.pdsk == D_DISKLESS) &&
+	    device->state.disk < D_INCONSISTENT &&
+	    device->state.role == R_PRIMARY &&
+	    (device->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) {
+		drbd_err(device, "Can only connect to data with current UUID=%016llX\n",
+		    (unsigned long long)device->ed_uuid);
+		conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
+		return -EIO;
+	}
+
+	if (get_ldev(device)) {
+		int skip_initial_sync =
+			device->state.conn == C_CONNECTED &&
+			peer_device->connection->agreed_pro_version >= 90 &&
+			device->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED &&
+			(p_uuid[UI_FLAGS] & 8);
+		if (skip_initial_sync) {
+			drbd_info(device, "Accepted new current UUID, preparing to skip initial sync\n");
+			drbd_bitmap_io(device, &drbd_bmio_clear_n_write,
+					"clear_n_write from receive_uuids",
+					BM_LOCKED_TEST_ALLOWED);
+			_drbd_uuid_set(device, UI_CURRENT, p_uuid[UI_CURRENT]);
+			_drbd_uuid_set(device, UI_BITMAP, 0);
+			_drbd_set_state(_NS2(device, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
+					CS_VERBOSE, NULL);
+			drbd_md_sync(device);
+			updated_uuids = 1;
+		}
+		put_ldev(device);
+	} else if (device->state.disk < D_INCONSISTENT &&
+		   device->state.role == R_PRIMARY) {
+		/* I am a diskless primary, the peer just created a new current UUID
+		   for me. */
+		updated_uuids = drbd_set_ed_uuid(device, p_uuid[UI_CURRENT]);
+	}
+
+	/* Before we test for the disk state, we should wait until an eventually
+	   ongoing cluster wide state change is finished. That is important if
+	   we are primary and are detaching from our disk. We need to see the
+	   new disk state... */
+	mutex_lock(device->state_mutex);
+	mutex_unlock(device->state_mutex);
+	if (device->state.conn >= C_CONNECTED && device->state.disk < D_INCONSISTENT)
+		updated_uuids |= drbd_set_ed_uuid(device, p_uuid[UI_CURRENT]);
+
+	if (updated_uuids)
+		drbd_print_uuids(device, "receiver updated UUIDs to");
+
+	return 0;
+}
+
+/**
+ * convert_state() - Converts the peer's view of the cluster state to our point of view
+ * @ps:		The state as seen by the peer.
+ */
+static union drbd_state convert_state(union drbd_state ps)
+{
+	union drbd_state ms;
+
+	static enum drbd_conns c_tab[] = {
+		[C_WF_REPORT_PARAMS] = C_WF_REPORT_PARAMS,
+		[C_CONNECTED] = C_CONNECTED,
+
+		[C_STARTING_SYNC_S] = C_STARTING_SYNC_T,
+		[C_STARTING_SYNC_T] = C_STARTING_SYNC_S,
+		[C_DISCONNECTING] = C_TEAR_DOWN, /* C_NETWORK_FAILURE, */
+		[C_VERIFY_S]       = C_VERIFY_T,
+		[C_MASK]   = C_MASK,
+	};
+
+	ms.i = ps.i;
+
+	ms.conn = c_tab[ps.conn];
+	ms.peer = ps.role;
+	ms.role = ps.peer;
+	ms.pdsk = ps.disk;
+	ms.disk = ps.pdsk;
+	ms.peer_isp = (ps.aftr_isp | ps.user_isp);
+
+	return ms;
+}
+
+static int receive_req_state(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_peer_device *peer_device;
+	struct drbd_device *device;
+	struct p_req_state *p = pi->data;
+	union drbd_state mask, val;
+	enum drbd_state_rv rv;
+
+	peer_device = conn_peer_device(connection, pi->vnr);
+	if (!peer_device)
+		return -EIO;
+	device = peer_device->device;
+
+	mask.i = be32_to_cpu(p->mask);
+	val.i = be32_to_cpu(p->val);
+
+	if (test_bit(RESOLVE_CONFLICTS, &peer_device->connection->flags) &&
+	    mutex_is_locked(device->state_mutex)) {
+		drbd_send_sr_reply(peer_device, SS_CONCURRENT_ST_CHG);
+		return 0;
+	}
+
+	mask = convert_state(mask);
+	val = convert_state(val);
+
+	rv = drbd_change_state(device, CS_VERBOSE, mask, val);
+	drbd_send_sr_reply(peer_device, rv);
+
+	drbd_md_sync(device);
+
+	return 0;
+}
+
+static int receive_req_conn_state(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct p_req_state *p = pi->data;
+	union drbd_state mask, val;
+	enum drbd_state_rv rv;
+
+	mask.i = be32_to_cpu(p->mask);
+	val.i = be32_to_cpu(p->val);
+
+	if (test_bit(RESOLVE_CONFLICTS, &connection->flags) &&
+	    mutex_is_locked(&connection->cstate_mutex)) {
+		conn_send_sr_reply(connection, SS_CONCURRENT_ST_CHG);
+		return 0;
+	}
+
+	mask = convert_state(mask);
+	val = convert_state(val);
+
+	rv = conn_request_state(connection, mask, val, CS_VERBOSE | CS_LOCAL_ONLY | CS_IGN_OUTD_FAIL);
+	conn_send_sr_reply(connection, rv);
+
+	return 0;
+}
+
+static int receive_state(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_peer_device *peer_device;
+	struct drbd_device *device;
+	struct p_state *p = pi->data;
+	union drbd_state os, ns, peer_state;
+	enum drbd_disk_state real_peer_disk;
+	enum chg_state_flags cs_flags;
+	int rv;
+
+	peer_device = conn_peer_device(connection, pi->vnr);
+	if (!peer_device)
+		return config_unknown_volume(connection, pi);
+	device = peer_device->device;
+
+	peer_state.i = be32_to_cpu(p->state);
+
+	real_peer_disk = peer_state.disk;
+	if (peer_state.disk == D_NEGOTIATING) {
+		real_peer_disk = device->p_uuid[UI_FLAGS] & 4 ? D_INCONSISTENT : D_CONSISTENT;
+		drbd_info(device, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk));
+	}
+
+	spin_lock_irq(&device->resource->req_lock);
+ retry:
+	os = ns = drbd_read_state(device);
+	spin_unlock_irq(&device->resource->req_lock);
+
+	/* If some other part of the code (ack_receiver thread, timeout)
+	 * already decided to close the connection again,
+	 * we must not "re-establish" it here. */
+	if (os.conn <= C_TEAR_DOWN)
+		return -ECONNRESET;
+
+	/* If this is the "end of sync" confirmation, usually the peer disk
+	 * transitions from D_INCONSISTENT to D_UP_TO_DATE. For empty (0 bits
+	 * set) resync started in PausedSyncT, or if the timing of pause-/
+	 * unpause-sync events has been "just right", the peer disk may
+	 * transition from D_CONSISTENT to D_UP_TO_DATE as well.
+	 */
+	if ((os.pdsk == D_INCONSISTENT || os.pdsk == D_CONSISTENT) &&
+	    real_peer_disk == D_UP_TO_DATE &&
+	    os.conn > C_CONNECTED && os.disk == D_UP_TO_DATE) {
+		/* If we are (becoming) SyncSource, but peer is still in sync
+		 * preparation, ignore its uptodate-ness to avoid flapping, it
+		 * will change to inconsistent once the peer reaches active
+		 * syncing states.
+		 * It may have changed syncer-paused flags, however, so we
+		 * cannot ignore this completely. */
+		if (peer_state.conn > C_CONNECTED &&
+		    peer_state.conn < C_SYNC_SOURCE)
+			real_peer_disk = D_INCONSISTENT;
+
+		/* if peer_state changes to connected at the same time,
+		 * it explicitly notifies us that it finished resync.
+		 * Maybe we should finish it up, too? */
+		else if (os.conn >= C_SYNC_SOURCE &&
+			 peer_state.conn == C_CONNECTED) {
+			if (drbd_bm_total_weight(device) <= device->rs_failed)
+				drbd_resync_finished(device);
+			return 0;
+		}
+	}
+
+	/* explicit verify finished notification, stop sector reached. */
+	if (os.conn == C_VERIFY_T && os.disk == D_UP_TO_DATE &&
+	    peer_state.conn == C_CONNECTED && real_peer_disk == D_UP_TO_DATE) {
+		ov_out_of_sync_print(device);
+		drbd_resync_finished(device);
+		return 0;
+	}
+
+	/* peer says his disk is inconsistent, while we think it is uptodate,
+	 * and this happens while the peer still thinks we have a sync going on,
+	 * but we think we are already done with the sync.
+	 * We ignore this to avoid flapping pdsk.
+	 * This should not happen, if the peer is a recent version of drbd. */
+	if (os.pdsk == D_UP_TO_DATE && real_peer_disk == D_INCONSISTENT &&
+	    os.conn == C_CONNECTED && peer_state.conn > C_SYNC_SOURCE)
+		real_peer_disk = D_UP_TO_DATE;
+
+	if (ns.conn == C_WF_REPORT_PARAMS)
+		ns.conn = C_CONNECTED;
+
+	if (peer_state.conn == C_AHEAD)
+		ns.conn = C_BEHIND;
+
+	/* TODO:
+	 * if (primary and diskless and peer uuid != effective uuid)
+	 *     abort attach on peer;
+	 *
+	 * If this node does not have good data, was already connected, but
+	 * the peer did a late attach only now, trying to "negotiate" with me,
+	 * AND I am currently Primary, possibly frozen, with some specific
+	 * "effective" uuid, this should never be reached, really, because
+	 * we first send the uuids, then the current state.
+	 *
+	 * In this scenario, we already dropped the connection hard
+	 * when we received the unsuitable uuids (receive_uuids().
+	 *
+	 * Should we want to change this, that is: not drop the connection in
+	 * receive_uuids() already, then we would need to add a branch here
+	 * that aborts the attach of "unsuitable uuids" on the peer in case
+	 * this node is currently Diskless Primary.
+	 */
+
+	if (device->p_uuid && peer_state.disk >= D_NEGOTIATING &&
+	    get_ldev_if_state(device, D_NEGOTIATING)) {
+		int cr; /* consider resync */
+
+		/* if we established a new connection */
+		cr  = (os.conn < C_CONNECTED);
+		/* if we had an established connection
+		 * and one of the nodes newly attaches a disk */
+		cr |= (os.conn == C_CONNECTED &&
+		       (peer_state.disk == D_NEGOTIATING ||
+			os.disk == D_NEGOTIATING));
+		/* if we have both been inconsistent, and the peer has been
+		 * forced to be UpToDate with --force */
+		cr |= test_bit(CONSIDER_RESYNC, &device->flags);
+		/* if we had been plain connected, and the admin requested to
+		 * start a sync by "invalidate" or "invalidate-remote" */
+		cr |= (os.conn == C_CONNECTED &&
+				(peer_state.conn >= C_STARTING_SYNC_S &&
+				 peer_state.conn <= C_WF_BITMAP_T));
+
+		if (cr)
+			ns.conn = drbd_sync_handshake(peer_device, peer_state.role, real_peer_disk);
+
+		put_ldev(device);
+		if (ns.conn == C_MASK) {
+			ns.conn = C_CONNECTED;
+			if (device->state.disk == D_NEGOTIATING) {
+				drbd_force_state(device, NS(disk, D_FAILED));
+			} else if (peer_state.disk == D_NEGOTIATING) {
+				drbd_err(device, "Disk attach process on the peer node was aborted.\n");
+				peer_state.disk = D_DISKLESS;
+				real_peer_disk = D_DISKLESS;
+			} else {
+				if (test_and_clear_bit(CONN_DRY_RUN, &peer_device->connection->flags))
+					return -EIO;
+				D_ASSERT(device, os.conn == C_WF_REPORT_PARAMS);
+				conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
+				return -EIO;
+			}
+		}
+	}
+
+	spin_lock_irq(&device->resource->req_lock);
+	if (os.i != drbd_read_state(device).i)
+		goto retry;
+	clear_bit(CONSIDER_RESYNC, &device->flags);
+	ns.peer = peer_state.role;
+	ns.pdsk = real_peer_disk;
+	ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp);
+	if ((ns.conn == C_CONNECTED || ns.conn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
+		ns.disk = device->new_state_tmp.disk;
+	cs_flags = CS_VERBOSE + (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED ? 0 : CS_HARD);
+	if (ns.pdsk == D_CONSISTENT && drbd_suspended(device) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED &&
+	    test_bit(NEW_CUR_UUID, &device->flags)) {
+		/* Do not allow tl_restart(RESEND) for a rebooted peer. We can only allow this
+		   for temporal network outages! */
+		spin_unlock_irq(&device->resource->req_lock);
+		drbd_err(device, "Aborting Connect, can not thaw IO with an only Consistent peer\n");
+		tl_clear(peer_device->connection);
+		drbd_uuid_new_current(device);
+		clear_bit(NEW_CUR_UUID, &device->flags);
+		conn_request_state(peer_device->connection, NS2(conn, C_PROTOCOL_ERROR, susp, 0), CS_HARD);
+		return -EIO;
+	}
+	rv = _drbd_set_state(device, ns, cs_flags, NULL);
+	ns = drbd_read_state(device);
+	spin_unlock_irq(&device->resource->req_lock);
+
+	if (rv < SS_SUCCESS) {
+		conn_request_state(peer_device->connection, NS(conn, C_DISCONNECTING), CS_HARD);
+		return -EIO;
+	}
+
+	if (os.conn > C_WF_REPORT_PARAMS) {
+		if (ns.conn > C_CONNECTED && peer_state.conn <= C_CONNECTED &&
+		    peer_state.disk != D_NEGOTIATING ) {
+			/* we want resync, peer has not yet decided to sync... */
+			/* Nowadays only used when forcing a node into primary role and
+			   setting its disk to UpToDate with that */
+			drbd_send_uuids(peer_device);
+			drbd_send_current_state(peer_device);
+		}
+	}
+
+	clear_bit(DISCARD_MY_DATA, &device->flags);
+
+	drbd_md_sync(device); /* update connected indicator, la_size_sect, ... */
+
+	return 0;
+}
+
+static int receive_sync_uuid(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_peer_device *peer_device;
+	struct drbd_device *device;
+	struct p_rs_uuid *p = pi->data;
+
+	peer_device = conn_peer_device(connection, pi->vnr);
+	if (!peer_device)
+		return -EIO;
+	device = peer_device->device;
+
+	wait_event(device->misc_wait,
+		   device->state.conn == C_WF_SYNC_UUID ||
+		   device->state.conn == C_BEHIND ||
+		   device->state.conn < C_CONNECTED ||
+		   device->state.disk < D_NEGOTIATING);
+
+	/* D_ASSERT(device,  device->state.conn == C_WF_SYNC_UUID ); */
+
+	/* Here the _drbd_uuid_ functions are right, current should
+	   _not_ be rotated into the history */
+	if (get_ldev_if_state(device, D_NEGOTIATING)) {
+		_drbd_uuid_set(device, UI_CURRENT, be64_to_cpu(p->uuid));
+		_drbd_uuid_set(device, UI_BITMAP, 0UL);
+
+		drbd_print_uuids(device, "updated sync uuid");
+		drbd_start_resync(device, C_SYNC_TARGET);
+
+		put_ldev(device);
+	} else
+		drbd_err(device, "Ignoring SyncUUID packet!\n");
+
+	return 0;
+}
+
+/*
+ * receive_bitmap_plain
+ *
+ * Return 0 when done, 1 when another iteration is needed, and a negative error
+ * code upon failure.
+ */
+static int
+receive_bitmap_plain(struct drbd_peer_device *peer_device, unsigned int size,
+		     unsigned long *p, struct bm_xfer_ctx *c)
+{
+	unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE -
+				 drbd_header_size(peer_device->connection);
+	unsigned int num_words = min_t(size_t, data_size / sizeof(*p),
+				       c->bm_words - c->word_offset);
+	unsigned int want = num_words * sizeof(*p);
+	int err;
+
+	if (want != size) {
+		drbd_err(peer_device, "%s:want (%u) != size (%u)\n", __func__, want, size);
+		return -EIO;
+	}
+	if (want == 0)
+		return 0;
+	err = drbd_recv_all(peer_device->connection, p, want);
+	if (err)
+		return err;
+
+	drbd_bm_merge_lel(peer_device->device, c->word_offset, num_words, p);
+
+	c->word_offset += num_words;
+	c->bit_offset = c->word_offset * BITS_PER_LONG;
+	if (c->bit_offset > c->bm_bits)
+		c->bit_offset = c->bm_bits;
+
+	return 1;
+}
+
+static enum drbd_bitmap_code dcbp_get_code(struct p_compressed_bm *p)
+{
+	return (enum drbd_bitmap_code)(p->encoding & 0x0f);
+}
+
+static int dcbp_get_start(struct p_compressed_bm *p)
+{
+	return (p->encoding & 0x80) != 0;
+}
+
+static int dcbp_get_pad_bits(struct p_compressed_bm *p)
+{
+	return (p->encoding >> 4) & 0x7;
+}
+
+/*
+ * recv_bm_rle_bits
+ *
+ * Return 0 when done, 1 when another iteration is needed, and a negative error
+ * code upon failure.
+ */
+static int
+recv_bm_rle_bits(struct drbd_peer_device *peer_device,
+		struct p_compressed_bm *p,
+		 struct bm_xfer_ctx *c,
+		 unsigned int len)
+{
+	struct bitstream bs;
+	u64 look_ahead;
+	u64 rl;
+	u64 tmp;
+	unsigned long s = c->bit_offset;
+	unsigned long e;
+	int toggle = dcbp_get_start(p);
+	int have;
+	int bits;
+
+	bitstream_init(&bs, p->code, len, dcbp_get_pad_bits(p));
+
+	bits = bitstream_get_bits(&bs, &look_ahead, 64);
+	if (bits < 0)
+		return -EIO;
+
+	for (have = bits; have > 0; s += rl, toggle = !toggle) {
+		bits = vli_decode_bits(&rl, look_ahead);
+		if (bits <= 0)
+			return -EIO;
+
+		if (toggle) {
+			e = s + rl -1;
+			if (e >= c->bm_bits) {
+				drbd_err(peer_device, "bitmap overflow (e:%lu) while decoding bm RLE packet\n", e);
+				return -EIO;
+			}
+			_drbd_bm_set_bits(peer_device->device, s, e);
+		}
+
+		if (have < bits) {
+			drbd_err(peer_device, "bitmap decoding error: h:%d b:%d la:0x%08llx l:%u/%u\n",
+				have, bits, look_ahead,
+				(unsigned int)(bs.cur.b - p->code),
+				(unsigned int)bs.buf_len);
+			return -EIO;
+		}
+		/* if we consumed all 64 bits, assign 0; >> 64 is "undefined"; */
+		if (likely(bits < 64))
+			look_ahead >>= bits;
+		else
+			look_ahead = 0;
+		have -= bits;
+
+		bits = bitstream_get_bits(&bs, &tmp, 64 - have);
+		if (bits < 0)
+			return -EIO;
+		look_ahead |= tmp << have;
+		have += bits;
+	}
+
+	c->bit_offset = s;
+	bm_xfer_ctx_bit_to_word_offset(c);
+
+	return (s != c->bm_bits);
+}
+
+/*
+ * decode_bitmap_c
+ *
+ * Return 0 when done, 1 when another iteration is needed, and a negative error
+ * code upon failure.
+ */
+static int
+decode_bitmap_c(struct drbd_peer_device *peer_device,
+		struct p_compressed_bm *p,
+		struct bm_xfer_ctx *c,
+		unsigned int len)
+{
+	if (dcbp_get_code(p) == RLE_VLI_Bits)
+		return recv_bm_rle_bits(peer_device, p, c, len - sizeof(*p));
+
+	/* other variants had been implemented for evaluation,
+	 * but have been dropped as this one turned out to be "best"
+	 * during all our tests. */
+
+	drbd_err(peer_device, "receive_bitmap_c: unknown encoding %u\n", p->encoding);
+	conn_request_state(peer_device->connection, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
+	return -EIO;
+}
+
+void INFO_bm_xfer_stats(struct drbd_device *device,
+		const char *direction, struct bm_xfer_ctx *c)
+{
+	/* what would it take to transfer it "plaintext" */
+	unsigned int header_size = drbd_header_size(first_peer_device(device)->connection);
+	unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE - header_size;
+	unsigned int plain =
+		header_size * (DIV_ROUND_UP(c->bm_words, data_size) + 1) +
+		c->bm_words * sizeof(unsigned long);
+	unsigned int total = c->bytes[0] + c->bytes[1];
+	unsigned int r;
+
+	/* total can not be zero. but just in case: */
+	if (total == 0)
+		return;
+
+	/* don't report if not compressed */
+	if (total >= plain)
+		return;
+
+	/* total < plain. check for overflow, still */
+	r = (total > UINT_MAX/1000) ? (total / (plain/1000))
+		                    : (1000 * total / plain);
+
+	if (r > 1000)
+		r = 1000;
+
+	r = 1000 - r;
+	drbd_info(device, "%s bitmap stats [Bytes(packets)]: plain %u(%u), RLE %u(%u), "
+	     "total %u; compression: %u.%u%%\n",
+			direction,
+			c->bytes[1], c->packets[1],
+			c->bytes[0], c->packets[0],
+			total, r/10, r % 10);
+}
+
+/* Since we are processing the bitfield from lower addresses to higher,
+   it does not matter if the process it in 32 bit chunks or 64 bit
+   chunks as long as it is little endian. (Understand it as byte stream,
+   beginning with the lowest byte...) If we would use big endian
+   we would need to process it from the highest address to the lowest,
+   in order to be agnostic to the 32 vs 64 bits issue.
+
+   returns 0 on failure, 1 if we successfully received it. */
+static int receive_bitmap(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_peer_device *peer_device;
+	struct drbd_device *device;
+	struct bm_xfer_ctx c;
+	int err;
+
+	peer_device = conn_peer_device(connection, pi->vnr);
+	if (!peer_device)
+		return -EIO;
+	device = peer_device->device;
+
+	drbd_bm_lock(device, "receive bitmap", BM_LOCKED_SET_ALLOWED);
+	/* you are supposed to send additional out-of-sync information
+	 * if you actually set bits during this phase */
+
+	c = (struct bm_xfer_ctx) {
+		.bm_bits = drbd_bm_bits(device),
+		.bm_words = drbd_bm_words(device),
+	};
+
+	for(;;) {
+		if (pi->cmd == P_BITMAP)
+			err = receive_bitmap_plain(peer_device, pi->size, pi->data, &c);
+		else if (pi->cmd == P_COMPRESSED_BITMAP) {
+			/* MAYBE: sanity check that we speak proto >= 90,
+			 * and the feature is enabled! */
+			struct p_compressed_bm *p = pi->data;
+
+			if (pi->size > DRBD_SOCKET_BUFFER_SIZE - drbd_header_size(connection)) {
+				drbd_err(device, "ReportCBitmap packet too large\n");
+				err = -EIO;
+				goto out;
+			}
+			if (pi->size <= sizeof(*p)) {
+				drbd_err(device, "ReportCBitmap packet too small (l:%u)\n", pi->size);
+				err = -EIO;
+				goto out;
+			}
+			err = drbd_recv_all(peer_device->connection, p, pi->size);
+			if (err)
+			       goto out;
+			err = decode_bitmap_c(peer_device, p, &c, pi->size);
+		} else {
+			drbd_warn(device, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", pi->cmd);
+			err = -EIO;
+			goto out;
+		}
+
+		c.packets[pi->cmd == P_BITMAP]++;
+		c.bytes[pi->cmd == P_BITMAP] += drbd_header_size(connection) + pi->size;
+
+		if (err <= 0) {
+			if (err < 0)
+				goto out;
+			break;
+		}
+		err = drbd_recv_header(peer_device->connection, pi);
+		if (err)
+			goto out;
+	}
+
+	INFO_bm_xfer_stats(device, "receive", &c);
+
+	if (device->state.conn == C_WF_BITMAP_T) {
+		enum drbd_state_rv rv;
+
+		err = drbd_send_bitmap(device);
+		if (err)
+			goto out;
+		/* Omit CS_ORDERED with this state transition to avoid deadlocks. */
+		rv = _drbd_request_state(device, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
+		D_ASSERT(device, rv == SS_SUCCESS);
+	} else if (device->state.conn != C_WF_BITMAP_S) {
+		/* admin may have requested C_DISCONNECTING,
+		 * other threads may have noticed network errors */
+		drbd_info(device, "unexpected cstate (%s) in receive_bitmap\n",
+		    drbd_conn_str(device->state.conn));
+	}
+	err = 0;
+
+ out:
+	drbd_bm_unlock(device);
+	if (!err && device->state.conn == C_WF_BITMAP_S)
+		drbd_start_resync(device, C_SYNC_SOURCE);
+	return err;
+}
+
+static int receive_skip(struct drbd_connection *connection, struct packet_info *pi)
+{
+	drbd_warn(connection, "skipping unknown optional packet type %d, l: %d!\n",
+		 pi->cmd, pi->size);
+
+	return ignore_remaining_packet(connection, pi);
+}
+
+static int receive_UnplugRemote(struct drbd_connection *connection, struct packet_info *pi)
+{
+	/* Make sure we've acked all the TCP data associated
+	 * with the data requests being unplugged */
+	tcp_sock_set_quickack(connection->data.socket->sk, 2);
+	return 0;
+}
+
+static int receive_out_of_sync(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_peer_device *peer_device;
+	struct drbd_device *device;
+	struct p_block_desc *p = pi->data;
+
+	peer_device = conn_peer_device(connection, pi->vnr);
+	if (!peer_device)
+		return -EIO;
+	device = peer_device->device;
+
+	switch (device->state.conn) {
+	case C_WF_SYNC_UUID:
+	case C_WF_BITMAP_T:
+	case C_BEHIND:
+			break;
+	default:
+		drbd_err(device, "ASSERT FAILED cstate = %s, expected: WFSyncUUID|WFBitMapT|Behind\n",
+				drbd_conn_str(device->state.conn));
+	}
+
+	drbd_set_out_of_sync(device, be64_to_cpu(p->sector), be32_to_cpu(p->blksize));
+
+	return 0;
+}
+
+static int receive_rs_deallocated(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_peer_device *peer_device;
+	struct p_block_desc *p = pi->data;
+	struct drbd_device *device;
+	sector_t sector;
+	int size, err = 0;
+
+	peer_device = conn_peer_device(connection, pi->vnr);
+	if (!peer_device)
+		return -EIO;
+	device = peer_device->device;
+
+	sector = be64_to_cpu(p->sector);
+	size = be32_to_cpu(p->blksize);
+
+	dec_rs_pending(device);
+
+	if (get_ldev(device)) {
+		struct drbd_peer_request *peer_req;
+		const enum req_op op = REQ_OP_WRITE_ZEROES;
+
+		peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER, sector,
+					       size, 0, GFP_NOIO);
+		if (!peer_req) {
+			put_ldev(device);
+			return -ENOMEM;
+		}
+
+		peer_req->w.cb = e_end_resync_block;
+		peer_req->submit_jif = jiffies;
+		peer_req->flags |= EE_TRIM;
+
+		spin_lock_irq(&device->resource->req_lock);
+		list_add_tail(&peer_req->w.list, &device->sync_ee);
+		spin_unlock_irq(&device->resource->req_lock);
+
+		atomic_add(pi->size >> 9, &device->rs_sect_ev);
+		err = drbd_submit_peer_request(device, peer_req, op,
+					       DRBD_FAULT_RS_WR);
+
+		if (err) {
+			spin_lock_irq(&device->resource->req_lock);
+			list_del(&peer_req->w.list);
+			spin_unlock_irq(&device->resource->req_lock);
+
+			drbd_free_peer_req(device, peer_req);
+			put_ldev(device);
+			err = 0;
+			goto fail;
+		}
+
+		inc_unacked(device);
+
+		/* No put_ldev() here. Gets called in drbd_endio_write_sec_final(),
+		   as well as drbd_rs_complete_io() */
+	} else {
+	fail:
+		drbd_rs_complete_io(device, sector);
+		drbd_send_ack_ex(peer_device, P_NEG_ACK, sector, size, ID_SYNCER);
+	}
+
+	atomic_add(size >> 9, &device->rs_sect_in);
+
+	return err;
+}
+
+struct data_cmd {
+	int expect_payload;
+	unsigned int pkt_size;
+	int (*fn)(struct drbd_connection *, struct packet_info *);
+};
+
+static struct data_cmd drbd_cmd_handler[] = {
+	[P_DATA]	    = { 1, sizeof(struct p_data), receive_Data },
+	[P_DATA_REPLY]	    = { 1, sizeof(struct p_data), receive_DataReply },
+	[P_RS_DATA_REPLY]   = { 1, sizeof(struct p_data), receive_RSDataReply } ,
+	[P_BARRIER]	    = { 0, sizeof(struct p_barrier), receive_Barrier } ,
+	[P_BITMAP]	    = { 1, 0, receive_bitmap } ,
+	[P_COMPRESSED_BITMAP] = { 1, 0, receive_bitmap } ,
+	[P_UNPLUG_REMOTE]   = { 0, 0, receive_UnplugRemote },
+	[P_DATA_REQUEST]    = { 0, sizeof(struct p_block_req), receive_DataRequest },
+	[P_RS_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
+	[P_SYNC_PARAM]	    = { 1, 0, receive_SyncParam },
+	[P_SYNC_PARAM89]    = { 1, 0, receive_SyncParam },
+	[P_PROTOCOL]        = { 1, sizeof(struct p_protocol), receive_protocol },
+	[P_UUIDS]	    = { 0, sizeof(struct p_uuids), receive_uuids },
+	[P_SIZES]	    = { 0, sizeof(struct p_sizes), receive_sizes },
+	[P_STATE]	    = { 0, sizeof(struct p_state), receive_state },
+	[P_STATE_CHG_REQ]   = { 0, sizeof(struct p_req_state), receive_req_state },
+	[P_SYNC_UUID]       = { 0, sizeof(struct p_rs_uuid), receive_sync_uuid },
+	[P_OV_REQUEST]      = { 0, sizeof(struct p_block_req), receive_DataRequest },
+	[P_OV_REPLY]        = { 1, sizeof(struct p_block_req), receive_DataRequest },
+	[P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest },
+	[P_RS_THIN_REQ]     = { 0, sizeof(struct p_block_req), receive_DataRequest },
+	[P_DELAY_PROBE]     = { 0, sizeof(struct p_delay_probe93), receive_skip },
+	[P_OUT_OF_SYNC]     = { 0, sizeof(struct p_block_desc), receive_out_of_sync },
+	[P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state },
+	[P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol },
+	[P_TRIM]	    = { 0, sizeof(struct p_trim), receive_Data },
+	[P_ZEROES]	    = { 0, sizeof(struct p_trim), receive_Data },
+	[P_RS_DEALLOCATED]  = { 0, sizeof(struct p_block_desc), receive_rs_deallocated },
+};
+
+static void drbdd(struct drbd_connection *connection)
+{
+	struct packet_info pi;
+	size_t shs; /* sub header size */
+	int err;
+
+	while (get_t_state(&connection->receiver) == RUNNING) {
+		struct data_cmd const *cmd;
+
+		drbd_thread_current_set_cpu(&connection->receiver);
+		update_receiver_timing_details(connection, drbd_recv_header_maybe_unplug);
+		if (drbd_recv_header_maybe_unplug(connection, &pi))
+			goto err_out;
+
+		cmd = &drbd_cmd_handler[pi.cmd];
+		if (unlikely(pi.cmd >= ARRAY_SIZE(drbd_cmd_handler) || !cmd->fn)) {
+			drbd_err(connection, "Unexpected data packet %s (0x%04x)",
+				 cmdname(pi.cmd), pi.cmd);
+			goto err_out;
+		}
+
+		shs = cmd->pkt_size;
+		if (pi.cmd == P_SIZES && connection->agreed_features & DRBD_FF_WSAME)
+			shs += sizeof(struct o_qlim);
+		if (pi.size > shs && !cmd->expect_payload) {
+			drbd_err(connection, "No payload expected %s l:%d\n",
+				 cmdname(pi.cmd), pi.size);
+			goto err_out;
+		}
+		if (pi.size < shs) {
+			drbd_err(connection, "%s: unexpected packet size, expected:%d received:%d\n",
+				 cmdname(pi.cmd), (int)shs, pi.size);
+			goto err_out;
+		}
+
+		if (shs) {
+			update_receiver_timing_details(connection, drbd_recv_all_warn);
+			err = drbd_recv_all_warn(connection, pi.data, shs);
+			if (err)
+				goto err_out;
+			pi.size -= shs;
+		}
+
+		update_receiver_timing_details(connection, cmd->fn);
+		err = cmd->fn(connection, &pi);
+		if (err) {
+			drbd_err(connection, "error receiving %s, e: %d l: %d!\n",
+				 cmdname(pi.cmd), err, pi.size);
+			goto err_out;
+		}
+	}
+	return;
+
+    err_out:
+	conn_request_state(connection, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
+}
+
+static void conn_disconnect(struct drbd_connection *connection)
+{
+	struct drbd_peer_device *peer_device;
+	enum drbd_conns oc;
+	int vnr;
+
+	if (connection->cstate == C_STANDALONE)
+		return;
+
+	/* We are about to start the cleanup after connection loss.
+	 * Make sure drbd_make_request knows about that.
+	 * Usually we should be in some network failure state already,
+	 * but just in case we are not, we fix it up here.
+	 */
+	conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
+
+	/* ack_receiver does not clean up anything. it must not interfere, either */
+	drbd_thread_stop(&connection->ack_receiver);
+	if (connection->ack_sender) {
+		destroy_workqueue(connection->ack_sender);
+		connection->ack_sender = NULL;
+	}
+	drbd_free_sock(connection);
+
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+		struct drbd_device *device = peer_device->device;
+		kref_get(&device->kref);
+		rcu_read_unlock();
+		drbd_disconnected(peer_device);
+		kref_put(&device->kref, drbd_destroy_device);
+		rcu_read_lock();
+	}
+	rcu_read_unlock();
+
+	if (!list_empty(&connection->current_epoch->list))
+		drbd_err(connection, "ASSERTION FAILED: connection->current_epoch->list not empty\n");
+	/* ok, no more ee's on the fly, it is safe to reset the epoch_size */
+	atomic_set(&connection->current_epoch->epoch_size, 0);
+	connection->send.seen_any_write_yet = false;
+
+	drbd_info(connection, "Connection closed\n");
+
+	if (conn_highest_role(connection) == R_PRIMARY && conn_highest_pdsk(connection) >= D_UNKNOWN)
+		conn_try_outdate_peer_async(connection);
+
+	spin_lock_irq(&connection->resource->req_lock);
+	oc = connection->cstate;
+	if (oc >= C_UNCONNECTED)
+		_conn_request_state(connection, NS(conn, C_UNCONNECTED), CS_VERBOSE);
+
+	spin_unlock_irq(&connection->resource->req_lock);
+
+	if (oc == C_DISCONNECTING)
+		conn_request_state(connection, NS(conn, C_STANDALONE), CS_VERBOSE | CS_HARD);
+}
+
+static int drbd_disconnected(struct drbd_peer_device *peer_device)
+{
+	struct drbd_device *device = peer_device->device;
+	unsigned int i;
+
+	/* wait for current activity to cease. */
+	spin_lock_irq(&device->resource->req_lock);
+	_drbd_wait_ee_list_empty(device, &device->active_ee);
+	_drbd_wait_ee_list_empty(device, &device->sync_ee);
+	_drbd_wait_ee_list_empty(device, &device->read_ee);
+	spin_unlock_irq(&device->resource->req_lock);
+
+	/* We do not have data structures that would allow us to
+	 * get the rs_pending_cnt down to 0 again.
+	 *  * On C_SYNC_TARGET we do not have any data structures describing
+	 *    the pending RSDataRequest's we have sent.
+	 *  * On C_SYNC_SOURCE there is no data structure that tracks
+	 *    the P_RS_DATA_REPLY blocks that we sent to the SyncTarget.
+	 *  And no, it is not the sum of the reference counts in the
+	 *  resync_LRU. The resync_LRU tracks the whole operation including
+	 *  the disk-IO, while the rs_pending_cnt only tracks the blocks
+	 *  on the fly. */
+	drbd_rs_cancel_all(device);
+	device->rs_total = 0;
+	device->rs_failed = 0;
+	atomic_set(&device->rs_pending_cnt, 0);
+	wake_up(&device->misc_wait);
+
+	del_timer_sync(&device->resync_timer);
+	resync_timer_fn(&device->resync_timer);
+
+	/* wait for all w_e_end_data_req, w_e_end_rsdata_req, w_send_barrier,
+	 * w_make_resync_request etc. which may still be on the worker queue
+	 * to be "canceled" */
+	drbd_flush_workqueue(&peer_device->connection->sender_work);
+
+	drbd_finish_peer_reqs(device);
+
+	/* This second workqueue flush is necessary, since drbd_finish_peer_reqs()
+	   might have issued a work again. The one before drbd_finish_peer_reqs() is
+	   necessary to reclain net_ee in drbd_finish_peer_reqs(). */
+	drbd_flush_workqueue(&peer_device->connection->sender_work);
+
+	/* need to do it again, drbd_finish_peer_reqs() may have populated it
+	 * again via drbd_try_clear_on_disk_bm(). */
+	drbd_rs_cancel_all(device);
+
+	kfree(device->p_uuid);
+	device->p_uuid = NULL;
+
+	if (!drbd_suspended(device))
+		tl_clear(peer_device->connection);
+
+	drbd_md_sync(device);
+
+	if (get_ldev(device)) {
+		drbd_bitmap_io(device, &drbd_bm_write_copy_pages,
+				"write from disconnected", BM_LOCKED_CHANGE_ALLOWED);
+		put_ldev(device);
+	}
+
+	/* tcp_close and release of sendpage pages can be deferred.  I don't
+	 * want to use SO_LINGER, because apparently it can be deferred for
+	 * more than 20 seconds (longest time I checked).
+	 *
+	 * Actually we don't care for exactly when the network stack does its
+	 * put_page(), but release our reference on these pages right here.
+	 */
+	i = drbd_free_peer_reqs(device, &device->net_ee);
+	if (i)
+		drbd_info(device, "net_ee not empty, killed %u entries\n", i);
+	i = atomic_read(&device->pp_in_use_by_net);
+	if (i)
+		drbd_info(device, "pp_in_use_by_net = %d, expected 0\n", i);
+	i = atomic_read(&device->pp_in_use);
+	if (i)
+		drbd_info(device, "pp_in_use = %d, expected 0\n", i);
+
+	D_ASSERT(device, list_empty(&device->read_ee));
+	D_ASSERT(device, list_empty(&device->active_ee));
+	D_ASSERT(device, list_empty(&device->sync_ee));
+	D_ASSERT(device, list_empty(&device->done_ee));
+
+	return 0;
+}
+
+/*
+ * We support PRO_VERSION_MIN to PRO_VERSION_MAX. The protocol version
+ * we can agree on is stored in agreed_pro_version.
+ *
+ * feature flags and the reserved array should be enough room for future
+ * enhancements of the handshake protocol, and possible plugins...
+ *
+ * for now, they are expected to be zero, but ignored.
+ */
+static int drbd_send_features(struct drbd_connection *connection)
+{
+	struct drbd_socket *sock;
+	struct p_connection_features *p;
+
+	sock = &connection->data;
+	p = conn_prepare_command(connection, sock);
+	if (!p)
+		return -EIO;
+	memset(p, 0, sizeof(*p));
+	p->protocol_min = cpu_to_be32(PRO_VERSION_MIN);
+	p->protocol_max = cpu_to_be32(PRO_VERSION_MAX);
+	p->feature_flags = cpu_to_be32(PRO_FEATURES);
+	return conn_send_command(connection, sock, P_CONNECTION_FEATURES, sizeof(*p), NULL, 0);
+}
+
+/*
+ * return values:
+ *   1 yes, we have a valid connection
+ *   0 oops, did not work out, please try again
+ *  -1 peer talks different language,
+ *     no point in trying again, please go standalone.
+ */
+static int drbd_do_features(struct drbd_connection *connection)
+{
+	/* ASSERT current == connection->receiver ... */
+	struct p_connection_features *p;
+	const int expect = sizeof(struct p_connection_features);
+	struct packet_info pi;
+	int err;
+
+	err = drbd_send_features(connection);
+	if (err)
+		return 0;
+
+	err = drbd_recv_header(connection, &pi);
+	if (err)
+		return 0;
+
+	if (pi.cmd != P_CONNECTION_FEATURES) {
+		drbd_err(connection, "expected ConnectionFeatures packet, received: %s (0x%04x)\n",
+			 cmdname(pi.cmd), pi.cmd);
+		return -1;
+	}
+
+	if (pi.size != expect) {
+		drbd_err(connection, "expected ConnectionFeatures length: %u, received: %u\n",
+		     expect, pi.size);
+		return -1;
+	}
+
+	p = pi.data;
+	err = drbd_recv_all_warn(connection, p, expect);
+	if (err)
+		return 0;
+
+	p->protocol_min = be32_to_cpu(p->protocol_min);
+	p->protocol_max = be32_to_cpu(p->protocol_max);
+	if (p->protocol_max == 0)
+		p->protocol_max = p->protocol_min;
+
+	if (PRO_VERSION_MAX < p->protocol_min ||
+	    PRO_VERSION_MIN > p->protocol_max)
+		goto incompat;
+
+	connection->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max);
+	connection->agreed_features = PRO_FEATURES & be32_to_cpu(p->feature_flags);
+
+	drbd_info(connection, "Handshake successful: "
+	     "Agreed network protocol version %d\n", connection->agreed_pro_version);
+
+	drbd_info(connection, "Feature flags enabled on protocol level: 0x%x%s%s%s%s.\n",
+		  connection->agreed_features,
+		  connection->agreed_features & DRBD_FF_TRIM ? " TRIM" : "",
+		  connection->agreed_features & DRBD_FF_THIN_RESYNC ? " THIN_RESYNC" : "",
+		  connection->agreed_features & DRBD_FF_WSAME ? " WRITE_SAME" : "",
+		  connection->agreed_features & DRBD_FF_WZEROES ? " WRITE_ZEROES" :
+		  connection->agreed_features ? "" : " none");
+
+	return 1;
+
+ incompat:
+	drbd_err(connection, "incompatible DRBD dialects: "
+	    "I support %d-%d, peer supports %d-%d\n",
+	    PRO_VERSION_MIN, PRO_VERSION_MAX,
+	    p->protocol_min, p->protocol_max);
+	return -1;
+}
+
+#if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE)
+static int drbd_do_auth(struct drbd_connection *connection)
+{
+	drbd_err(connection, "This kernel was build without CONFIG_CRYPTO_HMAC.\n");
+	drbd_err(connection, "You need to disable 'cram-hmac-alg' in drbd.conf.\n");
+	return -1;
+}
+#else
+#define CHALLENGE_LEN 64
+
+/* Return value:
+	1 - auth succeeded,
+	0 - failed, try again (network error),
+	-1 - auth failed, don't try again.
+*/
+
+static int drbd_do_auth(struct drbd_connection *connection)
+{
+	struct drbd_socket *sock;
+	char my_challenge[CHALLENGE_LEN];  /* 64 Bytes... */
+	char *response = NULL;
+	char *right_response = NULL;
+	char *peers_ch = NULL;
+	unsigned int key_len;
+	char secret[SHARED_SECRET_MAX]; /* 64 byte */
+	unsigned int resp_size;
+	struct shash_desc *desc;
+	struct packet_info pi;
+	struct net_conf *nc;
+	int err, rv;
+
+	/* FIXME: Put the challenge/response into the preallocated socket buffer.  */
+
+	rcu_read_lock();
+	nc = rcu_dereference(connection->net_conf);
+	key_len = strlen(nc->shared_secret);
+	memcpy(secret, nc->shared_secret, key_len);
+	rcu_read_unlock();
+
+	desc = kmalloc(sizeof(struct shash_desc) +
+		       crypto_shash_descsize(connection->cram_hmac_tfm),
+		       GFP_KERNEL);
+	if (!desc) {
+		rv = -1;
+		goto fail;
+	}
+	desc->tfm = connection->cram_hmac_tfm;
+
+	rv = crypto_shash_setkey(connection->cram_hmac_tfm, (u8 *)secret, key_len);
+	if (rv) {
+		drbd_err(connection, "crypto_shash_setkey() failed with %d\n", rv);
+		rv = -1;
+		goto fail;
+	}
+
+	get_random_bytes(my_challenge, CHALLENGE_LEN);
+
+	sock = &connection->data;
+	if (!conn_prepare_command(connection, sock)) {
+		rv = 0;
+		goto fail;
+	}
+	rv = !conn_send_command(connection, sock, P_AUTH_CHALLENGE, 0,
+				my_challenge, CHALLENGE_LEN);
+	if (!rv)
+		goto fail;
+
+	err = drbd_recv_header(connection, &pi);
+	if (err) {
+		rv = 0;
+		goto fail;
+	}
+
+	if (pi.cmd != P_AUTH_CHALLENGE) {
+		drbd_err(connection, "expected AuthChallenge packet, received: %s (0x%04x)\n",
+			 cmdname(pi.cmd), pi.cmd);
+		rv = -1;
+		goto fail;
+	}
+
+	if (pi.size > CHALLENGE_LEN * 2) {
+		drbd_err(connection, "expected AuthChallenge payload too big.\n");
+		rv = -1;
+		goto fail;
+	}
+
+	if (pi.size < CHALLENGE_LEN) {
+		drbd_err(connection, "AuthChallenge payload too small.\n");
+		rv = -1;
+		goto fail;
+	}
+
+	peers_ch = kmalloc(pi.size, GFP_NOIO);
+	if (!peers_ch) {
+		rv = -1;
+		goto fail;
+	}
+
+	err = drbd_recv_all_warn(connection, peers_ch, pi.size);
+	if (err) {
+		rv = 0;
+		goto fail;
+	}
+
+	if (!memcmp(my_challenge, peers_ch, CHALLENGE_LEN)) {
+		drbd_err(connection, "Peer presented the same challenge!\n");
+		rv = -1;
+		goto fail;
+	}
+
+	resp_size = crypto_shash_digestsize(connection->cram_hmac_tfm);
+	response = kmalloc(resp_size, GFP_NOIO);
+	if (!response) {
+		rv = -1;
+		goto fail;
+	}
+
+	rv = crypto_shash_digest(desc, peers_ch, pi.size, response);
+	if (rv) {
+		drbd_err(connection, "crypto_hash_digest() failed with %d\n", rv);
+		rv = -1;
+		goto fail;
+	}
+
+	if (!conn_prepare_command(connection, sock)) {
+		rv = 0;
+		goto fail;
+	}
+	rv = !conn_send_command(connection, sock, P_AUTH_RESPONSE, 0,
+				response, resp_size);
+	if (!rv)
+		goto fail;
+
+	err = drbd_recv_header(connection, &pi);
+	if (err) {
+		rv = 0;
+		goto fail;
+	}
+
+	if (pi.cmd != P_AUTH_RESPONSE) {
+		drbd_err(connection, "expected AuthResponse packet, received: %s (0x%04x)\n",
+			 cmdname(pi.cmd), pi.cmd);
+		rv = 0;
+		goto fail;
+	}
+
+	if (pi.size != resp_size) {
+		drbd_err(connection, "expected AuthResponse payload of wrong size\n");
+		rv = 0;
+		goto fail;
+	}
+
+	err = drbd_recv_all_warn(connection, response , resp_size);
+	if (err) {
+		rv = 0;
+		goto fail;
+	}
+
+	right_response = kmalloc(resp_size, GFP_NOIO);
+	if (!right_response) {
+		rv = -1;
+		goto fail;
+	}
+
+	rv = crypto_shash_digest(desc, my_challenge, CHALLENGE_LEN,
+				 right_response);
+	if (rv) {
+		drbd_err(connection, "crypto_hash_digest() failed with %d\n", rv);
+		rv = -1;
+		goto fail;
+	}
+
+	rv = !memcmp(response, right_response, resp_size);
+
+	if (rv)
+		drbd_info(connection, "Peer authenticated using %d bytes HMAC\n",
+		     resp_size);
+	else
+		rv = -1;
+
+ fail:
+	kfree(peers_ch);
+	kfree(response);
+	kfree(right_response);
+	if (desc) {
+		shash_desc_zero(desc);
+		kfree(desc);
+	}
+
+	return rv;
+}
+#endif
+
+int drbd_receiver(struct drbd_thread *thi)
+{
+	struct drbd_connection *connection = thi->connection;
+	int h;
+
+	drbd_info(connection, "receiver (re)started\n");
+
+	do {
+		h = conn_connect(connection);
+		if (h == 0) {
+			conn_disconnect(connection);
+			schedule_timeout_interruptible(HZ);
+		}
+		if (h == -1) {
+			drbd_warn(connection, "Discarding network configuration.\n");
+			conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
+		}
+	} while (h == 0);
+
+	if (h > 0) {
+		blk_start_plug(&connection->receiver_plug);
+		drbdd(connection);
+		blk_finish_plug(&connection->receiver_plug);
+	}
+
+	conn_disconnect(connection);
+
+	drbd_info(connection, "receiver terminated\n");
+	return 0;
+}
+
+/* ********* acknowledge sender ******** */
+
+static int got_conn_RqSReply(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct p_req_state_reply *p = pi->data;
+	int retcode = be32_to_cpu(p->retcode);
+
+	if (retcode >= SS_SUCCESS) {
+		set_bit(CONN_WD_ST_CHG_OKAY, &connection->flags);
+	} else {
+		set_bit(CONN_WD_ST_CHG_FAIL, &connection->flags);
+		drbd_err(connection, "Requested state change failed by peer: %s (%d)\n",
+			 drbd_set_st_err_str(retcode), retcode);
+	}
+	wake_up(&connection->ping_wait);
+
+	return 0;
+}
+
+static int got_RqSReply(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_peer_device *peer_device;
+	struct drbd_device *device;
+	struct p_req_state_reply *p = pi->data;
+	int retcode = be32_to_cpu(p->retcode);
+
+	peer_device = conn_peer_device(connection, pi->vnr);
+	if (!peer_device)
+		return -EIO;
+	device = peer_device->device;
+
+	if (test_bit(CONN_WD_ST_CHG_REQ, &connection->flags)) {
+		D_ASSERT(device, connection->agreed_pro_version < 100);
+		return got_conn_RqSReply(connection, pi);
+	}
+
+	if (retcode >= SS_SUCCESS) {
+		set_bit(CL_ST_CHG_SUCCESS, &device->flags);
+	} else {
+		set_bit(CL_ST_CHG_FAIL, &device->flags);
+		drbd_err(device, "Requested state change failed by peer: %s (%d)\n",
+			drbd_set_st_err_str(retcode), retcode);
+	}
+	wake_up(&device->state_wait);
+
+	return 0;
+}
+
+static int got_Ping(struct drbd_connection *connection, struct packet_info *pi)
+{
+	return drbd_send_ping_ack(connection);
+
+}
+
+static int got_PingAck(struct drbd_connection *connection, struct packet_info *pi)
+{
+	/* restore idle timeout */
+	connection->meta.socket->sk->sk_rcvtimeo = connection->net_conf->ping_int*HZ;
+	if (!test_and_set_bit(GOT_PING_ACK, &connection->flags))
+		wake_up(&connection->ping_wait);
+
+	return 0;
+}
+
+static int got_IsInSync(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_peer_device *peer_device;
+	struct drbd_device *device;
+	struct p_block_ack *p = pi->data;
+	sector_t sector = be64_to_cpu(p->sector);
+	int blksize = be32_to_cpu(p->blksize);
+
+	peer_device = conn_peer_device(connection, pi->vnr);
+	if (!peer_device)
+		return -EIO;
+	device = peer_device->device;
+
+	D_ASSERT(device, peer_device->connection->agreed_pro_version >= 89);
+
+	update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
+
+	if (get_ldev(device)) {
+		drbd_rs_complete_io(device, sector);
+		drbd_set_in_sync(device, sector, blksize);
+		/* rs_same_csums is supposed to count in units of BM_BLOCK_SIZE */
+		device->rs_same_csum += (blksize >> BM_BLOCK_SHIFT);
+		put_ldev(device);
+	}
+	dec_rs_pending(device);
+	atomic_add(blksize >> 9, &device->rs_sect_in);
+
+	return 0;
+}
+
+static int
+validate_req_change_req_state(struct drbd_device *device, u64 id, sector_t sector,
+			      struct rb_root *root, const char *func,
+			      enum drbd_req_event what, bool missing_ok)
+{
+	struct drbd_request *req;
+	struct bio_and_error m;
+
+	spin_lock_irq(&device->resource->req_lock);
+	req = find_request(device, root, id, sector, missing_ok, func);
+	if (unlikely(!req)) {
+		spin_unlock_irq(&device->resource->req_lock);
+		return -EIO;
+	}
+	__req_mod(req, what, &m);
+	spin_unlock_irq(&device->resource->req_lock);
+
+	if (m.bio)
+		complete_master_bio(device, &m);
+	return 0;
+}
+
+static int got_BlockAck(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_peer_device *peer_device;
+	struct drbd_device *device;
+	struct p_block_ack *p = pi->data;
+	sector_t sector = be64_to_cpu(p->sector);
+	int blksize = be32_to_cpu(p->blksize);
+	enum drbd_req_event what;
+
+	peer_device = conn_peer_device(connection, pi->vnr);
+	if (!peer_device)
+		return -EIO;
+	device = peer_device->device;
+
+	update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
+
+	if (p->block_id == ID_SYNCER) {
+		drbd_set_in_sync(device, sector, blksize);
+		dec_rs_pending(device);
+		return 0;
+	}
+	switch (pi->cmd) {
+	case P_RS_WRITE_ACK:
+		what = WRITE_ACKED_BY_PEER_AND_SIS;
+		break;
+	case P_WRITE_ACK:
+		what = WRITE_ACKED_BY_PEER;
+		break;
+	case P_RECV_ACK:
+		what = RECV_ACKED_BY_PEER;
+		break;
+	case P_SUPERSEDED:
+		what = CONFLICT_RESOLVED;
+		break;
+	case P_RETRY_WRITE:
+		what = POSTPONE_WRITE;
+		break;
+	default:
+		BUG();
+	}
+
+	return validate_req_change_req_state(device, p->block_id, sector,
+					     &device->write_requests, __func__,
+					     what, false);
+}
+
+static int got_NegAck(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_peer_device *peer_device;
+	struct drbd_device *device;
+	struct p_block_ack *p = pi->data;
+	sector_t sector = be64_to_cpu(p->sector);
+	int size = be32_to_cpu(p->blksize);
+	int err;
+
+	peer_device = conn_peer_device(connection, pi->vnr);
+	if (!peer_device)
+		return -EIO;
+	device = peer_device->device;
+
+	update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
+
+	if (p->block_id == ID_SYNCER) {
+		dec_rs_pending(device);
+		drbd_rs_failed_io(device, sector, size);
+		return 0;
+	}
+
+	err = validate_req_change_req_state(device, p->block_id, sector,
+					    &device->write_requests, __func__,
+					    NEG_ACKED, true);
+	if (err) {
+		/* Protocol A has no P_WRITE_ACKs, but has P_NEG_ACKs.
+		   The master bio might already be completed, therefore the
+		   request is no longer in the collision hash. */
+		/* In Protocol B we might already have got a P_RECV_ACK
+		   but then get a P_NEG_ACK afterwards. */
+		drbd_set_out_of_sync(device, sector, size);
+	}
+	return 0;
+}
+
+static int got_NegDReply(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_peer_device *peer_device;
+	struct drbd_device *device;
+	struct p_block_ack *p = pi->data;
+	sector_t sector = be64_to_cpu(p->sector);
+
+	peer_device = conn_peer_device(connection, pi->vnr);
+	if (!peer_device)
+		return -EIO;
+	device = peer_device->device;
+
+	update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
+
+	drbd_err(device, "Got NegDReply; Sector %llus, len %u.\n",
+	    (unsigned long long)sector, be32_to_cpu(p->blksize));
+
+	return validate_req_change_req_state(device, p->block_id, sector,
+					     &device->read_requests, __func__,
+					     NEG_ACKED, false);
+}
+
+static int got_NegRSDReply(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_peer_device *peer_device;
+	struct drbd_device *device;
+	sector_t sector;
+	int size;
+	struct p_block_ack *p = pi->data;
+
+	peer_device = conn_peer_device(connection, pi->vnr);
+	if (!peer_device)
+		return -EIO;
+	device = peer_device->device;
+
+	sector = be64_to_cpu(p->sector);
+	size = be32_to_cpu(p->blksize);
+
+	update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
+
+	dec_rs_pending(device);
+
+	if (get_ldev_if_state(device, D_FAILED)) {
+		drbd_rs_complete_io(device, sector);
+		switch (pi->cmd) {
+		case P_NEG_RS_DREPLY:
+			drbd_rs_failed_io(device, sector, size);
+			break;
+		case P_RS_CANCEL:
+			break;
+		default:
+			BUG();
+		}
+		put_ldev(device);
+	}
+
+	return 0;
+}
+
+static int got_BarrierAck(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct p_barrier_ack *p = pi->data;
+	struct drbd_peer_device *peer_device;
+	int vnr;
+
+	tl_release(connection, p->barrier, be32_to_cpu(p->set_size));
+
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+		struct drbd_device *device = peer_device->device;
+
+		if (device->state.conn == C_AHEAD &&
+		    atomic_read(&device->ap_in_flight) == 0 &&
+		    !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &device->flags)) {
+			device->start_resync_timer.expires = jiffies + HZ;
+			add_timer(&device->start_resync_timer);
+		}
+	}
+	rcu_read_unlock();
+
+	return 0;
+}
+
+static int got_OVResult(struct drbd_connection *connection, struct packet_info *pi)
+{
+	struct drbd_peer_device *peer_device;
+	struct drbd_device *device;
+	struct p_block_ack *p = pi->data;
+	struct drbd_device_work *dw;
+	sector_t sector;
+	int size;
+
+	peer_device = conn_peer_device(connection, pi->vnr);
+	if (!peer_device)
+		return -EIO;
+	device = peer_device->device;
+
+	sector = be64_to_cpu(p->sector);
+	size = be32_to_cpu(p->blksize);
+
+	update_peer_seq(peer_device, be32_to_cpu(p->seq_num));
+
+	if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC)
+		drbd_ov_out_of_sync_found(device, sector, size);
+	else
+		ov_out_of_sync_print(device);
+
+	if (!get_ldev(device))
+		return 0;
+
+	drbd_rs_complete_io(device, sector);
+	dec_rs_pending(device);
+
+	--device->ov_left;
+
+	/* let's advance progress step marks only for every other megabyte */
+	if ((device->ov_left & 0x200) == 0x200)
+		drbd_advance_rs_marks(device, device->ov_left);
+
+	if (device->ov_left == 0) {
+		dw = kmalloc(sizeof(*dw), GFP_NOIO);
+		if (dw) {
+			dw->w.cb = w_ov_finished;
+			dw->device = device;
+			drbd_queue_work(&peer_device->connection->sender_work, &dw->w);
+		} else {
+			drbd_err(device, "kmalloc(dw) failed.");
+			ov_out_of_sync_print(device);
+			drbd_resync_finished(device);
+		}
+	}
+	put_ldev(device);
+	return 0;
+}
+
+static int got_skip(struct drbd_connection *connection, struct packet_info *pi)
+{
+	return 0;
+}
+
+struct meta_sock_cmd {
+	size_t pkt_size;
+	int (*fn)(struct drbd_connection *connection, struct packet_info *);
+};
+
+static void set_rcvtimeo(struct drbd_connection *connection, bool ping_timeout)
+{
+	long t;
+	struct net_conf *nc;
+
+	rcu_read_lock();
+	nc = rcu_dereference(connection->net_conf);
+	t = ping_timeout ? nc->ping_timeo : nc->ping_int;
+	rcu_read_unlock();
+
+	t *= HZ;
+	if (ping_timeout)
+		t /= 10;
+
+	connection->meta.socket->sk->sk_rcvtimeo = t;
+}
+
+static void set_ping_timeout(struct drbd_connection *connection)
+{
+	set_rcvtimeo(connection, 1);
+}
+
+static void set_idle_timeout(struct drbd_connection *connection)
+{
+	set_rcvtimeo(connection, 0);
+}
+
+static struct meta_sock_cmd ack_receiver_tbl[] = {
+	[P_PING]	    = { 0, got_Ping },
+	[P_PING_ACK]	    = { 0, got_PingAck },
+	[P_RECV_ACK]	    = { sizeof(struct p_block_ack), got_BlockAck },
+	[P_WRITE_ACK]	    = { sizeof(struct p_block_ack), got_BlockAck },
+	[P_RS_WRITE_ACK]    = { sizeof(struct p_block_ack), got_BlockAck },
+	[P_SUPERSEDED]   = { sizeof(struct p_block_ack), got_BlockAck },
+	[P_NEG_ACK]	    = { sizeof(struct p_block_ack), got_NegAck },
+	[P_NEG_DREPLY]	    = { sizeof(struct p_block_ack), got_NegDReply },
+	[P_NEG_RS_DREPLY]   = { sizeof(struct p_block_ack), got_NegRSDReply },
+	[P_OV_RESULT]	    = { sizeof(struct p_block_ack), got_OVResult },
+	[P_BARRIER_ACK]	    = { sizeof(struct p_barrier_ack), got_BarrierAck },
+	[P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
+	[P_RS_IS_IN_SYNC]   = { sizeof(struct p_block_ack), got_IsInSync },
+	[P_DELAY_PROBE]     = { sizeof(struct p_delay_probe93), got_skip },
+	[P_RS_CANCEL]       = { sizeof(struct p_block_ack), got_NegRSDReply },
+	[P_CONN_ST_CHG_REPLY]={ sizeof(struct p_req_state_reply), got_conn_RqSReply },
+	[P_RETRY_WRITE]	    = { sizeof(struct p_block_ack), got_BlockAck },
+};
+
+int drbd_ack_receiver(struct drbd_thread *thi)
+{
+	struct drbd_connection *connection = thi->connection;
+	struct meta_sock_cmd *cmd = NULL;
+	struct packet_info pi;
+	unsigned long pre_recv_jif;
+	int rv;
+	void *buf    = connection->meta.rbuf;
+	int received = 0;
+	unsigned int header_size = drbd_header_size(connection);
+	int expect   = header_size;
+	bool ping_timeout_active = false;
+
+	sched_set_fifo_low(current);
+
+	while (get_t_state(thi) == RUNNING) {
+		drbd_thread_current_set_cpu(thi);
+
+		conn_reclaim_net_peer_reqs(connection);
+
+		if (test_and_clear_bit(SEND_PING, &connection->flags)) {
+			if (drbd_send_ping(connection)) {
+				drbd_err(connection, "drbd_send_ping has failed\n");
+				goto reconnect;
+			}
+			set_ping_timeout(connection);
+			ping_timeout_active = true;
+		}
+
+		pre_recv_jif = jiffies;
+		rv = drbd_recv_short(connection->meta.socket, buf, expect-received, 0);
+
+		/* Note:
+		 * -EINTR	 (on meta) we got a signal
+		 * -EAGAIN	 (on meta) rcvtimeo expired
+		 * -ECONNRESET	 other side closed the connection
+		 * -ERESTARTSYS  (on data) we got a signal
+		 * rv <  0	 other than above: unexpected error!
+		 * rv == expected: full header or command
+		 * rv <  expected: "woken" by signal during receive
+		 * rv == 0	 : "connection shut down by peer"
+		 */
+		if (likely(rv > 0)) {
+			received += rv;
+			buf	 += rv;
+		} else if (rv == 0) {
+			if (test_bit(DISCONNECT_SENT, &connection->flags)) {
+				long t;
+				rcu_read_lock();
+				t = rcu_dereference(connection->net_conf)->ping_timeo * HZ/10;
+				rcu_read_unlock();
+
+				t = wait_event_timeout(connection->ping_wait,
+						       connection->cstate < C_WF_REPORT_PARAMS,
+						       t);
+				if (t)
+					break;
+			}
+			drbd_err(connection, "meta connection shut down by peer.\n");
+			goto reconnect;
+		} else if (rv == -EAGAIN) {
+			/* If the data socket received something meanwhile,
+			 * that is good enough: peer is still alive. */
+			if (time_after(connection->last_received, pre_recv_jif))
+				continue;
+			if (ping_timeout_active) {
+				drbd_err(connection, "PingAck did not arrive in time.\n");
+				goto reconnect;
+			}
+			set_bit(SEND_PING, &connection->flags);
+			continue;
+		} else if (rv == -EINTR) {
+			/* maybe drbd_thread_stop(): the while condition will notice.
+			 * maybe woken for send_ping: we'll send a ping above,
+			 * and change the rcvtimeo */
+			flush_signals(current);
+			continue;
+		} else {
+			drbd_err(connection, "sock_recvmsg returned %d\n", rv);
+			goto reconnect;
+		}
+
+		if (received == expect && cmd == NULL) {
+			if (decode_header(connection, connection->meta.rbuf, &pi))
+				goto reconnect;
+			cmd = &ack_receiver_tbl[pi.cmd];
+			if (pi.cmd >= ARRAY_SIZE(ack_receiver_tbl) || !cmd->fn) {
+				drbd_err(connection, "Unexpected meta packet %s (0x%04x)\n",
+					 cmdname(pi.cmd), pi.cmd);
+				goto disconnect;
+			}
+			expect = header_size + cmd->pkt_size;
+			if (pi.size != expect - header_size) {
+				drbd_err(connection, "Wrong packet size on meta (c: %d, l: %d)\n",
+					pi.cmd, pi.size);
+				goto reconnect;
+			}
+		}
+		if (received == expect) {
+			bool err;
+
+			err = cmd->fn(connection, &pi);
+			if (err) {
+				drbd_err(connection, "%ps failed\n", cmd->fn);
+				goto reconnect;
+			}
+
+			connection->last_received = jiffies;
+
+			if (cmd == &ack_receiver_tbl[P_PING_ACK]) {
+				set_idle_timeout(connection);
+				ping_timeout_active = false;
+			}
+
+			buf	 = connection->meta.rbuf;
+			received = 0;
+			expect	 = header_size;
+			cmd	 = NULL;
+		}
+	}
+
+	if (0) {
+reconnect:
+		conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
+		conn_md_sync(connection);
+	}
+	if (0) {
+disconnect:
+		conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
+	}
+
+	drbd_info(connection, "ack_receiver terminated\n");
+
+	return 0;
+}
+
+void drbd_send_acks_wf(struct work_struct *ws)
+{
+	struct drbd_peer_device *peer_device =
+		container_of(ws, struct drbd_peer_device, send_acks_work);
+	struct drbd_connection *connection = peer_device->connection;
+	struct drbd_device *device = peer_device->device;
+	struct net_conf *nc;
+	int tcp_cork, err;
+
+	rcu_read_lock();
+	nc = rcu_dereference(connection->net_conf);
+	tcp_cork = nc->tcp_cork;
+	rcu_read_unlock();
+
+	if (tcp_cork)
+		tcp_sock_set_cork(connection->meta.socket->sk, true);
+
+	err = drbd_finish_peer_reqs(device);
+	kref_put(&device->kref, drbd_destroy_device);
+	/* get is in drbd_endio_write_sec_final(). That is necessary to keep the
+	   struct work_struct send_acks_work alive, which is in the peer_device object */
+
+	if (err) {
+		conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
+		return;
+	}
+
+	if (tcp_cork)
+		tcp_sock_set_cork(connection->meta.socket->sk, false);
+
+	return;
+}
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
new file mode 100644
index 000000000..d700bf06b
--- /dev/null
+++ b/drivers/block/drbd/drbd_req.c
@@ -0,0 +1,1783 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+   drbd_req.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+
+ */
+
+#include <linux/module.h>
+
+#include <linux/slab.h>
+#include <linux/drbd.h>
+#include "drbd_int.h"
+#include "drbd_req.h"
+
+
+static bool drbd_may_do_local_read(struct drbd_device *device, sector_t sector, int size);
+
+static struct drbd_request *drbd_req_new(struct drbd_device *device, struct bio *bio_src)
+{
+	struct drbd_request *req;
+
+	req = mempool_alloc(&drbd_request_mempool, GFP_NOIO);
+	if (!req)
+		return NULL;
+	memset(req, 0, sizeof(*req));
+
+	req->rq_state = (bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0)
+		      | (bio_op(bio_src) == REQ_OP_WRITE_ZEROES ? RQ_ZEROES : 0)
+		      | (bio_op(bio_src) == REQ_OP_DISCARD ? RQ_UNMAP : 0);
+	req->device = device;
+	req->master_bio = bio_src;
+	req->epoch = 0;
+
+	drbd_clear_interval(&req->i);
+	req->i.sector     = bio_src->bi_iter.bi_sector;
+	req->i.size      = bio_src->bi_iter.bi_size;
+	req->i.local = true;
+	req->i.waiting = false;
+
+	INIT_LIST_HEAD(&req->tl_requests);
+	INIT_LIST_HEAD(&req->w.list);
+	INIT_LIST_HEAD(&req->req_pending_master_completion);
+	INIT_LIST_HEAD(&req->req_pending_local);
+
+	/* one reference to be put by __drbd_make_request */
+	atomic_set(&req->completion_ref, 1);
+	/* one kref as long as completion_ref > 0 */
+	kref_init(&req->kref);
+	return req;
+}
+
+static void drbd_remove_request_interval(struct rb_root *root,
+					 struct drbd_request *req)
+{
+	struct drbd_device *device = req->device;
+	struct drbd_interval *i = &req->i;
+
+	drbd_remove_interval(root, i);
+
+	/* Wake up any processes waiting for this request to complete.  */
+	if (i->waiting)
+		wake_up(&device->misc_wait);
+}
+
+void drbd_req_destroy(struct kref *kref)
+{
+	struct drbd_request *req = container_of(kref, struct drbd_request, kref);
+	struct drbd_device *device = req->device;
+	const unsigned s = req->rq_state;
+
+	if ((req->master_bio && !(s & RQ_POSTPONED)) ||
+		atomic_read(&req->completion_ref) ||
+		(s & RQ_LOCAL_PENDING) ||
+		((s & RQ_NET_MASK) && !(s & RQ_NET_DONE))) {
+		drbd_err(device, "drbd_req_destroy: Logic BUG rq_state = 0x%x, completion_ref = %d\n",
+				s, atomic_read(&req->completion_ref));
+		return;
+	}
+
+	/* If called from mod_rq_state (expected normal case) or
+	 * drbd_send_and_submit (the less likely normal path), this holds the
+	 * req_lock, and req->tl_requests will typicaly be on ->transfer_log,
+	 * though it may be still empty (never added to the transfer log).
+	 *
+	 * If called from do_retry(), we do NOT hold the req_lock, but we are
+	 * still allowed to unconditionally list_del(&req->tl_requests),
+	 * because it will be on a local on-stack list only. */
+	list_del_init(&req->tl_requests);
+
+	/* finally remove the request from the conflict detection
+	 * respective block_id verification interval tree. */
+	if (!drbd_interval_empty(&req->i)) {
+		struct rb_root *root;
+
+		if (s & RQ_WRITE)
+			root = &device->write_requests;
+		else
+			root = &device->read_requests;
+		drbd_remove_request_interval(root, req);
+	} else if (s & (RQ_NET_MASK & ~RQ_NET_DONE) && req->i.size != 0)
+		drbd_err(device, "drbd_req_destroy: Logic BUG: interval empty, but: rq_state=0x%x, sect=%llu, size=%u\n",
+			s, (unsigned long long)req->i.sector, req->i.size);
+
+	/* if it was a write, we may have to set the corresponding
+	 * bit(s) out-of-sync first. If it had a local part, we need to
+	 * release the reference to the activity log. */
+	if (s & RQ_WRITE) {
+		/* Set out-of-sync unless both OK flags are set
+		 * (local only or remote failed).
+		 * Other places where we set out-of-sync:
+		 * READ with local io-error */
+
+		/* There is a special case:
+		 * we may notice late that IO was suspended,
+		 * and postpone, or schedule for retry, a write,
+		 * before it even was submitted or sent.
+		 * In that case we do not want to touch the bitmap at all.
+		 */
+		if ((s & (RQ_POSTPONED|RQ_LOCAL_MASK|RQ_NET_MASK)) != RQ_POSTPONED) {
+			if (!(s & RQ_NET_OK) || !(s & RQ_LOCAL_OK))
+				drbd_set_out_of_sync(device, req->i.sector, req->i.size);
+
+			if ((s & RQ_NET_OK) && (s & RQ_LOCAL_OK) && (s & RQ_NET_SIS))
+				drbd_set_in_sync(device, req->i.sector, req->i.size);
+		}
+
+		/* one might be tempted to move the drbd_al_complete_io
+		 * to the local io completion callback drbd_request_endio.
+		 * but, if this was a mirror write, we may only
+		 * drbd_al_complete_io after this is RQ_NET_DONE,
+		 * otherwise the extent could be dropped from the al
+		 * before it has actually been written on the peer.
+		 * if we crash before our peer knows about the request,
+		 * but after the extent has been dropped from the al,
+		 * we would forget to resync the corresponding extent.
+		 */
+		if (s & RQ_IN_ACT_LOG) {
+			if (get_ldev_if_state(device, D_FAILED)) {
+				drbd_al_complete_io(device, &req->i);
+				put_ldev(device);
+			} else if (__ratelimit(&drbd_ratelimit_state)) {
+				drbd_warn(device, "Should have called drbd_al_complete_io(, %llu, %u), "
+					 "but my Disk seems to have failed :(\n",
+					 (unsigned long long) req->i.sector, req->i.size);
+			}
+		}
+	}
+
+	mempool_free(req, &drbd_request_mempool);
+}
+
+static void wake_all_senders(struct drbd_connection *connection)
+{
+	wake_up(&connection->sender_work.q_wait);
+}
+
+/* must hold resource->req_lock */
+void start_new_tl_epoch(struct drbd_connection *connection)
+{
+	/* no point closing an epoch, if it is empty, anyways. */
+	if (connection->current_tle_writes == 0)
+		return;
+
+	connection->current_tle_writes = 0;
+	atomic_inc(&connection->current_tle_nr);
+	wake_all_senders(connection);
+}
+
+void complete_master_bio(struct drbd_device *device,
+		struct bio_and_error *m)
+{
+	if (unlikely(m->error))
+		m->bio->bi_status = errno_to_blk_status(m->error);
+	bio_endio(m->bio);
+	dec_ap_bio(device);
+}
+
+
+/* Helper for __req_mod().
+ * Set m->bio to the master bio, if it is fit to be completed,
+ * or leave it alone (it is initialized to NULL in __req_mod),
+ * if it has already been completed, or cannot be completed yet.
+ * If m->bio is set, the error status to be returned is placed in m->error.
+ */
+static
+void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m)
+{
+	const unsigned s = req->rq_state;
+	struct drbd_device *device = req->device;
+	int error, ok;
+
+	/* we must not complete the master bio, while it is
+	 *	still being processed by _drbd_send_zc_bio (drbd_send_dblock)
+	 *	not yet acknowledged by the peer
+	 *	not yet completed by the local io subsystem
+	 * these flags may get cleared in any order by
+	 *	the worker,
+	 *	the receiver,
+	 *	the bio_endio completion callbacks.
+	 */
+	if ((s & RQ_LOCAL_PENDING && !(s & RQ_LOCAL_ABORTED)) ||
+	    (s & RQ_NET_QUEUED) || (s & RQ_NET_PENDING) ||
+	    (s & RQ_COMPLETION_SUSP)) {
+		drbd_err(device, "drbd_req_complete: Logic BUG rq_state = 0x%x\n", s);
+		return;
+	}
+
+	if (!req->master_bio) {
+		drbd_err(device, "drbd_req_complete: Logic BUG, master_bio == NULL!\n");
+		return;
+	}
+
+	/*
+	 * figure out whether to report success or failure.
+	 *
+	 * report success when at least one of the operations succeeded.
+	 * or, to put the other way,
+	 * only report failure, when both operations failed.
+	 *
+	 * what to do about the failures is handled elsewhere.
+	 * what we need to do here is just: complete the master_bio.
+	 *
+	 * local completion error, if any, has been stored as ERR_PTR
+	 * in private_bio within drbd_request_endio.
+	 */
+	ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK);
+	error = PTR_ERR(req->private_bio);
+
+	/* Before we can signal completion to the upper layers,
+	 * we may need to close the current transfer log epoch.
+	 * We are within the request lock, so we can simply compare
+	 * the request epoch number with the current transfer log
+	 * epoch number.  If they match, increase the current_tle_nr,
+	 * and reset the transfer log epoch write_cnt.
+	 */
+	if (op_is_write(bio_op(req->master_bio)) &&
+	    req->epoch == atomic_read(&first_peer_device(device)->connection->current_tle_nr))
+		start_new_tl_epoch(first_peer_device(device)->connection);
+
+	/* Update disk stats */
+	bio_end_io_acct(req->master_bio, req->start_jif);
+
+	/* If READ failed,
+	 * have it be pushed back to the retry work queue,
+	 * so it will re-enter __drbd_make_request(),
+	 * and be re-assigned to a suitable local or remote path,
+	 * or failed if we do not have access to good data anymore.
+	 *
+	 * Unless it was failed early by __drbd_make_request(),
+	 * because no path was available, in which case
+	 * it was not even added to the transfer_log.
+	 *
+	 * read-ahead may fail, and will not be retried.
+	 *
+	 * WRITE should have used all available paths already.
+	 */
+	if (!ok &&
+	    bio_op(req->master_bio) == REQ_OP_READ &&
+	    !(req->master_bio->bi_opf & REQ_RAHEAD) &&
+	    !list_empty(&req->tl_requests))
+		req->rq_state |= RQ_POSTPONED;
+
+	if (!(req->rq_state & RQ_POSTPONED)) {
+		m->error = ok ? 0 : (error ?: -EIO);
+		m->bio = req->master_bio;
+		req->master_bio = NULL;
+		/* We leave it in the tree, to be able to verify later
+		 * write-acks in protocol != C during resync.
+		 * But we mark it as "complete", so it won't be counted as
+		 * conflict in a multi-primary setup. */
+		req->i.completed = true;
+	}
+
+	if (req->i.waiting)
+		wake_up(&device->misc_wait);
+
+	/* Either we are about to complete to upper layers,
+	 * or we will restart this request.
+	 * In either case, the request object will be destroyed soon,
+	 * so better remove it from all lists. */
+	list_del_init(&req->req_pending_master_completion);
+}
+
+/* still holds resource->req_lock */
+static void drbd_req_put_completion_ref(struct drbd_request *req, struct bio_and_error *m, int put)
+{
+	struct drbd_device *device = req->device;
+	D_ASSERT(device, m || (req->rq_state & RQ_POSTPONED));
+
+	if (!put)
+		return;
+
+	if (!atomic_sub_and_test(put, &req->completion_ref))
+		return;
+
+	drbd_req_complete(req, m);
+
+	/* local completion may still come in later,
+	 * we need to keep the req object around. */
+	if (req->rq_state & RQ_LOCAL_ABORTED)
+		return;
+
+	if (req->rq_state & RQ_POSTPONED) {
+		/* don't destroy the req object just yet,
+		 * but queue it for retry */
+		drbd_restart_request(req);
+		return;
+	}
+
+	kref_put(&req->kref, drbd_req_destroy);
+}
+
+static void set_if_null_req_next(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+	struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+	if (!connection)
+		return;
+	if (connection->req_next == NULL)
+		connection->req_next = req;
+}
+
+static void advance_conn_req_next(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+	struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+	struct drbd_request *iter = req;
+	if (!connection)
+		return;
+	if (connection->req_next != req)
+		return;
+
+	req = NULL;
+	list_for_each_entry_continue(iter, &connection->transfer_log, tl_requests) {
+		const unsigned int s = iter->rq_state;
+
+		if (s & RQ_NET_QUEUED) {
+			req = iter;
+			break;
+		}
+	}
+	connection->req_next = req;
+}
+
+static void set_if_null_req_ack_pending(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+	struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+	if (!connection)
+		return;
+	if (connection->req_ack_pending == NULL)
+		connection->req_ack_pending = req;
+}
+
+static void advance_conn_req_ack_pending(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+	struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+	struct drbd_request *iter = req;
+	if (!connection)
+		return;
+	if (connection->req_ack_pending != req)
+		return;
+
+	req = NULL;
+	list_for_each_entry_continue(iter, &connection->transfer_log, tl_requests) {
+		const unsigned int s = iter->rq_state;
+
+		if ((s & RQ_NET_SENT) && (s & RQ_NET_PENDING)) {
+			req = iter;
+			break;
+		}
+	}
+	connection->req_ack_pending = req;
+}
+
+static void set_if_null_req_not_net_done(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+	struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+	if (!connection)
+		return;
+	if (connection->req_not_net_done == NULL)
+		connection->req_not_net_done = req;
+}
+
+static void advance_conn_req_not_net_done(struct drbd_peer_device *peer_device, struct drbd_request *req)
+{
+	struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+	struct drbd_request *iter = req;
+	if (!connection)
+		return;
+	if (connection->req_not_net_done != req)
+		return;
+
+	req = NULL;
+	list_for_each_entry_continue(iter, &connection->transfer_log, tl_requests) {
+		const unsigned int s = iter->rq_state;
+
+		if ((s & RQ_NET_SENT) && !(s & RQ_NET_DONE)) {
+			req = iter;
+			break;
+		}
+	}
+	connection->req_not_net_done = req;
+}
+
+/* I'd like this to be the only place that manipulates
+ * req->completion_ref and req->kref. */
+static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
+		int clear, int set)
+{
+	struct drbd_device *device = req->device;
+	struct drbd_peer_device *peer_device = first_peer_device(device);
+	unsigned s = req->rq_state;
+	int c_put = 0;
+
+	if (drbd_suspended(device) && !((s | clear) & RQ_COMPLETION_SUSP))
+		set |= RQ_COMPLETION_SUSP;
+
+	/* apply */
+
+	req->rq_state &= ~clear;
+	req->rq_state |= set;
+
+	/* no change? */
+	if (req->rq_state == s)
+		return;
+
+	/* intent: get references */
+
+	kref_get(&req->kref);
+
+	if (!(s & RQ_LOCAL_PENDING) && (set & RQ_LOCAL_PENDING))
+		atomic_inc(&req->completion_ref);
+
+	if (!(s & RQ_NET_PENDING) && (set & RQ_NET_PENDING)) {
+		inc_ap_pending(device);
+		atomic_inc(&req->completion_ref);
+	}
+
+	if (!(s & RQ_NET_QUEUED) && (set & RQ_NET_QUEUED)) {
+		atomic_inc(&req->completion_ref);
+		set_if_null_req_next(peer_device, req);
+	}
+
+	if (!(s & RQ_EXP_BARR_ACK) && (set & RQ_EXP_BARR_ACK))
+		kref_get(&req->kref); /* wait for the DONE */
+
+	if (!(s & RQ_NET_SENT) && (set & RQ_NET_SENT)) {
+		/* potentially already completed in the ack_receiver thread */
+		if (!(s & RQ_NET_DONE)) {
+			atomic_add(req->i.size >> 9, &device->ap_in_flight);
+			set_if_null_req_not_net_done(peer_device, req);
+		}
+		if (req->rq_state & RQ_NET_PENDING)
+			set_if_null_req_ack_pending(peer_device, req);
+	}
+
+	if (!(s & RQ_COMPLETION_SUSP) && (set & RQ_COMPLETION_SUSP))
+		atomic_inc(&req->completion_ref);
+
+	/* progress: put references */
+
+	if ((s & RQ_COMPLETION_SUSP) && (clear & RQ_COMPLETION_SUSP))
+		++c_put;
+
+	if (!(s & RQ_LOCAL_ABORTED) && (set & RQ_LOCAL_ABORTED)) {
+		D_ASSERT(device, req->rq_state & RQ_LOCAL_PENDING);
+		++c_put;
+	}
+
+	if ((s & RQ_LOCAL_PENDING) && (clear & RQ_LOCAL_PENDING)) {
+		if (req->rq_state & RQ_LOCAL_ABORTED)
+			kref_put(&req->kref, drbd_req_destroy);
+		else
+			++c_put;
+		list_del_init(&req->req_pending_local);
+	}
+
+	if ((s & RQ_NET_PENDING) && (clear & RQ_NET_PENDING)) {
+		dec_ap_pending(device);
+		++c_put;
+		req->acked_jif = jiffies;
+		advance_conn_req_ack_pending(peer_device, req);
+	}
+
+	if ((s & RQ_NET_QUEUED) && (clear & RQ_NET_QUEUED)) {
+		++c_put;
+		advance_conn_req_next(peer_device, req);
+	}
+
+	if (!(s & RQ_NET_DONE) && (set & RQ_NET_DONE)) {
+		if (s & RQ_NET_SENT)
+			atomic_sub(req->i.size >> 9, &device->ap_in_flight);
+		if (s & RQ_EXP_BARR_ACK)
+			kref_put(&req->kref, drbd_req_destroy);
+		req->net_done_jif = jiffies;
+
+		/* in ahead/behind mode, or just in case,
+		 * before we finally destroy this request,
+		 * the caching pointers must not reference it anymore */
+		advance_conn_req_next(peer_device, req);
+		advance_conn_req_ack_pending(peer_device, req);
+		advance_conn_req_not_net_done(peer_device, req);
+	}
+
+	/* potentially complete and destroy */
+
+	/* If we made progress, retry conflicting peer requests, if any. */
+	if (req->i.waiting)
+		wake_up(&device->misc_wait);
+
+	drbd_req_put_completion_ref(req, m, c_put);
+	kref_put(&req->kref, drbd_req_destroy);
+}
+
+static void drbd_report_io_error(struct drbd_device *device, struct drbd_request *req)
+{
+	if (!__ratelimit(&drbd_ratelimit_state))
+		return;
+
+	drbd_warn(device, "local %s IO error sector %llu+%u on %pg\n",
+			(req->rq_state & RQ_WRITE) ? "WRITE" : "READ",
+			(unsigned long long)req->i.sector,
+			req->i.size >> 9,
+			device->ldev->backing_bdev);
+}
+
+/* Helper for HANDED_OVER_TO_NETWORK.
+ * Is this a protocol A write (neither WRITE_ACK nor RECEIVE_ACK expected)?
+ * Is it also still "PENDING"?
+ * --> If so, clear PENDING and set NET_OK below.
+ * If it is a protocol A write, but not RQ_PENDING anymore, neg-ack was faster
+ * (and we must not set RQ_NET_OK) */
+static inline bool is_pending_write_protocol_A(struct drbd_request *req)
+{
+	return (req->rq_state &
+		   (RQ_WRITE|RQ_NET_PENDING|RQ_EXP_WRITE_ACK|RQ_EXP_RECEIVE_ACK))
+		== (RQ_WRITE|RQ_NET_PENDING);
+}
+
+/* obviously this could be coded as many single functions
+ * instead of one huge switch,
+ * or by putting the code directly in the respective locations
+ * (as it has been before).
+ *
+ * but having it this way
+ *  enforces that it is all in this one place, where it is easier to audit,
+ *  it makes it obvious that whatever "event" "happens" to a request should
+ *  happen "atomically" within the req_lock,
+ *  and it enforces that we have to think in a very structured manner
+ *  about the "events" that may happen to a request during its life time ...
+ */
+int __req_mod(struct drbd_request *req, enum drbd_req_event what,
+		struct bio_and_error *m)
+{
+	struct drbd_device *const device = req->device;
+	struct drbd_peer_device *const peer_device = first_peer_device(device);
+	struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
+	struct net_conf *nc;
+	int p, rv = 0;
+
+	if (m)
+		m->bio = NULL;
+
+	switch (what) {
+	default:
+		drbd_err(device, "LOGIC BUG in %s:%u\n", __FILE__ , __LINE__);
+		break;
+
+	/* does not happen...
+	 * initialization done in drbd_req_new
+	case CREATED:
+		break;
+		*/
+
+	case TO_BE_SENT: /* via network */
+		/* reached via __drbd_make_request
+		 * and from w_read_retry_remote */
+		D_ASSERT(device, !(req->rq_state & RQ_NET_MASK));
+		rcu_read_lock();
+		nc = rcu_dereference(connection->net_conf);
+		p = nc->wire_protocol;
+		rcu_read_unlock();
+		req->rq_state |=
+			p == DRBD_PROT_C ? RQ_EXP_WRITE_ACK :
+			p == DRBD_PROT_B ? RQ_EXP_RECEIVE_ACK : 0;
+		mod_rq_state(req, m, 0, RQ_NET_PENDING);
+		break;
+
+	case TO_BE_SUBMITTED: /* locally */
+		/* reached via __drbd_make_request */
+		D_ASSERT(device, !(req->rq_state & RQ_LOCAL_MASK));
+		mod_rq_state(req, m, 0, RQ_LOCAL_PENDING);
+		break;
+
+	case COMPLETED_OK:
+		if (req->rq_state & RQ_WRITE)
+			device->writ_cnt += req->i.size >> 9;
+		else
+			device->read_cnt += req->i.size >> 9;
+
+		mod_rq_state(req, m, RQ_LOCAL_PENDING,
+				RQ_LOCAL_COMPLETED|RQ_LOCAL_OK);
+		break;
+
+	case ABORT_DISK_IO:
+		mod_rq_state(req, m, 0, RQ_LOCAL_ABORTED);
+		break;
+
+	case WRITE_COMPLETED_WITH_ERROR:
+		drbd_report_io_error(device, req);
+		__drbd_chk_io_error(device, DRBD_WRITE_ERROR);
+		mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED);
+		break;
+
+	case READ_COMPLETED_WITH_ERROR:
+		drbd_set_out_of_sync(device, req->i.sector, req->i.size);
+		drbd_report_io_error(device, req);
+		__drbd_chk_io_error(device, DRBD_READ_ERROR);
+		fallthrough;
+	case READ_AHEAD_COMPLETED_WITH_ERROR:
+		/* it is legal to fail read-ahead, no __drbd_chk_io_error in that case. */
+		mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED);
+		break;
+
+	case DISCARD_COMPLETED_NOTSUPP:
+	case DISCARD_COMPLETED_WITH_ERROR:
+		/* I'd rather not detach from local disk just because it
+		 * failed a REQ_OP_DISCARD. */
+		mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED);
+		break;
+
+	case QUEUE_FOR_NET_READ:
+		/* READ, and
+		 * no local disk,
+		 * or target area marked as invalid,
+		 * or just got an io-error. */
+		/* from __drbd_make_request
+		 * or from bio_endio during read io-error recovery */
+
+		/* So we can verify the handle in the answer packet.
+		 * Corresponding drbd_remove_request_interval is in
+		 * drbd_req_complete() */
+		D_ASSERT(device, drbd_interval_empty(&req->i));
+		drbd_insert_interval(&device->read_requests, &req->i);
+
+		set_bit(UNPLUG_REMOTE, &device->flags);
+
+		D_ASSERT(device, req->rq_state & RQ_NET_PENDING);
+		D_ASSERT(device, (req->rq_state & RQ_LOCAL_MASK) == 0);
+		mod_rq_state(req, m, 0, RQ_NET_QUEUED);
+		req->w.cb = w_send_read_req;
+		drbd_queue_work(&connection->sender_work,
+				&req->w);
+		break;
+
+	case QUEUE_FOR_NET_WRITE:
+		/* assert something? */
+		/* from __drbd_make_request only */
+
+		/* Corresponding drbd_remove_request_interval is in
+		 * drbd_req_complete() */
+		D_ASSERT(device, drbd_interval_empty(&req->i));
+		drbd_insert_interval(&device->write_requests, &req->i);
+
+		/* NOTE
+		 * In case the req ended up on the transfer log before being
+		 * queued on the worker, it could lead to this request being
+		 * missed during cleanup after connection loss.
+		 * So we have to do both operations here,
+		 * within the same lock that protects the transfer log.
+		 *
+		 * _req_add_to_epoch(req); this has to be after the
+		 * _maybe_start_new_epoch(req); which happened in
+		 * __drbd_make_request, because we now may set the bit
+		 * again ourselves to close the current epoch.
+		 *
+		 * Add req to the (now) current epoch (barrier). */
+
+		/* otherwise we may lose an unplug, which may cause some remote
+		 * io-scheduler timeout to expire, increasing maximum latency,
+		 * hurting performance. */
+		set_bit(UNPLUG_REMOTE, &device->flags);
+
+		/* queue work item to send data */
+		D_ASSERT(device, req->rq_state & RQ_NET_PENDING);
+		mod_rq_state(req, m, 0, RQ_NET_QUEUED|RQ_EXP_BARR_ACK);
+		req->w.cb =  w_send_dblock;
+		drbd_queue_work(&connection->sender_work,
+				&req->w);
+
+		/* close the epoch, in case it outgrew the limit */
+		rcu_read_lock();
+		nc = rcu_dereference(connection->net_conf);
+		p = nc->max_epoch_size;
+		rcu_read_unlock();
+		if (connection->current_tle_writes >= p)
+			start_new_tl_epoch(connection);
+
+		break;
+
+	case QUEUE_FOR_SEND_OOS:
+		mod_rq_state(req, m, 0, RQ_NET_QUEUED);
+		req->w.cb =  w_send_out_of_sync;
+		drbd_queue_work(&connection->sender_work,
+				&req->w);
+		break;
+
+	case READ_RETRY_REMOTE_CANCELED:
+	case SEND_CANCELED:
+	case SEND_FAILED:
+		/* real cleanup will be done from tl_clear.  just update flags
+		 * so it is no longer marked as on the worker queue */
+		mod_rq_state(req, m, RQ_NET_QUEUED, 0);
+		break;
+
+	case HANDED_OVER_TO_NETWORK:
+		/* assert something? */
+		if (is_pending_write_protocol_A(req))
+			/* this is what is dangerous about protocol A:
+			 * pretend it was successfully written on the peer. */
+			mod_rq_state(req, m, RQ_NET_QUEUED|RQ_NET_PENDING,
+						RQ_NET_SENT|RQ_NET_OK);
+		else
+			mod_rq_state(req, m, RQ_NET_QUEUED, RQ_NET_SENT);
+		/* It is still not yet RQ_NET_DONE until the
+		 * corresponding epoch barrier got acked as well,
+		 * so we know what to dirty on connection loss. */
+		break;
+
+	case OOS_HANDED_TO_NETWORK:
+		/* Was not set PENDING, no longer QUEUED, so is now DONE
+		 * as far as this connection is concerned. */
+		mod_rq_state(req, m, RQ_NET_QUEUED, RQ_NET_DONE);
+		break;
+
+	case CONNECTION_LOST_WHILE_PENDING:
+		/* transfer log cleanup after connection loss */
+		mod_rq_state(req, m,
+				RQ_NET_OK|RQ_NET_PENDING|RQ_COMPLETION_SUSP,
+				RQ_NET_DONE);
+		break;
+
+	case CONFLICT_RESOLVED:
+		/* for superseded conflicting writes of multiple primaries,
+		 * there is no need to keep anything in the tl, potential
+		 * node crashes are covered by the activity log.
+		 *
+		 * If this request had been marked as RQ_POSTPONED before,
+		 * it will actually not be completed, but "restarted",
+		 * resubmitted from the retry worker context. */
+		D_ASSERT(device, req->rq_state & RQ_NET_PENDING);
+		D_ASSERT(device, req->rq_state & RQ_EXP_WRITE_ACK);
+		mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_DONE|RQ_NET_OK);
+		break;
+
+	case WRITE_ACKED_BY_PEER_AND_SIS:
+		req->rq_state |= RQ_NET_SIS;
+		fallthrough;
+	case WRITE_ACKED_BY_PEER:
+		/* Normal operation protocol C: successfully written on peer.
+		 * During resync, even in protocol != C,
+		 * we requested an explicit write ack anyways.
+		 * Which means we cannot even assert anything here.
+		 * Nothing more to do here.
+		 * We want to keep the tl in place for all protocols, to cater
+		 * for volatile write-back caches on lower level devices. */
+		goto ack_common;
+	case RECV_ACKED_BY_PEER:
+		D_ASSERT(device, req->rq_state & RQ_EXP_RECEIVE_ACK);
+		/* protocol B; pretends to be successfully written on peer.
+		 * see also notes above in HANDED_OVER_TO_NETWORK about
+		 * protocol != C */
+	ack_common:
+		mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK);
+		break;
+
+	case POSTPONE_WRITE:
+		D_ASSERT(device, req->rq_state & RQ_EXP_WRITE_ACK);
+		/* If this node has already detected the write conflict, the
+		 * worker will be waiting on misc_wait.  Wake it up once this
+		 * request has completed locally.
+		 */
+		D_ASSERT(device, req->rq_state & RQ_NET_PENDING);
+		req->rq_state |= RQ_POSTPONED;
+		if (req->i.waiting)
+			wake_up(&device->misc_wait);
+		/* Do not clear RQ_NET_PENDING. This request will make further
+		 * progress via restart_conflicting_writes() or
+		 * fail_postponed_requests(). Hopefully. */
+		break;
+
+	case NEG_ACKED:
+		mod_rq_state(req, m, RQ_NET_OK|RQ_NET_PENDING, 0);
+		break;
+
+	case FAIL_FROZEN_DISK_IO:
+		if (!(req->rq_state & RQ_LOCAL_COMPLETED))
+			break;
+		mod_rq_state(req, m, RQ_COMPLETION_SUSP, 0);
+		break;
+
+	case RESTART_FROZEN_DISK_IO:
+		if (!(req->rq_state & RQ_LOCAL_COMPLETED))
+			break;
+
+		mod_rq_state(req, m,
+				RQ_COMPLETION_SUSP|RQ_LOCAL_COMPLETED,
+				RQ_LOCAL_PENDING);
+
+		rv = MR_READ;
+		if (bio_data_dir(req->master_bio) == WRITE)
+			rv = MR_WRITE;
+
+		get_ldev(device); /* always succeeds in this call path */
+		req->w.cb = w_restart_disk_io;
+		drbd_queue_work(&connection->sender_work,
+				&req->w);
+		break;
+
+	case RESEND:
+		/* Simply complete (local only) READs. */
+		if (!(req->rq_state & RQ_WRITE) && !req->w.cb) {
+			mod_rq_state(req, m, RQ_COMPLETION_SUSP, 0);
+			break;
+		}
+
+		/* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK
+		   before the connection loss (B&C only); only P_BARRIER_ACK
+		   (or the local completion?) was missing when we suspended.
+		   Throwing them out of the TL here by pretending we got a BARRIER_ACK.
+		   During connection handshake, we ensure that the peer was not rebooted. */
+		if (!(req->rq_state & RQ_NET_OK)) {
+			/* FIXME could this possibly be a req->dw.cb == w_send_out_of_sync?
+			 * in that case we must not set RQ_NET_PENDING. */
+
+			mod_rq_state(req, m, RQ_COMPLETION_SUSP, RQ_NET_QUEUED|RQ_NET_PENDING);
+			if (req->w.cb) {
+				/* w.cb expected to be w_send_dblock, or w_send_read_req */
+				drbd_queue_work(&connection->sender_work,
+						&req->w);
+				rv = req->rq_state & RQ_WRITE ? MR_WRITE : MR_READ;
+			} /* else: FIXME can this happen? */
+			break;
+		}
+		fallthrough;	/* to BARRIER_ACKED */
+
+	case BARRIER_ACKED:
+		/* barrier ack for READ requests does not make sense */
+		if (!(req->rq_state & RQ_WRITE))
+			break;
+
+		if (req->rq_state & RQ_NET_PENDING) {
+			/* barrier came in before all requests were acked.
+			 * this is bad, because if the connection is lost now,
+			 * we won't be able to clean them up... */
+			drbd_err(device, "FIXME (BARRIER_ACKED but pending)\n");
+		}
+		/* Allowed to complete requests, even while suspended.
+		 * As this is called for all requests within a matching epoch,
+		 * we need to filter, and only set RQ_NET_DONE for those that
+		 * have actually been on the wire. */
+		mod_rq_state(req, m, RQ_COMPLETION_SUSP,
+				(req->rq_state & RQ_NET_MASK) ? RQ_NET_DONE : 0);
+		break;
+
+	case DATA_RECEIVED:
+		D_ASSERT(device, req->rq_state & RQ_NET_PENDING);
+		mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK|RQ_NET_DONE);
+		break;
+
+	case QUEUE_AS_DRBD_BARRIER:
+		start_new_tl_epoch(connection);
+		mod_rq_state(req, m, 0, RQ_NET_OK|RQ_NET_DONE);
+		break;
+	}
+
+	return rv;
+}
+
+/* we may do a local read if:
+ * - we are consistent (of course),
+ * - or we are generally inconsistent,
+ *   BUT we are still/already IN SYNC for this area.
+ *   since size may be bigger than BM_BLOCK_SIZE,
+ *   we may need to check several bits.
+ */
+static bool drbd_may_do_local_read(struct drbd_device *device, sector_t sector, int size)
+{
+	unsigned long sbnr, ebnr;
+	sector_t esector, nr_sectors;
+
+	if (device->state.disk == D_UP_TO_DATE)
+		return true;
+	if (device->state.disk != D_INCONSISTENT)
+		return false;
+	esector = sector + (size >> 9) - 1;
+	nr_sectors = get_capacity(device->vdisk);
+	D_ASSERT(device, sector  < nr_sectors);
+	D_ASSERT(device, esector < nr_sectors);
+
+	sbnr = BM_SECT_TO_BIT(sector);
+	ebnr = BM_SECT_TO_BIT(esector);
+
+	return drbd_bm_count_bits(device, sbnr, ebnr) == 0;
+}
+
+static bool remote_due_to_read_balancing(struct drbd_device *device, sector_t sector,
+		enum drbd_read_balancing rbm)
+{
+	int stripe_shift;
+
+	switch (rbm) {
+	case RB_CONGESTED_REMOTE:
+		return false;
+	case RB_LEAST_PENDING:
+		return atomic_read(&device->local_cnt) >
+			atomic_read(&device->ap_pending_cnt) + atomic_read(&device->rs_pending_cnt);
+	case RB_32K_STRIPING:  /* stripe_shift = 15 */
+	case RB_64K_STRIPING:
+	case RB_128K_STRIPING:
+	case RB_256K_STRIPING:
+	case RB_512K_STRIPING:
+	case RB_1M_STRIPING:   /* stripe_shift = 20 */
+		stripe_shift = (rbm - RB_32K_STRIPING + 15);
+		return (sector >> (stripe_shift - 9)) & 1;
+	case RB_ROUND_ROBIN:
+		return test_and_change_bit(READ_BALANCE_RR, &device->flags);
+	case RB_PREFER_REMOTE:
+		return true;
+	case RB_PREFER_LOCAL:
+	default:
+		return false;
+	}
+}
+
+/*
+ * complete_conflicting_writes  -  wait for any conflicting write requests
+ *
+ * The write_requests tree contains all active write requests which we
+ * currently know about.  Wait for any requests to complete which conflict with
+ * the new one.
+ *
+ * Only way out: remove the conflicting intervals from the tree.
+ */
+static void complete_conflicting_writes(struct drbd_request *req)
+{
+	DEFINE_WAIT(wait);
+	struct drbd_device *device = req->device;
+	struct drbd_interval *i;
+	sector_t sector = req->i.sector;
+	int size = req->i.size;
+
+	for (;;) {
+		drbd_for_each_overlap(i, &device->write_requests, sector, size) {
+			/* Ignore, if already completed to upper layers. */
+			if (i->completed)
+				continue;
+			/* Handle the first found overlap.  After the schedule
+			 * we have to restart the tree walk. */
+			break;
+		}
+		if (!i)	/* if any */
+			break;
+
+		/* Indicate to wake up device->misc_wait on progress.  */
+		prepare_to_wait(&device->misc_wait, &wait, TASK_UNINTERRUPTIBLE);
+		i->waiting = true;
+		spin_unlock_irq(&device->resource->req_lock);
+		schedule();
+		spin_lock_irq(&device->resource->req_lock);
+	}
+	finish_wait(&device->misc_wait, &wait);
+}
+
+/* called within req_lock */
+static void maybe_pull_ahead(struct drbd_device *device)
+{
+	struct drbd_connection *connection = first_peer_device(device)->connection;
+	struct net_conf *nc;
+	bool congested = false;
+	enum drbd_on_congestion on_congestion;
+
+	rcu_read_lock();
+	nc = rcu_dereference(connection->net_conf);
+	on_congestion = nc ? nc->on_congestion : OC_BLOCK;
+	rcu_read_unlock();
+	if (on_congestion == OC_BLOCK ||
+	    connection->agreed_pro_version < 96)
+		return;
+
+	if (on_congestion == OC_PULL_AHEAD && device->state.conn == C_AHEAD)
+		return; /* nothing to do ... */
+
+	/* If I don't even have good local storage, we can not reasonably try
+	 * to pull ahead of the peer. We also need the local reference to make
+	 * sure device->act_log is there.
+	 */
+	if (!get_ldev_if_state(device, D_UP_TO_DATE))
+		return;
+
+	if (nc->cong_fill &&
+	    atomic_read(&device->ap_in_flight) >= nc->cong_fill) {
+		drbd_info(device, "Congestion-fill threshold reached\n");
+		congested = true;
+	}
+
+	if (device->act_log->used >= nc->cong_extents) {
+		drbd_info(device, "Congestion-extents threshold reached\n");
+		congested = true;
+	}
+
+	if (congested) {
+		/* start a new epoch for non-mirrored writes */
+		start_new_tl_epoch(first_peer_device(device)->connection);
+
+		if (on_congestion == OC_PULL_AHEAD)
+			_drbd_set_state(_NS(device, conn, C_AHEAD), 0, NULL);
+		else  /*nc->on_congestion == OC_DISCONNECT */
+			_drbd_set_state(_NS(device, conn, C_DISCONNECTING), 0, NULL);
+	}
+	put_ldev(device);
+}
+
+/* If this returns false, and req->private_bio is still set,
+ * this should be submitted locally.
+ *
+ * If it returns false, but req->private_bio is not set,
+ * we do not have access to good data :(
+ *
+ * Otherwise, this destroys req->private_bio, if any,
+ * and returns true.
+ */
+static bool do_remote_read(struct drbd_request *req)
+{
+	struct drbd_device *device = req->device;
+	enum drbd_read_balancing rbm;
+
+	if (req->private_bio) {
+		if (!drbd_may_do_local_read(device,
+					req->i.sector, req->i.size)) {
+			bio_put(req->private_bio);
+			req->private_bio = NULL;
+			put_ldev(device);
+		}
+	}
+
+	if (device->state.pdsk != D_UP_TO_DATE)
+		return false;
+
+	if (req->private_bio == NULL)
+		return true;
+
+	/* TODO: improve read balancing decisions, take into account drbd
+	 * protocol, pending requests etc. */
+
+	rcu_read_lock();
+	rbm = rcu_dereference(device->ldev->disk_conf)->read_balancing;
+	rcu_read_unlock();
+
+	if (rbm == RB_PREFER_LOCAL && req->private_bio)
+		return false; /* submit locally */
+
+	if (remote_due_to_read_balancing(device, req->i.sector, rbm)) {
+		if (req->private_bio) {
+			bio_put(req->private_bio);
+			req->private_bio = NULL;
+			put_ldev(device);
+		}
+		return true;
+	}
+
+	return false;
+}
+
+bool drbd_should_do_remote(union drbd_dev_state s)
+{
+	return s.pdsk == D_UP_TO_DATE ||
+		(s.pdsk >= D_INCONSISTENT &&
+		 s.conn >= C_WF_BITMAP_T &&
+		 s.conn < C_AHEAD);
+	/* Before proto 96 that was >= CONNECTED instead of >= C_WF_BITMAP_T.
+	   That is equivalent since before 96 IO was frozen in the C_WF_BITMAP*
+	   states. */
+}
+
+static bool drbd_should_send_out_of_sync(union drbd_dev_state s)
+{
+	return s.conn == C_AHEAD || s.conn == C_WF_BITMAP_S;
+	/* pdsk = D_INCONSISTENT as a consequence. Protocol 96 check not necessary
+	   since we enter state C_AHEAD only if proto >= 96 */
+}
+
+/* returns number of connections (== 1, for drbd 8.4)
+ * expected to actually write this data,
+ * which does NOT include those that we are L_AHEAD for. */
+static int drbd_process_write_request(struct drbd_request *req)
+{
+	struct drbd_device *device = req->device;
+	int remote, send_oos;
+
+	remote = drbd_should_do_remote(device->state);
+	send_oos = drbd_should_send_out_of_sync(device->state);
+
+	/* Need to replicate writes.  Unless it is an empty flush,
+	 * which is better mapped to a DRBD P_BARRIER packet,
+	 * also for drbd wire protocol compatibility reasons.
+	 * If this was a flush, just start a new epoch.
+	 * Unless the current epoch was empty anyways, or we are not currently
+	 * replicating, in which case there is no point. */
+	if (unlikely(req->i.size == 0)) {
+		/* The only size==0 bios we expect are empty flushes. */
+		D_ASSERT(device, req->master_bio->bi_opf & REQ_PREFLUSH);
+		if (remote)
+			_req_mod(req, QUEUE_AS_DRBD_BARRIER);
+		return remote;
+	}
+
+	if (!remote && !send_oos)
+		return 0;
+
+	D_ASSERT(device, !(remote && send_oos));
+
+	if (remote) {
+		_req_mod(req, TO_BE_SENT);
+		_req_mod(req, QUEUE_FOR_NET_WRITE);
+	} else if (drbd_set_out_of_sync(device, req->i.sector, req->i.size))
+		_req_mod(req, QUEUE_FOR_SEND_OOS);
+
+	return remote;
+}
+
+static void drbd_process_discard_or_zeroes_req(struct drbd_request *req, int flags)
+{
+	int err = drbd_issue_discard_or_zero_out(req->device,
+				req->i.sector, req->i.size >> 9, flags);
+	if (err)
+		req->private_bio->bi_status = BLK_STS_IOERR;
+	bio_endio(req->private_bio);
+}
+
+static void
+drbd_submit_req_private_bio(struct drbd_request *req)
+{
+	struct drbd_device *device = req->device;
+	struct bio *bio = req->private_bio;
+	unsigned int type;
+
+	if (bio_op(bio) != REQ_OP_READ)
+		type = DRBD_FAULT_DT_WR;
+	else if (bio->bi_opf & REQ_RAHEAD)
+		type = DRBD_FAULT_DT_RA;
+	else
+		type = DRBD_FAULT_DT_RD;
+
+	/* State may have changed since we grabbed our reference on the
+	 * ->ldev member. Double check, and short-circuit to endio.
+	 * In case the last activity log transaction failed to get on
+	 * stable storage, and this is a WRITE, we may not even submit
+	 * this bio. */
+	if (get_ldev(device)) {
+		if (drbd_insert_fault(device, type))
+			bio_io_error(bio);
+		else if (bio_op(bio) == REQ_OP_WRITE_ZEROES)
+			drbd_process_discard_or_zeroes_req(req, EE_ZEROOUT |
+			    ((bio->bi_opf & REQ_NOUNMAP) ? 0 : EE_TRIM));
+		else if (bio_op(bio) == REQ_OP_DISCARD)
+			drbd_process_discard_or_zeroes_req(req, EE_TRIM);
+		else
+			submit_bio_noacct(bio);
+		put_ldev(device);
+	} else
+		bio_io_error(bio);
+}
+
+static void drbd_queue_write(struct drbd_device *device, struct drbd_request *req)
+{
+	spin_lock_irq(&device->resource->req_lock);
+	list_add_tail(&req->tl_requests, &device->submit.writes);
+	list_add_tail(&req->req_pending_master_completion,
+			&device->pending_master_completion[1 /* WRITE */]);
+	spin_unlock_irq(&device->resource->req_lock);
+	queue_work(device->submit.wq, &device->submit.worker);
+	/* do_submit() may sleep internally on al_wait, too */
+	wake_up(&device->al_wait);
+}
+
+/* returns the new drbd_request pointer, if the caller is expected to
+ * drbd_send_and_submit() it (to save latency), or NULL if we queued the
+ * request on the submitter thread.
+ * Returns ERR_PTR(-ENOMEM) if we cannot allocate a drbd_request.
+ */
+static struct drbd_request *
+drbd_request_prepare(struct drbd_device *device, struct bio *bio)
+{
+	const int rw = bio_data_dir(bio);
+	struct drbd_request *req;
+
+	/* allocate outside of all locks; */
+	req = drbd_req_new(device, bio);
+	if (!req) {
+		dec_ap_bio(device);
+		/* only pass the error to the upper layers.
+		 * if user cannot handle io errors, that's not our business. */
+		drbd_err(device, "could not kmalloc() req\n");
+		bio->bi_status = BLK_STS_RESOURCE;
+		bio_endio(bio);
+		return ERR_PTR(-ENOMEM);
+	}
+
+	/* Update disk stats */
+	req->start_jif = bio_start_io_acct(req->master_bio);
+
+	if (get_ldev(device)) {
+		req->private_bio = bio_alloc_clone(device->ldev->backing_bdev,
+						   bio, GFP_NOIO,
+						   &drbd_io_bio_set);
+		req->private_bio->bi_private = req;
+		req->private_bio->bi_end_io = drbd_request_endio;
+	}
+
+	/* process discards always from our submitter thread */
+	if (bio_op(bio) == REQ_OP_WRITE_ZEROES ||
+	    bio_op(bio) == REQ_OP_DISCARD)
+		goto queue_for_submitter_thread;
+
+	if (rw == WRITE && req->private_bio && req->i.size
+	&& !test_bit(AL_SUSPENDED, &device->flags)) {
+		if (!drbd_al_begin_io_fastpath(device, &req->i))
+			goto queue_for_submitter_thread;
+		req->rq_state |= RQ_IN_ACT_LOG;
+		req->in_actlog_jif = jiffies;
+	}
+	return req;
+
+ queue_for_submitter_thread:
+	atomic_inc(&device->ap_actlog_cnt);
+	drbd_queue_write(device, req);
+	return NULL;
+}
+
+/* Require at least one path to current data.
+ * We don't want to allow writes on C_STANDALONE D_INCONSISTENT:
+ * We would not allow to read what was written,
+ * we would not have bumped the data generation uuids,
+ * we would cause data divergence for all the wrong reasons.
+ *
+ * If we don't see at least one D_UP_TO_DATE, we will fail this request,
+ * which either returns EIO, or, if OND_SUSPEND_IO is set, suspends IO,
+ * and queues for retry later.
+ */
+static bool may_do_writes(struct drbd_device *device)
+{
+	const union drbd_dev_state s = device->state;
+	return s.disk == D_UP_TO_DATE || s.pdsk == D_UP_TO_DATE;
+}
+
+struct drbd_plug_cb {
+	struct blk_plug_cb cb;
+	struct drbd_request *most_recent_req;
+	/* do we need more? */
+};
+
+static void drbd_unplug(struct blk_plug_cb *cb, bool from_schedule)
+{
+	struct drbd_plug_cb *plug = container_of(cb, struct drbd_plug_cb, cb);
+	struct drbd_resource *resource = plug->cb.data;
+	struct drbd_request *req = plug->most_recent_req;
+
+	kfree(cb);
+	if (!req)
+		return;
+
+	spin_lock_irq(&resource->req_lock);
+	/* In case the sender did not process it yet, raise the flag to
+	 * have it followed with P_UNPLUG_REMOTE just after. */
+	req->rq_state |= RQ_UNPLUG;
+	/* but also queue a generic unplug */
+	drbd_queue_unplug(req->device);
+	kref_put(&req->kref, drbd_req_destroy);
+	spin_unlock_irq(&resource->req_lock);
+}
+
+static struct drbd_plug_cb* drbd_check_plugged(struct drbd_resource *resource)
+{
+	/* A lot of text to say
+	 * return (struct drbd_plug_cb*)blk_check_plugged(); */
+	struct drbd_plug_cb *plug;
+	struct blk_plug_cb *cb = blk_check_plugged(drbd_unplug, resource, sizeof(*plug));
+
+	if (cb)
+		plug = container_of(cb, struct drbd_plug_cb, cb);
+	else
+		plug = NULL;
+	return plug;
+}
+
+static void drbd_update_plug(struct drbd_plug_cb *plug, struct drbd_request *req)
+{
+	struct drbd_request *tmp = plug->most_recent_req;
+	/* Will be sent to some peer.
+	 * Remember to tag it with UNPLUG_REMOTE on unplug */
+	kref_get(&req->kref);
+	plug->most_recent_req = req;
+	if (tmp)
+		kref_put(&tmp->kref, drbd_req_destroy);
+}
+
+static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request *req)
+{
+	struct drbd_resource *resource = device->resource;
+	const int rw = bio_data_dir(req->master_bio);
+	struct bio_and_error m = { NULL, };
+	bool no_remote = false;
+	bool submit_private_bio = false;
+
+	spin_lock_irq(&resource->req_lock);
+	if (rw == WRITE) {
+		/* This may temporarily give up the req_lock,
+		 * but will re-aquire it before it returns here.
+		 * Needs to be before the check on drbd_suspended() */
+		complete_conflicting_writes(req);
+		/* no more giving up req_lock from now on! */
+
+		/* check for congestion, and potentially stop sending
+		 * full data updates, but start sending "dirty bits" only. */
+		maybe_pull_ahead(device);
+	}
+
+
+	if (drbd_suspended(device)) {
+		/* push back and retry: */
+		req->rq_state |= RQ_POSTPONED;
+		if (req->private_bio) {
+			bio_put(req->private_bio);
+			req->private_bio = NULL;
+			put_ldev(device);
+		}
+		goto out;
+	}
+
+	/* We fail READ early, if we can not serve it.
+	 * We must do this before req is registered on any lists.
+	 * Otherwise, drbd_req_complete() will queue failed READ for retry. */
+	if (rw != WRITE) {
+		if (!do_remote_read(req) && !req->private_bio)
+			goto nodata;
+	}
+
+	/* which transfer log epoch does this belong to? */
+	req->epoch = atomic_read(&first_peer_device(device)->connection->current_tle_nr);
+
+	/* no point in adding empty flushes to the transfer log,
+	 * they are mapped to drbd barriers already. */
+	if (likely(req->i.size!=0)) {
+		if (rw == WRITE)
+			first_peer_device(device)->connection->current_tle_writes++;
+
+		list_add_tail(&req->tl_requests, &first_peer_device(device)->connection->transfer_log);
+	}
+
+	if (rw == WRITE) {
+		if (req->private_bio && !may_do_writes(device)) {
+			bio_put(req->private_bio);
+			req->private_bio = NULL;
+			put_ldev(device);
+			goto nodata;
+		}
+		if (!drbd_process_write_request(req))
+			no_remote = true;
+	} else {
+		/* We either have a private_bio, or we can read from remote.
+		 * Otherwise we had done the goto nodata above. */
+		if (req->private_bio == NULL) {
+			_req_mod(req, TO_BE_SENT);
+			_req_mod(req, QUEUE_FOR_NET_READ);
+		} else
+			no_remote = true;
+	}
+
+	if (no_remote == false) {
+		struct drbd_plug_cb *plug = drbd_check_plugged(resource);
+		if (plug)
+			drbd_update_plug(plug, req);
+	}
+
+	/* If it took the fast path in drbd_request_prepare, add it here.
+	 * The slow path has added it already. */
+	if (list_empty(&req->req_pending_master_completion))
+		list_add_tail(&req->req_pending_master_completion,
+			&device->pending_master_completion[rw == WRITE]);
+	if (req->private_bio) {
+		/* needs to be marked within the same spinlock */
+		req->pre_submit_jif = jiffies;
+		list_add_tail(&req->req_pending_local,
+			&device->pending_completion[rw == WRITE]);
+		_req_mod(req, TO_BE_SUBMITTED);
+		/* but we need to give up the spinlock to submit */
+		submit_private_bio = true;
+	} else if (no_remote) {
+nodata:
+		if (__ratelimit(&drbd_ratelimit_state))
+			drbd_err(device, "IO ERROR: neither local nor remote data, sector %llu+%u\n",
+					(unsigned long long)req->i.sector, req->i.size >> 9);
+		/* A write may have been queued for send_oos, however.
+		 * So we can not simply free it, we must go through drbd_req_put_completion_ref() */
+	}
+
+out:
+	drbd_req_put_completion_ref(req, &m, 1);
+	spin_unlock_irq(&resource->req_lock);
+
+	/* Even though above is a kref_put(), this is safe.
+	 * As long as we still need to submit our private bio,
+	 * we hold a completion ref, and the request cannot disappear.
+	 * If however this request did not even have a private bio to submit
+	 * (e.g. remote read), req may already be invalid now.
+	 * That's why we cannot check on req->private_bio. */
+	if (submit_private_bio)
+		drbd_submit_req_private_bio(req);
+	if (m.bio)
+		complete_master_bio(device, &m);
+}
+
+void __drbd_make_request(struct drbd_device *device, struct bio *bio)
+{
+	struct drbd_request *req = drbd_request_prepare(device, bio);
+	if (IS_ERR_OR_NULL(req))
+		return;
+	drbd_send_and_submit(device, req);
+}
+
+static void submit_fast_path(struct drbd_device *device, struct list_head *incoming)
+{
+	struct blk_plug plug;
+	struct drbd_request *req, *tmp;
+
+	blk_start_plug(&plug);
+	list_for_each_entry_safe(req, tmp, incoming, tl_requests) {
+		const int rw = bio_data_dir(req->master_bio);
+
+		if (rw == WRITE /* rw != WRITE should not even end up here! */
+		&& req->private_bio && req->i.size
+		&& !test_bit(AL_SUSPENDED, &device->flags)) {
+			if (!drbd_al_begin_io_fastpath(device, &req->i))
+				continue;
+
+			req->rq_state |= RQ_IN_ACT_LOG;
+			req->in_actlog_jif = jiffies;
+			atomic_dec(&device->ap_actlog_cnt);
+		}
+
+		list_del_init(&req->tl_requests);
+		drbd_send_and_submit(device, req);
+	}
+	blk_finish_plug(&plug);
+}
+
+static bool prepare_al_transaction_nonblock(struct drbd_device *device,
+					    struct list_head *incoming,
+					    struct list_head *pending,
+					    struct list_head *later)
+{
+	struct drbd_request *req;
+	int wake = 0;
+	int err;
+
+	spin_lock_irq(&device->al_lock);
+	while ((req = list_first_entry_or_null(incoming, struct drbd_request, tl_requests))) {
+		err = drbd_al_begin_io_nonblock(device, &req->i);
+		if (err == -ENOBUFS)
+			break;
+		if (err == -EBUSY)
+			wake = 1;
+		if (err)
+			list_move_tail(&req->tl_requests, later);
+		else
+			list_move_tail(&req->tl_requests, pending);
+	}
+	spin_unlock_irq(&device->al_lock);
+	if (wake)
+		wake_up(&device->al_wait);
+	return !list_empty(pending);
+}
+
+static void send_and_submit_pending(struct drbd_device *device, struct list_head *pending)
+{
+	struct blk_plug plug;
+	struct drbd_request *req;
+
+	blk_start_plug(&plug);
+	while ((req = list_first_entry_or_null(pending, struct drbd_request, tl_requests))) {
+		req->rq_state |= RQ_IN_ACT_LOG;
+		req->in_actlog_jif = jiffies;
+		atomic_dec(&device->ap_actlog_cnt);
+		list_del_init(&req->tl_requests);
+		drbd_send_and_submit(device, req);
+	}
+	blk_finish_plug(&plug);
+}
+
+void do_submit(struct work_struct *ws)
+{
+	struct drbd_device *device = container_of(ws, struct drbd_device, submit.worker);
+	LIST_HEAD(incoming);	/* from drbd_make_request() */
+	LIST_HEAD(pending);	/* to be submitted after next AL-transaction commit */
+	LIST_HEAD(busy);	/* blocked by resync requests */
+
+	/* grab new incoming requests */
+	spin_lock_irq(&device->resource->req_lock);
+	list_splice_tail_init(&device->submit.writes, &incoming);
+	spin_unlock_irq(&device->resource->req_lock);
+
+	for (;;) {
+		DEFINE_WAIT(wait);
+
+		/* move used-to-be-busy back to front of incoming */
+		list_splice_init(&busy, &incoming);
+		submit_fast_path(device, &incoming);
+		if (list_empty(&incoming))
+			break;
+
+		for (;;) {
+			prepare_to_wait(&device->al_wait, &wait, TASK_UNINTERRUPTIBLE);
+
+			list_splice_init(&busy, &incoming);
+			prepare_al_transaction_nonblock(device, &incoming, &pending, &busy);
+			if (!list_empty(&pending))
+				break;
+
+			schedule();
+
+			/* If all currently "hot" activity log extents are kept busy by
+			 * incoming requests, we still must not totally starve new
+			 * requests to "cold" extents.
+			 * Something left on &incoming means there had not been
+			 * enough update slots available, and the activity log
+			 * has been marked as "starving".
+			 *
+			 * Try again now, without looking for new requests,
+			 * effectively blocking all new requests until we made
+			 * at least _some_ progress with what we currently have.
+			 */
+			if (!list_empty(&incoming))
+				continue;
+
+			/* Nothing moved to pending, but nothing left
+			 * on incoming: all moved to busy!
+			 * Grab new and iterate. */
+			spin_lock_irq(&device->resource->req_lock);
+			list_splice_tail_init(&device->submit.writes, &incoming);
+			spin_unlock_irq(&device->resource->req_lock);
+		}
+		finish_wait(&device->al_wait, &wait);
+
+		/* If the transaction was full, before all incoming requests
+		 * had been processed, skip ahead to commit, and iterate
+		 * without splicing in more incoming requests from upper layers.
+		 *
+		 * Else, if all incoming have been processed,
+		 * they have become either "pending" (to be submitted after
+		 * next transaction commit) or "busy" (blocked by resync).
+		 *
+		 * Maybe more was queued, while we prepared the transaction?
+		 * Try to stuff those into this transaction as well.
+		 * Be strictly non-blocking here,
+		 * we already have something to commit.
+		 *
+		 * Commit if we don't make any more progres.
+		 */
+
+		while (list_empty(&incoming)) {
+			LIST_HEAD(more_pending);
+			LIST_HEAD(more_incoming);
+			bool made_progress;
+
+			/* It is ok to look outside the lock,
+			 * it's only an optimization anyways */
+			if (list_empty(&device->submit.writes))
+				break;
+
+			spin_lock_irq(&device->resource->req_lock);
+			list_splice_tail_init(&device->submit.writes, &more_incoming);
+			spin_unlock_irq(&device->resource->req_lock);
+
+			if (list_empty(&more_incoming))
+				break;
+
+			made_progress = prepare_al_transaction_nonblock(device, &more_incoming, &more_pending, &busy);
+
+			list_splice_tail_init(&more_pending, &pending);
+			list_splice_tail_init(&more_incoming, &incoming);
+			if (!made_progress)
+				break;
+		}
+
+		drbd_al_begin_io_commit(device);
+		send_and_submit_pending(device, &pending);
+	}
+}
+
+void drbd_submit_bio(struct bio *bio)
+{
+	struct drbd_device *device = bio->bi_bdev->bd_disk->private_data;
+
+	bio = bio_split_to_limits(bio);
+	if (!bio)
+		return;
+
+	/*
+	 * what we "blindly" assume:
+	 */
+	D_ASSERT(device, IS_ALIGNED(bio->bi_iter.bi_size, 512));
+
+	inc_ap_bio(device);
+	__drbd_make_request(device, bio);
+}
+
+static bool net_timeout_reached(struct drbd_request *net_req,
+		struct drbd_connection *connection,
+		unsigned long now, unsigned long ent,
+		unsigned int ko_count, unsigned int timeout)
+{
+	struct drbd_device *device = net_req->device;
+
+	if (!time_after(now, net_req->pre_send_jif + ent))
+		return false;
+
+	if (time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent))
+		return false;
+
+	if (net_req->rq_state & RQ_NET_PENDING) {
+		drbd_warn(device, "Remote failed to finish a request within %ums > ko-count (%u) * timeout (%u * 0.1s)\n",
+			jiffies_to_msecs(now - net_req->pre_send_jif), ko_count, timeout);
+		return true;
+	}
+
+	/* We received an ACK already (or are using protocol A),
+	 * but are waiting for the epoch closing barrier ack.
+	 * Check if we sent the barrier already.  We should not blame the peer
+	 * for being unresponsive, if we did not even ask it yet. */
+	if (net_req->epoch == connection->send.current_epoch_nr) {
+		drbd_warn(device,
+			"We did not send a P_BARRIER for %ums > ko-count (%u) * timeout (%u * 0.1s); drbd kernel thread blocked?\n",
+			jiffies_to_msecs(now - net_req->pre_send_jif), ko_count, timeout);
+		return false;
+	}
+
+	/* Worst case: we may have been blocked for whatever reason, then
+	 * suddenly are able to send a lot of requests (and epoch separating
+	 * barriers) in quick succession.
+	 * The timestamp of the net_req may be much too old and not correspond
+	 * to the sending time of the relevant unack'ed barrier packet, so
+	 * would trigger a spurious timeout.  The latest barrier packet may
+	 * have a too recent timestamp to trigger the timeout, potentially miss
+	 * a timeout.  Right now we don't have a place to conveniently store
+	 * these timestamps.
+	 * But in this particular situation, the application requests are still
+	 * completed to upper layers, DRBD should still "feel" responsive.
+	 * No need yet to kill this connection, it may still recover.
+	 * If not, eventually we will have queued enough into the network for
+	 * us to block. From that point of view, the timestamp of the last sent
+	 * barrier packet is relevant enough.
+	 */
+	if (time_after(now, connection->send.last_sent_barrier_jif + ent)) {
+		drbd_warn(device, "Remote failed to answer a P_BARRIER (sent at %lu jif; now=%lu jif) within %ums > ko-count (%u) * timeout (%u * 0.1s)\n",
+			connection->send.last_sent_barrier_jif, now,
+			jiffies_to_msecs(now - connection->send.last_sent_barrier_jif), ko_count, timeout);
+		return true;
+	}
+	return false;
+}
+
+/* A request is considered timed out, if
+ * - we have some effective timeout from the configuration,
+ *   with some state restrictions applied,
+ * - the oldest request is waiting for a response from the network
+ *   resp. the local disk,
+ * - the oldest request is in fact older than the effective timeout,
+ * - the connection was established (resp. disk was attached)
+ *   for longer than the timeout already.
+ * Note that for 32bit jiffies and very stable connections/disks,
+ * we may have a wrap around, which is catched by
+ *   !time_in_range(now, last_..._jif, last_..._jif + timeout).
+ *
+ * Side effect: once per 32bit wrap-around interval, which means every
+ * ~198 days with 250 HZ, we have a window where the timeout would need
+ * to expire twice (worst case) to become effective. Good enough.
+ */
+
+void request_timer_fn(struct timer_list *t)
+{
+	struct drbd_device *device = from_timer(device, t, request_timer);
+	struct drbd_connection *connection = first_peer_device(device)->connection;
+	struct drbd_request *req_read, *req_write, *req_peer; /* oldest request */
+	struct net_conf *nc;
+	unsigned long oldest_submit_jif;
+	unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */
+	unsigned long now;
+	unsigned int ko_count = 0, timeout = 0;
+
+	rcu_read_lock();
+	nc = rcu_dereference(connection->net_conf);
+	if (nc && device->state.conn >= C_WF_REPORT_PARAMS) {
+		ko_count = nc->ko_count;
+		timeout = nc->timeout;
+	}
+
+	if (get_ldev(device)) { /* implicit state.disk >= D_INCONSISTENT */
+		dt = rcu_dereference(device->ldev->disk_conf)->disk_timeout * HZ / 10;
+		put_ldev(device);
+	}
+	rcu_read_unlock();
+
+
+	ent = timeout * HZ/10 * ko_count;
+	et = min_not_zero(dt, ent);
+
+	if (!et)
+		return; /* Recurring timer stopped */
+
+	now = jiffies;
+	nt = now + et;
+
+	spin_lock_irq(&device->resource->req_lock);
+	req_read = list_first_entry_or_null(&device->pending_completion[0], struct drbd_request, req_pending_local);
+	req_write = list_first_entry_or_null(&device->pending_completion[1], struct drbd_request, req_pending_local);
+
+	/* maybe the oldest request waiting for the peer is in fact still
+	 * blocking in tcp sendmsg.  That's ok, though, that's handled via the
+	 * socket send timeout, requesting a ping, and bumping ko-count in
+	 * we_should_drop_the_connection().
+	 */
+
+	/* check the oldest request we did successfully sent,
+	 * but which is still waiting for an ACK. */
+	req_peer = connection->req_ack_pending;
+
+	/* if we don't have such request (e.g. protocoll A)
+	 * check the oldest requests which is still waiting on its epoch
+	 * closing barrier ack. */
+	if (!req_peer)
+		req_peer = connection->req_not_net_done;
+
+	/* evaluate the oldest peer request only in one timer! */
+	if (req_peer && req_peer->device != device)
+		req_peer = NULL;
+
+	/* do we have something to evaluate? */
+	if (req_peer == NULL && req_write == NULL && req_read == NULL)
+		goto out;
+
+	oldest_submit_jif =
+		(req_write && req_read)
+		? ( time_before(req_write->pre_submit_jif, req_read->pre_submit_jif)
+		  ? req_write->pre_submit_jif : req_read->pre_submit_jif )
+		: req_write ? req_write->pre_submit_jif
+		: req_read ? req_read->pre_submit_jif : now;
+
+	if (ent && req_peer && net_timeout_reached(req_peer, connection, now, ent, ko_count, timeout))
+		_conn_request_state(connection, NS(conn, C_TIMEOUT), CS_VERBOSE | CS_HARD);
+
+	if (dt && oldest_submit_jif != now &&
+		 time_after(now, oldest_submit_jif + dt) &&
+		!time_in_range(now, device->last_reattach_jif, device->last_reattach_jif + dt)) {
+		drbd_warn(device, "Local backing device failed to meet the disk-timeout\n");
+		__drbd_chk_io_error(device, DRBD_FORCE_DETACH);
+	}
+
+	/* Reschedule timer for the nearest not already expired timeout.
+	 * Fallback to now + min(effective network timeout, disk timeout). */
+	ent = (ent && req_peer && time_before(now, req_peer->pre_send_jif + ent))
+		? req_peer->pre_send_jif + ent : now + et;
+	dt = (dt && oldest_submit_jif != now && time_before(now, oldest_submit_jif + dt))
+		? oldest_submit_jif + dt : now + et;
+	nt = time_before(ent, dt) ? ent : dt;
+out:
+	spin_unlock_irq(&device->resource->req_lock);
+	mod_timer(&device->request_timer, nt);
+}
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
new file mode 100644
index 000000000..6237fa1dc
--- /dev/null
+++ b/drivers/block/drbd/drbd_req.h
@@ -0,0 +1,321 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+   drbd_req.h
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2006-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 2006-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+   Copyright (C) 2006-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+
+ */
+
+#ifndef _DRBD_REQ_H
+#define _DRBD_REQ_H
+
+#include <linux/module.h>
+
+#include <linux/slab.h>
+#include <linux/drbd.h>
+#include "drbd_int.h"
+
+/* The request callbacks will be called in irq context by the IDE drivers,
+   and in Softirqs/Tasklets/BH context by the SCSI drivers,
+   and by the receiver and worker in kernel-thread context.
+   Try to get the locking right :) */
+
+/*
+ * Objects of type struct drbd_request do only exist on a R_PRIMARY node, and are
+ * associated with IO requests originating from the block layer above us.
+ *
+ * There are quite a few things that may happen to a drbd request
+ * during its lifetime.
+ *
+ *  It will be created.
+ *  It will be marked with the intention to be
+ *    submitted to local disk and/or
+ *    send via the network.
+ *
+ *  It has to be placed on the transfer log and other housekeeping lists,
+ *  In case we have a network connection.
+ *
+ *  It may be identified as a concurrent (write) request
+ *    and be handled accordingly.
+ *
+ *  It may me handed over to the local disk subsystem.
+ *  It may be completed by the local disk subsystem,
+ *    either successfully or with io-error.
+ *  In case it is a READ request, and it failed locally,
+ *    it may be retried remotely.
+ *
+ *  It may be queued for sending.
+ *  It may be handed over to the network stack,
+ *    which may fail.
+ *  It may be acknowledged by the "peer" according to the wire_protocol in use.
+ *    this may be a negative ack.
+ *  It may receive a faked ack when the network connection is lost and the
+ *  transfer log is cleaned up.
+ *  Sending may be canceled due to network connection loss.
+ *  When it finally has outlived its time,
+ *    corresponding dirty bits in the resync-bitmap may be cleared or set,
+ *    it will be destroyed,
+ *    and completion will be signalled to the originator,
+ *      with or without "success".
+ */
+
+enum drbd_req_event {
+	CREATED,
+	TO_BE_SENT,
+	TO_BE_SUBMITTED,
+
+	/* XXX yes, now I am inconsistent...
+	 * these are not "events" but "actions"
+	 * oh, well... */
+	QUEUE_FOR_NET_WRITE,
+	QUEUE_FOR_NET_READ,
+	QUEUE_FOR_SEND_OOS,
+
+	/* An empty flush is queued as P_BARRIER,
+	 * which will cause it to complete "successfully",
+	 * even if the local disk flush failed.
+	 *
+	 * Just like "real" requests, empty flushes (blkdev_issue_flush()) will
+	 * only see an error if neither local nor remote data is reachable. */
+	QUEUE_AS_DRBD_BARRIER,
+
+	SEND_CANCELED,
+	SEND_FAILED,
+	HANDED_OVER_TO_NETWORK,
+	OOS_HANDED_TO_NETWORK,
+	CONNECTION_LOST_WHILE_PENDING,
+	READ_RETRY_REMOTE_CANCELED,
+	RECV_ACKED_BY_PEER,
+	WRITE_ACKED_BY_PEER,
+	WRITE_ACKED_BY_PEER_AND_SIS, /* and set_in_sync */
+	CONFLICT_RESOLVED,
+	POSTPONE_WRITE,
+	NEG_ACKED,
+	BARRIER_ACKED, /* in protocol A and B */
+	DATA_RECEIVED, /* (remote read) */
+
+	COMPLETED_OK,
+	READ_COMPLETED_WITH_ERROR,
+	READ_AHEAD_COMPLETED_WITH_ERROR,
+	WRITE_COMPLETED_WITH_ERROR,
+	DISCARD_COMPLETED_NOTSUPP,
+	DISCARD_COMPLETED_WITH_ERROR,
+
+	ABORT_DISK_IO,
+	RESEND,
+	FAIL_FROZEN_DISK_IO,
+	RESTART_FROZEN_DISK_IO,
+	NOTHING,
+};
+
+/* encoding of request states for now.  we don't actually need that many bits.
+ * we don't need to do atomic bit operations either, since most of the time we
+ * need to look at the connection state and/or manipulate some lists at the
+ * same time, so we should hold the request lock anyways.
+ */
+enum drbd_req_state_bits {
+	/* 3210
+	 * 0000: no local possible
+	 * 0001: to be submitted
+	 *    UNUSED, we could map: 011: submitted, completion still pending
+	 * 0110: completed ok
+	 * 0010: completed with error
+	 * 1001: Aborted (before completion)
+	 * 1x10: Aborted and completed -> free
+	 */
+	__RQ_LOCAL_PENDING,
+	__RQ_LOCAL_COMPLETED,
+	__RQ_LOCAL_OK,
+	__RQ_LOCAL_ABORTED,
+
+	/* 87654
+	 * 00000: no network possible
+	 * 00001: to be send
+	 * 00011: to be send, on worker queue
+	 * 00101: sent, expecting recv_ack (B) or write_ack (C)
+	 * 11101: sent,
+	 *        recv_ack (B) or implicit "ack" (A),
+	 *        still waiting for the barrier ack.
+	 *        master_bio may already be completed and invalidated.
+	 * 11100: write acked (C),
+	 *        data received (for remote read, any protocol)
+	 *        or finally the barrier ack has arrived (B,A)...
+	 *        request can be freed
+	 * 01100: neg-acked (write, protocol C)
+	 *        or neg-d-acked (read, any protocol)
+	 *        or killed from the transfer log
+	 *        during cleanup after connection loss
+	 *        request can be freed
+	 * 01000: canceled or send failed...
+	 *        request can be freed
+	 */
+
+	/* if "SENT" is not set, yet, this can still fail or be canceled.
+	 * if "SENT" is set already, we still wait for an Ack packet.
+	 * when cleared, the master_bio may be completed.
+	 * in (B,A) the request object may still linger on the transaction log
+	 * until the corresponding barrier ack comes in */
+	__RQ_NET_PENDING,
+
+	/* If it is QUEUED, and it is a WRITE, it is also registered in the
+	 * transfer log. Currently we need this flag to avoid conflicts between
+	 * worker canceling the request and tl_clear_barrier killing it from
+	 * transfer log.  We should restructure the code so this conflict does
+	 * no longer occur. */
+	__RQ_NET_QUEUED,
+
+	/* well, actually only "handed over to the network stack".
+	 *
+	 * TODO can potentially be dropped because of the similar meaning
+	 * of RQ_NET_SENT and ~RQ_NET_QUEUED.
+	 * however it is not exactly the same. before we drop it
+	 * we must ensure that we can tell a request with network part
+	 * from a request without, regardless of what happens to it. */
+	__RQ_NET_SENT,
+
+	/* when set, the request may be freed (if RQ_NET_QUEUED is clear).
+	 * basically this means the corresponding P_BARRIER_ACK was received */
+	__RQ_NET_DONE,
+
+	/* whether or not we know (C) or pretend (B,A) that the write
+	 * was successfully written on the peer.
+	 */
+	__RQ_NET_OK,
+
+	/* peer called drbd_set_in_sync() for this write */
+	__RQ_NET_SIS,
+
+	/* keep this last, its for the RQ_NET_MASK */
+	__RQ_NET_MAX,
+
+	/* Set when this is a write, clear for a read */
+	__RQ_WRITE,
+	__RQ_WSAME,
+	__RQ_UNMAP,
+	__RQ_ZEROES,
+
+	/* Should call drbd_al_complete_io() for this request... */
+	__RQ_IN_ACT_LOG,
+
+	/* This was the most recent request during some blk_finish_plug()
+	 * or its implicit from-schedule equivalent.
+	 * We may use it as hint to send a P_UNPLUG_REMOTE */
+	__RQ_UNPLUG,
+
+	/* The peer has sent a retry ACK */
+	__RQ_POSTPONED,
+
+	/* would have been completed,
+	 * but was not, because of drbd_suspended() */
+	__RQ_COMPLETION_SUSP,
+
+	/* We expect a receive ACK (wire proto B) */
+	__RQ_EXP_RECEIVE_ACK,
+
+	/* We expect a write ACK (wite proto C) */
+	__RQ_EXP_WRITE_ACK,
+
+	/* waiting for a barrier ack, did an extra kref_get */
+	__RQ_EXP_BARR_ACK,
+};
+
+#define RQ_LOCAL_PENDING   (1UL << __RQ_LOCAL_PENDING)
+#define RQ_LOCAL_COMPLETED (1UL << __RQ_LOCAL_COMPLETED)
+#define RQ_LOCAL_OK        (1UL << __RQ_LOCAL_OK)
+#define RQ_LOCAL_ABORTED   (1UL << __RQ_LOCAL_ABORTED)
+
+#define RQ_LOCAL_MASK      ((RQ_LOCAL_ABORTED << 1)-1)
+
+#define RQ_NET_PENDING     (1UL << __RQ_NET_PENDING)
+#define RQ_NET_QUEUED      (1UL << __RQ_NET_QUEUED)
+#define RQ_NET_SENT        (1UL << __RQ_NET_SENT)
+#define RQ_NET_DONE        (1UL << __RQ_NET_DONE)
+#define RQ_NET_OK          (1UL << __RQ_NET_OK)
+#define RQ_NET_SIS         (1UL << __RQ_NET_SIS)
+
+#define RQ_NET_MASK        (((1UL << __RQ_NET_MAX)-1) & ~RQ_LOCAL_MASK)
+
+#define RQ_WRITE           (1UL << __RQ_WRITE)
+#define RQ_WSAME           (1UL << __RQ_WSAME)
+#define RQ_UNMAP           (1UL << __RQ_UNMAP)
+#define RQ_ZEROES          (1UL << __RQ_ZEROES)
+#define RQ_IN_ACT_LOG      (1UL << __RQ_IN_ACT_LOG)
+#define RQ_UNPLUG          (1UL << __RQ_UNPLUG)
+#define RQ_POSTPONED	   (1UL << __RQ_POSTPONED)
+#define RQ_COMPLETION_SUSP (1UL << __RQ_COMPLETION_SUSP)
+#define RQ_EXP_RECEIVE_ACK (1UL << __RQ_EXP_RECEIVE_ACK)
+#define RQ_EXP_WRITE_ACK   (1UL << __RQ_EXP_WRITE_ACK)
+#define RQ_EXP_BARR_ACK    (1UL << __RQ_EXP_BARR_ACK)
+
+/* For waking up the frozen transfer log mod_req() has to return if the request
+   should be counted in the epoch object*/
+#define MR_WRITE       1
+#define MR_READ        2
+
+/* Short lived temporary struct on the stack.
+ * We could squirrel the error to be returned into
+ * bio->bi_iter.bi_size, or similar. But that would be too ugly. */
+struct bio_and_error {
+	struct bio *bio;
+	int error;
+};
+
+extern void start_new_tl_epoch(struct drbd_connection *connection);
+extern void drbd_req_destroy(struct kref *kref);
+extern int __req_mod(struct drbd_request *req, enum drbd_req_event what,
+		struct bio_and_error *m);
+extern void complete_master_bio(struct drbd_device *device,
+		struct bio_and_error *m);
+extern void request_timer_fn(struct timer_list *t);
+extern void tl_restart(struct drbd_connection *connection, enum drbd_req_event what);
+extern void _tl_restart(struct drbd_connection *connection, enum drbd_req_event what);
+extern void tl_abort_disk_io(struct drbd_device *device);
+
+/* this is in drbd_main.c */
+extern void drbd_restart_request(struct drbd_request *req);
+
+/* use this if you don't want to deal with calling complete_master_bio()
+ * outside the spinlock, e.g. when walking some list on cleanup. */
+static inline int _req_mod(struct drbd_request *req, enum drbd_req_event what)
+{
+	struct drbd_device *device = req->device;
+	struct bio_and_error m;
+	int rv;
+
+	/* __req_mod possibly frees req, do not touch req after that! */
+	rv = __req_mod(req, what, &m);
+	if (m.bio)
+		complete_master_bio(device, &m);
+
+	return rv;
+}
+
+/* completion of master bio is outside of our spinlock.
+ * We still may or may not be inside some irqs disabled section
+ * of the lower level driver completion callback, so we need to
+ * spin_lock_irqsave here. */
+static inline int req_mod(struct drbd_request *req,
+		enum drbd_req_event what)
+{
+	unsigned long flags;
+	struct drbd_device *device = req->device;
+	struct bio_and_error m;
+	int rv;
+
+	spin_lock_irqsave(&device->resource->req_lock, flags);
+	rv = __req_mod(req, what, &m);
+	spin_unlock_irqrestore(&device->resource->req_lock, flags);
+
+	if (m.bio)
+		complete_master_bio(device, &m);
+
+	return rv;
+}
+
+extern bool drbd_should_do_remote(union drbd_dev_state);
+
+#endif
diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c
new file mode 100644
index 000000000..3f7bf9f2d
--- /dev/null
+++ b/drivers/block/drbd/drbd_state.c
@@ -0,0 +1,2388 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+   drbd_state.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+   Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
+   from Logicworks, Inc. for making SDP replication support possible.
+
+ */
+
+#include <linux/drbd_limits.h>
+#include "drbd_int.h"
+#include "drbd_protocol.h"
+#include "drbd_req.h"
+#include "drbd_state_change.h"
+
+struct after_state_chg_work {
+	struct drbd_work w;
+	struct drbd_device *device;
+	union drbd_state os;
+	union drbd_state ns;
+	enum chg_state_flags flags;
+	struct completion *done;
+	struct drbd_state_change *state_change;
+};
+
+enum sanitize_state_warnings {
+	NO_WARNING,
+	ABORTED_ONLINE_VERIFY,
+	ABORTED_RESYNC,
+	CONNECTION_LOST_NEGOTIATING,
+	IMPLICITLY_UPGRADED_DISK,
+	IMPLICITLY_UPGRADED_PDSK,
+};
+
+static void count_objects(struct drbd_resource *resource,
+			  unsigned int *n_devices,
+			  unsigned int *n_connections)
+{
+	struct drbd_device *device;
+	struct drbd_connection *connection;
+	int vnr;
+
+	*n_devices = 0;
+	*n_connections = 0;
+
+	idr_for_each_entry(&resource->devices, device, vnr)
+		(*n_devices)++;
+	for_each_connection(connection, resource)
+		(*n_connections)++;
+}
+
+static struct drbd_state_change *alloc_state_change(unsigned int n_devices, unsigned int n_connections, gfp_t gfp)
+{
+	struct drbd_state_change *state_change;
+	unsigned int size, n;
+
+	size = sizeof(struct drbd_state_change) +
+	       n_devices * sizeof(struct drbd_device_state_change) +
+	       n_connections * sizeof(struct drbd_connection_state_change) +
+	       n_devices * n_connections * sizeof(struct drbd_peer_device_state_change);
+	state_change = kmalloc(size, gfp);
+	if (!state_change)
+		return NULL;
+	state_change->n_devices = n_devices;
+	state_change->n_connections = n_connections;
+	state_change->devices = (void *)(state_change + 1);
+	state_change->connections = (void *)&state_change->devices[n_devices];
+	state_change->peer_devices = (void *)&state_change->connections[n_connections];
+	state_change->resource->resource = NULL;
+	for (n = 0; n < n_devices; n++)
+		state_change->devices[n].device = NULL;
+	for (n = 0; n < n_connections; n++)
+		state_change->connections[n].connection = NULL;
+	return state_change;
+}
+
+struct drbd_state_change *remember_old_state(struct drbd_resource *resource, gfp_t gfp)
+{
+	struct drbd_state_change *state_change;
+	struct drbd_device *device;
+	unsigned int n_devices;
+	struct drbd_connection *connection;
+	unsigned int n_connections;
+	int vnr;
+
+	struct drbd_device_state_change *device_state_change;
+	struct drbd_peer_device_state_change *peer_device_state_change;
+	struct drbd_connection_state_change *connection_state_change;
+
+	/* Caller holds req_lock spinlock.
+	 * No state, no device IDR, no connections lists can change. */
+	count_objects(resource, &n_devices, &n_connections);
+	state_change = alloc_state_change(n_devices, n_connections, gfp);
+	if (!state_change)
+		return NULL;
+
+	kref_get(&resource->kref);
+	state_change->resource->resource = resource;
+	state_change->resource->role[OLD] =
+		conn_highest_role(first_connection(resource));
+	state_change->resource->susp[OLD] = resource->susp;
+	state_change->resource->susp_nod[OLD] = resource->susp_nod;
+	state_change->resource->susp_fen[OLD] = resource->susp_fen;
+
+	connection_state_change = state_change->connections;
+	for_each_connection(connection, resource) {
+		kref_get(&connection->kref);
+		connection_state_change->connection = connection;
+		connection_state_change->cstate[OLD] =
+			connection->cstate;
+		connection_state_change->peer_role[OLD] =
+			conn_highest_peer(connection);
+		connection_state_change++;
+	}
+
+	device_state_change = state_change->devices;
+	peer_device_state_change = state_change->peer_devices;
+	idr_for_each_entry(&resource->devices, device, vnr) {
+		kref_get(&device->kref);
+		device_state_change->device = device;
+		device_state_change->disk_state[OLD] = device->state.disk;
+
+		/* The peer_devices for each device have to be enumerated in
+		   the order of the connections. We may not use for_each_peer_device() here. */
+		for_each_connection(connection, resource) {
+			struct drbd_peer_device *peer_device;
+
+			peer_device = conn_peer_device(connection, device->vnr);
+			peer_device_state_change->peer_device = peer_device;
+			peer_device_state_change->disk_state[OLD] =
+				device->state.pdsk;
+			peer_device_state_change->repl_state[OLD] =
+				max_t(enum drbd_conns,
+				      C_WF_REPORT_PARAMS, device->state.conn);
+			peer_device_state_change->resync_susp_user[OLD] =
+				device->state.user_isp;
+			peer_device_state_change->resync_susp_peer[OLD] =
+				device->state.peer_isp;
+			peer_device_state_change->resync_susp_dependency[OLD] =
+				device->state.aftr_isp;
+			peer_device_state_change++;
+		}
+		device_state_change++;
+	}
+
+	return state_change;
+}
+
+static void remember_new_state(struct drbd_state_change *state_change)
+{
+	struct drbd_resource_state_change *resource_state_change;
+	struct drbd_resource *resource;
+	unsigned int n;
+
+	if (!state_change)
+		return;
+
+	resource_state_change = &state_change->resource[0];
+	resource = resource_state_change->resource;
+
+	resource_state_change->role[NEW] =
+		conn_highest_role(first_connection(resource));
+	resource_state_change->susp[NEW] = resource->susp;
+	resource_state_change->susp_nod[NEW] = resource->susp_nod;
+	resource_state_change->susp_fen[NEW] = resource->susp_fen;
+
+	for (n = 0; n < state_change->n_devices; n++) {
+		struct drbd_device_state_change *device_state_change =
+			&state_change->devices[n];
+		struct drbd_device *device = device_state_change->device;
+
+		device_state_change->disk_state[NEW] = device->state.disk;
+	}
+
+	for (n = 0; n < state_change->n_connections; n++) {
+		struct drbd_connection_state_change *connection_state_change =
+			&state_change->connections[n];
+		struct drbd_connection *connection =
+			connection_state_change->connection;
+
+		connection_state_change->cstate[NEW] = connection->cstate;
+		connection_state_change->peer_role[NEW] =
+			conn_highest_peer(connection);
+	}
+
+	for (n = 0; n < state_change->n_devices * state_change->n_connections; n++) {
+		struct drbd_peer_device_state_change *peer_device_state_change =
+			&state_change->peer_devices[n];
+		struct drbd_device *device =
+			peer_device_state_change->peer_device->device;
+		union drbd_dev_state state = device->state;
+
+		peer_device_state_change->disk_state[NEW] = state.pdsk;
+		peer_device_state_change->repl_state[NEW] =
+			max_t(enum drbd_conns, C_WF_REPORT_PARAMS, state.conn);
+		peer_device_state_change->resync_susp_user[NEW] =
+			state.user_isp;
+		peer_device_state_change->resync_susp_peer[NEW] =
+			state.peer_isp;
+		peer_device_state_change->resync_susp_dependency[NEW] =
+			state.aftr_isp;
+	}
+}
+
+void copy_old_to_new_state_change(struct drbd_state_change *state_change)
+{
+	struct drbd_resource_state_change *resource_state_change = &state_change->resource[0];
+	unsigned int n_device, n_connection, n_peer_device, n_peer_devices;
+
+#define OLD_TO_NEW(x) \
+	(x[NEW] = x[OLD])
+
+	OLD_TO_NEW(resource_state_change->role);
+	OLD_TO_NEW(resource_state_change->susp);
+	OLD_TO_NEW(resource_state_change->susp_nod);
+	OLD_TO_NEW(resource_state_change->susp_fen);
+
+	for (n_connection = 0; n_connection < state_change->n_connections; n_connection++) {
+		struct drbd_connection_state_change *connection_state_change =
+				&state_change->connections[n_connection];
+
+		OLD_TO_NEW(connection_state_change->peer_role);
+		OLD_TO_NEW(connection_state_change->cstate);
+	}
+
+	for (n_device = 0; n_device < state_change->n_devices; n_device++) {
+		struct drbd_device_state_change *device_state_change =
+			&state_change->devices[n_device];
+
+		OLD_TO_NEW(device_state_change->disk_state);
+	}
+
+	n_peer_devices = state_change->n_devices * state_change->n_connections;
+	for (n_peer_device = 0; n_peer_device < n_peer_devices; n_peer_device++) {
+		struct drbd_peer_device_state_change *p =
+			&state_change->peer_devices[n_peer_device];
+
+		OLD_TO_NEW(p->disk_state);
+		OLD_TO_NEW(p->repl_state);
+		OLD_TO_NEW(p->resync_susp_user);
+		OLD_TO_NEW(p->resync_susp_peer);
+		OLD_TO_NEW(p->resync_susp_dependency);
+	}
+
+#undef OLD_TO_NEW
+}
+
+void forget_state_change(struct drbd_state_change *state_change)
+{
+	unsigned int n;
+
+	if (!state_change)
+		return;
+
+	if (state_change->resource->resource)
+		kref_put(&state_change->resource->resource->kref, drbd_destroy_resource);
+	for (n = 0; n < state_change->n_devices; n++) {
+		struct drbd_device *device = state_change->devices[n].device;
+
+		if (device)
+			kref_put(&device->kref, drbd_destroy_device);
+	}
+	for (n = 0; n < state_change->n_connections; n++) {
+		struct drbd_connection *connection =
+			state_change->connections[n].connection;
+
+		if (connection)
+			kref_put(&connection->kref, drbd_destroy_connection);
+	}
+	kfree(state_change);
+}
+
+static int w_after_state_ch(struct drbd_work *w, int unused);
+static void after_state_ch(struct drbd_device *device, union drbd_state os,
+			   union drbd_state ns, enum chg_state_flags flags,
+			   struct drbd_state_change *);
+static enum drbd_state_rv is_valid_state(struct drbd_device *, union drbd_state);
+static enum drbd_state_rv is_valid_soft_transition(union drbd_state, union drbd_state, struct drbd_connection *);
+static enum drbd_state_rv is_valid_transition(union drbd_state os, union drbd_state ns);
+static union drbd_state sanitize_state(struct drbd_device *device, union drbd_state os,
+				       union drbd_state ns, enum sanitize_state_warnings *warn);
+
+static inline bool is_susp(union drbd_state s)
+{
+        return s.susp || s.susp_nod || s.susp_fen;
+}
+
+bool conn_all_vols_unconf(struct drbd_connection *connection)
+{
+	struct drbd_peer_device *peer_device;
+	bool rv = true;
+	int vnr;
+
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+		struct drbd_device *device = peer_device->device;
+		if (device->state.disk != D_DISKLESS ||
+		    device->state.conn != C_STANDALONE ||
+		    device->state.role != R_SECONDARY) {
+			rv = false;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return rv;
+}
+
+/* Unfortunately the states where not correctly ordered, when
+   they where defined. therefore can not use max_t() here. */
+static enum drbd_role max_role(enum drbd_role role1, enum drbd_role role2)
+{
+	if (role1 == R_PRIMARY || role2 == R_PRIMARY)
+		return R_PRIMARY;
+	if (role1 == R_SECONDARY || role2 == R_SECONDARY)
+		return R_SECONDARY;
+	return R_UNKNOWN;
+}
+
+static enum drbd_role min_role(enum drbd_role role1, enum drbd_role role2)
+{
+	if (role1 == R_UNKNOWN || role2 == R_UNKNOWN)
+		return R_UNKNOWN;
+	if (role1 == R_SECONDARY || role2 == R_SECONDARY)
+		return R_SECONDARY;
+	return R_PRIMARY;
+}
+
+enum drbd_role conn_highest_role(struct drbd_connection *connection)
+{
+	enum drbd_role role = R_SECONDARY;
+	struct drbd_peer_device *peer_device;
+	int vnr;
+
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+		struct drbd_device *device = peer_device->device;
+		role = max_role(role, device->state.role);
+	}
+	rcu_read_unlock();
+
+	return role;
+}
+
+enum drbd_role conn_highest_peer(struct drbd_connection *connection)
+{
+	enum drbd_role peer = R_UNKNOWN;
+	struct drbd_peer_device *peer_device;
+	int vnr;
+
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+		struct drbd_device *device = peer_device->device;
+		peer = max_role(peer, device->state.peer);
+	}
+	rcu_read_unlock();
+
+	return peer;
+}
+
+enum drbd_disk_state conn_highest_disk(struct drbd_connection *connection)
+{
+	enum drbd_disk_state disk_state = D_DISKLESS;
+	struct drbd_peer_device *peer_device;
+	int vnr;
+
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+		struct drbd_device *device = peer_device->device;
+		disk_state = max_t(enum drbd_disk_state, disk_state, device->state.disk);
+	}
+	rcu_read_unlock();
+
+	return disk_state;
+}
+
+enum drbd_disk_state conn_lowest_disk(struct drbd_connection *connection)
+{
+	enum drbd_disk_state disk_state = D_MASK;
+	struct drbd_peer_device *peer_device;
+	int vnr;
+
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+		struct drbd_device *device = peer_device->device;
+		disk_state = min_t(enum drbd_disk_state, disk_state, device->state.disk);
+	}
+	rcu_read_unlock();
+
+	return disk_state;
+}
+
+enum drbd_disk_state conn_highest_pdsk(struct drbd_connection *connection)
+{
+	enum drbd_disk_state disk_state = D_DISKLESS;
+	struct drbd_peer_device *peer_device;
+	int vnr;
+
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+		struct drbd_device *device = peer_device->device;
+		disk_state = max_t(enum drbd_disk_state, disk_state, device->state.pdsk);
+	}
+	rcu_read_unlock();
+
+	return disk_state;
+}
+
+enum drbd_conns conn_lowest_conn(struct drbd_connection *connection)
+{
+	enum drbd_conns conn = C_MASK;
+	struct drbd_peer_device *peer_device;
+	int vnr;
+
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+		struct drbd_device *device = peer_device->device;
+		conn = min_t(enum drbd_conns, conn, device->state.conn);
+	}
+	rcu_read_unlock();
+
+	return conn;
+}
+
+static bool no_peer_wf_report_params(struct drbd_connection *connection)
+{
+	struct drbd_peer_device *peer_device;
+	int vnr;
+	bool rv = true;
+
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
+		if (peer_device->device->state.conn == C_WF_REPORT_PARAMS) {
+			rv = false;
+			break;
+		}
+	rcu_read_unlock();
+
+	return rv;
+}
+
+static void wake_up_all_devices(struct drbd_connection *connection)
+{
+	struct drbd_peer_device *peer_device;
+	int vnr;
+
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
+		wake_up(&peer_device->device->state_wait);
+	rcu_read_unlock();
+
+}
+
+
+/**
+ * cl_wide_st_chg() - true if the state change is a cluster wide one
+ * @device:	DRBD device.
+ * @os:		old (current) state.
+ * @ns:		new (wanted) state.
+ */
+static int cl_wide_st_chg(struct drbd_device *device,
+			  union drbd_state os, union drbd_state ns)
+{
+	return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
+		 ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
+		  (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
+		  (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
+		  (os.disk != D_FAILED && ns.disk == D_FAILED))) ||
+		(os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
+		(os.conn == C_CONNECTED && ns.conn == C_VERIFY_S) ||
+		(os.conn == C_CONNECTED && ns.conn == C_WF_REPORT_PARAMS);
+}
+
+static union drbd_state
+apply_mask_val(union drbd_state os, union drbd_state mask, union drbd_state val)
+{
+	union drbd_state ns;
+	ns.i = (os.i & ~mask.i) | val.i;
+	return ns;
+}
+
+enum drbd_state_rv
+drbd_change_state(struct drbd_device *device, enum chg_state_flags f,
+		  union drbd_state mask, union drbd_state val)
+{
+	unsigned long flags;
+	union drbd_state ns;
+	enum drbd_state_rv rv;
+
+	spin_lock_irqsave(&device->resource->req_lock, flags);
+	ns = apply_mask_val(drbd_read_state(device), mask, val);
+	rv = _drbd_set_state(device, ns, f, NULL);
+	spin_unlock_irqrestore(&device->resource->req_lock, flags);
+
+	return rv;
+}
+
+/**
+ * drbd_force_state() - Impose a change which happens outside our control on our state
+ * @device:	DRBD device.
+ * @mask:	mask of state bits to change.
+ * @val:	value of new state bits.
+ */
+void drbd_force_state(struct drbd_device *device,
+	union drbd_state mask, union drbd_state val)
+{
+	drbd_change_state(device, CS_HARD, mask, val);
+}
+
+static enum drbd_state_rv
+_req_st_cond(struct drbd_device *device, union drbd_state mask,
+	     union drbd_state val)
+{
+	union drbd_state os, ns;
+	unsigned long flags;
+	enum drbd_state_rv rv;
+
+	if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &device->flags))
+		return SS_CW_SUCCESS;
+
+	if (test_and_clear_bit(CL_ST_CHG_FAIL, &device->flags))
+		return SS_CW_FAILED_BY_PEER;
+
+	spin_lock_irqsave(&device->resource->req_lock, flags);
+	os = drbd_read_state(device);
+	ns = sanitize_state(device, os, apply_mask_val(os, mask, val), NULL);
+	rv = is_valid_transition(os, ns);
+	if (rv >= SS_SUCCESS)
+		rv = SS_UNKNOWN_ERROR;  /* cont waiting, otherwise fail. */
+
+	if (!cl_wide_st_chg(device, os, ns))
+		rv = SS_CW_NO_NEED;
+	if (rv == SS_UNKNOWN_ERROR) {
+		rv = is_valid_state(device, ns);
+		if (rv >= SS_SUCCESS) {
+			rv = is_valid_soft_transition(os, ns, first_peer_device(device)->connection);
+			if (rv >= SS_SUCCESS)
+				rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */
+		}
+	}
+	spin_unlock_irqrestore(&device->resource->req_lock, flags);
+
+	return rv;
+}
+
+/**
+ * drbd_req_state() - Perform an eventually cluster wide state change
+ * @device:	DRBD device.
+ * @mask:	mask of state bits to change.
+ * @val:	value of new state bits.
+ * @f:		flags
+ *
+ * Should not be called directly, use drbd_request_state() or
+ * _drbd_request_state().
+ */
+static enum drbd_state_rv
+drbd_req_state(struct drbd_device *device, union drbd_state mask,
+	       union drbd_state val, enum chg_state_flags f)
+{
+	struct completion done;
+	unsigned long flags;
+	union drbd_state os, ns;
+	enum drbd_state_rv rv;
+	void *buffer = NULL;
+
+	init_completion(&done);
+
+	if (f & CS_SERIALIZE)
+		mutex_lock(device->state_mutex);
+	if (f & CS_INHIBIT_MD_IO)
+		buffer = drbd_md_get_buffer(device, __func__);
+
+	spin_lock_irqsave(&device->resource->req_lock, flags);
+	os = drbd_read_state(device);
+	ns = sanitize_state(device, os, apply_mask_val(os, mask, val), NULL);
+	rv = is_valid_transition(os, ns);
+	if (rv < SS_SUCCESS) {
+		spin_unlock_irqrestore(&device->resource->req_lock, flags);
+		goto abort;
+	}
+
+	if (cl_wide_st_chg(device, os, ns)) {
+		rv = is_valid_state(device, ns);
+		if (rv == SS_SUCCESS)
+			rv = is_valid_soft_transition(os, ns, first_peer_device(device)->connection);
+		spin_unlock_irqrestore(&device->resource->req_lock, flags);
+
+		if (rv < SS_SUCCESS) {
+			if (f & CS_VERBOSE)
+				print_st_err(device, os, ns, rv);
+			goto abort;
+		}
+
+		if (drbd_send_state_req(first_peer_device(device), mask, val)) {
+			rv = SS_CW_FAILED_BY_PEER;
+			if (f & CS_VERBOSE)
+				print_st_err(device, os, ns, rv);
+			goto abort;
+		}
+
+		wait_event(device->state_wait,
+			(rv = _req_st_cond(device, mask, val)));
+
+		if (rv < SS_SUCCESS) {
+			if (f & CS_VERBOSE)
+				print_st_err(device, os, ns, rv);
+			goto abort;
+		}
+		spin_lock_irqsave(&device->resource->req_lock, flags);
+		ns = apply_mask_val(drbd_read_state(device), mask, val);
+		rv = _drbd_set_state(device, ns, f, &done);
+	} else {
+		rv = _drbd_set_state(device, ns, f, &done);
+	}
+
+	spin_unlock_irqrestore(&device->resource->req_lock, flags);
+
+	if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
+		D_ASSERT(device, current != first_peer_device(device)->connection->worker.task);
+		wait_for_completion(&done);
+	}
+
+abort:
+	if (buffer)
+		drbd_md_put_buffer(device);
+	if (f & CS_SERIALIZE)
+		mutex_unlock(device->state_mutex);
+
+	return rv;
+}
+
+/**
+ * _drbd_request_state() - Request a state change (with flags)
+ * @device:	DRBD device.
+ * @mask:	mask of state bits to change.
+ * @val:	value of new state bits.
+ * @f:		flags
+ *
+ * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
+ * flag, or when logging of failed state change requests is not desired.
+ */
+enum drbd_state_rv
+_drbd_request_state(struct drbd_device *device, union drbd_state mask,
+		    union drbd_state val, enum chg_state_flags f)
+{
+	enum drbd_state_rv rv;
+
+	wait_event(device->state_wait,
+		   (rv = drbd_req_state(device, mask, val, f)) != SS_IN_TRANSIENT_STATE);
+
+	return rv;
+}
+
+/*
+ * We grab drbd_md_get_buffer(), because we don't want to "fail" the disk while
+ * there is IO in-flight: the transition into D_FAILED for detach purposes
+ * may get misinterpreted as actual IO error in a confused endio function.
+ *
+ * We wrap it all into wait_event(), to retry in case the drbd_req_state()
+ * returns SS_IN_TRANSIENT_STATE.
+ *
+ * To avoid potential deadlock with e.g. the receiver thread trying to grab
+ * drbd_md_get_buffer() while trying to get out of the "transient state", we
+ * need to grab and release the meta data buffer inside of that wait_event loop.
+ */
+static enum drbd_state_rv
+request_detach(struct drbd_device *device)
+{
+	return drbd_req_state(device, NS(disk, D_FAILED),
+			CS_VERBOSE | CS_ORDERED | CS_INHIBIT_MD_IO);
+}
+
+int drbd_request_detach_interruptible(struct drbd_device *device)
+{
+	int ret, rv;
+
+	drbd_suspend_io(device); /* so no-one is stuck in drbd_al_begin_io */
+	wait_event_interruptible(device->state_wait,
+		(rv = request_detach(device)) != SS_IN_TRANSIENT_STATE);
+	drbd_resume_io(device);
+
+	ret = wait_event_interruptible(device->misc_wait,
+			device->state.disk != D_FAILED);
+
+	if (rv == SS_IS_DISKLESS)
+		rv = SS_NOTHING_TO_DO;
+	if (ret)
+		rv = ERR_INTR;
+
+	return rv;
+}
+
+enum drbd_state_rv
+_drbd_request_state_holding_state_mutex(struct drbd_device *device, union drbd_state mask,
+		    union drbd_state val, enum chg_state_flags f)
+{
+	enum drbd_state_rv rv;
+
+	BUG_ON(f & CS_SERIALIZE);
+
+	wait_event_cmd(device->state_wait,
+		       (rv = drbd_req_state(device, mask, val, f)) != SS_IN_TRANSIENT_STATE,
+		       mutex_unlock(device->state_mutex),
+		       mutex_lock(device->state_mutex));
+
+	return rv;
+}
+
+static void print_st(struct drbd_device *device, const char *name, union drbd_state ns)
+{
+	drbd_err(device, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c%c%c }\n",
+	    name,
+	    drbd_conn_str(ns.conn),
+	    drbd_role_str(ns.role),
+	    drbd_role_str(ns.peer),
+	    drbd_disk_str(ns.disk),
+	    drbd_disk_str(ns.pdsk),
+	    is_susp(ns) ? 's' : 'r',
+	    ns.aftr_isp ? 'a' : '-',
+	    ns.peer_isp ? 'p' : '-',
+	    ns.user_isp ? 'u' : '-',
+	    ns.susp_fen ? 'F' : '-',
+	    ns.susp_nod ? 'N' : '-'
+	    );
+}
+
+void print_st_err(struct drbd_device *device, union drbd_state os,
+	          union drbd_state ns, enum drbd_state_rv err)
+{
+	if (err == SS_IN_TRANSIENT_STATE)
+		return;
+	drbd_err(device, "State change failed: %s\n", drbd_set_st_err_str(err));
+	print_st(device, " state", os);
+	print_st(device, "wanted", ns);
+}
+
+static long print_state_change(char *pb, union drbd_state os, union drbd_state ns,
+			       enum chg_state_flags flags)
+{
+	char *pbp;
+	pbp = pb;
+	*pbp = 0;
+
+	if (ns.role != os.role && flags & CS_DC_ROLE)
+		pbp += sprintf(pbp, "role( %s -> %s ) ",
+			       drbd_role_str(os.role),
+			       drbd_role_str(ns.role));
+	if (ns.peer != os.peer && flags & CS_DC_PEER)
+		pbp += sprintf(pbp, "peer( %s -> %s ) ",
+			       drbd_role_str(os.peer),
+			       drbd_role_str(ns.peer));
+	if (ns.conn != os.conn && flags & CS_DC_CONN)
+		pbp += sprintf(pbp, "conn( %s -> %s ) ",
+			       drbd_conn_str(os.conn),
+			       drbd_conn_str(ns.conn));
+	if (ns.disk != os.disk && flags & CS_DC_DISK)
+		pbp += sprintf(pbp, "disk( %s -> %s ) ",
+			       drbd_disk_str(os.disk),
+			       drbd_disk_str(ns.disk));
+	if (ns.pdsk != os.pdsk && flags & CS_DC_PDSK)
+		pbp += sprintf(pbp, "pdsk( %s -> %s ) ",
+			       drbd_disk_str(os.pdsk),
+			       drbd_disk_str(ns.pdsk));
+
+	return pbp - pb;
+}
+
+static void drbd_pr_state_change(struct drbd_device *device, union drbd_state os, union drbd_state ns,
+				 enum chg_state_flags flags)
+{
+	char pb[300];
+	char *pbp = pb;
+
+	pbp += print_state_change(pbp, os, ns, flags ^ CS_DC_MASK);
+
+	if (ns.aftr_isp != os.aftr_isp)
+		pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ",
+			       os.aftr_isp,
+			       ns.aftr_isp);
+	if (ns.peer_isp != os.peer_isp)
+		pbp += sprintf(pbp, "peer_isp( %d -> %d ) ",
+			       os.peer_isp,
+			       ns.peer_isp);
+	if (ns.user_isp != os.user_isp)
+		pbp += sprintf(pbp, "user_isp( %d -> %d ) ",
+			       os.user_isp,
+			       ns.user_isp);
+
+	if (pbp != pb)
+		drbd_info(device, "%s\n", pb);
+}
+
+static void conn_pr_state_change(struct drbd_connection *connection, union drbd_state os, union drbd_state ns,
+				 enum chg_state_flags flags)
+{
+	char pb[300];
+	char *pbp = pb;
+
+	pbp += print_state_change(pbp, os, ns, flags);
+
+	if (is_susp(ns) != is_susp(os) && flags & CS_DC_SUSP)
+		pbp += sprintf(pbp, "susp( %d -> %d ) ",
+			       is_susp(os),
+			       is_susp(ns));
+
+	if (pbp != pb)
+		drbd_info(connection, "%s\n", pb);
+}
+
+
+/**
+ * is_valid_state() - Returns an SS_ error code if ns is not valid
+ * @device:	DRBD device.
+ * @ns:		State to consider.
+ */
+static enum drbd_state_rv
+is_valid_state(struct drbd_device *device, union drbd_state ns)
+{
+	/* See drbd_state_sw_errors in drbd_strings.c */
+
+	enum drbd_fencing_p fp;
+	enum drbd_state_rv rv = SS_SUCCESS;
+	struct net_conf *nc;
+
+	rcu_read_lock();
+	fp = FP_DONT_CARE;
+	if (get_ldev(device)) {
+		fp = rcu_dereference(device->ldev->disk_conf)->fencing;
+		put_ldev(device);
+	}
+
+	nc = rcu_dereference(first_peer_device(device)->connection->net_conf);
+	if (nc) {
+		if (!nc->two_primaries && ns.role == R_PRIMARY) {
+			if (ns.peer == R_PRIMARY)
+				rv = SS_TWO_PRIMARIES;
+			else if (conn_highest_peer(first_peer_device(device)->connection) == R_PRIMARY)
+				rv = SS_O_VOL_PEER_PRI;
+		}
+	}
+
+	if (rv <= 0)
+		goto out; /* already found a reason to abort */
+	else if (ns.role == R_SECONDARY && device->open_cnt)
+		rv = SS_DEVICE_IN_USE;
+
+	else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
+		rv = SS_NO_UP_TO_DATE_DISK;
+
+	else if (fp >= FP_RESOURCE &&
+		 ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
+		rv = SS_PRIMARY_NOP;
+
+	else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
+		rv = SS_NO_UP_TO_DATE_DISK;
+
+	else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
+		rv = SS_NO_LOCAL_DISK;
+
+	else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
+		rv = SS_NO_REMOTE_DISK;
+
+	else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
+		rv = SS_NO_UP_TO_DATE_DISK;
+
+	else if ((ns.conn == C_CONNECTED ||
+		  ns.conn == C_WF_BITMAP_S ||
+		  ns.conn == C_SYNC_SOURCE ||
+		  ns.conn == C_PAUSED_SYNC_S) &&
+		  ns.disk == D_OUTDATED)
+		rv = SS_CONNECTED_OUTDATES;
+
+	else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
+		 (nc->verify_alg[0] == 0))
+		rv = SS_NO_VERIFY_ALG;
+
+	else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
+		  first_peer_device(device)->connection->agreed_pro_version < 88)
+		rv = SS_NOT_SUPPORTED;
+
+	else if (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
+		rv = SS_NO_UP_TO_DATE_DISK;
+
+	else if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
+                 ns.pdsk == D_UNKNOWN)
+		rv = SS_NEED_CONNECTION;
+
+	else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN)
+		rv = SS_CONNECTED_OUTDATES;
+
+out:
+	rcu_read_unlock();
+
+	return rv;
+}
+
+/**
+ * is_valid_soft_transition() - Returns an SS_ error code if the state transition is not possible
+ * This function limits state transitions that may be declined by DRBD. I.e.
+ * user requests (aka soft transitions).
+ * @os:		old state.
+ * @ns:		new state.
+ * @connection:  DRBD connection.
+ */
+static enum drbd_state_rv
+is_valid_soft_transition(union drbd_state os, union drbd_state ns, struct drbd_connection *connection)
+{
+	enum drbd_state_rv rv = SS_SUCCESS;
+
+	if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
+	    os.conn > C_CONNECTED)
+		rv = SS_RESYNC_RUNNING;
+
+	if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
+		rv = SS_ALREADY_STANDALONE;
+
+	if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
+		rv = SS_IS_DISKLESS;
+
+	if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
+		rv = SS_NO_NET_CONFIG;
+
+	if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
+		rv = SS_LOWER_THAN_OUTDATED;
+
+	if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
+		rv = SS_IN_TRANSIENT_STATE;
+
+	/* While establishing a connection only allow cstate to change.
+	   Delay/refuse role changes, detach attach etc... (they do not touch cstate) */
+	if (test_bit(STATE_SENT, &connection->flags) &&
+	    !((ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION) ||
+	      (ns.conn >= C_CONNECTED && os.conn == C_WF_REPORT_PARAMS)))
+		rv = SS_IN_TRANSIENT_STATE;
+
+	/* Do not promote during resync handshake triggered by "force primary".
+	 * This is a hack. It should really be rejected by the peer during the
+	 * cluster wide state change request. */
+	if (os.role != R_PRIMARY && ns.role == R_PRIMARY
+		&& ns.pdsk == D_UP_TO_DATE
+		&& ns.disk != D_UP_TO_DATE && ns.disk != D_DISKLESS
+		&& (ns.conn <= C_WF_SYNC_UUID || ns.conn != os.conn))
+			rv = SS_IN_TRANSIENT_STATE;
+
+	if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
+		rv = SS_NEED_CONNECTION;
+
+	if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
+	    ns.conn != os.conn && os.conn > C_CONNECTED)
+		rv = SS_RESYNC_RUNNING;
+
+	if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
+	    os.conn < C_CONNECTED)
+		rv = SS_NEED_CONNECTION;
+
+	if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)
+	    && os.conn < C_WF_REPORT_PARAMS)
+		rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */
+
+	if (ns.conn == C_DISCONNECTING && ns.pdsk == D_OUTDATED &&
+	    os.conn < C_CONNECTED && os.pdsk > D_OUTDATED)
+		rv = SS_OUTDATE_WO_CONN;
+
+	return rv;
+}
+
+static enum drbd_state_rv
+is_valid_conn_transition(enum drbd_conns oc, enum drbd_conns nc)
+{
+	/* no change -> nothing to do, at least for the connection part */
+	if (oc == nc)
+		return SS_NOTHING_TO_DO;
+
+	/* disconnect of an unconfigured connection does not make sense */
+	if (oc == C_STANDALONE && nc == C_DISCONNECTING)
+		return SS_ALREADY_STANDALONE;
+
+	/* from C_STANDALONE, we start with C_UNCONNECTED */
+	if (oc == C_STANDALONE && nc != C_UNCONNECTED)
+		return SS_NEED_CONNECTION;
+
+	/* When establishing a connection we need to go through WF_REPORT_PARAMS!
+	   Necessary to do the right thing upon invalidate-remote on a disconnected resource */
+	if (oc < C_WF_REPORT_PARAMS && nc >= C_CONNECTED)
+		return SS_NEED_CONNECTION;
+
+	/* After a network error only C_UNCONNECTED or C_DISCONNECTING may follow. */
+	if (oc >= C_TIMEOUT && oc <= C_TEAR_DOWN && nc != C_UNCONNECTED && nc != C_DISCONNECTING)
+		return SS_IN_TRANSIENT_STATE;
+
+	/* After C_DISCONNECTING only C_STANDALONE may follow */
+	if (oc == C_DISCONNECTING && nc != C_STANDALONE)
+		return SS_IN_TRANSIENT_STATE;
+
+	return SS_SUCCESS;
+}
+
+
+/**
+ * is_valid_transition() - Returns an SS_ error code if the state transition is not possible
+ * This limits hard state transitions. Hard state transitions are facts there are
+ * imposed on DRBD by the environment. E.g. disk broke or network broke down.
+ * But those hard state transitions are still not allowed to do everything.
+ * @ns:		new state.
+ * @os:		old state.
+ */
+static enum drbd_state_rv
+is_valid_transition(union drbd_state os, union drbd_state ns)
+{
+	enum drbd_state_rv rv;
+
+	rv = is_valid_conn_transition(os.conn, ns.conn);
+
+	/* we cannot fail (again) if we already detached */
+	if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
+		rv = SS_IS_DISKLESS;
+
+	return rv;
+}
+
+static void print_sanitize_warnings(struct drbd_device *device, enum sanitize_state_warnings warn)
+{
+	static const char *msg_table[] = {
+		[NO_WARNING] = "",
+		[ABORTED_ONLINE_VERIFY] = "Online-verify aborted.",
+		[ABORTED_RESYNC] = "Resync aborted.",
+		[CONNECTION_LOST_NEGOTIATING] = "Connection lost while negotiating, no data!",
+		[IMPLICITLY_UPGRADED_DISK] = "Implicitly upgraded disk",
+		[IMPLICITLY_UPGRADED_PDSK] = "Implicitly upgraded pdsk",
+	};
+
+	if (warn != NO_WARNING)
+		drbd_warn(device, "%s\n", msg_table[warn]);
+}
+
+/**
+ * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
+ * @device:	DRBD device.
+ * @os:		old state.
+ * @ns:		new state.
+ * @warn:	placeholder for returned state warning.
+ *
+ * When we loose connection, we have to set the state of the peers disk (pdsk)
+ * to D_UNKNOWN. This rule and many more along those lines are in this function.
+ */
+static union drbd_state sanitize_state(struct drbd_device *device, union drbd_state os,
+				       union drbd_state ns, enum sanitize_state_warnings *warn)
+{
+	enum drbd_fencing_p fp;
+	enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;
+
+	if (warn)
+		*warn = NO_WARNING;
+
+	fp = FP_DONT_CARE;
+	if (get_ldev(device)) {
+		rcu_read_lock();
+		fp = rcu_dereference(device->ldev->disk_conf)->fencing;
+		rcu_read_unlock();
+		put_ldev(device);
+	}
+
+	/* Implications from connection to peer and peer_isp */
+	if (ns.conn < C_CONNECTED) {
+		ns.peer_isp = 0;
+		ns.peer = R_UNKNOWN;
+		if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
+			ns.pdsk = D_UNKNOWN;
+	}
+
+	/* Clear the aftr_isp when becoming unconfigured */
+	if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
+		ns.aftr_isp = 0;
+
+	/* An implication of the disk states onto the connection state */
+	/* Abort resync if a disk fails/detaches */
+	if (ns.conn > C_CONNECTED && (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
+		if (warn)
+			*warn = ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T ?
+				ABORTED_ONLINE_VERIFY : ABORTED_RESYNC;
+		ns.conn = C_CONNECTED;
+	}
+
+	/* Connection breaks down before we finished "Negotiating" */
+	if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
+	    get_ldev_if_state(device, D_NEGOTIATING)) {
+		if (device->ed_uuid == device->ldev->md.uuid[UI_CURRENT]) {
+			ns.disk = device->new_state_tmp.disk;
+			ns.pdsk = device->new_state_tmp.pdsk;
+		} else {
+			if (warn)
+				*warn = CONNECTION_LOST_NEGOTIATING;
+			ns.disk = D_DISKLESS;
+			ns.pdsk = D_UNKNOWN;
+		}
+		put_ldev(device);
+	}
+
+	/* D_CONSISTENT and D_OUTDATED vanish when we get connected */
+	if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) {
+		if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED)
+			ns.disk = D_UP_TO_DATE;
+		if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)
+			ns.pdsk = D_UP_TO_DATE;
+	}
+
+	/* Implications of the connection state on the disk states */
+	disk_min = D_DISKLESS;
+	disk_max = D_UP_TO_DATE;
+	pdsk_min = D_INCONSISTENT;
+	pdsk_max = D_UNKNOWN;
+	switch ((enum drbd_conns)ns.conn) {
+	case C_WF_BITMAP_T:
+	case C_PAUSED_SYNC_T:
+	case C_STARTING_SYNC_T:
+	case C_WF_SYNC_UUID:
+	case C_BEHIND:
+		disk_min = D_INCONSISTENT;
+		disk_max = D_OUTDATED;
+		pdsk_min = D_UP_TO_DATE;
+		pdsk_max = D_UP_TO_DATE;
+		break;
+	case C_VERIFY_S:
+	case C_VERIFY_T:
+		disk_min = D_UP_TO_DATE;
+		disk_max = D_UP_TO_DATE;
+		pdsk_min = D_UP_TO_DATE;
+		pdsk_max = D_UP_TO_DATE;
+		break;
+	case C_CONNECTED:
+		disk_min = D_DISKLESS;
+		disk_max = D_UP_TO_DATE;
+		pdsk_min = D_DISKLESS;
+		pdsk_max = D_UP_TO_DATE;
+		break;
+	case C_WF_BITMAP_S:
+	case C_PAUSED_SYNC_S:
+	case C_STARTING_SYNC_S:
+	case C_AHEAD:
+		disk_min = D_UP_TO_DATE;
+		disk_max = D_UP_TO_DATE;
+		pdsk_min = D_INCONSISTENT;
+		pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary*/
+		break;
+	case C_SYNC_TARGET:
+		disk_min = D_INCONSISTENT;
+		disk_max = D_INCONSISTENT;
+		pdsk_min = D_UP_TO_DATE;
+		pdsk_max = D_UP_TO_DATE;
+		break;
+	case C_SYNC_SOURCE:
+		disk_min = D_UP_TO_DATE;
+		disk_max = D_UP_TO_DATE;
+		pdsk_min = D_INCONSISTENT;
+		pdsk_max = D_INCONSISTENT;
+		break;
+	case C_STANDALONE:
+	case C_DISCONNECTING:
+	case C_UNCONNECTED:
+	case C_TIMEOUT:
+	case C_BROKEN_PIPE:
+	case C_NETWORK_FAILURE:
+	case C_PROTOCOL_ERROR:
+	case C_TEAR_DOWN:
+	case C_WF_CONNECTION:
+	case C_WF_REPORT_PARAMS:
+	case C_MASK:
+		break;
+	}
+	if (ns.disk > disk_max)
+		ns.disk = disk_max;
+
+	if (ns.disk < disk_min) {
+		if (warn)
+			*warn = IMPLICITLY_UPGRADED_DISK;
+		ns.disk = disk_min;
+	}
+	if (ns.pdsk > pdsk_max)
+		ns.pdsk = pdsk_max;
+
+	if (ns.pdsk < pdsk_min) {
+		if (warn)
+			*warn = IMPLICITLY_UPGRADED_PDSK;
+		ns.pdsk = pdsk_min;
+	}
+
+	if (fp == FP_STONITH &&
+	    (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
+	    !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
+		ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */
+
+	if (device->resource->res_opts.on_no_data == OND_SUSPEND_IO &&
+	    (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
+	    !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
+		ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */
+
+	if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
+		if (ns.conn == C_SYNC_SOURCE)
+			ns.conn = C_PAUSED_SYNC_S;
+		if (ns.conn == C_SYNC_TARGET)
+			ns.conn = C_PAUSED_SYNC_T;
+	} else {
+		if (ns.conn == C_PAUSED_SYNC_S)
+			ns.conn = C_SYNC_SOURCE;
+		if (ns.conn == C_PAUSED_SYNC_T)
+			ns.conn = C_SYNC_TARGET;
+	}
+
+	return ns;
+}
+
+void drbd_resume_al(struct drbd_device *device)
+{
+	if (test_and_clear_bit(AL_SUSPENDED, &device->flags))
+		drbd_info(device, "Resumed AL updates\n");
+}
+
+/* helper for _drbd_set_state */
+static void set_ov_position(struct drbd_device *device, enum drbd_conns cs)
+{
+	if (first_peer_device(device)->connection->agreed_pro_version < 90)
+		device->ov_start_sector = 0;
+	device->rs_total = drbd_bm_bits(device);
+	device->ov_position = 0;
+	if (cs == C_VERIFY_T) {
+		/* starting online verify from an arbitrary position
+		 * does not fit well into the existing protocol.
+		 * on C_VERIFY_T, we initialize ov_left and friends
+		 * implicitly in receive_DataRequest once the
+		 * first P_OV_REQUEST is received */
+		device->ov_start_sector = ~(sector_t)0;
+	} else {
+		unsigned long bit = BM_SECT_TO_BIT(device->ov_start_sector);
+		if (bit >= device->rs_total) {
+			device->ov_start_sector =
+				BM_BIT_TO_SECT(device->rs_total - 1);
+			device->rs_total = 1;
+		} else
+			device->rs_total -= bit;
+		device->ov_position = device->ov_start_sector;
+	}
+	device->ov_left = device->rs_total;
+}
+
+/**
+ * _drbd_set_state() - Set a new DRBD state
+ * @device:	DRBD device.
+ * @ns:		new state.
+ * @flags:	Flags
+ * @done:	Optional completion, that will get completed after the after_state_ch() finished
+ *
+ * Caller needs to hold req_lock. Do not call directly.
+ */
+enum drbd_state_rv
+_drbd_set_state(struct drbd_device *device, union drbd_state ns,
+	        enum chg_state_flags flags, struct completion *done)
+{
+	struct drbd_peer_device *peer_device = first_peer_device(device);
+	struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+	union drbd_state os;
+	enum drbd_state_rv rv = SS_SUCCESS;
+	enum sanitize_state_warnings ssw;
+	struct after_state_chg_work *ascw;
+	struct drbd_state_change *state_change;
+
+	os = drbd_read_state(device);
+
+	ns = sanitize_state(device, os, ns, &ssw);
+	if (ns.i == os.i)
+		return SS_NOTHING_TO_DO;
+
+	rv = is_valid_transition(os, ns);
+	if (rv < SS_SUCCESS)
+		return rv;
+
+	if (!(flags & CS_HARD)) {
+		/*  pre-state-change checks ; only look at ns  */
+		/* See drbd_state_sw_errors in drbd_strings.c */
+
+		rv = is_valid_state(device, ns);
+		if (rv < SS_SUCCESS) {
+			/* If the old state was illegal as well, then let
+			   this happen...*/
+
+			if (is_valid_state(device, os) == rv)
+				rv = is_valid_soft_transition(os, ns, connection);
+		} else
+			rv = is_valid_soft_transition(os, ns, connection);
+	}
+
+	if (rv < SS_SUCCESS) {
+		if (flags & CS_VERBOSE)
+			print_st_err(device, os, ns, rv);
+		return rv;
+	}
+
+	print_sanitize_warnings(device, ssw);
+
+	drbd_pr_state_change(device, os, ns, flags);
+
+	/* Display changes to the susp* flags that where caused by the call to
+	   sanitize_state(). Only display it here if we where not called from
+	   _conn_request_state() */
+	if (!(flags & CS_DC_SUSP))
+		conn_pr_state_change(connection, os, ns,
+				     (flags & ~CS_DC_MASK) | CS_DC_SUSP);
+
+	/* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
+	 * on the ldev here, to be sure the transition -> D_DISKLESS resp.
+	 * drbd_ldev_destroy() won't happen before our corresponding
+	 * after_state_ch works run, where we put_ldev again. */
+	if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
+	    (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
+		atomic_inc(&device->local_cnt);
+
+	if (!is_sync_state(os.conn) && is_sync_state(ns.conn))
+		clear_bit(RS_DONE, &device->flags);
+
+	/* FIXME: Have any flags been set earlier in this function already? */
+	state_change = remember_old_state(device->resource, GFP_ATOMIC);
+
+	/* changes to local_cnt and device flags should be visible before
+	 * changes to state, which again should be visible before anything else
+	 * depending on that change happens. */
+	smp_wmb();
+	device->state.i = ns.i;
+	device->resource->susp = ns.susp;
+	device->resource->susp_nod = ns.susp_nod;
+	device->resource->susp_fen = ns.susp_fen;
+	smp_wmb();
+
+	remember_new_state(state_change);
+
+	/* put replicated vs not-replicated requests in seperate epochs */
+	if (drbd_should_do_remote((union drbd_dev_state)os.i) !=
+	    drbd_should_do_remote((union drbd_dev_state)ns.i))
+		start_new_tl_epoch(connection);
+
+	if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING)
+		drbd_print_uuids(device, "attached to UUIDs");
+
+	/* Wake up role changes, that were delayed because of connection establishing */
+	if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS &&
+	    no_peer_wf_report_params(connection)) {
+		clear_bit(STATE_SENT, &connection->flags);
+		wake_up_all_devices(connection);
+	}
+
+	wake_up(&device->misc_wait);
+	wake_up(&device->state_wait);
+	wake_up(&connection->ping_wait);
+
+	/* Aborted verify run, or we reached the stop sector.
+	 * Log the last position, unless end-of-device. */
+	if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
+	    ns.conn <= C_CONNECTED) {
+		device->ov_start_sector =
+			BM_BIT_TO_SECT(drbd_bm_bits(device) - device->ov_left);
+		if (device->ov_left)
+			drbd_info(device, "Online Verify reached sector %llu\n",
+				(unsigned long long)device->ov_start_sector);
+	}
+
+	if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
+	    (ns.conn == C_SYNC_TARGET  || ns.conn == C_SYNC_SOURCE)) {
+		drbd_info(device, "Syncer continues.\n");
+		device->rs_paused += (long)jiffies
+				  -(long)device->rs_mark_time[device->rs_last_mark];
+		if (ns.conn == C_SYNC_TARGET)
+			mod_timer(&device->resync_timer, jiffies);
+	}
+
+	if ((os.conn == C_SYNC_TARGET  || os.conn == C_SYNC_SOURCE) &&
+	    (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
+		drbd_info(device, "Resync suspended\n");
+		device->rs_mark_time[device->rs_last_mark] = jiffies;
+	}
+
+	if (os.conn == C_CONNECTED &&
+	    (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
+		unsigned long now = jiffies;
+		int i;
+
+		set_ov_position(device, ns.conn);
+		device->rs_start = now;
+		device->rs_last_sect_ev = 0;
+		device->ov_last_oos_size = 0;
+		device->ov_last_oos_start = 0;
+
+		for (i = 0; i < DRBD_SYNC_MARKS; i++) {
+			device->rs_mark_left[i] = device->ov_left;
+			device->rs_mark_time[i] = now;
+		}
+
+		drbd_rs_controller_reset(device);
+
+		if (ns.conn == C_VERIFY_S) {
+			drbd_info(device, "Starting Online Verify from sector %llu\n",
+					(unsigned long long)device->ov_position);
+			mod_timer(&device->resync_timer, jiffies);
+		}
+	}
+
+	if (get_ldev(device)) {
+		u32 mdf = device->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
+						 MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
+						 MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);
+
+		mdf &= ~MDF_AL_CLEAN;
+		if (test_bit(CRASHED_PRIMARY, &device->flags))
+			mdf |= MDF_CRASHED_PRIMARY;
+		if (device->state.role == R_PRIMARY ||
+		    (device->state.pdsk < D_INCONSISTENT && device->state.peer == R_PRIMARY))
+			mdf |= MDF_PRIMARY_IND;
+		if (device->state.conn > C_WF_REPORT_PARAMS)
+			mdf |= MDF_CONNECTED_IND;
+		if (device->state.disk > D_INCONSISTENT)
+			mdf |= MDF_CONSISTENT;
+		if (device->state.disk > D_OUTDATED)
+			mdf |= MDF_WAS_UP_TO_DATE;
+		if (device->state.pdsk <= D_OUTDATED && device->state.pdsk >= D_INCONSISTENT)
+			mdf |= MDF_PEER_OUT_DATED;
+		if (mdf != device->ldev->md.flags) {
+			device->ldev->md.flags = mdf;
+			drbd_md_mark_dirty(device);
+		}
+		if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
+			drbd_set_ed_uuid(device, device->ldev->md.uuid[UI_CURRENT]);
+		put_ldev(device);
+	}
+
+	/* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
+	if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
+	    os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
+		set_bit(CONSIDER_RESYNC, &device->flags);
+
+	/* Receiver should clean up itself */
+	if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
+		drbd_thread_stop_nowait(&connection->receiver);
+
+	/* Now the receiver finished cleaning up itself, it should die */
+	if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
+		drbd_thread_stop_nowait(&connection->receiver);
+
+	/* Upon network failure, we need to restart the receiver. */
+	if (os.conn > C_WF_CONNECTION &&
+	    ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
+		drbd_thread_restart_nowait(&connection->receiver);
+
+	/* Resume AL writing if we get a connection */
+	if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
+		drbd_resume_al(device);
+		connection->connect_cnt++;
+	}
+
+	/* remember last attach time so request_timer_fn() won't
+	 * kill newly established sessions while we are still trying to thaw
+	 * previously frozen IO */
+	if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
+	    ns.disk > D_NEGOTIATING)
+		device->last_reattach_jif = jiffies;
+
+	ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
+	if (ascw) {
+		ascw->os = os;
+		ascw->ns = ns;
+		ascw->flags = flags;
+		ascw->w.cb = w_after_state_ch;
+		ascw->device = device;
+		ascw->done = done;
+		ascw->state_change = state_change;
+		drbd_queue_work(&connection->sender_work,
+				&ascw->w);
+	} else {
+		drbd_err(device, "Could not kmalloc an ascw\n");
+	}
+
+	return rv;
+}
+
+static int w_after_state_ch(struct drbd_work *w, int unused)
+{
+	struct after_state_chg_work *ascw =
+		container_of(w, struct after_state_chg_work, w);
+	struct drbd_device *device = ascw->device;
+
+	after_state_ch(device, ascw->os, ascw->ns, ascw->flags, ascw->state_change);
+	forget_state_change(ascw->state_change);
+	if (ascw->flags & CS_WAIT_COMPLETE)
+		complete(ascw->done);
+	kfree(ascw);
+
+	return 0;
+}
+
+static void abw_start_sync(struct drbd_device *device, int rv)
+{
+	if (rv) {
+		drbd_err(device, "Writing the bitmap failed not starting resync.\n");
+		_drbd_request_state(device, NS(conn, C_CONNECTED), CS_VERBOSE);
+		return;
+	}
+
+	switch (device->state.conn) {
+	case C_STARTING_SYNC_T:
+		_drbd_request_state(device, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
+		break;
+	case C_STARTING_SYNC_S:
+		drbd_start_resync(device, C_SYNC_SOURCE);
+		break;
+	}
+}
+
+int drbd_bitmap_io_from_worker(struct drbd_device *device,
+		int (*io_fn)(struct drbd_device *),
+		char *why, enum bm_flag flags)
+{
+	int rv;
+
+	D_ASSERT(device, current == first_peer_device(device)->connection->worker.task);
+
+	/* open coded non-blocking drbd_suspend_io(device); */
+	atomic_inc(&device->suspend_cnt);
+
+	drbd_bm_lock(device, why, flags);
+	rv = io_fn(device);
+	drbd_bm_unlock(device);
+
+	drbd_resume_io(device);
+
+	return rv;
+}
+
+int notify_resource_state_change(struct sk_buff *skb,
+				  unsigned int seq,
+				  struct drbd_resource_state_change *resource_state_change,
+				  enum drbd_notification_type type)
+{
+	struct drbd_resource *resource = resource_state_change->resource;
+	struct resource_info resource_info = {
+		.res_role = resource_state_change->role[NEW],
+		.res_susp = resource_state_change->susp[NEW],
+		.res_susp_nod = resource_state_change->susp_nod[NEW],
+		.res_susp_fen = resource_state_change->susp_fen[NEW],
+	};
+
+	return notify_resource_state(skb, seq, resource, &resource_info, type);
+}
+
+int notify_connection_state_change(struct sk_buff *skb,
+				    unsigned int seq,
+				    struct drbd_connection_state_change *connection_state_change,
+				    enum drbd_notification_type type)
+{
+	struct drbd_connection *connection = connection_state_change->connection;
+	struct connection_info connection_info = {
+		.conn_connection_state = connection_state_change->cstate[NEW],
+		.conn_role = connection_state_change->peer_role[NEW],
+	};
+
+	return notify_connection_state(skb, seq, connection, &connection_info, type);
+}
+
+int notify_device_state_change(struct sk_buff *skb,
+				unsigned int seq,
+				struct drbd_device_state_change *device_state_change,
+				enum drbd_notification_type type)
+{
+	struct drbd_device *device = device_state_change->device;
+	struct device_info device_info = {
+		.dev_disk_state = device_state_change->disk_state[NEW],
+	};
+
+	return notify_device_state(skb, seq, device, &device_info, type);
+}
+
+int notify_peer_device_state_change(struct sk_buff *skb,
+				     unsigned int seq,
+				     struct drbd_peer_device_state_change *p,
+				     enum drbd_notification_type type)
+{
+	struct drbd_peer_device *peer_device = p->peer_device;
+	struct peer_device_info peer_device_info = {
+		.peer_repl_state = p->repl_state[NEW],
+		.peer_disk_state = p->disk_state[NEW],
+		.peer_resync_susp_user = p->resync_susp_user[NEW],
+		.peer_resync_susp_peer = p->resync_susp_peer[NEW],
+		.peer_resync_susp_dependency = p->resync_susp_dependency[NEW],
+	};
+
+	return notify_peer_device_state(skb, seq, peer_device, &peer_device_info, type);
+}
+
+static void broadcast_state_change(struct drbd_state_change *state_change)
+{
+	struct drbd_resource_state_change *resource_state_change = &state_change->resource[0];
+	bool resource_state_has_changed;
+	unsigned int n_device, n_connection, n_peer_device, n_peer_devices;
+	int (*last_func)(struct sk_buff *, unsigned int, void *,
+			  enum drbd_notification_type) = NULL;
+	void *last_arg = NULL;
+
+#define HAS_CHANGED(state) ((state)[OLD] != (state)[NEW])
+#define FINAL_STATE_CHANGE(type) \
+	({ if (last_func) \
+		last_func(NULL, 0, last_arg, type); \
+	})
+#define REMEMBER_STATE_CHANGE(func, arg, type) \
+	({ FINAL_STATE_CHANGE(type | NOTIFY_CONTINUES); \
+	   last_func = (typeof(last_func))func; \
+	   last_arg = arg; \
+	 })
+
+	mutex_lock(&notification_mutex);
+
+	resource_state_has_changed =
+	    HAS_CHANGED(resource_state_change->role) ||
+	    HAS_CHANGED(resource_state_change->susp) ||
+	    HAS_CHANGED(resource_state_change->susp_nod) ||
+	    HAS_CHANGED(resource_state_change->susp_fen);
+
+	if (resource_state_has_changed)
+		REMEMBER_STATE_CHANGE(notify_resource_state_change,
+				      resource_state_change, NOTIFY_CHANGE);
+
+	for (n_connection = 0; n_connection < state_change->n_connections; n_connection++) {
+		struct drbd_connection_state_change *connection_state_change =
+				&state_change->connections[n_connection];
+
+		if (HAS_CHANGED(connection_state_change->peer_role) ||
+		    HAS_CHANGED(connection_state_change->cstate))
+			REMEMBER_STATE_CHANGE(notify_connection_state_change,
+					      connection_state_change, NOTIFY_CHANGE);
+	}
+
+	for (n_device = 0; n_device < state_change->n_devices; n_device++) {
+		struct drbd_device_state_change *device_state_change =
+			&state_change->devices[n_device];
+
+		if (HAS_CHANGED(device_state_change->disk_state))
+			REMEMBER_STATE_CHANGE(notify_device_state_change,
+					      device_state_change, NOTIFY_CHANGE);
+	}
+
+	n_peer_devices = state_change->n_devices * state_change->n_connections;
+	for (n_peer_device = 0; n_peer_device < n_peer_devices; n_peer_device++) {
+		struct drbd_peer_device_state_change *p =
+			&state_change->peer_devices[n_peer_device];
+
+		if (HAS_CHANGED(p->disk_state) ||
+		    HAS_CHANGED(p->repl_state) ||
+		    HAS_CHANGED(p->resync_susp_user) ||
+		    HAS_CHANGED(p->resync_susp_peer) ||
+		    HAS_CHANGED(p->resync_susp_dependency))
+			REMEMBER_STATE_CHANGE(notify_peer_device_state_change,
+					      p, NOTIFY_CHANGE);
+	}
+
+	FINAL_STATE_CHANGE(NOTIFY_CHANGE);
+	mutex_unlock(&notification_mutex);
+
+#undef HAS_CHANGED
+#undef FINAL_STATE_CHANGE
+#undef REMEMBER_STATE_CHANGE
+}
+
+/* takes old and new peer disk state */
+static bool lost_contact_to_peer_data(enum drbd_disk_state os, enum drbd_disk_state ns)
+{
+	if ((os >= D_INCONSISTENT && os != D_UNKNOWN && os != D_OUTDATED)
+	&&  (ns < D_INCONSISTENT || ns == D_UNKNOWN || ns == D_OUTDATED))
+		return true;
+
+	/* Scenario, starting with normal operation
+	 * Connected Primary/Secondary UpToDate/UpToDate
+	 * NetworkFailure Primary/Unknown UpToDate/DUnknown (frozen)
+	 * ...
+	 * Connected Primary/Secondary UpToDate/Diskless (resumed; needs to bump uuid!)
+	 */
+	if (os == D_UNKNOWN
+	&&  (ns == D_DISKLESS || ns == D_FAILED || ns == D_OUTDATED))
+		return true;
+
+	return false;
+}
+
+/**
+ * after_state_ch() - Perform after state change actions that may sleep
+ * @device:	DRBD device.
+ * @os:		old state.
+ * @ns:		new state.
+ * @flags:	Flags
+ * @state_change: state change to broadcast
+ */
+static void after_state_ch(struct drbd_device *device, union drbd_state os,
+			   union drbd_state ns, enum chg_state_flags flags,
+			   struct drbd_state_change *state_change)
+{
+	struct drbd_resource *resource = device->resource;
+	struct drbd_peer_device *peer_device = first_peer_device(device);
+	struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+	struct sib_info sib;
+
+	broadcast_state_change(state_change);
+
+	sib.sib_reason = SIB_STATE_CHANGE;
+	sib.os = os;
+	sib.ns = ns;
+
+	if ((os.disk != D_UP_TO_DATE || os.pdsk != D_UP_TO_DATE)
+	&&  (ns.disk == D_UP_TO_DATE && ns.pdsk == D_UP_TO_DATE)) {
+		clear_bit(CRASHED_PRIMARY, &device->flags);
+		if (device->p_uuid)
+			device->p_uuid[UI_FLAGS] &= ~((u64)2);
+	}
+
+	/* Inform userspace about the change... */
+	drbd_bcast_event(device, &sib);
+
+	if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
+	    (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
+		drbd_khelper(device, "pri-on-incon-degr");
+
+	/* Here we have the actions that are performed after a
+	   state change. This function might sleep */
+
+	if (ns.susp_nod) {
+		enum drbd_req_event what = NOTHING;
+
+		spin_lock_irq(&device->resource->req_lock);
+		if (os.conn < C_CONNECTED && conn_lowest_conn(connection) >= C_CONNECTED)
+			what = RESEND;
+
+		if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
+		    conn_lowest_disk(connection) == D_UP_TO_DATE)
+			what = RESTART_FROZEN_DISK_IO;
+
+		if (resource->susp_nod && what != NOTHING) {
+			_tl_restart(connection, what);
+			_conn_request_state(connection,
+					    (union drbd_state) { { .susp_nod = 1 } },
+					    (union drbd_state) { { .susp_nod = 0 } },
+					    CS_VERBOSE);
+		}
+		spin_unlock_irq(&device->resource->req_lock);
+	}
+
+	if (ns.susp_fen) {
+		spin_lock_irq(&device->resource->req_lock);
+		if (resource->susp_fen && conn_lowest_conn(connection) >= C_CONNECTED) {
+			/* case2: The connection was established again: */
+			struct drbd_peer_device *peer_device;
+			int vnr;
+
+			rcu_read_lock();
+			idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
+				clear_bit(NEW_CUR_UUID, &peer_device->device->flags);
+			rcu_read_unlock();
+
+			/* We should actively create a new uuid, _before_
+			 * we resume/resent, if the peer is diskless
+			 * (recovery from a multiple error scenario).
+			 * Currently, this happens with a slight delay
+			 * below when checking lost_contact_to_peer_data() ...
+			 */
+			_tl_restart(connection, RESEND);
+			_conn_request_state(connection,
+					    (union drbd_state) { { .susp_fen = 1 } },
+					    (union drbd_state) { { .susp_fen = 0 } },
+					    CS_VERBOSE);
+		}
+		spin_unlock_irq(&device->resource->req_lock);
+	}
+
+	/* Became sync source.  With protocol >= 96, we still need to send out
+	 * the sync uuid now. Need to do that before any drbd_send_state, or
+	 * the other side may go "paused sync" before receiving the sync uuids,
+	 * which is unexpected. */
+	if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) &&
+	    (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) &&
+	    connection->agreed_pro_version >= 96 && get_ldev(device)) {
+		drbd_gen_and_send_sync_uuid(peer_device);
+		put_ldev(device);
+	}
+
+	/* Do not change the order of the if above and the two below... */
+	if (os.pdsk == D_DISKLESS &&
+	    ns.pdsk > D_DISKLESS && ns.pdsk != D_UNKNOWN) {      /* attach on the peer */
+		/* we probably will start a resync soon.
+		 * make sure those things are properly reset. */
+		device->rs_total = 0;
+		device->rs_failed = 0;
+		atomic_set(&device->rs_pending_cnt, 0);
+		drbd_rs_cancel_all(device);
+
+		drbd_send_uuids(peer_device);
+		drbd_send_state(peer_device, ns);
+	}
+	/* No point in queuing send_bitmap if we don't have a connection
+	 * anymore, so check also the _current_ state, not only the new state
+	 * at the time this work was queued. */
+	if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S &&
+	    device->state.conn == C_WF_BITMAP_S)
+		drbd_queue_bitmap_io(device, &drbd_send_bitmap, NULL,
+				"send_bitmap (WFBitMapS)",
+				BM_LOCKED_TEST_ALLOWED);
+
+	/* Lost contact to peer's copy of the data */
+	if (lost_contact_to_peer_data(os.pdsk, ns.pdsk)) {
+		if (get_ldev(device)) {
+			if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
+			    device->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
+				if (drbd_suspended(device)) {
+					set_bit(NEW_CUR_UUID, &device->flags);
+				} else {
+					drbd_uuid_new_current(device);
+					drbd_send_uuids(peer_device);
+				}
+			}
+			put_ldev(device);
+		}
+	}
+
+	if (ns.pdsk < D_INCONSISTENT && get_ldev(device)) {
+		if (os.peer != R_PRIMARY && ns.peer == R_PRIMARY &&
+		    device->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
+			drbd_uuid_new_current(device);
+			drbd_send_uuids(peer_device);
+		}
+		/* D_DISKLESS Peer becomes secondary */
+		if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
+			/* We may still be Primary ourselves.
+			 * No harm done if the bitmap still changes,
+			 * redirtied pages will follow later. */
+			drbd_bitmap_io_from_worker(device, &drbd_bm_write,
+				"demote diskless peer", BM_LOCKED_SET_ALLOWED);
+		put_ldev(device);
+	}
+
+	/* Write out all changed bits on demote.
+	 * Though, no need to da that just yet
+	 * if there is a resync going on still */
+	if (os.role == R_PRIMARY && ns.role == R_SECONDARY &&
+		device->state.conn <= C_CONNECTED && get_ldev(device)) {
+		/* No changes to the bitmap expected this time, so assert that,
+		 * even though no harm was done if it did change. */
+		drbd_bitmap_io_from_worker(device, &drbd_bm_write,
+				"demote", BM_LOCKED_TEST_ALLOWED);
+		put_ldev(device);
+	}
+
+	/* Last part of the attaching process ... */
+	if (ns.conn >= C_CONNECTED &&
+	    os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
+		drbd_send_sizes(peer_device, 0, 0);  /* to start sync... */
+		drbd_send_uuids(peer_device);
+		drbd_send_state(peer_device, ns);
+	}
+
+	/* We want to pause/continue resync, tell peer. */
+	if (ns.conn >= C_CONNECTED &&
+	     ((os.aftr_isp != ns.aftr_isp) ||
+	      (os.user_isp != ns.user_isp)))
+		drbd_send_state(peer_device, ns);
+
+	/* In case one of the isp bits got set, suspend other devices. */
+	if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
+	    (ns.aftr_isp || ns.peer_isp || ns.user_isp))
+		suspend_other_sg(device);
+
+	/* Make sure the peer gets informed about eventual state
+	   changes (ISP bits) while we were in WFReportParams. */
+	if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
+		drbd_send_state(peer_device, ns);
+
+	if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
+		drbd_send_state(peer_device, ns);
+
+	/* We are in the progress to start a full sync... */
+	if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
+	    (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
+		/* no other bitmap changes expected during this phase */
+		drbd_queue_bitmap_io(device,
+			&drbd_bmio_set_n_write, &abw_start_sync,
+			"set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED);
+
+	/* first half of local IO error, failure to attach,
+	 * or administrative detach */
+	if (os.disk != D_FAILED && ns.disk == D_FAILED) {
+		enum drbd_io_error_p eh = EP_PASS_ON;
+		int was_io_error = 0;
+		/* corresponding get_ldev was in _drbd_set_state, to serialize
+		 * our cleanup here with the transition to D_DISKLESS.
+		 * But is is still not save to dreference ldev here, since
+		 * we might come from an failed Attach before ldev was set. */
+		if (device->ldev) {
+			rcu_read_lock();
+			eh = rcu_dereference(device->ldev->disk_conf)->on_io_error;
+			rcu_read_unlock();
+
+			was_io_error = test_and_clear_bit(WAS_IO_ERROR, &device->flags);
+
+			/* Intentionally call this handler first, before drbd_send_state().
+			 * See: 2932204 drbd: call local-io-error handler early
+			 * People may chose to hard-reset the box from this handler.
+			 * It is useful if this looks like a "regular node crash". */
+			if (was_io_error && eh == EP_CALL_HELPER)
+				drbd_khelper(device, "local-io-error");
+
+			/* Immediately allow completion of all application IO,
+			 * that waits for completion from the local disk,
+			 * if this was a force-detach due to disk_timeout
+			 * or administrator request (drbdsetup detach --force).
+			 * Do NOT abort otherwise.
+			 * Aborting local requests may cause serious problems,
+			 * if requests are completed to upper layers already,
+			 * and then later the already submitted local bio completes.
+			 * This can cause DMA into former bio pages that meanwhile
+			 * have been re-used for other things.
+			 * So aborting local requests may cause crashes,
+			 * or even worse, silent data corruption.
+			 */
+			if (test_and_clear_bit(FORCE_DETACH, &device->flags))
+				tl_abort_disk_io(device);
+
+			/* current state still has to be D_FAILED,
+			 * there is only one way out: to D_DISKLESS,
+			 * and that may only happen after our put_ldev below. */
+			if (device->state.disk != D_FAILED)
+				drbd_err(device,
+					"ASSERT FAILED: disk is %s during detach\n",
+					drbd_disk_str(device->state.disk));
+
+			if (ns.conn >= C_CONNECTED)
+				drbd_send_state(peer_device, ns);
+
+			drbd_rs_cancel_all(device);
+
+			/* In case we want to get something to stable storage still,
+			 * this may be the last chance.
+			 * Following put_ldev may transition to D_DISKLESS. */
+			drbd_md_sync(device);
+		}
+		put_ldev(device);
+	}
+
+	/* second half of local IO error, failure to attach,
+	 * or administrative detach,
+	 * after local_cnt references have reached zero again */
+	if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
+		/* We must still be diskless,
+		 * re-attach has to be serialized with this! */
+		if (device->state.disk != D_DISKLESS)
+			drbd_err(device,
+				 "ASSERT FAILED: disk is %s while going diskless\n",
+				 drbd_disk_str(device->state.disk));
+
+		if (ns.conn >= C_CONNECTED)
+			drbd_send_state(peer_device, ns);
+		/* corresponding get_ldev in __drbd_set_state
+		 * this may finally trigger drbd_ldev_destroy. */
+		put_ldev(device);
+	}
+
+	/* Notify peer that I had a local IO error, and did not detached.. */
+	if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED)
+		drbd_send_state(peer_device, ns);
+
+	/* Disks got bigger while they were detached */
+	if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
+	    test_and_clear_bit(RESYNC_AFTER_NEG, &device->flags)) {
+		if (ns.conn == C_CONNECTED)
+			resync_after_online_grow(device);
+	}
+
+	/* A resync finished or aborted, wake paused devices... */
+	if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
+	    (os.peer_isp && !ns.peer_isp) ||
+	    (os.user_isp && !ns.user_isp))
+		resume_next_sg(device);
+
+	/* sync target done with resync.  Explicitly notify peer, even though
+	 * it should (at least for non-empty resyncs) already know itself. */
+	if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
+		drbd_send_state(peer_device, ns);
+
+	/* Verify finished, or reached stop sector.  Peer did not know about
+	 * the stop sector, and we may even have changed the stop sector during
+	 * verify to interrupt/stop early.  Send the new state. */
+	if (os.conn == C_VERIFY_S && ns.conn == C_CONNECTED
+	&& verify_can_do_stop_sector(device))
+		drbd_send_state(peer_device, ns);
+
+	/* This triggers bitmap writeout of potentially still unwritten pages
+	 * if the resync finished cleanly, or aborted because of peer disk
+	 * failure, or on transition from resync back to AHEAD/BEHIND.
+	 *
+	 * Connection loss is handled in drbd_disconnected() by the receiver.
+	 *
+	 * For resync aborted because of local disk failure, we cannot do
+	 * any bitmap writeout anymore.
+	 *
+	 * No harm done if some bits change during this phase.
+	 */
+	if ((os.conn > C_CONNECTED && os.conn < C_AHEAD) &&
+	    (ns.conn == C_CONNECTED || ns.conn >= C_AHEAD) && get_ldev(device)) {
+		drbd_queue_bitmap_io(device, &drbd_bm_write_copy_pages, NULL,
+			"write from resync_finished", BM_LOCKED_CHANGE_ALLOWED);
+		put_ldev(device);
+	}
+
+	if (ns.disk == D_DISKLESS &&
+	    ns.conn == C_STANDALONE &&
+	    ns.role == R_SECONDARY) {
+		if (os.aftr_isp != ns.aftr_isp)
+			resume_next_sg(device);
+	}
+
+	drbd_md_sync(device);
+}
+
+struct after_conn_state_chg_work {
+	struct drbd_work w;
+	enum drbd_conns oc;
+	union drbd_state ns_min;
+	union drbd_state ns_max; /* new, max state, over all devices */
+	enum chg_state_flags flags;
+	struct drbd_connection *connection;
+	struct drbd_state_change *state_change;
+};
+
+static int w_after_conn_state_ch(struct drbd_work *w, int unused)
+{
+	struct after_conn_state_chg_work *acscw =
+		container_of(w, struct after_conn_state_chg_work, w);
+	struct drbd_connection *connection = acscw->connection;
+	enum drbd_conns oc = acscw->oc;
+	union drbd_state ns_max = acscw->ns_max;
+	struct drbd_peer_device *peer_device;
+	int vnr;
+
+	broadcast_state_change(acscw->state_change);
+	forget_state_change(acscw->state_change);
+	kfree(acscw);
+
+	/* Upon network configuration, we need to start the receiver */
+	if (oc == C_STANDALONE && ns_max.conn == C_UNCONNECTED)
+		drbd_thread_start(&connection->receiver);
+
+	if (oc == C_DISCONNECTING && ns_max.conn == C_STANDALONE) {
+		struct net_conf *old_conf;
+
+		mutex_lock(&notification_mutex);
+		idr_for_each_entry(&connection->peer_devices, peer_device, vnr)
+			notify_peer_device_state(NULL, 0, peer_device, NULL,
+						 NOTIFY_DESTROY | NOTIFY_CONTINUES);
+		notify_connection_state(NULL, 0, connection, NULL, NOTIFY_DESTROY);
+		mutex_unlock(&notification_mutex);
+
+		mutex_lock(&connection->resource->conf_update);
+		old_conf = connection->net_conf;
+		connection->my_addr_len = 0;
+		connection->peer_addr_len = 0;
+		RCU_INIT_POINTER(connection->net_conf, NULL);
+		conn_free_crypto(connection);
+		mutex_unlock(&connection->resource->conf_update);
+
+		kvfree_rcu(old_conf);
+	}
+
+	if (ns_max.susp_fen) {
+		/* case1: The outdate peer handler is successful: */
+		if (ns_max.pdsk <= D_OUTDATED) {
+			rcu_read_lock();
+			idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+				struct drbd_device *device = peer_device->device;
+				if (test_bit(NEW_CUR_UUID, &device->flags)) {
+					drbd_uuid_new_current(device);
+					clear_bit(NEW_CUR_UUID, &device->flags);
+				}
+			}
+			rcu_read_unlock();
+			spin_lock_irq(&connection->resource->req_lock);
+			_tl_restart(connection, CONNECTION_LOST_WHILE_PENDING);
+			_conn_request_state(connection,
+					    (union drbd_state) { { .susp_fen = 1 } },
+					    (union drbd_state) { { .susp_fen = 0 } },
+					    CS_VERBOSE);
+			spin_unlock_irq(&connection->resource->req_lock);
+		}
+	}
+	conn_md_sync(connection);
+	kref_put(&connection->kref, drbd_destroy_connection);
+
+	return 0;
+}
+
+static void conn_old_common_state(struct drbd_connection *connection, union drbd_state *pcs, enum chg_state_flags *pf)
+{
+	enum chg_state_flags flags = ~0;
+	struct drbd_peer_device *peer_device;
+	int vnr, first_vol = 1;
+	union drbd_dev_state os, cs = {
+		{ .role = R_SECONDARY,
+		  .peer = R_UNKNOWN,
+		  .conn = connection->cstate,
+		  .disk = D_DISKLESS,
+		  .pdsk = D_UNKNOWN,
+		} };
+
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+		struct drbd_device *device = peer_device->device;
+		os = device->state;
+
+		if (first_vol) {
+			cs = os;
+			first_vol = 0;
+			continue;
+		}
+
+		if (cs.role != os.role)
+			flags &= ~CS_DC_ROLE;
+
+		if (cs.peer != os.peer)
+			flags &= ~CS_DC_PEER;
+
+		if (cs.conn != os.conn)
+			flags &= ~CS_DC_CONN;
+
+		if (cs.disk != os.disk)
+			flags &= ~CS_DC_DISK;
+
+		if (cs.pdsk != os.pdsk)
+			flags &= ~CS_DC_PDSK;
+	}
+	rcu_read_unlock();
+
+	*pf |= CS_DC_MASK;
+	*pf &= flags;
+	(*pcs).i = cs.i;
+}
+
+static enum drbd_state_rv
+conn_is_valid_transition(struct drbd_connection *connection, union drbd_state mask, union drbd_state val,
+			 enum chg_state_flags flags)
+{
+	enum drbd_state_rv rv = SS_SUCCESS;
+	union drbd_state ns, os;
+	struct drbd_peer_device *peer_device;
+	int vnr;
+
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+		struct drbd_device *device = peer_device->device;
+		os = drbd_read_state(device);
+		ns = sanitize_state(device, os, apply_mask_val(os, mask, val), NULL);
+
+		if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED)
+			ns.disk = os.disk;
+
+		if (ns.i == os.i)
+			continue;
+
+		rv = is_valid_transition(os, ns);
+
+		if (rv >= SS_SUCCESS && !(flags & CS_HARD)) {
+			rv = is_valid_state(device, ns);
+			if (rv < SS_SUCCESS) {
+				if (is_valid_state(device, os) == rv)
+					rv = is_valid_soft_transition(os, ns, connection);
+			} else
+				rv = is_valid_soft_transition(os, ns, connection);
+		}
+
+		if (rv < SS_SUCCESS) {
+			if (flags & CS_VERBOSE)
+				print_st_err(device, os, ns, rv);
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return rv;
+}
+
+static void
+conn_set_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val,
+	       union drbd_state *pns_min, union drbd_state *pns_max, enum chg_state_flags flags)
+{
+	union drbd_state ns, os, ns_max = { };
+	union drbd_state ns_min = {
+		{ .role = R_MASK,
+		  .peer = R_MASK,
+		  .conn = val.conn,
+		  .disk = D_MASK,
+		  .pdsk = D_MASK
+		} };
+	struct drbd_peer_device *peer_device;
+	enum drbd_state_rv rv;
+	int vnr, number_of_volumes = 0;
+
+	if (mask.conn == C_MASK) {
+		/* remember last connect time so request_timer_fn() won't
+		 * kill newly established sessions while we are still trying to thaw
+		 * previously frozen IO */
+		if (connection->cstate != C_WF_REPORT_PARAMS && val.conn == C_WF_REPORT_PARAMS)
+			connection->last_reconnect_jif = jiffies;
+
+		connection->cstate = val.conn;
+	}
+
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+		struct drbd_device *device = peer_device->device;
+		number_of_volumes++;
+		os = drbd_read_state(device);
+		ns = apply_mask_val(os, mask, val);
+		ns = sanitize_state(device, os, ns, NULL);
+
+		if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED)
+			ns.disk = os.disk;
+
+		rv = _drbd_set_state(device, ns, flags, NULL);
+		BUG_ON(rv < SS_SUCCESS);
+		ns.i = device->state.i;
+		ns_max.role = max_role(ns.role, ns_max.role);
+		ns_max.peer = max_role(ns.peer, ns_max.peer);
+		ns_max.conn = max_t(enum drbd_conns, ns.conn, ns_max.conn);
+		ns_max.disk = max_t(enum drbd_disk_state, ns.disk, ns_max.disk);
+		ns_max.pdsk = max_t(enum drbd_disk_state, ns.pdsk, ns_max.pdsk);
+
+		ns_min.role = min_role(ns.role, ns_min.role);
+		ns_min.peer = min_role(ns.peer, ns_min.peer);
+		ns_min.conn = min_t(enum drbd_conns, ns.conn, ns_min.conn);
+		ns_min.disk = min_t(enum drbd_disk_state, ns.disk, ns_min.disk);
+		ns_min.pdsk = min_t(enum drbd_disk_state, ns.pdsk, ns_min.pdsk);
+	}
+	rcu_read_unlock();
+
+	if (number_of_volumes == 0) {
+		ns_min = ns_max = (union drbd_state) { {
+				.role = R_SECONDARY,
+				.peer = R_UNKNOWN,
+				.conn = val.conn,
+				.disk = D_DISKLESS,
+				.pdsk = D_UNKNOWN
+			} };
+	}
+
+	ns_min.susp = ns_max.susp = connection->resource->susp;
+	ns_min.susp_nod = ns_max.susp_nod = connection->resource->susp_nod;
+	ns_min.susp_fen = ns_max.susp_fen = connection->resource->susp_fen;
+
+	*pns_min = ns_min;
+	*pns_max = ns_max;
+}
+
+static enum drbd_state_rv
+_conn_rq_cond(struct drbd_connection *connection, union drbd_state mask, union drbd_state val)
+{
+	enum drbd_state_rv err, rv = SS_UNKNOWN_ERROR; /* continue waiting */;
+
+	if (test_and_clear_bit(CONN_WD_ST_CHG_OKAY, &connection->flags))
+		rv = SS_CW_SUCCESS;
+
+	if (test_and_clear_bit(CONN_WD_ST_CHG_FAIL, &connection->flags))
+		rv = SS_CW_FAILED_BY_PEER;
+
+	err = conn_is_valid_transition(connection, mask, val, 0);
+	if (err == SS_SUCCESS && connection->cstate == C_WF_REPORT_PARAMS)
+		return rv;
+
+	return err;
+}
+
+enum drbd_state_rv
+_conn_request_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val,
+		    enum chg_state_flags flags)
+{
+	enum drbd_state_rv rv = SS_SUCCESS;
+	struct after_conn_state_chg_work *acscw;
+	enum drbd_conns oc = connection->cstate;
+	union drbd_state ns_max, ns_min, os;
+	bool have_mutex = false;
+	struct drbd_state_change *state_change;
+
+	if (mask.conn) {
+		rv = is_valid_conn_transition(oc, val.conn);
+		if (rv < SS_SUCCESS)
+			goto abort;
+	}
+
+	rv = conn_is_valid_transition(connection, mask, val, flags);
+	if (rv < SS_SUCCESS)
+		goto abort;
+
+	if (oc == C_WF_REPORT_PARAMS && val.conn == C_DISCONNECTING &&
+	    !(flags & (CS_LOCAL_ONLY | CS_HARD))) {
+
+		/* This will be a cluster-wide state change.
+		 * Need to give up the spinlock, grab the mutex,
+		 * then send the state change request, ... */
+		spin_unlock_irq(&connection->resource->req_lock);
+		mutex_lock(&connection->cstate_mutex);
+		have_mutex = true;
+
+		set_bit(CONN_WD_ST_CHG_REQ, &connection->flags);
+		if (conn_send_state_req(connection, mask, val)) {
+			/* sending failed. */
+			clear_bit(CONN_WD_ST_CHG_REQ, &connection->flags);
+			rv = SS_CW_FAILED_BY_PEER;
+			/* need to re-aquire the spin lock, though */
+			goto abort_unlocked;
+		}
+
+		if (val.conn == C_DISCONNECTING)
+			set_bit(DISCONNECT_SENT, &connection->flags);
+
+		/* ... and re-aquire the spinlock.
+		 * If _conn_rq_cond() returned >= SS_SUCCESS, we must call
+		 * conn_set_state() within the same spinlock. */
+		spin_lock_irq(&connection->resource->req_lock);
+		wait_event_lock_irq(connection->ping_wait,
+				(rv = _conn_rq_cond(connection, mask, val)),
+				connection->resource->req_lock);
+		clear_bit(CONN_WD_ST_CHG_REQ, &connection->flags);
+		if (rv < SS_SUCCESS)
+			goto abort;
+	}
+
+	state_change = remember_old_state(connection->resource, GFP_ATOMIC);
+	conn_old_common_state(connection, &os, &flags);
+	flags |= CS_DC_SUSP;
+	conn_set_state(connection, mask, val, &ns_min, &ns_max, flags);
+	conn_pr_state_change(connection, os, ns_max, flags);
+	remember_new_state(state_change);
+
+	acscw = kmalloc(sizeof(*acscw), GFP_ATOMIC);
+	if (acscw) {
+		acscw->oc = os.conn;
+		acscw->ns_min = ns_min;
+		acscw->ns_max = ns_max;
+		acscw->flags = flags;
+		acscw->w.cb = w_after_conn_state_ch;
+		kref_get(&connection->kref);
+		acscw->connection = connection;
+		acscw->state_change = state_change;
+		drbd_queue_work(&connection->sender_work, &acscw->w);
+	} else {
+		drbd_err(connection, "Could not kmalloc an acscw\n");
+	}
+
+ abort:
+	if (have_mutex) {
+		/* mutex_unlock() "... must not be used in interrupt context.",
+		 * so give up the spinlock, then re-aquire it */
+		spin_unlock_irq(&connection->resource->req_lock);
+ abort_unlocked:
+		mutex_unlock(&connection->cstate_mutex);
+		spin_lock_irq(&connection->resource->req_lock);
+	}
+	if (rv < SS_SUCCESS && flags & CS_VERBOSE) {
+		drbd_err(connection, "State change failed: %s\n", drbd_set_st_err_str(rv));
+		drbd_err(connection, " mask = 0x%x val = 0x%x\n", mask.i, val.i);
+		drbd_err(connection, " old_conn:%s wanted_conn:%s\n", drbd_conn_str(oc), drbd_conn_str(val.conn));
+	}
+	return rv;
+}
+
+enum drbd_state_rv
+conn_request_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val,
+		   enum chg_state_flags flags)
+{
+	enum drbd_state_rv rv;
+
+	spin_lock_irq(&connection->resource->req_lock);
+	rv = _conn_request_state(connection, mask, val, flags);
+	spin_unlock_irq(&connection->resource->req_lock);
+
+	return rv;
+}
diff --git a/drivers/block/drbd/drbd_state.h b/drivers/block/drbd/drbd_state.h
new file mode 100644
index 000000000..f87371e55
--- /dev/null
+++ b/drivers/block/drbd/drbd_state.h
@@ -0,0 +1,174 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef DRBD_STATE_H
+#define DRBD_STATE_H
+
+struct drbd_device;
+struct drbd_connection;
+
+/**
+ * DOC: DRBD State macros
+ *
+ * These macros are used to express state changes in easily readable form.
+ *
+ * The NS macros expand to a mask and a value, that can be bit ored onto the
+ * current state as soon as the spinlock (req_lock) was taken.
+ *
+ * The _NS macros are used for state functions that get called with the
+ * spinlock. These macros expand directly to the new state value.
+ *
+ * Besides the basic forms NS() and _NS() additional _?NS[23] are defined
+ * to express state changes that affect more than one aspect of the state.
+ *
+ * E.g. NS2(conn, C_CONNECTED, peer, R_SECONDARY)
+ * Means that the network connection was established and that the peer
+ * is in secondary role.
+ */
+#define role_MASK R_MASK
+#define peer_MASK R_MASK
+#define disk_MASK D_MASK
+#define pdsk_MASK D_MASK
+#define conn_MASK C_MASK
+#define susp_MASK 1
+#define user_isp_MASK 1
+#define aftr_isp_MASK 1
+#define susp_nod_MASK 1
+#define susp_fen_MASK 1
+
+#define NS(T, S) \
+	({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \
+	({ union drbd_state val; val.i = 0; val.T = (S); val; })
+#define NS2(T1, S1, T2, S2) \
+	({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \
+	  mask.T2 = T2##_MASK; mask; }), \
+	({ union drbd_state val; val.i = 0; val.T1 = (S1); \
+	  val.T2 = (S2); val; })
+#define NS3(T1, S1, T2, S2, T3, S3) \
+	({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \
+	  mask.T2 = T2##_MASK; mask.T3 = T3##_MASK; mask; }), \
+	({ union drbd_state val;  val.i = 0; val.T1 = (S1); \
+	  val.T2 = (S2); val.T3 = (S3); val; })
+
+#define _NS(D, T, S) \
+	D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T = (S); __ns; })
+#define _NS2(D, T1, S1, T2, S2) \
+	D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T1 = (S1); \
+	__ns.T2 = (S2); __ns; })
+#define _NS3(D, T1, S1, T2, S2, T3, S3) \
+	D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T1 = (S1); \
+	__ns.T2 = (S2); __ns.T3 = (S3); __ns; })
+
+enum chg_state_flags {
+	CS_HARD	         = 1 << 0,
+	CS_VERBOSE       = 1 << 1,
+	CS_WAIT_COMPLETE = 1 << 2,
+	CS_SERIALIZE     = 1 << 3,
+	CS_ORDERED       = CS_WAIT_COMPLETE + CS_SERIALIZE,
+	CS_LOCAL_ONLY    = 1 << 4, /* Do not consider a device pair wide state change */
+	CS_DC_ROLE       = 1 << 5, /* DC = display as connection state change */
+	CS_DC_PEER       = 1 << 6,
+	CS_DC_CONN       = 1 << 7,
+	CS_DC_DISK       = 1 << 8,
+	CS_DC_PDSK       = 1 << 9,
+	CS_DC_SUSP       = 1 << 10,
+	CS_DC_MASK       = CS_DC_ROLE + CS_DC_PEER + CS_DC_CONN + CS_DC_DISK + CS_DC_PDSK,
+	CS_IGN_OUTD_FAIL = 1 << 11,
+
+	/* Make sure no meta data IO is in flight, by calling
+	 * drbd_md_get_buffer().  Used for graceful detach. */
+	CS_INHIBIT_MD_IO = 1 << 12,
+};
+
+/* drbd_dev_state and drbd_state are different types. This is to stress the
+   small difference. There is no suspended flag (.susp), and no suspended
+   while fence handler runs flas (susp_fen). */
+union drbd_dev_state {
+	struct {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+		unsigned role:2 ;   /* 3/4	 primary/secondary/unknown */
+		unsigned peer:2 ;   /* 3/4	 primary/secondary/unknown */
+		unsigned conn:5 ;   /* 17/32	 cstates */
+		unsigned disk:4 ;   /* 8/16	 from D_DISKLESS to D_UP_TO_DATE */
+		unsigned pdsk:4 ;   /* 8/16	 from D_DISKLESS to D_UP_TO_DATE */
+		unsigned _unused:1 ;
+		unsigned aftr_isp:1 ; /* isp .. imposed sync pause */
+		unsigned peer_isp:1 ;
+		unsigned user_isp:1 ;
+		unsigned _pad:11;   /* 0	 unused */
+#elif defined(__BIG_ENDIAN_BITFIELD)
+		unsigned _pad:11;
+		unsigned user_isp:1 ;
+		unsigned peer_isp:1 ;
+		unsigned aftr_isp:1 ; /* isp .. imposed sync pause */
+		unsigned _unused:1 ;
+		unsigned pdsk:4 ;   /* 8/16	 from D_DISKLESS to D_UP_TO_DATE */
+		unsigned disk:4 ;   /* 8/16	 from D_DISKLESS to D_UP_TO_DATE */
+		unsigned conn:5 ;   /* 17/32	 cstates */
+		unsigned peer:2 ;   /* 3/4	 primary/secondary/unknown */
+		unsigned role:2 ;   /* 3/4	 primary/secondary/unknown */
+#else
+# error "this endianess is not supported"
+#endif
+	};
+	unsigned int i;
+};
+
+extern enum drbd_state_rv drbd_change_state(struct drbd_device *device,
+					    enum chg_state_flags f,
+					    union drbd_state mask,
+					    union drbd_state val);
+extern void drbd_force_state(struct drbd_device *, union drbd_state,
+			union drbd_state);
+extern enum drbd_state_rv _drbd_request_state(struct drbd_device *,
+					      union drbd_state,
+					      union drbd_state,
+					      enum chg_state_flags);
+
+extern enum drbd_state_rv
+_drbd_request_state_holding_state_mutex(struct drbd_device *, union drbd_state,
+					union drbd_state, enum chg_state_flags);
+
+extern enum drbd_state_rv _drbd_set_state(struct drbd_device *, union drbd_state,
+					  enum chg_state_flags,
+					  struct completion *done);
+extern void print_st_err(struct drbd_device *, union drbd_state,
+			union drbd_state, enum drbd_state_rv);
+
+enum drbd_state_rv
+_conn_request_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val,
+		    enum chg_state_flags flags);
+
+enum drbd_state_rv
+conn_request_state(struct drbd_connection *connection, union drbd_state mask, union drbd_state val,
+		   enum chg_state_flags flags);
+
+extern void drbd_resume_al(struct drbd_device *device);
+extern bool conn_all_vols_unconf(struct drbd_connection *connection);
+
+/**
+ * drbd_request_state() - Request a state change
+ * @device:	DRBD device.
+ * @mask:	mask of state bits to change.
+ * @val:	value of new state bits.
+ *
+ * This is the most graceful way of requesting a state change. It is verbose
+ * quite verbose in case the state change is not possible, and all those
+ * state changes are globally serialized.
+ */
+static inline int drbd_request_state(struct drbd_device *device,
+				     union drbd_state mask,
+				     union drbd_state val)
+{
+	return _drbd_request_state(device, mask, val, CS_VERBOSE + CS_ORDERED);
+}
+
+/* for use in adm_detach() (drbd_adm_detach(), drbd_adm_down()) */
+int drbd_request_detach_interruptible(struct drbd_device *device);
+
+enum drbd_role conn_highest_role(struct drbd_connection *connection);
+enum drbd_role conn_highest_peer(struct drbd_connection *connection);
+enum drbd_disk_state conn_highest_disk(struct drbd_connection *connection);
+enum drbd_disk_state conn_lowest_disk(struct drbd_connection *connection);
+enum drbd_disk_state conn_highest_pdsk(struct drbd_connection *connection);
+enum drbd_conns conn_lowest_conn(struct drbd_connection *connection);
+
+#endif
diff --git a/drivers/block/drbd/drbd_state_change.h b/drivers/block/drbd/drbd_state_change.h
new file mode 100644
index 000000000..d5b0479bc
--- /dev/null
+++ b/drivers/block/drbd/drbd_state_change.h
@@ -0,0 +1,64 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef DRBD_STATE_CHANGE_H
+#define DRBD_STATE_CHANGE_H
+
+struct drbd_resource_state_change {
+	struct drbd_resource *resource;
+	enum drbd_role role[2];
+	bool susp[2];
+	bool susp_nod[2];
+	bool susp_fen[2];
+};
+
+struct drbd_device_state_change {
+	struct drbd_device *device;
+	enum drbd_disk_state disk_state[2];
+};
+
+struct drbd_connection_state_change {
+	struct drbd_connection *connection;
+	enum drbd_conns cstate[2];  /* drbd9: enum drbd_conn_state */
+	enum drbd_role peer_role[2];
+};
+
+struct drbd_peer_device_state_change {
+	struct drbd_peer_device *peer_device;
+	enum drbd_disk_state disk_state[2];
+	enum drbd_conns repl_state[2];  /* drbd9: enum drbd_repl_state */
+	bool resync_susp_user[2];
+	bool resync_susp_peer[2];
+	bool resync_susp_dependency[2];
+};
+
+struct drbd_state_change {
+	struct list_head list;
+	unsigned int n_devices;
+	unsigned int n_connections;
+	struct drbd_resource_state_change resource[1];
+	struct drbd_device_state_change *devices;
+	struct drbd_connection_state_change *connections;
+	struct drbd_peer_device_state_change *peer_devices;
+};
+
+extern struct drbd_state_change *remember_old_state(struct drbd_resource *, gfp_t);
+extern void copy_old_to_new_state_change(struct drbd_state_change *);
+extern void forget_state_change(struct drbd_state_change *);
+
+extern int notify_resource_state_change(struct sk_buff *,
+					 unsigned int,
+					 struct drbd_resource_state_change *,
+					 enum drbd_notification_type type);
+extern int notify_connection_state_change(struct sk_buff *,
+					   unsigned int,
+					   struct drbd_connection_state_change *,
+					   enum drbd_notification_type type);
+extern int notify_device_state_change(struct sk_buff *,
+				       unsigned int,
+				       struct drbd_device_state_change *,
+				       enum drbd_notification_type type);
+extern int notify_peer_device_state_change(struct sk_buff *,
+					    unsigned int,
+					    struct drbd_peer_device_state_change *,
+					    enum drbd_notification_type type);
+
+#endif  /* DRBD_STATE_CHANGE_H */
diff --git a/drivers/block/drbd/drbd_strings.c b/drivers/block/drbd/drbd_strings.c
new file mode 100644
index 000000000..fc0130760
--- /dev/null
+++ b/drivers/block/drbd/drbd_strings.c
@@ -0,0 +1,106 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+  drbd.h
+
+  This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+  Copyright (C) 2003-2008, LINBIT Information Technologies GmbH.
+  Copyright (C) 2003-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+  Copyright (C) 2003-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+
+*/
+
+#include <linux/drbd.h>
+#include "drbd_strings.h"
+
+static const char * const drbd_conn_s_names[] = {
+	[C_STANDALONE]       = "StandAlone",
+	[C_DISCONNECTING]    = "Disconnecting",
+	[C_UNCONNECTED]      = "Unconnected",
+	[C_TIMEOUT]          = "Timeout",
+	[C_BROKEN_PIPE]      = "BrokenPipe",
+	[C_NETWORK_FAILURE]  = "NetworkFailure",
+	[C_PROTOCOL_ERROR]   = "ProtocolError",
+	[C_WF_CONNECTION]    = "WFConnection",
+	[C_WF_REPORT_PARAMS] = "WFReportParams",
+	[C_TEAR_DOWN]        = "TearDown",
+	[C_CONNECTED]        = "Connected",
+	[C_STARTING_SYNC_S]  = "StartingSyncS",
+	[C_STARTING_SYNC_T]  = "StartingSyncT",
+	[C_WF_BITMAP_S]      = "WFBitMapS",
+	[C_WF_BITMAP_T]      = "WFBitMapT",
+	[C_WF_SYNC_UUID]     = "WFSyncUUID",
+	[C_SYNC_SOURCE]      = "SyncSource",
+	[C_SYNC_TARGET]      = "SyncTarget",
+	[C_PAUSED_SYNC_S]    = "PausedSyncS",
+	[C_PAUSED_SYNC_T]    = "PausedSyncT",
+	[C_VERIFY_S]         = "VerifyS",
+	[C_VERIFY_T]         = "VerifyT",
+	[C_AHEAD]            = "Ahead",
+	[C_BEHIND]           = "Behind",
+};
+
+static const char * const drbd_role_s_names[] = {
+	[R_PRIMARY]   = "Primary",
+	[R_SECONDARY] = "Secondary",
+	[R_UNKNOWN]   = "Unknown"
+};
+
+static const char * const drbd_disk_s_names[] = {
+	[D_DISKLESS]     = "Diskless",
+	[D_ATTACHING]    = "Attaching",
+	[D_FAILED]       = "Failed",
+	[D_NEGOTIATING]  = "Negotiating",
+	[D_INCONSISTENT] = "Inconsistent",
+	[D_OUTDATED]     = "Outdated",
+	[D_UNKNOWN]      = "DUnknown",
+	[D_CONSISTENT]   = "Consistent",
+	[D_UP_TO_DATE]   = "UpToDate",
+};
+
+static const char * const drbd_state_sw_errors[] = {
+	[-SS_TWO_PRIMARIES] = "Multiple primaries not allowed by config",
+	[-SS_NO_UP_TO_DATE_DISK] = "Need access to UpToDate data",
+	[-SS_NO_LOCAL_DISK] = "Can not resync without local disk",
+	[-SS_NO_REMOTE_DISK] = "Can not resync without remote disk",
+	[-SS_CONNECTED_OUTDATES] = "Refusing to be Outdated while Connected",
+	[-SS_PRIMARY_NOP] = "Refusing to be Primary while peer is not outdated",
+	[-SS_RESYNC_RUNNING] = "Can not start OV/resync since it is already active",
+	[-SS_ALREADY_STANDALONE] = "Can not disconnect a StandAlone device",
+	[-SS_CW_FAILED_BY_PEER] = "State change was refused by peer node",
+	[-SS_IS_DISKLESS] = "Device is diskless, the requested operation requires a disk",
+	[-SS_DEVICE_IN_USE] = "Device is held open by someone",
+	[-SS_NO_NET_CONFIG] = "Have no net/connection configuration",
+	[-SS_NO_VERIFY_ALG] = "Need a verify algorithm to start online verify",
+	[-SS_NEED_CONNECTION] = "Need a connection to start verify or resync",
+	[-SS_NOT_SUPPORTED] = "Peer does not support protocol",
+	[-SS_LOWER_THAN_OUTDATED] = "Disk state is lower than outdated",
+	[-SS_IN_TRANSIENT_STATE] = "In transient state, retry after next state change",
+	[-SS_CONCURRENT_ST_CHG] = "Concurrent state changes detected and aborted",
+	[-SS_OUTDATE_WO_CONN] = "Need a connection for a graceful disconnect/outdate peer",
+	[-SS_O_VOL_PEER_PRI] = "Other vol primary on peer not allowed by config",
+};
+
+const char *drbd_conn_str(enum drbd_conns s)
+{
+	/* enums are unsigned... */
+	return s > C_BEHIND ? "TOO_LARGE" : drbd_conn_s_names[s];
+}
+
+const char *drbd_role_str(enum drbd_role s)
+{
+	return s > R_SECONDARY   ? "TOO_LARGE" : drbd_role_s_names[s];
+}
+
+const char *drbd_disk_str(enum drbd_disk_state s)
+{
+	return s > D_UP_TO_DATE    ? "TOO_LARGE" : drbd_disk_s_names[s];
+}
+
+const char *drbd_set_st_err_str(enum drbd_state_rv err)
+{
+	return err <= SS_AFTER_LAST_ERROR ? "TOO_SMALL" :
+	       err > SS_TWO_PRIMARIES ? "TOO_LARGE"
+			: drbd_state_sw_errors[-err];
+}
diff --git a/drivers/block/drbd/drbd_strings.h b/drivers/block/drbd/drbd_strings.h
new file mode 100644
index 000000000..87b94a273
--- /dev/null
+++ b/drivers/block/drbd/drbd_strings.h
@@ -0,0 +1,10 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __DRBD_STRINGS_H
+#define __DRBD_STRINGS_H
+
+extern const char *drbd_conn_str(enum drbd_conns);
+extern const char *drbd_role_str(enum drbd_role);
+extern const char *drbd_disk_str(enum drbd_disk_state);
+extern const char *drbd_set_st_err_str(enum drbd_state_rv);
+
+#endif  /* __DRBD_STRINGS_H */
diff --git a/drivers/block/drbd/drbd_vli.h b/drivers/block/drbd/drbd_vli.h
new file mode 100644
index 000000000..01e3babc5
--- /dev/null
+++ b/drivers/block/drbd/drbd_vli.h
@@ -0,0 +1,339 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+-*- linux-c -*-
+   drbd_receiver.c
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+ */
+
+#ifndef _DRBD_VLI_H
+#define _DRBD_VLI_H
+
+/*
+ * At a granularity of 4KiB storage represented per bit,
+ * and stroage sizes of several TiB,
+ * and possibly small-bandwidth replication,
+ * the bitmap transfer time can take much too long,
+ * if transmitted in plain text.
+ *
+ * We try to reduce the transferred bitmap information
+ * by encoding runlengths of bit polarity.
+ *
+ * We never actually need to encode a "zero" (runlengths are positive).
+ * But then we have to store the value of the first bit.
+ * The first bit of information thus shall encode if the first runlength
+ * gives the number of set or unset bits.
+ *
+ * We assume that large areas are either completely set or unset,
+ * which gives good compression with any runlength method,
+ * even when encoding the runlength as fixed size 32bit/64bit integers.
+ *
+ * Still, there may be areas where the polarity flips every few bits,
+ * and encoding the runlength sequence of those areas with fix size
+ * integers would be much worse than plaintext.
+ *
+ * We want to encode small runlength values with minimum code length,
+ * while still being able to encode a Huge run of all zeros.
+ *
+ * Thus we need a Variable Length Integer encoding, VLI.
+ *
+ * For some cases, we produce more code bits than plaintext input.
+ * We need to send incompressible chunks as plaintext, skip over them
+ * and then see if the next chunk compresses better.
+ *
+ * We don't care too much about "excellent" compression ratio for large
+ * runlengths (all set/all clear): whether we achieve a factor of 100
+ * or 1000 is not that much of an issue.
+ * We do not want to waste too much on short runlengths in the "noisy"
+ * parts of the bitmap, though.
+ *
+ * There are endless variants of VLI, we experimented with:
+ *  * simple byte-based
+ *  * various bit based with different code word length.
+ *
+ * To avoid yet an other configuration parameter (choice of bitmap compression
+ * algorithm) which was difficult to explain and tune, we just chose the one
+ * variant that turned out best in all test cases.
+ * Based on real world usage patterns, with device sizes ranging from a few GiB
+ * to several TiB, file server/mailserver/webserver/mysql/postgress,
+ * mostly idle to really busy, the all time winner (though sometimes only
+ * marginally better) is:
+ */
+
+/*
+ * encoding is "visualised" as
+ * __little endian__ bitstream, least significant bit first (left most)
+ *
+ * this particular encoding is chosen so that the prefix code
+ * starts as unary encoding the level, then modified so that
+ * 10 levels can be described in 8bit, with minimal overhead
+ * for the smaller levels.
+ *
+ * Number of data bits follow fibonacci sequence, with the exception of the
+ * last level (+1 data bit, so it makes 64bit total).  The only worse code when
+ * encoding bit polarity runlength is 1 plain bits => 2 code bits.
+prefix    data bits                                    max val  Nº data bits
+0 x                                                         0x2            1
+10 x                                                        0x4            1
+110 xx                                                      0x8            2
+1110 xxx                                                   0x10            3
+11110 xxx xx                                               0x30            5
+111110 xx xxxxxx                                          0x130            8
+11111100  xxxxxxxx xxxxx                                 0x2130           13
+11111110  xxxxxxxx xxxxxxxx xxxxx                      0x202130           21
+11111101  xxxxxxxx xxxxxxxx xxxxxxxx  xxxxxxxx xx   0x400202130           34
+11111111  xxxxxxxx xxxxxxxx xxxxxxxx  xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx 56
+ * maximum encodable value: 0x100000400202130 == 2**56 + some */
+
+/* compression "table":
+ transmitted   x                                0.29
+ as plaintext x                                  ........................
+             x                                   ........................
+            x                                    ........................
+           x    0.59                         0.21........................
+          x      ........................................................
+         x       .. c ...................................................
+        x    0.44.. o ...................................................
+       x .......... d ...................................................
+      x  .......... e ...................................................
+     X.............   ...................................................
+    x.............. b ...................................................
+2.0x............... i ...................................................
+ #X................ t ...................................................
+ #................. s ...........................  plain bits  ..........
+-+-----------------------------------------------------------------------
+ 1             16              32                              64
+*/
+
+/* LEVEL: (total bits, prefix bits, prefix value),
+ * sorted ascending by number of total bits.
+ * The rest of the code table is calculated at compiletime from this. */
+
+/* fibonacci data 1, 1, ... */
+#define VLI_L_1_1() do { \
+	LEVEL( 2, 1, 0x00); \
+	LEVEL( 3, 2, 0x01); \
+	LEVEL( 5, 3, 0x03); \
+	LEVEL( 7, 4, 0x07); \
+	LEVEL(10, 5, 0x0f); \
+	LEVEL(14, 6, 0x1f); \
+	LEVEL(21, 8, 0x3f); \
+	LEVEL(29, 8, 0x7f); \
+	LEVEL(42, 8, 0xbf); \
+	LEVEL(64, 8, 0xff); \
+	} while (0)
+
+/* finds a suitable level to decode the least significant part of in.
+ * returns number of bits consumed.
+ *
+ * BUG() for bad input, as that would mean a buggy code table. */
+static inline int vli_decode_bits(u64 *out, const u64 in)
+{
+	u64 adj = 1;
+
+#define LEVEL(t,b,v)					\
+	do {						\
+		if ((in & ((1 << b) -1)) == v) {	\
+			*out = ((in & ((~0ULL) >> (64-t))) >> b) + adj;	\
+			return t;			\
+		}					\
+		adj += 1ULL << (t - b);			\
+	} while (0)
+
+	VLI_L_1_1();
+
+	/* NOT REACHED, if VLI_LEVELS code table is defined properly */
+	BUG();
+#undef LEVEL
+}
+
+/* return number of code bits needed,
+ * or negative error number */
+static inline int __vli_encode_bits(u64 *out, const u64 in)
+{
+	u64 max = 0;
+	u64 adj = 1;
+
+	if (in == 0)
+		return -EINVAL;
+
+#define LEVEL(t,b,v) do {		\
+		max += 1ULL << (t - b);	\
+		if (in <= max) {	\
+			if (out)	\
+				*out = ((in - adj) << b) | v;	\
+			return t;	\
+		}			\
+		adj = max + 1;		\
+	} while (0)
+
+	VLI_L_1_1();
+
+	return -EOVERFLOW;
+#undef LEVEL
+}
+
+#undef VLI_L_1_1
+
+/* code from here down is independend of actually used bit code */
+
+/*
+ * Code length is determined by some unique (e.g. unary) prefix.
+ * This encodes arbitrary bit length, not whole bytes: we have a bit-stream,
+ * not a byte stream.
+ */
+
+/* for the bitstream, we need a cursor */
+struct bitstream_cursor {
+	/* the current byte */
+	u8 *b;
+	/* the current bit within *b, nomalized: 0..7 */
+	unsigned int bit;
+};
+
+/* initialize cursor to point to first bit of stream */
+static inline void bitstream_cursor_reset(struct bitstream_cursor *cur, void *s)
+{
+	cur->b = s;
+	cur->bit = 0;
+}
+
+/* advance cursor by that many bits; maximum expected input value: 64,
+ * but depending on VLI implementation, it may be more. */
+static inline void bitstream_cursor_advance(struct bitstream_cursor *cur, unsigned int bits)
+{
+	bits += cur->bit;
+	cur->b = cur->b + (bits >> 3);
+	cur->bit = bits & 7;
+}
+
+/* the bitstream itself knows its length */
+struct bitstream {
+	struct bitstream_cursor cur;
+	unsigned char *buf;
+	size_t buf_len;		/* in bytes */
+
+	/* for input stream:
+	 * number of trailing 0 bits for padding
+	 * total number of valid bits in stream: buf_len * 8 - pad_bits */
+	unsigned int pad_bits;
+};
+
+static inline void bitstream_init(struct bitstream *bs, void *s, size_t len, unsigned int pad_bits)
+{
+	bs->buf = s;
+	bs->buf_len = len;
+	bs->pad_bits = pad_bits;
+	bitstream_cursor_reset(&bs->cur, bs->buf);
+}
+
+static inline void bitstream_rewind(struct bitstream *bs)
+{
+	bitstream_cursor_reset(&bs->cur, bs->buf);
+	memset(bs->buf, 0, bs->buf_len);
+}
+
+/* Put (at most 64) least significant bits of val into bitstream, and advance cursor.
+ * Ignores "pad_bits".
+ * Returns zero if bits == 0 (nothing to do).
+ * Returns number of bits used if successful.
+ *
+ * If there is not enough room left in bitstream,
+ * leaves bitstream unchanged and returns -ENOBUFS.
+ */
+static inline int bitstream_put_bits(struct bitstream *bs, u64 val, const unsigned int bits)
+{
+	unsigned char *b = bs->cur.b;
+	unsigned int tmp;
+
+	if (bits == 0)
+		return 0;
+
+	if ((bs->cur.b + ((bs->cur.bit + bits -1) >> 3)) - bs->buf >= bs->buf_len)
+		return -ENOBUFS;
+
+	/* paranoia: strip off hi bits; they should not be set anyways. */
+	if (bits < 64)
+		val &= ~0ULL >> (64 - bits);
+
+	*b++ |= (val & 0xff) << bs->cur.bit;
+
+	for (tmp = 8 - bs->cur.bit; tmp < bits; tmp += 8)
+		*b++ |= (val >> tmp) & 0xff;
+
+	bitstream_cursor_advance(&bs->cur, bits);
+	return bits;
+}
+
+/* Fetch (at most 64) bits from bitstream into *out, and advance cursor.
+ *
+ * If more than 64 bits are requested, returns -EINVAL and leave *out unchanged.
+ *
+ * If there are less than the requested number of valid bits left in the
+ * bitstream, still fetches all available bits.
+ *
+ * Returns number of actually fetched bits.
+ */
+static inline int bitstream_get_bits(struct bitstream *bs, u64 *out, int bits)
+{
+	u64 val;
+	unsigned int n;
+
+	if (bits > 64)
+		return -EINVAL;
+
+	if (bs->cur.b + ((bs->cur.bit + bs->pad_bits + bits -1) >> 3) - bs->buf >= bs->buf_len)
+		bits = ((bs->buf_len - (bs->cur.b - bs->buf)) << 3)
+			- bs->cur.bit - bs->pad_bits;
+
+	if (bits == 0) {
+		*out = 0;
+		return 0;
+	}
+
+	/* get the high bits */
+	val = 0;
+	n = (bs->cur.bit + bits + 7) >> 3;
+	/* n may be at most 9, if cur.bit + bits > 64 */
+	/* which means this copies at most 8 byte */
+	if (n) {
+		memcpy(&val, bs->cur.b+1, n - 1);
+		val = le64_to_cpu(val) << (8 - bs->cur.bit);
+	}
+
+	/* we still need the low bits */
+	val |= bs->cur.b[0] >> bs->cur.bit;
+
+	/* and mask out bits we don't want */
+	val &= ~0ULL >> (64 - bits);
+
+	bitstream_cursor_advance(&bs->cur, bits);
+	*out = val;
+
+	return bits;
+}
+
+/* encodes @in as vli into @bs;
+
+ * return values
+ *  > 0: number of bits successfully stored in bitstream
+ * -ENOBUFS @bs is full
+ * -EINVAL input zero (invalid)
+ * -EOVERFLOW input too large for this vli code (invalid)
+ */
+static inline int vli_encode_bits(struct bitstream *bs, u64 in)
+{
+	u64 code = code;
+	int bits = __vli_encode_bits(&code, in);
+
+	if (bits <= 0)
+		return bits;
+
+	return bitstream_put_bits(bs, code, bits);
+}
+
+#endif
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
new file mode 100644
index 000000000..0bb1a900c
--- /dev/null
+++ b/drivers/block/drbd/drbd_worker.c
@@ -0,0 +1,2233 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+   drbd_worker.c
+
+   This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+   Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+   Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+   Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+
+*/
+
+#include <linux/module.h>
+#include <linux/drbd.h>
+#include <linux/sched/signal.h>
+#include <linux/wait.h>
+#include <linux/mm.h>
+#include <linux/memcontrol.h>
+#include <linux/mm_inline.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/string.h>
+#include <linux/scatterlist.h>
+#include <linux/part_stat.h>
+
+#include "drbd_int.h"
+#include "drbd_protocol.h"
+#include "drbd_req.h"
+
+static int make_ov_request(struct drbd_device *, int);
+static int make_resync_request(struct drbd_device *, int);
+
+/* endio handlers:
+ *   drbd_md_endio (defined here)
+ *   drbd_request_endio (defined here)
+ *   drbd_peer_request_endio (defined here)
+ *   drbd_bm_endio (defined in drbd_bitmap.c)
+ *
+ * For all these callbacks, note the following:
+ * The callbacks will be called in irq context by the IDE drivers,
+ * and in Softirqs/Tasklets/BH context by the SCSI drivers.
+ * Try to get the locking right :)
+ *
+ */
+
+/* used for synchronous meta data and bitmap IO
+ * submitted by drbd_md_sync_page_io()
+ */
+void drbd_md_endio(struct bio *bio)
+{
+	struct drbd_device *device;
+
+	device = bio->bi_private;
+	device->md_io.error = blk_status_to_errno(bio->bi_status);
+
+	/* special case: drbd_md_read() during drbd_adm_attach() */
+	if (device->ldev)
+		put_ldev(device);
+	bio_put(bio);
+
+	/* We grabbed an extra reference in _drbd_md_sync_page_io() to be able
+	 * to timeout on the lower level device, and eventually detach from it.
+	 * If this io completion runs after that timeout expired, this
+	 * drbd_md_put_buffer() may allow us to finally try and re-attach.
+	 * During normal operation, this only puts that extra reference
+	 * down to 1 again.
+	 * Make sure we first drop the reference, and only then signal
+	 * completion, or we may (in drbd_al_read_log()) cycle so fast into the
+	 * next drbd_md_sync_page_io(), that we trigger the
+	 * ASSERT(atomic_read(&device->md_io_in_use) == 1) there.
+	 */
+	drbd_md_put_buffer(device);
+	device->md_io.done = 1;
+	wake_up(&device->misc_wait);
+}
+
+/* reads on behalf of the partner,
+ * "submitted" by the receiver
+ */
+static void drbd_endio_read_sec_final(struct drbd_peer_request *peer_req) __releases(local)
+{
+	unsigned long flags = 0;
+	struct drbd_peer_device *peer_device = peer_req->peer_device;
+	struct drbd_device *device = peer_device->device;
+
+	spin_lock_irqsave(&device->resource->req_lock, flags);
+	device->read_cnt += peer_req->i.size >> 9;
+	list_del(&peer_req->w.list);
+	if (list_empty(&device->read_ee))
+		wake_up(&device->ee_wait);
+	if (test_bit(__EE_WAS_ERROR, &peer_req->flags))
+		__drbd_chk_io_error(device, DRBD_READ_ERROR);
+	spin_unlock_irqrestore(&device->resource->req_lock, flags);
+
+	drbd_queue_work(&peer_device->connection->sender_work, &peer_req->w);
+	put_ldev(device);
+}
+
+/* writes on behalf of the partner, or resync writes,
+ * "submitted" by the receiver, final stage.  */
+void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(local)
+{
+	unsigned long flags = 0;
+	struct drbd_peer_device *peer_device = peer_req->peer_device;
+	struct drbd_device *device = peer_device->device;
+	struct drbd_connection *connection = peer_device->connection;
+	struct drbd_interval i;
+	int do_wake;
+	u64 block_id;
+	int do_al_complete_io;
+
+	/* after we moved peer_req to done_ee,
+	 * we may no longer access it,
+	 * it may be freed/reused already!
+	 * (as soon as we release the req_lock) */
+	i = peer_req->i;
+	do_al_complete_io = peer_req->flags & EE_CALL_AL_COMPLETE_IO;
+	block_id = peer_req->block_id;
+	peer_req->flags &= ~EE_CALL_AL_COMPLETE_IO;
+
+	if (peer_req->flags & EE_WAS_ERROR) {
+		/* In protocol != C, we usually do not send write acks.
+		 * In case of a write error, send the neg ack anyways. */
+		if (!__test_and_set_bit(__EE_SEND_WRITE_ACK, &peer_req->flags))
+			inc_unacked(device);
+		drbd_set_out_of_sync(device, peer_req->i.sector, peer_req->i.size);
+	}
+
+	spin_lock_irqsave(&device->resource->req_lock, flags);
+	device->writ_cnt += peer_req->i.size >> 9;
+	list_move_tail(&peer_req->w.list, &device->done_ee);
+
+	/*
+	 * Do not remove from the write_requests tree here: we did not send the
+	 * Ack yet and did not wake possibly waiting conflicting requests.
+	 * Removed from the tree from "drbd_process_done_ee" within the
+	 * appropriate dw.cb (e_end_block/e_end_resync_block) or from
+	 * _drbd_clear_done_ee.
+	 */
+
+	do_wake = list_empty(block_id == ID_SYNCER ? &device->sync_ee : &device->active_ee);
+
+	/* FIXME do we want to detach for failed REQ_OP_DISCARD?
+	 * ((peer_req->flags & (EE_WAS_ERROR|EE_TRIM)) == EE_WAS_ERROR) */
+	if (peer_req->flags & EE_WAS_ERROR)
+		__drbd_chk_io_error(device, DRBD_WRITE_ERROR);
+
+	if (connection->cstate >= C_WF_REPORT_PARAMS) {
+		kref_get(&device->kref); /* put is in drbd_send_acks_wf() */
+		if (!queue_work(connection->ack_sender, &peer_device->send_acks_work))
+			kref_put(&device->kref, drbd_destroy_device);
+	}
+	spin_unlock_irqrestore(&device->resource->req_lock, flags);
+
+	if (block_id == ID_SYNCER)
+		drbd_rs_complete_io(device, i.sector);
+
+	if (do_wake)
+		wake_up(&device->ee_wait);
+
+	if (do_al_complete_io)
+		drbd_al_complete_io(device, &i);
+
+	put_ldev(device);
+}
+
+/* writes on behalf of the partner, or resync writes,
+ * "submitted" by the receiver.
+ */
+void drbd_peer_request_endio(struct bio *bio)
+{
+	struct drbd_peer_request *peer_req = bio->bi_private;
+	struct drbd_device *device = peer_req->peer_device->device;
+	bool is_write = bio_data_dir(bio) == WRITE;
+	bool is_discard = bio_op(bio) == REQ_OP_WRITE_ZEROES ||
+			  bio_op(bio) == REQ_OP_DISCARD;
+
+	if (bio->bi_status && __ratelimit(&drbd_ratelimit_state))
+		drbd_warn(device, "%s: error=%d s=%llus\n",
+				is_write ? (is_discard ? "discard" : "write")
+					: "read", bio->bi_status,
+				(unsigned long long)peer_req->i.sector);
+
+	if (bio->bi_status)
+		set_bit(__EE_WAS_ERROR, &peer_req->flags);
+
+	bio_put(bio); /* no need for the bio anymore */
+	if (atomic_dec_and_test(&peer_req->pending_bios)) {
+		if (is_write)
+			drbd_endio_write_sec_final(peer_req);
+		else
+			drbd_endio_read_sec_final(peer_req);
+	}
+}
+
+static void
+drbd_panic_after_delayed_completion_of_aborted_request(struct drbd_device *device)
+{
+	panic("drbd%u %s/%u potential random memory corruption caused by delayed completion of aborted local request\n",
+		device->minor, device->resource->name, device->vnr);
+}
+
+/* read, readA or write requests on R_PRIMARY coming from drbd_make_request
+ */
+void drbd_request_endio(struct bio *bio)
+{
+	unsigned long flags;
+	struct drbd_request *req = bio->bi_private;
+	struct drbd_device *device = req->device;
+	struct bio_and_error m;
+	enum drbd_req_event what;
+
+	/* If this request was aborted locally before,
+	 * but now was completed "successfully",
+	 * chances are that this caused arbitrary data corruption.
+	 *
+	 * "aborting" requests, or force-detaching the disk, is intended for
+	 * completely blocked/hung local backing devices which do no longer
+	 * complete requests at all, not even do error completions.  In this
+	 * situation, usually a hard-reset and failover is the only way out.
+	 *
+	 * By "aborting", basically faking a local error-completion,
+	 * we allow for a more graceful swichover by cleanly migrating services.
+	 * Still the affected node has to be rebooted "soon".
+	 *
+	 * By completing these requests, we allow the upper layers to re-use
+	 * the associated data pages.
+	 *
+	 * If later the local backing device "recovers", and now DMAs some data
+	 * from disk into the original request pages, in the best case it will
+	 * just put random data into unused pages; but typically it will corrupt
+	 * meanwhile completely unrelated data, causing all sorts of damage.
+	 *
+	 * Which means delayed successful completion,
+	 * especially for READ requests,
+	 * is a reason to panic().
+	 *
+	 * We assume that a delayed *error* completion is OK,
+	 * though we still will complain noisily about it.
+	 */
+	if (unlikely(req->rq_state & RQ_LOCAL_ABORTED)) {
+		if (__ratelimit(&drbd_ratelimit_state))
+			drbd_emerg(device, "delayed completion of aborted local request; disk-timeout may be too aggressive\n");
+
+		if (!bio->bi_status)
+			drbd_panic_after_delayed_completion_of_aborted_request(device);
+	}
+
+	/* to avoid recursion in __req_mod */
+	if (unlikely(bio->bi_status)) {
+		switch (bio_op(bio)) {
+		case REQ_OP_WRITE_ZEROES:
+		case REQ_OP_DISCARD:
+			if (bio->bi_status == BLK_STS_NOTSUPP)
+				what = DISCARD_COMPLETED_NOTSUPP;
+			else
+				what = DISCARD_COMPLETED_WITH_ERROR;
+			break;
+		case REQ_OP_READ:
+			if (bio->bi_opf & REQ_RAHEAD)
+				what = READ_AHEAD_COMPLETED_WITH_ERROR;
+			else
+				what = READ_COMPLETED_WITH_ERROR;
+			break;
+		default:
+			what = WRITE_COMPLETED_WITH_ERROR;
+			break;
+		}
+	} else {
+		what = COMPLETED_OK;
+	}
+
+	req->private_bio = ERR_PTR(blk_status_to_errno(bio->bi_status));
+	bio_put(bio);
+
+	/* not req_mod(), we need irqsave here! */
+	spin_lock_irqsave(&device->resource->req_lock, flags);
+	__req_mod(req, what, &m);
+	spin_unlock_irqrestore(&device->resource->req_lock, flags);
+	put_ldev(device);
+
+	if (m.bio)
+		complete_master_bio(device, &m);
+}
+
+void drbd_csum_ee(struct crypto_shash *tfm, struct drbd_peer_request *peer_req, void *digest)
+{
+	SHASH_DESC_ON_STACK(desc, tfm);
+	struct page *page = peer_req->pages;
+	struct page *tmp;
+	unsigned len;
+	void *src;
+
+	desc->tfm = tfm;
+
+	crypto_shash_init(desc);
+
+	src = kmap_atomic(page);
+	while ((tmp = page_chain_next(page))) {
+		/* all but the last page will be fully used */
+		crypto_shash_update(desc, src, PAGE_SIZE);
+		kunmap_atomic(src);
+		page = tmp;
+		src = kmap_atomic(page);
+	}
+	/* and now the last, possibly only partially used page */
+	len = peer_req->i.size & (PAGE_SIZE - 1);
+	crypto_shash_update(desc, src, len ?: PAGE_SIZE);
+	kunmap_atomic(src);
+
+	crypto_shash_final(desc, digest);
+	shash_desc_zero(desc);
+}
+
+void drbd_csum_bio(struct crypto_shash *tfm, struct bio *bio, void *digest)
+{
+	SHASH_DESC_ON_STACK(desc, tfm);
+	struct bio_vec bvec;
+	struct bvec_iter iter;
+
+	desc->tfm = tfm;
+
+	crypto_shash_init(desc);
+
+	bio_for_each_segment(bvec, bio, iter) {
+		u8 *src;
+
+		src = bvec_kmap_local(&bvec);
+		crypto_shash_update(desc, src, bvec.bv_len);
+		kunmap_local(src);
+	}
+	crypto_shash_final(desc, digest);
+	shash_desc_zero(desc);
+}
+
+/* MAYBE merge common code with w_e_end_ov_req */
+static int w_e_send_csum(struct drbd_work *w, int cancel)
+{
+	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
+	struct drbd_peer_device *peer_device = peer_req->peer_device;
+	struct drbd_device *device = peer_device->device;
+	int digest_size;
+	void *digest;
+	int err = 0;
+
+	if (unlikely(cancel))
+		goto out;
+
+	if (unlikely((peer_req->flags & EE_WAS_ERROR) != 0))
+		goto out;
+
+	digest_size = crypto_shash_digestsize(peer_device->connection->csums_tfm);
+	digest = kmalloc(digest_size, GFP_NOIO);
+	if (digest) {
+		sector_t sector = peer_req->i.sector;
+		unsigned int size = peer_req->i.size;
+		drbd_csum_ee(peer_device->connection->csums_tfm, peer_req, digest);
+		/* Free peer_req and pages before send.
+		 * In case we block on congestion, we could otherwise run into
+		 * some distributed deadlock, if the other side blocks on
+		 * congestion as well, because our receiver blocks in
+		 * drbd_alloc_pages due to pp_in_use > max_buffers. */
+		drbd_free_peer_req(device, peer_req);
+		peer_req = NULL;
+		inc_rs_pending(device);
+		err = drbd_send_drequest_csum(peer_device, sector, size,
+					      digest, digest_size,
+					      P_CSUM_RS_REQUEST);
+		kfree(digest);
+	} else {
+		drbd_err(device, "kmalloc() of digest failed.\n");
+		err = -ENOMEM;
+	}
+
+out:
+	if (peer_req)
+		drbd_free_peer_req(device, peer_req);
+
+	if (unlikely(err))
+		drbd_err(device, "drbd_send_drequest(..., csum) failed\n");
+	return err;
+}
+
+#define GFP_TRY	(__GFP_HIGHMEM | __GFP_NOWARN)
+
+static int read_for_csum(struct drbd_peer_device *peer_device, sector_t sector, int size)
+{
+	struct drbd_device *device = peer_device->device;
+	struct drbd_peer_request *peer_req;
+
+	if (!get_ldev(device))
+		return -EIO;
+
+	/* GFP_TRY, because if there is no memory available right now, this may
+	 * be rescheduled for later. It is "only" background resync, after all. */
+	peer_req = drbd_alloc_peer_req(peer_device, ID_SYNCER /* unused */, sector,
+				       size, size, GFP_TRY);
+	if (!peer_req)
+		goto defer;
+
+	peer_req->w.cb = w_e_send_csum;
+	spin_lock_irq(&device->resource->req_lock);
+	list_add_tail(&peer_req->w.list, &device->read_ee);
+	spin_unlock_irq(&device->resource->req_lock);
+
+	atomic_add(size >> 9, &device->rs_sect_ev);
+	if (drbd_submit_peer_request(device, peer_req, REQ_OP_READ,
+				     DRBD_FAULT_RS_RD) == 0)
+		return 0;
+
+	/* If it failed because of ENOMEM, retry should help.  If it failed
+	 * because bio_add_page failed (probably broken lower level driver),
+	 * retry may or may not help.
+	 * If it does not, you may need to force disconnect. */
+	spin_lock_irq(&device->resource->req_lock);
+	list_del(&peer_req->w.list);
+	spin_unlock_irq(&device->resource->req_lock);
+
+	drbd_free_peer_req(device, peer_req);
+defer:
+	put_ldev(device);
+	return -EAGAIN;
+}
+
+int w_resync_timer(struct drbd_work *w, int cancel)
+{
+	struct drbd_device *device =
+		container_of(w, struct drbd_device, resync_work);
+
+	switch (device->state.conn) {
+	case C_VERIFY_S:
+		make_ov_request(device, cancel);
+		break;
+	case C_SYNC_TARGET:
+		make_resync_request(device, cancel);
+		break;
+	}
+
+	return 0;
+}
+
+void resync_timer_fn(struct timer_list *t)
+{
+	struct drbd_device *device = from_timer(device, t, resync_timer);
+
+	drbd_queue_work_if_unqueued(
+		&first_peer_device(device)->connection->sender_work,
+		&device->resync_work);
+}
+
+static void fifo_set(struct fifo_buffer *fb, int value)
+{
+	int i;
+
+	for (i = 0; i < fb->size; i++)
+		fb->values[i] = value;
+}
+
+static int fifo_push(struct fifo_buffer *fb, int value)
+{
+	int ov;
+
+	ov = fb->values[fb->head_index];
+	fb->values[fb->head_index++] = value;
+
+	if (fb->head_index >= fb->size)
+		fb->head_index = 0;
+
+	return ov;
+}
+
+static void fifo_add_val(struct fifo_buffer *fb, int value)
+{
+	int i;
+
+	for (i = 0; i < fb->size; i++)
+		fb->values[i] += value;
+}
+
+struct fifo_buffer *fifo_alloc(unsigned int fifo_size)
+{
+	struct fifo_buffer *fb;
+
+	fb = kzalloc(struct_size(fb, values, fifo_size), GFP_NOIO);
+	if (!fb)
+		return NULL;
+
+	fb->head_index = 0;
+	fb->size = fifo_size;
+	fb->total = 0;
+
+	return fb;
+}
+
+static int drbd_rs_controller(struct drbd_device *device, unsigned int sect_in)
+{
+	struct disk_conf *dc;
+	unsigned int want;     /* The number of sectors we want in-flight */
+	int req_sect; /* Number of sectors to request in this turn */
+	int correction; /* Number of sectors more we need in-flight */
+	int cps; /* correction per invocation of drbd_rs_controller() */
+	int steps; /* Number of time steps to plan ahead */
+	int curr_corr;
+	int max_sect;
+	struct fifo_buffer *plan;
+
+	dc = rcu_dereference(device->ldev->disk_conf);
+	plan = rcu_dereference(device->rs_plan_s);
+
+	steps = plan->size; /* (dc->c_plan_ahead * 10 * SLEEP_TIME) / HZ; */
+
+	if (device->rs_in_flight + sect_in == 0) { /* At start of resync */
+		want = ((dc->resync_rate * 2 * SLEEP_TIME) / HZ) * steps;
+	} else { /* normal path */
+		want = dc->c_fill_target ? dc->c_fill_target :
+			sect_in * dc->c_delay_target * HZ / (SLEEP_TIME * 10);
+	}
+
+	correction = want - device->rs_in_flight - plan->total;
+
+	/* Plan ahead */
+	cps = correction / steps;
+	fifo_add_val(plan, cps);
+	plan->total += cps * steps;
+
+	/* What we do in this step */
+	curr_corr = fifo_push(plan, 0);
+	plan->total -= curr_corr;
+
+	req_sect = sect_in + curr_corr;
+	if (req_sect < 0)
+		req_sect = 0;
+
+	max_sect = (dc->c_max_rate * 2 * SLEEP_TIME) / HZ;
+	if (req_sect > max_sect)
+		req_sect = max_sect;
+
+	/*
+	drbd_warn(device, "si=%u if=%d wa=%u co=%d st=%d cps=%d pl=%d cc=%d rs=%d\n",
+		 sect_in, device->rs_in_flight, want, correction,
+		 steps, cps, device->rs_planed, curr_corr, req_sect);
+	*/
+
+	return req_sect;
+}
+
+static int drbd_rs_number_requests(struct drbd_device *device)
+{
+	unsigned int sect_in;  /* Number of sectors that came in since the last turn */
+	int number, mxb;
+
+	sect_in = atomic_xchg(&device->rs_sect_in, 0);
+	device->rs_in_flight -= sect_in;
+
+	rcu_read_lock();
+	mxb = drbd_get_max_buffers(device) / 2;
+	if (rcu_dereference(device->rs_plan_s)->size) {
+		number = drbd_rs_controller(device, sect_in) >> (BM_BLOCK_SHIFT - 9);
+		device->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME;
+	} else {
+		device->c_sync_rate = rcu_dereference(device->ldev->disk_conf)->resync_rate;
+		number = SLEEP_TIME * device->c_sync_rate  / ((BM_BLOCK_SIZE / 1024) * HZ);
+	}
+	rcu_read_unlock();
+
+	/* Don't have more than "max-buffers"/2 in-flight.
+	 * Otherwise we may cause the remote site to stall on drbd_alloc_pages(),
+	 * potentially causing a distributed deadlock on congestion during
+	 * online-verify or (checksum-based) resync, if max-buffers,
+	 * socket buffer sizes and resync rate settings are mis-configured. */
+
+	/* note that "number" is in units of "BM_BLOCK_SIZE" (which is 4k),
+	 * mxb (as used here, and in drbd_alloc_pages on the peer) is
+	 * "number of pages" (typically also 4k),
+	 * but "rs_in_flight" is in "sectors" (512 Byte). */
+	if (mxb - device->rs_in_flight/8 < number)
+		number = mxb - device->rs_in_flight/8;
+
+	return number;
+}
+
+static int make_resync_request(struct drbd_device *const device, int cancel)
+{
+	struct drbd_peer_device *const peer_device = first_peer_device(device);
+	struct drbd_connection *const connection = peer_device ? peer_device->connection : NULL;
+	unsigned long bit;
+	sector_t sector;
+	const sector_t capacity = get_capacity(device->vdisk);
+	int max_bio_size;
+	int number, rollback_i, size;
+	int align, requeue = 0;
+	int i = 0;
+	int discard_granularity = 0;
+
+	if (unlikely(cancel))
+		return 0;
+
+	if (device->rs_total == 0) {
+		/* empty resync? */
+		drbd_resync_finished(device);
+		return 0;
+	}
+
+	if (!get_ldev(device)) {
+		/* Since we only need to access device->rsync a
+		   get_ldev_if_state(device,D_FAILED) would be sufficient, but
+		   to continue resync with a broken disk makes no sense at
+		   all */
+		drbd_err(device, "Disk broke down during resync!\n");
+		return 0;
+	}
+
+	if (connection->agreed_features & DRBD_FF_THIN_RESYNC) {
+		rcu_read_lock();
+		discard_granularity = rcu_dereference(device->ldev->disk_conf)->rs_discard_granularity;
+		rcu_read_unlock();
+	}
+
+	max_bio_size = queue_max_hw_sectors(device->rq_queue) << 9;
+	number = drbd_rs_number_requests(device);
+	if (number <= 0)
+		goto requeue;
+
+	for (i = 0; i < number; i++) {
+		/* Stop generating RS requests when half of the send buffer is filled,
+		 * but notify TCP that we'd like to have more space. */
+		mutex_lock(&connection->data.mutex);
+		if (connection->data.socket) {
+			struct sock *sk = connection->data.socket->sk;
+			int queued = sk->sk_wmem_queued;
+			int sndbuf = sk->sk_sndbuf;
+			if (queued > sndbuf / 2) {
+				requeue = 1;
+				if (sk->sk_socket)
+					set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+			}
+		} else
+			requeue = 1;
+		mutex_unlock(&connection->data.mutex);
+		if (requeue)
+			goto requeue;
+
+next_sector:
+		size = BM_BLOCK_SIZE;
+		bit  = drbd_bm_find_next(device, device->bm_resync_fo);
+
+		if (bit == DRBD_END_OF_BITMAP) {
+			device->bm_resync_fo = drbd_bm_bits(device);
+			put_ldev(device);
+			return 0;
+		}
+
+		sector = BM_BIT_TO_SECT(bit);
+
+		if (drbd_try_rs_begin_io(device, sector)) {
+			device->bm_resync_fo = bit;
+			goto requeue;
+		}
+		device->bm_resync_fo = bit + 1;
+
+		if (unlikely(drbd_bm_test_bit(device, bit) == 0)) {
+			drbd_rs_complete_io(device, sector);
+			goto next_sector;
+		}
+
+#if DRBD_MAX_BIO_SIZE > BM_BLOCK_SIZE
+		/* try to find some adjacent bits.
+		 * we stop if we have already the maximum req size.
+		 *
+		 * Additionally always align bigger requests, in order to
+		 * be prepared for all stripe sizes of software RAIDs.
+		 */
+		align = 1;
+		rollback_i = i;
+		while (i < number) {
+			if (size + BM_BLOCK_SIZE > max_bio_size)
+				break;
+
+			/* Be always aligned */
+			if (sector & ((1<<(align+3))-1))
+				break;
+
+			if (discard_granularity && size == discard_granularity)
+				break;
+
+			/* do not cross extent boundaries */
+			if (((bit+1) & BM_BLOCKS_PER_BM_EXT_MASK) == 0)
+				break;
+			/* now, is it actually dirty, after all?
+			 * caution, drbd_bm_test_bit is tri-state for some
+			 * obscure reason; ( b == 0 ) would get the out-of-band
+			 * only accidentally right because of the "oddly sized"
+			 * adjustment below */
+			if (drbd_bm_test_bit(device, bit+1) != 1)
+				break;
+			bit++;
+			size += BM_BLOCK_SIZE;
+			if ((BM_BLOCK_SIZE << align) <= size)
+				align++;
+			i++;
+		}
+		/* if we merged some,
+		 * reset the offset to start the next drbd_bm_find_next from */
+		if (size > BM_BLOCK_SIZE)
+			device->bm_resync_fo = bit + 1;
+#endif
+
+		/* adjust very last sectors, in case we are oddly sized */
+		if (sector + (size>>9) > capacity)
+			size = (capacity-sector)<<9;
+
+		if (device->use_csums) {
+			switch (read_for_csum(peer_device, sector, size)) {
+			case -EIO: /* Disk failure */
+				put_ldev(device);
+				return -EIO;
+			case -EAGAIN: /* allocation failed, or ldev busy */
+				drbd_rs_complete_io(device, sector);
+				device->bm_resync_fo = BM_SECT_TO_BIT(sector);
+				i = rollback_i;
+				goto requeue;
+			case 0:
+				/* everything ok */
+				break;
+			default:
+				BUG();
+			}
+		} else {
+			int err;
+
+			inc_rs_pending(device);
+			err = drbd_send_drequest(peer_device,
+						 size == discard_granularity ? P_RS_THIN_REQ : P_RS_DATA_REQUEST,
+						 sector, size, ID_SYNCER);
+			if (err) {
+				drbd_err(device, "drbd_send_drequest() failed, aborting...\n");
+				dec_rs_pending(device);
+				put_ldev(device);
+				return err;
+			}
+		}
+	}
+
+	if (device->bm_resync_fo >= drbd_bm_bits(device)) {
+		/* last syncer _request_ was sent,
+		 * but the P_RS_DATA_REPLY not yet received.  sync will end (and
+		 * next sync group will resume), as soon as we receive the last
+		 * resync data block, and the last bit is cleared.
+		 * until then resync "work" is "inactive" ...
+		 */
+		put_ldev(device);
+		return 0;
+	}
+
+ requeue:
+	device->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
+	mod_timer(&device->resync_timer, jiffies + SLEEP_TIME);
+	put_ldev(device);
+	return 0;
+}
+
+static int make_ov_request(struct drbd_device *device, int cancel)
+{
+	int number, i, size;
+	sector_t sector;
+	const sector_t capacity = get_capacity(device->vdisk);
+	bool stop_sector_reached = false;
+
+	if (unlikely(cancel))
+		return 1;
+
+	number = drbd_rs_number_requests(device);
+
+	sector = device->ov_position;
+	for (i = 0; i < number; i++) {
+		if (sector >= capacity)
+			return 1;
+
+		/* We check for "finished" only in the reply path:
+		 * w_e_end_ov_reply().
+		 * We need to send at least one request out. */
+		stop_sector_reached = i > 0
+			&& verify_can_do_stop_sector(device)
+			&& sector >= device->ov_stop_sector;
+		if (stop_sector_reached)
+			break;
+
+		size = BM_BLOCK_SIZE;
+
+		if (drbd_try_rs_begin_io(device, sector)) {
+			device->ov_position = sector;
+			goto requeue;
+		}
+
+		if (sector + (size>>9) > capacity)
+			size = (capacity-sector)<<9;
+
+		inc_rs_pending(device);
+		if (drbd_send_ov_request(first_peer_device(device), sector, size)) {
+			dec_rs_pending(device);
+			return 0;
+		}
+		sector += BM_SECT_PER_BIT;
+	}
+	device->ov_position = sector;
+
+ requeue:
+	device->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
+	if (i == 0 || !stop_sector_reached)
+		mod_timer(&device->resync_timer, jiffies + SLEEP_TIME);
+	return 1;
+}
+
+int w_ov_finished(struct drbd_work *w, int cancel)
+{
+	struct drbd_device_work *dw =
+		container_of(w, struct drbd_device_work, w);
+	struct drbd_device *device = dw->device;
+	kfree(dw);
+	ov_out_of_sync_print(device);
+	drbd_resync_finished(device);
+
+	return 0;
+}
+
+static int w_resync_finished(struct drbd_work *w, int cancel)
+{
+	struct drbd_device_work *dw =
+		container_of(w, struct drbd_device_work, w);
+	struct drbd_device *device = dw->device;
+	kfree(dw);
+
+	drbd_resync_finished(device);
+
+	return 0;
+}
+
+static void ping_peer(struct drbd_device *device)
+{
+	struct drbd_connection *connection = first_peer_device(device)->connection;
+
+	clear_bit(GOT_PING_ACK, &connection->flags);
+	request_ping(connection);
+	wait_event(connection->ping_wait,
+		   test_bit(GOT_PING_ACK, &connection->flags) || device->state.conn < C_CONNECTED);
+}
+
+int drbd_resync_finished(struct drbd_device *device)
+{
+	struct drbd_connection *connection = first_peer_device(device)->connection;
+	unsigned long db, dt, dbdt;
+	unsigned long n_oos;
+	union drbd_state os, ns;
+	struct drbd_device_work *dw;
+	char *khelper_cmd = NULL;
+	int verify_done = 0;
+
+	/* Remove all elements from the resync LRU. Since future actions
+	 * might set bits in the (main) bitmap, then the entries in the
+	 * resync LRU would be wrong. */
+	if (drbd_rs_del_all(device)) {
+		/* In case this is not possible now, most probably because
+		 * there are P_RS_DATA_REPLY Packets lingering on the worker's
+		 * queue (or even the read operations for those packets
+		 * is not finished by now).   Retry in 100ms. */
+
+		schedule_timeout_interruptible(HZ / 10);
+		dw = kmalloc(sizeof(struct drbd_device_work), GFP_ATOMIC);
+		if (dw) {
+			dw->w.cb = w_resync_finished;
+			dw->device = device;
+			drbd_queue_work(&connection->sender_work, &dw->w);
+			return 1;
+		}
+		drbd_err(device, "Warn failed to drbd_rs_del_all() and to kmalloc(dw).\n");
+	}
+
+	dt = (jiffies - device->rs_start - device->rs_paused) / HZ;
+	if (dt <= 0)
+		dt = 1;
+
+	db = device->rs_total;
+	/* adjust for verify start and stop sectors, respective reached position */
+	if (device->state.conn == C_VERIFY_S || device->state.conn == C_VERIFY_T)
+		db -= device->ov_left;
+
+	dbdt = Bit2KB(db/dt);
+	device->rs_paused /= HZ;
+
+	if (!get_ldev(device))
+		goto out;
+
+	ping_peer(device);
+
+	spin_lock_irq(&device->resource->req_lock);
+	os = drbd_read_state(device);
+
+	verify_done = (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T);
+
+	/* This protects us against multiple calls (that can happen in the presence
+	   of application IO), and against connectivity loss just before we arrive here. */
+	if (os.conn <= C_CONNECTED)
+		goto out_unlock;
+
+	ns = os;
+	ns.conn = C_CONNECTED;
+
+	drbd_info(device, "%s done (total %lu sec; paused %lu sec; %lu K/sec)\n",
+	     verify_done ? "Online verify" : "Resync",
+	     dt + device->rs_paused, device->rs_paused, dbdt);
+
+	n_oos = drbd_bm_total_weight(device);
+
+	if (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) {
+		if (n_oos) {
+			drbd_alert(device, "Online verify found %lu %dk block out of sync!\n",
+			      n_oos, Bit2KB(1));
+			khelper_cmd = "out-of-sync";
+		}
+	} else {
+		D_ASSERT(device, (n_oos - device->rs_failed) == 0);
+
+		if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T)
+			khelper_cmd = "after-resync-target";
+
+		if (device->use_csums && device->rs_total) {
+			const unsigned long s = device->rs_same_csum;
+			const unsigned long t = device->rs_total;
+			const int ratio =
+				(t == 0)     ? 0 :
+			(t < 100000) ? ((s*100)/t) : (s/(t/100));
+			drbd_info(device, "%u %% had equal checksums, eliminated: %luK; "
+			     "transferred %luK total %luK\n",
+			     ratio,
+			     Bit2KB(device->rs_same_csum),
+			     Bit2KB(device->rs_total - device->rs_same_csum),
+			     Bit2KB(device->rs_total));
+		}
+	}
+
+	if (device->rs_failed) {
+		drbd_info(device, "            %lu failed blocks\n", device->rs_failed);
+
+		if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
+			ns.disk = D_INCONSISTENT;
+			ns.pdsk = D_UP_TO_DATE;
+		} else {
+			ns.disk = D_UP_TO_DATE;
+			ns.pdsk = D_INCONSISTENT;
+		}
+	} else {
+		ns.disk = D_UP_TO_DATE;
+		ns.pdsk = D_UP_TO_DATE;
+
+		if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T) {
+			if (device->p_uuid) {
+				int i;
+				for (i = UI_BITMAP ; i <= UI_HISTORY_END ; i++)
+					_drbd_uuid_set(device, i, device->p_uuid[i]);
+				drbd_uuid_set(device, UI_BITMAP, device->ldev->md.uuid[UI_CURRENT]);
+				_drbd_uuid_set(device, UI_CURRENT, device->p_uuid[UI_CURRENT]);
+			} else {
+				drbd_err(device, "device->p_uuid is NULL! BUG\n");
+			}
+		}
+
+		if (!(os.conn == C_VERIFY_S || os.conn == C_VERIFY_T)) {
+			/* for verify runs, we don't update uuids here,
+			 * so there would be nothing to report. */
+			drbd_uuid_set_bm(device, 0UL);
+			drbd_print_uuids(device, "updated UUIDs");
+			if (device->p_uuid) {
+				/* Now the two UUID sets are equal, update what we
+				 * know of the peer. */
+				int i;
+				for (i = UI_CURRENT ; i <= UI_HISTORY_END ; i++)
+					device->p_uuid[i] = device->ldev->md.uuid[i];
+			}
+		}
+	}
+
+	_drbd_set_state(device, ns, CS_VERBOSE, NULL);
+out_unlock:
+	spin_unlock_irq(&device->resource->req_lock);
+
+	/* If we have been sync source, and have an effective fencing-policy,
+	 * once *all* volumes are back in sync, call "unfence". */
+	if (os.conn == C_SYNC_SOURCE) {
+		enum drbd_disk_state disk_state = D_MASK;
+		enum drbd_disk_state pdsk_state = D_MASK;
+		enum drbd_fencing_p fp = FP_DONT_CARE;
+
+		rcu_read_lock();
+		fp = rcu_dereference(device->ldev->disk_conf)->fencing;
+		if (fp != FP_DONT_CARE) {
+			struct drbd_peer_device *peer_device;
+			int vnr;
+			idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+				struct drbd_device *device = peer_device->device;
+				disk_state = min_t(enum drbd_disk_state, disk_state, device->state.disk);
+				pdsk_state = min_t(enum drbd_disk_state, pdsk_state, device->state.pdsk);
+			}
+		}
+		rcu_read_unlock();
+		if (disk_state == D_UP_TO_DATE && pdsk_state == D_UP_TO_DATE)
+			conn_khelper(connection, "unfence-peer");
+	}
+
+	put_ldev(device);
+out:
+	device->rs_total  = 0;
+	device->rs_failed = 0;
+	device->rs_paused = 0;
+
+	/* reset start sector, if we reached end of device */
+	if (verify_done && device->ov_left == 0)
+		device->ov_start_sector = 0;
+
+	drbd_md_sync(device);
+
+	if (khelper_cmd)
+		drbd_khelper(device, khelper_cmd);
+
+	return 1;
+}
+
+/* helper */
+static void move_to_net_ee_or_free(struct drbd_device *device, struct drbd_peer_request *peer_req)
+{
+	if (drbd_peer_req_has_active_page(peer_req)) {
+		/* This might happen if sendpage() has not finished */
+		int i = PFN_UP(peer_req->i.size);
+		atomic_add(i, &device->pp_in_use_by_net);
+		atomic_sub(i, &device->pp_in_use);
+		spin_lock_irq(&device->resource->req_lock);
+		list_add_tail(&peer_req->w.list, &device->net_ee);
+		spin_unlock_irq(&device->resource->req_lock);
+		wake_up(&drbd_pp_wait);
+	} else
+		drbd_free_peer_req(device, peer_req);
+}
+
+/**
+ * w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST
+ * @w:		work object.
+ * @cancel:	The connection will be closed anyways
+ */
+int w_e_end_data_req(struct drbd_work *w, int cancel)
+{
+	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
+	struct drbd_peer_device *peer_device = peer_req->peer_device;
+	struct drbd_device *device = peer_device->device;
+	int err;
+
+	if (unlikely(cancel)) {
+		drbd_free_peer_req(device, peer_req);
+		dec_unacked(device);
+		return 0;
+	}
+
+	if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
+		err = drbd_send_block(peer_device, P_DATA_REPLY, peer_req);
+	} else {
+		if (__ratelimit(&drbd_ratelimit_state))
+			drbd_err(device, "Sending NegDReply. sector=%llus.\n",
+			    (unsigned long long)peer_req->i.sector);
+
+		err = drbd_send_ack(peer_device, P_NEG_DREPLY, peer_req);
+	}
+
+	dec_unacked(device);
+
+	move_to_net_ee_or_free(device, peer_req);
+
+	if (unlikely(err))
+		drbd_err(device, "drbd_send_block() failed\n");
+	return err;
+}
+
+static bool all_zero(struct drbd_peer_request *peer_req)
+{
+	struct page *page = peer_req->pages;
+	unsigned int len = peer_req->i.size;
+
+	page_chain_for_each(page) {
+		unsigned int l = min_t(unsigned int, len, PAGE_SIZE);
+		unsigned int i, words = l / sizeof(long);
+		unsigned long *d;
+
+		d = kmap_atomic(page);
+		for (i = 0; i < words; i++) {
+			if (d[i]) {
+				kunmap_atomic(d);
+				return false;
+			}
+		}
+		kunmap_atomic(d);
+		len -= l;
+	}
+
+	return true;
+}
+
+/**
+ * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUEST
+ * @w:		work object.
+ * @cancel:	The connection will be closed anyways
+ */
+int w_e_end_rsdata_req(struct drbd_work *w, int cancel)
+{
+	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
+	struct drbd_peer_device *peer_device = peer_req->peer_device;
+	struct drbd_device *device = peer_device->device;
+	int err;
+
+	if (unlikely(cancel)) {
+		drbd_free_peer_req(device, peer_req);
+		dec_unacked(device);
+		return 0;
+	}
+
+	if (get_ldev_if_state(device, D_FAILED)) {
+		drbd_rs_complete_io(device, peer_req->i.sector);
+		put_ldev(device);
+	}
+
+	if (device->state.conn == C_AHEAD) {
+		err = drbd_send_ack(peer_device, P_RS_CANCEL, peer_req);
+	} else if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
+		if (likely(device->state.pdsk >= D_INCONSISTENT)) {
+			inc_rs_pending(device);
+			if (peer_req->flags & EE_RS_THIN_REQ && all_zero(peer_req))
+				err = drbd_send_rs_deallocated(peer_device, peer_req);
+			else
+				err = drbd_send_block(peer_device, P_RS_DATA_REPLY, peer_req);
+		} else {
+			if (__ratelimit(&drbd_ratelimit_state))
+				drbd_err(device, "Not sending RSDataReply, "
+				    "partner DISKLESS!\n");
+			err = 0;
+		}
+	} else {
+		if (__ratelimit(&drbd_ratelimit_state))
+			drbd_err(device, "Sending NegRSDReply. sector %llus.\n",
+			    (unsigned long long)peer_req->i.sector);
+
+		err = drbd_send_ack(peer_device, P_NEG_RS_DREPLY, peer_req);
+
+		/* update resync data with failure */
+		drbd_rs_failed_io(device, peer_req->i.sector, peer_req->i.size);
+	}
+
+	dec_unacked(device);
+
+	move_to_net_ee_or_free(device, peer_req);
+
+	if (unlikely(err))
+		drbd_err(device, "drbd_send_block() failed\n");
+	return err;
+}
+
+int w_e_end_csum_rs_req(struct drbd_work *w, int cancel)
+{
+	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
+	struct drbd_peer_device *peer_device = peer_req->peer_device;
+	struct drbd_device *device = peer_device->device;
+	struct digest_info *di;
+	int digest_size;
+	void *digest = NULL;
+	int err, eq = 0;
+
+	if (unlikely(cancel)) {
+		drbd_free_peer_req(device, peer_req);
+		dec_unacked(device);
+		return 0;
+	}
+
+	if (get_ldev(device)) {
+		drbd_rs_complete_io(device, peer_req->i.sector);
+		put_ldev(device);
+	}
+
+	di = peer_req->digest;
+
+	if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
+		/* quick hack to try to avoid a race against reconfiguration.
+		 * a real fix would be much more involved,
+		 * introducing more locking mechanisms */
+		if (peer_device->connection->csums_tfm) {
+			digest_size = crypto_shash_digestsize(peer_device->connection->csums_tfm);
+			D_ASSERT(device, digest_size == di->digest_size);
+			digest = kmalloc(digest_size, GFP_NOIO);
+		}
+		if (digest) {
+			drbd_csum_ee(peer_device->connection->csums_tfm, peer_req, digest);
+			eq = !memcmp(digest, di->digest, digest_size);
+			kfree(digest);
+		}
+
+		if (eq) {
+			drbd_set_in_sync(device, peer_req->i.sector, peer_req->i.size);
+			/* rs_same_csums unit is BM_BLOCK_SIZE */
+			device->rs_same_csum += peer_req->i.size >> BM_BLOCK_SHIFT;
+			err = drbd_send_ack(peer_device, P_RS_IS_IN_SYNC, peer_req);
+		} else {
+			inc_rs_pending(device);
+			peer_req->block_id = ID_SYNCER; /* By setting block_id, digest pointer becomes invalid! */
+			peer_req->flags &= ~EE_HAS_DIGEST; /* This peer request no longer has a digest pointer */
+			kfree(di);
+			err = drbd_send_block(peer_device, P_RS_DATA_REPLY, peer_req);
+		}
+	} else {
+		err = drbd_send_ack(peer_device, P_NEG_RS_DREPLY, peer_req);
+		if (__ratelimit(&drbd_ratelimit_state))
+			drbd_err(device, "Sending NegDReply. I guess it gets messy.\n");
+	}
+
+	dec_unacked(device);
+	move_to_net_ee_or_free(device, peer_req);
+
+	if (unlikely(err))
+		drbd_err(device, "drbd_send_block/ack() failed\n");
+	return err;
+}
+
+int w_e_end_ov_req(struct drbd_work *w, int cancel)
+{
+	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
+	struct drbd_peer_device *peer_device = peer_req->peer_device;
+	struct drbd_device *device = peer_device->device;
+	sector_t sector = peer_req->i.sector;
+	unsigned int size = peer_req->i.size;
+	int digest_size;
+	void *digest;
+	int err = 0;
+
+	if (unlikely(cancel))
+		goto out;
+
+	digest_size = crypto_shash_digestsize(peer_device->connection->verify_tfm);
+	digest = kmalloc(digest_size, GFP_NOIO);
+	if (!digest) {
+		err = 1;	/* terminate the connection in case the allocation failed */
+		goto out;
+	}
+
+	if (likely(!(peer_req->flags & EE_WAS_ERROR)))
+		drbd_csum_ee(peer_device->connection->verify_tfm, peer_req, digest);
+	else
+		memset(digest, 0, digest_size);
+
+	/* Free e and pages before send.
+	 * In case we block on congestion, we could otherwise run into
+	 * some distributed deadlock, if the other side blocks on
+	 * congestion as well, because our receiver blocks in
+	 * drbd_alloc_pages due to pp_in_use > max_buffers. */
+	drbd_free_peer_req(device, peer_req);
+	peer_req = NULL;
+	inc_rs_pending(device);
+	err = drbd_send_drequest_csum(peer_device, sector, size, digest, digest_size, P_OV_REPLY);
+	if (err)
+		dec_rs_pending(device);
+	kfree(digest);
+
+out:
+	if (peer_req)
+		drbd_free_peer_req(device, peer_req);
+	dec_unacked(device);
+	return err;
+}
+
+void drbd_ov_out_of_sync_found(struct drbd_device *device, sector_t sector, int size)
+{
+	if (device->ov_last_oos_start + device->ov_last_oos_size == sector) {
+		device->ov_last_oos_size += size>>9;
+	} else {
+		device->ov_last_oos_start = sector;
+		device->ov_last_oos_size = size>>9;
+	}
+	drbd_set_out_of_sync(device, sector, size);
+}
+
+int w_e_end_ov_reply(struct drbd_work *w, int cancel)
+{
+	struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
+	struct drbd_peer_device *peer_device = peer_req->peer_device;
+	struct drbd_device *device = peer_device->device;
+	struct digest_info *di;
+	void *digest;
+	sector_t sector = peer_req->i.sector;
+	unsigned int size = peer_req->i.size;
+	int digest_size;
+	int err, eq = 0;
+	bool stop_sector_reached = false;
+
+	if (unlikely(cancel)) {
+		drbd_free_peer_req(device, peer_req);
+		dec_unacked(device);
+		return 0;
+	}
+
+	/* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all
+	 * the resync lru has been cleaned up already */
+	if (get_ldev(device)) {
+		drbd_rs_complete_io(device, peer_req->i.sector);
+		put_ldev(device);
+	}
+
+	di = peer_req->digest;
+
+	if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
+		digest_size = crypto_shash_digestsize(peer_device->connection->verify_tfm);
+		digest = kmalloc(digest_size, GFP_NOIO);
+		if (digest) {
+			drbd_csum_ee(peer_device->connection->verify_tfm, peer_req, digest);
+
+			D_ASSERT(device, digest_size == di->digest_size);
+			eq = !memcmp(digest, di->digest, digest_size);
+			kfree(digest);
+		}
+	}
+
+	/* Free peer_req and pages before send.
+	 * In case we block on congestion, we could otherwise run into
+	 * some distributed deadlock, if the other side blocks on
+	 * congestion as well, because our receiver blocks in
+	 * drbd_alloc_pages due to pp_in_use > max_buffers. */
+	drbd_free_peer_req(device, peer_req);
+	if (!eq)
+		drbd_ov_out_of_sync_found(device, sector, size);
+	else
+		ov_out_of_sync_print(device);
+
+	err = drbd_send_ack_ex(peer_device, P_OV_RESULT, sector, size,
+			       eq ? ID_IN_SYNC : ID_OUT_OF_SYNC);
+
+	dec_unacked(device);
+
+	--device->ov_left;
+
+	/* let's advance progress step marks only for every other megabyte */
+	if ((device->ov_left & 0x200) == 0x200)
+		drbd_advance_rs_marks(device, device->ov_left);
+
+	stop_sector_reached = verify_can_do_stop_sector(device) &&
+		(sector + (size>>9)) >= device->ov_stop_sector;
+
+	if (device->ov_left == 0 || stop_sector_reached) {
+		ov_out_of_sync_print(device);
+		drbd_resync_finished(device);
+	}
+
+	return err;
+}
+
+/* FIXME
+ * We need to track the number of pending barrier acks,
+ * and to be able to wait for them.
+ * See also comment in drbd_adm_attach before drbd_suspend_io.
+ */
+static int drbd_send_barrier(struct drbd_connection *connection)
+{
+	struct p_barrier *p;
+	struct drbd_socket *sock;
+
+	sock = &connection->data;
+	p = conn_prepare_command(connection, sock);
+	if (!p)
+		return -EIO;
+	p->barrier = connection->send.current_epoch_nr;
+	p->pad = 0;
+	connection->send.current_epoch_writes = 0;
+	connection->send.last_sent_barrier_jif = jiffies;
+
+	return conn_send_command(connection, sock, P_BARRIER, sizeof(*p), NULL, 0);
+}
+
+static int pd_send_unplug_remote(struct drbd_peer_device *pd)
+{
+	struct drbd_socket *sock = &pd->connection->data;
+	if (!drbd_prepare_command(pd, sock))
+		return -EIO;
+	return drbd_send_command(pd, sock, P_UNPLUG_REMOTE, 0, NULL, 0);
+}
+
+int w_send_write_hint(struct drbd_work *w, int cancel)
+{
+	struct drbd_device *device =
+		container_of(w, struct drbd_device, unplug_work);
+
+	if (cancel)
+		return 0;
+	return pd_send_unplug_remote(first_peer_device(device));
+}
+
+static void re_init_if_first_write(struct drbd_connection *connection, unsigned int epoch)
+{
+	if (!connection->send.seen_any_write_yet) {
+		connection->send.seen_any_write_yet = true;
+		connection->send.current_epoch_nr = epoch;
+		connection->send.current_epoch_writes = 0;
+		connection->send.last_sent_barrier_jif = jiffies;
+	}
+}
+
+static void maybe_send_barrier(struct drbd_connection *connection, unsigned int epoch)
+{
+	/* re-init if first write on this connection */
+	if (!connection->send.seen_any_write_yet)
+		return;
+	if (connection->send.current_epoch_nr != epoch) {
+		if (connection->send.current_epoch_writes)
+			drbd_send_barrier(connection);
+		connection->send.current_epoch_nr = epoch;
+	}
+}
+
+int w_send_out_of_sync(struct drbd_work *w, int cancel)
+{
+	struct drbd_request *req = container_of(w, struct drbd_request, w);
+	struct drbd_device *device = req->device;
+	struct drbd_peer_device *const peer_device = first_peer_device(device);
+	struct drbd_connection *const connection = peer_device->connection;
+	int err;
+
+	if (unlikely(cancel)) {
+		req_mod(req, SEND_CANCELED);
+		return 0;
+	}
+	req->pre_send_jif = jiffies;
+
+	/* this time, no connection->send.current_epoch_writes++;
+	 * If it was sent, it was the closing barrier for the last
+	 * replicated epoch, before we went into AHEAD mode.
+	 * No more barriers will be sent, until we leave AHEAD mode again. */
+	maybe_send_barrier(connection, req->epoch);
+
+	err = drbd_send_out_of_sync(peer_device, req);
+	req_mod(req, OOS_HANDED_TO_NETWORK);
+
+	return err;
+}
+
+/**
+ * w_send_dblock() - Worker callback to send a P_DATA packet in order to mirror a write request
+ * @w:		work object.
+ * @cancel:	The connection will be closed anyways
+ */
+int w_send_dblock(struct drbd_work *w, int cancel)
+{
+	struct drbd_request *req = container_of(w, struct drbd_request, w);
+	struct drbd_device *device = req->device;
+	struct drbd_peer_device *const peer_device = first_peer_device(device);
+	struct drbd_connection *connection = peer_device->connection;
+	bool do_send_unplug = req->rq_state & RQ_UNPLUG;
+	int err;
+
+	if (unlikely(cancel)) {
+		req_mod(req, SEND_CANCELED);
+		return 0;
+	}
+	req->pre_send_jif = jiffies;
+
+	re_init_if_first_write(connection, req->epoch);
+	maybe_send_barrier(connection, req->epoch);
+	connection->send.current_epoch_writes++;
+
+	err = drbd_send_dblock(peer_device, req);
+	req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK);
+
+	if (do_send_unplug && !err)
+		pd_send_unplug_remote(peer_device);
+
+	return err;
+}
+
+/**
+ * w_send_read_req() - Worker callback to send a read request (P_DATA_REQUEST) packet
+ * @w:		work object.
+ * @cancel:	The connection will be closed anyways
+ */
+int w_send_read_req(struct drbd_work *w, int cancel)
+{
+	struct drbd_request *req = container_of(w, struct drbd_request, w);
+	struct drbd_device *device = req->device;
+	struct drbd_peer_device *const peer_device = first_peer_device(device);
+	struct drbd_connection *connection = peer_device->connection;
+	bool do_send_unplug = req->rq_state & RQ_UNPLUG;
+	int err;
+
+	if (unlikely(cancel)) {
+		req_mod(req, SEND_CANCELED);
+		return 0;
+	}
+	req->pre_send_jif = jiffies;
+
+	/* Even read requests may close a write epoch,
+	 * if there was any yet. */
+	maybe_send_barrier(connection, req->epoch);
+
+	err = drbd_send_drequest(peer_device, P_DATA_REQUEST, req->i.sector, req->i.size,
+				 (unsigned long)req);
+
+	req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK);
+
+	if (do_send_unplug && !err)
+		pd_send_unplug_remote(peer_device);
+
+	return err;
+}
+
+int w_restart_disk_io(struct drbd_work *w, int cancel)
+{
+	struct drbd_request *req = container_of(w, struct drbd_request, w);
+	struct drbd_device *device = req->device;
+
+	if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG)
+		drbd_al_begin_io(device, &req->i);
+
+	req->private_bio = bio_alloc_clone(device->ldev->backing_bdev,
+					   req->master_bio, GFP_NOIO,
+					  &drbd_io_bio_set);
+	req->private_bio->bi_private = req;
+	req->private_bio->bi_end_io = drbd_request_endio;
+	submit_bio_noacct(req->private_bio);
+
+	return 0;
+}
+
+static int _drbd_may_sync_now(struct drbd_device *device)
+{
+	struct drbd_device *odev = device;
+	int resync_after;
+
+	while (1) {
+		if (!odev->ldev || odev->state.disk == D_DISKLESS)
+			return 1;
+		rcu_read_lock();
+		resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after;
+		rcu_read_unlock();
+		if (resync_after == -1)
+			return 1;
+		odev = minor_to_device(resync_after);
+		if (!odev)
+			return 1;
+		if ((odev->state.conn >= C_SYNC_SOURCE &&
+		     odev->state.conn <= C_PAUSED_SYNC_T) ||
+		    odev->state.aftr_isp || odev->state.peer_isp ||
+		    odev->state.user_isp)
+			return 0;
+	}
+}
+
+/**
+ * drbd_pause_after() - Pause resync on all devices that may not resync now
+ * @device:	DRBD device.
+ *
+ * Called from process context only (admin command and after_state_ch).
+ */
+static bool drbd_pause_after(struct drbd_device *device)
+{
+	bool changed = false;
+	struct drbd_device *odev;
+	int i;
+
+	rcu_read_lock();
+	idr_for_each_entry(&drbd_devices, odev, i) {
+		if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
+			continue;
+		if (!_drbd_may_sync_now(odev) &&
+		    _drbd_set_state(_NS(odev, aftr_isp, 1),
+				    CS_HARD, NULL) != SS_NOTHING_TO_DO)
+			changed = true;
+	}
+	rcu_read_unlock();
+
+	return changed;
+}
+
+/**
+ * drbd_resume_next() - Resume resync on all devices that may resync now
+ * @device:	DRBD device.
+ *
+ * Called from process context only (admin command and worker).
+ */
+static bool drbd_resume_next(struct drbd_device *device)
+{
+	bool changed = false;
+	struct drbd_device *odev;
+	int i;
+
+	rcu_read_lock();
+	idr_for_each_entry(&drbd_devices, odev, i) {
+		if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
+			continue;
+		if (odev->state.aftr_isp) {
+			if (_drbd_may_sync_now(odev) &&
+			    _drbd_set_state(_NS(odev, aftr_isp, 0),
+					    CS_HARD, NULL) != SS_NOTHING_TO_DO)
+				changed = true;
+		}
+	}
+	rcu_read_unlock();
+	return changed;
+}
+
+void resume_next_sg(struct drbd_device *device)
+{
+	lock_all_resources();
+	drbd_resume_next(device);
+	unlock_all_resources();
+}
+
+void suspend_other_sg(struct drbd_device *device)
+{
+	lock_all_resources();
+	drbd_pause_after(device);
+	unlock_all_resources();
+}
+
+/* caller must lock_all_resources() */
+enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_minor)
+{
+	struct drbd_device *odev;
+	int resync_after;
+
+	if (o_minor == -1)
+		return NO_ERROR;
+	if (o_minor < -1 || o_minor > MINORMASK)
+		return ERR_RESYNC_AFTER;
+
+	/* check for loops */
+	odev = minor_to_device(o_minor);
+	while (1) {
+		if (odev == device)
+			return ERR_RESYNC_AFTER_CYCLE;
+
+		/* You are free to depend on diskless, non-existing,
+		 * or not yet/no longer existing minors.
+		 * We only reject dependency loops.
+		 * We cannot follow the dependency chain beyond a detached or
+		 * missing minor.
+		 */
+		if (!odev || !odev->ldev || odev->state.disk == D_DISKLESS)
+			return NO_ERROR;
+
+		rcu_read_lock();
+		resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after;
+		rcu_read_unlock();
+		/* dependency chain ends here, no cycles. */
+		if (resync_after == -1)
+			return NO_ERROR;
+
+		/* follow the dependency chain */
+		odev = minor_to_device(resync_after);
+	}
+}
+
+/* caller must lock_all_resources() */
+void drbd_resync_after_changed(struct drbd_device *device)
+{
+	int changed;
+
+	do {
+		changed  = drbd_pause_after(device);
+		changed |= drbd_resume_next(device);
+	} while (changed);
+}
+
+void drbd_rs_controller_reset(struct drbd_device *device)
+{
+	struct gendisk *disk = device->ldev->backing_bdev->bd_disk;
+	struct fifo_buffer *plan;
+
+	atomic_set(&device->rs_sect_in, 0);
+	atomic_set(&device->rs_sect_ev, 0);
+	device->rs_in_flight = 0;
+	device->rs_last_events =
+		(int)part_stat_read_accum(disk->part0, sectors);
+
+	/* Updating the RCU protected object in place is necessary since
+	   this function gets called from atomic context.
+	   It is valid since all other updates also lead to an completely
+	   empty fifo */
+	rcu_read_lock();
+	plan = rcu_dereference(device->rs_plan_s);
+	plan->total = 0;
+	fifo_set(plan, 0);
+	rcu_read_unlock();
+}
+
+void start_resync_timer_fn(struct timer_list *t)
+{
+	struct drbd_device *device = from_timer(device, t, start_resync_timer);
+	drbd_device_post_work(device, RS_START);
+}
+
+static void do_start_resync(struct drbd_device *device)
+{
+	if (atomic_read(&device->unacked_cnt) || atomic_read(&device->rs_pending_cnt)) {
+		drbd_warn(device, "postponing start_resync ...\n");
+		device->start_resync_timer.expires = jiffies + HZ/10;
+		add_timer(&device->start_resync_timer);
+		return;
+	}
+
+	drbd_start_resync(device, C_SYNC_SOURCE);
+	clear_bit(AHEAD_TO_SYNC_SOURCE, &device->flags);
+}
+
+static bool use_checksum_based_resync(struct drbd_connection *connection, struct drbd_device *device)
+{
+	bool csums_after_crash_only;
+	rcu_read_lock();
+	csums_after_crash_only = rcu_dereference(connection->net_conf)->csums_after_crash_only;
+	rcu_read_unlock();
+	return connection->agreed_pro_version >= 89 &&		/* supported? */
+		connection->csums_tfm &&			/* configured? */
+		(csums_after_crash_only == false		/* use for each resync? */
+		 || test_bit(CRASHED_PRIMARY, &device->flags));	/* or only after Primary crash? */
+}
+
+/**
+ * drbd_start_resync() - Start the resync process
+ * @device:	DRBD device.
+ * @side:	Either C_SYNC_SOURCE or C_SYNC_TARGET
+ *
+ * This function might bring you directly into one of the
+ * C_PAUSED_SYNC_* states.
+ */
+void drbd_start_resync(struct drbd_device *device, enum drbd_conns side)
+{
+	struct drbd_peer_device *peer_device = first_peer_device(device);
+	struct drbd_connection *connection = peer_device ? peer_device->connection : NULL;
+	union drbd_state ns;
+	int r;
+
+	if (device->state.conn >= C_SYNC_SOURCE && device->state.conn < C_AHEAD) {
+		drbd_err(device, "Resync already running!\n");
+		return;
+	}
+
+	if (!connection) {
+		drbd_err(device, "No connection to peer, aborting!\n");
+		return;
+	}
+
+	if (!test_bit(B_RS_H_DONE, &device->flags)) {
+		if (side == C_SYNC_TARGET) {
+			/* Since application IO was locked out during C_WF_BITMAP_T and
+			   C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET
+			   we check that we might make the data inconsistent. */
+			r = drbd_khelper(device, "before-resync-target");
+			r = (r >> 8) & 0xff;
+			if (r > 0) {
+				drbd_info(device, "before-resync-target handler returned %d, "
+					 "dropping connection.\n", r);
+				conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD);
+				return;
+			}
+		} else /* C_SYNC_SOURCE */ {
+			r = drbd_khelper(device, "before-resync-source");
+			r = (r >> 8) & 0xff;
+			if (r > 0) {
+				if (r == 3) {
+					drbd_info(device, "before-resync-source handler returned %d, "
+						 "ignoring. Old userland tools?", r);
+				} else {
+					drbd_info(device, "before-resync-source handler returned %d, "
+						 "dropping connection.\n", r);
+					conn_request_state(connection,
+							   NS(conn, C_DISCONNECTING), CS_HARD);
+					return;
+				}
+			}
+		}
+	}
+
+	if (current == connection->worker.task) {
+		/* The worker should not sleep waiting for state_mutex,
+		   that can take long */
+		if (!mutex_trylock(device->state_mutex)) {
+			set_bit(B_RS_H_DONE, &device->flags);
+			device->start_resync_timer.expires = jiffies + HZ/5;
+			add_timer(&device->start_resync_timer);
+			return;
+		}
+	} else {
+		mutex_lock(device->state_mutex);
+	}
+
+	lock_all_resources();
+	clear_bit(B_RS_H_DONE, &device->flags);
+	/* Did some connection breakage or IO error race with us? */
+	if (device->state.conn < C_CONNECTED
+	|| !get_ldev_if_state(device, D_NEGOTIATING)) {
+		unlock_all_resources();
+		goto out;
+	}
+
+	ns = drbd_read_state(device);
+
+	ns.aftr_isp = !_drbd_may_sync_now(device);
+
+	ns.conn = side;
+
+	if (side == C_SYNC_TARGET)
+		ns.disk = D_INCONSISTENT;
+	else /* side == C_SYNC_SOURCE */
+		ns.pdsk = D_INCONSISTENT;
+
+	r = _drbd_set_state(device, ns, CS_VERBOSE, NULL);
+	ns = drbd_read_state(device);
+
+	if (ns.conn < C_CONNECTED)
+		r = SS_UNKNOWN_ERROR;
+
+	if (r == SS_SUCCESS) {
+		unsigned long tw = drbd_bm_total_weight(device);
+		unsigned long now = jiffies;
+		int i;
+
+		device->rs_failed    = 0;
+		device->rs_paused    = 0;
+		device->rs_same_csum = 0;
+		device->rs_last_sect_ev = 0;
+		device->rs_total     = tw;
+		device->rs_start     = now;
+		for (i = 0; i < DRBD_SYNC_MARKS; i++) {
+			device->rs_mark_left[i] = tw;
+			device->rs_mark_time[i] = now;
+		}
+		drbd_pause_after(device);
+		/* Forget potentially stale cached per resync extent bit-counts.
+		 * Open coded drbd_rs_cancel_all(device), we already have IRQs
+		 * disabled, and know the disk state is ok. */
+		spin_lock(&device->al_lock);
+		lc_reset(device->resync);
+		device->resync_locked = 0;
+		device->resync_wenr = LC_FREE;
+		spin_unlock(&device->al_lock);
+	}
+	unlock_all_resources();
+
+	if (r == SS_SUCCESS) {
+		wake_up(&device->al_wait); /* for lc_reset() above */
+		/* reset rs_last_bcast when a resync or verify is started,
+		 * to deal with potential jiffies wrap. */
+		device->rs_last_bcast = jiffies - HZ;
+
+		drbd_info(device, "Began resync as %s (will sync %lu KB [%lu bits set]).\n",
+		     drbd_conn_str(ns.conn),
+		     (unsigned long) device->rs_total << (BM_BLOCK_SHIFT-10),
+		     (unsigned long) device->rs_total);
+		if (side == C_SYNC_TARGET) {
+			device->bm_resync_fo = 0;
+			device->use_csums = use_checksum_based_resync(connection, device);
+		} else {
+			device->use_csums = false;
+		}
+
+		/* Since protocol 96, we must serialize drbd_gen_and_send_sync_uuid
+		 * with w_send_oos, or the sync target will get confused as to
+		 * how much bits to resync.  We cannot do that always, because for an
+		 * empty resync and protocol < 95, we need to do it here, as we call
+		 * drbd_resync_finished from here in that case.
+		 * We drbd_gen_and_send_sync_uuid here for protocol < 96,
+		 * and from after_state_ch otherwise. */
+		if (side == C_SYNC_SOURCE && connection->agreed_pro_version < 96)
+			drbd_gen_and_send_sync_uuid(peer_device);
+
+		if (connection->agreed_pro_version < 95 && device->rs_total == 0) {
+			/* This still has a race (about when exactly the peers
+			 * detect connection loss) that can lead to a full sync
+			 * on next handshake. In 8.3.9 we fixed this with explicit
+			 * resync-finished notifications, but the fix
+			 * introduces a protocol change.  Sleeping for some
+			 * time longer than the ping interval + timeout on the
+			 * SyncSource, to give the SyncTarget the chance to
+			 * detect connection loss, then waiting for a ping
+			 * response (implicit in drbd_resync_finished) reduces
+			 * the race considerably, but does not solve it. */
+			if (side == C_SYNC_SOURCE) {
+				struct net_conf *nc;
+				int timeo;
+
+				rcu_read_lock();
+				nc = rcu_dereference(connection->net_conf);
+				timeo = nc->ping_int * HZ + nc->ping_timeo * HZ / 9;
+				rcu_read_unlock();
+				schedule_timeout_interruptible(timeo);
+			}
+			drbd_resync_finished(device);
+		}
+
+		drbd_rs_controller_reset(device);
+		/* ns.conn may already be != device->state.conn,
+		 * we may have been paused in between, or become paused until
+		 * the timer triggers.
+		 * No matter, that is handled in resync_timer_fn() */
+		if (ns.conn == C_SYNC_TARGET)
+			mod_timer(&device->resync_timer, jiffies);
+
+		drbd_md_sync(device);
+	}
+	put_ldev(device);
+out:
+	mutex_unlock(device->state_mutex);
+}
+
+static void update_on_disk_bitmap(struct drbd_device *device, bool resync_done)
+{
+	struct sib_info sib = { .sib_reason = SIB_SYNC_PROGRESS, };
+	device->rs_last_bcast = jiffies;
+
+	if (!get_ldev(device))
+		return;
+
+	drbd_bm_write_lazy(device, 0);
+	if (resync_done && is_sync_state(device->state.conn))
+		drbd_resync_finished(device);
+
+	drbd_bcast_event(device, &sib);
+	/* update timestamp, in case it took a while to write out stuff */
+	device->rs_last_bcast = jiffies;
+	put_ldev(device);
+}
+
+static void drbd_ldev_destroy(struct drbd_device *device)
+{
+	lc_destroy(device->resync);
+	device->resync = NULL;
+	lc_destroy(device->act_log);
+	device->act_log = NULL;
+
+	__acquire(local);
+	drbd_backing_dev_free(device, device->ldev);
+	device->ldev = NULL;
+	__release(local);
+
+	clear_bit(GOING_DISKLESS, &device->flags);
+	wake_up(&device->misc_wait);
+}
+
+static void go_diskless(struct drbd_device *device)
+{
+	D_ASSERT(device, device->state.disk == D_FAILED);
+	/* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
+	 * inc/dec it frequently. Once we are D_DISKLESS, no one will touch
+	 * the protected members anymore, though, so once put_ldev reaches zero
+	 * again, it will be safe to free them. */
+
+	/* Try to write changed bitmap pages, read errors may have just
+	 * set some bits outside the area covered by the activity log.
+	 *
+	 * If we have an IO error during the bitmap writeout,
+	 * we will want a full sync next time, just in case.
+	 * (Do we want a specific meta data flag for this?)
+	 *
+	 * If that does not make it to stable storage either,
+	 * we cannot do anything about that anymore.
+	 *
+	 * We still need to check if both bitmap and ldev are present, we may
+	 * end up here after a failed attach, before ldev was even assigned.
+	 */
+	if (device->bitmap && device->ldev) {
+		/* An interrupted resync or similar is allowed to recounts bits
+		 * while we detach.
+		 * Any modifications would not be expected anymore, though.
+		 */
+		if (drbd_bitmap_io_from_worker(device, drbd_bm_write,
+					"detach", BM_LOCKED_TEST_ALLOWED)) {
+			if (test_bit(WAS_READ_ERROR, &device->flags)) {
+				drbd_md_set_flag(device, MDF_FULL_SYNC);
+				drbd_md_sync(device);
+			}
+		}
+	}
+
+	drbd_force_state(device, NS(disk, D_DISKLESS));
+}
+
+static int do_md_sync(struct drbd_device *device)
+{
+	drbd_warn(device, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
+	drbd_md_sync(device);
+	return 0;
+}
+
+/* only called from drbd_worker thread, no locking */
+void __update_timing_details(
+		struct drbd_thread_timing_details *tdp,
+		unsigned int *cb_nr,
+		void *cb,
+		const char *fn, const unsigned int line)
+{
+	unsigned int i = *cb_nr % DRBD_THREAD_DETAILS_HIST;
+	struct drbd_thread_timing_details *td = tdp + i;
+
+	td->start_jif = jiffies;
+	td->cb_addr = cb;
+	td->caller_fn = fn;
+	td->line = line;
+	td->cb_nr = *cb_nr;
+
+	i = (i+1) % DRBD_THREAD_DETAILS_HIST;
+	td = tdp + i;
+	memset(td, 0, sizeof(*td));
+
+	++(*cb_nr);
+}
+
+static void do_device_work(struct drbd_device *device, const unsigned long todo)
+{
+	if (test_bit(MD_SYNC, &todo))
+		do_md_sync(device);
+	if (test_bit(RS_DONE, &todo) ||
+	    test_bit(RS_PROGRESS, &todo))
+		update_on_disk_bitmap(device, test_bit(RS_DONE, &todo));
+	if (test_bit(GO_DISKLESS, &todo))
+		go_diskless(device);
+	if (test_bit(DESTROY_DISK, &todo))
+		drbd_ldev_destroy(device);
+	if (test_bit(RS_START, &todo))
+		do_start_resync(device);
+}
+
+#define DRBD_DEVICE_WORK_MASK	\
+	((1UL << GO_DISKLESS)	\
+	|(1UL << DESTROY_DISK)	\
+	|(1UL << MD_SYNC)	\
+	|(1UL << RS_START)	\
+	|(1UL << RS_PROGRESS)	\
+	|(1UL << RS_DONE)	\
+	)
+
+static unsigned long get_work_bits(unsigned long *flags)
+{
+	unsigned long old, new;
+	do {
+		old = *flags;
+		new = old & ~DRBD_DEVICE_WORK_MASK;
+	} while (cmpxchg(flags, old, new) != old);
+	return old & DRBD_DEVICE_WORK_MASK;
+}
+
+static void do_unqueued_work(struct drbd_connection *connection)
+{
+	struct drbd_peer_device *peer_device;
+	int vnr;
+
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+		struct drbd_device *device = peer_device->device;
+		unsigned long todo = get_work_bits(&device->flags);
+		if (!todo)
+			continue;
+
+		kref_get(&device->kref);
+		rcu_read_unlock();
+		do_device_work(device, todo);
+		kref_put(&device->kref, drbd_destroy_device);
+		rcu_read_lock();
+	}
+	rcu_read_unlock();
+}
+
+static bool dequeue_work_batch(struct drbd_work_queue *queue, struct list_head *work_list)
+{
+	spin_lock_irq(&queue->q_lock);
+	list_splice_tail_init(&queue->q, work_list);
+	spin_unlock_irq(&queue->q_lock);
+	return !list_empty(work_list);
+}
+
+static void wait_for_work(struct drbd_connection *connection, struct list_head *work_list)
+{
+	DEFINE_WAIT(wait);
+	struct net_conf *nc;
+	int uncork, cork;
+
+	dequeue_work_batch(&connection->sender_work, work_list);
+	if (!list_empty(work_list))
+		return;
+
+	/* Still nothing to do?
+	 * Maybe we still need to close the current epoch,
+	 * even if no new requests are queued yet.
+	 *
+	 * Also, poke TCP, just in case.
+	 * Then wait for new work (or signal). */
+	rcu_read_lock();
+	nc = rcu_dereference(connection->net_conf);
+	uncork = nc ? nc->tcp_cork : 0;
+	rcu_read_unlock();
+	if (uncork) {
+		mutex_lock(&connection->data.mutex);
+		if (connection->data.socket)
+			tcp_sock_set_cork(connection->data.socket->sk, false);
+		mutex_unlock(&connection->data.mutex);
+	}
+
+	for (;;) {
+		int send_barrier;
+		prepare_to_wait(&connection->sender_work.q_wait, &wait, TASK_INTERRUPTIBLE);
+		spin_lock_irq(&connection->resource->req_lock);
+		spin_lock(&connection->sender_work.q_lock);	/* FIXME get rid of this one? */
+		if (!list_empty(&connection->sender_work.q))
+			list_splice_tail_init(&connection->sender_work.q, work_list);
+		spin_unlock(&connection->sender_work.q_lock);	/* FIXME get rid of this one? */
+		if (!list_empty(work_list) || signal_pending(current)) {
+			spin_unlock_irq(&connection->resource->req_lock);
+			break;
+		}
+
+		/* We found nothing new to do, no to-be-communicated request,
+		 * no other work item.  We may still need to close the last
+		 * epoch.  Next incoming request epoch will be connection ->
+		 * current transfer log epoch number.  If that is different
+		 * from the epoch of the last request we communicated, it is
+		 * safe to send the epoch separating barrier now.
+		 */
+		send_barrier =
+			atomic_read(&connection->current_tle_nr) !=
+			connection->send.current_epoch_nr;
+		spin_unlock_irq(&connection->resource->req_lock);
+
+		if (send_barrier)
+			maybe_send_barrier(connection,
+					connection->send.current_epoch_nr + 1);
+
+		if (test_bit(DEVICE_WORK_PENDING, &connection->flags))
+			break;
+
+		/* drbd_send() may have called flush_signals() */
+		if (get_t_state(&connection->worker) != RUNNING)
+			break;
+
+		schedule();
+		/* may be woken up for other things but new work, too,
+		 * e.g. if the current epoch got closed.
+		 * In which case we send the barrier above. */
+	}
+	finish_wait(&connection->sender_work.q_wait, &wait);
+
+	/* someone may have changed the config while we have been waiting above. */
+	rcu_read_lock();
+	nc = rcu_dereference(connection->net_conf);
+	cork = nc ? nc->tcp_cork : 0;
+	rcu_read_unlock();
+	mutex_lock(&connection->data.mutex);
+	if (connection->data.socket) {
+		if (cork)
+			tcp_sock_set_cork(connection->data.socket->sk, true);
+		else if (!uncork)
+			tcp_sock_set_cork(connection->data.socket->sk, false);
+	}
+	mutex_unlock(&connection->data.mutex);
+}
+
+int drbd_worker(struct drbd_thread *thi)
+{
+	struct drbd_connection *connection = thi->connection;
+	struct drbd_work *w = NULL;
+	struct drbd_peer_device *peer_device;
+	LIST_HEAD(work_list);
+	int vnr;
+
+	while (get_t_state(thi) == RUNNING) {
+		drbd_thread_current_set_cpu(thi);
+
+		if (list_empty(&work_list)) {
+			update_worker_timing_details(connection, wait_for_work);
+			wait_for_work(connection, &work_list);
+		}
+
+		if (test_and_clear_bit(DEVICE_WORK_PENDING, &connection->flags)) {
+			update_worker_timing_details(connection, do_unqueued_work);
+			do_unqueued_work(connection);
+		}
+
+		if (signal_pending(current)) {
+			flush_signals(current);
+			if (get_t_state(thi) == RUNNING) {
+				drbd_warn(connection, "Worker got an unexpected signal\n");
+				continue;
+			}
+			break;
+		}
+
+		if (get_t_state(thi) != RUNNING)
+			break;
+
+		if (!list_empty(&work_list)) {
+			w = list_first_entry(&work_list, struct drbd_work, list);
+			list_del_init(&w->list);
+			update_worker_timing_details(connection, w->cb);
+			if (w->cb(w, connection->cstate < C_WF_REPORT_PARAMS) == 0)
+				continue;
+			if (connection->cstate >= C_WF_REPORT_PARAMS)
+				conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD);
+		}
+	}
+
+	do {
+		if (test_and_clear_bit(DEVICE_WORK_PENDING, &connection->flags)) {
+			update_worker_timing_details(connection, do_unqueued_work);
+			do_unqueued_work(connection);
+		}
+		if (!list_empty(&work_list)) {
+			w = list_first_entry(&work_list, struct drbd_work, list);
+			list_del_init(&w->list);
+			update_worker_timing_details(connection, w->cb);
+			w->cb(w, 1);
+		} else
+			dequeue_work_batch(&connection->sender_work, &work_list);
+	} while (!list_empty(&work_list) || test_bit(DEVICE_WORK_PENDING, &connection->flags));
+
+	rcu_read_lock();
+	idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
+		struct drbd_device *device = peer_device->device;
+		D_ASSERT(device, device->state.disk == D_DISKLESS && device->state.conn == C_STANDALONE);
+		kref_get(&device->kref);
+		rcu_read_unlock();
+		drbd_device_cleanup(device);
+		kref_put(&device->kref, drbd_destroy_device);
+		rcu_read_lock();
+	}
+	rcu_read_unlock();
+
+	return 0;
+}
diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c
new file mode 100644
index 000000000..487840e35
--- /dev/null
+++ b/drivers/block/floppy.c
@@ -0,0 +1,5027 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ *  linux/drivers/block/floppy.c
+ *
+ *  Copyright (C) 1991, 1992  Linus Torvalds
+ *  Copyright (C) 1993, 1994  Alain Knaff
+ *  Copyright (C) 1998 Alan Cox
+ */
+
+/*
+ * 02.12.91 - Changed to static variables to indicate need for reset
+ * and recalibrate. This makes some things easier (output_byte reset
+ * checking etc), and means less interrupt jumping in case of errors,
+ * so the code is hopefully easier to understand.
+ */
+
+/*
+ * This file is certainly a mess. I've tried my best to get it working,
+ * but I don't like programming floppies, and I have only one anyway.
+ * Urgel. I should check for more errors, and do more graceful error
+ * recovery. Seems there are problems with several drives. I've tried to
+ * correct them. No promises.
+ */
+
+/*
+ * As with hd.c, all routines within this file can (and will) be called
+ * by interrupts, so extreme caution is needed. A hardware interrupt
+ * handler may not sleep, or a kernel panic will happen. Thus I cannot
+ * call "floppy-on" directly, but have to set a special timer interrupt
+ * etc.
+ */
+
+/*
+ * 28.02.92 - made track-buffering routines, based on the routines written
+ * by entropy@wintermute.wpi.edu (Lawrence Foard). Linus.
+ */
+
+/*
+ * Automatic floppy-detection and formatting written by Werner Almesberger
+ * (almesber@nessie.cs.id.ethz.ch), who also corrected some problems with
+ * the floppy-change signal detection.
+ */
+
+/*
+ * 1992/7/22 -- Hennus Bergman: Added better error reporting, fixed
+ * FDC data overrun bug, added some preliminary stuff for vertical
+ * recording support.
+ *
+ * 1992/9/17: Added DMA allocation & DMA functions. -- hhb.
+ *
+ * TODO: Errors are still not counted properly.
+ */
+
+/* 1992/9/20
+ * Modifications for ``Sector Shifting'' by Rob Hooft (hooft@chem.ruu.nl)
+ * modeled after the freeware MS-DOS program fdformat/88 V1.8 by
+ * Christoph H. Hochst\"atter.
+ * I have fixed the shift values to the ones I always use. Maybe a new
+ * ioctl() should be created to be able to modify them.
+ * There is a bug in the driver that makes it impossible to format a
+ * floppy as the first thing after bootup.
+ */
+
+/*
+ * 1993/4/29 -- Linus -- cleaned up the timer handling in the kernel, and
+ * this helped the floppy driver as well. Much cleaner, and still seems to
+ * work.
+ */
+
+/* 1994/6/24 --bbroad-- added the floppy table entries and made
+ * minor modifications to allow 2.88 floppies to be run.
+ */
+
+/* 1994/7/13 -- Paul Vojta -- modified the probing code to allow three or more
+ * disk types.
+ */
+
+/*
+ * 1994/8/8 -- Alain Knaff -- Switched to fdpatch driver: Support for bigger
+ * format bug fixes, but unfortunately some new bugs too...
+ */
+
+/* 1994/9/17 -- Koen Holtman -- added logging of physical floppy write
+ * errors to allow safe writing by specialized programs.
+ */
+
+/* 1995/4/24 -- Dan Fandrich -- added support for Commodore 1581 3.5" disks
+ * by defining bit 1 of the "stretch" parameter to mean put sectors on the
+ * opposite side of the disk, leaving the sector IDs alone (i.e. Commodore's
+ * drives are "upside-down").
+ */
+
+/*
+ * 1995/8/26 -- Andreas Busse -- added Mips support.
+ */
+
+/*
+ * 1995/10/18 -- Ralf Baechle -- Portability cleanup; move machine dependent
+ * features to asm/floppy.h.
+ */
+
+/*
+ * 1998/1/21 -- Richard Gooch <rgooch@atnf.csiro.au> -- devfs support
+ */
+
+/*
+ * 1998/05/07 -- Russell King -- More portability cleanups; moved definition of
+ * interrupt and dma channel to asm/floppy.h. Cleaned up some formatting &
+ * use of '0' for NULL.
+ */
+
+/*
+ * 1998/06/07 -- Alan Cox -- Merged the 2.0.34 fixes for resource allocation
+ * failures.
+ */
+
+/*
+ * 1998/09/20 -- David Weinehall -- Added slow-down code for buggy PS/2-drives.
+ */
+
+/*
+ * 1999/08/13 -- Paul Slootman -- floppy stopped working on Alpha after 24
+ * days, 6 hours, 32 minutes and 32 seconds (i.e. MAXINT jiffies; ints were
+ * being used to store jiffies, which are unsigned longs).
+ */
+
+/*
+ * 2000/08/28 -- Arnaldo Carvalho de Melo <acme@conectiva.com.br>
+ * - get rid of check_region
+ * - s/suser/capable/
+ */
+
+/*
+ * 2001/08/26 -- Paul Gortmaker - fix insmod oops on machines with no
+ * floppy controller (lingering task on list after module is gone... boom.)
+ */
+
+/*
+ * 2002/02/07 -- Anton Altaparmakov - Fix io ports reservation to correct range
+ * (0x3f2-0x3f5, 0x3f7). This fix is a bit of a hack but the proper fix
+ * requires many non-obvious changes in arch dependent code.
+ */
+
+/* 2003/07/28 -- Daniele Bellucci <bellucda@tiscali.it>.
+ * Better audit of register_blkdev.
+ */
+
+#define REALLY_SLOW_IO
+
+#define DEBUGT 2
+
+#define DPRINT(format, args...) \
+	pr_info("floppy%d: " format, current_drive, ##args)
+
+#define DCL_DEBUG		/* debug disk change line */
+#ifdef DCL_DEBUG
+#define debug_dcl(test, fmt, args...) \
+	do { if ((test) & FD_DEBUG) DPRINT(fmt, ##args); } while (0)
+#else
+#define debug_dcl(test, fmt, args...) \
+	do { if (0) DPRINT(fmt, ##args); } while (0)
+#endif
+
+/* do print messages for unexpected interrupts */
+static int print_unex = 1;
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/timer.h>
+#include <linux/workqueue.h>
+#include <linux/fdreg.h>
+#include <linux/fd.h>
+#include <linux/hdreg.h>
+#include <linux/errno.h>
+#include <linux/slab.h>
+#include <linux/mm.h>
+#include <linux/bio.h>
+#include <linux/string.h>
+#include <linux/jiffies.h>
+#include <linux/fcntl.h>
+#include <linux/delay.h>
+#include <linux/mc146818rtc.h>	/* CMOS defines */
+#include <linux/ioport.h>
+#include <linux/interrupt.h>
+#include <linux/init.h>
+#include <linux/major.h>
+#include <linux/platform_device.h>
+#include <linux/mod_devicetable.h>
+#include <linux/mutex.h>
+#include <linux/io.h>
+#include <linux/uaccess.h>
+#include <linux/async.h>
+#include <linux/compat.h>
+
+/*
+ * PS/2 floppies have much slower step rates than regular floppies.
+ * It's been recommended that take about 1/4 of the default speed
+ * in some more extreme cases.
+ */
+static DEFINE_MUTEX(floppy_mutex);
+static int slow_floppy;
+
+#include <asm/dma.h>
+#include <asm/irq.h>
+
+static int FLOPPY_IRQ = 6;
+static int FLOPPY_DMA = 2;
+static int can_use_virtual_dma = 2;
+/* =======
+ * can use virtual DMA:
+ * 0 = use of virtual DMA disallowed by config
+ * 1 = use of virtual DMA prescribed by config
+ * 2 = no virtual DMA preference configured.  By default try hard DMA,
+ * but fall back on virtual DMA when not enough memory available
+ */
+
+static int use_virtual_dma;
+/* =======
+ * use virtual DMA
+ * 0 using hard DMA
+ * 1 using virtual DMA
+ * This variable is set to virtual when a DMA mem problem arises, and
+ * reset back in floppy_grab_irq_and_dma.
+ * It is not safe to reset it in other circumstances, because the floppy
+ * driver may have several buffers in use at once, and we do currently not
+ * record each buffers capabilities
+ */
+
+static DEFINE_SPINLOCK(floppy_lock);
+
+static unsigned short virtual_dma_port = 0x3f0;
+irqreturn_t floppy_interrupt(int irq, void *dev_id);
+static int set_dor(int fdc, char mask, char data);
+
+#define K_64	0x10000		/* 64KB */
+
+/* the following is the mask of allowed drives. By default units 2 and
+ * 3 of both floppy controllers are disabled, because switching on the
+ * motor of these drives causes system hangs on some PCI computers. drive
+ * 0 is the low bit (0x1), and drive 7 is the high bit (0x80). Bits are on if
+ * a drive is allowed.
+ *
+ * NOTE: This must come before we include the arch floppy header because
+ *       some ports reference this variable from there. -DaveM
+ */
+
+static int allowed_drive_mask = 0x33;
+
+#include <asm/floppy.h>
+
+static int irqdma_allocated;
+
+#include <linux/blk-mq.h>
+#include <linux/blkpg.h>
+#include <linux/cdrom.h>	/* for the compatibility eject ioctl */
+#include <linux/completion.h>
+
+static LIST_HEAD(floppy_reqs);
+static struct request *current_req;
+static int set_next_request(void);
+
+#ifndef fd_get_dma_residue
+#define fd_get_dma_residue() get_dma_residue(FLOPPY_DMA)
+#endif
+
+/* Dma Memory related stuff */
+
+#ifndef fd_dma_mem_free
+#define fd_dma_mem_free(addr, size) free_pages(addr, get_order(size))
+#endif
+
+#ifndef fd_dma_mem_alloc
+#define fd_dma_mem_alloc(size) __get_dma_pages(GFP_KERNEL, get_order(size))
+#endif
+
+#ifndef fd_cacheflush
+#define fd_cacheflush(addr, size) /* nothing... */
+#endif
+
+static inline void fallback_on_nodma_alloc(char **addr, size_t l)
+{
+#ifdef FLOPPY_CAN_FALLBACK_ON_NODMA
+	if (*addr)
+		return;		/* we have the memory */
+	if (can_use_virtual_dma != 2)
+		return;		/* no fallback allowed */
+	pr_info("DMA memory shortage. Temporarily falling back on virtual DMA\n");
+	*addr = (char *)nodma_mem_alloc(l);
+#else
+	return;
+#endif
+}
+
+/* End dma memory related stuff */
+
+static unsigned long fake_change;
+static bool initialized;
+
+#define ITYPE(x)	(((x) >> 2) & 0x1f)
+#define TOMINOR(x)	((x & 3) | ((x & 4) << 5))
+#define UNIT(x)		((x) & 0x03)		/* drive on fdc */
+#define FDC(x)		(((x) & 0x04) >> 2)	/* fdc of drive */
+	/* reverse mapping from unit and fdc to drive */
+#define REVDRIVE(fdc, unit) ((unit) + ((fdc) << 2))
+
+#define PH_HEAD(floppy, head) (((((floppy)->stretch & 2) >> 1) ^ head) << 2)
+#define STRETCH(floppy)	((floppy)->stretch & FD_STRETCH)
+
+/* read/write commands */
+#define COMMAND			0
+#define DR_SELECT		1
+#define TRACK			2
+#define HEAD			3
+#define SECTOR			4
+#define SIZECODE		5
+#define SECT_PER_TRACK		6
+#define GAP			7
+#define SIZECODE2		8
+#define NR_RW 9
+
+/* format commands */
+#define F_SIZECODE		2
+#define F_SECT_PER_TRACK	3
+#define F_GAP			4
+#define F_FILL			5
+#define NR_F 6
+
+/*
+ * Maximum disk size (in kilobytes).
+ * This default is used whenever the current disk size is unknown.
+ * [Now it is rather a minimum]
+ */
+#define MAX_DISK_SIZE 4		/* 3984 */
+
+/*
+ * globals used by 'result()'
+ */
+static unsigned char reply_buffer[FD_RAW_REPLY_SIZE];
+static int inr;		/* size of reply buffer, when called from interrupt */
+#define ST0		0
+#define ST1		1
+#define ST2		2
+#define ST3		0	/* result of GETSTATUS */
+#define R_TRACK		3
+#define R_HEAD		4
+#define R_SECTOR	5
+#define R_SIZECODE	6
+
+#define SEL_DLY		(2 * HZ / 100)
+
+/*
+ * this struct defines the different floppy drive types.
+ */
+static struct {
+	struct floppy_drive_params params;
+	const char *name;	/* name printed while booting */
+} default_drive_params[] = {
+/* NOTE: the time values in jiffies should be in msec!
+ CMOS drive type
+  |     Maximum data rate supported by drive type
+  |     |   Head load time, msec
+  |     |   |   Head unload time, msec (not used)
+  |     |   |   |     Step rate interval, usec
+  |     |   |   |     |       Time needed for spinup time (jiffies)
+  |     |   |   |     |       |      Timeout for spinning down (jiffies)
+  |     |   |   |     |       |      |   Spindown offset (where disk stops)
+  |     |   |   |     |       |      |   |     Select delay
+  |     |   |   |     |       |      |   |     |     RPS
+  |     |   |   |     |       |      |   |     |     |    Max number of tracks
+  |     |   |   |     |       |      |   |     |     |    |     Interrupt timeout
+  |     |   |   |     |       |      |   |     |     |    |     |   Max nonintlv. sectors
+  |     |   |   |     |       |      |   |     |     |    |     |   | -Max Errors- flags */
+{{0,  500, 16, 16, 8000,    1*HZ, 3*HZ,  0, SEL_DLY, 5,  80, 3*HZ, 20, {3,1,2,0,2}, 0,
+      0, { 7, 4, 8, 2, 1, 5, 3,10}, 3*HZ/2, 0 }, "unknown" },
+
+{{1,  300, 16, 16, 8000,    1*HZ, 3*HZ,  0, SEL_DLY, 5,  40, 3*HZ, 17, {3,1,2,0,2}, 0,
+      0, { 1, 0, 0, 0, 0, 0, 0, 0}, 3*HZ/2, 1 }, "360K PC" }, /*5 1/4 360 KB PC*/
+
+{{2,  500, 16, 16, 6000, 4*HZ/10, 3*HZ, 14, SEL_DLY, 6,  83, 3*HZ, 17, {3,1,2,0,2}, 0,
+      0, { 2, 5, 6,23,10,20,12, 0}, 3*HZ/2, 2 }, "1.2M" }, /*5 1/4 HD AT*/
+
+{{3,  250, 16, 16, 3000,    1*HZ, 3*HZ,  0, SEL_DLY, 5,  83, 3*HZ, 20, {3,1,2,0,2}, 0,
+      0, { 4,22,21,30, 3, 0, 0, 0}, 3*HZ/2, 4 }, "720k" }, /*3 1/2 DD*/
+
+{{4,  500, 16, 16, 4000, 4*HZ/10, 3*HZ, 10, SEL_DLY, 5,  83, 3*HZ, 20, {3,1,2,0,2}, 0,
+      0, { 7, 4,25,22,31,21,29,11}, 3*HZ/2, 7 }, "1.44M" }, /*3 1/2 HD*/
+
+{{5, 1000, 15,  8, 3000, 4*HZ/10, 3*HZ, 10, SEL_DLY, 5,  83, 3*HZ, 40, {3,1,2,0,2}, 0,
+      0, { 7, 8, 4,25,28,22,31,21}, 3*HZ/2, 8 }, "2.88M AMI BIOS" }, /*3 1/2 ED*/
+
+{{6, 1000, 15,  8, 3000, 4*HZ/10, 3*HZ, 10, SEL_DLY, 5,  83, 3*HZ, 40, {3,1,2,0,2}, 0,
+      0, { 7, 8, 4,25,28,22,31,21}, 3*HZ/2, 8 }, "2.88M" } /*3 1/2 ED*/
+/*    |  --autodetected formats---    |      |      |
+ *    read_track                      |      |    Name printed when booting
+ *				      |     Native format
+ *	            Frequency of disk change checks */
+};
+
+static struct floppy_drive_params drive_params[N_DRIVE];
+static struct floppy_drive_struct drive_state[N_DRIVE];
+static struct floppy_write_errors write_errors[N_DRIVE];
+static struct timer_list motor_off_timer[N_DRIVE];
+static struct blk_mq_tag_set tag_sets[N_DRIVE];
+static struct block_device *opened_bdev[N_DRIVE];
+static DEFINE_MUTEX(open_lock);
+static struct floppy_raw_cmd *raw_cmd, default_raw_cmd;
+
+/*
+ * This struct defines the different floppy types.
+ *
+ * Bit 0 of 'stretch' tells if the tracks need to be doubled for some
+ * types (e.g. 360kB diskette in 1.2MB drive, etc.).  Bit 1 of 'stretch'
+ * tells if the disk is in Commodore 1581 format, which means side 0 sectors
+ * are located on side 1 of the disk but with a side 0 ID, and vice-versa.
+ * This is the same as the Sharp MZ-80 5.25" CP/M disk format, except that the
+ * 1581's logical side 0 is on physical side 1, whereas the Sharp's logical
+ * side 0 is on physical side 0 (but with the misnamed sector IDs).
+ * 'stretch' should probably be renamed to something more general, like
+ * 'options'.
+ *
+ * Bits 2 through 9 of 'stretch' tell the number of the first sector.
+ * The LSB (bit 2) is flipped. For most disks, the first sector
+ * is 1 (represented by 0x00<<2).  For some CP/M and music sampler
+ * disks (such as Ensoniq EPS 16plus) it is 0 (represented as 0x01<<2).
+ * For Amstrad CPC disks it is 0xC1 (represented as 0xC0<<2).
+ *
+ * Other parameters should be self-explanatory (see also setfdprm(8)).
+ */
+/*
+	    Size
+	     |  Sectors per track
+	     |  | Head
+	     |  | |  Tracks
+	     |  | |  | Stretch
+	     |  | |  | |  Gap 1 size
+	     |  | |  | |    |  Data rate, | 0x40 for perp
+	     |  | |  | |    |    |  Spec1 (stepping rate, head unload
+	     |  | |  | |    |    |    |    /fmt gap (gap2) */
+static struct floppy_struct floppy_type[32] = {
+	{    0, 0,0, 0,0,0x00,0x00,0x00,0x00,NULL    },	/*  0 no testing    */
+	{  720, 9,2,40,0,0x2A,0x02,0xDF,0x50,"d360"  }, /*  1 360KB PC      */
+	{ 2400,15,2,80,0,0x1B,0x00,0xDF,0x54,"h1200" },	/*  2 1.2MB AT      */
+	{  720, 9,1,80,0,0x2A,0x02,0xDF,0x50,"D360"  },	/*  3 360KB SS 3.5" */
+	{ 1440, 9,2,80,0,0x2A,0x02,0xDF,0x50,"D720"  },	/*  4 720KB 3.5"    */
+	{  720, 9,2,40,1,0x23,0x01,0xDF,0x50,"h360"  },	/*  5 360KB AT      */
+	{ 1440, 9,2,80,0,0x23,0x01,0xDF,0x50,"h720"  },	/*  6 720KB AT      */
+	{ 2880,18,2,80,0,0x1B,0x00,0xCF,0x6C,"H1440" },	/*  7 1.44MB 3.5"   */
+	{ 5760,36,2,80,0,0x1B,0x43,0xAF,0x54,"E2880" },	/*  8 2.88MB 3.5"   */
+	{ 6240,39,2,80,0,0x1B,0x43,0xAF,0x28,"E3120" },	/*  9 3.12MB 3.5"   */
+
+	{ 2880,18,2,80,0,0x25,0x00,0xDF,0x02,"h1440" }, /* 10 1.44MB 5.25"  */
+	{ 3360,21,2,80,0,0x1C,0x00,0xCF,0x0C,"H1680" }, /* 11 1.68MB 3.5"   */
+	{  820,10,2,41,1,0x25,0x01,0xDF,0x2E,"h410"  },	/* 12 410KB 5.25"   */
+	{ 1640,10,2,82,0,0x25,0x02,0xDF,0x2E,"H820"  },	/* 13 820KB 3.5"    */
+	{ 2952,18,2,82,0,0x25,0x00,0xDF,0x02,"h1476" },	/* 14 1.48MB 5.25"  */
+	{ 3444,21,2,82,0,0x25,0x00,0xDF,0x0C,"H1722" },	/* 15 1.72MB 3.5"   */
+	{  840,10,2,42,1,0x25,0x01,0xDF,0x2E,"h420"  },	/* 16 420KB 5.25"   */
+	{ 1660,10,2,83,0,0x25,0x02,0xDF,0x2E,"H830"  },	/* 17 830KB 3.5"    */
+	{ 2988,18,2,83,0,0x25,0x00,0xDF,0x02,"h1494" },	/* 18 1.49MB 5.25"  */
+	{ 3486,21,2,83,0,0x25,0x00,0xDF,0x0C,"H1743" }, /* 19 1.74 MB 3.5"  */
+
+	{ 1760,11,2,80,0,0x1C,0x09,0xCF,0x00,"h880"  }, /* 20 880KB 5.25"   */
+	{ 2080,13,2,80,0,0x1C,0x01,0xCF,0x00,"D1040" }, /* 21 1.04MB 3.5"   */
+	{ 2240,14,2,80,0,0x1C,0x19,0xCF,0x00,"D1120" }, /* 22 1.12MB 3.5"   */
+	{ 3200,20,2,80,0,0x1C,0x20,0xCF,0x2C,"h1600" }, /* 23 1.6MB 5.25"   */
+	{ 3520,22,2,80,0,0x1C,0x08,0xCF,0x2e,"H1760" }, /* 24 1.76MB 3.5"   */
+	{ 3840,24,2,80,0,0x1C,0x20,0xCF,0x00,"H1920" }, /* 25 1.92MB 3.5"   */
+	{ 6400,40,2,80,0,0x25,0x5B,0xCF,0x00,"E3200" }, /* 26 3.20MB 3.5"   */
+	{ 7040,44,2,80,0,0x25,0x5B,0xCF,0x00,"E3520" }, /* 27 3.52MB 3.5"   */
+	{ 7680,48,2,80,0,0x25,0x63,0xCF,0x00,"E3840" }, /* 28 3.84MB 3.5"   */
+	{ 3680,23,2,80,0,0x1C,0x10,0xCF,0x00,"H1840" }, /* 29 1.84MB 3.5"   */
+
+	{ 1600,10,2,80,0,0x25,0x02,0xDF,0x2E,"D800"  },	/* 30 800KB 3.5"    */
+	{ 3200,20,2,80,0,0x1C,0x00,0xCF,0x2C,"H1600" }, /* 31 1.6MB 3.5"    */
+};
+
+static struct gendisk *disks[N_DRIVE][ARRAY_SIZE(floppy_type)];
+
+#define SECTSIZE (_FD_SECTSIZE(*floppy))
+
+/* Auto-detection: Disk type used until the next media change occurs. */
+static struct floppy_struct *current_type[N_DRIVE];
+
+/*
+ * User-provided type information. current_type points to
+ * the respective entry of this array.
+ */
+static struct floppy_struct user_params[N_DRIVE];
+
+static sector_t floppy_sizes[256];
+
+static char floppy_device_name[] = "floppy";
+
+/*
+ * The driver is trying to determine the correct media format
+ * while probing is set. rw_interrupt() clears it after a
+ * successful access.
+ */
+static int probing;
+
+/* Synchronization of FDC access. */
+#define FD_COMMAND_NONE		-1
+#define FD_COMMAND_ERROR	2
+#define FD_COMMAND_OKAY		3
+
+static volatile int command_status = FD_COMMAND_NONE;
+static unsigned long fdc_busy;
+static DECLARE_WAIT_QUEUE_HEAD(fdc_wait);
+static DECLARE_WAIT_QUEUE_HEAD(command_done);
+
+/* errors encountered on the current (or last) request */
+static int floppy_errors;
+
+/* Format request descriptor. */
+static struct format_descr format_req;
+
+/*
+ * Rate is 0 for 500kb/s, 1 for 300kbps, 2 for 250kbps
+ * Spec1 is 0xSH, where S is stepping rate (F=1ms, E=2ms, D=3ms etc),
+ * H is head unload time (1=16ms, 2=32ms, etc)
+ */
+
+/*
+ * Track buffer
+ * Because these are written to by the DMA controller, they must
+ * not contain a 64k byte boundary crossing, or data will be
+ * corrupted/lost.
+ */
+static char *floppy_track_buffer;
+static int max_buffer_sectors;
+
+typedef void (*done_f)(int);
+static const struct cont_t {
+	void (*interrupt)(void);
+				/* this is called after the interrupt of the
+				 * main command */
+	void (*redo)(void);	/* this is called to retry the operation */
+	void (*error)(void);	/* this is called to tally an error */
+	done_f done;		/* this is called to say if the operation has
+				 * succeeded/failed */
+} *cont;
+
+static void floppy_ready(void);
+static void floppy_start(void);
+static void process_fd_request(void);
+static void recalibrate_floppy(void);
+static void floppy_shutdown(struct work_struct *);
+
+static int floppy_request_regions(int);
+static void floppy_release_regions(int);
+static int floppy_grab_irq_and_dma(void);
+static void floppy_release_irq_and_dma(void);
+
+/*
+ * The "reset" variable should be tested whenever an interrupt is scheduled,
+ * after the commands have been sent. This is to ensure that the driver doesn't
+ * get wedged when the interrupt doesn't come because of a failed command.
+ * reset doesn't need to be tested before sending commands, because
+ * output_byte is automatically disabled when reset is set.
+ */
+static void reset_fdc(void);
+static int floppy_revalidate(struct gendisk *disk);
+
+/*
+ * These are global variables, as that's the easiest way to give
+ * information to interrupts. They are the data used for the current
+ * request.
+ */
+#define NO_TRACK	-1
+#define NEED_1_RECAL	-2
+#define NEED_2_RECAL	-3
+
+static atomic_t usage_count = ATOMIC_INIT(0);
+
+/* buffer related variables */
+static int buffer_track = -1;
+static int buffer_drive = -1;
+static int buffer_min = -1;
+static int buffer_max = -1;
+
+/* fdc related variables, should end up in a struct */
+static struct floppy_fdc_state fdc_state[N_FDC];
+static int current_fdc;			/* current fdc */
+
+static struct workqueue_struct *floppy_wq;
+
+static struct floppy_struct *_floppy = floppy_type;
+static unsigned char current_drive;
+static long current_count_sectors;
+static unsigned char fsector_t;	/* sector in track */
+static unsigned char in_sector_offset;	/* offset within physical sector,
+					 * expressed in units of 512 bytes */
+
+static inline unsigned char fdc_inb(int fdc, int reg)
+{
+	return fd_inb(fdc_state[fdc].address, reg);
+}
+
+static inline void fdc_outb(unsigned char value, int fdc, int reg)
+{
+	fd_outb(value, fdc_state[fdc].address, reg);
+}
+
+static inline bool drive_no_geom(int drive)
+{
+	return !current_type[drive] && !ITYPE(drive_state[drive].fd_device);
+}
+
+#ifndef fd_eject
+static inline int fd_eject(int drive)
+{
+	return -EINVAL;
+}
+#endif
+
+/*
+ * Debugging
+ * =========
+ */
+#ifdef DEBUGT
+static long unsigned debugtimer;
+
+static inline void set_debugt(void)
+{
+	debugtimer = jiffies;
+}
+
+static inline void debugt(const char *func, const char *msg)
+{
+	if (drive_params[current_drive].flags & DEBUGT)
+		pr_info("%s:%s dtime=%lu\n", func, msg, jiffies - debugtimer);
+}
+#else
+static inline void set_debugt(void) { }
+static inline void debugt(const char *func, const char *msg) { }
+#endif /* DEBUGT */
+
+
+static DECLARE_DELAYED_WORK(fd_timeout, floppy_shutdown);
+static const char *timeout_message;
+
+static void is_alive(const char *func, const char *message)
+{
+	/* this routine checks whether the floppy driver is "alive" */
+	if (test_bit(0, &fdc_busy) && command_status < 2 &&
+	    !delayed_work_pending(&fd_timeout)) {
+		DPRINT("%s: timeout handler died.  %s\n", func, message);
+	}
+}
+
+static void (*do_floppy)(void) = NULL;
+
+#define OLOGSIZE 20
+
+static void (*lasthandler)(void);
+static unsigned long interruptjiffies;
+static unsigned long resultjiffies;
+static int resultsize;
+static unsigned long lastredo;
+
+static struct output_log {
+	unsigned char data;
+	unsigned char status;
+	unsigned long jiffies;
+} output_log[OLOGSIZE];
+
+static int output_log_pos;
+
+#define MAXTIMEOUT -2
+
+static void __reschedule_timeout(int drive, const char *message)
+{
+	unsigned long delay;
+
+	if (drive < 0 || drive >= N_DRIVE) {
+		delay = 20UL * HZ;
+		drive = 0;
+	} else
+		delay = drive_params[drive].timeout;
+
+	mod_delayed_work(floppy_wq, &fd_timeout, delay);
+	if (drive_params[drive].flags & FD_DEBUG)
+		DPRINT("reschedule timeout %s\n", message);
+	timeout_message = message;
+}
+
+static void reschedule_timeout(int drive, const char *message)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&floppy_lock, flags);
+	__reschedule_timeout(drive, message);
+	spin_unlock_irqrestore(&floppy_lock, flags);
+}
+
+#define INFBOUND(a, b) (a) = max_t(int, a, b)
+#define SUPBOUND(a, b) (a) = min_t(int, a, b)
+
+/*
+ * Bottom half floppy driver.
+ * ==========================
+ *
+ * This part of the file contains the code talking directly to the hardware,
+ * and also the main service loop (seek-configure-spinup-command)
+ */
+
+/*
+ * disk change.
+ * This routine is responsible for maintaining the FD_DISK_CHANGE flag,
+ * and the last_checked date.
+ *
+ * last_checked is the date of the last check which showed 'no disk change'
+ * FD_DISK_CHANGE is set under two conditions:
+ * 1. The floppy has been changed after some i/o to that floppy already
+ *    took place.
+ * 2. No floppy disk is in the drive. This is done in order to ensure that
+ *    requests are quickly flushed in case there is no disk in the drive. It
+ *    follows that FD_DISK_CHANGE can only be cleared if there is a disk in
+ *    the drive.
+ *
+ * For 1., maxblock is observed. Maxblock is 0 if no i/o has taken place yet.
+ * For 2., FD_DISK_NEWCHANGE is watched. FD_DISK_NEWCHANGE is cleared on
+ *  each seek. If a disk is present, the disk change line should also be
+ *  cleared on each seek. Thus, if FD_DISK_NEWCHANGE is clear, but the disk
+ *  change line is set, this means either that no disk is in the drive, or
+ *  that it has been removed since the last seek.
+ *
+ * This means that we really have a third possibility too:
+ *  The floppy has been changed after the last seek.
+ */
+
+static int disk_change(int drive)
+{
+	int fdc = FDC(drive);
+
+	if (time_before(jiffies, drive_state[drive].select_date + drive_params[drive].select_delay))
+		DPRINT("WARNING disk change called early\n");
+	if (!(fdc_state[fdc].dor & (0x10 << UNIT(drive))) ||
+	    (fdc_state[fdc].dor & 3) != UNIT(drive) || fdc != FDC(drive)) {
+		DPRINT("probing disk change on unselected drive\n");
+		DPRINT("drive=%d fdc=%d dor=%x\n", drive, FDC(drive),
+		       (unsigned int)fdc_state[fdc].dor);
+	}
+
+	debug_dcl(drive_params[drive].flags,
+		  "checking disk change line for drive %d\n", drive);
+	debug_dcl(drive_params[drive].flags, "jiffies=%lu\n", jiffies);
+	debug_dcl(drive_params[drive].flags, "disk change line=%x\n",
+		  fdc_inb(fdc, FD_DIR) & 0x80);
+	debug_dcl(drive_params[drive].flags, "flags=%lx\n",
+		  drive_state[drive].flags);
+
+	if (drive_params[drive].flags & FD_BROKEN_DCL)
+		return test_bit(FD_DISK_CHANGED_BIT,
+				&drive_state[drive].flags);
+	if ((fdc_inb(fdc, FD_DIR) ^ drive_params[drive].flags) & 0x80) {
+		set_bit(FD_VERIFY_BIT, &drive_state[drive].flags);
+					/* verify write protection */
+
+		if (drive_state[drive].maxblock)	/* mark it changed */
+			set_bit(FD_DISK_CHANGED_BIT,
+				&drive_state[drive].flags);
+
+		/* invalidate its geometry */
+		if (drive_state[drive].keep_data >= 0) {
+			if ((drive_params[drive].flags & FTD_MSG) &&
+			    current_type[drive] != NULL)
+				DPRINT("Disk type is undefined after disk change\n");
+			current_type[drive] = NULL;
+			floppy_sizes[TOMINOR(drive)] = MAX_DISK_SIZE << 1;
+		}
+
+		return 1;
+	} else {
+		drive_state[drive].last_checked = jiffies;
+		clear_bit(FD_DISK_NEWCHANGE_BIT, &drive_state[drive].flags);
+	}
+	return 0;
+}
+
+static inline int is_selected(int dor, int unit)
+{
+	return ((dor & (0x10 << unit)) && (dor & 3) == unit);
+}
+
+static bool is_ready_state(int status)
+{
+	int state = status & (STATUS_READY | STATUS_DIR | STATUS_DMA);
+	return state == STATUS_READY;
+}
+
+static int set_dor(int fdc, char mask, char data)
+{
+	unsigned char unit;
+	unsigned char drive;
+	unsigned char newdor;
+	unsigned char olddor;
+
+	if (fdc_state[fdc].address == -1)
+		return -1;
+
+	olddor = fdc_state[fdc].dor;
+	newdor = (olddor & mask) | data;
+	if (newdor != olddor) {
+		unit = olddor & 0x3;
+		if (is_selected(olddor, unit) && !is_selected(newdor, unit)) {
+			drive = REVDRIVE(fdc, unit);
+			debug_dcl(drive_params[drive].flags,
+				  "calling disk change from set_dor\n");
+			disk_change(drive);
+		}
+		fdc_state[fdc].dor = newdor;
+		fdc_outb(newdor, fdc, FD_DOR);
+
+		unit = newdor & 0x3;
+		if (!is_selected(olddor, unit) && is_selected(newdor, unit)) {
+			drive = REVDRIVE(fdc, unit);
+			drive_state[drive].select_date = jiffies;
+		}
+	}
+	return olddor;
+}
+
+static void twaddle(int fdc, int drive)
+{
+	if (drive_params[drive].select_delay)
+		return;
+	fdc_outb(fdc_state[fdc].dor & ~(0x10 << UNIT(drive)),
+		 fdc, FD_DOR);
+	fdc_outb(fdc_state[fdc].dor, fdc, FD_DOR);
+	drive_state[drive].select_date = jiffies;
+}
+
+/*
+ * Reset all driver information about the specified fdc.
+ * This is needed after a reset, and after a raw command.
+ */
+static void reset_fdc_info(int fdc, int mode)
+{
+	int drive;
+
+	fdc_state[fdc].spec1 = fdc_state[fdc].spec2 = -1;
+	fdc_state[fdc].need_configure = 1;
+	fdc_state[fdc].perp_mode = 1;
+	fdc_state[fdc].rawcmd = 0;
+	for (drive = 0; drive < N_DRIVE; drive++)
+		if (FDC(drive) == fdc &&
+		    (mode || drive_state[drive].track != NEED_1_RECAL))
+			drive_state[drive].track = NEED_2_RECAL;
+}
+
+/*
+ * selects the fdc and drive, and enables the fdc's input/dma.
+ * Both current_drive and current_fdc are changed to match the new drive.
+ */
+static void set_fdc(int drive)
+{
+	unsigned int fdc;
+
+	if (drive < 0 || drive >= N_DRIVE) {
+		pr_info("bad drive value %d\n", drive);
+		return;
+	}
+
+	fdc = FDC(drive);
+	if (fdc >= N_FDC) {
+		pr_info("bad fdc value\n");
+		return;
+	}
+
+	set_dor(fdc, ~0, 8);
+#if N_FDC > 1
+	set_dor(1 - fdc, ~8, 0);
+#endif
+	if (fdc_state[fdc].rawcmd == 2)
+		reset_fdc_info(fdc, 1);
+	if (fdc_inb(fdc, FD_STATUS) != STATUS_READY)
+		fdc_state[fdc].reset = 1;
+
+	current_drive = drive;
+	current_fdc = fdc;
+}
+
+/*
+ * locks the driver.
+ * Both current_drive and current_fdc are changed to match the new drive.
+ */
+static int lock_fdc(int drive)
+{
+	if (WARN(atomic_read(&usage_count) == 0,
+		 "Trying to lock fdc while usage count=0\n"))
+		return -1;
+
+	if (wait_event_interruptible(fdc_wait, !test_and_set_bit(0, &fdc_busy)))
+		return -EINTR;
+
+	command_status = FD_COMMAND_NONE;
+
+	reschedule_timeout(drive, "lock fdc");
+	set_fdc(drive);
+	return 0;
+}
+
+/* unlocks the driver */
+static void unlock_fdc(void)
+{
+	if (!test_bit(0, &fdc_busy))
+		DPRINT("FDC access conflict!\n");
+
+	raw_cmd = NULL;
+	command_status = FD_COMMAND_NONE;
+	cancel_delayed_work(&fd_timeout);
+	do_floppy = NULL;
+	cont = NULL;
+	clear_bit(0, &fdc_busy);
+	wake_up(&fdc_wait);
+}
+
+/* switches the motor off after a given timeout */
+static void motor_off_callback(struct timer_list *t)
+{
+	unsigned long nr = t - motor_off_timer;
+	unsigned char mask = ~(0x10 << UNIT(nr));
+
+	if (WARN_ON_ONCE(nr >= N_DRIVE))
+		return;
+
+	set_dor(FDC(nr), mask, 0);
+}
+
+/* schedules motor off */
+static void floppy_off(unsigned int drive)
+{
+	unsigned long volatile delta;
+	int fdc = FDC(drive);
+
+	if (!(fdc_state[fdc].dor & (0x10 << UNIT(drive))))
+		return;
+
+	del_timer(motor_off_timer + drive);
+
+	/* make spindle stop in a position which minimizes spinup time
+	 * next time */
+	if (drive_params[drive].rps) {
+		delta = jiffies - drive_state[drive].first_read_date + HZ -
+		    drive_params[drive].spindown_offset;
+		delta = ((delta * drive_params[drive].rps) % HZ) / drive_params[drive].rps;
+		motor_off_timer[drive].expires =
+		    jiffies + drive_params[drive].spindown - delta;
+	}
+	add_timer(motor_off_timer + drive);
+}
+
+/*
+ * cycle through all N_DRIVE floppy drives, for disk change testing.
+ * stopping at current drive. This is done before any long operation, to
+ * be sure to have up to date disk change information.
+ */
+static void scandrives(void)
+{
+	int i;
+	int drive;
+	int saved_drive;
+
+	if (drive_params[current_drive].select_delay)
+		return;
+
+	saved_drive = current_drive;
+	for (i = 0; i < N_DRIVE; i++) {
+		drive = (saved_drive + i + 1) % N_DRIVE;
+		if (drive_state[drive].fd_ref == 0 || drive_params[drive].select_delay != 0)
+			continue;	/* skip closed drives */
+		set_fdc(drive);
+		if (!(set_dor(current_fdc, ~3, UNIT(drive) | (0x10 << UNIT(drive))) &
+		      (0x10 << UNIT(drive))))
+			/* switch the motor off again, if it was off to
+			 * begin with */
+			set_dor(current_fdc, ~(0x10 << UNIT(drive)), 0);
+	}
+	set_fdc(saved_drive);
+}
+
+static void empty(void)
+{
+}
+
+static void (*floppy_work_fn)(void);
+
+static void floppy_work_workfn(struct work_struct *work)
+{
+	floppy_work_fn();
+}
+
+static DECLARE_WORK(floppy_work, floppy_work_workfn);
+
+static void schedule_bh(void (*handler)(void))
+{
+	WARN_ON(work_pending(&floppy_work));
+
+	floppy_work_fn = handler;
+	queue_work(floppy_wq, &floppy_work);
+}
+
+static void (*fd_timer_fn)(void) = NULL;
+
+static void fd_timer_workfn(struct work_struct *work)
+{
+	fd_timer_fn();
+}
+
+static DECLARE_DELAYED_WORK(fd_timer, fd_timer_workfn);
+
+static void cancel_activity(void)
+{
+	do_floppy = NULL;
+	cancel_delayed_work(&fd_timer);
+	cancel_work_sync(&floppy_work);
+}
+
+/* this function makes sure that the disk stays in the drive during the
+ * transfer */
+static void fd_watchdog(void)
+{
+	debug_dcl(drive_params[current_drive].flags,
+		  "calling disk change from watchdog\n");
+
+	if (disk_change(current_drive)) {
+		DPRINT("disk removed during i/o\n");
+		cancel_activity();
+		cont->done(0);
+		reset_fdc();
+	} else {
+		cancel_delayed_work(&fd_timer);
+		fd_timer_fn = fd_watchdog;
+		queue_delayed_work(floppy_wq, &fd_timer, HZ / 10);
+	}
+}
+
+static void main_command_interrupt(void)
+{
+	cancel_delayed_work(&fd_timer);
+	cont->interrupt();
+}
+
+/* waits for a delay (spinup or select) to pass */
+static int fd_wait_for_completion(unsigned long expires,
+				  void (*function)(void))
+{
+	if (fdc_state[current_fdc].reset) {
+		reset_fdc();	/* do the reset during sleep to win time
+				 * if we don't need to sleep, it's a good
+				 * occasion anyways */
+		return 1;
+	}
+
+	if (time_before(jiffies, expires)) {
+		cancel_delayed_work(&fd_timer);
+		fd_timer_fn = function;
+		queue_delayed_work(floppy_wq, &fd_timer, expires - jiffies);
+		return 1;
+	}
+	return 0;
+}
+
+static void setup_DMA(void)
+{
+	unsigned long f;
+
+	if (raw_cmd->length == 0) {
+		print_hex_dump(KERN_INFO, "zero dma transfer size: ",
+			       DUMP_PREFIX_NONE, 16, 1,
+			       raw_cmd->fullcmd, raw_cmd->cmd_count, false);
+		cont->done(0);
+		fdc_state[current_fdc].reset = 1;
+		return;
+	}
+	if (((unsigned long)raw_cmd->kernel_data) % 512) {
+		pr_info("non aligned address: %p\n", raw_cmd->kernel_data);
+		cont->done(0);
+		fdc_state[current_fdc].reset = 1;
+		return;
+	}
+	f = claim_dma_lock();
+	fd_disable_dma();
+#ifdef fd_dma_setup
+	if (fd_dma_setup(raw_cmd->kernel_data, raw_cmd->length,
+			 (raw_cmd->flags & FD_RAW_READ) ?
+			 DMA_MODE_READ : DMA_MODE_WRITE,
+			 fdc_state[current_fdc].address) < 0) {
+		release_dma_lock(f);
+		cont->done(0);
+		fdc_state[current_fdc].reset = 1;
+		return;
+	}
+	release_dma_lock(f);
+#else
+	fd_clear_dma_ff();
+	fd_cacheflush(raw_cmd->kernel_data, raw_cmd->length);
+	fd_set_dma_mode((raw_cmd->flags & FD_RAW_READ) ?
+			DMA_MODE_READ : DMA_MODE_WRITE);
+	fd_set_dma_addr(raw_cmd->kernel_data);
+	fd_set_dma_count(raw_cmd->length);
+	virtual_dma_port = fdc_state[current_fdc].address;
+	fd_enable_dma();
+	release_dma_lock(f);
+#endif
+}
+
+static void show_floppy(int fdc);
+
+/* waits until the fdc becomes ready */
+static int wait_til_ready(int fdc)
+{
+	int status;
+	int counter;
+
+	if (fdc_state[fdc].reset)
+		return -1;
+	for (counter = 0; counter < 10000; counter++) {
+		status = fdc_inb(fdc, FD_STATUS);
+		if (status & STATUS_READY)
+			return status;
+	}
+	if (initialized) {
+		DPRINT("Getstatus times out (%x) on fdc %d\n", status, fdc);
+		show_floppy(fdc);
+	}
+	fdc_state[fdc].reset = 1;
+	return -1;
+}
+
+/* sends a command byte to the fdc */
+static int output_byte(int fdc, char byte)
+{
+	int status = wait_til_ready(fdc);
+
+	if (status < 0)
+		return -1;
+
+	if (is_ready_state(status)) {
+		fdc_outb(byte, fdc, FD_DATA);
+		output_log[output_log_pos].data = byte;
+		output_log[output_log_pos].status = status;
+		output_log[output_log_pos].jiffies = jiffies;
+		output_log_pos = (output_log_pos + 1) % OLOGSIZE;
+		return 0;
+	}
+	fdc_state[fdc].reset = 1;
+	if (initialized) {
+		DPRINT("Unable to send byte %x to FDC. Fdc=%x Status=%x\n",
+		       byte, fdc, status);
+		show_floppy(fdc);
+	}
+	return -1;
+}
+
+/* gets the response from the fdc */
+static int result(int fdc)
+{
+	int i;
+	int status = 0;
+
+	for (i = 0; i < FD_RAW_REPLY_SIZE; i++) {
+		status = wait_til_ready(fdc);
+		if (status < 0)
+			break;
+		status &= STATUS_DIR | STATUS_READY | STATUS_BUSY | STATUS_DMA;
+		if ((status & ~STATUS_BUSY) == STATUS_READY) {
+			resultjiffies = jiffies;
+			resultsize = i;
+			return i;
+		}
+		if (status == (STATUS_DIR | STATUS_READY | STATUS_BUSY))
+			reply_buffer[i] = fdc_inb(fdc, FD_DATA);
+		else
+			break;
+	}
+	if (initialized) {
+		DPRINT("get result error. Fdc=%d Last status=%x Read bytes=%d\n",
+		       fdc, status, i);
+		show_floppy(fdc);
+	}
+	fdc_state[fdc].reset = 1;
+	return -1;
+}
+
+#define MORE_OUTPUT -2
+/* does the fdc need more output? */
+static int need_more_output(int fdc)
+{
+	int status = wait_til_ready(fdc);
+
+	if (status < 0)
+		return -1;
+
+	if (is_ready_state(status))
+		return MORE_OUTPUT;
+
+	return result(fdc);
+}
+
+/* Set perpendicular mode as required, based on data rate, if supported.
+ * 82077 Now tested. 1Mbps data rate only possible with 82077-1.
+ */
+static void perpendicular_mode(int fdc)
+{
+	unsigned char perp_mode;
+
+	if (raw_cmd->rate & 0x40) {
+		switch (raw_cmd->rate & 3) {
+		case 0:
+			perp_mode = 2;
+			break;
+		case 3:
+			perp_mode = 3;
+			break;
+		default:
+			DPRINT("Invalid data rate for perpendicular mode!\n");
+			cont->done(0);
+			fdc_state[fdc].reset = 1;
+					/*
+					 * convenient way to return to
+					 * redo without too much hassle
+					 * (deep stack et al.)
+					 */
+			return;
+		}
+	} else
+		perp_mode = 0;
+
+	if (fdc_state[fdc].perp_mode == perp_mode)
+		return;
+	if (fdc_state[fdc].version >= FDC_82077_ORIG) {
+		output_byte(fdc, FD_PERPENDICULAR);
+		output_byte(fdc, perp_mode);
+		fdc_state[fdc].perp_mode = perp_mode;
+	} else if (perp_mode) {
+		DPRINT("perpendicular mode not supported by this FDC.\n");
+	}
+}				/* perpendicular_mode */
+
+static int fifo_depth = 0xa;
+static int no_fifo;
+
+static int fdc_configure(int fdc)
+{
+	/* Turn on FIFO */
+	output_byte(fdc, FD_CONFIGURE);
+	if (need_more_output(fdc) != MORE_OUTPUT)
+		return 0;
+	output_byte(fdc, 0);
+	output_byte(fdc, 0x10 | (no_fifo & 0x20) | (fifo_depth & 0xf));
+	output_byte(fdc, 0);    /* pre-compensation from track 0 upwards */
+	return 1;
+}
+
+#define NOMINAL_DTR 500
+
+/* Issue a "SPECIFY" command to set the step rate time, head unload time,
+ * head load time, and DMA disable flag to values needed by floppy.
+ *
+ * The value "dtr" is the data transfer rate in Kbps.  It is needed
+ * to account for the data rate-based scaling done by the 82072 and 82077
+ * FDC types.  This parameter is ignored for other types of FDCs (i.e.
+ * 8272a).
+ *
+ * Note that changing the data transfer rate has a (probably deleterious)
+ * effect on the parameters subject to scaling for 82072/82077 FDCs, so
+ * fdc_specify is called again after each data transfer rate
+ * change.
+ *
+ * srt: 1000 to 16000 in microseconds
+ * hut: 16 to 240 milliseconds
+ * hlt: 2 to 254 milliseconds
+ *
+ * These values are rounded up to the next highest available delay time.
+ */
+static void fdc_specify(int fdc, int drive)
+{
+	unsigned char spec1;
+	unsigned char spec2;
+	unsigned long srt;
+	unsigned long hlt;
+	unsigned long hut;
+	unsigned long dtr = NOMINAL_DTR;
+	unsigned long scale_dtr = NOMINAL_DTR;
+	int hlt_max_code = 0x7f;
+	int hut_max_code = 0xf;
+
+	if (fdc_state[fdc].need_configure &&
+	    fdc_state[fdc].version >= FDC_82072A) {
+		fdc_configure(fdc);
+		fdc_state[fdc].need_configure = 0;
+	}
+
+	switch (raw_cmd->rate & 0x03) {
+	case 3:
+		dtr = 1000;
+		break;
+	case 1:
+		dtr = 300;
+		if (fdc_state[fdc].version >= FDC_82078) {
+			/* chose the default rate table, not the one
+			 * where 1 = 2 Mbps */
+			output_byte(fdc, FD_DRIVESPEC);
+			if (need_more_output(fdc) == MORE_OUTPUT) {
+				output_byte(fdc, UNIT(drive));
+				output_byte(fdc, 0xc0);
+			}
+		}
+		break;
+	case 2:
+		dtr = 250;
+		break;
+	}
+
+	if (fdc_state[fdc].version >= FDC_82072) {
+		scale_dtr = dtr;
+		hlt_max_code = 0x00;	/* 0==256msec*dtr0/dtr (not linear!) */
+		hut_max_code = 0x0;	/* 0==256msec*dtr0/dtr (not linear!) */
+	}
+
+	/* Convert step rate from microseconds to milliseconds and 4 bits */
+	srt = 16 - DIV_ROUND_UP(drive_params[drive].srt * scale_dtr / 1000,
+				NOMINAL_DTR);
+	if (slow_floppy)
+		srt = srt / 4;
+
+	SUPBOUND(srt, 0xf);
+	INFBOUND(srt, 0);
+
+	hlt = DIV_ROUND_UP(drive_params[drive].hlt * scale_dtr / 2,
+			   NOMINAL_DTR);
+	if (hlt < 0x01)
+		hlt = 0x01;
+	else if (hlt > 0x7f)
+		hlt = hlt_max_code;
+
+	hut = DIV_ROUND_UP(drive_params[drive].hut * scale_dtr / 16,
+			   NOMINAL_DTR);
+	if (hut < 0x1)
+		hut = 0x1;
+	else if (hut > 0xf)
+		hut = hut_max_code;
+
+	spec1 = (srt << 4) | hut;
+	spec2 = (hlt << 1) | (use_virtual_dma & 1);
+
+	/* If these parameters did not change, just return with success */
+	if (fdc_state[fdc].spec1 != spec1 ||
+	    fdc_state[fdc].spec2 != spec2) {
+		/* Go ahead and set spec1 and spec2 */
+		output_byte(fdc, FD_SPECIFY);
+		output_byte(fdc, fdc_state[fdc].spec1 = spec1);
+		output_byte(fdc, fdc_state[fdc].spec2 = spec2);
+	}
+}				/* fdc_specify */
+
+/* Set the FDC's data transfer rate on behalf of the specified drive.
+ * NOTE: with 82072/82077 FDCs, changing the data rate requires a reissue
+ * of the specify command (i.e. using the fdc_specify function).
+ */
+static int fdc_dtr(void)
+{
+	/* If data rate not already set to desired value, set it. */
+	if ((raw_cmd->rate & 3) == fdc_state[current_fdc].dtr)
+		return 0;
+
+	/* Set dtr */
+	fdc_outb(raw_cmd->rate & 3, current_fdc, FD_DCR);
+
+	/* TODO: some FDC/drive combinations (C&T 82C711 with TEAC 1.2MB)
+	 * need a stabilization period of several milliseconds to be
+	 * enforced after data rate changes before R/W operations.
+	 * Pause 5 msec to avoid trouble. (Needs to be 2 jiffies)
+	 */
+	fdc_state[current_fdc].dtr = raw_cmd->rate & 3;
+	return fd_wait_for_completion(jiffies + 2UL * HZ / 100, floppy_ready);
+}				/* fdc_dtr */
+
+static void tell_sector(void)
+{
+	pr_cont(": track %d, head %d, sector %d, size %d",
+		reply_buffer[R_TRACK], reply_buffer[R_HEAD],
+		reply_buffer[R_SECTOR],
+		reply_buffer[R_SIZECODE]);
+}				/* tell_sector */
+
+static void print_errors(void)
+{
+	DPRINT("");
+	if (reply_buffer[ST0] & ST0_ECE) {
+		pr_cont("Recalibrate failed!");
+	} else if (reply_buffer[ST2] & ST2_CRC) {
+		pr_cont("data CRC error");
+		tell_sector();
+	} else if (reply_buffer[ST1] & ST1_CRC) {
+		pr_cont("CRC error");
+		tell_sector();
+	} else if ((reply_buffer[ST1] & (ST1_MAM | ST1_ND)) ||
+		   (reply_buffer[ST2] & ST2_MAM)) {
+		if (!probing) {
+			pr_cont("sector not found");
+			tell_sector();
+		} else
+			pr_cont("probe failed...");
+	} else if (reply_buffer[ST2] & ST2_WC) {	/* seek error */
+		pr_cont("wrong cylinder");
+	} else if (reply_buffer[ST2] & ST2_BC) {	/* cylinder marked as bad */
+		pr_cont("bad cylinder");
+	} else {
+		pr_cont("unknown error. ST[0..2] are: 0x%x 0x%x 0x%x",
+			reply_buffer[ST0], reply_buffer[ST1],
+			reply_buffer[ST2]);
+		tell_sector();
+	}
+	pr_cont("\n");
+}
+
+/*
+ * OK, this error interpreting routine is called after a
+ * DMA read/write has succeeded
+ * or failed, so we check the results, and copy any buffers.
+ * hhb: Added better error reporting.
+ * ak: Made this into a separate routine.
+ */
+static int interpret_errors(void)
+{
+	char bad;
+
+	if (inr != 7) {
+		DPRINT("-- FDC reply error\n");
+		fdc_state[current_fdc].reset = 1;
+		return 1;
+	}
+
+	/* check IC to find cause of interrupt */
+	switch (reply_buffer[ST0] & ST0_INTR) {
+	case 0x40:		/* error occurred during command execution */
+		if (reply_buffer[ST1] & ST1_EOC)
+			return 0;	/* occurs with pseudo-DMA */
+		bad = 1;
+		if (reply_buffer[ST1] & ST1_WP) {
+			DPRINT("Drive is write protected\n");
+			clear_bit(FD_DISK_WRITABLE_BIT,
+				  &drive_state[current_drive].flags);
+			cont->done(0);
+			bad = 2;
+		} else if (reply_buffer[ST1] & ST1_ND) {
+			set_bit(FD_NEED_TWADDLE_BIT,
+				&drive_state[current_drive].flags);
+		} else if (reply_buffer[ST1] & ST1_OR) {
+			if (drive_params[current_drive].flags & FTD_MSG)
+				DPRINT("Over/Underrun - retrying\n");
+			bad = 0;
+		} else if (floppy_errors >= drive_params[current_drive].max_errors.reporting) {
+			print_errors();
+		}
+		if (reply_buffer[ST2] & ST2_WC || reply_buffer[ST2] & ST2_BC)
+			/* wrong cylinder => recal */
+			drive_state[current_drive].track = NEED_2_RECAL;
+		return bad;
+	case 0x80:		/* invalid command given */
+		DPRINT("Invalid FDC command given!\n");
+		cont->done(0);
+		return 2;
+	case 0xc0:
+		DPRINT("Abnormal termination caused by polling\n");
+		cont->error();
+		return 2;
+	default:		/* (0) Normal command termination */
+		return 0;
+	}
+}
+
+/*
+ * This routine is called when everything should be correctly set up
+ * for the transfer (i.e. floppy motor is on, the correct floppy is
+ * selected, and the head is sitting on the right track).
+ */
+static void setup_rw_floppy(void)
+{
+	int i;
+	int r;
+	int flags;
+	unsigned long ready_date;
+	void (*function)(void);
+
+	flags = raw_cmd->flags;
+	if (flags & (FD_RAW_READ | FD_RAW_WRITE))
+		flags |= FD_RAW_INTR;
+
+	if ((flags & FD_RAW_SPIN) && !(flags & FD_RAW_NO_MOTOR)) {
+		ready_date = drive_state[current_drive].spinup_date + drive_params[current_drive].spinup;
+		/* If spinup will take a long time, rerun scandrives
+		 * again just before spinup completion. Beware that
+		 * after scandrives, we must again wait for selection.
+		 */
+		if (time_after(ready_date, jiffies + drive_params[current_drive].select_delay)) {
+			ready_date -= drive_params[current_drive].select_delay;
+			function = floppy_start;
+		} else
+			function = setup_rw_floppy;
+
+		/* wait until the floppy is spinning fast enough */
+		if (fd_wait_for_completion(ready_date, function))
+			return;
+	}
+	if ((flags & FD_RAW_READ) || (flags & FD_RAW_WRITE))
+		setup_DMA();
+
+	if (flags & FD_RAW_INTR)
+		do_floppy = main_command_interrupt;
+
+	r = 0;
+	for (i = 0; i < raw_cmd->cmd_count; i++)
+		r |= output_byte(current_fdc, raw_cmd->fullcmd[i]);
+
+	debugt(__func__, "rw_command");
+
+	if (r) {
+		cont->error();
+		reset_fdc();
+		return;
+	}
+
+	if (!(flags & FD_RAW_INTR)) {
+		inr = result(current_fdc);
+		cont->interrupt();
+	} else if (flags & FD_RAW_NEED_DISK)
+		fd_watchdog();
+}
+
+static int blind_seek;
+
+/*
+ * This is the routine called after every seek (or recalibrate) interrupt
+ * from the floppy controller.
+ */
+static void seek_interrupt(void)
+{
+	debugt(__func__, "");
+	if (inr != 2 || (reply_buffer[ST0] & 0xF8) != 0x20) {
+		DPRINT("seek failed\n");
+		drive_state[current_drive].track = NEED_2_RECAL;
+		cont->error();
+		cont->redo();
+		return;
+	}
+	if (drive_state[current_drive].track >= 0 &&
+	    drive_state[current_drive].track != reply_buffer[ST1] &&
+	    !blind_seek) {
+		debug_dcl(drive_params[current_drive].flags,
+			  "clearing NEWCHANGE flag because of effective seek\n");
+		debug_dcl(drive_params[current_drive].flags, "jiffies=%lu\n",
+			  jiffies);
+		clear_bit(FD_DISK_NEWCHANGE_BIT,
+			  &drive_state[current_drive].flags);
+					/* effective seek */
+		drive_state[current_drive].select_date = jiffies;
+	}
+	drive_state[current_drive].track = reply_buffer[ST1];
+	floppy_ready();
+}
+
+static void check_wp(int fdc, int drive)
+{
+	if (test_bit(FD_VERIFY_BIT, &drive_state[drive].flags)) {
+					/* check write protection */
+		output_byte(fdc, FD_GETSTATUS);
+		output_byte(fdc, UNIT(drive));
+		if (result(fdc) != 1) {
+			fdc_state[fdc].reset = 1;
+			return;
+		}
+		clear_bit(FD_VERIFY_BIT, &drive_state[drive].flags);
+		clear_bit(FD_NEED_TWADDLE_BIT,
+			  &drive_state[drive].flags);
+		debug_dcl(drive_params[drive].flags,
+			  "checking whether disk is write protected\n");
+		debug_dcl(drive_params[drive].flags, "wp=%x\n",
+			  reply_buffer[ST3] & 0x40);
+		if (!(reply_buffer[ST3] & 0x40))
+			set_bit(FD_DISK_WRITABLE_BIT,
+				&drive_state[drive].flags);
+		else
+			clear_bit(FD_DISK_WRITABLE_BIT,
+				  &drive_state[drive].flags);
+	}
+}
+
+static void seek_floppy(void)
+{
+	int track;
+
+	blind_seek = 0;
+
+	debug_dcl(drive_params[current_drive].flags,
+		  "calling disk change from %s\n", __func__);
+
+	if (!test_bit(FD_DISK_NEWCHANGE_BIT, &drive_state[current_drive].flags) &&
+	    disk_change(current_drive) && (raw_cmd->flags & FD_RAW_NEED_DISK)) {
+		/* the media changed flag should be cleared after the seek.
+		 * If it isn't, this means that there is really no disk in
+		 * the drive.
+		 */
+		set_bit(FD_DISK_CHANGED_BIT,
+			&drive_state[current_drive].flags);
+		cont->done(0);
+		cont->redo();
+		return;
+	}
+	if (drive_state[current_drive].track <= NEED_1_RECAL) {
+		recalibrate_floppy();
+		return;
+	} else if (test_bit(FD_DISK_NEWCHANGE_BIT, &drive_state[current_drive].flags) &&
+		   (raw_cmd->flags & FD_RAW_NEED_DISK) &&
+		   (drive_state[current_drive].track <= NO_TRACK || drive_state[current_drive].track == raw_cmd->track)) {
+		/* we seek to clear the media-changed condition. Does anybody
+		 * know a more elegant way, which works on all drives? */
+		if (raw_cmd->track)
+			track = raw_cmd->track - 1;
+		else {
+			if (drive_params[current_drive].flags & FD_SILENT_DCL_CLEAR) {
+				set_dor(current_fdc, ~(0x10 << UNIT(current_drive)), 0);
+				blind_seek = 1;
+				raw_cmd->flags |= FD_RAW_NEED_SEEK;
+			}
+			track = 1;
+		}
+	} else {
+		check_wp(current_fdc, current_drive);
+		if (raw_cmd->track != drive_state[current_drive].track &&
+		    (raw_cmd->flags & FD_RAW_NEED_SEEK))
+			track = raw_cmd->track;
+		else {
+			setup_rw_floppy();
+			return;
+		}
+	}
+
+	do_floppy = seek_interrupt;
+	output_byte(current_fdc, FD_SEEK);
+	output_byte(current_fdc, UNIT(current_drive));
+	if (output_byte(current_fdc, track) < 0) {
+		reset_fdc();
+		return;
+	}
+	debugt(__func__, "");
+}
+
+static void recal_interrupt(void)
+{
+	debugt(__func__, "");
+	if (inr != 2)
+		fdc_state[current_fdc].reset = 1;
+	else if (reply_buffer[ST0] & ST0_ECE) {
+		switch (drive_state[current_drive].track) {
+		case NEED_1_RECAL:
+			debugt(__func__, "need 1 recal");
+			/* after a second recalibrate, we still haven't
+			 * reached track 0. Probably no drive. Raise an
+			 * error, as failing immediately might upset
+			 * computers possessed by the Devil :-) */
+			cont->error();
+			cont->redo();
+			return;
+		case NEED_2_RECAL:
+			debugt(__func__, "need 2 recal");
+			/* If we already did a recalibrate,
+			 * and we are not at track 0, this
+			 * means we have moved. (The only way
+			 * not to move at recalibration is to
+			 * be already at track 0.) Clear the
+			 * new change flag */
+			debug_dcl(drive_params[current_drive].flags,
+				  "clearing NEWCHANGE flag because of second recalibrate\n");
+
+			clear_bit(FD_DISK_NEWCHANGE_BIT,
+				  &drive_state[current_drive].flags);
+			drive_state[current_drive].select_date = jiffies;
+			fallthrough;
+		default:
+			debugt(__func__, "default");
+			/* Recalibrate moves the head by at
+			 * most 80 steps. If after one
+			 * recalibrate we don't have reached
+			 * track 0, this might mean that we
+			 * started beyond track 80.  Try
+			 * again.  */
+			drive_state[current_drive].track = NEED_1_RECAL;
+			break;
+		}
+	} else
+		drive_state[current_drive].track = reply_buffer[ST1];
+	floppy_ready();
+}
+
+static void print_result(char *message, int inr)
+{
+	int i;
+
+	DPRINT("%s ", message);
+	if (inr >= 0)
+		for (i = 0; i < inr; i++)
+			pr_cont("repl[%d]=%x ", i, reply_buffer[i]);
+	pr_cont("\n");
+}
+
+/* interrupt handler. Note that this can be called externally on the Sparc */
+irqreturn_t floppy_interrupt(int irq, void *dev_id)
+{
+	int do_print;
+	unsigned long f;
+	void (*handler)(void) = do_floppy;
+
+	lasthandler = handler;
+	interruptjiffies = jiffies;
+
+	f = claim_dma_lock();
+	fd_disable_dma();
+	release_dma_lock(f);
+
+	do_floppy = NULL;
+	if (current_fdc >= N_FDC || fdc_state[current_fdc].address == -1) {
+		/* we don't even know which FDC is the culprit */
+		pr_info("DOR0=%x\n", fdc_state[0].dor);
+		pr_info("floppy interrupt on bizarre fdc %d\n", current_fdc);
+		pr_info("handler=%ps\n", handler);
+		is_alive(__func__, "bizarre fdc");
+		return IRQ_NONE;
+	}
+
+	fdc_state[current_fdc].reset = 0;
+	/* We have to clear the reset flag here, because apparently on boxes
+	 * with level triggered interrupts (PS/2, Sparc, ...), it is needed to
+	 * emit SENSEI's to clear the interrupt line. And fdc_state[fdc].reset
+	 * blocks the emission of the SENSEI's.
+	 * It is OK to emit floppy commands because we are in an interrupt
+	 * handler here, and thus we have to fear no interference of other
+	 * activity.
+	 */
+
+	do_print = !handler && print_unex && initialized;
+
+	inr = result(current_fdc);
+	if (do_print)
+		print_result("unexpected interrupt", inr);
+	if (inr == 0) {
+		int max_sensei = 4;
+		do {
+			output_byte(current_fdc, FD_SENSEI);
+			inr = result(current_fdc);
+			if (do_print)
+				print_result("sensei", inr);
+			max_sensei--;
+		} while ((reply_buffer[ST0] & 0x83) != UNIT(current_drive) &&
+			 inr == 2 && max_sensei);
+	}
+	if (!handler) {
+		fdc_state[current_fdc].reset = 1;
+		return IRQ_NONE;
+	}
+	schedule_bh(handler);
+	is_alive(__func__, "normal interrupt end");
+
+	/* FIXME! Was it really for us? */
+	return IRQ_HANDLED;
+}
+
+static void recalibrate_floppy(void)
+{
+	debugt(__func__, "");
+	do_floppy = recal_interrupt;
+	output_byte(current_fdc, FD_RECALIBRATE);
+	if (output_byte(current_fdc, UNIT(current_drive)) < 0)
+		reset_fdc();
+}
+
+/*
+ * Must do 4 FD_SENSEIs after reset because of ``drive polling''.
+ */
+static void reset_interrupt(void)
+{
+	debugt(__func__, "");
+	result(current_fdc);		/* get the status ready for set_fdc */
+	if (fdc_state[current_fdc].reset) {
+		pr_info("reset set in interrupt, calling %ps\n", cont->error);
+		cont->error();	/* a reset just after a reset. BAD! */
+	}
+	cont->redo();
+}
+
+/*
+ * reset is done by pulling bit 2 of DOR low for a while (old FDCs),
+ * or by setting the self clearing bit 7 of STATUS (newer FDCs).
+ * This WILL trigger an interrupt, causing the handlers in the current
+ * cont's ->redo() to be called via reset_interrupt().
+ */
+static void reset_fdc(void)
+{
+	unsigned long flags;
+
+	do_floppy = reset_interrupt;
+	fdc_state[current_fdc].reset = 0;
+	reset_fdc_info(current_fdc, 0);
+
+	/* Pseudo-DMA may intercept 'reset finished' interrupt.  */
+	/* Irrelevant for systems with true DMA (i386).          */
+
+	flags = claim_dma_lock();
+	fd_disable_dma();
+	release_dma_lock(flags);
+
+	if (fdc_state[current_fdc].version >= FDC_82072A)
+		fdc_outb(0x80 | (fdc_state[current_fdc].dtr & 3),
+			 current_fdc, FD_STATUS);
+	else {
+		fdc_outb(fdc_state[current_fdc].dor & ~0x04, current_fdc, FD_DOR);
+		udelay(FD_RESET_DELAY);
+		fdc_outb(fdc_state[current_fdc].dor, current_fdc, FD_DOR);
+	}
+}
+
+static void show_floppy(int fdc)
+{
+	int i;
+
+	pr_info("\n");
+	pr_info("floppy driver state\n");
+	pr_info("-------------------\n");
+	pr_info("now=%lu last interrupt=%lu diff=%lu last called handler=%ps\n",
+		jiffies, interruptjiffies, jiffies - interruptjiffies,
+		lasthandler);
+
+	pr_info("timeout_message=%s\n", timeout_message);
+	pr_info("last output bytes:\n");
+	for (i = 0; i < OLOGSIZE; i++)
+		pr_info("%2x %2x %lu\n",
+			output_log[(i + output_log_pos) % OLOGSIZE].data,
+			output_log[(i + output_log_pos) % OLOGSIZE].status,
+			output_log[(i + output_log_pos) % OLOGSIZE].jiffies);
+	pr_info("last result at %lu\n", resultjiffies);
+	pr_info("last redo_fd_request at %lu\n", lastredo);
+	print_hex_dump(KERN_INFO, "", DUMP_PREFIX_NONE, 16, 1,
+		       reply_buffer, resultsize, true);
+
+	pr_info("status=%x\n", fdc_inb(fdc, FD_STATUS));
+	pr_info("fdc_busy=%lu\n", fdc_busy);
+	if (do_floppy)
+		pr_info("do_floppy=%ps\n", do_floppy);
+	if (work_pending(&floppy_work))
+		pr_info("floppy_work.func=%ps\n", floppy_work.func);
+	if (delayed_work_pending(&fd_timer))
+		pr_info("delayed work.function=%p expires=%ld\n",
+		       fd_timer.work.func,
+		       fd_timer.timer.expires - jiffies);
+	if (delayed_work_pending(&fd_timeout))
+		pr_info("timer_function=%p expires=%ld\n",
+		       fd_timeout.work.func,
+		       fd_timeout.timer.expires - jiffies);
+
+	pr_info("cont=%p\n", cont);
+	pr_info("current_req=%p\n", current_req);
+	pr_info("command_status=%d\n", command_status);
+	pr_info("\n");
+}
+
+static void floppy_shutdown(struct work_struct *arg)
+{
+	unsigned long flags;
+
+	if (initialized)
+		show_floppy(current_fdc);
+	cancel_activity();
+
+	flags = claim_dma_lock();
+	fd_disable_dma();
+	release_dma_lock(flags);
+
+	/* avoid dma going to a random drive after shutdown */
+
+	if (initialized)
+		DPRINT("floppy timeout called\n");
+	fdc_state[current_fdc].reset = 1;
+	if (cont) {
+		cont->done(0);
+		cont->redo();	/* this will recall reset when needed */
+	} else {
+		pr_info("no cont in shutdown!\n");
+		process_fd_request();
+	}
+	is_alive(__func__, "");
+}
+
+/* start motor, check media-changed condition and write protection */
+static int start_motor(void (*function)(void))
+{
+	int mask;
+	int data;
+
+	mask = 0xfc;
+	data = UNIT(current_drive);
+	if (!(raw_cmd->flags & FD_RAW_NO_MOTOR)) {
+		if (!(fdc_state[current_fdc].dor & (0x10 << UNIT(current_drive)))) {
+			set_debugt();
+			/* no read since this drive is running */
+			drive_state[current_drive].first_read_date = 0;
+			/* note motor start time if motor is not yet running */
+			drive_state[current_drive].spinup_date = jiffies;
+			data |= (0x10 << UNIT(current_drive));
+		}
+	} else if (fdc_state[current_fdc].dor & (0x10 << UNIT(current_drive)))
+		mask &= ~(0x10 << UNIT(current_drive));
+
+	/* starts motor and selects floppy */
+	del_timer(motor_off_timer + current_drive);
+	set_dor(current_fdc, mask, data);
+
+	/* wait_for_completion also schedules reset if needed. */
+	return fd_wait_for_completion(drive_state[current_drive].select_date + drive_params[current_drive].select_delay,
+				      function);
+}
+
+static void floppy_ready(void)
+{
+	if (fdc_state[current_fdc].reset) {
+		reset_fdc();
+		return;
+	}
+	if (start_motor(floppy_ready))
+		return;
+	if (fdc_dtr())
+		return;
+
+	debug_dcl(drive_params[current_drive].flags,
+		  "calling disk change from floppy_ready\n");
+	if (!(raw_cmd->flags & FD_RAW_NO_MOTOR) &&
+	    disk_change(current_drive) && !drive_params[current_drive].select_delay)
+		twaddle(current_fdc, current_drive);	/* this clears the dcl on certain
+				 * drive/controller combinations */
+
+#ifdef fd_chose_dma_mode
+	if ((raw_cmd->flags & FD_RAW_READ) || (raw_cmd->flags & FD_RAW_WRITE)) {
+		unsigned long flags = claim_dma_lock();
+		fd_chose_dma_mode(raw_cmd->kernel_data, raw_cmd->length);
+		release_dma_lock(flags);
+	}
+#endif
+
+	if (raw_cmd->flags & (FD_RAW_NEED_SEEK | FD_RAW_NEED_DISK)) {
+		perpendicular_mode(current_fdc);
+		fdc_specify(current_fdc, current_drive); /* must be done here because of hut, hlt ... */
+		seek_floppy();
+	} else {
+		if ((raw_cmd->flags & FD_RAW_READ) ||
+		    (raw_cmd->flags & FD_RAW_WRITE))
+			fdc_specify(current_fdc, current_drive);
+		setup_rw_floppy();
+	}
+}
+
+static void floppy_start(void)
+{
+	reschedule_timeout(current_drive, "floppy start");
+
+	scandrives();
+	debug_dcl(drive_params[current_drive].flags,
+		  "setting NEWCHANGE in floppy_start\n");
+	set_bit(FD_DISK_NEWCHANGE_BIT, &drive_state[current_drive].flags);
+	floppy_ready();
+}
+
+/*
+ * ========================================================================
+ * here ends the bottom half. Exported routines are:
+ * floppy_start, floppy_off, floppy_ready, lock_fdc, unlock_fdc, set_fdc,
+ * start_motor, reset_fdc, reset_fdc_info, interpret_errors.
+ * Initialization also uses output_byte, result, set_dor, floppy_interrupt
+ * and set_dor.
+ * ========================================================================
+ */
+/*
+ * General purpose continuations.
+ * ==============================
+ */
+
+static void do_wakeup(void)
+{
+	reschedule_timeout(MAXTIMEOUT, "do wakeup");
+	cont = NULL;
+	command_status += 2;
+	wake_up(&command_done);
+}
+
+static const struct cont_t wakeup_cont = {
+	.interrupt	= empty,
+	.redo		= do_wakeup,
+	.error		= empty,
+	.done		= (done_f)empty
+};
+
+static const struct cont_t intr_cont = {
+	.interrupt	= empty,
+	.redo		= process_fd_request,
+	.error		= empty,
+	.done		= (done_f)empty
+};
+
+/* schedules handler, waiting for completion. May be interrupted, will then
+ * return -EINTR, in which case the driver will automatically be unlocked.
+ */
+static int wait_til_done(void (*handler)(void), bool interruptible)
+{
+	int ret;
+
+	schedule_bh(handler);
+
+	if (interruptible)
+		wait_event_interruptible(command_done, command_status >= 2);
+	else
+		wait_event(command_done, command_status >= 2);
+
+	if (command_status < 2) {
+		cancel_activity();
+		cont = &intr_cont;
+		reset_fdc();
+		return -EINTR;
+	}
+
+	if (fdc_state[current_fdc].reset)
+		command_status = FD_COMMAND_ERROR;
+	if (command_status == FD_COMMAND_OKAY)
+		ret = 0;
+	else
+		ret = -EIO;
+	command_status = FD_COMMAND_NONE;
+	return ret;
+}
+
+static void generic_done(int result)
+{
+	command_status = result;
+	cont = &wakeup_cont;
+}
+
+static void generic_success(void)
+{
+	cont->done(1);
+}
+
+static void generic_failure(void)
+{
+	cont->done(0);
+}
+
+static void success_and_wakeup(void)
+{
+	generic_success();
+	cont->redo();
+}
+
+/*
+ * formatting and rw support.
+ * ==========================
+ */
+
+static int next_valid_format(int drive)
+{
+	int probed_format;
+
+	probed_format = drive_state[drive].probed_format;
+	while (1) {
+		if (probed_format >= FD_AUTODETECT_SIZE ||
+		    !drive_params[drive].autodetect[probed_format]) {
+			drive_state[drive].probed_format = 0;
+			return 1;
+		}
+		if (floppy_type[drive_params[drive].autodetect[probed_format]].sect) {
+			drive_state[drive].probed_format = probed_format;
+			return 0;
+		}
+		probed_format++;
+	}
+}
+
+static void bad_flp_intr(void)
+{
+	int err_count;
+
+	if (probing) {
+		drive_state[current_drive].probed_format++;
+		if (!next_valid_format(current_drive))
+			return;
+	}
+	err_count = ++floppy_errors;
+	INFBOUND(write_errors[current_drive].badness, err_count);
+	if (err_count > drive_params[current_drive].max_errors.abort)
+		cont->done(0);
+	if (err_count > drive_params[current_drive].max_errors.reset)
+		fdc_state[current_fdc].reset = 1;
+	else if (err_count > drive_params[current_drive].max_errors.recal)
+		drive_state[current_drive].track = NEED_2_RECAL;
+}
+
+static void set_floppy(int drive)
+{
+	int type = ITYPE(drive_state[drive].fd_device);
+
+	if (type)
+		_floppy = floppy_type + type;
+	else
+		_floppy = current_type[drive];
+}
+
+/*
+ * formatting support.
+ * ===================
+ */
+static void format_interrupt(void)
+{
+	switch (interpret_errors()) {
+	case 1:
+		cont->error();
+		break;
+	case 2:
+		break;
+	case 0:
+		cont->done(1);
+	}
+	cont->redo();
+}
+
+#define FM_MODE(x, y) ((y) & ~(((x)->rate & 0x80) >> 1))
+#define CT(x) ((x) | 0xc0)
+
+static void setup_format_params(int track)
+{
+	int n;
+	int il;
+	int count;
+	int head_shift;
+	int track_shift;
+	struct fparm {
+		unsigned char track, head, sect, size;
+	} *here = (struct fparm *)floppy_track_buffer;
+
+	raw_cmd = &default_raw_cmd;
+	raw_cmd->track = track;
+
+	raw_cmd->flags = (FD_RAW_WRITE | FD_RAW_INTR | FD_RAW_SPIN |
+			  FD_RAW_NEED_DISK | FD_RAW_NEED_SEEK);
+	raw_cmd->rate = _floppy->rate & 0x43;
+	raw_cmd->cmd_count = NR_F;
+	raw_cmd->cmd[COMMAND] = FM_MODE(_floppy, FD_FORMAT);
+	raw_cmd->cmd[DR_SELECT] = UNIT(current_drive) + PH_HEAD(_floppy, format_req.head);
+	raw_cmd->cmd[F_SIZECODE] = FD_SIZECODE(_floppy);
+	raw_cmd->cmd[F_SECT_PER_TRACK] = _floppy->sect << 2 >> raw_cmd->cmd[F_SIZECODE];
+	raw_cmd->cmd[F_GAP] = _floppy->fmt_gap;
+	raw_cmd->cmd[F_FILL] = FD_FILL_BYTE;
+
+	raw_cmd->kernel_data = floppy_track_buffer;
+	raw_cmd->length = 4 * raw_cmd->cmd[F_SECT_PER_TRACK];
+
+	if (!raw_cmd->cmd[F_SECT_PER_TRACK])
+		return;
+
+	/* allow for about 30ms for data transport per track */
+	head_shift = (raw_cmd->cmd[F_SECT_PER_TRACK] + 5) / 6;
+
+	/* a ``cylinder'' is two tracks plus a little stepping time */
+	track_shift = 2 * head_shift + 3;
+
+	/* position of logical sector 1 on this track */
+	n = (track_shift * format_req.track + head_shift * format_req.head)
+	    % raw_cmd->cmd[F_SECT_PER_TRACK];
+
+	/* determine interleave */
+	il = 1;
+	if (_floppy->fmt_gap < 0x22)
+		il++;
+
+	/* initialize field */
+	for (count = 0; count < raw_cmd->cmd[F_SECT_PER_TRACK]; ++count) {
+		here[count].track = format_req.track;
+		here[count].head = format_req.head;
+		here[count].sect = 0;
+		here[count].size = raw_cmd->cmd[F_SIZECODE];
+	}
+	/* place logical sectors */
+	for (count = 1; count <= raw_cmd->cmd[F_SECT_PER_TRACK]; ++count) {
+		here[n].sect = count;
+		n = (n + il) % raw_cmd->cmd[F_SECT_PER_TRACK];
+		if (here[n].sect) {	/* sector busy, find next free sector */
+			++n;
+			if (n >= raw_cmd->cmd[F_SECT_PER_TRACK]) {
+				n -= raw_cmd->cmd[F_SECT_PER_TRACK];
+				while (here[n].sect)
+					++n;
+			}
+		}
+	}
+	if (_floppy->stretch & FD_SECTBASEMASK) {
+		for (count = 0; count < raw_cmd->cmd[F_SECT_PER_TRACK]; count++)
+			here[count].sect += FD_SECTBASE(_floppy) - 1;
+	}
+}
+
+static void redo_format(void)
+{
+	buffer_track = -1;
+	setup_format_params(format_req.track << STRETCH(_floppy));
+	floppy_start();
+	debugt(__func__, "queue format request");
+}
+
+static const struct cont_t format_cont = {
+	.interrupt	= format_interrupt,
+	.redo		= redo_format,
+	.error		= bad_flp_intr,
+	.done		= generic_done
+};
+
+static int do_format(int drive, struct format_descr *tmp_format_req)
+{
+	int ret;
+
+	if (lock_fdc(drive))
+		return -EINTR;
+
+	set_floppy(drive);
+	if (!_floppy ||
+	    _floppy->track > drive_params[current_drive].tracks ||
+	    tmp_format_req->track >= _floppy->track ||
+	    tmp_format_req->head >= _floppy->head ||
+	    (_floppy->sect << 2) % (1 << FD_SIZECODE(_floppy)) ||
+	    !_floppy->fmt_gap) {
+		process_fd_request();
+		return -EINVAL;
+	}
+	format_req = *tmp_format_req;
+	cont = &format_cont;
+	floppy_errors = 0;
+	ret = wait_til_done(redo_format, true);
+	if (ret == -EINTR)
+		return -EINTR;
+	process_fd_request();
+	return ret;
+}
+
+/*
+ * Buffer read/write and support
+ * =============================
+ */
+
+static void floppy_end_request(struct request *req, blk_status_t error)
+{
+	unsigned int nr_sectors = current_count_sectors;
+	unsigned int drive = (unsigned long)req->q->disk->private_data;
+
+	/* current_count_sectors can be zero if transfer failed */
+	if (error)
+		nr_sectors = blk_rq_cur_sectors(req);
+	if (blk_update_request(req, error, nr_sectors << 9))
+		return;
+	__blk_mq_end_request(req, error);
+
+	/* We're done with the request */
+	floppy_off(drive);
+	current_req = NULL;
+}
+
+/* new request_done. Can handle physical sectors which are smaller than a
+ * logical buffer */
+static void request_done(int uptodate)
+{
+	struct request *req = current_req;
+	int block;
+	char msg[sizeof("request done ") + sizeof(int) * 3];
+
+	probing = 0;
+	snprintf(msg, sizeof(msg), "request done %d", uptodate);
+	reschedule_timeout(MAXTIMEOUT, msg);
+
+	if (!req) {
+		pr_info("floppy.c: no request in request_done\n");
+		return;
+	}
+
+	if (uptodate) {
+		/* maintain values for invalidation on geometry
+		 * change */
+		block = current_count_sectors + blk_rq_pos(req);
+		INFBOUND(drive_state[current_drive].maxblock, block);
+		if (block > _floppy->sect)
+			drive_state[current_drive].maxtrack = 1;
+
+		floppy_end_request(req, 0);
+	} else {
+		if (rq_data_dir(req) == WRITE) {
+			/* record write error information */
+			write_errors[current_drive].write_errors++;
+			if (write_errors[current_drive].write_errors == 1) {
+				write_errors[current_drive].first_error_sector = blk_rq_pos(req);
+				write_errors[current_drive].first_error_generation = drive_state[current_drive].generation;
+			}
+			write_errors[current_drive].last_error_sector = blk_rq_pos(req);
+			write_errors[current_drive].last_error_generation = drive_state[current_drive].generation;
+		}
+		floppy_end_request(req, BLK_STS_IOERR);
+	}
+}
+
+/* Interrupt handler evaluating the result of the r/w operation */
+static void rw_interrupt(void)
+{
+	int eoc;
+	int ssize;
+	int heads;
+	int nr_sectors;
+
+	if (reply_buffer[R_HEAD] >= 2) {
+		/* some Toshiba floppy controllers occasionnally seem to
+		 * return bogus interrupts after read/write operations, which
+		 * can be recognized by a bad head number (>= 2) */
+		return;
+	}
+
+	if (!drive_state[current_drive].first_read_date)
+		drive_state[current_drive].first_read_date = jiffies;
+
+	ssize = DIV_ROUND_UP(1 << raw_cmd->cmd[SIZECODE], 4);
+
+	if (reply_buffer[ST1] & ST1_EOC)
+		eoc = 1;
+	else
+		eoc = 0;
+
+	if (raw_cmd->cmd[COMMAND] & 0x80)
+		heads = 2;
+	else
+		heads = 1;
+
+	nr_sectors = (((reply_buffer[R_TRACK] - raw_cmd->cmd[TRACK]) * heads +
+		       reply_buffer[R_HEAD] - raw_cmd->cmd[HEAD]) * raw_cmd->cmd[SECT_PER_TRACK] +
+		      reply_buffer[R_SECTOR] - raw_cmd->cmd[SECTOR] + eoc) << raw_cmd->cmd[SIZECODE] >> 2;
+
+	if (nr_sectors / ssize >
+	    DIV_ROUND_UP(in_sector_offset + current_count_sectors, ssize)) {
+		DPRINT("long rw: %x instead of %lx\n",
+		       nr_sectors, current_count_sectors);
+		pr_info("rs=%d s=%d\n", reply_buffer[R_SECTOR],
+			raw_cmd->cmd[SECTOR]);
+		pr_info("rh=%d h=%d\n", reply_buffer[R_HEAD],
+			raw_cmd->cmd[HEAD]);
+		pr_info("rt=%d t=%d\n", reply_buffer[R_TRACK],
+			raw_cmd->cmd[TRACK]);
+		pr_info("heads=%d eoc=%d\n", heads, eoc);
+		pr_info("spt=%d st=%d ss=%d\n",
+			raw_cmd->cmd[SECT_PER_TRACK], fsector_t, ssize);
+		pr_info("in_sector_offset=%d\n", in_sector_offset);
+	}
+
+	nr_sectors -= in_sector_offset;
+	INFBOUND(nr_sectors, 0);
+	SUPBOUND(current_count_sectors, nr_sectors);
+
+	switch (interpret_errors()) {
+	case 2:
+		cont->redo();
+		return;
+	case 1:
+		if (!current_count_sectors) {
+			cont->error();
+			cont->redo();
+			return;
+		}
+		break;
+	case 0:
+		if (!current_count_sectors) {
+			cont->redo();
+			return;
+		}
+		current_type[current_drive] = _floppy;
+		floppy_sizes[TOMINOR(current_drive)] = _floppy->size;
+		break;
+	}
+
+	if (probing) {
+		if (drive_params[current_drive].flags & FTD_MSG)
+			DPRINT("Auto-detected floppy type %s in fd%d\n",
+			       _floppy->name, current_drive);
+		current_type[current_drive] = _floppy;
+		floppy_sizes[TOMINOR(current_drive)] = _floppy->size;
+		probing = 0;
+	}
+
+	if (CT(raw_cmd->cmd[COMMAND]) != FD_READ) {
+		/* transfer directly from buffer */
+		cont->done(1);
+	} else {
+		buffer_track = raw_cmd->track;
+		buffer_drive = current_drive;
+		INFBOUND(buffer_max, nr_sectors + fsector_t);
+	}
+	cont->redo();
+}
+
+/* Compute the maximal transfer size */
+static int transfer_size(int ssize, int max_sector, int max_size)
+{
+	SUPBOUND(max_sector, fsector_t + max_size);
+
+	/* alignment */
+	max_sector -= (max_sector % _floppy->sect) % ssize;
+
+	/* transfer size, beginning not aligned */
+	current_count_sectors = max_sector - fsector_t;
+
+	return max_sector;
+}
+
+/*
+ * Move data from/to the track buffer to/from the buffer cache.
+ */
+static void copy_buffer(int ssize, int max_sector, int max_sector_2)
+{
+	int remaining;		/* number of transferred 512-byte sectors */
+	struct bio_vec bv;
+	char *dma_buffer;
+	int size;
+	struct req_iterator iter;
+
+	max_sector = transfer_size(ssize,
+				   min(max_sector, max_sector_2),
+				   blk_rq_sectors(current_req));
+
+	if (current_count_sectors <= 0 && CT(raw_cmd->cmd[COMMAND]) == FD_WRITE &&
+	    buffer_max > fsector_t + blk_rq_sectors(current_req))
+		current_count_sectors = min_t(int, buffer_max - fsector_t,
+					      blk_rq_sectors(current_req));
+
+	remaining = current_count_sectors << 9;
+	if (remaining > blk_rq_bytes(current_req) && CT(raw_cmd->cmd[COMMAND]) == FD_WRITE) {
+		DPRINT("in copy buffer\n");
+		pr_info("current_count_sectors=%ld\n", current_count_sectors);
+		pr_info("remaining=%d\n", remaining >> 9);
+		pr_info("current_req->nr_sectors=%u\n",
+			blk_rq_sectors(current_req));
+		pr_info("current_req->current_nr_sectors=%u\n",
+			blk_rq_cur_sectors(current_req));
+		pr_info("max_sector=%d\n", max_sector);
+		pr_info("ssize=%d\n", ssize);
+	}
+
+	buffer_max = max(max_sector, buffer_max);
+
+	dma_buffer = floppy_track_buffer + ((fsector_t - buffer_min) << 9);
+
+	size = blk_rq_cur_bytes(current_req);
+
+	rq_for_each_segment(bv, current_req, iter) {
+		if (!remaining)
+			break;
+
+		size = bv.bv_len;
+		SUPBOUND(size, remaining);
+		if (dma_buffer + size >
+		    floppy_track_buffer + (max_buffer_sectors << 10) ||
+		    dma_buffer < floppy_track_buffer) {
+			DPRINT("buffer overrun in copy buffer %d\n",
+			       (int)((floppy_track_buffer - dma_buffer) >> 9));
+			pr_info("fsector_t=%d buffer_min=%d\n",
+				fsector_t, buffer_min);
+			pr_info("current_count_sectors=%ld\n",
+				current_count_sectors);
+			if (CT(raw_cmd->cmd[COMMAND]) == FD_READ)
+				pr_info("read\n");
+			if (CT(raw_cmd->cmd[COMMAND]) == FD_WRITE)
+				pr_info("write\n");
+			break;
+		}
+
+		if (CT(raw_cmd->cmd[COMMAND]) == FD_READ)
+			memcpy_to_bvec(&bv, dma_buffer);
+		else
+			memcpy_from_bvec(dma_buffer, &bv);
+
+		remaining -= size;
+		dma_buffer += size;
+	}
+	if (remaining) {
+		if (remaining > 0)
+			max_sector -= remaining >> 9;
+		DPRINT("weirdness: remaining %d\n", remaining >> 9);
+	}
+}
+
+/* work around a bug in pseudo DMA
+ * (on some FDCs) pseudo DMA does not stop when the CPU stops
+ * sending data.  Hence we need a different way to signal the
+ * transfer length:  We use raw_cmd->cmd[SECT_PER_TRACK].  Unfortunately, this
+ * does not work with MT, hence we can only transfer one head at
+ * a time
+ */
+static void virtualdmabug_workaround(void)
+{
+	int hard_sectors;
+	int end_sector;
+
+	if (CT(raw_cmd->cmd[COMMAND]) == FD_WRITE) {
+		raw_cmd->cmd[COMMAND] &= ~0x80;	/* switch off multiple track mode */
+
+		hard_sectors = raw_cmd->length >> (7 + raw_cmd->cmd[SIZECODE]);
+		end_sector = raw_cmd->cmd[SECTOR] + hard_sectors - 1;
+		if (end_sector > raw_cmd->cmd[SECT_PER_TRACK]) {
+			pr_info("too many sectors %d > %d\n",
+				end_sector, raw_cmd->cmd[SECT_PER_TRACK]);
+			return;
+		}
+		raw_cmd->cmd[SECT_PER_TRACK] = end_sector;
+					/* make sure raw_cmd->cmd[SECT_PER_TRACK]
+					 * points to end of transfer */
+	}
+}
+
+/*
+ * Formulate a read/write request.
+ * this routine decides where to load the data (directly to buffer, or to
+ * tmp floppy area), how much data to load (the size of the buffer, the whole
+ * track, or a single sector)
+ * All floppy_track_buffer handling goes in here. If we ever add track buffer
+ * allocation on the fly, it should be done here. No other part should need
+ * modification.
+ */
+
+static int make_raw_rw_request(void)
+{
+	int aligned_sector_t;
+	int max_sector;
+	int max_size;
+	int tracksize;
+	int ssize;
+
+	if (WARN(max_buffer_sectors == 0, "VFS: Block I/O scheduled on unopened device\n"))
+		return 0;
+
+	set_fdc((long)current_req->q->disk->private_data);
+
+	raw_cmd = &default_raw_cmd;
+	raw_cmd->flags = FD_RAW_SPIN | FD_RAW_NEED_DISK | FD_RAW_NEED_SEEK;
+	raw_cmd->cmd_count = NR_RW;
+	if (rq_data_dir(current_req) == READ) {
+		raw_cmd->flags |= FD_RAW_READ;
+		raw_cmd->cmd[COMMAND] = FM_MODE(_floppy, FD_READ);
+	} else if (rq_data_dir(current_req) == WRITE) {
+		raw_cmd->flags |= FD_RAW_WRITE;
+		raw_cmd->cmd[COMMAND] = FM_MODE(_floppy, FD_WRITE);
+	} else {
+		DPRINT("%s: unknown command\n", __func__);
+		return 0;
+	}
+
+	max_sector = _floppy->sect * _floppy->head;
+
+	raw_cmd->cmd[TRACK] = (int)blk_rq_pos(current_req) / max_sector;
+	fsector_t = (int)blk_rq_pos(current_req) % max_sector;
+	if (_floppy->track && raw_cmd->cmd[TRACK] >= _floppy->track) {
+		if (blk_rq_cur_sectors(current_req) & 1) {
+			current_count_sectors = 1;
+			return 1;
+		} else
+			return 0;
+	}
+	raw_cmd->cmd[HEAD] = fsector_t / _floppy->sect;
+
+	if (((_floppy->stretch & (FD_SWAPSIDES | FD_SECTBASEMASK)) ||
+	     test_bit(FD_NEED_TWADDLE_BIT, &drive_state[current_drive].flags)) &&
+	    fsector_t < _floppy->sect)
+		max_sector = _floppy->sect;
+
+	/* 2M disks have phantom sectors on the first track */
+	if ((_floppy->rate & FD_2M) && (!raw_cmd->cmd[TRACK]) && (!raw_cmd->cmd[HEAD])) {
+		max_sector = 2 * _floppy->sect / 3;
+		if (fsector_t >= max_sector) {
+			current_count_sectors =
+			    min_t(int, _floppy->sect - fsector_t,
+				  blk_rq_sectors(current_req));
+			return 1;
+		}
+		raw_cmd->cmd[SIZECODE] = 2;
+	} else
+		raw_cmd->cmd[SIZECODE] = FD_SIZECODE(_floppy);
+	raw_cmd->rate = _floppy->rate & 0x43;
+	if ((_floppy->rate & FD_2M) &&
+	    (raw_cmd->cmd[TRACK] || raw_cmd->cmd[HEAD]) && raw_cmd->rate == 2)
+		raw_cmd->rate = 1;
+
+	if (raw_cmd->cmd[SIZECODE])
+		raw_cmd->cmd[SIZECODE2] = 0xff;
+	else
+		raw_cmd->cmd[SIZECODE2] = 0x80;
+	raw_cmd->track = raw_cmd->cmd[TRACK] << STRETCH(_floppy);
+	raw_cmd->cmd[DR_SELECT] = UNIT(current_drive) + PH_HEAD(_floppy, raw_cmd->cmd[HEAD]);
+	raw_cmd->cmd[GAP] = _floppy->gap;
+	ssize = DIV_ROUND_UP(1 << raw_cmd->cmd[SIZECODE], 4);
+	raw_cmd->cmd[SECT_PER_TRACK] = _floppy->sect << 2 >> raw_cmd->cmd[SIZECODE];
+	raw_cmd->cmd[SECTOR] = ((fsector_t % _floppy->sect) << 2 >> raw_cmd->cmd[SIZECODE]) +
+	    FD_SECTBASE(_floppy);
+
+	/* tracksize describes the size which can be filled up with sectors
+	 * of size ssize.
+	 */
+	tracksize = _floppy->sect - _floppy->sect % ssize;
+	if (tracksize < _floppy->sect) {
+		raw_cmd->cmd[SECT_PER_TRACK]++;
+		if (tracksize <= fsector_t % _floppy->sect)
+			raw_cmd->cmd[SECTOR]--;
+
+		/* if we are beyond tracksize, fill up using smaller sectors */
+		while (tracksize <= fsector_t % _floppy->sect) {
+			while (tracksize + ssize > _floppy->sect) {
+				raw_cmd->cmd[SIZECODE]--;
+				ssize >>= 1;
+			}
+			raw_cmd->cmd[SECTOR]++;
+			raw_cmd->cmd[SECT_PER_TRACK]++;
+			tracksize += ssize;
+		}
+		max_sector = raw_cmd->cmd[HEAD] * _floppy->sect + tracksize;
+	} else if (!raw_cmd->cmd[TRACK] && !raw_cmd->cmd[HEAD] && !(_floppy->rate & FD_2M) && probing) {
+		max_sector = _floppy->sect;
+	} else if (!raw_cmd->cmd[HEAD] && CT(raw_cmd->cmd[COMMAND]) == FD_WRITE) {
+		/* for virtual DMA bug workaround */
+		max_sector = _floppy->sect;
+	}
+
+	in_sector_offset = (fsector_t % _floppy->sect) % ssize;
+	aligned_sector_t = fsector_t - in_sector_offset;
+	max_size = blk_rq_sectors(current_req);
+	if ((raw_cmd->track == buffer_track) &&
+	    (current_drive == buffer_drive) &&
+	    (fsector_t >= buffer_min) && (fsector_t < buffer_max)) {
+		/* data already in track buffer */
+		if (CT(raw_cmd->cmd[COMMAND]) == FD_READ) {
+			copy_buffer(1, max_sector, buffer_max);
+			return 1;
+		}
+	} else if (in_sector_offset || blk_rq_sectors(current_req) < ssize) {
+		if (CT(raw_cmd->cmd[COMMAND]) == FD_WRITE) {
+			unsigned int sectors;
+
+			sectors = fsector_t + blk_rq_sectors(current_req);
+			if (sectors > ssize && sectors < ssize + ssize)
+				max_size = ssize + ssize;
+			else
+				max_size = ssize;
+		}
+		raw_cmd->flags &= ~FD_RAW_WRITE;
+		raw_cmd->flags |= FD_RAW_READ;
+		raw_cmd->cmd[COMMAND] = FM_MODE(_floppy, FD_READ);
+	}
+
+	if (CT(raw_cmd->cmd[COMMAND]) == FD_READ)
+		max_size = max_sector;	/* unbounded */
+
+	/* claim buffer track if needed */
+	if (buffer_track != raw_cmd->track ||	/* bad track */
+	    buffer_drive != current_drive ||	/* bad drive */
+	    fsector_t > buffer_max ||
+	    fsector_t < buffer_min ||
+	    ((CT(raw_cmd->cmd[COMMAND]) == FD_READ ||
+	      (!in_sector_offset && blk_rq_sectors(current_req) >= ssize)) &&
+	     max_sector > 2 * max_buffer_sectors + buffer_min &&
+	     max_size + fsector_t > 2 * max_buffer_sectors + buffer_min)) {
+		/* not enough space */
+		buffer_track = -1;
+		buffer_drive = current_drive;
+		buffer_max = buffer_min = aligned_sector_t;
+	}
+	raw_cmd->kernel_data = floppy_track_buffer +
+		((aligned_sector_t - buffer_min) << 9);
+
+	if (CT(raw_cmd->cmd[COMMAND]) == FD_WRITE) {
+		/* copy write buffer to track buffer.
+		 * if we get here, we know that the write
+		 * is either aligned or the data already in the buffer
+		 * (buffer will be overwritten) */
+		if (in_sector_offset && buffer_track == -1)
+			DPRINT("internal error offset !=0 on write\n");
+		buffer_track = raw_cmd->track;
+		buffer_drive = current_drive;
+		copy_buffer(ssize, max_sector,
+			    2 * max_buffer_sectors + buffer_min);
+	} else
+		transfer_size(ssize, max_sector,
+			      2 * max_buffer_sectors + buffer_min -
+			      aligned_sector_t);
+
+	/* round up current_count_sectors to get dma xfer size */
+	raw_cmd->length = in_sector_offset + current_count_sectors;
+	raw_cmd->length = ((raw_cmd->length - 1) | (ssize - 1)) + 1;
+	raw_cmd->length <<= 9;
+	if ((raw_cmd->length < current_count_sectors << 9) ||
+	    (CT(raw_cmd->cmd[COMMAND]) == FD_WRITE &&
+	     (aligned_sector_t + (raw_cmd->length >> 9) > buffer_max ||
+	      aligned_sector_t < buffer_min)) ||
+	    raw_cmd->length % (128 << raw_cmd->cmd[SIZECODE]) ||
+	    raw_cmd->length <= 0 || current_count_sectors <= 0) {
+		DPRINT("fractionary current count b=%lx s=%lx\n",
+		       raw_cmd->length, current_count_sectors);
+		pr_info("addr=%d, length=%ld\n",
+			(int)((raw_cmd->kernel_data -
+			       floppy_track_buffer) >> 9),
+			current_count_sectors);
+		pr_info("st=%d ast=%d mse=%d msi=%d\n",
+			fsector_t, aligned_sector_t, max_sector, max_size);
+		pr_info("ssize=%x SIZECODE=%d\n", ssize, raw_cmd->cmd[SIZECODE]);
+		pr_info("command=%x SECTOR=%d HEAD=%d, TRACK=%d\n",
+			raw_cmd->cmd[COMMAND], raw_cmd->cmd[SECTOR],
+			raw_cmd->cmd[HEAD], raw_cmd->cmd[TRACK]);
+		pr_info("buffer drive=%d\n", buffer_drive);
+		pr_info("buffer track=%d\n", buffer_track);
+		pr_info("buffer_min=%d\n", buffer_min);
+		pr_info("buffer_max=%d\n", buffer_max);
+		return 0;
+	}
+
+	if (raw_cmd->kernel_data < floppy_track_buffer ||
+	    current_count_sectors < 0 ||
+	    raw_cmd->length < 0 ||
+	    raw_cmd->kernel_data + raw_cmd->length >
+	    floppy_track_buffer + (max_buffer_sectors << 10)) {
+		DPRINT("buffer overrun in schedule dma\n");
+		pr_info("fsector_t=%d buffer_min=%d current_count=%ld\n",
+			fsector_t, buffer_min, raw_cmd->length >> 9);
+		pr_info("current_count_sectors=%ld\n",
+			current_count_sectors);
+		if (CT(raw_cmd->cmd[COMMAND]) == FD_READ)
+			pr_info("read\n");
+		if (CT(raw_cmd->cmd[COMMAND]) == FD_WRITE)
+			pr_info("write\n");
+		return 0;
+	}
+	if (raw_cmd->length == 0) {
+		DPRINT("zero dma transfer attempted from make_raw_request\n");
+		return 0;
+	}
+
+	virtualdmabug_workaround();
+	return 2;
+}
+
+static int set_next_request(void)
+{
+	current_req = list_first_entry_or_null(&floppy_reqs, struct request,
+					       queuelist);
+	if (current_req) {
+		floppy_errors = 0;
+		list_del_init(&current_req->queuelist);
+		return 1;
+	}
+	return 0;
+}
+
+/* Starts or continues processing request. Will automatically unlock the
+ * driver at end of request.
+ */
+static void redo_fd_request(void)
+{
+	int drive;
+	int tmp;
+
+	lastredo = jiffies;
+	if (current_drive < N_DRIVE)
+		floppy_off(current_drive);
+
+do_request:
+	if (!current_req) {
+		int pending;
+
+		spin_lock_irq(&floppy_lock);
+		pending = set_next_request();
+		spin_unlock_irq(&floppy_lock);
+		if (!pending) {
+			do_floppy = NULL;
+			unlock_fdc();
+			return;
+		}
+	}
+	drive = (long)current_req->q->disk->private_data;
+	set_fdc(drive);
+	reschedule_timeout(current_drive, "redo fd request");
+
+	set_floppy(drive);
+	raw_cmd = &default_raw_cmd;
+	raw_cmd->flags = 0;
+	if (start_motor(redo_fd_request))
+		return;
+
+	disk_change(current_drive);
+	if (test_bit(current_drive, &fake_change) ||
+	    test_bit(FD_DISK_CHANGED_BIT, &drive_state[current_drive].flags)) {
+		DPRINT("disk absent or changed during operation\n");
+		request_done(0);
+		goto do_request;
+	}
+	if (!_floppy) {	/* Autodetection */
+		if (!probing) {
+			drive_state[current_drive].probed_format = 0;
+			if (next_valid_format(current_drive)) {
+				DPRINT("no autodetectable formats\n");
+				_floppy = NULL;
+				request_done(0);
+				goto do_request;
+			}
+		}
+		probing = 1;
+		_floppy = floppy_type + drive_params[current_drive].autodetect[drive_state[current_drive].probed_format];
+	} else
+		probing = 0;
+	tmp = make_raw_rw_request();
+	if (tmp < 2) {
+		request_done(tmp);
+		goto do_request;
+	}
+
+	if (test_bit(FD_NEED_TWADDLE_BIT, &drive_state[current_drive].flags))
+		twaddle(current_fdc, current_drive);
+	schedule_bh(floppy_start);
+	debugt(__func__, "queue fd request");
+	return;
+}
+
+static const struct cont_t rw_cont = {
+	.interrupt	= rw_interrupt,
+	.redo		= redo_fd_request,
+	.error		= bad_flp_intr,
+	.done		= request_done
+};
+
+/* schedule the request and automatically unlock the driver on completion */
+static void process_fd_request(void)
+{
+	cont = &rw_cont;
+	schedule_bh(redo_fd_request);
+}
+
+static blk_status_t floppy_queue_rq(struct blk_mq_hw_ctx *hctx,
+				    const struct blk_mq_queue_data *bd)
+{
+	blk_mq_start_request(bd->rq);
+
+	if (WARN(max_buffer_sectors == 0,
+		 "VFS: %s called on non-open device\n", __func__))
+		return BLK_STS_IOERR;
+
+	if (WARN(atomic_read(&usage_count) == 0,
+		 "warning: usage count=0, current_req=%p sect=%ld flags=%llx\n",
+		 current_req, (long)blk_rq_pos(current_req),
+		 (__force unsigned long long) current_req->cmd_flags))
+		return BLK_STS_IOERR;
+
+	if (test_and_set_bit(0, &fdc_busy)) {
+		/* fdc busy, this new request will be treated when the
+		   current one is done */
+		is_alive(__func__, "old request running");
+		return BLK_STS_RESOURCE;
+	}
+
+	spin_lock_irq(&floppy_lock);
+	list_add_tail(&bd->rq->queuelist, &floppy_reqs);
+	spin_unlock_irq(&floppy_lock);
+
+	command_status = FD_COMMAND_NONE;
+	__reschedule_timeout(MAXTIMEOUT, "fd_request");
+	set_fdc(0);
+	process_fd_request();
+	is_alive(__func__, "");
+	return BLK_STS_OK;
+}
+
+static const struct cont_t poll_cont = {
+	.interrupt	= success_and_wakeup,
+	.redo		= floppy_ready,
+	.error		= generic_failure,
+	.done		= generic_done
+};
+
+static int poll_drive(bool interruptible, int flag)
+{
+	/* no auto-sense, just clear dcl */
+	raw_cmd = &default_raw_cmd;
+	raw_cmd->flags = flag;
+	raw_cmd->track = 0;
+	raw_cmd->cmd_count = 0;
+	cont = &poll_cont;
+	debug_dcl(drive_params[current_drive].flags,
+		  "setting NEWCHANGE in poll_drive\n");
+	set_bit(FD_DISK_NEWCHANGE_BIT, &drive_state[current_drive].flags);
+
+	return wait_til_done(floppy_ready, interruptible);
+}
+
+/*
+ * User triggered reset
+ * ====================
+ */
+
+static void reset_intr(void)
+{
+	pr_info("weird, reset interrupt called\n");
+}
+
+static const struct cont_t reset_cont = {
+	.interrupt	= reset_intr,
+	.redo		= success_and_wakeup,
+	.error		= generic_failure,
+	.done		= generic_done
+};
+
+/*
+ * Resets the FDC connected to drive <drive>.
+ * Both current_drive and current_fdc are changed to match the new drive.
+ */
+static int user_reset_fdc(int drive, int arg, bool interruptible)
+{
+	int ret;
+
+	if (lock_fdc(drive))
+		return -EINTR;
+
+	if (arg == FD_RESET_ALWAYS)
+		fdc_state[current_fdc].reset = 1;
+	if (fdc_state[current_fdc].reset) {
+		/* note: reset_fdc will take care of unlocking the driver
+		 * on completion.
+		 */
+		cont = &reset_cont;
+		ret = wait_til_done(reset_fdc, interruptible);
+		if (ret == -EINTR)
+			return -EINTR;
+	}
+	process_fd_request();
+	return 0;
+}
+
+/*
+ * Misc Ioctl's and support
+ * ========================
+ */
+static inline int fd_copyout(void __user *param, const void *address,
+			     unsigned long size)
+{
+	return copy_to_user(param, address, size) ? -EFAULT : 0;
+}
+
+static inline int fd_copyin(void __user *param, void *address,
+			    unsigned long size)
+{
+	return copy_from_user(address, param, size) ? -EFAULT : 0;
+}
+
+static const char *drive_name(int type, int drive)
+{
+	struct floppy_struct *floppy;
+
+	if (type)
+		floppy = floppy_type + type;
+	else {
+		if (drive_params[drive].native_format)
+			floppy = floppy_type + drive_params[drive].native_format;
+		else
+			return "(null)";
+	}
+	if (floppy->name)
+		return floppy->name;
+	else
+		return "(null)";
+}
+
+#ifdef CONFIG_BLK_DEV_FD_RAWCMD
+
+/* raw commands */
+static void raw_cmd_done(int flag)
+{
+	if (!flag) {
+		raw_cmd->flags |= FD_RAW_FAILURE;
+		raw_cmd->flags |= FD_RAW_HARDFAILURE;
+	} else {
+		raw_cmd->reply_count = inr;
+		if (raw_cmd->reply_count > FD_RAW_REPLY_SIZE)
+			raw_cmd->reply_count = 0;
+		memcpy(raw_cmd->reply, reply_buffer, raw_cmd->reply_count);
+
+		if (raw_cmd->flags & (FD_RAW_READ | FD_RAW_WRITE)) {
+			unsigned long flags;
+			flags = claim_dma_lock();
+			raw_cmd->length = fd_get_dma_residue();
+			release_dma_lock(flags);
+		}
+
+		if ((raw_cmd->flags & FD_RAW_SOFTFAILURE) &&
+		    (!raw_cmd->reply_count || (raw_cmd->reply[0] & 0xc0)))
+			raw_cmd->flags |= FD_RAW_FAILURE;
+
+		if (disk_change(current_drive))
+			raw_cmd->flags |= FD_RAW_DISK_CHANGE;
+		else
+			raw_cmd->flags &= ~FD_RAW_DISK_CHANGE;
+		if (raw_cmd->flags & FD_RAW_NO_MOTOR_AFTER)
+			motor_off_callback(&motor_off_timer[current_drive]);
+
+		if (raw_cmd->next &&
+		    (!(raw_cmd->flags & FD_RAW_FAILURE) ||
+		     !(raw_cmd->flags & FD_RAW_STOP_IF_FAILURE)) &&
+		    ((raw_cmd->flags & FD_RAW_FAILURE) ||
+		     !(raw_cmd->flags & FD_RAW_STOP_IF_SUCCESS))) {
+			raw_cmd = raw_cmd->next;
+			return;
+		}
+	}
+	generic_done(flag);
+}
+
+static const struct cont_t raw_cmd_cont = {
+	.interrupt	= success_and_wakeup,
+	.redo		= floppy_start,
+	.error		= generic_failure,
+	.done		= raw_cmd_done
+};
+
+static int raw_cmd_copyout(int cmd, void __user *param,
+				  struct floppy_raw_cmd *ptr)
+{
+	int ret;
+
+	while (ptr) {
+		struct floppy_raw_cmd cmd = *ptr;
+		cmd.next = NULL;
+		cmd.kernel_data = NULL;
+		ret = copy_to_user(param, &cmd, sizeof(cmd));
+		if (ret)
+			return -EFAULT;
+		param += sizeof(struct floppy_raw_cmd);
+		if ((ptr->flags & FD_RAW_READ) && ptr->buffer_length) {
+			if (ptr->length >= 0 &&
+			    ptr->length <= ptr->buffer_length) {
+				long length = ptr->buffer_length - ptr->length;
+				ret = fd_copyout(ptr->data, ptr->kernel_data,
+						 length);
+				if (ret)
+					return ret;
+			}
+		}
+		ptr = ptr->next;
+	}
+
+	return 0;
+}
+
+static void raw_cmd_free(struct floppy_raw_cmd **ptr)
+{
+	struct floppy_raw_cmd *next;
+	struct floppy_raw_cmd *this;
+
+	this = *ptr;
+	*ptr = NULL;
+	while (this) {
+		if (this->buffer_length) {
+			fd_dma_mem_free((unsigned long)this->kernel_data,
+					this->buffer_length);
+			this->buffer_length = 0;
+		}
+		next = this->next;
+		kfree(this);
+		this = next;
+	}
+}
+
+#define MAX_LEN (1UL << MAX_ORDER << PAGE_SHIFT)
+
+static int raw_cmd_copyin(int cmd, void __user *param,
+				 struct floppy_raw_cmd **rcmd)
+{
+	struct floppy_raw_cmd *ptr;
+	int ret;
+
+	*rcmd = NULL;
+
+loop:
+	ptr = kmalloc(sizeof(struct floppy_raw_cmd), GFP_KERNEL);
+	if (!ptr)
+		return -ENOMEM;
+	*rcmd = ptr;
+	ret = copy_from_user(ptr, param, sizeof(*ptr));
+	ptr->next = NULL;
+	ptr->buffer_length = 0;
+	ptr->kernel_data = NULL;
+	if (ret)
+		return -EFAULT;
+	param += sizeof(struct floppy_raw_cmd);
+	if (ptr->cmd_count > FD_RAW_CMD_FULLSIZE)
+		return -EINVAL;
+
+	memset(ptr->reply, 0, FD_RAW_REPLY_SIZE);
+	ptr->resultcode = 0;
+
+	if (ptr->flags & (FD_RAW_READ | FD_RAW_WRITE)) {
+		if (ptr->length <= 0 || ptr->length >= MAX_LEN)
+			return -EINVAL;
+		ptr->kernel_data = (char *)fd_dma_mem_alloc(ptr->length);
+		fallback_on_nodma_alloc(&ptr->kernel_data, ptr->length);
+		if (!ptr->kernel_data)
+			return -ENOMEM;
+		ptr->buffer_length = ptr->length;
+	}
+	if (ptr->flags & FD_RAW_WRITE) {
+		ret = fd_copyin(ptr->data, ptr->kernel_data, ptr->length);
+		if (ret)
+			return ret;
+	}
+
+	if (ptr->flags & FD_RAW_MORE) {
+		rcmd = &(ptr->next);
+		ptr->rate &= 0x43;
+		goto loop;
+	}
+
+	return 0;
+}
+
+static int raw_cmd_ioctl(int cmd, void __user *param)
+{
+	struct floppy_raw_cmd *my_raw_cmd;
+	int drive;
+	int ret2;
+	int ret;
+
+	if (fdc_state[current_fdc].rawcmd <= 1)
+		fdc_state[current_fdc].rawcmd = 1;
+	for (drive = 0; drive < N_DRIVE; drive++) {
+		if (FDC(drive) != current_fdc)
+			continue;
+		if (drive == current_drive) {
+			if (drive_state[drive].fd_ref > 1) {
+				fdc_state[current_fdc].rawcmd = 2;
+				break;
+			}
+		} else if (drive_state[drive].fd_ref) {
+			fdc_state[current_fdc].rawcmd = 2;
+			break;
+		}
+	}
+
+	if (fdc_state[current_fdc].reset)
+		return -EIO;
+
+	ret = raw_cmd_copyin(cmd, param, &my_raw_cmd);
+	if (ret) {
+		raw_cmd_free(&my_raw_cmd);
+		return ret;
+	}
+
+	raw_cmd = my_raw_cmd;
+	cont = &raw_cmd_cont;
+	ret = wait_til_done(floppy_start, true);
+	debug_dcl(drive_params[current_drive].flags,
+		  "calling disk change from raw_cmd ioctl\n");
+
+	if (ret != -EINTR && fdc_state[current_fdc].reset)
+		ret = -EIO;
+
+	drive_state[current_drive].track = NO_TRACK;
+
+	ret2 = raw_cmd_copyout(cmd, param, my_raw_cmd);
+	if (!ret)
+		ret = ret2;
+	raw_cmd_free(&my_raw_cmd);
+	return ret;
+}
+
+static int floppy_raw_cmd_ioctl(int type, int drive, int cmd,
+				void __user *param)
+{
+	int ret;
+
+	pr_warn_once("Note: FDRAWCMD is deprecated and will be removed from the kernel in the near future.\n");
+
+	if (type)
+		return -EINVAL;
+	if (lock_fdc(drive))
+		return -EINTR;
+	set_floppy(drive);
+	ret = raw_cmd_ioctl(cmd, param);
+	if (ret == -EINTR)
+		return -EINTR;
+	process_fd_request();
+	return ret;
+}
+
+#else /* CONFIG_BLK_DEV_FD_RAWCMD */
+
+static int floppy_raw_cmd_ioctl(int type, int drive, int cmd,
+				void __user *param)
+{
+	return -EOPNOTSUPP;
+}
+
+#endif
+
+static int invalidate_drive(struct block_device *bdev)
+{
+	/* invalidate the buffer track to force a reread */
+	set_bit((long)bdev->bd_disk->private_data, &fake_change);
+	process_fd_request();
+	if (bdev_check_media_change(bdev))
+		floppy_revalidate(bdev->bd_disk);
+	return 0;
+}
+
+static int set_geometry(unsigned int cmd, struct floppy_struct *g,
+			       int drive, int type, struct block_device *bdev)
+{
+	int cnt;
+
+	/* sanity checking for parameters. */
+	if ((int)g->sect <= 0 ||
+	    (int)g->head <= 0 ||
+	    /* check for overflow in max_sector */
+	    (int)(g->sect * g->head) <= 0 ||
+	    /* check for zero in raw_cmd->cmd[F_SECT_PER_TRACK] */
+	    (unsigned char)((g->sect << 2) >> FD_SIZECODE(g)) == 0 ||
+	    g->track <= 0 || g->track > drive_params[drive].tracks >> STRETCH(g) ||
+	    /* check if reserved bits are set */
+	    (g->stretch & ~(FD_STRETCH | FD_SWAPSIDES | FD_SECTBASEMASK)) != 0)
+		return -EINVAL;
+	if (type) {
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+		mutex_lock(&open_lock);
+		if (lock_fdc(drive)) {
+			mutex_unlock(&open_lock);
+			return -EINTR;
+		}
+		floppy_type[type] = *g;
+		floppy_type[type].name = "user format";
+		for (cnt = type << 2; cnt < (type << 2) + 4; cnt++)
+			floppy_sizes[cnt] = floppy_sizes[cnt + 0x80] =
+			    floppy_type[type].size + 1;
+		process_fd_request();
+		for (cnt = 0; cnt < N_DRIVE; cnt++) {
+			struct block_device *bdev = opened_bdev[cnt];
+			if (!bdev || ITYPE(drive_state[cnt].fd_device) != type)
+				continue;
+			__invalidate_device(bdev, true);
+		}
+		mutex_unlock(&open_lock);
+	} else {
+		int oldStretch;
+
+		if (lock_fdc(drive))
+			return -EINTR;
+		if (cmd != FDDEFPRM) {
+			/* notice a disk change immediately, else
+			 * we lose our settings immediately*/
+			if (poll_drive(true, FD_RAW_NEED_DISK) == -EINTR)
+				return -EINTR;
+		}
+		oldStretch = g->stretch;
+		user_params[drive] = *g;
+		if (buffer_drive == drive)
+			SUPBOUND(buffer_max, user_params[drive].sect);
+		current_type[drive] = &user_params[drive];
+		floppy_sizes[drive] = user_params[drive].size;
+		if (cmd == FDDEFPRM)
+			drive_state[current_drive].keep_data = -1;
+		else
+			drive_state[current_drive].keep_data = 1;
+		/* invalidation. Invalidate only when needed, i.e.
+		 * when there are already sectors in the buffer cache
+		 * whose number will change. This is useful, because
+		 * mtools often changes the geometry of the disk after
+		 * looking at the boot block */
+		if (drive_state[current_drive].maxblock > user_params[drive].sect ||
+		    drive_state[current_drive].maxtrack ||
+		    ((user_params[drive].sect ^ oldStretch) &
+		     (FD_SWAPSIDES | FD_SECTBASEMASK)))
+			invalidate_drive(bdev);
+		else
+			process_fd_request();
+	}
+	return 0;
+}
+
+/* handle obsolete ioctl's */
+static unsigned int ioctl_table[] = {
+	FDCLRPRM,
+	FDSETPRM,
+	FDDEFPRM,
+	FDGETPRM,
+	FDMSGON,
+	FDMSGOFF,
+	FDFMTBEG,
+	FDFMTTRK,
+	FDFMTEND,
+	FDSETEMSGTRESH,
+	FDFLUSH,
+	FDSETMAXERRS,
+	FDGETMAXERRS,
+	FDGETDRVTYP,
+	FDSETDRVPRM,
+	FDGETDRVPRM,
+	FDGETDRVSTAT,
+	FDPOLLDRVSTAT,
+	FDRESET,
+	FDGETFDCSTAT,
+	FDWERRORCLR,
+	FDWERRORGET,
+	FDRAWCMD,
+	FDEJECT,
+	FDTWADDLE
+};
+
+static int normalize_ioctl(unsigned int *cmd, int *size)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(ioctl_table); i++) {
+		if ((*cmd & 0xffff) == (ioctl_table[i] & 0xffff)) {
+			*size = _IOC_SIZE(*cmd);
+			*cmd = ioctl_table[i];
+			if (*size > _IOC_SIZE(*cmd)) {
+				pr_info("ioctl not yet supported\n");
+				return -EFAULT;
+			}
+			return 0;
+		}
+	}
+	return -EINVAL;
+}
+
+static int get_floppy_geometry(int drive, int type, struct floppy_struct **g)
+{
+	if (type)
+		*g = &floppy_type[type];
+	else {
+		if (lock_fdc(drive))
+			return -EINTR;
+		if (poll_drive(false, 0) == -EINTR)
+			return -EINTR;
+		process_fd_request();
+		*g = current_type[drive];
+	}
+	if (!*g)
+		return -ENODEV;
+	return 0;
+}
+
+static int fd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+	int drive = (long)bdev->bd_disk->private_data;
+	int type = ITYPE(drive_state[drive].fd_device);
+	struct floppy_struct *g;
+	int ret;
+
+	ret = get_floppy_geometry(drive, type, &g);
+	if (ret)
+		return ret;
+
+	geo->heads = g->head;
+	geo->sectors = g->sect;
+	geo->cylinders = g->track;
+	return 0;
+}
+
+static bool valid_floppy_drive_params(const short autodetect[FD_AUTODETECT_SIZE],
+		int native_format)
+{
+	size_t floppy_type_size = ARRAY_SIZE(floppy_type);
+	size_t i = 0;
+
+	for (i = 0; i < FD_AUTODETECT_SIZE; ++i) {
+		if (autodetect[i] < 0 ||
+		    autodetect[i] >= floppy_type_size)
+			return false;
+	}
+
+	if (native_format < 0 || native_format >= floppy_type_size)
+		return false;
+
+	return true;
+}
+
+static int fd_locked_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
+		    unsigned long param)
+{
+	int drive = (long)bdev->bd_disk->private_data;
+	int type = ITYPE(drive_state[drive].fd_device);
+	int ret;
+	int size;
+	union inparam {
+		struct floppy_struct g;	/* geometry */
+		struct format_descr f;
+		struct floppy_max_errors max_errors;
+		struct floppy_drive_params dp;
+	} inparam;		/* parameters coming from user space */
+	const void *outparam;	/* parameters passed back to user space */
+
+	/* convert compatibility eject ioctls into floppy eject ioctl.
+	 * We do this in order to provide a means to eject floppy disks before
+	 * installing the new fdutils package */
+	if (cmd == CDROMEJECT ||	/* CD-ROM eject */
+	    cmd == 0x6470) {		/* SunOS floppy eject */
+		DPRINT("obsolete eject ioctl\n");
+		DPRINT("please use floppycontrol --eject\n");
+		cmd = FDEJECT;
+	}
+
+	if (!((cmd & 0xff00) == 0x0200))
+		return -EINVAL;
+
+	/* convert the old style command into a new style command */
+	ret = normalize_ioctl(&cmd, &size);
+	if (ret)
+		return ret;
+
+	/* permission checks */
+	if (((cmd & 0x40) && !(mode & (FMODE_WRITE | FMODE_WRITE_IOCTL))) ||
+	    ((cmd & 0x80) && !capable(CAP_SYS_ADMIN)))
+		return -EPERM;
+
+	if (WARN_ON(size < 0 || size > sizeof(inparam)))
+		return -EINVAL;
+
+	/* copyin */
+	memset(&inparam, 0, sizeof(inparam));
+	if (_IOC_DIR(cmd) & _IOC_WRITE) {
+		ret = fd_copyin((void __user *)param, &inparam, size);
+		if (ret)
+			return ret;
+	}
+
+	switch (cmd) {
+	case FDEJECT:
+		if (drive_state[drive].fd_ref != 1)
+			/* somebody else has this drive open */
+			return -EBUSY;
+		if (lock_fdc(drive))
+			return -EINTR;
+
+		/* do the actual eject. Fails on
+		 * non-Sparc architectures */
+		ret = fd_eject(UNIT(drive));
+
+		set_bit(FD_DISK_CHANGED_BIT, &drive_state[drive].flags);
+		set_bit(FD_VERIFY_BIT, &drive_state[drive].flags);
+		process_fd_request();
+		return ret;
+	case FDCLRPRM:
+		if (lock_fdc(drive))
+			return -EINTR;
+		current_type[drive] = NULL;
+		floppy_sizes[drive] = MAX_DISK_SIZE << 1;
+		drive_state[drive].keep_data = 0;
+		return invalidate_drive(bdev);
+	case FDSETPRM:
+	case FDDEFPRM:
+		return set_geometry(cmd, &inparam.g, drive, type, bdev);
+	case FDGETPRM:
+		ret = get_floppy_geometry(drive, type,
+					  (struct floppy_struct **)&outparam);
+		if (ret)
+			return ret;
+		memcpy(&inparam.g, outparam,
+				offsetof(struct floppy_struct, name));
+		outparam = &inparam.g;
+		break;
+	case FDMSGON:
+		drive_params[drive].flags |= FTD_MSG;
+		return 0;
+	case FDMSGOFF:
+		drive_params[drive].flags &= ~FTD_MSG;
+		return 0;
+	case FDFMTBEG:
+		if (lock_fdc(drive))
+			return -EINTR;
+		if (poll_drive(true, FD_RAW_NEED_DISK) == -EINTR)
+			return -EINTR;
+		ret = drive_state[drive].flags;
+		process_fd_request();
+		if (ret & FD_VERIFY)
+			return -ENODEV;
+		if (!(ret & FD_DISK_WRITABLE))
+			return -EROFS;
+		return 0;
+	case FDFMTTRK:
+		if (drive_state[drive].fd_ref != 1)
+			return -EBUSY;
+		return do_format(drive, &inparam.f);
+	case FDFMTEND:
+	case FDFLUSH:
+		if (lock_fdc(drive))
+			return -EINTR;
+		return invalidate_drive(bdev);
+	case FDSETEMSGTRESH:
+		drive_params[drive].max_errors.reporting = (unsigned short)(param & 0x0f);
+		return 0;
+	case FDGETMAXERRS:
+		outparam = &drive_params[drive].max_errors;
+		break;
+	case FDSETMAXERRS:
+		drive_params[drive].max_errors = inparam.max_errors;
+		break;
+	case FDGETDRVTYP:
+		outparam = drive_name(type, drive);
+		SUPBOUND(size, strlen((const char *)outparam) + 1);
+		break;
+	case FDSETDRVPRM:
+		if (!valid_floppy_drive_params(inparam.dp.autodetect,
+				inparam.dp.native_format))
+			return -EINVAL;
+		drive_params[drive] = inparam.dp;
+		break;
+	case FDGETDRVPRM:
+		outparam = &drive_params[drive];
+		break;
+	case FDPOLLDRVSTAT:
+		if (lock_fdc(drive))
+			return -EINTR;
+		if (poll_drive(true, FD_RAW_NEED_DISK) == -EINTR)
+			return -EINTR;
+		process_fd_request();
+		fallthrough;
+	case FDGETDRVSTAT:
+		outparam = &drive_state[drive];
+		break;
+	case FDRESET:
+		return user_reset_fdc(drive, (int)param, true);
+	case FDGETFDCSTAT:
+		outparam = &fdc_state[FDC(drive)];
+		break;
+	case FDWERRORCLR:
+		memset(&write_errors[drive], 0, sizeof(write_errors[drive]));
+		return 0;
+	case FDWERRORGET:
+		outparam = &write_errors[drive];
+		break;
+	case FDRAWCMD:
+		return floppy_raw_cmd_ioctl(type, drive, cmd, (void __user *)param);
+	case FDTWADDLE:
+		if (lock_fdc(drive))
+			return -EINTR;
+		twaddle(current_fdc, current_drive);
+		process_fd_request();
+		return 0;
+	default:
+		return -EINVAL;
+	}
+
+	if (_IOC_DIR(cmd) & _IOC_READ)
+		return fd_copyout((void __user *)param, outparam, size);
+
+	return 0;
+}
+
+static int fd_ioctl(struct block_device *bdev, fmode_t mode,
+			     unsigned int cmd, unsigned long param)
+{
+	int ret;
+
+	mutex_lock(&floppy_mutex);
+	ret = fd_locked_ioctl(bdev, mode, cmd, param);
+	mutex_unlock(&floppy_mutex);
+
+	return ret;
+}
+
+#ifdef CONFIG_COMPAT
+
+struct compat_floppy_drive_params {
+	char		cmos;
+	compat_ulong_t	max_dtr;
+	compat_ulong_t	hlt;
+	compat_ulong_t	hut;
+	compat_ulong_t	srt;
+	compat_ulong_t	spinup;
+	compat_ulong_t	spindown;
+	unsigned char	spindown_offset;
+	unsigned char	select_delay;
+	unsigned char	rps;
+	unsigned char	tracks;
+	compat_ulong_t	timeout;
+	unsigned char	interleave_sect;
+	struct floppy_max_errors max_errors;
+	char		flags;
+	char		read_track;
+	short		autodetect[FD_AUTODETECT_SIZE];
+	compat_int_t	checkfreq;
+	compat_int_t	native_format;
+};
+
+struct compat_floppy_drive_struct {
+	signed char	flags;
+	compat_ulong_t	spinup_date;
+	compat_ulong_t	select_date;
+	compat_ulong_t	first_read_date;
+	short		probed_format;
+	short		track;
+	short		maxblock;
+	short		maxtrack;
+	compat_int_t	generation;
+	compat_int_t	keep_data;
+	compat_int_t	fd_ref;
+	compat_int_t	fd_device;
+	compat_int_t	last_checked;
+	compat_caddr_t dmabuf;
+	compat_int_t	bufblocks;
+};
+
+struct compat_floppy_fdc_state {
+	compat_int_t	spec1;
+	compat_int_t	spec2;
+	compat_int_t	dtr;
+	unsigned char	version;
+	unsigned char	dor;
+	compat_ulong_t	address;
+	unsigned int	rawcmd:2;
+	unsigned int	reset:1;
+	unsigned int	need_configure:1;
+	unsigned int	perp_mode:2;
+	unsigned int	has_fifo:1;
+	unsigned int	driver_version;
+	unsigned char	track[4];
+};
+
+struct compat_floppy_write_errors {
+	unsigned int	write_errors;
+	compat_ulong_t	first_error_sector;
+	compat_int_t	first_error_generation;
+	compat_ulong_t	last_error_sector;
+	compat_int_t	last_error_generation;
+	compat_uint_t	badness;
+};
+
+#define FDSETPRM32 _IOW(2, 0x42, struct compat_floppy_struct)
+#define FDDEFPRM32 _IOW(2, 0x43, struct compat_floppy_struct)
+#define FDSETDRVPRM32 _IOW(2, 0x90, struct compat_floppy_drive_params)
+#define FDGETDRVPRM32 _IOR(2, 0x11, struct compat_floppy_drive_params)
+#define FDGETDRVSTAT32 _IOR(2, 0x12, struct compat_floppy_drive_struct)
+#define FDPOLLDRVSTAT32 _IOR(2, 0x13, struct compat_floppy_drive_struct)
+#define FDGETFDCSTAT32 _IOR(2, 0x15, struct compat_floppy_fdc_state)
+#define FDWERRORGET32  _IOR(2, 0x17, struct compat_floppy_write_errors)
+
+static int compat_set_geometry(struct block_device *bdev, fmode_t mode, unsigned int cmd,
+		    struct compat_floppy_struct __user *arg)
+{
+	struct floppy_struct v;
+	int drive, type;
+	int err;
+
+	BUILD_BUG_ON(offsetof(struct floppy_struct, name) !=
+		     offsetof(struct compat_floppy_struct, name));
+
+	if (!(mode & (FMODE_WRITE | FMODE_WRITE_IOCTL)))
+		return -EPERM;
+
+	memset(&v, 0, sizeof(struct floppy_struct));
+	if (copy_from_user(&v, arg, offsetof(struct floppy_struct, name)))
+		return -EFAULT;
+
+	mutex_lock(&floppy_mutex);
+	drive = (long)bdev->bd_disk->private_data;
+	type = ITYPE(drive_state[drive].fd_device);
+	err = set_geometry(cmd == FDSETPRM32 ? FDSETPRM : FDDEFPRM,
+			&v, drive, type, bdev);
+	mutex_unlock(&floppy_mutex);
+	return err;
+}
+
+static int compat_get_prm(int drive,
+			  struct compat_floppy_struct __user *arg)
+{
+	struct compat_floppy_struct v;
+	struct floppy_struct *p;
+	int err;
+
+	memset(&v, 0, sizeof(v));
+	mutex_lock(&floppy_mutex);
+	err = get_floppy_geometry(drive, ITYPE(drive_state[drive].fd_device),
+				  &p);
+	if (err) {
+		mutex_unlock(&floppy_mutex);
+		return err;
+	}
+	memcpy(&v, p, offsetof(struct floppy_struct, name));
+	mutex_unlock(&floppy_mutex);
+	if (copy_to_user(arg, &v, sizeof(struct compat_floppy_struct)))
+		return -EFAULT;
+	return 0;
+}
+
+static int compat_setdrvprm(int drive,
+			    struct compat_floppy_drive_params __user *arg)
+{
+	struct compat_floppy_drive_params v;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+	if (copy_from_user(&v, arg, sizeof(struct compat_floppy_drive_params)))
+		return -EFAULT;
+	if (!valid_floppy_drive_params(v.autodetect, v.native_format))
+		return -EINVAL;
+	mutex_lock(&floppy_mutex);
+	drive_params[drive].cmos = v.cmos;
+	drive_params[drive].max_dtr = v.max_dtr;
+	drive_params[drive].hlt = v.hlt;
+	drive_params[drive].hut = v.hut;
+	drive_params[drive].srt = v.srt;
+	drive_params[drive].spinup = v.spinup;
+	drive_params[drive].spindown = v.spindown;
+	drive_params[drive].spindown_offset = v.spindown_offset;
+	drive_params[drive].select_delay = v.select_delay;
+	drive_params[drive].rps = v.rps;
+	drive_params[drive].tracks = v.tracks;
+	drive_params[drive].timeout = v.timeout;
+	drive_params[drive].interleave_sect = v.interleave_sect;
+	drive_params[drive].max_errors = v.max_errors;
+	drive_params[drive].flags = v.flags;
+	drive_params[drive].read_track = v.read_track;
+	memcpy(drive_params[drive].autodetect, v.autodetect,
+	       sizeof(v.autodetect));
+	drive_params[drive].checkfreq = v.checkfreq;
+	drive_params[drive].native_format = v.native_format;
+	mutex_unlock(&floppy_mutex);
+	return 0;
+}
+
+static int compat_getdrvprm(int drive,
+			    struct compat_floppy_drive_params __user *arg)
+{
+	struct compat_floppy_drive_params v;
+
+	memset(&v, 0, sizeof(struct compat_floppy_drive_params));
+	mutex_lock(&floppy_mutex);
+	v.cmos = drive_params[drive].cmos;
+	v.max_dtr = drive_params[drive].max_dtr;
+	v.hlt = drive_params[drive].hlt;
+	v.hut = drive_params[drive].hut;
+	v.srt = drive_params[drive].srt;
+	v.spinup = drive_params[drive].spinup;
+	v.spindown = drive_params[drive].spindown;
+	v.spindown_offset = drive_params[drive].spindown_offset;
+	v.select_delay = drive_params[drive].select_delay;
+	v.rps = drive_params[drive].rps;
+	v.tracks = drive_params[drive].tracks;
+	v.timeout = drive_params[drive].timeout;
+	v.interleave_sect = drive_params[drive].interleave_sect;
+	v.max_errors = drive_params[drive].max_errors;
+	v.flags = drive_params[drive].flags;
+	v.read_track = drive_params[drive].read_track;
+	memcpy(v.autodetect, drive_params[drive].autodetect,
+	       sizeof(v.autodetect));
+	v.checkfreq = drive_params[drive].checkfreq;
+	v.native_format = drive_params[drive].native_format;
+	mutex_unlock(&floppy_mutex);
+
+	if (copy_to_user(arg, &v, sizeof(struct compat_floppy_drive_params)))
+		return -EFAULT;
+	return 0;
+}
+
+static int compat_getdrvstat(int drive, bool poll,
+			    struct compat_floppy_drive_struct __user *arg)
+{
+	struct compat_floppy_drive_struct v;
+
+	memset(&v, 0, sizeof(struct compat_floppy_drive_struct));
+	mutex_lock(&floppy_mutex);
+
+	if (poll) {
+		if (lock_fdc(drive))
+			goto Eintr;
+		if (poll_drive(true, FD_RAW_NEED_DISK) == -EINTR)
+			goto Eintr;
+		process_fd_request();
+	}
+	v.spinup_date = drive_state[drive].spinup_date;
+	v.select_date = drive_state[drive].select_date;
+	v.first_read_date = drive_state[drive].first_read_date;
+	v.probed_format = drive_state[drive].probed_format;
+	v.track = drive_state[drive].track;
+	v.maxblock = drive_state[drive].maxblock;
+	v.maxtrack = drive_state[drive].maxtrack;
+	v.generation = drive_state[drive].generation;
+	v.keep_data = drive_state[drive].keep_data;
+	v.fd_ref = drive_state[drive].fd_ref;
+	v.fd_device = drive_state[drive].fd_device;
+	v.last_checked = drive_state[drive].last_checked;
+	v.dmabuf = (uintptr_t) drive_state[drive].dmabuf;
+	v.bufblocks = drive_state[drive].bufblocks;
+	mutex_unlock(&floppy_mutex);
+
+	if (copy_to_user(arg, &v, sizeof(struct compat_floppy_drive_struct)))
+		return -EFAULT;
+	return 0;
+Eintr:
+	mutex_unlock(&floppy_mutex);
+	return -EINTR;
+}
+
+static int compat_getfdcstat(int drive,
+			    struct compat_floppy_fdc_state __user *arg)
+{
+	struct compat_floppy_fdc_state v32;
+	struct floppy_fdc_state v;
+
+	mutex_lock(&floppy_mutex);
+	v = fdc_state[FDC(drive)];
+	mutex_unlock(&floppy_mutex);
+
+	memset(&v32, 0, sizeof(struct compat_floppy_fdc_state));
+	v32.spec1 = v.spec1;
+	v32.spec2 = v.spec2;
+	v32.dtr = v.dtr;
+	v32.version = v.version;
+	v32.dor = v.dor;
+	v32.address = v.address;
+	v32.rawcmd = v.rawcmd;
+	v32.reset = v.reset;
+	v32.need_configure = v.need_configure;
+	v32.perp_mode = v.perp_mode;
+	v32.has_fifo = v.has_fifo;
+	v32.driver_version = v.driver_version;
+	memcpy(v32.track, v.track, 4);
+	if (copy_to_user(arg, &v32, sizeof(struct compat_floppy_fdc_state)))
+		return -EFAULT;
+	return 0;
+}
+
+static int compat_werrorget(int drive,
+			    struct compat_floppy_write_errors __user *arg)
+{
+	struct compat_floppy_write_errors v32;
+	struct floppy_write_errors v;
+
+	memset(&v32, 0, sizeof(struct compat_floppy_write_errors));
+	mutex_lock(&floppy_mutex);
+	v = write_errors[drive];
+	mutex_unlock(&floppy_mutex);
+	v32.write_errors = v.write_errors;
+	v32.first_error_sector = v.first_error_sector;
+	v32.first_error_generation = v.first_error_generation;
+	v32.last_error_sector = v.last_error_sector;
+	v32.last_error_generation = v.last_error_generation;
+	v32.badness = v.badness;
+	if (copy_to_user(arg, &v32, sizeof(struct compat_floppy_write_errors)))
+		return -EFAULT;
+	return 0;
+}
+
+static int fd_compat_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd,
+		    unsigned long param)
+{
+	int drive = (long)bdev->bd_disk->private_data;
+	switch (cmd) {
+	case CDROMEJECT: /* CD-ROM eject */
+	case 0x6470:	 /* SunOS floppy eject */
+
+	case FDMSGON:
+	case FDMSGOFF:
+	case FDSETEMSGTRESH:
+	case FDFLUSH:
+	case FDWERRORCLR:
+	case FDEJECT:
+	case FDCLRPRM:
+	case FDFMTBEG:
+	case FDRESET:
+	case FDTWADDLE:
+		return fd_ioctl(bdev, mode, cmd, param);
+	case FDSETMAXERRS:
+	case FDGETMAXERRS:
+	case FDGETDRVTYP:
+	case FDFMTEND:
+	case FDFMTTRK:
+	case FDRAWCMD:
+		return fd_ioctl(bdev, mode, cmd,
+				(unsigned long)compat_ptr(param));
+	case FDSETPRM32:
+	case FDDEFPRM32:
+		return compat_set_geometry(bdev, mode, cmd, compat_ptr(param));
+	case FDGETPRM32:
+		return compat_get_prm(drive, compat_ptr(param));
+	case FDSETDRVPRM32:
+		return compat_setdrvprm(drive, compat_ptr(param));
+	case FDGETDRVPRM32:
+		return compat_getdrvprm(drive, compat_ptr(param));
+	case FDPOLLDRVSTAT32:
+		return compat_getdrvstat(drive, true, compat_ptr(param));
+	case FDGETDRVSTAT32:
+		return compat_getdrvstat(drive, false, compat_ptr(param));
+	case FDGETFDCSTAT32:
+		return compat_getfdcstat(drive, compat_ptr(param));
+	case FDWERRORGET32:
+		return compat_werrorget(drive, compat_ptr(param));
+	}
+	return -EINVAL;
+}
+#endif
+
+static void __init config_types(void)
+{
+	bool has_drive = false;
+	int drive;
+
+	/* read drive info out of physical CMOS */
+	drive = 0;
+	if (!drive_params[drive].cmos)
+		drive_params[drive].cmos = FLOPPY0_TYPE;
+	drive = 1;
+	if (!drive_params[drive].cmos)
+		drive_params[drive].cmos = FLOPPY1_TYPE;
+
+	/* FIXME: additional physical CMOS drive detection should go here */
+
+	for (drive = 0; drive < N_DRIVE; drive++) {
+		unsigned int type = drive_params[drive].cmos;
+		struct floppy_drive_params *params;
+		const char *name = NULL;
+		char temparea[32];
+
+		if (type < ARRAY_SIZE(default_drive_params)) {
+			params = &default_drive_params[type].params;
+			if (type) {
+				name = default_drive_params[type].name;
+				allowed_drive_mask |= 1 << drive;
+			} else
+				allowed_drive_mask &= ~(1 << drive);
+		} else {
+			params = &default_drive_params[0].params;
+			snprintf(temparea, sizeof(temparea),
+				 "unknown type %d (usb?)", type);
+			name = temparea;
+		}
+		if (name) {
+			const char *prepend;
+			if (!has_drive) {
+				prepend = "";
+				has_drive = true;
+				pr_info("Floppy drive(s):");
+			} else {
+				prepend = ",";
+			}
+
+			pr_cont("%s fd%d is %s", prepend, drive, name);
+		}
+		drive_params[drive] = *params;
+	}
+
+	if (has_drive)
+		pr_cont("\n");
+}
+
+static void floppy_release(struct gendisk *disk, fmode_t mode)
+{
+	int drive = (long)disk->private_data;
+
+	mutex_lock(&floppy_mutex);
+	mutex_lock(&open_lock);
+	if (!drive_state[drive].fd_ref--) {
+		DPRINT("floppy_release with fd_ref == 0");
+		drive_state[drive].fd_ref = 0;
+	}
+	if (!drive_state[drive].fd_ref)
+		opened_bdev[drive] = NULL;
+	mutex_unlock(&open_lock);
+	mutex_unlock(&floppy_mutex);
+}
+
+/*
+ * floppy_open check for aliasing (/dev/fd0 can be the same as
+ * /dev/PS0 etc), and disallows simultaneous access to the same
+ * drive with different device numbers.
+ */
+static int floppy_open(struct block_device *bdev, fmode_t mode)
+{
+	int drive = (long)bdev->bd_disk->private_data;
+	int old_dev, new_dev;
+	int try;
+	int res = -EBUSY;
+	char *tmp;
+
+	mutex_lock(&floppy_mutex);
+	mutex_lock(&open_lock);
+	old_dev = drive_state[drive].fd_device;
+	if (opened_bdev[drive] && opened_bdev[drive] != bdev)
+		goto out2;
+
+	if (!drive_state[drive].fd_ref && (drive_params[drive].flags & FD_BROKEN_DCL)) {
+		set_bit(FD_DISK_CHANGED_BIT, &drive_state[drive].flags);
+		set_bit(FD_VERIFY_BIT, &drive_state[drive].flags);
+	}
+
+	drive_state[drive].fd_ref++;
+
+	opened_bdev[drive] = bdev;
+
+	res = -ENXIO;
+
+	if (!floppy_track_buffer) {
+		/* if opening an ED drive, reserve a big buffer,
+		 * else reserve a small one */
+		if ((drive_params[drive].cmos == 6) || (drive_params[drive].cmos == 5))
+			try = 64;	/* Only 48 actually useful */
+		else
+			try = 32;	/* Only 24 actually useful */
+
+		tmp = (char *)fd_dma_mem_alloc(1024 * try);
+		if (!tmp && !floppy_track_buffer) {
+			try >>= 1;	/* buffer only one side */
+			INFBOUND(try, 16);
+			tmp = (char *)fd_dma_mem_alloc(1024 * try);
+		}
+		if (!tmp && !floppy_track_buffer)
+			fallback_on_nodma_alloc(&tmp, 2048 * try);
+		if (!tmp && !floppy_track_buffer) {
+			DPRINT("Unable to allocate DMA memory\n");
+			goto out;
+		}
+		if (floppy_track_buffer) {
+			if (tmp)
+				fd_dma_mem_free((unsigned long)tmp, try * 1024);
+		} else {
+			buffer_min = buffer_max = -1;
+			floppy_track_buffer = tmp;
+			max_buffer_sectors = try;
+		}
+	}
+
+	new_dev = MINOR(bdev->bd_dev);
+	drive_state[drive].fd_device = new_dev;
+	set_capacity(disks[drive][ITYPE(new_dev)], floppy_sizes[new_dev]);
+	if (old_dev != -1 && old_dev != new_dev) {
+		if (buffer_drive == drive)
+			buffer_track = -1;
+	}
+
+	if (fdc_state[FDC(drive)].rawcmd == 1)
+		fdc_state[FDC(drive)].rawcmd = 2;
+
+	if (!(mode & FMODE_NDELAY)) {
+		if (mode & (FMODE_READ|FMODE_WRITE)) {
+			drive_state[drive].last_checked = 0;
+			clear_bit(FD_OPEN_SHOULD_FAIL_BIT,
+				  &drive_state[drive].flags);
+			if (bdev_check_media_change(bdev))
+				floppy_revalidate(bdev->bd_disk);
+			if (test_bit(FD_DISK_CHANGED_BIT, &drive_state[drive].flags))
+				goto out;
+			if (test_bit(FD_OPEN_SHOULD_FAIL_BIT, &drive_state[drive].flags))
+				goto out;
+		}
+		res = -EROFS;
+		if ((mode & FMODE_WRITE) &&
+		    !test_bit(FD_DISK_WRITABLE_BIT, &drive_state[drive].flags))
+			goto out;
+	}
+	mutex_unlock(&open_lock);
+	mutex_unlock(&floppy_mutex);
+	return 0;
+out:
+	drive_state[drive].fd_ref--;
+
+	if (!drive_state[drive].fd_ref)
+		opened_bdev[drive] = NULL;
+out2:
+	mutex_unlock(&open_lock);
+	mutex_unlock(&floppy_mutex);
+	return res;
+}
+
+/*
+ * Check if the disk has been changed or if a change has been faked.
+ */
+static unsigned int floppy_check_events(struct gendisk *disk,
+					unsigned int clearing)
+{
+	int drive = (long)disk->private_data;
+
+	if (test_bit(FD_DISK_CHANGED_BIT, &drive_state[drive].flags) ||
+	    test_bit(FD_VERIFY_BIT, &drive_state[drive].flags))
+		return DISK_EVENT_MEDIA_CHANGE;
+
+	if (time_after(jiffies, drive_state[drive].last_checked + drive_params[drive].checkfreq)) {
+		if (lock_fdc(drive))
+			return 0;
+		poll_drive(false, 0);
+		process_fd_request();
+	}
+
+	if (test_bit(FD_DISK_CHANGED_BIT, &drive_state[drive].flags) ||
+	    test_bit(FD_VERIFY_BIT, &drive_state[drive].flags) ||
+	    test_bit(drive, &fake_change) ||
+	    drive_no_geom(drive))
+		return DISK_EVENT_MEDIA_CHANGE;
+	return 0;
+}
+
+/*
+ * This implements "read block 0" for floppy_revalidate().
+ * Needed for format autodetection, checking whether there is
+ * a disk in the drive, and whether that disk is writable.
+ */
+
+struct rb0_cbdata {
+	int drive;
+	struct completion complete;
+};
+
+static void floppy_rb0_cb(struct bio *bio)
+{
+	struct rb0_cbdata *cbdata = (struct rb0_cbdata *)bio->bi_private;
+	int drive = cbdata->drive;
+
+	if (bio->bi_status) {
+		pr_info("floppy: error %d while reading block 0\n",
+			bio->bi_status);
+		set_bit(FD_OPEN_SHOULD_FAIL_BIT, &drive_state[drive].flags);
+	}
+	complete(&cbdata->complete);
+}
+
+static int __floppy_read_block_0(struct block_device *bdev, int drive)
+{
+	struct bio bio;
+	struct bio_vec bio_vec;
+	struct page *page;
+	struct rb0_cbdata cbdata;
+
+	page = alloc_page(GFP_NOIO);
+	if (!page) {
+		process_fd_request();
+		return -ENOMEM;
+	}
+
+	cbdata.drive = drive;
+
+	bio_init(&bio, bdev, &bio_vec, 1, REQ_OP_READ);
+	bio_add_page(&bio, page, block_size(bdev), 0);
+
+	bio.bi_iter.bi_sector = 0;
+	bio.bi_flags |= (1 << BIO_QUIET);
+	bio.bi_private = &cbdata;
+	bio.bi_end_io = floppy_rb0_cb;
+
+	init_completion(&cbdata.complete);
+
+	submit_bio(&bio);
+	process_fd_request();
+
+	wait_for_completion(&cbdata.complete);
+
+	__free_page(page);
+
+	return 0;
+}
+
+/* revalidate the floppy disk, i.e. trigger format autodetection by reading
+ * the bootblock (block 0). "Autodetection" is also needed to check whether
+ * there is a disk in the drive at all... Thus we also do it for fixed
+ * geometry formats */
+static int floppy_revalidate(struct gendisk *disk)
+{
+	int drive = (long)disk->private_data;
+	int cf;
+	int res = 0;
+
+	if (test_bit(FD_DISK_CHANGED_BIT, &drive_state[drive].flags) ||
+	    test_bit(FD_VERIFY_BIT, &drive_state[drive].flags) ||
+	    test_bit(drive, &fake_change) ||
+	    drive_no_geom(drive)) {
+		if (WARN(atomic_read(&usage_count) == 0,
+			 "VFS: revalidate called on non-open device.\n"))
+			return -EFAULT;
+
+		res = lock_fdc(drive);
+		if (res)
+			return res;
+		cf = (test_bit(FD_DISK_CHANGED_BIT, &drive_state[drive].flags) ||
+		      test_bit(FD_VERIFY_BIT, &drive_state[drive].flags));
+		if (!(cf || test_bit(drive, &fake_change) || drive_no_geom(drive))) {
+			process_fd_request();	/*already done by another thread */
+			return 0;
+		}
+		drive_state[drive].maxblock = 0;
+		drive_state[drive].maxtrack = 0;
+		if (buffer_drive == drive)
+			buffer_track = -1;
+		clear_bit(drive, &fake_change);
+		clear_bit(FD_DISK_CHANGED_BIT, &drive_state[drive].flags);
+		if (cf)
+			drive_state[drive].generation++;
+		if (drive_no_geom(drive)) {
+			/* auto-sensing */
+			res = __floppy_read_block_0(opened_bdev[drive], drive);
+		} else {
+			if (cf)
+				poll_drive(false, FD_RAW_NEED_DISK);
+			process_fd_request();
+		}
+	}
+	set_capacity(disk, floppy_sizes[drive_state[drive].fd_device]);
+	return res;
+}
+
+static const struct block_device_operations floppy_fops = {
+	.owner			= THIS_MODULE,
+	.open			= floppy_open,
+	.release		= floppy_release,
+	.ioctl			= fd_ioctl,
+	.getgeo			= fd_getgeo,
+	.check_events		= floppy_check_events,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl		= fd_compat_ioctl,
+#endif
+};
+
+/*
+ * Floppy Driver initialization
+ * =============================
+ */
+
+/* Determine the floppy disk controller type */
+/* This routine was written by David C. Niemi */
+static char __init get_fdc_version(int fdc)
+{
+	int r;
+
+	output_byte(fdc, FD_DUMPREGS);	/* 82072 and better know DUMPREGS */
+	if (fdc_state[fdc].reset)
+		return FDC_NONE;
+	r = result(fdc);
+	if (r <= 0x00)
+		return FDC_NONE;	/* No FDC present ??? */
+	if ((r == 1) && (reply_buffer[ST0] == 0x80)) {
+		pr_info("FDC %d is an 8272A\n", fdc);
+		return FDC_8272A;	/* 8272a/765 don't know DUMPREGS */
+	}
+	if (r != 10) {
+		pr_info("FDC %d init: DUMPREGS: unexpected return of %d bytes.\n",
+			fdc, r);
+		return FDC_UNKNOWN;
+	}
+
+	if (!fdc_configure(fdc)) {
+		pr_info("FDC %d is an 82072\n", fdc);
+		return FDC_82072;	/* 82072 doesn't know CONFIGURE */
+	}
+
+	output_byte(fdc, FD_PERPENDICULAR);
+	if (need_more_output(fdc) == MORE_OUTPUT) {
+		output_byte(fdc, 0);
+	} else {
+		pr_info("FDC %d is an 82072A\n", fdc);
+		return FDC_82072A;	/* 82072A as found on Sparcs. */
+	}
+
+	output_byte(fdc, FD_UNLOCK);
+	r = result(fdc);
+	if ((r == 1) && (reply_buffer[ST0] == 0x80)) {
+		pr_info("FDC %d is a pre-1991 82077\n", fdc);
+		return FDC_82077_ORIG;	/* Pre-1991 82077, doesn't know
+					 * LOCK/UNLOCK */
+	}
+	if ((r != 1) || (reply_buffer[ST0] != 0x00)) {
+		pr_info("FDC %d init: UNLOCK: unexpected return of %d bytes.\n",
+			fdc, r);
+		return FDC_UNKNOWN;
+	}
+	output_byte(fdc, FD_PARTID);
+	r = result(fdc);
+	if (r != 1) {
+		pr_info("FDC %d init: PARTID: unexpected return of %d bytes.\n",
+			fdc, r);
+		return FDC_UNKNOWN;
+	}
+	if (reply_buffer[ST0] == 0x80) {
+		pr_info("FDC %d is a post-1991 82077\n", fdc);
+		return FDC_82077;	/* Revised 82077AA passes all the tests */
+	}
+	switch (reply_buffer[ST0] >> 5) {
+	case 0x0:
+		/* Either a 82078-1 or a 82078SL running at 5Volt */
+		pr_info("FDC %d is an 82078.\n", fdc);
+		return FDC_82078;
+	case 0x1:
+		pr_info("FDC %d is a 44pin 82078\n", fdc);
+		return FDC_82078;
+	case 0x2:
+		pr_info("FDC %d is a S82078B\n", fdc);
+		return FDC_S82078B;
+	case 0x3:
+		pr_info("FDC %d is a National Semiconductor PC87306\n", fdc);
+		return FDC_87306;
+	default:
+		pr_info("FDC %d init: 82078 variant with unknown PARTID=%d.\n",
+			fdc, reply_buffer[ST0] >> 5);
+		return FDC_82078_UNKN;
+	}
+}				/* get_fdc_version */
+
+/* lilo configuration */
+
+static void __init floppy_set_flags(int *ints, int param, int param2)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(default_drive_params); i++) {
+		if (param)
+			default_drive_params[i].params.flags |= param2;
+		else
+			default_drive_params[i].params.flags &= ~param2;
+	}
+	DPRINT("%s flag 0x%x\n", param2 ? "Setting" : "Clearing", param);
+}
+
+static void __init daring(int *ints, int param, int param2)
+{
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(default_drive_params); i++) {
+		if (param) {
+			default_drive_params[i].params.select_delay = 0;
+			default_drive_params[i].params.flags |=
+			    FD_SILENT_DCL_CLEAR;
+		} else {
+			default_drive_params[i].params.select_delay =
+			    2 * HZ / 100;
+			default_drive_params[i].params.flags &=
+			    ~FD_SILENT_DCL_CLEAR;
+		}
+	}
+	DPRINT("Assuming %s floppy hardware\n", param ? "standard" : "broken");
+}
+
+static void __init set_cmos(int *ints, int dummy, int dummy2)
+{
+	int current_drive = 0;
+
+	if (ints[0] != 2) {
+		DPRINT("wrong number of parameters for CMOS\n");
+		return;
+	}
+	current_drive = ints[1];
+	if (current_drive < 0 || current_drive >= 8) {
+		DPRINT("bad drive for set_cmos\n");
+		return;
+	}
+#if N_FDC > 1
+	if (current_drive >= 4 && !FDC2)
+		FDC2 = 0x370;
+#endif
+	drive_params[current_drive].cmos = ints[2];
+	DPRINT("setting CMOS code to %d\n", ints[2]);
+}
+
+static struct param_table {
+	const char *name;
+	void (*fn) (int *ints, int param, int param2);
+	int *var;
+	int def_param;
+	int param2;
+} config_params[] __initdata = {
+	{"allowed_drive_mask", NULL, &allowed_drive_mask, 0xff, 0}, /* obsolete */
+	{"all_drives", NULL, &allowed_drive_mask, 0xff, 0},	/* obsolete */
+	{"asus_pci", NULL, &allowed_drive_mask, 0x33, 0},
+	{"irq", NULL, &FLOPPY_IRQ, 6, 0},
+	{"dma", NULL, &FLOPPY_DMA, 2, 0},
+	{"daring", daring, NULL, 1, 0},
+#if N_FDC > 1
+	{"two_fdc", NULL, &FDC2, 0x370, 0},
+	{"one_fdc", NULL, &FDC2, 0, 0},
+#endif
+	{"thinkpad", floppy_set_flags, NULL, 1, FD_INVERTED_DCL},
+	{"broken_dcl", floppy_set_flags, NULL, 1, FD_BROKEN_DCL},
+	{"messages", floppy_set_flags, NULL, 1, FTD_MSG},
+	{"silent_dcl_clear", floppy_set_flags, NULL, 1, FD_SILENT_DCL_CLEAR},
+	{"debug", floppy_set_flags, NULL, 1, FD_DEBUG},
+	{"nodma", NULL, &can_use_virtual_dma, 1, 0},
+	{"omnibook", NULL, &can_use_virtual_dma, 1, 0},
+	{"yesdma", NULL, &can_use_virtual_dma, 0, 0},
+	{"fifo_depth", NULL, &fifo_depth, 0xa, 0},
+	{"nofifo", NULL, &no_fifo, 0x20, 0},
+	{"usefifo", NULL, &no_fifo, 0, 0},
+	{"cmos", set_cmos, NULL, 0, 0},
+	{"slow", NULL, &slow_floppy, 1, 0},
+	{"unexpected_interrupts", NULL, &print_unex, 1, 0},
+	{"no_unexpected_interrupts", NULL, &print_unex, 0, 0},
+	{"L40SX", NULL, &print_unex, 0, 0}
+
+	EXTRA_FLOPPY_PARAMS
+};
+
+static int __init floppy_setup(char *str)
+{
+	int i;
+	int param;
+	int ints[11];
+
+	str = get_options(str, ARRAY_SIZE(ints), ints);
+	if (str) {
+		for (i = 0; i < ARRAY_SIZE(config_params); i++) {
+			if (strcmp(str, config_params[i].name) == 0) {
+				if (ints[0])
+					param = ints[1];
+				else
+					param = config_params[i].def_param;
+				if (config_params[i].fn)
+					config_params[i].fn(ints, param,
+							    config_params[i].
+							    param2);
+				if (config_params[i].var) {
+					DPRINT("%s=%d\n", str, param);
+					*config_params[i].var = param;
+				}
+				return 1;
+			}
+		}
+	}
+	if (str) {
+		DPRINT("unknown floppy option [%s]\n", str);
+
+		DPRINT("allowed options are:");
+		for (i = 0; i < ARRAY_SIZE(config_params); i++)
+			pr_cont(" %s", config_params[i].name);
+		pr_cont("\n");
+	} else
+		DPRINT("botched floppy option\n");
+	DPRINT("Read Documentation/admin-guide/blockdev/floppy.rst\n");
+	return 0;
+}
+
+static int have_no_fdc = -ENODEV;
+
+static ssize_t floppy_cmos_show(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct platform_device *p = to_platform_device(dev);
+	int drive;
+
+	drive = p->id;
+	return sprintf(buf, "%X\n", drive_params[drive].cmos);
+}
+
+static DEVICE_ATTR(cmos, 0444, floppy_cmos_show, NULL);
+
+static struct attribute *floppy_dev_attrs[] = {
+	&dev_attr_cmos.attr,
+	NULL
+};
+
+ATTRIBUTE_GROUPS(floppy_dev);
+
+static void floppy_device_release(struct device *dev)
+{
+}
+
+static int floppy_resume(struct device *dev)
+{
+	int fdc;
+	int saved_drive;
+
+	saved_drive = current_drive;
+	for (fdc = 0; fdc < N_FDC; fdc++)
+		if (fdc_state[fdc].address != -1)
+			user_reset_fdc(REVDRIVE(fdc, 0), FD_RESET_ALWAYS, false);
+	set_fdc(saved_drive);
+	return 0;
+}
+
+static const struct dev_pm_ops floppy_pm_ops = {
+	.resume = floppy_resume,
+	.restore = floppy_resume,
+};
+
+static struct platform_driver floppy_driver = {
+	.driver = {
+		   .name = "floppy",
+		   .pm = &floppy_pm_ops,
+	},
+};
+
+static const struct blk_mq_ops floppy_mq_ops = {
+	.queue_rq = floppy_queue_rq,
+};
+
+static struct platform_device floppy_device[N_DRIVE];
+static bool registered[N_DRIVE];
+
+static bool floppy_available(int drive)
+{
+	if (!(allowed_drive_mask & (1 << drive)))
+		return false;
+	if (fdc_state[FDC(drive)].version == FDC_NONE)
+		return false;
+	return true;
+}
+
+static int floppy_alloc_disk(unsigned int drive, unsigned int type)
+{
+	struct gendisk *disk;
+
+	disk = blk_mq_alloc_disk(&tag_sets[drive], NULL);
+	if (IS_ERR(disk))
+		return PTR_ERR(disk);
+
+	blk_queue_max_hw_sectors(disk->queue, 64);
+	disk->major = FLOPPY_MAJOR;
+	disk->first_minor = TOMINOR(drive) | (type << 2);
+	disk->minors = 1;
+	disk->fops = &floppy_fops;
+	disk->flags |= GENHD_FL_NO_PART;
+	disk->events = DISK_EVENT_MEDIA_CHANGE;
+	if (type)
+		sprintf(disk->disk_name, "fd%d_type%d", drive, type);
+	else
+		sprintf(disk->disk_name, "fd%d", drive);
+	/* to be cleaned up... */
+	disk->private_data = (void *)(long)drive;
+	disk->flags |= GENHD_FL_REMOVABLE;
+
+	disks[drive][type] = disk;
+	return 0;
+}
+
+static DEFINE_MUTEX(floppy_probe_lock);
+
+static void floppy_probe(dev_t dev)
+{
+	unsigned int drive = (MINOR(dev) & 3) | ((MINOR(dev) & 0x80) >> 5);
+	unsigned int type = (MINOR(dev) >> 2) & 0x1f;
+
+	if (drive >= N_DRIVE || !floppy_available(drive) ||
+	    type >= ARRAY_SIZE(floppy_type))
+		return;
+
+	mutex_lock(&floppy_probe_lock);
+	if (disks[drive][type])
+		goto out;
+	if (floppy_alloc_disk(drive, type))
+		goto out;
+	if (add_disk(disks[drive][type]))
+		goto cleanup_disk;
+out:
+	mutex_unlock(&floppy_probe_lock);
+	return;
+
+cleanup_disk:
+	put_disk(disks[drive][type]);
+	disks[drive][type] = NULL;
+	mutex_unlock(&floppy_probe_lock);
+}
+
+static int __init do_floppy_init(void)
+{
+	int i, unit, drive, err;
+
+	set_debugt();
+	interruptjiffies = resultjiffies = jiffies;
+
+#if defined(CONFIG_PPC)
+	if (check_legacy_ioport(FDC1))
+		return -ENODEV;
+#endif
+
+	raw_cmd = NULL;
+
+	floppy_wq = alloc_ordered_workqueue("floppy", 0);
+	if (!floppy_wq)
+		return -ENOMEM;
+
+	for (drive = 0; drive < N_DRIVE; drive++) {
+		memset(&tag_sets[drive], 0, sizeof(tag_sets[drive]));
+		tag_sets[drive].ops = &floppy_mq_ops;
+		tag_sets[drive].nr_hw_queues = 1;
+		tag_sets[drive].nr_maps = 1;
+		tag_sets[drive].queue_depth = 2;
+		tag_sets[drive].numa_node = NUMA_NO_NODE;
+		tag_sets[drive].flags = BLK_MQ_F_SHOULD_MERGE;
+		err = blk_mq_alloc_tag_set(&tag_sets[drive]);
+		if (err)
+			goto out_put_disk;
+
+		err = floppy_alloc_disk(drive, 0);
+		if (err) {
+			blk_mq_free_tag_set(&tag_sets[drive]);
+			goto out_put_disk;
+		}
+
+		timer_setup(&motor_off_timer[drive], motor_off_callback, 0);
+	}
+
+	err = __register_blkdev(FLOPPY_MAJOR, "fd", floppy_probe);
+	if (err)
+		goto out_put_disk;
+
+	err = platform_driver_register(&floppy_driver);
+	if (err)
+		goto out_unreg_blkdev;
+
+	for (i = 0; i < 256; i++)
+		if (ITYPE(i))
+			floppy_sizes[i] = floppy_type[ITYPE(i)].size;
+		else
+			floppy_sizes[i] = MAX_DISK_SIZE << 1;
+
+	reschedule_timeout(MAXTIMEOUT, "floppy init");
+	config_types();
+
+	for (i = 0; i < N_FDC; i++) {
+		memset(&fdc_state[i], 0, sizeof(*fdc_state));
+		fdc_state[i].dtr = -1;
+		fdc_state[i].dor = 0x4;
+#if defined(__sparc__) || defined(__mc68000__)
+	/*sparcs/sun3x don't have a DOR reset which we can fall back on to */
+#ifdef __mc68000__
+		if (MACH_IS_SUN3X)
+#endif
+			fdc_state[i].version = FDC_82072A;
+#endif
+	}
+
+	use_virtual_dma = can_use_virtual_dma & 1;
+	fdc_state[0].address = FDC1;
+	if (fdc_state[0].address == -1) {
+		cancel_delayed_work(&fd_timeout);
+		err = -ENODEV;
+		goto out_unreg_driver;
+	}
+#if N_FDC > 1
+	fdc_state[1].address = FDC2;
+#endif
+
+	current_fdc = 0;	/* reset fdc in case of unexpected interrupt */
+	err = floppy_grab_irq_and_dma();
+	if (err) {
+		cancel_delayed_work(&fd_timeout);
+		err = -EBUSY;
+		goto out_unreg_driver;
+	}
+
+	/* initialise drive state */
+	for (drive = 0; drive < N_DRIVE; drive++) {
+		memset(&drive_state[drive], 0, sizeof(drive_state[drive]));
+		memset(&write_errors[drive], 0, sizeof(write_errors[drive]));
+		set_bit(FD_DISK_NEWCHANGE_BIT, &drive_state[drive].flags);
+		set_bit(FD_DISK_CHANGED_BIT, &drive_state[drive].flags);
+		set_bit(FD_VERIFY_BIT, &drive_state[drive].flags);
+		drive_state[drive].fd_device = -1;
+		floppy_track_buffer = NULL;
+		max_buffer_sectors = 0;
+	}
+	/*
+	 * Small 10 msec delay to let through any interrupt that
+	 * initialization might have triggered, to not
+	 * confuse detection:
+	 */
+	msleep(10);
+
+	for (i = 0; i < N_FDC; i++) {
+		fdc_state[i].driver_version = FD_DRIVER_VERSION;
+		for (unit = 0; unit < 4; unit++)
+			fdc_state[i].track[unit] = 0;
+		if (fdc_state[i].address == -1)
+			continue;
+		fdc_state[i].rawcmd = 2;
+		if (user_reset_fdc(REVDRIVE(i, 0), FD_RESET_ALWAYS, false)) {
+			/* free ioports reserved by floppy_grab_irq_and_dma() */
+			floppy_release_regions(i);
+			fdc_state[i].address = -1;
+			fdc_state[i].version = FDC_NONE;
+			continue;
+		}
+		/* Try to determine the floppy controller type */
+		fdc_state[i].version = get_fdc_version(i);
+		if (fdc_state[i].version == FDC_NONE) {
+			/* free ioports reserved by floppy_grab_irq_and_dma() */
+			floppy_release_regions(i);
+			fdc_state[i].address = -1;
+			continue;
+		}
+		if (can_use_virtual_dma == 2 &&
+		    fdc_state[i].version < FDC_82072A)
+			can_use_virtual_dma = 0;
+
+		have_no_fdc = 0;
+		/* Not all FDCs seem to be able to handle the version command
+		 * properly, so force a reset for the standard FDC clones,
+		 * to avoid interrupt garbage.
+		 */
+		user_reset_fdc(REVDRIVE(i, 0), FD_RESET_ALWAYS, false);
+	}
+	current_fdc = 0;
+	cancel_delayed_work(&fd_timeout);
+	current_drive = 0;
+	initialized = true;
+	if (have_no_fdc) {
+		DPRINT("no floppy controllers found\n");
+		err = have_no_fdc;
+		goto out_release_dma;
+	}
+
+	for (drive = 0; drive < N_DRIVE; drive++) {
+		if (!floppy_available(drive))
+			continue;
+
+		floppy_device[drive].name = floppy_device_name;
+		floppy_device[drive].id = drive;
+		floppy_device[drive].dev.release = floppy_device_release;
+		floppy_device[drive].dev.groups = floppy_dev_groups;
+
+		err = platform_device_register(&floppy_device[drive]);
+		if (err)
+			goto out_remove_drives;
+
+		registered[drive] = true;
+
+		err = device_add_disk(&floppy_device[drive].dev,
+				      disks[drive][0], NULL);
+		if (err)
+			goto out_remove_drives;
+	}
+
+	return 0;
+
+out_remove_drives:
+	while (drive--) {
+		if (floppy_available(drive)) {
+			del_gendisk(disks[drive][0]);
+			if (registered[drive])
+				platform_device_unregister(&floppy_device[drive]);
+		}
+	}
+out_release_dma:
+	if (atomic_read(&usage_count))
+		floppy_release_irq_and_dma();
+out_unreg_driver:
+	platform_driver_unregister(&floppy_driver);
+out_unreg_blkdev:
+	unregister_blkdev(FLOPPY_MAJOR, "fd");
+out_put_disk:
+	destroy_workqueue(floppy_wq);
+	for (drive = 0; drive < N_DRIVE; drive++) {
+		if (!disks[drive][0])
+			break;
+		del_timer_sync(&motor_off_timer[drive]);
+		put_disk(disks[drive][0]);
+		blk_mq_free_tag_set(&tag_sets[drive]);
+	}
+	return err;
+}
+
+#ifndef MODULE
+static __init void floppy_async_init(void *data, async_cookie_t cookie)
+{
+	do_floppy_init();
+}
+#endif
+
+static int __init floppy_init(void)
+{
+#ifdef MODULE
+	return do_floppy_init();
+#else
+	/* Don't hold up the bootup by the floppy initialization */
+	async_schedule(floppy_async_init, NULL);
+	return 0;
+#endif
+}
+
+static const struct io_region {
+	int offset;
+	int size;
+} io_regions[] = {
+	{ 2, 1 },
+	/* address + 3 is sometimes reserved by pnp bios for motherboard */
+	{ 4, 2 },
+	/* address + 6 is reserved, and may be taken by IDE.
+	 * Unfortunately, Adaptec doesn't know this :-(, */
+	{ 7, 1 },
+};
+
+static void floppy_release_allocated_regions(int fdc, const struct io_region *p)
+{
+	while (p != io_regions) {
+		p--;
+		release_region(fdc_state[fdc].address + p->offset, p->size);
+	}
+}
+
+#define ARRAY_END(X) (&((X)[ARRAY_SIZE(X)]))
+
+static int floppy_request_regions(int fdc)
+{
+	const struct io_region *p;
+
+	for (p = io_regions; p < ARRAY_END(io_regions); p++) {
+		if (!request_region(fdc_state[fdc].address + p->offset,
+				    p->size, "floppy")) {
+			DPRINT("Floppy io-port 0x%04lx in use\n",
+			       fdc_state[fdc].address + p->offset);
+			floppy_release_allocated_regions(fdc, p);
+			return -EBUSY;
+		}
+	}
+	return 0;
+}
+
+static void floppy_release_regions(int fdc)
+{
+	floppy_release_allocated_regions(fdc, ARRAY_END(io_regions));
+}
+
+static int floppy_grab_irq_and_dma(void)
+{
+	int fdc;
+
+	if (atomic_inc_return(&usage_count) > 1)
+		return 0;
+
+	/*
+	 * We might have scheduled a free_irq(), wait it to
+	 * drain first:
+	 */
+	flush_workqueue(floppy_wq);
+
+	if (fd_request_irq()) {
+		DPRINT("Unable to grab IRQ%d for the floppy driver\n",
+		       FLOPPY_IRQ);
+		atomic_dec(&usage_count);
+		return -1;
+	}
+	if (fd_request_dma()) {
+		DPRINT("Unable to grab DMA%d for the floppy driver\n",
+		       FLOPPY_DMA);
+		if (can_use_virtual_dma & 2)
+			use_virtual_dma = can_use_virtual_dma = 1;
+		if (!(can_use_virtual_dma & 1)) {
+			fd_free_irq();
+			atomic_dec(&usage_count);
+			return -1;
+		}
+	}
+
+	for (fdc = 0; fdc < N_FDC; fdc++) {
+		if (fdc_state[fdc].address != -1) {
+			if (floppy_request_regions(fdc))
+				goto cleanup;
+		}
+	}
+	for (fdc = 0; fdc < N_FDC; fdc++) {
+		if (fdc_state[fdc].address != -1) {
+			reset_fdc_info(fdc, 1);
+			fdc_outb(fdc_state[fdc].dor, fdc, FD_DOR);
+		}
+	}
+
+	set_dor(0, ~0, 8);	/* avoid immediate interrupt */
+
+	for (fdc = 0; fdc < N_FDC; fdc++)
+		if (fdc_state[fdc].address != -1)
+			fdc_outb(fdc_state[fdc].dor, fdc, FD_DOR);
+	/*
+	 * The driver will try and free resources and relies on us
+	 * to know if they were allocated or not.
+	 */
+	current_fdc = 0;
+	irqdma_allocated = 1;
+	return 0;
+cleanup:
+	fd_free_irq();
+	fd_free_dma();
+	while (--fdc >= 0)
+		floppy_release_regions(fdc);
+	current_fdc = 0;
+	atomic_dec(&usage_count);
+	return -1;
+}
+
+static void floppy_release_irq_and_dma(void)
+{
+	int fdc;
+#ifndef __sparc__
+	int drive;
+#endif
+	long tmpsize;
+	unsigned long tmpaddr;
+
+	if (!atomic_dec_and_test(&usage_count))
+		return;
+
+	if (irqdma_allocated) {
+		fd_disable_dma();
+		fd_free_dma();
+		fd_free_irq();
+		irqdma_allocated = 0;
+	}
+	set_dor(0, ~0, 8);
+#if N_FDC > 1
+	set_dor(1, ~8, 0);
+#endif
+
+	if (floppy_track_buffer && max_buffer_sectors) {
+		tmpsize = max_buffer_sectors * 1024;
+		tmpaddr = (unsigned long)floppy_track_buffer;
+		floppy_track_buffer = NULL;
+		max_buffer_sectors = 0;
+		buffer_min = buffer_max = -1;
+		fd_dma_mem_free(tmpaddr, tmpsize);
+	}
+#ifndef __sparc__
+	for (drive = 0; drive < N_FDC * 4; drive++)
+		if (timer_pending(motor_off_timer + drive))
+			pr_info("motor off timer %d still active\n", drive);
+#endif
+
+	if (delayed_work_pending(&fd_timeout))
+		pr_info("floppy timer still active:%s\n", timeout_message);
+	if (delayed_work_pending(&fd_timer))
+		pr_info("auxiliary floppy timer still active\n");
+	if (work_pending(&floppy_work))
+		pr_info("work still pending\n");
+	for (fdc = 0; fdc < N_FDC; fdc++)
+		if (fdc_state[fdc].address != -1)
+			floppy_release_regions(fdc);
+}
+
+#ifdef MODULE
+
+static char *floppy;
+
+static void __init parse_floppy_cfg_string(char *cfg)
+{
+	char *ptr;
+
+	while (*cfg) {
+		ptr = cfg;
+		while (*cfg && *cfg != ' ' && *cfg != '\t')
+			cfg++;
+		if (*cfg) {
+			*cfg = '\0';
+			cfg++;
+		}
+		if (*ptr)
+			floppy_setup(ptr);
+	}
+}
+
+static int __init floppy_module_init(void)
+{
+	if (floppy)
+		parse_floppy_cfg_string(floppy);
+	return floppy_init();
+}
+module_init(floppy_module_init);
+
+static void __exit floppy_module_exit(void)
+{
+	int drive, i;
+
+	unregister_blkdev(FLOPPY_MAJOR, "fd");
+	platform_driver_unregister(&floppy_driver);
+
+	destroy_workqueue(floppy_wq);
+
+	for (drive = 0; drive < N_DRIVE; drive++) {
+		del_timer_sync(&motor_off_timer[drive]);
+
+		if (floppy_available(drive)) {
+			for (i = 0; i < ARRAY_SIZE(floppy_type); i++) {
+				if (disks[drive][i])
+					del_gendisk(disks[drive][i]);
+			}
+			if (registered[drive])
+				platform_device_unregister(&floppy_device[drive]);
+		}
+		for (i = 0; i < ARRAY_SIZE(floppy_type); i++) {
+			if (disks[drive][i])
+				put_disk(disks[drive][i]);
+		}
+		blk_mq_free_tag_set(&tag_sets[drive]);
+	}
+
+	cancel_delayed_work_sync(&fd_timeout);
+	cancel_delayed_work_sync(&fd_timer);
+
+	if (atomic_read(&usage_count))
+		floppy_release_irq_and_dma();
+
+	/* eject disk, if any */
+	fd_eject(0);
+}
+
+module_exit(floppy_module_exit);
+
+module_param(floppy, charp, 0);
+module_param(FLOPPY_IRQ, int, 0);
+module_param(FLOPPY_DMA, int, 0);
+MODULE_AUTHOR("Alain L. Knaff");
+MODULE_LICENSE("GPL");
+
+/* This doesn't actually get used other than for module information */
+static const struct pnp_device_id floppy_pnpids[] = {
+	{"PNP0700", 0},
+	{}
+};
+
+MODULE_DEVICE_TABLE(pnp, floppy_pnpids);
+
+#else
+
+__setup("floppy=", floppy_setup);
+module_init(floppy_init)
+#endif
+
+MODULE_ALIAS_BLOCKDEV_MAJOR(FLOPPY_MAJOR);
diff --git a/drivers/block/loop.c b/drivers/block/loop.c
new file mode 100644
index 000000000..12ff6f58b
--- /dev/null
+++ b/drivers/block/loop.c
@@ -0,0 +1,2318 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright 1993 by Theodore Ts'o.
+ */
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/major.h>
+#include <linux/wait.h>
+#include <linux/blkpg.h>
+#include <linux/init.h>
+#include <linux/swap.h>
+#include <linux/slab.h>
+#include <linux/compat.h>
+#include <linux/suspend.h>
+#include <linux/freezer.h>
+#include <linux/mutex.h>
+#include <linux/writeback.h>
+#include <linux/completion.h>
+#include <linux/highmem.h>
+#include <linux/splice.h>
+#include <linux/sysfs.h>
+#include <linux/miscdevice.h>
+#include <linux/falloc.h>
+#include <linux/uio.h>
+#include <linux/ioprio.h>
+#include <linux/blk-cgroup.h>
+#include <linux/sched/mm.h>
+#include <linux/statfs.h>
+#include <linux/uaccess.h>
+#include <linux/blk-mq.h>
+#include <linux/spinlock.h>
+#include <uapi/linux/loop.h>
+
+/* Possible states of device */
+enum {
+	Lo_unbound,
+	Lo_bound,
+	Lo_rundown,
+	Lo_deleting,
+};
+
+struct loop_func_table;
+
+struct loop_device {
+	int		lo_number;
+	loff_t		lo_offset;
+	loff_t		lo_sizelimit;
+	int		lo_flags;
+	char		lo_file_name[LO_NAME_SIZE];
+
+	struct file *	lo_backing_file;
+	struct block_device *lo_device;
+
+	gfp_t		old_gfp_mask;
+
+	spinlock_t		lo_lock;
+	int			lo_state;
+	spinlock_t              lo_work_lock;
+	struct workqueue_struct *workqueue;
+	struct work_struct      rootcg_work;
+	struct list_head        rootcg_cmd_list;
+	struct list_head        idle_worker_list;
+	struct rb_root          worker_tree;
+	struct timer_list       timer;
+	bool			use_dio;
+	bool			sysfs_inited;
+
+	struct request_queue	*lo_queue;
+	struct blk_mq_tag_set	tag_set;
+	struct gendisk		*lo_disk;
+	struct mutex		lo_mutex;
+	bool			idr_visible;
+};
+
+struct loop_cmd {
+	struct list_head list_entry;
+	bool use_aio; /* use AIO interface to handle I/O */
+	atomic_t ref; /* only for aio */
+	long ret;
+	struct kiocb iocb;
+	struct bio_vec *bvec;
+	struct cgroup_subsys_state *blkcg_css;
+	struct cgroup_subsys_state *memcg_css;
+};
+
+#define LOOP_IDLE_WORKER_TIMEOUT (60 * HZ)
+#define LOOP_DEFAULT_HW_Q_DEPTH (128)
+
+static DEFINE_IDR(loop_index_idr);
+static DEFINE_MUTEX(loop_ctl_mutex);
+static DEFINE_MUTEX(loop_validate_mutex);
+
+/**
+ * loop_global_lock_killable() - take locks for safe loop_validate_file() test
+ *
+ * @lo: struct loop_device
+ * @global: true if @lo is about to bind another "struct loop_device", false otherwise
+ *
+ * Returns 0 on success, -EINTR otherwise.
+ *
+ * Since loop_validate_file() traverses on other "struct loop_device" if
+ * is_loop_device() is true, we need a global lock for serializing concurrent
+ * loop_configure()/loop_change_fd()/__loop_clr_fd() calls.
+ */
+static int loop_global_lock_killable(struct loop_device *lo, bool global)
+{
+	int err;
+
+	if (global) {
+		err = mutex_lock_killable(&loop_validate_mutex);
+		if (err)
+			return err;
+	}
+	err = mutex_lock_killable(&lo->lo_mutex);
+	if (err && global)
+		mutex_unlock(&loop_validate_mutex);
+	return err;
+}
+
+/**
+ * loop_global_unlock() - release locks taken by loop_global_lock_killable()
+ *
+ * @lo: struct loop_device
+ * @global: true if @lo was about to bind another "struct loop_device", false otherwise
+ */
+static void loop_global_unlock(struct loop_device *lo, bool global)
+{
+	mutex_unlock(&lo->lo_mutex);
+	if (global)
+		mutex_unlock(&loop_validate_mutex);
+}
+
+static int max_part;
+static int part_shift;
+
+static loff_t get_size(loff_t offset, loff_t sizelimit, struct file *file)
+{
+	loff_t loopsize;
+
+	/* Compute loopsize in bytes */
+	loopsize = i_size_read(file->f_mapping->host);
+	if (offset > 0)
+		loopsize -= offset;
+	/* offset is beyond i_size, weird but possible */
+	if (loopsize < 0)
+		return 0;
+
+	if (sizelimit > 0 && sizelimit < loopsize)
+		loopsize = sizelimit;
+	/*
+	 * Unfortunately, if we want to do I/O on the device,
+	 * the number of 512-byte sectors has to fit into a sector_t.
+	 */
+	return loopsize >> 9;
+}
+
+static loff_t get_loop_size(struct loop_device *lo, struct file *file)
+{
+	return get_size(lo->lo_offset, lo->lo_sizelimit, file);
+}
+
+/*
+ * We support direct I/O only if lo_offset is aligned with the logical I/O size
+ * of backing device, and the logical block size of loop is bigger than that of
+ * the backing device.
+ */
+static bool lo_bdev_can_use_dio(struct loop_device *lo,
+		struct block_device *backing_bdev)
+{
+	unsigned short sb_bsize = bdev_logical_block_size(backing_bdev);
+
+	if (queue_logical_block_size(lo->lo_queue) < sb_bsize)
+		return false;
+	if (lo->lo_offset & (sb_bsize - 1))
+		return false;
+	return true;
+}
+
+static void __loop_update_dio(struct loop_device *lo, bool dio)
+{
+	struct file *file = lo->lo_backing_file;
+	struct inode *inode = file->f_mapping->host;
+	struct block_device *backing_bdev = NULL;
+	bool use_dio;
+
+	if (S_ISBLK(inode->i_mode))
+		backing_bdev = I_BDEV(inode);
+	else if (inode->i_sb->s_bdev)
+		backing_bdev = inode->i_sb->s_bdev;
+
+	use_dio = dio && (file->f_mode & FMODE_CAN_ODIRECT) &&
+		(!backing_bdev || lo_bdev_can_use_dio(lo, backing_bdev));
+
+	if (lo->use_dio == use_dio)
+		return;
+
+	/* flush dirty pages before changing direct IO */
+	vfs_fsync(file, 0);
+
+	/*
+	 * The flag of LO_FLAGS_DIRECT_IO is handled similarly with
+	 * LO_FLAGS_READ_ONLY, both are set from kernel, and losetup
+	 * will get updated by ioctl(LOOP_GET_STATUS)
+	 */
+	if (lo->lo_state == Lo_bound)
+		blk_mq_freeze_queue(lo->lo_queue);
+	lo->use_dio = use_dio;
+	if (use_dio) {
+		blk_queue_flag_clear(QUEUE_FLAG_NOMERGES, lo->lo_queue);
+		lo->lo_flags |= LO_FLAGS_DIRECT_IO;
+	} else {
+		blk_queue_flag_set(QUEUE_FLAG_NOMERGES, lo->lo_queue);
+		lo->lo_flags &= ~LO_FLAGS_DIRECT_IO;
+	}
+	if (lo->lo_state == Lo_bound)
+		blk_mq_unfreeze_queue(lo->lo_queue);
+}
+
+/**
+ * loop_set_size() - sets device size and notifies userspace
+ * @lo: struct loop_device to set the size for
+ * @size: new size of the loop device
+ *
+ * Callers must validate that the size passed into this function fits into
+ * a sector_t, eg using loop_validate_size()
+ */
+static void loop_set_size(struct loop_device *lo, loff_t size)
+{
+	if (!set_capacity_and_notify(lo->lo_disk, size))
+		kobject_uevent(&disk_to_dev(lo->lo_disk)->kobj, KOBJ_CHANGE);
+}
+
+static int lo_write_bvec(struct file *file, struct bio_vec *bvec, loff_t *ppos)
+{
+	struct iov_iter i;
+	ssize_t bw;
+
+	iov_iter_bvec(&i, ITER_SOURCE, bvec, 1, bvec->bv_len);
+
+	file_start_write(file);
+	bw = vfs_iter_write(file, &i, ppos, 0);
+	file_end_write(file);
+
+	if (likely(bw ==  bvec->bv_len))
+		return 0;
+
+	printk_ratelimited(KERN_ERR
+		"loop: Write error at byte offset %llu, length %i.\n",
+		(unsigned long long)*ppos, bvec->bv_len);
+	if (bw >= 0)
+		bw = -EIO;
+	return bw;
+}
+
+static int lo_write_simple(struct loop_device *lo, struct request *rq,
+		loff_t pos)
+{
+	struct bio_vec bvec;
+	struct req_iterator iter;
+	int ret = 0;
+
+	rq_for_each_segment(bvec, rq, iter) {
+		ret = lo_write_bvec(lo->lo_backing_file, &bvec, &pos);
+		if (ret < 0)
+			break;
+		cond_resched();
+	}
+
+	return ret;
+}
+
+static int lo_read_simple(struct loop_device *lo, struct request *rq,
+		loff_t pos)
+{
+	struct bio_vec bvec;
+	struct req_iterator iter;
+	struct iov_iter i;
+	ssize_t len;
+
+	rq_for_each_segment(bvec, rq, iter) {
+		iov_iter_bvec(&i, ITER_DEST, &bvec, 1, bvec.bv_len);
+		len = vfs_iter_read(lo->lo_backing_file, &i, &pos, 0);
+		if (len < 0)
+			return len;
+
+		flush_dcache_page(bvec.bv_page);
+
+		if (len != bvec.bv_len) {
+			struct bio *bio;
+
+			__rq_for_each_bio(bio, rq)
+				zero_fill_bio(bio);
+			break;
+		}
+		cond_resched();
+	}
+
+	return 0;
+}
+
+static int lo_fallocate(struct loop_device *lo, struct request *rq, loff_t pos,
+			int mode)
+{
+	/*
+	 * We use fallocate to manipulate the space mappings used by the image
+	 * a.k.a. discard/zerorange.
+	 */
+	struct file *file = lo->lo_backing_file;
+	int ret;
+
+	mode |= FALLOC_FL_KEEP_SIZE;
+
+	if (!bdev_max_discard_sectors(lo->lo_device))
+		return -EOPNOTSUPP;
+
+	ret = file->f_op->fallocate(file, mode, pos, blk_rq_bytes(rq));
+	if (unlikely(ret && ret != -EINVAL && ret != -EOPNOTSUPP))
+		return -EIO;
+	return ret;
+}
+
+static int lo_req_flush(struct loop_device *lo, struct request *rq)
+{
+	int ret = vfs_fsync(lo->lo_backing_file, 0);
+	if (unlikely(ret && ret != -EINVAL))
+		ret = -EIO;
+
+	return ret;
+}
+
+static void lo_complete_rq(struct request *rq)
+{
+	struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
+	blk_status_t ret = BLK_STS_OK;
+
+	if (!cmd->use_aio || cmd->ret < 0 || cmd->ret == blk_rq_bytes(rq) ||
+	    req_op(rq) != REQ_OP_READ) {
+		if (cmd->ret < 0)
+			ret = errno_to_blk_status(cmd->ret);
+		goto end_io;
+	}
+
+	/*
+	 * Short READ - if we got some data, advance our request and
+	 * retry it. If we got no data, end the rest with EIO.
+	 */
+	if (cmd->ret) {
+		blk_update_request(rq, BLK_STS_OK, cmd->ret);
+		cmd->ret = 0;
+		blk_mq_requeue_request(rq, true);
+	} else {
+		if (cmd->use_aio) {
+			struct bio *bio = rq->bio;
+
+			while (bio) {
+				zero_fill_bio(bio);
+				bio = bio->bi_next;
+			}
+		}
+		ret = BLK_STS_IOERR;
+end_io:
+		blk_mq_end_request(rq, ret);
+	}
+}
+
+static void lo_rw_aio_do_completion(struct loop_cmd *cmd)
+{
+	struct request *rq = blk_mq_rq_from_pdu(cmd);
+
+	if (!atomic_dec_and_test(&cmd->ref))
+		return;
+	kfree(cmd->bvec);
+	cmd->bvec = NULL;
+	if (likely(!blk_should_fake_timeout(rq->q)))
+		blk_mq_complete_request(rq);
+}
+
+static void lo_rw_aio_complete(struct kiocb *iocb, long ret)
+{
+	struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb);
+
+	cmd->ret = ret;
+	lo_rw_aio_do_completion(cmd);
+}
+
+static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
+		     loff_t pos, int rw)
+{
+	struct iov_iter iter;
+	struct req_iterator rq_iter;
+	struct bio_vec *bvec;
+	struct request *rq = blk_mq_rq_from_pdu(cmd);
+	struct bio *bio = rq->bio;
+	struct file *file = lo->lo_backing_file;
+	struct bio_vec tmp;
+	unsigned int offset;
+	int nr_bvec = 0;
+	int ret;
+
+	rq_for_each_bvec(tmp, rq, rq_iter)
+		nr_bvec++;
+
+	if (rq->bio != rq->biotail) {
+
+		bvec = kmalloc_array(nr_bvec, sizeof(struct bio_vec),
+				     GFP_NOIO);
+		if (!bvec)
+			return -EIO;
+		cmd->bvec = bvec;
+
+		/*
+		 * The bios of the request may be started from the middle of
+		 * the 'bvec' because of bio splitting, so we can't directly
+		 * copy bio->bi_iov_vec to new bvec. The rq_for_each_bvec
+		 * API will take care of all details for us.
+		 */
+		rq_for_each_bvec(tmp, rq, rq_iter) {
+			*bvec = tmp;
+			bvec++;
+		}
+		bvec = cmd->bvec;
+		offset = 0;
+	} else {
+		/*
+		 * Same here, this bio may be started from the middle of the
+		 * 'bvec' because of bio splitting, so offset from the bvec
+		 * must be passed to iov iterator
+		 */
+		offset = bio->bi_iter.bi_bvec_done;
+		bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
+	}
+	atomic_set(&cmd->ref, 2);
+
+	iov_iter_bvec(&iter, rw, bvec, nr_bvec, blk_rq_bytes(rq));
+	iter.iov_offset = offset;
+
+	cmd->iocb.ki_pos = pos;
+	cmd->iocb.ki_filp = file;
+	cmd->iocb.ki_complete = lo_rw_aio_complete;
+	cmd->iocb.ki_flags = IOCB_DIRECT;
+	cmd->iocb.ki_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0);
+
+	if (rw == ITER_SOURCE)
+		ret = call_write_iter(file, &cmd->iocb, &iter);
+	else
+		ret = call_read_iter(file, &cmd->iocb, &iter);
+
+	lo_rw_aio_do_completion(cmd);
+
+	if (ret != -EIOCBQUEUED)
+		lo_rw_aio_complete(&cmd->iocb, ret);
+	return 0;
+}
+
+static int do_req_filebacked(struct loop_device *lo, struct request *rq)
+{
+	struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
+	loff_t pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset;
+
+	/*
+	 * lo_write_simple and lo_read_simple should have been covered
+	 * by io submit style function like lo_rw_aio(), one blocker
+	 * is that lo_read_simple() need to call flush_dcache_page after
+	 * the page is written from kernel, and it isn't easy to handle
+	 * this in io submit style function which submits all segments
+	 * of the req at one time. And direct read IO doesn't need to
+	 * run flush_dcache_page().
+	 */
+	switch (req_op(rq)) {
+	case REQ_OP_FLUSH:
+		return lo_req_flush(lo, rq);
+	case REQ_OP_WRITE_ZEROES:
+		/*
+		 * If the caller doesn't want deallocation, call zeroout to
+		 * write zeroes the range.  Otherwise, punch them out.
+		 */
+		return lo_fallocate(lo, rq, pos,
+			(rq->cmd_flags & REQ_NOUNMAP) ?
+				FALLOC_FL_ZERO_RANGE :
+				FALLOC_FL_PUNCH_HOLE);
+	case REQ_OP_DISCARD:
+		return lo_fallocate(lo, rq, pos, FALLOC_FL_PUNCH_HOLE);
+	case REQ_OP_WRITE:
+		if (cmd->use_aio)
+			return lo_rw_aio(lo, cmd, pos, ITER_SOURCE);
+		else
+			return lo_write_simple(lo, rq, pos);
+	case REQ_OP_READ:
+		if (cmd->use_aio)
+			return lo_rw_aio(lo, cmd, pos, ITER_DEST);
+		else
+			return lo_read_simple(lo, rq, pos);
+	default:
+		WARN_ON_ONCE(1);
+		return -EIO;
+	}
+}
+
+static inline void loop_update_dio(struct loop_device *lo)
+{
+	__loop_update_dio(lo, (lo->lo_backing_file->f_flags & O_DIRECT) |
+				lo->use_dio);
+}
+
+static void loop_reread_partitions(struct loop_device *lo)
+{
+	int rc;
+
+	mutex_lock(&lo->lo_disk->open_mutex);
+	rc = bdev_disk_changed(lo->lo_disk, false);
+	mutex_unlock(&lo->lo_disk->open_mutex);
+	if (rc)
+		pr_warn("%s: partition scan of loop%d (%s) failed (rc=%d)\n",
+			__func__, lo->lo_number, lo->lo_file_name, rc);
+}
+
+static inline int is_loop_device(struct file *file)
+{
+	struct inode *i = file->f_mapping->host;
+
+	return i && S_ISBLK(i->i_mode) && imajor(i) == LOOP_MAJOR;
+}
+
+static int loop_validate_file(struct file *file, struct block_device *bdev)
+{
+	struct inode	*inode = file->f_mapping->host;
+	struct file	*f = file;
+
+	/* Avoid recursion */
+	while (is_loop_device(f)) {
+		struct loop_device *l;
+
+		lockdep_assert_held(&loop_validate_mutex);
+		if (f->f_mapping->host->i_rdev == bdev->bd_dev)
+			return -EBADF;
+
+		l = I_BDEV(f->f_mapping->host)->bd_disk->private_data;
+		if (l->lo_state != Lo_bound)
+			return -EINVAL;
+		/* Order wrt setting lo->lo_backing_file in loop_configure(). */
+		rmb();
+		f = l->lo_backing_file;
+	}
+	if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode))
+		return -EINVAL;
+	return 0;
+}
+
+/*
+ * loop_change_fd switched the backing store of a loopback device to
+ * a new file. This is useful for operating system installers to free up
+ * the original file and in High Availability environments to switch to
+ * an alternative location for the content in case of server meltdown.
+ * This can only work if the loop device is used read-only, and if the
+ * new backing store is the same size and type as the old backing store.
+ */
+static int loop_change_fd(struct loop_device *lo, struct block_device *bdev,
+			  unsigned int arg)
+{
+	struct file *file = fget(arg);
+	struct file *old_file;
+	int error;
+	bool partscan;
+	bool is_loop;
+
+	if (!file)
+		return -EBADF;
+
+	/* suppress uevents while reconfiguring the device */
+	dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 1);
+
+	is_loop = is_loop_device(file);
+	error = loop_global_lock_killable(lo, is_loop);
+	if (error)
+		goto out_putf;
+	error = -ENXIO;
+	if (lo->lo_state != Lo_bound)
+		goto out_err;
+
+	/* the loop device has to be read-only */
+	error = -EINVAL;
+	if (!(lo->lo_flags & LO_FLAGS_READ_ONLY))
+		goto out_err;
+
+	error = loop_validate_file(file, bdev);
+	if (error)
+		goto out_err;
+
+	old_file = lo->lo_backing_file;
+
+	error = -EINVAL;
+
+	/* size of the new backing store needs to be the same */
+	if (get_loop_size(lo, file) != get_loop_size(lo, old_file))
+		goto out_err;
+
+	/* and ... switch */
+	disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE);
+	blk_mq_freeze_queue(lo->lo_queue);
+	mapping_set_gfp_mask(old_file->f_mapping, lo->old_gfp_mask);
+	lo->lo_backing_file = file;
+	lo->old_gfp_mask = mapping_gfp_mask(file->f_mapping);
+	mapping_set_gfp_mask(file->f_mapping,
+			     lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
+	loop_update_dio(lo);
+	blk_mq_unfreeze_queue(lo->lo_queue);
+	partscan = lo->lo_flags & LO_FLAGS_PARTSCAN;
+	loop_global_unlock(lo, is_loop);
+
+	/*
+	 * Flush loop_validate_file() before fput(), for l->lo_backing_file
+	 * might be pointing at old_file which might be the last reference.
+	 */
+	if (!is_loop) {
+		mutex_lock(&loop_validate_mutex);
+		mutex_unlock(&loop_validate_mutex);
+	}
+	/*
+	 * We must drop file reference outside of lo_mutex as dropping
+	 * the file ref can take open_mutex which creates circular locking
+	 * dependency.
+	 */
+	fput(old_file);
+	if (partscan)
+		loop_reread_partitions(lo);
+
+	error = 0;
+done:
+	/* enable and uncork uevent now that we are done */
+	dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 0);
+	return error;
+
+out_err:
+	loop_global_unlock(lo, is_loop);
+out_putf:
+	fput(file);
+	goto done;
+}
+
+/* loop sysfs attributes */
+
+static ssize_t loop_attr_show(struct device *dev, char *page,
+			      ssize_t (*callback)(struct loop_device *, char *))
+{
+	struct gendisk *disk = dev_to_disk(dev);
+	struct loop_device *lo = disk->private_data;
+
+	return callback(lo, page);
+}
+
+#define LOOP_ATTR_RO(_name)						\
+static ssize_t loop_attr_##_name##_show(struct loop_device *, char *);	\
+static ssize_t loop_attr_do_show_##_name(struct device *d,		\
+				struct device_attribute *attr, char *b)	\
+{									\
+	return loop_attr_show(d, b, loop_attr_##_name##_show);		\
+}									\
+static struct device_attribute loop_attr_##_name =			\
+	__ATTR(_name, 0444, loop_attr_do_show_##_name, NULL);
+
+static ssize_t loop_attr_backing_file_show(struct loop_device *lo, char *buf)
+{
+	ssize_t ret;
+	char *p = NULL;
+
+	spin_lock_irq(&lo->lo_lock);
+	if (lo->lo_backing_file)
+		p = file_path(lo->lo_backing_file, buf, PAGE_SIZE - 1);
+	spin_unlock_irq(&lo->lo_lock);
+
+	if (IS_ERR_OR_NULL(p))
+		ret = PTR_ERR(p);
+	else {
+		ret = strlen(p);
+		memmove(buf, p, ret);
+		buf[ret++] = '\n';
+		buf[ret] = 0;
+	}
+
+	return ret;
+}
+
+static ssize_t loop_attr_offset_show(struct loop_device *lo, char *buf)
+{
+	return sysfs_emit(buf, "%llu\n", (unsigned long long)lo->lo_offset);
+}
+
+static ssize_t loop_attr_sizelimit_show(struct loop_device *lo, char *buf)
+{
+	return sysfs_emit(buf, "%llu\n", (unsigned long long)lo->lo_sizelimit);
+}
+
+static ssize_t loop_attr_autoclear_show(struct loop_device *lo, char *buf)
+{
+	int autoclear = (lo->lo_flags & LO_FLAGS_AUTOCLEAR);
+
+	return sysfs_emit(buf, "%s\n", autoclear ? "1" : "0");
+}
+
+static ssize_t loop_attr_partscan_show(struct loop_device *lo, char *buf)
+{
+	int partscan = (lo->lo_flags & LO_FLAGS_PARTSCAN);
+
+	return sysfs_emit(buf, "%s\n", partscan ? "1" : "0");
+}
+
+static ssize_t loop_attr_dio_show(struct loop_device *lo, char *buf)
+{
+	int dio = (lo->lo_flags & LO_FLAGS_DIRECT_IO);
+
+	return sysfs_emit(buf, "%s\n", dio ? "1" : "0");
+}
+
+LOOP_ATTR_RO(backing_file);
+LOOP_ATTR_RO(offset);
+LOOP_ATTR_RO(sizelimit);
+LOOP_ATTR_RO(autoclear);
+LOOP_ATTR_RO(partscan);
+LOOP_ATTR_RO(dio);
+
+static struct attribute *loop_attrs[] = {
+	&loop_attr_backing_file.attr,
+	&loop_attr_offset.attr,
+	&loop_attr_sizelimit.attr,
+	&loop_attr_autoclear.attr,
+	&loop_attr_partscan.attr,
+	&loop_attr_dio.attr,
+	NULL,
+};
+
+static struct attribute_group loop_attribute_group = {
+	.name = "loop",
+	.attrs= loop_attrs,
+};
+
+static void loop_sysfs_init(struct loop_device *lo)
+{
+	lo->sysfs_inited = !sysfs_create_group(&disk_to_dev(lo->lo_disk)->kobj,
+						&loop_attribute_group);
+}
+
+static void loop_sysfs_exit(struct loop_device *lo)
+{
+	if (lo->sysfs_inited)
+		sysfs_remove_group(&disk_to_dev(lo->lo_disk)->kobj,
+				   &loop_attribute_group);
+}
+
+static void loop_config_discard(struct loop_device *lo)
+{
+	struct file *file = lo->lo_backing_file;
+	struct inode *inode = file->f_mapping->host;
+	struct request_queue *q = lo->lo_queue;
+	u32 granularity, max_discard_sectors;
+
+	/*
+	 * If the backing device is a block device, mirror its zeroing
+	 * capability. Set the discard sectors to the block device's zeroing
+	 * capabilities because loop discards result in blkdev_issue_zeroout(),
+	 * not blkdev_issue_discard(). This maintains consistent behavior with
+	 * file-backed loop devices: discarded regions read back as zero.
+	 */
+	if (S_ISBLK(inode->i_mode)) {
+		struct request_queue *backingq = bdev_get_queue(I_BDEV(inode));
+
+		max_discard_sectors = backingq->limits.max_write_zeroes_sectors;
+		granularity = bdev_discard_granularity(I_BDEV(inode)) ?:
+			queue_physical_block_size(backingq);
+
+	/*
+	 * We use punch hole to reclaim the free space used by the
+	 * image a.k.a. discard.
+	 */
+	} else if (!file->f_op->fallocate) {
+		max_discard_sectors = 0;
+		granularity = 0;
+
+	} else {
+		struct kstatfs sbuf;
+
+		max_discard_sectors = UINT_MAX >> 9;
+		if (!vfs_statfs(&file->f_path, &sbuf))
+			granularity = sbuf.f_bsize;
+		else
+			max_discard_sectors = 0;
+	}
+
+	if (max_discard_sectors) {
+		q->limits.discard_granularity = granularity;
+		blk_queue_max_discard_sectors(q, max_discard_sectors);
+		blk_queue_max_write_zeroes_sectors(q, max_discard_sectors);
+	} else {
+		q->limits.discard_granularity = 0;
+		blk_queue_max_discard_sectors(q, 0);
+		blk_queue_max_write_zeroes_sectors(q, 0);
+	}
+}
+
+struct loop_worker {
+	struct rb_node rb_node;
+	struct work_struct work;
+	struct list_head cmd_list;
+	struct list_head idle_list;
+	struct loop_device *lo;
+	struct cgroup_subsys_state *blkcg_css;
+	unsigned long last_ran_at;
+};
+
+static void loop_workfn(struct work_struct *work);
+
+#ifdef CONFIG_BLK_CGROUP
+static inline int queue_on_root_worker(struct cgroup_subsys_state *css)
+{
+	return !css || css == blkcg_root_css;
+}
+#else
+static inline int queue_on_root_worker(struct cgroup_subsys_state *css)
+{
+	return !css;
+}
+#endif
+
+static void loop_queue_work(struct loop_device *lo, struct loop_cmd *cmd)
+{
+	struct rb_node **node, *parent = NULL;
+	struct loop_worker *cur_worker, *worker = NULL;
+	struct work_struct *work;
+	struct list_head *cmd_list;
+
+	spin_lock_irq(&lo->lo_work_lock);
+
+	if (queue_on_root_worker(cmd->blkcg_css))
+		goto queue_work;
+
+	node = &lo->worker_tree.rb_node;
+
+	while (*node) {
+		parent = *node;
+		cur_worker = container_of(*node, struct loop_worker, rb_node);
+		if (cur_worker->blkcg_css == cmd->blkcg_css) {
+			worker = cur_worker;
+			break;
+		} else if ((long)cur_worker->blkcg_css < (long)cmd->blkcg_css) {
+			node = &(*node)->rb_left;
+		} else {
+			node = &(*node)->rb_right;
+		}
+	}
+	if (worker)
+		goto queue_work;
+
+	worker = kzalloc(sizeof(struct loop_worker), GFP_NOWAIT | __GFP_NOWARN);
+	/*
+	 * In the event we cannot allocate a worker, just queue on the
+	 * rootcg worker and issue the I/O as the rootcg
+	 */
+	if (!worker) {
+		cmd->blkcg_css = NULL;
+		if (cmd->memcg_css)
+			css_put(cmd->memcg_css);
+		cmd->memcg_css = NULL;
+		goto queue_work;
+	}
+
+	worker->blkcg_css = cmd->blkcg_css;
+	css_get(worker->blkcg_css);
+	INIT_WORK(&worker->work, loop_workfn);
+	INIT_LIST_HEAD(&worker->cmd_list);
+	INIT_LIST_HEAD(&worker->idle_list);
+	worker->lo = lo;
+	rb_link_node(&worker->rb_node, parent, node);
+	rb_insert_color(&worker->rb_node, &lo->worker_tree);
+queue_work:
+	if (worker) {
+		/*
+		 * We need to remove from the idle list here while
+		 * holding the lock so that the idle timer doesn't
+		 * free the worker
+		 */
+		if (!list_empty(&worker->idle_list))
+			list_del_init(&worker->idle_list);
+		work = &worker->work;
+		cmd_list = &worker->cmd_list;
+	} else {
+		work = &lo->rootcg_work;
+		cmd_list = &lo->rootcg_cmd_list;
+	}
+	list_add_tail(&cmd->list_entry, cmd_list);
+	queue_work(lo->workqueue, work);
+	spin_unlock_irq(&lo->lo_work_lock);
+}
+
+static void loop_set_timer(struct loop_device *lo)
+{
+	timer_reduce(&lo->timer, jiffies + LOOP_IDLE_WORKER_TIMEOUT);
+}
+
+static void loop_free_idle_workers(struct loop_device *lo, bool delete_all)
+{
+	struct loop_worker *pos, *worker;
+
+	spin_lock_irq(&lo->lo_work_lock);
+	list_for_each_entry_safe(worker, pos, &lo->idle_worker_list,
+				idle_list) {
+		if (!delete_all &&
+		    time_is_after_jiffies(worker->last_ran_at +
+					  LOOP_IDLE_WORKER_TIMEOUT))
+			break;
+		list_del(&worker->idle_list);
+		rb_erase(&worker->rb_node, &lo->worker_tree);
+		css_put(worker->blkcg_css);
+		kfree(worker);
+	}
+	if (!list_empty(&lo->idle_worker_list))
+		loop_set_timer(lo);
+	spin_unlock_irq(&lo->lo_work_lock);
+}
+
+static void loop_free_idle_workers_timer(struct timer_list *timer)
+{
+	struct loop_device *lo = container_of(timer, struct loop_device, timer);
+
+	return loop_free_idle_workers(lo, false);
+}
+
+static void loop_update_rotational(struct loop_device *lo)
+{
+	struct file *file = lo->lo_backing_file;
+	struct inode *file_inode = file->f_mapping->host;
+	struct block_device *file_bdev = file_inode->i_sb->s_bdev;
+	struct request_queue *q = lo->lo_queue;
+	bool nonrot = true;
+
+	/* not all filesystems (e.g. tmpfs) have a sb->s_bdev */
+	if (file_bdev)
+		nonrot = bdev_nonrot(file_bdev);
+
+	if (nonrot)
+		blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
+	else
+		blk_queue_flag_clear(QUEUE_FLAG_NONROT, q);
+}
+
+/**
+ * loop_set_status_from_info - configure device from loop_info
+ * @lo: struct loop_device to configure
+ * @info: struct loop_info64 to configure the device with
+ *
+ * Configures the loop device parameters according to the passed
+ * in loop_info64 configuration.
+ */
+static int
+loop_set_status_from_info(struct loop_device *lo,
+			  const struct loop_info64 *info)
+{
+	if ((unsigned int) info->lo_encrypt_key_size > LO_KEY_SIZE)
+		return -EINVAL;
+
+	switch (info->lo_encrypt_type) {
+	case LO_CRYPT_NONE:
+		break;
+	case LO_CRYPT_XOR:
+		pr_warn("support for the xor transformation has been removed.\n");
+		return -EINVAL;
+	case LO_CRYPT_CRYPTOAPI:
+		pr_warn("support for cryptoloop has been removed.  Use dm-crypt instead.\n");
+		return -EINVAL;
+	default:
+		return -EINVAL;
+	}
+
+	/* Avoid assigning overflow values */
+	if (info->lo_offset > LLONG_MAX || info->lo_sizelimit > LLONG_MAX)
+		return -EOVERFLOW;
+
+	lo->lo_offset = info->lo_offset;
+	lo->lo_sizelimit = info->lo_sizelimit;
+
+	memcpy(lo->lo_file_name, info->lo_file_name, LO_NAME_SIZE);
+	lo->lo_file_name[LO_NAME_SIZE-1] = 0;
+	lo->lo_flags = info->lo_flags;
+	return 0;
+}
+
+static int loop_configure(struct loop_device *lo, fmode_t mode,
+			  struct block_device *bdev,
+			  const struct loop_config *config)
+{
+	struct file *file = fget(config->fd);
+	struct inode *inode;
+	struct address_space *mapping;
+	int error;
+	loff_t size;
+	bool partscan;
+	unsigned short bsize;
+	bool is_loop;
+
+	if (!file)
+		return -EBADF;
+	is_loop = is_loop_device(file);
+
+	/* This is safe, since we have a reference from open(). */
+	__module_get(THIS_MODULE);
+
+	/*
+	 * If we don't hold exclusive handle for the device, upgrade to it
+	 * here to avoid changing device under exclusive owner.
+	 */
+	if (!(mode & FMODE_EXCL)) {
+		error = bd_prepare_to_claim(bdev, loop_configure);
+		if (error)
+			goto out_putf;
+	}
+
+	error = loop_global_lock_killable(lo, is_loop);
+	if (error)
+		goto out_bdev;
+
+	error = -EBUSY;
+	if (lo->lo_state != Lo_unbound)
+		goto out_unlock;
+
+	error = loop_validate_file(file, bdev);
+	if (error)
+		goto out_unlock;
+
+	mapping = file->f_mapping;
+	inode = mapping->host;
+
+	if ((config->info.lo_flags & ~LOOP_CONFIGURE_SETTABLE_FLAGS) != 0) {
+		error = -EINVAL;
+		goto out_unlock;
+	}
+
+	if (config->block_size) {
+		error = blk_validate_block_size(config->block_size);
+		if (error)
+			goto out_unlock;
+	}
+
+	error = loop_set_status_from_info(lo, &config->info);
+	if (error)
+		goto out_unlock;
+
+	if (!(file->f_mode & FMODE_WRITE) || !(mode & FMODE_WRITE) ||
+	    !file->f_op->write_iter)
+		lo->lo_flags |= LO_FLAGS_READ_ONLY;
+
+	if (!lo->workqueue) {
+		lo->workqueue = alloc_workqueue("loop%d",
+						WQ_UNBOUND | WQ_FREEZABLE,
+						0, lo->lo_number);
+		if (!lo->workqueue) {
+			error = -ENOMEM;
+			goto out_unlock;
+		}
+	}
+
+	/* suppress uevents while reconfiguring the device */
+	dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 1);
+
+	disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE);
+	set_disk_ro(lo->lo_disk, (lo->lo_flags & LO_FLAGS_READ_ONLY) != 0);
+
+	lo->use_dio = lo->lo_flags & LO_FLAGS_DIRECT_IO;
+	lo->lo_device = bdev;
+	lo->lo_backing_file = file;
+	lo->old_gfp_mask = mapping_gfp_mask(mapping);
+	mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS));
+
+	if (!(lo->lo_flags & LO_FLAGS_READ_ONLY) && file->f_op->fsync)
+		blk_queue_write_cache(lo->lo_queue, true, false);
+
+	if (config->block_size)
+		bsize = config->block_size;
+	else if ((lo->lo_backing_file->f_flags & O_DIRECT) && inode->i_sb->s_bdev)
+		/* In case of direct I/O, match underlying block size */
+		bsize = bdev_logical_block_size(inode->i_sb->s_bdev);
+	else
+		bsize = 512;
+
+	blk_queue_logical_block_size(lo->lo_queue, bsize);
+	blk_queue_physical_block_size(lo->lo_queue, bsize);
+	blk_queue_io_min(lo->lo_queue, bsize);
+
+	loop_config_discard(lo);
+	loop_update_rotational(lo);
+	loop_update_dio(lo);
+	loop_sysfs_init(lo);
+
+	size = get_loop_size(lo, file);
+	loop_set_size(lo, size);
+
+	/* Order wrt reading lo_state in loop_validate_file(). */
+	wmb();
+
+	lo->lo_state = Lo_bound;
+	if (part_shift)
+		lo->lo_flags |= LO_FLAGS_PARTSCAN;
+	partscan = lo->lo_flags & LO_FLAGS_PARTSCAN;
+	if (partscan)
+		clear_bit(GD_SUPPRESS_PART_SCAN, &lo->lo_disk->state);
+
+	/* enable and uncork uevent now that we are done */
+	dev_set_uevent_suppress(disk_to_dev(lo->lo_disk), 0);
+
+	loop_global_unlock(lo, is_loop);
+	if (partscan)
+		loop_reread_partitions(lo);
+
+	if (!(mode & FMODE_EXCL))
+		bd_abort_claiming(bdev, loop_configure);
+
+	return 0;
+
+out_unlock:
+	loop_global_unlock(lo, is_loop);
+out_bdev:
+	if (!(mode & FMODE_EXCL))
+		bd_abort_claiming(bdev, loop_configure);
+out_putf:
+	fput(file);
+	/* This is safe: open() is still holding a reference. */
+	module_put(THIS_MODULE);
+	return error;
+}
+
+static void __loop_clr_fd(struct loop_device *lo, bool release)
+{
+	struct file *filp;
+	gfp_t gfp = lo->old_gfp_mask;
+
+	if (test_bit(QUEUE_FLAG_WC, &lo->lo_queue->queue_flags))
+		blk_queue_write_cache(lo->lo_queue, false, false);
+
+	/*
+	 * Freeze the request queue when unbinding on a live file descriptor and
+	 * thus an open device.  When called from ->release we are guaranteed
+	 * that there is no I/O in progress already.
+	 */
+	if (!release)
+		blk_mq_freeze_queue(lo->lo_queue);
+
+	spin_lock_irq(&lo->lo_lock);
+	filp = lo->lo_backing_file;
+	lo->lo_backing_file = NULL;
+	spin_unlock_irq(&lo->lo_lock);
+
+	lo->lo_device = NULL;
+	lo->lo_offset = 0;
+	lo->lo_sizelimit = 0;
+	memset(lo->lo_file_name, 0, LO_NAME_SIZE);
+	blk_queue_logical_block_size(lo->lo_queue, 512);
+	blk_queue_physical_block_size(lo->lo_queue, 512);
+	blk_queue_io_min(lo->lo_queue, 512);
+	invalidate_disk(lo->lo_disk);
+	loop_sysfs_exit(lo);
+	/* let user-space know about this change */
+	kobject_uevent(&disk_to_dev(lo->lo_disk)->kobj, KOBJ_CHANGE);
+	mapping_set_gfp_mask(filp->f_mapping, gfp);
+	/* This is safe: open() is still holding a reference. */
+	module_put(THIS_MODULE);
+	if (!release)
+		blk_mq_unfreeze_queue(lo->lo_queue);
+
+	disk_force_media_change(lo->lo_disk, DISK_EVENT_MEDIA_CHANGE);
+
+	if (lo->lo_flags & LO_FLAGS_PARTSCAN) {
+		int err;
+
+		/*
+		 * open_mutex has been held already in release path, so don't
+		 * acquire it if this function is called in such case.
+		 *
+		 * If the reread partition isn't from release path, lo_refcnt
+		 * must be at least one and it can only become zero when the
+		 * current holder is released.
+		 */
+		if (!release)
+			mutex_lock(&lo->lo_disk->open_mutex);
+		err = bdev_disk_changed(lo->lo_disk, false);
+		if (!release)
+			mutex_unlock(&lo->lo_disk->open_mutex);
+		if (err)
+			pr_warn("%s: partition scan of loop%d failed (rc=%d)\n",
+				__func__, lo->lo_number, err);
+		/* Device is gone, no point in returning error */
+	}
+
+	/*
+	 * lo->lo_state is set to Lo_unbound here after above partscan has
+	 * finished. There cannot be anybody else entering __loop_clr_fd() as
+	 * Lo_rundown state protects us from all the other places trying to
+	 * change the 'lo' device.
+	 */
+	lo->lo_flags = 0;
+	if (!part_shift)
+		set_bit(GD_SUPPRESS_PART_SCAN, &lo->lo_disk->state);
+	mutex_lock(&lo->lo_mutex);
+	lo->lo_state = Lo_unbound;
+	mutex_unlock(&lo->lo_mutex);
+
+	/*
+	 * Need not hold lo_mutex to fput backing file. Calling fput holding
+	 * lo_mutex triggers a circular lock dependency possibility warning as
+	 * fput can take open_mutex which is usually taken before lo_mutex.
+	 */
+	fput(filp);
+}
+
+static int loop_clr_fd(struct loop_device *lo)
+{
+	int err;
+
+	/*
+	 * Since lo_ioctl() is called without locks held, it is possible that
+	 * loop_configure()/loop_change_fd() and loop_clr_fd() run in parallel.
+	 *
+	 * Therefore, use global lock when setting Lo_rundown state in order to
+	 * make sure that loop_validate_file() will fail if the "struct file"
+	 * which loop_configure()/loop_change_fd() found via fget() was this
+	 * loop device.
+	 */
+	err = loop_global_lock_killable(lo, true);
+	if (err)
+		return err;
+	if (lo->lo_state != Lo_bound) {
+		loop_global_unlock(lo, true);
+		return -ENXIO;
+	}
+	/*
+	 * If we've explicitly asked to tear down the loop device,
+	 * and it has an elevated reference count, set it for auto-teardown when
+	 * the last reference goes away. This stops $!~#$@ udev from
+	 * preventing teardown because it decided that it needs to run blkid on
+	 * the loopback device whenever they appear. xfstests is notorious for
+	 * failing tests because blkid via udev races with a losetup
+	 * <dev>/do something like mkfs/losetup -d <dev> causing the losetup -d
+	 * command to fail with EBUSY.
+	 */
+	if (disk_openers(lo->lo_disk) > 1) {
+		lo->lo_flags |= LO_FLAGS_AUTOCLEAR;
+		loop_global_unlock(lo, true);
+		return 0;
+	}
+	lo->lo_state = Lo_rundown;
+	loop_global_unlock(lo, true);
+
+	__loop_clr_fd(lo, false);
+	return 0;
+}
+
+static int
+loop_set_status(struct loop_device *lo, const struct loop_info64 *info)
+{
+	int err;
+	int prev_lo_flags;
+	bool partscan = false;
+	bool size_changed = false;
+
+	err = mutex_lock_killable(&lo->lo_mutex);
+	if (err)
+		return err;
+	if (lo->lo_state != Lo_bound) {
+		err = -ENXIO;
+		goto out_unlock;
+	}
+
+	if (lo->lo_offset != info->lo_offset ||
+	    lo->lo_sizelimit != info->lo_sizelimit) {
+		size_changed = true;
+		sync_blockdev(lo->lo_device);
+		invalidate_bdev(lo->lo_device);
+	}
+
+	/* I/O need to be drained during transfer transition */
+	blk_mq_freeze_queue(lo->lo_queue);
+
+	prev_lo_flags = lo->lo_flags;
+
+	err = loop_set_status_from_info(lo, info);
+	if (err)
+		goto out_unfreeze;
+
+	/* Mask out flags that can't be set using LOOP_SET_STATUS. */
+	lo->lo_flags &= LOOP_SET_STATUS_SETTABLE_FLAGS;
+	/* For those flags, use the previous values instead */
+	lo->lo_flags |= prev_lo_flags & ~LOOP_SET_STATUS_SETTABLE_FLAGS;
+	/* For flags that can't be cleared, use previous values too */
+	lo->lo_flags |= prev_lo_flags & ~LOOP_SET_STATUS_CLEARABLE_FLAGS;
+
+	if (size_changed) {
+		loff_t new_size = get_size(lo->lo_offset, lo->lo_sizelimit,
+					   lo->lo_backing_file);
+		loop_set_size(lo, new_size);
+	}
+
+	loop_config_discard(lo);
+
+	/* update dio if lo_offset or transfer is changed */
+	__loop_update_dio(lo, lo->use_dio);
+
+out_unfreeze:
+	blk_mq_unfreeze_queue(lo->lo_queue);
+
+	if (!err && (lo->lo_flags & LO_FLAGS_PARTSCAN) &&
+	     !(prev_lo_flags & LO_FLAGS_PARTSCAN)) {
+		clear_bit(GD_SUPPRESS_PART_SCAN, &lo->lo_disk->state);
+		partscan = true;
+	}
+out_unlock:
+	mutex_unlock(&lo->lo_mutex);
+	if (partscan)
+		loop_reread_partitions(lo);
+
+	return err;
+}
+
+static int
+loop_get_status(struct loop_device *lo, struct loop_info64 *info)
+{
+	struct path path;
+	struct kstat stat;
+	int ret;
+
+	ret = mutex_lock_killable(&lo->lo_mutex);
+	if (ret)
+		return ret;
+	if (lo->lo_state != Lo_bound) {
+		mutex_unlock(&lo->lo_mutex);
+		return -ENXIO;
+	}
+
+	memset(info, 0, sizeof(*info));
+	info->lo_number = lo->lo_number;
+	info->lo_offset = lo->lo_offset;
+	info->lo_sizelimit = lo->lo_sizelimit;
+	info->lo_flags = lo->lo_flags;
+	memcpy(info->lo_file_name, lo->lo_file_name, LO_NAME_SIZE);
+
+	/* Drop lo_mutex while we call into the filesystem. */
+	path = lo->lo_backing_file->f_path;
+	path_get(&path);
+	mutex_unlock(&lo->lo_mutex);
+	ret = vfs_getattr(&path, &stat, STATX_INO, AT_STATX_SYNC_AS_STAT);
+	if (!ret) {
+		info->lo_device = huge_encode_dev(stat.dev);
+		info->lo_inode = stat.ino;
+		info->lo_rdevice = huge_encode_dev(stat.rdev);
+	}
+	path_put(&path);
+	return ret;
+}
+
+static void
+loop_info64_from_old(const struct loop_info *info, struct loop_info64 *info64)
+{
+	memset(info64, 0, sizeof(*info64));
+	info64->lo_number = info->lo_number;
+	info64->lo_device = info->lo_device;
+	info64->lo_inode = info->lo_inode;
+	info64->lo_rdevice = info->lo_rdevice;
+	info64->lo_offset = info->lo_offset;
+	info64->lo_sizelimit = 0;
+	info64->lo_flags = info->lo_flags;
+	memcpy(info64->lo_file_name, info->lo_name, LO_NAME_SIZE);
+}
+
+static int
+loop_info64_to_old(const struct loop_info64 *info64, struct loop_info *info)
+{
+	memset(info, 0, sizeof(*info));
+	info->lo_number = info64->lo_number;
+	info->lo_device = info64->lo_device;
+	info->lo_inode = info64->lo_inode;
+	info->lo_rdevice = info64->lo_rdevice;
+	info->lo_offset = info64->lo_offset;
+	info->lo_flags = info64->lo_flags;
+	memcpy(info->lo_name, info64->lo_file_name, LO_NAME_SIZE);
+
+	/* error in case values were truncated */
+	if (info->lo_device != info64->lo_device ||
+	    info->lo_rdevice != info64->lo_rdevice ||
+	    info->lo_inode != info64->lo_inode ||
+	    info->lo_offset != info64->lo_offset)
+		return -EOVERFLOW;
+
+	return 0;
+}
+
+static int
+loop_set_status_old(struct loop_device *lo, const struct loop_info __user *arg)
+{
+	struct loop_info info;
+	struct loop_info64 info64;
+
+	if (copy_from_user(&info, arg, sizeof (struct loop_info)))
+		return -EFAULT;
+	loop_info64_from_old(&info, &info64);
+	return loop_set_status(lo, &info64);
+}
+
+static int
+loop_set_status64(struct loop_device *lo, const struct loop_info64 __user *arg)
+{
+	struct loop_info64 info64;
+
+	if (copy_from_user(&info64, arg, sizeof (struct loop_info64)))
+		return -EFAULT;
+	return loop_set_status(lo, &info64);
+}
+
+static int
+loop_get_status_old(struct loop_device *lo, struct loop_info __user *arg) {
+	struct loop_info info;
+	struct loop_info64 info64;
+	int err;
+
+	if (!arg)
+		return -EINVAL;
+	err = loop_get_status(lo, &info64);
+	if (!err)
+		err = loop_info64_to_old(&info64, &info);
+	if (!err && copy_to_user(arg, &info, sizeof(info)))
+		err = -EFAULT;
+
+	return err;
+}
+
+static int
+loop_get_status64(struct loop_device *lo, struct loop_info64 __user *arg) {
+	struct loop_info64 info64;
+	int err;
+
+	if (!arg)
+		return -EINVAL;
+	err = loop_get_status(lo, &info64);
+	if (!err && copy_to_user(arg, &info64, sizeof(info64)))
+		err = -EFAULT;
+
+	return err;
+}
+
+static int loop_set_capacity(struct loop_device *lo)
+{
+	loff_t size;
+
+	if (unlikely(lo->lo_state != Lo_bound))
+		return -ENXIO;
+
+	size = get_loop_size(lo, lo->lo_backing_file);
+	loop_set_size(lo, size);
+
+	return 0;
+}
+
+static int loop_set_dio(struct loop_device *lo, unsigned long arg)
+{
+	int error = -ENXIO;
+	if (lo->lo_state != Lo_bound)
+		goto out;
+
+	__loop_update_dio(lo, !!arg);
+	if (lo->use_dio == !!arg)
+		return 0;
+	error = -EINVAL;
+ out:
+	return error;
+}
+
+static int loop_set_block_size(struct loop_device *lo, unsigned long arg)
+{
+	int err = 0;
+
+	if (lo->lo_state != Lo_bound)
+		return -ENXIO;
+
+	err = blk_validate_block_size(arg);
+	if (err)
+		return err;
+
+	if (lo->lo_queue->limits.logical_block_size == arg)
+		return 0;
+
+	sync_blockdev(lo->lo_device);
+	invalidate_bdev(lo->lo_device);
+
+	blk_mq_freeze_queue(lo->lo_queue);
+	blk_queue_logical_block_size(lo->lo_queue, arg);
+	blk_queue_physical_block_size(lo->lo_queue, arg);
+	blk_queue_io_min(lo->lo_queue, arg);
+	loop_update_dio(lo);
+	blk_mq_unfreeze_queue(lo->lo_queue);
+
+	return err;
+}
+
+static int lo_simple_ioctl(struct loop_device *lo, unsigned int cmd,
+			   unsigned long arg)
+{
+	int err;
+
+	err = mutex_lock_killable(&lo->lo_mutex);
+	if (err)
+		return err;
+	switch (cmd) {
+	case LOOP_SET_CAPACITY:
+		err = loop_set_capacity(lo);
+		break;
+	case LOOP_SET_DIRECT_IO:
+		err = loop_set_dio(lo, arg);
+		break;
+	case LOOP_SET_BLOCK_SIZE:
+		err = loop_set_block_size(lo, arg);
+		break;
+	default:
+		err = -EINVAL;
+	}
+	mutex_unlock(&lo->lo_mutex);
+	return err;
+}
+
+static int lo_ioctl(struct block_device *bdev, fmode_t mode,
+	unsigned int cmd, unsigned long arg)
+{
+	struct loop_device *lo = bdev->bd_disk->private_data;
+	void __user *argp = (void __user *) arg;
+	int err;
+
+	switch (cmd) {
+	case LOOP_SET_FD: {
+		/*
+		 * Legacy case - pass in a zeroed out struct loop_config with
+		 * only the file descriptor set , which corresponds with the
+		 * default parameters we'd have used otherwise.
+		 */
+		struct loop_config config;
+
+		memset(&config, 0, sizeof(config));
+		config.fd = arg;
+
+		return loop_configure(lo, mode, bdev, &config);
+	}
+	case LOOP_CONFIGURE: {
+		struct loop_config config;
+
+		if (copy_from_user(&config, argp, sizeof(config)))
+			return -EFAULT;
+
+		return loop_configure(lo, mode, bdev, &config);
+	}
+	case LOOP_CHANGE_FD:
+		return loop_change_fd(lo, bdev, arg);
+	case LOOP_CLR_FD:
+		return loop_clr_fd(lo);
+	case LOOP_SET_STATUS:
+		err = -EPERM;
+		if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) {
+			err = loop_set_status_old(lo, argp);
+		}
+		break;
+	case LOOP_GET_STATUS:
+		return loop_get_status_old(lo, argp);
+	case LOOP_SET_STATUS64:
+		err = -EPERM;
+		if ((mode & FMODE_WRITE) || capable(CAP_SYS_ADMIN)) {
+			err = loop_set_status64(lo, argp);
+		}
+		break;
+	case LOOP_GET_STATUS64:
+		return loop_get_status64(lo, argp);
+	case LOOP_SET_CAPACITY:
+	case LOOP_SET_DIRECT_IO:
+	case LOOP_SET_BLOCK_SIZE:
+		if (!(mode & FMODE_WRITE) && !capable(CAP_SYS_ADMIN))
+			return -EPERM;
+		fallthrough;
+	default:
+		err = lo_simple_ioctl(lo, cmd, arg);
+		break;
+	}
+
+	return err;
+}
+
+#ifdef CONFIG_COMPAT
+struct compat_loop_info {
+	compat_int_t	lo_number;      /* ioctl r/o */
+	compat_dev_t	lo_device;      /* ioctl r/o */
+	compat_ulong_t	lo_inode;       /* ioctl r/o */
+	compat_dev_t	lo_rdevice;     /* ioctl r/o */
+	compat_int_t	lo_offset;
+	compat_int_t	lo_encrypt_type;        /* obsolete, ignored */
+	compat_int_t	lo_encrypt_key_size;    /* ioctl w/o */
+	compat_int_t	lo_flags;       /* ioctl r/o */
+	char		lo_name[LO_NAME_SIZE];
+	unsigned char	lo_encrypt_key[LO_KEY_SIZE]; /* ioctl w/o */
+	compat_ulong_t	lo_init[2];
+	char		reserved[4];
+};
+
+/*
+ * Transfer 32-bit compatibility structure in userspace to 64-bit loop info
+ * - noinlined to reduce stack space usage in main part of driver
+ */
+static noinline int
+loop_info64_from_compat(const struct compat_loop_info __user *arg,
+			struct loop_info64 *info64)
+{
+	struct compat_loop_info info;
+
+	if (copy_from_user(&info, arg, sizeof(info)))
+		return -EFAULT;
+
+	memset(info64, 0, sizeof(*info64));
+	info64->lo_number = info.lo_number;
+	info64->lo_device = info.lo_device;
+	info64->lo_inode = info.lo_inode;
+	info64->lo_rdevice = info.lo_rdevice;
+	info64->lo_offset = info.lo_offset;
+	info64->lo_sizelimit = 0;
+	info64->lo_flags = info.lo_flags;
+	memcpy(info64->lo_file_name, info.lo_name, LO_NAME_SIZE);
+	return 0;
+}
+
+/*
+ * Transfer 64-bit loop info to 32-bit compatibility structure in userspace
+ * - noinlined to reduce stack space usage in main part of driver
+ */
+static noinline int
+loop_info64_to_compat(const struct loop_info64 *info64,
+		      struct compat_loop_info __user *arg)
+{
+	struct compat_loop_info info;
+
+	memset(&info, 0, sizeof(info));
+	info.lo_number = info64->lo_number;
+	info.lo_device = info64->lo_device;
+	info.lo_inode = info64->lo_inode;
+	info.lo_rdevice = info64->lo_rdevice;
+	info.lo_offset = info64->lo_offset;
+	info.lo_flags = info64->lo_flags;
+	memcpy(info.lo_name, info64->lo_file_name, LO_NAME_SIZE);
+
+	/* error in case values were truncated */
+	if (info.lo_device != info64->lo_device ||
+	    info.lo_rdevice != info64->lo_rdevice ||
+	    info.lo_inode != info64->lo_inode ||
+	    info.lo_offset != info64->lo_offset)
+		return -EOVERFLOW;
+
+	if (copy_to_user(arg, &info, sizeof(info)))
+		return -EFAULT;
+	return 0;
+}
+
+static int
+loop_set_status_compat(struct loop_device *lo,
+		       const struct compat_loop_info __user *arg)
+{
+	struct loop_info64 info64;
+	int ret;
+
+	ret = loop_info64_from_compat(arg, &info64);
+	if (ret < 0)
+		return ret;
+	return loop_set_status(lo, &info64);
+}
+
+static int
+loop_get_status_compat(struct loop_device *lo,
+		       struct compat_loop_info __user *arg)
+{
+	struct loop_info64 info64;
+	int err;
+
+	if (!arg)
+		return -EINVAL;
+	err = loop_get_status(lo, &info64);
+	if (!err)
+		err = loop_info64_to_compat(&info64, arg);
+	return err;
+}
+
+static int lo_compat_ioctl(struct block_device *bdev, fmode_t mode,
+			   unsigned int cmd, unsigned long arg)
+{
+	struct loop_device *lo = bdev->bd_disk->private_data;
+	int err;
+
+	switch(cmd) {
+	case LOOP_SET_STATUS:
+		err = loop_set_status_compat(lo,
+			     (const struct compat_loop_info __user *)arg);
+		break;
+	case LOOP_GET_STATUS:
+		err = loop_get_status_compat(lo,
+				     (struct compat_loop_info __user *)arg);
+		break;
+	case LOOP_SET_CAPACITY:
+	case LOOP_CLR_FD:
+	case LOOP_GET_STATUS64:
+	case LOOP_SET_STATUS64:
+	case LOOP_CONFIGURE:
+		arg = (unsigned long) compat_ptr(arg);
+		fallthrough;
+	case LOOP_SET_FD:
+	case LOOP_CHANGE_FD:
+	case LOOP_SET_BLOCK_SIZE:
+	case LOOP_SET_DIRECT_IO:
+		err = lo_ioctl(bdev, mode, cmd, arg);
+		break;
+	default:
+		err = -ENOIOCTLCMD;
+		break;
+	}
+	return err;
+}
+#endif
+
+static void lo_release(struct gendisk *disk, fmode_t mode)
+{
+	struct loop_device *lo = disk->private_data;
+
+	if (disk_openers(disk) > 0)
+		return;
+
+	mutex_lock(&lo->lo_mutex);
+	if (lo->lo_state == Lo_bound && (lo->lo_flags & LO_FLAGS_AUTOCLEAR)) {
+		lo->lo_state = Lo_rundown;
+		mutex_unlock(&lo->lo_mutex);
+		/*
+		 * In autoclear mode, stop the loop thread
+		 * and remove configuration after last close.
+		 */
+		__loop_clr_fd(lo, true);
+		return;
+	}
+	mutex_unlock(&lo->lo_mutex);
+}
+
+static void lo_free_disk(struct gendisk *disk)
+{
+	struct loop_device *lo = disk->private_data;
+
+	if (lo->workqueue)
+		destroy_workqueue(lo->workqueue);
+	loop_free_idle_workers(lo, true);
+	del_timer_sync(&lo->timer);
+	mutex_destroy(&lo->lo_mutex);
+	kfree(lo);
+}
+
+static const struct block_device_operations lo_fops = {
+	.owner =	THIS_MODULE,
+	.release =	lo_release,
+	.ioctl =	lo_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl =	lo_compat_ioctl,
+#endif
+	.free_disk =	lo_free_disk,
+};
+
+/*
+ * And now the modules code and kernel interface.
+ */
+
+/*
+ * If max_loop is specified, create that many devices upfront.
+ * This also becomes a hard limit. If max_loop is not specified,
+ * the default isn't a hard limit (as before commit 85c50197716c
+ * changed the default value from 0 for max_loop=0 reasons), just
+ * create CONFIG_BLK_DEV_LOOP_MIN_COUNT loop devices at module
+ * init time. Loop devices can be requested on-demand with the
+ * /dev/loop-control interface, or be instantiated by accessing
+ * a 'dead' device node.
+ */
+static int max_loop = CONFIG_BLK_DEV_LOOP_MIN_COUNT;
+
+#ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD
+static bool max_loop_specified;
+
+static int max_loop_param_set_int(const char *val,
+				  const struct kernel_param *kp)
+{
+	int ret;
+
+	ret = param_set_int(val, kp);
+	if (ret < 0)
+		return ret;
+
+	max_loop_specified = true;
+	return 0;
+}
+
+static const struct kernel_param_ops max_loop_param_ops = {
+	.set = max_loop_param_set_int,
+	.get = param_get_int,
+};
+
+module_param_cb(max_loop, &max_loop_param_ops, &max_loop, 0444);
+MODULE_PARM_DESC(max_loop, "Maximum number of loop devices");
+#else
+module_param(max_loop, int, 0444);
+MODULE_PARM_DESC(max_loop, "Initial number of loop devices");
+#endif
+
+module_param(max_part, int, 0444);
+MODULE_PARM_DESC(max_part, "Maximum number of partitions per loop device");
+
+static int hw_queue_depth = LOOP_DEFAULT_HW_Q_DEPTH;
+
+static int loop_set_hw_queue_depth(const char *s, const struct kernel_param *p)
+{
+	int ret = kstrtoint(s, 10, &hw_queue_depth);
+
+	return (ret || (hw_queue_depth < 1)) ? -EINVAL : 0;
+}
+
+static const struct kernel_param_ops loop_hw_qdepth_param_ops = {
+	.set	= loop_set_hw_queue_depth,
+	.get	= param_get_int,
+};
+
+device_param_cb(hw_queue_depth, &loop_hw_qdepth_param_ops, &hw_queue_depth, 0444);
+MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: 128");
+
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_BLOCKDEV_MAJOR(LOOP_MAJOR);
+
+static blk_status_t loop_queue_rq(struct blk_mq_hw_ctx *hctx,
+		const struct blk_mq_queue_data *bd)
+{
+	struct request *rq = bd->rq;
+	struct loop_cmd *cmd = blk_mq_rq_to_pdu(rq);
+	struct loop_device *lo = rq->q->queuedata;
+
+	blk_mq_start_request(rq);
+
+	if (lo->lo_state != Lo_bound)
+		return BLK_STS_IOERR;
+
+	switch (req_op(rq)) {
+	case REQ_OP_FLUSH:
+	case REQ_OP_DISCARD:
+	case REQ_OP_WRITE_ZEROES:
+		cmd->use_aio = false;
+		break;
+	default:
+		cmd->use_aio = lo->use_dio;
+		break;
+	}
+
+	/* always use the first bio's css */
+	cmd->blkcg_css = NULL;
+	cmd->memcg_css = NULL;
+#ifdef CONFIG_BLK_CGROUP
+	if (rq->bio) {
+		cmd->blkcg_css = bio_blkcg_css(rq->bio);
+#ifdef CONFIG_MEMCG
+		if (cmd->blkcg_css) {
+			cmd->memcg_css =
+				cgroup_get_e_css(cmd->blkcg_css->cgroup,
+						&memory_cgrp_subsys);
+		}
+#endif
+	}
+#endif
+	loop_queue_work(lo, cmd);
+
+	return BLK_STS_OK;
+}
+
+static void loop_handle_cmd(struct loop_cmd *cmd)
+{
+	struct cgroup_subsys_state *cmd_blkcg_css = cmd->blkcg_css;
+	struct cgroup_subsys_state *cmd_memcg_css = cmd->memcg_css;
+	struct request *rq = blk_mq_rq_from_pdu(cmd);
+	const bool write = op_is_write(req_op(rq));
+	struct loop_device *lo = rq->q->queuedata;
+	int ret = 0;
+	struct mem_cgroup *old_memcg = NULL;
+	const bool use_aio = cmd->use_aio;
+
+	if (write && (lo->lo_flags & LO_FLAGS_READ_ONLY)) {
+		ret = -EIO;
+		goto failed;
+	}
+
+	if (cmd_blkcg_css)
+		kthread_associate_blkcg(cmd_blkcg_css);
+	if (cmd_memcg_css)
+		old_memcg = set_active_memcg(
+			mem_cgroup_from_css(cmd_memcg_css));
+
+	/*
+	 * do_req_filebacked() may call blk_mq_complete_request() synchronously
+	 * or asynchronously if using aio. Hence, do not touch 'cmd' after
+	 * do_req_filebacked() has returned unless we are sure that 'cmd' has
+	 * not yet been completed.
+	 */
+	ret = do_req_filebacked(lo, rq);
+
+	if (cmd_blkcg_css)
+		kthread_associate_blkcg(NULL);
+
+	if (cmd_memcg_css) {
+		set_active_memcg(old_memcg);
+		css_put(cmd_memcg_css);
+	}
+ failed:
+	/* complete non-aio request */
+	if (!use_aio || ret) {
+		if (ret == -EOPNOTSUPP)
+			cmd->ret = ret;
+		else
+			cmd->ret = ret ? -EIO : 0;
+		if (likely(!blk_should_fake_timeout(rq->q)))
+			blk_mq_complete_request(rq);
+	}
+}
+
+static void loop_process_work(struct loop_worker *worker,
+			struct list_head *cmd_list, struct loop_device *lo)
+{
+	int orig_flags = current->flags;
+	struct loop_cmd *cmd;
+
+	current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO;
+	spin_lock_irq(&lo->lo_work_lock);
+	while (!list_empty(cmd_list)) {
+		cmd = container_of(
+			cmd_list->next, struct loop_cmd, list_entry);
+		list_del(cmd_list->next);
+		spin_unlock_irq(&lo->lo_work_lock);
+
+		loop_handle_cmd(cmd);
+		cond_resched();
+
+		spin_lock_irq(&lo->lo_work_lock);
+	}
+
+	/*
+	 * We only add to the idle list if there are no pending cmds
+	 * *and* the worker will not run again which ensures that it
+	 * is safe to free any worker on the idle list
+	 */
+	if (worker && !work_pending(&worker->work)) {
+		worker->last_ran_at = jiffies;
+		list_add_tail(&worker->idle_list, &lo->idle_worker_list);
+		loop_set_timer(lo);
+	}
+	spin_unlock_irq(&lo->lo_work_lock);
+	current->flags = orig_flags;
+}
+
+static void loop_workfn(struct work_struct *work)
+{
+	struct loop_worker *worker =
+		container_of(work, struct loop_worker, work);
+	loop_process_work(worker, &worker->cmd_list, worker->lo);
+}
+
+static void loop_rootcg_workfn(struct work_struct *work)
+{
+	struct loop_device *lo =
+		container_of(work, struct loop_device, rootcg_work);
+	loop_process_work(NULL, &lo->rootcg_cmd_list, lo);
+}
+
+static const struct blk_mq_ops loop_mq_ops = {
+	.queue_rq       = loop_queue_rq,
+	.complete	= lo_complete_rq,
+};
+
+static int loop_add(int i)
+{
+	struct loop_device *lo;
+	struct gendisk *disk;
+	int err;
+
+	err = -ENOMEM;
+	lo = kzalloc(sizeof(*lo), GFP_KERNEL);
+	if (!lo)
+		goto out;
+	lo->worker_tree = RB_ROOT;
+	INIT_LIST_HEAD(&lo->idle_worker_list);
+	timer_setup(&lo->timer, loop_free_idle_workers_timer, TIMER_DEFERRABLE);
+	lo->lo_state = Lo_unbound;
+
+	err = mutex_lock_killable(&loop_ctl_mutex);
+	if (err)
+		goto out_free_dev;
+
+	/* allocate id, if @id >= 0, we're requesting that specific id */
+	if (i >= 0) {
+		err = idr_alloc(&loop_index_idr, lo, i, i + 1, GFP_KERNEL);
+		if (err == -ENOSPC)
+			err = -EEXIST;
+	} else {
+		err = idr_alloc(&loop_index_idr, lo, 0, 0, GFP_KERNEL);
+	}
+	mutex_unlock(&loop_ctl_mutex);
+	if (err < 0)
+		goto out_free_dev;
+	i = err;
+
+	lo->tag_set.ops = &loop_mq_ops;
+	lo->tag_set.nr_hw_queues = 1;
+	lo->tag_set.queue_depth = hw_queue_depth;
+	lo->tag_set.numa_node = NUMA_NO_NODE;
+	lo->tag_set.cmd_size = sizeof(struct loop_cmd);
+	lo->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_STACKING |
+		BLK_MQ_F_NO_SCHED_BY_DEFAULT;
+	lo->tag_set.driver_data = lo;
+
+	err = blk_mq_alloc_tag_set(&lo->tag_set);
+	if (err)
+		goto out_free_idr;
+
+	disk = lo->lo_disk = blk_mq_alloc_disk(&lo->tag_set, lo);
+	if (IS_ERR(disk)) {
+		err = PTR_ERR(disk);
+		goto out_cleanup_tags;
+	}
+	lo->lo_queue = lo->lo_disk->queue;
+
+	blk_queue_max_hw_sectors(lo->lo_queue, BLK_DEF_MAX_SECTORS);
+
+	/*
+	 * By default, we do buffer IO, so it doesn't make sense to enable
+	 * merge because the I/O submitted to backing file is handled page by
+	 * page. For directio mode, merge does help to dispatch bigger request
+	 * to underlayer disk. We will enable merge once directio is enabled.
+	 */
+	blk_queue_flag_set(QUEUE_FLAG_NOMERGES, lo->lo_queue);
+
+	/*
+	 * Disable partition scanning by default. The in-kernel partition
+	 * scanning can be requested individually per-device during its
+	 * setup. Userspace can always add and remove partitions from all
+	 * devices. The needed partition minors are allocated from the
+	 * extended minor space, the main loop device numbers will continue
+	 * to match the loop minors, regardless of the number of partitions
+	 * used.
+	 *
+	 * If max_part is given, partition scanning is globally enabled for
+	 * all loop devices. The minors for the main loop devices will be
+	 * multiples of max_part.
+	 *
+	 * Note: Global-for-all-devices, set-only-at-init, read-only module
+	 * parameteters like 'max_loop' and 'max_part' make things needlessly
+	 * complicated, are too static, inflexible and may surprise
+	 * userspace tools. Parameters like this in general should be avoided.
+	 */
+	if (!part_shift)
+		set_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
+	mutex_init(&lo->lo_mutex);
+	lo->lo_number		= i;
+	spin_lock_init(&lo->lo_lock);
+	spin_lock_init(&lo->lo_work_lock);
+	INIT_WORK(&lo->rootcg_work, loop_rootcg_workfn);
+	INIT_LIST_HEAD(&lo->rootcg_cmd_list);
+	disk->major		= LOOP_MAJOR;
+	disk->first_minor	= i << part_shift;
+	disk->minors		= 1 << part_shift;
+	disk->fops		= &lo_fops;
+	disk->private_data	= lo;
+	disk->queue		= lo->lo_queue;
+	disk->events		= DISK_EVENT_MEDIA_CHANGE;
+	disk->event_flags	= DISK_EVENT_FLAG_UEVENT;
+	sprintf(disk->disk_name, "loop%d", i);
+	/* Make this loop device reachable from pathname. */
+	err = add_disk(disk);
+	if (err)
+		goto out_cleanup_disk;
+
+	/* Show this loop device. */
+	mutex_lock(&loop_ctl_mutex);
+	lo->idr_visible = true;
+	mutex_unlock(&loop_ctl_mutex);
+
+	return i;
+
+out_cleanup_disk:
+	put_disk(disk);
+out_cleanup_tags:
+	blk_mq_free_tag_set(&lo->tag_set);
+out_free_idr:
+	mutex_lock(&loop_ctl_mutex);
+	idr_remove(&loop_index_idr, i);
+	mutex_unlock(&loop_ctl_mutex);
+out_free_dev:
+	kfree(lo);
+out:
+	return err;
+}
+
+static void loop_remove(struct loop_device *lo)
+{
+	/* Make this loop device unreachable from pathname. */
+	del_gendisk(lo->lo_disk);
+	blk_mq_free_tag_set(&lo->tag_set);
+
+	mutex_lock(&loop_ctl_mutex);
+	idr_remove(&loop_index_idr, lo->lo_number);
+	mutex_unlock(&loop_ctl_mutex);
+
+	put_disk(lo->lo_disk);
+}
+
+#ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD
+static void loop_probe(dev_t dev)
+{
+	int idx = MINOR(dev) >> part_shift;
+
+	if (max_loop_specified && max_loop && idx >= max_loop)
+		return;
+	loop_add(idx);
+}
+#else
+#define loop_probe NULL
+#endif /* !CONFIG_BLOCK_LEGACY_AUTOLOAD */
+
+static int loop_control_remove(int idx)
+{
+	struct loop_device *lo;
+	int ret;
+
+	if (idx < 0) {
+		pr_warn_once("deleting an unspecified loop device is not supported.\n");
+		return -EINVAL;
+	}
+		
+	/* Hide this loop device for serialization. */
+	ret = mutex_lock_killable(&loop_ctl_mutex);
+	if (ret)
+		return ret;
+	lo = idr_find(&loop_index_idr, idx);
+	if (!lo || !lo->idr_visible)
+		ret = -ENODEV;
+	else
+		lo->idr_visible = false;
+	mutex_unlock(&loop_ctl_mutex);
+	if (ret)
+		return ret;
+
+	/* Check whether this loop device can be removed. */
+	ret = mutex_lock_killable(&lo->lo_mutex);
+	if (ret)
+		goto mark_visible;
+	if (lo->lo_state != Lo_unbound || disk_openers(lo->lo_disk) > 0) {
+		mutex_unlock(&lo->lo_mutex);
+		ret = -EBUSY;
+		goto mark_visible;
+	}
+	/* Mark this loop device as no more bound, but not quite unbound yet */
+	lo->lo_state = Lo_deleting;
+	mutex_unlock(&lo->lo_mutex);
+
+	loop_remove(lo);
+	return 0;
+
+mark_visible:
+	/* Show this loop device again. */
+	mutex_lock(&loop_ctl_mutex);
+	lo->idr_visible = true;
+	mutex_unlock(&loop_ctl_mutex);
+	return ret;
+}
+
+static int loop_control_get_free(int idx)
+{
+	struct loop_device *lo;
+	int id, ret;
+
+	ret = mutex_lock_killable(&loop_ctl_mutex);
+	if (ret)
+		return ret;
+	idr_for_each_entry(&loop_index_idr, lo, id) {
+		/* Hitting a race results in creating a new loop device which is harmless. */
+		if (lo->idr_visible && data_race(lo->lo_state) == Lo_unbound)
+			goto found;
+	}
+	mutex_unlock(&loop_ctl_mutex);
+	return loop_add(-1);
+found:
+	mutex_unlock(&loop_ctl_mutex);
+	return id;
+}
+
+static long loop_control_ioctl(struct file *file, unsigned int cmd,
+			       unsigned long parm)
+{
+	switch (cmd) {
+	case LOOP_CTL_ADD:
+		return loop_add(parm);
+	case LOOP_CTL_REMOVE:
+		return loop_control_remove(parm);
+	case LOOP_CTL_GET_FREE:
+		return loop_control_get_free(parm);
+	default:
+		return -ENOSYS;
+	}
+}
+
+static const struct file_operations loop_ctl_fops = {
+	.open		= nonseekable_open,
+	.unlocked_ioctl	= loop_control_ioctl,
+	.compat_ioctl	= loop_control_ioctl,
+	.owner		= THIS_MODULE,
+	.llseek		= noop_llseek,
+};
+
+static struct miscdevice loop_misc = {
+	.minor		= LOOP_CTRL_MINOR,
+	.name		= "loop-control",
+	.fops		= &loop_ctl_fops,
+};
+
+MODULE_ALIAS_MISCDEV(LOOP_CTRL_MINOR);
+MODULE_ALIAS("devname:loop-control");
+
+static int __init loop_init(void)
+{
+	int i;
+	int err;
+
+	part_shift = 0;
+	if (max_part > 0) {
+		part_shift = fls(max_part);
+
+		/*
+		 * Adjust max_part according to part_shift as it is exported
+		 * to user space so that user can decide correct minor number
+		 * if [s]he want to create more devices.
+		 *
+		 * Note that -1 is required because partition 0 is reserved
+		 * for the whole disk.
+		 */
+		max_part = (1UL << part_shift) - 1;
+	}
+
+	if ((1UL << part_shift) > DISK_MAX_PARTS) {
+		err = -EINVAL;
+		goto err_out;
+	}
+
+	if (max_loop > 1UL << (MINORBITS - part_shift)) {
+		err = -EINVAL;
+		goto err_out;
+	}
+
+	err = misc_register(&loop_misc);
+	if (err < 0)
+		goto err_out;
+
+
+	if (__register_blkdev(LOOP_MAJOR, "loop", loop_probe)) {
+		err = -EIO;
+		goto misc_out;
+	}
+
+	/* pre-create number of devices given by config or max_loop */
+	for (i = 0; i < max_loop; i++)
+		loop_add(i);
+
+	printk(KERN_INFO "loop: module loaded\n");
+	return 0;
+
+misc_out:
+	misc_deregister(&loop_misc);
+err_out:
+	return err;
+}
+
+static void __exit loop_exit(void)
+{
+	struct loop_device *lo;
+	int id;
+
+	unregister_blkdev(LOOP_MAJOR, "loop");
+	misc_deregister(&loop_misc);
+
+	/*
+	 * There is no need to use loop_ctl_mutex here, for nobody else can
+	 * access loop_index_idr when this module is unloading (unless forced
+	 * module unloading is requested). If this is not a clean unloading,
+	 * we have no means to avoid kernel crash.
+	 */
+	idr_for_each_entry(&loop_index_idr, lo, id)
+		loop_remove(lo);
+
+	idr_destroy(&loop_index_idr);
+}
+
+module_init(loop_init);
+module_exit(loop_exit);
+
+#ifndef MODULE
+static int __init max_loop_setup(char *str)
+{
+	max_loop = simple_strtol(str, NULL, 0);
+#ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD
+	max_loop_specified = true;
+#endif
+	return 1;
+}
+
+__setup("max_loop=", max_loop_setup);
+#endif
diff --git a/drivers/block/mtip32xx/Kconfig b/drivers/block/mtip32xx/Kconfig
new file mode 100644
index 000000000..bf2213585
--- /dev/null
+++ b/drivers/block/mtip32xx/Kconfig
@@ -0,0 +1,10 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# mtip32xx device driver configuration
+#
+
+config BLK_DEV_PCIESSD_MTIP32XX
+	tristate "Block Device Driver for Micron PCIe SSDs"
+	depends on PCI
+	help
+          This enables the block driver for Micron PCIe SSDs.
diff --git a/drivers/block/mtip32xx/Makefile b/drivers/block/mtip32xx/Makefile
new file mode 100644
index 000000000..bff32b5d3
--- /dev/null
+++ b/drivers/block/mtip32xx/Makefile
@@ -0,0 +1,6 @@
+# SPDX-License-Identifier: GPL-2.0-only
+#
+# Makefile for  Block device driver for Micron PCIe SSD
+#
+
+obj-$(CONFIG_BLK_DEV_PCIESSD_MTIP32XX) += mtip32xx.o
diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c
new file mode 100644
index 000000000..815d77ba6
--- /dev/null
+++ b/drivers/block/mtip32xx/mtip32xx.c
@@ -0,0 +1,4089 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Driver for the Micron P320 SSD
+ *   Copyright (C) 2011 Micron Technology, Inc.
+ *
+ * Portions of this code were derived from works subjected to the
+ * following copyright:
+ *    Copyright (C) 2009 Integrated Device Technology, Inc.
+ */
+
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <linux/ata.h>
+#include <linux/delay.h>
+#include <linux/hdreg.h>
+#include <linux/uaccess.h>
+#include <linux/random.h>
+#include <linux/smp.h>
+#include <linux/compat.h>
+#include <linux/fs.h>
+#include <linux/module.h>
+#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
+#include <linux/bio.h>
+#include <linux/dma-mapping.h>
+#include <linux/idr.h>
+#include <linux/kthread.h>
+#include <../drivers/ata/ahci.h>
+#include <linux/export.h>
+#include <linux/debugfs.h>
+#include <linux/prefetch.h>
+#include <linux/numa.h>
+#include "mtip32xx.h"
+
+#define HW_CMD_SLOT_SZ		(MTIP_MAX_COMMAND_SLOTS * 32)
+
+/* DMA region containing RX Fis, Identify, RLE10, and SMART buffers */
+#define AHCI_RX_FIS_SZ          0x100
+#define AHCI_RX_FIS_OFFSET      0x0
+#define AHCI_IDFY_SZ            ATA_SECT_SIZE
+#define AHCI_IDFY_OFFSET        0x400
+#define AHCI_SECTBUF_SZ         ATA_SECT_SIZE
+#define AHCI_SECTBUF_OFFSET     0x800
+#define AHCI_SMARTBUF_SZ        ATA_SECT_SIZE
+#define AHCI_SMARTBUF_OFFSET    0xC00
+/* 0x100 + 0x200 + 0x200 + 0x200 is smaller than 4k but we pad it out */
+#define BLOCK_DMA_ALLOC_SZ      4096
+
+/* DMA region containing command table (should be 8192 bytes) */
+#define AHCI_CMD_SLOT_SZ        sizeof(struct mtip_cmd_hdr)
+#define AHCI_CMD_TBL_SZ         (MTIP_MAX_COMMAND_SLOTS * AHCI_CMD_SLOT_SZ)
+#define AHCI_CMD_TBL_OFFSET     0x0
+
+/* DMA region per command (contains header and SGL) */
+#define AHCI_CMD_TBL_HDR_SZ     0x80
+#define AHCI_CMD_TBL_HDR_OFFSET 0x0
+#define AHCI_CMD_TBL_SGL_SZ     (MTIP_MAX_SG * sizeof(struct mtip_cmd_sg))
+#define AHCI_CMD_TBL_SGL_OFFSET AHCI_CMD_TBL_HDR_SZ
+#define CMD_DMA_ALLOC_SZ        (AHCI_CMD_TBL_SGL_SZ + AHCI_CMD_TBL_HDR_SZ)
+
+
+#define HOST_CAP_NZDMA		(1 << 19)
+#define HOST_HSORG		0xFC
+#define HSORG_DISABLE_SLOTGRP_INTR (1<<24)
+#define HSORG_DISABLE_SLOTGRP_PXIS (1<<16)
+#define HSORG_HWREV		0xFF00
+#define HSORG_STYLE		0x8
+#define HSORG_SLOTGROUPS	0x7
+
+#define PORT_COMMAND_ISSUE	0x38
+#define PORT_SDBV		0x7C
+
+#define PORT_OFFSET		0x100
+#define PORT_MEM_SIZE		0x80
+
+#define PORT_IRQ_ERR \
+	(PORT_IRQ_HBUS_ERR | PORT_IRQ_IF_ERR | PORT_IRQ_CONNECT | \
+	 PORT_IRQ_PHYRDY | PORT_IRQ_UNK_FIS | PORT_IRQ_BAD_PMP | \
+	 PORT_IRQ_TF_ERR | PORT_IRQ_HBUS_DATA_ERR | PORT_IRQ_IF_NONFATAL | \
+	 PORT_IRQ_OVERFLOW)
+#define PORT_IRQ_LEGACY \
+	(PORT_IRQ_PIOS_FIS | PORT_IRQ_D2H_REG_FIS)
+#define PORT_IRQ_HANDLED \
+	(PORT_IRQ_SDB_FIS | PORT_IRQ_LEGACY | \
+	 PORT_IRQ_TF_ERR | PORT_IRQ_IF_ERR | \
+	 PORT_IRQ_CONNECT | PORT_IRQ_PHYRDY)
+#define DEF_PORT_IRQ \
+	(PORT_IRQ_ERR | PORT_IRQ_LEGACY | PORT_IRQ_SDB_FIS)
+
+/* product numbers */
+#define MTIP_PRODUCT_UNKNOWN	0x00
+#define MTIP_PRODUCT_ASICFPGA	0x11
+
+/* Device instance number, incremented each time a device is probed. */
+static int instance;
+
+/*
+ * Global variable used to hold the major block device number
+ * allocated in mtip_init().
+ */
+static int mtip_major;
+static struct dentry *dfs_parent;
+
+static u32 cpu_use[NR_CPUS];
+
+static DEFINE_IDA(rssd_index_ida);
+
+static int mtip_block_initialize(struct driver_data *dd);
+
+#ifdef CONFIG_COMPAT
+struct mtip_compat_ide_task_request_s {
+	__u8		io_ports[8];
+	__u8		hob_ports[8];
+	ide_reg_valid_t	out_flags;
+	ide_reg_valid_t	in_flags;
+	int		data_phase;
+	int		req_cmd;
+	compat_ulong_t	out_size;
+	compat_ulong_t	in_size;
+};
+#endif
+
+/*
+ * This function check_for_surprise_removal is called
+ * while card is removed from the system and it will
+ * read the vendor id from the configuration space
+ *
+ * @pdev Pointer to the pci_dev structure.
+ *
+ * return value
+ *	 true if device removed, else false
+ */
+static bool mtip_check_surprise_removal(struct driver_data *dd)
+{
+	u16 vendor_id = 0;
+
+	if (dd->sr)
+		return true;
+
+       /* Read the vendorID from the configuration space */
+	pci_read_config_word(dd->pdev, 0x00, &vendor_id);
+	if (vendor_id == 0xFFFF) {
+		dd->sr = true;
+		if (dd->disk)
+			blk_mark_disk_dead(dd->disk);
+		return true; /* device removed */
+	}
+
+	return false; /* device present */
+}
+
+static struct mtip_cmd *mtip_cmd_from_tag(struct driver_data *dd,
+					  unsigned int tag)
+{
+	return blk_mq_rq_to_pdu(blk_mq_tag_to_rq(dd->tags.tags[0], tag));
+}
+
+/*
+ * Reset the HBA (without sleeping)
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ *	0	The reset was successful.
+ *	-1	The HBA Reset bit did not clear.
+ */
+static int mtip_hba_reset(struct driver_data *dd)
+{
+	unsigned long timeout;
+
+	/* Set the reset bit */
+	writel(HOST_RESET, dd->mmio + HOST_CTL);
+
+	/* Flush */
+	readl(dd->mmio + HOST_CTL);
+
+	/*
+	 * Spin for up to 10 seconds waiting for reset acknowledgement. Spec
+	 * is 1 sec but in LUN failure conditions, up to 10 secs are required
+	 */
+	timeout = jiffies + msecs_to_jiffies(10000);
+	do {
+		mdelay(10);
+		if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))
+			return -1;
+
+	} while ((readl(dd->mmio + HOST_CTL) & HOST_RESET)
+		 && time_before(jiffies, timeout));
+
+	if (readl(dd->mmio + HOST_CTL) & HOST_RESET)
+		return -1;
+
+	return 0;
+}
+
+/*
+ * Issue a command to the hardware.
+ *
+ * Set the appropriate bit in the s_active and Command Issue hardware
+ * registers, causing hardware command processing to begin.
+ *
+ * @port Pointer to the port structure.
+ * @tag  The tag of the command to be issued.
+ *
+ * return value
+ *      None
+ */
+static inline void mtip_issue_ncq_command(struct mtip_port *port, int tag)
+{
+	int group = tag >> 5;
+
+	/* guard SACT and CI registers */
+	spin_lock(&port->cmd_issue_lock[group]);
+	writel((1 << MTIP_TAG_BIT(tag)),
+			port->s_active[MTIP_TAG_INDEX(tag)]);
+	writel((1 << MTIP_TAG_BIT(tag)),
+			port->cmd_issue[MTIP_TAG_INDEX(tag)]);
+	spin_unlock(&port->cmd_issue_lock[group]);
+}
+
+/*
+ * Enable/disable the reception of FIS
+ *
+ * @port   Pointer to the port data structure
+ * @enable 1 to enable, 0 to disable
+ *
+ * return value
+ *	Previous state: 1 enabled, 0 disabled
+ */
+static int mtip_enable_fis(struct mtip_port *port, int enable)
+{
+	u32 tmp;
+
+	/* enable FIS reception */
+	tmp = readl(port->mmio + PORT_CMD);
+	if (enable)
+		writel(tmp | PORT_CMD_FIS_RX, port->mmio + PORT_CMD);
+	else
+		writel(tmp & ~PORT_CMD_FIS_RX, port->mmio + PORT_CMD);
+
+	/* Flush */
+	readl(port->mmio + PORT_CMD);
+
+	return (((tmp & PORT_CMD_FIS_RX) == PORT_CMD_FIS_RX));
+}
+
+/*
+ * Enable/disable the DMA engine
+ *
+ * @port   Pointer to the port data structure
+ * @enable 1 to enable, 0 to disable
+ *
+ * return value
+ *	Previous state: 1 enabled, 0 disabled.
+ */
+static int mtip_enable_engine(struct mtip_port *port, int enable)
+{
+	u32 tmp;
+
+	/* enable FIS reception */
+	tmp = readl(port->mmio + PORT_CMD);
+	if (enable)
+		writel(tmp | PORT_CMD_START, port->mmio + PORT_CMD);
+	else
+		writel(tmp & ~PORT_CMD_START, port->mmio + PORT_CMD);
+
+	readl(port->mmio + PORT_CMD);
+	return (((tmp & PORT_CMD_START) == PORT_CMD_START));
+}
+
+/*
+ * Enables the port DMA engine and FIS reception.
+ *
+ * return value
+ *	None
+ */
+static inline void mtip_start_port(struct mtip_port *port)
+{
+	/* Enable FIS reception */
+	mtip_enable_fis(port, 1);
+
+	/* Enable the DMA engine */
+	mtip_enable_engine(port, 1);
+}
+
+/*
+ * Deinitialize a port by disabling port interrupts, the DMA engine,
+ * and FIS reception.
+ *
+ * @port Pointer to the port structure
+ *
+ * return value
+ *	None
+ */
+static inline void mtip_deinit_port(struct mtip_port *port)
+{
+	/* Disable interrupts on this port */
+	writel(0, port->mmio + PORT_IRQ_MASK);
+
+	/* Disable the DMA engine */
+	mtip_enable_engine(port, 0);
+
+	/* Disable FIS reception */
+	mtip_enable_fis(port, 0);
+}
+
+/*
+ * Initialize a port.
+ *
+ * This function deinitializes the port by calling mtip_deinit_port() and
+ * then initializes it by setting the command header and RX FIS addresses,
+ * clearing the SError register and any pending port interrupts before
+ * re-enabling the default set of port interrupts.
+ *
+ * @port Pointer to the port structure.
+ *
+ * return value
+ *	None
+ */
+static void mtip_init_port(struct mtip_port *port)
+{
+	int i;
+	mtip_deinit_port(port);
+
+	/* Program the command list base and FIS base addresses */
+	if (readl(port->dd->mmio + HOST_CAP) & HOST_CAP_64) {
+		writel((port->command_list_dma >> 16) >> 16,
+			 port->mmio + PORT_LST_ADDR_HI);
+		writel((port->rxfis_dma >> 16) >> 16,
+			 port->mmio + PORT_FIS_ADDR_HI);
+		set_bit(MTIP_PF_HOST_CAP_64, &port->flags);
+	}
+
+	writel(port->command_list_dma & 0xFFFFFFFF,
+			port->mmio + PORT_LST_ADDR);
+	writel(port->rxfis_dma & 0xFFFFFFFF, port->mmio + PORT_FIS_ADDR);
+
+	/* Clear SError */
+	writel(readl(port->mmio + PORT_SCR_ERR), port->mmio + PORT_SCR_ERR);
+
+	/* reset the completed registers.*/
+	for (i = 0; i < port->dd->slot_groups; i++)
+		writel(0xFFFFFFFF, port->completed[i]);
+
+	/* Clear any pending interrupts for this port */
+	writel(readl(port->mmio + PORT_IRQ_STAT), port->mmio + PORT_IRQ_STAT);
+
+	/* Clear any pending interrupts on the HBA. */
+	writel(readl(port->dd->mmio + HOST_IRQ_STAT),
+					port->dd->mmio + HOST_IRQ_STAT);
+
+	/* Enable port interrupts */
+	writel(DEF_PORT_IRQ, port->mmio + PORT_IRQ_MASK);
+}
+
+/*
+ * Restart a port
+ *
+ * @port Pointer to the port data structure.
+ *
+ * return value
+ *	None
+ */
+static void mtip_restart_port(struct mtip_port *port)
+{
+	unsigned long timeout;
+
+	/* Disable the DMA engine */
+	mtip_enable_engine(port, 0);
+
+	/* Chip quirk: wait up to 500ms for PxCMD.CR == 0 */
+	timeout = jiffies + msecs_to_jiffies(500);
+	while ((readl(port->mmio + PORT_CMD) & PORT_CMD_LIST_ON)
+		 && time_before(jiffies, timeout))
+		;
+
+	if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag))
+		return;
+
+	/*
+	 * Chip quirk: escalate to hba reset if
+	 * PxCMD.CR not clear after 500 ms
+	 */
+	if (readl(port->mmio + PORT_CMD) & PORT_CMD_LIST_ON) {
+		dev_warn(&port->dd->pdev->dev,
+			"PxCMD.CR not clear, escalating reset\n");
+
+		if (mtip_hba_reset(port->dd))
+			dev_err(&port->dd->pdev->dev,
+				"HBA reset escalation failed.\n");
+
+		/* 30 ms delay before com reset to quiesce chip */
+		mdelay(30);
+	}
+
+	dev_warn(&port->dd->pdev->dev, "Issuing COM reset\n");
+
+	/* Set PxSCTL.DET */
+	writel(readl(port->mmio + PORT_SCR_CTL) |
+			 1, port->mmio + PORT_SCR_CTL);
+	readl(port->mmio + PORT_SCR_CTL);
+
+	/* Wait 1 ms to quiesce chip function */
+	timeout = jiffies + msecs_to_jiffies(1);
+	while (time_before(jiffies, timeout))
+		;
+
+	if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag))
+		return;
+
+	/* Clear PxSCTL.DET */
+	writel(readl(port->mmio + PORT_SCR_CTL) & ~1,
+			 port->mmio + PORT_SCR_CTL);
+	readl(port->mmio + PORT_SCR_CTL);
+
+	/* Wait 500 ms for bit 0 of PORT_SCR_STS to be set */
+	timeout = jiffies + msecs_to_jiffies(500);
+	while (((readl(port->mmio + PORT_SCR_STAT) & 0x01) == 0)
+			 && time_before(jiffies, timeout))
+		;
+
+	if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag))
+		return;
+
+	if ((readl(port->mmio + PORT_SCR_STAT) & 0x01) == 0)
+		dev_warn(&port->dd->pdev->dev,
+			"COM reset failed\n");
+
+	mtip_init_port(port);
+	mtip_start_port(port);
+
+}
+
+static int mtip_device_reset(struct driver_data *dd)
+{
+	int rv = 0;
+
+	if (mtip_check_surprise_removal(dd))
+		return 0;
+
+	if (mtip_hba_reset(dd) < 0)
+		rv = -EFAULT;
+
+	mdelay(1);
+	mtip_init_port(dd->port);
+	mtip_start_port(dd->port);
+
+	/* Enable interrupts on the HBA. */
+	writel(readl(dd->mmio + HOST_CTL) | HOST_IRQ_EN,
+					dd->mmio + HOST_CTL);
+	return rv;
+}
+
+/*
+ * Helper function for tag logging
+ */
+static void print_tags(struct driver_data *dd,
+			char *msg,
+			unsigned long *tagbits,
+			int cnt)
+{
+	unsigned char tagmap[128];
+	int group, tagmap_len = 0;
+
+	memset(tagmap, 0, sizeof(tagmap));
+	for (group = SLOTBITS_IN_LONGS; group > 0; group--)
+		tagmap_len += sprintf(tagmap + tagmap_len, "%016lX ",
+						tagbits[group-1]);
+	dev_warn(&dd->pdev->dev,
+			"%d command(s) %s: tagmap [%s]", cnt, msg, tagmap);
+}
+
+static int mtip_read_log_page(struct mtip_port *port, u8 page, u16 *buffer,
+				dma_addr_t buffer_dma, unsigned int sectors);
+static int mtip_get_smart_attr(struct mtip_port *port, unsigned int id,
+						struct smart_attr *attrib);
+
+static void mtip_complete_command(struct mtip_cmd *cmd, blk_status_t status)
+{
+	struct request *req = blk_mq_rq_from_pdu(cmd);
+
+	cmd->status = status;
+	if (likely(!blk_should_fake_timeout(req->q)))
+		blk_mq_complete_request(req);
+}
+
+/*
+ * Handle an error.
+ *
+ * @dd Pointer to the DRIVER_DATA structure.
+ *
+ * return value
+ *	None
+ */
+static void mtip_handle_tfe(struct driver_data *dd)
+{
+	int group, tag, bit, reissue, rv;
+	struct mtip_port *port;
+	struct mtip_cmd  *cmd;
+	u32 completed;
+	struct host_to_dev_fis *fis;
+	unsigned long tagaccum[SLOTBITS_IN_LONGS];
+	unsigned int cmd_cnt = 0;
+	unsigned char *buf;
+	char *fail_reason = NULL;
+	int fail_all_ncq_write = 0, fail_all_ncq_cmds = 0;
+
+	dev_warn(&dd->pdev->dev, "Taskfile error\n");
+
+	port = dd->port;
+
+	if (test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags)) {
+		cmd = mtip_cmd_from_tag(dd, MTIP_TAG_INTERNAL);
+		dbg_printk(MTIP_DRV_NAME " TFE for the internal command\n");
+		mtip_complete_command(cmd, BLK_STS_IOERR);
+		return;
+	}
+
+	/* clear the tag accumulator */
+	memset(tagaccum, 0, SLOTBITS_IN_LONGS * sizeof(long));
+
+	/* Loop through all the groups */
+	for (group = 0; group < dd->slot_groups; group++) {
+		completed = readl(port->completed[group]);
+
+		dev_warn(&dd->pdev->dev, "g=%u, comp=%x\n", group, completed);
+
+		/* clear completed status register in the hardware.*/
+		writel(completed, port->completed[group]);
+
+		/* Process successfully completed commands */
+		for (bit = 0; bit < 32 && completed; bit++) {
+			if (!(completed & (1<<bit)))
+				continue;
+			tag = (group << 5) + bit;
+
+			/* Skip the internal command slot */
+			if (tag == MTIP_TAG_INTERNAL)
+				continue;
+
+			cmd = mtip_cmd_from_tag(dd, tag);
+			mtip_complete_command(cmd, 0);
+			set_bit(tag, tagaccum);
+			cmd_cnt++;
+		}
+	}
+
+	print_tags(dd, "completed (TFE)", tagaccum, cmd_cnt);
+
+	/* Restart the port */
+	mdelay(20);
+	mtip_restart_port(port);
+
+	/* Trying to determine the cause of the error */
+	rv = mtip_read_log_page(dd->port, ATA_LOG_SATA_NCQ,
+				dd->port->log_buf,
+				dd->port->log_buf_dma, 1);
+	if (rv) {
+		dev_warn(&dd->pdev->dev,
+			"Error in READ LOG EXT (10h) command\n");
+		/* non-critical error, don't fail the load */
+	} else {
+		buf = (unsigned char *)dd->port->log_buf;
+		if (buf[259] & 0x1) {
+			dev_info(&dd->pdev->dev,
+				"Write protect bit is set.\n");
+			set_bit(MTIP_DDF_WRITE_PROTECT_BIT, &dd->dd_flag);
+			fail_all_ncq_write = 1;
+			fail_reason = "write protect";
+		}
+		if (buf[288] == 0xF7) {
+			dev_info(&dd->pdev->dev,
+				"Exceeded Tmax, drive in thermal shutdown.\n");
+			set_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag);
+			fail_all_ncq_cmds = 1;
+			fail_reason = "thermal shutdown";
+		}
+		if (buf[288] == 0xBF) {
+			set_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag);
+			dev_info(&dd->pdev->dev,
+				"Drive indicates rebuild has failed. Secure erase required.\n");
+			fail_all_ncq_cmds = 1;
+			fail_reason = "rebuild failed";
+		}
+	}
+
+	/* clear the tag accumulator */
+	memset(tagaccum, 0, SLOTBITS_IN_LONGS * sizeof(long));
+
+	/* Loop through all the groups */
+	for (group = 0; group < dd->slot_groups; group++) {
+		for (bit = 0; bit < 32; bit++) {
+			reissue = 1;
+			tag = (group << 5) + bit;
+			cmd = mtip_cmd_from_tag(dd, tag);
+
+			fis = (struct host_to_dev_fis *)cmd->command;
+
+			/* Should re-issue? */
+			if (tag == MTIP_TAG_INTERNAL ||
+			    fis->command == ATA_CMD_SET_FEATURES)
+				reissue = 0;
+			else {
+				if (fail_all_ncq_cmds ||
+					(fail_all_ncq_write &&
+					fis->command == ATA_CMD_FPDMA_WRITE)) {
+					dev_warn(&dd->pdev->dev,
+					"  Fail: %s w/tag %d [%s].\n",
+					fis->command == ATA_CMD_FPDMA_WRITE ?
+						"write" : "read",
+					tag,
+					fail_reason != NULL ?
+						fail_reason : "unknown");
+					mtip_complete_command(cmd, BLK_STS_MEDIUM);
+					continue;
+				}
+			}
+
+			/*
+			 * First check if this command has
+			 *  exceeded its retries.
+			 */
+			if (reissue && (cmd->retries-- > 0)) {
+
+				set_bit(tag, tagaccum);
+
+				/* Re-issue the command. */
+				mtip_issue_ncq_command(port, tag);
+
+				continue;
+			}
+
+			/* Retire a command that will not be reissued */
+			dev_warn(&port->dd->pdev->dev,
+				"retiring tag %d\n", tag);
+
+			mtip_complete_command(cmd, BLK_STS_IOERR);
+		}
+	}
+	print_tags(dd, "reissued (TFE)", tagaccum, cmd_cnt);
+}
+
+/*
+ * Handle a set device bits interrupt
+ */
+static inline void mtip_workq_sdbfx(struct mtip_port *port, int group,
+							u32 completed)
+{
+	struct driver_data *dd = port->dd;
+	int tag, bit;
+	struct mtip_cmd *command;
+
+	if (!completed) {
+		WARN_ON_ONCE(!completed);
+		return;
+	}
+	/* clear completed status register in the hardware.*/
+	writel(completed, port->completed[group]);
+
+	/* Process completed commands. */
+	for (bit = 0; (bit < 32) && completed; bit++) {
+		if (completed & 0x01) {
+			tag = (group << 5) | bit;
+
+			/* skip internal command slot. */
+			if (unlikely(tag == MTIP_TAG_INTERNAL))
+				continue;
+
+			command = mtip_cmd_from_tag(dd, tag);
+			mtip_complete_command(command, 0);
+		}
+		completed >>= 1;
+	}
+
+	/* If last, re-enable interrupts */
+	if (atomic_dec_return(&dd->irq_workers_active) == 0)
+		writel(0xffffffff, dd->mmio + HOST_IRQ_STAT);
+}
+
+/*
+ * Process legacy pio and d2h interrupts
+ */
+static inline void mtip_process_legacy(struct driver_data *dd, u32 port_stat)
+{
+	struct mtip_port *port = dd->port;
+	struct mtip_cmd *cmd = mtip_cmd_from_tag(dd, MTIP_TAG_INTERNAL);
+
+	if (test_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags) && cmd) {
+		int group = MTIP_TAG_INDEX(MTIP_TAG_INTERNAL);
+		int status = readl(port->cmd_issue[group]);
+
+		if (!(status & (1 << MTIP_TAG_BIT(MTIP_TAG_INTERNAL))))
+			mtip_complete_command(cmd, 0);
+	}
+}
+
+/*
+ * Demux and handle errors
+ */
+static inline void mtip_process_errors(struct driver_data *dd, u32 port_stat)
+{
+	if (unlikely(port_stat & PORT_IRQ_CONNECT)) {
+		dev_warn(&dd->pdev->dev,
+			"Clearing PxSERR.DIAG.x\n");
+		writel((1 << 26), dd->port->mmio + PORT_SCR_ERR);
+	}
+
+	if (unlikely(port_stat & PORT_IRQ_PHYRDY)) {
+		dev_warn(&dd->pdev->dev,
+			"Clearing PxSERR.DIAG.n\n");
+		writel((1 << 16), dd->port->mmio + PORT_SCR_ERR);
+	}
+
+	if (unlikely(port_stat & ~PORT_IRQ_HANDLED)) {
+		dev_warn(&dd->pdev->dev,
+			"Port stat errors %x unhandled\n",
+			(port_stat & ~PORT_IRQ_HANDLED));
+		if (mtip_check_surprise_removal(dd))
+			return;
+	}
+	if (likely(port_stat & (PORT_IRQ_TF_ERR | PORT_IRQ_IF_ERR))) {
+		set_bit(MTIP_PF_EH_ACTIVE_BIT, &dd->port->flags);
+		wake_up_interruptible(&dd->port->svc_wait);
+	}
+}
+
+static inline irqreturn_t mtip_handle_irq(struct driver_data *data)
+{
+	struct driver_data *dd = (struct driver_data *) data;
+	struct mtip_port *port = dd->port;
+	u32 hba_stat, port_stat;
+	int rv = IRQ_NONE;
+	int do_irq_enable = 1, i, workers;
+	struct mtip_work *twork;
+
+	hba_stat = readl(dd->mmio + HOST_IRQ_STAT);
+	if (hba_stat) {
+		rv = IRQ_HANDLED;
+
+		/* Acknowledge the interrupt status on the port.*/
+		port_stat = readl(port->mmio + PORT_IRQ_STAT);
+		if (unlikely(port_stat == 0xFFFFFFFF)) {
+			mtip_check_surprise_removal(dd);
+			return IRQ_HANDLED;
+		}
+		writel(port_stat, port->mmio + PORT_IRQ_STAT);
+
+		/* Demux port status */
+		if (likely(port_stat & PORT_IRQ_SDB_FIS)) {
+			do_irq_enable = 0;
+			WARN_ON_ONCE(atomic_read(&dd->irq_workers_active) != 0);
+
+			/* Start at 1: group zero is always local? */
+			for (i = 0, workers = 0; i < MTIP_MAX_SLOT_GROUPS;
+									i++) {
+				twork = &dd->work[i];
+				twork->completed = readl(port->completed[i]);
+				if (twork->completed)
+					workers++;
+			}
+
+			atomic_set(&dd->irq_workers_active, workers);
+			if (workers) {
+				for (i = 1; i < MTIP_MAX_SLOT_GROUPS; i++) {
+					twork = &dd->work[i];
+					if (twork->completed)
+						queue_work_on(
+							twork->cpu_binding,
+							dd->isr_workq,
+							&twork->work);
+				}
+
+				if (likely(dd->work[0].completed))
+					mtip_workq_sdbfx(port, 0,
+							dd->work[0].completed);
+
+			} else {
+				/*
+				 * Chip quirk: SDB interrupt but nothing
+				 * to complete
+				 */
+				do_irq_enable = 1;
+			}
+		}
+
+		if (unlikely(port_stat & PORT_IRQ_ERR)) {
+			if (unlikely(mtip_check_surprise_removal(dd))) {
+				/* don't proceed further */
+				return IRQ_HANDLED;
+			}
+			if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
+							&dd->dd_flag))
+				return rv;
+
+			mtip_process_errors(dd, port_stat & PORT_IRQ_ERR);
+		}
+
+		if (unlikely(port_stat & PORT_IRQ_LEGACY))
+			mtip_process_legacy(dd, port_stat & PORT_IRQ_LEGACY);
+	}
+
+	/* acknowledge interrupt */
+	if (unlikely(do_irq_enable))
+		writel(hba_stat, dd->mmio + HOST_IRQ_STAT);
+
+	return rv;
+}
+
+/*
+ * HBA interrupt subroutine.
+ *
+ * @irq		IRQ number.
+ * @instance	Pointer to the driver data structure.
+ *
+ * return value
+ *	IRQ_HANDLED	A HBA interrupt was pending and handled.
+ *	IRQ_NONE	This interrupt was not for the HBA.
+ */
+static irqreturn_t mtip_irq_handler(int irq, void *instance)
+{
+	struct driver_data *dd = instance;
+
+	return mtip_handle_irq(dd);
+}
+
+static void mtip_issue_non_ncq_command(struct mtip_port *port, int tag)
+{
+	writel(1 << MTIP_TAG_BIT(tag), port->cmd_issue[MTIP_TAG_INDEX(tag)]);
+}
+
+static bool mtip_pause_ncq(struct mtip_port *port,
+				struct host_to_dev_fis *fis)
+{
+	unsigned long task_file_data;
+
+	task_file_data = readl(port->mmio+PORT_TFDATA);
+	if ((task_file_data & 1))
+		return false;
+
+	if (fis->command == ATA_CMD_SEC_ERASE_PREP) {
+		port->ic_pause_timer = jiffies;
+		return true;
+	} else if ((fis->command == ATA_CMD_DOWNLOAD_MICRO) &&
+					(fis->features == 0x03)) {
+		set_bit(MTIP_PF_DM_ACTIVE_BIT, &port->flags);
+		port->ic_pause_timer = jiffies;
+		return true;
+	} else if ((fis->command == ATA_CMD_SEC_ERASE_UNIT) ||
+		((fis->command == 0xFC) &&
+			(fis->features == 0x27 || fis->features == 0x72 ||
+			 fis->features == 0x62 || fis->features == 0x26))) {
+		clear_bit(MTIP_DDF_SEC_LOCK_BIT, &port->dd->dd_flag);
+		clear_bit(MTIP_DDF_REBUILD_FAILED_BIT, &port->dd->dd_flag);
+		/* Com reset after secure erase or lowlevel format */
+		mtip_restart_port(port);
+		clear_bit(MTIP_PF_SE_ACTIVE_BIT, &port->flags);
+		return false;
+	}
+
+	return false;
+}
+
+static bool mtip_commands_active(struct mtip_port *port)
+{
+	unsigned int active;
+	unsigned int n;
+
+	/*
+	 * Ignore s_active bit 0 of array element 0.
+	 * This bit will always be set
+	 */
+	active = readl(port->s_active[0]) & 0xFFFFFFFE;
+	for (n = 1; n < port->dd->slot_groups; n++)
+		active |= readl(port->s_active[n]);
+
+	return active != 0;
+}
+
+/*
+ * Wait for port to quiesce
+ *
+ * @port    Pointer to port data structure
+ * @timeout Max duration to wait (ms)
+ *
+ * return value
+ *	0	Success
+ *	-EBUSY  Commands still active
+ */
+static int mtip_quiesce_io(struct mtip_port *port, unsigned long timeout)
+{
+	unsigned long to;
+	bool active = true;
+
+	blk_mq_quiesce_queue(port->dd->queue);
+
+	to = jiffies + msecs_to_jiffies(timeout);
+	do {
+		if (test_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags) &&
+			test_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags)) {
+			msleep(20);
+			continue; /* svc thd is actively issuing commands */
+		}
+
+		msleep(100);
+
+		if (mtip_check_surprise_removal(port->dd))
+			goto err_fault;
+
+		active = mtip_commands_active(port);
+		if (!active)
+			break;
+	} while (time_before(jiffies, to));
+
+	blk_mq_unquiesce_queue(port->dd->queue);
+	return active ? -EBUSY : 0;
+err_fault:
+	blk_mq_unquiesce_queue(port->dd->queue);
+	return -EFAULT;
+}
+
+struct mtip_int_cmd {
+	int fis_len;
+	dma_addr_t buffer;
+	int buf_len;
+	u32 opts;
+};
+
+/*
+ * Execute an internal command and wait for the completion.
+ *
+ * @port    Pointer to the port data structure.
+ * @fis     Pointer to the FIS that describes the command.
+ * @fis_len  Length in WORDS of the FIS.
+ * @buffer  DMA accessible for command data.
+ * @buf_len  Length, in bytes, of the data buffer.
+ * @opts    Command header options, excluding the FIS length
+ *             and the number of PRD entries.
+ * @timeout Time in ms to wait for the command to complete.
+ *
+ * return value
+ *	0	 Command completed successfully.
+ *	-EFAULT  The buffer address is not correctly aligned.
+ *	-EBUSY   Internal command or other IO in progress.
+ *	-EAGAIN  Time out waiting for command to complete.
+ */
+static int mtip_exec_internal_command(struct mtip_port *port,
+					struct host_to_dev_fis *fis,
+					int fis_len,
+					dma_addr_t buffer,
+					int buf_len,
+					u32 opts,
+					unsigned long timeout)
+{
+	struct mtip_cmd *int_cmd;
+	struct driver_data *dd = port->dd;
+	struct request *rq;
+	struct mtip_int_cmd icmd = {
+		.fis_len = fis_len,
+		.buffer = buffer,
+		.buf_len = buf_len,
+		.opts = opts
+	};
+	int rv = 0;
+
+	/* Make sure the buffer is 8 byte aligned. This is asic specific. */
+	if (buffer & 0x00000007) {
+		dev_err(&dd->pdev->dev, "SG buffer is not 8 byte aligned\n");
+		return -EFAULT;
+	}
+
+	if (mtip_check_surprise_removal(dd))
+		return -EFAULT;
+
+	rq = blk_mq_alloc_request(dd->queue, REQ_OP_DRV_IN, BLK_MQ_REQ_RESERVED);
+	if (IS_ERR(rq)) {
+		dbg_printk(MTIP_DRV_NAME "Unable to allocate tag for PIO cmd\n");
+		return -EFAULT;
+	}
+
+	set_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags);
+
+	if (fis->command == ATA_CMD_SEC_ERASE_PREP)
+		set_bit(MTIP_PF_SE_ACTIVE_BIT, &port->flags);
+
+	clear_bit(MTIP_PF_DM_ACTIVE_BIT, &port->flags);
+
+	if (fis->command != ATA_CMD_STANDBYNOW1) {
+		/* wait for io to complete if non atomic */
+		if (mtip_quiesce_io(port, MTIP_QUIESCE_IO_TIMEOUT_MS) < 0) {
+			dev_warn(&dd->pdev->dev, "Failed to quiesce IO\n");
+			blk_mq_free_request(rq);
+			clear_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags);
+			wake_up_interruptible(&port->svc_wait);
+			return -EBUSY;
+		}
+	}
+
+	/* Copy the command to the command table */
+	int_cmd = blk_mq_rq_to_pdu(rq);
+	int_cmd->icmd = &icmd;
+	memcpy(int_cmd->command, fis, fis_len*4);
+
+	rq->timeout = timeout;
+
+	/* insert request and run queue */
+	blk_execute_rq(rq, true);
+
+	if (int_cmd->status) {
+		dev_err(&dd->pdev->dev, "Internal command [%02X] failed %d\n",
+				fis->command, int_cmd->status);
+		rv = -EIO;
+
+		if (mtip_check_surprise_removal(dd) ||
+			test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
+					&dd->dd_flag)) {
+			dev_err(&dd->pdev->dev,
+				"Internal command [%02X] wait returned due to SR\n",
+				fis->command);
+			rv = -ENXIO;
+			goto exec_ic_exit;
+		}
+		mtip_device_reset(dd); /* recover from timeout issue */
+		rv = -EAGAIN;
+		goto exec_ic_exit;
+	}
+
+	if (readl(port->cmd_issue[MTIP_TAG_INDEX(MTIP_TAG_INTERNAL)])
+			& (1 << MTIP_TAG_BIT(MTIP_TAG_INTERNAL))) {
+		rv = -ENXIO;
+		if (!test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag)) {
+			mtip_device_reset(dd);
+			rv = -EAGAIN;
+		}
+	}
+exec_ic_exit:
+	/* Clear the allocated and active bits for the internal command. */
+	blk_mq_free_request(rq);
+	clear_bit(MTIP_PF_IC_ACTIVE_BIT, &port->flags);
+	if (rv >= 0 && mtip_pause_ncq(port, fis)) {
+		/* NCQ paused */
+		return rv;
+	}
+	wake_up_interruptible(&port->svc_wait);
+
+	return rv;
+}
+
+/*
+ * Byte-swap ATA ID strings.
+ *
+ * ATA identify data contains strings in byte-swapped 16-bit words.
+ * They must be swapped (on all architectures) to be usable as C strings.
+ * This function swaps bytes in-place.
+ *
+ * @buf The buffer location of the string
+ * @len The number of bytes to swap
+ *
+ * return value
+ *	None
+ */
+static inline void ata_swap_string(u16 *buf, unsigned int len)
+{
+	int i;
+	for (i = 0; i < (len/2); i++)
+		be16_to_cpus(&buf[i]);
+}
+
+static void mtip_set_timeout(struct driver_data *dd,
+					struct host_to_dev_fis *fis,
+					unsigned int *timeout, u8 erasemode)
+{
+	switch (fis->command) {
+	case ATA_CMD_DOWNLOAD_MICRO:
+		*timeout = 120000; /* 2 minutes */
+		break;
+	case ATA_CMD_SEC_ERASE_UNIT:
+	case 0xFC:
+		if (erasemode)
+			*timeout = ((*(dd->port->identify + 90) * 2) * 60000);
+		else
+			*timeout = ((*(dd->port->identify + 89) * 2) * 60000);
+		break;
+	case ATA_CMD_STANDBYNOW1:
+		*timeout = 120000;  /* 2 minutes */
+		break;
+	case 0xF7:
+	case 0xFA:
+		*timeout = 60000;  /* 60 seconds */
+		break;
+	case ATA_CMD_SMART:
+		*timeout = 15000;  /* 15 seconds */
+		break;
+	default:
+		*timeout = MTIP_IOCTL_CMD_TIMEOUT_MS;
+		break;
+	}
+}
+
+/*
+ * Request the device identity information.
+ *
+ * If a user space buffer is not specified, i.e. is NULL, the
+ * identify information is still read from the drive and placed
+ * into the identify data buffer (@e port->identify) in the
+ * port data structure.
+ * When the identify buffer contains valid identify information @e
+ * port->identify_valid is non-zero.
+ *
+ * @port	 Pointer to the port structure.
+ * @user_buffer  A user space buffer where the identify data should be
+ *                    copied.
+ *
+ * return value
+ *	0	Command completed successfully.
+ *	-EFAULT An error occurred while coping data to the user buffer.
+ *	-1	Command failed.
+ */
+static int mtip_get_identify(struct mtip_port *port, void __user *user_buffer)
+{
+	int rv = 0;
+	struct host_to_dev_fis fis;
+
+	if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &port->dd->dd_flag))
+		return -EFAULT;
+
+	/* Build the FIS. */
+	memset(&fis, 0, sizeof(struct host_to_dev_fis));
+	fis.type	= 0x27;
+	fis.opts	= 1 << 7;
+	fis.command	= ATA_CMD_ID_ATA;
+
+	/* Set the identify information as invalid. */
+	port->identify_valid = 0;
+
+	/* Clear the identify information. */
+	memset(port->identify, 0, sizeof(u16) * ATA_ID_WORDS);
+
+	/* Execute the command. */
+	if (mtip_exec_internal_command(port,
+				&fis,
+				5,
+				port->identify_dma,
+				sizeof(u16) * ATA_ID_WORDS,
+				0,
+				MTIP_INT_CMD_TIMEOUT_MS)
+				< 0) {
+		rv = -1;
+		goto out;
+	}
+
+	/*
+	 * Perform any necessary byte-swapping.  Yes, the kernel does in fact
+	 * perform field-sensitive swapping on the string fields.
+	 * See the kernel use of ata_id_string() for proof of this.
+	 */
+#ifdef __LITTLE_ENDIAN
+	ata_swap_string(port->identify + 27, 40);  /* model string*/
+	ata_swap_string(port->identify + 23, 8);   /* firmware string*/
+	ata_swap_string(port->identify + 10, 20);  /* serial# string*/
+#else
+	{
+		int i;
+		for (i = 0; i < ATA_ID_WORDS; i++)
+			port->identify[i] = le16_to_cpu(port->identify[i]);
+	}
+#endif
+
+	/* Check security locked state */
+	if (port->identify[128] & 0x4)
+		set_bit(MTIP_DDF_SEC_LOCK_BIT, &port->dd->dd_flag);
+	else
+		clear_bit(MTIP_DDF_SEC_LOCK_BIT, &port->dd->dd_flag);
+
+	/* Set the identify buffer as valid. */
+	port->identify_valid = 1;
+
+	if (user_buffer) {
+		if (copy_to_user(
+			user_buffer,
+			port->identify,
+			ATA_ID_WORDS * sizeof(u16))) {
+			rv = -EFAULT;
+			goto out;
+		}
+	}
+
+out:
+	return rv;
+}
+
+/*
+ * Issue a standby immediate command to the device.
+ *
+ * @port Pointer to the port structure.
+ *
+ * return value
+ *	0	Command was executed successfully.
+ *	-1	An error occurred while executing the command.
+ */
+static int mtip_standby_immediate(struct mtip_port *port)
+{
+	int rv;
+	struct host_to_dev_fis	fis;
+	unsigned long __maybe_unused start;
+	unsigned int timeout;
+
+	/* Build the FIS. */
+	memset(&fis, 0, sizeof(struct host_to_dev_fis));
+	fis.type	= 0x27;
+	fis.opts	= 1 << 7;
+	fis.command	= ATA_CMD_STANDBYNOW1;
+
+	mtip_set_timeout(port->dd, &fis, &timeout, 0);
+
+	start = jiffies;
+	rv = mtip_exec_internal_command(port,
+					&fis,
+					5,
+					0,
+					0,
+					0,
+					timeout);
+	dbg_printk(MTIP_DRV_NAME "Time taken to complete standby cmd: %d ms\n",
+			jiffies_to_msecs(jiffies - start));
+	if (rv)
+		dev_warn(&port->dd->pdev->dev,
+			"STANDBY IMMEDIATE command failed.\n");
+
+	return rv;
+}
+
+/*
+ * Issue a READ LOG EXT command to the device.
+ *
+ * @port	pointer to the port structure.
+ * @page	page number to fetch
+ * @buffer	pointer to buffer
+ * @buffer_dma	dma address corresponding to @buffer
+ * @sectors	page length to fetch, in sectors
+ *
+ * return value
+ *	@rv	return value from mtip_exec_internal_command()
+ */
+static int mtip_read_log_page(struct mtip_port *port, u8 page, u16 *buffer,
+				dma_addr_t buffer_dma, unsigned int sectors)
+{
+	struct host_to_dev_fis fis;
+
+	memset(&fis, 0, sizeof(struct host_to_dev_fis));
+	fis.type	= 0x27;
+	fis.opts	= 1 << 7;
+	fis.command	= ATA_CMD_READ_LOG_EXT;
+	fis.sect_count	= sectors & 0xFF;
+	fis.sect_cnt_ex	= (sectors >> 8) & 0xFF;
+	fis.lba_low	= page;
+	fis.lba_mid	= 0;
+	fis.device	= ATA_DEVICE_OBS;
+
+	memset(buffer, 0, sectors * ATA_SECT_SIZE);
+
+	return mtip_exec_internal_command(port,
+					&fis,
+					5,
+					buffer_dma,
+					sectors * ATA_SECT_SIZE,
+					0,
+					MTIP_INT_CMD_TIMEOUT_MS);
+}
+
+/*
+ * Issue a SMART READ DATA command to the device.
+ *
+ * @port	pointer to the port structure.
+ * @buffer	pointer to buffer
+ * @buffer_dma	dma address corresponding to @buffer
+ *
+ * return value
+ *	@rv	return value from mtip_exec_internal_command()
+ */
+static int mtip_get_smart_data(struct mtip_port *port, u8 *buffer,
+					dma_addr_t buffer_dma)
+{
+	struct host_to_dev_fis fis;
+
+	memset(&fis, 0, sizeof(struct host_to_dev_fis));
+	fis.type	= 0x27;
+	fis.opts	= 1 << 7;
+	fis.command	= ATA_CMD_SMART;
+	fis.features	= 0xD0;
+	fis.sect_count	= 1;
+	fis.lba_mid	= 0x4F;
+	fis.lba_hi	= 0xC2;
+	fis.device	= ATA_DEVICE_OBS;
+
+	return mtip_exec_internal_command(port,
+					&fis,
+					5,
+					buffer_dma,
+					ATA_SECT_SIZE,
+					0,
+					15000);
+}
+
+/*
+ * Get the value of a smart attribute
+ *
+ * @port	pointer to the port structure
+ * @id		attribute number
+ * @attrib	pointer to return attrib information corresponding to @id
+ *
+ * return value
+ *	-EINVAL	NULL buffer passed or unsupported attribute @id.
+ *	-EPERM	Identify data not valid, SMART not supported or not enabled
+ */
+static int mtip_get_smart_attr(struct mtip_port *port, unsigned int id,
+						struct smart_attr *attrib)
+{
+	int rv, i;
+	struct smart_attr *pattr;
+
+	if (!attrib)
+		return -EINVAL;
+
+	if (!port->identify_valid) {
+		dev_warn(&port->dd->pdev->dev, "IDENTIFY DATA not valid\n");
+		return -EPERM;
+	}
+	if (!(port->identify[82] & 0x1)) {
+		dev_warn(&port->dd->pdev->dev, "SMART not supported\n");
+		return -EPERM;
+	}
+	if (!(port->identify[85] & 0x1)) {
+		dev_warn(&port->dd->pdev->dev, "SMART not enabled\n");
+		return -EPERM;
+	}
+
+	memset(port->smart_buf, 0, ATA_SECT_SIZE);
+	rv = mtip_get_smart_data(port, port->smart_buf, port->smart_buf_dma);
+	if (rv) {
+		dev_warn(&port->dd->pdev->dev, "Failed to ge SMART data\n");
+		return rv;
+	}
+
+	pattr = (struct smart_attr *)(port->smart_buf + 2);
+	for (i = 0; i < 29; i++, pattr++)
+		if (pattr->attr_id == id) {
+			memcpy(attrib, pattr, sizeof(struct smart_attr));
+			break;
+		}
+
+	if (i == 29) {
+		dev_warn(&port->dd->pdev->dev,
+			"Query for invalid SMART attribute ID\n");
+		rv = -EINVAL;
+	}
+
+	return rv;
+}
+
+/*
+ * Get the drive capacity.
+ *
+ * @dd      Pointer to the device data structure.
+ * @sectors Pointer to the variable that will receive the sector count.
+ *
+ * return value
+ *	1 Capacity was returned successfully.
+ *	0 The identify information is invalid.
+ */
+static bool mtip_hw_get_capacity(struct driver_data *dd, sector_t *sectors)
+{
+	struct mtip_port *port = dd->port;
+	u64 total, raw0, raw1, raw2, raw3;
+	raw0 = port->identify[100];
+	raw1 = port->identify[101];
+	raw2 = port->identify[102];
+	raw3 = port->identify[103];
+	total = raw0 | raw1<<16 | raw2<<32 | raw3<<48;
+	*sectors = total;
+	return (bool) !!port->identify_valid;
+}
+
+/*
+ * Display the identify command data.
+ *
+ * @port Pointer to the port data structure.
+ *
+ * return value
+ *	None
+ */
+static void mtip_dump_identify(struct mtip_port *port)
+{
+	sector_t sectors;
+	unsigned short revid;
+	char cbuf[42];
+
+	if (!port->identify_valid)
+		return;
+
+	strscpy(cbuf, (char *)(port->identify + 10), 21);
+	dev_info(&port->dd->pdev->dev,
+		"Serial No.: %s\n", cbuf);
+
+	strscpy(cbuf, (char *)(port->identify + 23), 9);
+	dev_info(&port->dd->pdev->dev,
+		"Firmware Ver.: %s\n", cbuf);
+
+	strscpy(cbuf, (char *)(port->identify + 27), 41);
+	dev_info(&port->dd->pdev->dev, "Model: %s\n", cbuf);
+
+	dev_info(&port->dd->pdev->dev, "Security: %04x %s\n",
+		port->identify[128],
+		port->identify[128] & 0x4 ? "(LOCKED)" : "");
+
+	if (mtip_hw_get_capacity(port->dd, &sectors))
+		dev_info(&port->dd->pdev->dev,
+			"Capacity: %llu sectors (%llu MB)\n",
+			 (u64)sectors,
+			 ((u64)sectors) * ATA_SECT_SIZE >> 20);
+
+	pci_read_config_word(port->dd->pdev, PCI_REVISION_ID, &revid);
+	switch (revid & 0xFF) {
+	case 0x1:
+		strscpy(cbuf, "A0", 3);
+		break;
+	case 0x3:
+		strscpy(cbuf, "A2", 3);
+		break;
+	default:
+		strscpy(cbuf, "?", 2);
+		break;
+	}
+	dev_info(&port->dd->pdev->dev,
+		"Card Type: %s\n", cbuf);
+}
+
+/*
+ * Map the commands scatter list into the command table.
+ *
+ * @command Pointer to the command.
+ * @nents Number of scatter list entries.
+ *
+ * return value
+ *	None
+ */
+static inline void fill_command_sg(struct driver_data *dd,
+				struct mtip_cmd *command,
+				int nents)
+{
+	int n;
+	unsigned int dma_len;
+	struct mtip_cmd_sg *command_sg;
+	struct scatterlist *sg;
+
+	command_sg = command->command + AHCI_CMD_TBL_HDR_SZ;
+
+	for_each_sg(command->sg, sg, nents, n) {
+		dma_len = sg_dma_len(sg);
+		if (dma_len > 0x400000)
+			dev_err(&dd->pdev->dev,
+				"DMA segment length truncated\n");
+		command_sg->info = cpu_to_le32((dma_len-1) & 0x3FFFFF);
+		command_sg->dba	=  cpu_to_le32(sg_dma_address(sg));
+		command_sg->dba_upper =
+			cpu_to_le32((sg_dma_address(sg) >> 16) >> 16);
+		command_sg++;
+	}
+}
+
+/*
+ * @brief Execute a drive command.
+ *
+ * return value 0 The command completed successfully.
+ * return value -1 An error occurred while executing the command.
+ */
+static int exec_drive_task(struct mtip_port *port, u8 *command)
+{
+	struct host_to_dev_fis	fis;
+	struct host_to_dev_fis *reply = (port->rxfis + RX_FIS_D2H_REG);
+	unsigned int to;
+
+	/* Build the FIS. */
+	memset(&fis, 0, sizeof(struct host_to_dev_fis));
+	fis.type	= 0x27;
+	fis.opts	= 1 << 7;
+	fis.command	= command[0];
+	fis.features	= command[1];
+	fis.sect_count	= command[2];
+	fis.sector	= command[3];
+	fis.cyl_low	= command[4];
+	fis.cyl_hi	= command[5];
+	fis.device	= command[6] & ~0x10; /* Clear the dev bit*/
+
+	mtip_set_timeout(port->dd, &fis, &to, 0);
+
+	dbg_printk(MTIP_DRV_NAME " %s: User Command: cmd %x, feat %x, nsect %x, sect %x, lcyl %x, hcyl %x, sel %x\n",
+		__func__,
+		command[0],
+		command[1],
+		command[2],
+		command[3],
+		command[4],
+		command[5],
+		command[6]);
+
+	/* Execute the command. */
+	if (mtip_exec_internal_command(port,
+				 &fis,
+				 5,
+				 0,
+				 0,
+				 0,
+				 to) < 0) {
+		return -1;
+	}
+
+	command[0] = reply->command; /* Status*/
+	command[1] = reply->features; /* Error*/
+	command[4] = reply->cyl_low;
+	command[5] = reply->cyl_hi;
+
+	dbg_printk(MTIP_DRV_NAME " %s: Completion Status: stat %x, err %x , cyl_lo %x cyl_hi %x\n",
+		__func__,
+		command[0],
+		command[1],
+		command[4],
+		command[5]);
+
+	return 0;
+}
+
+/*
+ * @brief Execute a drive command.
+ *
+ * @param port Pointer to the port data structure.
+ * @param command Pointer to the user specified command parameters.
+ * @param user_buffer Pointer to the user space buffer where read sector
+ *                   data should be copied.
+ *
+ * return value 0 The command completed successfully.
+ * return value -EFAULT An error occurred while copying the completion
+ *                 data to the user space buffer.
+ * return value -1 An error occurred while executing the command.
+ */
+static int exec_drive_command(struct mtip_port *port, u8 *command,
+				void __user *user_buffer)
+{
+	struct host_to_dev_fis	fis;
+	struct host_to_dev_fis *reply;
+	u8 *buf = NULL;
+	dma_addr_t dma_addr = 0;
+	int rv = 0, xfer_sz = command[3];
+	unsigned int to;
+
+	if (xfer_sz) {
+		if (!user_buffer)
+			return -EFAULT;
+
+		buf = dma_alloc_coherent(&port->dd->pdev->dev,
+				ATA_SECT_SIZE * xfer_sz,
+				&dma_addr,
+				GFP_KERNEL);
+		if (!buf) {
+			dev_err(&port->dd->pdev->dev,
+				"Memory allocation failed (%d bytes)\n",
+				ATA_SECT_SIZE * xfer_sz);
+			return -ENOMEM;
+		}
+	}
+
+	/* Build the FIS. */
+	memset(&fis, 0, sizeof(struct host_to_dev_fis));
+	fis.type	= 0x27;
+	fis.opts	= 1 << 7;
+	fis.command	= command[0];
+	fis.features	= command[2];
+	fis.sect_count	= command[3];
+	if (fis.command == ATA_CMD_SMART) {
+		fis.sector	= command[1];
+		fis.cyl_low	= 0x4F;
+		fis.cyl_hi	= 0xC2;
+	}
+
+	mtip_set_timeout(port->dd, &fis, &to, 0);
+
+	if (xfer_sz)
+		reply = (port->rxfis + RX_FIS_PIO_SETUP);
+	else
+		reply = (port->rxfis + RX_FIS_D2H_REG);
+
+	dbg_printk(MTIP_DRV_NAME
+		" %s: User Command: cmd %x, sect %x, "
+		"feat %x, sectcnt %x\n",
+		__func__,
+		command[0],
+		command[1],
+		command[2],
+		command[3]);
+
+	/* Execute the command. */
+	if (mtip_exec_internal_command(port,
+				&fis,
+				 5,
+				 (xfer_sz ? dma_addr : 0),
+				 (xfer_sz ? ATA_SECT_SIZE * xfer_sz : 0),
+				 0,
+				 to)
+				 < 0) {
+		rv = -EFAULT;
+		goto exit_drive_command;
+	}
+
+	/* Collect the completion status. */
+	command[0] = reply->command; /* Status*/
+	command[1] = reply->features; /* Error*/
+	command[2] = reply->sect_count;
+
+	dbg_printk(MTIP_DRV_NAME
+		" %s: Completion Status: stat %x, "
+		"err %x, nsect %x\n",
+		__func__,
+		command[0],
+		command[1],
+		command[2]);
+
+	if (xfer_sz) {
+		if (copy_to_user(user_buffer,
+				 buf,
+				 ATA_SECT_SIZE * command[3])) {
+			rv = -EFAULT;
+			goto exit_drive_command;
+		}
+	}
+exit_drive_command:
+	if (buf)
+		dma_free_coherent(&port->dd->pdev->dev,
+				ATA_SECT_SIZE * xfer_sz, buf, dma_addr);
+	return rv;
+}
+
+/*
+ *  Indicates whether a command has a single sector payload.
+ *
+ *  @command passed to the device to perform the certain event.
+ *  @features passed to the device to perform the certain event.
+ *
+ *  return value
+ *	1	command is one that always has a single sector payload,
+ *		regardless of the value in the Sector Count field.
+ *      0       otherwise
+ *
+ */
+static unsigned int implicit_sector(unsigned char command,
+				    unsigned char features)
+{
+	unsigned int rv = 0;
+
+	/* list of commands that have an implicit sector count of 1 */
+	switch (command) {
+	case ATA_CMD_SEC_SET_PASS:
+	case ATA_CMD_SEC_UNLOCK:
+	case ATA_CMD_SEC_ERASE_PREP:
+	case ATA_CMD_SEC_ERASE_UNIT:
+	case ATA_CMD_SEC_FREEZE_LOCK:
+	case ATA_CMD_SEC_DISABLE_PASS:
+	case ATA_CMD_PMP_READ:
+	case ATA_CMD_PMP_WRITE:
+		rv = 1;
+		break;
+	case ATA_CMD_SET_MAX:
+		if (features == ATA_SET_MAX_UNLOCK)
+			rv = 1;
+		break;
+	case ATA_CMD_SMART:
+		if ((features == ATA_SMART_READ_VALUES) ||
+				(features == ATA_SMART_READ_THRESHOLDS))
+			rv = 1;
+		break;
+	case ATA_CMD_CONF_OVERLAY:
+		if ((features == ATA_DCO_IDENTIFY) ||
+				(features == ATA_DCO_SET))
+			rv = 1;
+		break;
+	}
+	return rv;
+}
+
+/*
+ * Executes a taskfile
+ * See ide_taskfile_ioctl() for derivation
+ */
+static int exec_drive_taskfile(struct driver_data *dd,
+			       void __user *buf,
+			       ide_task_request_t *req_task,
+			       int outtotal)
+{
+	struct host_to_dev_fis	fis;
+	struct host_to_dev_fis *reply;
+	u8 *outbuf = NULL;
+	u8 *inbuf = NULL;
+	dma_addr_t outbuf_dma = 0;
+	dma_addr_t inbuf_dma = 0;
+	dma_addr_t dma_buffer = 0;
+	int err = 0;
+	unsigned int taskin = 0;
+	unsigned int taskout = 0;
+	u8 nsect = 0;
+	unsigned int timeout;
+	unsigned int force_single_sector;
+	unsigned int transfer_size;
+	unsigned long task_file_data;
+	int intotal = outtotal + req_task->out_size;
+	int erasemode = 0;
+
+	taskout = req_task->out_size;
+	taskin = req_task->in_size;
+	/* 130560 = 512 * 0xFF*/
+	if (taskin > 130560 || taskout > 130560)
+		return -EINVAL;
+
+	if (taskout) {
+		outbuf = memdup_user(buf + outtotal, taskout);
+		if (IS_ERR(outbuf))
+			return PTR_ERR(outbuf);
+
+		outbuf_dma = dma_map_single(&dd->pdev->dev, outbuf,
+					    taskout, DMA_TO_DEVICE);
+		if (dma_mapping_error(&dd->pdev->dev, outbuf_dma)) {
+			err = -ENOMEM;
+			goto abort;
+		}
+		dma_buffer = outbuf_dma;
+	}
+
+	if (taskin) {
+		inbuf = memdup_user(buf + intotal, taskin);
+		if (IS_ERR(inbuf)) {
+			err = PTR_ERR(inbuf);
+			inbuf = NULL;
+			goto abort;
+		}
+		inbuf_dma = dma_map_single(&dd->pdev->dev, inbuf,
+					   taskin, DMA_FROM_DEVICE);
+		if (dma_mapping_error(&dd->pdev->dev, inbuf_dma)) {
+			err = -ENOMEM;
+			goto abort;
+		}
+		dma_buffer = inbuf_dma;
+	}
+
+	/* only supports PIO and non-data commands from this ioctl. */
+	switch (req_task->data_phase) {
+	case TASKFILE_OUT:
+		nsect = taskout / ATA_SECT_SIZE;
+		reply = (dd->port->rxfis + RX_FIS_PIO_SETUP);
+		break;
+	case TASKFILE_IN:
+		reply = (dd->port->rxfis + RX_FIS_PIO_SETUP);
+		break;
+	case TASKFILE_NO_DATA:
+		reply = (dd->port->rxfis + RX_FIS_D2H_REG);
+		break;
+	default:
+		err = -EINVAL;
+		goto abort;
+	}
+
+	/* Build the FIS. */
+	memset(&fis, 0, sizeof(struct host_to_dev_fis));
+
+	fis.type	= 0x27;
+	fis.opts	= 1 << 7;
+	fis.command	= req_task->io_ports[7];
+	fis.features	= req_task->io_ports[1];
+	fis.sect_count	= req_task->io_ports[2];
+	fis.lba_low	= req_task->io_ports[3];
+	fis.lba_mid	= req_task->io_ports[4];
+	fis.lba_hi	= req_task->io_ports[5];
+	 /* Clear the dev bit*/
+	fis.device	= req_task->io_ports[6] & ~0x10;
+
+	if ((req_task->in_flags.all == 0) && (req_task->out_flags.all & 1)) {
+		req_task->in_flags.all	=
+			IDE_TASKFILE_STD_IN_FLAGS |
+			(IDE_HOB_STD_IN_FLAGS << 8);
+		fis.lba_low_ex		= req_task->hob_ports[3];
+		fis.lba_mid_ex		= req_task->hob_ports[4];
+		fis.lba_hi_ex		= req_task->hob_ports[5];
+		fis.features_ex		= req_task->hob_ports[1];
+		fis.sect_cnt_ex		= req_task->hob_ports[2];
+
+	} else {
+		req_task->in_flags.all = IDE_TASKFILE_STD_IN_FLAGS;
+	}
+
+	force_single_sector = implicit_sector(fis.command, fis.features);
+
+	if ((taskin || taskout) && (!fis.sect_count)) {
+		if (nsect)
+			fis.sect_count = nsect;
+		else {
+			if (!force_single_sector) {
+				dev_warn(&dd->pdev->dev,
+					"data movement but "
+					"sect_count is 0\n");
+				err = -EINVAL;
+				goto abort;
+			}
+		}
+	}
+
+	dbg_printk(MTIP_DRV_NAME
+		" %s: cmd %x, feat %x, nsect %x,"
+		" sect/lbal %x, lcyl/lbam %x, hcyl/lbah %x,"
+		" head/dev %x\n",
+		__func__,
+		fis.command,
+		fis.features,
+		fis.sect_count,
+		fis.lba_low,
+		fis.lba_mid,
+		fis.lba_hi,
+		fis.device);
+
+	/* check for erase mode support during secure erase.*/
+	if ((fis.command == ATA_CMD_SEC_ERASE_UNIT) && outbuf &&
+					(outbuf[0] & MTIP_SEC_ERASE_MODE)) {
+		erasemode = 1;
+	}
+
+	mtip_set_timeout(dd, &fis, &timeout, erasemode);
+
+	/* Determine the correct transfer size.*/
+	if (force_single_sector)
+		transfer_size = ATA_SECT_SIZE;
+	else
+		transfer_size = ATA_SECT_SIZE * fis.sect_count;
+
+	/* Execute the command.*/
+	if (mtip_exec_internal_command(dd->port,
+				 &fis,
+				 5,
+				 dma_buffer,
+				 transfer_size,
+				 0,
+				 timeout) < 0) {
+		err = -EIO;
+		goto abort;
+	}
+
+	task_file_data = readl(dd->port->mmio+PORT_TFDATA);
+
+	if ((req_task->data_phase == TASKFILE_IN) && !(task_file_data & 1)) {
+		reply = dd->port->rxfis + RX_FIS_PIO_SETUP;
+		req_task->io_ports[7] = reply->control;
+	} else {
+		reply = dd->port->rxfis + RX_FIS_D2H_REG;
+		req_task->io_ports[7] = reply->command;
+	}
+
+	/* reclaim the DMA buffers.*/
+	if (inbuf_dma)
+		dma_unmap_single(&dd->pdev->dev, inbuf_dma, taskin,
+				 DMA_FROM_DEVICE);
+	if (outbuf_dma)
+		dma_unmap_single(&dd->pdev->dev, outbuf_dma, taskout,
+				 DMA_TO_DEVICE);
+	inbuf_dma  = 0;
+	outbuf_dma = 0;
+
+	/* return the ATA registers to the caller.*/
+	req_task->io_ports[1] = reply->features;
+	req_task->io_ports[2] = reply->sect_count;
+	req_task->io_ports[3] = reply->lba_low;
+	req_task->io_ports[4] = reply->lba_mid;
+	req_task->io_ports[5] = reply->lba_hi;
+	req_task->io_ports[6] = reply->device;
+
+	if (req_task->out_flags.all & 1)  {
+
+		req_task->hob_ports[3] = reply->lba_low_ex;
+		req_task->hob_ports[4] = reply->lba_mid_ex;
+		req_task->hob_ports[5] = reply->lba_hi_ex;
+		req_task->hob_ports[1] = reply->features_ex;
+		req_task->hob_ports[2] = reply->sect_cnt_ex;
+	}
+	dbg_printk(MTIP_DRV_NAME
+		" %s: Completion: stat %x,"
+		"err %x, sect_cnt %x, lbalo %x,"
+		"lbamid %x, lbahi %x, dev %x\n",
+		__func__,
+		req_task->io_ports[7],
+		req_task->io_ports[1],
+		req_task->io_ports[2],
+		req_task->io_ports[3],
+		req_task->io_ports[4],
+		req_task->io_ports[5],
+		req_task->io_ports[6]);
+
+	if (taskout) {
+		if (copy_to_user(buf + outtotal, outbuf, taskout)) {
+			err = -EFAULT;
+			goto abort;
+		}
+	}
+	if (taskin) {
+		if (copy_to_user(buf + intotal, inbuf, taskin)) {
+			err = -EFAULT;
+			goto abort;
+		}
+	}
+abort:
+	if (inbuf_dma)
+		dma_unmap_single(&dd->pdev->dev, inbuf_dma, taskin,
+				 DMA_FROM_DEVICE);
+	if (outbuf_dma)
+		dma_unmap_single(&dd->pdev->dev, outbuf_dma, taskout,
+				 DMA_TO_DEVICE);
+	kfree(outbuf);
+	kfree(inbuf);
+
+	return err;
+}
+
+/*
+ * Handle IOCTL calls from the Block Layer.
+ *
+ * This function is called by the Block Layer when it receives an IOCTL
+ * command that it does not understand. If the IOCTL command is not supported
+ * this function returns -ENOTTY.
+ *
+ * @dd  Pointer to the driver data structure.
+ * @cmd IOCTL command passed from the Block Layer.
+ * @arg IOCTL argument passed from the Block Layer.
+ *
+ * return value
+ *	0	The IOCTL completed successfully.
+ *	-ENOTTY The specified command is not supported.
+ *	-EFAULT An error occurred copying data to a user space buffer.
+ *	-EIO	An error occurred while executing the command.
+ */
+static int mtip_hw_ioctl(struct driver_data *dd, unsigned int cmd,
+			 unsigned long arg)
+{
+	switch (cmd) {
+	case HDIO_GET_IDENTITY:
+	{
+		if (copy_to_user((void __user *)arg, dd->port->identify,
+						sizeof(u16) * ATA_ID_WORDS))
+			return -EFAULT;
+		break;
+	}
+	case HDIO_DRIVE_CMD:
+	{
+		u8 drive_command[4];
+
+		/* Copy the user command info to our buffer. */
+		if (copy_from_user(drive_command,
+					 (void __user *) arg,
+					 sizeof(drive_command)))
+			return -EFAULT;
+
+		/* Execute the drive command. */
+		if (exec_drive_command(dd->port,
+					 drive_command,
+					 (void __user *) (arg+4)))
+			return -EIO;
+
+		/* Copy the status back to the users buffer. */
+		if (copy_to_user((void __user *) arg,
+					 drive_command,
+					 sizeof(drive_command)))
+			return -EFAULT;
+
+		break;
+	}
+	case HDIO_DRIVE_TASK:
+	{
+		u8 drive_command[7];
+
+		/* Copy the user command info to our buffer. */
+		if (copy_from_user(drive_command,
+					 (void __user *) arg,
+					 sizeof(drive_command)))
+			return -EFAULT;
+
+		/* Execute the drive command. */
+		if (exec_drive_task(dd->port, drive_command))
+			return -EIO;
+
+		/* Copy the status back to the users buffer. */
+		if (copy_to_user((void __user *) arg,
+					 drive_command,
+					 sizeof(drive_command)))
+			return -EFAULT;
+
+		break;
+	}
+	case HDIO_DRIVE_TASKFILE: {
+		ide_task_request_t req_task;
+		int ret, outtotal;
+
+		if (copy_from_user(&req_task, (void __user *) arg,
+					sizeof(req_task)))
+			return -EFAULT;
+
+		outtotal = sizeof(req_task);
+
+		ret = exec_drive_taskfile(dd, (void __user *) arg,
+						&req_task, outtotal);
+
+		if (copy_to_user((void __user *) arg, &req_task,
+							sizeof(req_task)))
+			return -EFAULT;
+
+		return ret;
+	}
+
+	default:
+		return -EINVAL;
+	}
+	return 0;
+}
+
+/*
+ * Submit an IO to the hw
+ *
+ * This function is called by the block layer to issue an io
+ * to the device. Upon completion, the callback function will
+ * be called with the data parameter passed as the callback data.
+ *
+ * @dd       Pointer to the driver data structure.
+ * @start    First sector to read.
+ * @nsect    Number of sectors to read.
+ * @tag      The tag of this read command.
+ * @callback Pointer to the function that should be called
+ *	     when the read completes.
+ * @data     Callback data passed to the callback function
+ *	     when the read completes.
+ * @dir      Direction (read or write)
+ *
+ * return value
+ *	None
+ */
+static void mtip_hw_submit_io(struct driver_data *dd, struct request *rq,
+			      struct mtip_cmd *command,
+			      struct blk_mq_hw_ctx *hctx)
+{
+	struct mtip_cmd_hdr *hdr =
+		dd->port->command_list + sizeof(struct mtip_cmd_hdr) * rq->tag;
+	struct host_to_dev_fis	*fis;
+	struct mtip_port *port = dd->port;
+	int dma_dir = rq_data_dir(rq) == READ ? DMA_FROM_DEVICE : DMA_TO_DEVICE;
+	u64 start = blk_rq_pos(rq);
+	unsigned int nsect = blk_rq_sectors(rq);
+	unsigned int nents;
+
+	/* Map the scatter list for DMA access */
+	nents = blk_rq_map_sg(hctx->queue, rq, command->sg);
+	nents = dma_map_sg(&dd->pdev->dev, command->sg, nents, dma_dir);
+
+	prefetch(&port->flags);
+
+	command->scatter_ents = nents;
+
+	/*
+	 * The number of retries for this command before it is
+	 * reported as a failure to the upper layers.
+	 */
+	command->retries = MTIP_MAX_RETRIES;
+
+	/* Fill out fis */
+	fis = command->command;
+	fis->type        = 0x27;
+	fis->opts        = 1 << 7;
+	if (dma_dir == DMA_FROM_DEVICE)
+		fis->command = ATA_CMD_FPDMA_READ;
+	else
+		fis->command = ATA_CMD_FPDMA_WRITE;
+	fis->lba_low     = start & 0xFF;
+	fis->lba_mid     = (start >> 8) & 0xFF;
+	fis->lba_hi      = (start >> 16) & 0xFF;
+	fis->lba_low_ex  = (start >> 24) & 0xFF;
+	fis->lba_mid_ex  = (start >> 32) & 0xFF;
+	fis->lba_hi_ex   = (start >> 40) & 0xFF;
+	fis->device	 = 1 << 6;
+	fis->features    = nsect & 0xFF;
+	fis->features_ex = (nsect >> 8) & 0xFF;
+	fis->sect_count  = ((rq->tag << 3) | (rq->tag >> 5));
+	fis->sect_cnt_ex = 0;
+	fis->control     = 0;
+	fis->res2        = 0;
+	fis->res3        = 0;
+	fill_command_sg(dd, command, nents);
+
+	if (unlikely(command->unaligned))
+		fis->device |= 1 << 7;
+
+	/* Populate the command header */
+	hdr->ctba = cpu_to_le32(command->command_dma & 0xFFFFFFFF);
+	if (test_bit(MTIP_PF_HOST_CAP_64, &dd->port->flags))
+		hdr->ctbau = cpu_to_le32((command->command_dma >> 16) >> 16);
+	hdr->opts = cpu_to_le32((nents << 16) | 5 | AHCI_CMD_PREFETCH);
+	hdr->byte_count = 0;
+
+	command->direction = dma_dir;
+
+	/*
+	 * To prevent this command from being issued
+	 * if an internal command is in progress or error handling is active.
+	 */
+	if (unlikely(port->flags & MTIP_PF_PAUSE_IO)) {
+		set_bit(rq->tag, port->cmds_to_issue);
+		set_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags);
+		return;
+	}
+
+	/* Issue the command to the hardware */
+	mtip_issue_ncq_command(port, rq->tag);
+}
+
+/*
+ * Sysfs status dump.
+ *
+ * @dev  Pointer to the device structure, passed by the kernrel.
+ * @attr Pointer to the device_attribute structure passed by the kernel.
+ * @buf  Pointer to the char buffer that will receive the stats info.
+ *
+ * return value
+ *	The size, in bytes, of the data copied into buf.
+ */
+static ssize_t mtip_hw_show_status(struct device *dev,
+				struct device_attribute *attr,
+				char *buf)
+{
+	struct driver_data *dd = dev_to_disk(dev)->private_data;
+	int size = 0;
+
+	if (test_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag))
+		size += sprintf(buf, "%s", "thermal_shutdown\n");
+	else if (test_bit(MTIP_DDF_WRITE_PROTECT_BIT, &dd->dd_flag))
+		size += sprintf(buf, "%s", "write_protect\n");
+	else
+		size += sprintf(buf, "%s", "online\n");
+
+	return size;
+}
+
+static DEVICE_ATTR(status, 0444, mtip_hw_show_status, NULL);
+
+static struct attribute *mtip_disk_attrs[] = {
+	&dev_attr_status.attr,
+	NULL,
+};
+
+static const struct attribute_group mtip_disk_attr_group = {
+	.attrs = mtip_disk_attrs,
+};
+
+static const struct attribute_group *mtip_disk_attr_groups[] = {
+	&mtip_disk_attr_group,
+	NULL,
+};
+
+static ssize_t mtip_hw_read_registers(struct file *f, char __user *ubuf,
+				  size_t len, loff_t *offset)
+{
+	struct driver_data *dd =  (struct driver_data *)f->private_data;
+	char *buf;
+	u32 group_allocated;
+	int size = *offset;
+	int n, rv = 0;
+
+	if (!len || size)
+		return 0;
+
+	buf = kzalloc(MTIP_DFS_MAX_BUF_SIZE, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	size += sprintf(&buf[size], "H/ S ACTive      : [ 0x");
+
+	for (n = dd->slot_groups-1; n >= 0; n--)
+		size += sprintf(&buf[size], "%08X ",
+					 readl(dd->port->s_active[n]));
+
+	size += sprintf(&buf[size], "]\n");
+	size += sprintf(&buf[size], "H/ Command Issue : [ 0x");
+
+	for (n = dd->slot_groups-1; n >= 0; n--)
+		size += sprintf(&buf[size], "%08X ",
+					readl(dd->port->cmd_issue[n]));
+
+	size += sprintf(&buf[size], "]\n");
+	size += sprintf(&buf[size], "H/ Completed     : [ 0x");
+
+	for (n = dd->slot_groups-1; n >= 0; n--)
+		size += sprintf(&buf[size], "%08X ",
+				readl(dd->port->completed[n]));
+
+	size += sprintf(&buf[size], "]\n");
+	size += sprintf(&buf[size], "H/ PORT IRQ STAT : [ 0x%08X ]\n",
+				readl(dd->port->mmio + PORT_IRQ_STAT));
+	size += sprintf(&buf[size], "H/ HOST IRQ STAT : [ 0x%08X ]\n",
+				readl(dd->mmio + HOST_IRQ_STAT));
+	size += sprintf(&buf[size], "\n");
+
+	size += sprintf(&buf[size], "L/ Commands in Q : [ 0x");
+
+	for (n = dd->slot_groups-1; n >= 0; n--) {
+		if (sizeof(long) > sizeof(u32))
+			group_allocated =
+				dd->port->cmds_to_issue[n/2] >> (32*(n&1));
+		else
+			group_allocated = dd->port->cmds_to_issue[n];
+		size += sprintf(&buf[size], "%08X ", group_allocated);
+	}
+	size += sprintf(&buf[size], "]\n");
+
+	*offset = size <= len ? size : len;
+	size = copy_to_user(ubuf, buf, *offset);
+	if (size)
+		rv = -EFAULT;
+
+	kfree(buf);
+	return rv ? rv : *offset;
+}
+
+static ssize_t mtip_hw_read_flags(struct file *f, char __user *ubuf,
+				  size_t len, loff_t *offset)
+{
+	struct driver_data *dd =  (struct driver_data *)f->private_data;
+	char *buf;
+	int size = *offset;
+	int rv = 0;
+
+	if (!len || size)
+		return 0;
+
+	buf = kzalloc(MTIP_DFS_MAX_BUF_SIZE, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	size += sprintf(&buf[size], "Flag-port : [ %08lX ]\n",
+							dd->port->flags);
+	size += sprintf(&buf[size], "Flag-dd   : [ %08lX ]\n",
+							dd->dd_flag);
+
+	*offset = size <= len ? size : len;
+	size = copy_to_user(ubuf, buf, *offset);
+	if (size)
+		rv = -EFAULT;
+
+	kfree(buf);
+	return rv ? rv : *offset;
+}
+
+static const struct file_operations mtip_regs_fops = {
+	.owner  = THIS_MODULE,
+	.open   = simple_open,
+	.read   = mtip_hw_read_registers,
+	.llseek = no_llseek,
+};
+
+static const struct file_operations mtip_flags_fops = {
+	.owner  = THIS_MODULE,
+	.open   = simple_open,
+	.read   = mtip_hw_read_flags,
+	.llseek = no_llseek,
+};
+
+static int mtip_hw_debugfs_init(struct driver_data *dd)
+{
+	if (!dfs_parent)
+		return -1;
+
+	dd->dfs_node = debugfs_create_dir(dd->disk->disk_name, dfs_parent);
+	if (IS_ERR_OR_NULL(dd->dfs_node)) {
+		dev_warn(&dd->pdev->dev,
+			"Error creating node %s under debugfs\n",
+						dd->disk->disk_name);
+		dd->dfs_node = NULL;
+		return -1;
+	}
+
+	debugfs_create_file("flags", 0444, dd->dfs_node, dd, &mtip_flags_fops);
+	debugfs_create_file("registers", 0444, dd->dfs_node, dd,
+			    &mtip_regs_fops);
+
+	return 0;
+}
+
+static void mtip_hw_debugfs_exit(struct driver_data *dd)
+{
+	debugfs_remove_recursive(dd->dfs_node);
+}
+
+/*
+ * Perform any init/resume time hardware setup
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ *	None
+ */
+static inline void hba_setup(struct driver_data *dd)
+{
+	u32 hwdata;
+	hwdata = readl(dd->mmio + HOST_HSORG);
+
+	/* interrupt bug workaround: use only 1 IS bit.*/
+	writel(hwdata |
+		HSORG_DISABLE_SLOTGRP_INTR |
+		HSORG_DISABLE_SLOTGRP_PXIS,
+		dd->mmio + HOST_HSORG);
+}
+
+static int mtip_device_unaligned_constrained(struct driver_data *dd)
+{
+	return (dd->pdev->device == P420M_DEVICE_ID ? 1 : 0);
+}
+
+/*
+ * Detect the details of the product, and store anything needed
+ * into the driver data structure.  This includes product type and
+ * version and number of slot groups.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ *	None
+ */
+static void mtip_detect_product(struct driver_data *dd)
+{
+	u32 hwdata;
+	unsigned int rev, slotgroups;
+
+	/*
+	 * HBA base + 0xFC [15:0] - vendor-specific hardware interface
+	 * info register:
+	 * [15:8] hardware/software interface rev#
+	 * [   3] asic-style interface
+	 * [ 2:0] number of slot groups, minus 1 (only valid for asic-style).
+	 */
+	hwdata = readl(dd->mmio + HOST_HSORG);
+
+	dd->product_type = MTIP_PRODUCT_UNKNOWN;
+	dd->slot_groups = 1;
+
+	if (hwdata & 0x8) {
+		dd->product_type = MTIP_PRODUCT_ASICFPGA;
+		rev = (hwdata & HSORG_HWREV) >> 8;
+		slotgroups = (hwdata & HSORG_SLOTGROUPS) + 1;
+		dev_info(&dd->pdev->dev,
+			"ASIC-FPGA design, HS rev 0x%x, "
+			"%i slot groups [%i slots]\n",
+			 rev,
+			 slotgroups,
+			 slotgroups * 32);
+
+		if (slotgroups > MTIP_MAX_SLOT_GROUPS) {
+			dev_warn(&dd->pdev->dev,
+				"Warning: driver only supports "
+				"%i slot groups.\n", MTIP_MAX_SLOT_GROUPS);
+			slotgroups = MTIP_MAX_SLOT_GROUPS;
+		}
+		dd->slot_groups = slotgroups;
+		return;
+	}
+
+	dev_warn(&dd->pdev->dev, "Unrecognized product id\n");
+}
+
+/*
+ * Blocking wait for FTL rebuild to complete
+ *
+ * @dd Pointer to the DRIVER_DATA structure.
+ *
+ * return value
+ *	0	FTL rebuild completed successfully
+ *	-EFAULT FTL rebuild error/timeout/interruption
+ */
+static int mtip_ftl_rebuild_poll(struct driver_data *dd)
+{
+	unsigned long timeout, cnt = 0, start;
+
+	dev_warn(&dd->pdev->dev,
+		"FTL rebuild in progress. Polling for completion.\n");
+
+	start = jiffies;
+	timeout = jiffies + msecs_to_jiffies(MTIP_FTL_REBUILD_TIMEOUT_MS);
+
+	do {
+		if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
+				&dd->dd_flag)))
+			return -EFAULT;
+		if (mtip_check_surprise_removal(dd))
+			return -EFAULT;
+
+		if (mtip_get_identify(dd->port, NULL) < 0)
+			return -EFAULT;
+
+		if (*(dd->port->identify + MTIP_FTL_REBUILD_OFFSET) ==
+			MTIP_FTL_REBUILD_MAGIC) {
+			ssleep(1);
+			/* Print message every 3 minutes */
+			if (cnt++ >= 180) {
+				dev_warn(&dd->pdev->dev,
+				"FTL rebuild in progress (%d secs).\n",
+				jiffies_to_msecs(jiffies - start) / 1000);
+				cnt = 0;
+			}
+		} else {
+			dev_warn(&dd->pdev->dev,
+				"FTL rebuild complete (%d secs).\n",
+			jiffies_to_msecs(jiffies - start) / 1000);
+			mtip_block_initialize(dd);
+			return 0;
+		}
+	} while (time_before(jiffies, timeout));
+
+	/* Check for timeout */
+	dev_err(&dd->pdev->dev,
+		"Timed out waiting for FTL rebuild to complete (%d secs).\n",
+		jiffies_to_msecs(jiffies - start) / 1000);
+	return -EFAULT;
+}
+
+static void mtip_softirq_done_fn(struct request *rq)
+{
+	struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
+	struct driver_data *dd = rq->q->queuedata;
+
+	/* Unmap the DMA scatter list entries */
+	dma_unmap_sg(&dd->pdev->dev, cmd->sg, cmd->scatter_ents,
+							cmd->direction);
+
+	if (unlikely(cmd->unaligned))
+		atomic_inc(&dd->port->cmd_slot_unal);
+
+	blk_mq_end_request(rq, cmd->status);
+}
+
+static bool mtip_abort_cmd(struct request *req, void *data)
+{
+	struct mtip_cmd *cmd = blk_mq_rq_to_pdu(req);
+	struct driver_data *dd = data;
+
+	dbg_printk(MTIP_DRV_NAME " Aborting request, tag = %d\n", req->tag);
+
+	clear_bit(req->tag, dd->port->cmds_to_issue);
+	cmd->status = BLK_STS_IOERR;
+	mtip_softirq_done_fn(req);
+	return true;
+}
+
+static bool mtip_queue_cmd(struct request *req, void *data)
+{
+	struct driver_data *dd = data;
+
+	set_bit(req->tag, dd->port->cmds_to_issue);
+	blk_abort_request(req);
+	return true;
+}
+
+/*
+ * service thread to issue queued commands
+ *
+ * @data Pointer to the driver data structure.
+ *
+ * return value
+ *	0
+ */
+
+static int mtip_service_thread(void *data)
+{
+	struct driver_data *dd = (struct driver_data *)data;
+	unsigned long slot, slot_start, slot_wrap, to;
+	unsigned int num_cmd_slots = dd->slot_groups * 32;
+	struct mtip_port *port = dd->port;
+
+	while (1) {
+		if (kthread_should_stop() ||
+			test_bit(MTIP_PF_SVC_THD_STOP_BIT, &port->flags))
+			goto st_out;
+		clear_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags);
+
+		/*
+		 * the condition is to check neither an internal command is
+		 * is in progress nor error handling is active
+		 */
+		wait_event_interruptible(port->svc_wait, (port->flags) &&
+			(port->flags & MTIP_PF_SVC_THD_WORK));
+
+		if (kthread_should_stop() ||
+			test_bit(MTIP_PF_SVC_THD_STOP_BIT, &port->flags))
+			goto st_out;
+
+		if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT,
+				&dd->dd_flag)))
+			goto st_out;
+
+		set_bit(MTIP_PF_SVC_THD_ACTIVE_BIT, &port->flags);
+
+restart_eh:
+		/* Demux bits: start with error handling */
+		if (test_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags)) {
+			mtip_handle_tfe(dd);
+			clear_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags);
+		}
+
+		if (test_bit(MTIP_PF_EH_ACTIVE_BIT, &port->flags))
+			goto restart_eh;
+
+		if (test_bit(MTIP_PF_TO_ACTIVE_BIT, &port->flags)) {
+			to = jiffies + msecs_to_jiffies(5000);
+
+			do {
+				mdelay(100);
+			} while (atomic_read(&dd->irq_workers_active) != 0 &&
+				time_before(jiffies, to));
+
+			if (atomic_read(&dd->irq_workers_active) != 0)
+				dev_warn(&dd->pdev->dev,
+					"Completion workers still active!");
+
+			blk_mq_quiesce_queue(dd->queue);
+
+			blk_mq_tagset_busy_iter(&dd->tags, mtip_queue_cmd, dd);
+
+			set_bit(MTIP_PF_ISSUE_CMDS_BIT, &dd->port->flags);
+
+			if (mtip_device_reset(dd))
+				blk_mq_tagset_busy_iter(&dd->tags,
+							mtip_abort_cmd, dd);
+
+			clear_bit(MTIP_PF_TO_ACTIVE_BIT, &dd->port->flags);
+
+			blk_mq_unquiesce_queue(dd->queue);
+		}
+
+		if (test_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags)) {
+			slot = 1;
+			/* used to restrict the loop to one iteration */
+			slot_start = num_cmd_slots;
+			slot_wrap = 0;
+			while (1) {
+				slot = find_next_bit(port->cmds_to_issue,
+						num_cmd_slots, slot);
+				if (slot_wrap == 1) {
+					if ((slot_start >= slot) ||
+						(slot >= num_cmd_slots))
+						break;
+				}
+				if (unlikely(slot_start == num_cmd_slots))
+					slot_start = slot;
+
+				if (unlikely(slot == num_cmd_slots)) {
+					slot = 1;
+					slot_wrap = 1;
+					continue;
+				}
+
+				/* Issue the command to the hardware */
+				mtip_issue_ncq_command(port, slot);
+
+				clear_bit(slot, port->cmds_to_issue);
+			}
+
+			clear_bit(MTIP_PF_ISSUE_CMDS_BIT, &port->flags);
+		}
+
+		if (test_bit(MTIP_PF_REBUILD_BIT, &port->flags)) {
+			if (mtip_ftl_rebuild_poll(dd) == 0)
+				clear_bit(MTIP_PF_REBUILD_BIT, &port->flags);
+		}
+	}
+
+st_out:
+	return 0;
+}
+
+/*
+ * DMA region teardown
+ *
+ * @dd Pointer to driver_data structure
+ *
+ * return value
+ *      None
+ */
+static void mtip_dma_free(struct driver_data *dd)
+{
+	struct mtip_port *port = dd->port;
+
+	if (port->block1)
+		dma_free_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ,
+					port->block1, port->block1_dma);
+
+	if (port->command_list) {
+		dma_free_coherent(&dd->pdev->dev, AHCI_CMD_TBL_SZ,
+				port->command_list, port->command_list_dma);
+	}
+}
+
+/*
+ * DMA region setup
+ *
+ * @dd Pointer to driver_data structure
+ *
+ * return value
+ *      -ENOMEM Not enough free DMA region space to initialize driver
+ */
+static int mtip_dma_alloc(struct driver_data *dd)
+{
+	struct mtip_port *port = dd->port;
+
+	/* Allocate dma memory for RX Fis, Identify, and Sector Buffer */
+	port->block1 =
+		dma_alloc_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ,
+					&port->block1_dma, GFP_KERNEL);
+	if (!port->block1)
+		return -ENOMEM;
+
+	/* Allocate dma memory for command list */
+	port->command_list =
+		dma_alloc_coherent(&dd->pdev->dev, AHCI_CMD_TBL_SZ,
+					&port->command_list_dma, GFP_KERNEL);
+	if (!port->command_list) {
+		dma_free_coherent(&dd->pdev->dev, BLOCK_DMA_ALLOC_SZ,
+					port->block1, port->block1_dma);
+		port->block1 = NULL;
+		port->block1_dma = 0;
+		return -ENOMEM;
+	}
+
+	/* Setup all pointers into first DMA region */
+	port->rxfis         = port->block1 + AHCI_RX_FIS_OFFSET;
+	port->rxfis_dma     = port->block1_dma + AHCI_RX_FIS_OFFSET;
+	port->identify      = port->block1 + AHCI_IDFY_OFFSET;
+	port->identify_dma  = port->block1_dma + AHCI_IDFY_OFFSET;
+	port->log_buf       = port->block1 + AHCI_SECTBUF_OFFSET;
+	port->log_buf_dma   = port->block1_dma + AHCI_SECTBUF_OFFSET;
+	port->smart_buf     = port->block1 + AHCI_SMARTBUF_OFFSET;
+	port->smart_buf_dma = port->block1_dma + AHCI_SMARTBUF_OFFSET;
+
+	return 0;
+}
+
+static int mtip_hw_get_identify(struct driver_data *dd)
+{
+	struct smart_attr attr242;
+	unsigned char *buf;
+	int rv;
+
+	if (mtip_get_identify(dd->port, NULL) < 0)
+		return -EFAULT;
+
+	if (*(dd->port->identify + MTIP_FTL_REBUILD_OFFSET) ==
+		MTIP_FTL_REBUILD_MAGIC) {
+		set_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags);
+		return MTIP_FTL_REBUILD_MAGIC;
+	}
+	mtip_dump_identify(dd->port);
+
+	/* check write protect, over temp and rebuild statuses */
+	rv = mtip_read_log_page(dd->port, ATA_LOG_SATA_NCQ,
+				dd->port->log_buf,
+				dd->port->log_buf_dma, 1);
+	if (rv) {
+		dev_warn(&dd->pdev->dev,
+			"Error in READ LOG EXT (10h) command\n");
+		/* non-critical error, don't fail the load */
+	} else {
+		buf = (unsigned char *)dd->port->log_buf;
+		if (buf[259] & 0x1) {
+			dev_info(&dd->pdev->dev,
+				"Write protect bit is set.\n");
+			set_bit(MTIP_DDF_WRITE_PROTECT_BIT, &dd->dd_flag);
+		}
+		if (buf[288] == 0xF7) {
+			dev_info(&dd->pdev->dev,
+				"Exceeded Tmax, drive in thermal shutdown.\n");
+			set_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag);
+		}
+		if (buf[288] == 0xBF) {
+			dev_info(&dd->pdev->dev,
+				"Drive indicates rebuild has failed.\n");
+			set_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag);
+		}
+	}
+
+	/* get write protect progess */
+	memset(&attr242, 0, sizeof(struct smart_attr));
+	if (mtip_get_smart_attr(dd->port, 242, &attr242))
+		dev_warn(&dd->pdev->dev,
+				"Unable to check write protect progress\n");
+	else
+		dev_info(&dd->pdev->dev,
+				"Write protect progress: %u%% (%u blocks)\n",
+				attr242.cur, le32_to_cpu(attr242.data));
+
+	return rv;
+}
+
+/*
+ * Called once for each card.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ *	0 on success, else an error code.
+ */
+static int mtip_hw_init(struct driver_data *dd)
+{
+	int i;
+	int rv;
+	unsigned long timeout, timetaken;
+
+	dd->mmio = pcim_iomap_table(dd->pdev)[MTIP_ABAR];
+
+	mtip_detect_product(dd);
+	if (dd->product_type == MTIP_PRODUCT_UNKNOWN) {
+		rv = -EIO;
+		goto out1;
+	}
+
+	hba_setup(dd);
+
+	dd->port = kzalloc_node(sizeof(struct mtip_port), GFP_KERNEL,
+				dd->numa_node);
+	if (!dd->port)
+		return -ENOMEM;
+
+	/* Continue workqueue setup */
+	for (i = 0; i < MTIP_MAX_SLOT_GROUPS; i++)
+		dd->work[i].port = dd->port;
+
+	/* Enable unaligned IO constraints for some devices */
+	if (mtip_device_unaligned_constrained(dd))
+		dd->unal_qdepth = MTIP_MAX_UNALIGNED_SLOTS;
+	else
+		dd->unal_qdepth = 0;
+
+	atomic_set(&dd->port->cmd_slot_unal, dd->unal_qdepth);
+
+	/* Spinlock to prevent concurrent issue */
+	for (i = 0; i < MTIP_MAX_SLOT_GROUPS; i++)
+		spin_lock_init(&dd->port->cmd_issue_lock[i]);
+
+	/* Set the port mmio base address. */
+	dd->port->mmio	= dd->mmio + PORT_OFFSET;
+	dd->port->dd	= dd;
+
+	/* DMA allocations */
+	rv = mtip_dma_alloc(dd);
+	if (rv < 0)
+		goto out1;
+
+	/* Setup the pointers to the extended s_active and CI registers. */
+	for (i = 0; i < dd->slot_groups; i++) {
+		dd->port->s_active[i] =
+			dd->port->mmio + i*0x80 + PORT_SCR_ACT;
+		dd->port->cmd_issue[i] =
+			dd->port->mmio + i*0x80 + PORT_COMMAND_ISSUE;
+		dd->port->completed[i] =
+			dd->port->mmio + i*0x80 + PORT_SDBV;
+	}
+
+	timetaken = jiffies;
+	timeout = jiffies + msecs_to_jiffies(30000);
+	while (((readl(dd->port->mmio + PORT_SCR_STAT) & 0x0F) != 0x03) &&
+		 time_before(jiffies, timeout)) {
+		mdelay(100);
+	}
+	if (unlikely(mtip_check_surprise_removal(dd))) {
+		timetaken = jiffies - timetaken;
+		dev_warn(&dd->pdev->dev,
+			"Surprise removal detected at %u ms\n",
+			jiffies_to_msecs(timetaken));
+		rv = -ENODEV;
+		goto out2 ;
+	}
+	if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))) {
+		timetaken = jiffies - timetaken;
+		dev_warn(&dd->pdev->dev,
+			"Removal detected at %u ms\n",
+			jiffies_to_msecs(timetaken));
+		rv = -EFAULT;
+		goto out2;
+	}
+
+	/* Conditionally reset the HBA. */
+	if (!(readl(dd->mmio + HOST_CAP) & HOST_CAP_NZDMA)) {
+		if (mtip_hba_reset(dd) < 0) {
+			dev_err(&dd->pdev->dev,
+				"Card did not reset within timeout\n");
+			rv = -EIO;
+			goto out2;
+		}
+	} else {
+		/* Clear any pending interrupts on the HBA */
+		writel(readl(dd->mmio + HOST_IRQ_STAT),
+			dd->mmio + HOST_IRQ_STAT);
+	}
+
+	mtip_init_port(dd->port);
+	mtip_start_port(dd->port);
+
+	/* Setup the ISR and enable interrupts. */
+	rv = request_irq(dd->pdev->irq, mtip_irq_handler, IRQF_SHARED,
+			 dev_driver_string(&dd->pdev->dev), dd);
+	if (rv) {
+		dev_err(&dd->pdev->dev,
+			"Unable to allocate IRQ %d\n", dd->pdev->irq);
+		goto out2;
+	}
+	irq_set_affinity_hint(dd->pdev->irq, get_cpu_mask(dd->isr_binding));
+
+	/* Enable interrupts on the HBA. */
+	writel(readl(dd->mmio + HOST_CTL) | HOST_IRQ_EN,
+					dd->mmio + HOST_CTL);
+
+	init_waitqueue_head(&dd->port->svc_wait);
+
+	if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag)) {
+		rv = -EFAULT;
+		goto out3;
+	}
+
+	return rv;
+
+out3:
+	/* Disable interrupts on the HBA. */
+	writel(readl(dd->mmio + HOST_CTL) & ~HOST_IRQ_EN,
+			dd->mmio + HOST_CTL);
+
+	/* Release the IRQ. */
+	irq_set_affinity_hint(dd->pdev->irq, NULL);
+	free_irq(dd->pdev->irq, dd);
+
+out2:
+	mtip_deinit_port(dd->port);
+	mtip_dma_free(dd);
+
+out1:
+	/* Free the memory allocated for the for structure. */
+	kfree(dd->port);
+
+	return rv;
+}
+
+static int mtip_standby_drive(struct driver_data *dd)
+{
+	int rv = 0;
+
+	if (dd->sr || !dd->port)
+		return -ENODEV;
+	/*
+	 * Send standby immediate (E0h) to the drive so that it
+	 * saves its state.
+	 */
+	if (!test_bit(MTIP_PF_REBUILD_BIT, &dd->port->flags) &&
+	    !test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag) &&
+	    !test_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag)) {
+		rv = mtip_standby_immediate(dd->port);
+		if (rv)
+			dev_warn(&dd->pdev->dev,
+				"STANDBY IMMEDIATE failed\n");
+	}
+	return rv;
+}
+
+/*
+ * Called to deinitialize an interface.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ *	0
+ */
+static int mtip_hw_exit(struct driver_data *dd)
+{
+	if (!dd->sr) {
+		/* de-initialize the port. */
+		mtip_deinit_port(dd->port);
+
+		/* Disable interrupts on the HBA. */
+		writel(readl(dd->mmio + HOST_CTL) & ~HOST_IRQ_EN,
+				dd->mmio + HOST_CTL);
+	}
+
+	/* Release the IRQ. */
+	irq_set_affinity_hint(dd->pdev->irq, NULL);
+	free_irq(dd->pdev->irq, dd);
+	msleep(1000);
+
+	/* Free dma regions */
+	mtip_dma_free(dd);
+
+	/* Free the memory allocated for the for structure. */
+	kfree(dd->port);
+	dd->port = NULL;
+
+	return 0;
+}
+
+/*
+ * Issue a Standby Immediate command to the device.
+ *
+ * This function is called by the Block Layer just before the
+ * system powers off during a shutdown.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ *	0
+ */
+static int mtip_hw_shutdown(struct driver_data *dd)
+{
+	/*
+	 * Send standby immediate (E0h) to the drive so that it
+	 * saves its state.
+	 */
+	mtip_standby_drive(dd);
+
+	return 0;
+}
+
+/*
+ * Suspend function
+ *
+ * This function is called by the Block Layer just before the
+ * system hibernates.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ *	0	Suspend was successful
+ *	-EFAULT Suspend was not successful
+ */
+static int mtip_hw_suspend(struct driver_data *dd)
+{
+	/*
+	 * Send standby immediate (E0h) to the drive
+	 * so that it saves its state.
+	 */
+	if (mtip_standby_drive(dd) != 0) {
+		dev_err(&dd->pdev->dev,
+			"Failed standby-immediate command\n");
+		return -EFAULT;
+	}
+
+	/* Disable interrupts on the HBA.*/
+	writel(readl(dd->mmio + HOST_CTL) & ~HOST_IRQ_EN,
+			dd->mmio + HOST_CTL);
+	mtip_deinit_port(dd->port);
+
+	return 0;
+}
+
+/*
+ * Resume function
+ *
+ * This function is called by the Block Layer as the
+ * system resumes.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ *	0	Resume was successful
+ *      -EFAULT Resume was not successful
+ */
+static int mtip_hw_resume(struct driver_data *dd)
+{
+	/* Perform any needed hardware setup steps */
+	hba_setup(dd);
+
+	/* Reset the HBA */
+	if (mtip_hba_reset(dd) != 0) {
+		dev_err(&dd->pdev->dev,
+			"Unable to reset the HBA\n");
+		return -EFAULT;
+	}
+
+	/*
+	 * Enable the port, DMA engine, and FIS reception specific
+	 * h/w in controller.
+	 */
+	mtip_init_port(dd->port);
+	mtip_start_port(dd->port);
+
+	/* Enable interrupts on the HBA.*/
+	writel(readl(dd->mmio + HOST_CTL) | HOST_IRQ_EN,
+			dd->mmio + HOST_CTL);
+
+	return 0;
+}
+
+/*
+ * Helper function for reusing disk name
+ * upon hot insertion.
+ */
+static int rssd_disk_name_format(char *prefix,
+				 int index,
+				 char *buf,
+				 int buflen)
+{
+	const int base = 'z' - 'a' + 1;
+	char *begin = buf + strlen(prefix);
+	char *end = buf + buflen;
+	char *p;
+	int unit;
+
+	p = end - 1;
+	*p = '\0';
+	unit = base;
+	do {
+		if (p == begin)
+			return -EINVAL;
+		*--p = 'a' + (index % unit);
+		index = (index / unit) - 1;
+	} while (index >= 0);
+
+	memmove(begin, p, end - p);
+	memcpy(buf, prefix, strlen(prefix));
+
+	return 0;
+}
+
+/*
+ * Block layer IOCTL handler.
+ *
+ * @dev Pointer to the block_device structure.
+ * @mode ignored
+ * @cmd IOCTL command passed from the user application.
+ * @arg Argument passed from the user application.
+ *
+ * return value
+ *	0        IOCTL completed successfully.
+ *	-ENOTTY  IOCTL not supported or invalid driver data
+ *                 structure pointer.
+ */
+static int mtip_block_ioctl(struct block_device *dev,
+			    fmode_t mode,
+			    unsigned cmd,
+			    unsigned long arg)
+{
+	struct driver_data *dd = dev->bd_disk->private_data;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	if (!dd)
+		return -ENOTTY;
+
+	if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag)))
+		return -ENOTTY;
+
+	switch (cmd) {
+	case BLKFLSBUF:
+		return -ENOTTY;
+	default:
+		return mtip_hw_ioctl(dd, cmd, arg);
+	}
+}
+
+#ifdef CONFIG_COMPAT
+/*
+ * Block layer compat IOCTL handler.
+ *
+ * @dev Pointer to the block_device structure.
+ * @mode ignored
+ * @cmd IOCTL command passed from the user application.
+ * @arg Argument passed from the user application.
+ *
+ * return value
+ *	0        IOCTL completed successfully.
+ *	-ENOTTY  IOCTL not supported or invalid driver data
+ *                 structure pointer.
+ */
+static int mtip_block_compat_ioctl(struct block_device *dev,
+			    fmode_t mode,
+			    unsigned cmd,
+			    unsigned long arg)
+{
+	struct driver_data *dd = dev->bd_disk->private_data;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EACCES;
+
+	if (!dd)
+		return -ENOTTY;
+
+	if (unlikely(test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag)))
+		return -ENOTTY;
+
+	switch (cmd) {
+	case BLKFLSBUF:
+		return -ENOTTY;
+	case HDIO_DRIVE_TASKFILE: {
+		struct mtip_compat_ide_task_request_s __user *compat_req_task;
+		ide_task_request_t req_task;
+		int compat_tasksize, outtotal, ret;
+
+		compat_tasksize =
+			sizeof(struct mtip_compat_ide_task_request_s);
+
+		compat_req_task =
+			(struct mtip_compat_ide_task_request_s __user *) arg;
+
+		if (copy_from_user(&req_task, (void __user *) arg,
+			compat_tasksize - (2 * sizeof(compat_long_t))))
+			return -EFAULT;
+
+		if (get_user(req_task.out_size, &compat_req_task->out_size))
+			return -EFAULT;
+
+		if (get_user(req_task.in_size, &compat_req_task->in_size))
+			return -EFAULT;
+
+		outtotal = sizeof(struct mtip_compat_ide_task_request_s);
+
+		ret = exec_drive_taskfile(dd, (void __user *) arg,
+						&req_task, outtotal);
+
+		if (copy_to_user((void __user *) arg, &req_task,
+				compat_tasksize -
+				(2 * sizeof(compat_long_t))))
+			return -EFAULT;
+
+		if (put_user(req_task.out_size, &compat_req_task->out_size))
+			return -EFAULT;
+
+		if (put_user(req_task.in_size, &compat_req_task->in_size))
+			return -EFAULT;
+
+		return ret;
+	}
+	default:
+		return mtip_hw_ioctl(dd, cmd, arg);
+	}
+}
+#endif
+
+/*
+ * Obtain the geometry of the device.
+ *
+ * You may think that this function is obsolete, but some applications,
+ * fdisk for example still used CHS values. This function describes the
+ * device as having 224 heads and 56 sectors per cylinder. These values are
+ * chosen so that each cylinder is aligned on a 4KB boundary. Since a
+ * partition is described in terms of a start and end cylinder this means
+ * that each partition is also 4KB aligned. Non-aligned partitions adversely
+ * affects performance.
+ *
+ * @dev Pointer to the block_device strucutre.
+ * @geo Pointer to a hd_geometry structure.
+ *
+ * return value
+ *	0       Operation completed successfully.
+ *	-ENOTTY An error occurred while reading the drive capacity.
+ */
+static int mtip_block_getgeo(struct block_device *dev,
+				struct hd_geometry *geo)
+{
+	struct driver_data *dd = dev->bd_disk->private_data;
+	sector_t capacity;
+
+	if (!dd)
+		return -ENOTTY;
+
+	if (!(mtip_hw_get_capacity(dd, &capacity))) {
+		dev_warn(&dd->pdev->dev,
+			"Could not get drive capacity.\n");
+		return -ENOTTY;
+	}
+
+	geo->heads = 224;
+	geo->sectors = 56;
+	sector_div(capacity, (geo->heads * geo->sectors));
+	geo->cylinders = capacity;
+	return 0;
+}
+
+static void mtip_block_free_disk(struct gendisk *disk)
+{
+	struct driver_data *dd = disk->private_data;
+
+	ida_free(&rssd_index_ida, dd->index);
+	kfree(dd);
+}
+
+/*
+ * Block device operation function.
+ *
+ * This structure contains pointers to the functions required by the block
+ * layer.
+ */
+static const struct block_device_operations mtip_block_ops = {
+	.ioctl		= mtip_block_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= mtip_block_compat_ioctl,
+#endif
+	.getgeo		= mtip_block_getgeo,
+	.free_disk	= mtip_block_free_disk,
+	.owner		= THIS_MODULE
+};
+
+static inline bool is_se_active(struct driver_data *dd)
+{
+	if (unlikely(test_bit(MTIP_PF_SE_ACTIVE_BIT, &dd->port->flags))) {
+		if (dd->port->ic_pause_timer) {
+			unsigned long to = dd->port->ic_pause_timer +
+							msecs_to_jiffies(1000);
+			if (time_after(jiffies, to)) {
+				clear_bit(MTIP_PF_SE_ACTIVE_BIT,
+							&dd->port->flags);
+				clear_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag);
+				dd->port->ic_pause_timer = 0;
+				wake_up_interruptible(&dd->port->svc_wait);
+				return false;
+			}
+		}
+		return true;
+	}
+	return false;
+}
+
+static inline bool is_stopped(struct driver_data *dd, struct request *rq)
+{
+	if (likely(!(dd->dd_flag & MTIP_DDF_STOP_IO)))
+		return false;
+
+	if (test_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag))
+		return true;
+	if (test_bit(MTIP_DDF_OVER_TEMP_BIT, &dd->dd_flag))
+		return true;
+	if (test_bit(MTIP_DDF_WRITE_PROTECT_BIT, &dd->dd_flag) &&
+	    rq_data_dir(rq))
+		return true;
+	if (test_bit(MTIP_DDF_SEC_LOCK_BIT, &dd->dd_flag))
+		return true;
+	if (test_bit(MTIP_DDF_REBUILD_FAILED_BIT, &dd->dd_flag))
+		return true;
+
+	return false;
+}
+
+static bool mtip_check_unal_depth(struct blk_mq_hw_ctx *hctx,
+				  struct request *rq)
+{
+	struct driver_data *dd = hctx->queue->queuedata;
+	struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
+
+	if (rq_data_dir(rq) == READ || !dd->unal_qdepth)
+		return false;
+
+	/*
+	 * If unaligned depth must be limited on this controller, mark it
+	 * as unaligned if the IO isn't on a 4k boundary (start of length).
+	 */
+	if (blk_rq_sectors(rq) <= 64) {
+		if ((blk_rq_pos(rq) & 7) || (blk_rq_sectors(rq) & 7))
+			cmd->unaligned = 1;
+	}
+
+	if (cmd->unaligned && atomic_dec_if_positive(&dd->port->cmd_slot_unal) >= 0)
+		return true;
+
+	return false;
+}
+
+static blk_status_t mtip_issue_reserved_cmd(struct blk_mq_hw_ctx *hctx,
+		struct request *rq)
+{
+	struct driver_data *dd = hctx->queue->queuedata;
+	struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
+	struct mtip_int_cmd *icmd = cmd->icmd;
+	struct mtip_cmd_hdr *hdr =
+		dd->port->command_list + sizeof(struct mtip_cmd_hdr) * rq->tag;
+	struct mtip_cmd_sg *command_sg;
+
+	if (mtip_commands_active(dd->port))
+		return BLK_STS_DEV_RESOURCE;
+
+	hdr->ctba = cpu_to_le32(cmd->command_dma & 0xFFFFFFFF);
+	if (test_bit(MTIP_PF_HOST_CAP_64, &dd->port->flags))
+		hdr->ctbau = cpu_to_le32((cmd->command_dma >> 16) >> 16);
+	/* Populate the SG list */
+	hdr->opts = cpu_to_le32(icmd->opts | icmd->fis_len);
+	if (icmd->buf_len) {
+		command_sg = cmd->command + AHCI_CMD_TBL_HDR_SZ;
+
+		command_sg->info = cpu_to_le32((icmd->buf_len-1) & 0x3FFFFF);
+		command_sg->dba	= cpu_to_le32(icmd->buffer & 0xFFFFFFFF);
+		command_sg->dba_upper =
+			cpu_to_le32((icmd->buffer >> 16) >> 16);
+
+		hdr->opts |= cpu_to_le32((1 << 16));
+	}
+
+	/* Populate the command header */
+	hdr->byte_count = 0;
+
+	blk_mq_start_request(rq);
+	mtip_issue_non_ncq_command(dd->port, rq->tag);
+	return 0;
+}
+
+static blk_status_t mtip_queue_rq(struct blk_mq_hw_ctx *hctx,
+			 const struct blk_mq_queue_data *bd)
+{
+	struct driver_data *dd = hctx->queue->queuedata;
+	struct request *rq = bd->rq;
+	struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
+
+	if (blk_rq_is_passthrough(rq))
+		return mtip_issue_reserved_cmd(hctx, rq);
+
+	if (unlikely(mtip_check_unal_depth(hctx, rq)))
+		return BLK_STS_DEV_RESOURCE;
+
+	if (is_se_active(dd) || is_stopped(dd, rq))
+		return BLK_STS_IOERR;
+
+	blk_mq_start_request(rq);
+
+	mtip_hw_submit_io(dd, rq, cmd, hctx);
+	return BLK_STS_OK;
+}
+
+static void mtip_free_cmd(struct blk_mq_tag_set *set, struct request *rq,
+			  unsigned int hctx_idx)
+{
+	struct driver_data *dd = set->driver_data;
+	struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
+
+	if (!cmd->command)
+		return;
+
+	dma_free_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ, cmd->command,
+			  cmd->command_dma);
+}
+
+static int mtip_init_cmd(struct blk_mq_tag_set *set, struct request *rq,
+			 unsigned int hctx_idx, unsigned int numa_node)
+{
+	struct driver_data *dd = set->driver_data;
+	struct mtip_cmd *cmd = blk_mq_rq_to_pdu(rq);
+
+	cmd->command = dma_alloc_coherent(&dd->pdev->dev, CMD_DMA_ALLOC_SZ,
+			&cmd->command_dma, GFP_KERNEL);
+	if (!cmd->command)
+		return -ENOMEM;
+
+	sg_init_table(cmd->sg, MTIP_MAX_SG);
+	return 0;
+}
+
+static enum blk_eh_timer_return mtip_cmd_timeout(struct request *req)
+{
+	struct driver_data *dd = req->q->queuedata;
+
+	if (blk_mq_is_reserved_rq(req)) {
+		struct mtip_cmd *cmd = blk_mq_rq_to_pdu(req);
+
+		cmd->status = BLK_STS_TIMEOUT;
+		blk_mq_complete_request(req);
+		return BLK_EH_DONE;
+	}
+
+	if (test_bit(req->tag, dd->port->cmds_to_issue))
+		goto exit_handler;
+
+	if (test_and_set_bit(MTIP_PF_TO_ACTIVE_BIT, &dd->port->flags))
+		goto exit_handler;
+
+	wake_up_interruptible(&dd->port->svc_wait);
+exit_handler:
+	return BLK_EH_RESET_TIMER;
+}
+
+static const struct blk_mq_ops mtip_mq_ops = {
+	.queue_rq	= mtip_queue_rq,
+	.init_request	= mtip_init_cmd,
+	.exit_request	= mtip_free_cmd,
+	.complete	= mtip_softirq_done_fn,
+	.timeout        = mtip_cmd_timeout,
+};
+
+/*
+ * Block layer initialization function.
+ *
+ * This function is called once by the PCI layer for each P320
+ * device that is connected to the system.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ *	0 on success else an error code.
+ */
+static int mtip_block_initialize(struct driver_data *dd)
+{
+	int rv = 0, wait_for_rebuild = 0;
+	sector_t capacity;
+	unsigned int index = 0;
+
+	if (dd->disk)
+		goto skip_create_disk; /* hw init done, before rebuild */
+
+	if (mtip_hw_init(dd)) {
+		rv = -EINVAL;
+		goto protocol_init_error;
+	}
+
+	memset(&dd->tags, 0, sizeof(dd->tags));
+	dd->tags.ops = &mtip_mq_ops;
+	dd->tags.nr_hw_queues = 1;
+	dd->tags.queue_depth = MTIP_MAX_COMMAND_SLOTS;
+	dd->tags.reserved_tags = 1;
+	dd->tags.cmd_size = sizeof(struct mtip_cmd);
+	dd->tags.numa_node = dd->numa_node;
+	dd->tags.flags = BLK_MQ_F_SHOULD_MERGE;
+	dd->tags.driver_data = dd;
+	dd->tags.timeout = MTIP_NCQ_CMD_TIMEOUT_MS;
+
+	rv = blk_mq_alloc_tag_set(&dd->tags);
+	if (rv) {
+		dev_err(&dd->pdev->dev,
+			"Unable to allocate request queue\n");
+		goto block_queue_alloc_tag_error;
+	}
+
+	dd->disk = blk_mq_alloc_disk(&dd->tags, dd);
+	if (IS_ERR(dd->disk)) {
+		dev_err(&dd->pdev->dev,
+			"Unable to allocate request queue\n");
+		rv = -ENOMEM;
+		goto block_queue_alloc_init_error;
+	}
+	dd->queue		= dd->disk->queue;
+
+	rv = ida_alloc(&rssd_index_ida, GFP_KERNEL);
+	if (rv < 0)
+		goto ida_get_error;
+	index = rv;
+
+	rv = rssd_disk_name_format("rssd",
+				index,
+				dd->disk->disk_name,
+				DISK_NAME_LEN);
+	if (rv)
+		goto disk_index_error;
+
+	dd->disk->major		= dd->major;
+	dd->disk->first_minor	= index * MTIP_MAX_MINORS;
+	dd->disk->minors 	= MTIP_MAX_MINORS;
+	dd->disk->fops		= &mtip_block_ops;
+	dd->disk->private_data	= dd;
+	dd->index		= index;
+
+	mtip_hw_debugfs_init(dd);
+
+skip_create_disk:
+	/* Initialize the protocol layer. */
+	wait_for_rebuild = mtip_hw_get_identify(dd);
+	if (wait_for_rebuild < 0) {
+		dev_err(&dd->pdev->dev,
+			"Protocol layer initialization failed\n");
+		rv = -EINVAL;
+		goto init_hw_cmds_error;
+	}
+
+	/*
+	 * if rebuild pending, start the service thread, and delay the block
+	 * queue creation and device_add_disk()
+	 */
+	if (wait_for_rebuild == MTIP_FTL_REBUILD_MAGIC)
+		goto start_service_thread;
+
+	/* Set device limits. */
+	blk_queue_flag_set(QUEUE_FLAG_NONROT, dd->queue);
+	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, dd->queue);
+	blk_queue_max_segments(dd->queue, MTIP_MAX_SG);
+	blk_queue_physical_block_size(dd->queue, 4096);
+	blk_queue_max_hw_sectors(dd->queue, 0xffff);
+	blk_queue_max_segment_size(dd->queue, 0x400000);
+	dma_set_max_seg_size(&dd->pdev->dev, 0x400000);
+	blk_queue_io_min(dd->queue, 4096);
+
+	/* Set the capacity of the device in 512 byte sectors. */
+	if (!(mtip_hw_get_capacity(dd, &capacity))) {
+		dev_warn(&dd->pdev->dev,
+			"Could not read drive capacity\n");
+		rv = -EIO;
+		goto read_capacity_error;
+	}
+	set_capacity(dd->disk, capacity);
+
+	/* Enable the block device and add it to /dev */
+	rv = device_add_disk(&dd->pdev->dev, dd->disk, mtip_disk_attr_groups);
+	if (rv)
+		goto read_capacity_error;
+
+	if (dd->mtip_svc_handler) {
+		set_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag);
+		return rv; /* service thread created for handling rebuild */
+	}
+
+start_service_thread:
+	dd->mtip_svc_handler = kthread_create_on_node(mtip_service_thread,
+						dd, dd->numa_node,
+						"mtip_svc_thd_%02d", index);
+
+	if (IS_ERR(dd->mtip_svc_handler)) {
+		dev_err(&dd->pdev->dev, "service thread failed to start\n");
+		dd->mtip_svc_handler = NULL;
+		rv = -EFAULT;
+		goto kthread_run_error;
+	}
+	wake_up_process(dd->mtip_svc_handler);
+	if (wait_for_rebuild == MTIP_FTL_REBUILD_MAGIC)
+		rv = wait_for_rebuild;
+
+	return rv;
+
+kthread_run_error:
+	/* Delete our gendisk. This also removes the device from /dev */
+	del_gendisk(dd->disk);
+read_capacity_error:
+init_hw_cmds_error:
+	mtip_hw_debugfs_exit(dd);
+disk_index_error:
+	ida_free(&rssd_index_ida, index);
+ida_get_error:
+	put_disk(dd->disk);
+block_queue_alloc_init_error:
+	blk_mq_free_tag_set(&dd->tags);
+block_queue_alloc_tag_error:
+	mtip_hw_exit(dd); /* De-initialize the protocol layer. */
+protocol_init_error:
+	return rv;
+}
+
+/*
+ * Function called by the PCI layer when just before the
+ * machine shuts down.
+ *
+ * If a protocol layer shutdown function is present it will be called
+ * by this function.
+ *
+ * @dd Pointer to the driver data structure.
+ *
+ * return value
+ *	0
+ */
+static int mtip_block_shutdown(struct driver_data *dd)
+{
+	mtip_hw_shutdown(dd);
+
+	dev_info(&dd->pdev->dev,
+		"Shutting down %s ...\n", dd->disk->disk_name);
+
+	if (test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag))
+		del_gendisk(dd->disk);
+
+	blk_mq_free_tag_set(&dd->tags);
+	put_disk(dd->disk);
+	return 0;
+}
+
+static int mtip_block_suspend(struct driver_data *dd)
+{
+	dev_info(&dd->pdev->dev,
+		"Suspending %s ...\n", dd->disk->disk_name);
+	mtip_hw_suspend(dd);
+	return 0;
+}
+
+static int mtip_block_resume(struct driver_data *dd)
+{
+	dev_info(&dd->pdev->dev, "Resuming %s ...\n",
+		dd->disk->disk_name);
+	mtip_hw_resume(dd);
+	return 0;
+}
+
+static void drop_cpu(int cpu)
+{
+	cpu_use[cpu]--;
+}
+
+static int get_least_used_cpu_on_node(int node)
+{
+	int cpu, least_used_cpu, least_cnt;
+	const struct cpumask *node_mask;
+
+	node_mask = cpumask_of_node(node);
+	least_used_cpu = cpumask_first(node_mask);
+	least_cnt = cpu_use[least_used_cpu];
+	cpu = least_used_cpu;
+
+	for_each_cpu(cpu, node_mask) {
+		if (cpu_use[cpu] < least_cnt) {
+			least_used_cpu = cpu;
+			least_cnt = cpu_use[cpu];
+		}
+	}
+	cpu_use[least_used_cpu]++;
+	return least_used_cpu;
+}
+
+/* Helper for selecting a node in round robin mode */
+static inline int mtip_get_next_rr_node(void)
+{
+	static int next_node = NUMA_NO_NODE;
+
+	if (next_node == NUMA_NO_NODE) {
+		next_node = first_online_node;
+		return next_node;
+	}
+
+	next_node = next_online_node(next_node);
+	if (next_node == MAX_NUMNODES)
+		next_node = first_online_node;
+	return next_node;
+}
+
+static DEFINE_HANDLER(0);
+static DEFINE_HANDLER(1);
+static DEFINE_HANDLER(2);
+static DEFINE_HANDLER(3);
+static DEFINE_HANDLER(4);
+static DEFINE_HANDLER(5);
+static DEFINE_HANDLER(6);
+static DEFINE_HANDLER(7);
+
+static void mtip_disable_link_opts(struct driver_data *dd, struct pci_dev *pdev)
+{
+	unsigned short pcie_dev_ctrl;
+
+	if (pci_is_pcie(pdev)) {
+		pcie_capability_read_word(pdev, PCI_EXP_DEVCTL, &pcie_dev_ctrl);
+		if (pcie_dev_ctrl & PCI_EXP_DEVCTL_NOSNOOP_EN ||
+		    pcie_dev_ctrl & PCI_EXP_DEVCTL_RELAX_EN) {
+			dev_info(&dd->pdev->dev,
+				"Disabling ERO/No-Snoop on bridge device %04x:%04x\n",
+					pdev->vendor, pdev->device);
+			pcie_dev_ctrl &= ~(PCI_EXP_DEVCTL_NOSNOOP_EN |
+						PCI_EXP_DEVCTL_RELAX_EN);
+			pcie_capability_write_word(pdev, PCI_EXP_DEVCTL,
+				pcie_dev_ctrl);
+		}
+	}
+}
+
+static void mtip_fix_ero_nosnoop(struct driver_data *dd, struct pci_dev *pdev)
+{
+	/*
+	 * This workaround is specific to AMD/ATI chipset with a PCI upstream
+	 * device with device id 0x5aXX
+	 */
+	if (pdev->bus && pdev->bus->self) {
+		if (pdev->bus->self->vendor == PCI_VENDOR_ID_ATI &&
+		    ((pdev->bus->self->device & 0xff00) == 0x5a00)) {
+			mtip_disable_link_opts(dd, pdev->bus->self);
+		} else {
+			/* Check further up the topology */
+			struct pci_dev *parent_dev = pdev->bus->self;
+			if (parent_dev->bus &&
+				parent_dev->bus->parent &&
+				parent_dev->bus->parent->self &&
+				parent_dev->bus->parent->self->vendor ==
+					 PCI_VENDOR_ID_ATI &&
+				(parent_dev->bus->parent->self->device &
+					0xff00) == 0x5a00) {
+				mtip_disable_link_opts(dd,
+					parent_dev->bus->parent->self);
+			}
+		}
+	}
+}
+
+/*
+ * Called for each supported PCI device detected.
+ *
+ * This function allocates the private data structure, enables the
+ * PCI device and then calls the block layer initialization function.
+ *
+ * return value
+ *	0 on success else an error code.
+ */
+static int mtip_pci_probe(struct pci_dev *pdev,
+			const struct pci_device_id *ent)
+{
+	int rv = 0;
+	struct driver_data *dd = NULL;
+	char cpu_list[256];
+	const struct cpumask *node_mask;
+	int cpu, i = 0, j = 0;
+	int my_node = NUMA_NO_NODE;
+
+	/* Allocate memory for this devices private data. */
+	my_node = pcibus_to_node(pdev->bus);
+	if (my_node != NUMA_NO_NODE) {
+		if (!node_online(my_node))
+			my_node = mtip_get_next_rr_node();
+	} else {
+		dev_info(&pdev->dev, "Kernel not reporting proximity, choosing a node\n");
+		my_node = mtip_get_next_rr_node();
+	}
+	dev_info(&pdev->dev, "NUMA node %d (closest: %d,%d, probe on %d:%d)\n",
+		my_node, pcibus_to_node(pdev->bus), dev_to_node(&pdev->dev),
+		cpu_to_node(raw_smp_processor_id()), raw_smp_processor_id());
+
+	dd = kzalloc_node(sizeof(struct driver_data), GFP_KERNEL, my_node);
+	if (!dd)
+		return -ENOMEM;
+
+	/* Attach the private data to this PCI device.  */
+	pci_set_drvdata(pdev, dd);
+
+	rv = pcim_enable_device(pdev);
+	if (rv < 0) {
+		dev_err(&pdev->dev, "Unable to enable device\n");
+		goto iomap_err;
+	}
+
+	/* Map BAR5 to memory. */
+	rv = pcim_iomap_regions(pdev, 1 << MTIP_ABAR, MTIP_DRV_NAME);
+	if (rv < 0) {
+		dev_err(&pdev->dev, "Unable to map regions\n");
+		goto iomap_err;
+	}
+
+	rv = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64));
+	if (rv) {
+		dev_warn(&pdev->dev, "64-bit DMA enable failed\n");
+		goto setmask_err;
+	}
+
+	/* Copy the info we may need later into the private data structure. */
+	dd->major	= mtip_major;
+	dd->instance	= instance;
+	dd->pdev	= pdev;
+	dd->numa_node	= my_node;
+
+	memset(dd->workq_name, 0, 32);
+	snprintf(dd->workq_name, 31, "mtipq%d", dd->instance);
+
+	dd->isr_workq = create_workqueue(dd->workq_name);
+	if (!dd->isr_workq) {
+		dev_warn(&pdev->dev, "Can't create wq %d\n", dd->instance);
+		rv = -ENOMEM;
+		goto setmask_err;
+	}
+
+	memset(cpu_list, 0, sizeof(cpu_list));
+
+	node_mask = cpumask_of_node(dd->numa_node);
+	if (!cpumask_empty(node_mask)) {
+		for_each_cpu(cpu, node_mask)
+		{
+			snprintf(&cpu_list[j], 256 - j, "%d ", cpu);
+			j = strlen(cpu_list);
+		}
+
+		dev_info(&pdev->dev, "Node %d on package %d has %d cpu(s): %s\n",
+			dd->numa_node,
+			topology_physical_package_id(cpumask_first(node_mask)),
+			nr_cpus_node(dd->numa_node),
+			cpu_list);
+	} else
+		dev_dbg(&pdev->dev, "mtip32xx: node_mask empty\n");
+
+	dd->isr_binding = get_least_used_cpu_on_node(dd->numa_node);
+	dev_info(&pdev->dev, "Initial IRQ binding node:cpu %d:%d\n",
+		cpu_to_node(dd->isr_binding), dd->isr_binding);
+
+	/* first worker context always runs in ISR */
+	dd->work[0].cpu_binding = dd->isr_binding;
+	dd->work[1].cpu_binding = get_least_used_cpu_on_node(dd->numa_node);
+	dd->work[2].cpu_binding = get_least_used_cpu_on_node(dd->numa_node);
+	dd->work[3].cpu_binding = dd->work[0].cpu_binding;
+	dd->work[4].cpu_binding = dd->work[1].cpu_binding;
+	dd->work[5].cpu_binding = dd->work[2].cpu_binding;
+	dd->work[6].cpu_binding = dd->work[2].cpu_binding;
+	dd->work[7].cpu_binding = dd->work[1].cpu_binding;
+
+	/* Log the bindings */
+	for_each_present_cpu(cpu) {
+		memset(cpu_list, 0, sizeof(cpu_list));
+		for (i = 0, j = 0; i < MTIP_MAX_SLOT_GROUPS; i++) {
+			if (dd->work[i].cpu_binding == cpu) {
+				snprintf(&cpu_list[j], 256 - j, "%d ", i);
+				j = strlen(cpu_list);
+			}
+		}
+		if (j)
+			dev_info(&pdev->dev, "CPU %d: WQs %s\n", cpu, cpu_list);
+	}
+
+	INIT_WORK(&dd->work[0].work, mtip_workq_sdbf0);
+	INIT_WORK(&dd->work[1].work, mtip_workq_sdbf1);
+	INIT_WORK(&dd->work[2].work, mtip_workq_sdbf2);
+	INIT_WORK(&dd->work[3].work, mtip_workq_sdbf3);
+	INIT_WORK(&dd->work[4].work, mtip_workq_sdbf4);
+	INIT_WORK(&dd->work[5].work, mtip_workq_sdbf5);
+	INIT_WORK(&dd->work[6].work, mtip_workq_sdbf6);
+	INIT_WORK(&dd->work[7].work, mtip_workq_sdbf7);
+
+	pci_set_master(pdev);
+	rv = pci_enable_msi(pdev);
+	if (rv) {
+		dev_warn(&pdev->dev,
+			"Unable to enable MSI interrupt.\n");
+		goto msi_initialize_err;
+	}
+
+	mtip_fix_ero_nosnoop(dd, pdev);
+
+	/* Initialize the block layer. */
+	rv = mtip_block_initialize(dd);
+	if (rv < 0) {
+		dev_err(&pdev->dev,
+			"Unable to initialize block layer\n");
+		goto block_initialize_err;
+	}
+
+	/*
+	 * Increment the instance count so that each device has a unique
+	 * instance number.
+	 */
+	instance++;
+	if (rv != MTIP_FTL_REBUILD_MAGIC)
+		set_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag);
+	else
+		rv = 0; /* device in rebuild state, return 0 from probe */
+
+	goto done;
+
+block_initialize_err:
+	pci_disable_msi(pdev);
+
+msi_initialize_err:
+	if (dd->isr_workq) {
+		destroy_workqueue(dd->isr_workq);
+		drop_cpu(dd->work[0].cpu_binding);
+		drop_cpu(dd->work[1].cpu_binding);
+		drop_cpu(dd->work[2].cpu_binding);
+	}
+setmask_err:
+	pcim_iounmap_regions(pdev, 1 << MTIP_ABAR);
+
+iomap_err:
+	kfree(dd);
+	pci_set_drvdata(pdev, NULL);
+	return rv;
+done:
+	return rv;
+}
+
+/*
+ * Called for each probed device when the device is removed or the
+ * driver is unloaded.
+ *
+ * return value
+ *	None
+ */
+static void mtip_pci_remove(struct pci_dev *pdev)
+{
+	struct driver_data *dd = pci_get_drvdata(pdev);
+	unsigned long to;
+
+	mtip_check_surprise_removal(dd);
+	synchronize_irq(dd->pdev->irq);
+
+	/* Spin until workers are done */
+	to = jiffies + msecs_to_jiffies(4000);
+	do {
+		msleep(20);
+	} while (atomic_read(&dd->irq_workers_active) != 0 &&
+		time_before(jiffies, to));
+
+	if (atomic_read(&dd->irq_workers_active) != 0) {
+		dev_warn(&dd->pdev->dev,
+			"Completion workers still active!\n");
+	}
+
+	set_bit(MTIP_DDF_REMOVE_PENDING_BIT, &dd->dd_flag);
+
+	if (test_bit(MTIP_DDF_INIT_DONE_BIT, &dd->dd_flag))
+		del_gendisk(dd->disk);
+
+	mtip_hw_debugfs_exit(dd);
+
+	if (dd->mtip_svc_handler) {
+		set_bit(MTIP_PF_SVC_THD_STOP_BIT, &dd->port->flags);
+		wake_up_interruptible(&dd->port->svc_wait);
+		kthread_stop(dd->mtip_svc_handler);
+	}
+
+	if (!dd->sr) {
+		/*
+		 * Explicitly wait here for IOs to quiesce,
+		 * as mtip_standby_drive usually won't wait for IOs.
+		 */
+		if (!mtip_quiesce_io(dd->port, MTIP_QUIESCE_IO_TIMEOUT_MS))
+			mtip_standby_drive(dd);
+	}
+	else
+		dev_info(&dd->pdev->dev, "device %s surprise removal\n",
+						dd->disk->disk_name);
+
+	blk_mq_free_tag_set(&dd->tags);
+
+	/* De-initialize the protocol layer. */
+	mtip_hw_exit(dd);
+
+	if (dd->isr_workq) {
+		destroy_workqueue(dd->isr_workq);
+		drop_cpu(dd->work[0].cpu_binding);
+		drop_cpu(dd->work[1].cpu_binding);
+		drop_cpu(dd->work[2].cpu_binding);
+	}
+
+	pci_disable_msi(pdev);
+
+	pcim_iounmap_regions(pdev, 1 << MTIP_ABAR);
+	pci_set_drvdata(pdev, NULL);
+
+	put_disk(dd->disk);
+}
+
+/*
+ * Called for each probed device when the device is suspended.
+ *
+ * return value
+ *	0  Success
+ *	<0 Error
+ */
+static int __maybe_unused mtip_pci_suspend(struct device *dev)
+{
+	int rv = 0;
+	struct driver_data *dd = dev_get_drvdata(dev);
+
+	set_bit(MTIP_DDF_RESUME_BIT, &dd->dd_flag);
+
+	/* Disable ports & interrupts then send standby immediate */
+	rv = mtip_block_suspend(dd);
+	if (rv < 0)
+		dev_err(dev, "Failed to suspend controller\n");
+
+	return rv;
+}
+
+/*
+ * Called for each probed device when the device is resumed.
+ *
+ * return value
+ *      0  Success
+ *      <0 Error
+ */
+static int __maybe_unused mtip_pci_resume(struct device *dev)
+{
+	int rv = 0;
+	struct driver_data *dd = dev_get_drvdata(dev);
+
+	/*
+	 * Calls hbaReset, initPort, & startPort function
+	 * then enables interrupts
+	 */
+	rv = mtip_block_resume(dd);
+	if (rv < 0)
+		dev_err(dev, "Unable to resume\n");
+
+	clear_bit(MTIP_DDF_RESUME_BIT, &dd->dd_flag);
+
+	return rv;
+}
+
+/*
+ * Shutdown routine
+ *
+ * return value
+ *      None
+ */
+static void mtip_pci_shutdown(struct pci_dev *pdev)
+{
+	struct driver_data *dd = pci_get_drvdata(pdev);
+	if (dd)
+		mtip_block_shutdown(dd);
+}
+
+/* Table of device ids supported by this driver. */
+static const struct pci_device_id mtip_pci_tbl[] = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_MICRON, P320H_DEVICE_ID) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_MICRON, P320M_DEVICE_ID) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_MICRON, P320S_DEVICE_ID) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_MICRON, P325M_DEVICE_ID) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_MICRON, P420H_DEVICE_ID) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_MICRON, P420M_DEVICE_ID) },
+	{ PCI_DEVICE(PCI_VENDOR_ID_MICRON, P425M_DEVICE_ID) },
+	{ 0 }
+};
+
+static SIMPLE_DEV_PM_OPS(mtip_pci_pm_ops, mtip_pci_suspend, mtip_pci_resume);
+
+/* Structure that describes the PCI driver functions. */
+static struct pci_driver mtip_pci_driver = {
+	.name			= MTIP_DRV_NAME,
+	.id_table		= mtip_pci_tbl,
+	.probe			= mtip_pci_probe,
+	.remove			= mtip_pci_remove,
+	.driver.pm		= &mtip_pci_pm_ops,
+	.shutdown		= mtip_pci_shutdown,
+};
+
+MODULE_DEVICE_TABLE(pci, mtip_pci_tbl);
+
+/*
+ * Module initialization function.
+ *
+ * Called once when the module is loaded. This function allocates a major
+ * block device number to the Cyclone devices and registers the PCI layer
+ * of the driver.
+ *
+ * Return value
+ *      0 on success else error code.
+ */
+static int __init mtip_init(void)
+{
+	int error;
+
+	pr_info(MTIP_DRV_NAME " Version " MTIP_DRV_VERSION "\n");
+
+	/* Allocate a major block device number to use with this driver. */
+	error = register_blkdev(0, MTIP_DRV_NAME);
+	if (error <= 0) {
+		pr_err("Unable to register block device (%d)\n",
+		error);
+		return -EBUSY;
+	}
+	mtip_major = error;
+
+	dfs_parent = debugfs_create_dir("rssd", NULL);
+	if (IS_ERR_OR_NULL(dfs_parent)) {
+		pr_warn("Error creating debugfs parent\n");
+		dfs_parent = NULL;
+	}
+
+	/* Register our PCI operations. */
+	error = pci_register_driver(&mtip_pci_driver);
+	if (error) {
+		debugfs_remove(dfs_parent);
+		unregister_blkdev(mtip_major, MTIP_DRV_NAME);
+	}
+
+	return error;
+}
+
+/*
+ * Module de-initialization function.
+ *
+ * Called once when the module is unloaded. This function deallocates
+ * the major block device number allocated by mtip_init() and
+ * unregisters the PCI layer of the driver.
+ *
+ * Return value
+ *      none
+ */
+static void __exit mtip_exit(void)
+{
+	/* Release the allocated major block device number. */
+	unregister_blkdev(mtip_major, MTIP_DRV_NAME);
+
+	/* Unregister the PCI driver. */
+	pci_unregister_driver(&mtip_pci_driver);
+
+	debugfs_remove_recursive(dfs_parent);
+}
+
+MODULE_AUTHOR("Micron Technology, Inc");
+MODULE_DESCRIPTION("Micron RealSSD PCIe Block Driver");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(MTIP_DRV_VERSION);
+
+module_init(mtip_init);
+module_exit(mtip_exit);
diff --git a/drivers/block/mtip32xx/mtip32xx.h b/drivers/block/mtip32xx/mtip32xx.h
new file mode 100644
index 000000000..f7328f19a
--- /dev/null
+++ b/drivers/block/mtip32xx/mtip32xx.h
@@ -0,0 +1,467 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * mtip32xx.h - Header file for the P320 SSD Block Driver
+ *   Copyright (C) 2011 Micron Technology, Inc.
+ *
+ * Portions of this code were derived from works subjected to the
+ * following copyright:
+ *    Copyright (C) 2009 Integrated Device Technology, Inc.
+ */
+
+#ifndef __MTIP32XX_H__
+#define __MTIP32XX_H__
+
+#include <linux/spinlock.h>
+#include <linux/rwsem.h>
+#include <linux/ata.h>
+#include <linux/interrupt.h>
+
+/* Offset of Subsystem Device ID in pci confoguration space */
+#define PCI_SUBSYSTEM_DEVICEID	0x2E
+
+/* offset of Device Control register in PCIe extended capabilites space */
+#define PCIE_CONFIG_EXT_DEVICE_CONTROL_OFFSET	0x48
+
+/* check for erase mode support during secure erase */
+#define MTIP_SEC_ERASE_MODE     0x2
+
+/* # of times to retry timed out/failed IOs */
+#define MTIP_MAX_RETRIES	2
+
+/* Various timeout values in ms */
+#define MTIP_NCQ_CMD_TIMEOUT_MS      15000
+#define MTIP_IOCTL_CMD_TIMEOUT_MS    5000
+#define MTIP_INT_CMD_TIMEOUT_MS      5000
+#define MTIP_QUIESCE_IO_TIMEOUT_MS   (MTIP_NCQ_CMD_TIMEOUT_MS * \
+				     (MTIP_MAX_RETRIES + 1))
+
+/* check for timeouts every 500ms */
+#define MTIP_TIMEOUT_CHECK_PERIOD	500
+
+/* ftl rebuild */
+#define MTIP_FTL_REBUILD_OFFSET		142
+#define MTIP_FTL_REBUILD_MAGIC		0xED51
+#define MTIP_FTL_REBUILD_TIMEOUT_MS	2400000
+
+/* unaligned IO handling */
+#define MTIP_MAX_UNALIGNED_SLOTS	2
+
+/* Macro to extract the tag bit number from a tag value. */
+#define MTIP_TAG_BIT(tag)	(tag & 0x1F)
+
+/*
+ * Macro to extract the tag index from a tag value. The index
+ * is used to access the correct s_active/Command Issue register based
+ * on the tag value.
+ */
+#define MTIP_TAG_INDEX(tag)	(tag >> 5)
+
+/*
+ * Maximum number of scatter gather entries
+ * a single command may have.
+ */
+#define MTIP_MAX_SG		504
+
+/*
+ * Maximum number of slot groups (Command Issue & s_active registers)
+ * NOTE: This is the driver maximum; check dd->slot_groups for actual value.
+ */
+#define MTIP_MAX_SLOT_GROUPS	8
+
+/* Internal command tag. */
+#define MTIP_TAG_INTERNAL	0
+
+/* Micron Vendor ID & P320x SSD Device ID */
+#define PCI_VENDOR_ID_MICRON    0x1344
+#define P320H_DEVICE_ID		0x5150
+#define P320M_DEVICE_ID		0x5151
+#define P320S_DEVICE_ID		0x5152
+#define P325M_DEVICE_ID		0x5153
+#define P420H_DEVICE_ID		0x5160
+#define P420M_DEVICE_ID		0x5161
+#define P425M_DEVICE_ID		0x5163
+
+/* Driver name and version strings */
+#define MTIP_DRV_NAME		"mtip32xx"
+#define MTIP_DRV_VERSION	"1.3.1"
+
+/* Maximum number of minor device numbers per device. */
+#define MTIP_MAX_MINORS		16
+
+/* Maximum number of supported command slots. */
+#define MTIP_MAX_COMMAND_SLOTS	(MTIP_MAX_SLOT_GROUPS * 32)
+
+/*
+ * Per-tag bitfield size in longs.
+ * Linux bit manipulation functions
+ * (i.e. test_and_set_bit, find_next_zero_bit)
+ * manipulate memory in longs, so we try to make the math work.
+ * take the slot groups and find the number of longs, rounding up.
+ * Careful! i386 and x86_64 use different size longs!
+ */
+#define U32_PER_LONG	(sizeof(long) / sizeof(u32))
+#define SLOTBITS_IN_LONGS ((MTIP_MAX_SLOT_GROUPS + \
+					(U32_PER_LONG-1))/U32_PER_LONG)
+
+/* BAR number used to access the HBA registers. */
+#define MTIP_ABAR		5
+
+#ifdef DEBUG
+ #define dbg_printk(format, arg...)	\
+	printk(pr_fmt(format), ##arg);
+#else
+ #define dbg_printk(format, arg...)
+#endif
+
+#define MTIP_DFS_MAX_BUF_SIZE 1024
+
+enum {
+	/* below are bit numbers in 'flags' defined in mtip_port */
+	MTIP_PF_IC_ACTIVE_BIT       = 0, /* pio/ioctl */
+	MTIP_PF_EH_ACTIVE_BIT       = 1, /* error handling */
+	MTIP_PF_SE_ACTIVE_BIT       = 2, /* secure erase */
+	MTIP_PF_DM_ACTIVE_BIT       = 3, /* download microcde */
+	MTIP_PF_TO_ACTIVE_BIT       = 9, /* timeout handling */
+	MTIP_PF_PAUSE_IO      =	((1 << MTIP_PF_IC_ACTIVE_BIT) |
+				(1 << MTIP_PF_EH_ACTIVE_BIT) |
+				(1 << MTIP_PF_SE_ACTIVE_BIT) |
+				(1 << MTIP_PF_DM_ACTIVE_BIT) |
+				(1 << MTIP_PF_TO_ACTIVE_BIT)),
+	MTIP_PF_HOST_CAP_64         = 10, /* cache HOST_CAP_64 */
+
+	MTIP_PF_SVC_THD_ACTIVE_BIT  = 4,
+	MTIP_PF_ISSUE_CMDS_BIT      = 5,
+	MTIP_PF_REBUILD_BIT         = 6,
+	MTIP_PF_SVC_THD_STOP_BIT    = 8,
+
+	MTIP_PF_SVC_THD_WORK	= ((1 << MTIP_PF_EH_ACTIVE_BIT) |
+				  (1 << MTIP_PF_ISSUE_CMDS_BIT) |
+				  (1 << MTIP_PF_REBUILD_BIT) |
+				  (1 << MTIP_PF_SVC_THD_STOP_BIT) |
+				  (1 << MTIP_PF_TO_ACTIVE_BIT)),
+
+	/* below are bit numbers in 'dd_flag' defined in driver_data */
+	MTIP_DDF_SEC_LOCK_BIT	    = 0,
+	MTIP_DDF_REMOVE_PENDING_BIT = 1,
+	MTIP_DDF_OVER_TEMP_BIT      = 2,
+	MTIP_DDF_WRITE_PROTECT_BIT  = 3,
+	MTIP_DDF_CLEANUP_BIT        = 5,
+	MTIP_DDF_RESUME_BIT         = 6,
+	MTIP_DDF_INIT_DONE_BIT      = 7,
+	MTIP_DDF_REBUILD_FAILED_BIT = 8,
+
+	MTIP_DDF_STOP_IO      = ((1 << MTIP_DDF_REMOVE_PENDING_BIT) |
+				(1 << MTIP_DDF_SEC_LOCK_BIT) |
+				(1 << MTIP_DDF_OVER_TEMP_BIT) |
+				(1 << MTIP_DDF_WRITE_PROTECT_BIT) |
+				(1 << MTIP_DDF_REBUILD_FAILED_BIT)),
+
+};
+
+struct smart_attr {
+	u8 attr_id;
+	__le16 flags;
+	u8 cur;
+	u8 worst;
+	__le32 data;
+	u8 res[3];
+} __packed;
+
+struct mtip_work {
+	struct work_struct work;
+	void *port;
+	int cpu_binding;
+	u32 completed;
+} ____cacheline_aligned_in_smp;
+
+#define DEFINE_HANDLER(group)                                  \
+	void mtip_workq_sdbf##group(struct work_struct *work)       \
+	{                                                      \
+		struct mtip_work *w = (struct mtip_work *) work;         \
+		mtip_workq_sdbfx(w->port, group, w->completed);     \
+	}
+
+/* Register Frame Information Structure (FIS), host to device. */
+struct host_to_dev_fis {
+	/*
+	 * FIS type.
+	 * - 27h Register FIS, host to device.
+	 * - 34h Register FIS, device to host.
+	 * - 39h DMA Activate FIS, device to host.
+	 * - 41h DMA Setup FIS, bi-directional.
+	 * - 46h Data FIS, bi-directional.
+	 * - 58h BIST Activate FIS, bi-directional.
+	 * - 5Fh PIO Setup FIS, device to host.
+	 * - A1h Set Device Bits FIS, device to host.
+	 */
+	unsigned char type;
+	unsigned char opts;
+	unsigned char command;
+	unsigned char features;
+
+	union {
+		unsigned char lba_low;
+		unsigned char sector;
+	};
+	union {
+		unsigned char lba_mid;
+		unsigned char cyl_low;
+	};
+	union {
+		unsigned char lba_hi;
+		unsigned char cyl_hi;
+	};
+	union {
+		unsigned char device;
+		unsigned char head;
+	};
+
+	union {
+		unsigned char lba_low_ex;
+		unsigned char sector_ex;
+	};
+	union {
+		unsigned char lba_mid_ex;
+		unsigned char cyl_low_ex;
+	};
+	union {
+		unsigned char lba_hi_ex;
+		unsigned char cyl_hi_ex;
+	};
+	unsigned char features_ex;
+
+	unsigned char sect_count;
+	unsigned char sect_cnt_ex;
+	unsigned char res2;
+	unsigned char control;
+
+	unsigned int res3;
+};
+
+/* Command header structure. */
+struct mtip_cmd_hdr {
+	/*
+	 * Command options.
+	 * - Bits 31:16 Number of PRD entries.
+	 * - Bits 15:8 Unused in this implementation.
+	 * - Bit 7 Prefetch bit, informs the drive to prefetch PRD entries.
+	 * - Bit 6 Write bit, should be set when writing data to the device.
+	 * - Bit 5 Unused in this implementation.
+	 * - Bits 4:0 Length of the command FIS in DWords (DWord = 4 bytes).
+	 */
+	__le32 opts;
+	/* This field is unsed when using NCQ. */
+	union {
+		__le32 byte_count;
+		__le32 status;
+	};
+	/*
+	 * Lower 32 bits of the command table address associated with this
+	 * header. The command table addresses must be 128 byte aligned.
+	 */
+	__le32 ctba;
+	/*
+	 * If 64 bit addressing is used this field is the upper 32 bits
+	 * of the command table address associated with this command.
+	 */
+	__le32 ctbau;
+	/* Reserved and unused. */
+	u32 res[4];
+};
+
+/* Command scatter gather structure (PRD). */
+struct mtip_cmd_sg {
+	/*
+	 * Low 32 bits of the data buffer address. For P320 this
+	 * address must be 8 byte aligned signified by bits 2:0 being
+	 * set to 0.
+	 */
+	__le32 dba;
+	/*
+	 * When 64 bit addressing is used this field is the upper
+	 * 32 bits of the data buffer address.
+	 */
+	__le32 dba_upper;
+	/* Unused. */
+	__le32 reserved;
+	/*
+	 * Bit 31: interrupt when this data block has been transferred.
+	 * Bits 30..22: reserved
+	 * Bits 21..0: byte count (minus 1).  For P320 the byte count must be
+	 * 8 byte aligned signified by bits 2:0 being set to 1.
+	 */
+	__le32 info;
+};
+struct mtip_port;
+
+struct mtip_int_cmd;
+
+/* Structure used to describe a command. */
+struct mtip_cmd {
+	void *command; /* ptr to command table entry */
+
+	dma_addr_t command_dma; /* corresponding physical address */
+
+	int scatter_ents; /* Number of scatter list entries used */
+
+	int unaligned; /* command is unaligned on 4k boundary */
+
+	union {
+		struct scatterlist sg[MTIP_MAX_SG]; /* Scatter list entries */
+		struct mtip_int_cmd *icmd;
+	};
+
+	int retries; /* The number of retries left for this command. */
+
+	int direction; /* Data transfer direction */
+	blk_status_t status;
+};
+
+/* Structure used to describe a port. */
+struct mtip_port {
+	/* Pointer back to the driver data for this port. */
+	struct driver_data *dd;
+	/*
+	 * Used to determine if the data pointed to by the
+	 * identify field is valid.
+	 */
+	unsigned long identify_valid;
+	/* Base address of the memory mapped IO for the port. */
+	void __iomem *mmio;
+	/* Array of pointers to the memory mapped s_active registers. */
+	void __iomem *s_active[MTIP_MAX_SLOT_GROUPS];
+	/* Array of pointers to the memory mapped completed registers. */
+	void __iomem *completed[MTIP_MAX_SLOT_GROUPS];
+	/* Array of pointers to the memory mapped Command Issue registers. */
+	void __iomem *cmd_issue[MTIP_MAX_SLOT_GROUPS];
+	/*
+	 * Pointer to the beginning of the command header memory as used
+	 * by the driver.
+	 */
+	void *command_list;
+	/*
+	 * Pointer to the beginning of the command header memory as used
+	 * by the DMA.
+	 */
+	dma_addr_t command_list_dma;
+	/*
+	 * Pointer to the beginning of the RX FIS memory as used
+	 * by the driver.
+	 */
+	void *rxfis;
+	/*
+	 * Pointer to the beginning of the RX FIS memory as used
+	 * by the DMA.
+	 */
+	dma_addr_t rxfis_dma;
+	/*
+	 * Pointer to the DMA region for RX Fis, Identify, RLE10, and SMART
+	 */
+	void *block1;
+	/*
+	 * DMA address of region for RX Fis, Identify, RLE10, and SMART
+	 */
+	dma_addr_t block1_dma;
+	/*
+	 * Pointer to the beginning of the identify data memory as used
+	 * by the driver.
+	 */
+	u16 *identify;
+	/*
+	 * Pointer to the beginning of the identify data memory as used
+	 * by the DMA.
+	 */
+	dma_addr_t identify_dma;
+	/*
+	 * Pointer to the beginning of a sector buffer that is used
+	 * by the driver when issuing internal commands.
+	 */
+	u16 *sector_buffer;
+	/*
+	 * Pointer to the beginning of a sector buffer that is used
+	 * by the DMA when the driver issues internal commands.
+	 */
+	dma_addr_t sector_buffer_dma;
+
+	u16 *log_buf;
+	dma_addr_t log_buf_dma;
+
+	u8 *smart_buf;
+	dma_addr_t smart_buf_dma;
+
+	/*
+	 * used to queue commands when an internal command is in progress
+	 * or error handling is active
+	 */
+	unsigned long cmds_to_issue[SLOTBITS_IN_LONGS];
+	/* Used by mtip_service_thread to wait for an event */
+	wait_queue_head_t svc_wait;
+	/*
+	 * indicates the state of the port. Also, helps the service thread
+	 * to determine its action on wake up.
+	 */
+	unsigned long flags;
+	/*
+	 * Timer used to complete commands that have been active for too long.
+	 */
+	unsigned long ic_pause_timer;
+
+	/* Counter to control queue depth of unaligned IOs */
+	atomic_t cmd_slot_unal;
+
+	/* Spinlock for working around command-issue bug. */
+	spinlock_t cmd_issue_lock[MTIP_MAX_SLOT_GROUPS];
+};
+
+/*
+ * Driver private data structure.
+ *
+ * One structure is allocated per probed device.
+ */
+struct driver_data {
+	void __iomem *mmio; /* Base address of the HBA registers. */
+
+	int major; /* Major device number. */
+
+	int instance; /* Instance number. First device probed is 0, ... */
+
+	struct gendisk *disk; /* Pointer to our gendisk structure. */
+
+	struct pci_dev *pdev; /* Pointer to the PCI device structure. */
+
+	struct request_queue *queue; /* Our request queue. */
+
+	struct blk_mq_tag_set tags; /* blk_mq tags */
+
+	struct mtip_port *port; /* Pointer to the port data structure. */
+
+	unsigned product_type; /* magic value declaring the product type */
+
+	unsigned slot_groups; /* number of slot groups the product supports */
+
+	unsigned long index; /* Index to determine the disk name */
+
+	unsigned long dd_flag; /* NOTE: use atomic bit operations on this */
+
+	struct task_struct *mtip_svc_handler; /* task_struct of svc thd */
+
+	struct dentry *dfs_node;
+
+	bool sr;
+
+	int numa_node; /* NUMA support */
+
+	char workq_name[32];
+
+	struct workqueue_struct *isr_workq;
+
+	atomic_t irq_workers_active;
+
+	struct mtip_work work[MTIP_MAX_SLOT_GROUPS];
+
+	int isr_binding;
+
+	int unal_qdepth; /* qdepth of unaligned IO queue */
+};
+
+#endif
diff --git a/drivers/block/n64cart.c b/drivers/block/n64cart.c
new file mode 100644
index 000000000..d914156db
--- /dev/null
+++ b/drivers/block/n64cart.c
@@ -0,0 +1,180 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Support for the N64 cart.
+ *
+ * Copyright (c) 2021 Lauri Kasanen
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <linux/bitops.h>
+#include <linux/blkdev.h>
+#include <linux/dma-mapping.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+
+enum {
+	PI_DRAM_REG = 0,
+	PI_CART_REG,
+	PI_READ_REG,
+	PI_WRITE_REG,
+	PI_STATUS_REG,
+};
+
+#define PI_STATUS_DMA_BUSY	(1 << 0)
+#define PI_STATUS_IO_BUSY	(1 << 1)
+
+#define CART_DOMAIN		0x10000000
+#define CART_MAX		0x1FFFFFFF
+
+#define MIN_ALIGNMENT		8
+
+static u32 __iomem *reg_base;
+
+static unsigned int start;
+module_param(start, uint, 0);
+MODULE_PARM_DESC(start, "Start address of the cart block data");
+
+static unsigned int size;
+module_param(size, uint, 0);
+MODULE_PARM_DESC(size, "Size of the cart block data, in bytes");
+
+static void n64cart_write_reg(const u8 reg, const u32 value)
+{
+	writel(value, reg_base + reg);
+}
+
+static u32 n64cart_read_reg(const u8 reg)
+{
+	return readl(reg_base + reg);
+}
+
+static void n64cart_wait_dma(void)
+{
+	while (n64cart_read_reg(PI_STATUS_REG) &
+		(PI_STATUS_DMA_BUSY | PI_STATUS_IO_BUSY))
+		cpu_relax();
+}
+
+/*
+ * Process a single bvec of a bio.
+ */
+static bool n64cart_do_bvec(struct device *dev, struct bio_vec *bv, u32 pos)
+{
+	dma_addr_t dma_addr;
+	const u32 bstart = pos + start;
+
+	/* Alignment check */
+	WARN_ON_ONCE((bv->bv_offset & (MIN_ALIGNMENT - 1)) ||
+		     (bv->bv_len & (MIN_ALIGNMENT - 1)));
+
+	dma_addr = dma_map_bvec(dev, bv, DMA_FROM_DEVICE, 0);
+	if (dma_mapping_error(dev, dma_addr))
+		return false;
+
+	n64cart_wait_dma();
+
+	n64cart_write_reg(PI_DRAM_REG, dma_addr);
+	n64cart_write_reg(PI_CART_REG, (bstart | CART_DOMAIN) & CART_MAX);
+	n64cart_write_reg(PI_WRITE_REG, bv->bv_len - 1);
+
+	n64cart_wait_dma();
+
+	dma_unmap_page(dev, dma_addr, bv->bv_len, DMA_FROM_DEVICE);
+	return true;
+}
+
+static void n64cart_submit_bio(struct bio *bio)
+{
+	struct bio_vec bvec;
+	struct bvec_iter iter;
+	struct device *dev = bio->bi_bdev->bd_disk->private_data;
+	u32 pos = bio->bi_iter.bi_sector << SECTOR_SHIFT;
+
+	bio_for_each_segment(bvec, bio, iter) {
+		if (!n64cart_do_bvec(dev, &bvec, pos)) {
+			bio_io_error(bio);
+			return;
+		}
+		pos += bvec.bv_len;
+	}
+
+	bio_endio(bio);
+}
+
+static const struct block_device_operations n64cart_fops = {
+	.owner		= THIS_MODULE,
+	.submit_bio	= n64cart_submit_bio,
+};
+
+/*
+ * The target device is embedded and RAM-constrained. We save RAM
+ * by initializing in __init code that gets dropped late in boot.
+ * For the same reason there is no module or unloading support.
+ */
+static int __init n64cart_probe(struct platform_device *pdev)
+{
+	struct gendisk *disk;
+	int err = -ENOMEM;
+
+	if (!start || !size) {
+		pr_err("start or size not specified\n");
+		return -ENODEV;
+	}
+
+	if (size & 4095) {
+		pr_err("size must be a multiple of 4K\n");
+		return -ENODEV;
+	}
+
+	reg_base = devm_platform_ioremap_resource(pdev, 0);
+	if (IS_ERR(reg_base))
+		return PTR_ERR(reg_base);
+
+	disk = blk_alloc_disk(NUMA_NO_NODE);
+	if (!disk)
+		goto out;
+
+	disk->first_minor = 0;
+	disk->flags = GENHD_FL_NO_PART;
+	disk->fops = &n64cart_fops;
+	disk->private_data = &pdev->dev;
+	strcpy(disk->disk_name, "n64cart");
+
+	set_capacity(disk, size >> SECTOR_SHIFT);
+	set_disk_ro(disk, 1);
+
+	blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
+	blk_queue_physical_block_size(disk->queue, 4096);
+	blk_queue_logical_block_size(disk->queue, 4096);
+
+	err = add_disk(disk);
+	if (err)
+		goto out_cleanup_disk;
+
+	pr_info("n64cart: %u kb disk\n", size / 1024);
+
+	return 0;
+
+out_cleanup_disk:
+	put_disk(disk);
+out:
+	return err;
+}
+
+static struct platform_driver n64cart_driver = {
+	.driver = {
+		.name = "n64cart",
+	},
+};
+
+static int __init n64cart_init(void)
+{
+	return platform_driver_probe(&n64cart_driver, n64cart_probe);
+}
+
+module_init(n64cart_init);
+
+MODULE_AUTHOR("Lauri Kasanen <cand@gmx.com>");
+MODULE_DESCRIPTION("Driver for the N64 cart");
+MODULE_LICENSE("GPL");
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
new file mode 100644
index 000000000..9a53165de
--- /dev/null
+++ b/drivers/block/nbd.c
@@ -0,0 +1,2601 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Network block device - make block devices work over TCP
+ *
+ * Note that you can not swap over this thing, yet. Seems to work but
+ * deadlocks sometimes - you can not swap over TCP in general.
+ * 
+ * Copyright 1997-2000, 2008 Pavel Machek <pavel@ucw.cz>
+ * Parts copyright 2001 Steven Whitehouse <steve@chygwyn.com>
+ *
+ * (part of code stolen from loop.c)
+ */
+
+#define pr_fmt(fmt) "nbd: " fmt
+
+#include <linux/major.h>
+
+#include <linux/blkdev.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/sched.h>
+#include <linux/sched/mm.h>
+#include <linux/fs.h>
+#include <linux/bio.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/file.h>
+#include <linux/ioctl.h>
+#include <linux/mutex.h>
+#include <linux/compiler.h>
+#include <linux/completion.h>
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <net/sock.h>
+#include <linux/net.h>
+#include <linux/kthread.h>
+#include <linux/types.h>
+#include <linux/debugfs.h>
+#include <linux/blk-mq.h>
+
+#include <linux/uaccess.h>
+#include <asm/types.h>
+
+#include <linux/nbd.h>
+#include <linux/nbd-netlink.h>
+#include <net/genetlink.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/nbd.h>
+
+static DEFINE_IDR(nbd_index_idr);
+static DEFINE_MUTEX(nbd_index_mutex);
+static struct workqueue_struct *nbd_del_wq;
+static int nbd_total_devices = 0;
+
+struct nbd_sock {
+	struct socket *sock;
+	struct mutex tx_lock;
+	struct request *pending;
+	int sent;
+	bool dead;
+	int fallback_index;
+	int cookie;
+};
+
+struct recv_thread_args {
+	struct work_struct work;
+	struct nbd_device *nbd;
+	struct nbd_sock *nsock;
+	int index;
+};
+
+struct link_dead_args {
+	struct work_struct work;
+	int index;
+};
+
+#define NBD_RT_TIMEDOUT			0
+#define NBD_RT_DISCONNECT_REQUESTED	1
+#define NBD_RT_DISCONNECTED		2
+#define NBD_RT_HAS_PID_FILE		3
+#define NBD_RT_HAS_CONFIG_REF		4
+#define NBD_RT_BOUND			5
+#define NBD_RT_DISCONNECT_ON_CLOSE	6
+#define NBD_RT_HAS_BACKEND_FILE		7
+
+#define NBD_DESTROY_ON_DISCONNECT	0
+#define NBD_DISCONNECT_REQUESTED	1
+
+struct nbd_config {
+	u32 flags;
+	unsigned long runtime_flags;
+	u64 dead_conn_timeout;
+
+	struct nbd_sock **socks;
+	int num_connections;
+	atomic_t live_connections;
+	wait_queue_head_t conn_wait;
+
+	atomic_t recv_threads;
+	wait_queue_head_t recv_wq;
+	unsigned int blksize_bits;
+	loff_t bytesize;
+#if IS_ENABLED(CONFIG_DEBUG_FS)
+	struct dentry *dbg_dir;
+#endif
+};
+
+static inline unsigned int nbd_blksize(struct nbd_config *config)
+{
+	return 1u << config->blksize_bits;
+}
+
+struct nbd_device {
+	struct blk_mq_tag_set tag_set;
+
+	int index;
+	refcount_t config_refs;
+	refcount_t refs;
+	struct nbd_config *config;
+	struct mutex config_lock;
+	struct gendisk *disk;
+	struct workqueue_struct *recv_workq;
+	struct work_struct remove_work;
+
+	struct list_head list;
+	struct task_struct *task_setup;
+
+	unsigned long flags;
+	pid_t pid; /* pid of nbd-client, if attached */
+
+	char *backend;
+};
+
+#define NBD_CMD_REQUEUED	1
+/*
+ * This flag will be set if nbd_queue_rq() succeed, and will be checked and
+ * cleared in completion. Both setting and clearing of the flag are protected
+ * by cmd->lock.
+ */
+#define NBD_CMD_INFLIGHT	2
+
+struct nbd_cmd {
+	struct nbd_device *nbd;
+	struct mutex lock;
+	int index;
+	int cookie;
+	int retries;
+	blk_status_t status;
+	unsigned long flags;
+	u32 cmd_cookie;
+};
+
+#if IS_ENABLED(CONFIG_DEBUG_FS)
+static struct dentry *nbd_dbg_dir;
+#endif
+
+#define nbd_name(nbd) ((nbd)->disk->disk_name)
+
+#define NBD_DEF_BLKSIZE_BITS 10
+
+static unsigned int nbds_max = 16;
+static int max_part = 16;
+static int part_shift;
+
+static int nbd_dev_dbg_init(struct nbd_device *nbd);
+static void nbd_dev_dbg_close(struct nbd_device *nbd);
+static void nbd_config_put(struct nbd_device *nbd);
+static void nbd_connect_reply(struct genl_info *info, int index);
+static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info);
+static void nbd_dead_link_work(struct work_struct *work);
+static void nbd_disconnect_and_put(struct nbd_device *nbd);
+
+static inline struct device *nbd_to_dev(struct nbd_device *nbd)
+{
+	return disk_to_dev(nbd->disk);
+}
+
+static void nbd_requeue_cmd(struct nbd_cmd *cmd)
+{
+	struct request *req = blk_mq_rq_from_pdu(cmd);
+
+	if (!test_and_set_bit(NBD_CMD_REQUEUED, &cmd->flags))
+		blk_mq_requeue_request(req, true);
+}
+
+#define NBD_COOKIE_BITS 32
+
+static u64 nbd_cmd_handle(struct nbd_cmd *cmd)
+{
+	struct request *req = blk_mq_rq_from_pdu(cmd);
+	u32 tag = blk_mq_unique_tag(req);
+	u64 cookie = cmd->cmd_cookie;
+
+	return (cookie << NBD_COOKIE_BITS) | tag;
+}
+
+static u32 nbd_handle_to_tag(u64 handle)
+{
+	return (u32)handle;
+}
+
+static u32 nbd_handle_to_cookie(u64 handle)
+{
+	return (u32)(handle >> NBD_COOKIE_BITS);
+}
+
+static const char *nbdcmd_to_ascii(int cmd)
+{
+	switch (cmd) {
+	case  NBD_CMD_READ: return "read";
+	case NBD_CMD_WRITE: return "write";
+	case  NBD_CMD_DISC: return "disconnect";
+	case NBD_CMD_FLUSH: return "flush";
+	case  NBD_CMD_TRIM: return "trim/discard";
+	}
+	return "invalid";
+}
+
+static ssize_t pid_show(struct device *dev,
+			struct device_attribute *attr, char *buf)
+{
+	struct gendisk *disk = dev_to_disk(dev);
+	struct nbd_device *nbd = (struct nbd_device *)disk->private_data;
+
+	return sprintf(buf, "%d\n", nbd->pid);
+}
+
+static const struct device_attribute pid_attr = {
+	.attr = { .name = "pid", .mode = 0444},
+	.show = pid_show,
+};
+
+static ssize_t backend_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct gendisk *disk = dev_to_disk(dev);
+	struct nbd_device *nbd = (struct nbd_device *)disk->private_data;
+
+	return sprintf(buf, "%s\n", nbd->backend ?: "");
+}
+
+static const struct device_attribute backend_attr = {
+	.attr = { .name = "backend", .mode = 0444},
+	.show = backend_show,
+};
+
+static void nbd_dev_remove(struct nbd_device *nbd)
+{
+	struct gendisk *disk = nbd->disk;
+
+	del_gendisk(disk);
+	blk_mq_free_tag_set(&nbd->tag_set);
+
+	/*
+	 * Remove from idr after del_gendisk() completes, so if the same ID is
+	 * reused, the following add_disk() will succeed.
+	 */
+	mutex_lock(&nbd_index_mutex);
+	idr_remove(&nbd_index_idr, nbd->index);
+	mutex_unlock(&nbd_index_mutex);
+	destroy_workqueue(nbd->recv_workq);
+	put_disk(disk);
+}
+
+static void nbd_dev_remove_work(struct work_struct *work)
+{
+	nbd_dev_remove(container_of(work, struct nbd_device, remove_work));
+}
+
+static void nbd_put(struct nbd_device *nbd)
+{
+	if (!refcount_dec_and_test(&nbd->refs))
+		return;
+
+	/* Call del_gendisk() asynchrounously to prevent deadlock */
+	if (test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags))
+		queue_work(nbd_del_wq, &nbd->remove_work);
+	else
+		nbd_dev_remove(nbd);
+}
+
+static int nbd_disconnected(struct nbd_config *config)
+{
+	return test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags) ||
+		test_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags);
+}
+
+static void nbd_mark_nsock_dead(struct nbd_device *nbd, struct nbd_sock *nsock,
+				int notify)
+{
+	if (!nsock->dead && notify && !nbd_disconnected(nbd->config)) {
+		struct link_dead_args *args;
+		args = kmalloc(sizeof(struct link_dead_args), GFP_NOIO);
+		if (args) {
+			INIT_WORK(&args->work, nbd_dead_link_work);
+			args->index = nbd->index;
+			queue_work(system_wq, &args->work);
+		}
+	}
+	if (!nsock->dead) {
+		kernel_sock_shutdown(nsock->sock, SHUT_RDWR);
+		if (atomic_dec_return(&nbd->config->live_connections) == 0) {
+			if (test_and_clear_bit(NBD_RT_DISCONNECT_REQUESTED,
+					       &nbd->config->runtime_flags)) {
+				set_bit(NBD_RT_DISCONNECTED,
+					&nbd->config->runtime_flags);
+				dev_info(nbd_to_dev(nbd),
+					"Disconnected due to user request.\n");
+			}
+		}
+	}
+	nsock->dead = true;
+	nsock->pending = NULL;
+	nsock->sent = 0;
+}
+
+static int nbd_set_size(struct nbd_device *nbd, loff_t bytesize,
+		loff_t blksize)
+{
+	if (!blksize)
+		blksize = 1u << NBD_DEF_BLKSIZE_BITS;
+
+	if (blk_validate_block_size(blksize))
+		return -EINVAL;
+
+	if (bytesize < 0)
+		return -EINVAL;
+
+	nbd->config->bytesize = bytesize;
+	nbd->config->blksize_bits = __ffs(blksize);
+
+	if (!nbd->pid)
+		return 0;
+
+	if (nbd->config->flags & NBD_FLAG_SEND_TRIM) {
+		nbd->disk->queue->limits.discard_granularity = blksize;
+		blk_queue_max_discard_sectors(nbd->disk->queue, UINT_MAX);
+	}
+	blk_queue_logical_block_size(nbd->disk->queue, blksize);
+	blk_queue_physical_block_size(nbd->disk->queue, blksize);
+
+	if (max_part)
+		set_bit(GD_NEED_PART_SCAN, &nbd->disk->state);
+	if (!set_capacity_and_notify(nbd->disk, bytesize >> 9))
+		kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
+	return 0;
+}
+
+static void nbd_complete_rq(struct request *req)
+{
+	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
+
+	dev_dbg(nbd_to_dev(cmd->nbd), "request %p: %s\n", req,
+		cmd->status ? "failed" : "done");
+
+	blk_mq_end_request(req, cmd->status);
+}
+
+/*
+ * Forcibly shutdown the socket causing all listeners to error
+ */
+static void sock_shutdown(struct nbd_device *nbd)
+{
+	struct nbd_config *config = nbd->config;
+	int i;
+
+	if (config->num_connections == 0)
+		return;
+	if (test_and_set_bit(NBD_RT_DISCONNECTED, &config->runtime_flags))
+		return;
+
+	for (i = 0; i < config->num_connections; i++) {
+		struct nbd_sock *nsock = config->socks[i];
+		mutex_lock(&nsock->tx_lock);
+		nbd_mark_nsock_dead(nbd, nsock, 0);
+		mutex_unlock(&nsock->tx_lock);
+	}
+	dev_warn(disk_to_dev(nbd->disk), "shutting down sockets\n");
+}
+
+static u32 req_to_nbd_cmd_type(struct request *req)
+{
+	switch (req_op(req)) {
+	case REQ_OP_DISCARD:
+		return NBD_CMD_TRIM;
+	case REQ_OP_FLUSH:
+		return NBD_CMD_FLUSH;
+	case REQ_OP_WRITE:
+		return NBD_CMD_WRITE;
+	case REQ_OP_READ:
+		return NBD_CMD_READ;
+	default:
+		return U32_MAX;
+	}
+}
+
+static enum blk_eh_timer_return nbd_xmit_timeout(struct request *req)
+{
+	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
+	struct nbd_device *nbd = cmd->nbd;
+	struct nbd_config *config;
+
+	if (!mutex_trylock(&cmd->lock))
+		return BLK_EH_RESET_TIMER;
+
+	if (!test_bit(NBD_CMD_INFLIGHT, &cmd->flags)) {
+		mutex_unlock(&cmd->lock);
+		return BLK_EH_DONE;
+	}
+
+	if (!refcount_inc_not_zero(&nbd->config_refs)) {
+		cmd->status = BLK_STS_TIMEOUT;
+		__clear_bit(NBD_CMD_INFLIGHT, &cmd->flags);
+		mutex_unlock(&cmd->lock);
+		goto done;
+	}
+	config = nbd->config;
+
+	if (config->num_connections > 1 ||
+	    (config->num_connections == 1 && nbd->tag_set.timeout)) {
+		dev_err_ratelimited(nbd_to_dev(nbd),
+				    "Connection timed out, retrying (%d/%d alive)\n",
+				    atomic_read(&config->live_connections),
+				    config->num_connections);
+		/*
+		 * Hooray we have more connections, requeue this IO, the submit
+		 * path will put it on a real connection. Or if only one
+		 * connection is configured, the submit path will wait util
+		 * a new connection is reconfigured or util dead timeout.
+		 */
+		if (config->socks) {
+			if (cmd->index < config->num_connections) {
+				struct nbd_sock *nsock =
+					config->socks[cmd->index];
+				mutex_lock(&nsock->tx_lock);
+				/* We can have multiple outstanding requests, so
+				 * we don't want to mark the nsock dead if we've
+				 * already reconnected with a new socket, so
+				 * only mark it dead if its the same socket we
+				 * were sent out on.
+				 */
+				if (cmd->cookie == nsock->cookie)
+					nbd_mark_nsock_dead(nbd, nsock, 1);
+				mutex_unlock(&nsock->tx_lock);
+			}
+			mutex_unlock(&cmd->lock);
+			nbd_requeue_cmd(cmd);
+			nbd_config_put(nbd);
+			return BLK_EH_DONE;
+		}
+	}
+
+	if (!nbd->tag_set.timeout) {
+		/*
+		 * Userspace sets timeout=0 to disable socket disconnection,
+		 * so just warn and reset the timer.
+		 */
+		struct nbd_sock *nsock = config->socks[cmd->index];
+		cmd->retries++;
+		dev_info(nbd_to_dev(nbd), "Possible stuck request %p: control (%s@%llu,%uB). Runtime %u seconds\n",
+			req, nbdcmd_to_ascii(req_to_nbd_cmd_type(req)),
+			(unsigned long long)blk_rq_pos(req) << 9,
+			blk_rq_bytes(req), (req->timeout / HZ) * cmd->retries);
+
+		mutex_lock(&nsock->tx_lock);
+		if (cmd->cookie != nsock->cookie) {
+			nbd_requeue_cmd(cmd);
+			mutex_unlock(&nsock->tx_lock);
+			mutex_unlock(&cmd->lock);
+			nbd_config_put(nbd);
+			return BLK_EH_DONE;
+		}
+		mutex_unlock(&nsock->tx_lock);
+		mutex_unlock(&cmd->lock);
+		nbd_config_put(nbd);
+		return BLK_EH_RESET_TIMER;
+	}
+
+	dev_err_ratelimited(nbd_to_dev(nbd), "Connection timed out\n");
+	set_bit(NBD_RT_TIMEDOUT, &config->runtime_flags);
+	cmd->status = BLK_STS_IOERR;
+	__clear_bit(NBD_CMD_INFLIGHT, &cmd->flags);
+	mutex_unlock(&cmd->lock);
+	sock_shutdown(nbd);
+	nbd_config_put(nbd);
+done:
+	blk_mq_complete_request(req);
+	return BLK_EH_DONE;
+}
+
+static int __sock_xmit(struct nbd_device *nbd, struct socket *sock, int send,
+		       struct iov_iter *iter, int msg_flags, int *sent)
+{
+	int result;
+	struct msghdr msg = { };
+	unsigned int noreclaim_flag;
+
+	if (unlikely(!sock)) {
+		dev_err_ratelimited(disk_to_dev(nbd->disk),
+			"Attempted %s on closed socket in sock_xmit\n",
+			(send ? "send" : "recv"));
+		return -EINVAL;
+	}
+
+	msg.msg_iter = *iter;
+
+	noreclaim_flag = memalloc_noreclaim_save();
+	do {
+		sock->sk->sk_allocation = GFP_NOIO | __GFP_MEMALLOC;
+		msg.msg_flags = msg_flags | MSG_NOSIGNAL;
+
+		if (send)
+			result = sock_sendmsg(sock, &msg);
+		else
+			result = sock_recvmsg(sock, &msg, msg.msg_flags);
+
+		if (result <= 0) {
+			if (result == 0)
+				result = -EPIPE; /* short read */
+			break;
+		}
+		if (sent)
+			*sent += result;
+	} while (msg_data_left(&msg));
+
+	memalloc_noreclaim_restore(noreclaim_flag);
+
+	return result;
+}
+
+/*
+ *  Send or receive packet. Return a positive value on success and
+ *  negtive value on failure, and never return 0.
+ */
+static int sock_xmit(struct nbd_device *nbd, int index, int send,
+		     struct iov_iter *iter, int msg_flags, int *sent)
+{
+	struct nbd_config *config = nbd->config;
+	struct socket *sock = config->socks[index]->sock;
+
+	return __sock_xmit(nbd, sock, send, iter, msg_flags, sent);
+}
+
+/*
+ * Different settings for sk->sk_sndtimeo can result in different return values
+ * if there is a signal pending when we enter sendmsg, because reasons?
+ */
+static inline int was_interrupted(int result)
+{
+	return result == -ERESTARTSYS || result == -EINTR;
+}
+
+/* always call with the tx_lock held */
+static int nbd_send_cmd(struct nbd_device *nbd, struct nbd_cmd *cmd, int index)
+{
+	struct request *req = blk_mq_rq_from_pdu(cmd);
+	struct nbd_config *config = nbd->config;
+	struct nbd_sock *nsock = config->socks[index];
+	int result;
+	struct nbd_request request = {.magic = htonl(NBD_REQUEST_MAGIC)};
+	struct kvec iov = {.iov_base = &request, .iov_len = sizeof(request)};
+	struct iov_iter from;
+	unsigned long size = blk_rq_bytes(req);
+	struct bio *bio;
+	u64 handle;
+	u32 type;
+	u32 nbd_cmd_flags = 0;
+	int sent = nsock->sent, skip = 0;
+
+	iov_iter_kvec(&from, ITER_SOURCE, &iov, 1, sizeof(request));
+
+	type = req_to_nbd_cmd_type(req);
+	if (type == U32_MAX)
+		return -EIO;
+
+	if (rq_data_dir(req) == WRITE &&
+	    (config->flags & NBD_FLAG_READ_ONLY)) {
+		dev_err_ratelimited(disk_to_dev(nbd->disk),
+				    "Write on read-only\n");
+		return -EIO;
+	}
+
+	if (req->cmd_flags & REQ_FUA)
+		nbd_cmd_flags |= NBD_CMD_FLAG_FUA;
+
+	/* We did a partial send previously, and we at least sent the whole
+	 * request struct, so just go and send the rest of the pages in the
+	 * request.
+	 */
+	if (sent) {
+		if (sent >= sizeof(request)) {
+			skip = sent - sizeof(request);
+
+			/* initialize handle for tracing purposes */
+			handle = nbd_cmd_handle(cmd);
+
+			goto send_pages;
+		}
+		iov_iter_advance(&from, sent);
+	} else {
+		cmd->cmd_cookie++;
+	}
+	cmd->index = index;
+	cmd->cookie = nsock->cookie;
+	cmd->retries = 0;
+	request.type = htonl(type | nbd_cmd_flags);
+	if (type != NBD_CMD_FLUSH) {
+		request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9);
+		request.len = htonl(size);
+	}
+	handle = nbd_cmd_handle(cmd);
+	memcpy(request.handle, &handle, sizeof(handle));
+
+	trace_nbd_send_request(&request, nbd->index, blk_mq_rq_from_pdu(cmd));
+
+	dev_dbg(nbd_to_dev(nbd), "request %p: sending control (%s@%llu,%uB)\n",
+		req, nbdcmd_to_ascii(type),
+		(unsigned long long)blk_rq_pos(req) << 9, blk_rq_bytes(req));
+	result = sock_xmit(nbd, index, 1, &from,
+			(type == NBD_CMD_WRITE) ? MSG_MORE : 0, &sent);
+	trace_nbd_header_sent(req, handle);
+	if (result < 0) {
+		if (was_interrupted(result)) {
+			/* If we havne't sent anything we can just return BUSY,
+			 * however if we have sent something we need to make
+			 * sure we only allow this req to be sent until we are
+			 * completely done.
+			 */
+			if (sent) {
+				nsock->pending = req;
+				nsock->sent = sent;
+			}
+			set_bit(NBD_CMD_REQUEUED, &cmd->flags);
+			return BLK_STS_RESOURCE;
+		}
+		dev_err_ratelimited(disk_to_dev(nbd->disk),
+			"Send control failed (result %d)\n", result);
+		return -EAGAIN;
+	}
+send_pages:
+	if (type != NBD_CMD_WRITE)
+		goto out;
+
+	bio = req->bio;
+	while (bio) {
+		struct bio *next = bio->bi_next;
+		struct bvec_iter iter;
+		struct bio_vec bvec;
+
+		bio_for_each_segment(bvec, bio, iter) {
+			bool is_last = !next && bio_iter_last(bvec, iter);
+			int flags = is_last ? 0 : MSG_MORE;
+
+			dev_dbg(nbd_to_dev(nbd), "request %p: sending %d bytes data\n",
+				req, bvec.bv_len);
+			iov_iter_bvec(&from, ITER_SOURCE, &bvec, 1, bvec.bv_len);
+			if (skip) {
+				if (skip >= iov_iter_count(&from)) {
+					skip -= iov_iter_count(&from);
+					continue;
+				}
+				iov_iter_advance(&from, skip);
+				skip = 0;
+			}
+			result = sock_xmit(nbd, index, 1, &from, flags, &sent);
+			if (result < 0) {
+				if (was_interrupted(result)) {
+					/* We've already sent the header, we
+					 * have no choice but to set pending and
+					 * return BUSY.
+					 */
+					nsock->pending = req;
+					nsock->sent = sent;
+					set_bit(NBD_CMD_REQUEUED, &cmd->flags);
+					return BLK_STS_RESOURCE;
+				}
+				dev_err(disk_to_dev(nbd->disk),
+					"Send data failed (result %d)\n",
+					result);
+				return -EAGAIN;
+			}
+			/*
+			 * The completion might already have come in,
+			 * so break for the last one instead of letting
+			 * the iterator do it. This prevents use-after-free
+			 * of the bio.
+			 */
+			if (is_last)
+				break;
+		}
+		bio = next;
+	}
+out:
+	trace_nbd_payload_sent(req, handle);
+	nsock->pending = NULL;
+	nsock->sent = 0;
+	return 0;
+}
+
+static int nbd_read_reply(struct nbd_device *nbd, struct socket *sock,
+			  struct nbd_reply *reply)
+{
+	struct kvec iov = {.iov_base = reply, .iov_len = sizeof(*reply)};
+	struct iov_iter to;
+	int result;
+
+	reply->magic = 0;
+	iov_iter_kvec(&to, ITER_DEST, &iov, 1, sizeof(*reply));
+	result = __sock_xmit(nbd, sock, 0, &to, MSG_WAITALL, NULL);
+	if (result < 0) {
+		if (!nbd_disconnected(nbd->config))
+			dev_err(disk_to_dev(nbd->disk),
+				"Receive control failed (result %d)\n", result);
+		return result;
+	}
+
+	if (ntohl(reply->magic) != NBD_REPLY_MAGIC) {
+		dev_err(disk_to_dev(nbd->disk), "Wrong magic (0x%lx)\n",
+				(unsigned long)ntohl(reply->magic));
+		return -EPROTO;
+	}
+
+	return 0;
+}
+
+/* NULL returned = something went wrong, inform userspace */
+static struct nbd_cmd *nbd_handle_reply(struct nbd_device *nbd, int index,
+					struct nbd_reply *reply)
+{
+	int result;
+	struct nbd_cmd *cmd;
+	struct request *req = NULL;
+	u64 handle;
+	u16 hwq;
+	u32 tag;
+	int ret = 0;
+
+	memcpy(&handle, reply->handle, sizeof(handle));
+	tag = nbd_handle_to_tag(handle);
+	hwq = blk_mq_unique_tag_to_hwq(tag);
+	if (hwq < nbd->tag_set.nr_hw_queues)
+		req = blk_mq_tag_to_rq(nbd->tag_set.tags[hwq],
+				       blk_mq_unique_tag_to_tag(tag));
+	if (!req || !blk_mq_request_started(req)) {
+		dev_err(disk_to_dev(nbd->disk), "Unexpected reply (%d) %p\n",
+			tag, req);
+		return ERR_PTR(-ENOENT);
+	}
+	trace_nbd_header_received(req, handle);
+	cmd = blk_mq_rq_to_pdu(req);
+
+	mutex_lock(&cmd->lock);
+	if (!test_bit(NBD_CMD_INFLIGHT, &cmd->flags)) {
+		dev_err(disk_to_dev(nbd->disk), "Suspicious reply %d (status %u flags %lu)",
+			tag, cmd->status, cmd->flags);
+		ret = -ENOENT;
+		goto out;
+	}
+	if (cmd->index != index) {
+		dev_err(disk_to_dev(nbd->disk), "Unexpected reply %d from different sock %d (expected %d)",
+			tag, index, cmd->index);
+		ret = -ENOENT;
+		goto out;
+	}
+	if (cmd->cmd_cookie != nbd_handle_to_cookie(handle)) {
+		dev_err(disk_to_dev(nbd->disk), "Double reply on req %p, cmd_cookie %u, handle cookie %u\n",
+			req, cmd->cmd_cookie, nbd_handle_to_cookie(handle));
+		ret = -ENOENT;
+		goto out;
+	}
+	if (cmd->status != BLK_STS_OK) {
+		dev_err(disk_to_dev(nbd->disk), "Command already handled %p\n",
+			req);
+		ret = -ENOENT;
+		goto out;
+	}
+	if (test_bit(NBD_CMD_REQUEUED, &cmd->flags)) {
+		dev_err(disk_to_dev(nbd->disk), "Raced with timeout on req %p\n",
+			req);
+		ret = -ENOENT;
+		goto out;
+	}
+	if (ntohl(reply->error)) {
+		dev_err(disk_to_dev(nbd->disk), "Other side returned error (%d)\n",
+			ntohl(reply->error));
+		cmd->status = BLK_STS_IOERR;
+		goto out;
+	}
+
+	dev_dbg(nbd_to_dev(nbd), "request %p: got reply\n", req);
+	if (rq_data_dir(req) != WRITE) {
+		struct req_iterator iter;
+		struct bio_vec bvec;
+		struct iov_iter to;
+
+		rq_for_each_segment(bvec, req, iter) {
+			iov_iter_bvec(&to, ITER_DEST, &bvec, 1, bvec.bv_len);
+			result = sock_xmit(nbd, index, 0, &to, MSG_WAITALL, NULL);
+			if (result < 0) {
+				dev_err(disk_to_dev(nbd->disk), "Receive data failed (result %d)\n",
+					result);
+				/*
+				 * If we've disconnected, we need to make sure we
+				 * complete this request, otherwise error out
+				 * and let the timeout stuff handle resubmitting
+				 * this request onto another connection.
+				 */
+				if (nbd_disconnected(nbd->config)) {
+					cmd->status = BLK_STS_IOERR;
+					goto out;
+				}
+				ret = -EIO;
+				goto out;
+			}
+			dev_dbg(nbd_to_dev(nbd), "request %p: got %d bytes data\n",
+				req, bvec.bv_len);
+		}
+	}
+out:
+	trace_nbd_payload_received(req, handle);
+	mutex_unlock(&cmd->lock);
+	return ret ? ERR_PTR(ret) : cmd;
+}
+
+static void recv_work(struct work_struct *work)
+{
+	struct recv_thread_args *args = container_of(work,
+						     struct recv_thread_args,
+						     work);
+	struct nbd_device *nbd = args->nbd;
+	struct nbd_config *config = nbd->config;
+	struct request_queue *q = nbd->disk->queue;
+	struct nbd_sock *nsock = args->nsock;
+	struct nbd_cmd *cmd;
+	struct request *rq;
+
+	while (1) {
+		struct nbd_reply reply;
+
+		if (nbd_read_reply(nbd, nsock->sock, &reply))
+			break;
+
+		/*
+		 * Grab .q_usage_counter so request pool won't go away, then no
+		 * request use-after-free is possible during nbd_handle_reply().
+		 * If queue is frozen, there won't be any inflight requests, we
+		 * needn't to handle the incoming garbage message.
+		 */
+		if (!percpu_ref_tryget(&q->q_usage_counter)) {
+			dev_err(disk_to_dev(nbd->disk), "%s: no io inflight\n",
+				__func__);
+			break;
+		}
+
+		cmd = nbd_handle_reply(nbd, args->index, &reply);
+		if (IS_ERR(cmd)) {
+			percpu_ref_put(&q->q_usage_counter);
+			break;
+		}
+
+		rq = blk_mq_rq_from_pdu(cmd);
+		if (likely(!blk_should_fake_timeout(rq->q))) {
+			bool complete;
+
+			mutex_lock(&cmd->lock);
+			complete = __test_and_clear_bit(NBD_CMD_INFLIGHT,
+							&cmd->flags);
+			mutex_unlock(&cmd->lock);
+			if (complete)
+				blk_mq_complete_request(rq);
+		}
+		percpu_ref_put(&q->q_usage_counter);
+	}
+
+	mutex_lock(&nsock->tx_lock);
+	nbd_mark_nsock_dead(nbd, nsock, 1);
+	mutex_unlock(&nsock->tx_lock);
+
+	nbd_config_put(nbd);
+	atomic_dec(&config->recv_threads);
+	wake_up(&config->recv_wq);
+	kfree(args);
+}
+
+static bool nbd_clear_req(struct request *req, void *data)
+{
+	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(req);
+
+	/* don't abort one completed request */
+	if (blk_mq_request_completed(req))
+		return true;
+
+	mutex_lock(&cmd->lock);
+	if (!__test_and_clear_bit(NBD_CMD_INFLIGHT, &cmd->flags)) {
+		mutex_unlock(&cmd->lock);
+		return true;
+	}
+	cmd->status = BLK_STS_IOERR;
+	mutex_unlock(&cmd->lock);
+
+	blk_mq_complete_request(req);
+	return true;
+}
+
+static void nbd_clear_que(struct nbd_device *nbd)
+{
+	blk_mq_quiesce_queue(nbd->disk->queue);
+	blk_mq_tagset_busy_iter(&nbd->tag_set, nbd_clear_req, NULL);
+	blk_mq_unquiesce_queue(nbd->disk->queue);
+	dev_dbg(disk_to_dev(nbd->disk), "queue cleared\n");
+}
+
+static int find_fallback(struct nbd_device *nbd, int index)
+{
+	struct nbd_config *config = nbd->config;
+	int new_index = -1;
+	struct nbd_sock *nsock = config->socks[index];
+	int fallback = nsock->fallback_index;
+
+	if (test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags))
+		return new_index;
+
+	if (config->num_connections <= 1) {
+		dev_err_ratelimited(disk_to_dev(nbd->disk),
+				    "Dead connection, failed to find a fallback\n");
+		return new_index;
+	}
+
+	if (fallback >= 0 && fallback < config->num_connections &&
+	    !config->socks[fallback]->dead)
+		return fallback;
+
+	if (nsock->fallback_index < 0 ||
+	    nsock->fallback_index >= config->num_connections ||
+	    config->socks[nsock->fallback_index]->dead) {
+		int i;
+		for (i = 0; i < config->num_connections; i++) {
+			if (i == index)
+				continue;
+			if (!config->socks[i]->dead) {
+				new_index = i;
+				break;
+			}
+		}
+		nsock->fallback_index = new_index;
+		if (new_index < 0) {
+			dev_err_ratelimited(disk_to_dev(nbd->disk),
+					    "Dead connection, failed to find a fallback\n");
+			return new_index;
+		}
+	}
+	new_index = nsock->fallback_index;
+	return new_index;
+}
+
+static int wait_for_reconnect(struct nbd_device *nbd)
+{
+	struct nbd_config *config = nbd->config;
+	if (!config->dead_conn_timeout)
+		return 0;
+
+	if (!wait_event_timeout(config->conn_wait,
+				test_bit(NBD_RT_DISCONNECTED,
+					 &config->runtime_flags) ||
+				atomic_read(&config->live_connections) > 0,
+				config->dead_conn_timeout))
+		return 0;
+
+	return !test_bit(NBD_RT_DISCONNECTED, &config->runtime_flags);
+}
+
+static int nbd_handle_cmd(struct nbd_cmd *cmd, int index)
+{
+	struct request *req = blk_mq_rq_from_pdu(cmd);
+	struct nbd_device *nbd = cmd->nbd;
+	struct nbd_config *config;
+	struct nbd_sock *nsock;
+	int ret;
+
+	if (!refcount_inc_not_zero(&nbd->config_refs)) {
+		dev_err_ratelimited(disk_to_dev(nbd->disk),
+				    "Socks array is empty\n");
+		return -EINVAL;
+	}
+	config = nbd->config;
+
+	if (index >= config->num_connections) {
+		dev_err_ratelimited(disk_to_dev(nbd->disk),
+				    "Attempted send on invalid socket\n");
+		nbd_config_put(nbd);
+		return -EINVAL;
+	}
+	cmd->status = BLK_STS_OK;
+again:
+	nsock = config->socks[index];
+	mutex_lock(&nsock->tx_lock);
+	if (nsock->dead) {
+		int old_index = index;
+		index = find_fallback(nbd, index);
+		mutex_unlock(&nsock->tx_lock);
+		if (index < 0) {
+			if (wait_for_reconnect(nbd)) {
+				index = old_index;
+				goto again;
+			}
+			/* All the sockets should already be down at this point,
+			 * we just want to make sure that DISCONNECTED is set so
+			 * any requests that come in that were queue'ed waiting
+			 * for the reconnect timer don't trigger the timer again
+			 * and instead just error out.
+			 */
+			sock_shutdown(nbd);
+			nbd_config_put(nbd);
+			return -EIO;
+		}
+		goto again;
+	}
+
+	/* Handle the case that we have a pending request that was partially
+	 * transmitted that _has_ to be serviced first.  We need to call requeue
+	 * here so that it gets put _after_ the request that is already on the
+	 * dispatch list.
+	 */
+	blk_mq_start_request(req);
+	if (unlikely(nsock->pending && nsock->pending != req)) {
+		nbd_requeue_cmd(cmd);
+		ret = 0;
+		goto out;
+	}
+	/*
+	 * Some failures are related to the link going down, so anything that
+	 * returns EAGAIN can be retried on a different socket.
+	 */
+	ret = nbd_send_cmd(nbd, cmd, index);
+	/*
+	 * Access to this flag is protected by cmd->lock, thus it's safe to set
+	 * the flag after nbd_send_cmd() succeed to send request to server.
+	 */
+	if (!ret)
+		__set_bit(NBD_CMD_INFLIGHT, &cmd->flags);
+	else if (ret == -EAGAIN) {
+		dev_err_ratelimited(disk_to_dev(nbd->disk),
+				    "Request send failed, requeueing\n");
+		nbd_mark_nsock_dead(nbd, nsock, 1);
+		nbd_requeue_cmd(cmd);
+		ret = 0;
+	}
+out:
+	mutex_unlock(&nsock->tx_lock);
+	nbd_config_put(nbd);
+	return ret;
+}
+
+static blk_status_t nbd_queue_rq(struct blk_mq_hw_ctx *hctx,
+			const struct blk_mq_queue_data *bd)
+{
+	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
+	int ret;
+
+	/*
+	 * Since we look at the bio's to send the request over the network we
+	 * need to make sure the completion work doesn't mark this request done
+	 * before we are done doing our send.  This keeps us from dereferencing
+	 * freed data if we have particularly fast completions (ie we get the
+	 * completion before we exit sock_xmit on the last bvec) or in the case
+	 * that the server is misbehaving (or there was an error) before we're
+	 * done sending everything over the wire.
+	 */
+	mutex_lock(&cmd->lock);
+	clear_bit(NBD_CMD_REQUEUED, &cmd->flags);
+
+	/* We can be called directly from the user space process, which means we
+	 * could possibly have signals pending so our sendmsg will fail.  In
+	 * this case we need to return that we are busy, otherwise error out as
+	 * appropriate.
+	 */
+	ret = nbd_handle_cmd(cmd, hctx->queue_num);
+	if (ret < 0)
+		ret = BLK_STS_IOERR;
+	else if (!ret)
+		ret = BLK_STS_OK;
+	mutex_unlock(&cmd->lock);
+
+	return ret;
+}
+
+static struct socket *nbd_get_socket(struct nbd_device *nbd, unsigned long fd,
+				     int *err)
+{
+	struct socket *sock;
+
+	*err = 0;
+	sock = sockfd_lookup(fd, err);
+	if (!sock)
+		return NULL;
+
+	if (sock->ops->shutdown == sock_no_shutdown) {
+		dev_err(disk_to_dev(nbd->disk), "Unsupported socket: shutdown callout must be supported.\n");
+		*err = -EINVAL;
+		sockfd_put(sock);
+		return NULL;
+	}
+
+	return sock;
+}
+
+static int nbd_add_socket(struct nbd_device *nbd, unsigned long arg,
+			  bool netlink)
+{
+	struct nbd_config *config = nbd->config;
+	struct socket *sock;
+	struct nbd_sock **socks;
+	struct nbd_sock *nsock;
+	int err;
+
+	/* Arg will be cast to int, check it to avoid overflow */
+	if (arg > INT_MAX)
+		return -EINVAL;
+	sock = nbd_get_socket(nbd, arg, &err);
+	if (!sock)
+		return err;
+
+	/*
+	 * We need to make sure we don't get any errant requests while we're
+	 * reallocating the ->socks array.
+	 */
+	blk_mq_freeze_queue(nbd->disk->queue);
+
+	if (!netlink && !nbd->task_setup &&
+	    !test_bit(NBD_RT_BOUND, &config->runtime_flags))
+		nbd->task_setup = current;
+
+	if (!netlink &&
+	    (nbd->task_setup != current ||
+	     test_bit(NBD_RT_BOUND, &config->runtime_flags))) {
+		dev_err(disk_to_dev(nbd->disk),
+			"Device being setup by another task");
+		err = -EBUSY;
+		goto put_socket;
+	}
+
+	nsock = kzalloc(sizeof(*nsock), GFP_KERNEL);
+	if (!nsock) {
+		err = -ENOMEM;
+		goto put_socket;
+	}
+
+	socks = krealloc(config->socks, (config->num_connections + 1) *
+			 sizeof(struct nbd_sock *), GFP_KERNEL);
+	if (!socks) {
+		kfree(nsock);
+		err = -ENOMEM;
+		goto put_socket;
+	}
+
+	config->socks = socks;
+
+	nsock->fallback_index = -1;
+	nsock->dead = false;
+	mutex_init(&nsock->tx_lock);
+	nsock->sock = sock;
+	nsock->pending = NULL;
+	nsock->sent = 0;
+	nsock->cookie = 0;
+	socks[config->num_connections++] = nsock;
+	atomic_inc(&config->live_connections);
+	blk_mq_unfreeze_queue(nbd->disk->queue);
+
+	return 0;
+
+put_socket:
+	blk_mq_unfreeze_queue(nbd->disk->queue);
+	sockfd_put(sock);
+	return err;
+}
+
+static int nbd_reconnect_socket(struct nbd_device *nbd, unsigned long arg)
+{
+	struct nbd_config *config = nbd->config;
+	struct socket *sock, *old;
+	struct recv_thread_args *args;
+	int i;
+	int err;
+
+	sock = nbd_get_socket(nbd, arg, &err);
+	if (!sock)
+		return err;
+
+	args = kzalloc(sizeof(*args), GFP_KERNEL);
+	if (!args) {
+		sockfd_put(sock);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < config->num_connections; i++) {
+		struct nbd_sock *nsock = config->socks[i];
+
+		if (!nsock->dead)
+			continue;
+
+		mutex_lock(&nsock->tx_lock);
+		if (!nsock->dead) {
+			mutex_unlock(&nsock->tx_lock);
+			continue;
+		}
+		sk_set_memalloc(sock->sk);
+		if (nbd->tag_set.timeout)
+			sock->sk->sk_sndtimeo = nbd->tag_set.timeout;
+		atomic_inc(&config->recv_threads);
+		refcount_inc(&nbd->config_refs);
+		old = nsock->sock;
+		nsock->fallback_index = -1;
+		nsock->sock = sock;
+		nsock->dead = false;
+		INIT_WORK(&args->work, recv_work);
+		args->index = i;
+		args->nbd = nbd;
+		args->nsock = nsock;
+		nsock->cookie++;
+		mutex_unlock(&nsock->tx_lock);
+		sockfd_put(old);
+
+		clear_bit(NBD_RT_DISCONNECTED, &config->runtime_flags);
+
+		/* We take the tx_mutex in an error path in the recv_work, so we
+		 * need to queue_work outside of the tx_mutex.
+		 */
+		queue_work(nbd->recv_workq, &args->work);
+
+		atomic_inc(&config->live_connections);
+		wake_up(&config->conn_wait);
+		return 0;
+	}
+	sockfd_put(sock);
+	kfree(args);
+	return -ENOSPC;
+}
+
+static void nbd_bdev_reset(struct nbd_device *nbd)
+{
+	if (disk_openers(nbd->disk) > 1)
+		return;
+	set_capacity(nbd->disk, 0);
+}
+
+static void nbd_parse_flags(struct nbd_device *nbd)
+{
+	struct nbd_config *config = nbd->config;
+	if (config->flags & NBD_FLAG_READ_ONLY)
+		set_disk_ro(nbd->disk, true);
+	else
+		set_disk_ro(nbd->disk, false);
+	if (config->flags & NBD_FLAG_SEND_FLUSH) {
+		if (config->flags & NBD_FLAG_SEND_FUA)
+			blk_queue_write_cache(nbd->disk->queue, true, true);
+		else
+			blk_queue_write_cache(nbd->disk->queue, true, false);
+	}
+	else
+		blk_queue_write_cache(nbd->disk->queue, false, false);
+}
+
+static void send_disconnects(struct nbd_device *nbd)
+{
+	struct nbd_config *config = nbd->config;
+	struct nbd_request request = {
+		.magic = htonl(NBD_REQUEST_MAGIC),
+		.type = htonl(NBD_CMD_DISC),
+	};
+	struct kvec iov = {.iov_base = &request, .iov_len = sizeof(request)};
+	struct iov_iter from;
+	int i, ret;
+
+	for (i = 0; i < config->num_connections; i++) {
+		struct nbd_sock *nsock = config->socks[i];
+
+		iov_iter_kvec(&from, ITER_SOURCE, &iov, 1, sizeof(request));
+		mutex_lock(&nsock->tx_lock);
+		ret = sock_xmit(nbd, i, 1, &from, 0, NULL);
+		if (ret < 0)
+			dev_err(disk_to_dev(nbd->disk),
+				"Send disconnect failed %d\n", ret);
+		mutex_unlock(&nsock->tx_lock);
+	}
+}
+
+static int nbd_disconnect(struct nbd_device *nbd)
+{
+	struct nbd_config *config = nbd->config;
+
+	dev_info(disk_to_dev(nbd->disk), "NBD_DISCONNECT\n");
+	set_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags);
+	set_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags);
+	send_disconnects(nbd);
+	return 0;
+}
+
+static void nbd_clear_sock(struct nbd_device *nbd)
+{
+	sock_shutdown(nbd);
+	nbd_clear_que(nbd);
+	nbd->task_setup = NULL;
+}
+
+static void nbd_config_put(struct nbd_device *nbd)
+{
+	if (refcount_dec_and_mutex_lock(&nbd->config_refs,
+					&nbd->config_lock)) {
+		struct nbd_config *config = nbd->config;
+		nbd_dev_dbg_close(nbd);
+		invalidate_disk(nbd->disk);
+		if (nbd->config->bytesize)
+			kobject_uevent(&nbd_to_dev(nbd)->kobj, KOBJ_CHANGE);
+		if (test_and_clear_bit(NBD_RT_HAS_PID_FILE,
+				       &config->runtime_flags))
+			device_remove_file(disk_to_dev(nbd->disk), &pid_attr);
+		nbd->pid = 0;
+		if (test_and_clear_bit(NBD_RT_HAS_BACKEND_FILE,
+				       &config->runtime_flags)) {
+			device_remove_file(disk_to_dev(nbd->disk), &backend_attr);
+			kfree(nbd->backend);
+			nbd->backend = NULL;
+		}
+		nbd_clear_sock(nbd);
+		if (config->num_connections) {
+			int i;
+			for (i = 0; i < config->num_connections; i++) {
+				sockfd_put(config->socks[i]->sock);
+				kfree(config->socks[i]);
+			}
+			kfree(config->socks);
+		}
+		kfree(nbd->config);
+		nbd->config = NULL;
+
+		nbd->tag_set.timeout = 0;
+		nbd->disk->queue->limits.discard_granularity = 0;
+		blk_queue_max_discard_sectors(nbd->disk->queue, 0);
+
+		mutex_unlock(&nbd->config_lock);
+		nbd_put(nbd);
+		module_put(THIS_MODULE);
+	}
+}
+
+static int nbd_start_device(struct nbd_device *nbd)
+{
+	struct nbd_config *config = nbd->config;
+	int num_connections = config->num_connections;
+	int error = 0, i;
+
+	if (nbd->pid)
+		return -EBUSY;
+	if (!config->socks)
+		return -EINVAL;
+	if (num_connections > 1 &&
+	    !(config->flags & NBD_FLAG_CAN_MULTI_CONN)) {
+		dev_err(disk_to_dev(nbd->disk), "server does not support multiple connections per device.\n");
+		return -EINVAL;
+	}
+
+	blk_mq_update_nr_hw_queues(&nbd->tag_set, config->num_connections);
+	nbd->pid = task_pid_nr(current);
+
+	nbd_parse_flags(nbd);
+
+	error = device_create_file(disk_to_dev(nbd->disk), &pid_attr);
+	if (error) {
+		dev_err(disk_to_dev(nbd->disk), "device_create_file failed for pid!\n");
+		return error;
+	}
+	set_bit(NBD_RT_HAS_PID_FILE, &config->runtime_flags);
+
+	nbd_dev_dbg_init(nbd);
+	for (i = 0; i < num_connections; i++) {
+		struct recv_thread_args *args;
+
+		args = kzalloc(sizeof(*args), GFP_KERNEL);
+		if (!args) {
+			sock_shutdown(nbd);
+			/*
+			 * If num_connections is m (2 < m),
+			 * and NO.1 ~ NO.n(1 < n < m) kzallocs are successful.
+			 * But NO.(n + 1) failed. We still have n recv threads.
+			 * So, add flush_workqueue here to prevent recv threads
+			 * dropping the last config_refs and trying to destroy
+			 * the workqueue from inside the workqueue.
+			 */
+			if (i)
+				flush_workqueue(nbd->recv_workq);
+			return -ENOMEM;
+		}
+		sk_set_memalloc(config->socks[i]->sock->sk);
+		if (nbd->tag_set.timeout)
+			config->socks[i]->sock->sk->sk_sndtimeo =
+				nbd->tag_set.timeout;
+		atomic_inc(&config->recv_threads);
+		refcount_inc(&nbd->config_refs);
+		INIT_WORK(&args->work, recv_work);
+		args->nbd = nbd;
+		args->nsock = config->socks[i];
+		args->index = i;
+		queue_work(nbd->recv_workq, &args->work);
+	}
+	return nbd_set_size(nbd, config->bytesize, nbd_blksize(config));
+}
+
+static int nbd_start_device_ioctl(struct nbd_device *nbd)
+{
+	struct nbd_config *config = nbd->config;
+	int ret;
+
+	ret = nbd_start_device(nbd);
+	if (ret)
+		return ret;
+
+	if (max_part)
+		set_bit(GD_NEED_PART_SCAN, &nbd->disk->state);
+	mutex_unlock(&nbd->config_lock);
+	ret = wait_event_interruptible(config->recv_wq,
+					 atomic_read(&config->recv_threads) == 0);
+	if (ret) {
+		sock_shutdown(nbd);
+		nbd_clear_que(nbd);
+	}
+
+	flush_workqueue(nbd->recv_workq);
+	mutex_lock(&nbd->config_lock);
+	nbd_bdev_reset(nbd);
+	/* user requested, ignore socket errors */
+	if (test_bit(NBD_RT_DISCONNECT_REQUESTED, &config->runtime_flags))
+		ret = 0;
+	if (test_bit(NBD_RT_TIMEDOUT, &config->runtime_flags))
+		ret = -ETIMEDOUT;
+	return ret;
+}
+
+static void nbd_clear_sock_ioctl(struct nbd_device *nbd,
+				 struct block_device *bdev)
+{
+	nbd_clear_sock(nbd);
+	__invalidate_device(bdev, true);
+	nbd_bdev_reset(nbd);
+	if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF,
+			       &nbd->config->runtime_flags))
+		nbd_config_put(nbd);
+}
+
+static void nbd_set_cmd_timeout(struct nbd_device *nbd, u64 timeout)
+{
+	nbd->tag_set.timeout = timeout * HZ;
+	if (timeout)
+		blk_queue_rq_timeout(nbd->disk->queue, timeout * HZ);
+	else
+		blk_queue_rq_timeout(nbd->disk->queue, 30 * HZ);
+}
+
+/* Must be called with config_lock held */
+static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *nbd,
+		       unsigned int cmd, unsigned long arg)
+{
+	struct nbd_config *config = nbd->config;
+	loff_t bytesize;
+
+	switch (cmd) {
+	case NBD_DISCONNECT:
+		return nbd_disconnect(nbd);
+	case NBD_CLEAR_SOCK:
+		nbd_clear_sock_ioctl(nbd, bdev);
+		return 0;
+	case NBD_SET_SOCK:
+		return nbd_add_socket(nbd, arg, false);
+	case NBD_SET_BLKSIZE:
+		return nbd_set_size(nbd, config->bytesize, arg);
+	case NBD_SET_SIZE:
+		return nbd_set_size(nbd, arg, nbd_blksize(config));
+	case NBD_SET_SIZE_BLOCKS:
+		if (check_shl_overflow(arg, config->blksize_bits, &bytesize))
+			return -EINVAL;
+		return nbd_set_size(nbd, bytesize, nbd_blksize(config));
+	case NBD_SET_TIMEOUT:
+		nbd_set_cmd_timeout(nbd, arg);
+		return 0;
+
+	case NBD_SET_FLAGS:
+		config->flags = arg;
+		return 0;
+	case NBD_DO_IT:
+		return nbd_start_device_ioctl(nbd);
+	case NBD_CLEAR_QUE:
+		/*
+		 * This is for compatibility only.  The queue is always cleared
+		 * by NBD_DO_IT or NBD_CLEAR_SOCK.
+		 */
+		return 0;
+	case NBD_PRINT_DEBUG:
+		/*
+		 * For compatibility only, we no longer keep a list of
+		 * outstanding requests.
+		 */
+		return 0;
+	}
+	return -ENOTTY;
+}
+
+static int nbd_ioctl(struct block_device *bdev, fmode_t mode,
+		     unsigned int cmd, unsigned long arg)
+{
+	struct nbd_device *nbd = bdev->bd_disk->private_data;
+	struct nbd_config *config = nbd->config;
+	int error = -EINVAL;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	/* The block layer will pass back some non-nbd ioctls in case we have
+	 * special handling for them, but we don't so just return an error.
+	 */
+	if (_IOC_TYPE(cmd) != 0xab)
+		return -EINVAL;
+
+	mutex_lock(&nbd->config_lock);
+
+	/* Don't allow ioctl operations on a nbd device that was created with
+	 * netlink, unless it's DISCONNECT or CLEAR_SOCK, which are fine.
+	 */
+	if (!test_bit(NBD_RT_BOUND, &config->runtime_flags) ||
+	    (cmd == NBD_DISCONNECT || cmd == NBD_CLEAR_SOCK))
+		error = __nbd_ioctl(bdev, nbd, cmd, arg);
+	else
+		dev_err(nbd_to_dev(nbd), "Cannot use ioctl interface on a netlink controlled device.\n");
+	mutex_unlock(&nbd->config_lock);
+	return error;
+}
+
+static int nbd_alloc_and_init_config(struct nbd_device *nbd)
+{
+	struct nbd_config *config;
+
+	if (WARN_ON(nbd->config))
+		return -EINVAL;
+
+	if (!try_module_get(THIS_MODULE))
+		return -ENODEV;
+
+	config = kzalloc(sizeof(struct nbd_config), GFP_NOFS);
+	if (!config) {
+		module_put(THIS_MODULE);
+		return -ENOMEM;
+	}
+
+	atomic_set(&config->recv_threads, 0);
+	init_waitqueue_head(&config->recv_wq);
+	init_waitqueue_head(&config->conn_wait);
+	config->blksize_bits = NBD_DEF_BLKSIZE_BITS;
+	atomic_set(&config->live_connections, 0);
+	nbd->config = config;
+	refcount_set(&nbd->config_refs, 1);
+
+	return 0;
+}
+
+static int nbd_open(struct block_device *bdev, fmode_t mode)
+{
+	struct nbd_device *nbd;
+	int ret = 0;
+
+	mutex_lock(&nbd_index_mutex);
+	nbd = bdev->bd_disk->private_data;
+	if (!nbd) {
+		ret = -ENXIO;
+		goto out;
+	}
+	if (!refcount_inc_not_zero(&nbd->refs)) {
+		ret = -ENXIO;
+		goto out;
+	}
+	if (!refcount_inc_not_zero(&nbd->config_refs)) {
+		mutex_lock(&nbd->config_lock);
+		if (refcount_inc_not_zero(&nbd->config_refs)) {
+			mutex_unlock(&nbd->config_lock);
+			goto out;
+		}
+		ret = nbd_alloc_and_init_config(nbd);
+		if (ret) {
+			mutex_unlock(&nbd->config_lock);
+			goto out;
+		}
+
+		refcount_inc(&nbd->refs);
+		mutex_unlock(&nbd->config_lock);
+		if (max_part)
+			set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state);
+	} else if (nbd_disconnected(nbd->config)) {
+		if (max_part)
+			set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state);
+	}
+out:
+	mutex_unlock(&nbd_index_mutex);
+	return ret;
+}
+
+static void nbd_release(struct gendisk *disk, fmode_t mode)
+{
+	struct nbd_device *nbd = disk->private_data;
+
+	if (test_bit(NBD_RT_DISCONNECT_ON_CLOSE, &nbd->config->runtime_flags) &&
+			disk_openers(disk) == 0)
+		nbd_disconnect_and_put(nbd);
+
+	nbd_config_put(nbd);
+	nbd_put(nbd);
+}
+
+static void nbd_free_disk(struct gendisk *disk)
+{
+	struct nbd_device *nbd = disk->private_data;
+
+	kfree(nbd);
+}
+
+static const struct block_device_operations nbd_fops =
+{
+	.owner =	THIS_MODULE,
+	.open =		nbd_open,
+	.release =	nbd_release,
+	.ioctl =	nbd_ioctl,
+	.compat_ioctl =	nbd_ioctl,
+	.free_disk =	nbd_free_disk,
+};
+
+#if IS_ENABLED(CONFIG_DEBUG_FS)
+
+static int nbd_dbg_tasks_show(struct seq_file *s, void *unused)
+{
+	struct nbd_device *nbd = s->private;
+
+	if (nbd->pid)
+		seq_printf(s, "recv: %d\n", nbd->pid);
+
+	return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(nbd_dbg_tasks);
+
+static int nbd_dbg_flags_show(struct seq_file *s, void *unused)
+{
+	struct nbd_device *nbd = s->private;
+	u32 flags = nbd->config->flags;
+
+	seq_printf(s, "Hex: 0x%08x\n\n", flags);
+
+	seq_puts(s, "Known flags:\n");
+
+	if (flags & NBD_FLAG_HAS_FLAGS)
+		seq_puts(s, "NBD_FLAG_HAS_FLAGS\n");
+	if (flags & NBD_FLAG_READ_ONLY)
+		seq_puts(s, "NBD_FLAG_READ_ONLY\n");
+	if (flags & NBD_FLAG_SEND_FLUSH)
+		seq_puts(s, "NBD_FLAG_SEND_FLUSH\n");
+	if (flags & NBD_FLAG_SEND_FUA)
+		seq_puts(s, "NBD_FLAG_SEND_FUA\n");
+	if (flags & NBD_FLAG_SEND_TRIM)
+		seq_puts(s, "NBD_FLAG_SEND_TRIM\n");
+
+	return 0;
+}
+
+DEFINE_SHOW_ATTRIBUTE(nbd_dbg_flags);
+
+static int nbd_dev_dbg_init(struct nbd_device *nbd)
+{
+	struct dentry *dir;
+	struct nbd_config *config = nbd->config;
+
+	if (!nbd_dbg_dir)
+		return -EIO;
+
+	dir = debugfs_create_dir(nbd_name(nbd), nbd_dbg_dir);
+	if (IS_ERR(dir)) {
+		dev_err(nbd_to_dev(nbd), "Failed to create debugfs dir for '%s'\n",
+			nbd_name(nbd));
+		return -EIO;
+	}
+	config->dbg_dir = dir;
+
+	debugfs_create_file("tasks", 0444, dir, nbd, &nbd_dbg_tasks_fops);
+	debugfs_create_u64("size_bytes", 0444, dir, &config->bytesize);
+	debugfs_create_u32("timeout", 0444, dir, &nbd->tag_set.timeout);
+	debugfs_create_u32("blocksize_bits", 0444, dir, &config->blksize_bits);
+	debugfs_create_file("flags", 0444, dir, nbd, &nbd_dbg_flags_fops);
+
+	return 0;
+}
+
+static void nbd_dev_dbg_close(struct nbd_device *nbd)
+{
+	debugfs_remove_recursive(nbd->config->dbg_dir);
+}
+
+static int nbd_dbg_init(void)
+{
+	struct dentry *dbg_dir;
+
+	dbg_dir = debugfs_create_dir("nbd", NULL);
+	if (IS_ERR(dbg_dir))
+		return -EIO;
+
+	nbd_dbg_dir = dbg_dir;
+
+	return 0;
+}
+
+static void nbd_dbg_close(void)
+{
+	debugfs_remove_recursive(nbd_dbg_dir);
+}
+
+#else  /* IS_ENABLED(CONFIG_DEBUG_FS) */
+
+static int nbd_dev_dbg_init(struct nbd_device *nbd)
+{
+	return 0;
+}
+
+static void nbd_dev_dbg_close(struct nbd_device *nbd)
+{
+}
+
+static int nbd_dbg_init(void)
+{
+	return 0;
+}
+
+static void nbd_dbg_close(void)
+{
+}
+
+#endif
+
+static int nbd_init_request(struct blk_mq_tag_set *set, struct request *rq,
+			    unsigned int hctx_idx, unsigned int numa_node)
+{
+	struct nbd_cmd *cmd = blk_mq_rq_to_pdu(rq);
+	cmd->nbd = set->driver_data;
+	cmd->flags = 0;
+	mutex_init(&cmd->lock);
+	return 0;
+}
+
+static const struct blk_mq_ops nbd_mq_ops = {
+	.queue_rq	= nbd_queue_rq,
+	.complete	= nbd_complete_rq,
+	.init_request	= nbd_init_request,
+	.timeout	= nbd_xmit_timeout,
+};
+
+static struct nbd_device *nbd_dev_add(int index, unsigned int refs)
+{
+	struct nbd_device *nbd;
+	struct gendisk *disk;
+	int err = -ENOMEM;
+
+	nbd = kzalloc(sizeof(struct nbd_device), GFP_KERNEL);
+	if (!nbd)
+		goto out;
+
+	nbd->tag_set.ops = &nbd_mq_ops;
+	nbd->tag_set.nr_hw_queues = 1;
+	nbd->tag_set.queue_depth = 128;
+	nbd->tag_set.numa_node = NUMA_NO_NODE;
+	nbd->tag_set.cmd_size = sizeof(struct nbd_cmd);
+	nbd->tag_set.flags = BLK_MQ_F_SHOULD_MERGE |
+		BLK_MQ_F_BLOCKING;
+	nbd->tag_set.driver_data = nbd;
+	INIT_WORK(&nbd->remove_work, nbd_dev_remove_work);
+	nbd->backend = NULL;
+
+	err = blk_mq_alloc_tag_set(&nbd->tag_set);
+	if (err)
+		goto out_free_nbd;
+
+	mutex_lock(&nbd_index_mutex);
+	if (index >= 0) {
+		err = idr_alloc(&nbd_index_idr, nbd, index, index + 1,
+				GFP_KERNEL);
+		if (err == -ENOSPC)
+			err = -EEXIST;
+	} else {
+		err = idr_alloc(&nbd_index_idr, nbd, 0, 0, GFP_KERNEL);
+		if (err >= 0)
+			index = err;
+	}
+	nbd->index = index;
+	mutex_unlock(&nbd_index_mutex);
+	if (err < 0)
+		goto out_free_tags;
+
+	disk = blk_mq_alloc_disk(&nbd->tag_set, NULL);
+	if (IS_ERR(disk)) {
+		err = PTR_ERR(disk);
+		goto out_free_idr;
+	}
+	nbd->disk = disk;
+
+	nbd->recv_workq = alloc_workqueue("nbd%d-recv",
+					  WQ_MEM_RECLAIM | WQ_HIGHPRI |
+					  WQ_UNBOUND, 0, nbd->index);
+	if (!nbd->recv_workq) {
+		dev_err(disk_to_dev(nbd->disk), "Could not allocate knbd recv work queue.\n");
+		err = -ENOMEM;
+		goto out_err_disk;
+	}
+
+	/*
+	 * Tell the block layer that we are not a rotational device
+	 */
+	blk_queue_flag_set(QUEUE_FLAG_NONROT, disk->queue);
+	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, disk->queue);
+	disk->queue->limits.discard_granularity = 0;
+	blk_queue_max_discard_sectors(disk->queue, 0);
+	blk_queue_max_segment_size(disk->queue, UINT_MAX);
+	blk_queue_max_segments(disk->queue, USHRT_MAX);
+	blk_queue_max_hw_sectors(disk->queue, 65536);
+	disk->queue->limits.max_sectors = 256;
+
+	mutex_init(&nbd->config_lock);
+	refcount_set(&nbd->config_refs, 0);
+	/*
+	 * Start out with a zero references to keep other threads from using
+	 * this device until it is fully initialized.
+	 */
+	refcount_set(&nbd->refs, 0);
+	INIT_LIST_HEAD(&nbd->list);
+	disk->major = NBD_MAJOR;
+	disk->first_minor = index << part_shift;
+	disk->minors = 1 << part_shift;
+	disk->fops = &nbd_fops;
+	disk->private_data = nbd;
+	sprintf(disk->disk_name, "nbd%d", index);
+	err = add_disk(disk);
+	if (err)
+		goto out_free_work;
+
+	/*
+	 * Now publish the device.
+	 */
+	refcount_set(&nbd->refs, refs);
+	nbd_total_devices++;
+	return nbd;
+
+out_free_work:
+	destroy_workqueue(nbd->recv_workq);
+out_err_disk:
+	put_disk(disk);
+out_free_idr:
+	mutex_lock(&nbd_index_mutex);
+	idr_remove(&nbd_index_idr, index);
+	mutex_unlock(&nbd_index_mutex);
+out_free_tags:
+	blk_mq_free_tag_set(&nbd->tag_set);
+out_free_nbd:
+	kfree(nbd);
+out:
+	return ERR_PTR(err);
+}
+
+static struct nbd_device *nbd_find_get_unused(void)
+{
+	struct nbd_device *nbd;
+	int id;
+
+	lockdep_assert_held(&nbd_index_mutex);
+
+	idr_for_each_entry(&nbd_index_idr, nbd, id) {
+		if (refcount_read(&nbd->config_refs) ||
+		    test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags))
+			continue;
+		if (refcount_inc_not_zero(&nbd->refs))
+			return nbd;
+	}
+
+	return NULL;
+}
+
+/* Netlink interface. */
+static const struct nla_policy nbd_attr_policy[NBD_ATTR_MAX + 1] = {
+	[NBD_ATTR_INDEX]		=	{ .type = NLA_U32 },
+	[NBD_ATTR_SIZE_BYTES]		=	{ .type = NLA_U64 },
+	[NBD_ATTR_BLOCK_SIZE_BYTES]	=	{ .type = NLA_U64 },
+	[NBD_ATTR_TIMEOUT]		=	{ .type = NLA_U64 },
+	[NBD_ATTR_SERVER_FLAGS]		=	{ .type = NLA_U64 },
+	[NBD_ATTR_CLIENT_FLAGS]		=	{ .type = NLA_U64 },
+	[NBD_ATTR_SOCKETS]		=	{ .type = NLA_NESTED},
+	[NBD_ATTR_DEAD_CONN_TIMEOUT]	=	{ .type = NLA_U64 },
+	[NBD_ATTR_DEVICE_LIST]		=	{ .type = NLA_NESTED},
+	[NBD_ATTR_BACKEND_IDENTIFIER]	=	{ .type = NLA_STRING},
+};
+
+static const struct nla_policy nbd_sock_policy[NBD_SOCK_MAX + 1] = {
+	[NBD_SOCK_FD]			=	{ .type = NLA_U32 },
+};
+
+/* We don't use this right now since we don't parse the incoming list, but we
+ * still want it here so userspace knows what to expect.
+ */
+static const struct nla_policy __attribute__((unused))
+nbd_device_policy[NBD_DEVICE_ATTR_MAX + 1] = {
+	[NBD_DEVICE_INDEX]		=	{ .type = NLA_U32 },
+	[NBD_DEVICE_CONNECTED]		=	{ .type = NLA_U8 },
+};
+
+static int nbd_genl_size_set(struct genl_info *info, struct nbd_device *nbd)
+{
+	struct nbd_config *config = nbd->config;
+	u64 bsize = nbd_blksize(config);
+	u64 bytes = config->bytesize;
+
+	if (info->attrs[NBD_ATTR_SIZE_BYTES])
+		bytes = nla_get_u64(info->attrs[NBD_ATTR_SIZE_BYTES]);
+
+	if (info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES])
+		bsize = nla_get_u64(info->attrs[NBD_ATTR_BLOCK_SIZE_BYTES]);
+
+	if (bytes != config->bytesize || bsize != nbd_blksize(config))
+		return nbd_set_size(nbd, bytes, bsize);
+	return 0;
+}
+
+static int nbd_genl_connect(struct sk_buff *skb, struct genl_info *info)
+{
+	struct nbd_device *nbd;
+	struct nbd_config *config;
+	int index = -1;
+	int ret;
+	bool put_dev = false;
+
+	if (!netlink_capable(skb, CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (info->attrs[NBD_ATTR_INDEX]) {
+		index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
+
+		/*
+		 * Too big first_minor can cause duplicate creation of
+		 * sysfs files/links, since index << part_shift might overflow, or
+		 * MKDEV() expect that the max bits of first_minor is 20.
+		 */
+		if (index < 0 || index > MINORMASK >> part_shift) {
+			pr_err("illegal input index %d\n", index);
+			return -EINVAL;
+		}
+	}
+	if (!info->attrs[NBD_ATTR_SOCKETS]) {
+		pr_err("must specify at least one socket\n");
+		return -EINVAL;
+	}
+	if (!info->attrs[NBD_ATTR_SIZE_BYTES]) {
+		pr_err("must specify a size in bytes for the device\n");
+		return -EINVAL;
+	}
+again:
+	mutex_lock(&nbd_index_mutex);
+	if (index == -1) {
+		nbd = nbd_find_get_unused();
+	} else {
+		nbd = idr_find(&nbd_index_idr, index);
+		if (nbd) {
+			if ((test_bit(NBD_DESTROY_ON_DISCONNECT, &nbd->flags) &&
+			     test_bit(NBD_DISCONNECT_REQUESTED, &nbd->flags)) ||
+			    !refcount_inc_not_zero(&nbd->refs)) {
+				mutex_unlock(&nbd_index_mutex);
+				pr_err("device at index %d is going down\n",
+					index);
+				return -EINVAL;
+			}
+		}
+	}
+	mutex_unlock(&nbd_index_mutex);
+
+	if (!nbd) {
+		nbd = nbd_dev_add(index, 2);
+		if (IS_ERR(nbd)) {
+			pr_err("failed to add new device\n");
+			return PTR_ERR(nbd);
+		}
+	}
+
+	mutex_lock(&nbd->config_lock);
+	if (refcount_read(&nbd->config_refs)) {
+		mutex_unlock(&nbd->config_lock);
+		nbd_put(nbd);
+		if (index == -1)
+			goto again;
+		pr_err("nbd%d already in use\n", index);
+		return -EBUSY;
+	}
+
+	ret = nbd_alloc_and_init_config(nbd);
+	if (ret) {
+		mutex_unlock(&nbd->config_lock);
+		nbd_put(nbd);
+		pr_err("couldn't allocate config\n");
+		return ret;
+	}
+
+	config = nbd->config;
+	set_bit(NBD_RT_BOUND, &config->runtime_flags);
+	ret = nbd_genl_size_set(info, nbd);
+	if (ret)
+		goto out;
+
+	if (info->attrs[NBD_ATTR_TIMEOUT])
+		nbd_set_cmd_timeout(nbd,
+				    nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]));
+	if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) {
+		config->dead_conn_timeout =
+			nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]);
+		config->dead_conn_timeout *= HZ;
+	}
+	if (info->attrs[NBD_ATTR_SERVER_FLAGS])
+		config->flags =
+			nla_get_u64(info->attrs[NBD_ATTR_SERVER_FLAGS]);
+	if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) {
+		u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]);
+		if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) {
+			/*
+			 * We have 1 ref to keep the device around, and then 1
+			 * ref for our current operation here, which will be
+			 * inherited by the config.  If we already have
+			 * DESTROY_ON_DISCONNECT set then we know we don't have
+			 * that extra ref already held so we don't need the
+			 * put_dev.
+			 */
+			if (!test_and_set_bit(NBD_DESTROY_ON_DISCONNECT,
+					      &nbd->flags))
+				put_dev = true;
+		} else {
+			if (test_and_clear_bit(NBD_DESTROY_ON_DISCONNECT,
+					       &nbd->flags))
+				refcount_inc(&nbd->refs);
+		}
+		if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) {
+			set_bit(NBD_RT_DISCONNECT_ON_CLOSE,
+				&config->runtime_flags);
+		}
+	}
+
+	if (info->attrs[NBD_ATTR_SOCKETS]) {
+		struct nlattr *attr;
+		int rem, fd;
+
+		nla_for_each_nested(attr, info->attrs[NBD_ATTR_SOCKETS],
+				    rem) {
+			struct nlattr *socks[NBD_SOCK_MAX+1];
+
+			if (nla_type(attr) != NBD_SOCK_ITEM) {
+				pr_err("socks must be embedded in a SOCK_ITEM attr\n");
+				ret = -EINVAL;
+				goto out;
+			}
+			ret = nla_parse_nested_deprecated(socks, NBD_SOCK_MAX,
+							  attr,
+							  nbd_sock_policy,
+							  info->extack);
+			if (ret != 0) {
+				pr_err("error processing sock list\n");
+				ret = -EINVAL;
+				goto out;
+			}
+			if (!socks[NBD_SOCK_FD])
+				continue;
+			fd = (int)nla_get_u32(socks[NBD_SOCK_FD]);
+			ret = nbd_add_socket(nbd, fd, true);
+			if (ret)
+				goto out;
+		}
+	}
+	ret = nbd_start_device(nbd);
+	if (ret)
+		goto out;
+	if (info->attrs[NBD_ATTR_BACKEND_IDENTIFIER]) {
+		nbd->backend = nla_strdup(info->attrs[NBD_ATTR_BACKEND_IDENTIFIER],
+					  GFP_KERNEL);
+		if (!nbd->backend) {
+			ret = -ENOMEM;
+			goto out;
+		}
+	}
+	ret = device_create_file(disk_to_dev(nbd->disk), &backend_attr);
+	if (ret) {
+		dev_err(disk_to_dev(nbd->disk),
+			"device_create_file failed for backend!\n");
+		goto out;
+	}
+	set_bit(NBD_RT_HAS_BACKEND_FILE, &config->runtime_flags);
+out:
+	mutex_unlock(&nbd->config_lock);
+	if (!ret) {
+		set_bit(NBD_RT_HAS_CONFIG_REF, &config->runtime_flags);
+		refcount_inc(&nbd->config_refs);
+		nbd_connect_reply(info, nbd->index);
+	}
+	nbd_config_put(nbd);
+	if (put_dev)
+		nbd_put(nbd);
+	return ret;
+}
+
+static void nbd_disconnect_and_put(struct nbd_device *nbd)
+{
+	mutex_lock(&nbd->config_lock);
+	nbd_disconnect(nbd);
+	sock_shutdown(nbd);
+	wake_up(&nbd->config->conn_wait);
+	/*
+	 * Make sure recv thread has finished, we can safely call nbd_clear_que()
+	 * to cancel the inflight I/Os.
+	 */
+	flush_workqueue(nbd->recv_workq);
+	nbd_clear_que(nbd);
+	nbd->task_setup = NULL;
+	mutex_unlock(&nbd->config_lock);
+
+	if (test_and_clear_bit(NBD_RT_HAS_CONFIG_REF,
+			       &nbd->config->runtime_flags))
+		nbd_config_put(nbd);
+}
+
+static int nbd_genl_disconnect(struct sk_buff *skb, struct genl_info *info)
+{
+	struct nbd_device *nbd;
+	int index;
+
+	if (!netlink_capable(skb, CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (!info->attrs[NBD_ATTR_INDEX]) {
+		pr_err("must specify an index to disconnect\n");
+		return -EINVAL;
+	}
+	index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
+	mutex_lock(&nbd_index_mutex);
+	nbd = idr_find(&nbd_index_idr, index);
+	if (!nbd) {
+		mutex_unlock(&nbd_index_mutex);
+		pr_err("couldn't find device at index %d\n", index);
+		return -EINVAL;
+	}
+	if (!refcount_inc_not_zero(&nbd->refs)) {
+		mutex_unlock(&nbd_index_mutex);
+		pr_err("device at index %d is going down\n", index);
+		return -EINVAL;
+	}
+	mutex_unlock(&nbd_index_mutex);
+	if (!refcount_inc_not_zero(&nbd->config_refs))
+		goto put_nbd;
+	nbd_disconnect_and_put(nbd);
+	nbd_config_put(nbd);
+put_nbd:
+	nbd_put(nbd);
+	return 0;
+}
+
+static int nbd_genl_reconfigure(struct sk_buff *skb, struct genl_info *info)
+{
+	struct nbd_device *nbd = NULL;
+	struct nbd_config *config;
+	int index;
+	int ret = 0;
+	bool put_dev = false;
+
+	if (!netlink_capable(skb, CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (!info->attrs[NBD_ATTR_INDEX]) {
+		pr_err("must specify a device to reconfigure\n");
+		return -EINVAL;
+	}
+	index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
+	mutex_lock(&nbd_index_mutex);
+	nbd = idr_find(&nbd_index_idr, index);
+	if (!nbd) {
+		mutex_unlock(&nbd_index_mutex);
+		pr_err("couldn't find a device at index %d\n", index);
+		return -EINVAL;
+	}
+	if (nbd->backend) {
+		if (info->attrs[NBD_ATTR_BACKEND_IDENTIFIER]) {
+			if (nla_strcmp(info->attrs[NBD_ATTR_BACKEND_IDENTIFIER],
+				       nbd->backend)) {
+				mutex_unlock(&nbd_index_mutex);
+				dev_err(nbd_to_dev(nbd),
+					"backend image doesn't match with %s\n",
+					nbd->backend);
+				return -EINVAL;
+			}
+		} else {
+			mutex_unlock(&nbd_index_mutex);
+			dev_err(nbd_to_dev(nbd), "must specify backend\n");
+			return -EINVAL;
+		}
+	}
+	if (!refcount_inc_not_zero(&nbd->refs)) {
+		mutex_unlock(&nbd_index_mutex);
+		pr_err("device at index %d is going down\n", index);
+		return -EINVAL;
+	}
+	mutex_unlock(&nbd_index_mutex);
+
+	if (!refcount_inc_not_zero(&nbd->config_refs)) {
+		dev_err(nbd_to_dev(nbd),
+			"not configured, cannot reconfigure\n");
+		nbd_put(nbd);
+		return -EINVAL;
+	}
+
+	mutex_lock(&nbd->config_lock);
+	config = nbd->config;
+	if (!test_bit(NBD_RT_BOUND, &config->runtime_flags) ||
+	    !nbd->pid) {
+		dev_err(nbd_to_dev(nbd),
+			"not configured, cannot reconfigure\n");
+		ret = -EINVAL;
+		goto out;
+	}
+
+	ret = nbd_genl_size_set(info, nbd);
+	if (ret)
+		goto out;
+
+	if (info->attrs[NBD_ATTR_TIMEOUT])
+		nbd_set_cmd_timeout(nbd,
+				    nla_get_u64(info->attrs[NBD_ATTR_TIMEOUT]));
+	if (info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]) {
+		config->dead_conn_timeout =
+			nla_get_u64(info->attrs[NBD_ATTR_DEAD_CONN_TIMEOUT]);
+		config->dead_conn_timeout *= HZ;
+	}
+	if (info->attrs[NBD_ATTR_CLIENT_FLAGS]) {
+		u64 flags = nla_get_u64(info->attrs[NBD_ATTR_CLIENT_FLAGS]);
+		if (flags & NBD_CFLAG_DESTROY_ON_DISCONNECT) {
+			if (!test_and_set_bit(NBD_DESTROY_ON_DISCONNECT,
+					      &nbd->flags))
+				put_dev = true;
+		} else {
+			if (test_and_clear_bit(NBD_DESTROY_ON_DISCONNECT,
+					       &nbd->flags))
+				refcount_inc(&nbd->refs);
+		}
+
+		if (flags & NBD_CFLAG_DISCONNECT_ON_CLOSE) {
+			set_bit(NBD_RT_DISCONNECT_ON_CLOSE,
+					&config->runtime_flags);
+		} else {
+			clear_bit(NBD_RT_DISCONNECT_ON_CLOSE,
+					&config->runtime_flags);
+		}
+	}
+
+	if (info->attrs[NBD_ATTR_SOCKETS]) {
+		struct nlattr *attr;
+		int rem, fd;
+
+		nla_for_each_nested(attr, info->attrs[NBD_ATTR_SOCKETS],
+				    rem) {
+			struct nlattr *socks[NBD_SOCK_MAX+1];
+
+			if (nla_type(attr) != NBD_SOCK_ITEM) {
+				pr_err("socks must be embedded in a SOCK_ITEM attr\n");
+				ret = -EINVAL;
+				goto out;
+			}
+			ret = nla_parse_nested_deprecated(socks, NBD_SOCK_MAX,
+							  attr,
+							  nbd_sock_policy,
+							  info->extack);
+			if (ret != 0) {
+				pr_err("error processing sock list\n");
+				ret = -EINVAL;
+				goto out;
+			}
+			if (!socks[NBD_SOCK_FD])
+				continue;
+			fd = (int)nla_get_u32(socks[NBD_SOCK_FD]);
+			ret = nbd_reconnect_socket(nbd, fd);
+			if (ret) {
+				if (ret == -ENOSPC)
+					ret = 0;
+				goto out;
+			}
+			dev_info(nbd_to_dev(nbd), "reconnected socket\n");
+		}
+	}
+out:
+	mutex_unlock(&nbd->config_lock);
+	nbd_config_put(nbd);
+	nbd_put(nbd);
+	if (put_dev)
+		nbd_put(nbd);
+	return ret;
+}
+
+static const struct genl_small_ops nbd_connect_genl_ops[] = {
+	{
+		.cmd	= NBD_CMD_CONNECT,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit	= nbd_genl_connect,
+	},
+	{
+		.cmd	= NBD_CMD_DISCONNECT,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit	= nbd_genl_disconnect,
+	},
+	{
+		.cmd	= NBD_CMD_RECONFIGURE,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit	= nbd_genl_reconfigure,
+	},
+	{
+		.cmd	= NBD_CMD_STATUS,
+		.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
+		.doit	= nbd_genl_status,
+	},
+};
+
+static const struct genl_multicast_group nbd_mcast_grps[] = {
+	{ .name = NBD_GENL_MCAST_GROUP_NAME, },
+};
+
+static struct genl_family nbd_genl_family __ro_after_init = {
+	.hdrsize	= 0,
+	.name		= NBD_GENL_FAMILY_NAME,
+	.version	= NBD_GENL_VERSION,
+	.module		= THIS_MODULE,
+	.small_ops	= nbd_connect_genl_ops,
+	.n_small_ops	= ARRAY_SIZE(nbd_connect_genl_ops),
+	.resv_start_op	= NBD_CMD_STATUS + 1,
+	.maxattr	= NBD_ATTR_MAX,
+	.policy = nbd_attr_policy,
+	.mcgrps		= nbd_mcast_grps,
+	.n_mcgrps	= ARRAY_SIZE(nbd_mcast_grps),
+};
+
+static int populate_nbd_status(struct nbd_device *nbd, struct sk_buff *reply)
+{
+	struct nlattr *dev_opt;
+	u8 connected = 0;
+	int ret;
+
+	/* This is a little racey, but for status it's ok.  The
+	 * reason we don't take a ref here is because we can't
+	 * take a ref in the index == -1 case as we would need
+	 * to put under the nbd_index_mutex, which could
+	 * deadlock if we are configured to remove ourselves
+	 * once we're disconnected.
+	 */
+	if (refcount_read(&nbd->config_refs))
+		connected = 1;
+	dev_opt = nla_nest_start_noflag(reply, NBD_DEVICE_ITEM);
+	if (!dev_opt)
+		return -EMSGSIZE;
+	ret = nla_put_u32(reply, NBD_DEVICE_INDEX, nbd->index);
+	if (ret)
+		return -EMSGSIZE;
+	ret = nla_put_u8(reply, NBD_DEVICE_CONNECTED,
+			 connected);
+	if (ret)
+		return -EMSGSIZE;
+	nla_nest_end(reply, dev_opt);
+	return 0;
+}
+
+static int status_cb(int id, void *ptr, void *data)
+{
+	struct nbd_device *nbd = ptr;
+	return populate_nbd_status(nbd, (struct sk_buff *)data);
+}
+
+static int nbd_genl_status(struct sk_buff *skb, struct genl_info *info)
+{
+	struct nlattr *dev_list;
+	struct sk_buff *reply;
+	void *reply_head;
+	size_t msg_size;
+	int index = -1;
+	int ret = -ENOMEM;
+
+	if (info->attrs[NBD_ATTR_INDEX])
+		index = nla_get_u32(info->attrs[NBD_ATTR_INDEX]);
+
+	mutex_lock(&nbd_index_mutex);
+
+	msg_size = nla_total_size(nla_attr_size(sizeof(u32)) +
+				  nla_attr_size(sizeof(u8)));
+	msg_size *= (index == -1) ? nbd_total_devices : 1;
+
+	reply = genlmsg_new(msg_size, GFP_KERNEL);
+	if (!reply)
+		goto out;
+	reply_head = genlmsg_put_reply(reply, info, &nbd_genl_family, 0,
+				       NBD_CMD_STATUS);
+	if (!reply_head) {
+		nlmsg_free(reply);
+		goto out;
+	}
+
+	dev_list = nla_nest_start_noflag(reply, NBD_ATTR_DEVICE_LIST);
+	if (index == -1) {
+		ret = idr_for_each(&nbd_index_idr, &status_cb, reply);
+		if (ret) {
+			nlmsg_free(reply);
+			goto out;
+		}
+	} else {
+		struct nbd_device *nbd;
+		nbd = idr_find(&nbd_index_idr, index);
+		if (nbd) {
+			ret = populate_nbd_status(nbd, reply);
+			if (ret) {
+				nlmsg_free(reply);
+				goto out;
+			}
+		}
+	}
+	nla_nest_end(reply, dev_list);
+	genlmsg_end(reply, reply_head);
+	ret = genlmsg_reply(reply, info);
+out:
+	mutex_unlock(&nbd_index_mutex);
+	return ret;
+}
+
+static void nbd_connect_reply(struct genl_info *info, int index)
+{
+	struct sk_buff *skb;
+	void *msg_head;
+	int ret;
+
+	skb = genlmsg_new(nla_total_size(sizeof(u32)), GFP_KERNEL);
+	if (!skb)
+		return;
+	msg_head = genlmsg_put_reply(skb, info, &nbd_genl_family, 0,
+				     NBD_CMD_CONNECT);
+	if (!msg_head) {
+		nlmsg_free(skb);
+		return;
+	}
+	ret = nla_put_u32(skb, NBD_ATTR_INDEX, index);
+	if (ret) {
+		nlmsg_free(skb);
+		return;
+	}
+	genlmsg_end(skb, msg_head);
+	genlmsg_reply(skb, info);
+}
+
+static void nbd_mcast_index(int index)
+{
+	struct sk_buff *skb;
+	void *msg_head;
+	int ret;
+
+	skb = genlmsg_new(nla_total_size(sizeof(u32)), GFP_KERNEL);
+	if (!skb)
+		return;
+	msg_head = genlmsg_put(skb, 0, 0, &nbd_genl_family, 0,
+				     NBD_CMD_LINK_DEAD);
+	if (!msg_head) {
+		nlmsg_free(skb);
+		return;
+	}
+	ret = nla_put_u32(skb, NBD_ATTR_INDEX, index);
+	if (ret) {
+		nlmsg_free(skb);
+		return;
+	}
+	genlmsg_end(skb, msg_head);
+	genlmsg_multicast(&nbd_genl_family, skb, 0, 0, GFP_KERNEL);
+}
+
+static void nbd_dead_link_work(struct work_struct *work)
+{
+	struct link_dead_args *args = container_of(work, struct link_dead_args,
+						   work);
+	nbd_mcast_index(args->index);
+	kfree(args);
+}
+
+static int __init nbd_init(void)
+{
+	int i;
+
+	BUILD_BUG_ON(sizeof(struct nbd_request) != 28);
+
+	if (max_part < 0) {
+		pr_err("max_part must be >= 0\n");
+		return -EINVAL;
+	}
+
+	part_shift = 0;
+	if (max_part > 0) {
+		part_shift = fls(max_part);
+
+		/*
+		 * Adjust max_part according to part_shift as it is exported
+		 * to user space so that user can know the max number of
+		 * partition kernel should be able to manage.
+		 *
+		 * Note that -1 is required because partition 0 is reserved
+		 * for the whole disk.
+		 */
+		max_part = (1UL << part_shift) - 1;
+	}
+
+	if ((1UL << part_shift) > DISK_MAX_PARTS)
+		return -EINVAL;
+
+	if (nbds_max > 1UL << (MINORBITS - part_shift))
+		return -EINVAL;
+
+	if (register_blkdev(NBD_MAJOR, "nbd"))
+		return -EIO;
+
+	nbd_del_wq = alloc_workqueue("nbd-del", WQ_UNBOUND, 0);
+	if (!nbd_del_wq) {
+		unregister_blkdev(NBD_MAJOR, "nbd");
+		return -ENOMEM;
+	}
+
+	if (genl_register_family(&nbd_genl_family)) {
+		destroy_workqueue(nbd_del_wq);
+		unregister_blkdev(NBD_MAJOR, "nbd");
+		return -EINVAL;
+	}
+	nbd_dbg_init();
+
+	for (i = 0; i < nbds_max; i++)
+		nbd_dev_add(i, 1);
+	return 0;
+}
+
+static int nbd_exit_cb(int id, void *ptr, void *data)
+{
+	struct list_head *list = (struct list_head *)data;
+	struct nbd_device *nbd = ptr;
+
+	/* Skip nbd that is being removed asynchronously */
+	if (refcount_read(&nbd->refs))
+		list_add_tail(&nbd->list, list);
+
+	return 0;
+}
+
+static void __exit nbd_cleanup(void)
+{
+	struct nbd_device *nbd;
+	LIST_HEAD(del_list);
+
+	/*
+	 * Unregister netlink interface prior to waiting
+	 * for the completion of netlink commands.
+	 */
+	genl_unregister_family(&nbd_genl_family);
+
+	nbd_dbg_close();
+
+	mutex_lock(&nbd_index_mutex);
+	idr_for_each(&nbd_index_idr, &nbd_exit_cb, &del_list);
+	mutex_unlock(&nbd_index_mutex);
+
+	while (!list_empty(&del_list)) {
+		nbd = list_first_entry(&del_list, struct nbd_device, list);
+		list_del_init(&nbd->list);
+		if (refcount_read(&nbd->config_refs))
+			pr_err("possibly leaking nbd_config (ref %d)\n",
+					refcount_read(&nbd->config_refs));
+		if (refcount_read(&nbd->refs) != 1)
+			pr_err("possibly leaking a device\n");
+		nbd_put(nbd);
+	}
+
+	/* Also wait for nbd_dev_remove_work() completes */
+	destroy_workqueue(nbd_del_wq);
+
+	idr_destroy(&nbd_index_idr);
+	unregister_blkdev(NBD_MAJOR, "nbd");
+}
+
+module_init(nbd_init);
+module_exit(nbd_cleanup);
+
+MODULE_DESCRIPTION("Network Block Device");
+MODULE_LICENSE("GPL");
+
+module_param(nbds_max, int, 0444);
+MODULE_PARM_DESC(nbds_max, "number of network block devices to initialize (default: 16)");
+module_param(max_part, int, 0444);
+MODULE_PARM_DESC(max_part, "number of partitions per device (default: 16)");
diff --git a/drivers/block/null_blk/Kconfig b/drivers/block/null_blk/Kconfig
new file mode 100644
index 000000000..6bf1f8ca2
--- /dev/null
+++ b/drivers/block/null_blk/Kconfig
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Null block device driver configuration
+#
+
+config BLK_DEV_NULL_BLK
+	tristate "Null test block driver"
+	select CONFIGFS_FS
+
+config BLK_DEV_NULL_BLK_FAULT_INJECTION
+	bool "Support fault injection for Null test block driver"
+	depends on BLK_DEV_NULL_BLK && FAULT_INJECTION
diff --git a/drivers/block/null_blk/Makefile b/drivers/block/null_blk/Makefile
new file mode 100644
index 000000000..84c36e512
--- /dev/null
+++ b/drivers/block/null_blk/Makefile
@@ -0,0 +1,11 @@
+# SPDX-License-Identifier: GPL-2.0
+
+# needed for trace events
+ccflags-y			+= -I$(src)
+
+obj-$(CONFIG_BLK_DEV_NULL_BLK)	+= null_blk.o
+null_blk-objs			:= main.o
+ifeq ($(CONFIG_BLK_DEV_ZONED), y)
+null_blk-$(CONFIG_TRACING) 	+= trace.o
+endif
+null_blk-$(CONFIG_BLK_DEV_ZONED) += zoned.o
diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c
new file mode 100644
index 000000000..959952e8e
--- /dev/null
+++ b/drivers/block/null_blk/main.c
@@ -0,0 +1,2307 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Add configfs and memory store: Kyungchan Koh <kkc6196@fb.com> and
+ * Shaohua Li <shli@fb.com>
+ */
+#include <linux/module.h>
+
+#include <linux/moduleparam.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include "null_blk.h"
+
+#undef pr_fmt
+#define pr_fmt(fmt)	"null_blk: " fmt
+
+#define FREE_BATCH		16
+
+#define TICKS_PER_SEC		50ULL
+#define TIMER_INTERVAL		(NSEC_PER_SEC / TICKS_PER_SEC)
+
+#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
+static DECLARE_FAULT_ATTR(null_timeout_attr);
+static DECLARE_FAULT_ATTR(null_requeue_attr);
+static DECLARE_FAULT_ATTR(null_init_hctx_attr);
+#endif
+
+static inline u64 mb_per_tick(int mbps)
+{
+	return (1 << 20) / TICKS_PER_SEC * ((u64) mbps);
+}
+
+/*
+ * Status flags for nullb_device.
+ *
+ * CONFIGURED:	Device has been configured and turned on. Cannot reconfigure.
+ * UP:		Device is currently on and visible in userspace.
+ * THROTTLED:	Device is being throttled.
+ * CACHE:	Device is using a write-back cache.
+ */
+enum nullb_device_flags {
+	NULLB_DEV_FL_CONFIGURED	= 0,
+	NULLB_DEV_FL_UP		= 1,
+	NULLB_DEV_FL_THROTTLED	= 2,
+	NULLB_DEV_FL_CACHE	= 3,
+};
+
+#define MAP_SZ		((PAGE_SIZE >> SECTOR_SHIFT) + 2)
+/*
+ * nullb_page is a page in memory for nullb devices.
+ *
+ * @page:	The page holding the data.
+ * @bitmap:	The bitmap represents which sector in the page has data.
+ *		Each bit represents one block size. For example, sector 8
+ *		will use the 7th bit
+ * The highest 2 bits of bitmap are for special purpose. LOCK means the cache
+ * page is being flushing to storage. FREE means the cache page is freed and
+ * should be skipped from flushing to storage. Please see
+ * null_make_cache_space
+ */
+struct nullb_page {
+	struct page *page;
+	DECLARE_BITMAP(bitmap, MAP_SZ);
+};
+#define NULLB_PAGE_LOCK (MAP_SZ - 1)
+#define NULLB_PAGE_FREE (MAP_SZ - 2)
+
+static LIST_HEAD(nullb_list);
+static struct mutex lock;
+static int null_major;
+static DEFINE_IDA(nullb_indexes);
+static struct blk_mq_tag_set tag_set;
+
+enum {
+	NULL_IRQ_NONE		= 0,
+	NULL_IRQ_SOFTIRQ	= 1,
+	NULL_IRQ_TIMER		= 2,
+};
+
+static bool g_virt_boundary = false;
+module_param_named(virt_boundary, g_virt_boundary, bool, 0444);
+MODULE_PARM_DESC(virt_boundary, "Require a virtual boundary for the device. Default: False");
+
+static int g_no_sched;
+module_param_named(no_sched, g_no_sched, int, 0444);
+MODULE_PARM_DESC(no_sched, "No io scheduler");
+
+static int g_submit_queues = 1;
+module_param_named(submit_queues, g_submit_queues, int, 0444);
+MODULE_PARM_DESC(submit_queues, "Number of submission queues");
+
+static int g_poll_queues = 1;
+module_param_named(poll_queues, g_poll_queues, int, 0444);
+MODULE_PARM_DESC(poll_queues, "Number of IOPOLL submission queues");
+
+static int g_home_node = NUMA_NO_NODE;
+module_param_named(home_node, g_home_node, int, 0444);
+MODULE_PARM_DESC(home_node, "Home node for the device");
+
+#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
+/*
+ * For more details about fault injection, please refer to
+ * Documentation/fault-injection/fault-injection.rst.
+ */
+static char g_timeout_str[80];
+module_param_string(timeout, g_timeout_str, sizeof(g_timeout_str), 0444);
+MODULE_PARM_DESC(timeout, "Fault injection. timeout=<interval>,<probability>,<space>,<times>");
+
+static char g_requeue_str[80];
+module_param_string(requeue, g_requeue_str, sizeof(g_requeue_str), 0444);
+MODULE_PARM_DESC(requeue, "Fault injection. requeue=<interval>,<probability>,<space>,<times>");
+
+static char g_init_hctx_str[80];
+module_param_string(init_hctx, g_init_hctx_str, sizeof(g_init_hctx_str), 0444);
+MODULE_PARM_DESC(init_hctx, "Fault injection to fail hctx init. init_hctx=<interval>,<probability>,<space>,<times>");
+#endif
+
+static int g_queue_mode = NULL_Q_MQ;
+
+static int null_param_store_val(const char *str, int *val, int min, int max)
+{
+	int ret, new_val;
+
+	ret = kstrtoint(str, 10, &new_val);
+	if (ret)
+		return -EINVAL;
+
+	if (new_val < min || new_val > max)
+		return -EINVAL;
+
+	*val = new_val;
+	return 0;
+}
+
+static int null_set_queue_mode(const char *str, const struct kernel_param *kp)
+{
+	return null_param_store_val(str, &g_queue_mode, NULL_Q_BIO, NULL_Q_MQ);
+}
+
+static const struct kernel_param_ops null_queue_mode_param_ops = {
+	.set	= null_set_queue_mode,
+	.get	= param_get_int,
+};
+
+device_param_cb(queue_mode, &null_queue_mode_param_ops, &g_queue_mode, 0444);
+MODULE_PARM_DESC(queue_mode, "Block interface to use (0=bio,1=rq,2=multiqueue)");
+
+static int g_gb = 250;
+module_param_named(gb, g_gb, int, 0444);
+MODULE_PARM_DESC(gb, "Size in GB");
+
+static int g_bs = 512;
+module_param_named(bs, g_bs, int, 0444);
+MODULE_PARM_DESC(bs, "Block size (in bytes)");
+
+static int g_max_sectors;
+module_param_named(max_sectors, g_max_sectors, int, 0444);
+MODULE_PARM_DESC(max_sectors, "Maximum size of a command (in 512B sectors)");
+
+static unsigned int nr_devices = 1;
+module_param(nr_devices, uint, 0444);
+MODULE_PARM_DESC(nr_devices, "Number of devices to register");
+
+static bool g_blocking;
+module_param_named(blocking, g_blocking, bool, 0444);
+MODULE_PARM_DESC(blocking, "Register as a blocking blk-mq driver device");
+
+static bool shared_tags;
+module_param(shared_tags, bool, 0444);
+MODULE_PARM_DESC(shared_tags, "Share tag set between devices for blk-mq");
+
+static bool g_shared_tag_bitmap;
+module_param_named(shared_tag_bitmap, g_shared_tag_bitmap, bool, 0444);
+MODULE_PARM_DESC(shared_tag_bitmap, "Use shared tag bitmap for all submission queues for blk-mq");
+
+static int g_irqmode = NULL_IRQ_SOFTIRQ;
+
+static int null_set_irqmode(const char *str, const struct kernel_param *kp)
+{
+	return null_param_store_val(str, &g_irqmode, NULL_IRQ_NONE,
+					NULL_IRQ_TIMER);
+}
+
+static const struct kernel_param_ops null_irqmode_param_ops = {
+	.set	= null_set_irqmode,
+	.get	= param_get_int,
+};
+
+device_param_cb(irqmode, &null_irqmode_param_ops, &g_irqmode, 0444);
+MODULE_PARM_DESC(irqmode, "IRQ completion handler. 0-none, 1-softirq, 2-timer");
+
+static unsigned long g_completion_nsec = 10000;
+module_param_named(completion_nsec, g_completion_nsec, ulong, 0444);
+MODULE_PARM_DESC(completion_nsec, "Time in ns to complete a request in hardware. Default: 10,000ns");
+
+static int g_hw_queue_depth = 64;
+module_param_named(hw_queue_depth, g_hw_queue_depth, int, 0444);
+MODULE_PARM_DESC(hw_queue_depth, "Queue depth for each hardware queue. Default: 64");
+
+static bool g_use_per_node_hctx;
+module_param_named(use_per_node_hctx, g_use_per_node_hctx, bool, 0444);
+MODULE_PARM_DESC(use_per_node_hctx, "Use per-node allocation for hardware context queues. Default: false");
+
+static bool g_memory_backed;
+module_param_named(memory_backed, g_memory_backed, bool, 0444);
+MODULE_PARM_DESC(memory_backed, "Create a memory-backed block device. Default: false");
+
+static bool g_discard;
+module_param_named(discard, g_discard, bool, 0444);
+MODULE_PARM_DESC(discard, "Support discard operations (requires memory-backed null_blk device). Default: false");
+
+static unsigned long g_cache_size;
+module_param_named(cache_size, g_cache_size, ulong, 0444);
+MODULE_PARM_DESC(mbps, "Cache size in MiB for memory-backed device. Default: 0 (none)");
+
+static unsigned int g_mbps;
+module_param_named(mbps, g_mbps, uint, 0444);
+MODULE_PARM_DESC(mbps, "Limit maximum bandwidth (in MiB/s). Default: 0 (no limit)");
+
+static bool g_zoned;
+module_param_named(zoned, g_zoned, bool, S_IRUGO);
+MODULE_PARM_DESC(zoned, "Make device as a host-managed zoned block device. Default: false");
+
+static unsigned long g_zone_size = 256;
+module_param_named(zone_size, g_zone_size, ulong, S_IRUGO);
+MODULE_PARM_DESC(zone_size, "Zone size in MB when block device is zoned. Must be power-of-two: Default: 256");
+
+static unsigned long g_zone_capacity;
+module_param_named(zone_capacity, g_zone_capacity, ulong, 0444);
+MODULE_PARM_DESC(zone_capacity, "Zone capacity in MB when block device is zoned. Can be less than or equal to zone size. Default: Zone size");
+
+static unsigned int g_zone_nr_conv;
+module_param_named(zone_nr_conv, g_zone_nr_conv, uint, 0444);
+MODULE_PARM_DESC(zone_nr_conv, "Number of conventional zones when block device is zoned. Default: 0");
+
+static unsigned int g_zone_max_open;
+module_param_named(zone_max_open, g_zone_max_open, uint, 0444);
+MODULE_PARM_DESC(zone_max_open, "Maximum number of open zones when block device is zoned. Default: 0 (no limit)");
+
+static unsigned int g_zone_max_active;
+module_param_named(zone_max_active, g_zone_max_active, uint, 0444);
+MODULE_PARM_DESC(zone_max_active, "Maximum number of active zones when block device is zoned. Default: 0 (no limit)");
+
+static struct nullb_device *null_alloc_dev(void);
+static void null_free_dev(struct nullb_device *dev);
+static void null_del_dev(struct nullb *nullb);
+static int null_add_dev(struct nullb_device *dev);
+static struct nullb *null_find_dev_by_name(const char *name);
+static void null_free_device_storage(struct nullb_device *dev, bool is_cache);
+
+static inline struct nullb_device *to_nullb_device(struct config_item *item)
+{
+	return item ? container_of(item, struct nullb_device, item) : NULL;
+}
+
+static inline ssize_t nullb_device_uint_attr_show(unsigned int val, char *page)
+{
+	return snprintf(page, PAGE_SIZE, "%u\n", val);
+}
+
+static inline ssize_t nullb_device_ulong_attr_show(unsigned long val,
+	char *page)
+{
+	return snprintf(page, PAGE_SIZE, "%lu\n", val);
+}
+
+static inline ssize_t nullb_device_bool_attr_show(bool val, char *page)
+{
+	return snprintf(page, PAGE_SIZE, "%u\n", val);
+}
+
+static ssize_t nullb_device_uint_attr_store(unsigned int *val,
+	const char *page, size_t count)
+{
+	unsigned int tmp;
+	int result;
+
+	result = kstrtouint(page, 0, &tmp);
+	if (result < 0)
+		return result;
+
+	*val = tmp;
+	return count;
+}
+
+static ssize_t nullb_device_ulong_attr_store(unsigned long *val,
+	const char *page, size_t count)
+{
+	int result;
+	unsigned long tmp;
+
+	result = kstrtoul(page, 0, &tmp);
+	if (result < 0)
+		return result;
+
+	*val = tmp;
+	return count;
+}
+
+static ssize_t nullb_device_bool_attr_store(bool *val, const char *page,
+	size_t count)
+{
+	bool tmp;
+	int result;
+
+	result = kstrtobool(page,  &tmp);
+	if (result < 0)
+		return result;
+
+	*val = tmp;
+	return count;
+}
+
+/* The following macro should only be used with TYPE = {uint, ulong, bool}. */
+#define NULLB_DEVICE_ATTR(NAME, TYPE, APPLY)				\
+static ssize_t								\
+nullb_device_##NAME##_show(struct config_item *item, char *page)	\
+{									\
+	return nullb_device_##TYPE##_attr_show(				\
+				to_nullb_device(item)->NAME, page);	\
+}									\
+static ssize_t								\
+nullb_device_##NAME##_store(struct config_item *item, const char *page,	\
+			    size_t count)				\
+{									\
+	int (*apply_fn)(struct nullb_device *dev, TYPE new_value) = APPLY;\
+	struct nullb_device *dev = to_nullb_device(item);		\
+	TYPE new_value = 0;						\
+	int ret;							\
+									\
+	ret = nullb_device_##TYPE##_attr_store(&new_value, page, count);\
+	if (ret < 0)							\
+		return ret;						\
+	if (apply_fn)							\
+		ret = apply_fn(dev, new_value);				\
+	else if (test_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags)) 	\
+		ret = -EBUSY;						\
+	if (ret < 0)							\
+		return ret;						\
+	dev->NAME = new_value;						\
+	return count;							\
+}									\
+CONFIGFS_ATTR(nullb_device_, NAME);
+
+static int nullb_update_nr_hw_queues(struct nullb_device *dev,
+				     unsigned int submit_queues,
+				     unsigned int poll_queues)
+
+{
+	struct blk_mq_tag_set *set;
+	int ret, nr_hw_queues;
+
+	if (!dev->nullb)
+		return 0;
+
+	/*
+	 * Make sure at least one submit queue exists.
+	 */
+	if (!submit_queues)
+		return -EINVAL;
+
+	/*
+	 * Make sure that null_init_hctx() does not access nullb->queues[] past
+	 * the end of that array.
+	 */
+	if (submit_queues > nr_cpu_ids || poll_queues > g_poll_queues)
+		return -EINVAL;
+
+	/*
+	 * Keep previous and new queue numbers in nullb_device for reference in
+	 * the call back function null_map_queues().
+	 */
+	dev->prev_submit_queues = dev->submit_queues;
+	dev->prev_poll_queues = dev->poll_queues;
+	dev->submit_queues = submit_queues;
+	dev->poll_queues = poll_queues;
+
+	set = dev->nullb->tag_set;
+	nr_hw_queues = submit_queues + poll_queues;
+	blk_mq_update_nr_hw_queues(set, nr_hw_queues);
+	ret = set->nr_hw_queues == nr_hw_queues ? 0 : -ENOMEM;
+
+	if (ret) {
+		/* on error, revert the queue numbers */
+		dev->submit_queues = dev->prev_submit_queues;
+		dev->poll_queues = dev->prev_poll_queues;
+	}
+
+	return ret;
+}
+
+static int nullb_apply_submit_queues(struct nullb_device *dev,
+				     unsigned int submit_queues)
+{
+	return nullb_update_nr_hw_queues(dev, submit_queues, dev->poll_queues);
+}
+
+static int nullb_apply_poll_queues(struct nullb_device *dev,
+				   unsigned int poll_queues)
+{
+	return nullb_update_nr_hw_queues(dev, dev->submit_queues, poll_queues);
+}
+
+NULLB_DEVICE_ATTR(size, ulong, NULL);
+NULLB_DEVICE_ATTR(completion_nsec, ulong, NULL);
+NULLB_DEVICE_ATTR(submit_queues, uint, nullb_apply_submit_queues);
+NULLB_DEVICE_ATTR(poll_queues, uint, nullb_apply_poll_queues);
+NULLB_DEVICE_ATTR(home_node, uint, NULL);
+NULLB_DEVICE_ATTR(queue_mode, uint, NULL);
+NULLB_DEVICE_ATTR(blocksize, uint, NULL);
+NULLB_DEVICE_ATTR(max_sectors, uint, NULL);
+NULLB_DEVICE_ATTR(irqmode, uint, NULL);
+NULLB_DEVICE_ATTR(hw_queue_depth, uint, NULL);
+NULLB_DEVICE_ATTR(index, uint, NULL);
+NULLB_DEVICE_ATTR(blocking, bool, NULL);
+NULLB_DEVICE_ATTR(use_per_node_hctx, bool, NULL);
+NULLB_DEVICE_ATTR(memory_backed, bool, NULL);
+NULLB_DEVICE_ATTR(discard, bool, NULL);
+NULLB_DEVICE_ATTR(mbps, uint, NULL);
+NULLB_DEVICE_ATTR(cache_size, ulong, NULL);
+NULLB_DEVICE_ATTR(zoned, bool, NULL);
+NULLB_DEVICE_ATTR(zone_size, ulong, NULL);
+NULLB_DEVICE_ATTR(zone_capacity, ulong, NULL);
+NULLB_DEVICE_ATTR(zone_nr_conv, uint, NULL);
+NULLB_DEVICE_ATTR(zone_max_open, uint, NULL);
+NULLB_DEVICE_ATTR(zone_max_active, uint, NULL);
+NULLB_DEVICE_ATTR(virt_boundary, bool, NULL);
+NULLB_DEVICE_ATTR(no_sched, bool, NULL);
+NULLB_DEVICE_ATTR(shared_tag_bitmap, bool, NULL);
+
+static ssize_t nullb_device_power_show(struct config_item *item, char *page)
+{
+	return nullb_device_bool_attr_show(to_nullb_device(item)->power, page);
+}
+
+static ssize_t nullb_device_power_store(struct config_item *item,
+				     const char *page, size_t count)
+{
+	struct nullb_device *dev = to_nullb_device(item);
+	bool newp = false;
+	ssize_t ret;
+
+	ret = nullb_device_bool_attr_store(&newp, page, count);
+	if (ret < 0)
+		return ret;
+
+	if (!dev->power && newp) {
+		if (test_and_set_bit(NULLB_DEV_FL_UP, &dev->flags))
+			return count;
+		ret = null_add_dev(dev);
+		if (ret) {
+			clear_bit(NULLB_DEV_FL_UP, &dev->flags);
+			return ret;
+		}
+
+		set_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags);
+		dev->power = newp;
+	} else if (dev->power && !newp) {
+		if (test_and_clear_bit(NULLB_DEV_FL_UP, &dev->flags)) {
+			mutex_lock(&lock);
+			dev->power = newp;
+			null_del_dev(dev->nullb);
+			mutex_unlock(&lock);
+		}
+		clear_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags);
+	}
+
+	return count;
+}
+
+CONFIGFS_ATTR(nullb_device_, power);
+
+static ssize_t nullb_device_badblocks_show(struct config_item *item, char *page)
+{
+	struct nullb_device *t_dev = to_nullb_device(item);
+
+	return badblocks_show(&t_dev->badblocks, page, 0);
+}
+
+static ssize_t nullb_device_badblocks_store(struct config_item *item,
+				     const char *page, size_t count)
+{
+	struct nullb_device *t_dev = to_nullb_device(item);
+	char *orig, *buf, *tmp;
+	u64 start, end;
+	int ret;
+
+	orig = kstrndup(page, count, GFP_KERNEL);
+	if (!orig)
+		return -ENOMEM;
+
+	buf = strstrip(orig);
+
+	ret = -EINVAL;
+	if (buf[0] != '+' && buf[0] != '-')
+		goto out;
+	tmp = strchr(&buf[1], '-');
+	if (!tmp)
+		goto out;
+	*tmp = '\0';
+	ret = kstrtoull(buf + 1, 0, &start);
+	if (ret)
+		goto out;
+	ret = kstrtoull(tmp + 1, 0, &end);
+	if (ret)
+		goto out;
+	ret = -EINVAL;
+	if (start > end)
+		goto out;
+	/* enable badblocks */
+	cmpxchg(&t_dev->badblocks.shift, -1, 0);
+	if (buf[0] == '+')
+		ret = badblocks_set(&t_dev->badblocks, start,
+			end - start + 1, 1);
+	else
+		ret = badblocks_clear(&t_dev->badblocks, start,
+			end - start + 1);
+	if (ret == 0)
+		ret = count;
+out:
+	kfree(orig);
+	return ret;
+}
+CONFIGFS_ATTR(nullb_device_, badblocks);
+
+static struct configfs_attribute *nullb_device_attrs[] = {
+	&nullb_device_attr_size,
+	&nullb_device_attr_completion_nsec,
+	&nullb_device_attr_submit_queues,
+	&nullb_device_attr_poll_queues,
+	&nullb_device_attr_home_node,
+	&nullb_device_attr_queue_mode,
+	&nullb_device_attr_blocksize,
+	&nullb_device_attr_max_sectors,
+	&nullb_device_attr_irqmode,
+	&nullb_device_attr_hw_queue_depth,
+	&nullb_device_attr_index,
+	&nullb_device_attr_blocking,
+	&nullb_device_attr_use_per_node_hctx,
+	&nullb_device_attr_power,
+	&nullb_device_attr_memory_backed,
+	&nullb_device_attr_discard,
+	&nullb_device_attr_mbps,
+	&nullb_device_attr_cache_size,
+	&nullb_device_attr_badblocks,
+	&nullb_device_attr_zoned,
+	&nullb_device_attr_zone_size,
+	&nullb_device_attr_zone_capacity,
+	&nullb_device_attr_zone_nr_conv,
+	&nullb_device_attr_zone_max_open,
+	&nullb_device_attr_zone_max_active,
+	&nullb_device_attr_virt_boundary,
+	&nullb_device_attr_no_sched,
+	&nullb_device_attr_shared_tag_bitmap,
+	NULL,
+};
+
+static void nullb_device_release(struct config_item *item)
+{
+	struct nullb_device *dev = to_nullb_device(item);
+
+	null_free_device_storage(dev, false);
+	null_free_dev(dev);
+}
+
+static struct configfs_item_operations nullb_device_ops = {
+	.release	= nullb_device_release,
+};
+
+static const struct config_item_type nullb_device_type = {
+	.ct_item_ops	= &nullb_device_ops,
+	.ct_attrs	= nullb_device_attrs,
+	.ct_owner	= THIS_MODULE,
+};
+
+static struct
+config_item *nullb_group_make_item(struct config_group *group, const char *name)
+{
+	struct nullb_device *dev;
+
+	if (null_find_dev_by_name(name))
+		return ERR_PTR(-EEXIST);
+
+	dev = null_alloc_dev();
+	if (!dev)
+		return ERR_PTR(-ENOMEM);
+
+	config_item_init_type_name(&dev->item, name, &nullb_device_type);
+
+	return &dev->item;
+}
+
+static void
+nullb_group_drop_item(struct config_group *group, struct config_item *item)
+{
+	struct nullb_device *dev = to_nullb_device(item);
+
+	if (test_and_clear_bit(NULLB_DEV_FL_UP, &dev->flags)) {
+		mutex_lock(&lock);
+		dev->power = false;
+		null_del_dev(dev->nullb);
+		mutex_unlock(&lock);
+	}
+
+	config_item_put(item);
+}
+
+static ssize_t memb_group_features_show(struct config_item *item, char *page)
+{
+	return snprintf(page, PAGE_SIZE,
+			"badblocks,blocking,blocksize,cache_size,"
+			"completion_nsec,discard,home_node,hw_queue_depth,"
+			"irqmode,max_sectors,mbps,memory_backed,no_sched,"
+			"poll_queues,power,queue_mode,shared_tag_bitmap,size,"
+			"submit_queues,use_per_node_hctx,virt_boundary,zoned,"
+			"zone_capacity,zone_max_active,zone_max_open,"
+			"zone_nr_conv,zone_size\n");
+}
+
+CONFIGFS_ATTR_RO(memb_group_, features);
+
+static struct configfs_attribute *nullb_group_attrs[] = {
+	&memb_group_attr_features,
+	NULL,
+};
+
+static struct configfs_group_operations nullb_group_ops = {
+	.make_item	= nullb_group_make_item,
+	.drop_item	= nullb_group_drop_item,
+};
+
+static const struct config_item_type nullb_group_type = {
+	.ct_group_ops	= &nullb_group_ops,
+	.ct_attrs	= nullb_group_attrs,
+	.ct_owner	= THIS_MODULE,
+};
+
+static struct configfs_subsystem nullb_subsys = {
+	.su_group = {
+		.cg_item = {
+			.ci_namebuf = "nullb",
+			.ci_type = &nullb_group_type,
+		},
+	},
+};
+
+static inline int null_cache_active(struct nullb *nullb)
+{
+	return test_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags);
+}
+
+static struct nullb_device *null_alloc_dev(void)
+{
+	struct nullb_device *dev;
+
+	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+	if (!dev)
+		return NULL;
+	INIT_RADIX_TREE(&dev->data, GFP_ATOMIC);
+	INIT_RADIX_TREE(&dev->cache, GFP_ATOMIC);
+	if (badblocks_init(&dev->badblocks, 0)) {
+		kfree(dev);
+		return NULL;
+	}
+
+	dev->size = g_gb * 1024;
+	dev->completion_nsec = g_completion_nsec;
+	dev->submit_queues = g_submit_queues;
+	dev->prev_submit_queues = g_submit_queues;
+	dev->poll_queues = g_poll_queues;
+	dev->prev_poll_queues = g_poll_queues;
+	dev->home_node = g_home_node;
+	dev->queue_mode = g_queue_mode;
+	dev->blocksize = g_bs;
+	dev->max_sectors = g_max_sectors;
+	dev->irqmode = g_irqmode;
+	dev->hw_queue_depth = g_hw_queue_depth;
+	dev->blocking = g_blocking;
+	dev->memory_backed = g_memory_backed;
+	dev->discard = g_discard;
+	dev->cache_size = g_cache_size;
+	dev->mbps = g_mbps;
+	dev->use_per_node_hctx = g_use_per_node_hctx;
+	dev->zoned = g_zoned;
+	dev->zone_size = g_zone_size;
+	dev->zone_capacity = g_zone_capacity;
+	dev->zone_nr_conv = g_zone_nr_conv;
+	dev->zone_max_open = g_zone_max_open;
+	dev->zone_max_active = g_zone_max_active;
+	dev->virt_boundary = g_virt_boundary;
+	dev->no_sched = g_no_sched;
+	dev->shared_tag_bitmap = g_shared_tag_bitmap;
+	return dev;
+}
+
+static void null_free_dev(struct nullb_device *dev)
+{
+	if (!dev)
+		return;
+
+	null_free_zoned_dev(dev);
+	badblocks_exit(&dev->badblocks);
+	kfree(dev);
+}
+
+static void put_tag(struct nullb_queue *nq, unsigned int tag)
+{
+	clear_bit_unlock(tag, nq->tag_map);
+
+	if (waitqueue_active(&nq->wait))
+		wake_up(&nq->wait);
+}
+
+static unsigned int get_tag(struct nullb_queue *nq)
+{
+	unsigned int tag;
+
+	do {
+		tag = find_first_zero_bit(nq->tag_map, nq->queue_depth);
+		if (tag >= nq->queue_depth)
+			return -1U;
+	} while (test_and_set_bit_lock(tag, nq->tag_map));
+
+	return tag;
+}
+
+static void free_cmd(struct nullb_cmd *cmd)
+{
+	put_tag(cmd->nq, cmd->tag);
+}
+
+static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer);
+
+static struct nullb_cmd *__alloc_cmd(struct nullb_queue *nq)
+{
+	struct nullb_cmd *cmd;
+	unsigned int tag;
+
+	tag = get_tag(nq);
+	if (tag != -1U) {
+		cmd = &nq->cmds[tag];
+		cmd->tag = tag;
+		cmd->error = BLK_STS_OK;
+		cmd->nq = nq;
+		if (nq->dev->irqmode == NULL_IRQ_TIMER) {
+			hrtimer_init(&cmd->timer, CLOCK_MONOTONIC,
+				     HRTIMER_MODE_REL);
+			cmd->timer.function = null_cmd_timer_expired;
+		}
+		return cmd;
+	}
+
+	return NULL;
+}
+
+static struct nullb_cmd *alloc_cmd(struct nullb_queue *nq, struct bio *bio)
+{
+	struct nullb_cmd *cmd;
+	DEFINE_WAIT(wait);
+
+	do {
+		/*
+		 * This avoids multiple return statements, multiple calls to
+		 * __alloc_cmd() and a fast path call to prepare_to_wait().
+		 */
+		cmd = __alloc_cmd(nq);
+		if (cmd) {
+			cmd->bio = bio;
+			return cmd;
+		}
+		prepare_to_wait(&nq->wait, &wait, TASK_UNINTERRUPTIBLE);
+		io_schedule();
+		finish_wait(&nq->wait, &wait);
+	} while (1);
+}
+
+static void end_cmd(struct nullb_cmd *cmd)
+{
+	int queue_mode = cmd->nq->dev->queue_mode;
+
+	switch (queue_mode)  {
+	case NULL_Q_MQ:
+		blk_mq_end_request(cmd->rq, cmd->error);
+		return;
+	case NULL_Q_BIO:
+		cmd->bio->bi_status = cmd->error;
+		bio_endio(cmd->bio);
+		break;
+	}
+
+	free_cmd(cmd);
+}
+
+static enum hrtimer_restart null_cmd_timer_expired(struct hrtimer *timer)
+{
+	end_cmd(container_of(timer, struct nullb_cmd, timer));
+
+	return HRTIMER_NORESTART;
+}
+
+static void null_cmd_end_timer(struct nullb_cmd *cmd)
+{
+	ktime_t kt = cmd->nq->dev->completion_nsec;
+
+	hrtimer_start(&cmd->timer, kt, HRTIMER_MODE_REL);
+}
+
+static void null_complete_rq(struct request *rq)
+{
+	end_cmd(blk_mq_rq_to_pdu(rq));
+}
+
+static struct nullb_page *null_alloc_page(void)
+{
+	struct nullb_page *t_page;
+
+	t_page = kmalloc(sizeof(struct nullb_page), GFP_NOIO);
+	if (!t_page)
+		return NULL;
+
+	t_page->page = alloc_pages(GFP_NOIO, 0);
+	if (!t_page->page) {
+		kfree(t_page);
+		return NULL;
+	}
+
+	memset(t_page->bitmap, 0, sizeof(t_page->bitmap));
+	return t_page;
+}
+
+static void null_free_page(struct nullb_page *t_page)
+{
+	__set_bit(NULLB_PAGE_FREE, t_page->bitmap);
+	if (test_bit(NULLB_PAGE_LOCK, t_page->bitmap))
+		return;
+	__free_page(t_page->page);
+	kfree(t_page);
+}
+
+static bool null_page_empty(struct nullb_page *page)
+{
+	int size = MAP_SZ - 2;
+
+	return find_first_bit(page->bitmap, size) == size;
+}
+
+static void null_free_sector(struct nullb *nullb, sector_t sector,
+	bool is_cache)
+{
+	unsigned int sector_bit;
+	u64 idx;
+	struct nullb_page *t_page, *ret;
+	struct radix_tree_root *root;
+
+	root = is_cache ? &nullb->dev->cache : &nullb->dev->data;
+	idx = sector >> PAGE_SECTORS_SHIFT;
+	sector_bit = (sector & SECTOR_MASK);
+
+	t_page = radix_tree_lookup(root, idx);
+	if (t_page) {
+		__clear_bit(sector_bit, t_page->bitmap);
+
+		if (null_page_empty(t_page)) {
+			ret = radix_tree_delete_item(root, idx, t_page);
+			WARN_ON(ret != t_page);
+			null_free_page(ret);
+			if (is_cache)
+				nullb->dev->curr_cache -= PAGE_SIZE;
+		}
+	}
+}
+
+static struct nullb_page *null_radix_tree_insert(struct nullb *nullb, u64 idx,
+	struct nullb_page *t_page, bool is_cache)
+{
+	struct radix_tree_root *root;
+
+	root = is_cache ? &nullb->dev->cache : &nullb->dev->data;
+
+	if (radix_tree_insert(root, idx, t_page)) {
+		null_free_page(t_page);
+		t_page = radix_tree_lookup(root, idx);
+		WARN_ON(!t_page || t_page->page->index != idx);
+	} else if (is_cache)
+		nullb->dev->curr_cache += PAGE_SIZE;
+
+	return t_page;
+}
+
+static void null_free_device_storage(struct nullb_device *dev, bool is_cache)
+{
+	unsigned long pos = 0;
+	int nr_pages;
+	struct nullb_page *ret, *t_pages[FREE_BATCH];
+	struct radix_tree_root *root;
+
+	root = is_cache ? &dev->cache : &dev->data;
+
+	do {
+		int i;
+
+		nr_pages = radix_tree_gang_lookup(root,
+				(void **)t_pages, pos, FREE_BATCH);
+
+		for (i = 0; i < nr_pages; i++) {
+			pos = t_pages[i]->page->index;
+			ret = radix_tree_delete_item(root, pos, t_pages[i]);
+			WARN_ON(ret != t_pages[i]);
+			null_free_page(ret);
+		}
+
+		pos++;
+	} while (nr_pages == FREE_BATCH);
+
+	if (is_cache)
+		dev->curr_cache = 0;
+}
+
+static struct nullb_page *__null_lookup_page(struct nullb *nullb,
+	sector_t sector, bool for_write, bool is_cache)
+{
+	unsigned int sector_bit;
+	u64 idx;
+	struct nullb_page *t_page;
+	struct radix_tree_root *root;
+
+	idx = sector >> PAGE_SECTORS_SHIFT;
+	sector_bit = (sector & SECTOR_MASK);
+
+	root = is_cache ? &nullb->dev->cache : &nullb->dev->data;
+	t_page = radix_tree_lookup(root, idx);
+	WARN_ON(t_page && t_page->page->index != idx);
+
+	if (t_page && (for_write || test_bit(sector_bit, t_page->bitmap)))
+		return t_page;
+
+	return NULL;
+}
+
+static struct nullb_page *null_lookup_page(struct nullb *nullb,
+	sector_t sector, bool for_write, bool ignore_cache)
+{
+	struct nullb_page *page = NULL;
+
+	if (!ignore_cache)
+		page = __null_lookup_page(nullb, sector, for_write, true);
+	if (page)
+		return page;
+	return __null_lookup_page(nullb, sector, for_write, false);
+}
+
+static struct nullb_page *null_insert_page(struct nullb *nullb,
+					   sector_t sector, bool ignore_cache)
+	__releases(&nullb->lock)
+	__acquires(&nullb->lock)
+{
+	u64 idx;
+	struct nullb_page *t_page;
+
+	t_page = null_lookup_page(nullb, sector, true, ignore_cache);
+	if (t_page)
+		return t_page;
+
+	spin_unlock_irq(&nullb->lock);
+
+	t_page = null_alloc_page();
+	if (!t_page)
+		goto out_lock;
+
+	if (radix_tree_preload(GFP_NOIO))
+		goto out_freepage;
+
+	spin_lock_irq(&nullb->lock);
+	idx = sector >> PAGE_SECTORS_SHIFT;
+	t_page->page->index = idx;
+	t_page = null_radix_tree_insert(nullb, idx, t_page, !ignore_cache);
+	radix_tree_preload_end();
+
+	return t_page;
+out_freepage:
+	null_free_page(t_page);
+out_lock:
+	spin_lock_irq(&nullb->lock);
+	return null_lookup_page(nullb, sector, true, ignore_cache);
+}
+
+static int null_flush_cache_page(struct nullb *nullb, struct nullb_page *c_page)
+{
+	int i;
+	unsigned int offset;
+	u64 idx;
+	struct nullb_page *t_page, *ret;
+	void *dst, *src;
+
+	idx = c_page->page->index;
+
+	t_page = null_insert_page(nullb, idx << PAGE_SECTORS_SHIFT, true);
+
+	__clear_bit(NULLB_PAGE_LOCK, c_page->bitmap);
+	if (test_bit(NULLB_PAGE_FREE, c_page->bitmap)) {
+		null_free_page(c_page);
+		if (t_page && null_page_empty(t_page)) {
+			ret = radix_tree_delete_item(&nullb->dev->data,
+				idx, t_page);
+			null_free_page(t_page);
+		}
+		return 0;
+	}
+
+	if (!t_page)
+		return -ENOMEM;
+
+	src = kmap_atomic(c_page->page);
+	dst = kmap_atomic(t_page->page);
+
+	for (i = 0; i < PAGE_SECTORS;
+			i += (nullb->dev->blocksize >> SECTOR_SHIFT)) {
+		if (test_bit(i, c_page->bitmap)) {
+			offset = (i << SECTOR_SHIFT);
+			memcpy(dst + offset, src + offset,
+				nullb->dev->blocksize);
+			__set_bit(i, t_page->bitmap);
+		}
+	}
+
+	kunmap_atomic(dst);
+	kunmap_atomic(src);
+
+	ret = radix_tree_delete_item(&nullb->dev->cache, idx, c_page);
+	null_free_page(ret);
+	nullb->dev->curr_cache -= PAGE_SIZE;
+
+	return 0;
+}
+
+static int null_make_cache_space(struct nullb *nullb, unsigned long n)
+{
+	int i, err, nr_pages;
+	struct nullb_page *c_pages[FREE_BATCH];
+	unsigned long flushed = 0, one_round;
+
+again:
+	if ((nullb->dev->cache_size * 1024 * 1024) >
+	     nullb->dev->curr_cache + n || nullb->dev->curr_cache == 0)
+		return 0;
+
+	nr_pages = radix_tree_gang_lookup(&nullb->dev->cache,
+			(void **)c_pages, nullb->cache_flush_pos, FREE_BATCH);
+	/*
+	 * nullb_flush_cache_page could unlock before using the c_pages. To
+	 * avoid race, we don't allow page free
+	 */
+	for (i = 0; i < nr_pages; i++) {
+		nullb->cache_flush_pos = c_pages[i]->page->index;
+		/*
+		 * We found the page which is being flushed to disk by other
+		 * threads
+		 */
+		if (test_bit(NULLB_PAGE_LOCK, c_pages[i]->bitmap))
+			c_pages[i] = NULL;
+		else
+			__set_bit(NULLB_PAGE_LOCK, c_pages[i]->bitmap);
+	}
+
+	one_round = 0;
+	for (i = 0; i < nr_pages; i++) {
+		if (c_pages[i] == NULL)
+			continue;
+		err = null_flush_cache_page(nullb, c_pages[i]);
+		if (err)
+			return err;
+		one_round++;
+	}
+	flushed += one_round << PAGE_SHIFT;
+
+	if (n > flushed) {
+		if (nr_pages == 0)
+			nullb->cache_flush_pos = 0;
+		if (one_round == 0) {
+			/* give other threads a chance */
+			spin_unlock_irq(&nullb->lock);
+			spin_lock_irq(&nullb->lock);
+		}
+		goto again;
+	}
+	return 0;
+}
+
+static int copy_to_nullb(struct nullb *nullb, struct page *source,
+	unsigned int off, sector_t sector, size_t n, bool is_fua)
+{
+	size_t temp, count = 0;
+	unsigned int offset;
+	struct nullb_page *t_page;
+	void *dst, *src;
+
+	while (count < n) {
+		temp = min_t(size_t, nullb->dev->blocksize, n - count);
+
+		if (null_cache_active(nullb) && !is_fua)
+			null_make_cache_space(nullb, PAGE_SIZE);
+
+		offset = (sector & SECTOR_MASK) << SECTOR_SHIFT;
+		t_page = null_insert_page(nullb, sector,
+			!null_cache_active(nullb) || is_fua);
+		if (!t_page)
+			return -ENOSPC;
+
+		src = kmap_atomic(source);
+		dst = kmap_atomic(t_page->page);
+		memcpy(dst + offset, src + off + count, temp);
+		kunmap_atomic(dst);
+		kunmap_atomic(src);
+
+		__set_bit(sector & SECTOR_MASK, t_page->bitmap);
+
+		if (is_fua)
+			null_free_sector(nullb, sector, true);
+
+		count += temp;
+		sector += temp >> SECTOR_SHIFT;
+	}
+	return 0;
+}
+
+static int copy_from_nullb(struct nullb *nullb, struct page *dest,
+	unsigned int off, sector_t sector, size_t n)
+{
+	size_t temp, count = 0;
+	unsigned int offset;
+	struct nullb_page *t_page;
+	void *dst, *src;
+
+	while (count < n) {
+		temp = min_t(size_t, nullb->dev->blocksize, n - count);
+
+		offset = (sector & SECTOR_MASK) << SECTOR_SHIFT;
+		t_page = null_lookup_page(nullb, sector, false,
+			!null_cache_active(nullb));
+
+		dst = kmap_atomic(dest);
+		if (!t_page) {
+			memset(dst + off + count, 0, temp);
+			goto next;
+		}
+		src = kmap_atomic(t_page->page);
+		memcpy(dst + off + count, src + offset, temp);
+		kunmap_atomic(src);
+next:
+		kunmap_atomic(dst);
+
+		count += temp;
+		sector += temp >> SECTOR_SHIFT;
+	}
+	return 0;
+}
+
+static void nullb_fill_pattern(struct nullb *nullb, struct page *page,
+			       unsigned int len, unsigned int off)
+{
+	void *dst;
+
+	dst = kmap_atomic(page);
+	memset(dst + off, 0xFF, len);
+	kunmap_atomic(dst);
+}
+
+blk_status_t null_handle_discard(struct nullb_device *dev,
+				 sector_t sector, sector_t nr_sectors)
+{
+	struct nullb *nullb = dev->nullb;
+	size_t n = nr_sectors << SECTOR_SHIFT;
+	size_t temp;
+
+	spin_lock_irq(&nullb->lock);
+	while (n > 0) {
+		temp = min_t(size_t, n, dev->blocksize);
+		null_free_sector(nullb, sector, false);
+		if (null_cache_active(nullb))
+			null_free_sector(nullb, sector, true);
+		sector += temp >> SECTOR_SHIFT;
+		n -= temp;
+	}
+	spin_unlock_irq(&nullb->lock);
+
+	return BLK_STS_OK;
+}
+
+static int null_handle_flush(struct nullb *nullb)
+{
+	int err;
+
+	if (!null_cache_active(nullb))
+		return 0;
+
+	spin_lock_irq(&nullb->lock);
+	while (true) {
+		err = null_make_cache_space(nullb,
+			nullb->dev->cache_size * 1024 * 1024);
+		if (err || nullb->dev->curr_cache == 0)
+			break;
+	}
+
+	WARN_ON(!radix_tree_empty(&nullb->dev->cache));
+	spin_unlock_irq(&nullb->lock);
+	return err;
+}
+
+static int null_transfer(struct nullb *nullb, struct page *page,
+	unsigned int len, unsigned int off, bool is_write, sector_t sector,
+	bool is_fua)
+{
+	struct nullb_device *dev = nullb->dev;
+	unsigned int valid_len = len;
+	int err = 0;
+
+	if (!is_write) {
+		if (dev->zoned)
+			valid_len = null_zone_valid_read_len(nullb,
+				sector, len);
+
+		if (valid_len) {
+			err = copy_from_nullb(nullb, page, off,
+				sector, valid_len);
+			off += valid_len;
+			len -= valid_len;
+		}
+
+		if (len)
+			nullb_fill_pattern(nullb, page, len, off);
+		flush_dcache_page(page);
+	} else {
+		flush_dcache_page(page);
+		err = copy_to_nullb(nullb, page, off, sector, len, is_fua);
+	}
+
+	return err;
+}
+
+static int null_handle_rq(struct nullb_cmd *cmd)
+{
+	struct request *rq = cmd->rq;
+	struct nullb *nullb = cmd->nq->dev->nullb;
+	int err;
+	unsigned int len;
+	sector_t sector = blk_rq_pos(rq);
+	struct req_iterator iter;
+	struct bio_vec bvec;
+
+	spin_lock_irq(&nullb->lock);
+	rq_for_each_segment(bvec, rq, iter) {
+		len = bvec.bv_len;
+		err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset,
+				     op_is_write(req_op(rq)), sector,
+				     rq->cmd_flags & REQ_FUA);
+		if (err) {
+			spin_unlock_irq(&nullb->lock);
+			return err;
+		}
+		sector += len >> SECTOR_SHIFT;
+	}
+	spin_unlock_irq(&nullb->lock);
+
+	return 0;
+}
+
+static int null_handle_bio(struct nullb_cmd *cmd)
+{
+	struct bio *bio = cmd->bio;
+	struct nullb *nullb = cmd->nq->dev->nullb;
+	int err;
+	unsigned int len;
+	sector_t sector = bio->bi_iter.bi_sector;
+	struct bio_vec bvec;
+	struct bvec_iter iter;
+
+	spin_lock_irq(&nullb->lock);
+	bio_for_each_segment(bvec, bio, iter) {
+		len = bvec.bv_len;
+		err = null_transfer(nullb, bvec.bv_page, len, bvec.bv_offset,
+				     op_is_write(bio_op(bio)), sector,
+				     bio->bi_opf & REQ_FUA);
+		if (err) {
+			spin_unlock_irq(&nullb->lock);
+			return err;
+		}
+		sector += len >> SECTOR_SHIFT;
+	}
+	spin_unlock_irq(&nullb->lock);
+	return 0;
+}
+
+static void null_stop_queue(struct nullb *nullb)
+{
+	struct request_queue *q = nullb->q;
+
+	if (nullb->dev->queue_mode == NULL_Q_MQ)
+		blk_mq_stop_hw_queues(q);
+}
+
+static void null_restart_queue_async(struct nullb *nullb)
+{
+	struct request_queue *q = nullb->q;
+
+	if (nullb->dev->queue_mode == NULL_Q_MQ)
+		blk_mq_start_stopped_hw_queues(q, true);
+}
+
+static inline blk_status_t null_handle_throttled(struct nullb_cmd *cmd)
+{
+	struct nullb_device *dev = cmd->nq->dev;
+	struct nullb *nullb = dev->nullb;
+	blk_status_t sts = BLK_STS_OK;
+	struct request *rq = cmd->rq;
+
+	if (!hrtimer_active(&nullb->bw_timer))
+		hrtimer_restart(&nullb->bw_timer);
+
+	if (atomic_long_sub_return(blk_rq_bytes(rq), &nullb->cur_bytes) < 0) {
+		null_stop_queue(nullb);
+		/* race with timer */
+		if (atomic_long_read(&nullb->cur_bytes) > 0)
+			null_restart_queue_async(nullb);
+		/* requeue request */
+		sts = BLK_STS_DEV_RESOURCE;
+	}
+	return sts;
+}
+
+static inline blk_status_t null_handle_badblocks(struct nullb_cmd *cmd,
+						 sector_t sector,
+						 sector_t nr_sectors)
+{
+	struct badblocks *bb = &cmd->nq->dev->badblocks;
+	sector_t first_bad;
+	int bad_sectors;
+
+	if (badblocks_check(bb, sector, nr_sectors, &first_bad, &bad_sectors))
+		return BLK_STS_IOERR;
+
+	return BLK_STS_OK;
+}
+
+static inline blk_status_t null_handle_memory_backed(struct nullb_cmd *cmd,
+						     enum req_op op,
+						     sector_t sector,
+						     sector_t nr_sectors)
+{
+	struct nullb_device *dev = cmd->nq->dev;
+	int err;
+
+	if (op == REQ_OP_DISCARD)
+		return null_handle_discard(dev, sector, nr_sectors);
+
+	if (dev->queue_mode == NULL_Q_BIO)
+		err = null_handle_bio(cmd);
+	else
+		err = null_handle_rq(cmd);
+
+	return errno_to_blk_status(err);
+}
+
+static void nullb_zero_read_cmd_buffer(struct nullb_cmd *cmd)
+{
+	struct nullb_device *dev = cmd->nq->dev;
+	struct bio *bio;
+
+	if (dev->memory_backed)
+		return;
+
+	if (dev->queue_mode == NULL_Q_BIO && bio_op(cmd->bio) == REQ_OP_READ) {
+		zero_fill_bio(cmd->bio);
+	} else if (req_op(cmd->rq) == REQ_OP_READ) {
+		__rq_for_each_bio(bio, cmd->rq)
+			zero_fill_bio(bio);
+	}
+}
+
+static inline void nullb_complete_cmd(struct nullb_cmd *cmd)
+{
+	/*
+	 * Since root privileges are required to configure the null_blk
+	 * driver, it is fine that this driver does not initialize the
+	 * data buffers of read commands. Zero-initialize these buffers
+	 * anyway if KMSAN is enabled to prevent that KMSAN complains
+	 * about null_blk not initializing read data buffers.
+	 */
+	if (IS_ENABLED(CONFIG_KMSAN))
+		nullb_zero_read_cmd_buffer(cmd);
+
+	/* Complete IO by inline, softirq or timer */
+	switch (cmd->nq->dev->irqmode) {
+	case NULL_IRQ_SOFTIRQ:
+		switch (cmd->nq->dev->queue_mode) {
+		case NULL_Q_MQ:
+			blk_mq_complete_request(cmd->rq);
+			break;
+		case NULL_Q_BIO:
+			/*
+			 * XXX: no proper submitting cpu information available.
+			 */
+			end_cmd(cmd);
+			break;
+		}
+		break;
+	case NULL_IRQ_NONE:
+		end_cmd(cmd);
+		break;
+	case NULL_IRQ_TIMER:
+		null_cmd_end_timer(cmd);
+		break;
+	}
+}
+
+blk_status_t null_process_cmd(struct nullb_cmd *cmd, enum req_op op,
+			      sector_t sector, unsigned int nr_sectors)
+{
+	struct nullb_device *dev = cmd->nq->dev;
+	blk_status_t ret;
+
+	if (dev->badblocks.shift != -1) {
+		ret = null_handle_badblocks(cmd, sector, nr_sectors);
+		if (ret != BLK_STS_OK)
+			return ret;
+	}
+
+	if (dev->memory_backed)
+		return null_handle_memory_backed(cmd, op, sector, nr_sectors);
+
+	return BLK_STS_OK;
+}
+
+static blk_status_t null_handle_cmd(struct nullb_cmd *cmd, sector_t sector,
+				    sector_t nr_sectors, enum req_op op)
+{
+	struct nullb_device *dev = cmd->nq->dev;
+	struct nullb *nullb = dev->nullb;
+	blk_status_t sts;
+
+	if (test_bit(NULLB_DEV_FL_THROTTLED, &dev->flags)) {
+		sts = null_handle_throttled(cmd);
+		if (sts != BLK_STS_OK)
+			return sts;
+	}
+
+	if (op == REQ_OP_FLUSH) {
+		cmd->error = errno_to_blk_status(null_handle_flush(nullb));
+		goto out;
+	}
+
+	if (dev->zoned)
+		sts = null_process_zoned_cmd(cmd, op, sector, nr_sectors);
+	else
+		sts = null_process_cmd(cmd, op, sector, nr_sectors);
+
+	/* Do not overwrite errors (e.g. timeout errors) */
+	if (cmd->error == BLK_STS_OK)
+		cmd->error = sts;
+
+out:
+	nullb_complete_cmd(cmd);
+	return BLK_STS_OK;
+}
+
+static enum hrtimer_restart nullb_bwtimer_fn(struct hrtimer *timer)
+{
+	struct nullb *nullb = container_of(timer, struct nullb, bw_timer);
+	ktime_t timer_interval = ktime_set(0, TIMER_INTERVAL);
+	unsigned int mbps = nullb->dev->mbps;
+
+	if (atomic_long_read(&nullb->cur_bytes) == mb_per_tick(mbps))
+		return HRTIMER_NORESTART;
+
+	atomic_long_set(&nullb->cur_bytes, mb_per_tick(mbps));
+	null_restart_queue_async(nullb);
+
+	hrtimer_forward_now(&nullb->bw_timer, timer_interval);
+
+	return HRTIMER_RESTART;
+}
+
+static void nullb_setup_bwtimer(struct nullb *nullb)
+{
+	ktime_t timer_interval = ktime_set(0, TIMER_INTERVAL);
+
+	hrtimer_init(&nullb->bw_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	nullb->bw_timer.function = nullb_bwtimer_fn;
+	atomic_long_set(&nullb->cur_bytes, mb_per_tick(nullb->dev->mbps));
+	hrtimer_start(&nullb->bw_timer, timer_interval, HRTIMER_MODE_REL);
+}
+
+static struct nullb_queue *nullb_to_queue(struct nullb *nullb)
+{
+	int index = 0;
+
+	if (nullb->nr_queues != 1)
+		index = raw_smp_processor_id() / ((nr_cpu_ids + nullb->nr_queues - 1) / nullb->nr_queues);
+
+	return &nullb->queues[index];
+}
+
+static void null_submit_bio(struct bio *bio)
+{
+	sector_t sector = bio->bi_iter.bi_sector;
+	sector_t nr_sectors = bio_sectors(bio);
+	struct nullb *nullb = bio->bi_bdev->bd_disk->private_data;
+	struct nullb_queue *nq = nullb_to_queue(nullb);
+
+	null_handle_cmd(alloc_cmd(nq, bio), sector, nr_sectors, bio_op(bio));
+}
+
+static bool should_timeout_request(struct request *rq)
+{
+#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
+	if (g_timeout_str[0])
+		return should_fail(&null_timeout_attr, 1);
+#endif
+	return false;
+}
+
+static bool should_requeue_request(struct request *rq)
+{
+#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
+	if (g_requeue_str[0])
+		return should_fail(&null_requeue_attr, 1);
+#endif
+	return false;
+}
+
+static void null_map_queues(struct blk_mq_tag_set *set)
+{
+	struct nullb *nullb = set->driver_data;
+	int i, qoff;
+	unsigned int submit_queues = g_submit_queues;
+	unsigned int poll_queues = g_poll_queues;
+
+	if (nullb) {
+		struct nullb_device *dev = nullb->dev;
+
+		/*
+		 * Refer nr_hw_queues of the tag set to check if the expected
+		 * number of hardware queues are prepared. If block layer failed
+		 * to prepare them, use previous numbers of submit queues and
+		 * poll queues to map queues.
+		 */
+		if (set->nr_hw_queues ==
+		    dev->submit_queues + dev->poll_queues) {
+			submit_queues = dev->submit_queues;
+			poll_queues = dev->poll_queues;
+		} else if (set->nr_hw_queues ==
+			   dev->prev_submit_queues + dev->prev_poll_queues) {
+			submit_queues = dev->prev_submit_queues;
+			poll_queues = dev->prev_poll_queues;
+		} else {
+			pr_warn("tag set has unexpected nr_hw_queues: %d\n",
+				set->nr_hw_queues);
+			WARN_ON_ONCE(true);
+			submit_queues = 1;
+			poll_queues = 0;
+		}
+	}
+
+	for (i = 0, qoff = 0; i < set->nr_maps; i++) {
+		struct blk_mq_queue_map *map = &set->map[i];
+
+		switch (i) {
+		case HCTX_TYPE_DEFAULT:
+			map->nr_queues = submit_queues;
+			break;
+		case HCTX_TYPE_READ:
+			map->nr_queues = 0;
+			continue;
+		case HCTX_TYPE_POLL:
+			map->nr_queues = poll_queues;
+			break;
+		}
+		map->queue_offset = qoff;
+		qoff += map->nr_queues;
+		blk_mq_map_queues(map);
+	}
+}
+
+static int null_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
+{
+	struct nullb_queue *nq = hctx->driver_data;
+	LIST_HEAD(list);
+	int nr = 0;
+	struct request *rq;
+
+	spin_lock(&nq->poll_lock);
+	list_splice_init(&nq->poll_list, &list);
+	list_for_each_entry(rq, &list, queuelist)
+		blk_mq_set_request_complete(rq);
+	spin_unlock(&nq->poll_lock);
+
+	while (!list_empty(&list)) {
+		struct nullb_cmd *cmd;
+		struct request *req;
+
+		req = list_first_entry(&list, struct request, queuelist);
+		list_del_init(&req->queuelist);
+		cmd = blk_mq_rq_to_pdu(req);
+		cmd->error = null_process_cmd(cmd, req_op(req), blk_rq_pos(req),
+						blk_rq_sectors(req));
+		if (!blk_mq_add_to_batch(req, iob, (__force int) cmd->error,
+					blk_mq_end_request_batch))
+			end_cmd(cmd);
+		nr++;
+	}
+
+	return nr;
+}
+
+static enum blk_eh_timer_return null_timeout_rq(struct request *rq)
+{
+	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
+	struct nullb_cmd *cmd = blk_mq_rq_to_pdu(rq);
+
+	if (hctx->type == HCTX_TYPE_POLL) {
+		struct nullb_queue *nq = hctx->driver_data;
+
+		spin_lock(&nq->poll_lock);
+		/* The request may have completed meanwhile. */
+		if (blk_mq_request_completed(rq)) {
+			spin_unlock(&nq->poll_lock);
+			return BLK_EH_DONE;
+		}
+		list_del_init(&rq->queuelist);
+		spin_unlock(&nq->poll_lock);
+	}
+
+	pr_info("rq %p timed out\n", rq);
+
+	/*
+	 * If the device is marked as blocking (i.e. memory backed or zoned
+	 * device), the submission path may be blocked waiting for resources
+	 * and cause real timeouts. For these real timeouts, the submission
+	 * path will complete the request using blk_mq_complete_request().
+	 * Only fake timeouts need to execute blk_mq_complete_request() here.
+	 */
+	cmd->error = BLK_STS_TIMEOUT;
+	if (cmd->fake_timeout || hctx->type == HCTX_TYPE_POLL)
+		blk_mq_complete_request(rq);
+	return BLK_EH_DONE;
+}
+
+static blk_status_t null_queue_rq(struct blk_mq_hw_ctx *hctx,
+			 const struct blk_mq_queue_data *bd)
+{
+	struct nullb_cmd *cmd = blk_mq_rq_to_pdu(bd->rq);
+	struct nullb_queue *nq = hctx->driver_data;
+	sector_t nr_sectors = blk_rq_sectors(bd->rq);
+	sector_t sector = blk_rq_pos(bd->rq);
+	const bool is_poll = hctx->type == HCTX_TYPE_POLL;
+
+	might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
+
+	if (!is_poll && nq->dev->irqmode == NULL_IRQ_TIMER) {
+		hrtimer_init(&cmd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+		cmd->timer.function = null_cmd_timer_expired;
+	}
+	cmd->rq = bd->rq;
+	cmd->error = BLK_STS_OK;
+	cmd->nq = nq;
+	cmd->fake_timeout = should_timeout_request(bd->rq) ||
+		blk_should_fake_timeout(bd->rq->q);
+
+	blk_mq_start_request(bd->rq);
+
+	if (should_requeue_request(bd->rq)) {
+		/*
+		 * Alternate between hitting the core BUSY path, and the
+		 * driver driven requeue path
+		 */
+		nq->requeue_selection++;
+		if (nq->requeue_selection & 1)
+			return BLK_STS_RESOURCE;
+		else {
+			blk_mq_requeue_request(bd->rq, true);
+			return BLK_STS_OK;
+		}
+	}
+
+	if (is_poll) {
+		spin_lock(&nq->poll_lock);
+		list_add_tail(&bd->rq->queuelist, &nq->poll_list);
+		spin_unlock(&nq->poll_lock);
+		return BLK_STS_OK;
+	}
+	if (cmd->fake_timeout)
+		return BLK_STS_OK;
+
+	return null_handle_cmd(cmd, sector, nr_sectors, req_op(bd->rq));
+}
+
+static void cleanup_queue(struct nullb_queue *nq)
+{
+	bitmap_free(nq->tag_map);
+	kfree(nq->cmds);
+}
+
+static void cleanup_queues(struct nullb *nullb)
+{
+	int i;
+
+	for (i = 0; i < nullb->nr_queues; i++)
+		cleanup_queue(&nullb->queues[i]);
+
+	kfree(nullb->queues);
+}
+
+static void null_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
+{
+	struct nullb_queue *nq = hctx->driver_data;
+	struct nullb *nullb = nq->dev->nullb;
+
+	nullb->nr_queues--;
+}
+
+static void null_init_queue(struct nullb *nullb, struct nullb_queue *nq)
+{
+	init_waitqueue_head(&nq->wait);
+	nq->queue_depth = nullb->queue_depth;
+	nq->dev = nullb->dev;
+	INIT_LIST_HEAD(&nq->poll_list);
+	spin_lock_init(&nq->poll_lock);
+}
+
+static int null_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
+			  unsigned int hctx_idx)
+{
+	struct nullb *nullb = hctx->queue->queuedata;
+	struct nullb_queue *nq;
+
+#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
+	if (g_init_hctx_str[0] && should_fail(&null_init_hctx_attr, 1))
+		return -EFAULT;
+#endif
+
+	nq = &nullb->queues[hctx_idx];
+	hctx->driver_data = nq;
+	null_init_queue(nullb, nq);
+	nullb->nr_queues++;
+
+	return 0;
+}
+
+static const struct blk_mq_ops null_mq_ops = {
+	.queue_rq       = null_queue_rq,
+	.complete	= null_complete_rq,
+	.timeout	= null_timeout_rq,
+	.poll		= null_poll,
+	.map_queues	= null_map_queues,
+	.init_hctx	= null_init_hctx,
+	.exit_hctx	= null_exit_hctx,
+};
+
+static void null_del_dev(struct nullb *nullb)
+{
+	struct nullb_device *dev;
+
+	if (!nullb)
+		return;
+
+	dev = nullb->dev;
+
+	ida_simple_remove(&nullb_indexes, nullb->index);
+
+	list_del_init(&nullb->list);
+
+	del_gendisk(nullb->disk);
+
+	if (test_bit(NULLB_DEV_FL_THROTTLED, &nullb->dev->flags)) {
+		hrtimer_cancel(&nullb->bw_timer);
+		atomic_long_set(&nullb->cur_bytes, LONG_MAX);
+		null_restart_queue_async(nullb);
+	}
+
+	put_disk(nullb->disk);
+	if (dev->queue_mode == NULL_Q_MQ &&
+	    nullb->tag_set == &nullb->__tag_set)
+		blk_mq_free_tag_set(nullb->tag_set);
+	cleanup_queues(nullb);
+	if (null_cache_active(nullb))
+		null_free_device_storage(nullb->dev, true);
+	kfree(nullb);
+	dev->nullb = NULL;
+}
+
+static void null_config_discard(struct nullb *nullb)
+{
+	if (nullb->dev->discard == false)
+		return;
+
+	if (!nullb->dev->memory_backed) {
+		nullb->dev->discard = false;
+		pr_info("discard option is ignored without memory backing\n");
+		return;
+	}
+
+	if (nullb->dev->zoned) {
+		nullb->dev->discard = false;
+		pr_info("discard option is ignored in zoned mode\n");
+		return;
+	}
+
+	nullb->q->limits.discard_granularity = nullb->dev->blocksize;
+	blk_queue_max_discard_sectors(nullb->q, UINT_MAX >> 9);
+}
+
+static const struct block_device_operations null_bio_ops = {
+	.owner		= THIS_MODULE,
+	.submit_bio	= null_submit_bio,
+	.report_zones	= null_report_zones,
+};
+
+static const struct block_device_operations null_rq_ops = {
+	.owner		= THIS_MODULE,
+	.report_zones	= null_report_zones,
+};
+
+static int setup_commands(struct nullb_queue *nq)
+{
+	struct nullb_cmd *cmd;
+	int i;
+
+	nq->cmds = kcalloc(nq->queue_depth, sizeof(*cmd), GFP_KERNEL);
+	if (!nq->cmds)
+		return -ENOMEM;
+
+	nq->tag_map = bitmap_zalloc(nq->queue_depth, GFP_KERNEL);
+	if (!nq->tag_map) {
+		kfree(nq->cmds);
+		return -ENOMEM;
+	}
+
+	for (i = 0; i < nq->queue_depth; i++) {
+		cmd = &nq->cmds[i];
+		cmd->tag = -1U;
+	}
+
+	return 0;
+}
+
+static int setup_queues(struct nullb *nullb)
+{
+	int nqueues = nr_cpu_ids;
+
+	if (g_poll_queues)
+		nqueues += g_poll_queues;
+
+	nullb->queues = kcalloc(nqueues, sizeof(struct nullb_queue),
+				GFP_KERNEL);
+	if (!nullb->queues)
+		return -ENOMEM;
+
+	nullb->queue_depth = nullb->dev->hw_queue_depth;
+	return 0;
+}
+
+static int init_driver_queues(struct nullb *nullb)
+{
+	struct nullb_queue *nq;
+	int i, ret = 0;
+
+	for (i = 0; i < nullb->dev->submit_queues; i++) {
+		nq = &nullb->queues[i];
+
+		null_init_queue(nullb, nq);
+
+		ret = setup_commands(nq);
+		if (ret)
+			return ret;
+		nullb->nr_queues++;
+	}
+	return 0;
+}
+
+static int null_gendisk_register(struct nullb *nullb)
+{
+	sector_t size = ((sector_t)nullb->dev->size * SZ_1M) >> SECTOR_SHIFT;
+	struct gendisk *disk = nullb->disk;
+
+	set_capacity(disk, size);
+
+	disk->major		= null_major;
+	disk->first_minor	= nullb->index;
+	disk->minors		= 1;
+	if (queue_is_mq(nullb->q))
+		disk->fops		= &null_rq_ops;
+	else
+		disk->fops		= &null_bio_ops;
+	disk->private_data	= nullb;
+	strncpy(disk->disk_name, nullb->disk_name, DISK_NAME_LEN);
+
+	if (nullb->dev->zoned) {
+		int ret = null_register_zoned_dev(nullb);
+
+		if (ret)
+			return ret;
+	}
+
+	return add_disk(disk);
+}
+
+static int null_init_tag_set(struct nullb *nullb, struct blk_mq_tag_set *set)
+{
+	unsigned int flags = BLK_MQ_F_SHOULD_MERGE;
+	int hw_queues, numa_node;
+	unsigned int queue_depth;
+	int poll_queues;
+
+	if (nullb) {
+		hw_queues = nullb->dev->submit_queues;
+		poll_queues = nullb->dev->poll_queues;
+		queue_depth = nullb->dev->hw_queue_depth;
+		numa_node = nullb->dev->home_node;
+		if (nullb->dev->no_sched)
+			flags |= BLK_MQ_F_NO_SCHED;
+		if (nullb->dev->shared_tag_bitmap)
+			flags |= BLK_MQ_F_TAG_HCTX_SHARED;
+		if (nullb->dev->blocking)
+			flags |= BLK_MQ_F_BLOCKING;
+	} else {
+		hw_queues = g_submit_queues;
+		poll_queues = g_poll_queues;
+		queue_depth = g_hw_queue_depth;
+		numa_node = g_home_node;
+		if (g_no_sched)
+			flags |= BLK_MQ_F_NO_SCHED;
+		if (g_shared_tag_bitmap)
+			flags |= BLK_MQ_F_TAG_HCTX_SHARED;
+		if (g_blocking)
+			flags |= BLK_MQ_F_BLOCKING;
+	}
+
+	set->ops = &null_mq_ops;
+	set->cmd_size	= sizeof(struct nullb_cmd);
+	set->flags = flags;
+	set->driver_data = nullb;
+	set->nr_hw_queues = hw_queues;
+	set->queue_depth = queue_depth;
+	set->numa_node = numa_node;
+	if (poll_queues) {
+		set->nr_hw_queues += poll_queues;
+		set->nr_maps = 3;
+	} else {
+		set->nr_maps = 1;
+	}
+
+	return blk_mq_alloc_tag_set(set);
+}
+
+static int null_validate_conf(struct nullb_device *dev)
+{
+	if (dev->queue_mode == NULL_Q_RQ) {
+		pr_err("legacy IO path is no longer available\n");
+		return -EINVAL;
+	}
+
+	dev->blocksize = round_down(dev->blocksize, 512);
+	dev->blocksize = clamp_t(unsigned int, dev->blocksize, 512, 4096);
+
+	if (dev->queue_mode == NULL_Q_MQ && dev->use_per_node_hctx) {
+		if (dev->submit_queues != nr_online_nodes)
+			dev->submit_queues = nr_online_nodes;
+	} else if (dev->submit_queues > nr_cpu_ids)
+		dev->submit_queues = nr_cpu_ids;
+	else if (dev->submit_queues == 0)
+		dev->submit_queues = 1;
+	dev->prev_submit_queues = dev->submit_queues;
+
+	if (dev->poll_queues > g_poll_queues)
+		dev->poll_queues = g_poll_queues;
+	dev->prev_poll_queues = dev->poll_queues;
+
+	dev->queue_mode = min_t(unsigned int, dev->queue_mode, NULL_Q_MQ);
+	dev->irqmode = min_t(unsigned int, dev->irqmode, NULL_IRQ_TIMER);
+
+	/* Do memory allocation, so set blocking */
+	if (dev->memory_backed)
+		dev->blocking = true;
+	else /* cache is meaningless */
+		dev->cache_size = 0;
+	dev->cache_size = min_t(unsigned long, ULONG_MAX / 1024 / 1024,
+						dev->cache_size);
+	dev->mbps = min_t(unsigned int, 1024 * 40, dev->mbps);
+	/* can not stop a queue */
+	if (dev->queue_mode == NULL_Q_BIO)
+		dev->mbps = 0;
+
+	if (dev->zoned &&
+	    (!dev->zone_size || !is_power_of_2(dev->zone_size))) {
+		pr_err("zone_size must be power-of-two\n");
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
+static bool __null_setup_fault(struct fault_attr *attr, char *str)
+{
+	if (!str[0])
+		return true;
+
+	if (!setup_fault_attr(attr, str))
+		return false;
+
+	attr->verbose = 0;
+	return true;
+}
+#endif
+
+static bool null_setup_fault(void)
+{
+#ifdef CONFIG_BLK_DEV_NULL_BLK_FAULT_INJECTION
+	if (!__null_setup_fault(&null_timeout_attr, g_timeout_str))
+		return false;
+	if (!__null_setup_fault(&null_requeue_attr, g_requeue_str))
+		return false;
+	if (!__null_setup_fault(&null_init_hctx_attr, g_init_hctx_str))
+		return false;
+#endif
+	return true;
+}
+
+static int null_add_dev(struct nullb_device *dev)
+{
+	struct nullb *nullb;
+	int rv;
+
+	rv = null_validate_conf(dev);
+	if (rv)
+		return rv;
+
+	nullb = kzalloc_node(sizeof(*nullb), GFP_KERNEL, dev->home_node);
+	if (!nullb) {
+		rv = -ENOMEM;
+		goto out;
+	}
+	nullb->dev = dev;
+	dev->nullb = nullb;
+
+	spin_lock_init(&nullb->lock);
+
+	rv = setup_queues(nullb);
+	if (rv)
+		goto out_free_nullb;
+
+	if (dev->queue_mode == NULL_Q_MQ) {
+		if (shared_tags) {
+			nullb->tag_set = &tag_set;
+			rv = 0;
+		} else {
+			nullb->tag_set = &nullb->__tag_set;
+			rv = null_init_tag_set(nullb, nullb->tag_set);
+		}
+
+		if (rv)
+			goto out_cleanup_queues;
+
+		if (!null_setup_fault())
+			goto out_cleanup_tags;
+
+		nullb->tag_set->timeout = 5 * HZ;
+		nullb->disk = blk_mq_alloc_disk(nullb->tag_set, nullb);
+		if (IS_ERR(nullb->disk)) {
+			rv = PTR_ERR(nullb->disk);
+			goto out_cleanup_tags;
+		}
+		nullb->q = nullb->disk->queue;
+	} else if (dev->queue_mode == NULL_Q_BIO) {
+		rv = -ENOMEM;
+		nullb->disk = blk_alloc_disk(nullb->dev->home_node);
+		if (!nullb->disk)
+			goto out_cleanup_queues;
+
+		nullb->q = nullb->disk->queue;
+		rv = init_driver_queues(nullb);
+		if (rv)
+			goto out_cleanup_disk;
+	}
+
+	if (dev->mbps) {
+		set_bit(NULLB_DEV_FL_THROTTLED, &dev->flags);
+		nullb_setup_bwtimer(nullb);
+	}
+
+	if (dev->cache_size > 0) {
+		set_bit(NULLB_DEV_FL_CACHE, &nullb->dev->flags);
+		blk_queue_write_cache(nullb->q, true, true);
+	}
+
+	if (dev->zoned) {
+		rv = null_init_zoned_dev(dev, nullb->q);
+		if (rv)
+			goto out_cleanup_disk;
+	}
+
+	nullb->q->queuedata = nullb;
+	blk_queue_flag_set(QUEUE_FLAG_NONROT, nullb->q);
+	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, nullb->q);
+
+	mutex_lock(&lock);
+	rv = ida_simple_get(&nullb_indexes, 0, 0, GFP_KERNEL);
+	if (rv < 0) {
+		mutex_unlock(&lock);
+		goto out_cleanup_zone;
+	}
+	nullb->index = rv;
+	dev->index = rv;
+	mutex_unlock(&lock);
+
+	blk_queue_logical_block_size(nullb->q, dev->blocksize);
+	blk_queue_physical_block_size(nullb->q, dev->blocksize);
+	if (dev->max_sectors)
+		blk_queue_max_hw_sectors(nullb->q, dev->max_sectors);
+
+	if (dev->virt_boundary)
+		blk_queue_virt_boundary(nullb->q, PAGE_SIZE - 1);
+
+	null_config_discard(nullb);
+
+	if (config_item_name(&dev->item)) {
+		/* Use configfs dir name as the device name */
+		snprintf(nullb->disk_name, sizeof(nullb->disk_name),
+			 "%s", config_item_name(&dev->item));
+	} else {
+		sprintf(nullb->disk_name, "nullb%d", nullb->index);
+	}
+
+	rv = null_gendisk_register(nullb);
+	if (rv)
+		goto out_ida_free;
+
+	mutex_lock(&lock);
+	list_add_tail(&nullb->list, &nullb_list);
+	mutex_unlock(&lock);
+
+	pr_info("disk %s created\n", nullb->disk_name);
+
+	return 0;
+
+out_ida_free:
+	ida_free(&nullb_indexes, nullb->index);
+out_cleanup_zone:
+	null_free_zoned_dev(dev);
+out_cleanup_disk:
+	put_disk(nullb->disk);
+out_cleanup_tags:
+	if (dev->queue_mode == NULL_Q_MQ && nullb->tag_set == &nullb->__tag_set)
+		blk_mq_free_tag_set(nullb->tag_set);
+out_cleanup_queues:
+	cleanup_queues(nullb);
+out_free_nullb:
+	kfree(nullb);
+	dev->nullb = NULL;
+out:
+	return rv;
+}
+
+static struct nullb *null_find_dev_by_name(const char *name)
+{
+	struct nullb *nullb = NULL, *nb;
+
+	mutex_lock(&lock);
+	list_for_each_entry(nb, &nullb_list, list) {
+		if (strcmp(nb->disk_name, name) == 0) {
+			nullb = nb;
+			break;
+		}
+	}
+	mutex_unlock(&lock);
+
+	return nullb;
+}
+
+static int null_create_dev(void)
+{
+	struct nullb_device *dev;
+	int ret;
+
+	dev = null_alloc_dev();
+	if (!dev)
+		return -ENOMEM;
+
+	ret = null_add_dev(dev);
+	if (ret) {
+		null_free_dev(dev);
+		return ret;
+	}
+
+	return 0;
+}
+
+static void null_destroy_dev(struct nullb *nullb)
+{
+	struct nullb_device *dev = nullb->dev;
+
+	null_del_dev(nullb);
+	null_free_device_storage(dev, false);
+	null_free_dev(dev);
+}
+
+static int __init null_init(void)
+{
+	int ret = 0;
+	unsigned int i;
+	struct nullb *nullb;
+
+	if (g_bs > PAGE_SIZE) {
+		pr_warn("invalid block size\n");
+		pr_warn("defaults block size to %lu\n", PAGE_SIZE);
+		g_bs = PAGE_SIZE;
+	}
+
+	if (g_home_node != NUMA_NO_NODE && g_home_node >= nr_online_nodes) {
+		pr_err("invalid home_node value\n");
+		g_home_node = NUMA_NO_NODE;
+	}
+
+	if (g_queue_mode == NULL_Q_RQ) {
+		pr_err("legacy IO path is no longer available\n");
+		return -EINVAL;
+	}
+
+	if (g_queue_mode == NULL_Q_MQ && g_use_per_node_hctx) {
+		if (g_submit_queues != nr_online_nodes) {
+			pr_warn("submit_queues param is set to %u.\n",
+				nr_online_nodes);
+			g_submit_queues = nr_online_nodes;
+		}
+	} else if (g_submit_queues > nr_cpu_ids) {
+		g_submit_queues = nr_cpu_ids;
+	} else if (g_submit_queues <= 0) {
+		g_submit_queues = 1;
+	}
+
+	if (g_queue_mode == NULL_Q_MQ && shared_tags) {
+		ret = null_init_tag_set(NULL, &tag_set);
+		if (ret)
+			return ret;
+	}
+
+	config_group_init(&nullb_subsys.su_group);
+	mutex_init(&nullb_subsys.su_mutex);
+
+	ret = configfs_register_subsystem(&nullb_subsys);
+	if (ret)
+		goto err_tagset;
+
+	mutex_init(&lock);
+
+	null_major = register_blkdev(0, "nullb");
+	if (null_major < 0) {
+		ret = null_major;
+		goto err_conf;
+	}
+
+	for (i = 0; i < nr_devices; i++) {
+		ret = null_create_dev();
+		if (ret)
+			goto err_dev;
+	}
+
+	pr_info("module loaded\n");
+	return 0;
+
+err_dev:
+	while (!list_empty(&nullb_list)) {
+		nullb = list_entry(nullb_list.next, struct nullb, list);
+		null_destroy_dev(nullb);
+	}
+	unregister_blkdev(null_major, "nullb");
+err_conf:
+	configfs_unregister_subsystem(&nullb_subsys);
+err_tagset:
+	if (g_queue_mode == NULL_Q_MQ && shared_tags)
+		blk_mq_free_tag_set(&tag_set);
+	return ret;
+}
+
+static void __exit null_exit(void)
+{
+	struct nullb *nullb;
+
+	configfs_unregister_subsystem(&nullb_subsys);
+
+	unregister_blkdev(null_major, "nullb");
+
+	mutex_lock(&lock);
+	while (!list_empty(&nullb_list)) {
+		nullb = list_entry(nullb_list.next, struct nullb, list);
+		null_destroy_dev(nullb);
+	}
+	mutex_unlock(&lock);
+
+	if (g_queue_mode == NULL_Q_MQ && shared_tags)
+		blk_mq_free_tag_set(&tag_set);
+}
+
+module_init(null_init);
+module_exit(null_exit);
+
+MODULE_AUTHOR("Jens Axboe <axboe@kernel.dk>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/block/null_blk/null_blk.h b/drivers/block/null_blk/null_blk.h
new file mode 100644
index 000000000..94ff68052
--- /dev/null
+++ b/drivers/block/null_blk/null_blk.h
@@ -0,0 +1,179 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __BLK_NULL_BLK_H
+#define __BLK_NULL_BLK_H
+
+#undef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/blkdev.h>
+#include <linux/slab.h>
+#include <linux/blk-mq.h>
+#include <linux/hrtimer.h>
+#include <linux/configfs.h>
+#include <linux/badblocks.h>
+#include <linux/fault-inject.h>
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+
+struct nullb_cmd {
+	union {
+		struct request *rq;
+		struct bio *bio;
+	};
+	unsigned int tag;
+	blk_status_t error;
+	bool fake_timeout;
+	struct nullb_queue *nq;
+	struct hrtimer timer;
+};
+
+struct nullb_queue {
+	unsigned long *tag_map;
+	wait_queue_head_t wait;
+	unsigned int queue_depth;
+	struct nullb_device *dev;
+	unsigned int requeue_selection;
+
+	struct list_head poll_list;
+	spinlock_t poll_lock;
+
+	struct nullb_cmd *cmds;
+};
+
+struct nullb_zone {
+	/*
+	 * Zone lock to prevent concurrent modification of a zone write
+	 * pointer position and condition: with memory backing, a write
+	 * command execution may sleep on memory allocation. For this case,
+	 * use mutex as the zone lock. Otherwise, use the spinlock for
+	 * locking the zone.
+	 */
+	union {
+		spinlock_t spinlock;
+		struct mutex mutex;
+	};
+	enum blk_zone_type type;
+	enum blk_zone_cond cond;
+	sector_t start;
+	sector_t wp;
+	unsigned int len;
+	unsigned int capacity;
+};
+
+/* Queue modes */
+enum {
+	NULL_Q_BIO	= 0,
+	NULL_Q_RQ	= 1,
+	NULL_Q_MQ	= 2,
+};
+
+struct nullb_device {
+	struct nullb *nullb;
+	struct config_item item;
+	struct radix_tree_root data; /* data stored in the disk */
+	struct radix_tree_root cache; /* disk cache data */
+	unsigned long flags; /* device flags */
+	unsigned int curr_cache;
+	struct badblocks badblocks;
+
+	unsigned int nr_zones;
+	unsigned int nr_zones_imp_open;
+	unsigned int nr_zones_exp_open;
+	unsigned int nr_zones_closed;
+	unsigned int imp_close_zone_no;
+	struct nullb_zone *zones;
+	sector_t zone_size_sects;
+	bool need_zone_res_mgmt;
+	spinlock_t zone_res_lock;
+
+	unsigned long size; /* device size in MB */
+	unsigned long completion_nsec; /* time in ns to complete a request */
+	unsigned long cache_size; /* disk cache size in MB */
+	unsigned long zone_size; /* zone size in MB if device is zoned */
+	unsigned long zone_capacity; /* zone capacity in MB if device is zoned */
+	unsigned int zone_nr_conv; /* number of conventional zones */
+	unsigned int zone_max_open; /* max number of open zones */
+	unsigned int zone_max_active; /* max number of active zones */
+	unsigned int submit_queues; /* number of submission queues */
+	unsigned int prev_submit_queues; /* number of submission queues before change */
+	unsigned int poll_queues; /* number of IOPOLL submission queues */
+	unsigned int prev_poll_queues; /* number of IOPOLL submission queues before change */
+	unsigned int home_node; /* home node for the device */
+	unsigned int queue_mode; /* block interface */
+	unsigned int blocksize; /* block size */
+	unsigned int max_sectors; /* Max sectors per command */
+	unsigned int irqmode; /* IRQ completion handler */
+	unsigned int hw_queue_depth; /* queue depth */
+	unsigned int index; /* index of the disk, only valid with a disk */
+	unsigned int mbps; /* Bandwidth throttle cap (in MB/s) */
+	bool blocking; /* blocking blk-mq device */
+	bool use_per_node_hctx; /* use per-node allocation for hardware context */
+	bool power; /* power on/off the device */
+	bool memory_backed; /* if data is stored in memory */
+	bool discard; /* if support discard */
+	bool zoned; /* if device is zoned */
+	bool virt_boundary; /* virtual boundary on/off for the device */
+	bool no_sched; /* no IO scheduler for the device */
+	bool shared_tag_bitmap; /* use hostwide shared tags */
+};
+
+struct nullb {
+	struct nullb_device *dev;
+	struct list_head list;
+	unsigned int index;
+	struct request_queue *q;
+	struct gendisk *disk;
+	struct blk_mq_tag_set *tag_set;
+	struct blk_mq_tag_set __tag_set;
+	unsigned int queue_depth;
+	atomic_long_t cur_bytes;
+	struct hrtimer bw_timer;
+	unsigned long cache_flush_pos;
+	spinlock_t lock;
+
+	struct nullb_queue *queues;
+	unsigned int nr_queues;
+	char disk_name[DISK_NAME_LEN];
+};
+
+blk_status_t null_handle_discard(struct nullb_device *dev, sector_t sector,
+				 sector_t nr_sectors);
+blk_status_t null_process_cmd(struct nullb_cmd *cmd, enum req_op op,
+			      sector_t sector, unsigned int nr_sectors);
+
+#ifdef CONFIG_BLK_DEV_ZONED
+int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q);
+int null_register_zoned_dev(struct nullb *nullb);
+void null_free_zoned_dev(struct nullb_device *dev);
+int null_report_zones(struct gendisk *disk, sector_t sector,
+		      unsigned int nr_zones, report_zones_cb cb, void *data);
+blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_op op,
+				    sector_t sector, sector_t nr_sectors);
+size_t null_zone_valid_read_len(struct nullb *nullb,
+				sector_t sector, unsigned int len);
+#else
+static inline int null_init_zoned_dev(struct nullb_device *dev,
+				      struct request_queue *q)
+{
+	pr_err("CONFIG_BLK_DEV_ZONED not enabled\n");
+	return -EINVAL;
+}
+static inline int null_register_zoned_dev(struct nullb *nullb)
+{
+	return -ENODEV;
+}
+static inline void null_free_zoned_dev(struct nullb_device *dev) {}
+static inline blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd,
+			enum req_op op, sector_t sector, sector_t nr_sectors)
+{
+	return BLK_STS_NOTSUPP;
+}
+static inline size_t null_zone_valid_read_len(struct nullb *nullb,
+					      sector_t sector,
+					      unsigned int len)
+{
+	return len;
+}
+#define null_report_zones	NULL
+#endif /* CONFIG_BLK_DEV_ZONED */
+#endif /* __NULL_BLK_H */
diff --git a/drivers/block/null_blk/trace.c b/drivers/block/null_blk/trace.c
new file mode 100644
index 000000000..3711cba16
--- /dev/null
+++ b/drivers/block/null_blk/trace.c
@@ -0,0 +1,21 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * null_blk trace related helpers.
+ *
+ * Copyright (C) 2020 Western Digital Corporation or its affiliates.
+ */
+#include "trace.h"
+
+/*
+ * Helper to use for all null_blk traces to extract disk name.
+ */
+const char *nullb_trace_disk_name(struct trace_seq *p, char *name)
+{
+	const char *ret = trace_seq_buffer_ptr(p);
+
+	if (name && *name)
+		trace_seq_printf(p, "disk=%s, ", name);
+	trace_seq_putc(p, 0);
+
+	return ret;
+}
diff --git a/drivers/block/null_blk/trace.h b/drivers/block/null_blk/trace.h
new file mode 100644
index 000000000..6b2b370e7
--- /dev/null
+++ b/drivers/block/null_blk/trace.h
@@ -0,0 +1,79 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * null_blk device driver tracepoints.
+ *
+ * Copyright (C) 2020 Western Digital Corporation or its affiliates.
+ */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM nullb
+
+#if !defined(_TRACE_NULLB_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_NULLB_H
+
+#include <linux/tracepoint.h>
+#include <linux/trace_seq.h>
+
+#include "null_blk.h"
+
+const char *nullb_trace_disk_name(struct trace_seq *p, char *name);
+
+#define __print_disk_name(name) nullb_trace_disk_name(p, name)
+
+#ifndef TRACE_HEADER_MULTI_READ
+static inline void __assign_disk_name(char *name, struct gendisk *disk)
+{
+	if (disk)
+		memcpy(name, disk->disk_name, DISK_NAME_LEN);
+	else
+		memset(name, 0, DISK_NAME_LEN);
+}
+#endif
+
+TRACE_EVENT(nullb_zone_op,
+	    TP_PROTO(struct nullb_cmd *cmd, unsigned int zone_no,
+		     unsigned int zone_cond),
+	    TP_ARGS(cmd, zone_no, zone_cond),
+	    TP_STRUCT__entry(
+		__array(char, disk, DISK_NAME_LEN)
+		__field(enum req_op, op)
+		__field(unsigned int, zone_no)
+		__field(unsigned int, zone_cond)
+	    ),
+	    TP_fast_assign(
+		__entry->op = req_op(cmd->rq);
+		__entry->zone_no = zone_no;
+		__entry->zone_cond = zone_cond;
+		__assign_disk_name(__entry->disk, cmd->rq->q->disk);
+	    ),
+	    TP_printk("%s req=%-15s zone_no=%u zone_cond=%-10s",
+		      __print_disk_name(__entry->disk),
+		      blk_op_str(__entry->op),
+		      __entry->zone_no,
+		      blk_zone_cond_str(__entry->zone_cond))
+);
+
+TRACE_EVENT(nullb_report_zones,
+	    TP_PROTO(struct nullb *nullb, unsigned int nr_zones),
+	    TP_ARGS(nullb, nr_zones),
+	    TP_STRUCT__entry(
+		__array(char, disk, DISK_NAME_LEN)
+		__field(unsigned int, nr_zones)
+	    ),
+	    TP_fast_assign(
+		__entry->nr_zones = nr_zones;
+		__assign_disk_name(__entry->disk, nullb->disk);
+	    ),
+	    TP_printk("%s nr_zones=%u",
+		      __print_disk_name(__entry->disk), __entry->nr_zones)
+);
+
+#endif /* _TRACE_NULLB_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_FILE trace
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/drivers/block/null_blk/zoned.c b/drivers/block/null_blk/zoned.c
new file mode 100644
index 000000000..55a69e48e
--- /dev/null
+++ b/drivers/block/null_blk/zoned.c
@@ -0,0 +1,683 @@
+// SPDX-License-Identifier: GPL-2.0
+#include <linux/vmalloc.h>
+#include <linux/bitmap.h>
+#include "null_blk.h"
+
+#define CREATE_TRACE_POINTS
+#include "trace.h"
+
+#undef pr_fmt
+#define pr_fmt(fmt)	"null_blk: " fmt
+
+static inline sector_t mb_to_sects(unsigned long mb)
+{
+	return ((sector_t)mb * SZ_1M) >> SECTOR_SHIFT;
+}
+
+static inline unsigned int null_zone_no(struct nullb_device *dev, sector_t sect)
+{
+	return sect >> ilog2(dev->zone_size_sects);
+}
+
+static inline void null_lock_zone_res(struct nullb_device *dev)
+{
+	if (dev->need_zone_res_mgmt)
+		spin_lock_irq(&dev->zone_res_lock);
+}
+
+static inline void null_unlock_zone_res(struct nullb_device *dev)
+{
+	if (dev->need_zone_res_mgmt)
+		spin_unlock_irq(&dev->zone_res_lock);
+}
+
+static inline void null_init_zone_lock(struct nullb_device *dev,
+				       struct nullb_zone *zone)
+{
+	if (!dev->memory_backed)
+		spin_lock_init(&zone->spinlock);
+	else
+		mutex_init(&zone->mutex);
+}
+
+static inline void null_lock_zone(struct nullb_device *dev,
+				  struct nullb_zone *zone)
+{
+	if (!dev->memory_backed)
+		spin_lock_irq(&zone->spinlock);
+	else
+		mutex_lock(&zone->mutex);
+}
+
+static inline void null_unlock_zone(struct nullb_device *dev,
+				    struct nullb_zone *zone)
+{
+	if (!dev->memory_backed)
+		spin_unlock_irq(&zone->spinlock);
+	else
+		mutex_unlock(&zone->mutex);
+}
+
+int null_init_zoned_dev(struct nullb_device *dev, struct request_queue *q)
+{
+	sector_t dev_capacity_sects, zone_capacity_sects;
+	struct nullb_zone *zone;
+	sector_t sector = 0;
+	unsigned int i;
+
+	if (!is_power_of_2(dev->zone_size)) {
+		pr_err("zone_size must be power-of-two\n");
+		return -EINVAL;
+	}
+	if (dev->zone_size > dev->size) {
+		pr_err("Zone size larger than device capacity\n");
+		return -EINVAL;
+	}
+
+	if (!dev->zone_capacity)
+		dev->zone_capacity = dev->zone_size;
+
+	if (dev->zone_capacity > dev->zone_size) {
+		pr_err("zone capacity (%lu MB) larger than zone size (%lu MB)\n",
+		       dev->zone_capacity, dev->zone_size);
+		return -EINVAL;
+	}
+
+	zone_capacity_sects = mb_to_sects(dev->zone_capacity);
+	dev_capacity_sects = mb_to_sects(dev->size);
+	dev->zone_size_sects = mb_to_sects(dev->zone_size);
+	dev->nr_zones = round_up(dev_capacity_sects, dev->zone_size_sects)
+		>> ilog2(dev->zone_size_sects);
+
+	dev->zones = kvmalloc_array(dev->nr_zones, sizeof(struct nullb_zone),
+				    GFP_KERNEL | __GFP_ZERO);
+	if (!dev->zones)
+		return -ENOMEM;
+
+	spin_lock_init(&dev->zone_res_lock);
+
+	if (dev->zone_nr_conv >= dev->nr_zones) {
+		dev->zone_nr_conv = dev->nr_zones - 1;
+		pr_info("changed the number of conventional zones to %u",
+			dev->zone_nr_conv);
+	}
+
+	/* Max active zones has to be < nbr of seq zones in order to be enforceable */
+	if (dev->zone_max_active >= dev->nr_zones - dev->zone_nr_conv) {
+		dev->zone_max_active = 0;
+		pr_info("zone_max_active limit disabled, limit >= zone count\n");
+	}
+
+	/* Max open zones has to be <= max active zones */
+	if (dev->zone_max_active && dev->zone_max_open > dev->zone_max_active) {
+		dev->zone_max_open = dev->zone_max_active;
+		pr_info("changed the maximum number of open zones to %u\n",
+			dev->nr_zones);
+	} else if (dev->zone_max_open >= dev->nr_zones - dev->zone_nr_conv) {
+		dev->zone_max_open = 0;
+		pr_info("zone_max_open limit disabled, limit >= zone count\n");
+	}
+	dev->need_zone_res_mgmt = dev->zone_max_active || dev->zone_max_open;
+	dev->imp_close_zone_no = dev->zone_nr_conv;
+
+	for (i = 0; i <  dev->zone_nr_conv; i++) {
+		zone = &dev->zones[i];
+
+		null_init_zone_lock(dev, zone);
+		zone->start = sector;
+		zone->len = dev->zone_size_sects;
+		zone->capacity = zone->len;
+		zone->wp = zone->start + zone->len;
+		zone->type = BLK_ZONE_TYPE_CONVENTIONAL;
+		zone->cond = BLK_ZONE_COND_NOT_WP;
+
+		sector += dev->zone_size_sects;
+	}
+
+	for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) {
+		zone = &dev->zones[i];
+
+		null_init_zone_lock(dev, zone);
+		zone->start = zone->wp = sector;
+		if (zone->start + dev->zone_size_sects > dev_capacity_sects)
+			zone->len = dev_capacity_sects - zone->start;
+		else
+			zone->len = dev->zone_size_sects;
+		zone->capacity =
+			min_t(sector_t, zone->len, zone_capacity_sects);
+		zone->type = BLK_ZONE_TYPE_SEQWRITE_REQ;
+		zone->cond = BLK_ZONE_COND_EMPTY;
+
+		sector += dev->zone_size_sects;
+	}
+
+	return 0;
+}
+
+int null_register_zoned_dev(struct nullb *nullb)
+{
+	struct nullb_device *dev = nullb->dev;
+	struct request_queue *q = nullb->q;
+
+	disk_set_zoned(nullb->disk, BLK_ZONED_HM);
+	blk_queue_flag_set(QUEUE_FLAG_ZONE_RESETALL, q);
+	blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE);
+
+	if (queue_is_mq(q)) {
+		int ret = blk_revalidate_disk_zones(nullb->disk, NULL);
+
+		if (ret)
+			return ret;
+	} else {
+		blk_queue_chunk_sectors(q, dev->zone_size_sects);
+		nullb->disk->nr_zones = bdev_nr_zones(nullb->disk->part0);
+	}
+
+	blk_queue_max_zone_append_sectors(q, dev->zone_size_sects);
+	disk_set_max_open_zones(nullb->disk, dev->zone_max_open);
+	disk_set_max_active_zones(nullb->disk, dev->zone_max_active);
+
+	return 0;
+}
+
+void null_free_zoned_dev(struct nullb_device *dev)
+{
+	kvfree(dev->zones);
+	dev->zones = NULL;
+}
+
+int null_report_zones(struct gendisk *disk, sector_t sector,
+		unsigned int nr_zones, report_zones_cb cb, void *data)
+{
+	struct nullb *nullb = disk->private_data;
+	struct nullb_device *dev = nullb->dev;
+	unsigned int first_zone, i;
+	struct nullb_zone *zone;
+	struct blk_zone blkz;
+	int error;
+
+	first_zone = null_zone_no(dev, sector);
+	if (first_zone >= dev->nr_zones)
+		return 0;
+
+	nr_zones = min(nr_zones, dev->nr_zones - first_zone);
+	trace_nullb_report_zones(nullb, nr_zones);
+
+	memset(&blkz, 0, sizeof(struct blk_zone));
+	zone = &dev->zones[first_zone];
+	for (i = 0; i < nr_zones; i++, zone++) {
+		/*
+		 * Stacked DM target drivers will remap the zone information by
+		 * modifying the zone information passed to the report callback.
+		 * So use a local copy to avoid corruption of the device zone
+		 * array.
+		 */
+		null_lock_zone(dev, zone);
+		blkz.start = zone->start;
+		blkz.len = zone->len;
+		blkz.wp = zone->wp;
+		blkz.type = zone->type;
+		blkz.cond = zone->cond;
+		blkz.capacity = zone->capacity;
+		null_unlock_zone(dev, zone);
+
+		error = cb(&blkz, i, data);
+		if (error)
+			return error;
+	}
+
+	return nr_zones;
+}
+
+/*
+ * This is called in the case of memory backing from null_process_cmd()
+ * with the target zone already locked.
+ */
+size_t null_zone_valid_read_len(struct nullb *nullb,
+				sector_t sector, unsigned int len)
+{
+	struct nullb_device *dev = nullb->dev;
+	struct nullb_zone *zone = &dev->zones[null_zone_no(dev, sector)];
+	unsigned int nr_sectors = len >> SECTOR_SHIFT;
+
+	/* Read must be below the write pointer position */
+	if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL ||
+	    sector + nr_sectors <= zone->wp)
+		return len;
+
+	if (sector > zone->wp)
+		return 0;
+
+	return (zone->wp - sector) << SECTOR_SHIFT;
+}
+
+static blk_status_t __null_close_zone(struct nullb_device *dev,
+				      struct nullb_zone *zone)
+{
+	switch (zone->cond) {
+	case BLK_ZONE_COND_CLOSED:
+		/* close operation on closed is not an error */
+		return BLK_STS_OK;
+	case BLK_ZONE_COND_IMP_OPEN:
+		dev->nr_zones_imp_open--;
+		break;
+	case BLK_ZONE_COND_EXP_OPEN:
+		dev->nr_zones_exp_open--;
+		break;
+	case BLK_ZONE_COND_EMPTY:
+	case BLK_ZONE_COND_FULL:
+	default:
+		return BLK_STS_IOERR;
+	}
+
+	if (zone->wp == zone->start) {
+		zone->cond = BLK_ZONE_COND_EMPTY;
+	} else {
+		zone->cond = BLK_ZONE_COND_CLOSED;
+		dev->nr_zones_closed++;
+	}
+
+	return BLK_STS_OK;
+}
+
+static void null_close_imp_open_zone(struct nullb_device *dev)
+{
+	struct nullb_zone *zone;
+	unsigned int zno, i;
+
+	zno = dev->imp_close_zone_no;
+	if (zno >= dev->nr_zones)
+		zno = dev->zone_nr_conv;
+
+	for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) {
+		zone = &dev->zones[zno];
+		zno++;
+		if (zno >= dev->nr_zones)
+			zno = dev->zone_nr_conv;
+
+		if (zone->cond == BLK_ZONE_COND_IMP_OPEN) {
+			__null_close_zone(dev, zone);
+			dev->imp_close_zone_no = zno;
+			return;
+		}
+	}
+}
+
+static blk_status_t null_check_active(struct nullb_device *dev)
+{
+	if (!dev->zone_max_active)
+		return BLK_STS_OK;
+
+	if (dev->nr_zones_exp_open + dev->nr_zones_imp_open +
+			dev->nr_zones_closed < dev->zone_max_active)
+		return BLK_STS_OK;
+
+	return BLK_STS_ZONE_ACTIVE_RESOURCE;
+}
+
+static blk_status_t null_check_open(struct nullb_device *dev)
+{
+	if (!dev->zone_max_open)
+		return BLK_STS_OK;
+
+	if (dev->nr_zones_exp_open + dev->nr_zones_imp_open < dev->zone_max_open)
+		return BLK_STS_OK;
+
+	if (dev->nr_zones_imp_open) {
+		if (null_check_active(dev) == BLK_STS_OK) {
+			null_close_imp_open_zone(dev);
+			return BLK_STS_OK;
+		}
+	}
+
+	return BLK_STS_ZONE_OPEN_RESOURCE;
+}
+
+/*
+ * This function matches the manage open zone resources function in the ZBC standard,
+ * with the addition of max active zones support (added in the ZNS standard).
+ *
+ * The function determines if a zone can transition to implicit open or explicit open,
+ * while maintaining the max open zone (and max active zone) limit(s). It may close an
+ * implicit open zone in order to make additional zone resources available.
+ *
+ * ZBC states that an implicit open zone shall be closed only if there is not
+ * room within the open limit. However, with the addition of an active limit,
+ * it is not certain that closing an implicit open zone will allow a new zone
+ * to be opened, since we might already be at the active limit capacity.
+ */
+static blk_status_t null_check_zone_resources(struct nullb_device *dev,
+					      struct nullb_zone *zone)
+{
+	blk_status_t ret;
+
+	switch (zone->cond) {
+	case BLK_ZONE_COND_EMPTY:
+		ret = null_check_active(dev);
+		if (ret != BLK_STS_OK)
+			return ret;
+		fallthrough;
+	case BLK_ZONE_COND_CLOSED:
+		return null_check_open(dev);
+	default:
+		/* Should never be called for other states */
+		WARN_ON(1);
+		return BLK_STS_IOERR;
+	}
+}
+
+static blk_status_t null_zone_write(struct nullb_cmd *cmd, sector_t sector,
+				    unsigned int nr_sectors, bool append)
+{
+	struct nullb_device *dev = cmd->nq->dev;
+	unsigned int zno = null_zone_no(dev, sector);
+	struct nullb_zone *zone = &dev->zones[zno];
+	blk_status_t ret;
+
+	trace_nullb_zone_op(cmd, zno, zone->cond);
+
+	if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL) {
+		if (append)
+			return BLK_STS_IOERR;
+		return null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors);
+	}
+
+	null_lock_zone(dev, zone);
+
+	if (zone->cond == BLK_ZONE_COND_FULL) {
+		/* Cannot write to a full zone */
+		ret = BLK_STS_IOERR;
+		goto unlock;
+	}
+
+	/*
+	 * Regular writes must be at the write pointer position.
+	 * Zone append writes are automatically issued at the write
+	 * pointer and the position returned using the request or BIO
+	 * sector.
+	 */
+	if (append) {
+		sector = zone->wp;
+		if (dev->queue_mode == NULL_Q_MQ)
+			cmd->rq->__sector = sector;
+		else
+			cmd->bio->bi_iter.bi_sector = sector;
+	} else if (sector != zone->wp) {
+		ret = BLK_STS_IOERR;
+		goto unlock;
+	}
+
+	if (zone->wp + nr_sectors > zone->start + zone->capacity) {
+		ret = BLK_STS_IOERR;
+		goto unlock;
+	}
+
+	if (zone->cond == BLK_ZONE_COND_CLOSED ||
+	    zone->cond == BLK_ZONE_COND_EMPTY) {
+		null_lock_zone_res(dev);
+
+		ret = null_check_zone_resources(dev, zone);
+		if (ret != BLK_STS_OK) {
+			null_unlock_zone_res(dev);
+			goto unlock;
+		}
+		if (zone->cond == BLK_ZONE_COND_CLOSED) {
+			dev->nr_zones_closed--;
+			dev->nr_zones_imp_open++;
+		} else if (zone->cond == BLK_ZONE_COND_EMPTY) {
+			dev->nr_zones_imp_open++;
+		}
+
+		if (zone->cond != BLK_ZONE_COND_EXP_OPEN)
+			zone->cond = BLK_ZONE_COND_IMP_OPEN;
+
+		null_unlock_zone_res(dev);
+	}
+
+	ret = null_process_cmd(cmd, REQ_OP_WRITE, sector, nr_sectors);
+	if (ret != BLK_STS_OK)
+		goto unlock;
+
+	zone->wp += nr_sectors;
+	if (zone->wp == zone->start + zone->capacity) {
+		null_lock_zone_res(dev);
+		if (zone->cond == BLK_ZONE_COND_EXP_OPEN)
+			dev->nr_zones_exp_open--;
+		else if (zone->cond == BLK_ZONE_COND_IMP_OPEN)
+			dev->nr_zones_imp_open--;
+		zone->cond = BLK_ZONE_COND_FULL;
+		null_unlock_zone_res(dev);
+	}
+
+	ret = BLK_STS_OK;
+
+unlock:
+	null_unlock_zone(dev, zone);
+
+	return ret;
+}
+
+static blk_status_t null_open_zone(struct nullb_device *dev,
+				   struct nullb_zone *zone)
+{
+	blk_status_t ret = BLK_STS_OK;
+
+	if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
+		return BLK_STS_IOERR;
+
+	null_lock_zone_res(dev);
+
+	switch (zone->cond) {
+	case BLK_ZONE_COND_EXP_OPEN:
+		/* open operation on exp open is not an error */
+		goto unlock;
+	case BLK_ZONE_COND_EMPTY:
+		ret = null_check_zone_resources(dev, zone);
+		if (ret != BLK_STS_OK)
+			goto unlock;
+		break;
+	case BLK_ZONE_COND_IMP_OPEN:
+		dev->nr_zones_imp_open--;
+		break;
+	case BLK_ZONE_COND_CLOSED:
+		ret = null_check_zone_resources(dev, zone);
+		if (ret != BLK_STS_OK)
+			goto unlock;
+		dev->nr_zones_closed--;
+		break;
+	case BLK_ZONE_COND_FULL:
+	default:
+		ret = BLK_STS_IOERR;
+		goto unlock;
+	}
+
+	zone->cond = BLK_ZONE_COND_EXP_OPEN;
+	dev->nr_zones_exp_open++;
+
+unlock:
+	null_unlock_zone_res(dev);
+
+	return ret;
+}
+
+static blk_status_t null_close_zone(struct nullb_device *dev,
+				    struct nullb_zone *zone)
+{
+	blk_status_t ret;
+
+	if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
+		return BLK_STS_IOERR;
+
+	null_lock_zone_res(dev);
+	ret = __null_close_zone(dev, zone);
+	null_unlock_zone_res(dev);
+
+	return ret;
+}
+
+static blk_status_t null_finish_zone(struct nullb_device *dev,
+				     struct nullb_zone *zone)
+{
+	blk_status_t ret = BLK_STS_OK;
+
+	if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
+		return BLK_STS_IOERR;
+
+	null_lock_zone_res(dev);
+
+	switch (zone->cond) {
+	case BLK_ZONE_COND_FULL:
+		/* finish operation on full is not an error */
+		goto unlock;
+	case BLK_ZONE_COND_EMPTY:
+		ret = null_check_zone_resources(dev, zone);
+		if (ret != BLK_STS_OK)
+			goto unlock;
+		break;
+	case BLK_ZONE_COND_IMP_OPEN:
+		dev->nr_zones_imp_open--;
+		break;
+	case BLK_ZONE_COND_EXP_OPEN:
+		dev->nr_zones_exp_open--;
+		break;
+	case BLK_ZONE_COND_CLOSED:
+		ret = null_check_zone_resources(dev, zone);
+		if (ret != BLK_STS_OK)
+			goto unlock;
+		dev->nr_zones_closed--;
+		break;
+	default:
+		ret = BLK_STS_IOERR;
+		goto unlock;
+	}
+
+	zone->cond = BLK_ZONE_COND_FULL;
+	zone->wp = zone->start + zone->len;
+
+unlock:
+	null_unlock_zone_res(dev);
+
+	return ret;
+}
+
+static blk_status_t null_reset_zone(struct nullb_device *dev,
+				    struct nullb_zone *zone)
+{
+	if (zone->type == BLK_ZONE_TYPE_CONVENTIONAL)
+		return BLK_STS_IOERR;
+
+	null_lock_zone_res(dev);
+
+	switch (zone->cond) {
+	case BLK_ZONE_COND_EMPTY:
+		/* reset operation on empty is not an error */
+		null_unlock_zone_res(dev);
+		return BLK_STS_OK;
+	case BLK_ZONE_COND_IMP_OPEN:
+		dev->nr_zones_imp_open--;
+		break;
+	case BLK_ZONE_COND_EXP_OPEN:
+		dev->nr_zones_exp_open--;
+		break;
+	case BLK_ZONE_COND_CLOSED:
+		dev->nr_zones_closed--;
+		break;
+	case BLK_ZONE_COND_FULL:
+		break;
+	default:
+		null_unlock_zone_res(dev);
+		return BLK_STS_IOERR;
+	}
+
+	zone->cond = BLK_ZONE_COND_EMPTY;
+	zone->wp = zone->start;
+
+	null_unlock_zone_res(dev);
+
+	if (dev->memory_backed)
+		return null_handle_discard(dev, zone->start, zone->len);
+
+	return BLK_STS_OK;
+}
+
+static blk_status_t null_zone_mgmt(struct nullb_cmd *cmd, enum req_op op,
+				   sector_t sector)
+{
+	struct nullb_device *dev = cmd->nq->dev;
+	unsigned int zone_no;
+	struct nullb_zone *zone;
+	blk_status_t ret;
+	size_t i;
+
+	if (op == REQ_OP_ZONE_RESET_ALL) {
+		for (i = dev->zone_nr_conv; i < dev->nr_zones; i++) {
+			zone = &dev->zones[i];
+			null_lock_zone(dev, zone);
+			if (zone->cond != BLK_ZONE_COND_EMPTY) {
+				null_reset_zone(dev, zone);
+				trace_nullb_zone_op(cmd, i, zone->cond);
+			}
+			null_unlock_zone(dev, zone);
+		}
+		return BLK_STS_OK;
+	}
+
+	zone_no = null_zone_no(dev, sector);
+	zone = &dev->zones[zone_no];
+
+	null_lock_zone(dev, zone);
+
+	switch (op) {
+	case REQ_OP_ZONE_RESET:
+		ret = null_reset_zone(dev, zone);
+		break;
+	case REQ_OP_ZONE_OPEN:
+		ret = null_open_zone(dev, zone);
+		break;
+	case REQ_OP_ZONE_CLOSE:
+		ret = null_close_zone(dev, zone);
+		break;
+	case REQ_OP_ZONE_FINISH:
+		ret = null_finish_zone(dev, zone);
+		break;
+	default:
+		ret = BLK_STS_NOTSUPP;
+		break;
+	}
+
+	if (ret == BLK_STS_OK)
+		trace_nullb_zone_op(cmd, zone_no, zone->cond);
+
+	null_unlock_zone(dev, zone);
+
+	return ret;
+}
+
+blk_status_t null_process_zoned_cmd(struct nullb_cmd *cmd, enum req_op op,
+				    sector_t sector, sector_t nr_sectors)
+{
+	struct nullb_device *dev;
+	struct nullb_zone *zone;
+	blk_status_t sts;
+
+	switch (op) {
+	case REQ_OP_WRITE:
+		return null_zone_write(cmd, sector, nr_sectors, false);
+	case REQ_OP_ZONE_APPEND:
+		return null_zone_write(cmd, sector, nr_sectors, true);
+	case REQ_OP_ZONE_RESET:
+	case REQ_OP_ZONE_RESET_ALL:
+	case REQ_OP_ZONE_OPEN:
+	case REQ_OP_ZONE_CLOSE:
+	case REQ_OP_ZONE_FINISH:
+		return null_zone_mgmt(cmd, op, sector);
+	default:
+		dev = cmd->nq->dev;
+		zone = &dev->zones[null_zone_no(dev, sector)];
+
+		null_lock_zone(dev, zone);
+		sts = null_process_cmd(cmd, op, sector, nr_sectors);
+		null_unlock_zone(dev, zone);
+		return sts;
+	}
+}
diff --git a/drivers/block/paride/Kconfig b/drivers/block/paride/Kconfig
new file mode 100644
index 000000000..a29563459
--- /dev/null
+++ b/drivers/block/paride/Kconfig
@@ -0,0 +1,302 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# PARIDE configuration
+#
+# PARIDE doesn't need PARPORT, but if PARPORT is configured as a module,
+# PARIDE must also be a module.
+# PARIDE only supports PC style parports. Tough for USB or other parports...
+
+comment "Parallel IDE high-level drivers"
+	depends on PARIDE
+
+config PARIDE_PD
+	tristate "Parallel port IDE disks"
+	depends on PARIDE
+	help
+	  This option enables the high-level driver for IDE-type disk devices
+	  connected through a parallel port. If you chose to build PARIDE
+	  support into your kernel, you may answer Y here to build in the
+	  parallel port IDE driver, otherwise you should answer M to build
+	  it as a loadable module. The module will be called pd. You
+	  must also have at least one parallel port protocol driver in your
+	  system. Among the devices supported by this driver are the SyQuest
+	  EZ-135, EZ-230 and SparQ drives, the Avatar Shark and the backpack
+	  hard drives from MicroSolutions.
+
+config PARIDE_PCD
+	tristate "Parallel port ATAPI CD-ROMs"
+	depends on PARIDE
+	select CDROM
+	help
+	  This option enables the high-level driver for ATAPI CD-ROM devices
+	  connected through a parallel port. If you chose to build PARIDE
+	  support into your kernel, you may answer Y here to build in the
+	  parallel port ATAPI CD-ROM driver, otherwise you should answer M to
+	  build it as a loadable module. The module will be called pcd. You
+	  must also have at least one parallel port protocol driver in your
+	  system. Among the devices supported by this driver are the
+	  MicroSolutions backpack CD-ROM drives and the Freecom Power CD. If
+	  you have such a CD-ROM drive, you should also say Y or M to "ISO
+	  9660 CD-ROM file system support" below, because that's the file
+	  system used on CD-ROMs.
+
+config PARIDE_PF
+	tristate "Parallel port ATAPI disks"
+	depends on PARIDE
+	help
+	  This option enables the high-level driver for ATAPI disk devices
+	  connected through a parallel port. If you chose to build PARIDE
+	  support into your kernel, you may answer Y here to build in the
+	  parallel port ATAPI disk driver, otherwise you should answer M
+	  to build it as a loadable module. The module will be called pf.
+	  You must also have at least one parallel port protocol driver in
+	  your system. Among the devices supported by this driver are the
+	  MicroSolutions backpack PD/CD drive and the Imation Superdisk
+	  LS-120 drive.
+
+config PARIDE_PT
+	tristate "Parallel port ATAPI tapes"
+	depends on PARIDE
+	help
+	  This option enables the high-level driver for ATAPI tape devices
+	  connected through a parallel port. If you chose to build PARIDE
+	  support into your kernel, you may answer Y here to build in the
+	  parallel port ATAPI disk driver, otherwise you should answer M
+	  to build it as a loadable module. The module will be called pt.
+	  You must also have at least one parallel port protocol driver in
+	  your system. Among the devices supported by this driver is the
+	  parallel port version of the HP 5GB drive.
+
+config PARIDE_PG
+	tristate "Parallel port generic ATAPI devices"
+	depends on PARIDE
+	help
+	  This option enables a special high-level driver for generic ATAPI
+	  devices connected through a parallel port. The driver allows user
+	  programs, such as cdrtools, to send ATAPI commands directly to a
+	  device.
+
+	  If you chose to build PARIDE support into your kernel, you may
+	  answer Y here to build in the parallel port generic ATAPI driver,
+	  otherwise you should answer M to build it as a loadable module. The
+	  module will be called pg.
+
+	  You must also have at least one parallel port protocol driver in
+	  your system.
+
+	  This driver implements an API loosely related to the generic SCSI
+	  driver. See <file:include/linux/pg.h>. for details.
+
+	  You can obtain the most recent version of cdrtools from
+	  <ftp://ftp.berlios.de/pub/cdrecord/>. Versions 1.6.1a3 and
+	  later fully support this driver.
+
+comment "Parallel IDE protocol modules"
+	depends on PARIDE
+
+config PARIDE_ATEN
+	tristate "ATEN EH-100 protocol"
+	depends on PARIDE
+	help
+	  This option enables support for the ATEN EH-100 parallel port IDE
+	  protocol. This protocol is used in some inexpensive low performance
+	  parallel port kits made in Hong Kong. If you chose to build PARIDE
+	  support into your kernel, you may answer Y here to build in the
+	  protocol driver, otherwise you should answer M to build it as a
+	  loadable module. The module will be called aten. You must also
+	  have a high-level driver for the type of device that you want to
+	  support.
+
+config PARIDE_BPCK
+	tristate "MicroSolutions backpack (Series 5) protocol"
+	depends on PARIDE
+	help
+	  This option enables support for the Micro Solutions BACKPACK
+	  parallel port Series 5 IDE protocol.  (Most BACKPACK drives made
+	  before 1999 were Series 5) Series 5 drives will NOT always have the
+	  Series noted on the bottom of the drive. Series 6 drivers will.
+
+	  In other words, if your BACKPACK drive doesn't say "Series 6" on the
+	  bottom, enable this option.
+
+	  If you chose to build PARIDE support into your kernel, you may
+	  answer Y here to build in the protocol driver, otherwise you should
+	  answer M to build it as a loadable module.  The module will be
+	  called bpck.  You must also have a high-level driver for the type
+	  of device that you want to support.
+
+config PARIDE_BPCK6
+	tristate "MicroSolutions backpack (Series 6) protocol"
+	depends on PARIDE && !64BIT
+	help
+	  This option enables support for the Micro Solutions BACKPACK
+	  parallel port Series 6 IDE protocol.  (Most BACKPACK drives made
+	  after 1999 were Series 6) Series 6 drives will have the Series noted
+	  on the bottom of the drive.  Series 5 drivers don't always have it
+	  noted.
+
+	  In other words, if your BACKPACK drive says "Series 6" on the
+	  bottom, enable this option.
+
+	  If you chose to build PARIDE support into your kernel, you may
+	  answer Y here to build in the protocol driver, otherwise you should
+	  answer M to build it as a loadable module.  The module will be
+	  called bpck6.  You must also have a high-level driver for the type
+	  of device that you want to support.
+
+config PARIDE_COMM
+	tristate "DataStor Commuter protocol"
+	depends on PARIDE
+	help
+	  This option enables support for the Commuter parallel port IDE
+	  protocol from DataStor. If you chose to build PARIDE support
+	  into your kernel, you may answer Y here to build in the protocol
+	  driver, otherwise you should answer M to build it as a loadable
+	  module. The module will be called comm. You must also have
+	  a high-level driver for the type of device that you want to support.
+
+config PARIDE_DSTR
+	tristate "DataStor EP-2000 protocol"
+	depends on PARIDE
+	help
+	  This option enables support for the EP-2000 parallel port IDE
+	  protocol from DataStor. If you chose to build PARIDE support
+	  into your kernel, you may answer Y here to build in the protocol
+	  driver, otherwise you should answer M to build it as a loadable
+	  module. The module will be called dstr. You must also have
+	  a high-level driver for the type of device that you want to support.
+
+config PARIDE_FIT2
+	tristate "FIT TD-2000 protocol"
+	depends on PARIDE
+	help
+	  This option enables support for the TD-2000 parallel port IDE
+	  protocol from Fidelity International Technology. This is a simple
+	  (low speed) adapter that is used in some portable hard drives. If
+	  you chose to build PARIDE support into your kernel, you may answer Y
+	  here to build in the protocol driver, otherwise you should answer M
+	  to build it as a loadable module. The module will be called ktti.
+	  You must also have a high-level driver for the type of device that
+	  you want to support.
+
+config PARIDE_FIT3
+	tristate "FIT TD-3000 protocol"
+	depends on PARIDE
+	help
+	  This option enables support for the TD-3000 parallel port IDE
+	  protocol from Fidelity International Technology. This protocol is
+	  used in newer models of their portable disk, CD-ROM and PD/CD
+	  devices. If you chose to build PARIDE support into your kernel, you
+	  may answer Y here to build in the protocol driver, otherwise you
+	  should answer M to build it as a loadable module. The module will be
+	  called fit3. You must also have a high-level driver for the type
+	  of device that you want to support.
+
+config PARIDE_EPAT
+	tristate "Shuttle EPAT/EPEZ protocol"
+	depends on PARIDE
+	help
+	  This option enables support for the EPAT parallel port IDE protocol.
+	  EPAT is a parallel port IDE adapter manufactured by Shuttle
+	  Technology and widely used in devices from major vendors such as
+	  Hewlett-Packard, SyQuest, Imation and Avatar. If you chose to build
+	  PARIDE support into your kernel, you may answer Y here to build in
+	  the protocol driver, otherwise you should answer M to build it as a
+	  loadable module. The module will be called epat. You must also
+	  have a high-level driver for the type of device that you want to
+	  support.
+
+config PARIDE_EPATC8
+	bool "Support c7/c8 chips"
+	depends on PARIDE_EPAT
+	help
+	  This option enables support for the newer Shuttle EP1284 (aka c7 and
+	  c8) chip. You need this if you are using any recent Imation SuperDisk
+	  (LS-120) drive.
+
+config PARIDE_EPIA
+	tristate "Shuttle EPIA protocol"
+	depends on PARIDE
+	help
+	  This option enables support for the (obsolete) EPIA parallel port
+	  IDE protocol from Shuttle Technology. This adapter can still be
+	  found in some no-name kits. If you chose to build PARIDE support
+	  into your kernel, you may answer Y here to build in the protocol
+	  driver, otherwise you should answer M to build it as a loadable
+	  module. The module will be called epia. You must also have a
+	  high-level driver for the type of device that you want to support.
+
+config PARIDE_FRIQ
+	tristate "Freecom IQ ASIC-2 protocol"
+	depends on PARIDE
+	help
+	  This option enables support for version 2 of the Freecom IQ parallel
+	  port IDE adapter.  This adapter is used by the Maxell Superdisk
+	  drive.  If you chose to build PARIDE support into your kernel, you
+	  may answer Y here to build in the protocol driver, otherwise you
+	  should answer M to build it as a loadable module. The module will be
+	  called friq. You must also have a high-level driver for the type
+	  of device that you want to support.
+
+config PARIDE_FRPW
+	tristate "FreeCom power protocol"
+	depends on PARIDE
+	help
+	  This option enables support for the Freecom power parallel port IDE
+	  protocol. If you chose to build PARIDE support into your kernel, you
+	  may answer Y here to build in the protocol driver, otherwise you
+	  should answer M to build it as a loadable module. The module will be
+	  called frpw. You must also have a high-level driver for the type
+	  of device that you want to support.
+
+config PARIDE_KBIC
+	tristate "KingByte KBIC-951A/971A protocols"
+	depends on PARIDE
+	help
+	  This option enables support for the KBIC-951A and KBIC-971A parallel
+	  port IDE protocols from KingByte Information Corp. KingByte's
+	  adapters appear in many no-name portable disk and CD-ROM products,
+	  especially in Europe. If you chose to build PARIDE support into your
+	  kernel, you may answer Y here to build in the protocol driver,
+	  otherwise you should answer M to build it as a loadable module. The
+	  module will be called kbic. You must also have a high-level driver
+	  for the type of device that you want to support.
+
+config PARIDE_KTTI
+	tristate "KT PHd protocol"
+	depends on PARIDE
+	help
+	  This option enables support for the "PHd" parallel port IDE protocol
+	  from KT Technology. This is a simple (low speed) adapter that is
+	  used in some 2.5" portable hard drives. If you chose to build PARIDE
+	  support into your kernel, you may answer Y here to build in the
+	  protocol driver, otherwise you should answer M to build it as a
+	  loadable module. The module will be called ktti. You must also
+	  have a high-level driver for the type of device that you want to
+	  support.
+
+config PARIDE_ON20
+	tristate "OnSpec 90c20 protocol"
+	depends on PARIDE
+	help
+	  This option enables support for the (obsolete) 90c20 parallel port
+	  IDE protocol from OnSpec (often marketed under the ValuStore brand
+	  name). If you chose to build PARIDE support into your kernel, you
+	  may answer Y here to build in the protocol driver, otherwise you
+	  should answer M to build it as a loadable module. The module will
+	  be called on20. You must also have a high-level driver for the
+	  type of device that you want to support.
+
+config PARIDE_ON26
+	tristate "OnSpec 90c26 protocol"
+	depends on PARIDE
+	help
+	  This option enables support for the 90c26 parallel port IDE protocol
+	  from OnSpec Electronics (often marketed under the ValuStore brand
+	  name). If you chose to build PARIDE support into your kernel, you
+	  may answer Y here to build in the protocol driver, otherwise you
+	  should answer M to build it as a loadable module. The module will be
+	  called on26. You must also have a high-level driver for the type
+	  of device that you want to support.
+
+#
diff --git a/drivers/block/paride/Makefile b/drivers/block/paride/Makefile
new file mode 100644
index 000000000..cf1742a84
--- /dev/null
+++ b/drivers/block/paride/Makefile
@@ -0,0 +1,29 @@
+# SPDX-License-Identifier: GPL-2.0
+#
+# Makefile for Parallel port IDE device drivers.
+#
+# 7 October 2000, Bartlomiej Zolnierkiewicz <bkz@linux-ide.org>
+# Rewritten to use lists instead of if-statements.
+#
+
+obj-$(CONFIG_PARIDE)		+= paride.o
+obj-$(CONFIG_PARIDE_ATEN)	+= aten.o
+obj-$(CONFIG_PARIDE_BPCK)	+= bpck.o
+obj-$(CONFIG_PARIDE_COMM)	+= comm.o
+obj-$(CONFIG_PARIDE_DSTR)	+= dstr.o
+obj-$(CONFIG_PARIDE_KBIC)	+= kbic.o
+obj-$(CONFIG_PARIDE_EPAT)	+= epat.o
+obj-$(CONFIG_PARIDE_EPIA)	+= epia.o
+obj-$(CONFIG_PARIDE_FRPW)	+= frpw.o
+obj-$(CONFIG_PARIDE_FRIQ)	+= friq.o
+obj-$(CONFIG_PARIDE_FIT2)	+= fit2.o
+obj-$(CONFIG_PARIDE_FIT3)	+= fit3.o
+obj-$(CONFIG_PARIDE_ON20)	+= on20.o
+obj-$(CONFIG_PARIDE_ON26)	+= on26.o
+obj-$(CONFIG_PARIDE_KTTI)	+= ktti.o
+obj-$(CONFIG_PARIDE_BPCK6)	+= bpck6.o
+obj-$(CONFIG_PARIDE_PD)		+= pd.o
+obj-$(CONFIG_PARIDE_PCD)	+= pcd.o
+obj-$(CONFIG_PARIDE_PF)		+= pf.o
+obj-$(CONFIG_PARIDE_PT)		+= pt.o
+obj-$(CONFIG_PARIDE_PG)		+= pg.o
diff --git a/drivers/block/paride/Transition-notes b/drivers/block/paride/Transition-notes
new file mode 100644
index 000000000..70374907c
--- /dev/null
+++ b/drivers/block/paride/Transition-notes
@@ -0,0 +1,128 @@
+Lemma 1:
+	If ps_tq is scheduled, ps_tq_active is 1.  ps_tq_int() can be called
+	only when ps_tq_active is 1.
+Proof:	All assignments to ps_tq_active and all scheduling of ps_tq happen
+	under ps_spinlock.  There are three places where that can happen:
+	one in ps_set_intr() (A) and two in ps_tq_int() (B and C).
+	Consider the sequnce of these events.  A can not be preceded by
+	anything except B, since it is under if (!ps_tq_active) under
+	ps_spinlock.  C is always preceded by B, since we can't reach it
+	other than through B and we don't drop ps_spinlock between them.
+	IOW, the sequence is A?(BA|BC|B)*.  OTOH, number of B can not exceed
+	the sum of numbers of A and C, since each call of ps_tq_int() is
+	the result of ps_tq execution.  Therefore, the sequence starts with
+	A and each B is preceded by either A or C.  Moments when we enter
+	ps_tq_int() are sandwiched between {A,C} and B in that sequence,
+	since at any time number of B can not exceed the number of these
+	moments which, in turn, can not exceed the number of A and C.
+	In other words, the sequence of events is (A or C set ps_tq_active to
+	1 and schedule ps_tq, ps_tq is executed, ps_tq_int() is entered,
+	B resets ps_tq_active)*.
+
+
+consider the following area:
+	* in do_pd_request1(): to calls of pi_do_claimed() and return in
+	  case when pd_req is NULL.
+	* in next_request(): to call of do_pd_request1()
+	* in do_pd_read(): to call of ps_set_intr()
+	* in do_pd_read_start(): to calls of pi_do_claimed(), next_request()
+and ps_set_intr()
+	* in do_pd_read_drq(): to calls of pi_do_claimed() and next_request()
+	* in do_pd_write(): to call of ps_set_intr()
+	* in do_pd_write_start(): to calls of pi_do_claimed(), next_request()
+and ps_set_intr()
+	* in do_pd_write_done(): to calls of pi_do_claimed() and next_request()
+	* in ps_set_intr(): to check for ps_tq_active and to scheduling
+	  ps_tq if ps_tq_active was 0.
+	* in ps_tq_int(): from the moment when we get ps_spinlock() to the
+	  return, call of con() or scheduling ps_tq.
+	* in pi_schedule_claimed() when called from pi_do_claimed() called from
+	  pd.c, everything until returning 1 or setting or setting ->claim_cont
+	  on the path that returns 0
+	* in pi_do_claimed() when called from pd.c, everything until the call
+	  of pi_do_claimed() plus the everything until the call of cont() if
+	  pi_do_claimed() has returned 1.
+	* in pi_wake_up() called for PIA that belongs to pd.c, everything from
+	  the moment when pi_spinlock has been acquired.
+
+Lemma 2:
+	1) at any time at most one thread of execution can be in that area or
+	be preempted there.
+	2) When there is such a thread, pd_busy is set or pd_lock is held by
+	that thread.
+	3) When there is such a thread, ps_tq_active is 0 or ps_spinlock is
+	held by that thread.
+	4) When there is such a thread, all PIA belonging to pd.c have NULL
+	->claim_cont or pi_spinlock is held by thread in question.
+
+Proof:	consider the first moment when the above is not true.
+
+(1) can become not true if some thread enters that area while another is there.
+	a) do_pd_request1() can be called from next_request() or do_pd_request()
+	   In the first case the thread was already in the area.  In the second,
+	   the thread was holding pd_lock and found pd_busy not set, which would
+	   mean that (2) was already not true.
+	b) ps_set_intr() and pi_schedule_claimed() can be called only from the
+	   area.
+	c) pi_do_claimed() is called by pd.c only from the area.
+	d) ps_tq_int() can enter the area only when the thread is holding
+	   ps_spinlock and ps_tq_active is 1 (due to Lemma 1).  It means that
+	   (3) was already not true.
+	e) do_pd_{read,write}* could be called only from the area.  The only
+	   case that needs consideration is call from pi_wake_up() and there
+	   we would have to be called for the PIA that got ->claimed_cont
+	   from pd.c.  That could happen only if pi_do_claimed() had been
+	   called from pd.c for that PIA, which happens only for PIA belonging
+	   to pd.c.
+	f) pi_wake_up() can enter the area only when the thread is holding
+	   pi_spinlock and ->claimed_cont is non-NULL for PIA belonging to
+	   pd.c.  It means that (4) was already not true.
+
+(2) can become not true only when pd_lock is released by the thread in question.
+	Indeed, pd_busy is reset only in the area and thread that resets
+	it is holding pd_lock.	The only place within the area where we
+	release pd_lock is in pd_next_buf() (called from within the area).
+	But that code does not reset pd_busy, so pd_busy would have to be
+	0 when pd_next_buf() had acquired pd_lock.  If it become 0 while
+	we were acquiring the lock, (1) would be already false, since
+	the thread that had reset it would be in the area simulateously.
+	If it was 0 before we tried to acquire pd_lock, (2) would be
+	already false.
+
+For similar reasons, (3) can become not true only when ps_spinlock is released
+by the thread in question.  However, all such places within the area are right
+after resetting ps_tq_active to 0.
+
+(4) is done the same way - all places where we release pi_spinlock within
+the area are either after resetting ->claimed_cont to NULL while holding
+pi_spinlock, or after not tocuhing ->claimed_cont since acquiring pi_spinlock
+also in the area.  The only place where ->claimed_cont is made non-NULL is
+in the area, under pi_spinlock and we do not release it until after leaving
+the area.
+
+QED.
+
+
+Corollary 1: ps_tq_active can be killed.  Indeed, the only place where we
+check its value is in ps_set_intr() and if it had been non-zero at that
+point, we would have violated either (2.1) (if it was set while ps_set_intr()
+was acquiring ps_spinlock) or (2.3) (if it was set when we started to
+acquire ps_spinlock).
+
+Corollary 2: ps_spinlock can be killed.  Indeed, Lemma 1 and Lemma 2 show
+that the only possible contention is between scheduling ps_tq followed by
+immediate release of spinlock and beginning of execution of ps_tq on
+another CPU.
+
+Corollary 3: assignment to pd_busy in do_pd_read_start() and do_pd_write_start()
+can be killed.  Indeed, we are not holding pd_lock and thus pd_busy is already
+1 here.
+
+Corollary 4: in ps_tq_int() uses of con can be replaced with uses of
+ps_continuation, since the latter is changed only from the area.
+We don't need to reset it to NULL, since we are guaranteed that there
+will be a call of ps_set_intr() before we look at ps_continuation again.
+We can remove the check for ps_continuation being NULL for the same
+reason - the value is guaranteed to be set by the last ps_set_intr() and
+we never pass it NULL.  Assignements in the beginning of ps_set_intr()
+can be taken to callers as long as they remain within the area.
diff --git a/drivers/block/paride/aten.c b/drivers/block/paride/aten.c
new file mode 100644
index 000000000..269546556
--- /dev/null
+++ b/drivers/block/paride/aten.c
@@ -0,0 +1,162 @@
+/* 
+        aten.c  (c) 1997-8  Grant R. Guenther <grant@torque.net>
+                            Under the terms of the GNU General Public License.
+
+	aten.c is a low-level protocol driver for the ATEN EH-100
+	parallel port adapter.  The EH-100 supports 4-bit and 8-bit
+        modes only.  There is also an EH-132 which supports EPP mode
+        transfers.  The EH-132 is not yet supported.
+
+*/
+
+/* Changes:
+
+	1.01	GRG 1998.05.05	init_proto, release_proto
+
+*/
+
+#define ATEN_VERSION      "1.01"
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/kernel.h>
+#include <linux/wait.h>
+#include <linux/types.h>
+#include <asm/io.h>
+
+#include "paride.h"
+
+#define j44(a,b)                ((((a>>4)&0x0f)|(b&0xf0))^0x88)
+
+/* cont = 0 - access the IDE register file 
+   cont = 1 - access the IDE command set 
+*/
+
+static int  cont_map[2] = { 0x08, 0x20 };
+
+static void  aten_write_regr( PIA *pi, int cont, int regr, int val)
+
+{	int r;
+
+	r = regr + cont_map[cont] + 0x80;
+
+	w0(r); w2(0xe); w2(6); w0(val); w2(7); w2(6); w2(0xc);
+}
+
+static int aten_read_regr( PIA *pi, int cont, int regr )
+
+{	int  a, b, r;
+
+        r = regr + cont_map[cont] + 0x40;
+
+	switch (pi->mode) {
+
+        case 0: w0(r); w2(0xe); w2(6); 
+		w2(7); w2(6); w2(0);
+		a = r1(); w0(0x10); b = r1(); w2(0xc);
+		return j44(a,b);
+
+        case 1: r |= 0x10;
+		w0(r); w2(0xe); w2(6); w0(0xff); 
+		w2(0x27); w2(0x26); w2(0x20);
+		a = r0();
+		w2(0x26); w2(0xc);
+		return a;
+	}
+	return -1;
+}
+
+static void aten_read_block( PIA *pi, char * buf, int count )
+
+{	int  k, a, b, c, d;
+
+	switch (pi->mode) {
+
+	case 0:	w0(0x48); w2(0xe); w2(6);
+		for (k=0;k<count/2;k++) {
+			w2(7); w2(6); w2(2);
+			a = r1(); w0(0x58); b = r1();
+			w2(0); d = r1(); w0(0x48); c = r1();
+			buf[2*k] = j44(c,d);
+			buf[2*k+1] = j44(a,b);
+		}
+		w2(0xc);
+		break;
+
+	case 1: w0(0x58); w2(0xe); w2(6);
+		for (k=0;k<count/2;k++) {
+			w2(0x27); w2(0x26); w2(0x22);
+			a = r0(); w2(0x20); b = r0();
+			buf[2*k] = b; buf[2*k+1] = a;
+		}
+		w2(0x26); w2(0xc);
+		break;
+	}
+}
+
+static void aten_write_block( PIA *pi, char * buf, int count )
+
+{	int k;
+
+	w0(0x88); w2(0xe); w2(6);
+	for (k=0;k<count/2;k++) {
+		w0(buf[2*k+1]); w2(0xe); w2(6);
+		w0(buf[2*k]); w2(7); w2(6);
+	}
+	w2(0xc);
+}
+
+static void aten_connect ( PIA *pi  )
+
+{       pi->saved_r0 = r0();
+        pi->saved_r2 = r2();
+	w2(0xc);	
+}
+
+static void aten_disconnect ( PIA *pi )
+
+{       w0(pi->saved_r0);
+        w2(pi->saved_r2);
+} 
+
+static void aten_log_adapter( PIA *pi, char * scratch, int verbose )
+
+{       char    *mode_string[2] = {"4-bit","8-bit"};
+
+        printk("%s: aten %s, ATEN EH-100 at 0x%x, ",
+                pi->device,ATEN_VERSION,pi->port);
+        printk("mode %d (%s), delay %d\n",pi->mode,
+		mode_string[pi->mode],pi->delay);
+
+}
+
+static struct pi_protocol aten = {
+	.owner		= THIS_MODULE,
+	.name		= "aten",
+	.max_mode	= 2,
+	.epp_first	= 2,
+	.default_delay	= 1,
+	.max_units	= 1,
+	.write_regr	= aten_write_regr,
+	.read_regr	= aten_read_regr,
+	.write_block	= aten_write_block,
+	.read_block	= aten_read_block,
+	.connect	= aten_connect,
+	.disconnect	= aten_disconnect,
+	.log_adapter	= aten_log_adapter,
+};
+
+static int __init aten_init(void)
+{
+	return paride_register(&aten);
+}
+
+static void __exit aten_exit(void)
+{
+	paride_unregister( &aten );
+}
+
+MODULE_LICENSE("GPL");
+module_init(aten_init)
+module_exit(aten_exit)
diff --git a/drivers/block/paride/bpck.c b/drivers/block/paride/bpck.c
new file mode 100644
index 000000000..d880a9465
--- /dev/null
+++ b/drivers/block/paride/bpck.c
@@ -0,0 +1,477 @@
+/* 
+	bpck.c	(c) 1996-8  Grant R. Guenther <grant@torque.net>
+		            Under the terms of the GNU General Public License.
+
+	bpck.c is a low-level protocol driver for the MicroSolutions 
+	"backpack" parallel port IDE adapter.  
+
+*/
+
+/* Changes:
+
+	1.01	GRG 1998.05.05 init_proto, release_proto, pi->delay 
+	1.02    GRG 1998.08.15 default pi->delay returned to 4
+
+*/
+
+#define	BPCK_VERSION	"1.02" 
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+#include <asm/io.h>
+
+#include "paride.h"
+
+#undef r2
+#undef w2
+#undef PC
+
+#define PC			pi->private
+#define r2()			(PC=(in_p(2) & 0xff))
+#define w2(byte)  		{out_p(2,byte); PC = byte;}
+#define t2(pat)   		{PC ^= pat; out_p(2,PC);}
+#define e2()			{PC &= 0xfe; out_p(2,PC);}
+#define o2()			{PC |= 1; out_p(2,PC);}
+
+#define j44(l,h)     (((l>>3)&0x7)|((l>>4)&0x8)|((h<<1)&0x70)|(h&0x80))
+
+/* cont = 0 - access the IDE register file 
+   cont = 1 - access the IDE command set 
+   cont = 2 - use internal bpck register addressing
+*/
+
+static int  cont_map[3] = { 0x40, 0x48, 0 };
+
+static int bpck_read_regr( PIA *pi, int cont, int regr )
+
+{       int r, l, h;
+
+	r = regr + cont_map[cont];
+
+	switch (pi->mode) {
+
+	case 0: w0(r & 0xf); w0(r); t2(2); t2(4);
+	        l = r1();
+        	t2(4);
+        	h = r1();
+        	return j44(l,h);
+
+	case 1: w0(r & 0xf); w0(r); t2(2);
+	        e2(); t2(0x20);
+		t2(4); h = r0();
+	        t2(1); t2(0x20);
+	        return h;
+
+	case 2:
+	case 3:
+	case 4: w0(r); w2(9); w2(0); w2(0x20);
+		h = r4();
+		w2(0);
+		return h;
+
+	}
+	return -1;
+}	
+
+static void bpck_write_regr( PIA *pi, int cont, int regr, int val )
+
+{	int	r;
+
+        r = regr + cont_map[cont];
+
+	switch (pi->mode) {
+
+	case 0:
+	case 1: w0(r);
+		t2(2);
+		w0(val);
+		o2(); t2(4); t2(1);
+		break;
+
+	case 2:
+	case 3:
+	case 4: w0(r); w2(9); w2(0);
+		w0(val); w2(1); w2(3); w2(0);
+		break;
+
+	}
+}
+
+/* These macros access the bpck registers in native addressing */
+
+#define WR(r,v)		bpck_write_regr(pi,2,r,v)
+#define RR(r)		(bpck_read_regr(pi,2,r))
+
+static void bpck_write_block( PIA *pi, char * buf, int count )
+
+{	int i;
+
+	switch (pi->mode) {
+
+	case 0: WR(4,0x40);
+		w0(0x40); t2(2); t2(1);
+		for (i=0;i<count;i++) { w0(buf[i]); t2(4); }
+		WR(4,0);
+		break;
+
+	case 1: WR(4,0x50);
+                w0(0x40); t2(2); t2(1);
+                for (i=0;i<count;i++) { w0(buf[i]); t2(4); }
+                WR(4,0x10);
+		break;
+
+	case 2: WR(4,0x48);
+		w0(0x40); w2(9); w2(0); w2(1);
+		for (i=0;i<count;i++) w4(buf[i]);
+		w2(0);
+		WR(4,8);
+		break;
+
+        case 3: WR(4,0x48);
+                w0(0x40); w2(9); w2(0); w2(1);
+                for (i=0;i<count/2;i++) w4w(((u16 *)buf)[i]);
+                w2(0);
+                WR(4,8);
+                break;
+ 
+        case 4: WR(4,0x48);
+                w0(0x40); w2(9); w2(0); w2(1);
+                for (i=0;i<count/4;i++) w4l(((u32 *)buf)[i]);
+                w2(0);
+                WR(4,8);
+                break;
+ 	}
+}
+
+static void bpck_read_block( PIA *pi, char * buf, int count )
+
+{	int i, l, h;
+
+	switch (pi->mode) {
+
+      	case 0: WR(4,0x40);
+		w0(0x40); t2(2);
+		for (i=0;i<count;i++) {
+		    t2(4); l = r1();
+		    t2(4); h = r1();
+		    buf[i] = j44(l,h);
+		}
+		WR(4,0);
+		break;
+
+	case 1: WR(4,0x50);
+		w0(0x40); t2(2); t2(0x20);
+      	        for(i=0;i<count;i++) { t2(4); buf[i] = r0(); }
+	        t2(1); t2(0x20);
+	        WR(4,0x10);
+		break;
+
+	case 2: WR(4,0x48);
+		w0(0x40); w2(9); w2(0); w2(0x20);
+		for (i=0;i<count;i++) buf[i] = r4();
+		w2(0);
+		WR(4,8);
+		break;
+
+        case 3: WR(4,0x48);
+                w0(0x40); w2(9); w2(0); w2(0x20);
+                for (i=0;i<count/2;i++) ((u16 *)buf)[i] = r4w();
+                w2(0);
+                WR(4,8);
+                break;
+
+        case 4: WR(4,0x48);
+                w0(0x40); w2(9); w2(0); w2(0x20);
+                for (i=0;i<count/4;i++) ((u32 *)buf)[i] = r4l();
+                w2(0);
+                WR(4,8);
+                break;
+
+	}
+}
+
+static int bpck_probe_unit ( PIA *pi )
+
+{	int o1, o0, f7, id;
+	int t, s;
+
+	id = pi->unit;
+	s = 0;
+	w2(4); w2(0xe); r2(); t2(2); 
+	o1 = r1()&0xf8;
+	o0 = r0();
+	w0(255-id); w2(4); w0(id);
+	t2(8); t2(8); t2(8);
+	t2(2); t = r1()&0xf8;
+	f7 = ((id % 8) == 7);
+	if ((f7) || (t != o1)) { t2(2); s = r1()&0xf8; }
+	if ((t == o1) && ((!f7) || (s == o1)))  {
+		w2(0x4c); w0(o0);
+		return 0;	
+	}
+	t2(8); w0(0); t2(2); w2(0x4c); w0(o0);
+	return 1;
+}
+	
+static void bpck_connect ( PIA *pi  )
+
+{       pi->saved_r0 = r0();
+	w0(0xff-pi->unit); w2(4); w0(pi->unit);
+	t2(8); t2(8); t2(8); 
+	t2(2); t2(2);
+	
+	switch (pi->mode) {
+
+	case 0: t2(8); WR(4,0);
+		break;
+
+	case 1: t2(8); WR(4,0x10);
+		break;
+
+	case 2:
+        case 3:
+	case 4: w2(0); WR(4,8);
+		break;
+
+	}
+
+	WR(5,8);
+
+	if (pi->devtype == PI_PCD) {
+		WR(0x46,0x10);		/* fiddle with ESS logic ??? */
+		WR(0x4c,0x38);
+		WR(0x4d,0x88);
+		WR(0x46,0xa0);
+		WR(0x41,0);
+		WR(0x4e,8);
+		}
+}
+
+static void bpck_disconnect ( PIA *pi )
+
+{	w0(0); 
+	if (pi->mode >= 2) { w2(9); w2(0); } else t2(2);
+	w2(0x4c); w0(pi->saved_r0);
+} 
+
+static void bpck_force_spp ( PIA *pi )
+
+/* This fakes the EPP protocol to turn off EPP ... */
+
+{       pi->saved_r0 = r0();
+        w0(0xff-pi->unit); w2(4); w0(pi->unit);
+        t2(8); t2(8); t2(8); 
+        t2(2); t2(2);
+
+        w2(0); 
+        w0(4); w2(9); w2(0); 
+        w0(0); w2(1); w2(3); w2(0);     
+        w0(0); w2(9); w2(0);
+        w2(0x4c); w0(pi->saved_r0);
+}
+
+#define TEST_LEN  16
+
+static int bpck_test_proto( PIA *pi, char * scratch, int verbose )
+
+{	int i, e, l, h, om;
+	char buf[TEST_LEN];
+
+	bpck_force_spp(pi);
+
+	switch (pi->mode) {
+
+	case 0: bpck_connect(pi);
+		WR(0x13,0x7f);
+		w0(0x13); t2(2);
+		for(i=0;i<TEST_LEN;i++) {
+                    t2(4); l = r1();
+                    t2(4); h = r1();
+                    buf[i] = j44(l,h);
+		}
+		bpck_disconnect(pi);
+		break;
+
+        case 1: bpck_connect(pi);
+		WR(0x13,0x7f);
+                w0(0x13); t2(2); t2(0x20);
+                for(i=0;i<TEST_LEN;i++) { t2(4); buf[i] = r0(); }
+                t2(1); t2(0x20);
+		bpck_disconnect(pi);
+		break;
+
+	case 2:
+	case 3:
+	case 4: om = pi->mode;
+		pi->mode = 0;
+		bpck_connect(pi);
+		WR(7,3);
+		WR(4,8);
+		bpck_disconnect(pi);
+
+		pi->mode = om;
+		bpck_connect(pi);
+		w0(0x13); w2(9); w2(1); w0(0); w2(3); w2(0); w2(0xe0);
+
+		switch (pi->mode) {
+		  case 2: for (i=0;i<TEST_LEN;i++) buf[i] = r4();
+			  break;
+		  case 3: for (i=0;i<TEST_LEN/2;i++) ((u16 *)buf)[i] = r4w();
+                          break;
+		  case 4: for (i=0;i<TEST_LEN/4;i++) ((u32 *)buf)[i] = r4l();
+                          break;
+		}
+
+		w2(0);
+		WR(7,0);
+		bpck_disconnect(pi);
+
+		break;
+
+	}
+
+	if (verbose) {
+	    printk("%s: bpck: 0x%x unit %d mode %d: ",
+		   pi->device,pi->port,pi->unit,pi->mode);
+	    for (i=0;i<TEST_LEN;i++) printk("%3d",buf[i]);
+	    printk("\n");
+	}
+
+	e = 0;
+	for (i=0;i<TEST_LEN;i++) if (buf[i] != (i+1)) e++;
+	return e;
+}
+
+static void bpck_read_eeprom ( PIA *pi, char * buf )
+
+{       int i, j, k, p, v, f, om, od;
+
+	bpck_force_spp(pi);
+
+	om = pi->mode;  od = pi->delay;
+	pi->mode = 0; pi->delay = 6;
+
+	bpck_connect(pi);
+	
+	WR(4,0);
+	for (i=0;i<64;i++) {
+	    WR(6,8);  
+	    WR(6,0xc);
+	    p = 0x100;
+	    for (k=0;k<9;k++) {
+		f = (((i + 0x180) & p) != 0) * 2;
+		WR(6,f+0xc); 
+		WR(6,f+0xd); 
+		WR(6,f+0xc);
+		p = (p >> 1);
+	    }
+	    for (j=0;j<2;j++) {
+		v = 0;
+		for (k=0;k<8;k++) {
+		    WR(6,0xc); 
+		    WR(6,0xd); 
+		    WR(6,0xc); 
+		    f = RR(0);
+		    v = 2*v + (f == 0x84);
+		}
+		buf[2*i+1-j] = v;
+	    }
+	}
+	WR(6,8);
+	WR(6,0);
+	WR(5,8);
+
+	bpck_disconnect(pi);
+
+        if (om >= 2) {
+                bpck_connect(pi);
+                WR(7,3);
+                WR(4,8);
+                bpck_disconnect(pi);
+        }
+
+	pi->mode = om; pi->delay = od;
+}
+
+static int bpck_test_port ( PIA *pi ) 	/* check for 8-bit port */
+
+{	int	i, r, m;
+
+	w2(0x2c); i = r0(); w0(255-i); r = r0(); w0(i);
+	m = -1;
+	if (r == i) m = 2;
+	if (r == (255-i)) m = 0;
+
+	w2(0xc); i = r0(); w0(255-i); r = r0(); w0(i);
+	if (r != (255-i)) m = -1;
+	
+	if (m == 0) { w2(6); w2(0xc); r = r0(); w0(0xaa); w0(r); w0(0xaa); }
+	if (m == 2) { w2(0x26); w2(0xc); }
+
+	if (m == -1) return 0;
+	return 5;
+}
+
+static void bpck_log_adapter( PIA *pi, char * scratch, int verbose )
+
+{	char	*mode_string[5] = { "4-bit","8-bit","EPP-8",
+				    "EPP-16","EPP-32" };
+
+#ifdef DUMP_EEPROM
+	int i;
+#endif
+
+	bpck_read_eeprom(pi,scratch);
+
+#ifdef DUMP_EEPROM
+	if (verbose) {
+	   for(i=0;i<128;i++)
+		if ((scratch[i] < ' ') || (scratch[i] > '~'))
+		    scratch[i] = '.';
+	   printk("%s: bpck EEPROM: %64.64s\n",pi->device,scratch);
+	   printk("%s:              %64.64s\n",pi->device,&scratch[64]);
+	}
+#endif
+
+	printk("%s: bpck %s, backpack %8.8s unit %d",
+		pi->device,BPCK_VERSION,&scratch[110],pi->unit);
+	printk(" at 0x%x, mode %d (%s), delay %d\n",pi->port,
+		pi->mode,mode_string[pi->mode],pi->delay);
+}
+
+static struct pi_protocol bpck = {
+	.owner		= THIS_MODULE,
+	.name		= "bpck",
+	.max_mode	= 5,
+	.epp_first	= 2,
+	.default_delay	= 4,
+	.max_units	= 255,
+	.write_regr	= bpck_write_regr,
+	.read_regr	= bpck_read_regr,
+	.write_block	= bpck_write_block,
+	.read_block	= bpck_read_block,
+	.connect	= bpck_connect,
+	.disconnect	= bpck_disconnect,
+	.test_port	= bpck_test_port,
+	.probe_unit	= bpck_probe_unit,
+	.test_proto	= bpck_test_proto,
+	.log_adapter	= bpck_log_adapter,
+};
+
+static int __init bpck_init(void)
+{
+	return paride_register(&bpck);
+}
+
+static void __exit bpck_exit(void)
+{
+	paride_unregister(&bpck);
+}
+
+MODULE_LICENSE("GPL");
+module_init(bpck_init)
+module_exit(bpck_exit)
diff --git a/drivers/block/paride/bpck6.c b/drivers/block/paride/bpck6.c
new file mode 100644
index 000000000..ec64e7f5d
--- /dev/null
+++ b/drivers/block/paride/bpck6.c
@@ -0,0 +1,267 @@
+/*
+	backpack.c (c) 2001 Micro Solutions Inc.
+		Released under the terms of the GNU General Public license
+
+	backpack.c is a low-level protocol driver for the Micro Solutions
+		"BACKPACK" parallel port IDE adapter
+		(Works on Series 6 drives)
+
+	Written by: Ken Hahn     (linux-dev@micro-solutions.com)
+	            Clive Turvey (linux-dev@micro-solutions.com)
+
+*/
+
+/*
+   This is Ken's linux wrapper for the PPC library
+   Version 1.0.0 is the backpack driver for which source is not available
+   Version 2.0.0 is the first to have source released 
+   Version 2.0.1 is the "Cox-ified" source code 
+   Version 2.0.2 - fixed version string usage, and made ppc functions static 
+*/
+
+
+#define BACKPACK_VERSION "2.0.2"
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <asm/io.h>
+#include <linux/parport.h>
+
+#include "ppc6lnx.c"
+#include "paride.h"
+
+/* PARAMETERS */
+static bool verbose; /* set this to 1 to see debugging messages and whatnot */
+ 
+
+#define PPCSTRUCT(pi) ((Interface *)(pi->private))
+
+/****************************************************************/
+/*
+ ATAPI CDROM DRIVE REGISTERS
+*/
+#define ATAPI_DATA       0      /* data port                  */
+#define ATAPI_ERROR      1      /* error register (read)      */
+#define ATAPI_FEATURES   1      /* feature register (write)   */
+#define ATAPI_INT_REASON 2      /* interrupt reason register  */
+#define ATAPI_COUNT_LOW  4      /* byte count register (low)  */
+#define ATAPI_COUNT_HIGH 5      /* byte count register (high) */
+#define ATAPI_DRIVE_SEL  6      /* drive select register      */
+#define ATAPI_STATUS     7      /* status port (read)         */
+#define ATAPI_COMMAND    7      /* command port (write)       */
+#define ATAPI_ALT_STATUS 0x0e /* alternate status reg (read) */
+#define ATAPI_DEVICE_CONTROL 0x0e /* device control (write)   */
+/****************************************************************/
+
+static int bpck6_read_regr(PIA *pi, int cont, int reg)
+{
+	unsigned int out;
+
+	/* check for bad settings */
+	if (reg<0 || reg>7 || cont<0 || cont>2)
+	{
+		return(-1);
+	}
+	out=ppc6_rd_port(PPCSTRUCT(pi),cont?reg|8:reg);
+	return(out);
+}
+
+static void bpck6_write_regr(PIA *pi, int cont, int reg, int val)
+{
+	/* check for bad settings */
+	if (reg>=0 && reg<=7 && cont>=0 && cont<=1)
+	{
+		ppc6_wr_port(PPCSTRUCT(pi),cont?reg|8:reg,(u8)val);
+	}
+}
+
+static void bpck6_write_block( PIA *pi, char * buf, int len )
+{
+	ppc6_wr_port16_blk(PPCSTRUCT(pi),ATAPI_DATA,buf,(u32)len>>1); 
+}
+
+static void bpck6_read_block( PIA *pi, char * buf, int len )
+{
+	ppc6_rd_port16_blk(PPCSTRUCT(pi),ATAPI_DATA,buf,(u32)len>>1);
+}
+
+static void bpck6_connect ( PIA *pi  )
+{
+	if(verbose)
+	{
+		printk(KERN_DEBUG "connect\n");
+	}
+
+	if(pi->mode >=2)
+  	{
+		PPCSTRUCT(pi)->mode=4+pi->mode-2;	
+	}
+	else if(pi->mode==1)
+	{
+		PPCSTRUCT(pi)->mode=3;	
+	}
+	else
+	{
+		PPCSTRUCT(pi)->mode=1;		
+	}
+
+	ppc6_open(PPCSTRUCT(pi));  
+	ppc6_wr_extout(PPCSTRUCT(pi),0x3);
+}
+
+static void bpck6_disconnect ( PIA *pi )
+{
+	if(verbose)
+	{
+		printk("disconnect\n");
+	}
+	ppc6_wr_extout(PPCSTRUCT(pi),0x0);
+	ppc6_close(PPCSTRUCT(pi));
+}
+
+static int bpck6_test_port ( PIA *pi )   /* check for 8-bit port */
+{
+	if(verbose)
+	{
+		printk(KERN_DEBUG "PARPORT indicates modes=%x for lp=0x%lx\n",
+               		((struct pardevice*)(pi->pardev))->port->modes,
+			((struct pardevice *)(pi->pardev))->port->base); 
+	}
+
+	/*copy over duplicate stuff.. initialize state info*/
+	PPCSTRUCT(pi)->ppc_id=pi->unit;
+	PPCSTRUCT(pi)->lpt_addr=pi->port;
+
+	/* look at the parport device to see if what modes we can use */
+	if(((struct pardevice *)(pi->pardev))->port->modes & 
+		(PARPORT_MODE_EPP)
+          )
+	{
+		return 5; /* Can do EPP*/
+	}
+	else if(((struct pardevice *)(pi->pardev))->port->modes & 
+			(PARPORT_MODE_TRISTATE)
+               )
+	{
+		return 2;
+	}
+	else /*Just flat SPP*/
+	{
+		return 1;
+	}
+}
+
+static int bpck6_probe_unit ( PIA *pi )
+{
+	int out;
+
+	if(verbose)
+	{
+		printk(KERN_DEBUG "PROBE UNIT %x on port:%x\n",pi->unit,pi->port);
+	}
+
+	/*SET PPC UNIT NUMBER*/
+	PPCSTRUCT(pi)->ppc_id=pi->unit;
+
+	/*LOWER DOWN TO UNIDIRECTIONAL*/
+	PPCSTRUCT(pi)->mode=1;		
+
+	out=ppc6_open(PPCSTRUCT(pi));
+
+	if(verbose)
+	{
+		printk(KERN_DEBUG "ppc_open returned %2x\n",out);
+	}
+
+  	if(out)
+ 	{
+		ppc6_close(PPCSTRUCT(pi));
+		if(verbose)
+		{
+			printk(KERN_DEBUG "leaving probe\n");
+		}
+               return(1);
+	}
+  	else
+  	{
+		if(verbose)
+		{
+			printk(KERN_DEBUG "Failed open\n");
+		}
+    		return(0);
+  	}
+}
+
+static void bpck6_log_adapter( PIA *pi, char * scratch, int verbose )
+{
+	char *mode_string[5]=
+		{"4-bit","8-bit","EPP-8","EPP-16","EPP-32"};
+
+	printk("%s: BACKPACK Protocol Driver V"BACKPACK_VERSION"\n",pi->device);
+	printk("%s: Copyright 2001 by Micro Solutions, Inc., DeKalb IL.\n",pi->device);
+	printk("%s: BACKPACK %s, Micro Solutions BACKPACK Drive at 0x%x\n",
+		pi->device,BACKPACK_VERSION,pi->port);
+	printk("%s: Unit: %d Mode:%d (%s) Delay %d\n",pi->device,
+		pi->unit,pi->mode,mode_string[pi->mode],pi->delay);
+}
+
+static int bpck6_init_proto(PIA *pi)
+{
+	Interface *p = kzalloc(sizeof(Interface), GFP_KERNEL);
+
+	if (p) {
+		pi->private = (unsigned long)p;
+		return 0;
+	}
+
+	printk(KERN_ERR "%s: ERROR COULDN'T ALLOCATE MEMORY\n", pi->device); 
+	return -1;
+}
+
+static void bpck6_release_proto(PIA *pi)
+{
+	kfree((void *)(pi->private)); 
+}
+
+static struct pi_protocol bpck6 = {
+	.owner		= THIS_MODULE,
+	.name		= "bpck6",
+	.max_mode	= 5,
+	.epp_first	= 2, /* 2-5 use epp (need 8 ports) */
+	.max_units	= 255,
+	.write_regr	= bpck6_write_regr,
+	.read_regr	= bpck6_read_regr,
+	.write_block	= bpck6_write_block,
+	.read_block	= bpck6_read_block,
+	.connect	= bpck6_connect,
+	.disconnect	= bpck6_disconnect,
+	.test_port	= bpck6_test_port,
+	.probe_unit	= bpck6_probe_unit,
+	.log_adapter	= bpck6_log_adapter,
+	.init_proto	= bpck6_init_proto,
+	.release_proto	= bpck6_release_proto,
+};
+
+static int __init bpck6_init(void)
+{
+	printk(KERN_INFO "bpck6: BACKPACK Protocol Driver V"BACKPACK_VERSION"\n");
+	printk(KERN_INFO "bpck6: Copyright 2001 by Micro Solutions, Inc., DeKalb IL. USA\n");
+	if(verbose)
+		printk(KERN_DEBUG "bpck6: verbose debug enabled.\n");
+	return paride_register(&bpck6);
+}
+
+static void __exit bpck6_exit(void)
+{
+	paride_unregister(&bpck6);
+}
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Micro Solutions Inc.");
+MODULE_DESCRIPTION("BACKPACK Protocol module, compatible with PARIDE");
+module_param(verbose, bool, 0644);
+module_init(bpck6_init)
+module_exit(bpck6_exit)
diff --git a/drivers/block/paride/comm.c b/drivers/block/paride/comm.c
new file mode 100644
index 000000000..9bcd35495
--- /dev/null
+++ b/drivers/block/paride/comm.c
@@ -0,0 +1,218 @@
+/* 
+        comm.c    (c) 1997-8  Grant R. Guenther <grant@torque.net>
+                              Under the terms of the GNU General Public License.
+
+	comm.c is a low-level protocol driver for some older models
+	of the DataStor "Commuter" parallel to IDE adapter.  Some of
+	the parallel port devices marketed by Arista currently
+	use this adapter.
+*/
+
+/* Changes:
+
+	1.01	GRG 1998.05.05  init_proto, release_proto
+
+*/
+
+#define COMM_VERSION      "1.01"
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+#include <asm/io.h>
+
+#include "paride.h"
+
+/* mode codes:  0  nybble reads, 8-bit writes
+                1  8-bit reads and writes
+                2  8-bit EPP mode
+*/
+
+#define j44(a,b)	(((a>>3)&0x0f)|((b<<1)&0xf0))
+
+#define P1	w2(5);w2(0xd);w2(0xd);w2(5);w2(4);
+#define P2	w2(5);w2(7);w2(7);w2(5);w2(4);
+
+/* cont = 0 - access the IDE register file 
+   cont = 1 - access the IDE command set 
+*/
+
+static int  cont_map[2] = { 0x08, 0x10 };
+
+static int comm_read_regr( PIA *pi, int cont, int regr )
+
+{       int     l, h, r;
+
+        r = regr + cont_map[cont];
+
+        switch (pi->mode)  {
+
+        case 0: w0(r); P1; w0(0);
+        	w2(6); l = r1(); w0(0x80); h = r1(); w2(4);
+                return j44(l,h);
+
+        case 1: w0(r+0x20); P1; 
+        	w0(0); w2(0x26); h = r0(); w2(4);
+                return h;
+
+	case 2:
+	case 3:
+        case 4: w3(r+0x20); (void)r1();
+        	w2(0x24); h = r4(); w2(4);
+                return h;
+
+        }
+        return -1;
+}       
+
+static void comm_write_regr( PIA *pi, int cont, int regr, int val )
+
+{       int  r;
+
+        r = regr + cont_map[cont];
+
+        switch (pi->mode)  {
+
+        case 0:
+        case 1: w0(r); P1; w0(val); P2;
+		break;
+
+	case 2:
+	case 3:
+        case 4: w3(r); (void)r1(); w4(val);
+                break;
+        }
+}
+
+static void comm_connect ( PIA *pi  )
+
+{       pi->saved_r0 = r0();
+        pi->saved_r2 = r2();
+        w2(4); w0(0xff); w2(6);
+        w2(4); w0(0xaa); w2(6);
+        w2(4); w0(0x00); w2(6);
+        w2(4); w0(0x87); w2(6);
+        w2(4); w0(0xe0); w2(0xc); w2(0xc); w2(4);
+}
+
+static void comm_disconnect ( PIA *pi )
+
+{       w2(0); w2(0); w2(0); w2(4); 
+	w0(pi->saved_r0);
+        w2(pi->saved_r2);
+} 
+
+static void comm_read_block( PIA *pi, char * buf, int count )
+
+{       int     i, l, h;
+
+        switch (pi->mode) {
+        
+        case 0: w0(0x48); P1;
+                for(i=0;i<count;i++) {
+                        w0(0); w2(6); l = r1();
+                        w0(0x80); h = r1(); w2(4);
+                        buf[i] = j44(l,h);
+                }
+                break;
+
+        case 1: w0(0x68); P1; w0(0);
+                for(i=0;i<count;i++) {
+                        w2(0x26); buf[i] = r0(); w2(0x24);
+                }
+		w2(4);
+		break;
+		
+	case 2: w3(0x68); (void)r1(); w2(0x24);
+		for (i=0;i<count;i++) buf[i] = r4();
+		w2(4);
+		break;
+
+        case 3: w3(0x68); (void)r1(); w2(0x24);
+                for (i=0;i<count/2;i++) ((u16 *)buf)[i] = r4w();
+                w2(4);
+                break;
+
+        case 4: w3(0x68); (void)r1(); w2(0x24);
+                for (i=0;i<count/4;i++) ((u32 *)buf)[i] = r4l();
+                w2(4);
+                break;
+		
+	}
+}
+
+/* NB: Watch out for the byte swapped writes ! */
+
+static void comm_write_block( PIA *pi, char * buf, int count )
+
+{       int	k;
+
+        switch (pi->mode) {
+
+        case 0:
+        case 1: w0(0x68); P1;
+        	for (k=0;k<count;k++) {
+                        w2(5); w0(buf[k^1]); w2(7);
+                }
+                w2(5); w2(4);
+                break;
+
+        case 2: w3(0x48); (void)r1();
+                for (k=0;k<count;k++) w4(buf[k^1]);
+                break;
+
+        case 3: w3(0x48); (void)r1();
+                for (k=0;k<count/2;k++) w4w(pi_swab16(buf,k));
+                break;
+
+        case 4: w3(0x48); (void)r1();
+                for (k=0;k<count/4;k++) w4l(pi_swab32(buf,k));
+                break;
+
+
+        }
+}
+
+static void comm_log_adapter( PIA *pi, char * scratch, int verbose )
+
+{       char    *mode_string[5] = {"4-bit","8-bit","EPP-8","EPP-16","EPP-32"};
+
+        printk("%s: comm %s, DataStor Commuter at 0x%x, ",
+                pi->device,COMM_VERSION,pi->port);
+        printk("mode %d (%s), delay %d\n",pi->mode,
+		mode_string[pi->mode],pi->delay);
+
+}
+
+static struct pi_protocol comm = {
+	.owner		= THIS_MODULE,
+	.name		= "comm",
+	.max_mode	= 5,
+	.epp_first	= 2,
+	.default_delay	= 1,
+	.max_units	= 1,
+	.write_regr	= comm_write_regr,
+	.read_regr	= comm_read_regr,
+	.write_block	= comm_write_block,
+	.read_block	= comm_read_block,
+	.connect	= comm_connect,
+	.disconnect	= comm_disconnect,
+	.log_adapter	= comm_log_adapter,
+};
+
+static int __init comm_init(void)
+{
+	return paride_register(&comm);
+}
+
+static void __exit comm_exit(void)
+{
+	paride_unregister(&comm);
+}
+
+MODULE_LICENSE("GPL");
+module_init(comm_init)
+module_exit(comm_exit)
diff --git a/drivers/block/paride/dstr.c b/drivers/block/paride/dstr.c
new file mode 100644
index 000000000..accc5c777
--- /dev/null
+++ b/drivers/block/paride/dstr.c
@@ -0,0 +1,233 @@
+/* 
+        dstr.c    (c) 1997-8  Grant R. Guenther <grant@torque.net>
+                              Under the terms of the GNU General Public License.
+
+        dstr.c is a low-level protocol driver for the 
+        DataStor EP2000 parallel to IDE adapter chip.
+
+*/
+
+/* Changes:
+
+        1.01    GRG 1998.05.06 init_proto, release_proto
+
+*/
+
+#define DSTR_VERSION      "1.01"
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+#include <asm/io.h>
+
+#include "paride.h"
+
+/* mode codes:  0  nybble reads, 8-bit writes
+                1  8-bit reads and writes
+                2  8-bit EPP mode
+		3  EPP-16
+		4  EPP-32
+*/
+
+#define j44(a,b)  (((a>>3)&0x07)|((~a>>4)&0x08)|((b<<1)&0x70)|((~b)&0x80))
+
+#define P1	w2(5);w2(0xd);w2(5);w2(4);
+#define P2	w2(5);w2(7);w2(5);w2(4);
+#define P3      w2(6);w2(4);w2(6);w2(4);
+
+/* cont = 0 - access the IDE register file 
+   cont = 1 - access the IDE command set 
+*/
+
+static int  cont_map[2] = { 0x20, 0x40 };
+
+static int dstr_read_regr( PIA *pi, int cont, int regr )
+
+{       int     a, b, r;
+
+        r = regr + cont_map[cont];
+
+	w0(0x81); P1;
+	if (pi->mode) { w0(0x11); } else { w0(1); }
+	P2; w0(r); P1;
+
+        switch (pi->mode)  {
+
+        case 0: w2(6); a = r1(); w2(4); w2(6); b = r1(); w2(4);
+                return j44(a,b);
+
+        case 1: w0(0); w2(0x26); a = r0(); w2(4);
+                return a;
+
+	case 2:
+	case 3:
+        case 4: w2(0x24); a = r4(); w2(4);
+                return a;
+
+        }
+        return -1;
+}       
+
+static void dstr_write_regr(  PIA *pi, int cont, int regr, int val )
+
+{       int  r;
+
+        r = regr + cont_map[cont];
+
+	w0(0x81); P1; 
+	if (pi->mode >= 2) { w0(0x11); } else { w0(1); }
+	P2; w0(r); P1;
+	
+        switch (pi->mode)  {
+
+        case 0:
+        case 1: w0(val); w2(5); w2(7); w2(5); w2(4);
+		break;
+
+	case 2:
+	case 3:
+        case 4: w4(val); 
+                break;
+        }
+}
+
+#define  CCP(x)  w0(0xff);w2(0xc);w2(4);\
+		 w0(0xaa);w0(0x55);w0(0);w0(0xff);w0(0x87);w0(0x78);\
+		 w0(x);w2(5);w2(4);
+
+static void dstr_connect ( PIA *pi  )
+
+{       pi->saved_r0 = r0();
+        pi->saved_r2 = r2();
+        w2(4); CCP(0xe0); w0(0xff);
+}
+
+static void dstr_disconnect ( PIA *pi )
+
+{       CCP(0x30);
+        w0(pi->saved_r0);
+        w2(pi->saved_r2);
+} 
+
+static void dstr_read_block( PIA *pi, char * buf, int count )
+
+{       int     k, a, b;
+
+        w0(0x81); P1;
+        if (pi->mode) { w0(0x19); } else { w0(9); }
+	P2; w0(0x82); P1; P3; w0(0x20); P1;
+
+        switch (pi->mode) {
+
+        case 0: for (k=0;k<count;k++) {
+                        w2(6); a = r1(); w2(4);
+                        w2(6); b = r1(); w2(4);
+                        buf[k] = j44(a,b);
+                } 
+                break;
+
+        case 1: w0(0);
+                for (k=0;k<count;k++) {
+                        w2(0x26); buf[k] = r0(); w2(0x24);
+                }
+                w2(4);
+                break;
+
+        case 2: w2(0x24); 
+                for (k=0;k<count;k++) buf[k] = r4();
+                w2(4);
+                break;
+
+        case 3: w2(0x24); 
+                for (k=0;k<count/2;k++) ((u16 *)buf)[k] = r4w();
+                w2(4);
+                break;
+
+        case 4: w2(0x24); 
+                for (k=0;k<count/4;k++) ((u32 *)buf)[k] = r4l();
+                w2(4);
+                break;
+
+        }
+}
+
+static void dstr_write_block( PIA *pi, char * buf, int count )
+
+{       int	k;
+
+        w0(0x81); P1;
+        if (pi->mode) { w0(0x19); } else { w0(9); }
+        P2; w0(0x82); P1; P3; w0(0x20); P1;
+
+        switch (pi->mode) {
+
+        case 0:
+        case 1: for (k=0;k<count;k++) {
+                        w2(5); w0(buf[k]); w2(7);
+                }
+                w2(5); w2(4);
+                break;
+
+        case 2: w2(0xc5);
+                for (k=0;k<count;k++) w4(buf[k]);
+		w2(0xc4);
+                break;
+
+        case 3: w2(0xc5);
+                for (k=0;k<count/2;k++) w4w(((u16 *)buf)[k]);
+                w2(0xc4);
+                break;
+
+        case 4: w2(0xc5);
+                for (k=0;k<count/4;k++) w4l(((u32 *)buf)[k]);
+                w2(0xc4);
+                break;
+
+        }
+}
+
+
+static void dstr_log_adapter( PIA *pi, char * scratch, int verbose )
+
+{       char    *mode_string[5] = {"4-bit","8-bit","EPP-8",
+				   "EPP-16","EPP-32"};
+
+        printk("%s: dstr %s, DataStor EP2000 at 0x%x, ",
+                pi->device,DSTR_VERSION,pi->port);
+        printk("mode %d (%s), delay %d\n",pi->mode,
+		mode_string[pi->mode],pi->delay);
+
+}
+
+static struct pi_protocol dstr = {
+	.owner		= THIS_MODULE,
+	.name		= "dstr",
+	.max_mode	= 5,
+	.epp_first	= 2,
+	.default_delay	= 1,
+	.max_units	= 1,
+	.write_regr	= dstr_write_regr,
+	.read_regr	= dstr_read_regr,
+	.write_block	= dstr_write_block,
+	.read_block	= dstr_read_block,
+	.connect	= dstr_connect,
+	.disconnect	= dstr_disconnect,
+	.log_adapter	= dstr_log_adapter,
+};
+
+static int __init dstr_init(void)
+{
+	return paride_register(&dstr);
+}
+
+static void __exit dstr_exit(void)
+{
+	paride_unregister(&dstr);
+}
+
+MODULE_LICENSE("GPL");
+module_init(dstr_init)
+module_exit(dstr_exit)
diff --git a/drivers/block/paride/epat.c b/drivers/block/paride/epat.c
new file mode 100644
index 000000000..1bcdff773
--- /dev/null
+++ b/drivers/block/paride/epat.c
@@ -0,0 +1,340 @@
+/* 
+        epat.c  (c) 1997-8  Grant R. Guenther <grant@torque.net>
+                            Under the terms of the GNU General Public License.
+
+	This is the low level protocol driver for the EPAT parallel
+        to IDE adapter from Shuttle Technologies.  This adapter is
+        used in many popular parallel port disk products such as the
+        SyQuest EZ drives, the Avatar Shark and the Imation SuperDisk.
+	
+*/
+
+/* Changes:
+
+        1.01    GRG 1998.05.06 init_proto, release_proto
+        1.02    Joshua b. Jore CPP(renamed), epat_connect, epat_disconnect
+
+*/
+
+#define EPAT_VERSION      "1.02"
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+#include <asm/io.h>
+
+#include "paride.h"
+
+#define j44(a,b)		(((a>>4)&0x0f)+(b&0xf0))
+#define j53(a,b)		(((a>>3)&0x1f)+((b<<4)&0xe0))
+
+static int epatc8;
+
+module_param(epatc8, int, 0);
+MODULE_PARM_DESC(epatc8, "support for the Shuttle EP1284 chip, "
+	"used in any recent Imation SuperDisk (LS-120) drive.");
+
+/* cont =  0   IDE register file
+   cont =  1   IDE control registers
+   cont =  2   internal EPAT registers
+*/
+
+static int cont_map[3] = { 0x18, 0x10, 0 };
+
+static void epat_write_regr( PIA *pi, int cont, int regr, int val)
+
+{	int r;
+
+	r = regr + cont_map[cont];
+
+	switch (pi->mode) {
+
+	case 0:
+	case 1:
+	case 2:	w0(0x60+r); w2(1); w0(val); w2(4);
+		break;
+
+	case 3:
+	case 4:
+	case 5: w3(0x40+r); w4(val);
+		break;
+
+	}
+}
+
+static int epat_read_regr( PIA *pi, int cont, int regr )
+
+{	int  a, b, r;
+
+	r = regr + cont_map[cont];
+
+	switch (pi->mode) {
+
+	case 0:	w0(r); w2(1); w2(3); 
+		a = r1(); w2(4); b = r1();
+		return j44(a,b);
+
+	case 1: w0(0x40+r); w2(1); w2(4);
+		a = r1(); b = r2(); w0(0xff);
+		return j53(a,b);
+
+	case 2: w0(0x20+r); w2(1); w2(0x25);
+		a = r0(); w2(4);
+		return a;
+
+	case 3:
+	case 4:
+	case 5: w3(r); w2(0x24); a = r4(); w2(4);
+		return a;
+
+	}
+	return -1;	/* never gets here */
+}
+
+static void epat_read_block( PIA *pi, char * buf, int count )
+
+{	int  k, ph, a, b;
+
+	switch (pi->mode) {
+
+	case 0:	w0(7); w2(1); w2(3); w0(0xff);
+		ph = 0;
+		for(k=0;k<count;k++) {
+			if (k == count-1) w0(0xfd);
+			w2(6+ph); a = r1();
+			if (a & 8) b = a; 
+			  else { w2(4+ph); b = r1(); }
+			buf[k] = j44(a,b);
+			ph =  1 - ph;
+		}
+		w0(0); w2(4);
+		break;
+
+	case 1: w0(0x47); w2(1); w2(5); w0(0xff);
+		ph = 0;
+		for(k=0;k<count;k++) {
+			if (k == count-1) w0(0xfd); 
+			w2(4+ph);
+			a = r1(); b = r2();
+			buf[k] = j53(a,b);
+			ph = 1 - ph;
+		}
+		w0(0); w2(4);
+		break;
+
+	case 2: w0(0x27); w2(1); w2(0x25); w0(0);
+		ph = 0;
+		for(k=0;k<count-1;k++) {
+			w2(0x24+ph);
+			buf[k] = r0();
+			ph = 1 - ph;
+		}
+		w2(0x26); w2(0x27); buf[count-1] = r0(); 
+		w2(0x25); w2(4);
+		break;
+
+	case 3: w3(0x80); w2(0x24);
+		for(k=0;k<count-1;k++) buf[k] = r4();
+		w2(4); w3(0xa0); w2(0x24); buf[count-1] = r4();
+		w2(4);
+		break;
+
+	case 4: w3(0x80); w2(0x24);
+		for(k=0;k<(count/2)-1;k++) ((u16 *)buf)[k] = r4w();
+		buf[count-2] = r4();
+		w2(4); w3(0xa0); w2(0x24); buf[count-1] = r4();
+		w2(4);
+		break;
+
+	case 5: w3(0x80); w2(0x24);
+		for(k=0;k<(count/4)-1;k++) ((u32 *)buf)[k] = r4l();
+		for(k=count-4;k<count-1;k++) buf[k] = r4();
+		w2(4); w3(0xa0); w2(0x24); buf[count-1] = r4();
+		w2(4);
+		break;
+
+	}
+}
+
+static void epat_write_block( PIA *pi, char * buf, int count )   
+
+{	int ph, k;
+
+	switch (pi->mode) {
+
+	case 0:
+	case 1:
+	case 2: w0(0x67); w2(1); w2(5);
+		ph = 0;
+		for(k=0;k<count;k++) {
+		  	w0(buf[k]);
+			w2(4+ph);
+			ph = 1 - ph;
+		}
+		w2(7); w2(4);
+		break;
+
+	case 3: w3(0xc0); 
+		for(k=0;k<count;k++) w4(buf[k]);
+		w2(4);
+		break;
+
+	case 4: w3(0xc0); 
+		for(k=0;k<(count/2);k++) w4w(((u16 *)buf)[k]);
+		w2(4);
+		break;
+
+	case 5: w3(0xc0); 
+		for(k=0;k<(count/4);k++) w4l(((u32 *)buf)[k]);
+		w2(4);
+		break;
+
+	}
+}
+
+/* these macros access the EPAT registers in native addressing */
+
+#define	WR(r,v)		epat_write_regr(pi,2,r,v)
+#define	RR(r)		(epat_read_regr(pi,2,r))
+
+/* and these access the IDE task file */
+
+#define WRi(r,v)         epat_write_regr(pi,0,r,v)
+#define RRi(r)           (epat_read_regr(pi,0,r))
+
+/* FIXME:  the CPP stuff should be fixed to handle multiple EPATs on a chain */
+
+#define CPP(x) 	w2(4);w0(0x22);w0(0xaa);w0(0x55);w0(0);w0(0xff);\
+                w0(0x87);w0(0x78);w0(x);w2(4);w2(5);w2(4);w0(0xff);
+
+static void epat_connect ( PIA *pi )
+
+{       pi->saved_r0 = r0();
+        pi->saved_r2 = r2();
+
+ 	/* Initialize the chip */
+	CPP(0);
+
+	if (epatc8) {
+		CPP(0x40);CPP(0xe0);
+		w0(0);w2(1);w2(4);
+		WR(0x8,0x12);WR(0xc,0x14);WR(0x12,0x10);
+		WR(0xe,0xf);WR(0xf,4);
+		/* WR(0xe,0xa);WR(0xf,4); */
+		WR(0xe,0xd);WR(0xf,0);
+		/* CPP(0x30); */
+	}
+
+        /* Connect to the chip */
+	CPP(0xe0);
+        w0(0);w2(1);w2(4); /* Idle into SPP */
+        if (pi->mode >= 3) {
+          w0(0);w2(1);w2(4);w2(0xc);
+          /* Request EPP */
+          w0(0x40);w2(6);w2(7);w2(4);w2(0xc);w2(4);
+        }
+
+	if (!epatc8) {
+		WR(8,0x10); WR(0xc,0x14); WR(0xa,0x38); WR(0x12,0x10);
+	}
+}
+
+static void epat_disconnect (PIA *pi)
+{	CPP(0x30);
+	w0(pi->saved_r0);
+	w2(pi->saved_r2);
+}
+
+static int epat_test_proto( PIA *pi, char * scratch, int verbose )
+
+{       int     k, j, f, cc;
+	int	e[2] = {0,0};
+
+        epat_connect(pi);
+	cc = RR(0xd);
+	epat_disconnect(pi);
+
+	epat_connect(pi);
+	for (j=0;j<2;j++) {
+  	    WRi(6,0xa0+j*0x10);
+            for (k=0;k<256;k++) {
+                WRi(2,k^0xaa);
+                WRi(3,k^0x55);
+                if (RRi(2) != (k^0xaa)) e[j]++;
+                }
+	    }
+        epat_disconnect(pi);
+
+        f = 0;
+        epat_connect(pi);
+        WR(0x13,1); WR(0x13,0); WR(0xa,0x11);
+        epat_read_block(pi,scratch,512);
+	
+        for (k=0;k<256;k++) {
+            if ((scratch[2*k] & 0xff) != k) f++;
+            if ((scratch[2*k+1] & 0xff) != (0xff-k)) f++;
+        }
+        epat_disconnect(pi);
+
+        if (verbose)  {
+            printk("%s: epat: port 0x%x, mode %d, ccr %x, test=(%d,%d,%d)\n",
+		   pi->device,pi->port,pi->mode,cc,e[0],e[1],f);
+	}
+	
+        return (e[0] && e[1]) || f;
+}
+
+static void epat_log_adapter( PIA *pi, char * scratch, int verbose )
+
+{	int	ver;
+        char    *mode_string[6] = 
+		   {"4-bit","5/3","8-bit","EPP-8","EPP-16","EPP-32"};
+
+	epat_connect(pi);
+	WR(0xa,0x38);		/* read the version code */
+        ver = RR(0xb);
+        epat_disconnect(pi);
+
+	printk("%s: epat %s, Shuttle EPAT chip %x at 0x%x, ",
+		pi->device,EPAT_VERSION,ver,pi->port);
+	printk("mode %d (%s), delay %d\n",pi->mode,
+		mode_string[pi->mode],pi->delay);
+
+}
+
+static struct pi_protocol epat = {
+	.owner		= THIS_MODULE,
+	.name		= "epat",
+	.max_mode	= 6,
+	.epp_first	= 3,
+	.default_delay	= 1,
+	.max_units	= 1,
+	.write_regr	= epat_write_regr,
+	.read_regr	= epat_read_regr,
+	.write_block	= epat_write_block,
+	.read_block	= epat_read_block,
+	.connect	= epat_connect,
+	.disconnect	= epat_disconnect,
+	.test_proto	= epat_test_proto,
+	.log_adapter	= epat_log_adapter,
+};
+
+static int __init epat_init(void)
+{
+#ifdef CONFIG_PARIDE_EPATC8
+	epatc8 = 1;
+#endif
+	return paride_register(&epat);
+}
+
+static void __exit epat_exit(void)
+{
+	paride_unregister(&epat);
+}
+
+MODULE_LICENSE("GPL");
+module_init(epat_init)
+module_exit(epat_exit)
diff --git a/drivers/block/paride/epia.c b/drivers/block/paride/epia.c
new file mode 100644
index 000000000..fb0e782d0
--- /dev/null
+++ b/drivers/block/paride/epia.c
@@ -0,0 +1,316 @@
+/* 
+        epia.c    (c) 1997-8  Grant R. Guenther <grant@torque.net>
+                              Under the terms of the GNU General Public License.
+
+        epia.c is a low-level protocol driver for Shuttle Technologies 
+	EPIA parallel to IDE adapter chip.  This device is now obsolete
+	and has been replaced with the EPAT chip, which is supported
+	by epat.c, however, some devices based on EPIA are still
+	available.
+
+*/
+
+/* Changes:
+
+        1.01    GRG 1998.05.06 init_proto, release_proto
+	1.02    GRG 1998.06.17 support older versions of EPIA
+
+*/
+
+#define EPIA_VERSION      "1.02"
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+#include <asm/io.h>
+
+#include "paride.h"
+
+/* mode codes:  0  nybble reads on port 1, 8-bit writes
+                1  5/3 reads on ports 1 & 2, 8-bit writes
+                2  8-bit reads and writes
+                3  8-bit EPP mode
+		4  16-bit EPP
+		5  32-bit EPP
+*/
+
+#define j44(a,b)                (((a>>4)&0x0f)+(b&0xf0))
+#define j53(a,b)                (((a>>3)&0x1f)+((b<<4)&0xe0))
+
+/* cont =  0   IDE register file
+   cont =  1   IDE control registers
+*/
+
+static int cont_map[2] = { 0, 0x80 };
+
+static int epia_read_regr( PIA *pi, int cont, int regr )
+
+{       int     a, b, r;
+
+	regr += cont_map[cont];
+
+        switch (pi->mode)  {
+
+        case 0: r = regr^0x39;
+                w0(r); w2(1); w2(3); w0(r);
+                a = r1(); w2(1); b = r1(); w2(4);
+                return j44(a,b);
+
+        case 1: r = regr^0x31;
+                w0(r); w2(1); w0(r&0x37); 
+                w2(3); w2(5); w0(r|0xf0);
+                a = r1(); b = r2(); w2(4);
+                return j53(a,b);
+
+        case 2: r = regr^0x29;
+                w0(r); w2(1); w2(0X21); w2(0x23); 
+                a = r0(); w2(4);
+                return a;
+
+	case 3:
+	case 4:
+        case 5: w3(regr); w2(0x24); a = r4(); w2(4);
+                return a;
+
+        }
+        return -1;
+}       
+
+static void epia_write_regr( PIA *pi, int cont, int regr, int val)
+
+{       int  r;
+
+	regr += cont_map[cont];
+
+        switch (pi->mode)  {
+
+        case 0:
+        case 1:
+        case 2: r = regr^0x19;
+                w0(r); w2(1); w0(val); w2(3); w2(4);
+                break;
+
+	case 3:
+	case 4:
+        case 5: r = regr^0x40;
+                w3(r); w4(val); w2(4);
+                break;
+        }
+}
+
+#define WR(r,v)         epia_write_regr(pi,0,r,v)
+#define RR(r)           (epia_read_regr(pi,0,r))
+
+/* The use of register 0x84 is entirely unclear - it seems to control
+   some EPP counters ...  currently we know about 3 different block
+   sizes:  the standard 512 byte reads and writes, 12 byte writes and 
+   2048 byte reads (the last two being used in the CDrom drivers.
+*/
+
+static void epia_connect ( PIA *pi  )
+
+{       pi->saved_r0 = r0();
+        pi->saved_r2 = r2();
+
+        w2(4); w0(0xa0); w0(0x50); w0(0xc0); w0(0x30); w0(0xa0); w0(0);
+        w2(1); w2(4);
+        if (pi->mode >= 3) { 
+                w0(0xa); w2(1); w2(4); w0(0x82); w2(4); w2(0xc); w2(4);
+                w2(0x24); w2(0x26); w2(4);
+        }
+        WR(0x86,8);  
+}
+
+static void epia_disconnect ( PIA *pi )
+
+{       /* WR(0x84,0x10); */
+        w0(pi->saved_r0);
+        w2(1); w2(4);
+        w0(pi->saved_r0);
+        w2(pi->saved_r2);
+} 
+
+static void epia_read_block( PIA *pi, char * buf, int count )
+
+{       int     k, ph, a, b;
+
+        switch (pi->mode) {
+
+        case 0: w0(0x81); w2(1); w2(3); w0(0xc1);
+                ph = 1;
+                for (k=0;k<count;k++) {
+                        w2(2+ph); a = r1();
+                        w2(4+ph); b = r1();
+                        buf[k] = j44(a,b);
+                        ph = 1 - ph;
+                } 
+                w0(0); w2(4);
+                break;
+
+        case 1: w0(0x91); w2(1); w0(0x10); w2(3); 
+                w0(0x51); w2(5); w0(0xd1); 
+                ph = 1;
+                for (k=0;k<count;k++) {
+                        w2(4+ph);
+                        a = r1(); b = r2();
+                        buf[k] = j53(a,b);
+                        ph = 1 - ph;
+                }
+                w0(0); w2(4);
+                break;
+
+        case 2: w0(0x89); w2(1); w2(0x23); w2(0x21); 
+                ph = 1;
+                for (k=0;k<count;k++) {
+                        w2(0x24+ph);
+                        buf[k] = r0();
+                        ph = 1 - ph;
+                }
+                w2(6); w2(4);
+                break;
+
+        case 3: if (count > 512) WR(0x84,3);
+		w3(0); w2(0x24);
+                for (k=0;k<count;k++) buf[k] = r4();
+                w2(4); WR(0x84,0);
+                break;
+
+        case 4: if (count > 512) WR(0x84,3);
+		w3(0); w2(0x24);
+		for (k=0;k<count/2;k++) ((u16 *)buf)[k] = r4w();
+                w2(4); WR(0x84,0);
+                break;
+
+        case 5: if (count > 512) WR(0x84,3);
+		w3(0); w2(0x24);
+                for (k=0;k<count/4;k++) ((u32 *)buf)[k] = r4l();
+                w2(4); WR(0x84,0);
+                break;
+
+        }
+}
+
+static void epia_write_block( PIA *pi, char * buf, int count )
+
+{       int     ph, k, last, d;
+
+        switch (pi->mode) {
+
+        case 0:
+        case 1:
+        case 2: w0(0xa1); w2(1); w2(3); w2(1); w2(5);
+                ph = 0;  last = 0x8000;
+                for (k=0;k<count;k++) {
+                        d = buf[k];
+                        if (d != last) { last = d; w0(d); }
+                        w2(4+ph);
+                        ph = 1 - ph;
+                }
+                w2(7); w2(4);
+                break;
+
+        case 3: if (count < 512) WR(0x84,1);
+		w3(0x40);
+                for (k=0;k<count;k++) w4(buf[k]);
+		if (count < 512) WR(0x84,0);
+                break;
+
+        case 4: if (count < 512) WR(0x84,1);
+		w3(0x40);
+                for (k=0;k<count/2;k++) w4w(((u16 *)buf)[k]);
+		if (count < 512) WR(0x84,0);
+                break;
+
+        case 5: if (count < 512) WR(0x84,1);
+		w3(0x40);
+                for (k=0;k<count/4;k++) w4l(((u32 *)buf)[k]);
+		if (count < 512) WR(0x84,0);
+                break;
+
+        }
+
+}
+
+static int epia_test_proto( PIA *pi, char * scratch, int verbose )
+
+{       int     j, k, f;
+	int	e[2] = {0,0};
+
+        epia_connect(pi);
+        for (j=0;j<2;j++) {
+            WR(6,0xa0+j*0x10);
+            for (k=0;k<256;k++) {
+                WR(2,k^0xaa);
+                WR(3,k^0x55);
+                if (RR(2) != (k^0xaa)) e[j]++;
+                }
+	    WR(2,1); WR(3,1);
+            }
+        epia_disconnect(pi);
+
+        f = 0;
+        epia_connect(pi);
+        WR(0x84,8);
+        epia_read_block(pi,scratch,512);
+        for (k=0;k<256;k++) {
+            if ((scratch[2*k] & 0xff) != ((k+1) & 0xff)) f++;
+            if ((scratch[2*k+1] & 0xff) != ((-2-k) & 0xff)) f++;
+        }
+        WR(0x84,0);
+        epia_disconnect(pi);
+
+        if (verbose)  {
+            printk("%s: epia: port 0x%x, mode %d, test=(%d,%d,%d)\n",
+                   pi->device,pi->port,pi->mode,e[0],e[1],f);
+        }
+        
+        return (e[0] && e[1]) || f;
+
+}
+
+
+static void epia_log_adapter( PIA *pi, char * scratch, int verbose )
+
+{       char    *mode_string[6] = {"4-bit","5/3","8-bit",
+				   "EPP-8","EPP-16","EPP-32"};
+
+        printk("%s: epia %s, Shuttle EPIA at 0x%x, ",
+                pi->device,EPIA_VERSION,pi->port);
+        printk("mode %d (%s), delay %d\n",pi->mode,
+		mode_string[pi->mode],pi->delay);
+
+}
+
+static struct pi_protocol epia = {
+	.owner		= THIS_MODULE,
+	.name		= "epia",
+	.max_mode	= 6,
+	.epp_first	= 3,
+	.default_delay	= 1,
+	.max_units	= 1,
+	.write_regr	= epia_write_regr,
+	.read_regr	= epia_read_regr,
+	.write_block	= epia_write_block,
+	.read_block	= epia_read_block,
+	.connect	= epia_connect,
+	.disconnect	= epia_disconnect,
+	.test_proto	= epia_test_proto,
+	.log_adapter	= epia_log_adapter,
+};
+
+static int __init epia_init(void)
+{
+	return paride_register(&epia);
+}
+
+static void __exit epia_exit(void)
+{
+	paride_unregister(&epia);
+}
+
+MODULE_LICENSE("GPL");
+module_init(epia_init)
+module_exit(epia_exit)
diff --git a/drivers/block/paride/fit2.c b/drivers/block/paride/fit2.c
new file mode 100644
index 000000000..381283753
--- /dev/null
+++ b/drivers/block/paride/fit2.c
@@ -0,0 +1,151 @@
+/* 
+        fit2.c        (c) 1998  Grant R. Guenther <grant@torque.net>
+                          Under the terms of the GNU General Public License.
+
+	fit2.c is a low-level protocol driver for the older version
+        of the Fidelity International Technology parallel port adapter.  
+	This adapter is used in their TransDisk 2000 and older TransDisk
+	3000 portable hard-drives.  As far as I can tell, this device
+	supports 4-bit mode _only_.  
+
+	Newer models of the FIT products use an enhanced protocol.
+	The "fit3" protocol module should support current drives.
+
+*/
+
+#define FIT2_VERSION      "1.0"
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+#include <asm/io.h>
+
+#include "paride.h"
+
+#define j44(a,b)                (((a>>4)&0x0f)|(b&0xf0))
+
+/* cont = 0 - access the IDE register file 
+   cont = 1 - access the IDE command set 
+
+NB:  The FIT adapter does not appear to use the control registers.
+So, we map ALT_STATUS to STATUS and NO-OP writes to the device
+control register - this means that IDE reset will not work on these
+devices.
+
+*/
+
+static void  fit2_write_regr( PIA *pi, int cont, int regr, int val)
+
+{	if (cont == 1) return;
+	w2(0xc); w0(regr); w2(4); w0(val); w2(5); w0(0); w2(4);
+}
+
+static int fit2_read_regr( PIA *pi, int cont, int regr )
+
+{	int  a, b, r;
+
+	if (cont) {
+	  if (regr != 6) return 0xff;
+	  r = 7;
+	} else r = regr + 0x10;
+
+	w2(0xc); w0(r); w2(4); w2(5); 
+	         w0(0); a = r1();
+	         w0(1); b = r1();
+	w2(4);
+
+	return j44(a,b);
+
+}
+
+static void fit2_read_block( PIA *pi, char * buf, int count )
+
+{	int  k, a, b, c, d;
+
+	w2(0xc); w0(0x10);
+
+	for (k=0;k<count/4;k++) {
+
+		w2(4); w2(5);
+		w0(0); a = r1(); w0(1); b = r1();
+		w0(3); c = r1(); w0(2); d = r1(); 
+		buf[4*k+0] = j44(a,b);
+		buf[4*k+1] = j44(d,c);
+
+                w2(4); w2(5);
+                       a = r1(); w0(3); b = r1();
+                w0(1); c = r1(); w0(0); d = r1(); 
+                buf[4*k+2] = j44(d,c);
+                buf[4*k+3] = j44(a,b);
+
+	}
+
+	w2(4);
+
+}
+
+static void fit2_write_block( PIA *pi, char * buf, int count )
+
+{	int k;
+
+
+	w2(0xc); w0(0); 
+	for (k=0;k<count/2;k++) {
+		w2(4); w0(buf[2*k]); 
+		w2(5); w0(buf[2*k+1]);
+	}
+	w2(4);
+}
+
+static void fit2_connect ( PIA *pi  )
+
+{       pi->saved_r0 = r0();
+        pi->saved_r2 = r2();
+	w2(0xcc); 
+}
+
+static void fit2_disconnect ( PIA *pi )
+
+{       w0(pi->saved_r0);
+        w2(pi->saved_r2);
+} 
+
+static void fit2_log_adapter( PIA *pi, char * scratch, int verbose )
+
+{       printk("%s: fit2 %s, FIT 2000 adapter at 0x%x, delay %d\n",
+                pi->device,FIT2_VERSION,pi->port,pi->delay);
+
+}
+
+static struct pi_protocol fit2 = {
+	.owner		= THIS_MODULE,
+	.name		= "fit2",
+	.max_mode	= 1,
+	.epp_first	= 2,
+	.default_delay	= 1,
+	.max_units	= 1,
+	.write_regr	= fit2_write_regr,
+	.read_regr	= fit2_read_regr,
+	.write_block	= fit2_write_block,
+	.read_block	= fit2_read_block,
+	.connect	= fit2_connect,
+	.disconnect	= fit2_disconnect,
+	.log_adapter	= fit2_log_adapter,
+};
+
+static int __init fit2_init(void)
+{
+	return paride_register(&fit2);
+}
+
+static void __exit fit2_exit(void)
+{
+	paride_unregister(&fit2);
+}
+
+MODULE_LICENSE("GPL");
+module_init(fit2_init)
+module_exit(fit2_exit)
diff --git a/drivers/block/paride/fit3.c b/drivers/block/paride/fit3.c
new file mode 100644
index 000000000..275d26945
--- /dev/null
+++ b/drivers/block/paride/fit3.c
@@ -0,0 +1,211 @@
+/* 
+        fit3.c        (c) 1998  Grant R. Guenther <grant@torque.net>
+                          Under the terms of the GNU General Public License.
+
+	fit3.c is a low-level protocol driver for newer models 
+        of the Fidelity International Technology parallel port adapter.  
+	This adapter is used in their TransDisk 3000 portable 
+	hard-drives, as well as CD-ROM, PD-CD and other devices.
+
+	The TD-2000 and certain older devices use a different protocol.
+	Try the fit2 protocol module with them.
+
+        NB:  The FIT adapters do not appear to support the control 
+	registers.  So, we map ALT_STATUS to STATUS and NO-OP writes 
+	to the device control register - this means that IDE reset 
+	will not work on these devices.
+
+*/
+
+#define FIT3_VERSION      "1.0"
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+#include <asm/io.h>
+
+#include "paride.h"
+
+#define j44(a,b)                (((a>>3)&0x0f)|((b<<1)&0xf0))
+
+#define w7(byte)                {out_p(7,byte);}
+#define r7()                    (in_p(7) & 0xff)
+
+/* cont = 0 - access the IDE register file 
+   cont = 1 - access the IDE command set 
+
+*/
+
+static void  fit3_write_regr( PIA *pi, int cont, int regr, int val)
+
+{	if (cont == 1) return;
+
+	switch (pi->mode) {
+
+	case 0:
+	case 1: w2(0xc); w0(regr); w2(0x8); w2(0xc); 
+		w0(val); w2(0xd); 
+		w0(0);   w2(0xc);
+		break;
+
+	case 2: w2(0xc); w0(regr); w2(0x8); w2(0xc);
+		w4(val); w4(0);
+		w2(0xc);
+		break;
+
+	}
+}
+
+static int fit3_read_regr( PIA *pi, int cont, int regr )
+
+{	int  a, b;
+
+	if (cont) {
+	  if (regr != 6) return 0xff;
+	  regr = 7;
+	} 
+
+	switch (pi->mode) {
+
+	case 0: w2(0xc); w0(regr + 0x10); w2(0x8); w2(0xc);
+		w2(0xd); a = r1();
+		w2(0xf); b = r1(); 
+		w2(0xc);
+		return j44(a,b);
+
+	case 1: w2(0xc); w0(regr + 0x90); w2(0x8); w2(0xc);
+		w2(0xec); w2(0xee); w2(0xef); a = r0(); 
+		w2(0xc);
+		return a;
+
+	case 2: w2(0xc); w0(regr + 0x90); w2(0x8); w2(0xc); 
+		w2(0xec); 
+		a = r4(); b = r4(); 
+		w2(0xc);
+		return a;
+
+	}
+	return -1; 
+
+}
+
+static void fit3_read_block( PIA *pi, char * buf, int count )
+
+{	int  k, a, b, c, d;
+
+	switch (pi->mode) {
+
+	case 0: w2(0xc); w0(0x10); w2(0x8); w2(0xc);
+		for (k=0;k<count/2;k++) {
+		    w2(0xd); a = r1();
+		    w2(0xf); b = r1();
+		    w2(0xc); c = r1();
+		    w2(0xe); d = r1();
+		    buf[2*k  ] = j44(a,b);
+		    buf[2*k+1] = j44(c,d);
+		}
+		w2(0xc);
+		break;
+
+	case 1: w2(0xc); w0(0x90); w2(0x8); w2(0xc); 
+		w2(0xec); w2(0xee);
+		for (k=0;k<count/2;k++) {
+		    w2(0xef); a = r0();
+		    w2(0xee); b = r0();
+                    buf[2*k  ] = a;
+                    buf[2*k+1] = b;
+		}
+		w2(0xec); 
+		w2(0xc);
+		break;
+
+	case 2: w2(0xc); w0(0x90); w2(0x8); w2(0xc); 
+                w2(0xec);
+		for (k=0;k<count;k++) buf[k] = r4();
+                w2(0xc);
+		break;
+
+	}
+}
+
+static void fit3_write_block( PIA *pi, char * buf, int count )
+
+{	int k;
+
+        switch (pi->mode) {
+
+	case 0:
+        case 1: w2(0xc); w0(0); w2(0x8); w2(0xc);
+                for (k=0;k<count/2;k++) {
+ 		    w0(buf[2*k  ]); w2(0xd);
+ 		    w0(buf[2*k+1]); w2(0xc);
+		}
+		break;
+
+        case 2: w2(0xc); w0(0); w2(0x8); w2(0xc); 
+                for (k=0;k<count;k++) w4(buf[k]);
+                w2(0xc);
+		break;
+	}
+}
+
+static void fit3_connect ( PIA *pi  )
+
+{       pi->saved_r0 = r0();
+        pi->saved_r2 = r2();
+	w2(0xc); w0(0); w2(0xa);
+	if (pi->mode == 2) { 
+		w2(0xc); w0(0x9); w2(0x8); w2(0xc); 
+		}
+}
+
+static void fit3_disconnect ( PIA *pi )
+
+{       w2(0xc); w0(0xa); w2(0x8); w2(0xc);
+	w0(pi->saved_r0);
+        w2(pi->saved_r2);
+} 
+
+static void fit3_log_adapter( PIA *pi, char * scratch, int verbose )
+
+{       char    *mode_string[3] = {"4-bit","8-bit","EPP"};
+
+	printk("%s: fit3 %s, FIT 3000 adapter at 0x%x, "
+	       "mode %d (%s), delay %d\n",
+                pi->device,FIT3_VERSION,pi->port,
+		pi->mode,mode_string[pi->mode],pi->delay);
+
+}
+
+static struct pi_protocol fit3 = {
+	.owner		= THIS_MODULE,
+	.name		= "fit3",
+	.max_mode	= 3,
+	.epp_first	= 2,
+	.default_delay	= 1,
+	.max_units	= 1,
+	.write_regr	= fit3_write_regr,
+	.read_regr	= fit3_read_regr,
+	.write_block	= fit3_write_block,
+	.read_block	= fit3_read_block,
+	.connect	= fit3_connect,
+	.disconnect	= fit3_disconnect,
+	.log_adapter	= fit3_log_adapter,
+};
+
+static int __init fit3_init(void)
+{
+	return paride_register(&fit3);
+}
+
+static void __exit fit3_exit(void)
+{
+	paride_unregister(&fit3);
+}
+
+MODULE_LICENSE("GPL");
+module_init(fit3_init)
+module_exit(fit3_exit)
diff --git a/drivers/block/paride/friq.c b/drivers/block/paride/friq.c
new file mode 100644
index 000000000..4f2ba2446
--- /dev/null
+++ b/drivers/block/paride/friq.c
@@ -0,0 +1,276 @@
+/* 
+	friq.c	(c) 1998    Grant R. Guenther <grant@torque.net>
+		            Under the terms of the GNU General Public License
+
+	friq.c is a low-level protocol driver for the Freecom "IQ"
+	parallel port IDE adapter.   Early versions of this adapter
+	use the 'frpw' protocol.
+	
+	Freecom uses this adapter in a battery powered external 
+	CD-ROM drive.  It is also used in LS-120 drives by
+	Maxell and Panasonic, and other devices.
+
+	The battery powered drive requires software support to
+	control the power to the drive.  This module enables the
+	drive power when the high level driver (pcd) is loaded
+	and disables it when the module is unloaded.  Note, if
+	the friq module is built in to the kernel, the power
+	will never be switched off, so other means should be
+	used to conserve battery power.
+
+*/
+
+/* Changes:
+
+	1.01	GRG 1998.12.20	 Added support for soft power switch
+*/
+
+#define	FRIQ_VERSION	"1.01" 
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+#include <asm/io.h>
+
+#include "paride.h"
+
+#define CMD(x)		w2(4);w0(0xff);w0(0xff);w0(0x73);w0(0x73);\
+			w0(0xc9);w0(0xc9);w0(0x26);w0(0x26);w0(x);w0(x);
+
+#define j44(l,h)	(((l>>4)&0x0f)|(h&0xf0))
+
+/* cont = 0 - access the IDE register file 
+   cont = 1 - access the IDE command set 
+*/
+
+static int  cont_map[2] = { 0x08, 0x10 };
+
+static int friq_read_regr( PIA *pi, int cont, int regr )
+
+{	int	h,l,r;
+
+	r = regr + cont_map[cont];
+
+	CMD(r);
+	w2(6); l = r1();
+	w2(4); h = r1();
+	w2(4); 
+
+	return j44(l,h);
+
+}
+
+static void friq_write_regr( PIA *pi, int cont, int regr, int val)
+
+{	int r;
+
+        r = regr + cont_map[cont];
+
+	CMD(r);
+	w0(val);
+	w2(5);w2(7);w2(5);w2(4);
+}
+
+static void friq_read_block_int( PIA *pi, char * buf, int count, int regr )
+
+{       int     h, l, k, ph;
+
+        switch(pi->mode) {
+
+        case 0: CMD(regr); 
+                for (k=0;k<count;k++) {
+                        w2(6); l = r1();
+                        w2(4); h = r1();
+                        buf[k] = j44(l,h);
+                }
+                w2(4);
+                break;
+
+        case 1: ph = 2;
+                CMD(regr+0xc0); 
+                w0(0xff);
+                for (k=0;k<count;k++) {
+                        w2(0xa4 + ph); 
+                        buf[k] = r0();
+                        ph = 2 - ph;
+                } 
+                w2(0xac); w2(0xa4); w2(4);
+                break;
+
+	case 2: CMD(regr+0x80);
+		for (k=0;k<count-2;k++) buf[k] = r4();
+		w2(0xac); w2(0xa4);
+		buf[count-2] = r4();
+		buf[count-1] = r4();
+		w2(4);
+		break;
+
+	case 3: CMD(regr+0x80);
+                for (k=0;k<(count/2)-1;k++) ((u16 *)buf)[k] = r4w();
+                w2(0xac); w2(0xa4);
+                buf[count-2] = r4();
+                buf[count-1] = r4();
+                w2(4);
+                break;
+
+	case 4: CMD(regr+0x80);
+                for (k=0;k<(count/4)-1;k++) ((u32 *)buf)[k] = r4l();
+                buf[count-4] = r4();
+                buf[count-3] = r4();
+                w2(0xac); w2(0xa4);
+                buf[count-2] = r4();
+                buf[count-1] = r4();
+                w2(4);
+                break;
+
+        }
+}
+
+static void friq_read_block( PIA *pi, char * buf, int count)
+
+{	friq_read_block_int(pi,buf,count,0x08);
+}
+
+static void friq_write_block( PIA *pi, char * buf, int count )
+ 
+{	int	k;
+
+	switch(pi->mode) {
+
+	case 0:
+	case 1: CMD(8); w2(5);
+        	for (k=0;k<count;k++) {
+			w0(buf[k]);
+			w2(7);w2(5);
+		}
+		w2(4);
+		break;
+
+	case 2: CMD(0xc8); w2(5);
+		for (k=0;k<count;k++) w4(buf[k]);
+		w2(4);
+		break;
+
+        case 3: CMD(0xc8); w2(5);
+                for (k=0;k<count/2;k++) w4w(((u16 *)buf)[k]);
+                w2(4);
+                break;
+
+        case 4: CMD(0xc8); w2(5);
+                for (k=0;k<count/4;k++) w4l(((u32 *)buf)[k]);
+                w2(4);
+                break;
+	}
+}
+
+static void friq_connect ( PIA *pi  )
+
+{       pi->saved_r0 = r0();
+        pi->saved_r2 = r2();
+	w2(4);
+}
+
+static void friq_disconnect ( PIA *pi )
+
+{       CMD(0x20);
+	w0(pi->saved_r0);
+        w2(pi->saved_r2);
+} 
+
+static int friq_test_proto( PIA *pi, char * scratch, int verbose )
+
+{       int     j, k, r;
+	int	e[2] = {0,0};
+
+	pi->saved_r0 = r0();	
+	w0(0xff); udelay(20); CMD(0x3d); /* turn the power on */
+	udelay(500);
+	w0(pi->saved_r0);
+
+	friq_connect(pi);
+	for (j=0;j<2;j++) {
+                friq_write_regr(pi,0,6,0xa0+j*0x10);
+                for (k=0;k<256;k++) {
+                        friq_write_regr(pi,0,2,k^0xaa);
+                        friq_write_regr(pi,0,3,k^0x55);
+                        if (friq_read_regr(pi,0,2) != (k^0xaa)) e[j]++;
+                        }
+                }
+	friq_disconnect(pi);
+
+	friq_connect(pi);
+        friq_read_block_int(pi,scratch,512,0x10);
+        r = 0;
+        for (k=0;k<128;k++) if (scratch[k] != k) r++;
+	friq_disconnect(pi);
+
+        if (verbose)  {
+            printk("%s: friq: port 0x%x, mode %d, test=(%d,%d,%d)\n",
+                   pi->device,pi->port,pi->mode,e[0],e[1],r);
+        }
+
+        return (r || (e[0] && e[1]));
+}
+
+
+static void friq_log_adapter( PIA *pi, char * scratch, int verbose )
+
+{       char    *mode_string[6] = {"4-bit","8-bit",
+				   "EPP-8","EPP-16","EPP-32"};
+
+        printk("%s: friq %s, Freecom IQ ASIC-2 adapter at 0x%x, ", pi->device,
+		FRIQ_VERSION,pi->port);
+        printk("mode %d (%s), delay %d\n",pi->mode,
+		mode_string[pi->mode],pi->delay);
+
+	pi->private = 1;
+	friq_connect(pi);
+	CMD(0x9e);  		/* disable sleep timer */
+	friq_disconnect(pi);
+
+}
+
+static void friq_release_proto( PIA *pi)
+{
+	if (pi->private) {		/* turn off the power */
+		friq_connect(pi);
+		CMD(0x1d); CMD(0x1e);
+		friq_disconnect(pi);
+		pi->private = 0;
+	}
+}
+
+static struct pi_protocol friq = {
+	.owner		= THIS_MODULE,
+	.name		= "friq",
+	.max_mode	= 5,
+	.epp_first	= 2,
+	.default_delay	= 1,
+	.max_units	= 1,
+	.write_regr	= friq_write_regr,
+	.read_regr	= friq_read_regr,
+	.write_block	= friq_write_block,
+	.read_block	= friq_read_block,
+	.connect	= friq_connect,
+	.disconnect	= friq_disconnect,
+	.test_proto	= friq_test_proto,
+	.log_adapter	= friq_log_adapter,
+	.release_proto	= friq_release_proto,
+};
+
+static int __init friq_init(void)
+{
+	return paride_register(&friq);
+}
+
+static void __exit friq_exit(void)
+{
+	paride_unregister(&friq);
+}
+
+MODULE_LICENSE("GPL");
+module_init(friq_init)
+module_exit(friq_exit)
diff --git a/drivers/block/paride/frpw.c b/drivers/block/paride/frpw.c
new file mode 100644
index 000000000..c3cde3646
--- /dev/null
+++ b/drivers/block/paride/frpw.c
@@ -0,0 +1,313 @@
+/* 
+	frpw.c	(c) 1996-8  Grant R. Guenther <grant@torque.net>
+		            Under the terms of the GNU General Public License
+
+	frpw.c is a low-level protocol driver for the Freecom "Power"
+	parallel port IDE adapter.
+	
+	Some applications of this adapter may require a "printer" reset
+	prior to loading the driver.  This can be done by loading and
+	unloading the "lp" driver, or it can be done by this driver
+	if you define FRPW_HARD_RESET.  The latter is not recommended
+	as it may upset devices on other ports.
+
+*/
+
+/* Changes:
+
+        1.01    GRG 1998.05.06 init_proto, release_proto
+			       fix chip detect
+			       added EPP-16 and EPP-32
+	1.02    GRG 1998.09.23 added hard reset to initialisation process
+	1.03    GRG 1998.12.14 made hard reset conditional
+
+*/
+
+#define	FRPW_VERSION	"1.03" 
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+#include <asm/io.h>
+
+#include "paride.h"
+
+#define cec4		w2(0xc);w2(0xe);w2(0xe);w2(0xc);w2(4);w2(4);w2(4);
+#define j44(l,h)	(((l>>4)&0x0f)|(h&0xf0))
+
+/* cont = 0 - access the IDE register file 
+   cont = 1 - access the IDE command set 
+*/
+
+static int  cont_map[2] = { 0x08, 0x10 };
+
+static int frpw_read_regr( PIA *pi, int cont, int regr )
+
+{	int	h,l,r;
+
+	r = regr + cont_map[cont];
+
+	w2(4);
+	w0(r); cec4;
+	w2(6); l = r1();
+	w2(4); h = r1();
+	w2(4); 
+
+	return j44(l,h);
+
+}
+
+static void frpw_write_regr( PIA *pi, int cont, int regr, int val)
+
+{	int r;
+
+        r = regr + cont_map[cont];
+
+	w2(4); w0(r); cec4; 
+	w0(val);
+	w2(5);w2(7);w2(5);w2(4);
+}
+
+static void frpw_read_block_int( PIA *pi, char * buf, int count, int regr )
+
+{       int     h, l, k, ph;
+
+        switch(pi->mode) {
+
+        case 0: w2(4); w0(regr); cec4;
+                for (k=0;k<count;k++) {
+                        w2(6); l = r1();
+                        w2(4); h = r1();
+                        buf[k] = j44(l,h);
+                }
+                w2(4);
+                break;
+
+        case 1: ph = 2;
+                w2(4); w0(regr + 0xc0); cec4;
+                w0(0xff);
+                for (k=0;k<count;k++) {
+                        w2(0xa4 + ph); 
+                        buf[k] = r0();
+                        ph = 2 - ph;
+                } 
+                w2(0xac); w2(0xa4); w2(4);
+                break;
+
+        case 2: w2(4); w0(regr + 0x80); cec4;
+                for (k=0;k<count;k++) buf[k] = r4();
+                w2(0xac); w2(0xa4);
+                w2(4);
+                break;
+
+	case 3: w2(4); w0(regr + 0x80); cec4;
+		for (k=0;k<count-2;k++) buf[k] = r4();
+		w2(0xac); w2(0xa4);
+		buf[count-2] = r4();
+		buf[count-1] = r4();
+		w2(4);
+		break;
+
+	case 4: w2(4); w0(regr + 0x80); cec4;
+                for (k=0;k<(count/2)-1;k++) ((u16 *)buf)[k] = r4w();
+                w2(0xac); w2(0xa4);
+                buf[count-2] = r4();
+                buf[count-1] = r4();
+                w2(4);
+                break;
+
+	case 5: w2(4); w0(regr + 0x80); cec4;
+                for (k=0;k<(count/4)-1;k++) ((u32 *)buf)[k] = r4l();
+                buf[count-4] = r4();
+                buf[count-3] = r4();
+                w2(0xac); w2(0xa4);
+                buf[count-2] = r4();
+                buf[count-1] = r4();
+                w2(4);
+                break;
+
+        }
+}
+
+static void frpw_read_block( PIA *pi, char * buf, int count)
+
+{	frpw_read_block_int(pi,buf,count,0x08);
+}
+
+static void frpw_write_block( PIA *pi, char * buf, int count )
+ 
+{	int	k;
+
+	switch(pi->mode) {
+
+	case 0:
+	case 1:
+	case 2: w2(4); w0(8); cec4; w2(5);
+        	for (k=0;k<count;k++) {
+			w0(buf[k]);
+			w2(7);w2(5);
+		}
+		w2(4);
+		break;
+
+	case 3: w2(4); w0(0xc8); cec4; w2(5);
+		for (k=0;k<count;k++) w4(buf[k]);
+		w2(4);
+		break;
+
+        case 4: w2(4); w0(0xc8); cec4; w2(5);
+                for (k=0;k<count/2;k++) w4w(((u16 *)buf)[k]);
+                w2(4);
+                break;
+
+        case 5: w2(4); w0(0xc8); cec4; w2(5);
+                for (k=0;k<count/4;k++) w4l(((u32 *)buf)[k]);
+                w2(4);
+                break;
+	}
+}
+
+static void frpw_connect ( PIA *pi  )
+
+{       pi->saved_r0 = r0();
+        pi->saved_r2 = r2();
+	w2(4);
+}
+
+static void frpw_disconnect ( PIA *pi )
+
+{       w2(4); w0(0x20); cec4;
+	w0(pi->saved_r0);
+        w2(pi->saved_r2);
+} 
+
+/* Stub logic to see if PNP string is available - used to distinguish
+   between the Xilinx and ASIC implementations of the Freecom adapter.
+*/
+
+static int frpw_test_pnp ( PIA *pi )
+
+/*  returns chip_type:   0 = Xilinx, 1 = ASIC   */
+
+{	int olddelay, a, b;
+
+#ifdef FRPW_HARD_RESET
+        w0(0); w2(8); udelay(50); w2(0xc);   /* parallel bus reset */
+        mdelay(1500);
+#endif
+
+	olddelay = pi->delay;
+	pi->delay = 10;
+
+	pi->saved_r0 = r0();
+        pi->saved_r2 = r2();
+	
+	w2(4); w0(4); w2(6); w2(7);
+	a = r1() & 0xff; w2(4); b = r1() & 0xff;
+	w2(0xc); w2(0xe); w2(4);
+
+	pi->delay = olddelay;
+        w0(pi->saved_r0);
+        w2(pi->saved_r2);
+
+	return ((~a&0x40) && (b&0x40));
+} 
+
+/* We use the pi->private to remember the result of the PNP test.
+   To make this work, private = port*2 + chip.  Yes, I know it's
+   a hack :-(
+*/
+
+static int frpw_test_proto( PIA *pi, char * scratch, int verbose )
+
+{       int     j, k, r;
+	int	e[2] = {0,0};
+
+	if ((pi->private>>1) != pi->port)
+	   pi->private = frpw_test_pnp(pi) + 2*pi->port;
+
+	if (((pi->private%2) == 0) && (pi->mode > 2)) {
+	   if (verbose) 
+		printk("%s: frpw: Xilinx does not support mode %d\n",
+			pi->device, pi->mode);
+	   return 1;
+	}
+
+	if (((pi->private%2) == 1) && (pi->mode == 2)) {
+	   if (verbose)
+		printk("%s: frpw: ASIC does not support mode 2\n",
+			pi->device);
+	   return 1;
+	}
+
+	frpw_connect(pi);
+	for (j=0;j<2;j++) {
+                frpw_write_regr(pi,0,6,0xa0+j*0x10);
+                for (k=0;k<256;k++) {
+                        frpw_write_regr(pi,0,2,k^0xaa);
+                        frpw_write_regr(pi,0,3,k^0x55);
+                        if (frpw_read_regr(pi,0,2) != (k^0xaa)) e[j]++;
+                        }
+                }
+	frpw_disconnect(pi);
+
+	frpw_connect(pi);
+        frpw_read_block_int(pi,scratch,512,0x10);
+        r = 0;
+        for (k=0;k<128;k++) if (scratch[k] != k) r++;
+	frpw_disconnect(pi);
+
+        if (verbose)  {
+            printk("%s: frpw: port 0x%x, chip %ld, mode %d, test=(%d,%d,%d)\n",
+                   pi->device,pi->port,(pi->private%2),pi->mode,e[0],e[1],r);
+        }
+
+        return (r || (e[0] && e[1]));
+}
+
+
+static void frpw_log_adapter( PIA *pi, char * scratch, int verbose )
+
+{       char    *mode_string[6] = {"4-bit","8-bit","EPP",
+				   "EPP-8","EPP-16","EPP-32"};
+
+        printk("%s: frpw %s, Freecom (%s) adapter at 0x%x, ", pi->device,
+		FRPW_VERSION,((pi->private%2) == 0)?"Xilinx":"ASIC",pi->port);
+        printk("mode %d (%s), delay %d\n",pi->mode,
+		mode_string[pi->mode],pi->delay);
+
+}
+
+static struct pi_protocol frpw = {
+	.owner		= THIS_MODULE,
+	.name		= "frpw",
+	.max_mode	= 6,
+	.epp_first	= 2,
+	.default_delay	= 2,
+	.max_units	= 1,
+	.write_regr	= frpw_write_regr,
+	.read_regr	= frpw_read_regr,
+	.write_block	= frpw_write_block,
+	.read_block	= frpw_read_block,
+	.connect	= frpw_connect,
+	.disconnect	= frpw_disconnect,
+	.test_proto	= frpw_test_proto,
+	.log_adapter	= frpw_log_adapter,
+};
+
+static int __init frpw_init(void)
+{
+	return paride_register(&frpw);
+}
+
+static void __exit frpw_exit(void)
+{
+	paride_unregister(&frpw);
+}
+
+MODULE_LICENSE("GPL");
+module_init(frpw_init)
+module_exit(frpw_exit)
diff --git a/drivers/block/paride/kbic.c b/drivers/block/paride/kbic.c
new file mode 100644
index 000000000..35999c415
--- /dev/null
+++ b/drivers/block/paride/kbic.c
@@ -0,0 +1,305 @@
+/*
+        kbic.c    (c) 1997-8  Grant R. Guenther <grant@torque.net>
+                              Under the terms of the GNU General Public License.
+
+        This is a low-level driver for the KBIC-951A and KBIC-971A
+        parallel to IDE adapter chips from KingByte Information Systems.
+
+	The chips are almost identical, however, the wakeup code 
+	required for the 971A interferes with the correct operation of
+        the 951A, so this driver registers itself twice, once for
+	each chip.
+
+*/
+
+/* Changes:
+
+        1.01    GRG 1998.05.06 init_proto, release_proto
+
+*/
+
+#define KBIC_VERSION      "1.01"
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+#include <asm/io.h>
+
+#include "paride.h"
+
+#define r12w()			(delay_p,inw(pi->port+1)&0xffff) 
+
+#define j44(a,b)                ((((a>>4)&0x0f)|(b&0xf0))^0x88)
+#define j53(w)                  (((w>>3)&0x1f)|((w>>4)&0xe0))
+
+
+/* cont = 0 - access the IDE register file 
+   cont = 1 - access the IDE command set 
+*/
+
+static int  cont_map[2] = { 0x80, 0x40 };
+
+static int kbic_read_regr( PIA *pi, int cont, int regr )
+
+{       int     a, b, s;
+
+        s = cont_map[cont];
+
+	switch (pi->mode) {
+
+	case 0: w0(regr|0x18|s); w2(4); w2(6); w2(4); w2(1); w0(8);
+	        a = r1(); w0(0x28); b = r1(); w2(4);
+		return j44(a,b);
+
+	case 1: w0(regr|0x38|s); w2(4); w2(6); w2(4); w2(5); w0(8);
+		a = r12w(); w2(4);
+		return j53(a);
+
+	case 2: w0(regr|0x08|s); w2(4); w2(6); w2(4); w2(0xa5); w2(0xa1);
+		a = r0(); w2(4);
+       		return a;
+
+	case 3:
+	case 4:
+	case 5: w0(0x20|s); w2(4); w2(6); w2(4); w3(regr);
+		a = r4(); b = r4(); w2(4); w2(0); w2(4);
+		return a;
+
+	}
+	return -1;
+}       
+
+static void  kbic_write_regr( PIA *pi, int cont, int regr, int val)
+
+{       int  s;
+
+        s = cont_map[cont];
+
+        switch (pi->mode) {
+
+	case 0: 
+        case 1:
+	case 2:	w0(regr|0x10|s); w2(4); w2(6); w2(4); 
+		w0(val); w2(5); w2(4);
+		break;
+
+	case 3:
+	case 4:
+	case 5: w0(0x20|s); w2(4); w2(6); w2(4); w3(regr);
+		w4(val); w4(val);
+		w2(4); w2(0); w2(4);
+                break;
+
+	}
+}
+
+static void k951_connect ( PIA *pi  )
+
+{ 	pi->saved_r0 = r0();
+        pi->saved_r2 = r2();
+        w2(4); 
+}
+
+static void k951_disconnect ( PIA *pi )
+
+{      	w0(pi->saved_r0);
+        w2(pi->saved_r2);
+}
+
+#define	CCP(x)	w2(0xc4);w0(0xaa);w0(0x55);w0(0);w0(0xff);w0(0x87);\
+		w0(0x78);w0(x);w2(0xc5);w2(0xc4);w0(0xff);
+
+static void k971_connect ( PIA *pi  )
+
+{ 	pi->saved_r0 = r0();
+        pi->saved_r2 = r2();
+	CCP(0x20);
+        w2(4); 
+}
+
+static void k971_disconnect ( PIA *pi )
+
+{       CCP(0x30);
+	w0(pi->saved_r0);
+        w2(pi->saved_r2);
+}
+
+/* counts must be congruent to 0 MOD 4, but all known applications
+   have this property.
+*/
+
+static void kbic_read_block( PIA *pi, char * buf, int count )
+
+{       int     k, a, b;
+
+        switch (pi->mode) {
+
+        case 0: w0(0x98); w2(4); w2(6); w2(4);
+                for (k=0;k<count/2;k++) {
+			w2(1); w0(8);    a = r1();
+			       w0(0x28); b = r1();
+			buf[2*k]   = j44(a,b);
+			w2(5);           b = r1();
+			       w0(8);    a = r1();
+			buf[2*k+1] = j44(a,b);
+			w2(4);
+                } 
+                break;
+
+        case 1: w0(0xb8); w2(4); w2(6); w2(4); 
+                for (k=0;k<count/4;k++) {
+                        w0(0xb8); 
+			w2(4); w2(5); 
+                        w0(8);    buf[4*k]   = j53(r12w());
+			w0(0xb8); buf[4*k+1] = j53(r12w());
+			w2(4); w2(5);
+			          buf[4*k+3] = j53(r12w());
+			w0(8);    buf[4*k+2] = j53(r12w());
+                }
+                w2(4);
+                break;
+
+        case 2: w0(0x88); w2(4); w2(6); w2(4);
+                for (k=0;k<count/2;k++) {
+                        w2(0xa0); w2(0xa1); buf[2*k] = r0();
+                        w2(0xa5); buf[2*k+1] = r0();
+                }
+                w2(4);
+                break;
+
+        case 3: w0(0xa0); w2(4); w2(6); w2(4); w3(0);
+                for (k=0;k<count;k++) buf[k] = r4();
+                w2(4); w2(0); w2(4);
+                break;
+
+	case 4: w0(0xa0); w2(4); w2(6); w2(4); w3(0);
+                for (k=0;k<count/2;k++) ((u16 *)buf)[k] = r4w();
+                w2(4); w2(0); w2(4);
+                break;
+
+        case 5: w0(0xa0); w2(4); w2(6); w2(4); w3(0);
+                for (k=0;k<count/4;k++) ((u32 *)buf)[k] = r4l();
+                w2(4); w2(0); w2(4);
+                break;
+
+
+        }
+}
+
+static void kbic_write_block( PIA *pi, char * buf, int count )
+
+{       int     k;
+
+        switch (pi->mode) {
+
+        case 0:
+        case 1:
+        case 2: w0(0x90); w2(4); w2(6); w2(4); 
+		for(k=0;k<count/2;k++) {
+			w0(buf[2*k+1]); w2(0); w2(4); 
+			w0(buf[2*k]);   w2(5); w2(4); 
+		}
+		break;
+
+        case 3: w0(0xa0); w2(4); w2(6); w2(4); w3(0);
+		for(k=0;k<count/2;k++) {
+			w4(buf[2*k+1]); 
+                        w4(buf[2*k]);
+                }
+		w2(4); w2(0); w2(4);
+		break;
+
+	case 4: w0(0xa0); w2(4); w2(6); w2(4); w3(0);
+                for(k=0;k<count/2;k++) w4w(pi_swab16(buf,k));
+                w2(4); w2(0); w2(4);
+                break;
+
+        case 5: w0(0xa0); w2(4); w2(6); w2(4); w3(0);
+                for(k=0;k<count/4;k++) w4l(pi_swab32(buf,k));
+                w2(4); w2(0); w2(4);
+                break;
+
+        }
+
+}
+
+static void kbic_log_adapter( PIA *pi, char * scratch, 
+			      int verbose, char * chip )
+
+{       char    *mode_string[6] = {"4-bit","5/3","8-bit",
+				   "EPP-8","EPP_16","EPP-32"};
+
+        printk("%s: kbic %s, KingByte %s at 0x%x, ",
+                pi->device,KBIC_VERSION,chip,pi->port);
+        printk("mode %d (%s), delay %d\n",pi->mode,
+		mode_string[pi->mode],pi->delay);
+
+}
+
+static void k951_log_adapter( PIA *pi, char * scratch, int verbose )
+
+{	kbic_log_adapter(pi,scratch,verbose,"KBIC-951A");
+}
+
+static void k971_log_adapter( PIA *pi, char * scratch, int verbose )
+
+{       kbic_log_adapter(pi,scratch,verbose,"KBIC-971A");
+}
+
+static struct pi_protocol k951 = {
+	.owner		= THIS_MODULE,
+	.name		= "k951",
+	.max_mode	= 6,
+	.epp_first	= 3,
+	.default_delay	= 1,
+	.max_units	= 1,
+	.write_regr	= kbic_write_regr,
+	.read_regr	= kbic_read_regr,
+	.write_block	= kbic_write_block,
+	.read_block	= kbic_read_block,
+	.connect	= k951_connect,
+	.disconnect	= k951_disconnect,
+	.log_adapter	= k951_log_adapter,
+};
+
+static struct pi_protocol k971 = {
+	.owner		= THIS_MODULE,
+	.name		= "k971",
+	.max_mode	= 6,
+	.epp_first	= 3,
+	.default_delay	= 1,
+	.max_units	= 1,
+	.write_regr	= kbic_write_regr,
+	.read_regr	= kbic_read_regr,
+	.write_block	= kbic_write_block,
+	.read_block	= kbic_read_block,
+	.connect	= k971_connect,
+	.disconnect	= k971_disconnect,
+	.log_adapter	= k971_log_adapter,
+};
+
+static int __init kbic_init(void)
+{
+	int rv;
+
+	rv = paride_register(&k951);
+	if (rv < 0)
+		return rv;
+	rv = paride_register(&k971);
+	if (rv < 0)
+		paride_unregister(&k951);
+	return rv;
+}
+
+static void __exit kbic_exit(void)
+{
+	paride_unregister(&k951);
+	paride_unregister(&k971);
+}
+
+MODULE_LICENSE("GPL");
+module_init(kbic_init)
+module_exit(kbic_exit)
diff --git a/drivers/block/paride/ktti.c b/drivers/block/paride/ktti.c
new file mode 100644
index 000000000..117ab0e8c
--- /dev/null
+++ b/drivers/block/paride/ktti.c
@@ -0,0 +1,128 @@
+/* 
+        ktti.c        (c) 1998  Grant R. Guenther <grant@torque.net>
+                          Under the terms of the GNU General Public License.
+
+	ktti.c is a low-level protocol driver for the KT Technology
+	parallel port adapter.  This adapter is used in the "PHd" 
+        portable hard-drives.  As far as I can tell, this device
+	supports 4-bit mode _only_.  
+
+*/
+
+#define KTTI_VERSION      "1.0"
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+#include <asm/io.h>
+
+#include "paride.h"
+
+#define j44(a,b)                (((a>>4)&0x0f)|(b&0xf0))
+
+/* cont = 0 - access the IDE register file 
+   cont = 1 - access the IDE command set 
+*/
+
+static int  cont_map[2] = { 0x10, 0x08 };
+
+static void  ktti_write_regr( PIA *pi, int cont, int regr, int val)
+
+{	int r;
+
+	r = regr + cont_map[cont];
+
+	w0(r); w2(0xb); w2(0xa); w2(3); w2(6); 
+	w0(val); w2(3); w0(0); w2(6); w2(0xb);
+}
+
+static int ktti_read_regr( PIA *pi, int cont, int regr )
+
+{	int  a, b, r;
+
+        r = regr + cont_map[cont];
+
+        w0(r); w2(0xb); w2(0xa); w2(9); w2(0xc); w2(9); 
+	a = r1(); w2(0xc);  b = r1(); w2(9); w2(0xc); w2(9);
+	return j44(a,b);
+
+}
+
+static void ktti_read_block( PIA *pi, char * buf, int count )
+
+{	int  k, a, b;
+
+	for (k=0;k<count/2;k++) {
+		w0(0x10); w2(0xb); w2(0xa); w2(9); w2(0xc); w2(9);
+		a = r1(); w2(0xc); b = r1(); w2(9);
+		buf[2*k] = j44(a,b);
+		a = r1(); w2(0xc); b = r1(); w2(9);
+		buf[2*k+1] = j44(a,b);
+	}
+}
+
+static void ktti_write_block( PIA *pi, char * buf, int count )
+
+{	int k;
+
+	for (k=0;k<count/2;k++) {
+		w0(0x10); w2(0xb); w2(0xa); w2(3); w2(6);
+		w0(buf[2*k]); w2(3);
+		w0(buf[2*k+1]); w2(6);
+		w2(0xb);
+	}
+}
+
+static void ktti_connect ( PIA *pi  )
+
+{       pi->saved_r0 = r0();
+        pi->saved_r2 = r2();
+	w2(0xb); w2(0xa); w0(0); w2(3); w2(6);	
+}
+
+static void ktti_disconnect ( PIA *pi )
+
+{       w2(0xb); w2(0xa); w0(0xa0); w2(3); w2(4);
+	w0(pi->saved_r0);
+        w2(pi->saved_r2);
+} 
+
+static void ktti_log_adapter( PIA *pi, char * scratch, int verbose )
+
+{       printk("%s: ktti %s, KT adapter at 0x%x, delay %d\n",
+                pi->device,KTTI_VERSION,pi->port,pi->delay);
+
+}
+
+static struct pi_protocol ktti = {
+	.owner		= THIS_MODULE,
+	.name		= "ktti",
+	.max_mode	= 1,
+	.epp_first	= 2,
+	.default_delay	= 1,
+	.max_units	= 1,
+	.write_regr	= ktti_write_regr,
+	.read_regr	= ktti_read_regr,
+	.write_block	= ktti_write_block,
+	.read_block	= ktti_read_block,
+	.connect	= ktti_connect,
+	.disconnect	= ktti_disconnect,
+	.log_adapter	= ktti_log_adapter,
+};
+
+static int __init ktti_init(void)
+{
+	return paride_register(&ktti);
+}
+
+static void __exit ktti_exit(void)
+{
+	paride_unregister(&ktti);
+}
+
+MODULE_LICENSE("GPL");
+module_init(ktti_init)
+module_exit(ktti_exit)
diff --git a/drivers/block/paride/mkd b/drivers/block/paride/mkd
new file mode 100644
index 000000000..6d0d80247
--- /dev/null
+++ b/drivers/block/paride/mkd
@@ -0,0 +1,31 @@
+#!/bin/bash
+# SPDX-License-Identifier: GPL-2.0
+#
+# mkd -- a script to create the device special files for the PARIDE subsystem
+#
+#  block devices:  	pd (45), pcd (46), pf (47)
+#  character devices:	pt (96), pg (97)
+#
+function mkdev {
+  mknod $1 $2 $3 $4 ; chmod 0660 $1 ; chown root:disk $1
+}
+#
+function pd {
+  D=$( printf \\$( printf "x%03x" $[ $1 + 97 ] ) )
+  mkdev pd$D b 45 $[ $1 * 16 ]
+  for P in 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+  do mkdev pd$D$P b 45 $[ $1 * 16 + $P ]
+  done
+}
+#
+cd /dev
+#
+for u in 0 1 2 3 ; do pd $u ; done
+for u in 0 1 2 3 ; do mkdev pcd$u b 46 $u ; done 
+for u in 0 1 2 3 ; do mkdev pf$u  b 47 $u ; done 
+for u in 0 1 2 3 ; do mkdev pt$u  c 96 $u ; done 
+for u in 0 1 2 3 ; do mkdev npt$u c 96 $[ $u + 128 ] ; done 
+for u in 0 1 2 3 ; do mkdev pg$u  c 97 $u ; done
+#
+# end of mkd
+
diff --git a/drivers/block/paride/on20.c b/drivers/block/paride/on20.c
new file mode 100644
index 000000000..0173697a1
--- /dev/null
+++ b/drivers/block/paride/on20.c
@@ -0,0 +1,153 @@
+/* 
+	on20.c	(c) 1996-8  Grant R. Guenther <grant@torque.net>
+		            Under the terms of the GNU General Public License.
+
+        on20.c is a low-level protocol driver for the
+        Onspec 90c20 parallel to IDE adapter. 
+*/
+
+/* Changes:
+
+        1.01    GRG 1998.05.06 init_proto, release_proto
+
+*/
+
+#define	ON20_VERSION	"1.01"
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+#include <asm/io.h>
+
+#include "paride.h"
+
+#define op(f)	w2(4);w0(f);w2(5);w2(0xd);w2(5);w2(0xd);w2(5);w2(4);
+#define vl(v)	w2(4);w0(v);w2(5);w2(7);w2(5);w2(4);
+
+#define j44(a,b)  (((a>>4)&0x0f)|(b&0xf0))
+
+/* cont = 0 - access the IDE register file 
+   cont = 1 - access the IDE command set 
+*/
+
+static int on20_read_regr( PIA *pi, int cont, int regr )
+
+{	int h,l, r ;
+
+        r = (regr<<2) + 1 + cont;
+
+        op(1); vl(r); op(0);
+
+	switch (pi->mode)  {
+
+        case 0:  w2(4); w2(6); l = r1();
+                 w2(4); w2(6); h = r1();
+                 w2(4); w2(6); w2(4); w2(6); w2(4);
+		 return j44(l,h);
+
+	case 1:  w2(4); w2(0x26); r = r0(); 
+                 w2(4); w2(0x26); w2(4);
+		 return r;
+
+	}
+	return -1;
+}	
+
+static void on20_write_regr( PIA *pi, int cont, int regr, int val )
+
+{	int r;
+
+	r = (regr<<2) + 1 + cont;
+
+	op(1); vl(r); 
+	op(0); vl(val); 
+	op(0); vl(val);
+}
+
+static void on20_connect ( PIA *pi)
+
+{	pi->saved_r0 = r0();
+        pi->saved_r2 = r2();
+
+	w2(4);w0(0);w2(0xc);w2(4);w2(6);w2(4);w2(6);w2(4); 
+	if (pi->mode) { op(2); vl(8); op(2); vl(9); }
+	       else   { op(2); vl(0); op(2); vl(8); }
+}
+
+static void on20_disconnect ( PIA *pi )
+
+{	w2(4);w0(7);w2(4);w2(0xc);w2(4);
+        w0(pi->saved_r0);
+        w2(pi->saved_r2);
+} 
+
+static void on20_read_block( PIA *pi, char * buf, int count )
+
+{	int     k, l, h; 
+
+	op(1); vl(1); op(0);
+
+	for (k=0;k<count;k++) 
+	    if (pi->mode) {
+		w2(4); w2(0x26); buf[k] = r0();
+	    } else {
+		w2(6); l = r1(); w2(4);
+		w2(6); h = r1(); w2(4);
+		buf[k] = j44(l,h);
+	    }
+	w2(4);
+}
+
+static void on20_write_block(  PIA *pi, char * buf, int count )
+
+{	int	k;
+
+	op(1); vl(1); op(0);
+
+	for (k=0;k<count;k++) { w2(5); w0(buf[k]); w2(7); }
+	w2(4);
+}
+
+static void on20_log_adapter( PIA *pi, char * scratch, int verbose )
+
+{       char    *mode_string[2] = {"4-bit","8-bit"};
+
+        printk("%s: on20 %s, OnSpec 90c20 at 0x%x, ",
+                pi->device,ON20_VERSION,pi->port);
+        printk("mode %d (%s), delay %d\n",pi->mode,
+		mode_string[pi->mode],pi->delay);
+
+}
+
+static struct pi_protocol on20 = {
+	.owner		= THIS_MODULE,
+	.name		= "on20",
+	.max_mode	= 2,
+	.epp_first	= 2,
+	.default_delay	= 1,
+	.max_units	= 1,
+	.write_regr	= on20_write_regr,
+	.read_regr	= on20_read_regr,
+	.write_block	= on20_write_block,
+	.read_block	= on20_read_block,
+	.connect	= on20_connect,
+	.disconnect	= on20_disconnect,
+	.log_adapter	= on20_log_adapter,
+};
+
+static int __init on20_init(void)
+{
+	return paride_register(&on20);
+}
+
+static void __exit on20_exit(void)
+{
+	paride_unregister(&on20);
+}
+
+MODULE_LICENSE("GPL");
+module_init(on20_init)
+module_exit(on20_exit)
diff --git a/drivers/block/paride/on26.c b/drivers/block/paride/on26.c
new file mode 100644
index 000000000..95ba25692
--- /dev/null
+++ b/drivers/block/paride/on26.c
@@ -0,0 +1,319 @@
+/* 
+        on26.c    (c) 1997-8  Grant R. Guenther <grant@torque.net>
+                              Under the terms of the GNU General Public License.
+
+        on26.c is a low-level protocol driver for the 
+        OnSpec 90c26 parallel to IDE adapter chip.
+
+*/
+
+/* Changes:
+
+        1.01    GRG 1998.05.06 init_proto, release_proto
+	1.02    GRG 1998.09.23 updates for the -E rev chip
+	1.03    GRG 1998.12.14 fix for slave drives
+	1.04    GRG 1998.12.20 yet another bug fix
+
+*/
+
+#define ON26_VERSION      "1.04"
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/wait.h>
+#include <asm/io.h>
+
+#include "paride.h"
+
+/* mode codes:  0  nybble reads, 8-bit writes
+                1  8-bit reads and writes
+                2  8-bit EPP mode
+		3  EPP-16
+		4  EPP-32
+*/
+
+#define j44(a,b)  (((a>>4)&0x0f)|(b&0xf0))
+
+#define P1	w2(5);w2(0xd);w2(5);w2(0xd);w2(5);w2(4);
+#define P2	w2(5);w2(7);w2(5);w2(4);
+
+/* cont = 0 - access the IDE register file 
+   cont = 1 - access the IDE command set 
+*/
+
+static int on26_read_regr( PIA *pi, int cont, int regr )
+
+{       int     a, b, r;
+
+	r = (regr<<2) + 1 + cont;
+
+        switch (pi->mode)  {
+
+        case 0: w0(1); P1; w0(r); P2; w0(0); P1; 
+		w2(6); a = r1(); w2(4);
+		w2(6); b = r1(); w2(4);
+		w2(6); w2(4); w2(6); w2(4);
+                return j44(a,b);
+
+        case 1: w0(1); P1; w0(r); P2; w0(0); P1;
+		w2(0x26); a = r0(); w2(4); w2(0x26); w2(4);
+                return a;
+
+	case 2:
+	case 3:
+        case 4: w3(1); w3(1); w2(5); w4(r); w2(4);
+		w3(0); w3(0); w2(0x24); a = r4(); w2(4);
+		w2(0x24); (void)r4(); w2(4);
+                return a;
+
+        }
+        return -1;
+}       
+
+static void on26_write_regr( PIA *pi, int cont, int regr, int val )
+
+{       int  r;
+
+        r = (regr<<2) + 1 + cont;
+
+        switch (pi->mode)  {
+
+        case 0:
+        case 1: w0(1); P1; w0(r); P2; w0(0); P1;
+		w0(val); P2; w0(val); P2;
+		break;
+
+	case 2:
+	case 3:
+        case 4: w3(1); w3(1); w2(5); w4(r); w2(4);
+		w3(0); w3(0); 
+		w2(5); w4(val); w2(4);
+		w2(5); w4(val); w2(4);
+                break;
+        }
+}
+
+#define  CCP(x)  w0(0xfe);w0(0xaa);w0(0x55);w0(0);w0(0xff);\
+		 w0(0x87);w0(0x78);w0(x);w2(4);w2(5);w2(4);w0(0xff);
+
+static void on26_connect ( PIA *pi )
+
+{       int	x;
+
+	pi->saved_r0 = r0();
+        pi->saved_r2 = r2();
+
+        CCP(0x20);
+	x = 8; if (pi->mode) x = 9;
+
+	w0(2); P1; w0(8); P2;
+	w0(2); P1; w0(x); P2;
+}
+
+static void on26_disconnect ( PIA *pi )
+
+{       if (pi->mode >= 2) { w3(4); w3(4); w3(4); w3(4); }
+	              else { w0(4); P1; w0(4); P1; }
+	CCP(0x30);
+        w0(pi->saved_r0);
+        w2(pi->saved_r2);
+} 
+
+#define	RESET_WAIT  200
+
+static int on26_test_port( PIA *pi)  /* hard reset */
+
+{       int     i, m, d, x=0, y=0;
+
+        pi->saved_r0 = r0();
+        pi->saved_r2 = r2();
+
+        d = pi->delay;
+        m = pi->mode;
+        pi->delay = 5;
+        pi->mode = 0;
+
+        w2(0xc);
+
+        CCP(0x30); CCP(0); 
+
+        w0(0xfe);w0(0xaa);w0(0x55);w0(0);w0(0xff);
+        i = ((r1() & 0xf0) << 4); w0(0x87);
+        i |= (r1() & 0xf0); w0(0x78);
+        w0(0x20);w2(4);w2(5);
+        i |= ((r1() & 0xf0) >> 4);
+        w2(4);w0(0xff);
+
+        if (i == 0xb5f) {
+
+            w0(2); P1; w0(0);   P2;
+            w0(3); P1; w0(0);   P2;
+            w0(2); P1; w0(8);   P2; udelay(100);
+            w0(2); P1; w0(0xa); P2; udelay(100);
+            w0(2); P1; w0(8);   P2; udelay(1000);
+            
+            on26_write_regr(pi,0,6,0xa0);
+
+            for (i=0;i<RESET_WAIT;i++) {
+                on26_write_regr(pi,0,6,0xa0);
+                x = on26_read_regr(pi,0,7);
+                on26_write_regr(pi,0,6,0xb0);
+                y = on26_read_regr(pi,0,7);
+                if (!((x&0x80)||(y&0x80))) break;
+                mdelay(100);
+            }
+
+	    if (i == RESET_WAIT) 
+		printk("on26: Device reset failed (%x,%x)\n",x,y);
+
+            w0(4); P1; w0(4); P1;
+        }
+
+        CCP(0x30);
+
+        pi->delay = d;
+        pi->mode = m;
+        w0(pi->saved_r0);
+        w2(pi->saved_r2);
+
+        return 5;
+}
+
+
+static void on26_read_block( PIA *pi, char * buf, int count )
+
+{       int     k, a, b;
+
+        switch (pi->mode) {
+
+        case 0: w0(1); P1; w0(1); P2; w0(2); P1; w0(0x18); P2; w0(0); P1;
+		udelay(10);
+		for (k=0;k<count;k++) {
+                        w2(6); a = r1();
+                        w2(4); b = r1();
+                        buf[k] = j44(a,b);
+                }
+		w0(2); P1; w0(8); P2; 
+                break;
+
+        case 1: w0(1); P1; w0(1); P2; w0(2); P1; w0(0x19); P2; w0(0); P1;
+		udelay(10);
+                for (k=0;k<count/2;k++) {
+                        w2(0x26); buf[2*k] = r0();  
+			w2(0x24); buf[2*k+1] = r0();
+                }
+                w0(2); P1; w0(9); P2;
+                break;
+
+        case 2: w3(1); w3(1); w2(5); w4(1); w2(4);
+		w3(0); w3(0); w2(0x24);
+		udelay(10);
+                for (k=0;k<count;k++) buf[k] = r4();
+                w2(4);
+                break;
+
+        case 3: w3(1); w3(1); w2(5); w4(1); w2(4);
+                w3(0); w3(0); w2(0x24);
+                udelay(10);
+                for (k=0;k<count/2;k++) ((u16 *)buf)[k] = r4w();
+                w2(4);
+                break;
+
+        case 4: w3(1); w3(1); w2(5); w4(1); w2(4);
+                w3(0); w3(0); w2(0x24);
+                udelay(10);
+                for (k=0;k<count/4;k++) ((u32 *)buf)[k] = r4l();
+                w2(4);
+                break;
+
+        }
+}
+
+static void on26_write_block( PIA *pi, char * buf, int count )
+
+{       int	k;
+
+        switch (pi->mode) {
+
+        case 0: 
+        case 1: w0(1); P1; w0(1); P2; 
+		w0(2); P1; w0(0x18+pi->mode); P2; w0(0); P1;
+		udelay(10);
+		for (k=0;k<count/2;k++) {
+                        w2(5); w0(buf[2*k]); 
+			w2(7); w0(buf[2*k+1]);
+                }
+                w2(5); w2(4);
+		w0(2); P1; w0(8+pi->mode); P2;
+                break;
+
+        case 2: w3(1); w3(1); w2(5); w4(1); w2(4);
+		w3(0); w3(0); w2(0xc5);
+		udelay(10);
+                for (k=0;k<count;k++) w4(buf[k]);
+		w2(0xc4);
+                break;
+
+        case 3: w3(1); w3(1); w2(5); w4(1); w2(4);
+                w3(0); w3(0); w2(0xc5);
+                udelay(10);
+                for (k=0;k<count/2;k++) w4w(((u16 *)buf)[k]);
+                w2(0xc4);
+                break;
+
+        case 4: w3(1); w3(1); w2(5); w4(1); w2(4);
+                w3(0); w3(0); w2(0xc5);
+                udelay(10);
+                for (k=0;k<count/4;k++) w4l(((u32 *)buf)[k]);
+                w2(0xc4);
+                break;
+
+        }
+
+}
+
+static void on26_log_adapter( PIA *pi, char * scratch, int verbose )
+
+{       char    *mode_string[5] = {"4-bit","8-bit","EPP-8",
+				   "EPP-16","EPP-32"};
+
+        printk("%s: on26 %s, OnSpec 90c26 at 0x%x, ",
+                pi->device,ON26_VERSION,pi->port);
+        printk("mode %d (%s), delay %d\n",pi->mode,
+		mode_string[pi->mode],pi->delay);
+
+}
+
+static struct pi_protocol on26 = {
+	.owner		= THIS_MODULE,
+	.name		= "on26",
+	.max_mode	= 5,
+	.epp_first	= 2,
+	.default_delay	= 1,
+	.max_units	= 1,
+	.write_regr	= on26_write_regr,
+	.read_regr	= on26_read_regr,
+	.write_block	= on26_write_block,
+	.read_block	= on26_read_block,
+	.connect	= on26_connect,
+	.disconnect	= on26_disconnect,
+	.test_port	= on26_test_port,
+	.log_adapter	= on26_log_adapter,
+};
+
+static int __init on26_init(void)
+{
+	return paride_register(&on26);
+}
+
+static void __exit on26_exit(void)
+{
+	paride_unregister(&on26);
+}
+
+MODULE_LICENSE("GPL");
+module_init(on26_init)
+module_exit(on26_exit)
diff --git a/drivers/block/paride/paride.c b/drivers/block/paride/paride.c
new file mode 100644
index 000000000..0e287993b
--- /dev/null
+++ b/drivers/block/paride/paride.c
@@ -0,0 +1,479 @@
+/* 
+        paride.c  (c) 1997-8  Grant R. Guenther <grant@torque.net>
+                              Under the terms of the GNU General Public License.
+
+	This is the base module for the family of device drivers
+        that support parallel port IDE devices.  
+
+*/
+
+/* Changes:
+
+	1.01	GRG 1998.05.03	Use spinlocks
+	1.02	GRG 1998.05.05  init_proto, release_proto, ktti
+	1.03	GRG 1998.08.15  eliminate compiler warning
+	1.04    GRG 1998.11.28  added support for FRIQ 
+	1.05    TMW 2000.06.06  use parport_find_number instead of
+				parport_enumerate
+	1.06    TMW 2001.03.26  more sane parport-or-not resource management
+*/
+
+#define PI_VERSION      "1.06"
+
+#include <linux/module.h>
+#include <linux/kmod.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/ioport.h>
+#include <linux/string.h>
+#include <linux/spinlock.h>
+#include <linux/wait.h>
+#include <linux/sched.h>	/* TASK_* */
+#include <linux/parport.h>
+#include <linux/slab.h>
+
+#include "paride.h"
+
+MODULE_LICENSE("GPL");
+
+#define MAX_PROTOS	32
+
+static struct pi_protocol *protocols[MAX_PROTOS];
+
+static DEFINE_SPINLOCK(pi_spinlock);
+
+void pi_write_regr(PIA * pi, int cont, int regr, int val)
+{
+	pi->proto->write_regr(pi, cont, regr, val);
+}
+
+EXPORT_SYMBOL(pi_write_regr);
+
+int pi_read_regr(PIA * pi, int cont, int regr)
+{
+	return pi->proto->read_regr(pi, cont, regr);
+}
+
+EXPORT_SYMBOL(pi_read_regr);
+
+void pi_write_block(PIA * pi, char *buf, int count)
+{
+	pi->proto->write_block(pi, buf, count);
+}
+
+EXPORT_SYMBOL(pi_write_block);
+
+void pi_read_block(PIA * pi, char *buf, int count)
+{
+	pi->proto->read_block(pi, buf, count);
+}
+
+EXPORT_SYMBOL(pi_read_block);
+
+static void pi_wake_up(void *p)
+{
+	PIA *pi = (PIA *) p;
+	unsigned long flags;
+	void (*cont) (void) = NULL;
+
+	spin_lock_irqsave(&pi_spinlock, flags);
+
+	if (pi->claim_cont && !parport_claim(pi->pardev)) {
+		cont = pi->claim_cont;
+		pi->claim_cont = NULL;
+		pi->claimed = 1;
+	}
+
+	spin_unlock_irqrestore(&pi_spinlock, flags);
+
+	wake_up(&(pi->parq));
+
+	if (cont)
+		cont();
+}
+
+int pi_schedule_claimed(PIA * pi, void (*cont) (void))
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&pi_spinlock, flags);
+	if (pi->pardev && parport_claim(pi->pardev)) {
+		pi->claim_cont = cont;
+		spin_unlock_irqrestore(&pi_spinlock, flags);
+		return 0;
+	}
+	pi->claimed = 1;
+	spin_unlock_irqrestore(&pi_spinlock, flags);
+	return 1;
+}
+EXPORT_SYMBOL(pi_schedule_claimed);
+
+void pi_do_claimed(PIA * pi, void (*cont) (void))
+{
+	if (pi_schedule_claimed(pi, cont))
+		cont();
+}
+
+EXPORT_SYMBOL(pi_do_claimed);
+
+static void pi_claim(PIA * pi)
+{
+	if (pi->claimed)
+		return;
+	pi->claimed = 1;
+	if (pi->pardev)
+		wait_event(pi->parq,
+			   !parport_claim((struct pardevice *) pi->pardev));
+}
+
+static void pi_unclaim(PIA * pi)
+{
+	pi->claimed = 0;
+	if (pi->pardev)
+		parport_release((struct pardevice *) (pi->pardev));
+}
+
+void pi_connect(PIA * pi)
+{
+	pi_claim(pi);
+	pi->proto->connect(pi);
+}
+
+EXPORT_SYMBOL(pi_connect);
+
+void pi_disconnect(PIA * pi)
+{
+	pi->proto->disconnect(pi);
+	pi_unclaim(pi);
+}
+
+EXPORT_SYMBOL(pi_disconnect);
+
+static void pi_unregister_parport(PIA * pi)
+{
+	if (pi->pardev) {
+		parport_unregister_device((struct pardevice *) (pi->pardev));
+		pi->pardev = NULL;
+	}
+}
+
+void pi_release(PIA * pi)
+{
+	pi_unregister_parport(pi);
+	if (pi->proto->release_proto)
+		pi->proto->release_proto(pi);
+	module_put(pi->proto->owner);
+}
+
+EXPORT_SYMBOL(pi_release);
+
+static int default_test_proto(PIA * pi, char *scratch, int verbose)
+{
+	int j, k;
+	int e[2] = { 0, 0 };
+
+	pi->proto->connect(pi);
+
+	for (j = 0; j < 2; j++) {
+		pi_write_regr(pi, 0, 6, 0xa0 + j * 0x10);
+		for (k = 0; k < 256; k++) {
+			pi_write_regr(pi, 0, 2, k ^ 0xaa);
+			pi_write_regr(pi, 0, 3, k ^ 0x55);
+			if (pi_read_regr(pi, 0, 2) != (k ^ 0xaa))
+				e[j]++;
+		}
+	}
+	pi->proto->disconnect(pi);
+
+	if (verbose)
+		printk("%s: %s: port 0x%x, mode  %d, test=(%d,%d)\n",
+		       pi->device, pi->proto->name, pi->port,
+		       pi->mode, e[0], e[1]);
+
+	return (e[0] && e[1]);	/* not here if both > 0 */
+}
+
+static int pi_test_proto(PIA * pi, char *scratch, int verbose)
+{
+	int res;
+
+	pi_claim(pi);
+	if (pi->proto->test_proto)
+		res = pi->proto->test_proto(pi, scratch, verbose);
+	else
+		res = default_test_proto(pi, scratch, verbose);
+	pi_unclaim(pi);
+
+	return res;
+}
+
+int paride_register(PIP * pr)
+{
+	int k;
+
+	for (k = 0; k < MAX_PROTOS; k++)
+		if (protocols[k] && !strcmp(pr->name, protocols[k]->name)) {
+			printk("paride: %s protocol already registered\n",
+			       pr->name);
+			return -1;
+		}
+	k = 0;
+	while ((k < MAX_PROTOS) && (protocols[k]))
+		k++;
+	if (k == MAX_PROTOS) {
+		printk("paride: protocol table full\n");
+		return -1;
+	}
+	protocols[k] = pr;
+	pr->index = k;
+	printk("paride: %s registered as protocol %d\n", pr->name, k);
+	return 0;
+}
+
+EXPORT_SYMBOL(paride_register);
+
+void paride_unregister(PIP * pr)
+{
+	if (!pr)
+		return;
+	if (protocols[pr->index] != pr) {
+		printk("paride: %s not registered\n", pr->name);
+		return;
+	}
+	protocols[pr->index] = NULL;
+}
+
+EXPORT_SYMBOL(paride_unregister);
+
+static int pi_register_parport(PIA *pi, int verbose, int unit)
+{
+	struct parport *port;
+	struct pardev_cb par_cb;
+
+	port = parport_find_base(pi->port);
+	if (!port)
+		return 0;
+	memset(&par_cb, 0, sizeof(par_cb));
+	par_cb.wakeup = pi_wake_up;
+	par_cb.private = (void *)pi;
+	pi->pardev = parport_register_dev_model(port, pi->device, &par_cb,
+						unit);
+	parport_put_port(port);
+	if (!pi->pardev)
+		return 0;
+
+	init_waitqueue_head(&pi->parq);
+
+	if (verbose)
+		printk("%s: 0x%x is %s\n", pi->device, pi->port, port->name);
+
+	pi->parname = (char *) port->name;
+
+	return 1;
+}
+
+static int pi_probe_mode(PIA * pi, int max, char *scratch, int verbose)
+{
+	int best, range;
+
+	if (pi->mode != -1) {
+		if (pi->mode >= max)
+			return 0;
+		range = 3;
+		if (pi->mode >= pi->proto->epp_first)
+			range = 8;
+		if ((range == 8) && (pi->port % 8))
+			return 0;
+		pi->reserved = range;
+		return (!pi_test_proto(pi, scratch, verbose));
+	}
+	best = -1;
+	for (pi->mode = 0; pi->mode < max; pi->mode++) {
+		range = 3;
+		if (pi->mode >= pi->proto->epp_first)
+			range = 8;
+		if ((range == 8) && (pi->port % 8))
+			break;
+		pi->reserved = range;
+		if (!pi_test_proto(pi, scratch, verbose))
+			best = pi->mode;
+	}
+	pi->mode = best;
+	return (best > -1);
+}
+
+static int pi_probe_unit(PIA * pi, int unit, char *scratch, int verbose)
+{
+	int max, s, e;
+
+	s = unit;
+	e = s + 1;
+
+	if (s == -1) {
+		s = 0;
+		e = pi->proto->max_units;
+	}
+
+	if (!pi_register_parport(pi, verbose, s))
+		return 0;
+
+	if (pi->proto->test_port) {
+		pi_claim(pi);
+		max = pi->proto->test_port(pi);
+		pi_unclaim(pi);
+	} else
+		max = pi->proto->max_mode;
+
+	if (pi->proto->probe_unit) {
+		pi_claim(pi);
+		for (pi->unit = s; pi->unit < e; pi->unit++)
+			if (pi->proto->probe_unit(pi)) {
+				pi_unclaim(pi);
+				if (pi_probe_mode(pi, max, scratch, verbose))
+					return 1;
+				pi_unregister_parport(pi);
+				return 0;
+			}
+		pi_unclaim(pi);
+		pi_unregister_parport(pi);
+		return 0;
+	}
+
+	if (!pi_probe_mode(pi, max, scratch, verbose)) {
+		pi_unregister_parport(pi);
+		return 0;
+	}
+	return 1;
+
+}
+
+int pi_init(PIA * pi, int autoprobe, int port, int mode,
+	int unit, int protocol, int delay, char *scratch,
+	int devtype, int verbose, char *device)
+{
+	int p, k, s, e;
+	int lpts[7] = { 0x3bc, 0x378, 0x278, 0x268, 0x27c, 0x26c, 0 };
+
+	s = protocol;
+	e = s + 1;
+
+	if (!protocols[0])
+		request_module("paride_protocol");
+
+	if (autoprobe) {
+		s = 0;
+		e = MAX_PROTOS;
+	} else if ((s < 0) || (s >= MAX_PROTOS) || (port <= 0) ||
+		   (!protocols[s]) || (unit < 0) ||
+		   (unit >= protocols[s]->max_units)) {
+		printk("%s: Invalid parameters\n", device);
+		return 0;
+	}
+
+	for (p = s; p < e; p++) {
+		struct pi_protocol *proto = protocols[p];
+		if (!proto)
+			continue;
+		/* still racy */
+		if (!try_module_get(proto->owner))
+			continue;
+		pi->proto = proto;
+		pi->private = 0;
+		if (proto->init_proto && proto->init_proto(pi) < 0) {
+			pi->proto = NULL;
+			module_put(proto->owner);
+			continue;
+		}
+		if (delay == -1)
+			pi->delay = pi->proto->default_delay;
+		else
+			pi->delay = delay;
+		pi->devtype = devtype;
+		pi->device = device;
+
+		pi->parname = NULL;
+		pi->pardev = NULL;
+		init_waitqueue_head(&pi->parq);
+		pi->claimed = 0;
+		pi->claim_cont = NULL;
+
+		pi->mode = mode;
+		if (port != -1) {
+			pi->port = port;
+			if (pi_probe_unit(pi, unit, scratch, verbose))
+				break;
+			pi->port = 0;
+		} else {
+			k = 0;
+			while ((pi->port = lpts[k++]))
+				if (pi_probe_unit
+				    (pi, unit, scratch, verbose))
+					break;
+			if (pi->port)
+				break;
+		}
+		if (pi->proto->release_proto)
+			pi->proto->release_proto(pi);
+		module_put(proto->owner);
+	}
+
+	if (!pi->port) {
+		if (autoprobe)
+			printk("%s: Autoprobe failed\n", device);
+		else
+			printk("%s: Adapter not found\n", device);
+		return 0;
+	}
+
+	if (pi->parname)
+		printk("%s: Sharing %s at 0x%x\n", pi->device,
+		       pi->parname, pi->port);
+
+	pi->proto->log_adapter(pi, scratch, verbose);
+
+	return 1;
+}
+
+EXPORT_SYMBOL(pi_init);
+
+static int pi_probe(struct pardevice *par_dev)
+{
+	struct device_driver *drv = par_dev->dev.driver;
+	int len = strlen(drv->name);
+
+	if (strncmp(par_dev->name, drv->name, len))
+		return -ENODEV;
+
+	return 0;
+}
+
+void *pi_register_driver(char *name)
+{
+	struct parport_driver *parp_drv;
+	int ret;
+
+	parp_drv = kzalloc(sizeof(*parp_drv), GFP_KERNEL);
+	if (!parp_drv)
+		return NULL;
+
+	parp_drv->name = name;
+	parp_drv->probe = pi_probe;
+	parp_drv->devmodel = true;
+
+	ret = parport_register_driver(parp_drv);
+	if (ret) {
+		kfree(parp_drv);
+		return NULL;
+	}
+	return (void *)parp_drv;
+}
+EXPORT_SYMBOL(pi_register_driver);
+
+void pi_unregister_driver(void *_drv)
+{
+	struct parport_driver *drv = _drv;
+
+	parport_unregister_driver(drv);
+	kfree(drv);
+}
+EXPORT_SYMBOL(pi_unregister_driver);
diff --git a/drivers/block/paride/paride.h b/drivers/block/paride/paride.h
new file mode 100644
index 000000000..ddb9e589d
--- /dev/null
+++ b/drivers/block/paride/paride.h
@@ -0,0 +1,172 @@
+#ifndef __DRIVERS_PARIDE_H__
+#define __DRIVERS_PARIDE_H__
+
+/* 
+	paride.h	(c) 1997-8  Grant R. Guenther <grant@torque.net>
+   		                    Under the terms of the GPL.
+
+   This file defines the interface between the high-level parallel
+   IDE device drivers (pd, pf, pcd, pt) and the adapter chips.
+
+*/
+
+/* Changes:
+
+	1.01	GRG 1998.05.05	init_proto, release_proto
+*/
+
+#define PARIDE_H_VERSION 	"1.01"
+
+/* Some adapters need to know what kind of device they are in
+
+   Values for devtype:
+*/
+
+#define	PI_PD	0	/* IDE disk */
+#define PI_PCD	1	/* ATAPI CDrom */
+#define PI_PF   2	/* ATAPI disk */
+#define PI_PT	3	/* ATAPI tape */
+#define PI_PG   4       /* ATAPI generic */
+
+/* The paride module contains no state, instead the drivers allocate
+   a pi_adapter data structure and pass it to paride in every operation.
+
+*/
+
+struct pi_adapter  {
+
+	struct pi_protocol *proto;   /* adapter protocol */
+	int	port;		     /* base address of parallel port */
+	int	mode;		     /* transfer mode in use */
+	int     delay;		     /* adapter delay setting */
+	int	devtype;	     /* device type: PI_PD etc. */
+	char    *device;	     /* name of driver */
+	int     unit;		     /* unit number for chained adapters */
+	int	saved_r0;	     /* saved port state */
+	int	saved_r2;	     /* saved port state */
+	int	reserved;	     /* number of ports reserved */
+	unsigned long	private;     /* for protocol module */
+
+	wait_queue_head_t parq;     /* semaphore for parport sharing */
+	void	*pardev;	     /* pointer to pardevice */
+	char	*parname;	     /* parport name */
+	int	claimed;	     /* parport has already been claimed */
+	void (*claim_cont)(void);    /* continuation for parport wait */
+};
+
+typedef struct pi_adapter PIA;
+
+/* functions exported by paride to the high level drivers */
+
+extern int pi_init(PIA *pi, 
+	int autoprobe,		/* 1 to autoprobe */
+	int port, 		/* base port address */
+	int mode, 		/* -1 for autoprobe */
+	int unit,		/* unit number, if supported */
+	int protocol, 		/* protocol to use */
+	int delay, 		/* -1 to use adapter specific default */
+	char * scratch, 	/* address of 512 byte buffer */
+	int devtype,		/* device type: PI_PD, PI_PCD, etc ... */
+	int verbose,		/* log verbose data while probing */
+	char *device		/* name of the driver */
+	);			/* returns 0 on failure, 1 on success */
+
+extern void pi_release(PIA *pi);
+
+/* registers are addressed as (cont,regr)
+
+       	cont: 0 for command register file, 1 for control register(s)
+	regr: 0-7 for register number.
+
+*/
+
+extern void pi_write_regr(PIA *pi, int cont, int regr, int val);
+
+extern int pi_read_regr(PIA *pi, int cont, int regr);
+
+extern void pi_write_block(PIA *pi, char * buf, int count);
+
+extern void pi_read_block(PIA *pi, char * buf, int count);
+
+extern void pi_connect(PIA *pi);
+
+extern void pi_disconnect(PIA *pi);
+
+extern void pi_do_claimed(PIA *pi, void (*cont)(void));
+extern int pi_schedule_claimed(PIA *pi, void (*cont)(void));
+
+/* macros and functions exported to the protocol modules */
+
+#define delay_p			(pi->delay?udelay(pi->delay):(void)0)
+#define out_p(offs,byte)	outb(byte,pi->port+offs); delay_p;
+#define in_p(offs)		(delay_p,inb(pi->port+offs))
+
+#define w0(byte)                {out_p(0,byte);}
+#define r0()                    (in_p(0) & 0xff)
+#define w1(byte)                {out_p(1,byte);}
+#define r1()                    (in_p(1) & 0xff)
+#define w2(byte)                {out_p(2,byte);}
+#define r2()                    (in_p(2) & 0xff)
+#define w3(byte)                {out_p(3,byte);}
+#define w4(byte)                {out_p(4,byte);}
+#define r4()                    (in_p(4) & 0xff)
+#define w4w(data)     		{outw(data,pi->port+4); delay_p;}
+#define w4l(data)     		{outl(data,pi->port+4); delay_p;}
+#define r4w()         		(delay_p,inw(pi->port+4)&0xffff)
+#define r4l()         		(delay_p,inl(pi->port+4)&0xffffffff)
+
+static inline u16 pi_swab16( char *b, int k)
+
+{ 	union { u16 u; char t[2]; } r;
+
+	r.t[0]=b[2*k+1]; r.t[1]=b[2*k];
+        return r.u;
+}
+
+static inline u32 pi_swab32( char *b, int k)
+
+{ 	union { u32 u; char f[4]; } r;
+
+	r.f[0]=b[4*k+1]; r.f[1]=b[4*k];
+	r.f[2]=b[4*k+3]; r.f[3]=b[4*k+2];
+        return r.u;
+}
+
+struct pi_protocol {
+
+	char	name[8];	/* name for this protocol */
+	int	index;		/* index into protocol table */
+
+	int	max_mode;	/* max mode number */
+	int	epp_first;	/* modes >= this use 8 ports */
+	
+	int	default_delay;  /* delay parameter if not specified */
+	int	max_units;	/* max chained units probed for */
+
+	void (*write_regr)(PIA *,int,int,int);
+	int  (*read_regr)(PIA *,int,int);
+	void (*write_block)(PIA *,char *,int);
+	void (*read_block)(PIA *,char *,int);
+
+	void (*connect)(PIA *);
+	void (*disconnect)(PIA *);
+	
+	int  (*test_port)(PIA *);
+	int  (*probe_unit)(PIA *);
+	int  (*test_proto)(PIA *,char *,int);
+	void (*log_adapter)(PIA *,char *,int);
+	
+	int (*init_proto)(PIA *);
+	void (*release_proto)(PIA *);
+	struct module *owner;
+};
+
+typedef struct pi_protocol PIP;
+
+extern int paride_register( PIP * );
+extern void paride_unregister ( PIP * );
+void *pi_register_driver(char *);
+void pi_unregister_driver(void *);
+
+#endif /* __DRIVERS_PARIDE_H__ */
+/* end of paride.h */
diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c
new file mode 100644
index 000000000..a5ab40784
--- /dev/null
+++ b/drivers/block/paride/pcd.c
@@ -0,0 +1,1042 @@
+/* 
+	pcd.c	(c) 1997-8  Grant R. Guenther <grant@torque.net>
+		            Under the terms of the GNU General Public License.
+
+	This is a high-level driver for parallel port ATAPI CD-ROM
+        drives based on chips supported by the paride module.
+
+        By default, the driver will autoprobe for a single parallel
+        port ATAPI CD-ROM drive, but if their individual parameters are
+        specified, the driver can handle up to 4 drives.
+
+        The behaviour of the pcd driver can be altered by setting
+        some parameters from the insmod command line.  The following
+        parameters are adjustable:
+
+            drive0      These four arguments can be arrays of       
+            drive1      1-6 integers as follows:
+            drive2
+            drive3      <prt>,<pro>,<uni>,<mod>,<slv>,<dly>
+
+                        Where,
+
+                <prt>   is the base of the parallel port address for
+                        the corresponding drive.  (required)
+
+                <pro>   is the protocol number for the adapter that
+                        supports this drive.  These numbers are
+                        logged by 'paride' when the protocol modules
+                        are initialised.  (0 if not given)
+
+                <uni>   for those adapters that support chained
+                        devices, this is the unit selector for the
+                        chain of devices on the given port.  It should
+                        be zero for devices that don't support chaining.
+                        (0 if not given)
+
+                <mod>   this can be -1 to choose the best mode, or one
+                        of the mode numbers supported by the adapter.
+                        (-1 if not given)
+
+		<slv>   ATAPI CD-ROMs can be jumpered to master or slave.
+			Set this to 0 to choose the master drive, 1 to
+                        choose the slave, -1 (the default) to choose the
+			first drive found.
+
+                <dly>   some parallel ports require the driver to 
+                        go more slowly.  -1 sets a default value that
+                        should work with the chosen protocol.  Otherwise,
+                        set this to a small integer, the larger it is
+                        the slower the port i/o.  In some cases, setting
+                        this to zero will speed up the device. (default -1)
+                        
+            major       You may use this parameter to override the
+                        default major number (46) that this driver
+                        will use.  Be sure to change the device
+                        name as well.
+
+            name        This parameter is a character string that
+                        contains the name the kernel will use for this
+                        device (in /proc output, for instance).
+                        (default "pcd")
+
+            verbose     This parameter controls the amount of logging
+                        that the driver will do.  Set it to 0 for
+                        normal operation, 1 to see autoprobe progress
+                        messages, or 2 to see additional debugging
+                        output.  (default 0)
+  
+            nice        This parameter controls the driver's use of
+                        idle CPU time, at the expense of some speed.
+ 
+	If this driver is built into the kernel, you can use the
+        following kernel command line parameters, with the same values
+        as the corresponding module parameters listed above:
+
+	    pcd.drive0
+	    pcd.drive1
+	    pcd.drive2
+	    pcd.drive3
+	    pcd.nice
+
+        In addition, you can use the parameter pcd.disable to disable
+        the driver entirely.
+
+*/
+
+/* Changes:
+
+	1.01	GRG 1998.01.24	Added test unit ready support
+	1.02    GRG 1998.05.06  Changes to pcd_completion, ready_wait,
+				and loosen interpretation of ATAPI
+			        standard for clearing error status.
+				Use spinlocks. Eliminate sti().
+	1.03    GRG 1998.06.16  Eliminated an Ugh
+	1.04	GRG 1998.08.15  Added extra debugging, improvements to
+				pcd_completion, use HZ in loop timing
+	1.05	GRG 1998.08.16	Conformed to "Uniform CD-ROM" standard
+	1.06    GRG 1998.08.19  Added audio ioctl support
+	1.07    GRG 1998.09.24  Increased reset timeout, added jumbo support
+
+*/
+
+#define	PCD_VERSION	"1.07"
+#define PCD_MAJOR	46
+#define PCD_NAME	"pcd"
+#define PCD_UNITS	4
+
+/* Here are things one can override from the insmod command.
+   Most are autoprobed by paride unless set here.  Verbose is off
+   by default.
+
+*/
+
+static int verbose = 0;
+static int major = PCD_MAJOR;
+static char *name = PCD_NAME;
+static int nice = 0;
+static int disable = 0;
+
+static int drive0[6] = { 0, 0, 0, -1, -1, -1 };
+static int drive1[6] = { 0, 0, 0, -1, -1, -1 };
+static int drive2[6] = { 0, 0, 0, -1, -1, -1 };
+static int drive3[6] = { 0, 0, 0, -1, -1, -1 };
+
+static int (*drives[4])[6] = {&drive0, &drive1, &drive2, &drive3};
+static int pcd_drive_count;
+
+enum {D_PRT, D_PRO, D_UNI, D_MOD, D_SLV, D_DLY};
+
+/* end of parameters */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/delay.h>
+#include <linux/cdrom.h>
+#include <linux/spinlock.h>
+#include <linux/blk-mq.h>
+#include <linux/mutex.h>
+#include <linux/uaccess.h>
+
+static DEFINE_MUTEX(pcd_mutex);
+static DEFINE_SPINLOCK(pcd_lock);
+
+module_param(verbose, int, 0644);
+module_param(major, int, 0);
+module_param(name, charp, 0);
+module_param(nice, int, 0);
+module_param_array(drive0, int, NULL, 0);
+module_param_array(drive1, int, NULL, 0);
+module_param_array(drive2, int, NULL, 0);
+module_param_array(drive3, int, NULL, 0);
+
+#include "paride.h"
+#include "pseudo.h"
+
+#define PCD_RETRIES	     5
+#define PCD_TMO		   800	/* timeout in jiffies */
+#define PCD_DELAY           50	/* spin delay in uS */
+#define PCD_READY_TMO	    20	/* in seconds */
+#define PCD_RESET_TMO	   100	/* in tenths of a second */
+
+#define PCD_SPIN	(1000000*PCD_TMO)/(HZ*PCD_DELAY)
+
+#define IDE_ERR		0x01
+#define IDE_DRQ         0x08
+#define IDE_READY       0x40
+#define IDE_BUSY        0x80
+
+static int pcd_open(struct cdrom_device_info *cdi, int purpose);
+static void pcd_release(struct cdrom_device_info *cdi);
+static int pcd_drive_status(struct cdrom_device_info *cdi, int slot_nr);
+static unsigned int pcd_check_events(struct cdrom_device_info *cdi,
+				     unsigned int clearing, int slot_nr);
+static int pcd_tray_move(struct cdrom_device_info *cdi, int position);
+static int pcd_lock_door(struct cdrom_device_info *cdi, int lock);
+static int pcd_drive_reset(struct cdrom_device_info *cdi);
+static int pcd_get_mcn(struct cdrom_device_info *cdi, struct cdrom_mcn *mcn);
+static int pcd_audio_ioctl(struct cdrom_device_info *cdi,
+			   unsigned int cmd, void *arg);
+static int pcd_packet(struct cdrom_device_info *cdi,
+		      struct packet_command *cgc);
+
+static void do_pcd_read_drq(void);
+static blk_status_t pcd_queue_rq(struct blk_mq_hw_ctx *hctx,
+				 const struct blk_mq_queue_data *bd);
+static void do_pcd_read(void);
+
+struct pcd_unit {
+	struct pi_adapter pia;	/* interface to paride layer */
+	struct pi_adapter *pi;
+	int drive;		/* master/slave */
+	int last_sense;		/* result of last request sense */
+	int changed;		/* media change seen */
+	int present;		/* does this unit exist ? */
+	char *name;		/* pcd0, pcd1, etc */
+	struct cdrom_device_info info;	/* uniform cdrom interface */
+	struct gendisk *disk;
+	struct blk_mq_tag_set tag_set;
+	struct list_head rq_list;
+};
+
+static struct pcd_unit pcd[PCD_UNITS];
+
+static char pcd_scratch[64];
+static char pcd_buffer[2048];	/* raw block buffer */
+static int pcd_bufblk = -1;	/* block in buffer, in CD units,
+				   -1 for nothing there. See also
+				   pd_unit.
+				 */
+
+/* the variables below are used mainly in the I/O request engine, which
+   processes only one request at a time.
+*/
+
+static struct pcd_unit *pcd_current; /* current request's drive */
+static struct request *pcd_req;
+static int pcd_retries;		/* retries on current request */
+static int pcd_busy;		/* request being processed ? */
+static int pcd_sector;		/* address of next requested sector */
+static int pcd_count;		/* number of blocks still to do */
+static char *pcd_buf;		/* buffer for request in progress */
+static void *par_drv;		/* reference of parport driver */
+
+/* kernel glue structures */
+
+static int pcd_block_open(struct block_device *bdev, fmode_t mode)
+{
+	struct pcd_unit *cd = bdev->bd_disk->private_data;
+	int ret;
+
+	bdev_check_media_change(bdev);
+
+	mutex_lock(&pcd_mutex);
+	ret = cdrom_open(&cd->info, bdev, mode);
+	mutex_unlock(&pcd_mutex);
+
+	return ret;
+}
+
+static void pcd_block_release(struct gendisk *disk, fmode_t mode)
+{
+	struct pcd_unit *cd = disk->private_data;
+	mutex_lock(&pcd_mutex);
+	cdrom_release(&cd->info, mode);
+	mutex_unlock(&pcd_mutex);
+}
+
+static int pcd_block_ioctl(struct block_device *bdev, fmode_t mode,
+				unsigned cmd, unsigned long arg)
+{
+	struct pcd_unit *cd = bdev->bd_disk->private_data;
+	int ret;
+
+	mutex_lock(&pcd_mutex);
+	ret = cdrom_ioctl(&cd->info, bdev, mode, cmd, arg);
+	mutex_unlock(&pcd_mutex);
+
+	return ret;
+}
+
+static unsigned int pcd_block_check_events(struct gendisk *disk,
+					   unsigned int clearing)
+{
+	struct pcd_unit *cd = disk->private_data;
+	return cdrom_check_events(&cd->info, clearing);
+}
+
+static const struct block_device_operations pcd_bdops = {
+	.owner		= THIS_MODULE,
+	.open		= pcd_block_open,
+	.release	= pcd_block_release,
+	.ioctl		= pcd_block_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= blkdev_compat_ptr_ioctl,
+#endif
+	.check_events	= pcd_block_check_events,
+};
+
+static const struct cdrom_device_ops pcd_dops = {
+	.open		= pcd_open,
+	.release	= pcd_release,
+	.drive_status	= pcd_drive_status,
+	.check_events	= pcd_check_events,
+	.tray_move	= pcd_tray_move,
+	.lock_door	= pcd_lock_door,
+	.get_mcn	= pcd_get_mcn,
+	.reset		= pcd_drive_reset,
+	.audio_ioctl	= pcd_audio_ioctl,
+	.generic_packet	= pcd_packet,
+	.capability	= CDC_CLOSE_TRAY | CDC_OPEN_TRAY | CDC_LOCK |
+			  CDC_MCN | CDC_MEDIA_CHANGED | CDC_RESET |
+			  CDC_PLAY_AUDIO | CDC_GENERIC_PACKET | CDC_CD_R |
+			  CDC_CD_RW,
+};
+
+static const struct blk_mq_ops pcd_mq_ops = {
+	.queue_rq	= pcd_queue_rq,
+};
+
+static int pcd_open(struct cdrom_device_info *cdi, int purpose)
+{
+	struct pcd_unit *cd = cdi->handle;
+	if (!cd->present)
+		return -ENODEV;
+	return 0;
+}
+
+static void pcd_release(struct cdrom_device_info *cdi)
+{
+}
+
+static inline int status_reg(struct pcd_unit *cd)
+{
+	return pi_read_regr(cd->pi, 1, 6);
+}
+
+static inline int read_reg(struct pcd_unit *cd, int reg)
+{
+	return pi_read_regr(cd->pi, 0, reg);
+}
+
+static inline void write_reg(struct pcd_unit *cd, int reg, int val)
+{
+	pi_write_regr(cd->pi, 0, reg, val);
+}
+
+static int pcd_wait(struct pcd_unit *cd, int go, int stop, char *fun, char *msg)
+{
+	int j, r, e, s, p;
+
+	j = 0;
+	while ((((r = status_reg(cd)) & go) || (stop && (!(r & stop))))
+	       && (j++ < PCD_SPIN))
+		udelay(PCD_DELAY);
+
+	if ((r & (IDE_ERR & stop)) || (j > PCD_SPIN)) {
+		s = read_reg(cd, 7);
+		e = read_reg(cd, 1);
+		p = read_reg(cd, 2);
+		if (j > PCD_SPIN)
+			e |= 0x100;
+		if (fun)
+			printk("%s: %s %s: alt=0x%x stat=0x%x err=0x%x"
+			       " loop=%d phase=%d\n",
+			       cd->name, fun, msg, r, s, e, j, p);
+		return (s << 8) + r;
+	}
+	return 0;
+}
+
+static int pcd_command(struct pcd_unit *cd, char *cmd, int dlen, char *fun)
+{
+	pi_connect(cd->pi);
+
+	write_reg(cd, 6, 0xa0 + 0x10 * cd->drive);
+
+	if (pcd_wait(cd, IDE_BUSY | IDE_DRQ, 0, fun, "before command")) {
+		pi_disconnect(cd->pi);
+		return -1;
+	}
+
+	write_reg(cd, 4, dlen % 256);
+	write_reg(cd, 5, dlen / 256);
+	write_reg(cd, 7, 0xa0);	/* ATAPI packet command */
+
+	if (pcd_wait(cd, IDE_BUSY, IDE_DRQ, fun, "command DRQ")) {
+		pi_disconnect(cd->pi);
+		return -1;
+	}
+
+	if (read_reg(cd, 2) != 1) {
+		printk("%s: %s: command phase error\n", cd->name, fun);
+		pi_disconnect(cd->pi);
+		return -1;
+	}
+
+	pi_write_block(cd->pi, cmd, 12);
+
+	return 0;
+}
+
+static int pcd_completion(struct pcd_unit *cd, char *buf, char *fun)
+{
+	int r, d, p, n, k, j;
+
+	r = -1;
+	k = 0;
+	j = 0;
+
+	if (!pcd_wait(cd, IDE_BUSY, IDE_DRQ | IDE_READY | IDE_ERR,
+		      fun, "completion")) {
+		r = 0;
+		while (read_reg(cd, 7) & IDE_DRQ) {
+			d = read_reg(cd, 4) + 256 * read_reg(cd, 5);
+			n = (d + 3) & 0xfffc;
+			p = read_reg(cd, 2) & 3;
+
+			if ((p == 2) && (n > 0) && (j == 0)) {
+				pi_read_block(cd->pi, buf, n);
+				if (verbose > 1)
+					printk("%s: %s: Read %d bytes\n",
+					       cd->name, fun, n);
+				r = 0;
+				j++;
+			} else {
+				if (verbose > 1)
+					printk
+					    ("%s: %s: Unexpected phase %d, d=%d, k=%d\n",
+					     cd->name, fun, p, d, k);
+				if (verbose < 2)
+					printk_once(
+					    "%s: WARNING: ATAPI phase errors\n",
+					    cd->name);
+				mdelay(1);
+			}
+			if (k++ > PCD_TMO) {
+				printk("%s: Stuck DRQ\n", cd->name);
+				break;
+			}
+			if (pcd_wait
+			    (cd, IDE_BUSY, IDE_DRQ | IDE_READY | IDE_ERR, fun,
+			     "completion")) {
+				r = -1;
+				break;
+			}
+		}
+	}
+
+	pi_disconnect(cd->pi);
+
+	return r;
+}
+
+static void pcd_req_sense(struct pcd_unit *cd, char *fun)
+{
+	char rs_cmd[12] = { 0x03, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0 };
+	char buf[16];
+	int r, c;
+
+	r = pcd_command(cd, rs_cmd, 16, "Request sense");
+	mdelay(1);
+	if (!r)
+		pcd_completion(cd, buf, "Request sense");
+
+	cd->last_sense = -1;
+	c = 2;
+	if (!r) {
+		if (fun)
+			printk("%s: %s: Sense key: %x, ASC: %x, ASQ: %x\n",
+			       cd->name, fun, buf[2] & 0xf, buf[12], buf[13]);
+		c = buf[2] & 0xf;
+		cd->last_sense =
+		    c | ((buf[12] & 0xff) << 8) | ((buf[13] & 0xff) << 16);
+	}
+	if ((c == 2) || (c == 6))
+		cd->changed = 1;
+}
+
+static int pcd_atapi(struct pcd_unit *cd, char *cmd, int dlen, char *buf, char *fun)
+{
+	int r;
+
+	r = pcd_command(cd, cmd, dlen, fun);
+	mdelay(1);
+	if (!r)
+		r = pcd_completion(cd, buf, fun);
+	if (r)
+		pcd_req_sense(cd, fun);
+
+	return r;
+}
+
+static int pcd_packet(struct cdrom_device_info *cdi, struct packet_command *cgc)
+{
+	return pcd_atapi(cdi->handle, cgc->cmd, cgc->buflen, cgc->buffer,
+			 "generic packet");
+}
+
+#define DBMSG(msg)	((verbose>1)?(msg):NULL)
+
+static unsigned int pcd_check_events(struct cdrom_device_info *cdi,
+				     unsigned int clearing, int slot_nr)
+{
+	struct pcd_unit *cd = cdi->handle;
+	int res = cd->changed;
+	if (res)
+		cd->changed = 0;
+	return res ? DISK_EVENT_MEDIA_CHANGE : 0;
+}
+
+static int pcd_lock_door(struct cdrom_device_info *cdi, int lock)
+{
+	char un_cmd[12] = { 0x1e, 0, 0, 0, lock, 0, 0, 0, 0, 0, 0, 0 };
+
+	return pcd_atapi(cdi->handle, un_cmd, 0, pcd_scratch,
+			 lock ? "lock door" : "unlock door");
+}
+
+static int pcd_tray_move(struct cdrom_device_info *cdi, int position)
+{
+	char ej_cmd[12] = { 0x1b, 0, 0, 0, 3 - position, 0, 0, 0, 0, 0, 0, 0 };
+
+	return pcd_atapi(cdi->handle, ej_cmd, 0, pcd_scratch,
+			 position ? "eject" : "close tray");
+}
+
+static void pcd_sleep(int cs)
+{
+	schedule_timeout_interruptible(cs);
+}
+
+static int pcd_reset(struct pcd_unit *cd)
+{
+	int i, k, flg;
+	int expect[5] = { 1, 1, 1, 0x14, 0xeb };
+
+	pi_connect(cd->pi);
+	write_reg(cd, 6, 0xa0 + 0x10 * cd->drive);
+	write_reg(cd, 7, 8);
+
+	pcd_sleep(20 * HZ / 1000);	/* delay a bit */
+
+	k = 0;
+	while ((k++ < PCD_RESET_TMO) && (status_reg(cd) & IDE_BUSY))
+		pcd_sleep(HZ / 10);
+
+	flg = 1;
+	for (i = 0; i < 5; i++)
+		flg &= (read_reg(cd, i + 1) == expect[i]);
+
+	if (verbose) {
+		printk("%s: Reset (%d) signature = ", cd->name, k);
+		for (i = 0; i < 5; i++)
+			printk("%3x", read_reg(cd, i + 1));
+		if (!flg)
+			printk(" (incorrect)");
+		printk("\n");
+	}
+
+	pi_disconnect(cd->pi);
+	return flg - 1;
+}
+
+static int pcd_drive_reset(struct cdrom_device_info *cdi)
+{
+	return pcd_reset(cdi->handle);
+}
+
+static int pcd_ready_wait(struct pcd_unit *cd, int tmo)
+{
+	char tr_cmd[12] = { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+	int k, p;
+
+	k = 0;
+	while (k < tmo) {
+		cd->last_sense = 0;
+		pcd_atapi(cd, tr_cmd, 0, NULL, DBMSG("test unit ready"));
+		p = cd->last_sense;
+		if (!p)
+			return 0;
+		if (!(((p & 0xffff) == 0x0402) || ((p & 0xff) == 6)))
+			return p;
+		k++;
+		pcd_sleep(HZ);
+	}
+	return 0x000020;	/* timeout */
+}
+
+static int pcd_drive_status(struct cdrom_device_info *cdi, int slot_nr)
+{
+	char rc_cmd[12] = { 0x25, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+	struct pcd_unit *cd = cdi->handle;
+
+	if (pcd_ready_wait(cd, PCD_READY_TMO))
+		return CDS_DRIVE_NOT_READY;
+	if (pcd_atapi(cd, rc_cmd, 8, pcd_scratch, DBMSG("check media")))
+		return CDS_NO_DISC;
+	return CDS_DISC_OK;
+}
+
+static int pcd_identify(struct pcd_unit *cd)
+{
+	char id_cmd[12] = { 0x12, 0, 0, 0, 36, 0, 0, 0, 0, 0, 0, 0 };
+	char id[18];
+	int k, s;
+
+	pcd_bufblk = -1;
+
+	s = pcd_atapi(cd, id_cmd, 36, pcd_buffer, "identify");
+
+	if (s)
+		return -1;
+	if ((pcd_buffer[0] & 0x1f) != 5) {
+		if (verbose)
+			printk("%s: %s is not a CD-ROM\n",
+			       cd->name, cd->drive ? "Slave" : "Master");
+		return -1;
+	}
+	memcpy(id, pcd_buffer + 16, 16);
+	id[16] = 0;
+	k = 16;
+	while ((k >= 0) && (id[k] <= 0x20)) {
+		id[k] = 0;
+		k--;
+	}
+
+	printk("%s: %s: %s\n", cd->name, cd->drive ? "Slave" : "Master", id);
+
+	return 0;
+}
+
+/*
+ * returns 0, with id set if drive is detected, otherwise an error code.
+ */
+static int pcd_probe(struct pcd_unit *cd, int ms)
+{
+	if (ms == -1) {
+		for (cd->drive = 0; cd->drive <= 1; cd->drive++)
+			if (!pcd_reset(cd) && !pcd_identify(cd))
+				return 0;
+	} else {
+		cd->drive = ms;
+		if (!pcd_reset(cd) && !pcd_identify(cd))
+			return 0;
+	}
+	return -ENODEV;
+}
+
+static int pcd_probe_capabilities(struct pcd_unit *cd)
+{
+	char cmd[12] = { 0x5a, 1 << 3, 0x2a, 0, 0, 0, 0, 18, 0, 0, 0, 0 };
+	char buffer[32];
+	int ret;
+
+	ret = pcd_atapi(cd, cmd, 18, buffer, "mode sense capabilities");
+	if (ret)
+		return ret;
+
+	/* we should now have the cap page */
+	if ((buffer[11] & 1) == 0)
+		cd->info.mask |= CDC_CD_R;
+	if ((buffer[11] & 2) == 0)
+		cd->info.mask |= CDC_CD_RW;
+	if ((buffer[12] & 1) == 0)
+		cd->info.mask |= CDC_PLAY_AUDIO;
+	if ((buffer[14] & 1) == 0)
+		cd->info.mask |= CDC_LOCK;
+	if ((buffer[14] & 8) == 0)
+		cd->info.mask |= CDC_OPEN_TRAY;
+	if ((buffer[14] >> 6) == 0)
+		cd->info.mask |= CDC_CLOSE_TRAY;
+
+	return 0;
+}
+
+/* I/O request processing */
+static int pcd_queue;
+
+static int set_next_request(void)
+{
+	struct pcd_unit *cd;
+	int old_pos = pcd_queue;
+
+	do {
+		cd = &pcd[pcd_queue];
+		if (++pcd_queue == PCD_UNITS)
+			pcd_queue = 0;
+		if (cd->present && !list_empty(&cd->rq_list)) {
+			pcd_req = list_first_entry(&cd->rq_list, struct request,
+							queuelist);
+			list_del_init(&pcd_req->queuelist);
+			blk_mq_start_request(pcd_req);
+			break;
+		}
+	} while (pcd_queue != old_pos);
+
+	return pcd_req != NULL;
+}
+
+static void pcd_request(void)
+{
+	struct pcd_unit *cd;
+
+	if (pcd_busy)
+		return;
+
+	if (!pcd_req && !set_next_request())
+		return;
+
+	cd = pcd_req->q->disk->private_data;
+	if (cd != pcd_current)
+		pcd_bufblk = -1;
+	pcd_current = cd;
+	pcd_sector = blk_rq_pos(pcd_req);
+	pcd_count = blk_rq_cur_sectors(pcd_req);
+	pcd_buf = bio_data(pcd_req->bio);
+	pcd_busy = 1;
+	ps_set_intr(do_pcd_read, NULL, 0, nice);
+}
+
+static blk_status_t pcd_queue_rq(struct blk_mq_hw_ctx *hctx,
+				 const struct blk_mq_queue_data *bd)
+{
+	struct pcd_unit *cd = hctx->queue->queuedata;
+
+	if (rq_data_dir(bd->rq) != READ) {
+		blk_mq_start_request(bd->rq);
+		return BLK_STS_IOERR;
+	}
+
+	spin_lock_irq(&pcd_lock);
+	list_add_tail(&bd->rq->queuelist, &cd->rq_list);
+	pcd_request();
+	spin_unlock_irq(&pcd_lock);
+
+	return BLK_STS_OK;
+}
+
+static inline void next_request(blk_status_t err)
+{
+	unsigned long saved_flags;
+
+	spin_lock_irqsave(&pcd_lock, saved_flags);
+	if (!blk_update_request(pcd_req, err, blk_rq_cur_bytes(pcd_req))) {
+		__blk_mq_end_request(pcd_req, err);
+		pcd_req = NULL;
+	}
+	pcd_busy = 0;
+	pcd_request();
+	spin_unlock_irqrestore(&pcd_lock, saved_flags);
+}
+
+static int pcd_ready(void)
+{
+	return (((status_reg(pcd_current) & (IDE_BUSY | IDE_DRQ)) == IDE_DRQ));
+}
+
+static void pcd_transfer(void)
+{
+
+	while (pcd_count && (pcd_sector / 4 == pcd_bufblk)) {
+		int o = (pcd_sector % 4) * 512;
+		memcpy(pcd_buf, pcd_buffer + o, 512);
+		pcd_count--;
+		pcd_buf += 512;
+		pcd_sector++;
+	}
+}
+
+static void pcd_start(void)
+{
+	int b, i;
+	char rd_cmd[12] = { 0xa8, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0 };
+
+	pcd_bufblk = pcd_sector / 4;
+	b = pcd_bufblk;
+	for (i = 0; i < 4; i++) {
+		rd_cmd[5 - i] = b & 0xff;
+		b = b >> 8;
+	}
+
+	if (pcd_command(pcd_current, rd_cmd, 2048, "read block")) {
+		pcd_bufblk = -1;
+		next_request(BLK_STS_IOERR);
+		return;
+	}
+
+	mdelay(1);
+
+	ps_set_intr(do_pcd_read_drq, pcd_ready, PCD_TMO, nice);
+}
+
+static void do_pcd_read(void)
+{
+	pcd_busy = 1;
+	pcd_retries = 0;
+	pcd_transfer();
+	if (!pcd_count) {
+		next_request(0);
+		return;
+	}
+
+	pi_do_claimed(pcd_current->pi, pcd_start);
+}
+
+static void do_pcd_read_drq(void)
+{
+	unsigned long saved_flags;
+
+	if (pcd_completion(pcd_current, pcd_buffer, "read block")) {
+		if (pcd_retries < PCD_RETRIES) {
+			mdelay(1);
+			pcd_retries++;
+			pi_do_claimed(pcd_current->pi, pcd_start);
+			return;
+		}
+		pcd_bufblk = -1;
+		next_request(BLK_STS_IOERR);
+		return;
+	}
+
+	do_pcd_read();
+	spin_lock_irqsave(&pcd_lock, saved_flags);
+	pcd_request();
+	spin_unlock_irqrestore(&pcd_lock, saved_flags);
+}
+
+/* the audio_ioctl stuff is adapted from sr_ioctl.c */
+
+static int pcd_audio_ioctl(struct cdrom_device_info *cdi, unsigned int cmd, void *arg)
+{
+	struct pcd_unit *cd = cdi->handle;
+
+	switch (cmd) {
+
+	case CDROMREADTOCHDR:
+
+		{
+			char cmd[12] =
+			    { GPCMD_READ_TOC_PMA_ATIP, 0, 0, 0, 0, 0, 0, 0, 12,
+			 0, 0, 0 };
+			struct cdrom_tochdr *tochdr =
+			    (struct cdrom_tochdr *) arg;
+			char buffer[32];
+			int r;
+
+			r = pcd_atapi(cd, cmd, 12, buffer, "read toc header");
+
+			tochdr->cdth_trk0 = buffer[2];
+			tochdr->cdth_trk1 = buffer[3];
+
+			return r ? -EIO : 0;
+		}
+
+	case CDROMREADTOCENTRY:
+
+		{
+			char cmd[12] =
+			    { GPCMD_READ_TOC_PMA_ATIP, 0, 0, 0, 0, 0, 0, 0, 12,
+			 0, 0, 0 };
+
+			struct cdrom_tocentry *tocentry =
+			    (struct cdrom_tocentry *) arg;
+			unsigned char buffer[32];
+			int r;
+
+			cmd[1] =
+			    (tocentry->cdte_format == CDROM_MSF ? 0x02 : 0);
+			cmd[6] = tocentry->cdte_track;
+
+			r = pcd_atapi(cd, cmd, 12, buffer, "read toc entry");
+
+			tocentry->cdte_ctrl = buffer[5] & 0xf;
+			tocentry->cdte_adr = buffer[5] >> 4;
+			tocentry->cdte_datamode =
+			    (tocentry->cdte_ctrl & 0x04) ? 1 : 0;
+			if (tocentry->cdte_format == CDROM_MSF) {
+				tocentry->cdte_addr.msf.minute = buffer[9];
+				tocentry->cdte_addr.msf.second = buffer[10];
+				tocentry->cdte_addr.msf.frame = buffer[11];
+			} else
+				tocentry->cdte_addr.lba =
+				    (((((buffer[8] << 8) + buffer[9]) << 8)
+				      + buffer[10]) << 8) + buffer[11];
+
+			return r ? -EIO : 0;
+		}
+
+	default:
+
+		return -ENOSYS;
+	}
+}
+
+static int pcd_get_mcn(struct cdrom_device_info *cdi, struct cdrom_mcn *mcn)
+{
+	char cmd[12] =
+	    { GPCMD_READ_SUBCHANNEL, 0, 0x40, 2, 0, 0, 0, 0, 24, 0, 0, 0 };
+	char buffer[32];
+
+	if (pcd_atapi(cdi->handle, cmd, 24, buffer, "get mcn"))
+		return -EIO;
+
+	memcpy(mcn->medium_catalog_number, buffer + 9, 13);
+	mcn->medium_catalog_number[13] = 0;
+
+	return 0;
+}
+
+static int pcd_init_unit(struct pcd_unit *cd, bool autoprobe, int port,
+		int mode, int unit, int protocol, int delay, int ms)
+{
+	struct gendisk *disk;
+	int ret;
+
+	ret = blk_mq_alloc_sq_tag_set(&cd->tag_set, &pcd_mq_ops, 1,
+				      BLK_MQ_F_SHOULD_MERGE);
+	if (ret)
+		return ret;
+
+	disk = blk_mq_alloc_disk(&cd->tag_set, cd);
+	if (IS_ERR(disk)) {
+		ret = PTR_ERR(disk);
+		goto out_free_tag_set;
+	}
+
+	INIT_LIST_HEAD(&cd->rq_list);
+	blk_queue_bounce_limit(disk->queue, BLK_BOUNCE_HIGH);
+	cd->disk = disk;
+	cd->pi = &cd->pia;
+	cd->present = 0;
+	cd->last_sense = 0;
+	cd->changed = 1;
+	cd->drive = (*drives[cd - pcd])[D_SLV];
+
+	cd->name = &cd->info.name[0];
+	snprintf(cd->name, sizeof(cd->info.name), "%s%d", name, unit);
+	cd->info.ops = &pcd_dops;
+	cd->info.handle = cd;
+	cd->info.speed = 0;
+	cd->info.capacity = 1;
+	cd->info.mask = 0;
+	disk->major = major;
+	disk->first_minor = unit;
+	disk->minors = 1;
+	strcpy(disk->disk_name, cd->name);	/* umm... */
+	disk->fops = &pcd_bdops;
+	disk->flags |= GENHD_FL_NO_PART;
+	disk->events = DISK_EVENT_MEDIA_CHANGE;
+	disk->event_flags = DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE;
+
+	if (!pi_init(cd->pi, autoprobe, port, mode, unit, protocol, delay,
+			pcd_buffer, PI_PCD, verbose, cd->name)) {
+		ret = -ENODEV;
+		goto out_free_disk;
+	}
+	ret = pcd_probe(cd, ms);
+	if (ret)
+		goto out_pi_release;
+
+	cd->present = 1;
+	pcd_probe_capabilities(cd);
+	ret = register_cdrom(cd->disk, &cd->info);
+	if (ret)
+		goto out_pi_release;
+	ret = add_disk(cd->disk);
+	if (ret)
+		goto out_unreg_cdrom;
+	return 0;
+
+out_unreg_cdrom:
+	unregister_cdrom(&cd->info);
+out_pi_release:
+	pi_release(cd->pi);
+out_free_disk:
+	put_disk(cd->disk);
+out_free_tag_set:
+	blk_mq_free_tag_set(&cd->tag_set);
+	return ret;
+}
+
+static int __init pcd_init(void)
+{
+	int found = 0, unit;
+
+	if (disable)
+		return -EINVAL;
+
+	if (register_blkdev(major, name))
+		return -EBUSY;
+
+	pr_info("%s: %s version %s, major %d, nice %d\n",
+		name, name, PCD_VERSION, major, nice);
+
+	par_drv = pi_register_driver(name);
+	if (!par_drv) {
+		pr_err("failed to register %s driver\n", name);
+		goto out_unregister_blkdev;
+	}
+
+	for (unit = 0; unit < PCD_UNITS; unit++) {
+		if ((*drives[unit])[D_PRT])
+			pcd_drive_count++;
+	}
+
+	if (pcd_drive_count == 0) { /* nothing spec'd - so autoprobe for 1 */
+		if (!pcd_init_unit(pcd, 1, -1, -1, -1, -1, -1, -1))
+			found++;
+	} else {
+		for (unit = 0; unit < PCD_UNITS; unit++) {
+			struct pcd_unit *cd = &pcd[unit];
+			int *conf = *drives[unit];
+
+			if (!conf[D_PRT])
+				continue;
+			if (!pcd_init_unit(cd, 0, conf[D_PRT], conf[D_MOD],
+					conf[D_UNI], conf[D_PRO], conf[D_DLY],
+					conf[D_SLV]))
+				found++;
+		}
+	}
+
+	if (!found) {
+		pr_info("%s: No CD-ROM drive found\n", name);
+		goto out_unregister_pi_driver;
+	}
+
+	return 0;
+
+out_unregister_pi_driver:
+	pi_unregister_driver(par_drv);
+out_unregister_blkdev:
+	unregister_blkdev(major, name);
+	return -ENODEV;
+}
+
+static void __exit pcd_exit(void)
+{
+	struct pcd_unit *cd;
+	int unit;
+
+	for (unit = 0, cd = pcd; unit < PCD_UNITS; unit++, cd++) {
+		if (!cd->present)
+			continue;
+
+		unregister_cdrom(&cd->info);
+		del_gendisk(cd->disk);
+		pi_release(cd->pi);
+		put_disk(cd->disk);
+
+		blk_mq_free_tag_set(&cd->tag_set);
+	}
+	pi_unregister_driver(par_drv);
+	unregister_blkdev(major, name);
+}
+
+MODULE_LICENSE("GPL");
+module_init(pcd_init)
+module_exit(pcd_exit)
diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c
new file mode 100644
index 000000000..f8a75bc90
--- /dev/null
+++ b/drivers/block/paride/pd.c
@@ -0,0 +1,1032 @@
+/* 
+        pd.c    (c) 1997-8  Grant R. Guenther <grant@torque.net>
+                            Under the terms of the GNU General Public License.
+
+        This is the high-level driver for parallel port IDE hard
+        drives based on chips supported by the paride module.
+
+	By default, the driver will autoprobe for a single parallel
+	port IDE drive, but if their individual parameters are
+        specified, the driver can handle up to 4 drives.
+
+        The behaviour of the pd driver can be altered by setting
+        some parameters from the insmod command line.  The following
+        parameters are adjustable:
+ 
+	    drive0  	These four arguments can be arrays of	    
+	    drive1	1-8 integers as follows:
+	    drive2
+	    drive3	<prt>,<pro>,<uni>,<mod>,<geo>,<sby>,<dly>,<slv>
+
+			Where,
+
+		<prt>	is the base of the parallel port address for
+			the corresponding drive.  (required)
+
+		<pro>   is the protocol number for the adapter that
+			supports this drive.  These numbers are
+                        logged by 'paride' when the protocol modules
+			are initialised.  (0 if not given)
+
+		<uni>   for those adapters that support chained
+			devices, this is the unit selector for the
+		        chain of devices on the given port.  It should
+			be zero for devices that don't support chaining.
+			(0 if not given)
+
+		<mod>   this can be -1 to choose the best mode, or one
+		        of the mode numbers supported by the adapter.
+			(-1 if not given)
+
+		<geo>   this defaults to 0 to indicate that the driver
+			should use the CHS geometry provided by the drive
+			itself.  If set to 1, the driver will provide
+			a logical geometry with 64 heads and 32 sectors
+			per track, to be consistent with most SCSI
+		        drivers.  (0 if not given)
+
+		<sby>   set this to zero to disable the power saving
+			standby mode, if needed.  (1 if not given)
+
+		<dly>   some parallel ports require the driver to 
+			go more slowly.  -1 sets a default value that
+			should work with the chosen protocol.  Otherwise,
+			set this to a small integer, the larger it is
+			the slower the port i/o.  In some cases, setting
+			this to zero will speed up the device. (default -1)
+
+		<slv>   IDE disks can be jumpered to master or slave.
+                        Set this to 0 to choose the master drive, 1 to
+                        choose the slave, -1 (the default) to choose the
+                        first drive found.
+			
+
+            major       You may use this parameter to override the
+                        default major number (45) that this driver
+                        will use.  Be sure to change the device
+                        name as well.
+
+            name        This parameter is a character string that
+                        contains the name the kernel will use for this
+                        device (in /proc output, for instance).
+			(default "pd")
+
+	    cluster	The driver will attempt to aggregate requests
+			for adjacent blocks into larger multi-block
+			clusters.  The maximum cluster size (in 512
+			byte sectors) is set with this parameter.
+			(default 64)
+
+	    verbose	This parameter controls the amount of logging
+			that the driver will do.  Set it to 0 for 
+			normal operation, 1 to see autoprobe progress
+			messages, or 2 to see additional debugging
+			output.  (default 0)
+
+            nice        This parameter controls the driver's use of
+                        idle CPU time, at the expense of some speed.
+
+        If this driver is built into the kernel, you can use kernel
+        the following command line parameters, with the same values
+        as the corresponding module parameters listed above:
+
+            pd.drive0
+            pd.drive1
+            pd.drive2
+            pd.drive3
+            pd.cluster
+            pd.nice
+
+        In addition, you can use the parameter pd.disable to disable
+        the driver entirely.
+ 
+*/
+
+/* Changes:
+
+	1.01	GRG 1997.01.24	Restored pd_reset()
+				Added eject ioctl
+	1.02    GRG 1998.05.06  SMP spinlock changes, 
+				Added slave support
+	1.03    GRG 1998.06.16  Eliminate an Ugh.
+	1.04	GRG 1998.08.15  Extra debugging, use HZ in loop timing
+	1.05    GRG 1998.09.24  Added jumbo support
+
+*/
+
+#define PD_VERSION      "1.05"
+#define PD_MAJOR	45
+#define PD_NAME		"pd"
+#define PD_UNITS	4
+
+/* Here are things one can override from the insmod command.
+   Most are autoprobed by paride unless set here.  Verbose is off
+   by default.
+
+*/
+#include <linux/types.h>
+
+static int verbose = 0;
+static int major = PD_MAJOR;
+static char *name = PD_NAME;
+static int cluster = 64;
+static int nice = 0;
+static int disable = 0;
+
+static int drive0[8] = { 0, 0, 0, -1, 0, 1, -1, -1 };
+static int drive1[8] = { 0, 0, 0, -1, 0, 1, -1, -1 };
+static int drive2[8] = { 0, 0, 0, -1, 0, 1, -1, -1 };
+static int drive3[8] = { 0, 0, 0, -1, 0, 1, -1, -1 };
+
+static int (*drives[4])[8] = {&drive0, &drive1, &drive2, &drive3};
+
+enum {D_PRT, D_PRO, D_UNI, D_MOD, D_GEO, D_SBY, D_DLY, D_SLV};
+
+/* end of parameters */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/gfp.h>
+#include <linux/fs.h>
+#include <linux/delay.h>
+#include <linux/hdreg.h>
+#include <linux/cdrom.h>	/* for the eject ioctl */
+#include <linux/blk-mq.h>
+#include <linux/blkpg.h>
+#include <linux/kernel.h>
+#include <linux/mutex.h>
+#include <linux/uaccess.h>
+#include <linux/workqueue.h>
+
+static DEFINE_MUTEX(pd_mutex);
+static DEFINE_SPINLOCK(pd_lock);
+
+module_param(verbose, int, 0);
+module_param(major, int, 0);
+module_param(name, charp, 0);
+module_param(cluster, int, 0);
+module_param(nice, int, 0);
+module_param_array(drive0, int, NULL, 0);
+module_param_array(drive1, int, NULL, 0);
+module_param_array(drive2, int, NULL, 0);
+module_param_array(drive3, int, NULL, 0);
+
+#include "paride.h"
+
+#define PD_BITS    4
+
+/* numbers for "SCSI" geometry */
+
+#define PD_LOG_HEADS    64
+#define PD_LOG_SECTS    32
+
+#define PD_ID_OFF       54
+#define PD_ID_LEN       14
+
+#define PD_MAX_RETRIES  5
+#define PD_TMO          800	/* interrupt timeout in jiffies */
+#define PD_SPIN_DEL     50	/* spin delay in micro-seconds  */
+
+#define PD_SPIN         (1000000*PD_TMO)/(HZ*PD_SPIN_DEL)
+
+#define STAT_ERR        0x00001
+#define STAT_INDEX      0x00002
+#define STAT_ECC        0x00004
+#define STAT_DRQ        0x00008
+#define STAT_SEEK       0x00010
+#define STAT_WRERR      0x00020
+#define STAT_READY      0x00040
+#define STAT_BUSY       0x00080
+
+#define ERR_AMNF        0x00100
+#define ERR_TK0NF       0x00200
+#define ERR_ABRT        0x00400
+#define ERR_MCR         0x00800
+#define ERR_IDNF        0x01000
+#define ERR_MC          0x02000
+#define ERR_UNC         0x04000
+#define ERR_TMO         0x10000
+
+#define IDE_READ        	0x20
+#define IDE_WRITE       	0x30
+#define IDE_READ_VRFY		0x40
+#define IDE_INIT_DEV_PARMS	0x91
+#define IDE_STANDBY     	0x96
+#define IDE_ACKCHANGE   	0xdb
+#define IDE_DOORLOCK    	0xde
+#define IDE_DOORUNLOCK  	0xdf
+#define IDE_IDENTIFY    	0xec
+#define IDE_EJECT		0xed
+
+#define PD_NAMELEN	8
+
+struct pd_unit {
+	struct pi_adapter pia;	/* interface to paride layer */
+	struct pi_adapter *pi;
+	int access;		/* count of active opens ... */
+	int capacity;		/* Size of this volume in sectors */
+	int heads;		/* physical geometry */
+	int sectors;
+	int cylinders;
+	int can_lba;
+	int drive;		/* master=0 slave=1 */
+	int changed;		/* Have we seen a disk change ? */
+	int removable;		/* removable media device  ?  */
+	int standby;
+	int alt_geom;
+	char name[PD_NAMELEN];	/* pda, pdb, etc ... */
+	struct gendisk *gd;
+	struct blk_mq_tag_set tag_set;
+	struct list_head rq_list;
+};
+
+static struct pd_unit pd[PD_UNITS];
+
+struct pd_req {
+	/* for REQ_OP_DRV_IN: */
+	enum action (*func)(struct pd_unit *disk);
+};
+
+static char pd_scratch[512];	/* scratch block buffer */
+
+static char *pd_errs[17] = { "ERR", "INDEX", "ECC", "DRQ", "SEEK", "WRERR",
+	"READY", "BUSY", "AMNF", "TK0NF", "ABRT", "MCR",
+	"IDNF", "MC", "UNC", "???", "TMO"
+};
+
+static void *par_drv;		/* reference of parport driver */
+
+static inline int status_reg(struct pd_unit *disk)
+{
+	return pi_read_regr(disk->pi, 1, 6);
+}
+
+static inline int read_reg(struct pd_unit *disk, int reg)
+{
+	return pi_read_regr(disk->pi, 0, reg);
+}
+
+static inline void write_status(struct pd_unit *disk, int val)
+{
+	pi_write_regr(disk->pi, 1, 6, val);
+}
+
+static inline void write_reg(struct pd_unit *disk, int reg, int val)
+{
+	pi_write_regr(disk->pi, 0, reg, val);
+}
+
+static inline u8 DRIVE(struct pd_unit *disk)
+{
+	return 0xa0+0x10*disk->drive;
+}
+
+/*  ide command interface */
+
+static void pd_print_error(struct pd_unit *disk, char *msg, int status)
+{
+	int i;
+
+	printk("%s: %s: status = 0x%x =", disk->name, msg, status);
+	for (i = 0; i < ARRAY_SIZE(pd_errs); i++)
+		if (status & (1 << i))
+			printk(" %s", pd_errs[i]);
+	printk("\n");
+}
+
+static void pd_reset(struct pd_unit *disk)
+{				/* called only for MASTER drive */
+	write_status(disk, 4);
+	udelay(50);
+	write_status(disk, 0);
+	udelay(250);
+}
+
+#define DBMSG(msg)	((verbose>1)?(msg):NULL)
+
+static int pd_wait_for(struct pd_unit *disk, int w, char *msg)
+{				/* polled wait */
+	int k, r, e;
+
+	k = 0;
+	while (k < PD_SPIN) {
+		r = status_reg(disk);
+		k++;
+		if (((r & w) == w) && !(r & STAT_BUSY))
+			break;
+		udelay(PD_SPIN_DEL);
+	}
+	e = (read_reg(disk, 1) << 8) + read_reg(disk, 7);
+	if (k >= PD_SPIN)
+		e |= ERR_TMO;
+	if ((e & (STAT_ERR | ERR_TMO)) && (msg != NULL))
+		pd_print_error(disk, msg, e);
+	return e;
+}
+
+static void pd_send_command(struct pd_unit *disk, int n, int s, int h, int c0, int c1, int func)
+{
+	write_reg(disk, 6, DRIVE(disk) + h);
+	write_reg(disk, 1, 0);		/* the IDE task file */
+	write_reg(disk, 2, n);
+	write_reg(disk, 3, s);
+	write_reg(disk, 4, c0);
+	write_reg(disk, 5, c1);
+	write_reg(disk, 7, func);
+
+	udelay(1);
+}
+
+static void pd_ide_command(struct pd_unit *disk, int func, int block, int count)
+{
+	int c1, c0, h, s;
+
+	if (disk->can_lba) {
+		s = block & 255;
+		c0 = (block >>= 8) & 255;
+		c1 = (block >>= 8) & 255;
+		h = ((block >>= 8) & 15) + 0x40;
+	} else {
+		s = (block % disk->sectors) + 1;
+		h = (block /= disk->sectors) % disk->heads;
+		c0 = (block /= disk->heads) % 256;
+		c1 = (block >>= 8);
+	}
+	pd_send_command(disk, count, s, h, c0, c1, func);
+}
+
+/* The i/o request engine */
+
+enum action {Fail = 0, Ok = 1, Hold, Wait};
+
+static struct request *pd_req;	/* current request */
+static enum action (*phase)(void);
+
+static void run_fsm(void);
+
+static void ps_tq_int(struct work_struct *work);
+
+static DECLARE_DELAYED_WORK(fsm_tq, ps_tq_int);
+
+static void schedule_fsm(void)
+{
+	if (!nice)
+		schedule_delayed_work(&fsm_tq, 0);
+	else
+		schedule_delayed_work(&fsm_tq, nice-1);
+}
+
+static void ps_tq_int(struct work_struct *work)
+{
+	run_fsm();
+}
+
+static enum action do_pd_io_start(void);
+static enum action pd_special(void);
+static enum action do_pd_read_start(void);
+static enum action do_pd_write_start(void);
+static enum action do_pd_read_drq(void);
+static enum action do_pd_write_done(void);
+
+static int pd_queue;
+static int pd_claimed;
+
+static struct pd_unit *pd_current; /* current request's drive */
+static PIA *pi_current; /* current request's PIA */
+
+static int set_next_request(void)
+{
+	struct gendisk *disk;
+	struct request_queue *q;
+	int old_pos = pd_queue;
+
+	do {
+		disk = pd[pd_queue].gd;
+		q = disk ? disk->queue : NULL;
+		if (++pd_queue == PD_UNITS)
+			pd_queue = 0;
+		if (q) {
+			struct pd_unit *disk = q->queuedata;
+
+			if (list_empty(&disk->rq_list))
+				continue;
+
+			pd_req = list_first_entry(&disk->rq_list,
+							struct request,
+							queuelist);
+			list_del_init(&pd_req->queuelist);
+			blk_mq_start_request(pd_req);
+			break;
+		}
+	} while (pd_queue != old_pos);
+
+	return pd_req != NULL;
+}
+
+static void run_fsm(void)
+{
+	while (1) {
+		enum action res;
+		int stop = 0;
+
+		if (!phase) {
+			pd_current = pd_req->q->disk->private_data;
+			pi_current = pd_current->pi;
+			phase = do_pd_io_start;
+		}
+
+		switch (pd_claimed) {
+			case 0:
+				pd_claimed = 1;
+				if (!pi_schedule_claimed(pi_current, run_fsm))
+					return;
+				fallthrough;
+			case 1:
+				pd_claimed = 2;
+				pi_current->proto->connect(pi_current);
+		}
+
+		switch(res = phase()) {
+			case Ok: case Fail: {
+				blk_status_t err;
+
+				err = res == Ok ? 0 : BLK_STS_IOERR;
+				pi_disconnect(pi_current);
+				pd_claimed = 0;
+				phase = NULL;
+				spin_lock_irq(&pd_lock);
+				if (!blk_update_request(pd_req, err,
+						blk_rq_cur_bytes(pd_req))) {
+					__blk_mq_end_request(pd_req, err);
+					pd_req = NULL;
+					stop = !set_next_request();
+				}
+				spin_unlock_irq(&pd_lock);
+				if (stop)
+					return;
+				}
+				fallthrough;
+			case Hold:
+				schedule_fsm();
+				return;
+			case Wait:
+				pi_disconnect(pi_current);
+				pd_claimed = 0;
+		}
+	}
+}
+
+static int pd_retries = 0;	/* i/o error retry count */
+static int pd_block;		/* address of next requested block */
+static int pd_count;		/* number of blocks still to do */
+static int pd_run;		/* sectors in current cluster */
+static char *pd_buf;		/* buffer for request in progress */
+
+static enum action do_pd_io_start(void)
+{
+	switch (req_op(pd_req)) {
+	case REQ_OP_DRV_IN:
+		phase = pd_special;
+		return pd_special();
+	case REQ_OP_READ:
+	case REQ_OP_WRITE:
+		pd_block = blk_rq_pos(pd_req);
+		pd_count = blk_rq_cur_sectors(pd_req);
+		if (pd_block + pd_count > get_capacity(pd_req->q->disk))
+			return Fail;
+		pd_run = blk_rq_sectors(pd_req);
+		pd_buf = bio_data(pd_req->bio);
+		pd_retries = 0;
+		if (req_op(pd_req) == REQ_OP_READ)
+			return do_pd_read_start();
+		else
+			return do_pd_write_start();
+	default:
+		break;
+	}
+	return Fail;
+}
+
+static enum action pd_special(void)
+{
+	struct pd_req *req = blk_mq_rq_to_pdu(pd_req);
+
+	return req->func(pd_current);
+}
+
+static int pd_next_buf(void)
+{
+	unsigned long saved_flags;
+
+	pd_count--;
+	pd_run--;
+	pd_buf += 512;
+	pd_block++;
+	if (!pd_run)
+		return 1;
+	if (pd_count)
+		return 0;
+	spin_lock_irqsave(&pd_lock, saved_flags);
+	if (!blk_update_request(pd_req, 0, blk_rq_cur_bytes(pd_req))) {
+		__blk_mq_end_request(pd_req, 0);
+		pd_req = NULL;
+		pd_count = 0;
+		pd_buf = NULL;
+	} else {
+		pd_count = blk_rq_cur_sectors(pd_req);
+		pd_buf = bio_data(pd_req->bio);
+	}
+	spin_unlock_irqrestore(&pd_lock, saved_flags);
+	return !pd_count;
+}
+
+static unsigned long pd_timeout;
+
+static enum action do_pd_read_start(void)
+{
+	if (pd_wait_for(pd_current, STAT_READY, "do_pd_read") & STAT_ERR) {
+		if (pd_retries < PD_MAX_RETRIES) {
+			pd_retries++;
+			return Wait;
+		}
+		return Fail;
+	}
+	pd_ide_command(pd_current, IDE_READ, pd_block, pd_run);
+	phase = do_pd_read_drq;
+	pd_timeout = jiffies + PD_TMO;
+	return Hold;
+}
+
+static enum action do_pd_write_start(void)
+{
+	if (pd_wait_for(pd_current, STAT_READY, "do_pd_write") & STAT_ERR) {
+		if (pd_retries < PD_MAX_RETRIES) {
+			pd_retries++;
+			return Wait;
+		}
+		return Fail;
+	}
+	pd_ide_command(pd_current, IDE_WRITE, pd_block, pd_run);
+	while (1) {
+		if (pd_wait_for(pd_current, STAT_DRQ, "do_pd_write_drq") & STAT_ERR) {
+			if (pd_retries < PD_MAX_RETRIES) {
+				pd_retries++;
+				return Wait;
+			}
+			return Fail;
+		}
+		pi_write_block(pd_current->pi, pd_buf, 512);
+		if (pd_next_buf())
+			break;
+	}
+	phase = do_pd_write_done;
+	pd_timeout = jiffies + PD_TMO;
+	return Hold;
+}
+
+static inline int pd_ready(void)
+{
+	return !(status_reg(pd_current) & STAT_BUSY);
+}
+
+static enum action do_pd_read_drq(void)
+{
+	if (!pd_ready() && !time_after_eq(jiffies, pd_timeout))
+		return Hold;
+
+	while (1) {
+		if (pd_wait_for(pd_current, STAT_DRQ, "do_pd_read_drq") & STAT_ERR) {
+			if (pd_retries < PD_MAX_RETRIES) {
+				pd_retries++;
+				phase = do_pd_read_start;
+				return Wait;
+			}
+			return Fail;
+		}
+		pi_read_block(pd_current->pi, pd_buf, 512);
+		if (pd_next_buf())
+			break;
+	}
+	return Ok;
+}
+
+static enum action do_pd_write_done(void)
+{
+	if (!pd_ready() && !time_after_eq(jiffies, pd_timeout))
+		return Hold;
+
+	if (pd_wait_for(pd_current, STAT_READY, "do_pd_write_done") & STAT_ERR) {
+		if (pd_retries < PD_MAX_RETRIES) {
+			pd_retries++;
+			phase = do_pd_write_start;
+			return Wait;
+		}
+		return Fail;
+	}
+	return Ok;
+}
+
+/* special io requests */
+
+/* According to the ATA standard, the default CHS geometry should be
+   available following a reset.  Some Western Digital drives come up
+   in a mode where only LBA addresses are accepted until the device
+   parameters are initialised.
+*/
+
+static void pd_init_dev_parms(struct pd_unit *disk)
+{
+	pd_wait_for(disk, 0, DBMSG("before init_dev_parms"));
+	pd_send_command(disk, disk->sectors, 0, disk->heads - 1, 0, 0,
+			IDE_INIT_DEV_PARMS);
+	udelay(300);
+	pd_wait_for(disk, 0, "Initialise device parameters");
+}
+
+static enum action pd_door_lock(struct pd_unit *disk)
+{
+	if (!(pd_wait_for(disk, STAT_READY, "Lock") & STAT_ERR)) {
+		pd_send_command(disk, 1, 0, 0, 0, 0, IDE_DOORLOCK);
+		pd_wait_for(disk, STAT_READY, "Lock done");
+	}
+	return Ok;
+}
+
+static enum action pd_door_unlock(struct pd_unit *disk)
+{
+	if (!(pd_wait_for(disk, STAT_READY, "Lock") & STAT_ERR)) {
+		pd_send_command(disk, 1, 0, 0, 0, 0, IDE_DOORUNLOCK);
+		pd_wait_for(disk, STAT_READY, "Lock done");
+	}
+	return Ok;
+}
+
+static enum action pd_eject(struct pd_unit *disk)
+{
+	pd_wait_for(disk, 0, DBMSG("before unlock on eject"));
+	pd_send_command(disk, 1, 0, 0, 0, 0, IDE_DOORUNLOCK);
+	pd_wait_for(disk, 0, DBMSG("after unlock on eject"));
+	pd_wait_for(disk, 0, DBMSG("before eject"));
+	pd_send_command(disk, 0, 0, 0, 0, 0, IDE_EJECT);
+	pd_wait_for(disk, 0, DBMSG("after eject"));
+	return Ok;
+}
+
+static enum action pd_media_check(struct pd_unit *disk)
+{
+	int r = pd_wait_for(disk, STAT_READY, DBMSG("before media_check"));
+	if (!(r & STAT_ERR)) {
+		pd_send_command(disk, 1, 1, 0, 0, 0, IDE_READ_VRFY);
+		r = pd_wait_for(disk, STAT_READY, DBMSG("RDY after READ_VRFY"));
+	} else
+		disk->changed = 1;	/* say changed if other error */
+	if (r & ERR_MC) {
+		disk->changed = 1;
+		pd_send_command(disk, 1, 0, 0, 0, 0, IDE_ACKCHANGE);
+		pd_wait_for(disk, STAT_READY, DBMSG("RDY after ACKCHANGE"));
+		pd_send_command(disk, 1, 1, 0, 0, 0, IDE_READ_VRFY);
+		r = pd_wait_for(disk, STAT_READY, DBMSG("RDY after VRFY"));
+	}
+	return Ok;
+}
+
+static void pd_standby_off(struct pd_unit *disk)
+{
+	pd_wait_for(disk, 0, DBMSG("before STANDBY"));
+	pd_send_command(disk, 0, 0, 0, 0, 0, IDE_STANDBY);
+	pd_wait_for(disk, 0, DBMSG("after STANDBY"));
+}
+
+static enum action pd_identify(struct pd_unit *disk)
+{
+	int j;
+	char id[PD_ID_LEN + 1];
+
+/* WARNING:  here there may be dragons.  reset() applies to both drives,
+   but we call it only on probing the MASTER. This should allow most
+   common configurations to work, but be warned that a reset can clear
+   settings on the SLAVE drive.
+*/
+
+	if (disk->drive == 0)
+		pd_reset(disk);
+
+	write_reg(disk, 6, DRIVE(disk));
+	pd_wait_for(disk, 0, DBMSG("before IDENT"));
+	pd_send_command(disk, 1, 0, 0, 0, 0, IDE_IDENTIFY);
+
+	if (pd_wait_for(disk, STAT_DRQ, DBMSG("IDENT DRQ")) & STAT_ERR)
+		return Fail;
+	pi_read_block(disk->pi, pd_scratch, 512);
+	disk->can_lba = pd_scratch[99] & 2;
+	disk->sectors = le16_to_cpu(*(__le16 *) (pd_scratch + 12));
+	disk->heads = le16_to_cpu(*(__le16 *) (pd_scratch + 6));
+	disk->cylinders = le16_to_cpu(*(__le16 *) (pd_scratch + 2));
+	if (disk->can_lba)
+		disk->capacity = le32_to_cpu(*(__le32 *) (pd_scratch + 120));
+	else
+		disk->capacity = disk->sectors * disk->heads * disk->cylinders;
+
+	for (j = 0; j < PD_ID_LEN; j++)
+		id[j ^ 1] = pd_scratch[j + PD_ID_OFF];
+	j = PD_ID_LEN - 1;
+	while ((j >= 0) && (id[j] <= 0x20))
+		j--;
+	j++;
+	id[j] = 0;
+
+	disk->removable = pd_scratch[0] & 0x80;
+
+	printk("%s: %s, %s, %d blocks [%dM], (%d/%d/%d), %s media\n",
+	       disk->name, id,
+	       disk->drive ? "slave" : "master",
+	       disk->capacity, disk->capacity / 2048,
+	       disk->cylinders, disk->heads, disk->sectors,
+	       disk->removable ? "removable" : "fixed");
+
+	if (disk->capacity)
+		pd_init_dev_parms(disk);
+	if (!disk->standby)
+		pd_standby_off(disk);
+
+	return Ok;
+}
+
+/* end of io request engine */
+
+static blk_status_t pd_queue_rq(struct blk_mq_hw_ctx *hctx,
+				const struct blk_mq_queue_data *bd)
+{
+	struct pd_unit *disk = hctx->queue->queuedata;
+
+	spin_lock_irq(&pd_lock);
+	if (!pd_req) {
+		pd_req = bd->rq;
+		blk_mq_start_request(pd_req);
+	} else
+		list_add_tail(&bd->rq->queuelist, &disk->rq_list);
+	spin_unlock_irq(&pd_lock);
+
+	run_fsm();
+	return BLK_STS_OK;
+}
+
+static int pd_special_command(struct pd_unit *disk,
+		      enum action (*func)(struct pd_unit *disk))
+{
+	struct request *rq;
+	struct pd_req *req;
+
+	rq = blk_mq_alloc_request(disk->gd->queue, REQ_OP_DRV_IN, 0);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
+	req = blk_mq_rq_to_pdu(rq);
+
+	req->func = func;
+	blk_execute_rq(rq, false);
+	blk_mq_free_request(rq);
+	return 0;
+}
+
+/* kernel glue structures */
+
+static int pd_open(struct block_device *bdev, fmode_t mode)
+{
+	struct pd_unit *disk = bdev->bd_disk->private_data;
+
+	mutex_lock(&pd_mutex);
+	disk->access++;
+
+	if (disk->removable) {
+		pd_special_command(disk, pd_media_check);
+		pd_special_command(disk, pd_door_lock);
+	}
+	mutex_unlock(&pd_mutex);
+	return 0;
+}
+
+static int pd_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+	struct pd_unit *disk = bdev->bd_disk->private_data;
+
+	if (disk->alt_geom) {
+		geo->heads = PD_LOG_HEADS;
+		geo->sectors = PD_LOG_SECTS;
+		geo->cylinders = disk->capacity / (geo->heads * geo->sectors);
+	} else {
+		geo->heads = disk->heads;
+		geo->sectors = disk->sectors;
+		geo->cylinders = disk->cylinders;
+	}
+
+	return 0;
+}
+
+static int pd_ioctl(struct block_device *bdev, fmode_t mode,
+	 unsigned int cmd, unsigned long arg)
+{
+	struct pd_unit *disk = bdev->bd_disk->private_data;
+
+	switch (cmd) {
+	case CDROMEJECT:
+		mutex_lock(&pd_mutex);
+		if (disk->access == 1)
+			pd_special_command(disk, pd_eject);
+		mutex_unlock(&pd_mutex);
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
+static void pd_release(struct gendisk *p, fmode_t mode)
+{
+	struct pd_unit *disk = p->private_data;
+
+	mutex_lock(&pd_mutex);
+	if (!--disk->access && disk->removable)
+		pd_special_command(disk, pd_door_unlock);
+	mutex_unlock(&pd_mutex);
+}
+
+static unsigned int pd_check_events(struct gendisk *p, unsigned int clearing)
+{
+	struct pd_unit *disk = p->private_data;
+	int r;
+	if (!disk->removable)
+		return 0;
+	pd_special_command(disk, pd_media_check);
+	r = disk->changed;
+	disk->changed = 0;
+	return r ? DISK_EVENT_MEDIA_CHANGE : 0;
+}
+
+static const struct block_device_operations pd_fops = {
+	.owner		= THIS_MODULE,
+	.open		= pd_open,
+	.release	= pd_release,
+	.ioctl		= pd_ioctl,
+	.compat_ioctl	= pd_ioctl,
+	.getgeo		= pd_getgeo,
+	.check_events	= pd_check_events,
+};
+
+/* probing */
+
+static const struct blk_mq_ops pd_mq_ops = {
+	.queue_rq	= pd_queue_rq,
+};
+
+static int pd_probe_drive(struct pd_unit *disk, int autoprobe, int port,
+		int mode, int unit, int protocol, int delay)
+{
+	int index = disk - pd;
+	int *parm = *drives[index];
+	struct gendisk *p;
+	int ret;
+
+	disk->pi = &disk->pia;
+	disk->access = 0;
+	disk->changed = 1;
+	disk->capacity = 0;
+	disk->drive = parm[D_SLV];
+	snprintf(disk->name, PD_NAMELEN, "%s%c", name, 'a' + index);
+	disk->alt_geom = parm[D_GEO];
+	disk->standby = parm[D_SBY];
+	INIT_LIST_HEAD(&disk->rq_list);
+
+	if (!pi_init(disk->pi, autoprobe, port, mode, unit, protocol, delay,
+			pd_scratch, PI_PD, verbose, disk->name))
+		return -ENXIO;
+
+	memset(&disk->tag_set, 0, sizeof(disk->tag_set));
+	disk->tag_set.ops = &pd_mq_ops;
+	disk->tag_set.cmd_size = sizeof(struct pd_req);
+	disk->tag_set.nr_hw_queues = 1;
+	disk->tag_set.nr_maps = 1;
+	disk->tag_set.queue_depth = 2;
+	disk->tag_set.numa_node = NUMA_NO_NODE;
+	disk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING;
+	ret = blk_mq_alloc_tag_set(&disk->tag_set);
+	if (ret)
+		goto pi_release;
+
+	p = blk_mq_alloc_disk(&disk->tag_set, disk);
+	if (IS_ERR(p)) {
+		ret = PTR_ERR(p);
+		goto free_tag_set;
+	}
+	disk->gd = p;
+
+	strcpy(p->disk_name, disk->name);
+	p->fops = &pd_fops;
+	p->major = major;
+	p->first_minor = (disk - pd) << PD_BITS;
+	p->minors = 1 << PD_BITS;
+	p->events = DISK_EVENT_MEDIA_CHANGE;
+	p->private_data = disk;
+	blk_queue_max_hw_sectors(p->queue, cluster);
+	blk_queue_bounce_limit(p->queue, BLK_BOUNCE_HIGH);
+
+	if (disk->drive == -1) {
+		for (disk->drive = 0; disk->drive <= 1; disk->drive++) {
+			ret = pd_special_command(disk, pd_identify);
+			if (ret == 0)
+				break;
+		}
+	} else {
+		ret = pd_special_command(disk, pd_identify);
+	}
+	if (ret)
+		goto put_disk;
+	set_capacity(disk->gd, disk->capacity);
+	ret = add_disk(disk->gd);
+	if (ret)
+		goto cleanup_disk;
+	return 0;
+cleanup_disk:
+	put_disk(disk->gd);
+put_disk:
+	put_disk(p);
+	disk->gd = NULL;
+free_tag_set:
+	blk_mq_free_tag_set(&disk->tag_set);
+pi_release:
+	pi_release(disk->pi);
+	return ret;
+}
+
+static int __init pd_init(void)
+{
+	int found = 0, unit, pd_drive_count = 0;
+	struct pd_unit *disk;
+
+	if (disable)
+		return -ENODEV;
+
+	if (register_blkdev(major, name))
+		return -ENODEV;
+
+	printk("%s: %s version %s, major %d, cluster %d, nice %d\n",
+	       name, name, PD_VERSION, major, cluster, nice);
+
+	par_drv = pi_register_driver(name);
+	if (!par_drv) {
+		pr_err("failed to register %s driver\n", name);
+		goto out_unregister_blkdev;
+	}
+
+	for (unit = 0; unit < PD_UNITS; unit++) {
+		int *parm = *drives[unit];
+
+		if (parm[D_PRT])
+			pd_drive_count++;
+	}
+
+	if (pd_drive_count == 0) { /* nothing spec'd - so autoprobe for 1 */
+		if (!pd_probe_drive(pd, 1, -1, -1, -1, -1, -1))
+			found++;
+	} else {
+		for (unit = 0, disk = pd; unit < PD_UNITS; unit++, disk++) {
+			int *parm = *drives[unit];
+			if (!parm[D_PRT])
+				continue;
+			if (!pd_probe_drive(disk, 0, parm[D_PRT], parm[D_MOD],
+					parm[D_UNI], parm[D_PRO], parm[D_DLY]))
+				found++;
+		}
+	}
+	if (!found) {
+		printk("%s: no valid drive found\n", name);
+		goto out_pi_unregister_driver;
+	}
+
+	return 0;
+
+out_pi_unregister_driver:
+	pi_unregister_driver(par_drv);
+out_unregister_blkdev:
+	unregister_blkdev(major, name);
+	return -ENODEV;
+}
+
+static void __exit pd_exit(void)
+{
+	struct pd_unit *disk;
+	int unit;
+	unregister_blkdev(major, name);
+	for (unit = 0, disk = pd; unit < PD_UNITS; unit++, disk++) {
+		struct gendisk *p = disk->gd;
+		if (p) {
+			disk->gd = NULL;
+			del_gendisk(p);
+			put_disk(p);
+			blk_mq_free_tag_set(&disk->tag_set);
+			pi_release(disk->pi);
+		}
+	}
+}
+
+MODULE_LICENSE("GPL");
+module_init(pd_init)
+module_exit(pd_exit)
diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c
new file mode 100644
index 000000000..eec1b9fde
--- /dev/null
+++ b/drivers/block/paride/pf.c
@@ -0,0 +1,1057 @@
+/* 
+        pf.c    (c) 1997-8  Grant R. Guenther <grant@torque.net>
+                            Under the terms of the GNU General Public License.
+
+        This is the high-level driver for parallel port ATAPI disk
+        drives based on chips supported by the paride module.
+
+        By default, the driver will autoprobe for a single parallel
+        port ATAPI disk drive, but if their individual parameters are
+        specified, the driver can handle up to 4 drives.
+
+        The behaviour of the pf driver can be altered by setting
+        some parameters from the insmod command line.  The following
+        parameters are adjustable:
+
+            drive0      These four arguments can be arrays of       
+            drive1      1-7 integers as follows:
+            drive2
+            drive3      <prt>,<pro>,<uni>,<mod>,<slv>,<lun>,<dly>
+
+                        Where,
+
+                <prt>   is the base of the parallel port address for
+                        the corresponding drive.  (required)
+
+                <pro>   is the protocol number for the adapter that
+                        supports this drive.  These numbers are
+                        logged by 'paride' when the protocol modules
+                        are initialised.  (0 if not given)
+
+                <uni>   for those adapters that support chained
+                        devices, this is the unit selector for the
+                        chain of devices on the given port.  It should
+                        be zero for devices that don't support chaining.
+                        (0 if not given)
+
+                <mod>   this can be -1 to choose the best mode, or one
+                        of the mode numbers supported by the adapter.
+                        (-1 if not given)
+
+                <slv>   ATAPI CDroms can be jumpered to master or slave.
+                        Set this to 0 to choose the master drive, 1 to
+                        choose the slave, -1 (the default) to choose the
+                        first drive found.
+
+		<lun>   Some ATAPI devices support multiple LUNs.
+                        One example is the ATAPI PD/CD drive from
+                        Matshita/Panasonic.  This device has a 
+                        CD drive on LUN 0 and a PD drive on LUN 1.
+                        By default, the driver will search for the
+                        first LUN with a supported device.  Set 
+                        this parameter to force it to use a specific
+                        LUN.  (default -1)
+
+                <dly>   some parallel ports require the driver to 
+                        go more slowly.  -1 sets a default value that
+                        should work with the chosen protocol.  Otherwise,
+                        set this to a small integer, the larger it is
+                        the slower the port i/o.  In some cases, setting
+                        this to zero will speed up the device. (default -1)
+
+	    major	You may use this parameter to override the
+			default major number (47) that this driver
+			will use.  Be sure to change the device
+			name as well.
+
+	    name	This parameter is a character string that
+			contains the name the kernel will use for this
+			device (in /proc output, for instance).
+			(default "pf").
+
+            cluster     The driver will attempt to aggregate requests
+                        for adjacent blocks into larger multi-block
+                        clusters.  The maximum cluster size (in 512
+                        byte sectors) is set with this parameter.
+                        (default 64)
+
+            verbose     This parameter controls the amount of logging
+                        that the driver will do.  Set it to 0 for
+                        normal operation, 1 to see autoprobe progress
+                        messages, or 2 to see additional debugging
+                        output.  (default 0)
+ 
+	    nice        This parameter controls the driver's use of
+			idle CPU time, at the expense of some speed.
+
+        If this driver is built into the kernel, you can use the
+        following command line parameters, with the same values
+        as the corresponding module parameters listed above:
+
+            pf.drive0
+            pf.drive1
+            pf.drive2
+            pf.drive3
+	    pf.cluster
+            pf.nice
+
+        In addition, you can use the parameter pf.disable to disable
+        the driver entirely.
+
+*/
+
+/* Changes:
+
+	1.01	GRG 1998.05.03  Changes for SMP.  Eliminate sti().
+				Fix for drives that don't clear STAT_ERR
+			        until after next CDB delivered.
+				Small change in pf_completion to round
+				up transfer size.
+	1.02    GRG 1998.06.16  Eliminated an Ugh
+	1.03    GRG 1998.08.16  Use HZ in loop timings, extra debugging
+	1.04    GRG 1998.09.24  Added jumbo support
+
+*/
+
+#define PF_VERSION      "1.04"
+#define PF_MAJOR	47
+#define PF_NAME		"pf"
+#define PF_UNITS	4
+
+#include <linux/types.h>
+
+/* Here are things one can override from the insmod command.
+   Most are autoprobed by paride unless set here.  Verbose is off
+   by default.
+
+*/
+
+static bool verbose = 0;
+static int major = PF_MAJOR;
+static char *name = PF_NAME;
+static int cluster = 64;
+static int nice = 0;
+static int disable = 0;
+
+static int drive0[7] = { 0, 0, 0, -1, -1, -1, -1 };
+static int drive1[7] = { 0, 0, 0, -1, -1, -1, -1 };
+static int drive2[7] = { 0, 0, 0, -1, -1, -1, -1 };
+static int drive3[7] = { 0, 0, 0, -1, -1, -1, -1 };
+
+static int (*drives[4])[7] = {&drive0, &drive1, &drive2, &drive3};
+static int pf_drive_count;
+
+enum {D_PRT, D_PRO, D_UNI, D_MOD, D_SLV, D_LUN, D_DLY};
+
+/* end of parameters */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/delay.h>
+#include <linux/hdreg.h>
+#include <linux/cdrom.h>
+#include <linux/spinlock.h>
+#include <linux/blk-mq.h>
+#include <linux/blkpg.h>
+#include <linux/mutex.h>
+#include <linux/uaccess.h>
+
+static DEFINE_MUTEX(pf_mutex);
+static DEFINE_SPINLOCK(pf_spin_lock);
+
+module_param(verbose, bool, 0644);
+module_param(major, int, 0);
+module_param(name, charp, 0);
+module_param(cluster, int, 0);
+module_param(nice, int, 0);
+module_param_array(drive0, int, NULL, 0);
+module_param_array(drive1, int, NULL, 0);
+module_param_array(drive2, int, NULL, 0);
+module_param_array(drive3, int, NULL, 0);
+
+#include "paride.h"
+#include "pseudo.h"
+
+/* constants for faking geometry numbers */
+
+#define PF_FD_MAX	8192	/* use FD geometry under this size */
+#define PF_FD_HDS	2
+#define PF_FD_SPT	18
+#define PF_HD_HDS	64
+#define PF_HD_SPT	32
+
+#define PF_MAX_RETRIES  5
+#define PF_TMO          800	/* interrupt timeout in jiffies */
+#define PF_SPIN_DEL     50	/* spin delay in micro-seconds  */
+
+#define PF_SPIN         (1000000*PF_TMO)/(HZ*PF_SPIN_DEL)
+
+#define STAT_ERR        0x00001
+#define STAT_INDEX      0x00002
+#define STAT_ECC        0x00004
+#define STAT_DRQ        0x00008
+#define STAT_SEEK       0x00010
+#define STAT_WRERR      0x00020
+#define STAT_READY      0x00040
+#define STAT_BUSY       0x00080
+
+#define ATAPI_REQ_SENSE		0x03
+#define ATAPI_LOCK		0x1e
+#define ATAPI_DOOR		0x1b
+#define ATAPI_MODE_SENSE	0x5a
+#define ATAPI_CAPACITY		0x25
+#define ATAPI_IDENTIFY		0x12
+#define ATAPI_READ_10		0x28
+#define ATAPI_WRITE_10		0x2a
+
+static int pf_open(struct block_device *bdev, fmode_t mode);
+static blk_status_t pf_queue_rq(struct blk_mq_hw_ctx *hctx,
+				const struct blk_mq_queue_data *bd);
+static int pf_ioctl(struct block_device *bdev, fmode_t mode,
+		    unsigned int cmd, unsigned long arg);
+static int pf_getgeo(struct block_device *bdev, struct hd_geometry *geo);
+
+static void pf_release(struct gendisk *disk, fmode_t mode);
+
+static void do_pf_read(void);
+static void do_pf_read_start(void);
+static void do_pf_write(void);
+static void do_pf_write_start(void);
+static void do_pf_read_drq(void);
+static void do_pf_write_done(void);
+
+#define PF_NM           0
+#define PF_RO           1
+#define PF_RW           2
+
+#define PF_NAMELEN      8
+
+struct pf_unit {
+	struct pi_adapter pia;	/* interface to paride layer */
+	struct pi_adapter *pi;
+	int removable;		/* removable media device  ?  */
+	int media_status;	/* media present ?  WP ? */
+	int drive;		/* drive */
+	int lun;
+	int access;		/* count of active opens ... */
+	int present;		/* device present ? */
+	char name[PF_NAMELEN];	/* pf0, pf1, ... */
+	struct gendisk *disk;
+	struct blk_mq_tag_set tag_set;
+	struct list_head rq_list;
+};
+
+static struct pf_unit units[PF_UNITS];
+
+static int pf_identify(struct pf_unit *pf);
+static void pf_lock(struct pf_unit *pf, int func);
+static void pf_eject(struct pf_unit *pf);
+static unsigned int pf_check_events(struct gendisk *disk,
+				    unsigned int clearing);
+
+static char pf_scratch[512];	/* scratch block buffer */
+
+/* the variables below are used mainly in the I/O request engine, which
+   processes only one request at a time.
+*/
+
+static int pf_retries = 0;	/* i/o error retry count */
+static int pf_busy = 0;		/* request being processed ? */
+static struct request *pf_req;	/* current request */
+static int pf_block;		/* address of next requested block */
+static int pf_count;		/* number of blocks still to do */
+static int pf_run;		/* sectors in current cluster */
+static int pf_cmd;		/* current command READ/WRITE */
+static struct pf_unit *pf_current;/* unit of current request */
+static int pf_mask;		/* stopper for pseudo-int */
+static char *pf_buf;		/* buffer for request in progress */
+static void *par_drv;		/* reference of parport driver */
+
+/* kernel glue structures */
+
+static const struct block_device_operations pf_fops = {
+	.owner		= THIS_MODULE,
+	.open		= pf_open,
+	.release	= pf_release,
+	.ioctl		= pf_ioctl,
+	.compat_ioctl	= pf_ioctl,
+	.getgeo		= pf_getgeo,
+	.check_events	= pf_check_events,
+};
+
+static const struct blk_mq_ops pf_mq_ops = {
+	.queue_rq	= pf_queue_rq,
+};
+
+static int pf_open(struct block_device *bdev, fmode_t mode)
+{
+	struct pf_unit *pf = bdev->bd_disk->private_data;
+	int ret;
+
+	mutex_lock(&pf_mutex);
+	pf_identify(pf);
+
+	ret = -ENODEV;
+	if (pf->media_status == PF_NM)
+		goto out;
+
+	ret = -EROFS;
+	if ((pf->media_status == PF_RO) && (mode & FMODE_WRITE))
+		goto out;
+
+	ret = 0;
+	pf->access++;
+	if (pf->removable)
+		pf_lock(pf, 1);
+out:
+	mutex_unlock(&pf_mutex);
+	return ret;
+}
+
+static int pf_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+	struct pf_unit *pf = bdev->bd_disk->private_data;
+	sector_t capacity = get_capacity(pf->disk);
+
+	if (capacity < PF_FD_MAX) {
+		geo->cylinders = sector_div(capacity, PF_FD_HDS * PF_FD_SPT);
+		geo->heads = PF_FD_HDS;
+		geo->sectors = PF_FD_SPT;
+	} else {
+		geo->cylinders = sector_div(capacity, PF_HD_HDS * PF_HD_SPT);
+		geo->heads = PF_HD_HDS;
+		geo->sectors = PF_HD_SPT;
+	}
+
+	return 0;
+}
+
+static int pf_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg)
+{
+	struct pf_unit *pf = bdev->bd_disk->private_data;
+
+	if (cmd != CDROMEJECT)
+		return -EINVAL;
+
+	if (pf->access != 1)
+		return -EBUSY;
+	mutex_lock(&pf_mutex);
+	pf_eject(pf);
+	mutex_unlock(&pf_mutex);
+
+	return 0;
+}
+
+static void pf_release(struct gendisk *disk, fmode_t mode)
+{
+	struct pf_unit *pf = disk->private_data;
+
+	mutex_lock(&pf_mutex);
+	if (pf->access <= 0) {
+		mutex_unlock(&pf_mutex);
+		WARN_ON(1);
+		return;
+	}
+
+	pf->access--;
+
+	if (!pf->access && pf->removable)
+		pf_lock(pf, 0);
+
+	mutex_unlock(&pf_mutex);
+}
+
+static unsigned int pf_check_events(struct gendisk *disk, unsigned int clearing)
+{
+	return DISK_EVENT_MEDIA_CHANGE;
+}
+
+static inline int status_reg(struct pf_unit *pf)
+{
+	return pi_read_regr(pf->pi, 1, 6);
+}
+
+static inline int read_reg(struct pf_unit *pf, int reg)
+{
+	return pi_read_regr(pf->pi, 0, reg);
+}
+
+static inline void write_reg(struct pf_unit *pf, int reg, int val)
+{
+	pi_write_regr(pf->pi, 0, reg, val);
+}
+
+static int pf_wait(struct pf_unit *pf, int go, int stop, char *fun, char *msg)
+{
+	int j, r, e, s, p;
+
+	j = 0;
+	while ((((r = status_reg(pf)) & go) || (stop && (!(r & stop))))
+	       && (j++ < PF_SPIN))
+		udelay(PF_SPIN_DEL);
+
+	if ((r & (STAT_ERR & stop)) || (j > PF_SPIN)) {
+		s = read_reg(pf, 7);
+		e = read_reg(pf, 1);
+		p = read_reg(pf, 2);
+		if (j > PF_SPIN)
+			e |= 0x100;
+		if (fun)
+			printk("%s: %s %s: alt=0x%x stat=0x%x err=0x%x"
+			       " loop=%d phase=%d\n",
+			       pf->name, fun, msg, r, s, e, j, p);
+		return (e << 8) + s;
+	}
+	return 0;
+}
+
+static int pf_command(struct pf_unit *pf, char *cmd, int dlen, char *fun)
+{
+	pi_connect(pf->pi);
+
+	write_reg(pf, 6, 0xa0+0x10*pf->drive);
+
+	if (pf_wait(pf, STAT_BUSY | STAT_DRQ, 0, fun, "before command")) {
+		pi_disconnect(pf->pi);
+		return -1;
+	}
+
+	write_reg(pf, 4, dlen % 256);
+	write_reg(pf, 5, dlen / 256);
+	write_reg(pf, 7, 0xa0);	/* ATAPI packet command */
+
+	if (pf_wait(pf, STAT_BUSY, STAT_DRQ, fun, "command DRQ")) {
+		pi_disconnect(pf->pi);
+		return -1;
+	}
+
+	if (read_reg(pf, 2) != 1) {
+		printk("%s: %s: command phase error\n", pf->name, fun);
+		pi_disconnect(pf->pi);
+		return -1;
+	}
+
+	pi_write_block(pf->pi, cmd, 12);
+
+	return 0;
+}
+
+static int pf_completion(struct pf_unit *pf, char *buf, char *fun)
+{
+	int r, s, n;
+
+	r = pf_wait(pf, STAT_BUSY, STAT_DRQ | STAT_READY | STAT_ERR,
+		    fun, "completion");
+
+	if ((read_reg(pf, 2) & 2) && (read_reg(pf, 7) & STAT_DRQ)) {
+		n = (((read_reg(pf, 4) + 256 * read_reg(pf, 5)) +
+		      3) & 0xfffc);
+		pi_read_block(pf->pi, buf, n);
+	}
+
+	s = pf_wait(pf, STAT_BUSY, STAT_READY | STAT_ERR, fun, "data done");
+
+	pi_disconnect(pf->pi);
+
+	return (r ? r : s);
+}
+
+static void pf_req_sense(struct pf_unit *pf, int quiet)
+{
+	char rs_cmd[12] =
+	    { ATAPI_REQ_SENSE, pf->lun << 5, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0 };
+	char buf[16];
+	int r;
+
+	r = pf_command(pf, rs_cmd, 16, "Request sense");
+	mdelay(1);
+	if (!r)
+		pf_completion(pf, buf, "Request sense");
+
+	if ((!r) && (!quiet))
+		printk("%s: Sense key: %x, ASC: %x, ASQ: %x\n",
+		       pf->name, buf[2] & 0xf, buf[12], buf[13]);
+}
+
+static int pf_atapi(struct pf_unit *pf, char *cmd, int dlen, char *buf, char *fun)
+{
+	int r;
+
+	r = pf_command(pf, cmd, dlen, fun);
+	mdelay(1);
+	if (!r)
+		r = pf_completion(pf, buf, fun);
+	if (r)
+		pf_req_sense(pf, !fun);
+
+	return r;
+}
+
+static void pf_lock(struct pf_unit *pf, int func)
+{
+	char lo_cmd[12] = { ATAPI_LOCK, pf->lun << 5, 0, 0, func, 0, 0, 0, 0, 0, 0, 0 };
+
+	pf_atapi(pf, lo_cmd, 0, pf_scratch, func ? "lock" : "unlock");
+}
+
+static void pf_eject(struct pf_unit *pf)
+{
+	char ej_cmd[12] = { ATAPI_DOOR, pf->lun << 5, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0 };
+
+	pf_lock(pf, 0);
+	pf_atapi(pf, ej_cmd, 0, pf_scratch, "eject");
+}
+
+#define PF_RESET_TMO   30	/* in tenths of a second */
+
+static void pf_sleep(int cs)
+{
+	schedule_timeout_interruptible(cs);
+}
+
+/* the ATAPI standard actually specifies the contents of all 7 registers
+   after a reset, but the specification is ambiguous concerning the last
+   two bytes, and different drives interpret the standard differently.
+ */
+
+static int pf_reset(struct pf_unit *pf)
+{
+	int i, k, flg;
+	int expect[5] = { 1, 1, 1, 0x14, 0xeb };
+
+	pi_connect(pf->pi);
+	write_reg(pf, 6, 0xa0+0x10*pf->drive);
+	write_reg(pf, 7, 8);
+
+	pf_sleep(20 * HZ / 1000);
+
+	k = 0;
+	while ((k++ < PF_RESET_TMO) && (status_reg(pf) & STAT_BUSY))
+		pf_sleep(HZ / 10);
+
+	flg = 1;
+	for (i = 0; i < 5; i++)
+		flg &= (read_reg(pf, i + 1) == expect[i]);
+
+	if (verbose) {
+		printk("%s: Reset (%d) signature = ", pf->name, k);
+		for (i = 0; i < 5; i++)
+			printk("%3x", read_reg(pf, i + 1));
+		if (!flg)
+			printk(" (incorrect)");
+		printk("\n");
+	}
+
+	pi_disconnect(pf->pi);
+	return flg - 1;
+}
+
+static void pf_mode_sense(struct pf_unit *pf)
+{
+	char ms_cmd[12] =
+	    { ATAPI_MODE_SENSE, pf->lun << 5, 0, 0, 0, 0, 0, 0, 8, 0, 0, 0 };
+	char buf[8];
+
+	pf_atapi(pf, ms_cmd, 8, buf, "mode sense");
+	pf->media_status = PF_RW;
+	if (buf[3] & 0x80)
+		pf->media_status = PF_RO;
+}
+
+static void xs(char *buf, char *targ, int offs, int len)
+{
+	int j, k, l;
+
+	j = 0;
+	l = 0;
+	for (k = 0; k < len; k++)
+		if ((buf[k + offs] != 0x20) || (buf[k + offs] != l))
+			l = targ[j++] = buf[k + offs];
+	if (l == 0x20)
+		j--;
+	targ[j] = 0;
+}
+
+static int xl(char *buf, int offs)
+{
+	int v, k;
+
+	v = 0;
+	for (k = 0; k < 4; k++)
+		v = v * 256 + (buf[k + offs] & 0xff);
+	return v;
+}
+
+static void pf_get_capacity(struct pf_unit *pf)
+{
+	char rc_cmd[12] = { ATAPI_CAPACITY, pf->lun << 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+	char buf[8];
+	int bs;
+
+	if (pf_atapi(pf, rc_cmd, 8, buf, "get capacity")) {
+		pf->media_status = PF_NM;
+		return;
+	}
+	set_capacity(pf->disk, xl(buf, 0) + 1);
+	bs = xl(buf, 4);
+	if (bs != 512) {
+		set_capacity(pf->disk, 0);
+		if (verbose)
+			printk("%s: Drive %d, LUN %d,"
+			       " unsupported block size %d\n",
+			       pf->name, pf->drive, pf->lun, bs);
+	}
+}
+
+static int pf_identify(struct pf_unit *pf)
+{
+	int dt, s;
+	char *ms[2] = { "master", "slave" };
+	char mf[10], id[18];
+	char id_cmd[12] =
+	    { ATAPI_IDENTIFY, pf->lun << 5, 0, 0, 36, 0, 0, 0, 0, 0, 0, 0 };
+	char buf[36];
+
+	s = pf_atapi(pf, id_cmd, 36, buf, "identify");
+	if (s)
+		return -1;
+
+	dt = buf[0] & 0x1f;
+	if ((dt != 0) && (dt != 7)) {
+		if (verbose)
+			printk("%s: Drive %d, LUN %d, unsupported type %d\n",
+			       pf->name, pf->drive, pf->lun, dt);
+		return -1;
+	}
+
+	xs(buf, mf, 8, 8);
+	xs(buf, id, 16, 16);
+
+	pf->removable = (buf[1] & 0x80);
+
+	pf_mode_sense(pf);
+	pf_mode_sense(pf);
+	pf_mode_sense(pf);
+
+	pf_get_capacity(pf);
+
+	printk("%s: %s %s, %s LUN %d, type %d",
+	       pf->name, mf, id, ms[pf->drive], pf->lun, dt);
+	if (pf->removable)
+		printk(", removable");
+	if (pf->media_status == PF_NM)
+		printk(", no media\n");
+	else {
+		if (pf->media_status == PF_RO)
+			printk(", RO");
+		printk(", %llu blocks\n",
+			(unsigned long long)get_capacity(pf->disk));
+	}
+	return 0;
+}
+
+/*
+ * returns 0, with id set if drive is detected, otherwise an error code.
+ */
+static int pf_probe(struct pf_unit *pf)
+{
+	if (pf->drive == -1) {
+		for (pf->drive = 0; pf->drive <= 1; pf->drive++)
+			if (!pf_reset(pf)) {
+				if (pf->lun != -1)
+					return pf_identify(pf);
+				else
+					for (pf->lun = 0; pf->lun < 8; pf->lun++)
+						if (!pf_identify(pf))
+							return 0;
+			}
+	} else {
+		if (pf_reset(pf))
+			return -1;
+		if (pf->lun != -1)
+			return pf_identify(pf);
+		for (pf->lun = 0; pf->lun < 8; pf->lun++)
+			if (!pf_identify(pf))
+				return 0;
+	}
+	return -ENODEV;
+}
+
+/* The i/o request engine */
+
+static int pf_start(struct pf_unit *pf, int cmd, int b, int c)
+{
+	int i;
+	char io_cmd[12] = { cmd, pf->lun << 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+
+	for (i = 0; i < 4; i++) {
+		io_cmd[5 - i] = b & 0xff;
+		b = b >> 8;
+	}
+
+	io_cmd[8] = c & 0xff;
+	io_cmd[7] = (c >> 8) & 0xff;
+
+	i = pf_command(pf, io_cmd, c * 512, "start i/o");
+
+	mdelay(1);
+
+	return i;
+}
+
+static int pf_ready(void)
+{
+	return (((status_reg(pf_current) & (STAT_BUSY | pf_mask)) == pf_mask));
+}
+
+static int pf_queue;
+
+static int set_next_request(void)
+{
+	struct pf_unit *pf;
+	int old_pos = pf_queue;
+
+	do {
+		pf = &units[pf_queue];
+		if (++pf_queue == PF_UNITS)
+			pf_queue = 0;
+		if (pf->present && !list_empty(&pf->rq_list)) {
+			pf_req = list_first_entry(&pf->rq_list, struct request,
+							queuelist);
+			list_del_init(&pf_req->queuelist);
+			blk_mq_start_request(pf_req);
+			break;
+		}
+	} while (pf_queue != old_pos);
+
+	return pf_req != NULL;
+}
+
+static void pf_end_request(blk_status_t err)
+{
+	if (!pf_req)
+		return;
+	if (!blk_update_request(pf_req, err, blk_rq_cur_bytes(pf_req))) {
+		__blk_mq_end_request(pf_req, err);
+		pf_req = NULL;
+	}
+}
+
+static void pf_request(void)
+{
+	if (pf_busy)
+		return;
+repeat:
+	if (!pf_req && !set_next_request())
+		return;
+
+	pf_current = pf_req->q->disk->private_data;
+	pf_block = blk_rq_pos(pf_req);
+	pf_run = blk_rq_sectors(pf_req);
+	pf_count = blk_rq_cur_sectors(pf_req);
+
+	if (pf_block + pf_count > get_capacity(pf_req->q->disk)) {
+		pf_end_request(BLK_STS_IOERR);
+		goto repeat;
+	}
+
+	pf_cmd = rq_data_dir(pf_req);
+	pf_buf = bio_data(pf_req->bio);
+	pf_retries = 0;
+
+	pf_busy = 1;
+	if (pf_cmd == READ)
+		pi_do_claimed(pf_current->pi, do_pf_read);
+	else if (pf_cmd == WRITE)
+		pi_do_claimed(pf_current->pi, do_pf_write);
+	else {
+		pf_busy = 0;
+		pf_end_request(BLK_STS_IOERR);
+		goto repeat;
+	}
+}
+
+static blk_status_t pf_queue_rq(struct blk_mq_hw_ctx *hctx,
+				const struct blk_mq_queue_data *bd)
+{
+	struct pf_unit *pf = hctx->queue->queuedata;
+
+	spin_lock_irq(&pf_spin_lock);
+	list_add_tail(&bd->rq->queuelist, &pf->rq_list);
+	pf_request();
+	spin_unlock_irq(&pf_spin_lock);
+
+	return BLK_STS_OK;
+}
+
+static int pf_next_buf(void)
+{
+	unsigned long saved_flags;
+
+	pf_count--;
+	pf_run--;
+	pf_buf += 512;
+	pf_block++;
+	if (!pf_run)
+		return 1;
+	if (!pf_count) {
+		spin_lock_irqsave(&pf_spin_lock, saved_flags);
+		pf_end_request(0);
+		spin_unlock_irqrestore(&pf_spin_lock, saved_flags);
+		if (!pf_req)
+			return 1;
+		pf_count = blk_rq_cur_sectors(pf_req);
+		pf_buf = bio_data(pf_req->bio);
+	}
+	return 0;
+}
+
+static inline void next_request(blk_status_t err)
+{
+	unsigned long saved_flags;
+
+	spin_lock_irqsave(&pf_spin_lock, saved_flags);
+	pf_end_request(err);
+	pf_busy = 0;
+	pf_request();
+	spin_unlock_irqrestore(&pf_spin_lock, saved_flags);
+}
+
+/* detach from the calling context - in case the spinlock is held */
+static void do_pf_read(void)
+{
+	ps_set_intr(do_pf_read_start, NULL, 0, nice);
+}
+
+static void do_pf_read_start(void)
+{
+	pf_busy = 1;
+
+	if (pf_start(pf_current, ATAPI_READ_10, pf_block, pf_run)) {
+		pi_disconnect(pf_current->pi);
+		if (pf_retries < PF_MAX_RETRIES) {
+			pf_retries++;
+			pi_do_claimed(pf_current->pi, do_pf_read_start);
+			return;
+		}
+		next_request(BLK_STS_IOERR);
+		return;
+	}
+	pf_mask = STAT_DRQ;
+	ps_set_intr(do_pf_read_drq, pf_ready, PF_TMO, nice);
+}
+
+static void do_pf_read_drq(void)
+{
+	while (1) {
+		if (pf_wait(pf_current, STAT_BUSY, STAT_DRQ | STAT_ERR,
+			    "read block", "completion") & STAT_ERR) {
+			pi_disconnect(pf_current->pi);
+			if (pf_retries < PF_MAX_RETRIES) {
+				pf_req_sense(pf_current, 0);
+				pf_retries++;
+				pi_do_claimed(pf_current->pi, do_pf_read_start);
+				return;
+			}
+			next_request(BLK_STS_IOERR);
+			return;
+		}
+		pi_read_block(pf_current->pi, pf_buf, 512);
+		if (pf_next_buf())
+			break;
+	}
+	pi_disconnect(pf_current->pi);
+	next_request(0);
+}
+
+static void do_pf_write(void)
+{
+	ps_set_intr(do_pf_write_start, NULL, 0, nice);
+}
+
+static void do_pf_write_start(void)
+{
+	pf_busy = 1;
+
+	if (pf_start(pf_current, ATAPI_WRITE_10, pf_block, pf_run)) {
+		pi_disconnect(pf_current->pi);
+		if (pf_retries < PF_MAX_RETRIES) {
+			pf_retries++;
+			pi_do_claimed(pf_current->pi, do_pf_write_start);
+			return;
+		}
+		next_request(BLK_STS_IOERR);
+		return;
+	}
+
+	while (1) {
+		if (pf_wait(pf_current, STAT_BUSY, STAT_DRQ | STAT_ERR,
+			    "write block", "data wait") & STAT_ERR) {
+			pi_disconnect(pf_current->pi);
+			if (pf_retries < PF_MAX_RETRIES) {
+				pf_retries++;
+				pi_do_claimed(pf_current->pi, do_pf_write_start);
+				return;
+			}
+			next_request(BLK_STS_IOERR);
+			return;
+		}
+		pi_write_block(pf_current->pi, pf_buf, 512);
+		if (pf_next_buf())
+			break;
+	}
+	pf_mask = 0;
+	ps_set_intr(do_pf_write_done, pf_ready, PF_TMO, nice);
+}
+
+static void do_pf_write_done(void)
+{
+	if (pf_wait(pf_current, STAT_BUSY, 0, "write block", "done") & STAT_ERR) {
+		pi_disconnect(pf_current->pi);
+		if (pf_retries < PF_MAX_RETRIES) {
+			pf_retries++;
+			pi_do_claimed(pf_current->pi, do_pf_write_start);
+			return;
+		}
+		next_request(BLK_STS_IOERR);
+		return;
+	}
+	pi_disconnect(pf_current->pi);
+	next_request(0);
+}
+
+static int __init pf_init_unit(struct pf_unit *pf, bool autoprobe, int port,
+		int mode, int unit, int protocol, int delay, int ms)
+{
+	struct gendisk *disk;
+	int ret;
+
+	ret = blk_mq_alloc_sq_tag_set(&pf->tag_set, &pf_mq_ops, 1,
+				      BLK_MQ_F_SHOULD_MERGE);
+	if (ret)
+		return ret;
+
+	disk = blk_mq_alloc_disk(&pf->tag_set, pf);
+	if (IS_ERR(disk)) {
+		ret = PTR_ERR(disk);
+		goto out_free_tag_set;
+	}
+	disk->major = major;
+	disk->first_minor = pf - units;
+	disk->minors = 1;
+	strcpy(disk->disk_name, pf->name);
+	disk->fops = &pf_fops;
+	disk->flags |= GENHD_FL_NO_PART;
+	disk->events = DISK_EVENT_MEDIA_CHANGE;
+	disk->private_data = pf;
+
+	blk_queue_max_segments(disk->queue, cluster);
+	blk_queue_bounce_limit(disk->queue, BLK_BOUNCE_HIGH);
+
+	INIT_LIST_HEAD(&pf->rq_list);
+	pf->disk = disk;
+	pf->pi = &pf->pia;
+	pf->media_status = PF_NM;
+	pf->drive = (*drives[disk->first_minor])[D_SLV];
+	pf->lun = (*drives[disk->first_minor])[D_LUN];
+	snprintf(pf->name, PF_NAMELEN, "%s%d", name, disk->first_minor);
+
+	if (!pi_init(pf->pi, autoprobe, port, mode, unit, protocol, delay,
+			pf_scratch, PI_PF, verbose, pf->name)) {
+		ret = -ENODEV;
+		goto out_free_disk;
+	}
+	ret = pf_probe(pf);
+	if (ret)
+		goto out_pi_release;
+
+	ret = add_disk(disk);
+	if (ret)
+		goto out_pi_release;
+	pf->present = 1;
+	return 0;
+
+out_pi_release:
+	pi_release(pf->pi);
+out_free_disk:
+	put_disk(pf->disk);
+out_free_tag_set:
+	blk_mq_free_tag_set(&pf->tag_set);
+	return ret;
+}
+
+static int __init pf_init(void)
+{				/* preliminary initialisation */
+	struct pf_unit *pf;
+	int found = 0, unit;
+
+	if (disable)
+		return -EINVAL;
+
+	if (register_blkdev(major, name))
+		return -EBUSY;
+
+	printk("%s: %s version %s, major %d, cluster %d, nice %d\n",
+	       name, name, PF_VERSION, major, cluster, nice);
+
+	par_drv = pi_register_driver(name);
+	if (!par_drv) {
+		pr_err("failed to register %s driver\n", name);
+		goto out_unregister_blkdev;
+	}
+
+	for (unit = 0; unit < PF_UNITS; unit++) {
+		if (!(*drives[unit])[D_PRT])
+			pf_drive_count++;
+	}
+
+	pf = units;
+	if (pf_drive_count == 0) {
+		if (pf_init_unit(pf, 1, -1, -1, -1, -1, -1, verbose))
+			found++;
+	} else {
+		for (unit = 0; unit < PF_UNITS; unit++, pf++) {
+			int *conf = *drives[unit];
+			if (!conf[D_PRT])
+				continue;
+			if (pf_init_unit(pf, 0, conf[D_PRT], conf[D_MOD],
+				    conf[D_UNI], conf[D_PRO], conf[D_DLY],
+				    verbose))
+				found++;
+		}
+	}
+	if (!found) {
+		printk("%s: No ATAPI disk detected\n", name);
+		goto out_unregister_pi_driver;
+	}
+	pf_busy = 0;
+	return 0;
+
+out_unregister_pi_driver:
+	pi_unregister_driver(par_drv);
+out_unregister_blkdev:
+	unregister_blkdev(major, name);
+	return -ENODEV;
+}
+
+static void __exit pf_exit(void)
+{
+	struct pf_unit *pf;
+	int unit;
+
+	for (pf = units, unit = 0; unit < PF_UNITS; pf++, unit++) {
+		if (!pf->present)
+			continue;
+		del_gendisk(pf->disk);
+		put_disk(pf->disk);
+		blk_mq_free_tag_set(&pf->tag_set);
+		pi_release(pf->pi);
+	}
+
+	unregister_blkdev(major, name);
+}
+
+MODULE_LICENSE("GPL");
+module_init(pf_init)
+module_exit(pf_exit)
diff --git a/drivers/block/paride/pg.c b/drivers/block/paride/pg.c
new file mode 100644
index 000000000..3b5882bfb
--- /dev/null
+++ b/drivers/block/paride/pg.c
@@ -0,0 +1,734 @@
+/* 
+	pg.c    (c) 1998  Grant R. Guenther <grant@torque.net>
+			  Under the terms of the GNU General Public License.
+
+	The pg driver provides a simple character device interface for
+	sending ATAPI commands to a device.  With the exception of the
+	ATAPI reset operation, all operations are performed by a pair
+	of read and write operations to the appropriate /dev/pgN device.
+	A write operation delivers a command and any outbound data in
+	a single buffer.  Normally, the write will succeed unless the
+	device is offline or malfunctioning, or there is already another
+	command pending.  If the write succeeds, it should be followed
+	immediately by a read operation, to obtain any returned data and
+	status information.  A read will fail if there is no operation
+	in progress.
+
+	As a special case, the device can be reset with a write operation,
+	and in this case, no following read is expected, or permitted.
+
+	There are no ioctl() operations.  Any single operation
+	may transfer at most PG_MAX_DATA bytes.  Note that the driver must
+	copy the data through an internal buffer.  In keeping with all
+	current ATAPI devices, command packets are assumed to be exactly
+	12 bytes in length.
+
+	To permit future changes to this interface, the headers in the
+	read and write buffers contain a single character "magic" flag.
+	Currently this flag must be the character "P".
+
+	By default, the driver will autoprobe for a single parallel
+	port ATAPI device, but if their individual parameters are
+	specified, the driver can handle up to 4 devices.
+
+	To use this device, you must have the following device 
+	special files defined:
+
+		/dev/pg0 c 97 0
+		/dev/pg1 c 97 1
+		/dev/pg2 c 97 2
+		/dev/pg3 c 97 3
+
+	(You'll need to change the 97 to something else if you use
+	the 'major' parameter to install the driver on a different
+	major number.)
+
+	The behaviour of the pg driver can be altered by setting
+	some parameters from the insmod command line.  The following
+	parameters are adjustable:
+
+	    drive0      These four arguments can be arrays of       
+	    drive1      1-6 integers as follows:
+	    drive2
+	    drive3      <prt>,<pro>,<uni>,<mod>,<slv>,<dly>
+
+			Where,
+
+		<prt>   is the base of the parallel port address for
+			the corresponding drive.  (required)
+
+		<pro>   is the protocol number for the adapter that
+			supports this drive.  These numbers are
+			logged by 'paride' when the protocol modules
+			are initialised.  (0 if not given)
+
+		<uni>   for those adapters that support chained
+			devices, this is the unit selector for the
+			chain of devices on the given port.  It should
+			be zero for devices that don't support chaining.
+			(0 if not given)
+
+		<mod>   this can be -1 to choose the best mode, or one
+			of the mode numbers supported by the adapter.
+			(-1 if not given)
+
+		<slv>   ATAPI devices can be jumpered to master or slave.
+			Set this to 0 to choose the master drive, 1 to
+			choose the slave, -1 (the default) to choose the
+			first drive found.
+
+		<dly>   some parallel ports require the driver to 
+			go more slowly.  -1 sets a default value that
+			should work with the chosen protocol.  Otherwise,
+			set this to a small integer, the larger it is
+			the slower the port i/o.  In some cases, setting
+			this to zero will speed up the device. (default -1)
+
+	    major	You may use this parameter to override the
+			default major number (97) that this driver
+			will use.  Be sure to change the device
+			name as well.
+
+	    name	This parameter is a character string that
+			contains the name the kernel will use for this
+			device (in /proc output, for instance).
+			(default "pg").
+
+	    verbose     This parameter controls the amount of logging
+			that is done by the driver.  Set it to 0 for 
+			quiet operation, to 1 to enable progress
+			messages while the driver probes for devices,
+			or to 2 for full debug logging.  (default 0)
+
+	If this driver is built into the kernel, you can use 
+	the following command line parameters, with the same values
+	as the corresponding module parameters listed above:
+
+	    pg.drive0
+	    pg.drive1
+	    pg.drive2
+	    pg.drive3
+
+	In addition, you can use the parameter pg.disable to disable
+	the driver entirely.
+
+*/
+
+/* Changes:
+
+	1.01	GRG 1998.06.16	Bug fixes
+	1.02    GRG 1998.09.24  Added jumbo support
+
+*/
+
+#define PG_VERSION      "1.02"
+#define PG_MAJOR	97
+#define PG_NAME		"pg"
+#define PG_UNITS	4
+
+#ifndef PI_PG
+#define PI_PG	4
+#endif
+
+#include <linux/types.h>
+/* Here are things one can override from the insmod command.
+   Most are autoprobed by paride unless set here.  Verbose is 0
+   by default.
+
+*/
+
+static int verbose;
+static int major = PG_MAJOR;
+static char *name = PG_NAME;
+static int disable = 0;
+
+static int drive0[6] = { 0, 0, 0, -1, -1, -1 };
+static int drive1[6] = { 0, 0, 0, -1, -1, -1 };
+static int drive2[6] = { 0, 0, 0, -1, -1, -1 };
+static int drive3[6] = { 0, 0, 0, -1, -1, -1 };
+
+static int (*drives[4])[6] = {&drive0, &drive1, &drive2, &drive3};
+static int pg_drive_count;
+
+enum {D_PRT, D_PRO, D_UNI, D_MOD, D_SLV, D_DLY};
+
+/* end of parameters */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/mtio.h>
+#include <linux/pg.h>
+#include <linux/device.h>
+#include <linux/sched.h>	/* current, TASK_* */
+#include <linux/mutex.h>
+#include <linux/jiffies.h>
+
+#include <linux/uaccess.h>
+
+module_param(verbose, int, 0644);
+module_param(major, int, 0);
+module_param(name, charp, 0);
+module_param_array(drive0, int, NULL, 0);
+module_param_array(drive1, int, NULL, 0);
+module_param_array(drive2, int, NULL, 0);
+module_param_array(drive3, int, NULL, 0);
+
+#include "paride.h"
+
+#define PG_SPIN_DEL     50	/* spin delay in micro-seconds  */
+#define PG_SPIN         200
+#define PG_TMO		HZ
+#define PG_RESET_TMO	10*HZ
+
+#define STAT_ERR        0x01
+#define STAT_INDEX      0x02
+#define STAT_ECC        0x04
+#define STAT_DRQ        0x08
+#define STAT_SEEK       0x10
+#define STAT_WRERR      0x20
+#define STAT_READY      0x40
+#define STAT_BUSY       0x80
+
+#define ATAPI_IDENTIFY		0x12
+
+static DEFINE_MUTEX(pg_mutex);
+static int pg_open(struct inode *inode, struct file *file);
+static int pg_release(struct inode *inode, struct file *file);
+static ssize_t pg_read(struct file *filp, char __user *buf,
+		       size_t count, loff_t * ppos);
+static ssize_t pg_write(struct file *filp, const char __user *buf,
+			size_t count, loff_t * ppos);
+static int pg_detect(void);
+
+#define PG_NAMELEN      8
+
+struct pg {
+	struct pi_adapter pia;	/* interface to paride layer */
+	struct pi_adapter *pi;
+	int busy;		/* write done, read expected */
+	int start;		/* jiffies at command start */
+	int dlen;		/* transfer size requested */
+	unsigned long timeout;	/* timeout requested */
+	int status;		/* last sense key */
+	int drive;		/* drive */
+	unsigned long access;	/* count of active opens ... */
+	int present;		/* device present ? */
+	char *bufptr;
+	char name[PG_NAMELEN];	/* pg0, pg1, ... */
+};
+
+static struct pg devices[PG_UNITS];
+
+static int pg_identify(struct pg *dev, int log);
+
+static char pg_scratch[512];	/* scratch block buffer */
+
+static struct class *pg_class;
+static void *par_drv;		/* reference of parport driver */
+
+/* kernel glue structures */
+
+static const struct file_operations pg_fops = {
+	.owner = THIS_MODULE,
+	.read = pg_read,
+	.write = pg_write,
+	.open = pg_open,
+	.release = pg_release,
+	.llseek = noop_llseek,
+};
+
+static void pg_init_units(void)
+{
+	int unit;
+
+	pg_drive_count = 0;
+	for (unit = 0; unit < PG_UNITS; unit++) {
+		int *parm = *drives[unit];
+		struct pg *dev = &devices[unit];
+		dev->pi = &dev->pia;
+		clear_bit(0, &dev->access);
+		dev->busy = 0;
+		dev->present = 0;
+		dev->bufptr = NULL;
+		dev->drive = parm[D_SLV];
+		snprintf(dev->name, PG_NAMELEN, "%s%c", name, 'a'+unit);
+		if (parm[D_PRT])
+			pg_drive_count++;
+	}
+}
+
+static inline int status_reg(struct pg *dev)
+{
+	return pi_read_regr(dev->pi, 1, 6);
+}
+
+static inline int read_reg(struct pg *dev, int reg)
+{
+	return pi_read_regr(dev->pi, 0, reg);
+}
+
+static inline void write_reg(struct pg *dev, int reg, int val)
+{
+	pi_write_regr(dev->pi, 0, reg, val);
+}
+
+static inline u8 DRIVE(struct pg *dev)
+{
+	return 0xa0+0x10*dev->drive;
+}
+
+static void pg_sleep(int cs)
+{
+	schedule_timeout_interruptible(cs);
+}
+
+static int pg_wait(struct pg *dev, int go, int stop, unsigned long tmo, char *msg)
+{
+	int j, r, e, s, p, to;
+
+	dev->status = 0;
+
+	j = 0;
+	while ((((r = status_reg(dev)) & go) || (stop && (!(r & stop))))
+	       && time_before(jiffies, tmo)) {
+		if (j++ < PG_SPIN)
+			udelay(PG_SPIN_DEL);
+		else
+			pg_sleep(1);
+	}
+
+	to = time_after_eq(jiffies, tmo);
+
+	if ((r & (STAT_ERR & stop)) || to) {
+		s = read_reg(dev, 7);
+		e = read_reg(dev, 1);
+		p = read_reg(dev, 2);
+		if (verbose > 1)
+			printk("%s: %s: stat=0x%x err=0x%x phase=%d%s\n",
+			       dev->name, msg, s, e, p, to ? " timeout" : "");
+		if (to)
+			e |= 0x100;
+		dev->status = (e >> 4) & 0xff;
+		return -1;
+	}
+	return 0;
+}
+
+static int pg_command(struct pg *dev, char *cmd, int dlen, unsigned long tmo)
+{
+	int k;
+
+	pi_connect(dev->pi);
+
+	write_reg(dev, 6, DRIVE(dev));
+
+	if (pg_wait(dev, STAT_BUSY | STAT_DRQ, 0, tmo, "before command"))
+		goto fail;
+
+	write_reg(dev, 4, dlen % 256);
+	write_reg(dev, 5, dlen / 256);
+	write_reg(dev, 7, 0xa0);	/* ATAPI packet command */
+
+	if (pg_wait(dev, STAT_BUSY, STAT_DRQ, tmo, "command DRQ"))
+		goto fail;
+
+	if (read_reg(dev, 2) != 1) {
+		printk("%s: command phase error\n", dev->name);
+		goto fail;
+	}
+
+	pi_write_block(dev->pi, cmd, 12);
+
+	if (verbose > 1) {
+		printk("%s: Command sent, dlen=%d packet= ", dev->name, dlen);
+		for (k = 0; k < 12; k++)
+			printk("%02x ", cmd[k] & 0xff);
+		printk("\n");
+	}
+	return 0;
+fail:
+	pi_disconnect(dev->pi);
+	return -1;
+}
+
+static int pg_completion(struct pg *dev, char *buf, unsigned long tmo)
+{
+	int r, d, n, p;
+
+	r = pg_wait(dev, STAT_BUSY, STAT_DRQ | STAT_READY | STAT_ERR,
+		    tmo, "completion");
+
+	dev->dlen = 0;
+
+	while (read_reg(dev, 7) & STAT_DRQ) {
+		d = (read_reg(dev, 4) + 256 * read_reg(dev, 5));
+		n = ((d + 3) & 0xfffc);
+		p = read_reg(dev, 2) & 3;
+		if (p == 0)
+			pi_write_block(dev->pi, buf, n);
+		if (p == 2)
+			pi_read_block(dev->pi, buf, n);
+		if (verbose > 1)
+			printk("%s: %s %d bytes\n", dev->name,
+			       p ? "Read" : "Write", n);
+		dev->dlen += (1 - p) * d;
+		buf += d;
+		r = pg_wait(dev, STAT_BUSY, STAT_DRQ | STAT_READY | STAT_ERR,
+			    tmo, "completion");
+	}
+
+	pi_disconnect(dev->pi);
+
+	return r;
+}
+
+static int pg_reset(struct pg *dev)
+{
+	int i, k, err;
+	int expect[5] = { 1, 1, 1, 0x14, 0xeb };
+	int got[5];
+
+	pi_connect(dev->pi);
+	write_reg(dev, 6, DRIVE(dev));
+	write_reg(dev, 7, 8);
+
+	pg_sleep(20 * HZ / 1000);
+
+	k = 0;
+	while ((k++ < PG_RESET_TMO) && (status_reg(dev) & STAT_BUSY))
+		pg_sleep(1);
+
+	for (i = 0; i < 5; i++)
+		got[i] = read_reg(dev, i + 1);
+
+	err = memcmp(expect, got, sizeof(got)) ? -1 : 0;
+
+	if (verbose) {
+		printk("%s: Reset (%d) signature = ", dev->name, k);
+		for (i = 0; i < 5; i++)
+			printk("%3x", got[i]);
+		if (err)
+			printk(" (incorrect)");
+		printk("\n");
+	}
+
+	pi_disconnect(dev->pi);
+	return err;
+}
+
+static void xs(char *buf, char *targ, int len)
+{
+	char l = '\0';
+	int k;
+
+	for (k = 0; k < len; k++) {
+		char c = *buf++;
+		if (c != ' ' && c != l)
+			l = *targ++ = c;
+	}
+	if (l == ' ')
+		targ--;
+	*targ = '\0';
+}
+
+static int pg_identify(struct pg *dev, int log)
+{
+	int s;
+	char *ms[2] = { "master", "slave" };
+	char mf[10], id[18];
+	char id_cmd[12] = { ATAPI_IDENTIFY, 0, 0, 0, 36, 0, 0, 0, 0, 0, 0, 0 };
+	char buf[36];
+
+	s = pg_command(dev, id_cmd, 36, jiffies + PG_TMO);
+	if (s)
+		return -1;
+	s = pg_completion(dev, buf, jiffies + PG_TMO);
+	if (s)
+		return -1;
+
+	if (log) {
+		xs(buf + 8, mf, 8);
+		xs(buf + 16, id, 16);
+		printk("%s: %s %s, %s\n", dev->name, mf, id, ms[dev->drive]);
+	}
+
+	return 0;
+}
+
+/*
+ * returns  0, with id set if drive is detected
+ *	   -1, if drive detection failed
+ */
+static int pg_probe(struct pg *dev)
+{
+	if (dev->drive == -1) {
+		for (dev->drive = 0; dev->drive <= 1; dev->drive++)
+			if (!pg_reset(dev))
+				return pg_identify(dev, 1);
+	} else {
+		if (!pg_reset(dev))
+			return pg_identify(dev, 1);
+	}
+	return -1;
+}
+
+static int pg_detect(void)
+{
+	struct pg *dev = &devices[0];
+	int k, unit;
+
+	printk("%s: %s version %s, major %d\n", name, name, PG_VERSION, major);
+
+	par_drv = pi_register_driver(name);
+	if (!par_drv) {
+		pr_err("failed to register %s driver\n", name);
+		return -1;
+	}
+
+	k = 0;
+	if (pg_drive_count == 0) {
+		if (pi_init(dev->pi, 1, -1, -1, -1, -1, -1, pg_scratch,
+			    PI_PG, verbose, dev->name)) {
+			if (!pg_probe(dev)) {
+				dev->present = 1;
+				k++;
+			} else
+				pi_release(dev->pi);
+		}
+
+	} else
+		for (unit = 0; unit < PG_UNITS; unit++, dev++) {
+			int *parm = *drives[unit];
+			if (!parm[D_PRT])
+				continue;
+			if (pi_init(dev->pi, 0, parm[D_PRT], parm[D_MOD],
+				    parm[D_UNI], parm[D_PRO], parm[D_DLY],
+				    pg_scratch, PI_PG, verbose, dev->name)) {
+				if (!pg_probe(dev)) {
+					dev->present = 1;
+					k++;
+				} else
+					pi_release(dev->pi);
+			}
+		}
+
+	if (k)
+		return 0;
+
+	pi_unregister_driver(par_drv);
+	printk("%s: No ATAPI device detected\n", name);
+	return -1;
+}
+
+static int pg_open(struct inode *inode, struct file *file)
+{
+	int unit = iminor(inode) & 0x7f;
+	struct pg *dev = &devices[unit];
+	int ret = 0;
+
+	mutex_lock(&pg_mutex);
+	if ((unit >= PG_UNITS) || (!dev->present)) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	if (test_and_set_bit(0, &dev->access)) {
+		ret = -EBUSY;
+		goto out;
+	}
+
+	if (dev->busy) {
+		pg_reset(dev);
+		dev->busy = 0;
+	}
+
+	pg_identify(dev, (verbose > 1));
+
+	dev->bufptr = kmalloc(PG_MAX_DATA, GFP_KERNEL);
+	if (dev->bufptr == NULL) {
+		clear_bit(0, &dev->access);
+		printk("%s: buffer allocation failed\n", dev->name);
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	file->private_data = dev;
+
+out:
+	mutex_unlock(&pg_mutex);
+	return ret;
+}
+
+static int pg_release(struct inode *inode, struct file *file)
+{
+	struct pg *dev = file->private_data;
+
+	kfree(dev->bufptr);
+	dev->bufptr = NULL;
+	clear_bit(0, &dev->access);
+
+	return 0;
+}
+
+static ssize_t pg_write(struct file *filp, const char __user *buf, size_t count, loff_t *ppos)
+{
+	struct pg *dev = filp->private_data;
+	struct pg_write_hdr hdr;
+	int hs = sizeof (hdr);
+
+	if (dev->busy)
+		return -EBUSY;
+	if (count < hs)
+		return -EINVAL;
+
+	if (copy_from_user(&hdr, buf, hs))
+		return -EFAULT;
+
+	if (hdr.magic != PG_MAGIC)
+		return -EINVAL;
+	if (hdr.dlen < 0 || hdr.dlen > PG_MAX_DATA)
+		return -EINVAL;
+	if ((count - hs) > PG_MAX_DATA)
+		return -EINVAL;
+
+	if (hdr.func == PG_RESET) {
+		if (count != hs)
+			return -EINVAL;
+		if (pg_reset(dev))
+			return -EIO;
+		return count;
+	}
+
+	if (hdr.func != PG_COMMAND)
+		return -EINVAL;
+
+	dev->start = jiffies;
+	dev->timeout = hdr.timeout * HZ + HZ / 2 + jiffies;
+
+	if (pg_command(dev, hdr.packet, hdr.dlen, jiffies + PG_TMO)) {
+		if (dev->status & 0x10)
+			return -ETIME;
+		return -EIO;
+	}
+
+	dev->busy = 1;
+
+	if (copy_from_user(dev->bufptr, buf + hs, count - hs))
+		return -EFAULT;
+	return count;
+}
+
+static ssize_t pg_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos)
+{
+	struct pg *dev = filp->private_data;
+	struct pg_read_hdr hdr;
+	int hs = sizeof (hdr);
+	int copy;
+
+	if (!dev->busy)
+		return -EINVAL;
+	if (count < hs)
+		return -EINVAL;
+
+	dev->busy = 0;
+
+	if (pg_completion(dev, dev->bufptr, dev->timeout))
+		if (dev->status & 0x10)
+			return -ETIME;
+
+	memset(&hdr, 0, sizeof(hdr));
+	hdr.magic = PG_MAGIC;
+	hdr.dlen = dev->dlen;
+	copy = 0;
+
+	if (hdr.dlen < 0) {
+		hdr.dlen = -1 * hdr.dlen;
+		copy = hdr.dlen;
+		if (copy > (count - hs))
+			copy = count - hs;
+	}
+
+	hdr.duration = (jiffies - dev->start + HZ / 2) / HZ;
+	hdr.scsi = dev->status & 0x0f;
+
+	if (copy_to_user(buf, &hdr, hs))
+		return -EFAULT;
+	if (copy > 0)
+		if (copy_to_user(buf + hs, dev->bufptr, copy))
+			return -EFAULT;
+	return copy + hs;
+}
+
+static int __init pg_init(void)
+{
+	int unit;
+	int err;
+
+	if (disable){
+		err = -EINVAL;
+		goto out;
+	}
+
+	pg_init_units();
+
+	if (pg_detect()) {
+		err = -ENODEV;
+		goto out;
+	}
+
+	err = register_chrdev(major, name, &pg_fops);
+	if (err < 0) {
+		printk("pg_init: unable to get major number %d\n", major);
+		for (unit = 0; unit < PG_UNITS; unit++) {
+			struct pg *dev = &devices[unit];
+			if (dev->present)
+				pi_release(dev->pi);
+		}
+		goto out;
+	}
+	major = err;	/* In case the user specified `major=0' (dynamic) */
+	pg_class = class_create(THIS_MODULE, "pg");
+	if (IS_ERR(pg_class)) {
+		err = PTR_ERR(pg_class);
+		goto out_chrdev;
+	}
+	for (unit = 0; unit < PG_UNITS; unit++) {
+		struct pg *dev = &devices[unit];
+		if (dev->present)
+			device_create(pg_class, NULL, MKDEV(major, unit), NULL,
+				      "pg%u", unit);
+	}
+	err = 0;
+	goto out;
+
+out_chrdev:
+	unregister_chrdev(major, "pg");
+out:
+	return err;
+}
+
+static void __exit pg_exit(void)
+{
+	int unit;
+
+	for (unit = 0; unit < PG_UNITS; unit++) {
+		struct pg *dev = &devices[unit];
+		if (dev->present)
+			device_destroy(pg_class, MKDEV(major, unit));
+	}
+	class_destroy(pg_class);
+	unregister_chrdev(major, name);
+
+	for (unit = 0; unit < PG_UNITS; unit++) {
+		struct pg *dev = &devices[unit];
+		if (dev->present)
+			pi_release(dev->pi);
+	}
+}
+
+MODULE_LICENSE("GPL");
+module_init(pg_init)
+module_exit(pg_exit)
diff --git a/drivers/block/paride/ppc6lnx.c b/drivers/block/paride/ppc6lnx.c
new file mode 100644
index 000000000..5e5521d3b
--- /dev/null
+++ b/drivers/block/paride/ppc6lnx.c
@@ -0,0 +1,726 @@
+/*
+	ppc6lnx.c (c) 2001 Micro Solutions Inc.
+		Released under the terms of the GNU General Public license
+
+	ppc6lnx.c  is a par of the protocol driver for the Micro Solutions
+		"BACKPACK" parallel port IDE adapter
+		(Works on Series 6 drives)
+
+*/
+
+//***************************************************************************
+
+// PPC 6 Code in C sanitized for LINUX
+// Original x86 ASM by Ron, Converted to C by Clive
+
+//***************************************************************************
+
+
+#define port_stb					1
+#define port_afd					2
+#define cmd_stb						port_afd
+#define port_init					4
+#define data_stb					port_init
+#define port_sel					8
+#define port_int					16
+#define port_dir					0x20
+
+#define ECR_EPP	0x80
+#define ECR_BI	0x20
+
+//***************************************************************************
+
+//  60772 Commands
+
+#define ACCESS_REG				0x00
+#define ACCESS_PORT				0x40
+
+#define ACCESS_READ				0x00
+#define ACCESS_WRITE			0x20
+
+//  60772 Command Prefix
+
+#define CMD_PREFIX_SET		0xe0		// Special command that modifies the next command's operation
+#define CMD_PREFIX_RESET	0xc0		// Resets current cmd modifier reg bits
+ #define PREFIX_IO16			0x01		// perform 16-bit wide I/O
+ #define PREFIX_FASTWR		0x04		// enable PPC mode fast-write
+ #define PREFIX_BLK				0x08		// enable block transfer mode
+
+// 60772 Registers
+
+#define REG_STATUS				0x00		// status register
+ #define STATUS_IRQA			0x01		// Peripheral IRQA line
+ #define STATUS_EEPROM_DO	0x40		// Serial EEPROM data bit
+#define REG_VERSION				0x01		// PPC version register (read)
+#define REG_HWCFG					0x02		// Hardware Config register
+#define REG_RAMSIZE				0x03		// Size of RAM Buffer
+ #define RAMSIZE_128K			0x02
+#define REG_EEPROM				0x06		// EEPROM control register
+ #define EEPROM_SK				0x01		// eeprom SK bit
+ #define EEPROM_DI				0x02		// eeprom DI bit
+ #define EEPROM_CS				0x04		// eeprom CS bit
+ #define EEPROM_EN				0x08		// eeprom output enable
+#define REG_BLKSIZE				0x08		// Block transfer len (24 bit)
+
+//***************************************************************************
+
+typedef struct ppc_storage {
+	u16	lpt_addr;				// LPT base address
+	u8	ppc_id;
+	u8	mode;						// operating mode
+					// 0 = PPC Uni SW
+					// 1 = PPC Uni FW
+					// 2 = PPC Bi SW
+					// 3 = PPC Bi FW
+					// 4 = EPP Byte
+					// 5 = EPP Word
+					// 6 = EPP Dword
+	u8	ppc_flags;
+	u8	org_data;				// original LPT data port contents
+	u8	org_ctrl;				// original LPT control port contents
+	u8	cur_ctrl;				// current control port contents
+} Interface;
+
+//***************************************************************************
+
+// ppc_flags
+
+#define fifo_wait					0x10
+
+//***************************************************************************
+
+// DONT CHANGE THESE LEST YOU BREAK EVERYTHING - BIT FIELD DEPENDENCIES
+
+#define PPCMODE_UNI_SW		0
+#define PPCMODE_UNI_FW		1
+#define PPCMODE_BI_SW			2
+#define PPCMODE_BI_FW			3
+#define PPCMODE_EPP_BYTE	4
+#define PPCMODE_EPP_WORD	5
+#define PPCMODE_EPP_DWORD	6
+
+//***************************************************************************
+
+static int ppc6_select(Interface *ppc);
+static void ppc6_deselect(Interface *ppc);
+static void ppc6_send_cmd(Interface *ppc, u8 cmd);
+static void ppc6_wr_data_byte(Interface *ppc, u8 data);
+static u8 ppc6_rd_data_byte(Interface *ppc);
+static u8 ppc6_rd_port(Interface *ppc, u8 port);
+static void ppc6_wr_port(Interface *ppc, u8 port, u8 data);
+static void ppc6_rd_data_blk(Interface *ppc, u8 *data, long count);
+static void ppc6_wait_for_fifo(Interface *ppc);
+static void ppc6_wr_data_blk(Interface *ppc, u8 *data, long count);
+static void ppc6_rd_port16_blk(Interface *ppc, u8 port, u8 *data, long length);
+static void ppc6_wr_port16_blk(Interface *ppc, u8 port, u8 *data, long length);
+static void ppc6_wr_extout(Interface *ppc, u8 regdata);
+static int ppc6_open(Interface *ppc);
+static void ppc6_close(Interface *ppc);
+
+//***************************************************************************
+
+static int ppc6_select(Interface *ppc)
+{
+	u8 i, j, k;
+
+	i = inb(ppc->lpt_addr + 1);
+
+	if (i & 1)
+		outb(i, ppc->lpt_addr + 1);
+
+	ppc->org_data = inb(ppc->lpt_addr);
+
+	ppc->org_ctrl = inb(ppc->lpt_addr + 2) & 0x5F; // readback ctrl
+
+	ppc->cur_ctrl = ppc->org_ctrl;
+
+	ppc->cur_ctrl |= port_sel;
+
+	outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+
+	if (ppc->org_data == 'b')
+		outb('x', ppc->lpt_addr);
+
+	outb('b', ppc->lpt_addr);
+	outb('p', ppc->lpt_addr);
+	outb(ppc->ppc_id, ppc->lpt_addr);
+	outb(~ppc->ppc_id,ppc->lpt_addr);
+
+	ppc->cur_ctrl &= ~port_sel;
+
+	outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+
+	ppc->cur_ctrl = (ppc->cur_ctrl & port_int) | port_init;
+
+	outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+
+	i = ppc->mode & 0x0C;
+
+	if (i == 0)
+		i = (ppc->mode & 2) | 1;
+
+	outb(i, ppc->lpt_addr);
+
+	ppc->cur_ctrl |= port_sel;
+
+	outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+
+	// DELAY
+
+	ppc->cur_ctrl |= port_afd;
+
+	outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+
+	j = ((i & 0x08) << 4) | ((i & 0x07) << 3);
+
+	k = inb(ppc->lpt_addr + 1) & 0xB8;
+
+	if (j == k)
+	{
+		ppc->cur_ctrl &= ~port_afd;
+
+		outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+
+		k = (inb(ppc->lpt_addr + 1) & 0xB8) ^ 0xB8;
+
+		if (j == k)
+		{
+			if (i & 4)	// EPP
+				ppc->cur_ctrl &= ~(port_sel | port_init);
+			else				// PPC/ECP
+				ppc->cur_ctrl &= ~port_sel;
+
+			outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+
+			return(1);
+		}
+	}
+
+	outb(ppc->org_ctrl, ppc->lpt_addr + 2);
+
+	outb(ppc->org_data, ppc->lpt_addr);
+
+	return(0); // FAIL
+}
+
+//***************************************************************************
+
+static void ppc6_deselect(Interface *ppc)
+{
+	if (ppc->mode & 4)	// EPP
+		ppc->cur_ctrl |= port_init;
+	else								// PPC/ECP
+		ppc->cur_ctrl |= port_sel;
+
+	outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+
+	outb(ppc->org_data, ppc->lpt_addr);
+
+	outb((ppc->org_ctrl | port_sel), ppc->lpt_addr + 2);
+
+	outb(ppc->org_ctrl, ppc->lpt_addr + 2);
+}
+
+//***************************************************************************
+
+static void ppc6_send_cmd(Interface *ppc, u8 cmd)
+{
+	switch(ppc->mode)
+	{
+		case PPCMODE_UNI_SW :
+		case PPCMODE_UNI_FW :
+		case PPCMODE_BI_SW :
+		case PPCMODE_BI_FW :
+		{
+			outb(cmd, ppc->lpt_addr);
+
+			ppc->cur_ctrl ^= cmd_stb;
+
+			outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+
+			break;
+		}
+
+		case PPCMODE_EPP_BYTE :
+		case PPCMODE_EPP_WORD :
+		case PPCMODE_EPP_DWORD :
+		{
+			outb(cmd, ppc->lpt_addr + 3);
+
+			break;
+		}
+	}
+}
+
+//***************************************************************************
+
+static void ppc6_wr_data_byte(Interface *ppc, u8 data)
+{
+	switch(ppc->mode)
+	{
+		case PPCMODE_UNI_SW :
+		case PPCMODE_UNI_FW :
+		case PPCMODE_BI_SW :
+		case PPCMODE_BI_FW :
+		{
+			outb(data, ppc->lpt_addr);
+
+			ppc->cur_ctrl ^= data_stb;
+
+			outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+
+			break;
+		}
+
+		case PPCMODE_EPP_BYTE :
+		case PPCMODE_EPP_WORD :
+		case PPCMODE_EPP_DWORD :
+		{
+			outb(data, ppc->lpt_addr + 4);
+
+			break;
+		}
+	}
+}
+
+//***************************************************************************
+
+static u8 ppc6_rd_data_byte(Interface *ppc)
+{
+	u8 data = 0;
+
+	switch(ppc->mode)
+	{
+		case PPCMODE_UNI_SW :
+		case PPCMODE_UNI_FW :
+		{
+			ppc->cur_ctrl = (ppc->cur_ctrl & ~port_stb) ^ data_stb;
+
+			outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+
+			// DELAY
+
+			data = inb(ppc->lpt_addr + 1);
+
+			data = ((data & 0x80) >> 1) | ((data & 0x38) >> 3);
+
+			ppc->cur_ctrl |= port_stb;
+
+			outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+
+			// DELAY
+
+			data |= inb(ppc->lpt_addr + 1) & 0xB8;
+
+			break;
+		}
+
+		case PPCMODE_BI_SW :
+		case PPCMODE_BI_FW :
+		{
+			ppc->cur_ctrl |= port_dir;
+
+			outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+
+			ppc->cur_ctrl = (ppc->cur_ctrl | port_stb) ^ data_stb;
+
+			outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+
+			data = inb(ppc->lpt_addr);
+
+			ppc->cur_ctrl &= ~port_stb;
+
+			outb(ppc->cur_ctrl,ppc->lpt_addr + 2);
+
+			ppc->cur_ctrl &= ~port_dir;
+
+			outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+
+			break;
+		}
+
+		case PPCMODE_EPP_BYTE :
+		case PPCMODE_EPP_WORD :
+		case PPCMODE_EPP_DWORD :
+		{
+			outb((ppc->cur_ctrl | port_dir),ppc->lpt_addr + 2);
+
+			data = inb(ppc->lpt_addr + 4);
+
+			outb(ppc->cur_ctrl,ppc->lpt_addr + 2);
+
+			break;
+		}
+	}
+
+	return(data);
+}
+
+//***************************************************************************
+
+static u8 ppc6_rd_port(Interface *ppc, u8 port)
+{
+	ppc6_send_cmd(ppc,(u8)(port | ACCESS_PORT | ACCESS_READ));
+
+	return(ppc6_rd_data_byte(ppc));
+}
+
+//***************************************************************************
+
+static void ppc6_wr_port(Interface *ppc, u8 port, u8 data)
+{
+	ppc6_send_cmd(ppc,(u8)(port | ACCESS_PORT | ACCESS_WRITE));
+
+	ppc6_wr_data_byte(ppc, data);
+}
+
+//***************************************************************************
+
+static void ppc6_rd_data_blk(Interface *ppc, u8 *data, long count)
+{
+	switch(ppc->mode)
+	{
+		case PPCMODE_UNI_SW :
+		case PPCMODE_UNI_FW :
+		{
+			while(count)
+			{
+				u8 d;
+
+				ppc->cur_ctrl = (ppc->cur_ctrl & ~port_stb) ^ data_stb;
+
+				outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+
+				// DELAY
+
+				d = inb(ppc->lpt_addr + 1);
+
+				d = ((d & 0x80) >> 1) | ((d & 0x38) >> 3);
+
+				ppc->cur_ctrl |= port_stb;
+
+				outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+
+				// DELAY
+
+				d |= inb(ppc->lpt_addr + 1) & 0xB8;
+
+				*data++ = d;
+				count--;
+			}
+
+			break;
+		}
+
+		case PPCMODE_BI_SW :
+		case PPCMODE_BI_FW :
+		{
+			ppc->cur_ctrl |= port_dir;
+
+			outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+
+			ppc->cur_ctrl |= port_stb;
+
+			while(count)
+			{
+				ppc->cur_ctrl ^= data_stb;
+
+				outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+
+				*data++ = inb(ppc->lpt_addr);
+				count--;
+			}
+
+			ppc->cur_ctrl &= ~port_stb;
+
+			outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+
+			ppc->cur_ctrl &= ~port_dir;
+
+			outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+
+			break;
+		}
+
+		case PPCMODE_EPP_BYTE :
+		{
+			outb((ppc->cur_ctrl | port_dir), ppc->lpt_addr + 2);
+
+			// DELAY
+
+			while(count)
+			{
+				*data++ = inb(ppc->lpt_addr + 4);
+				count--;
+			}
+
+			outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+
+			break;
+		}
+
+		case PPCMODE_EPP_WORD :
+		{
+			outb((ppc->cur_ctrl | port_dir), ppc->lpt_addr + 2);
+
+			// DELAY
+
+			while(count > 1)
+			{
+				*((u16 *)data) = inw(ppc->lpt_addr + 4);
+				data  += 2;
+				count -= 2;
+			}
+
+			while(count)
+			{
+				*data++ = inb(ppc->lpt_addr + 4);
+				count--;
+			}
+
+			outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+
+			break;
+		}
+
+		case PPCMODE_EPP_DWORD :
+		{
+			outb((ppc->cur_ctrl | port_dir),ppc->lpt_addr + 2);
+
+			// DELAY
+
+			while(count > 3)
+			{
+				*((u32 *)data) = inl(ppc->lpt_addr + 4);
+				data  += 4;
+				count -= 4;
+			}
+
+			while(count)
+			{
+				*data++ = inb(ppc->lpt_addr + 4);
+				count--;
+			}
+
+			outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+
+			break;
+		}
+	}
+
+}
+
+//***************************************************************************
+
+static void ppc6_wait_for_fifo(Interface *ppc)
+{
+	int i;
+
+	if (ppc->ppc_flags & fifo_wait)
+	{
+		for(i=0; i<20; i++)
+			inb(ppc->lpt_addr + 1);
+	}
+}
+
+//***************************************************************************
+
+static void ppc6_wr_data_blk(Interface *ppc, u8 *data, long count)
+{
+	switch(ppc->mode)
+	{
+		case PPCMODE_UNI_SW :
+		case PPCMODE_BI_SW :
+		{
+			while(count--)
+			{
+				outb(*data++, ppc->lpt_addr);
+
+				ppc->cur_ctrl ^= data_stb;
+
+				outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+			}
+
+			break;
+		}
+
+		case PPCMODE_UNI_FW :
+		case PPCMODE_BI_FW :
+		{
+			u8 this, last;
+
+			ppc6_send_cmd(ppc,(CMD_PREFIX_SET | PREFIX_FASTWR));
+
+			ppc->cur_ctrl |= port_stb;
+
+			outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+
+			last = *data;
+
+			outb(last, ppc->lpt_addr);
+
+			while(count)
+			{
+				this = *data++;
+				count--;
+
+				if (this == last)
+				{
+					ppc->cur_ctrl ^= data_stb;
+
+					outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+				}
+				else
+				{
+					outb(this, ppc->lpt_addr);
+
+					last = this;
+				}
+			}
+
+			ppc->cur_ctrl &= ~port_stb;
+
+			outb(ppc->cur_ctrl, ppc->lpt_addr + 2);
+
+			ppc6_send_cmd(ppc,(CMD_PREFIX_RESET | PREFIX_FASTWR));
+
+			break;
+		}
+
+		case PPCMODE_EPP_BYTE :
+		{
+			while(count)
+			{
+				outb(*data++,ppc->lpt_addr + 4);
+				count--;
+			}
+
+			ppc6_wait_for_fifo(ppc);
+
+			break;
+		}
+
+		case PPCMODE_EPP_WORD :
+		{
+			while(count > 1)
+			{
+				outw(*((u16 *)data),ppc->lpt_addr + 4);
+				data  += 2;
+				count -= 2;
+			}
+
+			while(count)
+			{
+				outb(*data++,ppc->lpt_addr + 4);
+				count--;
+			}
+
+			ppc6_wait_for_fifo(ppc);
+
+			break;
+		}
+
+		case PPCMODE_EPP_DWORD :
+		{
+			while(count > 3)
+			{
+				outl(*((u32 *)data),ppc->lpt_addr + 4);
+				data  += 4;
+				count -= 4;
+			}
+
+			while(count)
+			{
+				outb(*data++,ppc->lpt_addr + 4);
+				count--;
+			}
+
+			ppc6_wait_for_fifo(ppc);
+
+			break;
+		}
+	}
+}
+
+//***************************************************************************
+
+static void ppc6_rd_port16_blk(Interface *ppc, u8 port, u8 *data, long length)
+{
+	length = length << 1;
+
+	ppc6_send_cmd(ppc, (REG_BLKSIZE | ACCESS_REG | ACCESS_WRITE));
+	ppc6_wr_data_byte(ppc,(u8)length);
+	ppc6_wr_data_byte(ppc,(u8)(length >> 8));
+	ppc6_wr_data_byte(ppc,0);
+
+	ppc6_send_cmd(ppc, (CMD_PREFIX_SET | PREFIX_IO16 | PREFIX_BLK));
+
+	ppc6_send_cmd(ppc, (u8)(port | ACCESS_PORT | ACCESS_READ));
+
+	ppc6_rd_data_blk(ppc, data, length);
+
+	ppc6_send_cmd(ppc, (CMD_PREFIX_RESET | PREFIX_IO16 | PREFIX_BLK));
+}
+
+//***************************************************************************
+
+static void ppc6_wr_port16_blk(Interface *ppc, u8 port, u8 *data, long length)
+{
+	length = length << 1;
+
+	ppc6_send_cmd(ppc, (REG_BLKSIZE | ACCESS_REG | ACCESS_WRITE));
+	ppc6_wr_data_byte(ppc,(u8)length);
+	ppc6_wr_data_byte(ppc,(u8)(length >> 8));
+	ppc6_wr_data_byte(ppc,0);
+
+	ppc6_send_cmd(ppc, (CMD_PREFIX_SET | PREFIX_IO16 | PREFIX_BLK));
+
+	ppc6_send_cmd(ppc, (u8)(port | ACCESS_PORT | ACCESS_WRITE));
+
+	ppc6_wr_data_blk(ppc, data, length);
+
+	ppc6_send_cmd(ppc, (CMD_PREFIX_RESET | PREFIX_IO16 | PREFIX_BLK));
+}
+
+//***************************************************************************
+
+static void ppc6_wr_extout(Interface *ppc, u8 regdata)
+{
+	ppc6_send_cmd(ppc,(REG_VERSION | ACCESS_REG | ACCESS_WRITE));
+
+	ppc6_wr_data_byte(ppc, (u8)((regdata & 0x03) << 6));
+}
+
+//***************************************************************************
+
+static int ppc6_open(Interface *ppc)
+{
+	int ret;
+
+	ret = ppc6_select(ppc);
+
+	if (ret == 0)
+		return(ret);
+
+	ppc->ppc_flags &= ~fifo_wait;
+
+	ppc6_send_cmd(ppc, (ACCESS_REG | ACCESS_WRITE | REG_RAMSIZE));
+	ppc6_wr_data_byte(ppc, RAMSIZE_128K);
+
+	ppc6_send_cmd(ppc, (ACCESS_REG | ACCESS_READ | REG_VERSION));
+
+	if ((ppc6_rd_data_byte(ppc) & 0x3F) == 0x0C)
+		ppc->ppc_flags |= fifo_wait;
+
+	return(ret);
+}
+
+//***************************************************************************
+
+static void ppc6_close(Interface *ppc)
+{
+	ppc6_deselect(ppc);
+}
+
+//***************************************************************************
+
diff --git a/drivers/block/paride/pseudo.h b/drivers/block/paride/pseudo.h
new file mode 100644
index 000000000..bc3703294
--- /dev/null
+++ b/drivers/block/paride/pseudo.h
@@ -0,0 +1,102 @@
+/* 
+        pseudo.h    (c) 1997-8  Grant R. Guenther <grant@torque.net>
+                                Under the terms of the GNU General Public License.
+
+	This is the "pseudo-interrupt" logic for parallel port drivers.
+
+        This module is #included into each driver.  It makes one
+        function available:
+
+		ps_set_intr( void (*continuation)(void),
+			     int  (*ready)(void),
+			     int timeout,
+			     int nice )
+
+	Which will arrange for ready() to be evaluated frequently and
+	when either it returns true, or timeout jiffies have passed,
+	continuation() will be invoked.
+
+	If nice is 1, the test will done approximately once a
+	jiffy.  If nice is 0, the test will also be done whenever
+	the scheduler runs (by adding it to a task queue).  If
+	nice is greater than 1, the test will be done once every
+	(nice-1) jiffies. 
+
+*/
+
+/* Changes:
+
+	1.01	1998.05.03	Switched from cli()/sti() to spinlocks
+	1.02    1998.12.14      Added support for nice > 1
+*/
+	
+#define PS_VERSION	"1.02"
+
+#include <linux/sched.h>
+#include <linux/workqueue.h>
+
+static void ps_tq_int(struct work_struct *work);
+
+static void (* ps_continuation)(void);
+static int (* ps_ready)(void);
+static unsigned long ps_timeout;
+static int ps_tq_active = 0;
+static int ps_nice = 0;
+
+static DEFINE_SPINLOCK(ps_spinlock __attribute__((unused)));
+
+static DECLARE_DELAYED_WORK(ps_tq, ps_tq_int);
+
+static void ps_set_intr(void (*continuation)(void), 
+			int (*ready)(void),
+			int timeout, int nice)
+{
+	unsigned long	flags;
+
+	spin_lock_irqsave(&ps_spinlock,flags);
+
+	ps_continuation = continuation;
+	ps_ready = ready;
+	ps_timeout = jiffies + timeout;
+	ps_nice = nice;
+
+	if (!ps_tq_active) {
+		ps_tq_active = 1;
+		if (!ps_nice)
+			schedule_delayed_work(&ps_tq, 0);
+		else
+			schedule_delayed_work(&ps_tq, ps_nice-1);
+	}
+	spin_unlock_irqrestore(&ps_spinlock,flags);
+}
+
+static void ps_tq_int(struct work_struct *work)
+{
+	void (*con)(void);
+	unsigned long flags;
+
+	spin_lock_irqsave(&ps_spinlock,flags);
+
+	con = ps_continuation;
+	ps_tq_active = 0;
+
+	if (!con) {
+		spin_unlock_irqrestore(&ps_spinlock,flags);
+		return;
+	}
+	if (!ps_ready || ps_ready() || time_after_eq(jiffies, ps_timeout)) {
+		ps_continuation = NULL;
+		spin_unlock_irqrestore(&ps_spinlock,flags);
+		con();
+		return;
+	}
+	ps_tq_active = 1;
+	if (!ps_nice)
+		schedule_delayed_work(&ps_tq, 0);
+	else
+		schedule_delayed_work(&ps_tq, ps_nice-1);
+	spin_unlock_irqrestore(&ps_spinlock,flags);
+}
+
+/* end of pseudo.h */
+
diff --git a/drivers/block/paride/pt.c b/drivers/block/paride/pt.c
new file mode 100644
index 000000000..e815312a0
--- /dev/null
+++ b/drivers/block/paride/pt.c
@@ -0,0 +1,1024 @@
+/* 
+        pt.c    (c) 1998  Grant R. Guenther <grant@torque.net>
+                          Under the terms of the GNU General Public License.
+
+        This is the high-level driver for parallel port ATAPI tape
+        drives based on chips supported by the paride module.
+
+	The driver implements both rewinding and non-rewinding
+	devices, filemarks, and the rewind ioctl.  It allocates
+	a small internal "bounce buffer" for each open device, but
+        otherwise expects buffering and blocking to be done at the
+        user level.  As with most block-structured tapes, short
+	writes are padded to full tape blocks, so reading back a file
+        may return more data than was actually written.
+
+        By default, the driver will autoprobe for a single parallel
+        port ATAPI tape drive, but if their individual parameters are
+        specified, the driver can handle up to 4 drives.
+
+	The rewinding devices are named /dev/pt0, /dev/pt1, ...
+	while the non-rewinding devices are /dev/npt0, /dev/npt1, etc.
+
+        The behaviour of the pt driver can be altered by setting
+        some parameters from the insmod command line.  The following
+        parameters are adjustable:
+
+            drive0      These four arguments can be arrays of       
+            drive1      1-6 integers as follows:
+            drive2
+            drive3      <prt>,<pro>,<uni>,<mod>,<slv>,<dly>
+
+                        Where,
+
+                <prt>   is the base of the parallel port address for
+                        the corresponding drive.  (required)
+
+                <pro>   is the protocol number for the adapter that
+                        supports this drive.  These numbers are
+                        logged by 'paride' when the protocol modules
+                        are initialised.  (0 if not given)
+
+                <uni>   for those adapters that support chained
+                        devices, this is the unit selector for the
+                        chain of devices on the given port.  It should
+                        be zero for devices that don't support chaining.
+                        (0 if not given)
+
+                <mod>   this can be -1 to choose the best mode, or one
+                        of the mode numbers supported by the adapter.
+                        (-1 if not given)
+
+                <slv>   ATAPI devices can be jumpered to master or slave.
+                        Set this to 0 to choose the master drive, 1 to
+                        choose the slave, -1 (the default) to choose the
+                        first drive found.
+
+                <dly>   some parallel ports require the driver to 
+                        go more slowly.  -1 sets a default value that
+                        should work with the chosen protocol.  Otherwise,
+                        set this to a small integer, the larger it is
+                        the slower the port i/o.  In some cases, setting
+                        this to zero will speed up the device. (default -1)
+
+	    major	You may use this parameter to override the
+			default major number (96) that this driver
+			will use.  Be sure to change the device
+			name as well.
+
+	    name	This parameter is a character string that
+			contains the name the kernel will use for this
+			device (in /proc output, for instance).
+			(default "pt").
+
+            verbose     This parameter controls the amount of logging
+                        that the driver will do.  Set it to 0 for
+                        normal operation, 1 to see autoprobe progress
+                        messages, or 2 to see additional debugging
+                        output.  (default 0)
+ 
+        If this driver is built into the kernel, you can use 
+        the following command line parameters, with the same values
+        as the corresponding module parameters listed above:
+
+            pt.drive0
+            pt.drive1
+            pt.drive2
+            pt.drive3
+
+        In addition, you can use the parameter pt.disable to disable
+        the driver entirely.
+
+*/
+
+/*   Changes:
+
+	1.01	GRG 1998.05.06	Round up transfer size, fix ready_wait,
+			        loosed interpretation of ATAPI standard
+				for clearing error status.
+				Eliminate sti();
+	1.02    GRG 1998.06.16  Eliminate an Ugh.
+	1.03    GRG 1998.08.15  Adjusted PT_TMO, use HZ in loop timing,
+				extra debugging
+	1.04    GRG 1998.09.24  Repair minor coding error, added jumbo support
+	
+*/
+
+#define PT_VERSION      "1.04"
+#define PT_MAJOR	96
+#define PT_NAME		"pt"
+#define PT_UNITS	4
+
+#include <linux/types.h>
+
+/* Here are things one can override from the insmod command.
+   Most are autoprobed by paride unless set here.  Verbose is on
+   by default.
+
+*/
+
+static int verbose = 0;
+static int major = PT_MAJOR;
+static char *name = PT_NAME;
+static int disable = 0;
+
+static int drive0[6] = { 0, 0, 0, -1, -1, -1 };
+static int drive1[6] = { 0, 0, 0, -1, -1, -1 };
+static int drive2[6] = { 0, 0, 0, -1, -1, -1 };
+static int drive3[6] = { 0, 0, 0, -1, -1, -1 };
+
+static int (*drives[4])[6] = {&drive0, &drive1, &drive2, &drive3};
+
+#define D_PRT   0
+#define D_PRO   1
+#define D_UNI   2
+#define D_MOD   3
+#define D_SLV   4
+#define D_DLY   5
+
+#define DU              (*drives[unit])
+
+/* end of parameters */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+#include <linux/delay.h>
+#include <linux/slab.h>
+#include <linux/mtio.h>
+#include <linux/device.h>
+#include <linux/sched.h>	/* current, TASK_*, schedule_timeout() */
+#include <linux/mutex.h>
+
+#include <linux/uaccess.h>
+
+module_param(verbose, int, 0);
+module_param(major, int, 0);
+module_param(name, charp, 0);
+module_param_array(drive0, int, NULL, 0);
+module_param_array(drive1, int, NULL, 0);
+module_param_array(drive2, int, NULL, 0);
+module_param_array(drive3, int, NULL, 0);
+
+#include "paride.h"
+
+#define PT_MAX_RETRIES  5
+#define PT_TMO          3000	/* interrupt timeout in jiffies */
+#define PT_SPIN_DEL     50	/* spin delay in micro-seconds  */
+#define PT_RESET_TMO    30	/* 30 seconds */
+#define PT_READY_TMO	60	/* 60 seconds */
+#define PT_REWIND_TMO	1200	/* 20 minutes */
+
+#define PT_SPIN         ((1000000/(HZ*PT_SPIN_DEL))*PT_TMO)
+
+#define STAT_ERR        0x00001
+#define STAT_INDEX      0x00002
+#define STAT_ECC        0x00004
+#define STAT_DRQ        0x00008
+#define STAT_SEEK       0x00010
+#define STAT_WRERR      0x00020
+#define STAT_READY      0x00040
+#define STAT_BUSY       0x00080
+#define STAT_SENSE	0x1f000
+
+#define ATAPI_TEST_READY	0x00
+#define ATAPI_REWIND		0x01
+#define ATAPI_REQ_SENSE		0x03
+#define ATAPI_READ_6		0x08
+#define ATAPI_WRITE_6		0x0a
+#define ATAPI_WFM		0x10
+#define ATAPI_IDENTIFY		0x12
+#define ATAPI_MODE_SENSE	0x1a
+#define ATAPI_LOG_SENSE		0x4d
+
+static DEFINE_MUTEX(pt_mutex);
+static int pt_open(struct inode *inode, struct file *file);
+static long pt_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+static int pt_release(struct inode *inode, struct file *file);
+static ssize_t pt_read(struct file *filp, char __user *buf,
+		       size_t count, loff_t * ppos);
+static ssize_t pt_write(struct file *filp, const char __user *buf,
+			size_t count, loff_t * ppos);
+static int pt_detect(void);
+
+/* bits in tape->flags */
+
+#define PT_MEDIA	1
+#define PT_WRITE_OK	2
+#define PT_REWIND	4
+#define PT_WRITING      8
+#define PT_READING     16
+#define PT_EOF	       32
+
+#define PT_NAMELEN      8
+#define PT_BUFSIZE  16384
+
+struct pt_unit {
+	struct pi_adapter pia;	/* interface to paride layer */
+	struct pi_adapter *pi;
+	int flags;		/* various state flags */
+	int last_sense;		/* result of last request sense */
+	int drive;		/* drive */
+	atomic_t available;	/* 1 if access is available 0 otherwise */
+	int bs;			/* block size */
+	int capacity;		/* Size of tape in KB */
+	int present;		/* device present ? */
+	char *bufptr;
+	char name[PT_NAMELEN];	/* pf0, pf1, ... */
+};
+
+static int pt_identify(struct pt_unit *tape);
+
+static struct pt_unit pt[PT_UNITS];
+
+static char pt_scratch[512];	/* scratch block buffer */
+static void *par_drv;		/* reference of parport driver */
+
+/* kernel glue structures */
+
+static const struct file_operations pt_fops = {
+	.owner = THIS_MODULE,
+	.read = pt_read,
+	.write = pt_write,
+	.unlocked_ioctl = pt_ioctl,
+	.open = pt_open,
+	.release = pt_release,
+	.llseek = noop_llseek,
+};
+
+/* sysfs class support */
+static struct class *pt_class;
+
+static inline int status_reg(struct pi_adapter *pi)
+{
+	return pi_read_regr(pi, 1, 6);
+}
+
+static inline int read_reg(struct pi_adapter *pi, int reg)
+{
+	return pi_read_regr(pi, 0, reg);
+}
+
+static inline void write_reg(struct pi_adapter *pi, int reg, int val)
+{
+	pi_write_regr(pi, 0, reg, val);
+}
+
+static inline u8 DRIVE(struct pt_unit *tape)
+{
+	return 0xa0+0x10*tape->drive;
+}
+
+static int pt_wait(struct pt_unit *tape, int go, int stop, char *fun, char *msg)
+{
+	int j, r, e, s, p;
+	struct pi_adapter *pi = tape->pi;
+
+	j = 0;
+	while ((((r = status_reg(pi)) & go) || (stop && (!(r & stop))))
+	       && (j++ < PT_SPIN))
+		udelay(PT_SPIN_DEL);
+
+	if ((r & (STAT_ERR & stop)) || (j > PT_SPIN)) {
+		s = read_reg(pi, 7);
+		e = read_reg(pi, 1);
+		p = read_reg(pi, 2);
+		if (j > PT_SPIN)
+			e |= 0x100;
+		if (fun)
+			printk("%s: %s %s: alt=0x%x stat=0x%x err=0x%x"
+			       " loop=%d phase=%d\n",
+			       tape->name, fun, msg, r, s, e, j, p);
+		return (e << 8) + s;
+	}
+	return 0;
+}
+
+static int pt_command(struct pt_unit *tape, char *cmd, int dlen, char *fun)
+{
+	struct pi_adapter *pi = tape->pi;
+	pi_connect(pi);
+
+	write_reg(pi, 6, DRIVE(tape));
+
+	if (pt_wait(tape, STAT_BUSY | STAT_DRQ, 0, fun, "before command")) {
+		pi_disconnect(pi);
+		return -1;
+	}
+
+	write_reg(pi, 4, dlen % 256);
+	write_reg(pi, 5, dlen / 256);
+	write_reg(pi, 7, 0xa0);	/* ATAPI packet command */
+
+	if (pt_wait(tape, STAT_BUSY, STAT_DRQ, fun, "command DRQ")) {
+		pi_disconnect(pi);
+		return -1;
+	}
+
+	if (read_reg(pi, 2) != 1) {
+		printk("%s: %s: command phase error\n", tape->name, fun);
+		pi_disconnect(pi);
+		return -1;
+	}
+
+	pi_write_block(pi, cmd, 12);
+
+	return 0;
+}
+
+static int pt_completion(struct pt_unit *tape, char *buf, char *fun)
+{
+	struct pi_adapter *pi = tape->pi;
+	int r, s, n, p;
+
+	r = pt_wait(tape, STAT_BUSY, STAT_DRQ | STAT_READY | STAT_ERR,
+		    fun, "completion");
+
+	if (read_reg(pi, 7) & STAT_DRQ) {
+		n = (((read_reg(pi, 4) + 256 * read_reg(pi, 5)) +
+		      3) & 0xfffc);
+		p = read_reg(pi, 2) & 3;
+		if (p == 0)
+			pi_write_block(pi, buf, n);
+		if (p == 2)
+			pi_read_block(pi, buf, n);
+	}
+
+	s = pt_wait(tape, STAT_BUSY, STAT_READY | STAT_ERR, fun, "data done");
+
+	pi_disconnect(pi);
+
+	return (r ? r : s);
+}
+
+static void pt_req_sense(struct pt_unit *tape, int quiet)
+{
+	char rs_cmd[12] = { ATAPI_REQ_SENSE, 0, 0, 0, 16, 0, 0, 0, 0, 0, 0, 0 };
+	char buf[16];
+	int r;
+
+	r = pt_command(tape, rs_cmd, 16, "Request sense");
+	mdelay(1);
+	if (!r)
+		pt_completion(tape, buf, "Request sense");
+
+	tape->last_sense = -1;
+	if (!r) {
+		if (!quiet)
+			printk("%s: Sense key: %x, ASC: %x, ASQ: %x\n",
+			       tape->name, buf[2] & 0xf, buf[12], buf[13]);
+		tape->last_sense = (buf[2] & 0xf) | ((buf[12] & 0xff) << 8)
+		    | ((buf[13] & 0xff) << 16);
+	}
+}
+
+static int pt_atapi(struct pt_unit *tape, char *cmd, int dlen, char *buf, char *fun)
+{
+	int r;
+
+	r = pt_command(tape, cmd, dlen, fun);
+	mdelay(1);
+	if (!r)
+		r = pt_completion(tape, buf, fun);
+	if (r)
+		pt_req_sense(tape, !fun);
+
+	return r;
+}
+
+static void pt_sleep(int cs)
+{
+	schedule_timeout_interruptible(cs);
+}
+
+static int pt_poll_dsc(struct pt_unit *tape, int pause, int tmo, char *msg)
+{
+	struct pi_adapter *pi = tape->pi;
+	int k, e, s;
+
+	k = 0;
+	e = 0;
+	s = 0;
+	while (k < tmo) {
+		pt_sleep(pause);
+		k++;
+		pi_connect(pi);
+		write_reg(pi, 6, DRIVE(tape));
+		s = read_reg(pi, 7);
+		e = read_reg(pi, 1);
+		pi_disconnect(pi);
+		if (s & (STAT_ERR | STAT_SEEK))
+			break;
+	}
+	if ((k >= tmo) || (s & STAT_ERR)) {
+		if (k >= tmo)
+			printk("%s: %s DSC timeout\n", tape->name, msg);
+		else
+			printk("%s: %s stat=0x%x err=0x%x\n", tape->name, msg, s,
+			       e);
+		pt_req_sense(tape, 0);
+		return 0;
+	}
+	return 1;
+}
+
+static void pt_media_access_cmd(struct pt_unit *tape, int tmo, char *cmd, char *fun)
+{
+	if (pt_command(tape, cmd, 0, fun)) {
+		pt_req_sense(tape, 0);
+		return;
+	}
+	pi_disconnect(tape->pi);
+	pt_poll_dsc(tape, HZ, tmo, fun);
+}
+
+static void pt_rewind(struct pt_unit *tape)
+{
+	char rw_cmd[12] = { ATAPI_REWIND, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+
+	pt_media_access_cmd(tape, PT_REWIND_TMO, rw_cmd, "rewind");
+}
+
+static void pt_write_fm(struct pt_unit *tape)
+{
+	char wm_cmd[12] = { ATAPI_WFM, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0 };
+
+	pt_media_access_cmd(tape, PT_TMO, wm_cmd, "write filemark");
+}
+
+#define DBMSG(msg)      ((verbose>1)?(msg):NULL)
+
+static int pt_reset(struct pt_unit *tape)
+{
+	struct pi_adapter *pi = tape->pi;
+	int i, k, flg;
+	int expect[5] = { 1, 1, 1, 0x14, 0xeb };
+
+	pi_connect(pi);
+	write_reg(pi, 6, DRIVE(tape));
+	write_reg(pi, 7, 8);
+
+	pt_sleep(20 * HZ / 1000);
+
+	k = 0;
+	while ((k++ < PT_RESET_TMO) && (status_reg(pi) & STAT_BUSY))
+		pt_sleep(HZ / 10);
+
+	flg = 1;
+	for (i = 0; i < 5; i++)
+		flg &= (read_reg(pi, i + 1) == expect[i]);
+
+	if (verbose) {
+		printk("%s: Reset (%d) signature = ", tape->name, k);
+		for (i = 0; i < 5; i++)
+			printk("%3x", read_reg(pi, i + 1));
+		if (!flg)
+			printk(" (incorrect)");
+		printk("\n");
+	}
+
+	pi_disconnect(pi);
+	return flg - 1;
+}
+
+static int pt_ready_wait(struct pt_unit *tape, int tmo)
+{
+	char tr_cmd[12] = { ATAPI_TEST_READY, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+	int k, p;
+
+	k = 0;
+	while (k < tmo) {
+		tape->last_sense = 0;
+		pt_atapi(tape, tr_cmd, 0, NULL, DBMSG("test unit ready"));
+		p = tape->last_sense;
+		if (!p)
+			return 0;
+		if (!(((p & 0xffff) == 0x0402) || ((p & 0xff) == 6)))
+			return p;
+		k++;
+		pt_sleep(HZ);
+	}
+	return 0x000020;	/* timeout */
+}
+
+static void xs(char *buf, char *targ, int offs, int len)
+{
+	int j, k, l;
+
+	j = 0;
+	l = 0;
+	for (k = 0; k < len; k++)
+		if ((buf[k + offs] != 0x20) || (buf[k + offs] != l))
+			l = targ[j++] = buf[k + offs];
+	if (l == 0x20)
+		j--;
+	targ[j] = 0;
+}
+
+static int xn(char *buf, int offs, int size)
+{
+	int v, k;
+
+	v = 0;
+	for (k = 0; k < size; k++)
+		v = v * 256 + (buf[k + offs] & 0xff);
+	return v;
+}
+
+static int pt_identify(struct pt_unit *tape)
+{
+	int dt, s;
+	char *ms[2] = { "master", "slave" };
+	char mf[10], id[18];
+	char id_cmd[12] = { ATAPI_IDENTIFY, 0, 0, 0, 36, 0, 0, 0, 0, 0, 0, 0 };
+	char ms_cmd[12] =
+	    { ATAPI_MODE_SENSE, 0, 0x2a, 0, 36, 0, 0, 0, 0, 0, 0, 0 };
+	char ls_cmd[12] =
+	    { ATAPI_LOG_SENSE, 0, 0x71, 0, 0, 0, 0, 0, 36, 0, 0, 0 };
+	char buf[36];
+
+	s = pt_atapi(tape, id_cmd, 36, buf, "identify");
+	if (s)
+		return -1;
+
+	dt = buf[0] & 0x1f;
+	if (dt != 1) {
+		if (verbose)
+			printk("%s: Drive %d, unsupported type %d\n",
+			       tape->name, tape->drive, dt);
+		return -1;
+	}
+
+	xs(buf, mf, 8, 8);
+	xs(buf, id, 16, 16);
+
+	tape->flags = 0;
+	tape->capacity = 0;
+	tape->bs = 0;
+
+	if (!pt_ready_wait(tape, PT_READY_TMO))
+		tape->flags |= PT_MEDIA;
+
+	if (!pt_atapi(tape, ms_cmd, 36, buf, "mode sense")) {
+		if (!(buf[2] & 0x80))
+			tape->flags |= PT_WRITE_OK;
+		tape->bs = xn(buf, 10, 2);
+	}
+
+	if (!pt_atapi(tape, ls_cmd, 36, buf, "log sense"))
+		tape->capacity = xn(buf, 24, 4);
+
+	printk("%s: %s %s, %s", tape->name, mf, id, ms[tape->drive]);
+	if (!(tape->flags & PT_MEDIA))
+		printk(", no media\n");
+	else {
+		if (!(tape->flags & PT_WRITE_OK))
+			printk(", RO");
+		printk(", blocksize %d, %d MB\n", tape->bs, tape->capacity / 1024);
+	}
+
+	return 0;
+}
+
+
+/*
+ * returns  0, with id set if drive is detected
+ *	   -1, if drive detection failed
+ */
+static int pt_probe(struct pt_unit *tape)
+{
+	if (tape->drive == -1) {
+		for (tape->drive = 0; tape->drive <= 1; tape->drive++)
+			if (!pt_reset(tape))
+				return pt_identify(tape);
+	} else {
+		if (!pt_reset(tape))
+			return pt_identify(tape);
+	}
+	return -1;
+}
+
+static int pt_detect(void)
+{
+	struct pt_unit *tape;
+	int specified = 0, found = 0;
+	int unit;
+
+	printk("%s: %s version %s, major %d\n", name, name, PT_VERSION, major);
+
+	par_drv = pi_register_driver(name);
+	if (!par_drv) {
+		pr_err("failed to register %s driver\n", name);
+		return -1;
+	}
+
+	specified = 0;
+	for (unit = 0; unit < PT_UNITS; unit++) {
+		struct pt_unit *tape = &pt[unit];
+		tape->pi = &tape->pia;
+		atomic_set(&tape->available, 1);
+		tape->flags = 0;
+		tape->last_sense = 0;
+		tape->present = 0;
+		tape->bufptr = NULL;
+		tape->drive = DU[D_SLV];
+		snprintf(tape->name, PT_NAMELEN, "%s%d", name, unit);
+		if (!DU[D_PRT])
+			continue;
+		specified++;
+		if (pi_init(tape->pi, 0, DU[D_PRT], DU[D_MOD], DU[D_UNI],
+		     DU[D_PRO], DU[D_DLY], pt_scratch, PI_PT,
+		     verbose, tape->name)) {
+			if (!pt_probe(tape)) {
+				tape->present = 1;
+				found++;
+			} else
+				pi_release(tape->pi);
+		}
+	}
+	if (specified == 0) {
+		tape = pt;
+		if (pi_init(tape->pi, 1, -1, -1, -1, -1, -1, pt_scratch,
+			    PI_PT, verbose, tape->name)) {
+			if (!pt_probe(tape)) {
+				tape->present = 1;
+				found++;
+			} else
+				pi_release(tape->pi);
+		}
+
+	}
+	if (found)
+		return 0;
+
+	pi_unregister_driver(par_drv);
+	printk("%s: No ATAPI tape drive detected\n", name);
+	return -1;
+}
+
+static int pt_open(struct inode *inode, struct file *file)
+{
+	int unit = iminor(inode) & 0x7F;
+	struct pt_unit *tape = pt + unit;
+	int err;
+
+	mutex_lock(&pt_mutex);
+	if (unit >= PT_UNITS || (!tape->present)) {
+		mutex_unlock(&pt_mutex);
+		return -ENODEV;
+	}
+
+	err = -EBUSY;
+	if (!atomic_dec_and_test(&tape->available))
+		goto out;
+
+	pt_identify(tape);
+
+	err = -ENODEV;
+	if (!(tape->flags & PT_MEDIA))
+		goto out;
+
+	err = -EROFS;
+	if ((!(tape->flags & PT_WRITE_OK)) && (file->f_mode & FMODE_WRITE))
+		goto out;
+
+	if (!(iminor(inode) & 128))
+		tape->flags |= PT_REWIND;
+
+	err = -ENOMEM;
+	tape->bufptr = kmalloc(PT_BUFSIZE, GFP_KERNEL);
+	if (tape->bufptr == NULL) {
+		printk("%s: buffer allocation failed\n", tape->name);
+		goto out;
+	}
+
+	file->private_data = tape;
+	mutex_unlock(&pt_mutex);
+	return 0;
+
+out:
+	atomic_inc(&tape->available);
+	mutex_unlock(&pt_mutex);
+	return err;
+}
+
+static long pt_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	struct pt_unit *tape = file->private_data;
+	struct mtop __user *p = (void __user *)arg;
+	struct mtop mtop;
+
+	switch (cmd) {
+	case MTIOCTOP:
+		if (copy_from_user(&mtop, p, sizeof(struct mtop)))
+			return -EFAULT;
+
+		switch (mtop.mt_op) {
+
+		case MTREW:
+			mutex_lock(&pt_mutex);
+			pt_rewind(tape);
+			mutex_unlock(&pt_mutex);
+			return 0;
+
+		case MTWEOF:
+			mutex_lock(&pt_mutex);
+			pt_write_fm(tape);
+			mutex_unlock(&pt_mutex);
+			return 0;
+
+		default:
+			/* FIXME: rate limit ?? */
+			printk(KERN_DEBUG "%s: Unimplemented mt_op %d\n", tape->name,
+			       mtop.mt_op);
+			return -EINVAL;
+		}
+
+	default:
+		return -ENOTTY;
+	}
+}
+
+static int
+pt_release(struct inode *inode, struct file *file)
+{
+	struct pt_unit *tape = file->private_data;
+
+	if (atomic_read(&tape->available) > 1)
+		return -EINVAL;
+
+	if (tape->flags & PT_WRITING)
+		pt_write_fm(tape);
+
+	if (tape->flags & PT_REWIND)
+		pt_rewind(tape);
+
+	kfree(tape->bufptr);
+	tape->bufptr = NULL;
+
+	atomic_inc(&tape->available);
+
+	return 0;
+
+}
+
+static ssize_t pt_read(struct file *filp, char __user *buf, size_t count, loff_t * ppos)
+{
+	struct pt_unit *tape = filp->private_data;
+	struct pi_adapter *pi = tape->pi;
+	char rd_cmd[12] = { ATAPI_READ_6, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+	int k, n, r, p, s, t, b;
+
+	if (!(tape->flags & (PT_READING | PT_WRITING))) {
+		tape->flags |= PT_READING;
+		if (pt_atapi(tape, rd_cmd, 0, NULL, "start read-ahead"))
+			return -EIO;
+	} else if (tape->flags & PT_WRITING)
+		return -EIO;
+
+	if (tape->flags & PT_EOF)
+		return 0;
+
+	t = 0;
+
+	while (count > 0) {
+
+		if (!pt_poll_dsc(tape, HZ / 100, PT_TMO, "read"))
+			return -EIO;
+
+		n = count;
+		if (n > 32768)
+			n = 32768;	/* max per command */
+		b = (n - 1 + tape->bs) / tape->bs;
+		n = b * tape->bs;	/* rounded up to even block */
+
+		rd_cmd[4] = b;
+
+		r = pt_command(tape, rd_cmd, n, "read");
+
+		mdelay(1);
+
+		if (r) {
+			pt_req_sense(tape, 0);
+			return -EIO;
+		}
+
+		while (1) {
+
+			r = pt_wait(tape, STAT_BUSY,
+				    STAT_DRQ | STAT_ERR | STAT_READY,
+				    DBMSG("read DRQ"), "");
+
+			if (r & STAT_SENSE) {
+				pi_disconnect(pi);
+				pt_req_sense(tape, 0);
+				return -EIO;
+			}
+
+			if (r)
+				tape->flags |= PT_EOF;
+
+			s = read_reg(pi, 7);
+
+			if (!(s & STAT_DRQ))
+				break;
+
+			n = (read_reg(pi, 4) + 256 * read_reg(pi, 5));
+			p = (read_reg(pi, 2) & 3);
+			if (p != 2) {
+				pi_disconnect(pi);
+				printk("%s: Phase error on read: %d\n", tape->name,
+				       p);
+				return -EIO;
+			}
+
+			while (n > 0) {
+				k = n;
+				if (k > PT_BUFSIZE)
+					k = PT_BUFSIZE;
+				pi_read_block(pi, tape->bufptr, k);
+				n -= k;
+				b = k;
+				if (b > count)
+					b = count;
+				if (copy_to_user(buf + t, tape->bufptr, b)) {
+					pi_disconnect(pi);
+					return -EFAULT;
+				}
+				t += b;
+				count -= b;
+			}
+
+		}
+		pi_disconnect(pi);
+		if (tape->flags & PT_EOF)
+			break;
+	}
+
+	return t;
+
+}
+
+static ssize_t pt_write(struct file *filp, const char __user *buf, size_t count, loff_t * ppos)
+{
+	struct pt_unit *tape = filp->private_data;
+	struct pi_adapter *pi = tape->pi;
+	char wr_cmd[12] = { ATAPI_WRITE_6, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };
+	int k, n, r, p, s, t, b;
+
+	if (!(tape->flags & PT_WRITE_OK))
+		return -EROFS;
+
+	if (!(tape->flags & (PT_READING | PT_WRITING))) {
+		tape->flags |= PT_WRITING;
+		if (pt_atapi
+		    (tape, wr_cmd, 0, NULL, "start buffer-available mode"))
+			return -EIO;
+	} else if (tape->flags & PT_READING)
+		return -EIO;
+
+	if (tape->flags & PT_EOF)
+		return -ENOSPC;
+
+	t = 0;
+
+	while (count > 0) {
+
+		if (!pt_poll_dsc(tape, HZ / 100, PT_TMO, "write"))
+			return -EIO;
+
+		n = count;
+		if (n > 32768)
+			n = 32768;	/* max per command */
+		b = (n - 1 + tape->bs) / tape->bs;
+		n = b * tape->bs;	/* rounded up to even block */
+
+		wr_cmd[4] = b;
+
+		r = pt_command(tape, wr_cmd, n, "write");
+
+		mdelay(1);
+
+		if (r) {	/* error delivering command only */
+			pt_req_sense(tape, 0);
+			return -EIO;
+		}
+
+		while (1) {
+
+			r = pt_wait(tape, STAT_BUSY,
+				    STAT_DRQ | STAT_ERR | STAT_READY,
+				    DBMSG("write DRQ"), NULL);
+
+			if (r & STAT_SENSE) {
+				pi_disconnect(pi);
+				pt_req_sense(tape, 0);
+				return -EIO;
+			}
+
+			if (r)
+				tape->flags |= PT_EOF;
+
+			s = read_reg(pi, 7);
+
+			if (!(s & STAT_DRQ))
+				break;
+
+			n = (read_reg(pi, 4) + 256 * read_reg(pi, 5));
+			p = (read_reg(pi, 2) & 3);
+			if (p != 0) {
+				pi_disconnect(pi);
+				printk("%s: Phase error on write: %d \n",
+				       tape->name, p);
+				return -EIO;
+			}
+
+			while (n > 0) {
+				k = n;
+				if (k > PT_BUFSIZE)
+					k = PT_BUFSIZE;
+				b = k;
+				if (b > count)
+					b = count;
+				if (copy_from_user(tape->bufptr, buf + t, b)) {
+					pi_disconnect(pi);
+					return -EFAULT;
+				}
+				pi_write_block(pi, tape->bufptr, k);
+				t += b;
+				count -= b;
+				n -= k;
+			}
+
+		}
+		pi_disconnect(pi);
+		if (tape->flags & PT_EOF)
+			break;
+	}
+
+	return t;
+}
+
+static int __init pt_init(void)
+{
+	int unit;
+	int err;
+
+	if (disable) {
+		err = -EINVAL;
+		goto out;
+	}
+
+	if (pt_detect()) {
+		err = -ENODEV;
+		goto out;
+	}
+
+	err = register_chrdev(major, name, &pt_fops);
+	if (err < 0) {
+		printk("pt_init: unable to get major number %d\n", major);
+		for (unit = 0; unit < PT_UNITS; unit++)
+			if (pt[unit].present)
+				pi_release(pt[unit].pi);
+		goto out;
+	}
+	major = err;
+	pt_class = class_create(THIS_MODULE, "pt");
+	if (IS_ERR(pt_class)) {
+		err = PTR_ERR(pt_class);
+		goto out_chrdev;
+	}
+
+	for (unit = 0; unit < PT_UNITS; unit++)
+		if (pt[unit].present) {
+			device_create(pt_class, NULL, MKDEV(major, unit), NULL,
+				      "pt%d", unit);
+			device_create(pt_class, NULL, MKDEV(major, unit + 128),
+				      NULL, "pt%dn", unit);
+		}
+	goto out;
+
+out_chrdev:
+	unregister_chrdev(major, "pt");
+out:
+	return err;
+}
+
+static void __exit pt_exit(void)
+{
+	int unit;
+	for (unit = 0; unit < PT_UNITS; unit++)
+		if (pt[unit].present) {
+			device_destroy(pt_class, MKDEV(major, unit));
+			device_destroy(pt_class, MKDEV(major, unit + 128));
+		}
+	class_destroy(pt_class);
+	unregister_chrdev(major, name);
+	for (unit = 0; unit < PT_UNITS; unit++)
+		if (pt[unit].present)
+			pi_release(pt[unit].pi);
+}
+
+MODULE_LICENSE("GPL");
+module_init(pt_init)
+module_exit(pt_exit)
diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c
new file mode 100644
index 000000000..2f1a92509
--- /dev/null
+++ b/drivers/block/pktcdvd.c
@@ -0,0 +1,2946 @@
+/*
+ * Copyright (C) 2000 Jens Axboe <axboe@suse.de>
+ * Copyright (C) 2001-2004 Peter Osterlund <petero2@telia.com>
+ * Copyright (C) 2006 Thomas Maier <balagi@justmail.de>
+ *
+ * May be copied or modified under the terms of the GNU General Public
+ * License.  See linux/COPYING for more information.
+ *
+ * Packet writing layer for ATAPI and SCSI CD-RW, DVD+RW, DVD-RW and
+ * DVD-RAM devices.
+ *
+ * Theory of operation:
+ *
+ * At the lowest level, there is the standard driver for the CD/DVD device,
+ * such as drivers/scsi/sr.c. This driver can handle read and write requests,
+ * but it doesn't know anything about the special restrictions that apply to
+ * packet writing. One restriction is that write requests must be aligned to
+ * packet boundaries on the physical media, and the size of a write request
+ * must be equal to the packet size. Another restriction is that a
+ * GPCMD_FLUSH_CACHE command has to be issued to the drive before a read
+ * command, if the previous command was a write.
+ *
+ * The purpose of the packet writing driver is to hide these restrictions from
+ * higher layers, such as file systems, and present a block device that can be
+ * randomly read and written using 2kB-sized blocks.
+ *
+ * The lowest layer in the packet writing driver is the packet I/O scheduler.
+ * Its data is defined by the struct packet_iosched and includes two bio
+ * queues with pending read and write requests. These queues are processed
+ * by the pkt_iosched_process_queue() function. The write requests in this
+ * queue are already properly aligned and sized. This layer is responsible for
+ * issuing the flush cache commands and scheduling the I/O in a good order.
+ *
+ * The next layer transforms unaligned write requests to aligned writes. This
+ * transformation requires reading missing pieces of data from the underlying
+ * block device, assembling the pieces to full packets and queuing them to the
+ * packet I/O scheduler.
+ *
+ * At the top layer there is a custom ->submit_bio function that forwards
+ * read requests directly to the iosched queue and puts write requests in the
+ * unaligned write queue. A kernel thread performs the necessary read
+ * gathering to convert the unaligned writes to aligned writes and then feeds
+ * them to the packet I/O scheduler.
+ *
+ *************************************************************************/
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/pktcdvd.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/compat.h>
+#include <linux/kthread.h>
+#include <linux/errno.h>
+#include <linux/spinlock.h>
+#include <linux/file.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/miscdevice.h>
+#include <linux/freezer.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/backing-dev.h>
+#include <scsi/scsi_cmnd.h>
+#include <scsi/scsi_ioctl.h>
+#include <scsi/scsi.h>
+#include <linux/debugfs.h>
+#include <linux/device.h>
+#include <linux/nospec.h>
+#include <linux/uaccess.h>
+
+#define DRIVER_NAME	"pktcdvd"
+
+#define pkt_err(pd, fmt, ...)						\
+	pr_err("%s: " fmt, pd->name, ##__VA_ARGS__)
+#define pkt_notice(pd, fmt, ...)					\
+	pr_notice("%s: " fmt, pd->name, ##__VA_ARGS__)
+#define pkt_info(pd, fmt, ...)						\
+	pr_info("%s: " fmt, pd->name, ##__VA_ARGS__)
+
+#define pkt_dbg(level, pd, fmt, ...)					\
+do {									\
+	if (level == 2 && PACKET_DEBUG >= 2)				\
+		pr_notice("%s: %s():" fmt,				\
+			  pd->name, __func__, ##__VA_ARGS__);		\
+	else if (level == 1 && PACKET_DEBUG >= 1)			\
+		pr_notice("%s: " fmt, pd->name, ##__VA_ARGS__);		\
+} while (0)
+
+#define MAX_SPEED 0xffff
+
+static DEFINE_MUTEX(pktcdvd_mutex);
+static struct pktcdvd_device *pkt_devs[MAX_WRITERS];
+static struct proc_dir_entry *pkt_proc;
+static int pktdev_major;
+static int write_congestion_on  = PKT_WRITE_CONGESTION_ON;
+static int write_congestion_off = PKT_WRITE_CONGESTION_OFF;
+static struct mutex ctl_mutex;	/* Serialize open/close/setup/teardown */
+static mempool_t psd_pool;
+static struct bio_set pkt_bio_set;
+
+static struct class	*class_pktcdvd = NULL;    /* /sys/class/pktcdvd */
+static struct dentry	*pkt_debugfs_root = NULL; /* /sys/kernel/debug/pktcdvd */
+
+/* forward declaration */
+static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev);
+static int pkt_remove_dev(dev_t pkt_dev);
+static int pkt_seq_show(struct seq_file *m, void *p);
+
+static sector_t get_zone(sector_t sector, struct pktcdvd_device *pd)
+{
+	return (sector + pd->offset) & ~(sector_t)(pd->settings.size - 1);
+}
+
+/**********************************************************
+ * sysfs interface for pktcdvd
+ * by (C) 2006  Thomas Maier <balagi@justmail.de>
+ 
+  /sys/class/pktcdvd/pktcdvd[0-7]/
+                     stat/reset
+                     stat/packets_started
+                     stat/packets_finished
+                     stat/kb_written
+                     stat/kb_read
+                     stat/kb_read_gather
+                     write_queue/size
+                     write_queue/congestion_off
+                     write_queue/congestion_on
+ **********************************************************/
+
+static ssize_t packets_started_show(struct device *dev,
+				    struct device_attribute *attr, char *buf)
+{
+	struct pktcdvd_device *pd = dev_get_drvdata(dev);
+
+	return sysfs_emit(buf, "%lu\n", pd->stats.pkt_started);
+}
+static DEVICE_ATTR_RO(packets_started);
+
+static ssize_t packets_finished_show(struct device *dev,
+				     struct device_attribute *attr, char *buf)
+{
+	struct pktcdvd_device *pd = dev_get_drvdata(dev);
+
+	return sysfs_emit(buf, "%lu\n", pd->stats.pkt_ended);
+}
+static DEVICE_ATTR_RO(packets_finished);
+
+static ssize_t kb_written_show(struct device *dev,
+			       struct device_attribute *attr, char *buf)
+{
+	struct pktcdvd_device *pd = dev_get_drvdata(dev);
+
+	return sysfs_emit(buf, "%lu\n", pd->stats.secs_w >> 1);
+}
+static DEVICE_ATTR_RO(kb_written);
+
+static ssize_t kb_read_show(struct device *dev,
+			    struct device_attribute *attr, char *buf)
+{
+	struct pktcdvd_device *pd = dev_get_drvdata(dev);
+
+	return sysfs_emit(buf, "%lu\n", pd->stats.secs_r >> 1);
+}
+static DEVICE_ATTR_RO(kb_read);
+
+static ssize_t kb_read_gather_show(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	struct pktcdvd_device *pd = dev_get_drvdata(dev);
+
+	return sysfs_emit(buf, "%lu\n", pd->stats.secs_rg >> 1);
+}
+static DEVICE_ATTR_RO(kb_read_gather);
+
+static ssize_t reset_store(struct device *dev, struct device_attribute *attr,
+			   const char *buf, size_t len)
+{
+	struct pktcdvd_device *pd = dev_get_drvdata(dev);
+
+	if (len > 0) {
+		pd->stats.pkt_started = 0;
+		pd->stats.pkt_ended = 0;
+		pd->stats.secs_w = 0;
+		pd->stats.secs_rg = 0;
+		pd->stats.secs_r = 0;
+	}
+	return len;
+}
+static DEVICE_ATTR_WO(reset);
+
+static struct attribute *pkt_stat_attrs[] = {
+	&dev_attr_packets_finished.attr,
+	&dev_attr_packets_started.attr,
+	&dev_attr_kb_read.attr,
+	&dev_attr_kb_written.attr,
+	&dev_attr_kb_read_gather.attr,
+	&dev_attr_reset.attr,
+	NULL,
+};
+
+static const struct attribute_group pkt_stat_group = {
+	.name = "stat",
+	.attrs = pkt_stat_attrs,
+};
+
+static ssize_t size_show(struct device *dev,
+			 struct device_attribute *attr, char *buf)
+{
+	struct pktcdvd_device *pd = dev_get_drvdata(dev);
+	int n;
+
+	spin_lock(&pd->lock);
+	n = sysfs_emit(buf, "%d\n", pd->bio_queue_size);
+	spin_unlock(&pd->lock);
+	return n;
+}
+static DEVICE_ATTR_RO(size);
+
+static void init_write_congestion_marks(int* lo, int* hi)
+{
+	if (*hi > 0) {
+		*hi = max(*hi, 500);
+		*hi = min(*hi, 1000000);
+		if (*lo <= 0)
+			*lo = *hi - 100;
+		else {
+			*lo = min(*lo, *hi - 100);
+			*lo = max(*lo, 100);
+		}
+	} else {
+		*hi = -1;
+		*lo = -1;
+	}
+}
+
+static ssize_t congestion_off_show(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	struct pktcdvd_device *pd = dev_get_drvdata(dev);
+	int n;
+
+	spin_lock(&pd->lock);
+	n = sysfs_emit(buf, "%d\n", pd->write_congestion_off);
+	spin_unlock(&pd->lock);
+	return n;
+}
+
+static ssize_t congestion_off_store(struct device *dev,
+				    struct device_attribute *attr,
+				    const char *buf, size_t len)
+{
+	struct pktcdvd_device *pd = dev_get_drvdata(dev);
+	int val;
+
+	if (sscanf(buf, "%d", &val) == 1) {
+		spin_lock(&pd->lock);
+		pd->write_congestion_off = val;
+		init_write_congestion_marks(&pd->write_congestion_off,
+					&pd->write_congestion_on);
+		spin_unlock(&pd->lock);
+	}
+	return len;
+}
+static DEVICE_ATTR_RW(congestion_off);
+
+static ssize_t congestion_on_show(struct device *dev,
+				  struct device_attribute *attr, char *buf)
+{
+	struct pktcdvd_device *pd = dev_get_drvdata(dev);
+	int n;
+
+	spin_lock(&pd->lock);
+	n = sysfs_emit(buf, "%d\n", pd->write_congestion_on);
+	spin_unlock(&pd->lock);
+	return n;
+}
+
+static ssize_t congestion_on_store(struct device *dev,
+				   struct device_attribute *attr,
+				   const char *buf, size_t len)
+{
+	struct pktcdvd_device *pd = dev_get_drvdata(dev);
+	int val;
+
+	if (sscanf(buf, "%d", &val) == 1) {
+		spin_lock(&pd->lock);
+		pd->write_congestion_on = val;
+		init_write_congestion_marks(&pd->write_congestion_off,
+					&pd->write_congestion_on);
+		spin_unlock(&pd->lock);
+	}
+	return len;
+}
+static DEVICE_ATTR_RW(congestion_on);
+
+static struct attribute *pkt_wq_attrs[] = {
+	&dev_attr_congestion_on.attr,
+	&dev_attr_congestion_off.attr,
+	&dev_attr_size.attr,
+	NULL,
+};
+
+static const struct attribute_group pkt_wq_group = {
+	.name = "write_queue",
+	.attrs = pkt_wq_attrs,
+};
+
+static const struct attribute_group *pkt_groups[] = {
+	&pkt_stat_group,
+	&pkt_wq_group,
+	NULL,
+};
+
+static void pkt_sysfs_dev_new(struct pktcdvd_device *pd)
+{
+	if (class_pktcdvd) {
+		pd->dev = device_create_with_groups(class_pktcdvd, NULL,
+						    MKDEV(0, 0), pd, pkt_groups,
+						    "%s", pd->name);
+		if (IS_ERR(pd->dev))
+			pd->dev = NULL;
+	}
+}
+
+static void pkt_sysfs_dev_remove(struct pktcdvd_device *pd)
+{
+	if (class_pktcdvd)
+		device_unregister(pd->dev);
+}
+
+
+/********************************************************************
+  /sys/class/pktcdvd/
+                     add            map block device
+                     remove         unmap packet dev
+                     device_map     show mappings
+ *******************************************************************/
+
+static void class_pktcdvd_release(struct class *cls)
+{
+	kfree(cls);
+}
+
+static ssize_t device_map_show(struct class *c, struct class_attribute *attr,
+			       char *data)
+{
+	int n = 0;
+	int idx;
+	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+	for (idx = 0; idx < MAX_WRITERS; idx++) {
+		struct pktcdvd_device *pd = pkt_devs[idx];
+		if (!pd)
+			continue;
+		n += sprintf(data+n, "%s %u:%u %u:%u\n",
+			pd->name,
+			MAJOR(pd->pkt_dev), MINOR(pd->pkt_dev),
+			MAJOR(pd->bdev->bd_dev),
+			MINOR(pd->bdev->bd_dev));
+	}
+	mutex_unlock(&ctl_mutex);
+	return n;
+}
+static CLASS_ATTR_RO(device_map);
+
+static ssize_t add_store(struct class *c, struct class_attribute *attr,
+			 const char *buf, size_t count)
+{
+	unsigned int major, minor;
+
+	if (sscanf(buf, "%u:%u", &major, &minor) == 2) {
+		/* pkt_setup_dev() expects caller to hold reference to self */
+		if (!try_module_get(THIS_MODULE))
+			return -ENODEV;
+
+		pkt_setup_dev(MKDEV(major, minor), NULL);
+
+		module_put(THIS_MODULE);
+
+		return count;
+	}
+
+	return -EINVAL;
+}
+static CLASS_ATTR_WO(add);
+
+static ssize_t remove_store(struct class *c, struct class_attribute *attr,
+			    const char *buf, size_t count)
+{
+	unsigned int major, minor;
+	if (sscanf(buf, "%u:%u", &major, &minor) == 2) {
+		pkt_remove_dev(MKDEV(major, minor));
+		return count;
+	}
+	return -EINVAL;
+}
+static CLASS_ATTR_WO(remove);
+
+static struct attribute *class_pktcdvd_attrs[] = {
+	&class_attr_add.attr,
+	&class_attr_remove.attr,
+	&class_attr_device_map.attr,
+	NULL,
+};
+ATTRIBUTE_GROUPS(class_pktcdvd);
+
+static int pkt_sysfs_init(void)
+{
+	int ret = 0;
+
+	/*
+	 * create control files in sysfs
+	 * /sys/class/pktcdvd/...
+	 */
+	class_pktcdvd = kzalloc(sizeof(*class_pktcdvd), GFP_KERNEL);
+	if (!class_pktcdvd)
+		return -ENOMEM;
+	class_pktcdvd->name = DRIVER_NAME;
+	class_pktcdvd->owner = THIS_MODULE;
+	class_pktcdvd->class_release = class_pktcdvd_release;
+	class_pktcdvd->class_groups = class_pktcdvd_groups;
+	ret = class_register(class_pktcdvd);
+	if (ret) {
+		kfree(class_pktcdvd);
+		class_pktcdvd = NULL;
+		pr_err("failed to create class pktcdvd\n");
+		return ret;
+	}
+	return 0;
+}
+
+static void pkt_sysfs_cleanup(void)
+{
+	if (class_pktcdvd)
+		class_destroy(class_pktcdvd);
+	class_pktcdvd = NULL;
+}
+
+/********************************************************************
+  entries in debugfs
+
+  /sys/kernel/debug/pktcdvd[0-7]/
+			info
+
+ *******************************************************************/
+
+static int pkt_debugfs_seq_show(struct seq_file *m, void *p)
+{
+	return pkt_seq_show(m, p);
+}
+
+static int pkt_debugfs_fops_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, pkt_debugfs_seq_show, inode->i_private);
+}
+
+static const struct file_operations debug_fops = {
+	.open		= pkt_debugfs_fops_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+	.owner		= THIS_MODULE,
+};
+
+static void pkt_debugfs_dev_new(struct pktcdvd_device *pd)
+{
+	if (!pkt_debugfs_root)
+		return;
+	pd->dfs_d_root = debugfs_create_dir(pd->name, pkt_debugfs_root);
+	if (!pd->dfs_d_root)
+		return;
+
+	pd->dfs_f_info = debugfs_create_file("info", 0444,
+					     pd->dfs_d_root, pd, &debug_fops);
+}
+
+static void pkt_debugfs_dev_remove(struct pktcdvd_device *pd)
+{
+	if (!pkt_debugfs_root)
+		return;
+	debugfs_remove(pd->dfs_f_info);
+	debugfs_remove(pd->dfs_d_root);
+	pd->dfs_f_info = NULL;
+	pd->dfs_d_root = NULL;
+}
+
+static void pkt_debugfs_init(void)
+{
+	pkt_debugfs_root = debugfs_create_dir(DRIVER_NAME, NULL);
+}
+
+static void pkt_debugfs_cleanup(void)
+{
+	debugfs_remove(pkt_debugfs_root);
+	pkt_debugfs_root = NULL;
+}
+
+/* ----------------------------------------------------------*/
+
+
+static void pkt_bio_finished(struct pktcdvd_device *pd)
+{
+	BUG_ON(atomic_read(&pd->cdrw.pending_bios) <= 0);
+	if (atomic_dec_and_test(&pd->cdrw.pending_bios)) {
+		pkt_dbg(2, pd, "queue empty\n");
+		atomic_set(&pd->iosched.attention, 1);
+		wake_up(&pd->wqueue);
+	}
+}
+
+/*
+ * Allocate a packet_data struct
+ */
+static struct packet_data *pkt_alloc_packet_data(int frames)
+{
+	int i;
+	struct packet_data *pkt;
+
+	pkt = kzalloc(sizeof(struct packet_data), GFP_KERNEL);
+	if (!pkt)
+		goto no_pkt;
+
+	pkt->frames = frames;
+	pkt->w_bio = bio_kmalloc(frames, GFP_KERNEL);
+	if (!pkt->w_bio)
+		goto no_bio;
+
+	for (i = 0; i < frames / FRAMES_PER_PAGE; i++) {
+		pkt->pages[i] = alloc_page(GFP_KERNEL|__GFP_ZERO);
+		if (!pkt->pages[i])
+			goto no_page;
+	}
+
+	spin_lock_init(&pkt->lock);
+	bio_list_init(&pkt->orig_bios);
+
+	for (i = 0; i < frames; i++) {
+		pkt->r_bios[i] = bio_kmalloc(1, GFP_KERNEL);
+		if (!pkt->r_bios[i])
+			goto no_rd_bio;
+	}
+
+	return pkt;
+
+no_rd_bio:
+	for (i = 0; i < frames; i++)
+		kfree(pkt->r_bios[i]);
+no_page:
+	for (i = 0; i < frames / FRAMES_PER_PAGE; i++)
+		if (pkt->pages[i])
+			__free_page(pkt->pages[i]);
+	kfree(pkt->w_bio);
+no_bio:
+	kfree(pkt);
+no_pkt:
+	return NULL;
+}
+
+/*
+ * Free a packet_data struct
+ */
+static void pkt_free_packet_data(struct packet_data *pkt)
+{
+	int i;
+
+	for (i = 0; i < pkt->frames; i++)
+		kfree(pkt->r_bios[i]);
+	for (i = 0; i < pkt->frames / FRAMES_PER_PAGE; i++)
+		__free_page(pkt->pages[i]);
+	kfree(pkt->w_bio);
+	kfree(pkt);
+}
+
+static void pkt_shrink_pktlist(struct pktcdvd_device *pd)
+{
+	struct packet_data *pkt, *next;
+
+	BUG_ON(!list_empty(&pd->cdrw.pkt_active_list));
+
+	list_for_each_entry_safe(pkt, next, &pd->cdrw.pkt_free_list, list) {
+		pkt_free_packet_data(pkt);
+	}
+	INIT_LIST_HEAD(&pd->cdrw.pkt_free_list);
+}
+
+static int pkt_grow_pktlist(struct pktcdvd_device *pd, int nr_packets)
+{
+	struct packet_data *pkt;
+
+	BUG_ON(!list_empty(&pd->cdrw.pkt_free_list));
+
+	while (nr_packets > 0) {
+		pkt = pkt_alloc_packet_data(pd->settings.size >> 2);
+		if (!pkt) {
+			pkt_shrink_pktlist(pd);
+			return 0;
+		}
+		pkt->id = nr_packets;
+		pkt->pd = pd;
+		list_add(&pkt->list, &pd->cdrw.pkt_free_list);
+		nr_packets--;
+	}
+	return 1;
+}
+
+static inline struct pkt_rb_node *pkt_rbtree_next(struct pkt_rb_node *node)
+{
+	struct rb_node *n = rb_next(&node->rb_node);
+	if (!n)
+		return NULL;
+	return rb_entry(n, struct pkt_rb_node, rb_node);
+}
+
+static void pkt_rbtree_erase(struct pktcdvd_device *pd, struct pkt_rb_node *node)
+{
+	rb_erase(&node->rb_node, &pd->bio_queue);
+	mempool_free(node, &pd->rb_pool);
+	pd->bio_queue_size--;
+	BUG_ON(pd->bio_queue_size < 0);
+}
+
+/*
+ * Find the first node in the pd->bio_queue rb tree with a starting sector >= s.
+ */
+static struct pkt_rb_node *pkt_rbtree_find(struct pktcdvd_device *pd, sector_t s)
+{
+	struct rb_node *n = pd->bio_queue.rb_node;
+	struct rb_node *next;
+	struct pkt_rb_node *tmp;
+
+	if (!n) {
+		BUG_ON(pd->bio_queue_size > 0);
+		return NULL;
+	}
+
+	for (;;) {
+		tmp = rb_entry(n, struct pkt_rb_node, rb_node);
+		if (s <= tmp->bio->bi_iter.bi_sector)
+			next = n->rb_left;
+		else
+			next = n->rb_right;
+		if (!next)
+			break;
+		n = next;
+	}
+
+	if (s > tmp->bio->bi_iter.bi_sector) {
+		tmp = pkt_rbtree_next(tmp);
+		if (!tmp)
+			return NULL;
+	}
+	BUG_ON(s > tmp->bio->bi_iter.bi_sector);
+	return tmp;
+}
+
+/*
+ * Insert a node into the pd->bio_queue rb tree.
+ */
+static void pkt_rbtree_insert(struct pktcdvd_device *pd, struct pkt_rb_node *node)
+{
+	struct rb_node **p = &pd->bio_queue.rb_node;
+	struct rb_node *parent = NULL;
+	sector_t s = node->bio->bi_iter.bi_sector;
+	struct pkt_rb_node *tmp;
+
+	while (*p) {
+		parent = *p;
+		tmp = rb_entry(parent, struct pkt_rb_node, rb_node);
+		if (s < tmp->bio->bi_iter.bi_sector)
+			p = &(*p)->rb_left;
+		else
+			p = &(*p)->rb_right;
+	}
+	rb_link_node(&node->rb_node, parent, p);
+	rb_insert_color(&node->rb_node, &pd->bio_queue);
+	pd->bio_queue_size++;
+}
+
+/*
+ * Send a packet_command to the underlying block device and
+ * wait for completion.
+ */
+static int pkt_generic_packet(struct pktcdvd_device *pd, struct packet_command *cgc)
+{
+	struct request_queue *q = bdev_get_queue(pd->bdev);
+	struct scsi_cmnd *scmd;
+	struct request *rq;
+	int ret = 0;
+
+	rq = scsi_alloc_request(q, (cgc->data_direction == CGC_DATA_WRITE) ?
+			     REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
+	if (IS_ERR(rq))
+		return PTR_ERR(rq);
+	scmd = blk_mq_rq_to_pdu(rq);
+
+	if (cgc->buflen) {
+		ret = blk_rq_map_kern(q, rq, cgc->buffer, cgc->buflen,
+				      GFP_NOIO);
+		if (ret)
+			goto out;
+	}
+
+	scmd->cmd_len = COMMAND_SIZE(cgc->cmd[0]);
+	memcpy(scmd->cmnd, cgc->cmd, CDROM_PACKET_SIZE);
+
+	rq->timeout = 60*HZ;
+	if (cgc->quiet)
+		rq->rq_flags |= RQF_QUIET;
+
+	blk_execute_rq(rq, false);
+	if (scmd->result)
+		ret = -EIO;
+out:
+	blk_mq_free_request(rq);
+	return ret;
+}
+
+static const char *sense_key_string(__u8 index)
+{
+	static const char * const info[] = {
+		"No sense", "Recovered error", "Not ready",
+		"Medium error", "Hardware error", "Illegal request",
+		"Unit attention", "Data protect", "Blank check",
+	};
+
+	return index < ARRAY_SIZE(info) ? info[index] : "INVALID";
+}
+
+/*
+ * A generic sense dump / resolve mechanism should be implemented across
+ * all ATAPI + SCSI devices.
+ */
+static void pkt_dump_sense(struct pktcdvd_device *pd,
+			   struct packet_command *cgc)
+{
+	struct scsi_sense_hdr *sshdr = cgc->sshdr;
+
+	if (sshdr)
+		pkt_err(pd, "%*ph - sense %02x.%02x.%02x (%s)\n",
+			CDROM_PACKET_SIZE, cgc->cmd,
+			sshdr->sense_key, sshdr->asc, sshdr->ascq,
+			sense_key_string(sshdr->sense_key));
+	else
+		pkt_err(pd, "%*ph - no sense\n", CDROM_PACKET_SIZE, cgc->cmd);
+}
+
+/*
+ * flush the drive cache to media
+ */
+static int pkt_flush_cache(struct pktcdvd_device *pd)
+{
+	struct packet_command cgc;
+
+	init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE);
+	cgc.cmd[0] = GPCMD_FLUSH_CACHE;
+	cgc.quiet = 1;
+
+	/*
+	 * the IMMED bit -- we default to not setting it, although that
+	 * would allow a much faster close, this is safer
+	 */
+#if 0
+	cgc.cmd[1] = 1 << 1;
+#endif
+	return pkt_generic_packet(pd, &cgc);
+}
+
+/*
+ * speed is given as the normal factor, e.g. 4 for 4x
+ */
+static noinline_for_stack int pkt_set_speed(struct pktcdvd_device *pd,
+				unsigned write_speed, unsigned read_speed)
+{
+	struct packet_command cgc;
+	struct scsi_sense_hdr sshdr;
+	int ret;
+
+	init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE);
+	cgc.sshdr = &sshdr;
+	cgc.cmd[0] = GPCMD_SET_SPEED;
+	cgc.cmd[2] = (read_speed >> 8) & 0xff;
+	cgc.cmd[3] = read_speed & 0xff;
+	cgc.cmd[4] = (write_speed >> 8) & 0xff;
+	cgc.cmd[5] = write_speed & 0xff;
+
+	ret = pkt_generic_packet(pd, &cgc);
+	if (ret)
+		pkt_dump_sense(pd, &cgc);
+
+	return ret;
+}
+
+/*
+ * Queue a bio for processing by the low-level CD device. Must be called
+ * from process context.
+ */
+static void pkt_queue_bio(struct pktcdvd_device *pd, struct bio *bio)
+{
+	spin_lock(&pd->iosched.lock);
+	if (bio_data_dir(bio) == READ)
+		bio_list_add(&pd->iosched.read_queue, bio);
+	else
+		bio_list_add(&pd->iosched.write_queue, bio);
+	spin_unlock(&pd->iosched.lock);
+
+	atomic_set(&pd->iosched.attention, 1);
+	wake_up(&pd->wqueue);
+}
+
+/*
+ * Process the queued read/write requests. This function handles special
+ * requirements for CDRW drives:
+ * - A cache flush command must be inserted before a read request if the
+ *   previous request was a write.
+ * - Switching between reading and writing is slow, so don't do it more often
+ *   than necessary.
+ * - Optimize for throughput at the expense of latency. This means that streaming
+ *   writes will never be interrupted by a read, but if the drive has to seek
+ *   before the next write, switch to reading instead if there are any pending
+ *   read requests.
+ * - Set the read speed according to current usage pattern. When only reading
+ *   from the device, it's best to use the highest possible read speed, but
+ *   when switching often between reading and writing, it's better to have the
+ *   same read and write speeds.
+ */
+static void pkt_iosched_process_queue(struct pktcdvd_device *pd)
+{
+
+	if (atomic_read(&pd->iosched.attention) == 0)
+		return;
+	atomic_set(&pd->iosched.attention, 0);
+
+	for (;;) {
+		struct bio *bio;
+		int reads_queued, writes_queued;
+
+		spin_lock(&pd->iosched.lock);
+		reads_queued = !bio_list_empty(&pd->iosched.read_queue);
+		writes_queued = !bio_list_empty(&pd->iosched.write_queue);
+		spin_unlock(&pd->iosched.lock);
+
+		if (!reads_queued && !writes_queued)
+			break;
+
+		if (pd->iosched.writing) {
+			int need_write_seek = 1;
+			spin_lock(&pd->iosched.lock);
+			bio = bio_list_peek(&pd->iosched.write_queue);
+			spin_unlock(&pd->iosched.lock);
+			if (bio && (bio->bi_iter.bi_sector ==
+				    pd->iosched.last_write))
+				need_write_seek = 0;
+			if (need_write_seek && reads_queued) {
+				if (atomic_read(&pd->cdrw.pending_bios) > 0) {
+					pkt_dbg(2, pd, "write, waiting\n");
+					break;
+				}
+				pkt_flush_cache(pd);
+				pd->iosched.writing = 0;
+			}
+		} else {
+			if (!reads_queued && writes_queued) {
+				if (atomic_read(&pd->cdrw.pending_bios) > 0) {
+					pkt_dbg(2, pd, "read, waiting\n");
+					break;
+				}
+				pd->iosched.writing = 1;
+			}
+		}
+
+		spin_lock(&pd->iosched.lock);
+		if (pd->iosched.writing)
+			bio = bio_list_pop(&pd->iosched.write_queue);
+		else
+			bio = bio_list_pop(&pd->iosched.read_queue);
+		spin_unlock(&pd->iosched.lock);
+
+		if (!bio)
+			continue;
+
+		if (bio_data_dir(bio) == READ)
+			pd->iosched.successive_reads +=
+				bio->bi_iter.bi_size >> 10;
+		else {
+			pd->iosched.successive_reads = 0;
+			pd->iosched.last_write = bio_end_sector(bio);
+		}
+		if (pd->iosched.successive_reads >= HI_SPEED_SWITCH) {
+			if (pd->read_speed == pd->write_speed) {
+				pd->read_speed = MAX_SPEED;
+				pkt_set_speed(pd, pd->write_speed, pd->read_speed);
+			}
+		} else {
+			if (pd->read_speed != pd->write_speed) {
+				pd->read_speed = pd->write_speed;
+				pkt_set_speed(pd, pd->write_speed, pd->read_speed);
+			}
+		}
+
+		atomic_inc(&pd->cdrw.pending_bios);
+		submit_bio_noacct(bio);
+	}
+}
+
+/*
+ * Special care is needed if the underlying block device has a small
+ * max_phys_segments value.
+ */
+static int pkt_set_segment_merging(struct pktcdvd_device *pd, struct request_queue *q)
+{
+	if ((pd->settings.size << 9) / CD_FRAMESIZE
+	    <= queue_max_segments(q)) {
+		/*
+		 * The cdrom device can handle one segment/frame
+		 */
+		clear_bit(PACKET_MERGE_SEGS, &pd->flags);
+		return 0;
+	} else if ((pd->settings.size << 9) / PAGE_SIZE
+		   <= queue_max_segments(q)) {
+		/*
+		 * We can handle this case at the expense of some extra memory
+		 * copies during write operations
+		 */
+		set_bit(PACKET_MERGE_SEGS, &pd->flags);
+		return 0;
+	} else {
+		pkt_err(pd, "cdrom max_phys_segments too small\n");
+		return -EIO;
+	}
+}
+
+static void pkt_end_io_read(struct bio *bio)
+{
+	struct packet_data *pkt = bio->bi_private;
+	struct pktcdvd_device *pd = pkt->pd;
+	BUG_ON(!pd);
+
+	pkt_dbg(2, pd, "bio=%p sec0=%llx sec=%llx err=%d\n",
+		bio, (unsigned long long)pkt->sector,
+		(unsigned long long)bio->bi_iter.bi_sector, bio->bi_status);
+
+	if (bio->bi_status)
+		atomic_inc(&pkt->io_errors);
+	bio_uninit(bio);
+	if (atomic_dec_and_test(&pkt->io_wait)) {
+		atomic_inc(&pkt->run_sm);
+		wake_up(&pd->wqueue);
+	}
+	pkt_bio_finished(pd);
+}
+
+static void pkt_end_io_packet_write(struct bio *bio)
+{
+	struct packet_data *pkt = bio->bi_private;
+	struct pktcdvd_device *pd = pkt->pd;
+	BUG_ON(!pd);
+
+	pkt_dbg(2, pd, "id=%d, err=%d\n", pkt->id, bio->bi_status);
+
+	pd->stats.pkt_ended++;
+
+	bio_uninit(bio);
+	pkt_bio_finished(pd);
+	atomic_dec(&pkt->io_wait);
+	atomic_inc(&pkt->run_sm);
+	wake_up(&pd->wqueue);
+}
+
+/*
+ * Schedule reads for the holes in a packet
+ */
+static void pkt_gather_data(struct pktcdvd_device *pd, struct packet_data *pkt)
+{
+	int frames_read = 0;
+	struct bio *bio;
+	int f;
+	char written[PACKET_MAX_SIZE];
+
+	BUG_ON(bio_list_empty(&pkt->orig_bios));
+
+	atomic_set(&pkt->io_wait, 0);
+	atomic_set(&pkt->io_errors, 0);
+
+	/*
+	 * Figure out which frames we need to read before we can write.
+	 */
+	memset(written, 0, sizeof(written));
+	spin_lock(&pkt->lock);
+	bio_list_for_each(bio, &pkt->orig_bios) {
+		int first_frame = (bio->bi_iter.bi_sector - pkt->sector) /
+			(CD_FRAMESIZE >> 9);
+		int num_frames = bio->bi_iter.bi_size / CD_FRAMESIZE;
+		pd->stats.secs_w += num_frames * (CD_FRAMESIZE >> 9);
+		BUG_ON(first_frame < 0);
+		BUG_ON(first_frame + num_frames > pkt->frames);
+		for (f = first_frame; f < first_frame + num_frames; f++)
+			written[f] = 1;
+	}
+	spin_unlock(&pkt->lock);
+
+	if (pkt->cache_valid) {
+		pkt_dbg(2, pd, "zone %llx cached\n",
+			(unsigned long long)pkt->sector);
+		goto out_account;
+	}
+
+	/*
+	 * Schedule reads for missing parts of the packet.
+	 */
+	for (f = 0; f < pkt->frames; f++) {
+		int p, offset;
+
+		if (written[f])
+			continue;
+
+		bio = pkt->r_bios[f];
+		bio_init(bio, pd->bdev, bio->bi_inline_vecs, 1, REQ_OP_READ);
+		bio->bi_iter.bi_sector = pkt->sector + f * (CD_FRAMESIZE >> 9);
+		bio->bi_end_io = pkt_end_io_read;
+		bio->bi_private = pkt;
+
+		p = (f * CD_FRAMESIZE) / PAGE_SIZE;
+		offset = (f * CD_FRAMESIZE) % PAGE_SIZE;
+		pkt_dbg(2, pd, "Adding frame %d, page:%p offs:%d\n",
+			f, pkt->pages[p], offset);
+		if (!bio_add_page(bio, pkt->pages[p], CD_FRAMESIZE, offset))
+			BUG();
+
+		atomic_inc(&pkt->io_wait);
+		pkt_queue_bio(pd, bio);
+		frames_read++;
+	}
+
+out_account:
+	pkt_dbg(2, pd, "need %d frames for zone %llx\n",
+		frames_read, (unsigned long long)pkt->sector);
+	pd->stats.pkt_started++;
+	pd->stats.secs_rg += frames_read * (CD_FRAMESIZE >> 9);
+}
+
+/*
+ * Find a packet matching zone, or the least recently used packet if
+ * there is no match.
+ */
+static struct packet_data *pkt_get_packet_data(struct pktcdvd_device *pd, int zone)
+{
+	struct packet_data *pkt;
+
+	list_for_each_entry(pkt, &pd->cdrw.pkt_free_list, list) {
+		if (pkt->sector == zone || pkt->list.next == &pd->cdrw.pkt_free_list) {
+			list_del_init(&pkt->list);
+			if (pkt->sector != zone)
+				pkt->cache_valid = 0;
+			return pkt;
+		}
+	}
+	BUG();
+	return NULL;
+}
+
+static void pkt_put_packet_data(struct pktcdvd_device *pd, struct packet_data *pkt)
+{
+	if (pkt->cache_valid) {
+		list_add(&pkt->list, &pd->cdrw.pkt_free_list);
+	} else {
+		list_add_tail(&pkt->list, &pd->cdrw.pkt_free_list);
+	}
+}
+
+static inline void pkt_set_state(struct packet_data *pkt, enum packet_data_state state)
+{
+#if PACKET_DEBUG > 1
+	static const char *state_name[] = {
+		"IDLE", "WAITING", "READ_WAIT", "WRITE_WAIT", "RECOVERY", "FINISHED"
+	};
+	enum packet_data_state old_state = pkt->state;
+	pkt_dbg(2, pd, "pkt %2d : s=%6llx %s -> %s\n",
+		pkt->id, (unsigned long long)pkt->sector,
+		state_name[old_state], state_name[state]);
+#endif
+	pkt->state = state;
+}
+
+/*
+ * Scan the work queue to see if we can start a new packet.
+ * returns non-zero if any work was done.
+ */
+static int pkt_handle_queue(struct pktcdvd_device *pd)
+{
+	struct packet_data *pkt, *p;
+	struct bio *bio = NULL;
+	sector_t zone = 0; /* Suppress gcc warning */
+	struct pkt_rb_node *node, *first_node;
+	struct rb_node *n;
+
+	atomic_set(&pd->scan_queue, 0);
+
+	if (list_empty(&pd->cdrw.pkt_free_list)) {
+		pkt_dbg(2, pd, "no pkt\n");
+		return 0;
+	}
+
+	/*
+	 * Try to find a zone we are not already working on.
+	 */
+	spin_lock(&pd->lock);
+	first_node = pkt_rbtree_find(pd, pd->current_sector);
+	if (!first_node) {
+		n = rb_first(&pd->bio_queue);
+		if (n)
+			first_node = rb_entry(n, struct pkt_rb_node, rb_node);
+	}
+	node = first_node;
+	while (node) {
+		bio = node->bio;
+		zone = get_zone(bio->bi_iter.bi_sector, pd);
+		list_for_each_entry(p, &pd->cdrw.pkt_active_list, list) {
+			if (p->sector == zone) {
+				bio = NULL;
+				goto try_next_bio;
+			}
+		}
+		break;
+try_next_bio:
+		node = pkt_rbtree_next(node);
+		if (!node) {
+			n = rb_first(&pd->bio_queue);
+			if (n)
+				node = rb_entry(n, struct pkt_rb_node, rb_node);
+		}
+		if (node == first_node)
+			node = NULL;
+	}
+	spin_unlock(&pd->lock);
+	if (!bio) {
+		pkt_dbg(2, pd, "no bio\n");
+		return 0;
+	}
+
+	pkt = pkt_get_packet_data(pd, zone);
+
+	pd->current_sector = zone + pd->settings.size;
+	pkt->sector = zone;
+	BUG_ON(pkt->frames != pd->settings.size >> 2);
+	pkt->write_size = 0;
+
+	/*
+	 * Scan work queue for bios in the same zone and link them
+	 * to this packet.
+	 */
+	spin_lock(&pd->lock);
+	pkt_dbg(2, pd, "looking for zone %llx\n", (unsigned long long)zone);
+	while ((node = pkt_rbtree_find(pd, zone)) != NULL) {
+		bio = node->bio;
+		pkt_dbg(2, pd, "found zone=%llx\n", (unsigned long long)
+			get_zone(bio->bi_iter.bi_sector, pd));
+		if (get_zone(bio->bi_iter.bi_sector, pd) != zone)
+			break;
+		pkt_rbtree_erase(pd, node);
+		spin_lock(&pkt->lock);
+		bio_list_add(&pkt->orig_bios, bio);
+		pkt->write_size += bio->bi_iter.bi_size / CD_FRAMESIZE;
+		spin_unlock(&pkt->lock);
+	}
+	/* check write congestion marks, and if bio_queue_size is
+	 * below, wake up any waiters
+	 */
+	if (pd->congested &&
+	    pd->bio_queue_size <= pd->write_congestion_off) {
+		pd->congested = false;
+		wake_up_var(&pd->congested);
+	}
+	spin_unlock(&pd->lock);
+
+	pkt->sleep_time = max(PACKET_WAIT_TIME, 1);
+	pkt_set_state(pkt, PACKET_WAITING_STATE);
+	atomic_set(&pkt->run_sm, 1);
+
+	spin_lock(&pd->cdrw.active_list_lock);
+	list_add(&pkt->list, &pd->cdrw.pkt_active_list);
+	spin_unlock(&pd->cdrw.active_list_lock);
+
+	return 1;
+}
+
+/**
+ * bio_list_copy_data - copy contents of data buffers from one chain of bios to
+ * another
+ * @src: source bio list
+ * @dst: destination bio list
+ *
+ * Stops when it reaches the end of either the @src list or @dst list - that is,
+ * copies min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of
+ * bios).
+ */
+static void bio_list_copy_data(struct bio *dst, struct bio *src)
+{
+	struct bvec_iter src_iter = src->bi_iter;
+	struct bvec_iter dst_iter = dst->bi_iter;
+
+	while (1) {
+		if (!src_iter.bi_size) {
+			src = src->bi_next;
+			if (!src)
+				break;
+
+			src_iter = src->bi_iter;
+		}
+
+		if (!dst_iter.bi_size) {
+			dst = dst->bi_next;
+			if (!dst)
+				break;
+
+			dst_iter = dst->bi_iter;
+		}
+
+		bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
+	}
+}
+
+/*
+ * Assemble a bio to write one packet and queue the bio for processing
+ * by the underlying block device.
+ */
+static void pkt_start_write(struct pktcdvd_device *pd, struct packet_data *pkt)
+{
+	int f;
+
+	bio_init(pkt->w_bio, pd->bdev, pkt->w_bio->bi_inline_vecs, pkt->frames,
+		 REQ_OP_WRITE);
+	pkt->w_bio->bi_iter.bi_sector = pkt->sector;
+	pkt->w_bio->bi_end_io = pkt_end_io_packet_write;
+	pkt->w_bio->bi_private = pkt;
+
+	/* XXX: locking? */
+	for (f = 0; f < pkt->frames; f++) {
+		struct page *page = pkt->pages[(f * CD_FRAMESIZE) / PAGE_SIZE];
+		unsigned offset = (f * CD_FRAMESIZE) % PAGE_SIZE;
+
+		if (!bio_add_page(pkt->w_bio, page, CD_FRAMESIZE, offset))
+			BUG();
+	}
+	pkt_dbg(2, pd, "vcnt=%d\n", pkt->w_bio->bi_vcnt);
+
+	/*
+	 * Fill-in bvec with data from orig_bios.
+	 */
+	spin_lock(&pkt->lock);
+	bio_list_copy_data(pkt->w_bio, pkt->orig_bios.head);
+
+	pkt_set_state(pkt, PACKET_WRITE_WAIT_STATE);
+	spin_unlock(&pkt->lock);
+
+	pkt_dbg(2, pd, "Writing %d frames for zone %llx\n",
+		pkt->write_size, (unsigned long long)pkt->sector);
+
+	if (test_bit(PACKET_MERGE_SEGS, &pd->flags) || (pkt->write_size < pkt->frames))
+		pkt->cache_valid = 1;
+	else
+		pkt->cache_valid = 0;
+
+	/* Start the write request */
+	atomic_set(&pkt->io_wait, 1);
+	pkt_queue_bio(pd, pkt->w_bio);
+}
+
+static void pkt_finish_packet(struct packet_data *pkt, blk_status_t status)
+{
+	struct bio *bio;
+
+	if (status)
+		pkt->cache_valid = 0;
+
+	/* Finish all bios corresponding to this packet */
+	while ((bio = bio_list_pop(&pkt->orig_bios))) {
+		bio->bi_status = status;
+		bio_endio(bio);
+	}
+}
+
+static void pkt_run_state_machine(struct pktcdvd_device *pd, struct packet_data *pkt)
+{
+	pkt_dbg(2, pd, "pkt %d\n", pkt->id);
+
+	for (;;) {
+		switch (pkt->state) {
+		case PACKET_WAITING_STATE:
+			if ((pkt->write_size < pkt->frames) && (pkt->sleep_time > 0))
+				return;
+
+			pkt->sleep_time = 0;
+			pkt_gather_data(pd, pkt);
+			pkt_set_state(pkt, PACKET_READ_WAIT_STATE);
+			break;
+
+		case PACKET_READ_WAIT_STATE:
+			if (atomic_read(&pkt->io_wait) > 0)
+				return;
+
+			if (atomic_read(&pkt->io_errors) > 0) {
+				pkt_set_state(pkt, PACKET_RECOVERY_STATE);
+			} else {
+				pkt_start_write(pd, pkt);
+			}
+			break;
+
+		case PACKET_WRITE_WAIT_STATE:
+			if (atomic_read(&pkt->io_wait) > 0)
+				return;
+
+			if (!pkt->w_bio->bi_status) {
+				pkt_set_state(pkt, PACKET_FINISHED_STATE);
+			} else {
+				pkt_set_state(pkt, PACKET_RECOVERY_STATE);
+			}
+			break;
+
+		case PACKET_RECOVERY_STATE:
+			pkt_dbg(2, pd, "No recovery possible\n");
+			pkt_set_state(pkt, PACKET_FINISHED_STATE);
+			break;
+
+		case PACKET_FINISHED_STATE:
+			pkt_finish_packet(pkt, pkt->w_bio->bi_status);
+			return;
+
+		default:
+			BUG();
+			break;
+		}
+	}
+}
+
+static void pkt_handle_packets(struct pktcdvd_device *pd)
+{
+	struct packet_data *pkt, *next;
+
+	/*
+	 * Run state machine for active packets
+	 */
+	list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) {
+		if (atomic_read(&pkt->run_sm) > 0) {
+			atomic_set(&pkt->run_sm, 0);
+			pkt_run_state_machine(pd, pkt);
+		}
+	}
+
+	/*
+	 * Move no longer active packets to the free list
+	 */
+	spin_lock(&pd->cdrw.active_list_lock);
+	list_for_each_entry_safe(pkt, next, &pd->cdrw.pkt_active_list, list) {
+		if (pkt->state == PACKET_FINISHED_STATE) {
+			list_del(&pkt->list);
+			pkt_put_packet_data(pd, pkt);
+			pkt_set_state(pkt, PACKET_IDLE_STATE);
+			atomic_set(&pd->scan_queue, 1);
+		}
+	}
+	spin_unlock(&pd->cdrw.active_list_lock);
+}
+
+static void pkt_count_states(struct pktcdvd_device *pd, int *states)
+{
+	struct packet_data *pkt;
+	int i;
+
+	for (i = 0; i < PACKET_NUM_STATES; i++)
+		states[i] = 0;
+
+	spin_lock(&pd->cdrw.active_list_lock);
+	list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) {
+		states[pkt->state]++;
+	}
+	spin_unlock(&pd->cdrw.active_list_lock);
+}
+
+/*
+ * kcdrwd is woken up when writes have been queued for one of our
+ * registered devices
+ */
+static int kcdrwd(void *foobar)
+{
+	struct pktcdvd_device *pd = foobar;
+	struct packet_data *pkt;
+	long min_sleep_time, residue;
+
+	set_user_nice(current, MIN_NICE);
+	set_freezable();
+
+	for (;;) {
+		DECLARE_WAITQUEUE(wait, current);
+
+		/*
+		 * Wait until there is something to do
+		 */
+		add_wait_queue(&pd->wqueue, &wait);
+		for (;;) {
+			set_current_state(TASK_INTERRUPTIBLE);
+
+			/* Check if we need to run pkt_handle_queue */
+			if (atomic_read(&pd->scan_queue) > 0)
+				goto work_to_do;
+
+			/* Check if we need to run the state machine for some packet */
+			list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) {
+				if (atomic_read(&pkt->run_sm) > 0)
+					goto work_to_do;
+			}
+
+			/* Check if we need to process the iosched queues */
+			if (atomic_read(&pd->iosched.attention) != 0)
+				goto work_to_do;
+
+			/* Otherwise, go to sleep */
+			if (PACKET_DEBUG > 1) {
+				int states[PACKET_NUM_STATES];
+				pkt_count_states(pd, states);
+				pkt_dbg(2, pd, "i:%d ow:%d rw:%d ww:%d rec:%d fin:%d\n",
+					states[0], states[1], states[2],
+					states[3], states[4], states[5]);
+			}
+
+			min_sleep_time = MAX_SCHEDULE_TIMEOUT;
+			list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) {
+				if (pkt->sleep_time && pkt->sleep_time < min_sleep_time)
+					min_sleep_time = pkt->sleep_time;
+			}
+
+			pkt_dbg(2, pd, "sleeping\n");
+			residue = schedule_timeout(min_sleep_time);
+			pkt_dbg(2, pd, "wake up\n");
+
+			/* make swsusp happy with our thread */
+			try_to_freeze();
+
+			list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) {
+				if (!pkt->sleep_time)
+					continue;
+				pkt->sleep_time -= min_sleep_time - residue;
+				if (pkt->sleep_time <= 0) {
+					pkt->sleep_time = 0;
+					atomic_inc(&pkt->run_sm);
+				}
+			}
+
+			if (kthread_should_stop())
+				break;
+		}
+work_to_do:
+		set_current_state(TASK_RUNNING);
+		remove_wait_queue(&pd->wqueue, &wait);
+
+		if (kthread_should_stop())
+			break;
+
+		/*
+		 * if pkt_handle_queue returns true, we can queue
+		 * another request.
+		 */
+		while (pkt_handle_queue(pd))
+			;
+
+		/*
+		 * Handle packet state machine
+		 */
+		pkt_handle_packets(pd);
+
+		/*
+		 * Handle iosched queues
+		 */
+		pkt_iosched_process_queue(pd);
+	}
+
+	return 0;
+}
+
+static void pkt_print_settings(struct pktcdvd_device *pd)
+{
+	pkt_info(pd, "%s packets, %u blocks, Mode-%c disc\n",
+		 pd->settings.fp ? "Fixed" : "Variable",
+		 pd->settings.size >> 2,
+		 pd->settings.block_mode == 8 ? '1' : '2');
+}
+
+static int pkt_mode_sense(struct pktcdvd_device *pd, struct packet_command *cgc, int page_code, int page_control)
+{
+	memset(cgc->cmd, 0, sizeof(cgc->cmd));
+
+	cgc->cmd[0] = GPCMD_MODE_SENSE_10;
+	cgc->cmd[2] = page_code | (page_control << 6);
+	cgc->cmd[7] = cgc->buflen >> 8;
+	cgc->cmd[8] = cgc->buflen & 0xff;
+	cgc->data_direction = CGC_DATA_READ;
+	return pkt_generic_packet(pd, cgc);
+}
+
+static int pkt_mode_select(struct pktcdvd_device *pd, struct packet_command *cgc)
+{
+	memset(cgc->cmd, 0, sizeof(cgc->cmd));
+	memset(cgc->buffer, 0, 2);
+	cgc->cmd[0] = GPCMD_MODE_SELECT_10;
+	cgc->cmd[1] = 0x10;		/* PF */
+	cgc->cmd[7] = cgc->buflen >> 8;
+	cgc->cmd[8] = cgc->buflen & 0xff;
+	cgc->data_direction = CGC_DATA_WRITE;
+	return pkt_generic_packet(pd, cgc);
+}
+
+static int pkt_get_disc_info(struct pktcdvd_device *pd, disc_information *di)
+{
+	struct packet_command cgc;
+	int ret;
+
+	/* set up command and get the disc info */
+	init_cdrom_command(&cgc, di, sizeof(*di), CGC_DATA_READ);
+	cgc.cmd[0] = GPCMD_READ_DISC_INFO;
+	cgc.cmd[8] = cgc.buflen = 2;
+	cgc.quiet = 1;
+
+	ret = pkt_generic_packet(pd, &cgc);
+	if (ret)
+		return ret;
+
+	/* not all drives have the same disc_info length, so requeue
+	 * packet with the length the drive tells us it can supply
+	 */
+	cgc.buflen = be16_to_cpu(di->disc_information_length) +
+		     sizeof(di->disc_information_length);
+
+	if (cgc.buflen > sizeof(disc_information))
+		cgc.buflen = sizeof(disc_information);
+
+	cgc.cmd[8] = cgc.buflen;
+	return pkt_generic_packet(pd, &cgc);
+}
+
+static int pkt_get_track_info(struct pktcdvd_device *pd, __u16 track, __u8 type, track_information *ti)
+{
+	struct packet_command cgc;
+	int ret;
+
+	init_cdrom_command(&cgc, ti, 8, CGC_DATA_READ);
+	cgc.cmd[0] = GPCMD_READ_TRACK_RZONE_INFO;
+	cgc.cmd[1] = type & 3;
+	cgc.cmd[4] = (track & 0xff00) >> 8;
+	cgc.cmd[5] = track & 0xff;
+	cgc.cmd[8] = 8;
+	cgc.quiet = 1;
+
+	ret = pkt_generic_packet(pd, &cgc);
+	if (ret)
+		return ret;
+
+	cgc.buflen = be16_to_cpu(ti->track_information_length) +
+		     sizeof(ti->track_information_length);
+
+	if (cgc.buflen > sizeof(track_information))
+		cgc.buflen = sizeof(track_information);
+
+	cgc.cmd[8] = cgc.buflen;
+	return pkt_generic_packet(pd, &cgc);
+}
+
+static noinline_for_stack int pkt_get_last_written(struct pktcdvd_device *pd,
+						long *last_written)
+{
+	disc_information di;
+	track_information ti;
+	__u32 last_track;
+	int ret;
+
+	ret = pkt_get_disc_info(pd, &di);
+	if (ret)
+		return ret;
+
+	last_track = (di.last_track_msb << 8) | di.last_track_lsb;
+	ret = pkt_get_track_info(pd, last_track, 1, &ti);
+	if (ret)
+		return ret;
+
+	/* if this track is blank, try the previous. */
+	if (ti.blank) {
+		last_track--;
+		ret = pkt_get_track_info(pd, last_track, 1, &ti);
+		if (ret)
+			return ret;
+	}
+
+	/* if last recorded field is valid, return it. */
+	if (ti.lra_v) {
+		*last_written = be32_to_cpu(ti.last_rec_address);
+	} else {
+		/* make it up instead */
+		*last_written = be32_to_cpu(ti.track_start) +
+				be32_to_cpu(ti.track_size);
+		if (ti.free_blocks)
+			*last_written -= (be32_to_cpu(ti.free_blocks) + 7);
+	}
+	return 0;
+}
+
+/*
+ * write mode select package based on pd->settings
+ */
+static noinline_for_stack int pkt_set_write_settings(struct pktcdvd_device *pd)
+{
+	struct packet_command cgc;
+	struct scsi_sense_hdr sshdr;
+	write_param_page *wp;
+	char buffer[128];
+	int ret, size;
+
+	/* doesn't apply to DVD+RW or DVD-RAM */
+	if ((pd->mmc3_profile == 0x1a) || (pd->mmc3_profile == 0x12))
+		return 0;
+
+	memset(buffer, 0, sizeof(buffer));
+	init_cdrom_command(&cgc, buffer, sizeof(*wp), CGC_DATA_READ);
+	cgc.sshdr = &sshdr;
+	ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0);
+	if (ret) {
+		pkt_dump_sense(pd, &cgc);
+		return ret;
+	}
+
+	size = 2 + ((buffer[0] << 8) | (buffer[1] & 0xff));
+	pd->mode_offset = (buffer[6] << 8) | (buffer[7] & 0xff);
+	if (size > sizeof(buffer))
+		size = sizeof(buffer);
+
+	/*
+	 * now get it all
+	 */
+	init_cdrom_command(&cgc, buffer, size, CGC_DATA_READ);
+	cgc.sshdr = &sshdr;
+	ret = pkt_mode_sense(pd, &cgc, GPMODE_WRITE_PARMS_PAGE, 0);
+	if (ret) {
+		pkt_dump_sense(pd, &cgc);
+		return ret;
+	}
+
+	/*
+	 * write page is offset header + block descriptor length
+	 */
+	wp = (write_param_page *) &buffer[sizeof(struct mode_page_header) + pd->mode_offset];
+
+	wp->fp = pd->settings.fp;
+	wp->track_mode = pd->settings.track_mode;
+	wp->write_type = pd->settings.write_type;
+	wp->data_block_type = pd->settings.block_mode;
+
+	wp->multi_session = 0;
+
+#ifdef PACKET_USE_LS
+	wp->link_size = 7;
+	wp->ls_v = 1;
+#endif
+
+	if (wp->data_block_type == PACKET_BLOCK_MODE1) {
+		wp->session_format = 0;
+		wp->subhdr2 = 0x20;
+	} else if (wp->data_block_type == PACKET_BLOCK_MODE2) {
+		wp->session_format = 0x20;
+		wp->subhdr2 = 8;
+#if 0
+		wp->mcn[0] = 0x80;
+		memcpy(&wp->mcn[1], PACKET_MCN, sizeof(wp->mcn) - 1);
+#endif
+	} else {
+		/*
+		 * paranoia
+		 */
+		pkt_err(pd, "write mode wrong %d\n", wp->data_block_type);
+		return 1;
+	}
+	wp->packet_size = cpu_to_be32(pd->settings.size >> 2);
+
+	cgc.buflen = cgc.cmd[8] = size;
+	ret = pkt_mode_select(pd, &cgc);
+	if (ret) {
+		pkt_dump_sense(pd, &cgc);
+		return ret;
+	}
+
+	pkt_print_settings(pd);
+	return 0;
+}
+
+/*
+ * 1 -- we can write to this track, 0 -- we can't
+ */
+static int pkt_writable_track(struct pktcdvd_device *pd, track_information *ti)
+{
+	switch (pd->mmc3_profile) {
+		case 0x1a: /* DVD+RW */
+		case 0x12: /* DVD-RAM */
+			/* The track is always writable on DVD+RW/DVD-RAM */
+			return 1;
+		default:
+			break;
+	}
+
+	if (!ti->packet || !ti->fp)
+		return 0;
+
+	/*
+	 * "good" settings as per Mt Fuji.
+	 */
+	if (ti->rt == 0 && ti->blank == 0)
+		return 1;
+
+	if (ti->rt == 0 && ti->blank == 1)
+		return 1;
+
+	if (ti->rt == 1 && ti->blank == 0)
+		return 1;
+
+	pkt_err(pd, "bad state %d-%d-%d\n", ti->rt, ti->blank, ti->packet);
+	return 0;
+}
+
+/*
+ * 1 -- we can write to this disc, 0 -- we can't
+ */
+static int pkt_writable_disc(struct pktcdvd_device *pd, disc_information *di)
+{
+	switch (pd->mmc3_profile) {
+		case 0x0a: /* CD-RW */
+		case 0xffff: /* MMC3 not supported */
+			break;
+		case 0x1a: /* DVD+RW */
+		case 0x13: /* DVD-RW */
+		case 0x12: /* DVD-RAM */
+			return 1;
+		default:
+			pkt_dbg(2, pd, "Wrong disc profile (%x)\n",
+				pd->mmc3_profile);
+			return 0;
+	}
+
+	/*
+	 * for disc type 0xff we should probably reserve a new track.
+	 * but i'm not sure, should we leave this to user apps? probably.
+	 */
+	if (di->disc_type == 0xff) {
+		pkt_notice(pd, "unknown disc - no track?\n");
+		return 0;
+	}
+
+	if (di->disc_type != 0x20 && di->disc_type != 0) {
+		pkt_err(pd, "wrong disc type (%x)\n", di->disc_type);
+		return 0;
+	}
+
+	if (di->erasable == 0) {
+		pkt_notice(pd, "disc not erasable\n");
+		return 0;
+	}
+
+	if (di->border_status == PACKET_SESSION_RESERVED) {
+		pkt_err(pd, "can't write to last track (reserved)\n");
+		return 0;
+	}
+
+	return 1;
+}
+
+static noinline_for_stack int pkt_probe_settings(struct pktcdvd_device *pd)
+{
+	struct packet_command cgc;
+	unsigned char buf[12];
+	disc_information di;
+	track_information ti;
+	int ret, track;
+
+	init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_READ);
+	cgc.cmd[0] = GPCMD_GET_CONFIGURATION;
+	cgc.cmd[8] = 8;
+	ret = pkt_generic_packet(pd, &cgc);
+	pd->mmc3_profile = ret ? 0xffff : buf[6] << 8 | buf[7];
+
+	memset(&di, 0, sizeof(disc_information));
+	memset(&ti, 0, sizeof(track_information));
+
+	ret = pkt_get_disc_info(pd, &di);
+	if (ret) {
+		pkt_err(pd, "failed get_disc\n");
+		return ret;
+	}
+
+	if (!pkt_writable_disc(pd, &di))
+		return -EROFS;
+
+	pd->type = di.erasable ? PACKET_CDRW : PACKET_CDR;
+
+	track = 1; /* (di.last_track_msb << 8) | di.last_track_lsb; */
+	ret = pkt_get_track_info(pd, track, 1, &ti);
+	if (ret) {
+		pkt_err(pd, "failed get_track\n");
+		return ret;
+	}
+
+	if (!pkt_writable_track(pd, &ti)) {
+		pkt_err(pd, "can't write to this track\n");
+		return -EROFS;
+	}
+
+	/*
+	 * we keep packet size in 512 byte units, makes it easier to
+	 * deal with request calculations.
+	 */
+	pd->settings.size = be32_to_cpu(ti.fixed_packet_size) << 2;
+	if (pd->settings.size == 0) {
+		pkt_notice(pd, "detected zero packet size!\n");
+		return -ENXIO;
+	}
+	if (pd->settings.size > PACKET_MAX_SECTORS) {
+		pkt_err(pd, "packet size is too big\n");
+		return -EROFS;
+	}
+	pd->settings.fp = ti.fp;
+	pd->offset = (be32_to_cpu(ti.track_start) << 2) & (pd->settings.size - 1);
+
+	if (ti.nwa_v) {
+		pd->nwa = be32_to_cpu(ti.next_writable);
+		set_bit(PACKET_NWA_VALID, &pd->flags);
+	}
+
+	/*
+	 * in theory we could use lra on -RW media as well and just zero
+	 * blocks that haven't been written yet, but in practice that
+	 * is just a no-go. we'll use that for -R, naturally.
+	 */
+	if (ti.lra_v) {
+		pd->lra = be32_to_cpu(ti.last_rec_address);
+		set_bit(PACKET_LRA_VALID, &pd->flags);
+	} else {
+		pd->lra = 0xffffffff;
+		set_bit(PACKET_LRA_VALID, &pd->flags);
+	}
+
+	/*
+	 * fine for now
+	 */
+	pd->settings.link_loss = 7;
+	pd->settings.write_type = 0;	/* packet */
+	pd->settings.track_mode = ti.track_mode;
+
+	/*
+	 * mode1 or mode2 disc
+	 */
+	switch (ti.data_mode) {
+		case PACKET_MODE1:
+			pd->settings.block_mode = PACKET_BLOCK_MODE1;
+			break;
+		case PACKET_MODE2:
+			pd->settings.block_mode = PACKET_BLOCK_MODE2;
+			break;
+		default:
+			pkt_err(pd, "unknown data mode\n");
+			return -EROFS;
+	}
+	return 0;
+}
+
+/*
+ * enable/disable write caching on drive
+ */
+static noinline_for_stack int pkt_write_caching(struct pktcdvd_device *pd,
+						int set)
+{
+	struct packet_command cgc;
+	struct scsi_sense_hdr sshdr;
+	unsigned char buf[64];
+	int ret;
+
+	init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_READ);
+	cgc.sshdr = &sshdr;
+	cgc.buflen = pd->mode_offset + 12;
+
+	/*
+	 * caching mode page might not be there, so quiet this command
+	 */
+	cgc.quiet = 1;
+
+	ret = pkt_mode_sense(pd, &cgc, GPMODE_WCACHING_PAGE, 0);
+	if (ret)
+		return ret;
+
+	buf[pd->mode_offset + 10] |= (!!set << 2);
+
+	cgc.buflen = cgc.cmd[8] = 2 + ((buf[0] << 8) | (buf[1] & 0xff));
+	ret = pkt_mode_select(pd, &cgc);
+	if (ret) {
+		pkt_err(pd, "write caching control failed\n");
+		pkt_dump_sense(pd, &cgc);
+	} else if (!ret && set)
+		pkt_notice(pd, "enabled write caching\n");
+	return ret;
+}
+
+static int pkt_lock_door(struct pktcdvd_device *pd, int lockflag)
+{
+	struct packet_command cgc;
+
+	init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE);
+	cgc.cmd[0] = GPCMD_PREVENT_ALLOW_MEDIUM_REMOVAL;
+	cgc.cmd[4] = lockflag ? 1 : 0;
+	return pkt_generic_packet(pd, &cgc);
+}
+
+/*
+ * Returns drive maximum write speed
+ */
+static noinline_for_stack int pkt_get_max_speed(struct pktcdvd_device *pd,
+						unsigned *write_speed)
+{
+	struct packet_command cgc;
+	struct scsi_sense_hdr sshdr;
+	unsigned char buf[256+18];
+	unsigned char *cap_buf;
+	int ret, offset;
+
+	cap_buf = &buf[sizeof(struct mode_page_header) + pd->mode_offset];
+	init_cdrom_command(&cgc, buf, sizeof(buf), CGC_DATA_UNKNOWN);
+	cgc.sshdr = &sshdr;
+
+	ret = pkt_mode_sense(pd, &cgc, GPMODE_CAPABILITIES_PAGE, 0);
+	if (ret) {
+		cgc.buflen = pd->mode_offset + cap_buf[1] + 2 +
+			     sizeof(struct mode_page_header);
+		ret = pkt_mode_sense(pd, &cgc, GPMODE_CAPABILITIES_PAGE, 0);
+		if (ret) {
+			pkt_dump_sense(pd, &cgc);
+			return ret;
+		}
+	}
+
+	offset = 20;			    /* Obsoleted field, used by older drives */
+	if (cap_buf[1] >= 28)
+		offset = 28;		    /* Current write speed selected */
+	if (cap_buf[1] >= 30) {
+		/* If the drive reports at least one "Logical Unit Write
+		 * Speed Performance Descriptor Block", use the information
+		 * in the first block. (contains the highest speed)
+		 */
+		int num_spdb = (cap_buf[30] << 8) + cap_buf[31];
+		if (num_spdb > 0)
+			offset = 34;
+	}
+
+	*write_speed = (cap_buf[offset] << 8) | cap_buf[offset + 1];
+	return 0;
+}
+
+/* These tables from cdrecord - I don't have orange book */
+/* standard speed CD-RW (1-4x) */
+static char clv_to_speed[16] = {
+	/* 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 */
+	   0, 2, 4, 6, 8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+/* high speed CD-RW (-10x) */
+static char hs_clv_to_speed[16] = {
+	/* 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 */
+	   0, 2, 4, 6, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+/* ultra high speed CD-RW */
+static char us_clv_to_speed[16] = {
+	/* 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 */
+	   0, 2, 4, 8, 0, 0,16, 0,24,32,40,48, 0, 0, 0, 0
+};
+
+/*
+ * reads the maximum media speed from ATIP
+ */
+static noinline_for_stack int pkt_media_speed(struct pktcdvd_device *pd,
+						unsigned *speed)
+{
+	struct packet_command cgc;
+	struct scsi_sense_hdr sshdr;
+	unsigned char buf[64];
+	unsigned int size, st, sp;
+	int ret;
+
+	init_cdrom_command(&cgc, buf, 2, CGC_DATA_READ);
+	cgc.sshdr = &sshdr;
+	cgc.cmd[0] = GPCMD_READ_TOC_PMA_ATIP;
+	cgc.cmd[1] = 2;
+	cgc.cmd[2] = 4; /* READ ATIP */
+	cgc.cmd[8] = 2;
+	ret = pkt_generic_packet(pd, &cgc);
+	if (ret) {
+		pkt_dump_sense(pd, &cgc);
+		return ret;
+	}
+	size = ((unsigned int) buf[0]<<8) + buf[1] + 2;
+	if (size > sizeof(buf))
+		size = sizeof(buf);
+
+	init_cdrom_command(&cgc, buf, size, CGC_DATA_READ);
+	cgc.sshdr = &sshdr;
+	cgc.cmd[0] = GPCMD_READ_TOC_PMA_ATIP;
+	cgc.cmd[1] = 2;
+	cgc.cmd[2] = 4;
+	cgc.cmd[8] = size;
+	ret = pkt_generic_packet(pd, &cgc);
+	if (ret) {
+		pkt_dump_sense(pd, &cgc);
+		return ret;
+	}
+
+	if (!(buf[6] & 0x40)) {
+		pkt_notice(pd, "disc type is not CD-RW\n");
+		return 1;
+	}
+	if (!(buf[6] & 0x4)) {
+		pkt_notice(pd, "A1 values on media are not valid, maybe not CDRW?\n");
+		return 1;
+	}
+
+	st = (buf[6] >> 3) & 0x7; /* disc sub-type */
+
+	sp = buf[16] & 0xf; /* max speed from ATIP A1 field */
+
+	/* Info from cdrecord */
+	switch (st) {
+		case 0: /* standard speed */
+			*speed = clv_to_speed[sp];
+			break;
+		case 1: /* high speed */
+			*speed = hs_clv_to_speed[sp];
+			break;
+		case 2: /* ultra high speed */
+			*speed = us_clv_to_speed[sp];
+			break;
+		default:
+			pkt_notice(pd, "unknown disc sub-type %d\n", st);
+			return 1;
+	}
+	if (*speed) {
+		pkt_info(pd, "maximum media speed: %d\n", *speed);
+		return 0;
+	} else {
+		pkt_notice(pd, "unknown speed %d for sub-type %d\n", sp, st);
+		return 1;
+	}
+}
+
+static noinline_for_stack int pkt_perform_opc(struct pktcdvd_device *pd)
+{
+	struct packet_command cgc;
+	struct scsi_sense_hdr sshdr;
+	int ret;
+
+	pkt_dbg(2, pd, "Performing OPC\n");
+
+	init_cdrom_command(&cgc, NULL, 0, CGC_DATA_NONE);
+	cgc.sshdr = &sshdr;
+	cgc.timeout = 60*HZ;
+	cgc.cmd[0] = GPCMD_SEND_OPC;
+	cgc.cmd[1] = 1;
+	ret = pkt_generic_packet(pd, &cgc);
+	if (ret)
+		pkt_dump_sense(pd, &cgc);
+	return ret;
+}
+
+static int pkt_open_write(struct pktcdvd_device *pd)
+{
+	int ret;
+	unsigned int write_speed, media_write_speed, read_speed;
+
+	ret = pkt_probe_settings(pd);
+	if (ret) {
+		pkt_dbg(2, pd, "failed probe\n");
+		return ret;
+	}
+
+	ret = pkt_set_write_settings(pd);
+	if (ret) {
+		pkt_dbg(1, pd, "failed saving write settings\n");
+		return -EIO;
+	}
+
+	pkt_write_caching(pd, USE_WCACHING);
+
+	ret = pkt_get_max_speed(pd, &write_speed);
+	if (ret)
+		write_speed = 16 * 177;
+	switch (pd->mmc3_profile) {
+		case 0x13: /* DVD-RW */
+		case 0x1a: /* DVD+RW */
+		case 0x12: /* DVD-RAM */
+			pkt_dbg(1, pd, "write speed %ukB/s\n", write_speed);
+			break;
+		default:
+			ret = pkt_media_speed(pd, &media_write_speed);
+			if (ret)
+				media_write_speed = 16;
+			write_speed = min(write_speed, media_write_speed * 177);
+			pkt_dbg(1, pd, "write speed %ux\n", write_speed / 176);
+			break;
+	}
+	read_speed = write_speed;
+
+	ret = pkt_set_speed(pd, write_speed, read_speed);
+	if (ret) {
+		pkt_dbg(1, pd, "couldn't set write speed\n");
+		return -EIO;
+	}
+	pd->write_speed = write_speed;
+	pd->read_speed = read_speed;
+
+	ret = pkt_perform_opc(pd);
+	if (ret) {
+		pkt_dbg(1, pd, "Optimum Power Calibration failed\n");
+	}
+
+	return 0;
+}
+
+/*
+ * called at open time.
+ */
+static int pkt_open_dev(struct pktcdvd_device *pd, fmode_t write)
+{
+	int ret;
+	long lba;
+	struct request_queue *q;
+	struct block_device *bdev;
+
+	/*
+	 * We need to re-open the cdrom device without O_NONBLOCK to be able
+	 * to read/write from/to it. It is already opened in O_NONBLOCK mode
+	 * so open should not fail.
+	 */
+	bdev = blkdev_get_by_dev(pd->bdev->bd_dev, FMODE_READ | FMODE_EXCL, pd);
+	if (IS_ERR(bdev)) {
+		ret = PTR_ERR(bdev);
+		goto out;
+	}
+
+	ret = pkt_get_last_written(pd, &lba);
+	if (ret) {
+		pkt_err(pd, "pkt_get_last_written failed\n");
+		goto out_putdev;
+	}
+
+	set_capacity(pd->disk, lba << 2);
+	set_capacity_and_notify(pd->bdev->bd_disk, lba << 2);
+
+	q = bdev_get_queue(pd->bdev);
+	if (write) {
+		ret = pkt_open_write(pd);
+		if (ret)
+			goto out_putdev;
+		/*
+		 * Some CDRW drives can not handle writes larger than one packet,
+		 * even if the size is a multiple of the packet size.
+		 */
+		blk_queue_max_hw_sectors(q, pd->settings.size);
+		set_bit(PACKET_WRITABLE, &pd->flags);
+	} else {
+		pkt_set_speed(pd, MAX_SPEED, MAX_SPEED);
+		clear_bit(PACKET_WRITABLE, &pd->flags);
+	}
+
+	ret = pkt_set_segment_merging(pd, q);
+	if (ret)
+		goto out_putdev;
+
+	if (write) {
+		if (!pkt_grow_pktlist(pd, CONFIG_CDROM_PKTCDVD_BUFFERS)) {
+			pkt_err(pd, "not enough memory for buffers\n");
+			ret = -ENOMEM;
+			goto out_putdev;
+		}
+		pkt_info(pd, "%lukB available on disc\n", lba << 1);
+	}
+
+	return 0;
+
+out_putdev:
+	blkdev_put(bdev, FMODE_READ | FMODE_EXCL);
+out:
+	return ret;
+}
+
+/*
+ * called when the device is closed. makes sure that the device flushes
+ * the internal cache before we close.
+ */
+static void pkt_release_dev(struct pktcdvd_device *pd, int flush)
+{
+	if (flush && pkt_flush_cache(pd))
+		pkt_dbg(1, pd, "not flushing cache\n");
+
+	pkt_lock_door(pd, 0);
+
+	pkt_set_speed(pd, MAX_SPEED, MAX_SPEED);
+	blkdev_put(pd->bdev, FMODE_READ | FMODE_EXCL);
+
+	pkt_shrink_pktlist(pd);
+}
+
+static struct pktcdvd_device *pkt_find_dev_from_minor(unsigned int dev_minor)
+{
+	if (dev_minor >= MAX_WRITERS)
+		return NULL;
+
+	dev_minor = array_index_nospec(dev_minor, MAX_WRITERS);
+	return pkt_devs[dev_minor];
+}
+
+static int pkt_open(struct block_device *bdev, fmode_t mode)
+{
+	struct pktcdvd_device *pd = NULL;
+	int ret;
+
+	mutex_lock(&pktcdvd_mutex);
+	mutex_lock(&ctl_mutex);
+	pd = pkt_find_dev_from_minor(MINOR(bdev->bd_dev));
+	if (!pd) {
+		ret = -ENODEV;
+		goto out;
+	}
+	BUG_ON(pd->refcnt < 0);
+
+	pd->refcnt++;
+	if (pd->refcnt > 1) {
+		if ((mode & FMODE_WRITE) &&
+		    !test_bit(PACKET_WRITABLE, &pd->flags)) {
+			ret = -EBUSY;
+			goto out_dec;
+		}
+	} else {
+		ret = pkt_open_dev(pd, mode & FMODE_WRITE);
+		if (ret)
+			goto out_dec;
+		/*
+		 * needed here as well, since ext2 (among others) may change
+		 * the blocksize at mount time
+		 */
+		set_blocksize(bdev, CD_FRAMESIZE);
+	}
+
+	mutex_unlock(&ctl_mutex);
+	mutex_unlock(&pktcdvd_mutex);
+	return 0;
+
+out_dec:
+	pd->refcnt--;
+out:
+	mutex_unlock(&ctl_mutex);
+	mutex_unlock(&pktcdvd_mutex);
+	return ret;
+}
+
+static void pkt_close(struct gendisk *disk, fmode_t mode)
+{
+	struct pktcdvd_device *pd = disk->private_data;
+
+	mutex_lock(&pktcdvd_mutex);
+	mutex_lock(&ctl_mutex);
+	pd->refcnt--;
+	BUG_ON(pd->refcnt < 0);
+	if (pd->refcnt == 0) {
+		int flush = test_bit(PACKET_WRITABLE, &pd->flags);
+		pkt_release_dev(pd, flush);
+	}
+	mutex_unlock(&ctl_mutex);
+	mutex_unlock(&pktcdvd_mutex);
+}
+
+
+static void pkt_end_io_read_cloned(struct bio *bio)
+{
+	struct packet_stacked_data *psd = bio->bi_private;
+	struct pktcdvd_device *pd = psd->pd;
+
+	psd->bio->bi_status = bio->bi_status;
+	bio_put(bio);
+	bio_endio(psd->bio);
+	mempool_free(psd, &psd_pool);
+	pkt_bio_finished(pd);
+}
+
+static void pkt_make_request_read(struct pktcdvd_device *pd, struct bio *bio)
+{
+	struct bio *cloned_bio =
+		bio_alloc_clone(pd->bdev, bio, GFP_NOIO, &pkt_bio_set);
+	struct packet_stacked_data *psd = mempool_alloc(&psd_pool, GFP_NOIO);
+
+	psd->pd = pd;
+	psd->bio = bio;
+	cloned_bio->bi_private = psd;
+	cloned_bio->bi_end_io = pkt_end_io_read_cloned;
+	pd->stats.secs_r += bio_sectors(bio);
+	pkt_queue_bio(pd, cloned_bio);
+}
+
+static void pkt_make_request_write(struct request_queue *q, struct bio *bio)
+{
+	struct pktcdvd_device *pd = q->queuedata;
+	sector_t zone;
+	struct packet_data *pkt;
+	int was_empty, blocked_bio;
+	struct pkt_rb_node *node;
+
+	zone = get_zone(bio->bi_iter.bi_sector, pd);
+
+	/*
+	 * If we find a matching packet in state WAITING or READ_WAIT, we can
+	 * just append this bio to that packet.
+	 */
+	spin_lock(&pd->cdrw.active_list_lock);
+	blocked_bio = 0;
+	list_for_each_entry(pkt, &pd->cdrw.pkt_active_list, list) {
+		if (pkt->sector == zone) {
+			spin_lock(&pkt->lock);
+			if ((pkt->state == PACKET_WAITING_STATE) ||
+			    (pkt->state == PACKET_READ_WAIT_STATE)) {
+				bio_list_add(&pkt->orig_bios, bio);
+				pkt->write_size +=
+					bio->bi_iter.bi_size / CD_FRAMESIZE;
+				if ((pkt->write_size >= pkt->frames) &&
+				    (pkt->state == PACKET_WAITING_STATE)) {
+					atomic_inc(&pkt->run_sm);
+					wake_up(&pd->wqueue);
+				}
+				spin_unlock(&pkt->lock);
+				spin_unlock(&pd->cdrw.active_list_lock);
+				return;
+			} else {
+				blocked_bio = 1;
+			}
+			spin_unlock(&pkt->lock);
+		}
+	}
+	spin_unlock(&pd->cdrw.active_list_lock);
+
+	/*
+	 * Test if there is enough room left in the bio work queue
+	 * (queue size >= congestion on mark).
+	 * If not, wait till the work queue size is below the congestion off mark.
+	 */
+	spin_lock(&pd->lock);
+	if (pd->write_congestion_on > 0
+	    && pd->bio_queue_size >= pd->write_congestion_on) {
+		struct wait_bit_queue_entry wqe;
+
+		init_wait_var_entry(&wqe, &pd->congested, 0);
+		for (;;) {
+			prepare_to_wait_event(__var_waitqueue(&pd->congested),
+					      &wqe.wq_entry,
+					      TASK_UNINTERRUPTIBLE);
+			if (pd->bio_queue_size <= pd->write_congestion_off)
+				break;
+			pd->congested = true;
+			spin_unlock(&pd->lock);
+			schedule();
+			spin_lock(&pd->lock);
+		}
+	}
+	spin_unlock(&pd->lock);
+
+	/*
+	 * No matching packet found. Store the bio in the work queue.
+	 */
+	node = mempool_alloc(&pd->rb_pool, GFP_NOIO);
+	node->bio = bio;
+	spin_lock(&pd->lock);
+	BUG_ON(pd->bio_queue_size < 0);
+	was_empty = (pd->bio_queue_size == 0);
+	pkt_rbtree_insert(pd, node);
+	spin_unlock(&pd->lock);
+
+	/*
+	 * Wake up the worker thread.
+	 */
+	atomic_set(&pd->scan_queue, 1);
+	if (was_empty) {
+		/* This wake_up is required for correct operation */
+		wake_up(&pd->wqueue);
+	} else if (!list_empty(&pd->cdrw.pkt_free_list) && !blocked_bio) {
+		/*
+		 * This wake up is not required for correct operation,
+		 * but improves performance in some cases.
+		 */
+		wake_up(&pd->wqueue);
+	}
+}
+
+static void pkt_submit_bio(struct bio *bio)
+{
+	struct pktcdvd_device *pd = bio->bi_bdev->bd_disk->queue->queuedata;
+	struct bio *split;
+
+	bio = bio_split_to_limits(bio);
+	if (!bio)
+		return;
+
+	pkt_dbg(2, pd, "start = %6llx stop = %6llx\n",
+		(unsigned long long)bio->bi_iter.bi_sector,
+		(unsigned long long)bio_end_sector(bio));
+
+	/*
+	 * Clone READ bios so we can have our own bi_end_io callback.
+	 */
+	if (bio_data_dir(bio) == READ) {
+		pkt_make_request_read(pd, bio);
+		return;
+	}
+
+	if (!test_bit(PACKET_WRITABLE, &pd->flags)) {
+		pkt_notice(pd, "WRITE for ro device (%llu)\n",
+			   (unsigned long long)bio->bi_iter.bi_sector);
+		goto end_io;
+	}
+
+	if (!bio->bi_iter.bi_size || (bio->bi_iter.bi_size % CD_FRAMESIZE)) {
+		pkt_err(pd, "wrong bio size\n");
+		goto end_io;
+	}
+
+	do {
+		sector_t zone = get_zone(bio->bi_iter.bi_sector, pd);
+		sector_t last_zone = get_zone(bio_end_sector(bio) - 1, pd);
+
+		if (last_zone != zone) {
+			BUG_ON(last_zone != zone + pd->settings.size);
+
+			split = bio_split(bio, last_zone -
+					  bio->bi_iter.bi_sector,
+					  GFP_NOIO, &pkt_bio_set);
+			bio_chain(split, bio);
+		} else {
+			split = bio;
+		}
+
+		pkt_make_request_write(bio->bi_bdev->bd_disk->queue, split);
+	} while (split != bio);
+
+	return;
+end_io:
+	bio_io_error(bio);
+}
+
+static void pkt_init_queue(struct pktcdvd_device *pd)
+{
+	struct request_queue *q = pd->disk->queue;
+
+	blk_queue_logical_block_size(q, CD_FRAMESIZE);
+	blk_queue_max_hw_sectors(q, PACKET_MAX_SECTORS);
+	q->queuedata = pd;
+}
+
+static int pkt_seq_show(struct seq_file *m, void *p)
+{
+	struct pktcdvd_device *pd = m->private;
+	char *msg;
+	int states[PACKET_NUM_STATES];
+
+	seq_printf(m, "Writer %s mapped to %pg:\n", pd->name, pd->bdev);
+
+	seq_printf(m, "\nSettings:\n");
+	seq_printf(m, "\tpacket size:\t\t%dkB\n", pd->settings.size / 2);
+
+	if (pd->settings.write_type == 0)
+		msg = "Packet";
+	else
+		msg = "Unknown";
+	seq_printf(m, "\twrite type:\t\t%s\n", msg);
+
+	seq_printf(m, "\tpacket type:\t\t%s\n", pd->settings.fp ? "Fixed" : "Variable");
+	seq_printf(m, "\tlink loss:\t\t%d\n", pd->settings.link_loss);
+
+	seq_printf(m, "\ttrack mode:\t\t%d\n", pd->settings.track_mode);
+
+	if (pd->settings.block_mode == PACKET_BLOCK_MODE1)
+		msg = "Mode 1";
+	else if (pd->settings.block_mode == PACKET_BLOCK_MODE2)
+		msg = "Mode 2";
+	else
+		msg = "Unknown";
+	seq_printf(m, "\tblock mode:\t\t%s\n", msg);
+
+	seq_printf(m, "\nStatistics:\n");
+	seq_printf(m, "\tpackets started:\t%lu\n", pd->stats.pkt_started);
+	seq_printf(m, "\tpackets ended:\t\t%lu\n", pd->stats.pkt_ended);
+	seq_printf(m, "\twritten:\t\t%lukB\n", pd->stats.secs_w >> 1);
+	seq_printf(m, "\tread gather:\t\t%lukB\n", pd->stats.secs_rg >> 1);
+	seq_printf(m, "\tread:\t\t\t%lukB\n", pd->stats.secs_r >> 1);
+
+	seq_printf(m, "\nMisc:\n");
+	seq_printf(m, "\treference count:\t%d\n", pd->refcnt);
+	seq_printf(m, "\tflags:\t\t\t0x%lx\n", pd->flags);
+	seq_printf(m, "\tread speed:\t\t%ukB/s\n", pd->read_speed);
+	seq_printf(m, "\twrite speed:\t\t%ukB/s\n", pd->write_speed);
+	seq_printf(m, "\tstart offset:\t\t%lu\n", pd->offset);
+	seq_printf(m, "\tmode page offset:\t%u\n", pd->mode_offset);
+
+	seq_printf(m, "\nQueue state:\n");
+	seq_printf(m, "\tbios queued:\t\t%d\n", pd->bio_queue_size);
+	seq_printf(m, "\tbios pending:\t\t%d\n", atomic_read(&pd->cdrw.pending_bios));
+	seq_printf(m, "\tcurrent sector:\t\t0x%llx\n", (unsigned long long)pd->current_sector);
+
+	pkt_count_states(pd, states);
+	seq_printf(m, "\tstate:\t\t\ti:%d ow:%d rw:%d ww:%d rec:%d fin:%d\n",
+		   states[0], states[1], states[2], states[3], states[4], states[5]);
+
+	seq_printf(m, "\twrite congestion marks:\toff=%d on=%d\n",
+			pd->write_congestion_off,
+			pd->write_congestion_on);
+	return 0;
+}
+
+static int pkt_new_dev(struct pktcdvd_device *pd, dev_t dev)
+{
+	int i;
+	struct block_device *bdev;
+	struct scsi_device *sdev;
+
+	if (pd->pkt_dev == dev) {
+		pkt_err(pd, "recursive setup not allowed\n");
+		return -EBUSY;
+	}
+	for (i = 0; i < MAX_WRITERS; i++) {
+		struct pktcdvd_device *pd2 = pkt_devs[i];
+		if (!pd2)
+			continue;
+		if (pd2->bdev->bd_dev == dev) {
+			pkt_err(pd, "%pg already setup\n", pd2->bdev);
+			return -EBUSY;
+		}
+		if (pd2->pkt_dev == dev) {
+			pkt_err(pd, "can't chain pktcdvd devices\n");
+			return -EBUSY;
+		}
+	}
+
+	bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_NDELAY, NULL);
+	if (IS_ERR(bdev))
+		return PTR_ERR(bdev);
+	sdev = scsi_device_from_queue(bdev->bd_disk->queue);
+	if (!sdev) {
+		blkdev_put(bdev, FMODE_READ | FMODE_NDELAY);
+		return -EINVAL;
+	}
+	put_device(&sdev->sdev_gendev);
+
+	/* This is safe, since we have a reference from open(). */
+	__module_get(THIS_MODULE);
+
+	pd->bdev = bdev;
+	set_blocksize(bdev, CD_FRAMESIZE);
+
+	pkt_init_queue(pd);
+
+	atomic_set(&pd->cdrw.pending_bios, 0);
+	pd->cdrw.thread = kthread_run(kcdrwd, pd, "%s", pd->name);
+	if (IS_ERR(pd->cdrw.thread)) {
+		pkt_err(pd, "can't start kernel thread\n");
+		goto out_mem;
+	}
+
+	proc_create_single_data(pd->name, 0, pkt_proc, pkt_seq_show, pd);
+	pkt_dbg(1, pd, "writer mapped to %pg\n", bdev);
+	return 0;
+
+out_mem:
+	blkdev_put(bdev, FMODE_READ | FMODE_NDELAY);
+	/* This is safe: open() is still holding a reference. */
+	module_put(THIS_MODULE);
+	return -ENOMEM;
+}
+
+static int pkt_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg)
+{
+	struct pktcdvd_device *pd = bdev->bd_disk->private_data;
+	int ret;
+
+	pkt_dbg(2, pd, "cmd %x, dev %d:%d\n",
+		cmd, MAJOR(bdev->bd_dev), MINOR(bdev->bd_dev));
+
+	mutex_lock(&pktcdvd_mutex);
+	switch (cmd) {
+	case CDROMEJECT:
+		/*
+		 * The door gets locked when the device is opened, so we
+		 * have to unlock it or else the eject command fails.
+		 */
+		if (pd->refcnt == 1)
+			pkt_lock_door(pd, 0);
+		fallthrough;
+	/*
+	 * forward selected CDROM ioctls to CD-ROM, for UDF
+	 */
+	case CDROMMULTISESSION:
+	case CDROMREADTOCENTRY:
+	case CDROM_LAST_WRITTEN:
+	case CDROM_SEND_PACKET:
+	case SCSI_IOCTL_SEND_COMMAND:
+		if (!bdev->bd_disk->fops->ioctl)
+			ret = -ENOTTY;
+		else
+			ret = bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg);
+		break;
+	default:
+		pkt_dbg(2, pd, "Unknown ioctl (%x)\n", cmd);
+		ret = -ENOTTY;
+	}
+	mutex_unlock(&pktcdvd_mutex);
+
+	return ret;
+}
+
+static unsigned int pkt_check_events(struct gendisk *disk,
+				     unsigned int clearing)
+{
+	struct pktcdvd_device *pd = disk->private_data;
+	struct gendisk *attached_disk;
+
+	if (!pd)
+		return 0;
+	if (!pd->bdev)
+		return 0;
+	attached_disk = pd->bdev->bd_disk;
+	if (!attached_disk || !attached_disk->fops->check_events)
+		return 0;
+	return attached_disk->fops->check_events(attached_disk, clearing);
+}
+
+static char *pkt_devnode(struct gendisk *disk, umode_t *mode)
+{
+	return kasprintf(GFP_KERNEL, "pktcdvd/%s", disk->disk_name);
+}
+
+static const struct block_device_operations pktcdvd_ops = {
+	.owner =		THIS_MODULE,
+	.submit_bio =		pkt_submit_bio,
+	.open =			pkt_open,
+	.release =		pkt_close,
+	.ioctl =		pkt_ioctl,
+	.compat_ioctl =		blkdev_compat_ptr_ioctl,
+	.check_events =		pkt_check_events,
+	.devnode =		pkt_devnode,
+};
+
+/*
+ * Set up mapping from pktcdvd device to CD-ROM device.
+ */
+static int pkt_setup_dev(dev_t dev, dev_t* pkt_dev)
+{
+	int idx;
+	int ret = -ENOMEM;
+	struct pktcdvd_device *pd;
+	struct gendisk *disk;
+
+	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+	for (idx = 0; idx < MAX_WRITERS; idx++)
+		if (!pkt_devs[idx])
+			break;
+	if (idx == MAX_WRITERS) {
+		pr_err("max %d writers supported\n", MAX_WRITERS);
+		ret = -EBUSY;
+		goto out_mutex;
+	}
+
+	pd = kzalloc(sizeof(struct pktcdvd_device), GFP_KERNEL);
+	if (!pd)
+		goto out_mutex;
+
+	ret = mempool_init_kmalloc_pool(&pd->rb_pool, PKT_RB_POOL_SIZE,
+					sizeof(struct pkt_rb_node));
+	if (ret)
+		goto out_mem;
+
+	INIT_LIST_HEAD(&pd->cdrw.pkt_free_list);
+	INIT_LIST_HEAD(&pd->cdrw.pkt_active_list);
+	spin_lock_init(&pd->cdrw.active_list_lock);
+
+	spin_lock_init(&pd->lock);
+	spin_lock_init(&pd->iosched.lock);
+	bio_list_init(&pd->iosched.read_queue);
+	bio_list_init(&pd->iosched.write_queue);
+	sprintf(pd->name, DRIVER_NAME"%d", idx);
+	init_waitqueue_head(&pd->wqueue);
+	pd->bio_queue = RB_ROOT;
+
+	pd->write_congestion_on  = write_congestion_on;
+	pd->write_congestion_off = write_congestion_off;
+
+	ret = -ENOMEM;
+	disk = blk_alloc_disk(NUMA_NO_NODE);
+	if (!disk)
+		goto out_mem;
+	pd->disk = disk;
+	disk->major = pktdev_major;
+	disk->first_minor = idx;
+	disk->minors = 1;
+	disk->fops = &pktcdvd_ops;
+	disk->flags = GENHD_FL_REMOVABLE | GENHD_FL_NO_PART;
+	strcpy(disk->disk_name, pd->name);
+	disk->private_data = pd;
+
+	pd->pkt_dev = MKDEV(pktdev_major, idx);
+	ret = pkt_new_dev(pd, dev);
+	if (ret)
+		goto out_mem2;
+
+	/* inherit events of the host device */
+	disk->events = pd->bdev->bd_disk->events;
+
+	ret = add_disk(disk);
+	if (ret)
+		goto out_mem2;
+
+	pkt_sysfs_dev_new(pd);
+	pkt_debugfs_dev_new(pd);
+
+	pkt_devs[idx] = pd;
+	if (pkt_dev)
+		*pkt_dev = pd->pkt_dev;
+
+	mutex_unlock(&ctl_mutex);
+	return 0;
+
+out_mem2:
+	put_disk(disk);
+out_mem:
+	mempool_exit(&pd->rb_pool);
+	kfree(pd);
+out_mutex:
+	mutex_unlock(&ctl_mutex);
+	pr_err("setup of pktcdvd device failed\n");
+	return ret;
+}
+
+/*
+ * Tear down mapping from pktcdvd device to CD-ROM device.
+ */
+static int pkt_remove_dev(dev_t pkt_dev)
+{
+	struct pktcdvd_device *pd;
+	int idx;
+	int ret = 0;
+
+	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+	for (idx = 0; idx < MAX_WRITERS; idx++) {
+		pd = pkt_devs[idx];
+		if (pd && (pd->pkt_dev == pkt_dev))
+			break;
+	}
+	if (idx == MAX_WRITERS) {
+		pr_debug("dev not setup\n");
+		ret = -ENXIO;
+		goto out;
+	}
+
+	if (pd->refcnt > 0) {
+		ret = -EBUSY;
+		goto out;
+	}
+	if (!IS_ERR(pd->cdrw.thread))
+		kthread_stop(pd->cdrw.thread);
+
+	pkt_devs[idx] = NULL;
+
+	pkt_debugfs_dev_remove(pd);
+	pkt_sysfs_dev_remove(pd);
+
+	blkdev_put(pd->bdev, FMODE_READ | FMODE_NDELAY);
+
+	remove_proc_entry(pd->name, pkt_proc);
+	pkt_dbg(1, pd, "writer unmapped\n");
+
+	del_gendisk(pd->disk);
+	put_disk(pd->disk);
+
+	mempool_exit(&pd->rb_pool);
+	kfree(pd);
+
+	/* This is safe: open() is still holding a reference. */
+	module_put(THIS_MODULE);
+
+out:
+	mutex_unlock(&ctl_mutex);
+	return ret;
+}
+
+static void pkt_get_status(struct pkt_ctrl_command *ctrl_cmd)
+{
+	struct pktcdvd_device *pd;
+
+	mutex_lock_nested(&ctl_mutex, SINGLE_DEPTH_NESTING);
+
+	pd = pkt_find_dev_from_minor(ctrl_cmd->dev_index);
+	if (pd) {
+		ctrl_cmd->dev = new_encode_dev(pd->bdev->bd_dev);
+		ctrl_cmd->pkt_dev = new_encode_dev(pd->pkt_dev);
+	} else {
+		ctrl_cmd->dev = 0;
+		ctrl_cmd->pkt_dev = 0;
+	}
+	ctrl_cmd->num_devices = MAX_WRITERS;
+
+	mutex_unlock(&ctl_mutex);
+}
+
+static long pkt_ctl_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	void __user *argp = (void __user *)arg;
+	struct pkt_ctrl_command ctrl_cmd;
+	int ret = 0;
+	dev_t pkt_dev = 0;
+
+	if (cmd != PACKET_CTRL_CMD)
+		return -ENOTTY;
+
+	if (copy_from_user(&ctrl_cmd, argp, sizeof(struct pkt_ctrl_command)))
+		return -EFAULT;
+
+	switch (ctrl_cmd.command) {
+	case PKT_CTRL_CMD_SETUP:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+		ret = pkt_setup_dev(new_decode_dev(ctrl_cmd.dev), &pkt_dev);
+		ctrl_cmd.pkt_dev = new_encode_dev(pkt_dev);
+		break;
+	case PKT_CTRL_CMD_TEARDOWN:
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+		ret = pkt_remove_dev(new_decode_dev(ctrl_cmd.pkt_dev));
+		break;
+	case PKT_CTRL_CMD_STATUS:
+		pkt_get_status(&ctrl_cmd);
+		break;
+	default:
+		return -ENOTTY;
+	}
+
+	if (copy_to_user(argp, &ctrl_cmd, sizeof(struct pkt_ctrl_command)))
+		return -EFAULT;
+	return ret;
+}
+
+#ifdef CONFIG_COMPAT
+static long pkt_ctl_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	return pkt_ctl_ioctl(file, cmd, (unsigned long)compat_ptr(arg));
+}
+#endif
+
+static const struct file_operations pkt_ctl_fops = {
+	.open		= nonseekable_open,
+	.unlocked_ioctl	= pkt_ctl_ioctl,
+#ifdef CONFIG_COMPAT
+	.compat_ioctl	= pkt_ctl_compat_ioctl,
+#endif
+	.owner		= THIS_MODULE,
+	.llseek		= no_llseek,
+};
+
+static struct miscdevice pkt_misc = {
+	.minor 		= MISC_DYNAMIC_MINOR,
+	.name  		= DRIVER_NAME,
+	.nodename	= "pktcdvd/control",
+	.fops  		= &pkt_ctl_fops
+};
+
+static int __init pkt_init(void)
+{
+	int ret;
+
+	mutex_init(&ctl_mutex);
+
+	ret = mempool_init_kmalloc_pool(&psd_pool, PSD_POOL_SIZE,
+				    sizeof(struct packet_stacked_data));
+	if (ret)
+		return ret;
+	ret = bioset_init(&pkt_bio_set, BIO_POOL_SIZE, 0, 0);
+	if (ret) {
+		mempool_exit(&psd_pool);
+		return ret;
+	}
+
+	ret = register_blkdev(pktdev_major, DRIVER_NAME);
+	if (ret < 0) {
+		pr_err("unable to register block device\n");
+		goto out2;
+	}
+	if (!pktdev_major)
+		pktdev_major = ret;
+
+	ret = pkt_sysfs_init();
+	if (ret)
+		goto out;
+
+	pkt_debugfs_init();
+
+	ret = misc_register(&pkt_misc);
+	if (ret) {
+		pr_err("unable to register misc device\n");
+		goto out_misc;
+	}
+
+	pkt_proc = proc_mkdir("driver/"DRIVER_NAME, NULL);
+
+	return 0;
+
+out_misc:
+	pkt_debugfs_cleanup();
+	pkt_sysfs_cleanup();
+out:
+	unregister_blkdev(pktdev_major, DRIVER_NAME);
+out2:
+	mempool_exit(&psd_pool);
+	bioset_exit(&pkt_bio_set);
+	return ret;
+}
+
+static void __exit pkt_exit(void)
+{
+	remove_proc_entry("driver/"DRIVER_NAME, NULL);
+	misc_deregister(&pkt_misc);
+
+	pkt_debugfs_cleanup();
+	pkt_sysfs_cleanup();
+
+	unregister_blkdev(pktdev_major, DRIVER_NAME);
+	mempool_exit(&psd_pool);
+	bioset_exit(&pkt_bio_set);
+}
+
+MODULE_DESCRIPTION("Packet writing layer for CD/DVD drives");
+MODULE_AUTHOR("Jens Axboe <axboe@suse.de>");
+MODULE_LICENSE("GPL");
+
+module_init(pkt_init);
+module_exit(pkt_exit);
diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c
new file mode 100644
index 000000000..36d7b36c6
--- /dev/null
+++ b/drivers/block/ps3disk.c
@@ -0,0 +1,560 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * PS3 Disk Storage Driver
+ *
+ * Copyright (C) 2007 Sony Computer Entertainment Inc.
+ * Copyright 2007 Sony Corp.
+ */
+
+#include <linux/ata.h>
+#include <linux/blk-mq.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+
+#include <asm/lv1call.h>
+#include <asm/ps3stor.h>
+#include <asm/firmware.h>
+
+
+#define DEVICE_NAME		"ps3disk"
+
+#define BOUNCE_SIZE		(64*1024)
+
+#define PS3DISK_MAX_DISKS	16
+#define PS3DISK_MINORS		16
+
+
+#define PS3DISK_NAME		"ps3d%c"
+
+
+struct ps3disk_private {
+	spinlock_t lock;		/* Request queue spinlock */
+	struct blk_mq_tag_set tag_set;
+	struct gendisk *gendisk;
+	unsigned int blocking_factor;
+	struct request *req;
+	u64 raw_capacity;
+	unsigned char model[ATA_ID_PROD_LEN+1];
+};
+
+
+#define LV1_STORAGE_SEND_ATA_COMMAND	(2)
+#define LV1_STORAGE_ATA_HDDOUT		(0x23)
+
+struct lv1_ata_cmnd_block {
+	u16	features;
+	u16	sector_count;
+	u16	LBA_low;
+	u16	LBA_mid;
+	u16	LBA_high;
+	u8	device;
+	u8	command;
+	u32	is_ext;
+	u32	proto;
+	u32	in_out;
+	u32	size;
+	u64	buffer;
+	u32	arglen;
+};
+
+enum lv1_ata_proto {
+	NON_DATA_PROTO     = 0,
+	PIO_DATA_IN_PROTO  = 1,
+	PIO_DATA_OUT_PROTO = 2,
+	DMA_PROTO = 3
+};
+
+enum lv1_ata_in_out {
+	DIR_WRITE = 0,			/* memory -> device */
+	DIR_READ = 1			/* device -> memory */
+};
+
+static int ps3disk_major;
+
+
+static const struct block_device_operations ps3disk_fops = {
+	.owner		= THIS_MODULE,
+};
+
+
+static void ps3disk_scatter_gather(struct ps3_storage_device *dev,
+				   struct request *req, int gather)
+{
+	unsigned int offset = 0;
+	struct req_iterator iter;
+	struct bio_vec bvec;
+
+	rq_for_each_segment(bvec, req, iter) {
+		if (gather)
+			memcpy_from_bvec(dev->bounce_buf + offset, &bvec);
+		else
+			memcpy_to_bvec(&bvec, dev->bounce_buf + offset);
+	}
+}
+
+static blk_status_t ps3disk_submit_request_sg(struct ps3_storage_device *dev,
+					      struct request *req)
+{
+	struct ps3disk_private *priv = ps3_system_bus_get_drvdata(&dev->sbd);
+	int write = rq_data_dir(req), res;
+	const char *op = write ? "write" : "read";
+	u64 start_sector, sectors;
+	unsigned int region_id = dev->regions[dev->region_idx].id;
+
+#ifdef DEBUG
+	unsigned int n = 0;
+	struct bio_vec bv;
+	struct req_iterator iter;
+
+	rq_for_each_segment(bv, req, iter)
+		n++;
+	dev_dbg(&dev->sbd.core,
+		"%s:%u: %s req has %u bvecs for %u sectors\n",
+		__func__, __LINE__, op, n, blk_rq_sectors(req));
+#endif
+
+	start_sector = blk_rq_pos(req) * priv->blocking_factor;
+	sectors = blk_rq_sectors(req) * priv->blocking_factor;
+	dev_dbg(&dev->sbd.core, "%s:%u: %s %llu sectors starting at %llu\n",
+		__func__, __LINE__, op, sectors, start_sector);
+
+	if (write) {
+		ps3disk_scatter_gather(dev, req, 1);
+
+		res = lv1_storage_write(dev->sbd.dev_id, region_id,
+					start_sector, sectors, 0,
+					dev->bounce_lpar, &dev->tag);
+	} else {
+		res = lv1_storage_read(dev->sbd.dev_id, region_id,
+				       start_sector, sectors, 0,
+				       dev->bounce_lpar, &dev->tag);
+	}
+	if (res) {
+		dev_err(&dev->sbd.core, "%s:%u: %s failed %d\n", __func__,
+			__LINE__, op, res);
+		return BLK_STS_IOERR;
+	}
+
+	priv->req = req;
+	return BLK_STS_OK;
+}
+
+static blk_status_t ps3disk_submit_flush_request(struct ps3_storage_device *dev,
+						 struct request *req)
+{
+	struct ps3disk_private *priv = ps3_system_bus_get_drvdata(&dev->sbd);
+	u64 res;
+
+	dev_dbg(&dev->sbd.core, "%s:%u: flush request\n", __func__, __LINE__);
+
+	res = lv1_storage_send_device_command(dev->sbd.dev_id,
+					      LV1_STORAGE_ATA_HDDOUT, 0, 0, 0,
+					      0, &dev->tag);
+	if (res) {
+		dev_err(&dev->sbd.core, "%s:%u: sync cache failed 0x%llx\n",
+			__func__, __LINE__, res);
+		return BLK_STS_IOERR;
+	}
+
+	priv->req = req;
+	return BLK_STS_OK;
+}
+
+static blk_status_t ps3disk_do_request(struct ps3_storage_device *dev,
+				       struct request *req)
+{
+	dev_dbg(&dev->sbd.core, "%s:%u\n", __func__, __LINE__);
+
+	switch (req_op(req)) {
+	case REQ_OP_FLUSH:
+		return ps3disk_submit_flush_request(dev, req);
+	case REQ_OP_READ:
+	case REQ_OP_WRITE:
+		return ps3disk_submit_request_sg(dev, req);
+	default:
+		blk_dump_rq_flags(req, DEVICE_NAME " bad request");
+		return BLK_STS_IOERR;
+	}
+}
+
+static blk_status_t ps3disk_queue_rq(struct blk_mq_hw_ctx *hctx,
+				     const struct blk_mq_queue_data *bd)
+{
+	struct request_queue *q = hctx->queue;
+	struct ps3_storage_device *dev = q->queuedata;
+	struct ps3disk_private *priv = ps3_system_bus_get_drvdata(&dev->sbd);
+	blk_status_t ret;
+
+	blk_mq_start_request(bd->rq);
+
+	spin_lock_irq(&priv->lock);
+	ret = ps3disk_do_request(dev, bd->rq);
+	spin_unlock_irq(&priv->lock);
+
+	return ret;
+}
+
+static irqreturn_t ps3disk_interrupt(int irq, void *data)
+{
+	struct ps3_storage_device *dev = data;
+	struct ps3disk_private *priv;
+	struct request *req;
+	int res, read;
+	blk_status_t error;
+	u64 tag, status;
+	const char *op;
+
+	res = lv1_storage_get_async_status(dev->sbd.dev_id, &tag, &status);
+
+	if (tag != dev->tag)
+		dev_err(&dev->sbd.core,
+			"%s:%u: tag mismatch, got %llx, expected %llx\n",
+			__func__, __LINE__, tag, dev->tag);
+
+	if (res) {
+		dev_err(&dev->sbd.core, "%s:%u: res=%d status=0x%llx\n",
+			__func__, __LINE__, res, status);
+		return IRQ_HANDLED;
+	}
+
+	priv = ps3_system_bus_get_drvdata(&dev->sbd);
+	req = priv->req;
+	if (!req) {
+		dev_dbg(&dev->sbd.core,
+			"%s:%u non-block layer request completed\n", __func__,
+			__LINE__);
+		dev->lv1_status = status;
+		complete(&dev->done);
+		return IRQ_HANDLED;
+	}
+
+	if (req_op(req) == REQ_OP_FLUSH) {
+		read = 0;
+		op = "flush";
+	} else {
+		read = !rq_data_dir(req);
+		op = read ? "read" : "write";
+	}
+	if (status) {
+		dev_dbg(&dev->sbd.core, "%s:%u: %s failed 0x%llx\n", __func__,
+			__LINE__, op, status);
+		error = BLK_STS_IOERR;
+	} else {
+		dev_dbg(&dev->sbd.core, "%s:%u: %s completed\n", __func__,
+			__LINE__, op);
+		error = 0;
+		if (read)
+			ps3disk_scatter_gather(dev, req, 0);
+	}
+
+	spin_lock(&priv->lock);
+	priv->req = NULL;
+	blk_mq_end_request(req, error);
+	spin_unlock(&priv->lock);
+
+	blk_mq_run_hw_queues(priv->gendisk->queue, true);
+	return IRQ_HANDLED;
+}
+
+static int ps3disk_sync_cache(struct ps3_storage_device *dev)
+{
+	u64 res;
+
+	dev_dbg(&dev->sbd.core, "%s:%u: sync cache\n", __func__, __LINE__);
+
+	res = ps3stor_send_command(dev, LV1_STORAGE_ATA_HDDOUT, 0, 0, 0, 0);
+	if (res) {
+		dev_err(&dev->sbd.core, "%s:%u: sync cache failed 0x%llx\n",
+			__func__, __LINE__, res);
+		return -EIO;
+	}
+	return 0;
+}
+
+
+/* ATA helpers copied from drivers/ata/libata-core.c */
+
+static void swap_buf_le16(u16 *buf, unsigned int buf_words)
+{
+#ifdef __BIG_ENDIAN
+	unsigned int i;
+
+	for (i = 0; i < buf_words; i++)
+		buf[i] = le16_to_cpu(buf[i]);
+#endif /* __BIG_ENDIAN */
+}
+
+static u64 ata_id_n_sectors(const u16 *id)
+{
+	if (ata_id_has_lba(id)) {
+		if (ata_id_has_lba48(id))
+			return ata_id_u64(id, 100);
+		else
+			return ata_id_u32(id, 60);
+	} else {
+		if (ata_id_current_chs_valid(id))
+			return ata_id_u32(id, 57);
+		else
+			return id[1] * id[3] * id[6];
+	}
+}
+
+static void ata_id_string(const u16 *id, unsigned char *s, unsigned int ofs,
+			  unsigned int len)
+{
+	unsigned int c;
+
+	while (len > 0) {
+		c = id[ofs] >> 8;
+		*s = c;
+		s++;
+
+		c = id[ofs] & 0xff;
+		*s = c;
+		s++;
+
+		ofs++;
+		len -= 2;
+	}
+}
+
+static void ata_id_c_string(const u16 *id, unsigned char *s, unsigned int ofs,
+			    unsigned int len)
+{
+	unsigned char *p;
+
+	WARN_ON(!(len & 1));
+
+	ata_id_string(id, s, ofs, len - 1);
+
+	p = s + strnlen(s, len - 1);
+	while (p > s && p[-1] == ' ')
+		p--;
+	*p = '\0';
+}
+
+static int ps3disk_identify(struct ps3_storage_device *dev)
+{
+	struct ps3disk_private *priv = ps3_system_bus_get_drvdata(&dev->sbd);
+	struct lv1_ata_cmnd_block ata_cmnd;
+	u16 *id = dev->bounce_buf;
+	u64 res;
+
+	dev_dbg(&dev->sbd.core, "%s:%u: identify disk\n", __func__, __LINE__);
+
+	memset(&ata_cmnd, 0, sizeof(struct lv1_ata_cmnd_block));
+	ata_cmnd.command = ATA_CMD_ID_ATA;
+	ata_cmnd.sector_count = 1;
+	ata_cmnd.size = ata_cmnd.arglen = ATA_ID_WORDS * 2;
+	ata_cmnd.buffer = dev->bounce_lpar;
+	ata_cmnd.proto = PIO_DATA_IN_PROTO;
+	ata_cmnd.in_out = DIR_READ;
+
+	res = ps3stor_send_command(dev, LV1_STORAGE_SEND_ATA_COMMAND,
+				   ps3_mm_phys_to_lpar(__pa(&ata_cmnd)),
+				   sizeof(ata_cmnd), ata_cmnd.buffer,
+				   ata_cmnd.arglen);
+	if (res) {
+		dev_err(&dev->sbd.core, "%s:%u: identify disk failed 0x%llx\n",
+			__func__, __LINE__, res);
+		return -EIO;
+	}
+
+	swap_buf_le16(id, ATA_ID_WORDS);
+
+	/* All we're interested in are raw capacity and model name */
+	priv->raw_capacity = ata_id_n_sectors(id);
+	ata_id_c_string(id, priv->model, ATA_ID_PROD, sizeof(priv->model));
+	return 0;
+}
+
+static unsigned long ps3disk_mask;
+
+static DEFINE_MUTEX(ps3disk_mask_mutex);
+
+static const struct blk_mq_ops ps3disk_mq_ops = {
+	.queue_rq	= ps3disk_queue_rq,
+};
+
+static int ps3disk_probe(struct ps3_system_bus_device *_dev)
+{
+	struct ps3_storage_device *dev = to_ps3_storage_device(&_dev->core);
+	struct ps3disk_private *priv;
+	int error;
+	unsigned int devidx;
+	struct request_queue *queue;
+	struct gendisk *gendisk;
+
+	if (dev->blk_size < 512) {
+		dev_err(&dev->sbd.core,
+			"%s:%u: cannot handle block size %llu\n", __func__,
+			__LINE__, dev->blk_size);
+		return -EINVAL;
+	}
+
+	BUILD_BUG_ON(PS3DISK_MAX_DISKS > BITS_PER_LONG);
+	mutex_lock(&ps3disk_mask_mutex);
+	devidx = find_first_zero_bit(&ps3disk_mask, PS3DISK_MAX_DISKS);
+	if (devidx >= PS3DISK_MAX_DISKS) {
+		dev_err(&dev->sbd.core, "%s:%u: Too many disks\n", __func__,
+			__LINE__);
+		mutex_unlock(&ps3disk_mask_mutex);
+		return -ENOSPC;
+	}
+	__set_bit(devidx, &ps3disk_mask);
+	mutex_unlock(&ps3disk_mask_mutex);
+
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	if (!priv) {
+		error = -ENOMEM;
+		goto fail;
+	}
+
+	ps3_system_bus_set_drvdata(_dev, priv);
+	spin_lock_init(&priv->lock);
+
+	dev->bounce_size = BOUNCE_SIZE;
+	dev->bounce_buf = kmalloc(BOUNCE_SIZE, GFP_DMA);
+	if (!dev->bounce_buf) {
+		error = -ENOMEM;
+		goto fail_free_priv;
+	}
+
+	error = ps3stor_setup(dev, ps3disk_interrupt);
+	if (error)
+		goto fail_free_bounce;
+
+	ps3disk_identify(dev);
+
+	error = blk_mq_alloc_sq_tag_set(&priv->tag_set, &ps3disk_mq_ops, 1,
+					BLK_MQ_F_SHOULD_MERGE);
+	if (error)
+		goto fail_teardown;
+
+	gendisk = blk_mq_alloc_disk(&priv->tag_set, dev);
+	if (IS_ERR(gendisk)) {
+		dev_err(&dev->sbd.core, "%s:%u: blk_mq_alloc_disk failed\n",
+			__func__, __LINE__);
+		error = PTR_ERR(gendisk);
+		goto fail_free_tag_set;
+	}
+
+	queue = gendisk->queue;
+
+	blk_queue_max_hw_sectors(queue, dev->bounce_size >> 9);
+	blk_queue_dma_alignment(queue, dev->blk_size-1);
+	blk_queue_logical_block_size(queue, dev->blk_size);
+
+	blk_queue_write_cache(queue, true, false);
+
+	blk_queue_max_segments(queue, -1);
+	blk_queue_max_segment_size(queue, dev->bounce_size);
+
+	priv->gendisk = gendisk;
+	gendisk->major = ps3disk_major;
+	gendisk->first_minor = devidx * PS3DISK_MINORS;
+	gendisk->minors = PS3DISK_MINORS;
+	gendisk->fops = &ps3disk_fops;
+	gendisk->private_data = dev;
+	snprintf(gendisk->disk_name, sizeof(gendisk->disk_name), PS3DISK_NAME,
+		 devidx+'a');
+	priv->blocking_factor = dev->blk_size >> 9;
+	set_capacity(gendisk,
+		     dev->regions[dev->region_idx].size*priv->blocking_factor);
+
+	dev_info(&dev->sbd.core,
+		 "%s is a %s (%llu MiB total, %llu MiB for OtherOS)\n",
+		 gendisk->disk_name, priv->model, priv->raw_capacity >> 11,
+		 get_capacity(gendisk) >> 11);
+
+	error = device_add_disk(&dev->sbd.core, gendisk, NULL);
+	if (error)
+		goto fail_cleanup_disk;
+
+	return 0;
+fail_cleanup_disk:
+	put_disk(gendisk);
+fail_free_tag_set:
+	blk_mq_free_tag_set(&priv->tag_set);
+fail_teardown:
+	ps3stor_teardown(dev);
+fail_free_bounce:
+	kfree(dev->bounce_buf);
+fail_free_priv:
+	kfree(priv);
+	ps3_system_bus_set_drvdata(_dev, NULL);
+fail:
+	mutex_lock(&ps3disk_mask_mutex);
+	__clear_bit(devidx, &ps3disk_mask);
+	mutex_unlock(&ps3disk_mask_mutex);
+	return error;
+}
+
+static void ps3disk_remove(struct ps3_system_bus_device *_dev)
+{
+	struct ps3_storage_device *dev = to_ps3_storage_device(&_dev->core);
+	struct ps3disk_private *priv = ps3_system_bus_get_drvdata(&dev->sbd);
+
+	mutex_lock(&ps3disk_mask_mutex);
+	__clear_bit(MINOR(disk_devt(priv->gendisk)) / PS3DISK_MINORS,
+		    &ps3disk_mask);
+	mutex_unlock(&ps3disk_mask_mutex);
+	del_gendisk(priv->gendisk);
+	put_disk(priv->gendisk);
+	blk_mq_free_tag_set(&priv->tag_set);
+	dev_notice(&dev->sbd.core, "Synchronizing disk cache\n");
+	ps3disk_sync_cache(dev);
+	ps3stor_teardown(dev);
+	kfree(dev->bounce_buf);
+	kfree(priv);
+	ps3_system_bus_set_drvdata(_dev, NULL);
+}
+
+static struct ps3_system_bus_driver ps3disk = {
+	.match_id	= PS3_MATCH_ID_STOR_DISK,
+	.core.name	= DEVICE_NAME,
+	.core.owner	= THIS_MODULE,
+	.probe		= ps3disk_probe,
+	.remove		= ps3disk_remove,
+	.shutdown	= ps3disk_remove,
+};
+
+
+static int __init ps3disk_init(void)
+{
+	int error;
+
+	if (!firmware_has_feature(FW_FEATURE_PS3_LV1))
+		return -ENODEV;
+
+	error = register_blkdev(0, DEVICE_NAME);
+	if (error <= 0) {
+		printk(KERN_ERR "%s:%u: register_blkdev failed %d\n", __func__,
+		       __LINE__, error);
+		return error;
+	}
+	ps3disk_major = error;
+
+	pr_info("%s:%u: registered block device major %d\n", __func__,
+		__LINE__, ps3disk_major);
+
+	error = ps3_system_bus_driver_register(&ps3disk);
+	if (error)
+		unregister_blkdev(ps3disk_major, DEVICE_NAME);
+
+	return error;
+}
+
+static void __exit ps3disk_exit(void)
+{
+	ps3_system_bus_driver_unregister(&ps3disk);
+	unregister_blkdev(ps3disk_major, DEVICE_NAME);
+}
+
+module_init(ps3disk_init);
+module_exit(ps3disk_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("PS3 Disk Storage Driver");
+MODULE_AUTHOR("Sony Corporation");
+MODULE_ALIAS(PS3_MODULE_ALIAS_STOR_DISK);
diff --git a/drivers/block/ps3vram.c b/drivers/block/ps3vram.c
new file mode 100644
index 000000000..574e470b2
--- /dev/null
+++ b/drivers/block/ps3vram.c
@@ -0,0 +1,860 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * ps3vram - Use extra PS3 video ram as block device.
+ *
+ * Copyright 2009 Sony Corporation
+ *
+ * Based on the MTD ps3vram driver, which is
+ * Copyright (c) 2007-2008 Jim Paris <jim@jtan.com>
+ * Added support RSX DMA Vivien Chappelier <vivien.chappelier@free.fr>
+ */
+
+#include <linux/blkdev.h>
+#include <linux/delay.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+
+#include <asm/cell-regs.h>
+#include <asm/firmware.h>
+#include <asm/lv1call.h>
+#include <asm/ps3.h>
+#include <asm/ps3gpu.h>
+
+
+#define DEVICE_NAME		"ps3vram"
+
+
+#define XDR_BUF_SIZE (2 * 1024 * 1024) /* XDR buffer (must be 1MiB aligned) */
+#define XDR_IOIF 0x0c000000
+
+#define FIFO_BASE XDR_IOIF
+#define FIFO_SIZE (64 * 1024)
+
+#define DMA_PAGE_SIZE (4 * 1024)
+
+#define CACHE_PAGE_SIZE (256 * 1024)
+#define CACHE_PAGE_COUNT ((XDR_BUF_SIZE - FIFO_SIZE) / CACHE_PAGE_SIZE)
+
+#define CACHE_OFFSET CACHE_PAGE_SIZE
+#define FIFO_OFFSET 0
+
+#define CTRL_PUT 0x10
+#define CTRL_GET 0x11
+#define CTRL_TOP 0x15
+
+#define UPLOAD_SUBCH	1
+#define DOWNLOAD_SUBCH	2
+
+#define NV_MEMORY_TO_MEMORY_FORMAT_OFFSET_IN	0x0000030c
+#define NV_MEMORY_TO_MEMORY_FORMAT_NOTIFY	0x00000104
+
+#define CACHE_PAGE_PRESENT 1
+#define CACHE_PAGE_DIRTY   2
+
+struct ps3vram_tag {
+	unsigned int address;
+	unsigned int flags;
+};
+
+struct ps3vram_cache {
+	unsigned int page_count;
+	unsigned int page_size;
+	struct ps3vram_tag *tags;
+	unsigned int hit;
+	unsigned int miss;
+};
+
+struct ps3vram_priv {
+	struct gendisk *gendisk;
+
+	u64 size;
+
+	u64 memory_handle;
+	u64 context_handle;
+	u32 __iomem *ctrl;
+	void __iomem *reports;
+	u8 *xdr_buf;
+
+	u32 *fifo_base;
+	u32 *fifo_ptr;
+
+	struct ps3vram_cache cache;
+
+	spinlock_t lock;	/* protecting list of bios */
+	struct bio_list list;
+};
+
+
+static int ps3vram_major;
+
+#define DMA_NOTIFIER_HANDLE_BASE 0x66604200 /* first DMA notifier handle */
+#define DMA_NOTIFIER_OFFSET_BASE 0x1000     /* first DMA notifier offset */
+#define DMA_NOTIFIER_SIZE        0x40
+#define NOTIFIER 7	/* notifier used for completion report */
+
+static char *size = "256M";
+module_param(size, charp, 0);
+MODULE_PARM_DESC(size, "memory size");
+
+static u32 __iomem *ps3vram_get_notifier(void __iomem *reports, int notifier)
+{
+	return reports + DMA_NOTIFIER_OFFSET_BASE +
+	       DMA_NOTIFIER_SIZE * notifier;
+}
+
+static void ps3vram_notifier_reset(struct ps3_system_bus_device *dev)
+{
+	struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
+	u32 __iomem *notify = ps3vram_get_notifier(priv->reports, NOTIFIER);
+	int i;
+
+	for (i = 0; i < 4; i++)
+		iowrite32be(0xffffffff, notify + i);
+}
+
+static int ps3vram_notifier_wait(struct ps3_system_bus_device *dev,
+				 unsigned int timeout_ms)
+{
+	struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
+	u32 __iomem *notify = ps3vram_get_notifier(priv->reports, NOTIFIER);
+	unsigned long timeout;
+
+	for (timeout = 20; timeout; timeout--) {
+		if (!ioread32be(notify + 3))
+			return 0;
+		udelay(10);
+	}
+
+	timeout = jiffies + msecs_to_jiffies(timeout_ms);
+
+	do {
+		if (!ioread32be(notify + 3))
+			return 0;
+		msleep(1);
+	} while (time_before(jiffies, timeout));
+
+	return -ETIMEDOUT;
+}
+
+static void ps3vram_init_ring(struct ps3_system_bus_device *dev)
+{
+	struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
+
+	iowrite32be(FIFO_BASE + FIFO_OFFSET, priv->ctrl + CTRL_PUT);
+	iowrite32be(FIFO_BASE + FIFO_OFFSET, priv->ctrl + CTRL_GET);
+}
+
+static int ps3vram_wait_ring(struct ps3_system_bus_device *dev,
+			     unsigned int timeout_ms)
+{
+	struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
+	unsigned long timeout = jiffies + msecs_to_jiffies(timeout_ms);
+
+	do {
+		if (ioread32be(priv->ctrl + CTRL_PUT) == ioread32be(priv->ctrl + CTRL_GET))
+			return 0;
+		msleep(1);
+	} while (time_before(jiffies, timeout));
+
+	dev_warn(&dev->core, "FIFO timeout (%08x/%08x/%08x)\n",
+		 ioread32be(priv->ctrl + CTRL_PUT), ioread32be(priv->ctrl + CTRL_GET),
+		 ioread32be(priv->ctrl + CTRL_TOP));
+
+	return -ETIMEDOUT;
+}
+
+static void ps3vram_out_ring(struct ps3vram_priv *priv, u32 data)
+{
+	*(priv->fifo_ptr)++ = data;
+}
+
+static void ps3vram_begin_ring(struct ps3vram_priv *priv, u32 chan, u32 tag,
+			       u32 size)
+{
+	ps3vram_out_ring(priv, (size << 18) | (chan << 13) | tag);
+}
+
+static void ps3vram_rewind_ring(struct ps3_system_bus_device *dev)
+{
+	struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
+	int status;
+
+	ps3vram_out_ring(priv, 0x20000000 | (FIFO_BASE + FIFO_OFFSET));
+
+	iowrite32be(FIFO_BASE + FIFO_OFFSET, priv->ctrl + CTRL_PUT);
+
+	/* asking the HV for a blit will kick the FIFO */
+	status = lv1_gpu_fb_blit(priv->context_handle, 0, 0, 0, 0);
+	if (status)
+		dev_err(&dev->core, "%s: lv1_gpu_fb_blit failed %d\n",
+			__func__, status);
+
+	priv->fifo_ptr = priv->fifo_base;
+}
+
+static void ps3vram_fire_ring(struct ps3_system_bus_device *dev)
+{
+	struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
+	int status;
+
+	mutex_lock(&ps3_gpu_mutex);
+
+	iowrite32be(FIFO_BASE + FIFO_OFFSET + (priv->fifo_ptr - priv->fifo_base)
+		* sizeof(u32), priv->ctrl + CTRL_PUT);
+
+	/* asking the HV for a blit will kick the FIFO */
+	status = lv1_gpu_fb_blit(priv->context_handle, 0, 0, 0, 0);
+	if (status)
+		dev_err(&dev->core, "%s: lv1_gpu_fb_blit failed %d\n",
+			__func__, status);
+
+	if ((priv->fifo_ptr - priv->fifo_base) * sizeof(u32) >
+	    FIFO_SIZE - 1024) {
+		dev_dbg(&dev->core, "FIFO full, rewinding\n");
+		ps3vram_wait_ring(dev, 200);
+		ps3vram_rewind_ring(dev);
+	}
+
+	mutex_unlock(&ps3_gpu_mutex);
+}
+
+static void ps3vram_bind(struct ps3_system_bus_device *dev)
+{
+	struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
+
+	ps3vram_begin_ring(priv, UPLOAD_SUBCH, 0, 1);
+	ps3vram_out_ring(priv, 0x31337303);
+	ps3vram_begin_ring(priv, UPLOAD_SUBCH, 0x180, 3);
+	ps3vram_out_ring(priv, DMA_NOTIFIER_HANDLE_BASE + NOTIFIER);
+	ps3vram_out_ring(priv, 0xfeed0001);	/* DMA system RAM instance */
+	ps3vram_out_ring(priv, 0xfeed0000);     /* DMA video RAM instance */
+
+	ps3vram_begin_ring(priv, DOWNLOAD_SUBCH, 0, 1);
+	ps3vram_out_ring(priv, 0x3137c0de);
+	ps3vram_begin_ring(priv, DOWNLOAD_SUBCH, 0x180, 3);
+	ps3vram_out_ring(priv, DMA_NOTIFIER_HANDLE_BASE + NOTIFIER);
+	ps3vram_out_ring(priv, 0xfeed0000);	/* DMA video RAM instance */
+	ps3vram_out_ring(priv, 0xfeed0001);	/* DMA system RAM instance */
+
+	ps3vram_fire_ring(dev);
+}
+
+static int ps3vram_upload(struct ps3_system_bus_device *dev,
+			  unsigned int src_offset, unsigned int dst_offset,
+			  int len, int count)
+{
+	struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
+
+	ps3vram_begin_ring(priv, UPLOAD_SUBCH,
+			   NV_MEMORY_TO_MEMORY_FORMAT_OFFSET_IN, 8);
+	ps3vram_out_ring(priv, XDR_IOIF + src_offset);
+	ps3vram_out_ring(priv, dst_offset);
+	ps3vram_out_ring(priv, len);
+	ps3vram_out_ring(priv, len);
+	ps3vram_out_ring(priv, len);
+	ps3vram_out_ring(priv, count);
+	ps3vram_out_ring(priv, (1 << 8) | 1);
+	ps3vram_out_ring(priv, 0);
+
+	ps3vram_notifier_reset(dev);
+	ps3vram_begin_ring(priv, UPLOAD_SUBCH,
+			   NV_MEMORY_TO_MEMORY_FORMAT_NOTIFY, 1);
+	ps3vram_out_ring(priv, 0);
+	ps3vram_begin_ring(priv, UPLOAD_SUBCH, 0x100, 1);
+	ps3vram_out_ring(priv, 0);
+	ps3vram_fire_ring(dev);
+	if (ps3vram_notifier_wait(dev, 200) < 0) {
+		dev_warn(&dev->core, "%s: Notifier timeout\n", __func__);
+		return -1;
+	}
+
+	return 0;
+}
+
+static int ps3vram_download(struct ps3_system_bus_device *dev,
+			    unsigned int src_offset, unsigned int dst_offset,
+			    int len, int count)
+{
+	struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
+
+	ps3vram_begin_ring(priv, DOWNLOAD_SUBCH,
+			   NV_MEMORY_TO_MEMORY_FORMAT_OFFSET_IN, 8);
+	ps3vram_out_ring(priv, src_offset);
+	ps3vram_out_ring(priv, XDR_IOIF + dst_offset);
+	ps3vram_out_ring(priv, len);
+	ps3vram_out_ring(priv, len);
+	ps3vram_out_ring(priv, len);
+	ps3vram_out_ring(priv, count);
+	ps3vram_out_ring(priv, (1 << 8) | 1);
+	ps3vram_out_ring(priv, 0);
+
+	ps3vram_notifier_reset(dev);
+	ps3vram_begin_ring(priv, DOWNLOAD_SUBCH,
+			   NV_MEMORY_TO_MEMORY_FORMAT_NOTIFY, 1);
+	ps3vram_out_ring(priv, 0);
+	ps3vram_begin_ring(priv, DOWNLOAD_SUBCH, 0x100, 1);
+	ps3vram_out_ring(priv, 0);
+	ps3vram_fire_ring(dev);
+	if (ps3vram_notifier_wait(dev, 200) < 0) {
+		dev_warn(&dev->core, "%s: Notifier timeout\n", __func__);
+		return -1;
+	}
+
+	return 0;
+}
+
+static void ps3vram_cache_evict(struct ps3_system_bus_device *dev, int entry)
+{
+	struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
+	struct ps3vram_cache *cache = &priv->cache;
+
+	if (!(cache->tags[entry].flags & CACHE_PAGE_DIRTY))
+		return;
+
+	dev_dbg(&dev->core, "Flushing %d: 0x%08x\n", entry,
+		cache->tags[entry].address);
+	if (ps3vram_upload(dev, CACHE_OFFSET + entry * cache->page_size,
+			   cache->tags[entry].address, DMA_PAGE_SIZE,
+			   cache->page_size / DMA_PAGE_SIZE) < 0) {
+		dev_err(&dev->core,
+			"Failed to upload from 0x%x to " "0x%x size 0x%x\n",
+			entry * cache->page_size, cache->tags[entry].address,
+			cache->page_size);
+	}
+	cache->tags[entry].flags &= ~CACHE_PAGE_DIRTY;
+}
+
+static void ps3vram_cache_load(struct ps3_system_bus_device *dev, int entry,
+			       unsigned int address)
+{
+	struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
+	struct ps3vram_cache *cache = &priv->cache;
+
+	dev_dbg(&dev->core, "Fetching %d: 0x%08x\n", entry, address);
+	if (ps3vram_download(dev, address,
+			     CACHE_OFFSET + entry * cache->page_size,
+			     DMA_PAGE_SIZE,
+			     cache->page_size / DMA_PAGE_SIZE) < 0) {
+		dev_err(&dev->core,
+			"Failed to download from 0x%x to 0x%x size 0x%x\n",
+			address, entry * cache->page_size, cache->page_size);
+	}
+
+	cache->tags[entry].address = address;
+	cache->tags[entry].flags |= CACHE_PAGE_PRESENT;
+}
+
+
+static void ps3vram_cache_flush(struct ps3_system_bus_device *dev)
+{
+	struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
+	struct ps3vram_cache *cache = &priv->cache;
+	int i;
+
+	dev_dbg(&dev->core, "FLUSH\n");
+	for (i = 0; i < cache->page_count; i++) {
+		ps3vram_cache_evict(dev, i);
+		cache->tags[i].flags = 0;
+	}
+}
+
+static unsigned int ps3vram_cache_match(struct ps3_system_bus_device *dev,
+					loff_t address)
+{
+	struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
+	struct ps3vram_cache *cache = &priv->cache;
+	unsigned int base;
+	unsigned int offset;
+	int i;
+	static int counter;
+
+	offset = (unsigned int) (address & (cache->page_size - 1));
+	base = (unsigned int) (address - offset);
+
+	/* fully associative check */
+	for (i = 0; i < cache->page_count; i++) {
+		if ((cache->tags[i].flags & CACHE_PAGE_PRESENT) &&
+		    cache->tags[i].address == base) {
+			cache->hit++;
+			dev_dbg(&dev->core, "Found entry %d: 0x%08x\n", i,
+				cache->tags[i].address);
+			return i;
+		}
+	}
+
+	/* choose a random entry */
+	i = (jiffies + (counter++)) % cache->page_count;
+	dev_dbg(&dev->core, "Using entry %d\n", i);
+
+	ps3vram_cache_evict(dev, i);
+	ps3vram_cache_load(dev, i, base);
+
+	cache->miss++;
+	return i;
+}
+
+static int ps3vram_cache_init(struct ps3_system_bus_device *dev)
+{
+	struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
+
+	priv->cache.page_count = CACHE_PAGE_COUNT;
+	priv->cache.page_size = CACHE_PAGE_SIZE;
+	priv->cache.tags = kcalloc(CACHE_PAGE_COUNT,
+				   sizeof(struct ps3vram_tag),
+				   GFP_KERNEL);
+	if (!priv->cache.tags)
+		return -ENOMEM;
+
+	dev_info(&dev->core, "Created ram cache: %d entries, %d KiB each\n",
+		CACHE_PAGE_COUNT, CACHE_PAGE_SIZE / 1024);
+
+	return 0;
+}
+
+static void ps3vram_cache_cleanup(struct ps3_system_bus_device *dev)
+{
+	struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
+
+	ps3vram_cache_flush(dev);
+	kfree(priv->cache.tags);
+}
+
+static blk_status_t ps3vram_read(struct ps3_system_bus_device *dev, loff_t from,
+			size_t len, size_t *retlen, u_char *buf)
+{
+	struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
+	unsigned int cached, count;
+
+	dev_dbg(&dev->core, "%s: from=0x%08x len=0x%zx\n", __func__,
+		(unsigned int)from, len);
+
+	if (from >= priv->size)
+		return BLK_STS_IOERR;
+
+	if (len > priv->size - from)
+		len = priv->size - from;
+
+	/* Copy from vram to buf */
+	count = len;
+	while (count) {
+		unsigned int offset, avail;
+		unsigned int entry;
+
+		offset = (unsigned int) (from & (priv->cache.page_size - 1));
+		avail  = priv->cache.page_size - offset;
+
+		entry = ps3vram_cache_match(dev, from);
+		cached = CACHE_OFFSET + entry * priv->cache.page_size + offset;
+
+		dev_dbg(&dev->core, "%s: from=%08x cached=%08x offset=%08x "
+			"avail=%08x count=%08x\n", __func__,
+			(unsigned int)from, cached, offset, avail, count);
+
+		if (avail > count)
+			avail = count;
+		memcpy(buf, priv->xdr_buf + cached, avail);
+
+		buf += avail;
+		count -= avail;
+		from += avail;
+	}
+
+	*retlen = len;
+	return 0;
+}
+
+static blk_status_t ps3vram_write(struct ps3_system_bus_device *dev, loff_t to,
+			 size_t len, size_t *retlen, const u_char *buf)
+{
+	struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
+	unsigned int cached, count;
+
+	if (to >= priv->size)
+		return BLK_STS_IOERR;
+
+	if (len > priv->size - to)
+		len = priv->size - to;
+
+	/* Copy from buf to vram */
+	count = len;
+	while (count) {
+		unsigned int offset, avail;
+		unsigned int entry;
+
+		offset = (unsigned int) (to & (priv->cache.page_size - 1));
+		avail  = priv->cache.page_size - offset;
+
+		entry = ps3vram_cache_match(dev, to);
+		cached = CACHE_OFFSET + entry * priv->cache.page_size + offset;
+
+		dev_dbg(&dev->core, "%s: to=%08x cached=%08x offset=%08x "
+			"avail=%08x count=%08x\n", __func__, (unsigned int)to,
+			cached, offset, avail, count);
+
+		if (avail > count)
+			avail = count;
+		memcpy(priv->xdr_buf + cached, buf, avail);
+
+		priv->cache.tags[entry].flags |= CACHE_PAGE_DIRTY;
+
+		buf += avail;
+		count -= avail;
+		to += avail;
+	}
+
+	*retlen = len;
+	return 0;
+}
+
+static int ps3vram_proc_show(struct seq_file *m, void *v)
+{
+	struct ps3vram_priv *priv = m->private;
+
+	seq_printf(m, "hit:%u\nmiss:%u\n", priv->cache.hit, priv->cache.miss);
+	return 0;
+}
+
+static void ps3vram_proc_init(struct ps3_system_bus_device *dev)
+{
+	struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
+	struct proc_dir_entry *pde;
+
+	pde = proc_create_single_data(DEVICE_NAME, 0444, NULL,
+			ps3vram_proc_show, priv);
+	if (!pde)
+		dev_warn(&dev->core, "failed to create /proc entry\n");
+}
+
+static struct bio *ps3vram_do_bio(struct ps3_system_bus_device *dev,
+				  struct bio *bio)
+{
+	struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
+	int write = bio_data_dir(bio) == WRITE;
+	const char *op = write ? "write" : "read";
+	loff_t offset = bio->bi_iter.bi_sector << 9;
+	blk_status_t error = 0;
+	struct bio_vec bvec;
+	struct bvec_iter iter;
+	struct bio *next;
+
+	bio_for_each_segment(bvec, bio, iter) {
+		/* PS3 is ppc64, so we don't handle highmem */
+		char *ptr = bvec_virt(&bvec);
+		size_t len = bvec.bv_len, retlen;
+
+		dev_dbg(&dev->core, "    %s %zu bytes at offset %llu\n", op,
+			len, offset);
+		if (write)
+			error = ps3vram_write(dev, offset, len, &retlen, ptr);
+		else
+			error = ps3vram_read(dev, offset, len, &retlen, ptr);
+
+		if (error) {
+			dev_err(&dev->core, "%s failed\n", op);
+			goto out;
+		}
+
+		if (retlen != len) {
+			dev_err(&dev->core, "Short %s\n", op);
+			error = BLK_STS_IOERR;
+			goto out;
+		}
+
+		offset += len;
+	}
+
+	dev_dbg(&dev->core, "%s completed\n", op);
+
+out:
+	spin_lock_irq(&priv->lock);
+	bio_list_pop(&priv->list);
+	next = bio_list_peek(&priv->list);
+	spin_unlock_irq(&priv->lock);
+
+	bio->bi_status = error;
+	bio_endio(bio);
+	return next;
+}
+
+static void ps3vram_submit_bio(struct bio *bio)
+{
+	struct ps3_system_bus_device *dev = bio->bi_bdev->bd_disk->private_data;
+	struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
+	int busy;
+
+	dev_dbg(&dev->core, "%s\n", __func__);
+
+	bio = bio_split_to_limits(bio);
+	if (!bio)
+		return;
+
+	spin_lock_irq(&priv->lock);
+	busy = !bio_list_empty(&priv->list);
+	bio_list_add(&priv->list, bio);
+	spin_unlock_irq(&priv->lock);
+
+	if (busy)
+		return;
+
+	do {
+		bio = ps3vram_do_bio(dev, bio);
+	} while (bio);
+}
+
+static const struct block_device_operations ps3vram_fops = {
+	.owner		= THIS_MODULE,
+	.submit_bio	= ps3vram_submit_bio,
+};
+
+static int ps3vram_probe(struct ps3_system_bus_device *dev)
+{
+	struct ps3vram_priv *priv;
+	int error, status;
+	struct gendisk *gendisk;
+	u64 ddr_size, ddr_lpar, ctrl_lpar, info_lpar, reports_lpar,
+	    reports_size, xdr_lpar;
+	char *rest;
+
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	if (!priv) {
+		error = -ENOMEM;
+		goto fail;
+	}
+
+	spin_lock_init(&priv->lock);
+	bio_list_init(&priv->list);
+	ps3_system_bus_set_drvdata(dev, priv);
+
+	/* Allocate XDR buffer (1MiB aligned) */
+	priv->xdr_buf = (void *)__get_free_pages(GFP_KERNEL,
+		get_order(XDR_BUF_SIZE));
+	if (priv->xdr_buf == NULL) {
+		dev_err(&dev->core, "Could not allocate XDR buffer\n");
+		error = -ENOMEM;
+		goto fail_free_priv;
+	}
+
+	/* Put FIFO at begginning of XDR buffer */
+	priv->fifo_base = (u32 *) (priv->xdr_buf + FIFO_OFFSET);
+	priv->fifo_ptr = priv->fifo_base;
+
+	/* XXX: Need to open GPU, in case ps3fb or snd_ps3 aren't loaded */
+	if (ps3_open_hv_device(dev)) {
+		dev_err(&dev->core, "ps3_open_hv_device failed\n");
+		error = -EAGAIN;
+		goto out_free_xdr_buf;
+	}
+
+	/* Request memory */
+	status = -1;
+	ddr_size = ALIGN(memparse(size, &rest), 1024*1024);
+	if (!ddr_size) {
+		dev_err(&dev->core, "Specified size is too small\n");
+		error = -EINVAL;
+		goto out_close_gpu;
+	}
+
+	while (ddr_size > 0) {
+		status = lv1_gpu_memory_allocate(ddr_size, 0, 0, 0, 0,
+						 &priv->memory_handle,
+						 &ddr_lpar);
+		if (!status)
+			break;
+		ddr_size -= 1024*1024;
+	}
+	if (status) {
+		dev_err(&dev->core, "lv1_gpu_memory_allocate failed %d\n",
+			status);
+		error = -ENOMEM;
+		goto out_close_gpu;
+	}
+
+	/* Request context */
+	status = lv1_gpu_context_allocate(priv->memory_handle, 0,
+					  &priv->context_handle, &ctrl_lpar,
+					  &info_lpar, &reports_lpar,
+					  &reports_size);
+	if (status) {
+		dev_err(&dev->core, "lv1_gpu_context_allocate failed %d\n",
+			status);
+		error = -ENOMEM;
+		goto out_free_memory;
+	}
+
+	/* Map XDR buffer to RSX */
+	xdr_lpar = ps3_mm_phys_to_lpar(__pa(priv->xdr_buf));
+	status = lv1_gpu_context_iomap(priv->context_handle, XDR_IOIF,
+				       xdr_lpar, XDR_BUF_SIZE,
+				       CBE_IOPTE_PP_W | CBE_IOPTE_PP_R |
+				       CBE_IOPTE_M);
+	if (status) {
+		dev_err(&dev->core, "lv1_gpu_context_iomap failed %d\n",
+			status);
+		error = -ENOMEM;
+		goto out_free_context;
+	}
+
+	priv->ctrl = ioremap(ctrl_lpar, 64 * 1024);
+	if (!priv->ctrl) {
+		dev_err(&dev->core, "ioremap CTRL failed\n");
+		error = -ENOMEM;
+		goto out_unmap_context;
+	}
+
+	priv->reports = ioremap(reports_lpar, reports_size);
+	if (!priv->reports) {
+		dev_err(&dev->core, "ioremap REPORTS failed\n");
+		error = -ENOMEM;
+		goto out_unmap_ctrl;
+	}
+
+	mutex_lock(&ps3_gpu_mutex);
+	ps3vram_init_ring(dev);
+	mutex_unlock(&ps3_gpu_mutex);
+
+	priv->size = ddr_size;
+
+	ps3vram_bind(dev);
+
+	mutex_lock(&ps3_gpu_mutex);
+	error = ps3vram_wait_ring(dev, 100);
+	mutex_unlock(&ps3_gpu_mutex);
+	if (error < 0) {
+		dev_err(&dev->core, "Failed to initialize channels\n");
+		error = -ETIMEDOUT;
+		goto out_unmap_reports;
+	}
+
+	error = ps3vram_cache_init(dev);
+	if (error < 0) {
+		goto out_unmap_reports;
+	}
+
+	ps3vram_proc_init(dev);
+
+	gendisk = blk_alloc_disk(NUMA_NO_NODE);
+	if (!gendisk) {
+		dev_err(&dev->core, "blk_alloc_disk failed\n");
+		error = -ENOMEM;
+		goto out_cache_cleanup;
+	}
+
+	priv->gendisk = gendisk;
+	gendisk->major = ps3vram_major;
+	gendisk->minors = 1;
+	gendisk->flags |= GENHD_FL_NO_PART;
+	gendisk->fops = &ps3vram_fops;
+	gendisk->private_data = dev;
+	strscpy(gendisk->disk_name, DEVICE_NAME, sizeof(gendisk->disk_name));
+	set_capacity(gendisk, priv->size >> 9);
+	blk_queue_max_segments(gendisk->queue, BLK_MAX_SEGMENTS);
+	blk_queue_max_segment_size(gendisk->queue, BLK_MAX_SEGMENT_SIZE);
+	blk_queue_max_hw_sectors(gendisk->queue, BLK_SAFE_MAX_SECTORS);
+
+	dev_info(&dev->core, "%s: Using %llu MiB of GPU memory\n",
+		 gendisk->disk_name, get_capacity(gendisk) >> 11);
+
+	error = device_add_disk(&dev->core, gendisk, NULL);
+	if (error)
+		goto out_cleanup_disk;
+
+	return 0;
+
+out_cleanup_disk:
+	put_disk(gendisk);
+out_cache_cleanup:
+	remove_proc_entry(DEVICE_NAME, NULL);
+	ps3vram_cache_cleanup(dev);
+out_unmap_reports:
+	iounmap(priv->reports);
+out_unmap_ctrl:
+	iounmap(priv->ctrl);
+out_unmap_context:
+	lv1_gpu_context_iomap(priv->context_handle, XDR_IOIF, xdr_lpar,
+			      XDR_BUF_SIZE, CBE_IOPTE_M);
+out_free_context:
+	lv1_gpu_context_free(priv->context_handle);
+out_free_memory:
+	lv1_gpu_memory_free(priv->memory_handle);
+out_close_gpu:
+	ps3_close_hv_device(dev);
+out_free_xdr_buf:
+	free_pages((unsigned long) priv->xdr_buf, get_order(XDR_BUF_SIZE));
+fail_free_priv:
+	kfree(priv);
+	ps3_system_bus_set_drvdata(dev, NULL);
+fail:
+	return error;
+}
+
+static void ps3vram_remove(struct ps3_system_bus_device *dev)
+{
+	struct ps3vram_priv *priv = ps3_system_bus_get_drvdata(dev);
+
+	del_gendisk(priv->gendisk);
+	put_disk(priv->gendisk);
+	remove_proc_entry(DEVICE_NAME, NULL);
+	ps3vram_cache_cleanup(dev);
+	iounmap(priv->reports);
+	iounmap(priv->ctrl);
+	lv1_gpu_context_iomap(priv->context_handle, XDR_IOIF,
+			      ps3_mm_phys_to_lpar(__pa(priv->xdr_buf)),
+			      XDR_BUF_SIZE, CBE_IOPTE_M);
+	lv1_gpu_context_free(priv->context_handle);
+	lv1_gpu_memory_free(priv->memory_handle);
+	ps3_close_hv_device(dev);
+	free_pages((unsigned long) priv->xdr_buf, get_order(XDR_BUF_SIZE));
+	kfree(priv);
+	ps3_system_bus_set_drvdata(dev, NULL);
+}
+
+static struct ps3_system_bus_driver ps3vram = {
+	.match_id	= PS3_MATCH_ID_GPU,
+	.match_sub_id	= PS3_MATCH_SUB_ID_GPU_RAMDISK,
+	.core.name	= DEVICE_NAME,
+	.core.owner	= THIS_MODULE,
+	.probe		= ps3vram_probe,
+	.remove		= ps3vram_remove,
+	.shutdown	= ps3vram_remove,
+};
+
+
+static int __init ps3vram_init(void)
+{
+	int error;
+
+	if (!firmware_has_feature(FW_FEATURE_PS3_LV1))
+		return -ENODEV;
+
+	error = register_blkdev(0, DEVICE_NAME);
+	if (error <= 0) {
+		pr_err("%s: register_blkdev failed %d\n", DEVICE_NAME, error);
+		return error;
+	}
+	ps3vram_major = error;
+
+	pr_info("%s: registered block device major %d\n", DEVICE_NAME,
+		ps3vram_major);
+
+	error = ps3_system_bus_driver_register(&ps3vram);
+	if (error)
+		unregister_blkdev(ps3vram_major, DEVICE_NAME);
+
+	return error;
+}
+
+static void __exit ps3vram_exit(void)
+{
+	ps3_system_bus_driver_unregister(&ps3vram);
+	unregister_blkdev(ps3vram_major, DEVICE_NAME);
+}
+
+module_init(ps3vram_init);
+module_exit(ps3vram_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("PS3 Video RAM Storage Driver");
+MODULE_AUTHOR("Sony Corporation");
+MODULE_ALIAS(PS3_MODULE_ALIAS_GPU_RAMDISK);
diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c
new file mode 100644
index 000000000..f58ca9ce3
--- /dev/null
+++ b/drivers/block/rbd.c
@@ -0,0 +1,7464 @@
+
+/*
+   rbd.c -- Export ceph rados objects as a Linux block device
+
+
+   based on drivers/block/osdblk.c:
+
+   Copyright 2009 Red Hat, Inc.
+
+   This program is free software; you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation.
+
+   This program is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with this program; see the file COPYING.  If not, write to
+   the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+
+
+
+   For usage instructions, please refer to:
+
+                 Documentation/ABI/testing/sysfs-bus-rbd
+
+ */
+
+#include <linux/ceph/libceph.h>
+#include <linux/ceph/osd_client.h>
+#include <linux/ceph/mon_client.h>
+#include <linux/ceph/cls_lock_client.h>
+#include <linux/ceph/striper.h>
+#include <linux/ceph/decode.h>
+#include <linux/fs_parser.h>
+#include <linux/bsearch.h>
+
+#include <linux/kernel.h>
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/blk-mq.h>
+#include <linux/fs.h>
+#include <linux/blkdev.h>
+#include <linux/slab.h>
+#include <linux/idr.h>
+#include <linux/workqueue.h>
+
+#include "rbd_types.h"
+
+#define RBD_DEBUG	/* Activate rbd_assert() calls */
+
+/*
+ * Increment the given counter and return its updated value.
+ * If the counter is already 0 it will not be incremented.
+ * If the counter is already at its maximum value returns
+ * -EINVAL without updating it.
+ */
+static int atomic_inc_return_safe(atomic_t *v)
+{
+	unsigned int counter;
+
+	counter = (unsigned int)atomic_fetch_add_unless(v, 1, 0);
+	if (counter <= (unsigned int)INT_MAX)
+		return (int)counter;
+
+	atomic_dec(v);
+
+	return -EINVAL;
+}
+
+/* Decrement the counter.  Return the resulting value, or -EINVAL */
+static int atomic_dec_return_safe(atomic_t *v)
+{
+	int counter;
+
+	counter = atomic_dec_return(v);
+	if (counter >= 0)
+		return counter;
+
+	atomic_inc(v);
+
+	return -EINVAL;
+}
+
+#define RBD_DRV_NAME "rbd"
+
+#define RBD_MINORS_PER_MAJOR		256
+#define RBD_SINGLE_MAJOR_PART_SHIFT	4
+
+#define RBD_MAX_PARENT_CHAIN_LEN	16
+
+#define RBD_SNAP_DEV_NAME_PREFIX	"snap_"
+#define RBD_MAX_SNAP_NAME_LEN	\
+			(NAME_MAX - (sizeof (RBD_SNAP_DEV_NAME_PREFIX) - 1))
+
+#define RBD_MAX_SNAP_COUNT	510	/* allows max snapc to fit in 4KB */
+
+#define RBD_SNAP_HEAD_NAME	"-"
+
+#define	BAD_SNAP_INDEX	U32_MAX		/* invalid index into snap array */
+
+/* This allows a single page to hold an image name sent by OSD */
+#define RBD_IMAGE_NAME_LEN_MAX	(PAGE_SIZE - sizeof (__le32) - 1)
+#define RBD_IMAGE_ID_LEN_MAX	64
+
+#define RBD_OBJ_PREFIX_LEN_MAX	64
+
+#define RBD_NOTIFY_TIMEOUT	5	/* seconds */
+#define RBD_RETRY_DELAY		msecs_to_jiffies(1000)
+
+/* Feature bits */
+
+#define RBD_FEATURE_LAYERING		(1ULL<<0)
+#define RBD_FEATURE_STRIPINGV2		(1ULL<<1)
+#define RBD_FEATURE_EXCLUSIVE_LOCK	(1ULL<<2)
+#define RBD_FEATURE_OBJECT_MAP		(1ULL<<3)
+#define RBD_FEATURE_FAST_DIFF		(1ULL<<4)
+#define RBD_FEATURE_DEEP_FLATTEN	(1ULL<<5)
+#define RBD_FEATURE_DATA_POOL		(1ULL<<7)
+#define RBD_FEATURE_OPERATIONS		(1ULL<<8)
+
+#define RBD_FEATURES_ALL	(RBD_FEATURE_LAYERING |		\
+				 RBD_FEATURE_STRIPINGV2 |	\
+				 RBD_FEATURE_EXCLUSIVE_LOCK |	\
+				 RBD_FEATURE_OBJECT_MAP |	\
+				 RBD_FEATURE_FAST_DIFF |	\
+				 RBD_FEATURE_DEEP_FLATTEN |	\
+				 RBD_FEATURE_DATA_POOL |	\
+				 RBD_FEATURE_OPERATIONS)
+
+/* Features supported by this (client software) implementation. */
+
+#define RBD_FEATURES_SUPPORTED	(RBD_FEATURES_ALL)
+
+/*
+ * An RBD device name will be "rbd#", where the "rbd" comes from
+ * RBD_DRV_NAME above, and # is a unique integer identifier.
+ */
+#define DEV_NAME_LEN		32
+
+/*
+ * block device image metadata (in-memory version)
+ */
+struct rbd_image_header {
+	/* These six fields never change for a given rbd image */
+	char *object_prefix;
+	__u8 obj_order;
+	u64 stripe_unit;
+	u64 stripe_count;
+	s64 data_pool_id;
+	u64 features;		/* Might be changeable someday? */
+
+	/* The remaining fields need to be updated occasionally */
+	u64 image_size;
+	struct ceph_snap_context *snapc;
+	char *snap_names;	/* format 1 only */
+	u64 *snap_sizes;	/* format 1 only */
+};
+
+/*
+ * An rbd image specification.
+ *
+ * The tuple (pool_id, image_id, snap_id) is sufficient to uniquely
+ * identify an image.  Each rbd_dev structure includes a pointer to
+ * an rbd_spec structure that encapsulates this identity.
+ *
+ * Each of the id's in an rbd_spec has an associated name.  For a
+ * user-mapped image, the names are supplied and the id's associated
+ * with them are looked up.  For a layered image, a parent image is
+ * defined by the tuple, and the names are looked up.
+ *
+ * An rbd_dev structure contains a parent_spec pointer which is
+ * non-null if the image it represents is a child in a layered
+ * image.  This pointer will refer to the rbd_spec structure used
+ * by the parent rbd_dev for its own identity (i.e., the structure
+ * is shared between the parent and child).
+ *
+ * Since these structures are populated once, during the discovery
+ * phase of image construction, they are effectively immutable so
+ * we make no effort to synchronize access to them.
+ *
+ * Note that code herein does not assume the image name is known (it
+ * could be a null pointer).
+ */
+struct rbd_spec {
+	u64		pool_id;
+	const char	*pool_name;
+	const char	*pool_ns;	/* NULL if default, never "" */
+
+	const char	*image_id;
+	const char	*image_name;
+
+	u64		snap_id;
+	const char	*snap_name;
+
+	struct kref	kref;
+};
+
+/*
+ * an instance of the client.  multiple devices may share an rbd client.
+ */
+struct rbd_client {
+	struct ceph_client	*client;
+	struct kref		kref;
+	struct list_head	node;
+};
+
+struct pending_result {
+	int			result;		/* first nonzero result */
+	int			num_pending;
+};
+
+struct rbd_img_request;
+
+enum obj_request_type {
+	OBJ_REQUEST_NODATA = 1,
+	OBJ_REQUEST_BIO,	/* pointer into provided bio (list) */
+	OBJ_REQUEST_BVECS,	/* pointer into provided bio_vec array */
+	OBJ_REQUEST_OWN_BVECS,	/* private bio_vec array, doesn't own pages */
+};
+
+enum obj_operation_type {
+	OBJ_OP_READ = 1,
+	OBJ_OP_WRITE,
+	OBJ_OP_DISCARD,
+	OBJ_OP_ZEROOUT,
+};
+
+#define RBD_OBJ_FLAG_DELETION			(1U << 0)
+#define RBD_OBJ_FLAG_COPYUP_ENABLED		(1U << 1)
+#define RBD_OBJ_FLAG_COPYUP_ZEROS		(1U << 2)
+#define RBD_OBJ_FLAG_MAY_EXIST			(1U << 3)
+#define RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT	(1U << 4)
+
+enum rbd_obj_read_state {
+	RBD_OBJ_READ_START = 1,
+	RBD_OBJ_READ_OBJECT,
+	RBD_OBJ_READ_PARENT,
+};
+
+/*
+ * Writes go through the following state machine to deal with
+ * layering:
+ *
+ *            . . . . . RBD_OBJ_WRITE_GUARD. . . . . . . . . . . . . .
+ *            .                 |                                    .
+ *            .                 v                                    .
+ *            .    RBD_OBJ_WRITE_READ_FROM_PARENT. . .               .
+ *            .                 |                    .               .
+ *            .                 v                    v (deep-copyup  .
+ *    (image  .   RBD_OBJ_WRITE_COPYUP_EMPTY_SNAPC   .  not needed)  .
+ * flattened) v                 |                    .               .
+ *            .                 v                    .               .
+ *            . . . .RBD_OBJ_WRITE_COPYUP_OPS. . . . .      (copyup  .
+ *                              |                        not needed) v
+ *                              v                                    .
+ *                            done . . . . . . . . . . . . . . . . . .
+ *                              ^
+ *                              |
+ *                     RBD_OBJ_WRITE_FLAT
+ *
+ * Writes start in RBD_OBJ_WRITE_GUARD or _FLAT, depending on whether
+ * assert_exists guard is needed or not (in some cases it's not needed
+ * even if there is a parent).
+ */
+enum rbd_obj_write_state {
+	RBD_OBJ_WRITE_START = 1,
+	RBD_OBJ_WRITE_PRE_OBJECT_MAP,
+	RBD_OBJ_WRITE_OBJECT,
+	__RBD_OBJ_WRITE_COPYUP,
+	RBD_OBJ_WRITE_COPYUP,
+	RBD_OBJ_WRITE_POST_OBJECT_MAP,
+};
+
+enum rbd_obj_copyup_state {
+	RBD_OBJ_COPYUP_START = 1,
+	RBD_OBJ_COPYUP_READ_PARENT,
+	__RBD_OBJ_COPYUP_OBJECT_MAPS,
+	RBD_OBJ_COPYUP_OBJECT_MAPS,
+	__RBD_OBJ_COPYUP_WRITE_OBJECT,
+	RBD_OBJ_COPYUP_WRITE_OBJECT,
+};
+
+struct rbd_obj_request {
+	struct ceph_object_extent ex;
+	unsigned int		flags;	/* RBD_OBJ_FLAG_* */
+	union {
+		enum rbd_obj_read_state	 read_state;	/* for reads */
+		enum rbd_obj_write_state write_state;	/* for writes */
+	};
+
+	struct rbd_img_request	*img_request;
+	struct ceph_file_extent	*img_extents;
+	u32			num_img_extents;
+
+	union {
+		struct ceph_bio_iter	bio_pos;
+		struct {
+			struct ceph_bvec_iter	bvec_pos;
+			u32			bvec_count;
+			u32			bvec_idx;
+		};
+	};
+
+	enum rbd_obj_copyup_state copyup_state;
+	struct bio_vec		*copyup_bvecs;
+	u32			copyup_bvec_count;
+
+	struct list_head	osd_reqs;	/* w/ r_private_item */
+
+	struct mutex		state_mutex;
+	struct pending_result	pending;
+	struct kref		kref;
+};
+
+enum img_req_flags {
+	IMG_REQ_CHILD,		/* initiator: block = 0, child image = 1 */
+	IMG_REQ_LAYERED,	/* ENOENT handling: normal = 0, layered = 1 */
+};
+
+enum rbd_img_state {
+	RBD_IMG_START = 1,
+	RBD_IMG_EXCLUSIVE_LOCK,
+	__RBD_IMG_OBJECT_REQUESTS,
+	RBD_IMG_OBJECT_REQUESTS,
+};
+
+struct rbd_img_request {
+	struct rbd_device	*rbd_dev;
+	enum obj_operation_type	op_type;
+	enum obj_request_type	data_type;
+	unsigned long		flags;
+	enum rbd_img_state	state;
+	union {
+		u64			snap_id;	/* for reads */
+		struct ceph_snap_context *snapc;	/* for writes */
+	};
+	struct rbd_obj_request	*obj_request;	/* obj req initiator */
+
+	struct list_head	lock_item;
+	struct list_head	object_extents;	/* obj_req.ex structs */
+
+	struct mutex		state_mutex;
+	struct pending_result	pending;
+	struct work_struct	work;
+	int			work_result;
+};
+
+#define for_each_obj_request(ireq, oreq) \
+	list_for_each_entry(oreq, &(ireq)->object_extents, ex.oe_item)
+#define for_each_obj_request_safe(ireq, oreq, n) \
+	list_for_each_entry_safe(oreq, n, &(ireq)->object_extents, ex.oe_item)
+
+enum rbd_watch_state {
+	RBD_WATCH_STATE_UNREGISTERED,
+	RBD_WATCH_STATE_REGISTERED,
+	RBD_WATCH_STATE_ERROR,
+};
+
+enum rbd_lock_state {
+	RBD_LOCK_STATE_UNLOCKED,
+	RBD_LOCK_STATE_LOCKED,
+	RBD_LOCK_STATE_RELEASING,
+};
+
+/* WatchNotify::ClientId */
+struct rbd_client_id {
+	u64 gid;
+	u64 handle;
+};
+
+struct rbd_mapping {
+	u64                     size;
+};
+
+/*
+ * a single device
+ */
+struct rbd_device {
+	int			dev_id;		/* blkdev unique id */
+
+	int			major;		/* blkdev assigned major */
+	int			minor;
+	struct gendisk		*disk;		/* blkdev's gendisk and rq */
+
+	u32			image_format;	/* Either 1 or 2 */
+	struct rbd_client	*rbd_client;
+
+	char			name[DEV_NAME_LEN]; /* blkdev name, e.g. rbd3 */
+
+	spinlock_t		lock;		/* queue, flags, open_count */
+
+	struct rbd_image_header	header;
+	unsigned long		flags;		/* possibly lock protected */
+	struct rbd_spec		*spec;
+	struct rbd_options	*opts;
+	char			*config_info;	/* add{,_single_major} string */
+
+	struct ceph_object_id	header_oid;
+	struct ceph_object_locator header_oloc;
+
+	struct ceph_file_layout	layout;		/* used for all rbd requests */
+
+	struct mutex		watch_mutex;
+	enum rbd_watch_state	watch_state;
+	struct ceph_osd_linger_request *watch_handle;
+	u64			watch_cookie;
+	struct delayed_work	watch_dwork;
+
+	struct rw_semaphore	lock_rwsem;
+	enum rbd_lock_state	lock_state;
+	char			lock_cookie[32];
+	struct rbd_client_id	owner_cid;
+	struct work_struct	acquired_lock_work;
+	struct work_struct	released_lock_work;
+	struct delayed_work	lock_dwork;
+	struct work_struct	unlock_work;
+	spinlock_t		lock_lists_lock;
+	struct list_head	acquiring_list;
+	struct list_head	running_list;
+	struct completion	acquire_wait;
+	int			acquire_err;
+	struct completion	releasing_wait;
+
+	spinlock_t		object_map_lock;
+	u8			*object_map;
+	u64			object_map_size;	/* in objects */
+	u64			object_map_flags;
+
+	struct workqueue_struct	*task_wq;
+
+	struct rbd_spec		*parent_spec;
+	u64			parent_overlap;
+	atomic_t		parent_ref;
+	struct rbd_device	*parent;
+
+	/* Block layer tags. */
+	struct blk_mq_tag_set	tag_set;
+
+	/* protects updating the header */
+	struct rw_semaphore     header_rwsem;
+
+	struct rbd_mapping	mapping;
+
+	struct list_head	node;
+
+	/* sysfs related */
+	struct device		dev;
+	unsigned long		open_count;	/* protected by lock */
+};
+
+/*
+ * Flag bits for rbd_dev->flags:
+ * - REMOVING (which is coupled with rbd_dev->open_count) is protected
+ *   by rbd_dev->lock
+ */
+enum rbd_dev_flags {
+	RBD_DEV_FLAG_EXISTS,	/* rbd_dev_device_setup() ran */
+	RBD_DEV_FLAG_REMOVING,	/* this mapping is being removed */
+	RBD_DEV_FLAG_READONLY,  /* -o ro or snapshot */
+};
+
+static DEFINE_MUTEX(client_mutex);	/* Serialize client creation */
+
+static LIST_HEAD(rbd_dev_list);    /* devices */
+static DEFINE_SPINLOCK(rbd_dev_list_lock);
+
+static LIST_HEAD(rbd_client_list);		/* clients */
+static DEFINE_SPINLOCK(rbd_client_list_lock);
+
+/* Slab caches for frequently-allocated structures */
+
+static struct kmem_cache	*rbd_img_request_cache;
+static struct kmem_cache	*rbd_obj_request_cache;
+
+static int rbd_major;
+static DEFINE_IDA(rbd_dev_id_ida);
+
+static struct workqueue_struct *rbd_wq;
+
+static struct ceph_snap_context rbd_empty_snapc = {
+	.nref = REFCOUNT_INIT(1),
+};
+
+/*
+ * single-major requires >= 0.75 version of userspace rbd utility.
+ */
+static bool single_major = true;
+module_param(single_major, bool, 0444);
+MODULE_PARM_DESC(single_major, "Use a single major number for all rbd devices (default: true)");
+
+static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count);
+static ssize_t remove_store(struct bus_type *bus, const char *buf,
+			    size_t count);
+static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
+				      size_t count);
+static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
+					 size_t count);
+static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth);
+
+static int rbd_dev_id_to_minor(int dev_id)
+{
+	return dev_id << RBD_SINGLE_MAJOR_PART_SHIFT;
+}
+
+static int minor_to_rbd_dev_id(int minor)
+{
+	return minor >> RBD_SINGLE_MAJOR_PART_SHIFT;
+}
+
+static bool rbd_is_ro(struct rbd_device *rbd_dev)
+{
+	return test_bit(RBD_DEV_FLAG_READONLY, &rbd_dev->flags);
+}
+
+static bool rbd_is_snap(struct rbd_device *rbd_dev)
+{
+	return rbd_dev->spec->snap_id != CEPH_NOSNAP;
+}
+
+static bool __rbd_is_lock_owner(struct rbd_device *rbd_dev)
+{
+	lockdep_assert_held(&rbd_dev->lock_rwsem);
+
+	return rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED ||
+	       rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING;
+}
+
+static bool rbd_is_lock_owner(struct rbd_device *rbd_dev)
+{
+	bool is_lock_owner;
+
+	down_read(&rbd_dev->lock_rwsem);
+	is_lock_owner = __rbd_is_lock_owner(rbd_dev);
+	up_read(&rbd_dev->lock_rwsem);
+	return is_lock_owner;
+}
+
+static ssize_t supported_features_show(struct bus_type *bus, char *buf)
+{
+	return sprintf(buf, "0x%llx\n", RBD_FEATURES_SUPPORTED);
+}
+
+static BUS_ATTR_WO(add);
+static BUS_ATTR_WO(remove);
+static BUS_ATTR_WO(add_single_major);
+static BUS_ATTR_WO(remove_single_major);
+static BUS_ATTR_RO(supported_features);
+
+static struct attribute *rbd_bus_attrs[] = {
+	&bus_attr_add.attr,
+	&bus_attr_remove.attr,
+	&bus_attr_add_single_major.attr,
+	&bus_attr_remove_single_major.attr,
+	&bus_attr_supported_features.attr,
+	NULL,
+};
+
+static umode_t rbd_bus_is_visible(struct kobject *kobj,
+				  struct attribute *attr, int index)
+{
+	if (!single_major &&
+	    (attr == &bus_attr_add_single_major.attr ||
+	     attr == &bus_attr_remove_single_major.attr))
+		return 0;
+
+	return attr->mode;
+}
+
+static const struct attribute_group rbd_bus_group = {
+	.attrs = rbd_bus_attrs,
+	.is_visible = rbd_bus_is_visible,
+};
+__ATTRIBUTE_GROUPS(rbd_bus);
+
+static struct bus_type rbd_bus_type = {
+	.name		= "rbd",
+	.bus_groups	= rbd_bus_groups,
+};
+
+static void rbd_root_dev_release(struct device *dev)
+{
+}
+
+static struct device rbd_root_dev = {
+	.init_name =    "rbd",
+	.release =      rbd_root_dev_release,
+};
+
+static __printf(2, 3)
+void rbd_warn(struct rbd_device *rbd_dev, const char *fmt, ...)
+{
+	struct va_format vaf;
+	va_list args;
+
+	va_start(args, fmt);
+	vaf.fmt = fmt;
+	vaf.va = &args;
+
+	if (!rbd_dev)
+		printk(KERN_WARNING "%s: %pV\n", RBD_DRV_NAME, &vaf);
+	else if (rbd_dev->disk)
+		printk(KERN_WARNING "%s: %s: %pV\n",
+			RBD_DRV_NAME, rbd_dev->disk->disk_name, &vaf);
+	else if (rbd_dev->spec && rbd_dev->spec->image_name)
+		printk(KERN_WARNING "%s: image %s: %pV\n",
+			RBD_DRV_NAME, rbd_dev->spec->image_name, &vaf);
+	else if (rbd_dev->spec && rbd_dev->spec->image_id)
+		printk(KERN_WARNING "%s: id %s: %pV\n",
+			RBD_DRV_NAME, rbd_dev->spec->image_id, &vaf);
+	else	/* punt */
+		printk(KERN_WARNING "%s: rbd_dev %p: %pV\n",
+			RBD_DRV_NAME, rbd_dev, &vaf);
+	va_end(args);
+}
+
+#ifdef RBD_DEBUG
+#define rbd_assert(expr)						\
+		if (unlikely(!(expr))) {				\
+			printk(KERN_ERR "\nAssertion failure in %s() "	\
+						"at line %d:\n\n"	\
+					"\trbd_assert(%s);\n\n",	\
+					__func__, __LINE__, #expr);	\
+			BUG();						\
+		}
+#else /* !RBD_DEBUG */
+#  define rbd_assert(expr)	((void) 0)
+#endif /* !RBD_DEBUG */
+
+static void rbd_dev_remove_parent(struct rbd_device *rbd_dev);
+
+static int rbd_dev_refresh(struct rbd_device *rbd_dev);
+static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev,
+				     struct rbd_image_header *header);
+static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
+					u64 snap_id);
+static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
+				u8 *order, u64 *snap_size);
+static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev);
+
+static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result);
+static void rbd_img_handle_request(struct rbd_img_request *img_req, int result);
+
+/*
+ * Return true if nothing else is pending.
+ */
+static bool pending_result_dec(struct pending_result *pending, int *result)
+{
+	rbd_assert(pending->num_pending > 0);
+
+	if (*result && !pending->result)
+		pending->result = *result;
+	if (--pending->num_pending)
+		return false;
+
+	*result = pending->result;
+	return true;
+}
+
+static int rbd_open(struct block_device *bdev, fmode_t mode)
+{
+	struct rbd_device *rbd_dev = bdev->bd_disk->private_data;
+	bool removing = false;
+
+	spin_lock_irq(&rbd_dev->lock);
+	if (test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags))
+		removing = true;
+	else
+		rbd_dev->open_count++;
+	spin_unlock_irq(&rbd_dev->lock);
+	if (removing)
+		return -ENOENT;
+
+	(void) get_device(&rbd_dev->dev);
+
+	return 0;
+}
+
+static void rbd_release(struct gendisk *disk, fmode_t mode)
+{
+	struct rbd_device *rbd_dev = disk->private_data;
+	unsigned long open_count_before;
+
+	spin_lock_irq(&rbd_dev->lock);
+	open_count_before = rbd_dev->open_count--;
+	spin_unlock_irq(&rbd_dev->lock);
+	rbd_assert(open_count_before > 0);
+
+	put_device(&rbd_dev->dev);
+}
+
+static const struct block_device_operations rbd_bd_ops = {
+	.owner			= THIS_MODULE,
+	.open			= rbd_open,
+	.release		= rbd_release,
+};
+
+/*
+ * Initialize an rbd client instance.  Success or not, this function
+ * consumes ceph_opts.  Caller holds client_mutex.
+ */
+static struct rbd_client *rbd_client_create(struct ceph_options *ceph_opts)
+{
+	struct rbd_client *rbdc;
+	int ret = -ENOMEM;
+
+	dout("%s:\n", __func__);
+	rbdc = kmalloc(sizeof(struct rbd_client), GFP_KERNEL);
+	if (!rbdc)
+		goto out_opt;
+
+	kref_init(&rbdc->kref);
+	INIT_LIST_HEAD(&rbdc->node);
+
+	rbdc->client = ceph_create_client(ceph_opts, rbdc);
+	if (IS_ERR(rbdc->client))
+		goto out_rbdc;
+	ceph_opts = NULL; /* Now rbdc->client is responsible for ceph_opts */
+
+	ret = ceph_open_session(rbdc->client);
+	if (ret < 0)
+		goto out_client;
+
+	spin_lock(&rbd_client_list_lock);
+	list_add_tail(&rbdc->node, &rbd_client_list);
+	spin_unlock(&rbd_client_list_lock);
+
+	dout("%s: rbdc %p\n", __func__, rbdc);
+
+	return rbdc;
+out_client:
+	ceph_destroy_client(rbdc->client);
+out_rbdc:
+	kfree(rbdc);
+out_opt:
+	if (ceph_opts)
+		ceph_destroy_options(ceph_opts);
+	dout("%s: error %d\n", __func__, ret);
+
+	return ERR_PTR(ret);
+}
+
+static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
+{
+	kref_get(&rbdc->kref);
+
+	return rbdc;
+}
+
+/*
+ * Find a ceph client with specific addr and configuration.  If
+ * found, bump its reference count.
+ */
+static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
+{
+	struct rbd_client *rbdc = NULL, *iter;
+
+	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
+		return NULL;
+
+	spin_lock(&rbd_client_list_lock);
+	list_for_each_entry(iter, &rbd_client_list, node) {
+		if (!ceph_compare_options(ceph_opts, iter->client)) {
+			__rbd_get_client(iter);
+
+			rbdc = iter;
+			break;
+		}
+	}
+	spin_unlock(&rbd_client_list_lock);
+
+	return rbdc;
+}
+
+/*
+ * (Per device) rbd map options
+ */
+enum {
+	Opt_queue_depth,
+	Opt_alloc_size,
+	Opt_lock_timeout,
+	/* int args above */
+	Opt_pool_ns,
+	Opt_compression_hint,
+	/* string args above */
+	Opt_read_only,
+	Opt_read_write,
+	Opt_lock_on_read,
+	Opt_exclusive,
+	Opt_notrim,
+};
+
+enum {
+	Opt_compression_hint_none,
+	Opt_compression_hint_compressible,
+	Opt_compression_hint_incompressible,
+};
+
+static const struct constant_table rbd_param_compression_hint[] = {
+	{"none",		Opt_compression_hint_none},
+	{"compressible",	Opt_compression_hint_compressible},
+	{"incompressible",	Opt_compression_hint_incompressible},
+	{}
+};
+
+static const struct fs_parameter_spec rbd_parameters[] = {
+	fsparam_u32	("alloc_size",			Opt_alloc_size),
+	fsparam_enum	("compression_hint",		Opt_compression_hint,
+			 rbd_param_compression_hint),
+	fsparam_flag	("exclusive",			Opt_exclusive),
+	fsparam_flag	("lock_on_read",		Opt_lock_on_read),
+	fsparam_u32	("lock_timeout",		Opt_lock_timeout),
+	fsparam_flag	("notrim",			Opt_notrim),
+	fsparam_string	("_pool_ns",			Opt_pool_ns),
+	fsparam_u32	("queue_depth",			Opt_queue_depth),
+	fsparam_flag	("read_only",			Opt_read_only),
+	fsparam_flag	("read_write",			Opt_read_write),
+	fsparam_flag	("ro",				Opt_read_only),
+	fsparam_flag	("rw",				Opt_read_write),
+	{}
+};
+
+struct rbd_options {
+	int	queue_depth;
+	int	alloc_size;
+	unsigned long	lock_timeout;
+	bool	read_only;
+	bool	lock_on_read;
+	bool	exclusive;
+	bool	trim;
+
+	u32 alloc_hint_flags;  /* CEPH_OSD_OP_ALLOC_HINT_FLAG_* */
+};
+
+#define RBD_QUEUE_DEPTH_DEFAULT	BLKDEV_DEFAULT_RQ
+#define RBD_ALLOC_SIZE_DEFAULT	(64 * 1024)
+#define RBD_LOCK_TIMEOUT_DEFAULT 0  /* no timeout */
+#define RBD_READ_ONLY_DEFAULT	false
+#define RBD_LOCK_ON_READ_DEFAULT false
+#define RBD_EXCLUSIVE_DEFAULT	false
+#define RBD_TRIM_DEFAULT	true
+
+struct rbd_parse_opts_ctx {
+	struct rbd_spec		*spec;
+	struct ceph_options	*copts;
+	struct rbd_options	*opts;
+};
+
+static char* obj_op_name(enum obj_operation_type op_type)
+{
+	switch (op_type) {
+	case OBJ_OP_READ:
+		return "read";
+	case OBJ_OP_WRITE:
+		return "write";
+	case OBJ_OP_DISCARD:
+		return "discard";
+	case OBJ_OP_ZEROOUT:
+		return "zeroout";
+	default:
+		return "???";
+	}
+}
+
+/*
+ * Destroy ceph client
+ *
+ * Caller must hold rbd_client_list_lock.
+ */
+static void rbd_client_release(struct kref *kref)
+{
+	struct rbd_client *rbdc = container_of(kref, struct rbd_client, kref);
+
+	dout("%s: rbdc %p\n", __func__, rbdc);
+	spin_lock(&rbd_client_list_lock);
+	list_del(&rbdc->node);
+	spin_unlock(&rbd_client_list_lock);
+
+	ceph_destroy_client(rbdc->client);
+	kfree(rbdc);
+}
+
+/*
+ * Drop reference to ceph client node. If it's not referenced anymore, release
+ * it.
+ */
+static void rbd_put_client(struct rbd_client *rbdc)
+{
+	if (rbdc)
+		kref_put(&rbdc->kref, rbd_client_release);
+}
+
+/*
+ * Get a ceph client with specific addr and configuration, if one does
+ * not exist create it.  Either way, ceph_opts is consumed by this
+ * function.
+ */
+static struct rbd_client *rbd_get_client(struct ceph_options *ceph_opts)
+{
+	struct rbd_client *rbdc;
+	int ret;
+
+	mutex_lock(&client_mutex);
+	rbdc = rbd_client_find(ceph_opts);
+	if (rbdc) {
+		ceph_destroy_options(ceph_opts);
+
+		/*
+		 * Using an existing client.  Make sure ->pg_pools is up to
+		 * date before we look up the pool id in do_rbd_add().
+		 */
+		ret = ceph_wait_for_latest_osdmap(rbdc->client,
+					rbdc->client->options->mount_timeout);
+		if (ret) {
+			rbd_warn(NULL, "failed to get latest osdmap: %d", ret);
+			rbd_put_client(rbdc);
+			rbdc = ERR_PTR(ret);
+		}
+	} else {
+		rbdc = rbd_client_create(ceph_opts);
+	}
+	mutex_unlock(&client_mutex);
+
+	return rbdc;
+}
+
+static bool rbd_image_format_valid(u32 image_format)
+{
+	return image_format == 1 || image_format == 2;
+}
+
+static bool rbd_dev_ondisk_valid(struct rbd_image_header_ondisk *ondisk)
+{
+	size_t size;
+	u32 snap_count;
+
+	/* The header has to start with the magic rbd header text */
+	if (memcmp(&ondisk->text, RBD_HEADER_TEXT, sizeof (RBD_HEADER_TEXT)))
+		return false;
+
+	/* The bio layer requires at least sector-sized I/O */
+
+	if (ondisk->options.order < SECTOR_SHIFT)
+		return false;
+
+	/* If we use u64 in a few spots we may be able to loosen this */
+
+	if (ondisk->options.order > 8 * sizeof (int) - 1)
+		return false;
+
+	/*
+	 * The size of a snapshot header has to fit in a size_t, and
+	 * that limits the number of snapshots.
+	 */
+	snap_count = le32_to_cpu(ondisk->snap_count);
+	size = SIZE_MAX - sizeof (struct ceph_snap_context);
+	if (snap_count > size / sizeof (__le64))
+		return false;
+
+	/*
+	 * Not only that, but the size of the entire the snapshot
+	 * header must also be representable in a size_t.
+	 */
+	size -= snap_count * sizeof (__le64);
+	if ((u64) size < le64_to_cpu(ondisk->snap_names_len))
+		return false;
+
+	return true;
+}
+
+/*
+ * returns the size of an object in the image
+ */
+static u32 rbd_obj_bytes(struct rbd_image_header *header)
+{
+	return 1U << header->obj_order;
+}
+
+static void rbd_init_layout(struct rbd_device *rbd_dev)
+{
+	if (rbd_dev->header.stripe_unit == 0 ||
+	    rbd_dev->header.stripe_count == 0) {
+		rbd_dev->header.stripe_unit = rbd_obj_bytes(&rbd_dev->header);
+		rbd_dev->header.stripe_count = 1;
+	}
+
+	rbd_dev->layout.stripe_unit = rbd_dev->header.stripe_unit;
+	rbd_dev->layout.stripe_count = rbd_dev->header.stripe_count;
+	rbd_dev->layout.object_size = rbd_obj_bytes(&rbd_dev->header);
+	rbd_dev->layout.pool_id = rbd_dev->header.data_pool_id == CEPH_NOPOOL ?
+			  rbd_dev->spec->pool_id : rbd_dev->header.data_pool_id;
+	RCU_INIT_POINTER(rbd_dev->layout.pool_ns, NULL);
+}
+
+static void rbd_image_header_cleanup(struct rbd_image_header *header)
+{
+	kfree(header->object_prefix);
+	ceph_put_snap_context(header->snapc);
+	kfree(header->snap_sizes);
+	kfree(header->snap_names);
+
+	memset(header, 0, sizeof(*header));
+}
+
+/*
+ * Fill an rbd image header with information from the given format 1
+ * on-disk header.
+ */
+static int rbd_header_from_disk(struct rbd_image_header *header,
+				struct rbd_image_header_ondisk *ondisk,
+				bool first_time)
+{
+	struct ceph_snap_context *snapc;
+	char *object_prefix = NULL;
+	char *snap_names = NULL;
+	u64 *snap_sizes = NULL;
+	u32 snap_count;
+	int ret = -ENOMEM;
+	u32 i;
+
+	/* Allocate this now to avoid having to handle failure below */
+
+	if (first_time) {
+		object_prefix = kstrndup(ondisk->object_prefix,
+					 sizeof(ondisk->object_prefix),
+					 GFP_KERNEL);
+		if (!object_prefix)
+			return -ENOMEM;
+	}
+
+	/* Allocate the snapshot context and fill it in */
+
+	snap_count = le32_to_cpu(ondisk->snap_count);
+	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
+	if (!snapc)
+		goto out_err;
+	snapc->seq = le64_to_cpu(ondisk->snap_seq);
+	if (snap_count) {
+		struct rbd_image_snap_ondisk *snaps;
+		u64 snap_names_len = le64_to_cpu(ondisk->snap_names_len);
+
+		/* We'll keep a copy of the snapshot names... */
+
+		if (snap_names_len > (u64)SIZE_MAX)
+			goto out_2big;
+		snap_names = kmalloc(snap_names_len, GFP_KERNEL);
+		if (!snap_names)
+			goto out_err;
+
+		/* ...as well as the array of their sizes. */
+		snap_sizes = kmalloc_array(snap_count,
+					   sizeof(*header->snap_sizes),
+					   GFP_KERNEL);
+		if (!snap_sizes)
+			goto out_err;
+
+		/*
+		 * Copy the names, and fill in each snapshot's id
+		 * and size.
+		 *
+		 * Note that rbd_dev_v1_header_info() guarantees the
+		 * ondisk buffer we're working with has
+		 * snap_names_len bytes beyond the end of the
+		 * snapshot id array, this memcpy() is safe.
+		 */
+		memcpy(snap_names, &ondisk->snaps[snap_count], snap_names_len);
+		snaps = ondisk->snaps;
+		for (i = 0; i < snap_count; i++) {
+			snapc->snaps[i] = le64_to_cpu(snaps[i].id);
+			snap_sizes[i] = le64_to_cpu(snaps[i].image_size);
+		}
+	}
+
+	/* We won't fail any more, fill in the header */
+
+	if (first_time) {
+		header->object_prefix = object_prefix;
+		header->obj_order = ondisk->options.order;
+	}
+
+	/* The remaining fields always get updated (when we refresh) */
+
+	header->image_size = le64_to_cpu(ondisk->image_size);
+	header->snapc = snapc;
+	header->snap_names = snap_names;
+	header->snap_sizes = snap_sizes;
+
+	return 0;
+out_2big:
+	ret = -EIO;
+out_err:
+	kfree(snap_sizes);
+	kfree(snap_names);
+	ceph_put_snap_context(snapc);
+	kfree(object_prefix);
+
+	return ret;
+}
+
+static const char *_rbd_dev_v1_snap_name(struct rbd_device *rbd_dev, u32 which)
+{
+	const char *snap_name;
+
+	rbd_assert(which < rbd_dev->header.snapc->num_snaps);
+
+	/* Skip over names until we find the one we are looking for */
+
+	snap_name = rbd_dev->header.snap_names;
+	while (which--)
+		snap_name += strlen(snap_name) + 1;
+
+	return kstrdup(snap_name, GFP_KERNEL);
+}
+
+/*
+ * Snapshot id comparison function for use with qsort()/bsearch().
+ * Note that result is for snapshots in *descending* order.
+ */
+static int snapid_compare_reverse(const void *s1, const void *s2)
+{
+	u64 snap_id1 = *(u64 *)s1;
+	u64 snap_id2 = *(u64 *)s2;
+
+	if (snap_id1 < snap_id2)
+		return 1;
+	return snap_id1 == snap_id2 ? 0 : -1;
+}
+
+/*
+ * Search a snapshot context to see if the given snapshot id is
+ * present.
+ *
+ * Returns the position of the snapshot id in the array if it's found,
+ * or BAD_SNAP_INDEX otherwise.
+ *
+ * Note: The snapshot array is in kept sorted (by the osd) in
+ * reverse order, highest snapshot id first.
+ */
+static u32 rbd_dev_snap_index(struct rbd_device *rbd_dev, u64 snap_id)
+{
+	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
+	u64 *found;
+
+	found = bsearch(&snap_id, &snapc->snaps, snapc->num_snaps,
+				sizeof (snap_id), snapid_compare_reverse);
+
+	return found ? (u32)(found - &snapc->snaps[0]) : BAD_SNAP_INDEX;
+}
+
+static const char *rbd_dev_v1_snap_name(struct rbd_device *rbd_dev,
+					u64 snap_id)
+{
+	u32 which;
+	const char *snap_name;
+
+	which = rbd_dev_snap_index(rbd_dev, snap_id);
+	if (which == BAD_SNAP_INDEX)
+		return ERR_PTR(-ENOENT);
+
+	snap_name = _rbd_dev_v1_snap_name(rbd_dev, which);
+	return snap_name ? snap_name : ERR_PTR(-ENOMEM);
+}
+
+static const char *rbd_snap_name(struct rbd_device *rbd_dev, u64 snap_id)
+{
+	if (snap_id == CEPH_NOSNAP)
+		return RBD_SNAP_HEAD_NAME;
+
+	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
+	if (rbd_dev->image_format == 1)
+		return rbd_dev_v1_snap_name(rbd_dev, snap_id);
+
+	return rbd_dev_v2_snap_name(rbd_dev, snap_id);
+}
+
+static int rbd_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
+				u64 *snap_size)
+{
+	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
+	if (snap_id == CEPH_NOSNAP) {
+		*snap_size = rbd_dev->header.image_size;
+	} else if (rbd_dev->image_format == 1) {
+		u32 which;
+
+		which = rbd_dev_snap_index(rbd_dev, snap_id);
+		if (which == BAD_SNAP_INDEX)
+			return -ENOENT;
+
+		*snap_size = rbd_dev->header.snap_sizes[which];
+	} else {
+		u64 size = 0;
+		int ret;
+
+		ret = _rbd_dev_v2_snap_size(rbd_dev, snap_id, NULL, &size);
+		if (ret)
+			return ret;
+
+		*snap_size = size;
+	}
+	return 0;
+}
+
+static int rbd_dev_mapping_set(struct rbd_device *rbd_dev)
+{
+	u64 snap_id = rbd_dev->spec->snap_id;
+	u64 size = 0;
+	int ret;
+
+	ret = rbd_snap_size(rbd_dev, snap_id, &size);
+	if (ret)
+		return ret;
+
+	rbd_dev->mapping.size = size;
+	return 0;
+}
+
+static void rbd_dev_mapping_clear(struct rbd_device *rbd_dev)
+{
+	rbd_dev->mapping.size = 0;
+}
+
+static void zero_bios(struct ceph_bio_iter *bio_pos, u32 off, u32 bytes)
+{
+	struct ceph_bio_iter it = *bio_pos;
+
+	ceph_bio_iter_advance(&it, off);
+	ceph_bio_iter_advance_step(&it, bytes, ({
+		memzero_bvec(&bv);
+	}));
+}
+
+static void zero_bvecs(struct ceph_bvec_iter *bvec_pos, u32 off, u32 bytes)
+{
+	struct ceph_bvec_iter it = *bvec_pos;
+
+	ceph_bvec_iter_advance(&it, off);
+	ceph_bvec_iter_advance_step(&it, bytes, ({
+		memzero_bvec(&bv);
+	}));
+}
+
+/*
+ * Zero a range in @obj_req data buffer defined by a bio (list) or
+ * (private) bio_vec array.
+ *
+ * @off is relative to the start of the data buffer.
+ */
+static void rbd_obj_zero_range(struct rbd_obj_request *obj_req, u32 off,
+			       u32 bytes)
+{
+	dout("%s %p data buf %u~%u\n", __func__, obj_req, off, bytes);
+
+	switch (obj_req->img_request->data_type) {
+	case OBJ_REQUEST_BIO:
+		zero_bios(&obj_req->bio_pos, off, bytes);
+		break;
+	case OBJ_REQUEST_BVECS:
+	case OBJ_REQUEST_OWN_BVECS:
+		zero_bvecs(&obj_req->bvec_pos, off, bytes);
+		break;
+	default:
+		BUG();
+	}
+}
+
+static void rbd_obj_request_destroy(struct kref *kref);
+static void rbd_obj_request_put(struct rbd_obj_request *obj_request)
+{
+	rbd_assert(obj_request != NULL);
+	dout("%s: obj %p (was %d)\n", __func__, obj_request,
+		kref_read(&obj_request->kref));
+	kref_put(&obj_request->kref, rbd_obj_request_destroy);
+}
+
+static inline void rbd_img_obj_request_add(struct rbd_img_request *img_request,
+					struct rbd_obj_request *obj_request)
+{
+	rbd_assert(obj_request->img_request == NULL);
+
+	/* Image request now owns object's original reference */
+	obj_request->img_request = img_request;
+	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
+}
+
+static inline void rbd_img_obj_request_del(struct rbd_img_request *img_request,
+					struct rbd_obj_request *obj_request)
+{
+	dout("%s: img %p obj %p\n", __func__, img_request, obj_request);
+	list_del(&obj_request->ex.oe_item);
+	rbd_assert(obj_request->img_request == img_request);
+	rbd_obj_request_put(obj_request);
+}
+
+static void rbd_osd_submit(struct ceph_osd_request *osd_req)
+{
+	struct rbd_obj_request *obj_req = osd_req->r_priv;
+
+	dout("%s osd_req %p for obj_req %p objno %llu %llu~%llu\n",
+	     __func__, osd_req, obj_req, obj_req->ex.oe_objno,
+	     obj_req->ex.oe_off, obj_req->ex.oe_len);
+	ceph_osdc_start_request(osd_req->r_osdc, osd_req);
+}
+
+/*
+ * The default/initial value for all image request flags is 0.  Each
+ * is conditionally set to 1 at image request initialization time
+ * and currently never change thereafter.
+ */
+static void img_request_layered_set(struct rbd_img_request *img_request)
+{
+	set_bit(IMG_REQ_LAYERED, &img_request->flags);
+}
+
+static bool img_request_layered_test(struct rbd_img_request *img_request)
+{
+	return test_bit(IMG_REQ_LAYERED, &img_request->flags) != 0;
+}
+
+static bool rbd_obj_is_entire(struct rbd_obj_request *obj_req)
+{
+	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
+
+	return !obj_req->ex.oe_off &&
+	       obj_req->ex.oe_len == rbd_dev->layout.object_size;
+}
+
+static bool rbd_obj_is_tail(struct rbd_obj_request *obj_req)
+{
+	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
+
+	return obj_req->ex.oe_off + obj_req->ex.oe_len ==
+					rbd_dev->layout.object_size;
+}
+
+/*
+ * Must be called after rbd_obj_calc_img_extents().
+ */
+static void rbd_obj_set_copyup_enabled(struct rbd_obj_request *obj_req)
+{
+	rbd_assert(obj_req->img_request->snapc);
+
+	if (obj_req->img_request->op_type == OBJ_OP_DISCARD) {
+		dout("%s %p objno %llu discard\n", __func__, obj_req,
+		     obj_req->ex.oe_objno);
+		return;
+	}
+
+	if (!obj_req->num_img_extents) {
+		dout("%s %p objno %llu not overlapping\n", __func__, obj_req,
+		     obj_req->ex.oe_objno);
+		return;
+	}
+
+	if (rbd_obj_is_entire(obj_req) &&
+	    !obj_req->img_request->snapc->num_snaps) {
+		dout("%s %p objno %llu entire\n", __func__, obj_req,
+		     obj_req->ex.oe_objno);
+		return;
+	}
+
+	obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ENABLED;
+}
+
+static u64 rbd_obj_img_extents_bytes(struct rbd_obj_request *obj_req)
+{
+	return ceph_file_extents_bytes(obj_req->img_extents,
+				       obj_req->num_img_extents);
+}
+
+static bool rbd_img_is_write(struct rbd_img_request *img_req)
+{
+	switch (img_req->op_type) {
+	case OBJ_OP_READ:
+		return false;
+	case OBJ_OP_WRITE:
+	case OBJ_OP_DISCARD:
+	case OBJ_OP_ZEROOUT:
+		return true;
+	default:
+		BUG();
+	}
+}
+
+static void rbd_osd_req_callback(struct ceph_osd_request *osd_req)
+{
+	struct rbd_obj_request *obj_req = osd_req->r_priv;
+	int result;
+
+	dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
+	     osd_req->r_result, obj_req);
+
+	/*
+	 * Writes aren't allowed to return a data payload.  In some
+	 * guarded write cases (e.g. stat + zero on an empty object)
+	 * a stat response makes it through, but we don't care.
+	 */
+	if (osd_req->r_result > 0 && rbd_img_is_write(obj_req->img_request))
+		result = 0;
+	else
+		result = osd_req->r_result;
+
+	rbd_obj_handle_request(obj_req, result);
+}
+
+static void rbd_osd_format_read(struct ceph_osd_request *osd_req)
+{
+	struct rbd_obj_request *obj_request = osd_req->r_priv;
+	struct rbd_device *rbd_dev = obj_request->img_request->rbd_dev;
+	struct ceph_options *opt = rbd_dev->rbd_client->client->options;
+
+	osd_req->r_flags = CEPH_OSD_FLAG_READ | opt->read_from_replica;
+	osd_req->r_snapid = obj_request->img_request->snap_id;
+}
+
+static void rbd_osd_format_write(struct ceph_osd_request *osd_req)
+{
+	struct rbd_obj_request *obj_request = osd_req->r_priv;
+
+	osd_req->r_flags = CEPH_OSD_FLAG_WRITE;
+	ktime_get_real_ts64(&osd_req->r_mtime);
+	osd_req->r_data_offset = obj_request->ex.oe_off;
+}
+
+static struct ceph_osd_request *
+__rbd_obj_add_osd_request(struct rbd_obj_request *obj_req,
+			  struct ceph_snap_context *snapc, int num_ops)
+{
+	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+	struct ceph_osd_request *req;
+	const char *name_format = rbd_dev->image_format == 1 ?
+				      RBD_V1_DATA_FORMAT : RBD_V2_DATA_FORMAT;
+	int ret;
+
+	req = ceph_osdc_alloc_request(osdc, snapc, num_ops, false, GFP_NOIO);
+	if (!req)
+		return ERR_PTR(-ENOMEM);
+
+	list_add_tail(&req->r_private_item, &obj_req->osd_reqs);
+	req->r_callback = rbd_osd_req_callback;
+	req->r_priv = obj_req;
+
+	/*
+	 * Data objects may be stored in a separate pool, but always in
+	 * the same namespace in that pool as the header in its pool.
+	 */
+	ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
+	req->r_base_oloc.pool = rbd_dev->layout.pool_id;
+
+	ret = ceph_oid_aprintf(&req->r_base_oid, GFP_NOIO, name_format,
+			       rbd_dev->header.object_prefix,
+			       obj_req->ex.oe_objno);
+	if (ret)
+		return ERR_PTR(ret);
+
+	return req;
+}
+
+static struct ceph_osd_request *
+rbd_obj_add_osd_request(struct rbd_obj_request *obj_req, int num_ops)
+{
+	rbd_assert(obj_req->img_request->snapc);
+	return __rbd_obj_add_osd_request(obj_req, obj_req->img_request->snapc,
+					 num_ops);
+}
+
+static struct rbd_obj_request *rbd_obj_request_create(void)
+{
+	struct rbd_obj_request *obj_request;
+
+	obj_request = kmem_cache_zalloc(rbd_obj_request_cache, GFP_NOIO);
+	if (!obj_request)
+		return NULL;
+
+	ceph_object_extent_init(&obj_request->ex);
+	INIT_LIST_HEAD(&obj_request->osd_reqs);
+	mutex_init(&obj_request->state_mutex);
+	kref_init(&obj_request->kref);
+
+	dout("%s %p\n", __func__, obj_request);
+	return obj_request;
+}
+
+static void rbd_obj_request_destroy(struct kref *kref)
+{
+	struct rbd_obj_request *obj_request;
+	struct ceph_osd_request *osd_req;
+	u32 i;
+
+	obj_request = container_of(kref, struct rbd_obj_request, kref);
+
+	dout("%s: obj %p\n", __func__, obj_request);
+
+	while (!list_empty(&obj_request->osd_reqs)) {
+		osd_req = list_first_entry(&obj_request->osd_reqs,
+				    struct ceph_osd_request, r_private_item);
+		list_del_init(&osd_req->r_private_item);
+		ceph_osdc_put_request(osd_req);
+	}
+
+	switch (obj_request->img_request->data_type) {
+	case OBJ_REQUEST_NODATA:
+	case OBJ_REQUEST_BIO:
+	case OBJ_REQUEST_BVECS:
+		break;		/* Nothing to do */
+	case OBJ_REQUEST_OWN_BVECS:
+		kfree(obj_request->bvec_pos.bvecs);
+		break;
+	default:
+		BUG();
+	}
+
+	kfree(obj_request->img_extents);
+	if (obj_request->copyup_bvecs) {
+		for (i = 0; i < obj_request->copyup_bvec_count; i++) {
+			if (obj_request->copyup_bvecs[i].bv_page)
+				__free_page(obj_request->copyup_bvecs[i].bv_page);
+		}
+		kfree(obj_request->copyup_bvecs);
+	}
+
+	kmem_cache_free(rbd_obj_request_cache, obj_request);
+}
+
+/* It's OK to call this for a device with no parent */
+
+static void rbd_spec_put(struct rbd_spec *spec);
+static void rbd_dev_unparent(struct rbd_device *rbd_dev)
+{
+	rbd_dev_remove_parent(rbd_dev);
+	rbd_spec_put(rbd_dev->parent_spec);
+	rbd_dev->parent_spec = NULL;
+	rbd_dev->parent_overlap = 0;
+}
+
+/*
+ * Parent image reference counting is used to determine when an
+ * image's parent fields can be safely torn down--after there are no
+ * more in-flight requests to the parent image.  When the last
+ * reference is dropped, cleaning them up is safe.
+ */
+static void rbd_dev_parent_put(struct rbd_device *rbd_dev)
+{
+	int counter;
+
+	if (!rbd_dev->parent_spec)
+		return;
+
+	counter = atomic_dec_return_safe(&rbd_dev->parent_ref);
+	if (counter > 0)
+		return;
+
+	/* Last reference; clean up parent data structures */
+
+	if (!counter)
+		rbd_dev_unparent(rbd_dev);
+	else
+		rbd_warn(rbd_dev, "parent reference underflow");
+}
+
+/*
+ * If an image has a non-zero parent overlap, get a reference to its
+ * parent.
+ *
+ * Returns true if the rbd device has a parent with a non-zero
+ * overlap and a reference for it was successfully taken, or
+ * false otherwise.
+ */
+static bool rbd_dev_parent_get(struct rbd_device *rbd_dev)
+{
+	int counter = 0;
+
+	if (!rbd_dev->parent_spec)
+		return false;
+
+	if (rbd_dev->parent_overlap)
+		counter = atomic_inc_return_safe(&rbd_dev->parent_ref);
+
+	if (counter < 0)
+		rbd_warn(rbd_dev, "parent reference overflow");
+
+	return counter > 0;
+}
+
+static void rbd_img_request_init(struct rbd_img_request *img_request,
+				 struct rbd_device *rbd_dev,
+				 enum obj_operation_type op_type)
+{
+	memset(img_request, 0, sizeof(*img_request));
+
+	img_request->rbd_dev = rbd_dev;
+	img_request->op_type = op_type;
+
+	INIT_LIST_HEAD(&img_request->lock_item);
+	INIT_LIST_HEAD(&img_request->object_extents);
+	mutex_init(&img_request->state_mutex);
+}
+
+/*
+ * Only snap_id is captured here, for reads.  For writes, snapshot
+ * context is captured in rbd_img_object_requests() after exclusive
+ * lock is ensured to be held.
+ */
+static void rbd_img_capture_header(struct rbd_img_request *img_req)
+{
+	struct rbd_device *rbd_dev = img_req->rbd_dev;
+
+	lockdep_assert_held(&rbd_dev->header_rwsem);
+
+	if (!rbd_img_is_write(img_req))
+		img_req->snap_id = rbd_dev->spec->snap_id;
+
+	if (rbd_dev_parent_get(rbd_dev))
+		img_request_layered_set(img_req);
+}
+
+static void rbd_img_request_destroy(struct rbd_img_request *img_request)
+{
+	struct rbd_obj_request *obj_request;
+	struct rbd_obj_request *next_obj_request;
+
+	dout("%s: img %p\n", __func__, img_request);
+
+	WARN_ON(!list_empty(&img_request->lock_item));
+	for_each_obj_request_safe(img_request, obj_request, next_obj_request)
+		rbd_img_obj_request_del(img_request, obj_request);
+
+	if (img_request_layered_test(img_request))
+		rbd_dev_parent_put(img_request->rbd_dev);
+
+	if (rbd_img_is_write(img_request))
+		ceph_put_snap_context(img_request->snapc);
+
+	if (test_bit(IMG_REQ_CHILD, &img_request->flags))
+		kmem_cache_free(rbd_img_request_cache, img_request);
+}
+
+#define BITS_PER_OBJ	2
+#define OBJS_PER_BYTE	(BITS_PER_BYTE / BITS_PER_OBJ)
+#define OBJ_MASK	((1 << BITS_PER_OBJ) - 1)
+
+static void __rbd_object_map_index(struct rbd_device *rbd_dev, u64 objno,
+				   u64 *index, u8 *shift)
+{
+	u32 off;
+
+	rbd_assert(objno < rbd_dev->object_map_size);
+	*index = div_u64_rem(objno, OBJS_PER_BYTE, &off);
+	*shift = (OBJS_PER_BYTE - off - 1) * BITS_PER_OBJ;
+}
+
+static u8 __rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno)
+{
+	u64 index;
+	u8 shift;
+
+	lockdep_assert_held(&rbd_dev->object_map_lock);
+	__rbd_object_map_index(rbd_dev, objno, &index, &shift);
+	return (rbd_dev->object_map[index] >> shift) & OBJ_MASK;
+}
+
+static void __rbd_object_map_set(struct rbd_device *rbd_dev, u64 objno, u8 val)
+{
+	u64 index;
+	u8 shift;
+	u8 *p;
+
+	lockdep_assert_held(&rbd_dev->object_map_lock);
+	rbd_assert(!(val & ~OBJ_MASK));
+
+	__rbd_object_map_index(rbd_dev, objno, &index, &shift);
+	p = &rbd_dev->object_map[index];
+	*p = (*p & ~(OBJ_MASK << shift)) | (val << shift);
+}
+
+static u8 rbd_object_map_get(struct rbd_device *rbd_dev, u64 objno)
+{
+	u8 state;
+
+	spin_lock(&rbd_dev->object_map_lock);
+	state = __rbd_object_map_get(rbd_dev, objno);
+	spin_unlock(&rbd_dev->object_map_lock);
+	return state;
+}
+
+static bool use_object_map(struct rbd_device *rbd_dev)
+{
+	/*
+	 * An image mapped read-only can't use the object map -- it isn't
+	 * loaded because the header lock isn't acquired.  Someone else can
+	 * write to the image and update the object map behind our back.
+	 *
+	 * A snapshot can't be written to, so using the object map is always
+	 * safe.
+	 */
+	if (!rbd_is_snap(rbd_dev) && rbd_is_ro(rbd_dev))
+		return false;
+
+	return ((rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) &&
+		!(rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID));
+}
+
+static bool rbd_object_map_may_exist(struct rbd_device *rbd_dev, u64 objno)
+{
+	u8 state;
+
+	/* fall back to default logic if object map is disabled or invalid */
+	if (!use_object_map(rbd_dev))
+		return true;
+
+	state = rbd_object_map_get(rbd_dev, objno);
+	return state != OBJECT_NONEXISTENT;
+}
+
+static void rbd_object_map_name(struct rbd_device *rbd_dev, u64 snap_id,
+				struct ceph_object_id *oid)
+{
+	if (snap_id == CEPH_NOSNAP)
+		ceph_oid_printf(oid, "%s%s", RBD_OBJECT_MAP_PREFIX,
+				rbd_dev->spec->image_id);
+	else
+		ceph_oid_printf(oid, "%s%s.%016llx", RBD_OBJECT_MAP_PREFIX,
+				rbd_dev->spec->image_id, snap_id);
+}
+
+static int rbd_object_map_lock(struct rbd_device *rbd_dev)
+{
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+	CEPH_DEFINE_OID_ONSTACK(oid);
+	u8 lock_type;
+	char *lock_tag;
+	struct ceph_locker *lockers;
+	u32 num_lockers;
+	bool broke_lock = false;
+	int ret;
+
+	rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid);
+
+again:
+	ret = ceph_cls_lock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME,
+			    CEPH_CLS_LOCK_EXCLUSIVE, "", "", "", 0);
+	if (ret != -EBUSY || broke_lock) {
+		if (ret == -EEXIST)
+			ret = 0; /* already locked by myself */
+		if (ret)
+			rbd_warn(rbd_dev, "failed to lock object map: %d", ret);
+		return ret;
+	}
+
+	ret = ceph_cls_lock_info(osdc, &oid, &rbd_dev->header_oloc,
+				 RBD_LOCK_NAME, &lock_type, &lock_tag,
+				 &lockers, &num_lockers);
+	if (ret) {
+		if (ret == -ENOENT)
+			goto again;
+
+		rbd_warn(rbd_dev, "failed to get object map lockers: %d", ret);
+		return ret;
+	}
+
+	kfree(lock_tag);
+	if (num_lockers == 0)
+		goto again;
+
+	rbd_warn(rbd_dev, "breaking object map lock owned by %s%llu",
+		 ENTITY_NAME(lockers[0].id.name));
+
+	ret = ceph_cls_break_lock(osdc, &oid, &rbd_dev->header_oloc,
+				  RBD_LOCK_NAME, lockers[0].id.cookie,
+				  &lockers[0].id.name);
+	ceph_free_lockers(lockers, num_lockers);
+	if (ret) {
+		if (ret == -ENOENT)
+			goto again;
+
+		rbd_warn(rbd_dev, "failed to break object map lock: %d", ret);
+		return ret;
+	}
+
+	broke_lock = true;
+	goto again;
+}
+
+static void rbd_object_map_unlock(struct rbd_device *rbd_dev)
+{
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+	CEPH_DEFINE_OID_ONSTACK(oid);
+	int ret;
+
+	rbd_object_map_name(rbd_dev, CEPH_NOSNAP, &oid);
+
+	ret = ceph_cls_unlock(osdc, &oid, &rbd_dev->header_oloc, RBD_LOCK_NAME,
+			      "");
+	if (ret && ret != -ENOENT)
+		rbd_warn(rbd_dev, "failed to unlock object map: %d", ret);
+}
+
+static int decode_object_map_header(void **p, void *end, u64 *object_map_size)
+{
+	u8 struct_v;
+	u32 struct_len;
+	u32 header_len;
+	void *header_end;
+	int ret;
+
+	ceph_decode_32_safe(p, end, header_len, e_inval);
+	header_end = *p + header_len;
+
+	ret = ceph_start_decoding(p, end, 1, "BitVector header", &struct_v,
+				  &struct_len);
+	if (ret)
+		return ret;
+
+	ceph_decode_64_safe(p, end, *object_map_size, e_inval);
+
+	*p = header_end;
+	return 0;
+
+e_inval:
+	return -EINVAL;
+}
+
+static int __rbd_object_map_load(struct rbd_device *rbd_dev)
+{
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+	CEPH_DEFINE_OID_ONSTACK(oid);
+	struct page **pages;
+	void *p, *end;
+	size_t reply_len;
+	u64 num_objects;
+	u64 object_map_bytes;
+	u64 object_map_size;
+	int num_pages;
+	int ret;
+
+	rbd_assert(!rbd_dev->object_map && !rbd_dev->object_map_size);
+
+	num_objects = ceph_get_num_objects(&rbd_dev->layout,
+					   rbd_dev->mapping.size);
+	object_map_bytes = DIV_ROUND_UP_ULL(num_objects * BITS_PER_OBJ,
+					    BITS_PER_BYTE);
+	num_pages = calc_pages_for(0, object_map_bytes) + 1;
+	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
+	if (IS_ERR(pages))
+		return PTR_ERR(pages);
+
+	reply_len = num_pages * PAGE_SIZE;
+	rbd_object_map_name(rbd_dev, rbd_dev->spec->snap_id, &oid);
+	ret = ceph_osdc_call(osdc, &oid, &rbd_dev->header_oloc,
+			     "rbd", "object_map_load", CEPH_OSD_FLAG_READ,
+			     NULL, 0, pages, &reply_len);
+	if (ret)
+		goto out;
+
+	p = page_address(pages[0]);
+	end = p + min(reply_len, (size_t)PAGE_SIZE);
+	ret = decode_object_map_header(&p, end, &object_map_size);
+	if (ret)
+		goto out;
+
+	if (object_map_size != num_objects) {
+		rbd_warn(rbd_dev, "object map size mismatch: %llu vs %llu",
+			 object_map_size, num_objects);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (offset_in_page(p) + object_map_bytes > reply_len) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	rbd_dev->object_map = kvmalloc(object_map_bytes, GFP_KERNEL);
+	if (!rbd_dev->object_map) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	rbd_dev->object_map_size = object_map_size;
+	ceph_copy_from_page_vector(pages, rbd_dev->object_map,
+				   offset_in_page(p), object_map_bytes);
+
+out:
+	ceph_release_page_vector(pages, num_pages);
+	return ret;
+}
+
+static void rbd_object_map_free(struct rbd_device *rbd_dev)
+{
+	kvfree(rbd_dev->object_map);
+	rbd_dev->object_map = NULL;
+	rbd_dev->object_map_size = 0;
+}
+
+static int rbd_object_map_load(struct rbd_device *rbd_dev)
+{
+	int ret;
+
+	ret = __rbd_object_map_load(rbd_dev);
+	if (ret)
+		return ret;
+
+	ret = rbd_dev_v2_get_flags(rbd_dev);
+	if (ret) {
+		rbd_object_map_free(rbd_dev);
+		return ret;
+	}
+
+	if (rbd_dev->object_map_flags & RBD_FLAG_OBJECT_MAP_INVALID)
+		rbd_warn(rbd_dev, "object map is invalid");
+
+	return 0;
+}
+
+static int rbd_object_map_open(struct rbd_device *rbd_dev)
+{
+	int ret;
+
+	ret = rbd_object_map_lock(rbd_dev);
+	if (ret)
+		return ret;
+
+	ret = rbd_object_map_load(rbd_dev);
+	if (ret) {
+		rbd_object_map_unlock(rbd_dev);
+		return ret;
+	}
+
+	return 0;
+}
+
+static void rbd_object_map_close(struct rbd_device *rbd_dev)
+{
+	rbd_object_map_free(rbd_dev);
+	rbd_object_map_unlock(rbd_dev);
+}
+
+/*
+ * This function needs snap_id (or more precisely just something to
+ * distinguish between HEAD and snapshot object maps), new_state and
+ * current_state that were passed to rbd_object_map_update().
+ *
+ * To avoid allocating and stashing a context we piggyback on the OSD
+ * request.  A HEAD update has two ops (assert_locked).  For new_state
+ * and current_state we decode our own object_map_update op, encoded in
+ * rbd_cls_object_map_update().
+ */
+static int rbd_object_map_update_finish(struct rbd_obj_request *obj_req,
+					struct ceph_osd_request *osd_req)
+{
+	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
+	struct ceph_osd_data *osd_data;
+	u64 objno;
+	u8 state, new_state, current_state;
+	bool has_current_state;
+	void *p;
+
+	if (osd_req->r_result)
+		return osd_req->r_result;
+
+	/*
+	 * Nothing to do for a snapshot object map.
+	 */
+	if (osd_req->r_num_ops == 1)
+		return 0;
+
+	/*
+	 * Update in-memory HEAD object map.
+	 */
+	rbd_assert(osd_req->r_num_ops == 2);
+	osd_data = osd_req_op_data(osd_req, 1, cls, request_data);
+	rbd_assert(osd_data->type == CEPH_OSD_DATA_TYPE_PAGES);
+
+	p = page_address(osd_data->pages[0]);
+	objno = ceph_decode_64(&p);
+	rbd_assert(objno == obj_req->ex.oe_objno);
+	rbd_assert(ceph_decode_64(&p) == objno + 1);
+	new_state = ceph_decode_8(&p);
+	has_current_state = ceph_decode_8(&p);
+	if (has_current_state)
+		current_state = ceph_decode_8(&p);
+
+	spin_lock(&rbd_dev->object_map_lock);
+	state = __rbd_object_map_get(rbd_dev, objno);
+	if (!has_current_state || current_state == state ||
+	    (current_state == OBJECT_EXISTS && state == OBJECT_EXISTS_CLEAN))
+		__rbd_object_map_set(rbd_dev, objno, new_state);
+	spin_unlock(&rbd_dev->object_map_lock);
+
+	return 0;
+}
+
+static void rbd_object_map_callback(struct ceph_osd_request *osd_req)
+{
+	struct rbd_obj_request *obj_req = osd_req->r_priv;
+	int result;
+
+	dout("%s osd_req %p result %d for obj_req %p\n", __func__, osd_req,
+	     osd_req->r_result, obj_req);
+
+	result = rbd_object_map_update_finish(obj_req, osd_req);
+	rbd_obj_handle_request(obj_req, result);
+}
+
+static bool update_needed(struct rbd_device *rbd_dev, u64 objno, u8 new_state)
+{
+	u8 state = rbd_object_map_get(rbd_dev, objno);
+
+	if (state == new_state ||
+	    (new_state == OBJECT_PENDING && state == OBJECT_NONEXISTENT) ||
+	    (new_state == OBJECT_NONEXISTENT && state != OBJECT_PENDING))
+		return false;
+
+	return true;
+}
+
+static int rbd_cls_object_map_update(struct ceph_osd_request *req,
+				     int which, u64 objno, u8 new_state,
+				     const u8 *current_state)
+{
+	struct page **pages;
+	void *p, *start;
+	int ret;
+
+	ret = osd_req_op_cls_init(req, which, "rbd", "object_map_update");
+	if (ret)
+		return ret;
+
+	pages = ceph_alloc_page_vector(1, GFP_NOIO);
+	if (IS_ERR(pages))
+		return PTR_ERR(pages);
+
+	p = start = page_address(pages[0]);
+	ceph_encode_64(&p, objno);
+	ceph_encode_64(&p, objno + 1);
+	ceph_encode_8(&p, new_state);
+	if (current_state) {
+		ceph_encode_8(&p, 1);
+		ceph_encode_8(&p, *current_state);
+	} else {
+		ceph_encode_8(&p, 0);
+	}
+
+	osd_req_op_cls_request_data_pages(req, which, pages, p - start, 0,
+					  false, true);
+	return 0;
+}
+
+/*
+ * Return:
+ *   0 - object map update sent
+ *   1 - object map update isn't needed
+ *  <0 - error
+ */
+static int rbd_object_map_update(struct rbd_obj_request *obj_req, u64 snap_id,
+				 u8 new_state, const u8 *current_state)
+{
+	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+	struct ceph_osd_request *req;
+	int num_ops = 1;
+	int which = 0;
+	int ret;
+
+	if (snap_id == CEPH_NOSNAP) {
+		if (!update_needed(rbd_dev, obj_req->ex.oe_objno, new_state))
+			return 1;
+
+		num_ops++; /* assert_locked */
+	}
+
+	req = ceph_osdc_alloc_request(osdc, NULL, num_ops, false, GFP_NOIO);
+	if (!req)
+		return -ENOMEM;
+
+	list_add_tail(&req->r_private_item, &obj_req->osd_reqs);
+	req->r_callback = rbd_object_map_callback;
+	req->r_priv = obj_req;
+
+	rbd_object_map_name(rbd_dev, snap_id, &req->r_base_oid);
+	ceph_oloc_copy(&req->r_base_oloc, &rbd_dev->header_oloc);
+	req->r_flags = CEPH_OSD_FLAG_WRITE;
+	ktime_get_real_ts64(&req->r_mtime);
+
+	if (snap_id == CEPH_NOSNAP) {
+		/*
+		 * Protect against possible race conditions during lock
+		 * ownership transitions.
+		 */
+		ret = ceph_cls_assert_locked(req, which++, RBD_LOCK_NAME,
+					     CEPH_CLS_LOCK_EXCLUSIVE, "", "");
+		if (ret)
+			return ret;
+	}
+
+	ret = rbd_cls_object_map_update(req, which, obj_req->ex.oe_objno,
+					new_state, current_state);
+	if (ret)
+		return ret;
+
+	ret = ceph_osdc_alloc_messages(req, GFP_NOIO);
+	if (ret)
+		return ret;
+
+	ceph_osdc_start_request(osdc, req);
+	return 0;
+}
+
+static void prune_extents(struct ceph_file_extent *img_extents,
+			  u32 *num_img_extents, u64 overlap)
+{
+	u32 cnt = *num_img_extents;
+
+	/* drop extents completely beyond the overlap */
+	while (cnt && img_extents[cnt - 1].fe_off >= overlap)
+		cnt--;
+
+	if (cnt) {
+		struct ceph_file_extent *ex = &img_extents[cnt - 1];
+
+		/* trim final overlapping extent */
+		if (ex->fe_off + ex->fe_len > overlap)
+			ex->fe_len = overlap - ex->fe_off;
+	}
+
+	*num_img_extents = cnt;
+}
+
+/*
+ * Determine the byte range(s) covered by either just the object extent
+ * or the entire object in the parent image.
+ */
+static int rbd_obj_calc_img_extents(struct rbd_obj_request *obj_req,
+				    bool entire)
+{
+	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
+	int ret;
+
+	if (!rbd_dev->parent_overlap)
+		return 0;
+
+	ret = ceph_extent_to_file(&rbd_dev->layout, obj_req->ex.oe_objno,
+				  entire ? 0 : obj_req->ex.oe_off,
+				  entire ? rbd_dev->layout.object_size :
+							obj_req->ex.oe_len,
+				  &obj_req->img_extents,
+				  &obj_req->num_img_extents);
+	if (ret)
+		return ret;
+
+	prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
+		      rbd_dev->parent_overlap);
+	return 0;
+}
+
+static void rbd_osd_setup_data(struct ceph_osd_request *osd_req, int which)
+{
+	struct rbd_obj_request *obj_req = osd_req->r_priv;
+
+	switch (obj_req->img_request->data_type) {
+	case OBJ_REQUEST_BIO:
+		osd_req_op_extent_osd_data_bio(osd_req, which,
+					       &obj_req->bio_pos,
+					       obj_req->ex.oe_len);
+		break;
+	case OBJ_REQUEST_BVECS:
+	case OBJ_REQUEST_OWN_BVECS:
+		rbd_assert(obj_req->bvec_pos.iter.bi_size ==
+							obj_req->ex.oe_len);
+		rbd_assert(obj_req->bvec_idx == obj_req->bvec_count);
+		osd_req_op_extent_osd_data_bvec_pos(osd_req, which,
+						    &obj_req->bvec_pos);
+		break;
+	default:
+		BUG();
+	}
+}
+
+static int rbd_osd_setup_stat(struct ceph_osd_request *osd_req, int which)
+{
+	struct page **pages;
+
+	/*
+	 * The response data for a STAT call consists of:
+	 *     le64 length;
+	 *     struct {
+	 *         le32 tv_sec;
+	 *         le32 tv_nsec;
+	 *     } mtime;
+	 */
+	pages = ceph_alloc_page_vector(1, GFP_NOIO);
+	if (IS_ERR(pages))
+		return PTR_ERR(pages);
+
+	osd_req_op_init(osd_req, which, CEPH_OSD_OP_STAT, 0);
+	osd_req_op_raw_data_in_pages(osd_req, which, pages,
+				     8 + sizeof(struct ceph_timespec),
+				     0, false, true);
+	return 0;
+}
+
+static int rbd_osd_setup_copyup(struct ceph_osd_request *osd_req, int which,
+				u32 bytes)
+{
+	struct rbd_obj_request *obj_req = osd_req->r_priv;
+	int ret;
+
+	ret = osd_req_op_cls_init(osd_req, which, "rbd", "copyup");
+	if (ret)
+		return ret;
+
+	osd_req_op_cls_request_data_bvecs(osd_req, which, obj_req->copyup_bvecs,
+					  obj_req->copyup_bvec_count, bytes);
+	return 0;
+}
+
+static int rbd_obj_init_read(struct rbd_obj_request *obj_req)
+{
+	obj_req->read_state = RBD_OBJ_READ_START;
+	return 0;
+}
+
+static void __rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req,
+				      int which)
+{
+	struct rbd_obj_request *obj_req = osd_req->r_priv;
+	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
+	u16 opcode;
+
+	if (!use_object_map(rbd_dev) ||
+	    !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST)) {
+		osd_req_op_alloc_hint_init(osd_req, which++,
+					   rbd_dev->layout.object_size,
+					   rbd_dev->layout.object_size,
+					   rbd_dev->opts->alloc_hint_flags);
+	}
+
+	if (rbd_obj_is_entire(obj_req))
+		opcode = CEPH_OSD_OP_WRITEFULL;
+	else
+		opcode = CEPH_OSD_OP_WRITE;
+
+	osd_req_op_extent_init(osd_req, which, opcode,
+			       obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
+	rbd_osd_setup_data(osd_req, which);
+}
+
+static int rbd_obj_init_write(struct rbd_obj_request *obj_req)
+{
+	int ret;
+
+	/* reverse map the entire object onto the parent */
+	ret = rbd_obj_calc_img_extents(obj_req, true);
+	if (ret)
+		return ret;
+
+	obj_req->write_state = RBD_OBJ_WRITE_START;
+	return 0;
+}
+
+static u16 truncate_or_zero_opcode(struct rbd_obj_request *obj_req)
+{
+	return rbd_obj_is_tail(obj_req) ? CEPH_OSD_OP_TRUNCATE :
+					  CEPH_OSD_OP_ZERO;
+}
+
+static void __rbd_osd_setup_discard_ops(struct ceph_osd_request *osd_req,
+					int which)
+{
+	struct rbd_obj_request *obj_req = osd_req->r_priv;
+
+	if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents) {
+		rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION);
+		osd_req_op_init(osd_req, which, CEPH_OSD_OP_DELETE, 0);
+	} else {
+		osd_req_op_extent_init(osd_req, which,
+				       truncate_or_zero_opcode(obj_req),
+				       obj_req->ex.oe_off, obj_req->ex.oe_len,
+				       0, 0);
+	}
+}
+
+static int rbd_obj_init_discard(struct rbd_obj_request *obj_req)
+{
+	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
+	u64 off, next_off;
+	int ret;
+
+	/*
+	 * Align the range to alloc_size boundary and punt on discards
+	 * that are too small to free up any space.
+	 *
+	 * alloc_size == object_size && is_tail() is a special case for
+	 * filestore with filestore_punch_hole = false, needed to allow
+	 * truncate (in addition to delete).
+	 */
+	if (rbd_dev->opts->alloc_size != rbd_dev->layout.object_size ||
+	    !rbd_obj_is_tail(obj_req)) {
+		off = round_up(obj_req->ex.oe_off, rbd_dev->opts->alloc_size);
+		next_off = round_down(obj_req->ex.oe_off + obj_req->ex.oe_len,
+				      rbd_dev->opts->alloc_size);
+		if (off >= next_off)
+			return 1;
+
+		dout("%s %p %llu~%llu -> %llu~%llu\n", __func__,
+		     obj_req, obj_req->ex.oe_off, obj_req->ex.oe_len,
+		     off, next_off - off);
+		obj_req->ex.oe_off = off;
+		obj_req->ex.oe_len = next_off - off;
+	}
+
+	/* reverse map the entire object onto the parent */
+	ret = rbd_obj_calc_img_extents(obj_req, true);
+	if (ret)
+		return ret;
+
+	obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT;
+	if (rbd_obj_is_entire(obj_req) && !obj_req->num_img_extents)
+		obj_req->flags |= RBD_OBJ_FLAG_DELETION;
+
+	obj_req->write_state = RBD_OBJ_WRITE_START;
+	return 0;
+}
+
+static void __rbd_osd_setup_zeroout_ops(struct ceph_osd_request *osd_req,
+					int which)
+{
+	struct rbd_obj_request *obj_req = osd_req->r_priv;
+	u16 opcode;
+
+	if (rbd_obj_is_entire(obj_req)) {
+		if (obj_req->num_img_extents) {
+			if (!(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED))
+				osd_req_op_init(osd_req, which++,
+						CEPH_OSD_OP_CREATE, 0);
+			opcode = CEPH_OSD_OP_TRUNCATE;
+		} else {
+			rbd_assert(obj_req->flags & RBD_OBJ_FLAG_DELETION);
+			osd_req_op_init(osd_req, which++,
+					CEPH_OSD_OP_DELETE, 0);
+			opcode = 0;
+		}
+	} else {
+		opcode = truncate_or_zero_opcode(obj_req);
+	}
+
+	if (opcode)
+		osd_req_op_extent_init(osd_req, which, opcode,
+				       obj_req->ex.oe_off, obj_req->ex.oe_len,
+				       0, 0);
+}
+
+static int rbd_obj_init_zeroout(struct rbd_obj_request *obj_req)
+{
+	int ret;
+
+	/* reverse map the entire object onto the parent */
+	ret = rbd_obj_calc_img_extents(obj_req, true);
+	if (ret)
+		return ret;
+
+	if (!obj_req->num_img_extents) {
+		obj_req->flags |= RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT;
+		if (rbd_obj_is_entire(obj_req))
+			obj_req->flags |= RBD_OBJ_FLAG_DELETION;
+	}
+
+	obj_req->write_state = RBD_OBJ_WRITE_START;
+	return 0;
+}
+
+static int count_write_ops(struct rbd_obj_request *obj_req)
+{
+	struct rbd_img_request *img_req = obj_req->img_request;
+
+	switch (img_req->op_type) {
+	case OBJ_OP_WRITE:
+		if (!use_object_map(img_req->rbd_dev) ||
+		    !(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST))
+			return 2; /* setallochint + write/writefull */
+
+		return 1; /* write/writefull */
+	case OBJ_OP_DISCARD:
+		return 1; /* delete/truncate/zero */
+	case OBJ_OP_ZEROOUT:
+		if (rbd_obj_is_entire(obj_req) && obj_req->num_img_extents &&
+		    !(obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED))
+			return 2; /* create + truncate */
+
+		return 1; /* delete/truncate/zero */
+	default:
+		BUG();
+	}
+}
+
+static void rbd_osd_setup_write_ops(struct ceph_osd_request *osd_req,
+				    int which)
+{
+	struct rbd_obj_request *obj_req = osd_req->r_priv;
+
+	switch (obj_req->img_request->op_type) {
+	case OBJ_OP_WRITE:
+		__rbd_osd_setup_write_ops(osd_req, which);
+		break;
+	case OBJ_OP_DISCARD:
+		__rbd_osd_setup_discard_ops(osd_req, which);
+		break;
+	case OBJ_OP_ZEROOUT:
+		__rbd_osd_setup_zeroout_ops(osd_req, which);
+		break;
+	default:
+		BUG();
+	}
+}
+
+/*
+ * Prune the list of object requests (adjust offset and/or length, drop
+ * redundant requests).  Prepare object request state machines and image
+ * request state machine for execution.
+ */
+static int __rbd_img_fill_request(struct rbd_img_request *img_req)
+{
+	struct rbd_obj_request *obj_req, *next_obj_req;
+	int ret;
+
+	for_each_obj_request_safe(img_req, obj_req, next_obj_req) {
+		switch (img_req->op_type) {
+		case OBJ_OP_READ:
+			ret = rbd_obj_init_read(obj_req);
+			break;
+		case OBJ_OP_WRITE:
+			ret = rbd_obj_init_write(obj_req);
+			break;
+		case OBJ_OP_DISCARD:
+			ret = rbd_obj_init_discard(obj_req);
+			break;
+		case OBJ_OP_ZEROOUT:
+			ret = rbd_obj_init_zeroout(obj_req);
+			break;
+		default:
+			BUG();
+		}
+		if (ret < 0)
+			return ret;
+		if (ret > 0) {
+			rbd_img_obj_request_del(img_req, obj_req);
+			continue;
+		}
+	}
+
+	img_req->state = RBD_IMG_START;
+	return 0;
+}
+
+union rbd_img_fill_iter {
+	struct ceph_bio_iter	bio_iter;
+	struct ceph_bvec_iter	bvec_iter;
+};
+
+struct rbd_img_fill_ctx {
+	enum obj_request_type	pos_type;
+	union rbd_img_fill_iter	*pos;
+	union rbd_img_fill_iter	iter;
+	ceph_object_extent_fn_t	set_pos_fn;
+	ceph_object_extent_fn_t	count_fn;
+	ceph_object_extent_fn_t	copy_fn;
+};
+
+static struct ceph_object_extent *alloc_object_extent(void *arg)
+{
+	struct rbd_img_request *img_req = arg;
+	struct rbd_obj_request *obj_req;
+
+	obj_req = rbd_obj_request_create();
+	if (!obj_req)
+		return NULL;
+
+	rbd_img_obj_request_add(img_req, obj_req);
+	return &obj_req->ex;
+}
+
+/*
+ * While su != os && sc == 1 is technically not fancy (it's the same
+ * layout as su == os && sc == 1), we can't use the nocopy path for it
+ * because ->set_pos_fn() should be called only once per object.
+ * ceph_file_to_extents() invokes action_fn once per stripe unit, so
+ * treat su != os && sc == 1 as fancy.
+ */
+static bool rbd_layout_is_fancy(struct ceph_file_layout *l)
+{
+	return l->stripe_unit != l->object_size;
+}
+
+static int rbd_img_fill_request_nocopy(struct rbd_img_request *img_req,
+				       struct ceph_file_extent *img_extents,
+				       u32 num_img_extents,
+				       struct rbd_img_fill_ctx *fctx)
+{
+	u32 i;
+	int ret;
+
+	img_req->data_type = fctx->pos_type;
+
+	/*
+	 * Create object requests and set each object request's starting
+	 * position in the provided bio (list) or bio_vec array.
+	 */
+	fctx->iter = *fctx->pos;
+	for (i = 0; i < num_img_extents; i++) {
+		ret = ceph_file_to_extents(&img_req->rbd_dev->layout,
+					   img_extents[i].fe_off,
+					   img_extents[i].fe_len,
+					   &img_req->object_extents,
+					   alloc_object_extent, img_req,
+					   fctx->set_pos_fn, &fctx->iter);
+		if (ret)
+			return ret;
+	}
+
+	return __rbd_img_fill_request(img_req);
+}
+
+/*
+ * Map a list of image extents to a list of object extents, create the
+ * corresponding object requests (normally each to a different object,
+ * but not always) and add them to @img_req.  For each object request,
+ * set up its data descriptor to point to the corresponding chunk(s) of
+ * @fctx->pos data buffer.
+ *
+ * Because ceph_file_to_extents() will merge adjacent object extents
+ * together, each object request's data descriptor may point to multiple
+ * different chunks of @fctx->pos data buffer.
+ *
+ * @fctx->pos data buffer is assumed to be large enough.
+ */
+static int rbd_img_fill_request(struct rbd_img_request *img_req,
+				struct ceph_file_extent *img_extents,
+				u32 num_img_extents,
+				struct rbd_img_fill_ctx *fctx)
+{
+	struct rbd_device *rbd_dev = img_req->rbd_dev;
+	struct rbd_obj_request *obj_req;
+	u32 i;
+	int ret;
+
+	if (fctx->pos_type == OBJ_REQUEST_NODATA ||
+	    !rbd_layout_is_fancy(&rbd_dev->layout))
+		return rbd_img_fill_request_nocopy(img_req, img_extents,
+						   num_img_extents, fctx);
+
+	img_req->data_type = OBJ_REQUEST_OWN_BVECS;
+
+	/*
+	 * Create object requests and determine ->bvec_count for each object
+	 * request.  Note that ->bvec_count sum over all object requests may
+	 * be greater than the number of bio_vecs in the provided bio (list)
+	 * or bio_vec array because when mapped, those bio_vecs can straddle
+	 * stripe unit boundaries.
+	 */
+	fctx->iter = *fctx->pos;
+	for (i = 0; i < num_img_extents; i++) {
+		ret = ceph_file_to_extents(&rbd_dev->layout,
+					   img_extents[i].fe_off,
+					   img_extents[i].fe_len,
+					   &img_req->object_extents,
+					   alloc_object_extent, img_req,
+					   fctx->count_fn, &fctx->iter);
+		if (ret)
+			return ret;
+	}
+
+	for_each_obj_request(img_req, obj_req) {
+		obj_req->bvec_pos.bvecs = kmalloc_array(obj_req->bvec_count,
+					      sizeof(*obj_req->bvec_pos.bvecs),
+					      GFP_NOIO);
+		if (!obj_req->bvec_pos.bvecs)
+			return -ENOMEM;
+	}
+
+	/*
+	 * Fill in each object request's private bio_vec array, splitting and
+	 * rearranging the provided bio_vecs in stripe unit chunks as needed.
+	 */
+	fctx->iter = *fctx->pos;
+	for (i = 0; i < num_img_extents; i++) {
+		ret = ceph_iterate_extents(&rbd_dev->layout,
+					   img_extents[i].fe_off,
+					   img_extents[i].fe_len,
+					   &img_req->object_extents,
+					   fctx->copy_fn, &fctx->iter);
+		if (ret)
+			return ret;
+	}
+
+	return __rbd_img_fill_request(img_req);
+}
+
+static int rbd_img_fill_nodata(struct rbd_img_request *img_req,
+			       u64 off, u64 len)
+{
+	struct ceph_file_extent ex = { off, len };
+	union rbd_img_fill_iter dummy = {};
+	struct rbd_img_fill_ctx fctx = {
+		.pos_type = OBJ_REQUEST_NODATA,
+		.pos = &dummy,
+	};
+
+	return rbd_img_fill_request(img_req, &ex, 1, &fctx);
+}
+
+static void set_bio_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
+{
+	struct rbd_obj_request *obj_req =
+	    container_of(ex, struct rbd_obj_request, ex);
+	struct ceph_bio_iter *it = arg;
+
+	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
+	obj_req->bio_pos = *it;
+	ceph_bio_iter_advance(it, bytes);
+}
+
+static void count_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
+{
+	struct rbd_obj_request *obj_req =
+	    container_of(ex, struct rbd_obj_request, ex);
+	struct ceph_bio_iter *it = arg;
+
+	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
+	ceph_bio_iter_advance_step(it, bytes, ({
+		obj_req->bvec_count++;
+	}));
+
+}
+
+static void copy_bio_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
+{
+	struct rbd_obj_request *obj_req =
+	    container_of(ex, struct rbd_obj_request, ex);
+	struct ceph_bio_iter *it = arg;
+
+	dout("%s objno %llu bytes %u\n", __func__, ex->oe_objno, bytes);
+	ceph_bio_iter_advance_step(it, bytes, ({
+		obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
+		obj_req->bvec_pos.iter.bi_size += bv.bv_len;
+	}));
+}
+
+static int __rbd_img_fill_from_bio(struct rbd_img_request *img_req,
+				   struct ceph_file_extent *img_extents,
+				   u32 num_img_extents,
+				   struct ceph_bio_iter *bio_pos)
+{
+	struct rbd_img_fill_ctx fctx = {
+		.pos_type = OBJ_REQUEST_BIO,
+		.pos = (union rbd_img_fill_iter *)bio_pos,
+		.set_pos_fn = set_bio_pos,
+		.count_fn = count_bio_bvecs,
+		.copy_fn = copy_bio_bvecs,
+	};
+
+	return rbd_img_fill_request(img_req, img_extents, num_img_extents,
+				    &fctx);
+}
+
+static int rbd_img_fill_from_bio(struct rbd_img_request *img_req,
+				 u64 off, u64 len, struct bio *bio)
+{
+	struct ceph_file_extent ex = { off, len };
+	struct ceph_bio_iter it = { .bio = bio, .iter = bio->bi_iter };
+
+	return __rbd_img_fill_from_bio(img_req, &ex, 1, &it);
+}
+
+static void set_bvec_pos(struct ceph_object_extent *ex, u32 bytes, void *arg)
+{
+	struct rbd_obj_request *obj_req =
+	    container_of(ex, struct rbd_obj_request, ex);
+	struct ceph_bvec_iter *it = arg;
+
+	obj_req->bvec_pos = *it;
+	ceph_bvec_iter_shorten(&obj_req->bvec_pos, bytes);
+	ceph_bvec_iter_advance(it, bytes);
+}
+
+static void count_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
+{
+	struct rbd_obj_request *obj_req =
+	    container_of(ex, struct rbd_obj_request, ex);
+	struct ceph_bvec_iter *it = arg;
+
+	ceph_bvec_iter_advance_step(it, bytes, ({
+		obj_req->bvec_count++;
+	}));
+}
+
+static void copy_bvecs(struct ceph_object_extent *ex, u32 bytes, void *arg)
+{
+	struct rbd_obj_request *obj_req =
+	    container_of(ex, struct rbd_obj_request, ex);
+	struct ceph_bvec_iter *it = arg;
+
+	ceph_bvec_iter_advance_step(it, bytes, ({
+		obj_req->bvec_pos.bvecs[obj_req->bvec_idx++] = bv;
+		obj_req->bvec_pos.iter.bi_size += bv.bv_len;
+	}));
+}
+
+static int __rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
+				     struct ceph_file_extent *img_extents,
+				     u32 num_img_extents,
+				     struct ceph_bvec_iter *bvec_pos)
+{
+	struct rbd_img_fill_ctx fctx = {
+		.pos_type = OBJ_REQUEST_BVECS,
+		.pos = (union rbd_img_fill_iter *)bvec_pos,
+		.set_pos_fn = set_bvec_pos,
+		.count_fn = count_bvecs,
+		.copy_fn = copy_bvecs,
+	};
+
+	return rbd_img_fill_request(img_req, img_extents, num_img_extents,
+				    &fctx);
+}
+
+static int rbd_img_fill_from_bvecs(struct rbd_img_request *img_req,
+				   struct ceph_file_extent *img_extents,
+				   u32 num_img_extents,
+				   struct bio_vec *bvecs)
+{
+	struct ceph_bvec_iter it = {
+		.bvecs = bvecs,
+		.iter = { .bi_size = ceph_file_extents_bytes(img_extents,
+							     num_img_extents) },
+	};
+
+	return __rbd_img_fill_from_bvecs(img_req, img_extents, num_img_extents,
+					 &it);
+}
+
+static void rbd_img_handle_request_work(struct work_struct *work)
+{
+	struct rbd_img_request *img_req =
+	    container_of(work, struct rbd_img_request, work);
+
+	rbd_img_handle_request(img_req, img_req->work_result);
+}
+
+static void rbd_img_schedule(struct rbd_img_request *img_req, int result)
+{
+	INIT_WORK(&img_req->work, rbd_img_handle_request_work);
+	img_req->work_result = result;
+	queue_work(rbd_wq, &img_req->work);
+}
+
+static bool rbd_obj_may_exist(struct rbd_obj_request *obj_req)
+{
+	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
+
+	if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno)) {
+		obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST;
+		return true;
+	}
+
+	dout("%s %p objno %llu assuming dne\n", __func__, obj_req,
+	     obj_req->ex.oe_objno);
+	return false;
+}
+
+static int rbd_obj_read_object(struct rbd_obj_request *obj_req)
+{
+	struct ceph_osd_request *osd_req;
+	int ret;
+
+	osd_req = __rbd_obj_add_osd_request(obj_req, NULL, 1);
+	if (IS_ERR(osd_req))
+		return PTR_ERR(osd_req);
+
+	osd_req_op_extent_init(osd_req, 0, CEPH_OSD_OP_READ,
+			       obj_req->ex.oe_off, obj_req->ex.oe_len, 0, 0);
+	rbd_osd_setup_data(osd_req, 0);
+	rbd_osd_format_read(osd_req);
+
+	ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
+	if (ret)
+		return ret;
+
+	rbd_osd_submit(osd_req);
+	return 0;
+}
+
+static int rbd_obj_read_from_parent(struct rbd_obj_request *obj_req)
+{
+	struct rbd_img_request *img_req = obj_req->img_request;
+	struct rbd_device *parent = img_req->rbd_dev->parent;
+	struct rbd_img_request *child_img_req;
+	int ret;
+
+	child_img_req = kmem_cache_alloc(rbd_img_request_cache, GFP_NOIO);
+	if (!child_img_req)
+		return -ENOMEM;
+
+	rbd_img_request_init(child_img_req, parent, OBJ_OP_READ);
+	__set_bit(IMG_REQ_CHILD, &child_img_req->flags);
+	child_img_req->obj_request = obj_req;
+
+	down_read(&parent->header_rwsem);
+	rbd_img_capture_header(child_img_req);
+	up_read(&parent->header_rwsem);
+
+	dout("%s child_img_req %p for obj_req %p\n", __func__, child_img_req,
+	     obj_req);
+
+	if (!rbd_img_is_write(img_req)) {
+		switch (img_req->data_type) {
+		case OBJ_REQUEST_BIO:
+			ret = __rbd_img_fill_from_bio(child_img_req,
+						      obj_req->img_extents,
+						      obj_req->num_img_extents,
+						      &obj_req->bio_pos);
+			break;
+		case OBJ_REQUEST_BVECS:
+		case OBJ_REQUEST_OWN_BVECS:
+			ret = __rbd_img_fill_from_bvecs(child_img_req,
+						      obj_req->img_extents,
+						      obj_req->num_img_extents,
+						      &obj_req->bvec_pos);
+			break;
+		default:
+			BUG();
+		}
+	} else {
+		ret = rbd_img_fill_from_bvecs(child_img_req,
+					      obj_req->img_extents,
+					      obj_req->num_img_extents,
+					      obj_req->copyup_bvecs);
+	}
+	if (ret) {
+		rbd_img_request_destroy(child_img_req);
+		return ret;
+	}
+
+	/* avoid parent chain recursion */
+	rbd_img_schedule(child_img_req, 0);
+	return 0;
+}
+
+static bool rbd_obj_advance_read(struct rbd_obj_request *obj_req, int *result)
+{
+	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
+	int ret;
+
+again:
+	switch (obj_req->read_state) {
+	case RBD_OBJ_READ_START:
+		rbd_assert(!*result);
+
+		if (!rbd_obj_may_exist(obj_req)) {
+			*result = -ENOENT;
+			obj_req->read_state = RBD_OBJ_READ_OBJECT;
+			goto again;
+		}
+
+		ret = rbd_obj_read_object(obj_req);
+		if (ret) {
+			*result = ret;
+			return true;
+		}
+		obj_req->read_state = RBD_OBJ_READ_OBJECT;
+		return false;
+	case RBD_OBJ_READ_OBJECT:
+		if (*result == -ENOENT && rbd_dev->parent_overlap) {
+			/* reverse map this object extent onto the parent */
+			ret = rbd_obj_calc_img_extents(obj_req, false);
+			if (ret) {
+				*result = ret;
+				return true;
+			}
+			if (obj_req->num_img_extents) {
+				ret = rbd_obj_read_from_parent(obj_req);
+				if (ret) {
+					*result = ret;
+					return true;
+				}
+				obj_req->read_state = RBD_OBJ_READ_PARENT;
+				return false;
+			}
+		}
+
+		/*
+		 * -ENOENT means a hole in the image -- zero-fill the entire
+		 * length of the request.  A short read also implies zero-fill
+		 * to the end of the request.
+		 */
+		if (*result == -ENOENT) {
+			rbd_obj_zero_range(obj_req, 0, obj_req->ex.oe_len);
+			*result = 0;
+		} else if (*result >= 0) {
+			if (*result < obj_req->ex.oe_len)
+				rbd_obj_zero_range(obj_req, *result,
+						obj_req->ex.oe_len - *result);
+			else
+				rbd_assert(*result == obj_req->ex.oe_len);
+			*result = 0;
+		}
+		return true;
+	case RBD_OBJ_READ_PARENT:
+		/*
+		 * The parent image is read only up to the overlap -- zero-fill
+		 * from the overlap to the end of the request.
+		 */
+		if (!*result) {
+			u32 obj_overlap = rbd_obj_img_extents_bytes(obj_req);
+
+			if (obj_overlap < obj_req->ex.oe_len)
+				rbd_obj_zero_range(obj_req, obj_overlap,
+					    obj_req->ex.oe_len - obj_overlap);
+		}
+		return true;
+	default:
+		BUG();
+	}
+}
+
+static bool rbd_obj_write_is_noop(struct rbd_obj_request *obj_req)
+{
+	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
+
+	if (rbd_object_map_may_exist(rbd_dev, obj_req->ex.oe_objno))
+		obj_req->flags |= RBD_OBJ_FLAG_MAY_EXIST;
+
+	if (!(obj_req->flags & RBD_OBJ_FLAG_MAY_EXIST) &&
+	    (obj_req->flags & RBD_OBJ_FLAG_NOOP_FOR_NONEXISTENT)) {
+		dout("%s %p noop for nonexistent\n", __func__, obj_req);
+		return true;
+	}
+
+	return false;
+}
+
+/*
+ * Return:
+ *   0 - object map update sent
+ *   1 - object map update isn't needed
+ *  <0 - error
+ */
+static int rbd_obj_write_pre_object_map(struct rbd_obj_request *obj_req)
+{
+	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
+	u8 new_state;
+
+	if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
+		return 1;
+
+	if (obj_req->flags & RBD_OBJ_FLAG_DELETION)
+		new_state = OBJECT_PENDING;
+	else
+		new_state = OBJECT_EXISTS;
+
+	return rbd_object_map_update(obj_req, CEPH_NOSNAP, new_state, NULL);
+}
+
+static int rbd_obj_write_object(struct rbd_obj_request *obj_req)
+{
+	struct ceph_osd_request *osd_req;
+	int num_ops = count_write_ops(obj_req);
+	int which = 0;
+	int ret;
+
+	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED)
+		num_ops++; /* stat */
+
+	osd_req = rbd_obj_add_osd_request(obj_req, num_ops);
+	if (IS_ERR(osd_req))
+		return PTR_ERR(osd_req);
+
+	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
+		ret = rbd_osd_setup_stat(osd_req, which++);
+		if (ret)
+			return ret;
+	}
+
+	rbd_osd_setup_write_ops(osd_req, which);
+	rbd_osd_format_write(osd_req);
+
+	ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
+	if (ret)
+		return ret;
+
+	rbd_osd_submit(osd_req);
+	return 0;
+}
+
+/*
+ * copyup_bvecs pages are never highmem pages
+ */
+static bool is_zero_bvecs(struct bio_vec *bvecs, u32 bytes)
+{
+	struct ceph_bvec_iter it = {
+		.bvecs = bvecs,
+		.iter = { .bi_size = bytes },
+	};
+
+	ceph_bvec_iter_advance_step(&it, bytes, ({
+		if (memchr_inv(bvec_virt(&bv), 0, bv.bv_len))
+			return false;
+	}));
+	return true;
+}
+
+#define MODS_ONLY	U32_MAX
+
+static int rbd_obj_copyup_empty_snapc(struct rbd_obj_request *obj_req,
+				      u32 bytes)
+{
+	struct ceph_osd_request *osd_req;
+	int ret;
+
+	dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
+	rbd_assert(bytes > 0 && bytes != MODS_ONLY);
+
+	osd_req = __rbd_obj_add_osd_request(obj_req, &rbd_empty_snapc, 1);
+	if (IS_ERR(osd_req))
+		return PTR_ERR(osd_req);
+
+	ret = rbd_osd_setup_copyup(osd_req, 0, bytes);
+	if (ret)
+		return ret;
+
+	rbd_osd_format_write(osd_req);
+
+	ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
+	if (ret)
+		return ret;
+
+	rbd_osd_submit(osd_req);
+	return 0;
+}
+
+static int rbd_obj_copyup_current_snapc(struct rbd_obj_request *obj_req,
+					u32 bytes)
+{
+	struct ceph_osd_request *osd_req;
+	int num_ops = count_write_ops(obj_req);
+	int which = 0;
+	int ret;
+
+	dout("%s obj_req %p bytes %u\n", __func__, obj_req, bytes);
+
+	if (bytes != MODS_ONLY)
+		num_ops++; /* copyup */
+
+	osd_req = rbd_obj_add_osd_request(obj_req, num_ops);
+	if (IS_ERR(osd_req))
+		return PTR_ERR(osd_req);
+
+	if (bytes != MODS_ONLY) {
+		ret = rbd_osd_setup_copyup(osd_req, which++, bytes);
+		if (ret)
+			return ret;
+	}
+
+	rbd_osd_setup_write_ops(osd_req, which);
+	rbd_osd_format_write(osd_req);
+
+	ret = ceph_osdc_alloc_messages(osd_req, GFP_NOIO);
+	if (ret)
+		return ret;
+
+	rbd_osd_submit(osd_req);
+	return 0;
+}
+
+static int setup_copyup_bvecs(struct rbd_obj_request *obj_req, u64 obj_overlap)
+{
+	u32 i;
+
+	rbd_assert(!obj_req->copyup_bvecs);
+	obj_req->copyup_bvec_count = calc_pages_for(0, obj_overlap);
+	obj_req->copyup_bvecs = kcalloc(obj_req->copyup_bvec_count,
+					sizeof(*obj_req->copyup_bvecs),
+					GFP_NOIO);
+	if (!obj_req->copyup_bvecs)
+		return -ENOMEM;
+
+	for (i = 0; i < obj_req->copyup_bvec_count; i++) {
+		unsigned int len = min(obj_overlap, (u64)PAGE_SIZE);
+
+		obj_req->copyup_bvecs[i].bv_page = alloc_page(GFP_NOIO);
+		if (!obj_req->copyup_bvecs[i].bv_page)
+			return -ENOMEM;
+
+		obj_req->copyup_bvecs[i].bv_offset = 0;
+		obj_req->copyup_bvecs[i].bv_len = len;
+		obj_overlap -= len;
+	}
+
+	rbd_assert(!obj_overlap);
+	return 0;
+}
+
+/*
+ * The target object doesn't exist.  Read the data for the entire
+ * target object up to the overlap point (if any) from the parent,
+ * so we can use it for a copyup.
+ */
+static int rbd_obj_copyup_read_parent(struct rbd_obj_request *obj_req)
+{
+	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
+	int ret;
+
+	rbd_assert(obj_req->num_img_extents);
+	prune_extents(obj_req->img_extents, &obj_req->num_img_extents,
+		      rbd_dev->parent_overlap);
+	if (!obj_req->num_img_extents) {
+		/*
+		 * The overlap has become 0 (most likely because the
+		 * image has been flattened).  Re-submit the original write
+		 * request -- pass MODS_ONLY since the copyup isn't needed
+		 * anymore.
+		 */
+		return rbd_obj_copyup_current_snapc(obj_req, MODS_ONLY);
+	}
+
+	ret = setup_copyup_bvecs(obj_req, rbd_obj_img_extents_bytes(obj_req));
+	if (ret)
+		return ret;
+
+	return rbd_obj_read_from_parent(obj_req);
+}
+
+static void rbd_obj_copyup_object_maps(struct rbd_obj_request *obj_req)
+{
+	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
+	struct ceph_snap_context *snapc = obj_req->img_request->snapc;
+	u8 new_state;
+	u32 i;
+	int ret;
+
+	rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending);
+
+	if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
+		return;
+
+	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS)
+		return;
+
+	for (i = 0; i < snapc->num_snaps; i++) {
+		if ((rbd_dev->header.features & RBD_FEATURE_FAST_DIFF) &&
+		    i + 1 < snapc->num_snaps)
+			new_state = OBJECT_EXISTS_CLEAN;
+		else
+			new_state = OBJECT_EXISTS;
+
+		ret = rbd_object_map_update(obj_req, snapc->snaps[i],
+					    new_state, NULL);
+		if (ret < 0) {
+			obj_req->pending.result = ret;
+			return;
+		}
+
+		rbd_assert(!ret);
+		obj_req->pending.num_pending++;
+	}
+}
+
+static void rbd_obj_copyup_write_object(struct rbd_obj_request *obj_req)
+{
+	u32 bytes = rbd_obj_img_extents_bytes(obj_req);
+	int ret;
+
+	rbd_assert(!obj_req->pending.result && !obj_req->pending.num_pending);
+
+	/*
+	 * Only send non-zero copyup data to save some I/O and network
+	 * bandwidth -- zero copyup data is equivalent to the object not
+	 * existing.
+	 */
+	if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ZEROS)
+		bytes = 0;
+
+	if (obj_req->img_request->snapc->num_snaps && bytes > 0) {
+		/*
+		 * Send a copyup request with an empty snapshot context to
+		 * deep-copyup the object through all existing snapshots.
+		 * A second request with the current snapshot context will be
+		 * sent for the actual modification.
+		 */
+		ret = rbd_obj_copyup_empty_snapc(obj_req, bytes);
+		if (ret) {
+			obj_req->pending.result = ret;
+			return;
+		}
+
+		obj_req->pending.num_pending++;
+		bytes = MODS_ONLY;
+	}
+
+	ret = rbd_obj_copyup_current_snapc(obj_req, bytes);
+	if (ret) {
+		obj_req->pending.result = ret;
+		return;
+	}
+
+	obj_req->pending.num_pending++;
+}
+
+static bool rbd_obj_advance_copyup(struct rbd_obj_request *obj_req, int *result)
+{
+	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
+	int ret;
+
+again:
+	switch (obj_req->copyup_state) {
+	case RBD_OBJ_COPYUP_START:
+		rbd_assert(!*result);
+
+		ret = rbd_obj_copyup_read_parent(obj_req);
+		if (ret) {
+			*result = ret;
+			return true;
+		}
+		if (obj_req->num_img_extents)
+			obj_req->copyup_state = RBD_OBJ_COPYUP_READ_PARENT;
+		else
+			obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT;
+		return false;
+	case RBD_OBJ_COPYUP_READ_PARENT:
+		if (*result)
+			return true;
+
+		if (is_zero_bvecs(obj_req->copyup_bvecs,
+				  rbd_obj_img_extents_bytes(obj_req))) {
+			dout("%s %p detected zeros\n", __func__, obj_req);
+			obj_req->flags |= RBD_OBJ_FLAG_COPYUP_ZEROS;
+		}
+
+		rbd_obj_copyup_object_maps(obj_req);
+		if (!obj_req->pending.num_pending) {
+			*result = obj_req->pending.result;
+			obj_req->copyup_state = RBD_OBJ_COPYUP_OBJECT_MAPS;
+			goto again;
+		}
+		obj_req->copyup_state = __RBD_OBJ_COPYUP_OBJECT_MAPS;
+		return false;
+	case __RBD_OBJ_COPYUP_OBJECT_MAPS:
+		if (!pending_result_dec(&obj_req->pending, result))
+			return false;
+		fallthrough;
+	case RBD_OBJ_COPYUP_OBJECT_MAPS:
+		if (*result) {
+			rbd_warn(rbd_dev, "snap object map update failed: %d",
+				 *result);
+			return true;
+		}
+
+		rbd_obj_copyup_write_object(obj_req);
+		if (!obj_req->pending.num_pending) {
+			*result = obj_req->pending.result;
+			obj_req->copyup_state = RBD_OBJ_COPYUP_WRITE_OBJECT;
+			goto again;
+		}
+		obj_req->copyup_state = __RBD_OBJ_COPYUP_WRITE_OBJECT;
+		return false;
+	case __RBD_OBJ_COPYUP_WRITE_OBJECT:
+		if (!pending_result_dec(&obj_req->pending, result))
+			return false;
+		fallthrough;
+	case RBD_OBJ_COPYUP_WRITE_OBJECT:
+		return true;
+	default:
+		BUG();
+	}
+}
+
+/*
+ * Return:
+ *   0 - object map update sent
+ *   1 - object map update isn't needed
+ *  <0 - error
+ */
+static int rbd_obj_write_post_object_map(struct rbd_obj_request *obj_req)
+{
+	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
+	u8 current_state = OBJECT_PENDING;
+
+	if (!(rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
+		return 1;
+
+	if (!(obj_req->flags & RBD_OBJ_FLAG_DELETION))
+		return 1;
+
+	return rbd_object_map_update(obj_req, CEPH_NOSNAP, OBJECT_NONEXISTENT,
+				     &current_state);
+}
+
+static bool rbd_obj_advance_write(struct rbd_obj_request *obj_req, int *result)
+{
+	struct rbd_device *rbd_dev = obj_req->img_request->rbd_dev;
+	int ret;
+
+again:
+	switch (obj_req->write_state) {
+	case RBD_OBJ_WRITE_START:
+		rbd_assert(!*result);
+
+		rbd_obj_set_copyup_enabled(obj_req);
+		if (rbd_obj_write_is_noop(obj_req))
+			return true;
+
+		ret = rbd_obj_write_pre_object_map(obj_req);
+		if (ret < 0) {
+			*result = ret;
+			return true;
+		}
+		obj_req->write_state = RBD_OBJ_WRITE_PRE_OBJECT_MAP;
+		if (ret > 0)
+			goto again;
+		return false;
+	case RBD_OBJ_WRITE_PRE_OBJECT_MAP:
+		if (*result) {
+			rbd_warn(rbd_dev, "pre object map update failed: %d",
+				 *result);
+			return true;
+		}
+		ret = rbd_obj_write_object(obj_req);
+		if (ret) {
+			*result = ret;
+			return true;
+		}
+		obj_req->write_state = RBD_OBJ_WRITE_OBJECT;
+		return false;
+	case RBD_OBJ_WRITE_OBJECT:
+		if (*result == -ENOENT) {
+			if (obj_req->flags & RBD_OBJ_FLAG_COPYUP_ENABLED) {
+				*result = 0;
+				obj_req->copyup_state = RBD_OBJ_COPYUP_START;
+				obj_req->write_state = __RBD_OBJ_WRITE_COPYUP;
+				goto again;
+			}
+			/*
+			 * On a non-existent object:
+			 *   delete - -ENOENT, truncate/zero - 0
+			 */
+			if (obj_req->flags & RBD_OBJ_FLAG_DELETION)
+				*result = 0;
+		}
+		if (*result)
+			return true;
+
+		obj_req->write_state = RBD_OBJ_WRITE_COPYUP;
+		goto again;
+	case __RBD_OBJ_WRITE_COPYUP:
+		if (!rbd_obj_advance_copyup(obj_req, result))
+			return false;
+		fallthrough;
+	case RBD_OBJ_WRITE_COPYUP:
+		if (*result) {
+			rbd_warn(rbd_dev, "copyup failed: %d", *result);
+			return true;
+		}
+		ret = rbd_obj_write_post_object_map(obj_req);
+		if (ret < 0) {
+			*result = ret;
+			return true;
+		}
+		obj_req->write_state = RBD_OBJ_WRITE_POST_OBJECT_MAP;
+		if (ret > 0)
+			goto again;
+		return false;
+	case RBD_OBJ_WRITE_POST_OBJECT_MAP:
+		if (*result)
+			rbd_warn(rbd_dev, "post object map update failed: %d",
+				 *result);
+		return true;
+	default:
+		BUG();
+	}
+}
+
+/*
+ * Return true if @obj_req is completed.
+ */
+static bool __rbd_obj_handle_request(struct rbd_obj_request *obj_req,
+				     int *result)
+{
+	struct rbd_img_request *img_req = obj_req->img_request;
+	struct rbd_device *rbd_dev = img_req->rbd_dev;
+	bool done;
+
+	mutex_lock(&obj_req->state_mutex);
+	if (!rbd_img_is_write(img_req))
+		done = rbd_obj_advance_read(obj_req, result);
+	else
+		done = rbd_obj_advance_write(obj_req, result);
+	mutex_unlock(&obj_req->state_mutex);
+
+	if (done && *result) {
+		rbd_assert(*result < 0);
+		rbd_warn(rbd_dev, "%s at objno %llu %llu~%llu result %d",
+			 obj_op_name(img_req->op_type), obj_req->ex.oe_objno,
+			 obj_req->ex.oe_off, obj_req->ex.oe_len, *result);
+	}
+	return done;
+}
+
+/*
+ * This is open-coded in rbd_img_handle_request() to avoid parent chain
+ * recursion.
+ */
+static void rbd_obj_handle_request(struct rbd_obj_request *obj_req, int result)
+{
+	if (__rbd_obj_handle_request(obj_req, &result))
+		rbd_img_handle_request(obj_req->img_request, result);
+}
+
+static bool need_exclusive_lock(struct rbd_img_request *img_req)
+{
+	struct rbd_device *rbd_dev = img_req->rbd_dev;
+
+	if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK))
+		return false;
+
+	if (rbd_is_ro(rbd_dev))
+		return false;
+
+	rbd_assert(!test_bit(IMG_REQ_CHILD, &img_req->flags));
+	if (rbd_dev->opts->lock_on_read ||
+	    (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP))
+		return true;
+
+	return rbd_img_is_write(img_req);
+}
+
+static bool rbd_lock_add_request(struct rbd_img_request *img_req)
+{
+	struct rbd_device *rbd_dev = img_req->rbd_dev;
+	bool locked;
+
+	lockdep_assert_held(&rbd_dev->lock_rwsem);
+	locked = rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED;
+	spin_lock(&rbd_dev->lock_lists_lock);
+	rbd_assert(list_empty(&img_req->lock_item));
+	if (!locked)
+		list_add_tail(&img_req->lock_item, &rbd_dev->acquiring_list);
+	else
+		list_add_tail(&img_req->lock_item, &rbd_dev->running_list);
+	spin_unlock(&rbd_dev->lock_lists_lock);
+	return locked;
+}
+
+static void rbd_lock_del_request(struct rbd_img_request *img_req)
+{
+	struct rbd_device *rbd_dev = img_req->rbd_dev;
+	bool need_wakeup = false;
+
+	lockdep_assert_held(&rbd_dev->lock_rwsem);
+	spin_lock(&rbd_dev->lock_lists_lock);
+	if (!list_empty(&img_req->lock_item)) {
+		list_del_init(&img_req->lock_item);
+		need_wakeup = (rbd_dev->lock_state == RBD_LOCK_STATE_RELEASING &&
+			       list_empty(&rbd_dev->running_list));
+	}
+	spin_unlock(&rbd_dev->lock_lists_lock);
+	if (need_wakeup)
+		complete(&rbd_dev->releasing_wait);
+}
+
+static int rbd_img_exclusive_lock(struct rbd_img_request *img_req)
+{
+	struct rbd_device *rbd_dev = img_req->rbd_dev;
+
+	if (!need_exclusive_lock(img_req))
+		return 1;
+
+	if (rbd_lock_add_request(img_req))
+		return 1;
+
+	if (rbd_dev->opts->exclusive) {
+		WARN_ON(1); /* lock got released? */
+		return -EROFS;
+	}
+
+	/*
+	 * Note the use of mod_delayed_work() in rbd_acquire_lock()
+	 * and cancel_delayed_work() in wake_lock_waiters().
+	 */
+	dout("%s rbd_dev %p queueing lock_dwork\n", __func__, rbd_dev);
+	queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
+	return 0;
+}
+
+static void rbd_img_object_requests(struct rbd_img_request *img_req)
+{
+	struct rbd_device *rbd_dev = img_req->rbd_dev;
+	struct rbd_obj_request *obj_req;
+
+	rbd_assert(!img_req->pending.result && !img_req->pending.num_pending);
+	rbd_assert(!need_exclusive_lock(img_req) ||
+		   __rbd_is_lock_owner(rbd_dev));
+
+	if (rbd_img_is_write(img_req)) {
+		rbd_assert(!img_req->snapc);
+		down_read(&rbd_dev->header_rwsem);
+		img_req->snapc = ceph_get_snap_context(rbd_dev->header.snapc);
+		up_read(&rbd_dev->header_rwsem);
+	}
+
+	for_each_obj_request(img_req, obj_req) {
+		int result = 0;
+
+		if (__rbd_obj_handle_request(obj_req, &result)) {
+			if (result) {
+				img_req->pending.result = result;
+				return;
+			}
+		} else {
+			img_req->pending.num_pending++;
+		}
+	}
+}
+
+static bool rbd_img_advance(struct rbd_img_request *img_req, int *result)
+{
+	int ret;
+
+again:
+	switch (img_req->state) {
+	case RBD_IMG_START:
+		rbd_assert(!*result);
+
+		ret = rbd_img_exclusive_lock(img_req);
+		if (ret < 0) {
+			*result = ret;
+			return true;
+		}
+		img_req->state = RBD_IMG_EXCLUSIVE_LOCK;
+		if (ret > 0)
+			goto again;
+		return false;
+	case RBD_IMG_EXCLUSIVE_LOCK:
+		if (*result)
+			return true;
+
+		rbd_img_object_requests(img_req);
+		if (!img_req->pending.num_pending) {
+			*result = img_req->pending.result;
+			img_req->state = RBD_IMG_OBJECT_REQUESTS;
+			goto again;
+		}
+		img_req->state = __RBD_IMG_OBJECT_REQUESTS;
+		return false;
+	case __RBD_IMG_OBJECT_REQUESTS:
+		if (!pending_result_dec(&img_req->pending, result))
+			return false;
+		fallthrough;
+	case RBD_IMG_OBJECT_REQUESTS:
+		return true;
+	default:
+		BUG();
+	}
+}
+
+/*
+ * Return true if @img_req is completed.
+ */
+static bool __rbd_img_handle_request(struct rbd_img_request *img_req,
+				     int *result)
+{
+	struct rbd_device *rbd_dev = img_req->rbd_dev;
+	bool done;
+
+	if (need_exclusive_lock(img_req)) {
+		down_read(&rbd_dev->lock_rwsem);
+		mutex_lock(&img_req->state_mutex);
+		done = rbd_img_advance(img_req, result);
+		if (done)
+			rbd_lock_del_request(img_req);
+		mutex_unlock(&img_req->state_mutex);
+		up_read(&rbd_dev->lock_rwsem);
+	} else {
+		mutex_lock(&img_req->state_mutex);
+		done = rbd_img_advance(img_req, result);
+		mutex_unlock(&img_req->state_mutex);
+	}
+
+	if (done && *result) {
+		rbd_assert(*result < 0);
+		rbd_warn(rbd_dev, "%s%s result %d",
+		      test_bit(IMG_REQ_CHILD, &img_req->flags) ? "child " : "",
+		      obj_op_name(img_req->op_type), *result);
+	}
+	return done;
+}
+
+static void rbd_img_handle_request(struct rbd_img_request *img_req, int result)
+{
+again:
+	if (!__rbd_img_handle_request(img_req, &result))
+		return;
+
+	if (test_bit(IMG_REQ_CHILD, &img_req->flags)) {
+		struct rbd_obj_request *obj_req = img_req->obj_request;
+
+		rbd_img_request_destroy(img_req);
+		if (__rbd_obj_handle_request(obj_req, &result)) {
+			img_req = obj_req->img_request;
+			goto again;
+		}
+	} else {
+		struct request *rq = blk_mq_rq_from_pdu(img_req);
+
+		rbd_img_request_destroy(img_req);
+		blk_mq_end_request(rq, errno_to_blk_status(result));
+	}
+}
+
+static const struct rbd_client_id rbd_empty_cid;
+
+static bool rbd_cid_equal(const struct rbd_client_id *lhs,
+			  const struct rbd_client_id *rhs)
+{
+	return lhs->gid == rhs->gid && lhs->handle == rhs->handle;
+}
+
+static struct rbd_client_id rbd_get_cid(struct rbd_device *rbd_dev)
+{
+	struct rbd_client_id cid;
+
+	mutex_lock(&rbd_dev->watch_mutex);
+	cid.gid = ceph_client_gid(rbd_dev->rbd_client->client);
+	cid.handle = rbd_dev->watch_cookie;
+	mutex_unlock(&rbd_dev->watch_mutex);
+	return cid;
+}
+
+/*
+ * lock_rwsem must be held for write
+ */
+static void rbd_set_owner_cid(struct rbd_device *rbd_dev,
+			      const struct rbd_client_id *cid)
+{
+	dout("%s rbd_dev %p %llu-%llu -> %llu-%llu\n", __func__, rbd_dev,
+	     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle,
+	     cid->gid, cid->handle);
+	rbd_dev->owner_cid = *cid; /* struct */
+}
+
+static void format_lock_cookie(struct rbd_device *rbd_dev, char *buf)
+{
+	mutex_lock(&rbd_dev->watch_mutex);
+	sprintf(buf, "%s %llu", RBD_LOCK_COOKIE_PREFIX, rbd_dev->watch_cookie);
+	mutex_unlock(&rbd_dev->watch_mutex);
+}
+
+static void __rbd_lock(struct rbd_device *rbd_dev, const char *cookie)
+{
+	struct rbd_client_id cid = rbd_get_cid(rbd_dev);
+
+	rbd_dev->lock_state = RBD_LOCK_STATE_LOCKED;
+	strcpy(rbd_dev->lock_cookie, cookie);
+	rbd_set_owner_cid(rbd_dev, &cid);
+	queue_work(rbd_dev->task_wq, &rbd_dev->acquired_lock_work);
+}
+
+/*
+ * lock_rwsem must be held for write
+ */
+static int rbd_lock(struct rbd_device *rbd_dev)
+{
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+	char cookie[32];
+	int ret;
+
+	WARN_ON(__rbd_is_lock_owner(rbd_dev) ||
+		rbd_dev->lock_cookie[0] != '\0');
+
+	format_lock_cookie(rbd_dev, cookie);
+	ret = ceph_cls_lock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
+			    RBD_LOCK_NAME, CEPH_CLS_LOCK_EXCLUSIVE, cookie,
+			    RBD_LOCK_TAG, "", 0);
+	if (ret && ret != -EEXIST)
+		return ret;
+
+	__rbd_lock(rbd_dev, cookie);
+	return 0;
+}
+
+/*
+ * lock_rwsem must be held for write
+ */
+static void rbd_unlock(struct rbd_device *rbd_dev)
+{
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+	int ret;
+
+	WARN_ON(!__rbd_is_lock_owner(rbd_dev) ||
+		rbd_dev->lock_cookie[0] == '\0');
+
+	ret = ceph_cls_unlock(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
+			      RBD_LOCK_NAME, rbd_dev->lock_cookie);
+	if (ret && ret != -ENOENT)
+		rbd_warn(rbd_dev, "failed to unlock header: %d", ret);
+
+	/* treat errors as the image is unlocked */
+	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
+	rbd_dev->lock_cookie[0] = '\0';
+	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
+	queue_work(rbd_dev->task_wq, &rbd_dev->released_lock_work);
+}
+
+static int __rbd_notify_op_lock(struct rbd_device *rbd_dev,
+				enum rbd_notify_op notify_op,
+				struct page ***preply_pages,
+				size_t *preply_len)
+{
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+	struct rbd_client_id cid = rbd_get_cid(rbd_dev);
+	char buf[4 + 8 + 8 + CEPH_ENCODING_START_BLK_LEN];
+	int buf_size = sizeof(buf);
+	void *p = buf;
+
+	dout("%s rbd_dev %p notify_op %d\n", __func__, rbd_dev, notify_op);
+
+	/* encode *LockPayload NotifyMessage (op + ClientId) */
+	ceph_start_encoding(&p, 2, 1, buf_size - CEPH_ENCODING_START_BLK_LEN);
+	ceph_encode_32(&p, notify_op);
+	ceph_encode_64(&p, cid.gid);
+	ceph_encode_64(&p, cid.handle);
+
+	return ceph_osdc_notify(osdc, &rbd_dev->header_oid,
+				&rbd_dev->header_oloc, buf, buf_size,
+				RBD_NOTIFY_TIMEOUT, preply_pages, preply_len);
+}
+
+static void rbd_notify_op_lock(struct rbd_device *rbd_dev,
+			       enum rbd_notify_op notify_op)
+{
+	__rbd_notify_op_lock(rbd_dev, notify_op, NULL, NULL);
+}
+
+static void rbd_notify_acquired_lock(struct work_struct *work)
+{
+	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
+						  acquired_lock_work);
+
+	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_ACQUIRED_LOCK);
+}
+
+static void rbd_notify_released_lock(struct work_struct *work)
+{
+	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
+						  released_lock_work);
+
+	rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_RELEASED_LOCK);
+}
+
+static int rbd_request_lock(struct rbd_device *rbd_dev)
+{
+	struct page **reply_pages;
+	size_t reply_len;
+	bool lock_owner_responded = false;
+	int ret;
+
+	dout("%s rbd_dev %p\n", __func__, rbd_dev);
+
+	ret = __rbd_notify_op_lock(rbd_dev, RBD_NOTIFY_OP_REQUEST_LOCK,
+				   &reply_pages, &reply_len);
+	if (ret && ret != -ETIMEDOUT) {
+		rbd_warn(rbd_dev, "failed to request lock: %d", ret);
+		goto out;
+	}
+
+	if (reply_len > 0 && reply_len <= PAGE_SIZE) {
+		void *p = page_address(reply_pages[0]);
+		void *const end = p + reply_len;
+		u32 n;
+
+		ceph_decode_32_safe(&p, end, n, e_inval); /* num_acks */
+		while (n--) {
+			u8 struct_v;
+			u32 len;
+
+			ceph_decode_need(&p, end, 8 + 8, e_inval);
+			p += 8 + 8; /* skip gid and cookie */
+
+			ceph_decode_32_safe(&p, end, len, e_inval);
+			if (!len)
+				continue;
+
+			if (lock_owner_responded) {
+				rbd_warn(rbd_dev,
+					 "duplicate lock owners detected");
+				ret = -EIO;
+				goto out;
+			}
+
+			lock_owner_responded = true;
+			ret = ceph_start_decoding(&p, end, 1, "ResponseMessage",
+						  &struct_v, &len);
+			if (ret) {
+				rbd_warn(rbd_dev,
+					 "failed to decode ResponseMessage: %d",
+					 ret);
+				goto e_inval;
+			}
+
+			ret = ceph_decode_32(&p);
+		}
+	}
+
+	if (!lock_owner_responded) {
+		rbd_warn(rbd_dev, "no lock owners detected");
+		ret = -ETIMEDOUT;
+	}
+
+out:
+	ceph_release_page_vector(reply_pages, calc_pages_for(0, reply_len));
+	return ret;
+
+e_inval:
+	ret = -EINVAL;
+	goto out;
+}
+
+/*
+ * Either image request state machine(s) or rbd_add_acquire_lock()
+ * (i.e. "rbd map").
+ */
+static void wake_lock_waiters(struct rbd_device *rbd_dev, int result)
+{
+	struct rbd_img_request *img_req;
+
+	dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
+	lockdep_assert_held_write(&rbd_dev->lock_rwsem);
+
+	cancel_delayed_work(&rbd_dev->lock_dwork);
+	if (!completion_done(&rbd_dev->acquire_wait)) {
+		rbd_assert(list_empty(&rbd_dev->acquiring_list) &&
+			   list_empty(&rbd_dev->running_list));
+		rbd_dev->acquire_err = result;
+		complete_all(&rbd_dev->acquire_wait);
+		return;
+	}
+
+	while (!list_empty(&rbd_dev->acquiring_list)) {
+		img_req = list_first_entry(&rbd_dev->acquiring_list,
+					   struct rbd_img_request, lock_item);
+		mutex_lock(&img_req->state_mutex);
+		rbd_assert(img_req->state == RBD_IMG_EXCLUSIVE_LOCK);
+		if (!result)
+			list_move_tail(&img_req->lock_item,
+				       &rbd_dev->running_list);
+		else
+			list_del_init(&img_req->lock_item);
+		rbd_img_schedule(img_req, result);
+		mutex_unlock(&img_req->state_mutex);
+	}
+}
+
+static bool locker_equal(const struct ceph_locker *lhs,
+			 const struct ceph_locker *rhs)
+{
+	return lhs->id.name.type == rhs->id.name.type &&
+	       lhs->id.name.num == rhs->id.name.num &&
+	       !strcmp(lhs->id.cookie, rhs->id.cookie) &&
+	       ceph_addr_equal_no_type(&lhs->info.addr, &rhs->info.addr);
+}
+
+static void free_locker(struct ceph_locker *locker)
+{
+	if (locker)
+		ceph_free_lockers(locker, 1);
+}
+
+static struct ceph_locker *get_lock_owner_info(struct rbd_device *rbd_dev)
+{
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+	struct ceph_locker *lockers;
+	u32 num_lockers;
+	u8 lock_type;
+	char *lock_tag;
+	u64 handle;
+	int ret;
+
+	ret = ceph_cls_lock_info(osdc, &rbd_dev->header_oid,
+				 &rbd_dev->header_oloc, RBD_LOCK_NAME,
+				 &lock_type, &lock_tag, &lockers, &num_lockers);
+	if (ret) {
+		rbd_warn(rbd_dev, "failed to get header lockers: %d", ret);
+		return ERR_PTR(ret);
+	}
+
+	if (num_lockers == 0) {
+		dout("%s rbd_dev %p no lockers detected\n", __func__, rbd_dev);
+		lockers = NULL;
+		goto out;
+	}
+
+	if (strcmp(lock_tag, RBD_LOCK_TAG)) {
+		rbd_warn(rbd_dev, "locked by external mechanism, tag %s",
+			 lock_tag);
+		goto err_busy;
+	}
+
+	if (lock_type != CEPH_CLS_LOCK_EXCLUSIVE) {
+		rbd_warn(rbd_dev, "incompatible lock type detected");
+		goto err_busy;
+	}
+
+	WARN_ON(num_lockers != 1);
+	ret = sscanf(lockers[0].id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu",
+		     &handle);
+	if (ret != 1) {
+		rbd_warn(rbd_dev, "locked by external mechanism, cookie %s",
+			 lockers[0].id.cookie);
+		goto err_busy;
+	}
+	if (ceph_addr_is_blank(&lockers[0].info.addr)) {
+		rbd_warn(rbd_dev, "locker has a blank address");
+		goto err_busy;
+	}
+
+	dout("%s rbd_dev %p got locker %s%llu@%pISpc/%u handle %llu\n",
+	     __func__, rbd_dev, ENTITY_NAME(lockers[0].id.name),
+	     &lockers[0].info.addr.in_addr,
+	     le32_to_cpu(lockers[0].info.addr.nonce), handle);
+
+out:
+	kfree(lock_tag);
+	return lockers;
+
+err_busy:
+	kfree(lock_tag);
+	ceph_free_lockers(lockers, num_lockers);
+	return ERR_PTR(-EBUSY);
+}
+
+static int find_watcher(struct rbd_device *rbd_dev,
+			const struct ceph_locker *locker)
+{
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+	struct ceph_watch_item *watchers;
+	u32 num_watchers;
+	u64 cookie;
+	int i;
+	int ret;
+
+	ret = ceph_osdc_list_watchers(osdc, &rbd_dev->header_oid,
+				      &rbd_dev->header_oloc, &watchers,
+				      &num_watchers);
+	if (ret) {
+		rbd_warn(rbd_dev, "failed to get watchers: %d", ret);
+		return ret;
+	}
+
+	sscanf(locker->id.cookie, RBD_LOCK_COOKIE_PREFIX " %llu", &cookie);
+	for (i = 0; i < num_watchers; i++) {
+		/*
+		 * Ignore addr->type while comparing.  This mimics
+		 * entity_addr_t::get_legacy_str() + strcmp().
+		 */
+		if (ceph_addr_equal_no_type(&watchers[i].addr,
+					    &locker->info.addr) &&
+		    watchers[i].cookie == cookie) {
+			struct rbd_client_id cid = {
+				.gid = le64_to_cpu(watchers[i].name.num),
+				.handle = cookie,
+			};
+
+			dout("%s rbd_dev %p found cid %llu-%llu\n", __func__,
+			     rbd_dev, cid.gid, cid.handle);
+			rbd_set_owner_cid(rbd_dev, &cid);
+			ret = 1;
+			goto out;
+		}
+	}
+
+	dout("%s rbd_dev %p no watchers\n", __func__, rbd_dev);
+	ret = 0;
+out:
+	kfree(watchers);
+	return ret;
+}
+
+/*
+ * lock_rwsem must be held for write
+ */
+static int rbd_try_lock(struct rbd_device *rbd_dev)
+{
+	struct ceph_client *client = rbd_dev->rbd_client->client;
+	struct ceph_locker *locker, *refreshed_locker;
+	int ret;
+
+	for (;;) {
+		locker = refreshed_locker = NULL;
+
+		ret = rbd_lock(rbd_dev);
+		if (!ret)
+			goto out;
+		if (ret != -EBUSY) {
+			rbd_warn(rbd_dev, "failed to lock header: %d", ret);
+			goto out;
+		}
+
+		/* determine if the current lock holder is still alive */
+		locker = get_lock_owner_info(rbd_dev);
+		if (IS_ERR(locker)) {
+			ret = PTR_ERR(locker);
+			locker = NULL;
+			goto out;
+		}
+		if (!locker)
+			goto again;
+
+		ret = find_watcher(rbd_dev, locker);
+		if (ret)
+			goto out; /* request lock or error */
+
+		refreshed_locker = get_lock_owner_info(rbd_dev);
+		if (IS_ERR(refreshed_locker)) {
+			ret = PTR_ERR(refreshed_locker);
+			refreshed_locker = NULL;
+			goto out;
+		}
+		if (!refreshed_locker ||
+		    !locker_equal(locker, refreshed_locker))
+			goto again;
+
+		rbd_warn(rbd_dev, "breaking header lock owned by %s%llu",
+			 ENTITY_NAME(locker->id.name));
+
+		ret = ceph_monc_blocklist_add(&client->monc,
+					      &locker->info.addr);
+		if (ret) {
+			rbd_warn(rbd_dev, "failed to blocklist %s%llu: %d",
+				 ENTITY_NAME(locker->id.name), ret);
+			goto out;
+		}
+
+		ret = ceph_cls_break_lock(&client->osdc, &rbd_dev->header_oid,
+					  &rbd_dev->header_oloc, RBD_LOCK_NAME,
+					  locker->id.cookie, &locker->id.name);
+		if (ret && ret != -ENOENT) {
+			rbd_warn(rbd_dev, "failed to break header lock: %d",
+				 ret);
+			goto out;
+		}
+
+again:
+		free_locker(refreshed_locker);
+		free_locker(locker);
+	}
+
+out:
+	free_locker(refreshed_locker);
+	free_locker(locker);
+	return ret;
+}
+
+static int rbd_post_acquire_action(struct rbd_device *rbd_dev)
+{
+	int ret;
+
+	ret = rbd_dev_refresh(rbd_dev);
+	if (ret)
+		return ret;
+
+	if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP) {
+		ret = rbd_object_map_open(rbd_dev);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * Return:
+ *   0 - lock acquired
+ *   1 - caller should call rbd_request_lock()
+ *  <0 - error
+ */
+static int rbd_try_acquire_lock(struct rbd_device *rbd_dev)
+{
+	int ret;
+
+	down_read(&rbd_dev->lock_rwsem);
+	dout("%s rbd_dev %p read lock_state %d\n", __func__, rbd_dev,
+	     rbd_dev->lock_state);
+	if (__rbd_is_lock_owner(rbd_dev)) {
+		up_read(&rbd_dev->lock_rwsem);
+		return 0;
+	}
+
+	up_read(&rbd_dev->lock_rwsem);
+	down_write(&rbd_dev->lock_rwsem);
+	dout("%s rbd_dev %p write lock_state %d\n", __func__, rbd_dev,
+	     rbd_dev->lock_state);
+	if (__rbd_is_lock_owner(rbd_dev)) {
+		up_write(&rbd_dev->lock_rwsem);
+		return 0;
+	}
+
+	ret = rbd_try_lock(rbd_dev);
+	if (ret < 0) {
+		rbd_warn(rbd_dev, "failed to acquire lock: %d", ret);
+		goto out;
+	}
+	if (ret > 0) {
+		up_write(&rbd_dev->lock_rwsem);
+		return ret;
+	}
+
+	rbd_assert(rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED);
+	rbd_assert(list_empty(&rbd_dev->running_list));
+
+	ret = rbd_post_acquire_action(rbd_dev);
+	if (ret) {
+		rbd_warn(rbd_dev, "post-acquire action failed: %d", ret);
+		/*
+		 * Can't stay in RBD_LOCK_STATE_LOCKED because
+		 * rbd_lock_add_request() would let the request through,
+		 * assuming that e.g. object map is locked and loaded.
+		 */
+		rbd_unlock(rbd_dev);
+	}
+
+out:
+	wake_lock_waiters(rbd_dev, ret);
+	up_write(&rbd_dev->lock_rwsem);
+	return ret;
+}
+
+static void rbd_acquire_lock(struct work_struct *work)
+{
+	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
+					    struct rbd_device, lock_dwork);
+	int ret;
+
+	dout("%s rbd_dev %p\n", __func__, rbd_dev);
+again:
+	ret = rbd_try_acquire_lock(rbd_dev);
+	if (ret <= 0) {
+		dout("%s rbd_dev %p ret %d - done\n", __func__, rbd_dev, ret);
+		return;
+	}
+
+	ret = rbd_request_lock(rbd_dev);
+	if (ret == -ETIMEDOUT) {
+		goto again; /* treat this as a dead client */
+	} else if (ret == -EROFS) {
+		rbd_warn(rbd_dev, "peer will not release lock");
+		down_write(&rbd_dev->lock_rwsem);
+		wake_lock_waiters(rbd_dev, ret);
+		up_write(&rbd_dev->lock_rwsem);
+	} else if (ret < 0) {
+		rbd_warn(rbd_dev, "error requesting lock: %d", ret);
+		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
+				 RBD_RETRY_DELAY);
+	} else {
+		/*
+		 * lock owner acked, but resend if we don't see them
+		 * release the lock
+		 */
+		dout("%s rbd_dev %p requeuing lock_dwork\n", __func__,
+		     rbd_dev);
+		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork,
+		    msecs_to_jiffies(2 * RBD_NOTIFY_TIMEOUT * MSEC_PER_SEC));
+	}
+}
+
+static bool rbd_quiesce_lock(struct rbd_device *rbd_dev)
+{
+	dout("%s rbd_dev %p\n", __func__, rbd_dev);
+	lockdep_assert_held_write(&rbd_dev->lock_rwsem);
+
+	if (rbd_dev->lock_state != RBD_LOCK_STATE_LOCKED)
+		return false;
+
+	/*
+	 * Ensure that all in-flight IO is flushed.
+	 */
+	rbd_dev->lock_state = RBD_LOCK_STATE_RELEASING;
+	rbd_assert(!completion_done(&rbd_dev->releasing_wait));
+	if (list_empty(&rbd_dev->running_list))
+		return true;
+
+	up_write(&rbd_dev->lock_rwsem);
+	wait_for_completion(&rbd_dev->releasing_wait);
+
+	down_write(&rbd_dev->lock_rwsem);
+	if (rbd_dev->lock_state != RBD_LOCK_STATE_RELEASING)
+		return false;
+
+	rbd_assert(list_empty(&rbd_dev->running_list));
+	return true;
+}
+
+static void rbd_pre_release_action(struct rbd_device *rbd_dev)
+{
+	if (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)
+		rbd_object_map_close(rbd_dev);
+}
+
+static void __rbd_release_lock(struct rbd_device *rbd_dev)
+{
+	rbd_assert(list_empty(&rbd_dev->running_list));
+
+	rbd_pre_release_action(rbd_dev);
+	rbd_unlock(rbd_dev);
+}
+
+/*
+ * lock_rwsem must be held for write
+ */
+static void rbd_release_lock(struct rbd_device *rbd_dev)
+{
+	if (!rbd_quiesce_lock(rbd_dev))
+		return;
+
+	__rbd_release_lock(rbd_dev);
+
+	/*
+	 * Give others a chance to grab the lock - we would re-acquire
+	 * almost immediately if we got new IO while draining the running
+	 * list otherwise.  We need to ack our own notifications, so this
+	 * lock_dwork will be requeued from rbd_handle_released_lock() by
+	 * way of maybe_kick_acquire().
+	 */
+	cancel_delayed_work(&rbd_dev->lock_dwork);
+}
+
+static void rbd_release_lock_work(struct work_struct *work)
+{
+	struct rbd_device *rbd_dev = container_of(work, struct rbd_device,
+						  unlock_work);
+
+	down_write(&rbd_dev->lock_rwsem);
+	rbd_release_lock(rbd_dev);
+	up_write(&rbd_dev->lock_rwsem);
+}
+
+static void maybe_kick_acquire(struct rbd_device *rbd_dev)
+{
+	bool have_requests;
+
+	dout("%s rbd_dev %p\n", __func__, rbd_dev);
+	if (__rbd_is_lock_owner(rbd_dev))
+		return;
+
+	spin_lock(&rbd_dev->lock_lists_lock);
+	have_requests = !list_empty(&rbd_dev->acquiring_list);
+	spin_unlock(&rbd_dev->lock_lists_lock);
+	if (have_requests || delayed_work_pending(&rbd_dev->lock_dwork)) {
+		dout("%s rbd_dev %p kicking lock_dwork\n", __func__, rbd_dev);
+		mod_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
+	}
+}
+
+static void rbd_handle_acquired_lock(struct rbd_device *rbd_dev, u8 struct_v,
+				     void **p)
+{
+	struct rbd_client_id cid = { 0 };
+
+	if (struct_v >= 2) {
+		cid.gid = ceph_decode_64(p);
+		cid.handle = ceph_decode_64(p);
+	}
+
+	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
+	     cid.handle);
+	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
+		down_write(&rbd_dev->lock_rwsem);
+		if (rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
+			dout("%s rbd_dev %p cid %llu-%llu == owner_cid\n",
+			     __func__, rbd_dev, cid.gid, cid.handle);
+		} else {
+			rbd_set_owner_cid(rbd_dev, &cid);
+		}
+		downgrade_write(&rbd_dev->lock_rwsem);
+	} else {
+		down_read(&rbd_dev->lock_rwsem);
+	}
+
+	maybe_kick_acquire(rbd_dev);
+	up_read(&rbd_dev->lock_rwsem);
+}
+
+static void rbd_handle_released_lock(struct rbd_device *rbd_dev, u8 struct_v,
+				     void **p)
+{
+	struct rbd_client_id cid = { 0 };
+
+	if (struct_v >= 2) {
+		cid.gid = ceph_decode_64(p);
+		cid.handle = ceph_decode_64(p);
+	}
+
+	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
+	     cid.handle);
+	if (!rbd_cid_equal(&cid, &rbd_empty_cid)) {
+		down_write(&rbd_dev->lock_rwsem);
+		if (!rbd_cid_equal(&cid, &rbd_dev->owner_cid)) {
+			dout("%s rbd_dev %p cid %llu-%llu != owner_cid %llu-%llu\n",
+			     __func__, rbd_dev, cid.gid, cid.handle,
+			     rbd_dev->owner_cid.gid, rbd_dev->owner_cid.handle);
+		} else {
+			rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
+		}
+		downgrade_write(&rbd_dev->lock_rwsem);
+	} else {
+		down_read(&rbd_dev->lock_rwsem);
+	}
+
+	maybe_kick_acquire(rbd_dev);
+	up_read(&rbd_dev->lock_rwsem);
+}
+
+/*
+ * Returns result for ResponseMessage to be encoded (<= 0), or 1 if no
+ * ResponseMessage is needed.
+ */
+static int rbd_handle_request_lock(struct rbd_device *rbd_dev, u8 struct_v,
+				   void **p)
+{
+	struct rbd_client_id my_cid = rbd_get_cid(rbd_dev);
+	struct rbd_client_id cid = { 0 };
+	int result = 1;
+
+	if (struct_v >= 2) {
+		cid.gid = ceph_decode_64(p);
+		cid.handle = ceph_decode_64(p);
+	}
+
+	dout("%s rbd_dev %p cid %llu-%llu\n", __func__, rbd_dev, cid.gid,
+	     cid.handle);
+	if (rbd_cid_equal(&cid, &my_cid))
+		return result;
+
+	down_read(&rbd_dev->lock_rwsem);
+	if (__rbd_is_lock_owner(rbd_dev)) {
+		if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED &&
+		    rbd_cid_equal(&rbd_dev->owner_cid, &rbd_empty_cid))
+			goto out_unlock;
+
+		/*
+		 * encode ResponseMessage(0) so the peer can detect
+		 * a missing owner
+		 */
+		result = 0;
+
+		if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED) {
+			if (!rbd_dev->opts->exclusive) {
+				dout("%s rbd_dev %p queueing unlock_work\n",
+				     __func__, rbd_dev);
+				queue_work(rbd_dev->task_wq,
+					   &rbd_dev->unlock_work);
+			} else {
+				/* refuse to release the lock */
+				result = -EROFS;
+			}
+		}
+	}
+
+out_unlock:
+	up_read(&rbd_dev->lock_rwsem);
+	return result;
+}
+
+static void __rbd_acknowledge_notify(struct rbd_device *rbd_dev,
+				     u64 notify_id, u64 cookie, s32 *result)
+{
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+	char buf[4 + CEPH_ENCODING_START_BLK_LEN];
+	int buf_size = sizeof(buf);
+	int ret;
+
+	if (result) {
+		void *p = buf;
+
+		/* encode ResponseMessage */
+		ceph_start_encoding(&p, 1, 1,
+				    buf_size - CEPH_ENCODING_START_BLK_LEN);
+		ceph_encode_32(&p, *result);
+	} else {
+		buf_size = 0;
+	}
+
+	ret = ceph_osdc_notify_ack(osdc, &rbd_dev->header_oid,
+				   &rbd_dev->header_oloc, notify_id, cookie,
+				   buf, buf_size);
+	if (ret)
+		rbd_warn(rbd_dev, "acknowledge_notify failed: %d", ret);
+}
+
+static void rbd_acknowledge_notify(struct rbd_device *rbd_dev, u64 notify_id,
+				   u64 cookie)
+{
+	dout("%s rbd_dev %p\n", __func__, rbd_dev);
+	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, NULL);
+}
+
+static void rbd_acknowledge_notify_result(struct rbd_device *rbd_dev,
+					  u64 notify_id, u64 cookie, s32 result)
+{
+	dout("%s rbd_dev %p result %d\n", __func__, rbd_dev, result);
+	__rbd_acknowledge_notify(rbd_dev, notify_id, cookie, &result);
+}
+
+static void rbd_watch_cb(void *arg, u64 notify_id, u64 cookie,
+			 u64 notifier_id, void *data, size_t data_len)
+{
+	struct rbd_device *rbd_dev = arg;
+	void *p = data;
+	void *const end = p + data_len;
+	u8 struct_v = 0;
+	u32 len;
+	u32 notify_op;
+	int ret;
+
+	dout("%s rbd_dev %p cookie %llu notify_id %llu data_len %zu\n",
+	     __func__, rbd_dev, cookie, notify_id, data_len);
+	if (data_len) {
+		ret = ceph_start_decoding(&p, end, 1, "NotifyMessage",
+					  &struct_v, &len);
+		if (ret) {
+			rbd_warn(rbd_dev, "failed to decode NotifyMessage: %d",
+				 ret);
+			return;
+		}
+
+		notify_op = ceph_decode_32(&p);
+	} else {
+		/* legacy notification for header updates */
+		notify_op = RBD_NOTIFY_OP_HEADER_UPDATE;
+		len = 0;
+	}
+
+	dout("%s rbd_dev %p notify_op %u\n", __func__, rbd_dev, notify_op);
+	switch (notify_op) {
+	case RBD_NOTIFY_OP_ACQUIRED_LOCK:
+		rbd_handle_acquired_lock(rbd_dev, struct_v, &p);
+		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
+		break;
+	case RBD_NOTIFY_OP_RELEASED_LOCK:
+		rbd_handle_released_lock(rbd_dev, struct_v, &p);
+		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
+		break;
+	case RBD_NOTIFY_OP_REQUEST_LOCK:
+		ret = rbd_handle_request_lock(rbd_dev, struct_v, &p);
+		if (ret <= 0)
+			rbd_acknowledge_notify_result(rbd_dev, notify_id,
+						      cookie, ret);
+		else
+			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
+		break;
+	case RBD_NOTIFY_OP_HEADER_UPDATE:
+		ret = rbd_dev_refresh(rbd_dev);
+		if (ret)
+			rbd_warn(rbd_dev, "refresh failed: %d", ret);
+
+		rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
+		break;
+	default:
+		if (rbd_is_lock_owner(rbd_dev))
+			rbd_acknowledge_notify_result(rbd_dev, notify_id,
+						      cookie, -EOPNOTSUPP);
+		else
+			rbd_acknowledge_notify(rbd_dev, notify_id, cookie);
+		break;
+	}
+}
+
+static void __rbd_unregister_watch(struct rbd_device *rbd_dev);
+
+static void rbd_watch_errcb(void *arg, u64 cookie, int err)
+{
+	struct rbd_device *rbd_dev = arg;
+
+	rbd_warn(rbd_dev, "encountered watch error: %d", err);
+
+	down_write(&rbd_dev->lock_rwsem);
+	rbd_set_owner_cid(rbd_dev, &rbd_empty_cid);
+	up_write(&rbd_dev->lock_rwsem);
+
+	mutex_lock(&rbd_dev->watch_mutex);
+	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED) {
+		__rbd_unregister_watch(rbd_dev);
+		rbd_dev->watch_state = RBD_WATCH_STATE_ERROR;
+
+		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->watch_dwork, 0);
+	}
+	mutex_unlock(&rbd_dev->watch_mutex);
+}
+
+/*
+ * watch_mutex must be locked
+ */
+static int __rbd_register_watch(struct rbd_device *rbd_dev)
+{
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+	struct ceph_osd_linger_request *handle;
+
+	rbd_assert(!rbd_dev->watch_handle);
+	dout("%s rbd_dev %p\n", __func__, rbd_dev);
+
+	handle = ceph_osdc_watch(osdc, &rbd_dev->header_oid,
+				 &rbd_dev->header_oloc, rbd_watch_cb,
+				 rbd_watch_errcb, rbd_dev);
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+
+	rbd_dev->watch_handle = handle;
+	return 0;
+}
+
+/*
+ * watch_mutex must be locked
+ */
+static void __rbd_unregister_watch(struct rbd_device *rbd_dev)
+{
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+	int ret;
+
+	rbd_assert(rbd_dev->watch_handle);
+	dout("%s rbd_dev %p\n", __func__, rbd_dev);
+
+	ret = ceph_osdc_unwatch(osdc, rbd_dev->watch_handle);
+	if (ret)
+		rbd_warn(rbd_dev, "failed to unwatch: %d", ret);
+
+	rbd_dev->watch_handle = NULL;
+}
+
+static int rbd_register_watch(struct rbd_device *rbd_dev)
+{
+	int ret;
+
+	mutex_lock(&rbd_dev->watch_mutex);
+	rbd_assert(rbd_dev->watch_state == RBD_WATCH_STATE_UNREGISTERED);
+	ret = __rbd_register_watch(rbd_dev);
+	if (ret)
+		goto out;
+
+	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
+	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
+
+out:
+	mutex_unlock(&rbd_dev->watch_mutex);
+	return ret;
+}
+
+static void cancel_tasks_sync(struct rbd_device *rbd_dev)
+{
+	dout("%s rbd_dev %p\n", __func__, rbd_dev);
+
+	cancel_work_sync(&rbd_dev->acquired_lock_work);
+	cancel_work_sync(&rbd_dev->released_lock_work);
+	cancel_delayed_work_sync(&rbd_dev->lock_dwork);
+	cancel_work_sync(&rbd_dev->unlock_work);
+}
+
+/*
+ * header_rwsem must not be held to avoid a deadlock with
+ * rbd_dev_refresh() when flushing notifies.
+ */
+static void rbd_unregister_watch(struct rbd_device *rbd_dev)
+{
+	cancel_tasks_sync(rbd_dev);
+
+	mutex_lock(&rbd_dev->watch_mutex);
+	if (rbd_dev->watch_state == RBD_WATCH_STATE_REGISTERED)
+		__rbd_unregister_watch(rbd_dev);
+	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
+	mutex_unlock(&rbd_dev->watch_mutex);
+
+	cancel_delayed_work_sync(&rbd_dev->watch_dwork);
+	ceph_osdc_flush_notifies(&rbd_dev->rbd_client->client->osdc);
+}
+
+/*
+ * lock_rwsem must be held for write
+ */
+static void rbd_reacquire_lock(struct rbd_device *rbd_dev)
+{
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+	char cookie[32];
+	int ret;
+
+	if (!rbd_quiesce_lock(rbd_dev))
+		return;
+
+	format_lock_cookie(rbd_dev, cookie);
+	ret = ceph_cls_set_cookie(osdc, &rbd_dev->header_oid,
+				  &rbd_dev->header_oloc, RBD_LOCK_NAME,
+				  CEPH_CLS_LOCK_EXCLUSIVE, rbd_dev->lock_cookie,
+				  RBD_LOCK_TAG, cookie);
+	if (ret) {
+		if (ret != -EOPNOTSUPP)
+			rbd_warn(rbd_dev, "failed to update lock cookie: %d",
+				 ret);
+
+		/*
+		 * Lock cookie cannot be updated on older OSDs, so do
+		 * a manual release and queue an acquire.
+		 */
+		__rbd_release_lock(rbd_dev);
+		queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
+	} else {
+		__rbd_lock(rbd_dev, cookie);
+		wake_lock_waiters(rbd_dev, 0);
+	}
+}
+
+static void rbd_reregister_watch(struct work_struct *work)
+{
+	struct rbd_device *rbd_dev = container_of(to_delayed_work(work),
+					    struct rbd_device, watch_dwork);
+	int ret;
+
+	dout("%s rbd_dev %p\n", __func__, rbd_dev);
+
+	mutex_lock(&rbd_dev->watch_mutex);
+	if (rbd_dev->watch_state != RBD_WATCH_STATE_ERROR) {
+		mutex_unlock(&rbd_dev->watch_mutex);
+		return;
+	}
+
+	ret = __rbd_register_watch(rbd_dev);
+	if (ret) {
+		rbd_warn(rbd_dev, "failed to reregister watch: %d", ret);
+		if (ret != -EBLOCKLISTED && ret != -ENOENT) {
+			queue_delayed_work(rbd_dev->task_wq,
+					   &rbd_dev->watch_dwork,
+					   RBD_RETRY_DELAY);
+			mutex_unlock(&rbd_dev->watch_mutex);
+			return;
+		}
+
+		mutex_unlock(&rbd_dev->watch_mutex);
+		down_write(&rbd_dev->lock_rwsem);
+		wake_lock_waiters(rbd_dev, ret);
+		up_write(&rbd_dev->lock_rwsem);
+		return;
+	}
+
+	rbd_dev->watch_state = RBD_WATCH_STATE_REGISTERED;
+	rbd_dev->watch_cookie = rbd_dev->watch_handle->linger_id;
+	mutex_unlock(&rbd_dev->watch_mutex);
+
+	down_write(&rbd_dev->lock_rwsem);
+	if (rbd_dev->lock_state == RBD_LOCK_STATE_LOCKED)
+		rbd_reacquire_lock(rbd_dev);
+	up_write(&rbd_dev->lock_rwsem);
+
+	ret = rbd_dev_refresh(rbd_dev);
+	if (ret)
+		rbd_warn(rbd_dev, "reregistration refresh failed: %d", ret);
+}
+
+/*
+ * Synchronous osd object method call.  Returns the number of bytes
+ * returned in the outbound buffer, or a negative error code.
+ */
+static int rbd_obj_method_sync(struct rbd_device *rbd_dev,
+			     struct ceph_object_id *oid,
+			     struct ceph_object_locator *oloc,
+			     const char *method_name,
+			     const void *outbound,
+			     size_t outbound_size,
+			     void *inbound,
+			     size_t inbound_size)
+{
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+	struct page *req_page = NULL;
+	struct page *reply_page;
+	int ret;
+
+	/*
+	 * Method calls are ultimately read operations.  The result
+	 * should placed into the inbound buffer provided.  They
+	 * also supply outbound data--parameters for the object
+	 * method.  Currently if this is present it will be a
+	 * snapshot id.
+	 */
+	if (outbound) {
+		if (outbound_size > PAGE_SIZE)
+			return -E2BIG;
+
+		req_page = alloc_page(GFP_KERNEL);
+		if (!req_page)
+			return -ENOMEM;
+
+		memcpy(page_address(req_page), outbound, outbound_size);
+	}
+
+	reply_page = alloc_page(GFP_KERNEL);
+	if (!reply_page) {
+		if (req_page)
+			__free_page(req_page);
+		return -ENOMEM;
+	}
+
+	ret = ceph_osdc_call(osdc, oid, oloc, RBD_DRV_NAME, method_name,
+			     CEPH_OSD_FLAG_READ, req_page, outbound_size,
+			     &reply_page, &inbound_size);
+	if (!ret) {
+		memcpy(inbound, page_address(reply_page), inbound_size);
+		ret = inbound_size;
+	}
+
+	if (req_page)
+		__free_page(req_page);
+	__free_page(reply_page);
+	return ret;
+}
+
+static void rbd_queue_workfn(struct work_struct *work)
+{
+	struct rbd_img_request *img_request =
+	    container_of(work, struct rbd_img_request, work);
+	struct rbd_device *rbd_dev = img_request->rbd_dev;
+	enum obj_operation_type op_type = img_request->op_type;
+	struct request *rq = blk_mq_rq_from_pdu(img_request);
+	u64 offset = (u64)blk_rq_pos(rq) << SECTOR_SHIFT;
+	u64 length = blk_rq_bytes(rq);
+	u64 mapping_size;
+	int result;
+
+	/* Ignore/skip any zero-length requests */
+	if (!length) {
+		dout("%s: zero-length request\n", __func__);
+		result = 0;
+		goto err_img_request;
+	}
+
+	blk_mq_start_request(rq);
+
+	down_read(&rbd_dev->header_rwsem);
+	mapping_size = rbd_dev->mapping.size;
+	rbd_img_capture_header(img_request);
+	up_read(&rbd_dev->header_rwsem);
+
+	if (offset + length > mapping_size) {
+		rbd_warn(rbd_dev, "beyond EOD (%llu~%llu > %llu)", offset,
+			 length, mapping_size);
+		result = -EIO;
+		goto err_img_request;
+	}
+
+	dout("%s rbd_dev %p img_req %p %s %llu~%llu\n", __func__, rbd_dev,
+	     img_request, obj_op_name(op_type), offset, length);
+
+	if (op_type == OBJ_OP_DISCARD || op_type == OBJ_OP_ZEROOUT)
+		result = rbd_img_fill_nodata(img_request, offset, length);
+	else
+		result = rbd_img_fill_from_bio(img_request, offset, length,
+					       rq->bio);
+	if (result)
+		goto err_img_request;
+
+	rbd_img_handle_request(img_request, 0);
+	return;
+
+err_img_request:
+	rbd_img_request_destroy(img_request);
+	if (result)
+		rbd_warn(rbd_dev, "%s %llx at %llx result %d",
+			 obj_op_name(op_type), length, offset, result);
+	blk_mq_end_request(rq, errno_to_blk_status(result));
+}
+
+static blk_status_t rbd_queue_rq(struct blk_mq_hw_ctx *hctx,
+		const struct blk_mq_queue_data *bd)
+{
+	struct rbd_device *rbd_dev = hctx->queue->queuedata;
+	struct rbd_img_request *img_req = blk_mq_rq_to_pdu(bd->rq);
+	enum obj_operation_type op_type;
+
+	switch (req_op(bd->rq)) {
+	case REQ_OP_DISCARD:
+		op_type = OBJ_OP_DISCARD;
+		break;
+	case REQ_OP_WRITE_ZEROES:
+		op_type = OBJ_OP_ZEROOUT;
+		break;
+	case REQ_OP_WRITE:
+		op_type = OBJ_OP_WRITE;
+		break;
+	case REQ_OP_READ:
+		op_type = OBJ_OP_READ;
+		break;
+	default:
+		rbd_warn(rbd_dev, "unknown req_op %d", req_op(bd->rq));
+		return BLK_STS_IOERR;
+	}
+
+	rbd_img_request_init(img_req, rbd_dev, op_type);
+
+	if (rbd_img_is_write(img_req)) {
+		if (rbd_is_ro(rbd_dev)) {
+			rbd_warn(rbd_dev, "%s on read-only mapping",
+				 obj_op_name(img_req->op_type));
+			return BLK_STS_IOERR;
+		}
+		rbd_assert(!rbd_is_snap(rbd_dev));
+	}
+
+	INIT_WORK(&img_req->work, rbd_queue_workfn);
+	queue_work(rbd_wq, &img_req->work);
+	return BLK_STS_OK;
+}
+
+static void rbd_free_disk(struct rbd_device *rbd_dev)
+{
+	put_disk(rbd_dev->disk);
+	blk_mq_free_tag_set(&rbd_dev->tag_set);
+	rbd_dev->disk = NULL;
+}
+
+static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
+			     struct ceph_object_id *oid,
+			     struct ceph_object_locator *oloc,
+			     void *buf, int buf_len)
+
+{
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+	struct ceph_osd_request *req;
+	struct page **pages;
+	int num_pages = calc_pages_for(0, buf_len);
+	int ret;
+
+	req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL);
+	if (!req)
+		return -ENOMEM;
+
+	ceph_oid_copy(&req->r_base_oid, oid);
+	ceph_oloc_copy(&req->r_base_oloc, oloc);
+	req->r_flags = CEPH_OSD_FLAG_READ;
+
+	pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
+	if (IS_ERR(pages)) {
+		ret = PTR_ERR(pages);
+		goto out_req;
+	}
+
+	osd_req_op_extent_init(req, 0, CEPH_OSD_OP_READ, 0, buf_len, 0, 0);
+	osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false,
+					 true);
+
+	ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
+	if (ret)
+		goto out_req;
+
+	ceph_osdc_start_request(osdc, req);
+	ret = ceph_osdc_wait_request(osdc, req);
+	if (ret >= 0)
+		ceph_copy_from_page_vector(pages, buf, 0, ret);
+
+out_req:
+	ceph_osdc_put_request(req);
+	return ret;
+}
+
+/*
+ * Read the complete header for the given rbd device.  On successful
+ * return, the rbd_dev->header field will contain up-to-date
+ * information about the image.
+ */
+static int rbd_dev_v1_header_info(struct rbd_device *rbd_dev,
+				  struct rbd_image_header *header,
+				  bool first_time)
+{
+	struct rbd_image_header_ondisk *ondisk = NULL;
+	u32 snap_count = 0;
+	u64 names_size = 0;
+	u32 want_count;
+	int ret;
+
+	/*
+	 * The complete header will include an array of its 64-bit
+	 * snapshot ids, followed by the names of those snapshots as
+	 * a contiguous block of NUL-terminated strings.  Note that
+	 * the number of snapshots could change by the time we read
+	 * it in, in which case we re-read it.
+	 */
+	do {
+		size_t size;
+
+		kfree(ondisk);
+
+		size = sizeof (*ondisk);
+		size += snap_count * sizeof (struct rbd_image_snap_ondisk);
+		size += names_size;
+		ondisk = kmalloc(size, GFP_KERNEL);
+		if (!ondisk)
+			return -ENOMEM;
+
+		ret = rbd_obj_read_sync(rbd_dev, &rbd_dev->header_oid,
+					&rbd_dev->header_oloc, ondisk, size);
+		if (ret < 0)
+			goto out;
+		if ((size_t)ret < size) {
+			ret = -ENXIO;
+			rbd_warn(rbd_dev, "short header read (want %zd got %d)",
+				size, ret);
+			goto out;
+		}
+		if (!rbd_dev_ondisk_valid(ondisk)) {
+			ret = -ENXIO;
+			rbd_warn(rbd_dev, "invalid header");
+			goto out;
+		}
+
+		names_size = le64_to_cpu(ondisk->snap_names_len);
+		want_count = snap_count;
+		snap_count = le32_to_cpu(ondisk->snap_count);
+	} while (snap_count != want_count);
+
+	ret = rbd_header_from_disk(header, ondisk, first_time);
+out:
+	kfree(ondisk);
+
+	return ret;
+}
+
+static void rbd_dev_update_size(struct rbd_device *rbd_dev)
+{
+	sector_t size;
+
+	/*
+	 * If EXISTS is not set, rbd_dev->disk may be NULL, so don't
+	 * try to update its size.  If REMOVING is set, updating size
+	 * is just useless work since the device can't be opened.
+	 */
+	if (test_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags) &&
+	    !test_bit(RBD_DEV_FLAG_REMOVING, &rbd_dev->flags)) {
+		size = (sector_t)rbd_dev->mapping.size / SECTOR_SIZE;
+		dout("setting size to %llu sectors", (unsigned long long)size);
+		set_capacity_and_notify(rbd_dev->disk, size);
+	}
+}
+
+static const struct blk_mq_ops rbd_mq_ops = {
+	.queue_rq	= rbd_queue_rq,
+};
+
+static int rbd_init_disk(struct rbd_device *rbd_dev)
+{
+	struct gendisk *disk;
+	struct request_queue *q;
+	unsigned int objset_bytes =
+	    rbd_dev->layout.object_size * rbd_dev->layout.stripe_count;
+	int err;
+
+	memset(&rbd_dev->tag_set, 0, sizeof(rbd_dev->tag_set));
+	rbd_dev->tag_set.ops = &rbd_mq_ops;
+	rbd_dev->tag_set.queue_depth = rbd_dev->opts->queue_depth;
+	rbd_dev->tag_set.numa_node = NUMA_NO_NODE;
+	rbd_dev->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
+	rbd_dev->tag_set.nr_hw_queues = num_present_cpus();
+	rbd_dev->tag_set.cmd_size = sizeof(struct rbd_img_request);
+
+	err = blk_mq_alloc_tag_set(&rbd_dev->tag_set);
+	if (err)
+		return err;
+
+	disk = blk_mq_alloc_disk(&rbd_dev->tag_set, rbd_dev);
+	if (IS_ERR(disk)) {
+		err = PTR_ERR(disk);
+		goto out_tag_set;
+	}
+	q = disk->queue;
+
+	snprintf(disk->disk_name, sizeof(disk->disk_name), RBD_DRV_NAME "%d",
+		 rbd_dev->dev_id);
+	disk->major = rbd_dev->major;
+	disk->first_minor = rbd_dev->minor;
+	if (single_major)
+		disk->minors = (1 << RBD_SINGLE_MAJOR_PART_SHIFT);
+	else
+		disk->minors = RBD_MINORS_PER_MAJOR;
+	disk->fops = &rbd_bd_ops;
+	disk->private_data = rbd_dev;
+
+	blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
+	/* QUEUE_FLAG_ADD_RANDOM is off by default for blk-mq */
+
+	blk_queue_max_hw_sectors(q, objset_bytes >> SECTOR_SHIFT);
+	q->limits.max_sectors = queue_max_hw_sectors(q);
+	blk_queue_max_segments(q, USHRT_MAX);
+	blk_queue_max_segment_size(q, UINT_MAX);
+	blk_queue_io_min(q, rbd_dev->opts->alloc_size);
+	blk_queue_io_opt(q, rbd_dev->opts->alloc_size);
+
+	if (rbd_dev->opts->trim) {
+		q->limits.discard_granularity = rbd_dev->opts->alloc_size;
+		blk_queue_max_discard_sectors(q, objset_bytes >> SECTOR_SHIFT);
+		blk_queue_max_write_zeroes_sectors(q, objset_bytes >> SECTOR_SHIFT);
+	}
+
+	if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC))
+		blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, q);
+
+	rbd_dev->disk = disk;
+
+	return 0;
+out_tag_set:
+	blk_mq_free_tag_set(&rbd_dev->tag_set);
+	return err;
+}
+
+/*
+  sysfs
+*/
+
+static struct rbd_device *dev_to_rbd_dev(struct device *dev)
+{
+	return container_of(dev, struct rbd_device, dev);
+}
+
+static ssize_t rbd_size_show(struct device *dev,
+			     struct device_attribute *attr, char *buf)
+{
+	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+
+	return sprintf(buf, "%llu\n",
+		(unsigned long long)rbd_dev->mapping.size);
+}
+
+static ssize_t rbd_features_show(struct device *dev,
+			     struct device_attribute *attr, char *buf)
+{
+	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+
+	return sprintf(buf, "0x%016llx\n", rbd_dev->header.features);
+}
+
+static ssize_t rbd_major_show(struct device *dev,
+			      struct device_attribute *attr, char *buf)
+{
+	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+
+	if (rbd_dev->major)
+		return sprintf(buf, "%d\n", rbd_dev->major);
+
+	return sprintf(buf, "(none)\n");
+}
+
+static ssize_t rbd_minor_show(struct device *dev,
+			      struct device_attribute *attr, char *buf)
+{
+	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+
+	return sprintf(buf, "%d\n", rbd_dev->minor);
+}
+
+static ssize_t rbd_client_addr_show(struct device *dev,
+				    struct device_attribute *attr, char *buf)
+{
+	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+	struct ceph_entity_addr *client_addr =
+	    ceph_client_addr(rbd_dev->rbd_client->client);
+
+	return sprintf(buf, "%pISpc/%u\n", &client_addr->in_addr,
+		       le32_to_cpu(client_addr->nonce));
+}
+
+static ssize_t rbd_client_id_show(struct device *dev,
+				  struct device_attribute *attr, char *buf)
+{
+	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+
+	return sprintf(buf, "client%lld\n",
+		       ceph_client_gid(rbd_dev->rbd_client->client));
+}
+
+static ssize_t rbd_cluster_fsid_show(struct device *dev,
+				     struct device_attribute *attr, char *buf)
+{
+	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+
+	return sprintf(buf, "%pU\n", &rbd_dev->rbd_client->client->fsid);
+}
+
+static ssize_t rbd_config_info_show(struct device *dev,
+				    struct device_attribute *attr, char *buf)
+{
+	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	return sprintf(buf, "%s\n", rbd_dev->config_info);
+}
+
+static ssize_t rbd_pool_show(struct device *dev,
+			     struct device_attribute *attr, char *buf)
+{
+	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+
+	return sprintf(buf, "%s\n", rbd_dev->spec->pool_name);
+}
+
+static ssize_t rbd_pool_id_show(struct device *dev,
+			     struct device_attribute *attr, char *buf)
+{
+	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+
+	return sprintf(buf, "%llu\n",
+			(unsigned long long) rbd_dev->spec->pool_id);
+}
+
+static ssize_t rbd_pool_ns_show(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+
+	return sprintf(buf, "%s\n", rbd_dev->spec->pool_ns ?: "");
+}
+
+static ssize_t rbd_name_show(struct device *dev,
+			     struct device_attribute *attr, char *buf)
+{
+	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+
+	if (rbd_dev->spec->image_name)
+		return sprintf(buf, "%s\n", rbd_dev->spec->image_name);
+
+	return sprintf(buf, "(unknown)\n");
+}
+
+static ssize_t rbd_image_id_show(struct device *dev,
+			     struct device_attribute *attr, char *buf)
+{
+	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+
+	return sprintf(buf, "%s\n", rbd_dev->spec->image_id);
+}
+
+/*
+ * Shows the name of the currently-mapped snapshot (or
+ * RBD_SNAP_HEAD_NAME for the base image).
+ */
+static ssize_t rbd_snap_show(struct device *dev,
+			     struct device_attribute *attr,
+			     char *buf)
+{
+	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+
+	return sprintf(buf, "%s\n", rbd_dev->spec->snap_name);
+}
+
+static ssize_t rbd_snap_id_show(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+
+	return sprintf(buf, "%llu\n", rbd_dev->spec->snap_id);
+}
+
+/*
+ * For a v2 image, shows the chain of parent images, separated by empty
+ * lines.  For v1 images or if there is no parent, shows "(no parent
+ * image)".
+ */
+static ssize_t rbd_parent_show(struct device *dev,
+			       struct device_attribute *attr,
+			       char *buf)
+{
+	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+	ssize_t count = 0;
+
+	if (!rbd_dev->parent)
+		return sprintf(buf, "(no parent image)\n");
+
+	for ( ; rbd_dev->parent; rbd_dev = rbd_dev->parent) {
+		struct rbd_spec *spec = rbd_dev->parent_spec;
+
+		count += sprintf(&buf[count], "%s"
+			    "pool_id %llu\npool_name %s\n"
+			    "pool_ns %s\n"
+			    "image_id %s\nimage_name %s\n"
+			    "snap_id %llu\nsnap_name %s\n"
+			    "overlap %llu\n",
+			    !count ? "" : "\n", /* first? */
+			    spec->pool_id, spec->pool_name,
+			    spec->pool_ns ?: "",
+			    spec->image_id, spec->image_name ?: "(unknown)",
+			    spec->snap_id, spec->snap_name,
+			    rbd_dev->parent_overlap);
+	}
+
+	return count;
+}
+
+static ssize_t rbd_image_refresh(struct device *dev,
+				 struct device_attribute *attr,
+				 const char *buf,
+				 size_t size)
+{
+	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	ret = rbd_dev_refresh(rbd_dev);
+	if (ret)
+		return ret;
+
+	return size;
+}
+
+static DEVICE_ATTR(size, 0444, rbd_size_show, NULL);
+static DEVICE_ATTR(features, 0444, rbd_features_show, NULL);
+static DEVICE_ATTR(major, 0444, rbd_major_show, NULL);
+static DEVICE_ATTR(minor, 0444, rbd_minor_show, NULL);
+static DEVICE_ATTR(client_addr, 0444, rbd_client_addr_show, NULL);
+static DEVICE_ATTR(client_id, 0444, rbd_client_id_show, NULL);
+static DEVICE_ATTR(cluster_fsid, 0444, rbd_cluster_fsid_show, NULL);
+static DEVICE_ATTR(config_info, 0400, rbd_config_info_show, NULL);
+static DEVICE_ATTR(pool, 0444, rbd_pool_show, NULL);
+static DEVICE_ATTR(pool_id, 0444, rbd_pool_id_show, NULL);
+static DEVICE_ATTR(pool_ns, 0444, rbd_pool_ns_show, NULL);
+static DEVICE_ATTR(name, 0444, rbd_name_show, NULL);
+static DEVICE_ATTR(image_id, 0444, rbd_image_id_show, NULL);
+static DEVICE_ATTR(refresh, 0200, NULL, rbd_image_refresh);
+static DEVICE_ATTR(current_snap, 0444, rbd_snap_show, NULL);
+static DEVICE_ATTR(snap_id, 0444, rbd_snap_id_show, NULL);
+static DEVICE_ATTR(parent, 0444, rbd_parent_show, NULL);
+
+static struct attribute *rbd_attrs[] = {
+	&dev_attr_size.attr,
+	&dev_attr_features.attr,
+	&dev_attr_major.attr,
+	&dev_attr_minor.attr,
+	&dev_attr_client_addr.attr,
+	&dev_attr_client_id.attr,
+	&dev_attr_cluster_fsid.attr,
+	&dev_attr_config_info.attr,
+	&dev_attr_pool.attr,
+	&dev_attr_pool_id.attr,
+	&dev_attr_pool_ns.attr,
+	&dev_attr_name.attr,
+	&dev_attr_image_id.attr,
+	&dev_attr_current_snap.attr,
+	&dev_attr_snap_id.attr,
+	&dev_attr_parent.attr,
+	&dev_attr_refresh.attr,
+	NULL
+};
+
+static struct attribute_group rbd_attr_group = {
+	.attrs = rbd_attrs,
+};
+
+static const struct attribute_group *rbd_attr_groups[] = {
+	&rbd_attr_group,
+	NULL
+};
+
+static void rbd_dev_release(struct device *dev);
+
+static const struct device_type rbd_device_type = {
+	.name		= "rbd",
+	.groups		= rbd_attr_groups,
+	.release	= rbd_dev_release,
+};
+
+static struct rbd_spec *rbd_spec_get(struct rbd_spec *spec)
+{
+	kref_get(&spec->kref);
+
+	return spec;
+}
+
+static void rbd_spec_free(struct kref *kref);
+static void rbd_spec_put(struct rbd_spec *spec)
+{
+	if (spec)
+		kref_put(&spec->kref, rbd_spec_free);
+}
+
+static struct rbd_spec *rbd_spec_alloc(void)
+{
+	struct rbd_spec *spec;
+
+	spec = kzalloc(sizeof (*spec), GFP_KERNEL);
+	if (!spec)
+		return NULL;
+
+	spec->pool_id = CEPH_NOPOOL;
+	spec->snap_id = CEPH_NOSNAP;
+	kref_init(&spec->kref);
+
+	return spec;
+}
+
+static void rbd_spec_free(struct kref *kref)
+{
+	struct rbd_spec *spec = container_of(kref, struct rbd_spec, kref);
+
+	kfree(spec->pool_name);
+	kfree(spec->pool_ns);
+	kfree(spec->image_id);
+	kfree(spec->image_name);
+	kfree(spec->snap_name);
+	kfree(spec);
+}
+
+static void rbd_dev_free(struct rbd_device *rbd_dev)
+{
+	WARN_ON(rbd_dev->watch_state != RBD_WATCH_STATE_UNREGISTERED);
+	WARN_ON(rbd_dev->lock_state != RBD_LOCK_STATE_UNLOCKED);
+
+	ceph_oid_destroy(&rbd_dev->header_oid);
+	ceph_oloc_destroy(&rbd_dev->header_oloc);
+	kfree(rbd_dev->config_info);
+
+	rbd_put_client(rbd_dev->rbd_client);
+	rbd_spec_put(rbd_dev->spec);
+	kfree(rbd_dev->opts);
+	kfree(rbd_dev);
+}
+
+static void rbd_dev_release(struct device *dev)
+{
+	struct rbd_device *rbd_dev = dev_to_rbd_dev(dev);
+	bool need_put = !!rbd_dev->opts;
+
+	if (need_put) {
+		destroy_workqueue(rbd_dev->task_wq);
+		ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
+	}
+
+	rbd_dev_free(rbd_dev);
+
+	/*
+	 * This is racy, but way better than putting module outside of
+	 * the release callback.  The race window is pretty small, so
+	 * doing something similar to dm (dm-builtin.c) is overkill.
+	 */
+	if (need_put)
+		module_put(THIS_MODULE);
+}
+
+static struct rbd_device *__rbd_dev_create(struct rbd_spec *spec)
+{
+	struct rbd_device *rbd_dev;
+
+	rbd_dev = kzalloc(sizeof(*rbd_dev), GFP_KERNEL);
+	if (!rbd_dev)
+		return NULL;
+
+	spin_lock_init(&rbd_dev->lock);
+	INIT_LIST_HEAD(&rbd_dev->node);
+	init_rwsem(&rbd_dev->header_rwsem);
+
+	rbd_dev->header.data_pool_id = CEPH_NOPOOL;
+	ceph_oid_init(&rbd_dev->header_oid);
+	rbd_dev->header_oloc.pool = spec->pool_id;
+	if (spec->pool_ns) {
+		WARN_ON(!*spec->pool_ns);
+		rbd_dev->header_oloc.pool_ns =
+		    ceph_find_or_create_string(spec->pool_ns,
+					       strlen(spec->pool_ns));
+	}
+
+	mutex_init(&rbd_dev->watch_mutex);
+	rbd_dev->watch_state = RBD_WATCH_STATE_UNREGISTERED;
+	INIT_DELAYED_WORK(&rbd_dev->watch_dwork, rbd_reregister_watch);
+
+	init_rwsem(&rbd_dev->lock_rwsem);
+	rbd_dev->lock_state = RBD_LOCK_STATE_UNLOCKED;
+	INIT_WORK(&rbd_dev->acquired_lock_work, rbd_notify_acquired_lock);
+	INIT_WORK(&rbd_dev->released_lock_work, rbd_notify_released_lock);
+	INIT_DELAYED_WORK(&rbd_dev->lock_dwork, rbd_acquire_lock);
+	INIT_WORK(&rbd_dev->unlock_work, rbd_release_lock_work);
+	spin_lock_init(&rbd_dev->lock_lists_lock);
+	INIT_LIST_HEAD(&rbd_dev->acquiring_list);
+	INIT_LIST_HEAD(&rbd_dev->running_list);
+	init_completion(&rbd_dev->acquire_wait);
+	init_completion(&rbd_dev->releasing_wait);
+
+	spin_lock_init(&rbd_dev->object_map_lock);
+
+	rbd_dev->dev.bus = &rbd_bus_type;
+	rbd_dev->dev.type = &rbd_device_type;
+	rbd_dev->dev.parent = &rbd_root_dev;
+	device_initialize(&rbd_dev->dev);
+
+	return rbd_dev;
+}
+
+/*
+ * Create a mapping rbd_dev.
+ */
+static struct rbd_device *rbd_dev_create(struct rbd_client *rbdc,
+					 struct rbd_spec *spec,
+					 struct rbd_options *opts)
+{
+	struct rbd_device *rbd_dev;
+
+	rbd_dev = __rbd_dev_create(spec);
+	if (!rbd_dev)
+		return NULL;
+
+	/* get an id and fill in device name */
+	rbd_dev->dev_id = ida_simple_get(&rbd_dev_id_ida, 0,
+					 minor_to_rbd_dev_id(1 << MINORBITS),
+					 GFP_KERNEL);
+	if (rbd_dev->dev_id < 0)
+		goto fail_rbd_dev;
+
+	sprintf(rbd_dev->name, RBD_DRV_NAME "%d", rbd_dev->dev_id);
+	rbd_dev->task_wq = alloc_ordered_workqueue("%s-tasks", WQ_MEM_RECLAIM,
+						   rbd_dev->name);
+	if (!rbd_dev->task_wq)
+		goto fail_dev_id;
+
+	/* we have a ref from do_rbd_add() */
+	__module_get(THIS_MODULE);
+
+	rbd_dev->rbd_client = rbdc;
+	rbd_dev->spec = spec;
+	rbd_dev->opts = opts;
+
+	dout("%s rbd_dev %p dev_id %d\n", __func__, rbd_dev, rbd_dev->dev_id);
+	return rbd_dev;
+
+fail_dev_id:
+	ida_simple_remove(&rbd_dev_id_ida, rbd_dev->dev_id);
+fail_rbd_dev:
+	rbd_dev_free(rbd_dev);
+	return NULL;
+}
+
+static void rbd_dev_destroy(struct rbd_device *rbd_dev)
+{
+	if (rbd_dev)
+		put_device(&rbd_dev->dev);
+}
+
+/*
+ * Get the size and object order for an image snapshot, or if
+ * snap_id is CEPH_NOSNAP, gets this information for the base
+ * image.
+ */
+static int _rbd_dev_v2_snap_size(struct rbd_device *rbd_dev, u64 snap_id,
+				u8 *order, u64 *snap_size)
+{
+	__le64 snapid = cpu_to_le64(snap_id);
+	int ret;
+	struct {
+		u8 order;
+		__le64 size;
+	} __attribute__ ((packed)) size_buf = { 0 };
+
+	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
+				  &rbd_dev->header_oloc, "get_size",
+				  &snapid, sizeof(snapid),
+				  &size_buf, sizeof(size_buf));
+	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
+	if (ret < 0)
+		return ret;
+	if (ret < sizeof (size_buf))
+		return -ERANGE;
+
+	if (order) {
+		*order = size_buf.order;
+		dout("  order %u", (unsigned int)*order);
+	}
+	*snap_size = le64_to_cpu(size_buf.size);
+
+	dout("  snap_id 0x%016llx snap_size = %llu\n",
+		(unsigned long long)snap_id,
+		(unsigned long long)*snap_size);
+
+	return 0;
+}
+
+static int rbd_dev_v2_object_prefix(struct rbd_device *rbd_dev,
+				    char **pobject_prefix)
+{
+	size_t size;
+	void *reply_buf;
+	char *object_prefix;
+	int ret;
+	void *p;
+
+	/* Response will be an encoded string, which includes a length */
+	size = sizeof(__le32) + RBD_OBJ_PREFIX_LEN_MAX;
+	reply_buf = kzalloc(size, GFP_KERNEL);
+	if (!reply_buf)
+		return -ENOMEM;
+
+	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
+				  &rbd_dev->header_oloc, "get_object_prefix",
+				  NULL, 0, reply_buf, size);
+	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
+	if (ret < 0)
+		goto out;
+
+	p = reply_buf;
+	object_prefix = ceph_extract_encoded_string(&p, p + ret, NULL,
+						    GFP_NOIO);
+	if (IS_ERR(object_prefix)) {
+		ret = PTR_ERR(object_prefix);
+		goto out;
+	}
+	ret = 0;
+
+	*pobject_prefix = object_prefix;
+	dout("  object_prefix = %s\n", object_prefix);
+out:
+	kfree(reply_buf);
+
+	return ret;
+}
+
+static int _rbd_dev_v2_snap_features(struct rbd_device *rbd_dev, u64 snap_id,
+				     bool read_only, u64 *snap_features)
+{
+	struct {
+		__le64 snap_id;
+		u8 read_only;
+	} features_in;
+	struct {
+		__le64 features;
+		__le64 incompat;
+	} __attribute__ ((packed)) features_buf = { 0 };
+	u64 unsup;
+	int ret;
+
+	features_in.snap_id = cpu_to_le64(snap_id);
+	features_in.read_only = read_only;
+
+	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
+				  &rbd_dev->header_oloc, "get_features",
+				  &features_in, sizeof(features_in),
+				  &features_buf, sizeof(features_buf));
+	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
+	if (ret < 0)
+		return ret;
+	if (ret < sizeof (features_buf))
+		return -ERANGE;
+
+	unsup = le64_to_cpu(features_buf.incompat) & ~RBD_FEATURES_SUPPORTED;
+	if (unsup) {
+		rbd_warn(rbd_dev, "image uses unsupported features: 0x%llx",
+			 unsup);
+		return -ENXIO;
+	}
+
+	*snap_features = le64_to_cpu(features_buf.features);
+
+	dout("  snap_id 0x%016llx features = 0x%016llx incompat = 0x%016llx\n",
+		(unsigned long long)snap_id,
+		(unsigned long long)*snap_features,
+		(unsigned long long)le64_to_cpu(features_buf.incompat));
+
+	return 0;
+}
+
+/*
+ * These are generic image flags, but since they are used only for
+ * object map, store them in rbd_dev->object_map_flags.
+ *
+ * For the same reason, this function is called only on object map
+ * (re)load and not on header refresh.
+ */
+static int rbd_dev_v2_get_flags(struct rbd_device *rbd_dev)
+{
+	__le64 snapid = cpu_to_le64(rbd_dev->spec->snap_id);
+	__le64 flags;
+	int ret;
+
+	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
+				  &rbd_dev->header_oloc, "get_flags",
+				  &snapid, sizeof(snapid),
+				  &flags, sizeof(flags));
+	if (ret < 0)
+		return ret;
+	if (ret < sizeof(flags))
+		return -EBADMSG;
+
+	rbd_dev->object_map_flags = le64_to_cpu(flags);
+	return 0;
+}
+
+struct parent_image_info {
+	u64		pool_id;
+	const char	*pool_ns;
+	const char	*image_id;
+	u64		snap_id;
+
+	bool		has_overlap;
+	u64		overlap;
+};
+
+static void rbd_parent_info_cleanup(struct parent_image_info *pii)
+{
+	kfree(pii->pool_ns);
+	kfree(pii->image_id);
+
+	memset(pii, 0, sizeof(*pii));
+}
+
+/*
+ * The caller is responsible for @pii.
+ */
+static int decode_parent_image_spec(void **p, void *end,
+				    struct parent_image_info *pii)
+{
+	u8 struct_v;
+	u32 struct_len;
+	int ret;
+
+	ret = ceph_start_decoding(p, end, 1, "ParentImageSpec",
+				  &struct_v, &struct_len);
+	if (ret)
+		return ret;
+
+	ceph_decode_64_safe(p, end, pii->pool_id, e_inval);
+	pii->pool_ns = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
+	if (IS_ERR(pii->pool_ns)) {
+		ret = PTR_ERR(pii->pool_ns);
+		pii->pool_ns = NULL;
+		return ret;
+	}
+	pii->image_id = ceph_extract_encoded_string(p, end, NULL, GFP_KERNEL);
+	if (IS_ERR(pii->image_id)) {
+		ret = PTR_ERR(pii->image_id);
+		pii->image_id = NULL;
+		return ret;
+	}
+	ceph_decode_64_safe(p, end, pii->snap_id, e_inval);
+	return 0;
+
+e_inval:
+	return -EINVAL;
+}
+
+static int __get_parent_info(struct rbd_device *rbd_dev,
+			     struct page *req_page,
+			     struct page *reply_page,
+			     struct parent_image_info *pii)
+{
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+	size_t reply_len = PAGE_SIZE;
+	void *p, *end;
+	int ret;
+
+	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
+			     "rbd", "parent_get", CEPH_OSD_FLAG_READ,
+			     req_page, sizeof(u64), &reply_page, &reply_len);
+	if (ret)
+		return ret == -EOPNOTSUPP ? 1 : ret;
+
+	p = page_address(reply_page);
+	end = p + reply_len;
+	ret = decode_parent_image_spec(&p, end, pii);
+	if (ret)
+		return ret;
+
+	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
+			     "rbd", "parent_overlap_get", CEPH_OSD_FLAG_READ,
+			     req_page, sizeof(u64), &reply_page, &reply_len);
+	if (ret)
+		return ret;
+
+	p = page_address(reply_page);
+	end = p + reply_len;
+	ceph_decode_8_safe(&p, end, pii->has_overlap, e_inval);
+	if (pii->has_overlap)
+		ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
+
+	dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n",
+	     __func__, pii->pool_id, pii->pool_ns, pii->image_id, pii->snap_id,
+	     pii->has_overlap, pii->overlap);
+	return 0;
+
+e_inval:
+	return -EINVAL;
+}
+
+/*
+ * The caller is responsible for @pii.
+ */
+static int __get_parent_info_legacy(struct rbd_device *rbd_dev,
+				    struct page *req_page,
+				    struct page *reply_page,
+				    struct parent_image_info *pii)
+{
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+	size_t reply_len = PAGE_SIZE;
+	void *p, *end;
+	int ret;
+
+	ret = ceph_osdc_call(osdc, &rbd_dev->header_oid, &rbd_dev->header_oloc,
+			     "rbd", "get_parent", CEPH_OSD_FLAG_READ,
+			     req_page, sizeof(u64), &reply_page, &reply_len);
+	if (ret)
+		return ret;
+
+	p = page_address(reply_page);
+	end = p + reply_len;
+	ceph_decode_64_safe(&p, end, pii->pool_id, e_inval);
+	pii->image_id = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
+	if (IS_ERR(pii->image_id)) {
+		ret = PTR_ERR(pii->image_id);
+		pii->image_id = NULL;
+		return ret;
+	}
+	ceph_decode_64_safe(&p, end, pii->snap_id, e_inval);
+	pii->has_overlap = true;
+	ceph_decode_64_safe(&p, end, pii->overlap, e_inval);
+
+	dout("%s pool_id %llu pool_ns %s image_id %s snap_id %llu has_overlap %d overlap %llu\n",
+	     __func__, pii->pool_id, pii->pool_ns, pii->image_id, pii->snap_id,
+	     pii->has_overlap, pii->overlap);
+	return 0;
+
+e_inval:
+	return -EINVAL;
+}
+
+static int rbd_dev_v2_parent_info(struct rbd_device *rbd_dev,
+				  struct parent_image_info *pii)
+{
+	struct page *req_page, *reply_page;
+	void *p;
+	int ret;
+
+	req_page = alloc_page(GFP_KERNEL);
+	if (!req_page)
+		return -ENOMEM;
+
+	reply_page = alloc_page(GFP_KERNEL);
+	if (!reply_page) {
+		__free_page(req_page);
+		return -ENOMEM;
+	}
+
+	p = page_address(req_page);
+	ceph_encode_64(&p, rbd_dev->spec->snap_id);
+	ret = __get_parent_info(rbd_dev, req_page, reply_page, pii);
+	if (ret > 0)
+		ret = __get_parent_info_legacy(rbd_dev, req_page, reply_page,
+					       pii);
+
+	__free_page(req_page);
+	__free_page(reply_page);
+	return ret;
+}
+
+static int rbd_dev_setup_parent(struct rbd_device *rbd_dev)
+{
+	struct rbd_spec *parent_spec;
+	struct parent_image_info pii = { 0 };
+	int ret;
+
+	parent_spec = rbd_spec_alloc();
+	if (!parent_spec)
+		return -ENOMEM;
+
+	ret = rbd_dev_v2_parent_info(rbd_dev, &pii);
+	if (ret)
+		goto out_err;
+
+	if (pii.pool_id == CEPH_NOPOOL || !pii.has_overlap)
+		goto out;	/* No parent?  No problem. */
+
+	/* The ceph file layout needs to fit pool id in 32 bits */
+
+	ret = -EIO;
+	if (pii.pool_id > (u64)U32_MAX) {
+		rbd_warn(NULL, "parent pool id too large (%llu > %u)",
+			(unsigned long long)pii.pool_id, U32_MAX);
+		goto out_err;
+	}
+
+	/*
+	 * The parent won't change except when the clone is flattened,
+	 * so we only need to record the parent image spec once.
+	 */
+	parent_spec->pool_id = pii.pool_id;
+	if (pii.pool_ns && *pii.pool_ns) {
+		parent_spec->pool_ns = pii.pool_ns;
+		pii.pool_ns = NULL;
+	}
+	parent_spec->image_id = pii.image_id;
+	pii.image_id = NULL;
+	parent_spec->snap_id = pii.snap_id;
+
+	rbd_assert(!rbd_dev->parent_spec);
+	rbd_dev->parent_spec = parent_spec;
+	parent_spec = NULL;	/* rbd_dev now owns this */
+
+	/*
+	 * Record the parent overlap.  If it's zero, issue a warning as
+	 * we will proceed as if there is no parent.
+	 */
+	if (!pii.overlap)
+		rbd_warn(rbd_dev, "clone is standalone (overlap 0)");
+	rbd_dev->parent_overlap = pii.overlap;
+
+out:
+	ret = 0;
+out_err:
+	rbd_parent_info_cleanup(&pii);
+	rbd_spec_put(parent_spec);
+	return ret;
+}
+
+static int rbd_dev_v2_striping_info(struct rbd_device *rbd_dev,
+				    u64 *stripe_unit, u64 *stripe_count)
+{
+	struct {
+		__le64 stripe_unit;
+		__le64 stripe_count;
+	} __attribute__ ((packed)) striping_info_buf = { 0 };
+	size_t size = sizeof (striping_info_buf);
+	int ret;
+
+	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
+				&rbd_dev->header_oloc, "get_stripe_unit_count",
+				NULL, 0, &striping_info_buf, size);
+	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
+	if (ret < 0)
+		return ret;
+	if (ret < size)
+		return -ERANGE;
+
+	*stripe_unit = le64_to_cpu(striping_info_buf.stripe_unit);
+	*stripe_count = le64_to_cpu(striping_info_buf.stripe_count);
+	dout("  stripe_unit = %llu stripe_count = %llu\n", *stripe_unit,
+	     *stripe_count);
+
+	return 0;
+}
+
+static int rbd_dev_v2_data_pool(struct rbd_device *rbd_dev, s64 *data_pool_id)
+{
+	__le64 data_pool_buf;
+	int ret;
+
+	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
+				  &rbd_dev->header_oloc, "get_data_pool",
+				  NULL, 0, &data_pool_buf,
+				  sizeof(data_pool_buf));
+	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
+	if (ret < 0)
+		return ret;
+	if (ret < sizeof(data_pool_buf))
+		return -EBADMSG;
+
+	*data_pool_id = le64_to_cpu(data_pool_buf);
+	dout("  data_pool_id = %lld\n", *data_pool_id);
+	WARN_ON(*data_pool_id == CEPH_NOPOOL);
+
+	return 0;
+}
+
+static char *rbd_dev_image_name(struct rbd_device *rbd_dev)
+{
+	CEPH_DEFINE_OID_ONSTACK(oid);
+	size_t image_id_size;
+	char *image_id;
+	void *p;
+	void *end;
+	size_t size;
+	void *reply_buf = NULL;
+	size_t len = 0;
+	char *image_name = NULL;
+	int ret;
+
+	rbd_assert(!rbd_dev->spec->image_name);
+
+	len = strlen(rbd_dev->spec->image_id);
+	image_id_size = sizeof (__le32) + len;
+	image_id = kmalloc(image_id_size, GFP_KERNEL);
+	if (!image_id)
+		return NULL;
+
+	p = image_id;
+	end = image_id + image_id_size;
+	ceph_encode_string(&p, end, rbd_dev->spec->image_id, (u32)len);
+
+	size = sizeof (__le32) + RBD_IMAGE_NAME_LEN_MAX;
+	reply_buf = kmalloc(size, GFP_KERNEL);
+	if (!reply_buf)
+		goto out;
+
+	ceph_oid_printf(&oid, "%s", RBD_DIRECTORY);
+	ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
+				  "dir_get_name", image_id, image_id_size,
+				  reply_buf, size);
+	if (ret < 0)
+		goto out;
+	p = reply_buf;
+	end = reply_buf + ret;
+
+	image_name = ceph_extract_encoded_string(&p, end, &len, GFP_KERNEL);
+	if (IS_ERR(image_name))
+		image_name = NULL;
+	else
+		dout("%s: name is %s len is %zd\n", __func__, image_name, len);
+out:
+	kfree(reply_buf);
+	kfree(image_id);
+
+	return image_name;
+}
+
+static u64 rbd_v1_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
+{
+	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
+	const char *snap_name;
+	u32 which = 0;
+
+	/* Skip over names until we find the one we are looking for */
+
+	snap_name = rbd_dev->header.snap_names;
+	while (which < snapc->num_snaps) {
+		if (!strcmp(name, snap_name))
+			return snapc->snaps[which];
+		snap_name += strlen(snap_name) + 1;
+		which++;
+	}
+	return CEPH_NOSNAP;
+}
+
+static u64 rbd_v2_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
+{
+	struct ceph_snap_context *snapc = rbd_dev->header.snapc;
+	u32 which;
+	bool found = false;
+	u64 snap_id;
+
+	for (which = 0; !found && which < snapc->num_snaps; which++) {
+		const char *snap_name;
+
+		snap_id = snapc->snaps[which];
+		snap_name = rbd_dev_v2_snap_name(rbd_dev, snap_id);
+		if (IS_ERR(snap_name)) {
+			/* ignore no-longer existing snapshots */
+			if (PTR_ERR(snap_name) == -ENOENT)
+				continue;
+			else
+				break;
+		}
+		found = !strcmp(name, snap_name);
+		kfree(snap_name);
+	}
+	return found ? snap_id : CEPH_NOSNAP;
+}
+
+/*
+ * Assumes name is never RBD_SNAP_HEAD_NAME; returns CEPH_NOSNAP if
+ * no snapshot by that name is found, or if an error occurs.
+ */
+static u64 rbd_snap_id_by_name(struct rbd_device *rbd_dev, const char *name)
+{
+	if (rbd_dev->image_format == 1)
+		return rbd_v1_snap_id_by_name(rbd_dev, name);
+
+	return rbd_v2_snap_id_by_name(rbd_dev, name);
+}
+
+/*
+ * An image being mapped will have everything but the snap id.
+ */
+static int rbd_spec_fill_snap_id(struct rbd_device *rbd_dev)
+{
+	struct rbd_spec *spec = rbd_dev->spec;
+
+	rbd_assert(spec->pool_id != CEPH_NOPOOL && spec->pool_name);
+	rbd_assert(spec->image_id && spec->image_name);
+	rbd_assert(spec->snap_name);
+
+	if (strcmp(spec->snap_name, RBD_SNAP_HEAD_NAME)) {
+		u64 snap_id;
+
+		snap_id = rbd_snap_id_by_name(rbd_dev, spec->snap_name);
+		if (snap_id == CEPH_NOSNAP)
+			return -ENOENT;
+
+		spec->snap_id = snap_id;
+	} else {
+		spec->snap_id = CEPH_NOSNAP;
+	}
+
+	return 0;
+}
+
+/*
+ * A parent image will have all ids but none of the names.
+ *
+ * All names in an rbd spec are dynamically allocated.  It's OK if we
+ * can't figure out the name for an image id.
+ */
+static int rbd_spec_fill_names(struct rbd_device *rbd_dev)
+{
+	struct ceph_osd_client *osdc = &rbd_dev->rbd_client->client->osdc;
+	struct rbd_spec *spec = rbd_dev->spec;
+	const char *pool_name;
+	const char *image_name;
+	const char *snap_name;
+	int ret;
+
+	rbd_assert(spec->pool_id != CEPH_NOPOOL);
+	rbd_assert(spec->image_id);
+	rbd_assert(spec->snap_id != CEPH_NOSNAP);
+
+	/* Get the pool name; we have to make our own copy of this */
+
+	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, spec->pool_id);
+	if (!pool_name) {
+		rbd_warn(rbd_dev, "no pool with id %llu", spec->pool_id);
+		return -EIO;
+	}
+	pool_name = kstrdup(pool_name, GFP_KERNEL);
+	if (!pool_name)
+		return -ENOMEM;
+
+	/* Fetch the image name; tolerate failure here */
+
+	image_name = rbd_dev_image_name(rbd_dev);
+	if (!image_name)
+		rbd_warn(rbd_dev, "unable to get image name");
+
+	/* Fetch the snapshot name */
+
+	snap_name = rbd_snap_name(rbd_dev, spec->snap_id);
+	if (IS_ERR(snap_name)) {
+		ret = PTR_ERR(snap_name);
+		goto out_err;
+	}
+
+	spec->pool_name = pool_name;
+	spec->image_name = image_name;
+	spec->snap_name = snap_name;
+
+	return 0;
+
+out_err:
+	kfree(image_name);
+	kfree(pool_name);
+	return ret;
+}
+
+static int rbd_dev_v2_snap_context(struct rbd_device *rbd_dev,
+				   struct ceph_snap_context **psnapc)
+{
+	size_t size;
+	int ret;
+	void *reply_buf;
+	void *p;
+	void *end;
+	u64 seq;
+	u32 snap_count;
+	struct ceph_snap_context *snapc;
+	u32 i;
+
+	/*
+	 * We'll need room for the seq value (maximum snapshot id),
+	 * snapshot count, and array of that many snapshot ids.
+	 * For now we have a fixed upper limit on the number we're
+	 * prepared to receive.
+	 */
+	size = sizeof (__le64) + sizeof (__le32) +
+			RBD_MAX_SNAP_COUNT * sizeof (__le64);
+	reply_buf = kzalloc(size, GFP_KERNEL);
+	if (!reply_buf)
+		return -ENOMEM;
+
+	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
+				  &rbd_dev->header_oloc, "get_snapcontext",
+				  NULL, 0, reply_buf, size);
+	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
+	if (ret < 0)
+		goto out;
+
+	p = reply_buf;
+	end = reply_buf + ret;
+	ret = -ERANGE;
+	ceph_decode_64_safe(&p, end, seq, out);
+	ceph_decode_32_safe(&p, end, snap_count, out);
+
+	/*
+	 * Make sure the reported number of snapshot ids wouldn't go
+	 * beyond the end of our buffer.  But before checking that,
+	 * make sure the computed size of the snapshot context we
+	 * allocate is representable in a size_t.
+	 */
+	if (snap_count > (SIZE_MAX - sizeof (struct ceph_snap_context))
+				 / sizeof (u64)) {
+		ret = -EINVAL;
+		goto out;
+	}
+	if (!ceph_has_room(&p, end, snap_count * sizeof (__le64)))
+		goto out;
+	ret = 0;
+
+	snapc = ceph_create_snap_context(snap_count, GFP_KERNEL);
+	if (!snapc) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	snapc->seq = seq;
+	for (i = 0; i < snap_count; i++)
+		snapc->snaps[i] = ceph_decode_64(&p);
+
+	*psnapc = snapc;
+	dout("  snap context seq = %llu, snap_count = %u\n",
+		(unsigned long long)seq, (unsigned int)snap_count);
+out:
+	kfree(reply_buf);
+
+	return ret;
+}
+
+static const char *rbd_dev_v2_snap_name(struct rbd_device *rbd_dev,
+					u64 snap_id)
+{
+	size_t size;
+	void *reply_buf;
+	__le64 snapid;
+	int ret;
+	void *p;
+	void *end;
+	char *snap_name;
+
+	size = sizeof (__le32) + RBD_MAX_SNAP_NAME_LEN;
+	reply_buf = kmalloc(size, GFP_KERNEL);
+	if (!reply_buf)
+		return ERR_PTR(-ENOMEM);
+
+	snapid = cpu_to_le64(snap_id);
+	ret = rbd_obj_method_sync(rbd_dev, &rbd_dev->header_oid,
+				  &rbd_dev->header_oloc, "get_snapshot_name",
+				  &snapid, sizeof(snapid), reply_buf, size);
+	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
+	if (ret < 0) {
+		snap_name = ERR_PTR(ret);
+		goto out;
+	}
+
+	p = reply_buf;
+	end = reply_buf + ret;
+	snap_name = ceph_extract_encoded_string(&p, end, NULL, GFP_KERNEL);
+	if (IS_ERR(snap_name))
+		goto out;
+
+	dout("  snap_id 0x%016llx snap_name = %s\n",
+		(unsigned long long)snap_id, snap_name);
+out:
+	kfree(reply_buf);
+
+	return snap_name;
+}
+
+static int rbd_dev_v2_header_info(struct rbd_device *rbd_dev,
+				  struct rbd_image_header *header,
+				  bool first_time)
+{
+	int ret;
+
+	ret = _rbd_dev_v2_snap_size(rbd_dev, CEPH_NOSNAP,
+				    first_time ? &header->obj_order : NULL,
+				    &header->image_size);
+	if (ret)
+		return ret;
+
+	if (first_time) {
+		ret = rbd_dev_v2_header_onetime(rbd_dev, header);
+		if (ret)
+			return ret;
+	}
+
+	ret = rbd_dev_v2_snap_context(rbd_dev, &header->snapc);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+static int rbd_dev_header_info(struct rbd_device *rbd_dev,
+			       struct rbd_image_header *header,
+			       bool first_time)
+{
+	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
+	rbd_assert(!header->object_prefix && !header->snapc);
+
+	if (rbd_dev->image_format == 1)
+		return rbd_dev_v1_header_info(rbd_dev, header, first_time);
+
+	return rbd_dev_v2_header_info(rbd_dev, header, first_time);
+}
+
+/*
+ * Skips over white space at *buf, and updates *buf to point to the
+ * first found non-space character (if any). Returns the length of
+ * the token (string of non-white space characters) found.  Note
+ * that *buf must be terminated with '\0'.
+ */
+static inline size_t next_token(const char **buf)
+{
+        /*
+        * These are the characters that produce nonzero for
+        * isspace() in the "C" and "POSIX" locales.
+        */
+	static const char spaces[] = " \f\n\r\t\v";
+
+        *buf += strspn(*buf, spaces);	/* Find start of token */
+
+	return strcspn(*buf, spaces);   /* Return token length */
+}
+
+/*
+ * Finds the next token in *buf, dynamically allocates a buffer big
+ * enough to hold a copy of it, and copies the token into the new
+ * buffer.  The copy is guaranteed to be terminated with '\0'.  Note
+ * that a duplicate buffer is created even for a zero-length token.
+ *
+ * Returns a pointer to the newly-allocated duplicate, or a null
+ * pointer if memory for the duplicate was not available.  If
+ * the lenp argument is a non-null pointer, the length of the token
+ * (not including the '\0') is returned in *lenp.
+ *
+ * If successful, the *buf pointer will be updated to point beyond
+ * the end of the found token.
+ *
+ * Note: uses GFP_KERNEL for allocation.
+ */
+static inline char *dup_token(const char **buf, size_t *lenp)
+{
+	char *dup;
+	size_t len;
+
+	len = next_token(buf);
+	dup = kmemdup(*buf, len + 1, GFP_KERNEL);
+	if (!dup)
+		return NULL;
+	*(dup + len) = '\0';
+	*buf += len;
+
+	if (lenp)
+		*lenp = len;
+
+	return dup;
+}
+
+static int rbd_parse_param(struct fs_parameter *param,
+			    struct rbd_parse_opts_ctx *pctx)
+{
+	struct rbd_options *opt = pctx->opts;
+	struct fs_parse_result result;
+	struct p_log log = {.prefix = "rbd"};
+	int token, ret;
+
+	ret = ceph_parse_param(param, pctx->copts, NULL);
+	if (ret != -ENOPARAM)
+		return ret;
+
+	token = __fs_parse(&log, rbd_parameters, param, &result);
+	dout("%s fs_parse '%s' token %d\n", __func__, param->key, token);
+	if (token < 0) {
+		if (token == -ENOPARAM)
+			return inval_plog(&log, "Unknown parameter '%s'",
+					  param->key);
+		return token;
+	}
+
+	switch (token) {
+	case Opt_queue_depth:
+		if (result.uint_32 < 1)
+			goto out_of_range;
+		opt->queue_depth = result.uint_32;
+		break;
+	case Opt_alloc_size:
+		if (result.uint_32 < SECTOR_SIZE)
+			goto out_of_range;
+		if (!is_power_of_2(result.uint_32))
+			return inval_plog(&log, "alloc_size must be a power of 2");
+		opt->alloc_size = result.uint_32;
+		break;
+	case Opt_lock_timeout:
+		/* 0 is "wait forever" (i.e. infinite timeout) */
+		if (result.uint_32 > INT_MAX / 1000)
+			goto out_of_range;
+		opt->lock_timeout = msecs_to_jiffies(result.uint_32 * 1000);
+		break;
+	case Opt_pool_ns:
+		kfree(pctx->spec->pool_ns);
+		pctx->spec->pool_ns = param->string;
+		param->string = NULL;
+		break;
+	case Opt_compression_hint:
+		switch (result.uint_32) {
+		case Opt_compression_hint_none:
+			opt->alloc_hint_flags &=
+			    ~(CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE |
+			      CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE);
+			break;
+		case Opt_compression_hint_compressible:
+			opt->alloc_hint_flags |=
+			    CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE;
+			opt->alloc_hint_flags &=
+			    ~CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE;
+			break;
+		case Opt_compression_hint_incompressible:
+			opt->alloc_hint_flags |=
+			    CEPH_OSD_ALLOC_HINT_FLAG_INCOMPRESSIBLE;
+			opt->alloc_hint_flags &=
+			    ~CEPH_OSD_ALLOC_HINT_FLAG_COMPRESSIBLE;
+			break;
+		default:
+			BUG();
+		}
+		break;
+	case Opt_read_only:
+		opt->read_only = true;
+		break;
+	case Opt_read_write:
+		opt->read_only = false;
+		break;
+	case Opt_lock_on_read:
+		opt->lock_on_read = true;
+		break;
+	case Opt_exclusive:
+		opt->exclusive = true;
+		break;
+	case Opt_notrim:
+		opt->trim = false;
+		break;
+	default:
+		BUG();
+	}
+
+	return 0;
+
+out_of_range:
+	return inval_plog(&log, "%s out of range", param->key);
+}
+
+/*
+ * This duplicates most of generic_parse_monolithic(), untying it from
+ * fs_context and skipping standard superblock and security options.
+ */
+static int rbd_parse_options(char *options, struct rbd_parse_opts_ctx *pctx)
+{
+	char *key;
+	int ret = 0;
+
+	dout("%s '%s'\n", __func__, options);
+	while ((key = strsep(&options, ",")) != NULL) {
+		if (*key) {
+			struct fs_parameter param = {
+				.key	= key,
+				.type	= fs_value_is_flag,
+			};
+			char *value = strchr(key, '=');
+			size_t v_len = 0;
+
+			if (value) {
+				if (value == key)
+					continue;
+				*value++ = 0;
+				v_len = strlen(value);
+				param.string = kmemdup_nul(value, v_len,
+							   GFP_KERNEL);
+				if (!param.string)
+					return -ENOMEM;
+				param.type = fs_value_is_string;
+			}
+			param.size = v_len;
+
+			ret = rbd_parse_param(&param, pctx);
+			kfree(param.string);
+			if (ret)
+				break;
+		}
+	}
+
+	return ret;
+}
+
+/*
+ * Parse the options provided for an "rbd add" (i.e., rbd image
+ * mapping) request.  These arrive via a write to /sys/bus/rbd/add,
+ * and the data written is passed here via a NUL-terminated buffer.
+ * Returns 0 if successful or an error code otherwise.
+ *
+ * The information extracted from these options is recorded in
+ * the other parameters which return dynamically-allocated
+ * structures:
+ *  ceph_opts
+ *      The address of a pointer that will refer to a ceph options
+ *      structure.  Caller must release the returned pointer using
+ *      ceph_destroy_options() when it is no longer needed.
+ *  rbd_opts
+ *	Address of an rbd options pointer.  Fully initialized by
+ *	this function; caller must release with kfree().
+ *  spec
+ *	Address of an rbd image specification pointer.  Fully
+ *	initialized by this function based on parsed options.
+ *	Caller must release with rbd_spec_put().
+ *
+ * The options passed take this form:
+ *  <mon_addrs> <options> <pool_name> <image_name> [<snap_id>]
+ * where:
+ *  <mon_addrs>
+ *      A comma-separated list of one or more monitor addresses.
+ *      A monitor address is an ip address, optionally followed
+ *      by a port number (separated by a colon).
+ *        I.e.:  ip1[:port1][,ip2[:port2]...]
+ *  <options>
+ *      A comma-separated list of ceph and/or rbd options.
+ *  <pool_name>
+ *      The name of the rados pool containing the rbd image.
+ *  <image_name>
+ *      The name of the image in that pool to map.
+ *  <snap_id>
+ *      An optional snapshot id.  If provided, the mapping will
+ *      present data from the image at the time that snapshot was
+ *      created.  The image head is used if no snapshot id is
+ *      provided.  Snapshot mappings are always read-only.
+ */
+static int rbd_add_parse_args(const char *buf,
+				struct ceph_options **ceph_opts,
+				struct rbd_options **opts,
+				struct rbd_spec **rbd_spec)
+{
+	size_t len;
+	char *options;
+	const char *mon_addrs;
+	char *snap_name;
+	size_t mon_addrs_size;
+	struct rbd_parse_opts_ctx pctx = { 0 };
+	int ret;
+
+	/* The first four tokens are required */
+
+	len = next_token(&buf);
+	if (!len) {
+		rbd_warn(NULL, "no monitor address(es) provided");
+		return -EINVAL;
+	}
+	mon_addrs = buf;
+	mon_addrs_size = len;
+	buf += len;
+
+	ret = -EINVAL;
+	options = dup_token(&buf, NULL);
+	if (!options)
+		return -ENOMEM;
+	if (!*options) {
+		rbd_warn(NULL, "no options provided");
+		goto out_err;
+	}
+
+	pctx.spec = rbd_spec_alloc();
+	if (!pctx.spec)
+		goto out_mem;
+
+	pctx.spec->pool_name = dup_token(&buf, NULL);
+	if (!pctx.spec->pool_name)
+		goto out_mem;
+	if (!*pctx.spec->pool_name) {
+		rbd_warn(NULL, "no pool name provided");
+		goto out_err;
+	}
+
+	pctx.spec->image_name = dup_token(&buf, NULL);
+	if (!pctx.spec->image_name)
+		goto out_mem;
+	if (!*pctx.spec->image_name) {
+		rbd_warn(NULL, "no image name provided");
+		goto out_err;
+	}
+
+	/*
+	 * Snapshot name is optional; default is to use "-"
+	 * (indicating the head/no snapshot).
+	 */
+	len = next_token(&buf);
+	if (!len) {
+		buf = RBD_SNAP_HEAD_NAME; /* No snapshot supplied */
+		len = sizeof (RBD_SNAP_HEAD_NAME) - 1;
+	} else if (len > RBD_MAX_SNAP_NAME_LEN) {
+		ret = -ENAMETOOLONG;
+		goto out_err;
+	}
+	snap_name = kmemdup(buf, len + 1, GFP_KERNEL);
+	if (!snap_name)
+		goto out_mem;
+	*(snap_name + len) = '\0';
+	pctx.spec->snap_name = snap_name;
+
+	pctx.copts = ceph_alloc_options();
+	if (!pctx.copts)
+		goto out_mem;
+
+	/* Initialize all rbd options to the defaults */
+
+	pctx.opts = kzalloc(sizeof(*pctx.opts), GFP_KERNEL);
+	if (!pctx.opts)
+		goto out_mem;
+
+	pctx.opts->read_only = RBD_READ_ONLY_DEFAULT;
+	pctx.opts->queue_depth = RBD_QUEUE_DEPTH_DEFAULT;
+	pctx.opts->alloc_size = RBD_ALLOC_SIZE_DEFAULT;
+	pctx.opts->lock_timeout = RBD_LOCK_TIMEOUT_DEFAULT;
+	pctx.opts->lock_on_read = RBD_LOCK_ON_READ_DEFAULT;
+	pctx.opts->exclusive = RBD_EXCLUSIVE_DEFAULT;
+	pctx.opts->trim = RBD_TRIM_DEFAULT;
+
+	ret = ceph_parse_mon_ips(mon_addrs, mon_addrs_size, pctx.copts, NULL,
+				 ',');
+	if (ret)
+		goto out_err;
+
+	ret = rbd_parse_options(options, &pctx);
+	if (ret)
+		goto out_err;
+
+	*ceph_opts = pctx.copts;
+	*opts = pctx.opts;
+	*rbd_spec = pctx.spec;
+	kfree(options);
+	return 0;
+
+out_mem:
+	ret = -ENOMEM;
+out_err:
+	kfree(pctx.opts);
+	ceph_destroy_options(pctx.copts);
+	rbd_spec_put(pctx.spec);
+	kfree(options);
+	return ret;
+}
+
+static void rbd_dev_image_unlock(struct rbd_device *rbd_dev)
+{
+	down_write(&rbd_dev->lock_rwsem);
+	if (__rbd_is_lock_owner(rbd_dev))
+		__rbd_release_lock(rbd_dev);
+	up_write(&rbd_dev->lock_rwsem);
+}
+
+/*
+ * If the wait is interrupted, an error is returned even if the lock
+ * was successfully acquired.  rbd_dev_image_unlock() will release it
+ * if needed.
+ */
+static int rbd_add_acquire_lock(struct rbd_device *rbd_dev)
+{
+	long ret;
+
+	if (!(rbd_dev->header.features & RBD_FEATURE_EXCLUSIVE_LOCK)) {
+		if (!rbd_dev->opts->exclusive && !rbd_dev->opts->lock_on_read)
+			return 0;
+
+		rbd_warn(rbd_dev, "exclusive-lock feature is not enabled");
+		return -EINVAL;
+	}
+
+	if (rbd_is_ro(rbd_dev))
+		return 0;
+
+	rbd_assert(!rbd_is_lock_owner(rbd_dev));
+	queue_delayed_work(rbd_dev->task_wq, &rbd_dev->lock_dwork, 0);
+	ret = wait_for_completion_killable_timeout(&rbd_dev->acquire_wait,
+			    ceph_timeout_jiffies(rbd_dev->opts->lock_timeout));
+	if (ret > 0) {
+		ret = rbd_dev->acquire_err;
+	} else {
+		cancel_delayed_work_sync(&rbd_dev->lock_dwork);
+		if (!ret)
+			ret = -ETIMEDOUT;
+
+		rbd_warn(rbd_dev, "failed to acquire lock: %ld", ret);
+	}
+	if (ret)
+		return ret;
+
+	/*
+	 * The lock may have been released by now, unless automatic lock
+	 * transitions are disabled.
+	 */
+	rbd_assert(!rbd_dev->opts->exclusive || rbd_is_lock_owner(rbd_dev));
+	return 0;
+}
+
+/*
+ * An rbd format 2 image has a unique identifier, distinct from the
+ * name given to it by the user.  Internally, that identifier is
+ * what's used to specify the names of objects related to the image.
+ *
+ * A special "rbd id" object is used to map an rbd image name to its
+ * id.  If that object doesn't exist, then there is no v2 rbd image
+ * with the supplied name.
+ *
+ * This function will record the given rbd_dev's image_id field if
+ * it can be determined, and in that case will return 0.  If any
+ * errors occur a negative errno will be returned and the rbd_dev's
+ * image_id field will be unchanged (and should be NULL).
+ */
+static int rbd_dev_image_id(struct rbd_device *rbd_dev)
+{
+	int ret;
+	size_t size;
+	CEPH_DEFINE_OID_ONSTACK(oid);
+	void *response;
+	char *image_id;
+
+	/*
+	 * When probing a parent image, the image id is already
+	 * known (and the image name likely is not).  There's no
+	 * need to fetch the image id again in this case.  We
+	 * do still need to set the image format though.
+	 */
+	if (rbd_dev->spec->image_id) {
+		rbd_dev->image_format = *rbd_dev->spec->image_id ? 2 : 1;
+
+		return 0;
+	}
+
+	/*
+	 * First, see if the format 2 image id file exists, and if
+	 * so, get the image's persistent id from it.
+	 */
+	ret = ceph_oid_aprintf(&oid, GFP_KERNEL, "%s%s", RBD_ID_PREFIX,
+			       rbd_dev->spec->image_name);
+	if (ret)
+		return ret;
+
+	dout("rbd id object name is %s\n", oid.name);
+
+	/* Response will be an encoded string, which includes a length */
+	size = sizeof (__le32) + RBD_IMAGE_ID_LEN_MAX;
+	response = kzalloc(size, GFP_NOIO);
+	if (!response) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	/* If it doesn't exist we'll assume it's a format 1 image */
+
+	ret = rbd_obj_method_sync(rbd_dev, &oid, &rbd_dev->header_oloc,
+				  "get_id", NULL, 0,
+				  response, size);
+	dout("%s: rbd_obj_method_sync returned %d\n", __func__, ret);
+	if (ret == -ENOENT) {
+		image_id = kstrdup("", GFP_KERNEL);
+		ret = image_id ? 0 : -ENOMEM;
+		if (!ret)
+			rbd_dev->image_format = 1;
+	} else if (ret >= 0) {
+		void *p = response;
+
+		image_id = ceph_extract_encoded_string(&p, p + ret,
+						NULL, GFP_NOIO);
+		ret = PTR_ERR_OR_ZERO(image_id);
+		if (!ret)
+			rbd_dev->image_format = 2;
+	}
+
+	if (!ret) {
+		rbd_dev->spec->image_id = image_id;
+		dout("image_id is %s\n", image_id);
+	}
+out:
+	kfree(response);
+	ceph_oid_destroy(&oid);
+	return ret;
+}
+
+/*
+ * Undo whatever state changes are made by v1 or v2 header info
+ * call.
+ */
+static void rbd_dev_unprobe(struct rbd_device *rbd_dev)
+{
+	rbd_dev_parent_put(rbd_dev);
+	rbd_object_map_free(rbd_dev);
+	rbd_dev_mapping_clear(rbd_dev);
+
+	/* Free dynamic fields from the header, then zero it out */
+
+	rbd_image_header_cleanup(&rbd_dev->header);
+}
+
+static int rbd_dev_v2_header_onetime(struct rbd_device *rbd_dev,
+				     struct rbd_image_header *header)
+{
+	int ret;
+
+	ret = rbd_dev_v2_object_prefix(rbd_dev, &header->object_prefix);
+	if (ret)
+		return ret;
+
+	/*
+	 * Get the and check features for the image.  Currently the
+	 * features are assumed to never change.
+	 */
+	ret = _rbd_dev_v2_snap_features(rbd_dev, CEPH_NOSNAP,
+					rbd_is_ro(rbd_dev), &header->features);
+	if (ret)
+		return ret;
+
+	/* If the image supports fancy striping, get its parameters */
+
+	if (header->features & RBD_FEATURE_STRIPINGV2) {
+		ret = rbd_dev_v2_striping_info(rbd_dev, &header->stripe_unit,
+					       &header->stripe_count);
+		if (ret)
+			return ret;
+	}
+
+	if (header->features & RBD_FEATURE_DATA_POOL) {
+		ret = rbd_dev_v2_data_pool(rbd_dev, &header->data_pool_id);
+		if (ret)
+			return ret;
+	}
+
+	return 0;
+}
+
+/*
+ * @depth is rbd_dev_image_probe() -> rbd_dev_probe_parent() ->
+ * rbd_dev_image_probe() recursion depth, which means it's also the
+ * length of the already discovered part of the parent chain.
+ */
+static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth)
+{
+	struct rbd_device *parent = NULL;
+	int ret;
+
+	if (!rbd_dev->parent_spec)
+		return 0;
+
+	if (++depth > RBD_MAX_PARENT_CHAIN_LEN) {
+		pr_info("parent chain is too long (%d)\n", depth);
+		ret = -EINVAL;
+		goto out_err;
+	}
+
+	parent = __rbd_dev_create(rbd_dev->parent_spec);
+	if (!parent) {
+		ret = -ENOMEM;
+		goto out_err;
+	}
+
+	/*
+	 * Images related by parent/child relationships always share
+	 * rbd_client and spec/parent_spec, so bump their refcounts.
+	 */
+	parent->rbd_client = __rbd_get_client(rbd_dev->rbd_client);
+	parent->spec = rbd_spec_get(rbd_dev->parent_spec);
+
+	__set_bit(RBD_DEV_FLAG_READONLY, &parent->flags);
+
+	ret = rbd_dev_image_probe(parent, depth);
+	if (ret < 0)
+		goto out_err;
+
+	rbd_dev->parent = parent;
+	atomic_set(&rbd_dev->parent_ref, 1);
+	return 0;
+
+out_err:
+	rbd_dev_unparent(rbd_dev);
+	rbd_dev_destroy(parent);
+	return ret;
+}
+
+static void rbd_dev_device_release(struct rbd_device *rbd_dev)
+{
+	clear_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
+	rbd_free_disk(rbd_dev);
+	if (!single_major)
+		unregister_blkdev(rbd_dev->major, rbd_dev->name);
+}
+
+/*
+ * rbd_dev->header_rwsem must be locked for write and will be unlocked
+ * upon return.
+ */
+static int rbd_dev_device_setup(struct rbd_device *rbd_dev)
+{
+	int ret;
+
+	/* Record our major and minor device numbers. */
+
+	if (!single_major) {
+		ret = register_blkdev(0, rbd_dev->name);
+		if (ret < 0)
+			goto err_out_unlock;
+
+		rbd_dev->major = ret;
+		rbd_dev->minor = 0;
+	} else {
+		rbd_dev->major = rbd_major;
+		rbd_dev->minor = rbd_dev_id_to_minor(rbd_dev->dev_id);
+	}
+
+	/* Set up the blkdev mapping. */
+
+	ret = rbd_init_disk(rbd_dev);
+	if (ret)
+		goto err_out_blkdev;
+
+	set_capacity(rbd_dev->disk, rbd_dev->mapping.size / SECTOR_SIZE);
+	set_disk_ro(rbd_dev->disk, rbd_is_ro(rbd_dev));
+
+	ret = dev_set_name(&rbd_dev->dev, "%d", rbd_dev->dev_id);
+	if (ret)
+		goto err_out_disk;
+
+	set_bit(RBD_DEV_FLAG_EXISTS, &rbd_dev->flags);
+	up_write(&rbd_dev->header_rwsem);
+	return 0;
+
+err_out_disk:
+	rbd_free_disk(rbd_dev);
+err_out_blkdev:
+	if (!single_major)
+		unregister_blkdev(rbd_dev->major, rbd_dev->name);
+err_out_unlock:
+	up_write(&rbd_dev->header_rwsem);
+	return ret;
+}
+
+static int rbd_dev_header_name(struct rbd_device *rbd_dev)
+{
+	struct rbd_spec *spec = rbd_dev->spec;
+	int ret;
+
+	/* Record the header object name for this rbd image. */
+
+	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
+	if (rbd_dev->image_format == 1)
+		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
+				       spec->image_name, RBD_SUFFIX);
+	else
+		ret = ceph_oid_aprintf(&rbd_dev->header_oid, GFP_KERNEL, "%s%s",
+				       RBD_HEADER_PREFIX, spec->image_id);
+
+	return ret;
+}
+
+static void rbd_print_dne(struct rbd_device *rbd_dev, bool is_snap)
+{
+	if (!is_snap) {
+		pr_info("image %s/%s%s%s does not exist\n",
+			rbd_dev->spec->pool_name,
+			rbd_dev->spec->pool_ns ?: "",
+			rbd_dev->spec->pool_ns ? "/" : "",
+			rbd_dev->spec->image_name);
+	} else {
+		pr_info("snap %s/%s%s%s@%s does not exist\n",
+			rbd_dev->spec->pool_name,
+			rbd_dev->spec->pool_ns ?: "",
+			rbd_dev->spec->pool_ns ? "/" : "",
+			rbd_dev->spec->image_name,
+			rbd_dev->spec->snap_name);
+	}
+}
+
+static void rbd_dev_image_release(struct rbd_device *rbd_dev)
+{
+	if (!rbd_is_ro(rbd_dev))
+		rbd_unregister_watch(rbd_dev);
+
+	rbd_dev_unprobe(rbd_dev);
+	rbd_dev->image_format = 0;
+	kfree(rbd_dev->spec->image_id);
+	rbd_dev->spec->image_id = NULL;
+}
+
+/*
+ * Probe for the existence of the header object for the given rbd
+ * device.  If this image is the one being mapped (i.e., not a
+ * parent), initiate a watch on its header object before using that
+ * object to get detailed information about the rbd image.
+ *
+ * On success, returns with header_rwsem held for write if called
+ * with @depth == 0.
+ */
+static int rbd_dev_image_probe(struct rbd_device *rbd_dev, int depth)
+{
+	bool need_watch = !rbd_is_ro(rbd_dev);
+	int ret;
+
+	/*
+	 * Get the id from the image id object.  Unless there's an
+	 * error, rbd_dev->spec->image_id will be filled in with
+	 * a dynamically-allocated string, and rbd_dev->image_format
+	 * will be set to either 1 or 2.
+	 */
+	ret = rbd_dev_image_id(rbd_dev);
+	if (ret)
+		return ret;
+
+	ret = rbd_dev_header_name(rbd_dev);
+	if (ret)
+		goto err_out_format;
+
+	if (need_watch) {
+		ret = rbd_register_watch(rbd_dev);
+		if (ret) {
+			if (ret == -ENOENT)
+				rbd_print_dne(rbd_dev, false);
+			goto err_out_format;
+		}
+	}
+
+	if (!depth)
+		down_write(&rbd_dev->header_rwsem);
+
+	ret = rbd_dev_header_info(rbd_dev, &rbd_dev->header, true);
+	if (ret) {
+		if (ret == -ENOENT && !need_watch)
+			rbd_print_dne(rbd_dev, false);
+		goto err_out_probe;
+	}
+
+	rbd_init_layout(rbd_dev);
+
+	/*
+	 * If this image is the one being mapped, we have pool name and
+	 * id, image name and id, and snap name - need to fill snap id.
+	 * Otherwise this is a parent image, identified by pool, image
+	 * and snap ids - need to fill in names for those ids.
+	 */
+	if (!depth)
+		ret = rbd_spec_fill_snap_id(rbd_dev);
+	else
+		ret = rbd_spec_fill_names(rbd_dev);
+	if (ret) {
+		if (ret == -ENOENT)
+			rbd_print_dne(rbd_dev, true);
+		goto err_out_probe;
+	}
+
+	ret = rbd_dev_mapping_set(rbd_dev);
+	if (ret)
+		goto err_out_probe;
+
+	if (rbd_is_snap(rbd_dev) &&
+	    (rbd_dev->header.features & RBD_FEATURE_OBJECT_MAP)) {
+		ret = rbd_object_map_load(rbd_dev);
+		if (ret)
+			goto err_out_probe;
+	}
+
+	if (rbd_dev->header.features & RBD_FEATURE_LAYERING) {
+		ret = rbd_dev_setup_parent(rbd_dev);
+		if (ret)
+			goto err_out_probe;
+	}
+
+	ret = rbd_dev_probe_parent(rbd_dev, depth);
+	if (ret)
+		goto err_out_probe;
+
+	dout("discovered format %u image, header name is %s\n",
+		rbd_dev->image_format, rbd_dev->header_oid.name);
+	return 0;
+
+err_out_probe:
+	if (!depth)
+		up_write(&rbd_dev->header_rwsem);
+	if (need_watch)
+		rbd_unregister_watch(rbd_dev);
+	rbd_dev_unprobe(rbd_dev);
+err_out_format:
+	rbd_dev->image_format = 0;
+	kfree(rbd_dev->spec->image_id);
+	rbd_dev->spec->image_id = NULL;
+	return ret;
+}
+
+static void rbd_dev_update_header(struct rbd_device *rbd_dev,
+				  struct rbd_image_header *header)
+{
+	rbd_assert(rbd_image_format_valid(rbd_dev->image_format));
+	rbd_assert(rbd_dev->header.object_prefix); /* !first_time */
+
+	if (rbd_dev->header.image_size != header->image_size) {
+		rbd_dev->header.image_size = header->image_size;
+
+		if (!rbd_is_snap(rbd_dev)) {
+			rbd_dev->mapping.size = header->image_size;
+			rbd_dev_update_size(rbd_dev);
+		}
+	}
+
+	ceph_put_snap_context(rbd_dev->header.snapc);
+	rbd_dev->header.snapc = header->snapc;
+	header->snapc = NULL;
+
+	if (rbd_dev->image_format == 1) {
+		kfree(rbd_dev->header.snap_names);
+		rbd_dev->header.snap_names = header->snap_names;
+		header->snap_names = NULL;
+
+		kfree(rbd_dev->header.snap_sizes);
+		rbd_dev->header.snap_sizes = header->snap_sizes;
+		header->snap_sizes = NULL;
+	}
+}
+
+static void rbd_dev_update_parent(struct rbd_device *rbd_dev,
+				  struct parent_image_info *pii)
+{
+	if (pii->pool_id == CEPH_NOPOOL || !pii->has_overlap) {
+		/*
+		 * Either the parent never existed, or we have
+		 * record of it but the image got flattened so it no
+		 * longer has a parent.  When the parent of a
+		 * layered image disappears we immediately set the
+		 * overlap to 0.  The effect of this is that all new
+		 * requests will be treated as if the image had no
+		 * parent.
+		 *
+		 * If !pii.has_overlap, the parent image spec is not
+		 * applicable.  It's there to avoid duplication in each
+		 * snapshot record.
+		 */
+		if (rbd_dev->parent_overlap) {
+			rbd_dev->parent_overlap = 0;
+			rbd_dev_parent_put(rbd_dev);
+			pr_info("%s: clone has been flattened\n",
+				rbd_dev->disk->disk_name);
+		}
+	} else {
+		rbd_assert(rbd_dev->parent_spec);
+
+		/*
+		 * Update the parent overlap.  If it became zero, issue
+		 * a warning as we will proceed as if there is no parent.
+		 */
+		if (!pii->overlap && rbd_dev->parent_overlap)
+			rbd_warn(rbd_dev,
+				 "clone has become standalone (overlap 0)");
+		rbd_dev->parent_overlap = pii->overlap;
+	}
+}
+
+static int rbd_dev_refresh(struct rbd_device *rbd_dev)
+{
+	struct rbd_image_header	header = { 0 };
+	struct parent_image_info pii = { 0 };
+	int ret;
+
+	dout("%s rbd_dev %p\n", __func__, rbd_dev);
+
+	ret = rbd_dev_header_info(rbd_dev, &header, false);
+	if (ret)
+		goto out;
+
+	/*
+	 * If there is a parent, see if it has disappeared due to the
+	 * mapped image getting flattened.
+	 */
+	if (rbd_dev->parent) {
+		ret = rbd_dev_v2_parent_info(rbd_dev, &pii);
+		if (ret)
+			goto out;
+	}
+
+	down_write(&rbd_dev->header_rwsem);
+	rbd_dev_update_header(rbd_dev, &header);
+	if (rbd_dev->parent)
+		rbd_dev_update_parent(rbd_dev, &pii);
+	up_write(&rbd_dev->header_rwsem);
+
+out:
+	rbd_parent_info_cleanup(&pii);
+	rbd_image_header_cleanup(&header);
+	return ret;
+}
+
+static ssize_t do_rbd_add(struct bus_type *bus,
+			  const char *buf,
+			  size_t count)
+{
+	struct rbd_device *rbd_dev = NULL;
+	struct ceph_options *ceph_opts = NULL;
+	struct rbd_options *rbd_opts = NULL;
+	struct rbd_spec *spec = NULL;
+	struct rbd_client *rbdc;
+	int rc;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (!try_module_get(THIS_MODULE))
+		return -ENODEV;
+
+	/* parse add command */
+	rc = rbd_add_parse_args(buf, &ceph_opts, &rbd_opts, &spec);
+	if (rc < 0)
+		goto out;
+
+	rbdc = rbd_get_client(ceph_opts);
+	if (IS_ERR(rbdc)) {
+		rc = PTR_ERR(rbdc);
+		goto err_out_args;
+	}
+
+	/* pick the pool */
+	rc = ceph_pg_poolid_by_name(rbdc->client->osdc.osdmap, spec->pool_name);
+	if (rc < 0) {
+		if (rc == -ENOENT)
+			pr_info("pool %s does not exist\n", spec->pool_name);
+		goto err_out_client;
+	}
+	spec->pool_id = (u64)rc;
+
+	rbd_dev = rbd_dev_create(rbdc, spec, rbd_opts);
+	if (!rbd_dev) {
+		rc = -ENOMEM;
+		goto err_out_client;
+	}
+	rbdc = NULL;		/* rbd_dev now owns this */
+	spec = NULL;		/* rbd_dev now owns this */
+	rbd_opts = NULL;	/* rbd_dev now owns this */
+
+	/* if we are mapping a snapshot it will be a read-only mapping */
+	if (rbd_dev->opts->read_only ||
+	    strcmp(rbd_dev->spec->snap_name, RBD_SNAP_HEAD_NAME))
+		__set_bit(RBD_DEV_FLAG_READONLY, &rbd_dev->flags);
+
+	rbd_dev->config_info = kstrdup(buf, GFP_KERNEL);
+	if (!rbd_dev->config_info) {
+		rc = -ENOMEM;
+		goto err_out_rbd_dev;
+	}
+
+	rc = rbd_dev_image_probe(rbd_dev, 0);
+	if (rc < 0)
+		goto err_out_rbd_dev;
+
+	if (rbd_dev->opts->alloc_size > rbd_dev->layout.object_size) {
+		rbd_warn(rbd_dev, "alloc_size adjusted to %u",
+			 rbd_dev->layout.object_size);
+		rbd_dev->opts->alloc_size = rbd_dev->layout.object_size;
+	}
+
+	rc = rbd_dev_device_setup(rbd_dev);
+	if (rc)
+		goto err_out_image_probe;
+
+	rc = rbd_add_acquire_lock(rbd_dev);
+	if (rc)
+		goto err_out_image_lock;
+
+	/* Everything's ready.  Announce the disk to the world. */
+
+	rc = device_add(&rbd_dev->dev);
+	if (rc)
+		goto err_out_image_lock;
+
+	rc = device_add_disk(&rbd_dev->dev, rbd_dev->disk, NULL);
+	if (rc)
+		goto err_out_cleanup_disk;
+
+	spin_lock(&rbd_dev_list_lock);
+	list_add_tail(&rbd_dev->node, &rbd_dev_list);
+	spin_unlock(&rbd_dev_list_lock);
+
+	pr_info("%s: capacity %llu features 0x%llx\n", rbd_dev->disk->disk_name,
+		(unsigned long long)get_capacity(rbd_dev->disk) << SECTOR_SHIFT,
+		rbd_dev->header.features);
+	rc = count;
+out:
+	module_put(THIS_MODULE);
+	return rc;
+
+err_out_cleanup_disk:
+	rbd_free_disk(rbd_dev);
+err_out_image_lock:
+	rbd_dev_image_unlock(rbd_dev);
+	rbd_dev_device_release(rbd_dev);
+err_out_image_probe:
+	rbd_dev_image_release(rbd_dev);
+err_out_rbd_dev:
+	rbd_dev_destroy(rbd_dev);
+err_out_client:
+	rbd_put_client(rbdc);
+err_out_args:
+	rbd_spec_put(spec);
+	kfree(rbd_opts);
+	goto out;
+}
+
+static ssize_t add_store(struct bus_type *bus, const char *buf, size_t count)
+{
+	if (single_major)
+		return -EINVAL;
+
+	return do_rbd_add(bus, buf, count);
+}
+
+static ssize_t add_single_major_store(struct bus_type *bus, const char *buf,
+				      size_t count)
+{
+	return do_rbd_add(bus, buf, count);
+}
+
+static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
+{
+	while (rbd_dev->parent) {
+		struct rbd_device *first = rbd_dev;
+		struct rbd_device *second = first->parent;
+		struct rbd_device *third;
+
+		/*
+		 * Follow to the parent with no grandparent and
+		 * remove it.
+		 */
+		while (second && (third = second->parent)) {
+			first = second;
+			second = third;
+		}
+		rbd_assert(second);
+		rbd_dev_image_release(second);
+		rbd_dev_destroy(second);
+		first->parent = NULL;
+		first->parent_overlap = 0;
+
+		rbd_assert(first->parent_spec);
+		rbd_spec_put(first->parent_spec);
+		first->parent_spec = NULL;
+	}
+}
+
+static ssize_t do_rbd_remove(struct bus_type *bus,
+			     const char *buf,
+			     size_t count)
+{
+	struct rbd_device *rbd_dev = NULL;
+	struct list_head *tmp;
+	int dev_id;
+	char opt_buf[6];
+	bool force = false;
+	int ret;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	dev_id = -1;
+	opt_buf[0] = '\0';
+	sscanf(buf, "%d %5s", &dev_id, opt_buf);
+	if (dev_id < 0) {
+		pr_err("dev_id out of range\n");
+		return -EINVAL;
+	}
+	if (opt_buf[0] != '\0') {
+		if (!strcmp(opt_buf, "force")) {
+			force = true;
+		} else {
+			pr_err("bad remove option at '%s'\n", opt_buf);
+			return -EINVAL;
+		}
+	}
+
+	ret = -ENOENT;
+	spin_lock(&rbd_dev_list_lock);
+	list_for_each(tmp, &rbd_dev_list) {
+		rbd_dev = list_entry(tmp, struct rbd_device, node);
+		if (rbd_dev->dev_id == dev_id) {
+			ret = 0;
+			break;
+		}
+	}
+	if (!ret) {
+		spin_lock_irq(&rbd_dev->lock);
+		if (rbd_dev->open_count && !force)
+			ret = -EBUSY;
+		else if (test_and_set_bit(RBD_DEV_FLAG_REMOVING,
+					  &rbd_dev->flags))
+			ret = -EINPROGRESS;
+		spin_unlock_irq(&rbd_dev->lock);
+	}
+	spin_unlock(&rbd_dev_list_lock);
+	if (ret)
+		return ret;
+
+	if (force) {
+		/*
+		 * Prevent new IO from being queued and wait for existing
+		 * IO to complete/fail.
+		 */
+		blk_mq_freeze_queue(rbd_dev->disk->queue);
+		blk_mark_disk_dead(rbd_dev->disk);
+	}
+
+	del_gendisk(rbd_dev->disk);
+	spin_lock(&rbd_dev_list_lock);
+	list_del_init(&rbd_dev->node);
+	spin_unlock(&rbd_dev_list_lock);
+	device_del(&rbd_dev->dev);
+
+	rbd_dev_image_unlock(rbd_dev);
+	rbd_dev_device_release(rbd_dev);
+	rbd_dev_image_release(rbd_dev);
+	rbd_dev_destroy(rbd_dev);
+	return count;
+}
+
+static ssize_t remove_store(struct bus_type *bus, const char *buf, size_t count)
+{
+	if (single_major)
+		return -EINVAL;
+
+	return do_rbd_remove(bus, buf, count);
+}
+
+static ssize_t remove_single_major_store(struct bus_type *bus, const char *buf,
+					 size_t count)
+{
+	return do_rbd_remove(bus, buf, count);
+}
+
+/*
+ * create control files in sysfs
+ * /sys/bus/rbd/...
+ */
+static int __init rbd_sysfs_init(void)
+{
+	int ret;
+
+	ret = device_register(&rbd_root_dev);
+	if (ret < 0) {
+		put_device(&rbd_root_dev);
+		return ret;
+	}
+
+	ret = bus_register(&rbd_bus_type);
+	if (ret < 0)
+		device_unregister(&rbd_root_dev);
+
+	return ret;
+}
+
+static void __exit rbd_sysfs_cleanup(void)
+{
+	bus_unregister(&rbd_bus_type);
+	device_unregister(&rbd_root_dev);
+}
+
+static int __init rbd_slab_init(void)
+{
+	rbd_assert(!rbd_img_request_cache);
+	rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0);
+	if (!rbd_img_request_cache)
+		return -ENOMEM;
+
+	rbd_assert(!rbd_obj_request_cache);
+	rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0);
+	if (!rbd_obj_request_cache)
+		goto out_err;
+
+	return 0;
+
+out_err:
+	kmem_cache_destroy(rbd_img_request_cache);
+	rbd_img_request_cache = NULL;
+	return -ENOMEM;
+}
+
+static void rbd_slab_exit(void)
+{
+	rbd_assert(rbd_obj_request_cache);
+	kmem_cache_destroy(rbd_obj_request_cache);
+	rbd_obj_request_cache = NULL;
+
+	rbd_assert(rbd_img_request_cache);
+	kmem_cache_destroy(rbd_img_request_cache);
+	rbd_img_request_cache = NULL;
+}
+
+static int __init rbd_init(void)
+{
+	int rc;
+
+	if (!libceph_compatible(NULL)) {
+		rbd_warn(NULL, "libceph incompatibility (quitting)");
+		return -EINVAL;
+	}
+
+	rc = rbd_slab_init();
+	if (rc)
+		return rc;
+
+	/*
+	 * The number of active work items is limited by the number of
+	 * rbd devices * queue depth, so leave @max_active at default.
+	 */
+	rbd_wq = alloc_workqueue(RBD_DRV_NAME, WQ_MEM_RECLAIM, 0);
+	if (!rbd_wq) {
+		rc = -ENOMEM;
+		goto err_out_slab;
+	}
+
+	if (single_major) {
+		rbd_major = register_blkdev(0, RBD_DRV_NAME);
+		if (rbd_major < 0) {
+			rc = rbd_major;
+			goto err_out_wq;
+		}
+	}
+
+	rc = rbd_sysfs_init();
+	if (rc)
+		goto err_out_blkdev;
+
+	if (single_major)
+		pr_info("loaded (major %d)\n", rbd_major);
+	else
+		pr_info("loaded\n");
+
+	return 0;
+
+err_out_blkdev:
+	if (single_major)
+		unregister_blkdev(rbd_major, RBD_DRV_NAME);
+err_out_wq:
+	destroy_workqueue(rbd_wq);
+err_out_slab:
+	rbd_slab_exit();
+	return rc;
+}
+
+static void __exit rbd_exit(void)
+{
+	ida_destroy(&rbd_dev_id_ida);
+	rbd_sysfs_cleanup();
+	if (single_major)
+		unregister_blkdev(rbd_major, RBD_DRV_NAME);
+	destroy_workqueue(rbd_wq);
+	rbd_slab_exit();
+}
+
+module_init(rbd_init);
+module_exit(rbd_exit);
+
+MODULE_AUTHOR("Alex Elder <elder@inktank.com>");
+MODULE_AUTHOR("Sage Weil <sage@newdream.net>");
+MODULE_AUTHOR("Yehuda Sadeh <yehuda@hq.newdream.net>");
+/* following authorship retained from original osdblk.c */
+MODULE_AUTHOR("Jeff Garzik <jeff@garzik.org>");
+
+MODULE_DESCRIPTION("RADOS Block Device (RBD) driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/block/rbd_types.h b/drivers/block/rbd_types.h
new file mode 100644
index 000000000..a600e0eb6
--- /dev/null
+++ b/drivers/block/rbd_types.h
@@ -0,0 +1,100 @@
+/*
+ * Ceph - scalable distributed file system
+ *
+ * Copyright (C) 2004-2010 Sage Weil <sage@newdream.net>
+ *
+ * This is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License version 2.1, as published by the Free Software
+ * Foundation.  See file COPYING.
+ *
+ */
+
+#ifndef CEPH_RBD_TYPES_H
+#define CEPH_RBD_TYPES_H
+
+#include <linux/types.h>
+
+/* For format version 2, rbd image 'foo' consists of objects
+ *   rbd_id.foo		- id of image
+ *   rbd_header.<id>	- image metadata
+ *   rbd_object_map.<id> - optional image object map
+ *   rbd_data.<id>.0000000000000000
+ *   rbd_data.<id>.0000000000000001
+ *   ...		- data
+ * Clients do not access header data directly in rbd format 2.
+ */
+
+#define RBD_HEADER_PREFIX      "rbd_header."
+#define RBD_OBJECT_MAP_PREFIX  "rbd_object_map."
+#define RBD_ID_PREFIX          "rbd_id."
+#define RBD_V2_DATA_FORMAT     "%s.%016llx"
+
+#define RBD_LOCK_NAME          "rbd_lock"
+#define RBD_LOCK_TAG           "internal"
+#define RBD_LOCK_COOKIE_PREFIX "auto"
+
+enum rbd_notify_op {
+	RBD_NOTIFY_OP_ACQUIRED_LOCK      = 0,
+	RBD_NOTIFY_OP_RELEASED_LOCK      = 1,
+	RBD_NOTIFY_OP_REQUEST_LOCK       = 2,
+	RBD_NOTIFY_OP_HEADER_UPDATE      = 3,
+};
+
+#define OBJECT_NONEXISTENT	0
+#define OBJECT_EXISTS		1
+#define OBJECT_PENDING		2
+#define OBJECT_EXISTS_CLEAN	3
+
+#define RBD_FLAG_OBJECT_MAP_INVALID	(1ULL << 0)
+#define RBD_FLAG_FAST_DIFF_INVALID	(1ULL << 1)
+
+/*
+ * For format version 1, rbd image 'foo' consists of objects
+ *   foo.rbd		- image metadata
+ *   rb.<idhi>.<idlo>.<extra>.000000000000
+ *   rb.<idhi>.<idlo>.<extra>.000000000001
+ *   ...		- data
+ * There is no notion of a persistent image id in rbd format 1.
+ */
+
+#define RBD_SUFFIX		".rbd"
+#define RBD_V1_DATA_FORMAT	"%s.%012llx"
+
+#define RBD_DIRECTORY           "rbd_directory"
+#define RBD_INFO                "rbd_info"
+
+#define RBD_DEFAULT_OBJ_ORDER	22   /* 4MB */
+#define RBD_MIN_OBJ_ORDER       16
+#define RBD_MAX_OBJ_ORDER       30
+
+#define RBD_HEADER_TEXT		"<<< Rados Block Device Image >>>\n"
+#define RBD_HEADER_SIGNATURE	"RBD"
+#define RBD_HEADER_VERSION	"001.005"
+
+struct rbd_image_snap_ondisk {
+	__le64 id;
+	__le64 image_size;
+} __attribute__((packed));
+
+struct rbd_image_header_ondisk {
+	char text[40];
+	char object_prefix[24];
+	char signature[4];
+	char version[8];
+	struct {
+		__u8 order;
+		__u8 crypt_type;
+		__u8 comp_type;
+		__u8 unused;
+	} __attribute__((packed)) options;
+	__le64 image_size;
+	__le64 snap_seq;
+	__le32 snap_count;
+	__le32 reserved;
+	__le64 snap_names_len;
+	struct rbd_image_snap_ondisk snaps[];
+} __attribute__((packed));
+
+
+#endif
diff --git a/drivers/block/rnbd/Kconfig b/drivers/block/rnbd/Kconfig
new file mode 100644
index 000000000..2ff05a0d2
--- /dev/null
+++ b/drivers/block/rnbd/Kconfig
@@ -0,0 +1,29 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+config BLK_DEV_RNBD
+	bool
+
+config BLK_DEV_RNBD_CLIENT
+	tristate "RDMA Network Block Device driver client"
+	depends on INFINIBAND_RTRS_CLIENT
+	select BLK_DEV_RNBD
+	select SG_POOL
+	help
+	  RNBD client is a network block device driver using rdma transport.
+
+	  RNBD client allows for mapping of a remote block devices over
+	  RTRS protocol from a target system where RNBD server is running.
+
+	  If unsure, say N.
+
+config BLK_DEV_RNBD_SERVER
+	tristate "RDMA Network Block Device driver server"
+	depends on INFINIBAND_RTRS_SERVER
+	select BLK_DEV_RNBD
+	help
+	  RNBD server is the server side of RNBD using rdma transport.
+
+	  RNBD server allows for exporting local block devices to a remote client
+	  over RTRS protocol.
+
+	  If unsure, say N.
diff --git a/drivers/block/rnbd/Makefile b/drivers/block/rnbd/Makefile
new file mode 100644
index 000000000..40b316308
--- /dev/null
+++ b/drivers/block/rnbd/Makefile
@@ -0,0 +1,17 @@
+# SPDX-License-Identifier: GPL-2.0-or-later
+
+ccflags-y := -I$(srctree)/drivers/infiniband/ulp/rtrs
+
+rnbd-client-y := rnbd-clt.o \
+		  rnbd-clt-sysfs.o \
+		  rnbd-common.o
+
+CFLAGS_rnbd-srv-trace.o = -I$(src)
+
+rnbd-server-y := rnbd-common.o \
+		  rnbd-srv.o \
+		  rnbd-srv-sysfs.o \
+		  rnbd-srv-trace.o
+
+obj-$(CONFIG_BLK_DEV_RNBD_CLIENT) += rnbd-client.o
+obj-$(CONFIG_BLK_DEV_RNBD_SERVER) += rnbd-server.o
diff --git a/drivers/block/rnbd/README b/drivers/block/rnbd/README
new file mode 100644
index 000000000..080f58a54
--- /dev/null
+++ b/drivers/block/rnbd/README
@@ -0,0 +1,93 @@
+********************************
+RDMA Network Block Device (RNBD)
+********************************
+
+Introduction
+------------
+
+RNBD (RDMA Network Block Device) is a pair of kernel modules
+(client and server) that allow for remote access of a block device on
+the server over RTRS protocol using the RDMA (InfiniBand, RoCE, iWARP)
+transport. After being mapped, the remote block devices can be accessed
+on the client side as local block devices.
+
+I/O is transferred between client and server by the RTRS transport
+modules. The administration of RNBD and RTRS modules is done via
+sysfs entries.
+
+Requirements
+------------
+
+  RTRS kernel modules
+
+Quick Start
+-----------
+
+Server side:
+  # modprobe rnbd_server
+
+Client side:
+  # modprobe rnbd_client
+  # echo "sessname=blya path=ip:10.50.100.66 device_path=/dev/ram0" > \
+            /sys/devices/virtual/rnbd-client/ctl/map_device
+
+  Where "sessname=" is a session name, a string to identify the session
+  on client and on server sides; "path=" is a destination IP address or
+  a pair of a source and a destination IPs, separated by comma.  Multiple
+  "path=" options can be specified in order to use multipath  (see RTRS
+  description for details); "device_path=" is the block device to be
+  mapped from the server side. After the session to the server machine is
+  established, the mapped device will appear on the client side under
+  /dev/rnbd<N>.
+
+
+RNBD-Server Module Parameters
+=============================
+
+dev_search_path
+---------------
+
+When a device is mapped from the client, the server generates the path
+to the block device on the server side by concatenating dev_search_path
+and the "device_path" that was specified in the map_device operation.
+
+The default dev_search_path is: "/".
+
+dev_search_path option can also contain %SESSNAME% in order to provide
+different device namespaces for different sessions.  See "device_path"
+option for details.
+
+============================
+Protocol (rnbd/rnbd-proto.h)
+============================
+
+1. Before mapping first device from a given server, client sends an
+RNBD_MSG_SESS_INFO to the server. Server responds with
+RNBD_MSG_SESS_INFO_RSP. Currently the messages only contain the protocol
+version for backward compatibility.
+
+2. Client requests to open a device by sending RNBD_MSG_OPEN message. This
+contains the path to the device and access mode (read-only or writable).
+Server responds to the message with RNBD_MSG_OPEN_RSP. This contains
+a 32 bit device id to be used for  IOs and device "geometry" related
+information: side, max_hw_sectors, etc.
+
+3. Client attaches RNBD_MSG_IO to each IO message send to a device. This
+message contains device id, provided by server in his rnbd_msg_open_rsp,
+sector to be accessed, read-write flags and bi_size.
+
+4. Client closes a device by sending RNBD_MSG_CLOSE which contains only the
+device id provided by the server.
+
+=========================================
+Contributors List(in alphabetical order)
+=========================================
+Danil Kipnis <danil.kipnis@profitbricks.com>
+Fabian Holler <mail@fholler.de>
+Guoqing Jiang <guoqing.jiang@cloud.ionos.com>
+Jack Wang <jinpu.wang@profitbricks.com>
+Kleber Souza <kleber.souza@profitbricks.com>
+Lutz Pogrell <lutz.pogrell@cloud.ionos.com>
+Milind Dumbare <Milind.dumbare@gmail.com>
+Roman Penyaev <roman.penyaev@profitbricks.com>
+Swapnil Ingle <ingleswapnil@gmail.com>
diff --git a/drivers/block/rnbd/rnbd-clt-sysfs.c b/drivers/block/rnbd/rnbd-clt-sysfs.c
new file mode 100644
index 000000000..e7c7d9a68
--- /dev/null
+++ b/drivers/block/rnbd/rnbd-clt-sysfs.c
@@ -0,0 +1,683 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * RDMA Network Block Driver
+ *
+ * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved.
+ * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved.
+ * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved.
+ */
+
+#undef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt
+
+#include <linux/types.h>
+#include <linux/ctype.h>
+#include <linux/parser.h>
+#include <linux/module.h>
+#include <linux/in6.h>
+#include <linux/fs.h>
+#include <linux/uaccess.h>
+#include <linux/device.h>
+#include <rdma/ib.h>
+#include <rdma/rdma_cm.h>
+
+#include "rnbd-clt.h"
+
+static struct device *rnbd_dev;
+static struct class *rnbd_dev_class;
+static struct kobject *rnbd_devs_kobj;
+
+enum {
+	RNBD_OPT_ERR		= 0,
+	RNBD_OPT_DEST_PORT	= 1 << 0,
+	RNBD_OPT_PATH		= 1 << 1,
+	RNBD_OPT_DEV_PATH	= 1 << 2,
+	RNBD_OPT_ACCESS_MODE	= 1 << 3,
+	RNBD_OPT_SESSNAME	= 1 << 6,
+	RNBD_OPT_NR_POLL_QUEUES	= 1 << 7,
+};
+
+static const unsigned int rnbd_opt_mandatory[] = {
+	RNBD_OPT_DEV_PATH,
+	RNBD_OPT_SESSNAME,
+};
+
+static const match_table_t rnbd_opt_tokens = {
+	{RNBD_OPT_PATH,			"path=%s"		},
+	{RNBD_OPT_DEV_PATH,		"device_path=%s"	},
+	{RNBD_OPT_DEST_PORT,		"dest_port=%d"		},
+	{RNBD_OPT_ACCESS_MODE,		"access_mode=%s"	},
+	{RNBD_OPT_SESSNAME,		"sessname=%s"		},
+	{RNBD_OPT_NR_POLL_QUEUES,	"nr_poll_queues=%d"	},
+	{RNBD_OPT_ERR,			NULL			},
+};
+
+struct rnbd_map_options {
+	char *sessname;
+	struct rtrs_addr *paths;
+	size_t *path_cnt;
+	char *pathname;
+	u16 *dest_port;
+	enum rnbd_access_mode *access_mode;
+	u32 *nr_poll_queues;
+};
+
+static int rnbd_clt_parse_map_options(const char *buf, size_t max_path_cnt,
+				       struct rnbd_map_options *opt)
+{
+	char *options, *sep_opt;
+	char *p;
+	substring_t args[MAX_OPT_ARGS];
+	int opt_mask = 0;
+	int token;
+	int ret = -EINVAL;
+	int nr_poll_queues = 0;
+	int dest_port = 0;
+	int p_cnt = 0;
+	int i;
+
+	options = kstrdup(buf, GFP_KERNEL);
+	if (!options)
+		return -ENOMEM;
+
+	sep_opt = strstrip(options);
+	while ((p = strsep(&sep_opt, " ")) != NULL) {
+		if (!*p)
+			continue;
+
+		token = match_token(p, rnbd_opt_tokens, args);
+		opt_mask |= token;
+
+		switch (token) {
+		case RNBD_OPT_SESSNAME:
+			p = match_strdup(args);
+			if (!p) {
+				ret = -ENOMEM;
+				goto out;
+			}
+			if (strlen(p) > NAME_MAX) {
+				pr_err("map_device: sessname too long\n");
+				ret = -EINVAL;
+				kfree(p);
+				goto out;
+			}
+			strscpy(opt->sessname, p, NAME_MAX);
+			kfree(p);
+			break;
+
+		case RNBD_OPT_PATH:
+			if (p_cnt >= max_path_cnt) {
+				pr_err("map_device: too many (> %zu) paths provided\n",
+				       max_path_cnt);
+				ret = -ENOMEM;
+				goto out;
+			}
+			p = match_strdup(args);
+			if (!p) {
+				ret = -ENOMEM;
+				goto out;
+			}
+
+			ret = rtrs_addr_to_sockaddr(p, strlen(p),
+						    *opt->dest_port,
+						    &opt->paths[p_cnt]);
+			if (ret) {
+				pr_err("Can't parse path %s: %d\n", p, ret);
+				kfree(p);
+				goto out;
+			}
+
+			p_cnt++;
+
+			kfree(p);
+			break;
+
+		case RNBD_OPT_DEV_PATH:
+			p = match_strdup(args);
+			if (!p) {
+				ret = -ENOMEM;
+				goto out;
+			}
+			if (strlen(p) > NAME_MAX) {
+				pr_err("map_device: Device path too long\n");
+				ret = -EINVAL;
+				kfree(p);
+				goto out;
+			}
+			strscpy(opt->pathname, p, NAME_MAX);
+			kfree(p);
+			break;
+
+		case RNBD_OPT_DEST_PORT:
+			if (match_int(args, &dest_port) || dest_port < 0 ||
+			    dest_port > 65535) {
+				pr_err("bad destination port number parameter '%d'\n",
+				       dest_port);
+				ret = -EINVAL;
+				goto out;
+			}
+			*opt->dest_port = dest_port;
+			break;
+
+		case RNBD_OPT_ACCESS_MODE:
+			p = match_strdup(args);
+			if (!p) {
+				ret = -ENOMEM;
+				goto out;
+			}
+
+			if (!strcmp(p, "ro")) {
+				*opt->access_mode = RNBD_ACCESS_RO;
+			} else if (!strcmp(p, "rw")) {
+				*opt->access_mode = RNBD_ACCESS_RW;
+			} else if (!strcmp(p, "migration")) {
+				*opt->access_mode = RNBD_ACCESS_MIGRATION;
+			} else {
+				pr_err("map_device: Invalid access_mode: '%s'\n",
+				       p);
+				ret = -EINVAL;
+				kfree(p);
+				goto out;
+			}
+
+			kfree(p);
+			break;
+
+		case RNBD_OPT_NR_POLL_QUEUES:
+			if (match_int(args, &nr_poll_queues) || nr_poll_queues < -1 ||
+			    nr_poll_queues > (int)nr_cpu_ids) {
+				pr_err("bad nr_poll_queues parameter '%d'\n",
+				       nr_poll_queues);
+				ret = -EINVAL;
+				goto out;
+			}
+			if (nr_poll_queues == -1)
+				nr_poll_queues = nr_cpu_ids;
+			*opt->nr_poll_queues = nr_poll_queues;
+			break;
+
+		default:
+			pr_err("map_device: Unknown parameter or missing value '%s'\n",
+			       p);
+			ret = -EINVAL;
+			goto out;
+		}
+	}
+
+	for (i = 0; i < ARRAY_SIZE(rnbd_opt_mandatory); i++) {
+		if ((opt_mask & rnbd_opt_mandatory[i])) {
+			ret = 0;
+		} else {
+			pr_err("map_device: Parameters missing\n");
+			ret = -EINVAL;
+			break;
+		}
+	}
+
+out:
+	*opt->path_cnt = p_cnt;
+	kfree(options);
+	return ret;
+}
+
+static ssize_t state_show(struct kobject *kobj,
+			  struct kobj_attribute *attr, char *page)
+{
+	struct rnbd_clt_dev *dev;
+
+	dev = container_of(kobj, struct rnbd_clt_dev, kobj);
+
+	switch (dev->dev_state) {
+	case DEV_STATE_INIT:
+		return sysfs_emit(page, "init\n");
+	case DEV_STATE_MAPPED:
+		/* TODO fix cli tool before changing to proper state */
+		return sysfs_emit(page, "open\n");
+	case DEV_STATE_MAPPED_DISCONNECTED:
+		/* TODO fix cli tool before changing to proper state */
+		return sysfs_emit(page, "closed\n");
+	case DEV_STATE_UNMAPPED:
+		return sysfs_emit(page, "unmapped\n");
+	default:
+		return sysfs_emit(page, "unknown\n");
+	}
+}
+
+static struct kobj_attribute rnbd_clt_state_attr = __ATTR_RO(state);
+
+static ssize_t nr_poll_queues_show(struct kobject *kobj,
+				   struct kobj_attribute *attr, char *page)
+{
+	struct rnbd_clt_dev *dev;
+
+	dev = container_of(kobj, struct rnbd_clt_dev, kobj);
+
+	return sysfs_emit(page, "%d\n", dev->nr_poll_queues);
+}
+
+static struct kobj_attribute rnbd_clt_nr_poll_queues =
+	__ATTR_RO(nr_poll_queues);
+
+static ssize_t mapping_path_show(struct kobject *kobj,
+				 struct kobj_attribute *attr, char *page)
+{
+	struct rnbd_clt_dev *dev;
+
+	dev = container_of(kobj, struct rnbd_clt_dev, kobj);
+
+	return sysfs_emit(page, "%s\n", dev->pathname);
+}
+
+static struct kobj_attribute rnbd_clt_mapping_path_attr =
+	__ATTR_RO(mapping_path);
+
+static ssize_t access_mode_show(struct kobject *kobj,
+				struct kobj_attribute *attr, char *page)
+{
+	struct rnbd_clt_dev *dev;
+
+	dev = container_of(kobj, struct rnbd_clt_dev, kobj);
+
+	return sysfs_emit(page, "%s\n", rnbd_access_mode_str(dev->access_mode));
+}
+
+static struct kobj_attribute rnbd_clt_access_mode =
+	__ATTR_RO(access_mode);
+
+static ssize_t rnbd_clt_unmap_dev_show(struct kobject *kobj,
+					struct kobj_attribute *attr, char *page)
+{
+	return sysfs_emit(page, "Usage: echo <normal|force> > %s\n",
+			  attr->attr.name);
+}
+
+static ssize_t rnbd_clt_unmap_dev_store(struct kobject *kobj,
+					 struct kobj_attribute *attr,
+					 const char *buf, size_t count)
+{
+	struct rnbd_clt_dev *dev;
+	char *opt, *options;
+	bool force;
+	int err;
+
+	opt = kstrdup(buf, GFP_KERNEL);
+	if (!opt)
+		return -ENOMEM;
+
+	options = strstrip(opt);
+	dev = container_of(kobj, struct rnbd_clt_dev, kobj);
+	if (sysfs_streq(options, "normal")) {
+		force = false;
+	} else if (sysfs_streq(options, "force")) {
+		force = true;
+	} else {
+		rnbd_clt_err(dev,
+			      "unmap_device: Invalid value: %s\n",
+			      options);
+		err = -EINVAL;
+		goto out;
+	}
+
+	rnbd_clt_info(dev, "Unmapping device, option: %s.\n",
+		       force ? "force" : "normal");
+
+	/*
+	 * We take explicit module reference only for one reason: do not
+	 * race with lockless rnbd_destroy_sessions().
+	 */
+	if (!try_module_get(THIS_MODULE)) {
+		err = -ENODEV;
+		goto out;
+	}
+	err = rnbd_clt_unmap_device(dev, force, &attr->attr);
+	if (err) {
+		if (err != -EALREADY)
+			rnbd_clt_err(dev, "unmap_device: %d\n",  err);
+		goto module_put;
+	}
+
+	/*
+	 * Here device can be vanished!
+	 */
+
+	err = count;
+
+module_put:
+	module_put(THIS_MODULE);
+out:
+	kfree(opt);
+
+	return err;
+}
+
+static struct kobj_attribute rnbd_clt_unmap_device_attr =
+	__ATTR(unmap_device, 0644, rnbd_clt_unmap_dev_show,
+	       rnbd_clt_unmap_dev_store);
+
+static ssize_t rnbd_clt_resize_dev_show(struct kobject *kobj,
+					 struct kobj_attribute *attr,
+					 char *page)
+{
+	return sysfs_emit(page, "Usage: echo <new size in sectors> > %s\n",
+			  attr->attr.name);
+}
+
+static ssize_t rnbd_clt_resize_dev_store(struct kobject *kobj,
+					  struct kobj_attribute *attr,
+					  const char *buf, size_t count)
+{
+	int ret;
+	unsigned long sectors;
+	struct rnbd_clt_dev *dev;
+
+	dev = container_of(kobj, struct rnbd_clt_dev, kobj);
+
+	ret = kstrtoul(buf, 0, &sectors);
+	if (ret)
+		return ret;
+
+	ret = rnbd_clt_resize_disk(dev, sectors);
+	if (ret)
+		return ret;
+
+	return count;
+}
+
+static struct kobj_attribute rnbd_clt_resize_dev_attr =
+	__ATTR(resize, 0644, rnbd_clt_resize_dev_show,
+	       rnbd_clt_resize_dev_store);
+
+static ssize_t rnbd_clt_remap_dev_show(struct kobject *kobj,
+					struct kobj_attribute *attr, char *page)
+{
+	return sysfs_emit(page, "Usage: echo <1> > %s\n", attr->attr.name);
+}
+
+static ssize_t rnbd_clt_remap_dev_store(struct kobject *kobj,
+					 struct kobj_attribute *attr,
+					 const char *buf, size_t count)
+{
+	struct rnbd_clt_dev *dev;
+	char *opt, *options;
+	int err;
+
+	opt = kstrdup(buf, GFP_KERNEL);
+	if (!opt)
+		return -ENOMEM;
+
+	options = strstrip(opt);
+	dev = container_of(kobj, struct rnbd_clt_dev, kobj);
+	if (!sysfs_streq(options, "1")) {
+		rnbd_clt_err(dev,
+			      "remap_device: Invalid value: %s\n",
+			      options);
+		err = -EINVAL;
+		goto out;
+	}
+	err = rnbd_clt_remap_device(dev);
+	if (likely(!err))
+		err = count;
+
+out:
+	kfree(opt);
+
+	return err;
+}
+
+static struct kobj_attribute rnbd_clt_remap_device_attr =
+	__ATTR(remap_device, 0644, rnbd_clt_remap_dev_show,
+	       rnbd_clt_remap_dev_store);
+
+static ssize_t session_show(struct kobject *kobj, struct kobj_attribute *attr,
+			    char *page)
+{
+	struct rnbd_clt_dev *dev;
+
+	dev = container_of(kobj, struct rnbd_clt_dev, kobj);
+
+	return sysfs_emit(page, "%s\n", dev->sess->sessname);
+}
+
+static struct kobj_attribute rnbd_clt_session_attr =
+	__ATTR_RO(session);
+
+static struct attribute *rnbd_dev_attrs[] = {
+	&rnbd_clt_unmap_device_attr.attr,
+	&rnbd_clt_resize_dev_attr.attr,
+	&rnbd_clt_remap_device_attr.attr,
+	&rnbd_clt_mapping_path_attr.attr,
+	&rnbd_clt_state_attr.attr,
+	&rnbd_clt_session_attr.attr,
+	&rnbd_clt_access_mode.attr,
+	&rnbd_clt_nr_poll_queues.attr,
+	NULL,
+};
+ATTRIBUTE_GROUPS(rnbd_dev);
+
+void rnbd_clt_remove_dev_symlink(struct rnbd_clt_dev *dev)
+{
+	/*
+	 * The module unload rnbd_client_exit path is racing with unmapping of
+	 * the last single device from the sysfs manually
+	 * i.e. rnbd_clt_unmap_dev_store() leading to a sysfs warning because
+	 * of sysfs link already was removed already.
+	 */
+	if (dev->blk_symlink_name) {
+		if (try_module_get(THIS_MODULE)) {
+			sysfs_remove_link(rnbd_devs_kobj, dev->blk_symlink_name);
+			module_put(THIS_MODULE);
+		}
+		/* It should be freed always. */
+		kfree(dev->blk_symlink_name);
+		dev->blk_symlink_name = NULL;
+	}
+}
+
+static struct kobj_type rnbd_dev_ktype = {
+	.sysfs_ops      = &kobj_sysfs_ops,
+	.default_groups = rnbd_dev_groups,
+};
+
+static int rnbd_clt_add_dev_kobj(struct rnbd_clt_dev *dev)
+{
+	int ret;
+	struct kobject *gd_kobj = &disk_to_dev(dev->gd)->kobj;
+
+	ret = kobject_init_and_add(&dev->kobj, &rnbd_dev_ktype, gd_kobj, "%s",
+				   "rnbd");
+	if (ret) {
+		rnbd_clt_err(dev, "Failed to create device sysfs dir, err: %d\n",
+			      ret);
+		kobject_put(&dev->kobj);
+	}
+	kobject_uevent(gd_kobj, KOBJ_ONLINE);
+
+	return ret;
+}
+
+static ssize_t rnbd_clt_map_device_show(struct kobject *kobj,
+					 struct kobj_attribute *attr,
+					 char *page)
+{
+	return sysfs_emit(page,
+			  "Usage: echo \"[dest_port=server port number] sessname=<name of the rtrs session> path=<[srcaddr@]dstaddr> [path=<[srcaddr@]dstaddr>] device_path=<full path on remote side> [access_mode=<ro|rw|migration>] [nr_poll_queues=<number of queues>]\" > %s\n\naddr ::= [ ip:<ipv4> | ip:<ipv6> | gid:<gid> ]\n",
+			 attr->attr.name);
+}
+
+static int rnbd_clt_get_path_name(struct rnbd_clt_dev *dev, char *buf,
+				   size_t len)
+{
+	int ret;
+	char pathname[NAME_MAX], *s;
+
+	strscpy(pathname, dev->pathname, sizeof(pathname));
+	while ((s = strchr(pathname, '/')))
+		s[0] = '!';
+
+	ret = snprintf(buf, len, "%s@%s", pathname, dev->sess->sessname);
+	if (ret >= len)
+		return -ENAMETOOLONG;
+
+	return 0;
+}
+
+static int rnbd_clt_add_dev_symlink(struct rnbd_clt_dev *dev)
+{
+	struct kobject *gd_kobj = &disk_to_dev(dev->gd)->kobj;
+	int ret, len;
+
+	len = strlen(dev->pathname) + strlen(dev->sess->sessname) + 2;
+	dev->blk_symlink_name = kzalloc(len, GFP_KERNEL);
+	if (!dev->blk_symlink_name) {
+		rnbd_clt_err(dev, "Failed to allocate memory for blk_symlink_name\n");
+		return -ENOMEM;
+	}
+
+	ret = rnbd_clt_get_path_name(dev, dev->blk_symlink_name,
+				      len);
+	if (ret) {
+		rnbd_clt_err(dev, "Failed to get /sys/block symlink path, err: %d\n",
+			      ret);
+		goto out_err;
+	}
+
+	ret = sysfs_create_link(rnbd_devs_kobj, gd_kobj,
+				dev->blk_symlink_name);
+	if (ret) {
+		rnbd_clt_err(dev, "Creating /sys/block symlink failed, err: %d\n",
+			      ret);
+		goto out_err;
+	}
+
+	return 0;
+
+out_err:
+	kfree(dev->blk_symlink_name);
+	dev->blk_symlink_name = NULL ;
+	return ret;
+}
+
+static ssize_t rnbd_clt_map_device_store(struct kobject *kobj,
+					  struct kobj_attribute *attr,
+					  const char *buf, size_t count)
+{
+	struct rnbd_clt_dev *dev;
+	struct rnbd_map_options opt;
+	int ret;
+	char pathname[NAME_MAX];
+	char sessname[NAME_MAX];
+	enum rnbd_access_mode access_mode = RNBD_ACCESS_RW;
+	u16 port_nr = RTRS_PORT;
+	u32 nr_poll_queues = 0;
+
+	struct sockaddr_storage *addrs;
+	struct rtrs_addr paths[6];
+	size_t path_cnt;
+
+	opt.sessname = sessname;
+	opt.paths = paths;
+	opt.path_cnt = &path_cnt;
+	opt.pathname = pathname;
+	opt.dest_port = &port_nr;
+	opt.access_mode = &access_mode;
+	opt.nr_poll_queues = &nr_poll_queues;
+	addrs = kcalloc(ARRAY_SIZE(paths) * 2, sizeof(*addrs), GFP_KERNEL);
+	if (!addrs)
+		return -ENOMEM;
+
+	for (path_cnt = 0; path_cnt < ARRAY_SIZE(paths); path_cnt++) {
+		paths[path_cnt].src = &addrs[path_cnt * 2];
+		paths[path_cnt].dst = &addrs[path_cnt * 2 + 1];
+	}
+
+	ret = rnbd_clt_parse_map_options(buf, ARRAY_SIZE(paths), &opt);
+	if (ret)
+		goto out;
+
+	pr_info("Mapping device %s on session %s, (access_mode: %s, nr_poll_queues: %d)\n",
+		pathname, sessname,
+		rnbd_access_mode_str(access_mode),
+		nr_poll_queues);
+
+	dev = rnbd_clt_map_device(sessname, paths, path_cnt, port_nr, pathname,
+				  access_mode, nr_poll_queues);
+	if (IS_ERR(dev)) {
+		ret = PTR_ERR(dev);
+		goto out;
+	}
+
+	ret = rnbd_clt_add_dev_kobj(dev);
+	if (ret)
+		goto unmap_dev;
+
+	ret = rnbd_clt_add_dev_symlink(dev);
+	if (ret)
+		goto unmap_dev;
+
+	kfree(addrs);
+	return count;
+
+unmap_dev:
+	rnbd_clt_unmap_device(dev, true, NULL);
+out:
+	kfree(addrs);
+	return ret;
+}
+
+static struct kobj_attribute rnbd_clt_map_device_attr =
+	__ATTR(map_device, 0644,
+	       rnbd_clt_map_device_show, rnbd_clt_map_device_store);
+
+static struct attribute *default_attrs[] = {
+	&rnbd_clt_map_device_attr.attr,
+	NULL,
+};
+
+static struct attribute_group default_attr_group = {
+	.attrs = default_attrs,
+};
+
+static const struct attribute_group *default_attr_groups[] = {
+	&default_attr_group,
+	NULL,
+};
+
+int rnbd_clt_create_sysfs_files(void)
+{
+	int err;
+
+	rnbd_dev_class = class_create(THIS_MODULE, "rnbd-client");
+	if (IS_ERR(rnbd_dev_class))
+		return PTR_ERR(rnbd_dev_class);
+
+	rnbd_dev = device_create_with_groups(rnbd_dev_class, NULL,
+					      MKDEV(0, 0), NULL,
+					      default_attr_groups, "ctl");
+	if (IS_ERR(rnbd_dev)) {
+		err = PTR_ERR(rnbd_dev);
+		goto cls_destroy;
+	}
+	rnbd_devs_kobj = kobject_create_and_add("devices", &rnbd_dev->kobj);
+	if (!rnbd_devs_kobj) {
+		err = -ENOMEM;
+		goto dev_destroy;
+	}
+
+	return 0;
+
+dev_destroy:
+	device_destroy(rnbd_dev_class, MKDEV(0, 0));
+cls_destroy:
+	class_destroy(rnbd_dev_class);
+
+	return err;
+}
+
+void rnbd_clt_destroy_sysfs_files(void)
+{
+	sysfs_remove_group(&rnbd_dev->kobj, &default_attr_group);
+	kobject_del(rnbd_devs_kobj);
+	kobject_put(rnbd_devs_kobj);
+	device_destroy(rnbd_dev_class, MKDEV(0, 0));
+	class_destroy(rnbd_dev_class);
+}
diff --git a/drivers/block/rnbd/rnbd-clt.c b/drivers/block/rnbd/rnbd-clt.c
new file mode 100644
index 000000000..5eb8c7855
--- /dev/null
+++ b/drivers/block/rnbd/rnbd-clt.c
@@ -0,0 +1,1848 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * RDMA Network Block Driver
+ *
+ * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved.
+ * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved.
+ * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved.
+ */
+
+#undef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt
+
+#include <linux/module.h>
+#include <linux/blkdev.h>
+#include <linux/hdreg.h>
+#include <linux/scatterlist.h>
+#include <linux/idr.h>
+
+#include "rnbd-clt.h"
+
+MODULE_DESCRIPTION("RDMA Network Block Device Client");
+MODULE_LICENSE("GPL");
+
+static int rnbd_client_major;
+static DEFINE_IDA(index_ida);
+static DEFINE_MUTEX(sess_lock);
+static LIST_HEAD(sess_list);
+static struct workqueue_struct *rnbd_clt_wq;
+
+/*
+ * Maximum number of partitions an instance can have.
+ * 6 bits = 64 minors = 63 partitions (one minor is used for the device itself)
+ */
+#define RNBD_PART_BITS		6
+
+static inline bool rnbd_clt_get_sess(struct rnbd_clt_session *sess)
+{
+	return refcount_inc_not_zero(&sess->refcount);
+}
+
+static void free_sess(struct rnbd_clt_session *sess);
+
+static void rnbd_clt_put_sess(struct rnbd_clt_session *sess)
+{
+	might_sleep();
+
+	if (refcount_dec_and_test(&sess->refcount))
+		free_sess(sess);
+}
+
+static void rnbd_clt_put_dev(struct rnbd_clt_dev *dev)
+{
+	might_sleep();
+
+	if (!refcount_dec_and_test(&dev->refcount))
+		return;
+
+	ida_free(&index_ida, dev->clt_device_id);
+	kfree(dev->hw_queues);
+	kfree(dev->pathname);
+	rnbd_clt_put_sess(dev->sess);
+	mutex_destroy(&dev->lock);
+	kfree(dev);
+}
+
+static inline bool rnbd_clt_get_dev(struct rnbd_clt_dev *dev)
+{
+	return refcount_inc_not_zero(&dev->refcount);
+}
+
+static void rnbd_clt_change_capacity(struct rnbd_clt_dev *dev,
+				    sector_t new_nsectors)
+{
+	if (get_capacity(dev->gd) == new_nsectors)
+		return;
+
+	/*
+	 * If the size changed, we need to revalidate it
+	 */
+	rnbd_clt_info(dev, "Device size changed from %llu to %llu sectors\n",
+		      get_capacity(dev->gd), new_nsectors);
+	set_capacity_and_notify(dev->gd, new_nsectors);
+}
+
+static int process_msg_open_rsp(struct rnbd_clt_dev *dev,
+				struct rnbd_msg_open_rsp *rsp)
+{
+	struct kobject *gd_kobj;
+	int err = 0;
+
+	mutex_lock(&dev->lock);
+	if (dev->dev_state == DEV_STATE_UNMAPPED) {
+		rnbd_clt_info(dev,
+			       "Ignoring Open-Response message from server for  unmapped device\n");
+		err = -ENOENT;
+		goto out;
+	}
+	if (dev->dev_state == DEV_STATE_MAPPED_DISCONNECTED) {
+		u64 nsectors = le64_to_cpu(rsp->nsectors);
+
+		rnbd_clt_change_capacity(dev, nsectors);
+		gd_kobj = &disk_to_dev(dev->gd)->kobj;
+		kobject_uevent(gd_kobj, KOBJ_ONLINE);
+		rnbd_clt_info(dev, "Device online, device remapped successfully\n");
+	}
+	if (!rsp->logical_block_size) {
+		err = -EINVAL;
+		goto out;
+	}
+	dev->device_id = le32_to_cpu(rsp->device_id);
+	dev->dev_state = DEV_STATE_MAPPED;
+
+out:
+	mutex_unlock(&dev->lock);
+
+	return err;
+}
+
+int rnbd_clt_resize_disk(struct rnbd_clt_dev *dev, sector_t newsize)
+{
+	int ret = 0;
+
+	mutex_lock(&dev->lock);
+	if (dev->dev_state != DEV_STATE_MAPPED) {
+		pr_err("Failed to set new size of the device, device is not opened\n");
+		ret = -ENOENT;
+		goto out;
+	}
+	rnbd_clt_change_capacity(dev, newsize);
+
+out:
+	mutex_unlock(&dev->lock);
+
+	return ret;
+}
+
+static inline void rnbd_clt_dev_requeue(struct rnbd_queue *q)
+{
+	if (WARN_ON(!q->hctx))
+		return;
+
+	/* We can come here from interrupt, thus async=true */
+	blk_mq_run_hw_queue(q->hctx, true);
+}
+
+enum {
+	RNBD_DELAY_IFBUSY = -1,
+};
+
+/**
+ * rnbd_get_cpu_qlist() - finds a list with HW queues to be rerun
+ * @sess:	Session to find a queue for
+ * @cpu:	Cpu to start the search from
+ *
+ * Description:
+ *     Each CPU has a list of HW queues, which needs to be rerun.  If a list
+ *     is not empty - it is marked with a bit.  This function finds first
+ *     set bit in a bitmap and returns corresponding CPU list.
+ */
+static struct rnbd_cpu_qlist *
+rnbd_get_cpu_qlist(struct rnbd_clt_session *sess, int cpu)
+{
+	int bit;
+
+	/* Search from cpu to nr_cpu_ids */
+	bit = find_next_bit(sess->cpu_queues_bm, nr_cpu_ids, cpu);
+	if (bit < nr_cpu_ids) {
+		return per_cpu_ptr(sess->cpu_queues, bit);
+	} else if (cpu != 0) {
+		/* Search from 0 to cpu */
+		bit = find_first_bit(sess->cpu_queues_bm, cpu);
+		if (bit < cpu)
+			return per_cpu_ptr(sess->cpu_queues, bit);
+	}
+
+	return NULL;
+}
+
+static inline int nxt_cpu(int cpu)
+{
+	return (cpu + 1) % nr_cpu_ids;
+}
+
+/**
+ * rnbd_rerun_if_needed() - rerun next queue marked as stopped
+ * @sess:	Session to rerun a queue on
+ *
+ * Description:
+ *     Each CPU has it's own list of HW queues, which should be rerun.
+ *     Function finds such list with HW queues, takes a list lock, picks up
+ *     the first HW queue out of the list and requeues it.
+ *
+ * Return:
+ *     True if the queue was requeued, false otherwise.
+ *
+ * Context:
+ *     Does not matter.
+ */
+static bool rnbd_rerun_if_needed(struct rnbd_clt_session *sess)
+{
+	struct rnbd_queue *q = NULL;
+	struct rnbd_cpu_qlist *cpu_q;
+	unsigned long flags;
+	int *cpup;
+
+	/*
+	 * To keep fairness and not to let other queues starve we always
+	 * try to wake up someone else in round-robin manner.  That of course
+	 * increases latency but queues always have a chance to be executed.
+	 */
+	cpup = get_cpu_ptr(sess->cpu_rr);
+	for (cpu_q = rnbd_get_cpu_qlist(sess, nxt_cpu(*cpup)); cpu_q;
+	     cpu_q = rnbd_get_cpu_qlist(sess, nxt_cpu(cpu_q->cpu))) {
+		if (!spin_trylock_irqsave(&cpu_q->requeue_lock, flags))
+			continue;
+		if (!test_bit(cpu_q->cpu, sess->cpu_queues_bm))
+			goto unlock;
+		q = list_first_entry_or_null(&cpu_q->requeue_list,
+					     typeof(*q), requeue_list);
+		if (WARN_ON(!q))
+			goto clear_bit;
+		list_del_init(&q->requeue_list);
+		clear_bit_unlock(0, &q->in_list);
+
+		if (list_empty(&cpu_q->requeue_list)) {
+			/* Clear bit if nothing is left */
+clear_bit:
+			clear_bit(cpu_q->cpu, sess->cpu_queues_bm);
+		}
+unlock:
+		spin_unlock_irqrestore(&cpu_q->requeue_lock, flags);
+
+		if (q)
+			break;
+	}
+
+	/**
+	 * Saves the CPU that is going to be requeued on the per-cpu var. Just
+	 * incrementing it doesn't work because rnbd_get_cpu_qlist() will
+	 * always return the first CPU with something on the queue list when the
+	 * value stored on the var is greater than the last CPU with something
+	 * on the list.
+	 */
+	if (cpu_q)
+		*cpup = cpu_q->cpu;
+	put_cpu_ptr(sess->cpu_rr);
+
+	if (q)
+		rnbd_clt_dev_requeue(q);
+
+	return q;
+}
+
+/**
+ * rnbd_rerun_all_if_idle() - rerun all queues left in the list if
+ *				 session is idling (there are no requests
+ *				 in-flight).
+ * @sess:	Session to rerun the queues on
+ *
+ * Description:
+ *     This function tries to rerun all stopped queues if there are no
+ *     requests in-flight anymore.  This function tries to solve an obvious
+ *     problem, when number of tags < than number of queues (hctx), which
+ *     are stopped and put to sleep.  If last permit, which has been just put,
+ *     does not wake up all left queues (hctxs), IO requests hang forever.
+ *
+ *     That can happen when all number of permits, say N, have been exhausted
+ *     from one CPU, and we have many block devices per session, say M.
+ *     Each block device has it's own queue (hctx) for each CPU, so eventually
+ *     we can put that number of queues (hctxs) to sleep: M x nr_cpu_ids.
+ *     If number of permits N < M x nr_cpu_ids finally we will get an IO hang.
+ *
+ *     To avoid this hang last caller of rnbd_put_permit() (last caller is the
+ *     one who observes sess->busy == 0) must wake up all remaining queues.
+ *
+ * Context:
+ *     Does not matter.
+ */
+static void rnbd_rerun_all_if_idle(struct rnbd_clt_session *sess)
+{
+	bool requeued;
+
+	do {
+		requeued = rnbd_rerun_if_needed(sess);
+	} while (atomic_read(&sess->busy) == 0 && requeued);
+}
+
+static struct rtrs_permit *rnbd_get_permit(struct rnbd_clt_session *sess,
+					     enum rtrs_clt_con_type con_type,
+					     enum wait_type wait)
+{
+	struct rtrs_permit *permit;
+
+	permit = rtrs_clt_get_permit(sess->rtrs, con_type, wait);
+	if (permit)
+		/* We have a subtle rare case here, when all permits can be
+		 * consumed before busy counter increased.  This is safe,
+		 * because loser will get NULL as a permit, observe 0 busy
+		 * counter and immediately restart the queue himself.
+		 */
+		atomic_inc(&sess->busy);
+
+	return permit;
+}
+
+static void rnbd_put_permit(struct rnbd_clt_session *sess,
+			     struct rtrs_permit *permit)
+{
+	rtrs_clt_put_permit(sess->rtrs, permit);
+	atomic_dec(&sess->busy);
+	/* Paired with rnbd_clt_dev_add_to_requeue().  Decrement first
+	 * and then check queue bits.
+	 */
+	smp_mb__after_atomic();
+	rnbd_rerun_all_if_idle(sess);
+}
+
+static struct rnbd_iu *rnbd_get_iu(struct rnbd_clt_session *sess,
+				     enum rtrs_clt_con_type con_type,
+				     enum wait_type wait)
+{
+	struct rnbd_iu *iu;
+	struct rtrs_permit *permit;
+
+	iu = kzalloc(sizeof(*iu), GFP_KERNEL);
+	if (!iu)
+		return NULL;
+
+	permit = rnbd_get_permit(sess, con_type, wait);
+	if (!permit) {
+		kfree(iu);
+		return NULL;
+	}
+
+	iu->permit = permit;
+	/*
+	 * 1st reference is dropped after finishing sending a "user" message,
+	 * 2nd reference is dropped after confirmation with the response is
+	 * returned.
+	 * 1st and 2nd can happen in any order, so the rnbd_iu should be
+	 * released (rtrs_permit returned to rtrs) only after both
+	 * are finished.
+	 */
+	atomic_set(&iu->refcount, 2);
+	init_waitqueue_head(&iu->comp.wait);
+	iu->comp.errno = INT_MAX;
+
+	if (sg_alloc_table(&iu->sgt, 1, GFP_KERNEL)) {
+		rnbd_put_permit(sess, permit);
+		kfree(iu);
+		return NULL;
+	}
+
+	return iu;
+}
+
+static void rnbd_put_iu(struct rnbd_clt_session *sess, struct rnbd_iu *iu)
+{
+	if (atomic_dec_and_test(&iu->refcount)) {
+		sg_free_table(&iu->sgt);
+		rnbd_put_permit(sess, iu->permit);
+		kfree(iu);
+	}
+}
+
+static void rnbd_softirq_done_fn(struct request *rq)
+{
+	struct rnbd_clt_dev *dev	= rq->q->disk->private_data;
+	struct rnbd_clt_session *sess	= dev->sess;
+	struct rnbd_iu *iu;
+
+	iu = blk_mq_rq_to_pdu(rq);
+	sg_free_table_chained(&iu->sgt, RNBD_INLINE_SG_CNT);
+	rnbd_put_permit(sess, iu->permit);
+	blk_mq_end_request(rq, errno_to_blk_status(iu->errno));
+}
+
+static void msg_io_conf(void *priv, int errno)
+{
+	struct rnbd_iu *iu = priv;
+	struct rnbd_clt_dev *dev = iu->dev;
+	struct request *rq = iu->rq;
+	int rw = rq_data_dir(rq);
+
+	iu->errno = errno;
+
+	blk_mq_complete_request(rq);
+
+	if (errno)
+		rnbd_clt_info_rl(dev, "%s I/O failed with err: %d\n",
+				 rw == READ ? "read" : "write", errno);
+}
+
+static void wake_up_iu_comp(struct rnbd_iu *iu, int errno)
+{
+	iu->comp.errno = errno;
+	wake_up(&iu->comp.wait);
+}
+
+static void msg_conf(void *priv, int errno)
+{
+	struct rnbd_iu *iu = priv;
+
+	iu->errno = errno;
+	schedule_work(&iu->work);
+}
+
+static int send_usr_msg(struct rtrs_clt_sess *rtrs, int dir,
+			struct rnbd_iu *iu, struct kvec *vec,
+			size_t len, struct scatterlist *sg, unsigned int sg_len,
+			void (*conf)(struct work_struct *work),
+			int *errno, int wait)
+{
+	int err;
+	struct rtrs_clt_req_ops req_ops;
+
+	INIT_WORK(&iu->work, conf);
+	req_ops = (struct rtrs_clt_req_ops) {
+		.priv = iu,
+		.conf_fn = msg_conf,
+	};
+	err = rtrs_clt_request(dir, &req_ops, rtrs, iu->permit,
+				vec, 1, len, sg, sg_len);
+	if (!err && wait) {
+		wait_event(iu->comp.wait, iu->comp.errno != INT_MAX);
+		*errno = iu->comp.errno;
+	} else {
+		*errno = 0;
+	}
+
+	return err;
+}
+
+static void msg_close_conf(struct work_struct *work)
+{
+	struct rnbd_iu *iu = container_of(work, struct rnbd_iu, work);
+	struct rnbd_clt_dev *dev = iu->dev;
+
+	wake_up_iu_comp(iu, iu->errno);
+	rnbd_put_iu(dev->sess, iu);
+	rnbd_clt_put_dev(dev);
+}
+
+static int send_msg_close(struct rnbd_clt_dev *dev, u32 device_id,
+			  enum wait_type wait)
+{
+	struct rnbd_clt_session *sess = dev->sess;
+	struct rnbd_msg_close msg;
+	struct rnbd_iu *iu;
+	struct kvec vec = {
+		.iov_base = &msg,
+		.iov_len  = sizeof(msg)
+	};
+	int err, errno;
+
+	iu = rnbd_get_iu(sess, RTRS_ADMIN_CON, RTRS_PERMIT_WAIT);
+	if (!iu)
+		return -ENOMEM;
+
+	iu->buf = NULL;
+	iu->dev = dev;
+
+	msg.hdr.type	= cpu_to_le16(RNBD_MSG_CLOSE);
+	msg.device_id	= cpu_to_le32(device_id);
+
+	WARN_ON(!rnbd_clt_get_dev(dev));
+	err = send_usr_msg(sess->rtrs, WRITE, iu, &vec, 0, NULL, 0,
+			   msg_close_conf, &errno, wait);
+	if (err) {
+		rnbd_clt_put_dev(dev);
+		rnbd_put_iu(sess, iu);
+	} else {
+		err = errno;
+	}
+
+	rnbd_put_iu(sess, iu);
+	return err;
+}
+
+static void msg_open_conf(struct work_struct *work)
+{
+	struct rnbd_iu *iu = container_of(work, struct rnbd_iu, work);
+	struct rnbd_msg_open_rsp *rsp = iu->buf;
+	struct rnbd_clt_dev *dev = iu->dev;
+	int errno = iu->errno;
+	bool from_map = false;
+
+	/* INIT state is only triggered from rnbd_clt_map_device */
+	if (dev->dev_state == DEV_STATE_INIT)
+		from_map = true;
+
+	if (errno) {
+		rnbd_clt_err(dev,
+			      "Opening failed, server responded: %d\n",
+			      errno);
+	} else {
+		errno = process_msg_open_rsp(dev, rsp);
+		if (errno) {
+			u32 device_id = le32_to_cpu(rsp->device_id);
+			/*
+			 * If server thinks its fine, but we fail to process
+			 * then be nice and send a close to server.
+			 */
+			send_msg_close(dev, device_id, RTRS_PERMIT_NOWAIT);
+		}
+	}
+	/* We free rsp in rnbd_clt_map_device for map scenario */
+	if (!from_map)
+		kfree(rsp);
+	wake_up_iu_comp(iu, errno);
+	rnbd_put_iu(dev->sess, iu);
+	rnbd_clt_put_dev(dev);
+}
+
+static void msg_sess_info_conf(struct work_struct *work)
+{
+	struct rnbd_iu *iu = container_of(work, struct rnbd_iu, work);
+	struct rnbd_msg_sess_info_rsp *rsp = iu->buf;
+	struct rnbd_clt_session *sess = iu->sess;
+
+	if (!iu->errno)
+		sess->ver = min_t(u8, rsp->ver, RNBD_PROTO_VER_MAJOR);
+
+	kfree(rsp);
+	wake_up_iu_comp(iu, iu->errno);
+	rnbd_put_iu(sess, iu);
+	rnbd_clt_put_sess(sess);
+}
+
+static int send_msg_open(struct rnbd_clt_dev *dev, enum wait_type wait)
+{
+	struct rnbd_clt_session *sess = dev->sess;
+	struct rnbd_msg_open_rsp *rsp;
+	struct rnbd_msg_open msg;
+	struct rnbd_iu *iu;
+	struct kvec vec = {
+		.iov_base = &msg,
+		.iov_len  = sizeof(msg)
+	};
+	int err, errno;
+
+	rsp = kzalloc(sizeof(*rsp), GFP_KERNEL);
+	if (!rsp)
+		return -ENOMEM;
+
+	iu = rnbd_get_iu(sess, RTRS_ADMIN_CON, RTRS_PERMIT_WAIT);
+	if (!iu) {
+		kfree(rsp);
+		return -ENOMEM;
+	}
+
+	iu->buf = rsp;
+	iu->dev = dev;
+
+	sg_init_one(iu->sgt.sgl, rsp, sizeof(*rsp));
+
+	msg.hdr.type	= cpu_to_le16(RNBD_MSG_OPEN);
+	msg.access_mode	= dev->access_mode;
+	strscpy(msg.dev_name, dev->pathname, sizeof(msg.dev_name));
+
+	WARN_ON(!rnbd_clt_get_dev(dev));
+	err = send_usr_msg(sess->rtrs, READ, iu,
+			   &vec, sizeof(*rsp), iu->sgt.sgl, 1,
+			   msg_open_conf, &errno, wait);
+	if (err) {
+		rnbd_clt_put_dev(dev);
+		rnbd_put_iu(sess, iu);
+		kfree(rsp);
+	} else {
+		err = errno;
+	}
+
+	rnbd_put_iu(sess, iu);
+	return err;
+}
+
+static int send_msg_sess_info(struct rnbd_clt_session *sess, enum wait_type wait)
+{
+	struct rnbd_msg_sess_info_rsp *rsp;
+	struct rnbd_msg_sess_info msg;
+	struct rnbd_iu *iu;
+	struct kvec vec = {
+		.iov_base = &msg,
+		.iov_len  = sizeof(msg)
+	};
+	int err, errno;
+
+	rsp = kzalloc(sizeof(*rsp), GFP_KERNEL);
+	if (!rsp)
+		return -ENOMEM;
+
+	iu = rnbd_get_iu(sess, RTRS_ADMIN_CON, RTRS_PERMIT_WAIT);
+	if (!iu) {
+		kfree(rsp);
+		return -ENOMEM;
+	}
+
+	iu->buf = rsp;
+	iu->sess = sess;
+	sg_init_one(iu->sgt.sgl, rsp, sizeof(*rsp));
+
+	msg.hdr.type = cpu_to_le16(RNBD_MSG_SESS_INFO);
+	msg.ver      = RNBD_PROTO_VER_MAJOR;
+
+	if (!rnbd_clt_get_sess(sess)) {
+		/*
+		 * That can happen only in one case, when RTRS has restablished
+		 * the connection and link_ev() is called, but session is almost
+		 * dead, last reference on session is put and caller is waiting
+		 * for RTRS to close everything.
+		 */
+		err = -ENODEV;
+		goto put_iu;
+	}
+	err = send_usr_msg(sess->rtrs, READ, iu,
+			   &vec, sizeof(*rsp), iu->sgt.sgl, 1,
+			   msg_sess_info_conf, &errno, wait);
+	if (err) {
+		rnbd_clt_put_sess(sess);
+put_iu:
+		rnbd_put_iu(sess, iu);
+		kfree(rsp);
+	} else {
+		err = errno;
+	}
+	rnbd_put_iu(sess, iu);
+	return err;
+}
+
+static void set_dev_states_to_disconnected(struct rnbd_clt_session *sess)
+{
+	struct rnbd_clt_dev *dev;
+	struct kobject *gd_kobj;
+
+	mutex_lock(&sess->lock);
+	list_for_each_entry(dev, &sess->devs_list, list) {
+		rnbd_clt_err(dev, "Device disconnected.\n");
+
+		mutex_lock(&dev->lock);
+		if (dev->dev_state == DEV_STATE_MAPPED) {
+			dev->dev_state = DEV_STATE_MAPPED_DISCONNECTED;
+			gd_kobj = &disk_to_dev(dev->gd)->kobj;
+			kobject_uevent(gd_kobj, KOBJ_OFFLINE);
+		}
+		mutex_unlock(&dev->lock);
+	}
+	mutex_unlock(&sess->lock);
+}
+
+static void remap_devs(struct rnbd_clt_session *sess)
+{
+	struct rnbd_clt_dev *dev;
+	struct rtrs_attrs attrs;
+	int err;
+
+	/*
+	 * Careful here: we are called from RTRS link event directly,
+	 * thus we can't send any RTRS request and wait for response
+	 * or RTRS will not be able to complete request with failure
+	 * if something goes wrong (failing of outstanding requests
+	 * happens exactly from the context where we are blocking now).
+	 *
+	 * So to avoid deadlocks each usr message sent from here must
+	 * be asynchronous.
+	 */
+
+	err = send_msg_sess_info(sess, RTRS_PERMIT_NOWAIT);
+	if (err) {
+		pr_err("send_msg_sess_info(\"%s\"): %d\n", sess->sessname, err);
+		return;
+	}
+
+	err = rtrs_clt_query(sess->rtrs, &attrs);
+	if (err) {
+		pr_err("rtrs_clt_query(\"%s\"): %d\n", sess->sessname, err);
+		return;
+	}
+	mutex_lock(&sess->lock);
+	sess->max_io_size = attrs.max_io_size;
+
+	list_for_each_entry(dev, &sess->devs_list, list) {
+		bool skip;
+
+		mutex_lock(&dev->lock);
+		skip = (dev->dev_state == DEV_STATE_INIT);
+		mutex_unlock(&dev->lock);
+		if (skip)
+			/*
+			 * When device is establishing connection for the first
+			 * time - do not remap, it will be closed soon.
+			 */
+			continue;
+
+		rnbd_clt_info(dev, "session reconnected, remapping device\n");
+		err = send_msg_open(dev, RTRS_PERMIT_NOWAIT);
+		if (err) {
+			rnbd_clt_err(dev, "send_msg_open(): %d\n", err);
+			break;
+		}
+	}
+	mutex_unlock(&sess->lock);
+}
+
+static void rnbd_clt_link_ev(void *priv, enum rtrs_clt_link_ev ev)
+{
+	struct rnbd_clt_session *sess = priv;
+
+	switch (ev) {
+	case RTRS_CLT_LINK_EV_DISCONNECTED:
+		set_dev_states_to_disconnected(sess);
+		break;
+	case RTRS_CLT_LINK_EV_RECONNECTED:
+		remap_devs(sess);
+		break;
+	default:
+		pr_err("Unknown session event received (%d), session: %s\n",
+		       ev, sess->sessname);
+	}
+}
+
+static void rnbd_init_cpu_qlists(struct rnbd_cpu_qlist __percpu *cpu_queues)
+{
+	unsigned int cpu;
+	struct rnbd_cpu_qlist *cpu_q;
+
+	for_each_possible_cpu(cpu) {
+		cpu_q = per_cpu_ptr(cpu_queues, cpu);
+
+		cpu_q->cpu = cpu;
+		INIT_LIST_HEAD(&cpu_q->requeue_list);
+		spin_lock_init(&cpu_q->requeue_lock);
+	}
+}
+
+static void destroy_mq_tags(struct rnbd_clt_session *sess)
+{
+	if (sess->tag_set.tags)
+		blk_mq_free_tag_set(&sess->tag_set);
+}
+
+static inline void wake_up_rtrs_waiters(struct rnbd_clt_session *sess)
+{
+	sess->rtrs_ready = true;
+	wake_up_all(&sess->rtrs_waitq);
+}
+
+static void close_rtrs(struct rnbd_clt_session *sess)
+{
+	might_sleep();
+
+	if (!IS_ERR_OR_NULL(sess->rtrs)) {
+		rtrs_clt_close(sess->rtrs);
+		sess->rtrs = NULL;
+		wake_up_rtrs_waiters(sess);
+	}
+}
+
+static void free_sess(struct rnbd_clt_session *sess)
+{
+	WARN_ON(!list_empty(&sess->devs_list));
+
+	might_sleep();
+
+	close_rtrs(sess);
+	destroy_mq_tags(sess);
+	if (!list_empty(&sess->list)) {
+		mutex_lock(&sess_lock);
+		list_del(&sess->list);
+		mutex_unlock(&sess_lock);
+	}
+	free_percpu(sess->cpu_queues);
+	free_percpu(sess->cpu_rr);
+	mutex_destroy(&sess->lock);
+	kfree(sess);
+}
+
+static struct rnbd_clt_session *alloc_sess(const char *sessname)
+{
+	struct rnbd_clt_session *sess;
+	int err, cpu;
+
+	sess = kzalloc_node(sizeof(*sess), GFP_KERNEL, NUMA_NO_NODE);
+	if (!sess)
+		return ERR_PTR(-ENOMEM);
+	strscpy(sess->sessname, sessname, sizeof(sess->sessname));
+	atomic_set(&sess->busy, 0);
+	mutex_init(&sess->lock);
+	INIT_LIST_HEAD(&sess->devs_list);
+	INIT_LIST_HEAD(&sess->list);
+	bitmap_zero(sess->cpu_queues_bm, num_possible_cpus());
+	init_waitqueue_head(&sess->rtrs_waitq);
+	refcount_set(&sess->refcount, 1);
+
+	sess->cpu_queues = alloc_percpu(struct rnbd_cpu_qlist);
+	if (!sess->cpu_queues) {
+		err = -ENOMEM;
+		goto err;
+	}
+	rnbd_init_cpu_qlists(sess->cpu_queues);
+
+	/*
+	 * That is simple percpu variable which stores cpu indices, which are
+	 * incremented on each access.  We need that for the sake of fairness
+	 * to wake up queues in a round-robin manner.
+	 */
+	sess->cpu_rr = alloc_percpu(int);
+	if (!sess->cpu_rr) {
+		err = -ENOMEM;
+		goto err;
+	}
+	for_each_possible_cpu(cpu)
+		* per_cpu_ptr(sess->cpu_rr, cpu) = cpu;
+
+	return sess;
+
+err:
+	free_sess(sess);
+
+	return ERR_PTR(err);
+}
+
+static int wait_for_rtrs_connection(struct rnbd_clt_session *sess)
+{
+	wait_event(sess->rtrs_waitq, sess->rtrs_ready);
+	if (IS_ERR_OR_NULL(sess->rtrs))
+		return -ECONNRESET;
+
+	return 0;
+}
+
+static void wait_for_rtrs_disconnection(struct rnbd_clt_session *sess)
+	__releases(&sess_lock)
+	__acquires(&sess_lock)
+{
+	DEFINE_WAIT(wait);
+
+	prepare_to_wait(&sess->rtrs_waitq, &wait, TASK_UNINTERRUPTIBLE);
+	if (IS_ERR_OR_NULL(sess->rtrs)) {
+		finish_wait(&sess->rtrs_waitq, &wait);
+		return;
+	}
+	mutex_unlock(&sess_lock);
+	/* loop in caller, see __find_and_get_sess().
+	 * You can't leave mutex locked and call schedule(), you will catch a
+	 * deadlock with a caller of free_sess(), which has just put the last
+	 * reference and is about to take the sess_lock in order to delete
+	 * the session from the list.
+	 */
+	schedule();
+	mutex_lock(&sess_lock);
+}
+
+static struct rnbd_clt_session *__find_and_get_sess(const char *sessname)
+	__releases(&sess_lock)
+	__acquires(&sess_lock)
+{
+	struct rnbd_clt_session *sess, *sn;
+	int err;
+
+again:
+	list_for_each_entry_safe(sess, sn, &sess_list, list) {
+		if (strcmp(sessname, sess->sessname))
+			continue;
+
+		if (sess->rtrs_ready && IS_ERR_OR_NULL(sess->rtrs))
+			/*
+			 * No RTRS connection, session is dying.
+			 */
+			continue;
+
+		if (rnbd_clt_get_sess(sess)) {
+			/*
+			 * Alive session is found, wait for RTRS connection.
+			 */
+			mutex_unlock(&sess_lock);
+			err = wait_for_rtrs_connection(sess);
+			if (err)
+				rnbd_clt_put_sess(sess);
+			mutex_lock(&sess_lock);
+
+			if (err)
+				/* Session is dying, repeat the loop */
+				goto again;
+
+			return sess;
+		}
+		/*
+		 * Ref is 0, session is dying, wait for RTRS disconnect
+		 * in order to avoid session names clashes.
+		 */
+		wait_for_rtrs_disconnection(sess);
+		/*
+		 * RTRS is disconnected and soon session will be freed,
+		 * so repeat a loop.
+		 */
+		goto again;
+	}
+
+	return NULL;
+}
+
+/* caller is responsible for initializing 'first' to false */
+static struct
+rnbd_clt_session *find_or_create_sess(const char *sessname, bool *first)
+{
+	struct rnbd_clt_session *sess = NULL;
+
+	mutex_lock(&sess_lock);
+	sess = __find_and_get_sess(sessname);
+	if (!sess) {
+		sess = alloc_sess(sessname);
+		if (IS_ERR(sess)) {
+			mutex_unlock(&sess_lock);
+			return sess;
+		}
+		list_add(&sess->list, &sess_list);
+		*first = true;
+	}
+	mutex_unlock(&sess_lock);
+
+	return sess;
+}
+
+static int rnbd_client_open(struct block_device *block_device, fmode_t mode)
+{
+	struct rnbd_clt_dev *dev = block_device->bd_disk->private_data;
+
+	if (get_disk_ro(dev->gd) && (mode & FMODE_WRITE))
+		return -EPERM;
+
+	if (dev->dev_state == DEV_STATE_UNMAPPED ||
+	    !rnbd_clt_get_dev(dev))
+		return -EIO;
+
+	return 0;
+}
+
+static void rnbd_client_release(struct gendisk *gen, fmode_t mode)
+{
+	struct rnbd_clt_dev *dev = gen->private_data;
+
+	rnbd_clt_put_dev(dev);
+}
+
+static int rnbd_client_getgeo(struct block_device *block_device,
+			      struct hd_geometry *geo)
+{
+	u64 size;
+	struct rnbd_clt_dev *dev = block_device->bd_disk->private_data;
+	struct queue_limits *limit = &dev->queue->limits;
+
+	size = dev->size * (limit->logical_block_size / SECTOR_SIZE);
+	geo->cylinders	= size >> 6;	/* size/64 */
+	geo->heads	= 4;
+	geo->sectors	= 16;
+	geo->start	= 0;
+
+	return 0;
+}
+
+static const struct block_device_operations rnbd_client_ops = {
+	.owner		= THIS_MODULE,
+	.open		= rnbd_client_open,
+	.release	= rnbd_client_release,
+	.getgeo		= rnbd_client_getgeo
+};
+
+/* The amount of data that belongs to an I/O and the amount of data that
+ * should be read or written to the disk (bi_size) can differ.
+ *
+ * E.g. When WRITE_SAME is used, only a small amount of data is
+ * transferred that is then written repeatedly over a lot of sectors.
+ *
+ * Get the size of data to be transferred via RTRS by summing up the size
+ * of the scather-gather list entries.
+ */
+static size_t rnbd_clt_get_sg_size(struct scatterlist *sglist, u32 len)
+{
+	struct scatterlist *sg;
+	size_t tsize = 0;
+	int i;
+
+	for_each_sg(sglist, sg, len, i)
+		tsize += sg->length;
+	return tsize;
+}
+
+static int rnbd_client_xfer_request(struct rnbd_clt_dev *dev,
+				     struct request *rq,
+				     struct rnbd_iu *iu)
+{
+	struct rtrs_clt_sess *rtrs = dev->sess->rtrs;
+	struct rtrs_permit *permit = iu->permit;
+	struct rnbd_msg_io msg;
+	struct rtrs_clt_req_ops req_ops;
+	unsigned int sg_cnt = 0;
+	struct kvec vec;
+	size_t size;
+	int err;
+
+	iu->rq		= rq;
+	iu->dev		= dev;
+	msg.sector	= cpu_to_le64(blk_rq_pos(rq));
+	msg.bi_size	= cpu_to_le32(blk_rq_bytes(rq));
+	msg.rw		= cpu_to_le32(rq_to_rnbd_flags(rq));
+	msg.prio	= cpu_to_le16(req_get_ioprio(rq));
+
+	/*
+	 * We only support discards with single segment for now.
+	 * See queue limits.
+	 */
+	if (req_op(rq) != REQ_OP_DISCARD)
+		sg_cnt = blk_rq_map_sg(dev->queue, rq, iu->sgt.sgl);
+
+	if (sg_cnt == 0)
+		sg_mark_end(&iu->sgt.sgl[0]);
+
+	msg.hdr.type	= cpu_to_le16(RNBD_MSG_IO);
+	msg.device_id	= cpu_to_le32(dev->device_id);
+
+	vec = (struct kvec) {
+		.iov_base = &msg,
+		.iov_len  = sizeof(msg)
+	};
+	size = rnbd_clt_get_sg_size(iu->sgt.sgl, sg_cnt);
+	req_ops = (struct rtrs_clt_req_ops) {
+		.priv = iu,
+		.conf_fn = msg_io_conf,
+	};
+	err = rtrs_clt_request(rq_data_dir(rq), &req_ops, rtrs, permit,
+			       &vec, 1, size, iu->sgt.sgl, sg_cnt);
+	if (err) {
+		rnbd_clt_err_rl(dev, "RTRS failed to transfer IO, err: %d\n",
+				 err);
+		return err;
+	}
+
+	return 0;
+}
+
+/**
+ * rnbd_clt_dev_add_to_requeue() - add device to requeue if session is busy
+ * @dev:	Device to be checked
+ * @q:		Queue to be added to the requeue list if required
+ *
+ * Description:
+ *     If session is busy, that means someone will requeue us when resources
+ *     are freed.  If session is not doing anything - device is not added to
+ *     the list and @false is returned.
+ */
+static bool rnbd_clt_dev_add_to_requeue(struct rnbd_clt_dev *dev,
+						struct rnbd_queue *q)
+{
+	struct rnbd_clt_session *sess = dev->sess;
+	struct rnbd_cpu_qlist *cpu_q;
+	unsigned long flags;
+	bool added = true;
+	bool need_set;
+
+	cpu_q = get_cpu_ptr(sess->cpu_queues);
+	spin_lock_irqsave(&cpu_q->requeue_lock, flags);
+
+	if (!test_and_set_bit_lock(0, &q->in_list)) {
+		if (WARN_ON(!list_empty(&q->requeue_list)))
+			goto unlock;
+
+		need_set = !test_bit(cpu_q->cpu, sess->cpu_queues_bm);
+		if (need_set) {
+			set_bit(cpu_q->cpu, sess->cpu_queues_bm);
+			/* Paired with rnbd_put_permit(). Set a bit first
+			 * and then observe the busy counter.
+			 */
+			smp_mb__before_atomic();
+		}
+		if (atomic_read(&sess->busy)) {
+			list_add_tail(&q->requeue_list, &cpu_q->requeue_list);
+		} else {
+			/* Very unlikely, but possible: busy counter was
+			 * observed as zero.  Drop all bits and return
+			 * false to restart the queue by ourselves.
+			 */
+			if (need_set)
+				clear_bit(cpu_q->cpu, sess->cpu_queues_bm);
+			clear_bit_unlock(0, &q->in_list);
+			added = false;
+		}
+	}
+unlock:
+	spin_unlock_irqrestore(&cpu_q->requeue_lock, flags);
+	put_cpu_ptr(sess->cpu_queues);
+
+	return added;
+}
+
+static void rnbd_clt_dev_kick_mq_queue(struct rnbd_clt_dev *dev,
+					struct blk_mq_hw_ctx *hctx,
+					int delay)
+{
+	struct rnbd_queue *q = hctx->driver_data;
+
+	if (delay != RNBD_DELAY_IFBUSY)
+		blk_mq_delay_run_hw_queue(hctx, delay);
+	else if (!rnbd_clt_dev_add_to_requeue(dev, q))
+		/*
+		 * If session is not busy we have to restart
+		 * the queue ourselves.
+		 */
+		blk_mq_delay_run_hw_queue(hctx, 10/*ms*/);
+}
+
+static blk_status_t rnbd_queue_rq(struct blk_mq_hw_ctx *hctx,
+				   const struct blk_mq_queue_data *bd)
+{
+	struct request *rq = bd->rq;
+	struct rnbd_clt_dev *dev = rq->q->disk->private_data;
+	struct rnbd_iu *iu = blk_mq_rq_to_pdu(rq);
+	int err;
+	blk_status_t ret = BLK_STS_IOERR;
+
+	if (dev->dev_state != DEV_STATE_MAPPED)
+		return BLK_STS_IOERR;
+
+	iu->permit = rnbd_get_permit(dev->sess, RTRS_IO_CON,
+				      RTRS_PERMIT_NOWAIT);
+	if (!iu->permit) {
+		rnbd_clt_dev_kick_mq_queue(dev, hctx, RNBD_DELAY_IFBUSY);
+		return BLK_STS_RESOURCE;
+	}
+
+	iu->sgt.sgl = iu->first_sgl;
+	err = sg_alloc_table_chained(&iu->sgt,
+				     /* Even-if the request has no segment,
+				      * sglist must have one entry at least.
+				      */
+				     blk_rq_nr_phys_segments(rq) ? : 1,
+				     iu->sgt.sgl,
+				     RNBD_INLINE_SG_CNT);
+	if (err) {
+		rnbd_clt_err_rl(dev, "sg_alloc_table_chained ret=%d\n", err);
+		rnbd_clt_dev_kick_mq_queue(dev, hctx, 10/*ms*/);
+		rnbd_put_permit(dev->sess, iu->permit);
+		return BLK_STS_RESOURCE;
+	}
+
+	blk_mq_start_request(rq);
+	err = rnbd_client_xfer_request(dev, rq, iu);
+	if (err == 0)
+		return BLK_STS_OK;
+	if (err == -EAGAIN || err == -ENOMEM) {
+		rnbd_clt_dev_kick_mq_queue(dev, hctx, 10/*ms*/);
+		ret = BLK_STS_RESOURCE;
+	}
+	sg_free_table_chained(&iu->sgt, RNBD_INLINE_SG_CNT);
+	rnbd_put_permit(dev->sess, iu->permit);
+	return ret;
+}
+
+static int rnbd_rdma_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
+{
+	struct rnbd_queue *q = hctx->driver_data;
+	struct rnbd_clt_dev *dev = q->dev;
+
+	return rtrs_clt_rdma_cq_direct(dev->sess->rtrs, hctx->queue_num);
+}
+
+static void rnbd_rdma_map_queues(struct blk_mq_tag_set *set)
+{
+	struct rnbd_clt_session *sess = set->driver_data;
+
+	/* shared read/write queues */
+	set->map[HCTX_TYPE_DEFAULT].nr_queues = num_online_cpus();
+	set->map[HCTX_TYPE_DEFAULT].queue_offset = 0;
+	set->map[HCTX_TYPE_READ].nr_queues = num_online_cpus();
+	set->map[HCTX_TYPE_READ].queue_offset = 0;
+	blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
+	blk_mq_map_queues(&set->map[HCTX_TYPE_READ]);
+
+	if (sess->nr_poll_queues) {
+		/* dedicated queue for poll */
+		set->map[HCTX_TYPE_POLL].nr_queues = sess->nr_poll_queues;
+		set->map[HCTX_TYPE_POLL].queue_offset = set->map[HCTX_TYPE_READ].queue_offset +
+			set->map[HCTX_TYPE_READ].nr_queues;
+		blk_mq_map_queues(&set->map[HCTX_TYPE_POLL]);
+		pr_info("[session=%s] mapped %d/%d/%d default/read/poll queues.\n",
+			sess->sessname,
+			set->map[HCTX_TYPE_DEFAULT].nr_queues,
+			set->map[HCTX_TYPE_READ].nr_queues,
+			set->map[HCTX_TYPE_POLL].nr_queues);
+	} else {
+		pr_info("[session=%s] mapped %d/%d default/read queues.\n",
+			sess->sessname,
+			set->map[HCTX_TYPE_DEFAULT].nr_queues,
+			set->map[HCTX_TYPE_READ].nr_queues);
+	}
+}
+
+static struct blk_mq_ops rnbd_mq_ops = {
+	.queue_rq	= rnbd_queue_rq,
+	.complete	= rnbd_softirq_done_fn,
+	.map_queues     = rnbd_rdma_map_queues,
+	.poll           = rnbd_rdma_poll,
+};
+
+static int setup_mq_tags(struct rnbd_clt_session *sess)
+{
+	struct blk_mq_tag_set *tag_set = &sess->tag_set;
+
+	memset(tag_set, 0, sizeof(*tag_set));
+	tag_set->ops		= &rnbd_mq_ops;
+	tag_set->queue_depth	= sess->queue_depth;
+	tag_set->numa_node		= NUMA_NO_NODE;
+	tag_set->flags		= BLK_MQ_F_SHOULD_MERGE |
+				  BLK_MQ_F_TAG_QUEUE_SHARED;
+	tag_set->cmd_size	= sizeof(struct rnbd_iu) + RNBD_RDMA_SGL_SIZE;
+
+	/* for HCTX_TYPE_DEFAULT, HCTX_TYPE_READ, HCTX_TYPE_POLL */
+	tag_set->nr_maps        = sess->nr_poll_queues ? HCTX_MAX_TYPES : 2;
+	/*
+	 * HCTX_TYPE_DEFAULT and HCTX_TYPE_READ share one set of queues
+	 * others are for HCTX_TYPE_POLL
+	 */
+	tag_set->nr_hw_queues	= num_online_cpus() + sess->nr_poll_queues;
+	tag_set->driver_data    = sess;
+
+	return blk_mq_alloc_tag_set(tag_set);
+}
+
+static struct rnbd_clt_session *
+find_and_get_or_create_sess(const char *sessname,
+			    const struct rtrs_addr *paths,
+			    size_t path_cnt, u16 port_nr, u32 nr_poll_queues)
+{
+	struct rnbd_clt_session *sess;
+	struct rtrs_attrs attrs;
+	int err;
+	bool first = false;
+	struct rtrs_clt_ops rtrs_ops;
+
+	sess = find_or_create_sess(sessname, &first);
+	if (sess == ERR_PTR(-ENOMEM)) {
+		return ERR_PTR(-ENOMEM);
+	} else if ((nr_poll_queues && !first) ||  (!nr_poll_queues && sess->nr_poll_queues)) {
+		/*
+		 * A device MUST have its own session to use the polling-mode.
+		 * It must fail to map new device with the same session.
+		 */
+		err = -EINVAL;
+		goto put_sess;
+	}
+
+	if (!first)
+		return sess;
+
+	if (!path_cnt) {
+		pr_err("Session %s not found, and path parameter not given", sessname);
+		err = -ENXIO;
+		goto put_sess;
+	}
+
+	rtrs_ops = (struct rtrs_clt_ops) {
+		.priv = sess,
+		.link_ev = rnbd_clt_link_ev,
+	};
+	/*
+	 * Nothing was found, establish rtrs connection and proceed further.
+	 */
+	sess->rtrs = rtrs_clt_open(&rtrs_ops, sessname,
+				   paths, path_cnt, port_nr,
+				   0, /* Do not use pdu of rtrs */
+				   RECONNECT_DELAY,
+				   MAX_RECONNECTS, nr_poll_queues);
+	if (IS_ERR(sess->rtrs)) {
+		err = PTR_ERR(sess->rtrs);
+		goto wake_up_and_put;
+	}
+
+	err = rtrs_clt_query(sess->rtrs, &attrs);
+	if (err)
+		goto close_rtrs;
+
+	sess->max_io_size = attrs.max_io_size;
+	sess->queue_depth = attrs.queue_depth;
+	sess->nr_poll_queues = nr_poll_queues;
+	sess->max_segments = attrs.max_segments;
+
+	err = setup_mq_tags(sess);
+	if (err)
+		goto close_rtrs;
+
+	err = send_msg_sess_info(sess, RTRS_PERMIT_WAIT);
+	if (err)
+		goto close_rtrs;
+
+	wake_up_rtrs_waiters(sess);
+
+	return sess;
+
+close_rtrs:
+	close_rtrs(sess);
+put_sess:
+	rnbd_clt_put_sess(sess);
+
+	return ERR_PTR(err);
+
+wake_up_and_put:
+	wake_up_rtrs_waiters(sess);
+	goto put_sess;
+}
+
+static inline void rnbd_init_hw_queue(struct rnbd_clt_dev *dev,
+				       struct rnbd_queue *q,
+				       struct blk_mq_hw_ctx *hctx)
+{
+	INIT_LIST_HEAD(&q->requeue_list);
+	q->dev  = dev;
+	q->hctx = hctx;
+}
+
+static void rnbd_init_mq_hw_queues(struct rnbd_clt_dev *dev)
+{
+	unsigned long i;
+	struct blk_mq_hw_ctx *hctx;
+	struct rnbd_queue *q;
+
+	queue_for_each_hw_ctx(dev->queue, hctx, i) {
+		q = &dev->hw_queues[i];
+		rnbd_init_hw_queue(dev, q, hctx);
+		hctx->driver_data = q;
+	}
+}
+
+static void setup_request_queue(struct rnbd_clt_dev *dev,
+				struct rnbd_msg_open_rsp *rsp)
+{
+	blk_queue_logical_block_size(dev->queue,
+				     le16_to_cpu(rsp->logical_block_size));
+	blk_queue_physical_block_size(dev->queue,
+				      le16_to_cpu(rsp->physical_block_size));
+	blk_queue_max_hw_sectors(dev->queue,
+				 dev->sess->max_io_size / SECTOR_SIZE);
+
+	/*
+	 * we don't support discards to "discontiguous" segments
+	 * in on request
+	 */
+	blk_queue_max_discard_segments(dev->queue, 1);
+
+	blk_queue_max_discard_sectors(dev->queue,
+				      le32_to_cpu(rsp->max_discard_sectors));
+	dev->queue->limits.discard_granularity =
+					le32_to_cpu(rsp->discard_granularity);
+	dev->queue->limits.discard_alignment =
+					le32_to_cpu(rsp->discard_alignment);
+	if (le16_to_cpu(rsp->secure_discard))
+		blk_queue_max_secure_erase_sectors(dev->queue,
+					le32_to_cpu(rsp->max_discard_sectors));
+	blk_queue_flag_set(QUEUE_FLAG_SAME_COMP, dev->queue);
+	blk_queue_flag_set(QUEUE_FLAG_SAME_FORCE, dev->queue);
+	blk_queue_max_segments(dev->queue, dev->sess->max_segments);
+	blk_queue_io_opt(dev->queue, dev->sess->max_io_size);
+	blk_queue_virt_boundary(dev->queue, SZ_4K - 1);
+	blk_queue_write_cache(dev->queue,
+			      !!(rsp->cache_policy & RNBD_WRITEBACK),
+			      !!(rsp->cache_policy & RNBD_FUA));
+}
+
+static int rnbd_clt_setup_gen_disk(struct rnbd_clt_dev *dev,
+				   struct rnbd_msg_open_rsp *rsp, int idx)
+{
+	int err;
+
+	dev->gd->major		= rnbd_client_major;
+	dev->gd->first_minor	= idx << RNBD_PART_BITS;
+	dev->gd->minors		= 1 << RNBD_PART_BITS;
+	dev->gd->fops		= &rnbd_client_ops;
+	dev->gd->queue		= dev->queue;
+	dev->gd->private_data	= dev;
+	snprintf(dev->gd->disk_name, sizeof(dev->gd->disk_name), "rnbd%d",
+		 idx);
+	pr_debug("disk_name=%s, capacity=%llu\n",
+		 dev->gd->disk_name,
+		 le64_to_cpu(rsp->nsectors) *
+		 (le16_to_cpu(rsp->logical_block_size) / SECTOR_SIZE));
+
+	set_capacity(dev->gd, le64_to_cpu(rsp->nsectors));
+
+	if (dev->access_mode == RNBD_ACCESS_RO)
+		set_disk_ro(dev->gd, true);
+
+	/*
+	 * Network device does not need rotational
+	 */
+	blk_queue_flag_set(QUEUE_FLAG_NONROT, dev->queue);
+	err = add_disk(dev->gd);
+	if (err)
+		put_disk(dev->gd);
+
+	return err;
+}
+
+static int rnbd_client_setup_device(struct rnbd_clt_dev *dev,
+				    struct rnbd_msg_open_rsp *rsp)
+{
+	int idx = dev->clt_device_id;
+
+	dev->size = le64_to_cpu(rsp->nsectors) *
+			le16_to_cpu(rsp->logical_block_size);
+
+	dev->gd = blk_mq_alloc_disk(&dev->sess->tag_set, dev);
+	if (IS_ERR(dev->gd))
+		return PTR_ERR(dev->gd);
+	dev->queue = dev->gd->queue;
+	rnbd_init_mq_hw_queues(dev);
+
+	setup_request_queue(dev, rsp);
+	return rnbd_clt_setup_gen_disk(dev, rsp, idx);
+}
+
+static struct rnbd_clt_dev *init_dev(struct rnbd_clt_session *sess,
+				      enum rnbd_access_mode access_mode,
+				      const char *pathname,
+				      u32 nr_poll_queues)
+{
+	struct rnbd_clt_dev *dev;
+	int ret;
+
+	dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, NUMA_NO_NODE);
+	if (!dev)
+		return ERR_PTR(-ENOMEM);
+
+	/*
+	 * nr_cpu_ids: the number of softirq queues
+	 * nr_poll_queues: the number of polling queues
+	 */
+	dev->hw_queues = kcalloc(nr_cpu_ids + nr_poll_queues,
+				 sizeof(*dev->hw_queues),
+				 GFP_KERNEL);
+	if (!dev->hw_queues) {
+		ret = -ENOMEM;
+		goto out_alloc;
+	}
+
+	ret = ida_alloc_max(&index_ida, (1 << (MINORBITS - RNBD_PART_BITS)) - 1,
+			    GFP_KERNEL);
+	if (ret < 0) {
+		pr_err("Failed to initialize device '%s' from session %s, allocating idr failed, err: %d\n",
+		       pathname, sess->sessname, ret);
+		goto out_queues;
+	}
+
+	dev->pathname = kstrdup(pathname, GFP_KERNEL);
+	if (!dev->pathname) {
+		ret = -ENOMEM;
+		goto out_queues;
+	}
+
+	dev->clt_device_id	= ret;
+	dev->sess		= sess;
+	dev->access_mode	= access_mode;
+	dev->nr_poll_queues	= nr_poll_queues;
+	mutex_init(&dev->lock);
+	refcount_set(&dev->refcount, 1);
+	dev->dev_state = DEV_STATE_INIT;
+
+	/*
+	 * Here we called from sysfs entry, thus clt-sysfs is
+	 * responsible that session will not disappear.
+	 */
+	WARN_ON(!rnbd_clt_get_sess(sess));
+
+	return dev;
+
+out_queues:
+	kfree(dev->hw_queues);
+out_alloc:
+	kfree(dev);
+	return ERR_PTR(ret);
+}
+
+static bool __exists_dev(const char *pathname, const char *sessname)
+{
+	struct rnbd_clt_session *sess;
+	struct rnbd_clt_dev *dev;
+	bool found = false;
+
+	list_for_each_entry(sess, &sess_list, list) {
+		if (sessname && strncmp(sess->sessname, sessname,
+					sizeof(sess->sessname)))
+			continue;
+		mutex_lock(&sess->lock);
+		list_for_each_entry(dev, &sess->devs_list, list) {
+			if (strlen(dev->pathname) == strlen(pathname) &&
+			    !strcmp(dev->pathname, pathname)) {
+				found = true;
+				break;
+			}
+		}
+		mutex_unlock(&sess->lock);
+		if (found)
+			break;
+	}
+
+	return found;
+}
+
+static bool exists_devpath(const char *pathname, const char *sessname)
+{
+	bool found;
+
+	mutex_lock(&sess_lock);
+	found = __exists_dev(pathname, sessname);
+	mutex_unlock(&sess_lock);
+
+	return found;
+}
+
+static bool insert_dev_if_not_exists_devpath(struct rnbd_clt_dev *dev)
+{
+	bool found;
+	struct rnbd_clt_session *sess = dev->sess;
+
+	mutex_lock(&sess_lock);
+	found = __exists_dev(dev->pathname, sess->sessname);
+	if (!found) {
+		mutex_lock(&sess->lock);
+		list_add_tail(&dev->list, &sess->devs_list);
+		mutex_unlock(&sess->lock);
+	}
+	mutex_unlock(&sess_lock);
+
+	return found;
+}
+
+static void delete_dev(struct rnbd_clt_dev *dev)
+{
+	struct rnbd_clt_session *sess = dev->sess;
+
+	mutex_lock(&sess->lock);
+	list_del(&dev->list);
+	mutex_unlock(&sess->lock);
+}
+
+struct rnbd_clt_dev *rnbd_clt_map_device(const char *sessname,
+					   struct rtrs_addr *paths,
+					   size_t path_cnt, u16 port_nr,
+					   const char *pathname,
+					   enum rnbd_access_mode access_mode,
+					   u32 nr_poll_queues)
+{
+	struct rnbd_clt_session *sess;
+	struct rnbd_clt_dev *dev;
+	int ret, errno;
+	struct rnbd_msg_open_rsp *rsp;
+	struct rnbd_msg_open msg;
+	struct rnbd_iu *iu;
+	struct kvec vec = {
+		.iov_base = &msg,
+		.iov_len  = sizeof(msg)
+	};
+
+	if (exists_devpath(pathname, sessname))
+		return ERR_PTR(-EEXIST);
+
+	sess = find_and_get_or_create_sess(sessname, paths, path_cnt, port_nr, nr_poll_queues);
+	if (IS_ERR(sess))
+		return ERR_CAST(sess);
+
+	dev = init_dev(sess, access_mode, pathname, nr_poll_queues);
+	if (IS_ERR(dev)) {
+		pr_err("map_device: failed to map device '%s' from session %s, can't initialize device, err: %ld\n",
+		       pathname, sess->sessname, PTR_ERR(dev));
+		ret = PTR_ERR(dev);
+		goto put_sess;
+	}
+	if (insert_dev_if_not_exists_devpath(dev)) {
+		ret = -EEXIST;
+		goto put_dev;
+	}
+
+	rsp = kzalloc(sizeof(*rsp), GFP_KERNEL);
+	if (!rsp) {
+		ret = -ENOMEM;
+		goto del_dev;
+	}
+
+	iu = rnbd_get_iu(sess, RTRS_ADMIN_CON, RTRS_PERMIT_WAIT);
+	if (!iu) {
+		ret = -ENOMEM;
+		kfree(rsp);
+		goto del_dev;
+	}
+	iu->buf = rsp;
+	iu->dev = dev;
+	sg_init_one(iu->sgt.sgl, rsp, sizeof(*rsp));
+
+	msg.hdr.type    = cpu_to_le16(RNBD_MSG_OPEN);
+	msg.access_mode = dev->access_mode;
+	strscpy(msg.dev_name, dev->pathname, sizeof(msg.dev_name));
+
+	WARN_ON(!rnbd_clt_get_dev(dev));
+	ret = send_usr_msg(sess->rtrs, READ, iu,
+			   &vec, sizeof(*rsp), iu->sgt.sgl, 1,
+			   msg_open_conf, &errno, RTRS_PERMIT_WAIT);
+	if (ret) {
+		rnbd_clt_put_dev(dev);
+		rnbd_put_iu(sess, iu);
+	} else {
+		ret = errno;
+	}
+	if (ret) {
+		rnbd_clt_err(dev,
+			      "map_device: failed, can't open remote device, err: %d\n",
+			      ret);
+		goto put_iu;
+	}
+	mutex_lock(&dev->lock);
+	pr_debug("Opened remote device: session=%s, path='%s'\n",
+		 sess->sessname, pathname);
+	ret = rnbd_client_setup_device(dev, rsp);
+	if (ret) {
+		rnbd_clt_err(dev,
+			      "map_device: Failed to configure device, err: %d\n",
+			      ret);
+		mutex_unlock(&dev->lock);
+		goto send_close;
+	}
+
+	rnbd_clt_info(dev,
+		       "map_device: Device mapped as %s (nsectors: %llu, logical_block_size: %d, physical_block_size: %d, max_discard_sectors: %d, discard_granularity: %d, discard_alignment: %d, secure_discard: %d, max_segments: %d, max_hw_sectors: %d, wc: %d, fua: %d)\n",
+		       dev->gd->disk_name, le64_to_cpu(rsp->nsectors),
+		       le16_to_cpu(rsp->logical_block_size),
+		       le16_to_cpu(rsp->physical_block_size),
+		       le32_to_cpu(rsp->max_discard_sectors),
+		       le32_to_cpu(rsp->discard_granularity),
+		       le32_to_cpu(rsp->discard_alignment),
+		       le16_to_cpu(rsp->secure_discard),
+		       sess->max_segments, sess->max_io_size / SECTOR_SIZE,
+		       !!(rsp->cache_policy & RNBD_WRITEBACK),
+		       !!(rsp->cache_policy & RNBD_FUA));
+
+	mutex_unlock(&dev->lock);
+	kfree(rsp);
+	rnbd_put_iu(sess, iu);
+	rnbd_clt_put_sess(sess);
+
+	return dev;
+
+send_close:
+	send_msg_close(dev, dev->device_id, RTRS_PERMIT_WAIT);
+put_iu:
+	kfree(rsp);
+	rnbd_put_iu(sess, iu);
+del_dev:
+	delete_dev(dev);
+put_dev:
+	rnbd_clt_put_dev(dev);
+put_sess:
+	rnbd_clt_put_sess(sess);
+
+	return ERR_PTR(ret);
+}
+
+static void destroy_gen_disk(struct rnbd_clt_dev *dev)
+{
+	del_gendisk(dev->gd);
+	put_disk(dev->gd);
+}
+
+static void destroy_sysfs(struct rnbd_clt_dev *dev,
+			  const struct attribute *sysfs_self)
+{
+	rnbd_clt_remove_dev_symlink(dev);
+	if (dev->kobj.state_initialized) {
+		if (sysfs_self)
+			/* To avoid deadlock firstly remove itself */
+			sysfs_remove_file_self(&dev->kobj, sysfs_self);
+		kobject_del(&dev->kobj);
+		kobject_put(&dev->kobj);
+	}
+}
+
+int rnbd_clt_unmap_device(struct rnbd_clt_dev *dev, bool force,
+			   const struct attribute *sysfs_self)
+{
+	struct rnbd_clt_session *sess = dev->sess;
+	int refcount, ret = 0;
+	bool was_mapped;
+
+	mutex_lock(&dev->lock);
+	if (dev->dev_state == DEV_STATE_UNMAPPED) {
+		rnbd_clt_info(dev, "Device is already being unmapped\n");
+		ret = -EALREADY;
+		goto err;
+	}
+	refcount = refcount_read(&dev->refcount);
+	if (!force && refcount > 1) {
+		rnbd_clt_err(dev,
+			      "Closing device failed, device is in use, (%d device users)\n",
+			      refcount - 1);
+		ret = -EBUSY;
+		goto err;
+	}
+	was_mapped = (dev->dev_state == DEV_STATE_MAPPED);
+	dev->dev_state = DEV_STATE_UNMAPPED;
+	mutex_unlock(&dev->lock);
+
+	delete_dev(dev);
+	destroy_sysfs(dev, sysfs_self);
+	destroy_gen_disk(dev);
+	if (was_mapped && sess->rtrs)
+		send_msg_close(dev, dev->device_id, RTRS_PERMIT_WAIT);
+
+	rnbd_clt_info(dev, "Device is unmapped\n");
+
+	/* Likely last reference put */
+	rnbd_clt_put_dev(dev);
+
+	/*
+	 * Here device and session can be vanished!
+	 */
+
+	return 0;
+err:
+	mutex_unlock(&dev->lock);
+
+	return ret;
+}
+
+int rnbd_clt_remap_device(struct rnbd_clt_dev *dev)
+{
+	int err;
+
+	mutex_lock(&dev->lock);
+	if (dev->dev_state == DEV_STATE_MAPPED_DISCONNECTED)
+		err = 0;
+	else if (dev->dev_state == DEV_STATE_UNMAPPED)
+		err = -ENODEV;
+	else if (dev->dev_state == DEV_STATE_MAPPED)
+		err = -EALREADY;
+	else
+		err = -EBUSY;
+	mutex_unlock(&dev->lock);
+	if (!err) {
+		rnbd_clt_info(dev, "Remapping device.\n");
+		err = send_msg_open(dev, RTRS_PERMIT_WAIT);
+		if (err)
+			rnbd_clt_err(dev, "remap_device: %d\n", err);
+	}
+
+	return err;
+}
+
+static void unmap_device_work(struct work_struct *work)
+{
+	struct rnbd_clt_dev *dev;
+
+	dev = container_of(work, typeof(*dev), unmap_on_rmmod_work);
+	rnbd_clt_unmap_device(dev, true, NULL);
+}
+
+static void rnbd_destroy_sessions(void)
+{
+	struct rnbd_clt_session *sess, *sn;
+	struct rnbd_clt_dev *dev, *tn;
+
+	/* Firstly forbid access through sysfs interface */
+	rnbd_clt_destroy_sysfs_files();
+
+	/*
+	 * Here at this point there is no any concurrent access to sessions
+	 * list and devices list:
+	 *   1. New session or device can't be created - session sysfs files
+	 *      are removed.
+	 *   2. Device or session can't be removed - module reference is taken
+	 *      into account in unmap device sysfs callback.
+	 *   3. No IO requests inflight - each file open of block_dev increases
+	 *      module reference in get_disk().
+	 *
+	 * But still there can be user requests inflights, which are sent by
+	 * asynchronous send_msg_*() functions, thus before unmapping devices
+	 * RTRS session must be explicitly closed.
+	 */
+
+	list_for_each_entry_safe(sess, sn, &sess_list, list) {
+		if (!rnbd_clt_get_sess(sess))
+			continue;
+		close_rtrs(sess);
+		list_for_each_entry_safe(dev, tn, &sess->devs_list, list) {
+			/*
+			 * Here unmap happens in parallel for only one reason:
+			 * del_gendisk() takes around half a second, so
+			 * on huge amount of devices the whole module unload
+			 * procedure takes minutes.
+			 */
+			INIT_WORK(&dev->unmap_on_rmmod_work, unmap_device_work);
+			queue_work(rnbd_clt_wq, &dev->unmap_on_rmmod_work);
+		}
+		rnbd_clt_put_sess(sess);
+	}
+	/* Wait for all scheduled unmap works */
+	flush_workqueue(rnbd_clt_wq);
+	WARN_ON(!list_empty(&sess_list));
+}
+
+static int __init rnbd_client_init(void)
+{
+	int err = 0;
+
+	BUILD_BUG_ON(sizeof(struct rnbd_msg_hdr) != 4);
+	BUILD_BUG_ON(sizeof(struct rnbd_msg_sess_info) != 36);
+	BUILD_BUG_ON(sizeof(struct rnbd_msg_sess_info_rsp) != 36);
+	BUILD_BUG_ON(sizeof(struct rnbd_msg_open) != 264);
+	BUILD_BUG_ON(sizeof(struct rnbd_msg_close) != 8);
+	BUILD_BUG_ON(sizeof(struct rnbd_msg_open_rsp) != 56);
+	rnbd_client_major = register_blkdev(rnbd_client_major, "rnbd");
+	if (rnbd_client_major <= 0) {
+		pr_err("Failed to load module, block device registration failed\n");
+		return -EBUSY;
+	}
+
+	err = rnbd_clt_create_sysfs_files();
+	if (err) {
+		pr_err("Failed to load module, creating sysfs device files failed, err: %d\n",
+		       err);
+		unregister_blkdev(rnbd_client_major, "rnbd");
+		return err;
+	}
+	rnbd_clt_wq = alloc_workqueue("rnbd_clt_wq", 0, 0);
+	if (!rnbd_clt_wq) {
+		pr_err("Failed to load module, alloc_workqueue failed.\n");
+		rnbd_clt_destroy_sysfs_files();
+		unregister_blkdev(rnbd_client_major, "rnbd");
+		err = -ENOMEM;
+	}
+
+	return err;
+}
+
+static void __exit rnbd_client_exit(void)
+{
+	rnbd_destroy_sessions();
+	unregister_blkdev(rnbd_client_major, "rnbd");
+	ida_destroy(&index_ida);
+	destroy_workqueue(rnbd_clt_wq);
+}
+
+module_init(rnbd_client_init);
+module_exit(rnbd_client_exit);
diff --git a/drivers/block/rnbd/rnbd-clt.h b/drivers/block/rnbd/rnbd-clt.h
new file mode 100644
index 000000000..a48e040ab
--- /dev/null
+++ b/drivers/block/rnbd/rnbd-clt.h
@@ -0,0 +1,151 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * RDMA Network Block Driver
+ *
+ * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved.
+ * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved.
+ * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved.
+ */
+
+#ifndef RNBD_CLT_H
+#define RNBD_CLT_H
+
+#include <linux/wait.h>
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/blk-mq.h>
+#include <linux/refcount.h>
+
+#include <rtrs.h>
+#include "rnbd-proto.h"
+#include "rnbd-log.h"
+
+/*  time in seconds between reconnect tries, default to 30 s */
+#define RECONNECT_DELAY 30
+/*
+ * Number of times to reconnect on error before giving up, 0 for * disabled,
+ * -1 for forever
+ */
+#define MAX_RECONNECTS -1
+
+enum rnbd_clt_dev_state {
+	DEV_STATE_INIT,
+	DEV_STATE_MAPPED,
+	DEV_STATE_MAPPED_DISCONNECTED,
+	DEV_STATE_UNMAPPED,
+};
+
+struct rnbd_iu_comp {
+	wait_queue_head_t wait;
+	int errno;
+};
+
+#ifdef CONFIG_ARCH_NO_SG_CHAIN
+#define RNBD_INLINE_SG_CNT 0
+#else
+#define RNBD_INLINE_SG_CNT 2
+#endif
+#define RNBD_RDMA_SGL_SIZE (sizeof(struct scatterlist) * RNBD_INLINE_SG_CNT)
+
+struct rnbd_iu {
+	union {
+		struct request *rq; /* for block io */
+		void *buf; /* for user messages */
+	};
+	struct rtrs_permit	*permit;
+	union {
+		/* use to send msg associated with a dev */
+		struct rnbd_clt_dev *dev;
+		/* use to send msg associated with a sess */
+		struct rnbd_clt_session *sess;
+	};
+	struct sg_table		sgt;
+	struct work_struct	work;
+	int			errno;
+	struct rnbd_iu_comp	comp;
+	atomic_t		refcount;
+	struct scatterlist	first_sgl[]; /* must be the last one */
+};
+
+struct rnbd_cpu_qlist {
+	struct list_head	requeue_list;
+	spinlock_t		requeue_lock;
+	unsigned int		cpu;
+};
+
+struct rnbd_clt_session {
+	struct list_head        list;
+	struct rtrs_clt_sess        *rtrs;
+	wait_queue_head_t       rtrs_waitq;
+	bool                    rtrs_ready;
+	struct rnbd_cpu_qlist	__percpu
+				*cpu_queues;
+	DECLARE_BITMAP(cpu_queues_bm, NR_CPUS);
+	int	__percpu	*cpu_rr; /* per-cpu var for CPU round-robin */
+	atomic_t		busy;
+	size_t			queue_depth;
+	u32			max_io_size;
+	u32			max_segments;
+	struct blk_mq_tag_set	tag_set;
+	u32			nr_poll_queues;
+	struct mutex		lock; /* protects state and devs_list */
+	struct list_head        devs_list; /* list of struct rnbd_clt_dev */
+	refcount_t		refcount;
+	char			sessname[NAME_MAX];
+	u8			ver; /* protocol version */
+};
+
+/**
+ * Submission queues.
+ */
+struct rnbd_queue {
+	struct list_head	requeue_list;
+	unsigned long		in_list;
+	struct rnbd_clt_dev	*dev;
+	struct blk_mq_hw_ctx	*hctx;
+};
+
+struct rnbd_clt_dev {
+	struct kobject		kobj;
+	struct rnbd_clt_session	*sess;
+	struct request_queue	*queue;
+	struct rnbd_queue	*hw_queues;
+	u32			device_id;
+	/* local Idr index - used to track minor number allocations. */
+	u32			clt_device_id;
+	struct mutex		lock;
+	enum rnbd_clt_dev_state	dev_state;
+	refcount_t		refcount;
+	char			*pathname;
+	enum rnbd_access_mode	access_mode;
+	u32			nr_poll_queues;
+	u64			size;		/* device size in bytes */
+	struct list_head        list;
+	struct gendisk		*gd;
+	char			*blk_symlink_name;
+	struct work_struct	unmap_on_rmmod_work;
+};
+
+/* rnbd-clt.c */
+
+struct rnbd_clt_dev *rnbd_clt_map_device(const char *sessname,
+					   struct rtrs_addr *paths,
+					   size_t path_cnt, u16 port_nr,
+					   const char *pathname,
+					   enum rnbd_access_mode access_mode,
+					   u32 nr_poll_queues);
+int rnbd_clt_unmap_device(struct rnbd_clt_dev *dev, bool force,
+			   const struct attribute *sysfs_self);
+
+int rnbd_clt_remap_device(struct rnbd_clt_dev *dev);
+int rnbd_clt_resize_disk(struct rnbd_clt_dev *dev, sector_t newsize);
+
+/* rnbd-clt-sysfs.c */
+
+int rnbd_clt_create_sysfs_files(void);
+
+void rnbd_clt_destroy_sysfs_files(void);
+
+void rnbd_clt_remove_dev_symlink(struct rnbd_clt_dev *dev);
+
+#endif /* RNBD_CLT_H */
diff --git a/drivers/block/rnbd/rnbd-common.c b/drivers/block/rnbd/rnbd-common.c
new file mode 100644
index 000000000..596c3f732
--- /dev/null
+++ b/drivers/block/rnbd/rnbd-common.c
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * RDMA Network Block Driver
+ *
+ * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved.
+ * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved.
+ * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved.
+ */
+#include "rnbd-proto.h"
+
+const char *rnbd_access_mode_str(enum rnbd_access_mode mode)
+{
+	switch (mode) {
+	case RNBD_ACCESS_RO:
+		return "ro";
+	case RNBD_ACCESS_RW:
+		return "rw";
+	case RNBD_ACCESS_MIGRATION:
+		return "migration";
+	default:
+		return "unknown";
+	}
+}
diff --git a/drivers/block/rnbd/rnbd-log.h b/drivers/block/rnbd/rnbd-log.h
new file mode 100644
index 000000000..136e7d6c3
--- /dev/null
+++ b/drivers/block/rnbd/rnbd-log.h
@@ -0,0 +1,41 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * RDMA Network Block Driver
+ *
+ * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved.
+ * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved.
+ * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved.
+ */
+#ifndef RNBD_LOG_H
+#define RNBD_LOG_H
+
+#include "rnbd-clt.h"
+#include "rnbd-srv.h"
+
+#define rnbd_clt_log(fn, dev, fmt, ...) (				\
+		fn("<%s@%s> " fmt, (dev)->pathname,			\
+		(dev)->sess->sessname,					\
+		   ##__VA_ARGS__))
+#define rnbd_srv_log(fn, dev, fmt, ...) (				\
+			fn("<%s@%s>: " fmt, (dev)->pathname,		\
+			   (dev)->sess->sessname, ##__VA_ARGS__))
+
+#define rnbd_clt_err(dev, fmt, ...)	\
+	rnbd_clt_log(pr_err, dev, fmt, ##__VA_ARGS__)
+#define rnbd_clt_err_rl(dev, fmt, ...)	\
+	rnbd_clt_log(pr_err_ratelimited, dev, fmt, ##__VA_ARGS__)
+#define rnbd_clt_info(dev, fmt, ...) \
+	rnbd_clt_log(pr_info, dev, fmt, ##__VA_ARGS__)
+#define rnbd_clt_info_rl(dev, fmt, ...) \
+	rnbd_clt_log(pr_info_ratelimited, dev, fmt, ##__VA_ARGS__)
+
+#define rnbd_srv_err(dev, fmt, ...)	\
+	rnbd_srv_log(pr_err, dev, fmt, ##__VA_ARGS__)
+#define rnbd_srv_err_rl(dev, fmt, ...)	\
+	rnbd_srv_log(pr_err_ratelimited, dev, fmt, ##__VA_ARGS__)
+#define rnbd_srv_info(dev, fmt, ...) \
+	rnbd_srv_log(pr_info, dev, fmt, ##__VA_ARGS__)
+#define rnbd_srv_info_rl(dev, fmt, ...) \
+	rnbd_srv_log(pr_info_ratelimited, dev, fmt, ##__VA_ARGS__)
+
+#endif /* RNBD_LOG_H */
diff --git a/drivers/block/rnbd/rnbd-proto.h b/drivers/block/rnbd/rnbd-proto.h
new file mode 100644
index 000000000..da1d0542d
--- /dev/null
+++ b/drivers/block/rnbd/rnbd-proto.h
@@ -0,0 +1,305 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * RDMA Network Block Driver
+ *
+ * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved.
+ * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved.
+ * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved.
+ */
+#ifndef RNBD_PROTO_H
+#define RNBD_PROTO_H
+
+#include <linux/types.h>
+#include <linux/blk-mq.h>
+#include <linux/limits.h>
+#include <linux/inet.h>
+#include <linux/in.h>
+#include <linux/in6.h>
+#include <rdma/ib.h>
+
+#define RNBD_PROTO_VER_MAJOR 2
+#define RNBD_PROTO_VER_MINOR 0
+
+/* The default port number the RTRS server is listening on. */
+#define RTRS_PORT 1234
+
+/**
+ * enum rnbd_msg_types - RNBD message types
+ * @RNBD_MSG_SESS_INFO:	initial session info from client to server
+ * @RNBD_MSG_SESS_INFO_RSP:	initial session info from server to client
+ * @RNBD_MSG_OPEN:		open (map) device request
+ * @RNBD_MSG_OPEN_RSP:		response to an @RNBD_MSG_OPEN
+ * @RNBD_MSG_IO:		block IO request operation
+ * @RNBD_MSG_CLOSE:		close (unmap) device request
+ */
+enum rnbd_msg_type {
+	RNBD_MSG_SESS_INFO,
+	RNBD_MSG_SESS_INFO_RSP,
+	RNBD_MSG_OPEN,
+	RNBD_MSG_OPEN_RSP,
+	RNBD_MSG_IO,
+	RNBD_MSG_CLOSE,
+};
+
+/**
+ * struct rnbd_msg_hdr - header of RNBD messages
+ * @type:	Message type, valid values see: enum rnbd_msg_types
+ */
+struct rnbd_msg_hdr {
+	__le16		type;
+	__le16		__padding;
+};
+
+/**
+ * We allow to map RO many times and RW only once. We allow to map yet another
+ * time RW, if MIGRATION is provided (second RW export can be required for
+ * example for VM migration)
+ */
+enum rnbd_access_mode {
+	RNBD_ACCESS_RO,
+	RNBD_ACCESS_RW,
+	RNBD_ACCESS_MIGRATION,
+};
+
+/**
+ * struct rnbd_msg_sess_info - initial session info from client to server
+ * @hdr:		message header
+ * @ver:		RNBD protocol version
+ */
+struct rnbd_msg_sess_info {
+	struct rnbd_msg_hdr hdr;
+	u8		ver;
+	u8		reserved[31];
+};
+
+/**
+ * struct rnbd_msg_sess_info_rsp - initial session info from server to client
+ * @hdr:		message header
+ * @ver:		RNBD protocol version
+ */
+struct rnbd_msg_sess_info_rsp {
+	struct rnbd_msg_hdr hdr;
+	u8		ver;
+	u8		reserved[31];
+};
+
+/**
+ * struct rnbd_msg_open - request to open a remote device.
+ * @hdr:		message header
+ * @access_mode:	the mode to open remote device, valid values see:
+ *			enum rnbd_access_mode
+ * @device_name:	device path on remote side
+ */
+struct rnbd_msg_open {
+	struct rnbd_msg_hdr hdr;
+	u8		access_mode;
+	u8		resv1;
+	s8		dev_name[NAME_MAX];
+	u8		reserved[3];
+};
+
+/**
+ * struct rnbd_msg_close - request to close a remote device.
+ * @hdr:	message header
+ * @device_id:	device_id on server side to identify the device
+ */
+struct rnbd_msg_close {
+	struct rnbd_msg_hdr hdr;
+	__le32		device_id;
+};
+
+enum rnbd_cache_policy {
+	RNBD_FUA = 1 << 0,
+	RNBD_WRITEBACK = 1 << 1,
+};
+
+/**
+ * struct rnbd_msg_open_rsp - response message to RNBD_MSG_OPEN
+ * @hdr:		message header
+ * @device_id:		device_id on server side to identify the device
+ * @nsectors:		number of sectors in the usual 512b unit
+ * @max_hw_sectors:	max hardware sectors in the usual 512b unit
+ * @max_write_same_sectors: max sectors for WRITE SAME in the 512b unit
+ * @max_discard_sectors: max. sectors that can be discarded at once in 512b
+ * unit.
+ * @discard_granularity: size of the internal discard allocation unit in bytes
+ * @discard_alignment: offset from internal allocation assignment in bytes
+ * @physical_block_size: physical block size device supports in bytes
+ * @logical_block_size: logical block size device supports in bytes
+ * @max_segments:	max segments hardware support in one transfer
+ * @secure_discard:	supports secure discard
+ * @obsolete_rotational: obsolete, not in used.
+ * @cache_policy: 	support write-back caching or FUA?
+ */
+struct rnbd_msg_open_rsp {
+	struct rnbd_msg_hdr	hdr;
+	__le32			device_id;
+	__le64			nsectors;
+	__le32			max_hw_sectors;
+	__le32			max_write_same_sectors;
+	__le32			max_discard_sectors;
+	__le32			discard_granularity;
+	__le32			discard_alignment;
+	__le16			physical_block_size;
+	__le16			logical_block_size;
+	__le16			max_segments;
+	__le16			secure_discard;
+	u8			obsolete_rotational;
+	u8			cache_policy;
+	u8			reserved[10];
+};
+
+/**
+ * struct rnbd_msg_io - message for I/O read/write
+ * @hdr:	message header
+ * @device_id:	device_id on server side to find the right device
+ * @sector:	bi_sector attribute from struct bio
+ * @rw:		valid values are defined in enum rnbd_io_flags
+ * @bi_size:    number of bytes for I/O read/write
+ * @prio:       priority
+ */
+struct rnbd_msg_io {
+	struct rnbd_msg_hdr hdr;
+	__le32		device_id;
+	__le64		sector;
+	__le32		rw;
+	__le32		bi_size;
+	__le16		prio;
+};
+
+#define RNBD_OP_BITS  8
+#define RNBD_OP_MASK  ((1 << RNBD_OP_BITS) - 1)
+
+/**
+ * enum rnbd_io_flags - RNBD request types from rq_flag_bits
+ * @RNBD_OP_READ:	     read sectors from the device
+ * @RNBD_OP_WRITE:	     write sectors to the device
+ * @RNBD_OP_FLUSH:	     flush the volatile write cache
+ * @RNBD_OP_DISCARD:        discard sectors
+ * @RNBD_OP_SECURE_ERASE:   securely erase sectors
+ * @RNBD_OP_WRITE_SAME:     write the same sectors many times
+
+ * @RNBD_F_SYNC:	     request is sync (sync write or read)
+ * @RNBD_F_FUA:             forced unit access
+ */
+enum rnbd_io_flags {
+
+	/* Operations */
+
+	RNBD_OP_READ		= 0,
+	RNBD_OP_WRITE		= 1,
+	RNBD_OP_FLUSH		= 2,
+	RNBD_OP_DISCARD	= 3,
+	RNBD_OP_SECURE_ERASE	= 4,
+	RNBD_OP_WRITE_SAME	= 5,
+
+	RNBD_OP_LAST,
+
+	/* Flags */
+
+	RNBD_F_SYNC  = 1<<(RNBD_OP_BITS + 0),
+	RNBD_F_FUA   = 1<<(RNBD_OP_BITS + 1),
+
+	RNBD_F_ALL   = (RNBD_F_SYNC | RNBD_F_FUA)
+
+};
+
+static inline u32 rnbd_op(u32 flags)
+{
+	return flags & RNBD_OP_MASK;
+}
+
+static inline u32 rnbd_flags(u32 flags)
+{
+	return flags & ~RNBD_OP_MASK;
+}
+
+static inline bool rnbd_flags_supported(u32 flags)
+{
+	u32 op;
+
+	op = rnbd_op(flags);
+	flags = rnbd_flags(flags);
+
+	if (op >= RNBD_OP_LAST)
+		return false;
+	if (flags & ~RNBD_F_ALL)
+		return false;
+
+	return true;
+}
+
+static inline blk_opf_t rnbd_to_bio_flags(u32 rnbd_opf)
+{
+	blk_opf_t bio_opf;
+
+	switch (rnbd_op(rnbd_opf)) {
+	case RNBD_OP_READ:
+		bio_opf = REQ_OP_READ;
+		break;
+	case RNBD_OP_WRITE:
+		bio_opf = REQ_OP_WRITE;
+		break;
+	case RNBD_OP_FLUSH:
+		bio_opf = REQ_OP_WRITE | REQ_PREFLUSH;
+		break;
+	case RNBD_OP_DISCARD:
+		bio_opf = REQ_OP_DISCARD;
+		break;
+	case RNBD_OP_SECURE_ERASE:
+		bio_opf = REQ_OP_SECURE_ERASE;
+		break;
+	default:
+		WARN(1, "Unknown RNBD type: %d (flags %d)\n",
+		     rnbd_op(rnbd_opf), rnbd_opf);
+		bio_opf = 0;
+	}
+
+	if (rnbd_opf & RNBD_F_SYNC)
+		bio_opf |= REQ_SYNC;
+
+	if (rnbd_opf & RNBD_F_FUA)
+		bio_opf |= REQ_FUA;
+
+	return bio_opf;
+}
+
+static inline u32 rq_to_rnbd_flags(struct request *rq)
+{
+	u32 rnbd_opf;
+
+	switch (req_op(rq)) {
+	case REQ_OP_READ:
+		rnbd_opf = RNBD_OP_READ;
+		break;
+	case REQ_OP_WRITE:
+		rnbd_opf = RNBD_OP_WRITE;
+		break;
+	case REQ_OP_DISCARD:
+		rnbd_opf = RNBD_OP_DISCARD;
+		break;
+	case REQ_OP_SECURE_ERASE:
+		rnbd_opf = RNBD_OP_SECURE_ERASE;
+		break;
+	case REQ_OP_FLUSH:
+		rnbd_opf = RNBD_OP_FLUSH;
+		break;
+	default:
+		WARN(1, "Unknown request type %d (flags %llu)\n",
+		     (__force u32)req_op(rq),
+		     (__force unsigned long long)rq->cmd_flags);
+		rnbd_opf = 0;
+	}
+
+	if (op_is_sync(rq->cmd_flags))
+		rnbd_opf |= RNBD_F_SYNC;
+
+	if (op_is_flush(rq->cmd_flags))
+		rnbd_opf |= RNBD_F_FUA;
+
+	return rnbd_opf;
+}
+
+const char *rnbd_access_mode_str(enum rnbd_access_mode mode);
+
+#endif /* RNBD_PROTO_H */
diff --git a/drivers/block/rnbd/rnbd-srv-sysfs.c b/drivers/block/rnbd/rnbd-srv-sysfs.c
new file mode 100644
index 000000000..297a6924f
--- /dev/null
+++ b/drivers/block/rnbd/rnbd-srv-sysfs.c
@@ -0,0 +1,250 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * RDMA Network Block Driver
+ *
+ * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved.
+ * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved.
+ * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved.
+ */
+#undef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt
+
+#include <uapi/linux/limits.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include <linux/stat.h>
+#include <linux/list.h>
+#include <linux/moduleparam.h>
+#include <linux/device.h>
+
+#include "rnbd-srv.h"
+
+static struct device *rnbd_dev;
+static struct class *rnbd_dev_class;
+static struct kobject *rnbd_devs_kobj;
+
+static void rnbd_srv_dev_release(struct kobject *kobj)
+{
+	struct rnbd_srv_dev *dev;
+
+	dev = container_of(kobj, struct rnbd_srv_dev, dev_kobj);
+
+	kfree(dev);
+}
+
+static struct kobj_type dev_ktype = {
+	.sysfs_ops = &kobj_sysfs_ops,
+	.release = rnbd_srv_dev_release
+};
+
+int rnbd_srv_create_dev_sysfs(struct rnbd_srv_dev *dev,
+			       struct block_device *bdev)
+{
+	struct kobject *bdev_kobj;
+	int ret;
+
+	ret = kobject_init_and_add(&dev->dev_kobj, &dev_ktype,
+				   rnbd_devs_kobj, "%pg", bdev);
+	if (ret) {
+		kobject_put(&dev->dev_kobj);
+		return ret;
+	}
+
+	dev->dev_sessions_kobj = kobject_create_and_add("sessions",
+							&dev->dev_kobj);
+	if (!dev->dev_sessions_kobj) {
+		ret = -ENOMEM;
+		goto free_dev_kobj;
+	}
+
+	bdev_kobj = &disk_to_dev(bdev->bd_disk)->kobj;
+	ret = sysfs_create_link(&dev->dev_kobj, bdev_kobj, "block_dev");
+	if (ret)
+		goto put_sess_kobj;
+
+	return 0;
+
+put_sess_kobj:
+	kobject_put(dev->dev_sessions_kobj);
+free_dev_kobj:
+	kobject_del(&dev->dev_kobj);
+	kobject_put(&dev->dev_kobj);
+	return ret;
+}
+
+void rnbd_srv_destroy_dev_sysfs(struct rnbd_srv_dev *dev)
+{
+	sysfs_remove_link(&dev->dev_kobj, "block_dev");
+	kobject_del(dev->dev_sessions_kobj);
+	kobject_put(dev->dev_sessions_kobj);
+	kobject_del(&dev->dev_kobj);
+	kobject_put(&dev->dev_kobj);
+}
+
+static ssize_t read_only_show(struct kobject *kobj, struct kobj_attribute *attr,
+			      char *page)
+{
+	struct rnbd_srv_sess_dev *sess_dev;
+
+	sess_dev = container_of(kobj, struct rnbd_srv_sess_dev, kobj);
+
+	return sysfs_emit(page, "%d\n",
+			  !(sess_dev->open_flags & FMODE_WRITE));
+}
+
+static struct kobj_attribute rnbd_srv_dev_session_ro_attr =
+	__ATTR_RO(read_only);
+
+static ssize_t access_mode_show(struct kobject *kobj,
+				struct kobj_attribute *attr,
+				char *page)
+{
+	struct rnbd_srv_sess_dev *sess_dev;
+
+	sess_dev = container_of(kobj, struct rnbd_srv_sess_dev, kobj);
+
+	return sysfs_emit(page, "%s\n",
+			  rnbd_access_mode_str(sess_dev->access_mode));
+}
+
+static struct kobj_attribute rnbd_srv_dev_session_access_mode_attr =
+	__ATTR_RO(access_mode);
+
+static ssize_t mapping_path_show(struct kobject *kobj,
+				 struct kobj_attribute *attr, char *page)
+{
+	struct rnbd_srv_sess_dev *sess_dev;
+
+	sess_dev = container_of(kobj, struct rnbd_srv_sess_dev, kobj);
+
+	return sysfs_emit(page, "%s\n", sess_dev->pathname);
+}
+
+static struct kobj_attribute rnbd_srv_dev_session_mapping_path_attr =
+	__ATTR_RO(mapping_path);
+
+static ssize_t rnbd_srv_dev_session_force_close_show(struct kobject *kobj,
+					struct kobj_attribute *attr, char *page)
+{
+	return sysfs_emit(page, "Usage: echo 1 > %s\n",
+			  attr->attr.name);
+}
+
+static ssize_t rnbd_srv_dev_session_force_close_store(struct kobject *kobj,
+					struct kobj_attribute *attr,
+					const char *buf, size_t count)
+{
+	struct rnbd_srv_sess_dev *sess_dev;
+
+	sess_dev = container_of(kobj, struct rnbd_srv_sess_dev, kobj);
+
+	if (!sysfs_streq(buf, "1")) {
+		rnbd_srv_err(sess_dev, "%s: invalid value: '%s'\n",
+			      attr->attr.name, buf);
+		return -EINVAL;
+	}
+
+	rnbd_srv_info(sess_dev, "force close requested\n");
+	rnbd_srv_sess_dev_force_close(sess_dev, attr);
+
+	return count;
+}
+
+static struct kobj_attribute rnbd_srv_dev_session_force_close_attr =
+	__ATTR(force_close, 0644,
+	       rnbd_srv_dev_session_force_close_show,
+	       rnbd_srv_dev_session_force_close_store);
+
+static struct attribute *rnbd_srv_default_dev_sessions_attrs[] = {
+	&rnbd_srv_dev_session_access_mode_attr.attr,
+	&rnbd_srv_dev_session_ro_attr.attr,
+	&rnbd_srv_dev_session_mapping_path_attr.attr,
+	&rnbd_srv_dev_session_force_close_attr.attr,
+	NULL,
+};
+
+static struct attribute_group rnbd_srv_default_dev_session_attr_group = {
+	.attrs = rnbd_srv_default_dev_sessions_attrs,
+};
+
+void rnbd_srv_destroy_dev_session_sysfs(struct rnbd_srv_sess_dev *sess_dev)
+{
+	sysfs_remove_group(&sess_dev->kobj,
+			   &rnbd_srv_default_dev_session_attr_group);
+
+	kobject_del(&sess_dev->kobj);
+	kobject_put(&sess_dev->kobj);
+}
+
+static void rnbd_srv_sess_dev_release(struct kobject *kobj)
+{
+	struct rnbd_srv_sess_dev *sess_dev;
+
+	sess_dev = container_of(kobj, struct rnbd_srv_sess_dev, kobj);
+	rnbd_destroy_sess_dev(sess_dev, sess_dev->keep_id);
+}
+
+static struct kobj_type rnbd_srv_sess_dev_ktype = {
+	.sysfs_ops	= &kobj_sysfs_ops,
+	.release	= rnbd_srv_sess_dev_release,
+};
+
+int rnbd_srv_create_dev_session_sysfs(struct rnbd_srv_sess_dev *sess_dev)
+{
+	int ret;
+
+	ret = kobject_init_and_add(&sess_dev->kobj, &rnbd_srv_sess_dev_ktype,
+				   sess_dev->dev->dev_sessions_kobj, "%s",
+				   sess_dev->sess->sessname);
+	if (ret) {
+		kobject_put(&sess_dev->kobj);
+		return ret;
+	}
+
+	ret = sysfs_create_group(&sess_dev->kobj,
+				 &rnbd_srv_default_dev_session_attr_group);
+	if (ret) {
+		kobject_del(&sess_dev->kobj);
+		kobject_put(&sess_dev->kobj);
+	}
+
+	return ret;
+}
+
+int rnbd_srv_create_sysfs_files(void)
+{
+	int err;
+
+	rnbd_dev_class = class_create(THIS_MODULE, "rnbd-server");
+	if (IS_ERR(rnbd_dev_class))
+		return PTR_ERR(rnbd_dev_class);
+
+	rnbd_dev = device_create(rnbd_dev_class, NULL,
+				  MKDEV(0, 0), NULL, "ctl");
+	if (IS_ERR(rnbd_dev)) {
+		err = PTR_ERR(rnbd_dev);
+		goto cls_destroy;
+	}
+	rnbd_devs_kobj = kobject_create_and_add("devices", &rnbd_dev->kobj);
+	if (!rnbd_devs_kobj) {
+		err = -ENOMEM;
+		goto dev_destroy;
+	}
+
+	return 0;
+
+dev_destroy:
+	device_destroy(rnbd_dev_class, MKDEV(0, 0));
+cls_destroy:
+	class_destroy(rnbd_dev_class);
+
+	return err;
+}
+
+void rnbd_srv_destroy_sysfs_files(void)
+{
+	kobject_del(rnbd_devs_kobj);
+	kobject_put(rnbd_devs_kobj);
+	device_destroy(rnbd_dev_class, MKDEV(0, 0));
+	class_destroy(rnbd_dev_class);
+}
diff --git a/drivers/block/rnbd/rnbd-srv-trace.c b/drivers/block/rnbd/rnbd-srv-trace.c
new file mode 100644
index 000000000..30f0895c1
--- /dev/null
+++ b/drivers/block/rnbd/rnbd-srv-trace.c
@@ -0,0 +1,17 @@
+// SPDX-License-Identifier: GPL-2.0+
+/*
+ * RDMA Network Block Driver
+ *
+ * Copyright (c) 2022 1&1 IONOS SE. All rights reserved.
+ */
+#include "rtrs.h"
+#include "rtrs-srv.h"
+#include "rnbd-srv.h"
+#include "rnbd-proto.h"
+
+/*
+ * We include this last to have the helpers above available for the trace
+ * event implementations.
+ */
+#define CREATE_TRACE_POINTS
+#include "rnbd-srv-trace.h"
diff --git a/drivers/block/rnbd/rnbd-srv-trace.h b/drivers/block/rnbd/rnbd-srv-trace.h
new file mode 100644
index 000000000..8dedf73bd
--- /dev/null
+++ b/drivers/block/rnbd/rnbd-srv-trace.h
@@ -0,0 +1,207 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * RDMA Network Block Driver
+ *
+ * Copyright (c) 2022 1&1 IONOS SE. All rights reserved.
+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM rnbd_srv
+
+#if !defined(_TRACE_RNBD_SRV_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_RNBD_SRV_H
+
+#include <linux/tracepoint.h>
+
+struct rnbd_srv_session;
+struct rtrs_srv_op;
+
+DECLARE_EVENT_CLASS(rnbd_srv_link_class,
+	TP_PROTO(struct rnbd_srv_session *srv),
+
+	TP_ARGS(srv),
+
+	TP_STRUCT__entry(
+		__field(int, qdepth)
+		__string(sessname, srv->sessname)
+	),
+
+	TP_fast_assign(
+		__entry->qdepth = srv->queue_depth;
+		__assign_str(sessname, srv->sessname);
+	),
+
+	TP_printk("sessname: %s qdepth: %d",
+		   __get_str(sessname),
+		   __entry->qdepth
+	)
+);
+
+#define DEFINE_LINK_EVENT(name) \
+DEFINE_EVENT(rnbd_srv_link_class, name, \
+	TP_PROTO(struct rnbd_srv_session *srv), \
+	TP_ARGS(srv))
+
+DEFINE_LINK_EVENT(create_sess);
+DEFINE_LINK_EVENT(destroy_sess);
+
+TRACE_DEFINE_ENUM(RNBD_OP_READ);
+TRACE_DEFINE_ENUM(RNBD_OP_WRITE);
+TRACE_DEFINE_ENUM(RNBD_OP_FLUSH);
+TRACE_DEFINE_ENUM(RNBD_OP_DISCARD);
+TRACE_DEFINE_ENUM(RNBD_OP_SECURE_ERASE);
+TRACE_DEFINE_ENUM(RNBD_F_SYNC);
+TRACE_DEFINE_ENUM(RNBD_F_FUA);
+
+#define show_rnbd_rw_flags(x) \
+	__print_flags(x, "|", \
+		{ RNBD_OP_READ,		"READ" }, \
+		{ RNBD_OP_WRITE,	"WRITE" }, \
+		{ RNBD_OP_FLUSH,	"FLUSH" }, \
+		{ RNBD_OP_DISCARD,	"DISCARD" }, \
+		{ RNBD_OP_SECURE_ERASE,	"SECURE_ERASE" }, \
+		{ RNBD_F_SYNC,		"SYNC" }, \
+		{ RNBD_F_FUA,		"FUA" })
+
+TRACE_EVENT(process_rdma,
+	TP_PROTO(struct rnbd_srv_session *srv,
+		 const struct rnbd_msg_io *msg,
+		 struct rtrs_srv_op *id,
+		 u32 datalen,
+		 size_t usrlen),
+
+	TP_ARGS(srv, msg, id, datalen, usrlen),
+
+	TP_STRUCT__entry(
+		__string(sessname, srv->sessname)
+		__field(u8, dir)
+		__field(u8, ver)
+		__field(u32, device_id)
+		__field(u64, sector)
+		__field(u32, flags)
+		__field(u32, bi_size)
+		__field(u16, ioprio)
+		__field(u32, datalen)
+		__field(size_t, usrlen)
+	),
+
+	TP_fast_assign(
+		__assign_str(sessname, srv->sessname);
+		__entry->dir = id->dir;
+		__entry->ver = srv->ver;
+		__entry->device_id = le32_to_cpu(msg->device_id);
+		__entry->sector = le64_to_cpu(msg->sector);
+		__entry->bi_size = le32_to_cpu(msg->bi_size);
+		__entry->flags = le32_to_cpu(msg->rw);
+		__entry->ioprio = le16_to_cpu(msg->prio);
+		__entry->datalen = datalen;
+		__entry->usrlen = usrlen;
+	),
+
+	TP_printk("I/O req: sess: %s, type: %s, ver: %d, devid: %u, sector: %llu, bsize: %u, flags: %s, ioprio: %d, datalen: %u, usrlen: %zu",
+		   __get_str(sessname),
+		   __print_symbolic(__entry->dir,
+			 { READ,  "READ" },
+			 { WRITE, "WRITE" }),
+		   __entry->ver,
+		   __entry->device_id,
+		   __entry->sector,
+		   __entry->bi_size,
+		   show_rnbd_rw_flags(__entry->flags),
+		   __entry->ioprio,
+		   __entry->datalen,
+		   __entry->usrlen
+	)
+);
+
+TRACE_EVENT(process_msg_sess_info,
+	TP_PROTO(struct rnbd_srv_session *srv,
+		 const struct rnbd_msg_sess_info *msg),
+
+	TP_ARGS(srv, msg),
+
+	TP_STRUCT__entry(
+		__field(u8, proto_ver)
+		__field(u8, clt_ver)
+		__field(u8, srv_ver)
+		__string(sessname, srv->sessname)
+	),
+
+	TP_fast_assign(
+		__entry->proto_ver = srv->ver;
+		__entry->clt_ver = msg->ver;
+		__entry->srv_ver = RNBD_PROTO_VER_MAJOR;
+		__assign_str(sessname, srv->sessname);
+	),
+
+	TP_printk("Session %s using proto-ver %d (clt-ver: %d, srv-ver: %d)",
+		   __get_str(sessname),
+		   __entry->proto_ver,
+		   __entry->clt_ver,
+		   __entry->srv_ver
+	)
+);
+
+TRACE_DEFINE_ENUM(RNBD_ACCESS_RO);
+TRACE_DEFINE_ENUM(RNBD_ACCESS_RW);
+TRACE_DEFINE_ENUM(RNBD_ACCESS_MIGRATION);
+
+#define show_rnbd_access_mode(x) \
+	__print_symbolic(x, \
+		{ RNBD_ACCESS_RO,		"RO" }, \
+		{ RNBD_ACCESS_RW,		"RW" }, \
+		{ RNBD_ACCESS_MIGRATION,	"MIGRATION" })
+
+TRACE_EVENT(process_msg_open,
+	TP_PROTO(struct rnbd_srv_session *srv,
+		 const struct rnbd_msg_open *msg),
+
+	TP_ARGS(srv, msg),
+
+	TP_STRUCT__entry(
+		__field(u8, access_mode)
+		__string(sessname, srv->sessname)
+		__string(dev_name, msg->dev_name)
+	),
+
+	TP_fast_assign(
+		__entry->access_mode = msg->access_mode;
+		__assign_str(sessname, srv->sessname);
+		__assign_str(dev_name, msg->dev_name);
+	),
+
+	TP_printk("Open message received: session='%s' path='%s' access_mode=%s",
+		   __get_str(sessname),
+		   __get_str(dev_name),
+		   show_rnbd_access_mode(__entry->access_mode)
+	)
+);
+
+TRACE_EVENT(process_msg_close,
+	TP_PROTO(struct rnbd_srv_session *srv,
+		 const struct rnbd_msg_close *msg),
+
+	TP_ARGS(srv, msg),
+
+	TP_STRUCT__entry(
+		__field(u32, device_id)
+		__string(sessname, srv->sessname)
+	),
+
+	TP_fast_assign(
+		__entry->device_id = le32_to_cpu(msg->device_id);
+		__assign_str(sessname, srv->sessname);
+	),
+
+	TP_printk("Close message received: session='%s' device id='%d'",
+		   __get_str(sessname),
+		   __entry->device_id
+	)
+);
+
+#endif /* _TRACE_RNBD_SRV_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+#define TRACE_INCLUDE_FILE rnbd-srv-trace
+#include <trace/define_trace.h>
+
diff --git a/drivers/block/rnbd/rnbd-srv.c b/drivers/block/rnbd/rnbd-srv.c
new file mode 100644
index 000000000..2cfed2e58
--- /dev/null
+++ b/drivers/block/rnbd/rnbd-srv.c
@@ -0,0 +1,848 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * RDMA Network Block Driver
+ *
+ * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved.
+ * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved.
+ * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved.
+ */
+#undef pr_fmt
+#define pr_fmt(fmt) KBUILD_MODNAME " L" __stringify(__LINE__) ": " fmt
+
+#include <linux/module.h>
+#include <linux/blkdev.h>
+
+#include "rnbd-srv.h"
+#include "rnbd-srv-trace.h"
+
+MODULE_DESCRIPTION("RDMA Network Block Device Server");
+MODULE_LICENSE("GPL");
+
+static u16 port_nr = RTRS_PORT;
+
+module_param_named(port_nr, port_nr, ushort, 0444);
+MODULE_PARM_DESC(port_nr,
+		 "The port number the server is listening on (default: "
+		 __stringify(RTRS_PORT)")");
+
+#define DEFAULT_DEV_SEARCH_PATH "/"
+
+static char dev_search_path[PATH_MAX] = DEFAULT_DEV_SEARCH_PATH;
+
+static int dev_search_path_set(const char *val, const struct kernel_param *kp)
+{
+	const char *p = strrchr(val, '\n') ? : val + strlen(val);
+
+	if (strlen(val) >= sizeof(dev_search_path))
+		return -EINVAL;
+
+	snprintf(dev_search_path, sizeof(dev_search_path), "%.*s",
+		 (int)(p - val), val);
+
+	pr_info("dev_search_path changed to '%s'\n", dev_search_path);
+
+	return 0;
+}
+
+static struct kparam_string dev_search_path_kparam_str = {
+	.maxlen	= sizeof(dev_search_path),
+	.string	= dev_search_path
+};
+
+static const struct kernel_param_ops dev_search_path_ops = {
+	.set	= dev_search_path_set,
+	.get	= param_get_string,
+};
+
+module_param_cb(dev_search_path, &dev_search_path_ops,
+		&dev_search_path_kparam_str, 0444);
+MODULE_PARM_DESC(dev_search_path,
+		 "Sets the dev_search_path. When a device is mapped this path is prepended to the device path from the map device operation.  If %SESSNAME% is specified in a path, then device will be searched in a session namespace. (default: "
+		 DEFAULT_DEV_SEARCH_PATH ")");
+
+static DEFINE_MUTEX(sess_lock);
+static DEFINE_SPINLOCK(dev_lock);
+
+static LIST_HEAD(sess_list);
+static LIST_HEAD(dev_list);
+
+struct rnbd_io_private {
+	struct rtrs_srv_op		*id;
+	struct rnbd_srv_sess_dev	*sess_dev;
+};
+
+static void rnbd_sess_dev_release(struct kref *kref)
+{
+	struct rnbd_srv_sess_dev *sess_dev;
+
+	sess_dev = container_of(kref, struct rnbd_srv_sess_dev, kref);
+	complete(sess_dev->destroy_comp);
+}
+
+static inline void rnbd_put_sess_dev(struct rnbd_srv_sess_dev *sess_dev)
+{
+	kref_put(&sess_dev->kref, rnbd_sess_dev_release);
+}
+
+static struct rnbd_srv_sess_dev *
+rnbd_get_sess_dev(int dev_id, struct rnbd_srv_session *srv_sess)
+{
+	struct rnbd_srv_sess_dev *sess_dev;
+	int ret = 0;
+
+	rcu_read_lock();
+	sess_dev = xa_load(&srv_sess->index_idr, dev_id);
+	if (sess_dev)
+		ret = kref_get_unless_zero(&sess_dev->kref);
+	rcu_read_unlock();
+
+	if (!sess_dev || !ret)
+		return ERR_PTR(-ENXIO);
+
+	return sess_dev;
+}
+
+static void rnbd_dev_bi_end_io(struct bio *bio)
+{
+	struct rnbd_io_private *rnbd_priv = bio->bi_private;
+	struct rnbd_srv_sess_dev *sess_dev = rnbd_priv->sess_dev;
+
+	rnbd_put_sess_dev(sess_dev);
+	rtrs_srv_resp_rdma(rnbd_priv->id, blk_status_to_errno(bio->bi_status));
+
+	kfree(rnbd_priv);
+	bio_put(bio);
+}
+
+static int process_rdma(struct rnbd_srv_session *srv_sess,
+			struct rtrs_srv_op *id, void *data, u32 datalen,
+			const void *usr, size_t usrlen)
+{
+	const struct rnbd_msg_io *msg = usr;
+	struct rnbd_io_private *priv;
+	struct rnbd_srv_sess_dev *sess_dev;
+	u32 dev_id;
+	int err;
+	struct bio *bio;
+	short prio;
+
+	trace_process_rdma(srv_sess, msg, id, datalen, usrlen);
+
+	priv = kmalloc(sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+
+	dev_id = le32_to_cpu(msg->device_id);
+
+	sess_dev = rnbd_get_sess_dev(dev_id, srv_sess);
+	if (IS_ERR(sess_dev)) {
+		pr_err_ratelimited("Got I/O request on session %s for unknown device id %d\n",
+				   srv_sess->sessname, dev_id);
+		err = -ENOTCONN;
+		goto err;
+	}
+
+	priv->sess_dev = sess_dev;
+	priv->id = id;
+
+	bio = bio_alloc(sess_dev->bdev, 1,
+			rnbd_to_bio_flags(le32_to_cpu(msg->rw)), GFP_KERNEL);
+	if (bio_add_page(bio, virt_to_page(data), datalen,
+			offset_in_page(data)) != datalen) {
+		rnbd_srv_err(sess_dev, "Failed to map data to bio\n");
+		err = -EINVAL;
+		goto bio_put;
+	}
+
+	bio->bi_end_io = rnbd_dev_bi_end_io;
+	bio->bi_private = priv;
+	bio->bi_iter.bi_sector = le64_to_cpu(msg->sector);
+	bio->bi_iter.bi_size = le32_to_cpu(msg->bi_size);
+	prio = srv_sess->ver < RNBD_PROTO_VER_MAJOR ||
+	       usrlen < sizeof(*msg) ? 0 : le16_to_cpu(msg->prio);
+	bio_set_prio(bio, prio);
+
+	submit_bio(bio);
+
+	return 0;
+
+bio_put:
+	bio_put(bio);
+	rnbd_put_sess_dev(sess_dev);
+err:
+	kfree(priv);
+	return err;
+}
+
+static void destroy_device(struct kref *kref)
+{
+	struct rnbd_srv_dev *dev = container_of(kref, struct rnbd_srv_dev, kref);
+
+	WARN_ONCE(!list_empty(&dev->sess_dev_list),
+		  "Device %s is being destroyed but still in use!\n",
+		  dev->id);
+
+	spin_lock(&dev_lock);
+	list_del(&dev->list);
+	spin_unlock(&dev_lock);
+
+	mutex_destroy(&dev->lock);
+	if (dev->dev_kobj.state_in_sysfs)
+		/*
+		 * Destroy kobj only if it was really created.
+		 */
+		rnbd_srv_destroy_dev_sysfs(dev);
+	else
+		kfree(dev);
+}
+
+static void rnbd_put_srv_dev(struct rnbd_srv_dev *dev)
+{
+	kref_put(&dev->kref, destroy_device);
+}
+
+void rnbd_destroy_sess_dev(struct rnbd_srv_sess_dev *sess_dev, bool keep_id)
+{
+	DECLARE_COMPLETION_ONSTACK(dc);
+
+	if (keep_id)
+		/* free the resources for the id but don't  */
+		/* allow to re-use the id itself because it */
+		/* is still used by the client              */
+		xa_cmpxchg(&sess_dev->sess->index_idr, sess_dev->device_id,
+			   sess_dev, NULL, 0);
+	else
+		xa_erase(&sess_dev->sess->index_idr, sess_dev->device_id);
+	synchronize_rcu();
+
+	sess_dev->destroy_comp = &dc;
+	rnbd_put_sess_dev(sess_dev);
+	wait_for_completion(&dc); /* wait for inflights to drop to zero */
+
+	blkdev_put(sess_dev->bdev, sess_dev->open_flags);
+	mutex_lock(&sess_dev->dev->lock);
+	list_del(&sess_dev->dev_list);
+	if (sess_dev->open_flags & FMODE_WRITE)
+		sess_dev->dev->open_write_cnt--;
+	mutex_unlock(&sess_dev->dev->lock);
+
+	rnbd_put_srv_dev(sess_dev->dev);
+
+	rnbd_srv_info(sess_dev, "Device closed\n");
+	kfree(sess_dev);
+}
+
+static void destroy_sess(struct rnbd_srv_session *srv_sess)
+{
+	struct rnbd_srv_sess_dev *sess_dev;
+	unsigned long index;
+
+	if (xa_empty(&srv_sess->index_idr))
+		goto out;
+
+	trace_destroy_sess(srv_sess);
+
+	mutex_lock(&srv_sess->lock);
+	xa_for_each(&srv_sess->index_idr, index, sess_dev)
+		rnbd_srv_destroy_dev_session_sysfs(sess_dev);
+	mutex_unlock(&srv_sess->lock);
+
+out:
+	xa_destroy(&srv_sess->index_idr);
+
+	pr_info("RTRS Session %s disconnected\n", srv_sess->sessname);
+
+	mutex_lock(&sess_lock);
+	list_del(&srv_sess->list);
+	mutex_unlock(&sess_lock);
+
+	mutex_destroy(&srv_sess->lock);
+	kfree(srv_sess);
+}
+
+static int create_sess(struct rtrs_srv_sess *rtrs)
+{
+	struct rnbd_srv_session *srv_sess;
+	char pathname[NAME_MAX];
+	int err;
+
+	err = rtrs_srv_get_path_name(rtrs, pathname, sizeof(pathname));
+	if (err) {
+		pr_err("rtrs_srv_get_path_name(%s): %d\n", pathname, err);
+
+		return err;
+	}
+	srv_sess = kzalloc(sizeof(*srv_sess), GFP_KERNEL);
+	if (!srv_sess)
+		return -ENOMEM;
+
+	srv_sess->queue_depth = rtrs_srv_get_queue_depth(rtrs);
+	xa_init_flags(&srv_sess->index_idr, XA_FLAGS_ALLOC);
+	mutex_init(&srv_sess->lock);
+	mutex_lock(&sess_lock);
+	list_add(&srv_sess->list, &sess_list);
+	mutex_unlock(&sess_lock);
+
+	srv_sess->rtrs = rtrs;
+	strscpy(srv_sess->sessname, pathname, sizeof(srv_sess->sessname));
+
+	rtrs_srv_set_sess_priv(rtrs, srv_sess);
+
+	trace_create_sess(srv_sess);
+
+	return 0;
+}
+
+static int rnbd_srv_link_ev(struct rtrs_srv_sess *rtrs,
+			     enum rtrs_srv_link_ev ev, void *priv)
+{
+	struct rnbd_srv_session *srv_sess = priv;
+
+	switch (ev) {
+	case RTRS_SRV_LINK_EV_CONNECTED:
+		return create_sess(rtrs);
+
+	case RTRS_SRV_LINK_EV_DISCONNECTED:
+		if (WARN_ON_ONCE(!srv_sess))
+			return -EINVAL;
+
+		destroy_sess(srv_sess);
+		return 0;
+
+	default:
+		pr_warn("Received unknown RTRS session event %d from session %s\n",
+			ev, srv_sess->sessname);
+		return -EINVAL;
+	}
+}
+
+void rnbd_srv_sess_dev_force_close(struct rnbd_srv_sess_dev *sess_dev,
+				   struct kobj_attribute *attr)
+{
+	struct rnbd_srv_session	*sess = sess_dev->sess;
+
+	/* It is already started to close by client's close message. */
+	if (!mutex_trylock(&sess->lock))
+		return;
+
+	sess_dev->keep_id = true;
+	/* first remove sysfs itself to avoid deadlock */
+	sysfs_remove_file_self(&sess_dev->kobj, &attr->attr);
+	rnbd_srv_destroy_dev_session_sysfs(sess_dev);
+	mutex_unlock(&sess->lock);
+}
+
+static void process_msg_close(struct rnbd_srv_session *srv_sess,
+			     void *data, size_t datalen, const void *usr,
+			     size_t usrlen)
+{
+	const struct rnbd_msg_close *close_msg = usr;
+	struct rnbd_srv_sess_dev *sess_dev;
+
+	trace_process_msg_close(srv_sess, close_msg);
+
+	sess_dev = rnbd_get_sess_dev(le32_to_cpu(close_msg->device_id),
+				      srv_sess);
+	if (IS_ERR(sess_dev))
+		return;
+
+	rnbd_put_sess_dev(sess_dev);
+	mutex_lock(&srv_sess->lock);
+	rnbd_srv_destroy_dev_session_sysfs(sess_dev);
+	mutex_unlock(&srv_sess->lock);
+}
+
+static int process_msg_open(struct rnbd_srv_session *srv_sess,
+			    const void *msg, size_t len,
+			    void *data, size_t datalen);
+
+static int process_msg_sess_info(struct rnbd_srv_session *srv_sess,
+				 const void *msg, size_t len,
+				 void *data, size_t datalen);
+
+static int rnbd_srv_rdma_ev(void *priv, struct rtrs_srv_op *id,
+			    void *data, size_t datalen,
+			    const void *usr, size_t usrlen)
+{
+	struct rnbd_srv_session *srv_sess = priv;
+	const struct rnbd_msg_hdr *hdr = usr;
+	int ret = 0;
+	u16 type;
+
+	if (WARN_ON_ONCE(!srv_sess))
+		return -ENODEV;
+
+	type = le16_to_cpu(hdr->type);
+
+	switch (type) {
+	case RNBD_MSG_IO:
+		return process_rdma(srv_sess, id, data, datalen, usr, usrlen);
+	case RNBD_MSG_CLOSE:
+		process_msg_close(srv_sess, data, datalen, usr, usrlen);
+		break;
+	case RNBD_MSG_OPEN:
+		ret = process_msg_open(srv_sess, usr, usrlen, data, datalen);
+		break;
+	case RNBD_MSG_SESS_INFO:
+		ret = process_msg_sess_info(srv_sess, usr, usrlen, data,
+					    datalen);
+		break;
+	default:
+		pr_warn("Received unexpected message type %d from session %s\n",
+			type, srv_sess->sessname);
+		return -EINVAL;
+	}
+
+	/*
+	 * Since ret is passed to rtrs to handle the failure case, we
+	 * just return 0 at the end otherwise callers in rtrs would call
+	 * send_io_resp_imm again to print redundant err message.
+	 */
+	rtrs_srv_resp_rdma(id, ret);
+	return 0;
+}
+
+static struct rnbd_srv_sess_dev
+*rnbd_sess_dev_alloc(struct rnbd_srv_session *srv_sess)
+{
+	struct rnbd_srv_sess_dev *sess_dev;
+	int error;
+
+	sess_dev = kzalloc(sizeof(*sess_dev), GFP_KERNEL);
+	if (!sess_dev)
+		return ERR_PTR(-ENOMEM);
+
+	error = xa_alloc(&srv_sess->index_idr, &sess_dev->device_id, sess_dev,
+			 xa_limit_32b, GFP_NOWAIT);
+	if (error < 0) {
+		pr_warn("Allocating idr failed, err: %d\n", error);
+		kfree(sess_dev);
+		return ERR_PTR(error);
+	}
+
+	return sess_dev;
+}
+
+static struct rnbd_srv_dev *rnbd_srv_init_srv_dev(struct block_device *bdev)
+{
+	struct rnbd_srv_dev *dev;
+
+	dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+	if (!dev)
+		return ERR_PTR(-ENOMEM);
+
+	snprintf(dev->id, sizeof(dev->id), "%pg", bdev);
+	kref_init(&dev->kref);
+	INIT_LIST_HEAD(&dev->sess_dev_list);
+	mutex_init(&dev->lock);
+
+	return dev;
+}
+
+static struct rnbd_srv_dev *
+rnbd_srv_find_or_add_srv_dev(struct rnbd_srv_dev *new_dev)
+{
+	struct rnbd_srv_dev *dev;
+
+	spin_lock(&dev_lock);
+	list_for_each_entry(dev, &dev_list, list) {
+		if (!strncmp(dev->id, new_dev->id, sizeof(dev->id))) {
+			if (!kref_get_unless_zero(&dev->kref))
+				/*
+				 * We lost the race, device is almost dead.
+				 *  Continue traversing to find a valid one.
+				 */
+				continue;
+			spin_unlock(&dev_lock);
+			return dev;
+		}
+	}
+	list_add(&new_dev->list, &dev_list);
+	spin_unlock(&dev_lock);
+
+	return new_dev;
+}
+
+static int rnbd_srv_check_update_open_perm(struct rnbd_srv_dev *srv_dev,
+					    struct rnbd_srv_session *srv_sess,
+					    enum rnbd_access_mode access_mode)
+{
+	int ret = -EPERM;
+
+	mutex_lock(&srv_dev->lock);
+
+	switch (access_mode) {
+	case RNBD_ACCESS_RO:
+		ret = 0;
+		break;
+	case RNBD_ACCESS_RW:
+		if (srv_dev->open_write_cnt == 0)  {
+			srv_dev->open_write_cnt++;
+			ret = 0;
+		} else {
+			pr_err("Mapping device '%s' for session %s with RW permissions failed. Device already opened as 'RW' by %d client(s), access mode %s.\n",
+			       srv_dev->id, srv_sess->sessname,
+			       srv_dev->open_write_cnt,
+			       rnbd_access_mode_str(access_mode));
+		}
+		break;
+	case RNBD_ACCESS_MIGRATION:
+		if (srv_dev->open_write_cnt < 2) {
+			srv_dev->open_write_cnt++;
+			ret = 0;
+		} else {
+			pr_err("Mapping device '%s' for session %s with migration permissions failed. Device already opened as 'RW' by %d client(s), access mode %s.\n",
+			       srv_dev->id, srv_sess->sessname,
+			       srv_dev->open_write_cnt,
+			       rnbd_access_mode_str(access_mode));
+		}
+		break;
+	default:
+		pr_err("Received mapping request for device '%s' on session %s with invalid access mode: %d\n",
+		       srv_dev->id, srv_sess->sessname, access_mode);
+		ret = -EINVAL;
+	}
+
+	mutex_unlock(&srv_dev->lock);
+
+	return ret;
+}
+
+static struct rnbd_srv_dev *
+rnbd_srv_get_or_create_srv_dev(struct block_device *bdev,
+				struct rnbd_srv_session *srv_sess,
+				enum rnbd_access_mode access_mode)
+{
+	int ret;
+	struct rnbd_srv_dev *new_dev, *dev;
+
+	new_dev = rnbd_srv_init_srv_dev(bdev);
+	if (IS_ERR(new_dev))
+		return new_dev;
+
+	dev = rnbd_srv_find_or_add_srv_dev(new_dev);
+	if (dev != new_dev)
+		kfree(new_dev);
+
+	ret = rnbd_srv_check_update_open_perm(dev, srv_sess, access_mode);
+	if (ret) {
+		rnbd_put_srv_dev(dev);
+		return ERR_PTR(ret);
+	}
+
+	return dev;
+}
+
+static void rnbd_srv_fill_msg_open_rsp(struct rnbd_msg_open_rsp *rsp,
+					struct rnbd_srv_sess_dev *sess_dev)
+{
+	struct block_device *bdev = sess_dev->bdev;
+
+	rsp->hdr.type = cpu_to_le16(RNBD_MSG_OPEN_RSP);
+	rsp->device_id = cpu_to_le32(sess_dev->device_id);
+	rsp->nsectors = cpu_to_le64(bdev_nr_sectors(bdev));
+	rsp->logical_block_size	= cpu_to_le16(bdev_logical_block_size(bdev));
+	rsp->physical_block_size = cpu_to_le16(bdev_physical_block_size(bdev));
+	rsp->max_segments = cpu_to_le16(bdev_max_segments(bdev));
+	rsp->max_hw_sectors =
+		cpu_to_le32(queue_max_hw_sectors(bdev_get_queue(bdev)));
+	rsp->max_write_same_sectors = 0;
+	rsp->max_discard_sectors = cpu_to_le32(bdev_max_discard_sectors(bdev));
+	rsp->discard_granularity = cpu_to_le32(bdev_discard_granularity(bdev));
+	rsp->discard_alignment = cpu_to_le32(bdev_discard_alignment(bdev));
+	rsp->secure_discard = cpu_to_le16(bdev_max_secure_erase_sectors(bdev));
+	rsp->cache_policy = 0;
+	if (bdev_write_cache(bdev))
+		rsp->cache_policy |= RNBD_WRITEBACK;
+	if (bdev_fua(bdev))
+		rsp->cache_policy |= RNBD_FUA;
+}
+
+static struct rnbd_srv_sess_dev *
+rnbd_srv_create_set_sess_dev(struct rnbd_srv_session *srv_sess,
+			      const struct rnbd_msg_open *open_msg,
+			      struct block_device *bdev, fmode_t open_flags,
+			      struct rnbd_srv_dev *srv_dev)
+{
+	struct rnbd_srv_sess_dev *sdev = rnbd_sess_dev_alloc(srv_sess);
+
+	if (IS_ERR(sdev))
+		return sdev;
+
+	kref_init(&sdev->kref);
+
+	strscpy(sdev->pathname, open_msg->dev_name, sizeof(sdev->pathname));
+
+	sdev->bdev		= bdev;
+	sdev->sess		= srv_sess;
+	sdev->dev		= srv_dev;
+	sdev->open_flags	= open_flags;
+	sdev->access_mode	= open_msg->access_mode;
+
+	return sdev;
+}
+
+static char *rnbd_srv_get_full_path(struct rnbd_srv_session *srv_sess,
+				     const char *dev_name)
+{
+	char *full_path;
+	char *a, *b;
+
+	full_path = kmalloc(PATH_MAX, GFP_KERNEL);
+	if (!full_path)
+		return ERR_PTR(-ENOMEM);
+
+	/*
+	 * Replace %SESSNAME% with a real session name in order to
+	 * create device namespace.
+	 */
+	a = strnstr(dev_search_path, "%SESSNAME%", sizeof(dev_search_path));
+	if (a) {
+		int len = a - dev_search_path;
+
+		len = snprintf(full_path, PATH_MAX, "%.*s/%s/%s", len,
+			       dev_search_path, srv_sess->sessname, dev_name);
+		if (len >= PATH_MAX) {
+			pr_err("Too long path: %s, %s, %s\n",
+			       dev_search_path, srv_sess->sessname, dev_name);
+			kfree(full_path);
+			return ERR_PTR(-EINVAL);
+		}
+	} else {
+		snprintf(full_path, PATH_MAX, "%s/%s",
+			 dev_search_path, dev_name);
+	}
+
+	/* eliminitate duplicated slashes */
+	a = strchr(full_path, '/');
+	b = a;
+	while (*b != '\0') {
+		if (*b == '/' && *a == '/') {
+			b++;
+		} else {
+			a++;
+			*a = *b;
+			b++;
+		}
+	}
+	a++;
+	*a = '\0';
+
+	return full_path;
+}
+
+static int process_msg_sess_info(struct rnbd_srv_session *srv_sess,
+				 const void *msg, size_t len,
+				 void *data, size_t datalen)
+{
+	const struct rnbd_msg_sess_info *sess_info_msg = msg;
+	struct rnbd_msg_sess_info_rsp *rsp = data;
+
+	srv_sess->ver = min_t(u8, sess_info_msg->ver, RNBD_PROTO_VER_MAJOR);
+
+	trace_process_msg_sess_info(srv_sess, sess_info_msg);
+
+	rsp->hdr.type = cpu_to_le16(RNBD_MSG_SESS_INFO_RSP);
+	rsp->ver = srv_sess->ver;
+
+	return 0;
+}
+
+/**
+ * find_srv_sess_dev() - a dev is already opened by this name
+ * @srv_sess:	the session to search.
+ * @dev_name:	string containing the name of the device.
+ *
+ * Return struct rnbd_srv_sess_dev if srv_sess already opened the dev_name
+ * NULL if the session didn't open the device yet.
+ */
+static struct rnbd_srv_sess_dev *
+find_srv_sess_dev(struct rnbd_srv_session *srv_sess, const char *dev_name)
+{
+	struct rnbd_srv_sess_dev *sess_dev;
+	unsigned long index;
+
+	if (xa_empty(&srv_sess->index_idr))
+		return NULL;
+
+	xa_for_each(&srv_sess->index_idr, index, sess_dev)
+		if (!strcmp(sess_dev->pathname, dev_name))
+			return sess_dev;
+
+	return NULL;
+}
+
+static int process_msg_open(struct rnbd_srv_session *srv_sess,
+			    const void *msg, size_t len,
+			    void *data, size_t datalen)
+{
+	int ret;
+	struct rnbd_srv_dev *srv_dev;
+	struct rnbd_srv_sess_dev *srv_sess_dev;
+	const struct rnbd_msg_open *open_msg = msg;
+	struct block_device *bdev;
+	fmode_t open_flags;
+	char *full_path;
+	struct rnbd_msg_open_rsp *rsp = data;
+
+	trace_process_msg_open(srv_sess, open_msg);
+
+	open_flags = FMODE_READ;
+	if (open_msg->access_mode != RNBD_ACCESS_RO)
+		open_flags |= FMODE_WRITE;
+
+	mutex_lock(&srv_sess->lock);
+
+	srv_sess_dev = find_srv_sess_dev(srv_sess, open_msg->dev_name);
+	if (srv_sess_dev)
+		goto fill_response;
+
+	if ((strlen(dev_search_path) + strlen(open_msg->dev_name))
+	    >= PATH_MAX) {
+		pr_err("Opening device for session %s failed, device path too long. '%s/%s' is longer than PATH_MAX (%d)\n",
+		       srv_sess->sessname, dev_search_path, open_msg->dev_name,
+		       PATH_MAX);
+		ret = -EINVAL;
+		goto reject;
+	}
+	if (strstr(open_msg->dev_name, "..")) {
+		pr_err("Opening device for session %s failed, device path %s contains relative path ..\n",
+		       srv_sess->sessname, open_msg->dev_name);
+		ret = -EINVAL;
+		goto reject;
+	}
+	full_path = rnbd_srv_get_full_path(srv_sess, open_msg->dev_name);
+	if (IS_ERR(full_path)) {
+		ret = PTR_ERR(full_path);
+		pr_err("Opening device '%s' for client %s failed, failed to get device full path, err: %d\n",
+		       open_msg->dev_name, srv_sess->sessname, ret);
+		goto reject;
+	}
+
+	bdev = blkdev_get_by_path(full_path, open_flags, THIS_MODULE);
+	if (IS_ERR(bdev)) {
+		ret = PTR_ERR(bdev);
+		pr_err("Opening device '%s' on session %s failed, failed to open the block device, err: %d\n",
+		       full_path, srv_sess->sessname, ret);
+		goto free_path;
+	}
+
+	srv_dev = rnbd_srv_get_or_create_srv_dev(bdev, srv_sess,
+						  open_msg->access_mode);
+	if (IS_ERR(srv_dev)) {
+		pr_err("Opening device '%s' on session %s failed, creating srv_dev failed, err: %ld\n",
+		       full_path, srv_sess->sessname, PTR_ERR(srv_dev));
+		ret = PTR_ERR(srv_dev);
+		goto blkdev_put;
+	}
+
+	srv_sess_dev = rnbd_srv_create_set_sess_dev(srv_sess, open_msg,
+						     bdev, open_flags,
+						     srv_dev);
+	if (IS_ERR(srv_sess_dev)) {
+		pr_err("Opening device '%s' on session %s failed, creating sess_dev failed, err: %ld\n",
+		       full_path, srv_sess->sessname, PTR_ERR(srv_sess_dev));
+		ret = PTR_ERR(srv_sess_dev);
+		goto srv_dev_put;
+	}
+
+	/* Create the srv_dev sysfs files if they haven't been created yet. The
+	 * reason to delay the creation is not to create the sysfs files before
+	 * we are sure the device can be opened.
+	 */
+	mutex_lock(&srv_dev->lock);
+	if (!srv_dev->dev_kobj.state_in_sysfs) {
+		ret = rnbd_srv_create_dev_sysfs(srv_dev, bdev);
+		if (ret) {
+			mutex_unlock(&srv_dev->lock);
+			rnbd_srv_err(srv_sess_dev,
+				      "Opening device failed, failed to create device sysfs files, err: %d\n",
+				      ret);
+			goto free_srv_sess_dev;
+		}
+	}
+
+	ret = rnbd_srv_create_dev_session_sysfs(srv_sess_dev);
+	if (ret) {
+		mutex_unlock(&srv_dev->lock);
+		rnbd_srv_err(srv_sess_dev,
+			      "Opening device failed, failed to create dev client sysfs files, err: %d\n",
+			      ret);
+		goto free_srv_sess_dev;
+	}
+
+	list_add(&srv_sess_dev->dev_list, &srv_dev->sess_dev_list);
+	mutex_unlock(&srv_dev->lock);
+
+	rnbd_srv_info(srv_sess_dev, "Opened device '%s'\n", srv_dev->id);
+
+	kfree(full_path);
+
+fill_response:
+	rnbd_srv_fill_msg_open_rsp(rsp, srv_sess_dev);
+	mutex_unlock(&srv_sess->lock);
+	return 0;
+
+free_srv_sess_dev:
+	xa_erase(&srv_sess->index_idr, srv_sess_dev->device_id);
+	synchronize_rcu();
+	kfree(srv_sess_dev);
+srv_dev_put:
+	if (open_msg->access_mode != RNBD_ACCESS_RO) {
+		mutex_lock(&srv_dev->lock);
+		srv_dev->open_write_cnt--;
+		mutex_unlock(&srv_dev->lock);
+	}
+	rnbd_put_srv_dev(srv_dev);
+blkdev_put:
+	blkdev_put(bdev, open_flags);
+free_path:
+	kfree(full_path);
+reject:
+	mutex_unlock(&srv_sess->lock);
+	return ret;
+}
+
+static struct rtrs_srv_ctx *rtrs_ctx;
+
+static struct rtrs_srv_ops rtrs_ops;
+static int __init rnbd_srv_init_module(void)
+{
+	int err;
+
+	BUILD_BUG_ON(sizeof(struct rnbd_msg_hdr) != 4);
+	BUILD_BUG_ON(sizeof(struct rnbd_msg_sess_info) != 36);
+	BUILD_BUG_ON(sizeof(struct rnbd_msg_sess_info_rsp) != 36);
+	BUILD_BUG_ON(sizeof(struct rnbd_msg_open) != 264);
+	BUILD_BUG_ON(sizeof(struct rnbd_msg_close) != 8);
+	BUILD_BUG_ON(sizeof(struct rnbd_msg_open_rsp) != 56);
+	rtrs_ops = (struct rtrs_srv_ops) {
+		.rdma_ev = rnbd_srv_rdma_ev,
+		.link_ev = rnbd_srv_link_ev,
+	};
+	rtrs_ctx = rtrs_srv_open(&rtrs_ops, port_nr);
+	if (IS_ERR(rtrs_ctx)) {
+		err = PTR_ERR(rtrs_ctx);
+		pr_err("rtrs_srv_open(), err: %d\n", err);
+		return err;
+	}
+
+	err = rnbd_srv_create_sysfs_files();
+	if (err) {
+		pr_err("rnbd_srv_create_sysfs_files(), err: %d\n", err);
+		rtrs_srv_close(rtrs_ctx);
+		return err;
+	}
+
+	return 0;
+}
+
+static void __exit rnbd_srv_cleanup_module(void)
+{
+	rtrs_srv_close(rtrs_ctx);
+	WARN_ON(!list_empty(&sess_list));
+	rnbd_srv_destroy_sysfs_files();
+}
+
+module_init(rnbd_srv_init_module);
+module_exit(rnbd_srv_cleanup_module);
diff --git a/drivers/block/rnbd/rnbd-srv.h b/drivers/block/rnbd/rnbd-srv.h
new file mode 100644
index 000000000..f5962fd31
--- /dev/null
+++ b/drivers/block/rnbd/rnbd-srv.h
@@ -0,0 +1,75 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * RDMA Network Block Driver
+ *
+ * Copyright (c) 2014 - 2018 ProfitBricks GmbH. All rights reserved.
+ * Copyright (c) 2018 - 2019 1&1 IONOS Cloud GmbH. All rights reserved.
+ * Copyright (c) 2019 - 2020 1&1 IONOS SE. All rights reserved.
+ */
+#ifndef RNBD_SRV_H
+#define RNBD_SRV_H
+
+#include <linux/types.h>
+#include <linux/idr.h>
+#include <linux/kref.h>
+
+#include <rtrs.h>
+#include "rnbd-proto.h"
+#include "rnbd-log.h"
+
+struct rnbd_srv_session {
+	/* Entry inside global sess_list */
+	struct list_head        list;
+	struct rtrs_srv_sess	*rtrs;
+	char			sessname[NAME_MAX];
+	int			queue_depth;
+
+	struct xarray		index_idr;
+	struct mutex		lock;
+	u8			ver;
+};
+
+struct rnbd_srv_dev {
+	/* Entry inside global dev_list */
+	struct list_head                list;
+	struct kobject                  dev_kobj;
+	struct kobject                  *dev_sessions_kobj;
+	struct kref                     kref;
+	char				id[NAME_MAX];
+	/* List of rnbd_srv_sess_dev structs */
+	struct list_head		sess_dev_list;
+	struct mutex			lock;
+	int				open_write_cnt;
+};
+
+/* Structure which binds N devices and N sessions */
+struct rnbd_srv_sess_dev {
+	/* Entry inside rnbd_srv_dev struct */
+	struct list_head		dev_list;
+	struct block_device		*bdev;
+	struct rnbd_srv_session		*sess;
+	struct rnbd_srv_dev		*dev;
+	struct kobject                  kobj;
+	u32                             device_id;
+	bool				keep_id;
+	fmode_t                         open_flags;
+	struct kref			kref;
+	struct completion               *destroy_comp;
+	char				pathname[NAME_MAX];
+	enum rnbd_access_mode		access_mode;
+};
+
+void rnbd_srv_sess_dev_force_close(struct rnbd_srv_sess_dev *sess_dev,
+				   struct kobj_attribute *attr);
+/* rnbd-srv-sysfs.c */
+
+int rnbd_srv_create_dev_sysfs(struct rnbd_srv_dev *dev,
+			      struct block_device *bdev);
+void rnbd_srv_destroy_dev_sysfs(struct rnbd_srv_dev *dev);
+int rnbd_srv_create_dev_session_sysfs(struct rnbd_srv_sess_dev *sess_dev);
+void rnbd_srv_destroy_dev_session_sysfs(struct rnbd_srv_sess_dev *sess_dev);
+int rnbd_srv_create_sysfs_files(void);
+void rnbd_srv_destroy_sysfs_files(void);
+void rnbd_destroy_sess_dev(struct rnbd_srv_sess_dev *sess_dev, bool keep_id);
+
+#endif /* RNBD_SRV_H */
diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c
new file mode 100644
index 000000000..9fa821fa7
--- /dev/null
+++ b/drivers/block/sunvdc.c
@@ -0,0 +1,1249 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/* sunvdc.c: Sun LDOM Virtual Disk Client.
+ *
+ * Copyright (C) 2007, 2008 David S. Miller <davem@davemloft.net>
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/blk-mq.h>
+#include <linux/hdreg.h>
+#include <linux/cdrom.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/completion.h>
+#include <linux/delay.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/scatterlist.h>
+
+#include <asm/vio.h>
+#include <asm/ldc.h>
+
+#define DRV_MODULE_NAME		"sunvdc"
+#define PFX DRV_MODULE_NAME	": "
+#define DRV_MODULE_VERSION	"1.2"
+#define DRV_MODULE_RELDATE	"November 24, 2014"
+
+static char version[] =
+	DRV_MODULE_NAME ".c:v" DRV_MODULE_VERSION " (" DRV_MODULE_RELDATE ")\n";
+MODULE_AUTHOR("David S. Miller (davem@davemloft.net)");
+MODULE_DESCRIPTION("Sun LDOM virtual disk client driver");
+MODULE_LICENSE("GPL");
+MODULE_VERSION(DRV_MODULE_VERSION);
+
+#define VDC_TX_RING_SIZE	512
+#define VDC_DEFAULT_BLK_SIZE	512
+
+#define MAX_XFER_BLKS		(128 * 1024)
+#define MAX_XFER_SIZE		(MAX_XFER_BLKS / VDC_DEFAULT_BLK_SIZE)
+#define MAX_RING_COOKIES	((MAX_XFER_BLKS / PAGE_SIZE) + 2)
+
+#define WAITING_FOR_LINK_UP	0x01
+#define WAITING_FOR_TX_SPACE	0x02
+#define WAITING_FOR_GEN_CMD	0x04
+#define WAITING_FOR_ANY		-1
+
+#define	VDC_MAX_RETRIES	10
+
+static struct workqueue_struct *sunvdc_wq;
+
+struct vdc_req_entry {
+	struct request		*req;
+};
+
+struct vdc_port {
+	struct vio_driver_state	vio;
+
+	struct gendisk		*disk;
+
+	struct vdc_completion	*cmp;
+
+	u64			req_id;
+	u64			seq;
+	struct vdc_req_entry	rq_arr[VDC_TX_RING_SIZE];
+
+	unsigned long		ring_cookies;
+
+	u64			max_xfer_size;
+	u32			vdisk_block_size;
+	u32			drain;
+
+	u64			ldc_timeout;
+	struct delayed_work	ldc_reset_timer_work;
+	struct work_struct	ldc_reset_work;
+
+	/* The server fills these in for us in the disk attribute
+	 * ACK packet.
+	 */
+	u64			operations;
+	u32			vdisk_size;
+	u8			vdisk_type;
+	u8			vdisk_mtype;
+	u32			vdisk_phys_blksz;
+
+	struct blk_mq_tag_set	tag_set;
+
+	char			disk_name[32];
+};
+
+static void vdc_ldc_reset(struct vdc_port *port);
+static void vdc_ldc_reset_work(struct work_struct *work);
+static void vdc_ldc_reset_timer_work(struct work_struct *work);
+
+static inline struct vdc_port *to_vdc_port(struct vio_driver_state *vio)
+{
+	return container_of(vio, struct vdc_port, vio);
+}
+
+/* Ordered from largest major to lowest */
+static struct vio_version vdc_versions[] = {
+	{ .major = 1, .minor = 2 },
+	{ .major = 1, .minor = 1 },
+	{ .major = 1, .minor = 0 },
+};
+
+static inline int vdc_version_supported(struct vdc_port *port,
+					u16 major, u16 minor)
+{
+	return port->vio.ver.major == major && port->vio.ver.minor >= minor;
+}
+
+#define VDCBLK_NAME	"vdisk"
+static int vdc_major;
+#define PARTITION_SHIFT	3
+
+static inline u32 vdc_tx_dring_avail(struct vio_dring_state *dr)
+{
+	return vio_dring_avail(dr, VDC_TX_RING_SIZE);
+}
+
+static int vdc_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+	struct gendisk *disk = bdev->bd_disk;
+	sector_t nsect = get_capacity(disk);
+	sector_t cylinders = nsect;
+
+	geo->heads = 0xff;
+	geo->sectors = 0x3f;
+	sector_div(cylinders, geo->heads * geo->sectors);
+	geo->cylinders = cylinders;
+	if ((sector_t)(geo->cylinders + 1) * geo->heads * geo->sectors < nsect)
+		geo->cylinders = 0xffff;
+
+	return 0;
+}
+
+/* Add ioctl/CDROM_GET_CAPABILITY to support cdrom_id in udev
+ * when vdisk_mtype is VD_MEDIA_TYPE_CD or VD_MEDIA_TYPE_DVD.
+ * Needed to be able to install inside an ldom from an iso image.
+ */
+static int vdc_ioctl(struct block_device *bdev, fmode_t mode,
+		     unsigned command, unsigned long argument)
+{
+	struct vdc_port *port = bdev->bd_disk->private_data;
+	int i;
+
+	switch (command) {
+	case CDROMMULTISESSION:
+		pr_debug(PFX "Multisession CDs not supported\n");
+		for (i = 0; i < sizeof(struct cdrom_multisession); i++)
+			if (put_user(0, (char __user *)(argument + i)))
+				return -EFAULT;
+		return 0;
+
+	case CDROM_GET_CAPABILITY:
+		if (!vdc_version_supported(port, 1, 1))
+			return -EINVAL;
+		switch (port->vdisk_mtype) {
+		case VD_MEDIA_TYPE_CD:
+		case VD_MEDIA_TYPE_DVD:
+			return 0;
+		default:
+			return -EINVAL;
+		}
+	default:
+		pr_debug(PFX "ioctl %08x not supported\n", command);
+		return -EINVAL;
+	}
+}
+
+static const struct block_device_operations vdc_fops = {
+	.owner		= THIS_MODULE,
+	.getgeo		= vdc_getgeo,
+	.ioctl		= vdc_ioctl,
+	.compat_ioctl	= blkdev_compat_ptr_ioctl,
+};
+
+static void vdc_blk_queue_start(struct vdc_port *port)
+{
+	struct vio_dring_state *dr = &port->vio.drings[VIO_DRIVER_TX_RING];
+
+	/* restart blk queue when ring is half emptied. also called after
+	 * handshake completes, so check for initial handshake before we've
+	 * allocated a disk.
+	 */
+	if (port->disk && vdc_tx_dring_avail(dr) * 100 / VDC_TX_RING_SIZE >= 50)
+		blk_mq_start_stopped_hw_queues(port->disk->queue, true);
+}
+
+static void vdc_finish(struct vio_driver_state *vio, int err, int waiting_for)
+{
+	if (vio->cmp &&
+	    (waiting_for == -1 ||
+	     vio->cmp->waiting_for == waiting_for)) {
+		vio->cmp->err = err;
+		complete(&vio->cmp->com);
+		vio->cmp = NULL;
+	}
+}
+
+static void vdc_handshake_complete(struct vio_driver_state *vio)
+{
+	struct vdc_port *port = to_vdc_port(vio);
+
+	cancel_delayed_work(&port->ldc_reset_timer_work);
+	vdc_finish(vio, 0, WAITING_FOR_LINK_UP);
+	vdc_blk_queue_start(port);
+}
+
+static int vdc_handle_unknown(struct vdc_port *port, void *arg)
+{
+	struct vio_msg_tag *pkt = arg;
+
+	printk(KERN_ERR PFX "Received unknown msg [%02x:%02x:%04x:%08x]\n",
+	       pkt->type, pkt->stype, pkt->stype_env, pkt->sid);
+	printk(KERN_ERR PFX "Resetting connection.\n");
+
+	ldc_disconnect(port->vio.lp);
+
+	return -ECONNRESET;
+}
+
+static int vdc_send_attr(struct vio_driver_state *vio)
+{
+	struct vdc_port *port = to_vdc_port(vio);
+	struct vio_disk_attr_info pkt;
+
+	memset(&pkt, 0, sizeof(pkt));
+
+	pkt.tag.type = VIO_TYPE_CTRL;
+	pkt.tag.stype = VIO_SUBTYPE_INFO;
+	pkt.tag.stype_env = VIO_ATTR_INFO;
+	pkt.tag.sid = vio_send_sid(vio);
+
+	pkt.xfer_mode = VIO_DRING_MODE;
+	pkt.vdisk_block_size = port->vdisk_block_size;
+	pkt.max_xfer_size = port->max_xfer_size;
+
+	viodbg(HS, "SEND ATTR xfer_mode[0x%x] blksz[%u] max_xfer[%llu]\n",
+	       pkt.xfer_mode, pkt.vdisk_block_size, pkt.max_xfer_size);
+
+	return vio_ldc_send(&port->vio, &pkt, sizeof(pkt));
+}
+
+static int vdc_handle_attr(struct vio_driver_state *vio, void *arg)
+{
+	struct vdc_port *port = to_vdc_port(vio);
+	struct vio_disk_attr_info *pkt = arg;
+
+	viodbg(HS, "GOT ATTR stype[0x%x] ops[%llx] disk_size[%llu] disk_type[%x] "
+	       "mtype[0x%x] xfer_mode[0x%x] blksz[%u] max_xfer[%llu]\n",
+	       pkt->tag.stype, pkt->operations,
+	       pkt->vdisk_size, pkt->vdisk_type, pkt->vdisk_mtype,
+	       pkt->xfer_mode, pkt->vdisk_block_size,
+	       pkt->max_xfer_size);
+
+	if (pkt->tag.stype == VIO_SUBTYPE_ACK) {
+		switch (pkt->vdisk_type) {
+		case VD_DISK_TYPE_DISK:
+		case VD_DISK_TYPE_SLICE:
+			break;
+
+		default:
+			printk(KERN_ERR PFX "%s: Bogus vdisk_type 0x%x\n",
+			       vio->name, pkt->vdisk_type);
+			return -ECONNRESET;
+		}
+
+		if (pkt->vdisk_block_size > port->vdisk_block_size) {
+			printk(KERN_ERR PFX "%s: BLOCK size increased "
+			       "%u --> %u\n",
+			       vio->name,
+			       port->vdisk_block_size, pkt->vdisk_block_size);
+			return -ECONNRESET;
+		}
+
+		port->operations = pkt->operations;
+		port->vdisk_type = pkt->vdisk_type;
+		if (vdc_version_supported(port, 1, 1)) {
+			port->vdisk_size = pkt->vdisk_size;
+			port->vdisk_mtype = pkt->vdisk_mtype;
+		}
+		if (pkt->max_xfer_size < port->max_xfer_size)
+			port->max_xfer_size = pkt->max_xfer_size;
+		port->vdisk_block_size = pkt->vdisk_block_size;
+
+		port->vdisk_phys_blksz = VDC_DEFAULT_BLK_SIZE;
+		if (vdc_version_supported(port, 1, 2))
+			port->vdisk_phys_blksz = pkt->phys_block_size;
+
+		return 0;
+	} else {
+		printk(KERN_ERR PFX "%s: Attribute NACK\n", vio->name);
+
+		return -ECONNRESET;
+	}
+}
+
+static void vdc_end_special(struct vdc_port *port, struct vio_disk_desc *desc)
+{
+	int err = desc->status;
+
+	vdc_finish(&port->vio, -err, WAITING_FOR_GEN_CMD);
+}
+
+static void vdc_end_one(struct vdc_port *port, struct vio_dring_state *dr,
+			unsigned int index)
+{
+	struct vio_disk_desc *desc = vio_dring_entry(dr, index);
+	struct vdc_req_entry *rqe = &port->rq_arr[index];
+	struct request *req;
+
+	if (unlikely(desc->hdr.state != VIO_DESC_DONE))
+		return;
+
+	ldc_unmap(port->vio.lp, desc->cookies, desc->ncookies);
+	desc->hdr.state = VIO_DESC_FREE;
+	dr->cons = vio_dring_next(dr, index);
+
+	req = rqe->req;
+	if (req == NULL) {
+		vdc_end_special(port, desc);
+		return;
+	}
+
+	rqe->req = NULL;
+
+	blk_mq_end_request(req, desc->status ? BLK_STS_IOERR : 0);
+
+	vdc_blk_queue_start(port);
+}
+
+static int vdc_ack(struct vdc_port *port, void *msgbuf)
+{
+	struct vio_dring_state *dr = &port->vio.drings[VIO_DRIVER_TX_RING];
+	struct vio_dring_data *pkt = msgbuf;
+
+	if (unlikely(pkt->dring_ident != dr->ident ||
+		     pkt->start_idx != pkt->end_idx ||
+		     pkt->start_idx >= VDC_TX_RING_SIZE))
+		return 0;
+
+	vdc_end_one(port, dr, pkt->start_idx);
+
+	return 0;
+}
+
+static int vdc_nack(struct vdc_port *port, void *msgbuf)
+{
+	/* XXX Implement me XXX */
+	return 0;
+}
+
+static void vdc_event(void *arg, int event)
+{
+	struct vdc_port *port = arg;
+	struct vio_driver_state *vio = &port->vio;
+	unsigned long flags;
+	int err;
+
+	spin_lock_irqsave(&vio->lock, flags);
+
+	if (unlikely(event == LDC_EVENT_RESET)) {
+		vio_link_state_change(vio, event);
+		queue_work(sunvdc_wq, &port->ldc_reset_work);
+		goto out;
+	}
+
+	if (unlikely(event == LDC_EVENT_UP)) {
+		vio_link_state_change(vio, event);
+		goto out;
+	}
+
+	if (unlikely(event != LDC_EVENT_DATA_READY)) {
+		pr_warn(PFX "Unexpected LDC event %d\n", event);
+		goto out;
+	}
+
+	err = 0;
+	while (1) {
+		union {
+			struct vio_msg_tag tag;
+			u64 raw[8];
+		} msgbuf;
+
+		err = ldc_read(vio->lp, &msgbuf, sizeof(msgbuf));
+		if (unlikely(err < 0)) {
+			if (err == -ECONNRESET)
+				vio_conn_reset(vio);
+			break;
+		}
+		if (err == 0)
+			break;
+		viodbg(DATA, "TAG [%02x:%02x:%04x:%08x]\n",
+		       msgbuf.tag.type,
+		       msgbuf.tag.stype,
+		       msgbuf.tag.stype_env,
+		       msgbuf.tag.sid);
+		err = vio_validate_sid(vio, &msgbuf.tag);
+		if (err < 0)
+			break;
+
+		if (likely(msgbuf.tag.type == VIO_TYPE_DATA)) {
+			if (msgbuf.tag.stype == VIO_SUBTYPE_ACK)
+				err = vdc_ack(port, &msgbuf);
+			else if (msgbuf.tag.stype == VIO_SUBTYPE_NACK)
+				err = vdc_nack(port, &msgbuf);
+			else
+				err = vdc_handle_unknown(port, &msgbuf);
+		} else if (msgbuf.tag.type == VIO_TYPE_CTRL) {
+			err = vio_control_pkt_engine(vio, &msgbuf);
+		} else {
+			err = vdc_handle_unknown(port, &msgbuf);
+		}
+		if (err < 0)
+			break;
+	}
+	if (err < 0)
+		vdc_finish(&port->vio, err, WAITING_FOR_ANY);
+out:
+	spin_unlock_irqrestore(&vio->lock, flags);
+}
+
+static int __vdc_tx_trigger(struct vdc_port *port)
+{
+	struct vio_dring_state *dr = &port->vio.drings[VIO_DRIVER_TX_RING];
+	struct vio_dring_data hdr = {
+		.tag = {
+			.type		= VIO_TYPE_DATA,
+			.stype		= VIO_SUBTYPE_INFO,
+			.stype_env	= VIO_DRING_DATA,
+			.sid		= vio_send_sid(&port->vio),
+		},
+		.dring_ident		= dr->ident,
+		.start_idx		= dr->prod,
+		.end_idx		= dr->prod,
+	};
+	int err, delay;
+	int retries = 0;
+
+	hdr.seq = dr->snd_nxt;
+	delay = 1;
+	do {
+		err = vio_ldc_send(&port->vio, &hdr, sizeof(hdr));
+		if (err > 0) {
+			dr->snd_nxt++;
+			break;
+		}
+		udelay(delay);
+		if ((delay <<= 1) > 128)
+			delay = 128;
+		if (retries++ > VDC_MAX_RETRIES)
+			break;
+	} while (err == -EAGAIN);
+
+	if (err == -ENOTCONN)
+		vdc_ldc_reset(port);
+	return err;
+}
+
+static int __send_request(struct request *req)
+{
+	struct vdc_port *port = req->q->disk->private_data;
+	struct vio_dring_state *dr = &port->vio.drings[VIO_DRIVER_TX_RING];
+	struct scatterlist sg[MAX_RING_COOKIES];
+	struct vdc_req_entry *rqe;
+	struct vio_disk_desc *desc;
+	unsigned int map_perm;
+	int nsg, err, i;
+	u64 len;
+	u8 op;
+
+	if (WARN_ON(port->ring_cookies > MAX_RING_COOKIES))
+		return -EINVAL;
+
+	map_perm = LDC_MAP_SHADOW | LDC_MAP_DIRECT | LDC_MAP_IO;
+
+	if (rq_data_dir(req) == READ) {
+		map_perm |= LDC_MAP_W;
+		op = VD_OP_BREAD;
+	} else {
+		map_perm |= LDC_MAP_R;
+		op = VD_OP_BWRITE;
+	}
+
+	sg_init_table(sg, port->ring_cookies);
+	nsg = blk_rq_map_sg(req->q, req, sg);
+
+	len = 0;
+	for (i = 0; i < nsg; i++)
+		len += sg[i].length;
+
+	desc = vio_dring_cur(dr);
+
+	err = ldc_map_sg(port->vio.lp, sg, nsg,
+			 desc->cookies, port->ring_cookies,
+			 map_perm);
+	if (err < 0) {
+		printk(KERN_ERR PFX "ldc_map_sg() failure, err=%d.\n", err);
+		return err;
+	}
+
+	rqe = &port->rq_arr[dr->prod];
+	rqe->req = req;
+
+	desc->hdr.ack = VIO_ACK_ENABLE;
+	desc->req_id = port->req_id;
+	desc->operation = op;
+	if (port->vdisk_type == VD_DISK_TYPE_DISK) {
+		desc->slice = 0xff;
+	} else {
+		desc->slice = 0;
+	}
+	desc->status = ~0;
+	desc->offset = (blk_rq_pos(req) << 9) / port->vdisk_block_size;
+	desc->size = len;
+	desc->ncookies = err;
+
+	/* This has to be a non-SMP write barrier because we are writing
+	 * to memory which is shared with the peer LDOM.
+	 */
+	wmb();
+	desc->hdr.state = VIO_DESC_READY;
+
+	err = __vdc_tx_trigger(port);
+	if (err < 0) {
+		printk(KERN_ERR PFX "vdc_tx_trigger() failure, err=%d\n", err);
+	} else {
+		port->req_id++;
+		dr->prod = vio_dring_next(dr, dr->prod);
+	}
+
+	return err;
+}
+
+static blk_status_t vdc_queue_rq(struct blk_mq_hw_ctx *hctx,
+				 const struct blk_mq_queue_data *bd)
+{
+	struct vdc_port *port = hctx->queue->queuedata;
+	struct vio_dring_state *dr;
+	unsigned long flags;
+
+	dr = &port->vio.drings[VIO_DRIVER_TX_RING];
+
+	blk_mq_start_request(bd->rq);
+
+	spin_lock_irqsave(&port->vio.lock, flags);
+
+	/*
+	 * Doing drain, just end the request in error
+	 */
+	if (unlikely(port->drain)) {
+		spin_unlock_irqrestore(&port->vio.lock, flags);
+		return BLK_STS_IOERR;
+	}
+
+	if (unlikely(vdc_tx_dring_avail(dr) < 1)) {
+		spin_unlock_irqrestore(&port->vio.lock, flags);
+		blk_mq_stop_hw_queue(hctx);
+		return BLK_STS_DEV_RESOURCE;
+	}
+
+	if (__send_request(bd->rq) < 0) {
+		spin_unlock_irqrestore(&port->vio.lock, flags);
+		return BLK_STS_IOERR;
+	}
+
+	spin_unlock_irqrestore(&port->vio.lock, flags);
+	return BLK_STS_OK;
+}
+
+static int generic_request(struct vdc_port *port, u8 op, void *buf, int len)
+{
+	struct vio_dring_state *dr;
+	struct vio_completion comp;
+	struct vio_disk_desc *desc;
+	unsigned int map_perm;
+	unsigned long flags;
+	int op_len, err;
+	void *req_buf;
+
+	if (!(((u64)1 << (u64)op) & port->operations))
+		return -EOPNOTSUPP;
+
+	switch (op) {
+	case VD_OP_BREAD:
+	case VD_OP_BWRITE:
+	default:
+		return -EINVAL;
+
+	case VD_OP_FLUSH:
+		op_len = 0;
+		map_perm = 0;
+		break;
+
+	case VD_OP_GET_WCE:
+		op_len = sizeof(u32);
+		map_perm = LDC_MAP_W;
+		break;
+
+	case VD_OP_SET_WCE:
+		op_len = sizeof(u32);
+		map_perm = LDC_MAP_R;
+		break;
+
+	case VD_OP_GET_VTOC:
+		op_len = sizeof(struct vio_disk_vtoc);
+		map_perm = LDC_MAP_W;
+		break;
+
+	case VD_OP_SET_VTOC:
+		op_len = sizeof(struct vio_disk_vtoc);
+		map_perm = LDC_MAP_R;
+		break;
+
+	case VD_OP_GET_DISKGEOM:
+		op_len = sizeof(struct vio_disk_geom);
+		map_perm = LDC_MAP_W;
+		break;
+
+	case VD_OP_SET_DISKGEOM:
+		op_len = sizeof(struct vio_disk_geom);
+		map_perm = LDC_MAP_R;
+		break;
+
+	case VD_OP_SCSICMD:
+		op_len = 16;
+		map_perm = LDC_MAP_RW;
+		break;
+
+	case VD_OP_GET_DEVID:
+		op_len = sizeof(struct vio_disk_devid);
+		map_perm = LDC_MAP_W;
+		break;
+
+	case VD_OP_GET_EFI:
+	case VD_OP_SET_EFI:
+		return -EOPNOTSUPP;
+	}
+
+	map_perm |= LDC_MAP_SHADOW | LDC_MAP_DIRECT | LDC_MAP_IO;
+
+	op_len = (op_len + 7) & ~7;
+	req_buf = kzalloc(op_len, GFP_KERNEL);
+	if (!req_buf)
+		return -ENOMEM;
+
+	if (len > op_len)
+		len = op_len;
+
+	if (map_perm & LDC_MAP_R)
+		memcpy(req_buf, buf, len);
+
+	spin_lock_irqsave(&port->vio.lock, flags);
+
+	dr = &port->vio.drings[VIO_DRIVER_TX_RING];
+
+	/* XXX If we want to use this code generically we have to
+	 * XXX handle TX ring exhaustion etc.
+	 */
+	desc = vio_dring_cur(dr);
+
+	err = ldc_map_single(port->vio.lp, req_buf, op_len,
+			     desc->cookies, port->ring_cookies,
+			     map_perm);
+	if (err < 0) {
+		spin_unlock_irqrestore(&port->vio.lock, flags);
+		kfree(req_buf);
+		return err;
+	}
+
+	init_completion(&comp.com);
+	comp.waiting_for = WAITING_FOR_GEN_CMD;
+	port->vio.cmp = &comp;
+
+	desc->hdr.ack = VIO_ACK_ENABLE;
+	desc->req_id = port->req_id;
+	desc->operation = op;
+	desc->slice = 0;
+	desc->status = ~0;
+	desc->offset = 0;
+	desc->size = op_len;
+	desc->ncookies = err;
+
+	/* This has to be a non-SMP write barrier because we are writing
+	 * to memory which is shared with the peer LDOM.
+	 */
+	wmb();
+	desc->hdr.state = VIO_DESC_READY;
+
+	err = __vdc_tx_trigger(port);
+	if (err >= 0) {
+		port->req_id++;
+		dr->prod = vio_dring_next(dr, dr->prod);
+		spin_unlock_irqrestore(&port->vio.lock, flags);
+
+		wait_for_completion(&comp.com);
+		err = comp.err;
+	} else {
+		port->vio.cmp = NULL;
+		spin_unlock_irqrestore(&port->vio.lock, flags);
+	}
+
+	if (map_perm & LDC_MAP_W)
+		memcpy(buf, req_buf, len);
+
+	kfree(req_buf);
+
+	return err;
+}
+
+static int vdc_alloc_tx_ring(struct vdc_port *port)
+{
+	struct vio_dring_state *dr = &port->vio.drings[VIO_DRIVER_TX_RING];
+	unsigned long len, entry_size;
+	int ncookies;
+	void *dring;
+
+	entry_size = sizeof(struct vio_disk_desc) +
+		(sizeof(struct ldc_trans_cookie) * port->ring_cookies);
+	len = (VDC_TX_RING_SIZE * entry_size);
+
+	ncookies = VIO_MAX_RING_COOKIES;
+	dring = ldc_alloc_exp_dring(port->vio.lp, len,
+				    dr->cookies, &ncookies,
+				    (LDC_MAP_SHADOW |
+				     LDC_MAP_DIRECT |
+				     LDC_MAP_RW));
+	if (IS_ERR(dring))
+		return PTR_ERR(dring);
+
+	dr->base = dring;
+	dr->entry_size = entry_size;
+	dr->num_entries = VDC_TX_RING_SIZE;
+	dr->prod = dr->cons = 0;
+	dr->pending = VDC_TX_RING_SIZE;
+	dr->ncookies = ncookies;
+
+	return 0;
+}
+
+static void vdc_free_tx_ring(struct vdc_port *port)
+{
+	struct vio_dring_state *dr = &port->vio.drings[VIO_DRIVER_TX_RING];
+
+	if (dr->base) {
+		ldc_free_exp_dring(port->vio.lp, dr->base,
+				   (dr->entry_size * dr->num_entries),
+				   dr->cookies, dr->ncookies);
+		dr->base = NULL;
+		dr->entry_size = 0;
+		dr->num_entries = 0;
+		dr->pending = 0;
+		dr->ncookies = 0;
+	}
+}
+
+static int vdc_port_up(struct vdc_port *port)
+{
+	struct vio_completion comp;
+
+	init_completion(&comp.com);
+	comp.err = 0;
+	comp.waiting_for = WAITING_FOR_LINK_UP;
+	port->vio.cmp = &comp;
+
+	vio_port_up(&port->vio);
+	wait_for_completion(&comp.com);
+	return comp.err;
+}
+
+static void vdc_port_down(struct vdc_port *port)
+{
+	ldc_disconnect(port->vio.lp);
+	ldc_unbind(port->vio.lp);
+	vdc_free_tx_ring(port);
+	vio_ldc_free(&port->vio);
+}
+
+static const struct blk_mq_ops vdc_mq_ops = {
+	.queue_rq	= vdc_queue_rq,
+};
+
+static int probe_disk(struct vdc_port *port)
+{
+	struct request_queue *q;
+	struct gendisk *g;
+	int err;
+
+	err = vdc_port_up(port);
+	if (err)
+		return err;
+
+	/* Using version 1.2 means vdisk_phys_blksz should be set unless the
+	 * disk is reserved by another system.
+	 */
+	if (vdc_version_supported(port, 1, 2) && !port->vdisk_phys_blksz)
+		return -ENODEV;
+
+	if (vdc_version_supported(port, 1, 1)) {
+		/* vdisk_size should be set during the handshake, if it wasn't
+		 * then the underlying disk is reserved by another system
+		 */
+		if (port->vdisk_size == -1)
+			return -ENODEV;
+	} else {
+		struct vio_disk_geom geom;
+
+		err = generic_request(port, VD_OP_GET_DISKGEOM,
+				      &geom, sizeof(geom));
+		if (err < 0) {
+			printk(KERN_ERR PFX "VD_OP_GET_DISKGEOM returns "
+			       "error %d\n", err);
+			return err;
+		}
+		port->vdisk_size = ((u64)geom.num_cyl *
+				    (u64)geom.num_hd *
+				    (u64)geom.num_sec);
+	}
+
+	err = blk_mq_alloc_sq_tag_set(&port->tag_set, &vdc_mq_ops,
+			VDC_TX_RING_SIZE, BLK_MQ_F_SHOULD_MERGE);
+	if (err)
+		return err;
+
+	g = blk_mq_alloc_disk(&port->tag_set, port);
+	if (IS_ERR(g)) {
+		printk(KERN_ERR PFX "%s: Could not allocate gendisk.\n",
+		       port->vio.name);
+		err = PTR_ERR(g);
+		goto out_free_tag;
+	}
+
+	port->disk = g;
+	q = g->queue;
+
+	/* Each segment in a request is up to an aligned page in size. */
+	blk_queue_segment_boundary(q, PAGE_SIZE - 1);
+	blk_queue_max_segment_size(q, PAGE_SIZE);
+
+	blk_queue_max_segments(q, port->ring_cookies);
+	blk_queue_max_hw_sectors(q, port->max_xfer_size);
+	g->major = vdc_major;
+	g->first_minor = port->vio.vdev->dev_no << PARTITION_SHIFT;
+	g->minors = 1 << PARTITION_SHIFT;
+	strcpy(g->disk_name, port->disk_name);
+
+	g->fops = &vdc_fops;
+	g->queue = q;
+	g->private_data = port;
+
+	set_capacity(g, port->vdisk_size);
+
+	if (vdc_version_supported(port, 1, 1)) {
+		switch (port->vdisk_mtype) {
+		case VD_MEDIA_TYPE_CD:
+			pr_info(PFX "Virtual CDROM %s\n", port->disk_name);
+			g->flags |= GENHD_FL_REMOVABLE;
+			set_disk_ro(g, 1);
+			break;
+
+		case VD_MEDIA_TYPE_DVD:
+			pr_info(PFX "Virtual DVD %s\n", port->disk_name);
+			g->flags |= GENHD_FL_REMOVABLE;
+			set_disk_ro(g, 1);
+			break;
+
+		case VD_MEDIA_TYPE_FIXED:
+			pr_info(PFX "Virtual Hard disk %s\n", port->disk_name);
+			break;
+		}
+	}
+
+	blk_queue_physical_block_size(q, port->vdisk_phys_blksz);
+
+	pr_info(PFX "%s: %u sectors (%u MB) protocol %d.%d\n",
+	       g->disk_name,
+	       port->vdisk_size, (port->vdisk_size >> (20 - 9)),
+	       port->vio.ver.major, port->vio.ver.minor);
+
+	err = device_add_disk(&port->vio.vdev->dev, g, NULL);
+	if (err)
+		goto out_cleanup_disk;
+
+	return 0;
+
+out_cleanup_disk:
+	put_disk(g);
+out_free_tag:
+	blk_mq_free_tag_set(&port->tag_set);
+	return err;
+}
+
+static struct ldc_channel_config vdc_ldc_cfg = {
+	.event		= vdc_event,
+	.mtu		= 64,
+	.mode		= LDC_MODE_UNRELIABLE,
+};
+
+static struct vio_driver_ops vdc_vio_ops = {
+	.send_attr		= vdc_send_attr,
+	.handle_attr		= vdc_handle_attr,
+	.handshake_complete	= vdc_handshake_complete,
+};
+
+static void print_version(void)
+{
+	static int version_printed;
+
+	if (version_printed++ == 0)
+		printk(KERN_INFO "%s", version);
+}
+
+struct vdc_check_port_data {
+	int	dev_no;
+	char	*type;
+};
+
+static int vdc_device_probed(struct device *dev, void *arg)
+{
+	struct vio_dev *vdev = to_vio_dev(dev);
+	struct vdc_check_port_data *port_data;
+
+	port_data = (struct vdc_check_port_data *)arg;
+
+	if ((vdev->dev_no == port_data->dev_no) &&
+	    (!(strcmp((char *)&vdev->type, port_data->type))) &&
+		dev_get_drvdata(dev)) {
+		/* This device has already been configured
+		 * by vdc_port_probe()
+		 */
+		return 1;
+	} else {
+		return 0;
+	}
+}
+
+/* Determine whether the VIO device is part of an mpgroup
+ * by locating all the virtual-device-port nodes associated
+ * with the parent virtual-device node for the VIO device
+ * and checking whether any of these nodes are vdc-ports
+ * which have already been configured.
+ *
+ * Returns true if this device is part of an mpgroup and has
+ * already been probed.
+ */
+static bool vdc_port_mpgroup_check(struct vio_dev *vdev)
+{
+	struct vdc_check_port_data port_data;
+	struct device *dev;
+
+	port_data.dev_no = vdev->dev_no;
+	port_data.type = (char *)&vdev->type;
+
+	dev = device_find_child(vdev->dev.parent, &port_data,
+				vdc_device_probed);
+
+	if (dev)
+		return true;
+
+	return false;
+}
+
+static int vdc_port_probe(struct vio_dev *vdev, const struct vio_device_id *id)
+{
+	struct mdesc_handle *hp;
+	struct vdc_port *port;
+	int err;
+	const u64 *ldc_timeout;
+
+	print_version();
+
+	hp = mdesc_grab();
+	if (!hp)
+		return -ENODEV;
+
+	err = -ENODEV;
+	if ((vdev->dev_no << PARTITION_SHIFT) & ~(u64)MINORMASK) {
+		printk(KERN_ERR PFX "Port id [%llu] too large.\n",
+		       vdev->dev_no);
+		goto err_out_release_mdesc;
+	}
+
+	/* Check if this device is part of an mpgroup */
+	if (vdc_port_mpgroup_check(vdev)) {
+		printk(KERN_WARNING
+			"VIO: Ignoring extra vdisk port %s",
+			dev_name(&vdev->dev));
+		goto err_out_release_mdesc;
+	}
+
+	port = kzalloc(sizeof(*port), GFP_KERNEL);
+	if (!port) {
+		err = -ENOMEM;
+		goto err_out_release_mdesc;
+	}
+
+	if (vdev->dev_no >= 26)
+		snprintf(port->disk_name, sizeof(port->disk_name),
+			 VDCBLK_NAME "%c%c",
+			 'a' + ((int)vdev->dev_no / 26) - 1,
+			 'a' + ((int)vdev->dev_no % 26));
+	else
+		snprintf(port->disk_name, sizeof(port->disk_name),
+			 VDCBLK_NAME "%c", 'a' + ((int)vdev->dev_no % 26));
+	port->vdisk_size = -1;
+
+	/* Actual wall time may be double due to do_generic_file_read() doing
+	 * a readahead I/O first, and once that fails it will try to read a
+	 * single page.
+	 */
+	ldc_timeout = mdesc_get_property(hp, vdev->mp, "vdc-timeout", NULL);
+	port->ldc_timeout = ldc_timeout ? *ldc_timeout : 0;
+	INIT_DELAYED_WORK(&port->ldc_reset_timer_work, vdc_ldc_reset_timer_work);
+	INIT_WORK(&port->ldc_reset_work, vdc_ldc_reset_work);
+
+	err = vio_driver_init(&port->vio, vdev, VDEV_DISK,
+			      vdc_versions, ARRAY_SIZE(vdc_versions),
+			      &vdc_vio_ops, port->disk_name);
+	if (err)
+		goto err_out_free_port;
+
+	port->vdisk_block_size = VDC_DEFAULT_BLK_SIZE;
+	port->max_xfer_size = MAX_XFER_SIZE;
+	port->ring_cookies = MAX_RING_COOKIES;
+
+	err = vio_ldc_alloc(&port->vio, &vdc_ldc_cfg, port);
+	if (err)
+		goto err_out_free_port;
+
+	err = vdc_alloc_tx_ring(port);
+	if (err)
+		goto err_out_free_ldc;
+
+	err = probe_disk(port);
+	if (err)
+		goto err_out_free_tx_ring;
+
+	/* Note that the device driver_data is used to determine
+	 * whether the port has been probed.
+	 */
+	dev_set_drvdata(&vdev->dev, port);
+
+	mdesc_release(hp);
+
+	return 0;
+
+err_out_free_tx_ring:
+	vdc_free_tx_ring(port);
+
+err_out_free_ldc:
+	vio_ldc_free(&port->vio);
+
+err_out_free_port:
+	kfree(port);
+
+err_out_release_mdesc:
+	mdesc_release(hp);
+	return err;
+}
+
+static void vdc_port_remove(struct vio_dev *vdev)
+{
+	struct vdc_port *port = dev_get_drvdata(&vdev->dev);
+
+	if (port) {
+		blk_mq_stop_hw_queues(port->disk->queue);
+
+		flush_work(&port->ldc_reset_work);
+		cancel_delayed_work_sync(&port->ldc_reset_timer_work);
+		del_timer_sync(&port->vio.timer);
+
+		del_gendisk(port->disk);
+		put_disk(port->disk);
+		blk_mq_free_tag_set(&port->tag_set);
+
+		vdc_free_tx_ring(port);
+		vio_ldc_free(&port->vio);
+
+		dev_set_drvdata(&vdev->dev, NULL);
+
+		kfree(port);
+	}
+}
+
+static void vdc_requeue_inflight(struct vdc_port *port)
+{
+	struct vio_dring_state *dr = &port->vio.drings[VIO_DRIVER_TX_RING];
+	u32 idx;
+
+	for (idx = dr->cons; idx != dr->prod; idx = vio_dring_next(dr, idx)) {
+		struct vio_disk_desc *desc = vio_dring_entry(dr, idx);
+		struct vdc_req_entry *rqe = &port->rq_arr[idx];
+		struct request *req;
+
+		ldc_unmap(port->vio.lp, desc->cookies, desc->ncookies);
+		desc->hdr.state = VIO_DESC_FREE;
+		dr->cons = vio_dring_next(dr, idx);
+
+		req = rqe->req;
+		if (req == NULL) {
+			vdc_end_special(port, desc);
+			continue;
+		}
+
+		rqe->req = NULL;
+		blk_mq_requeue_request(req, false);
+	}
+}
+
+static void vdc_queue_drain(struct vdc_port *port)
+{
+	struct request_queue *q = port->disk->queue;
+
+	/*
+	 * Mark the queue as draining, then freeze/quiesce to ensure
+	 * that all existing requests are seen in ->queue_rq() and killed
+	 */
+	port->drain = 1;
+	spin_unlock_irq(&port->vio.lock);
+
+	blk_mq_freeze_queue(q);
+	blk_mq_quiesce_queue(q);
+
+	spin_lock_irq(&port->vio.lock);
+	port->drain = 0;
+	blk_mq_unquiesce_queue(q);
+	blk_mq_unfreeze_queue(q);
+}
+
+static void vdc_ldc_reset_timer_work(struct work_struct *work)
+{
+	struct vdc_port *port;
+	struct vio_driver_state *vio;
+
+	port = container_of(work, struct vdc_port, ldc_reset_timer_work.work);
+	vio = &port->vio;
+
+	spin_lock_irq(&vio->lock);
+	if (!(port->vio.hs_state & VIO_HS_COMPLETE)) {
+		pr_warn(PFX "%s ldc down %llu seconds, draining queue\n",
+			port->disk_name, port->ldc_timeout);
+		vdc_queue_drain(port);
+		vdc_blk_queue_start(port);
+	}
+	spin_unlock_irq(&vio->lock);
+}
+
+static void vdc_ldc_reset_work(struct work_struct *work)
+{
+	struct vdc_port *port;
+	struct vio_driver_state *vio;
+	unsigned long flags;
+
+	port = container_of(work, struct vdc_port, ldc_reset_work);
+	vio = &port->vio;
+
+	spin_lock_irqsave(&vio->lock, flags);
+	vdc_ldc_reset(port);
+	spin_unlock_irqrestore(&vio->lock, flags);
+}
+
+static void vdc_ldc_reset(struct vdc_port *port)
+{
+	int err;
+
+	assert_spin_locked(&port->vio.lock);
+
+	pr_warn(PFX "%s ldc link reset\n", port->disk_name);
+	blk_mq_stop_hw_queues(port->disk->queue);
+	vdc_requeue_inflight(port);
+	vdc_port_down(port);
+
+	err = vio_ldc_alloc(&port->vio, &vdc_ldc_cfg, port);
+	if (err) {
+		pr_err(PFX "%s vio_ldc_alloc:%d\n", port->disk_name, err);
+		return;
+	}
+
+	err = vdc_alloc_tx_ring(port);
+	if (err) {
+		pr_err(PFX "%s vio_alloc_tx_ring:%d\n", port->disk_name, err);
+		goto err_free_ldc;
+	}
+
+	if (port->ldc_timeout)
+		mod_delayed_work(system_wq, &port->ldc_reset_timer_work,
+			  round_jiffies(jiffies + HZ * port->ldc_timeout));
+	mod_timer(&port->vio.timer, round_jiffies(jiffies + HZ));
+	return;
+
+err_free_ldc:
+	vio_ldc_free(&port->vio);
+}
+
+static const struct vio_device_id vdc_port_match[] = {
+	{
+		.type = "vdc-port",
+	},
+	{},
+};
+MODULE_DEVICE_TABLE(vio, vdc_port_match);
+
+static struct vio_driver vdc_port_driver = {
+	.id_table	= vdc_port_match,
+	.probe		= vdc_port_probe,
+	.remove		= vdc_port_remove,
+	.name		= "vdc_port",
+};
+
+static int __init vdc_init(void)
+{
+	int err;
+
+	sunvdc_wq = alloc_workqueue("sunvdc", 0, 0);
+	if (!sunvdc_wq)
+		return -ENOMEM;
+
+	err = register_blkdev(0, VDCBLK_NAME);
+	if (err < 0)
+		goto out_free_wq;
+
+	vdc_major = err;
+
+	err = vio_register_driver(&vdc_port_driver);
+	if (err)
+		goto out_unregister_blkdev;
+
+	return 0;
+
+out_unregister_blkdev:
+	unregister_blkdev(vdc_major, VDCBLK_NAME);
+	vdc_major = 0;
+
+out_free_wq:
+	destroy_workqueue(sunvdc_wq);
+	return err;
+}
+
+static void __exit vdc_exit(void)
+{
+	vio_unregister_driver(&vdc_port_driver);
+	unregister_blkdev(vdc_major, VDCBLK_NAME);
+	destroy_workqueue(sunvdc_wq);
+}
+
+module_init(vdc_init);
+module_exit(vdc_exit);
diff --git a/drivers/block/swim.c b/drivers/block/swim.c
new file mode 100644
index 000000000..42b4b6828
--- /dev/null
+++ b/drivers/block/swim.c
@@ -0,0 +1,971 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Driver for SWIM (Sander Woz Integrated Machine) floppy controller
+ *
+ * Copyright (C) 2004,2008 Laurent Vivier <Laurent@lvivier.info>
+ *
+ * based on Alastair Bridgewater SWIM analysis, 2001
+ * based on SWIM3 driver (c) Paul Mackerras, 1996
+ * based on netBSD IWM driver (c) 1997, 1998 Hauke Fath.
+ *
+ * 2004-08-21 (lv) - Initial implementation
+ * 2008-10-30 (lv) - Port to 2.6
+ */
+
+#include <linux/module.h>
+#include <linux/fd.h>
+#include <linux/slab.h>
+#include <linux/blk-mq.h>
+#include <linux/major.h>
+#include <linux/mutex.h>
+#include <linux/hdreg.h>
+#include <linux/kernel.h>
+#include <linux/delay.h>
+#include <linux/platform_device.h>
+
+#include <asm/mac_via.h>
+
+#define CARDNAME "swim"
+
+struct sector_header {
+	unsigned char side;
+	unsigned char track;
+	unsigned char sector;
+	unsigned char size;
+	unsigned char crc0;
+	unsigned char crc1;
+} __attribute__((packed));
+
+#define DRIVER_VERSION "Version 0.2 (2008-10-30)"
+
+#define REG(x)	unsigned char x, x ## _pad[0x200 - 1];
+
+struct swim {
+	REG(write_data)
+	REG(write_mark)
+	REG(write_CRC)
+	REG(write_parameter)
+	REG(write_phase)
+	REG(write_setup)
+	REG(write_mode0)
+	REG(write_mode1)
+
+	REG(read_data)
+	REG(read_mark)
+	REG(read_error)
+	REG(read_parameter)
+	REG(read_phase)
+	REG(read_setup)
+	REG(read_status)
+	REG(read_handshake)
+} __attribute__((packed));
+
+#define swim_write(base, reg, v) 	out_8(&(base)->write_##reg, (v))
+#define swim_read(base, reg)		in_8(&(base)->read_##reg)
+
+/* IWM registers */
+
+struct iwm {
+	REG(ph0L)
+	REG(ph0H)
+	REG(ph1L)
+	REG(ph1H)
+	REG(ph2L)
+	REG(ph2H)
+	REG(ph3L)
+	REG(ph3H)
+	REG(mtrOff)
+	REG(mtrOn)
+	REG(intDrive)
+	REG(extDrive)
+	REG(q6L)
+	REG(q6H)
+	REG(q7L)
+	REG(q7H)
+} __attribute__((packed));
+
+#define iwm_write(base, reg, v) 	out_8(&(base)->reg, (v))
+#define iwm_read(base, reg)		in_8(&(base)->reg)
+
+/* bits in phase register */
+
+#define SEEK_POSITIVE	0x070
+#define SEEK_NEGATIVE	0x074
+#define STEP		0x071
+#define MOTOR_ON	0x072
+#define MOTOR_OFF	0x076
+#define INDEX		0x073
+#define EJECT		0x077
+#define SETMFM		0x171
+#define SETGCR		0x175
+
+#define RELAX		0x033
+#define LSTRB		0x008
+
+#define CA_MASK		0x077
+
+/* Select values for swim_select and swim_readbit */
+
+#define READ_DATA_0	0x074
+#define ONEMEG_DRIVE	0x075
+#define SINGLE_SIDED	0x076
+#define DRIVE_PRESENT	0x077
+#define DISK_IN		0x170
+#define WRITE_PROT	0x171
+#define TRACK_ZERO	0x172
+#define TACHO		0x173
+#define READ_DATA_1	0x174
+#define GCR_MODE	0x175
+#define SEEK_COMPLETE	0x176
+#define TWOMEG_MEDIA	0x177
+
+/* Bits in handshake register */
+
+#define MARK_BYTE	0x01
+#define CRC_ZERO	0x02
+#define RDDATA		0x04
+#define SENSE		0x08
+#define MOTEN		0x10
+#define ERROR		0x20
+#define DAT2BYTE	0x40
+#define DAT1BYTE	0x80
+
+/* bits in setup register */
+
+#define S_INV_WDATA	0x01
+#define S_3_5_SELECT	0x02
+#define S_GCR		0x04
+#define S_FCLK_DIV2	0x08
+#define S_ERROR_CORR	0x10
+#define S_IBM_DRIVE	0x20
+#define S_GCR_WRITE	0x40
+#define S_TIMEOUT	0x80
+
+/* bits in mode register */
+
+#define CLFIFO		0x01
+#define ENBL1		0x02
+#define ENBL2		0x04
+#define ACTION		0x08
+#define WRITE_MODE	0x10
+#define HEDSEL		0x20
+#define MOTON		0x80
+
+/*----------------------------------------------------------------------------*/
+
+enum drive_location {
+	INTERNAL_DRIVE = 0x02,
+	EXTERNAL_DRIVE = 0x04,
+};
+
+enum media_type {
+	DD_MEDIA,
+	HD_MEDIA,
+};
+
+struct floppy_state {
+
+	/* physical properties */
+
+	enum drive_location location;	/* internal or external drive */
+	int		 head_number;	/* single- or double-sided drive */
+
+	/* media */
+
+	int		 disk_in;
+	int		 ejected;
+	enum media_type	 type;
+	int		 write_protected;
+
+	int		 total_secs;
+	int		 secpercyl;
+	int		 secpertrack;
+
+	/* in-use information */
+
+	int		track;
+	int		ref_count;
+	bool registered;
+
+	struct gendisk *disk;
+	struct blk_mq_tag_set tag_set;
+
+	/* parent controller */
+
+	struct swim_priv *swd;
+};
+
+enum motor_action {
+	OFF,
+	ON,
+};
+
+enum head {
+	LOWER_HEAD = 0,
+	UPPER_HEAD = 1,
+};
+
+#define FD_MAX_UNIT	2
+
+struct swim_priv {
+	struct swim __iomem *base;
+	spinlock_t lock;
+	int floppy_count;
+	struct floppy_state unit[FD_MAX_UNIT];
+};
+
+extern int swim_read_sector_header(struct swim __iomem *base,
+				   struct sector_header *header);
+extern int swim_read_sector_data(struct swim __iomem *base,
+				 unsigned char *data);
+
+static DEFINE_MUTEX(swim_mutex);
+static inline void set_swim_mode(struct swim __iomem *base, int enable)
+{
+	struct iwm __iomem *iwm_base;
+	unsigned long flags;
+
+	if (!enable) {
+		swim_write(base, mode0, 0xf8);
+		return;
+	}
+
+	iwm_base = (struct iwm __iomem *)base;
+	local_irq_save(flags);
+
+	iwm_read(iwm_base, q7L);
+	iwm_read(iwm_base, mtrOff);
+	iwm_read(iwm_base, q6H);
+
+	iwm_write(iwm_base, q7H, 0x57);
+	iwm_write(iwm_base, q7H, 0x17);
+	iwm_write(iwm_base, q7H, 0x57);
+	iwm_write(iwm_base, q7H, 0x57);
+
+	local_irq_restore(flags);
+}
+
+static inline int get_swim_mode(struct swim __iomem *base)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+
+	swim_write(base, phase, 0xf5);
+	if (swim_read(base, phase) != 0xf5)
+		goto is_iwm;
+	swim_write(base, phase, 0xf6);
+	if (swim_read(base, phase) != 0xf6)
+		goto is_iwm;
+	swim_write(base, phase, 0xf7);
+	if (swim_read(base, phase) != 0xf7)
+		goto is_iwm;
+	local_irq_restore(flags);
+	return 1;
+is_iwm:
+	local_irq_restore(flags);
+	return 0;
+}
+
+static inline void swim_select(struct swim __iomem *base, int sel)
+{
+	swim_write(base, phase, RELAX);
+
+	via1_set_head(sel & 0x100);
+
+	swim_write(base, phase, sel & CA_MASK);
+}
+
+static inline void swim_action(struct swim __iomem *base, int action)
+{
+	unsigned long flags;
+
+	local_irq_save(flags);
+
+	swim_select(base, action);
+	udelay(1);
+	swim_write(base, phase, (LSTRB<<4) | LSTRB);
+	udelay(1);
+	swim_write(base, phase, (LSTRB<<4) | ((~LSTRB) & 0x0F));
+	udelay(1);
+
+	local_irq_restore(flags);
+}
+
+static inline int swim_readbit(struct swim __iomem *base, int bit)
+{
+	int stat;
+
+	swim_select(base, bit);
+
+	udelay(10);
+
+	stat = swim_read(base, handshake);
+
+	return (stat & SENSE) == 0;
+}
+
+static inline void swim_drive(struct swim __iomem *base,
+			      enum drive_location location)
+{
+	if (location == INTERNAL_DRIVE) {
+		swim_write(base, mode0, EXTERNAL_DRIVE); /* clear drive 1 bit */
+		swim_write(base, mode1, INTERNAL_DRIVE); /* set drive 0 bit */
+	} else if (location == EXTERNAL_DRIVE) {
+		swim_write(base, mode0, INTERNAL_DRIVE); /* clear drive 0 bit */
+		swim_write(base, mode1, EXTERNAL_DRIVE); /* set drive 1 bit */
+	}
+}
+
+static inline void swim_motor(struct swim __iomem *base,
+			      enum motor_action action)
+{
+	if (action == ON) {
+		int i;
+
+		swim_action(base, MOTOR_ON);
+
+		for (i = 0; i < 2*HZ; i++) {
+			swim_select(base, RELAX);
+			if (swim_readbit(base, MOTOR_ON))
+				break;
+			set_current_state(TASK_INTERRUPTIBLE);
+			schedule_timeout(1);
+		}
+	} else if (action == OFF) {
+		swim_action(base, MOTOR_OFF);
+		swim_select(base, RELAX);
+	}
+}
+
+static inline void swim_eject(struct swim __iomem *base)
+{
+	int i;
+
+	swim_action(base, EJECT);
+
+	for (i = 0; i < 2*HZ; i++) {
+		swim_select(base, RELAX);
+		if (!swim_readbit(base, DISK_IN))
+			break;
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(1);
+	}
+	swim_select(base, RELAX);
+}
+
+static inline void swim_head(struct swim __iomem *base, enum head head)
+{
+	/* wait drive is ready */
+
+	if (head == UPPER_HEAD)
+		swim_select(base, READ_DATA_1);
+	else if (head == LOWER_HEAD)
+		swim_select(base, READ_DATA_0);
+}
+
+static inline int swim_step(struct swim __iomem *base)
+{
+	int wait;
+
+	swim_action(base, STEP);
+
+	for (wait = 0; wait < HZ; wait++) {
+
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(1);
+
+		swim_select(base, RELAX);
+		if (!swim_readbit(base, STEP))
+			return 0;
+	}
+	return -1;
+}
+
+static inline int swim_track00(struct swim __iomem *base)
+{
+	int try;
+
+	swim_action(base, SEEK_NEGATIVE);
+
+	for (try = 0; try < 100; try++) {
+
+		swim_select(base, RELAX);
+		if (swim_readbit(base, TRACK_ZERO))
+			break;
+
+		if (swim_step(base))
+			return -1;
+	}
+
+	if (swim_readbit(base, TRACK_ZERO))
+		return 0;
+
+	return -1;
+}
+
+static inline int swim_seek(struct swim __iomem *base, int step)
+{
+	if (step == 0)
+		return 0;
+
+	if (step < 0) {
+		swim_action(base, SEEK_NEGATIVE);
+		step = -step;
+	} else
+		swim_action(base, SEEK_POSITIVE);
+
+	for ( ; step > 0; step--) {
+		if (swim_step(base))
+			return -1;
+	}
+
+	return 0;
+}
+
+static inline int swim_track(struct floppy_state *fs,  int track)
+{
+	struct swim __iomem *base = fs->swd->base;
+	int ret;
+
+	ret = swim_seek(base, track - fs->track);
+
+	if (ret == 0)
+		fs->track = track;
+	else {
+		swim_track00(base);
+		fs->track = 0;
+	}
+
+	return ret;
+}
+
+static int floppy_eject(struct floppy_state *fs)
+{
+	struct swim __iomem *base = fs->swd->base;
+
+	swim_drive(base, fs->location);
+	swim_motor(base, OFF);
+	swim_eject(base);
+
+	fs->disk_in = 0;
+	fs->ejected = 1;
+
+	return 0;
+}
+
+static inline int swim_read_sector(struct floppy_state *fs,
+				   int side, int track,
+				   int sector, unsigned char *buffer)
+{
+	struct swim __iomem *base = fs->swd->base;
+	unsigned long flags;
+	struct sector_header header;
+	int ret = -1;
+	short i;
+
+	swim_track(fs, track);
+
+	swim_write(base, mode1, MOTON);
+	swim_head(base, side);
+	swim_write(base, mode0, side);
+
+	local_irq_save(flags);
+	for (i = 0; i < 36; i++) {
+		ret = swim_read_sector_header(base, &header);
+		if (!ret && (header.sector == sector)) {
+			/* found */
+
+			ret = swim_read_sector_data(base, buffer);
+			break;
+		}
+	}
+	local_irq_restore(flags);
+
+	swim_write(base, mode0, MOTON);
+
+	if ((header.side != side)  || (header.track != track) ||
+	     (header.sector != sector))
+		return 0;
+
+	return ret;
+}
+
+static blk_status_t floppy_read_sectors(struct floppy_state *fs,
+			       int req_sector, int sectors_nb,
+			       unsigned char *buffer)
+{
+	struct swim __iomem *base = fs->swd->base;
+	int ret;
+	int side, track, sector;
+	int i, try;
+
+
+	swim_drive(base, fs->location);
+	for (i = req_sector; i < req_sector + sectors_nb; i++) {
+		int x;
+		track = i / fs->secpercyl;
+		x = i % fs->secpercyl;
+		side = x / fs->secpertrack;
+		sector = x % fs->secpertrack + 1;
+
+		try = 5;
+		do {
+			ret = swim_read_sector(fs, side, track, sector,
+						buffer);
+			if (try-- == 0)
+				return BLK_STS_IOERR;
+		} while (ret != 512);
+
+		buffer += ret;
+	}
+
+	return 0;
+}
+
+static blk_status_t swim_queue_rq(struct blk_mq_hw_ctx *hctx,
+				  const struct blk_mq_queue_data *bd)
+{
+	struct floppy_state *fs = hctx->queue->queuedata;
+	struct swim_priv *swd = fs->swd;
+	struct request *req = bd->rq;
+	blk_status_t err;
+
+	if (!spin_trylock_irq(&swd->lock))
+		return BLK_STS_DEV_RESOURCE;
+
+	blk_mq_start_request(req);
+
+	if (!fs->disk_in || rq_data_dir(req) == WRITE) {
+		err = BLK_STS_IOERR;
+		goto out;
+	}
+
+	do {
+		err = floppy_read_sectors(fs, blk_rq_pos(req),
+					  blk_rq_cur_sectors(req),
+					  bio_data(req->bio));
+	} while (blk_update_request(req, err, blk_rq_cur_bytes(req)));
+	__blk_mq_end_request(req, err);
+
+	err = BLK_STS_OK;
+out:
+	spin_unlock_irq(&swd->lock);
+	return err;
+
+}
+
+static struct floppy_struct floppy_type[4] = {
+	{    0,  0, 0,  0, 0, 0x00, 0x00, 0x00, 0x00, NULL }, /* no testing   */
+	{  720,  9, 1, 80, 0, 0x2A, 0x02, 0xDF, 0x50, NULL }, /* 360KB SS 3.5"*/
+	{ 1440,  9, 2, 80, 0, 0x2A, 0x02, 0xDF, 0x50, NULL }, /* 720KB 3.5"   */
+	{ 2880, 18, 2, 80, 0, 0x1B, 0x00, 0xCF, 0x6C, NULL }, /* 1.44MB 3.5"  */
+};
+
+static int get_floppy_geometry(struct floppy_state *fs, int type,
+			       struct floppy_struct **g)
+{
+	if (type >= ARRAY_SIZE(floppy_type))
+		return -EINVAL;
+
+	if (type)
+		*g = &floppy_type[type];
+	else if (fs->type == HD_MEDIA) /* High-Density media */
+		*g = &floppy_type[3];
+	else if (fs->head_number == 2) /* double-sided */
+		*g = &floppy_type[2];
+	else
+		*g = &floppy_type[1];
+
+	return 0;
+}
+
+static void setup_medium(struct floppy_state *fs)
+{
+	struct swim __iomem *base = fs->swd->base;
+
+	if (swim_readbit(base, DISK_IN)) {
+		struct floppy_struct *g;
+		fs->disk_in = 1;
+		fs->write_protected = swim_readbit(base, WRITE_PROT);
+
+		if (swim_track00(base))
+			printk(KERN_ERR
+				"SWIM: cannot move floppy head to track 0\n");
+
+		swim_track00(base);
+
+		fs->type = swim_readbit(base, TWOMEG_MEDIA) ?
+			HD_MEDIA : DD_MEDIA;
+		fs->head_number = swim_readbit(base, SINGLE_SIDED) ? 1 : 2;
+		get_floppy_geometry(fs, 0, &g);
+		fs->total_secs = g->size;
+		fs->secpercyl = g->head * g->sect;
+		fs->secpertrack = g->sect;
+		fs->track = 0;
+	} else {
+		fs->disk_in = 0;
+	}
+}
+
+static int floppy_open(struct block_device *bdev, fmode_t mode)
+{
+	struct floppy_state *fs = bdev->bd_disk->private_data;
+	struct swim __iomem *base = fs->swd->base;
+	int err;
+
+	if (fs->ref_count == -1 || (fs->ref_count && mode & FMODE_EXCL))
+		return -EBUSY;
+
+	if (mode & FMODE_EXCL)
+		fs->ref_count = -1;
+	else
+		fs->ref_count++;
+
+	swim_write(base, setup, S_IBM_DRIVE  | S_FCLK_DIV2);
+	udelay(10);
+	swim_drive(base, fs->location);
+	swim_motor(base, ON);
+	swim_action(base, SETMFM);
+	if (fs->ejected)
+		setup_medium(fs);
+	if (!fs->disk_in) {
+		err = -ENXIO;
+		goto out;
+	}
+
+	set_capacity(fs->disk, fs->total_secs);
+
+	if (mode & FMODE_NDELAY)
+		return 0;
+
+	if (mode & (FMODE_READ|FMODE_WRITE)) {
+		if (bdev_check_media_change(bdev) && fs->disk_in)
+			fs->ejected = 0;
+		if ((mode & FMODE_WRITE) && fs->write_protected) {
+			err = -EROFS;
+			goto out;
+		}
+	}
+	return 0;
+out:
+	if (fs->ref_count < 0)
+		fs->ref_count = 0;
+	else if (fs->ref_count > 0)
+		--fs->ref_count;
+
+	if (fs->ref_count == 0)
+		swim_motor(base, OFF);
+	return err;
+}
+
+static int floppy_unlocked_open(struct block_device *bdev, fmode_t mode)
+{
+	int ret;
+
+	mutex_lock(&swim_mutex);
+	ret = floppy_open(bdev, mode);
+	mutex_unlock(&swim_mutex);
+
+	return ret;
+}
+
+static void floppy_release(struct gendisk *disk, fmode_t mode)
+{
+	struct floppy_state *fs = disk->private_data;
+	struct swim __iomem *base = fs->swd->base;
+
+	mutex_lock(&swim_mutex);
+	if (fs->ref_count < 0)
+		fs->ref_count = 0;
+	else if (fs->ref_count > 0)
+		--fs->ref_count;
+
+	if (fs->ref_count == 0)
+		swim_motor(base, OFF);
+	mutex_unlock(&swim_mutex);
+}
+
+static int floppy_ioctl(struct block_device *bdev, fmode_t mode,
+			unsigned int cmd, unsigned long param)
+{
+	struct floppy_state *fs = bdev->bd_disk->private_data;
+	int err;
+
+	if ((cmd & 0x80) && !capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+	switch (cmd) {
+	case FDEJECT:
+		if (fs->ref_count != 1)
+			return -EBUSY;
+		mutex_lock(&swim_mutex);
+		err = floppy_eject(fs);
+		mutex_unlock(&swim_mutex);
+		return err;
+
+	case FDGETPRM:
+		if (copy_to_user((void __user *) param, (void *) &floppy_type,
+				 sizeof(struct floppy_struct)))
+			return -EFAULT;
+		return 0;
+	}
+	return -ENOTTY;
+}
+
+static int floppy_getgeo(struct block_device *bdev, struct hd_geometry *geo)
+{
+	struct floppy_state *fs = bdev->bd_disk->private_data;
+	struct floppy_struct *g;
+	int ret;
+
+	ret = get_floppy_geometry(fs, 0, &g);
+	if (ret)
+		return ret;
+
+	geo->heads = g->head;
+	geo->sectors = g->sect;
+	geo->cylinders = g->track;
+
+	return 0;
+}
+
+static unsigned int floppy_check_events(struct gendisk *disk,
+					unsigned int clearing)
+{
+	struct floppy_state *fs = disk->private_data;
+
+	return fs->ejected ? DISK_EVENT_MEDIA_CHANGE : 0;
+}
+
+static const struct block_device_operations floppy_fops = {
+	.owner		 = THIS_MODULE,
+	.open		 = floppy_unlocked_open,
+	.release	 = floppy_release,
+	.ioctl		 = floppy_ioctl,
+	.getgeo		 = floppy_getgeo,
+	.check_events	 = floppy_check_events,
+};
+
+static int swim_add_floppy(struct swim_priv *swd, enum drive_location location)
+{
+	struct floppy_state *fs = &swd->unit[swd->floppy_count];
+	struct swim __iomem *base = swd->base;
+
+	fs->location = location;
+
+	swim_drive(base, location);
+
+	swim_motor(base, OFF);
+
+	fs->type = HD_MEDIA;
+	fs->head_number = 2;
+
+	fs->ref_count = 0;
+	fs->ejected = 1;
+
+	swd->floppy_count++;
+
+	return 0;
+}
+
+static const struct blk_mq_ops swim_mq_ops = {
+	.queue_rq = swim_queue_rq,
+};
+
+static void swim_cleanup_floppy_disk(struct floppy_state *fs)
+{
+	struct gendisk *disk = fs->disk;
+
+	if (!disk)
+		return;
+
+	if (fs->registered)
+		del_gendisk(fs->disk);
+
+	put_disk(disk);
+	blk_mq_free_tag_set(&fs->tag_set);
+}
+
+static int swim_floppy_init(struct swim_priv *swd)
+{
+	int err;
+	int drive;
+	struct swim __iomem *base = swd->base;
+
+	/* scan floppy drives */
+
+	swim_drive(base, INTERNAL_DRIVE);
+	if (swim_readbit(base, DRIVE_PRESENT) &&
+	    !swim_readbit(base, ONEMEG_DRIVE))
+		swim_add_floppy(swd, INTERNAL_DRIVE);
+	swim_drive(base, EXTERNAL_DRIVE);
+	if (swim_readbit(base, DRIVE_PRESENT) &&
+	    !swim_readbit(base, ONEMEG_DRIVE))
+		swim_add_floppy(swd, EXTERNAL_DRIVE);
+
+	/* register floppy drives */
+
+	err = register_blkdev(FLOPPY_MAJOR, "fd");
+	if (err) {
+		printk(KERN_ERR "Unable to get major %d for SWIM floppy\n",
+		       FLOPPY_MAJOR);
+		return -EBUSY;
+	}
+
+	spin_lock_init(&swd->lock);
+
+	for (drive = 0; drive < swd->floppy_count; drive++) {
+		err = blk_mq_alloc_sq_tag_set(&swd->unit[drive].tag_set,
+				&swim_mq_ops, 2, BLK_MQ_F_SHOULD_MERGE);
+		if (err)
+			goto exit_put_disks;
+
+		swd->unit[drive].disk =
+			blk_mq_alloc_disk(&swd->unit[drive].tag_set,
+					  &swd->unit[drive]);
+		if (IS_ERR(swd->unit[drive].disk)) {
+			blk_mq_free_tag_set(&swd->unit[drive].tag_set);
+			err = PTR_ERR(swd->unit[drive].disk);
+			goto exit_put_disks;
+		}
+
+		swd->unit[drive].swd = swd;
+	}
+
+	for (drive = 0; drive < swd->floppy_count; drive++) {
+		swd->unit[drive].disk->flags = GENHD_FL_REMOVABLE;
+		swd->unit[drive].disk->major = FLOPPY_MAJOR;
+		swd->unit[drive].disk->first_minor = drive;
+		swd->unit[drive].disk->minors = 1;
+		sprintf(swd->unit[drive].disk->disk_name, "fd%d", drive);
+		swd->unit[drive].disk->fops = &floppy_fops;
+		swd->unit[drive].disk->flags |= GENHD_FL_NO_PART;
+		swd->unit[drive].disk->events = DISK_EVENT_MEDIA_CHANGE;
+		swd->unit[drive].disk->private_data = &swd->unit[drive];
+		set_capacity(swd->unit[drive].disk, 2880);
+		err = add_disk(swd->unit[drive].disk);
+		if (err)
+			goto exit_put_disks;
+		swd->unit[drive].registered = true;
+	}
+
+	return 0;
+
+exit_put_disks:
+	unregister_blkdev(FLOPPY_MAJOR, "fd");
+	do {
+		swim_cleanup_floppy_disk(&swd->unit[drive]);
+	} while (drive--);
+	return err;
+}
+
+static int swim_probe(struct platform_device *dev)
+{
+	struct resource *res;
+	struct swim __iomem *swim_base;
+	struct swim_priv *swd;
+	int ret;
+
+	res = platform_get_resource(dev, IORESOURCE_MEM, 0);
+	if (!res) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	if (!request_mem_region(res->start, resource_size(res), CARDNAME)) {
+		ret = -EBUSY;
+		goto out;
+	}
+
+	swim_base = (struct swim __iomem *)res->start;
+	if (!swim_base) {
+		ret = -ENOMEM;
+		goto out_release_io;
+	}
+
+	/* probe device */
+
+	set_swim_mode(swim_base, 1);
+	if (!get_swim_mode(swim_base)) {
+		printk(KERN_INFO "SWIM device not found !\n");
+		ret = -ENODEV;
+		goto out_release_io;
+	}
+
+	/* set platform driver data */
+
+	swd = kzalloc(sizeof(struct swim_priv), GFP_KERNEL);
+	if (!swd) {
+		ret = -ENOMEM;
+		goto out_release_io;
+	}
+	platform_set_drvdata(dev, swd);
+
+	swd->base = swim_base;
+
+	ret = swim_floppy_init(swd);
+	if (ret)
+		goto out_kfree;
+
+	return 0;
+
+out_kfree:
+	kfree(swd);
+out_release_io:
+	release_mem_region(res->start, resource_size(res));
+out:
+	return ret;
+}
+
+static int swim_remove(struct platform_device *dev)
+{
+	struct swim_priv *swd = platform_get_drvdata(dev);
+	int drive;
+	struct resource *res;
+
+	for (drive = 0; drive < swd->floppy_count; drive++)
+		swim_cleanup_floppy_disk(&swd->unit[drive]);
+
+	unregister_blkdev(FLOPPY_MAJOR, "fd");
+
+	/* eject floppies */
+
+	for (drive = 0; drive < swd->floppy_count; drive++)
+		floppy_eject(&swd->unit[drive]);
+
+	res = platform_get_resource(dev, IORESOURCE_MEM, 0);
+	if (res)
+		release_mem_region(res->start, resource_size(res));
+
+	kfree(swd);
+
+	return 0;
+}
+
+static struct platform_driver swim_driver = {
+	.probe  = swim_probe,
+	.remove = swim_remove,
+	.driver   = {
+		.name	= CARDNAME,
+	},
+};
+
+static int __init swim_init(void)
+{
+	printk(KERN_INFO "SWIM floppy driver %s\n", DRIVER_VERSION);
+
+	return platform_driver_register(&swim_driver);
+}
+module_init(swim_init);
+
+static void __exit swim_exit(void)
+{
+	platform_driver_unregister(&swim_driver);
+}
+module_exit(swim_exit);
+
+MODULE_DESCRIPTION("Driver for SWIM floppy controller");
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Laurent Vivier <laurent@lvivier.info>");
+MODULE_ALIAS_BLOCKDEV_MAJOR(FLOPPY_MAJOR);
diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c
new file mode 100644
index 000000000..da811a7da
--- /dev/null
+++ b/drivers/block/swim3.c
@@ -0,0 +1,1291 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Driver for the SWIM3 (Super Woz Integrated Machine 3)
+ * floppy controller found on Power Macintoshes.
+ *
+ * Copyright (C) 1996 Paul Mackerras.
+ */
+
+/*
+ * TODO:
+ * handle 2 drives
+ * handle GCR disks
+ */
+
+#undef DEBUG
+
+#include <linux/stddef.h>
+#include <linux/kernel.h>
+#include <linux/sched/signal.h>
+#include <linux/timer.h>
+#include <linux/delay.h>
+#include <linux/fd.h>
+#include <linux/ioctl.h>
+#include <linux/blk-mq.h>
+#include <linux/interrupt.h>
+#include <linux/mutex.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/wait.h>
+#include <linux/major.h>
+#include <asm/io.h>
+#include <asm/dbdma.h>
+#include <asm/prom.h>
+#include <linux/uaccess.h>
+#include <asm/mediabay.h>
+#include <asm/machdep.h>
+#include <asm/pmac_feature.h>
+
+#define MAX_FLOPPIES	2
+
+static DEFINE_MUTEX(swim3_mutex);
+static struct gendisk *disks[MAX_FLOPPIES];
+
+enum swim_state {
+	idle,
+	locating,
+	seeking,
+	settling,
+	do_transfer,
+	jogging,
+	available,
+	revalidating,
+	ejecting
+};
+
+#define REG(x)	unsigned char x; char x ## _pad[15];
+
+/*
+ * The names for these registers mostly represent speculation on my part.
+ * It will be interesting to see how close they are to the names Apple uses.
+ */
+struct swim3 {
+	REG(data);
+	REG(timer);		/* counts down at 1MHz */
+	REG(error);
+	REG(mode);
+	REG(select);		/* controls CA0, CA1, CA2 and LSTRB signals */
+	REG(setup);
+	REG(control);		/* writing bits clears them */
+	REG(status);		/* writing bits sets them in control */
+	REG(intr);
+	REG(nseek);		/* # tracks to seek */
+	REG(ctrack);		/* current track number */
+	REG(csect);		/* current sector number */
+	REG(gap3);		/* size of gap 3 in track format */
+	REG(sector);		/* sector # to read or write */
+	REG(nsect);		/* # sectors to read or write */
+	REG(intr_enable);
+};
+
+#define control_bic	control
+#define control_bis	status
+
+/* Bits in select register */
+#define CA_MASK		7
+#define LSTRB		8
+
+/* Bits in control register */
+#define DO_SEEK		0x80
+#define FORMAT		0x40
+#define SELECT		0x20
+#define WRITE_SECTORS	0x10
+#define DO_ACTION	0x08
+#define DRIVE2_ENABLE	0x04
+#define DRIVE_ENABLE	0x02
+#define INTR_ENABLE	0x01
+
+/* Bits in status register */
+#define FIFO_1BYTE	0x80
+#define FIFO_2BYTE	0x40
+#define ERROR		0x20
+#define DATA		0x08
+#define RDDATA		0x04
+#define INTR_PENDING	0x02
+#define MARK_BYTE	0x01
+
+/* Bits in intr and intr_enable registers */
+#define ERROR_INTR	0x20
+#define DATA_CHANGED	0x10
+#define TRANSFER_DONE	0x08
+#define SEEN_SECTOR	0x04
+#define SEEK_DONE	0x02
+#define TIMER_DONE	0x01
+
+/* Bits in error register */
+#define ERR_DATA_CRC	0x80
+#define ERR_ADDR_CRC	0x40
+#define ERR_OVERRUN	0x04
+#define ERR_UNDERRUN	0x01
+
+/* Bits in setup register */
+#define S_SW_RESET	0x80
+#define S_GCR_WRITE	0x40
+#define S_IBM_DRIVE	0x20
+#define S_TEST_MODE	0x10
+#define S_FCLK_DIV2	0x08
+#define S_GCR		0x04
+#define S_COPY_PROT	0x02
+#define S_INV_WDATA	0x01
+
+/* Select values for swim3_action */
+#define SEEK_POSITIVE	0
+#define SEEK_NEGATIVE	4
+#define STEP		1
+#define MOTOR_ON	2
+#define MOTOR_OFF	6
+#define INDEX		3
+#define EJECT		7
+#define SETMFM		9
+#define SETGCR		13
+
+/* Select values for swim3_select and swim3_readbit */
+#define STEP_DIR	0
+#define STEPPING	1
+#define MOTOR_ON	2
+#define RELAX		3	/* also eject in progress */
+#define READ_DATA_0	4
+#define ONEMEG_DRIVE	5
+#define SINGLE_SIDED	6	/* drive or diskette is 4MB type? */
+#define DRIVE_PRESENT	7
+#define DISK_IN		8
+#define WRITE_PROT	9
+#define TRACK_ZERO	10
+#define TACHO		11
+#define READ_DATA_1	12
+#define GCR_MODE	13
+#define SEEK_COMPLETE	14
+#define TWOMEG_MEDIA	15
+
+/* Definitions of values used in writing and formatting */
+#define DATA_ESCAPE	0x99
+#define GCR_SYNC_EXC	0x3f
+#define GCR_SYNC_CONV	0x80
+#define GCR_FIRST_MARK	0xd5
+#define GCR_SECOND_MARK	0xaa
+#define GCR_ADDR_MARK	"\xd5\xaa\x00"
+#define GCR_DATA_MARK	"\xd5\xaa\x0b"
+#define GCR_SLIP_BYTE	"\x27\xaa"
+#define GCR_SELF_SYNC	"\x3f\xbf\x1e\x34\x3c\x3f"
+
+#define DATA_99		"\x99\x99"
+#define MFM_ADDR_MARK	"\x99\xa1\x99\xa1\x99\xa1\x99\xfe"
+#define MFM_INDEX_MARK	"\x99\xc2\x99\xc2\x99\xc2\x99\xfc"
+#define MFM_GAP_LEN	12
+
+struct floppy_state {
+	enum swim_state	state;
+	struct swim3 __iomem *swim3;	/* hardware registers */
+	struct dbdma_regs __iomem *dma;	/* DMA controller registers */
+	int	swim3_intr;	/* interrupt number for SWIM3 */
+	int	dma_intr;	/* interrupt number for DMA channel */
+	int	cur_cyl;	/* cylinder head is on, or -1 */
+	int	cur_sector;	/* last sector we saw go past */
+	int	req_cyl;	/* the cylinder for the current r/w request */
+	int	head;		/* head number ditto */
+	int	req_sector;	/* sector number ditto */
+	int	scount;		/* # sectors we're transferring at present */
+	int	retries;
+	int	settle_time;
+	int	secpercyl;	/* disk geometry information */
+	int	secpertrack;
+	int	total_secs;
+	int	write_prot;	/* 1 if write-protected, 0 if not, -1 dunno */
+	struct dbdma_cmd *dma_cmd;
+	int	ref_count;
+	int	expect_cyl;
+	struct timer_list timeout;
+	int	timeout_pending;
+	int	ejected;
+	wait_queue_head_t wait;
+	int	wanted;
+	struct macio_dev *mdev;
+	char	dbdma_cmd_space[5 * sizeof(struct dbdma_cmd)];
+	int	index;
+	struct request *cur_req;
+	struct blk_mq_tag_set tag_set;
+};
+
+#define swim3_err(fmt, arg...)	dev_err(&fs->mdev->ofdev.dev, "[fd%d] " fmt, fs->index, arg)
+#define swim3_warn(fmt, arg...)	dev_warn(&fs->mdev->ofdev.dev, "[fd%d] " fmt, fs->index, arg)
+#define swim3_info(fmt, arg...)	dev_info(&fs->mdev->ofdev.dev, "[fd%d] " fmt, fs->index, arg)
+
+#ifdef DEBUG
+#define swim3_dbg(fmt, arg...)	dev_dbg(&fs->mdev->ofdev.dev, "[fd%d] " fmt, fs->index, arg)
+#else
+#define swim3_dbg(fmt, arg...)	do { } while(0)
+#endif
+
+static struct floppy_state floppy_states[MAX_FLOPPIES];
+static int floppy_count = 0;
+static DEFINE_SPINLOCK(swim3_lock);
+
+static unsigned short write_preamble[] = {
+	0x4e4e, 0x4e4e, 0x4e4e, 0x4e4e, 0x4e4e,	/* gap field */
+	0, 0, 0, 0, 0, 0,			/* sync field */
+	0x99a1, 0x99a1, 0x99a1, 0x99fb,		/* data address mark */
+	0x990f					/* no escape for 512 bytes */
+};
+
+static unsigned short write_postamble[] = {
+	0x9904,					/* insert CRC */
+	0x4e4e, 0x4e4e,
+	0x9908,					/* stop writing */
+	0, 0, 0, 0, 0, 0
+};
+
+static void seek_track(struct floppy_state *fs, int n);
+static void act(struct floppy_state *fs);
+static void scan_timeout(struct timer_list *t);
+static void seek_timeout(struct timer_list *t);
+static void settle_timeout(struct timer_list *t);
+static void xfer_timeout(struct timer_list *t);
+static irqreturn_t swim3_interrupt(int irq, void *dev_id);
+/*static void fd_dma_interrupt(int irq, void *dev_id);*/
+static int grab_drive(struct floppy_state *fs, enum swim_state state,
+		      int interruptible);
+static void release_drive(struct floppy_state *fs);
+static int fd_eject(struct floppy_state *fs);
+static int floppy_ioctl(struct block_device *bdev, fmode_t mode,
+			unsigned int cmd, unsigned long param);
+static int floppy_open(struct block_device *bdev, fmode_t mode);
+static void floppy_release(struct gendisk *disk, fmode_t mode);
+static unsigned int floppy_check_events(struct gendisk *disk,
+					unsigned int clearing);
+static int floppy_revalidate(struct gendisk *disk);
+
+static bool swim3_end_request(struct floppy_state *fs, blk_status_t err, unsigned int nr_bytes)
+{
+	struct request *req = fs->cur_req;
+
+	swim3_dbg("  end request, err=%d nr_bytes=%d, cur_req=%p\n",
+		  err, nr_bytes, req);
+
+	if (err)
+		nr_bytes = blk_rq_cur_bytes(req);
+	if (blk_update_request(req, err, nr_bytes))
+		return true;
+	__blk_mq_end_request(req, err);
+	fs->cur_req = NULL;
+	return false;
+}
+
+static void swim3_select(struct floppy_state *fs, int sel)
+{
+	struct swim3 __iomem *sw = fs->swim3;
+
+	out_8(&sw->select, RELAX);
+	if (sel & 8)
+		out_8(&sw->control_bis, SELECT);
+	else
+		out_8(&sw->control_bic, SELECT);
+	out_8(&sw->select, sel & CA_MASK);
+}
+
+static void swim3_action(struct floppy_state *fs, int action)
+{
+	struct swim3 __iomem *sw = fs->swim3;
+
+	swim3_select(fs, action);
+	udelay(1);
+	out_8(&sw->select, sw->select | LSTRB);
+	udelay(2);
+	out_8(&sw->select, sw->select & ~LSTRB);
+	udelay(1);
+}
+
+static int swim3_readbit(struct floppy_state *fs, int bit)
+{
+	struct swim3 __iomem *sw = fs->swim3;
+	int stat;
+
+	swim3_select(fs, bit);
+	udelay(1);
+	stat = in_8(&sw->status);
+	return (stat & DATA) == 0;
+}
+
+static blk_status_t swim3_queue_rq(struct blk_mq_hw_ctx *hctx,
+				   const struct blk_mq_queue_data *bd)
+{
+	struct floppy_state *fs = hctx->queue->queuedata;
+	struct request *req = bd->rq;
+	unsigned long x;
+
+	spin_lock_irq(&swim3_lock);
+	if (fs->cur_req || fs->state != idle) {
+		spin_unlock_irq(&swim3_lock);
+		return BLK_STS_DEV_RESOURCE;
+	}
+	blk_mq_start_request(req);
+	fs->cur_req = req;
+	if (fs->mdev->media_bay &&
+	    check_media_bay(fs->mdev->media_bay) != MB_FD) {
+		swim3_dbg("%s", "  media bay absent, dropping req\n");
+		swim3_end_request(fs, BLK_STS_IOERR, 0);
+		goto out;
+	}
+	if (fs->ejected) {
+		swim3_dbg("%s", "  disk ejected\n");
+		swim3_end_request(fs, BLK_STS_IOERR, 0);
+		goto out;
+	}
+	if (rq_data_dir(req) == WRITE) {
+		if (fs->write_prot < 0)
+			fs->write_prot = swim3_readbit(fs, WRITE_PROT);
+		if (fs->write_prot) {
+			swim3_dbg("%s", "  try to write, disk write protected\n");
+			swim3_end_request(fs, BLK_STS_IOERR, 0);
+			goto out;
+		}
+	}
+
+	/*
+	 * Do not remove the cast. blk_rq_pos(req) is now a sector_t and can be
+	 * 64 bits, but it will never go past 32 bits for this driver anyway, so
+	 * we can safely cast it down and not have to do a 64/32 division
+	 */
+	fs->req_cyl = ((long)blk_rq_pos(req)) / fs->secpercyl;
+	x = ((long)blk_rq_pos(req)) % fs->secpercyl;
+	fs->head = x / fs->secpertrack;
+	fs->req_sector = x % fs->secpertrack + 1;
+	fs->state = do_transfer;
+	fs->retries = 0;
+
+	act(fs);
+
+out:
+	spin_unlock_irq(&swim3_lock);
+	return BLK_STS_OK;
+}
+
+static void set_timeout(struct floppy_state *fs, int nticks,
+			void (*proc)(struct timer_list *t))
+{
+	if (fs->timeout_pending)
+		del_timer(&fs->timeout);
+	fs->timeout.expires = jiffies + nticks;
+	fs->timeout.function = proc;
+	add_timer(&fs->timeout);
+	fs->timeout_pending = 1;
+}
+
+static inline void scan_track(struct floppy_state *fs)
+{
+	struct swim3 __iomem *sw = fs->swim3;
+
+	swim3_select(fs, READ_DATA_0);
+	in_8(&sw->intr);		/* clear SEEN_SECTOR bit */
+	in_8(&sw->error);
+	out_8(&sw->intr_enable, SEEN_SECTOR);
+	out_8(&sw->control_bis, DO_ACTION);
+	/* enable intr when track found */
+	set_timeout(fs, HZ, scan_timeout);	/* enable timeout */
+}
+
+static inline void seek_track(struct floppy_state *fs, int n)
+{
+	struct swim3 __iomem *sw = fs->swim3;
+
+	if (n >= 0) {
+		swim3_action(fs, SEEK_POSITIVE);
+		sw->nseek = n;
+	} else {
+		swim3_action(fs, SEEK_NEGATIVE);
+		sw->nseek = -n;
+	}
+	fs->expect_cyl = (fs->cur_cyl >= 0)? fs->cur_cyl + n: -1;
+	swim3_select(fs, STEP);
+	in_8(&sw->error);
+	/* enable intr when seek finished */
+	out_8(&sw->intr_enable, SEEK_DONE);
+	out_8(&sw->control_bis, DO_SEEK);
+	set_timeout(fs, 3*HZ, seek_timeout);	/* enable timeout */
+	fs->settle_time = 0;
+}
+
+/*
+ * XXX: this is a horrible hack, but at least allows ppc32 to get
+ * out of defining virt_to_bus, and this driver out of using the
+ * deprecated block layer bounce buffering for highmem addresses
+ * for no good reason.
+ */
+static unsigned long swim3_phys_to_bus(phys_addr_t paddr)
+{
+	return paddr + PCI_DRAM_OFFSET;
+}
+
+static phys_addr_t swim3_bio_phys(struct bio *bio)
+{
+	return page_to_phys(bio_page(bio)) + bio_offset(bio);
+}
+
+static inline void init_dma(struct dbdma_cmd *cp, int cmd,
+			    phys_addr_t paddr, int count)
+{
+	cp->req_count = cpu_to_le16(count);
+	cp->command = cpu_to_le16(cmd);
+	cp->phy_addr = cpu_to_le32(swim3_phys_to_bus(paddr));
+	cp->xfer_status = 0;
+}
+
+static inline void setup_transfer(struct floppy_state *fs)
+{
+	int n;
+	struct swim3 __iomem *sw = fs->swim3;
+	struct dbdma_cmd *cp = fs->dma_cmd;
+	struct dbdma_regs __iomem *dr = fs->dma;
+	struct request *req = fs->cur_req;
+
+	if (blk_rq_cur_sectors(req) <= 0) {
+		swim3_warn("%s", "Transfer 0 sectors ?\n");
+		return;
+	}
+	if (rq_data_dir(req) == WRITE)
+		n = 1;
+	else {
+		n = fs->secpertrack - fs->req_sector + 1;
+		if (n > blk_rq_cur_sectors(req))
+			n = blk_rq_cur_sectors(req);
+	}
+
+	swim3_dbg("  setup xfer at sect %d (of %d) head %d for %d\n",
+		  fs->req_sector, fs->secpertrack, fs->head, n);
+
+	fs->scount = n;
+	swim3_select(fs, fs->head? READ_DATA_1: READ_DATA_0);
+	out_8(&sw->sector, fs->req_sector);
+	out_8(&sw->nsect, n);
+	out_8(&sw->gap3, 0);
+	out_le32(&dr->cmdptr, swim3_phys_to_bus(virt_to_phys(cp)));
+	if (rq_data_dir(req) == WRITE) {
+		/* Set up 3 dma commands: write preamble, data, postamble */
+		init_dma(cp, OUTPUT_MORE, virt_to_phys(write_preamble),
+			 sizeof(write_preamble));
+		++cp;
+		init_dma(cp, OUTPUT_MORE, swim3_bio_phys(req->bio), 512);
+		++cp;
+		init_dma(cp, OUTPUT_LAST, virt_to_phys(write_postamble),
+			sizeof(write_postamble));
+	} else {
+		init_dma(cp, INPUT_LAST, swim3_bio_phys(req->bio), n * 512);
+	}
+	++cp;
+	out_le16(&cp->command, DBDMA_STOP);
+	out_8(&sw->control_bic, DO_ACTION | WRITE_SECTORS);
+	in_8(&sw->error);
+	out_8(&sw->control_bic, DO_ACTION | WRITE_SECTORS);
+	if (rq_data_dir(req) == WRITE)
+		out_8(&sw->control_bis, WRITE_SECTORS);
+	in_8(&sw->intr);
+	out_le32(&dr->control, (RUN << 16) | RUN);
+	/* enable intr when transfer complete */
+	out_8(&sw->intr_enable, TRANSFER_DONE);
+	out_8(&sw->control_bis, DO_ACTION);
+	set_timeout(fs, 2*HZ, xfer_timeout);	/* enable timeout */
+}
+
+static void act(struct floppy_state *fs)
+{
+	for (;;) {
+		swim3_dbg("  act loop, state=%d, req_cyl=%d, cur_cyl=%d\n",
+			  fs->state, fs->req_cyl, fs->cur_cyl);
+
+		switch (fs->state) {
+		case idle:
+			return;		/* XXX shouldn't get here */
+
+		case locating:
+			if (swim3_readbit(fs, TRACK_ZERO)) {
+				swim3_dbg("%s", "    locate track 0\n");
+				fs->cur_cyl = 0;
+				if (fs->req_cyl == 0)
+					fs->state = do_transfer;
+				else
+					fs->state = seeking;
+				break;
+			}
+			scan_track(fs);
+			return;
+
+		case seeking:
+			if (fs->cur_cyl < 0) {
+				fs->expect_cyl = -1;
+				fs->state = locating;
+				break;
+			}
+			if (fs->req_cyl == fs->cur_cyl) {
+				swim3_warn("%s", "Whoops, seeking 0\n");
+				fs->state = do_transfer;
+				break;
+			}
+			seek_track(fs, fs->req_cyl - fs->cur_cyl);
+			return;
+
+		case settling:
+			/* check for SEEK_COMPLETE after 30ms */
+			fs->settle_time = (HZ + 32) / 33;
+			set_timeout(fs, fs->settle_time, settle_timeout);
+			return;
+
+		case do_transfer:
+			if (fs->cur_cyl != fs->req_cyl) {
+				if (fs->retries > 5) {
+					swim3_err("Wrong cylinder in transfer, want: %d got %d\n",
+						  fs->req_cyl, fs->cur_cyl);
+					swim3_end_request(fs, BLK_STS_IOERR, 0);
+					fs->state = idle;
+					return;
+				}
+				fs->state = seeking;
+				break;
+			}
+			setup_transfer(fs);
+			return;
+
+		case jogging:
+			seek_track(fs, -5);
+			return;
+
+		default:
+			swim3_err("Unknown state %d\n", fs->state);
+			return;
+		}
+	}
+}
+
+static void scan_timeout(struct timer_list *t)
+{
+	struct floppy_state *fs = from_timer(fs, t, timeout);
+	struct swim3 __iomem *sw = fs->swim3;
+	unsigned long flags;
+
+	swim3_dbg("* scan timeout, state=%d\n", fs->state);
+
+	spin_lock_irqsave(&swim3_lock, flags);
+	fs->timeout_pending = 0;
+	out_8(&sw->control_bic, DO_ACTION | WRITE_SECTORS);
+	out_8(&sw->select, RELAX);
+	out_8(&sw->intr_enable, 0);
+	fs->cur_cyl = -1;
+	if (fs->retries > 5) {
+		swim3_end_request(fs, BLK_STS_IOERR, 0);
+		fs->state = idle;
+	} else {
+		fs->state = jogging;
+		act(fs);
+	}
+	spin_unlock_irqrestore(&swim3_lock, flags);
+}
+
+static void seek_timeout(struct timer_list *t)
+{
+	struct floppy_state *fs = from_timer(fs, t, timeout);
+	struct swim3 __iomem *sw = fs->swim3;
+	unsigned long flags;
+
+	swim3_dbg("* seek timeout, state=%d\n", fs->state);
+
+	spin_lock_irqsave(&swim3_lock, flags);
+	fs->timeout_pending = 0;
+	out_8(&sw->control_bic, DO_SEEK);
+	out_8(&sw->select, RELAX);
+	out_8(&sw->intr_enable, 0);
+	swim3_err("%s", "Seek timeout\n");
+	swim3_end_request(fs, BLK_STS_IOERR, 0);
+	fs->state = idle;
+	spin_unlock_irqrestore(&swim3_lock, flags);
+}
+
+static void settle_timeout(struct timer_list *t)
+{
+	struct floppy_state *fs = from_timer(fs, t, timeout);
+	struct swim3 __iomem *sw = fs->swim3;
+	unsigned long flags;
+
+	swim3_dbg("* settle timeout, state=%d\n", fs->state);
+
+	spin_lock_irqsave(&swim3_lock, flags);
+	fs->timeout_pending = 0;
+	if (swim3_readbit(fs, SEEK_COMPLETE)) {
+		out_8(&sw->select, RELAX);
+		fs->state = locating;
+		act(fs);
+		goto unlock;
+	}
+	out_8(&sw->select, RELAX);
+	if (fs->settle_time < 2*HZ) {
+		++fs->settle_time;
+		set_timeout(fs, 1, settle_timeout);
+		goto unlock;
+	}
+	swim3_err("%s", "Seek settle timeout\n");
+	swim3_end_request(fs, BLK_STS_IOERR, 0);
+	fs->state = idle;
+ unlock:
+	spin_unlock_irqrestore(&swim3_lock, flags);
+}
+
+static void xfer_timeout(struct timer_list *t)
+{
+	struct floppy_state *fs = from_timer(fs, t, timeout);
+	struct swim3 __iomem *sw = fs->swim3;
+	struct dbdma_regs __iomem *dr = fs->dma;
+	unsigned long flags;
+	int n;
+
+	swim3_dbg("* xfer timeout, state=%d\n", fs->state);
+
+	spin_lock_irqsave(&swim3_lock, flags);
+	fs->timeout_pending = 0;
+	out_le32(&dr->control, RUN << 16);
+	/* We must wait a bit for dbdma to stop */
+	for (n = 0; (in_le32(&dr->status) & ACTIVE) && n < 1000; n++)
+		udelay(1);
+	out_8(&sw->intr_enable, 0);
+	out_8(&sw->control_bic, WRITE_SECTORS | DO_ACTION);
+	out_8(&sw->select, RELAX);
+	swim3_err("Timeout %sing sector %ld\n",
+	       (rq_data_dir(fs->cur_req)==WRITE? "writ": "read"),
+	       (long)blk_rq_pos(fs->cur_req));
+	swim3_end_request(fs, BLK_STS_IOERR, 0);
+	fs->state = idle;
+	spin_unlock_irqrestore(&swim3_lock, flags);
+}
+
+static irqreturn_t swim3_interrupt(int irq, void *dev_id)
+{
+	struct floppy_state *fs = (struct floppy_state *) dev_id;
+	struct swim3 __iomem *sw = fs->swim3;
+	int intr, err, n;
+	int stat, resid;
+	struct dbdma_regs __iomem *dr;
+	struct dbdma_cmd *cp;
+	unsigned long flags;
+	struct request *req = fs->cur_req;
+
+	swim3_dbg("* interrupt, state=%d\n", fs->state);
+
+	spin_lock_irqsave(&swim3_lock, flags);
+	intr = in_8(&sw->intr);
+	err = (intr & ERROR_INTR)? in_8(&sw->error): 0;
+	if ((intr & ERROR_INTR) && fs->state != do_transfer)
+		swim3_err("Non-transfer error interrupt: state=%d, dir=%x, intr=%x, err=%x\n",
+			  fs->state, rq_data_dir(req), intr, err);
+	switch (fs->state) {
+	case locating:
+		if (intr & SEEN_SECTOR) {
+			out_8(&sw->control_bic, DO_ACTION | WRITE_SECTORS);
+			out_8(&sw->select, RELAX);
+			out_8(&sw->intr_enable, 0);
+			del_timer(&fs->timeout);
+			fs->timeout_pending = 0;
+			if (sw->ctrack == 0xff) {
+				swim3_err("%s", "Seen sector but cyl=ff?\n");
+				fs->cur_cyl = -1;
+				if (fs->retries > 5) {
+					swim3_end_request(fs, BLK_STS_IOERR, 0);
+					fs->state = idle;
+				} else {
+					fs->state = jogging;
+					act(fs);
+				}
+				break;
+			}
+			fs->cur_cyl = sw->ctrack;
+			fs->cur_sector = sw->csect;
+			if (fs->expect_cyl != -1 && fs->expect_cyl != fs->cur_cyl)
+				swim3_err("Expected cyl %d, got %d\n",
+					  fs->expect_cyl, fs->cur_cyl);
+			fs->state = do_transfer;
+			act(fs);
+		}
+		break;
+	case seeking:
+	case jogging:
+		if (sw->nseek == 0) {
+			out_8(&sw->control_bic, DO_SEEK);
+			out_8(&sw->select, RELAX);
+			out_8(&sw->intr_enable, 0);
+			del_timer(&fs->timeout);
+			fs->timeout_pending = 0;
+			if (fs->state == seeking)
+				++fs->retries;
+			fs->state = settling;
+			act(fs);
+		}
+		break;
+	case settling:
+		out_8(&sw->intr_enable, 0);
+		del_timer(&fs->timeout);
+		fs->timeout_pending = 0;
+		act(fs);
+		break;
+	case do_transfer:
+		if ((intr & (ERROR_INTR | TRANSFER_DONE)) == 0)
+			break;
+		out_8(&sw->intr_enable, 0);
+		out_8(&sw->control_bic, WRITE_SECTORS | DO_ACTION);
+		out_8(&sw->select, RELAX);
+		del_timer(&fs->timeout);
+		fs->timeout_pending = 0;
+		dr = fs->dma;
+		cp = fs->dma_cmd;
+		if (rq_data_dir(req) == WRITE)
+			++cp;
+		/*
+		 * Check that the main data transfer has finished.
+		 * On writing, the swim3 sometimes doesn't use
+		 * up all the bytes of the postamble, so we can still
+		 * see DMA active here.  That doesn't matter as long
+		 * as all the sector data has been transferred.
+		 */
+		if ((intr & ERROR_INTR) == 0 && cp->xfer_status == 0) {
+			/* wait a little while for DMA to complete */
+			for (n = 0; n < 100; ++n) {
+				if (cp->xfer_status != 0)
+					break;
+				udelay(1);
+				barrier();
+			}
+		}
+		/* turn off DMA */
+		out_le32(&dr->control, (RUN | PAUSE) << 16);
+		stat = le16_to_cpu(cp->xfer_status);
+		resid = le16_to_cpu(cp->res_count);
+		if (intr & ERROR_INTR) {
+			n = fs->scount - 1 - resid / 512;
+			if (n > 0) {
+				blk_update_request(req, 0, n << 9);
+				fs->req_sector += n;
+			}
+			if (fs->retries < 5) {
+				++fs->retries;
+				act(fs);
+			} else {
+				swim3_err("Error %sing block %ld (err=%x)\n",
+				       rq_data_dir(req) == WRITE? "writ": "read",
+				       (long)blk_rq_pos(req), err);
+				swim3_end_request(fs, BLK_STS_IOERR, 0);
+				fs->state = idle;
+			}
+		} else {
+			if ((stat & ACTIVE) == 0 || resid != 0) {
+				/* musta been an error */
+				swim3_err("fd dma error: stat=%x resid=%d\n", stat, resid);
+				swim3_err("  state=%d, dir=%x, intr=%x, err=%x\n",
+					  fs->state, rq_data_dir(req), intr, err);
+				swim3_end_request(fs, BLK_STS_IOERR, 0);
+				fs->state = idle;
+				break;
+			}
+			fs->retries = 0;
+			if (swim3_end_request(fs, 0, fs->scount << 9)) {
+				fs->req_sector += fs->scount;
+				if (fs->req_sector > fs->secpertrack) {
+					fs->req_sector -= fs->secpertrack;
+					if (++fs->head > 1) {
+						fs->head = 0;
+						++fs->req_cyl;
+					}
+				}
+				act(fs);
+			} else
+				fs->state = idle;
+		}
+		break;
+	default:
+		swim3_err("Don't know what to do in state %d\n", fs->state);
+	}
+	spin_unlock_irqrestore(&swim3_lock, flags);
+	return IRQ_HANDLED;
+}
+
+/*
+static void fd_dma_interrupt(int irq, void *dev_id)
+{
+}
+*/
+
+/* Called under the mutex to grab exclusive access to a drive */
+static int grab_drive(struct floppy_state *fs, enum swim_state state,
+		      int interruptible)
+{
+	unsigned long flags;
+
+	swim3_dbg("%s", "-> grab drive\n");
+
+	spin_lock_irqsave(&swim3_lock, flags);
+	if (fs->state != idle && fs->state != available) {
+		++fs->wanted;
+		/* this will enable irqs in order to sleep */
+		if (!interruptible)
+			wait_event_lock_irq(fs->wait,
+                                        fs->state == available,
+                                        swim3_lock);
+		else if (wait_event_interruptible_lock_irq(fs->wait,
+					fs->state == available,
+					swim3_lock)) {
+			--fs->wanted;
+			spin_unlock_irqrestore(&swim3_lock, flags);
+			return -EINTR;
+		}
+		--fs->wanted;
+	}
+	fs->state = state;
+	spin_unlock_irqrestore(&swim3_lock, flags);
+
+	return 0;
+}
+
+static void release_drive(struct floppy_state *fs)
+{
+	struct request_queue *q = disks[fs->index]->queue;
+	unsigned long flags;
+
+	swim3_dbg("%s", "-> release drive\n");
+
+	spin_lock_irqsave(&swim3_lock, flags);
+	fs->state = idle;
+	spin_unlock_irqrestore(&swim3_lock, flags);
+
+	blk_mq_freeze_queue(q);
+	blk_mq_quiesce_queue(q);
+	blk_mq_unquiesce_queue(q);
+	blk_mq_unfreeze_queue(q);
+}
+
+static int fd_eject(struct floppy_state *fs)
+{
+	int err, n;
+
+	err = grab_drive(fs, ejecting, 1);
+	if (err)
+		return err;
+	swim3_action(fs, EJECT);
+	for (n = 20; n > 0; --n) {
+		if (signal_pending(current)) {
+			err = -EINTR;
+			break;
+		}
+		swim3_select(fs, RELAX);
+		schedule_timeout_interruptible(1);
+		if (swim3_readbit(fs, DISK_IN) == 0)
+			break;
+	}
+	swim3_select(fs, RELAX);
+	udelay(150);
+	fs->ejected = 1;
+	release_drive(fs);
+	return err;
+}
+
+static struct floppy_struct floppy_type =
+	{ 2880,18,2,80,0,0x1B,0x00,0xCF,0x6C,NULL };	/*  7 1.44MB 3.5"   */
+
+static int floppy_locked_ioctl(struct block_device *bdev, fmode_t mode,
+			unsigned int cmd, unsigned long param)
+{
+	struct floppy_state *fs = bdev->bd_disk->private_data;
+	int err;
+		
+	if ((cmd & 0x80) && !capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	if (fs->mdev->media_bay &&
+	    check_media_bay(fs->mdev->media_bay) != MB_FD)
+		return -ENXIO;
+
+	switch (cmd) {
+	case FDEJECT:
+		if (fs->ref_count != 1)
+			return -EBUSY;
+		err = fd_eject(fs);
+		return err;
+	case FDGETPRM:
+	        if (copy_to_user((void __user *) param, &floppy_type,
+				 sizeof(struct floppy_struct)))
+			return -EFAULT;
+		return 0;
+	}
+	return -ENOTTY;
+}
+
+static int floppy_ioctl(struct block_device *bdev, fmode_t mode,
+				 unsigned int cmd, unsigned long param)
+{
+	int ret;
+
+	mutex_lock(&swim3_mutex);
+	ret = floppy_locked_ioctl(bdev, mode, cmd, param);
+	mutex_unlock(&swim3_mutex);
+
+	return ret;
+}
+
+static int floppy_open(struct block_device *bdev, fmode_t mode)
+{
+	struct floppy_state *fs = bdev->bd_disk->private_data;
+	struct swim3 __iomem *sw = fs->swim3;
+	int n, err = 0;
+
+	if (fs->ref_count == 0) {
+		if (fs->mdev->media_bay &&
+		    check_media_bay(fs->mdev->media_bay) != MB_FD)
+			return -ENXIO;
+		out_8(&sw->setup, S_IBM_DRIVE | S_FCLK_DIV2);
+		out_8(&sw->control_bic, 0xff);
+		out_8(&sw->mode, 0x95);
+		udelay(10);
+		out_8(&sw->intr_enable, 0);
+		out_8(&sw->control_bis, DRIVE_ENABLE | INTR_ENABLE);
+		swim3_action(fs, MOTOR_ON);
+		fs->write_prot = -1;
+		fs->cur_cyl = -1;
+		for (n = 0; n < 2 * HZ; ++n) {
+			if (n >= HZ/30 && swim3_readbit(fs, SEEK_COMPLETE))
+				break;
+			if (signal_pending(current)) {
+				err = -EINTR;
+				break;
+			}
+			swim3_select(fs, RELAX);
+			schedule_timeout_interruptible(1);
+		}
+		if (err == 0 && (swim3_readbit(fs, SEEK_COMPLETE) == 0
+				 || swim3_readbit(fs, DISK_IN) == 0))
+			err = -ENXIO;
+		swim3_action(fs, SETMFM);
+		swim3_select(fs, RELAX);
+
+	} else if (fs->ref_count == -1 || mode & FMODE_EXCL)
+		return -EBUSY;
+
+	if (err == 0 && (mode & FMODE_NDELAY) == 0
+	    && (mode & (FMODE_READ|FMODE_WRITE))) {
+		if (bdev_check_media_change(bdev))
+			floppy_revalidate(bdev->bd_disk);
+		if (fs->ejected)
+			err = -ENXIO;
+	}
+
+	if (err == 0 && (mode & FMODE_WRITE)) {
+		if (fs->write_prot < 0)
+			fs->write_prot = swim3_readbit(fs, WRITE_PROT);
+		if (fs->write_prot)
+			err = -EROFS;
+	}
+
+	if (err) {
+		if (fs->ref_count == 0) {
+			swim3_action(fs, MOTOR_OFF);
+			out_8(&sw->control_bic, DRIVE_ENABLE | INTR_ENABLE);
+			swim3_select(fs, RELAX);
+		}
+		return err;
+	}
+
+	if (mode & FMODE_EXCL)
+		fs->ref_count = -1;
+	else
+		++fs->ref_count;
+
+	return 0;
+}
+
+static int floppy_unlocked_open(struct block_device *bdev, fmode_t mode)
+{
+	int ret;
+
+	mutex_lock(&swim3_mutex);
+	ret = floppy_open(bdev, mode);
+	mutex_unlock(&swim3_mutex);
+
+	return ret;
+}
+
+static void floppy_release(struct gendisk *disk, fmode_t mode)
+{
+	struct floppy_state *fs = disk->private_data;
+	struct swim3 __iomem *sw = fs->swim3;
+
+	mutex_lock(&swim3_mutex);
+	if (fs->ref_count > 0)
+		--fs->ref_count;
+	else if (fs->ref_count == -1)
+		fs->ref_count = 0;
+	if (fs->ref_count == 0) {
+		swim3_action(fs, MOTOR_OFF);
+		out_8(&sw->control_bic, 0xff);
+		swim3_select(fs, RELAX);
+	}
+	mutex_unlock(&swim3_mutex);
+}
+
+static unsigned int floppy_check_events(struct gendisk *disk,
+					unsigned int clearing)
+{
+	struct floppy_state *fs = disk->private_data;
+	return fs->ejected ? DISK_EVENT_MEDIA_CHANGE : 0;
+}
+
+static int floppy_revalidate(struct gendisk *disk)
+{
+	struct floppy_state *fs = disk->private_data;
+	struct swim3 __iomem *sw;
+	int ret, n;
+
+	if (fs->mdev->media_bay &&
+	    check_media_bay(fs->mdev->media_bay) != MB_FD)
+		return -ENXIO;
+
+	sw = fs->swim3;
+	grab_drive(fs, revalidating, 0);
+	out_8(&sw->intr_enable, 0);
+	out_8(&sw->control_bis, DRIVE_ENABLE);
+	swim3_action(fs, MOTOR_ON);	/* necessary? */
+	fs->write_prot = -1;
+	fs->cur_cyl = -1;
+	mdelay(1);
+	for (n = HZ; n > 0; --n) {
+		if (swim3_readbit(fs, SEEK_COMPLETE))
+			break;
+		if (signal_pending(current))
+			break;
+		swim3_select(fs, RELAX);
+		schedule_timeout_interruptible(1);
+	}
+	ret = swim3_readbit(fs, SEEK_COMPLETE) == 0
+		|| swim3_readbit(fs, DISK_IN) == 0;
+	if (ret)
+		swim3_action(fs, MOTOR_OFF);
+	else {
+		fs->ejected = 0;
+		swim3_action(fs, SETMFM);
+	}
+	swim3_select(fs, RELAX);
+
+	release_drive(fs);
+	return ret;
+}
+
+static const struct block_device_operations floppy_fops = {
+	.open		= floppy_unlocked_open,
+	.release	= floppy_release,
+	.ioctl		= floppy_ioctl,
+	.check_events	= floppy_check_events,
+};
+
+static const struct blk_mq_ops swim3_mq_ops = {
+	.queue_rq = swim3_queue_rq,
+};
+
+static void swim3_mb_event(struct macio_dev* mdev, int mb_state)
+{
+	struct floppy_state *fs = macio_get_drvdata(mdev);
+	struct swim3 __iomem *sw;
+
+	if (!fs)
+		return;
+
+	sw = fs->swim3;
+
+	if (mb_state != MB_FD)
+		return;
+
+	/* Clear state */
+	out_8(&sw->intr_enable, 0);
+	in_8(&sw->intr);
+	in_8(&sw->error);
+}
+
+static int swim3_add_device(struct macio_dev *mdev, int index)
+{
+	struct device_node *swim = mdev->ofdev.dev.of_node;
+	struct floppy_state *fs = &floppy_states[index];
+	int rc = -EBUSY;
+
+	fs->mdev = mdev;
+	fs->index = index;
+
+	/* Check & Request resources */
+	if (macio_resource_count(mdev) < 2) {
+		swim3_err("%s", "No address in device-tree\n");
+		return -ENXIO;
+	}
+	if (macio_irq_count(mdev) < 1) {
+		swim3_err("%s", "No interrupt in device-tree\n");
+		return -ENXIO;
+	}
+	if (macio_request_resource(mdev, 0, "swim3 (mmio)")) {
+		swim3_err("%s", "Can't request mmio resource\n");
+		return -EBUSY;
+	}
+	if (macio_request_resource(mdev, 1, "swim3 (dma)")) {
+		swim3_err("%s", "Can't request dma resource\n");
+		macio_release_resource(mdev, 0);
+		return -EBUSY;
+	}
+	dev_set_drvdata(&mdev->ofdev.dev, fs);
+
+	if (mdev->media_bay == NULL)
+		pmac_call_feature(PMAC_FTR_SWIM3_ENABLE, swim, 0, 1);
+	
+	fs->state = idle;
+	fs->swim3 = (struct swim3 __iomem *)
+		ioremap(macio_resource_start(mdev, 0), 0x200);
+	if (fs->swim3 == NULL) {
+		swim3_err("%s", "Couldn't map mmio registers\n");
+		rc = -ENOMEM;
+		goto out_release;
+	}
+	fs->dma = (struct dbdma_regs __iomem *)
+		ioremap(macio_resource_start(mdev, 1), 0x200);
+	if (fs->dma == NULL) {
+		swim3_err("%s", "Couldn't map dma registers\n");
+		iounmap(fs->swim3);
+		rc = -ENOMEM;
+		goto out_release;
+	}
+	fs->swim3_intr = macio_irq(mdev, 0);
+	fs->dma_intr = macio_irq(mdev, 1);
+	fs->cur_cyl = -1;
+	fs->cur_sector = -1;
+	fs->secpercyl = 36;
+	fs->secpertrack = 18;
+	fs->total_secs = 2880;
+	init_waitqueue_head(&fs->wait);
+
+	fs->dma_cmd = (struct dbdma_cmd *) DBDMA_ALIGN(fs->dbdma_cmd_space);
+	memset(fs->dma_cmd, 0, 2 * sizeof(struct dbdma_cmd));
+	fs->dma_cmd[1].command = cpu_to_le16(DBDMA_STOP);
+
+	if (mdev->media_bay == NULL || check_media_bay(mdev->media_bay) == MB_FD)
+		swim3_mb_event(mdev, MB_FD);
+
+	if (request_irq(fs->swim3_intr, swim3_interrupt, 0, "SWIM3", fs)) {
+		swim3_err("%s", "Couldn't request interrupt\n");
+		pmac_call_feature(PMAC_FTR_SWIM3_ENABLE, swim, 0, 0);
+		goto out_unmap;
+	}
+
+	timer_setup(&fs->timeout, NULL, 0);
+
+	swim3_info("SWIM3 floppy controller %s\n",
+		mdev->media_bay ? "in media bay" : "");
+
+	return 0;
+
+ out_unmap:
+	iounmap(fs->dma);
+	iounmap(fs->swim3);
+
+ out_release:
+	macio_release_resource(mdev, 0);
+	macio_release_resource(mdev, 1);
+
+	return rc;
+}
+
+static int swim3_attach(struct macio_dev *mdev,
+			const struct of_device_id *match)
+{
+	struct floppy_state *fs;
+	struct gendisk *disk;
+	int rc;
+
+	if (floppy_count >= MAX_FLOPPIES)
+		return -ENXIO;
+
+	if (floppy_count == 0) {
+		rc = register_blkdev(FLOPPY_MAJOR, "fd");
+		if (rc)
+			return rc;
+	}
+
+	fs = &floppy_states[floppy_count];
+	memset(fs, 0, sizeof(*fs));
+
+	rc = blk_mq_alloc_sq_tag_set(&fs->tag_set, &swim3_mq_ops, 2,
+			BLK_MQ_F_SHOULD_MERGE);
+	if (rc)
+		goto out_unregister;
+
+	disk = blk_mq_alloc_disk(&fs->tag_set, fs);
+	if (IS_ERR(disk)) {
+		rc = PTR_ERR(disk);
+		goto out_free_tag_set;
+	}
+
+	rc = swim3_add_device(mdev, floppy_count);
+	if (rc)
+		goto out_cleanup_disk;
+
+	disk->major = FLOPPY_MAJOR;
+	disk->first_minor = floppy_count;
+	disk->minors = 1;
+	disk->fops = &floppy_fops;
+	disk->private_data = fs;
+	disk->events = DISK_EVENT_MEDIA_CHANGE;
+	disk->flags |= GENHD_FL_REMOVABLE | GENHD_FL_NO_PART;
+	sprintf(disk->disk_name, "fd%d", floppy_count);
+	set_capacity(disk, 2880);
+	rc = add_disk(disk);
+	if (rc)
+		goto out_cleanup_disk;
+
+	disks[floppy_count++] = disk;
+	return 0;
+
+out_cleanup_disk:
+	put_disk(disk);
+out_free_tag_set:
+	blk_mq_free_tag_set(&fs->tag_set);
+out_unregister:
+	if (floppy_count == 0)
+		unregister_blkdev(FLOPPY_MAJOR, "fd");
+	return rc;
+}
+
+static const struct of_device_id swim3_match[] =
+{
+	{
+	.name		= "swim3",
+	},
+	{
+	.compatible	= "ohare-swim3"
+	},
+	{
+	.compatible	= "swim3"
+	},
+	{ /* end of list */ }
+};
+
+static struct macio_driver swim3_driver =
+{
+	.driver = {
+		.name 		= "swim3",
+		.of_match_table	= swim3_match,
+	},
+	.probe		= swim3_attach,
+#ifdef CONFIG_PMAC_MEDIABAY
+	.mediabay_event	= swim3_mb_event,
+#endif
+#if 0
+	.suspend	= swim3_suspend,
+	.resume		= swim3_resume,
+#endif
+};
+
+
+int swim3_init(void)
+{
+	macio_register_driver(&swim3_driver);
+	return 0;
+}
+
+module_init(swim3_init)
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Paul Mackerras");
+MODULE_ALIAS_BLOCKDEV_MAJOR(FLOPPY_MAJOR);
diff --git a/drivers/block/swim_asm.S b/drivers/block/swim_asm.S
new file mode 100644
index 000000000..3d7a2d875
--- /dev/null
+++ b/drivers/block/swim_asm.S
@@ -0,0 +1,243 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * low-level functions for the SWIM floppy controller
+ *
+ * needs assembly language because is very timing dependent
+ * this controller exists only on macintosh 680x0 based
+ *
+ * Copyright (C) 2004,2008 Laurent Vivier <Laurent@lvivier.info>
+ *
+ * based on Alastair Bridgewater SWIM analysis, 2001
+ * based on netBSD IWM driver (c) 1997, 1998 Hauke Fath.
+ *
+ * 2004-08-21 (lv) - Initial implementation
+ * 2008-11-05 (lv) - add get_swim_mode
+ */
+
+	.equ	write_data,	0x0000
+	.equ	write_mark,	0x0200
+	.equ	write_CRC,	0x0400
+	.equ	write_parameter,0x0600
+	.equ	write_phase,	0x0800
+	.equ	write_setup,	0x0a00
+	.equ	write_mode0,	0x0c00
+	.equ	write_mode1,	0x0e00
+	.equ	read_data,	0x1000
+	.equ	read_mark,	0x1200
+	.equ	read_error,	0x1400
+	.equ	read_parameter,	0x1600
+	.equ	read_phase,	0x1800
+	.equ	read_setup,	0x1a00
+	.equ	read_status,	0x1c00
+	.equ	read_handshake,	0x1e00
+
+	.equ	o_side, 0
+	.equ	o_track, 1
+	.equ	o_sector, 2
+	.equ	o_size, 3
+	.equ	o_crc0, 4
+	.equ	o_crc1, 5
+
+	.equ	seek_time, 30000
+	.equ	max_retry, 40
+	.equ	sector_size, 512
+
+	.global swim_read_sector_header
+swim_read_sector_header:
+	link	%a6, #0
+	moveml	%d1-%d5/%a0-%a4,%sp@-
+	movel	%a6@(0x0c), %a4
+	bsr	mfm_read_addrmark
+	moveml	%sp@+, %d1-%d5/%a0-%a4
+	unlk	%a6
+	rts
+
+sector_address_mark:
+	.byte	0xa1, 0xa1, 0xa1, 0xfe
+sector_data_mark:
+	.byte	0xa1, 0xa1, 0xa1, 0xfb
+
+mfm_read_addrmark:
+	movel	%a6@(0x08), %a3
+	lea	%a3@(read_handshake), %a2
+	lea	%a3@(read_mark), %a3
+	moveq	#-1, %d0
+	movew	#seek_time, %d2
+
+wait_header_init:
+	tstb	%a3@(read_error - read_mark)
+	moveb	#0x18, %a3@(write_mode0 - read_mark)
+	moveb	#0x01, %a3@(write_mode1 - read_mark)
+	moveb	#0x01, %a3@(write_mode0 - read_mark)
+	tstb	%a3@(read_error - read_mark)
+	moveb	#0x08, %a3@(write_mode1 - read_mark)
+
+	lea	sector_address_mark, %a0
+	moveq	#3, %d1
+
+wait_addr_mark_byte:
+
+	tstb	%a2@
+	dbmi	%d2, wait_addr_mark_byte
+	bpl	header_exit
+
+	moveb	%a3@, %d3
+	cmpb	%a0@+, %d3
+	dbne	%d1, wait_addr_mark_byte
+	bne	wait_header_init
+
+	moveq	#max_retry, %d2
+
+amark0:	tstb	%a2@
+	dbmi	%d2, amark0
+	bpl	signal_nonyb
+
+	moveb	%a3@, %a4@(o_track)
+
+	moveq	#max_retry, %d2
+
+amark1:	tstb	%a2@
+	dbmi	%d2, amark1
+	bpl	signal_nonyb
+
+	moveb	%a3@, %a4@(o_side)
+
+	moveq	#max_retry, %d2
+
+amark2:	tstb	%a2@
+	dbmi	%d2, amark2
+	bpl	signal_nonyb
+
+	moveb	%a3@, %a4@(o_sector)
+
+	moveq	#max_retry, %d2
+
+amark3:	tstb	%a2@
+	dbmi	%d2, amark3
+	bpl	signal_nonyb
+
+	moveb	%a3@, %a4@(o_size)
+
+	moveq	#max_retry, %d2
+
+crc0:	tstb	%a2@
+	dbmi	%d2, crc0
+	bpl	signal_nonyb
+
+	moveb	%a3@, %a4@(o_crc0)
+
+	moveq	#max_retry, %d2
+
+crc1:	tstb	%a2@
+	dbmi	%d2, crc1
+	bpl	signal_nonyb
+
+	moveb	%a3@, %a4@(o_crc1)
+
+	tstb	%a3@(read_error - read_mark)
+
+header_exit:
+	moveq	#0, %d0
+	moveb	#0x18, %a3@(write_mode0 - read_mark)
+	rts
+signal_nonyb:
+	moveq	#-1, %d0
+	moveb	#0x18, %a3@(write_mode0 - read_mark)
+	rts
+
+	.global swim_read_sector_data
+swim_read_sector_data:
+	link	%a6, #0
+	moveml	%d1-%d5/%a0-%a5,%sp@-
+	movel	%a6@(0x0c), %a4
+	bsr	mfm_read_data
+	moveml	%sp@+, %d1-%d5/%a0-%a5
+	unlk	%a6
+	rts
+
+mfm_read_data:
+	movel	%a6@(0x08), %a3
+	lea	%a3@(read_handshake), %a2
+	lea	%a3@(read_data), %a5
+	lea	%a3@(read_mark), %a3
+	movew	#seek_time, %d2
+
+wait_data_init:
+	tstb	%a3@(read_error - read_mark)
+	moveb	#0x18, %a3@(write_mode0 - read_mark)
+	moveb	#0x01, %a3@(write_mode1 - read_mark)
+	moveb	#0x01, %a3@(write_mode0 - read_mark)
+	tstb	%a3@(read_error - read_mark)
+	moveb	#0x08, %a3@(write_mode1 - read_mark)
+
+	lea	sector_data_mark, %a0
+	moveq	#3, %d1
+
+	/* wait data address mark */
+
+wait_data_mark_byte:
+
+	tstb	%a2@
+	dbmi	%d2, wait_data_mark_byte
+	bpl	data_exit
+
+	moveb	%a3@, %d3
+	cmpb	%a0@+, %d3
+	dbne	%d1, wait_data_mark_byte
+	bne	wait_data_init
+
+	/* read data */
+
+	tstb	%a3@(read_error - read_mark)
+
+	movel	#sector_size-1, %d4		/* sector size */
+read_new_data:
+	movew	#max_retry, %d2
+read_data_loop:
+	moveb	%a2@, %d5
+	andb	#0xc0, %d5
+	dbne	%d2, read_data_loop
+	beq	data_exit
+	moveb	%a5@, %a4@+
+	andb	#0x40, %d5
+	dbne	%d4, read_new_data
+	beq	exit_loop
+	moveb	%a5@, %a4@+
+	dbra	%d4, read_new_data
+exit_loop:
+
+	/* read CRC */
+
+	movew	#max_retry, %d2
+data_crc0:
+
+	tstb	%a2@
+	dbmi	%d2, data_crc0
+	bpl	data_exit
+
+	moveb	%a3@, %d5
+
+	moveq	#max_retry, %d2
+
+data_crc1:
+
+	tstb	%a2@
+	dbmi	%d2, data_crc1
+	bpl	data_exit
+
+	moveb	%a3@, %d5
+
+	tstb	%a3@(read_error - read_mark)
+
+	moveb	#0x18, %a3@(write_mode0 - read_mark)
+
+	/* return number of bytes read */
+
+	movel	#sector_size, %d0
+	addw	#1, %d4
+	subl	%d4, %d0
+	rts
+data_exit:
+	moveb	#0x18, %a3@(write_mode0 - read_mark)
+	moveq	#-1, %d0
+	rts
diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c
new file mode 100644
index 000000000..3fa74051f
--- /dev/null
+++ b/drivers/block/ublk_drv.c
@@ -0,0 +1,2119 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Userspace block device - block device which IO is handled from userspace
+ *
+ * Take full use of io_uring passthrough command for communicating with
+ * ublk userspace daemon(ublksrvd) for handling basic IO request.
+ *
+ * Copyright 2022 Ming Lei <ming.lei@redhat.com>
+ *
+ * (part of code stolen from loop.c)
+ */
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/pagemap.h>
+#include <linux/file.h>
+#include <linux/stat.h>
+#include <linux/errno.h>
+#include <linux/major.h>
+#include <linux/wait.h>
+#include <linux/blkdev.h>
+#include <linux/init.h>
+#include <linux/swap.h>
+#include <linux/slab.h>
+#include <linux/compat.h>
+#include <linux/mutex.h>
+#include <linux/writeback.h>
+#include <linux/completion.h>
+#include <linux/highmem.h>
+#include <linux/sysfs.h>
+#include <linux/miscdevice.h>
+#include <linux/falloc.h>
+#include <linux/uio.h>
+#include <linux/ioprio.h>
+#include <linux/sched/mm.h>
+#include <linux/uaccess.h>
+#include <linux/cdev.h>
+#include <linux/io_uring.h>
+#include <linux/blk-mq.h>
+#include <linux/delay.h>
+#include <linux/mm.h>
+#include <asm/page.h>
+#include <linux/task_work.h>
+#include <uapi/linux/ublk_cmd.h>
+
+#define UBLK_MINORS		(1U << MINORBITS)
+
+/* All UBLK_F_* have to be included into UBLK_F_ALL */
+#define UBLK_F_ALL (UBLK_F_SUPPORT_ZERO_COPY \
+		| UBLK_F_URING_CMD_COMP_IN_TASK \
+		| UBLK_F_NEED_GET_DATA \
+		| UBLK_F_USER_RECOVERY \
+		| UBLK_F_USER_RECOVERY_REISSUE)
+
+/* All UBLK_PARAM_TYPE_* should be included here */
+#define UBLK_PARAM_TYPE_ALL (UBLK_PARAM_TYPE_BASIC | UBLK_PARAM_TYPE_DISCARD)
+
+struct ublk_rq_data {
+	struct llist_node node;
+	struct callback_head work;
+};
+
+struct ublk_uring_cmd_pdu {
+	struct ublk_queue *ubq;
+};
+
+/*
+ * io command is active: sqe cmd is received, and its cqe isn't done
+ *
+ * If the flag is set, the io command is owned by ublk driver, and waited
+ * for incoming blk-mq request from the ublk block device.
+ *
+ * If the flag is cleared, the io command will be completed, and owned by
+ * ublk server.
+ */
+#define UBLK_IO_FLAG_ACTIVE	0x01
+
+/*
+ * IO command is completed via cqe, and it is being handled by ublksrv, and
+ * not committed yet
+ *
+ * Basically exclusively with UBLK_IO_FLAG_ACTIVE, so can be served for
+ * cross verification
+ */
+#define UBLK_IO_FLAG_OWNED_BY_SRV 0x02
+
+/*
+ * IO command is aborted, so this flag is set in case of
+ * !UBLK_IO_FLAG_ACTIVE.
+ *
+ * After this flag is observed, any pending or new incoming request
+ * associated with this io command will be failed immediately
+ */
+#define UBLK_IO_FLAG_ABORTED 0x04
+
+/*
+ * UBLK_IO_FLAG_NEED_GET_DATA is set because IO command requires
+ * get data buffer address from ublksrv.
+ *
+ * Then, bio data could be copied into this data buffer for a WRITE request
+ * after the IO command is issued again and UBLK_IO_FLAG_NEED_GET_DATA is unset.
+ */
+#define UBLK_IO_FLAG_NEED_GET_DATA 0x08
+
+/* atomic RW with ubq->cancel_lock */
+#define UBLK_IO_FLAG_CANCELED	0x80000000
+
+struct ublk_io {
+	/* userspace buffer address from io cmd */
+	__u64	addr;
+	unsigned int flags;
+	int res;
+
+	struct io_uring_cmd *cmd;
+};
+
+struct ublk_queue {
+	int q_id;
+	int q_depth;
+
+	unsigned long flags;
+	struct task_struct	*ubq_daemon;
+	char *io_cmd_buf;
+
+	struct llist_head	io_cmds;
+
+	unsigned long io_addr;	/* mapped vm address */
+	unsigned int max_io_sz;
+	bool force_abort;
+	unsigned short nr_io_ready;	/* how many ios setup */
+	spinlock_t		cancel_lock;
+	struct ublk_device *dev;
+	struct ublk_io ios[];
+};
+
+#define UBLK_DAEMON_MONITOR_PERIOD	(5 * HZ)
+
+struct ublk_device {
+	struct gendisk		*ub_disk;
+
+	char	*__queues;
+
+	unsigned int	queue_size;
+	struct ublksrv_ctrl_dev_info	dev_info;
+
+	struct blk_mq_tag_set	tag_set;
+
+	struct cdev		cdev;
+	struct device		cdev_dev;
+
+#define UB_STATE_OPEN		0
+#define UB_STATE_USED		1
+	unsigned long		state;
+	int			ub_number;
+
+	struct mutex		mutex;
+
+	spinlock_t		mm_lock;
+	struct mm_struct	*mm;
+
+	struct ublk_params	params;
+
+	struct completion	completion;
+	unsigned int		nr_queues_ready;
+	unsigned int		nr_privileged_daemon;
+
+	/*
+	 * Our ubq->daemon may be killed without any notification, so
+	 * monitor each queue's daemon periodically
+	 */
+	struct delayed_work	monitor_work;
+	struct work_struct	quiesce_work;
+	struct work_struct	stop_work;
+};
+
+/* header of ublk_params */
+struct ublk_params_header {
+	__u32	len;
+	__u32	types;
+};
+
+static dev_t ublk_chr_devt;
+static struct class *ublk_chr_class;
+
+static DEFINE_IDR(ublk_index_idr);
+static DEFINE_SPINLOCK(ublk_idr_lock);
+static wait_queue_head_t ublk_idr_wq;	/* wait until one idr is freed */
+
+static DEFINE_MUTEX(ublk_ctl_mutex);
+
+static struct miscdevice ublk_misc;
+
+static void ublk_dev_param_basic_apply(struct ublk_device *ub)
+{
+	struct request_queue *q = ub->ub_disk->queue;
+	const struct ublk_param_basic *p = &ub->params.basic;
+
+	blk_queue_logical_block_size(q, 1 << p->logical_bs_shift);
+	blk_queue_physical_block_size(q, 1 << p->physical_bs_shift);
+	blk_queue_io_min(q, 1 << p->io_min_shift);
+	blk_queue_io_opt(q, 1 << p->io_opt_shift);
+
+	blk_queue_write_cache(q, p->attrs & UBLK_ATTR_VOLATILE_CACHE,
+			p->attrs & UBLK_ATTR_FUA);
+	if (p->attrs & UBLK_ATTR_ROTATIONAL)
+		blk_queue_flag_clear(QUEUE_FLAG_NONROT, q);
+	else
+		blk_queue_flag_set(QUEUE_FLAG_NONROT, q);
+
+	blk_queue_max_hw_sectors(q, p->max_sectors);
+	blk_queue_chunk_sectors(q, p->chunk_sectors);
+	blk_queue_virt_boundary(q, p->virt_boundary_mask);
+
+	if (p->attrs & UBLK_ATTR_READ_ONLY)
+		set_disk_ro(ub->ub_disk, true);
+
+	set_capacity(ub->ub_disk, p->dev_sectors);
+}
+
+static void ublk_dev_param_discard_apply(struct ublk_device *ub)
+{
+	struct request_queue *q = ub->ub_disk->queue;
+	const struct ublk_param_discard *p = &ub->params.discard;
+
+	q->limits.discard_alignment = p->discard_alignment;
+	q->limits.discard_granularity = p->discard_granularity;
+	blk_queue_max_discard_sectors(q, p->max_discard_sectors);
+	blk_queue_max_write_zeroes_sectors(q,
+			p->max_write_zeroes_sectors);
+	blk_queue_max_discard_segments(q, p->max_discard_segments);
+}
+
+static int ublk_validate_params(const struct ublk_device *ub)
+{
+	/* basic param is the only one which must be set */
+	if (ub->params.types & UBLK_PARAM_TYPE_BASIC) {
+		const struct ublk_param_basic *p = &ub->params.basic;
+
+		if (p->logical_bs_shift > PAGE_SHIFT || p->logical_bs_shift < 9)
+			return -EINVAL;
+
+		if (p->logical_bs_shift > p->physical_bs_shift)
+			return -EINVAL;
+
+		if (p->max_sectors > (ub->dev_info.max_io_buf_bytes >> 9))
+			return -EINVAL;
+	} else
+		return -EINVAL;
+
+	if (ub->params.types & UBLK_PARAM_TYPE_DISCARD) {
+		const struct ublk_param_discard *p = &ub->params.discard;
+
+		/* So far, only support single segment discard */
+		if (p->max_discard_sectors && p->max_discard_segments != 1)
+			return -EINVAL;
+
+		if (!p->discard_granularity)
+			return -EINVAL;
+	}
+
+	return 0;
+}
+
+static int ublk_apply_params(struct ublk_device *ub)
+{
+	if (!(ub->params.types & UBLK_PARAM_TYPE_BASIC))
+		return -EINVAL;
+
+	ublk_dev_param_basic_apply(ub);
+
+	if (ub->params.types & UBLK_PARAM_TYPE_DISCARD)
+		ublk_dev_param_discard_apply(ub);
+
+	return 0;
+}
+
+static inline bool ublk_can_use_task_work(const struct ublk_queue *ubq)
+{
+	if (IS_BUILTIN(CONFIG_BLK_DEV_UBLK) &&
+			!(ubq->flags & UBLK_F_URING_CMD_COMP_IN_TASK))
+		return true;
+	return false;
+}
+
+static inline bool ublk_need_get_data(const struct ublk_queue *ubq)
+{
+	if (ubq->flags & UBLK_F_NEED_GET_DATA)
+		return true;
+	return false;
+}
+
+static struct ublk_device *ublk_get_device(struct ublk_device *ub)
+{
+	if (kobject_get_unless_zero(&ub->cdev_dev.kobj))
+		return ub;
+	return NULL;
+}
+
+static void ublk_put_device(struct ublk_device *ub)
+{
+	put_device(&ub->cdev_dev);
+}
+
+static inline struct ublk_queue *ublk_get_queue(struct ublk_device *dev,
+		int qid)
+{
+       return (struct ublk_queue *)&(dev->__queues[qid * dev->queue_size]);
+}
+
+static inline bool ublk_rq_has_data(const struct request *rq)
+{
+	return rq->bio && bio_has_data(rq->bio);
+}
+
+static inline struct ublksrv_io_desc *ublk_get_iod(struct ublk_queue *ubq,
+		int tag)
+{
+	return (struct ublksrv_io_desc *)
+		&(ubq->io_cmd_buf[tag * sizeof(struct ublksrv_io_desc)]);
+}
+
+static inline char *ublk_queue_cmd_buf(struct ublk_device *ub, int q_id)
+{
+	return ublk_get_queue(ub, q_id)->io_cmd_buf;
+}
+
+static inline int ublk_queue_cmd_buf_size(struct ublk_device *ub, int q_id)
+{
+	struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
+
+	return round_up(ubq->q_depth * sizeof(struct ublksrv_io_desc),
+			PAGE_SIZE);
+}
+
+static inline bool ublk_queue_can_use_recovery_reissue(
+		struct ublk_queue *ubq)
+{
+	if ((ubq->flags & UBLK_F_USER_RECOVERY) &&
+			(ubq->flags & UBLK_F_USER_RECOVERY_REISSUE))
+		return true;
+	return false;
+}
+
+static inline bool ublk_queue_can_use_recovery(
+		struct ublk_queue *ubq)
+{
+	if (ubq->flags & UBLK_F_USER_RECOVERY)
+		return true;
+	return false;
+}
+
+static inline bool ublk_can_use_recovery(struct ublk_device *ub)
+{
+	if (ub->dev_info.flags & UBLK_F_USER_RECOVERY)
+		return true;
+	return false;
+}
+
+static void ublk_free_disk(struct gendisk *disk)
+{
+	struct ublk_device *ub = disk->private_data;
+
+	clear_bit(UB_STATE_USED, &ub->state);
+	put_device(&ub->cdev_dev);
+}
+
+static const struct block_device_operations ub_fops = {
+	.owner =	THIS_MODULE,
+	.free_disk =	ublk_free_disk,
+};
+
+#define UBLK_MAX_PIN_PAGES	32
+
+struct ublk_map_data {
+	const struct ublk_queue *ubq;
+	const struct request *rq;
+	const struct ublk_io *io;
+	unsigned max_bytes;
+};
+
+struct ublk_io_iter {
+	struct page *pages[UBLK_MAX_PIN_PAGES];
+	unsigned pg_off;	/* offset in the 1st page in pages */
+	int nr_pages;		/* how many page pointers in pages */
+	struct bio *bio;
+	struct bvec_iter iter;
+};
+
+static inline unsigned ublk_copy_io_pages(struct ublk_io_iter *data,
+		unsigned max_bytes, bool to_vm)
+{
+	const unsigned total = min_t(unsigned, max_bytes,
+			PAGE_SIZE - data->pg_off +
+			((data->nr_pages - 1) << PAGE_SHIFT));
+	unsigned done = 0;
+	unsigned pg_idx = 0;
+
+	while (done < total) {
+		struct bio_vec bv = bio_iter_iovec(data->bio, data->iter);
+		const unsigned int bytes = min3(bv.bv_len, total - done,
+				(unsigned)(PAGE_SIZE - data->pg_off));
+		void *bv_buf = bvec_kmap_local(&bv);
+		void *pg_buf = kmap_local_page(data->pages[pg_idx]);
+
+		if (to_vm)
+			memcpy(pg_buf + data->pg_off, bv_buf, bytes);
+		else
+			memcpy(bv_buf, pg_buf + data->pg_off, bytes);
+
+		kunmap_local(pg_buf);
+		kunmap_local(bv_buf);
+
+		/* advance page array */
+		data->pg_off += bytes;
+		if (data->pg_off == PAGE_SIZE) {
+			pg_idx += 1;
+			data->pg_off = 0;
+		}
+
+		done += bytes;
+
+		/* advance bio */
+		bio_advance_iter_single(data->bio, &data->iter, bytes);
+		if (!data->iter.bi_size) {
+			data->bio = data->bio->bi_next;
+			if (data->bio == NULL)
+				break;
+			data->iter = data->bio->bi_iter;
+		}
+	}
+
+	return done;
+}
+
+static inline int ublk_copy_user_pages(struct ublk_map_data *data,
+		bool to_vm)
+{
+	const unsigned int gup_flags = to_vm ? FOLL_WRITE : 0;
+	const unsigned long start_vm = data->io->addr;
+	unsigned int done = 0;
+	struct ublk_io_iter iter = {
+		.pg_off	= start_vm & (PAGE_SIZE - 1),
+		.bio	= data->rq->bio,
+		.iter	= data->rq->bio->bi_iter,
+	};
+	const unsigned int nr_pages = round_up(data->max_bytes +
+			(start_vm & (PAGE_SIZE - 1)), PAGE_SIZE) >> PAGE_SHIFT;
+
+	while (done < nr_pages) {
+		const unsigned to_pin = min_t(unsigned, UBLK_MAX_PIN_PAGES,
+				nr_pages - done);
+		unsigned i, len;
+
+		iter.nr_pages = get_user_pages_fast(start_vm +
+				(done << PAGE_SHIFT), to_pin, gup_flags,
+				iter.pages);
+		if (iter.nr_pages <= 0)
+			return done == 0 ? iter.nr_pages : done;
+		len = ublk_copy_io_pages(&iter, data->max_bytes, to_vm);
+		for (i = 0; i < iter.nr_pages; i++) {
+			if (to_vm)
+				set_page_dirty(iter.pages[i]);
+			put_page(iter.pages[i]);
+		}
+		data->max_bytes -= len;
+		done += iter.nr_pages;
+	}
+
+	return done;
+}
+
+static int ublk_map_io(const struct ublk_queue *ubq, const struct request *req,
+		struct ublk_io *io)
+{
+	const unsigned int rq_bytes = blk_rq_bytes(req);
+	/*
+	 * no zero copy, we delay copy WRITE request data into ublksrv
+	 * context and the big benefit is that pinning pages in current
+	 * context is pretty fast, see ublk_pin_user_pages
+	 */
+	if (req_op(req) != REQ_OP_WRITE && req_op(req) != REQ_OP_FLUSH)
+		return rq_bytes;
+
+	if (ublk_rq_has_data(req)) {
+		struct ublk_map_data data = {
+			.ubq	=	ubq,
+			.rq	=	req,
+			.io	=	io,
+			.max_bytes =	rq_bytes,
+		};
+
+		ublk_copy_user_pages(&data, true);
+
+		return rq_bytes - data.max_bytes;
+	}
+	return rq_bytes;
+}
+
+static int ublk_unmap_io(const struct ublk_queue *ubq,
+		const struct request *req,
+		struct ublk_io *io)
+{
+	const unsigned int rq_bytes = blk_rq_bytes(req);
+
+	if (req_op(req) == REQ_OP_READ && ublk_rq_has_data(req)) {
+		struct ublk_map_data data = {
+			.ubq	=	ubq,
+			.rq	=	req,
+			.io	=	io,
+			.max_bytes =	io->res,
+		};
+
+		WARN_ON_ONCE(io->res > rq_bytes);
+
+		ublk_copy_user_pages(&data, false);
+
+		return io->res - data.max_bytes;
+	}
+	return rq_bytes;
+}
+
+static inline unsigned int ublk_req_build_flags(struct request *req)
+{
+	unsigned flags = 0;
+
+	if (req->cmd_flags & REQ_FAILFAST_DEV)
+		flags |= UBLK_IO_F_FAILFAST_DEV;
+
+	if (req->cmd_flags & REQ_FAILFAST_TRANSPORT)
+		flags |= UBLK_IO_F_FAILFAST_TRANSPORT;
+
+	if (req->cmd_flags & REQ_FAILFAST_DRIVER)
+		flags |= UBLK_IO_F_FAILFAST_DRIVER;
+
+	if (req->cmd_flags & REQ_META)
+		flags |= UBLK_IO_F_META;
+
+	if (req->cmd_flags & REQ_FUA)
+		flags |= UBLK_IO_F_FUA;
+
+	if (req->cmd_flags & REQ_NOUNMAP)
+		flags |= UBLK_IO_F_NOUNMAP;
+
+	if (req->cmd_flags & REQ_SWAP)
+		flags |= UBLK_IO_F_SWAP;
+
+	return flags;
+}
+
+static blk_status_t ublk_setup_iod(struct ublk_queue *ubq, struct request *req)
+{
+	struct ublksrv_io_desc *iod = ublk_get_iod(ubq, req->tag);
+	struct ublk_io *io = &ubq->ios[req->tag];
+	u32 ublk_op;
+
+	switch (req_op(req)) {
+	case REQ_OP_READ:
+		ublk_op = UBLK_IO_OP_READ;
+		break;
+	case REQ_OP_WRITE:
+		ublk_op = UBLK_IO_OP_WRITE;
+		break;
+	case REQ_OP_FLUSH:
+		ublk_op = UBLK_IO_OP_FLUSH;
+		break;
+	case REQ_OP_DISCARD:
+		ublk_op = UBLK_IO_OP_DISCARD;
+		break;
+	case REQ_OP_WRITE_ZEROES:
+		ublk_op = UBLK_IO_OP_WRITE_ZEROES;
+		break;
+	default:
+		return BLK_STS_IOERR;
+	}
+
+	/* need to translate since kernel may change */
+	iod->op_flags = ublk_op | ublk_req_build_flags(req);
+	iod->nr_sectors = blk_rq_sectors(req);
+	iod->start_sector = blk_rq_pos(req);
+	iod->addr = io->addr;
+
+	return BLK_STS_OK;
+}
+
+static inline struct ublk_uring_cmd_pdu *ublk_get_uring_cmd_pdu(
+		struct io_uring_cmd *ioucmd)
+{
+	return (struct ublk_uring_cmd_pdu *)&ioucmd->pdu;
+}
+
+static inline bool ubq_daemon_is_dying(struct ublk_queue *ubq)
+{
+	return ubq->ubq_daemon->flags & PF_EXITING;
+}
+
+/* todo: handle partial completion */
+static void ublk_complete_rq(struct request *req)
+{
+	struct ublk_queue *ubq = req->mq_hctx->driver_data;
+	struct ublk_io *io = &ubq->ios[req->tag];
+	unsigned int unmapped_bytes;
+
+	/* failed read IO if nothing is read */
+	if (!io->res && req_op(req) == REQ_OP_READ)
+		io->res = -EIO;
+
+	if (io->res < 0) {
+		blk_mq_end_request(req, errno_to_blk_status(io->res));
+		return;
+	}
+
+	/*
+	 * FLUSH or DISCARD usually won't return bytes returned, so end them
+	 * directly.
+	 *
+	 * Both the two needn't unmap.
+	 */
+	if (req_op(req) != REQ_OP_READ && req_op(req) != REQ_OP_WRITE) {
+		blk_mq_end_request(req, BLK_STS_OK);
+		return;
+	}
+
+	/* for READ request, writing data in iod->addr to rq buffers */
+	unmapped_bytes = ublk_unmap_io(ubq, req, io);
+
+	/*
+	 * Extremely impossible since we got data filled in just before
+	 *
+	 * Re-read simply for this unlikely case.
+	 */
+	if (unlikely(unmapped_bytes < io->res))
+		io->res = unmapped_bytes;
+
+	if (blk_update_request(req, BLK_STS_OK, io->res))
+		blk_mq_requeue_request(req, true);
+	else
+		__blk_mq_end_request(req, BLK_STS_OK);
+}
+
+/*
+ * Since __ublk_rq_task_work always fails requests immediately during
+ * exiting, __ublk_fail_req() is only called from abort context during
+ * exiting. So lock is unnecessary.
+ *
+ * Also aborting may not be started yet, keep in mind that one failed
+ * request may be issued by block layer again.
+ */
+static void __ublk_fail_req(struct ublk_queue *ubq, struct ublk_io *io,
+		struct request *req)
+{
+	WARN_ON_ONCE(io->flags & UBLK_IO_FLAG_ACTIVE);
+
+	if (!(io->flags & UBLK_IO_FLAG_ABORTED)) {
+		io->flags |= UBLK_IO_FLAG_ABORTED;
+		if (ublk_queue_can_use_recovery_reissue(ubq))
+			blk_mq_requeue_request(req, false);
+		else
+			blk_mq_end_request(req, BLK_STS_IOERR);
+	}
+}
+
+static void ubq_complete_io_cmd(struct ublk_io *io, int res,
+				unsigned issue_flags)
+{
+	/* mark this cmd owned by ublksrv */
+	io->flags |= UBLK_IO_FLAG_OWNED_BY_SRV;
+
+	/*
+	 * clear ACTIVE since we are done with this sqe/cmd slot
+	 * We can only accept io cmd in case of being not active.
+	 */
+	io->flags &= ~UBLK_IO_FLAG_ACTIVE;
+
+	/* tell ublksrv one io request is coming */
+	io_uring_cmd_done(io->cmd, res, 0, issue_flags);
+}
+
+#define UBLK_REQUEUE_DELAY_MS	3
+
+static inline void __ublk_abort_rq(struct ublk_queue *ubq,
+		struct request *rq)
+{
+	/* We cannot process this rq so just requeue it. */
+	if (ublk_queue_can_use_recovery(ubq))
+		blk_mq_requeue_request(rq, false);
+	else
+		blk_mq_end_request(rq, BLK_STS_IOERR);
+
+	mod_delayed_work(system_wq, &ubq->dev->monitor_work, 0);
+}
+
+static inline void __ublk_rq_task_work(struct request *req,
+				       unsigned issue_flags)
+{
+	struct ublk_queue *ubq = req->mq_hctx->driver_data;
+	int tag = req->tag;
+	struct ublk_io *io = &ubq->ios[tag];
+	unsigned int mapped_bytes;
+
+	pr_devel("%s: complete: op %d, qid %d tag %d io_flags %x addr %llx\n",
+			__func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags,
+			ublk_get_iod(ubq, req->tag)->addr);
+
+	/*
+	 * Task is exiting if either:
+	 *
+	 * (1) current != ubq_daemon.
+	 * io_uring_cmd_complete_in_task() tries to run task_work
+	 * in a workqueue if ubq_daemon(cmd's task) is PF_EXITING.
+	 *
+	 * (2) current->flags & PF_EXITING.
+	 */
+	if (unlikely(current != ubq->ubq_daemon || current->flags & PF_EXITING)) {
+		__ublk_abort_rq(ubq, req);
+		return;
+	}
+
+	if (ublk_need_get_data(ubq) &&
+			(req_op(req) == REQ_OP_WRITE ||
+			req_op(req) == REQ_OP_FLUSH)) {
+		/*
+		 * We have not handled UBLK_IO_NEED_GET_DATA command yet,
+		 * so immepdately pass UBLK_IO_RES_NEED_GET_DATA to ublksrv
+		 * and notify it.
+		 */
+		if (!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA)) {
+			io->flags |= UBLK_IO_FLAG_NEED_GET_DATA;
+			pr_devel("%s: need get data. op %d, qid %d tag %d io_flags %x\n",
+					__func__, io->cmd->cmd_op, ubq->q_id,
+					req->tag, io->flags);
+			ubq_complete_io_cmd(io, UBLK_IO_RES_NEED_GET_DATA, issue_flags);
+			return;
+		}
+		/*
+		 * We have handled UBLK_IO_NEED_GET_DATA command,
+		 * so clear UBLK_IO_FLAG_NEED_GET_DATA now and just
+		 * do the copy work.
+		 */
+		io->flags &= ~UBLK_IO_FLAG_NEED_GET_DATA;
+		/* update iod->addr because ublksrv may have passed a new io buffer */
+		ublk_get_iod(ubq, req->tag)->addr = io->addr;
+		pr_devel("%s: update iod->addr: op %d, qid %d tag %d io_flags %x addr %llx\n",
+				__func__, io->cmd->cmd_op, ubq->q_id, req->tag, io->flags,
+				ublk_get_iod(ubq, req->tag)->addr);
+	}
+
+	mapped_bytes = ublk_map_io(ubq, req, io);
+
+	/* partially mapped, update io descriptor */
+	if (unlikely(mapped_bytes != blk_rq_bytes(req))) {
+		/*
+		 * Nothing mapped, retry until we succeed.
+		 *
+		 * We may never succeed in mapping any bytes here because
+		 * of OOM. TODO: reserve one buffer with single page pinned
+		 * for providing forward progress guarantee.
+		 */
+		if (unlikely(!mapped_bytes)) {
+			blk_mq_requeue_request(req, false);
+			blk_mq_delay_kick_requeue_list(req->q,
+					UBLK_REQUEUE_DELAY_MS);
+			return;
+		}
+
+		ublk_get_iod(ubq, req->tag)->nr_sectors =
+			mapped_bytes >> 9;
+	}
+
+	ubq_complete_io_cmd(io, UBLK_IO_RES_OK, issue_flags);
+}
+
+static inline void ublk_forward_io_cmds(struct ublk_queue *ubq,
+					unsigned issue_flags)
+{
+	struct llist_node *io_cmds = llist_del_all(&ubq->io_cmds);
+	struct ublk_rq_data *data, *tmp;
+
+	io_cmds = llist_reverse_order(io_cmds);
+	llist_for_each_entry_safe(data, tmp, io_cmds, node)
+		__ublk_rq_task_work(blk_mq_rq_from_pdu(data), issue_flags);
+}
+
+static inline void ublk_abort_io_cmds(struct ublk_queue *ubq)
+{
+	struct llist_node *io_cmds = llist_del_all(&ubq->io_cmds);
+	struct ublk_rq_data *data, *tmp;
+
+	llist_for_each_entry_safe(data, tmp, io_cmds, node)
+		__ublk_abort_rq(ubq, blk_mq_rq_from_pdu(data));
+}
+
+static void ublk_rq_task_work_cb(struct io_uring_cmd *cmd, unsigned issue_flags)
+{
+	struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
+	struct ublk_queue *ubq = pdu->ubq;
+
+	ublk_forward_io_cmds(ubq, issue_flags);
+}
+
+static void ublk_rq_task_work_fn(struct callback_head *work)
+{
+	struct ublk_rq_data *data = container_of(work,
+			struct ublk_rq_data, work);
+	struct request *req = blk_mq_rq_from_pdu(data);
+	struct ublk_queue *ubq = req->mq_hctx->driver_data;
+	unsigned issue_flags = IO_URING_F_UNLOCKED;
+
+	ublk_forward_io_cmds(ubq, issue_flags);
+}
+
+static void ublk_queue_cmd(struct ublk_queue *ubq, struct request *rq)
+{
+	struct ublk_rq_data *data = blk_mq_rq_to_pdu(rq);
+	struct ublk_io *io;
+
+	if (!llist_add(&data->node, &ubq->io_cmds))
+		return;
+
+	io = &ubq->ios[rq->tag];
+	/*
+	 * If the check pass, we know that this is a re-issued request aborted
+	 * previously in monitor_work because the ubq_daemon(cmd's task) is
+	 * PF_EXITING. We cannot call io_uring_cmd_complete_in_task() anymore
+	 * because this ioucmd's io_uring context may be freed now if no inflight
+	 * ioucmd exists. Otherwise we may cause null-deref in ctx->fallback_work.
+	 *
+	 * Note: monitor_work sets UBLK_IO_FLAG_ABORTED and ends this request(releasing
+	 * the tag). Then the request is re-started(allocating the tag) and we are here.
+	 * Since releasing/allocating a tag implies smp_mb(), finding UBLK_IO_FLAG_ABORTED
+	 * guarantees that here is a re-issued request aborted previously.
+	 */
+	if (unlikely(io->flags & UBLK_IO_FLAG_ABORTED)) {
+		ublk_abort_io_cmds(ubq);
+	} else if (ublk_can_use_task_work(ubq)) {
+		if (task_work_add(ubq->ubq_daemon, &data->work,
+					TWA_SIGNAL_NO_IPI))
+			ublk_abort_io_cmds(ubq);
+	} else {
+		struct io_uring_cmd *cmd = io->cmd;
+		struct ublk_uring_cmd_pdu *pdu = ublk_get_uring_cmd_pdu(cmd);
+
+		pdu->ubq = ubq;
+		io_uring_cmd_complete_in_task(cmd, ublk_rq_task_work_cb);
+	}
+}
+
+static blk_status_t ublk_queue_rq(struct blk_mq_hw_ctx *hctx,
+		const struct blk_mq_queue_data *bd)
+{
+	struct ublk_queue *ubq = hctx->driver_data;
+	struct request *rq = bd->rq;
+	blk_status_t res;
+
+	/* fill iod to slot in io cmd buffer */
+	res = ublk_setup_iod(ubq, rq);
+	if (unlikely(res != BLK_STS_OK))
+		return BLK_STS_IOERR;
+
+	/* With recovery feature enabled, force_abort is set in
+	 * ublk_stop_dev() before calling del_gendisk(). We have to
+	 * abort all requeued and new rqs here to let del_gendisk()
+	 * move on. Besides, we cannot not call io_uring_cmd_complete_in_task()
+	 * to avoid UAF on io_uring ctx.
+	 *
+	 * Note: force_abort is guaranteed to be seen because it is set
+	 * before request queue is unqiuesced.
+	 */
+	if (ublk_queue_can_use_recovery(ubq) && unlikely(ubq->force_abort))
+		return BLK_STS_IOERR;
+
+	blk_mq_start_request(bd->rq);
+
+	if (unlikely(ubq_daemon_is_dying(ubq))) {
+		__ublk_abort_rq(ubq, rq);
+		return BLK_STS_OK;
+	}
+
+	ublk_queue_cmd(ubq, rq);
+
+	return BLK_STS_OK;
+}
+
+static int ublk_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
+		unsigned int hctx_idx)
+{
+	struct ublk_device *ub = driver_data;
+	struct ublk_queue *ubq = ublk_get_queue(ub, hctx->queue_num);
+
+	hctx->driver_data = ubq;
+	return 0;
+}
+
+static int ublk_init_rq(struct blk_mq_tag_set *set, struct request *req,
+		unsigned int hctx_idx, unsigned int numa_node)
+{
+	struct ublk_rq_data *data = blk_mq_rq_to_pdu(req);
+
+	init_task_work(&data->work, ublk_rq_task_work_fn);
+	return 0;
+}
+
+static const struct blk_mq_ops ublk_mq_ops = {
+	.queue_rq       = ublk_queue_rq,
+	.init_hctx	= ublk_init_hctx,
+	.init_request   = ublk_init_rq,
+};
+
+static int ublk_ch_open(struct inode *inode, struct file *filp)
+{
+	struct ublk_device *ub = container_of(inode->i_cdev,
+			struct ublk_device, cdev);
+
+	if (test_and_set_bit(UB_STATE_OPEN, &ub->state))
+		return -EBUSY;
+	filp->private_data = ub;
+	return 0;
+}
+
+static int ublk_ch_release(struct inode *inode, struct file *filp)
+{
+	struct ublk_device *ub = filp->private_data;
+
+	clear_bit(UB_STATE_OPEN, &ub->state);
+	return 0;
+}
+
+/* map pre-allocated per-queue cmd buffer to ublksrv daemon */
+static int ublk_ch_mmap(struct file *filp, struct vm_area_struct *vma)
+{
+	struct ublk_device *ub = filp->private_data;
+	size_t sz = vma->vm_end - vma->vm_start;
+	unsigned max_sz = UBLK_MAX_QUEUE_DEPTH * sizeof(struct ublksrv_io_desc);
+	unsigned long pfn, end, phys_off = vma->vm_pgoff << PAGE_SHIFT;
+	int q_id, ret = 0;
+
+	spin_lock(&ub->mm_lock);
+	if (!ub->mm)
+		ub->mm = current->mm;
+	if (current->mm != ub->mm)
+		ret = -EINVAL;
+	spin_unlock(&ub->mm_lock);
+
+	if (ret)
+		return ret;
+
+	if (vma->vm_flags & VM_WRITE)
+		return -EPERM;
+
+	end = UBLKSRV_CMD_BUF_OFFSET + ub->dev_info.nr_hw_queues * max_sz;
+	if (phys_off < UBLKSRV_CMD_BUF_OFFSET || phys_off >= end)
+		return -EINVAL;
+
+	q_id = (phys_off - UBLKSRV_CMD_BUF_OFFSET) / max_sz;
+	pr_devel("%s: qid %d, pid %d, addr %lx pg_off %lx sz %lu\n",
+			__func__, q_id, current->pid, vma->vm_start,
+			phys_off, (unsigned long)sz);
+
+	if (sz != ublk_queue_cmd_buf_size(ub, q_id))
+		return -EINVAL;
+
+	pfn = virt_to_phys(ublk_queue_cmd_buf(ub, q_id)) >> PAGE_SHIFT;
+	return remap_pfn_range(vma, vma->vm_start, pfn, sz, vma->vm_page_prot);
+}
+
+static void ublk_commit_completion(struct ublk_device *ub,
+		struct ublksrv_io_cmd *ub_cmd)
+{
+	u32 qid = ub_cmd->q_id, tag = ub_cmd->tag;
+	struct ublk_queue *ubq = ublk_get_queue(ub, qid);
+	struct ublk_io *io = &ubq->ios[tag];
+	struct request *req;
+
+	/* now this cmd slot is owned by nbd driver */
+	io->flags &= ~UBLK_IO_FLAG_OWNED_BY_SRV;
+	io->res = ub_cmd->result;
+
+	/* find the io request and complete */
+	req = blk_mq_tag_to_rq(ub->tag_set.tags[qid], tag);
+
+	if (req && likely(!blk_should_fake_timeout(req->q)))
+		ublk_complete_rq(req);
+}
+
+/*
+ * When ->ubq_daemon is exiting, either new request is ended immediately,
+ * or any queued io command is drained, so it is safe to abort queue
+ * lockless
+ */
+static void ublk_abort_queue(struct ublk_device *ub, struct ublk_queue *ubq)
+{
+	int i;
+
+	if (!ublk_get_device(ub))
+		return;
+
+	for (i = 0; i < ubq->q_depth; i++) {
+		struct ublk_io *io = &ubq->ios[i];
+
+		if (!(io->flags & UBLK_IO_FLAG_ACTIVE)) {
+			struct request *rq;
+
+			/*
+			 * Either we fail the request or ublk_rq_task_work_fn
+			 * will do it
+			 */
+			rq = blk_mq_tag_to_rq(ub->tag_set.tags[ubq->q_id], i);
+			if (rq)
+				__ublk_fail_req(ubq, io, rq);
+		}
+	}
+	ublk_put_device(ub);
+}
+
+static void ublk_daemon_monitor_work(struct work_struct *work)
+{
+	struct ublk_device *ub =
+		container_of(work, struct ublk_device, monitor_work.work);
+	int i;
+
+	for (i = 0; i < ub->dev_info.nr_hw_queues; i++) {
+		struct ublk_queue *ubq = ublk_get_queue(ub, i);
+
+		if (ubq_daemon_is_dying(ubq)) {
+			if (ublk_queue_can_use_recovery(ubq))
+				schedule_work(&ub->quiesce_work);
+			else
+				schedule_work(&ub->stop_work);
+
+			/* abort queue is for making forward progress */
+			ublk_abort_queue(ub, ubq);
+		}
+	}
+
+	/*
+	 * We can't schedule monitor work after ub's state is not UBLK_S_DEV_LIVE.
+	 * after ublk_remove() or __ublk_quiesce_dev() is started.
+	 *
+	 * No need ub->mutex, monitor work are canceled after state is marked
+	 * as not LIVE, so new state is observed reliably.
+	 */
+	if (ub->dev_info.state == UBLK_S_DEV_LIVE)
+		schedule_delayed_work(&ub->monitor_work,
+				UBLK_DAEMON_MONITOR_PERIOD);
+}
+
+static inline bool ublk_queue_ready(struct ublk_queue *ubq)
+{
+	return ubq->nr_io_ready == ubq->q_depth;
+}
+
+static void ublk_cancel_queue(struct ublk_queue *ubq)
+{
+	int i;
+
+	for (i = 0; i < ubq->q_depth; i++) {
+		struct ublk_io *io = &ubq->ios[i];
+
+		if (io->flags & UBLK_IO_FLAG_ACTIVE) {
+			bool done;
+
+			spin_lock(&ubq->cancel_lock);
+			done = !!(io->flags & UBLK_IO_FLAG_CANCELED);
+			if (!done)
+				io->flags |= UBLK_IO_FLAG_CANCELED;
+			spin_unlock(&ubq->cancel_lock);
+
+			if (!done)
+				io_uring_cmd_done(io->cmd,
+						UBLK_IO_RES_ABORT, 0,
+						IO_URING_F_UNLOCKED);
+		}
+	}
+}
+
+/* Cancel all pending commands, must be called after del_gendisk() returns */
+static void ublk_cancel_dev(struct ublk_device *ub)
+{
+	int i;
+
+	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
+		ublk_cancel_queue(ublk_get_queue(ub, i));
+}
+
+static bool ublk_check_inflight_rq(struct request *rq, void *data)
+{
+	bool *idle = data;
+
+	if (blk_mq_request_started(rq)) {
+		*idle = false;
+		return false;
+	}
+	return true;
+}
+
+static void ublk_wait_tagset_rqs_idle(struct ublk_device *ub)
+{
+	bool idle;
+
+	WARN_ON_ONCE(!blk_queue_quiesced(ub->ub_disk->queue));
+	while (true) {
+		idle = true;
+		blk_mq_tagset_busy_iter(&ub->tag_set,
+				ublk_check_inflight_rq, &idle);
+		if (idle)
+			break;
+		msleep(UBLK_REQUEUE_DELAY_MS);
+	}
+}
+
+static void __ublk_quiesce_dev(struct ublk_device *ub)
+{
+	pr_devel("%s: quiesce ub: dev_id %d state %s\n",
+			__func__, ub->dev_info.dev_id,
+			ub->dev_info.state == UBLK_S_DEV_LIVE ?
+			"LIVE" : "QUIESCED");
+	blk_mq_quiesce_queue(ub->ub_disk->queue);
+	ublk_wait_tagset_rqs_idle(ub);
+	ub->dev_info.state = UBLK_S_DEV_QUIESCED;
+	/* we are going to release task_struct of ubq_daemon and resets
+	 * ->ubq_daemon to NULL. So in monitor_work, check on ubq_daemon causes UAF.
+	 * Besides, monitor_work is not necessary in QUIESCED state since we have
+	 * already scheduled quiesce_work and quiesced all ubqs.
+	 *
+	 * Do not let monitor_work schedule itself if state it QUIESCED. And we cancel
+	 * it here and re-schedule it in END_USER_RECOVERY to avoid UAF.
+	 */
+	cancel_delayed_work_sync(&ub->monitor_work);
+}
+
+static void ublk_quiesce_work_fn(struct work_struct *work)
+{
+	struct ublk_device *ub =
+		container_of(work, struct ublk_device, quiesce_work);
+
+	mutex_lock(&ub->mutex);
+	if (ub->dev_info.state != UBLK_S_DEV_LIVE)
+		goto unlock;
+	__ublk_quiesce_dev(ub);
+ unlock:
+	mutex_unlock(&ub->mutex);
+	ublk_cancel_dev(ub);
+}
+
+static void ublk_unquiesce_dev(struct ublk_device *ub)
+{
+	int i;
+
+	pr_devel("%s: unquiesce ub: dev_id %d state %s\n",
+			__func__, ub->dev_info.dev_id,
+			ub->dev_info.state == UBLK_S_DEV_LIVE ?
+			"LIVE" : "QUIESCED");
+	/* quiesce_work has run. We let requeued rqs be aborted
+	 * before running fallback_wq. "force_abort" must be seen
+	 * after request queue is unqiuesced. Then del_gendisk()
+	 * can move on.
+	 */
+	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
+		ublk_get_queue(ub, i)->force_abort = true;
+
+	blk_mq_unquiesce_queue(ub->ub_disk->queue);
+	/* We may have requeued some rqs in ublk_quiesce_queue() */
+	blk_mq_kick_requeue_list(ub->ub_disk->queue);
+}
+
+static void ublk_stop_dev(struct ublk_device *ub)
+{
+	mutex_lock(&ub->mutex);
+	if (ub->dev_info.state == UBLK_S_DEV_DEAD)
+		goto unlock;
+	if (ublk_can_use_recovery(ub)) {
+		if (ub->dev_info.state == UBLK_S_DEV_LIVE)
+			__ublk_quiesce_dev(ub);
+		ublk_unquiesce_dev(ub);
+	}
+	del_gendisk(ub->ub_disk);
+	ub->dev_info.state = UBLK_S_DEV_DEAD;
+	ub->dev_info.ublksrv_pid = -1;
+	put_disk(ub->ub_disk);
+	ub->ub_disk = NULL;
+ unlock:
+	mutex_unlock(&ub->mutex);
+	ublk_cancel_dev(ub);
+	cancel_delayed_work_sync(&ub->monitor_work);
+}
+
+/* device can only be started after all IOs are ready */
+static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq)
+{
+	mutex_lock(&ub->mutex);
+	ubq->nr_io_ready++;
+	if (ublk_queue_ready(ubq)) {
+		ubq->ubq_daemon = current;
+		get_task_struct(ubq->ubq_daemon);
+		ub->nr_queues_ready++;
+
+		if (capable(CAP_SYS_ADMIN))
+			ub->nr_privileged_daemon++;
+	}
+	if (ub->nr_queues_ready == ub->dev_info.nr_hw_queues)
+		complete_all(&ub->completion);
+	mutex_unlock(&ub->mutex);
+}
+
+static void ublk_handle_need_get_data(struct ublk_device *ub, int q_id,
+		int tag)
+{
+	struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
+	struct request *req = blk_mq_tag_to_rq(ub->tag_set.tags[q_id], tag);
+
+	ublk_queue_cmd(ubq, req);
+}
+
+static int __ublk_ch_uring_cmd(struct io_uring_cmd *cmd,
+			       unsigned int issue_flags,
+			       struct ublksrv_io_cmd *ub_cmd)
+{
+	struct ublk_device *ub = cmd->file->private_data;
+	struct ublk_queue *ubq;
+	struct ublk_io *io;
+	u32 cmd_op = cmd->cmd_op;
+	unsigned tag = ub_cmd->tag;
+	int ret = -EINVAL;
+	struct request *req;
+
+	pr_devel("%s: received: cmd op %d queue %d tag %d result %d\n",
+			__func__, cmd->cmd_op, ub_cmd->q_id, tag,
+			ub_cmd->result);
+
+	if (ub_cmd->q_id >= ub->dev_info.nr_hw_queues)
+		goto out;
+
+	ubq = ublk_get_queue(ub, ub_cmd->q_id);
+	if (!ubq || ub_cmd->q_id != ubq->q_id)
+		goto out;
+
+	if (ubq->ubq_daemon && ubq->ubq_daemon != current)
+		goto out;
+
+	if (tag >= ubq->q_depth)
+		goto out;
+
+	io = &ubq->ios[tag];
+
+	/* there is pending io cmd, something must be wrong */
+	if (io->flags & UBLK_IO_FLAG_ACTIVE) {
+		ret = -EBUSY;
+		goto out;
+	}
+
+	/*
+	 * ensure that the user issues UBLK_IO_NEED_GET_DATA
+	 * iff the driver have set the UBLK_IO_FLAG_NEED_GET_DATA.
+	 */
+	if ((!!(io->flags & UBLK_IO_FLAG_NEED_GET_DATA))
+			^ (cmd_op == UBLK_IO_NEED_GET_DATA))
+		goto out;
+
+	switch (cmd_op) {
+	case UBLK_IO_FETCH_REQ:
+		/* UBLK_IO_FETCH_REQ is only allowed before queue is setup */
+		if (ublk_queue_ready(ubq)) {
+			ret = -EBUSY;
+			goto out;
+		}
+		/*
+		 * The io is being handled by server, so COMMIT_RQ is expected
+		 * instead of FETCH_REQ
+		 */
+		if (io->flags & UBLK_IO_FLAG_OWNED_BY_SRV)
+			goto out;
+		/* FETCH_RQ has to provide IO buffer if NEED GET DATA is not enabled */
+		if (!ub_cmd->addr && !ublk_need_get_data(ubq))
+			goto out;
+		io->cmd = cmd;
+		io->flags |= UBLK_IO_FLAG_ACTIVE;
+		io->addr = ub_cmd->addr;
+
+		ublk_mark_io_ready(ub, ubq);
+		break;
+	case UBLK_IO_COMMIT_AND_FETCH_REQ:
+		req = blk_mq_tag_to_rq(ub->tag_set.tags[ub_cmd->q_id], tag);
+		/*
+		 * COMMIT_AND_FETCH_REQ has to provide IO buffer if NEED GET DATA is
+		 * not enabled or it is Read IO.
+		 */
+		if (!ub_cmd->addr && (!ublk_need_get_data(ubq) || req_op(req) == REQ_OP_READ))
+			goto out;
+		if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
+			goto out;
+		io->addr = ub_cmd->addr;
+		io->flags |= UBLK_IO_FLAG_ACTIVE;
+		io->cmd = cmd;
+		ublk_commit_completion(ub, ub_cmd);
+		break;
+	case UBLK_IO_NEED_GET_DATA:
+		if (!(io->flags & UBLK_IO_FLAG_OWNED_BY_SRV))
+			goto out;
+		io->addr = ub_cmd->addr;
+		io->cmd = cmd;
+		io->flags |= UBLK_IO_FLAG_ACTIVE;
+		ublk_handle_need_get_data(ub, ub_cmd->q_id, ub_cmd->tag);
+		break;
+	default:
+		goto out;
+	}
+	return -EIOCBQUEUED;
+
+ out:
+	io_uring_cmd_done(cmd, ret, 0, issue_flags);
+	pr_devel("%s: complete: cmd op %d, tag %d ret %x io_flags %x\n",
+			__func__, cmd_op, tag, ret, io->flags);
+	return -EIOCBQUEUED;
+}
+
+static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
+{
+	struct ublksrv_io_cmd *ub_src = (struct ublksrv_io_cmd *) cmd->cmd;
+	struct ublksrv_io_cmd ub_cmd;
+
+	/*
+	 * Not necessary for async retry, but let's keep it simple and always
+	 * copy the values to avoid any potential reuse.
+	 */
+	ub_cmd.q_id = READ_ONCE(ub_src->q_id);
+	ub_cmd.tag = READ_ONCE(ub_src->tag);
+	ub_cmd.result = READ_ONCE(ub_src->result);
+	ub_cmd.addr = READ_ONCE(ub_src->addr);
+
+	return __ublk_ch_uring_cmd(cmd, issue_flags, &ub_cmd);
+}
+
+static const struct file_operations ublk_ch_fops = {
+	.owner = THIS_MODULE,
+	.open = ublk_ch_open,
+	.release = ublk_ch_release,
+	.llseek = no_llseek,
+	.uring_cmd = ublk_ch_uring_cmd,
+	.mmap = ublk_ch_mmap,
+};
+
+static void ublk_deinit_queue(struct ublk_device *ub, int q_id)
+{
+	int size = ublk_queue_cmd_buf_size(ub, q_id);
+	struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
+
+	if (ubq->ubq_daemon)
+		put_task_struct(ubq->ubq_daemon);
+	if (ubq->io_cmd_buf)
+		free_pages((unsigned long)ubq->io_cmd_buf, get_order(size));
+}
+
+static int ublk_init_queue(struct ublk_device *ub, int q_id)
+{
+	struct ublk_queue *ubq = ublk_get_queue(ub, q_id);
+	gfp_t gfp_flags = GFP_KERNEL | __GFP_ZERO;
+	void *ptr;
+	int size;
+
+	spin_lock_init(&ubq->cancel_lock);
+	ubq->flags = ub->dev_info.flags;
+	ubq->q_id = q_id;
+	ubq->q_depth = ub->dev_info.queue_depth;
+	size = ublk_queue_cmd_buf_size(ub, q_id);
+
+	ptr = (void *) __get_free_pages(gfp_flags, get_order(size));
+	if (!ptr)
+		return -ENOMEM;
+
+	ubq->io_cmd_buf = ptr;
+	ubq->dev = ub;
+	return 0;
+}
+
+static void ublk_deinit_queues(struct ublk_device *ub)
+{
+	int nr_queues = ub->dev_info.nr_hw_queues;
+	int i;
+
+	if (!ub->__queues)
+		return;
+
+	for (i = 0; i < nr_queues; i++)
+		ublk_deinit_queue(ub, i);
+	kfree(ub->__queues);
+}
+
+static int ublk_init_queues(struct ublk_device *ub)
+{
+	int nr_queues = ub->dev_info.nr_hw_queues;
+	int depth = ub->dev_info.queue_depth;
+	int ubq_size = sizeof(struct ublk_queue) + depth * sizeof(struct ublk_io);
+	int i, ret = -ENOMEM;
+
+	ub->queue_size = ubq_size;
+	ub->__queues = kcalloc(nr_queues, ubq_size, GFP_KERNEL);
+	if (!ub->__queues)
+		return ret;
+
+	for (i = 0; i < nr_queues; i++) {
+		if (ublk_init_queue(ub, i))
+			goto fail;
+	}
+
+	init_completion(&ub->completion);
+	return 0;
+
+ fail:
+	ublk_deinit_queues(ub);
+	return ret;
+}
+
+static int ublk_alloc_dev_number(struct ublk_device *ub, int idx)
+{
+	int i = idx;
+	int err;
+
+	spin_lock(&ublk_idr_lock);
+	/* allocate id, if @id >= 0, we're requesting that specific id */
+	if (i >= 0) {
+		err = idr_alloc(&ublk_index_idr, ub, i, i + 1, GFP_NOWAIT);
+		if (err == -ENOSPC)
+			err = -EEXIST;
+	} else {
+		err = idr_alloc(&ublk_index_idr, ub, 0, 0, GFP_NOWAIT);
+	}
+	spin_unlock(&ublk_idr_lock);
+
+	if (err >= 0)
+		ub->ub_number = err;
+
+	return err;
+}
+
+static void ublk_free_dev_number(struct ublk_device *ub)
+{
+	spin_lock(&ublk_idr_lock);
+	idr_remove(&ublk_index_idr, ub->ub_number);
+	wake_up_all(&ublk_idr_wq);
+	spin_unlock(&ublk_idr_lock);
+}
+
+static void ublk_cdev_rel(struct device *dev)
+{
+	struct ublk_device *ub = container_of(dev, struct ublk_device, cdev_dev);
+
+	blk_mq_free_tag_set(&ub->tag_set);
+	ublk_deinit_queues(ub);
+	ublk_free_dev_number(ub);
+	mutex_destroy(&ub->mutex);
+	kfree(ub);
+}
+
+static int ublk_add_chdev(struct ublk_device *ub)
+{
+	struct device *dev = &ub->cdev_dev;
+	int minor = ub->ub_number;
+	int ret;
+
+	dev->parent = ublk_misc.this_device;
+	dev->devt = MKDEV(MAJOR(ublk_chr_devt), minor);
+	dev->class = ublk_chr_class;
+	dev->release = ublk_cdev_rel;
+	device_initialize(dev);
+
+	ret = dev_set_name(dev, "ublkc%d", minor);
+	if (ret)
+		goto fail;
+
+	cdev_init(&ub->cdev, &ublk_ch_fops);
+	ret = cdev_device_add(&ub->cdev, dev);
+	if (ret)
+		goto fail;
+	return 0;
+ fail:
+	put_device(dev);
+	return ret;
+}
+
+static void ublk_stop_work_fn(struct work_struct *work)
+{
+	struct ublk_device *ub =
+		container_of(work, struct ublk_device, stop_work);
+
+	ublk_stop_dev(ub);
+}
+
+/* align max io buffer size with PAGE_SIZE */
+static void ublk_align_max_io_size(struct ublk_device *ub)
+{
+	unsigned int max_io_bytes = ub->dev_info.max_io_buf_bytes;
+
+	ub->dev_info.max_io_buf_bytes =
+		round_down(max_io_bytes, PAGE_SIZE);
+}
+
+static int ublk_add_tag_set(struct ublk_device *ub)
+{
+	ub->tag_set.ops = &ublk_mq_ops;
+	ub->tag_set.nr_hw_queues = ub->dev_info.nr_hw_queues;
+	ub->tag_set.queue_depth = ub->dev_info.queue_depth;
+	ub->tag_set.numa_node = NUMA_NO_NODE;
+	ub->tag_set.cmd_size = sizeof(struct ublk_rq_data);
+	ub->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
+	ub->tag_set.driver_data = ub;
+	return blk_mq_alloc_tag_set(&ub->tag_set);
+}
+
+static void ublk_remove(struct ublk_device *ub)
+{
+	ublk_stop_dev(ub);
+	cancel_work_sync(&ub->stop_work);
+	cancel_work_sync(&ub->quiesce_work);
+	cdev_device_del(&ub->cdev, &ub->cdev_dev);
+	put_device(&ub->cdev_dev);
+}
+
+static struct ublk_device *ublk_get_device_from_id(int idx)
+{
+	struct ublk_device *ub = NULL;
+
+	if (idx < 0)
+		return NULL;
+
+	spin_lock(&ublk_idr_lock);
+	ub = idr_find(&ublk_index_idr, idx);
+	if (ub)
+		ub = ublk_get_device(ub);
+	spin_unlock(&ublk_idr_lock);
+
+	return ub;
+}
+
+static int ublk_ctrl_start_dev(struct ublk_device *ub, struct io_uring_cmd *cmd)
+{
+	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+	int ublksrv_pid = (int)header->data[0];
+	struct gendisk *disk;
+	int ret = -EINVAL;
+
+	if (ublksrv_pid <= 0)
+		return -EINVAL;
+
+	if (wait_for_completion_interruptible(&ub->completion) != 0)
+		return -EINTR;
+
+	schedule_delayed_work(&ub->monitor_work, UBLK_DAEMON_MONITOR_PERIOD);
+
+	mutex_lock(&ub->mutex);
+	if (ub->dev_info.state == UBLK_S_DEV_LIVE ||
+	    test_bit(UB_STATE_USED, &ub->state)) {
+		ret = -EEXIST;
+		goto out_unlock;
+	}
+
+	disk = blk_mq_alloc_disk(&ub->tag_set, ub);
+	if (IS_ERR(disk)) {
+		ret = PTR_ERR(disk);
+		goto out_unlock;
+	}
+	sprintf(disk->disk_name, "ublkb%d", ub->ub_number);
+	disk->fops = &ub_fops;
+	disk->private_data = ub;
+
+	ub->dev_info.ublksrv_pid = ublksrv_pid;
+	ub->ub_disk = disk;
+
+	ret = ublk_apply_params(ub);
+	if (ret)
+		goto out_put_disk;
+
+	/* don't probe partitions if any one ubq daemon is un-trusted */
+	if (ub->nr_privileged_daemon != ub->nr_queues_ready)
+		set_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
+
+	get_device(&ub->cdev_dev);
+	ub->dev_info.state = UBLK_S_DEV_LIVE;
+	ret = add_disk(disk);
+	if (ret) {
+		/*
+		 * Has to drop the reference since ->free_disk won't be
+		 * called in case of add_disk failure.
+		 */
+		ub->dev_info.state = UBLK_S_DEV_DEAD;
+		ublk_put_device(ub);
+		goto out_put_disk;
+	}
+	set_bit(UB_STATE_USED, &ub->state);
+out_put_disk:
+	if (ret)
+		put_disk(disk);
+out_unlock:
+	mutex_unlock(&ub->mutex);
+	return ret;
+}
+
+static int ublk_ctrl_get_queue_affinity(struct ublk_device *ub,
+		struct io_uring_cmd *cmd)
+{
+	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+	void __user *argp = (void __user *)(unsigned long)header->addr;
+	cpumask_var_t cpumask;
+	unsigned long queue;
+	unsigned int retlen;
+	unsigned int i;
+	int ret;
+
+	if (header->len * BITS_PER_BYTE < nr_cpu_ids)
+		return -EINVAL;
+	if (header->len & (sizeof(unsigned long)-1))
+		return -EINVAL;
+	if (!header->addr)
+		return -EINVAL;
+
+	queue = header->data[0];
+	if (queue >= ub->dev_info.nr_hw_queues)
+		return -EINVAL;
+
+	if (!zalloc_cpumask_var(&cpumask, GFP_KERNEL))
+		return -ENOMEM;
+
+	for_each_possible_cpu(i) {
+		if (ub->tag_set.map[HCTX_TYPE_DEFAULT].mq_map[i] == queue)
+			cpumask_set_cpu(i, cpumask);
+	}
+
+	ret = -EFAULT;
+	retlen = min_t(unsigned short, header->len, cpumask_size());
+	if (copy_to_user(argp, cpumask, retlen))
+		goto out_free_cpumask;
+	if (retlen != header->len &&
+	    clear_user(argp + retlen, header->len - retlen))
+		goto out_free_cpumask;
+
+	ret = 0;
+out_free_cpumask:
+	free_cpumask_var(cpumask);
+	return ret;
+}
+
+static inline void ublk_dump_dev_info(struct ublksrv_ctrl_dev_info *info)
+{
+	pr_devel("%s: dev id %d flags %llx\n", __func__,
+			info->dev_id, info->flags);
+	pr_devel("\t nr_hw_queues %d queue_depth %d\n",
+			info->nr_hw_queues, info->queue_depth);
+}
+
+static int ublk_ctrl_add_dev(struct io_uring_cmd *cmd)
+{
+	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+	void __user *argp = (void __user *)(unsigned long)header->addr;
+	struct ublksrv_ctrl_dev_info info;
+	struct ublk_device *ub;
+	int ret = -EINVAL;
+
+	if (header->len < sizeof(info) || !header->addr)
+		return -EINVAL;
+	if (header->queue_id != (u16)-1) {
+		pr_warn("%s: queue_id is wrong %x\n",
+			__func__, header->queue_id);
+		return -EINVAL;
+	}
+	if (copy_from_user(&info, argp, sizeof(info)))
+		return -EFAULT;
+	ublk_dump_dev_info(&info);
+	if (header->dev_id != info.dev_id) {
+		pr_warn("%s: dev id not match %u %u\n",
+			__func__, header->dev_id, info.dev_id);
+		return -EINVAL;
+	}
+
+	ret = mutex_lock_killable(&ublk_ctl_mutex);
+	if (ret)
+		return ret;
+
+	ret = -ENOMEM;
+	ub = kzalloc(sizeof(*ub), GFP_KERNEL);
+	if (!ub)
+		goto out_unlock;
+	mutex_init(&ub->mutex);
+	spin_lock_init(&ub->mm_lock);
+	INIT_WORK(&ub->quiesce_work, ublk_quiesce_work_fn);
+	INIT_WORK(&ub->stop_work, ublk_stop_work_fn);
+	INIT_DELAYED_WORK(&ub->monitor_work, ublk_daemon_monitor_work);
+
+	ret = ublk_alloc_dev_number(ub, header->dev_id);
+	if (ret < 0)
+		goto out_free_ub;
+
+	memcpy(&ub->dev_info, &info, sizeof(info));
+
+	/* update device id */
+	ub->dev_info.dev_id = ub->ub_number;
+
+	/*
+	 * 64bit flags will be copied back to userspace as feature
+	 * negotiation result, so have to clear flags which driver
+	 * doesn't support yet, then userspace can get correct flags
+	 * (features) to handle.
+	 */
+	ub->dev_info.flags &= UBLK_F_ALL;
+
+	if (!IS_BUILTIN(CONFIG_BLK_DEV_UBLK))
+		ub->dev_info.flags |= UBLK_F_URING_CMD_COMP_IN_TASK;
+
+	/* We are not ready to support zero copy */
+	ub->dev_info.flags &= ~UBLK_F_SUPPORT_ZERO_COPY;
+
+	ub->dev_info.nr_hw_queues = min_t(unsigned int,
+			ub->dev_info.nr_hw_queues, nr_cpu_ids);
+	ublk_align_max_io_size(ub);
+
+	ret = ublk_init_queues(ub);
+	if (ret)
+		goto out_free_dev_number;
+
+	ret = ublk_add_tag_set(ub);
+	if (ret)
+		goto out_deinit_queues;
+
+	ret = -EFAULT;
+	if (copy_to_user(argp, &ub->dev_info, sizeof(info)))
+		goto out_free_tag_set;
+
+	/*
+	 * Add the char dev so that ublksrv daemon can be setup.
+	 * ublk_add_chdev() will cleanup everything if it fails.
+	 */
+	ret = ublk_add_chdev(ub);
+	goto out_unlock;
+
+out_free_tag_set:
+	blk_mq_free_tag_set(&ub->tag_set);
+out_deinit_queues:
+	ublk_deinit_queues(ub);
+out_free_dev_number:
+	ublk_free_dev_number(ub);
+out_free_ub:
+	mutex_destroy(&ub->mutex);
+	kfree(ub);
+out_unlock:
+	mutex_unlock(&ublk_ctl_mutex);
+	return ret;
+}
+
+static inline bool ublk_idr_freed(int id)
+{
+	void *ptr;
+
+	spin_lock(&ublk_idr_lock);
+	ptr = idr_find(&ublk_index_idr, id);
+	spin_unlock(&ublk_idr_lock);
+
+	return ptr == NULL;
+}
+
+static int ublk_ctrl_del_dev(struct ublk_device **p_ub)
+{
+	struct ublk_device *ub = *p_ub;
+	int idx = ub->ub_number;
+	int ret;
+
+	ret = mutex_lock_killable(&ublk_ctl_mutex);
+	if (ret)
+		return ret;
+
+	ublk_remove(ub);
+
+	/* Mark the reference as consumed */
+	*p_ub = NULL;
+	ublk_put_device(ub);
+
+	/*
+	 * Wait until the idr is removed, then it can be reused after
+	 * DEL_DEV command is returned.
+	 */
+	wait_event(ublk_idr_wq, ublk_idr_freed(idx));
+	mutex_unlock(&ublk_ctl_mutex);
+
+	return ret;
+}
+
+static inline void ublk_ctrl_cmd_dump(struct io_uring_cmd *cmd)
+{
+	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+
+	pr_devel("%s: cmd_op %x, dev id %d qid %d data %llx buf %llx len %u\n",
+			__func__, cmd->cmd_op, header->dev_id, header->queue_id,
+			header->data[0], header->addr, header->len);
+}
+
+static int ublk_ctrl_stop_dev(struct ublk_device *ub)
+{
+	ublk_stop_dev(ub);
+	cancel_work_sync(&ub->stop_work);
+	cancel_work_sync(&ub->quiesce_work);
+
+	return 0;
+}
+
+static int ublk_ctrl_get_dev_info(struct ublk_device *ub,
+		struct io_uring_cmd *cmd)
+{
+	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+	void __user *argp = (void __user *)(unsigned long)header->addr;
+
+	if (header->len < sizeof(struct ublksrv_ctrl_dev_info) || !header->addr)
+		return -EINVAL;
+
+	if (copy_to_user(argp, &ub->dev_info, sizeof(ub->dev_info)))
+		return -EFAULT;
+
+	return 0;
+}
+
+static int ublk_ctrl_get_params(struct ublk_device *ub,
+		struct io_uring_cmd *cmd)
+{
+	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+	void __user *argp = (void __user *)(unsigned long)header->addr;
+	struct ublk_params_header ph;
+	int ret;
+
+	if (header->len <= sizeof(ph) || !header->addr)
+		return -EINVAL;
+
+	if (copy_from_user(&ph, argp, sizeof(ph)))
+		return -EFAULT;
+
+	if (ph.len > header->len || !ph.len)
+		return -EINVAL;
+
+	if (ph.len > sizeof(struct ublk_params))
+		ph.len = sizeof(struct ublk_params);
+
+	mutex_lock(&ub->mutex);
+	if (copy_to_user(argp, &ub->params, ph.len))
+		ret = -EFAULT;
+	else
+		ret = 0;
+	mutex_unlock(&ub->mutex);
+
+	return ret;
+}
+
+static int ublk_ctrl_set_params(struct ublk_device *ub,
+		struct io_uring_cmd *cmd)
+{
+	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+	void __user *argp = (void __user *)(unsigned long)header->addr;
+	struct ublk_params_header ph;
+	int ret = -EFAULT;
+
+	if (header->len <= sizeof(ph) || !header->addr)
+		return -EINVAL;
+
+	if (copy_from_user(&ph, argp, sizeof(ph)))
+		return -EFAULT;
+
+	if (ph.len > header->len || !ph.len || !ph.types)
+		return -EINVAL;
+
+	if (ph.len > sizeof(struct ublk_params))
+		ph.len = sizeof(struct ublk_params);
+
+	/* parameters can only be changed when device isn't live */
+	mutex_lock(&ub->mutex);
+	if (ub->dev_info.state == UBLK_S_DEV_LIVE) {
+		ret = -EACCES;
+	} else if (copy_from_user(&ub->params, argp, ph.len)) {
+		ret = -EFAULT;
+	} else {
+		/* clear all we don't support yet */
+		ub->params.types &= UBLK_PARAM_TYPE_ALL;
+		ret = ublk_validate_params(ub);
+		if (ret)
+			ub->params.types = 0;
+	}
+	mutex_unlock(&ub->mutex);
+
+	return ret;
+}
+
+static void ublk_queue_reinit(struct ublk_device *ub, struct ublk_queue *ubq)
+{
+	int i;
+
+	WARN_ON_ONCE(!(ubq->ubq_daemon && ubq_daemon_is_dying(ubq)));
+
+	/* All old ioucmds have to be completed */
+	ubq->nr_io_ready = 0;
+	/* old daemon is PF_EXITING, put it now */
+	put_task_struct(ubq->ubq_daemon);
+	/* We have to reset it to NULL, otherwise ub won't accept new FETCH_REQ */
+	ubq->ubq_daemon = NULL;
+
+	for (i = 0; i < ubq->q_depth; i++) {
+		struct ublk_io *io = &ubq->ios[i];
+
+		/* forget everything now and be ready for new FETCH_REQ */
+		io->flags = 0;
+		io->cmd = NULL;
+		io->addr = 0;
+	}
+}
+
+static int ublk_ctrl_start_recovery(struct ublk_device *ub,
+		struct io_uring_cmd *cmd)
+{
+	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+	int ret = -EINVAL;
+	int i;
+
+	mutex_lock(&ub->mutex);
+	if (!ublk_can_use_recovery(ub))
+		goto out_unlock;
+	/*
+	 * START_RECOVERY is only allowd after:
+	 *
+	 * (1) UB_STATE_OPEN is not set, which means the dying process is exited
+	 *     and related io_uring ctx is freed so file struct of /dev/ublkcX is
+	 *     released.
+	 *
+	 * (2) UBLK_S_DEV_QUIESCED is set, which means the quiesce_work:
+	 *     (a)has quiesced request queue
+	 *     (b)has requeued every inflight rqs whose io_flags is ACTIVE
+	 *     (c)has requeued/aborted every inflight rqs whose io_flags is NOT ACTIVE
+	 *     (d)has completed/camceled all ioucmds owned by ther dying process
+	 */
+	if (test_bit(UB_STATE_OPEN, &ub->state) ||
+			ub->dev_info.state != UBLK_S_DEV_QUIESCED) {
+		ret = -EBUSY;
+		goto out_unlock;
+	}
+	pr_devel("%s: start recovery for dev id %d.\n", __func__, header->dev_id);
+	for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
+		ublk_queue_reinit(ub, ublk_get_queue(ub, i));
+	/* set to NULL, otherwise new ubq_daemon cannot mmap the io_cmd_buf */
+	ub->mm = NULL;
+	ub->nr_queues_ready = 0;
+	ub->nr_privileged_daemon = 0;
+	init_completion(&ub->completion);
+	ret = 0;
+ out_unlock:
+	mutex_unlock(&ub->mutex);
+	return ret;
+}
+
+static int ublk_ctrl_end_recovery(struct ublk_device *ub,
+		struct io_uring_cmd *cmd)
+{
+	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+	int ublksrv_pid = (int)header->data[0];
+	int ret = -EINVAL;
+
+	pr_devel("%s: Waiting for new ubq_daemons(nr: %d) are ready, dev id %d...\n",
+			__func__, ub->dev_info.nr_hw_queues, header->dev_id);
+	/* wait until new ubq_daemon sending all FETCH_REQ */
+	if (wait_for_completion_interruptible(&ub->completion))
+		return -EINTR;
+
+	pr_devel("%s: All new ubq_daemons(nr: %d) are ready, dev id %d\n",
+			__func__, ub->dev_info.nr_hw_queues, header->dev_id);
+
+	mutex_lock(&ub->mutex);
+	if (!ublk_can_use_recovery(ub))
+		goto out_unlock;
+
+	if (ub->dev_info.state != UBLK_S_DEV_QUIESCED) {
+		ret = -EBUSY;
+		goto out_unlock;
+	}
+	ub->dev_info.ublksrv_pid = ublksrv_pid;
+	pr_devel("%s: new ublksrv_pid %d, dev id %d\n",
+			__func__, ublksrv_pid, header->dev_id);
+	blk_mq_unquiesce_queue(ub->ub_disk->queue);
+	pr_devel("%s: queue unquiesced, dev id %d.\n",
+			__func__, header->dev_id);
+	blk_mq_kick_requeue_list(ub->ub_disk->queue);
+	ub->dev_info.state = UBLK_S_DEV_LIVE;
+	schedule_delayed_work(&ub->monitor_work, UBLK_DAEMON_MONITOR_PERIOD);
+	ret = 0;
+ out_unlock:
+	mutex_unlock(&ub->mutex);
+	return ret;
+}
+
+static int ublk_ctrl_uring_cmd(struct io_uring_cmd *cmd,
+		unsigned int issue_flags)
+{
+	struct ublksrv_ctrl_cmd *header = (struct ublksrv_ctrl_cmd *)cmd->cmd;
+	struct ublk_device *ub = NULL;
+	int ret = -EINVAL;
+
+	if (issue_flags & IO_URING_F_NONBLOCK)
+		return -EAGAIN;
+
+	ublk_ctrl_cmd_dump(cmd);
+
+	if (!(issue_flags & IO_URING_F_SQE128))
+		goto out;
+
+	ret = -EPERM;
+	if (!capable(CAP_SYS_ADMIN))
+		goto out;
+
+	if (cmd->cmd_op != UBLK_CMD_ADD_DEV) {
+		ret = -ENODEV;
+		ub = ublk_get_device_from_id(header->dev_id);
+		if (!ub)
+			goto out;
+	}
+
+	switch (cmd->cmd_op) {
+	case UBLK_CMD_START_DEV:
+		ret = ublk_ctrl_start_dev(ub, cmd);
+		break;
+	case UBLK_CMD_STOP_DEV:
+		ret = ublk_ctrl_stop_dev(ub);
+		break;
+	case UBLK_CMD_GET_DEV_INFO:
+		ret = ublk_ctrl_get_dev_info(ub, cmd);
+		break;
+	case UBLK_CMD_ADD_DEV:
+		ret = ublk_ctrl_add_dev(cmd);
+		break;
+	case UBLK_CMD_DEL_DEV:
+		ret = ublk_ctrl_del_dev(&ub);
+		break;
+	case UBLK_CMD_GET_QUEUE_AFFINITY:
+		ret = ublk_ctrl_get_queue_affinity(ub, cmd);
+		break;
+	case UBLK_CMD_GET_PARAMS:
+		ret = ublk_ctrl_get_params(ub, cmd);
+		break;
+	case UBLK_CMD_SET_PARAMS:
+		ret = ublk_ctrl_set_params(ub, cmd);
+		break;
+	case UBLK_CMD_START_USER_RECOVERY:
+		ret = ublk_ctrl_start_recovery(ub, cmd);
+		break;
+	case UBLK_CMD_END_USER_RECOVERY:
+		ret = ublk_ctrl_end_recovery(ub, cmd);
+		break;
+	default:
+		ret = -ENOTSUPP;
+		break;
+	}
+	if (ub)
+		ublk_put_device(ub);
+ out:
+	io_uring_cmd_done(cmd, ret, 0, issue_flags);
+	pr_devel("%s: cmd done ret %d cmd_op %x, dev id %d qid %d\n",
+			__func__, ret, cmd->cmd_op, header->dev_id, header->queue_id);
+	return -EIOCBQUEUED;
+}
+
+static const struct file_operations ublk_ctl_fops = {
+	.open		= nonseekable_open,
+	.uring_cmd      = ublk_ctrl_uring_cmd,
+	.owner		= THIS_MODULE,
+	.llseek		= noop_llseek,
+};
+
+static struct miscdevice ublk_misc = {
+	.minor		= MISC_DYNAMIC_MINOR,
+	.name		= "ublk-control",
+	.fops		= &ublk_ctl_fops,
+};
+
+static int __init ublk_init(void)
+{
+	int ret;
+
+	init_waitqueue_head(&ublk_idr_wq);
+
+	ret = misc_register(&ublk_misc);
+	if (ret)
+		return ret;
+
+	ret = alloc_chrdev_region(&ublk_chr_devt, 0, UBLK_MINORS, "ublk-char");
+	if (ret)
+		goto unregister_mis;
+
+	ublk_chr_class = class_create(THIS_MODULE, "ublk-char");
+	if (IS_ERR(ublk_chr_class)) {
+		ret = PTR_ERR(ublk_chr_class);
+		goto free_chrdev_region;
+	}
+	return 0;
+
+free_chrdev_region:
+	unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
+unregister_mis:
+	misc_deregister(&ublk_misc);
+	return ret;
+}
+
+static void __exit ublk_exit(void)
+{
+	struct ublk_device *ub;
+	int id;
+
+	idr_for_each_entry(&ublk_index_idr, ub, id)
+		ublk_remove(ub);
+
+	class_destroy(ublk_chr_class);
+	misc_deregister(&ublk_misc);
+
+	idr_destroy(&ublk_index_idr);
+	unregister_chrdev_region(ublk_chr_devt, UBLK_MINORS);
+}
+
+module_init(ublk_init);
+module_exit(ublk_exit);
+
+MODULE_AUTHOR("Ming Lei <ming.lei@redhat.com>");
+MODULE_LICENSE("GPL");
diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c
new file mode 100644
index 000000000..3124837aa
--- /dev/null
+++ b/drivers/block/virtio_blk.c
@@ -0,0 +1,1314 @@
+// SPDX-License-Identifier: GPL-2.0-only
+//#define DEBUG
+#include <linux/spinlock.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/hdreg.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/interrupt.h>
+#include <linux/virtio.h>
+#include <linux/virtio_blk.h>
+#include <linux/scatterlist.h>
+#include <linux/string_helpers.h>
+#include <linux/idr.h>
+#include <linux/blk-mq.h>
+#include <linux/blk-mq-virtio.h>
+#include <linux/numa.h>
+#include <uapi/linux/virtio_ring.h>
+
+#define PART_BITS 4
+#define VQ_NAME_LEN 16
+#define MAX_DISCARD_SEGMENTS 256u
+
+/* The maximum number of sg elements that fit into a virtqueue */
+#define VIRTIO_BLK_MAX_SG_ELEMS 32768
+
+#ifdef CONFIG_ARCH_NO_SG_CHAIN
+#define VIRTIO_BLK_INLINE_SG_CNT	0
+#else
+#define VIRTIO_BLK_INLINE_SG_CNT	2
+#endif
+
+static unsigned int num_request_queues;
+module_param(num_request_queues, uint, 0644);
+MODULE_PARM_DESC(num_request_queues,
+		 "Limit the number of request queues to use for blk device. "
+		 "0 for no limit. "
+		 "Values > nr_cpu_ids truncated to nr_cpu_ids.");
+
+static unsigned int poll_queues;
+module_param(poll_queues, uint, 0644);
+MODULE_PARM_DESC(poll_queues, "The number of dedicated virtqueues for polling I/O");
+
+static int major;
+static DEFINE_IDA(vd_index_ida);
+
+static struct workqueue_struct *virtblk_wq;
+
+struct virtio_blk_vq {
+	struct virtqueue *vq;
+	spinlock_t lock;
+	char name[VQ_NAME_LEN];
+} ____cacheline_aligned_in_smp;
+
+struct virtio_blk {
+	/*
+	 * This mutex must be held by anything that may run after
+	 * virtblk_remove() sets vblk->vdev to NULL.
+	 *
+	 * blk-mq, virtqueue processing, and sysfs attribute code paths are
+	 * shut down before vblk->vdev is set to NULL and therefore do not need
+	 * to hold this mutex.
+	 */
+	struct mutex vdev_mutex;
+	struct virtio_device *vdev;
+
+	/* The disk structure for the kernel. */
+	struct gendisk *disk;
+
+	/* Block layer tags. */
+	struct blk_mq_tag_set tag_set;
+
+	/* Process context for config space updates */
+	struct work_struct config_work;
+
+	/* Ida index - used to track minor number allocations. */
+	int index;
+
+	/* num of vqs */
+	int num_vqs;
+	int io_queues[HCTX_MAX_TYPES];
+	struct virtio_blk_vq *vqs;
+};
+
+struct virtblk_req {
+	struct virtio_blk_outhdr out_hdr;
+	u8 status;
+	struct sg_table sg_table;
+	struct scatterlist sg[];
+};
+
+static inline blk_status_t virtblk_result(struct virtblk_req *vbr)
+{
+	switch (vbr->status) {
+	case VIRTIO_BLK_S_OK:
+		return BLK_STS_OK;
+	case VIRTIO_BLK_S_UNSUPP:
+		return BLK_STS_NOTSUPP;
+	default:
+		return BLK_STS_IOERR;
+	}
+}
+
+static inline struct virtio_blk_vq *get_virtio_blk_vq(struct blk_mq_hw_ctx *hctx)
+{
+	struct virtio_blk *vblk = hctx->queue->queuedata;
+	struct virtio_blk_vq *vq = &vblk->vqs[hctx->queue_num];
+
+	return vq;
+}
+
+static int virtblk_add_req(struct virtqueue *vq, struct virtblk_req *vbr)
+{
+	struct scatterlist hdr, status, *sgs[3];
+	unsigned int num_out = 0, num_in = 0;
+
+	sg_init_one(&hdr, &vbr->out_hdr, sizeof(vbr->out_hdr));
+	sgs[num_out++] = &hdr;
+
+	if (vbr->sg_table.nents) {
+		if (vbr->out_hdr.type & cpu_to_virtio32(vq->vdev, VIRTIO_BLK_T_OUT))
+			sgs[num_out++] = vbr->sg_table.sgl;
+		else
+			sgs[num_out + num_in++] = vbr->sg_table.sgl;
+	}
+
+	sg_init_one(&status, &vbr->status, sizeof(vbr->status));
+	sgs[num_out + num_in++] = &status;
+
+	return virtqueue_add_sgs(vq, sgs, num_out, num_in, vbr, GFP_ATOMIC);
+}
+
+static int virtblk_setup_discard_write_zeroes_erase(struct request *req, bool unmap)
+{
+	unsigned short segments = blk_rq_nr_discard_segments(req);
+	unsigned short n = 0;
+	struct virtio_blk_discard_write_zeroes *range;
+	struct bio *bio;
+	u32 flags = 0;
+
+	if (unmap)
+		flags |= VIRTIO_BLK_WRITE_ZEROES_FLAG_UNMAP;
+
+	range = kmalloc_array(segments, sizeof(*range), GFP_ATOMIC);
+	if (!range)
+		return -ENOMEM;
+
+	/*
+	 * Single max discard segment means multi-range discard isn't
+	 * supported, and block layer only runs contiguity merge like
+	 * normal RW request. So we can't reply on bio for retrieving
+	 * each range info.
+	 */
+	if (queue_max_discard_segments(req->q) == 1) {
+		range[0].flags = cpu_to_le32(flags);
+		range[0].num_sectors = cpu_to_le32(blk_rq_sectors(req));
+		range[0].sector = cpu_to_le64(blk_rq_pos(req));
+		n = 1;
+	} else {
+		__rq_for_each_bio(bio, req) {
+			u64 sector = bio->bi_iter.bi_sector;
+			u32 num_sectors = bio->bi_iter.bi_size >> SECTOR_SHIFT;
+
+			range[n].flags = cpu_to_le32(flags);
+			range[n].num_sectors = cpu_to_le32(num_sectors);
+			range[n].sector = cpu_to_le64(sector);
+			n++;
+		}
+	}
+
+	WARN_ON_ONCE(n != segments);
+
+	req->special_vec.bv_page = virt_to_page(range);
+	req->special_vec.bv_offset = offset_in_page(range);
+	req->special_vec.bv_len = sizeof(*range) * segments;
+	req->rq_flags |= RQF_SPECIAL_PAYLOAD;
+
+	return 0;
+}
+
+static void virtblk_unmap_data(struct request *req, struct virtblk_req *vbr)
+{
+	if (blk_rq_nr_phys_segments(req))
+		sg_free_table_chained(&vbr->sg_table,
+				      VIRTIO_BLK_INLINE_SG_CNT);
+}
+
+static int virtblk_map_data(struct blk_mq_hw_ctx *hctx, struct request *req,
+		struct virtblk_req *vbr)
+{
+	int err;
+
+	if (!blk_rq_nr_phys_segments(req))
+		return 0;
+
+	vbr->sg_table.sgl = vbr->sg;
+	err = sg_alloc_table_chained(&vbr->sg_table,
+				     blk_rq_nr_phys_segments(req),
+				     vbr->sg_table.sgl,
+				     VIRTIO_BLK_INLINE_SG_CNT);
+	if (unlikely(err))
+		return -ENOMEM;
+
+	return blk_rq_map_sg(hctx->queue, req, vbr->sg_table.sgl);
+}
+
+static void virtblk_cleanup_cmd(struct request *req)
+{
+	if (req->rq_flags & RQF_SPECIAL_PAYLOAD)
+		kfree(bvec_virt(&req->special_vec));
+}
+
+static blk_status_t virtblk_setup_cmd(struct virtio_device *vdev,
+				      struct request *req,
+				      struct virtblk_req *vbr)
+{
+	bool unmap = false;
+	u32 type;
+
+	vbr->out_hdr.sector = 0;
+
+	switch (req_op(req)) {
+	case REQ_OP_READ:
+		type = VIRTIO_BLK_T_IN;
+		vbr->out_hdr.sector = cpu_to_virtio64(vdev,
+						      blk_rq_pos(req));
+		break;
+	case REQ_OP_WRITE:
+		type = VIRTIO_BLK_T_OUT;
+		vbr->out_hdr.sector = cpu_to_virtio64(vdev,
+						      blk_rq_pos(req));
+		break;
+	case REQ_OP_FLUSH:
+		type = VIRTIO_BLK_T_FLUSH;
+		break;
+	case REQ_OP_DISCARD:
+		type = VIRTIO_BLK_T_DISCARD;
+		break;
+	case REQ_OP_WRITE_ZEROES:
+		type = VIRTIO_BLK_T_WRITE_ZEROES;
+		unmap = !(req->cmd_flags & REQ_NOUNMAP);
+		break;
+	case REQ_OP_SECURE_ERASE:
+		type = VIRTIO_BLK_T_SECURE_ERASE;
+		break;
+	case REQ_OP_DRV_IN:
+		type = VIRTIO_BLK_T_GET_ID;
+		break;
+	default:
+		WARN_ON_ONCE(1);
+		return BLK_STS_IOERR;
+	}
+
+	vbr->out_hdr.type = cpu_to_virtio32(vdev, type);
+	vbr->out_hdr.ioprio = cpu_to_virtio32(vdev, req_get_ioprio(req));
+
+	if (type == VIRTIO_BLK_T_DISCARD || type == VIRTIO_BLK_T_WRITE_ZEROES ||
+	    type == VIRTIO_BLK_T_SECURE_ERASE) {
+		if (virtblk_setup_discard_write_zeroes_erase(req, unmap))
+			return BLK_STS_RESOURCE;
+	}
+
+	return 0;
+}
+
+static inline void virtblk_request_done(struct request *req)
+{
+	struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
+
+	virtblk_unmap_data(req, vbr);
+	virtblk_cleanup_cmd(req);
+	blk_mq_end_request(req, virtblk_result(vbr));
+}
+
+static void virtblk_done(struct virtqueue *vq)
+{
+	struct virtio_blk *vblk = vq->vdev->priv;
+	bool req_done = false;
+	int qid = vq->index;
+	struct virtblk_req *vbr;
+	unsigned long flags;
+	unsigned int len;
+
+	spin_lock_irqsave(&vblk->vqs[qid].lock, flags);
+	do {
+		virtqueue_disable_cb(vq);
+		while ((vbr = virtqueue_get_buf(vblk->vqs[qid].vq, &len)) != NULL) {
+			struct request *req = blk_mq_rq_from_pdu(vbr);
+
+			if (likely(!blk_should_fake_timeout(req->q)))
+				blk_mq_complete_request(req);
+			req_done = true;
+		}
+		if (unlikely(virtqueue_is_broken(vq)))
+			break;
+	} while (!virtqueue_enable_cb(vq));
+
+	/* In case queue is stopped waiting for more buffers. */
+	if (req_done)
+		blk_mq_start_stopped_hw_queues(vblk->disk->queue, true);
+	spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
+}
+
+static void virtio_commit_rqs(struct blk_mq_hw_ctx *hctx)
+{
+	struct virtio_blk *vblk = hctx->queue->queuedata;
+	struct virtio_blk_vq *vq = &vblk->vqs[hctx->queue_num];
+	bool kick;
+
+	spin_lock_irq(&vq->lock);
+	kick = virtqueue_kick_prepare(vq->vq);
+	spin_unlock_irq(&vq->lock);
+
+	if (kick)
+		virtqueue_notify(vq->vq);
+}
+
+static blk_status_t virtblk_fail_to_queue(struct request *req, int rc)
+{
+	virtblk_cleanup_cmd(req);
+	switch (rc) {
+	case -ENOSPC:
+		return BLK_STS_DEV_RESOURCE;
+	case -ENOMEM:
+		return BLK_STS_RESOURCE;
+	default:
+		return BLK_STS_IOERR;
+	}
+}
+
+static blk_status_t virtblk_prep_rq(struct blk_mq_hw_ctx *hctx,
+					struct virtio_blk *vblk,
+					struct request *req,
+					struct virtblk_req *vbr)
+{
+	blk_status_t status;
+	int num;
+
+	status = virtblk_setup_cmd(vblk->vdev, req, vbr);
+	if (unlikely(status))
+		return status;
+
+	num = virtblk_map_data(hctx, req, vbr);
+	if (unlikely(num < 0))
+		return virtblk_fail_to_queue(req, -ENOMEM);
+	vbr->sg_table.nents = num;
+
+	blk_mq_start_request(req);
+
+	return BLK_STS_OK;
+}
+
+static blk_status_t virtio_queue_rq(struct blk_mq_hw_ctx *hctx,
+			   const struct blk_mq_queue_data *bd)
+{
+	struct virtio_blk *vblk = hctx->queue->queuedata;
+	struct request *req = bd->rq;
+	struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
+	unsigned long flags;
+	int qid = hctx->queue_num;
+	bool notify = false;
+	blk_status_t status;
+	int err;
+
+	status = virtblk_prep_rq(hctx, vblk, req, vbr);
+	if (unlikely(status))
+		return status;
+
+	spin_lock_irqsave(&vblk->vqs[qid].lock, flags);
+	err = virtblk_add_req(vblk->vqs[qid].vq, vbr);
+	if (err) {
+		virtqueue_kick(vblk->vqs[qid].vq);
+		/* Don't stop the queue if -ENOMEM: we may have failed to
+		 * bounce the buffer due to global resource outage.
+		 */
+		if (err == -ENOSPC)
+			blk_mq_stop_hw_queue(hctx);
+		spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
+		virtblk_unmap_data(req, vbr);
+		return virtblk_fail_to_queue(req, err);
+	}
+
+	if (bd->last && virtqueue_kick_prepare(vblk->vqs[qid].vq))
+		notify = true;
+	spin_unlock_irqrestore(&vblk->vqs[qid].lock, flags);
+
+	if (notify)
+		virtqueue_notify(vblk->vqs[qid].vq);
+	return BLK_STS_OK;
+}
+
+static bool virtblk_prep_rq_batch(struct request *req)
+{
+	struct virtio_blk *vblk = req->mq_hctx->queue->queuedata;
+	struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
+
+	req->mq_hctx->tags->rqs[req->tag] = req;
+
+	return virtblk_prep_rq(req->mq_hctx, vblk, req, vbr) == BLK_STS_OK;
+}
+
+static bool virtblk_add_req_batch(struct virtio_blk_vq *vq,
+					struct request **rqlist)
+{
+	unsigned long flags;
+	int err;
+	bool kick;
+
+	spin_lock_irqsave(&vq->lock, flags);
+
+	while (!rq_list_empty(*rqlist)) {
+		struct request *req = rq_list_pop(rqlist);
+		struct virtblk_req *vbr = blk_mq_rq_to_pdu(req);
+
+		err = virtblk_add_req(vq->vq, vbr);
+		if (err) {
+			virtblk_unmap_data(req, vbr);
+			virtblk_cleanup_cmd(req);
+			blk_mq_requeue_request(req, true);
+		}
+	}
+
+	kick = virtqueue_kick_prepare(vq->vq);
+	spin_unlock_irqrestore(&vq->lock, flags);
+
+	return kick;
+}
+
+static void virtio_queue_rqs(struct request **rqlist)
+{
+	struct request *req, *next, *prev = NULL;
+	struct request *requeue_list = NULL;
+
+	rq_list_for_each_safe(rqlist, req, next) {
+		struct virtio_blk_vq *vq = get_virtio_blk_vq(req->mq_hctx);
+		bool kick;
+
+		if (!virtblk_prep_rq_batch(req)) {
+			rq_list_move(rqlist, &requeue_list, req, prev);
+			req = prev;
+			if (!req)
+				continue;
+		}
+
+		if (!next || req->mq_hctx != next->mq_hctx) {
+			req->rq_next = NULL;
+			kick = virtblk_add_req_batch(vq, rqlist);
+			if (kick)
+				virtqueue_notify(vq->vq);
+
+			*rqlist = next;
+			prev = NULL;
+		} else
+			prev = req;
+	}
+
+	*rqlist = requeue_list;
+}
+
+/* return id (s/n) string for *disk to *id_str
+ */
+static int virtblk_get_id(struct gendisk *disk, char *id_str)
+{
+	struct virtio_blk *vblk = disk->private_data;
+	struct request_queue *q = vblk->disk->queue;
+	struct request *req;
+	int err;
+
+	req = blk_mq_alloc_request(q, REQ_OP_DRV_IN, 0);
+	if (IS_ERR(req))
+		return PTR_ERR(req);
+
+	err = blk_rq_map_kern(q, req, id_str, VIRTIO_BLK_ID_BYTES, GFP_KERNEL);
+	if (err)
+		goto out;
+
+	blk_execute_rq(req, false);
+	err = blk_status_to_errno(virtblk_result(blk_mq_rq_to_pdu(req)));
+out:
+	blk_mq_free_request(req);
+	return err;
+}
+
+/* We provide getgeo only to please some old bootloader/partitioning tools */
+static int virtblk_getgeo(struct block_device *bd, struct hd_geometry *geo)
+{
+	struct virtio_blk *vblk = bd->bd_disk->private_data;
+	int ret = 0;
+
+	mutex_lock(&vblk->vdev_mutex);
+
+	if (!vblk->vdev) {
+		ret = -ENXIO;
+		goto out;
+	}
+
+	/* see if the host passed in geometry config */
+	if (virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_GEOMETRY)) {
+		virtio_cread(vblk->vdev, struct virtio_blk_config,
+			     geometry.cylinders, &geo->cylinders);
+		virtio_cread(vblk->vdev, struct virtio_blk_config,
+			     geometry.heads, &geo->heads);
+		virtio_cread(vblk->vdev, struct virtio_blk_config,
+			     geometry.sectors, &geo->sectors);
+	} else {
+		/* some standard values, similar to sd */
+		geo->heads = 1 << 6;
+		geo->sectors = 1 << 5;
+		geo->cylinders = get_capacity(bd->bd_disk) >> 11;
+	}
+out:
+	mutex_unlock(&vblk->vdev_mutex);
+	return ret;
+}
+
+static void virtblk_free_disk(struct gendisk *disk)
+{
+	struct virtio_blk *vblk = disk->private_data;
+
+	ida_simple_remove(&vd_index_ida, vblk->index);
+	mutex_destroy(&vblk->vdev_mutex);
+	kfree(vblk);
+}
+
+static const struct block_device_operations virtblk_fops = {
+	.owner  	= THIS_MODULE,
+	.getgeo		= virtblk_getgeo,
+	.free_disk	= virtblk_free_disk,
+};
+
+static int index_to_minor(int index)
+{
+	return index << PART_BITS;
+}
+
+static int minor_to_index(int minor)
+{
+	return minor >> PART_BITS;
+}
+
+static ssize_t serial_show(struct device *dev,
+			   struct device_attribute *attr, char *buf)
+{
+	struct gendisk *disk = dev_to_disk(dev);
+	int err;
+
+	/* sysfs gives us a PAGE_SIZE buffer */
+	BUILD_BUG_ON(PAGE_SIZE < VIRTIO_BLK_ID_BYTES);
+
+	buf[VIRTIO_BLK_ID_BYTES] = '\0';
+	err = virtblk_get_id(disk, buf);
+	if (!err)
+		return strlen(buf);
+
+	if (err == -EIO) /* Unsupported? Make it empty. */
+		return 0;
+
+	return err;
+}
+
+static DEVICE_ATTR_RO(serial);
+
+/* The queue's logical block size must be set before calling this */
+static void virtblk_update_capacity(struct virtio_blk *vblk, bool resize)
+{
+	struct virtio_device *vdev = vblk->vdev;
+	struct request_queue *q = vblk->disk->queue;
+	char cap_str_2[10], cap_str_10[10];
+	unsigned long long nblocks;
+	u64 capacity;
+
+	/* Host must always specify the capacity. */
+	virtio_cread(vdev, struct virtio_blk_config, capacity, &capacity);
+
+	nblocks = DIV_ROUND_UP_ULL(capacity, queue_logical_block_size(q) >> 9);
+
+	string_get_size(nblocks, queue_logical_block_size(q),
+			STRING_UNITS_2, cap_str_2, sizeof(cap_str_2));
+	string_get_size(nblocks, queue_logical_block_size(q),
+			STRING_UNITS_10, cap_str_10, sizeof(cap_str_10));
+
+	dev_notice(&vdev->dev,
+		   "[%s] %s%llu %d-byte logical blocks (%s/%s)\n",
+		   vblk->disk->disk_name,
+		   resize ? "new size: " : "",
+		   nblocks,
+		   queue_logical_block_size(q),
+		   cap_str_10,
+		   cap_str_2);
+
+	set_capacity_and_notify(vblk->disk, capacity);
+}
+
+static void virtblk_config_changed_work(struct work_struct *work)
+{
+	struct virtio_blk *vblk =
+		container_of(work, struct virtio_blk, config_work);
+
+	virtblk_update_capacity(vblk, true);
+}
+
+static void virtblk_config_changed(struct virtio_device *vdev)
+{
+	struct virtio_blk *vblk = vdev->priv;
+
+	queue_work(virtblk_wq, &vblk->config_work);
+}
+
+static int init_vq(struct virtio_blk *vblk)
+{
+	int err;
+	unsigned short i;
+	vq_callback_t **callbacks;
+	const char **names;
+	struct virtqueue **vqs;
+	unsigned short num_vqs;
+	unsigned short num_poll_vqs;
+	struct virtio_device *vdev = vblk->vdev;
+	struct irq_affinity desc = { 0, };
+
+	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_MQ,
+				   struct virtio_blk_config, num_queues,
+				   &num_vqs);
+	if (err)
+		num_vqs = 1;
+
+	if (!err && !num_vqs) {
+		dev_err(&vdev->dev, "MQ advertised but zero queues reported\n");
+		return -EINVAL;
+	}
+
+	num_vqs = min_t(unsigned int,
+			min_not_zero(num_request_queues, nr_cpu_ids),
+			num_vqs);
+
+	num_poll_vqs = min_t(unsigned int, poll_queues, num_vqs - 1);
+
+	vblk->io_queues[HCTX_TYPE_DEFAULT] = num_vqs - num_poll_vqs;
+	vblk->io_queues[HCTX_TYPE_READ] = 0;
+	vblk->io_queues[HCTX_TYPE_POLL] = num_poll_vqs;
+
+	dev_info(&vdev->dev, "%d/%d/%d default/read/poll queues\n",
+				vblk->io_queues[HCTX_TYPE_DEFAULT],
+				vblk->io_queues[HCTX_TYPE_READ],
+				vblk->io_queues[HCTX_TYPE_POLL]);
+
+	vblk->vqs = kmalloc_array(num_vqs, sizeof(*vblk->vqs), GFP_KERNEL);
+	if (!vblk->vqs)
+		return -ENOMEM;
+
+	names = kmalloc_array(num_vqs, sizeof(*names), GFP_KERNEL);
+	callbacks = kmalloc_array(num_vqs, sizeof(*callbacks), GFP_KERNEL);
+	vqs = kmalloc_array(num_vqs, sizeof(*vqs), GFP_KERNEL);
+	if (!names || !callbacks || !vqs) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	for (i = 0; i < num_vqs - num_poll_vqs; i++) {
+		callbacks[i] = virtblk_done;
+		snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req.%u", i);
+		names[i] = vblk->vqs[i].name;
+	}
+
+	for (; i < num_vqs; i++) {
+		callbacks[i] = NULL;
+		snprintf(vblk->vqs[i].name, VQ_NAME_LEN, "req_poll.%u", i);
+		names[i] = vblk->vqs[i].name;
+	}
+
+	/* Discover virtqueues and write information to configuration.  */
+	err = virtio_find_vqs(vdev, num_vqs, vqs, callbacks, names, &desc);
+	if (err)
+		goto out;
+
+	for (i = 0; i < num_vqs; i++) {
+		spin_lock_init(&vblk->vqs[i].lock);
+		vblk->vqs[i].vq = vqs[i];
+	}
+	vblk->num_vqs = num_vqs;
+
+out:
+	kfree(vqs);
+	kfree(callbacks);
+	kfree(names);
+	if (err)
+		kfree(vblk->vqs);
+	return err;
+}
+
+/*
+ * Legacy naming scheme used for virtio devices.  We are stuck with it for
+ * virtio blk but don't ever use it for any new driver.
+ */
+static int virtblk_name_format(char *prefix, int index, char *buf, int buflen)
+{
+	const int base = 'z' - 'a' + 1;
+	char *begin = buf + strlen(prefix);
+	char *end = buf + buflen;
+	char *p;
+	int unit;
+
+	p = end - 1;
+	*p = '\0';
+	unit = base;
+	do {
+		if (p == begin)
+			return -EINVAL;
+		*--p = 'a' + (index % unit);
+		index = (index / unit) - 1;
+	} while (index >= 0);
+
+	memmove(begin, p, end - p);
+	memcpy(buf, prefix, strlen(prefix));
+
+	return 0;
+}
+
+static int virtblk_get_cache_mode(struct virtio_device *vdev)
+{
+	u8 writeback;
+	int err;
+
+	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_CONFIG_WCE,
+				   struct virtio_blk_config, wce,
+				   &writeback);
+
+	/*
+	 * If WCE is not configurable and flush is not available,
+	 * assume no writeback cache is in use.
+	 */
+	if (err)
+		writeback = virtio_has_feature(vdev, VIRTIO_BLK_F_FLUSH);
+
+	return writeback;
+}
+
+static void virtblk_update_cache_mode(struct virtio_device *vdev)
+{
+	u8 writeback = virtblk_get_cache_mode(vdev);
+	struct virtio_blk *vblk = vdev->priv;
+
+	blk_queue_write_cache(vblk->disk->queue, writeback, false);
+}
+
+static const char *const virtblk_cache_types[] = {
+	"write through", "write back"
+};
+
+static ssize_t
+cache_type_store(struct device *dev, struct device_attribute *attr,
+		 const char *buf, size_t count)
+{
+	struct gendisk *disk = dev_to_disk(dev);
+	struct virtio_blk *vblk = disk->private_data;
+	struct virtio_device *vdev = vblk->vdev;
+	int i;
+
+	BUG_ON(!virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_CONFIG_WCE));
+	i = sysfs_match_string(virtblk_cache_types, buf);
+	if (i < 0)
+		return i;
+
+	virtio_cwrite8(vdev, offsetof(struct virtio_blk_config, wce), i);
+	virtblk_update_cache_mode(vdev);
+	return count;
+}
+
+static ssize_t
+cache_type_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct gendisk *disk = dev_to_disk(dev);
+	struct virtio_blk *vblk = disk->private_data;
+	u8 writeback = virtblk_get_cache_mode(vblk->vdev);
+
+	BUG_ON(writeback >= ARRAY_SIZE(virtblk_cache_types));
+	return sysfs_emit(buf, "%s\n", virtblk_cache_types[writeback]);
+}
+
+static DEVICE_ATTR_RW(cache_type);
+
+static struct attribute *virtblk_attrs[] = {
+	&dev_attr_serial.attr,
+	&dev_attr_cache_type.attr,
+	NULL,
+};
+
+static umode_t virtblk_attrs_are_visible(struct kobject *kobj,
+		struct attribute *a, int n)
+{
+	struct device *dev = kobj_to_dev(kobj);
+	struct gendisk *disk = dev_to_disk(dev);
+	struct virtio_blk *vblk = disk->private_data;
+	struct virtio_device *vdev = vblk->vdev;
+
+	if (a == &dev_attr_cache_type.attr &&
+	    !virtio_has_feature(vdev, VIRTIO_BLK_F_CONFIG_WCE))
+		return S_IRUGO;
+
+	return a->mode;
+}
+
+static const struct attribute_group virtblk_attr_group = {
+	.attrs = virtblk_attrs,
+	.is_visible = virtblk_attrs_are_visible,
+};
+
+static const struct attribute_group *virtblk_attr_groups[] = {
+	&virtblk_attr_group,
+	NULL,
+};
+
+static void virtblk_map_queues(struct blk_mq_tag_set *set)
+{
+	struct virtio_blk *vblk = set->driver_data;
+	int i, qoff;
+
+	for (i = 0, qoff = 0; i < set->nr_maps; i++) {
+		struct blk_mq_queue_map *map = &set->map[i];
+
+		map->nr_queues = vblk->io_queues[i];
+		map->queue_offset = qoff;
+		qoff += map->nr_queues;
+
+		if (map->nr_queues == 0)
+			continue;
+
+		/*
+		 * Regular queues have interrupts and hence CPU affinity is
+		 * defined by the core virtio code, but polling queues have
+		 * no interrupts so we let the block layer assign CPU affinity.
+		 */
+		if (i == HCTX_TYPE_POLL)
+			blk_mq_map_queues(&set->map[i]);
+		else
+			blk_mq_virtio_map_queues(&set->map[i], vblk->vdev, 0);
+	}
+}
+
+static void virtblk_complete_batch(struct io_comp_batch *iob)
+{
+	struct request *req;
+
+	rq_list_for_each(&iob->req_list, req) {
+		virtblk_unmap_data(req, blk_mq_rq_to_pdu(req));
+		virtblk_cleanup_cmd(req);
+	}
+	blk_mq_end_request_batch(iob);
+}
+
+static int virtblk_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
+{
+	struct virtio_blk *vblk = hctx->queue->queuedata;
+	struct virtio_blk_vq *vq = get_virtio_blk_vq(hctx);
+	struct virtblk_req *vbr;
+	unsigned long flags;
+	unsigned int len;
+	int found = 0;
+
+	spin_lock_irqsave(&vq->lock, flags);
+
+	while ((vbr = virtqueue_get_buf(vq->vq, &len)) != NULL) {
+		struct request *req = blk_mq_rq_from_pdu(vbr);
+
+		found++;
+		if (!blk_mq_add_to_batch(req, iob, vbr->status,
+						virtblk_complete_batch))
+			blk_mq_complete_request(req);
+	}
+
+	if (found)
+		blk_mq_start_stopped_hw_queues(vblk->disk->queue, true);
+
+	spin_unlock_irqrestore(&vq->lock, flags);
+
+	return found;
+}
+
+static const struct blk_mq_ops virtio_mq_ops = {
+	.queue_rq	= virtio_queue_rq,
+	.queue_rqs	= virtio_queue_rqs,
+	.commit_rqs	= virtio_commit_rqs,
+	.complete	= virtblk_request_done,
+	.map_queues	= virtblk_map_queues,
+	.poll		= virtblk_poll,
+};
+
+static unsigned int virtblk_queue_depth;
+module_param_named(queue_depth, virtblk_queue_depth, uint, 0444);
+
+static int virtblk_probe(struct virtio_device *vdev)
+{
+	struct virtio_blk *vblk;
+	struct request_queue *q;
+	int err, index;
+
+	u32 v, blk_size, max_size, sg_elems, opt_io_size;
+	u32 max_discard_segs = 0;
+	u32 discard_granularity = 0;
+	u16 min_io_size;
+	u8 physical_block_exp, alignment_offset;
+	unsigned int queue_depth;
+	size_t max_dma_size;
+
+	if (!vdev->config->get) {
+		dev_err(&vdev->dev, "%s failure: config access disabled\n",
+			__func__);
+		return -EINVAL;
+	}
+
+	err = ida_simple_get(&vd_index_ida, 0, minor_to_index(1 << MINORBITS),
+			     GFP_KERNEL);
+	if (err < 0)
+		goto out;
+	index = err;
+
+	/* We need to know how many segments before we allocate. */
+	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_SEG_MAX,
+				   struct virtio_blk_config, seg_max,
+				   &sg_elems);
+
+	/* We need at least one SG element, whatever they say. */
+	if (err || !sg_elems)
+		sg_elems = 1;
+
+	/* Prevent integer overflows and honor max vq size */
+	sg_elems = min_t(u32, sg_elems, VIRTIO_BLK_MAX_SG_ELEMS - 2);
+
+	vdev->priv = vblk = kmalloc(sizeof(*vblk), GFP_KERNEL);
+	if (!vblk) {
+		err = -ENOMEM;
+		goto out_free_index;
+	}
+
+	mutex_init(&vblk->vdev_mutex);
+
+	vblk->vdev = vdev;
+
+	INIT_WORK(&vblk->config_work, virtblk_config_changed_work);
+
+	err = init_vq(vblk);
+	if (err)
+		goto out_free_vblk;
+
+	/* Default queue sizing is to fill the ring. */
+	if (!virtblk_queue_depth) {
+		queue_depth = vblk->vqs[0].vq->num_free;
+		/* ... but without indirect descs, we use 2 descs per req */
+		if (!virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC))
+			queue_depth /= 2;
+	} else {
+		queue_depth = virtblk_queue_depth;
+	}
+
+	memset(&vblk->tag_set, 0, sizeof(vblk->tag_set));
+	vblk->tag_set.ops = &virtio_mq_ops;
+	vblk->tag_set.queue_depth = queue_depth;
+	vblk->tag_set.numa_node = NUMA_NO_NODE;
+	vblk->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
+	vblk->tag_set.cmd_size =
+		sizeof(struct virtblk_req) +
+		sizeof(struct scatterlist) * VIRTIO_BLK_INLINE_SG_CNT;
+	vblk->tag_set.driver_data = vblk;
+	vblk->tag_set.nr_hw_queues = vblk->num_vqs;
+	vblk->tag_set.nr_maps = 1;
+	if (vblk->io_queues[HCTX_TYPE_POLL])
+		vblk->tag_set.nr_maps = 3;
+
+	err = blk_mq_alloc_tag_set(&vblk->tag_set);
+	if (err)
+		goto out_free_vq;
+
+	vblk->disk = blk_mq_alloc_disk(&vblk->tag_set, vblk);
+	if (IS_ERR(vblk->disk)) {
+		err = PTR_ERR(vblk->disk);
+		goto out_free_tags;
+	}
+	q = vblk->disk->queue;
+
+	virtblk_name_format("vd", index, vblk->disk->disk_name, DISK_NAME_LEN);
+
+	vblk->disk->major = major;
+	vblk->disk->first_minor = index_to_minor(index);
+	vblk->disk->minors = 1 << PART_BITS;
+	vblk->disk->private_data = vblk;
+	vblk->disk->fops = &virtblk_fops;
+	vblk->index = index;
+
+	/* configure queue flush support */
+	virtblk_update_cache_mode(vdev);
+
+	/* If disk is read-only in the host, the guest should obey */
+	if (virtio_has_feature(vdev, VIRTIO_BLK_F_RO))
+		set_disk_ro(vblk->disk, 1);
+
+	/* We can handle whatever the host told us to handle. */
+	blk_queue_max_segments(q, sg_elems);
+
+	/* No real sector limit. */
+	blk_queue_max_hw_sectors(q, -1U);
+
+	max_dma_size = virtio_max_dma_size(vdev);
+	max_size = max_dma_size > U32_MAX ? U32_MAX : max_dma_size;
+
+	/* Host can optionally specify maximum segment size and number of
+	 * segments. */
+	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_SIZE_MAX,
+				   struct virtio_blk_config, size_max, &v);
+	if (!err)
+		max_size = min(max_size, v);
+
+	blk_queue_max_segment_size(q, max_size);
+
+	/* Host can optionally specify the block size of the device */
+	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_BLK_SIZE,
+				   struct virtio_blk_config, blk_size,
+				   &blk_size);
+	if (!err) {
+		err = blk_validate_block_size(blk_size);
+		if (err) {
+			dev_err(&vdev->dev,
+				"virtio_blk: invalid block size: 0x%x\n",
+				blk_size);
+			goto out_cleanup_disk;
+		}
+
+		blk_queue_logical_block_size(q, blk_size);
+	} else
+		blk_size = queue_logical_block_size(q);
+
+	/* Use topology information if available */
+	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
+				   struct virtio_blk_config, physical_block_exp,
+				   &physical_block_exp);
+	if (!err && physical_block_exp)
+		blk_queue_physical_block_size(q,
+				blk_size * (1 << physical_block_exp));
+
+	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
+				   struct virtio_blk_config, alignment_offset,
+				   &alignment_offset);
+	if (!err && alignment_offset)
+		blk_queue_alignment_offset(q, blk_size * alignment_offset);
+
+	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
+				   struct virtio_blk_config, min_io_size,
+				   &min_io_size);
+	if (!err && min_io_size)
+		blk_queue_io_min(q, blk_size * min_io_size);
+
+	err = virtio_cread_feature(vdev, VIRTIO_BLK_F_TOPOLOGY,
+				   struct virtio_blk_config, opt_io_size,
+				   &opt_io_size);
+	if (!err && opt_io_size)
+		blk_queue_io_opt(q, blk_size * opt_io_size);
+
+	if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD)) {
+		virtio_cread(vdev, struct virtio_blk_config,
+			     discard_sector_alignment, &discard_granularity);
+
+		virtio_cread(vdev, struct virtio_blk_config,
+			     max_discard_sectors, &v);
+		blk_queue_max_discard_sectors(q, v ? v : UINT_MAX);
+
+		virtio_cread(vdev, struct virtio_blk_config, max_discard_seg,
+			     &max_discard_segs);
+	}
+
+	if (virtio_has_feature(vdev, VIRTIO_BLK_F_WRITE_ZEROES)) {
+		virtio_cread(vdev, struct virtio_blk_config,
+			     max_write_zeroes_sectors, &v);
+		blk_queue_max_write_zeroes_sectors(q, v ? v : UINT_MAX);
+	}
+
+	/* The discard and secure erase limits are combined since the Linux
+	 * block layer uses the same limit for both commands.
+	 *
+	 * If both VIRTIO_BLK_F_SECURE_ERASE and VIRTIO_BLK_F_DISCARD features
+	 * are negotiated, we will use the minimum between the limits.
+	 *
+	 * discard sector alignment is set to the minimum between discard_sector_alignment
+	 * and secure_erase_sector_alignment.
+	 *
+	 * max discard sectors is set to the minimum between max_discard_seg and
+	 * max_secure_erase_seg.
+	 */
+	if (virtio_has_feature(vdev, VIRTIO_BLK_F_SECURE_ERASE)) {
+
+		virtio_cread(vdev, struct virtio_blk_config,
+			     secure_erase_sector_alignment, &v);
+
+		/* secure_erase_sector_alignment should not be zero, the device should set a
+		 * valid number of sectors.
+		 */
+		if (!v) {
+			dev_err(&vdev->dev,
+				"virtio_blk: secure_erase_sector_alignment can't be 0\n");
+			err = -EINVAL;
+			goto out_cleanup_disk;
+		}
+
+		discard_granularity = min_not_zero(discard_granularity, v);
+
+		virtio_cread(vdev, struct virtio_blk_config,
+			     max_secure_erase_sectors, &v);
+
+		/* max_secure_erase_sectors should not be zero, the device should set a
+		 * valid number of sectors.
+		 */
+		if (!v) {
+			dev_err(&vdev->dev,
+				"virtio_blk: max_secure_erase_sectors can't be 0\n");
+			err = -EINVAL;
+			goto out_cleanup_disk;
+		}
+
+		blk_queue_max_secure_erase_sectors(q, v);
+
+		virtio_cread(vdev, struct virtio_blk_config,
+			     max_secure_erase_seg, &v);
+
+		/* max_secure_erase_seg should not be zero, the device should set a
+		 * valid number of segments
+		 */
+		if (!v) {
+			dev_err(&vdev->dev,
+				"virtio_blk: max_secure_erase_seg can't be 0\n");
+			err = -EINVAL;
+			goto out_cleanup_disk;
+		}
+
+		max_discard_segs = min_not_zero(max_discard_segs, v);
+	}
+
+	if (virtio_has_feature(vdev, VIRTIO_BLK_F_DISCARD) ||
+	    virtio_has_feature(vdev, VIRTIO_BLK_F_SECURE_ERASE)) {
+		/* max_discard_seg and discard_granularity will be 0 only
+		 * if max_discard_seg and discard_sector_alignment fields in the virtio
+		 * config are 0 and VIRTIO_BLK_F_SECURE_ERASE feature is not negotiated.
+		 * In this case, we use default values.
+		 */
+		if (!max_discard_segs)
+			max_discard_segs = sg_elems;
+
+		blk_queue_max_discard_segments(q,
+					       min(max_discard_segs, MAX_DISCARD_SEGMENTS));
+
+		if (discard_granularity)
+			q->limits.discard_granularity = discard_granularity << SECTOR_SHIFT;
+		else
+			q->limits.discard_granularity = blk_size;
+	}
+
+	virtblk_update_capacity(vblk, false);
+	virtio_device_ready(vdev);
+
+	err = device_add_disk(&vdev->dev, vblk->disk, virtblk_attr_groups);
+	if (err)
+		goto out_cleanup_disk;
+
+	return 0;
+
+out_cleanup_disk:
+	put_disk(vblk->disk);
+out_free_tags:
+	blk_mq_free_tag_set(&vblk->tag_set);
+out_free_vq:
+	vdev->config->del_vqs(vdev);
+	kfree(vblk->vqs);
+out_free_vblk:
+	kfree(vblk);
+out_free_index:
+	ida_simple_remove(&vd_index_ida, index);
+out:
+	return err;
+}
+
+static void virtblk_remove(struct virtio_device *vdev)
+{
+	struct virtio_blk *vblk = vdev->priv;
+
+	/* Make sure no work handler is accessing the device. */
+	flush_work(&vblk->config_work);
+
+	del_gendisk(vblk->disk);
+	blk_mq_free_tag_set(&vblk->tag_set);
+
+	mutex_lock(&vblk->vdev_mutex);
+
+	/* Stop all the virtqueues. */
+	virtio_reset_device(vdev);
+
+	/* Virtqueues are stopped, nothing can use vblk->vdev anymore. */
+	vblk->vdev = NULL;
+
+	vdev->config->del_vqs(vdev);
+	kfree(vblk->vqs);
+
+	mutex_unlock(&vblk->vdev_mutex);
+
+	put_disk(vblk->disk);
+}
+
+#ifdef CONFIG_PM_SLEEP
+static int virtblk_freeze(struct virtio_device *vdev)
+{
+	struct virtio_blk *vblk = vdev->priv;
+
+	/* Ensure we don't receive any more interrupts */
+	virtio_reset_device(vdev);
+
+	/* Make sure no work handler is accessing the device. */
+	flush_work(&vblk->config_work);
+
+	blk_mq_quiesce_queue(vblk->disk->queue);
+
+	vdev->config->del_vqs(vdev);
+	kfree(vblk->vqs);
+
+	return 0;
+}
+
+static int virtblk_restore(struct virtio_device *vdev)
+{
+	struct virtio_blk *vblk = vdev->priv;
+	int ret;
+
+	ret = init_vq(vdev->priv);
+	if (ret)
+		return ret;
+
+	virtio_device_ready(vdev);
+
+	blk_mq_unquiesce_queue(vblk->disk->queue);
+	return 0;
+}
+#endif
+
+static const struct virtio_device_id id_table[] = {
+	{ VIRTIO_ID_BLOCK, VIRTIO_DEV_ANY_ID },
+	{ 0 },
+};
+
+static unsigned int features_legacy[] = {
+	VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY,
+	VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE,
+	VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE,
+	VIRTIO_BLK_F_MQ, VIRTIO_BLK_F_DISCARD, VIRTIO_BLK_F_WRITE_ZEROES,
+	VIRTIO_BLK_F_SECURE_ERASE,
+}
+;
+static unsigned int features[] = {
+	VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY,
+	VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE,
+	VIRTIO_BLK_F_FLUSH, VIRTIO_BLK_F_TOPOLOGY, VIRTIO_BLK_F_CONFIG_WCE,
+	VIRTIO_BLK_F_MQ, VIRTIO_BLK_F_DISCARD, VIRTIO_BLK_F_WRITE_ZEROES,
+	VIRTIO_BLK_F_SECURE_ERASE,
+};
+
+static struct virtio_driver virtio_blk = {
+	.feature_table			= features,
+	.feature_table_size		= ARRAY_SIZE(features),
+	.feature_table_legacy		= features_legacy,
+	.feature_table_size_legacy	= ARRAY_SIZE(features_legacy),
+	.driver.name			= KBUILD_MODNAME,
+	.driver.owner			= THIS_MODULE,
+	.id_table			= id_table,
+	.probe				= virtblk_probe,
+	.remove				= virtblk_remove,
+	.config_changed			= virtblk_config_changed,
+#ifdef CONFIG_PM_SLEEP
+	.freeze				= virtblk_freeze,
+	.restore			= virtblk_restore,
+#endif
+};
+
+static int __init virtio_blk_init(void)
+{
+	int error;
+
+	virtblk_wq = alloc_workqueue("virtio-blk", 0, 0);
+	if (!virtblk_wq)
+		return -ENOMEM;
+
+	major = register_blkdev(0, "virtblk");
+	if (major < 0) {
+		error = major;
+		goto out_destroy_workqueue;
+	}
+
+	error = register_virtio_driver(&virtio_blk);
+	if (error)
+		goto out_unregister_blkdev;
+	return 0;
+
+out_unregister_blkdev:
+	unregister_blkdev(major, "virtblk");
+out_destroy_workqueue:
+	destroy_workqueue(virtblk_wq);
+	return error;
+}
+
+static void __exit virtio_blk_fini(void)
+{
+	unregister_virtio_driver(&virtio_blk);
+	unregister_blkdev(major, "virtblk");
+	destroy_workqueue(virtblk_wq);
+}
+module_init(virtio_blk_init);
+module_exit(virtio_blk_fini);
+
+MODULE_DEVICE_TABLE(virtio, id_table);
+MODULE_DESCRIPTION("Virtio block driver");
+MODULE_LICENSE("GPL");
diff --git a/drivers/block/xen-blkback/Makefile b/drivers/block/xen-blkback/Makefile
new file mode 100644
index 000000000..b0ea5ab5b
--- /dev/null
+++ b/drivers/block/xen-blkback/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+obj-$(CONFIG_XEN_BLKDEV_BACKEND) := xen-blkback.o
+
+xen-blkback-y	:= blkback.o xenbus.o
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
new file mode 100644
index 000000000..a5cf7f1e8
--- /dev/null
+++ b/drivers/block/xen-blkback/blkback.c
@@ -0,0 +1,1463 @@
+/******************************************************************************
+ *
+ * Back-end of the driver for virtual block devices. This portion of the
+ * driver exports a 'unified' block-device interface that can be accessed
+ * by any operating system that implements a compatible front end. A
+ * reference front-end implementation can be found in:
+ *  drivers/block/xen-blkfront.c
+ *
+ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
+ * Copyright (c) 2005, Christopher Clark
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#define pr_fmt(fmt) "xen-blkback: " fmt
+
+#include <linux/spinlock.h>
+#include <linux/kthread.h>
+#include <linux/list.h>
+#include <linux/delay.h>
+#include <linux/freezer.h>
+#include <linux/bitmap.h>
+
+#include <xen/events.h>
+#include <xen/page.h>
+#include <xen/xen.h>
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+#include <xen/balloon.h>
+#include <xen/grant_table.h>
+#include "common.h"
+
+/*
+ * Maximum number of unused free pages to keep in the internal buffer.
+ * Setting this to a value too low will reduce memory used in each backend,
+ * but can have a performance penalty.
+ *
+ * A sane value is xen_blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST, but can
+ * be set to a lower value that might degrade performance on some intensive
+ * IO workloads.
+ */
+
+static int max_buffer_pages = 1024;
+module_param_named(max_buffer_pages, max_buffer_pages, int, 0644);
+MODULE_PARM_DESC(max_buffer_pages,
+"Maximum number of free pages to keep in each block backend buffer");
+
+/*
+ * Maximum number of grants to map persistently in blkback. For maximum
+ * performance this should be the total numbers of grants that can be used
+ * to fill the ring, but since this might become too high, specially with
+ * the use of indirect descriptors, we set it to a value that provides good
+ * performance without using too much memory.
+ *
+ * When the list of persistent grants is full we clean it up using a LRU
+ * algorithm.
+ */
+
+static int max_pgrants = 1056;
+module_param_named(max_persistent_grants, max_pgrants, int, 0644);
+MODULE_PARM_DESC(max_persistent_grants,
+                 "Maximum number of grants to map persistently");
+
+/*
+ * How long a persistent grant is allowed to remain allocated without being in
+ * use. The time is in seconds, 0 means indefinitely long.
+ */
+
+static unsigned int pgrant_timeout = 60;
+module_param_named(persistent_grant_unused_seconds, pgrant_timeout,
+		   uint, 0644);
+MODULE_PARM_DESC(persistent_grant_unused_seconds,
+		 "Time in seconds an unused persistent grant is allowed to "
+		 "remain allocated. Default is 60, 0 means unlimited.");
+
+/*
+ * Maximum number of rings/queues blkback supports, allow as many queues as there
+ * are CPUs if user has not specified a value.
+ */
+unsigned int xenblk_max_queues;
+module_param_named(max_queues, xenblk_max_queues, uint, 0644);
+MODULE_PARM_DESC(max_queues,
+		 "Maximum number of hardware queues per virtual disk." \
+		 "By default it is the number of online CPUs.");
+
+/*
+ * Maximum order of pages to be used for the shared ring between front and
+ * backend, 4KB page granularity is used.
+ */
+unsigned int xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER;
+module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, 0444);
+MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the shared ring");
+/*
+ * The LRU mechanism to clean the lists of persistent grants needs to
+ * be executed periodically. The time interval between consecutive executions
+ * of the purge mechanism is set in ms.
+ */
+#define LRU_INTERVAL 100
+
+/*
+ * When the persistent grants list is full we will remove unused grants
+ * from the list. The percent number of grants to be removed at each LRU
+ * execution.
+ */
+#define LRU_PERCENT_CLEAN 5
+
+/* Run-time switchable: /sys/module/blkback/parameters/ */
+static unsigned int log_stats;
+module_param(log_stats, int, 0644);
+
+#define BLKBACK_INVALID_HANDLE (~0)
+
+static inline bool persistent_gnt_timeout(struct persistent_gnt *persistent_gnt)
+{
+	return pgrant_timeout && (jiffies - persistent_gnt->last_used >=
+			HZ * pgrant_timeout);
+}
+
+#define vaddr(page) ((unsigned long)pfn_to_kaddr(page_to_pfn(page)))
+
+static int do_block_io_op(struct xen_blkif_ring *ring, unsigned int *eoi_flags);
+static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
+				struct blkif_request *req,
+				struct pending_req *pending_req);
+static void make_response(struct xen_blkif_ring *ring, u64 id,
+			  unsigned short op, int st);
+
+#define foreach_grant_safe(pos, n, rbtree, node) \
+	for ((pos) = container_of(rb_first((rbtree)), typeof(*(pos)), node), \
+	     (n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL; \
+	     &(pos)->node != NULL; \
+	     (pos) = container_of(n, typeof(*(pos)), node), \
+	     (n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL)
+
+
+/*
+ * We don't need locking around the persistent grant helpers
+ * because blkback uses a single-thread for each backend, so we
+ * can be sure that this functions will never be called recursively.
+ *
+ * The only exception to that is put_persistent_grant, that can be called
+ * from interrupt context (by xen_blkbk_unmap), so we have to use atomic
+ * bit operations to modify the flags of a persistent grant and to count
+ * the number of used grants.
+ */
+static int add_persistent_gnt(struct xen_blkif_ring *ring,
+			       struct persistent_gnt *persistent_gnt)
+{
+	struct rb_node **new = NULL, *parent = NULL;
+	struct persistent_gnt *this;
+	struct xen_blkif *blkif = ring->blkif;
+
+	if (ring->persistent_gnt_c >= max_pgrants) {
+		if (!blkif->vbd.overflow_max_grants)
+			blkif->vbd.overflow_max_grants = 1;
+		return -EBUSY;
+	}
+	/* Figure out where to put new node */
+	new = &ring->persistent_gnts.rb_node;
+	while (*new) {
+		this = container_of(*new, struct persistent_gnt, node);
+
+		parent = *new;
+		if (persistent_gnt->gnt < this->gnt)
+			new = &((*new)->rb_left);
+		else if (persistent_gnt->gnt > this->gnt)
+			new = &((*new)->rb_right);
+		else {
+			pr_alert_ratelimited("trying to add a gref that's already in the tree\n");
+			return -EINVAL;
+		}
+	}
+
+	persistent_gnt->active = true;
+	/* Add new node and rebalance tree. */
+	rb_link_node(&(persistent_gnt->node), parent, new);
+	rb_insert_color(&(persistent_gnt->node), &ring->persistent_gnts);
+	ring->persistent_gnt_c++;
+	atomic_inc(&ring->persistent_gnt_in_use);
+	return 0;
+}
+
+static struct persistent_gnt *get_persistent_gnt(struct xen_blkif_ring *ring,
+						 grant_ref_t gref)
+{
+	struct persistent_gnt *data;
+	struct rb_node *node = NULL;
+
+	node = ring->persistent_gnts.rb_node;
+	while (node) {
+		data = container_of(node, struct persistent_gnt, node);
+
+		if (gref < data->gnt)
+			node = node->rb_left;
+		else if (gref > data->gnt)
+			node = node->rb_right;
+		else {
+			if (data->active) {
+				pr_alert_ratelimited("requesting a grant already in use\n");
+				return NULL;
+			}
+			data->active = true;
+			atomic_inc(&ring->persistent_gnt_in_use);
+			return data;
+		}
+	}
+	return NULL;
+}
+
+static void put_persistent_gnt(struct xen_blkif_ring *ring,
+                               struct persistent_gnt *persistent_gnt)
+{
+	if (!persistent_gnt->active)
+		pr_alert_ratelimited("freeing a grant already unused\n");
+	persistent_gnt->last_used = jiffies;
+	persistent_gnt->active = false;
+	atomic_dec(&ring->persistent_gnt_in_use);
+}
+
+static void free_persistent_gnts(struct xen_blkif_ring *ring, struct rb_root *root,
+                                 unsigned int num)
+{
+	struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	struct persistent_gnt *persistent_gnt;
+	struct rb_node *n;
+	int segs_to_unmap = 0;
+	struct gntab_unmap_queue_data unmap_data;
+
+	unmap_data.pages = pages;
+	unmap_data.unmap_ops = unmap;
+	unmap_data.kunmap_ops = NULL;
+
+	foreach_grant_safe(persistent_gnt, n, root, node) {
+		BUG_ON(persistent_gnt->handle ==
+			BLKBACK_INVALID_HANDLE);
+		gnttab_set_unmap_op(&unmap[segs_to_unmap],
+			(unsigned long) pfn_to_kaddr(page_to_pfn(
+				persistent_gnt->page)),
+			GNTMAP_host_map,
+			persistent_gnt->handle);
+
+		pages[segs_to_unmap] = persistent_gnt->page;
+
+		if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST ||
+			!rb_next(&persistent_gnt->node)) {
+
+			unmap_data.count = segs_to_unmap;
+			BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
+
+			gnttab_page_cache_put(&ring->free_pages, pages,
+					      segs_to_unmap);
+			segs_to_unmap = 0;
+		}
+
+		rb_erase(&persistent_gnt->node, root);
+		kfree(persistent_gnt);
+		num--;
+	}
+	BUG_ON(num != 0);
+}
+
+void xen_blkbk_unmap_purged_grants(struct work_struct *work)
+{
+	struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	struct persistent_gnt *persistent_gnt;
+	int segs_to_unmap = 0;
+	struct xen_blkif_ring *ring = container_of(work, typeof(*ring), persistent_purge_work);
+	struct gntab_unmap_queue_data unmap_data;
+
+	unmap_data.pages = pages;
+	unmap_data.unmap_ops = unmap;
+	unmap_data.kunmap_ops = NULL;
+
+	while(!list_empty(&ring->persistent_purge_list)) {
+		persistent_gnt = list_first_entry(&ring->persistent_purge_list,
+		                                  struct persistent_gnt,
+		                                  remove_node);
+		list_del(&persistent_gnt->remove_node);
+
+		gnttab_set_unmap_op(&unmap[segs_to_unmap],
+			vaddr(persistent_gnt->page),
+			GNTMAP_host_map,
+			persistent_gnt->handle);
+
+		pages[segs_to_unmap] = persistent_gnt->page;
+
+		if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
+			unmap_data.count = segs_to_unmap;
+			BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
+			gnttab_page_cache_put(&ring->free_pages, pages,
+					      segs_to_unmap);
+			segs_to_unmap = 0;
+		}
+		kfree(persistent_gnt);
+	}
+	if (segs_to_unmap > 0) {
+		unmap_data.count = segs_to_unmap;
+		BUG_ON(gnttab_unmap_refs_sync(&unmap_data));
+		gnttab_page_cache_put(&ring->free_pages, pages, segs_to_unmap);
+	}
+}
+
+static void purge_persistent_gnt(struct xen_blkif_ring *ring)
+{
+	struct persistent_gnt *persistent_gnt;
+	struct rb_node *n;
+	unsigned int num_clean, total;
+	bool scan_used = false;
+	struct rb_root *root;
+
+	if (work_busy(&ring->persistent_purge_work)) {
+		pr_alert_ratelimited("Scheduled work from previous purge is still busy, cannot purge list\n");
+		goto out;
+	}
+
+	if (ring->persistent_gnt_c < max_pgrants ||
+	    (ring->persistent_gnt_c == max_pgrants &&
+	    !ring->blkif->vbd.overflow_max_grants)) {
+		num_clean = 0;
+	} else {
+		num_clean = (max_pgrants / 100) * LRU_PERCENT_CLEAN;
+		num_clean = ring->persistent_gnt_c - max_pgrants + num_clean;
+		num_clean = min(ring->persistent_gnt_c, num_clean);
+		pr_debug("Going to purge at least %u persistent grants\n",
+			 num_clean);
+	}
+
+	/*
+	 * At this point, we can assure that there will be no calls
+         * to get_persistent_grant (because we are executing this code from
+         * xen_blkif_schedule), there can only be calls to put_persistent_gnt,
+         * which means that the number of currently used grants will go down,
+         * but never up, so we will always be able to remove the requested
+         * number of grants.
+	 */
+
+	total = 0;
+
+	BUG_ON(!list_empty(&ring->persistent_purge_list));
+	root = &ring->persistent_gnts;
+purge_list:
+	foreach_grant_safe(persistent_gnt, n, root, node) {
+		BUG_ON(persistent_gnt->handle ==
+			BLKBACK_INVALID_HANDLE);
+
+		if (persistent_gnt->active)
+			continue;
+		if (!scan_used && !persistent_gnt_timeout(persistent_gnt))
+			continue;
+		if (scan_used && total >= num_clean)
+			continue;
+
+		rb_erase(&persistent_gnt->node, root);
+		list_add(&persistent_gnt->remove_node,
+			 &ring->persistent_purge_list);
+		total++;
+	}
+	/*
+	 * Check whether we also need to start cleaning
+	 * grants that were used since last purge in order to cope
+	 * with the requested num
+	 */
+	if (!scan_used && total < num_clean) {
+		pr_debug("Still missing %u purged frames\n", num_clean - total);
+		scan_used = true;
+		goto purge_list;
+	}
+
+	if (total) {
+		ring->persistent_gnt_c -= total;
+		ring->blkif->vbd.overflow_max_grants = 0;
+
+		/* We can defer this work */
+		schedule_work(&ring->persistent_purge_work);
+		pr_debug("Purged %u/%u\n", num_clean, total);
+	}
+
+out:
+	return;
+}
+
+/*
+ * Retrieve from the 'pending_reqs' a free pending_req structure to be used.
+ */
+static struct pending_req *alloc_req(struct xen_blkif_ring *ring)
+{
+	struct pending_req *req = NULL;
+	unsigned long flags;
+
+	spin_lock_irqsave(&ring->pending_free_lock, flags);
+	if (!list_empty(&ring->pending_free)) {
+		req = list_entry(ring->pending_free.next, struct pending_req,
+				 free_list);
+		list_del(&req->free_list);
+	}
+	spin_unlock_irqrestore(&ring->pending_free_lock, flags);
+	return req;
+}
+
+/*
+ * Return the 'pending_req' structure back to the freepool. We also
+ * wake up the thread if it was waiting for a free page.
+ */
+static void free_req(struct xen_blkif_ring *ring, struct pending_req *req)
+{
+	unsigned long flags;
+	int was_empty;
+
+	spin_lock_irqsave(&ring->pending_free_lock, flags);
+	was_empty = list_empty(&ring->pending_free);
+	list_add(&req->free_list, &ring->pending_free);
+	spin_unlock_irqrestore(&ring->pending_free_lock, flags);
+	if (was_empty)
+		wake_up(&ring->pending_free_wq);
+}
+
+/*
+ * Routines for managing virtual block devices (vbds).
+ */
+static int xen_vbd_translate(struct phys_req *req, struct xen_blkif *blkif,
+			     enum req_op operation)
+{
+	struct xen_vbd *vbd = &blkif->vbd;
+	int rc = -EACCES;
+
+	if ((operation != REQ_OP_READ) && vbd->readonly)
+		goto out;
+
+	if (likely(req->nr_sects)) {
+		blkif_sector_t end = req->sector_number + req->nr_sects;
+
+		if (unlikely(end < req->sector_number))
+			goto out;
+		if (unlikely(end > vbd_sz(vbd)))
+			goto out;
+	}
+
+	req->dev  = vbd->pdevice;
+	req->bdev = vbd->bdev;
+	rc = 0;
+
+ out:
+	return rc;
+}
+
+static void xen_vbd_resize(struct xen_blkif *blkif)
+{
+	struct xen_vbd *vbd = &blkif->vbd;
+	struct xenbus_transaction xbt;
+	int err;
+	struct xenbus_device *dev = xen_blkbk_xenbus(blkif->be);
+	unsigned long long new_size = vbd_sz(vbd);
+
+	pr_info("VBD Resize: Domid: %d, Device: (%d, %d)\n",
+		blkif->domid, MAJOR(vbd->pdevice), MINOR(vbd->pdevice));
+	pr_info("VBD Resize: new size %llu\n", new_size);
+	vbd->size = new_size;
+again:
+	err = xenbus_transaction_start(&xbt);
+	if (err) {
+		pr_warn("Error starting transaction\n");
+		return;
+	}
+	err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
+			    (unsigned long long)vbd_sz(vbd));
+	if (err) {
+		pr_warn("Error writing new size\n");
+		goto abort;
+	}
+	/*
+	 * Write the current state; we will use this to synchronize
+	 * the front-end. If the current state is "connected" the
+	 * front-end will get the new size information online.
+	 */
+	err = xenbus_printf(xbt, dev->nodename, "state", "%d", dev->state);
+	if (err) {
+		pr_warn("Error writing the state\n");
+		goto abort;
+	}
+
+	err = xenbus_transaction_end(xbt, 0);
+	if (err == -EAGAIN)
+		goto again;
+	if (err)
+		pr_warn("Error ending transaction\n");
+	return;
+abort:
+	xenbus_transaction_end(xbt, 1);
+}
+
+/*
+ * Notification from the guest OS.
+ */
+static void blkif_notify_work(struct xen_blkif_ring *ring)
+{
+	ring->waiting_reqs = 1;
+	wake_up(&ring->wq);
+}
+
+irqreturn_t xen_blkif_be_int(int irq, void *dev_id)
+{
+	blkif_notify_work(dev_id);
+	return IRQ_HANDLED;
+}
+
+/*
+ * SCHEDULER FUNCTIONS
+ */
+
+static void print_stats(struct xen_blkif_ring *ring)
+{
+	pr_info("(%s): oo %3llu  |  rd %4llu  |  wr %4llu  |  f %4llu"
+		 "  |  ds %4llu | pg: %4u/%4d\n",
+		 current->comm, ring->st_oo_req,
+		 ring->st_rd_req, ring->st_wr_req,
+		 ring->st_f_req, ring->st_ds_req,
+		 ring->persistent_gnt_c, max_pgrants);
+	ring->st_print = jiffies + msecs_to_jiffies(10 * 1000);
+	ring->st_rd_req = 0;
+	ring->st_wr_req = 0;
+	ring->st_oo_req = 0;
+	ring->st_ds_req = 0;
+}
+
+int xen_blkif_schedule(void *arg)
+{
+	struct xen_blkif_ring *ring = arg;
+	struct xen_blkif *blkif = ring->blkif;
+	struct xen_vbd *vbd = &blkif->vbd;
+	unsigned long timeout;
+	int ret;
+	bool do_eoi;
+	unsigned int eoi_flags = XEN_EOI_FLAG_SPURIOUS;
+
+	set_freezable();
+	while (!kthread_should_stop()) {
+		if (try_to_freeze())
+			continue;
+		if (unlikely(vbd->size != vbd_sz(vbd)))
+			xen_vbd_resize(blkif);
+
+		timeout = msecs_to_jiffies(LRU_INTERVAL);
+
+		timeout = wait_event_interruptible_timeout(
+			ring->wq,
+			ring->waiting_reqs || kthread_should_stop(),
+			timeout);
+		if (timeout == 0)
+			goto purge_gnt_list;
+		timeout = wait_event_interruptible_timeout(
+			ring->pending_free_wq,
+			!list_empty(&ring->pending_free) ||
+			kthread_should_stop(),
+			timeout);
+		if (timeout == 0)
+			goto purge_gnt_list;
+
+		do_eoi = ring->waiting_reqs;
+
+		ring->waiting_reqs = 0;
+		smp_mb(); /* clear flag *before* checking for work */
+
+		ret = do_block_io_op(ring, &eoi_flags);
+		if (ret > 0)
+			ring->waiting_reqs = 1;
+		if (ret == -EACCES)
+			wait_event_interruptible(ring->shutdown_wq,
+						 kthread_should_stop());
+
+		if (do_eoi && !ring->waiting_reqs) {
+			xen_irq_lateeoi(ring->irq, eoi_flags);
+			eoi_flags |= XEN_EOI_FLAG_SPURIOUS;
+		}
+
+purge_gnt_list:
+		if (blkif->vbd.feature_gnt_persistent &&
+		    time_after(jiffies, ring->next_lru)) {
+			purge_persistent_gnt(ring);
+			ring->next_lru = jiffies + msecs_to_jiffies(LRU_INTERVAL);
+		}
+
+		/* Shrink the free pages pool if it is too large. */
+		if (time_before(jiffies, blkif->buffer_squeeze_end))
+			gnttab_page_cache_shrink(&ring->free_pages, 0);
+		else
+			gnttab_page_cache_shrink(&ring->free_pages,
+						 max_buffer_pages);
+
+		if (log_stats && time_after(jiffies, ring->st_print))
+			print_stats(ring);
+	}
+
+	/* Drain pending purge work */
+	flush_work(&ring->persistent_purge_work);
+
+	if (log_stats)
+		print_stats(ring);
+
+	ring->xenblkd = NULL;
+
+	return 0;
+}
+
+/*
+ * Remove persistent grants and empty the pool of free pages
+ */
+void xen_blkbk_free_caches(struct xen_blkif_ring *ring)
+{
+	/* Free all persistent grant pages */
+	if (!RB_EMPTY_ROOT(&ring->persistent_gnts))
+		free_persistent_gnts(ring, &ring->persistent_gnts,
+			ring->persistent_gnt_c);
+
+	BUG_ON(!RB_EMPTY_ROOT(&ring->persistent_gnts));
+	ring->persistent_gnt_c = 0;
+
+	/* Since we are shutting down remove all pages from the buffer */
+	gnttab_page_cache_shrink(&ring->free_pages, 0 /* All */);
+}
+
+static unsigned int xen_blkbk_unmap_prepare(
+	struct xen_blkif_ring *ring,
+	struct grant_page **pages,
+	unsigned int num,
+	struct gnttab_unmap_grant_ref *unmap_ops,
+	struct page **unmap_pages)
+{
+	unsigned int i, invcount = 0;
+
+	for (i = 0; i < num; i++) {
+		if (pages[i]->persistent_gnt != NULL) {
+			put_persistent_gnt(ring, pages[i]->persistent_gnt);
+			continue;
+		}
+		if (pages[i]->handle == BLKBACK_INVALID_HANDLE)
+			continue;
+		unmap_pages[invcount] = pages[i]->page;
+		gnttab_set_unmap_op(&unmap_ops[invcount], vaddr(pages[i]->page),
+				    GNTMAP_host_map, pages[i]->handle);
+		pages[i]->handle = BLKBACK_INVALID_HANDLE;
+		invcount++;
+	}
+
+	return invcount;
+}
+
+static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_queue_data *data)
+{
+	struct pending_req *pending_req = (struct pending_req *)(data->data);
+	struct xen_blkif_ring *ring = pending_req->ring;
+	struct xen_blkif *blkif = ring->blkif;
+
+	/* BUG_ON used to reproduce existing behaviour,
+	   but is this the best way to deal with this? */
+	BUG_ON(result);
+
+	gnttab_page_cache_put(&ring->free_pages, data->pages, data->count);
+	make_response(ring, pending_req->id,
+		      pending_req->operation, pending_req->status);
+	free_req(ring, pending_req);
+	/*
+	 * Make sure the request is freed before releasing blkif,
+	 * or there could be a race between free_req and the
+	 * cleanup done in xen_blkif_free during shutdown.
+	 *
+	 * NB: The fact that we might try to wake up pending_free_wq
+	 * before drain_complete (in case there's a drain going on)
+	 * it's not a problem with our current implementation
+	 * because we can assure there's no thread waiting on
+	 * pending_free_wq if there's a drain going on, but it has
+	 * to be taken into account if the current model is changed.
+	 */
+	if (atomic_dec_and_test(&ring->inflight) && atomic_read(&blkif->drain)) {
+		complete(&blkif->drain_complete);
+	}
+	xen_blkif_put(blkif);
+}
+
+static void xen_blkbk_unmap_and_respond(struct pending_req *req)
+{
+	struct gntab_unmap_queue_data* work = &req->gnttab_unmap_data;
+	struct xen_blkif_ring *ring = req->ring;
+	struct grant_page **pages = req->segments;
+	unsigned int invcount;
+
+	invcount = xen_blkbk_unmap_prepare(ring, pages, req->nr_segs,
+					   req->unmap, req->unmap_pages);
+
+	work->data = req;
+	work->done = xen_blkbk_unmap_and_respond_callback;
+	work->unmap_ops = req->unmap;
+	work->kunmap_ops = NULL;
+	work->pages = req->unmap_pages;
+	work->count = invcount;
+
+	gnttab_unmap_refs_async(&req->gnttab_unmap_data);
+}
+
+
+/*
+ * Unmap the grant references.
+ *
+ * This could accumulate ops up to the batch size to reduce the number
+ * of hypercalls, but since this is only used in error paths there's
+ * no real need.
+ */
+static void xen_blkbk_unmap(struct xen_blkif_ring *ring,
+                            struct grant_page *pages[],
+                            int num)
+{
+	struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	struct page *unmap_pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	unsigned int invcount = 0;
+	int ret;
+
+	while (num) {
+		unsigned int batch = min(num, BLKIF_MAX_SEGMENTS_PER_REQUEST);
+
+		invcount = xen_blkbk_unmap_prepare(ring, pages, batch,
+						   unmap, unmap_pages);
+		if (invcount) {
+			ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, invcount);
+			BUG_ON(ret);
+			gnttab_page_cache_put(&ring->free_pages, unmap_pages,
+					      invcount);
+		}
+		pages += batch;
+		num -= batch;
+	}
+}
+
+static int xen_blkbk_map(struct xen_blkif_ring *ring,
+			 struct grant_page *pages[],
+			 int num, bool ro)
+{
+	struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	struct page *pages_to_gnt[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	struct persistent_gnt *persistent_gnt = NULL;
+	phys_addr_t addr = 0;
+	int i, seg_idx, new_map_idx;
+	int segs_to_map = 0;
+	int ret = 0;
+	int last_map = 0, map_until = 0;
+	int use_persistent_gnts;
+	struct xen_blkif *blkif = ring->blkif;
+
+	use_persistent_gnts = (blkif->vbd.feature_gnt_persistent);
+
+	/*
+	 * Fill out preq.nr_sects with proper amount of sectors, and setup
+	 * assign map[..] with the PFN of the page in our domain with the
+	 * corresponding grant reference for each page.
+	 */
+again:
+	for (i = map_until; i < num; i++) {
+		uint32_t flags;
+
+		if (use_persistent_gnts) {
+			persistent_gnt = get_persistent_gnt(
+				ring,
+				pages[i]->gref);
+		}
+
+		if (persistent_gnt) {
+			/*
+			 * We are using persistent grants and
+			 * the grant is already mapped
+			 */
+			pages[i]->page = persistent_gnt->page;
+			pages[i]->persistent_gnt = persistent_gnt;
+		} else {
+			if (gnttab_page_cache_get(&ring->free_pages,
+						  &pages[i]->page)) {
+				gnttab_page_cache_put(&ring->free_pages,
+						      pages_to_gnt,
+						      segs_to_map);
+				ret = -ENOMEM;
+				goto out;
+			}
+			addr = vaddr(pages[i]->page);
+			pages_to_gnt[segs_to_map] = pages[i]->page;
+			pages[i]->persistent_gnt = NULL;
+			flags = GNTMAP_host_map;
+			if (!use_persistent_gnts && ro)
+				flags |= GNTMAP_readonly;
+			gnttab_set_map_op(&map[segs_to_map++], addr,
+					  flags, pages[i]->gref,
+					  blkif->domid);
+		}
+		map_until = i + 1;
+		if (segs_to_map == BLKIF_MAX_SEGMENTS_PER_REQUEST)
+			break;
+	}
+
+	if (segs_to_map)
+		ret = gnttab_map_refs(map, NULL, pages_to_gnt, segs_to_map);
+
+	/*
+	 * Now swizzle the MFN in our domain with the MFN from the other domain
+	 * so that when we access vaddr(pending_req,i) it has the contents of
+	 * the page from the other domain.
+	 */
+	for (seg_idx = last_map, new_map_idx = 0; seg_idx < map_until; seg_idx++) {
+		if (!pages[seg_idx]->persistent_gnt) {
+			/* This is a newly mapped grant */
+			BUG_ON(new_map_idx >= segs_to_map);
+			if (unlikely(map[new_map_idx].status != 0)) {
+				pr_debug("invalid buffer -- could not remap it\n");
+				gnttab_page_cache_put(&ring->free_pages,
+						      &pages[seg_idx]->page, 1);
+				pages[seg_idx]->handle = BLKBACK_INVALID_HANDLE;
+				ret |= !ret;
+				goto next;
+			}
+			pages[seg_idx]->handle = map[new_map_idx].handle;
+		} else {
+			continue;
+		}
+		if (use_persistent_gnts &&
+		    ring->persistent_gnt_c < max_pgrants) {
+			/*
+			 * We are using persistent grants, the grant is
+			 * not mapped but we might have room for it.
+			 */
+			persistent_gnt = kmalloc(sizeof(struct persistent_gnt),
+				                 GFP_KERNEL);
+			if (!persistent_gnt) {
+				/*
+				 * If we don't have enough memory to
+				 * allocate the persistent_gnt struct
+				 * map this grant non-persistenly
+				 */
+				goto next;
+			}
+			persistent_gnt->gnt = map[new_map_idx].ref;
+			persistent_gnt->handle = map[new_map_idx].handle;
+			persistent_gnt->page = pages[seg_idx]->page;
+			if (add_persistent_gnt(ring,
+			                       persistent_gnt)) {
+				kfree(persistent_gnt);
+				persistent_gnt = NULL;
+				goto next;
+			}
+			pages[seg_idx]->persistent_gnt = persistent_gnt;
+			pr_debug("grant %u added to the tree of persistent grants, using %u/%u\n",
+				 persistent_gnt->gnt, ring->persistent_gnt_c,
+				 max_pgrants);
+			goto next;
+		}
+		if (use_persistent_gnts && !blkif->vbd.overflow_max_grants) {
+			blkif->vbd.overflow_max_grants = 1;
+			pr_debug("domain %u, device %#x is using maximum number of persistent grants\n",
+			         blkif->domid, blkif->vbd.handle);
+		}
+		/*
+		 * We could not map this grant persistently, so use it as
+		 * a non-persistent grant.
+		 */
+next:
+		new_map_idx++;
+	}
+	segs_to_map = 0;
+	last_map = map_until;
+	if (!ret && map_until != num)
+		goto again;
+
+out:
+	for (i = last_map; i < num; i++) {
+		/* Don't zap current batch's valid persistent grants. */
+		if(i >= map_until)
+			pages[i]->persistent_gnt = NULL;
+		pages[i]->handle = BLKBACK_INVALID_HANDLE;
+	}
+
+	return ret;
+}
+
+static int xen_blkbk_map_seg(struct pending_req *pending_req)
+{
+	int rc;
+
+	rc = xen_blkbk_map(pending_req->ring, pending_req->segments,
+			   pending_req->nr_segs,
+	                   (pending_req->operation != BLKIF_OP_READ));
+
+	return rc;
+}
+
+static int xen_blkbk_parse_indirect(struct blkif_request *req,
+				    struct pending_req *pending_req,
+				    struct seg_buf seg[],
+				    struct phys_req *preq)
+{
+	struct grant_page **pages = pending_req->indirect_pages;
+	struct xen_blkif_ring *ring = pending_req->ring;
+	int indirect_grefs, rc, n, nseg, i;
+	struct blkif_request_segment *segments = NULL;
+
+	nseg = pending_req->nr_segs;
+	indirect_grefs = INDIRECT_PAGES(nseg);
+	BUG_ON(indirect_grefs > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST);
+
+	for (i = 0; i < indirect_grefs; i++)
+		pages[i]->gref = req->u.indirect.indirect_grefs[i];
+
+	rc = xen_blkbk_map(ring, pages, indirect_grefs, true);
+	if (rc)
+		goto unmap;
+
+	for (n = 0; n < nseg; n++) {
+		uint8_t first_sect, last_sect;
+
+		if ((n % SEGS_PER_INDIRECT_FRAME) == 0) {
+			/* Map indirect segments */
+			if (segments)
+				kunmap_atomic(segments);
+			segments = kmap_atomic(pages[n/SEGS_PER_INDIRECT_FRAME]->page);
+		}
+		i = n % SEGS_PER_INDIRECT_FRAME;
+
+		pending_req->segments[n]->gref = segments[i].gref;
+
+		first_sect = READ_ONCE(segments[i].first_sect);
+		last_sect = READ_ONCE(segments[i].last_sect);
+		if (last_sect >= (XEN_PAGE_SIZE >> 9) || last_sect < first_sect) {
+			rc = -EINVAL;
+			goto unmap;
+		}
+
+		seg[n].nsec = last_sect - first_sect + 1;
+		seg[n].offset = first_sect << 9;
+		preq->nr_sects += seg[n].nsec;
+	}
+
+unmap:
+	if (segments)
+		kunmap_atomic(segments);
+	xen_blkbk_unmap(ring, pages, indirect_grefs);
+	return rc;
+}
+
+static int dispatch_discard_io(struct xen_blkif_ring *ring,
+				struct blkif_request *req)
+{
+	int err = 0;
+	int status = BLKIF_RSP_OKAY;
+	struct xen_blkif *blkif = ring->blkif;
+	struct block_device *bdev = blkif->vbd.bdev;
+	struct phys_req preq;
+
+	xen_blkif_get(blkif);
+
+	preq.sector_number = req->u.discard.sector_number;
+	preq.nr_sects      = req->u.discard.nr_sectors;
+
+	err = xen_vbd_translate(&preq, blkif, REQ_OP_WRITE);
+	if (err) {
+		pr_warn("access denied: DISCARD [%llu->%llu] on dev=%04x\n",
+			preq.sector_number,
+			preq.sector_number + preq.nr_sects, blkif->vbd.pdevice);
+		goto fail_response;
+	}
+	ring->st_ds_req++;
+
+	if (blkif->vbd.discard_secure &&
+	    (req->u.discard.flag & BLKIF_DISCARD_SECURE))
+		err = blkdev_issue_secure_erase(bdev,
+				req->u.discard.sector_number,
+				req->u.discard.nr_sectors, GFP_KERNEL);
+	else
+		err = blkdev_issue_discard(bdev, req->u.discard.sector_number,
+				req->u.discard.nr_sectors, GFP_KERNEL);
+
+fail_response:
+	if (err == -EOPNOTSUPP) {
+		pr_debug("discard op failed, not supported\n");
+		status = BLKIF_RSP_EOPNOTSUPP;
+	} else if (err)
+		status = BLKIF_RSP_ERROR;
+
+	make_response(ring, req->u.discard.id, req->operation, status);
+	xen_blkif_put(blkif);
+	return err;
+}
+
+static int dispatch_other_io(struct xen_blkif_ring *ring,
+			     struct blkif_request *req,
+			     struct pending_req *pending_req)
+{
+	free_req(ring, pending_req);
+	make_response(ring, req->u.other.id, req->operation,
+		      BLKIF_RSP_EOPNOTSUPP);
+	return -EIO;
+}
+
+static void xen_blk_drain_io(struct xen_blkif_ring *ring)
+{
+	struct xen_blkif *blkif = ring->blkif;
+
+	atomic_set(&blkif->drain, 1);
+	do {
+		if (atomic_read(&ring->inflight) == 0)
+			break;
+		wait_for_completion_interruptible_timeout(
+				&blkif->drain_complete, HZ);
+
+		if (!atomic_read(&blkif->drain))
+			break;
+	} while (!kthread_should_stop());
+	atomic_set(&blkif->drain, 0);
+}
+
+static void __end_block_io_op(struct pending_req *pending_req,
+		blk_status_t error)
+{
+	/* An error fails the entire request. */
+	if (pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE &&
+	    error == BLK_STS_NOTSUPP) {
+		pr_debug("flush diskcache op failed, not supported\n");
+		xen_blkbk_flush_diskcache(XBT_NIL, pending_req->ring->blkif->be, 0);
+		pending_req->status = BLKIF_RSP_EOPNOTSUPP;
+	} else if (pending_req->operation == BLKIF_OP_WRITE_BARRIER &&
+		   error == BLK_STS_NOTSUPP) {
+		pr_debug("write barrier op failed, not supported\n");
+		xen_blkbk_barrier(XBT_NIL, pending_req->ring->blkif->be, 0);
+		pending_req->status = BLKIF_RSP_EOPNOTSUPP;
+	} else if (error) {
+		pr_debug("Buffer not up-to-date at end of operation,"
+			 " error=%d\n", error);
+		pending_req->status = BLKIF_RSP_ERROR;
+	}
+
+	/*
+	 * If all of the bio's have completed it is time to unmap
+	 * the grant references associated with 'request' and provide
+	 * the proper response on the ring.
+	 */
+	if (atomic_dec_and_test(&pending_req->pendcnt))
+		xen_blkbk_unmap_and_respond(pending_req);
+}
+
+/*
+ * bio callback.
+ */
+static void end_block_io_op(struct bio *bio)
+{
+	__end_block_io_op(bio->bi_private, bio->bi_status);
+	bio_put(bio);
+}
+
+
+
+/*
+ * Function to copy the from the ring buffer the 'struct blkif_request'
+ * (which has the sectors we want, number of them, grant references, etc),
+ * and transmute  it to the block API to hand it over to the proper block disk.
+ */
+static int
+__do_block_io_op(struct xen_blkif_ring *ring, unsigned int *eoi_flags)
+{
+	union blkif_back_rings *blk_rings = &ring->blk_rings;
+	struct blkif_request req;
+	struct pending_req *pending_req;
+	RING_IDX rc, rp;
+	int more_to_do = 0;
+
+	rc = blk_rings->common.req_cons;
+	rp = blk_rings->common.sring->req_prod;
+	rmb(); /* Ensure we see queued requests up to 'rp'. */
+
+	if (RING_REQUEST_PROD_OVERFLOW(&blk_rings->common, rp)) {
+		rc = blk_rings->common.rsp_prod_pvt;
+		pr_warn("Frontend provided bogus ring requests (%d - %d = %d). Halting ring processing on dev=%04x\n",
+			rp, rc, rp - rc, ring->blkif->vbd.pdevice);
+		return -EACCES;
+	}
+	while (rc != rp) {
+
+		if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc))
+			break;
+
+		/* We've seen a request, so clear spurious eoi flag. */
+		*eoi_flags &= ~XEN_EOI_FLAG_SPURIOUS;
+
+		if (kthread_should_stop()) {
+			more_to_do = 1;
+			break;
+		}
+
+		pending_req = alloc_req(ring);
+		if (NULL == pending_req) {
+			ring->st_oo_req++;
+			more_to_do = 1;
+			break;
+		}
+
+		switch (ring->blkif->blk_protocol) {
+		case BLKIF_PROTOCOL_NATIVE:
+			memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req));
+			break;
+		case BLKIF_PROTOCOL_X86_32:
+			blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc));
+			break;
+		case BLKIF_PROTOCOL_X86_64:
+			blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc));
+			break;
+		default:
+			BUG();
+		}
+		blk_rings->common.req_cons = ++rc; /* before make_response() */
+
+		/* Apply all sanity checks to /private copy/ of request. */
+		barrier();
+
+		switch (req.operation) {
+		case BLKIF_OP_READ:
+		case BLKIF_OP_WRITE:
+		case BLKIF_OP_WRITE_BARRIER:
+		case BLKIF_OP_FLUSH_DISKCACHE:
+		case BLKIF_OP_INDIRECT:
+			if (dispatch_rw_block_io(ring, &req, pending_req))
+				goto done;
+			break;
+		case BLKIF_OP_DISCARD:
+			free_req(ring, pending_req);
+			if (dispatch_discard_io(ring, &req))
+				goto done;
+			break;
+		default:
+			if (dispatch_other_io(ring, &req, pending_req))
+				goto done;
+			break;
+		}
+
+		/* Yield point for this unbounded loop. */
+		cond_resched();
+	}
+done:
+	return more_to_do;
+}
+
+static int
+do_block_io_op(struct xen_blkif_ring *ring, unsigned int *eoi_flags)
+{
+	union blkif_back_rings *blk_rings = &ring->blk_rings;
+	int more_to_do;
+
+	do {
+		more_to_do = __do_block_io_op(ring, eoi_flags);
+		if (more_to_do)
+			break;
+
+		RING_FINAL_CHECK_FOR_REQUESTS(&blk_rings->common, more_to_do);
+	} while (more_to_do);
+
+	return more_to_do;
+}
+/*
+ * Transmutation of the 'struct blkif_request' to a proper 'struct bio'
+ * and call the 'submit_bio' to pass it to the underlying storage.
+ */
+static int dispatch_rw_block_io(struct xen_blkif_ring *ring,
+				struct blkif_request *req,
+				struct pending_req *pending_req)
+{
+	struct phys_req preq;
+	struct seg_buf *seg = pending_req->seg;
+	unsigned int nseg;
+	struct bio *bio = NULL;
+	struct bio **biolist = pending_req->biolist;
+	int i, nbio = 0;
+	enum req_op operation;
+	blk_opf_t operation_flags = 0;
+	struct blk_plug plug;
+	bool drain = false;
+	struct grant_page **pages = pending_req->segments;
+	unsigned short req_operation;
+
+	req_operation = req->operation == BLKIF_OP_INDIRECT ?
+			req->u.indirect.indirect_op : req->operation;
+
+	if ((req->operation == BLKIF_OP_INDIRECT) &&
+	    (req_operation != BLKIF_OP_READ) &&
+	    (req_operation != BLKIF_OP_WRITE)) {
+		pr_debug("Invalid indirect operation (%u)\n", req_operation);
+		goto fail_response;
+	}
+
+	switch (req_operation) {
+	case BLKIF_OP_READ:
+		ring->st_rd_req++;
+		operation = REQ_OP_READ;
+		break;
+	case BLKIF_OP_WRITE:
+		ring->st_wr_req++;
+		operation = REQ_OP_WRITE;
+		operation_flags = REQ_SYNC | REQ_IDLE;
+		break;
+	case BLKIF_OP_WRITE_BARRIER:
+		drain = true;
+		fallthrough;
+	case BLKIF_OP_FLUSH_DISKCACHE:
+		ring->st_f_req++;
+		operation = REQ_OP_WRITE;
+		operation_flags = REQ_PREFLUSH;
+		break;
+	default:
+		operation = 0; /* make gcc happy */
+		goto fail_response;
+		break;
+	}
+
+	/* Check that the number of segments is sane. */
+	nseg = req->operation == BLKIF_OP_INDIRECT ?
+	       req->u.indirect.nr_segments : req->u.rw.nr_segments;
+
+	if (unlikely(nseg == 0 && operation_flags != REQ_PREFLUSH) ||
+	    unlikely((req->operation != BLKIF_OP_INDIRECT) &&
+		     (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) ||
+	    unlikely((req->operation == BLKIF_OP_INDIRECT) &&
+		     (nseg > MAX_INDIRECT_SEGMENTS))) {
+		pr_debug("Bad number of segments in request (%d)\n", nseg);
+		/* Haven't submitted any bio's yet. */
+		goto fail_response;
+	}
+
+	preq.nr_sects      = 0;
+
+	pending_req->ring      = ring;
+	pending_req->id        = req->u.rw.id;
+	pending_req->operation = req_operation;
+	pending_req->status    = BLKIF_RSP_OKAY;
+	pending_req->nr_segs   = nseg;
+
+	if (req->operation != BLKIF_OP_INDIRECT) {
+		preq.dev               = req->u.rw.handle;
+		preq.sector_number     = req->u.rw.sector_number;
+		for (i = 0; i < nseg; i++) {
+			pages[i]->gref = req->u.rw.seg[i].gref;
+			seg[i].nsec = req->u.rw.seg[i].last_sect -
+				req->u.rw.seg[i].first_sect + 1;
+			seg[i].offset = (req->u.rw.seg[i].first_sect << 9);
+			if ((req->u.rw.seg[i].last_sect >= (XEN_PAGE_SIZE >> 9)) ||
+			    (req->u.rw.seg[i].last_sect <
+			     req->u.rw.seg[i].first_sect))
+				goto fail_response;
+			preq.nr_sects += seg[i].nsec;
+		}
+	} else {
+		preq.dev               = req->u.indirect.handle;
+		preq.sector_number     = req->u.indirect.sector_number;
+		if (xen_blkbk_parse_indirect(req, pending_req, seg, &preq))
+			goto fail_response;
+	}
+
+	if (xen_vbd_translate(&preq, ring->blkif, operation) != 0) {
+		pr_debug("access denied: %s of [%llu,%llu] on dev=%04x\n",
+			 operation == REQ_OP_READ ? "read" : "write",
+			 preq.sector_number,
+			 preq.sector_number + preq.nr_sects,
+			 ring->blkif->vbd.pdevice);
+		goto fail_response;
+	}
+
+	/*
+	 * This check _MUST_ be done after xen_vbd_translate as the preq.bdev
+	 * is set there.
+	 */
+	for (i = 0; i < nseg; i++) {
+		if (((int)preq.sector_number|(int)seg[i].nsec) &
+		    ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) {
+			pr_debug("Misaligned I/O request from domain %d\n",
+				 ring->blkif->domid);
+			goto fail_response;
+		}
+	}
+
+	/* Wait on all outstanding I/O's and once that has been completed
+	 * issue the flush.
+	 */
+	if (drain)
+		xen_blk_drain_io(pending_req->ring);
+
+	/*
+	 * If we have failed at this point, we need to undo the M2P override,
+	 * set gnttab_set_unmap_op on all of the grant references and perform
+	 * the hypercall to unmap the grants - that is all done in
+	 * xen_blkbk_unmap.
+	 */
+	if (xen_blkbk_map_seg(pending_req))
+		goto fail_flush;
+
+	/*
+	 * This corresponding xen_blkif_put is done in __end_block_io_op, or
+	 * below (in "!bio") if we are handling a BLKIF_OP_DISCARD.
+	 */
+	xen_blkif_get(ring->blkif);
+	atomic_inc(&ring->inflight);
+
+	for (i = 0; i < nseg; i++) {
+		while ((bio == NULL) ||
+		       (bio_add_page(bio,
+				     pages[i]->page,
+				     seg[i].nsec << 9,
+				     seg[i].offset) == 0)) {
+			bio = bio_alloc(preq.bdev, bio_max_segs(nseg - i),
+					operation | operation_flags,
+					GFP_KERNEL);
+			biolist[nbio++] = bio;
+			bio->bi_private = pending_req;
+			bio->bi_end_io  = end_block_io_op;
+			bio->bi_iter.bi_sector  = preq.sector_number;
+		}
+
+		preq.sector_number += seg[i].nsec;
+	}
+
+	/* This will be hit if the operation was a flush or discard. */
+	if (!bio) {
+		BUG_ON(operation_flags != REQ_PREFLUSH);
+
+		bio = bio_alloc(preq.bdev, 0, operation | operation_flags,
+				GFP_KERNEL);
+		biolist[nbio++] = bio;
+		bio->bi_private = pending_req;
+		bio->bi_end_io  = end_block_io_op;
+	}
+
+	atomic_set(&pending_req->pendcnt, nbio);
+	blk_start_plug(&plug);
+
+	for (i = 0; i < nbio; i++)
+		submit_bio(biolist[i]);
+
+	/* Let the I/Os go.. */
+	blk_finish_plug(&plug);
+
+	if (operation == REQ_OP_READ)
+		ring->st_rd_sect += preq.nr_sects;
+	else if (operation == REQ_OP_WRITE)
+		ring->st_wr_sect += preq.nr_sects;
+
+	return 0;
+
+ fail_flush:
+	xen_blkbk_unmap(ring, pending_req->segments,
+	                pending_req->nr_segs);
+ fail_response:
+	/* Haven't submitted any bio's yet. */
+	make_response(ring, req->u.rw.id, req_operation, BLKIF_RSP_ERROR);
+	free_req(ring, pending_req);
+	msleep(1); /* back off a bit */
+	return -EIO;
+}
+
+
+
+/*
+ * Put a response on the ring on how the operation fared.
+ */
+static void make_response(struct xen_blkif_ring *ring, u64 id,
+			  unsigned short op, int st)
+{
+	struct blkif_response *resp;
+	unsigned long     flags;
+	union blkif_back_rings *blk_rings;
+	int notify;
+
+	spin_lock_irqsave(&ring->blk_ring_lock, flags);
+	blk_rings = &ring->blk_rings;
+	/* Place on the response ring for the relevant domain. */
+	switch (ring->blkif->blk_protocol) {
+	case BLKIF_PROTOCOL_NATIVE:
+		resp = RING_GET_RESPONSE(&blk_rings->native,
+					 blk_rings->native.rsp_prod_pvt);
+		break;
+	case BLKIF_PROTOCOL_X86_32:
+		resp = RING_GET_RESPONSE(&blk_rings->x86_32,
+					 blk_rings->x86_32.rsp_prod_pvt);
+		break;
+	case BLKIF_PROTOCOL_X86_64:
+		resp = RING_GET_RESPONSE(&blk_rings->x86_64,
+					 blk_rings->x86_64.rsp_prod_pvt);
+		break;
+	default:
+		BUG();
+	}
+
+	resp->id        = id;
+	resp->operation = op;
+	resp->status    = st;
+
+	blk_rings->common.rsp_prod_pvt++;
+	RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify);
+	spin_unlock_irqrestore(&ring->blk_ring_lock, flags);
+	if (notify)
+		notify_remote_via_irq(ring->irq);
+}
+
+static int __init xen_blkif_init(void)
+{
+	int rc = 0;
+
+	if (!xen_domain())
+		return -ENODEV;
+
+	if (xen_blkif_max_ring_order > XENBUS_MAX_RING_GRANT_ORDER) {
+		pr_info("Invalid max_ring_order (%d), will use default max: %d.\n",
+			xen_blkif_max_ring_order, XENBUS_MAX_RING_GRANT_ORDER);
+		xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER;
+	}
+
+	if (xenblk_max_queues == 0)
+		xenblk_max_queues = num_online_cpus();
+
+	rc = xen_blkif_interface_init();
+	if (rc)
+		goto failed_init;
+
+	rc = xen_blkif_xenbus_init();
+	if (rc)
+		goto failed_init;
+
+ failed_init:
+	return rc;
+}
+
+module_init(xen_blkif_init);
+
+static void __exit xen_blkif_fini(void)
+{
+	xen_blkif_xenbus_fini();
+	xen_blkif_interface_fini();
+}
+
+module_exit(xen_blkif_fini);
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_ALIAS("xen-backend:vbd");
diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
new file mode 100644
index 000000000..a28473470
--- /dev/null
+++ b/drivers/block/xen-blkback/common.h
@@ -0,0 +1,494 @@
+/*
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __XEN_BLKIF__BACKEND__COMMON_H__
+#define __XEN_BLKIF__BACKEND__COMMON_H__
+
+#include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/blkdev.h>
+#include <linux/vmalloc.h>
+#include <linux/wait.h>
+#include <linux/io.h>
+#include <linux/rbtree.h>
+#include <asm/setup.h>
+#include <asm/hypervisor.h>
+#include <xen/grant_table.h>
+#include <xen/page.h>
+#include <xen/xenbus.h>
+#include <xen/interface/io/ring.h>
+#include <xen/interface/io/blkif.h>
+#include <xen/interface/io/protocols.h>
+
+extern unsigned int xen_blkif_max_ring_order;
+extern unsigned int xenblk_max_queues;
+/*
+ * This is the maximum number of segments that would be allowed in indirect
+ * requests. This value will also be passed to the frontend.
+ */
+#define MAX_INDIRECT_SEGMENTS 256
+
+/*
+ * Xen use 4K pages. The guest may use different page size (4K or 64K)
+ * Number of Xen pages per segment
+ */
+#define XEN_PAGES_PER_SEGMENT   (PAGE_SIZE / XEN_PAGE_SIZE)
+
+#define XEN_PAGES_PER_INDIRECT_FRAME \
+	(XEN_PAGE_SIZE/sizeof(struct blkif_request_segment))
+#define SEGS_PER_INDIRECT_FRAME	\
+	(XEN_PAGES_PER_INDIRECT_FRAME / XEN_PAGES_PER_SEGMENT)
+
+#define MAX_INDIRECT_PAGES \
+	((MAX_INDIRECT_SEGMENTS + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME)
+#define INDIRECT_PAGES(_segs) DIV_ROUND_UP(_segs, XEN_PAGES_PER_INDIRECT_FRAME)
+
+/* Not a real protocol.  Used to generate ring structs which contain
+ * the elements common to all protocols only.  This way we get a
+ * compiler-checkable way to use common struct elements, so we can
+ * avoid using switch(protocol) in a number of places.  */
+struct blkif_common_request {
+	char dummy;
+};
+
+/* i386 protocol version */
+
+struct blkif_x86_32_request_rw {
+	uint8_t        nr_segments;  /* number of segments                   */
+	blkif_vdev_t   handle;       /* only for read/write requests         */
+	uint64_t       id;           /* private guest value, echoed in resp  */
+	blkif_sector_t sector_number;/* start sector idx on disk (r/w only)  */
+	struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+} __attribute__((__packed__));
+
+struct blkif_x86_32_request_discard {
+	uint8_t        flag;         /* BLKIF_DISCARD_SECURE or zero         */
+	blkif_vdev_t   _pad1;        /* was "handle" for read/write requests */
+	uint64_t       id;           /* private guest value, echoed in resp  */
+	blkif_sector_t sector_number;/* start sector idx on disk (r/w only)  */
+	uint64_t       nr_sectors;
+} __attribute__((__packed__));
+
+struct blkif_x86_32_request_other {
+	uint8_t        _pad1;
+	blkif_vdev_t   _pad2;
+	uint64_t       id;           /* private guest value, echoed in resp  */
+} __attribute__((__packed__));
+
+struct blkif_x86_32_request_indirect {
+	uint8_t        indirect_op;
+	uint16_t       nr_segments;
+	uint64_t       id;
+	blkif_sector_t sector_number;
+	blkif_vdev_t   handle;
+	uint16_t       _pad1;
+	grant_ref_t    indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST];
+	/*
+	 * The maximum number of indirect segments (and pages) that will
+	 * be used is determined by MAX_INDIRECT_SEGMENTS, this value
+	 * is also exported to the guest (via xenstore
+	 * feature-max-indirect-segments entry), so the frontend knows how
+	 * many indirect segments the backend supports.
+	 */
+	uint64_t       _pad2;        /* make it 64 byte aligned */
+} __attribute__((__packed__));
+
+struct blkif_x86_32_request {
+	uint8_t        operation;    /* BLKIF_OP_???                         */
+	union {
+		struct blkif_x86_32_request_rw rw;
+		struct blkif_x86_32_request_discard discard;
+		struct blkif_x86_32_request_other other;
+		struct blkif_x86_32_request_indirect indirect;
+	} u;
+} __attribute__((__packed__));
+
+/* x86_64 protocol version */
+
+struct blkif_x86_64_request_rw {
+	uint8_t        nr_segments;  /* number of segments                   */
+	blkif_vdev_t   handle;       /* only for read/write requests         */
+	uint32_t       _pad1;        /* offsetof(blkif_reqest..,u.rw.id)==8  */
+	uint64_t       id;
+	blkif_sector_t sector_number;/* start sector idx on disk (r/w only)  */
+	struct blkif_request_segment seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+} __attribute__((__packed__));
+
+struct blkif_x86_64_request_discard {
+	uint8_t        flag;         /* BLKIF_DISCARD_SECURE or zero         */
+	blkif_vdev_t   _pad1;        /* was "handle" for read/write requests */
+        uint32_t       _pad2;        /* offsetof(blkif_..,u.discard.id)==8   */
+	uint64_t       id;
+	blkif_sector_t sector_number;/* start sector idx on disk (r/w only)  */
+	uint64_t       nr_sectors;
+} __attribute__((__packed__));
+
+struct blkif_x86_64_request_other {
+	uint8_t        _pad1;
+	blkif_vdev_t   _pad2;
+	uint32_t       _pad3;        /* offsetof(blkif_..,u.discard.id)==8   */
+	uint64_t       id;           /* private guest value, echoed in resp  */
+} __attribute__((__packed__));
+
+struct blkif_x86_64_request_indirect {
+	uint8_t        indirect_op;
+	uint16_t       nr_segments;
+	uint32_t       _pad1;        /* offsetof(blkif_..,u.indirect.id)==8   */
+	uint64_t       id;
+	blkif_sector_t sector_number;
+	blkif_vdev_t   handle;
+	uint16_t       _pad2;
+	grant_ref_t    indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST];
+	/*
+	 * The maximum number of indirect segments (and pages) that will
+	 * be used is determined by MAX_INDIRECT_SEGMENTS, this value
+	 * is also exported to the guest (via xenstore
+	 * feature-max-indirect-segments entry), so the frontend knows how
+	 * many indirect segments the backend supports.
+	 */
+	uint32_t       _pad3;        /* make it 64 byte aligned */
+} __attribute__((__packed__));
+
+struct blkif_x86_64_request {
+	uint8_t        operation;    /* BLKIF_OP_???                         */
+	union {
+		struct blkif_x86_64_request_rw rw;
+		struct blkif_x86_64_request_discard discard;
+		struct blkif_x86_64_request_other other;
+		struct blkif_x86_64_request_indirect indirect;
+	} u;
+} __attribute__((__packed__));
+
+DEFINE_RING_TYPES(blkif_common, struct blkif_common_request,
+		  struct blkif_response);
+DEFINE_RING_TYPES(blkif_x86_32, struct blkif_x86_32_request,
+		  struct blkif_response __packed);
+DEFINE_RING_TYPES(blkif_x86_64, struct blkif_x86_64_request,
+		  struct blkif_response);
+
+union blkif_back_rings {
+	struct blkif_back_ring        native;
+	struct blkif_common_back_ring common;
+	struct blkif_x86_32_back_ring x86_32;
+	struct blkif_x86_64_back_ring x86_64;
+};
+
+enum blkif_protocol {
+	BLKIF_PROTOCOL_NATIVE = 1,
+	BLKIF_PROTOCOL_X86_32 = 2,
+	BLKIF_PROTOCOL_X86_64 = 3,
+};
+
+/*
+ * Default protocol if the frontend doesn't specify one.
+ */
+#ifdef CONFIG_X86
+#  define BLKIF_PROTOCOL_DEFAULT BLKIF_PROTOCOL_X86_32
+#else
+#  define BLKIF_PROTOCOL_DEFAULT BLKIF_PROTOCOL_NATIVE
+#endif
+
+struct xen_vbd {
+	/* What the domain refers to this vbd as. */
+	blkif_vdev_t		handle;
+	/* Non-zero -> read-only */
+	unsigned char		readonly;
+	/* VDISK_xxx */
+	unsigned char		type;
+	/* phys device that this vbd maps to. */
+	u32			pdevice;
+	struct block_device	*bdev;
+	/* Cached size parameter. */
+	sector_t		size;
+	unsigned int		flush_support:1;
+	unsigned int		discard_secure:1;
+	/* Connect-time cached feature_persistent parameter value */
+	unsigned int		feature_gnt_persistent_parm:1;
+	/* Persistent grants feature negotiation result */
+	unsigned int		feature_gnt_persistent:1;
+	unsigned int		overflow_max_grants:1;
+};
+
+struct backend_info;
+
+/* Number of requests that we can fit in a ring */
+#define XEN_BLKIF_REQS_PER_PAGE		32
+
+struct persistent_gnt {
+	struct page *page;
+	grant_ref_t gnt;
+	grant_handle_t handle;
+	unsigned long last_used;
+	bool active;
+	struct rb_node node;
+	struct list_head remove_node;
+};
+
+/* Per-ring information. */
+struct xen_blkif_ring {
+	/* Physical parameters of the comms window. */
+	unsigned int		irq;
+	union blkif_back_rings	blk_rings;
+	void			*blk_ring;
+	/* Private fields. */
+	spinlock_t		blk_ring_lock;
+
+	wait_queue_head_t	wq;
+	atomic_t		inflight;
+	bool			active;
+	/* One thread per blkif ring. */
+	struct task_struct	*xenblkd;
+	unsigned int		waiting_reqs;
+
+	/* List of all 'pending_req' available */
+	struct list_head	pending_free;
+	/* And its spinlock. */
+	spinlock_t		pending_free_lock;
+	wait_queue_head_t	pending_free_wq;
+
+	/* Tree to store persistent grants. */
+	struct rb_root		persistent_gnts;
+	unsigned int		persistent_gnt_c;
+	atomic_t		persistent_gnt_in_use;
+	unsigned long           next_lru;
+
+	/* Statistics. */
+	unsigned long		st_print;
+	unsigned long long	st_rd_req;
+	unsigned long long	st_wr_req;
+	unsigned long long	st_oo_req;
+	unsigned long long	st_f_req;
+	unsigned long long	st_ds_req;
+	unsigned long long	st_rd_sect;
+	unsigned long long	st_wr_sect;
+
+	/* Used by the kworker that offload work from the persistent purge. */
+	struct list_head	persistent_purge_list;
+	struct work_struct	persistent_purge_work;
+
+	/* Buffer of free pages to map grant refs. */
+	struct gnttab_page_cache free_pages;
+
+	struct work_struct	free_work;
+	/* Thread shutdown wait queue. */
+	wait_queue_head_t	shutdown_wq;
+	struct xen_blkif 	*blkif;
+};
+
+struct xen_blkif {
+	/* Unique identifier for this interface. */
+	domid_t			domid;
+	unsigned int		handle;
+	/* Comms information. */
+	enum blkif_protocol	blk_protocol;
+	/* The VBD attached to this interface. */
+	struct xen_vbd		vbd;
+	/* Back pointer to the backend_info. */
+	struct backend_info	*be;
+	atomic_t		refcnt;
+	/* for barrier (drain) requests */
+	struct completion	drain_complete;
+	atomic_t		drain;
+
+	struct work_struct	free_work;
+	unsigned int 		nr_ring_pages;
+	bool			multi_ref;
+	/* All rings for this device. */
+	struct xen_blkif_ring	*rings;
+	unsigned int		nr_rings;
+	unsigned long		buffer_squeeze_end;
+};
+
+struct seg_buf {
+	unsigned long offset;
+	unsigned int nsec;
+};
+
+struct grant_page {
+	struct page 		*page;
+	struct persistent_gnt	*persistent_gnt;
+	grant_handle_t		handle;
+	grant_ref_t		gref;
+};
+
+/*
+ * Each outstanding request that we've passed to the lower device layers has a
+ * 'pending_req' allocated to it. Each buffer_head that completes decrements
+ * the pendcnt towards zero. When it hits zero, the specified domain has a
+ * response queued for it, with the saved 'id' passed back.
+ */
+struct pending_req {
+	struct xen_blkif_ring   *ring;
+	u64			id;
+	int			nr_segs;
+	atomic_t		pendcnt;
+	unsigned short		operation;
+	int			status;
+	struct list_head	free_list;
+	struct grant_page	*segments[MAX_INDIRECT_SEGMENTS];
+	/* Indirect descriptors */
+	struct grant_page	*indirect_pages[MAX_INDIRECT_PAGES];
+	struct seg_buf		seg[MAX_INDIRECT_SEGMENTS];
+	struct bio		*biolist[MAX_INDIRECT_SEGMENTS];
+	struct gnttab_unmap_grant_ref unmap[MAX_INDIRECT_SEGMENTS];
+	struct page                   *unmap_pages[MAX_INDIRECT_SEGMENTS];
+	struct gntab_unmap_queue_data gnttab_unmap_data;
+};
+
+
+#define vbd_sz(_v)	bdev_nr_sectors((_v)->bdev)
+
+#define xen_blkif_get(_b) (atomic_inc(&(_b)->refcnt))
+#define xen_blkif_put(_b)				\
+	do {						\
+		if (atomic_dec_and_test(&(_b)->refcnt))	\
+			schedule_work(&(_b)->free_work);\
+	} while (0)
+
+struct phys_req {
+	unsigned short		dev;
+	blkif_sector_t		nr_sects;
+	struct block_device	*bdev;
+	blkif_sector_t		sector_number;
+};
+
+int xen_blkif_interface_init(void);
+void xen_blkif_interface_fini(void);
+
+int xen_blkif_xenbus_init(void);
+void xen_blkif_xenbus_fini(void);
+
+irqreturn_t xen_blkif_be_int(int irq, void *dev_id);
+int xen_blkif_schedule(void *arg);
+int xen_blkif_purge_persistent(void *arg);
+void xen_blkbk_free_caches(struct xen_blkif_ring *ring);
+
+int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt,
+			      struct backend_info *be, int state);
+
+int xen_blkbk_barrier(struct xenbus_transaction xbt,
+		      struct backend_info *be, int state);
+struct xenbus_device *xen_blkbk_xenbus(struct backend_info *be);
+void xen_blkbk_unmap_purged_grants(struct work_struct *work);
+
+static inline void blkif_get_x86_32_req(struct blkif_request *dst,
+					struct blkif_x86_32_request *src)
+{
+	int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST, j;
+	dst->operation = READ_ONCE(src->operation);
+	switch (dst->operation) {
+	case BLKIF_OP_READ:
+	case BLKIF_OP_WRITE:
+	case BLKIF_OP_WRITE_BARRIER:
+	case BLKIF_OP_FLUSH_DISKCACHE:
+		dst->u.rw.nr_segments = src->u.rw.nr_segments;
+		dst->u.rw.handle = src->u.rw.handle;
+		dst->u.rw.id = src->u.rw.id;
+		dst->u.rw.sector_number = src->u.rw.sector_number;
+		barrier();
+		if (n > dst->u.rw.nr_segments)
+			n = dst->u.rw.nr_segments;
+		for (i = 0; i < n; i++)
+			dst->u.rw.seg[i] = src->u.rw.seg[i];
+		break;
+	case BLKIF_OP_DISCARD:
+		dst->u.discard.flag = src->u.discard.flag;
+		dst->u.discard.id = src->u.discard.id;
+		dst->u.discard.sector_number = src->u.discard.sector_number;
+		dst->u.discard.nr_sectors = src->u.discard.nr_sectors;
+		break;
+	case BLKIF_OP_INDIRECT:
+		dst->u.indirect.indirect_op = src->u.indirect.indirect_op;
+		dst->u.indirect.nr_segments = src->u.indirect.nr_segments;
+		dst->u.indirect.handle = src->u.indirect.handle;
+		dst->u.indirect.id = src->u.indirect.id;
+		dst->u.indirect.sector_number = src->u.indirect.sector_number;
+		barrier();
+		j = min(MAX_INDIRECT_PAGES, INDIRECT_PAGES(dst->u.indirect.nr_segments));
+		for (i = 0; i < j; i++)
+			dst->u.indirect.indirect_grefs[i] =
+				src->u.indirect.indirect_grefs[i];
+		break;
+	default:
+		/*
+		 * Don't know how to translate this op. Only get the
+		 * ID so failure can be reported to the frontend.
+		 */
+		dst->u.other.id = src->u.other.id;
+		break;
+	}
+}
+
+static inline void blkif_get_x86_64_req(struct blkif_request *dst,
+					struct blkif_x86_64_request *src)
+{
+	int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST, j;
+	dst->operation = READ_ONCE(src->operation);
+	switch (dst->operation) {
+	case BLKIF_OP_READ:
+	case BLKIF_OP_WRITE:
+	case BLKIF_OP_WRITE_BARRIER:
+	case BLKIF_OP_FLUSH_DISKCACHE:
+		dst->u.rw.nr_segments = src->u.rw.nr_segments;
+		dst->u.rw.handle = src->u.rw.handle;
+		dst->u.rw.id = src->u.rw.id;
+		dst->u.rw.sector_number = src->u.rw.sector_number;
+		barrier();
+		if (n > dst->u.rw.nr_segments)
+			n = dst->u.rw.nr_segments;
+		for (i = 0; i < n; i++)
+			dst->u.rw.seg[i] = src->u.rw.seg[i];
+		break;
+	case BLKIF_OP_DISCARD:
+		dst->u.discard.flag = src->u.discard.flag;
+		dst->u.discard.id = src->u.discard.id;
+		dst->u.discard.sector_number = src->u.discard.sector_number;
+		dst->u.discard.nr_sectors = src->u.discard.nr_sectors;
+		break;
+	case BLKIF_OP_INDIRECT:
+		dst->u.indirect.indirect_op = src->u.indirect.indirect_op;
+		dst->u.indirect.nr_segments = src->u.indirect.nr_segments;
+		dst->u.indirect.handle = src->u.indirect.handle;
+		dst->u.indirect.id = src->u.indirect.id;
+		dst->u.indirect.sector_number = src->u.indirect.sector_number;
+		barrier();
+		j = min(MAX_INDIRECT_PAGES, INDIRECT_PAGES(dst->u.indirect.nr_segments));
+		for (i = 0; i < j; i++)
+			dst->u.indirect.indirect_grefs[i] =
+				src->u.indirect.indirect_grefs[i];
+		break;
+	default:
+		/*
+		 * Don't know how to translate this op. Only get the
+		 * ID so failure can be reported to the frontend.
+		 */
+		dst->u.other.id = src->u.other.id;
+		break;
+	}
+}
+
+#endif /* __XEN_BLKIF__BACKEND__COMMON_H__ */
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
new file mode 100644
index 000000000..c0227dfa4
--- /dev/null
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -0,0 +1,1180 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*  Xenbus code for blkif backend
+    Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
+    Copyright (C) 2005 XenSource Ltd
+
+
+*/
+
+#define pr_fmt(fmt) "xen-blkback: " fmt
+
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/pagemap.h>
+#include <xen/events.h>
+#include <xen/grant_table.h>
+#include "common.h"
+
+/* On the XenBus the max length of 'ring-ref%u'. */
+#define RINGREF_NAME_LEN (20)
+
+struct backend_info {
+	struct xenbus_device	*dev;
+	struct xen_blkif	*blkif;
+	struct xenbus_watch	backend_watch;
+	unsigned		major;
+	unsigned		minor;
+	char			*mode;
+};
+
+static struct kmem_cache *xen_blkif_cachep;
+static void connect(struct backend_info *);
+static int connect_ring(struct backend_info *);
+static void backend_changed(struct xenbus_watch *, const char *,
+			    const char *);
+static void xen_blkif_free(struct xen_blkif *blkif);
+static void xen_vbd_free(struct xen_vbd *vbd);
+
+struct xenbus_device *xen_blkbk_xenbus(struct backend_info *be)
+{
+	return be->dev;
+}
+
+/*
+ * The last request could free the device from softirq context and
+ * xen_blkif_free() can sleep.
+ */
+static void xen_blkif_deferred_free(struct work_struct *work)
+{
+	struct xen_blkif *blkif;
+
+	blkif = container_of(work, struct xen_blkif, free_work);
+	xen_blkif_free(blkif);
+}
+
+static int blkback_name(struct xen_blkif *blkif, char *buf)
+{
+	char *devpath, *devname;
+	struct xenbus_device *dev = blkif->be->dev;
+
+	devpath = xenbus_read(XBT_NIL, dev->nodename, "dev", NULL);
+	if (IS_ERR(devpath))
+		return PTR_ERR(devpath);
+
+	devname = strstr(devpath, "/dev/");
+	if (devname != NULL)
+		devname += strlen("/dev/");
+	else
+		devname  = devpath;
+
+	snprintf(buf, TASK_COMM_LEN, "%d.%s", blkif->domid, devname);
+	kfree(devpath);
+
+	return 0;
+}
+
+static void xen_update_blkif_status(struct xen_blkif *blkif)
+{
+	int err;
+	char name[TASK_COMM_LEN];
+	struct xen_blkif_ring *ring;
+	int i;
+
+	/* Not ready to connect? */
+	if (!blkif->rings || !blkif->rings[0].irq || !blkif->vbd.bdev)
+		return;
+
+	/* Already connected? */
+	if (blkif->be->dev->state == XenbusStateConnected)
+		return;
+
+	/* Attempt to connect: exit if we fail to. */
+	connect(blkif->be);
+	if (blkif->be->dev->state != XenbusStateConnected)
+		return;
+
+	err = blkback_name(blkif, name);
+	if (err) {
+		xenbus_dev_error(blkif->be->dev, err, "get blkback dev name");
+		return;
+	}
+
+	err = sync_blockdev(blkif->vbd.bdev);
+	if (err) {
+		xenbus_dev_error(blkif->be->dev, err, "block flush");
+		return;
+	}
+	invalidate_inode_pages2(blkif->vbd.bdev->bd_inode->i_mapping);
+
+	for (i = 0; i < blkif->nr_rings; i++) {
+		ring = &blkif->rings[i];
+		ring->xenblkd = kthread_run(xen_blkif_schedule, ring, "%s-%d", name, i);
+		if (IS_ERR(ring->xenblkd)) {
+			err = PTR_ERR(ring->xenblkd);
+			ring->xenblkd = NULL;
+			xenbus_dev_fatal(blkif->be->dev, err,
+					"start %s-%d xenblkd", name, i);
+			goto out;
+		}
+	}
+	return;
+
+out:
+	while (--i >= 0) {
+		ring = &blkif->rings[i];
+		kthread_stop(ring->xenblkd);
+	}
+	return;
+}
+
+static int xen_blkif_alloc_rings(struct xen_blkif *blkif)
+{
+	unsigned int r;
+
+	blkif->rings = kcalloc(blkif->nr_rings, sizeof(struct xen_blkif_ring),
+			       GFP_KERNEL);
+	if (!blkif->rings)
+		return -ENOMEM;
+
+	for (r = 0; r < blkif->nr_rings; r++) {
+		struct xen_blkif_ring *ring = &blkif->rings[r];
+
+		spin_lock_init(&ring->blk_ring_lock);
+		init_waitqueue_head(&ring->wq);
+		INIT_LIST_HEAD(&ring->pending_free);
+		INIT_LIST_HEAD(&ring->persistent_purge_list);
+		INIT_WORK(&ring->persistent_purge_work, xen_blkbk_unmap_purged_grants);
+		gnttab_page_cache_init(&ring->free_pages);
+
+		spin_lock_init(&ring->pending_free_lock);
+		init_waitqueue_head(&ring->pending_free_wq);
+		init_waitqueue_head(&ring->shutdown_wq);
+		ring->blkif = blkif;
+		ring->st_print = jiffies;
+		ring->active = true;
+	}
+
+	return 0;
+}
+
+/* Enable the persistent grants feature. */
+static bool feature_persistent = true;
+module_param(feature_persistent, bool, 0644);
+MODULE_PARM_DESC(feature_persistent, "Enables the persistent grants feature");
+
+static struct xen_blkif *xen_blkif_alloc(domid_t domid)
+{
+	struct xen_blkif *blkif;
+
+	BUILD_BUG_ON(MAX_INDIRECT_PAGES > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST);
+
+	blkif = kmem_cache_zalloc(xen_blkif_cachep, GFP_KERNEL);
+	if (!blkif)
+		return ERR_PTR(-ENOMEM);
+
+	blkif->domid = domid;
+	atomic_set(&blkif->refcnt, 1);
+	init_completion(&blkif->drain_complete);
+
+	/*
+	 * Because freeing back to the cache may be deferred, it is not
+	 * safe to unload the module (and hence destroy the cache) until
+	 * this has completed. To prevent premature unloading, take an
+	 * extra module reference here and release only when the object
+	 * has been freed back to the cache.
+	 */
+	__module_get(THIS_MODULE);
+	INIT_WORK(&blkif->free_work, xen_blkif_deferred_free);
+
+	return blkif;
+}
+
+static int xen_blkif_map(struct xen_blkif_ring *ring, grant_ref_t *gref,
+			 unsigned int nr_grefs, unsigned int evtchn)
+{
+	int err;
+	struct xen_blkif *blkif = ring->blkif;
+	const struct blkif_common_sring *sring_common;
+	RING_IDX rsp_prod, req_prod;
+	unsigned int size;
+
+	/* Already connected through? */
+	if (ring->irq)
+		return 0;
+
+	err = xenbus_map_ring_valloc(blkif->be->dev, gref, nr_grefs,
+				     &ring->blk_ring);
+	if (err < 0)
+		return err;
+
+	sring_common = (struct blkif_common_sring *)ring->blk_ring;
+	rsp_prod = READ_ONCE(sring_common->rsp_prod);
+	req_prod = READ_ONCE(sring_common->req_prod);
+
+	switch (blkif->blk_protocol) {
+	case BLKIF_PROTOCOL_NATIVE:
+	{
+		struct blkif_sring *sring_native =
+			(struct blkif_sring *)ring->blk_ring;
+
+		BACK_RING_ATTACH(&ring->blk_rings.native, sring_native,
+				 rsp_prod, XEN_PAGE_SIZE * nr_grefs);
+		size = __RING_SIZE(sring_native, XEN_PAGE_SIZE * nr_grefs);
+		break;
+	}
+	case BLKIF_PROTOCOL_X86_32:
+	{
+		struct blkif_x86_32_sring *sring_x86_32 =
+			(struct blkif_x86_32_sring *)ring->blk_ring;
+
+		BACK_RING_ATTACH(&ring->blk_rings.x86_32, sring_x86_32,
+				 rsp_prod, XEN_PAGE_SIZE * nr_grefs);
+		size = __RING_SIZE(sring_x86_32, XEN_PAGE_SIZE * nr_grefs);
+		break;
+	}
+	case BLKIF_PROTOCOL_X86_64:
+	{
+		struct blkif_x86_64_sring *sring_x86_64 =
+			(struct blkif_x86_64_sring *)ring->blk_ring;
+
+		BACK_RING_ATTACH(&ring->blk_rings.x86_64, sring_x86_64,
+				 rsp_prod, XEN_PAGE_SIZE * nr_grefs);
+		size = __RING_SIZE(sring_x86_64, XEN_PAGE_SIZE * nr_grefs);
+		break;
+	}
+	default:
+		BUG();
+	}
+
+	err = -EIO;
+	if (req_prod - rsp_prod > size)
+		goto fail;
+
+	err = bind_interdomain_evtchn_to_irqhandler_lateeoi(blkif->be->dev,
+			evtchn, xen_blkif_be_int, 0, "blkif-backend", ring);
+	if (err < 0)
+		goto fail;
+	ring->irq = err;
+
+	return 0;
+
+fail:
+	xenbus_unmap_ring_vfree(blkif->be->dev, ring->blk_ring);
+	ring->blk_rings.common.sring = NULL;
+	return err;
+}
+
+static int xen_blkif_disconnect(struct xen_blkif *blkif)
+{
+	struct pending_req *req, *n;
+	unsigned int j, r;
+	bool busy = false;
+
+	for (r = 0; r < blkif->nr_rings; r++) {
+		struct xen_blkif_ring *ring = &blkif->rings[r];
+		unsigned int i = 0;
+
+		if (!ring->active)
+			continue;
+
+		if (ring->xenblkd) {
+			kthread_stop(ring->xenblkd);
+			ring->xenblkd = NULL;
+			wake_up(&ring->shutdown_wq);
+		}
+
+		/* The above kthread_stop() guarantees that at this point we
+		 * don't have any discard_io or other_io requests. So, checking
+		 * for inflight IO is enough.
+		 */
+		if (atomic_read(&ring->inflight) > 0) {
+			busy = true;
+			continue;
+		}
+
+		if (ring->irq) {
+			unbind_from_irqhandler(ring->irq, ring);
+			ring->irq = 0;
+		}
+
+		if (ring->blk_rings.common.sring) {
+			xenbus_unmap_ring_vfree(blkif->be->dev, ring->blk_ring);
+			ring->blk_rings.common.sring = NULL;
+		}
+
+		/* Remove all persistent grants and the cache of ballooned pages. */
+		xen_blkbk_free_caches(ring);
+
+		/* Check that there is no request in use */
+		list_for_each_entry_safe(req, n, &ring->pending_free, free_list) {
+			list_del(&req->free_list);
+
+			for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++)
+				kfree(req->segments[j]);
+
+			for (j = 0; j < MAX_INDIRECT_PAGES; j++)
+				kfree(req->indirect_pages[j]);
+
+			kfree(req);
+			i++;
+		}
+
+		BUG_ON(atomic_read(&ring->persistent_gnt_in_use) != 0);
+		BUG_ON(!list_empty(&ring->persistent_purge_list));
+		BUG_ON(!RB_EMPTY_ROOT(&ring->persistent_gnts));
+		BUG_ON(ring->free_pages.num_pages != 0);
+		BUG_ON(ring->persistent_gnt_c != 0);
+		WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages));
+		ring->active = false;
+	}
+	if (busy)
+		return -EBUSY;
+
+	blkif->nr_ring_pages = 0;
+	/*
+	 * blkif->rings was allocated in connect_ring, so we should free it in
+	 * here.
+	 */
+	kfree(blkif->rings);
+	blkif->rings = NULL;
+	blkif->nr_rings = 0;
+
+	return 0;
+}
+
+static void xen_blkif_free(struct xen_blkif *blkif)
+{
+	WARN_ON(xen_blkif_disconnect(blkif));
+	xen_vbd_free(&blkif->vbd);
+	kfree(blkif->be->mode);
+	kfree(blkif->be);
+
+	/* Make sure everything is drained before shutting down */
+	kmem_cache_free(xen_blkif_cachep, blkif);
+	module_put(THIS_MODULE);
+}
+
+int __init xen_blkif_interface_init(void)
+{
+	xen_blkif_cachep = kmem_cache_create("blkif_cache",
+					     sizeof(struct xen_blkif),
+					     0, 0, NULL);
+	if (!xen_blkif_cachep)
+		return -ENOMEM;
+
+	return 0;
+}
+
+void xen_blkif_interface_fini(void)
+{
+	kmem_cache_destroy(xen_blkif_cachep);
+	xen_blkif_cachep = NULL;
+}
+
+/*
+ *  sysfs interface for VBD I/O requests
+ */
+
+#define VBD_SHOW_ALLRING(name, format)					\
+	static ssize_t show_##name(struct device *_dev,			\
+				   struct device_attribute *attr,	\
+				   char *buf)				\
+	{								\
+		struct xenbus_device *dev = to_xenbus_device(_dev);	\
+		struct backend_info *be = dev_get_drvdata(&dev->dev);	\
+		struct xen_blkif *blkif = be->blkif;			\
+		unsigned int i;						\
+		unsigned long long result = 0;				\
+									\
+		if (!blkif->rings)				\
+			goto out;					\
+									\
+		for (i = 0; i < blkif->nr_rings; i++) {		\
+			struct xen_blkif_ring *ring = &blkif->rings[i];	\
+									\
+			result += ring->st_##name;			\
+		}							\
+									\
+out:									\
+		return sprintf(buf, format, result);			\
+	}								\
+	static DEVICE_ATTR(name, 0444, show_##name, NULL)
+
+VBD_SHOW_ALLRING(oo_req,  "%llu\n");
+VBD_SHOW_ALLRING(rd_req,  "%llu\n");
+VBD_SHOW_ALLRING(wr_req,  "%llu\n");
+VBD_SHOW_ALLRING(f_req,  "%llu\n");
+VBD_SHOW_ALLRING(ds_req,  "%llu\n");
+VBD_SHOW_ALLRING(rd_sect, "%llu\n");
+VBD_SHOW_ALLRING(wr_sect, "%llu\n");
+
+static struct attribute *xen_vbdstat_attrs[] = {
+	&dev_attr_oo_req.attr,
+	&dev_attr_rd_req.attr,
+	&dev_attr_wr_req.attr,
+	&dev_attr_f_req.attr,
+	&dev_attr_ds_req.attr,
+	&dev_attr_rd_sect.attr,
+	&dev_attr_wr_sect.attr,
+	NULL
+};
+
+static const struct attribute_group xen_vbdstat_group = {
+	.name = "statistics",
+	.attrs = xen_vbdstat_attrs,
+};
+
+#define VBD_SHOW(name, format, args...)					\
+	static ssize_t show_##name(struct device *_dev,			\
+				   struct device_attribute *attr,	\
+				   char *buf)				\
+	{								\
+		struct xenbus_device *dev = to_xenbus_device(_dev);	\
+		struct backend_info *be = dev_get_drvdata(&dev->dev);	\
+									\
+		return sprintf(buf, format, ##args);			\
+	}								\
+	static DEVICE_ATTR(name, 0444, show_##name, NULL)
+
+VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor);
+VBD_SHOW(mode, "%s\n", be->mode);
+
+static int xenvbd_sysfs_addif(struct xenbus_device *dev)
+{
+	int error;
+
+	error = device_create_file(&dev->dev, &dev_attr_physical_device);
+	if (error)
+		goto fail1;
+
+	error = device_create_file(&dev->dev, &dev_attr_mode);
+	if (error)
+		goto fail2;
+
+	error = sysfs_create_group(&dev->dev.kobj, &xen_vbdstat_group);
+	if (error)
+		goto fail3;
+
+	return 0;
+
+fail3:	sysfs_remove_group(&dev->dev.kobj, &xen_vbdstat_group);
+fail2:	device_remove_file(&dev->dev, &dev_attr_mode);
+fail1:	device_remove_file(&dev->dev, &dev_attr_physical_device);
+	return error;
+}
+
+static void xenvbd_sysfs_delif(struct xenbus_device *dev)
+{
+	sysfs_remove_group(&dev->dev.kobj, &xen_vbdstat_group);
+	device_remove_file(&dev->dev, &dev_attr_mode);
+	device_remove_file(&dev->dev, &dev_attr_physical_device);
+}
+
+static void xen_vbd_free(struct xen_vbd *vbd)
+{
+	if (vbd->bdev)
+		blkdev_put(vbd->bdev, vbd->readonly ? FMODE_READ : FMODE_WRITE);
+	vbd->bdev = NULL;
+}
+
+static int xen_vbd_create(struct xen_blkif *blkif, blkif_vdev_t handle,
+			  unsigned major, unsigned minor, int readonly,
+			  int cdrom)
+{
+	struct xen_vbd *vbd;
+	struct block_device *bdev;
+
+	vbd = &blkif->vbd;
+	vbd->handle   = handle;
+	vbd->readonly = readonly;
+	vbd->type     = 0;
+
+	vbd->pdevice  = MKDEV(major, minor);
+
+	bdev = blkdev_get_by_dev(vbd->pdevice, vbd->readonly ?
+				 FMODE_READ : FMODE_WRITE, NULL);
+
+	if (IS_ERR(bdev)) {
+		pr_warn("xen_vbd_create: device %08x could not be opened\n",
+			vbd->pdevice);
+		return -ENOENT;
+	}
+
+	vbd->bdev = bdev;
+	if (vbd->bdev->bd_disk == NULL) {
+		pr_warn("xen_vbd_create: device %08x doesn't exist\n",
+			vbd->pdevice);
+		xen_vbd_free(vbd);
+		return -ENOENT;
+	}
+	vbd->size = vbd_sz(vbd);
+
+	if (cdrom || disk_to_cdi(vbd->bdev->bd_disk))
+		vbd->type |= VDISK_CDROM;
+	if (vbd->bdev->bd_disk->flags & GENHD_FL_REMOVABLE)
+		vbd->type |= VDISK_REMOVABLE;
+
+	if (bdev_write_cache(bdev))
+		vbd->flush_support = true;
+	if (bdev_max_secure_erase_sectors(bdev))
+		vbd->discard_secure = true;
+
+	pr_debug("Successful creation of handle=%04x (dom=%u)\n",
+		handle, blkif->domid);
+	return 0;
+}
+
+static int xen_blkbk_remove(struct xenbus_device *dev)
+{
+	struct backend_info *be = dev_get_drvdata(&dev->dev);
+
+	pr_debug("%s %p %d\n", __func__, dev, dev->otherend_id);
+
+	if (be->major || be->minor)
+		xenvbd_sysfs_delif(dev);
+
+	if (be->backend_watch.node) {
+		unregister_xenbus_watch(&be->backend_watch);
+		kfree(be->backend_watch.node);
+		be->backend_watch.node = NULL;
+	}
+
+	dev_set_drvdata(&dev->dev, NULL);
+
+	if (be->blkif) {
+		xen_blkif_disconnect(be->blkif);
+
+		/* Put the reference we set in xen_blkif_alloc(). */
+		xen_blkif_put(be->blkif);
+	}
+
+	return 0;
+}
+
+int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt,
+			      struct backend_info *be, int state)
+{
+	struct xenbus_device *dev = be->dev;
+	int err;
+
+	err = xenbus_printf(xbt, dev->nodename, "feature-flush-cache",
+			    "%d", state);
+	if (err)
+		dev_warn(&dev->dev, "writing feature-flush-cache (%d)", err);
+
+	return err;
+}
+
+static void xen_blkbk_discard(struct xenbus_transaction xbt, struct backend_info *be)
+{
+	struct xenbus_device *dev = be->dev;
+	struct xen_blkif *blkif = be->blkif;
+	int err;
+	int state = 0;
+	struct block_device *bdev = be->blkif->vbd.bdev;
+
+	if (!xenbus_read_unsigned(dev->nodename, "discard-enable", 1))
+		return;
+
+	if (bdev_max_discard_sectors(bdev)) {
+		err = xenbus_printf(xbt, dev->nodename,
+			"discard-granularity", "%u",
+			bdev_discard_granularity(bdev));
+		if (err) {
+			dev_warn(&dev->dev, "writing discard-granularity (%d)", err);
+			return;
+		}
+		err = xenbus_printf(xbt, dev->nodename,
+			"discard-alignment", "%u",
+			bdev_discard_alignment(bdev));
+		if (err) {
+			dev_warn(&dev->dev, "writing discard-alignment (%d)", err);
+			return;
+		}
+		state = 1;
+		/* Optional. */
+		err = xenbus_printf(xbt, dev->nodename,
+				    "discard-secure", "%d",
+				    blkif->vbd.discard_secure);
+		if (err) {
+			dev_warn(&dev->dev, "writing discard-secure (%d)", err);
+			return;
+		}
+	}
+	err = xenbus_printf(xbt, dev->nodename, "feature-discard",
+			    "%d", state);
+	if (err)
+		dev_warn(&dev->dev, "writing feature-discard (%d)", err);
+}
+
+int xen_blkbk_barrier(struct xenbus_transaction xbt,
+		      struct backend_info *be, int state)
+{
+	struct xenbus_device *dev = be->dev;
+	int err;
+
+	err = xenbus_printf(xbt, dev->nodename, "feature-barrier",
+			    "%d", state);
+	if (err)
+		dev_warn(&dev->dev, "writing feature-barrier (%d)", err);
+
+	return err;
+}
+
+/*
+ * Entry point to this code when a new device is created.  Allocate the basic
+ * structures, and watch the store waiting for the hotplug scripts to tell us
+ * the device's physical major and minor numbers.  Switch to InitWait.
+ */
+static int xen_blkbk_probe(struct xenbus_device *dev,
+			   const struct xenbus_device_id *id)
+{
+	int err;
+	struct backend_info *be = kzalloc(sizeof(struct backend_info),
+					  GFP_KERNEL);
+
+	/* match the pr_debug in xen_blkbk_remove */
+	pr_debug("%s %p %d\n", __func__, dev, dev->otherend_id);
+
+	if (!be) {
+		xenbus_dev_fatal(dev, -ENOMEM,
+				 "allocating backend structure");
+		return -ENOMEM;
+	}
+	be->dev = dev;
+	dev_set_drvdata(&dev->dev, be);
+
+	be->blkif = xen_blkif_alloc(dev->otherend_id);
+	if (IS_ERR(be->blkif)) {
+		err = PTR_ERR(be->blkif);
+		be->blkif = NULL;
+		xenbus_dev_fatal(dev, err, "creating block interface");
+		goto fail;
+	}
+
+	err = xenbus_printf(XBT_NIL, dev->nodename,
+			    "feature-max-indirect-segments", "%u",
+			    MAX_INDIRECT_SEGMENTS);
+	if (err)
+		dev_warn(&dev->dev,
+			 "writing %s/feature-max-indirect-segments (%d)",
+			 dev->nodename, err);
+
+	/* Multi-queue: advertise how many queues are supported by us.*/
+	err = xenbus_printf(XBT_NIL, dev->nodename,
+			    "multi-queue-max-queues", "%u", xenblk_max_queues);
+	if (err)
+		pr_warn("Error writing multi-queue-max-queues\n");
+
+	/* setup back pointer */
+	be->blkif->be = be;
+
+	err = xenbus_watch_pathfmt(dev, &be->backend_watch, NULL,
+				   backend_changed,
+				   "%s/%s", dev->nodename, "physical-device");
+	if (err)
+		goto fail;
+
+	err = xenbus_printf(XBT_NIL, dev->nodename, "max-ring-page-order", "%u",
+			    xen_blkif_max_ring_order);
+	if (err)
+		pr_warn("%s write out 'max-ring-page-order' failed\n", __func__);
+
+	err = xenbus_switch_state(dev, XenbusStateInitWait);
+	if (err)
+		goto fail;
+
+	return 0;
+
+fail:
+	pr_warn("%s failed\n", __func__);
+	xen_blkbk_remove(dev);
+	return err;
+}
+
+/*
+ * Callback received when the hotplug scripts have placed the physical-device
+ * node.  Read it and the mode node, and create a vbd.  If the frontend is
+ * ready, connect.
+ */
+static void backend_changed(struct xenbus_watch *watch,
+			    const char *path, const char *token)
+{
+	int err;
+	unsigned major;
+	unsigned minor;
+	struct backend_info *be
+		= container_of(watch, struct backend_info, backend_watch);
+	struct xenbus_device *dev = be->dev;
+	int cdrom = 0;
+	unsigned long handle;
+	char *device_type;
+
+	pr_debug("%s %p %d\n", __func__, dev, dev->otherend_id);
+
+	err = xenbus_scanf(XBT_NIL, dev->nodename, "physical-device", "%x:%x",
+			   &major, &minor);
+	if (XENBUS_EXIST_ERR(err)) {
+		/*
+		 * Since this watch will fire once immediately after it is
+		 * registered, we expect this.  Ignore it, and wait for the
+		 * hotplug scripts.
+		 */
+		return;
+	}
+	if (err != 2) {
+		xenbus_dev_fatal(dev, err, "reading physical-device");
+		return;
+	}
+
+	if (be->major | be->minor) {
+		if (be->major != major || be->minor != minor)
+			pr_warn("changing physical device (from %x:%x to %x:%x) not supported.\n",
+				be->major, be->minor, major, minor);
+		return;
+	}
+
+	be->mode = xenbus_read(XBT_NIL, dev->nodename, "mode", NULL);
+	if (IS_ERR(be->mode)) {
+		err = PTR_ERR(be->mode);
+		be->mode = NULL;
+		xenbus_dev_fatal(dev, err, "reading mode");
+		return;
+	}
+
+	device_type = xenbus_read(XBT_NIL, dev->otherend, "device-type", NULL);
+	if (!IS_ERR(device_type)) {
+		cdrom = strcmp(device_type, "cdrom") == 0;
+		kfree(device_type);
+	}
+
+	/* Front end dir is a number, which is used as the handle. */
+	err = kstrtoul(strrchr(dev->otherend, '/') + 1, 0, &handle);
+	if (err) {
+		kfree(be->mode);
+		be->mode = NULL;
+		return;
+	}
+
+	be->major = major;
+	be->minor = minor;
+
+	err = xen_vbd_create(be->blkif, handle, major, minor,
+			     !strchr(be->mode, 'w'), cdrom);
+
+	if (err)
+		xenbus_dev_fatal(dev, err, "creating vbd structure");
+	else {
+		err = xenvbd_sysfs_addif(dev);
+		if (err) {
+			xen_vbd_free(&be->blkif->vbd);
+			xenbus_dev_fatal(dev, err, "creating sysfs entries");
+		}
+	}
+
+	if (err) {
+		kfree(be->mode);
+		be->mode = NULL;
+		be->major = 0;
+		be->minor = 0;
+	} else {
+		/* We're potentially connected now */
+		xen_update_blkif_status(be->blkif);
+	}
+}
+
+/*
+ * Callback received when the frontend's state changes.
+ */
+static void frontend_changed(struct xenbus_device *dev,
+			     enum xenbus_state frontend_state)
+{
+	struct backend_info *be = dev_get_drvdata(&dev->dev);
+	int err;
+
+	pr_debug("%s %p %s\n", __func__, dev, xenbus_strstate(frontend_state));
+
+	switch (frontend_state) {
+	case XenbusStateInitialising:
+		if (dev->state == XenbusStateClosed) {
+			pr_info("%s: prepare for reconnect\n", dev->nodename);
+			xenbus_switch_state(dev, XenbusStateInitWait);
+		}
+		break;
+
+	case XenbusStateInitialised:
+	case XenbusStateConnected:
+		/*
+		 * Ensure we connect even when two watches fire in
+		 * close succession and we miss the intermediate value
+		 * of frontend_state.
+		 */
+		if (dev->state == XenbusStateConnected)
+			break;
+
+		/*
+		 * Enforce precondition before potential leak point.
+		 * xen_blkif_disconnect() is idempotent.
+		 */
+		err = xen_blkif_disconnect(be->blkif);
+		if (err) {
+			xenbus_dev_fatal(dev, err, "pending I/O");
+			break;
+		}
+
+		err = connect_ring(be);
+		if (err) {
+			/*
+			 * Clean up so that memory resources can be used by
+			 * other devices. connect_ring reported already error.
+			 */
+			xen_blkif_disconnect(be->blkif);
+			break;
+		}
+		xen_update_blkif_status(be->blkif);
+		break;
+
+	case XenbusStateClosing:
+		xenbus_switch_state(dev, XenbusStateClosing);
+		break;
+
+	case XenbusStateClosed:
+		xen_blkif_disconnect(be->blkif);
+		xenbus_switch_state(dev, XenbusStateClosed);
+		if (xenbus_dev_is_online(dev))
+			break;
+		fallthrough;
+		/* if not online */
+	case XenbusStateUnknown:
+		/* implies xen_blkif_disconnect() via xen_blkbk_remove() */
+		device_unregister(&dev->dev);
+		break;
+
+	default:
+		xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
+				 frontend_state);
+		break;
+	}
+}
+
+/* Once a memory pressure is detected, squeeze free page pools for a while. */
+static unsigned int buffer_squeeze_duration_ms = 10;
+module_param_named(buffer_squeeze_duration_ms,
+		buffer_squeeze_duration_ms, int, 0644);
+MODULE_PARM_DESC(buffer_squeeze_duration_ms,
+"Duration in ms to squeeze pages buffer when a memory pressure is detected");
+
+/*
+ * Callback received when the memory pressure is detected.
+ */
+static void reclaim_memory(struct xenbus_device *dev)
+{
+	struct backend_info *be = dev_get_drvdata(&dev->dev);
+
+	if (!be)
+		return;
+	be->blkif->buffer_squeeze_end = jiffies +
+		msecs_to_jiffies(buffer_squeeze_duration_ms);
+}
+
+/* ** Connection ** */
+
+/*
+ * Write the physical details regarding the block device to the store, and
+ * switch to Connected state.
+ */
+static void connect(struct backend_info *be)
+{
+	struct xenbus_transaction xbt;
+	int err;
+	struct xenbus_device *dev = be->dev;
+
+	pr_debug("%s %s\n", __func__, dev->otherend);
+
+	/* Supply the information about the device the frontend needs */
+again:
+	err = xenbus_transaction_start(&xbt);
+	if (err) {
+		xenbus_dev_fatal(dev, err, "starting transaction");
+		return;
+	}
+
+	/* If we can't advertise it is OK. */
+	xen_blkbk_flush_diskcache(xbt, be, be->blkif->vbd.flush_support);
+
+	xen_blkbk_discard(xbt, be);
+
+	xen_blkbk_barrier(xbt, be, be->blkif->vbd.flush_support);
+
+	err = xenbus_printf(xbt, dev->nodename, "feature-persistent", "%u",
+			be->blkif->vbd.feature_gnt_persistent_parm);
+	if (err) {
+		xenbus_dev_fatal(dev, err, "writing %s/feature-persistent",
+				 dev->nodename);
+		goto abort;
+	}
+
+	err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
+			    (unsigned long long)vbd_sz(&be->blkif->vbd));
+	if (err) {
+		xenbus_dev_fatal(dev, err, "writing %s/sectors",
+				 dev->nodename);
+		goto abort;
+	}
+
+	/* FIXME: use a typename instead */
+	err = xenbus_printf(xbt, dev->nodename, "info", "%u",
+			    be->blkif->vbd.type |
+			    (be->blkif->vbd.readonly ? VDISK_READONLY : 0));
+	if (err) {
+		xenbus_dev_fatal(dev, err, "writing %s/info",
+				 dev->nodename);
+		goto abort;
+	}
+	err = xenbus_printf(xbt, dev->nodename, "sector-size", "%lu",
+			    (unsigned long)
+			    bdev_logical_block_size(be->blkif->vbd.bdev));
+	if (err) {
+		xenbus_dev_fatal(dev, err, "writing %s/sector-size",
+				 dev->nodename);
+		goto abort;
+	}
+	err = xenbus_printf(xbt, dev->nodename, "physical-sector-size", "%u",
+			    bdev_physical_block_size(be->blkif->vbd.bdev));
+	if (err)
+		xenbus_dev_error(dev, err, "writing %s/physical-sector-size",
+				 dev->nodename);
+
+	err = xenbus_transaction_end(xbt, 0);
+	if (err == -EAGAIN)
+		goto again;
+	if (err)
+		xenbus_dev_fatal(dev, err, "ending transaction");
+
+	err = xenbus_switch_state(dev, XenbusStateConnected);
+	if (err)
+		xenbus_dev_fatal(dev, err, "%s: switching to Connected state",
+				 dev->nodename);
+
+	return;
+ abort:
+	xenbus_transaction_end(xbt, 1);
+}
+
+/*
+ * Each ring may have multi pages, depends on "ring-page-order".
+ */
+static int read_per_ring_refs(struct xen_blkif_ring *ring, const char *dir)
+{
+	unsigned int ring_ref[XENBUS_MAX_RING_GRANTS];
+	struct pending_req *req, *n;
+	int err, i, j;
+	struct xen_blkif *blkif = ring->blkif;
+	struct xenbus_device *dev = blkif->be->dev;
+	unsigned int nr_grefs, evtchn;
+
+	err = xenbus_scanf(XBT_NIL, dir, "event-channel", "%u",
+			  &evtchn);
+	if (err != 1) {
+		err = -EINVAL;
+		xenbus_dev_fatal(dev, err, "reading %s/event-channel", dir);
+		return err;
+	}
+
+	nr_grefs = blkif->nr_ring_pages;
+
+	if (unlikely(!nr_grefs)) {
+		WARN_ON(true);
+		return -EINVAL;
+	}
+
+	for (i = 0; i < nr_grefs; i++) {
+		char ring_ref_name[RINGREF_NAME_LEN];
+
+		if (blkif->multi_ref)
+			snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i);
+		else {
+			WARN_ON(i != 0);
+			snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref");
+		}
+
+		err = xenbus_scanf(XBT_NIL, dir, ring_ref_name,
+				   "%u", &ring_ref[i]);
+
+		if (err != 1) {
+			err = -EINVAL;
+			xenbus_dev_fatal(dev, err, "reading %s/%s",
+					 dir, ring_ref_name);
+			return err;
+		}
+	}
+
+	err = -ENOMEM;
+	for (i = 0; i < nr_grefs * XEN_BLKIF_REQS_PER_PAGE; i++) {
+		req = kzalloc(sizeof(*req), GFP_KERNEL);
+		if (!req)
+			goto fail;
+		list_add_tail(&req->free_list, &ring->pending_free);
+		for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) {
+			req->segments[j] = kzalloc(sizeof(*req->segments[0]), GFP_KERNEL);
+			if (!req->segments[j])
+				goto fail;
+		}
+		for (j = 0; j < MAX_INDIRECT_PAGES; j++) {
+			req->indirect_pages[j] = kzalloc(sizeof(*req->indirect_pages[0]),
+							 GFP_KERNEL);
+			if (!req->indirect_pages[j])
+				goto fail;
+		}
+	}
+
+	/* Map the shared frame, irq etc. */
+	err = xen_blkif_map(ring, ring_ref, nr_grefs, evtchn);
+	if (err) {
+		xenbus_dev_fatal(dev, err, "mapping ring-ref port %u", evtchn);
+		goto fail;
+	}
+
+	return 0;
+
+fail:
+	list_for_each_entry_safe(req, n, &ring->pending_free, free_list) {
+		list_del(&req->free_list);
+		for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) {
+			if (!req->segments[j])
+				break;
+			kfree(req->segments[j]);
+		}
+		for (j = 0; j < MAX_INDIRECT_PAGES; j++) {
+			if (!req->indirect_pages[j])
+				break;
+			kfree(req->indirect_pages[j]);
+		}
+		kfree(req);
+	}
+	return err;
+}
+
+static int connect_ring(struct backend_info *be)
+{
+	struct xenbus_device *dev = be->dev;
+	struct xen_blkif *blkif = be->blkif;
+	char protocol[64] = "";
+	int err, i;
+	char *xspath;
+	size_t xspathsize;
+	const size_t xenstore_path_ext_size = 11; /* sufficient for "/queue-NNN" */
+	unsigned int requested_num_queues = 0;
+	unsigned int ring_page_order;
+
+	pr_debug("%s %s\n", __func__, dev->otherend);
+
+	blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT;
+	err = xenbus_scanf(XBT_NIL, dev->otherend, "protocol",
+			   "%63s", protocol);
+	if (err <= 0)
+		strcpy(protocol, "unspecified, assuming default");
+	else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE))
+		blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
+	else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32))
+		blkif->blk_protocol = BLKIF_PROTOCOL_X86_32;
+	else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64))
+		blkif->blk_protocol = BLKIF_PROTOCOL_X86_64;
+	else {
+		xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol);
+		return -ENOSYS;
+	}
+
+	blkif->vbd.feature_gnt_persistent_parm = feature_persistent;
+	blkif->vbd.feature_gnt_persistent =
+		blkif->vbd.feature_gnt_persistent_parm &&
+		xenbus_read_unsigned(dev->otherend, "feature-persistent", 0);
+
+	blkif->vbd.overflow_max_grants = 0;
+
+	/*
+	 * Read the number of hardware queues from frontend.
+	 */
+	requested_num_queues = xenbus_read_unsigned(dev->otherend,
+						    "multi-queue-num-queues",
+						    1);
+	if (requested_num_queues > xenblk_max_queues
+	    || requested_num_queues == 0) {
+		/* Buggy or malicious guest. */
+		xenbus_dev_fatal(dev, err,
+				"guest requested %u queues, exceeding the maximum of %u.",
+				requested_num_queues, xenblk_max_queues);
+		return -ENOSYS;
+	}
+	blkif->nr_rings = requested_num_queues;
+	if (xen_blkif_alloc_rings(blkif))
+		return -ENOMEM;
+
+	pr_info("%s: using %d queues, protocol %d (%s) %s\n", dev->nodename,
+		 blkif->nr_rings, blkif->blk_protocol, protocol,
+		 blkif->vbd.feature_gnt_persistent ? "persistent grants" : "");
+
+	err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-page-order", "%u",
+			   &ring_page_order);
+	if (err != 1) {
+		blkif->nr_ring_pages = 1;
+		blkif->multi_ref = false;
+	} else if (ring_page_order <= xen_blkif_max_ring_order) {
+		blkif->nr_ring_pages = 1 << ring_page_order;
+		blkif->multi_ref = true;
+	} else {
+		err = -EINVAL;
+		xenbus_dev_fatal(dev, err,
+				 "requested ring page order %d exceed max:%d",
+				 ring_page_order,
+				 xen_blkif_max_ring_order);
+		return err;
+	}
+
+	if (blkif->nr_rings == 1)
+		return read_per_ring_refs(&blkif->rings[0], dev->otherend);
+	else {
+		xspathsize = strlen(dev->otherend) + xenstore_path_ext_size;
+		xspath = kmalloc(xspathsize, GFP_KERNEL);
+		if (!xspath) {
+			xenbus_dev_fatal(dev, -ENOMEM, "reading ring references");
+			return -ENOMEM;
+		}
+
+		for (i = 0; i < blkif->nr_rings; i++) {
+			memset(xspath, 0, xspathsize);
+			snprintf(xspath, xspathsize, "%s/queue-%u", dev->otherend, i);
+			err = read_per_ring_refs(&blkif->rings[i], xspath);
+			if (err) {
+				kfree(xspath);
+				return err;
+			}
+		}
+		kfree(xspath);
+	}
+	return 0;
+}
+
+static const struct xenbus_device_id xen_blkbk_ids[] = {
+	{ "vbd" },
+	{ "" }
+};
+
+static struct xenbus_driver xen_blkbk_driver = {
+	.ids  = xen_blkbk_ids,
+	.probe = xen_blkbk_probe,
+	.remove = xen_blkbk_remove,
+	.otherend_changed = frontend_changed,
+	.allow_rebind = true,
+	.reclaim_memory = reclaim_memory,
+};
+
+int xen_blkif_xenbus_init(void)
+{
+	return xenbus_register_backend(&xen_blkbk_driver);
+}
+
+void xen_blkif_xenbus_fini(void)
+{
+	xenbus_unregister_driver(&xen_blkbk_driver);
+}
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
new file mode 100644
index 000000000..5ddf393aa
--- /dev/null
+++ b/drivers/block/xen-blkfront.c
@@ -0,0 +1,2649 @@
+/*
+ * blkfront.c
+ *
+ * XenLinux virtual block device driver.
+ *
+ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
+ * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
+ * Copyright (c) 2004, Christian Limpach
+ * Copyright (c) 2004, Andrew Warfield
+ * Copyright (c) 2005, Christopher Clark
+ * Copyright (c) 2005, XenSource Ltd
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/interrupt.h>
+#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
+#include <linux/hdreg.h>
+#include <linux/cdrom.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/major.h>
+#include <linux/mutex.h>
+#include <linux/scatterlist.h>
+#include <linux/bitmap.h>
+#include <linux/list.h>
+#include <linux/workqueue.h>
+#include <linux/sched/mm.h>
+
+#include <xen/xen.h>
+#include <xen/xenbus.h>
+#include <xen/grant_table.h>
+#include <xen/events.h>
+#include <xen/page.h>
+#include <xen/platform_pci.h>
+
+#include <xen/interface/grant_table.h>
+#include <xen/interface/io/blkif.h>
+#include <xen/interface/io/protocols.h>
+
+#include <asm/xen/hypervisor.h>
+
+/*
+ * The minimal size of segment supported by the block framework is PAGE_SIZE.
+ * When Linux is using a different page size than Xen, it may not be possible
+ * to put all the data in a single segment.
+ * This can happen when the backend doesn't support indirect descriptor and
+ * therefore the maximum amount of data that a request can carry is
+ * BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE = 44KB
+ *
+ * Note that we only support one extra request. So the Linux page size
+ * should be <= ( 2 * BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE) =
+ * 88KB.
+ */
+#define HAS_EXTRA_REQ (BLKIF_MAX_SEGMENTS_PER_REQUEST < XEN_PFN_PER_PAGE)
+
+enum blkif_state {
+	BLKIF_STATE_DISCONNECTED,
+	BLKIF_STATE_CONNECTED,
+	BLKIF_STATE_SUSPENDED,
+	BLKIF_STATE_ERROR,
+};
+
+struct grant {
+	grant_ref_t gref;
+	struct page *page;
+	struct list_head node;
+};
+
+enum blk_req_status {
+	REQ_PROCESSING,
+	REQ_WAITING,
+	REQ_DONE,
+	REQ_ERROR,
+	REQ_EOPNOTSUPP,
+};
+
+struct blk_shadow {
+	struct blkif_request req;
+	struct request *request;
+	struct grant **grants_used;
+	struct grant **indirect_grants;
+	struct scatterlist *sg;
+	unsigned int num_sg;
+	enum blk_req_status status;
+
+	#define NO_ASSOCIATED_ID ~0UL
+	/*
+	 * Id of the sibling if we ever need 2 requests when handling a
+	 * block I/O request
+	 */
+	unsigned long associated_id;
+};
+
+struct blkif_req {
+	blk_status_t	error;
+};
+
+static inline struct blkif_req *blkif_req(struct request *rq)
+{
+	return blk_mq_rq_to_pdu(rq);
+}
+
+static DEFINE_MUTEX(blkfront_mutex);
+static const struct block_device_operations xlvbd_block_fops;
+static struct delayed_work blkfront_work;
+static LIST_HEAD(info_list);
+
+/*
+ * Maximum number of segments in indirect requests, the actual value used by
+ * the frontend driver is the minimum of this value and the value provided
+ * by the backend driver.
+ */
+
+static unsigned int xen_blkif_max_segments = 32;
+module_param_named(max_indirect_segments, xen_blkif_max_segments, uint, 0444);
+MODULE_PARM_DESC(max_indirect_segments,
+		 "Maximum amount of segments in indirect requests (default is 32)");
+
+static unsigned int xen_blkif_max_queues = 4;
+module_param_named(max_queues, xen_blkif_max_queues, uint, 0444);
+MODULE_PARM_DESC(max_queues, "Maximum number of hardware queues/rings used per virtual disk");
+
+/*
+ * Maximum order of pages to be used for the shared ring between front and
+ * backend, 4KB page granularity is used.
+ */
+static unsigned int xen_blkif_max_ring_order;
+module_param_named(max_ring_page_order, xen_blkif_max_ring_order, int, 0444);
+MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the shared ring");
+
+static bool __read_mostly xen_blkif_trusted = true;
+module_param_named(trusted, xen_blkif_trusted, bool, 0644);
+MODULE_PARM_DESC(trusted, "Is the backend trusted");
+
+#define BLK_RING_SIZE(info)	\
+	__CONST_RING_SIZE(blkif, XEN_PAGE_SIZE * (info)->nr_ring_pages)
+
+/*
+ * ring-ref%u i=(-1UL) would take 11 characters + 'ring-ref' is 8, so 19
+ * characters are enough. Define to 20 to keep consistent with backend.
+ */
+#define RINGREF_NAME_LEN (20)
+/*
+ * queue-%u would take 7 + 10(UINT_MAX) = 17 characters.
+ */
+#define QUEUE_NAME_LEN (17)
+
+/*
+ *  Per-ring info.
+ *  Every blkfront device can associate with one or more blkfront_ring_info,
+ *  depending on how many hardware queues/rings to be used.
+ */
+struct blkfront_ring_info {
+	/* Lock to protect data in every ring buffer. */
+	spinlock_t ring_lock;
+	struct blkif_front_ring ring;
+	unsigned int ring_ref[XENBUS_MAX_RING_GRANTS];
+	unsigned int evtchn, irq;
+	struct work_struct work;
+	struct gnttab_free_callback callback;
+	struct list_head indirect_pages;
+	struct list_head grants;
+	unsigned int persistent_gnts_c;
+	unsigned long shadow_free;
+	struct blkfront_info *dev_info;
+	struct blk_shadow shadow[];
+};
+
+/*
+ * We have one of these per vbd, whether ide, scsi or 'other'.  They
+ * hang in private_data off the gendisk structure. We may end up
+ * putting all kinds of interesting stuff here :-)
+ */
+struct blkfront_info
+{
+	struct mutex mutex;
+	struct xenbus_device *xbdev;
+	struct gendisk *gd;
+	u16 sector_size;
+	unsigned int physical_sector_size;
+	unsigned long vdisk_info;
+	int vdevice;
+	blkif_vdev_t handle;
+	enum blkif_state connected;
+	/* Number of pages per ring buffer. */
+	unsigned int nr_ring_pages;
+	struct request_queue *rq;
+	unsigned int feature_flush:1;
+	unsigned int feature_fua:1;
+	unsigned int feature_discard:1;
+	unsigned int feature_secdiscard:1;
+	/* Connect-time cached feature_persistent parameter */
+	unsigned int feature_persistent_parm:1;
+	/* Persistent grants feature negotiation result */
+	unsigned int feature_persistent:1;
+	unsigned int bounce:1;
+	unsigned int discard_granularity;
+	unsigned int discard_alignment;
+	/* Number of 4KB segments handled */
+	unsigned int max_indirect_segments;
+	int is_ready;
+	struct blk_mq_tag_set tag_set;
+	struct blkfront_ring_info *rinfo;
+	unsigned int nr_rings;
+	unsigned int rinfo_size;
+	/* Save uncomplete reqs and bios for migration. */
+	struct list_head requests;
+	struct bio_list bio_list;
+	struct list_head info_list;
+};
+
+static unsigned int nr_minors;
+static unsigned long *minors;
+static DEFINE_SPINLOCK(minor_lock);
+
+#define PARTS_PER_DISK		16
+#define PARTS_PER_EXT_DISK      256
+
+#define BLKIF_MAJOR(dev) ((dev)>>8)
+#define BLKIF_MINOR(dev) ((dev) & 0xff)
+
+#define EXT_SHIFT 28
+#define EXTENDED (1<<EXT_SHIFT)
+#define VDEV_IS_EXTENDED(dev) ((dev)&(EXTENDED))
+#define BLKIF_MINOR_EXT(dev) ((dev)&(~EXTENDED))
+#define EMULATED_HD_DISK_MINOR_OFFSET (0)
+#define EMULATED_HD_DISK_NAME_OFFSET (EMULATED_HD_DISK_MINOR_OFFSET / 256)
+#define EMULATED_SD_DISK_MINOR_OFFSET (0)
+#define EMULATED_SD_DISK_NAME_OFFSET (EMULATED_SD_DISK_MINOR_OFFSET / 256)
+
+#define DEV_NAME	"xvd"	/* name in /dev */
+
+/*
+ * Grants are always the same size as a Xen page (i.e 4KB).
+ * A physical segment is always the same size as a Linux page.
+ * Number of grants per physical segment
+ */
+#define GRANTS_PER_PSEG	(PAGE_SIZE / XEN_PAGE_SIZE)
+
+#define GRANTS_PER_INDIRECT_FRAME \
+	(XEN_PAGE_SIZE / sizeof(struct blkif_request_segment))
+
+#define INDIRECT_GREFS(_grants)		\
+	DIV_ROUND_UP(_grants, GRANTS_PER_INDIRECT_FRAME)
+
+static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo);
+static void blkfront_gather_backend_features(struct blkfront_info *info);
+static int negotiate_mq(struct blkfront_info *info);
+
+#define for_each_rinfo(info, ptr, idx)				\
+	for ((ptr) = (info)->rinfo, (idx) = 0;			\
+	     (idx) < (info)->nr_rings;				\
+	     (idx)++, (ptr) = (void *)(ptr) + (info)->rinfo_size)
+
+static inline struct blkfront_ring_info *
+get_rinfo(const struct blkfront_info *info, unsigned int i)
+{
+	BUG_ON(i >= info->nr_rings);
+	return (void *)info->rinfo + i * info->rinfo_size;
+}
+
+static int get_id_from_freelist(struct blkfront_ring_info *rinfo)
+{
+	unsigned long free = rinfo->shadow_free;
+
+	BUG_ON(free >= BLK_RING_SIZE(rinfo->dev_info));
+	rinfo->shadow_free = rinfo->shadow[free].req.u.rw.id;
+	rinfo->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */
+	return free;
+}
+
+static int add_id_to_freelist(struct blkfront_ring_info *rinfo,
+			      unsigned long id)
+{
+	if (rinfo->shadow[id].req.u.rw.id != id)
+		return -EINVAL;
+	if (rinfo->shadow[id].request == NULL)
+		return -EINVAL;
+	rinfo->shadow[id].req.u.rw.id  = rinfo->shadow_free;
+	rinfo->shadow[id].request = NULL;
+	rinfo->shadow_free = id;
+	return 0;
+}
+
+static int fill_grant_buffer(struct blkfront_ring_info *rinfo, int num)
+{
+	struct blkfront_info *info = rinfo->dev_info;
+	struct page *granted_page;
+	struct grant *gnt_list_entry, *n;
+	int i = 0;
+
+	while (i < num) {
+		gnt_list_entry = kzalloc(sizeof(struct grant), GFP_NOIO);
+		if (!gnt_list_entry)
+			goto out_of_memory;
+
+		if (info->bounce) {
+			granted_page = alloc_page(GFP_NOIO | __GFP_ZERO);
+			if (!granted_page) {
+				kfree(gnt_list_entry);
+				goto out_of_memory;
+			}
+			gnt_list_entry->page = granted_page;
+		}
+
+		gnt_list_entry->gref = INVALID_GRANT_REF;
+		list_add(&gnt_list_entry->node, &rinfo->grants);
+		i++;
+	}
+
+	return 0;
+
+out_of_memory:
+	list_for_each_entry_safe(gnt_list_entry, n,
+	                         &rinfo->grants, node) {
+		list_del(&gnt_list_entry->node);
+		if (info->bounce)
+			__free_page(gnt_list_entry->page);
+		kfree(gnt_list_entry);
+		i--;
+	}
+	BUG_ON(i != 0);
+	return -ENOMEM;
+}
+
+static struct grant *get_free_grant(struct blkfront_ring_info *rinfo)
+{
+	struct grant *gnt_list_entry;
+
+	BUG_ON(list_empty(&rinfo->grants));
+	gnt_list_entry = list_first_entry(&rinfo->grants, struct grant,
+					  node);
+	list_del(&gnt_list_entry->node);
+
+	if (gnt_list_entry->gref != INVALID_GRANT_REF)
+		rinfo->persistent_gnts_c--;
+
+	return gnt_list_entry;
+}
+
+static inline void grant_foreign_access(const struct grant *gnt_list_entry,
+					const struct blkfront_info *info)
+{
+	gnttab_page_grant_foreign_access_ref_one(gnt_list_entry->gref,
+						 info->xbdev->otherend_id,
+						 gnt_list_entry->page,
+						 0);
+}
+
+static struct grant *get_grant(grant_ref_t *gref_head,
+			       unsigned long gfn,
+			       struct blkfront_ring_info *rinfo)
+{
+	struct grant *gnt_list_entry = get_free_grant(rinfo);
+	struct blkfront_info *info = rinfo->dev_info;
+
+	if (gnt_list_entry->gref != INVALID_GRANT_REF)
+		return gnt_list_entry;
+
+	/* Assign a gref to this page */
+	gnt_list_entry->gref = gnttab_claim_grant_reference(gref_head);
+	BUG_ON(gnt_list_entry->gref == -ENOSPC);
+	if (info->bounce)
+		grant_foreign_access(gnt_list_entry, info);
+	else {
+		/* Grant access to the GFN passed by the caller */
+		gnttab_grant_foreign_access_ref(gnt_list_entry->gref,
+						info->xbdev->otherend_id,
+						gfn, 0);
+	}
+
+	return gnt_list_entry;
+}
+
+static struct grant *get_indirect_grant(grant_ref_t *gref_head,
+					struct blkfront_ring_info *rinfo)
+{
+	struct grant *gnt_list_entry = get_free_grant(rinfo);
+	struct blkfront_info *info = rinfo->dev_info;
+
+	if (gnt_list_entry->gref != INVALID_GRANT_REF)
+		return gnt_list_entry;
+
+	/* Assign a gref to this page */
+	gnt_list_entry->gref = gnttab_claim_grant_reference(gref_head);
+	BUG_ON(gnt_list_entry->gref == -ENOSPC);
+	if (!info->bounce) {
+		struct page *indirect_page;
+
+		/* Fetch a pre-allocated page to use for indirect grefs */
+		BUG_ON(list_empty(&rinfo->indirect_pages));
+		indirect_page = list_first_entry(&rinfo->indirect_pages,
+						 struct page, lru);
+		list_del(&indirect_page->lru);
+		gnt_list_entry->page = indirect_page;
+	}
+	grant_foreign_access(gnt_list_entry, info);
+
+	return gnt_list_entry;
+}
+
+static const char *op_name(int op)
+{
+	static const char *const names[] = {
+		[BLKIF_OP_READ] = "read",
+		[BLKIF_OP_WRITE] = "write",
+		[BLKIF_OP_WRITE_BARRIER] = "barrier",
+		[BLKIF_OP_FLUSH_DISKCACHE] = "flush",
+		[BLKIF_OP_DISCARD] = "discard" };
+
+	if (op < 0 || op >= ARRAY_SIZE(names))
+		return "unknown";
+
+	if (!names[op])
+		return "reserved";
+
+	return names[op];
+}
+static int xlbd_reserve_minors(unsigned int minor, unsigned int nr)
+{
+	unsigned int end = minor + nr;
+	int rc;
+
+	if (end > nr_minors) {
+		unsigned long *bitmap, *old;
+
+		bitmap = kcalloc(BITS_TO_LONGS(end), sizeof(*bitmap),
+				 GFP_KERNEL);
+		if (bitmap == NULL)
+			return -ENOMEM;
+
+		spin_lock(&minor_lock);
+		if (end > nr_minors) {
+			old = minors;
+			memcpy(bitmap, minors,
+			       BITS_TO_LONGS(nr_minors) * sizeof(*bitmap));
+			minors = bitmap;
+			nr_minors = BITS_TO_LONGS(end) * BITS_PER_LONG;
+		} else
+			old = bitmap;
+		spin_unlock(&minor_lock);
+		kfree(old);
+	}
+
+	spin_lock(&minor_lock);
+	if (find_next_bit(minors, end, minor) >= end) {
+		bitmap_set(minors, minor, nr);
+		rc = 0;
+	} else
+		rc = -EBUSY;
+	spin_unlock(&minor_lock);
+
+	return rc;
+}
+
+static void xlbd_release_minors(unsigned int minor, unsigned int nr)
+{
+	unsigned int end = minor + nr;
+
+	BUG_ON(end > nr_minors);
+	spin_lock(&minor_lock);
+	bitmap_clear(minors,  minor, nr);
+	spin_unlock(&minor_lock);
+}
+
+static void blkif_restart_queue_callback(void *arg)
+{
+	struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)arg;
+	schedule_work(&rinfo->work);
+}
+
+static int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg)
+{
+	/* We don't have real geometry info, but let's at least return
+	   values consistent with the size of the device */
+	sector_t nsect = get_capacity(bd->bd_disk);
+	sector_t cylinders = nsect;
+
+	hg->heads = 0xff;
+	hg->sectors = 0x3f;
+	sector_div(cylinders, hg->heads * hg->sectors);
+	hg->cylinders = cylinders;
+	if ((sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors < nsect)
+		hg->cylinders = 0xffff;
+	return 0;
+}
+
+static int blkif_ioctl(struct block_device *bdev, fmode_t mode,
+		       unsigned command, unsigned long argument)
+{
+	struct blkfront_info *info = bdev->bd_disk->private_data;
+	int i;
+
+	switch (command) {
+	case CDROMMULTISESSION:
+		for (i = 0; i < sizeof(struct cdrom_multisession); i++)
+			if (put_user(0, (char __user *)(argument + i)))
+				return -EFAULT;
+		return 0;
+	case CDROM_GET_CAPABILITY:
+		if (!(info->vdisk_info & VDISK_CDROM))
+			return -EINVAL;
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
+static unsigned long blkif_ring_get_request(struct blkfront_ring_info *rinfo,
+					    struct request *req,
+					    struct blkif_request **ring_req)
+{
+	unsigned long id;
+
+	*ring_req = RING_GET_REQUEST(&rinfo->ring, rinfo->ring.req_prod_pvt);
+	rinfo->ring.req_prod_pvt++;
+
+	id = get_id_from_freelist(rinfo);
+	rinfo->shadow[id].request = req;
+	rinfo->shadow[id].status = REQ_PROCESSING;
+	rinfo->shadow[id].associated_id = NO_ASSOCIATED_ID;
+
+	rinfo->shadow[id].req.u.rw.id = id;
+
+	return id;
+}
+
+static int blkif_queue_discard_req(struct request *req, struct blkfront_ring_info *rinfo)
+{
+	struct blkfront_info *info = rinfo->dev_info;
+	struct blkif_request *ring_req, *final_ring_req;
+	unsigned long id;
+
+	/* Fill out a communications ring structure. */
+	id = blkif_ring_get_request(rinfo, req, &final_ring_req);
+	ring_req = &rinfo->shadow[id].req;
+
+	ring_req->operation = BLKIF_OP_DISCARD;
+	ring_req->u.discard.nr_sectors = blk_rq_sectors(req);
+	ring_req->u.discard.id = id;
+	ring_req->u.discard.sector_number = (blkif_sector_t)blk_rq_pos(req);
+	if (req_op(req) == REQ_OP_SECURE_ERASE && info->feature_secdiscard)
+		ring_req->u.discard.flag = BLKIF_DISCARD_SECURE;
+	else
+		ring_req->u.discard.flag = 0;
+
+	/* Copy the request to the ring page. */
+	*final_ring_req = *ring_req;
+	rinfo->shadow[id].status = REQ_WAITING;
+
+	return 0;
+}
+
+struct setup_rw_req {
+	unsigned int grant_idx;
+	struct blkif_request_segment *segments;
+	struct blkfront_ring_info *rinfo;
+	struct blkif_request *ring_req;
+	grant_ref_t gref_head;
+	unsigned int id;
+	/* Only used when persistent grant is used and it's a write request */
+	bool need_copy;
+	unsigned int bvec_off;
+	char *bvec_data;
+
+	bool require_extra_req;
+	struct blkif_request *extra_ring_req;
+};
+
+static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset,
+				     unsigned int len, void *data)
+{
+	struct setup_rw_req *setup = data;
+	int n, ref;
+	struct grant *gnt_list_entry;
+	unsigned int fsect, lsect;
+	/* Convenient aliases */
+	unsigned int grant_idx = setup->grant_idx;
+	struct blkif_request *ring_req = setup->ring_req;
+	struct blkfront_ring_info *rinfo = setup->rinfo;
+	/*
+	 * We always use the shadow of the first request to store the list
+	 * of grant associated to the block I/O request. This made the
+	 * completion more easy to handle even if the block I/O request is
+	 * split.
+	 */
+	struct blk_shadow *shadow = &rinfo->shadow[setup->id];
+
+	if (unlikely(setup->require_extra_req &&
+		     grant_idx >= BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
+		/*
+		 * We are using the second request, setup grant_idx
+		 * to be the index of the segment array.
+		 */
+		grant_idx -= BLKIF_MAX_SEGMENTS_PER_REQUEST;
+		ring_req = setup->extra_ring_req;
+	}
+
+	if ((ring_req->operation == BLKIF_OP_INDIRECT) &&
+	    (grant_idx % GRANTS_PER_INDIRECT_FRAME == 0)) {
+		if (setup->segments)
+			kunmap_atomic(setup->segments);
+
+		n = grant_idx / GRANTS_PER_INDIRECT_FRAME;
+		gnt_list_entry = get_indirect_grant(&setup->gref_head, rinfo);
+		shadow->indirect_grants[n] = gnt_list_entry;
+		setup->segments = kmap_atomic(gnt_list_entry->page);
+		ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref;
+	}
+
+	gnt_list_entry = get_grant(&setup->gref_head, gfn, rinfo);
+	ref = gnt_list_entry->gref;
+	/*
+	 * All the grants are stored in the shadow of the first
+	 * request. Therefore we have to use the global index.
+	 */
+	shadow->grants_used[setup->grant_idx] = gnt_list_entry;
+
+	if (setup->need_copy) {
+		void *shared_data;
+
+		shared_data = kmap_atomic(gnt_list_entry->page);
+		/*
+		 * this does not wipe data stored outside the
+		 * range sg->offset..sg->offset+sg->length.
+		 * Therefore, blkback *could* see data from
+		 * previous requests. This is OK as long as
+		 * persistent grants are shared with just one
+		 * domain. It may need refactoring if this
+		 * changes
+		 */
+		memcpy(shared_data + offset,
+		       setup->bvec_data + setup->bvec_off,
+		       len);
+
+		kunmap_atomic(shared_data);
+		setup->bvec_off += len;
+	}
+
+	fsect = offset >> 9;
+	lsect = fsect + (len >> 9) - 1;
+	if (ring_req->operation != BLKIF_OP_INDIRECT) {
+		ring_req->u.rw.seg[grant_idx] =
+			(struct blkif_request_segment) {
+				.gref       = ref,
+				.first_sect = fsect,
+				.last_sect  = lsect };
+	} else {
+		setup->segments[grant_idx % GRANTS_PER_INDIRECT_FRAME] =
+			(struct blkif_request_segment) {
+				.gref       = ref,
+				.first_sect = fsect,
+				.last_sect  = lsect };
+	}
+
+	(setup->grant_idx)++;
+}
+
+static void blkif_setup_extra_req(struct blkif_request *first,
+				  struct blkif_request *second)
+{
+	uint16_t nr_segments = first->u.rw.nr_segments;
+
+	/*
+	 * The second request is only present when the first request uses
+	 * all its segments. It's always the continuity of the first one.
+	 */
+	first->u.rw.nr_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST;
+
+	second->u.rw.nr_segments = nr_segments - BLKIF_MAX_SEGMENTS_PER_REQUEST;
+	second->u.rw.sector_number = first->u.rw.sector_number +
+		(BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE) / 512;
+
+	second->u.rw.handle = first->u.rw.handle;
+	second->operation = first->operation;
+}
+
+static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *rinfo)
+{
+	struct blkfront_info *info = rinfo->dev_info;
+	struct blkif_request *ring_req, *extra_ring_req = NULL;
+	struct blkif_request *final_ring_req, *final_extra_ring_req = NULL;
+	unsigned long id, extra_id = NO_ASSOCIATED_ID;
+	bool require_extra_req = false;
+	int i;
+	struct setup_rw_req setup = {
+		.grant_idx = 0,
+		.segments = NULL,
+		.rinfo = rinfo,
+		.need_copy = rq_data_dir(req) && info->bounce,
+	};
+
+	/*
+	 * Used to store if we are able to queue the request by just using
+	 * existing persistent grants, or if we have to get new grants,
+	 * as there are not sufficiently many free.
+	 */
+	bool new_persistent_gnts = false;
+	struct scatterlist *sg;
+	int num_sg, max_grefs, num_grant;
+
+	max_grefs = req->nr_phys_segments * GRANTS_PER_PSEG;
+	if (max_grefs > BLKIF_MAX_SEGMENTS_PER_REQUEST)
+		/*
+		 * If we are using indirect segments we need to account
+		 * for the indirect grefs used in the request.
+		 */
+		max_grefs += INDIRECT_GREFS(max_grefs);
+
+	/* Check if we have enough persistent grants to allocate a requests */
+	if (rinfo->persistent_gnts_c < max_grefs) {
+		new_persistent_gnts = true;
+
+		if (gnttab_alloc_grant_references(
+		    max_grefs - rinfo->persistent_gnts_c,
+		    &setup.gref_head) < 0) {
+			gnttab_request_free_callback(
+				&rinfo->callback,
+				blkif_restart_queue_callback,
+				rinfo,
+				max_grefs - rinfo->persistent_gnts_c);
+			return 1;
+		}
+	}
+
+	/* Fill out a communications ring structure. */
+	id = blkif_ring_get_request(rinfo, req, &final_ring_req);
+	ring_req = &rinfo->shadow[id].req;
+
+	num_sg = blk_rq_map_sg(req->q, req, rinfo->shadow[id].sg);
+	num_grant = 0;
+	/* Calculate the number of grant used */
+	for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i)
+	       num_grant += gnttab_count_grant(sg->offset, sg->length);
+
+	require_extra_req = info->max_indirect_segments == 0 &&
+		num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST;
+	BUG_ON(!HAS_EXTRA_REQ && require_extra_req);
+
+	rinfo->shadow[id].num_sg = num_sg;
+	if (num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST &&
+	    likely(!require_extra_req)) {
+		/*
+		 * The indirect operation can only be a BLKIF_OP_READ or
+		 * BLKIF_OP_WRITE
+		 */
+		BUG_ON(req_op(req) == REQ_OP_FLUSH || req->cmd_flags & REQ_FUA);
+		ring_req->operation = BLKIF_OP_INDIRECT;
+		ring_req->u.indirect.indirect_op = rq_data_dir(req) ?
+			BLKIF_OP_WRITE : BLKIF_OP_READ;
+		ring_req->u.indirect.sector_number = (blkif_sector_t)blk_rq_pos(req);
+		ring_req->u.indirect.handle = info->handle;
+		ring_req->u.indirect.nr_segments = num_grant;
+	} else {
+		ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req);
+		ring_req->u.rw.handle = info->handle;
+		ring_req->operation = rq_data_dir(req) ?
+			BLKIF_OP_WRITE : BLKIF_OP_READ;
+		if (req_op(req) == REQ_OP_FLUSH ||
+		    (req_op(req) == REQ_OP_WRITE && (req->cmd_flags & REQ_FUA))) {
+			/*
+			 * Ideally we can do an unordered flush-to-disk.
+			 * In case the backend onlysupports barriers, use that.
+			 * A barrier request a superset of FUA, so we can
+			 * implement it the same way.  (It's also a FLUSH+FUA,
+			 * since it is guaranteed ordered WRT previous writes.)
+			 */
+			if (info->feature_flush && info->feature_fua)
+				ring_req->operation =
+					BLKIF_OP_WRITE_BARRIER;
+			else if (info->feature_flush)
+				ring_req->operation =
+					BLKIF_OP_FLUSH_DISKCACHE;
+			else
+				ring_req->operation = 0;
+		}
+		ring_req->u.rw.nr_segments = num_grant;
+		if (unlikely(require_extra_req)) {
+			extra_id = blkif_ring_get_request(rinfo, req,
+							  &final_extra_ring_req);
+			extra_ring_req = &rinfo->shadow[extra_id].req;
+
+			/*
+			 * Only the first request contains the scatter-gather
+			 * list.
+			 */
+			rinfo->shadow[extra_id].num_sg = 0;
+
+			blkif_setup_extra_req(ring_req, extra_ring_req);
+
+			/* Link the 2 requests together */
+			rinfo->shadow[extra_id].associated_id = id;
+			rinfo->shadow[id].associated_id = extra_id;
+		}
+	}
+
+	setup.ring_req = ring_req;
+	setup.id = id;
+
+	setup.require_extra_req = require_extra_req;
+	if (unlikely(require_extra_req))
+		setup.extra_ring_req = extra_ring_req;
+
+	for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i) {
+		BUG_ON(sg->offset + sg->length > PAGE_SIZE);
+
+		if (setup.need_copy) {
+			setup.bvec_off = sg->offset;
+			setup.bvec_data = kmap_atomic(sg_page(sg));
+		}
+
+		gnttab_foreach_grant_in_range(sg_page(sg),
+					      sg->offset,
+					      sg->length,
+					      blkif_setup_rw_req_grant,
+					      &setup);
+
+		if (setup.need_copy)
+			kunmap_atomic(setup.bvec_data);
+	}
+	if (setup.segments)
+		kunmap_atomic(setup.segments);
+
+	/* Copy request(s) to the ring page. */
+	*final_ring_req = *ring_req;
+	rinfo->shadow[id].status = REQ_WAITING;
+	if (unlikely(require_extra_req)) {
+		*final_extra_ring_req = *extra_ring_req;
+		rinfo->shadow[extra_id].status = REQ_WAITING;
+	}
+
+	if (new_persistent_gnts)
+		gnttab_free_grant_references(setup.gref_head);
+
+	return 0;
+}
+
+/*
+ * Generate a Xen blkfront IO request from a blk layer request.  Reads
+ * and writes are handled as expected.
+ *
+ * @req: a request struct
+ */
+static int blkif_queue_request(struct request *req, struct blkfront_ring_info *rinfo)
+{
+	if (unlikely(rinfo->dev_info->connected != BLKIF_STATE_CONNECTED))
+		return 1;
+
+	if (unlikely(req_op(req) == REQ_OP_DISCARD ||
+		     req_op(req) == REQ_OP_SECURE_ERASE))
+		return blkif_queue_discard_req(req, rinfo);
+	else
+		return blkif_queue_rw_req(req, rinfo);
+}
+
+static inline void flush_requests(struct blkfront_ring_info *rinfo)
+{
+	int notify;
+
+	RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&rinfo->ring, notify);
+
+	if (notify)
+		notify_remote_via_irq(rinfo->irq);
+}
+
+static inline bool blkif_request_flush_invalid(struct request *req,
+					       struct blkfront_info *info)
+{
+	return (blk_rq_is_passthrough(req) ||
+		((req_op(req) == REQ_OP_FLUSH) &&
+		 !info->feature_flush) ||
+		((req->cmd_flags & REQ_FUA) &&
+		 !info->feature_fua));
+}
+
+static blk_status_t blkif_queue_rq(struct blk_mq_hw_ctx *hctx,
+			  const struct blk_mq_queue_data *qd)
+{
+	unsigned long flags;
+	int qid = hctx->queue_num;
+	struct blkfront_info *info = hctx->queue->queuedata;
+	struct blkfront_ring_info *rinfo = NULL;
+
+	rinfo = get_rinfo(info, qid);
+	blk_mq_start_request(qd->rq);
+	spin_lock_irqsave(&rinfo->ring_lock, flags);
+	if (RING_FULL(&rinfo->ring))
+		goto out_busy;
+
+	if (blkif_request_flush_invalid(qd->rq, rinfo->dev_info))
+		goto out_err;
+
+	if (blkif_queue_request(qd->rq, rinfo))
+		goto out_busy;
+
+	flush_requests(rinfo);
+	spin_unlock_irqrestore(&rinfo->ring_lock, flags);
+	return BLK_STS_OK;
+
+out_err:
+	spin_unlock_irqrestore(&rinfo->ring_lock, flags);
+	return BLK_STS_IOERR;
+
+out_busy:
+	blk_mq_stop_hw_queue(hctx);
+	spin_unlock_irqrestore(&rinfo->ring_lock, flags);
+	return BLK_STS_DEV_RESOURCE;
+}
+
+static void blkif_complete_rq(struct request *rq)
+{
+	blk_mq_end_request(rq, blkif_req(rq)->error);
+}
+
+static const struct blk_mq_ops blkfront_mq_ops = {
+	.queue_rq = blkif_queue_rq,
+	.complete = blkif_complete_rq,
+};
+
+static void blkif_set_queue_limits(struct blkfront_info *info)
+{
+	struct request_queue *rq = info->rq;
+	struct gendisk *gd = info->gd;
+	unsigned int segments = info->max_indirect_segments ? :
+				BLKIF_MAX_SEGMENTS_PER_REQUEST;
+
+	blk_queue_flag_set(QUEUE_FLAG_VIRT, rq);
+
+	if (info->feature_discard) {
+		blk_queue_max_discard_sectors(rq, get_capacity(gd));
+		rq->limits.discard_granularity = info->discard_granularity ?:
+						 info->physical_sector_size;
+		rq->limits.discard_alignment = info->discard_alignment;
+		if (info->feature_secdiscard)
+			blk_queue_max_secure_erase_sectors(rq,
+							   get_capacity(gd));
+	}
+
+	/* Hard sector size and max sectors impersonate the equiv. hardware. */
+	blk_queue_logical_block_size(rq, info->sector_size);
+	blk_queue_physical_block_size(rq, info->physical_sector_size);
+	blk_queue_max_hw_sectors(rq, (segments * XEN_PAGE_SIZE) / 512);
+
+	/* Each segment in a request is up to an aligned page in size. */
+	blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
+	blk_queue_max_segment_size(rq, PAGE_SIZE);
+
+	/* Ensure a merged request will fit in a single I/O ring slot. */
+	blk_queue_max_segments(rq, segments / GRANTS_PER_PSEG);
+
+	/* Make sure buffer addresses are sector-aligned. */
+	blk_queue_dma_alignment(rq, 511);
+}
+
+static const char *flush_info(struct blkfront_info *info)
+{
+	if (info->feature_flush && info->feature_fua)
+		return "barrier: enabled;";
+	else if (info->feature_flush)
+		return "flush diskcache: enabled;";
+	else
+		return "barrier or flush: disabled;";
+}
+
+static void xlvbd_flush(struct blkfront_info *info)
+{
+	blk_queue_write_cache(info->rq, info->feature_flush ? true : false,
+			      info->feature_fua ? true : false);
+	pr_info("blkfront: %s: %s %s %s %s %s %s %s\n",
+		info->gd->disk_name, flush_info(info),
+		"persistent grants:", info->feature_persistent ?
+		"enabled;" : "disabled;", "indirect descriptors:",
+		info->max_indirect_segments ? "enabled;" : "disabled;",
+		"bounce buffer:", info->bounce ? "enabled" : "disabled;");
+}
+
+static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset)
+{
+	int major;
+	major = BLKIF_MAJOR(vdevice);
+	*minor = BLKIF_MINOR(vdevice);
+	switch (major) {
+		case XEN_IDE0_MAJOR:
+			*offset = (*minor / 64) + EMULATED_HD_DISK_NAME_OFFSET;
+			*minor = ((*minor / 64) * PARTS_PER_DISK) +
+				EMULATED_HD_DISK_MINOR_OFFSET;
+			break;
+		case XEN_IDE1_MAJOR:
+			*offset = (*minor / 64) + 2 + EMULATED_HD_DISK_NAME_OFFSET;
+			*minor = (((*minor / 64) + 2) * PARTS_PER_DISK) +
+				EMULATED_HD_DISK_MINOR_OFFSET;
+			break;
+		case XEN_SCSI_DISK0_MAJOR:
+			*offset = (*minor / PARTS_PER_DISK) + EMULATED_SD_DISK_NAME_OFFSET;
+			*minor = *minor + EMULATED_SD_DISK_MINOR_OFFSET;
+			break;
+		case XEN_SCSI_DISK1_MAJOR:
+		case XEN_SCSI_DISK2_MAJOR:
+		case XEN_SCSI_DISK3_MAJOR:
+		case XEN_SCSI_DISK4_MAJOR:
+		case XEN_SCSI_DISK5_MAJOR:
+		case XEN_SCSI_DISK6_MAJOR:
+		case XEN_SCSI_DISK7_MAJOR:
+			*offset = (*minor / PARTS_PER_DISK) + 
+				((major - XEN_SCSI_DISK1_MAJOR + 1) * 16) +
+				EMULATED_SD_DISK_NAME_OFFSET;
+			*minor = *minor +
+				((major - XEN_SCSI_DISK1_MAJOR + 1) * 16 * PARTS_PER_DISK) +
+				EMULATED_SD_DISK_MINOR_OFFSET;
+			break;
+		case XEN_SCSI_DISK8_MAJOR:
+		case XEN_SCSI_DISK9_MAJOR:
+		case XEN_SCSI_DISK10_MAJOR:
+		case XEN_SCSI_DISK11_MAJOR:
+		case XEN_SCSI_DISK12_MAJOR:
+		case XEN_SCSI_DISK13_MAJOR:
+		case XEN_SCSI_DISK14_MAJOR:
+		case XEN_SCSI_DISK15_MAJOR:
+			*offset = (*minor / PARTS_PER_DISK) + 
+				((major - XEN_SCSI_DISK8_MAJOR + 8) * 16) +
+				EMULATED_SD_DISK_NAME_OFFSET;
+			*minor = *minor +
+				((major - XEN_SCSI_DISK8_MAJOR + 8) * 16 * PARTS_PER_DISK) +
+				EMULATED_SD_DISK_MINOR_OFFSET;
+			break;
+		case XENVBD_MAJOR:
+			*offset = *minor / PARTS_PER_DISK;
+			break;
+		default:
+			printk(KERN_WARNING "blkfront: your disk configuration is "
+					"incorrect, please use an xvd device instead\n");
+			return -ENODEV;
+	}
+	return 0;
+}
+
+static char *encode_disk_name(char *ptr, unsigned int n)
+{
+	if (n >= 26)
+		ptr = encode_disk_name(ptr, n / 26 - 1);
+	*ptr = 'a' + n % 26;
+	return ptr + 1;
+}
+
+static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
+		struct blkfront_info *info, u16 sector_size,
+		unsigned int physical_sector_size)
+{
+	struct gendisk *gd;
+	int nr_minors = 1;
+	int err;
+	unsigned int offset;
+	int minor;
+	int nr_parts;
+	char *ptr;
+
+	BUG_ON(info->gd != NULL);
+	BUG_ON(info->rq != NULL);
+
+	if ((info->vdevice>>EXT_SHIFT) > 1) {
+		/* this is above the extended range; something is wrong */
+		printk(KERN_WARNING "blkfront: vdevice 0x%x is above the extended range; ignoring\n", info->vdevice);
+		return -ENODEV;
+	}
+
+	if (!VDEV_IS_EXTENDED(info->vdevice)) {
+		err = xen_translate_vdev(info->vdevice, &minor, &offset);
+		if (err)
+			return err;
+		nr_parts = PARTS_PER_DISK;
+	} else {
+		minor = BLKIF_MINOR_EXT(info->vdevice);
+		nr_parts = PARTS_PER_EXT_DISK;
+		offset = minor / nr_parts;
+		if (xen_hvm_domain() && offset < EMULATED_HD_DISK_NAME_OFFSET + 4)
+			printk(KERN_WARNING "blkfront: vdevice 0x%x might conflict with "
+					"emulated IDE disks,\n\t choose an xvd device name"
+					"from xvde on\n", info->vdevice);
+	}
+	if (minor >> MINORBITS) {
+		pr_warn("blkfront: %#x's minor (%#x) out of range; ignoring\n",
+			info->vdevice, minor);
+		return -ENODEV;
+	}
+
+	if ((minor % nr_parts) == 0)
+		nr_minors = nr_parts;
+
+	err = xlbd_reserve_minors(minor, nr_minors);
+	if (err)
+		return err;
+
+	memset(&info->tag_set, 0, sizeof(info->tag_set));
+	info->tag_set.ops = &blkfront_mq_ops;
+	info->tag_set.nr_hw_queues = info->nr_rings;
+	if (HAS_EXTRA_REQ && info->max_indirect_segments == 0) {
+		/*
+		 * When indirect descriptior is not supported, the I/O request
+		 * will be split between multiple request in the ring.
+		 * To avoid problems when sending the request, divide by
+		 * 2 the depth of the queue.
+		 */
+		info->tag_set.queue_depth =  BLK_RING_SIZE(info) / 2;
+	} else
+		info->tag_set.queue_depth = BLK_RING_SIZE(info);
+	info->tag_set.numa_node = NUMA_NO_NODE;
+	info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
+	info->tag_set.cmd_size = sizeof(struct blkif_req);
+	info->tag_set.driver_data = info;
+
+	err = blk_mq_alloc_tag_set(&info->tag_set);
+	if (err)
+		goto out_release_minors;
+
+	gd = blk_mq_alloc_disk(&info->tag_set, info);
+	if (IS_ERR(gd)) {
+		err = PTR_ERR(gd);
+		goto out_free_tag_set;
+	}
+
+	strcpy(gd->disk_name, DEV_NAME);
+	ptr = encode_disk_name(gd->disk_name + sizeof(DEV_NAME) - 1, offset);
+	BUG_ON(ptr >= gd->disk_name + DISK_NAME_LEN);
+	if (nr_minors > 1)
+		*ptr = 0;
+	else
+		snprintf(ptr, gd->disk_name + DISK_NAME_LEN - ptr,
+			 "%d", minor & (nr_parts - 1));
+
+	gd->major = XENVBD_MAJOR;
+	gd->first_minor = minor;
+	gd->minors = nr_minors;
+	gd->fops = &xlvbd_block_fops;
+	gd->private_data = info;
+	set_capacity(gd, capacity);
+
+	info->rq = gd->queue;
+	info->gd = gd;
+	info->sector_size = sector_size;
+	info->physical_sector_size = physical_sector_size;
+	blkif_set_queue_limits(info);
+
+	xlvbd_flush(info);
+
+	if (info->vdisk_info & VDISK_READONLY)
+		set_disk_ro(gd, 1);
+	if (info->vdisk_info & VDISK_REMOVABLE)
+		gd->flags |= GENHD_FL_REMOVABLE;
+
+	return 0;
+
+out_free_tag_set:
+	blk_mq_free_tag_set(&info->tag_set);
+out_release_minors:
+	xlbd_release_minors(minor, nr_minors);
+	return err;
+}
+
+/* Already hold rinfo->ring_lock. */
+static inline void kick_pending_request_queues_locked(struct blkfront_ring_info *rinfo)
+{
+	if (!RING_FULL(&rinfo->ring))
+		blk_mq_start_stopped_hw_queues(rinfo->dev_info->rq, true);
+}
+
+static void kick_pending_request_queues(struct blkfront_ring_info *rinfo)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&rinfo->ring_lock, flags);
+	kick_pending_request_queues_locked(rinfo);
+	spin_unlock_irqrestore(&rinfo->ring_lock, flags);
+}
+
+static void blkif_restart_queue(struct work_struct *work)
+{
+	struct blkfront_ring_info *rinfo = container_of(work, struct blkfront_ring_info, work);
+
+	if (rinfo->dev_info->connected == BLKIF_STATE_CONNECTED)
+		kick_pending_request_queues(rinfo);
+}
+
+static void blkif_free_ring(struct blkfront_ring_info *rinfo)
+{
+	struct grant *persistent_gnt, *n;
+	struct blkfront_info *info = rinfo->dev_info;
+	int i, j, segs;
+
+	/*
+	 * Remove indirect pages, this only happens when using indirect
+	 * descriptors but not persistent grants
+	 */
+	if (!list_empty(&rinfo->indirect_pages)) {
+		struct page *indirect_page, *n;
+
+		BUG_ON(info->bounce);
+		list_for_each_entry_safe(indirect_page, n, &rinfo->indirect_pages, lru) {
+			list_del(&indirect_page->lru);
+			__free_page(indirect_page);
+		}
+	}
+
+	/* Remove all persistent grants. */
+	if (!list_empty(&rinfo->grants)) {
+		list_for_each_entry_safe(persistent_gnt, n,
+					 &rinfo->grants, node) {
+			list_del(&persistent_gnt->node);
+			if (persistent_gnt->gref != INVALID_GRANT_REF) {
+				gnttab_end_foreign_access(persistent_gnt->gref,
+							  NULL);
+				rinfo->persistent_gnts_c--;
+			}
+			if (info->bounce)
+				__free_page(persistent_gnt->page);
+			kfree(persistent_gnt);
+		}
+	}
+	BUG_ON(rinfo->persistent_gnts_c != 0);
+
+	for (i = 0; i < BLK_RING_SIZE(info); i++) {
+		/*
+		 * Clear persistent grants present in requests already
+		 * on the shared ring
+		 */
+		if (!rinfo->shadow[i].request)
+			goto free_shadow;
+
+		segs = rinfo->shadow[i].req.operation == BLKIF_OP_INDIRECT ?
+		       rinfo->shadow[i].req.u.indirect.nr_segments :
+		       rinfo->shadow[i].req.u.rw.nr_segments;
+		for (j = 0; j < segs; j++) {
+			persistent_gnt = rinfo->shadow[i].grants_used[j];
+			gnttab_end_foreign_access(persistent_gnt->gref, NULL);
+			if (info->bounce)
+				__free_page(persistent_gnt->page);
+			kfree(persistent_gnt);
+		}
+
+		if (rinfo->shadow[i].req.operation != BLKIF_OP_INDIRECT)
+			/*
+			 * If this is not an indirect operation don't try to
+			 * free indirect segments
+			 */
+			goto free_shadow;
+
+		for (j = 0; j < INDIRECT_GREFS(segs); j++) {
+			persistent_gnt = rinfo->shadow[i].indirect_grants[j];
+			gnttab_end_foreign_access(persistent_gnt->gref, NULL);
+			__free_page(persistent_gnt->page);
+			kfree(persistent_gnt);
+		}
+
+free_shadow:
+		kvfree(rinfo->shadow[i].grants_used);
+		rinfo->shadow[i].grants_used = NULL;
+		kvfree(rinfo->shadow[i].indirect_grants);
+		rinfo->shadow[i].indirect_grants = NULL;
+		kvfree(rinfo->shadow[i].sg);
+		rinfo->shadow[i].sg = NULL;
+	}
+
+	/* No more gnttab callback work. */
+	gnttab_cancel_free_callback(&rinfo->callback);
+
+	/* Flush gnttab callback work. Must be done with no locks held. */
+	flush_work(&rinfo->work);
+
+	/* Free resources associated with old device channel. */
+	xenbus_teardown_ring((void **)&rinfo->ring.sring, info->nr_ring_pages,
+			     rinfo->ring_ref);
+
+	if (rinfo->irq)
+		unbind_from_irqhandler(rinfo->irq, rinfo);
+	rinfo->evtchn = rinfo->irq = 0;
+}
+
+static void blkif_free(struct blkfront_info *info, int suspend)
+{
+	unsigned int i;
+	struct blkfront_ring_info *rinfo;
+
+	/* Prevent new requests being issued until we fix things up. */
+	info->connected = suspend ?
+		BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED;
+	/* No more blkif_request(). */
+	if (info->rq)
+		blk_mq_stop_hw_queues(info->rq);
+
+	for_each_rinfo(info, rinfo, i)
+		blkif_free_ring(rinfo);
+
+	kvfree(info->rinfo);
+	info->rinfo = NULL;
+	info->nr_rings = 0;
+}
+
+struct copy_from_grant {
+	const struct blk_shadow *s;
+	unsigned int grant_idx;
+	unsigned int bvec_offset;
+	char *bvec_data;
+};
+
+static void blkif_copy_from_grant(unsigned long gfn, unsigned int offset,
+				  unsigned int len, void *data)
+{
+	struct copy_from_grant *info = data;
+	char *shared_data;
+	/* Convenient aliases */
+	const struct blk_shadow *s = info->s;
+
+	shared_data = kmap_atomic(s->grants_used[info->grant_idx]->page);
+
+	memcpy(info->bvec_data + info->bvec_offset,
+	       shared_data + offset, len);
+
+	info->bvec_offset += len;
+	info->grant_idx++;
+
+	kunmap_atomic(shared_data);
+}
+
+static enum blk_req_status blkif_rsp_to_req_status(int rsp)
+{
+	switch (rsp)
+	{
+	case BLKIF_RSP_OKAY:
+		return REQ_DONE;
+	case BLKIF_RSP_EOPNOTSUPP:
+		return REQ_EOPNOTSUPP;
+	case BLKIF_RSP_ERROR:
+	default:
+		return REQ_ERROR;
+	}
+}
+
+/*
+ * Get the final status of the block request based on two ring response
+ */
+static int blkif_get_final_status(enum blk_req_status s1,
+				  enum blk_req_status s2)
+{
+	BUG_ON(s1 < REQ_DONE);
+	BUG_ON(s2 < REQ_DONE);
+
+	if (s1 == REQ_ERROR || s2 == REQ_ERROR)
+		return BLKIF_RSP_ERROR;
+	else if (s1 == REQ_EOPNOTSUPP || s2 == REQ_EOPNOTSUPP)
+		return BLKIF_RSP_EOPNOTSUPP;
+	return BLKIF_RSP_OKAY;
+}
+
+/*
+ * Return values:
+ *  1 response processed.
+ *  0 missing further responses.
+ * -1 error while processing.
+ */
+static int blkif_completion(unsigned long *id,
+			    struct blkfront_ring_info *rinfo,
+			    struct blkif_response *bret)
+{
+	int i = 0;
+	struct scatterlist *sg;
+	int num_sg, num_grant;
+	struct blkfront_info *info = rinfo->dev_info;
+	struct blk_shadow *s = &rinfo->shadow[*id];
+	struct copy_from_grant data = {
+		.grant_idx = 0,
+	};
+
+	num_grant = s->req.operation == BLKIF_OP_INDIRECT ?
+		s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments;
+
+	/* The I/O request may be split in two. */
+	if (unlikely(s->associated_id != NO_ASSOCIATED_ID)) {
+		struct blk_shadow *s2 = &rinfo->shadow[s->associated_id];
+
+		/* Keep the status of the current response in shadow. */
+		s->status = blkif_rsp_to_req_status(bret->status);
+
+		/* Wait the second response if not yet here. */
+		if (s2->status < REQ_DONE)
+			return 0;
+
+		bret->status = blkif_get_final_status(s->status,
+						      s2->status);
+
+		/*
+		 * All the grants is stored in the first shadow in order
+		 * to make the completion code simpler.
+		 */
+		num_grant += s2->req.u.rw.nr_segments;
+
+		/*
+		 * The two responses may not come in order. Only the
+		 * first request will store the scatter-gather list.
+		 */
+		if (s2->num_sg != 0) {
+			/* Update "id" with the ID of the first response. */
+			*id = s->associated_id;
+			s = s2;
+		}
+
+		/*
+		 * We don't need anymore the second request, so recycling
+		 * it now.
+		 */
+		if (add_id_to_freelist(rinfo, s->associated_id))
+			WARN(1, "%s: can't recycle the second part (id = %ld) of the request\n",
+			     info->gd->disk_name, s->associated_id);
+	}
+
+	data.s = s;
+	num_sg = s->num_sg;
+
+	if (bret->operation == BLKIF_OP_READ && info->bounce) {
+		for_each_sg(s->sg, sg, num_sg, i) {
+			BUG_ON(sg->offset + sg->length > PAGE_SIZE);
+
+			data.bvec_offset = sg->offset;
+			data.bvec_data = kmap_atomic(sg_page(sg));
+
+			gnttab_foreach_grant_in_range(sg_page(sg),
+						      sg->offset,
+						      sg->length,
+						      blkif_copy_from_grant,
+						      &data);
+
+			kunmap_atomic(data.bvec_data);
+		}
+	}
+	/* Add the persistent grant into the list of free grants */
+	for (i = 0; i < num_grant; i++) {
+		if (!gnttab_try_end_foreign_access(s->grants_used[i]->gref)) {
+			/*
+			 * If the grant is still mapped by the backend (the
+			 * backend has chosen to make this grant persistent)
+			 * we add it at the head of the list, so it will be
+			 * reused first.
+			 */
+			if (!info->feature_persistent) {
+				pr_alert("backed has not unmapped grant: %u\n",
+					 s->grants_used[i]->gref);
+				return -1;
+			}
+			list_add(&s->grants_used[i]->node, &rinfo->grants);
+			rinfo->persistent_gnts_c++;
+		} else {
+			/*
+			 * If the grant is not mapped by the backend we add it
+			 * to the tail of the list, so it will not be picked
+			 * again unless we run out of persistent grants.
+			 */
+			s->grants_used[i]->gref = INVALID_GRANT_REF;
+			list_add_tail(&s->grants_used[i]->node, &rinfo->grants);
+		}
+	}
+	if (s->req.operation == BLKIF_OP_INDIRECT) {
+		for (i = 0; i < INDIRECT_GREFS(num_grant); i++) {
+			if (!gnttab_try_end_foreign_access(s->indirect_grants[i]->gref)) {
+				if (!info->feature_persistent) {
+					pr_alert("backed has not unmapped grant: %u\n",
+						 s->indirect_grants[i]->gref);
+					return -1;
+				}
+				list_add(&s->indirect_grants[i]->node, &rinfo->grants);
+				rinfo->persistent_gnts_c++;
+			} else {
+				struct page *indirect_page;
+
+				/*
+				 * Add the used indirect page back to the list of
+				 * available pages for indirect grefs.
+				 */
+				if (!info->bounce) {
+					indirect_page = s->indirect_grants[i]->page;
+					list_add(&indirect_page->lru, &rinfo->indirect_pages);
+				}
+				s->indirect_grants[i]->gref = INVALID_GRANT_REF;
+				list_add_tail(&s->indirect_grants[i]->node, &rinfo->grants);
+			}
+		}
+	}
+
+	return 1;
+}
+
+static irqreturn_t blkif_interrupt(int irq, void *dev_id)
+{
+	struct request *req;
+	struct blkif_response bret;
+	RING_IDX i, rp;
+	unsigned long flags;
+	struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)dev_id;
+	struct blkfront_info *info = rinfo->dev_info;
+	unsigned int eoiflag = XEN_EOI_FLAG_SPURIOUS;
+
+	if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) {
+		xen_irq_lateeoi(irq, XEN_EOI_FLAG_SPURIOUS);
+		return IRQ_HANDLED;
+	}
+
+	spin_lock_irqsave(&rinfo->ring_lock, flags);
+ again:
+	rp = READ_ONCE(rinfo->ring.sring->rsp_prod);
+	virt_rmb(); /* Ensure we see queued responses up to 'rp'. */
+	if (RING_RESPONSE_PROD_OVERFLOW(&rinfo->ring, rp)) {
+		pr_alert("%s: illegal number of responses %u\n",
+			 info->gd->disk_name, rp - rinfo->ring.rsp_cons);
+		goto err;
+	}
+
+	for (i = rinfo->ring.rsp_cons; i != rp; i++) {
+		unsigned long id;
+		unsigned int op;
+
+		eoiflag = 0;
+
+		RING_COPY_RESPONSE(&rinfo->ring, i, &bret);
+		id = bret.id;
+
+		/*
+		 * The backend has messed up and given us an id that we would
+		 * never have given to it (we stamp it up to BLK_RING_SIZE -
+		 * look in get_id_from_freelist.
+		 */
+		if (id >= BLK_RING_SIZE(info)) {
+			pr_alert("%s: response has incorrect id (%ld)\n",
+				 info->gd->disk_name, id);
+			goto err;
+		}
+		if (rinfo->shadow[id].status != REQ_WAITING) {
+			pr_alert("%s: response references no pending request\n",
+				 info->gd->disk_name);
+			goto err;
+		}
+
+		rinfo->shadow[id].status = REQ_PROCESSING;
+		req  = rinfo->shadow[id].request;
+
+		op = rinfo->shadow[id].req.operation;
+		if (op == BLKIF_OP_INDIRECT)
+			op = rinfo->shadow[id].req.u.indirect.indirect_op;
+		if (bret.operation != op) {
+			pr_alert("%s: response has wrong operation (%u instead of %u)\n",
+				 info->gd->disk_name, bret.operation, op);
+			goto err;
+		}
+
+		if (bret.operation != BLKIF_OP_DISCARD) {
+			int ret;
+
+			/*
+			 * We may need to wait for an extra response if the
+			 * I/O request is split in 2
+			 */
+			ret = blkif_completion(&id, rinfo, &bret);
+			if (!ret)
+				continue;
+			if (unlikely(ret < 0))
+				goto err;
+		}
+
+		if (add_id_to_freelist(rinfo, id)) {
+			WARN(1, "%s: response to %s (id %ld) couldn't be recycled!\n",
+			     info->gd->disk_name, op_name(bret.operation), id);
+			continue;
+		}
+
+		if (bret.status == BLKIF_RSP_OKAY)
+			blkif_req(req)->error = BLK_STS_OK;
+		else
+			blkif_req(req)->error = BLK_STS_IOERR;
+
+		switch (bret.operation) {
+		case BLKIF_OP_DISCARD:
+			if (unlikely(bret.status == BLKIF_RSP_EOPNOTSUPP)) {
+				struct request_queue *rq = info->rq;
+
+				pr_warn_ratelimited("blkfront: %s: %s op failed\n",
+					   info->gd->disk_name, op_name(bret.operation));
+				blkif_req(req)->error = BLK_STS_NOTSUPP;
+				info->feature_discard = 0;
+				info->feature_secdiscard = 0;
+				blk_queue_max_discard_sectors(rq, 0);
+				blk_queue_max_secure_erase_sectors(rq, 0);
+			}
+			break;
+		case BLKIF_OP_FLUSH_DISKCACHE:
+		case BLKIF_OP_WRITE_BARRIER:
+			if (unlikely(bret.status == BLKIF_RSP_EOPNOTSUPP)) {
+				pr_warn_ratelimited("blkfront: %s: %s op failed\n",
+				       info->gd->disk_name, op_name(bret.operation));
+				blkif_req(req)->error = BLK_STS_NOTSUPP;
+			}
+			if (unlikely(bret.status == BLKIF_RSP_ERROR &&
+				     rinfo->shadow[id].req.u.rw.nr_segments == 0)) {
+				pr_warn_ratelimited("blkfront: %s: empty %s op failed\n",
+				       info->gd->disk_name, op_name(bret.operation));
+				blkif_req(req)->error = BLK_STS_NOTSUPP;
+			}
+			if (unlikely(blkif_req(req)->error)) {
+				if (blkif_req(req)->error == BLK_STS_NOTSUPP)
+					blkif_req(req)->error = BLK_STS_OK;
+				info->feature_fua = 0;
+				info->feature_flush = 0;
+				xlvbd_flush(info);
+			}
+			fallthrough;
+		case BLKIF_OP_READ:
+		case BLKIF_OP_WRITE:
+			if (unlikely(bret.status != BLKIF_RSP_OKAY))
+				dev_dbg_ratelimited(&info->xbdev->dev,
+					"Bad return from blkdev data request: %#x\n",
+					bret.status);
+
+			break;
+		default:
+			BUG();
+		}
+
+		if (likely(!blk_should_fake_timeout(req->q)))
+			blk_mq_complete_request(req);
+	}
+
+	rinfo->ring.rsp_cons = i;
+
+	if (i != rinfo->ring.req_prod_pvt) {
+		int more_to_do;
+		RING_FINAL_CHECK_FOR_RESPONSES(&rinfo->ring, more_to_do);
+		if (more_to_do)
+			goto again;
+	} else
+		rinfo->ring.sring->rsp_event = i + 1;
+
+	kick_pending_request_queues_locked(rinfo);
+
+	spin_unlock_irqrestore(&rinfo->ring_lock, flags);
+
+	xen_irq_lateeoi(irq, eoiflag);
+
+	return IRQ_HANDLED;
+
+ err:
+	info->connected = BLKIF_STATE_ERROR;
+
+	spin_unlock_irqrestore(&rinfo->ring_lock, flags);
+
+	/* No EOI in order to avoid further interrupts. */
+
+	pr_alert("%s disabled for further use\n", info->gd->disk_name);
+	return IRQ_HANDLED;
+}
+
+
+static int setup_blkring(struct xenbus_device *dev,
+			 struct blkfront_ring_info *rinfo)
+{
+	struct blkif_sring *sring;
+	int err;
+	struct blkfront_info *info = rinfo->dev_info;
+	unsigned long ring_size = info->nr_ring_pages * XEN_PAGE_SIZE;
+
+	err = xenbus_setup_ring(dev, GFP_NOIO, (void **)&sring,
+				info->nr_ring_pages, rinfo->ring_ref);
+	if (err)
+		goto fail;
+
+	XEN_FRONT_RING_INIT(&rinfo->ring, sring, ring_size);
+
+	err = xenbus_alloc_evtchn(dev, &rinfo->evtchn);
+	if (err)
+		goto fail;
+
+	err = bind_evtchn_to_irqhandler_lateeoi(rinfo->evtchn, blkif_interrupt,
+						0, "blkif", rinfo);
+	if (err <= 0) {
+		xenbus_dev_fatal(dev, err,
+				 "bind_evtchn_to_irqhandler failed");
+		goto fail;
+	}
+	rinfo->irq = err;
+
+	return 0;
+fail:
+	blkif_free(info, 0);
+	return err;
+}
+
+/*
+ * Write out per-ring/queue nodes including ring-ref and event-channel, and each
+ * ring buffer may have multi pages depending on ->nr_ring_pages.
+ */
+static int write_per_ring_nodes(struct xenbus_transaction xbt,
+				struct blkfront_ring_info *rinfo, const char *dir)
+{
+	int err;
+	unsigned int i;
+	const char *message = NULL;
+	struct blkfront_info *info = rinfo->dev_info;
+
+	if (info->nr_ring_pages == 1) {
+		err = xenbus_printf(xbt, dir, "ring-ref", "%u", rinfo->ring_ref[0]);
+		if (err) {
+			message = "writing ring-ref";
+			goto abort_transaction;
+		}
+	} else {
+		for (i = 0; i < info->nr_ring_pages; i++) {
+			char ring_ref_name[RINGREF_NAME_LEN];
+
+			snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i);
+			err = xenbus_printf(xbt, dir, ring_ref_name,
+					    "%u", rinfo->ring_ref[i]);
+			if (err) {
+				message = "writing ring-ref";
+				goto abort_transaction;
+			}
+		}
+	}
+
+	err = xenbus_printf(xbt, dir, "event-channel", "%u", rinfo->evtchn);
+	if (err) {
+		message = "writing event-channel";
+		goto abort_transaction;
+	}
+
+	return 0;
+
+abort_transaction:
+	xenbus_transaction_end(xbt, 1);
+	if (message)
+		xenbus_dev_fatal(info->xbdev, err, "%s", message);
+
+	return err;
+}
+
+/* Enable the persistent grants feature. */
+static bool feature_persistent = true;
+module_param(feature_persistent, bool, 0644);
+MODULE_PARM_DESC(feature_persistent,
+		"Enables the persistent grants feature");
+
+/* Common code used when first setting up, and when resuming. */
+static int talk_to_blkback(struct xenbus_device *dev,
+			   struct blkfront_info *info)
+{
+	const char *message = NULL;
+	struct xenbus_transaction xbt;
+	int err;
+	unsigned int i, max_page_order;
+	unsigned int ring_page_order;
+	struct blkfront_ring_info *rinfo;
+
+	if (!info)
+		return -ENODEV;
+
+	/* Check if backend is trusted. */
+	info->bounce = !xen_blkif_trusted ||
+		       !xenbus_read_unsigned(dev->nodename, "trusted", 1);
+
+	max_page_order = xenbus_read_unsigned(info->xbdev->otherend,
+					      "max-ring-page-order", 0);
+	ring_page_order = min(xen_blkif_max_ring_order, max_page_order);
+	info->nr_ring_pages = 1 << ring_page_order;
+
+	err = negotiate_mq(info);
+	if (err)
+		goto destroy_blkring;
+
+	for_each_rinfo(info, rinfo, i) {
+		/* Create shared ring, alloc event channel. */
+		err = setup_blkring(dev, rinfo);
+		if (err)
+			goto destroy_blkring;
+	}
+
+again:
+	err = xenbus_transaction_start(&xbt);
+	if (err) {
+		xenbus_dev_fatal(dev, err, "starting transaction");
+		goto destroy_blkring;
+	}
+
+	if (info->nr_ring_pages > 1) {
+		err = xenbus_printf(xbt, dev->nodename, "ring-page-order", "%u",
+				    ring_page_order);
+		if (err) {
+			message = "writing ring-page-order";
+			goto abort_transaction;
+		}
+	}
+
+	/* We already got the number of queues/rings in _probe */
+	if (info->nr_rings == 1) {
+		err = write_per_ring_nodes(xbt, info->rinfo, dev->nodename);
+		if (err)
+			goto destroy_blkring;
+	} else {
+		char *path;
+		size_t pathsize;
+
+		err = xenbus_printf(xbt, dev->nodename, "multi-queue-num-queues", "%u",
+				    info->nr_rings);
+		if (err) {
+			message = "writing multi-queue-num-queues";
+			goto abort_transaction;
+		}
+
+		pathsize = strlen(dev->nodename) + QUEUE_NAME_LEN;
+		path = kmalloc(pathsize, GFP_KERNEL);
+		if (!path) {
+			err = -ENOMEM;
+			message = "ENOMEM while writing ring references";
+			goto abort_transaction;
+		}
+
+		for_each_rinfo(info, rinfo, i) {
+			memset(path, 0, pathsize);
+			snprintf(path, pathsize, "%s/queue-%u", dev->nodename, i);
+			err = write_per_ring_nodes(xbt, rinfo, path);
+			if (err) {
+				kfree(path);
+				goto destroy_blkring;
+			}
+		}
+		kfree(path);
+	}
+	err = xenbus_printf(xbt, dev->nodename, "protocol", "%s",
+			    XEN_IO_PROTO_ABI_NATIVE);
+	if (err) {
+		message = "writing protocol";
+		goto abort_transaction;
+	}
+	info->feature_persistent_parm = feature_persistent;
+	err = xenbus_printf(xbt, dev->nodename, "feature-persistent", "%u",
+			info->feature_persistent_parm);
+	if (err)
+		dev_warn(&dev->dev,
+			 "writing persistent grants feature to xenbus");
+
+	err = xenbus_transaction_end(xbt, 0);
+	if (err) {
+		if (err == -EAGAIN)
+			goto again;
+		xenbus_dev_fatal(dev, err, "completing transaction");
+		goto destroy_blkring;
+	}
+
+	for_each_rinfo(info, rinfo, i) {
+		unsigned int j;
+
+		for (j = 0; j < BLK_RING_SIZE(info); j++)
+			rinfo->shadow[j].req.u.rw.id = j + 1;
+		rinfo->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff;
+	}
+	xenbus_switch_state(dev, XenbusStateInitialised);
+
+	return 0;
+
+ abort_transaction:
+	xenbus_transaction_end(xbt, 1);
+	if (message)
+		xenbus_dev_fatal(dev, err, "%s", message);
+ destroy_blkring:
+	blkif_free(info, 0);
+	return err;
+}
+
+static int negotiate_mq(struct blkfront_info *info)
+{
+	unsigned int backend_max_queues;
+	unsigned int i;
+	struct blkfront_ring_info *rinfo;
+
+	BUG_ON(info->nr_rings);
+
+	/* Check if backend supports multiple queues. */
+	backend_max_queues = xenbus_read_unsigned(info->xbdev->otherend,
+						  "multi-queue-max-queues", 1);
+	info->nr_rings = min(backend_max_queues, xen_blkif_max_queues);
+	/* We need at least one ring. */
+	if (!info->nr_rings)
+		info->nr_rings = 1;
+
+	info->rinfo_size = struct_size(info->rinfo, shadow,
+				       BLK_RING_SIZE(info));
+	info->rinfo = kvcalloc(info->nr_rings, info->rinfo_size, GFP_KERNEL);
+	if (!info->rinfo) {
+		xenbus_dev_fatal(info->xbdev, -ENOMEM, "allocating ring_info structure");
+		info->nr_rings = 0;
+		return -ENOMEM;
+	}
+
+	for_each_rinfo(info, rinfo, i) {
+		INIT_LIST_HEAD(&rinfo->indirect_pages);
+		INIT_LIST_HEAD(&rinfo->grants);
+		rinfo->dev_info = info;
+		INIT_WORK(&rinfo->work, blkif_restart_queue);
+		spin_lock_init(&rinfo->ring_lock);
+	}
+	return 0;
+}
+
+/*
+ * Entry point to this code when a new device is created.  Allocate the basic
+ * structures and the ring buffer for communication with the backend, and
+ * inform the backend of the appropriate details for those.  Switch to
+ * Initialised state.
+ */
+static int blkfront_probe(struct xenbus_device *dev,
+			  const struct xenbus_device_id *id)
+{
+	int err, vdevice;
+	struct blkfront_info *info;
+
+	/* FIXME: Use dynamic device id if this is not set. */
+	err = xenbus_scanf(XBT_NIL, dev->nodename,
+			   "virtual-device", "%i", &vdevice);
+	if (err != 1) {
+		/* go looking in the extended area instead */
+		err = xenbus_scanf(XBT_NIL, dev->nodename, "virtual-device-ext",
+				   "%i", &vdevice);
+		if (err != 1) {
+			xenbus_dev_fatal(dev, err, "reading virtual-device");
+			return err;
+		}
+	}
+
+	if (xen_hvm_domain()) {
+		char *type;
+		int len;
+		/* no unplug has been done: do not hook devices != xen vbds */
+		if (xen_has_pv_and_legacy_disk_devices()) {
+			int major;
+
+			if (!VDEV_IS_EXTENDED(vdevice))
+				major = BLKIF_MAJOR(vdevice);
+			else
+				major = XENVBD_MAJOR;
+
+			if (major != XENVBD_MAJOR) {
+				printk(KERN_INFO
+						"%s: HVM does not support vbd %d as xen block device\n",
+						__func__, vdevice);
+				return -ENODEV;
+			}
+		}
+		/* do not create a PV cdrom device if we are an HVM guest */
+		type = xenbus_read(XBT_NIL, dev->nodename, "device-type", &len);
+		if (IS_ERR(type))
+			return -ENODEV;
+		if (strncmp(type, "cdrom", 5) == 0) {
+			kfree(type);
+			return -ENODEV;
+		}
+		kfree(type);
+	}
+	info = kzalloc(sizeof(*info), GFP_KERNEL);
+	if (!info) {
+		xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure");
+		return -ENOMEM;
+	}
+
+	info->xbdev = dev;
+
+	mutex_init(&info->mutex);
+	info->vdevice = vdevice;
+	info->connected = BLKIF_STATE_DISCONNECTED;
+
+	/* Front end dir is a number, which is used as the id. */
+	info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0);
+	dev_set_drvdata(&dev->dev, info);
+
+	mutex_lock(&blkfront_mutex);
+	list_add(&info->info_list, &info_list);
+	mutex_unlock(&blkfront_mutex);
+
+	return 0;
+}
+
+static int blkif_recover(struct blkfront_info *info)
+{
+	unsigned int r_index;
+	struct request *req, *n;
+	int rc;
+	struct bio *bio;
+	unsigned int segs;
+	struct blkfront_ring_info *rinfo;
+
+	blkfront_gather_backend_features(info);
+	/* Reset limits changed by blk_mq_update_nr_hw_queues(). */
+	blkif_set_queue_limits(info);
+	segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST;
+	blk_queue_max_segments(info->rq, segs / GRANTS_PER_PSEG);
+
+	for_each_rinfo(info, rinfo, r_index) {
+		rc = blkfront_setup_indirect(rinfo);
+		if (rc)
+			return rc;
+	}
+	xenbus_switch_state(info->xbdev, XenbusStateConnected);
+
+	/* Now safe for us to use the shared ring */
+	info->connected = BLKIF_STATE_CONNECTED;
+
+	for_each_rinfo(info, rinfo, r_index) {
+		/* Kick any other new requests queued since we resumed */
+		kick_pending_request_queues(rinfo);
+	}
+
+	list_for_each_entry_safe(req, n, &info->requests, queuelist) {
+		/* Requeue pending requests (flush or discard) */
+		list_del_init(&req->queuelist);
+		BUG_ON(req->nr_phys_segments > segs);
+		blk_mq_requeue_request(req, false);
+	}
+	blk_mq_start_stopped_hw_queues(info->rq, true);
+	blk_mq_kick_requeue_list(info->rq);
+
+	while ((bio = bio_list_pop(&info->bio_list)) != NULL) {
+		/* Traverse the list of pending bios and re-queue them */
+		submit_bio(bio);
+	}
+
+	return 0;
+}
+
+/*
+ * We are reconnecting to the backend, due to a suspend/resume, or a backend
+ * driver restart.  We tear down our blkif structure and recreate it, but
+ * leave the device-layer structures intact so that this is transparent to the
+ * rest of the kernel.
+ */
+static int blkfront_resume(struct xenbus_device *dev)
+{
+	struct blkfront_info *info = dev_get_drvdata(&dev->dev);
+	int err = 0;
+	unsigned int i, j;
+	struct blkfront_ring_info *rinfo;
+
+	dev_dbg(&dev->dev, "blkfront_resume: %s\n", dev->nodename);
+
+	bio_list_init(&info->bio_list);
+	INIT_LIST_HEAD(&info->requests);
+	for_each_rinfo(info, rinfo, i) {
+		struct bio_list merge_bio;
+		struct blk_shadow *shadow = rinfo->shadow;
+
+		for (j = 0; j < BLK_RING_SIZE(info); j++) {
+			/* Not in use? */
+			if (!shadow[j].request)
+				continue;
+
+			/*
+			 * Get the bios in the request so we can re-queue them.
+			 */
+			if (req_op(shadow[j].request) == REQ_OP_FLUSH ||
+			    req_op(shadow[j].request) == REQ_OP_DISCARD ||
+			    req_op(shadow[j].request) == REQ_OP_SECURE_ERASE ||
+			    shadow[j].request->cmd_flags & REQ_FUA) {
+				/*
+				 * Flush operations don't contain bios, so
+				 * we need to requeue the whole request
+				 *
+				 * XXX: but this doesn't make any sense for a
+				 * write with the FUA flag set..
+				 */
+				list_add(&shadow[j].request->queuelist, &info->requests);
+				continue;
+			}
+			merge_bio.head = shadow[j].request->bio;
+			merge_bio.tail = shadow[j].request->biotail;
+			bio_list_merge(&info->bio_list, &merge_bio);
+			shadow[j].request->bio = NULL;
+			blk_mq_end_request(shadow[j].request, BLK_STS_OK);
+		}
+	}
+
+	blkif_free(info, info->connected == BLKIF_STATE_CONNECTED);
+
+	err = talk_to_blkback(dev, info);
+	if (!err)
+		blk_mq_update_nr_hw_queues(&info->tag_set, info->nr_rings);
+
+	/*
+	 * We have to wait for the backend to switch to
+	 * connected state, since we want to read which
+	 * features it supports.
+	 */
+
+	return err;
+}
+
+static void blkfront_closing(struct blkfront_info *info)
+{
+	struct xenbus_device *xbdev = info->xbdev;
+	struct blkfront_ring_info *rinfo;
+	unsigned int i;
+
+	if (xbdev->state == XenbusStateClosing)
+		return;
+
+	/* No more blkif_request(). */
+	if (info->rq && info->gd) {
+		blk_mq_stop_hw_queues(info->rq);
+		blk_mark_disk_dead(info->gd);
+		set_capacity(info->gd, 0);
+	}
+
+	for_each_rinfo(info, rinfo, i) {
+		/* No more gnttab callback work. */
+		gnttab_cancel_free_callback(&rinfo->callback);
+
+		/* Flush gnttab callback work. Must be done with no locks held. */
+		flush_work(&rinfo->work);
+	}
+
+	xenbus_frontend_closed(xbdev);
+}
+
+static void blkfront_setup_discard(struct blkfront_info *info)
+{
+	info->feature_discard = 1;
+	info->discard_granularity = xenbus_read_unsigned(info->xbdev->otherend,
+							 "discard-granularity",
+							 0);
+	info->discard_alignment = xenbus_read_unsigned(info->xbdev->otherend,
+						       "discard-alignment", 0);
+	info->feature_secdiscard =
+		!!xenbus_read_unsigned(info->xbdev->otherend, "discard-secure",
+				       0);
+}
+
+static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo)
+{
+	unsigned int psegs, grants, memflags;
+	int err, i;
+	struct blkfront_info *info = rinfo->dev_info;
+
+	memflags = memalloc_noio_save();
+
+	if (info->max_indirect_segments == 0) {
+		if (!HAS_EXTRA_REQ)
+			grants = BLKIF_MAX_SEGMENTS_PER_REQUEST;
+		else {
+			/*
+			 * When an extra req is required, the maximum
+			 * grants supported is related to the size of the
+			 * Linux block segment.
+			 */
+			grants = GRANTS_PER_PSEG;
+		}
+	}
+	else
+		grants = info->max_indirect_segments;
+	psegs = DIV_ROUND_UP(grants, GRANTS_PER_PSEG);
+
+	err = fill_grant_buffer(rinfo,
+				(grants + INDIRECT_GREFS(grants)) * BLK_RING_SIZE(info));
+	if (err)
+		goto out_of_memory;
+
+	if (!info->bounce && info->max_indirect_segments) {
+		/*
+		 * We are using indirect descriptors but don't have a bounce
+		 * buffer, we need to allocate a set of pages that can be
+		 * used for mapping indirect grefs
+		 */
+		int num = INDIRECT_GREFS(grants) * BLK_RING_SIZE(info);
+
+		BUG_ON(!list_empty(&rinfo->indirect_pages));
+		for (i = 0; i < num; i++) {
+			struct page *indirect_page = alloc_page(GFP_KERNEL |
+								__GFP_ZERO);
+			if (!indirect_page)
+				goto out_of_memory;
+			list_add(&indirect_page->lru, &rinfo->indirect_pages);
+		}
+	}
+
+	for (i = 0; i < BLK_RING_SIZE(info); i++) {
+		rinfo->shadow[i].grants_used =
+			kvcalloc(grants,
+				 sizeof(rinfo->shadow[i].grants_used[0]),
+				 GFP_KERNEL);
+		rinfo->shadow[i].sg = kvcalloc(psegs,
+					       sizeof(rinfo->shadow[i].sg[0]),
+					       GFP_KERNEL);
+		if (info->max_indirect_segments)
+			rinfo->shadow[i].indirect_grants =
+				kvcalloc(INDIRECT_GREFS(grants),
+					 sizeof(rinfo->shadow[i].indirect_grants[0]),
+					 GFP_KERNEL);
+		if ((rinfo->shadow[i].grants_used == NULL) ||
+			(rinfo->shadow[i].sg == NULL) ||
+		     (info->max_indirect_segments &&
+		     (rinfo->shadow[i].indirect_grants == NULL)))
+			goto out_of_memory;
+		sg_init_table(rinfo->shadow[i].sg, psegs);
+	}
+
+	memalloc_noio_restore(memflags);
+
+	return 0;
+
+out_of_memory:
+	for (i = 0; i < BLK_RING_SIZE(info); i++) {
+		kvfree(rinfo->shadow[i].grants_used);
+		rinfo->shadow[i].grants_used = NULL;
+		kvfree(rinfo->shadow[i].sg);
+		rinfo->shadow[i].sg = NULL;
+		kvfree(rinfo->shadow[i].indirect_grants);
+		rinfo->shadow[i].indirect_grants = NULL;
+	}
+	if (!list_empty(&rinfo->indirect_pages)) {
+		struct page *indirect_page, *n;
+		list_for_each_entry_safe(indirect_page, n, &rinfo->indirect_pages, lru) {
+			list_del(&indirect_page->lru);
+			__free_page(indirect_page);
+		}
+	}
+
+	memalloc_noio_restore(memflags);
+
+	return -ENOMEM;
+}
+
+/*
+ * Gather all backend feature-*
+ */
+static void blkfront_gather_backend_features(struct blkfront_info *info)
+{
+	unsigned int indirect_segments;
+
+	info->feature_flush = 0;
+	info->feature_fua = 0;
+
+	/*
+	 * If there's no "feature-barrier" defined, then it means
+	 * we're dealing with a very old backend which writes
+	 * synchronously; nothing to do.
+	 *
+	 * If there are barriers, then we use flush.
+	 */
+	if (xenbus_read_unsigned(info->xbdev->otherend, "feature-barrier", 0)) {
+		info->feature_flush = 1;
+		info->feature_fua = 1;
+	}
+
+	/*
+	 * And if there is "feature-flush-cache" use that above
+	 * barriers.
+	 */
+	if (xenbus_read_unsigned(info->xbdev->otherend, "feature-flush-cache",
+				 0)) {
+		info->feature_flush = 1;
+		info->feature_fua = 0;
+	}
+
+	if (xenbus_read_unsigned(info->xbdev->otherend, "feature-discard", 0))
+		blkfront_setup_discard(info);
+
+	if (info->feature_persistent_parm)
+		info->feature_persistent =
+			!!xenbus_read_unsigned(info->xbdev->otherend,
+					       "feature-persistent", 0);
+	if (info->feature_persistent)
+		info->bounce = true;
+
+	indirect_segments = xenbus_read_unsigned(info->xbdev->otherend,
+					"feature-max-indirect-segments", 0);
+	if (indirect_segments > xen_blkif_max_segments)
+		indirect_segments = xen_blkif_max_segments;
+	if (indirect_segments <= BLKIF_MAX_SEGMENTS_PER_REQUEST)
+		indirect_segments = 0;
+	info->max_indirect_segments = indirect_segments;
+
+	if (info->feature_persistent) {
+		mutex_lock(&blkfront_mutex);
+		schedule_delayed_work(&blkfront_work, HZ * 10);
+		mutex_unlock(&blkfront_mutex);
+	}
+}
+
+/*
+ * Invoked when the backend is finally 'ready' (and has told produced
+ * the details about the physical device - #sectors, size, etc).
+ */
+static void blkfront_connect(struct blkfront_info *info)
+{
+	unsigned long long sectors;
+	unsigned long sector_size;
+	unsigned int physical_sector_size;
+	int err, i;
+	struct blkfront_ring_info *rinfo;
+
+	switch (info->connected) {
+	case BLKIF_STATE_CONNECTED:
+		/*
+		 * Potentially, the back-end may be signalling
+		 * a capacity change; update the capacity.
+		 */
+		err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
+				   "sectors", "%Lu", &sectors);
+		if (XENBUS_EXIST_ERR(err))
+			return;
+		printk(KERN_INFO "Setting capacity to %Lu\n",
+		       sectors);
+		set_capacity_and_notify(info->gd, sectors);
+
+		return;
+	case BLKIF_STATE_SUSPENDED:
+		/*
+		 * If we are recovering from suspension, we need to wait
+		 * for the backend to announce it's features before
+		 * reconnecting, at least we need to know if the backend
+		 * supports indirect descriptors, and how many.
+		 */
+		blkif_recover(info);
+		return;
+
+	default:
+		break;
+	}
+
+	dev_dbg(&info->xbdev->dev, "%s:%s.\n",
+		__func__, info->xbdev->otherend);
+
+	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
+			    "sectors", "%llu", &sectors,
+			    "info", "%u", &info->vdisk_info,
+			    "sector-size", "%lu", &sector_size,
+			    NULL);
+	if (err) {
+		xenbus_dev_fatal(info->xbdev, err,
+				 "reading backend fields at %s",
+				 info->xbdev->otherend);
+		return;
+	}
+
+	/*
+	 * physical-sector-size is a newer field, so old backends may not
+	 * provide this. Assume physical sector size to be the same as
+	 * sector_size in that case.
+	 */
+	physical_sector_size = xenbus_read_unsigned(info->xbdev->otherend,
+						    "physical-sector-size",
+						    sector_size);
+	blkfront_gather_backend_features(info);
+	for_each_rinfo(info, rinfo, i) {
+		err = blkfront_setup_indirect(rinfo);
+		if (err) {
+			xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s",
+					 info->xbdev->otherend);
+			blkif_free(info, 0);
+			break;
+		}
+	}
+
+	err = xlvbd_alloc_gendisk(sectors, info, sector_size,
+				  physical_sector_size);
+	if (err) {
+		xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
+				 info->xbdev->otherend);
+		goto fail;
+	}
+
+	xenbus_switch_state(info->xbdev, XenbusStateConnected);
+
+	/* Kick pending requests. */
+	info->connected = BLKIF_STATE_CONNECTED;
+	for_each_rinfo(info, rinfo, i)
+		kick_pending_request_queues(rinfo);
+
+	err = device_add_disk(&info->xbdev->dev, info->gd, NULL);
+	if (err) {
+		put_disk(info->gd);
+		blk_mq_free_tag_set(&info->tag_set);
+		info->rq = NULL;
+		goto fail;
+	}
+
+	info->is_ready = 1;
+	return;
+
+fail:
+	blkif_free(info, 0);
+	return;
+}
+
+/*
+ * Callback received when the backend's state changes.
+ */
+static void blkback_changed(struct xenbus_device *dev,
+			    enum xenbus_state backend_state)
+{
+	struct blkfront_info *info = dev_get_drvdata(&dev->dev);
+
+	dev_dbg(&dev->dev, "blkfront:blkback_changed to state %d.\n", backend_state);
+
+	switch (backend_state) {
+	case XenbusStateInitWait:
+		if (dev->state != XenbusStateInitialising)
+			break;
+		if (talk_to_blkback(dev, info))
+			break;
+		break;
+	case XenbusStateInitialising:
+	case XenbusStateInitialised:
+	case XenbusStateReconfiguring:
+	case XenbusStateReconfigured:
+	case XenbusStateUnknown:
+		break;
+
+	case XenbusStateConnected:
+		/*
+		 * talk_to_blkback sets state to XenbusStateInitialised
+		 * and blkfront_connect sets it to XenbusStateConnected
+		 * (if connection went OK).
+		 *
+		 * If the backend (or toolstack) decides to poke at backend
+		 * state (and re-trigger the watch by setting the state repeatedly
+		 * to XenbusStateConnected (4)) we need to deal with this.
+		 * This is allowed as this is used to communicate to the guest
+		 * that the size of disk has changed!
+		 */
+		if ((dev->state != XenbusStateInitialised) &&
+		    (dev->state != XenbusStateConnected)) {
+			if (talk_to_blkback(dev, info))
+				break;
+		}
+
+		blkfront_connect(info);
+		break;
+
+	case XenbusStateClosed:
+		if (dev->state == XenbusStateClosed)
+			break;
+		fallthrough;
+	case XenbusStateClosing:
+		blkfront_closing(info);
+		break;
+	}
+}
+
+static int blkfront_remove(struct xenbus_device *xbdev)
+{
+	struct blkfront_info *info = dev_get_drvdata(&xbdev->dev);
+
+	dev_dbg(&xbdev->dev, "%s removed", xbdev->nodename);
+
+	if (info->gd)
+		del_gendisk(info->gd);
+
+	mutex_lock(&blkfront_mutex);
+	list_del(&info->info_list);
+	mutex_unlock(&blkfront_mutex);
+
+	blkif_free(info, 0);
+	if (info->gd) {
+		xlbd_release_minors(info->gd->first_minor, info->gd->minors);
+		put_disk(info->gd);
+		blk_mq_free_tag_set(&info->tag_set);
+	}
+
+	kfree(info);
+	return 0;
+}
+
+static int blkfront_is_ready(struct xenbus_device *dev)
+{
+	struct blkfront_info *info = dev_get_drvdata(&dev->dev);
+
+	return info->is_ready && info->xbdev;
+}
+
+static const struct block_device_operations xlvbd_block_fops =
+{
+	.owner = THIS_MODULE,
+	.getgeo = blkif_getgeo,
+	.ioctl = blkif_ioctl,
+	.compat_ioctl = blkdev_compat_ptr_ioctl,
+};
+
+
+static const struct xenbus_device_id blkfront_ids[] = {
+	{ "vbd" },
+	{ "" }
+};
+
+static struct xenbus_driver blkfront_driver = {
+	.ids  = blkfront_ids,
+	.probe = blkfront_probe,
+	.remove = blkfront_remove,
+	.resume = blkfront_resume,
+	.otherend_changed = blkback_changed,
+	.is_ready = blkfront_is_ready,
+};
+
+static void purge_persistent_grants(struct blkfront_info *info)
+{
+	unsigned int i;
+	unsigned long flags;
+	struct blkfront_ring_info *rinfo;
+
+	for_each_rinfo(info, rinfo, i) {
+		struct grant *gnt_list_entry, *tmp;
+		LIST_HEAD(grants);
+
+		spin_lock_irqsave(&rinfo->ring_lock, flags);
+
+		if (rinfo->persistent_gnts_c == 0) {
+			spin_unlock_irqrestore(&rinfo->ring_lock, flags);
+			continue;
+		}
+
+		list_for_each_entry_safe(gnt_list_entry, tmp, &rinfo->grants,
+					 node) {
+			if (gnt_list_entry->gref == INVALID_GRANT_REF ||
+			    !gnttab_try_end_foreign_access(gnt_list_entry->gref))
+				continue;
+
+			list_del(&gnt_list_entry->node);
+			rinfo->persistent_gnts_c--;
+			gnt_list_entry->gref = INVALID_GRANT_REF;
+			list_add_tail(&gnt_list_entry->node, &grants);
+		}
+
+		list_splice_tail(&grants, &rinfo->grants);
+
+		spin_unlock_irqrestore(&rinfo->ring_lock, flags);
+	}
+}
+
+static void blkfront_delay_work(struct work_struct *work)
+{
+	struct blkfront_info *info;
+	bool need_schedule_work = false;
+
+	/*
+	 * Note that when using bounce buffers but not persistent grants
+	 * there's no need to run blkfront_delay_work because grants are
+	 * revoked in blkif_completion or else an error is reported and the
+	 * connection is closed.
+	 */
+
+	mutex_lock(&blkfront_mutex);
+
+	list_for_each_entry(info, &info_list, info_list) {
+		if (info->feature_persistent) {
+			need_schedule_work = true;
+			mutex_lock(&info->mutex);
+			purge_persistent_grants(info);
+			mutex_unlock(&info->mutex);
+		}
+	}
+
+	if (need_schedule_work)
+		schedule_delayed_work(&blkfront_work, HZ * 10);
+
+	mutex_unlock(&blkfront_mutex);
+}
+
+static int __init xlblk_init(void)
+{
+	int ret;
+	int nr_cpus = num_online_cpus();
+
+	if (!xen_domain())
+		return -ENODEV;
+
+	if (!xen_has_pv_disk_devices())
+		return -ENODEV;
+
+	if (register_blkdev(XENVBD_MAJOR, DEV_NAME)) {
+		pr_warn("xen_blk: can't get major %d with name %s\n",
+			XENVBD_MAJOR, DEV_NAME);
+		return -ENODEV;
+	}
+
+	if (xen_blkif_max_segments < BLKIF_MAX_SEGMENTS_PER_REQUEST)
+		xen_blkif_max_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST;
+
+	if (xen_blkif_max_ring_order > XENBUS_MAX_RING_GRANT_ORDER) {
+		pr_info("Invalid max_ring_order (%d), will use default max: %d.\n",
+			xen_blkif_max_ring_order, XENBUS_MAX_RING_GRANT_ORDER);
+		xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER;
+	}
+
+	if (xen_blkif_max_queues > nr_cpus) {
+		pr_info("Invalid max_queues (%d), will use default max: %d.\n",
+			xen_blkif_max_queues, nr_cpus);
+		xen_blkif_max_queues = nr_cpus;
+	}
+
+	INIT_DELAYED_WORK(&blkfront_work, blkfront_delay_work);
+
+	ret = xenbus_register_frontend(&blkfront_driver);
+	if (ret) {
+		unregister_blkdev(XENVBD_MAJOR, DEV_NAME);
+		return ret;
+	}
+
+	return 0;
+}
+module_init(xlblk_init);
+
+
+static void __exit xlblk_exit(void)
+{
+	cancel_delayed_work_sync(&blkfront_work);
+
+	xenbus_unregister_driver(&blkfront_driver);
+	unregister_blkdev(XENVBD_MAJOR, DEV_NAME);
+	kfree(minors);
+}
+module_exit(xlblk_exit);
+
+MODULE_DESCRIPTION("Xen virtual block device frontend");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_BLOCKDEV_MAJOR(XENVBD_MAJOR);
+MODULE_ALIAS("xen:vbd");
+MODULE_ALIAS("xenblk");
diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c
new file mode 100644
index 000000000..c1e85f356
--- /dev/null
+++ b/drivers/block/z2ram.c
@@ -0,0 +1,414 @@
+/*
+** z2ram - Amiga pseudo-driver to access 16bit-RAM in ZorroII space
+**         as a block device, to be used as a RAM disk or swap space
+** 
+** Copyright (C) 1994 by Ingo Wilken (Ingo.Wilken@informatik.uni-oldenburg.de)
+**
+** ++Geert: support for zorro_unused_z2ram, better range checking
+** ++roman: translate accesses via an array
+** ++Milan: support for ChipRAM usage
+** ++yambo: converted to 2.0 kernel
+** ++yambo: modularized and support added for 3 minor devices including:
+**          MAJOR  MINOR  DESCRIPTION
+**          -----  -----  ----------------------------------------------
+**          37     0       Use Zorro II and Chip ram
+**          37     1       Use only Zorro II ram
+**          37     2       Use only Chip ram
+**          37     4-7     Use memory list entry 1-4 (first is 0)
+** ++jskov: support for 1-4th memory list entry.
+**
+** Permission to use, copy, modify, and distribute this software and its
+** documentation for any purpose and without fee is hereby granted, provided
+** that the above copyright notice appear in all copies and that both that
+** copyright notice and this permission notice appear in supporting
+** documentation.  This software is provided "as is" without express or
+** implied warranty.
+*/
+
+#define DEVICE_NAME "Z2RAM"
+
+#include <linux/major.h>
+#include <linux/vmalloc.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/blk-mq.h>
+#include <linux/bitops.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/pgtable.h>
+
+#include <asm/setup.h>
+#include <asm/amigahw.h>
+
+#include <linux/zorro.h>
+
+#define Z2MINOR_COMBINED      (0)
+#define Z2MINOR_Z2ONLY        (1)
+#define Z2MINOR_CHIPONLY      (2)
+#define Z2MINOR_MEMLIST1      (4)
+#define Z2MINOR_MEMLIST2      (5)
+#define Z2MINOR_MEMLIST3      (6)
+#define Z2MINOR_MEMLIST4      (7)
+#define Z2MINOR_COUNT         (8)	/* Move this down when adding a new minor */
+
+#define Z2RAM_CHUNK1024       ( Z2RAM_CHUNKSIZE >> 10 )
+
+static DEFINE_MUTEX(z2ram_mutex);
+static u_long *z2ram_map = NULL;
+static u_long z2ram_size = 0;
+static int z2_count = 0;
+static int chip_count = 0;
+static int list_count = 0;
+static int current_device = -1;
+
+static DEFINE_SPINLOCK(z2ram_lock);
+
+static struct gendisk *z2ram_gendisk[Z2MINOR_COUNT];
+
+static blk_status_t z2_queue_rq(struct blk_mq_hw_ctx *hctx,
+				const struct blk_mq_queue_data *bd)
+{
+	struct request *req = bd->rq;
+	unsigned long start = blk_rq_pos(req) << 9;
+	unsigned long len = blk_rq_cur_bytes(req);
+
+	blk_mq_start_request(req);
+
+	if (start + len > z2ram_size) {
+		pr_err(DEVICE_NAME ": bad access: block=%llu, "
+		       "count=%u\n",
+		       (unsigned long long)blk_rq_pos(req),
+		       blk_rq_cur_sectors(req));
+		return BLK_STS_IOERR;
+	}
+
+	spin_lock_irq(&z2ram_lock);
+
+	while (len) {
+		unsigned long addr = start & Z2RAM_CHUNKMASK;
+		unsigned long size = Z2RAM_CHUNKSIZE - addr;
+		void *buffer = bio_data(req->bio);
+
+		if (len < size)
+			size = len;
+		addr += z2ram_map[start >> Z2RAM_CHUNKSHIFT];
+		if (rq_data_dir(req) == READ)
+			memcpy(buffer, (char *)addr, size);
+		else
+			memcpy((char *)addr, buffer, size);
+		start += size;
+		len -= size;
+	}
+
+	spin_unlock_irq(&z2ram_lock);
+	blk_mq_end_request(req, BLK_STS_OK);
+	return BLK_STS_OK;
+}
+
+static void get_z2ram(void)
+{
+	int i;
+
+	for (i = 0; i < Z2RAM_SIZE / Z2RAM_CHUNKSIZE; i++) {
+		if (test_bit(i, zorro_unused_z2ram)) {
+			z2_count++;
+			z2ram_map[z2ram_size++] =
+			    (unsigned long)ZTWO_VADDR(Z2RAM_START) +
+			    (i << Z2RAM_CHUNKSHIFT);
+			clear_bit(i, zorro_unused_z2ram);
+		}
+	}
+
+	return;
+}
+
+static void get_chipram(void)
+{
+
+	while (amiga_chip_avail() > (Z2RAM_CHUNKSIZE * 4)) {
+		chip_count++;
+		z2ram_map[z2ram_size] =
+		    (u_long) amiga_chip_alloc(Z2RAM_CHUNKSIZE, "z2ram");
+
+		if (z2ram_map[z2ram_size] == 0) {
+			break;
+		}
+
+		z2ram_size++;
+	}
+
+	return;
+}
+
+static int z2_open(struct block_device *bdev, fmode_t mode)
+{
+	int device;
+	int max_z2_map = (Z2RAM_SIZE / Z2RAM_CHUNKSIZE) * sizeof(z2ram_map[0]);
+	int max_chip_map = (amiga_chip_size / Z2RAM_CHUNKSIZE) *
+	    sizeof(z2ram_map[0]);
+	int rc = -ENOMEM;
+
+	device = MINOR(bdev->bd_dev);
+
+	mutex_lock(&z2ram_mutex);
+	if (current_device != -1 && current_device != device) {
+		rc = -EBUSY;
+		goto err_out;
+	}
+
+	if (current_device == -1) {
+		z2_count = 0;
+		chip_count = 0;
+		list_count = 0;
+		z2ram_size = 0;
+
+		/* Use a specific list entry. */
+		if (device >= Z2MINOR_MEMLIST1 && device <= Z2MINOR_MEMLIST4) {
+			int index = device - Z2MINOR_MEMLIST1 + 1;
+			unsigned long size, paddr, vaddr;
+
+			if (index >= m68k_realnum_memory) {
+				printk(KERN_ERR DEVICE_NAME
+				       ": no such entry in z2ram_map\n");
+				goto err_out;
+			}
+
+			paddr = m68k_memory[index].addr;
+			size = m68k_memory[index].size & ~(Z2RAM_CHUNKSIZE - 1);
+
+#ifdef __powerpc__
+			/* FIXME: ioremap doesn't build correct memory tables. */
+			{
+				vfree(vmalloc(size));
+			}
+
+			vaddr = (unsigned long)ioremap_wt(paddr, size);
+
+#else
+			vaddr =
+			    (unsigned long)z_remap_nocache_nonser(paddr, size);
+#endif
+			z2ram_map =
+			    kmalloc_array(size / Z2RAM_CHUNKSIZE,
+					  sizeof(z2ram_map[0]), GFP_KERNEL);
+			if (z2ram_map == NULL) {
+				printk(KERN_ERR DEVICE_NAME
+				       ": cannot get mem for z2ram_map\n");
+				goto err_out;
+			}
+
+			while (size) {
+				z2ram_map[z2ram_size++] = vaddr;
+				size -= Z2RAM_CHUNKSIZE;
+				vaddr += Z2RAM_CHUNKSIZE;
+				list_count++;
+			}
+
+			if (z2ram_size != 0)
+				printk(KERN_INFO DEVICE_NAME
+				       ": using %iK List Entry %d Memory\n",
+				       list_count * Z2RAM_CHUNK1024, index);
+		} else
+			switch (device) {
+			case Z2MINOR_COMBINED:
+
+				z2ram_map =
+				    kmalloc(max_z2_map + max_chip_map,
+					    GFP_KERNEL);
+				if (z2ram_map == NULL) {
+					printk(KERN_ERR DEVICE_NAME
+					       ": cannot get mem for z2ram_map\n");
+					goto err_out;
+				}
+
+				get_z2ram();
+				get_chipram();
+
+				if (z2ram_size != 0)
+					printk(KERN_INFO DEVICE_NAME
+					       ": using %iK Zorro II RAM and %iK Chip RAM (Total %dK)\n",
+					       z2_count * Z2RAM_CHUNK1024,
+					       chip_count * Z2RAM_CHUNK1024,
+					       (z2_count +
+						chip_count) * Z2RAM_CHUNK1024);
+
+				break;
+
+			case Z2MINOR_Z2ONLY:
+				z2ram_map = kmalloc(max_z2_map, GFP_KERNEL);
+				if (!z2ram_map)
+					goto err_out;
+
+				get_z2ram();
+
+				if (z2ram_size != 0)
+					printk(KERN_INFO DEVICE_NAME
+					       ": using %iK of Zorro II RAM\n",
+					       z2_count * Z2RAM_CHUNK1024);
+
+				break;
+
+			case Z2MINOR_CHIPONLY:
+				z2ram_map = kmalloc(max_chip_map, GFP_KERNEL);
+				if (!z2ram_map)
+					goto err_out;
+
+				get_chipram();
+
+				if (z2ram_size != 0)
+					printk(KERN_INFO DEVICE_NAME
+					       ": using %iK Chip RAM\n",
+					       chip_count * Z2RAM_CHUNK1024);
+
+				break;
+
+			default:
+				rc = -ENODEV;
+				goto err_out;
+
+				break;
+			}
+
+		if (z2ram_size == 0) {
+			printk(KERN_NOTICE DEVICE_NAME
+			       ": no unused ZII/Chip RAM found\n");
+			goto err_out_kfree;
+		}
+
+		current_device = device;
+		z2ram_size <<= Z2RAM_CHUNKSHIFT;
+		set_capacity(z2ram_gendisk[device], z2ram_size >> 9);
+	}
+
+	mutex_unlock(&z2ram_mutex);
+	return 0;
+
+err_out_kfree:
+	kfree(z2ram_map);
+err_out:
+	mutex_unlock(&z2ram_mutex);
+	return rc;
+}
+
+static void z2_release(struct gendisk *disk, fmode_t mode)
+{
+	mutex_lock(&z2ram_mutex);
+	if (current_device == -1) {
+		mutex_unlock(&z2ram_mutex);
+		return;
+	}
+	mutex_unlock(&z2ram_mutex);
+	/*
+	 * FIXME: unmap memory
+	 */
+}
+
+static const struct block_device_operations z2_fops = {
+	.owner = THIS_MODULE,
+	.open = z2_open,
+	.release = z2_release,
+};
+
+static struct blk_mq_tag_set tag_set;
+
+static const struct blk_mq_ops z2_mq_ops = {
+	.queue_rq = z2_queue_rq,
+};
+
+static int z2ram_register_disk(int minor)
+{
+	struct gendisk *disk;
+	int err;
+
+	disk = blk_mq_alloc_disk(&tag_set, NULL);
+	if (IS_ERR(disk))
+		return PTR_ERR(disk);
+
+	disk->major = Z2RAM_MAJOR;
+	disk->first_minor = minor;
+	disk->minors = 1;
+	disk->flags |= GENHD_FL_NO_PART;
+	disk->fops = &z2_fops;
+	if (minor)
+		sprintf(disk->disk_name, "z2ram%d", minor);
+	else
+		sprintf(disk->disk_name, "z2ram");
+
+	z2ram_gendisk[minor] = disk;
+	err = add_disk(disk);
+	if (err)
+		put_disk(disk);
+	return err;
+}
+
+static int __init z2_init(void)
+{
+	int ret, i;
+
+	if (!MACH_IS_AMIGA)
+		return -ENODEV;
+
+	if (register_blkdev(Z2RAM_MAJOR, DEVICE_NAME))
+		return -EBUSY;
+
+	tag_set.ops = &z2_mq_ops;
+	tag_set.nr_hw_queues = 1;
+	tag_set.nr_maps = 1;
+	tag_set.queue_depth = 16;
+	tag_set.numa_node = NUMA_NO_NODE;
+	tag_set.flags = BLK_MQ_F_SHOULD_MERGE;
+	ret = blk_mq_alloc_tag_set(&tag_set);
+	if (ret)
+		goto out_unregister_blkdev;
+
+	for (i = 0; i < Z2MINOR_COUNT; i++) {
+		ret = z2ram_register_disk(i);
+		if (ret && i == 0)
+			goto out_free_tagset;
+	}
+
+	return 0;
+
+out_free_tagset:
+	blk_mq_free_tag_set(&tag_set);
+out_unregister_blkdev:
+	unregister_blkdev(Z2RAM_MAJOR, DEVICE_NAME);
+	return ret;
+}
+
+static void __exit z2_exit(void)
+{
+	int i, j;
+
+	unregister_blkdev(Z2RAM_MAJOR, DEVICE_NAME);
+
+	for (i = 0; i < Z2MINOR_COUNT; i++) {
+		del_gendisk(z2ram_gendisk[i]);
+		put_disk(z2ram_gendisk[i]);
+	}
+	blk_mq_free_tag_set(&tag_set);
+
+	if (current_device != -1) {
+		i = 0;
+
+		for (j = 0; j < z2_count; j++) {
+			set_bit(i++, zorro_unused_z2ram);
+		}
+
+		for (j = 0; j < chip_count; j++) {
+			if (z2ram_map[i]) {
+				amiga_chip_free((void *)z2ram_map[i++]);
+			}
+		}
+
+		if (z2ram_map != NULL) {
+			kfree(z2ram_map);
+		}
+	}
+
+	return;
+}
+
+module_init(z2_init);
+module_exit(z2_exit);
+MODULE_LICENSE("GPL");
diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig
new file mode 100644
index 000000000..d4100b0c0
--- /dev/null
+++ b/drivers/block/zram/Kconfig
@@ -0,0 +1,80 @@
+# SPDX-License-Identifier: GPL-2.0
+config ZRAM
+	tristate "Compressed RAM block device support"
+	depends on BLOCK && SYSFS && MMU
+	depends on CRYPTO_LZO || CRYPTO_ZSTD || CRYPTO_LZ4 || CRYPTO_LZ4HC || CRYPTO_842
+	select ZSMALLOC
+	help
+	  Creates virtual block devices called /dev/zramX (X = 0, 1, ...).
+	  Pages written to these disks are compressed and stored in memory
+	  itself. These disks allow very fast I/O and compression provides
+	  good amounts of memory savings.
+
+	  It has several use cases, for example: /tmp storage, use as swap
+	  disks and maybe many more.
+
+	  See Documentation/admin-guide/blockdev/zram.rst for more information.
+
+choice
+	prompt "Default zram compressor"
+	default ZRAM_DEF_COMP_LZORLE
+	depends on ZRAM
+
+config ZRAM_DEF_COMP_LZORLE
+	bool "lzo-rle"
+	depends on CRYPTO_LZO
+
+config ZRAM_DEF_COMP_ZSTD
+	bool "zstd"
+	depends on CRYPTO_ZSTD
+
+config ZRAM_DEF_COMP_LZ4
+	bool "lz4"
+	depends on CRYPTO_LZ4
+
+config ZRAM_DEF_COMP_LZO
+	bool "lzo"
+	depends on CRYPTO_LZO
+
+config ZRAM_DEF_COMP_LZ4HC
+	bool "lz4hc"
+	depends on CRYPTO_LZ4HC
+
+config ZRAM_DEF_COMP_842
+	bool "842"
+	depends on CRYPTO_842
+
+endchoice
+
+config ZRAM_DEF_COMP
+	string
+	default "lzo-rle" if ZRAM_DEF_COMP_LZORLE
+	default "zstd" if ZRAM_DEF_COMP_ZSTD
+	default "lz4" if ZRAM_DEF_COMP_LZ4
+	default "lzo" if ZRAM_DEF_COMP_LZO
+	default "lz4hc" if ZRAM_DEF_COMP_LZ4HC
+	default "842" if ZRAM_DEF_COMP_842
+
+config ZRAM_WRITEBACK
+       bool "Write back incompressible or idle page to backing device"
+       depends on ZRAM
+       help
+	 With incompressible page, there is no memory saving to keep it
+	 in memory. Instead, write it out to backing device.
+	 For this feature, admin should set up backing device via
+	 /sys/block/zramX/backing_dev.
+
+	 With /sys/block/zramX/{idle,writeback}, application could ask
+	 idle page's writeback to the backing device to save in memory.
+
+	 See Documentation/admin-guide/blockdev/zram.rst for more information.
+
+config ZRAM_MEMORY_TRACKING
+	bool "Track zRam block status"
+	depends on ZRAM && DEBUG_FS
+	help
+	  With this feature, admin can track the state of allocated blocks
+	  of zRAM. Admin could see the information via
+	  /sys/kernel/debug/zram/zramX/block_state.
+
+	  See Documentation/admin-guide/blockdev/zram.rst for more information.
diff --git a/drivers/block/zram/Makefile b/drivers/block/zram/Makefile
new file mode 100644
index 000000000..de9e45790
--- /dev/null
+++ b/drivers/block/zram/Makefile
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-2.0-only
+zram-y	:=	zcomp.o zram_drv.o
+
+obj-$(CONFIG_ZRAM)	+=	zram.o
diff --git a/drivers/block/zram/zcomp.c b/drivers/block/zram/zcomp.c
new file mode 100644
index 000000000..0916de952
--- /dev/null
+++ b/drivers/block/zram/zcomp.c
@@ -0,0 +1,233 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2014 Sergey Senozhatsky.
+ */
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/err.h>
+#include <linux/slab.h>
+#include <linux/wait.h>
+#include <linux/sched.h>
+#include <linux/cpu.h>
+#include <linux/crypto.h>
+
+#include "zcomp.h"
+
+static const char * const backends[] = {
+#if IS_ENABLED(CONFIG_CRYPTO_LZO)
+	"lzo",
+	"lzo-rle",
+#endif
+#if IS_ENABLED(CONFIG_CRYPTO_LZ4)
+	"lz4",
+#endif
+#if IS_ENABLED(CONFIG_CRYPTO_LZ4HC)
+	"lz4hc",
+#endif
+#if IS_ENABLED(CONFIG_CRYPTO_842)
+	"842",
+#endif
+#if IS_ENABLED(CONFIG_CRYPTO_ZSTD)
+	"zstd",
+#endif
+};
+
+static void zcomp_strm_free(struct zcomp_strm *zstrm)
+{
+	if (!IS_ERR_OR_NULL(zstrm->tfm))
+		crypto_free_comp(zstrm->tfm);
+	free_pages((unsigned long)zstrm->buffer, 1);
+	zstrm->tfm = NULL;
+	zstrm->buffer = NULL;
+}
+
+/*
+ * Initialize zcomp_strm structure with ->tfm initialized by backend, and
+ * ->buffer. Return a negative value on error.
+ */
+static int zcomp_strm_init(struct zcomp_strm *zstrm, struct zcomp *comp)
+{
+	zstrm->tfm = crypto_alloc_comp(comp->name, 0, 0);
+	/*
+	 * allocate 2 pages. 1 for compressed data, plus 1 extra for the
+	 * case when compressed size is larger than the original one
+	 */
+	zstrm->buffer = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO, 1);
+	if (IS_ERR_OR_NULL(zstrm->tfm) || !zstrm->buffer) {
+		zcomp_strm_free(zstrm);
+		return -ENOMEM;
+	}
+	return 0;
+}
+
+bool zcomp_available_algorithm(const char *comp)
+{
+	/*
+	 * Crypto does not ignore a trailing new line symbol,
+	 * so make sure you don't supply a string containing
+	 * one.
+	 * This also means that we permit zcomp initialisation
+	 * with any compressing algorithm known to crypto api.
+	 */
+	return crypto_has_comp(comp, 0, 0) == 1;
+}
+
+/* show available compressors */
+ssize_t zcomp_available_show(const char *comp, char *buf)
+{
+	bool known_algorithm = false;
+	ssize_t sz = 0;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(backends); i++) {
+		if (!strcmp(comp, backends[i])) {
+			known_algorithm = true;
+			sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2,
+					"[%s] ", backends[i]);
+		} else {
+			sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2,
+					"%s ", backends[i]);
+		}
+	}
+
+	/*
+	 * Out-of-tree module known to crypto api or a missing
+	 * entry in `backends'.
+	 */
+	if (!known_algorithm && crypto_has_comp(comp, 0, 0) == 1)
+		sz += scnprintf(buf + sz, PAGE_SIZE - sz - 2,
+				"[%s] ", comp);
+
+	sz += scnprintf(buf + sz, PAGE_SIZE - sz, "\n");
+	return sz;
+}
+
+struct zcomp_strm *zcomp_stream_get(struct zcomp *comp)
+{
+	local_lock(&comp->stream->lock);
+	return this_cpu_ptr(comp->stream);
+}
+
+void zcomp_stream_put(struct zcomp *comp)
+{
+	local_unlock(&comp->stream->lock);
+}
+
+int zcomp_compress(struct zcomp_strm *zstrm,
+		const void *src, unsigned int *dst_len)
+{
+	/*
+	 * Our dst memory (zstrm->buffer) is always `2 * PAGE_SIZE' sized
+	 * because sometimes we can endup having a bigger compressed data
+	 * due to various reasons: for example compression algorithms tend
+	 * to add some padding to the compressed buffer. Speaking of padding,
+	 * comp algorithm `842' pads the compressed length to multiple of 8
+	 * and returns -ENOSP when the dst memory is not big enough, which
+	 * is not something that ZRAM wants to see. We can handle the
+	 * `compressed_size > PAGE_SIZE' case easily in ZRAM, but when we
+	 * receive -ERRNO from the compressing backend we can't help it
+	 * anymore. To make `842' happy we need to tell the exact size of
+	 * the dst buffer, zram_drv will take care of the fact that
+	 * compressed buffer is too big.
+	 */
+	*dst_len = PAGE_SIZE * 2;
+
+	return crypto_comp_compress(zstrm->tfm,
+			src, PAGE_SIZE,
+			zstrm->buffer, dst_len);
+}
+
+int zcomp_decompress(struct zcomp_strm *zstrm,
+		const void *src, unsigned int src_len, void *dst)
+{
+	unsigned int dst_len = PAGE_SIZE;
+
+	return crypto_comp_decompress(zstrm->tfm,
+			src, src_len,
+			dst, &dst_len);
+}
+
+int zcomp_cpu_up_prepare(unsigned int cpu, struct hlist_node *node)
+{
+	struct zcomp *comp = hlist_entry(node, struct zcomp, node);
+	struct zcomp_strm *zstrm;
+	int ret;
+
+	zstrm = per_cpu_ptr(comp->stream, cpu);
+	local_lock_init(&zstrm->lock);
+
+	ret = zcomp_strm_init(zstrm, comp);
+	if (ret)
+		pr_err("Can't allocate a compression stream\n");
+	return ret;
+}
+
+int zcomp_cpu_dead(unsigned int cpu, struct hlist_node *node)
+{
+	struct zcomp *comp = hlist_entry(node, struct zcomp, node);
+	struct zcomp_strm *zstrm;
+
+	zstrm = per_cpu_ptr(comp->stream, cpu);
+	zcomp_strm_free(zstrm);
+	return 0;
+}
+
+static int zcomp_init(struct zcomp *comp)
+{
+	int ret;
+
+	comp->stream = alloc_percpu(struct zcomp_strm);
+	if (!comp->stream)
+		return -ENOMEM;
+
+	ret = cpuhp_state_add_instance(CPUHP_ZCOMP_PREPARE, &comp->node);
+	if (ret < 0)
+		goto cleanup;
+	return 0;
+
+cleanup:
+	free_percpu(comp->stream);
+	return ret;
+}
+
+void zcomp_destroy(struct zcomp *comp)
+{
+	cpuhp_state_remove_instance(CPUHP_ZCOMP_PREPARE, &comp->node);
+	free_percpu(comp->stream);
+	kfree(comp);
+}
+
+/*
+ * search available compressors for requested algorithm.
+ * allocate new zcomp and initialize it. return compressing
+ * backend pointer or ERR_PTR if things went bad. ERR_PTR(-EINVAL)
+ * if requested algorithm is not supported, ERR_PTR(-ENOMEM) in
+ * case of allocation error, or any other error potentially
+ * returned by zcomp_init().
+ */
+struct zcomp *zcomp_create(const char *compress)
+{
+	struct zcomp *comp;
+	int error;
+
+	/*
+	 * Crypto API will execute /sbin/modprobe if the compression module
+	 * is not loaded yet. We must do it here, otherwise we are about to
+	 * call /sbin/modprobe under CPU hot-plug lock.
+	 */
+	if (!zcomp_available_algorithm(compress))
+		return ERR_PTR(-EINVAL);
+
+	comp = kzalloc(sizeof(struct zcomp), GFP_KERNEL);
+	if (!comp)
+		return ERR_PTR(-ENOMEM);
+
+	comp->name = compress;
+	error = zcomp_init(comp);
+	if (error) {
+		kfree(comp);
+		return ERR_PTR(error);
+	}
+	return comp;
+}
diff --git a/drivers/block/zram/zcomp.h b/drivers/block/zram/zcomp.h
new file mode 100644
index 000000000..40f6420f4
--- /dev/null
+++ b/drivers/block/zram/zcomp.h
@@ -0,0 +1,43 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+/*
+ * Copyright (C) 2014 Sergey Senozhatsky.
+ */
+
+#ifndef _ZCOMP_H_
+#define _ZCOMP_H_
+#include <linux/local_lock.h>
+
+struct zcomp_strm {
+	/* The members ->buffer and ->tfm are protected by ->lock. */
+	local_lock_t lock;
+	/* compression/decompression buffer */
+	void *buffer;
+	struct crypto_comp *tfm;
+};
+
+/* dynamic per-device compression frontend */
+struct zcomp {
+	struct zcomp_strm __percpu *stream;
+	const char *name;
+	struct hlist_node node;
+};
+
+int zcomp_cpu_up_prepare(unsigned int cpu, struct hlist_node *node);
+int zcomp_cpu_dead(unsigned int cpu, struct hlist_node *node);
+ssize_t zcomp_available_show(const char *comp, char *buf);
+bool zcomp_available_algorithm(const char *comp);
+
+struct zcomp *zcomp_create(const char *comp);
+void zcomp_destroy(struct zcomp *comp);
+
+struct zcomp_strm *zcomp_stream_get(struct zcomp *comp);
+void zcomp_stream_put(struct zcomp *comp);
+
+int zcomp_compress(struct zcomp_strm *zstrm,
+		const void *src, unsigned int *dst_len);
+
+int zcomp_decompress(struct zcomp_strm *zstrm,
+		const void *src, unsigned int src_len, void *dst);
+
+bool zcomp_set_max_streams(struct zcomp *comp, int num_strm);
+#endif /* _ZCOMP_H_ */
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c
new file mode 100644
index 000000000..966aab902
--- /dev/null
+++ b/drivers/block/zram/zram_drv.c
@@ -0,0 +1,2169 @@
+/*
+ * Compressed RAM block device
+ *
+ * Copyright (C) 2008, 2009, 2010  Nitin Gupta
+ *               2012, 2013 Minchan Kim
+ *
+ * This code is released using a dual license strategy: BSD/GPL
+ * You can choose the licence that better fits your requirements.
+ *
+ * Released under the terms of 3-clause BSD License
+ * Released under the terms of GNU General Public License Version 2.0
+ *
+ */
+
+#define KMSG_COMPONENT "zram"
+#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/bio.h>
+#include <linux/bitops.h>
+#include <linux/blkdev.h>
+#include <linux/buffer_head.h>
+#include <linux/device.h>
+#include <linux/highmem.h>
+#include <linux/slab.h>
+#include <linux/backing-dev.h>
+#include <linux/string.h>
+#include <linux/vmalloc.h>
+#include <linux/err.h>
+#include <linux/idr.h>
+#include <linux/sysfs.h>
+#include <linux/debugfs.h>
+#include <linux/cpuhotplug.h>
+#include <linux/part_stat.h>
+
+#include "zram_drv.h"
+
+static DEFINE_IDR(zram_index_idr);
+/* idr index must be protected */
+static DEFINE_MUTEX(zram_index_mutex);
+
+static int zram_major;
+static const char *default_compressor = CONFIG_ZRAM_DEF_COMP;
+
+/* Module params (documentation at end) */
+static unsigned int num_devices = 1;
+/*
+ * Pages that compress to sizes equals or greater than this are stored
+ * uncompressed in memory.
+ */
+static size_t huge_class_size;
+
+static const struct block_device_operations zram_devops;
+
+static void zram_free_page(struct zram *zram, size_t index);
+static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
+				u32 index, int offset, struct bio *bio);
+
+
+static int zram_slot_trylock(struct zram *zram, u32 index)
+{
+	return bit_spin_trylock(ZRAM_LOCK, &zram->table[index].flags);
+}
+
+static void zram_slot_lock(struct zram *zram, u32 index)
+{
+	bit_spin_lock(ZRAM_LOCK, &zram->table[index].flags);
+}
+
+static void zram_slot_unlock(struct zram *zram, u32 index)
+{
+	bit_spin_unlock(ZRAM_LOCK, &zram->table[index].flags);
+}
+
+static inline bool init_done(struct zram *zram)
+{
+	return zram->disksize;
+}
+
+static inline struct zram *dev_to_zram(struct device *dev)
+{
+	return (struct zram *)dev_to_disk(dev)->private_data;
+}
+
+static unsigned long zram_get_handle(struct zram *zram, u32 index)
+{
+	return zram->table[index].handle;
+}
+
+static void zram_set_handle(struct zram *zram, u32 index, unsigned long handle)
+{
+	zram->table[index].handle = handle;
+}
+
+/* flag operations require table entry bit_spin_lock() being held */
+static bool zram_test_flag(struct zram *zram, u32 index,
+			enum zram_pageflags flag)
+{
+	return zram->table[index].flags & BIT(flag);
+}
+
+static void zram_set_flag(struct zram *zram, u32 index,
+			enum zram_pageflags flag)
+{
+	zram->table[index].flags |= BIT(flag);
+}
+
+static void zram_clear_flag(struct zram *zram, u32 index,
+			enum zram_pageflags flag)
+{
+	zram->table[index].flags &= ~BIT(flag);
+}
+
+static inline void zram_set_element(struct zram *zram, u32 index,
+			unsigned long element)
+{
+	zram->table[index].element = element;
+}
+
+static unsigned long zram_get_element(struct zram *zram, u32 index)
+{
+	return zram->table[index].element;
+}
+
+static size_t zram_get_obj_size(struct zram *zram, u32 index)
+{
+	return zram->table[index].flags & (BIT(ZRAM_FLAG_SHIFT) - 1);
+}
+
+static void zram_set_obj_size(struct zram *zram,
+					u32 index, size_t size)
+{
+	unsigned long flags = zram->table[index].flags >> ZRAM_FLAG_SHIFT;
+
+	zram->table[index].flags = (flags << ZRAM_FLAG_SHIFT) | size;
+}
+
+static inline bool zram_allocated(struct zram *zram, u32 index)
+{
+	return zram_get_obj_size(zram, index) ||
+			zram_test_flag(zram, index, ZRAM_SAME) ||
+			zram_test_flag(zram, index, ZRAM_WB);
+}
+
+#if PAGE_SIZE != 4096
+static inline bool is_partial_io(struct bio_vec *bvec)
+{
+	return bvec->bv_len != PAGE_SIZE;
+}
+#else
+static inline bool is_partial_io(struct bio_vec *bvec)
+{
+	return false;
+}
+#endif
+
+/*
+ * Check if request is within bounds and aligned on zram logical blocks.
+ */
+static inline bool valid_io_request(struct zram *zram,
+		sector_t start, unsigned int size)
+{
+	u64 end, bound;
+
+	/* unaligned request */
+	if (unlikely(start & (ZRAM_SECTOR_PER_LOGICAL_BLOCK - 1)))
+		return false;
+	if (unlikely(size & (ZRAM_LOGICAL_BLOCK_SIZE - 1)))
+		return false;
+
+	end = start + (size >> SECTOR_SHIFT);
+	bound = zram->disksize >> SECTOR_SHIFT;
+	/* out of range range */
+	if (unlikely(start >= bound || end > bound || start > end))
+		return false;
+
+	/* I/O request is valid */
+	return true;
+}
+
+static void update_position(u32 *index, int *offset, struct bio_vec *bvec)
+{
+	*index  += (*offset + bvec->bv_len) / PAGE_SIZE;
+	*offset = (*offset + bvec->bv_len) % PAGE_SIZE;
+}
+
+static inline void update_used_max(struct zram *zram,
+					const unsigned long pages)
+{
+	unsigned long old_max, cur_max;
+
+	old_max = atomic_long_read(&zram->stats.max_used_pages);
+
+	do {
+		cur_max = old_max;
+		if (pages > cur_max)
+			old_max = atomic_long_cmpxchg(
+				&zram->stats.max_used_pages, cur_max, pages);
+	} while (old_max != cur_max);
+}
+
+static inline void zram_fill_page(void *ptr, unsigned long len,
+					unsigned long value)
+{
+	WARN_ON_ONCE(!IS_ALIGNED(len, sizeof(unsigned long)));
+	memset_l(ptr, value, len / sizeof(unsigned long));
+}
+
+static bool page_same_filled(void *ptr, unsigned long *element)
+{
+	unsigned long *page;
+	unsigned long val;
+	unsigned int pos, last_pos = PAGE_SIZE / sizeof(*page) - 1;
+
+	page = (unsigned long *)ptr;
+	val = page[0];
+
+	if (val != page[last_pos])
+		return false;
+
+	for (pos = 1; pos < last_pos; pos++) {
+		if (val != page[pos])
+			return false;
+	}
+
+	*element = val;
+
+	return true;
+}
+
+static ssize_t initstate_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	u32 val;
+	struct zram *zram = dev_to_zram(dev);
+
+	down_read(&zram->init_lock);
+	val = init_done(zram);
+	up_read(&zram->init_lock);
+
+	return scnprintf(buf, PAGE_SIZE, "%u\n", val);
+}
+
+static ssize_t disksize_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct zram *zram = dev_to_zram(dev);
+
+	return scnprintf(buf, PAGE_SIZE, "%llu\n", zram->disksize);
+}
+
+static ssize_t mem_limit_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t len)
+{
+	u64 limit;
+	char *tmp;
+	struct zram *zram = dev_to_zram(dev);
+
+	limit = memparse(buf, &tmp);
+	if (buf == tmp) /* no chars parsed, invalid input */
+		return -EINVAL;
+
+	down_write(&zram->init_lock);
+	zram->limit_pages = PAGE_ALIGN(limit) >> PAGE_SHIFT;
+	up_write(&zram->init_lock);
+
+	return len;
+}
+
+static ssize_t mem_used_max_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t len)
+{
+	int err;
+	unsigned long val;
+	struct zram *zram = dev_to_zram(dev);
+
+	err = kstrtoul(buf, 10, &val);
+	if (err || val != 0)
+		return -EINVAL;
+
+	down_read(&zram->init_lock);
+	if (init_done(zram)) {
+		atomic_long_set(&zram->stats.max_used_pages,
+				zs_get_total_pages(zram->mem_pool));
+	}
+	up_read(&zram->init_lock);
+
+	return len;
+}
+
+/*
+ * Mark all pages which are older than or equal to cutoff as IDLE.
+ * Callers should hold the zram init lock in read mode
+ */
+static void mark_idle(struct zram *zram, ktime_t cutoff)
+{
+	int is_idle = 1;
+	unsigned long nr_pages = zram->disksize >> PAGE_SHIFT;
+	int index;
+
+	for (index = 0; index < nr_pages; index++) {
+		/*
+		 * Do not mark ZRAM_UNDER_WB slot as ZRAM_IDLE to close race.
+		 * See the comment in writeback_store.
+		 */
+		zram_slot_lock(zram, index);
+		if (zram_allocated(zram, index) &&
+				!zram_test_flag(zram, index, ZRAM_UNDER_WB)) {
+#ifdef CONFIG_ZRAM_MEMORY_TRACKING
+			is_idle = !cutoff || ktime_after(cutoff, zram->table[index].ac_time);
+#endif
+			if (is_idle)
+				zram_set_flag(zram, index, ZRAM_IDLE);
+		}
+		zram_slot_unlock(zram, index);
+	}
+}
+
+static ssize_t idle_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t len)
+{
+	struct zram *zram = dev_to_zram(dev);
+	ktime_t cutoff_time = 0;
+	ssize_t rv = -EINVAL;
+
+	if (!sysfs_streq(buf, "all")) {
+		/*
+		 * If it did not parse as 'all' try to treat it as an integer
+		 * when we have memory tracking enabled.
+		 */
+		u64 age_sec;
+
+		if (IS_ENABLED(CONFIG_ZRAM_MEMORY_TRACKING) && !kstrtoull(buf, 0, &age_sec))
+			cutoff_time = ktime_sub(ktime_get_boottime(),
+					ns_to_ktime(age_sec * NSEC_PER_SEC));
+		else
+			goto out;
+	}
+
+	down_read(&zram->init_lock);
+	if (!init_done(zram))
+		goto out_unlock;
+
+	/*
+	 * A cutoff_time of 0 marks everything as idle, this is the
+	 * "all" behavior.
+	 */
+	mark_idle(zram, cutoff_time);
+	rv = len;
+
+out_unlock:
+	up_read(&zram->init_lock);
+out:
+	return rv;
+}
+
+#ifdef CONFIG_ZRAM_WRITEBACK
+static ssize_t writeback_limit_enable_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t len)
+{
+	struct zram *zram = dev_to_zram(dev);
+	u64 val;
+	ssize_t ret = -EINVAL;
+
+	if (kstrtoull(buf, 10, &val))
+		return ret;
+
+	down_read(&zram->init_lock);
+	spin_lock(&zram->wb_limit_lock);
+	zram->wb_limit_enable = val;
+	spin_unlock(&zram->wb_limit_lock);
+	up_read(&zram->init_lock);
+	ret = len;
+
+	return ret;
+}
+
+static ssize_t writeback_limit_enable_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	bool val;
+	struct zram *zram = dev_to_zram(dev);
+
+	down_read(&zram->init_lock);
+	spin_lock(&zram->wb_limit_lock);
+	val = zram->wb_limit_enable;
+	spin_unlock(&zram->wb_limit_lock);
+	up_read(&zram->init_lock);
+
+	return scnprintf(buf, PAGE_SIZE, "%d\n", val);
+}
+
+static ssize_t writeback_limit_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t len)
+{
+	struct zram *zram = dev_to_zram(dev);
+	u64 val;
+	ssize_t ret = -EINVAL;
+
+	if (kstrtoull(buf, 10, &val))
+		return ret;
+
+	down_read(&zram->init_lock);
+	spin_lock(&zram->wb_limit_lock);
+	zram->bd_wb_limit = val;
+	spin_unlock(&zram->wb_limit_lock);
+	up_read(&zram->init_lock);
+	ret = len;
+
+	return ret;
+}
+
+static ssize_t writeback_limit_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	u64 val;
+	struct zram *zram = dev_to_zram(dev);
+
+	down_read(&zram->init_lock);
+	spin_lock(&zram->wb_limit_lock);
+	val = zram->bd_wb_limit;
+	spin_unlock(&zram->wb_limit_lock);
+	up_read(&zram->init_lock);
+
+	return scnprintf(buf, PAGE_SIZE, "%llu\n", val);
+}
+
+static void reset_bdev(struct zram *zram)
+{
+	struct block_device *bdev;
+
+	if (!zram->backing_dev)
+		return;
+
+	bdev = zram->bdev;
+	blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
+	/* hope filp_close flush all of IO */
+	filp_close(zram->backing_dev, NULL);
+	zram->backing_dev = NULL;
+	zram->bdev = NULL;
+	zram->disk->fops = &zram_devops;
+	kvfree(zram->bitmap);
+	zram->bitmap = NULL;
+}
+
+static ssize_t backing_dev_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct file *file;
+	struct zram *zram = dev_to_zram(dev);
+	char *p;
+	ssize_t ret;
+
+	down_read(&zram->init_lock);
+	file = zram->backing_dev;
+	if (!file) {
+		memcpy(buf, "none\n", 5);
+		up_read(&zram->init_lock);
+		return 5;
+	}
+
+	p = file_path(file, buf, PAGE_SIZE - 1);
+	if (IS_ERR(p)) {
+		ret = PTR_ERR(p);
+		goto out;
+	}
+
+	ret = strlen(p);
+	memmove(buf, p, ret);
+	buf[ret++] = '\n';
+out:
+	up_read(&zram->init_lock);
+	return ret;
+}
+
+static ssize_t backing_dev_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t len)
+{
+	char *file_name;
+	size_t sz;
+	struct file *backing_dev = NULL;
+	struct inode *inode;
+	struct address_space *mapping;
+	unsigned int bitmap_sz;
+	unsigned long nr_pages, *bitmap = NULL;
+	struct block_device *bdev = NULL;
+	int err;
+	struct zram *zram = dev_to_zram(dev);
+
+	file_name = kmalloc(PATH_MAX, GFP_KERNEL);
+	if (!file_name)
+		return -ENOMEM;
+
+	down_write(&zram->init_lock);
+	if (init_done(zram)) {
+		pr_info("Can't setup backing device for initialized device\n");
+		err = -EBUSY;
+		goto out;
+	}
+
+	strscpy(file_name, buf, PATH_MAX);
+	/* ignore trailing newline */
+	sz = strlen(file_name);
+	if (sz > 0 && file_name[sz - 1] == '\n')
+		file_name[sz - 1] = 0x00;
+
+	backing_dev = filp_open(file_name, O_RDWR|O_LARGEFILE, 0);
+	if (IS_ERR(backing_dev)) {
+		err = PTR_ERR(backing_dev);
+		backing_dev = NULL;
+		goto out;
+	}
+
+	mapping = backing_dev->f_mapping;
+	inode = mapping->host;
+
+	/* Support only block device in this moment */
+	if (!S_ISBLK(inode->i_mode)) {
+		err = -ENOTBLK;
+		goto out;
+	}
+
+	bdev = blkdev_get_by_dev(inode->i_rdev,
+			FMODE_READ | FMODE_WRITE | FMODE_EXCL, zram);
+	if (IS_ERR(bdev)) {
+		err = PTR_ERR(bdev);
+		bdev = NULL;
+		goto out;
+	}
+
+	nr_pages = i_size_read(inode) >> PAGE_SHIFT;
+	bitmap_sz = BITS_TO_LONGS(nr_pages) * sizeof(long);
+	bitmap = kvzalloc(bitmap_sz, GFP_KERNEL);
+	if (!bitmap) {
+		err = -ENOMEM;
+		goto out;
+	}
+
+	reset_bdev(zram);
+
+	zram->bdev = bdev;
+	zram->backing_dev = backing_dev;
+	zram->bitmap = bitmap;
+	zram->nr_pages = nr_pages;
+	up_write(&zram->init_lock);
+
+	pr_info("setup backing device %s\n", file_name);
+	kfree(file_name);
+
+	return len;
+out:
+	kvfree(bitmap);
+
+	if (bdev)
+		blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
+
+	if (backing_dev)
+		filp_close(backing_dev, NULL);
+
+	up_write(&zram->init_lock);
+
+	kfree(file_name);
+
+	return err;
+}
+
+static unsigned long alloc_block_bdev(struct zram *zram)
+{
+	unsigned long blk_idx = 1;
+retry:
+	/* skip 0 bit to confuse zram.handle = 0 */
+	blk_idx = find_next_zero_bit(zram->bitmap, zram->nr_pages, blk_idx);
+	if (blk_idx == zram->nr_pages)
+		return 0;
+
+	if (test_and_set_bit(blk_idx, zram->bitmap))
+		goto retry;
+
+	atomic64_inc(&zram->stats.bd_count);
+	return blk_idx;
+}
+
+static void free_block_bdev(struct zram *zram, unsigned long blk_idx)
+{
+	int was_set;
+
+	was_set = test_and_clear_bit(blk_idx, zram->bitmap);
+	WARN_ON_ONCE(!was_set);
+	atomic64_dec(&zram->stats.bd_count);
+}
+
+static void zram_page_end_io(struct bio *bio)
+{
+	struct page *page = bio_first_page_all(bio);
+
+	page_endio(page, op_is_write(bio_op(bio)),
+			blk_status_to_errno(bio->bi_status));
+	bio_put(bio);
+}
+
+/*
+ * Returns 1 if the submission is successful.
+ */
+static int read_from_bdev_async(struct zram *zram, struct bio_vec *bvec,
+			unsigned long entry, struct bio *parent)
+{
+	struct bio *bio;
+
+	bio = bio_alloc(zram->bdev, 1, parent ? parent->bi_opf : REQ_OP_READ,
+			GFP_NOIO);
+	if (!bio)
+		return -ENOMEM;
+
+	bio->bi_iter.bi_sector = entry * (PAGE_SIZE >> 9);
+	if (!bio_add_page(bio, bvec->bv_page, bvec->bv_len, bvec->bv_offset)) {
+		bio_put(bio);
+		return -EIO;
+	}
+
+	if (!parent)
+		bio->bi_end_io = zram_page_end_io;
+	else
+		bio_chain(bio, parent);
+
+	submit_bio(bio);
+	return 1;
+}
+
+#define PAGE_WB_SIG "page_index="
+
+#define PAGE_WRITEBACK 0
+#define HUGE_WRITEBACK (1<<0)
+#define IDLE_WRITEBACK (1<<1)
+
+
+static ssize_t writeback_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t len)
+{
+	struct zram *zram = dev_to_zram(dev);
+	unsigned long nr_pages = zram->disksize >> PAGE_SHIFT;
+	unsigned long index = 0;
+	struct bio bio;
+	struct bio_vec bio_vec;
+	struct page *page;
+	ssize_t ret = len;
+	int mode, err;
+	unsigned long blk_idx = 0;
+
+	if (sysfs_streq(buf, "idle"))
+		mode = IDLE_WRITEBACK;
+	else if (sysfs_streq(buf, "huge"))
+		mode = HUGE_WRITEBACK;
+	else if (sysfs_streq(buf, "huge_idle"))
+		mode = IDLE_WRITEBACK | HUGE_WRITEBACK;
+	else {
+		if (strncmp(buf, PAGE_WB_SIG, sizeof(PAGE_WB_SIG) - 1))
+			return -EINVAL;
+
+		if (kstrtol(buf + sizeof(PAGE_WB_SIG) - 1, 10, &index) ||
+				index >= nr_pages)
+			return -EINVAL;
+
+		nr_pages = 1;
+		mode = PAGE_WRITEBACK;
+	}
+
+	down_read(&zram->init_lock);
+	if (!init_done(zram)) {
+		ret = -EINVAL;
+		goto release_init_lock;
+	}
+
+	if (!zram->backing_dev) {
+		ret = -ENODEV;
+		goto release_init_lock;
+	}
+
+	page = alloc_page(GFP_KERNEL);
+	if (!page) {
+		ret = -ENOMEM;
+		goto release_init_lock;
+	}
+
+	for (; nr_pages != 0; index++, nr_pages--) {
+		struct bio_vec bvec;
+
+		bvec.bv_page = page;
+		bvec.bv_len = PAGE_SIZE;
+		bvec.bv_offset = 0;
+
+		spin_lock(&zram->wb_limit_lock);
+		if (zram->wb_limit_enable && !zram->bd_wb_limit) {
+			spin_unlock(&zram->wb_limit_lock);
+			ret = -EIO;
+			break;
+		}
+		spin_unlock(&zram->wb_limit_lock);
+
+		if (!blk_idx) {
+			blk_idx = alloc_block_bdev(zram);
+			if (!blk_idx) {
+				ret = -ENOSPC;
+				break;
+			}
+		}
+
+		zram_slot_lock(zram, index);
+		if (!zram_allocated(zram, index))
+			goto next;
+
+		if (zram_test_flag(zram, index, ZRAM_WB) ||
+				zram_test_flag(zram, index, ZRAM_SAME) ||
+				zram_test_flag(zram, index, ZRAM_UNDER_WB))
+			goto next;
+
+		if (mode & IDLE_WRITEBACK &&
+			  !zram_test_flag(zram, index, ZRAM_IDLE))
+			goto next;
+		if (mode & HUGE_WRITEBACK &&
+			  !zram_test_flag(zram, index, ZRAM_HUGE))
+			goto next;
+		/*
+		 * Clearing ZRAM_UNDER_WB is duty of caller.
+		 * IOW, zram_free_page never clear it.
+		 */
+		zram_set_flag(zram, index, ZRAM_UNDER_WB);
+		/* Need for hugepage writeback racing */
+		zram_set_flag(zram, index, ZRAM_IDLE);
+		zram_slot_unlock(zram, index);
+		if (zram_bvec_read(zram, &bvec, index, 0, NULL)) {
+			zram_slot_lock(zram, index);
+			zram_clear_flag(zram, index, ZRAM_UNDER_WB);
+			zram_clear_flag(zram, index, ZRAM_IDLE);
+			zram_slot_unlock(zram, index);
+			continue;
+		}
+
+		bio_init(&bio, zram->bdev, &bio_vec, 1,
+			 REQ_OP_WRITE | REQ_SYNC);
+		bio.bi_iter.bi_sector = blk_idx * (PAGE_SIZE >> 9);
+
+		bio_add_page(&bio, bvec.bv_page, bvec.bv_len,
+				bvec.bv_offset);
+		/*
+		 * XXX: A single page IO would be inefficient for write
+		 * but it would be not bad as starter.
+		 */
+		err = submit_bio_wait(&bio);
+		if (err) {
+			zram_slot_lock(zram, index);
+			zram_clear_flag(zram, index, ZRAM_UNDER_WB);
+			zram_clear_flag(zram, index, ZRAM_IDLE);
+			zram_slot_unlock(zram, index);
+			/*
+			 * Return last IO error unless every IO were
+			 * not suceeded.
+			 */
+			ret = err;
+			continue;
+		}
+
+		atomic64_inc(&zram->stats.bd_writes);
+		/*
+		 * We released zram_slot_lock so need to check if the slot was
+		 * changed. If there is freeing for the slot, we can catch it
+		 * easily by zram_allocated.
+		 * A subtle case is the slot is freed/reallocated/marked as
+		 * ZRAM_IDLE again. To close the race, idle_store doesn't
+		 * mark ZRAM_IDLE once it found the slot was ZRAM_UNDER_WB.
+		 * Thus, we could close the race by checking ZRAM_IDLE bit.
+		 */
+		zram_slot_lock(zram, index);
+		if (!zram_allocated(zram, index) ||
+			  !zram_test_flag(zram, index, ZRAM_IDLE)) {
+			zram_clear_flag(zram, index, ZRAM_UNDER_WB);
+			zram_clear_flag(zram, index, ZRAM_IDLE);
+			goto next;
+		}
+
+		zram_free_page(zram, index);
+		zram_clear_flag(zram, index, ZRAM_UNDER_WB);
+		zram_set_flag(zram, index, ZRAM_WB);
+		zram_set_element(zram, index, blk_idx);
+		blk_idx = 0;
+		atomic64_inc(&zram->stats.pages_stored);
+		spin_lock(&zram->wb_limit_lock);
+		if (zram->wb_limit_enable && zram->bd_wb_limit > 0)
+			zram->bd_wb_limit -=  1UL << (PAGE_SHIFT - 12);
+		spin_unlock(&zram->wb_limit_lock);
+next:
+		zram_slot_unlock(zram, index);
+	}
+
+	if (blk_idx)
+		free_block_bdev(zram, blk_idx);
+	__free_page(page);
+release_init_lock:
+	up_read(&zram->init_lock);
+
+	return ret;
+}
+
+struct zram_work {
+	struct work_struct work;
+	struct zram *zram;
+	unsigned long entry;
+	struct bio *bio;
+	struct bio_vec bvec;
+};
+
+#if PAGE_SIZE != 4096
+static void zram_sync_read(struct work_struct *work)
+{
+	struct zram_work *zw = container_of(work, struct zram_work, work);
+	struct zram *zram = zw->zram;
+	unsigned long entry = zw->entry;
+	struct bio *bio = zw->bio;
+
+	read_from_bdev_async(zram, &zw->bvec, entry, bio);
+}
+
+/*
+ * Block layer want one ->submit_bio to be active at a time, so if we use
+ * chained IO with parent IO in same context, it's a deadlock. To avoid that,
+ * use a worker thread context.
+ */
+static int read_from_bdev_sync(struct zram *zram, struct bio_vec *bvec,
+				unsigned long entry, struct bio *bio)
+{
+	struct zram_work work;
+
+	work.bvec = *bvec;
+	work.zram = zram;
+	work.entry = entry;
+	work.bio = bio;
+
+	INIT_WORK_ONSTACK(&work.work, zram_sync_read);
+	queue_work(system_unbound_wq, &work.work);
+	flush_work(&work.work);
+	destroy_work_on_stack(&work.work);
+
+	return 1;
+}
+#else
+static int read_from_bdev_sync(struct zram *zram, struct bio_vec *bvec,
+				unsigned long entry, struct bio *bio)
+{
+	WARN_ON(1);
+	return -EIO;
+}
+#endif
+
+static int read_from_bdev(struct zram *zram, struct bio_vec *bvec,
+			unsigned long entry, struct bio *parent, bool sync)
+{
+	atomic64_inc(&zram->stats.bd_reads);
+	if (sync)
+		return read_from_bdev_sync(zram, bvec, entry, parent);
+	else
+		return read_from_bdev_async(zram, bvec, entry, parent);
+}
+#else
+static inline void reset_bdev(struct zram *zram) {};
+static int read_from_bdev(struct zram *zram, struct bio_vec *bvec,
+			unsigned long entry, struct bio *parent, bool sync)
+{
+	return -EIO;
+}
+
+static void free_block_bdev(struct zram *zram, unsigned long blk_idx) {};
+#endif
+
+#ifdef CONFIG_ZRAM_MEMORY_TRACKING
+
+static struct dentry *zram_debugfs_root;
+
+static void zram_debugfs_create(void)
+{
+	zram_debugfs_root = debugfs_create_dir("zram", NULL);
+}
+
+static void zram_debugfs_destroy(void)
+{
+	debugfs_remove_recursive(zram_debugfs_root);
+}
+
+static void zram_accessed(struct zram *zram, u32 index)
+{
+	zram_clear_flag(zram, index, ZRAM_IDLE);
+	zram->table[index].ac_time = ktime_get_boottime();
+}
+
+static ssize_t read_block_state(struct file *file, char __user *buf,
+				size_t count, loff_t *ppos)
+{
+	char *kbuf;
+	ssize_t index, written = 0;
+	struct zram *zram = file->private_data;
+	unsigned long nr_pages = zram->disksize >> PAGE_SHIFT;
+	struct timespec64 ts;
+
+	kbuf = kvmalloc(count, GFP_KERNEL);
+	if (!kbuf)
+		return -ENOMEM;
+
+	down_read(&zram->init_lock);
+	if (!init_done(zram)) {
+		up_read(&zram->init_lock);
+		kvfree(kbuf);
+		return -EINVAL;
+	}
+
+	for (index = *ppos; index < nr_pages; index++) {
+		int copied;
+
+		zram_slot_lock(zram, index);
+		if (!zram_allocated(zram, index))
+			goto next;
+
+		ts = ktime_to_timespec64(zram->table[index].ac_time);
+		copied = snprintf(kbuf + written, count,
+			"%12zd %12lld.%06lu %c%c%c%c\n",
+			index, (s64)ts.tv_sec,
+			ts.tv_nsec / NSEC_PER_USEC,
+			zram_test_flag(zram, index, ZRAM_SAME) ? 's' : '.',
+			zram_test_flag(zram, index, ZRAM_WB) ? 'w' : '.',
+			zram_test_flag(zram, index, ZRAM_HUGE) ? 'h' : '.',
+			zram_test_flag(zram, index, ZRAM_IDLE) ? 'i' : '.');
+
+		if (count <= copied) {
+			zram_slot_unlock(zram, index);
+			break;
+		}
+		written += copied;
+		count -= copied;
+next:
+		zram_slot_unlock(zram, index);
+		*ppos += 1;
+	}
+
+	up_read(&zram->init_lock);
+	if (copy_to_user(buf, kbuf, written))
+		written = -EFAULT;
+	kvfree(kbuf);
+
+	return written;
+}
+
+static const struct file_operations proc_zram_block_state_op = {
+	.open = simple_open,
+	.read = read_block_state,
+	.llseek = default_llseek,
+};
+
+static void zram_debugfs_register(struct zram *zram)
+{
+	if (!zram_debugfs_root)
+		return;
+
+	zram->debugfs_dir = debugfs_create_dir(zram->disk->disk_name,
+						zram_debugfs_root);
+	debugfs_create_file("block_state", 0400, zram->debugfs_dir,
+				zram, &proc_zram_block_state_op);
+}
+
+static void zram_debugfs_unregister(struct zram *zram)
+{
+	debugfs_remove_recursive(zram->debugfs_dir);
+}
+#else
+static void zram_debugfs_create(void) {};
+static void zram_debugfs_destroy(void) {};
+static void zram_accessed(struct zram *zram, u32 index)
+{
+	zram_clear_flag(zram, index, ZRAM_IDLE);
+};
+static void zram_debugfs_register(struct zram *zram) {};
+static void zram_debugfs_unregister(struct zram *zram) {};
+#endif
+
+/*
+ * We switched to per-cpu streams and this attr is not needed anymore.
+ * However, we will keep it around for some time, because:
+ * a) we may revert per-cpu streams in the future
+ * b) it's visible to user space and we need to follow our 2 years
+ *    retirement rule; but we already have a number of 'soon to be
+ *    altered' attrs, so max_comp_streams need to wait for the next
+ *    layoff cycle.
+ */
+static ssize_t max_comp_streams_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	return scnprintf(buf, PAGE_SIZE, "%d\n", num_online_cpus());
+}
+
+static ssize_t max_comp_streams_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t len)
+{
+	return len;
+}
+
+static ssize_t comp_algorithm_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	size_t sz;
+	struct zram *zram = dev_to_zram(dev);
+
+	down_read(&zram->init_lock);
+	sz = zcomp_available_show(zram->compressor, buf);
+	up_read(&zram->init_lock);
+
+	return sz;
+}
+
+static ssize_t comp_algorithm_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t len)
+{
+	struct zram *zram = dev_to_zram(dev);
+	char compressor[ARRAY_SIZE(zram->compressor)];
+	size_t sz;
+
+	strscpy(compressor, buf, sizeof(compressor));
+	/* ignore trailing newline */
+	sz = strlen(compressor);
+	if (sz > 0 && compressor[sz - 1] == '\n')
+		compressor[sz - 1] = 0x00;
+
+	if (!zcomp_available_algorithm(compressor))
+		return -EINVAL;
+
+	down_write(&zram->init_lock);
+	if (init_done(zram)) {
+		up_write(&zram->init_lock);
+		pr_info("Can't change algorithm for initialized device\n");
+		return -EBUSY;
+	}
+
+	strcpy(zram->compressor, compressor);
+	up_write(&zram->init_lock);
+	return len;
+}
+
+static ssize_t compact_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t len)
+{
+	struct zram *zram = dev_to_zram(dev);
+
+	down_read(&zram->init_lock);
+	if (!init_done(zram)) {
+		up_read(&zram->init_lock);
+		return -EINVAL;
+	}
+
+	zs_compact(zram->mem_pool);
+	up_read(&zram->init_lock);
+
+	return len;
+}
+
+static ssize_t io_stat_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct zram *zram = dev_to_zram(dev);
+	ssize_t ret;
+
+	down_read(&zram->init_lock);
+	ret = scnprintf(buf, PAGE_SIZE,
+			"%8llu %8llu %8llu %8llu\n",
+			(u64)atomic64_read(&zram->stats.failed_reads),
+			(u64)atomic64_read(&zram->stats.failed_writes),
+			(u64)atomic64_read(&zram->stats.invalid_io),
+			(u64)atomic64_read(&zram->stats.notify_free));
+	up_read(&zram->init_lock);
+
+	return ret;
+}
+
+static ssize_t mm_stat_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct zram *zram = dev_to_zram(dev);
+	struct zs_pool_stats pool_stats;
+	u64 orig_size, mem_used = 0;
+	long max_used;
+	ssize_t ret;
+
+	memset(&pool_stats, 0x00, sizeof(struct zs_pool_stats));
+
+	down_read(&zram->init_lock);
+	if (init_done(zram)) {
+		mem_used = zs_get_total_pages(zram->mem_pool);
+		zs_pool_stats(zram->mem_pool, &pool_stats);
+	}
+
+	orig_size = atomic64_read(&zram->stats.pages_stored);
+	max_used = atomic_long_read(&zram->stats.max_used_pages);
+
+	ret = scnprintf(buf, PAGE_SIZE,
+			"%8llu %8llu %8llu %8lu %8ld %8llu %8lu %8llu %8llu\n",
+			orig_size << PAGE_SHIFT,
+			(u64)atomic64_read(&zram->stats.compr_data_size),
+			mem_used << PAGE_SHIFT,
+			zram->limit_pages << PAGE_SHIFT,
+			max_used << PAGE_SHIFT,
+			(u64)atomic64_read(&zram->stats.same_pages),
+			atomic_long_read(&pool_stats.pages_compacted),
+			(u64)atomic64_read(&zram->stats.huge_pages),
+			(u64)atomic64_read(&zram->stats.huge_pages_since));
+	up_read(&zram->init_lock);
+
+	return ret;
+}
+
+#ifdef CONFIG_ZRAM_WRITEBACK
+#define FOUR_K(x) ((x) * (1 << (PAGE_SHIFT - 12)))
+static ssize_t bd_stat_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	struct zram *zram = dev_to_zram(dev);
+	ssize_t ret;
+
+	down_read(&zram->init_lock);
+	ret = scnprintf(buf, PAGE_SIZE,
+		"%8llu %8llu %8llu\n",
+			FOUR_K((u64)atomic64_read(&zram->stats.bd_count)),
+			FOUR_K((u64)atomic64_read(&zram->stats.bd_reads)),
+			FOUR_K((u64)atomic64_read(&zram->stats.bd_writes)));
+	up_read(&zram->init_lock);
+
+	return ret;
+}
+#endif
+
+static ssize_t debug_stat_show(struct device *dev,
+		struct device_attribute *attr, char *buf)
+{
+	int version = 1;
+	struct zram *zram = dev_to_zram(dev);
+	ssize_t ret;
+
+	down_read(&zram->init_lock);
+	ret = scnprintf(buf, PAGE_SIZE,
+			"version: %d\n%8llu %8llu\n",
+			version,
+			(u64)atomic64_read(&zram->stats.writestall),
+			(u64)atomic64_read(&zram->stats.miss_free));
+	up_read(&zram->init_lock);
+
+	return ret;
+}
+
+static DEVICE_ATTR_RO(io_stat);
+static DEVICE_ATTR_RO(mm_stat);
+#ifdef CONFIG_ZRAM_WRITEBACK
+static DEVICE_ATTR_RO(bd_stat);
+#endif
+static DEVICE_ATTR_RO(debug_stat);
+
+static void zram_meta_free(struct zram *zram, u64 disksize)
+{
+	size_t num_pages = disksize >> PAGE_SHIFT;
+	size_t index;
+
+	/* Free all pages that are still in this zram device */
+	for (index = 0; index < num_pages; index++)
+		zram_free_page(zram, index);
+
+	zs_destroy_pool(zram->mem_pool);
+	vfree(zram->table);
+}
+
+static bool zram_meta_alloc(struct zram *zram, u64 disksize)
+{
+	size_t num_pages;
+
+	num_pages = disksize >> PAGE_SHIFT;
+	zram->table = vzalloc(array_size(num_pages, sizeof(*zram->table)));
+	if (!zram->table)
+		return false;
+
+	zram->mem_pool = zs_create_pool(zram->disk->disk_name);
+	if (!zram->mem_pool) {
+		vfree(zram->table);
+		return false;
+	}
+
+	if (!huge_class_size)
+		huge_class_size = zs_huge_class_size(zram->mem_pool);
+	return true;
+}
+
+/*
+ * To protect concurrent access to the same index entry,
+ * caller should hold this table index entry's bit_spinlock to
+ * indicate this index entry is accessing.
+ */
+static void zram_free_page(struct zram *zram, size_t index)
+{
+	unsigned long handle;
+
+#ifdef CONFIG_ZRAM_MEMORY_TRACKING
+	zram->table[index].ac_time = 0;
+#endif
+	if (zram_test_flag(zram, index, ZRAM_IDLE))
+		zram_clear_flag(zram, index, ZRAM_IDLE);
+
+	if (zram_test_flag(zram, index, ZRAM_HUGE)) {
+		zram_clear_flag(zram, index, ZRAM_HUGE);
+		atomic64_dec(&zram->stats.huge_pages);
+	}
+
+	if (zram_test_flag(zram, index, ZRAM_WB)) {
+		zram_clear_flag(zram, index, ZRAM_WB);
+		free_block_bdev(zram, zram_get_element(zram, index));
+		goto out;
+	}
+
+	/*
+	 * No memory is allocated for same element filled pages.
+	 * Simply clear same page flag.
+	 */
+	if (zram_test_flag(zram, index, ZRAM_SAME)) {
+		zram_clear_flag(zram, index, ZRAM_SAME);
+		atomic64_dec(&zram->stats.same_pages);
+		goto out;
+	}
+
+	handle = zram_get_handle(zram, index);
+	if (!handle)
+		return;
+
+	zs_free(zram->mem_pool, handle);
+
+	atomic64_sub(zram_get_obj_size(zram, index),
+			&zram->stats.compr_data_size);
+out:
+	atomic64_dec(&zram->stats.pages_stored);
+	zram_set_handle(zram, index, 0);
+	zram_set_obj_size(zram, index, 0);
+	WARN_ON_ONCE(zram->table[index].flags &
+		~(1UL << ZRAM_LOCK | 1UL << ZRAM_UNDER_WB));
+}
+
+static int __zram_bvec_read(struct zram *zram, struct page *page, u32 index,
+				struct bio *bio, bool partial_io)
+{
+	struct zcomp_strm *zstrm;
+	unsigned long handle;
+	unsigned int size;
+	void *src, *dst;
+	int ret;
+
+	zram_slot_lock(zram, index);
+	if (zram_test_flag(zram, index, ZRAM_WB)) {
+		struct bio_vec bvec;
+
+		zram_slot_unlock(zram, index);
+		/* A null bio means rw_page was used, we must fallback to bio */
+		if (!bio)
+			return -EOPNOTSUPP;
+
+		bvec.bv_page = page;
+		bvec.bv_len = PAGE_SIZE;
+		bvec.bv_offset = 0;
+		return read_from_bdev(zram, &bvec,
+				zram_get_element(zram, index),
+				bio, partial_io);
+	}
+
+	handle = zram_get_handle(zram, index);
+	if (!handle || zram_test_flag(zram, index, ZRAM_SAME)) {
+		unsigned long value;
+		void *mem;
+
+		value = handle ? zram_get_element(zram, index) : 0;
+		mem = kmap_atomic(page);
+		zram_fill_page(mem, PAGE_SIZE, value);
+		kunmap_atomic(mem);
+		zram_slot_unlock(zram, index);
+		return 0;
+	}
+
+	size = zram_get_obj_size(zram, index);
+
+	if (size != PAGE_SIZE)
+		zstrm = zcomp_stream_get(zram->comp);
+
+	src = zs_map_object(zram->mem_pool, handle, ZS_MM_RO);
+	if (size == PAGE_SIZE) {
+		dst = kmap_atomic(page);
+		memcpy(dst, src, PAGE_SIZE);
+		kunmap_atomic(dst);
+		ret = 0;
+	} else {
+		dst = kmap_atomic(page);
+		ret = zcomp_decompress(zstrm, src, size, dst);
+		kunmap_atomic(dst);
+		zcomp_stream_put(zram->comp);
+	}
+	zs_unmap_object(zram->mem_pool, handle);
+	zram_slot_unlock(zram, index);
+
+	/* Should NEVER happen. Return bio error if it does. */
+	if (WARN_ON(ret))
+		pr_err("Decompression failed! err=%d, page=%u\n", ret, index);
+
+	return ret;
+}
+
+static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec,
+				u32 index, int offset, struct bio *bio)
+{
+	int ret;
+	struct page *page;
+
+	page = bvec->bv_page;
+	if (is_partial_io(bvec)) {
+		/* Use a temporary buffer to decompress the page */
+		page = alloc_page(GFP_NOIO|__GFP_HIGHMEM);
+		if (!page)
+			return -ENOMEM;
+	}
+
+	ret = __zram_bvec_read(zram, page, index, bio, is_partial_io(bvec));
+	if (unlikely(ret))
+		goto out;
+
+	if (is_partial_io(bvec)) {
+		void *src = kmap_atomic(page);
+
+		memcpy_to_bvec(bvec, src + offset);
+		kunmap_atomic(src);
+	}
+out:
+	if (is_partial_io(bvec))
+		__free_page(page);
+
+	return ret;
+}
+
+static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
+				u32 index, struct bio *bio)
+{
+	int ret = 0;
+	unsigned long alloced_pages;
+	unsigned long handle = -ENOMEM;
+	unsigned int comp_len = 0;
+	void *src, *dst, *mem;
+	struct zcomp_strm *zstrm;
+	struct page *page = bvec->bv_page;
+	unsigned long element = 0;
+	enum zram_pageflags flags = 0;
+
+	mem = kmap_atomic(page);
+	if (page_same_filled(mem, &element)) {
+		kunmap_atomic(mem);
+		/* Free memory associated with this sector now. */
+		flags = ZRAM_SAME;
+		atomic64_inc(&zram->stats.same_pages);
+		goto out;
+	}
+	kunmap_atomic(mem);
+
+compress_again:
+	zstrm = zcomp_stream_get(zram->comp);
+	src = kmap_atomic(page);
+	ret = zcomp_compress(zstrm, src, &comp_len);
+	kunmap_atomic(src);
+
+	if (unlikely(ret)) {
+		zcomp_stream_put(zram->comp);
+		pr_err("Compression failed! err=%d\n", ret);
+		zs_free(zram->mem_pool, handle);
+		return ret;
+	}
+
+	if (comp_len >= huge_class_size)
+		comp_len = PAGE_SIZE;
+	/*
+	 * handle allocation has 2 paths:
+	 * a) fast path is executed with preemption disabled (for
+	 *  per-cpu streams) and has __GFP_DIRECT_RECLAIM bit clear,
+	 *  since we can't sleep;
+	 * b) slow path enables preemption and attempts to allocate
+	 *  the page with __GFP_DIRECT_RECLAIM bit set. we have to
+	 *  put per-cpu compression stream and, thus, to re-do
+	 *  the compression once handle is allocated.
+	 *
+	 * if we have a 'non-null' handle here then we are coming
+	 * from the slow path and handle has already been allocated.
+	 */
+	if (IS_ERR((void *)handle))
+		handle = zs_malloc(zram->mem_pool, comp_len,
+				__GFP_KSWAPD_RECLAIM |
+				__GFP_NOWARN |
+				__GFP_HIGHMEM |
+				__GFP_MOVABLE);
+	if (IS_ERR((void *)handle)) {
+		zcomp_stream_put(zram->comp);
+		atomic64_inc(&zram->stats.writestall);
+		handle = zs_malloc(zram->mem_pool, comp_len,
+				GFP_NOIO | __GFP_HIGHMEM |
+				__GFP_MOVABLE);
+		if (IS_ERR((void *)handle))
+			return PTR_ERR((void *)handle);
+
+		if (comp_len != PAGE_SIZE)
+			goto compress_again;
+		/*
+		 * If the page is not compressible, you need to acquire the
+		 * lock and execute the code below. The zcomp_stream_get()
+		 * call is needed to disable the cpu hotplug and grab the
+		 * zstrm buffer back. It is necessary that the dereferencing
+		 * of the zstrm variable below occurs correctly.
+		 */
+		zstrm = zcomp_stream_get(zram->comp);
+	}
+
+	alloced_pages = zs_get_total_pages(zram->mem_pool);
+	update_used_max(zram, alloced_pages);
+
+	if (zram->limit_pages && alloced_pages > zram->limit_pages) {
+		zcomp_stream_put(zram->comp);
+		zs_free(zram->mem_pool, handle);
+		return -ENOMEM;
+	}
+
+	dst = zs_map_object(zram->mem_pool, handle, ZS_MM_WO);
+
+	src = zstrm->buffer;
+	if (comp_len == PAGE_SIZE)
+		src = kmap_atomic(page);
+	memcpy(dst, src, comp_len);
+	if (comp_len == PAGE_SIZE)
+		kunmap_atomic(src);
+
+	zcomp_stream_put(zram->comp);
+	zs_unmap_object(zram->mem_pool, handle);
+	atomic64_add(comp_len, &zram->stats.compr_data_size);
+out:
+	/*
+	 * Free memory associated with this sector
+	 * before overwriting unused sectors.
+	 */
+	zram_slot_lock(zram, index);
+	zram_free_page(zram, index);
+
+	if (comp_len == PAGE_SIZE) {
+		zram_set_flag(zram, index, ZRAM_HUGE);
+		atomic64_inc(&zram->stats.huge_pages);
+		atomic64_inc(&zram->stats.huge_pages_since);
+	}
+
+	if (flags) {
+		zram_set_flag(zram, index, flags);
+		zram_set_element(zram, index, element);
+	}  else {
+		zram_set_handle(zram, index, handle);
+		zram_set_obj_size(zram, index, comp_len);
+	}
+	zram_slot_unlock(zram, index);
+
+	/* Update stats */
+	atomic64_inc(&zram->stats.pages_stored);
+	return ret;
+}
+
+static int zram_bvec_write(struct zram *zram, struct bio_vec *bvec,
+				u32 index, int offset, struct bio *bio)
+{
+	int ret;
+	struct page *page = NULL;
+	struct bio_vec vec;
+
+	vec = *bvec;
+	if (is_partial_io(bvec)) {
+		void *dst;
+		/*
+		 * This is a partial IO. We need to read the full page
+		 * before to write the changes.
+		 */
+		page = alloc_page(GFP_NOIO|__GFP_HIGHMEM);
+		if (!page)
+			return -ENOMEM;
+
+		ret = __zram_bvec_read(zram, page, index, bio, true);
+		if (ret)
+			goto out;
+
+		dst = kmap_atomic(page);
+		memcpy_from_bvec(dst + offset, bvec);
+		kunmap_atomic(dst);
+
+		vec.bv_page = page;
+		vec.bv_len = PAGE_SIZE;
+		vec.bv_offset = 0;
+	}
+
+	ret = __zram_bvec_write(zram, &vec, index, bio);
+out:
+	if (is_partial_io(bvec))
+		__free_page(page);
+	return ret;
+}
+
+/*
+ * zram_bio_discard - handler on discard request
+ * @index: physical block index in PAGE_SIZE units
+ * @offset: byte offset within physical block
+ */
+static void zram_bio_discard(struct zram *zram, u32 index,
+			     int offset, struct bio *bio)
+{
+	size_t n = bio->bi_iter.bi_size;
+
+	/*
+	 * zram manages data in physical block size units. Because logical block
+	 * size isn't identical with physical block size on some arch, we
+	 * could get a discard request pointing to a specific offset within a
+	 * certain physical block.  Although we can handle this request by
+	 * reading that physiclal block and decompressing and partially zeroing
+	 * and re-compressing and then re-storing it, this isn't reasonable
+	 * because our intent with a discard request is to save memory.  So
+	 * skipping this logical block is appropriate here.
+	 */
+	if (offset) {
+		if (n <= (PAGE_SIZE - offset))
+			return;
+
+		n -= (PAGE_SIZE - offset);
+		index++;
+	}
+
+	while (n >= PAGE_SIZE) {
+		zram_slot_lock(zram, index);
+		zram_free_page(zram, index);
+		zram_slot_unlock(zram, index);
+		atomic64_inc(&zram->stats.notify_free);
+		index++;
+		n -= PAGE_SIZE;
+	}
+}
+
+/*
+ * Returns errno if it has some problem. Otherwise return 0 or 1.
+ * Returns 0 if IO request was done synchronously
+ * Returns 1 if IO request was successfully submitted.
+ */
+static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index,
+			int offset, enum req_op op, struct bio *bio)
+{
+	int ret;
+
+	if (!op_is_write(op)) {
+		atomic64_inc(&zram->stats.num_reads);
+		ret = zram_bvec_read(zram, bvec, index, offset, bio);
+		flush_dcache_page(bvec->bv_page);
+	} else {
+		atomic64_inc(&zram->stats.num_writes);
+		ret = zram_bvec_write(zram, bvec, index, offset, bio);
+	}
+
+	zram_slot_lock(zram, index);
+	zram_accessed(zram, index);
+	zram_slot_unlock(zram, index);
+
+	if (unlikely(ret < 0)) {
+		if (!op_is_write(op))
+			atomic64_inc(&zram->stats.failed_reads);
+		else
+			atomic64_inc(&zram->stats.failed_writes);
+	}
+
+	return ret;
+}
+
+static void __zram_make_request(struct zram *zram, struct bio *bio)
+{
+	int offset;
+	u32 index;
+	struct bio_vec bvec;
+	struct bvec_iter iter;
+	unsigned long start_time;
+
+	index = bio->bi_iter.bi_sector >> SECTORS_PER_PAGE_SHIFT;
+	offset = (bio->bi_iter.bi_sector &
+		  (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT;
+
+	switch (bio_op(bio)) {
+	case REQ_OP_DISCARD:
+	case REQ_OP_WRITE_ZEROES:
+		zram_bio_discard(zram, index, offset, bio);
+		bio_endio(bio);
+		return;
+	default:
+		break;
+	}
+
+	start_time = bio_start_io_acct(bio);
+	bio_for_each_segment(bvec, bio, iter) {
+		struct bio_vec bv = bvec;
+		unsigned int unwritten = bvec.bv_len;
+
+		do {
+			bv.bv_len = min_t(unsigned int, PAGE_SIZE - offset,
+							unwritten);
+			if (zram_bvec_rw(zram, &bv, index, offset,
+					 bio_op(bio), bio) < 0) {
+				bio->bi_status = BLK_STS_IOERR;
+				break;
+			}
+
+			bv.bv_offset += bv.bv_len;
+			unwritten -= bv.bv_len;
+
+			update_position(&index, &offset, &bv);
+		} while (unwritten);
+	}
+	bio_end_io_acct(bio, start_time);
+	bio_endio(bio);
+}
+
+/*
+ * Handler function for all zram I/O requests.
+ */
+static void zram_submit_bio(struct bio *bio)
+{
+	struct zram *zram = bio->bi_bdev->bd_disk->private_data;
+
+	if (!valid_io_request(zram, bio->bi_iter.bi_sector,
+					bio->bi_iter.bi_size)) {
+		atomic64_inc(&zram->stats.invalid_io);
+		bio_io_error(bio);
+		return;
+	}
+
+	__zram_make_request(zram, bio);
+}
+
+static void zram_slot_free_notify(struct block_device *bdev,
+				unsigned long index)
+{
+	struct zram *zram;
+
+	zram = bdev->bd_disk->private_data;
+
+	atomic64_inc(&zram->stats.notify_free);
+	if (!zram_slot_trylock(zram, index)) {
+		atomic64_inc(&zram->stats.miss_free);
+		return;
+	}
+
+	zram_free_page(zram, index);
+	zram_slot_unlock(zram, index);
+}
+
+static int zram_rw_page(struct block_device *bdev, sector_t sector,
+		       struct page *page, enum req_op op)
+{
+	int offset, ret;
+	u32 index;
+	struct zram *zram;
+	struct bio_vec bv;
+	unsigned long start_time;
+
+	if (PageTransHuge(page))
+		return -ENOTSUPP;
+	zram = bdev->bd_disk->private_data;
+
+	if (!valid_io_request(zram, sector, PAGE_SIZE)) {
+		atomic64_inc(&zram->stats.invalid_io);
+		ret = -EINVAL;
+		goto out;
+	}
+
+	index = sector >> SECTORS_PER_PAGE_SHIFT;
+	offset = (sector & (SECTORS_PER_PAGE - 1)) << SECTOR_SHIFT;
+
+	bv.bv_page = page;
+	bv.bv_len = PAGE_SIZE;
+	bv.bv_offset = 0;
+
+	start_time = bdev_start_io_acct(bdev->bd_disk->part0,
+			SECTORS_PER_PAGE, op, jiffies);
+	ret = zram_bvec_rw(zram, &bv, index, offset, op, NULL);
+	bdev_end_io_acct(bdev->bd_disk->part0, op, start_time);
+out:
+	/*
+	 * If I/O fails, just return error(ie, non-zero) without
+	 * calling page_endio.
+	 * It causes resubmit the I/O with bio request by upper functions
+	 * of rw_page(e.g., swap_readpage, __swap_writepage) and
+	 * bio->bi_end_io does things to handle the error
+	 * (e.g., SetPageError, set_page_dirty and extra works).
+	 */
+	if (unlikely(ret < 0))
+		return ret;
+
+	switch (ret) {
+	case 0:
+		page_endio(page, op_is_write(op), 0);
+		break;
+	case 1:
+		ret = 0;
+		break;
+	default:
+		WARN_ON(1);
+	}
+	return ret;
+}
+
+static void zram_reset_device(struct zram *zram)
+{
+	down_write(&zram->init_lock);
+
+	zram->limit_pages = 0;
+
+	if (!init_done(zram)) {
+		up_write(&zram->init_lock);
+		return;
+	}
+
+	set_capacity_and_notify(zram->disk, 0);
+	part_stat_set_all(zram->disk->part0, 0);
+
+	/* I/O operation under all of CPU are done so let's free */
+	zram_meta_free(zram, zram->disksize);
+	zram->disksize = 0;
+	memset(&zram->stats, 0, sizeof(zram->stats));
+	zcomp_destroy(zram->comp);
+	zram->comp = NULL;
+	reset_bdev(zram);
+
+	up_write(&zram->init_lock);
+}
+
+static ssize_t disksize_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t len)
+{
+	u64 disksize;
+	struct zcomp *comp;
+	struct zram *zram = dev_to_zram(dev);
+	int err;
+
+	disksize = memparse(buf, NULL);
+	if (!disksize)
+		return -EINVAL;
+
+	down_write(&zram->init_lock);
+	if (init_done(zram)) {
+		pr_info("Cannot change disksize for initialized device\n");
+		err = -EBUSY;
+		goto out_unlock;
+	}
+
+	disksize = PAGE_ALIGN(disksize);
+	if (!zram_meta_alloc(zram, disksize)) {
+		err = -ENOMEM;
+		goto out_unlock;
+	}
+
+	comp = zcomp_create(zram->compressor);
+	if (IS_ERR(comp)) {
+		pr_err("Cannot initialise %s compressing backend\n",
+				zram->compressor);
+		err = PTR_ERR(comp);
+		goto out_free_meta;
+	}
+
+	zram->comp = comp;
+	zram->disksize = disksize;
+	set_capacity_and_notify(zram->disk, zram->disksize >> SECTOR_SHIFT);
+	up_write(&zram->init_lock);
+
+	return len;
+
+out_free_meta:
+	zram_meta_free(zram, disksize);
+out_unlock:
+	up_write(&zram->init_lock);
+	return err;
+}
+
+static ssize_t reset_store(struct device *dev,
+		struct device_attribute *attr, const char *buf, size_t len)
+{
+	int ret;
+	unsigned short do_reset;
+	struct zram *zram;
+	struct gendisk *disk;
+
+	ret = kstrtou16(buf, 10, &do_reset);
+	if (ret)
+		return ret;
+
+	if (!do_reset)
+		return -EINVAL;
+
+	zram = dev_to_zram(dev);
+	disk = zram->disk;
+
+	mutex_lock(&disk->open_mutex);
+	/* Do not reset an active device or claimed device */
+	if (disk_openers(disk) || zram->claim) {
+		mutex_unlock(&disk->open_mutex);
+		return -EBUSY;
+	}
+
+	/* From now on, anyone can't open /dev/zram[0-9] */
+	zram->claim = true;
+	mutex_unlock(&disk->open_mutex);
+
+	/* Make sure all the pending I/O are finished */
+	sync_blockdev(disk->part0);
+	zram_reset_device(zram);
+
+	mutex_lock(&disk->open_mutex);
+	zram->claim = false;
+	mutex_unlock(&disk->open_mutex);
+
+	return len;
+}
+
+static int zram_open(struct block_device *bdev, fmode_t mode)
+{
+	int ret = 0;
+	struct zram *zram;
+
+	WARN_ON(!mutex_is_locked(&bdev->bd_disk->open_mutex));
+
+	zram = bdev->bd_disk->private_data;
+	/* zram was claimed to reset so open request fails */
+	if (zram->claim)
+		ret = -EBUSY;
+
+	return ret;
+}
+
+static const struct block_device_operations zram_devops = {
+	.open = zram_open,
+	.submit_bio = zram_submit_bio,
+	.swap_slot_free_notify = zram_slot_free_notify,
+	.rw_page = zram_rw_page,
+	.owner = THIS_MODULE
+};
+
+static DEVICE_ATTR_WO(compact);
+static DEVICE_ATTR_RW(disksize);
+static DEVICE_ATTR_RO(initstate);
+static DEVICE_ATTR_WO(reset);
+static DEVICE_ATTR_WO(mem_limit);
+static DEVICE_ATTR_WO(mem_used_max);
+static DEVICE_ATTR_WO(idle);
+static DEVICE_ATTR_RW(max_comp_streams);
+static DEVICE_ATTR_RW(comp_algorithm);
+#ifdef CONFIG_ZRAM_WRITEBACK
+static DEVICE_ATTR_RW(backing_dev);
+static DEVICE_ATTR_WO(writeback);
+static DEVICE_ATTR_RW(writeback_limit);
+static DEVICE_ATTR_RW(writeback_limit_enable);
+#endif
+
+static struct attribute *zram_disk_attrs[] = {
+	&dev_attr_disksize.attr,
+	&dev_attr_initstate.attr,
+	&dev_attr_reset.attr,
+	&dev_attr_compact.attr,
+	&dev_attr_mem_limit.attr,
+	&dev_attr_mem_used_max.attr,
+	&dev_attr_idle.attr,
+	&dev_attr_max_comp_streams.attr,
+	&dev_attr_comp_algorithm.attr,
+#ifdef CONFIG_ZRAM_WRITEBACK
+	&dev_attr_backing_dev.attr,
+	&dev_attr_writeback.attr,
+	&dev_attr_writeback_limit.attr,
+	&dev_attr_writeback_limit_enable.attr,
+#endif
+	&dev_attr_io_stat.attr,
+	&dev_attr_mm_stat.attr,
+#ifdef CONFIG_ZRAM_WRITEBACK
+	&dev_attr_bd_stat.attr,
+#endif
+	&dev_attr_debug_stat.attr,
+	NULL,
+};
+
+ATTRIBUTE_GROUPS(zram_disk);
+
+/*
+ * Allocate and initialize new zram device. the function returns
+ * '>= 0' device_id upon success, and negative value otherwise.
+ */
+static int zram_add(void)
+{
+	struct zram *zram;
+	int ret, device_id;
+
+	zram = kzalloc(sizeof(struct zram), GFP_KERNEL);
+	if (!zram)
+		return -ENOMEM;
+
+	ret = idr_alloc(&zram_index_idr, zram, 0, 0, GFP_KERNEL);
+	if (ret < 0)
+		goto out_free_dev;
+	device_id = ret;
+
+	init_rwsem(&zram->init_lock);
+#ifdef CONFIG_ZRAM_WRITEBACK
+	spin_lock_init(&zram->wb_limit_lock);
+#endif
+
+	/* gendisk structure */
+	zram->disk = blk_alloc_disk(NUMA_NO_NODE);
+	if (!zram->disk) {
+		pr_err("Error allocating disk structure for device %d\n",
+			device_id);
+		ret = -ENOMEM;
+		goto out_free_idr;
+	}
+
+	zram->disk->major = zram_major;
+	zram->disk->first_minor = device_id;
+	zram->disk->minors = 1;
+	zram->disk->flags |= GENHD_FL_NO_PART;
+	zram->disk->fops = &zram_devops;
+	zram->disk->private_data = zram;
+	snprintf(zram->disk->disk_name, 16, "zram%d", device_id);
+
+	/* Actual capacity set using syfs (/sys/block/zram<id>/disksize */
+	set_capacity(zram->disk, 0);
+	/* zram devices sort of resembles non-rotational disks */
+	blk_queue_flag_set(QUEUE_FLAG_NONROT, zram->disk->queue);
+	blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zram->disk->queue);
+
+	/*
+	 * To ensure that we always get PAGE_SIZE aligned
+	 * and n*PAGE_SIZED sized I/O requests.
+	 */
+	blk_queue_physical_block_size(zram->disk->queue, PAGE_SIZE);
+	blk_queue_logical_block_size(zram->disk->queue,
+					ZRAM_LOGICAL_BLOCK_SIZE);
+	blk_queue_io_min(zram->disk->queue, PAGE_SIZE);
+	blk_queue_io_opt(zram->disk->queue, PAGE_SIZE);
+	zram->disk->queue->limits.discard_granularity = PAGE_SIZE;
+	blk_queue_max_discard_sectors(zram->disk->queue, UINT_MAX);
+
+	/*
+	 * zram_bio_discard() will clear all logical blocks if logical block
+	 * size is identical with physical block size(PAGE_SIZE). But if it is
+	 * different, we will skip discarding some parts of logical blocks in
+	 * the part of the request range which isn't aligned to physical block
+	 * size.  So we can't ensure that all discarded logical blocks are
+	 * zeroed.
+	 */
+	if (ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE)
+		blk_queue_max_write_zeroes_sectors(zram->disk->queue, UINT_MAX);
+
+	blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, zram->disk->queue);
+	ret = device_add_disk(NULL, zram->disk, zram_disk_groups);
+	if (ret)
+		goto out_cleanup_disk;
+
+	strscpy(zram->compressor, default_compressor, sizeof(zram->compressor));
+
+	zram_debugfs_register(zram);
+	pr_info("Added device: %s\n", zram->disk->disk_name);
+	return device_id;
+
+out_cleanup_disk:
+	put_disk(zram->disk);
+out_free_idr:
+	idr_remove(&zram_index_idr, device_id);
+out_free_dev:
+	kfree(zram);
+	return ret;
+}
+
+static int zram_remove(struct zram *zram)
+{
+	bool claimed;
+
+	mutex_lock(&zram->disk->open_mutex);
+	if (disk_openers(zram->disk)) {
+		mutex_unlock(&zram->disk->open_mutex);
+		return -EBUSY;
+	}
+
+	claimed = zram->claim;
+	if (!claimed)
+		zram->claim = true;
+	mutex_unlock(&zram->disk->open_mutex);
+
+	zram_debugfs_unregister(zram);
+
+	if (claimed) {
+		/*
+		 * If we were claimed by reset_store(), del_gendisk() will
+		 * wait until reset_store() is done, so nothing need to do.
+		 */
+		;
+	} else {
+		/* Make sure all the pending I/O are finished */
+		sync_blockdev(zram->disk->part0);
+		zram_reset_device(zram);
+	}
+
+	pr_info("Removed device: %s\n", zram->disk->disk_name);
+
+	del_gendisk(zram->disk);
+
+	/* del_gendisk drains pending reset_store */
+	WARN_ON_ONCE(claimed && zram->claim);
+
+	/*
+	 * disksize_store() may be called in between zram_reset_device()
+	 * and del_gendisk(), so run the last reset to avoid leaking
+	 * anything allocated with disksize_store()
+	 */
+	zram_reset_device(zram);
+
+	put_disk(zram->disk);
+	kfree(zram);
+	return 0;
+}
+
+/* zram-control sysfs attributes */
+
+/*
+ * NOTE: hot_add attribute is not the usual read-only sysfs attribute. In a
+ * sense that reading from this file does alter the state of your system -- it
+ * creates a new un-initialized zram device and returns back this device's
+ * device_id (or an error code if it fails to create a new device).
+ */
+static ssize_t hot_add_show(struct class *class,
+			struct class_attribute *attr,
+			char *buf)
+{
+	int ret;
+
+	mutex_lock(&zram_index_mutex);
+	ret = zram_add();
+	mutex_unlock(&zram_index_mutex);
+
+	if (ret < 0)
+		return ret;
+	return scnprintf(buf, PAGE_SIZE, "%d\n", ret);
+}
+static struct class_attribute class_attr_hot_add =
+	__ATTR(hot_add, 0400, hot_add_show, NULL);
+
+static ssize_t hot_remove_store(struct class *class,
+			struct class_attribute *attr,
+			const char *buf,
+			size_t count)
+{
+	struct zram *zram;
+	int ret, dev_id;
+
+	/* dev_id is gendisk->first_minor, which is `int' */
+	ret = kstrtoint(buf, 10, &dev_id);
+	if (ret)
+		return ret;
+	if (dev_id < 0)
+		return -EINVAL;
+
+	mutex_lock(&zram_index_mutex);
+
+	zram = idr_find(&zram_index_idr, dev_id);
+	if (zram) {
+		ret = zram_remove(zram);
+		if (!ret)
+			idr_remove(&zram_index_idr, dev_id);
+	} else {
+		ret = -ENODEV;
+	}
+
+	mutex_unlock(&zram_index_mutex);
+	return ret ? ret : count;
+}
+static CLASS_ATTR_WO(hot_remove);
+
+static struct attribute *zram_control_class_attrs[] = {
+	&class_attr_hot_add.attr,
+	&class_attr_hot_remove.attr,
+	NULL,
+};
+ATTRIBUTE_GROUPS(zram_control_class);
+
+static struct class zram_control_class = {
+	.name		= "zram-control",
+	.owner		= THIS_MODULE,
+	.class_groups	= zram_control_class_groups,
+};
+
+static int zram_remove_cb(int id, void *ptr, void *data)
+{
+	WARN_ON_ONCE(zram_remove(ptr));
+	return 0;
+}
+
+static void destroy_devices(void)
+{
+	class_unregister(&zram_control_class);
+	idr_for_each(&zram_index_idr, &zram_remove_cb, NULL);
+	zram_debugfs_destroy();
+	idr_destroy(&zram_index_idr);
+	unregister_blkdev(zram_major, "zram");
+	cpuhp_remove_multi_state(CPUHP_ZCOMP_PREPARE);
+}
+
+static int __init zram_init(void)
+{
+	int ret;
+
+	BUILD_BUG_ON(__NR_ZRAM_PAGEFLAGS > BITS_PER_LONG);
+
+	ret = cpuhp_setup_state_multi(CPUHP_ZCOMP_PREPARE, "block/zram:prepare",
+				      zcomp_cpu_up_prepare, zcomp_cpu_dead);
+	if (ret < 0)
+		return ret;
+
+	ret = class_register(&zram_control_class);
+	if (ret) {
+		pr_err("Unable to register zram-control class\n");
+		cpuhp_remove_multi_state(CPUHP_ZCOMP_PREPARE);
+		return ret;
+	}
+
+	zram_debugfs_create();
+	zram_major = register_blkdev(0, "zram");
+	if (zram_major <= 0) {
+		pr_err("Unable to get major number\n");
+		class_unregister(&zram_control_class);
+		cpuhp_remove_multi_state(CPUHP_ZCOMP_PREPARE);
+		return -EBUSY;
+	}
+
+	while (num_devices != 0) {
+		mutex_lock(&zram_index_mutex);
+		ret = zram_add();
+		mutex_unlock(&zram_index_mutex);
+		if (ret < 0)
+			goto out_error;
+		num_devices--;
+	}
+
+	return 0;
+
+out_error:
+	destroy_devices();
+	return ret;
+}
+
+static void __exit zram_exit(void)
+{
+	destroy_devices();
+}
+
+module_init(zram_init);
+module_exit(zram_exit);
+
+module_param(num_devices, uint, 0);
+MODULE_PARM_DESC(num_devices, "Number of pre-created zram devices");
+
+MODULE_LICENSE("Dual BSD/GPL");
+MODULE_AUTHOR("Nitin Gupta <ngupta@vflare.org>");
+MODULE_DESCRIPTION("Compressed RAM Block Device");
diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h
new file mode 100644
index 000000000..a2bda5302
--- /dev/null
+++ b/drivers/block/zram/zram_drv.h
@@ -0,0 +1,128 @@
+/*
+ * Compressed RAM block device
+ *
+ * Copyright (C) 2008, 2009, 2010  Nitin Gupta
+ *               2012, 2013 Minchan Kim
+ *
+ * This code is released using a dual license strategy: BSD/GPL
+ * You can choose the licence that better fits your requirements.
+ *
+ * Released under the terms of 3-clause BSD License
+ * Released under the terms of GNU General Public License Version 2.0
+ *
+ */
+
+#ifndef _ZRAM_DRV_H_
+#define _ZRAM_DRV_H_
+
+#include <linux/rwsem.h>
+#include <linux/zsmalloc.h>
+#include <linux/crypto.h>
+
+#include "zcomp.h"
+
+#define SECTORS_PER_PAGE_SHIFT	(PAGE_SHIFT - SECTOR_SHIFT)
+#define SECTORS_PER_PAGE	(1 << SECTORS_PER_PAGE_SHIFT)
+#define ZRAM_LOGICAL_BLOCK_SHIFT 12
+#define ZRAM_LOGICAL_BLOCK_SIZE	(1 << ZRAM_LOGICAL_BLOCK_SHIFT)
+#define ZRAM_SECTOR_PER_LOGICAL_BLOCK	\
+	(1 << (ZRAM_LOGICAL_BLOCK_SHIFT - SECTOR_SHIFT))
+
+
+/*
+ * ZRAM is mainly used for memory efficiency so we want to keep memory
+ * footprint small and thus squeeze size and zram pageflags into a flags
+ * member. The lower ZRAM_FLAG_SHIFT bits is for object size (excluding
+ * header), which cannot be larger than PAGE_SIZE (requiring PAGE_SHIFT
+ * bits), the higher bits are for zram_pageflags.
+ *
+ * We use BUILD_BUG_ON() to make sure that zram pageflags don't overflow.
+ */
+#define ZRAM_FLAG_SHIFT (PAGE_SHIFT + 1)
+
+/* Flags for zram pages (table[page_no].flags) */
+enum zram_pageflags {
+	/* zram slot is locked */
+	ZRAM_LOCK = ZRAM_FLAG_SHIFT,
+	ZRAM_SAME,	/* Page consists the same element */
+	ZRAM_WB,	/* page is stored on backing_device */
+	ZRAM_UNDER_WB,	/* page is under writeback */
+	ZRAM_HUGE,	/* Incompressible page */
+	ZRAM_IDLE,	/* not accessed page since last idle marking */
+
+	__NR_ZRAM_PAGEFLAGS,
+};
+
+/*-- Data structures */
+
+/* Allocated for each disk page */
+struct zram_table_entry {
+	union {
+		unsigned long handle;
+		unsigned long element;
+	};
+	unsigned long flags;
+#ifdef CONFIG_ZRAM_MEMORY_TRACKING
+	ktime_t ac_time;
+#endif
+};
+
+struct zram_stats {
+	atomic64_t compr_data_size;	/* compressed size of pages stored */
+	atomic64_t num_reads;	/* failed + successful */
+	atomic64_t num_writes;	/* --do-- */
+	atomic64_t failed_reads;	/* can happen when memory is too low */
+	atomic64_t failed_writes;	/* can happen when memory is too low */
+	atomic64_t invalid_io;	/* non-page-aligned I/O requests */
+	atomic64_t notify_free;	/* no. of swap slot free notifications */
+	atomic64_t same_pages;		/* no. of same element filled pages */
+	atomic64_t huge_pages;		/* no. of huge pages */
+	atomic64_t huge_pages_since;	/* no. of huge pages since zram set up */
+	atomic64_t pages_stored;	/* no. of pages currently stored */
+	atomic_long_t max_used_pages;	/* no. of maximum pages stored */
+	atomic64_t writestall;		/* no. of write slow paths */
+	atomic64_t miss_free;		/* no. of missed free */
+#ifdef	CONFIG_ZRAM_WRITEBACK
+	atomic64_t bd_count;		/* no. of pages in backing device */
+	atomic64_t bd_reads;		/* no. of reads from backing device */
+	atomic64_t bd_writes;		/* no. of writes from backing device */
+#endif
+};
+
+struct zram {
+	struct zram_table_entry *table;
+	struct zs_pool *mem_pool;
+	struct zcomp *comp;
+	struct gendisk *disk;
+	/* Prevent concurrent execution of device init */
+	struct rw_semaphore init_lock;
+	/*
+	 * the number of pages zram can consume for storing compressed data
+	 */
+	unsigned long limit_pages;
+
+	struct zram_stats stats;
+	/*
+	 * This is the limit on amount of *uncompressed* worth of data
+	 * we can store in a disk.
+	 */
+	u64 disksize;	/* bytes */
+	char compressor[CRYPTO_MAX_ALG_NAME];
+	/*
+	 * zram is claimed so open request will be failed
+	 */
+	bool claim; /* Protected by disk->open_mutex */
+#ifdef CONFIG_ZRAM_WRITEBACK
+	struct file *backing_dev;
+	spinlock_t wb_limit_lock;
+	bool wb_limit_enable;
+	u64 bd_wb_limit;
+	struct block_device *bdev;
+	unsigned long *bitmap;
+	unsigned long nr_pages;
+#endif
+#ifdef CONFIG_ZRAM_MEMORY_TRACKING
+	struct dentry *debugfs_dir;
+#endif
+};
+#endif